diff --git "a/sft/665K/Full_competesmoe_gate_drop/checkpoint-16632/trainer_state.json" "b/sft/665K/Full_competesmoe_gate_drop/checkpoint-16632/trainer_state.json" new file mode 100644--- /dev/null +++ "b/sft/665K/Full_competesmoe_gate_drop/checkpoint-16632/trainer_state.json" @@ -0,0 +1,282777 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.999969938373666, + "eval_steps": 500, + "global_step": 16632, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.05028656, + "auxiliary_loss_mlp": 0.02213947, + "balance_loss_clip": 2.4357667, + "balance_loss_mlp": 1.76858139, + "epoch": 6.012325266796934e-05, + "flos": 24456507091200.0, + "grad_norm": 54.28746413750459, + "language_loss": 2.84979177, + "learning_rate": 0.0, + "loss": 1.94464183, + "num_input_tokens_seen": 19155, + "router_z_loss_clip": 25.9375, + "router_z_loss_mlp": 4.453125, + "step": 1, + "time_per_iteration": 17.736632347106934 + }, + { + "auxiliary_loss_clip": 0.03379921, + "auxiliary_loss_mlp": 0.01459714, + "balance_loss_clip": 1.62786901, + "balance_loss_mlp": 1.18982434, + "epoch": 0.00012024650533593868, + "flos": 20225931246720.0, + "grad_norm": 34.676051504722004, + "language_loss": 1.82812691, + "learning_rate": 4.4628432569317594e-07, + "loss": 1.87652326, + "num_input_tokens_seen": 36175, + "router_z_loss_clip": 17.515625, + "router_z_loss_mlp": 2.69921875, + "step": 2, + "time_per_iteration": 2.460376262664795 + }, + { + "auxiliary_loss_clip": 0.03318647, + "auxiliary_loss_mlp": 0.01442332, + "balance_loss_clip": 1.62532449, + "balance_loss_mlp": 1.18999028, + "epoch": 0.000180369758003908, + "flos": 22309935454080.0, + "grad_norm": 32.834694055964185, + "language_loss": 1.57299364, + "learning_rate": 7.073439208833112e-07, + "loss": 1.62060344, + "num_input_tokens_seen": 54870, + "router_z_loss_clip": 16.90625, + "router_z_loss_mlp": 2.5234375, + "step": 3, + "time_per_iteration": 2.4114320278167725 + }, + { + "auxiliary_loss_clip": 0.03361811, + "auxiliary_loss_mlp": 0.0145171, + "balance_loss_clip": 1.62435222, + "balance_loss_mlp": 1.15492654, + "epoch": 0.00024049301067187735, + "flos": 22414650577920.0, + "grad_norm": 51.36819748453711, + "language_loss": 1.6767683, + "learning_rate": 8.925686513863519e-07, + "loss": 1.72490346, + "num_input_tokens_seen": 74575, + "router_z_loss_clip": 17.359375, + "router_z_loss_mlp": 2.96679688, + "step": 4, + "time_per_iteration": 2.4126992225646973 + }, + { + "auxiliary_loss_clip": 0.03402332, + "auxiliary_loss_mlp": 0.01504807, + "balance_loss_clip": 1.62491655, + "balance_loss_mlp": 1.21679699, + "epoch": 0.0003006162633398467, + "flos": 21396978449280.0, + "grad_norm": 56.91001166740947, + "language_loss": 1.9161427, + "learning_rate": 1.0362401141348472e-06, + "loss": 1.96521413, + "num_input_tokens_seen": 92580, + "router_z_loss_clip": 17.78125, + "router_z_loss_mlp": 2.88085938, + "step": 5, + "time_per_iteration": 2.5408289432525635 + }, + { + "auxiliary_loss_clip": 0.03371369, + "auxiliary_loss_mlp": 0.01514277, + "balance_loss_clip": 1.61574674, + "balance_loss_mlp": 1.21978199, + "epoch": 0.000360739516007816, + "flos": 21652375127040.0, + "grad_norm": 33.00287914659169, + "language_loss": 1.60825157, + "learning_rate": 1.153628246576487e-06, + "loss": 1.65710807, + "num_input_tokens_seen": 109705, + "router_z_loss_clip": 17.578125, + "router_z_loss_mlp": 2.94726562, + "step": 6, + "time_per_iteration": 2.592978000640869 + }, + { + "auxiliary_loss_clip": 0.03355113, + "auxiliary_loss_mlp": 0.01488761, + "balance_loss_clip": 1.61586237, + "balance_loss_mlp": 1.20437527, + "epoch": 0.0004208627686757854, + "flos": 27159742897920.0, + "grad_norm": 24.665856675007273, + "language_loss": 1.53399837, + "learning_rate": 1.2528784983718962e-06, + "loss": 1.58243716, + "num_input_tokens_seen": 129425, + "router_z_loss_clip": 17.390625, + "router_z_loss_mlp": 2.84375, + "step": 7, + "time_per_iteration": 2.599067449569702 + }, + { + "auxiliary_loss_clip": 0.03321337, + "auxiliary_loss_mlp": 0.01442469, + "balance_loss_clip": 1.61186397, + "balance_loss_mlp": 1.16514075, + "epoch": 0.0004809860213437547, + "flos": 31319096135040.0, + "grad_norm": 32.20614369319242, + "language_loss": 1.43750739, + "learning_rate": 1.338852977079528e-06, + "loss": 1.48514545, + "num_input_tokens_seen": 149210, + "router_z_loss_clip": 17.078125, + "router_z_loss_mlp": 2.77148438, + "step": 8, + "time_per_iteration": 2.7441461086273193 + }, + { + "auxiliary_loss_clip": 0.03370257, + "auxiliary_loss_mlp": 0.0149909, + "balance_loss_clip": 1.61187387, + "balance_loss_mlp": 1.21470428, + "epoch": 0.000541109274011724, + "flos": 32160411463680.0, + "grad_norm": 56.753119683172784, + "language_loss": 1.49736547, + "learning_rate": 1.4146878417666224e-06, + "loss": 1.54605889, + "num_input_tokens_seen": 169055, + "router_z_loss_clip": 17.578125, + "router_z_loss_mlp": 2.84765625, + "step": 9, + "time_per_iteration": 2.9300026893615723 + }, + { + "auxiliary_loss_clip": 0.03309313, + "auxiliary_loss_mlp": 0.01477233, + "balance_loss_clip": 1.61568809, + "balance_loss_mlp": 1.20772433, + "epoch": 0.0006012325266796934, + "flos": 18916808163840.0, + "grad_norm": 23.028548144078183, + "language_loss": 1.44519591, + "learning_rate": 1.4825244398280232e-06, + "loss": 1.49306142, + "num_input_tokens_seen": 188045, + "router_z_loss_clip": 16.9375, + "router_z_loss_mlp": 2.69335938, + "step": 10, + "time_per_iteration": 2.685797929763794 + }, + { + "auxiliary_loss_clip": 0.03366136, + "auxiliary_loss_mlp": 0.01492841, + "balance_loss_clip": 1.6210196, + "balance_loss_mlp": 1.21894526, + "epoch": 0.0006613557793476627, + "flos": 20774861867520.0, + "grad_norm": 18.467751210200078, + "language_loss": 1.45345998, + "learning_rate": 1.5438901072051983e-06, + "loss": 1.50204968, + "num_input_tokens_seen": 207035, + "router_z_loss_clip": 17.46875, + "router_z_loss_mlp": 2.73828125, + "step": 11, + "time_per_iteration": 2.661346197128296 + }, + { + "auxiliary_loss_clip": 0.03293803, + "auxiliary_loss_mlp": 0.01448577, + "balance_loss_clip": 1.60774803, + "balance_loss_mlp": 1.17239273, + "epoch": 0.000721479032015632, + "flos": 16581680997120.0, + "grad_norm": 16.695419364806003, + "language_loss": 1.45230246, + "learning_rate": 1.5999125722696629e-06, + "loss": 1.4997263, + "num_input_tokens_seen": 223225, + "router_z_loss_clip": 16.8125, + "router_z_loss_mlp": 2.76367188, + "step": 12, + "time_per_iteration": 2.5367844104766846 + }, + { + "auxiliary_loss_clip": 0.03336623, + "auxiliary_loss_mlp": 0.01409009, + "balance_loss_clip": 1.61996639, + "balance_loss_mlp": 1.14999163, + "epoch": 0.0007816022846836014, + "flos": 23805471144960.0, + "grad_norm": 13.68876776146265, + "language_loss": 1.28554821, + "learning_rate": 1.6514482443788434e-06, + "loss": 1.33300459, + "num_input_tokens_seen": 242570, + "router_z_loss_clip": 17.140625, + "router_z_loss_mlp": 2.58984375, + "step": 13, + "time_per_iteration": 2.7279367446899414 + }, + { + "auxiliary_loss_clip": 0.03290965, + "auxiliary_loss_mlp": 0.01471568, + "balance_loss_clip": 1.61291313, + "balance_loss_mlp": 1.20244145, + "epoch": 0.0008417255373515708, + "flos": 19172204841600.0, + "grad_norm": 5.801331281243603, + "language_loss": 1.208179, + "learning_rate": 1.6991628240650723e-06, + "loss": 1.2558043, + "num_input_tokens_seen": 261215, + "router_z_loss_clip": 16.734375, + "router_z_loss_mlp": 2.69335938, + "step": 14, + "time_per_iteration": 2.59757137298584 + }, + { + "auxiliary_loss_clip": 0.03276164, + "auxiliary_loss_mlp": 0.01431719, + "balance_loss_clip": 1.61809194, + "balance_loss_mlp": 1.16812348, + "epoch": 0.00090184879001954, + "flos": 26395564026240.0, + "grad_norm": 6.387661570987522, + "language_loss": 1.12982655, + "learning_rate": 1.7435840350181584e-06, + "loss": 1.17690539, + "num_input_tokens_seen": 280035, + "router_z_loss_clip": 16.5625, + "router_z_loss_mlp": 2.63671875, + "step": 15, + "time_per_iteration": 2.7350969314575195 + }, + { + "auxiliary_loss_clip": 0.03242181, + "auxiliary_loss_mlp": 0.0141297, + "balance_loss_clip": 1.60306942, + "balance_loss_mlp": 1.16272593, + "epoch": 0.0009619720426875094, + "flos": 24679500785280.0, + "grad_norm": 4.779004720297118, + "language_loss": 1.11437607, + "learning_rate": 1.7851373027727038e-06, + "loss": 1.16092765, + "num_input_tokens_seen": 300265, + "router_z_loss_clip": 16.3828125, + "router_z_loss_mlp": 2.5, + "step": 16, + "time_per_iteration": 2.645951271057129 + }, + { + "auxiliary_loss_clip": 0.03229734, + "auxiliary_loss_mlp": 0.01420155, + "balance_loss_clip": 1.60933697, + "balance_loss_mlp": 1.18001974, + "epoch": 0.0010220952953554788, + "flos": 18624531196800.0, + "grad_norm": 12.157915137366007, + "language_loss": 1.12849021, + "learning_rate": 1.8241705979033208e-06, + "loss": 1.17498922, + "num_input_tokens_seen": 317375, + "router_z_loss_clip": 16.1640625, + "router_z_loss_mlp": 2.40234375, + "step": 17, + "time_per_iteration": 2.633683204650879 + }, + { + "auxiliary_loss_clip": 0.03166645, + "auxiliary_loss_mlp": 0.01379819, + "balance_loss_clip": 1.60709286, + "balance_loss_mlp": 1.14826739, + "epoch": 0.001082218548023448, + "flos": 26142537646080.0, + "grad_norm": 3.889021456072088, + "language_loss": 1.08284903, + "learning_rate": 1.860972167459798e-06, + "loss": 1.12831378, + "num_input_tokens_seen": 337975, + "router_z_loss_clip": 15.59375, + "router_z_loss_mlp": 2.3125, + "step": 18, + "time_per_iteration": 2.66680908203125 + }, + { + "auxiliary_loss_clip": 0.03194649, + "auxiliary_loss_mlp": 0.01401816, + "balance_loss_clip": 1.60630774, + "balance_loss_mlp": 1.13535941, + "epoch": 0.0011423418006914173, + "flos": 19609776322560.0, + "grad_norm": 4.680505722673957, + "language_loss": 1.02399337, + "learning_rate": 1.89578346593066e-06, + "loss": 1.06995797, + "num_input_tokens_seen": 356635, + "router_z_loss_clip": 15.8828125, + "router_z_loss_mlp": 2.6640625, + "step": 19, + "time_per_iteration": 4.114797115325928 + }, + { + "auxiliary_loss_clip": 0.03137828, + "auxiliary_loss_mlp": 0.01344061, + "balance_loss_clip": 1.60779345, + "balance_loss_mlp": 1.12280869, + "epoch": 0.0012024650533593868, + "flos": 17895365107200.0, + "grad_norm": 3.826585981901521, + "language_loss": 1.16778171, + "learning_rate": 1.928808765521199e-06, + "loss": 1.21260059, + "num_input_tokens_seen": 375625, + "router_z_loss_clip": 15.28125, + "router_z_loss_mlp": 2.21289062, + "step": 20, + "time_per_iteration": 4.019640207290649 + }, + { + "auxiliary_loss_clip": 0.03125126, + "auxiliary_loss_mlp": 0.01380008, + "balance_loss_clip": 1.58922851, + "balance_loss_mlp": 1.12976408, + "epoch": 0.001262588306027356, + "flos": 21252043071360.0, + "grad_norm": 4.361473748792682, + "language_loss": 1.06398368, + "learning_rate": 1.9602224192552076e-06, + "loss": 1.10903502, + "num_input_tokens_seen": 394350, + "router_z_loss_clip": 15.3515625, + "router_z_loss_mlp": 2.49804688, + "step": 21, + "time_per_iteration": 2.606827974319458 + }, + { + "auxiliary_loss_clip": 0.03027464, + "auxiliary_loss_mlp": 0.01384375, + "balance_loss_clip": 1.57093358, + "balance_loss_mlp": 1.14900827, + "epoch": 0.0013227115586953253, + "flos": 26104077158400.0, + "grad_norm": 3.7455255218874464, + "language_loss": 1.06397462, + "learning_rate": 1.9901744328983746e-06, + "loss": 1.10809302, + "num_input_tokens_seen": 413255, + "router_z_loss_clip": 14.5703125, + "router_z_loss_mlp": 2.35351562, + "step": 22, + "time_per_iteration": 2.6168293952941895 + }, + { + "auxiliary_loss_clip": 0.02979909, + "auxiliary_loss_mlp": 0.01339605, + "balance_loss_clip": 1.57362306, + "balance_loss_mlp": 1.12741208, + "epoch": 0.0013828348113632948, + "flos": 23951376190080.0, + "grad_norm": 2.722576219666673, + "language_loss": 0.92059511, + "learning_rate": 2.018794797290208e-06, + "loss": 0.9637903, + "num_input_tokens_seen": 433065, + "router_z_loss_clip": 14.0703125, + "router_z_loss_mlp": 2.12402344, + "step": 23, + "time_per_iteration": 2.6969518661499023 + }, + { + "auxiliary_loss_clip": 0.02946176, + "auxiliary_loss_mlp": 0.01365093, + "balance_loss_clip": 1.56547642, + "balance_loss_mlp": 1.14279115, + "epoch": 0.001442958064031264, + "flos": 15959851724160.0, + "grad_norm": 3.0158118954104496, + "language_loss": 1.08269382, + "learning_rate": 2.046196897962839e-06, + "loss": 1.12580657, + "num_input_tokens_seen": 451175, + "router_z_loss_clip": 13.796875, + "router_z_loss_mlp": 2.22363281, + "step": 24, + "time_per_iteration": 2.5640604496002197 + }, + { + "auxiliary_loss_clip": 0.02835598, + "auxiliary_loss_mlp": 0.01331344, + "balance_loss_clip": 1.55876446, + "balance_loss_mlp": 1.11962795, + "epoch": 0.0015030813166992333, + "flos": 18108350801280.0, + "grad_norm": 4.0765892248174564, + "language_loss": 1.01396263, + "learning_rate": 2.0724802282696944e-06, + "loss": 1.055632, + "num_input_tokens_seen": 468775, + "router_z_loss_clip": 12.7734375, + "router_z_loss_mlp": 2.1171875, + "step": 25, + "time_per_iteration": 2.6246516704559326 + }, + { + "auxiliary_loss_clip": 0.02828904, + "auxiliary_loss_mlp": 0.0131067, + "balance_loss_clip": 1.5613811, + "balance_loss_mlp": 1.10028923, + "epoch": 0.0015632045693672028, + "flos": 22234558763520.0, + "grad_norm": 2.8103466682124125, + "language_loss": 1.06648111, + "learning_rate": 2.0977325700720194e-06, + "loss": 1.10787678, + "num_input_tokens_seen": 488530, + "router_z_loss_clip": 12.6796875, + "router_z_loss_mlp": 2.10351562, + "step": 26, + "time_per_iteration": 2.6476187705993652 + }, + { + "auxiliary_loss_clip": 0.0277361, + "auxiliary_loss_mlp": 0.01325792, + "balance_loss_clip": 1.55174983, + "balance_loss_mlp": 1.12447095, + "epoch": 0.001623327822035172, + "flos": 23991955580160.0, + "grad_norm": 2.667217970194494, + "language_loss": 0.95522666, + "learning_rate": 2.122031762649933e-06, + "loss": 0.99622071, + "num_input_tokens_seen": 510495, + "router_z_loss_clip": 12.21875, + "router_z_loss_mlp": 2.01464844, + "step": 27, + "time_per_iteration": 2.662899971008301 + }, + { + "auxiliary_loss_clip": 0.02754382, + "auxiliary_loss_mlp": 0.01315648, + "balance_loss_clip": 1.55704904, + "balance_loss_mlp": 1.13177967, + "epoch": 0.0016834510747031415, + "flos": 19677647070720.0, + "grad_norm": 3.104719921307272, + "language_loss": 1.06398213, + "learning_rate": 2.1454471497582483e-06, + "loss": 1.10468245, + "num_input_tokens_seen": 528605, + "router_z_loss_clip": 11.96875, + "router_z_loss_mlp": 1.83789062, + "step": 28, + "time_per_iteration": 2.5629706382751465 + }, + { + "auxiliary_loss_clip": 0.02714384, + "auxiliary_loss_mlp": 0.01319831, + "balance_loss_clip": 1.54085982, + "balance_loss_mlp": 1.13243437, + "epoch": 0.0017435743273711108, + "flos": 20923819568640.0, + "grad_norm": 2.539651245264333, + "language_loss": 1.02607131, + "learning_rate": 2.1680407726407727e-06, + "loss": 1.0664134, + "num_input_tokens_seen": 548515, + "router_z_loss_clip": 11.734375, + "router_z_loss_mlp": 1.87402344, + "step": 29, + "time_per_iteration": 2.5935893058776855 + }, + { + "auxiliary_loss_clip": 0.02706702, + "auxiliary_loss_mlp": 0.01312764, + "balance_loss_clip": 1.53677464, + "balance_loss_mlp": 1.12431824, + "epoch": 0.00180369758003908, + "flos": 19528976678400.0, + "grad_norm": 3.5296988093099837, + "language_loss": 1.19472742, + "learning_rate": 2.189868360711334e-06, + "loss": 1.23492205, + "num_input_tokens_seen": 564025, + "router_z_loss_clip": 11.6953125, + "router_z_loss_mlp": 1.88574219, + "step": 30, + "time_per_iteration": 2.5239057540893555 + }, + { + "auxiliary_loss_clip": 0.0263781, + "auxiliary_loss_mlp": 0.01341384, + "balance_loss_clip": 1.52484179, + "balance_loss_mlp": 1.15684795, + "epoch": 0.0018638208327070496, + "flos": 27453169100160.0, + "grad_norm": 3.1527663719832493, + "language_loss": 1.02647173, + "learning_rate": 2.2109801597326265e-06, + "loss": 1.06626368, + "num_input_tokens_seen": 583345, + "router_z_loss_clip": 11.125, + "router_z_loss_mlp": 1.84472656, + "step": 31, + "time_per_iteration": 2.640669107437134 + }, + { + "auxiliary_loss_clip": 0.02602866, + "auxiliary_loss_mlp": 0.01333731, + "balance_loss_clip": 1.52571726, + "balance_loss_mlp": 1.15148377, + "epoch": 0.0019239440853750188, + "flos": 13589460380160.0, + "grad_norm": 3.280444683485652, + "language_loss": 0.95542908, + "learning_rate": 2.2314216284658796e-06, + "loss": 0.99479502, + "num_input_tokens_seen": 600010, + "router_z_loss_clip": 10.765625, + "router_z_loss_mlp": 1.82324219, + "step": 32, + "time_per_iteration": 2.5570242404937744 + }, + { + "auxiliary_loss_clip": 0.02587697, + "auxiliary_loss_mlp": 0.01308993, + "balance_loss_clip": 1.52181959, + "balance_loss_mlp": 1.13904786, + "epoch": 0.001984067338042988, + "flos": 11253866336640.0, + "grad_norm": 4.644740932838608, + "language_loss": 0.95101076, + "learning_rate": 2.2512340280885094e-06, + "loss": 0.98997766, + "num_input_tokens_seen": 616295, + "router_z_loss_clip": 10.65625, + "router_z_loss_mlp": 1.70019531, + "step": 33, + "time_per_iteration": 2.643563985824585 + }, + { + "auxiliary_loss_clip": 0.02438311, + "auxiliary_loss_mlp": 0.01308611, + "balance_loss_clip": 1.48852789, + "balance_loss_mlp": 1.14820302, + "epoch": 0.0020441905907109576, + "flos": 22386245898240.0, + "grad_norm": 2.5913583646941274, + "language_loss": 0.9142316, + "learning_rate": 2.270454923596497e-06, + "loss": 0.95170087, + "num_input_tokens_seen": 637640, + "router_z_loss_clip": 9.5078125, + "router_z_loss_mlp": 1.60449219, + "step": 34, + "time_per_iteration": 2.6432738304138184 + }, + { + "auxiliary_loss_clip": 0.02396157, + "auxiliary_loss_mlp": 0.01274917, + "balance_loss_clip": 1.45458913, + "balance_loss_mlp": 1.11670268, + "epoch": 0.0021043138433789266, + "flos": 49778580337920.0, + "grad_norm": 3.38699334580986, + "language_loss": 0.7667402, + "learning_rate": 2.2891186125067434e-06, + "loss": 0.80345094, + "num_input_tokens_seen": 659710, + "router_z_loss_clip": 9.4140625, + "router_z_loss_mlp": 1.58203125, + "step": 35, + "time_per_iteration": 2.884291172027588 + }, + { + "auxiliary_loss_clip": 0.02365539, + "auxiliary_loss_mlp": 0.01275803, + "balance_loss_clip": 1.46788907, + "balance_loss_mlp": 1.12998569, + "epoch": 0.002164437096046896, + "flos": 20557961591040.0, + "grad_norm": 2.2917109046596873, + "language_loss": 0.88902307, + "learning_rate": 2.307256493152974e-06, + "loss": 0.9254365, + "num_input_tokens_seen": 679670, + "router_z_loss_clip": 8.984375, + "router_z_loss_mlp": 1.45898438, + "step": 36, + "time_per_iteration": 2.65860915184021 + }, + { + "auxiliary_loss_clip": 0.02306188, + "auxiliary_loss_mlp": 0.01340077, + "balance_loss_clip": 1.4541775, + "balance_loss_mlp": 1.19006419, + "epoch": 0.0022245603487148656, + "flos": 26542295084160.0, + "grad_norm": 2.7521459511702133, + "language_loss": 0.93180829, + "learning_rate": 2.3248973825097614e-06, + "loss": 0.9682709, + "num_input_tokens_seen": 700170, + "router_z_loss_clip": 8.5078125, + "router_z_loss_mlp": 1.5, + "step": 37, + "time_per_iteration": 2.604191541671753 + }, + { + "auxiliary_loss_clip": 0.0227285, + "auxiliary_loss_mlp": 0.01280021, + "balance_loss_clip": 1.44989419, + "balance_loss_mlp": 1.15480351, + "epoch": 0.0022846836013828346, + "flos": 20338188226560.0, + "grad_norm": 4.162220299107636, + "language_loss": 1.03974032, + "learning_rate": 2.3420677916238357e-06, + "loss": 1.07526898, + "num_input_tokens_seen": 718545, + "router_z_loss_clip": 8.2421875, + "router_z_loss_mlp": 1.25195312, + "step": 38, + "time_per_iteration": 2.5721800327301025 + }, + { + "auxiliary_loss_clip": 0.02244742, + "auxiliary_loss_mlp": 0.01260048, + "balance_loss_clip": 1.44428754, + "balance_loss_mlp": 1.13368642, + "epoch": 0.002344806854050804, + "flos": 26247575992320.0, + "grad_norm": 3.1170019232039676, + "language_loss": 0.85480803, + "learning_rate": 2.358792165262154e-06, + "loss": 0.88985598, + "num_input_tokens_seen": 739865, + "router_z_loss_clip": 8.00390625, + "router_z_loss_mlp": 1.26367188, + "step": 39, + "time_per_iteration": 2.616281270980835 + }, + { + "auxiliary_loss_clip": 0.02212444, + "auxiliary_loss_mlp": 0.0125346, + "balance_loss_clip": 1.4336524, + "balance_loss_mlp": 1.12185287, + "epoch": 0.0024049301067187736, + "flos": 11801539981440.0, + "grad_norm": 2.542217737525079, + "language_loss": 0.90345502, + "learning_rate": 2.3750930912143747e-06, + "loss": 0.93811411, + "num_input_tokens_seen": 755770, + "router_z_loss_clip": 7.78515625, + "router_z_loss_mlp": 1.31445312, + "step": 40, + "time_per_iteration": 2.539642572402954 + }, + { + "auxiliary_loss_clip": 0.02165046, + "auxiliary_loss_mlp": 0.012767, + "balance_loss_clip": 1.42381334, + "balance_loss_mlp": 1.16101909, + "epoch": 0.0024650533593867426, + "flos": 20631506688000.0, + "grad_norm": 2.41731132115918, + "language_loss": 0.93637908, + "learning_rate": 2.3909914837471044e-06, + "loss": 0.97079659, + "num_input_tokens_seen": 773440, + "router_z_loss_clip": 7.4140625, + "router_z_loss_mlp": 1.15673828, + "step": 41, + "time_per_iteration": 2.5855774879455566 + }, + { + "auxiliary_loss_clip": 0.02131825, + "auxiliary_loss_mlp": 0.01256477, + "balance_loss_clip": 1.41589165, + "balance_loss_mlp": 1.15028489, + "epoch": 0.002525176612054712, + "flos": 18406122549120.0, + "grad_norm": 2.3104004413923227, + "language_loss": 0.9734624, + "learning_rate": 2.4065067449483835e-06, + "loss": 1.00734544, + "num_input_tokens_seen": 790455, + "router_z_loss_clip": 7.15234375, + "router_z_loss_mlp": 1.06103516, + "step": 42, + "time_per_iteration": 2.5752508640289307 + }, + { + "auxiliary_loss_clip": 0.02086768, + "auxiliary_loss_mlp": 0.01303926, + "balance_loss_clip": 1.41655707, + "balance_loss_mlp": 1.19458687, + "epoch": 0.0025852998647226816, + "flos": 28184023128960.0, + "grad_norm": 2.3375697814800422, + "language_loss": 0.97532856, + "learning_rate": 2.4216569070848724e-06, + "loss": 1.0092355, + "num_input_tokens_seen": 810645, + "router_z_loss_clip": 6.70703125, + "router_z_loss_mlp": 1.09375, + "step": 43, + "time_per_iteration": 2.6771445274353027 + }, + { + "auxiliary_loss_clip": 0.02104973, + "auxiliary_loss_mlp": 0.0131928, + "balance_loss_clip": 1.41461658, + "balance_loss_mlp": 1.20483923, + "epoch": 0.0026454231173906506, + "flos": 14283110897280.0, + "grad_norm": 2.3183311380375824, + "language_loss": 0.93588579, + "learning_rate": 2.4364587585915504e-06, + "loss": 0.9701283, + "num_input_tokens_seen": 827470, + "router_z_loss_clip": 6.90625, + "router_z_loss_mlp": 1.14453125, + "step": 44, + "time_per_iteration": 2.632737636566162 + }, + { + "auxiliary_loss_clip": 0.02071238, + "auxiliary_loss_mlp": 0.01275991, + "balance_loss_clip": 1.4115696, + "balance_loss_mlp": 1.17523587, + "epoch": 0.00270554637005862, + "flos": 22419211605120.0, + "grad_norm": 2.2959934871704295, + "language_loss": 0.98774511, + "learning_rate": 2.450927955901469e-06, + "loss": 1.02121747, + "num_input_tokens_seen": 847285, + "router_z_loss_clip": 6.58984375, + "router_z_loss_mlp": 1.0078125, + "step": 45, + "time_per_iteration": 2.5576863288879395 + }, + { + "auxiliary_loss_clip": 0.02046977, + "auxiliary_loss_mlp": 0.01226261, + "balance_loss_clip": 1.39611733, + "balance_loss_mlp": 1.13694978, + "epoch": 0.0027656696227265896, + "flos": 23985778440960.0, + "grad_norm": 2.0111915571650796, + "language_loss": 1.02670431, + "learning_rate": 2.465079122983384e-06, + "loss": 1.0594368, + "num_input_tokens_seen": 867545, + "router_z_loss_clip": 6.51171875, + "router_z_loss_mlp": 0.89355469, + "step": 46, + "time_per_iteration": 2.6503326892852783 + }, + { + "auxiliary_loss_clip": 0.02005938, + "auxiliary_loss_mlp": 0.01272497, + "balance_loss_clip": 1.3881712, + "balance_loss_mlp": 1.17999089, + "epoch": 0.0028257928753945586, + "flos": 37669503087360.0, + "grad_norm": 2.4131101714810743, + "language_loss": 0.88044322, + "learning_rate": 2.4789259401737868e-06, + "loss": 0.91322762, + "num_input_tokens_seen": 889915, + "router_z_loss_clip": 6.1875, + "router_z_loss_mlp": 0.92480469, + "step": 47, + "time_per_iteration": 2.7154290676116943 + }, + { + "auxiliary_loss_clip": 0.01967248, + "auxiliary_loss_mlp": 0.01254866, + "balance_loss_clip": 1.3772608, + "balance_loss_mlp": 1.16717589, + "epoch": 0.002885916128062528, + "flos": 22454547609600.0, + "grad_norm": 1.849267422866272, + "language_loss": 0.87908995, + "learning_rate": 2.492481223656015e-06, + "loss": 0.91131103, + "num_input_tokens_seen": 908975, + "router_z_loss_clip": 5.8984375, + "router_z_loss_mlp": 0.87695312, + "step": 48, + "time_per_iteration": 2.6182329654693604 + }, + { + "auxiliary_loss_clip": 0.01961741, + "auxiliary_loss_mlp": 0.0124418, + "balance_loss_clip": 1.36555982, + "balance_loss_mlp": 1.1515305, + "epoch": 0.0029460393807304976, + "flos": 27012796358400.0, + "grad_norm": 2.4677374309502147, + "language_loss": 0.89662629, + "learning_rate": 2.5057569967437924e-06, + "loss": 0.92868555, + "num_input_tokens_seen": 929810, + "router_z_loss_clip": 5.9609375, + "router_z_loss_mlp": 0.92626953, + "step": 49, + "time_per_iteration": 2.6595852375030518 + }, + { + "auxiliary_loss_clip": 0.01954076, + "auxiliary_loss_mlp": 0.01235157, + "balance_loss_clip": 1.3598783, + "balance_loss_mlp": 1.14946938, + "epoch": 0.0030061626333984666, + "flos": 15851832549120.0, + "grad_norm": 3.492808496291168, + "language_loss": 0.90920842, + "learning_rate": 2.51876455396287e-06, + "loss": 0.94110078, + "num_input_tokens_seen": 948650, + "router_z_loss_clip": 5.94140625, + "router_z_loss_mlp": 0.85644531, + "step": 50, + "time_per_iteration": 2.616753339767456 + }, + { + "auxiliary_loss_clip": 0.01953722, + "auxiliary_loss_mlp": 0.01201527, + "balance_loss_clip": 1.36467767, + "balance_loss_mlp": 1.11831915, + "epoch": 0.003066285886066436, + "flos": 31827052316160.0, + "grad_norm": 4.959502268457795, + "language_loss": 0.86963522, + "learning_rate": 2.5315145187866316e-06, + "loss": 0.90118766, + "num_input_tokens_seen": 966455, + "router_z_loss_clip": 5.890625, + "router_z_loss_mlp": 0.83154297, + "step": 51, + "time_per_iteration": 2.674222469329834 + }, + { + "auxiliary_loss_clip": 0.01914944, + "auxiliary_loss_mlp": 0.01204551, + "balance_loss_clip": 1.35381925, + "balance_loss_mlp": 1.12429929, + "epoch": 0.0031264091387344056, + "flos": 41427482774400.0, + "grad_norm": 2.2264731076854383, + "language_loss": 0.9517194, + "learning_rate": 2.5440168957651953e-06, + "loss": 0.98291439, + "num_input_tokens_seen": 988110, + "router_z_loss_clip": 5.60546875, + "router_z_loss_mlp": 0.80273438, + "step": 52, + "time_per_iteration": 2.781916856765747 + }, + { + "auxiliary_loss_clip": 0.01902331, + "auxiliary_loss_mlp": 0.01243145, + "balance_loss_clip": 1.35022974, + "balance_loss_mlp": 1.16232157, + "epoch": 0.0031865323914023747, + "flos": 23440941970560.0, + "grad_norm": 3.0230993515919815, + "language_loss": 0.92191362, + "learning_rate": 2.5562811176888872e-06, + "loss": 0.95336843, + "num_input_tokens_seen": 1008550, + "router_z_loss_clip": 5.515625, + "router_z_loss_mlp": 0.80810547, + "step": 53, + "time_per_iteration": 2.5918333530426025 + }, + { + "auxiliary_loss_clip": 0.01893925, + "auxiliary_loss_mlp": 0.0119439, + "balance_loss_clip": 1.35364413, + "balance_loss_mlp": 1.11328006, + "epoch": 0.003246655644070344, + "flos": 14429195510400.0, + "grad_norm": 2.885236584478566, + "language_loss": 0.82824993, + "learning_rate": 2.5683160883431093e-06, + "loss": 0.85913312, + "num_input_tokens_seen": 1026840, + "router_z_loss_clip": 5.40625, + "router_z_loss_mlp": 0.81103516, + "step": 54, + "time_per_iteration": 2.5866806507110596 + }, + { + "auxiliary_loss_clip": 0.01890189, + "auxiliary_loss_mlp": 0.01208228, + "balance_loss_clip": 1.34259081, + "balance_loss_mlp": 1.12893057, + "epoch": 0.0033067788967383136, + "flos": 35918247496320.0, + "grad_norm": 2.7872436210505285, + "language_loss": 0.81487292, + "learning_rate": 2.580130221340046e-06, + "loss": 0.84585708, + "num_input_tokens_seen": 1048875, + "router_z_loss_clip": 5.4765625, + "router_z_loss_mlp": 0.79296875, + "step": 55, + "time_per_iteration": 2.67464542388916 + }, + { + "auxiliary_loss_clip": 0.0187859, + "auxiliary_loss_mlp": 0.01199749, + "balance_loss_clip": 1.33693826, + "balance_loss_mlp": 1.11968756, + "epoch": 0.003366902149406283, + "flos": 22958732862720.0, + "grad_norm": 2.8942284745300237, + "language_loss": 0.86900949, + "learning_rate": 2.5917314754514246e-06, + "loss": 0.89979291, + "num_input_tokens_seen": 1066435, + "router_z_loss_clip": 5.4140625, + "router_z_loss_mlp": 0.80078125, + "step": 56, + "time_per_iteration": 2.6406679153442383 + }, + { + "auxiliary_loss_clip": 0.01874513, + "auxiliary_loss_mlp": 0.0115968, + "balance_loss_clip": 1.32771468, + "balance_loss_mlp": 1.08572221, + "epoch": 0.003427025402074252, + "flos": 26582838560640.0, + "grad_norm": 2.259530255994699, + "language_loss": 0.92747742, + "learning_rate": 2.6031273868139713e-06, + "loss": 0.95781934, + "num_input_tokens_seen": 1090330, + "router_z_loss_clip": 5.48046875, + "router_z_loss_mlp": 0.73925781, + "step": 57, + "time_per_iteration": 2.652461528778076 + }, + { + "auxiliary_loss_clip": 0.01842172, + "auxiliary_loss_mlp": 0.01211292, + "balance_loss_clip": 1.33307981, + "balance_loss_mlp": 1.1382885, + "epoch": 0.0034871486547422216, + "flos": 23951196622080.0, + "grad_norm": 2.1860786409287774, + "language_loss": 0.99521375, + "learning_rate": 2.614325098333948e-06, + "loss": 1.02574837, + "num_input_tokens_seen": 1109840, + "router_z_loss_clip": 5.0859375, + "router_z_loss_mlp": 0.72998047, + "step": 58, + "time_per_iteration": 2.613384246826172 + }, + { + "auxiliary_loss_clip": 0.01823081, + "auxiliary_loss_mlp": 0.01195439, + "balance_loss_clip": 1.32074666, + "balance_loss_mlp": 1.12405622, + "epoch": 0.003547271907410191, + "flos": 21214983214080.0, + "grad_norm": 2.3408286347137834, + "language_loss": 0.88045287, + "learning_rate": 2.625331386578098e-06, + "loss": 0.91063809, + "num_input_tokens_seen": 1128415, + "router_z_loss_clip": 5.0234375, + "router_z_loss_mlp": 0.71435547, + "step": 59, + "time_per_iteration": 2.5599350929260254 + }, + { + "auxiliary_loss_clip": 0.01846598, + "auxiliary_loss_mlp": 0.01160308, + "balance_loss_clip": 1.32901788, + "balance_loss_mlp": 1.08692241, + "epoch": 0.00360739516007816, + "flos": 16504903676160.0, + "grad_norm": 2.5374645453526936, + "language_loss": 0.93366528, + "learning_rate": 2.63615268640451e-06, + "loss": 0.96373439, + "num_input_tokens_seen": 1146515, + "router_z_loss_clip": 5.171875, + "router_z_loss_mlp": 0.73388672, + "step": 60, + "time_per_iteration": 2.587898015975952 + }, + { + "auxiliary_loss_clip": 0.01824266, + "auxiliary_loss_mlp": 0.01171743, + "balance_loss_clip": 1.31180966, + "balance_loss_mlp": 1.10355556, + "epoch": 0.0036675184127461296, + "flos": 19464805031040.0, + "grad_norm": 2.7961732273101143, + "language_loss": 0.89909536, + "learning_rate": 2.6467951135575943e-06, + "loss": 0.92905545, + "num_input_tokens_seen": 1166330, + "router_z_loss_clip": 5.125, + "router_z_loss_mlp": 0.68212891, + "step": 61, + "time_per_iteration": 2.586806297302246 + }, + { + "auxiliary_loss_clip": 0.01806784, + "auxiliary_loss_mlp": 0.01142553, + "balance_loss_clip": 1.30856884, + "balance_loss_mlp": 1.07536709, + "epoch": 0.003727641665414099, + "flos": 20957323979520.0, + "grad_norm": 1.9300258215400483, + "language_loss": 0.88297927, + "learning_rate": 2.657264485425803e-06, + "loss": 0.91247267, + "num_input_tokens_seen": 1186010, + "router_z_loss_clip": 4.9765625, + "router_z_loss_mlp": 0.671875, + "step": 62, + "time_per_iteration": 4.012999773025513 + }, + { + "auxiliary_loss_clip": 0.01787897, + "auxiliary_loss_mlp": 0.01161485, + "balance_loss_clip": 1.29985452, + "balance_loss_mlp": 1.09105611, + "epoch": 0.003787764918082068, + "flos": 18406050721920.0, + "grad_norm": 2.5023043318728013, + "language_loss": 0.96200013, + "learning_rate": 2.6675663401385186e-06, + "loss": 0.99149394, + "num_input_tokens_seen": 1204985, + "router_z_loss_clip": 4.8671875, + "router_z_loss_mlp": 0.70458984, + "step": 63, + "time_per_iteration": 5.378275156021118 + }, + { + "auxiliary_loss_clip": 0.01798049, + "auxiliary_loss_mlp": 0.01172559, + "balance_loss_clip": 1.30646133, + "balance_loss_mlp": 1.10484803, + "epoch": 0.0038478881707500376, + "flos": 12459243962880.0, + "grad_norm": 2.560834861500607, + "language_loss": 0.98898512, + "learning_rate": 2.677705954159056e-06, + "loss": 1.01869118, + "num_input_tokens_seen": 1223545, + "router_z_loss_clip": 4.9140625, + "router_z_loss_mlp": 0.67724609, + "step": 64, + "time_per_iteration": 3.9892961978912354 + }, + { + "auxiliary_loss_clip": 0.01802369, + "auxiliary_loss_mlp": 0.01152064, + "balance_loss_clip": 1.30682015, + "balance_loss_mlp": 1.08354211, + "epoch": 0.003908011423418007, + "flos": 13553334276480.0, + "grad_norm": 2.2606234491820456, + "language_loss": 0.85429311, + "learning_rate": 2.6876883585136904e-06, + "loss": 0.8838374, + "num_input_tokens_seen": 1241175, + "router_z_loss_clip": 4.953125, + "router_z_loss_mlp": 0.68554688, + "step": 65, + "time_per_iteration": 2.5712692737579346 + }, + { + "auxiliary_loss_clip": 0.01779167, + "auxiliary_loss_mlp": 0.01156747, + "balance_loss_clip": 1.29398894, + "balance_loss_mlp": 1.08903599, + "epoch": 0.003968134676085976, + "flos": 18333475292160.0, + "grad_norm": 1.9870636387457725, + "language_loss": 0.8527534, + "learning_rate": 2.697518353781685e-06, + "loss": 0.88211262, + "num_input_tokens_seen": 1259315, + "router_z_loss_clip": 4.8515625, + "router_z_loss_mlp": 0.67773438, + "step": 66, + "time_per_iteration": 2.5591142177581787 + }, + { + "auxiliary_loss_clip": 0.01783682, + "auxiliary_loss_mlp": 0.01152981, + "balance_loss_clip": 1.2916739, + "balance_loss_mlp": 1.0769254, + "epoch": 0.004028257928753946, + "flos": 20485242506880.0, + "grad_norm": 2.919319458869329, + "language_loss": 0.96643776, + "learning_rate": 2.7072005239581103e-06, + "loss": 0.99580431, + "num_input_tokens_seen": 1277055, + "router_z_loss_clip": 4.92578125, + "router_z_loss_mlp": 0.76123047, + "step": 67, + "time_per_iteration": 2.5820701122283936 + }, + { + "auxiliary_loss_clip": 0.01754221, + "auxiliary_loss_mlp": 0.01154245, + "balance_loss_clip": 1.28604603, + "balance_loss_mlp": 1.08162236, + "epoch": 0.004088381181421915, + "flos": 18843837684480.0, + "grad_norm": 2.49833158684788, + "language_loss": 0.9435606, + "learning_rate": 2.7167392492896727e-06, + "loss": 0.97264516, + "num_input_tokens_seen": 1294355, + "router_z_loss_clip": 4.69140625, + "router_z_loss_mlp": 0.72558594, + "step": 68, + "time_per_iteration": 2.5891027450561523 + }, + { + "auxiliary_loss_clip": 0.01747878, + "auxiliary_loss_mlp": 0.01155135, + "balance_loss_clip": 1.2828486, + "balance_loss_mlp": 1.08551693, + "epoch": 0.004148504434089885, + "flos": 19427817000960.0, + "grad_norm": 2.396389537082038, + "language_loss": 0.95860672, + "learning_rate": 2.7261387181735195e-06, + "loss": 0.9876368, + "num_input_tokens_seen": 1313525, + "router_z_loss_clip": 4.65234375, + "router_z_loss_mlp": 0.69628906, + "step": 69, + "time_per_iteration": 2.647395372390747 + }, + { + "auxiliary_loss_clip": 0.01743135, + "auxiliary_loss_mlp": 0.01160179, + "balance_loss_clip": 1.28612185, + "balance_loss_mlp": 1.09466159, + "epoch": 0.004208627686757853, + "flos": 20811023884800.0, + "grad_norm": 2.5525142090404427, + "language_loss": 0.97979581, + "learning_rate": 2.7354029381999196e-06, + "loss": 1.008829, + "num_input_tokens_seen": 1330505, + "router_z_loss_clip": 4.5625, + "router_z_loss_mlp": 0.65527344, + "step": 70, + "time_per_iteration": 2.536085605621338 + }, + { + "auxiliary_loss_clip": 0.01748062, + "auxiliary_loss_mlp": 0.0114498, + "balance_loss_clip": 1.27644682, + "balance_loss_mlp": 1.07574332, + "epoch": 0.004268750939425823, + "flos": 19098623831040.0, + "grad_norm": 2.5225444673086903, + "language_loss": 0.93847048, + "learning_rate": 2.7445357464116983e-06, + "loss": 0.96740097, + "num_input_tokens_seen": 1349615, + "router_z_loss_clip": 4.71484375, + "router_z_loss_mlp": 0.69287109, + "step": 71, + "time_per_iteration": 2.559063673019409 + }, + { + "auxiliary_loss_clip": 0.01830284, + "auxiliary_loss_mlp": 0.0130652, + "balance_loss_clip": 1.43981123, + "balance_loss_mlp": 1.26789618, + "epoch": 0.004328874192093792, + "flos": 52439635514880.0, + "grad_norm": 2.4319478815117215, + "language_loss": 0.6569078, + "learning_rate": 2.75354081884615e-06, + "loss": 0.68827581, + "num_input_tokens_seen": 1410275, + "router_z_loss_clip": 3.90234375, + "router_z_loss_mlp": 0.38574219, + "step": 72, + "time_per_iteration": 3.0839271545410156 + }, + { + "auxiliary_loss_clip": 0.01810636, + "auxiliary_loss_mlp": 0.01269976, + "balance_loss_clip": 1.43034232, + "balance_loss_mlp": 1.2320199, + "epoch": 0.004388997444761762, + "flos": 66473239564800.0, + "grad_norm": 2.261294962954016, + "language_loss": 0.63780713, + "learning_rate": 2.7624216794188286e-06, + "loss": 0.6686132, + "num_input_tokens_seen": 1473020, + "router_z_loss_clip": 3.796875, + "router_z_loss_mlp": 0.37890625, + "step": 73, + "time_per_iteration": 3.1836323738098145 + }, + { + "auxiliary_loss_clip": 0.01725217, + "auxiliary_loss_mlp": 0.01141558, + "balance_loss_clip": 1.26821113, + "balance_loss_mlp": 1.07351327, + "epoch": 0.004449120697429731, + "flos": 18952970181120.0, + "grad_norm": 2.7414859437427523, + "language_loss": 0.85869449, + "learning_rate": 2.771181708202938e-06, + "loss": 0.88736224, + "num_input_tokens_seen": 1490385, + "router_z_loss_clip": 4.5703125, + "router_z_loss_mlp": 0.68017578, + "step": 74, + "time_per_iteration": 2.595327615737915 + }, + { + "auxiliary_loss_clip": 0.01725378, + "auxiliary_loss_mlp": 0.0116158, + "balance_loss_clip": 1.26793385, + "balance_loss_mlp": 1.09200931, + "epoch": 0.004509243950097701, + "flos": 21105491581440.0, + "grad_norm": 3.494006168051868, + "language_loss": 0.96987236, + "learning_rate": 2.779824149153005e-06, + "loss": 0.99874192, + "num_input_tokens_seen": 1509725, + "router_z_loss_clip": 4.57421875, + "router_z_loss_mlp": 0.69628906, + "step": 75, + "time_per_iteration": 2.569359064102173 + }, + { + "auxiliary_loss_clip": 0.01704828, + "auxiliary_loss_mlp": 0.0114304, + "balance_loss_clip": 1.26372194, + "balance_loss_mlp": 1.07642615, + "epoch": 0.004569367202765669, + "flos": 20698730991360.0, + "grad_norm": 2.1102855327021586, + "language_loss": 0.87633622, + "learning_rate": 2.788352117317012e-06, + "loss": 0.90481496, + "num_input_tokens_seen": 1527245, + "router_z_loss_clip": 4.40625, + "router_z_loss_mlp": 0.66601562, + "step": 76, + "time_per_iteration": 2.5893476009368896 + }, + { + "auxiliary_loss_clip": 0.01709551, + "auxiliary_loss_mlp": 0.01148331, + "balance_loss_clip": 1.26286781, + "balance_loss_mlp": 1.07885575, + "epoch": 0.004629490455433639, + "flos": 28658474899200.0, + "grad_norm": 2.1571307562652393, + "language_loss": 0.91788161, + "learning_rate": 2.796768605577095e-06, + "loss": 0.94646049, + "num_input_tokens_seen": 1548930, + "router_z_loss_clip": 4.46484375, + "router_z_loss_mlp": 0.6953125, + "step": 77, + "time_per_iteration": 2.687319755554199 + }, + { + "auxiliary_loss_clip": 0.01697206, + "auxiliary_loss_mlp": 0.01165222, + "balance_loss_clip": 1.26317072, + "balance_loss_mlp": 1.09441125, + "epoch": 0.004689613708101608, + "flos": 11072409805440.0, + "grad_norm": 2.1056220730490565, + "language_loss": 0.92180002, + "learning_rate": 2.80507649095533e-06, + "loss": 0.95042425, + "num_input_tokens_seen": 1565695, + "router_z_loss_clip": 4.34375, + "router_z_loss_mlp": 0.70800781, + "step": 78, + "time_per_iteration": 2.531283140182495 + }, + { + "auxiliary_loss_clip": 0.0169223, + "auxiliary_loss_mlp": 0.01154889, + "balance_loss_clip": 1.25890899, + "balance_loss_mlp": 1.08589101, + "epoch": 0.004749736960769578, + "flos": 21799106184960.0, + "grad_norm": 2.2923971926045295, + "language_loss": 0.82512456, + "learning_rate": 2.813278540517843e-06, + "loss": 0.85359573, + "num_input_tokens_seen": 1582625, + "router_z_loss_clip": 4.33203125, + "router_z_loss_mlp": 0.69042969, + "step": 79, + "time_per_iteration": 2.553987979888916 + }, + { + "auxiliary_loss_clip": 0.01709209, + "auxiliary_loss_mlp": 0.01135556, + "balance_loss_clip": 1.26020515, + "balance_loss_mlp": 1.06522238, + "epoch": 0.004809860213437547, + "flos": 19792597570560.0, + "grad_norm": 2.513652433684637, + "language_loss": 0.9115954, + "learning_rate": 2.8213774169075505e-06, + "loss": 0.94004309, + "num_input_tokens_seen": 1601725, + "router_z_loss_clip": 4.48828125, + "router_z_loss_mlp": 0.70361328, + "step": 80, + "time_per_iteration": 2.5918593406677246 + }, + { + "auxiliary_loss_clip": 0.01675185, + "auxiliary_loss_mlp": 0.01143246, + "balance_loss_clip": 1.25418043, + "balance_loss_mlp": 1.07253146, + "epoch": 0.004869983466105517, + "flos": 26574327037440.0, + "grad_norm": 2.0187488680678656, + "language_loss": 0.95008987, + "learning_rate": 2.829375683533245e-06, + "loss": 0.97827411, + "num_input_tokens_seen": 1622420, + "router_z_loss_clip": 4.21875, + "router_z_loss_mlp": 0.70751953, + "step": 81, + "time_per_iteration": 2.5739221572875977 + }, + { + "auxiliary_loss_clip": 0.01690743, + "auxiliary_loss_mlp": 0.01148733, + "balance_loss_clip": 1.25886941, + "balance_loss_mlp": 1.08178473, + "epoch": 0.004930106718773485, + "flos": 12823378087680.0, + "grad_norm": 3.2983712002531163, + "language_loss": 0.95982122, + "learning_rate": 2.8372758094402803e-06, + "loss": 0.98821598, + "num_input_tokens_seen": 1640715, + "router_z_loss_clip": 4.3125, + "router_z_loss_mlp": 0.66943359, + "step": 82, + "time_per_iteration": 2.546844959259033 + }, + { + "auxiliary_loss_clip": 0.01672649, + "auxiliary_loss_mlp": 0.01157777, + "balance_loss_clip": 1.24667799, + "balance_loss_mlp": 1.08744395, + "epoch": 0.004990229971441455, + "flos": 25774919902080.0, + "grad_norm": 3.5362653266195143, + "language_loss": 0.86647528, + "learning_rate": 2.84508017388607e-06, + "loss": 0.89477956, + "num_input_tokens_seen": 1662210, + "router_z_loss_clip": 4.2578125, + "router_z_loss_mlp": 0.70361328, + "step": 83, + "time_per_iteration": 2.583744764328003 + }, + { + "auxiliary_loss_clip": 0.01668356, + "auxiliary_loss_mlp": 0.01156503, + "balance_loss_clip": 1.24910533, + "balance_loss_mlp": 1.0862174, + "epoch": 0.005050353224109424, + "flos": 17457254922240.0, + "grad_norm": 2.286550545239096, + "language_loss": 0.9173981, + "learning_rate": 2.852791070641559e-06, + "loss": 0.9456467, + "num_input_tokens_seen": 1681070, + "router_z_loss_clip": 4.18359375, + "router_z_loss_mlp": 0.703125, + "step": 84, + "time_per_iteration": 2.549896001815796 + }, + { + "auxiliary_loss_clip": 0.01651, + "auxiliary_loss_mlp": 0.01205702, + "balance_loss_clip": 1.35525835, + "balance_loss_mlp": 1.16979647, + "epoch": 0.005110476476777394, + "flos": 69805460367360.0, + "grad_norm": 1.403135596478043, + "language_loss": 0.62592185, + "learning_rate": 2.8604107120381682e-06, + "loss": 0.65448892, + "num_input_tokens_seen": 1747140, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.359375, + "step": 85, + "time_per_iteration": 3.16408371925354 + }, + { + "auxiliary_loss_clip": 0.01652846, + "auxiliary_loss_mlp": 0.01127108, + "balance_loss_clip": 1.2386384, + "balance_loss_mlp": 1.05605948, + "epoch": 0.005170599729445363, + "flos": 24790105739520.0, + "grad_norm": 1.6262370706660547, + "language_loss": 0.90660137, + "learning_rate": 2.8679412327780482e-06, + "loss": 0.93440092, + "num_input_tokens_seen": 1767475, + "router_z_loss_clip": 4.140625, + "router_z_loss_mlp": 0.71142578, + "step": 86, + "time_per_iteration": 2.588958740234375 + }, + { + "auxiliary_loss_clip": 0.01658712, + "auxiliary_loss_mlp": 0.01161306, + "balance_loss_clip": 1.24556768, + "balance_loss_mlp": 1.08930349, + "epoch": 0.005230722982113333, + "flos": 23258048895360.0, + "grad_norm": 2.561234069613073, + "language_loss": 0.8212862, + "learning_rate": 2.8753846935240833e-06, + "loss": 0.84948635, + "num_input_tokens_seen": 1784980, + "router_z_loss_clip": 4.12890625, + "router_z_loss_mlp": 0.72021484, + "step": 87, + "time_per_iteration": 2.5900540351867676 + }, + { + "auxiliary_loss_clip": 0.01647309, + "auxiliary_loss_mlp": 0.01156702, + "balance_loss_clip": 1.24342215, + "balance_loss_mlp": 1.08732271, + "epoch": 0.005290846234781301, + "flos": 16727909264640.0, + "grad_norm": 1.8444120611105255, + "language_loss": 0.95810032, + "learning_rate": 2.8827430842847267e-06, + "loss": 0.98614043, + "num_input_tokens_seen": 1803030, + "router_z_loss_clip": 4.03515625, + "router_z_loss_mlp": 0.69433594, + "step": 88, + "time_per_iteration": 2.573615789413452 + }, + { + "auxiliary_loss_clip": 0.01663911, + "auxiliary_loss_mlp": 0.01149428, + "balance_loss_clip": 1.24261665, + "balance_loss_mlp": 1.08128846, + "epoch": 0.005350969487449271, + "flos": 20886077352960.0, + "grad_norm": 2.1922350464901528, + "language_loss": 0.85996652, + "learning_rate": 2.8900183276075957e-06, + "loss": 0.88809991, + "num_input_tokens_seen": 1822865, + "router_z_loss_clip": 4.2109375, + "router_z_loss_mlp": 0.68164062, + "step": 89, + "time_per_iteration": 2.597578763961792 + }, + { + "auxiliary_loss_clip": 0.01650411, + "auxiliary_loss_mlp": 0.01134582, + "balance_loss_clip": 1.23604083, + "balance_loss_mlp": 1.06696677, + "epoch": 0.00541109274011724, + "flos": 26209977431040.0, + "grad_norm": 2.6968949405290887, + "language_loss": 0.91460216, + "learning_rate": 2.8972122815946455e-06, + "loss": 0.94245207, + "num_input_tokens_seen": 1842435, + "router_z_loss_clip": 4.14257812, + "router_z_loss_mlp": 0.67675781, + "step": 90, + "time_per_iteration": 2.616183042526245 + }, + { + "auxiliary_loss_clip": 0.01632353, + "auxiliary_loss_mlp": 0.01134398, + "balance_loss_clip": 1.23281169, + "balance_loss_mlp": 1.06516159, + "epoch": 0.00547121599278521, + "flos": 21178569801600.0, + "grad_norm": 2.179351640476536, + "language_loss": 0.85746646, + "learning_rate": 2.90432674275074e-06, + "loss": 0.88513398, + "num_input_tokens_seen": 1860065, + "router_z_loss_clip": 3.99609375, + "router_z_loss_mlp": 0.69287109, + "step": 91, + "time_per_iteration": 2.556391716003418 + }, + { + "auxiliary_loss_clip": 0.01632778, + "auxiliary_loss_mlp": 0.01138931, + "balance_loss_clip": 1.22810209, + "balance_loss_mlp": 1.07222128, + "epoch": 0.005531339245453179, + "flos": 19718801078400.0, + "grad_norm": 2.4875795219838057, + "language_loss": 0.86800027, + "learning_rate": 2.91136344867656e-06, + "loss": 0.89571744, + "num_input_tokens_seen": 1878135, + "router_z_loss_clip": 4.046875, + "router_z_loss_mlp": 0.66699219, + "step": 92, + "time_per_iteration": 2.547135829925537 + }, + { + "auxiliary_loss_clip": 0.01624264, + "auxiliary_loss_mlp": 0.01179132, + "balance_loss_clip": 1.21922636, + "balance_loss_mlp": 1.11084914, + "epoch": 0.005591462498121149, + "flos": 17636089760640.0, + "grad_norm": 3.3380094213499154, + "language_loss": 0.92012566, + "learning_rate": 2.918324080615938e-06, + "loss": 0.94815964, + "num_input_tokens_seen": 1894895, + "router_z_loss_clip": 4.0546875, + "router_z_loss_mlp": 0.68310547, + "step": 93, + "time_per_iteration": 2.52939772605896 + }, + { + "auxiliary_loss_clip": 0.01635374, + "auxiliary_loss_mlp": 0.01152647, + "balance_loss_clip": 1.22467399, + "balance_loss_mlp": 1.08083582, + "epoch": 0.005651585750789117, + "flos": 20011221699840.0, + "grad_norm": 3.7566993645815367, + "language_loss": 0.87334728, + "learning_rate": 2.925210265866963e-06, + "loss": 0.90122747, + "num_input_tokens_seen": 1913220, + "router_z_loss_clip": 4.10742188, + "router_z_loss_mlp": 0.71777344, + "step": 94, + "time_per_iteration": 2.519104242324829 + }, + { + "auxiliary_loss_clip": 0.01570858, + "auxiliary_loss_mlp": 0.01055231, + "balance_loss_clip": 1.31215835, + "balance_loss_mlp": 1.01922929, + "epoch": 0.005711709003457087, + "flos": 59812957981440.0, + "grad_norm": 1.373743181079908, + "language_loss": 0.68169868, + "learning_rate": 2.932023580065507e-06, + "loss": 0.70795953, + "num_input_tokens_seen": 1970970, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.36083984, + "step": 95, + "time_per_iteration": 3.0208895206451416 + }, + { + "auxiliary_loss_clip": 0.01613168, + "auxiliary_loss_mlp": 0.0115077, + "balance_loss_clip": 1.21328688, + "balance_loss_mlp": 1.08296371, + "epoch": 0.005771832256125056, + "flos": 15559591495680.0, + "grad_norm": 2.264353846105608, + "language_loss": 0.9025656, + "learning_rate": 2.9387655493491906e-06, + "loss": 0.93020499, + "num_input_tokens_seen": 1988930, + "router_z_loss_clip": 4.00390625, + "router_z_loss_mlp": 0.67822266, + "step": 96, + "time_per_iteration": 2.5512144565582275 + }, + { + "auxiliary_loss_clip": 0.01605888, + "auxiliary_loss_mlp": 0.01143661, + "balance_loss_clip": 1.21636486, + "balance_loss_mlp": 1.08062351, + "epoch": 0.005831955508793026, + "flos": 22528380015360.0, + "grad_norm": 2.2673008696195676, + "language_loss": 0.89728683, + "learning_rate": 2.9454376524092147e-06, + "loss": 0.92478228, + "num_input_tokens_seen": 2006285, + "router_z_loss_clip": 3.8984375, + "router_z_loss_mlp": 0.63085938, + "step": 97, + "time_per_iteration": 2.574248790740967 + }, + { + "auxiliary_loss_clip": 0.01593974, + "auxiliary_loss_mlp": 0.01139145, + "balance_loss_clip": 1.20697165, + "balance_loss_mlp": 1.07005155, + "epoch": 0.005892078761460995, + "flos": 22049834094720.0, + "grad_norm": 1.9127266627312243, + "language_loss": 0.76370454, + "learning_rate": 2.952041322436969e-06, + "loss": 0.79103571, + "num_input_tokens_seen": 2024905, + "router_z_loss_clip": 3.87304688, + "router_z_loss_mlp": 0.69091797, + "step": 98, + "time_per_iteration": 2.5217349529266357 + }, + { + "auxiliary_loss_clip": 0.01542275, + "auxiliary_loss_mlp": 0.01041877, + "balance_loss_clip": 1.29548049, + "balance_loss_mlp": 1.00711513, + "epoch": 0.005952202014128965, + "flos": 68539143317760.0, + "grad_norm": 1.0339013241069803, + "language_loss": 0.65465295, + "learning_rate": 2.9585779489718204e-06, + "loss": 0.68049443, + "num_input_tokens_seen": 2086220, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.34814453, + "step": 99, + "time_per_iteration": 3.240787982940674 + }, + { + "auxiliary_loss_clip": 0.01592484, + "auxiliary_loss_mlp": 0.01142818, + "balance_loss_clip": 1.20737648, + "balance_loss_mlp": 1.07176888, + "epoch": 0.006012325266796933, + "flos": 22960887678720.0, + "grad_norm": 2.036092677605886, + "language_loss": 0.90879595, + "learning_rate": 2.9650488796560464e-06, + "loss": 0.936149, + "num_input_tokens_seen": 2103365, + "router_z_loss_clip": 3.85351562, + "router_z_loss_mlp": 0.71044922, + "step": 100, + "time_per_iteration": 2.6379735469818115 + }, + { + "auxiliary_loss_clip": 0.01603948, + "auxiliary_loss_mlp": 0.0115024, + "balance_loss_clip": 1.21032441, + "balance_loss_mlp": 1.08300579, + "epoch": 0.006072448519464903, + "flos": 17347942857600.0, + "grad_norm": 2.806506514493287, + "language_loss": 0.91076094, + "learning_rate": 2.971455421902446e-06, + "loss": 0.93830281, + "num_input_tokens_seen": 2121995, + "router_z_loss_clip": 3.93359375, + "router_z_loss_mlp": 0.671875, + "step": 101, + "time_per_iteration": 2.618497610092163 + }, + { + "auxiliary_loss_clip": 0.015922, + "auxiliary_loss_mlp": 0.01150148, + "balance_loss_clip": 1.21006227, + "balance_loss_mlp": 1.07914686, + "epoch": 0.006132571772132872, + "flos": 24681116897280.0, + "grad_norm": 2.3440290760827103, + "language_loss": 0.9067167, + "learning_rate": 2.9777988444798075e-06, + "loss": 0.93414021, + "num_input_tokens_seen": 2141815, + "router_z_loss_clip": 3.8203125, + "router_z_loss_mlp": 0.71044922, + "step": 102, + "time_per_iteration": 2.579169273376465 + }, + { + "auxiliary_loss_clip": 0.0158641, + "auxiliary_loss_mlp": 0.01133833, + "balance_loss_clip": 1.20686507, + "balance_loss_mlp": 1.06831574, + "epoch": 0.006192695024800842, + "flos": 21465675210240.0, + "grad_norm": 3.488112010687028, + "language_loss": 0.87939245, + "learning_rate": 2.9840803790210285e-06, + "loss": 0.90659487, + "num_input_tokens_seen": 2161125, + "router_z_loss_clip": 3.79492188, + "router_z_loss_mlp": 0.65478516, + "step": 103, + "time_per_iteration": 2.598677396774292 + }, + { + "auxiliary_loss_clip": 0.01588733, + "auxiliary_loss_mlp": 0.01136394, + "balance_loss_clip": 1.20869446, + "balance_loss_mlp": 1.06896901, + "epoch": 0.006252818277468811, + "flos": 17420410546560.0, + "grad_norm": 2.2040455502874012, + "language_loss": 0.93913114, + "learning_rate": 2.990301221458371e-06, + "loss": 0.96638244, + "num_input_tokens_seen": 2179510, + "router_z_loss_clip": 3.79882812, + "router_z_loss_mlp": 0.67431641, + "step": 104, + "time_per_iteration": 2.5202975273132324 + }, + { + "auxiliary_loss_clip": 0.01577419, + "auxiliary_loss_mlp": 0.01144777, + "balance_loss_clip": 1.19935787, + "balance_loss_mlp": 1.07992673, + "epoch": 0.006312941530136781, + "flos": 19099557584640.0, + "grad_norm": 2.481999777719057, + "language_loss": 0.96419168, + "learning_rate": 2.9964625333900544e-06, + "loss": 0.99141359, + "num_input_tokens_seen": 2197870, + "router_z_loss_clip": 3.78125, + "router_z_loss_mlp": 0.6484375, + "step": 105, + "time_per_iteration": 2.5748915672302246 + }, + { + "auxiliary_loss_clip": 0.0157917, + "auxiliary_loss_mlp": 0.01159349, + "balance_loss_clip": 1.19873548, + "balance_loss_mlp": 1.08706081, + "epoch": 0.006373064782804749, + "flos": 24060831909120.0, + "grad_norm": 4.287001179889889, + "language_loss": 0.86961257, + "learning_rate": 3.002565443382063e-06, + "loss": 0.89699781, + "num_input_tokens_seen": 2217495, + "router_z_loss_clip": 3.8046875, + "router_z_loss_mlp": 0.72265625, + "step": 106, + "time_per_iteration": 5.379364490509033 + }, + { + "auxiliary_loss_clip": 0.01561718, + "auxiliary_loss_mlp": 0.01142501, + "balance_loss_clip": 1.18516695, + "balance_loss_mlp": 1.07407486, + "epoch": 0.006433188035472719, + "flos": 18332433797760.0, + "grad_norm": 5.226788265808182, + "language_loss": 0.83134091, + "learning_rate": 3.008611048208843e-06, + "loss": 0.85838306, + "num_input_tokens_seen": 2236520, + "router_z_loss_clip": 3.76367188, + "router_z_loss_mlp": 0.68408203, + "step": 107, + "time_per_iteration": 3.9757018089294434 + }, + { + "auxiliary_loss_clip": 0.01486086, + "auxiliary_loss_mlp": 0.01037993, + "balance_loss_clip": 1.25748312, + "balance_loss_mlp": 1.00633097, + "epoch": 0.006493311288140688, + "flos": 62562387594240.0, + "grad_norm": 0.9910869430722535, + "language_loss": 0.64818984, + "learning_rate": 3.014600414036285e-06, + "loss": 0.67343068, + "num_input_tokens_seen": 2300140, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.31640625, + "step": 108, + "time_per_iteration": 3.134566307067871 + }, + { + "auxiliary_loss_clip": 0.0155302, + "auxiliary_loss_mlp": 0.01133416, + "balance_loss_clip": 1.18851912, + "balance_loss_mlp": 1.06351137, + "epoch": 0.006553434540808658, + "flos": 19500141035520.0, + "grad_norm": 2.0672615813985096, + "language_loss": 0.97621691, + "learning_rate": 3.0205345775501937e-06, + "loss": 1.00308132, + "num_input_tokens_seen": 2317320, + "router_z_loss_clip": 3.64648438, + "router_z_loss_mlp": 0.69921875, + "step": 109, + "time_per_iteration": 2.5781922340393066 + }, + { + "auxiliary_loss_clip": 0.01552494, + "auxiliary_loss_mlp": 0.0114244, + "balance_loss_clip": 1.19018817, + "balance_loss_mlp": 1.07549167, + "epoch": 0.006613557793476627, + "flos": 21105132445440.0, + "grad_norm": 2.2211537886853474, + "language_loss": 0.84070742, + "learning_rate": 3.0264145470332218e-06, + "loss": 0.86765671, + "num_input_tokens_seen": 2337820, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 0.66894531, + "step": 110, + "time_per_iteration": 2.7177088260650635 + }, + { + "auxiliary_loss_clip": 0.01547193, + "auxiliary_loss_mlp": 0.01149411, + "balance_loss_clip": 1.18426013, + "balance_loss_mlp": 1.08184314, + "epoch": 0.006673681046144597, + "flos": 26030747543040.0, + "grad_norm": 1.9488302083026958, + "language_loss": 0.82907391, + "learning_rate": 3.032241303393073e-06, + "loss": 0.85604, + "num_input_tokens_seen": 2358560, + "router_z_loss_clip": 3.62890625, + "router_z_loss_mlp": 0.67578125, + "step": 111, + "time_per_iteration": 2.708139657974243 + }, + { + "auxiliary_loss_clip": 0.01548443, + "auxiliary_loss_mlp": 0.01133622, + "balance_loss_clip": 1.18834794, + "balance_loss_mlp": 1.06891537, + "epoch": 0.006733804298812566, + "flos": 23147767163520.0, + "grad_norm": 1.9032450887075019, + "language_loss": 0.94055992, + "learning_rate": 3.0380158011446e-06, + "loss": 0.96738064, + "num_input_tokens_seen": 2379005, + "router_z_loss_clip": 3.59960938, + "router_z_loss_mlp": 0.64697266, + "step": 112, + "time_per_iteration": 2.6939945220947266 + }, + { + "auxiliary_loss_clip": 0.01550789, + "auxiliary_loss_mlp": 0.01137784, + "balance_loss_clip": 1.18411565, + "balance_loss_mlp": 1.07312477, + "epoch": 0.006793927551480535, + "flos": 11764444210560.0, + "grad_norm": 2.4010296342484083, + "language_loss": 0.79597187, + "learning_rate": 3.0437389693482466e-06, + "loss": 0.82285756, + "num_input_tokens_seen": 2395610, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 0.64697266, + "step": 113, + "time_per_iteration": 2.5756547451019287 + }, + { + "auxiliary_loss_clip": 0.01542711, + "auxiliary_loss_mlp": 0.011359, + "balance_loss_clip": 1.18070817, + "balance_loss_mlp": 1.06923795, + "epoch": 0.006854050804148504, + "flos": 19171953446400.0, + "grad_norm": 2.5056842287482453, + "language_loss": 0.93220437, + "learning_rate": 3.0494117125071475e-06, + "loss": 0.95899051, + "num_input_tokens_seen": 2415005, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 0.66601562, + "step": 114, + "time_per_iteration": 2.5802462100982666 + }, + { + "auxiliary_loss_clip": 0.01547145, + "auxiliary_loss_mlp": 0.01133644, + "balance_loss_clip": 1.18044472, + "balance_loss_mlp": 1.07337165, + "epoch": 0.006914174056816474, + "flos": 21981891519360.0, + "grad_norm": 2.392599933170616, + "language_loss": 0.94491452, + "learning_rate": 3.055034911425055e-06, + "loss": 0.97172236, + "num_input_tokens_seen": 2433965, + "router_z_loss_clip": 3.66601562, + "router_z_loss_mlp": 0.60205078, + "step": 115, + "time_per_iteration": 2.5768964290618896 + }, + { + "auxiliary_loss_clip": 0.01540491, + "auxiliary_loss_mlp": 0.01119417, + "balance_loss_clip": 1.17791426, + "balance_loss_mlp": 1.05213559, + "epoch": 0.006974297309484443, + "flos": 16289152634880.0, + "grad_norm": 4.247125032080281, + "language_loss": 0.81816351, + "learning_rate": 3.0606094240271244e-06, + "loss": 0.84476256, + "num_input_tokens_seen": 2451605, + "router_z_loss_clip": 3.625, + "router_z_loss_mlp": 0.67285156, + "step": 116, + "time_per_iteration": 2.5572736263275146 + }, + { + "auxiliary_loss_clip": 0.01534383, + "auxiliary_loss_mlp": 0.0112832, + "balance_loss_clip": 1.17914069, + "balance_loss_mlp": 1.06342292, + "epoch": 0.007034420562152413, + "flos": 26104005331200.0, + "grad_norm": 2.4205576508556836, + "language_loss": 0.88001323, + "learning_rate": 3.0661360861454656e-06, + "loss": 0.90664029, + "num_input_tokens_seen": 2472035, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 0.64892578, + "step": 117, + "time_per_iteration": 2.5662648677825928 + }, + { + "auxiliary_loss_clip": 0.01531017, + "auxiliary_loss_mlp": 0.01148562, + "balance_loss_clip": 1.17595851, + "balance_loss_mlp": 1.08189988, + "epoch": 0.007094543814820382, + "flos": 14204609723520.0, + "grad_norm": 2.343690652923608, + "language_loss": 0.84469438, + "learning_rate": 3.071615712271274e-06, + "loss": 0.87149012, + "num_input_tokens_seen": 2489285, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 0.66699219, + "step": 118, + "time_per_iteration": 2.565958261489868 + }, + { + "auxiliary_loss_clip": 0.01543983, + "auxiliary_loss_mlp": 0.01162836, + "balance_loss_clip": 1.17763638, + "balance_loss_mlp": 1.09731841, + "epoch": 0.007154667067488351, + "flos": 14976007228800.0, + "grad_norm": 2.343624934457044, + "language_loss": 0.99267089, + "learning_rate": 3.0770490962752172e-06, + "loss": 1.01973915, + "num_input_tokens_seen": 2506460, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 0.65478516, + "step": 119, + "time_per_iteration": 2.4973297119140625 + }, + { + "auxiliary_loss_clip": 0.01545208, + "auxiliary_loss_mlp": 0.01122801, + "balance_loss_clip": 1.17460322, + "balance_loss_mlp": 1.05947673, + "epoch": 0.00721479032015632, + "flos": 20193288762240.0, + "grad_norm": 2.817564663542095, + "language_loss": 0.89493346, + "learning_rate": 3.082437012097686e-06, + "loss": 0.92161357, + "num_input_tokens_seen": 2525565, + "router_z_loss_clip": 3.70507812, + "router_z_loss_mlp": 0.6328125, + "step": 120, + "time_per_iteration": 2.616960287094116 + }, + { + "auxiliary_loss_clip": 0.01529481, + "auxiliary_loss_mlp": 0.0113146, + "balance_loss_clip": 1.1755389, + "balance_loss_mlp": 1.06680131, + "epoch": 0.00727491357282429, + "flos": 23147228459520.0, + "grad_norm": 1.9110392797794094, + "language_loss": 0.93304372, + "learning_rate": 3.0877802144103967e-06, + "loss": 0.95965314, + "num_input_tokens_seen": 2546605, + "router_z_loss_clip": 3.53515625, + "router_z_loss_mlp": 0.64648438, + "step": 121, + "time_per_iteration": 2.6090073585510254 + }, + { + "auxiliary_loss_clip": 0.01532694, + "auxiliary_loss_mlp": 0.0114714, + "balance_loss_clip": 1.17562389, + "balance_loss_mlp": 1.08300543, + "epoch": 0.007335036825492259, + "flos": 15521669712000.0, + "grad_norm": 2.6868177113372442, + "language_loss": 0.90268236, + "learning_rate": 3.09307943925077e-06, + "loss": 0.92948067, + "num_input_tokens_seen": 2560730, + "router_z_loss_clip": 3.5703125, + "router_z_loss_mlp": 0.64111328, + "step": 122, + "time_per_iteration": 2.6000373363494873 + }, + { + "auxiliary_loss_clip": 0.01525523, + "auxiliary_loss_mlp": 0.01143255, + "balance_loss_clip": 1.16982746, + "balance_loss_mlp": 1.07497168, + "epoch": 0.007395160078160229, + "flos": 24243365848320.0, + "grad_norm": 2.2579800497644, + "language_loss": 0.92417777, + "learning_rate": 3.0983354046304154e-06, + "loss": 0.95086557, + "num_input_tokens_seen": 2579550, + "router_z_loss_clip": 3.55664062, + "router_z_loss_mlp": 0.68310547, + "step": 123, + "time_per_iteration": 2.518935203552246 + }, + { + "auxiliary_loss_clip": 0.01524935, + "auxiliary_loss_mlp": 0.01127529, + "balance_loss_clip": 1.16455603, + "balance_loss_mlp": 1.06449103, + "epoch": 0.007455283330828198, + "flos": 31759792099200.0, + "grad_norm": 3.0974272098229876, + "language_loss": 0.7121591, + "learning_rate": 3.103548811118979e-06, + "loss": 0.73868376, + "num_input_tokens_seen": 2600390, + "router_z_loss_clip": 3.60351562, + "router_z_loss_mlp": 0.62988281, + "step": 124, + "time_per_iteration": 2.702256441116333 + }, + { + "auxiliary_loss_clip": 0.01514107, + "auxiliary_loss_mlp": 0.01125786, + "balance_loss_clip": 1.16681194, + "balance_loss_mlp": 1.06126976, + "epoch": 0.007515406583496167, + "flos": 26615157822720.0, + "grad_norm": 2.0383513336057124, + "language_loss": 0.88326323, + "learning_rate": 3.108720342404542e-06, + "loss": 0.90966219, + "num_input_tokens_seen": 2620770, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.64501953, + "step": 125, + "time_per_iteration": 2.5765724182128906 + }, + { + "auxiliary_loss_clip": 0.01524966, + "auxiliary_loss_mlp": 0.01140526, + "balance_loss_clip": 1.16608989, + "balance_loss_mlp": 1.07658255, + "epoch": 0.007575529836164136, + "flos": 18223696350720.0, + "grad_norm": 3.01430062443429, + "language_loss": 0.82099426, + "learning_rate": 3.1138506658316945e-06, + "loss": 0.84764922, + "num_input_tokens_seen": 2639900, + "router_z_loss_clip": 3.58789062, + "router_z_loss_mlp": 0.63916016, + "step": 126, + "time_per_iteration": 2.7225403785705566 + }, + { + "auxiliary_loss_clip": 0.015213, + "auxiliary_loss_mlp": 0.01139985, + "balance_loss_clip": 1.16587651, + "balance_loss_mlp": 1.07732868, + "epoch": 0.007635653088832106, + "flos": 21580410228480.0, + "grad_norm": 2.629385590485317, + "language_loss": 0.67350632, + "learning_rate": 3.1189404329183404e-06, + "loss": 0.70011914, + "num_input_tokens_seen": 2657450, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 0.62597656, + "step": 127, + "time_per_iteration": 2.566335678100586 + }, + { + "auxiliary_loss_clip": 0.01509178, + "auxiliary_loss_mlp": 0.01134384, + "balance_loss_clip": 1.16679502, + "balance_loss_mlp": 1.06958175, + "epoch": 0.007695776341500075, + "flos": 25375054723200.0, + "grad_norm": 2.052447253345759, + "language_loss": 0.88172615, + "learning_rate": 3.1239902798522317e-06, + "loss": 0.9081617, + "num_input_tokens_seen": 2678150, + "router_z_loss_clip": 3.42578125, + "router_z_loss_mlp": 0.64746094, + "step": 128, + "time_per_iteration": 2.6317827701568604 + }, + { + "auxiliary_loss_clip": 0.01513821, + "auxiliary_loss_mlp": 0.01145763, + "balance_loss_clip": 1.16322351, + "balance_loss_mlp": 1.08153319, + "epoch": 0.007755899594168045, + "flos": 22343906741760.0, + "grad_norm": 1.7846607813304372, + "language_loss": 0.84641278, + "learning_rate": 3.129000827968184e-06, + "loss": 0.87300861, + "num_input_tokens_seen": 2698290, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 0.64257812, + "step": 129, + "time_per_iteration": 2.566009759902954 + }, + { + "auxiliary_loss_clip": 0.01506911, + "auxiliary_loss_mlp": 0.01135761, + "balance_loss_clip": 1.16213155, + "balance_loss_mlp": 1.07129228, + "epoch": 0.007816022846836013, + "flos": 22638230784000.0, + "grad_norm": 2.023217831217703, + "language_loss": 0.97194743, + "learning_rate": 3.133972684206866e-06, + "loss": 0.99837416, + "num_input_tokens_seen": 2717630, + "router_z_loss_clip": 3.44921875, + "router_z_loss_mlp": 0.64453125, + "step": 130, + "time_per_iteration": 2.5658955574035645 + }, + { + "auxiliary_loss_clip": 0.01500813, + "auxiliary_loss_mlp": 0.01138045, + "balance_loss_clip": 1.15854573, + "balance_loss_mlp": 1.07300472, + "epoch": 0.007876146099503984, + "flos": 18182901479040.0, + "grad_norm": 1.873924899991786, + "language_loss": 0.8251164, + "learning_rate": 3.138906441556014e-06, + "loss": 0.85150492, + "num_input_tokens_seen": 2735835, + "router_z_loss_clip": 3.42578125, + "router_z_loss_mlp": 0.65039062, + "step": 131, + "time_per_iteration": 2.5213260650634766 + }, + { + "auxiliary_loss_clip": 0.01507235, + "auxiliary_loss_mlp": 0.01129252, + "balance_loss_clip": 1.15979922, + "balance_loss_mlp": 1.06740642, + "epoch": 0.007936269352171952, + "flos": 27119486730240.0, + "grad_norm": 2.6089324872895765, + "language_loss": 0.83128214, + "learning_rate": 3.143802679474861e-06, + "loss": 0.85764694, + "num_input_tokens_seen": 2756335, + "router_z_loss_clip": 3.47265625, + "router_z_loss_mlp": 0.61767578, + "step": 132, + "time_per_iteration": 2.599637746810913 + }, + { + "auxiliary_loss_clip": 0.01497757, + "auxiliary_loss_mlp": 0.01128796, + "balance_loss_clip": 1.15500271, + "balance_loss_mlp": 1.0661869, + "epoch": 0.007996392604839923, + "flos": 19026335710080.0, + "grad_norm": 2.3427485077785923, + "language_loss": 0.95368028, + "learning_rate": 3.1486619643025565e-06, + "loss": 0.97994578, + "num_input_tokens_seen": 2775090, + "router_z_loss_clip": 3.42773438, + "router_z_loss_mlp": 0.62548828, + "step": 133, + "time_per_iteration": 2.5123353004455566 + }, + { + "auxiliary_loss_clip": 0.01493201, + "auxiliary_loss_mlp": 0.01127706, + "balance_loss_clip": 1.16341281, + "balance_loss_mlp": 1.06667089, + "epoch": 0.008056515857507891, + "flos": 25484151306240.0, + "grad_norm": 1.9288168054304187, + "language_loss": 0.734074, + "learning_rate": 3.153484849651286e-06, + "loss": 0.76028299, + "num_input_tokens_seen": 2795320, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.60986328, + "step": 134, + "time_per_iteration": 2.6709542274475098 + }, + { + "auxiliary_loss_clip": 0.01489829, + "auxiliary_loss_mlp": 0.01133309, + "balance_loss_clip": 1.15239573, + "balance_loss_mlp": 1.06645668, + "epoch": 0.00811663911017586, + "flos": 20557566541440.0, + "grad_norm": 3.3312107063932213, + "language_loss": 0.88764954, + "learning_rate": 3.1582718767847806e-06, + "loss": 0.91388094, + "num_input_tokens_seen": 2812815, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 0.66845703, + "step": 135, + "time_per_iteration": 2.516296625137329 + }, + { + "auxiliary_loss_clip": 0.01493598, + "auxiliary_loss_mlp": 0.0113286, + "balance_loss_clip": 1.15581894, + "balance_loss_mlp": 1.06719995, + "epoch": 0.00817676236284383, + "flos": 18799738761600.0, + "grad_norm": 2.0284523290681897, + "language_loss": 0.88864899, + "learning_rate": 3.1630235749828485e-06, + "loss": 0.91491359, + "num_input_tokens_seen": 2830445, + "router_z_loss_clip": 3.37890625, + "router_z_loss_mlp": 0.65673828, + "step": 136, + "time_per_iteration": 2.59936785697937 + }, + { + "auxiliary_loss_clip": 0.01489627, + "auxiliary_loss_mlp": 0.01110363, + "balance_loss_clip": 1.15081465, + "balance_loss_mlp": 1.04985189, + "epoch": 0.008236885615511799, + "flos": 23873593288320.0, + "grad_norm": 2.190595298966038, + "language_loss": 0.84031188, + "learning_rate": 3.1677404618925676e-06, + "loss": 0.86631173, + "num_input_tokens_seen": 2846965, + "router_z_loss_clip": 3.38671875, + "router_z_loss_mlp": 0.60473633, + "step": 137, + "time_per_iteration": 2.5928525924682617 + }, + { + "auxiliary_loss_clip": 0.01487042, + "auxiliary_loss_mlp": 0.01118084, + "balance_loss_clip": 1.15039778, + "balance_loss_mlp": 1.05714393, + "epoch": 0.00829700886817977, + "flos": 24643626076800.0, + "grad_norm": 1.774853617958662, + "language_loss": 0.90161937, + "learning_rate": 3.1724230438666953e-06, + "loss": 0.92767066, + "num_input_tokens_seen": 2867520, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.60986328, + "step": 138, + "time_per_iteration": 2.5749876499176025 + }, + { + "auxiliary_loss_clip": 0.01476785, + "auxiliary_loss_mlp": 0.01125725, + "balance_loss_clip": 1.14920878, + "balance_loss_mlp": 1.05996966, + "epoch": 0.008357132120847738, + "flos": 25262007644160.0, + "grad_norm": 2.817067762095726, + "language_loss": 0.91286075, + "learning_rate": 3.177071816289865e-06, + "loss": 0.93888593, + "num_input_tokens_seen": 2885675, + "router_z_loss_clip": 3.27539062, + "router_z_loss_mlp": 0.65771484, + "step": 139, + "time_per_iteration": 2.616903781890869 + }, + { + "auxiliary_loss_clip": 0.01490867, + "auxiliary_loss_mlp": 0.01125228, + "balance_loss_clip": 1.15511584, + "balance_loss_mlp": 1.06171286, + "epoch": 0.008417255373515706, + "flos": 27344898529920.0, + "grad_norm": 2.339009684732292, + "language_loss": 0.85503203, + "learning_rate": 3.181687263893095e-06, + "loss": 0.88119298, + "num_input_tokens_seen": 2905960, + "router_z_loss_clip": 3.359375, + "router_z_loss_mlp": 0.63525391, + "step": 140, + "time_per_iteration": 2.5938096046447754 + }, + { + "auxiliary_loss_clip": 0.01478055, + "auxiliary_loss_mlp": 0.01122406, + "balance_loss_clip": 1.14980888, + "balance_loss_mlp": 1.06041694, + "epoch": 0.008477378626183677, + "flos": 17639070589440.0, + "grad_norm": 2.234121381844106, + "language_loss": 0.84369636, + "learning_rate": 3.186269861057098e-06, + "loss": 0.86970103, + "num_input_tokens_seen": 2922780, + "router_z_loss_clip": 3.28515625, + "router_z_loss_mlp": 0.61962891, + "step": 141, + "time_per_iteration": 2.6905977725982666 + }, + { + "auxiliary_loss_clip": 0.01481803, + "auxiliary_loss_mlp": 0.01133515, + "balance_loss_clip": 1.1470437, + "balance_loss_mlp": 1.07081115, + "epoch": 0.008537501878851645, + "flos": 13881342297600.0, + "grad_norm": 2.181971541732563, + "language_loss": 0.81167108, + "learning_rate": 3.1908200721048745e-06, + "loss": 0.83782423, + "num_input_tokens_seen": 2938765, + "router_z_loss_clip": 3.34570312, + "router_z_loss_mlp": 0.62695312, + "step": 142, + "time_per_iteration": 2.4704389572143555 + }, + { + "auxiliary_loss_clip": 0.01382211, + "auxiliary_loss_mlp": 0.0102761, + "balance_loss_clip": 1.19546735, + "balance_loss_mlp": 1.00124133, + "epoch": 0.008597625131519616, + "flos": 71248101281280.0, + "grad_norm": 1.0691340669362306, + "language_loss": 0.6693145, + "learning_rate": 3.195338351584042e-06, + "loss": 0.69341266, + "num_input_tokens_seen": 3006665, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.26416016, + "step": 143, + "time_per_iteration": 3.2691643238067627 + }, + { + "auxiliary_loss_clip": 0.01473707, + "auxiliary_loss_mlp": 0.01126109, + "balance_loss_clip": 1.14716172, + "balance_loss_mlp": 1.06435847, + "epoch": 0.008657748384187584, + "flos": 17602836744960.0, + "grad_norm": 2.079772248444265, + "language_loss": 0.83946687, + "learning_rate": 3.1998251445393258e-06, + "loss": 0.86546504, + "num_input_tokens_seen": 3024335, + "router_z_loss_clip": 3.26953125, + "router_z_loss_mlp": 0.61767578, + "step": 144, + "time_per_iteration": 2.5053513050079346 + }, + { + "auxiliary_loss_clip": 0.0146438, + "auxiliary_loss_mlp": 0.01114994, + "balance_loss_clip": 1.1419642, + "balance_loss_mlp": 1.05014396, + "epoch": 0.008717871636855555, + "flos": 19715317459200.0, + "grad_norm": 2.0696229633557097, + "language_loss": 0.88524246, + "learning_rate": 3.204280886775619e-06, + "loss": 0.91103613, + "num_input_tokens_seen": 3043300, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.6484375, + "step": 145, + "time_per_iteration": 2.5453848838806152 + }, + { + "auxiliary_loss_clip": 0.01475639, + "auxiliary_loss_mlp": 0.011299, + "balance_loss_clip": 1.14246154, + "balance_loss_mlp": 1.06547904, + "epoch": 0.008777994889523523, + "flos": 24717422568960.0, + "grad_norm": 2.6360207623552085, + "language_loss": 0.85826385, + "learning_rate": 3.208706005112005e-06, + "loss": 0.88431925, + "num_input_tokens_seen": 3064610, + "router_z_loss_clip": 3.33398438, + "router_z_loss_mlp": 0.64404297, + "step": 146, + "time_per_iteration": 2.5634846687316895 + }, + { + "auxiliary_loss_clip": 0.0136572, + "auxiliary_loss_mlp": 0.01026605, + "balance_loss_clip": 1.18448973, + "balance_loss_mlp": 1.00052249, + "epoch": 0.008838118142191492, + "flos": 70132067758080.0, + "grad_norm": 0.8594018233777897, + "language_loss": 0.60104454, + "learning_rate": 3.213100917627104e-06, + "loss": 0.62496781, + "num_input_tokens_seen": 3130385, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.26123047, + "step": 147, + "time_per_iteration": 3.2152035236358643 + }, + { + "auxiliary_loss_clip": 0.014682, + "auxiliary_loss_mlp": 0.01125599, + "balance_loss_clip": 1.14672542, + "balance_loss_mlp": 1.06647158, + "epoch": 0.008898241394859462, + "flos": 20044797937920.0, + "grad_norm": 1.9293864581954705, + "language_loss": 0.84808123, + "learning_rate": 3.2174660338961135e-06, + "loss": 0.87401921, + "num_input_tokens_seen": 3149760, + "router_z_loss_clip": 3.2109375, + "router_z_loss_mlp": 0.59082031, + "step": 148, + "time_per_iteration": 2.508284091949463 + }, + { + "auxiliary_loss_clip": 0.01472456, + "auxiliary_loss_mlp": 0.01144204, + "balance_loss_clip": 1.14709568, + "balance_loss_mlp": 1.07859087, + "epoch": 0.008958364647527431, + "flos": 10743611685120.0, + "grad_norm": 2.141412926674079, + "language_loss": 0.88777399, + "learning_rate": 3.2218017552198588e-06, + "loss": 0.91394055, + "num_input_tokens_seen": 3164500, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.65673828, + "step": 149, + "time_per_iteration": 4.022315979003906 + }, + { + "auxiliary_loss_clip": 0.01470533, + "auxiliary_loss_mlp": 0.01111448, + "balance_loss_clip": 1.14329672, + "balance_loss_mlp": 1.05217719, + "epoch": 0.009018487900195401, + "flos": 29127467802240.0, + "grad_norm": 1.8956218834332368, + "language_loss": 0.93039846, + "learning_rate": 3.226108474846181e-06, + "loss": 0.9562183, + "num_input_tokens_seen": 3182455, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.59277344, + "step": 150, + "time_per_iteration": 5.331084251403809 + }, + { + "auxiliary_loss_clip": 0.01456591, + "auxiliary_loss_mlp": 0.01112075, + "balance_loss_clip": 1.13778687, + "balance_loss_mlp": 1.05452085, + "epoch": 0.00907861115286337, + "flos": 32963661354240.0, + "grad_norm": 2.792776397367251, + "language_loss": 0.74236298, + "learning_rate": 3.2303865781839817e-06, + "loss": 0.76804966, + "num_input_tokens_seen": 3203995, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.57568359, + "step": 151, + "time_per_iteration": 2.6747825145721436 + }, + { + "auxiliary_loss_clip": 0.01470063, + "auxiliary_loss_mlp": 0.01125418, + "balance_loss_clip": 1.1418488, + "balance_loss_mlp": 1.06552756, + "epoch": 0.009138734405531338, + "flos": 21762441377280.0, + "grad_norm": 6.924018040886112, + "language_loss": 0.88188607, + "learning_rate": 3.234636443010188e-06, + "loss": 0.90784085, + "num_input_tokens_seen": 3222575, + "router_z_loss_clip": 3.28320312, + "router_z_loss_mlp": 0.59912109, + "step": 152, + "time_per_iteration": 2.5093326568603516 + }, + { + "auxiliary_loss_clip": 0.01466518, + "auxiliary_loss_mlp": 0.01126602, + "balance_loss_clip": 1.14733756, + "balance_loss_mlp": 1.06480336, + "epoch": 0.009198857658199309, + "flos": 20842517134080.0, + "grad_norm": 2.953080160831717, + "language_loss": 0.84163058, + "learning_rate": 3.238858439669943e-06, + "loss": 0.86756182, + "num_input_tokens_seen": 3240180, + "router_z_loss_clip": 3.19140625, + "router_z_loss_mlp": 0.61767578, + "step": 153, + "time_per_iteration": 2.5877561569213867 + }, + { + "auxiliary_loss_clip": 0.01457341, + "auxiliary_loss_mlp": 0.01133887, + "balance_loss_clip": 1.13884425, + "balance_loss_mlp": 1.07237506, + "epoch": 0.009258980910867277, + "flos": 24827381078400.0, + "grad_norm": 2.0428597927635446, + "language_loss": 0.89989489, + "learning_rate": 3.2430529312702712e-06, + "loss": 0.92580724, + "num_input_tokens_seen": 3259800, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.61474609, + "step": 154, + "time_per_iteration": 2.57311749458313 + }, + { + "auxiliary_loss_clip": 0.01462234, + "auxiliary_loss_mlp": 0.01156095, + "balance_loss_clip": 1.1419543, + "balance_loss_mlp": 1.0956558, + "epoch": 0.009319104163535248, + "flos": 28767786963840.0, + "grad_norm": 2.0072296271745813, + "language_loss": 0.89699543, + "learning_rate": 3.2472202738674737e-06, + "loss": 0.92317867, + "num_input_tokens_seen": 3280400, + "router_z_loss_clip": 3.203125, + "router_z_loss_mlp": 0.60424805, + "step": 155, + "time_per_iteration": 2.670257806777954 + }, + { + "auxiliary_loss_clip": 0.01464654, + "auxiliary_loss_mlp": 0.01120315, + "balance_loss_clip": 1.13919425, + "balance_loss_mlp": 1.06156921, + "epoch": 0.009379227416203216, + "flos": 16582004219520.0, + "grad_norm": 2.9572684634086053, + "language_loss": 0.86826289, + "learning_rate": 3.2513608166485063e-06, + "loss": 0.89411259, + "num_input_tokens_seen": 3297600, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.58642578, + "step": 156, + "time_per_iteration": 2.5838682651519775 + }, + { + "auxiliary_loss_clip": 0.01464896, + "auxiliary_loss_mlp": 0.01120237, + "balance_loss_clip": 1.14320445, + "balance_loss_mlp": 1.06125224, + "epoch": 0.009439350668871187, + "flos": 18329919845760.0, + "grad_norm": 2.285024053831635, + "language_loss": 0.99677092, + "learning_rate": 3.2554749021065498e-06, + "loss": 1.02262235, + "num_input_tokens_seen": 3313635, + "router_z_loss_clip": 3.22070312, + "router_z_loss_mlp": 0.58984375, + "step": 157, + "time_per_iteration": 2.5491034984588623 + }, + { + "auxiliary_loss_clip": 0.01446917, + "auxiliary_loss_mlp": 0.01144337, + "balance_loss_clip": 1.1372602, + "balance_loss_mlp": 1.08487558, + "epoch": 0.009499473921539155, + "flos": 24349912565760.0, + "grad_norm": 1.941904449959261, + "language_loss": 0.88430297, + "learning_rate": 3.2595628662110186e-06, + "loss": 0.9102155, + "num_input_tokens_seen": 3333735, + "router_z_loss_clip": 3.09570312, + "router_z_loss_mlp": 0.59472656, + "step": 158, + "time_per_iteration": 2.559807538986206 + }, + { + "auxiliary_loss_clip": 0.01454181, + "auxiliary_loss_mlp": 0.01126498, + "balance_loss_clip": 1.1373775, + "balance_loss_mlp": 1.06598735, + "epoch": 0.009559597174207124, + "flos": 16399326625920.0, + "grad_norm": 2.5961482679584122, + "language_loss": 0.8658967, + "learning_rate": 3.2636250385721982e-06, + "loss": 0.89170349, + "num_input_tokens_seen": 3348800, + "router_z_loss_clip": 3.16796875, + "router_z_loss_mlp": 0.60546875, + "step": 159, + "time_per_iteration": 2.536163806915283 + }, + { + "auxiliary_loss_clip": 0.01443144, + "auxiliary_loss_mlp": 0.01136632, + "balance_loss_clip": 1.13224447, + "balance_loss_mlp": 1.075454, + "epoch": 0.009619720426875094, + "flos": 22856890826880.0, + "grad_norm": 1.909375632521051, + "language_loss": 0.86625338, + "learning_rate": 3.2676617426007263e-06, + "loss": 0.8920511, + "num_input_tokens_seen": 3368595, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.61230469, + "step": 160, + "time_per_iteration": 2.5059242248535156 + }, + { + "auxiliary_loss_clip": 0.01449735, + "auxiliary_loss_mlp": 0.01121477, + "balance_loss_clip": 1.13767958, + "balance_loss_mlp": 1.06525767, + "epoch": 0.009679843679543063, + "flos": 19135001329920.0, + "grad_norm": 2.3963288739671507, + "language_loss": 0.91324407, + "learning_rate": 3.2716732956621042e-06, + "loss": 0.93895614, + "num_input_tokens_seen": 3384975, + "router_z_loss_clip": 3.12109375, + "router_z_loss_mlp": 0.56201172, + "step": 161, + "time_per_iteration": 2.560790777206421 + }, + { + "auxiliary_loss_clip": 0.01455692, + "auxiliary_loss_mlp": 0.01112868, + "balance_loss_clip": 1.13799143, + "balance_loss_mlp": 1.05645764, + "epoch": 0.009739966932211033, + "flos": 20302995876480.0, + "grad_norm": 3.4703688138904143, + "language_loss": 0.91567618, + "learning_rate": 3.2756600092264203e-06, + "loss": 0.94136178, + "num_input_tokens_seen": 3404755, + "router_z_loss_clip": 3.17773438, + "router_z_loss_mlp": 0.56396484, + "step": 162, + "time_per_iteration": 2.51029372215271 + }, + { + "auxiliary_loss_clip": 0.01315382, + "auxiliary_loss_mlp": 0.01029599, + "balance_loss_clip": 1.14674449, + "balance_loss_mlp": 1.00566149, + "epoch": 0.009800090184879002, + "flos": 67034234177280.0, + "grad_norm": 1.1778117936195802, + "language_loss": 0.72357321, + "learning_rate": 3.279622189013474e-06, + "loss": 0.74702299, + "num_input_tokens_seen": 3467210, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.23925781, + "step": 163, + "time_per_iteration": 3.1917357444763184 + }, + { + "auxiliary_loss_clip": 0.01440984, + "auxiliary_loss_mlp": 0.0111667, + "balance_loss_clip": 1.13435841, + "balance_loss_mlp": 1.05897307, + "epoch": 0.00986021343754697, + "flos": 17164690646400.0, + "grad_norm": 2.0588751892134307, + "language_loss": 0.84509939, + "learning_rate": 3.283560135133457e-06, + "loss": 0.87067592, + "num_input_tokens_seen": 3483220, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.57617188, + "step": 164, + "time_per_iteration": 2.727854013442993 + }, + { + "auxiliary_loss_clip": 0.01431609, + "auxiliary_loss_mlp": 0.01104305, + "balance_loss_clip": 1.12625504, + "balance_loss_mlp": 1.04813409, + "epoch": 0.00992033669021494, + "flos": 17749424148480.0, + "grad_norm": 2.223540337354243, + "language_loss": 0.89172584, + "learning_rate": 3.2874741422233565e-06, + "loss": 0.91708499, + "num_input_tokens_seen": 3501465, + "router_z_loss_clip": 3.05859375, + "router_z_loss_mlp": 0.56152344, + "step": 165, + "time_per_iteration": 2.5965805053710938 + }, + { + "auxiliary_loss_clip": 0.01433799, + "auxiliary_loss_mlp": 0.01131197, + "balance_loss_clip": 1.12628245, + "balance_loss_mlp": 1.07087719, + "epoch": 0.00998045994288291, + "flos": 25297164080640.0, + "grad_norm": 1.702530379291079, + "language_loss": 0.79601693, + "learning_rate": 3.2913644995792465e-06, + "loss": 0.82166696, + "num_input_tokens_seen": 3520480, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.60351562, + "step": 166, + "time_per_iteration": 2.685149669647217 + }, + { + "auxiliary_loss_clip": 0.01439996, + "auxiliary_loss_mlp": 0.01128174, + "balance_loss_clip": 1.13065207, + "balance_loss_mlp": 1.06828308, + "epoch": 0.01004058319555088, + "flos": 32298954220800.0, + "grad_norm": 3.285357357204044, + "language_loss": 0.91920507, + "learning_rate": 3.2952314912845914e-06, + "loss": 0.9448868, + "num_input_tokens_seen": 3539570, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.59863281, + "step": 167, + "time_per_iteration": 2.6246297359466553 + }, + { + "auxiliary_loss_clip": 0.01429769, + "auxiliary_loss_mlp": 0.01136887, + "balance_loss_clip": 1.12783408, + "balance_loss_mlp": 1.08019114, + "epoch": 0.010100706448218848, + "flos": 11319941404800.0, + "grad_norm": 2.567250563987067, + "language_loss": 0.90633631, + "learning_rate": 3.299075396334735e-06, + "loss": 0.93200284, + "num_input_tokens_seen": 3555465, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.56689453, + "step": 168, + "time_per_iteration": 2.5182180404663086 + }, + { + "auxiliary_loss_clip": 0.01423716, + "auxiliary_loss_mlp": 0.0110448, + "balance_loss_clip": 1.12363386, + "balance_loss_mlp": 1.04659235, + "epoch": 0.010160829700886819, + "flos": 29719491765120.0, + "grad_norm": 1.7056977792480135, + "language_loss": 0.87062156, + "learning_rate": 3.3028964887576868e-06, + "loss": 0.89590359, + "num_input_tokens_seen": 3578970, + "router_z_loss_clip": 3.00585938, + "router_z_loss_mlp": 0.57910156, + "step": 169, + "time_per_iteration": 2.6546707153320312 + }, + { + "auxiliary_loss_clip": 0.01426567, + "auxiliary_loss_mlp": 0.01111899, + "balance_loss_clip": 1.12636578, + "balance_loss_mlp": 1.05401063, + "epoch": 0.010220952953554787, + "flos": 20412343854720.0, + "grad_norm": 1.913074243465345, + "language_loss": 0.84669828, + "learning_rate": 3.306695037731344e-06, + "loss": 0.87208295, + "num_input_tokens_seen": 3597275, + "router_z_loss_clip": 3.00390625, + "router_z_loss_mlp": 0.57910156, + "step": 170, + "time_per_iteration": 2.56455397605896 + }, + { + "auxiliary_loss_clip": 0.01437218, + "auxiliary_loss_mlp": 0.01134642, + "balance_loss_clip": 1.1265862, + "balance_loss_mlp": 1.07632506, + "epoch": 0.010281076206222756, + "flos": 31285124847360.0, + "grad_norm": 2.2749244428270035, + "language_loss": 0.89829373, + "learning_rate": 3.3104713076972827e-06, + "loss": 0.9240123, + "num_input_tokens_seen": 3618905, + "router_z_loss_clip": 3.10351562, + "router_z_loss_mlp": 0.58300781, + "step": 171, + "time_per_iteration": 2.5768661499023438 + }, + { + "auxiliary_loss_clip": 0.01429899, + "auxiliary_loss_mlp": 0.01109739, + "balance_loss_clip": 1.12876725, + "balance_loss_mlp": 1.05425954, + "epoch": 0.010341199458890726, + "flos": 21982286568960.0, + "grad_norm": 1.8757665274956918, + "language_loss": 0.89040387, + "learning_rate": 3.314225558471224e-06, + "loss": 0.91580027, + "num_input_tokens_seen": 3639610, + "router_z_loss_clip": 3.01757812, + "router_z_loss_mlp": 0.55493164, + "step": 172, + "time_per_iteration": 2.5970547199249268 + }, + { + "auxiliary_loss_clip": 0.01418588, + "auxiliary_loss_mlp": 0.0112636, + "balance_loss_clip": 1.12194073, + "balance_loss_mlp": 1.06980705, + "epoch": 0.010401322711558695, + "flos": 30810529422720.0, + "grad_norm": 2.306627813955057, + "language_loss": 0.80911279, + "learning_rate": 3.317958045350308e-06, + "loss": 0.83456224, + "num_input_tokens_seen": 3664030, + "router_z_loss_clip": 2.96484375, + "router_z_loss_mlp": 0.56518555, + "step": 173, + "time_per_iteration": 2.589643716812134 + }, + { + "auxiliary_loss_clip": 0.01431332, + "auxiliary_loss_mlp": 0.0110778, + "balance_loss_clip": 1.12743521, + "balance_loss_mlp": 1.05404043, + "epoch": 0.010461445964226665, + "flos": 24715124098560.0, + "grad_norm": 2.9099897800739045, + "language_loss": 0.82524335, + "learning_rate": 3.3216690192172596e-06, + "loss": 0.85063446, + "num_input_tokens_seen": 3683615, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.53759766, + "step": 174, + "time_per_iteration": 2.6023941040039062 + }, + { + "auxiliary_loss_clip": 0.01425182, + "auxiliary_loss_mlp": 0.01123517, + "balance_loss_clip": 1.12396479, + "balance_loss_mlp": 1.06753671, + "epoch": 0.010521569216894634, + "flos": 27710361457920.0, + "grad_norm": 2.356692831933001, + "language_loss": 0.72777462, + "learning_rate": 3.325358726641591e-06, + "loss": 0.75326163, + "num_input_tokens_seen": 3704540, + "router_z_loss_clip": 3.01171875, + "router_z_loss_mlp": 0.55932617, + "step": 175, + "time_per_iteration": 2.542300224304199 + }, + { + "auxiliary_loss_clip": 0.01427061, + "auxiliary_loss_mlp": 0.01132545, + "balance_loss_clip": 1.12513113, + "balance_loss_mlp": 1.07427585, + "epoch": 0.010581692469562603, + "flos": 12458346122880.0, + "grad_norm": 2.785719466227258, + "language_loss": 0.97908264, + "learning_rate": 3.329027409977902e-06, + "loss": 1.00467873, + "num_input_tokens_seen": 3721320, + "router_z_loss_clip": 3.01757812, + "router_z_loss_mlp": 0.58300781, + "step": 176, + "time_per_iteration": 2.5059375762939453 + }, + { + "auxiliary_loss_clip": 0.01412415, + "auxiliary_loss_mlp": 0.01132607, + "balance_loss_clip": 1.12127256, + "balance_loss_mlp": 1.07853317, + "epoch": 0.010641815722230573, + "flos": 19427601519360.0, + "grad_norm": 2.3428476488020062, + "language_loss": 0.77001047, + "learning_rate": 3.3326753074614087e-06, + "loss": 0.7954607, + "num_input_tokens_seen": 3739385, + "router_z_loss_clip": 2.91210938, + "router_z_loss_mlp": 0.54052734, + "step": 177, + "time_per_iteration": 2.529176712036133 + }, + { + "auxiliary_loss_clip": 0.01421557, + "auxiliary_loss_mlp": 0.01103898, + "balance_loss_clip": 1.11950731, + "balance_loss_mlp": 1.04903758, + "epoch": 0.010701938974898541, + "flos": 18332577452160.0, + "grad_norm": 2.575310920775973, + "language_loss": 0.76822972, + "learning_rate": 3.3363026533007716e-06, + "loss": 0.79348421, + "num_input_tokens_seen": 3756360, + "router_z_loss_clip": 3.01757812, + "router_z_loss_mlp": 0.5480957, + "step": 178, + "time_per_iteration": 2.5914723873138428 + }, + { + "auxiliary_loss_clip": 0.01430825, + "auxiliary_loss_mlp": 0.01116754, + "balance_loss_clip": 1.1257534, + "balance_loss_mlp": 1.05743504, + "epoch": 0.010762062227566512, + "flos": 19203985399680.0, + "grad_norm": 2.4347432371931967, + "language_loss": 0.84217024, + "learning_rate": 3.3399096777683303e-06, + "loss": 0.86764598, + "num_input_tokens_seen": 3773930, + "router_z_loss_clip": 3.046875, + "router_z_loss_mlp": 0.59326172, + "step": 179, + "time_per_iteration": 2.6074578762054443 + }, + { + "auxiliary_loss_clip": 0.01421804, + "auxiliary_loss_mlp": 0.0111223, + "balance_loss_clip": 1.11901486, + "balance_loss_mlp": 1.05534303, + "epoch": 0.01082218548023448, + "flos": 31425427370880.0, + "grad_norm": 2.0208039729811413, + "language_loss": 0.83546066, + "learning_rate": 3.3434966072878213e-06, + "loss": 0.86080104, + "num_input_tokens_seen": 3793630, + "router_z_loss_clip": 3.03125, + "router_z_loss_mlp": 0.56884766, + "step": 180, + "time_per_iteration": 2.6297781467437744 + }, + { + "auxiliary_loss_clip": 0.01420465, + "auxiliary_loss_mlp": 0.01122612, + "balance_loss_clip": 1.12126422, + "balance_loss_mlp": 1.0666554, + "epoch": 0.01088230873290245, + "flos": 25046436170880.0, + "grad_norm": 2.408766657106649, + "language_loss": 0.77793449, + "learning_rate": 3.3470636645196674e-06, + "loss": 0.80336529, + "num_input_tokens_seen": 3813610, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.55908203, + "step": 181, + "time_per_iteration": 2.5219783782958984 + }, + { + "auxiliary_loss_clip": 0.01418871, + "auxiliary_loss_mlp": 0.01131308, + "balance_loss_clip": 1.11858737, + "balance_loss_mlp": 1.07632899, + "epoch": 0.01094243198557042, + "flos": 22893411980160.0, + "grad_norm": 4.259908104846668, + "language_loss": 0.76172382, + "learning_rate": 3.3506110684439156e-06, + "loss": 0.7872256, + "num_input_tokens_seen": 3831390, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.54907227, + "step": 182, + "time_per_iteration": 2.5760631561279297 + }, + { + "auxiliary_loss_clip": 0.01412771, + "auxiliary_loss_mlp": 0.01124838, + "balance_loss_clip": 1.11738753, + "balance_loss_mlp": 1.06818962, + "epoch": 0.011002555238238388, + "flos": 17165049782400.0, + "grad_norm": 2.090765318034416, + "language_loss": 0.87494057, + "learning_rate": 3.3541390344409054e-06, + "loss": 0.9003166, + "num_input_tokens_seen": 3849705, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.56689453, + "step": 183, + "time_per_iteration": 2.5103540420532227 + }, + { + "auxiliary_loss_clip": 0.01415579, + "auxiliary_loss_mlp": 0.01113421, + "balance_loss_clip": 1.12026143, + "balance_loss_mlp": 1.06199384, + "epoch": 0.011062678490906358, + "flos": 22310150935680.0, + "grad_norm": 2.2807129693658448, + "language_loss": 0.86734462, + "learning_rate": 3.357647774369736e-06, + "loss": 0.89263463, + "num_input_tokens_seen": 3869230, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 0.5144043, + "step": 184, + "time_per_iteration": 2.542156219482422 + }, + { + "auxiliary_loss_clip": 0.01411488, + "auxiliary_loss_mlp": 0.01109242, + "balance_loss_clip": 1.12042141, + "balance_loss_mlp": 1.05173492, + "epoch": 0.011122801743574327, + "flos": 24388373053440.0, + "grad_norm": 1.8812199636513467, + "language_loss": 0.83628476, + "learning_rate": 3.3611374966446085e-06, + "loss": 0.86149204, + "num_input_tokens_seen": 3889735, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.57519531, + "step": 185, + "time_per_iteration": 2.573418140411377 + }, + { + "auxiliary_loss_clip": 0.01422132, + "auxiliary_loss_mlp": 0.01111506, + "balance_loss_clip": 1.11949944, + "balance_loss_mlp": 1.0537132, + "epoch": 0.011182924996242297, + "flos": 18150258994560.0, + "grad_norm": 2.466443749097042, + "language_loss": 0.71010017, + "learning_rate": 3.3646084063091142e-06, + "loss": 0.73543662, + "num_input_tokens_seen": 3908855, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.57714844, + "step": 186, + "time_per_iteration": 2.5790724754333496 + }, + { + "auxiliary_loss_clip": 0.01416798, + "auxiliary_loss_mlp": 0.01105235, + "balance_loss_clip": 1.11872387, + "balance_loss_mlp": 1.05283105, + "epoch": 0.011243048248910266, + "flos": 15486800584320.0, + "grad_norm": 2.385734209647237, + "language_loss": 1.01942909, + "learning_rate": 3.3680607051085194e-06, + "loss": 1.04464936, + "num_input_tokens_seen": 3923865, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.52416992, + "step": 187, + "time_per_iteration": 2.4906938076019287 + }, + { + "auxiliary_loss_clip": 0.01405993, + "auxiliary_loss_mlp": 0.01110122, + "balance_loss_clip": 1.11812925, + "balance_loss_mlp": 1.05504727, + "epoch": 0.011303171501578235, + "flos": 40916868986880.0, + "grad_norm": 1.826891216085855, + "language_loss": 0.75103378, + "learning_rate": 3.371494591560139e-06, + "loss": 0.77619493, + "num_input_tokens_seen": 3946870, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.55126953, + "step": 188, + "time_per_iteration": 2.6977083683013916 + }, + { + "auxiliary_loss_clip": 0.01296958, + "auxiliary_loss_mlp": 0.01024038, + "balance_loss_clip": 1.13798237, + "balance_loss_mlp": 1.00329518, + "epoch": 0.011363294754246205, + "flos": 66302697790080.0, + "grad_norm": 0.7654687407262383, + "language_loss": 0.56257772, + "learning_rate": 3.3749102610218297e-06, + "loss": 0.58578765, + "num_input_tokens_seen": 4010005, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.20751953, + "step": 189, + "time_per_iteration": 3.1977689266204834 + }, + { + "auxiliary_loss_clip": 0.01404913, + "auxiliary_loss_mlp": 0.0112503, + "balance_loss_clip": 1.11407113, + "balance_loss_mlp": 1.0672369, + "epoch": 0.011423418006914174, + "flos": 24900279730560.0, + "grad_norm": 2.6165524094519284, + "language_loss": 0.94769591, + "learning_rate": 3.3783079057586833e-06, + "loss": 0.9729954, + "num_input_tokens_seen": 4029035, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.578125, + "step": 190, + "time_per_iteration": 2.5263302326202393 + }, + { + "auxiliary_loss_clip": 0.01408242, + "auxiliary_loss_mlp": 0.01101286, + "balance_loss_clip": 1.11695421, + "balance_loss_mlp": 1.04926276, + "epoch": 0.011483541259582144, + "flos": 19791879298560.0, + "grad_norm": 3.2520757297266587, + "language_loss": 0.8437171, + "learning_rate": 3.3816877150079665e-06, + "loss": 0.86881244, + "num_input_tokens_seen": 4046995, + "router_z_loss_clip": 2.91210938, + "router_z_loss_mlp": 0.52075195, + "step": 191, + "time_per_iteration": 2.620990514755249 + }, + { + "auxiliary_loss_clip": 0.01407855, + "auxiliary_loss_mlp": 0.01125874, + "balance_loss_clip": 1.11314535, + "balance_loss_mlp": 1.07344604, + "epoch": 0.011543664512250112, + "flos": 26176939896960.0, + "grad_norm": 1.869161963688313, + "language_loss": 0.91666716, + "learning_rate": 3.385049875042367e-06, + "loss": 0.94200444, + "num_input_tokens_seen": 4065865, + "router_z_loss_clip": 2.94921875, + "router_z_loss_mlp": 0.5246582, + "step": 192, + "time_per_iteration": 3.983130693435669 + }, + { + "auxiliary_loss_clip": 0.01400728, + "auxiliary_loss_mlp": 0.01119862, + "balance_loss_clip": 1.11384225, + "balance_loss_mlp": 1.06292725, + "epoch": 0.011603787764918083, + "flos": 23768985905280.0, + "grad_norm": 2.4156468447707846, + "language_loss": 0.86897415, + "learning_rate": 3.3883945692315938e-06, + "loss": 0.89418006, + "num_input_tokens_seen": 4085305, + "router_z_loss_clip": 2.86914062, + "router_z_loss_mlp": 0.56958008, + "step": 193, + "time_per_iteration": 6.7199952602386475 + }, + { + "auxiliary_loss_clip": 0.01405044, + "auxiliary_loss_mlp": 0.01100075, + "balance_loss_clip": 1.11182404, + "balance_loss_mlp": 1.04838586, + "epoch": 0.011663911017586051, + "flos": 25954688494080.0, + "grad_norm": 2.066929902814091, + "language_loss": 0.92114508, + "learning_rate": 3.3917219781023906e-06, + "loss": 0.94619626, + "num_input_tokens_seen": 4105185, + "router_z_loss_clip": 2.9296875, + "router_z_loss_mlp": 0.51733398, + "step": 194, + "time_per_iteration": 2.5127573013305664 + }, + { + "auxiliary_loss_clip": 0.01408755, + "auxiliary_loss_mlp": 0.01109576, + "balance_loss_clip": 1.11616957, + "balance_loss_mlp": 1.05671823, + "epoch": 0.01172403427025402, + "flos": 17895149625600.0, + "grad_norm": 3.0020572115738604, + "language_loss": 0.89849424, + "learning_rate": 3.3950322793970014e-06, + "loss": 0.92367762, + "num_input_tokens_seen": 4123160, + "router_z_loss_clip": 2.92578125, + "router_z_loss_mlp": 0.52880859, + "step": 195, + "time_per_iteration": 2.504622459411621 + }, + { + "auxiliary_loss_clip": 0.01401845, + "auxiliary_loss_mlp": 0.01116363, + "balance_loss_clip": 1.11504519, + "balance_loss_mlp": 1.06069183, + "epoch": 0.01178415752292199, + "flos": 17894539094400.0, + "grad_norm": 2.3255245929580486, + "language_loss": 0.85964203, + "learning_rate": 3.3983256481301445e-06, + "loss": 0.88482416, + "num_input_tokens_seen": 4140425, + "router_z_loss_clip": 2.87304688, + "router_z_loss_mlp": 0.55688477, + "step": 196, + "time_per_iteration": 2.472382068634033 + }, + { + "auxiliary_loss_clip": 0.01397147, + "auxiliary_loss_mlp": 0.01106948, + "balance_loss_clip": 1.11088014, + "balance_loss_mlp": 1.05301785, + "epoch": 0.011844280775589959, + "flos": 22893555634560.0, + "grad_norm": 2.417570251372187, + "language_loss": 0.93188816, + "learning_rate": 3.4016022566445335e-06, + "loss": 0.95692909, + "num_input_tokens_seen": 4159555, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.53955078, + "step": 197, + "time_per_iteration": 2.5227229595184326 + }, + { + "auxiliary_loss_clip": 0.0139865, + "auxiliary_loss_mlp": 0.01112313, + "balance_loss_clip": 1.11339545, + "balance_loss_mlp": 1.05919313, + "epoch": 0.01190440402825793, + "flos": 26980333441920.0, + "grad_norm": 2.1851816317989967, + "language_loss": 0.79056251, + "learning_rate": 3.4048622746649966e-06, + "loss": 0.81567216, + "num_input_tokens_seen": 4180480, + "router_z_loss_clip": 2.8515625, + "router_z_loss_mlp": 0.53125, + "step": 198, + "time_per_iteration": 2.573976516723633 + }, + { + "auxiliary_loss_clip": 0.013923, + "auxiliary_loss_mlp": 0.01119825, + "balance_loss_clip": 1.11344934, + "balance_loss_mlp": 1.06758714, + "epoch": 0.011964527280925898, + "flos": 20521584092160.0, + "grad_norm": 2.222028516352342, + "language_loss": 0.88147962, + "learning_rate": 3.4081058693512278e-06, + "loss": 0.90660089, + "num_input_tokens_seen": 4198835, + "router_z_loss_clip": 2.78710938, + "router_z_loss_mlp": 0.52270508, + "step": 199, + "time_per_iteration": 2.5193402767181396 + }, + { + "auxiliary_loss_clip": 0.01407906, + "auxiliary_loss_mlp": 0.01125063, + "balance_loss_clip": 1.11744463, + "balance_loss_mlp": 1.0678668, + "epoch": 0.012024650533593867, + "flos": 27745984771200.0, + "grad_norm": 2.541591259650831, + "language_loss": 0.81181431, + "learning_rate": 3.411333205349222e-06, + "loss": 0.83714402, + "num_input_tokens_seen": 4219335, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.57177734, + "step": 200, + "time_per_iteration": 2.5499842166900635 + }, + { + "auxiliary_loss_clip": 0.01403518, + "auxiliary_loss_mlp": 0.01101574, + "balance_loss_clip": 1.11318803, + "balance_loss_mlp": 1.04793, + "epoch": 0.012084773786261837, + "flos": 10452017076480.0, + "grad_norm": 2.8218516375622884, + "language_loss": 0.87615693, + "learning_rate": 3.4145444448414217e-06, + "loss": 0.9012078, + "num_input_tokens_seen": 4236940, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.53710938, + "step": 201, + "time_per_iteration": 2.559450387954712 + }, + { + "auxiliary_loss_clip": 0.01400617, + "auxiliary_loss_mlp": 0.01110073, + "balance_loss_clip": 1.11492229, + "balance_loss_mlp": 1.05580878, + "epoch": 0.012144897038929806, + "flos": 23105751229440.0, + "grad_norm": 1.9407870938945906, + "language_loss": 0.84107566, + "learning_rate": 3.4177397475956223e-06, + "loss": 0.86618257, + "num_input_tokens_seen": 4256755, + "router_z_loss_clip": 2.85546875, + "router_z_loss_mlp": 0.54248047, + "step": 202, + "time_per_iteration": 2.5110719203948975 + }, + { + "auxiliary_loss_clip": 0.01390332, + "auxiliary_loss_mlp": 0.01108371, + "balance_loss_clip": 1.10826826, + "balance_loss_mlp": 1.05579925, + "epoch": 0.012205020291597776, + "flos": 21033203460480.0, + "grad_norm": 1.924991675243028, + "language_loss": 0.89945793, + "learning_rate": 3.4209192710126685e-06, + "loss": 0.92444491, + "num_input_tokens_seen": 4276505, + "router_z_loss_clip": 2.82226562, + "router_z_loss_mlp": 0.52563477, + "step": 203, + "time_per_iteration": 2.5636167526245117 + }, + { + "auxiliary_loss_clip": 0.01271705, + "auxiliary_loss_mlp": 0.0102145, + "balance_loss_clip": 1.11889958, + "balance_loss_mlp": 1.00201845, + "epoch": 0.012265143544265745, + "flos": 68447785075200.0, + "grad_norm": 1.0228540478070562, + "language_loss": 0.61259317, + "learning_rate": 3.4240831701729837e-06, + "loss": 0.63552469, + "num_input_tokens_seen": 4330965, + "router_z_loss_clip": 1.52929688, + "router_z_loss_mlp": 0.1940918, + "step": 204, + "time_per_iteration": 3.0556440353393555 + }, + { + "auxiliary_loss_clip": 0.01402051, + "auxiliary_loss_mlp": 0.01111786, + "balance_loss_clip": 1.11266565, + "balance_loss_mlp": 1.05888081, + "epoch": 0.012325266796933715, + "flos": 17019252478080.0, + "grad_norm": 2.138674503565941, + "language_loss": 0.91298199, + "learning_rate": 3.4272315978819516e-06, + "loss": 0.93812037, + "num_input_tokens_seen": 4348200, + "router_z_loss_clip": 2.89453125, + "router_z_loss_mlp": 0.52905273, + "step": 205, + "time_per_iteration": 2.475816249847412 + }, + { + "auxiliary_loss_clip": 0.01408014, + "auxiliary_loss_mlp": 0.01121433, + "balance_loss_clip": 1.11612797, + "balance_loss_mlp": 1.06712115, + "epoch": 0.012385390049601683, + "flos": 20190056538240.0, + "grad_norm": 2.1674575962624116, + "language_loss": 0.89111316, + "learning_rate": 3.4303647047142043e-06, + "loss": 0.91640759, + "num_input_tokens_seen": 4365460, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.54345703, + "step": 206, + "time_per_iteration": 2.5094592571258545 + }, + { + "auxiliary_loss_clip": 0.01397027, + "auxiliary_loss_mlp": 0.01100165, + "balance_loss_clip": 1.10976362, + "balance_loss_mlp": 1.04797542, + "epoch": 0.012445513302269652, + "flos": 16253134272000.0, + "grad_norm": 4.403378586884232, + "language_loss": 0.95559317, + "learning_rate": 3.43348263905683e-06, + "loss": 0.98056507, + "num_input_tokens_seen": 4383650, + "router_z_loss_clip": 2.87304688, + "router_z_loss_mlp": 0.52172852, + "step": 207, + "time_per_iteration": 2.5024523735046387 + }, + { + "auxiliary_loss_clip": 0.01397158, + "auxiliary_loss_mlp": 0.01116617, + "balance_loss_clip": 1.11558223, + "balance_loss_mlp": 1.06335461, + "epoch": 0.012505636554937622, + "flos": 23769380954880.0, + "grad_norm": 1.8447318312098773, + "language_loss": 0.75834441, + "learning_rate": 3.436585547151547e-06, + "loss": 0.78348219, + "num_input_tokens_seen": 4403765, + "router_z_loss_clip": 2.81835938, + "router_z_loss_mlp": 0.53320312, + "step": 208, + "time_per_iteration": 2.613272190093994 + }, + { + "auxiliary_loss_clip": 0.01383114, + "auxiliary_loss_mlp": 0.01108319, + "balance_loss_clip": 1.10860562, + "balance_loss_mlp": 1.05503201, + "epoch": 0.012565759807605591, + "flos": 30591546157440.0, + "grad_norm": 2.520811449861738, + "language_loss": 0.98570728, + "learning_rate": 3.4396735731358586e-06, + "loss": 1.01062155, + "num_input_tokens_seen": 4421935, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.53344727, + "step": 209, + "time_per_iteration": 2.6812217235565186 + }, + { + "auxiliary_loss_clip": 0.01390497, + "auxiliary_loss_mlp": 0.01115796, + "balance_loss_clip": 1.11021078, + "balance_loss_mlp": 1.06243801, + "epoch": 0.012625883060273561, + "flos": 40113511355520.0, + "grad_norm": 2.9457004014898343, + "language_loss": 0.85398519, + "learning_rate": 3.4427468590832302e-06, + "loss": 0.87904811, + "num_input_tokens_seen": 4441470, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.53393555, + "step": 210, + "time_per_iteration": 2.6623458862304688 + }, + { + "auxiliary_loss_clip": 0.01387838, + "auxiliary_loss_mlp": 0.01117362, + "balance_loss_clip": 1.10910463, + "balance_loss_mlp": 1.06696033, + "epoch": 0.01268600631294153, + "flos": 27089178629760.0, + "grad_norm": 2.766113689705472, + "language_loss": 0.97195625, + "learning_rate": 3.445805545042314e-06, + "loss": 0.9970082, + "num_input_tokens_seen": 4459950, + "router_z_loss_clip": 2.78710938, + "router_z_loss_mlp": 0.50463867, + "step": 211, + "time_per_iteration": 2.560655117034912 + }, + { + "auxiliary_loss_clip": 0.01396946, + "auxiliary_loss_mlp": 0.01119662, + "balance_loss_clip": 1.11426938, + "balance_loss_mlp": 1.06596994, + "epoch": 0.012746129565609499, + "flos": 16982767238400.0, + "grad_norm": 2.7047957665783096, + "language_loss": 0.94976676, + "learning_rate": 3.448849769075239e-06, + "loss": 0.97493279, + "num_input_tokens_seen": 4478390, + "router_z_loss_clip": 2.82421875, + "router_z_loss_mlp": 0.53759766, + "step": 212, + "time_per_iteration": 2.6041924953460693 + }, + { + "auxiliary_loss_clip": 0.01387678, + "auxiliary_loss_mlp": 0.01119312, + "balance_loss_clip": 1.11131346, + "balance_loss_mlp": 1.06573927, + "epoch": 0.012806252818277469, + "flos": 46533476995200.0, + "grad_norm": 1.812122160045799, + "language_loss": 0.76306164, + "learning_rate": 3.4518796672950093e-06, + "loss": 0.78813154, + "num_input_tokens_seen": 4501665, + "router_z_loss_clip": 2.76367188, + "router_z_loss_mlp": 0.53613281, + "step": 213, + "time_per_iteration": 2.7191853523254395 + }, + { + "auxiliary_loss_clip": 0.01388475, + "auxiliary_loss_mlp": 0.0110453, + "balance_loss_clip": 1.10883701, + "balance_loss_mlp": 1.05293596, + "epoch": 0.012866376070945438, + "flos": 14388616120320.0, + "grad_norm": 61.576435540011126, + "language_loss": 0.86368346, + "learning_rate": 3.4548953739020187e-06, + "loss": 0.88861352, + "num_input_tokens_seen": 4519055, + "router_z_loss_clip": 2.796875, + "router_z_loss_mlp": 0.51660156, + "step": 214, + "time_per_iteration": 2.4887266159057617 + }, + { + "auxiliary_loss_clip": 0.01386266, + "auxiliary_loss_mlp": 0.01128965, + "balance_loss_clip": 1.11413848, + "balance_loss_mlp": 1.07400954, + "epoch": 0.012926499323613408, + "flos": 26140813793280.0, + "grad_norm": 5.20867762915213, + "language_loss": 0.77678412, + "learning_rate": 3.4578970212197196e-06, + "loss": 0.80193639, + "num_input_tokens_seen": 4540870, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.54907227, + "step": 215, + "time_per_iteration": 2.5452351570129395 + }, + { + "auxiliary_loss_clip": 0.01395527, + "auxiliary_loss_mlp": 0.01107597, + "balance_loss_clip": 1.11362267, + "balance_loss_mlp": 1.05383372, + "epoch": 0.012986622576281377, + "flos": 30117202128000.0, + "grad_norm": 3.0675222933911437, + "language_loss": 0.90521073, + "learning_rate": 3.460884739729461e-06, + "loss": 0.93024206, + "num_input_tokens_seen": 4560395, + "router_z_loss_clip": 2.81640625, + "router_z_loss_mlp": 0.5378418, + "step": 216, + "time_per_iteration": 2.5550999641418457 + }, + { + "auxiliary_loss_clip": 0.01387844, + "auxiliary_loss_mlp": 0.01108462, + "balance_loss_clip": 1.10723114, + "balance_loss_mlp": 1.05555713, + "epoch": 0.013046745828949347, + "flos": 13954025468160.0, + "grad_norm": 4.819734351626594, + "language_loss": 0.93492651, + "learning_rate": 3.463858658104523e-06, + "loss": 0.95988959, + "num_input_tokens_seen": 4575785, + "router_z_loss_clip": 2.8046875, + "router_z_loss_mlp": 0.52905273, + "step": 217, + "time_per_iteration": 2.4673850536346436 + }, + { + "auxiliary_loss_clip": 0.01382756, + "auxiliary_loss_mlp": 0.01109323, + "balance_loss_clip": 1.10647035, + "balance_loss_mlp": 1.05369961, + "epoch": 0.013106869081617315, + "flos": 17347835116800.0, + "grad_norm": 2.0479723917808377, + "language_loss": 0.93610579, + "learning_rate": 3.4668189032433696e-06, + "loss": 0.96102655, + "num_input_tokens_seen": 4594985, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.55688477, + "step": 218, + "time_per_iteration": 2.5033888816833496 + }, + { + "auxiliary_loss_clip": 0.01378429, + "auxiliary_loss_mlp": 0.01104711, + "balance_loss_clip": 1.10537982, + "balance_loss_mlp": 1.05311704, + "epoch": 0.013166992334285284, + "flos": 25884914325120.0, + "grad_norm": 3.324986229448793, + "language_loss": 0.86321437, + "learning_rate": 3.46976560030214e-06, + "loss": 0.88804579, + "num_input_tokens_seen": 4616125, + "router_z_loss_clip": 2.7265625, + "router_z_loss_mlp": 0.51586914, + "step": 219, + "time_per_iteration": 2.5281574726104736 + }, + { + "auxiliary_loss_clip": 0.01383973, + "auxiliary_loss_mlp": 0.01105385, + "balance_loss_clip": 1.10890985, + "balance_loss_mlp": 1.0540539, + "epoch": 0.013227115586953254, + "flos": 31175956437120.0, + "grad_norm": 2.138233621989336, + "language_loss": 0.87715012, + "learning_rate": 3.4726988727263976e-06, + "loss": 0.90204364, + "num_input_tokens_seen": 4637795, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.51391602, + "step": 220, + "time_per_iteration": 2.5654008388519287 + }, + { + "auxiliary_loss_clip": 0.01376672, + "auxiliary_loss_mlp": 0.01111912, + "balance_loss_clip": 1.1043098, + "balance_loss_mlp": 1.06201088, + "epoch": 0.013287238839621223, + "flos": 20409470766720.0, + "grad_norm": 1.8830772940187297, + "language_loss": 0.863837, + "learning_rate": 3.475618842282164e-06, + "loss": 0.8887229, + "num_input_tokens_seen": 4656835, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.49926758, + "step": 221, + "time_per_iteration": 2.477647542953491 + }, + { + "auxiliary_loss_clip": 0.01380199, + "auxiliary_loss_mlp": 0.01112546, + "balance_loss_clip": 1.10264623, + "balance_loss_mlp": 1.060022, + "epoch": 0.013347362092289193, + "flos": 14137134024960.0, + "grad_norm": 2.208181166884598, + "language_loss": 0.92315739, + "learning_rate": 3.4785256290862486e-06, + "loss": 0.94808483, + "num_input_tokens_seen": 4673015, + "router_z_loss_clip": 2.77539062, + "router_z_loss_mlp": 0.52539062, + "step": 222, + "time_per_iteration": 2.477353096008301 + }, + { + "auxiliary_loss_clip": 0.01380434, + "auxiliary_loss_mlp": 0.01106916, + "balance_loss_clip": 1.10660291, + "balance_loss_mlp": 1.05117393, + "epoch": 0.013407485344957162, + "flos": 21797705554560.0, + "grad_norm": 3.1300251622951465, + "language_loss": 0.95708513, + "learning_rate": 3.481419351635897e-06, + "loss": 0.98195863, + "num_input_tokens_seen": 4692355, + "router_z_loss_clip": 2.73828125, + "router_z_loss_mlp": 0.55761719, + "step": 223, + "time_per_iteration": 2.4709246158599854 + }, + { + "auxiliary_loss_clip": 0.01379469, + "auxiliary_loss_mlp": 0.01105646, + "balance_loss_clip": 1.10668349, + "balance_loss_mlp": 1.05433834, + "epoch": 0.013467608597625132, + "flos": 18621622195200.0, + "grad_norm": 2.800962410970024, + "language_loss": 0.88272083, + "learning_rate": 3.484300126837776e-06, + "loss": 0.90757197, + "num_input_tokens_seen": 4710080, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.51342773, + "step": 224, + "time_per_iteration": 2.4802985191345215 + }, + { + "auxiliary_loss_clip": 0.01380264, + "auxiliary_loss_mlp": 0.01107301, + "balance_loss_clip": 1.10690951, + "balance_loss_mlp": 1.05196428, + "epoch": 0.013527731850293101, + "flos": 18552314903040.0, + "grad_norm": 3.460852113196131, + "language_loss": 0.89470124, + "learning_rate": 3.487168070036317e-06, + "loss": 0.91957688, + "num_input_tokens_seen": 4728980, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.55395508, + "step": 225, + "time_per_iteration": 2.550046443939209 + }, + { + "auxiliary_loss_clip": 0.01376097, + "auxiliary_loss_mlp": 0.01123622, + "balance_loss_clip": 1.10556459, + "balance_loss_mlp": 1.06954849, + "epoch": 0.01358785510296107, + "flos": 19165381257600.0, + "grad_norm": 2.059212513625438, + "language_loss": 0.98991382, + "learning_rate": 3.4900232950414224e-06, + "loss": 1.01491106, + "num_input_tokens_seen": 4747020, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.54150391, + "step": 226, + "time_per_iteration": 2.5432651042938232 + }, + { + "auxiliary_loss_clip": 0.01382494, + "auxiliary_loss_mlp": 0.01115061, + "balance_loss_clip": 1.10805869, + "balance_loss_mlp": 1.0598433, + "epoch": 0.01364797835562904, + "flos": 23329941966720.0, + "grad_norm": 2.369757718118532, + "language_loss": 0.90897042, + "learning_rate": 3.4928659141555727e-06, + "loss": 0.93394601, + "num_input_tokens_seen": 4765000, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.55151367, + "step": 227, + "time_per_iteration": 2.483640670776367 + }, + { + "auxiliary_loss_clip": 0.01271126, + "auxiliary_loss_mlp": 0.01051981, + "balance_loss_clip": 1.09898019, + "balance_loss_mlp": 1.03264523, + "epoch": 0.013708101608297009, + "flos": 70993746097920.0, + "grad_norm": 0.9415478343989303, + "language_loss": 0.57657194, + "learning_rate": 3.4956960382003234e-06, + "loss": 0.59980297, + "num_input_tokens_seen": 4833210, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.19311523, + "step": 228, + "time_per_iteration": 3.203871965408325 + }, + { + "auxiliary_loss_clip": 0.01370607, + "auxiliary_loss_mlp": 0.01114978, + "balance_loss_clip": 1.10333419, + "balance_loss_mlp": 1.06460011, + "epoch": 0.013768224860964979, + "flos": 16325170997760.0, + "grad_norm": 2.564190186200543, + "language_loss": 0.87833005, + "learning_rate": 3.4985137765422354e-06, + "loss": 0.9031859, + "num_input_tokens_seen": 4850120, + "router_z_loss_clip": 2.67382812, + "router_z_loss_mlp": 0.50366211, + "step": 229, + "time_per_iteration": 2.4424386024475098 + }, + { + "auxiliary_loss_clip": 0.01382161, + "auxiliary_loss_mlp": 0.01101656, + "balance_loss_clip": 1.10615277, + "balance_loss_mlp": 1.05089688, + "epoch": 0.013828348113632948, + "flos": 20193037367040.0, + "grad_norm": 3.4755579489775417, + "language_loss": 0.8395282, + "learning_rate": 3.501319237118231e-06, + "loss": 0.86436641, + "num_input_tokens_seen": 4866215, + "router_z_loss_clip": 2.75585938, + "router_z_loss_mlp": 0.50805664, + "step": 230, + "time_per_iteration": 2.505084753036499 + }, + { + "auxiliary_loss_clip": 0.01380812, + "auxiliary_loss_mlp": 0.01116038, + "balance_loss_clip": 1.1069392, + "balance_loss_mlp": 1.06551695, + "epoch": 0.013888471366300916, + "flos": 20741070147840.0, + "grad_norm": 2.098327798973544, + "language_loss": 0.90406811, + "learning_rate": 3.5041125264604056e-06, + "loss": 0.92903662, + "num_input_tokens_seen": 4885630, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.50512695, + "step": 231, + "time_per_iteration": 2.4814953804016113 + }, + { + "auxiliary_loss_clip": 0.01387957, + "auxiliary_loss_mlp": 0.0111096, + "balance_loss_clip": 1.11275363, + "balance_loss_mlp": 1.06027222, + "epoch": 0.013948594618968886, + "flos": 22090628966400.0, + "grad_norm": 2.1839755248201373, + "language_loss": 0.83434355, + "learning_rate": 3.5068937497203002e-06, + "loss": 0.85933268, + "num_input_tokens_seen": 4905570, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.50683594, + "step": 232, + "time_per_iteration": 2.5521020889282227 + }, + { + "auxiliary_loss_clip": 0.01382613, + "auxiliary_loss_mlp": 0.01095065, + "balance_loss_clip": 1.10145569, + "balance_loss_mlp": 1.04313791, + "epoch": 0.014008717871636855, + "flos": 19063108258560.0, + "grad_norm": 3.4156604143816316, + "language_loss": 0.74168777, + "learning_rate": 3.509663010692652e-06, + "loss": 0.76646459, + "num_input_tokens_seen": 4923535, + "router_z_loss_clip": 2.81054688, + "router_z_loss_mlp": 0.52001953, + "step": 233, + "time_per_iteration": 2.4860236644744873 + }, + { + "auxiliary_loss_clip": 0.01386409, + "auxiliary_loss_mlp": 0.01122128, + "balance_loss_clip": 1.10744119, + "balance_loss_mlp": 1.06843638, + "epoch": 0.014068841124304825, + "flos": 14530822064640.0, + "grad_norm": 2.1631652974886104, + "language_loss": 0.85684192, + "learning_rate": 3.512420411838642e-06, + "loss": 0.88192725, + "num_input_tokens_seen": 4939200, + "router_z_loss_clip": 2.7890625, + "router_z_loss_mlp": 0.53710938, + "step": 234, + "time_per_iteration": 2.550232410430908 + }, + { + "auxiliary_loss_clip": 0.01381523, + "auxiliary_loss_mlp": 0.01109305, + "balance_loss_clip": 1.1092037, + "balance_loss_mlp": 1.05923676, + "epoch": 0.014128964376972794, + "flos": 18077396256000.0, + "grad_norm": 2.6968016266190187, + "language_loss": 0.89254355, + "learning_rate": 3.515166054308634e-06, + "loss": 0.9174518, + "num_input_tokens_seen": 4956620, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.50097656, + "step": 235, + "time_per_iteration": 3.9279966354370117 + }, + { + "auxiliary_loss_clip": 0.01380389, + "auxiliary_loss_mlp": 0.01118701, + "balance_loss_clip": 1.11017621, + "balance_loss_mlp": 1.06705976, + "epoch": 0.014189087629640764, + "flos": 25334331678720.0, + "grad_norm": 2.1427761688331257, + "language_loss": 0.85564405, + "learning_rate": 3.5179000379644498e-06, + "loss": 0.88063502, + "num_input_tokens_seen": 4975650, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.51611328, + "step": 236, + "time_per_iteration": 5.397352457046509 + }, + { + "auxiliary_loss_clip": 0.01377845, + "auxiliary_loss_mlp": 0.01104844, + "balance_loss_clip": 1.10340285, + "balance_loss_mlp": 1.0517, + "epoch": 0.014249210882308733, + "flos": 36139744713600.0, + "grad_norm": 2.0920916572473494, + "language_loss": 0.82384837, + "learning_rate": 3.520622461401154e-06, + "loss": 0.84867531, + "num_input_tokens_seen": 4997415, + "router_z_loss_clip": 2.74609375, + "router_z_loss_mlp": 0.53100586, + "step": 237, + "time_per_iteration": 4.056375026702881 + }, + { + "auxiliary_loss_clip": 0.01378425, + "auxiliary_loss_mlp": 0.01122499, + "balance_loss_clip": 1.10816932, + "balance_loss_mlp": 1.06790113, + "epoch": 0.014309334134976702, + "flos": 12932977461120.0, + "grad_norm": 2.8821276137998053, + "language_loss": 0.7699551, + "learning_rate": 3.5233334219683935e-06, + "loss": 0.79496431, + "num_input_tokens_seen": 5013905, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.5456543, + "step": 238, + "time_per_iteration": 2.450962543487549 + }, + { + "auxiliary_loss_clip": 0.01372476, + "auxiliary_loss_mlp": 0.01116117, + "balance_loss_clip": 1.10812736, + "balance_loss_mlp": 1.06831408, + "epoch": 0.014369457387644672, + "flos": 20777519473920.0, + "grad_norm": 1.7526792663245834, + "language_loss": 0.87051594, + "learning_rate": 3.526033015791284e-06, + "loss": 0.8954019, + "num_input_tokens_seen": 5033645, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.47827148, + "step": 239, + "time_per_iteration": 2.51208233833313 + }, + { + "auxiliary_loss_clip": 0.01354499, + "auxiliary_loss_mlp": 0.01098425, + "balance_loss_clip": 1.09717309, + "balance_loss_mlp": 1.05031252, + "epoch": 0.01442958064031264, + "flos": 25848536826240.0, + "grad_norm": 2.403941342229746, + "language_loss": 0.93162525, + "learning_rate": 3.528721337790862e-06, + "loss": 0.95615447, + "num_input_tokens_seen": 5052875, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.48095703, + "step": 240, + "time_per_iteration": 2.5398340225219727 + }, + { + "auxiliary_loss_clip": 0.01368854, + "auxiliary_loss_mlp": 0.01106401, + "balance_loss_clip": 1.10478938, + "balance_loss_mlp": 1.05945659, + "epoch": 0.014489703892980611, + "flos": 28219718269440.0, + "grad_norm": 6.009835364856386, + "language_loss": 0.84867829, + "learning_rate": 3.531398481704111e-06, + "loss": 0.87343085, + "num_input_tokens_seen": 5075005, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.46948242, + "step": 241, + "time_per_iteration": 2.6544721126556396 + }, + { + "auxiliary_loss_clip": 0.01362869, + "auxiliary_loss_mlp": 0.01121575, + "balance_loss_clip": 1.10881782, + "balance_loss_mlp": 1.07036221, + "epoch": 0.01454982714564858, + "flos": 22490925108480.0, + "grad_norm": 6.24157322726267, + "language_loss": 0.88479882, + "learning_rate": 3.534064540103573e-06, + "loss": 0.90964323, + "num_input_tokens_seen": 5091875, + "router_z_loss_clip": 2.5390625, + "router_z_loss_mlp": 0.51196289, + "step": 242, + "time_per_iteration": 2.472008228302002 + }, + { + "auxiliary_loss_clip": 0.01367976, + "auxiliary_loss_mlp": 0.01101368, + "balance_loss_clip": 1.10422015, + "balance_loss_mlp": 1.05003679, + "epoch": 0.014609950398316548, + "flos": 21653201139840.0, + "grad_norm": 2.4826043495280827, + "language_loss": 0.86654305, + "learning_rate": 3.536719604416555e-06, + "loss": 0.89123648, + "num_input_tokens_seen": 5111290, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.51367188, + "step": 243, + "time_per_iteration": 2.5090649127960205 + }, + { + "auxiliary_loss_clip": 0.01370438, + "auxiliary_loss_mlp": 0.01107955, + "balance_loss_clip": 1.10525548, + "balance_loss_mlp": 1.05698133, + "epoch": 0.014670073650984519, + "flos": 21869993675520.0, + "grad_norm": 1.6460029682930142, + "language_loss": 0.84280849, + "learning_rate": 3.5393637649439464e-06, + "loss": 0.86759239, + "num_input_tokens_seen": 5132265, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.50952148, + "step": 244, + "time_per_iteration": 2.5113279819488525 + }, + { + "auxiliary_loss_clip": 0.01376803, + "auxiliary_loss_mlp": 0.01121511, + "balance_loss_clip": 1.10741901, + "balance_loss_mlp": 1.06798661, + "epoch": 0.014730196903652487, + "flos": 23183713699200.0, + "grad_norm": 2.504774425007231, + "language_loss": 0.78652984, + "learning_rate": 3.54199711087864e-06, + "loss": 0.81151295, + "num_input_tokens_seen": 5148575, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.53491211, + "step": 245, + "time_per_iteration": 2.531485080718994 + }, + { + "auxiliary_loss_clip": 0.01375097, + "auxiliary_loss_mlp": 0.01106729, + "balance_loss_clip": 1.1024195, + "balance_loss_mlp": 1.05201173, + "epoch": 0.014790320156320457, + "flos": 23222605150080.0, + "grad_norm": 3.81793128831068, + "language_loss": 0.84264827, + "learning_rate": 3.5446197303235913e-06, + "loss": 0.86746657, + "num_input_tokens_seen": 5170415, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.54736328, + "step": 246, + "time_per_iteration": 2.5228843688964844 + }, + { + "auxiliary_loss_clip": 0.01371912, + "auxiliary_loss_mlp": 0.01103891, + "balance_loss_clip": 1.10174537, + "balance_loss_mlp": 1.05017543, + "epoch": 0.014850443408988426, + "flos": 15815490963840.0, + "grad_norm": 3.539412012182816, + "language_loss": 0.8988952, + "learning_rate": 3.5472317103095034e-06, + "loss": 0.92365324, + "num_input_tokens_seen": 5188565, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.53759766, + "step": 247, + "time_per_iteration": 2.596885919570923 + }, + { + "auxiliary_loss_clip": 0.01374573, + "auxiliary_loss_mlp": 0.01102202, + "balance_loss_clip": 1.09926951, + "balance_loss_mlp": 1.05189586, + "epoch": 0.014910566661656396, + "flos": 22781657790720.0, + "grad_norm": 2.0111213676386486, + "language_loss": 0.78289521, + "learning_rate": 3.549833136812155e-06, + "loss": 0.80766302, + "num_input_tokens_seen": 5207810, + "router_z_loss_clip": 2.75195312, + "router_z_loss_mlp": 0.50244141, + "step": 248, + "time_per_iteration": 2.501185417175293 + }, + { + "auxiliary_loss_clip": 0.01370206, + "auxiliary_loss_mlp": 0.01106161, + "balance_loss_clip": 1.10613096, + "balance_loss_mlp": 1.05463839, + "epoch": 0.014970689914324365, + "flos": 26865023806080.0, + "grad_norm": 2.0466102543513265, + "language_loss": 0.83814299, + "learning_rate": 3.552424094769381e-06, + "loss": 0.86290669, + "num_input_tokens_seen": 5226210, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.51513672, + "step": 249, + "time_per_iteration": 2.5622901916503906 + }, + { + "auxiliary_loss_clip": 0.01364201, + "auxiliary_loss_mlp": 0.0110782, + "balance_loss_clip": 1.10092151, + "balance_loss_mlp": 1.0586108, + "epoch": 0.015030813166992334, + "flos": 13985662371840.0, + "grad_norm": 2.836569610151512, + "language_loss": 0.93568945, + "learning_rate": 3.5550046680977174e-06, + "loss": 0.9604097, + "num_input_tokens_seen": 5241660, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.49169922, + "step": 250, + "time_per_iteration": 2.42912220954895 + }, + { + "auxiliary_loss_clip": 0.01372502, + "auxiliary_loss_mlp": 0.01118427, + "balance_loss_clip": 1.10511851, + "balance_loss_mlp": 1.06609416, + "epoch": 0.015090936419660304, + "flos": 24717817618560.0, + "grad_norm": 2.6346286213288006, + "language_loss": 0.97054571, + "learning_rate": 3.5575749397087034e-06, + "loss": 0.99545503, + "num_input_tokens_seen": 5261090, + "router_z_loss_clip": 2.67382812, + "router_z_loss_mlp": 0.5234375, + "step": 251, + "time_per_iteration": 2.544102430343628 + }, + { + "auxiliary_loss_clip": 0.0136732, + "auxiliary_loss_mlp": 0.01112828, + "balance_loss_clip": 1.10011601, + "balance_loss_mlp": 1.06359506, + "epoch": 0.015151059672328273, + "flos": 25738793798400.0, + "grad_norm": 1.9940272072790892, + "language_loss": 0.84405214, + "learning_rate": 3.5601349915248707e-06, + "loss": 0.86885363, + "num_input_tokens_seen": 5279175, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.49243164, + "step": 252, + "time_per_iteration": 2.5194640159606934 + }, + { + "auxiliary_loss_clip": 0.01361664, + "auxiliary_loss_mlp": 0.01115873, + "balance_loss_clip": 1.10220408, + "balance_loss_mlp": 1.06535172, + "epoch": 0.015211182924996243, + "flos": 21871214737920.0, + "grad_norm": 2.3641299985545827, + "language_loss": 0.98179173, + "learning_rate": 3.5626849044954064e-06, + "loss": 1.006567, + "num_input_tokens_seen": 5296975, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.50537109, + "step": 253, + "time_per_iteration": 2.529755115509033 + }, + { + "auxiliary_loss_clip": 0.01246533, + "auxiliary_loss_mlp": 0.01038845, + "balance_loss_clip": 1.10484743, + "balance_loss_mlp": 1.0205102, + "epoch": 0.015271306177664212, + "flos": 66895080888960.0, + "grad_norm": 0.8520011707035202, + "language_loss": 0.55608898, + "learning_rate": 3.5652247586115167e-06, + "loss": 0.57894278, + "num_input_tokens_seen": 5358375, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.18334961, + "step": 254, + "time_per_iteration": 3.076733350753784 + }, + { + "auxiliary_loss_clip": 0.01365165, + "auxiliary_loss_mlp": 0.01120265, + "balance_loss_clip": 1.09819043, + "balance_loss_mlp": 1.06957674, + "epoch": 0.01533142943033218, + "flos": 26834069260800.0, + "grad_norm": 2.5505780177785566, + "language_loss": 0.90201831, + "learning_rate": 3.567754632921479e-06, + "loss": 0.92687267, + "num_input_tokens_seen": 5377255, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.50732422, + "step": 255, + "time_per_iteration": 2.6004555225372314 + }, + { + "auxiliary_loss_clip": 0.01368775, + "auxiliary_loss_mlp": 0.01132898, + "balance_loss_clip": 1.10257888, + "balance_loss_mlp": 1.08097029, + "epoch": 0.01539155268300015, + "flos": 20813753318400.0, + "grad_norm": 2.276363145471272, + "language_loss": 0.8543641, + "learning_rate": 3.5702746055454075e-06, + "loss": 0.87938082, + "num_input_tokens_seen": 5395320, + "router_z_loss_clip": 2.6640625, + "router_z_loss_mlp": 0.52001953, + "step": 256, + "time_per_iteration": 2.462963819503784 + }, + { + "auxiliary_loss_clip": 0.01372022, + "auxiliary_loss_mlp": 0.01115494, + "balance_loss_clip": 1.10160303, + "balance_loss_mlp": 1.06432927, + "epoch": 0.01545167593566812, + "flos": 15961862885760.0, + "grad_norm": 2.582315217713961, + "language_loss": 0.71357387, + "learning_rate": 3.5727847536897254e-06, + "loss": 0.7384491, + "num_input_tokens_seen": 5411970, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.51245117, + "step": 257, + "time_per_iteration": 2.481613874435425 + }, + { + "auxiliary_loss_clip": 0.01363528, + "auxiliary_loss_mlp": 0.01108377, + "balance_loss_clip": 1.10036898, + "balance_loss_mlp": 1.05742741, + "epoch": 0.01551179918833609, + "flos": 22601745544320.0, + "grad_norm": 2.035508525025552, + "language_loss": 0.9464035, + "learning_rate": 3.5752851536613596e-06, + "loss": 0.97112256, + "num_input_tokens_seen": 5430245, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.50976562, + "step": 258, + "time_per_iteration": 2.5293219089508057 + }, + { + "auxiliary_loss_clip": 0.0136068, + "auxiliary_loss_mlp": 0.01109566, + "balance_loss_clip": 1.09715724, + "balance_loss_mlp": 1.06009364, + "epoch": 0.015571922441004058, + "flos": 22816706486400.0, + "grad_norm": 2.503273177027947, + "language_loss": 0.92928451, + "learning_rate": 3.577775880881658e-06, + "loss": 0.95398706, + "num_input_tokens_seen": 5448905, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.49462891, + "step": 259, + "time_per_iteration": 2.7084150314331055 + }, + { + "auxiliary_loss_clip": 0.01353392, + "auxiliary_loss_mlp": 0.01108414, + "balance_loss_clip": 1.10010195, + "balance_loss_mlp": 1.05844092, + "epoch": 0.015632045693672027, + "flos": 18947439486720.0, + "grad_norm": 1.9641128791089826, + "language_loss": 0.97463596, + "learning_rate": 3.5802570099000424e-06, + "loss": 0.99925399, + "num_input_tokens_seen": 5466405, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.5, + "step": 260, + "time_per_iteration": 2.455620288848877 + }, + { + "auxiliary_loss_clip": 0.01372074, + "auxiliary_loss_mlp": 0.01120004, + "balance_loss_clip": 1.10307288, + "balance_loss_mlp": 1.07091379, + "epoch": 0.015692168946339995, + "flos": 29971728046080.0, + "grad_norm": 2.2112979431327577, + "language_loss": 0.87666106, + "learning_rate": 3.5827286144073947e-06, + "loss": 0.901582, + "num_input_tokens_seen": 5487055, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.49145508, + "step": 261, + "time_per_iteration": 2.6779582500457764 + }, + { + "auxiliary_loss_clip": 0.01360119, + "auxiliary_loss_mlp": 0.01114386, + "balance_loss_clip": 1.09710503, + "balance_loss_mlp": 1.06460381, + "epoch": 0.015752292199007967, + "flos": 19392085946880.0, + "grad_norm": 3.6787419312136325, + "language_loss": 0.67212063, + "learning_rate": 3.5851907672491904e-06, + "loss": 0.69686568, + "num_input_tokens_seen": 5506600, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.49755859, + "step": 262, + "time_per_iteration": 2.472348213195801 + }, + { + "auxiliary_loss_clip": 0.01359416, + "auxiliary_loss_mlp": 0.01125461, + "balance_loss_clip": 1.09876347, + "balance_loss_mlp": 1.07410526, + "epoch": 0.015812415451675936, + "flos": 20339804338560.0, + "grad_norm": 2.5251514689530534, + "language_loss": 0.68607497, + "learning_rate": 3.587643540438383e-06, + "loss": 0.71092379, + "num_input_tokens_seen": 5524350, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.51318359, + "step": 263, + "time_per_iteration": 2.5086443424224854 + }, + { + "auxiliary_loss_clip": 0.01360621, + "auxiliary_loss_mlp": 0.01111613, + "balance_loss_clip": 1.0954845, + "balance_loss_mlp": 1.06123507, + "epoch": 0.015872538704343905, + "flos": 17525412979200.0, + "grad_norm": 2.8027943920286873, + "language_loss": 0.85272312, + "learning_rate": 3.590087005168037e-06, + "loss": 0.87744546, + "num_input_tokens_seen": 5542145, + "router_z_loss_clip": 2.65234375, + "router_z_loss_mlp": 0.50415039, + "step": 264, + "time_per_iteration": 2.4528214931488037 + }, + { + "auxiliary_loss_clip": 0.01367464, + "auxiliary_loss_mlp": 0.01095378, + "balance_loss_clip": 1.10098267, + "balance_loss_mlp": 1.04778993, + "epoch": 0.015932661957011873, + "flos": 15260490944640.0, + "grad_norm": 2.3279279615067523, + "language_loss": 1.04054701, + "learning_rate": 3.5925212318237344e-06, + "loss": 1.06517529, + "num_input_tokens_seen": 5557920, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.47607422, + "step": 265, + "time_per_iteration": 2.4940147399902344 + }, + { + "auxiliary_loss_clip": 0.01370066, + "auxiliary_loss_mlp": 0.0112183, + "balance_loss_clip": 1.10326266, + "balance_loss_mlp": 1.06904423, + "epoch": 0.015992785209679845, + "flos": 20302528999680.0, + "grad_norm": 2.2651624355074174, + "language_loss": 0.75012946, + "learning_rate": 3.5949462899957323e-06, + "loss": 0.77504838, + "num_input_tokens_seen": 5576290, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.52807617, + "step": 266, + "time_per_iteration": 2.4766204357147217 + }, + { + "auxiliary_loss_clip": 0.01354158, + "auxiliary_loss_mlp": 0.01104228, + "balance_loss_clip": 1.09959447, + "balance_loss_mlp": 1.0542078, + "epoch": 0.016052908462347814, + "flos": 23362368969600.0, + "grad_norm": 2.375457867929858, + "language_loss": 0.90497798, + "learning_rate": 3.5973622484909068e-06, + "loss": 0.92956179, + "num_input_tokens_seen": 5595205, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.5, + "step": 267, + "time_per_iteration": 2.5558156967163086 + }, + { + "auxiliary_loss_clip": 0.01367292, + "auxiliary_loss_mlp": 0.01121876, + "balance_loss_clip": 1.1020267, + "balance_loss_mlp": 1.07061565, + "epoch": 0.016113031715015783, + "flos": 21286588976640.0, + "grad_norm": 2.581016727288901, + "language_loss": 0.85542309, + "learning_rate": 3.599769175344462e-06, + "loss": 0.88031483, + "num_input_tokens_seen": 5612645, + "router_z_loss_clip": 2.65234375, + "router_z_loss_mlp": 0.51269531, + "step": 268, + "time_per_iteration": 2.458991289138794 + }, + { + "auxiliary_loss_clip": 0.01357872, + "auxiliary_loss_mlp": 0.01103429, + "balance_loss_clip": 1.10295749, + "balance_loss_mlp": 1.05498219, + "epoch": 0.01617315496768375, + "flos": 18914689261440.0, + "grad_norm": 2.5619581729446805, + "language_loss": 0.88248336, + "learning_rate": 3.602167137831432e-06, + "loss": 0.90709639, + "num_input_tokens_seen": 5628345, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.48461914, + "step": 269, + "time_per_iteration": 2.4958252906799316 + }, + { + "auxiliary_loss_clip": 0.01361306, + "auxiliary_loss_mlp": 0.01107363, + "balance_loss_clip": 1.09891486, + "balance_loss_mlp": 1.05467248, + "epoch": 0.01623327822035172, + "flos": 16546488647040.0, + "grad_norm": 3.406857942225187, + "language_loss": 0.97134006, + "learning_rate": 3.6045562024779565e-06, + "loss": 0.99602669, + "num_input_tokens_seen": 5645940, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.52685547, + "step": 270, + "time_per_iteration": 2.5033681392669678 + }, + { + "auxiliary_loss_clip": 0.01364629, + "auxiliary_loss_mlp": 0.01120674, + "balance_loss_clip": 1.10496342, + "balance_loss_mlp": 1.07091653, + "epoch": 0.016293401473019692, + "flos": 23513481486720.0, + "grad_norm": 2.8288224333220517, + "language_loss": 0.85845149, + "learning_rate": 3.606936435072361e-06, + "loss": 0.88330448, + "num_input_tokens_seen": 5665690, + "router_z_loss_clip": 2.59375, + "router_z_loss_mlp": 0.49731445, + "step": 271, + "time_per_iteration": 2.568772315979004 + }, + { + "auxiliary_loss_clip": 0.01361349, + "auxiliary_loss_mlp": 0.01107473, + "balance_loss_clip": 1.09668314, + "balance_loss_mlp": 1.05776238, + "epoch": 0.01635352472568766, + "flos": 29016072748800.0, + "grad_norm": 2.402040070918821, + "language_loss": 0.81313777, + "learning_rate": 3.609307900676025e-06, + "loss": 0.83782607, + "num_input_tokens_seen": 5683190, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.49682617, + "step": 272, + "time_per_iteration": 2.5359110832214355 + }, + { + "auxiliary_loss_clip": 0.01354966, + "auxiliary_loss_mlp": 0.01119865, + "balance_loss_clip": 1.09906399, + "balance_loss_mlp": 1.07191873, + "epoch": 0.01641364797835563, + "flos": 13370513028480.0, + "grad_norm": 3.3985509126942763, + "language_loss": 0.81191581, + "learning_rate": 3.611670663634051e-06, + "loss": 0.83666414, + "num_input_tokens_seen": 5699780, + "router_z_loss_clip": 2.55859375, + "router_z_loss_mlp": 0.47949219, + "step": 273, + "time_per_iteration": 2.502667188644409 + }, + { + "auxiliary_loss_clip": 0.01355007, + "auxiliary_loss_mlp": 0.01108552, + "balance_loss_clip": 1.09577036, + "balance_loss_mlp": 1.05617118, + "epoch": 0.016473771231023598, + "flos": 18878239935360.0, + "grad_norm": 2.1367934759082883, + "language_loss": 0.91595304, + "learning_rate": 3.614024787585744e-06, + "loss": 0.94058865, + "num_input_tokens_seen": 5716980, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.52416992, + "step": 274, + "time_per_iteration": 2.4784321784973145 + }, + { + "auxiliary_loss_clip": 0.01349009, + "auxiliary_loss_mlp": 0.01110752, + "balance_loss_clip": 1.09583282, + "balance_loss_mlp": 1.06092215, + "epoch": 0.016533894483691566, + "flos": 22601637803520.0, + "grad_norm": 2.8960944295617894, + "language_loss": 0.8815856, + "learning_rate": 3.6163703354748927e-06, + "loss": 0.90618324, + "num_input_tokens_seen": 5737780, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.49804688, + "step": 275, + "time_per_iteration": 2.5473740100860596 + }, + { + "auxiliary_loss_clip": 0.01353921, + "auxiliary_loss_mlp": 0.01107579, + "balance_loss_clip": 1.09658957, + "balance_loss_mlp": 1.05629539, + "epoch": 0.01659401773635954, + "flos": 21507188353920.0, + "grad_norm": 3.221725849613771, + "language_loss": 0.80615377, + "learning_rate": 3.6187073695598707e-06, + "loss": 0.83076882, + "num_input_tokens_seen": 5758330, + "router_z_loss_clip": 2.5703125, + "router_z_loss_mlp": 0.51293945, + "step": 276, + "time_per_iteration": 2.477541446685791 + }, + { + "auxiliary_loss_clip": 0.01344755, + "auxiliary_loss_mlp": 0.01103922, + "balance_loss_clip": 1.09653759, + "balance_loss_mlp": 1.05824065, + "epoch": 0.016654140989027507, + "flos": 32850973411200.0, + "grad_norm": 1.9941518725380136, + "language_loss": 0.8104043, + "learning_rate": 3.621035951423551e-06, + "loss": 0.83489108, + "num_input_tokens_seen": 5778340, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.45629883, + "step": 277, + "time_per_iteration": 2.5840554237365723 + }, + { + "auxiliary_loss_clip": 0.01343495, + "auxiliary_loss_mlp": 0.01096269, + "balance_loss_clip": 1.08947814, + "balance_loss_mlp": 1.04705966, + "epoch": 0.016714264241695476, + "flos": 12306228024960.0, + "grad_norm": 2.4302830695309785, + "language_loss": 0.80508375, + "learning_rate": 3.623356141983041e-06, + "loss": 0.82948136, + "num_input_tokens_seen": 5794295, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.49194336, + "step": 278, + "time_per_iteration": 2.451174736022949 + }, + { + "auxiliary_loss_clip": 0.0134923, + "auxiliary_loss_mlp": 0.01100472, + "balance_loss_clip": 1.09586644, + "balance_loss_mlp": 1.05343223, + "epoch": 0.016774387494363444, + "flos": 27123796362240.0, + "grad_norm": 2.3418972152926125, + "language_loss": 0.90622866, + "learning_rate": 3.6256680014992486e-06, + "loss": 0.93072569, + "num_input_tokens_seen": 5814405, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.47070312, + "step": 279, + "time_per_iteration": 3.9742746353149414 + }, + { + "auxiliary_loss_clip": 0.01351042, + "auxiliary_loss_mlp": 0.01115195, + "balance_loss_clip": 1.09384942, + "balance_loss_mlp": 1.06643844, + "epoch": 0.016834510747031413, + "flos": 20191493082240.0, + "grad_norm": 2.8157683636116966, + "language_loss": 0.94125384, + "learning_rate": 3.6279715895862713e-06, + "loss": 0.96591622, + "num_input_tokens_seen": 5832795, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.48779297, + "step": 280, + "time_per_iteration": 5.378132581710815 + }, + { + "auxiliary_loss_clip": 0.01354748, + "auxiliary_loss_mlp": 0.01108233, + "balance_loss_clip": 1.09456301, + "balance_loss_mlp": 1.0587616, + "epoch": 0.016894633999699385, + "flos": 27274262434560.0, + "grad_norm": 2.0268789655075503, + "language_loss": 0.73929632, + "learning_rate": 3.6302669652206183e-06, + "loss": 0.76392615, + "num_input_tokens_seen": 5855750, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.49462891, + "step": 281, + "time_per_iteration": 2.618635892868042 + }, + { + "auxiliary_loss_clip": 0.01350181, + "auxiliary_loss_mlp": 0.01119665, + "balance_loss_clip": 1.09704876, + "balance_loss_mlp": 1.07298267, + "epoch": 0.016954757252367354, + "flos": 14902964922240.0, + "grad_norm": 2.653929361485316, + "language_loss": 0.80240142, + "learning_rate": 3.632554186750274e-06, + "loss": 0.82709992, + "num_input_tokens_seen": 5872610, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.46704102, + "step": 282, + "time_per_iteration": 3.8450350761413574 + }, + { + "auxiliary_loss_clip": 0.01354102, + "auxiliary_loss_mlp": 0.01132229, + "balance_loss_clip": 1.09744823, + "balance_loss_mlp": 1.0805161, + "epoch": 0.017014880505035322, + "flos": 21358805270400.0, + "grad_norm": 2.43468836713892, + "language_loss": 0.77628708, + "learning_rate": 3.6348333119035937e-06, + "loss": 0.80115038, + "num_input_tokens_seen": 5892985, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.51708984, + "step": 283, + "time_per_iteration": 2.4676673412323 + }, + { + "auxiliary_loss_clip": 0.01353482, + "auxiliary_loss_mlp": 0.01096878, + "balance_loss_clip": 1.09816027, + "balance_loss_mlp": 1.05062532, + "epoch": 0.01707500375770329, + "flos": 35333154858240.0, + "grad_norm": 6.369315201332969, + "language_loss": 0.84012628, + "learning_rate": 3.6371043977980503e-06, + "loss": 0.86462986, + "num_input_tokens_seen": 5914060, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.46240234, + "step": 284, + "time_per_iteration": 2.608922004699707 + }, + { + "auxiliary_loss_clip": 0.01344256, + "auxiliary_loss_mlp": 0.01102816, + "balance_loss_clip": 1.09384811, + "balance_loss_mlp": 1.05465519, + "epoch": 0.01713512701037126, + "flos": 23582070506880.0, + "grad_norm": 2.4063791397467558, + "language_loss": 0.96682966, + "learning_rate": 3.639367500948819e-06, + "loss": 0.99130046, + "num_input_tokens_seen": 5932860, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.48168945, + "step": 285, + "time_per_iteration": 2.487102746963501 + }, + { + "auxiliary_loss_clip": 0.01347658, + "auxiliary_loss_mlp": 0.01098055, + "balance_loss_clip": 1.0963794, + "balance_loss_mlp": 1.05239749, + "epoch": 0.01719525026303923, + "flos": 27634661544960.0, + "grad_norm": 2.067643912656935, + "language_loss": 0.93774438, + "learning_rate": 3.6416226772772178e-06, + "loss": 0.96220148, + "num_input_tokens_seen": 5952725, + "router_z_loss_clip": 2.51171875, + "router_z_loss_mlp": 0.45654297, + "step": 286, + "time_per_iteration": 2.563542604446411 + }, + { + "auxiliary_loss_clip": 0.01341391, + "auxiliary_loss_mlp": 0.01091356, + "balance_loss_clip": 1.09143662, + "balance_loss_mlp": 1.04391074, + "epoch": 0.0172553735157072, + "flos": 26979722910720.0, + "grad_norm": 2.203755655660166, + "language_loss": 0.91977721, + "learning_rate": 3.643869982119001e-06, + "loss": 0.94410467, + "num_input_tokens_seen": 5970560, + "router_z_loss_clip": 2.49804688, + "router_z_loss_mlp": 0.47460938, + "step": 287, + "time_per_iteration": 2.535193681716919 + }, + { + "auxiliary_loss_clip": 0.01344539, + "auxiliary_loss_mlp": 0.01094122, + "balance_loss_clip": 1.09069419, + "balance_loss_mlp": 1.04445922, + "epoch": 0.01731549676837517, + "flos": 14056621689600.0, + "grad_norm": 25.808843455056977, + "language_loss": 1.01673973, + "learning_rate": 3.646109470232502e-06, + "loss": 1.04112637, + "num_input_tokens_seen": 5982980, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.49707031, + "step": 288, + "time_per_iteration": 2.4060797691345215 + }, + { + "auxiliary_loss_clip": 0.01229341, + "auxiliary_loss_mlp": 0.01098721, + "balance_loss_clip": 1.09180963, + "balance_loss_mlp": 1.08284223, + "epoch": 0.017375620021043137, + "flos": 66510694471680.0, + "grad_norm": 0.9195665555184012, + "language_loss": 0.63869971, + "learning_rate": 3.6483411958066417e-06, + "loss": 0.66198027, + "num_input_tokens_seen": 6049445, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.15869141, + "step": 289, + "time_per_iteration": 3.1823182106018066 + }, + { + "auxiliary_loss_clip": 0.01347766, + "auxiliary_loss_mlp": 0.01110153, + "balance_loss_clip": 1.09686971, + "balance_loss_mlp": 1.06487703, + "epoch": 0.01743574327371111, + "flos": 15225154940160.0, + "grad_norm": 2.4486387162197913, + "language_loss": 0.88302624, + "learning_rate": 3.6505652124687957e-06, + "loss": 0.90760541, + "num_input_tokens_seen": 6064150, + "router_z_loss_clip": 2.51171875, + "router_z_loss_mlp": 0.45263672, + "step": 290, + "time_per_iteration": 2.472369432449341 + }, + { + "auxiliary_loss_clip": 0.01344941, + "auxiliary_loss_mlp": 0.01099989, + "balance_loss_clip": 1.09416032, + "balance_loss_mlp": 1.05368805, + "epoch": 0.017495866526379078, + "flos": 25373869574400.0, + "grad_norm": 1.991070111144608, + "language_loss": 0.84643787, + "learning_rate": 3.6527815732925258e-06, + "loss": 0.87088722, + "num_input_tokens_seen": 6083920, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.46289062, + "step": 291, + "time_per_iteration": 2.4806900024414062 + }, + { + "auxiliary_loss_clip": 0.01352656, + "auxiliary_loss_mlp": 0.01113781, + "balance_loss_clip": 1.10325801, + "balance_loss_mlp": 1.06378472, + "epoch": 0.017555989779047047, + "flos": 26359473836160.0, + "grad_norm": 1.6122476114059583, + "language_loss": 0.72669554, + "learning_rate": 3.6549903308051806e-06, + "loss": 0.75135988, + "num_input_tokens_seen": 6105460, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.49975586, + "step": 292, + "time_per_iteration": 2.532413959503174 + }, + { + "auxiliary_loss_clip": 0.01340188, + "auxiliary_loss_mlp": 0.01103348, + "balance_loss_clip": 1.09376049, + "balance_loss_mlp": 1.05647469, + "epoch": 0.017616113031715015, + "flos": 22338807010560.0, + "grad_norm": 2.7613238350357094, + "language_loss": 0.86927783, + "learning_rate": 3.6571915369953646e-06, + "loss": 0.89371318, + "num_input_tokens_seen": 6122890, + "router_z_loss_clip": 2.4609375, + "router_z_loss_mlp": 0.46899414, + "step": 293, + "time_per_iteration": 2.499272346496582 + }, + { + "auxiliary_loss_clip": 0.0133678, + "auxiliary_loss_mlp": 0.01107684, + "balance_loss_clip": 1.09165645, + "balance_loss_mlp": 1.06269431, + "epoch": 0.017676236284382984, + "flos": 20156911263360.0, + "grad_norm": 2.244925121376031, + "language_loss": 0.80992568, + "learning_rate": 3.6593852433202797e-06, + "loss": 0.83437037, + "num_input_tokens_seen": 6142890, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.45019531, + "step": 294, + "time_per_iteration": 2.5356101989746094 + }, + { + "auxiliary_loss_clip": 0.01337012, + "auxiliary_loss_mlp": 0.01116781, + "balance_loss_clip": 1.08862138, + "balance_loss_mlp": 1.07062316, + "epoch": 0.017736359537050956, + "flos": 25223331674880.0, + "grad_norm": 2.094718507473471, + "language_loss": 0.83980882, + "learning_rate": 3.6615715007129453e-06, + "loss": 0.8643468, + "num_input_tokens_seen": 6162030, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.46191406, + "step": 295, + "time_per_iteration": 2.5277185440063477 + }, + { + "auxiliary_loss_clip": 0.01344251, + "auxiliary_loss_mlp": 0.0111634, + "balance_loss_clip": 1.09930503, + "balance_loss_mlp": 1.07030141, + "epoch": 0.017796482789718925, + "flos": 20338798757760.0, + "grad_norm": 2.493668519456928, + "language_loss": 0.84774286, + "learning_rate": 3.6637503595892897e-06, + "loss": 0.87234879, + "num_input_tokens_seen": 6180540, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.46020508, + "step": 296, + "time_per_iteration": 2.486301898956299 + }, + { + "auxiliary_loss_clip": 0.0134343, + "auxiliary_loss_mlp": 0.01105843, + "balance_loss_clip": 1.0942328, + "balance_loss_mlp": 1.05966139, + "epoch": 0.017856606042386893, + "flos": 22379206832640.0, + "grad_norm": 3.508820204032983, + "language_loss": 0.87793827, + "learning_rate": 3.665921869855132e-06, + "loss": 0.90243101, + "num_input_tokens_seen": 6199425, + "router_z_loss_clip": 2.4921875, + "router_z_loss_mlp": 0.46142578, + "step": 297, + "time_per_iteration": 2.455165386199951 + }, + { + "auxiliary_loss_clip": 0.0134123, + "auxiliary_loss_mlp": 0.01103678, + "balance_loss_clip": 1.09172368, + "balance_loss_mlp": 1.0591172, + "epoch": 0.017916729295054862, + "flos": 20230061310720.0, + "grad_norm": 2.0902581338773527, + "language_loss": 0.88322127, + "learning_rate": 3.6680860809130346e-06, + "loss": 0.90767038, + "num_input_tokens_seen": 6219170, + "router_z_loss_clip": 2.4921875, + "router_z_loss_mlp": 0.44604492, + "step": 298, + "time_per_iteration": 2.4939799308776855 + }, + { + "auxiliary_loss_clip": 0.01337538, + "auxiliary_loss_mlp": 0.01124172, + "balance_loss_clip": 1.09340453, + "balance_loss_mlp": 1.0772748, + "epoch": 0.01797685254772283, + "flos": 19390972625280.0, + "grad_norm": 2.027788232504227, + "language_loss": 0.88515043, + "learning_rate": 3.6702430416690516e-06, + "loss": 0.90976751, + "num_input_tokens_seen": 6237930, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.46923828, + "step": 299, + "time_per_iteration": 2.469698190689087 + }, + { + "auxiliary_loss_clip": 0.01344002, + "auxiliary_loss_mlp": 0.01107686, + "balance_loss_clip": 1.09374774, + "balance_loss_mlp": 1.06138515, + "epoch": 0.018036975800390802, + "flos": 24426007528320.0, + "grad_norm": 20.883727134766524, + "language_loss": 0.65228117, + "learning_rate": 3.672392800539357e-06, + "loss": 0.67679805, + "num_input_tokens_seen": 6257170, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.46362305, + "step": 300, + "time_per_iteration": 2.5017025470733643 + }, + { + "auxiliary_loss_clip": 0.01346063, + "auxiliary_loss_mlp": 0.01117049, + "balance_loss_clip": 1.09721637, + "balance_loss_mlp": 1.07046199, + "epoch": 0.01809709905305877, + "flos": 15778933896960.0, + "grad_norm": 3.3968159946685113, + "language_loss": 0.88161278, + "learning_rate": 3.6745354054567686e-06, + "loss": 0.90624392, + "num_input_tokens_seen": 6274780, + "router_z_loss_clip": 2.48632812, + "router_z_loss_mlp": 0.46557617, + "step": 301, + "time_per_iteration": 2.4263691902160645 + }, + { + "auxiliary_loss_clip": 0.01221686, + "auxiliary_loss_mlp": 0.01050629, + "balance_loss_clip": 1.08565402, + "balance_loss_mlp": 1.036062, + "epoch": 0.01815722230572674, + "flos": 67348382526720.0, + "grad_norm": 0.8248758573005537, + "language_loss": 0.62216729, + "learning_rate": 3.676670903877158e-06, + "loss": 0.64489043, + "num_input_tokens_seen": 6340435, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.14526367, + "step": 302, + "time_per_iteration": 3.2084670066833496 + }, + { + "auxiliary_loss_clip": 0.01334965, + "auxiliary_loss_mlp": 0.01112009, + "balance_loss_clip": 1.09033465, + "balance_loss_mlp": 1.06551707, + "epoch": 0.01821734555839471, + "flos": 15485615435520.0, + "grad_norm": 2.142579177117954, + "language_loss": 0.89650917, + "learning_rate": 3.6787993427857567e-06, + "loss": 0.9209789, + "num_input_tokens_seen": 6358160, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.46484375, + "step": 303, + "time_per_iteration": 2.5021755695343018 + }, + { + "auxiliary_loss_clip": 0.0134341, + "auxiliary_loss_mlp": 0.01116797, + "balance_loss_clip": 1.09721422, + "balance_loss_mlp": 1.06901741, + "epoch": 0.018277468811062677, + "flos": 24097424889600.0, + "grad_norm": 1.8715310661782645, + "language_loss": 0.80362892, + "learning_rate": 3.680920768703364e-06, + "loss": 0.82823098, + "num_input_tokens_seen": 6378485, + "router_z_loss_clip": 2.4609375, + "router_z_loss_mlp": 0.47802734, + "step": 304, + "time_per_iteration": 2.5548717975616455 + }, + { + "auxiliary_loss_clip": 0.01334054, + "auxiliary_loss_mlp": 0.01096895, + "balance_loss_clip": 1.09714544, + "balance_loss_mlp": 1.05264485, + "epoch": 0.01833759206373065, + "flos": 20959335141120.0, + "grad_norm": 1.6946356057385576, + "language_loss": 0.82928157, + "learning_rate": 3.6830352276924415e-06, + "loss": 0.85359108, + "num_input_tokens_seen": 6397845, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.44238281, + "step": 305, + "time_per_iteration": 2.497422933578491 + }, + { + "auxiliary_loss_clip": 0.01335676, + "auxiliary_loss_mlp": 0.01097253, + "balance_loss_clip": 1.08955467, + "balance_loss_mlp": 1.05262113, + "epoch": 0.018397715316398618, + "flos": 19390757143680.0, + "grad_norm": 1.8622160119527245, + "language_loss": 0.90992498, + "learning_rate": 3.685142765363119e-06, + "loss": 0.93425429, + "num_input_tokens_seen": 6416475, + "router_z_loss_clip": 2.4609375, + "router_z_loss_mlp": 0.44628906, + "step": 306, + "time_per_iteration": 2.5219600200653076 + }, + { + "auxiliary_loss_clip": 0.01329944, + "auxiliary_loss_mlp": 0.01093456, + "balance_loss_clip": 1.0874083, + "balance_loss_mlp": 1.04982531, + "epoch": 0.018457838569066586, + "flos": 29132531619840.0, + "grad_norm": 3.3485499608923828, + "language_loss": 0.86665249, + "learning_rate": 3.687243426879095e-06, + "loss": 0.89088643, + "num_input_tokens_seen": 6437520, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.43603516, + "step": 307, + "time_per_iteration": 2.60627818107605 + }, + { + "auxiliary_loss_clip": 0.01329871, + "auxiliary_loss_mlp": 0.01105642, + "balance_loss_clip": 1.09160006, + "balance_loss_mlp": 1.05669451, + "epoch": 0.018517961821734555, + "flos": 19208654167680.0, + "grad_norm": 2.2509456906949543, + "language_loss": 0.71786469, + "learning_rate": 3.6893372569634466e-06, + "loss": 0.74221981, + "num_input_tokens_seen": 6455680, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.48901367, + "step": 308, + "time_per_iteration": 2.5381312370300293 + }, + { + "auxiliary_loss_clip": 0.01333693, + "auxiliary_loss_mlp": 0.01103045, + "balance_loss_clip": 1.08731413, + "balance_loss_mlp": 1.05824602, + "epoch": 0.018578085074402523, + "flos": 19863018184320.0, + "grad_norm": 3.609901049208492, + "language_loss": 0.91848654, + "learning_rate": 3.6914242999043395e-06, + "loss": 0.94285393, + "num_input_tokens_seen": 6474880, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.44775391, + "step": 309, + "time_per_iteration": 2.448066473007202 + }, + { + "auxiliary_loss_clip": 0.01343664, + "auxiliary_loss_mlp": 0.01102039, + "balance_loss_clip": 1.08993316, + "balance_loss_mlp": 1.05435503, + "epoch": 0.018638208327070496, + "flos": 29606947476480.0, + "grad_norm": 2.049081416094973, + "language_loss": 0.72753358, + "learning_rate": 3.69350459956065e-06, + "loss": 0.75199062, + "num_input_tokens_seen": 6495945, + "router_z_loss_clip": 2.5390625, + "router_z_loss_mlp": 0.47680664, + "step": 310, + "time_per_iteration": 2.643512725830078 + }, + { + "auxiliary_loss_clip": 0.01334719, + "auxiliary_loss_mlp": 0.01112327, + "balance_loss_clip": 1.09448445, + "balance_loss_mlp": 1.06643176, + "epoch": 0.018698331579738464, + "flos": 45731555907840.0, + "grad_norm": 2.0161986314051594, + "language_loss": 0.73745823, + "learning_rate": 3.695578199367497e-06, + "loss": 0.76192868, + "num_input_tokens_seen": 6519930, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.45947266, + "step": 311, + "time_per_iteration": 2.762510299682617 + }, + { + "auxiliary_loss_clip": 0.01344361, + "auxiliary_loss_mlp": 0.01110987, + "balance_loss_clip": 1.09115887, + "balance_loss_mlp": 1.06618774, + "epoch": 0.018758454832406433, + "flos": 20483662308480.0, + "grad_norm": 2.957970276905221, + "language_loss": 0.91780102, + "learning_rate": 3.6976451423416825e-06, + "loss": 0.94235456, + "num_input_tokens_seen": 6535070, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.44824219, + "step": 312, + "time_per_iteration": 2.507730484008789 + }, + { + "auxiliary_loss_clip": 0.01346901, + "auxiliary_loss_mlp": 0.01118762, + "balance_loss_clip": 1.09557879, + "balance_loss_mlp": 1.07117391, + "epoch": 0.0188185780850744, + "flos": 15777784661760.0, + "grad_norm": 2.2776275576644824, + "language_loss": 0.89721215, + "learning_rate": 3.699705471087043e-06, + "loss": 0.9218688, + "num_input_tokens_seen": 6554135, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.47631836, + "step": 313, + "time_per_iteration": 2.5170459747314453 + }, + { + "auxiliary_loss_clip": 0.01345752, + "auxiliary_loss_mlp": 0.01103749, + "balance_loss_clip": 1.0915947, + "balance_loss_mlp": 1.05656588, + "epoch": 0.018878701337742373, + "flos": 22455732758400.0, + "grad_norm": 2.2232509657843895, + "language_loss": 0.73106664, + "learning_rate": 3.7017592277997256e-06, + "loss": 0.75556159, + "num_input_tokens_seen": 6572275, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.47192383, + "step": 314, + "time_per_iteration": 2.497211217880249 + }, + { + "auxiliary_loss_clip": 0.01333454, + "auxiliary_loss_mlp": 0.01106133, + "balance_loss_clip": 1.08982921, + "balance_loss_mlp": 1.061692, + "epoch": 0.018938824590410342, + "flos": 30993530238720.0, + "grad_norm": 2.431398268107733, + "language_loss": 0.89779794, + "learning_rate": 3.7038064542733654e-06, + "loss": 0.92219377, + "num_input_tokens_seen": 6594520, + "router_z_loss_clip": 2.43554688, + "router_z_loss_mlp": 0.44482422, + "step": 315, + "time_per_iteration": 2.5614914894104004 + }, + { + "auxiliary_loss_clip": 0.01333867, + "auxiliary_loss_mlp": 0.01099579, + "balance_loss_clip": 1.09081793, + "balance_loss_mlp": 1.0541842, + "epoch": 0.01899894784307831, + "flos": 23258910821760.0, + "grad_norm": 2.319281097311174, + "language_loss": 0.80675679, + "learning_rate": 3.7058471919041945e-06, + "loss": 0.83109123, + "num_input_tokens_seen": 6614245, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.45361328, + "step": 316, + "time_per_iteration": 2.53731632232666 + }, + { + "auxiliary_loss_clip": 0.01332959, + "auxiliary_loss_mlp": 0.01094504, + "balance_loss_clip": 1.09072256, + "balance_loss_mlp": 1.04920471, + "epoch": 0.01905907109574628, + "flos": 17457901367040.0, + "grad_norm": 22.941343061651594, + "language_loss": 0.90215003, + "learning_rate": 3.7078814816960605e-06, + "loss": 0.92642468, + "num_input_tokens_seen": 6632015, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.453125, + "step": 317, + "time_per_iteration": 2.4896790981292725 + }, + { + "auxiliary_loss_clip": 0.01326622, + "auxiliary_loss_mlp": 0.01095989, + "balance_loss_clip": 1.08761096, + "balance_loss_mlp": 1.05049837, + "epoch": 0.019119194348414248, + "flos": 14970225139200.0, + "grad_norm": 3.4182752281239335, + "language_loss": 0.90968472, + "learning_rate": 3.709909364265374e-06, + "loss": 0.93391085, + "num_input_tokens_seen": 6649015, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.45507812, + "step": 318, + "time_per_iteration": 2.4946837425231934 + }, + { + "auxiliary_loss_clip": 0.01326577, + "auxiliary_loss_mlp": 0.01090639, + "balance_loss_clip": 1.08576465, + "balance_loss_mlp": 1.04824769, + "epoch": 0.01917931760108222, + "flos": 25482822503040.0, + "grad_norm": 5.38455620363393, + "language_loss": 0.94162107, + "learning_rate": 3.7119308798459706e-06, + "loss": 0.96579325, + "num_input_tokens_seen": 6669225, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.42382812, + "step": 319, + "time_per_iteration": 2.554530143737793 + }, + { + "auxiliary_loss_clip": 0.01205016, + "auxiliary_loss_mlp": 0.01062511, + "balance_loss_clip": 1.07754529, + "balance_loss_mlp": 1.04651284, + "epoch": 0.01923944085375019, + "flos": 71556967353600.0, + "grad_norm": 1.042431834543999, + "language_loss": 0.59776318, + "learning_rate": 3.7139460682939026e-06, + "loss": 0.6204384, + "num_input_tokens_seen": 6725775, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.16015625, + "step": 320, + "time_per_iteration": 3.006904125213623 + }, + { + "auxiliary_loss_clip": 0.01325796, + "auxiliary_loss_mlp": 0.01098335, + "balance_loss_clip": 1.08736229, + "balance_loss_mlp": 1.05453706, + "epoch": 0.019299564106418157, + "flos": 19682495406720.0, + "grad_norm": 3.21162891562916, + "language_loss": 0.90165031, + "learning_rate": 3.715954969092154e-06, + "loss": 0.92589164, + "num_input_tokens_seen": 6744170, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.43798828, + "step": 321, + "time_per_iteration": 2.469407081604004 + }, + { + "auxiliary_loss_clip": 0.01333678, + "auxiliary_loss_mlp": 0.01112195, + "balance_loss_clip": 1.08979714, + "balance_loss_mlp": 1.06677604, + "epoch": 0.019359687359086126, + "flos": 24387151991040.0, + "grad_norm": 2.2003991826565694, + "language_loss": 0.83005071, + "learning_rate": 3.7179576213552805e-06, + "loss": 0.85450941, + "num_input_tokens_seen": 6764565, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.45458984, + "step": 322, + "time_per_iteration": 2.467902898788452 + }, + { + "auxiliary_loss_clip": 0.01336374, + "auxiliary_loss_mlp": 0.01099621, + "balance_loss_clip": 1.08868885, + "balance_loss_mlp": 1.05472672, + "epoch": 0.019419810611754094, + "flos": 23951376190080.0, + "grad_norm": 2.6071685350743525, + "language_loss": 0.73115361, + "learning_rate": 3.719954063833981e-06, + "loss": 0.75551355, + "num_input_tokens_seen": 6785310, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.44873047, + "step": 323, + "time_per_iteration": 5.258114337921143 + }, + { + "auxiliary_loss_clip": 0.0132724, + "auxiliary_loss_mlp": 0.01091945, + "balance_loss_clip": 1.08490944, + "balance_loss_mlp": 1.04807591, + "epoch": 0.019479933864422067, + "flos": 22160223567360.0, + "grad_norm": 5.421859102243476, + "language_loss": 0.92183352, + "learning_rate": 3.721944334919596e-06, + "loss": 0.94602531, + "num_input_tokens_seen": 6803290, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.4387207, + "step": 324, + "time_per_iteration": 3.9150495529174805 + }, + { + "auxiliary_loss_clip": 0.01331685, + "auxiliary_loss_mlp": 0.01089627, + "balance_loss_clip": 1.09001648, + "balance_loss_mlp": 1.04828477, + "epoch": 0.019540057117090035, + "flos": 22236821320320.0, + "grad_norm": 2.871631819376646, + "language_loss": 0.65276873, + "learning_rate": 3.7239284726485375e-06, + "loss": 0.67698181, + "num_input_tokens_seen": 6822570, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.41357422, + "step": 325, + "time_per_iteration": 2.5127291679382324 + }, + { + "auxiliary_loss_clip": 0.01330404, + "auxiliary_loss_mlp": 0.01102897, + "balance_loss_clip": 1.09502888, + "balance_loss_mlp": 1.05919492, + "epoch": 0.019600180369758004, + "flos": 23076771932160.0, + "grad_norm": 1.9230456072094626, + "language_loss": 0.76339674, + "learning_rate": 3.72590651470665e-06, + "loss": 0.78772974, + "num_input_tokens_seen": 6841910, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.43701172, + "step": 326, + "time_per_iteration": 3.918156385421753 + }, + { + "auxiliary_loss_clip": 0.01324973, + "auxiliary_loss_mlp": 0.01102583, + "balance_loss_clip": 1.09079373, + "balance_loss_mlp": 1.05895233, + "epoch": 0.019660303622425972, + "flos": 25410857604480.0, + "grad_norm": 4.570098883703584, + "language_loss": 0.79682422, + "learning_rate": 3.727878498433505e-06, + "loss": 0.82109982, + "num_input_tokens_seen": 6862480, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.43652344, + "step": 327, + "time_per_iteration": 2.5567076206207275 + }, + { + "auxiliary_loss_clip": 0.01333792, + "auxiliary_loss_mlp": 0.01106771, + "balance_loss_clip": 1.0926621, + "balance_loss_mlp": 1.0639987, + "epoch": 0.01972042687509394, + "flos": 23657519024640.0, + "grad_norm": 2.264868331273124, + "language_loss": 0.80895668, + "learning_rate": 3.7298444608266328e-06, + "loss": 0.83336234, + "num_input_tokens_seen": 6882015, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.42749023, + "step": 328, + "time_per_iteration": 2.4637339115142822 + }, + { + "auxiliary_loss_clip": 0.01332159, + "auxiliary_loss_mlp": 0.01096087, + "balance_loss_clip": 1.08633232, + "balance_loss_mlp": 1.05150259, + "epoch": 0.019780550127761913, + "flos": 18223480869120.0, + "grad_norm": 2.388113338602331, + "language_loss": 0.93798989, + "learning_rate": 3.731804438545683e-06, + "loss": 0.96227241, + "num_input_tokens_seen": 6899785, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.44555664, + "step": 329, + "time_per_iteration": 2.4820785522460938 + }, + { + "auxiliary_loss_clip": 0.01336757, + "auxiliary_loss_mlp": 0.01114735, + "balance_loss_clip": 1.09042418, + "balance_loss_mlp": 1.0681479, + "epoch": 0.01984067338042988, + "flos": 22418780641920.0, + "grad_norm": 2.599914607540736, + "language_loss": 0.74655032, + "learning_rate": 3.7337584679165324e-06, + "loss": 0.77106524, + "num_input_tokens_seen": 6918575, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.46557617, + "step": 330, + "time_per_iteration": 2.4573304653167725 + }, + { + "auxiliary_loss_clip": 0.01334383, + "auxiliary_loss_mlp": 0.01119894, + "balance_loss_clip": 1.08965182, + "balance_loss_mlp": 1.07495213, + "epoch": 0.01990079663309785, + "flos": 17055199013760.0, + "grad_norm": 3.1409051543959157, + "language_loss": 0.93496019, + "learning_rate": 3.7357065849353186e-06, + "loss": 0.95950294, + "num_input_tokens_seen": 6936965, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.44921875, + "step": 331, + "time_per_iteration": 2.4333980083465576 + }, + { + "auxiliary_loss_clip": 0.01322279, + "auxiliary_loss_mlp": 0.01089459, + "balance_loss_clip": 1.08812368, + "balance_loss_mlp": 1.04771245, + "epoch": 0.01996091988576582, + "flos": 15961791058560.0, + "grad_norm": 2.346077195397742, + "language_loss": 0.92834163, + "learning_rate": 3.737648825272422e-06, + "loss": 0.9524591, + "num_input_tokens_seen": 6953475, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.41723633, + "step": 332, + "time_per_iteration": 2.4200243949890137 + }, + { + "auxiliary_loss_clip": 0.0133067, + "auxiliary_loss_mlp": 0.01093854, + "balance_loss_clip": 1.0937947, + "balance_loss_mlp": 1.04955637, + "epoch": 0.02002104313843379, + "flos": 23586451966080.0, + "grad_norm": 3.585493008324099, + "language_loss": 0.75602531, + "learning_rate": 3.739585224276384e-06, + "loss": 0.78027052, + "num_input_tokens_seen": 6971630, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.44287109, + "step": 333, + "time_per_iteration": 2.5126490592956543 + }, + { + "auxiliary_loss_clip": 0.01327402, + "auxiliary_loss_mlp": 0.01092526, + "balance_loss_clip": 1.08796716, + "balance_loss_mlp": 1.04746461, + "epoch": 0.02008116639110176, + "flos": 34094883352320.0, + "grad_norm": 2.340718602312793, + "language_loss": 0.78863859, + "learning_rate": 3.7415158169777673e-06, + "loss": 0.81283784, + "num_input_tokens_seen": 6992775, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.45092773, + "step": 334, + "time_per_iteration": 2.5557093620300293 + }, + { + "auxiliary_loss_clip": 0.013283, + "auxiliary_loss_mlp": 0.01099585, + "balance_loss_clip": 1.0834949, + "balance_loss_mlp": 1.05333138, + "epoch": 0.020141289643769728, + "flos": 19683716469120.0, + "grad_norm": 2.6396772065423963, + "language_loss": 0.83517218, + "learning_rate": 3.7434406380929575e-06, + "loss": 0.85945106, + "num_input_tokens_seen": 7011425, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.46264648, + "step": 335, + "time_per_iteration": 2.50285267829895 + }, + { + "auxiliary_loss_clip": 0.01323953, + "auxiliary_loss_mlp": 0.01089007, + "balance_loss_clip": 1.08577693, + "balance_loss_mlp": 1.04573417, + "epoch": 0.020201412896437697, + "flos": 20740567357440.0, + "grad_norm": 2.8440823285445096, + "language_loss": 0.92551732, + "learning_rate": 3.745359722027911e-06, + "loss": 0.94964695, + "num_input_tokens_seen": 7029450, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.43286133, + "step": 336, + "time_per_iteration": 2.436910629272461 + }, + { + "auxiliary_loss_clip": 0.01324519, + "auxiliary_loss_mlp": 0.01083925, + "balance_loss_clip": 1.08415294, + "balance_loss_mlp": 1.04129565, + "epoch": 0.020261536149105665, + "flos": 20266510636800.0, + "grad_norm": 1.8815839284972304, + "language_loss": 0.8830778, + "learning_rate": 3.7472731028818428e-06, + "loss": 0.90716219, + "num_input_tokens_seen": 7047555, + "router_z_loss_clip": 2.40429688, + "router_z_loss_mlp": 0.42651367, + "step": 337, + "time_per_iteration": 2.4628312587738037 + }, + { + "auxiliary_loss_clip": 0.01314536, + "auxiliary_loss_mlp": 0.01105527, + "balance_loss_clip": 1.081985, + "balance_loss_mlp": 1.0605855, + "epoch": 0.020321659401773638, + "flos": 25848752307840.0, + "grad_norm": 1.8802927965611724, + "language_loss": 0.89960009, + "learning_rate": 3.7491808144508626e-06, + "loss": 0.92380071, + "num_input_tokens_seen": 7068185, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.44970703, + "step": 338, + "time_per_iteration": 2.5198347568511963 + }, + { + "auxiliary_loss_clip": 0.01327134, + "auxiliary_loss_mlp": 0.01102145, + "balance_loss_clip": 1.08664703, + "balance_loss_mlp": 1.05801404, + "epoch": 0.020381782654441606, + "flos": 17495033051520.0, + "grad_norm": 2.318805307220815, + "language_loss": 0.85051668, + "learning_rate": 3.7510828902315576e-06, + "loss": 0.87480938, + "num_input_tokens_seen": 7085955, + "router_z_loss_clip": 2.40429688, + "router_z_loss_mlp": 0.44116211, + "step": 339, + "time_per_iteration": 2.5338354110717773 + }, + { + "auxiliary_loss_clip": 0.01329981, + "auxiliary_loss_mlp": 0.01101472, + "balance_loss_clip": 1.08891821, + "balance_loss_mlp": 1.0570544, + "epoch": 0.020441905907109575, + "flos": 24243940465920.0, + "grad_norm": 1.7641796825001117, + "language_loss": 0.88872129, + "learning_rate": 3.75297936342452e-06, + "loss": 0.91303581, + "num_input_tokens_seen": 7106345, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.44384766, + "step": 340, + "time_per_iteration": 2.458604574203491 + }, + { + "auxiliary_loss_clip": 0.01330343, + "auxiliary_loss_mlp": 0.01087732, + "balance_loss_clip": 1.08794737, + "balance_loss_mlp": 1.04145479, + "epoch": 0.020502029159777543, + "flos": 22233301787520.0, + "grad_norm": 5.094959909325056, + "language_loss": 0.88228911, + "learning_rate": 3.7548702669378253e-06, + "loss": 0.90646982, + "num_input_tokens_seen": 7125070, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.46240234, + "step": 341, + "time_per_iteration": 2.5540695190429688 + }, + { + "auxiliary_loss_clip": 0.01328207, + "auxiliary_loss_mlp": 0.01106257, + "balance_loss_clip": 1.08450377, + "balance_loss_mlp": 1.06152916, + "epoch": 0.020562152412445512, + "flos": 23987861429760.0, + "grad_norm": 4.541392464248229, + "language_loss": 0.80378872, + "learning_rate": 3.756755633390458e-06, + "loss": 0.82813334, + "num_input_tokens_seen": 7144675, + "router_z_loss_clip": 2.43554688, + "router_z_loss_mlp": 0.44726562, + "step": 342, + "time_per_iteration": 2.4632914066314697 + }, + { + "auxiliary_loss_clip": 0.01321954, + "auxiliary_loss_mlp": 0.01099132, + "balance_loss_clip": 1.08678937, + "balance_loss_mlp": 1.05216384, + "epoch": 0.020622275665113484, + "flos": 26975305537920.0, + "grad_norm": 2.236534585308003, + "language_loss": 0.89480352, + "learning_rate": 3.7586354951156886e-06, + "loss": 0.91901433, + "num_input_tokens_seen": 7165505, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.4699707, + "step": 343, + "time_per_iteration": 2.5239899158477783 + }, + { + "auxiliary_loss_clip": 0.01331353, + "auxiliary_loss_mlp": 0.0109511, + "balance_loss_clip": 1.09250915, + "balance_loss_mlp": 1.05322027, + "epoch": 0.020682398917781453, + "flos": 22600704049920.0, + "grad_norm": 1.889120457494352, + "language_loss": 0.78248197, + "learning_rate": 3.7605098841644e-06, + "loss": 0.8067466, + "num_input_tokens_seen": 7184605, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.41870117, + "step": 344, + "time_per_iteration": 2.440788984298706 + }, + { + "auxiliary_loss_clip": 0.01314172, + "auxiliary_loss_mlp": 0.01103698, + "balance_loss_clip": 1.08350086, + "balance_loss_mlp": 1.05870891, + "epoch": 0.02074252217044942, + "flos": 15013605790080.0, + "grad_norm": 3.9207139515388243, + "language_loss": 0.75165325, + "learning_rate": 3.7623788323083666e-06, + "loss": 0.77583194, + "num_input_tokens_seen": 7203065, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.44995117, + "step": 345, + "time_per_iteration": 2.557891368865967 + }, + { + "auxiliary_loss_clip": 0.01323766, + "auxiliary_loss_mlp": 0.01106697, + "balance_loss_clip": 1.0904007, + "balance_loss_mlp": 1.06177926, + "epoch": 0.02080264542311739, + "flos": 25337958952320.0, + "grad_norm": 2.3253759461296086, + "language_loss": 0.90397125, + "learning_rate": 3.7642423710434837e-06, + "loss": 0.92827594, + "num_input_tokens_seen": 7222995, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.44897461, + "step": 346, + "time_per_iteration": 2.4929075241088867 + }, + { + "auxiliary_loss_clip": 0.01317524, + "auxiliary_loss_mlp": 0.01094402, + "balance_loss_clip": 1.08516765, + "balance_loss_mlp": 1.05370426, + "epoch": 0.02086276867578536, + "flos": 24388804016640.0, + "grad_norm": 2.195064603822085, + "language_loss": 0.79254901, + "learning_rate": 3.7661005315929563e-06, + "loss": 0.81666827, + "num_input_tokens_seen": 7244625, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.40673828, + "step": 347, + "time_per_iteration": 2.5742719173431396 + }, + { + "auxiliary_loss_clip": 0.01321485, + "auxiliary_loss_mlp": 0.01105557, + "balance_loss_clip": 1.08929253, + "balance_loss_mlp": 1.05808842, + "epoch": 0.02092289192845333, + "flos": 24462205459200.0, + "grad_norm": 2.0606356099372567, + "language_loss": 0.71396422, + "learning_rate": 3.7679533449104354e-06, + "loss": 0.73823464, + "num_input_tokens_seen": 7263255, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.47485352, + "step": 348, + "time_per_iteration": 2.47751784324646 + }, + { + "auxiliary_loss_clip": 0.01322809, + "auxiliary_loss_mlp": 0.01103071, + "balance_loss_clip": 1.08568573, + "balance_loss_mlp": 1.05555391, + "epoch": 0.0209830151811213, + "flos": 17451185523840.0, + "grad_norm": 2.3422566475425493, + "language_loss": 0.77193224, + "learning_rate": 3.7698008416831116e-06, + "loss": 0.7961911, + "num_input_tokens_seen": 7279275, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.47485352, + "step": 349, + "time_per_iteration": 2.487544059753418 + }, + { + "auxiliary_loss_clip": 0.01313661, + "auxiliary_loss_mlp": 0.01105977, + "balance_loss_clip": 1.08699846, + "balance_loss_mlp": 1.06313276, + "epoch": 0.021043138433789268, + "flos": 24573995562240.0, + "grad_norm": 1.694764560452036, + "language_loss": 0.8522774, + "learning_rate": 3.7716430523347664e-06, + "loss": 0.87647378, + "num_input_tokens_seen": 7300180, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.4284668, + "step": 350, + "time_per_iteration": 2.4772250652313232 + }, + { + "auxiliary_loss_clip": 0.01317165, + "auxiliary_loss_mlp": 0.01093131, + "balance_loss_clip": 1.08879948, + "balance_loss_mlp": 1.05257607, + "epoch": 0.021103261686457236, + "flos": 24454053072000.0, + "grad_norm": 2.801007341142059, + "language_loss": 0.79994112, + "learning_rate": 3.773480007028776e-06, + "loss": 0.82404405, + "num_input_tokens_seen": 7317430, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.40576172, + "step": 351, + "time_per_iteration": 2.5189707279205322 + }, + { + "auxiliary_loss_clip": 0.01324918, + "auxiliary_loss_mlp": 0.0110468, + "balance_loss_clip": 1.08966947, + "balance_loss_mlp": 1.06011915, + "epoch": 0.021163384939125205, + "flos": 14683083816960.0, + "grad_norm": 2.5926446089846777, + "language_loss": 0.876589, + "learning_rate": 3.775311735671078e-06, + "loss": 0.90088499, + "num_input_tokens_seen": 7334875, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.44555664, + "step": 352, + "time_per_iteration": 2.4128732681274414 + }, + { + "auxiliary_loss_clip": 0.01317307, + "auxiliary_loss_mlp": 0.01100668, + "balance_loss_clip": 1.08812535, + "balance_loss_mlp": 1.05651307, + "epoch": 0.021223508191793177, + "flos": 24493195918080.0, + "grad_norm": 2.854016144012632, + "language_loss": 0.82613295, + "learning_rate": 3.7771382679130878e-06, + "loss": 0.85031271, + "num_input_tokens_seen": 7355185, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.44140625, + "step": 353, + "time_per_iteration": 2.5055339336395264 + }, + { + "auxiliary_loss_clip": 0.01313193, + "auxiliary_loss_mlp": 0.01094839, + "balance_loss_clip": 1.08627355, + "balance_loss_mlp": 1.05313945, + "epoch": 0.021283631444461146, + "flos": 24126978804480.0, + "grad_norm": 1.8396865057740879, + "language_loss": 0.81144851, + "learning_rate": 3.7789596331545845e-06, + "loss": 0.83552885, + "num_input_tokens_seen": 7374425, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.41699219, + "step": 354, + "time_per_iteration": 2.4810643196105957 + }, + { + "auxiliary_loss_clip": 0.01318702, + "auxiliary_loss_mlp": 0.01092149, + "balance_loss_clip": 1.08389461, + "balance_loss_mlp": 1.04799402, + "epoch": 0.021343754697129114, + "flos": 25192233475200.0, + "grad_norm": 2.650535251378492, + "language_loss": 0.81255203, + "learning_rate": 3.780775860546545e-06, + "loss": 0.8366605, + "num_input_tokens_seen": 7394175, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.44165039, + "step": 355, + "time_per_iteration": 2.5206656455993652 + }, + { + "auxiliary_loss_clip": 0.01315748, + "auxiliary_loss_mlp": 0.01091835, + "balance_loss_clip": 1.0831418, + "balance_loss_mlp": 1.04882383, + "epoch": 0.021403877949797083, + "flos": 17274182279040.0, + "grad_norm": 2.263302853886685, + "language_loss": 0.89420265, + "learning_rate": 3.7825869789939474e-06, + "loss": 0.91827846, + "num_input_tokens_seen": 7412645, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.43041992, + "step": 356, + "time_per_iteration": 2.4236161708831787 + }, + { + "auxiliary_loss_clip": 0.01316653, + "auxiliary_loss_mlp": 0.01090312, + "balance_loss_clip": 1.08760977, + "balance_loss_mlp": 1.04622817, + "epoch": 0.021464001202465055, + "flos": 30917435276160.0, + "grad_norm": 3.4480026671535677, + "language_loss": 0.80360562, + "learning_rate": 3.784393017158528e-06, + "loss": 0.82767522, + "num_input_tokens_seen": 7432275, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.44091797, + "step": 357, + "time_per_iteration": 2.6487536430358887 + }, + { + "auxiliary_loss_clip": 0.01314114, + "auxiliary_loss_mlp": 0.0108554, + "balance_loss_clip": 1.08297634, + "balance_loss_mlp": 1.04510438, + "epoch": 0.021524124455133024, + "flos": 18186385098240.0, + "grad_norm": 2.5651824748246583, + "language_loss": 0.76786679, + "learning_rate": 3.786194003461506e-06, + "loss": 0.79186332, + "num_input_tokens_seen": 7450245, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.40429688, + "step": 358, + "time_per_iteration": 2.448637008666992 + }, + { + "auxiliary_loss_clip": 0.01313053, + "auxiliary_loss_mlp": 0.01100857, + "balance_loss_clip": 1.08169568, + "balance_loss_mlp": 1.05407906, + "epoch": 0.021584247707800992, + "flos": 13805786039040.0, + "grad_norm": 2.966863950593063, + "language_loss": 0.8850922, + "learning_rate": 3.787989966086264e-06, + "loss": 0.90923131, + "num_input_tokens_seen": 7466845, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.46801758, + "step": 359, + "time_per_iteration": 2.4450812339782715 + }, + { + "auxiliary_loss_clip": 0.01325979, + "auxiliary_loss_mlp": 0.01091817, + "balance_loss_clip": 1.08866465, + "balance_loss_mlp": 1.05073786, + "epoch": 0.02164437096046896, + "flos": 23294713703040.0, + "grad_norm": 2.9858240426143325, + "language_loss": 0.7626704, + "learning_rate": 3.789780932980997e-06, + "loss": 0.78684831, + "num_input_tokens_seen": 7485450, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.41040039, + "step": 360, + "time_per_iteration": 2.514803171157837 + }, + { + "auxiliary_loss_clip": 0.01193918, + "auxiliary_loss_mlp": 0.01045047, + "balance_loss_clip": 1.06586206, + "balance_loss_mlp": 1.0323633, + "epoch": 0.02170449421313693, + "flos": 68899578341760.0, + "grad_norm": 0.8540056185595325, + "language_loss": 0.64950377, + "learning_rate": 3.79156693186132e-06, + "loss": 0.67189348, + "num_input_tokens_seen": 7553780, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.12683105, + "step": 361, + "time_per_iteration": 3.2020087242126465 + }, + { + "auxiliary_loss_clip": 0.01312477, + "auxiliary_loss_mlp": 0.01089369, + "balance_loss_clip": 1.08002579, + "balance_loss_mlp": 1.04740739, + "epoch": 0.0217646174658049, + "flos": 25228539146880.0, + "grad_norm": 3.5262776065028913, + "language_loss": 0.78342366, + "learning_rate": 3.7933479902128433e-06, + "loss": 0.80744207, + "num_input_tokens_seen": 7574155, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.41967773, + "step": 362, + "time_per_iteration": 2.47363543510437 + }, + { + "auxiliary_loss_clip": 0.01315607, + "auxiliary_loss_mlp": 0.01091664, + "balance_loss_clip": 1.08306551, + "balance_loss_mlp": 1.05020308, + "epoch": 0.02182474071847287, + "flos": 22893124671360.0, + "grad_norm": 2.239734995152755, + "language_loss": 0.92331994, + "learning_rate": 3.7951241352937077e-06, + "loss": 0.94739264, + "num_input_tokens_seen": 7592320, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.41479492, + "step": 363, + "time_per_iteration": 2.508645534515381 + }, + { + "auxiliary_loss_clip": 0.01311864, + "auxiliary_loss_mlp": 0.01101503, + "balance_loss_clip": 1.08335257, + "balance_loss_mlp": 1.06085229, + "epoch": 0.02188486397114084, + "flos": 23658991482240.0, + "grad_norm": 2.36441142921019, + "language_loss": 0.89824677, + "learning_rate": 3.7968953941370915e-06, + "loss": 0.92238045, + "num_input_tokens_seen": 7611185, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.40673828, + "step": 364, + "time_per_iteration": 2.4711058139801025 + }, + { + "auxiliary_loss_clip": 0.01322294, + "auxiliary_loss_mlp": 0.01098863, + "balance_loss_clip": 1.09014928, + "balance_loss_mlp": 1.05606651, + "epoch": 0.021944987223808807, + "flos": 21543637680000.0, + "grad_norm": 1.8847305183197196, + "language_loss": 0.79169393, + "learning_rate": 3.798661793553676e-06, + "loss": 0.81590545, + "num_input_tokens_seen": 7631970, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.42773438, + "step": 365, + "time_per_iteration": 2.5032753944396973 + }, + { + "auxiliary_loss_clip": 0.0131706, + "auxiliary_loss_mlp": 0.01098332, + "balance_loss_clip": 1.08593309, + "balance_loss_mlp": 1.05453467, + "epoch": 0.022005110476476776, + "flos": 16070887641600.0, + "grad_norm": 2.0359826160600574, + "language_loss": 0.84581077, + "learning_rate": 3.8004233601340808e-06, + "loss": 0.86996472, + "num_input_tokens_seen": 7649745, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.43798828, + "step": 366, + "time_per_iteration": 3.813486337661743 + }, + { + "auxiliary_loss_clip": 0.01317869, + "auxiliary_loss_mlp": 0.01088908, + "balance_loss_clip": 1.08510232, + "balance_loss_mlp": 1.04897237, + "epoch": 0.022065233729144748, + "flos": 21433715084160.0, + "grad_norm": 1.9530141583615657, + "language_loss": 0.87140262, + "learning_rate": 3.8021801202512694e-06, + "loss": 0.89547038, + "num_input_tokens_seen": 7668830, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.39941406, + "step": 367, + "time_per_iteration": 5.275500535964966 + }, + { + "auxiliary_loss_clip": 0.01321287, + "auxiliary_loss_mlp": 0.01098537, + "balance_loss_clip": 1.08387804, + "balance_loss_mlp": 1.05493045, + "epoch": 0.022125356981812717, + "flos": 21543709507200.0, + "grad_norm": 3.2726756719315357, + "language_loss": 0.84845829, + "learning_rate": 3.803932100062912e-06, + "loss": 0.87265652, + "num_input_tokens_seen": 7687240, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.43603516, + "step": 368, + "time_per_iteration": 2.4383738040924072 + }, + { + "auxiliary_loss_clip": 0.01323773, + "auxiliary_loss_mlp": 0.01084897, + "balance_loss_clip": 1.08427835, + "balance_loss_mlp": 1.04381812, + "epoch": 0.022185480234480685, + "flos": 20704153944960.0, + "grad_norm": 2.35903935725817, + "language_loss": 0.75770748, + "learning_rate": 3.8056793255137264e-06, + "loss": 0.78179419, + "num_input_tokens_seen": 7704440, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.41088867, + "step": 369, + "time_per_iteration": 3.8357789516448975 + }, + { + "auxiliary_loss_clip": 0.01313573, + "auxiliary_loss_mlp": 0.01104251, + "balance_loss_clip": 1.08420539, + "balance_loss_mlp": 1.06290925, + "epoch": 0.022245603487148654, + "flos": 25193203142400.0, + "grad_norm": 2.293211266901971, + "language_loss": 0.82812917, + "learning_rate": 3.8074218223377844e-06, + "loss": 0.85230744, + "num_input_tokens_seen": 7727160, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.41308594, + "step": 370, + "time_per_iteration": 2.5040946006774902 + }, + { + "auxiliary_loss_clip": 0.01311574, + "auxiliary_loss_mlp": 0.0110642, + "balance_loss_clip": 1.0827167, + "balance_loss_mlp": 1.0623126, + "epoch": 0.022305726739816623, + "flos": 21395936954880.0, + "grad_norm": 4.034374875977678, + "language_loss": 0.81657517, + "learning_rate": 3.8091596160607834e-06, + "loss": 0.84075511, + "num_input_tokens_seen": 7747730, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.44091797, + "step": 371, + "time_per_iteration": 2.505246162414551 + }, + { + "auxiliary_loss_clip": 0.01319946, + "auxiliary_loss_mlp": 0.010958, + "balance_loss_clip": 1.08858275, + "balance_loss_mlp": 1.05267024, + "epoch": 0.022365849992484595, + "flos": 22492146170880.0, + "grad_norm": 2.1347760458320857, + "language_loss": 0.83547938, + "learning_rate": 3.8108927320022896e-06, + "loss": 0.8596369, + "num_input_tokens_seen": 7766765, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.43139648, + "step": 372, + "time_per_iteration": 2.4667232036590576 + }, + { + "auxiliary_loss_clip": 0.01311884, + "auxiliary_loss_mlp": 0.01095555, + "balance_loss_clip": 1.08401155, + "balance_loss_mlp": 1.05356908, + "epoch": 0.022425973245152563, + "flos": 17856581397120.0, + "grad_norm": 5.092924682333101, + "language_loss": 0.78811431, + "learning_rate": 3.8126211952779548e-06, + "loss": 0.81218874, + "num_input_tokens_seen": 7784010, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.41967773, + "step": 373, + "time_per_iteration": 2.5102555751800537 + }, + { + "auxiliary_loss_clip": 0.01315156, + "auxiliary_loss_mlp": 0.01091472, + "balance_loss_clip": 1.08628774, + "balance_loss_mlp": 1.04812777, + "epoch": 0.022486096497820532, + "flos": 15483029656320.0, + "grad_norm": 3.492241710056962, + "language_loss": 0.77587813, + "learning_rate": 3.8143450308016952e-06, + "loss": 0.79994446, + "num_input_tokens_seen": 7801305, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.43334961, + "step": 374, + "time_per_iteration": 2.4135072231292725 + }, + { + "auxiliary_loss_clip": 0.01307004, + "auxiliary_loss_mlp": 0.01077726, + "balance_loss_clip": 1.07693374, + "balance_loss_mlp": 1.03454816, + "epoch": 0.0225462197504885, + "flos": 27784157950080.0, + "grad_norm": 1.5976523865448187, + "language_loss": 0.85984397, + "learning_rate": 3.8160642632878525e-06, + "loss": 0.88369125, + "num_input_tokens_seen": 7823965, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.43164062, + "step": 375, + "time_per_iteration": 2.545318365097046 + }, + { + "auxiliary_loss_clip": 0.01314823, + "auxiliary_loss_mlp": 0.01100579, + "balance_loss_clip": 1.08648205, + "balance_loss_mlp": 1.05747318, + "epoch": 0.02260634300315647, + "flos": 19975490645760.0, + "grad_norm": 2.295731392134249, + "language_loss": 0.89131355, + "learning_rate": 3.817778917253314e-06, + "loss": 0.91546756, + "num_input_tokens_seen": 7842115, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.43139648, + "step": 376, + "time_per_iteration": 2.43568754196167 + }, + { + "auxiliary_loss_clip": 0.01320194, + "auxiliary_loss_mlp": 0.01088421, + "balance_loss_clip": 1.0836978, + "balance_loss_mlp": 1.0482949, + "epoch": 0.02266646625582444, + "flos": 16028189349120.0, + "grad_norm": 2.53324283079049, + "language_loss": 0.75006473, + "learning_rate": 3.8194890170196155e-06, + "loss": 0.77415085, + "num_input_tokens_seen": 7857830, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.40112305, + "step": 377, + "time_per_iteration": 2.469390630722046 + }, + { + "auxiliary_loss_clip": 0.01302808, + "auxiliary_loss_mlp": 0.01090552, + "balance_loss_clip": 1.08189917, + "balance_loss_mlp": 1.04828072, + "epoch": 0.02272658950849241, + "flos": 20404622430720.0, + "grad_norm": 2.8518967773224553, + "language_loss": 0.99475724, + "learning_rate": 3.8211945867150055e-06, + "loss": 1.01869082, + "num_input_tokens_seen": 7875840, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.4230957, + "step": 378, + "time_per_iteration": 2.464729070663452 + }, + { + "auxiliary_loss_clip": 0.01184259, + "auxiliary_loss_mlp": 0.01071445, + "balance_loss_clip": 1.06566882, + "balance_loss_mlp": 1.05885649, + "epoch": 0.02278671276116038, + "flos": 69847332647040.0, + "grad_norm": 1.4884567607862158, + "language_loss": 0.75408298, + "learning_rate": 3.822895650276492e-06, + "loss": 0.77664, + "num_input_tokens_seen": 7940190, + "router_z_loss_clip": 1.18554688, + "router_z_loss_mlp": 0.12585449, + "step": 379, + "time_per_iteration": 3.161653995513916 + }, + { + "auxiliary_loss_clip": 0.01316767, + "auxiliary_loss_mlp": 0.01089673, + "balance_loss_clip": 1.08115625, + "balance_loss_mlp": 1.04978609, + "epoch": 0.022846836013828347, + "flos": 38508771340800.0, + "grad_norm": 2.3947646343964375, + "language_loss": 0.78178883, + "learning_rate": 3.824592231451859e-06, + "loss": 0.80585325, + "num_input_tokens_seen": 7960840, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.39868164, + "step": 380, + "time_per_iteration": 2.5933032035827637 + }, + { + "auxiliary_loss_clip": 0.01308845, + "auxiliary_loss_mlp": 0.01088955, + "balance_loss_clip": 1.08196211, + "balance_loss_mlp": 1.04901993, + "epoch": 0.02290695926649632, + "flos": 20959478795520.0, + "grad_norm": 2.2652286083402964, + "language_loss": 0.97052771, + "learning_rate": 3.826284353801652e-06, + "loss": 0.9945057, + "num_input_tokens_seen": 7975500, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.39916992, + "step": 381, + "time_per_iteration": 2.508247137069702 + }, + { + "auxiliary_loss_clip": 0.01319459, + "auxiliary_loss_mlp": 0.01098181, + "balance_loss_clip": 1.08507919, + "balance_loss_mlp": 1.05688703, + "epoch": 0.022967082519164288, + "flos": 24022407335040.0, + "grad_norm": 19.20286983313803, + "language_loss": 0.87926364, + "learning_rate": 3.827972040701142e-06, + "loss": 0.90344, + "num_input_tokens_seen": 7993880, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.41308594, + "step": 382, + "time_per_iteration": 2.4682390689849854 + }, + { + "auxiliary_loss_clip": 0.01308648, + "auxiliary_loss_mlp": 0.0110287, + "balance_loss_clip": 1.08352506, + "balance_loss_mlp": 1.06159925, + "epoch": 0.023027205771832256, + "flos": 20997149184000.0, + "grad_norm": 2.2988009752645087, + "language_loss": 0.85218489, + "learning_rate": 3.829655315342268e-06, + "loss": 0.87629998, + "num_input_tokens_seen": 8012730, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.41259766, + "step": 383, + "time_per_iteration": 2.551142454147339 + }, + { + "auxiliary_loss_clip": 0.0130951, + "auxiliary_loss_mlp": 0.01111606, + "balance_loss_clip": 1.0844152, + "balance_loss_mlp": 1.07093179, + "epoch": 0.023087329024500225, + "flos": 21360816432000.0, + "grad_norm": 2.6767872254631038, + "language_loss": 0.83620167, + "learning_rate": 3.831334200735543e-06, + "loss": 0.86041278, + "num_input_tokens_seen": 8031275, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.40673828, + "step": 384, + "time_per_iteration": 2.4400908946990967 + }, + { + "auxiliary_loss_clip": 0.01306586, + "auxiliary_loss_mlp": 0.01097138, + "balance_loss_clip": 1.085289, + "balance_loss_mlp": 1.05839527, + "epoch": 0.023147452277168194, + "flos": 21872435800320.0, + "grad_norm": 1.8315992068023217, + "language_loss": 0.89275712, + "learning_rate": 3.8330087197119426e-06, + "loss": 0.91679436, + "num_input_tokens_seen": 8051600, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.38745117, + "step": 385, + "time_per_iteration": 2.542623281478882 + }, + { + "auxiliary_loss_clip": 0.01313068, + "auxiliary_loss_mlp": 0.01124741, + "balance_loss_clip": 1.08477974, + "balance_loss_mlp": 1.08449554, + "epoch": 0.023207575529836166, + "flos": 18916700423040.0, + "grad_norm": 1.75785039400282, + "language_loss": 0.70067465, + "learning_rate": 3.83467889492477e-06, + "loss": 0.72505271, + "num_input_tokens_seen": 8070600, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.40234375, + "step": 386, + "time_per_iteration": 2.4235262870788574 + }, + { + "auxiliary_loss_clip": 0.01312801, + "auxiliary_loss_mlp": 0.01094765, + "balance_loss_clip": 1.08520889, + "balance_loss_mlp": 1.05587864, + "epoch": 0.023267698782504134, + "flos": 25046005207680.0, + "grad_norm": 1.869354462207511, + "language_loss": 0.88049823, + "learning_rate": 3.836344748851495e-06, + "loss": 0.90457392, + "num_input_tokens_seen": 8090680, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.38867188, + "step": 387, + "time_per_iteration": 2.51749324798584 + }, + { + "auxiliary_loss_clip": 0.0131358, + "auxiliary_loss_mlp": 0.01084582, + "balance_loss_clip": 1.08498812, + "balance_loss_mlp": 1.0433358, + "epoch": 0.023327822035172103, + "flos": 28879217930880.0, + "grad_norm": 1.9569447316801576, + "language_loss": 0.83395851, + "learning_rate": 3.838006303795566e-06, + "loss": 0.85794014, + "num_input_tokens_seen": 8114610, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.41259766, + "step": 388, + "time_per_iteration": 2.5840675830841064 + }, + { + "auxiliary_loss_clip": 0.01309575, + "auxiliary_loss_mlp": 0.01092266, + "balance_loss_clip": 1.08339965, + "balance_loss_mlp": 1.05342805, + "epoch": 0.02338794528784007, + "flos": 27121533805440.0, + "grad_norm": 2.2272932429754095, + "language_loss": 0.93784678, + "learning_rate": 3.839663581888206e-06, + "loss": 0.96186525, + "num_input_tokens_seen": 8133975, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.38867188, + "step": 389, + "time_per_iteration": 2.544656991958618 + }, + { + "auxiliary_loss_clip": 0.01300341, + "auxiliary_loss_mlp": 0.01088381, + "balance_loss_clip": 1.08203459, + "balance_loss_mlp": 1.04777861, + "epoch": 0.02344806854050804, + "flos": 21322355944320.0, + "grad_norm": 1.9128678012159233, + "language_loss": 0.87708509, + "learning_rate": 3.841316605090178e-06, + "loss": 0.90097231, + "num_input_tokens_seen": 8153570, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.40576172, + "step": 390, + "time_per_iteration": 2.5253286361694336 + }, + { + "auxiliary_loss_clip": 0.01308099, + "auxiliary_loss_mlp": 0.01094071, + "balance_loss_clip": 1.084481, + "balance_loss_mlp": 1.05640042, + "epoch": 0.023508191793176012, + "flos": 24789997998720.0, + "grad_norm": 2.9511460068308897, + "language_loss": 0.89603043, + "learning_rate": 3.842965395193529e-06, + "loss": 0.92005205, + "num_input_tokens_seen": 8170075, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.37695312, + "step": 391, + "time_per_iteration": 2.494295597076416 + }, + { + "auxiliary_loss_clip": 0.01306383, + "auxiliary_loss_mlp": 0.01078269, + "balance_loss_clip": 1.08296323, + "balance_loss_mlp": 1.0390017, + "epoch": 0.02356831504584398, + "flos": 25995375624960.0, + "grad_norm": 2.1131699678599305, + "language_loss": 0.86113322, + "learning_rate": 3.84460997382332e-06, + "loss": 0.88497967, + "num_input_tokens_seen": 8190420, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.39257812, + "step": 392, + "time_per_iteration": 2.503603458404541 + }, + { + "auxiliary_loss_clip": 0.0130428, + "auxiliary_loss_mlp": 0.01097365, + "balance_loss_clip": 1.08367062, + "balance_loss_mlp": 1.05568993, + "epoch": 0.02362843829851195, + "flos": 19062461813760.0, + "grad_norm": 1.8771367070604825, + "language_loss": 0.89085782, + "learning_rate": 3.8462503624393256e-06, + "loss": 0.91487432, + "num_input_tokens_seen": 8208790, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.41674805, + "step": 393, + "time_per_iteration": 2.4443323612213135 + }, + { + "auxiliary_loss_clip": 0.01311863, + "auxiliary_loss_mlp": 0.01112285, + "balance_loss_clip": 1.08652091, + "balance_loss_mlp": 1.07039523, + "epoch": 0.023688561551179918, + "flos": 16071031296000.0, + "grad_norm": 2.0840062961648473, + "language_loss": 0.81415558, + "learning_rate": 3.84788658233771e-06, + "loss": 0.83839709, + "num_input_tokens_seen": 8226885, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.41870117, + "step": 394, + "time_per_iteration": 2.4276275634765625 + }, + { + "auxiliary_loss_clip": 0.01316426, + "auxiliary_loss_mlp": 0.01096703, + "balance_loss_clip": 1.08903277, + "balance_loss_mlp": 1.05562353, + "epoch": 0.023748684803847887, + "flos": 21724375939200.0, + "grad_norm": 2.3856496434510976, + "language_loss": 0.85934687, + "learning_rate": 3.84951865465269e-06, + "loss": 0.88347811, + "num_input_tokens_seen": 8246825, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.41088867, + "step": 395, + "time_per_iteration": 2.4615249633789062 + }, + { + "auxiliary_loss_clip": 0.01177778, + "auxiliary_loss_mlp": 0.01013971, + "balance_loss_clip": 1.06187141, + "balance_loss_mlp": 1.00078654, + "epoch": 0.02380880805651586, + "flos": 61926192881280.0, + "grad_norm": 0.9409759131553146, + "language_loss": 0.63898599, + "learning_rate": 3.851146600358172e-06, + "loss": 0.66090357, + "num_input_tokens_seen": 8302835, + "router_z_loss_clip": 1.16015625, + "router_z_loss_mlp": 0.13208008, + "step": 396, + "time_per_iteration": 2.9194271564483643 + }, + { + "auxiliary_loss_clip": 0.01302515, + "auxiliary_loss_mlp": 0.01076287, + "balance_loss_clip": 1.08058059, + "balance_loss_mlp": 1.0381397, + "epoch": 0.023868931309183827, + "flos": 20266331068800.0, + "grad_norm": 2.6870110594491026, + "language_loss": 0.83846211, + "learning_rate": 3.852770440269372e-06, + "loss": 0.86225009, + "num_input_tokens_seen": 8320745, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.38183594, + "step": 397, + "time_per_iteration": 2.441936731338501 + }, + { + "auxiliary_loss_clip": 0.01312058, + "auxiliary_loss_mlp": 0.01092087, + "balance_loss_clip": 1.08677292, + "balance_loss_mlp": 1.05146015, + "epoch": 0.023929054561851796, + "flos": 21139103733120.0, + "grad_norm": 6.140274559397543, + "language_loss": 0.84564888, + "learning_rate": 3.854390195044404e-06, + "loss": 0.86969036, + "num_input_tokens_seen": 8339540, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.40625, + "step": 398, + "time_per_iteration": 2.461946487426758 + }, + { + "auxiliary_loss_clip": 0.01311464, + "auxiliary_loss_mlp": 0.01080372, + "balance_loss_clip": 1.08283186, + "balance_loss_mlp": 1.03879189, + "epoch": 0.023989177814519765, + "flos": 13698521049600.0, + "grad_norm": 2.5081181753705, + "language_loss": 0.85868371, + "learning_rate": 3.856005885185868e-06, + "loss": 0.8826021, + "num_input_tokens_seen": 8354890, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.41577148, + "step": 399, + "time_per_iteration": 2.407464027404785 + }, + { + "auxiliary_loss_clip": 0.01301257, + "auxiliary_loss_mlp": 0.01092288, + "balance_loss_clip": 1.08212209, + "balance_loss_mlp": 1.05266261, + "epoch": 0.024049301067187733, + "flos": 26322018929280.0, + "grad_norm": 2.085686885816695, + "language_loss": 0.86559182, + "learning_rate": 3.857617531042398e-06, + "loss": 0.8895272, + "num_input_tokens_seen": 8375845, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.39648438, + "step": 400, + "time_per_iteration": 2.526860475540161 + }, + { + "auxiliary_loss_clip": 0.01304203, + "auxiliary_loss_mlp": 0.01081979, + "balance_loss_clip": 1.08325005, + "balance_loss_mlp": 1.04335475, + "epoch": 0.024109424319855705, + "flos": 24425432910720.0, + "grad_norm": 2.0556360426862, + "language_loss": 0.7949453, + "learning_rate": 3.8592251528102065e-06, + "loss": 0.81880718, + "num_input_tokens_seen": 8395240, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.38623047, + "step": 401, + "time_per_iteration": 2.503732919692993 + }, + { + "auxiliary_loss_clip": 0.01302939, + "auxiliary_loss_mlp": 0.01098193, + "balance_loss_clip": 1.08044064, + "balance_loss_mlp": 1.05668449, + "epoch": 0.024169547572523674, + "flos": 29604397610880.0, + "grad_norm": 2.8606436850423247, + "language_loss": 0.78459942, + "learning_rate": 3.8608287705345976e-06, + "loss": 0.8086108, + "num_input_tokens_seen": 8416950, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.41503906, + "step": 402, + "time_per_iteration": 2.5160930156707764 + }, + { + "auxiliary_loss_clip": 0.01303445, + "auxiliary_loss_mlp": 0.01084952, + "balance_loss_clip": 1.07838213, + "balance_loss_mlp": 1.04108322, + "epoch": 0.024229670825191642, + "flos": 22601458235520.0, + "grad_norm": 2.251414665169107, + "language_loss": 0.94865251, + "learning_rate": 3.86242840411147e-06, + "loss": 0.97253644, + "num_input_tokens_seen": 8433660, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.4387207, + "step": 403, + "time_per_iteration": 2.4378607273101807 + }, + { + "auxiliary_loss_clip": 0.01307221, + "auxiliary_loss_mlp": 0.01089191, + "balance_loss_clip": 1.07806945, + "balance_loss_mlp": 1.0483973, + "epoch": 0.02428979407785961, + "flos": 18150258994560.0, + "grad_norm": 2.3323043115521256, + "language_loss": 1.00016093, + "learning_rate": 3.864024073288798e-06, + "loss": 1.0241251, + "num_input_tokens_seen": 8450180, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.40771484, + "step": 404, + "time_per_iteration": 2.4299652576446533 + }, + { + "auxiliary_loss_clip": 0.01312598, + "auxiliary_loss_mlp": 0.01101313, + "balance_loss_clip": 1.08469355, + "balance_loss_mlp": 1.06185484, + "epoch": 0.024349917330527583, + "flos": 15304984917120.0, + "grad_norm": 2.5948930744060497, + "language_loss": 0.87927818, + "learning_rate": 3.865615797668091e-06, + "loss": 0.90341723, + "num_input_tokens_seen": 8467775, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.39428711, + "step": 405, + "time_per_iteration": 2.464714288711548 + }, + { + "auxiliary_loss_clip": 0.01315677, + "auxiliary_loss_mlp": 0.0110187, + "balance_loss_clip": 1.08616757, + "balance_loss_mlp": 1.06150627, + "epoch": 0.024410040583195552, + "flos": 20773892200320.0, + "grad_norm": 2.3660551717368525, + "language_loss": 0.93772161, + "learning_rate": 3.867203596705844e-06, + "loss": 0.96189708, + "num_input_tokens_seen": 8486765, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.40405273, + "step": 406, + "time_per_iteration": 2.5140440464019775 + }, + { + "auxiliary_loss_clip": 0.01306505, + "auxiliary_loss_mlp": 0.0108787, + "balance_loss_clip": 1.0833025, + "balance_loss_mlp": 1.04671907, + "epoch": 0.02447016383586352, + "flos": 21798854789760.0, + "grad_norm": 3.0228258132842973, + "language_loss": 0.87625885, + "learning_rate": 3.86878748971496e-06, + "loss": 0.90020257, + "num_input_tokens_seen": 8506515, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.41137695, + "step": 407, + "time_per_iteration": 2.6760051250457764 + }, + { + "auxiliary_loss_clip": 0.01305264, + "auxiliary_loss_mlp": 0.01085682, + "balance_loss_clip": 1.08410323, + "balance_loss_mlp": 1.04660559, + "epoch": 0.02453028708853149, + "flos": 33948116380800.0, + "grad_norm": 2.0098813959256634, + "language_loss": 0.74270284, + "learning_rate": 3.8703674958661596e-06, + "loss": 0.76661229, + "num_input_tokens_seen": 8528035, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.39086914, + "step": 408, + "time_per_iteration": 2.549370050430298 + }, + { + "auxiliary_loss_clip": 0.01310279, + "auxiliary_loss_mlp": 0.01096447, + "balance_loss_clip": 1.08275294, + "balance_loss_mlp": 1.05591607, + "epoch": 0.024590410341199458, + "flos": 21793000872960.0, + "grad_norm": 2.657501592001049, + "language_loss": 0.92910826, + "learning_rate": 3.871943634189376e-06, + "loss": 0.95317554, + "num_input_tokens_seen": 8546455, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.4050293, + "step": 409, + "time_per_iteration": 3.9165544509887695 + }, + { + "auxiliary_loss_clip": 0.01306824, + "auxiliary_loss_mlp": 0.01077058, + "balance_loss_clip": 1.08348942, + "balance_loss_mlp": 1.04055655, + "epoch": 0.02465053359386743, + "flos": 35114782124160.0, + "grad_norm": 2.1902209982583716, + "language_loss": 0.8271116, + "learning_rate": 3.873515923575128e-06, + "loss": 0.85095042, + "num_input_tokens_seen": 8568450, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.36499023, + "step": 410, + "time_per_iteration": 2.62082839012146 + }, + { + "auxiliary_loss_clip": 0.01306315, + "auxiliary_loss_mlp": 0.01089621, + "balance_loss_clip": 1.08359313, + "balance_loss_mlp": 1.05147421, + "epoch": 0.0247106568465354, + "flos": 27451409333760.0, + "grad_norm": 2.4969029403161453, + "language_loss": 0.78002048, + "learning_rate": 3.875084382775879e-06, + "loss": 0.80397987, + "num_input_tokens_seen": 8589340, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.38134766, + "step": 411, + "time_per_iteration": 5.348876953125 + }, + { + "auxiliary_loss_clip": 0.01307795, + "auxiliary_loss_mlp": 0.01113611, + "balance_loss_clip": 1.08083105, + "balance_loss_mlp": 1.06978989, + "epoch": 0.024770780099203367, + "flos": 20703794808960.0, + "grad_norm": 3.0165637460200516, + "language_loss": 0.86435175, + "learning_rate": 3.87664903040738e-06, + "loss": 0.88856584, + "num_input_tokens_seen": 8607150, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.43823242, + "step": 412, + "time_per_iteration": 3.8021960258483887 + }, + { + "auxiliary_loss_clip": 0.01179371, + "auxiliary_loss_mlp": 0.01019415, + "balance_loss_clip": 1.06560171, + "balance_loss_mlp": 1.00659955, + "epoch": 0.024830903351871336, + "flos": 69551859369600.0, + "grad_norm": 0.8471634946284053, + "language_loss": 0.5850774, + "learning_rate": 3.878209884949994e-06, + "loss": 0.60706532, + "num_input_tokens_seen": 8669865, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.1282959, + "step": 413, + "time_per_iteration": 3.1158790588378906 + }, + { + "auxiliary_loss_clip": 0.01301776, + "auxiliary_loss_mlp": 0.01097766, + "balance_loss_clip": 1.08072829, + "balance_loss_mlp": 1.05549455, + "epoch": 0.024891026604539304, + "flos": 32270477713920.0, + "grad_norm": 1.7717451050194197, + "language_loss": 0.80297172, + "learning_rate": 3.879766964750006e-06, + "loss": 0.82696718, + "num_input_tokens_seen": 8690235, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.42236328, + "step": 414, + "time_per_iteration": 2.578296184539795 + }, + { + "auxiliary_loss_clip": 0.0129529, + "auxiliary_loss_mlp": 0.01100892, + "balance_loss_clip": 1.07688415, + "balance_loss_mlp": 1.06219649, + "epoch": 0.024951149857207276, + "flos": 18840282238080.0, + "grad_norm": 2.411738165808652, + "language_loss": 0.80329901, + "learning_rate": 3.881320288020917e-06, + "loss": 0.82726085, + "num_input_tokens_seen": 8706295, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.38696289, + "step": 415, + "time_per_iteration": 2.3845882415771484 + }, + { + "auxiliary_loss_clip": 0.01310972, + "auxiliary_loss_mlp": 0.01088585, + "balance_loss_clip": 1.08355284, + "balance_loss_mlp": 1.05077171, + "epoch": 0.025011273109875245, + "flos": 15377201210880.0, + "grad_norm": 2.7048789033958376, + "language_loss": 0.96347153, + "learning_rate": 3.882869872844723e-06, + "loss": 0.98746711, + "num_input_tokens_seen": 8724200, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.37792969, + "step": 416, + "time_per_iteration": 2.448673725128174 + }, + { + "auxiliary_loss_clip": 0.01299812, + "auxiliary_loss_mlp": 0.01082459, + "balance_loss_clip": 1.07778788, + "balance_loss_mlp": 1.04197538, + "epoch": 0.025071396362543213, + "flos": 18915515274240.0, + "grad_norm": 2.058466286411631, + "language_loss": 0.77593607, + "learning_rate": 3.884415737173176e-06, + "loss": 0.79975879, + "num_input_tokens_seen": 8744170, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.4050293, + "step": 417, + "time_per_iteration": 2.4508886337280273 + }, + { + "auxiliary_loss_clip": 0.01298506, + "auxiliary_loss_mlp": 0.01101204, + "balance_loss_clip": 1.08249116, + "balance_loss_mlp": 1.05940938, + "epoch": 0.025131519615211182, + "flos": 25337958952320.0, + "grad_norm": 1.5498167108272944, + "language_loss": 0.77084255, + "learning_rate": 3.8859578988290344e-06, + "loss": 0.79483968, + "num_input_tokens_seen": 8765120, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.41796875, + "step": 418, + "time_per_iteration": 2.5402934551239014 + }, + { + "auxiliary_loss_clip": 0.0130844, + "auxiliary_loss_mlp": 0.01077373, + "balance_loss_clip": 1.0852232, + "balance_loss_mlp": 1.03989363, + "epoch": 0.02519164286787915, + "flos": 18953149749120.0, + "grad_norm": 2.4398310056764707, + "language_loss": 0.81558144, + "learning_rate": 3.887496375507294e-06, + "loss": 0.83943963, + "num_input_tokens_seen": 8783500, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.375, + "step": 419, + "time_per_iteration": 2.4478797912597656 + }, + { + "auxiliary_loss_clip": 0.01302489, + "auxiliary_loss_mlp": 0.0109063, + "balance_loss_clip": 1.08310294, + "balance_loss_mlp": 1.04974103, + "epoch": 0.025251766120547123, + "flos": 17421092904960.0, + "grad_norm": 1.754117171755596, + "language_loss": 0.73740703, + "learning_rate": 3.8890311847764065e-06, + "loss": 0.76133823, + "num_input_tokens_seen": 8801175, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.40869141, + "step": 420, + "time_per_iteration": 2.5171873569488525 + }, + { + "auxiliary_loss_clip": 0.01298766, + "auxiliary_loss_mlp": 0.01105034, + "balance_loss_clip": 1.07637048, + "balance_loss_mlp": 1.06266725, + "epoch": 0.02531188937321509, + "flos": 25045430590080.0, + "grad_norm": 1.729303179593932, + "language_loss": 0.78876728, + "learning_rate": 3.890562344079484e-06, + "loss": 0.81280529, + "num_input_tokens_seen": 8820215, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.42382812, + "step": 421, + "time_per_iteration": 2.4710254669189453 + }, + { + "auxiliary_loss_clip": 0.01301466, + "auxiliary_loss_mlp": 0.01090241, + "balance_loss_clip": 1.08204889, + "balance_loss_mlp": 1.05035353, + "epoch": 0.02537201262588306, + "flos": 30592228515840.0, + "grad_norm": 13.185347063797387, + "language_loss": 0.82116902, + "learning_rate": 3.89208987073549e-06, + "loss": 0.8450861, + "num_input_tokens_seen": 8839660, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.39941406, + "step": 422, + "time_per_iteration": 2.5765597820281982 + }, + { + "auxiliary_loss_clip": 0.01299931, + "auxiliary_loss_mlp": 0.01083209, + "balance_loss_clip": 1.07704413, + "balance_loss_mlp": 1.04716039, + "epoch": 0.02543213587855103, + "flos": 26065365275520.0, + "grad_norm": 3.1628708759970556, + "language_loss": 0.83628488, + "learning_rate": 3.893613781940409e-06, + "loss": 0.8601163, + "num_input_tokens_seen": 8859280, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.3605957, + "step": 423, + "time_per_iteration": 2.4490549564361572 + }, + { + "auxiliary_loss_clip": 0.01295334, + "auxiliary_loss_mlp": 0.01081522, + "balance_loss_clip": 1.07592738, + "balance_loss_mlp": 1.04420996, + "epoch": 0.025492259131218997, + "flos": 36022818965760.0, + "grad_norm": 3.823307652694222, + "language_loss": 0.74646354, + "learning_rate": 3.895134094768415e-06, + "loss": 0.77023208, + "num_input_tokens_seen": 8880560, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.37329102, + "step": 424, + "time_per_iteration": 2.592684507369995 + }, + { + "auxiliary_loss_clip": 0.01305644, + "auxiliary_loss_mlp": 0.01095962, + "balance_loss_clip": 1.08229184, + "balance_loss_mlp": 1.05893588, + "epoch": 0.02555238238388697, + "flos": 18588045957120.0, + "grad_norm": 2.567370455330783, + "language_loss": 0.83335698, + "learning_rate": 3.896650826173015e-06, + "loss": 0.85737306, + "num_input_tokens_seen": 8899155, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.36987305, + "step": 425, + "time_per_iteration": 2.444528579711914 + }, + { + "auxiliary_loss_clip": 0.01305872, + "auxiliary_loss_mlp": 0.0108944, + "balance_loss_clip": 1.07701421, + "balance_loss_mlp": 1.04917121, + "epoch": 0.025612505636554938, + "flos": 24243186280320.0, + "grad_norm": 2.3836350892673766, + "language_loss": 0.85435855, + "learning_rate": 3.898163992988186e-06, + "loss": 0.87831163, + "num_input_tokens_seen": 8917890, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.40258789, + "step": 426, + "time_per_iteration": 2.492288112640381 + }, + { + "auxiliary_loss_clip": 0.01165921, + "auxiliary_loss_mlp": 0.01027563, + "balance_loss_clip": 1.05338895, + "balance_loss_mlp": 1.01532078, + "epoch": 0.025672628889222907, + "flos": 60586941265920.0, + "grad_norm": 0.9077141307569722, + "language_loss": 0.57221383, + "learning_rate": 3.899673611929491e-06, + "loss": 0.59414864, + "num_input_tokens_seen": 8978260, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.12243652, + "step": 427, + "time_per_iteration": 3.1407344341278076 + }, + { + "auxiliary_loss_clip": 0.01297231, + "auxiliary_loss_mlp": 0.01098784, + "balance_loss_clip": 1.08135724, + "balance_loss_mlp": 1.06116104, + "epoch": 0.025732752141890875, + "flos": 19573255169280.0, + "grad_norm": 2.2300720897431976, + "language_loss": 0.88387978, + "learning_rate": 3.901179699595194e-06, + "loss": 0.90783989, + "num_input_tokens_seen": 8994460, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.37646484, + "step": 428, + "time_per_iteration": 2.4525833129882812 + }, + { + "auxiliary_loss_clip": 0.01291754, + "auxiliary_loss_mlp": 0.01081989, + "balance_loss_clip": 1.0756284, + "balance_loss_mlp": 1.04157686, + "epoch": 0.025792875394558847, + "flos": 31284262920960.0, + "grad_norm": 1.6330565140265256, + "language_loss": 0.85840201, + "learning_rate": 3.902682272467353e-06, + "loss": 0.8821395, + "num_input_tokens_seen": 9016670, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.40429688, + "step": 429, + "time_per_iteration": 2.496180772781372 + }, + { + "auxiliary_loss_clip": 0.01295366, + "auxiliary_loss_mlp": 0.01079952, + "balance_loss_clip": 1.07343149, + "balance_loss_mlp": 1.04061317, + "epoch": 0.025852998647226816, + "flos": 32379610210560.0, + "grad_norm": 2.229468120452581, + "language_loss": 0.88142854, + "learning_rate": 3.904181346912895e-06, + "loss": 0.90518177, + "num_input_tokens_seen": 9039720, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.39355469, + "step": 430, + "time_per_iteration": 2.561286449432373 + }, + { + "auxiliary_loss_clip": 0.01298694, + "auxiliary_loss_mlp": 0.01087951, + "balance_loss_clip": 1.08137667, + "balance_loss_mlp": 1.05123448, + "epoch": 0.025913121899894784, + "flos": 20193288762240.0, + "grad_norm": 1.9984806426366806, + "language_loss": 0.84316611, + "learning_rate": 3.905676939184698e-06, + "loss": 0.86703253, + "num_input_tokens_seen": 9059850, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.36743164, + "step": 431, + "time_per_iteration": 2.422456741333008 + }, + { + "auxiliary_loss_clip": 0.01293652, + "auxiliary_loss_mlp": 0.01083971, + "balance_loss_clip": 1.07562137, + "balance_loss_mlp": 1.04520416, + "epoch": 0.025973245152562753, + "flos": 14720430983040.0, + "grad_norm": 3.037285362402744, + "language_loss": 0.86668706, + "learning_rate": 3.907169065422638e-06, + "loss": 0.89046335, + "num_input_tokens_seen": 9077590, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.38793945, + "step": 432, + "time_per_iteration": 2.494534492492676 + }, + { + "auxiliary_loss_clip": 0.01298341, + "auxiliary_loss_mlp": 0.01075031, + "balance_loss_clip": 1.07751465, + "balance_loss_mlp": 1.03895807, + "epoch": 0.02603336840523072, + "flos": 30992991534720.0, + "grad_norm": 1.9649319933627198, + "language_loss": 0.76112872, + "learning_rate": 3.908657741654636e-06, + "loss": 0.78486252, + "num_input_tokens_seen": 9099880, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.36108398, + "step": 433, + "time_per_iteration": 2.5023915767669678 + }, + { + "auxiliary_loss_clip": 0.0129546, + "auxiliary_loss_mlp": 0.01100298, + "balance_loss_clip": 1.07464004, + "balance_loss_mlp": 1.05986202, + "epoch": 0.026093491657898694, + "flos": 17674262939520.0, + "grad_norm": 2.577976283822976, + "language_loss": 0.89524734, + "learning_rate": 3.910142983797699e-06, + "loss": 0.91920495, + "num_input_tokens_seen": 9118620, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.40429688, + "step": 434, + "time_per_iteration": 2.4474563598632812 + }, + { + "auxiliary_loss_clip": 0.01295285, + "auxiliary_loss_mlp": 0.0110638, + "balance_loss_clip": 1.0784291, + "balance_loss_mlp": 1.06687427, + "epoch": 0.026153614910566662, + "flos": 17857874286720.0, + "grad_norm": 2.3941633147558474, + "language_loss": 0.80119705, + "learning_rate": 3.9116248076589305e-06, + "loss": 0.82521367, + "num_input_tokens_seen": 9135655, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.39501953, + "step": 435, + "time_per_iteration": 2.411217451095581 + }, + { + "auxiliary_loss_clip": 0.01290905, + "auxiliary_loss_mlp": 0.0109432, + "balance_loss_clip": 1.07307029, + "balance_loss_mlp": 1.05352616, + "epoch": 0.02621373816323463, + "flos": 20011113959040.0, + "grad_norm": 2.2364166279405975, + "language_loss": 0.86660838, + "learning_rate": 3.913103228936546e-06, + "loss": 0.89046067, + "num_input_tokens_seen": 9153520, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.40820312, + "step": 436, + "time_per_iteration": 2.4623053073883057 + }, + { + "auxiliary_loss_clip": 0.01296446, + "auxiliary_loss_mlp": 0.01094496, + "balance_loss_clip": 1.07844639, + "balance_loss_mlp": 1.05651569, + "epoch": 0.0262738614159026, + "flos": 19281193683840.0, + "grad_norm": 2.377136996092075, + "language_loss": 0.75014019, + "learning_rate": 3.914578263220868e-06, + "loss": 0.77404964, + "num_input_tokens_seen": 9170750, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.37988281, + "step": 437, + "time_per_iteration": 2.4286575317382812 + }, + { + "auxiliary_loss_clip": 0.01294691, + "auxiliary_loss_mlp": 0.01096131, + "balance_loss_clip": 1.07879364, + "balance_loss_mlp": 1.05588543, + "epoch": 0.026333984668570568, + "flos": 18807208790400.0, + "grad_norm": 2.6042344595326847, + "language_loss": 0.91486049, + "learning_rate": 3.916049925995316e-06, + "loss": 0.93876868, + "num_input_tokens_seen": 9188430, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.40234375, + "step": 438, + "time_per_iteration": 2.4636194705963135 + }, + { + "auxiliary_loss_clip": 0.01170349, + "auxiliary_loss_mlp": 0.01035559, + "balance_loss_clip": 1.06319642, + "balance_loss_mlp": 1.02469862, + "epoch": 0.02639410792123854, + "flos": 64572020691840.0, + "grad_norm": 0.8867159087912537, + "language_loss": 0.62617898, + "learning_rate": 3.917518232637377e-06, + "loss": 0.64823806, + "num_input_tokens_seen": 9255835, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.10876465, + "step": 439, + "time_per_iteration": 3.143786668777466 + }, + { + "auxiliary_loss_clip": 0.0130235, + "auxiliary_loss_mlp": 0.01098549, + "balance_loss_clip": 1.08154726, + "balance_loss_mlp": 1.05956721, + "epoch": 0.02645423117390651, + "flos": 28473462921600.0, + "grad_norm": 1.8049182977558054, + "language_loss": 0.75999463, + "learning_rate": 3.918983198419573e-06, + "loss": 0.78400362, + "num_input_tokens_seen": 9276835, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.38964844, + "step": 440, + "time_per_iteration": 2.5376524925231934 + }, + { + "auxiliary_loss_clip": 0.01291405, + "auxiliary_loss_mlp": 0.01079987, + "balance_loss_clip": 1.07619476, + "balance_loss_mlp": 1.04179215, + "epoch": 0.026514354426574478, + "flos": 18551237495040.0, + "grad_norm": 2.621708435200459, + "language_loss": 0.83087462, + "learning_rate": 3.920444838510415e-06, + "loss": 0.85458851, + "num_input_tokens_seen": 9295075, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.38183594, + "step": 441, + "time_per_iteration": 2.4154410362243652 + }, + { + "auxiliary_loss_clip": 0.0129375, + "auxiliary_loss_mlp": 0.0109046, + "balance_loss_clip": 1.07365465, + "balance_loss_mlp": 1.05081105, + "epoch": 0.026574477679242446, + "flos": 20667812359680.0, + "grad_norm": 1.97555815816537, + "language_loss": 0.78515708, + "learning_rate": 3.92190316797534e-06, + "loss": 0.80899918, + "num_input_tokens_seen": 9314205, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.39672852, + "step": 442, + "time_per_iteration": 2.445411443710327 + }, + { + "auxiliary_loss_clip": 0.01146822, + "auxiliary_loss_mlp": 0.0101397, + "balance_loss_clip": 1.04120743, + "balance_loss_mlp": 1.00429082, + "epoch": 0.026634600931910415, + "flos": 57956125340160.0, + "grad_norm": 0.9678976149356853, + "language_loss": 0.64520639, + "learning_rate": 3.92335820177765e-06, + "loss": 0.66681433, + "num_input_tokens_seen": 9367395, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.09667969, + "step": 443, + "time_per_iteration": 3.0047121047973633 + }, + { + "auxiliary_loss_clip": 0.01293809, + "auxiliary_loss_mlp": 0.01090468, + "balance_loss_clip": 1.0778048, + "balance_loss_mlp": 1.0531081, + "epoch": 0.026694724184578387, + "flos": 15815131827840.0, + "grad_norm": 2.261511999933597, + "language_loss": 0.82571882, + "learning_rate": 3.924809954779425e-06, + "loss": 0.84956157, + "num_input_tokens_seen": 9385185, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.37353516, + "step": 444, + "time_per_iteration": 2.442917823791504 + }, + { + "auxiliary_loss_clip": 0.01294549, + "auxiliary_loss_mlp": 0.01091817, + "balance_loss_clip": 1.07407784, + "balance_loss_mlp": 1.05090451, + "epoch": 0.026754847437246355, + "flos": 23440259612160.0, + "grad_norm": 2.3085115314299323, + "language_loss": 0.95446205, + "learning_rate": 3.9262584417424425e-06, + "loss": 0.97832572, + "num_input_tokens_seen": 9403225, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.40917969, + "step": 445, + "time_per_iteration": 2.471008062362671 + }, + { + "auxiliary_loss_clip": 0.01292611, + "auxiliary_loss_mlp": 0.01098302, + "balance_loss_clip": 1.07577455, + "balance_loss_mlp": 1.05810499, + "epoch": 0.026814970689914324, + "flos": 17341801632000.0, + "grad_norm": 2.621079958999583, + "language_loss": 0.91982335, + "learning_rate": 3.9277036773290725e-06, + "loss": 0.9437325, + "num_input_tokens_seen": 9420540, + "router_z_loss_clip": 2.17089844, + "router_z_loss_mlp": 0.40185547, + "step": 446, + "time_per_iteration": 2.4148380756378174 + }, + { + "auxiliary_loss_clip": 0.01291776, + "auxiliary_loss_mlp": 0.01093844, + "balance_loss_clip": 1.07787824, + "balance_loss_mlp": 1.05572081, + "epoch": 0.026875093942582293, + "flos": 17894718662400.0, + "grad_norm": 2.1893263166647676, + "language_loss": 0.80074483, + "learning_rate": 3.92914567610317e-06, + "loss": 0.82460105, + "num_input_tokens_seen": 9438840, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.38110352, + "step": 447, + "time_per_iteration": 2.485360622406006 + }, + { + "auxiliary_loss_clip": 0.01288334, + "auxiliary_loss_mlp": 0.01081668, + "balance_loss_clip": 1.07413268, + "balance_loss_mlp": 1.04638243, + "epoch": 0.026935217195250265, + "flos": 21723980889600.0, + "grad_norm": 2.346181337175862, + "language_loss": 0.86496878, + "learning_rate": 3.930584452530952e-06, + "loss": 0.88866878, + "num_input_tokens_seen": 9457215, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.3527832, + "step": 448, + "time_per_iteration": 2.476806879043579 + }, + { + "auxiliary_loss_clip": 0.01283109, + "auxiliary_loss_mlp": 0.01102481, + "balance_loss_clip": 1.07232201, + "balance_loss_mlp": 1.06750536, + "epoch": 0.026995340447918233, + "flos": 23622685810560.0, + "grad_norm": 1.9295101808145052, + "language_loss": 0.88844144, + "learning_rate": 3.9320200209818755e-06, + "loss": 0.91229737, + "num_input_tokens_seen": 9475615, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.35009766, + "step": 449, + "time_per_iteration": 2.491192579269409 + }, + { + "auxiliary_loss_clip": 0.01293678, + "auxiliary_loss_mlp": 0.01098216, + "balance_loss_clip": 1.0741899, + "balance_loss_mlp": 1.05925846, + "epoch": 0.027055463700586202, + "flos": 17931275729280.0, + "grad_norm": 1.8753231982112508, + "language_loss": 0.80430782, + "learning_rate": 3.933452395729493e-06, + "loss": 0.82822675, + "num_input_tokens_seen": 9493975, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.3894043, + "step": 450, + "time_per_iteration": 2.482844352722168 + }, + { + "auxiliary_loss_clip": 0.01288243, + "auxiliary_loss_mlp": 0.01085661, + "balance_loss_clip": 1.07597458, + "balance_loss_mlp": 1.04689443, + "epoch": 0.02711558695325417, + "flos": 25118903859840.0, + "grad_norm": 1.6372195602276163, + "language_loss": 0.81647319, + "learning_rate": 3.934881590952304e-06, + "loss": 0.84021223, + "num_input_tokens_seen": 9514810, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.38769531, + "step": 451, + "time_per_iteration": 2.480602979660034 + }, + { + "auxiliary_loss_clip": 0.01284437, + "auxiliary_loss_mlp": 0.01097035, + "balance_loss_clip": 1.07421184, + "balance_loss_mlp": 1.05757689, + "epoch": 0.02717571020592214, + "flos": 24239559006720.0, + "grad_norm": 2.1854204460159314, + "language_loss": 0.76889098, + "learning_rate": 3.936307620734599e-06, + "loss": 0.79270571, + "num_input_tokens_seen": 9533635, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.39428711, + "step": 452, + "time_per_iteration": 2.475796699523926 + }, + { + "auxiliary_loss_clip": 0.01282484, + "auxiliary_loss_mlp": 0.01091182, + "balance_loss_clip": 1.07248545, + "balance_loss_mlp": 1.05408418, + "epoch": 0.02723583345859011, + "flos": 25118939773440.0, + "grad_norm": 1.8465331107832053, + "language_loss": 0.72930896, + "learning_rate": 3.937730499067294e-06, + "loss": 0.75304556, + "num_input_tokens_seen": 9555420, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.37084961, + "step": 453, + "time_per_iteration": 3.9012019634246826 + }, + { + "auxiliary_loss_clip": 0.01281064, + "auxiliary_loss_mlp": 0.01093977, + "balance_loss_clip": 1.07139397, + "balance_loss_mlp": 1.05394626, + "epoch": 0.02729595671125808, + "flos": 42741597847680.0, + "grad_norm": 17.782461277694914, + "language_loss": 0.82268298, + "learning_rate": 3.939150239848748e-06, + "loss": 0.8464334, + "num_input_tokens_seen": 9578950, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.40039062, + "step": 454, + "time_per_iteration": 2.6243810653686523 + }, + { + "auxiliary_loss_clip": 0.01285262, + "auxiliary_loss_mlp": 0.01076342, + "balance_loss_clip": 1.07340622, + "balance_loss_mlp": 1.04096115, + "epoch": 0.02735607996392605, + "flos": 21430985650560.0, + "grad_norm": 1.998548251246117, + "language_loss": 0.75514078, + "learning_rate": 3.9405668568855866e-06, + "loss": 0.77875686, + "num_input_tokens_seen": 9598160, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.35400391, + "step": 455, + "time_per_iteration": 5.342177152633667 + }, + { + "auxiliary_loss_clip": 0.01285442, + "auxiliary_loss_mlp": 0.01092914, + "balance_loss_clip": 1.07025754, + "balance_loss_mlp": 1.05679345, + "epoch": 0.027416203216594017, + "flos": 20851280052480.0, + "grad_norm": 2.0038022736448196, + "language_loss": 0.80692637, + "learning_rate": 3.941980363893499e-06, + "loss": 0.83070993, + "num_input_tokens_seen": 9616010, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.36132812, + "step": 456, + "time_per_iteration": 2.4388248920440674 + }, + { + "auxiliary_loss_clip": 0.01280556, + "auxiliary_loss_mlp": 0.01080371, + "balance_loss_clip": 1.07035482, + "balance_loss_mlp": 1.0422951, + "epoch": 0.027476326469261986, + "flos": 13224500242560.0, + "grad_norm": 1.945802310348953, + "language_loss": 0.81676841, + "learning_rate": 3.9433907744980384e-06, + "loss": 0.84037769, + "num_input_tokens_seen": 9634000, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.38110352, + "step": 457, + "time_per_iteration": 2.390629529953003 + }, + { + "auxiliary_loss_clip": 0.01284901, + "auxiliary_loss_mlp": 0.01085677, + "balance_loss_clip": 1.06928194, + "balance_loss_mlp": 1.04886532, + "epoch": 0.027536449721929958, + "flos": 24024526237440.0, + "grad_norm": 1.9370529985645188, + "language_loss": 0.93704379, + "learning_rate": 3.944798102235412e-06, + "loss": 0.96074957, + "num_input_tokens_seen": 9653455, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.36791992, + "step": 458, + "time_per_iteration": 2.5223124027252197 + }, + { + "auxiliary_loss_clip": 0.01282815, + "auxiliary_loss_mlp": 0.01097534, + "balance_loss_clip": 1.07037401, + "balance_loss_mlp": 1.06158018, + "epoch": 0.027596572974597926, + "flos": 13006055681280.0, + "grad_norm": 3.398719528474248, + "language_loss": 0.79093742, + "learning_rate": 3.9462023605532545e-06, + "loss": 0.81474096, + "num_input_tokens_seen": 9669650, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.35961914, + "step": 459, + "time_per_iteration": 2.388437509536743 + }, + { + "auxiliary_loss_clip": 0.01288421, + "auxiliary_loss_mlp": 0.01090598, + "balance_loss_clip": 1.07740021, + "balance_loss_mlp": 1.04730082, + "epoch": 0.027656696227265895, + "flos": 26143076350080.0, + "grad_norm": 1.7433817812882613, + "language_loss": 0.83594984, + "learning_rate": 3.947603562811407e-06, + "loss": 0.85974002, + "num_input_tokens_seen": 9691415, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.43359375, + "step": 460, + "time_per_iteration": 2.5000979900360107 + }, + { + "auxiliary_loss_clip": 0.01141517, + "auxiliary_loss_mlp": 0.01042158, + "balance_loss_clip": 1.03615284, + "balance_loss_mlp": 1.0333246, + "epoch": 0.027716819479933864, + "flos": 60697222997760.0, + "grad_norm": 1.5801113027757487, + "language_loss": 0.73548853, + "learning_rate": 3.949001722282675e-06, + "loss": 0.75732523, + "num_input_tokens_seen": 9755605, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.08850098, + "step": 461, + "time_per_iteration": 3.0854063034057617 + }, + { + "auxiliary_loss_clip": 0.01286507, + "auxiliary_loss_mlp": 0.01084825, + "balance_loss_clip": 1.08030772, + "balance_loss_mlp": 1.04989648, + "epoch": 0.027776942732601832, + "flos": 31211938886400.0, + "grad_norm": 2.4448953495516217, + "language_loss": 0.81447262, + "learning_rate": 3.950396852153582e-06, + "loss": 0.83818591, + "num_input_tokens_seen": 9776270, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.34936523, + "step": 462, + "time_per_iteration": 2.968337297439575 + }, + { + "auxiliary_loss_clip": 0.01282833, + "auxiliary_loss_mlp": 0.01077778, + "balance_loss_clip": 1.07382977, + "balance_loss_mlp": 1.04342175, + "epoch": 0.027837065985269804, + "flos": 22674644196480.0, + "grad_norm": 2.3665701041462013, + "language_loss": 0.90264022, + "learning_rate": 3.951788965525118e-06, + "loss": 0.92624629, + "num_input_tokens_seen": 9794465, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.34375, + "step": 463, + "time_per_iteration": 2.47147274017334 + }, + { + "auxiliary_loss_clip": 0.0114238, + "auxiliary_loss_mlp": 0.01013888, + "balance_loss_clip": 1.03564405, + "balance_loss_mlp": 1.00454164, + "epoch": 0.027897189237937773, + "flos": 62182487399040.0, + "grad_norm": 0.8829860139928805, + "language_loss": 0.59047472, + "learning_rate": 3.953178075413476e-06, + "loss": 0.61203742, + "num_input_tokens_seen": 9849685, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.09338379, + "step": 464, + "time_per_iteration": 3.0251224040985107 + }, + { + "auxiliary_loss_clip": 0.01298515, + "auxiliary_loss_mlp": 0.01104551, + "balance_loss_clip": 1.08231938, + "balance_loss_mlp": 1.06709504, + "epoch": 0.02795731249060574, + "flos": 24493160004480.0, + "grad_norm": 2.2108563600203066, + "language_loss": 0.8126173, + "learning_rate": 3.954564194750784e-06, + "loss": 0.83664793, + "num_input_tokens_seen": 9869505, + "router_z_loss_clip": 2.16308594, + "router_z_loss_mlp": 0.37451172, + "step": 465, + "time_per_iteration": 2.4965875148773193 + }, + { + "auxiliary_loss_clip": 0.01291607, + "auxiliary_loss_mlp": 0.01093492, + "balance_loss_clip": 1.07975507, + "balance_loss_mlp": 1.05651331, + "epoch": 0.02801743574327371, + "flos": 23733003456000.0, + "grad_norm": 2.0112537245569757, + "language_loss": 0.78446549, + "learning_rate": 3.955947336385828e-06, + "loss": 0.80831647, + "num_input_tokens_seen": 9890950, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.36938477, + "step": 466, + "time_per_iteration": 2.5239717960357666 + }, + { + "auxiliary_loss_clip": 0.01287414, + "auxiliary_loss_mlp": 0.01085903, + "balance_loss_clip": 1.07741535, + "balance_loss_mlp": 1.04913855, + "epoch": 0.02807755899594168, + "flos": 20629100476800.0, + "grad_norm": 1.8339816770065696, + "language_loss": 0.87665969, + "learning_rate": 3.957327513084761e-06, + "loss": 0.90039289, + "num_input_tokens_seen": 9911265, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.36767578, + "step": 467, + "time_per_iteration": 2.5039546489715576 + }, + { + "auxiliary_loss_clip": 0.01285817, + "auxiliary_loss_mlp": 0.01095128, + "balance_loss_clip": 1.07556677, + "balance_loss_mlp": 1.057482, + "epoch": 0.02813768224860965, + "flos": 19244564789760.0, + "grad_norm": 2.358091789103757, + "language_loss": 0.85992908, + "learning_rate": 3.958704737531818e-06, + "loss": 0.88373852, + "num_input_tokens_seen": 9929025, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.37646484, + "step": 468, + "time_per_iteration": 2.4190313816070557 + }, + { + "auxiliary_loss_clip": 0.0128352, + "auxiliary_loss_mlp": 0.01083789, + "balance_loss_clip": 1.07334757, + "balance_loss_mlp": 1.04468822, + "epoch": 0.02819780550127762, + "flos": 20813968800000.0, + "grad_norm": 2.1802193449335503, + "language_loss": 0.9175598, + "learning_rate": 3.9600790223300065e-06, + "loss": 0.94123292, + "num_input_tokens_seen": 9945190, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.39086914, + "step": 469, + "time_per_iteration": 2.480530023574829 + }, + { + "auxiliary_loss_clip": 0.0128049, + "auxiliary_loss_mlp": 0.01095788, + "balance_loss_clip": 1.07367849, + "balance_loss_mlp": 1.05578089, + "epoch": 0.028257928753945588, + "flos": 19974125928960.0, + "grad_norm": 2.2802135508659154, + "language_loss": 0.81654578, + "learning_rate": 3.96145038000181e-06, + "loss": 0.84030855, + "num_input_tokens_seen": 9962820, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.39990234, + "step": 470, + "time_per_iteration": 2.4312193393707275 + }, + { + "auxiliary_loss_clip": 0.0128353, + "auxiliary_loss_mlp": 0.01086895, + "balance_loss_clip": 1.07142067, + "balance_loss_mlp": 1.04841459, + "epoch": 0.028318052006613557, + "flos": 20484488321280.0, + "grad_norm": 1.7498571337304236, + "language_loss": 0.93289012, + "learning_rate": 3.962818822989861e-06, + "loss": 0.95659435, + "num_input_tokens_seen": 9982595, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.38500977, + "step": 471, + "time_per_iteration": 2.4806149005889893 + }, + { + "auxiliary_loss_clip": 0.01278114, + "auxiliary_loss_mlp": 0.01089787, + "balance_loss_clip": 1.0716995, + "balance_loss_mlp": 1.05264151, + "epoch": 0.02837817525928153, + "flos": 28514832410880.0, + "grad_norm": 2.006273026361256, + "language_loss": 0.76097095, + "learning_rate": 3.964184363657625e-06, + "loss": 0.78464997, + "num_input_tokens_seen": 10004645, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.37133789, + "step": 472, + "time_per_iteration": 2.5532948970794678 + }, + { + "auxiliary_loss_clip": 0.01284283, + "auxiliary_loss_mlp": 0.01076861, + "balance_loss_clip": 1.07042086, + "balance_loss_mlp": 1.04116917, + "epoch": 0.028438298511949497, + "flos": 18551668458240.0, + "grad_norm": 2.1722074721825235, + "language_loss": 0.93441331, + "learning_rate": 3.965547014290071e-06, + "loss": 0.9580248, + "num_input_tokens_seen": 10022555, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.35668945, + "step": 473, + "time_per_iteration": 2.4855117797851562 + }, + { + "auxiliary_loss_clip": 0.01289268, + "auxiliary_loss_mlp": 0.01100013, + "balance_loss_clip": 1.07429969, + "balance_loss_mlp": 1.06463158, + "epoch": 0.028498421764617466, + "flos": 16910227722240.0, + "grad_norm": 2.581106792760042, + "language_loss": 0.88635427, + "learning_rate": 3.96690678709433e-06, + "loss": 0.91024709, + "num_input_tokens_seen": 10041025, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.35400391, + "step": 474, + "time_per_iteration": 2.4098196029663086 + }, + { + "auxiliary_loss_clip": 0.0128774, + "auxiliary_loss_mlp": 0.01091869, + "balance_loss_clip": 1.0779525, + "balance_loss_mlp": 1.05479479, + "epoch": 0.028558545017285435, + "flos": 27778699082880.0, + "grad_norm": 2.0297264710231944, + "language_loss": 0.7889353, + "learning_rate": 3.968263694200355e-06, + "loss": 0.81273139, + "num_input_tokens_seen": 10060775, + "router_z_loss_clip": 2.09863281, + "router_z_loss_mlp": 0.37084961, + "step": 475, + "time_per_iteration": 2.5314676761627197 + }, + { + "auxiliary_loss_clip": 0.01143496, + "auxiliary_loss_mlp": 0.01017295, + "balance_loss_clip": 1.03931844, + "balance_loss_mlp": 1.00743604, + "epoch": 0.028618668269953403, + "flos": 65654367258240.0, + "grad_norm": 1.0474760700978412, + "language_loss": 0.66987079, + "learning_rate": 3.969617747661569e-06, + "loss": 0.69147873, + "num_input_tokens_seen": 10120225, + "router_z_loss_clip": 1.04101562, + "router_z_loss_mlp": 0.09851074, + "step": 476, + "time_per_iteration": 2.9891014099121094 + }, + { + "auxiliary_loss_clip": 0.01287818, + "auxiliary_loss_mlp": 0.0108695, + "balance_loss_clip": 1.07720304, + "balance_loss_mlp": 1.04968548, + "epoch": 0.028678791522621375, + "flos": 21937074324480.0, + "grad_norm": 2.3309485252929463, + "language_loss": 0.84238279, + "learning_rate": 3.970968959455509e-06, + "loss": 0.86613047, + "num_input_tokens_seen": 10137880, + "router_z_loss_clip": 2.10449219, + "router_z_loss_mlp": 0.37280273, + "step": 477, + "time_per_iteration": 2.4572043418884277 + }, + { + "auxiliary_loss_clip": 0.01289654, + "auxiliary_loss_mlp": 0.01096584, + "balance_loss_clip": 1.07704926, + "balance_loss_mlp": 1.0588429, + "epoch": 0.028738914775289344, + "flos": 24572128055040.0, + "grad_norm": 2.1340278133151487, + "language_loss": 0.82350528, + "learning_rate": 3.97231734148446e-06, + "loss": 0.84736764, + "num_input_tokens_seen": 10156930, + "router_z_loss_clip": 2.12988281, + "router_z_loss_mlp": 0.37768555, + "step": 478, + "time_per_iteration": 2.475874185562134 + }, + { + "auxiliary_loss_clip": 0.01284237, + "auxiliary_loss_mlp": 0.01082088, + "balance_loss_clip": 1.07406688, + "balance_loss_mlp": 1.04506135, + "epoch": 0.028799038027957313, + "flos": 23257977068160.0, + "grad_norm": 2.1829607239936295, + "language_loss": 0.81174332, + "learning_rate": 3.973662905576082e-06, + "loss": 0.83540654, + "num_input_tokens_seen": 10176295, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.37036133, + "step": 479, + "time_per_iteration": 2.4648618698120117 + }, + { + "auxiliary_loss_clip": 0.01276263, + "auxiliary_loss_mlp": 0.01083609, + "balance_loss_clip": 1.0696367, + "balance_loss_mlp": 1.0443182, + "epoch": 0.02885916128062528, + "flos": 22164102236160.0, + "grad_norm": 2.877526633678373, + "language_loss": 0.73626256, + "learning_rate": 3.975005663484038e-06, + "loss": 0.75986123, + "num_input_tokens_seen": 10195790, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.39282227, + "step": 480, + "time_per_iteration": 2.4364736080169678 + }, + { + "auxiliary_loss_clip": 0.01280254, + "auxiliary_loss_mlp": 0.01075247, + "balance_loss_clip": 1.0739491, + "balance_loss_mlp": 1.04110599, + "epoch": 0.02891928453329325, + "flos": 22932842135040.0, + "grad_norm": 1.8255077996391835, + "language_loss": 0.88067496, + "learning_rate": 3.976345626888605e-06, + "loss": 0.90423, + "num_input_tokens_seen": 10218405, + "router_z_loss_clip": 2.06152344, + "router_z_loss_mlp": 0.34179688, + "step": 481, + "time_per_iteration": 2.527010202407837 + }, + { + "auxiliary_loss_clip": 0.01137583, + "auxiliary_loss_mlp": 0.01019161, + "balance_loss_clip": 1.03368855, + "balance_loss_mlp": 1.00988698, + "epoch": 0.028979407785961222, + "flos": 57432941792640.0, + "grad_norm": 0.8263172746158538, + "language_loss": 0.66073167, + "learning_rate": 3.9776828073972864e-06, + "loss": 0.68229914, + "num_input_tokens_seen": 10271005, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.09265137, + "step": 482, + "time_per_iteration": 2.8079748153686523 + }, + { + "auxiliary_loss_clip": 0.01291542, + "auxiliary_loss_mlp": 0.01077664, + "balance_loss_clip": 1.07564425, + "balance_loss_mlp": 1.04221153, + "epoch": 0.02903953103862919, + "flos": 16722737706240.0, + "grad_norm": 2.6451074527561995, + "language_loss": 0.79430962, + "learning_rate": 3.979017216545415e-06, + "loss": 0.81800163, + "num_input_tokens_seen": 10288405, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.35449219, + "step": 483, + "time_per_iteration": 2.4554944038391113 + }, + { + "auxiliary_loss_clip": 0.01284196, + "auxiliary_loss_mlp": 0.01097003, + "balance_loss_clip": 1.07439053, + "balance_loss_mlp": 1.0602628, + "epoch": 0.02909965429129716, + "flos": 16763640318720.0, + "grad_norm": 2.1079131516209952, + "language_loss": 0.75911808, + "learning_rate": 3.980348865796749e-06, + "loss": 0.78293008, + "num_input_tokens_seen": 10306875, + "router_z_loss_clip": 2.09667969, + "router_z_loss_mlp": 0.36743164, + "step": 484, + "time_per_iteration": 2.4140126705169678 + }, + { + "auxiliary_loss_clip": 0.01277998, + "auxiliary_loss_mlp": 0.01081678, + "balance_loss_clip": 1.07105887, + "balance_loss_mlp": 1.04686856, + "epoch": 0.029159777543965128, + "flos": 19785343023360.0, + "grad_norm": 2.1066506355617136, + "language_loss": 0.83745003, + "learning_rate": 3.9816777665440615e-06, + "loss": 0.86104679, + "num_input_tokens_seen": 10323965, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.34790039, + "step": 485, + "time_per_iteration": 2.5206453800201416 + }, + { + "auxiliary_loss_clip": 0.01290231, + "auxiliary_loss_mlp": 0.01089926, + "balance_loss_clip": 1.07982314, + "balance_loss_mlp": 1.05373406, + "epoch": 0.029219900796633096, + "flos": 19642670202240.0, + "grad_norm": 2.4669993236485115, + "language_loss": 0.8447051, + "learning_rate": 3.983003930109732e-06, + "loss": 0.86850667, + "num_input_tokens_seen": 10342620, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.36206055, + "step": 486, + "time_per_iteration": 2.4329185485839844 + }, + { + "auxiliary_loss_clip": 0.01282425, + "auxiliary_loss_mlp": 0.01099175, + "balance_loss_clip": 1.07378435, + "balance_loss_mlp": 1.06222022, + "epoch": 0.02928002404930107, + "flos": 25885704424320.0, + "grad_norm": 3.8814123682813055, + "language_loss": 0.89174354, + "learning_rate": 3.984327367746315e-06, + "loss": 0.91555953, + "num_input_tokens_seen": 10364610, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.36987305, + "step": 487, + "time_per_iteration": 2.5205633640289307 + }, + { + "auxiliary_loss_clip": 0.01284701, + "auxiliary_loss_mlp": 0.01071222, + "balance_loss_clip": 1.076406, + "balance_loss_mlp": 1.03660321, + "epoch": 0.029340147301969037, + "flos": 20660234590080.0, + "grad_norm": 3.5499114629060853, + "language_loss": 0.88262439, + "learning_rate": 3.985648090637122e-06, + "loss": 0.9061836, + "num_input_tokens_seen": 10380910, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.34619141, + "step": 488, + "time_per_iteration": 2.4109089374542236 + }, + { + "auxiliary_loss_clip": 0.01277586, + "auxiliary_loss_mlp": 0.01085951, + "balance_loss_clip": 1.07180595, + "balance_loss_mlp": 1.04987872, + "epoch": 0.029400270554637006, + "flos": 24428018689920.0, + "grad_norm": 4.608310261380659, + "language_loss": 0.89192301, + "learning_rate": 3.986966109896785e-06, + "loss": 0.91555834, + "num_input_tokens_seen": 10400665, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.36083984, + "step": 489, + "time_per_iteration": 2.502392292022705 + }, + { + "auxiliary_loss_clip": 0.01272285, + "auxiliary_loss_mlp": 0.01078003, + "balance_loss_clip": 1.06733322, + "balance_loss_mlp": 1.04128671, + "epoch": 0.029460393807304974, + "flos": 20120892900480.0, + "grad_norm": 1.9597296167544331, + "language_loss": 0.8848328, + "learning_rate": 3.988281436571815e-06, + "loss": 0.90833575, + "num_input_tokens_seen": 10420150, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.3671875, + "step": 490, + "time_per_iteration": 2.443208932876587 + }, + { + "auxiliary_loss_clip": 0.01286029, + "auxiliary_loss_mlp": 0.01085293, + "balance_loss_clip": 1.07456934, + "balance_loss_mlp": 1.0503881, + "epoch": 0.029520517059972943, + "flos": 17675914965120.0, + "grad_norm": 3.7057040465337887, + "language_loss": 0.91345394, + "learning_rate": 3.989594081641164e-06, + "loss": 0.93716717, + "num_input_tokens_seen": 10438210, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.34912109, + "step": 491, + "time_per_iteration": 2.476398468017578 + }, + { + "auxiliary_loss_clip": 0.01273409, + "auxiliary_loss_mlp": 0.01076714, + "balance_loss_clip": 1.07205558, + "balance_loss_mlp": 1.04300201, + "epoch": 0.029580640312640915, + "flos": 18953185662720.0, + "grad_norm": 1.9722047811495544, + "language_loss": 0.85559106, + "learning_rate": 3.9909040560167675e-06, + "loss": 0.87909228, + "num_input_tokens_seen": 10455125, + "router_z_loss_clip": 2.01269531, + "router_z_loss_mlp": 0.33691406, + "step": 492, + "time_per_iteration": 2.4115254878997803 + }, + { + "auxiliary_loss_clip": 0.01282183, + "auxiliary_loss_mlp": 0.01099072, + "balance_loss_clip": 1.07582831, + "balance_loss_mlp": 1.06264138, + "epoch": 0.029640763565308884, + "flos": 18726121837440.0, + "grad_norm": 2.7926699815744054, + "language_loss": 0.84380507, + "learning_rate": 3.992211370544093e-06, + "loss": 0.86761761, + "num_input_tokens_seen": 10470990, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.36425781, + "step": 493, + "time_per_iteration": 2.4228672981262207 + }, + { + "auxiliary_loss_clip": 0.01274557, + "auxiliary_loss_mlp": 0.01075767, + "balance_loss_clip": 1.06870985, + "balance_loss_mlp": 1.04169738, + "epoch": 0.029700886817976852, + "flos": 20595308757120.0, + "grad_norm": 2.0437391346867884, + "language_loss": 0.86727989, + "learning_rate": 3.99351603600268e-06, + "loss": 0.89078307, + "num_input_tokens_seen": 10490685, + "router_z_loss_clip": 2.0546875, + "router_z_loss_mlp": 0.34033203, + "step": 494, + "time_per_iteration": 2.4273693561553955 + }, + { + "auxiliary_loss_clip": 0.01281915, + "auxiliary_loss_mlp": 0.01087797, + "balance_loss_clip": 1.07199025, + "balance_loss_mlp": 1.05205774, + "epoch": 0.02976101007064482, + "flos": 22236857233920.0, + "grad_norm": 2.1072959150577426, + "language_loss": 0.86942893, + "learning_rate": 3.994818063106668e-06, + "loss": 0.89312601, + "num_input_tokens_seen": 10509435, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.35717773, + "step": 495, + "time_per_iteration": 2.5249595642089844 + }, + { + "auxiliary_loss_clip": 0.0126975, + "auxiliary_loss_mlp": 0.01084565, + "balance_loss_clip": 1.06956232, + "balance_loss_mlp": 1.05061388, + "epoch": 0.029821133323312793, + "flos": 23732644320000.0, + "grad_norm": 1.9148385439716502, + "language_loss": 0.62221408, + "learning_rate": 3.99611746250533e-06, + "loss": 0.6457572, + "num_input_tokens_seen": 10530050, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.33984375, + "step": 496, + "time_per_iteration": 3.9166171550750732 + }, + { + "auxiliary_loss_clip": 0.01274123, + "auxiliary_loss_mlp": 0.01098046, + "balance_loss_clip": 1.07313538, + "balance_loss_mlp": 1.06156826, + "epoch": 0.02988125657598076, + "flos": 22419498913920.0, + "grad_norm": 2.738021684030482, + "language_loss": 0.89074314, + "learning_rate": 3.997414244783595e-06, + "loss": 0.91446483, + "num_input_tokens_seen": 10551370, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.36499023, + "step": 497, + "time_per_iteration": 3.9013938903808594 + }, + { + "auxiliary_loss_clip": 0.0128206, + "auxiliary_loss_mlp": 0.01082528, + "balance_loss_clip": 1.07537925, + "balance_loss_mlp": 1.04795718, + "epoch": 0.02994137982864873, + "flos": 13845108453120.0, + "grad_norm": 2.6904200399605327, + "language_loss": 0.8509779, + "learning_rate": 3.998708420462557e-06, + "loss": 0.87462378, + "num_input_tokens_seen": 10569225, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.34594727, + "step": 498, + "time_per_iteration": 3.8652923107147217 + }, + { + "auxiliary_loss_clip": 0.01275587, + "auxiliary_loss_mlp": 0.01082449, + "balance_loss_clip": 1.07208169, + "balance_loss_mlp": 1.04930878, + "epoch": 0.0300015030813167, + "flos": 23908354675200.0, + "grad_norm": 3.157660723674499, + "language_loss": 0.77953821, + "learning_rate": 4e-06, + "loss": 0.80311859, + "num_input_tokens_seen": 10586170, + "router_z_loss_clip": 2.03710938, + "router_z_loss_mlp": 0.33129883, + "step": 499, + "time_per_iteration": 2.431337356567383 + }, + { + "auxiliary_loss_clip": 0.01280913, + "auxiliary_loss_mlp": 0.01083477, + "balance_loss_clip": 1.0762955, + "balance_loss_mlp": 1.04900181, + "epoch": 0.030061626333984667, + "flos": 22016796560640.0, + "grad_norm": 1.8093470736063548, + "language_loss": 0.83024991, + "learning_rate": 3.9999999620799e-06, + "loss": 0.85389388, + "num_input_tokens_seen": 10606205, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.3449707, + "step": 500, + "time_per_iteration": 2.504876136779785 + }, + { + "auxiliary_loss_clip": 0.0126726, + "auxiliary_loss_mlp": 0.01086867, + "balance_loss_clip": 1.06734467, + "balance_loss_mlp": 1.05055547, + "epoch": 0.03012174958665264, + "flos": 23039747988480.0, + "grad_norm": 2.9368837938229366, + "language_loss": 0.88141608, + "learning_rate": 3.9999998483196e-06, + "loss": 0.90495735, + "num_input_tokens_seen": 10625995, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.36328125, + "step": 501, + "time_per_iteration": 2.5115973949432373 + }, + { + "auxiliary_loss_clip": 0.01278447, + "auxiliary_loss_mlp": 0.01078456, + "balance_loss_clip": 1.07082522, + "balance_loss_mlp": 1.04524434, + "epoch": 0.030181872839320608, + "flos": 18953257489920.0, + "grad_norm": 2.684453261384599, + "language_loss": 0.86474299, + "learning_rate": 3.9999996587191065e-06, + "loss": 0.88831204, + "num_input_tokens_seen": 10644105, + "router_z_loss_clip": 2.07324219, + "router_z_loss_mlp": 0.33203125, + "step": 502, + "time_per_iteration": 2.4233484268188477 + }, + { + "auxiliary_loss_clip": 0.0127246, + "auxiliary_loss_mlp": 0.0107672, + "balance_loss_clip": 1.07153738, + "balance_loss_mlp": 1.04269779, + "epoch": 0.030241996091988577, + "flos": 16728017005440.0, + "grad_norm": 2.6330766217183434, + "language_loss": 0.84226358, + "learning_rate": 3.999999393278425e-06, + "loss": 0.86575538, + "num_input_tokens_seen": 10661090, + "router_z_loss_clip": 2.00488281, + "router_z_loss_mlp": 0.34008789, + "step": 503, + "time_per_iteration": 2.42061710357666 + }, + { + "auxiliary_loss_clip": 0.01265653, + "auxiliary_loss_mlp": 0.01090681, + "balance_loss_clip": 1.06968307, + "balance_loss_mlp": 1.05672991, + "epoch": 0.030302119344656545, + "flos": 28621271387520.0, + "grad_norm": 1.6415034763778669, + "language_loss": 0.88139319, + "learning_rate": 3.999999051997567e-06, + "loss": 0.90495652, + "num_input_tokens_seen": 10682380, + "router_z_loss_clip": 1.95800781, + "router_z_loss_mlp": 0.33959961, + "step": 504, + "time_per_iteration": 2.5186946392059326 + }, + { + "auxiliary_loss_clip": 0.01271363, + "auxiliary_loss_mlp": 0.01095071, + "balance_loss_clip": 1.07041049, + "balance_loss_mlp": 1.06140614, + "epoch": 0.030362242597324514, + "flos": 15669334523520.0, + "grad_norm": 1.9502562535275592, + "language_loss": 0.78190601, + "learning_rate": 3.9999986348765425e-06, + "loss": 0.80557036, + "num_input_tokens_seen": 10699925, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.33666992, + "step": 505, + "time_per_iteration": 2.4602766036987305 + }, + { + "auxiliary_loss_clip": 0.01147097, + "auxiliary_loss_mlp": 0.01049806, + "balance_loss_clip": 1.03593826, + "balance_loss_mlp": 1.03944635, + "epoch": 0.030422365849992486, + "flos": 72125973676800.0, + "grad_norm": 0.9490348438043998, + "language_loss": 0.55054951, + "learning_rate": 3.999998141915371e-06, + "loss": 0.57251853, + "num_input_tokens_seen": 10766525, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.1036377, + "step": 506, + "time_per_iteration": 3.227980613708496 + }, + { + "auxiliary_loss_clip": 0.01270862, + "auxiliary_loss_mlp": 0.01082561, + "balance_loss_clip": 1.06786132, + "balance_loss_mlp": 1.04810989, + "epoch": 0.030482489102660455, + "flos": 19427817000960.0, + "grad_norm": 2.386378732616954, + "language_loss": 0.83568025, + "learning_rate": 3.999997573114069e-06, + "loss": 0.85921443, + "num_input_tokens_seen": 10786725, + "router_z_loss_clip": 2.02929688, + "router_z_loss_mlp": 0.34448242, + "step": 507, + "time_per_iteration": 2.509920120239258 + }, + { + "auxiliary_loss_clip": 0.01273029, + "auxiliary_loss_mlp": 0.01085248, + "balance_loss_clip": 1.06923079, + "balance_loss_mlp": 1.04862678, + "epoch": 0.030542612355328423, + "flos": 20375822701440.0, + "grad_norm": 2.245019790947023, + "language_loss": 0.8881551, + "learning_rate": 3.999996928472659e-06, + "loss": 0.91173792, + "num_input_tokens_seen": 10805390, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.36645508, + "step": 508, + "time_per_iteration": 2.5041348934173584 + }, + { + "auxiliary_loss_clip": 0.01279886, + "auxiliary_loss_mlp": 0.01068852, + "balance_loss_clip": 1.07299972, + "balance_loss_mlp": 1.03423381, + "epoch": 0.030602735607996392, + "flos": 34677354297600.0, + "grad_norm": 2.095945573608271, + "language_loss": 0.71362633, + "learning_rate": 3.999996207991165e-06, + "loss": 0.73711371, + "num_input_tokens_seen": 10828030, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.34619141, + "step": 509, + "time_per_iteration": 2.582265853881836 + }, + { + "auxiliary_loss_clip": 0.01271332, + "auxiliary_loss_mlp": 0.01070654, + "balance_loss_clip": 1.07158518, + "balance_loss_mlp": 1.03827691, + "epoch": 0.03066285886066436, + "flos": 23658668259840.0, + "grad_norm": 2.2303061678007317, + "language_loss": 0.82088125, + "learning_rate": 3.999995411669614e-06, + "loss": 0.8443011, + "num_input_tokens_seen": 10845240, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.32373047, + "step": 510, + "time_per_iteration": 2.4590160846710205 + }, + { + "auxiliary_loss_clip": 0.01271896, + "auxiliary_loss_mlp": 0.01094754, + "balance_loss_clip": 1.073071, + "balance_loss_mlp": 1.05784667, + "epoch": 0.030722982113332332, + "flos": 23002975440000.0, + "grad_norm": 2.5604378973412234, + "language_loss": 0.83884758, + "learning_rate": 3.999994539508036e-06, + "loss": 0.86251408, + "num_input_tokens_seen": 10864325, + "router_z_loss_clip": 1.98828125, + "router_z_loss_mlp": 0.36889648, + "step": 511, + "time_per_iteration": 2.4935812950134277 + }, + { + "auxiliary_loss_clip": 0.01272784, + "auxiliary_loss_mlp": 0.01078007, + "balance_loss_clip": 1.0684154, + "balance_loss_mlp": 1.0442946, + "epoch": 0.0307831053660003, + "flos": 24750855152640.0, + "grad_norm": 2.2504115759534793, + "language_loss": 0.82515132, + "learning_rate": 3.9999935915064655e-06, + "loss": 0.84865928, + "num_input_tokens_seen": 10883860, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.3371582, + "step": 512, + "time_per_iteration": 2.4614174365997314 + }, + { + "auxiliary_loss_clip": 0.01272707, + "auxiliary_loss_mlp": 0.01082717, + "balance_loss_clip": 1.0704776, + "balance_loss_mlp": 1.04807496, + "epoch": 0.03084322861866827, + "flos": 26140885620480.0, + "grad_norm": 2.0181206659803834, + "language_loss": 0.87138361, + "learning_rate": 3.9999925676649374e-06, + "loss": 0.89493781, + "num_input_tokens_seen": 10904555, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.34643555, + "step": 513, + "time_per_iteration": 2.6019437313079834 + }, + { + "auxiliary_loss_clip": 0.01278128, + "auxiliary_loss_mlp": 0.01077728, + "balance_loss_clip": 1.07229447, + "balance_loss_mlp": 1.0427047, + "epoch": 0.03090335187133624, + "flos": 18771298168320.0, + "grad_norm": 2.1180048144420867, + "language_loss": 0.79359943, + "learning_rate": 3.999991467983491e-06, + "loss": 0.81715798, + "num_input_tokens_seen": 10923700, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.3503418, + "step": 514, + "time_per_iteration": 2.465216875076294 + }, + { + "auxiliary_loss_clip": 0.01269983, + "auxiliary_loss_mlp": 0.0106698, + "balance_loss_clip": 1.07107246, + "balance_loss_mlp": 1.03438866, + "epoch": 0.030963475124004207, + "flos": 23221886878080.0, + "grad_norm": 3.7400651271102245, + "language_loss": 0.7766664, + "learning_rate": 3.999990292462167e-06, + "loss": 0.80003601, + "num_input_tokens_seen": 10942730, + "router_z_loss_clip": 1.98828125, + "router_z_loss_mlp": 0.32617188, + "step": 515, + "time_per_iteration": 2.510329246520996 + }, + { + "auxiliary_loss_clip": 0.01268951, + "auxiliary_loss_mlp": 0.01070529, + "balance_loss_clip": 1.06661415, + "balance_loss_mlp": 1.03624392, + "epoch": 0.03102359837667218, + "flos": 42525595411200.0, + "grad_norm": 1.932378645137839, + "language_loss": 0.82799578, + "learning_rate": 3.999989041101011e-06, + "loss": 0.85139054, + "num_input_tokens_seen": 10967120, + "router_z_loss_clip": 2.02539062, + "router_z_loss_mlp": 0.34301758, + "step": 516, + "time_per_iteration": 2.6695549488067627 + }, + { + "auxiliary_loss_clip": 0.01267771, + "auxiliary_loss_mlp": 0.01077292, + "balance_loss_clip": 1.06976581, + "balance_loss_mlp": 1.04286492, + "epoch": 0.031083721629340148, + "flos": 21176953689600.0, + "grad_norm": 1.7589337832660743, + "language_loss": 0.79101479, + "learning_rate": 3.999987713900071e-06, + "loss": 0.8144654, + "num_input_tokens_seen": 10986775, + "router_z_loss_clip": 1.98339844, + "router_z_loss_mlp": 0.34423828, + "step": 517, + "time_per_iteration": 2.4792487621307373 + }, + { + "auxiliary_loss_clip": 0.01265992, + "auxiliary_loss_mlp": 0.01076322, + "balance_loss_clip": 1.07150495, + "balance_loss_mlp": 1.04234731, + "epoch": 0.031143844882008116, + "flos": 29716187713920.0, + "grad_norm": 1.7819471997809357, + "language_loss": 0.90738797, + "learning_rate": 3.999986310859396e-06, + "loss": 0.93081117, + "num_input_tokens_seen": 11011360, + "router_z_loss_clip": 1.94726562, + "router_z_loss_mlp": 0.33935547, + "step": 518, + "time_per_iteration": 2.4929251670837402 + }, + { + "auxiliary_loss_clip": 0.0127455, + "auxiliary_loss_mlp": 0.01098829, + "balance_loss_clip": 1.07592332, + "balance_loss_mlp": 1.06268466, + "epoch": 0.031203968134676085, + "flos": 23112467072640.0, + "grad_norm": 4.212982880922224, + "language_loss": 0.86374021, + "learning_rate": 3.999984831979039e-06, + "loss": 0.88747394, + "num_input_tokens_seen": 11030150, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.36108398, + "step": 519, + "time_per_iteration": 2.463771343231201 + }, + { + "auxiliary_loss_clip": 0.01271997, + "auxiliary_loss_mlp": 0.01095973, + "balance_loss_clip": 1.06750798, + "balance_loss_mlp": 1.05951905, + "epoch": 0.03126409138734405, + "flos": 20954379064320.0, + "grad_norm": 2.3677310140264414, + "language_loss": 0.87275994, + "learning_rate": 3.999983277259057e-06, + "loss": 0.89643967, + "num_input_tokens_seen": 11049145, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.36425781, + "step": 520, + "time_per_iteration": 2.4958927631378174 + }, + { + "auxiliary_loss_clip": 0.01273595, + "auxiliary_loss_mlp": 0.01089777, + "balance_loss_clip": 1.06924343, + "balance_loss_mlp": 1.05449152, + "epoch": 0.031324214640012026, + "flos": 21650112570240.0, + "grad_norm": 2.014826589605571, + "language_loss": 0.89647299, + "learning_rate": 3.999981646699509e-06, + "loss": 0.92010665, + "num_input_tokens_seen": 11068835, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.35302734, + "step": 521, + "time_per_iteration": 2.4442553520202637 + }, + { + "auxiliary_loss_clip": 0.01266339, + "auxiliary_loss_mlp": 0.01080164, + "balance_loss_clip": 1.06726456, + "balance_loss_mlp": 1.04134917, + "epoch": 0.03138433789267999, + "flos": 23441337020160.0, + "grad_norm": 2.189281731023907, + "language_loss": 0.71294105, + "learning_rate": 3.999979940300456e-06, + "loss": 0.73640609, + "num_input_tokens_seen": 11088980, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.38793945, + "step": 522, + "time_per_iteration": 2.4410316944122314 + }, + { + "auxiliary_loss_clip": 0.01277147, + "auxiliary_loss_mlp": 0.01082005, + "balance_loss_clip": 1.07051957, + "balance_loss_mlp": 1.0485785, + "epoch": 0.03144446114534796, + "flos": 18982164960000.0, + "grad_norm": 3.1271580482394827, + "language_loss": 0.85475934, + "learning_rate": 3.999978158061963e-06, + "loss": 0.87835085, + "num_input_tokens_seen": 11104300, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.33447266, + "step": 523, + "time_per_iteration": 2.437105178833008 + }, + { + "auxiliary_loss_clip": 0.01275574, + "auxiliary_loss_mlp": 0.01092029, + "balance_loss_clip": 1.06828332, + "balance_loss_mlp": 1.05433512, + "epoch": 0.031504584398015935, + "flos": 22637692080000.0, + "grad_norm": 2.762270803278037, + "language_loss": 0.90424883, + "learning_rate": 3.999976299984099e-06, + "loss": 0.92792487, + "num_input_tokens_seen": 11123335, + "router_z_loss_clip": 2.07128906, + "router_z_loss_mlp": 0.37695312, + "step": 524, + "time_per_iteration": 2.4177260398864746 + }, + { + "auxiliary_loss_clip": 0.0129388, + "auxiliary_loss_mlp": 0.01088396, + "balance_loss_clip": 1.08220887, + "balance_loss_mlp": 1.05308604, + "epoch": 0.0315647076506839, + "flos": 25297056339840.0, + "grad_norm": 2.5130931636483007, + "language_loss": 0.80130821, + "learning_rate": 3.999974366066933e-06, + "loss": 0.825131, + "num_input_tokens_seen": 11140880, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.35327148, + "step": 525, + "time_per_iteration": 2.499000310897827 + }, + { + "auxiliary_loss_clip": 0.0127063, + "auxiliary_loss_mlp": 0.01077083, + "balance_loss_clip": 1.06699598, + "balance_loss_mlp": 1.0427506, + "epoch": 0.03162483090335187, + "flos": 16982839065600.0, + "grad_norm": 2.2741599910790846, + "language_loss": 0.80812764, + "learning_rate": 3.999972356310538e-06, + "loss": 0.83160478, + "num_input_tokens_seen": 11158710, + "router_z_loss_clip": 2.03710938, + "router_z_loss_mlp": 0.34326172, + "step": 526, + "time_per_iteration": 2.4493188858032227 + }, + { + "auxiliary_loss_clip": 0.01284464, + "auxiliary_loss_mlp": 0.0107656, + "balance_loss_clip": 1.07565761, + "balance_loss_mlp": 1.03867543, + "epoch": 0.03168495415601984, + "flos": 18734489706240.0, + "grad_norm": 2.163671185716368, + "language_loss": 0.81309545, + "learning_rate": 3.999970270714991e-06, + "loss": 0.83670568, + "num_input_tokens_seen": 11177550, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.37866211, + "step": 527, + "time_per_iteration": 2.4406042098999023 + }, + { + "auxiliary_loss_clip": 0.01269608, + "auxiliary_loss_mlp": 0.01083045, + "balance_loss_clip": 1.06675839, + "balance_loss_mlp": 1.04854536, + "epoch": 0.03174507740868781, + "flos": 21214875473280.0, + "grad_norm": 1.987926953695524, + "language_loss": 0.93864775, + "learning_rate": 3.999968109280371e-06, + "loss": 0.96217424, + "num_input_tokens_seen": 11196230, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.3449707, + "step": 528, + "time_per_iteration": 2.4678566455841064 + }, + { + "auxiliary_loss_clip": 0.01269628, + "auxiliary_loss_mlp": 0.0107081, + "balance_loss_clip": 1.0662843, + "balance_loss_mlp": 1.03726423, + "epoch": 0.03180520066135578, + "flos": 24787663614720.0, + "grad_norm": 1.8664282236463314, + "language_loss": 0.83966434, + "learning_rate": 3.99996587200676e-06, + "loss": 0.86306864, + "num_input_tokens_seen": 11214935, + "router_z_loss_clip": 2.03417969, + "router_z_loss_mlp": 0.33520508, + "step": 529, + "time_per_iteration": 2.479313850402832 + }, + { + "auxiliary_loss_clip": 0.01272557, + "auxiliary_loss_mlp": 0.01087788, + "balance_loss_clip": 1.07432365, + "balance_loss_mlp": 1.05378962, + "epoch": 0.03186532391402375, + "flos": 24864261367680.0, + "grad_norm": 1.972386330222208, + "language_loss": 0.90487325, + "learning_rate": 3.999963558894243e-06, + "loss": 0.92847675, + "num_input_tokens_seen": 11235310, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.33996582, + "step": 530, + "time_per_iteration": 2.496230125427246 + }, + { + "auxiliary_loss_clip": 0.0126517, + "auxiliary_loss_mlp": 0.0107532, + "balance_loss_clip": 1.06264949, + "balance_loss_mlp": 1.03929508, + "epoch": 0.03192544716669172, + "flos": 21215055041280.0, + "grad_norm": 2.143241539002764, + "language_loss": 0.75985491, + "learning_rate": 3.999961169942907e-06, + "loss": 0.78325987, + "num_input_tokens_seen": 11254425, + "router_z_loss_clip": 2.02636719, + "router_z_loss_mlp": 0.36035156, + "step": 531, + "time_per_iteration": 2.5200157165527344 + }, + { + "auxiliary_loss_clip": 0.01266656, + "auxiliary_loss_mlp": 0.010709, + "balance_loss_clip": 1.06539738, + "balance_loss_mlp": 1.03489923, + "epoch": 0.03198557041935969, + "flos": 24353216616960.0, + "grad_norm": 2.188522032270987, + "language_loss": 0.90467536, + "learning_rate": 3.999958705152843e-06, + "loss": 0.92805094, + "num_input_tokens_seen": 11274595, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.35986328, + "step": 532, + "time_per_iteration": 2.482733964920044 + }, + { + "auxiliary_loss_clip": 0.01146204, + "auxiliary_loss_mlp": 0.01030144, + "balance_loss_clip": 1.0397768, + "balance_loss_mlp": 1.02257419, + "epoch": 0.032045693672027656, + "flos": 61827367587840.0, + "grad_norm": 0.7326584471308951, + "language_loss": 0.57914186, + "learning_rate": 3.9999561645241445e-06, + "loss": 0.6009053, + "num_input_tokens_seen": 11336705, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.07556152, + "step": 533, + "time_per_iteration": 3.113551616668701 + }, + { + "auxiliary_loss_clip": 0.01279027, + "auxiliary_loss_mlp": 0.0108656, + "balance_loss_clip": 1.0749284, + "balance_loss_mlp": 1.05215621, + "epoch": 0.03210581692469563, + "flos": 28401174800640.0, + "grad_norm": 1.8062485003245479, + "language_loss": 0.86704206, + "learning_rate": 3.999953548056907e-06, + "loss": 0.89069784, + "num_input_tokens_seen": 11356820, + "router_z_loss_clip": 2.04199219, + "router_z_loss_mlp": 0.34399414, + "step": 534, + "time_per_iteration": 2.4983747005462646 + }, + { + "auxiliary_loss_clip": 0.01266429, + "auxiliary_loss_mlp": 0.01072134, + "balance_loss_clip": 1.06789708, + "balance_loss_mlp": 1.03839731, + "epoch": 0.03216594017736359, + "flos": 24717709877760.0, + "grad_norm": 2.2328496613266013, + "language_loss": 0.76978779, + "learning_rate": 3.999950855751232e-06, + "loss": 0.79317337, + "num_input_tokens_seen": 11376645, + "router_z_loss_clip": 1.98339844, + "router_z_loss_mlp": 0.33740234, + "step": 535, + "time_per_iteration": 2.5291225910186768 + }, + { + "auxiliary_loss_clip": 0.01284089, + "auxiliary_loss_mlp": 0.01095657, + "balance_loss_clip": 1.0780493, + "balance_loss_mlp": 1.06149173, + "epoch": 0.032226063430031565, + "flos": 31175453646720.0, + "grad_norm": 3.0130644375054367, + "language_loss": 0.80964613, + "learning_rate": 3.999948087607219e-06, + "loss": 0.83344358, + "num_input_tokens_seen": 11397310, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.34179688, + "step": 536, + "time_per_iteration": 2.524505138397217 + }, + { + "auxiliary_loss_clip": 0.01268132, + "auxiliary_loss_mlp": 0.01077246, + "balance_loss_clip": 1.06821704, + "balance_loss_mlp": 1.04110157, + "epoch": 0.03228618668269954, + "flos": 32198225506560.0, + "grad_norm": 2.763377009430925, + "language_loss": 0.69976974, + "learning_rate": 3.999945243624975e-06, + "loss": 0.72322357, + "num_input_tokens_seen": 11418475, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.36132812, + "step": 537, + "time_per_iteration": 2.5459742546081543 + }, + { + "auxiliary_loss_clip": 0.01278048, + "auxiliary_loss_mlp": 0.01090496, + "balance_loss_clip": 1.0772686, + "balance_loss_mlp": 1.05678368, + "epoch": 0.0323463099353675, + "flos": 22670154996480.0, + "grad_norm": 2.352809112169099, + "language_loss": 0.82919204, + "learning_rate": 3.999942323804607e-06, + "loss": 0.8528775, + "num_input_tokens_seen": 11436630, + "router_z_loss_clip": 2.00683594, + "router_z_loss_mlp": 0.33691406, + "step": 538, + "time_per_iteration": 2.4757044315338135 + }, + { + "auxiliary_loss_clip": 0.01279244, + "auxiliary_loss_mlp": 0.01083096, + "balance_loss_clip": 1.0710274, + "balance_loss_mlp": 1.04883504, + "epoch": 0.032406433188035474, + "flos": 26905172232960.0, + "grad_norm": 2.5035266925388573, + "language_loss": 0.79400885, + "learning_rate": 3.999939328146225e-06, + "loss": 0.81763232, + "num_input_tokens_seen": 11457275, + "router_z_loss_clip": 2.08300781, + "router_z_loss_mlp": 0.3425293, + "step": 539, + "time_per_iteration": 2.532674551010132 + }, + { + "auxiliary_loss_clip": 0.01269138, + "auxiliary_loss_mlp": 0.01073964, + "balance_loss_clip": 1.06851077, + "balance_loss_mlp": 1.03903627, + "epoch": 0.03246655644070344, + "flos": 31503928544640.0, + "grad_norm": 2.633965224383571, + "language_loss": 0.7730881, + "learning_rate": 3.999936256649943e-06, + "loss": 0.79651916, + "num_input_tokens_seen": 11476925, + "router_z_loss_clip": 2.00585938, + "router_z_loss_mlp": 0.34912109, + "step": 540, + "time_per_iteration": 3.9111578464508057 + }, + { + "auxiliary_loss_clip": 0.0127527, + "auxiliary_loss_mlp": 0.01080658, + "balance_loss_clip": 1.07209599, + "balance_loss_mlp": 1.04687405, + "epoch": 0.03252667969337141, + "flos": 23218331431680.0, + "grad_norm": 2.812065825210922, + "language_loss": 0.8562429, + "learning_rate": 3.999933109315878e-06, + "loss": 0.87980217, + "num_input_tokens_seen": 11496830, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.33764648, + "step": 541, + "time_per_iteration": 3.8845434188842773 + }, + { + "auxiliary_loss_clip": 0.01264782, + "auxiliary_loss_mlp": 0.01101701, + "balance_loss_clip": 1.06816101, + "balance_loss_mlp": 1.06250477, + "epoch": 0.032586802946039384, + "flos": 14757454926720.0, + "grad_norm": 4.055442586337348, + "language_loss": 0.89346194, + "learning_rate": 3.9999298861441496e-06, + "loss": 0.91712677, + "num_input_tokens_seen": 11515605, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.3918457, + "step": 542, + "time_per_iteration": 5.197389602661133 + }, + { + "auxiliary_loss_clip": 0.01268081, + "auxiliary_loss_mlp": 0.01084144, + "balance_loss_clip": 1.06885028, + "balance_loss_mlp": 1.04943013, + "epoch": 0.03264692619870735, + "flos": 24280677100800.0, + "grad_norm": 2.2044322380770103, + "language_loss": 0.70837677, + "learning_rate": 3.999926587134879e-06, + "loss": 0.73189902, + "num_input_tokens_seen": 11536230, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.34716797, + "step": 543, + "time_per_iteration": 2.4609365463256836 + }, + { + "auxiliary_loss_clip": 0.01266246, + "auxiliary_loss_mlp": 0.01090269, + "balance_loss_clip": 1.06277251, + "balance_loss_mlp": 1.05457795, + "epoch": 0.03270704945137532, + "flos": 22893160584960.0, + "grad_norm": 2.66556752239319, + "language_loss": 0.9177534, + "learning_rate": 3.999923212288192e-06, + "loss": 0.94131851, + "num_input_tokens_seen": 11554715, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.35668945, + "step": 544, + "time_per_iteration": 2.4055731296539307 + }, + { + "auxiliary_loss_clip": 0.01269215, + "auxiliary_loss_mlp": 0.01093396, + "balance_loss_clip": 1.0687232, + "balance_loss_mlp": 1.05877733, + "epoch": 0.032767172704043286, + "flos": 18041018757120.0, + "grad_norm": 3.8514371412001416, + "language_loss": 0.65740347, + "learning_rate": 3.999919761604216e-06, + "loss": 0.68102962, + "num_input_tokens_seen": 11571370, + "router_z_loss_clip": 2.00390625, + "router_z_loss_mlp": 0.34619141, + "step": 545, + "time_per_iteration": 2.4541826248168945 + }, + { + "auxiliary_loss_clip": 0.012687, + "auxiliary_loss_mlp": 0.01073575, + "balance_loss_clip": 1.06614709, + "balance_loss_mlp": 1.040411, + "epoch": 0.03282729595671126, + "flos": 22528739151360.0, + "grad_norm": 2.432656000928981, + "language_loss": 0.92087471, + "learning_rate": 3.999916235083083e-06, + "loss": 0.94429737, + "num_input_tokens_seen": 11588560, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.33154297, + "step": 546, + "time_per_iteration": 2.4250826835632324 + }, + { + "auxiliary_loss_clip": 0.01262558, + "auxiliary_loss_mlp": 0.01073181, + "balance_loss_clip": 1.06178188, + "balance_loss_mlp": 1.03770435, + "epoch": 0.03288741920937923, + "flos": 20410620001920.0, + "grad_norm": 3.7740733880707813, + "language_loss": 0.81677979, + "learning_rate": 3.999912632724925e-06, + "loss": 0.84013718, + "num_input_tokens_seen": 11605685, + "router_z_loss_clip": 2.00683594, + "router_z_loss_mlp": 0.35473633, + "step": 547, + "time_per_iteration": 2.4886105060577393 + }, + { + "auxiliary_loss_clip": 0.01264711, + "auxiliary_loss_mlp": 0.01076631, + "balance_loss_clip": 1.06570506, + "balance_loss_mlp": 1.0419172, + "epoch": 0.032947542462047195, + "flos": 20777986350720.0, + "grad_norm": 3.8977237263158555, + "language_loss": 0.81083345, + "learning_rate": 3.999908954529881e-06, + "loss": 0.83424687, + "num_input_tokens_seen": 11626290, + "router_z_loss_clip": 1.98828125, + "router_z_loss_mlp": 0.34716797, + "step": 548, + "time_per_iteration": 2.51440167427063 + }, + { + "auxiliary_loss_clip": 0.01267169, + "auxiliary_loss_mlp": 0.01084365, + "balance_loss_clip": 1.06580734, + "balance_loss_mlp": 1.04857814, + "epoch": 0.03300766571471517, + "flos": 19901263190400.0, + "grad_norm": 3.811832765896871, + "language_loss": 0.67497045, + "learning_rate": 3.999905200498087e-06, + "loss": 0.69848573, + "num_input_tokens_seen": 11643950, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.35791016, + "step": 549, + "time_per_iteration": 2.4712605476379395 + }, + { + "auxiliary_loss_clip": 0.01259237, + "auxiliary_loss_mlp": 0.010795, + "balance_loss_clip": 1.0652473, + "balance_loss_mlp": 1.04559743, + "epoch": 0.03306778896738313, + "flos": 17967760968960.0, + "grad_norm": 2.803915269466362, + "language_loss": 0.86163747, + "learning_rate": 3.999901370629689e-06, + "loss": 0.88502485, + "num_input_tokens_seen": 11662560, + "router_z_loss_clip": 1.94140625, + "router_z_loss_mlp": 0.33886719, + "step": 550, + "time_per_iteration": 2.401296377182007 + }, + { + "auxiliary_loss_clip": 0.01270106, + "auxiliary_loss_mlp": 0.01094437, + "balance_loss_clip": 1.07174122, + "balance_loss_mlp": 1.06043863, + "epoch": 0.033127912220051105, + "flos": 21653380707840.0, + "grad_norm": 1.8366782596224869, + "language_loss": 0.81393206, + "learning_rate": 3.99989746492483e-06, + "loss": 0.83757752, + "num_input_tokens_seen": 11682265, + "router_z_loss_clip": 1.98535156, + "router_z_loss_mlp": 0.33984375, + "step": 551, + "time_per_iteration": 2.4586291313171387 + }, + { + "auxiliary_loss_clip": 0.01281964, + "auxiliary_loss_mlp": 0.01083647, + "balance_loss_clip": 1.07539725, + "balance_loss_mlp": 1.04969561, + "epoch": 0.03318803547271908, + "flos": 30188376927360.0, + "grad_norm": 2.8699637218934466, + "language_loss": 0.86756575, + "learning_rate": 3.999893483383658e-06, + "loss": 0.89122176, + "num_input_tokens_seen": 11699300, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.33935547, + "step": 552, + "time_per_iteration": 2.4951858520507812 + }, + { + "auxiliary_loss_clip": 0.01275043, + "auxiliary_loss_mlp": 0.01082865, + "balance_loss_clip": 1.07320821, + "balance_loss_mlp": 1.04691124, + "epoch": 0.03324815872538704, + "flos": 20376038183040.0, + "grad_norm": 3.039045477373242, + "language_loss": 0.93138361, + "learning_rate": 3.999889426006326e-06, + "loss": 0.95496261, + "num_input_tokens_seen": 11716955, + "router_z_loss_clip": 2.01855469, + "router_z_loss_mlp": 0.359375, + "step": 553, + "time_per_iteration": 2.529451370239258 + }, + { + "auxiliary_loss_clip": 0.01285504, + "auxiliary_loss_mlp": 0.01079587, + "balance_loss_clip": 1.08195221, + "balance_loss_mlp": 1.04341877, + "epoch": 0.033308281978055014, + "flos": 24494560634880.0, + "grad_norm": 2.2949179525013013, + "language_loss": 0.78901225, + "learning_rate": 3.999885292792986e-06, + "loss": 0.8126632, + "num_input_tokens_seen": 11736130, + "router_z_loss_clip": 2.03515625, + "router_z_loss_mlp": 0.36157227, + "step": 554, + "time_per_iteration": 2.555934190750122 + }, + { + "auxiliary_loss_clip": 0.01261288, + "auxiliary_loss_mlp": 0.01084385, + "balance_loss_clip": 1.06548238, + "balance_loss_mlp": 1.04890811, + "epoch": 0.03336840523072298, + "flos": 23400326666880.0, + "grad_norm": 2.1049556510259118, + "language_loss": 0.81991303, + "learning_rate": 3.999881083743795e-06, + "loss": 0.84336984, + "num_input_tokens_seen": 11754425, + "router_z_loss_clip": 1.95800781, + "router_z_loss_mlp": 0.35473633, + "step": 555, + "time_per_iteration": 2.5828497409820557 + }, + { + "auxiliary_loss_clip": 0.01264884, + "auxiliary_loss_mlp": 0.01081854, + "balance_loss_clip": 1.06540632, + "balance_loss_mlp": 1.04675889, + "epoch": 0.03342852848339095, + "flos": 30550571717760.0, + "grad_norm": 2.9130667641625023, + "language_loss": 0.88321412, + "learning_rate": 3.999876798858914e-06, + "loss": 0.90668148, + "num_input_tokens_seen": 11772845, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.35107422, + "step": 556, + "time_per_iteration": 2.501940965652466 + }, + { + "auxiliary_loss_clip": 0.01267542, + "auxiliary_loss_mlp": 0.01079881, + "balance_loss_clip": 1.06873429, + "balance_loss_mlp": 1.04440415, + "epoch": 0.03348865173605892, + "flos": 22893304239360.0, + "grad_norm": 2.295487668169056, + "language_loss": 0.83412457, + "learning_rate": 3.999872438138503e-06, + "loss": 0.85759878, + "num_input_tokens_seen": 11792850, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.35498047, + "step": 557, + "time_per_iteration": 2.512049436569214 + }, + { + "auxiliary_loss_clip": 0.0127056, + "auxiliary_loss_mlp": 0.01071, + "balance_loss_clip": 1.07267201, + "balance_loss_mlp": 1.03785944, + "epoch": 0.03354877498872689, + "flos": 17676022705920.0, + "grad_norm": 2.665559508791083, + "language_loss": 0.93671942, + "learning_rate": 3.999868001582729e-06, + "loss": 0.96013498, + "num_input_tokens_seen": 11809670, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.33129883, + "step": 558, + "time_per_iteration": 2.4143247604370117 + }, + { + "auxiliary_loss_clip": 0.01268054, + "auxiliary_loss_mlp": 0.01073473, + "balance_loss_clip": 1.06797457, + "balance_loss_mlp": 1.03992701, + "epoch": 0.03360889824139486, + "flos": 21652985658240.0, + "grad_norm": 6.775750133927222, + "language_loss": 0.77202445, + "learning_rate": 3.99986348919176e-06, + "loss": 0.79543972, + "num_input_tokens_seen": 11829665, + "router_z_loss_clip": 1.99804688, + "router_z_loss_mlp": 0.33544922, + "step": 559, + "time_per_iteration": 2.503293514251709 + }, + { + "auxiliary_loss_clip": 0.01265118, + "auxiliary_loss_mlp": 0.01082754, + "balance_loss_clip": 1.0666759, + "balance_loss_mlp": 1.05030537, + "epoch": 0.033669021494062826, + "flos": 21795730306560.0, + "grad_norm": 2.014361797714762, + "language_loss": 0.87668806, + "learning_rate": 3.9998589009657675e-06, + "loss": 0.90016675, + "num_input_tokens_seen": 11848190, + "router_z_loss_clip": 1.984375, + "router_z_loss_mlp": 0.32421875, + "step": 560, + "time_per_iteration": 2.4671030044555664 + }, + { + "auxiliary_loss_clip": 0.012601, + "auxiliary_loss_mlp": 0.01067415, + "balance_loss_clip": 1.06417966, + "balance_loss_mlp": 1.03654003, + "epoch": 0.0337291447467308, + "flos": 21866222747520.0, + "grad_norm": 2.2578935603846757, + "language_loss": 0.81519407, + "learning_rate": 3.999854236904925e-06, + "loss": 0.83846921, + "num_input_tokens_seen": 11864795, + "router_z_loss_clip": 1.95605469, + "router_z_loss_mlp": 0.30859375, + "step": 561, + "time_per_iteration": 2.478531837463379 + }, + { + "auxiliary_loss_clip": 0.01258473, + "auxiliary_loss_mlp": 0.01072121, + "balance_loss_clip": 1.06617999, + "balance_loss_mlp": 1.04045916, + "epoch": 0.03378926799939877, + "flos": 24245951627520.0, + "grad_norm": 2.3064441104796223, + "language_loss": 0.82157779, + "learning_rate": 3.999849497009409e-06, + "loss": 0.84488374, + "num_input_tokens_seen": 11885275, + "router_z_loss_clip": 1.92285156, + "router_z_loss_mlp": 0.31640625, + "step": 562, + "time_per_iteration": 2.460118293762207 + }, + { + "auxiliary_loss_clip": 0.01266023, + "auxiliary_loss_mlp": 0.01075689, + "balance_loss_clip": 1.06850219, + "balance_loss_mlp": 1.04188156, + "epoch": 0.033849391252066735, + "flos": 16507812677760.0, + "grad_norm": 2.1319519514195693, + "language_loss": 0.84157825, + "learning_rate": 3.999844681279401e-06, + "loss": 0.86499536, + "num_input_tokens_seen": 11903595, + "router_z_loss_clip": 1.97363281, + "router_z_loss_mlp": 0.33813477, + "step": 563, + "time_per_iteration": 2.4259562492370605 + }, + { + "auxiliary_loss_clip": 0.01263722, + "auxiliary_loss_mlp": 0.01081572, + "balance_loss_clip": 1.06958723, + "balance_loss_mlp": 1.04867029, + "epoch": 0.03390951450473471, + "flos": 15669298609920.0, + "grad_norm": 2.9479317421775297, + "language_loss": 0.93954277, + "learning_rate": 3.99983978971508e-06, + "loss": 0.96299571, + "num_input_tokens_seen": 11917815, + "router_z_loss_clip": 1.94140625, + "router_z_loss_mlp": 0.32910156, + "step": 564, + "time_per_iteration": 2.3870346546173096 + }, + { + "auxiliary_loss_clip": 0.01278812, + "auxiliary_loss_mlp": 0.01070793, + "balance_loss_clip": 1.07548976, + "balance_loss_mlp": 1.03703284, + "epoch": 0.03396963775740267, + "flos": 22674787850880.0, + "grad_norm": 2.3191458372418223, + "language_loss": 0.94541544, + "learning_rate": 3.999834822316635e-06, + "loss": 0.96891153, + "num_input_tokens_seen": 11936305, + "router_z_loss_clip": 2.03222656, + "router_z_loss_mlp": 0.33764648, + "step": 565, + "time_per_iteration": 2.455517530441284 + }, + { + "auxiliary_loss_clip": 0.01157268, + "auxiliary_loss_mlp": 0.01034517, + "balance_loss_clip": 1.04765916, + "balance_loss_mlp": 1.0263871, + "epoch": 0.034029761010070644, + "flos": 64392683063040.0, + "grad_norm": 1.035331267751436, + "language_loss": 0.54804832, + "learning_rate": 3.9998297790842535e-06, + "loss": 0.56996614, + "num_input_tokens_seen": 11998940, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.08129883, + "step": 566, + "time_per_iteration": 3.110107421875 + }, + { + "auxiliary_loss_clip": 0.01266023, + "auxiliary_loss_mlp": 0.0107212, + "balance_loss_clip": 1.0695653, + "balance_loss_mlp": 1.0365243, + "epoch": 0.034089884262738616, + "flos": 25004204755200.0, + "grad_norm": 2.4172541441174578, + "language_loss": 0.76810509, + "learning_rate": 3.999824660018126e-06, + "loss": 0.79148656, + "num_input_tokens_seen": 12018860, + "router_z_loss_clip": 1.96386719, + "router_z_loss_mlp": 0.35571289, + "step": 567, + "time_per_iteration": 2.511686325073242 + }, + { + "auxiliary_loss_clip": 0.01258329, + "auxiliary_loss_mlp": 0.01079997, + "balance_loss_clip": 1.0665369, + "balance_loss_mlp": 1.04876375, + "epoch": 0.03415000751540658, + "flos": 28439096584320.0, + "grad_norm": 2.0031988073477334, + "language_loss": 0.80754197, + "learning_rate": 3.999819465118447e-06, + "loss": 0.83092529, + "num_input_tokens_seen": 12039675, + "router_z_loss_clip": 1.91796875, + "router_z_loss_mlp": 0.31225586, + "step": 568, + "time_per_iteration": 2.529449701309204 + }, + { + "auxiliary_loss_clip": 0.01259479, + "auxiliary_loss_mlp": 0.01084554, + "balance_loss_clip": 1.06803203, + "balance_loss_mlp": 1.04879153, + "epoch": 0.034210130768074554, + "flos": 21468727866240.0, + "grad_norm": 1.714001865685956, + "language_loss": 0.86327451, + "learning_rate": 3.999814194385413e-06, + "loss": 0.88671482, + "num_input_tokens_seen": 12057680, + "router_z_loss_clip": 1.91601562, + "router_z_loss_mlp": 0.35791016, + "step": 569, + "time_per_iteration": 2.4500653743743896 + }, + { + "auxiliary_loss_clip": 0.01265615, + "auxiliary_loss_mlp": 0.01077846, + "balance_loss_clip": 1.06903207, + "balance_loss_mlp": 1.04487324, + "epoch": 0.03427025402074252, + "flos": 18697501676160.0, + "grad_norm": 2.323375365632005, + "language_loss": 0.95924622, + "learning_rate": 3.9998088478192255e-06, + "loss": 0.98268086, + "num_input_tokens_seen": 12076135, + "router_z_loss_clip": 1.96582031, + "router_z_loss_mlp": 0.32958984, + "step": 570, + "time_per_iteration": 2.407297372817993 + }, + { + "auxiliary_loss_clip": 0.01262212, + "auxiliary_loss_mlp": 0.01078543, + "balance_loss_clip": 1.06356311, + "balance_loss_mlp": 1.04049182, + "epoch": 0.03433037727341049, + "flos": 20849987162880.0, + "grad_norm": 2.3216707884992958, + "language_loss": 0.79556483, + "learning_rate": 3.9998034254200846e-06, + "loss": 0.81897235, + "num_input_tokens_seen": 12094785, + "router_z_loss_clip": 1.98730469, + "router_z_loss_mlp": 0.38037109, + "step": 571, + "time_per_iteration": 2.4652841091156006 + }, + { + "auxiliary_loss_clip": 0.01268118, + "auxiliary_loss_mlp": 0.01085841, + "balance_loss_clip": 1.07225227, + "balance_loss_mlp": 1.05107951, + "epoch": 0.03439050052607846, + "flos": 25410282986880.0, + "grad_norm": 2.289128407121325, + "language_loss": 0.80097419, + "learning_rate": 3.999797927188199e-06, + "loss": 0.82451373, + "num_input_tokens_seen": 12114590, + "router_z_loss_clip": 1.95703125, + "router_z_loss_mlp": 0.34765625, + "step": 572, + "time_per_iteration": 2.502070426940918 + }, + { + "auxiliary_loss_clip": 0.0126951, + "auxiliary_loss_mlp": 0.01070436, + "balance_loss_clip": 1.07134056, + "balance_loss_mlp": 1.03810596, + "epoch": 0.03445062377874643, + "flos": 17640147997440.0, + "grad_norm": 2.1646623471944206, + "language_loss": 0.84444213, + "learning_rate": 3.999792353123774e-06, + "loss": 0.86784154, + "num_input_tokens_seen": 12132390, + "router_z_loss_clip": 1.98242188, + "router_z_loss_mlp": 0.32299805, + "step": 573, + "time_per_iteration": 2.441690683364868 + }, + { + "auxiliary_loss_clip": 0.01260228, + "auxiliary_loss_mlp": 0.01063983, + "balance_loss_clip": 1.06221259, + "balance_loss_mlp": 1.03332198, + "epoch": 0.0345107470314144, + "flos": 16764502245120.0, + "grad_norm": 2.413000196800887, + "language_loss": 0.76625824, + "learning_rate": 3.999786703227023e-06, + "loss": 0.7895003, + "num_input_tokens_seen": 12149035, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.30615234, + "step": 574, + "time_per_iteration": 2.410158157348633 + }, + { + "auxiliary_loss_clip": 0.01260756, + "auxiliary_loss_mlp": 0.01069262, + "balance_loss_clip": 1.06500006, + "balance_loss_mlp": 1.03826737, + "epoch": 0.03457087028408237, + "flos": 14684448533760.0, + "grad_norm": 2.399362707311582, + "language_loss": 0.83932877, + "learning_rate": 3.9997809774981606e-06, + "loss": 0.86262894, + "num_input_tokens_seen": 12167530, + "router_z_loss_clip": 1.95703125, + "router_z_loss_mlp": 0.31030273, + "step": 575, + "time_per_iteration": 2.446709632873535 + }, + { + "auxiliary_loss_clip": 0.01257149, + "auxiliary_loss_mlp": 0.01072367, + "balance_loss_clip": 1.06830239, + "balance_loss_mlp": 1.04144382, + "epoch": 0.03463099353675034, + "flos": 20011293527040.0, + "grad_norm": 2.7208847948307002, + "language_loss": 0.83786523, + "learning_rate": 3.9997751759374025e-06, + "loss": 0.8611604, + "num_input_tokens_seen": 12186340, + "router_z_loss_clip": 1.88769531, + "router_z_loss_mlp": 0.30944824, + "step": 576, + "time_per_iteration": 2.4228646755218506 + }, + { + "auxiliary_loss_clip": 0.01265269, + "auxiliary_loss_mlp": 0.01075774, + "balance_loss_clip": 1.07463229, + "balance_loss_mlp": 1.04451728, + "epoch": 0.03469111678941831, + "flos": 25301150490240.0, + "grad_norm": 2.8904718516067494, + "language_loss": 0.86485851, + "learning_rate": 3.99976929854497e-06, + "loss": 0.88826901, + "num_input_tokens_seen": 12204090, + "router_z_loss_clip": 1.91015625, + "router_z_loss_mlp": 0.3125, + "step": 577, + "time_per_iteration": 2.519881010055542 + }, + { + "auxiliary_loss_clip": 0.01255321, + "auxiliary_loss_mlp": 0.01077754, + "balance_loss_clip": 1.06691837, + "balance_loss_mlp": 1.04611588, + "epoch": 0.034751240042086275, + "flos": 23259413612160.0, + "grad_norm": 1.899071467849308, + "language_loss": 0.72004938, + "learning_rate": 3.9997633453210845e-06, + "loss": 0.74338007, + "num_input_tokens_seen": 12224850, + "router_z_loss_clip": 1.8828125, + "router_z_loss_mlp": 0.31640625, + "step": 578, + "time_per_iteration": 2.440695285797119 + }, + { + "auxiliary_loss_clip": 0.01255368, + "auxiliary_loss_mlp": 0.01073207, + "balance_loss_clip": 1.06347346, + "balance_loss_mlp": 1.04075837, + "epoch": 0.03481136329475425, + "flos": 23769237300480.0, + "grad_norm": 1.9618317469992324, + "language_loss": 0.77611995, + "learning_rate": 3.999757316265973e-06, + "loss": 0.79940569, + "num_input_tokens_seen": 12244935, + "router_z_loss_clip": 1.91894531, + "router_z_loss_mlp": 0.32470703, + "step": 579, + "time_per_iteration": 2.5134618282318115 + }, + { + "auxiliary_loss_clip": 0.01254063, + "auxiliary_loss_mlp": 0.01080455, + "balance_loss_clip": 1.06479466, + "balance_loss_mlp": 1.04819727, + "epoch": 0.03487148654742222, + "flos": 20157521794560.0, + "grad_norm": 6.048321881614732, + "language_loss": 0.86778641, + "learning_rate": 3.999751211379863e-06, + "loss": 0.89113164, + "num_input_tokens_seen": 12262140, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.32250977, + "step": 580, + "time_per_iteration": 2.445385217666626 + }, + { + "auxiliary_loss_clip": 0.01266631, + "auxiliary_loss_mlp": 0.01063222, + "balance_loss_clip": 1.07090282, + "balance_loss_mlp": 1.03427815, + "epoch": 0.034931609800090184, + "flos": 15669585918720.0, + "grad_norm": 2.8807242754115157, + "language_loss": 0.82152796, + "learning_rate": 3.999745030662987e-06, + "loss": 0.84482646, + "num_input_tokens_seen": 12280930, + "router_z_loss_clip": 1.95800781, + "router_z_loss_mlp": 0.28955078, + "step": 581, + "time_per_iteration": 2.4358489513397217 + }, + { + "auxiliary_loss_clip": 0.01254698, + "auxiliary_loss_mlp": 0.01068331, + "balance_loss_clip": 1.06533146, + "balance_loss_mlp": 1.03869557, + "epoch": 0.034991733052758156, + "flos": 16362374509440.0, + "grad_norm": 2.8825582488873573, + "language_loss": 0.77933764, + "learning_rate": 3.99973877411558e-06, + "loss": 0.80256796, + "num_input_tokens_seen": 12299125, + "router_z_loss_clip": 1.89453125, + "router_z_loss_mlp": 0.29589844, + "step": 582, + "time_per_iteration": 2.416388750076294 + }, + { + "auxiliary_loss_clip": 0.01251161, + "auxiliary_loss_mlp": 0.0108249, + "balance_loss_clip": 1.06549931, + "balance_loss_mlp": 1.05130494, + "epoch": 0.03505185630542612, + "flos": 19387309438080.0, + "grad_norm": 2.082147777818996, + "language_loss": 0.87522542, + "learning_rate": 3.999732441737877e-06, + "loss": 0.89856195, + "num_input_tokens_seen": 12316905, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.31176758, + "step": 583, + "time_per_iteration": 3.9232211112976074 + }, + { + "auxiliary_loss_clip": 0.01257852, + "auxiliary_loss_mlp": 0.01085376, + "balance_loss_clip": 1.06570756, + "balance_loss_mlp": 1.0550015, + "epoch": 0.03511197955809409, + "flos": 21323828401920.0, + "grad_norm": 2.5357860941219124, + "language_loss": 0.80758297, + "learning_rate": 3.99972603353012e-06, + "loss": 0.83101529, + "num_input_tokens_seen": 12335070, + "router_z_loss_clip": 1.91992188, + "router_z_loss_mlp": 0.3034668, + "step": 584, + "time_per_iteration": 2.501668930053711 + }, + { + "auxiliary_loss_clip": 0.01252352, + "auxiliary_loss_mlp": 0.01063235, + "balance_loss_clip": 1.06220448, + "balance_loss_mlp": 1.03302717, + "epoch": 0.035172102810762065, + "flos": 14136595320960.0, + "grad_norm": 3.1661098832579055, + "language_loss": 0.93133688, + "learning_rate": 3.999719549492551e-06, + "loss": 0.95449269, + "num_input_tokens_seen": 12350315, + "router_z_loss_clip": 1.90136719, + "router_z_loss_mlp": 0.30224609, + "step": 585, + "time_per_iteration": 3.9737977981567383 + }, + { + "auxiliary_loss_clip": 0.01253896, + "auxiliary_loss_mlp": 0.01074883, + "balance_loss_clip": 1.06421471, + "balance_loss_mlp": 1.04395962, + "epoch": 0.03523222606343003, + "flos": 20296890564480.0, + "grad_norm": 4.555648730721972, + "language_loss": 0.87701553, + "learning_rate": 3.9997129896254165e-06, + "loss": 0.90030336, + "num_input_tokens_seen": 12366030, + "router_z_loss_clip": 1.8984375, + "router_z_loss_mlp": 0.30932617, + "step": 586, + "time_per_iteration": 3.909459352493286 + }, + { + "auxiliary_loss_clip": 0.01260186, + "auxiliary_loss_mlp": 0.01077548, + "balance_loss_clip": 1.06725264, + "balance_loss_mlp": 1.04729235, + "epoch": 0.035292349316098, + "flos": 20375822701440.0, + "grad_norm": 1.8975489031996278, + "language_loss": 0.76709205, + "learning_rate": 3.999706353928965e-06, + "loss": 0.79046935, + "num_input_tokens_seen": 12384895, + "router_z_loss_clip": 1.92871094, + "router_z_loss_mlp": 0.30249023, + "step": 587, + "time_per_iteration": 2.4322116374969482 + }, + { + "auxiliary_loss_clip": 0.01265277, + "auxiliary_loss_mlp": 0.01070325, + "balance_loss_clip": 1.0699563, + "balance_loss_mlp": 1.03534889, + "epoch": 0.03535247256876597, + "flos": 21468871520640.0, + "grad_norm": 1.6239935810814503, + "language_loss": 0.78720272, + "learning_rate": 3.999699642403449e-06, + "loss": 0.81055868, + "num_input_tokens_seen": 12404980, + "router_z_loss_clip": 1.95507812, + "router_z_loss_mlp": 0.34985352, + "step": 588, + "time_per_iteration": 2.4393699169158936 + }, + { + "auxiliary_loss_clip": 0.01257747, + "auxiliary_loss_mlp": 0.01073967, + "balance_loss_clip": 1.06435215, + "balance_loss_mlp": 1.04061258, + "epoch": 0.03541259582143394, + "flos": 23623044946560.0, + "grad_norm": 2.6217520199257915, + "language_loss": 0.94101095, + "learning_rate": 3.99969285504912e-06, + "loss": 0.96432805, + "num_input_tokens_seen": 12423835, + "router_z_loss_clip": 1.93359375, + "router_z_loss_mlp": 0.33349609, + "step": 589, + "time_per_iteration": 2.4542205333709717 + }, + { + "auxiliary_loss_clip": 0.01259439, + "auxiliary_loss_mlp": 0.01075235, + "balance_loss_clip": 1.06528544, + "balance_loss_mlp": 1.0450511, + "epoch": 0.03547271907410191, + "flos": 33726367768320.0, + "grad_norm": 2.3513884221370818, + "language_loss": 0.84346068, + "learning_rate": 3.99968599186624e-06, + "loss": 0.86680746, + "num_input_tokens_seen": 12443135, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.30175781, + "step": 590, + "time_per_iteration": 2.537113904953003 + }, + { + "auxiliary_loss_clip": 0.01253422, + "auxiliary_loss_mlp": 0.01069083, + "balance_loss_clip": 1.06622362, + "balance_loss_mlp": 1.0404253, + "epoch": 0.03553284232676988, + "flos": 21142695093120.0, + "grad_norm": 3.323350135774785, + "language_loss": 0.87055618, + "learning_rate": 3.999679052855065e-06, + "loss": 0.89378124, + "num_input_tokens_seen": 12462895, + "router_z_loss_clip": 1.87304688, + "router_z_loss_mlp": 0.28625488, + "step": 591, + "time_per_iteration": 2.4329397678375244 + }, + { + "auxiliary_loss_clip": 0.01253012, + "auxiliary_loss_mlp": 0.01075348, + "balance_loss_clip": 1.06128287, + "balance_loss_mlp": 1.04471159, + "epoch": 0.03559296557943785, + "flos": 20046593617920.0, + "grad_norm": 2.3828886590699825, + "language_loss": 0.83167148, + "learning_rate": 3.999672038015861e-06, + "loss": 0.85495508, + "num_input_tokens_seen": 12481515, + "router_z_loss_clip": 1.91699219, + "router_z_loss_mlp": 0.30639648, + "step": 592, + "time_per_iteration": 2.421966075897217 + }, + { + "auxiliary_loss_clip": 0.0115648, + "auxiliary_loss_mlp": 0.01016845, + "balance_loss_clip": 1.04814053, + "balance_loss_mlp": 1.00988352, + "epoch": 0.035653088832105814, + "flos": 60334597244160.0, + "grad_norm": 0.8805966993594648, + "language_loss": 0.59784442, + "learning_rate": 3.999664947348893e-06, + "loss": 0.61957765, + "num_input_tokens_seen": 12548220, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.06970215, + "step": 593, + "time_per_iteration": 3.117891311645508 + }, + { + "auxiliary_loss_clip": 0.01253659, + "auxiliary_loss_mlp": 0.01075905, + "balance_loss_clip": 1.06761122, + "balance_loss_mlp": 1.04483879, + "epoch": 0.035713212084773786, + "flos": 20113135562880.0, + "grad_norm": 2.0463621466338124, + "language_loss": 0.87014967, + "learning_rate": 3.999657780854429e-06, + "loss": 0.89344531, + "num_input_tokens_seen": 12566105, + "router_z_loss_clip": 1.86035156, + "router_z_loss_mlp": 0.31054688, + "step": 594, + "time_per_iteration": 2.452394723892212 + }, + { + "auxiliary_loss_clip": 0.01251621, + "auxiliary_loss_mlp": 0.01076574, + "balance_loss_clip": 1.06085217, + "balance_loss_mlp": 1.0466764, + "epoch": 0.03577333533744176, + "flos": 26285785084800.0, + "grad_norm": 2.49417145388115, + "language_loss": 0.83904302, + "learning_rate": 3.999650538532742e-06, + "loss": 0.86232495, + "num_input_tokens_seen": 12586680, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.29846191, + "step": 595, + "time_per_iteration": 2.5821022987365723 + }, + { + "auxiliary_loss_clip": 0.01250092, + "auxiliary_loss_mlp": 0.01074663, + "balance_loss_clip": 1.06376028, + "balance_loss_mlp": 1.04434836, + "epoch": 0.035833458590109724, + "flos": 10889732211840.0, + "grad_norm": 2.6768076028524175, + "language_loss": 0.96130681, + "learning_rate": 3.999643220384106e-06, + "loss": 0.98455435, + "num_input_tokens_seen": 12601605, + "router_z_loss_clip": 1.86230469, + "router_z_loss_mlp": 0.30285645, + "step": 596, + "time_per_iteration": 2.4444072246551514 + }, + { + "auxiliary_loss_clip": 0.01254338, + "auxiliary_loss_mlp": 0.01072749, + "balance_loss_clip": 1.06542301, + "balance_loss_mlp": 1.04440069, + "epoch": 0.035893581842777696, + "flos": 22090198003200.0, + "grad_norm": 2.460724644560582, + "language_loss": 0.83158314, + "learning_rate": 3.999635826408799e-06, + "loss": 0.85485399, + "num_input_tokens_seen": 12620365, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.28344727, + "step": 597, + "time_per_iteration": 2.5133752822875977 + }, + { + "auxiliary_loss_clip": 0.01250173, + "auxiliary_loss_mlp": 0.01077409, + "balance_loss_clip": 1.0651412, + "balance_loss_mlp": 1.04391146, + "epoch": 0.03595370509544566, + "flos": 23038347358080.0, + "grad_norm": 2.017000711697892, + "language_loss": 0.81473446, + "learning_rate": 3.999628356607101e-06, + "loss": 0.83801031, + "num_input_tokens_seen": 12641140, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.3347168, + "step": 598, + "time_per_iteration": 2.4579508304595947 + }, + { + "auxiliary_loss_clip": 0.01245591, + "auxiliary_loss_mlp": 0.01075125, + "balance_loss_clip": 1.06497788, + "balance_loss_mlp": 1.04451263, + "epoch": 0.03601382834811363, + "flos": 20777734955520.0, + "grad_norm": 1.7660120314751449, + "language_loss": 0.81468189, + "learning_rate": 3.999620810979295e-06, + "loss": 0.83788908, + "num_input_tokens_seen": 12661080, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.3059082, + "step": 599, + "time_per_iteration": 2.455057144165039 + }, + { + "auxiliary_loss_clip": 0.01253579, + "auxiliary_loss_mlp": 0.01078233, + "balance_loss_clip": 1.06279492, + "balance_loss_mlp": 1.04649901, + "epoch": 0.036073951600781605, + "flos": 23951627585280.0, + "grad_norm": 2.280113516340334, + "language_loss": 0.85972083, + "learning_rate": 3.999613189525668e-06, + "loss": 0.88303888, + "num_input_tokens_seen": 12678270, + "router_z_loss_clip": 1.90722656, + "router_z_loss_mlp": 0.31713867, + "step": 600, + "time_per_iteration": 2.4519293308258057 + }, + { + "auxiliary_loss_clip": 0.01248453, + "auxiliary_loss_mlp": 0.01084522, + "balance_loss_clip": 1.06245887, + "balance_loss_mlp": 1.05483866, + "epoch": 0.03613407485344957, + "flos": 18912283050240.0, + "grad_norm": 4.657010746023376, + "language_loss": 0.82660282, + "learning_rate": 3.999605492246508e-06, + "loss": 0.84993255, + "num_input_tokens_seen": 12697295, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.296875, + "step": 601, + "time_per_iteration": 2.470601797103882 + }, + { + "auxiliary_loss_clip": 0.01245559, + "auxiliary_loss_mlp": 0.01071755, + "balance_loss_clip": 1.06123364, + "balance_loss_mlp": 1.03949738, + "epoch": 0.03619419810611754, + "flos": 23038526926080.0, + "grad_norm": 2.394620494100761, + "language_loss": 0.75280678, + "learning_rate": 3.999597719142107e-06, + "loss": 0.77597994, + "num_input_tokens_seen": 12716165, + "router_z_loss_clip": 1.84472656, + "router_z_loss_mlp": 0.32250977, + "step": 602, + "time_per_iteration": 2.4276628494262695 + }, + { + "auxiliary_loss_clip": 0.01251975, + "auxiliary_loss_mlp": 0.01056939, + "balance_loss_clip": 1.06567717, + "balance_loss_mlp": 1.02735138, + "epoch": 0.03625432135878551, + "flos": 29457774293760.0, + "grad_norm": 2.0882687520225796, + "language_loss": 0.79997432, + "learning_rate": 3.999589870212761e-06, + "loss": 0.82306343, + "num_input_tokens_seen": 12735475, + "router_z_loss_clip": 1.86230469, + "router_z_loss_mlp": 0.29589844, + "step": 603, + "time_per_iteration": 2.5183472633361816 + }, + { + "auxiliary_loss_clip": 0.01249842, + "auxiliary_loss_mlp": 0.01066292, + "balance_loss_clip": 1.0656023, + "balance_loss_mlp": 1.0379436, + "epoch": 0.03631444461145348, + "flos": 23508525409920.0, + "grad_norm": 1.935100237490102, + "language_loss": 0.86739266, + "learning_rate": 3.9995819454587664e-06, + "loss": 0.89055395, + "num_input_tokens_seen": 12754540, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.28393555, + "step": 604, + "time_per_iteration": 2.461756467819214 + }, + { + "auxiliary_loss_clip": 0.01250852, + "auxiliary_loss_mlp": 0.01063405, + "balance_loss_clip": 1.06646109, + "balance_loss_mlp": 1.03238654, + "epoch": 0.03637456786412145, + "flos": 16618130323200.0, + "grad_norm": 2.9864383282697724, + "language_loss": 0.80757737, + "learning_rate": 3.999573944880424e-06, + "loss": 0.83071995, + "num_input_tokens_seen": 12773050, + "router_z_loss_clip": 1.84472656, + "router_z_loss_mlp": 0.31030273, + "step": 605, + "time_per_iteration": 2.532248020172119 + }, + { + "auxiliary_loss_clip": 0.01250142, + "auxiliary_loss_mlp": 0.0106205, + "balance_loss_clip": 1.0635457, + "balance_loss_mlp": 1.03451252, + "epoch": 0.03643469111678942, + "flos": 15851832549120.0, + "grad_norm": 2.3124892290812276, + "language_loss": 0.85656083, + "learning_rate": 3.9995658684780375e-06, + "loss": 0.87968278, + "num_input_tokens_seen": 12791240, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.27539062, + "step": 606, + "time_per_iteration": 2.4789857864379883 + }, + { + "auxiliary_loss_clip": 0.01254947, + "auxiliary_loss_mlp": 0.01067969, + "balance_loss_clip": 1.06446719, + "balance_loss_mlp": 1.03733253, + "epoch": 0.03649481436945739, + "flos": 23620387340160.0, + "grad_norm": 2.1135270844322998, + "language_loss": 0.82598984, + "learning_rate": 3.999557716251912e-06, + "loss": 0.84921908, + "num_input_tokens_seen": 12812245, + "router_z_loss_clip": 1.90332031, + "router_z_loss_mlp": 0.30664062, + "step": 607, + "time_per_iteration": 2.4660301208496094 + }, + { + "auxiliary_loss_clip": 0.01250757, + "auxiliary_loss_mlp": 0.01062593, + "balance_loss_clip": 1.06642866, + "balance_loss_mlp": 1.03466213, + "epoch": 0.036554937622125354, + "flos": 21755581879680.0, + "grad_norm": 2.0318875203146973, + "language_loss": 0.83465075, + "learning_rate": 3.999549488202358e-06, + "loss": 0.85778421, + "num_input_tokens_seen": 12831085, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.27941895, + "step": 608, + "time_per_iteration": 2.455007791519165 + }, + { + "auxiliary_loss_clip": 0.0125314, + "auxiliary_loss_mlp": 0.01063658, + "balance_loss_clip": 1.0659517, + "balance_loss_mlp": 1.03254497, + "epoch": 0.036615060874793326, + "flos": 17819772935040.0, + "grad_norm": 2.0825021127336734, + "language_loss": 0.82240176, + "learning_rate": 3.999541184329688e-06, + "loss": 0.84556973, + "num_input_tokens_seen": 12849115, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.31103516, + "step": 609, + "time_per_iteration": 2.449518918991089 + }, + { + "auxiliary_loss_clip": 0.01275557, + "auxiliary_loss_mlp": 0.01085413, + "balance_loss_clip": 1.08337343, + "balance_loss_mlp": 1.05670774, + "epoch": 0.0366751841274613, + "flos": 26753808320640.0, + "grad_norm": 2.2276679473499583, + "language_loss": 0.7937448, + "learning_rate": 3.999532804634215e-06, + "loss": 0.8173545, + "num_input_tokens_seen": 12868005, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.2869873, + "step": 610, + "time_per_iteration": 2.5116255283355713 + }, + { + "auxiliary_loss_clip": 0.0125677, + "auxiliary_loss_mlp": 0.01075228, + "balance_loss_clip": 1.0672965, + "balance_loss_mlp": 1.04568827, + "epoch": 0.03673530738012926, + "flos": 22196960202240.0, + "grad_norm": 1.9963790692880516, + "language_loss": 0.87386465, + "learning_rate": 3.9995243491162575e-06, + "loss": 0.89718461, + "num_input_tokens_seen": 12886890, + "router_z_loss_clip": 1.89453125, + "router_z_loss_mlp": 0.29541016, + "step": 611, + "time_per_iteration": 2.460076093673706 + }, + { + "auxiliary_loss_clip": 0.01251906, + "auxiliary_loss_mlp": 0.01095228, + "balance_loss_clip": 1.06682646, + "balance_loss_mlp": 1.06182528, + "epoch": 0.036795430632797235, + "flos": 24681655601280.0, + "grad_norm": 2.1057743522041674, + "language_loss": 0.72955954, + "learning_rate": 3.999515817776136e-06, + "loss": 0.7530309, + "num_input_tokens_seen": 12906130, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.33374023, + "step": 612, + "time_per_iteration": 2.479724645614624 + }, + { + "auxiliary_loss_clip": 0.01250863, + "auxiliary_loss_mlp": 0.01072, + "balance_loss_clip": 1.06377506, + "balance_loss_mlp": 1.0416255, + "epoch": 0.0368555538854652, + "flos": 17748921358080.0, + "grad_norm": 3.980739708967497, + "language_loss": 0.79069626, + "learning_rate": 3.999507210614175e-06, + "loss": 0.81392491, + "num_input_tokens_seen": 12925260, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.30358887, + "step": 613, + "time_per_iteration": 2.4404444694519043 + }, + { + "auxiliary_loss_clip": 0.01246295, + "auxiliary_loss_mlp": 0.01074035, + "balance_loss_clip": 1.06249237, + "balance_loss_mlp": 1.04540122, + "epoch": 0.03691567713813317, + "flos": 20594554571520.0, + "grad_norm": 2.174160844719924, + "language_loss": 0.93797398, + "learning_rate": 3.9994985276307e-06, + "loss": 0.96117735, + "num_input_tokens_seen": 12944590, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.28625488, + "step": 614, + "time_per_iteration": 2.4248735904693604 + }, + { + "auxiliary_loss_clip": 0.01253365, + "auxiliary_loss_mlp": 0.01068464, + "balance_loss_clip": 1.06678843, + "balance_loss_mlp": 1.03742182, + "epoch": 0.036975800390801145, + "flos": 33650380546560.0, + "grad_norm": 2.705578323334829, + "language_loss": 0.73034728, + "learning_rate": 3.999489768826041e-06, + "loss": 0.75356555, + "num_input_tokens_seen": 12964785, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.31030273, + "step": 615, + "time_per_iteration": 2.5300753116607666 + }, + { + "auxiliary_loss_clip": 0.01251579, + "auxiliary_loss_mlp": 0.01067234, + "balance_loss_clip": 1.06330347, + "balance_loss_mlp": 1.038028, + "epoch": 0.03703592364346911, + "flos": 28293694329600.0, + "grad_norm": 1.7345158688738778, + "language_loss": 0.81544459, + "learning_rate": 3.999480934200528e-06, + "loss": 0.83863276, + "num_input_tokens_seen": 12986705, + "router_z_loss_clip": 1.88085938, + "router_z_loss_mlp": 0.29187012, + "step": 616, + "time_per_iteration": 2.5139291286468506 + }, + { + "auxiliary_loss_clip": 0.01251433, + "auxiliary_loss_mlp": 0.01068509, + "balance_loss_clip": 1.06532764, + "balance_loss_mlp": 1.04018462, + "epoch": 0.03709604689613708, + "flos": 31504215853440.0, + "grad_norm": 2.287943845759663, + "language_loss": 0.67865306, + "learning_rate": 3.999472023754499e-06, + "loss": 0.70185244, + "num_input_tokens_seen": 13010560, + "router_z_loss_clip": 1.86035156, + "router_z_loss_mlp": 0.28344727, + "step": 617, + "time_per_iteration": 2.5318710803985596 + }, + { + "auxiliary_loss_clip": 0.01254759, + "auxiliary_loss_mlp": 0.01066579, + "balance_loss_clip": 1.06836605, + "balance_loss_mlp": 1.03544164, + "epoch": 0.03715617014880505, + "flos": 19609381272960.0, + "grad_norm": 2.9316807699080125, + "language_loss": 0.80468822, + "learning_rate": 3.99946303748829e-06, + "loss": 0.8279016, + "num_input_tokens_seen": 13028935, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.3112793, + "step": 618, + "time_per_iteration": 2.4263484477996826 + }, + { + "auxiliary_loss_clip": 0.01258607, + "auxiliary_loss_mlp": 0.01075636, + "balance_loss_clip": 1.06916976, + "balance_loss_mlp": 1.04330635, + "epoch": 0.03721629340147302, + "flos": 15924192497280.0, + "grad_norm": 2.146672579170062, + "language_loss": 0.91302502, + "learning_rate": 3.999453975402242e-06, + "loss": 0.93636745, + "num_input_tokens_seen": 13046000, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.32324219, + "step": 619, + "time_per_iteration": 2.4282379150390625 + }, + { + "auxiliary_loss_clip": 0.0125441, + "auxiliary_loss_mlp": 0.01073468, + "balance_loss_clip": 1.06928587, + "balance_loss_mlp": 1.04423785, + "epoch": 0.03727641665414099, + "flos": 21104090951040.0, + "grad_norm": 2.3328514349622145, + "language_loss": 0.94258177, + "learning_rate": 3.9994448374967e-06, + "loss": 0.96586055, + "num_input_tokens_seen": 13062995, + "router_z_loss_clip": 1.85058594, + "router_z_loss_mlp": 0.29223633, + "step": 620, + "time_per_iteration": 2.505189895629883 + }, + { + "auxiliary_loss_clip": 0.01254455, + "auxiliary_loss_mlp": 0.01077825, + "balance_loss_clip": 1.06663787, + "balance_loss_mlp": 1.04647315, + "epoch": 0.037336539906808956, + "flos": 24131683486080.0, + "grad_norm": 1.9575732299851465, + "language_loss": 0.77385271, + "learning_rate": 3.999435623772008e-06, + "loss": 0.79717547, + "num_input_tokens_seen": 13084120, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.31347656, + "step": 621, + "time_per_iteration": 2.468454360961914 + }, + { + "auxiliary_loss_clip": 0.0125649, + "auxiliary_loss_mlp": 0.01060943, + "balance_loss_clip": 1.07281351, + "balance_loss_mlp": 1.03018665, + "epoch": 0.03739666315947693, + "flos": 22346384780160.0, + "grad_norm": 3.0710788249133087, + "language_loss": 0.87173647, + "learning_rate": 3.999426334228518e-06, + "loss": 0.89491081, + "num_input_tokens_seen": 13100035, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.30712891, + "step": 622, + "time_per_iteration": 2.457653760910034 + }, + { + "auxiliary_loss_clip": 0.01247789, + "auxiliary_loss_mlp": 0.01063895, + "balance_loss_clip": 1.06520009, + "balance_loss_mlp": 1.03416443, + "epoch": 0.0374567864121449, + "flos": 20449511452800.0, + "grad_norm": 2.168261393279893, + "language_loss": 0.90343928, + "learning_rate": 3.999416968866581e-06, + "loss": 0.92655611, + "num_input_tokens_seen": 13118070, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.29724121, + "step": 623, + "time_per_iteration": 2.489748477935791 + }, + { + "auxiliary_loss_clip": 0.01252008, + "auxiliary_loss_mlp": 0.01086195, + "balance_loss_clip": 1.06741166, + "balance_loss_mlp": 1.0536027, + "epoch": 0.037516909664812866, + "flos": 19208043636480.0, + "grad_norm": 2.2654947380507156, + "language_loss": 0.84071004, + "learning_rate": 3.999407527686551e-06, + "loss": 0.86409205, + "num_input_tokens_seen": 13136355, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.32592773, + "step": 624, + "time_per_iteration": 2.4852142333984375 + }, + { + "auxiliary_loss_clip": 0.012577, + "auxiliary_loss_mlp": 0.0106892, + "balance_loss_clip": 1.06709027, + "balance_loss_mlp": 1.0384022, + "epoch": 0.03757703291748084, + "flos": 35005218664320.0, + "grad_norm": 2.6680138874953405, + "language_loss": 0.67240387, + "learning_rate": 3.999398010688788e-06, + "loss": 0.69567007, + "num_input_tokens_seen": 13155435, + "router_z_loss_clip": 1.90527344, + "router_z_loss_mlp": 0.30517578, + "step": 625, + "time_per_iteration": 2.5767738819122314 + }, + { + "auxiliary_loss_clip": 0.01244064, + "auxiliary_loss_mlp": 0.0106739, + "balance_loss_clip": 1.06300807, + "balance_loss_mlp": 1.03558469, + "epoch": 0.0376371561701488, + "flos": 25483899911040.0, + "grad_norm": 2.167463262912995, + "language_loss": 0.77722007, + "learning_rate": 3.999388417873652e-06, + "loss": 0.80033457, + "num_input_tokens_seen": 13174295, + "router_z_loss_clip": 1.80957031, + "router_z_loss_mlp": 0.31787109, + "step": 626, + "time_per_iteration": 3.9055521488189697 + }, + { + "auxiliary_loss_clip": 0.01250971, + "auxiliary_loss_mlp": 0.01072507, + "balance_loss_clip": 1.0666306, + "balance_loss_mlp": 1.04206121, + "epoch": 0.037697279422816775, + "flos": 18185630912640.0, + "grad_norm": 2.004888230287325, + "language_loss": 0.81336653, + "learning_rate": 3.999378749241506e-06, + "loss": 0.83660132, + "num_input_tokens_seen": 13192500, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.30444336, + "step": 627, + "time_per_iteration": 2.492433786392212 + }, + { + "auxiliary_loss_clip": 0.0126406, + "auxiliary_loss_mlp": 0.01078417, + "balance_loss_clip": 1.07542598, + "balance_loss_mlp": 1.04812622, + "epoch": 0.03775740267548475, + "flos": 24644272521600.0, + "grad_norm": 2.06532163636352, + "language_loss": 0.89036465, + "learning_rate": 3.999369004792719e-06, + "loss": 0.91378939, + "num_input_tokens_seen": 13213470, + "router_z_loss_clip": 1.88769531, + "router_z_loss_mlp": 0.30310059, + "step": 628, + "time_per_iteration": 3.9666833877563477 + }, + { + "auxiliary_loss_clip": 0.01250052, + "auxiliary_loss_mlp": 0.01068799, + "balance_loss_clip": 1.06329572, + "balance_loss_mlp": 1.03947365, + "epoch": 0.03781752592815271, + "flos": 21288205088640.0, + "grad_norm": 2.9360427047907067, + "language_loss": 0.79563904, + "learning_rate": 3.999359184527658e-06, + "loss": 0.81882751, + "num_input_tokens_seen": 13232365, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.29345703, + "step": 629, + "time_per_iteration": 3.878174304962158 + }, + { + "auxiliary_loss_clip": 0.01253302, + "auxiliary_loss_mlp": 0.0106645, + "balance_loss_clip": 1.06677973, + "balance_loss_mlp": 1.03737497, + "epoch": 0.037877649180820684, + "flos": 22089623385600.0, + "grad_norm": 1.6714168204997784, + "language_loss": 0.76952034, + "learning_rate": 3.999349288446696e-06, + "loss": 0.79271787, + "num_input_tokens_seen": 13251920, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.29064941, + "step": 630, + "time_per_iteration": 3.9531402587890625 + }, + { + "auxiliary_loss_clip": 0.01257514, + "auxiliary_loss_mlp": 0.01068791, + "balance_loss_clip": 1.06766129, + "balance_loss_mlp": 1.03978693, + "epoch": 0.03793777243348865, + "flos": 14501339976960.0, + "grad_norm": 4.223902313507048, + "language_loss": 0.91737211, + "learning_rate": 3.99933931655021e-06, + "loss": 0.94063514, + "num_input_tokens_seen": 13267440, + "router_z_loss_clip": 1.90039062, + "router_z_loss_mlp": 0.28991699, + "step": 631, + "time_per_iteration": 2.505610466003418 + }, + { + "auxiliary_loss_clip": 0.01246401, + "auxiliary_loss_mlp": 0.01077736, + "balance_loss_clip": 1.06465101, + "balance_loss_mlp": 1.04624081, + "epoch": 0.03799789568615662, + "flos": 21908418249600.0, + "grad_norm": 1.9326189425671587, + "language_loss": 0.92451489, + "learning_rate": 3.999329268838575e-06, + "loss": 0.94775623, + "num_input_tokens_seen": 13287850, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.31518555, + "step": 632, + "time_per_iteration": 2.478419065475464 + }, + { + "auxiliary_loss_clip": 0.01246355, + "auxiliary_loss_mlp": 0.01072557, + "balance_loss_clip": 1.06327224, + "balance_loss_mlp": 1.04079962, + "epoch": 0.03805801893882459, + "flos": 24827021942400.0, + "grad_norm": 2.0524121692832464, + "language_loss": 0.83104628, + "learning_rate": 3.999319145312175e-06, + "loss": 0.85423541, + "num_input_tokens_seen": 13307760, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.31762695, + "step": 633, + "time_per_iteration": 2.489433765411377 + }, + { + "auxiliary_loss_clip": 0.01248425, + "auxiliary_loss_mlp": 0.01076512, + "balance_loss_clip": 1.06273818, + "balance_loss_mlp": 1.04365766, + "epoch": 0.03811814219149256, + "flos": 30482952364800.0, + "grad_norm": 1.667188507577927, + "language_loss": 0.69929022, + "learning_rate": 3.999308945971392e-06, + "loss": 0.7225396, + "num_input_tokens_seen": 13331230, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.32836914, + "step": 634, + "time_per_iteration": 2.525099515914917 + }, + { + "auxiliary_loss_clip": 0.01165535, + "auxiliary_loss_mlp": 0.01043812, + "balance_loss_clip": 1.05962253, + "balance_loss_mlp": 1.03688622, + "epoch": 0.03817826544416053, + "flos": 66992577379200.0, + "grad_norm": 0.883902333288324, + "language_loss": 0.6160152, + "learning_rate": 3.999298670816614e-06, + "loss": 0.63810867, + "num_input_tokens_seen": 13394760, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.06933594, + "step": 635, + "time_per_iteration": 3.1247613430023193 + }, + { + "auxiliary_loss_clip": 0.0126231, + "auxiliary_loss_mlp": 0.01069047, + "balance_loss_clip": 1.07655704, + "balance_loss_mlp": 1.03941131, + "epoch": 0.038238388696828496, + "flos": 20485350247680.0, + "grad_norm": 2.185470742541124, + "language_loss": 0.83716488, + "learning_rate": 3.9992883198482294e-06, + "loss": 0.8604784, + "num_input_tokens_seen": 13412775, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.29602051, + "step": 636, + "time_per_iteration": 2.4689719676971436 + }, + { + "auxiliary_loss_clip": 0.01248181, + "auxiliary_loss_mlp": 0.010754, + "balance_loss_clip": 1.06545782, + "balance_loss_mlp": 1.04699278, + "epoch": 0.03829851194949647, + "flos": 17965893461760.0, + "grad_norm": 3.4291243491681627, + "language_loss": 0.79501593, + "learning_rate": 3.999277893066632e-06, + "loss": 0.81825173, + "num_input_tokens_seen": 13427835, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.2845459, + "step": 637, + "time_per_iteration": 2.477790117263794 + }, + { + "auxiliary_loss_clip": 0.01250087, + "auxiliary_loss_mlp": 0.01077342, + "balance_loss_clip": 1.06271267, + "balance_loss_mlp": 1.04653847, + "epoch": 0.03835863520216444, + "flos": 22456522857600.0, + "grad_norm": 2.1267883398379754, + "language_loss": 0.84057665, + "learning_rate": 3.999267390472215e-06, + "loss": 0.86385095, + "num_input_tokens_seen": 13447295, + "router_z_loss_clip": 1.87011719, + "router_z_loss_mlp": 0.30786133, + "step": 638, + "time_per_iteration": 2.444639205932617 + }, + { + "auxiliary_loss_clip": 0.01252244, + "auxiliary_loss_mlp": 0.01068147, + "balance_loss_clip": 1.06382966, + "balance_loss_mlp": 1.03795087, + "epoch": 0.038418758454832405, + "flos": 22164425458560.0, + "grad_norm": 2.3346128858251913, + "language_loss": 0.70186591, + "learning_rate": 3.999256812065381e-06, + "loss": 0.72506976, + "num_input_tokens_seen": 13468455, + "router_z_loss_clip": 1.88378906, + "router_z_loss_mlp": 0.30212402, + "step": 639, + "time_per_iteration": 2.49729323387146 + }, + { + "auxiliary_loss_clip": 0.01249226, + "auxiliary_loss_mlp": 0.01074745, + "balance_loss_clip": 1.06416047, + "balance_loss_mlp": 1.04427505, + "epoch": 0.03847888170750038, + "flos": 22747435107840.0, + "grad_norm": 2.5634768100056426, + "language_loss": 0.85416329, + "learning_rate": 3.999246157846526e-06, + "loss": 0.87740302, + "num_input_tokens_seen": 13489085, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.3046875, + "step": 640, + "time_per_iteration": 2.457425117492676 + }, + { + "auxiliary_loss_clip": 0.01255489, + "auxiliary_loss_mlp": 0.01077371, + "balance_loss_clip": 1.06856632, + "balance_loss_mlp": 1.04632926, + "epoch": 0.03853900496016834, + "flos": 22711201263360.0, + "grad_norm": 2.1570369390002666, + "language_loss": 0.82235861, + "learning_rate": 3.9992354278160574e-06, + "loss": 0.84568721, + "num_input_tokens_seen": 13509120, + "router_z_loss_clip": 1.87011719, + "router_z_loss_mlp": 0.31030273, + "step": 641, + "time_per_iteration": 2.5049078464508057 + }, + { + "auxiliary_loss_clip": 0.01156829, + "auxiliary_loss_mlp": 0.01011147, + "balance_loss_clip": 1.05521345, + "balance_loss_mlp": 1.0039227, + "epoch": 0.038599128212836314, + "flos": 70399136355840.0, + "grad_norm": 0.9090873289589033, + "language_loss": 0.65491825, + "learning_rate": 3.999224621974381e-06, + "loss": 0.67659801, + "num_input_tokens_seen": 13562005, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.07214355, + "step": 642, + "time_per_iteration": 3.0639586448669434 + }, + { + "auxiliary_loss_clip": 0.01243518, + "auxiliary_loss_mlp": 0.01056129, + "balance_loss_clip": 1.06219625, + "balance_loss_mlp": 1.0279603, + "epoch": 0.03865925146550429, + "flos": 23295144666240.0, + "grad_norm": 2.056479364154889, + "language_loss": 0.79538673, + "learning_rate": 3.999213740321906e-06, + "loss": 0.81838322, + "num_input_tokens_seen": 13582185, + "router_z_loss_clip": 1.81347656, + "router_z_loss_mlp": 0.28161621, + "step": 643, + "time_per_iteration": 2.4980010986328125 + }, + { + "auxiliary_loss_clip": 0.01244851, + "auxiliary_loss_mlp": 0.01069686, + "balance_loss_clip": 1.06087995, + "balance_loss_mlp": 1.04118323, + "epoch": 0.03871937471817225, + "flos": 21430446946560.0, + "grad_norm": 2.016841318006057, + "language_loss": 0.83117247, + "learning_rate": 3.999202782859046e-06, + "loss": 0.85431778, + "num_input_tokens_seen": 13599555, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.28503418, + "step": 644, + "time_per_iteration": 2.467411518096924 + }, + { + "auxiliary_loss_clip": 0.01249536, + "auxiliary_loss_mlp": 0.01063129, + "balance_loss_clip": 1.06384063, + "balance_loss_mlp": 1.0335412, + "epoch": 0.038779497970840224, + "flos": 34277309550720.0, + "grad_norm": 2.256427404791217, + "language_loss": 0.8220439, + "learning_rate": 3.9991917495862165e-06, + "loss": 0.84517056, + "num_input_tokens_seen": 13621160, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.29589844, + "step": 645, + "time_per_iteration": 2.5916879177093506 + }, + { + "auxiliary_loss_clip": 0.01252848, + "auxiliary_loss_mlp": 0.01068143, + "balance_loss_clip": 1.06536055, + "balance_loss_mlp": 1.03803086, + "epoch": 0.03883962122350819, + "flos": 22748189293440.0, + "grad_norm": 2.3901027655344413, + "language_loss": 0.82132542, + "learning_rate": 3.9991806405038345e-06, + "loss": 0.84453535, + "num_input_tokens_seen": 13641915, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.30102539, + "step": 646, + "time_per_iteration": 2.4722189903259277 + }, + { + "auxiliary_loss_clip": 0.01254777, + "auxiliary_loss_mlp": 0.01077781, + "balance_loss_clip": 1.07174098, + "balance_loss_mlp": 1.0490036, + "epoch": 0.03889974447617616, + "flos": 21945837242880.0, + "grad_norm": 1.940053002155115, + "language_loss": 0.81842965, + "learning_rate": 3.999169455612323e-06, + "loss": 0.84175527, + "num_input_tokens_seen": 13661410, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.28747559, + "step": 647, + "time_per_iteration": 2.4782800674438477 + }, + { + "auxiliary_loss_clip": 0.01249134, + "auxiliary_loss_mlp": 0.01064432, + "balance_loss_clip": 1.06592655, + "balance_loss_mlp": 1.03570271, + "epoch": 0.03895986772884413, + "flos": 31504826384640.0, + "grad_norm": 2.328807606735426, + "language_loss": 0.84388447, + "learning_rate": 3.999158194912106e-06, + "loss": 0.86702019, + "num_input_tokens_seen": 13681705, + "router_z_loss_clip": 1.83105469, + "router_z_loss_mlp": 0.2869873, + "step": 648, + "time_per_iteration": 2.5222222805023193 + }, + { + "auxiliary_loss_clip": 0.01246719, + "auxiliary_loss_mlp": 0.01067271, + "balance_loss_clip": 1.06424379, + "balance_loss_mlp": 1.03773117, + "epoch": 0.0390199909815121, + "flos": 19901011795200.0, + "grad_norm": 12.893090686029698, + "language_loss": 0.84490347, + "learning_rate": 3.9991468584036086e-06, + "loss": 0.8680433, + "num_input_tokens_seen": 13700400, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.29541016, + "step": 649, + "time_per_iteration": 2.481580972671509 + }, + { + "auxiliary_loss_clip": 0.01249096, + "auxiliary_loss_mlp": 0.01067491, + "balance_loss_clip": 1.06559372, + "balance_loss_mlp": 1.03678215, + "epoch": 0.03908011423418007, + "flos": 21612478095360.0, + "grad_norm": 1.8752317329542143, + "language_loss": 0.79839742, + "learning_rate": 3.999135446087263e-06, + "loss": 0.82156336, + "num_input_tokens_seen": 13720145, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.30688477, + "step": 650, + "time_per_iteration": 2.452439069747925 + }, + { + "auxiliary_loss_clip": 0.01241058, + "auxiliary_loss_mlp": 0.01069753, + "balance_loss_clip": 1.05958056, + "balance_loss_mlp": 1.03987885, + "epoch": 0.039140237486848035, + "flos": 18661411486080.0, + "grad_norm": 2.235445561053212, + "language_loss": 0.78339714, + "learning_rate": 3.9991239579635e-06, + "loss": 0.8065052, + "num_input_tokens_seen": 13737500, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.29882812, + "step": 651, + "time_per_iteration": 2.4702706336975098 + }, + { + "auxiliary_loss_clip": 0.01246793, + "auxiliary_loss_mlp": 0.01078197, + "balance_loss_clip": 1.06399536, + "balance_loss_mlp": 1.04558146, + "epoch": 0.03920036073951601, + "flos": 18661124177280.0, + "grad_norm": 2.471086260329896, + "language_loss": 0.87732029, + "learning_rate": 3.999112394032757e-06, + "loss": 0.90057015, + "num_input_tokens_seen": 13754750, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.32592773, + "step": 652, + "time_per_iteration": 2.462461233139038 + }, + { + "auxiliary_loss_clip": 0.01238893, + "auxiliary_loss_mlp": 0.01065682, + "balance_loss_clip": 1.06049204, + "balance_loss_mlp": 1.03707194, + "epoch": 0.03926048399218398, + "flos": 31354468053120.0, + "grad_norm": 2.5946290852242333, + "language_loss": 0.78968394, + "learning_rate": 3.999100754295471e-06, + "loss": 0.81272972, + "num_input_tokens_seen": 13771990, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.28613281, + "step": 653, + "time_per_iteration": 2.5209829807281494 + }, + { + "auxiliary_loss_clip": 0.01256497, + "auxiliary_loss_mlp": 0.01063139, + "balance_loss_clip": 1.06610656, + "balance_loss_mlp": 1.03295517, + "epoch": 0.039320607244851945, + "flos": 29603499770880.0, + "grad_norm": 2.4343773993215025, + "language_loss": 0.86489868, + "learning_rate": 3.999089038752085e-06, + "loss": 0.88809502, + "num_input_tokens_seen": 13792750, + "router_z_loss_clip": 1.90234375, + "router_z_loss_mlp": 0.30200195, + "step": 654, + "time_per_iteration": 2.528764247894287 + }, + { + "auxiliary_loss_clip": 0.01161638, + "auxiliary_loss_mlp": 0.01069677, + "balance_loss_clip": 1.07238531, + "balance_loss_mlp": 1.06121278, + "epoch": 0.03938073049751992, + "flos": 66534609951360.0, + "grad_norm": 0.7529279812653269, + "language_loss": 0.49922198, + "learning_rate": 3.999077247403041e-06, + "loss": 0.52153516, + "num_input_tokens_seen": 13858570, + "router_z_loss_clip": 0.89257812, + "router_z_loss_mlp": 0.08459473, + "step": 655, + "time_per_iteration": 3.1714792251586914 + }, + { + "auxiliary_loss_clip": 0.01244455, + "auxiliary_loss_mlp": 0.01070304, + "balance_loss_clip": 1.0662992, + "balance_loss_mlp": 1.04121661, + "epoch": 0.03944085375018788, + "flos": 23367827836800.0, + "grad_norm": 2.2643669015206616, + "language_loss": 0.81194115, + "learning_rate": 3.9990653802487886e-06, + "loss": 0.83508873, + "num_input_tokens_seen": 13876335, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.29101562, + "step": 656, + "time_per_iteration": 2.4469714164733887 + }, + { + "auxiliary_loss_clip": 0.01253711, + "auxiliary_loss_mlp": 0.01093711, + "balance_loss_clip": 1.06613982, + "balance_loss_mlp": 1.05570745, + "epoch": 0.039500977002855854, + "flos": 18548292579840.0, + "grad_norm": 2.50106679653301, + "language_loss": 0.76241761, + "learning_rate": 3.999053437289776e-06, + "loss": 0.78589189, + "num_input_tokens_seen": 13892640, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.37988281, + "step": 657, + "time_per_iteration": 2.4040260314941406 + }, + { + "auxiliary_loss_clip": 0.01254146, + "auxiliary_loss_mlp": 0.01074501, + "balance_loss_clip": 1.06922221, + "balance_loss_mlp": 1.04214728, + "epoch": 0.039561100255523826, + "flos": 25338174433920.0, + "grad_norm": 2.3471065829040207, + "language_loss": 0.81956077, + "learning_rate": 3.999041418526457e-06, + "loss": 0.84284723, + "num_input_tokens_seen": 13910085, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.32348633, + "step": 658, + "time_per_iteration": 2.5148909091949463 + }, + { + "auxiliary_loss_clip": 0.01246798, + "auxiliary_loss_mlp": 0.01080465, + "balance_loss_clip": 1.06621504, + "balance_loss_mlp": 1.04598951, + "epoch": 0.03962122350819179, + "flos": 18219889509120.0, + "grad_norm": 2.2458684847666857, + "language_loss": 0.91352487, + "learning_rate": 3.999029323959287e-06, + "loss": 0.9367975, + "num_input_tokens_seen": 13928800, + "router_z_loss_clip": 1.80664062, + "router_z_loss_mlp": 0.34472656, + "step": 659, + "time_per_iteration": 2.507502555847168 + }, + { + "auxiliary_loss_clip": 0.01252568, + "auxiliary_loss_mlp": 0.01072203, + "balance_loss_clip": 1.06734753, + "balance_loss_mlp": 1.04274666, + "epoch": 0.03968134676085976, + "flos": 20522230536960.0, + "grad_norm": 2.119574425103679, + "language_loss": 0.79518712, + "learning_rate": 3.999017153588724e-06, + "loss": 0.81843483, + "num_input_tokens_seen": 13948325, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.29418945, + "step": 660, + "time_per_iteration": 2.4464449882507324 + }, + { + "auxiliary_loss_clip": 0.01251965, + "auxiliary_loss_mlp": 0.01082728, + "balance_loss_clip": 1.07083333, + "balance_loss_mlp": 1.04899216, + "epoch": 0.03974147001352773, + "flos": 22422587483520.0, + "grad_norm": 2.0977159912455634, + "language_loss": 0.81910861, + "learning_rate": 3.999004907415231e-06, + "loss": 0.84245551, + "num_input_tokens_seen": 13969090, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.3371582, + "step": 661, + "time_per_iteration": 2.4903628826141357 + }, + { + "auxiliary_loss_clip": 0.01174204, + "auxiliary_loss_mlp": 0.01114884, + "balance_loss_clip": 1.07249832, + "balance_loss_mlp": 1.10583591, + "epoch": 0.0398015932661957, + "flos": 71128769322240.0, + "grad_norm": 0.9405697827959738, + "language_loss": 0.69386363, + "learning_rate": 3.998992585439272e-06, + "loss": 0.71675444, + "num_input_tokens_seen": 14037555, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.09033203, + "step": 662, + "time_per_iteration": 3.2286548614501953 + }, + { + "auxiliary_loss_clip": 0.01256246, + "auxiliary_loss_mlp": 0.01070012, + "balance_loss_clip": 1.07070088, + "balance_loss_mlp": 1.03951848, + "epoch": 0.03986171651886367, + "flos": 16800951571200.0, + "grad_norm": 2.374817533645842, + "language_loss": 0.83074605, + "learning_rate": 3.998980187661314e-06, + "loss": 0.85400867, + "num_input_tokens_seen": 14055765, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.3046875, + "step": 663, + "time_per_iteration": 2.424852132797241 + }, + { + "auxiliary_loss_clip": 0.01257748, + "auxiliary_loss_mlp": 0.01067728, + "balance_loss_clip": 1.07122231, + "balance_loss_mlp": 1.03561258, + "epoch": 0.03992183977153164, + "flos": 24535068197760.0, + "grad_norm": 2.7008870680875208, + "language_loss": 0.87274623, + "learning_rate": 3.998967714081826e-06, + "loss": 0.89600098, + "num_input_tokens_seen": 14074195, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.32128906, + "step": 664, + "time_per_iteration": 2.5280966758728027 + }, + { + "auxiliary_loss_clip": 0.01244677, + "auxiliary_loss_mlp": 0.01065006, + "balance_loss_clip": 1.06511378, + "balance_loss_mlp": 1.03124571, + "epoch": 0.03998196302419961, + "flos": 15595897167360.0, + "grad_norm": 2.376272385855498, + "language_loss": 0.84998578, + "learning_rate": 3.998955164701281e-06, + "loss": 0.87308264, + "num_input_tokens_seen": 14090215, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.33764648, + "step": 665, + "time_per_iteration": 2.425635576248169 + }, + { + "auxiliary_loss_clip": 0.0125914, + "auxiliary_loss_mlp": 0.01086088, + "balance_loss_clip": 1.06984532, + "balance_loss_mlp": 1.05023038, + "epoch": 0.04004208627686758, + "flos": 25305065072640.0, + "grad_norm": 2.7885047240052914, + "language_loss": 0.81722516, + "learning_rate": 3.998942539520158e-06, + "loss": 0.8406775, + "num_input_tokens_seen": 14112150, + "router_z_loss_clip": 1.89160156, + "router_z_loss_mlp": 0.35839844, + "step": 666, + "time_per_iteration": 2.601614475250244 + }, + { + "auxiliary_loss_clip": 0.01252322, + "auxiliary_loss_mlp": 0.01076034, + "balance_loss_clip": 1.06989026, + "balance_loss_mlp": 1.04093862, + "epoch": 0.04010220952953555, + "flos": 23475847011840.0, + "grad_norm": 3.3668209859080327, + "language_loss": 0.87076283, + "learning_rate": 3.998929838538932e-06, + "loss": 0.89404631, + "num_input_tokens_seen": 14131475, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.35083008, + "step": 667, + "time_per_iteration": 2.4739842414855957 + }, + { + "auxiliary_loss_clip": 0.01248362, + "auxiliary_loss_mlp": 0.01065808, + "balance_loss_clip": 1.07006812, + "balance_loss_mlp": 1.03526616, + "epoch": 0.04016233278220352, + "flos": 18617025254400.0, + "grad_norm": 2.6814094494867824, + "language_loss": 0.80599129, + "learning_rate": 3.998917061758087e-06, + "loss": 0.82913297, + "num_input_tokens_seen": 14146165, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.30566406, + "step": 668, + "time_per_iteration": 2.4582009315490723 + }, + { + "auxiliary_loss_clip": 0.0115084, + "auxiliary_loss_mlp": 0.01027239, + "balance_loss_clip": 1.05464768, + "balance_loss_mlp": 1.02052724, + "epoch": 0.040222456034871484, + "flos": 70906194696960.0, + "grad_norm": 0.865714891011847, + "language_loss": 0.60104632, + "learning_rate": 3.998904209178107e-06, + "loss": 0.62282705, + "num_input_tokens_seen": 14215005, + "router_z_loss_clip": 0.9609375, + "router_z_loss_mlp": 0.06726074, + "step": 669, + "time_per_iteration": 3.1695451736450195 + }, + { + "auxiliary_loss_clip": 0.01250198, + "auxiliary_loss_mlp": 0.01077794, + "balance_loss_clip": 1.06515288, + "balance_loss_mlp": 1.04718065, + "epoch": 0.040282579287539456, + "flos": 23764712186880.0, + "grad_norm": 1.6898733694055357, + "language_loss": 0.8586641, + "learning_rate": 3.9988912807994785e-06, + "loss": 0.88194394, + "num_input_tokens_seen": 14235510, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.30639648, + "step": 670, + "time_per_iteration": 3.8796815872192383 + }, + { + "auxiliary_loss_clip": 0.01252864, + "auxiliary_loss_mlp": 0.01083477, + "balance_loss_clip": 1.0702188, + "balance_loss_mlp": 1.05295968, + "epoch": 0.04034270254020743, + "flos": 18478518410880.0, + "grad_norm": 1.7459767229930945, + "language_loss": 0.75160742, + "learning_rate": 3.998878276622692e-06, + "loss": 0.77497089, + "num_input_tokens_seen": 14254565, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.30480957, + "step": 671, + "time_per_iteration": 2.4560210704803467 + }, + { + "auxiliary_loss_clip": 0.01255515, + "auxiliary_loss_mlp": 0.01080229, + "balance_loss_clip": 1.07113087, + "balance_loss_mlp": 1.04906726, + "epoch": 0.040402825792875394, + "flos": 17201858244480.0, + "grad_norm": 1.904068113155564, + "language_loss": 0.92423564, + "learning_rate": 3.998865196648242e-06, + "loss": 0.94759309, + "num_input_tokens_seen": 14271885, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.31152344, + "step": 672, + "time_per_iteration": 5.3825883865356445 + }, + { + "auxiliary_loss_clip": 0.01249485, + "auxiliary_loss_mlp": 0.01091106, + "balance_loss_clip": 1.06791377, + "balance_loss_mlp": 1.05489039, + "epoch": 0.040462949045543366, + "flos": 19172168928000.0, + "grad_norm": 2.026451875440384, + "language_loss": 0.9004215, + "learning_rate": 3.998852040876622e-06, + "loss": 0.92382741, + "num_input_tokens_seen": 14289670, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.36254883, + "step": 673, + "time_per_iteration": 2.465686798095703 + }, + { + "auxiliary_loss_clip": 0.01246675, + "auxiliary_loss_mlp": 0.01083565, + "balance_loss_clip": 1.06576407, + "balance_loss_mlp": 1.05052018, + "epoch": 0.04052307229821133, + "flos": 24019821555840.0, + "grad_norm": 2.2963549969604617, + "language_loss": 0.75063872, + "learning_rate": 3.998838809308334e-06, + "loss": 0.77394116, + "num_input_tokens_seen": 14309285, + "router_z_loss_clip": 1.80859375, + "router_z_loss_mlp": 0.33056641, + "step": 674, + "time_per_iteration": 3.905606269836426 + }, + { + "auxiliary_loss_clip": 0.01258099, + "auxiliary_loss_mlp": 0.01067426, + "balance_loss_clip": 1.06852269, + "balance_loss_mlp": 1.03528762, + "epoch": 0.0405831955508793, + "flos": 16436601964800.0, + "grad_norm": 3.8123136314886708, + "language_loss": 0.78232372, + "learning_rate": 3.9988255019438766e-06, + "loss": 0.80557895, + "num_input_tokens_seen": 14328300, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.32128906, + "step": 675, + "time_per_iteration": 2.4202826023101807 + }, + { + "auxiliary_loss_clip": 0.01250406, + "auxiliary_loss_mlp": 0.01081119, + "balance_loss_clip": 1.06654191, + "balance_loss_mlp": 1.04793131, + "epoch": 0.040643318803547275, + "flos": 24279922915200.0, + "grad_norm": 2.3469863637141937, + "language_loss": 0.76842141, + "learning_rate": 3.998812118783757e-06, + "loss": 0.7917366, + "num_input_tokens_seen": 14346395, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.33178711, + "step": 676, + "time_per_iteration": 2.5369174480438232 + }, + { + "auxiliary_loss_clip": 0.01257702, + "auxiliary_loss_mlp": 0.01081545, + "balance_loss_clip": 1.07081258, + "balance_loss_mlp": 1.04833364, + "epoch": 0.04070344205621524, + "flos": 17712076982400.0, + "grad_norm": 2.578839302165079, + "language_loss": 0.85852653, + "learning_rate": 3.9987986598284804e-06, + "loss": 0.88191903, + "num_input_tokens_seen": 14364605, + "router_z_loss_clip": 1.87011719, + "router_z_loss_mlp": 0.33227539, + "step": 677, + "time_per_iteration": 2.39884614944458 + }, + { + "auxiliary_loss_clip": 0.01245559, + "auxiliary_loss_mlp": 0.01066335, + "balance_loss_clip": 1.0666821, + "balance_loss_mlp": 1.03426743, + "epoch": 0.04076356530888321, + "flos": 26177658168960.0, + "grad_norm": 1.8983011598753088, + "language_loss": 0.76469421, + "learning_rate": 3.998785125078559e-06, + "loss": 0.78781319, + "num_input_tokens_seen": 14385265, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.32067871, + "step": 678, + "time_per_iteration": 2.4862594604492188 + }, + { + "auxiliary_loss_clip": 0.01249592, + "auxiliary_loss_mlp": 0.01077356, + "balance_loss_clip": 1.06729913, + "balance_loss_mlp": 1.04643357, + "epoch": 0.04082368856155118, + "flos": 35773455772800.0, + "grad_norm": 2.1416616850722776, + "language_loss": 0.82433152, + "learning_rate": 3.998771514534505e-06, + "loss": 0.847601, + "num_input_tokens_seen": 14406090, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.30932617, + "step": 679, + "time_per_iteration": 2.540286064147949 + }, + { + "auxiliary_loss_clip": 0.01251422, + "auxiliary_loss_mlp": 0.01063039, + "balance_loss_clip": 1.06973124, + "balance_loss_mlp": 1.03187799, + "epoch": 0.04088381181421915, + "flos": 28146640049280.0, + "grad_norm": 1.948591777379978, + "language_loss": 0.76319027, + "learning_rate": 3.998757828196835e-06, + "loss": 0.78633493, + "num_input_tokens_seen": 14425130, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.31176758, + "step": 680, + "time_per_iteration": 2.5571963787078857 + }, + { + "auxiliary_loss_clip": 0.01254941, + "auxiliary_loss_mlp": 0.01072777, + "balance_loss_clip": 1.06751537, + "balance_loss_mlp": 1.0391835, + "epoch": 0.04094393506688712, + "flos": 27597673514880.0, + "grad_norm": 2.1339040812597774, + "language_loss": 0.83348268, + "learning_rate": 3.9987440660660685e-06, + "loss": 0.85675991, + "num_input_tokens_seen": 14447355, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.3359375, + "step": 681, + "time_per_iteration": 2.488900899887085 + }, + { + "auxiliary_loss_clip": 0.0125076, + "auxiliary_loss_mlp": 0.01070625, + "balance_loss_clip": 1.06468463, + "balance_loss_mlp": 1.03654289, + "epoch": 0.04100405831955509, + "flos": 23112036109440.0, + "grad_norm": 1.8597703606700817, + "language_loss": 0.71342313, + "learning_rate": 3.998730228142726e-06, + "loss": 0.736637, + "num_input_tokens_seen": 14466790, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.34069824, + "step": 682, + "time_per_iteration": 2.448887586593628 + }, + { + "auxiliary_loss_clip": 0.01249273, + "auxiliary_loss_mlp": 0.01071466, + "balance_loss_clip": 1.06618953, + "balance_loss_mlp": 1.04080582, + "epoch": 0.04106418157222306, + "flos": 20156731695360.0, + "grad_norm": 1.7998288936301354, + "language_loss": 0.7243917, + "learning_rate": 3.998716314427333e-06, + "loss": 0.74759901, + "num_input_tokens_seen": 14485195, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.30639648, + "step": 683, + "time_per_iteration": 2.416147470474243 + }, + { + "auxiliary_loss_clip": 0.0125337, + "auxiliary_loss_mlp": 0.01083812, + "balance_loss_clip": 1.07528961, + "balance_loss_mlp": 1.05250835, + "epoch": 0.041124304824891024, + "flos": 17420697855360.0, + "grad_norm": 1.9649391522904809, + "language_loss": 0.81414187, + "learning_rate": 3.998702324920417e-06, + "loss": 0.83751369, + "num_input_tokens_seen": 14503370, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.31274414, + "step": 684, + "time_per_iteration": 2.4567532539367676 + }, + { + "auxiliary_loss_clip": 0.0124915, + "auxiliary_loss_mlp": 0.01068766, + "balance_loss_clip": 1.06688511, + "balance_loss_mlp": 1.03574526, + "epoch": 0.041184428077558996, + "flos": 25780163287680.0, + "grad_norm": 1.523276709362758, + "language_loss": 0.90759593, + "learning_rate": 3.9986882596225085e-06, + "loss": 0.93077505, + "num_input_tokens_seen": 14526415, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.33007812, + "step": 685, + "time_per_iteration": 2.4987123012542725 + }, + { + "auxiliary_loss_clip": 0.01248639, + "auxiliary_loss_mlp": 0.01069045, + "balance_loss_clip": 1.06728649, + "balance_loss_mlp": 1.03816986, + "epoch": 0.04124455133022697, + "flos": 22964766347520.0, + "grad_norm": 2.7060618779433834, + "language_loss": 0.88395905, + "learning_rate": 3.998674118534141e-06, + "loss": 0.9071359, + "num_input_tokens_seen": 14546595, + "router_z_loss_clip": 1.80957031, + "router_z_loss_mlp": 0.30908203, + "step": 686, + "time_per_iteration": 2.5450937747955322 + }, + { + "auxiliary_loss_clip": 0.01264171, + "auxiliary_loss_mlp": 0.01074133, + "balance_loss_clip": 1.07412243, + "balance_loss_mlp": 1.04232776, + "epoch": 0.04130467458289493, + "flos": 21289067015040.0, + "grad_norm": 2.1362875786760114, + "language_loss": 0.7150979, + "learning_rate": 3.998659901655851e-06, + "loss": 0.73848093, + "num_input_tokens_seen": 14566590, + "router_z_loss_clip": 1.90039062, + "router_z_loss_mlp": 0.31787109, + "step": 687, + "time_per_iteration": 2.451294183731079 + }, + { + "auxiliary_loss_clip": 0.01250665, + "auxiliary_loss_mlp": 0.0106911, + "balance_loss_clip": 1.07309461, + "balance_loss_mlp": 1.04055977, + "epoch": 0.041364797835562905, + "flos": 19974233669760.0, + "grad_norm": 4.592777633971345, + "language_loss": 0.86347699, + "learning_rate": 3.998645608988177e-06, + "loss": 0.8866747, + "num_input_tokens_seen": 14585965, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.28527832, + "step": 688, + "time_per_iteration": 2.5534677505493164 + }, + { + "auxiliary_loss_clip": 0.01247644, + "auxiliary_loss_mlp": 0.01077641, + "balance_loss_clip": 1.06909585, + "balance_loss_mlp": 1.04674232, + "epoch": 0.04142492108823087, + "flos": 21906227520000.0, + "grad_norm": 2.049435489498499, + "language_loss": 0.83454597, + "learning_rate": 3.998631240531661e-06, + "loss": 0.85779881, + "num_input_tokens_seen": 14606015, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.30908203, + "step": 689, + "time_per_iteration": 2.479696273803711 + }, + { + "auxiliary_loss_clip": 0.01251041, + "auxiliary_loss_mlp": 0.01080772, + "balance_loss_clip": 1.07013559, + "balance_loss_mlp": 1.05030179, + "epoch": 0.04148504434089884, + "flos": 27639617621760.0, + "grad_norm": 2.6637134711601673, + "language_loss": 0.67999381, + "learning_rate": 3.998616796286848e-06, + "loss": 0.70331192, + "num_input_tokens_seen": 14629955, + "router_z_loss_clip": 1.80859375, + "router_z_loss_mlp": 0.3046875, + "step": 690, + "time_per_iteration": 2.607771635055542 + }, + { + "auxiliary_loss_clip": 0.01243778, + "auxiliary_loss_mlp": 0.01075518, + "balance_loss_clip": 1.06492567, + "balance_loss_mlp": 1.04489279, + "epoch": 0.041545167593566815, + "flos": 20518387781760.0, + "grad_norm": 3.232980993096607, + "language_loss": 0.74898028, + "learning_rate": 3.998602276254286e-06, + "loss": 0.77217329, + "num_input_tokens_seen": 14648000, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.30627441, + "step": 691, + "time_per_iteration": 2.447395086288452 + }, + { + "auxiliary_loss_clip": 0.01251125, + "auxiliary_loss_mlp": 0.01088108, + "balance_loss_clip": 1.0706327, + "balance_loss_mlp": 1.05589747, + "epoch": 0.04160529084623478, + "flos": 11868907939200.0, + "grad_norm": 2.1324910866262026, + "language_loss": 0.84600759, + "learning_rate": 3.998587680434526e-06, + "loss": 0.86939991, + "num_input_tokens_seen": 14662235, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.32202148, + "step": 692, + "time_per_iteration": 2.429086446762085 + }, + { + "auxiliary_loss_clip": 0.0124592, + "auxiliary_loss_mlp": 0.01075035, + "balance_loss_clip": 1.06249022, + "balance_loss_mlp": 1.04241967, + "epoch": 0.04166541409890275, + "flos": 14828306503680.0, + "grad_norm": 2.290168046033916, + "language_loss": 0.89107478, + "learning_rate": 3.99857300882812e-06, + "loss": 0.91428435, + "num_input_tokens_seen": 14676065, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.32617188, + "step": 693, + "time_per_iteration": 2.3749289512634277 + }, + { + "auxiliary_loss_clip": 0.01253245, + "auxiliary_loss_mlp": 0.0106988, + "balance_loss_clip": 1.07179904, + "balance_loss_mlp": 1.03933907, + "epoch": 0.04172553735157072, + "flos": 25808137004160.0, + "grad_norm": 3.1650077872778906, + "language_loss": 0.82015538, + "learning_rate": 3.998558261435626e-06, + "loss": 0.84338665, + "num_input_tokens_seen": 14694955, + "router_z_loss_clip": 1.81347656, + "router_z_loss_mlp": 0.30517578, + "step": 694, + "time_per_iteration": 2.519019842147827 + }, + { + "auxiliary_loss_clip": 0.01248748, + "auxiliary_loss_mlp": 0.01083627, + "balance_loss_clip": 1.06484342, + "balance_loss_mlp": 1.04867458, + "epoch": 0.04178566060423869, + "flos": 24279815174400.0, + "grad_norm": 1.8192634506892766, + "language_loss": 0.83603597, + "learning_rate": 3.9985434382576015e-06, + "loss": 0.85935968, + "num_input_tokens_seen": 14715510, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.34936523, + "step": 695, + "time_per_iteration": 2.470911979675293 + }, + { + "auxiliary_loss_clip": 0.0124422, + "auxiliary_loss_mlp": 0.01078244, + "balance_loss_clip": 1.06422973, + "balance_loss_mlp": 1.0459621, + "epoch": 0.04184578385690666, + "flos": 18222008411520.0, + "grad_norm": 2.320740280991039, + "language_loss": 0.84493458, + "learning_rate": 3.99852853929461e-06, + "loss": 0.86815923, + "num_input_tokens_seen": 14731755, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.32275391, + "step": 696, + "time_per_iteration": 2.4743492603302 + }, + { + "auxiliary_loss_clip": 0.01241538, + "auxiliary_loss_mlp": 0.01086715, + "balance_loss_clip": 1.06237602, + "balance_loss_mlp": 1.05221558, + "epoch": 0.041905907109574626, + "flos": 22776342577920.0, + "grad_norm": 2.1991620475554283, + "language_loss": 0.93234026, + "learning_rate": 3.998513564547216e-06, + "loss": 0.95562279, + "num_input_tokens_seen": 14750810, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.34521484, + "step": 697, + "time_per_iteration": 2.5342464447021484 + }, + { + "auxiliary_loss_clip": 0.01240311, + "auxiliary_loss_mlp": 0.01071197, + "balance_loss_clip": 1.06372619, + "balance_loss_mlp": 1.04158521, + "epoch": 0.0419660303622426, + "flos": 20156947176960.0, + "grad_norm": 2.622679767112635, + "language_loss": 0.83554006, + "learning_rate": 3.998498514015987e-06, + "loss": 0.85865515, + "num_input_tokens_seen": 14768435, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.29614258, + "step": 698, + "time_per_iteration": 2.467566967010498 + }, + { + "auxiliary_loss_clip": 0.01248622, + "auxiliary_loss_mlp": 0.01096517, + "balance_loss_clip": 1.06674469, + "balance_loss_mlp": 1.06311512, + "epoch": 0.042026153614910564, + "flos": 23076376882560.0, + "grad_norm": 1.9170792308881999, + "language_loss": 0.91024941, + "learning_rate": 3.998483387701495e-06, + "loss": 0.9337008, + "num_input_tokens_seen": 14786690, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.33398438, + "step": 699, + "time_per_iteration": 2.4276280403137207 + }, + { + "auxiliary_loss_clip": 0.01151159, + "auxiliary_loss_mlp": 0.01032899, + "balance_loss_clip": 1.05691576, + "balance_loss_mlp": 1.02741563, + "epoch": 0.042086276867578536, + "flos": 64495243370880.0, + "grad_norm": 1.031071609830736, + "language_loss": 0.67910564, + "learning_rate": 3.998468185604312e-06, + "loss": 0.70094621, + "num_input_tokens_seen": 14853840, + "router_z_loss_clip": 0.94433594, + "router_z_loss_mlp": 0.05487061, + "step": 700, + "time_per_iteration": 3.1528613567352295 + }, + { + "auxiliary_loss_clip": 0.01254615, + "auxiliary_loss_mlp": 0.01083754, + "balance_loss_clip": 1.07042074, + "balance_loss_mlp": 1.05123377, + "epoch": 0.04214640012024651, + "flos": 15487016065920.0, + "grad_norm": 2.663659630093051, + "language_loss": 0.88910913, + "learning_rate": 3.998452907725016e-06, + "loss": 0.91249287, + "num_input_tokens_seen": 14869580, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.32543945, + "step": 701, + "time_per_iteration": 2.476468563079834 + }, + { + "auxiliary_loss_clip": 0.01248423, + "auxiliary_loss_mlp": 0.01076615, + "balance_loss_clip": 1.06931865, + "balance_loss_mlp": 1.04533458, + "epoch": 0.04220652337291447, + "flos": 23877040993920.0, + "grad_norm": 1.7700667787361877, + "language_loss": 0.67388451, + "learning_rate": 3.998437554064184e-06, + "loss": 0.69713485, + "num_input_tokens_seen": 14891065, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.31274414, + "step": 702, + "time_per_iteration": 2.467581272125244 + }, + { + "auxiliary_loss_clip": 0.01148985, + "auxiliary_loss_mlp": 0.01015457, + "balance_loss_clip": 1.05441833, + "balance_loss_mlp": 1.0099076, + "epoch": 0.042266646625582445, + "flos": 63795451628160.0, + "grad_norm": 0.8423329887360171, + "language_loss": 0.60804021, + "learning_rate": 3.9984221246224006e-06, + "loss": 0.62968457, + "num_input_tokens_seen": 14954815, + "router_z_loss_clip": 0.94726562, + "router_z_loss_mlp": 0.05548096, + "step": 703, + "time_per_iteration": 3.153231382369995 + }, + { + "auxiliary_loss_clip": 0.01134922, + "auxiliary_loss_mlp": 0.01008549, + "balance_loss_clip": 1.04423499, + "balance_loss_mlp": 1.00339282, + "epoch": 0.04232676987825041, + "flos": 50018863345920.0, + "grad_norm": 1.0260876978512428, + "language_loss": 0.57715732, + "learning_rate": 3.9984066194002494e-06, + "loss": 0.59859204, + "num_input_tokens_seen": 15003050, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.05157471, + "step": 704, + "time_per_iteration": 3.025033950805664 + }, + { + "auxiliary_loss_clip": 0.01248922, + "auxiliary_loss_mlp": 0.01067154, + "balance_loss_clip": 1.06867826, + "balance_loss_mlp": 1.03594494, + "epoch": 0.04238689313091838, + "flos": 21616105368960.0, + "grad_norm": 3.259480884587865, + "language_loss": 0.87577718, + "learning_rate": 3.998391038398319e-06, + "loss": 0.89893794, + "num_input_tokens_seen": 15021990, + "router_z_loss_clip": 1.80175781, + "router_z_loss_mlp": 0.31201172, + "step": 705, + "time_per_iteration": 2.525137186050415 + }, + { + "auxiliary_loss_clip": 0.01232209, + "auxiliary_loss_mlp": 0.01073208, + "balance_loss_clip": 1.05979323, + "balance_loss_mlp": 1.04485989, + "epoch": 0.042447016383586354, + "flos": 19135109070720.0, + "grad_norm": 2.8159153741881613, + "language_loss": 0.71433198, + "learning_rate": 3.998375381617201e-06, + "loss": 0.73738617, + "num_input_tokens_seen": 15040700, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.28356934, + "step": 706, + "time_per_iteration": 2.41306734085083 + }, + { + "auxiliary_loss_clip": 0.01236709, + "auxiliary_loss_mlp": 0.01072338, + "balance_loss_clip": 1.06211114, + "balance_loss_mlp": 1.04163003, + "epoch": 0.04250713963625432, + "flos": 24426007528320.0, + "grad_norm": 2.1576398921676314, + "language_loss": 0.93703514, + "learning_rate": 3.9983596490574875e-06, + "loss": 0.96012563, + "num_input_tokens_seen": 15056725, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.30712891, + "step": 707, + "time_per_iteration": 2.4847724437713623 + }, + { + "auxiliary_loss_clip": 0.0124483, + "auxiliary_loss_mlp": 0.01078001, + "balance_loss_clip": 1.06329596, + "balance_loss_mlp": 1.04726875, + "epoch": 0.04256726288892229, + "flos": 30367391333760.0, + "grad_norm": 3.0107596028866106, + "language_loss": 0.81177199, + "learning_rate": 3.998343840719776e-06, + "loss": 0.83500028, + "num_input_tokens_seen": 15077550, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.30737305, + "step": 708, + "time_per_iteration": 2.5057015419006348 + }, + { + "auxiliary_loss_clip": 0.0124524, + "auxiliary_loss_mlp": 0.01085049, + "balance_loss_clip": 1.06433952, + "balance_loss_mlp": 1.05393577, + "epoch": 0.04262738614159026, + "flos": 16362661818240.0, + "grad_norm": 2.7081723908373876, + "language_loss": 0.82465625, + "learning_rate": 3.998327956604666e-06, + "loss": 0.8479591, + "num_input_tokens_seen": 15094955, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.31103516, + "step": 709, + "time_per_iteration": 2.440244436264038 + }, + { + "auxiliary_loss_clip": 0.01248865, + "auxiliary_loss_mlp": 0.01079885, + "balance_loss_clip": 1.06676936, + "balance_loss_mlp": 1.04977286, + "epoch": 0.04268750939425823, + "flos": 20412379768320.0, + "grad_norm": 3.105715463266087, + "language_loss": 0.85224974, + "learning_rate": 3.99831199671276e-06, + "loss": 0.87553728, + "num_input_tokens_seen": 15113395, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.30102539, + "step": 710, + "time_per_iteration": 2.4116179943084717 + }, + { + "auxiliary_loss_clip": 0.01251277, + "auxiliary_loss_mlp": 0.01081835, + "balance_loss_clip": 1.07290649, + "balance_loss_mlp": 1.05255747, + "epoch": 0.0427476326469262, + "flos": 20302959962880.0, + "grad_norm": 2.2078178212965813, + "language_loss": 0.84578454, + "learning_rate": 3.998295961044662e-06, + "loss": 0.86911571, + "num_input_tokens_seen": 15132920, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.29272461, + "step": 711, + "time_per_iteration": 2.4374821186065674 + }, + { + "auxiliary_loss_clip": 0.01238698, + "auxiliary_loss_mlp": 0.01078792, + "balance_loss_clip": 1.06152952, + "balance_loss_mlp": 1.04754734, + "epoch": 0.042807755899594166, + "flos": 21650794928640.0, + "grad_norm": 1.5384971740951536, + "language_loss": 0.85347641, + "learning_rate": 3.9982798496009804e-06, + "loss": 0.87665129, + "num_input_tokens_seen": 15153115, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.31237793, + "step": 712, + "time_per_iteration": 2.453376293182373 + }, + { + "auxiliary_loss_clip": 0.01243651, + "auxiliary_loss_mlp": 0.01078516, + "balance_loss_clip": 1.06211877, + "balance_loss_mlp": 1.05022812, + "epoch": 0.04286787915226214, + "flos": 21435007973760.0, + "grad_norm": 2.8753900534024965, + "language_loss": 0.90869987, + "learning_rate": 3.998263662382328e-06, + "loss": 0.9319216, + "num_input_tokens_seen": 15172770, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.28283691, + "step": 713, + "time_per_iteration": 2.4869534969329834 + }, + { + "auxiliary_loss_clip": 0.01132567, + "auxiliary_loss_mlp": 0.01049634, + "balance_loss_clip": 1.04087353, + "balance_loss_mlp": 1.04181385, + "epoch": 0.04292800240493011, + "flos": 66397970615040.0, + "grad_norm": 0.8827299983872205, + "language_loss": 0.63765383, + "learning_rate": 3.9982473993893165e-06, + "loss": 0.65947586, + "num_input_tokens_seen": 15240055, + "router_z_loss_clip": 0.91796875, + "router_z_loss_mlp": 0.07824707, + "step": 714, + "time_per_iteration": 4.492361783981323 + }, + { + "auxiliary_loss_clip": 0.01240153, + "auxiliary_loss_mlp": 0.01092527, + "balance_loss_clip": 1.06650138, + "balance_loss_mlp": 1.06272411, + "epoch": 0.042988125657598075, + "flos": 31650264552960.0, + "grad_norm": 1.8946812925566545, + "language_loss": 0.74916869, + "learning_rate": 3.998231060622563e-06, + "loss": 0.77249545, + "num_input_tokens_seen": 15261585, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.29785156, + "step": 715, + "time_per_iteration": 2.539600372314453 + }, + { + "auxiliary_loss_clip": 0.0124084, + "auxiliary_loss_mlp": 0.01088814, + "balance_loss_clip": 1.06423163, + "balance_loss_mlp": 1.05519676, + "epoch": 0.04304824891026605, + "flos": 33248468292480.0, + "grad_norm": 2.2670105007955894, + "language_loss": 0.72584844, + "learning_rate": 3.998214646082688e-06, + "loss": 0.74914491, + "num_input_tokens_seen": 15281160, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.33642578, + "step": 716, + "time_per_iteration": 2.5228872299194336 + }, + { + "auxiliary_loss_clip": 0.01131299, + "auxiliary_loss_mlp": 0.0100959, + "balance_loss_clip": 1.04095781, + "balance_loss_mlp": 1.00387418, + "epoch": 0.04310837216293401, + "flos": 64064782782720.0, + "grad_norm": 0.9375005829717354, + "language_loss": 0.65530133, + "learning_rate": 3.998198155770314e-06, + "loss": 0.67671025, + "num_input_tokens_seen": 15344505, + "router_z_loss_clip": 0.90429688, + "router_z_loss_mlp": 0.05712891, + "step": 717, + "time_per_iteration": 5.902749300003052 + }, + { + "auxiliary_loss_clip": 0.0112904, + "auxiliary_loss_mlp": 0.01016839, + "balance_loss_clip": 1.0402869, + "balance_loss_mlp": 1.00879228, + "epoch": 0.043168495415601985, + "flos": 61343757849600.0, + "grad_norm": 0.986781605316482, + "language_loss": 0.58798552, + "learning_rate": 3.998181589686065e-06, + "loss": 0.60944426, + "num_input_tokens_seen": 15404050, + "router_z_loss_clip": 0.88867188, + "router_z_loss_mlp": 0.08056641, + "step": 718, + "time_per_iteration": 4.337741851806641 + }, + { + "auxiliary_loss_clip": 0.01241812, + "auxiliary_loss_mlp": 0.01076476, + "balance_loss_clip": 1.06740451, + "balance_loss_mlp": 1.0448854, + "epoch": 0.04322861866826996, + "flos": 20704261685760.0, + "grad_norm": 2.1846202151589127, + "language_loss": 0.91395855, + "learning_rate": 3.99816494783057e-06, + "loss": 0.93714142, + "num_input_tokens_seen": 15424190, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.31616211, + "step": 719, + "time_per_iteration": 2.478377342224121 + }, + { + "auxiliary_loss_clip": 0.01237812, + "auxiliary_loss_mlp": 0.01068511, + "balance_loss_clip": 1.06138551, + "balance_loss_mlp": 1.0408783, + "epoch": 0.04328874192093792, + "flos": 30373352991360.0, + "grad_norm": 1.8061540250049217, + "language_loss": 0.66919649, + "learning_rate": 3.99814823020446e-06, + "loss": 0.69225967, + "num_input_tokens_seen": 15446500, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.27636719, + "step": 720, + "time_per_iteration": 2.5372490882873535 + }, + { + "auxiliary_loss_clip": 0.01238023, + "auxiliary_loss_mlp": 0.01064283, + "balance_loss_clip": 1.06452537, + "balance_loss_mlp": 1.03505278, + "epoch": 0.043348865173605894, + "flos": 21944795748480.0, + "grad_norm": 2.162069028879316, + "language_loss": 0.77876306, + "learning_rate": 3.9981314368083684e-06, + "loss": 0.80178607, + "num_input_tokens_seen": 15465830, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.29248047, + "step": 721, + "time_per_iteration": 2.4512243270874023 + }, + { + "auxiliary_loss_clip": 0.01241741, + "auxiliary_loss_mlp": 0.0108359, + "balance_loss_clip": 1.06791592, + "balance_loss_mlp": 1.05513501, + "epoch": 0.04340898842627386, + "flos": 15264225959040.0, + "grad_norm": 2.8033839699740235, + "language_loss": 0.88624215, + "learning_rate": 3.998114567642933e-06, + "loss": 0.90949547, + "num_input_tokens_seen": 15479985, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.28479004, + "step": 722, + "time_per_iteration": 2.4280943870544434 + }, + { + "auxiliary_loss_clip": 0.01246309, + "auxiliary_loss_mlp": 0.01100264, + "balance_loss_clip": 1.06717873, + "balance_loss_mlp": 1.06829166, + "epoch": 0.04346911167894183, + "flos": 27965434913280.0, + "grad_norm": 2.033595675667646, + "language_loss": 0.84553039, + "learning_rate": 3.998097622708792e-06, + "loss": 0.86899602, + "num_input_tokens_seen": 15501545, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.32006836, + "step": 723, + "time_per_iteration": 2.496877908706665 + }, + { + "auxiliary_loss_clip": 0.01247572, + "auxiliary_loss_mlp": 0.01070562, + "balance_loss_clip": 1.06804347, + "balance_loss_mlp": 1.04152215, + "epoch": 0.0435292349316098, + "flos": 29242202820480.0, + "grad_norm": 2.116867772948139, + "language_loss": 0.82907259, + "learning_rate": 3.99808060200659e-06, + "loss": 0.85225391, + "num_input_tokens_seen": 15521725, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.29052734, + "step": 724, + "time_per_iteration": 2.5116488933563232 + }, + { + "auxiliary_loss_clip": 0.01248454, + "auxiliary_loss_mlp": 0.0107455, + "balance_loss_clip": 1.07124758, + "balance_loss_mlp": 1.04565382, + "epoch": 0.04358935818427777, + "flos": 20558356640640.0, + "grad_norm": 2.130861198617175, + "language_loss": 0.79441947, + "learning_rate": 3.998063505536971e-06, + "loss": 0.81764948, + "num_input_tokens_seen": 15540910, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.28881836, + "step": 725, + "time_per_iteration": 2.4420292377471924 + }, + { + "auxiliary_loss_clip": 0.01248361, + "auxiliary_loss_mlp": 0.01068053, + "balance_loss_clip": 1.06648779, + "balance_loss_mlp": 1.03806007, + "epoch": 0.04364948143694574, + "flos": 14464926564480.0, + "grad_norm": 2.9300442116961967, + "language_loss": 0.86632693, + "learning_rate": 3.998046333300584e-06, + "loss": 0.88949108, + "num_input_tokens_seen": 15558640, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.29980469, + "step": 726, + "time_per_iteration": 2.3963475227355957 + }, + { + "auxiliary_loss_clip": 0.01150663, + "auxiliary_loss_mlp": 0.01086335, + "balance_loss_clip": 1.06719267, + "balance_loss_mlp": 1.07932556, + "epoch": 0.043709604689613706, + "flos": 50067268922880.0, + "grad_norm": 0.945045530940515, + "language_loss": 0.55929422, + "learning_rate": 3.998029085298079e-06, + "loss": 0.58166426, + "num_input_tokens_seen": 15612975, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.07019043, + "step": 727, + "time_per_iteration": 3.1925292015075684 + }, + { + "auxiliary_loss_clip": 0.01242208, + "auxiliary_loss_mlp": 0.01067644, + "balance_loss_clip": 1.06521511, + "balance_loss_mlp": 1.03757977, + "epoch": 0.04376972794228168, + "flos": 13991588115840.0, + "grad_norm": 2.112197674252599, + "language_loss": 0.82199466, + "learning_rate": 3.998011761530112e-06, + "loss": 0.84509325, + "num_input_tokens_seen": 15631070, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.30029297, + "step": 728, + "time_per_iteration": 2.4605374336242676 + }, + { + "auxiliary_loss_clip": 0.01234088, + "auxiliary_loss_mlp": 0.01064933, + "balance_loss_clip": 1.06138802, + "balance_loss_mlp": 1.03675175, + "epoch": 0.04382985119494965, + "flos": 22009901149440.0, + "grad_norm": 2.393013270880629, + "language_loss": 0.76853681, + "learning_rate": 3.997994361997338e-06, + "loss": 0.79152703, + "num_input_tokens_seen": 15647825, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.28198242, + "step": 729, + "time_per_iteration": 2.4666595458984375 + }, + { + "auxiliary_loss_clip": 0.01241482, + "auxiliary_loss_mlp": 0.01071601, + "balance_loss_clip": 1.06165195, + "balance_loss_mlp": 1.04244232, + "epoch": 0.043889974447617615, + "flos": 24206521472640.0, + "grad_norm": 2.6286143292757345, + "language_loss": 0.95153368, + "learning_rate": 3.997976886700417e-06, + "loss": 0.97466451, + "num_input_tokens_seen": 15668260, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.29150391, + "step": 730, + "time_per_iteration": 2.537147045135498 + }, + { + "auxiliary_loss_clip": 0.01239594, + "auxiliary_loss_mlp": 0.01059664, + "balance_loss_clip": 1.06143594, + "balance_loss_mlp": 1.02924144, + "epoch": 0.04395009770028559, + "flos": 17274541415040.0, + "grad_norm": 2.6047901242024945, + "language_loss": 0.88361466, + "learning_rate": 3.997959335640013e-06, + "loss": 0.90660727, + "num_input_tokens_seen": 15685630, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.30419922, + "step": 731, + "time_per_iteration": 2.4215660095214844 + }, + { + "auxiliary_loss_clip": 0.01240794, + "auxiliary_loss_mlp": 0.01067827, + "balance_loss_clip": 1.06441808, + "balance_loss_mlp": 1.04092097, + "epoch": 0.04401022095295355, + "flos": 12310286261760.0, + "grad_norm": 3.144281556133188, + "language_loss": 0.88459015, + "learning_rate": 3.997941708816791e-06, + "loss": 0.9076764, + "num_input_tokens_seen": 15698645, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.2689209, + "step": 732, + "time_per_iteration": 2.4137370586395264 + }, + { + "auxiliary_loss_clip": 0.01240929, + "auxiliary_loss_mlp": 0.01070535, + "balance_loss_clip": 1.06377244, + "balance_loss_mlp": 1.04131687, + "epoch": 0.044070344205621524, + "flos": 20959658363520.0, + "grad_norm": 2.6879915825435603, + "language_loss": 0.86312538, + "learning_rate": 3.997924006231419e-06, + "loss": 0.88624007, + "num_input_tokens_seen": 15716775, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.29187012, + "step": 733, + "time_per_iteration": 2.4504477977752686 + }, + { + "auxiliary_loss_clip": 0.01244352, + "auxiliary_loss_mlp": 0.01073562, + "balance_loss_clip": 1.06525981, + "balance_loss_mlp": 1.04268646, + "epoch": 0.044130467458289496, + "flos": 13845288021120.0, + "grad_norm": 2.4046230197371874, + "language_loss": 0.91383725, + "learning_rate": 3.9979062278845685e-06, + "loss": 0.93701643, + "num_input_tokens_seen": 15733320, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.30859375, + "step": 734, + "time_per_iteration": 2.511152982711792 + }, + { + "auxiliary_loss_clip": 0.01239074, + "auxiliary_loss_mlp": 0.01058333, + "balance_loss_clip": 1.06657934, + "balance_loss_mlp": 1.03115344, + "epoch": 0.04419059071095746, + "flos": 28655063107200.0, + "grad_norm": 2.538003361656597, + "language_loss": 0.77997816, + "learning_rate": 3.9978883737769125e-06, + "loss": 0.80295223, + "num_input_tokens_seen": 15752705, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.27197266, + "step": 735, + "time_per_iteration": 2.5184009075164795 + }, + { + "auxiliary_loss_clip": 0.01233447, + "auxiliary_loss_mlp": 0.01058246, + "balance_loss_clip": 1.0615766, + "balance_loss_mlp": 1.03092337, + "epoch": 0.04425071396362543, + "flos": 28183304856960.0, + "grad_norm": 2.2878984385879026, + "language_loss": 0.88565576, + "learning_rate": 3.9978704439091305e-06, + "loss": 0.90857267, + "num_input_tokens_seen": 15772800, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.27331543, + "step": 736, + "time_per_iteration": 2.512180805206299 + }, + { + "auxiliary_loss_clip": 0.01240137, + "auxiliary_loss_mlp": 0.01080446, + "balance_loss_clip": 1.07017279, + "balance_loss_mlp": 1.05255079, + "epoch": 0.0443108372162934, + "flos": 23658452778240.0, + "grad_norm": 1.9334087681519432, + "language_loss": 0.84479225, + "learning_rate": 3.997852438281901e-06, + "loss": 0.86799812, + "num_input_tokens_seen": 15793665, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.27929688, + "step": 737, + "time_per_iteration": 2.4595460891723633 + }, + { + "auxiliary_loss_clip": 0.01248553, + "auxiliary_loss_mlp": 0.01067756, + "balance_loss_clip": 1.07224214, + "balance_loss_mlp": 1.03649926, + "epoch": 0.04437096046896137, + "flos": 33979861025280.0, + "grad_norm": 2.1338389847428947, + "language_loss": 0.85179317, + "learning_rate": 3.997834356895906e-06, + "loss": 0.87495625, + "num_input_tokens_seen": 15813175, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.3125, + "step": 738, + "time_per_iteration": 2.5823521614074707 + }, + { + "auxiliary_loss_clip": 0.01134196, + "auxiliary_loss_mlp": 0.01034308, + "balance_loss_clip": 1.04799128, + "balance_loss_mlp": 1.02900946, + "epoch": 0.04443108372162934, + "flos": 67397506375680.0, + "grad_norm": 0.8768034711062507, + "language_loss": 0.59227622, + "learning_rate": 3.9978161997518324e-06, + "loss": 0.61396122, + "num_input_tokens_seen": 15872050, + "router_z_loss_clip": 0.86132812, + "router_z_loss_mlp": 0.05303955, + "step": 739, + "time_per_iteration": 3.0308749675750732 + }, + { + "auxiliary_loss_clip": 0.01244355, + "auxiliary_loss_mlp": 0.01072846, + "balance_loss_clip": 1.07079887, + "balance_loss_mlp": 1.04459321, + "epoch": 0.04449120697429731, + "flos": 29752672953600.0, + "grad_norm": 2.0962492756292224, + "language_loss": 0.91345811, + "learning_rate": 3.997797966850369e-06, + "loss": 0.93663013, + "num_input_tokens_seen": 15891085, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.28271484, + "step": 740, + "time_per_iteration": 2.520900011062622 + }, + { + "auxiliary_loss_clip": 0.01255954, + "auxiliary_loss_mlp": 0.0107187, + "balance_loss_clip": 1.0768044, + "balance_loss_mlp": 1.04361773, + "epoch": 0.04455133022696528, + "flos": 36502119072000.0, + "grad_norm": 2.0273795941938784, + "language_loss": 0.71791422, + "learning_rate": 3.997779658192205e-06, + "loss": 0.74119246, + "num_input_tokens_seen": 15914225, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.2824707, + "step": 741, + "time_per_iteration": 2.5524494647979736 + }, + { + "auxiliary_loss_clip": 0.01238654, + "auxiliary_loss_mlp": 0.01076652, + "balance_loss_clip": 1.06681538, + "balance_loss_mlp": 1.04931712, + "epoch": 0.044611453479633245, + "flos": 28803661672320.0, + "grad_norm": 2.3595192832702505, + "language_loss": 0.88852251, + "learning_rate": 3.997761273778037e-06, + "loss": 0.91167557, + "num_input_tokens_seen": 15934540, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.27307129, + "step": 742, + "time_per_iteration": 2.4873435497283936 + }, + { + "auxiliary_loss_clip": 0.01235253, + "auxiliary_loss_mlp": 0.01058797, + "balance_loss_clip": 1.06301117, + "balance_loss_mlp": 1.02925706, + "epoch": 0.04467157673230122, + "flos": 20010970304640.0, + "grad_norm": 2.297667140068227, + "language_loss": 0.84221202, + "learning_rate": 3.997742813608561e-06, + "loss": 0.86515254, + "num_input_tokens_seen": 15952560, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.29516602, + "step": 743, + "time_per_iteration": 2.43430233001709 + }, + { + "auxiliary_loss_clip": 0.01240651, + "auxiliary_loss_mlp": 0.01066384, + "balance_loss_clip": 1.06478572, + "balance_loss_mlp": 1.03825104, + "epoch": 0.04473169998496919, + "flos": 18004964480640.0, + "grad_norm": 1.9543964577825628, + "language_loss": 0.80172729, + "learning_rate": 3.997724277684479e-06, + "loss": 0.82479763, + "num_input_tokens_seen": 15970620, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.28149414, + "step": 744, + "time_per_iteration": 2.46313738822937 + }, + { + "auxiliary_loss_clip": 0.01236003, + "auxiliary_loss_mlp": 0.01065853, + "balance_loss_clip": 1.06555164, + "balance_loss_mlp": 1.03798223, + "epoch": 0.044791823237637154, + "flos": 20631722169600.0, + "grad_norm": 2.180486316773699, + "language_loss": 0.85538948, + "learning_rate": 3.99770566600649e-06, + "loss": 0.87840801, + "num_input_tokens_seen": 15987325, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.27868652, + "step": 745, + "time_per_iteration": 2.4246702194213867 + }, + { + "auxiliary_loss_clip": 0.01232191, + "auxiliary_loss_mlp": 0.01058221, + "balance_loss_clip": 1.06195831, + "balance_loss_mlp": 1.02984905, + "epoch": 0.04485194649030513, + "flos": 31176171918720.0, + "grad_norm": 3.5168266805503126, + "language_loss": 0.69031453, + "learning_rate": 3.997686978575302e-06, + "loss": 0.71321857, + "num_input_tokens_seen": 16008310, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.28393555, + "step": 746, + "time_per_iteration": 2.5786759853363037 + }, + { + "auxiliary_loss_clip": 0.01255098, + "auxiliary_loss_mlp": 0.01076387, + "balance_loss_clip": 1.07857752, + "balance_loss_mlp": 1.04622698, + "epoch": 0.04491206974297309, + "flos": 26143291831680.0, + "grad_norm": 2.0766638144894025, + "language_loss": 0.68694937, + "learning_rate": 3.997668215391625e-06, + "loss": 0.71026421, + "num_input_tokens_seen": 16029620, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.3013916, + "step": 747, + "time_per_iteration": 2.488770008087158 + }, + { + "auxiliary_loss_clip": 0.01240405, + "auxiliary_loss_mlp": 0.01076286, + "balance_loss_clip": 1.0652138, + "balance_loss_mlp": 1.04741359, + "epoch": 0.044972192995641064, + "flos": 20667668705280.0, + "grad_norm": 1.8146761392729516, + "language_loss": 0.6670801, + "learning_rate": 3.997649376456168e-06, + "loss": 0.69024694, + "num_input_tokens_seen": 16049065, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.28869629, + "step": 748, + "time_per_iteration": 2.4327356815338135 + }, + { + "auxiliary_loss_clip": 0.01254045, + "auxiliary_loss_mlp": 0.01083146, + "balance_loss_clip": 1.07853746, + "balance_loss_mlp": 1.05401123, + "epoch": 0.045032316248309036, + "flos": 16106834177280.0, + "grad_norm": 3.097412185856536, + "language_loss": 0.76886153, + "learning_rate": 3.997630461769647e-06, + "loss": 0.79223335, + "num_input_tokens_seen": 16066765, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.2911377, + "step": 749, + "time_per_iteration": 2.411919593811035 + }, + { + "auxiliary_loss_clip": 0.01246431, + "auxiliary_loss_mlp": 0.01076308, + "balance_loss_clip": 1.06946492, + "balance_loss_mlp": 1.04798365, + "epoch": 0.045092439500977, + "flos": 17858843953920.0, + "grad_norm": 2.4864887837091256, + "language_loss": 0.88879478, + "learning_rate": 3.997611471332778e-06, + "loss": 0.91202223, + "num_input_tokens_seen": 16085980, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.2833252, + "step": 750, + "time_per_iteration": 2.4592273235321045 + }, + { + "auxiliary_loss_clip": 0.01242308, + "auxiliary_loss_mlp": 0.01072428, + "balance_loss_clip": 1.06581664, + "balance_loss_mlp": 1.04045558, + "epoch": 0.04515256275364497, + "flos": 24462815990400.0, + "grad_norm": 2.0537096944966797, + "language_loss": 0.74497032, + "learning_rate": 3.9975924051462825e-06, + "loss": 0.76811767, + "num_input_tokens_seen": 16106260, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.31982422, + "step": 751, + "time_per_iteration": 2.4716179370880127 + }, + { + "auxiliary_loss_clip": 0.01247444, + "auxiliary_loss_mlp": 0.0106998, + "balance_loss_clip": 1.07300949, + "balance_loss_mlp": 1.04228735, + "epoch": 0.04521268600631294, + "flos": 20916385453440.0, + "grad_norm": 2.7296101810390176, + "language_loss": 0.69471598, + "learning_rate": 3.997573263210883e-06, + "loss": 0.7178902, + "num_input_tokens_seen": 16123475, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.27709961, + "step": 752, + "time_per_iteration": 2.493021011352539 + }, + { + "auxiliary_loss_clip": 0.0123698, + "auxiliary_loss_mlp": 0.01064704, + "balance_loss_clip": 1.06401014, + "balance_loss_mlp": 1.03678513, + "epoch": 0.04527280925898091, + "flos": 13371374954880.0, + "grad_norm": 3.1493095504848156, + "language_loss": 0.92253309, + "learning_rate": 3.997554045527305e-06, + "loss": 0.94554985, + "num_input_tokens_seen": 16138335, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.27929688, + "step": 753, + "time_per_iteration": 2.541750431060791 + }, + { + "auxiliary_loss_clip": 0.01242945, + "auxiliary_loss_mlp": 0.01087704, + "balance_loss_clip": 1.0680275, + "balance_loss_mlp": 1.05883145, + "epoch": 0.04533293251164888, + "flos": 23254565276160.0, + "grad_norm": 3.148222280496891, + "language_loss": 0.91382384, + "learning_rate": 3.997534752096277e-06, + "loss": 0.93713033, + "num_input_tokens_seen": 16157110, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.2890625, + "step": 754, + "time_per_iteration": 2.5794520378112793 + }, + { + "auxiliary_loss_clip": 0.01229935, + "auxiliary_loss_mlp": 0.01074538, + "balance_loss_clip": 1.06317091, + "balance_loss_mlp": 1.04344797, + "epoch": 0.04539305576431685, + "flos": 12422004537600.0, + "grad_norm": 2.2970203346131672, + "language_loss": 0.78436017, + "learning_rate": 3.997515382918531e-06, + "loss": 0.80740494, + "num_input_tokens_seen": 16174155, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.31054688, + "step": 755, + "time_per_iteration": 2.4052734375 + }, + { + "auxiliary_loss_clip": 0.01245809, + "auxiliary_loss_mlp": 0.01085733, + "balance_loss_clip": 1.06864786, + "balance_loss_mlp": 1.05602598, + "epoch": 0.04545317901698482, + "flos": 16070995382400.0, + "grad_norm": 2.221857398886891, + "language_loss": 0.78717124, + "learning_rate": 3.9974959379948015e-06, + "loss": 0.81048667, + "num_input_tokens_seen": 16192240, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.29711914, + "step": 756, + "time_per_iteration": 2.4494922161102295 + }, + { + "auxiliary_loss_clip": 0.01126644, + "auxiliary_loss_mlp": 0.01037193, + "balance_loss_clip": 1.04170632, + "balance_loss_mlp": 1.03196013, + "epoch": 0.045513302269652785, + "flos": 66396139021440.0, + "grad_norm": 0.820474573217874, + "language_loss": 0.62764168, + "learning_rate": 3.997476417325827e-06, + "loss": 0.64928007, + "num_input_tokens_seen": 16255775, + "router_z_loss_clip": 0.84960938, + "router_z_loss_mlp": 0.05236816, + "step": 757, + "time_per_iteration": 3.1016604900360107 + }, + { + "auxiliary_loss_clip": 0.01242925, + "auxiliary_loss_mlp": 0.01074918, + "balance_loss_clip": 1.0680511, + "balance_loss_mlp": 1.04633176, + "epoch": 0.04557342552232076, + "flos": 21471169991040.0, + "grad_norm": 1.526829022165853, + "language_loss": 0.84330744, + "learning_rate": 3.997456820912346e-06, + "loss": 0.86648583, + "num_input_tokens_seen": 16277015, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.28588867, + "step": 758, + "time_per_iteration": 3.8515853881835938 + }, + { + "auxiliary_loss_clip": 0.01234006, + "auxiliary_loss_mlp": 0.01067234, + "balance_loss_clip": 1.06144452, + "balance_loss_mlp": 1.03904068, + "epoch": 0.04563354877498873, + "flos": 23732680233600.0, + "grad_norm": 2.1151719869443917, + "language_loss": 0.8822161, + "learning_rate": 3.997437148755101e-06, + "loss": 0.9052285, + "num_input_tokens_seen": 16296005, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.28186035, + "step": 759, + "time_per_iteration": 2.444981813430786 + }, + { + "auxiliary_loss_clip": 0.01241955, + "auxiliary_loss_mlp": 0.01078047, + "balance_loss_clip": 1.06805277, + "balance_loss_mlp": 1.04810143, + "epoch": 0.045693672027656694, + "flos": 25735741142400.0, + "grad_norm": 1.9687708252231795, + "language_loss": 0.73808908, + "learning_rate": 3.9974174008548405e-06, + "loss": 0.76128912, + "num_input_tokens_seen": 16315300, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.29943848, + "step": 760, + "time_per_iteration": 5.406291246414185 + }, + { + "auxiliary_loss_clip": 0.0124591, + "auxiliary_loss_mlp": 0.01081737, + "balance_loss_clip": 1.07241988, + "balance_loss_mlp": 1.05361557, + "epoch": 0.045753795280324666, + "flos": 19719016560000.0, + "grad_norm": 2.669103260771828, + "language_loss": 0.82367349, + "learning_rate": 3.9973975772123105e-06, + "loss": 0.84694993, + "num_input_tokens_seen": 16333820, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.28137207, + "step": 761, + "time_per_iteration": 2.4736037254333496 + }, + { + "auxiliary_loss_clip": 0.01235525, + "auxiliary_loss_mlp": 0.01075452, + "balance_loss_clip": 1.06562829, + "balance_loss_mlp": 1.0463171, + "epoch": 0.04581391853299264, + "flos": 23255786338560.0, + "grad_norm": 1.7195562447645407, + "language_loss": 0.79567969, + "learning_rate": 3.997377677828266e-06, + "loss": 0.81878942, + "num_input_tokens_seen": 16355290, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.29174805, + "step": 762, + "time_per_iteration": 3.8688502311706543 + }, + { + "auxiliary_loss_clip": 0.01120666, + "auxiliary_loss_mlp": 0.01027557, + "balance_loss_clip": 1.03487062, + "balance_loss_mlp": 1.02225828, + "epoch": 0.0458740417856606, + "flos": 64231155601920.0, + "grad_norm": 1.024938244411869, + "language_loss": 0.58690214, + "learning_rate": 3.9973577027034585e-06, + "loss": 0.60838443, + "num_input_tokens_seen": 16415995, + "router_z_loss_clip": 0.85839844, + "router_z_loss_mlp": 0.05303955, + "step": 763, + "time_per_iteration": 3.109809160232544 + }, + { + "auxiliary_loss_clip": 0.01243506, + "auxiliary_loss_mlp": 0.01074864, + "balance_loss_clip": 1.06747174, + "balance_loss_mlp": 1.04549074, + "epoch": 0.045934165038328575, + "flos": 20770121272320.0, + "grad_norm": 2.6171438752639986, + "language_loss": 0.87576485, + "learning_rate": 3.9973376518386475e-06, + "loss": 0.89894843, + "num_input_tokens_seen": 16433120, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.29394531, + "step": 764, + "time_per_iteration": 2.4303460121154785 + }, + { + "auxiliary_loss_clip": 0.01242605, + "auxiliary_loss_mlp": 0.01079653, + "balance_loss_clip": 1.06757903, + "balance_loss_mlp": 1.0500654, + "epoch": 0.04599428829099654, + "flos": 30262891691520.0, + "grad_norm": 1.9053480773698215, + "language_loss": 0.85606396, + "learning_rate": 3.997317525234592e-06, + "loss": 0.87928659, + "num_input_tokens_seen": 16453360, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.2956543, + "step": 765, + "time_per_iteration": 2.5056800842285156 + }, + { + "auxiliary_loss_clip": 0.01248154, + "auxiliary_loss_mlp": 0.01076753, + "balance_loss_clip": 1.07026744, + "balance_loss_mlp": 1.04470932, + "epoch": 0.04605441154366451, + "flos": 23038921975680.0, + "grad_norm": 2.7201216439158578, + "language_loss": 0.87818259, + "learning_rate": 3.997297322892056e-06, + "loss": 0.90143168, + "num_input_tokens_seen": 16471160, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.32055664, + "step": 766, + "time_per_iteration": 2.4460854530334473 + }, + { + "auxiliary_loss_clip": 0.01245411, + "auxiliary_loss_mlp": 0.01073739, + "balance_loss_clip": 1.07061577, + "balance_loss_mlp": 1.04427075, + "epoch": 0.046114534796332485, + "flos": 22017407091840.0, + "grad_norm": 3.3787636296742565, + "language_loss": 0.83971596, + "learning_rate": 3.997277044811806e-06, + "loss": 0.86290747, + "num_input_tokens_seen": 16488940, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.29492188, + "step": 767, + "time_per_iteration": 2.4236490726470947 + }, + { + "auxiliary_loss_clip": 0.01242678, + "auxiliary_loss_mlp": 0.01061763, + "balance_loss_clip": 1.07096708, + "balance_loss_mlp": 1.03205609, + "epoch": 0.04617465804900045, + "flos": 29862380067840.0, + "grad_norm": 2.2245724571142174, + "language_loss": 0.87003446, + "learning_rate": 3.99725669099461e-06, + "loss": 0.89307886, + "num_input_tokens_seen": 16509505, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.29724121, + "step": 768, + "time_per_iteration": 2.542128324508667 + }, + { + "auxiliary_loss_clip": 0.01242768, + "auxiliary_loss_mlp": 0.01066293, + "balance_loss_clip": 1.06644201, + "balance_loss_mlp": 1.0383389, + "epoch": 0.04623478130166842, + "flos": 25630056351360.0, + "grad_norm": 5.3510825508743425, + "language_loss": 0.75222027, + "learning_rate": 3.9972362614412395e-06, + "loss": 0.77531087, + "num_input_tokens_seen": 16528840, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.27966309, + "step": 769, + "time_per_iteration": 2.461986780166626 + }, + { + "auxiliary_loss_clip": 0.01232556, + "auxiliary_loss_mlp": 0.01067895, + "balance_loss_clip": 1.06344604, + "balance_loss_mlp": 1.04016721, + "epoch": 0.04629490455433639, + "flos": 20449080489600.0, + "grad_norm": 1.7750213978717584, + "language_loss": 0.86043906, + "learning_rate": 3.997215756152471e-06, + "loss": 0.88344353, + "num_input_tokens_seen": 16548335, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.27758789, + "step": 770, + "time_per_iteration": 2.4598135948181152 + }, + { + "auxiliary_loss_clip": 0.01246417, + "auxiliary_loss_mlp": 0.01071464, + "balance_loss_clip": 1.06837177, + "balance_loss_mlp": 1.04229295, + "epoch": 0.04635502780700436, + "flos": 23148736830720.0, + "grad_norm": 2.4885783290156933, + "language_loss": 0.87065113, + "learning_rate": 3.99719517512908e-06, + "loss": 0.89382994, + "num_input_tokens_seen": 16567725, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.29162598, + "step": 771, + "time_per_iteration": 2.498246669769287 + }, + { + "auxiliary_loss_clip": 0.01259143, + "auxiliary_loss_mlp": 0.01075616, + "balance_loss_clip": 1.07711422, + "balance_loss_mlp": 1.04459739, + "epoch": 0.04641515105967233, + "flos": 23292020183040.0, + "grad_norm": 1.977263762404519, + "language_loss": 0.83898181, + "learning_rate": 3.997174518371848e-06, + "loss": 0.86232936, + "num_input_tokens_seen": 16588175, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.31030273, + "step": 772, + "time_per_iteration": 2.5380191802978516 + }, + { + "auxiliary_loss_clip": 0.01244487, + "auxiliary_loss_mlp": 0.01069049, + "balance_loss_clip": 1.07041574, + "balance_loss_mlp": 1.04068875, + "epoch": 0.046475274312340296, + "flos": 25115204759040.0, + "grad_norm": 3.5289173332643595, + "language_loss": 0.73541081, + "learning_rate": 3.997153785881557e-06, + "loss": 0.75854611, + "num_input_tokens_seen": 16607735, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.28381348, + "step": 773, + "time_per_iteration": 2.4652347564697266 + }, + { + "auxiliary_loss_clip": 0.01238351, + "auxiliary_loss_mlp": 0.01071825, + "balance_loss_clip": 1.06862187, + "balance_loss_mlp": 1.04168844, + "epoch": 0.04653539756500827, + "flos": 25264916645760.0, + "grad_norm": 1.9190096838615727, + "language_loss": 0.78524268, + "learning_rate": 3.997132977658996e-06, + "loss": 0.80834436, + "num_input_tokens_seen": 16627225, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.30151367, + "step": 774, + "time_per_iteration": 2.557657480239868 + }, + { + "auxiliary_loss_clip": 0.01239906, + "auxiliary_loss_mlp": 0.01062301, + "balance_loss_clip": 1.06787229, + "balance_loss_mlp": 1.03469169, + "epoch": 0.046595520817676234, + "flos": 35404150089600.0, + "grad_norm": 2.204525835636551, + "language_loss": 0.73680902, + "learning_rate": 3.997112093704952e-06, + "loss": 0.75983113, + "num_input_tokens_seen": 16647785, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.27612305, + "step": 775, + "time_per_iteration": 2.5710320472717285 + }, + { + "auxiliary_loss_clip": 0.01239831, + "auxiliary_loss_mlp": 0.01065416, + "balance_loss_clip": 1.06693065, + "balance_loss_mlp": 1.03537548, + "epoch": 0.046655644070344206, + "flos": 18112516778880.0, + "grad_norm": 1.613313204189802, + "language_loss": 0.76970851, + "learning_rate": 3.997091134020217e-06, + "loss": 0.79276097, + "num_input_tokens_seen": 16667555, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.30029297, + "step": 776, + "time_per_iteration": 2.4581265449523926 + }, + { + "auxiliary_loss_clip": 0.01231725, + "auxiliary_loss_mlp": 0.01068463, + "balance_loss_clip": 1.06427264, + "balance_loss_mlp": 1.04112828, + "epoch": 0.04671576732301218, + "flos": 29205286617600.0, + "grad_norm": 2.1776613539572276, + "language_loss": 0.71002859, + "learning_rate": 3.997070098605585e-06, + "loss": 0.73303044, + "num_input_tokens_seen": 16686875, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.27355957, + "step": 777, + "time_per_iteration": 2.493703603744507 + }, + { + "auxiliary_loss_clip": 0.01241986, + "auxiliary_loss_mlp": 0.01078315, + "balance_loss_clip": 1.06958079, + "balance_loss_mlp": 1.04775023, + "epoch": 0.04677589057568014, + "flos": 30478319510400.0, + "grad_norm": 2.1313474243902046, + "language_loss": 0.76573277, + "learning_rate": 3.997048987461856e-06, + "loss": 0.78893578, + "num_input_tokens_seen": 16706420, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.30566406, + "step": 778, + "time_per_iteration": 2.535916805267334 + }, + { + "auxiliary_loss_clip": 0.01237002, + "auxiliary_loss_mlp": 0.01067218, + "balance_loss_clip": 1.06744504, + "balance_loss_mlp": 1.0381906, + "epoch": 0.046836013828348115, + "flos": 20557674282240.0, + "grad_norm": 2.4705193982372182, + "language_loss": 0.79517859, + "learning_rate": 3.997027800589829e-06, + "loss": 0.81822079, + "num_input_tokens_seen": 16726390, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.29040527, + "step": 779, + "time_per_iteration": 2.460371732711792 + }, + { + "auxiliary_loss_clip": 0.01232541, + "auxiliary_loss_mlp": 0.01064485, + "balance_loss_clip": 1.06529415, + "balance_loss_mlp": 1.03628016, + "epoch": 0.04689613708101608, + "flos": 25447378757760.0, + "grad_norm": 1.6624847921850525, + "language_loss": 0.77351749, + "learning_rate": 3.997006537990308e-06, + "loss": 0.79648781, + "num_input_tokens_seen": 16748965, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.28222656, + "step": 780, + "time_per_iteration": 2.5924713611602783 + }, + { + "auxiliary_loss_clip": 0.01234122, + "auxiliary_loss_mlp": 0.01075272, + "balance_loss_clip": 1.06750321, + "balance_loss_mlp": 1.0485456, + "epoch": 0.04695626033368405, + "flos": 23001395241600.0, + "grad_norm": 1.8249340377306043, + "language_loss": 0.76502633, + "learning_rate": 3.996985199664099e-06, + "loss": 0.78812027, + "num_input_tokens_seen": 16768620, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.26708984, + "step": 781, + "time_per_iteration": 2.4463350772857666 + }, + { + "auxiliary_loss_clip": 0.01252646, + "auxiliary_loss_mlp": 0.01075601, + "balance_loss_clip": 1.07517004, + "balance_loss_mlp": 1.04572713, + "epoch": 0.047016383586352024, + "flos": 29133357632640.0, + "grad_norm": 2.579136432224931, + "language_loss": 0.73891544, + "learning_rate": 3.99696378561201e-06, + "loss": 0.76219785, + "num_input_tokens_seen": 16789755, + "router_z_loss_clip": 1.77539062, + "router_z_loss_mlp": 0.29858398, + "step": 782, + "time_per_iteration": 2.5539681911468506 + }, + { + "auxiliary_loss_clip": 0.01237014, + "auxiliary_loss_mlp": 0.0106806, + "balance_loss_clip": 1.06764722, + "balance_loss_mlp": 1.04104769, + "epoch": 0.04707650683901999, + "flos": 14976330451200.0, + "grad_norm": 2.173444935433693, + "language_loss": 0.80257785, + "learning_rate": 3.996942295834855e-06, + "loss": 0.82562858, + "num_input_tokens_seen": 16807585, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.27001953, + "step": 783, + "time_per_iteration": 2.449090003967285 + }, + { + "auxiliary_loss_clip": 0.0123486, + "auxiliary_loss_mlp": 0.01064266, + "balance_loss_clip": 1.07012439, + "balance_loss_mlp": 1.03696728, + "epoch": 0.04713663009168796, + "flos": 21651118151040.0, + "grad_norm": 1.8652308279956702, + "language_loss": 0.81621289, + "learning_rate": 3.996920730333448e-06, + "loss": 0.83920419, + "num_input_tokens_seen": 16827220, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.27319336, + "step": 784, + "time_per_iteration": 2.4630348682403564 + }, + { + "auxiliary_loss_clip": 0.01237235, + "auxiliary_loss_mlp": 0.0107501, + "balance_loss_clip": 1.06311345, + "balance_loss_mlp": 1.04746103, + "epoch": 0.04719675334435593, + "flos": 21325408600320.0, + "grad_norm": 2.2786401334836546, + "language_loss": 0.80753887, + "learning_rate": 3.996899089108607e-06, + "loss": 0.8306613, + "num_input_tokens_seen": 16846230, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.27526855, + "step": 785, + "time_per_iteration": 2.43149733543396 + }, + { + "auxiliary_loss_clip": 0.01242839, + "auxiliary_loss_mlp": 0.01066224, + "balance_loss_clip": 1.07210565, + "balance_loss_mlp": 1.03998637, + "epoch": 0.0472568765970239, + "flos": 17931383470080.0, + "grad_norm": 2.403820339287408, + "language_loss": 0.89841759, + "learning_rate": 3.996877372161152e-06, + "loss": 0.92150819, + "num_input_tokens_seen": 16865325, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.26257324, + "step": 786, + "time_per_iteration": 2.431323289871216 + }, + { + "auxiliary_loss_clip": 0.01237001, + "auxiliary_loss_mlp": 0.01070801, + "balance_loss_clip": 1.06053948, + "balance_loss_mlp": 1.03935385, + "epoch": 0.04731699984969187, + "flos": 18077324428800.0, + "grad_norm": 3.5051794458752354, + "language_loss": 0.77086961, + "learning_rate": 3.9968555794919065e-06, + "loss": 0.79394758, + "num_input_tokens_seen": 16882930, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.31420898, + "step": 787, + "time_per_iteration": 2.404057025909424 + }, + { + "auxiliary_loss_clip": 0.01249136, + "auxiliary_loss_mlp": 0.01071402, + "balance_loss_clip": 1.07540536, + "balance_loss_mlp": 1.04231453, + "epoch": 0.047377123102359836, + "flos": 23185078416000.0, + "grad_norm": 10.009063230206408, + "language_loss": 0.81285775, + "learning_rate": 3.996833711101698e-06, + "loss": 0.83606315, + "num_input_tokens_seen": 16900710, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.29101562, + "step": 788, + "time_per_iteration": 2.4710075855255127 + }, + { + "auxiliary_loss_clip": 0.01236239, + "auxiliary_loss_mlp": 0.01073331, + "balance_loss_clip": 1.06940651, + "balance_loss_mlp": 1.04340982, + "epoch": 0.04743724635502781, + "flos": 22747794243840.0, + "grad_norm": 1.883361429676338, + "language_loss": 0.84498811, + "learning_rate": 3.996811766991355e-06, + "loss": 0.86808383, + "num_input_tokens_seen": 16919210, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.29956055, + "step": 789, + "time_per_iteration": 2.447232484817505 + }, + { + "auxiliary_loss_clip": 0.01241654, + "auxiliary_loss_mlp": 0.01074364, + "balance_loss_clip": 1.07032824, + "balance_loss_mlp": 1.04614687, + "epoch": 0.04749736960769577, + "flos": 17238702620160.0, + "grad_norm": 2.352683603502587, + "language_loss": 0.82173419, + "learning_rate": 3.996789747161709e-06, + "loss": 0.84489435, + "num_input_tokens_seen": 16937125, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.28210449, + "step": 790, + "time_per_iteration": 2.450972557067871 + }, + { + "auxiliary_loss_clip": 0.01235005, + "auxiliary_loss_mlp": 0.01064318, + "balance_loss_clip": 1.06562757, + "balance_loss_mlp": 1.03565991, + "epoch": 0.047557492860363745, + "flos": 40479261592320.0, + "grad_norm": 2.180194843515433, + "language_loss": 0.88225329, + "learning_rate": 3.996767651613597e-06, + "loss": 0.9052465, + "num_input_tokens_seen": 16958610, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.28613281, + "step": 791, + "time_per_iteration": 2.616760730743408 + }, + { + "auxiliary_loss_clip": 0.01236809, + "auxiliary_loss_mlp": 0.01070161, + "balance_loss_clip": 1.06657457, + "balance_loss_mlp": 1.04052532, + "epoch": 0.04761761611303172, + "flos": 18698004466560.0, + "grad_norm": 2.299163280278261, + "language_loss": 0.90471476, + "learning_rate": 3.996745480347854e-06, + "loss": 0.92778444, + "num_input_tokens_seen": 16977300, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.29663086, + "step": 792, + "time_per_iteration": 2.421808958053589 + }, + { + "auxiliary_loss_clip": 0.01237067, + "auxiliary_loss_mlp": 0.01072992, + "balance_loss_clip": 1.06523085, + "balance_loss_mlp": 1.04546702, + "epoch": 0.04767773936569968, + "flos": 20921987975040.0, + "grad_norm": 1.8634398887333128, + "language_loss": 0.7352525, + "learning_rate": 3.996723233365324e-06, + "loss": 0.75835311, + "num_input_tokens_seen": 16994950, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.2755127, + "step": 793, + "time_per_iteration": 2.4779245853424072 + }, + { + "auxiliary_loss_clip": 0.01238407, + "auxiliary_loss_mlp": 0.01066266, + "balance_loss_clip": 1.06642759, + "balance_loss_mlp": 1.03732252, + "epoch": 0.047737862618367655, + "flos": 23732680233600.0, + "grad_norm": 2.053685310444265, + "language_loss": 0.86596382, + "learning_rate": 3.996700910666847e-06, + "loss": 0.88901055, + "num_input_tokens_seen": 17014760, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.28955078, + "step": 794, + "time_per_iteration": 2.505967378616333 + }, + { + "auxiliary_loss_clip": 0.01239794, + "auxiliary_loss_mlp": 0.01083636, + "balance_loss_clip": 1.06441045, + "balance_loss_mlp": 1.05183053, + "epoch": 0.04779798587103562, + "flos": 23695764030720.0, + "grad_norm": 10.636856540676328, + "language_loss": 0.69815713, + "learning_rate": 3.996678512253272e-06, + "loss": 0.72139144, + "num_input_tokens_seen": 17032715, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.31787109, + "step": 795, + "time_per_iteration": 2.4831531047821045 + }, + { + "auxiliary_loss_clip": 0.01231911, + "auxiliary_loss_mlp": 0.01079993, + "balance_loss_clip": 1.0632751, + "balance_loss_mlp": 1.05131102, + "epoch": 0.04785810912370359, + "flos": 23183641872000.0, + "grad_norm": 2.147456032383815, + "language_loss": 0.81242907, + "learning_rate": 3.996656038125449e-06, + "loss": 0.83554816, + "num_input_tokens_seen": 17052215, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.28686523, + "step": 796, + "time_per_iteration": 2.46995210647583 + }, + { + "auxiliary_loss_clip": 0.01236448, + "auxiliary_loss_mlp": 0.01067467, + "balance_loss_clip": 1.0651778, + "balance_loss_mlp": 1.03687763, + "epoch": 0.047918232376371564, + "flos": 18040623707520.0, + "grad_norm": 2.2347824825803064, + "language_loss": 0.81567502, + "learning_rate": 3.996633488284228e-06, + "loss": 0.83871412, + "num_input_tokens_seen": 17069225, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.3059082, + "step": 797, + "time_per_iteration": 2.4378857612609863 + }, + { + "auxiliary_loss_clip": 0.01118246, + "auxiliary_loss_mlp": 0.01043527, + "balance_loss_clip": 1.03813386, + "balance_loss_mlp": 1.03813827, + "epoch": 0.04797835562903953, + "flos": 62442588758400.0, + "grad_norm": 1.1103693738033953, + "language_loss": 0.6447463, + "learning_rate": 3.996610862730465e-06, + "loss": 0.66636401, + "num_input_tokens_seen": 17126680, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.05395508, + "step": 798, + "time_per_iteration": 3.035346746444702 + }, + { + "auxiliary_loss_clip": 0.01240839, + "auxiliary_loss_mlp": 0.01074954, + "balance_loss_clip": 1.06153476, + "balance_loss_mlp": 1.0471065, + "epoch": 0.0480384788817075, + "flos": 21507296094720.0, + "grad_norm": 2.1338090776426473, + "language_loss": 0.91131943, + "learning_rate": 3.996588161465018e-06, + "loss": 0.93447733, + "num_input_tokens_seen": 17144835, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.27856445, + "step": 799, + "time_per_iteration": 2.495826005935669 + }, + { + "auxiliary_loss_clip": 0.01238328, + "auxiliary_loss_mlp": 0.01073213, + "balance_loss_clip": 1.06862903, + "balance_loss_mlp": 1.04400706, + "epoch": 0.048098602134375466, + "flos": 21726710323200.0, + "grad_norm": 2.014361232301576, + "language_loss": 0.8670246, + "learning_rate": 3.996565384488748e-06, + "loss": 0.89014006, + "num_input_tokens_seen": 17165030, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.29223633, + "step": 800, + "time_per_iteration": 2.5158936977386475 + }, + { + "auxiliary_loss_clip": 0.01244707, + "auxiliary_loss_mlp": 0.01078337, + "balance_loss_clip": 1.06947517, + "balance_loss_mlp": 1.05002475, + "epoch": 0.04815872538704344, + "flos": 22931082368640.0, + "grad_norm": 3.4940985164294256, + "language_loss": 0.84363341, + "learning_rate": 3.996542531802518e-06, + "loss": 0.86686385, + "num_input_tokens_seen": 17184895, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.28308105, + "step": 801, + "time_per_iteration": 2.4652507305145264 + }, + { + "auxiliary_loss_clip": 0.01241101, + "auxiliary_loss_mlp": 0.01085202, + "balance_loss_clip": 1.06847978, + "balance_loss_mlp": 1.05601943, + "epoch": 0.04821884863971141, + "flos": 43174716042240.0, + "grad_norm": 1.8476885845479196, + "language_loss": 0.79822636, + "learning_rate": 3.996519603407196e-06, + "loss": 0.82148939, + "num_input_tokens_seen": 17208225, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.29199219, + "step": 802, + "time_per_iteration": 4.191574573516846 + }, + { + "auxiliary_loss_clip": 0.01238149, + "auxiliary_loss_mlp": 0.01090497, + "balance_loss_clip": 1.06697583, + "balance_loss_mlp": 1.05964613, + "epoch": 0.048278971892379376, + "flos": 18620006083200.0, + "grad_norm": 1.7535063075082318, + "language_loss": 0.86361337, + "learning_rate": 3.996496599303649e-06, + "loss": 0.88689983, + "num_input_tokens_seen": 17226305, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.30834961, + "step": 803, + "time_per_iteration": 4.075483798980713 + }, + { + "auxiliary_loss_clip": 0.01237291, + "auxiliary_loss_mlp": 0.01073425, + "balance_loss_clip": 1.06904602, + "balance_loss_mlp": 1.04467213, + "epoch": 0.04833909514504735, + "flos": 20230061310720.0, + "grad_norm": 2.3637439643056752, + "language_loss": 0.84749782, + "learning_rate": 3.996473519492753e-06, + "loss": 0.87060499, + "num_input_tokens_seen": 17244545, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.28759766, + "step": 804, + "time_per_iteration": 2.5117735862731934 + }, + { + "auxiliary_loss_clip": 0.01236643, + "auxiliary_loss_mlp": 0.01080947, + "balance_loss_clip": 1.06749165, + "balance_loss_mlp": 1.0527662, + "epoch": 0.04839921839771532, + "flos": 24645170361600.0, + "grad_norm": 2.694108999639383, + "language_loss": 0.860237, + "learning_rate": 3.99645036397538e-06, + "loss": 0.88341296, + "num_input_tokens_seen": 17265730, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.28198242, + "step": 805, + "time_per_iteration": 3.965545654296875 + }, + { + "auxiliary_loss_clip": 0.01241686, + "auxiliary_loss_mlp": 0.01082743, + "balance_loss_clip": 1.07033658, + "balance_loss_mlp": 1.05487156, + "epoch": 0.048459341650383285, + "flos": 24827452905600.0, + "grad_norm": 1.8598736699737597, + "language_loss": 0.68015879, + "learning_rate": 3.9964271327524085e-06, + "loss": 0.70340312, + "num_input_tokens_seen": 17284820, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.27856445, + "step": 806, + "time_per_iteration": 3.93170166015625 + }, + { + "auxiliary_loss_clip": 0.01227956, + "auxiliary_loss_mlp": 0.01070401, + "balance_loss_clip": 1.06286597, + "balance_loss_mlp": 1.04155231, + "epoch": 0.04851946490305126, + "flos": 22163204396160.0, + "grad_norm": 2.0044989625991114, + "language_loss": 0.76835108, + "learning_rate": 3.9964038258247214e-06, + "loss": 0.79133463, + "num_input_tokens_seen": 17305085, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.28820801, + "step": 807, + "time_per_iteration": 2.5472707748413086 + }, + { + "auxiliary_loss_clip": 0.01229484, + "auxiliary_loss_mlp": 0.01076399, + "balance_loss_clip": 1.06027222, + "balance_loss_mlp": 1.04832482, + "epoch": 0.04857958815571922, + "flos": 19792022952960.0, + "grad_norm": 2.249687148409363, + "language_loss": 0.86701614, + "learning_rate": 3.9963804431932005e-06, + "loss": 0.89007491, + "num_input_tokens_seen": 17322715, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.28076172, + "step": 808, + "time_per_iteration": 2.4533865451812744 + }, + { + "auxiliary_loss_clip": 0.01241288, + "auxiliary_loss_mlp": 0.01071452, + "balance_loss_clip": 1.06670988, + "balance_loss_mlp": 1.04306793, + "epoch": 0.048639711408387194, + "flos": 18697968552960.0, + "grad_norm": 1.8676092201105035, + "language_loss": 0.90090895, + "learning_rate": 3.996356984858732e-06, + "loss": 0.92403626, + "num_input_tokens_seen": 17341455, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.28381348, + "step": 809, + "time_per_iteration": 2.4787662029266357 + }, + { + "auxiliary_loss_clip": 0.01237995, + "auxiliary_loss_mlp": 0.01074077, + "balance_loss_clip": 1.06804013, + "balance_loss_mlp": 1.04553819, + "epoch": 0.048699834661055166, + "flos": 24863507182080.0, + "grad_norm": 2.2319874145827203, + "language_loss": 0.84679091, + "learning_rate": 3.996333450822208e-06, + "loss": 0.86991167, + "num_input_tokens_seen": 17360765, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.28564453, + "step": 810, + "time_per_iteration": 2.4931907653808594 + }, + { + "auxiliary_loss_clip": 0.01236621, + "auxiliary_loss_mlp": 0.01074956, + "balance_loss_clip": 1.06447649, + "balance_loss_mlp": 1.04450977, + "epoch": 0.04875995791372313, + "flos": 20704010290560.0, + "grad_norm": 2.5199284061366707, + "language_loss": 0.81149524, + "learning_rate": 3.99630984108452e-06, + "loss": 0.834611, + "num_input_tokens_seen": 17380625, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.30419922, + "step": 811, + "time_per_iteration": 2.4871695041656494 + }, + { + "auxiliary_loss_clip": 0.01232741, + "auxiliary_loss_mlp": 0.01069823, + "balance_loss_clip": 1.06564832, + "balance_loss_mlp": 1.04142725, + "epoch": 0.048820081166391104, + "flos": 18588297352320.0, + "grad_norm": 3.7659259595026104, + "language_loss": 0.74632519, + "learning_rate": 3.9962861556465615e-06, + "loss": 0.76935083, + "num_input_tokens_seen": 17399355, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.28393555, + "step": 812, + "time_per_iteration": 2.431903600692749 + }, + { + "auxiliary_loss_clip": 0.01230025, + "auxiliary_loss_mlp": 0.01079322, + "balance_loss_clip": 1.06611514, + "balance_loss_mlp": 1.05241704, + "epoch": 0.04888020441905907, + "flos": 22707322594560.0, + "grad_norm": 3.312976756034056, + "language_loss": 0.90184319, + "learning_rate": 3.996262394509233e-06, + "loss": 0.92493665, + "num_input_tokens_seen": 17418240, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.26879883, + "step": 813, + "time_per_iteration": 2.478720188140869 + }, + { + "auxiliary_loss_clip": 0.0123037, + "auxiliary_loss_mlp": 0.0106336, + "balance_loss_clip": 1.06427288, + "balance_loss_mlp": 1.03614497, + "epoch": 0.04894032767172704, + "flos": 22784351310720.0, + "grad_norm": 2.311033321099835, + "language_loss": 0.75009185, + "learning_rate": 3.9962385576734335e-06, + "loss": 0.77302909, + "num_input_tokens_seen": 17436250, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.2722168, + "step": 814, + "time_per_iteration": 2.500075340270996 + }, + { + "auxiliary_loss_clip": 0.01239173, + "auxiliary_loss_mlp": 0.01078557, + "balance_loss_clip": 1.06768441, + "balance_loss_mlp": 1.04890943, + "epoch": 0.04900045092439501, + "flos": 25516147345920.0, + "grad_norm": 2.133394108701896, + "language_loss": 0.8357743, + "learning_rate": 3.9962146451400675e-06, + "loss": 0.85895163, + "num_input_tokens_seen": 17455750, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.29650879, + "step": 815, + "time_per_iteration": 2.5385241508483887 + }, + { + "auxiliary_loss_clip": 0.01238343, + "auxiliary_loss_mlp": 0.01069123, + "balance_loss_clip": 1.06736946, + "balance_loss_mlp": 1.04063189, + "epoch": 0.04906057417706298, + "flos": 25958136199680.0, + "grad_norm": 2.2306127365391033, + "language_loss": 0.90807498, + "learning_rate": 3.996190656910043e-06, + "loss": 0.93114972, + "num_input_tokens_seen": 17474995, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.28466797, + "step": 816, + "time_per_iteration": 2.4840502738952637 + }, + { + "auxiliary_loss_clip": 0.01235304, + "auxiliary_loss_mlp": 0.01065074, + "balance_loss_clip": 1.0662725, + "balance_loss_mlp": 1.03791833, + "epoch": 0.04912069742973095, + "flos": 18624638937600.0, + "grad_norm": 2.323364683990663, + "language_loss": 0.80206525, + "learning_rate": 3.996166592984268e-06, + "loss": 0.82506907, + "num_input_tokens_seen": 17493395, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.2713623, + "step": 817, + "time_per_iteration": 2.5110692977905273 + }, + { + "auxiliary_loss_clip": 0.01235483, + "auxiliary_loss_mlp": 0.0107703, + "balance_loss_clip": 1.06837678, + "balance_loss_mlp": 1.04887259, + "epoch": 0.049180820682398915, + "flos": 23699786353920.0, + "grad_norm": 2.2914503690642363, + "language_loss": 0.84862494, + "learning_rate": 3.996142453363656e-06, + "loss": 0.87175012, + "num_input_tokens_seen": 17514565, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.28173828, + "step": 818, + "time_per_iteration": 2.498812437057495 + }, + { + "auxiliary_loss_clip": 0.01238676, + "auxiliary_loss_mlp": 0.01067734, + "balance_loss_clip": 1.06595898, + "balance_loss_mlp": 1.03830171, + "epoch": 0.04924094393506689, + "flos": 22420396753920.0, + "grad_norm": 2.1227616424491638, + "language_loss": 0.75564289, + "learning_rate": 3.996118238049124e-06, + "loss": 0.77870703, + "num_input_tokens_seen": 17534590, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.29431152, + "step": 819, + "time_per_iteration": 2.516632556915283 + }, + { + "auxiliary_loss_clip": 0.01244554, + "auxiliary_loss_mlp": 0.01069956, + "balance_loss_clip": 1.07319713, + "balance_loss_mlp": 1.04406369, + "epoch": 0.04930106718773486, + "flos": 15738246766080.0, + "grad_norm": 2.6549928499996858, + "language_loss": 0.8485828, + "learning_rate": 3.996093947041586e-06, + "loss": 0.87172794, + "num_input_tokens_seen": 17551900, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.25891113, + "step": 820, + "time_per_iteration": 2.4202637672424316 + }, + { + "auxiliary_loss_clip": 0.01239109, + "auxiliary_loss_mlp": 0.01069318, + "balance_loss_clip": 1.06853199, + "balance_loss_mlp": 1.04161382, + "epoch": 0.049361190440402825, + "flos": 26250628648320.0, + "grad_norm": 1.8749092383732886, + "language_loss": 0.90644103, + "learning_rate": 3.996069580341966e-06, + "loss": 0.92952538, + "num_input_tokens_seen": 17571485, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.27685547, + "step": 821, + "time_per_iteration": 2.5436220169067383 + }, + { + "auxiliary_loss_clip": 0.01232737, + "auxiliary_loss_mlp": 0.01090459, + "balance_loss_clip": 1.06430125, + "balance_loss_mlp": 1.06113315, + "epoch": 0.0494213136930708, + "flos": 21252366293760.0, + "grad_norm": 1.7644907057673282, + "language_loss": 0.89440197, + "learning_rate": 3.996045137951188e-06, + "loss": 0.91763395, + "num_input_tokens_seen": 17591410, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.29309082, + "step": 822, + "time_per_iteration": 2.449601888656616 + }, + { + "auxiliary_loss_clip": 0.01238462, + "auxiliary_loss_mlp": 0.01066634, + "balance_loss_clip": 1.0708406, + "balance_loss_mlp": 1.03516245, + "epoch": 0.04948143694573876, + "flos": 27965506740480.0, + "grad_norm": 1.7876595537090376, + "language_loss": 0.674209, + "learning_rate": 3.996020619870178e-06, + "loss": 0.69725996, + "num_input_tokens_seen": 17612010, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.31494141, + "step": 823, + "time_per_iteration": 2.5358848571777344 + }, + { + "auxiliary_loss_clip": 0.01127046, + "auxiliary_loss_mlp": 0.01026877, + "balance_loss_clip": 1.04725718, + "balance_loss_mlp": 1.02105379, + "epoch": 0.049541560198406734, + "flos": 66180995533440.0, + "grad_norm": 1.3448096583377178, + "language_loss": 0.62206036, + "learning_rate": 3.995996026099866e-06, + "loss": 0.64359951, + "num_input_tokens_seen": 17673430, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.0581665, + "step": 824, + "time_per_iteration": 3.1174511909484863 + }, + { + "auxiliary_loss_clip": 0.01243636, + "auxiliary_loss_mlp": 0.01078879, + "balance_loss_clip": 1.07059407, + "balance_loss_mlp": 1.0489341, + "epoch": 0.049601683451074706, + "flos": 22892693708160.0, + "grad_norm": 2.049188408297755, + "language_loss": 0.90451694, + "learning_rate": 3.995971356641185e-06, + "loss": 0.92774212, + "num_input_tokens_seen": 17689545, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.29956055, + "step": 825, + "time_per_iteration": 2.5029590129852295 + }, + { + "auxiliary_loss_clip": 0.01238689, + "auxiliary_loss_mlp": 0.01068544, + "balance_loss_clip": 1.06855214, + "balance_loss_mlp": 1.03864646, + "epoch": 0.04966180670374267, + "flos": 21433643256960.0, + "grad_norm": 2.5931041321203483, + "language_loss": 0.66886741, + "learning_rate": 3.9959466114950695e-06, + "loss": 0.69193971, + "num_input_tokens_seen": 17705965, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.29882812, + "step": 826, + "time_per_iteration": 2.4645016193389893 + }, + { + "auxiliary_loss_clip": 0.01241549, + "auxiliary_loss_mlp": 0.01075137, + "balance_loss_clip": 1.06983054, + "balance_loss_mlp": 1.04585958, + "epoch": 0.04972192995641064, + "flos": 23107367341440.0, + "grad_norm": 1.7952062313806878, + "language_loss": 0.78089535, + "learning_rate": 3.995921790662459e-06, + "loss": 0.80406219, + "num_input_tokens_seen": 17724580, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.29248047, + "step": 827, + "time_per_iteration": 2.4860146045684814 + }, + { + "auxiliary_loss_clip": 0.01243325, + "auxiliary_loss_mlp": 0.01084829, + "balance_loss_clip": 1.07084751, + "balance_loss_mlp": 1.05505085, + "epoch": 0.04978205320907861, + "flos": 40406147458560.0, + "grad_norm": 1.8129277436481432, + "language_loss": 0.78723717, + "learning_rate": 3.995896894144294e-06, + "loss": 0.81051868, + "num_input_tokens_seen": 17747755, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.29772949, + "step": 828, + "time_per_iteration": 2.6703226566314697 + }, + { + "auxiliary_loss_clip": 0.0122776, + "auxiliary_loss_mlp": 0.01062675, + "balance_loss_clip": 1.06215215, + "balance_loss_mlp": 1.03489912, + "epoch": 0.04984217646174658, + "flos": 25228539146880.0, + "grad_norm": 2.485311608841273, + "language_loss": 0.83664042, + "learning_rate": 3.995871921941519e-06, + "loss": 0.85954475, + "num_input_tokens_seen": 17768550, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.27807617, + "step": 829, + "time_per_iteration": 2.6250312328338623 + }, + { + "auxiliary_loss_clip": 0.01241507, + "auxiliary_loss_mlp": 0.0108575, + "balance_loss_clip": 1.06921899, + "balance_loss_mlp": 1.05365908, + "epoch": 0.04990229971441455, + "flos": 15959636242560.0, + "grad_norm": 1.925521886555161, + "language_loss": 0.75275111, + "learning_rate": 3.99584687405508e-06, + "loss": 0.77602375, + "num_input_tokens_seen": 17786080, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.32080078, + "step": 830, + "time_per_iteration": 2.4667136669158936 + }, + { + "auxiliary_loss_clip": 0.01238308, + "auxiliary_loss_mlp": 0.01085603, + "balance_loss_clip": 1.06866014, + "balance_loss_mlp": 1.05267692, + "epoch": 0.04996242296708252, + "flos": 18405116968320.0, + "grad_norm": 1.8999019118842562, + "language_loss": 0.79655838, + "learning_rate": 3.995821750485929e-06, + "loss": 0.81979746, + "num_input_tokens_seen": 17803635, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.3293457, + "step": 831, + "time_per_iteration": 2.491442918777466 + }, + { + "auxiliary_loss_clip": 0.01241731, + "auxiliary_loss_mlp": 0.01080624, + "balance_loss_clip": 1.07146096, + "balance_loss_mlp": 1.05368233, + "epoch": 0.05002254621975049, + "flos": 17858053854720.0, + "grad_norm": 2.342155075367856, + "language_loss": 0.91805941, + "learning_rate": 3.995796551235016e-06, + "loss": 0.94128299, + "num_input_tokens_seen": 17822190, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.26916504, + "step": 832, + "time_per_iteration": 2.4447245597839355 + }, + { + "auxiliary_loss_clip": 0.01230023, + "auxiliary_loss_mlp": 0.01078755, + "balance_loss_clip": 1.06622136, + "balance_loss_mlp": 1.05187321, + "epoch": 0.050082669472418455, + "flos": 45660273367680.0, + "grad_norm": 1.851919001989235, + "language_loss": 0.83395326, + "learning_rate": 3.9957712763032974e-06, + "loss": 0.857041, + "num_input_tokens_seen": 17846915, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.26879883, + "step": 833, + "time_per_iteration": 2.676199436187744 + }, + { + "auxiliary_loss_clip": 0.01239262, + "auxiliary_loss_mlp": 0.01064992, + "balance_loss_clip": 1.06795132, + "balance_loss_mlp": 1.03583384, + "epoch": 0.05014279272508643, + "flos": 37962067363200.0, + "grad_norm": 2.160247337421589, + "language_loss": 0.82145476, + "learning_rate": 3.995745925691733e-06, + "loss": 0.84449732, + "num_input_tokens_seen": 17867270, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.29125977, + "step": 834, + "time_per_iteration": 2.593688726425171 + }, + { + "auxiliary_loss_clip": 0.01239103, + "auxiliary_loss_mlp": 0.01070192, + "balance_loss_clip": 1.06704843, + "balance_loss_mlp": 1.04009175, + "epoch": 0.0502029159777544, + "flos": 20996179516800.0, + "grad_norm": 2.073312546369088, + "language_loss": 0.91924536, + "learning_rate": 3.995720499401282e-06, + "loss": 0.94233835, + "num_input_tokens_seen": 17884880, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.30041504, + "step": 835, + "time_per_iteration": 2.451903820037842 + }, + { + "auxiliary_loss_clip": 0.01241292, + "auxiliary_loss_mlp": 0.01072367, + "balance_loss_clip": 1.06632173, + "balance_loss_mlp": 1.04254079, + "epoch": 0.050263039230422364, + "flos": 15888066393600.0, + "grad_norm": 2.072352627249585, + "language_loss": 0.76222742, + "learning_rate": 3.995694997432911e-06, + "loss": 0.78536397, + "num_input_tokens_seen": 17903695, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.29797363, + "step": 836, + "time_per_iteration": 2.4283640384674072 + }, + { + "auxiliary_loss_clip": 0.01233707, + "auxiliary_loss_mlp": 0.01074153, + "balance_loss_clip": 1.06808245, + "balance_loss_mlp": 1.04649591, + "epoch": 0.050323162483090336, + "flos": 23732752060800.0, + "grad_norm": 2.373922901228386, + "language_loss": 0.83420384, + "learning_rate": 3.9956694197875855e-06, + "loss": 0.8572824, + "num_input_tokens_seen": 17920745, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.27661133, + "step": 837, + "time_per_iteration": 2.4606595039367676 + }, + { + "auxiliary_loss_clip": 0.01239786, + "auxiliary_loss_mlp": 0.01068026, + "balance_loss_clip": 1.07231843, + "balance_loss_mlp": 1.04029775, + "epoch": 0.0503832857357583, + "flos": 20266223328000.0, + "grad_norm": 2.608293819401137, + "language_loss": 0.73021859, + "learning_rate": 3.995643766466275e-06, + "loss": 0.75329667, + "num_input_tokens_seen": 17938220, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.27758789, + "step": 838, + "time_per_iteration": 2.4588494300842285 + }, + { + "auxiliary_loss_clip": 0.01230093, + "auxiliary_loss_mlp": 0.01071296, + "balance_loss_clip": 1.06131029, + "balance_loss_mlp": 1.04315114, + "epoch": 0.05044340898842627, + "flos": 17785011548160.0, + "grad_norm": 1.6686282739431382, + "language_loss": 0.83686864, + "learning_rate": 3.995618037469953e-06, + "loss": 0.85988253, + "num_input_tokens_seen": 17957325, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.28137207, + "step": 839, + "time_per_iteration": 2.4991838932037354 + }, + { + "auxiliary_loss_clip": 0.01231704, + "auxiliary_loss_mlp": 0.01076317, + "balance_loss_clip": 1.06567883, + "balance_loss_mlp": 1.04763532, + "epoch": 0.050503532241094246, + "flos": 22966526113920.0, + "grad_norm": 2.4703449234058374, + "language_loss": 0.8542223, + "learning_rate": 3.995592232799595e-06, + "loss": 0.87730253, + "num_input_tokens_seen": 17975875, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.28710938, + "step": 840, + "time_per_iteration": 2.451474189758301 + }, + { + "auxiliary_loss_clip": 0.0123804, + "auxiliary_loss_mlp": 0.01064602, + "balance_loss_clip": 1.06910253, + "balance_loss_mlp": 1.03506243, + "epoch": 0.05056365549376221, + "flos": 22776989022720.0, + "grad_norm": 1.8157279513133637, + "language_loss": 0.94593644, + "learning_rate": 3.99556635245618e-06, + "loss": 0.96896291, + "num_input_tokens_seen": 17994340, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.29516602, + "step": 841, + "time_per_iteration": 2.440751552581787 + }, + { + "auxiliary_loss_clip": 0.01236122, + "auxiliary_loss_mlp": 0.01074988, + "balance_loss_clip": 1.06750607, + "balance_loss_mlp": 1.04478025, + "epoch": 0.05062377874643018, + "flos": 30916968399360.0, + "grad_norm": 2.1297446934354336, + "language_loss": 0.77798259, + "learning_rate": 3.995540396440688e-06, + "loss": 0.8010937, + "num_input_tokens_seen": 18015260, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.30175781, + "step": 842, + "time_per_iteration": 2.5652413368225098 + }, + { + "auxiliary_loss_clip": 0.0124175, + "auxiliary_loss_mlp": 0.0107141, + "balance_loss_clip": 1.06880498, + "balance_loss_mlp": 1.04167867, + "epoch": 0.05068390199909815, + "flos": 19647159402240.0, + "grad_norm": 2.160699780701912, + "language_loss": 0.7826165, + "learning_rate": 3.995514364754105e-06, + "loss": 0.80574811, + "num_input_tokens_seen": 18033960, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.29699707, + "step": 843, + "time_per_iteration": 2.4780797958374023 + }, + { + "auxiliary_loss_clip": 0.01240819, + "auxiliary_loss_mlp": 0.01061911, + "balance_loss_clip": 1.07039022, + "balance_loss_mlp": 1.03485024, + "epoch": 0.05074402525176612, + "flos": 37962103276800.0, + "grad_norm": 1.6725379513988372, + "language_loss": 0.82646734, + "learning_rate": 3.995488257397417e-06, + "loss": 0.84949458, + "num_input_tokens_seen": 18056700, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.27026367, + "step": 844, + "time_per_iteration": 2.6477553844451904 + }, + { + "auxiliary_loss_clip": 0.01236711, + "auxiliary_loss_mlp": 0.01068214, + "balance_loss_clip": 1.06814718, + "balance_loss_mlp": 1.0404861, + "epoch": 0.05080414850443409, + "flos": 22054610603520.0, + "grad_norm": 1.99894677197578, + "language_loss": 0.76760054, + "learning_rate": 3.995462074371614e-06, + "loss": 0.79064983, + "num_input_tokens_seen": 18075815, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.27734375, + "step": 845, + "time_per_iteration": 2.499781847000122 + }, + { + "auxiliary_loss_clip": 0.01229094, + "auxiliary_loss_mlp": 0.01073783, + "balance_loss_clip": 1.062253, + "balance_loss_mlp": 1.04508901, + "epoch": 0.05086427175710206, + "flos": 20225787592320.0, + "grad_norm": 1.7458315090269398, + "language_loss": 0.87770861, + "learning_rate": 3.99543581567769e-06, + "loss": 0.9007374, + "num_input_tokens_seen": 18095095, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.2869873, + "step": 846, + "time_per_iteration": 5.339660882949829 + }, + { + "auxiliary_loss_clip": 0.01238039, + "auxiliary_loss_mlp": 0.01071881, + "balance_loss_clip": 1.06934404, + "balance_loss_mlp": 1.0436759, + "epoch": 0.05092439500977003, + "flos": 15159223526400.0, + "grad_norm": 1.6951022940329872, + "language_loss": 0.87496758, + "learning_rate": 3.9954094813166394e-06, + "loss": 0.89806676, + "num_input_tokens_seen": 18112675, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.28186035, + "step": 847, + "time_per_iteration": 2.480379819869995 + }, + { + "auxiliary_loss_clip": 0.01235267, + "auxiliary_loss_mlp": 0.01069693, + "balance_loss_clip": 1.07027197, + "balance_loss_mlp": 1.04109454, + "epoch": 0.050984518262437994, + "flos": 22055149307520.0, + "grad_norm": 2.395718200883701, + "language_loss": 0.81844801, + "learning_rate": 3.995383071289462e-06, + "loss": 0.84149766, + "num_input_tokens_seen": 18130745, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.28601074, + "step": 848, + "time_per_iteration": 3.902531385421753 + }, + { + "auxiliary_loss_clip": 0.0123584, + "auxiliary_loss_mlp": 0.01076153, + "balance_loss_clip": 1.06900263, + "balance_loss_mlp": 1.0476861, + "epoch": 0.05104464151510597, + "flos": 30225329043840.0, + "grad_norm": 1.8126141092063277, + "language_loss": 0.87325549, + "learning_rate": 3.995356585597158e-06, + "loss": 0.89637548, + "num_input_tokens_seen": 18152410, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.28466797, + "step": 849, + "time_per_iteration": 3.96751070022583 + }, + { + "auxiliary_loss_clip": 0.01226558, + "auxiliary_loss_mlp": 0.01060425, + "balance_loss_clip": 1.06177592, + "balance_loss_mlp": 1.03242278, + "epoch": 0.05110476476777394, + "flos": 18332900674560.0, + "grad_norm": 1.7493966288729554, + "language_loss": 0.83455771, + "learning_rate": 3.995330024240732e-06, + "loss": 0.85742754, + "num_input_tokens_seen": 18170870, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.27954102, + "step": 850, + "time_per_iteration": 2.5306918621063232 + }, + { + "auxiliary_loss_clip": 0.01229685, + "auxiliary_loss_mlp": 0.01064725, + "balance_loss_clip": 1.06251717, + "balance_loss_mlp": 1.03694928, + "epoch": 0.051164888020441904, + "flos": 37998732170880.0, + "grad_norm": 2.122922446974982, + "language_loss": 0.65489745, + "learning_rate": 3.995303387221192e-06, + "loss": 0.6778416, + "num_input_tokens_seen": 18191555, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.27783203, + "step": 851, + "time_per_iteration": 2.750755786895752 + }, + { + "auxiliary_loss_clip": 0.01232479, + "auxiliary_loss_mlp": 0.01071955, + "balance_loss_clip": 1.06416488, + "balance_loss_mlp": 1.04236698, + "epoch": 0.051225011273109876, + "flos": 23038634666880.0, + "grad_norm": 2.156011713286946, + "language_loss": 0.83621538, + "learning_rate": 3.995276674539547e-06, + "loss": 0.85925972, + "num_input_tokens_seen": 18208620, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.29589844, + "step": 852, + "time_per_iteration": 2.463674783706665 + }, + { + "auxiliary_loss_clip": 0.01232784, + "auxiliary_loss_mlp": 0.01076761, + "balance_loss_clip": 1.06434417, + "balance_loss_mlp": 1.04811525, + "epoch": 0.05128513452577785, + "flos": 18259822454400.0, + "grad_norm": 6.10142846826634, + "language_loss": 0.80713695, + "learning_rate": 3.995249886196811e-06, + "loss": 0.83023238, + "num_input_tokens_seen": 18226370, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.28649902, + "step": 853, + "time_per_iteration": 2.398423910140991 + }, + { + "auxiliary_loss_clip": 0.01231516, + "auxiliary_loss_mlp": 0.01070801, + "balance_loss_clip": 1.06482065, + "balance_loss_mlp": 1.04142797, + "epoch": 0.05134525777844581, + "flos": 27198957571200.0, + "grad_norm": 2.052980084606787, + "language_loss": 0.75908601, + "learning_rate": 3.995223022193999e-06, + "loss": 0.78210914, + "num_input_tokens_seen": 18247075, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.29370117, + "step": 854, + "time_per_iteration": 2.515998601913452 + }, + { + "auxiliary_loss_clip": 0.01233471, + "auxiliary_loss_mlp": 0.01074028, + "balance_loss_clip": 1.06674314, + "balance_loss_mlp": 1.04439223, + "epoch": 0.051405381031113785, + "flos": 28362247436160.0, + "grad_norm": 5.259389508582889, + "language_loss": 0.81547546, + "learning_rate": 3.99519608253213e-06, + "loss": 0.83855045, + "num_input_tokens_seen": 18265680, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.29638672, + "step": 855, + "time_per_iteration": 2.533381700515747 + }, + { + "auxiliary_loss_clip": 0.01133155, + "auxiliary_loss_mlp": 0.01024049, + "balance_loss_clip": 1.05240655, + "balance_loss_mlp": 1.01641965, + "epoch": 0.05146550428378175, + "flos": 65618169327360.0, + "grad_norm": 1.0398327508714644, + "language_loss": 0.65686619, + "learning_rate": 3.995169067212227e-06, + "loss": 0.67843831, + "num_input_tokens_seen": 18327015, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.07629395, + "step": 856, + "time_per_iteration": 3.0451385974884033 + }, + { + "auxiliary_loss_clip": 0.01227959, + "auxiliary_loss_mlp": 0.01062277, + "balance_loss_clip": 1.06503749, + "balance_loss_mlp": 1.03377378, + "epoch": 0.05152562753644972, + "flos": 22054861998720.0, + "grad_norm": 2.47675271145022, + "language_loss": 0.77108514, + "learning_rate": 3.9951419762353116e-06, + "loss": 0.79398757, + "num_input_tokens_seen": 18345235, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.28503418, + "step": 857, + "time_per_iteration": 2.4633779525756836 + }, + { + "auxiliary_loss_clip": 0.01234673, + "auxiliary_loss_mlp": 0.01067478, + "balance_loss_clip": 1.06557274, + "balance_loss_mlp": 1.03834319, + "epoch": 0.051585750789117694, + "flos": 18509544783360.0, + "grad_norm": 2.114578874035616, + "language_loss": 0.89366871, + "learning_rate": 3.995114809602412e-06, + "loss": 0.91669023, + "num_input_tokens_seen": 18362350, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.2911377, + "step": 858, + "time_per_iteration": 2.4407105445861816 + }, + { + "auxiliary_loss_clip": 0.0123869, + "auxiliary_loss_mlp": 0.01067945, + "balance_loss_clip": 1.07147598, + "balance_loss_mlp": 1.03945351, + "epoch": 0.05164587404178566, + "flos": 23730238108800.0, + "grad_norm": 2.1310795696374893, + "language_loss": 0.75688857, + "learning_rate": 3.9950875673145605e-06, + "loss": 0.77995503, + "num_input_tokens_seen": 18383390, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.2845459, + "step": 859, + "time_per_iteration": 2.4701075553894043 + }, + { + "auxiliary_loss_clip": 0.01239028, + "auxiliary_loss_mlp": 0.01071074, + "balance_loss_clip": 1.0669173, + "balance_loss_mlp": 1.04112887, + "epoch": 0.05170599729445363, + "flos": 16252882876800.0, + "grad_norm": 2.21577914860876, + "language_loss": 0.90986669, + "learning_rate": 3.995060249372788e-06, + "loss": 0.93296772, + "num_input_tokens_seen": 18399220, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.29919434, + "step": 860, + "time_per_iteration": 2.429656505584717 + }, + { + "auxiliary_loss_clip": 0.01233834, + "auxiliary_loss_mlp": 0.01062738, + "balance_loss_clip": 1.06752276, + "balance_loss_mlp": 1.03566563, + "epoch": 0.0517661205471216, + "flos": 23985922095360.0, + "grad_norm": 1.8330317237832523, + "language_loss": 0.82339168, + "learning_rate": 3.99503285577813e-06, + "loss": 0.84635735, + "num_input_tokens_seen": 18419005, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.27087402, + "step": 861, + "time_per_iteration": 2.510643482208252 + }, + { + "auxiliary_loss_clip": 0.01231616, + "auxiliary_loss_mlp": 0.01069192, + "balance_loss_clip": 1.06434298, + "balance_loss_mlp": 1.03929424, + "epoch": 0.05182624379978957, + "flos": 29277718392960.0, + "grad_norm": 1.9040648601056456, + "language_loss": 0.7840482, + "learning_rate": 3.995005386531627e-06, + "loss": 0.80705625, + "num_input_tokens_seen": 18440550, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.29931641, + "step": 862, + "time_per_iteration": 2.5081136226654053 + }, + { + "auxiliary_loss_clip": 0.01229307, + "auxiliary_loss_mlp": 0.01080399, + "balance_loss_clip": 1.06619763, + "balance_loss_mlp": 1.04978633, + "epoch": 0.05188636705245754, + "flos": 24170826332160.0, + "grad_norm": 1.8452481378346817, + "language_loss": 0.88965857, + "learning_rate": 3.9949778416343195e-06, + "loss": 0.91275561, + "num_input_tokens_seen": 18461950, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.30639648, + "step": 863, + "time_per_iteration": 2.478184700012207 + }, + { + "auxiliary_loss_clip": 0.01243273, + "auxiliary_loss_mlp": 0.01076227, + "balance_loss_clip": 1.07445335, + "balance_loss_mlp": 1.04501772, + "epoch": 0.051946490305125506, + "flos": 26760703731840.0, + "grad_norm": 1.9785701729751086, + "language_loss": 0.76340806, + "learning_rate": 3.9949502210872525e-06, + "loss": 0.78660309, + "num_input_tokens_seen": 18480555, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.31225586, + "step": 864, + "time_per_iteration": 2.5204856395721436 + }, + { + "auxiliary_loss_clip": 0.01234738, + "auxiliary_loss_mlp": 0.01081512, + "balance_loss_clip": 1.06704223, + "balance_loss_mlp": 1.049945, + "epoch": 0.05200661355779348, + "flos": 21502519585920.0, + "grad_norm": 2.0990919694163828, + "language_loss": 0.79349077, + "learning_rate": 3.994922524891474e-06, + "loss": 0.81665325, + "num_input_tokens_seen": 18499645, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.31567383, + "step": 865, + "time_per_iteration": 2.464245080947876 + }, + { + "auxiliary_loss_clip": 0.01231003, + "auxiliary_loss_mlp": 0.01075111, + "balance_loss_clip": 1.06356382, + "balance_loss_mlp": 1.04360366, + "epoch": 0.05206673681046144, + "flos": 18114492026880.0, + "grad_norm": 2.0964544707247974, + "language_loss": 0.85985398, + "learning_rate": 3.994894753048032e-06, + "loss": 0.88291514, + "num_input_tokens_seen": 18516810, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.31506348, + "step": 866, + "time_per_iteration": 2.423168897628784 + }, + { + "auxiliary_loss_clip": 0.01238014, + "auxiliary_loss_mlp": 0.01071308, + "balance_loss_clip": 1.07140744, + "balance_loss_mlp": 1.04291248, + "epoch": 0.052126860063129415, + "flos": 17524191916800.0, + "grad_norm": 2.5471541403376263, + "language_loss": 0.87215519, + "learning_rate": 3.9948669055579815e-06, + "loss": 0.89524841, + "num_input_tokens_seen": 18532510, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.28405762, + "step": 867, + "time_per_iteration": 2.418318510055542 + }, + { + "auxiliary_loss_clip": 0.01231739, + "auxiliary_loss_mlp": 0.01075829, + "balance_loss_clip": 1.07024205, + "balance_loss_mlp": 1.05055618, + "epoch": 0.05218698331579739, + "flos": 32598054771840.0, + "grad_norm": 1.5182598343743334, + "language_loss": 0.64025116, + "learning_rate": 3.9948389824223785e-06, + "loss": 0.6633268, + "num_input_tokens_seen": 18557380, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.25280762, + "step": 868, + "time_per_iteration": 2.5802547931671143 + }, + { + "auxiliary_loss_clip": 0.01234475, + "auxiliary_loss_mlp": 0.01070835, + "balance_loss_clip": 1.06610322, + "balance_loss_mlp": 1.04155684, + "epoch": 0.05224710656846535, + "flos": 22127293774080.0, + "grad_norm": 4.215234783867508, + "language_loss": 0.83614641, + "learning_rate": 3.994810983642281e-06, + "loss": 0.85919952, + "num_input_tokens_seen": 18575720, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.29272461, + "step": 869, + "time_per_iteration": 2.496997356414795 + }, + { + "auxiliary_loss_clip": 0.01237386, + "auxiliary_loss_mlp": 0.01062362, + "balance_loss_clip": 1.06780601, + "balance_loss_mlp": 1.03472972, + "epoch": 0.052307229821133325, + "flos": 11145092976000.0, + "grad_norm": 2.4164669251344124, + "language_loss": 0.87839282, + "learning_rate": 3.994782909218751e-06, + "loss": 0.90139025, + "num_input_tokens_seen": 18592185, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.27661133, + "step": 870, + "time_per_iteration": 2.5081093311309814 + }, + { + "auxiliary_loss_clip": 0.01235572, + "auxiliary_loss_mlp": 0.01069047, + "balance_loss_clip": 1.06805396, + "balance_loss_mlp": 1.04158139, + "epoch": 0.05236735307380129, + "flos": 19128070005120.0, + "grad_norm": 2.3514466375394365, + "language_loss": 0.80992138, + "learning_rate": 3.994754759152854e-06, + "loss": 0.83296758, + "num_input_tokens_seen": 18609560, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.27429199, + "step": 871, + "time_per_iteration": 2.4413421154022217 + }, + { + "auxiliary_loss_clip": 0.01231456, + "auxiliary_loss_mlp": 0.01073942, + "balance_loss_clip": 1.06766057, + "balance_loss_mlp": 1.04787076, + "epoch": 0.05242747632646926, + "flos": 20960663944320.0, + "grad_norm": 1.6010233692672036, + "language_loss": 0.8129251, + "learning_rate": 3.994726533445656e-06, + "loss": 0.83597904, + "num_input_tokens_seen": 18629405, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.26086426, + "step": 872, + "time_per_iteration": 2.471717596054077 + }, + { + "auxiliary_loss_clip": 0.01133341, + "auxiliary_loss_mlp": 0.01034771, + "balance_loss_clip": 1.05193472, + "balance_loss_mlp": 1.02969289, + "epoch": 0.052487599579137234, + "flos": 65020542842880.0, + "grad_norm": 1.1021071707026813, + "language_loss": 0.61628723, + "learning_rate": 3.9946982320982274e-06, + "loss": 0.6379683, + "num_input_tokens_seen": 18681480, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.05078125, + "step": 873, + "time_per_iteration": 3.009821653366089 + }, + { + "auxiliary_loss_clip": 0.01243781, + "auxiliary_loss_mlp": 0.01059443, + "balance_loss_clip": 1.07487845, + "balance_loss_mlp": 1.03166723, + "epoch": 0.0525477228318052, + "flos": 23288859786240.0, + "grad_norm": 1.8414066437632708, + "language_loss": 0.89034402, + "learning_rate": 3.994669855111643e-06, + "loss": 0.91337621, + "num_input_tokens_seen": 18700390, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.27746582, + "step": 874, + "time_per_iteration": 2.506301164627075 + }, + { + "auxiliary_loss_clip": 0.01230199, + "auxiliary_loss_mlp": 0.0106817, + "balance_loss_clip": 1.06308126, + "balance_loss_mlp": 1.04047775, + "epoch": 0.05260784608447317, + "flos": 32230221546240.0, + "grad_norm": 1.814386344228283, + "language_loss": 0.74710107, + "learning_rate": 3.994641402486977e-06, + "loss": 0.77008474, + "num_input_tokens_seen": 18721280, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.27697754, + "step": 875, + "time_per_iteration": 2.5207903385162354 + }, + { + "auxiliary_loss_clip": 0.01226085, + "auxiliary_loss_mlp": 0.01060679, + "balance_loss_clip": 1.06317079, + "balance_loss_mlp": 1.03180635, + "epoch": 0.052667969337141136, + "flos": 24463211040000.0, + "grad_norm": 1.6912679678472649, + "language_loss": 0.92596579, + "learning_rate": 3.99461287422531e-06, + "loss": 0.94883335, + "num_input_tokens_seen": 18741545, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.2890625, + "step": 876, + "time_per_iteration": 2.512366533279419 + }, + { + "auxiliary_loss_clip": 0.01108198, + "auxiliary_loss_mlp": 0.01005832, + "balance_loss_clip": 1.03102493, + "balance_loss_mlp": 1.00124216, + "epoch": 0.05272809258980911, + "flos": 57784329567360.0, + "grad_norm": 0.8157598416602718, + "language_loss": 0.62933677, + "learning_rate": 3.994584270327722e-06, + "loss": 0.65047705, + "num_input_tokens_seen": 18801400, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.04589844, + "step": 877, + "time_per_iteration": 3.0619263648986816 + }, + { + "auxiliary_loss_clip": 0.01233257, + "auxiliary_loss_mlp": 0.01069224, + "balance_loss_clip": 1.06733537, + "balance_loss_mlp": 1.04061341, + "epoch": 0.05278821584247708, + "flos": 17420805596160.0, + "grad_norm": 2.108474563793598, + "language_loss": 0.8577708, + "learning_rate": 3.994555590795299e-06, + "loss": 0.88079566, + "num_input_tokens_seen": 18819670, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.28637695, + "step": 878, + "time_per_iteration": 2.438678503036499 + }, + { + "auxiliary_loss_clip": 0.01239836, + "auxiliary_loss_mlp": 0.01062762, + "balance_loss_clip": 1.07185757, + "balance_loss_mlp": 1.03584468, + "epoch": 0.052848339095145046, + "flos": 26137258346880.0, + "grad_norm": 2.5450779927393206, + "language_loss": 0.82910722, + "learning_rate": 3.9945268356291275e-06, + "loss": 0.85213321, + "num_input_tokens_seen": 18840580, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.26904297, + "step": 879, + "time_per_iteration": 2.492424964904785 + }, + { + "auxiliary_loss_clip": 0.01225545, + "auxiliary_loss_mlp": 0.01067272, + "balance_loss_clip": 1.0637784, + "balance_loss_mlp": 1.03959179, + "epoch": 0.05290846234781302, + "flos": 16472081623680.0, + "grad_norm": 2.1411000257303074, + "language_loss": 0.84393352, + "learning_rate": 3.9944980048302985e-06, + "loss": 0.8668617, + "num_input_tokens_seen": 18859295, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.27697754, + "step": 880, + "time_per_iteration": 2.4576222896575928 + }, + { + "auxiliary_loss_clip": 0.0123297, + "auxiliary_loss_mlp": 0.01065516, + "balance_loss_clip": 1.06592989, + "balance_loss_mlp": 1.0377996, + "epoch": 0.05296858560048098, + "flos": 19865173000320.0, + "grad_norm": 2.070137110646, + "language_loss": 0.87136263, + "learning_rate": 3.994469098399906e-06, + "loss": 0.89434749, + "num_input_tokens_seen": 18877485, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.27722168, + "step": 881, + "time_per_iteration": 2.448498487472534 + }, + { + "auxiliary_loss_clip": 0.01236517, + "auxiliary_loss_mlp": 0.01071071, + "balance_loss_clip": 1.06850481, + "balance_loss_mlp": 1.04246116, + "epoch": 0.053028708853148955, + "flos": 24388588535040.0, + "grad_norm": 1.9971974925184002, + "language_loss": 0.87876844, + "learning_rate": 3.994440116339046e-06, + "loss": 0.90184438, + "num_input_tokens_seen": 18898275, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.28613281, + "step": 882, + "time_per_iteration": 2.501431703567505 + }, + { + "auxiliary_loss_clip": 0.01233228, + "auxiliary_loss_mlp": 0.01069323, + "balance_loss_clip": 1.06534386, + "balance_loss_mlp": 1.03878188, + "epoch": 0.05308883210581693, + "flos": 36393166143360.0, + "grad_norm": 2.137580318149728, + "language_loss": 0.69329876, + "learning_rate": 3.994411058648816e-06, + "loss": 0.71632427, + "num_input_tokens_seen": 18920665, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.30541992, + "step": 883, + "time_per_iteration": 2.6513867378234863 + }, + { + "auxiliary_loss_clip": 0.01225781, + "auxiliary_loss_mlp": 0.01062964, + "balance_loss_clip": 1.06562781, + "balance_loss_mlp": 1.03633249, + "epoch": 0.05314895535848489, + "flos": 22855095146880.0, + "grad_norm": 2.0826867135549723, + "language_loss": 0.76215148, + "learning_rate": 3.994381925330319e-06, + "loss": 0.78503895, + "num_input_tokens_seen": 18939835, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.26660156, + "step": 884, + "time_per_iteration": 2.556825876235962 + }, + { + "auxiliary_loss_clip": 0.01224145, + "auxiliary_loss_mlp": 0.01062866, + "balance_loss_clip": 1.06348014, + "balance_loss_mlp": 1.03649735, + "epoch": 0.053209078611152864, + "flos": 12860330204160.0, + "grad_norm": 2.5646045140709046, + "language_loss": 0.85753453, + "learning_rate": 3.994352716384659e-06, + "loss": 0.88040471, + "num_input_tokens_seen": 18958405, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.26342773, + "step": 885, + "time_per_iteration": 2.459530830383301 + }, + { + "auxiliary_loss_clip": 0.01229251, + "auxiliary_loss_mlp": 0.01070749, + "balance_loss_clip": 1.06386161, + "balance_loss_mlp": 1.04323566, + "epoch": 0.05326920186382083, + "flos": 12164596698240.0, + "grad_norm": 2.791474282603582, + "language_loss": 0.86167258, + "learning_rate": 3.994323431812945e-06, + "loss": 0.88467258, + "num_input_tokens_seen": 18975445, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.27539062, + "step": 886, + "time_per_iteration": 2.4496042728424072 + }, + { + "auxiliary_loss_clip": 0.01225018, + "auxiliary_loss_mlp": 0.01064291, + "balance_loss_clip": 1.06259453, + "balance_loss_mlp": 1.03589511, + "epoch": 0.0533293251164888, + "flos": 22704485420160.0, + "grad_norm": 1.9841073758618601, + "language_loss": 0.89672601, + "learning_rate": 3.994294071616286e-06, + "loss": 0.91961914, + "num_input_tokens_seen": 18991930, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.28381348, + "step": 887, + "time_per_iteration": 2.442070245742798 + }, + { + "auxiliary_loss_clip": 0.01223037, + "auxiliary_loss_mlp": 0.01069938, + "balance_loss_clip": 1.05883789, + "balance_loss_mlp": 1.04088712, + "epoch": 0.053389448369156774, + "flos": 26940939200640.0, + "grad_norm": 2.0455718628209367, + "language_loss": 0.75477028, + "learning_rate": 3.994264635795796e-06, + "loss": 0.77770001, + "num_input_tokens_seen": 19009790, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.29052734, + "step": 888, + "time_per_iteration": 2.5099077224731445 + }, + { + "auxiliary_loss_clip": 0.01238423, + "auxiliary_loss_mlp": 0.01078577, + "balance_loss_clip": 1.07165241, + "balance_loss_mlp": 1.04970419, + "epoch": 0.05344957162182474, + "flos": 25556331686400.0, + "grad_norm": 1.880019730181634, + "language_loss": 0.88142896, + "learning_rate": 3.994235124352592e-06, + "loss": 0.90459895, + "num_input_tokens_seen": 19030170, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.2890625, + "step": 889, + "time_per_iteration": 3.8910531997680664 + }, + { + "auxiliary_loss_clip": 0.01248206, + "auxiliary_loss_mlp": 0.01051488, + "balance_loss_clip": 1.08295524, + "balance_loss_mlp": 1.02505922, + "epoch": 0.05350969487449271, + "flos": 19719591177600.0, + "grad_norm": 1.7409150333894947, + "language_loss": 0.88388127, + "learning_rate": 3.994205537287791e-06, + "loss": 0.90687823, + "num_input_tokens_seen": 19048075, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.2644043, + "step": 890, + "time_per_iteration": 3.9073686599731445 + }, + { + "auxiliary_loss_clip": 0.01228517, + "auxiliary_loss_mlp": 0.01071596, + "balance_loss_clip": 1.06517124, + "balance_loss_mlp": 1.0458113, + "epoch": 0.053569818127160676, + "flos": 27016351804800.0, + "grad_norm": 2.2366968138465446, + "language_loss": 0.93168914, + "learning_rate": 3.994175874602517e-06, + "loss": 0.95469034, + "num_input_tokens_seen": 19067465, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.25769043, + "step": 891, + "time_per_iteration": 2.495420217514038 + }, + { + "auxiliary_loss_clip": 0.01222574, + "auxiliary_loss_mlp": 0.01067653, + "balance_loss_clip": 1.06240261, + "balance_loss_mlp": 1.03925765, + "epoch": 0.05362994137982865, + "flos": 13188338225280.0, + "grad_norm": 1.8566309926503914, + "language_loss": 0.72009993, + "learning_rate": 3.994146136297893e-06, + "loss": 0.74300224, + "num_input_tokens_seen": 19085505, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.28417969, + "step": 892, + "time_per_iteration": 4.0269935131073 + }, + { + "auxiliary_loss_clip": 0.01222963, + "auxiliary_loss_mlp": 0.01069752, + "balance_loss_clip": 1.06191444, + "balance_loss_mlp": 1.04401481, + "epoch": 0.05369006463249662, + "flos": 28658008022400.0, + "grad_norm": 1.6970048708168828, + "language_loss": 0.8234024, + "learning_rate": 3.994116322375049e-06, + "loss": 0.84632957, + "num_input_tokens_seen": 19104360, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.25744629, + "step": 893, + "time_per_iteration": 2.516876459121704 + }, + { + "auxiliary_loss_clip": 0.01226893, + "auxiliary_loss_mlp": 0.01065256, + "balance_loss_clip": 1.06160367, + "balance_loss_mlp": 1.03885078, + "epoch": 0.053750187885164585, + "flos": 28913153304960.0, + "grad_norm": 1.9107233256205725, + "language_loss": 0.81562936, + "learning_rate": 3.994086432835114e-06, + "loss": 0.83855087, + "num_input_tokens_seen": 19124680, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.26403809, + "step": 894, + "time_per_iteration": 3.908628225326538 + }, + { + "auxiliary_loss_clip": 0.01225105, + "auxiliary_loss_mlp": 0.01063487, + "balance_loss_clip": 1.06300402, + "balance_loss_mlp": 1.03773797, + "epoch": 0.05381031113783256, + "flos": 15158828476800.0, + "grad_norm": 2.238143806904381, + "language_loss": 0.75525534, + "learning_rate": 3.994056467679221e-06, + "loss": 0.77814126, + "num_input_tokens_seen": 19142895, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.25732422, + "step": 895, + "time_per_iteration": 2.49837064743042 + }, + { + "auxiliary_loss_clip": 0.0123094, + "auxiliary_loss_mlp": 0.01056961, + "balance_loss_clip": 1.06475449, + "balance_loss_mlp": 1.03038979, + "epoch": 0.05387043439050053, + "flos": 21835232288640.0, + "grad_norm": 1.7467011455714396, + "language_loss": 0.8622058, + "learning_rate": 3.9940264269085065e-06, + "loss": 0.88508487, + "num_input_tokens_seen": 19163125, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.26574707, + "step": 896, + "time_per_iteration": 2.5856387615203857 + }, + { + "auxiliary_loss_clip": 0.01231994, + "auxiliary_loss_mlp": 0.01059622, + "balance_loss_clip": 1.06491482, + "balance_loss_mlp": 1.03173935, + "epoch": 0.053930557643168495, + "flos": 17310308382720.0, + "grad_norm": 8.89527021758301, + "language_loss": 0.88504112, + "learning_rate": 3.9939963105241115e-06, + "loss": 0.90795732, + "num_input_tokens_seen": 19179385, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.27856445, + "step": 897, + "time_per_iteration": 2.4232540130615234 + }, + { + "auxiliary_loss_clip": 0.01221983, + "auxiliary_loss_mlp": 0.01066619, + "balance_loss_clip": 1.06097126, + "balance_loss_mlp": 1.0376277, + "epoch": 0.05399068089583647, + "flos": 17348481561600.0, + "grad_norm": 1.785465945518256, + "language_loss": 0.90033728, + "learning_rate": 3.993966118527175e-06, + "loss": 0.92322332, + "num_input_tokens_seen": 19198725, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.2902832, + "step": 898, + "time_per_iteration": 2.4672799110412598 + }, + { + "auxiliary_loss_clip": 0.01230171, + "auxiliary_loss_mlp": 0.01079344, + "balance_loss_clip": 1.06431901, + "balance_loss_mlp": 1.05156779, + "epoch": 0.05405080414850443, + "flos": 17486952491520.0, + "grad_norm": 2.47862238253773, + "language_loss": 0.91496408, + "learning_rate": 3.993935850918845e-06, + "loss": 0.93805921, + "num_input_tokens_seen": 19212380, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.27807617, + "step": 899, + "time_per_iteration": 2.3994693756103516 + }, + { + "auxiliary_loss_clip": 0.0122385, + "auxiliary_loss_mlp": 0.01066008, + "balance_loss_clip": 1.06148982, + "balance_loss_mlp": 1.04038954, + "epoch": 0.054110927401172404, + "flos": 24496787278080.0, + "grad_norm": 1.9276140806159374, + "language_loss": 0.75509393, + "learning_rate": 3.9939055077002665e-06, + "loss": 0.77799249, + "num_input_tokens_seen": 19232235, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.25634766, + "step": 900, + "time_per_iteration": 2.5380148887634277 + }, + { + "auxiliary_loss_clip": 0.01245583, + "auxiliary_loss_mlp": 0.01061871, + "balance_loss_clip": 1.07541513, + "balance_loss_mlp": 1.03625333, + "epoch": 0.054171050653840376, + "flos": 22930040874240.0, + "grad_norm": 2.4452851023916993, + "language_loss": 0.74057907, + "learning_rate": 3.993875088872592e-06, + "loss": 0.76365364, + "num_input_tokens_seen": 19251460, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.25646973, + "step": 901, + "time_per_iteration": 2.5205724239349365 + }, + { + "auxiliary_loss_clip": 0.01226699, + "auxiliary_loss_mlp": 0.01070232, + "balance_loss_clip": 1.06731701, + "balance_loss_mlp": 1.04479277, + "epoch": 0.05423117390650834, + "flos": 12933192942720.0, + "grad_norm": 2.162950837898326, + "language_loss": 0.84966719, + "learning_rate": 3.9938445944369745e-06, + "loss": 0.87263644, + "num_input_tokens_seen": 19269060, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.25427246, + "step": 902, + "time_per_iteration": 2.4286975860595703 + }, + { + "auxiliary_loss_clip": 0.0122243, + "auxiliary_loss_mlp": 0.01062341, + "balance_loss_clip": 1.06025028, + "balance_loss_mlp": 1.03572118, + "epoch": 0.05429129715917631, + "flos": 19901335017600.0, + "grad_norm": 1.857454331632814, + "language_loss": 0.86746287, + "learning_rate": 3.993814024394569e-06, + "loss": 0.89031053, + "num_input_tokens_seen": 19288620, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.26647949, + "step": 903, + "time_per_iteration": 2.478497266769409 + }, + { + "auxiliary_loss_clip": 0.01224166, + "auxiliary_loss_mlp": 0.01057945, + "balance_loss_clip": 1.06264162, + "balance_loss_mlp": 1.0327208, + "epoch": 0.05435142041184428, + "flos": 16908611610240.0, + "grad_norm": 2.331638361585453, + "language_loss": 0.75459421, + "learning_rate": 3.993783378746537e-06, + "loss": 0.77741528, + "num_input_tokens_seen": 19306615, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.25219727, + "step": 904, + "time_per_iteration": 2.4676804542541504 + }, + { + "auxiliary_loss_clip": 0.01229825, + "auxiliary_loss_mlp": 0.01075333, + "balance_loss_clip": 1.06529212, + "balance_loss_mlp": 1.04925001, + "epoch": 0.05441154366451225, + "flos": 23948323534080.0, + "grad_norm": 2.30478199242483, + "language_loss": 0.86049795, + "learning_rate": 3.993752657494039e-06, + "loss": 0.88354957, + "num_input_tokens_seen": 19321680, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.26086426, + "step": 905, + "time_per_iteration": 2.455324411392212 + }, + { + "auxiliary_loss_clip": 0.0122767, + "auxiliary_loss_mlp": 0.01071194, + "balance_loss_clip": 1.06734657, + "balance_loss_mlp": 1.0461359, + "epoch": 0.05447166691718022, + "flos": 19975382904960.0, + "grad_norm": 1.8601815286007235, + "language_loss": 0.74173748, + "learning_rate": 3.993721860638241e-06, + "loss": 0.7647261, + "num_input_tokens_seen": 19339760, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.25073242, + "step": 906, + "time_per_iteration": 2.451489210128784 + }, + { + "auxiliary_loss_clip": 0.01228777, + "auxiliary_loss_mlp": 0.01066155, + "balance_loss_clip": 1.06453609, + "balance_loss_mlp": 1.03959465, + "epoch": 0.05453179016984819, + "flos": 24936513575040.0, + "grad_norm": 2.175899201397055, + "language_loss": 0.87505054, + "learning_rate": 3.993690988180309e-06, + "loss": 0.89799988, + "num_input_tokens_seen": 19359585, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.26574707, + "step": 907, + "time_per_iteration": 2.5327723026275635 + }, + { + "auxiliary_loss_clip": 0.01225095, + "auxiliary_loss_mlp": 0.01080966, + "balance_loss_clip": 1.06371248, + "balance_loss_mlp": 1.05171156, + "epoch": 0.05459191342251616, + "flos": 18115102558080.0, + "grad_norm": 1.6615158763352187, + "language_loss": 0.87129223, + "learning_rate": 3.9936600401214165e-06, + "loss": 0.89435279, + "num_input_tokens_seen": 19378590, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.29272461, + "step": 908, + "time_per_iteration": 2.524829864501953 + }, + { + "auxiliary_loss_clip": 0.01223708, + "auxiliary_loss_mlp": 0.01070456, + "balance_loss_clip": 1.06294763, + "balance_loss_mlp": 1.042418, + "epoch": 0.054652036675184125, + "flos": 19208295031680.0, + "grad_norm": 2.171934217260778, + "language_loss": 0.89397383, + "learning_rate": 3.9936290164627345e-06, + "loss": 0.91691542, + "num_input_tokens_seen": 19397910, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.28076172, + "step": 909, + "time_per_iteration": 2.4779372215270996 + }, + { + "auxiliary_loss_clip": 0.01230408, + "auxiliary_loss_mlp": 0.01079292, + "balance_loss_clip": 1.06599855, + "balance_loss_mlp": 1.04974031, + "epoch": 0.0547121599278521, + "flos": 16325745615360.0, + "grad_norm": 2.644665382581738, + "language_loss": 0.71214962, + "learning_rate": 3.99359791720544e-06, + "loss": 0.73524666, + "num_input_tokens_seen": 19415950, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.29528809, + "step": 910, + "time_per_iteration": 2.529709815979004 + }, + { + "auxiliary_loss_clip": 0.01226063, + "auxiliary_loss_mlp": 0.01058767, + "balance_loss_clip": 1.0643568, + "balance_loss_mlp": 1.03298223, + "epoch": 0.05477228318052007, + "flos": 20339014239360.0, + "grad_norm": 1.7355121110171388, + "language_loss": 0.83656198, + "learning_rate": 3.993566742350714e-06, + "loss": 0.85941029, + "num_input_tokens_seen": 19435275, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.2578125, + "step": 911, + "time_per_iteration": 2.48345947265625 + }, + { + "auxiliary_loss_clip": 0.01229333, + "auxiliary_loss_mlp": 0.01073267, + "balance_loss_clip": 1.06544733, + "balance_loss_mlp": 1.04575384, + "epoch": 0.054832406433188034, + "flos": 21973092687360.0, + "grad_norm": 2.954647410924617, + "language_loss": 0.75980854, + "learning_rate": 3.993535491899736e-06, + "loss": 0.78283459, + "num_input_tokens_seen": 19452090, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.27502441, + "step": 912, + "time_per_iteration": 2.478236436843872 + }, + { + "auxiliary_loss_clip": 0.01216961, + "auxiliary_loss_mlp": 0.01063589, + "balance_loss_clip": 1.06042695, + "balance_loss_mlp": 1.03531265, + "epoch": 0.054892529685856006, + "flos": 16398931576320.0, + "grad_norm": 2.442632868795501, + "language_loss": 0.82700217, + "learning_rate": 3.993504165853694e-06, + "loss": 0.84980774, + "num_input_tokens_seen": 19470865, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.28271484, + "step": 913, + "time_per_iteration": 2.452585458755493 + }, + { + "auxiliary_loss_clip": 0.01222863, + "auxiliary_loss_mlp": 0.01061338, + "balance_loss_clip": 1.06650627, + "balance_loss_mlp": 1.03597021, + "epoch": 0.05495265293852397, + "flos": 23912341084800.0, + "grad_norm": 1.824637939597149, + "language_loss": 0.83946711, + "learning_rate": 3.993472764213772e-06, + "loss": 0.8623091, + "num_input_tokens_seen": 19492145, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.25390625, + "step": 914, + "time_per_iteration": 2.545531749725342 + }, + { + "auxiliary_loss_clip": 0.0122728, + "auxiliary_loss_mlp": 0.01062091, + "balance_loss_clip": 1.06605458, + "balance_loss_mlp": 1.03664029, + "epoch": 0.055012776191191944, + "flos": 23586954756480.0, + "grad_norm": 2.375624165921153, + "language_loss": 0.90119624, + "learning_rate": 3.9934412869811655e-06, + "loss": 0.92408991, + "num_input_tokens_seen": 19511015, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.25476074, + "step": 915, + "time_per_iteration": 2.526975393295288 + }, + { + "auxiliary_loss_clip": 0.01225209, + "auxiliary_loss_mlp": 0.01057825, + "balance_loss_clip": 1.06551456, + "balance_loss_mlp": 1.03232658, + "epoch": 0.055072899443859916, + "flos": 17528501548800.0, + "grad_norm": 1.6820751736901085, + "language_loss": 0.89916885, + "learning_rate": 3.993409734157064e-06, + "loss": 0.92199922, + "num_input_tokens_seen": 19529040, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.25500488, + "step": 916, + "time_per_iteration": 2.511216402053833 + }, + { + "auxiliary_loss_clip": 0.01228892, + "auxiliary_loss_mlp": 0.0107189, + "balance_loss_clip": 1.06540394, + "balance_loss_mlp": 1.04521084, + "epoch": 0.05513302269652788, + "flos": 21687172427520.0, + "grad_norm": 2.5428490442238814, + "language_loss": 0.80033648, + "learning_rate": 3.993378105742666e-06, + "loss": 0.82334435, + "num_input_tokens_seen": 19549540, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.26672363, + "step": 917, + "time_per_iteration": 2.4681501388549805 + }, + { + "auxiliary_loss_clip": 0.01227422, + "auxiliary_loss_mlp": 0.0106577, + "balance_loss_clip": 1.06380463, + "balance_loss_mlp": 1.03917432, + "epoch": 0.05519314594919585, + "flos": 21613340021760.0, + "grad_norm": 1.9198925170679295, + "language_loss": 0.79818046, + "learning_rate": 3.9933464017391705e-06, + "loss": 0.82111239, + "num_input_tokens_seen": 19567570, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.26623535, + "step": 918, + "time_per_iteration": 2.4879539012908936 + }, + { + "auxiliary_loss_clip": 0.01222697, + "auxiliary_loss_mlp": 0.01061175, + "balance_loss_clip": 1.06148458, + "balance_loss_mlp": 1.03524661, + "epoch": 0.05525326920186382, + "flos": 21798567480960.0, + "grad_norm": 1.9717061195284256, + "language_loss": 0.89191687, + "learning_rate": 3.99331462214778e-06, + "loss": 0.91475558, + "num_input_tokens_seen": 19585330, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.2590332, + "step": 919, + "time_per_iteration": 2.437268018722534 + }, + { + "auxiliary_loss_clip": 0.01218692, + "auxiliary_loss_mlp": 0.01065295, + "balance_loss_clip": 1.05987346, + "balance_loss_mlp": 1.0388782, + "epoch": 0.05531339245453179, + "flos": 28439635288320.0, + "grad_norm": 3.2180795585529918, + "language_loss": 0.87122852, + "learning_rate": 3.993282766969699e-06, + "loss": 0.89406842, + "num_input_tokens_seen": 19604970, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.2644043, + "step": 920, + "time_per_iteration": 2.5781993865966797 + }, + { + "auxiliary_loss_clip": 0.01220096, + "auxiliary_loss_mlp": 0.01063587, + "balance_loss_clip": 1.06273651, + "balance_loss_mlp": 1.03810024, + "epoch": 0.05537351570719976, + "flos": 37375143131520.0, + "grad_norm": 2.2033735707044193, + "language_loss": 0.66161078, + "learning_rate": 3.993250836206136e-06, + "loss": 0.68444765, + "num_input_tokens_seen": 19626235, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.25488281, + "step": 921, + "time_per_iteration": 2.6077892780303955 + }, + { + "auxiliary_loss_clip": 0.01229301, + "auxiliary_loss_mlp": 0.01068198, + "balance_loss_clip": 1.06498384, + "balance_loss_mlp": 1.03899181, + "epoch": 0.05543363895986773, + "flos": 20084479488000.0, + "grad_norm": 1.8281920419401891, + "language_loss": 0.72068375, + "learning_rate": 3.993218829858301e-06, + "loss": 0.74365878, + "num_input_tokens_seen": 19644305, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.29211426, + "step": 922, + "time_per_iteration": 2.435030937194824 + }, + { + "auxiliary_loss_clip": 0.01229553, + "auxiliary_loss_mlp": 0.01069199, + "balance_loss_clip": 1.06594396, + "balance_loss_mlp": 1.04243648, + "epoch": 0.0554937622125357, + "flos": 24533200690560.0, + "grad_norm": 3.923207568293388, + "language_loss": 0.82316554, + "learning_rate": 3.993186747927408e-06, + "loss": 0.84615302, + "num_input_tokens_seen": 19662130, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.26757812, + "step": 923, + "time_per_iteration": 2.4467434883117676 + }, + { + "auxiliary_loss_clip": 0.01224896, + "auxiliary_loss_mlp": 0.01070188, + "balance_loss_clip": 1.06324506, + "balance_loss_mlp": 1.04405713, + "epoch": 0.055553885465203665, + "flos": 14320063013760.0, + "grad_norm": 2.2846744325795623, + "language_loss": 0.7870357, + "learning_rate": 3.993154590414675e-06, + "loss": 0.80998647, + "num_input_tokens_seen": 19680715, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.26123047, + "step": 924, + "time_per_iteration": 2.4422872066497803 + }, + { + "auxiliary_loss_clip": 0.01218866, + "auxiliary_loss_mlp": 0.01059788, + "balance_loss_clip": 1.06015277, + "balance_loss_mlp": 1.03319287, + "epoch": 0.05561400871787164, + "flos": 27381132374400.0, + "grad_norm": 2.3485487537058347, + "language_loss": 1.0227344, + "learning_rate": 3.993122357321319e-06, + "loss": 1.04552102, + "num_input_tokens_seen": 19700535, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.26574707, + "step": 925, + "time_per_iteration": 2.515535831451416 + }, + { + "auxiliary_loss_clip": 0.01227385, + "auxiliary_loss_mlp": 0.01053071, + "balance_loss_clip": 1.06403136, + "balance_loss_mlp": 1.02710724, + "epoch": 0.05567413197053961, + "flos": 23221096778880.0, + "grad_norm": 1.9681583638409963, + "language_loss": 0.80962956, + "learning_rate": 3.993090048648564e-06, + "loss": 0.83243418, + "num_input_tokens_seen": 19718825, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.2598877, + "step": 926, + "time_per_iteration": 2.4878387451171875 + }, + { + "auxiliary_loss_clip": 0.01231337, + "auxiliary_loss_mlp": 0.0106665, + "balance_loss_clip": 1.06608796, + "balance_loss_mlp": 1.03900528, + "epoch": 0.055734255223207574, + "flos": 25264952559360.0, + "grad_norm": 3.7232114451642513, + "language_loss": 0.7352066, + "learning_rate": 3.993057664397634e-06, + "loss": 0.75818646, + "num_input_tokens_seen": 19739080, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.2767334, + "step": 927, + "time_per_iteration": 2.4624483585357666 + }, + { + "auxiliary_loss_clip": 0.01108699, + "auxiliary_loss_mlp": 0.01038347, + "balance_loss_clip": 1.03095889, + "balance_loss_mlp": 1.031147, + "epoch": 0.055794378475875546, + "flos": 66503116702080.0, + "grad_norm": 0.7935495794327488, + "language_loss": 0.59788609, + "learning_rate": 3.9930252045697585e-06, + "loss": 0.61935651, + "num_input_tokens_seen": 19802960, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.07189941, + "step": 928, + "time_per_iteration": 3.115497589111328 + }, + { + "auxiliary_loss_clip": 0.01228528, + "auxiliary_loss_mlp": 0.01065897, + "balance_loss_clip": 1.06554174, + "balance_loss_mlp": 1.03849053, + "epoch": 0.05585450172854351, + "flos": 25337635729920.0, + "grad_norm": 2.486700103784199, + "language_loss": 0.94964671, + "learning_rate": 3.992992669166168e-06, + "loss": 0.97259092, + "num_input_tokens_seen": 19822765, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.27404785, + "step": 929, + "time_per_iteration": 2.483705520629883 + }, + { + "auxiliary_loss_clip": 0.01228449, + "auxiliary_loss_mlp": 0.01072244, + "balance_loss_clip": 1.06520748, + "balance_loss_mlp": 1.04182148, + "epoch": 0.05591462498121148, + "flos": 33911738881920.0, + "grad_norm": 2.0284713705441435, + "language_loss": 0.71968091, + "learning_rate": 3.992960058188094e-06, + "loss": 0.74268782, + "num_input_tokens_seen": 19843590, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.30419922, + "step": 930, + "time_per_iteration": 2.556813955307007 + }, + { + "auxiliary_loss_clip": 0.01230017, + "auxiliary_loss_mlp": 0.01066558, + "balance_loss_clip": 1.06577373, + "balance_loss_mlp": 1.03917575, + "epoch": 0.055974748233879455, + "flos": 17930880679680.0, + "grad_norm": 3.2763129866123464, + "language_loss": 0.85403901, + "learning_rate": 3.992927371636776e-06, + "loss": 0.87700474, + "num_input_tokens_seen": 19860230, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.27380371, + "step": 931, + "time_per_iteration": 2.4151053428649902 + }, + { + "auxiliary_loss_clip": 0.01223609, + "auxiliary_loss_mlp": 0.01073907, + "balance_loss_clip": 1.06192172, + "balance_loss_mlp": 1.044891, + "epoch": 0.05603487148654742, + "flos": 24021976371840.0, + "grad_norm": 1.7486888204919528, + "language_loss": 0.8368606, + "learning_rate": 3.9928946095134525e-06, + "loss": 0.85983574, + "num_input_tokens_seen": 19880795, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.29052734, + "step": 932, + "time_per_iteration": 2.5269272327423096 + }, + { + "auxiliary_loss_clip": 0.01244946, + "auxiliary_loss_mlp": 0.01074053, + "balance_loss_clip": 1.08144367, + "balance_loss_mlp": 1.0454905, + "epoch": 0.05609499473921539, + "flos": 17307758517120.0, + "grad_norm": 2.892401174343615, + "language_loss": 0.7350142, + "learning_rate": 3.992861771819365e-06, + "loss": 0.75820422, + "num_input_tokens_seen": 19897960, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.28588867, + "step": 933, + "time_per_iteration": 5.308983325958252 + }, + { + "auxiliary_loss_clip": 0.01219368, + "auxiliary_loss_mlp": 0.01080612, + "balance_loss_clip": 1.05934119, + "balance_loss_mlp": 1.05119085, + "epoch": 0.05615511799188336, + "flos": 20994742972800.0, + "grad_norm": 7.104857454200674, + "language_loss": 0.8671267, + "learning_rate": 3.99282885855576e-06, + "loss": 0.89012647, + "num_input_tokens_seen": 19913315, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.29418945, + "step": 934, + "time_per_iteration": 2.4449172019958496 + }, + { + "auxiliary_loss_clip": 0.01220211, + "auxiliary_loss_mlp": 0.01070396, + "balance_loss_clip": 1.06321955, + "balance_loss_mlp": 1.04480135, + "epoch": 0.05621524124455133, + "flos": 17273535834240.0, + "grad_norm": 2.213340993052146, + "language_loss": 0.8037554, + "learning_rate": 3.992795869723885e-06, + "loss": 0.82666147, + "num_input_tokens_seen": 19928790, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.2557373, + "step": 935, + "time_per_iteration": 2.4034488201141357 + }, + { + "auxiliary_loss_clip": 0.01107547, + "auxiliary_loss_mlp": 0.01017889, + "balance_loss_clip": 1.03093696, + "balance_loss_mlp": 1.01257241, + "epoch": 0.0562753644972193, + "flos": 58719370458240.0, + "grad_norm": 0.823379866070044, + "language_loss": 0.69191229, + "learning_rate": 3.99276280532499e-06, + "loss": 0.71316671, + "num_input_tokens_seen": 19988785, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.05322266, + "step": 936, + "time_per_iteration": 4.3521177768707275 + }, + { + "auxiliary_loss_clip": 0.0122501, + "auxiliary_loss_mlp": 0.01068592, + "balance_loss_clip": 1.06230402, + "balance_loss_mlp": 1.04261589, + "epoch": 0.05633548774988727, + "flos": 17457039440640.0, + "grad_norm": 2.624081435648179, + "language_loss": 0.75908083, + "learning_rate": 3.992729665360331e-06, + "loss": 0.78201687, + "num_input_tokens_seen": 20007685, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.26000977, + "step": 937, + "time_per_iteration": 3.8954944610595703 + }, + { + "auxiliary_loss_clip": 0.01105988, + "auxiliary_loss_mlp": 0.01015675, + "balance_loss_clip": 1.02995634, + "balance_loss_mlp": 1.0104537, + "epoch": 0.05639561100255524, + "flos": 70654928083200.0, + "grad_norm": 0.8528772053181057, + "language_loss": 0.64383119, + "learning_rate": 3.992696449831162e-06, + "loss": 0.66504776, + "num_input_tokens_seen": 20072750, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.05218506, + "step": 938, + "time_per_iteration": 3.0503649711608887 + }, + { + "auxiliary_loss_clip": 0.01234728, + "auxiliary_loss_mlp": 0.01071719, + "balance_loss_clip": 1.06720483, + "balance_loss_mlp": 1.04430056, + "epoch": 0.056455734255223204, + "flos": 20485996692480.0, + "grad_norm": 2.3656827341774034, + "language_loss": 0.79556072, + "learning_rate": 3.992663158738745e-06, + "loss": 0.81862521, + "num_input_tokens_seen": 20089070, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.27416992, + "step": 939, + "time_per_iteration": 2.477509021759033 + }, + { + "auxiliary_loss_clip": 0.01222537, + "auxiliary_loss_mlp": 0.01064509, + "balance_loss_clip": 1.06392288, + "balance_loss_mlp": 1.03834224, + "epoch": 0.056515857507891176, + "flos": 22053569109120.0, + "grad_norm": 1.678135974363741, + "language_loss": 0.74089611, + "learning_rate": 3.992629792084341e-06, + "loss": 0.76376653, + "num_input_tokens_seen": 20108790, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.26171875, + "step": 940, + "time_per_iteration": 2.5253589153289795 + }, + { + "auxiliary_loss_clip": 0.01221939, + "auxiliary_loss_mlp": 0.01063816, + "balance_loss_clip": 1.0643518, + "balance_loss_mlp": 1.0379951, + "epoch": 0.05657598076055915, + "flos": 24025316336640.0, + "grad_norm": 2.461191439665901, + "language_loss": 0.71400726, + "learning_rate": 3.992596349869216e-06, + "loss": 0.73686481, + "num_input_tokens_seen": 20128455, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.25817871, + "step": 941, + "time_per_iteration": 2.564823627471924 + }, + { + "auxiliary_loss_clip": 0.01219113, + "auxiliary_loss_mlp": 0.01063739, + "balance_loss_clip": 1.06246054, + "balance_loss_mlp": 1.03717947, + "epoch": 0.05663610401322711, + "flos": 20480609652480.0, + "grad_norm": 2.1105430254222957, + "language_loss": 0.80845374, + "learning_rate": 3.992562832094637e-06, + "loss": 0.83128226, + "num_input_tokens_seen": 20145775, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.265625, + "step": 942, + "time_per_iteration": 2.4409353733062744 + }, + { + "auxiliary_loss_clip": 0.0121904, + "auxiliary_loss_mlp": 0.01067235, + "balance_loss_clip": 1.05993617, + "balance_loss_mlp": 1.04142666, + "epoch": 0.056696227265895086, + "flos": 21069042255360.0, + "grad_norm": 2.0706515859031143, + "language_loss": 0.88426387, + "learning_rate": 3.9925292387618755e-06, + "loss": 0.90712667, + "num_input_tokens_seen": 20164315, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.25805664, + "step": 943, + "time_per_iteration": 2.4589760303497314 + }, + { + "auxiliary_loss_clip": 0.01222476, + "auxiliary_loss_mlp": 0.0106422, + "balance_loss_clip": 1.06443667, + "balance_loss_mlp": 1.03886461, + "epoch": 0.05675635051856306, + "flos": 17821317219840.0, + "grad_norm": 2.120084860221882, + "language_loss": 0.75282371, + "learning_rate": 3.992495569872206e-06, + "loss": 0.77569067, + "num_input_tokens_seen": 20182760, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.25341797, + "step": 944, + "time_per_iteration": 2.42444109916687 + }, + { + "auxiliary_loss_clip": 0.01218364, + "auxiliary_loss_mlp": 0.01065028, + "balance_loss_clip": 1.06071734, + "balance_loss_mlp": 1.04103136, + "epoch": 0.05681647377123102, + "flos": 23114945111040.0, + "grad_norm": 1.7062303037401447, + "language_loss": 0.79403317, + "learning_rate": 3.992461825426906e-06, + "loss": 0.81686711, + "num_input_tokens_seen": 20203830, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.23999023, + "step": 945, + "time_per_iteration": 2.461439847946167 + }, + { + "auxiliary_loss_clip": 0.01235681, + "auxiliary_loss_mlp": 0.01071593, + "balance_loss_clip": 1.07106149, + "balance_loss_mlp": 1.04622519, + "epoch": 0.056876597023898995, + "flos": 16070528505600.0, + "grad_norm": 2.4153711259483592, + "language_loss": 0.82800663, + "learning_rate": 3.992428005427252e-06, + "loss": 0.85107934, + "num_input_tokens_seen": 20220365, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.25366211, + "step": 946, + "time_per_iteration": 2.432187080383301 + }, + { + "auxiliary_loss_clip": 0.01232981, + "auxiliary_loss_mlp": 0.01062904, + "balance_loss_clip": 1.0689975, + "balance_loss_mlp": 1.03564095, + "epoch": 0.05693672027656696, + "flos": 16835641130880.0, + "grad_norm": 1.7642998561483771, + "language_loss": 0.78916216, + "learning_rate": 3.992394109874529e-06, + "loss": 0.81212103, + "num_input_tokens_seen": 20238640, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.27258301, + "step": 947, + "time_per_iteration": 2.4638116359710693 + }, + { + "auxiliary_loss_clip": 0.01231744, + "auxiliary_loss_mlp": 0.01072022, + "balance_loss_clip": 1.06867719, + "balance_loss_mlp": 1.04636836, + "epoch": 0.05699684352923493, + "flos": 21389113370880.0, + "grad_norm": 3.516163612984281, + "language_loss": 0.85670769, + "learning_rate": 3.9923601387700225e-06, + "loss": 0.8797453, + "num_input_tokens_seen": 20251025, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.25671387, + "step": 948, + "time_per_iteration": 2.3983516693115234 + }, + { + "auxiliary_loss_clip": 0.01220772, + "auxiliary_loss_mlp": 0.01064893, + "balance_loss_clip": 1.06245708, + "balance_loss_mlp": 1.03786826, + "epoch": 0.057056966781902904, + "flos": 15560309767680.0, + "grad_norm": 2.0993078914640027, + "language_loss": 0.87463784, + "learning_rate": 3.992326092115019e-06, + "loss": 0.89749449, + "num_input_tokens_seen": 20269775, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.2701416, + "step": 949, + "time_per_iteration": 2.4703152179718018 + }, + { + "auxiliary_loss_clip": 0.01227584, + "auxiliary_loss_mlp": 0.01066534, + "balance_loss_clip": 1.06936169, + "balance_loss_mlp": 1.04201281, + "epoch": 0.05711709003457087, + "flos": 19937856170880.0, + "grad_norm": 2.4085520687843536, + "language_loss": 0.78937668, + "learning_rate": 3.992291969910811e-06, + "loss": 0.81231791, + "num_input_tokens_seen": 20287715, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.24536133, + "step": 950, + "time_per_iteration": 2.5238752365112305 + }, + { + "auxiliary_loss_clip": 0.01224549, + "auxiliary_loss_mlp": 0.01075131, + "balance_loss_clip": 1.06239152, + "balance_loss_mlp": 1.04720044, + "epoch": 0.05717721328723884, + "flos": 30332701774080.0, + "grad_norm": 2.1934888316495074, + "language_loss": 0.82517815, + "learning_rate": 3.992257772158691e-06, + "loss": 0.84817493, + "num_input_tokens_seen": 20307070, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.27941895, + "step": 951, + "time_per_iteration": 2.676485776901245 + }, + { + "auxiliary_loss_clip": 0.01216995, + "auxiliary_loss_mlp": 0.01061676, + "balance_loss_clip": 1.05812144, + "balance_loss_mlp": 1.0343653, + "epoch": 0.05723733653990681, + "flos": 23654358627840.0, + "grad_norm": 2.47424307949693, + "language_loss": 0.86875451, + "learning_rate": 3.992223498859958e-06, + "loss": 0.89154124, + "num_input_tokens_seen": 20324945, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.27307129, + "step": 952, + "time_per_iteration": 2.481956720352173 + }, + { + "auxiliary_loss_clip": 0.01233376, + "auxiliary_loss_mlp": 0.01064971, + "balance_loss_clip": 1.06645823, + "balance_loss_mlp": 1.03645587, + "epoch": 0.05729745979257478, + "flos": 22055759838720.0, + "grad_norm": 1.9262918398271864, + "language_loss": 0.79108977, + "learning_rate": 3.9921891500159084e-06, + "loss": 0.81407326, + "num_input_tokens_seen": 20346135, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.28491211, + "step": 953, + "time_per_iteration": 2.4861347675323486 + }, + { + "auxiliary_loss_clip": 0.01225091, + "auxiliary_loss_mlp": 0.0106976, + "balance_loss_clip": 1.06584728, + "balance_loss_mlp": 1.04031515, + "epoch": 0.05735758304524275, + "flos": 19604353368960.0, + "grad_norm": 3.6976471567420397, + "language_loss": 0.86629343, + "learning_rate": 3.992154725627848e-06, + "loss": 0.88924193, + "num_input_tokens_seen": 20364450, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.29443359, + "step": 954, + "time_per_iteration": 2.4797918796539307 + }, + { + "auxiliary_loss_clip": 0.01227866, + "auxiliary_loss_mlp": 0.0106314, + "balance_loss_clip": 1.0651592, + "balance_loss_mlp": 1.03700948, + "epoch": 0.057417706297910716, + "flos": 19099018880640.0, + "grad_norm": 2.3414510500689465, + "language_loss": 0.88451809, + "learning_rate": 3.9921202256970804e-06, + "loss": 0.90742821, + "num_input_tokens_seen": 20383500, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.26123047, + "step": 955, + "time_per_iteration": 2.490349054336548 + }, + { + "auxiliary_loss_clip": 0.01216924, + "auxiliary_loss_mlp": 0.01067295, + "balance_loss_clip": 1.05934715, + "balance_loss_mlp": 1.04080701, + "epoch": 0.05747782955057869, + "flos": 16654507822080.0, + "grad_norm": 2.2360617657252155, + "language_loss": 0.89516568, + "learning_rate": 3.992085650224914e-06, + "loss": 0.91800785, + "num_input_tokens_seen": 20400295, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.26501465, + "step": 956, + "time_per_iteration": 2.4413809776306152 + }, + { + "auxiliary_loss_clip": 0.01222915, + "auxiliary_loss_mlp": 0.01057751, + "balance_loss_clip": 1.06666124, + "balance_loss_mlp": 1.03073776, + "epoch": 0.05753795280324665, + "flos": 14502058248960.0, + "grad_norm": 1.7530972480240365, + "language_loss": 0.75672376, + "learning_rate": 3.99205099921266e-06, + "loss": 0.77953041, + "num_input_tokens_seen": 20419085, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.27001953, + "step": 957, + "time_per_iteration": 2.5036394596099854 + }, + { + "auxiliary_loss_clip": 0.01229636, + "auxiliary_loss_mlp": 0.01076955, + "balance_loss_clip": 1.06530631, + "balance_loss_mlp": 1.04764163, + "epoch": 0.057598076055914625, + "flos": 18076318848000.0, + "grad_norm": 2.2766146762869717, + "language_loss": 0.79880226, + "learning_rate": 3.992016272661633e-06, + "loss": 0.82186818, + "num_input_tokens_seen": 20437465, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.29309082, + "step": 958, + "time_per_iteration": 2.4148318767547607 + }, + { + "auxiliary_loss_clip": 0.01228352, + "auxiliary_loss_mlp": 0.01064665, + "balance_loss_clip": 1.06563377, + "balance_loss_mlp": 1.03899908, + "epoch": 0.0576581993085826, + "flos": 22124600254080.0, + "grad_norm": 3.5313577742863256, + "language_loss": 0.88007337, + "learning_rate": 3.99198147057315e-06, + "loss": 0.90300345, + "num_input_tokens_seen": 20456235, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.25671387, + "step": 959, + "time_per_iteration": 2.492666482925415 + }, + { + "auxiliary_loss_clip": 0.01219089, + "auxiliary_loss_mlp": 0.010666, + "balance_loss_clip": 1.06249714, + "balance_loss_mlp": 1.04043341, + "epoch": 0.05771832256125056, + "flos": 33181746779520.0, + "grad_norm": 2.391592329229761, + "language_loss": 0.78943652, + "learning_rate": 3.991946592948529e-06, + "loss": 0.81229341, + "num_input_tokens_seen": 20476825, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.26147461, + "step": 960, + "time_per_iteration": 2.50657057762146 + }, + { + "auxiliary_loss_clip": 0.01224044, + "auxiliary_loss_mlp": 0.01064458, + "balance_loss_clip": 1.06216145, + "balance_loss_mlp": 1.03662217, + "epoch": 0.057778445813918534, + "flos": 24170143973760.0, + "grad_norm": 3.1829589878304376, + "language_loss": 0.92779028, + "learning_rate": 3.991911639789094e-06, + "loss": 0.95067531, + "num_input_tokens_seen": 20496965, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.27832031, + "step": 961, + "time_per_iteration": 2.5631918907165527 + }, + { + "auxiliary_loss_clip": 0.01223459, + "auxiliary_loss_mlp": 0.01067383, + "balance_loss_clip": 1.06161451, + "balance_loss_mlp": 1.03876138, + "epoch": 0.0578385690665865, + "flos": 29643037666560.0, + "grad_norm": 2.16363971203755, + "language_loss": 0.68305469, + "learning_rate": 3.991876611096169e-06, + "loss": 0.70596308, + "num_input_tokens_seen": 20518035, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.28613281, + "step": 962, + "time_per_iteration": 2.504175901412964 + }, + { + "auxiliary_loss_clip": 0.01220636, + "auxiliary_loss_mlp": 0.0107052, + "balance_loss_clip": 1.06263256, + "balance_loss_mlp": 1.04440093, + "epoch": 0.05789869231925447, + "flos": 20885430908160.0, + "grad_norm": 2.329806015272656, + "language_loss": 0.8905037, + "learning_rate": 3.991841506871084e-06, + "loss": 0.91341525, + "num_input_tokens_seen": 20534740, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.26123047, + "step": 963, + "time_per_iteration": 2.4489948749542236 + }, + { + "auxiliary_loss_clip": 0.01231931, + "auxiliary_loss_mlp": 0.0105812, + "balance_loss_clip": 1.06833601, + "balance_loss_mlp": 1.03096461, + "epoch": 0.057958815571922444, + "flos": 26031106679040.0, + "grad_norm": 2.466916277581825, + "language_loss": 0.84887761, + "learning_rate": 3.99180632711517e-06, + "loss": 0.87177813, + "num_input_tokens_seen": 20553485, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.27172852, + "step": 964, + "time_per_iteration": 2.466386556625366 + }, + { + "auxiliary_loss_clip": 0.01224085, + "auxiliary_loss_mlp": 0.01075131, + "balance_loss_clip": 1.06363416, + "balance_loss_mlp": 1.04399323, + "epoch": 0.05801893882459041, + "flos": 18077683564800.0, + "grad_norm": 3.578510744768065, + "language_loss": 0.77484274, + "learning_rate": 3.99177107182976e-06, + "loss": 0.79783487, + "num_input_tokens_seen": 20572155, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.3112793, + "step": 965, + "time_per_iteration": 2.516301155090332 + }, + { + "auxiliary_loss_clip": 0.0121854, + "auxiliary_loss_mlp": 0.01069312, + "balance_loss_clip": 1.061221, + "balance_loss_mlp": 1.04382467, + "epoch": 0.05807906207725838, + "flos": 17748885444480.0, + "grad_norm": 2.057069285054627, + "language_loss": 0.81609225, + "learning_rate": 3.99173574101619e-06, + "loss": 0.83897078, + "num_input_tokens_seen": 20590395, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.25476074, + "step": 966, + "time_per_iteration": 2.4719748497009277 + }, + { + "auxiliary_loss_clip": 0.01235779, + "auxiliary_loss_mlp": 0.01069842, + "balance_loss_clip": 1.07249379, + "balance_loss_mlp": 1.04427218, + "epoch": 0.058139185329926346, + "flos": 18040372312320.0, + "grad_norm": 3.00047061690183, + "language_loss": 0.76267308, + "learning_rate": 3.9917003346758035e-06, + "loss": 0.78572929, + "num_input_tokens_seen": 20608435, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.25561523, + "step": 967, + "time_per_iteration": 2.4731788635253906 + }, + { + "auxiliary_loss_clip": 0.01114426, + "auxiliary_loss_mlp": 0.01039056, + "balance_loss_clip": 1.03991628, + "balance_loss_mlp": 1.0346868, + "epoch": 0.05819930858259432, + "flos": 62363297485440.0, + "grad_norm": 0.8090175136042265, + "language_loss": 0.57386565, + "learning_rate": 3.991664852809939e-06, + "loss": 0.59540045, + "num_input_tokens_seen": 20668575, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.04376221, + "step": 968, + "time_per_iteration": 2.96885085105896 + }, + { + "auxiliary_loss_clip": 0.01234711, + "auxiliary_loss_mlp": 0.01060733, + "balance_loss_clip": 1.07176518, + "balance_loss_mlp": 1.03217077, + "epoch": 0.05825943183526229, + "flos": 19135360465920.0, + "grad_norm": 2.42618029891339, + "language_loss": 0.82112896, + "learning_rate": 3.991629295419945e-06, + "loss": 0.84408343, + "num_input_tokens_seen": 20687355, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.28552246, + "step": 969, + "time_per_iteration": 2.4804275035858154 + }, + { + "auxiliary_loss_clip": 0.01227404, + "auxiliary_loss_mlp": 0.01059661, + "balance_loss_clip": 1.06551516, + "balance_loss_mlp": 1.03258848, + "epoch": 0.058319555087930255, + "flos": 29022465369600.0, + "grad_norm": 2.314175106508411, + "language_loss": 0.77930021, + "learning_rate": 3.991593662507167e-06, + "loss": 0.80217087, + "num_input_tokens_seen": 20705710, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.27062988, + "step": 970, + "time_per_iteration": 2.5059261322021484 + }, + { + "auxiliary_loss_clip": 0.01235227, + "auxiliary_loss_mlp": 0.01064077, + "balance_loss_clip": 1.07066154, + "balance_loss_mlp": 1.03637254, + "epoch": 0.05837967834059823, + "flos": 18879999701760.0, + "grad_norm": 2.8669165415466313, + "language_loss": 0.92225909, + "learning_rate": 3.991557954072958e-06, + "loss": 0.94525218, + "num_input_tokens_seen": 20722405, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.27709961, + "step": 971, + "time_per_iteration": 2.4729137420654297 + }, + { + "auxiliary_loss_clip": 0.01219986, + "auxiliary_loss_mlp": 0.01058788, + "balance_loss_clip": 1.06024122, + "balance_loss_mlp": 1.03116691, + "epoch": 0.05843980159326619, + "flos": 25703062744320.0, + "grad_norm": 1.747992174214764, + "language_loss": 0.85968935, + "learning_rate": 3.991522170118673e-06, + "loss": 0.88247716, + "num_input_tokens_seen": 20741480, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.27612305, + "step": 972, + "time_per_iteration": 2.481921911239624 + }, + { + "auxiliary_loss_clip": 0.01222885, + "auxiliary_loss_mlp": 0.01067167, + "balance_loss_clip": 1.06577754, + "balance_loss_mlp": 1.0417037, + "epoch": 0.058499924845934165, + "flos": 25552129795200.0, + "grad_norm": 2.2394725908704856, + "language_loss": 0.87179399, + "learning_rate": 3.991486310645667e-06, + "loss": 0.89469445, + "num_input_tokens_seen": 20759685, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.25463867, + "step": 973, + "time_per_iteration": 2.4816393852233887 + }, + { + "auxiliary_loss_clip": 0.0122216, + "auxiliary_loss_mlp": 0.01070684, + "balance_loss_clip": 1.06334162, + "balance_loss_mlp": 1.04259837, + "epoch": 0.05856004809860214, + "flos": 16436171001600.0, + "grad_norm": 2.0501809059528155, + "language_loss": 0.74956059, + "learning_rate": 3.991450375655301e-06, + "loss": 0.77248901, + "num_input_tokens_seen": 20778180, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.28088379, + "step": 974, + "time_per_iteration": 2.43070650100708 + }, + { + "auxiliary_loss_clip": 0.01223544, + "auxiliary_loss_mlp": 0.01063978, + "balance_loss_clip": 1.06529307, + "balance_loss_mlp": 1.03691792, + "epoch": 0.0586201713512701, + "flos": 39458824116480.0, + "grad_norm": 2.0264212908513213, + "language_loss": 0.76691246, + "learning_rate": 3.991414365148936e-06, + "loss": 0.78978777, + "num_input_tokens_seen": 20802705, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.27050781, + "step": 975, + "time_per_iteration": 2.578415632247925 + }, + { + "auxiliary_loss_clip": 0.01231699, + "auxiliary_loss_mlp": 0.01063223, + "balance_loss_clip": 1.06791496, + "balance_loss_mlp": 1.03657985, + "epoch": 0.058680294603938074, + "flos": 23365170230400.0, + "grad_norm": 1.9921221244190526, + "language_loss": 0.76780188, + "learning_rate": 3.99137827912794e-06, + "loss": 0.79075104, + "num_input_tokens_seen": 20822540, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.26635742, + "step": 976, + "time_per_iteration": 3.966905355453491 + }, + { + "auxiliary_loss_clip": 0.01223719, + "auxiliary_loss_mlp": 0.01070763, + "balance_loss_clip": 1.06282449, + "balance_loss_mlp": 1.04419124, + "epoch": 0.05874041785660604, + "flos": 32232017226240.0, + "grad_norm": 1.8939830532732767, + "language_loss": 0.87446308, + "learning_rate": 3.991342117593679e-06, + "loss": 0.89740789, + "num_input_tokens_seen": 20844175, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.26574707, + "step": 977, + "time_per_iteration": 4.064071416854858 + }, + { + "auxiliary_loss_clip": 0.01221699, + "auxiliary_loss_mlp": 0.01069791, + "balance_loss_clip": 1.06440091, + "balance_loss_mlp": 1.04244423, + "epoch": 0.05880054110927401, + "flos": 22310043194880.0, + "grad_norm": 1.9040444337384375, + "language_loss": 0.79288781, + "learning_rate": 3.991305880547527e-06, + "loss": 0.81580269, + "num_input_tokens_seen": 20864730, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.27319336, + "step": 978, + "time_per_iteration": 2.475630283355713 + }, + { + "auxiliary_loss_clip": 0.01230405, + "auxiliary_loss_mlp": 0.01080181, + "balance_loss_clip": 1.06481576, + "balance_loss_mlp": 1.05114186, + "epoch": 0.05886066436194198, + "flos": 27380450016000.0, + "grad_norm": 1.9033831380995085, + "language_loss": 0.80702007, + "learning_rate": 3.991269567990855e-06, + "loss": 0.83012593, + "num_input_tokens_seen": 20885200, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.29077148, + "step": 979, + "time_per_iteration": 3.9052536487579346 + }, + { + "auxiliary_loss_clip": 0.01106631, + "auxiliary_loss_mlp": 0.01009053, + "balance_loss_clip": 1.0316714, + "balance_loss_mlp": 1.00524402, + "epoch": 0.05892078761460995, + "flos": 59584493525760.0, + "grad_norm": 0.932097330775353, + "language_loss": 0.59020364, + "learning_rate": 3.9912331799250415e-06, + "loss": 0.61136049, + "num_input_tokens_seen": 20940325, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.0380249, + "step": 980, + "time_per_iteration": 4.384996175765991 + }, + { + "auxiliary_loss_clip": 0.01221543, + "auxiliary_loss_mlp": 0.01069038, + "balance_loss_clip": 1.06310821, + "balance_loss_mlp": 1.04183435, + "epoch": 0.05898091086727792, + "flos": 15414081500160.0, + "grad_norm": 2.0369159027988566, + "language_loss": 0.86649561, + "learning_rate": 3.9911967163514665e-06, + "loss": 0.88940144, + "num_input_tokens_seen": 20958220, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.2722168, + "step": 981, + "time_per_iteration": 2.4370198249816895 + }, + { + "auxiliary_loss_clip": 0.01228477, + "auxiliary_loss_mlp": 0.01062837, + "balance_loss_clip": 1.06689835, + "balance_loss_mlp": 1.03762388, + "epoch": 0.059041034119945886, + "flos": 23655328295040.0, + "grad_norm": 2.0118743968689534, + "language_loss": 0.79462266, + "learning_rate": 3.991160177271513e-06, + "loss": 0.81753582, + "num_input_tokens_seen": 20978920, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.25231934, + "step": 982, + "time_per_iteration": 2.5124058723449707 + }, + { + "auxiliary_loss_clip": 0.01235228, + "auxiliary_loss_mlp": 0.01061148, + "balance_loss_clip": 1.06762266, + "balance_loss_mlp": 1.03480268, + "epoch": 0.05910115737261386, + "flos": 24754087376640.0, + "grad_norm": 2.345954236635011, + "language_loss": 0.84354937, + "learning_rate": 3.9911235626865654e-06, + "loss": 0.86651313, + "num_input_tokens_seen": 20999490, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.26318359, + "step": 983, + "time_per_iteration": 2.473412036895752 + }, + { + "auxiliary_loss_clip": 0.01220635, + "auxiliary_loss_mlp": 0.01067144, + "balance_loss_clip": 1.06420636, + "balance_loss_mlp": 1.04231238, + "epoch": 0.05916128062528183, + "flos": 11728749070080.0, + "grad_norm": 2.1841698015645354, + "language_loss": 0.84629691, + "learning_rate": 3.9910868725980125e-06, + "loss": 0.86917472, + "num_input_tokens_seen": 21017865, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.24865723, + "step": 984, + "time_per_iteration": 2.461419105529785 + }, + { + "auxiliary_loss_clip": 0.01218247, + "auxiliary_loss_mlp": 0.01055455, + "balance_loss_clip": 1.06342435, + "balance_loss_mlp": 1.03067136, + "epoch": 0.059221403877949795, + "flos": 21902995296000.0, + "grad_norm": 2.1210150770729945, + "language_loss": 0.77317446, + "learning_rate": 3.9910501070072465e-06, + "loss": 0.79591143, + "num_input_tokens_seen": 21035900, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.2479248, + "step": 985, + "time_per_iteration": 2.480978488922119 + }, + { + "auxiliary_loss_clip": 0.01232014, + "auxiliary_loss_mlp": 0.01067001, + "balance_loss_clip": 1.06900144, + "balance_loss_mlp": 1.04156232, + "epoch": 0.05928152713061777, + "flos": 20514580940160.0, + "grad_norm": 2.009657787449727, + "language_loss": 0.90766543, + "learning_rate": 3.991013265915661e-06, + "loss": 0.93065554, + "num_input_tokens_seen": 21053235, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.25427246, + "step": 986, + "time_per_iteration": 2.4539194107055664 + }, + { + "auxiliary_loss_clip": 0.01225609, + "auxiliary_loss_mlp": 0.01062573, + "balance_loss_clip": 1.06282842, + "balance_loss_mlp": 1.03408146, + "epoch": 0.05934165038328574, + "flos": 24495135252480.0, + "grad_norm": 2.350333297766312, + "language_loss": 0.75728524, + "learning_rate": 3.9909763493246525e-06, + "loss": 0.78016698, + "num_input_tokens_seen": 21073090, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.28491211, + "step": 987, + "time_per_iteration": 2.5214710235595703 + }, + { + "auxiliary_loss_clip": 0.01245406, + "auxiliary_loss_mlp": 0.01056886, + "balance_loss_clip": 1.07689774, + "balance_loss_mlp": 1.03008747, + "epoch": 0.059401773635953704, + "flos": 38728041914880.0, + "grad_norm": 2.5348716739122095, + "language_loss": 0.71767265, + "learning_rate": 3.990939357235621e-06, + "loss": 0.7406956, + "num_input_tokens_seen": 21094895, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.26818848, + "step": 988, + "time_per_iteration": 2.636216640472412 + }, + { + "auxiliary_loss_clip": 0.01101384, + "auxiliary_loss_mlp": 0.01006822, + "balance_loss_clip": 1.02762127, + "balance_loss_mlp": 1.00232172, + "epoch": 0.059461896888621676, + "flos": 58023565125120.0, + "grad_norm": 0.9354882724111098, + "language_loss": 0.71115267, + "learning_rate": 3.99090228964997e-06, + "loss": 0.73223472, + "num_input_tokens_seen": 21147555, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.04504395, + "step": 989, + "time_per_iteration": 2.919203281402588 + }, + { + "auxiliary_loss_clip": 0.01235347, + "auxiliary_loss_mlp": 0.01067512, + "balance_loss_clip": 1.06862879, + "balance_loss_mlp": 1.03922391, + "epoch": 0.05952202014128964, + "flos": 22127760650880.0, + "grad_norm": 1.9390361022341254, + "language_loss": 0.78487903, + "learning_rate": 3.990865146569105e-06, + "loss": 0.80790764, + "num_input_tokens_seen": 21167845, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.28283691, + "step": 990, + "time_per_iteration": 2.4882943630218506 + }, + { + "auxiliary_loss_clip": 0.01222581, + "auxiliary_loss_mlp": 0.01055927, + "balance_loss_clip": 1.06343365, + "balance_loss_mlp": 1.02891457, + "epoch": 0.059582143393957614, + "flos": 20445776438400.0, + "grad_norm": 2.0226285209088215, + "language_loss": 0.86136436, + "learning_rate": 3.990827927994434e-06, + "loss": 0.88414943, + "num_input_tokens_seen": 21185085, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.27001953, + "step": 991, + "time_per_iteration": 2.4629719257354736 + }, + { + "auxiliary_loss_clip": 0.01227104, + "auxiliary_loss_mlp": 0.01061551, + "balance_loss_clip": 1.06433439, + "balance_loss_mlp": 1.03570676, + "epoch": 0.059642266646625586, + "flos": 20594877793920.0, + "grad_norm": 1.970727353218588, + "language_loss": 0.77023685, + "learning_rate": 3.9907906339273674e-06, + "loss": 0.79312336, + "num_input_tokens_seen": 21204230, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.25842285, + "step": 992, + "time_per_iteration": 2.456021547317505 + }, + { + "auxiliary_loss_clip": 0.01220427, + "auxiliary_loss_mlp": 0.01057725, + "balance_loss_clip": 1.06231821, + "balance_loss_mlp": 1.03385985, + "epoch": 0.05970238989929355, + "flos": 19352655792000.0, + "grad_norm": 3.405762497803714, + "language_loss": 0.74788666, + "learning_rate": 3.9907532643693215e-06, + "loss": 0.77066827, + "num_input_tokens_seen": 21222655, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.23901367, + "step": 993, + "time_per_iteration": 2.455122947692871 + }, + { + "auxiliary_loss_clip": 0.01219742, + "auxiliary_loss_mlp": 0.01072012, + "balance_loss_clip": 1.06129241, + "balance_loss_mlp": 1.04569077, + "epoch": 0.05976251315196152, + "flos": 30264040926720.0, + "grad_norm": 1.9672626520060266, + "language_loss": 0.78846377, + "learning_rate": 3.990715819321712e-06, + "loss": 0.81138134, + "num_input_tokens_seen": 21242310, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.26293945, + "step": 994, + "time_per_iteration": 2.526312828063965 + }, + { + "auxiliary_loss_clip": 0.01226584, + "auxiliary_loss_mlp": 0.01089635, + "balance_loss_clip": 1.06684422, + "balance_loss_mlp": 1.06395781, + "epoch": 0.05982263640462949, + "flos": 23185150243200.0, + "grad_norm": 2.789481012483251, + "language_loss": 0.79981464, + "learning_rate": 3.99067829878596e-06, + "loss": 0.82297683, + "num_input_tokens_seen": 21261410, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.25683594, + "step": 995, + "time_per_iteration": 2.4325740337371826 + }, + { + "auxiliary_loss_clip": 0.01224287, + "auxiliary_loss_mlp": 0.01065404, + "balance_loss_clip": 1.0637846, + "balance_loss_mlp": 1.03883171, + "epoch": 0.05988275965729746, + "flos": 27850879463040.0, + "grad_norm": 2.00564592930647, + "language_loss": 0.87104738, + "learning_rate": 3.990640702763487e-06, + "loss": 0.89394426, + "num_input_tokens_seen": 21280080, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.265625, + "step": 996, + "time_per_iteration": 2.50943922996521 + }, + { + "auxiliary_loss_clip": 0.01230673, + "auxiliary_loss_mlp": 0.01070017, + "balance_loss_clip": 1.06858814, + "balance_loss_mlp": 1.0402149, + "epoch": 0.05994288290996543, + "flos": 24680003575680.0, + "grad_norm": 4.664858474082693, + "language_loss": 0.87611693, + "learning_rate": 3.990603031255718e-06, + "loss": 0.89912385, + "num_input_tokens_seen": 21296765, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.2980957, + "step": 997, + "time_per_iteration": 2.4723739624023438 + }, + { + "auxiliary_loss_clip": 0.01099889, + "auxiliary_loss_mlp": 0.01024528, + "balance_loss_clip": 1.02672124, + "balance_loss_mlp": 1.02029014, + "epoch": 0.0600030061626334, + "flos": 69929568835200.0, + "grad_norm": 1.0218586493414334, + "language_loss": 0.75419843, + "learning_rate": 3.990565284264083e-06, + "loss": 0.77544254, + "num_input_tokens_seen": 21363345, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.04241943, + "step": 998, + "time_per_iteration": 3.1743974685668945 + }, + { + "auxiliary_loss_clip": 0.01221705, + "auxiliary_loss_mlp": 0.01059464, + "balance_loss_clip": 1.06442785, + "balance_loss_mlp": 1.03378654, + "epoch": 0.06006312941530137, + "flos": 26540140268160.0, + "grad_norm": 2.243866966494867, + "language_loss": 0.75816691, + "learning_rate": 3.990527461790013e-06, + "loss": 0.78097862, + "num_input_tokens_seen": 21385290, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.25695801, + "step": 999, + "time_per_iteration": 2.485086441040039 + }, + { + "auxiliary_loss_clip": 0.01225028, + "auxiliary_loss_mlp": 0.01063646, + "balance_loss_clip": 1.06274033, + "balance_loss_mlp": 1.03720498, + "epoch": 0.060123252667969335, + "flos": 27344000689920.0, + "grad_norm": 2.161568144546207, + "language_loss": 0.83008087, + "learning_rate": 3.990489563834943e-06, + "loss": 0.85296762, + "num_input_tokens_seen": 21407625, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.26428223, + "step": 1000, + "time_per_iteration": 2.698613166809082 + }, + { + "auxiliary_loss_clip": 0.0122181, + "auxiliary_loss_mlp": 0.01056624, + "balance_loss_clip": 1.06289303, + "balance_loss_mlp": 1.0318526, + "epoch": 0.06018337592063731, + "flos": 27016710940800.0, + "grad_norm": 2.2784639200820034, + "language_loss": 0.86264086, + "learning_rate": 3.990451590400309e-06, + "loss": 0.88542527, + "num_input_tokens_seen": 21426835, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.2479248, + "step": 1001, + "time_per_iteration": 2.5122084617614746 + }, + { + "auxiliary_loss_clip": 0.01216698, + "auxiliary_loss_mlp": 0.01068713, + "balance_loss_clip": 1.06217921, + "balance_loss_mlp": 1.04112816, + "epoch": 0.06024349917330528, + "flos": 25592960580480.0, + "grad_norm": 2.1185467215957794, + "language_loss": 0.74312419, + "learning_rate": 3.990413541487551e-06, + "loss": 0.76597834, + "num_input_tokens_seen": 21444920, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.27600098, + "step": 1002, + "time_per_iteration": 2.5356736183166504 + }, + { + "auxiliary_loss_clip": 0.01222295, + "auxiliary_loss_mlp": 0.01064055, + "balance_loss_clip": 1.06353855, + "balance_loss_mlp": 1.03795993, + "epoch": 0.060303622425973244, + "flos": 26133271937280.0, + "grad_norm": 2.5759437154242413, + "language_loss": 0.75877088, + "learning_rate": 3.990375417098112e-06, + "loss": 0.78163445, + "num_input_tokens_seen": 21463555, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.2611084, + "step": 1003, + "time_per_iteration": 2.479090452194214 + }, + { + "auxiliary_loss_clip": 0.01224794, + "auxiliary_loss_mlp": 0.01072346, + "balance_loss_clip": 1.06338656, + "balance_loss_mlp": 1.04402161, + "epoch": 0.060363745678641216, + "flos": 20377187418240.0, + "grad_norm": 1.9955694927991683, + "language_loss": 0.7017929, + "learning_rate": 3.990337217233437e-06, + "loss": 0.72476429, + "num_input_tokens_seen": 21481990, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.28344727, + "step": 1004, + "time_per_iteration": 2.496659755706787 + }, + { + "auxiliary_loss_clip": 0.01236783, + "auxiliary_loss_mlp": 0.0107723, + "balance_loss_clip": 1.07412207, + "balance_loss_mlp": 1.05045557, + "epoch": 0.06042386893130918, + "flos": 17749172753280.0, + "grad_norm": 3.119843487175604, + "language_loss": 0.83687806, + "learning_rate": 3.990298941894976e-06, + "loss": 0.86001813, + "num_input_tokens_seen": 21500385, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.26794434, + "step": 1005, + "time_per_iteration": 2.451118230819702 + }, + { + "auxiliary_loss_clip": 0.01103936, + "auxiliary_loss_mlp": 0.01015046, + "balance_loss_clip": 1.03214383, + "balance_loss_mlp": 1.00941372, + "epoch": 0.06048399218397715, + "flos": 68538496872960.0, + "grad_norm": 0.9121864342586453, + "language_loss": 0.59009403, + "learning_rate": 3.9902605910841794e-06, + "loss": 0.61128384, + "num_input_tokens_seen": 21561040, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.05627441, + "step": 1006, + "time_per_iteration": 3.1547646522521973 + }, + { + "auxiliary_loss_clip": 0.01223576, + "auxiliary_loss_mlp": 0.01051607, + "balance_loss_clip": 1.06211901, + "balance_loss_mlp": 1.02554834, + "epoch": 0.060544115436645125, + "flos": 23258515772160.0, + "grad_norm": 1.9355060263131982, + "language_loss": 0.74729133, + "learning_rate": 3.990222164802503e-06, + "loss": 0.77004313, + "num_input_tokens_seen": 21580655, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.26074219, + "step": 1007, + "time_per_iteration": 2.4606831073760986 + }, + { + "auxiliary_loss_clip": 0.01227975, + "auxiliary_loss_mlp": 0.01061791, + "balance_loss_clip": 1.06610608, + "balance_loss_mlp": 1.03515995, + "epoch": 0.06060423868931309, + "flos": 23878441624320.0, + "grad_norm": 1.8971968128609773, + "language_loss": 0.8049593, + "learning_rate": 3.9901836630514006e-06, + "loss": 0.82785702, + "num_input_tokens_seen": 21599650, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.26635742, + "step": 1008, + "time_per_iteration": 2.531032085418701 + }, + { + "auxiliary_loss_clip": 0.01230046, + "auxiliary_loss_mlp": 0.01059233, + "balance_loss_clip": 1.07031512, + "balance_loss_mlp": 1.03300714, + "epoch": 0.06066436194198106, + "flos": 18728061171840.0, + "grad_norm": 1.779049560232923, + "language_loss": 0.77912915, + "learning_rate": 3.990145085832335e-06, + "loss": 0.80202192, + "num_input_tokens_seen": 21617550, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.26245117, + "step": 1009, + "time_per_iteration": 2.434574842453003 + }, + { + "auxiliary_loss_clip": 0.01216395, + "auxiliary_loss_mlp": 0.01064199, + "balance_loss_clip": 1.06196785, + "balance_loss_mlp": 1.03695989, + "epoch": 0.06072448519464903, + "flos": 24640465680000.0, + "grad_norm": 1.7742303425160395, + "language_loss": 0.92788029, + "learning_rate": 3.990106433146769e-06, + "loss": 0.95068622, + "num_input_tokens_seen": 21635865, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.27258301, + "step": 1010, + "time_per_iteration": 2.532193899154663 + }, + { + "auxiliary_loss_clip": 0.01237071, + "auxiliary_loss_mlp": 0.01066004, + "balance_loss_clip": 1.06854856, + "balance_loss_mlp": 1.03689361, + "epoch": 0.060784608447317, + "flos": 17378825575680.0, + "grad_norm": 2.6773379694768638, + "language_loss": 0.72062474, + "learning_rate": 3.9900677049961665e-06, + "loss": 0.74365544, + "num_input_tokens_seen": 21653945, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.29125977, + "step": 1011, + "time_per_iteration": 2.431678056716919 + }, + { + "auxiliary_loss_clip": 0.01229435, + "auxiliary_loss_mlp": 0.01069582, + "balance_loss_clip": 1.06542289, + "balance_loss_mlp": 1.04127026, + "epoch": 0.06084473169998497, + "flos": 23692208584320.0, + "grad_norm": 2.8952556767229445, + "language_loss": 0.87633073, + "learning_rate": 3.990028901381999e-06, + "loss": 0.8993209, + "num_input_tokens_seen": 21671230, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.2833252, + "step": 1012, + "time_per_iteration": 2.4895174503326416 + }, + { + "auxiliary_loss_clip": 0.01216136, + "auxiliary_loss_mlp": 0.01059533, + "balance_loss_clip": 1.0585891, + "balance_loss_mlp": 1.03465438, + "epoch": 0.06090485495265294, + "flos": 23546339452800.0, + "grad_norm": 1.9777753742154505, + "language_loss": 0.76833892, + "learning_rate": 3.989990022305734e-06, + "loss": 0.79109555, + "num_input_tokens_seen": 21691155, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.24890137, + "step": 1013, + "time_per_iteration": 2.467646837234497 + }, + { + "auxiliary_loss_clip": 0.01229404, + "auxiliary_loss_mlp": 0.01067218, + "balance_loss_clip": 1.06577039, + "balance_loss_mlp": 1.03901291, + "epoch": 0.06096497820532091, + "flos": 20339301548160.0, + "grad_norm": 2.2368823795044968, + "language_loss": 0.8521868, + "learning_rate": 3.98995106776885e-06, + "loss": 0.87515301, + "num_input_tokens_seen": 21707405, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.28198242, + "step": 1014, + "time_per_iteration": 2.4581491947174072 + }, + { + "auxiliary_loss_clip": 0.01229918, + "auxiliary_loss_mlp": 0.01069905, + "balance_loss_clip": 1.0655055, + "balance_loss_mlp": 1.04174769, + "epoch": 0.061025101457988874, + "flos": 26939035779840.0, + "grad_norm": 2.0765209198057613, + "language_loss": 0.72989547, + "learning_rate": 3.98991203777282e-06, + "loss": 0.75289369, + "num_input_tokens_seen": 21728090, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.28198242, + "step": 1015, + "time_per_iteration": 2.4669463634490967 + }, + { + "auxiliary_loss_clip": 0.01214256, + "auxiliary_loss_mlp": 0.01068565, + "balance_loss_clip": 1.06123865, + "balance_loss_mlp": 1.04212415, + "epoch": 0.061085224710656846, + "flos": 25375054723200.0, + "grad_norm": 7.013310461472865, + "language_loss": 0.79140919, + "learning_rate": 3.9898729323191275e-06, + "loss": 0.81423748, + "num_input_tokens_seen": 21747950, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.26403809, + "step": 1016, + "time_per_iteration": 2.496116876602173 + }, + { + "auxiliary_loss_clip": 0.01221261, + "auxiliary_loss_mlp": 0.010548, + "balance_loss_clip": 1.06284857, + "balance_loss_mlp": 1.0295639, + "epoch": 0.06114534796332482, + "flos": 24824759385600.0, + "grad_norm": 2.0134322249843697, + "language_loss": 0.76216239, + "learning_rate": 3.989833751409254e-06, + "loss": 0.78492308, + "num_input_tokens_seen": 21767900, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.25219727, + "step": 1017, + "time_per_iteration": 2.468050956726074 + }, + { + "auxiliary_loss_clip": 0.0122952, + "auxiliary_loss_mlp": 0.01072172, + "balance_loss_clip": 1.06647408, + "balance_loss_mlp": 1.04629123, + "epoch": 0.061205471215992784, + "flos": 20631434860800.0, + "grad_norm": 2.1035550687519335, + "language_loss": 0.85655385, + "learning_rate": 3.989794495044685e-06, + "loss": 0.87957078, + "num_input_tokens_seen": 21787375, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.25891113, + "step": 1018, + "time_per_iteration": 2.4636266231536865 + }, + { + "auxiliary_loss_clip": 0.0122104, + "auxiliary_loss_mlp": 0.01073126, + "balance_loss_clip": 1.06332588, + "balance_loss_mlp": 1.04518366, + "epoch": 0.061265594468660756, + "flos": 16508351381760.0, + "grad_norm": 2.920845927147898, + "language_loss": 0.77560246, + "learning_rate": 3.989755163226909e-06, + "loss": 0.79854411, + "num_input_tokens_seen": 21806275, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.27941895, + "step": 1019, + "time_per_iteration": 2.473684072494507 + }, + { + "auxiliary_loss_clip": 0.01222402, + "auxiliary_loss_mlp": 0.01059118, + "balance_loss_clip": 1.06291926, + "balance_loss_mlp": 1.0331068, + "epoch": 0.06132571772132872, + "flos": 26246211275520.0, + "grad_norm": 1.9602366988326825, + "language_loss": 0.84117424, + "learning_rate": 3.989715755957418e-06, + "loss": 0.86398947, + "num_input_tokens_seen": 21826430, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.26013184, + "step": 1020, + "time_per_iteration": 3.903506278991699 + }, + { + "auxiliary_loss_clip": 0.0122566, + "auxiliary_loss_mlp": 0.01065021, + "balance_loss_clip": 1.06538081, + "balance_loss_mlp": 1.03662527, + "epoch": 0.06138584097399669, + "flos": 37414788768000.0, + "grad_norm": 2.984873346183833, + "language_loss": 0.79273319, + "learning_rate": 3.989676273237705e-06, + "loss": 0.81564003, + "num_input_tokens_seen": 21847800, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.28393555, + "step": 1021, + "time_per_iteration": 3.967851161956787 + }, + { + "auxiliary_loss_clip": 0.01218969, + "auxiliary_loss_mlp": 0.01061056, + "balance_loss_clip": 1.06205511, + "balance_loss_mlp": 1.03777397, + "epoch": 0.061445964226664665, + "flos": 17420661941760.0, + "grad_norm": 2.3830088114772363, + "language_loss": 0.87877947, + "learning_rate": 3.9896367150692705e-06, + "loss": 0.90157974, + "num_input_tokens_seen": 21863385, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.23278809, + "step": 1022, + "time_per_iteration": 2.4524729251861572 + }, + { + "auxiliary_loss_clip": 0.01225676, + "auxiliary_loss_mlp": 0.01062443, + "balance_loss_clip": 1.06689405, + "balance_loss_mlp": 1.0362525, + "epoch": 0.06150608747933263, + "flos": 22600021691520.0, + "grad_norm": 2.0590285654445712, + "language_loss": 0.83134347, + "learning_rate": 3.989597081453611e-06, + "loss": 0.85422468, + "num_input_tokens_seen": 21881880, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.26220703, + "step": 1023, + "time_per_iteration": 3.854219436645508 + }, + { + "auxiliary_loss_clip": 0.01107605, + "auxiliary_loss_mlp": 0.01005447, + "balance_loss_clip": 1.03602481, + "balance_loss_mlp": 1.00155437, + "epoch": 0.0615662107320006, + "flos": 56741482005120.0, + "grad_norm": 0.8915311598247012, + "language_loss": 0.65075779, + "learning_rate": 3.989557372392231e-06, + "loss": 0.67188835, + "num_input_tokens_seen": 21940550, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.03887939, + "step": 1024, + "time_per_iteration": 4.620063304901123 + }, + { + "auxiliary_loss_clip": 0.01231074, + "auxiliary_loss_mlp": 0.01070306, + "balance_loss_clip": 1.06982327, + "balance_loss_mlp": 1.04292321, + "epoch": 0.06162633398466857, + "flos": 22564793427840.0, + "grad_norm": 2.0847692277535255, + "language_loss": 0.88350189, + "learning_rate": 3.989517587886636e-06, + "loss": 0.9065156, + "num_input_tokens_seen": 21958390, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.27429199, + "step": 1025, + "time_per_iteration": 2.5146524906158447 + }, + { + "auxiliary_loss_clip": 0.01232955, + "auxiliary_loss_mlp": 0.01066127, + "balance_loss_clip": 1.07191694, + "balance_loss_mlp": 1.04087877, + "epoch": 0.06168645723733654, + "flos": 25593104234880.0, + "grad_norm": 1.4956388458418177, + "language_loss": 0.84561855, + "learning_rate": 3.989477727938335e-06, + "loss": 0.86860937, + "num_input_tokens_seen": 21978625, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.25244141, + "step": 1026, + "time_per_iteration": 2.4973742961883545 + }, + { + "auxiliary_loss_clip": 0.01227207, + "auxiliary_loss_mlp": 0.01067803, + "balance_loss_clip": 1.06537724, + "balance_loss_mlp": 1.04249501, + "epoch": 0.06174658049000451, + "flos": 15997917162240.0, + "grad_norm": 2.315814409363287, + "language_loss": 0.82009566, + "learning_rate": 3.989437792548839e-06, + "loss": 0.84304571, + "num_input_tokens_seen": 21996035, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.25317383, + "step": 1027, + "time_per_iteration": 2.4778454303741455 + }, + { + "auxiliary_loss_clip": 0.01220735, + "auxiliary_loss_mlp": 0.01059913, + "balance_loss_clip": 1.06393862, + "balance_loss_mlp": 1.03487968, + "epoch": 0.06180670374267248, + "flos": 11285970117120.0, + "grad_norm": 2.3012544197594185, + "language_loss": 0.84625316, + "learning_rate": 3.989397781719663e-06, + "loss": 0.86905968, + "num_input_tokens_seen": 22011625, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.25048828, + "step": 1028, + "time_per_iteration": 2.435084819793701 + }, + { + "auxiliary_loss_clip": 0.0109558, + "auxiliary_loss_mlp": 0.01009377, + "balance_loss_clip": 1.02618885, + "balance_loss_mlp": 1.00540149, + "epoch": 0.06186682699534045, + "flos": 65130142216320.0, + "grad_norm": 0.9379335975338022, + "language_loss": 0.60585809, + "learning_rate": 3.989357695452323e-06, + "loss": 0.62690765, + "num_input_tokens_seen": 22066035, + "router_z_loss_clip": 0.69335938, + "router_z_loss_mlp": 0.03979492, + "step": 1029, + "time_per_iteration": 2.91326642036438 + }, + { + "auxiliary_loss_clip": 0.01213057, + "auxiliary_loss_mlp": 0.0106373, + "balance_loss_clip": 1.05835915, + "balance_loss_mlp": 1.03919721, + "epoch": 0.061926950248008414, + "flos": 21105742976640.0, + "grad_norm": 1.983430827538946, + "language_loss": 0.82753199, + "learning_rate": 3.98931753374834e-06, + "loss": 0.85029984, + "num_input_tokens_seen": 22085015, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.24536133, + "step": 1030, + "time_per_iteration": 2.45068359375 + }, + { + "auxiliary_loss_clip": 0.01232275, + "auxiliary_loss_mlp": 0.01067751, + "balance_loss_clip": 1.07054257, + "balance_loss_mlp": 1.04109561, + "epoch": 0.061987073500676386, + "flos": 17748454481280.0, + "grad_norm": 2.759683818641451, + "language_loss": 0.80127704, + "learning_rate": 3.989277296609237e-06, + "loss": 0.82427728, + "num_input_tokens_seen": 22102775, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.26660156, + "step": 1031, + "time_per_iteration": 2.5035741329193115 + }, + { + "auxiliary_loss_clip": 0.01228075, + "auxiliary_loss_mlp": 0.01077231, + "balance_loss_clip": 1.06777549, + "balance_loss_mlp": 1.05050421, + "epoch": 0.06204719675334436, + "flos": 21836237869440.0, + "grad_norm": 1.5767196075996803, + "language_loss": 0.77028012, + "learning_rate": 3.98923698403654e-06, + "loss": 0.79333317, + "num_input_tokens_seen": 22121680, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.26757812, + "step": 1032, + "time_per_iteration": 2.4717278480529785 + }, + { + "auxiliary_loss_clip": 0.0122084, + "auxiliary_loss_mlp": 0.01070425, + "balance_loss_clip": 1.06181717, + "balance_loss_mlp": 1.04437792, + "epoch": 0.06210732000601232, + "flos": 19353697286400.0, + "grad_norm": 3.275522165439231, + "language_loss": 0.89184058, + "learning_rate": 3.989196596031776e-06, + "loss": 0.91475326, + "num_input_tokens_seen": 22138155, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.26062012, + "step": 1033, + "time_per_iteration": 2.4418892860412598 + }, + { + "auxiliary_loss_clip": 0.01229726, + "auxiliary_loss_mlp": 0.01059218, + "balance_loss_clip": 1.06769884, + "balance_loss_mlp": 1.03407645, + "epoch": 0.062167443258680295, + "flos": 24749382695040.0, + "grad_norm": 2.1545430479109466, + "language_loss": 0.8451159, + "learning_rate": 3.989156132596479e-06, + "loss": 0.86800528, + "num_input_tokens_seen": 22157420, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.25109863, + "step": 1034, + "time_per_iteration": 2.4965343475341797 + }, + { + "auxiliary_loss_clip": 0.01217505, + "auxiliary_loss_mlp": 0.0106015, + "balance_loss_clip": 1.06565523, + "balance_loss_mlp": 1.03339922, + "epoch": 0.06222756651134827, + "flos": 34458478773120.0, + "grad_norm": 2.910953671913234, + "language_loss": 0.80932927, + "learning_rate": 3.989115593732182e-06, + "loss": 0.83210582, + "num_input_tokens_seen": 22178620, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.26782227, + "step": 1035, + "time_per_iteration": 2.5723862648010254 + }, + { + "auxiliary_loss_clip": 0.01221187, + "auxiliary_loss_mlp": 0.01073766, + "balance_loss_clip": 1.06459069, + "balance_loss_mlp": 1.04575217, + "epoch": 0.06228768976401623, + "flos": 25666469763840.0, + "grad_norm": 3.010021819165663, + "language_loss": 0.78598225, + "learning_rate": 3.989074979440421e-06, + "loss": 0.80893183, + "num_input_tokens_seen": 22197125, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.28027344, + "step": 1036, + "time_per_iteration": 2.513409376144409 + }, + { + "auxiliary_loss_clip": 0.01218688, + "auxiliary_loss_mlp": 0.01069077, + "balance_loss_clip": 1.06358719, + "balance_loss_mlp": 1.04303002, + "epoch": 0.062347813016684205, + "flos": 25295619795840.0, + "grad_norm": 1.9647816262930493, + "language_loss": 0.86835337, + "learning_rate": 3.989034289722739e-06, + "loss": 0.89123106, + "num_input_tokens_seen": 22217575, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.26013184, + "step": 1037, + "time_per_iteration": 2.5691654682159424 + }, + { + "auxiliary_loss_clip": 0.01212625, + "auxiliary_loss_mlp": 0.01055455, + "balance_loss_clip": 1.05994248, + "balance_loss_mlp": 1.02803683, + "epoch": 0.06240793626935217, + "flos": 26907039740160.0, + "grad_norm": 2.240465574126536, + "language_loss": 0.80542254, + "learning_rate": 3.988993524580676e-06, + "loss": 0.8281033, + "num_input_tokens_seen": 22236840, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.27441406, + "step": 1038, + "time_per_iteration": 2.508871078491211 + }, + { + "auxiliary_loss_clip": 0.01218734, + "auxiliary_loss_mlp": 0.01061801, + "balance_loss_clip": 1.06565917, + "balance_loss_mlp": 1.03451395, + "epoch": 0.06246805952202014, + "flos": 21615782146560.0, + "grad_norm": 1.8600098656550674, + "language_loss": 0.8556121, + "learning_rate": 3.98895268401578e-06, + "loss": 0.87841743, + "num_input_tokens_seen": 22256465, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.27270508, + "step": 1039, + "time_per_iteration": 2.4680092334747314 + }, + { + "auxiliary_loss_clip": 0.01222925, + "auxiliary_loss_mlp": 0.01064391, + "balance_loss_clip": 1.06599236, + "balance_loss_mlp": 1.0377233, + "epoch": 0.0625281827746881, + "flos": 19311896833920.0, + "grad_norm": 2.164754522584476, + "language_loss": 0.8109116, + "learning_rate": 3.9889117680296e-06, + "loss": 0.8337847, + "num_input_tokens_seen": 22274025, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.26660156, + "step": 1040, + "time_per_iteration": 2.412050485610962 + }, + { + "auxiliary_loss_clip": 0.01232608, + "auxiliary_loss_mlp": 0.01071362, + "balance_loss_clip": 1.07377863, + "balance_loss_mlp": 1.04308534, + "epoch": 0.06258830602735609, + "flos": 27745769289600.0, + "grad_norm": 2.3814801775311234, + "language_loss": 0.69564307, + "learning_rate": 3.988870776623685e-06, + "loss": 0.71868271, + "num_input_tokens_seen": 22292245, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.28320312, + "step": 1041, + "time_per_iteration": 2.516806125640869 + }, + { + "auxiliary_loss_clip": 0.01221541, + "auxiliary_loss_mlp": 0.01062685, + "balance_loss_clip": 1.06255341, + "balance_loss_mlp": 1.03335905, + "epoch": 0.06264842928002405, + "flos": 23222605150080.0, + "grad_norm": 1.9807444297327887, + "language_loss": 0.8119024, + "learning_rate": 3.9888297097995905e-06, + "loss": 0.83474469, + "num_input_tokens_seen": 22311455, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.29345703, + "step": 1042, + "time_per_iteration": 2.438831329345703 + }, + { + "auxiliary_loss_clip": 0.01220162, + "auxiliary_loss_mlp": 0.0105358, + "balance_loss_clip": 1.06297946, + "balance_loss_mlp": 1.0282836, + "epoch": 0.06270855253269202, + "flos": 38399495189760.0, + "grad_norm": 11.13630014592298, + "language_loss": 0.76107895, + "learning_rate": 3.988788567558874e-06, + "loss": 0.78381646, + "num_input_tokens_seen": 22333750, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.25305176, + "step": 1043, + "time_per_iteration": 2.632460355758667 + }, + { + "auxiliary_loss_clip": 0.01219395, + "auxiliary_loss_mlp": 0.01070441, + "balance_loss_clip": 1.06708789, + "balance_loss_mlp": 1.04465628, + "epoch": 0.06276867578535998, + "flos": 22453542028800.0, + "grad_norm": 1.8110103807283096, + "language_loss": 0.92354, + "learning_rate": 3.988747349903097e-06, + "loss": 0.94643831, + "num_input_tokens_seen": 22351940, + "router_z_loss_clip": 1.52441406, + "router_z_loss_mlp": 0.2578125, + "step": 1044, + "time_per_iteration": 2.472562074661255 + }, + { + "auxiliary_loss_clip": 0.01226362, + "auxiliary_loss_mlp": 0.01070634, + "balance_loss_clip": 1.06898451, + "balance_loss_mlp": 1.04462242, + "epoch": 0.06282879903802796, + "flos": 22930435923840.0, + "grad_norm": 3.3030245497874042, + "language_loss": 0.8611927, + "learning_rate": 3.988706056833821e-06, + "loss": 0.88416266, + "num_input_tokens_seen": 22372085, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.26037598, + "step": 1045, + "time_per_iteration": 2.528935670852661 + }, + { + "auxiliary_loss_clip": 0.0121497, + "auxiliary_loss_mlp": 0.01066606, + "balance_loss_clip": 1.06235909, + "balance_loss_mlp": 1.04148829, + "epoch": 0.06288892229069593, + "flos": 34819237019520.0, + "grad_norm": 1.9272095593259697, + "language_loss": 0.779836, + "learning_rate": 3.9886646883526125e-06, + "loss": 0.80265176, + "num_input_tokens_seen": 22392020, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.25146484, + "step": 1046, + "time_per_iteration": 2.6286842823028564 + }, + { + "auxiliary_loss_clip": 0.01215742, + "auxiliary_loss_mlp": 0.01066055, + "balance_loss_clip": 1.06214345, + "balance_loss_mlp": 1.04142654, + "epoch": 0.06294904554336389, + "flos": 19427134642560.0, + "grad_norm": 3.493199636407342, + "language_loss": 0.77452296, + "learning_rate": 3.988623244461039e-06, + "loss": 0.79734099, + "num_input_tokens_seen": 22411180, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.24633789, + "step": 1047, + "time_per_iteration": 2.5343008041381836 + }, + { + "auxiliary_loss_clip": 0.01228799, + "auxiliary_loss_mlp": 0.01061964, + "balance_loss_clip": 1.06575704, + "balance_loss_mlp": 1.03568983, + "epoch": 0.06300916879603187, + "flos": 40661867358720.0, + "grad_norm": 2.5654669243706234, + "language_loss": 0.77112806, + "learning_rate": 3.988581725160672e-06, + "loss": 0.79403567, + "num_input_tokens_seen": 22435105, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.26269531, + "step": 1048, + "time_per_iteration": 2.6242260932922363 + }, + { + "auxiliary_loss_clip": 0.01232932, + "auxiliary_loss_mlp": 0.01065406, + "balance_loss_clip": 1.0735817, + "balance_loss_mlp": 1.03931105, + "epoch": 0.06306929204869983, + "flos": 23804142341760.0, + "grad_norm": 2.3085862730045155, + "language_loss": 0.77545863, + "learning_rate": 3.988540130453087e-06, + "loss": 0.79844201, + "num_input_tokens_seen": 22452710, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.2611084, + "step": 1049, + "time_per_iteration": 2.567737579345703 + }, + { + "auxiliary_loss_clip": 0.01218212, + "auxiliary_loss_mlp": 0.01057985, + "balance_loss_clip": 1.06239247, + "balance_loss_mlp": 1.03252208, + "epoch": 0.0631294153013678, + "flos": 18915802583040.0, + "grad_norm": 2.169811073188759, + "language_loss": 0.83397442, + "learning_rate": 3.988498460339862e-06, + "loss": 0.85673642, + "num_input_tokens_seen": 22470175, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.2545166, + "step": 1050, + "time_per_iteration": 2.5298197269439697 + }, + { + "auxiliary_loss_clip": 0.01215043, + "auxiliary_loss_mlp": 0.01064941, + "balance_loss_clip": 1.06311965, + "balance_loss_mlp": 1.03993046, + "epoch": 0.06318953855403578, + "flos": 24280174310400.0, + "grad_norm": 1.7744887112986454, + "language_loss": 0.77069664, + "learning_rate": 3.988456714822575e-06, + "loss": 0.79349643, + "num_input_tokens_seen": 22490020, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.25024414, + "step": 1051, + "time_per_iteration": 2.543041467666626 + }, + { + "auxiliary_loss_clip": 0.01218386, + "auxiliary_loss_mlp": 0.01064589, + "balance_loss_clip": 1.06301928, + "balance_loss_mlp": 1.03866148, + "epoch": 0.06324966180670374, + "flos": 22528918719360.0, + "grad_norm": 2.7991021459716086, + "language_loss": 0.80529767, + "learning_rate": 3.98841489390281e-06, + "loss": 0.82812744, + "num_input_tokens_seen": 22509685, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.25915527, + "step": 1052, + "time_per_iteration": 2.461150884628296 + }, + { + "auxiliary_loss_clip": 0.01219831, + "auxiliary_loss_mlp": 0.01061439, + "balance_loss_clip": 1.06435955, + "balance_loss_mlp": 1.03614306, + "epoch": 0.06330978505937171, + "flos": 15778107884160.0, + "grad_norm": 2.3905561656299943, + "language_loss": 0.77712977, + "learning_rate": 3.988372997582155e-06, + "loss": 0.79994243, + "num_input_tokens_seen": 22527905, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.25305176, + "step": 1053, + "time_per_iteration": 2.477282762527466 + }, + { + "auxiliary_loss_clip": 0.01218291, + "auxiliary_loss_mlp": 0.01054952, + "balance_loss_clip": 1.06264138, + "balance_loss_mlp": 1.03021669, + "epoch": 0.06336990831203967, + "flos": 21471098163840.0, + "grad_norm": 1.933680858989368, + "language_loss": 0.84858871, + "learning_rate": 3.988331025862195e-06, + "loss": 0.8713212, + "num_input_tokens_seen": 22546335, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.24755859, + "step": 1054, + "time_per_iteration": 2.4789538383483887 + }, + { + "auxiliary_loss_clip": 0.01214378, + "auxiliary_loss_mlp": 0.01057797, + "balance_loss_clip": 1.06218863, + "balance_loss_mlp": 1.03335929, + "epoch": 0.06343003156470765, + "flos": 18478877546880.0, + "grad_norm": 2.0594640049744735, + "language_loss": 0.85838062, + "learning_rate": 3.9882889787445225e-06, + "loss": 0.88110238, + "num_input_tokens_seen": 22563885, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.24450684, + "step": 1055, + "time_per_iteration": 2.4427261352539062 + }, + { + "auxiliary_loss_clip": 0.012297, + "auxiliary_loss_mlp": 0.01074215, + "balance_loss_clip": 1.06820941, + "balance_loss_mlp": 1.04854965, + "epoch": 0.06349015481737562, + "flos": 25154886309120.0, + "grad_norm": 2.446944314652519, + "language_loss": 0.81084746, + "learning_rate": 3.988246856230734e-06, + "loss": 0.83388662, + "num_input_tokens_seen": 22583035, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.25671387, + "step": 1056, + "time_per_iteration": 2.479607582092285 + }, + { + "auxiliary_loss_clip": 0.01223771, + "auxiliary_loss_mlp": 0.0106527, + "balance_loss_clip": 1.05973721, + "balance_loss_mlp": 1.0372802, + "epoch": 0.06355027807004358, + "flos": 26871775562880.0, + "grad_norm": 4.491762548442671, + "language_loss": 0.81486857, + "learning_rate": 3.988204658322426e-06, + "loss": 0.83775902, + "num_input_tokens_seen": 22605055, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.2800293, + "step": 1057, + "time_per_iteration": 2.5157995223999023 + }, + { + "auxiliary_loss_clip": 0.0121284, + "auxiliary_loss_mlp": 0.01065394, + "balance_loss_clip": 1.06205082, + "balance_loss_mlp": 1.0422318, + "epoch": 0.06361040132271156, + "flos": 21396691140480.0, + "grad_norm": 3.401214622531918, + "language_loss": 0.83363616, + "learning_rate": 3.988162385021196e-06, + "loss": 0.85641849, + "num_input_tokens_seen": 22623760, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.23156738, + "step": 1058, + "time_per_iteration": 2.468759059906006 + }, + { + "auxiliary_loss_clip": 0.01219429, + "auxiliary_loss_mlp": 0.01064669, + "balance_loss_clip": 1.06396472, + "balance_loss_mlp": 1.03696489, + "epoch": 0.06367052457537953, + "flos": 25733765894400.0, + "grad_norm": 2.089097211034978, + "language_loss": 0.87201744, + "learning_rate": 3.988120036328651e-06, + "loss": 0.89485848, + "num_input_tokens_seen": 22643000, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.27709961, + "step": 1059, + "time_per_iteration": 2.5172524452209473 + }, + { + "auxiliary_loss_clip": 0.01226359, + "auxiliary_loss_mlp": 0.0106717, + "balance_loss_clip": 1.06583834, + "balance_loss_mlp": 1.04076469, + "epoch": 0.0637306478280475, + "flos": 17631420992640.0, + "grad_norm": 2.2078060930711176, + "language_loss": 0.91273123, + "learning_rate": 3.988077612246394e-06, + "loss": 0.9356665, + "num_input_tokens_seen": 22660460, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.26403809, + "step": 1060, + "time_per_iteration": 2.3971235752105713 + }, + { + "auxiliary_loss_clip": 0.01216719, + "auxiliary_loss_mlp": 0.01066404, + "balance_loss_clip": 1.06093097, + "balance_loss_mlp": 1.03984439, + "epoch": 0.06379077108071547, + "flos": 13662610427520.0, + "grad_norm": 2.56234419380928, + "language_loss": 0.87256217, + "learning_rate": 3.988035112776035e-06, + "loss": 0.89539337, + "num_input_tokens_seen": 22679270, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.26574707, + "step": 1061, + "time_per_iteration": 2.4478020668029785 + }, + { + "auxiliary_loss_clip": 0.01223703, + "auxiliary_loss_mlp": 0.01065893, + "balance_loss_clip": 1.06168628, + "balance_loss_mlp": 1.03882074, + "epoch": 0.06385089433338344, + "flos": 28478849961600.0, + "grad_norm": 2.1646895562134416, + "language_loss": 0.77107644, + "learning_rate": 3.987992537919185e-06, + "loss": 0.79397237, + "num_input_tokens_seen": 22699330, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.27111816, + "step": 1062, + "time_per_iteration": 2.51279878616333 + }, + { + "auxiliary_loss_clip": 0.01214445, + "auxiliary_loss_mlp": 0.0106245, + "balance_loss_clip": 1.05766892, + "balance_loss_mlp": 1.0379529, + "epoch": 0.0639110175860514, + "flos": 24311057028480.0, + "grad_norm": 1.9885863606295924, + "language_loss": 0.86356932, + "learning_rate": 3.987949887677459e-06, + "loss": 0.88633823, + "num_input_tokens_seen": 22717945, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.24499512, + "step": 1063, + "time_per_iteration": 2.44962739944458 + }, + { + "auxiliary_loss_clip": 0.01216742, + "auxiliary_loss_mlp": 0.0106512, + "balance_loss_clip": 1.05960488, + "balance_loss_mlp": 1.03913283, + "epoch": 0.06397114083871938, + "flos": 22090772620800.0, + "grad_norm": 2.1575758014632243, + "language_loss": 0.80265892, + "learning_rate": 3.9879071620524744e-06, + "loss": 0.82547748, + "num_input_tokens_seen": 22736790, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.26000977, + "step": 1064, + "time_per_iteration": 3.8728983402252197 + }, + { + "auxiliary_loss_clip": 0.01215675, + "auxiliary_loss_mlp": 0.01064116, + "balance_loss_clip": 1.05922437, + "balance_loss_mlp": 1.03724647, + "epoch": 0.06403126409138735, + "flos": 19572824206080.0, + "grad_norm": 3.1808595863151554, + "language_loss": 0.84591824, + "learning_rate": 3.987864361045851e-06, + "loss": 0.86871624, + "num_input_tokens_seen": 22754745, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.26855469, + "step": 1065, + "time_per_iteration": 3.927436113357544 + }, + { + "auxiliary_loss_clip": 0.01222458, + "auxiliary_loss_mlp": 0.01061169, + "balance_loss_clip": 1.06525338, + "balance_loss_mlp": 1.03592062, + "epoch": 0.06409138734405531, + "flos": 40807413267840.0, + "grad_norm": 1.9878516986429255, + "language_loss": 0.68470043, + "learning_rate": 3.987821484659211e-06, + "loss": 0.7075367, + "num_input_tokens_seen": 22776780, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.25244141, + "step": 1066, + "time_per_iteration": 4.044774532318115 + }, + { + "auxiliary_loss_clip": 0.01213964, + "auxiliary_loss_mlp": 0.01073202, + "balance_loss_clip": 1.06052613, + "balance_loss_mlp": 1.04627216, + "epoch": 0.06415151059672328, + "flos": 20441610460800.0, + "grad_norm": 2.714656165869689, + "language_loss": 0.90165174, + "learning_rate": 3.987778532894181e-06, + "loss": 0.92452341, + "num_input_tokens_seen": 22793915, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.26965332, + "step": 1067, + "time_per_iteration": 3.8565146923065186 + }, + { + "auxiliary_loss_clip": 0.01216112, + "auxiliary_loss_mlp": 0.01070223, + "balance_loss_clip": 1.05931377, + "balance_loss_mlp": 1.04555869, + "epoch": 0.06421163384939126, + "flos": 18072045129600.0, + "grad_norm": 1.7335193914506362, + "language_loss": 0.83679867, + "learning_rate": 3.987735505752391e-06, + "loss": 0.859662, + "num_input_tokens_seen": 22812670, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.24682617, + "step": 1068, + "time_per_iteration": 2.4815192222595215 + }, + { + "auxiliary_loss_clip": 0.01216847, + "auxiliary_loss_mlp": 0.01059222, + "balance_loss_clip": 1.06413233, + "balance_loss_mlp": 1.03476048, + "epoch": 0.06427175710205922, + "flos": 25119442563840.0, + "grad_norm": 2.6287268771852355, + "language_loss": 0.8951596, + "learning_rate": 3.987692403235471e-06, + "loss": 0.91792035, + "num_input_tokens_seen": 22832440, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.24450684, + "step": 1069, + "time_per_iteration": 2.5169551372528076 + }, + { + "auxiliary_loss_clip": 0.01216562, + "auxiliary_loss_mlp": 0.01074095, + "balance_loss_clip": 1.05920982, + "balance_loss_mlp": 1.0474633, + "epoch": 0.06433188035472719, + "flos": 17380549428480.0, + "grad_norm": 2.9578255390005874, + "language_loss": 0.95461208, + "learning_rate": 3.987649225345056e-06, + "loss": 0.97751862, + "num_input_tokens_seen": 22845495, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.26635742, + "step": 1070, + "time_per_iteration": 2.418107748031616 + }, + { + "auxiliary_loss_clip": 0.01225968, + "auxiliary_loss_mlp": 0.01059828, + "balance_loss_clip": 1.06529093, + "balance_loss_mlp": 1.03258848, + "epoch": 0.06439200360739517, + "flos": 23546267625600.0, + "grad_norm": 1.8415540378062505, + "language_loss": 0.8845377, + "learning_rate": 3.987605972082782e-06, + "loss": 0.90739566, + "num_input_tokens_seen": 22865390, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.27258301, + "step": 1071, + "time_per_iteration": 2.584108352661133 + }, + { + "auxiliary_loss_clip": 0.0121316, + "auxiliary_loss_mlp": 0.01056684, + "balance_loss_clip": 1.05891311, + "balance_loss_mlp": 1.03206778, + "epoch": 0.06445212686006313, + "flos": 21979772616960.0, + "grad_norm": 1.6888984180456923, + "language_loss": 0.76357883, + "learning_rate": 3.987562643450292e-06, + "loss": 0.78627729, + "num_input_tokens_seen": 22885495, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.24633789, + "step": 1072, + "time_per_iteration": 2.4933054447174072 + }, + { + "auxiliary_loss_clip": 0.0122033, + "auxiliary_loss_mlp": 0.01068498, + "balance_loss_clip": 1.06104088, + "balance_loss_mlp": 1.04105639, + "epoch": 0.0645122501127311, + "flos": 25921291824000.0, + "grad_norm": 2.208595488841536, + "language_loss": 0.80495811, + "learning_rate": 3.987519239449226e-06, + "loss": 0.82784641, + "num_input_tokens_seen": 22904845, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.27416992, + "step": 1073, + "time_per_iteration": 2.5306951999664307 + }, + { + "auxiliary_loss_clip": 0.01214066, + "auxiliary_loss_mlp": 0.01059217, + "balance_loss_clip": 1.0610888, + "balance_loss_mlp": 1.03401566, + "epoch": 0.06457237336539907, + "flos": 25626034028160.0, + "grad_norm": 1.7880275008769817, + "language_loss": 0.80368489, + "learning_rate": 3.987475760081233e-06, + "loss": 0.82641768, + "num_input_tokens_seen": 22925940, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.25219727, + "step": 1074, + "time_per_iteration": 2.466097593307495 + }, + { + "auxiliary_loss_clip": 0.01220222, + "auxiliary_loss_mlp": 0.01061319, + "balance_loss_clip": 1.06321907, + "balance_loss_mlp": 1.03502095, + "epoch": 0.06463249661806704, + "flos": 19463979018240.0, + "grad_norm": 1.729317021953293, + "language_loss": 0.79234368, + "learning_rate": 3.987432205347958e-06, + "loss": 0.81515908, + "num_input_tokens_seen": 22944375, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.26306152, + "step": 1075, + "time_per_iteration": 2.477886199951172 + }, + { + "auxiliary_loss_clip": 0.01218878, + "auxiliary_loss_mlp": 0.01061787, + "balance_loss_clip": 1.06306815, + "balance_loss_mlp": 1.03752756, + "epoch": 0.064692619870735, + "flos": 24498044254080.0, + "grad_norm": 3.0220042763503128, + "language_loss": 0.87822217, + "learning_rate": 3.987388575251055e-06, + "loss": 0.90102887, + "num_input_tokens_seen": 22959145, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.24267578, + "step": 1076, + "time_per_iteration": 2.499601364135742 + }, + { + "auxiliary_loss_clip": 0.01209787, + "auxiliary_loss_mlp": 0.01058497, + "balance_loss_clip": 1.05749559, + "balance_loss_mlp": 1.03354669, + "epoch": 0.06475274312340297, + "flos": 17018677860480.0, + "grad_norm": 1.9866683423159532, + "language_loss": 0.80678666, + "learning_rate": 3.98734486979218e-06, + "loss": 0.82946944, + "num_input_tokens_seen": 22978100, + "router_z_loss_clip": 1.52441406, + "router_z_loss_mlp": 0.24963379, + "step": 1077, + "time_per_iteration": 2.460789680480957 + }, + { + "auxiliary_loss_clip": 0.01219924, + "auxiliary_loss_mlp": 0.01076457, + "balance_loss_clip": 1.06023896, + "balance_loss_mlp": 1.04772806, + "epoch": 0.06481286637607095, + "flos": 24572379450240.0, + "grad_norm": 2.066784662234816, + "language_loss": 0.9173491, + "learning_rate": 3.987301088972986e-06, + "loss": 0.94031298, + "num_input_tokens_seen": 22997285, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.2869873, + "step": 1078, + "time_per_iteration": 2.457035779953003 + }, + { + "auxiliary_loss_clip": 0.01229143, + "auxiliary_loss_mlp": 0.01058564, + "balance_loss_clip": 1.06589913, + "balance_loss_mlp": 1.03171837, + "epoch": 0.06487298962873891, + "flos": 21105635235840.0, + "grad_norm": 2.136939504094452, + "language_loss": 0.78590798, + "learning_rate": 3.987257232795137e-06, + "loss": 0.80878508, + "num_input_tokens_seen": 23016285, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.26867676, + "step": 1079, + "time_per_iteration": 2.435544967651367 + }, + { + "auxiliary_loss_clip": 0.01226719, + "auxiliary_loss_mlp": 0.01069893, + "balance_loss_clip": 1.06590295, + "balance_loss_mlp": 1.04270101, + "epoch": 0.06493311288140688, + "flos": 24608182331520.0, + "grad_norm": 2.058676007415544, + "language_loss": 0.69181186, + "learning_rate": 3.987213301260294e-06, + "loss": 0.71477795, + "num_input_tokens_seen": 23036420, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.27185059, + "step": 1080, + "time_per_iteration": 2.498629570007324 + }, + { + "auxiliary_loss_clip": 0.01225841, + "auxiliary_loss_mlp": 0.01068439, + "balance_loss_clip": 1.06573415, + "balance_loss_mlp": 1.04075825, + "epoch": 0.06499323613407486, + "flos": 25337994865920.0, + "grad_norm": 2.0708926474254636, + "language_loss": 0.72200173, + "learning_rate": 3.987169294370123e-06, + "loss": 0.74494457, + "num_input_tokens_seen": 23056945, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.27709961, + "step": 1081, + "time_per_iteration": 2.535149097442627 + }, + { + "auxiliary_loss_clip": 0.01222106, + "auxiliary_loss_mlp": 0.01063593, + "balance_loss_clip": 1.06453419, + "balance_loss_mlp": 1.03632987, + "epoch": 0.06505335938674282, + "flos": 20375714960640.0, + "grad_norm": 2.9844657661771015, + "language_loss": 0.84525663, + "learning_rate": 3.987125212126294e-06, + "loss": 0.86811364, + "num_input_tokens_seen": 23074940, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.27307129, + "step": 1082, + "time_per_iteration": 2.457193374633789 + }, + { + "auxiliary_loss_clip": 0.01230115, + "auxiliary_loss_mlp": 0.01077275, + "balance_loss_clip": 1.06274629, + "balance_loss_mlp": 1.04694808, + "epoch": 0.06511348263941079, + "flos": 25337923038720.0, + "grad_norm": 2.6182378117438407, + "language_loss": 0.82373488, + "learning_rate": 3.987081054530478e-06, + "loss": 0.84680879, + "num_input_tokens_seen": 23093420, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.30322266, + "step": 1083, + "time_per_iteration": 2.490072727203369 + }, + { + "auxiliary_loss_clip": 0.01224362, + "auxiliary_loss_mlp": 0.01065368, + "balance_loss_clip": 1.06438971, + "balance_loss_mlp": 1.03727055, + "epoch": 0.06517360589207877, + "flos": 20332801186560.0, + "grad_norm": 3.664880714166626, + "language_loss": 0.79691327, + "learning_rate": 3.987036821584348e-06, + "loss": 0.81981051, + "num_input_tokens_seen": 23111550, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.28088379, + "step": 1084, + "time_per_iteration": 2.4540913105010986 + }, + { + "auxiliary_loss_clip": 0.01229551, + "auxiliary_loss_mlp": 0.01063123, + "balance_loss_clip": 1.06895709, + "balance_loss_mlp": 1.03674185, + "epoch": 0.06523372914474673, + "flos": 31681650061440.0, + "grad_norm": 2.504470621350621, + "language_loss": 0.66347307, + "learning_rate": 3.986992513289584e-06, + "loss": 0.68639982, + "num_input_tokens_seen": 23130335, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.26391602, + "step": 1085, + "time_per_iteration": 2.5589637756347656 + }, + { + "auxiliary_loss_clip": 0.01217804, + "auxiliary_loss_mlp": 0.01073389, + "balance_loss_clip": 1.06345773, + "balance_loss_mlp": 1.04727006, + "epoch": 0.0652938523974147, + "flos": 20778165918720.0, + "grad_norm": 1.8087532255025875, + "language_loss": 0.76722986, + "learning_rate": 3.9869481296478645e-06, + "loss": 0.79014176, + "num_input_tokens_seen": 23152380, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.26147461, + "step": 1086, + "time_per_iteration": 2.5155861377716064 + }, + { + "auxiliary_loss_clip": 0.01217441, + "auxiliary_loss_mlp": 0.01063097, + "balance_loss_clip": 1.06196439, + "balance_loss_mlp": 1.03657258, + "epoch": 0.06535397565008266, + "flos": 16690993061760.0, + "grad_norm": 2.2370403769741376, + "language_loss": 0.84834528, + "learning_rate": 3.986903670660872e-06, + "loss": 0.87115061, + "num_input_tokens_seen": 23171630, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.26550293, + "step": 1087, + "time_per_iteration": 2.4900543689727783 + }, + { + "auxiliary_loss_clip": 0.01225146, + "auxiliary_loss_mlp": 0.01062626, + "balance_loss_clip": 1.06562924, + "balance_loss_mlp": 1.03634071, + "epoch": 0.06541409890275064, + "flos": 26868220116480.0, + "grad_norm": 2.1947642986839444, + "language_loss": 0.78097272, + "learning_rate": 3.9868591363302945e-06, + "loss": 0.80385041, + "num_input_tokens_seen": 23192520, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.26306152, + "step": 1088, + "time_per_iteration": 2.4857311248779297 + }, + { + "auxiliary_loss_clip": 0.01222469, + "auxiliary_loss_mlp": 0.01061383, + "balance_loss_clip": 1.06520092, + "balance_loss_mlp": 1.03727877, + "epoch": 0.06547422215541861, + "flos": 20521620005760.0, + "grad_norm": 1.7313262173032522, + "language_loss": 0.71375906, + "learning_rate": 3.9868145266578186e-06, + "loss": 0.73659754, + "num_input_tokens_seen": 23210710, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.2409668, + "step": 1089, + "time_per_iteration": 2.469877004623413 + }, + { + "auxiliary_loss_clip": 0.01217644, + "auxiliary_loss_mlp": 0.01061637, + "balance_loss_clip": 1.06332362, + "balance_loss_mlp": 1.03680611, + "epoch": 0.06553434540808657, + "flos": 22016616992640.0, + "grad_norm": 1.7793366486911097, + "language_loss": 0.8556059, + "learning_rate": 3.9867698416451366e-06, + "loss": 0.87839866, + "num_input_tokens_seen": 23230305, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.24829102, + "step": 1090, + "time_per_iteration": 2.468898057937622 + }, + { + "auxiliary_loss_clip": 0.0122161, + "auxiliary_loss_mlp": 0.01063839, + "balance_loss_clip": 1.06382728, + "balance_loss_mlp": 1.03767252, + "epoch": 0.06559446866075455, + "flos": 24608649208320.0, + "grad_norm": 1.964222841553947, + "language_loss": 0.71650481, + "learning_rate": 3.9867250812939434e-06, + "loss": 0.73935932, + "num_input_tokens_seen": 23249015, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.26135254, + "step": 1091, + "time_per_iteration": 2.5359206199645996 + }, + { + "auxiliary_loss_clip": 0.01221175, + "auxiliary_loss_mlp": 0.0106676, + "balance_loss_clip": 1.0636915, + "balance_loss_mlp": 1.04071283, + "epoch": 0.06565459191342252, + "flos": 24274679529600.0, + "grad_norm": 2.8478402630133326, + "language_loss": 0.82854527, + "learning_rate": 3.986680245605936e-06, + "loss": 0.85142457, + "num_input_tokens_seen": 23265105, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.26074219, + "step": 1092, + "time_per_iteration": 2.461223840713501 + }, + { + "auxiliary_loss_clip": 0.01227056, + "auxiliary_loss_mlp": 0.01065646, + "balance_loss_clip": 1.06504905, + "balance_loss_mlp": 1.03758478, + "epoch": 0.06571471516609048, + "flos": 24787124910720.0, + "grad_norm": 1.8528581436567604, + "language_loss": 0.711604, + "learning_rate": 3.986635334582814e-06, + "loss": 0.73453104, + "num_input_tokens_seen": 23283950, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.28051758, + "step": 1093, + "time_per_iteration": 2.571866273880005 + }, + { + "auxiliary_loss_clip": 0.01227716, + "auxiliary_loss_mlp": 0.01063684, + "balance_loss_clip": 1.06887186, + "balance_loss_mlp": 1.03539515, + "epoch": 0.06577483841875846, + "flos": 26214071581440.0, + "grad_norm": 1.719088104230173, + "language_loss": 0.88246518, + "learning_rate": 3.986590348226282e-06, + "loss": 0.90537918, + "num_input_tokens_seen": 23305005, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.28283691, + "step": 1094, + "time_per_iteration": 2.505981922149658 + }, + { + "auxiliary_loss_clip": 0.01223387, + "auxiliary_loss_mlp": 0.01069444, + "balance_loss_clip": 1.06533372, + "balance_loss_mlp": 1.04058385, + "epoch": 0.06583496167142643, + "flos": 25080802508160.0, + "grad_norm": 1.6067686740457843, + "language_loss": 0.8165642, + "learning_rate": 3.986545286538044e-06, + "loss": 0.83949256, + "num_input_tokens_seen": 23323220, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.28869629, + "step": 1095, + "time_per_iteration": 2.5248794555664062 + }, + { + "auxiliary_loss_clip": 0.01217306, + "auxiliary_loss_mlp": 0.01058376, + "balance_loss_clip": 1.061939, + "balance_loss_mlp": 1.03496385, + "epoch": 0.06589508492409439, + "flos": 25629804956160.0, + "grad_norm": 1.9865506219309248, + "language_loss": 0.70210654, + "learning_rate": 3.986500149519811e-06, + "loss": 0.72486335, + "num_input_tokens_seen": 23342235, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.23425293, + "step": 1096, + "time_per_iteration": 2.50469708442688 + }, + { + "auxiliary_loss_clip": 0.01219529, + "auxiliary_loss_mlp": 0.01072326, + "balance_loss_clip": 1.06462622, + "balance_loss_mlp": 1.04589725, + "epoch": 0.06595520817676236, + "flos": 23621249266560.0, + "grad_norm": 2.0681965135295974, + "language_loss": 0.77550733, + "learning_rate": 3.986454937173292e-06, + "loss": 0.79842585, + "num_input_tokens_seen": 23363680, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.26452637, + "step": 1097, + "time_per_iteration": 2.5984320640563965 + }, + { + "auxiliary_loss_clip": 0.01219798, + "auxiliary_loss_mlp": 0.01063066, + "balance_loss_clip": 1.06335044, + "balance_loss_mlp": 1.03773415, + "epoch": 0.06601533142943034, + "flos": 33801708545280.0, + "grad_norm": 2.317312365432794, + "language_loss": 0.78419441, + "learning_rate": 3.986409649500203e-06, + "loss": 0.80702305, + "num_input_tokens_seen": 23385590, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.25341797, + "step": 1098, + "time_per_iteration": 2.598593235015869 + }, + { + "auxiliary_loss_clip": 0.01221841, + "auxiliary_loss_mlp": 0.01076929, + "balance_loss_clip": 1.06501794, + "balance_loss_mlp": 1.05018997, + "epoch": 0.0660754546820983, + "flos": 20259184262400.0, + "grad_norm": 2.049218457968883, + "language_loss": 0.81965542, + "learning_rate": 3.986364286502261e-06, + "loss": 0.84264308, + "num_input_tokens_seen": 23402945, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.26721191, + "step": 1099, + "time_per_iteration": 2.496033191680908 + }, + { + "auxiliary_loss_clip": 0.01211134, + "auxiliary_loss_mlp": 0.01052948, + "balance_loss_clip": 1.05856907, + "balance_loss_mlp": 1.02712774, + "epoch": 0.06613557793476627, + "flos": 19354164163200.0, + "grad_norm": 2.4734838083131847, + "language_loss": 0.83369851, + "learning_rate": 3.986318848181186e-06, + "loss": 0.85633928, + "num_input_tokens_seen": 23421410, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.25842285, + "step": 1100, + "time_per_iteration": 2.6667592525482178 + }, + { + "auxiliary_loss_clip": 0.01226105, + "auxiliary_loss_mlp": 0.01061345, + "balance_loss_clip": 1.0696255, + "balance_loss_mlp": 1.03576231, + "epoch": 0.06619570118743424, + "flos": 13772568936960.0, + "grad_norm": 2.077609061800926, + "language_loss": 0.73576647, + "learning_rate": 3.986273334538702e-06, + "loss": 0.758641, + "num_input_tokens_seen": 23438870, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.25598145, + "step": 1101, + "time_per_iteration": 2.4276180267333984 + }, + { + "auxiliary_loss_clip": 0.01218179, + "auxiliary_loss_mlp": 0.01066773, + "balance_loss_clip": 1.06363547, + "balance_loss_mlp": 1.04134524, + "epoch": 0.06625582444010221, + "flos": 17857874286720.0, + "grad_norm": 2.38646851141196, + "language_loss": 0.86160785, + "learning_rate": 3.986227745576533e-06, + "loss": 0.88445735, + "num_input_tokens_seen": 23456975, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.25415039, + "step": 1102, + "time_per_iteration": 2.384006977081299 + }, + { + "auxiliary_loss_clip": 0.01214407, + "auxiliary_loss_mlp": 0.01060126, + "balance_loss_clip": 1.06285334, + "balance_loss_mlp": 1.034639, + "epoch": 0.06631594769277017, + "flos": 11838707579520.0, + "grad_norm": 2.015599998728417, + "language_loss": 0.82060933, + "learning_rate": 3.98618208129641e-06, + "loss": 0.8433547, + "num_input_tokens_seen": 23473440, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.25488281, + "step": 1103, + "time_per_iteration": 2.449492931365967 + }, + { + "auxiliary_loss_clip": 0.01216547, + "auxiliary_loss_mlp": 0.01063275, + "balance_loss_clip": 1.06496203, + "balance_loss_mlp": 1.03946912, + "epoch": 0.06637607094543815, + "flos": 19793351756160.0, + "grad_norm": 1.94163987305535, + "language_loss": 0.8227542, + "learning_rate": 3.986136341700063e-06, + "loss": 0.84555244, + "num_input_tokens_seen": 23493880, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.2376709, + "step": 1104, + "time_per_iteration": 2.5549888610839844 + }, + { + "auxiliary_loss_clip": 0.01211285, + "auxiliary_loss_mlp": 0.01049825, + "balance_loss_clip": 1.06031418, + "balance_loss_mlp": 1.02402842, + "epoch": 0.06643619419810612, + "flos": 25485659677440.0, + "grad_norm": 1.8426204815634284, + "language_loss": 0.8044225, + "learning_rate": 3.986090526789227e-06, + "loss": 0.82703364, + "num_input_tokens_seen": 23514920, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.25805664, + "step": 1105, + "time_per_iteration": 2.5400824546813965 + }, + { + "auxiliary_loss_clip": 0.01210583, + "auxiliary_loss_mlp": 0.01063491, + "balance_loss_clip": 1.06195426, + "balance_loss_mlp": 1.04006672, + "epoch": 0.06649631745077408, + "flos": 16946533393920.0, + "grad_norm": 1.834494232835714, + "language_loss": 0.96941006, + "learning_rate": 3.986044636565639e-06, + "loss": 0.99215078, + "num_input_tokens_seen": 23531635, + "router_z_loss_clip": 1.48730469, + "router_z_loss_mlp": 0.23413086, + "step": 1106, + "time_per_iteration": 2.398604393005371 + }, + { + "auxiliary_loss_clip": 0.01217457, + "auxiliary_loss_mlp": 0.01058591, + "balance_loss_clip": 1.06142116, + "balance_loss_mlp": 1.03337848, + "epoch": 0.06655644070344206, + "flos": 17858592558720.0, + "grad_norm": 1.7955481044136299, + "language_loss": 0.83048451, + "learning_rate": 3.985998671031039e-06, + "loss": 0.8532449, + "num_input_tokens_seen": 23551020, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.25231934, + "step": 1107, + "time_per_iteration": 3.8147833347320557 + }, + { + "auxiliary_loss_clip": 0.01102724, + "auxiliary_loss_mlp": 0.01014575, + "balance_loss_clip": 1.03371906, + "balance_loss_mlp": 1.01041508, + "epoch": 0.06661656395611003, + "flos": 61419350021760.0, + "grad_norm": 0.8020382664974331, + "language_loss": 0.56711906, + "learning_rate": 3.9859526301871705e-06, + "loss": 0.588292, + "num_input_tokens_seen": 23610675, + "router_z_loss_clip": 0.68945312, + "router_z_loss_mlp": 0.04162598, + "step": 1108, + "time_per_iteration": 4.4223411083221436 + }, + { + "auxiliary_loss_clip": 0.01224391, + "auxiliary_loss_mlp": 0.01063999, + "balance_loss_clip": 1.06723571, + "balance_loss_mlp": 1.03761768, + "epoch": 0.066676687208778, + "flos": 20662856282880.0, + "grad_norm": 3.9667862561473894, + "language_loss": 0.73287892, + "learning_rate": 3.9859065140357795e-06, + "loss": 0.75576276, + "num_input_tokens_seen": 23628710, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.26403809, + "step": 1109, + "time_per_iteration": 2.452484130859375 + }, + { + "auxiliary_loss_clip": 0.01212528, + "auxiliary_loss_mlp": 0.01065706, + "balance_loss_clip": 1.06022263, + "balance_loss_mlp": 1.03932476, + "epoch": 0.06673681046144596, + "flos": 20923280864640.0, + "grad_norm": 1.729081154061561, + "language_loss": 0.78359509, + "learning_rate": 3.985860322578614e-06, + "loss": 0.80637741, + "num_input_tokens_seen": 23649160, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.26391602, + "step": 1110, + "time_per_iteration": 3.8643341064453125 + }, + { + "auxiliary_loss_clip": 0.01213409, + "auxiliary_loss_mlp": 0.01067346, + "balance_loss_clip": 1.06062365, + "balance_loss_mlp": 1.04063106, + "epoch": 0.06679693371411394, + "flos": 31065818359680.0, + "grad_norm": 2.107857946456181, + "language_loss": 0.71169293, + "learning_rate": 3.985814055817427e-06, + "loss": 0.73450047, + "num_input_tokens_seen": 23671995, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.26721191, + "step": 1111, + "time_per_iteration": 4.00914454460144 + }, + { + "auxiliary_loss_clip": 0.01217869, + "auxiliary_loss_mlp": 0.01064472, + "balance_loss_clip": 1.06293976, + "balance_loss_mlp": 1.03997397, + "epoch": 0.0668570569667819, + "flos": 21726135705600.0, + "grad_norm": 2.1140003056331635, + "language_loss": 0.78750885, + "learning_rate": 3.985767713753971e-06, + "loss": 0.8103323, + "num_input_tokens_seen": 23690705, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.24511719, + "step": 1112, + "time_per_iteration": 2.4615626335144043 + }, + { + "auxiliary_loss_clip": 0.01217861, + "auxiliary_loss_mlp": 0.01077463, + "balance_loss_clip": 1.06301498, + "balance_loss_mlp": 1.05012774, + "epoch": 0.06691718021944987, + "flos": 22747255539840.0, + "grad_norm": 2.7203877846036835, + "language_loss": 0.79110622, + "learning_rate": 3.985721296390005e-06, + "loss": 0.8140595, + "num_input_tokens_seen": 23709990, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.27355957, + "step": 1113, + "time_per_iteration": 2.427503824234009 + }, + { + "auxiliary_loss_clip": 0.01207725, + "auxiliary_loss_mlp": 0.0105399, + "balance_loss_clip": 1.05774498, + "balance_loss_mlp": 1.02970731, + "epoch": 0.06697730347211785, + "flos": 16545626720640.0, + "grad_norm": 1.8627089336544083, + "language_loss": 0.8253541, + "learning_rate": 3.985674803727289e-06, + "loss": 0.8479712, + "num_input_tokens_seen": 23728485, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.24291992, + "step": 1114, + "time_per_iteration": 2.4537441730499268 + }, + { + "auxiliary_loss_clip": 0.01097972, + "auxiliary_loss_mlp": 0.01008414, + "balance_loss_clip": 1.03165746, + "balance_loss_mlp": 1.00461698, + "epoch": 0.06703742672478581, + "flos": 59782326658560.0, + "grad_norm": 0.8382262850613459, + "language_loss": 0.58166379, + "learning_rate": 3.985628235767584e-06, + "loss": 0.60272765, + "num_input_tokens_seen": 23786650, + "router_z_loss_clip": 0.66308594, + "router_z_loss_mlp": 0.03790283, + "step": 1115, + "time_per_iteration": 3.0172603130340576 + }, + { + "auxiliary_loss_clip": 0.01214251, + "auxiliary_loss_mlp": 0.01058916, + "balance_loss_clip": 1.06079769, + "balance_loss_mlp": 1.03227234, + "epoch": 0.06709754997745378, + "flos": 16800197385600.0, + "grad_norm": 2.5229927951405173, + "language_loss": 0.91322714, + "learning_rate": 3.985581592512658e-06, + "loss": 0.93595886, + "num_input_tokens_seen": 23802555, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.26623535, + "step": 1116, + "time_per_iteration": 2.4248433113098145 + }, + { + "auxiliary_loss_clip": 0.0122707, + "auxiliary_loss_mlp": 0.01063526, + "balance_loss_clip": 1.06793356, + "balance_loss_mlp": 1.03851557, + "epoch": 0.06715767323012176, + "flos": 22123917895680.0, + "grad_norm": 2.131197479069308, + "language_loss": 0.87361109, + "learning_rate": 3.985534873964279e-06, + "loss": 0.89651704, + "num_input_tokens_seen": 23822945, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.25, + "step": 1117, + "time_per_iteration": 2.4322237968444824 + }, + { + "auxiliary_loss_clip": 0.0109216, + "auxiliary_loss_mlp": 0.01005526, + "balance_loss_clip": 1.02644622, + "balance_loss_mlp": 1.00184214, + "epoch": 0.06721779648278972, + "flos": 66618100137600.0, + "grad_norm": 0.8506421164071046, + "language_loss": 0.59823751, + "learning_rate": 3.985488080124218e-06, + "loss": 0.61921436, + "num_input_tokens_seen": 23874075, + "router_z_loss_clip": 0.65722656, + "router_z_loss_mlp": 0.0368042, + "step": 1118, + "time_per_iteration": 3.0439653396606445 + }, + { + "auxiliary_loss_clip": 0.01212957, + "auxiliary_loss_mlp": 0.01052971, + "balance_loss_clip": 1.05933404, + "balance_loss_mlp": 1.028736, + "epoch": 0.06727791973545769, + "flos": 22382474970240.0, + "grad_norm": 3.3657360758547203, + "language_loss": 0.83363795, + "learning_rate": 3.985441210994251e-06, + "loss": 0.85629714, + "num_input_tokens_seen": 23889720, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.24255371, + "step": 1119, + "time_per_iteration": 2.4267470836639404 + }, + { + "auxiliary_loss_clip": 0.01209656, + "auxiliary_loss_mlp": 0.0106009, + "balance_loss_clip": 1.06031895, + "balance_loss_mlp": 1.03680813, + "epoch": 0.06733804298812565, + "flos": 24280210224000.0, + "grad_norm": 1.822344996638882, + "language_loss": 0.84741008, + "learning_rate": 3.9853942665761545e-06, + "loss": 0.87010753, + "num_input_tokens_seen": 23909385, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.23266602, + "step": 1120, + "time_per_iteration": 2.4780220985412598 + }, + { + "auxiliary_loss_clip": 0.01220417, + "auxiliary_loss_mlp": 0.01069975, + "balance_loss_clip": 1.06510651, + "balance_loss_mlp": 1.0446192, + "epoch": 0.06739816624079363, + "flos": 15918230839680.0, + "grad_norm": 2.055081170388844, + "language_loss": 0.78827083, + "learning_rate": 3.985347246871708e-06, + "loss": 0.81117475, + "num_input_tokens_seen": 23926830, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.25390625, + "step": 1121, + "time_per_iteration": 2.454805612564087 + }, + { + "auxiliary_loss_clip": 0.01092782, + "auxiliary_loss_mlp": 0.01004738, + "balance_loss_clip": 1.02870655, + "balance_loss_mlp": 1.00078011, + "epoch": 0.0674582894934616, + "flos": 71398567353600.0, + "grad_norm": 0.752117759654879, + "language_loss": 0.58420432, + "learning_rate": 3.985300151882694e-06, + "loss": 0.60517949, + "num_input_tokens_seen": 23992640, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 0.03955078, + "step": 1122, + "time_per_iteration": 3.1875557899475098 + }, + { + "auxiliary_loss_clip": 0.01216243, + "auxiliary_loss_mlp": 0.0106126, + "balance_loss_clip": 1.06352711, + "balance_loss_mlp": 1.03628612, + "epoch": 0.06751841274612956, + "flos": 25264952559360.0, + "grad_norm": 1.9070278814429082, + "language_loss": 0.71553415, + "learning_rate": 3.985252981610901e-06, + "loss": 0.73830914, + "num_input_tokens_seen": 24011135, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.24987793, + "step": 1123, + "time_per_iteration": 2.4553351402282715 + }, + { + "auxiliary_loss_clip": 0.01216272, + "auxiliary_loss_mlp": 0.01060708, + "balance_loss_clip": 1.06221223, + "balance_loss_mlp": 1.03448188, + "epoch": 0.06757853599879754, + "flos": 23802741711360.0, + "grad_norm": 2.5710475130115165, + "language_loss": 0.78865981, + "learning_rate": 3.985205736058114e-06, + "loss": 0.81142962, + "num_input_tokens_seen": 24030695, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.26220703, + "step": 1124, + "time_per_iteration": 2.4715189933776855 + }, + { + "auxiliary_loss_clip": 0.0120755, + "auxiliary_loss_mlp": 0.0105453, + "balance_loss_clip": 1.05798221, + "balance_loss_mlp": 1.03127241, + "epoch": 0.0676386592514655, + "flos": 21033742164480.0, + "grad_norm": 1.9056311523644172, + "language_loss": 0.72050714, + "learning_rate": 3.985158415226128e-06, + "loss": 0.74312794, + "num_input_tokens_seen": 24050680, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.2322998, + "step": 1125, + "time_per_iteration": 2.4459035396575928 + }, + { + "auxiliary_loss_clip": 0.01215629, + "auxiliary_loss_mlp": 0.01071123, + "balance_loss_clip": 1.06335509, + "balance_loss_mlp": 1.04372835, + "epoch": 0.06769878250413347, + "flos": 25556331686400.0, + "grad_norm": 2.781156800507553, + "language_loss": 0.81447208, + "learning_rate": 3.985111019116736e-06, + "loss": 0.83733964, + "num_input_tokens_seen": 24067205, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.27392578, + "step": 1126, + "time_per_iteration": 2.4815666675567627 + }, + { + "auxiliary_loss_clip": 0.01087194, + "auxiliary_loss_mlp": 0.01007966, + "balance_loss_clip": 1.02356005, + "balance_loss_mlp": 1.00446141, + "epoch": 0.06775890575680145, + "flos": 70655251305600.0, + "grad_norm": 0.7812975397517428, + "language_loss": 0.59773219, + "learning_rate": 3.985063547731735e-06, + "loss": 0.61868382, + "num_input_tokens_seen": 24131320, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 0.03503418, + "step": 1127, + "time_per_iteration": 3.080500602722168 + }, + { + "auxiliary_loss_clip": 0.01221959, + "auxiliary_loss_mlp": 0.01056531, + "balance_loss_clip": 1.06994343, + "balance_loss_mlp": 1.03268874, + "epoch": 0.06781902900946941, + "flos": 24235500769920.0, + "grad_norm": 2.5868548788468986, + "language_loss": 0.81520271, + "learning_rate": 3.985016001072925e-06, + "loss": 0.8379876, + "num_input_tokens_seen": 24149930, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.23852539, + "step": 1128, + "time_per_iteration": 2.4518299102783203 + }, + { + "auxiliary_loss_clip": 0.01226048, + "auxiliary_loss_mlp": 0.0106288, + "balance_loss_clip": 1.06792784, + "balance_loss_mlp": 1.035712, + "epoch": 0.06787915226213738, + "flos": 22417523665920.0, + "grad_norm": 2.417979769628653, + "language_loss": 0.76040781, + "learning_rate": 3.984968379142109e-06, + "loss": 0.78329706, + "num_input_tokens_seen": 24169590, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.27148438, + "step": 1129, + "time_per_iteration": 2.4281859397888184 + }, + { + "auxiliary_loss_clip": 0.01213412, + "auxiliary_loss_mlp": 0.01057696, + "balance_loss_clip": 1.06030834, + "balance_loss_mlp": 1.03288889, + "epoch": 0.06793927551480534, + "flos": 37706922080640.0, + "grad_norm": 1.8297113813242891, + "language_loss": 0.72392142, + "learning_rate": 3.984920681941094e-06, + "loss": 0.74663258, + "num_input_tokens_seen": 24189965, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.24804688, + "step": 1130, + "time_per_iteration": 2.5520009994506836 + }, + { + "auxiliary_loss_clip": 0.01206218, + "auxiliary_loss_mlp": 0.01066817, + "balance_loss_clip": 1.0579685, + "balance_loss_mlp": 1.04034054, + "epoch": 0.06799939876747332, + "flos": 20631398947200.0, + "grad_norm": 2.7498954480049105, + "language_loss": 0.80773664, + "learning_rate": 3.984872909471688e-06, + "loss": 0.83046699, + "num_input_tokens_seen": 24208045, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.26464844, + "step": 1131, + "time_per_iteration": 2.484354019165039 + }, + { + "auxiliary_loss_clip": 0.01217255, + "auxiliary_loss_mlp": 0.01067487, + "balance_loss_clip": 1.0666604, + "balance_loss_mlp": 1.04314399, + "epoch": 0.06805952202014129, + "flos": 14864755829760.0, + "grad_norm": 7.922481015356311, + "language_loss": 0.80631483, + "learning_rate": 3.984825061735701e-06, + "loss": 0.82916224, + "num_input_tokens_seen": 24223805, + "router_z_loss_clip": 1.50683594, + "router_z_loss_mlp": 0.2434082, + "step": 1132, + "time_per_iteration": 2.3997228145599365 + }, + { + "auxiliary_loss_clip": 0.01212338, + "auxiliary_loss_mlp": 0.01063932, + "balance_loss_clip": 1.06199682, + "balance_loss_mlp": 1.03942287, + "epoch": 0.06811964527280925, + "flos": 48909434947200.0, + "grad_norm": 1.6073366861705622, + "language_loss": 0.63497519, + "learning_rate": 3.9847771387349495e-06, + "loss": 0.65773797, + "num_input_tokens_seen": 24249475, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.24499512, + "step": 1133, + "time_per_iteration": 2.6972646713256836 + }, + { + "auxiliary_loss_clip": 0.01220069, + "auxiliary_loss_mlp": 0.01058375, + "balance_loss_clip": 1.06303275, + "balance_loss_mlp": 1.03120732, + "epoch": 0.06817976852547723, + "flos": 15377273038080.0, + "grad_norm": 1.9778609445680997, + "language_loss": 0.74934334, + "learning_rate": 3.9847291404712506e-06, + "loss": 0.77212775, + "num_input_tokens_seen": 24267980, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.27160645, + "step": 1134, + "time_per_iteration": 2.4243195056915283 + }, + { + "auxiliary_loss_clip": 0.01215329, + "auxiliary_loss_mlp": 0.01063929, + "balance_loss_clip": 1.06648254, + "balance_loss_mlp": 1.04058766, + "epoch": 0.0682398917781452, + "flos": 20155690200960.0, + "grad_norm": 2.0359972357032317, + "language_loss": 0.87243521, + "learning_rate": 3.984681066946423e-06, + "loss": 0.89522779, + "num_input_tokens_seen": 24286805, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.23327637, + "step": 1135, + "time_per_iteration": 2.442702531814575 + }, + { + "auxiliary_loss_clip": 0.01213631, + "auxiliary_loss_mlp": 0.01052683, + "balance_loss_clip": 1.05884063, + "balance_loss_mlp": 1.02679062, + "epoch": 0.06830001503081316, + "flos": 23440618748160.0, + "grad_norm": 2.9897356734085587, + "language_loss": 0.78376597, + "learning_rate": 3.984632918162291e-06, + "loss": 0.80642909, + "num_input_tokens_seen": 24305855, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.2590332, + "step": 1136, + "time_per_iteration": 2.449934720993042 + }, + { + "auxiliary_loss_clip": 0.01224682, + "auxiliary_loss_mlp": 0.01061019, + "balance_loss_clip": 1.07040739, + "balance_loss_mlp": 1.03625917, + "epoch": 0.06836013828348114, + "flos": 34349813153280.0, + "grad_norm": 2.1016175494816802, + "language_loss": 0.84516603, + "learning_rate": 3.984584694120679e-06, + "loss": 0.86802298, + "num_input_tokens_seen": 24326535, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.2479248, + "step": 1137, + "time_per_iteration": 2.5724868774414062 + }, + { + "auxiliary_loss_clip": 0.0120743, + "auxiliary_loss_mlp": 0.01058585, + "balance_loss_clip": 1.05961072, + "balance_loss_mlp": 1.03465974, + "epoch": 0.06842026153614911, + "flos": 23148844571520.0, + "grad_norm": 2.140624631472405, + "language_loss": 0.79034746, + "learning_rate": 3.984536394823418e-06, + "loss": 0.81300759, + "num_input_tokens_seen": 24345810, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.23950195, + "step": 1138, + "time_per_iteration": 2.4570326805114746 + }, + { + "auxiliary_loss_clip": 0.01220255, + "auxiliary_loss_mlp": 0.01060395, + "balance_loss_clip": 1.06729198, + "balance_loss_mlp": 1.03544462, + "epoch": 0.06848038478881707, + "flos": 24608972430720.0, + "grad_norm": 2.3899804966308884, + "language_loss": 0.85788977, + "learning_rate": 3.984488020272336e-06, + "loss": 0.8806963, + "num_input_tokens_seen": 24366095, + "router_z_loss_clip": 1.52929688, + "router_z_loss_mlp": 0.24975586, + "step": 1139, + "time_per_iteration": 2.548875331878662 + }, + { + "auxiliary_loss_clip": 0.01207741, + "auxiliary_loss_mlp": 0.01057968, + "balance_loss_clip": 1.05976629, + "balance_loss_mlp": 1.03357792, + "epoch": 0.06854050804148504, + "flos": 40880994278400.0, + "grad_norm": 3.91100968245827, + "language_loss": 0.75186807, + "learning_rate": 3.984439570469271e-06, + "loss": 0.77452523, + "num_input_tokens_seen": 24388665, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.24389648, + "step": 1140, + "time_per_iteration": 2.6590476036071777 + }, + { + "auxiliary_loss_clip": 0.01214653, + "auxiliary_loss_mlp": 0.01060072, + "balance_loss_clip": 1.06520367, + "balance_loss_mlp": 1.0345968, + "epoch": 0.06860063129415302, + "flos": 31686354743040.0, + "grad_norm": 2.4590049220948433, + "language_loss": 0.68130684, + "learning_rate": 3.9843910454160574e-06, + "loss": 0.70405406, + "num_input_tokens_seen": 24407705, + "router_z_loss_clip": 1.49707031, + "router_z_loss_mlp": 0.25488281, + "step": 1141, + "time_per_iteration": 2.5373246669769287 + }, + { + "auxiliary_loss_clip": 0.01218937, + "auxiliary_loss_mlp": 0.01076221, + "balance_loss_clip": 1.06594908, + "balance_loss_mlp": 1.04787278, + "epoch": 0.06866075454682098, + "flos": 26542007775360.0, + "grad_norm": 1.8747405202964935, + "language_loss": 0.78949183, + "learning_rate": 3.984342445114538e-06, + "loss": 0.81244338, + "num_input_tokens_seen": 24428390, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.28356934, + "step": 1142, + "time_per_iteration": 2.4641876220703125 + }, + { + "auxiliary_loss_clip": 0.01212744, + "auxiliary_loss_mlp": 0.01063864, + "balance_loss_clip": 1.06305945, + "balance_loss_mlp": 1.03964031, + "epoch": 0.06872087779948895, + "flos": 29789768724480.0, + "grad_norm": 3.2831785596996044, + "language_loss": 0.68952751, + "learning_rate": 3.984293769566553e-06, + "loss": 0.71229362, + "num_input_tokens_seen": 24450810, + "router_z_loss_clip": 1.49511719, + "router_z_loss_mlp": 0.2421875, + "step": 1143, + "time_per_iteration": 2.53178334236145 + }, + { + "auxiliary_loss_clip": 0.01208561, + "auxiliary_loss_mlp": 0.01057309, + "balance_loss_clip": 1.06479883, + "balance_loss_mlp": 1.03527951, + "epoch": 0.06878100105215693, + "flos": 26941118768640.0, + "grad_norm": 4.181888890933308, + "language_loss": 0.74598598, + "learning_rate": 3.98424501877395e-06, + "loss": 0.76864469, + "num_input_tokens_seen": 24469965, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.2199707, + "step": 1144, + "time_per_iteration": 2.483867645263672 + }, + { + "auxiliary_loss_clip": 0.01217707, + "auxiliary_loss_mlp": 0.01064532, + "balance_loss_clip": 1.06387806, + "balance_loss_mlp": 1.03953409, + "epoch": 0.06884112430482489, + "flos": 10670748946560.0, + "grad_norm": 2.167185561476923, + "language_loss": 0.92031711, + "learning_rate": 3.984196192738577e-06, + "loss": 0.94313949, + "num_input_tokens_seen": 24486370, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.25012207, + "step": 1145, + "time_per_iteration": 2.4306564331054688 + }, + { + "auxiliary_loss_clip": 0.01218342, + "auxiliary_loss_mlp": 0.010625, + "balance_loss_clip": 1.06187749, + "balance_loss_mlp": 1.03690588, + "epoch": 0.06890124755749286, + "flos": 20193647898240.0, + "grad_norm": 2.8219877931278194, + "language_loss": 0.8265304, + "learning_rate": 3.984147291462285e-06, + "loss": 0.84933883, + "num_input_tokens_seen": 24503780, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.25610352, + "step": 1146, + "time_per_iteration": 2.446762800216675 + }, + { + "auxiliary_loss_clip": 0.01207449, + "auxiliary_loss_mlp": 0.01057129, + "balance_loss_clip": 1.06169868, + "balance_loss_mlp": 1.03378737, + "epoch": 0.06896137081016084, + "flos": 20449224144000.0, + "grad_norm": 2.3165297447508064, + "language_loss": 0.85181272, + "learning_rate": 3.98409831494693e-06, + "loss": 0.87445849, + "num_input_tokens_seen": 24522320, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.23364258, + "step": 1147, + "time_per_iteration": 2.444101572036743 + }, + { + "auxiliary_loss_clip": 0.01210584, + "auxiliary_loss_mlp": 0.010607, + "balance_loss_clip": 1.05944264, + "balance_loss_mlp": 1.03683472, + "epoch": 0.0690214940628288, + "flos": 18368703555840.0, + "grad_norm": 1.8779179034166873, + "language_loss": 0.86077052, + "learning_rate": 3.984049263194367e-06, + "loss": 0.88348335, + "num_input_tokens_seen": 24540445, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.23852539, + "step": 1148, + "time_per_iteration": 2.3799333572387695 + }, + { + "auxiliary_loss_clip": 0.01211562, + "auxiliary_loss_mlp": 0.01056628, + "balance_loss_clip": 1.06169462, + "balance_loss_mlp": 1.03298831, + "epoch": 0.06908161731549677, + "flos": 20558033418240.0, + "grad_norm": 2.0306565107175834, + "language_loss": 0.69788861, + "learning_rate": 3.9840001362064575e-06, + "loss": 0.7205705, + "num_input_tokens_seen": 24557105, + "router_z_loss_clip": 1.49902344, + "router_z_loss_mlp": 0.23669434, + "step": 1149, + "time_per_iteration": 2.40360951423645 + }, + { + "auxiliary_loss_clip": 0.0121304, + "auxiliary_loss_mlp": 0.01053869, + "balance_loss_clip": 1.06010294, + "balance_loss_mlp": 1.02974105, + "epoch": 0.06914174056816474, + "flos": 27563666313600.0, + "grad_norm": 1.9919192644056243, + "language_loss": 0.83484459, + "learning_rate": 3.983950933985064e-06, + "loss": 0.85751367, + "num_input_tokens_seen": 24578240, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.24121094, + "step": 1150, + "time_per_iteration": 3.8667056560516357 + }, + { + "auxiliary_loss_clip": 0.01213603, + "auxiliary_loss_mlp": 0.01056149, + "balance_loss_clip": 1.06286347, + "balance_loss_mlp": 1.03142548, + "epoch": 0.06920186382083271, + "flos": 15304015249920.0, + "grad_norm": 3.570442427448977, + "language_loss": 0.82031763, + "learning_rate": 3.983901656532052e-06, + "loss": 0.84301519, + "num_input_tokens_seen": 24593585, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.24719238, + "step": 1151, + "time_per_iteration": 3.7871108055114746 + }, + { + "auxiliary_loss_clip": 0.01215885, + "auxiliary_loss_mlp": 0.01064931, + "balance_loss_clip": 1.06688797, + "balance_loss_mlp": 1.04074335, + "epoch": 0.06926198707350067, + "flos": 25191227894400.0, + "grad_norm": 1.9917321292157188, + "language_loss": 0.85748631, + "learning_rate": 3.983852303849291e-06, + "loss": 0.88029444, + "num_input_tokens_seen": 24613110, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.24206543, + "step": 1152, + "time_per_iteration": 2.473520040512085 + }, + { + "auxiliary_loss_clip": 0.01209463, + "auxiliary_loss_mlp": 0.01067875, + "balance_loss_clip": 1.06251812, + "balance_loss_mlp": 1.042853, + "epoch": 0.06932211032616864, + "flos": 13256137146240.0, + "grad_norm": 2.1021250610303137, + "language_loss": 0.90549767, + "learning_rate": 3.983802875938651e-06, + "loss": 0.92827106, + "num_input_tokens_seen": 24628795, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.25024414, + "step": 1153, + "time_per_iteration": 3.7722246646881104 + }, + { + "auxiliary_loss_clip": 0.01212047, + "auxiliary_loss_mlp": 0.01052016, + "balance_loss_clip": 1.06335783, + "balance_loss_mlp": 1.0276854, + "epoch": 0.06938223357883662, + "flos": 24827381078400.0, + "grad_norm": 2.1716954585766164, + "language_loss": 0.82169962, + "learning_rate": 3.983753372802008e-06, + "loss": 0.84434026, + "num_input_tokens_seen": 24645480, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.24304199, + "step": 1154, + "time_per_iteration": 2.5177814960479736 + }, + { + "auxiliary_loss_clip": 0.01215965, + "auxiliary_loss_mlp": 0.01064159, + "balance_loss_clip": 1.0682646, + "balance_loss_mlp": 1.04086578, + "epoch": 0.06944235683150458, + "flos": 27267977554560.0, + "grad_norm": 2.2629154906816327, + "language_loss": 0.75259995, + "learning_rate": 3.983703794441237e-06, + "loss": 0.77540123, + "num_input_tokens_seen": 24664630, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.2331543, + "step": 1155, + "time_per_iteration": 3.9166786670684814 + }, + { + "auxiliary_loss_clip": 0.01207665, + "auxiliary_loss_mlp": 0.01063668, + "balance_loss_clip": 1.06122422, + "balance_loss_mlp": 1.03892016, + "epoch": 0.06950248008417255, + "flos": 25808065176960.0, + "grad_norm": 1.8263368909776458, + "language_loss": 0.71395922, + "learning_rate": 3.98365414085822e-06, + "loss": 0.73667264, + "num_input_tokens_seen": 24684210, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.24768066, + "step": 1156, + "time_per_iteration": 2.48241925239563 + }, + { + "auxiliary_loss_clip": 0.01211219, + "auxiliary_loss_mlp": 0.01069832, + "balance_loss_clip": 1.06342101, + "balance_loss_mlp": 1.04492879, + "epoch": 0.06956260333684053, + "flos": 22271546793600.0, + "grad_norm": 1.8995812107268601, + "language_loss": 0.75191677, + "learning_rate": 3.98360441205484e-06, + "loss": 0.77472723, + "num_input_tokens_seen": 24702490, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.24902344, + "step": 1157, + "time_per_iteration": 2.529043674468994 + }, + { + "auxiliary_loss_clip": 0.01211851, + "auxiliary_loss_mlp": 0.01061181, + "balance_loss_clip": 1.05993152, + "balance_loss_mlp": 1.03643322, + "epoch": 0.0696227265895085, + "flos": 29681390413440.0, + "grad_norm": 2.426334681106429, + "language_loss": 0.71543121, + "learning_rate": 3.983554608032982e-06, + "loss": 0.73816156, + "num_input_tokens_seen": 24724340, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.24743652, + "step": 1158, + "time_per_iteration": 2.5169577598571777 + }, + { + "auxiliary_loss_clip": 0.0121663, + "auxiliary_loss_mlp": 0.0106262, + "balance_loss_clip": 1.06435394, + "balance_loss_mlp": 1.03745484, + "epoch": 0.06968284984217646, + "flos": 25523545547520.0, + "grad_norm": 1.7897590188704173, + "language_loss": 0.79960263, + "learning_rate": 3.983504728794533e-06, + "loss": 0.82239509, + "num_input_tokens_seen": 24745550, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.25134277, + "step": 1159, + "time_per_iteration": 2.5328783988952637 + }, + { + "auxiliary_loss_clip": 0.01229562, + "auxiliary_loss_mlp": 0.01065653, + "balance_loss_clip": 1.0750581, + "balance_loss_mlp": 1.03849769, + "epoch": 0.06974297309484444, + "flos": 20698192287360.0, + "grad_norm": 2.6625128375385523, + "language_loss": 0.80875117, + "learning_rate": 3.983454774341387e-06, + "loss": 0.83170336, + "num_input_tokens_seen": 24762575, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.27197266, + "step": 1160, + "time_per_iteration": 2.4534661769866943 + }, + { + "auxiliary_loss_clip": 0.01207037, + "auxiliary_loss_mlp": 0.01063399, + "balance_loss_clip": 1.05784655, + "balance_loss_mlp": 1.03862691, + "epoch": 0.0698030963475124, + "flos": 26505199313280.0, + "grad_norm": 1.6227147720000987, + "language_loss": 0.7623148, + "learning_rate": 3.983404744675437e-06, + "loss": 0.78501916, + "num_input_tokens_seen": 24782605, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.24768066, + "step": 1161, + "time_per_iteration": 2.5549509525299072 + }, + { + "auxiliary_loss_clip": 0.01213739, + "auxiliary_loss_mlp": 0.01064926, + "balance_loss_clip": 1.06316543, + "balance_loss_mlp": 1.04001093, + "epoch": 0.06986321960018037, + "flos": 23040430346880.0, + "grad_norm": 2.120000115051156, + "language_loss": 0.82827455, + "learning_rate": 3.9833546397985794e-06, + "loss": 0.85106122, + "num_input_tokens_seen": 24802910, + "router_z_loss_clip": 1.50488281, + "router_z_loss_mlp": 0.24914551, + "step": 1162, + "time_per_iteration": 2.4599781036376953 + }, + { + "auxiliary_loss_clip": 0.01205837, + "auxiliary_loss_mlp": 0.01051524, + "balance_loss_clip": 1.05983889, + "balance_loss_mlp": 1.02752745, + "epoch": 0.06992334285284833, + "flos": 28584822061440.0, + "grad_norm": 1.9547540693200136, + "language_loss": 0.7953769, + "learning_rate": 3.983304459712716e-06, + "loss": 0.81795049, + "num_input_tokens_seen": 24823305, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.2401123, + "step": 1163, + "time_per_iteration": 2.514007329940796 + }, + { + "auxiliary_loss_clip": 0.01210692, + "auxiliary_loss_mlp": 0.01060002, + "balance_loss_clip": 1.060655, + "balance_loss_mlp": 1.03413391, + "epoch": 0.06998346610551631, + "flos": 20595344670720.0, + "grad_norm": 1.8730733785668336, + "language_loss": 0.7929281, + "learning_rate": 3.983254204419749e-06, + "loss": 0.81563497, + "num_input_tokens_seen": 24842155, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.25866699, + "step": 1164, + "time_per_iteration": 2.479809284210205 + }, + { + "auxiliary_loss_clip": 0.01209246, + "auxiliary_loss_mlp": 0.01061922, + "balance_loss_clip": 1.06148946, + "balance_loss_mlp": 1.03659046, + "epoch": 0.07004358935818428, + "flos": 22528810978560.0, + "grad_norm": 1.4174316130146067, + "language_loss": 0.73187184, + "learning_rate": 3.983203873921583e-06, + "loss": 0.75458348, + "num_input_tokens_seen": 24862080, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.25354004, + "step": 1165, + "time_per_iteration": 2.5346786975860596 + }, + { + "auxiliary_loss_clip": 0.01220162, + "auxiliary_loss_mlp": 0.01054645, + "balance_loss_clip": 1.0701375, + "balance_loss_mlp": 1.03033781, + "epoch": 0.07010371261085224, + "flos": 28949997680640.0, + "grad_norm": 2.4311648691785464, + "language_loss": 0.8104524, + "learning_rate": 3.983153468220128e-06, + "loss": 0.83320045, + "num_input_tokens_seen": 24886165, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.24304199, + "step": 1166, + "time_per_iteration": 2.5795319080352783 + }, + { + "auxiliary_loss_clip": 0.01217691, + "auxiliary_loss_mlp": 0.01048305, + "balance_loss_clip": 1.06895995, + "balance_loss_mlp": 1.0234375, + "epoch": 0.07016383586352022, + "flos": 23659171050240.0, + "grad_norm": 5.224846005423918, + "language_loss": 0.84467274, + "learning_rate": 3.983102987317295e-06, + "loss": 0.8673327, + "num_input_tokens_seen": 24905775, + "router_z_loss_clip": 1.48730469, + "router_z_loss_mlp": 0.24890137, + "step": 1167, + "time_per_iteration": 2.5454986095428467 + }, + { + "auxiliary_loss_clip": 0.01209002, + "auxiliary_loss_mlp": 0.01056752, + "balance_loss_clip": 1.06011164, + "balance_loss_mlp": 1.03195667, + "epoch": 0.07022395911618819, + "flos": 19792130693760.0, + "grad_norm": 3.767458450105885, + "language_loss": 0.89070874, + "learning_rate": 3.983052431214997e-06, + "loss": 0.91336632, + "num_input_tokens_seen": 24924295, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.24829102, + "step": 1168, + "time_per_iteration": 2.5772578716278076 + }, + { + "auxiliary_loss_clip": 0.01215006, + "auxiliary_loss_mlp": 0.01070459, + "balance_loss_clip": 1.06178272, + "balance_loss_mlp": 1.03977466, + "epoch": 0.07028408236885615, + "flos": 21689147675520.0, + "grad_norm": 2.266640009931649, + "language_loss": 0.89443207, + "learning_rate": 3.983001799915153e-06, + "loss": 0.91728675, + "num_input_tokens_seen": 24943210, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.30664062, + "step": 1169, + "time_per_iteration": 2.6837167739868164 + }, + { + "auxiliary_loss_clip": 0.01214225, + "auxiliary_loss_mlp": 0.01080676, + "balance_loss_clip": 1.06298375, + "balance_loss_mlp": 1.05274558, + "epoch": 0.07034420562152413, + "flos": 25630271832960.0, + "grad_norm": 2.0833134378723512, + "language_loss": 0.84160566, + "learning_rate": 3.982951093419681e-06, + "loss": 0.86455464, + "num_input_tokens_seen": 24960360, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.27941895, + "step": 1170, + "time_per_iteration": 2.8745877742767334 + }, + { + "auxiliary_loss_clip": 0.012122, + "auxiliary_loss_mlp": 0.01068582, + "balance_loss_clip": 1.06331325, + "balance_loss_mlp": 1.04208159, + "epoch": 0.0704043288741921, + "flos": 20810449267200.0, + "grad_norm": 2.1884838381095677, + "language_loss": 0.75623089, + "learning_rate": 3.982900311730506e-06, + "loss": 0.77903867, + "num_input_tokens_seen": 24978290, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.26501465, + "step": 1171, + "time_per_iteration": 2.552346706390381 + }, + { + "auxiliary_loss_clip": 0.01208991, + "auxiliary_loss_mlp": 0.01052881, + "balance_loss_clip": 1.06158352, + "balance_loss_mlp": 1.02857447, + "epoch": 0.07046445212686006, + "flos": 25593176062080.0, + "grad_norm": 3.5014211442472263, + "language_loss": 0.89262903, + "learning_rate": 3.9828494548495514e-06, + "loss": 0.91524768, + "num_input_tokens_seen": 24997055, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.24316406, + "step": 1172, + "time_per_iteration": 2.6138551235198975 + }, + { + "auxiliary_loss_clip": 0.01212311, + "auxiliary_loss_mlp": 0.01052963, + "balance_loss_clip": 1.05952978, + "balance_loss_mlp": 1.02643919, + "epoch": 0.07052457537952803, + "flos": 25556978131200.0, + "grad_norm": 1.8018651185809031, + "language_loss": 0.81904477, + "learning_rate": 3.982798522778748e-06, + "loss": 0.84169751, + "num_input_tokens_seen": 25017490, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.26501465, + "step": 1173, + "time_per_iteration": 2.6185309886932373 + }, + { + "auxiliary_loss_clip": 0.01208438, + "auxiliary_loss_mlp": 0.01061356, + "balance_loss_clip": 1.06039333, + "balance_loss_mlp": 1.03604817, + "epoch": 0.070584698632196, + "flos": 17968515154560.0, + "grad_norm": 2.3602158400725766, + "language_loss": 0.824224, + "learning_rate": 3.9827475155200245e-06, + "loss": 0.84692192, + "num_input_tokens_seen": 25035660, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.25305176, + "step": 1174, + "time_per_iteration": 2.8527324199676514 + }, + { + "auxiliary_loss_clip": 0.01208437, + "auxiliary_loss_mlp": 0.01061589, + "balance_loss_clip": 1.05963016, + "balance_loss_mlp": 1.03618515, + "epoch": 0.07064482188486397, + "flos": 25370888745600.0, + "grad_norm": 2.0319184424549497, + "language_loss": 0.85309982, + "learning_rate": 3.982696433075317e-06, + "loss": 0.87580001, + "num_input_tokens_seen": 25054785, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.25427246, + "step": 1175, + "time_per_iteration": 2.5193254947662354 + }, + { + "auxiliary_loss_clip": 0.0121683, + "auxiliary_loss_mlp": 0.01068878, + "balance_loss_clip": 1.06391633, + "balance_loss_mlp": 1.04386759, + "epoch": 0.07070494513753194, + "flos": 24899848767360.0, + "grad_norm": 1.787899319255974, + "language_loss": 0.83203518, + "learning_rate": 3.982645275446563e-06, + "loss": 0.85489225, + "num_input_tokens_seen": 25075180, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.24987793, + "step": 1176, + "time_per_iteration": 2.468564987182617 + }, + { + "auxiliary_loss_clip": 0.01215502, + "auxiliary_loss_mlp": 0.01069401, + "balance_loss_clip": 1.06537867, + "balance_loss_mlp": 1.04338908, + "epoch": 0.07076506839019991, + "flos": 22338447874560.0, + "grad_norm": 2.1835521252267074, + "language_loss": 0.74224341, + "learning_rate": 3.982594042635701e-06, + "loss": 0.76509249, + "num_input_tokens_seen": 25093035, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.26037598, + "step": 1177, + "time_per_iteration": 2.4816977977752686 + }, + { + "auxiliary_loss_clip": 0.01218999, + "auxiliary_loss_mlp": 0.01059802, + "balance_loss_clip": 1.06714177, + "balance_loss_mlp": 1.03356385, + "epoch": 0.07082519164286788, + "flos": 18660800954880.0, + "grad_norm": 2.0862956924049216, + "language_loss": 0.86055368, + "learning_rate": 3.982542734644673e-06, + "loss": 0.88334167, + "num_input_tokens_seen": 25112520, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.26245117, + "step": 1178, + "time_per_iteration": 2.501650810241699 + }, + { + "auxiliary_loss_clip": 0.01092344, + "auxiliary_loss_mlp": 0.01013335, + "balance_loss_clip": 1.02854586, + "balance_loss_mlp": 1.00900793, + "epoch": 0.07088531489553584, + "flos": 63654107610240.0, + "grad_norm": 0.8341334284715195, + "language_loss": 0.63211298, + "learning_rate": 3.982491351475427e-06, + "loss": 0.65316975, + "num_input_tokens_seen": 25177760, + "router_z_loss_clip": 0.63769531, + "router_z_loss_mlp": 0.04333496, + "step": 1179, + "time_per_iteration": 3.2017087936401367 + }, + { + "auxiliary_loss_clip": 0.01233595, + "auxiliary_loss_mlp": 0.01061826, + "balance_loss_clip": 1.07624209, + "balance_loss_mlp": 1.03712559, + "epoch": 0.07094543814820382, + "flos": 21572688804480.0, + "grad_norm": 2.774983949236673, + "language_loss": 0.8391124, + "learning_rate": 3.98243989312991e-06, + "loss": 0.86206663, + "num_input_tokens_seen": 25195260, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.24694824, + "step": 1180, + "time_per_iteration": 2.4428014755249023 + }, + { + "auxiliary_loss_clip": 0.0120913, + "auxiliary_loss_mlp": 0.01058939, + "balance_loss_clip": 1.06253576, + "balance_loss_mlp": 1.03389275, + "epoch": 0.07100556140087179, + "flos": 22089946608000.0, + "grad_norm": 2.507340847271734, + "language_loss": 0.88515794, + "learning_rate": 3.982388359610074e-06, + "loss": 0.90783858, + "num_input_tokens_seen": 25212740, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.25036621, + "step": 1181, + "time_per_iteration": 2.4856972694396973 + }, + { + "auxiliary_loss_clip": 0.01210686, + "auxiliary_loss_mlp": 0.01060084, + "balance_loss_clip": 1.06376529, + "balance_loss_mlp": 1.03545582, + "epoch": 0.07106568465353975, + "flos": 47922286400640.0, + "grad_norm": 1.953665562478814, + "language_loss": 0.83284062, + "learning_rate": 3.9823367509178725e-06, + "loss": 0.85554832, + "num_input_tokens_seen": 25236420, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.24609375, + "step": 1182, + "time_per_iteration": 2.6925482749938965 + }, + { + "auxiliary_loss_clip": 0.01207872, + "auxiliary_loss_mlp": 0.01060007, + "balance_loss_clip": 1.06519341, + "balance_loss_mlp": 1.0348537, + "epoch": 0.07112580790620772, + "flos": 23440798316160.0, + "grad_norm": 2.5392672222035784, + "language_loss": 0.7910372, + "learning_rate": 3.982285067055262e-06, + "loss": 0.81371605, + "num_input_tokens_seen": 25255120, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.25170898, + "step": 1183, + "time_per_iteration": 2.501127004623413 + }, + { + "auxiliary_loss_clip": 0.01210675, + "auxiliary_loss_mlp": 0.01060988, + "balance_loss_clip": 1.05805564, + "balance_loss_mlp": 1.03483343, + "epoch": 0.0711859311588757, + "flos": 31868888682240.0, + "grad_norm": 2.1783978428989696, + "language_loss": 0.79067075, + "learning_rate": 3.982233308024204e-06, + "loss": 0.81338739, + "num_input_tokens_seen": 25275150, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.26171875, + "step": 1184, + "time_per_iteration": 2.593522071838379 + }, + { + "auxiliary_loss_clip": 0.01207263, + "auxiliary_loss_mlp": 0.01061961, + "balance_loss_clip": 1.06245279, + "balance_loss_mlp": 1.03783298, + "epoch": 0.07124605441154366, + "flos": 19610315026560.0, + "grad_norm": 2.21758645292927, + "language_loss": 0.77175951, + "learning_rate": 3.98218147382666e-06, + "loss": 0.79445177, + "num_input_tokens_seen": 25293680, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.24145508, + "step": 1185, + "time_per_iteration": 2.504546642303467 + }, + { + "auxiliary_loss_clip": 0.01215892, + "auxiliary_loss_mlp": 0.01075245, + "balance_loss_clip": 1.06781149, + "balance_loss_mlp": 1.0484581, + "epoch": 0.07130617766421163, + "flos": 14684448533760.0, + "grad_norm": 2.530088846377455, + "language_loss": 0.65422165, + "learning_rate": 3.982129564464596e-06, + "loss": 0.67713302, + "num_input_tokens_seen": 25310050, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.26794434, + "step": 1186, + "time_per_iteration": 2.412702798843384 + }, + { + "auxiliary_loss_clip": 0.01209587, + "auxiliary_loss_mlp": 0.01051055, + "balance_loss_clip": 1.06400728, + "balance_loss_mlp": 1.02641416, + "epoch": 0.07136630091687961, + "flos": 26067915141120.0, + "grad_norm": 1.9713148659821225, + "language_loss": 0.69622004, + "learning_rate": 3.98207757993998e-06, + "loss": 0.71882653, + "num_input_tokens_seen": 25331020, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.24658203, + "step": 1187, + "time_per_iteration": 2.4747912883758545 + }, + { + "auxiliary_loss_clip": 0.01212639, + "auxiliary_loss_mlp": 0.01057632, + "balance_loss_clip": 1.06763577, + "balance_loss_mlp": 1.03399241, + "epoch": 0.07142642416954757, + "flos": 15669190869120.0, + "grad_norm": 2.4247182574613486, + "language_loss": 0.78565454, + "learning_rate": 3.9820255202547845e-06, + "loss": 0.80835724, + "num_input_tokens_seen": 25347875, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.23632812, + "step": 1188, + "time_per_iteration": 2.3981621265411377 + }, + { + "auxiliary_loss_clip": 0.01210436, + "auxiliary_loss_mlp": 0.0105911, + "balance_loss_clip": 1.06437874, + "balance_loss_mlp": 1.03268123, + "epoch": 0.07148654742221554, + "flos": 19755322231680.0, + "grad_norm": 2.1115757631120506, + "language_loss": 0.84595275, + "learning_rate": 3.981973385410981e-06, + "loss": 0.86864817, + "num_input_tokens_seen": 25366715, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.26428223, + "step": 1189, + "time_per_iteration": 2.4109456539154053 + }, + { + "auxiliary_loss_clip": 0.01211404, + "auxiliary_loss_mlp": 0.01060493, + "balance_loss_clip": 1.0655998, + "balance_loss_mlp": 1.03455317, + "epoch": 0.07154667067488352, + "flos": 23471824688640.0, + "grad_norm": 1.816185578730457, + "language_loss": 0.76635891, + "learning_rate": 3.9819211754105494e-06, + "loss": 0.78907782, + "num_input_tokens_seen": 25385450, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.25952148, + "step": 1190, + "time_per_iteration": 2.442227602005005 + }, + { + "auxiliary_loss_clip": 0.01210833, + "auxiliary_loss_mlp": 0.01070981, + "balance_loss_clip": 1.06335354, + "balance_loss_mlp": 1.04438579, + "epoch": 0.07160679392755148, + "flos": 18332936588160.0, + "grad_norm": 2.86867204750091, + "language_loss": 0.75822008, + "learning_rate": 3.981868890255468e-06, + "loss": 0.78103828, + "num_input_tokens_seen": 25403940, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.26574707, + "step": 1191, + "time_per_iteration": 2.430720567703247 + }, + { + "auxiliary_loss_clip": 0.01212445, + "auxiliary_loss_mlp": 0.01059265, + "balance_loss_clip": 1.06445622, + "balance_loss_mlp": 1.03258562, + "epoch": 0.07166691718021945, + "flos": 17747017937280.0, + "grad_norm": 2.8414225029240843, + "language_loss": 0.74147046, + "learning_rate": 3.981816529947719e-06, + "loss": 0.76418751, + "num_input_tokens_seen": 25420410, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.26733398, + "step": 1192, + "time_per_iteration": 2.4091804027557373 + }, + { + "auxiliary_loss_clip": 0.01206106, + "auxiliary_loss_mlp": 0.01052207, + "balance_loss_clip": 1.05982566, + "balance_loss_mlp": 1.02847219, + "epoch": 0.07172704043288743, + "flos": 22451925916800.0, + "grad_norm": 2.313921567586214, + "language_loss": 0.78487206, + "learning_rate": 3.9817640944892896e-06, + "loss": 0.80745524, + "num_input_tokens_seen": 25439415, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.23754883, + "step": 1193, + "time_per_iteration": 2.4408318996429443 + }, + { + "auxiliary_loss_clip": 0.01209599, + "auxiliary_loss_mlp": 0.01053814, + "balance_loss_clip": 1.0633074, + "balance_loss_mlp": 1.02775526, + "epoch": 0.07178716368555539, + "flos": 23222210100480.0, + "grad_norm": 2.1225123281395764, + "language_loss": 0.85801089, + "learning_rate": 3.981711583882166e-06, + "loss": 0.8806451, + "num_input_tokens_seen": 25458715, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.26062012, + "step": 1194, + "time_per_iteration": 3.911187171936035 + }, + { + "auxiliary_loss_clip": 0.01209498, + "auxiliary_loss_mlp": 0.01070121, + "balance_loss_clip": 1.06291306, + "balance_loss_mlp": 1.04141521, + "epoch": 0.07184728693822336, + "flos": 25150828072320.0, + "grad_norm": 1.9179714545927746, + "language_loss": 0.81665421, + "learning_rate": 3.981658998128341e-06, + "loss": 0.83945042, + "num_input_tokens_seen": 25477985, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.28735352, + "step": 1195, + "time_per_iteration": 2.4525163173675537 + }, + { + "auxiliary_loss_clip": 0.01222311, + "auxiliary_loss_mlp": 0.0105891, + "balance_loss_clip": 1.0726403, + "balance_loss_mlp": 1.03589058, + "epoch": 0.07190741019089132, + "flos": 22711237176960.0, + "grad_norm": 1.8568893325591256, + "language_loss": 0.80150044, + "learning_rate": 3.981606337229808e-06, + "loss": 0.82431269, + "num_input_tokens_seen": 25497110, + "router_z_loss_clip": 1.49609375, + "router_z_loss_mlp": 0.23022461, + "step": 1196, + "time_per_iteration": 2.4511704444885254 + }, + { + "auxiliary_loss_clip": 0.01208044, + "auxiliary_loss_mlp": 0.010694, + "balance_loss_clip": 1.06472313, + "balance_loss_mlp": 1.04251885, + "epoch": 0.0719675334435593, + "flos": 29349791032320.0, + "grad_norm": 2.33971826638352, + "language_loss": 0.71185303, + "learning_rate": 3.9815536011885655e-06, + "loss": 0.73462743, + "num_input_tokens_seen": 25516555, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.26879883, + "step": 1197, + "time_per_iteration": 2.545807361602783 + }, + { + "auxiliary_loss_clip": 0.01204558, + "auxiliary_loss_mlp": 0.01050061, + "balance_loss_clip": 1.06108832, + "balance_loss_mlp": 1.02569461, + "epoch": 0.07202765669622727, + "flos": 17639788861440.0, + "grad_norm": 2.0094974666022005, + "language_loss": 0.85963106, + "learning_rate": 3.98150079000661e-06, + "loss": 0.88217723, + "num_input_tokens_seen": 25533895, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.24377441, + "step": 1198, + "time_per_iteration": 5.182141065597534 + }, + { + "auxiliary_loss_clip": 0.01221929, + "auxiliary_loss_mlp": 0.01061193, + "balance_loss_clip": 1.07416487, + "balance_loss_mlp": 1.03584957, + "epoch": 0.07208777994889523, + "flos": 21434038306560.0, + "grad_norm": 1.9500423875705688, + "language_loss": 0.8356086, + "learning_rate": 3.981447903685947e-06, + "loss": 0.85843992, + "num_input_tokens_seen": 25554195, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.25366211, + "step": 1199, + "time_per_iteration": 2.466513156890869 + }, + { + "auxiliary_loss_clip": 0.01212471, + "auxiliary_loss_mlp": 0.01056413, + "balance_loss_clip": 1.06703806, + "balance_loss_mlp": 1.03253579, + "epoch": 0.07214790320156321, + "flos": 26940867373440.0, + "grad_norm": 2.2206339608847054, + "language_loss": 0.76733148, + "learning_rate": 3.981394942228581e-06, + "loss": 0.79002035, + "num_input_tokens_seen": 25574155, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.2388916, + "step": 1200, + "time_per_iteration": 2.4711921215057373 + }, + { + "auxiliary_loss_clip": 0.01206837, + "auxiliary_loss_mlp": 0.01067403, + "balance_loss_clip": 1.06256855, + "balance_loss_mlp": 1.04253602, + "epoch": 0.07220802645423118, + "flos": 23879949995520.0, + "grad_norm": 2.5559892926308727, + "language_loss": 0.82448399, + "learning_rate": 3.98134190563652e-06, + "loss": 0.84722638, + "num_input_tokens_seen": 25592735, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.24853516, + "step": 1201, + "time_per_iteration": 2.505872964859009 + }, + { + "auxiliary_loss_clip": 0.01209099, + "auxiliary_loss_mlp": 0.01059615, + "balance_loss_clip": 1.06151009, + "balance_loss_mlp": 1.03361583, + "epoch": 0.07226814970689914, + "flos": 19243631036160.0, + "grad_norm": 3.118593573579874, + "language_loss": 0.68975616, + "learning_rate": 3.981288793911775e-06, + "loss": 0.71244335, + "num_input_tokens_seen": 25611510, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.26013184, + "step": 1202, + "time_per_iteration": 2.454775810241699 + }, + { + "auxiliary_loss_clip": 0.01209898, + "auxiliary_loss_mlp": 0.01065475, + "balance_loss_clip": 1.06426239, + "balance_loss_mlp": 1.03892732, + "epoch": 0.07232827295956712, + "flos": 19172025273600.0, + "grad_norm": 1.9832050281974967, + "language_loss": 0.88008285, + "learning_rate": 3.98123560705636e-06, + "loss": 0.90283662, + "num_input_tokens_seen": 25629560, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.265625, + "step": 1203, + "time_per_iteration": 2.466115713119507 + }, + { + "auxiliary_loss_clip": 0.01214252, + "auxiliary_loss_mlp": 0.01077694, + "balance_loss_clip": 1.06351948, + "balance_loss_mlp": 1.0489645, + "epoch": 0.07238839621223508, + "flos": 17639752947840.0, + "grad_norm": 1.8037841803970307, + "language_loss": 0.7849735, + "learning_rate": 3.981182345072293e-06, + "loss": 0.80789298, + "num_input_tokens_seen": 25648330, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.28710938, + "step": 1204, + "time_per_iteration": 2.417275905609131 + }, + { + "auxiliary_loss_clip": 0.01209826, + "auxiliary_loss_mlp": 0.01063134, + "balance_loss_clip": 1.06421769, + "balance_loss_mlp": 1.0390892, + "epoch": 0.07244851946490305, + "flos": 28292401440000.0, + "grad_norm": 1.5574041079237606, + "language_loss": 0.81951076, + "learning_rate": 3.981129007961593e-06, + "loss": 0.84224033, + "num_input_tokens_seen": 25669470, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.24084473, + "step": 1205, + "time_per_iteration": 2.5188028812408447 + }, + { + "auxiliary_loss_clip": 0.01214024, + "auxiliary_loss_mlp": 0.01066301, + "balance_loss_clip": 1.06651282, + "balance_loss_mlp": 1.03971767, + "epoch": 0.07250864271757101, + "flos": 22564829341440.0, + "grad_norm": 1.6652203856367624, + "language_loss": 0.76572669, + "learning_rate": 3.981075595726283e-06, + "loss": 0.78852987, + "num_input_tokens_seen": 25690470, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.26611328, + "step": 1206, + "time_per_iteration": 2.490355968475342 + }, + { + "auxiliary_loss_clip": 0.01212163, + "auxiliary_loss_mlp": 0.01061225, + "balance_loss_clip": 1.06700754, + "balance_loss_mlp": 1.03615558, + "epoch": 0.072568765970239, + "flos": 21762405463680.0, + "grad_norm": 1.9329258020120001, + "language_loss": 0.77593815, + "learning_rate": 3.981022108368387e-06, + "loss": 0.79867196, + "num_input_tokens_seen": 25709205, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.25073242, + "step": 1207, + "time_per_iteration": 2.481684684753418 + }, + { + "auxiliary_loss_clip": 0.0120775, + "auxiliary_loss_mlp": 0.01051631, + "balance_loss_clip": 1.06516385, + "balance_loss_mlp": 1.02902901, + "epoch": 0.07262888922290696, + "flos": 25519702792320.0, + "grad_norm": 7.553736629678636, + "language_loss": 0.79636693, + "learning_rate": 3.9809685458899345e-06, + "loss": 0.81896067, + "num_input_tokens_seen": 25728485, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.22619629, + "step": 1208, + "time_per_iteration": 2.476978063583374 + }, + { + "auxiliary_loss_clip": 0.01219704, + "auxiliary_loss_mlp": 0.01058654, + "balance_loss_clip": 1.07343209, + "balance_loss_mlp": 1.03522944, + "epoch": 0.07268901247557492, + "flos": 21246548290560.0, + "grad_norm": 3.233064802492943, + "language_loss": 0.78883731, + "learning_rate": 3.980914908292955e-06, + "loss": 0.81162089, + "num_input_tokens_seen": 25747730, + "router_z_loss_clip": 1.46484375, + "router_z_loss_mlp": 0.23449707, + "step": 1209, + "time_per_iteration": 2.4763834476470947 + }, + { + "auxiliary_loss_clip": 0.01221018, + "auxiliary_loss_mlp": 0.01058211, + "balance_loss_clip": 1.07485676, + "balance_loss_mlp": 1.03516793, + "epoch": 0.0727491357282429, + "flos": 25479302970240.0, + "grad_norm": 2.7669347780105715, + "language_loss": 0.81764042, + "learning_rate": 3.980861195579486e-06, + "loss": 0.8404327, + "num_input_tokens_seen": 25768050, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.23046875, + "step": 1210, + "time_per_iteration": 2.4725959300994873 + }, + { + "auxiliary_loss_clip": 0.0121298, + "auxiliary_loss_mlp": 0.01063416, + "balance_loss_clip": 1.07030904, + "balance_loss_mlp": 1.03890657, + "epoch": 0.07280925898091087, + "flos": 24462169545600.0, + "grad_norm": 2.258951678783081, + "language_loss": 0.84565675, + "learning_rate": 3.98080740775156e-06, + "loss": 0.86842072, + "num_input_tokens_seen": 25787985, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.24536133, + "step": 1211, + "time_per_iteration": 2.490461587905884 + }, + { + "auxiliary_loss_clip": 0.01204393, + "auxiliary_loss_mlp": 0.01052619, + "balance_loss_clip": 1.06193161, + "balance_loss_mlp": 1.02988625, + "epoch": 0.07286938223357883, + "flos": 18288191220480.0, + "grad_norm": 2.725880556302708, + "language_loss": 0.90863448, + "learning_rate": 3.98075354481122e-06, + "loss": 0.93120468, + "num_input_tokens_seen": 25803620, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.22741699, + "step": 1212, + "time_per_iteration": 2.3850815296173096 + }, + { + "auxiliary_loss_clip": 0.01206675, + "auxiliary_loss_mlp": 0.01066992, + "balance_loss_clip": 1.06517863, + "balance_loss_mlp": 1.04055166, + "epoch": 0.07292950548624681, + "flos": 21214803646080.0, + "grad_norm": 2.0354088936467614, + "language_loss": 0.72514683, + "learning_rate": 3.9806996067605055e-06, + "loss": 0.7478835, + "num_input_tokens_seen": 25823315, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.2644043, + "step": 1213, + "time_per_iteration": 2.497105360031128 + }, + { + "auxiliary_loss_clip": 0.01210685, + "auxiliary_loss_mlp": 0.01051015, + "balance_loss_clip": 1.06470418, + "balance_loss_mlp": 1.02716172, + "epoch": 0.07298962873891478, + "flos": 24642009964800.0, + "grad_norm": 1.9473127040008968, + "language_loss": 0.84773988, + "learning_rate": 3.980645593601465e-06, + "loss": 0.8703568, + "num_input_tokens_seen": 25842605, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.23840332, + "step": 1214, + "time_per_iteration": 2.472391128540039 + }, + { + "auxiliary_loss_clip": 0.01214827, + "auxiliary_loss_mlp": 0.01067036, + "balance_loss_clip": 1.06780696, + "balance_loss_mlp": 1.04015398, + "epoch": 0.07304975199158274, + "flos": 27052765217280.0, + "grad_norm": 2.617679178992512, + "language_loss": 0.84334582, + "learning_rate": 3.980591505336144e-06, + "loss": 0.86616445, + "num_input_tokens_seen": 25863030, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.26916504, + "step": 1215, + "time_per_iteration": 2.494641065597534 + }, + { + "auxiliary_loss_clip": 0.01214954, + "auxiliary_loss_mlp": 0.01058549, + "balance_loss_clip": 1.06943059, + "balance_loss_mlp": 1.03445721, + "epoch": 0.07310987524425071, + "flos": 33549544091520.0, + "grad_norm": 1.6619707078176325, + "language_loss": 0.81346107, + "learning_rate": 3.980537341966595e-06, + "loss": 0.83619606, + "num_input_tokens_seen": 25888015, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.24084473, + "step": 1216, + "time_per_iteration": 2.5613090991973877 + }, + { + "auxiliary_loss_clip": 0.012119, + "auxiliary_loss_mlp": 0.01058317, + "balance_loss_clip": 1.06783032, + "balance_loss_mlp": 1.03576207, + "epoch": 0.07316999849691869, + "flos": 28110944908800.0, + "grad_norm": 1.8378492289622668, + "language_loss": 0.75876451, + "learning_rate": 3.980483103494872e-06, + "loss": 0.78146672, + "num_input_tokens_seen": 25908660, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.22546387, + "step": 1217, + "time_per_iteration": 2.5232012271881104 + }, + { + "auxiliary_loss_clip": 0.012073, + "auxiliary_loss_mlp": 0.01063697, + "balance_loss_clip": 1.06630516, + "balance_loss_mlp": 1.03979516, + "epoch": 0.07323012174958665, + "flos": 14392602529920.0, + "grad_norm": 2.058154882918209, + "language_loss": 0.86465037, + "learning_rate": 3.98042878992303e-06, + "loss": 0.88736033, + "num_input_tokens_seen": 25927215, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.23913574, + "step": 1218, + "time_per_iteration": 2.4462850093841553 + }, + { + "auxiliary_loss_clip": 0.01208902, + "auxiliary_loss_mlp": 0.01062829, + "balance_loss_clip": 1.06484354, + "balance_loss_mlp": 1.03960669, + "epoch": 0.07329024500225462, + "flos": 21616428591360.0, + "grad_norm": 2.6459072993395023, + "language_loss": 0.86714453, + "learning_rate": 3.9803744012531305e-06, + "loss": 0.88986182, + "num_input_tokens_seen": 25945500, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.2322998, + "step": 1219, + "time_per_iteration": 2.4574291706085205 + }, + { + "auxiliary_loss_clip": 0.01204331, + "auxiliary_loss_mlp": 0.01057807, + "balance_loss_clip": 1.06149316, + "balance_loss_mlp": 1.03502607, + "epoch": 0.0733503682549226, + "flos": 13224141106560.0, + "grad_norm": 2.1647334009276626, + "language_loss": 0.84082615, + "learning_rate": 3.980319937487235e-06, + "loss": 0.86344749, + "num_input_tokens_seen": 25963105, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.22802734, + "step": 1220, + "time_per_iteration": 2.404709577560425 + }, + { + "auxiliary_loss_clip": 0.01221537, + "auxiliary_loss_mlp": 0.01061726, + "balance_loss_clip": 1.07391167, + "balance_loss_mlp": 1.03800297, + "epoch": 0.07341049150759056, + "flos": 20886975192960.0, + "grad_norm": 3.091343094295821, + "language_loss": 0.77216065, + "learning_rate": 3.98026539862741e-06, + "loss": 0.79499322, + "num_input_tokens_seen": 25981690, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.23730469, + "step": 1221, + "time_per_iteration": 2.630052089691162 + }, + { + "auxiliary_loss_clip": 0.01214925, + "auxiliary_loss_mlp": 0.01060733, + "balance_loss_clip": 1.07077336, + "balance_loss_mlp": 1.03710604, + "epoch": 0.07347061476025853, + "flos": 15413614623360.0, + "grad_norm": 1.7439233774927578, + "language_loss": 0.9190942, + "learning_rate": 3.980210784675722e-06, + "loss": 0.94185084, + "num_input_tokens_seen": 25999890, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.23632812, + "step": 1222, + "time_per_iteration": 2.4399118423461914 + }, + { + "auxiliary_loss_clip": 0.01214367, + "auxiliary_loss_mlp": 0.01053489, + "balance_loss_clip": 1.06945896, + "balance_loss_mlp": 1.03124428, + "epoch": 0.0735307380129265, + "flos": 11108859131520.0, + "grad_norm": 2.5272802584020804, + "language_loss": 0.90849686, + "learning_rate": 3.980156095634242e-06, + "loss": 0.93117541, + "num_input_tokens_seen": 26016445, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.22229004, + "step": 1223, + "time_per_iteration": 2.432142496109009 + }, + { + "auxiliary_loss_clip": 0.01228672, + "auxiliary_loss_mlp": 0.01070036, + "balance_loss_clip": 1.08280313, + "balance_loss_mlp": 1.04679012, + "epoch": 0.07359086126559447, + "flos": 23732392924800.0, + "grad_norm": 1.9580020036818138, + "language_loss": 0.8198626, + "learning_rate": 3.980101331505045e-06, + "loss": 0.84284973, + "num_input_tokens_seen": 26036080, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.23242188, + "step": 1224, + "time_per_iteration": 2.5144426822662354 + }, + { + "auxiliary_loss_clip": 0.01205858, + "auxiliary_loss_mlp": 0.01057467, + "balance_loss_clip": 1.06319773, + "balance_loss_mlp": 1.03211164, + "epoch": 0.07365098451826244, + "flos": 20993270515200.0, + "grad_norm": 2.5194243557753317, + "language_loss": 0.83356708, + "learning_rate": 3.9800464922902076e-06, + "loss": 0.8562004, + "num_input_tokens_seen": 26055805, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.25366211, + "step": 1225, + "time_per_iteration": 2.4840774536132812 + }, + { + "auxiliary_loss_clip": 0.01205821, + "auxiliary_loss_mlp": 0.01050987, + "balance_loss_clip": 1.06217098, + "balance_loss_mlp": 1.02771759, + "epoch": 0.0737111077709304, + "flos": 19933582452480.0, + "grad_norm": 2.347871620537749, + "language_loss": 0.90218532, + "learning_rate": 3.979991577991808e-06, + "loss": 0.92475343, + "num_input_tokens_seen": 26073905, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.23278809, + "step": 1226, + "time_per_iteration": 2.480632781982422 + }, + { + "auxiliary_loss_clip": 0.01223267, + "auxiliary_loss_mlp": 0.01047257, + "balance_loss_clip": 1.07086062, + "balance_loss_mlp": 1.02254534, + "epoch": 0.07377123102359838, + "flos": 16581537342720.0, + "grad_norm": 4.192736542317146, + "language_loss": 0.76640958, + "learning_rate": 3.97993658861193e-06, + "loss": 0.78911483, + "num_input_tokens_seen": 26091700, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.24694824, + "step": 1227, + "time_per_iteration": 2.4789748191833496 + }, + { + "auxiliary_loss_clip": 0.01207025, + "auxiliary_loss_mlp": 0.01049835, + "balance_loss_clip": 1.06879687, + "balance_loss_mlp": 1.02650571, + "epoch": 0.07383135427626634, + "flos": 28328563457280.0, + "grad_norm": 2.6367179124185927, + "language_loss": 0.85685116, + "learning_rate": 3.9798815241526575e-06, + "loss": 0.8794198, + "num_input_tokens_seen": 26114105, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.23327637, + "step": 1228, + "time_per_iteration": 2.538839101791382 + }, + { + "auxiliary_loss_clip": 0.01209253, + "auxiliary_loss_mlp": 0.0106104, + "balance_loss_clip": 1.06518936, + "balance_loss_mlp": 1.0378778, + "epoch": 0.07389147752893431, + "flos": 20047168235520.0, + "grad_norm": 2.27477029556615, + "language_loss": 0.79585397, + "learning_rate": 3.97982638461608e-06, + "loss": 0.8185569, + "num_input_tokens_seen": 26131165, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.23181152, + "step": 1229, + "time_per_iteration": 2.4748551845550537 + }, + { + "auxiliary_loss_clip": 0.01209003, + "auxiliary_loss_mlp": 0.01061215, + "balance_loss_clip": 1.06578469, + "balance_loss_mlp": 1.0375278, + "epoch": 0.07395160078160229, + "flos": 18114132890880.0, + "grad_norm": 2.7817536644848917, + "language_loss": 0.78301793, + "learning_rate": 3.979771170004287e-06, + "loss": 0.80572009, + "num_input_tokens_seen": 26150040, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.23718262, + "step": 1230, + "time_per_iteration": 2.4498143196105957 + }, + { + "auxiliary_loss_clip": 0.01203669, + "auxiliary_loss_mlp": 0.01048198, + "balance_loss_clip": 1.06382704, + "balance_loss_mlp": 1.02404618, + "epoch": 0.07401172403427025, + "flos": 23586918842880.0, + "grad_norm": 2.3297097244768756, + "language_loss": 0.81561661, + "learning_rate": 3.979715880319372e-06, + "loss": 0.83813524, + "num_input_tokens_seen": 26169380, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.24169922, + "step": 1231, + "time_per_iteration": 2.507520914077759 + }, + { + "auxiliary_loss_clip": 0.01205563, + "auxiliary_loss_mlp": 0.01066035, + "balance_loss_clip": 1.06071329, + "balance_loss_mlp": 1.03978527, + "epoch": 0.07407184728693822, + "flos": 26359904799360.0, + "grad_norm": 2.870694208561745, + "language_loss": 0.9540047, + "learning_rate": 3.979660515563434e-06, + "loss": 0.97672069, + "num_input_tokens_seen": 26189420, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.2623291, + "step": 1232, + "time_per_iteration": 2.508561849594116 + }, + { + "auxiliary_loss_clip": 0.0120269, + "auxiliary_loss_mlp": 0.01066606, + "balance_loss_clip": 1.06139326, + "balance_loss_mlp": 1.0418458, + "epoch": 0.0741319705396062, + "flos": 22200443821440.0, + "grad_norm": 2.2344836709432765, + "language_loss": 0.80754137, + "learning_rate": 3.979605075738569e-06, + "loss": 0.83023429, + "num_input_tokens_seen": 26209300, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.24743652, + "step": 1233, + "time_per_iteration": 2.4956774711608887 + }, + { + "auxiliary_loss_clip": 0.01212986, + "auxiliary_loss_mlp": 0.01063853, + "balance_loss_clip": 1.06441188, + "balance_loss_mlp": 1.03797293, + "epoch": 0.07419209379227416, + "flos": 39200482523520.0, + "grad_norm": 3.144306247592431, + "language_loss": 0.70549631, + "learning_rate": 3.979549560846883e-06, + "loss": 0.72826469, + "num_input_tokens_seen": 26228110, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.25891113, + "step": 1234, + "time_per_iteration": 2.6145968437194824 + }, + { + "auxiliary_loss_clip": 0.0121109, + "auxiliary_loss_mlp": 0.01060092, + "balance_loss_clip": 1.06872201, + "balance_loss_mlp": 1.03580928, + "epoch": 0.07425221704494213, + "flos": 22781657790720.0, + "grad_norm": 3.7129632283913967, + "language_loss": 0.77017438, + "learning_rate": 3.979493970890478e-06, + "loss": 0.79288626, + "num_input_tokens_seen": 26247020, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.24304199, + "step": 1235, + "time_per_iteration": 2.4916763305664062 + }, + { + "auxiliary_loss_clip": 0.01200197, + "auxiliary_loss_mlp": 0.01064555, + "balance_loss_clip": 1.05966568, + "balance_loss_mlp": 1.03842425, + "epoch": 0.0743123402976101, + "flos": 22272983337600.0, + "grad_norm": 1.9389473055722866, + "language_loss": 0.82555699, + "learning_rate": 3.979438305871464e-06, + "loss": 0.84820449, + "num_input_tokens_seen": 26265750, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.26123047, + "step": 1236, + "time_per_iteration": 2.487182378768921 + }, + { + "auxiliary_loss_clip": 0.01205252, + "auxiliary_loss_mlp": 0.01056244, + "balance_loss_clip": 1.06012559, + "balance_loss_mlp": 1.03134108, + "epoch": 0.07437246355027807, + "flos": 29315029645440.0, + "grad_norm": 1.7521606089705515, + "language_loss": 0.75690234, + "learning_rate": 3.979382565791951e-06, + "loss": 0.77951729, + "num_input_tokens_seen": 26287905, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.24890137, + "step": 1237, + "time_per_iteration": 3.988358736038208 + }, + { + "auxiliary_loss_clip": 0.01205743, + "auxiliary_loss_mlp": 0.010579, + "balance_loss_clip": 1.06238008, + "balance_loss_mlp": 1.03428411, + "epoch": 0.07443258680294604, + "flos": 31944732249600.0, + "grad_norm": 1.943643276696429, + "language_loss": 0.77656877, + "learning_rate": 3.979326750654053e-06, + "loss": 0.79920518, + "num_input_tokens_seen": 26311795, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.23608398, + "step": 1238, + "time_per_iteration": 4.133524417877197 + }, + { + "auxiliary_loss_clip": 0.01213813, + "auxiliary_loss_mlp": 0.01055817, + "balance_loss_clip": 1.06498635, + "balance_loss_mlp": 1.03074765, + "epoch": 0.074492710055614, + "flos": 22675290641280.0, + "grad_norm": 2.8488854234475114, + "language_loss": 0.86444545, + "learning_rate": 3.9792708604598854e-06, + "loss": 0.8871417, + "num_input_tokens_seen": 26330330, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.25061035, + "step": 1239, + "time_per_iteration": 2.6079206466674805 + }, + { + "auxiliary_loss_clip": 0.0120469, + "auxiliary_loss_mlp": 0.01050815, + "balance_loss_clip": 1.06026483, + "balance_loss_mlp": 1.0251615, + "epoch": 0.07455283330828198, + "flos": 21284901037440.0, + "grad_norm": 1.9455487367006752, + "language_loss": 0.89504999, + "learning_rate": 3.979214895211569e-06, + "loss": 0.91760504, + "num_input_tokens_seen": 26348865, + "router_z_loss_clip": 1.44433594, + "router_z_loss_mlp": 0.25671387, + "step": 1240, + "time_per_iteration": 2.5605764389038086 + }, + { + "auxiliary_loss_clip": 0.01207232, + "auxiliary_loss_mlp": 0.01057768, + "balance_loss_clip": 1.06383038, + "balance_loss_mlp": 1.0324955, + "epoch": 0.07461295656094995, + "flos": 24388408967040.0, + "grad_norm": 2.066161257205739, + "language_loss": 0.88547421, + "learning_rate": 3.979158854911225e-06, + "loss": 0.90812421, + "num_input_tokens_seen": 26368210, + "router_z_loss_clip": 1.43359375, + "router_z_loss_mlp": 0.25268555, + "step": 1241, + "time_per_iteration": 3.900541067123413 + }, + { + "auxiliary_loss_clip": 0.01119095, + "auxiliary_loss_mlp": 0.0101437, + "balance_loss_clip": 1.05861068, + "balance_loss_mlp": 1.0105021, + "epoch": 0.07467307981361791, + "flos": 62109660574080.0, + "grad_norm": 0.8940507941688088, + "language_loss": 0.63119054, + "learning_rate": 3.979102739560979e-06, + "loss": 0.65252519, + "num_input_tokens_seen": 26424890, + "router_z_loss_clip": 0.60449219, + "router_z_loss_mlp": 0.03869629, + "step": 1242, + "time_per_iteration": 4.643768787384033 + }, + { + "auxiliary_loss_clip": 0.01217926, + "auxiliary_loss_mlp": 0.010591, + "balance_loss_clip": 1.06324911, + "balance_loss_mlp": 1.03310108, + "epoch": 0.07473320306628589, + "flos": 24863148046080.0, + "grad_norm": 2.930905163781513, + "language_loss": 0.63229388, + "learning_rate": 3.9790465491629595e-06, + "loss": 0.65506423, + "num_input_tokens_seen": 26446405, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.2598877, + "step": 1243, + "time_per_iteration": 2.504953384399414 + }, + { + "auxiliary_loss_clip": 0.01207522, + "auxiliary_loss_mlp": 0.01051903, + "balance_loss_clip": 1.06358171, + "balance_loss_mlp": 1.0274055, + "epoch": 0.07479332631895386, + "flos": 24897442556160.0, + "grad_norm": 2.4499937990074176, + "language_loss": 0.76407963, + "learning_rate": 3.978990283719296e-06, + "loss": 0.78667384, + "num_input_tokens_seen": 26466070, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.24499512, + "step": 1244, + "time_per_iteration": 2.5441794395446777 + }, + { + "auxiliary_loss_clip": 0.01211841, + "auxiliary_loss_mlp": 0.01057046, + "balance_loss_clip": 1.06579208, + "balance_loss_mlp": 1.03269136, + "epoch": 0.07485344957162182, + "flos": 17815247821440.0, + "grad_norm": 3.6114522467071617, + "language_loss": 0.69565231, + "learning_rate": 3.978933943232123e-06, + "loss": 0.71834117, + "num_input_tokens_seen": 26479350, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.24365234, + "step": 1245, + "time_per_iteration": 2.4118082523345947 + }, + { + "auxiliary_loss_clip": 0.01202772, + "auxiliary_loss_mlp": 0.01055542, + "balance_loss_clip": 1.05947447, + "balance_loss_mlp": 1.03046072, + "epoch": 0.0749135728242898, + "flos": 25010202326400.0, + "grad_norm": 2.0495702755745535, + "language_loss": 0.88991761, + "learning_rate": 3.978877527703576e-06, + "loss": 0.91250074, + "num_input_tokens_seen": 26498255, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.25073242, + "step": 1246, + "time_per_iteration": 2.5497145652770996 + }, + { + "auxiliary_loss_clip": 0.01217587, + "auxiliary_loss_mlp": 0.0106516, + "balance_loss_clip": 1.06416059, + "balance_loss_mlp": 1.03780127, + "epoch": 0.07497369607695777, + "flos": 17822071405440.0, + "grad_norm": 3.499665104476357, + "language_loss": 0.87981117, + "learning_rate": 3.9788210371357945e-06, + "loss": 0.90263867, + "num_input_tokens_seen": 26515375, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.27355957, + "step": 1247, + "time_per_iteration": 2.4394028186798096 + }, + { + "auxiliary_loss_clip": 0.01205774, + "auxiliary_loss_mlp": 0.01062092, + "balance_loss_clip": 1.06246042, + "balance_loss_mlp": 1.03690279, + "epoch": 0.07503381932962573, + "flos": 15121086261120.0, + "grad_norm": 2.2678222751119503, + "language_loss": 0.64342004, + "learning_rate": 3.978764471530921e-06, + "loss": 0.66609871, + "num_input_tokens_seen": 26533595, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.25170898, + "step": 1248, + "time_per_iteration": 2.5453619956970215 + }, + { + "auxiliary_loss_clip": 0.01206092, + "auxiliary_loss_mlp": 0.01061759, + "balance_loss_clip": 1.06554425, + "balance_loss_mlp": 1.03846574, + "epoch": 0.0750939425822937, + "flos": 12816734071680.0, + "grad_norm": 4.387075888592589, + "language_loss": 0.74073708, + "learning_rate": 3.978707830891102e-06, + "loss": 0.76341558, + "num_input_tokens_seen": 26549405, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.23291016, + "step": 1249, + "time_per_iteration": 2.419100761413574 + }, + { + "auxiliary_loss_clip": 0.01209282, + "auxiliary_loss_mlp": 0.01068202, + "balance_loss_clip": 1.0631144, + "balance_loss_mlp": 1.04219019, + "epoch": 0.07515406583496168, + "flos": 24206844695040.0, + "grad_norm": 24.347419074118253, + "language_loss": 0.82138693, + "learning_rate": 3.978651115218482e-06, + "loss": 0.84416181, + "num_input_tokens_seen": 26567200, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.26037598, + "step": 1250, + "time_per_iteration": 2.4984521865844727 + }, + { + "auxiliary_loss_clip": 0.01202147, + "auxiliary_loss_mlp": 0.01059098, + "balance_loss_clip": 1.06089008, + "balance_loss_mlp": 1.03466034, + "epoch": 0.07521418908762964, + "flos": 26688164215680.0, + "grad_norm": 2.2699657961483855, + "language_loss": 0.67135781, + "learning_rate": 3.978594324515215e-06, + "loss": 0.69397026, + "num_input_tokens_seen": 26586190, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.24462891, + "step": 1251, + "time_per_iteration": 2.536086082458496 + }, + { + "auxiliary_loss_clip": 0.01106209, + "auxiliary_loss_mlp": 0.01009979, + "balance_loss_clip": 1.04628158, + "balance_loss_mlp": 1.00647986, + "epoch": 0.0752743123402976, + "flos": 59095140589440.0, + "grad_norm": 1.0404015660910158, + "language_loss": 0.70429814, + "learning_rate": 3.9785374587834515e-06, + "loss": 0.72546005, + "num_input_tokens_seen": 26650710, + "router_z_loss_clip": 0.59960938, + "router_z_loss_mlp": 0.03497314, + "step": 1252, + "time_per_iteration": 3.100125551223755 + }, + { + "auxiliary_loss_clip": 0.01201568, + "auxiliary_loss_mlp": 0.01064836, + "balance_loss_clip": 1.05769753, + "balance_loss_mlp": 1.04019594, + "epoch": 0.07533443559296558, + "flos": 23477032160640.0, + "grad_norm": 2.345832677883682, + "language_loss": 0.79818523, + "learning_rate": 3.97848051802535e-06, + "loss": 0.82084918, + "num_input_tokens_seen": 26669000, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.24658203, + "step": 1253, + "time_per_iteration": 2.455687999725342 + }, + { + "auxiliary_loss_clip": 0.01207371, + "auxiliary_loss_mlp": 0.01060422, + "balance_loss_clip": 1.06301081, + "balance_loss_mlp": 1.03637791, + "epoch": 0.07539455884563355, + "flos": 20879110114560.0, + "grad_norm": 2.6078923138455874, + "language_loss": 0.93145645, + "learning_rate": 3.978423502243069e-06, + "loss": 0.9541344, + "num_input_tokens_seen": 26683075, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.24060059, + "step": 1254, + "time_per_iteration": 2.5013930797576904 + }, + { + "auxiliary_loss_clip": 0.01204815, + "auxiliary_loss_mlp": 0.0106001, + "balance_loss_clip": 1.06365156, + "balance_loss_mlp": 1.03631175, + "epoch": 0.07545468209830151, + "flos": 27672906551040.0, + "grad_norm": 2.5341239528494435, + "language_loss": 0.8815704, + "learning_rate": 3.97836641143877e-06, + "loss": 0.90421867, + "num_input_tokens_seen": 26701875, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.23681641, + "step": 1255, + "time_per_iteration": 2.538715362548828 + }, + { + "auxiliary_loss_clip": 0.01204685, + "auxiliary_loss_mlp": 0.01063802, + "balance_loss_clip": 1.0620997, + "balance_loss_mlp": 1.03845811, + "epoch": 0.0755148053509695, + "flos": 14136990370560.0, + "grad_norm": 1.8884839372603288, + "language_loss": 0.79288733, + "learning_rate": 3.978309245614618e-06, + "loss": 0.8155722, + "num_input_tokens_seen": 26719050, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.25341797, + "step": 1256, + "time_per_iteration": 2.480844259262085 + }, + { + "auxiliary_loss_clip": 0.01110162, + "auxiliary_loss_mlp": 0.01012158, + "balance_loss_clip": 1.04523158, + "balance_loss_mlp": 1.0080874, + "epoch": 0.07557492860363746, + "flos": 58235257929600.0, + "grad_norm": 0.7912105867675026, + "language_loss": 0.57999361, + "learning_rate": 3.9782520047727825e-06, + "loss": 0.60121679, + "num_input_tokens_seen": 26780650, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 0.04071045, + "step": 1257, + "time_per_iteration": 3.1560873985290527 + }, + { + "auxiliary_loss_clip": 0.01205243, + "auxiliary_loss_mlp": 0.01060984, + "balance_loss_clip": 1.06368089, + "balance_loss_mlp": 1.03728533, + "epoch": 0.07563505185630542, + "flos": 24644380262400.0, + "grad_norm": 2.0299068134034153, + "language_loss": 0.89916098, + "learning_rate": 3.978194688915432e-06, + "loss": 0.9218232, + "num_input_tokens_seen": 26798725, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.23681641, + "step": 1258, + "time_per_iteration": 2.4911327362060547 + }, + { + "auxiliary_loss_clip": 0.01199339, + "auxiliary_loss_mlp": 0.01053297, + "balance_loss_clip": 1.0627178, + "balance_loss_mlp": 1.03017056, + "epoch": 0.07569517510897339, + "flos": 15522998515200.0, + "grad_norm": 1.973848442764191, + "language_loss": 0.81381464, + "learning_rate": 3.978137298044741e-06, + "loss": 0.83634102, + "num_input_tokens_seen": 26817005, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.2310791, + "step": 1259, + "time_per_iteration": 2.438235282897949 + }, + { + "auxiliary_loss_clip": 0.01201075, + "auxiliary_loss_mlp": 0.0105791, + "balance_loss_clip": 1.06022835, + "balance_loss_mlp": 1.03479517, + "epoch": 0.07575529836164137, + "flos": 22928532503040.0, + "grad_norm": 1.979426233465801, + "language_loss": 0.75783795, + "learning_rate": 3.978079832162885e-06, + "loss": 0.78042775, + "num_input_tokens_seen": 26836655, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.2310791, + "step": 1260, + "time_per_iteration": 2.5024068355560303 + }, + { + "auxiliary_loss_clip": 0.01200699, + "auxiliary_loss_mlp": 0.01062175, + "balance_loss_clip": 1.0582478, + "balance_loss_mlp": 1.03792787, + "epoch": 0.07581542161430933, + "flos": 19500428344320.0, + "grad_norm": 2.82177755568658, + "language_loss": 0.84825385, + "learning_rate": 3.978022291272044e-06, + "loss": 0.87088251, + "num_input_tokens_seen": 26854925, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.24255371, + "step": 1261, + "time_per_iteration": 2.4158029556274414 + }, + { + "auxiliary_loss_clip": 0.01208063, + "auxiliary_loss_mlp": 0.01062058, + "balance_loss_clip": 1.06515968, + "balance_loss_mlp": 1.0385139, + "epoch": 0.0758755448669773, + "flos": 24973465691520.0, + "grad_norm": 1.973741434521675, + "language_loss": 0.82596701, + "learning_rate": 3.977964675374399e-06, + "loss": 0.84866828, + "num_input_tokens_seen": 26876170, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.23547363, + "step": 1262, + "time_per_iteration": 2.5124497413635254 + }, + { + "auxiliary_loss_clip": 0.01200304, + "auxiliary_loss_mlp": 0.01060143, + "balance_loss_clip": 1.05858731, + "balance_loss_mlp": 1.0348233, + "epoch": 0.07593566811964528, + "flos": 22747973811840.0, + "grad_norm": 2.7571514277863898, + "language_loss": 0.82910335, + "learning_rate": 3.977906984472136e-06, + "loss": 0.85170788, + "num_input_tokens_seen": 26895005, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.25341797, + "step": 1263, + "time_per_iteration": 2.4777414798736572 + }, + { + "auxiliary_loss_clip": 0.01212254, + "auxiliary_loss_mlp": 0.01054654, + "balance_loss_clip": 1.06594038, + "balance_loss_mlp": 1.03088427, + "epoch": 0.07599579137231324, + "flos": 23112395245440.0, + "grad_norm": 2.0439739838909277, + "language_loss": 0.76244611, + "learning_rate": 3.977849218567442e-06, + "loss": 0.78511512, + "num_input_tokens_seen": 26913930, + "router_z_loss_clip": 1.46386719, + "router_z_loss_mlp": 0.23803711, + "step": 1264, + "time_per_iteration": 2.46574068069458 + }, + { + "auxiliary_loss_clip": 0.01206092, + "auxiliary_loss_mlp": 0.0106211, + "balance_loss_clip": 1.06318116, + "balance_loss_mlp": 1.03742146, + "epoch": 0.07605591462498121, + "flos": 14502058248960.0, + "grad_norm": 2.631106971495979, + "language_loss": 0.80553931, + "learning_rate": 3.977791377662507e-06, + "loss": 0.82822138, + "num_input_tokens_seen": 26931485, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.24719238, + "step": 1265, + "time_per_iteration": 2.429931640625 + }, + { + "auxiliary_loss_clip": 0.01205085, + "auxiliary_loss_mlp": 0.01056781, + "balance_loss_clip": 1.06150532, + "balance_loss_mlp": 1.03287971, + "epoch": 0.07611603787764919, + "flos": 23514199758720.0, + "grad_norm": 2.567058277884775, + "language_loss": 0.65769589, + "learning_rate": 3.977733461759524e-06, + "loss": 0.68031454, + "num_input_tokens_seen": 26951670, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.23901367, + "step": 1266, + "time_per_iteration": 2.5104384422302246 + }, + { + "auxiliary_loss_clip": 0.01203516, + "auxiliary_loss_mlp": 0.01073887, + "balance_loss_clip": 1.05941212, + "balance_loss_mlp": 1.04684997, + "epoch": 0.07617616113031715, + "flos": 21507188353920.0, + "grad_norm": 2.959087242440341, + "language_loss": 0.79620075, + "learning_rate": 3.977675470860691e-06, + "loss": 0.81897485, + "num_input_tokens_seen": 26970335, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.27038574, + "step": 1267, + "time_per_iteration": 2.497453212738037 + }, + { + "auxiliary_loss_clip": 0.01201253, + "auxiliary_loss_mlp": 0.01052514, + "balance_loss_clip": 1.05836272, + "balance_loss_mlp": 1.02752757, + "epoch": 0.07623628438298512, + "flos": 14573161221120.0, + "grad_norm": 2.4772415094134033, + "language_loss": 0.72993964, + "learning_rate": 3.977617404968205e-06, + "loss": 0.75247729, + "num_input_tokens_seen": 26986025, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.24987793, + "step": 1268, + "time_per_iteration": 2.435974597930908 + }, + { + "auxiliary_loss_clip": 0.01201161, + "auxiliary_loss_mlp": 0.01060876, + "balance_loss_clip": 1.05709374, + "balance_loss_mlp": 1.03569937, + "epoch": 0.07629640763565308, + "flos": 14720395069440.0, + "grad_norm": 2.1060422326052914, + "language_loss": 0.82887805, + "learning_rate": 3.977559264084269e-06, + "loss": 0.85149848, + "num_input_tokens_seen": 27004045, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.25195312, + "step": 1269, + "time_per_iteration": 2.4135870933532715 + }, + { + "auxiliary_loss_clip": 0.01202763, + "auxiliary_loss_mlp": 0.01062339, + "balance_loss_clip": 1.06203032, + "balance_loss_mlp": 1.03731656, + "epoch": 0.07635653088832106, + "flos": 14902929008640.0, + "grad_norm": 2.758647380406465, + "language_loss": 0.88437819, + "learning_rate": 3.977501048211088e-06, + "loss": 0.90702927, + "num_input_tokens_seen": 27022070, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.25048828, + "step": 1270, + "time_per_iteration": 2.56874680519104 + }, + { + "auxiliary_loss_clip": 0.01207912, + "auxiliary_loss_mlp": 0.01057535, + "balance_loss_clip": 1.06246459, + "balance_loss_mlp": 1.03282309, + "epoch": 0.07641665414098903, + "flos": 26651571235200.0, + "grad_norm": 2.1242600279300596, + "language_loss": 0.71154076, + "learning_rate": 3.977442757350869e-06, + "loss": 0.73419523, + "num_input_tokens_seen": 27041755, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.24707031, + "step": 1271, + "time_per_iteration": 2.515638589859009 + }, + { + "auxiliary_loss_clip": 0.01215304, + "auxiliary_loss_mlp": 0.01076053, + "balance_loss_clip": 1.07429338, + "balance_loss_mlp": 1.05149615, + "epoch": 0.07647677739365699, + "flos": 25192808092800.0, + "grad_norm": 1.5208403858412034, + "language_loss": 0.8261869, + "learning_rate": 3.977384391505823e-06, + "loss": 0.84910047, + "num_input_tokens_seen": 27061540, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.24560547, + "step": 1272, + "time_per_iteration": 2.614356517791748 + }, + { + "auxiliary_loss_clip": 0.01205751, + "auxiliary_loss_mlp": 0.01053309, + "balance_loss_clip": 1.06054521, + "balance_loss_mlp": 1.02936006, + "epoch": 0.07653690064632497, + "flos": 20558141159040.0, + "grad_norm": 2.0392960137315814, + "language_loss": 0.80046785, + "learning_rate": 3.977325950678162e-06, + "loss": 0.82305849, + "num_input_tokens_seen": 27081395, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.23937988, + "step": 1273, + "time_per_iteration": 2.5054678916931152 + }, + { + "auxiliary_loss_clip": 0.01215295, + "auxiliary_loss_mlp": 0.01062306, + "balance_loss_clip": 1.06927299, + "balance_loss_mlp": 1.03757024, + "epoch": 0.07659702389899294, + "flos": 22269320150400.0, + "grad_norm": 2.078349959790279, + "language_loss": 0.81017184, + "learning_rate": 3.977267434870103e-06, + "loss": 0.83294785, + "num_input_tokens_seen": 27101175, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.24755859, + "step": 1274, + "time_per_iteration": 2.474898338317871 + }, + { + "auxiliary_loss_clip": 0.01203303, + "auxiliary_loss_mlp": 0.01067936, + "balance_loss_clip": 1.06147099, + "balance_loss_mlp": 1.04267526, + "epoch": 0.0766571471516609, + "flos": 32636120209920.0, + "grad_norm": 2.194974333248073, + "language_loss": 0.72886348, + "learning_rate": 3.977208844083865e-06, + "loss": 0.75157583, + "num_input_tokens_seen": 27124505, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.25256348, + "step": 1275, + "time_per_iteration": 2.521674633026123 + }, + { + "auxiliary_loss_clip": 0.01207178, + "auxiliary_loss_mlp": 0.01066368, + "balance_loss_clip": 1.06310451, + "balance_loss_mlp": 1.04005861, + "epoch": 0.07671727040432888, + "flos": 15267386355840.0, + "grad_norm": 2.589202802742963, + "language_loss": 0.79605877, + "learning_rate": 3.9771501783216685e-06, + "loss": 0.81879425, + "num_input_tokens_seen": 27140960, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.26330566, + "step": 1276, + "time_per_iteration": 2.459031105041504 + }, + { + "auxiliary_loss_clip": 0.01203881, + "auxiliary_loss_mlp": 0.0105695, + "balance_loss_clip": 1.06038284, + "balance_loss_mlp": 1.03055716, + "epoch": 0.07677739365699685, + "flos": 28184094956160.0, + "grad_norm": 3.1107705720642596, + "language_loss": 0.59288955, + "learning_rate": 3.97709143758574e-06, + "loss": 0.61549789, + "num_input_tokens_seen": 27160985, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.26416016, + "step": 1277, + "time_per_iteration": 2.494028091430664 + }, + { + "auxiliary_loss_clip": 0.01208845, + "auxiliary_loss_mlp": 0.01059366, + "balance_loss_clip": 1.0614562, + "balance_loss_mlp": 1.03470135, + "epoch": 0.07683751690966481, + "flos": 18296128126080.0, + "grad_norm": 2.839813928177505, + "language_loss": 0.74238193, + "learning_rate": 3.977032621878305e-06, + "loss": 0.76506412, + "num_input_tokens_seen": 27178390, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.24694824, + "step": 1278, + "time_per_iteration": 2.454986333847046 + }, + { + "auxiliary_loss_clip": 0.01197869, + "auxiliary_loss_mlp": 0.01055163, + "balance_loss_clip": 1.05825412, + "balance_loss_mlp": 1.03148794, + "epoch": 0.07689764016233278, + "flos": 21981101420160.0, + "grad_norm": 3.6511768940044482, + "language_loss": 0.88718498, + "learning_rate": 3.976973731201596e-06, + "loss": 0.90971524, + "num_input_tokens_seen": 27197505, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.23681641, + "step": 1279, + "time_per_iteration": 2.4841904640197754 + }, + { + "auxiliary_loss_clip": 0.01200794, + "auxiliary_loss_mlp": 0.01059432, + "balance_loss_clip": 1.06141758, + "balance_loss_mlp": 1.03455281, + "epoch": 0.07695776341500075, + "flos": 22235995307520.0, + "grad_norm": 2.488185210394218, + "language_loss": 0.82953238, + "learning_rate": 3.976914765557845e-06, + "loss": 0.85213464, + "num_input_tokens_seen": 27214260, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.2487793, + "step": 1280, + "time_per_iteration": 2.4491467475891113 + }, + { + "auxiliary_loss_clip": 0.01197616, + "auxiliary_loss_mlp": 0.01067987, + "balance_loss_clip": 1.05925, + "balance_loss_mlp": 1.04205918, + "epoch": 0.07701788666766872, + "flos": 16143750380160.0, + "grad_norm": 3.1436385820837254, + "language_loss": 0.76115894, + "learning_rate": 3.9768557249492875e-06, + "loss": 0.78381497, + "num_input_tokens_seen": 27232525, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.25927734, + "step": 1281, + "time_per_iteration": 3.7697184085845947 + }, + { + "auxiliary_loss_clip": 0.01206814, + "auxiliary_loss_mlp": 0.01059355, + "balance_loss_clip": 1.0601747, + "balance_loss_mlp": 1.03386831, + "epoch": 0.07707800992033668, + "flos": 19463045264640.0, + "grad_norm": 1.901318897891991, + "language_loss": 0.75558996, + "learning_rate": 3.9767966093781634e-06, + "loss": 0.77825165, + "num_input_tokens_seen": 27249800, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.25512695, + "step": 1282, + "time_per_iteration": 3.9136135578155518 + }, + { + "auxiliary_loss_clip": 0.01205242, + "auxiliary_loss_mlp": 0.01069771, + "balance_loss_clip": 1.06300402, + "balance_loss_mlp": 1.04273486, + "epoch": 0.07713813317300466, + "flos": 18990281433600.0, + "grad_norm": 1.8644473226232086, + "language_loss": 0.84076023, + "learning_rate": 3.976737418846713e-06, + "loss": 0.86351037, + "num_input_tokens_seen": 27268895, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.27075195, + "step": 1283, + "time_per_iteration": 2.416512966156006 + }, + { + "auxiliary_loss_clip": 0.01208171, + "auxiliary_loss_mlp": 0.01068627, + "balance_loss_clip": 1.06360769, + "balance_loss_mlp": 1.04249692, + "epoch": 0.07719825642567263, + "flos": 18113953322880.0, + "grad_norm": 1.7949992453671344, + "language_loss": 0.75005054, + "learning_rate": 3.976678153357181e-06, + "loss": 0.77281857, + "num_input_tokens_seen": 27288180, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.2611084, + "step": 1284, + "time_per_iteration": 2.400773763656616 + }, + { + "auxiliary_loss_clip": 0.01203767, + "auxiliary_loss_mlp": 0.01060806, + "balance_loss_clip": 1.0626452, + "balance_loss_mlp": 1.03692877, + "epoch": 0.0772583796783406, + "flos": 42194426993280.0, + "grad_norm": 2.405663441037612, + "language_loss": 0.76268983, + "learning_rate": 3.976618812911817e-06, + "loss": 0.78533554, + "num_input_tokens_seen": 27311815, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.23852539, + "step": 1285, + "time_per_iteration": 5.5899498462677 + }, + { + "auxiliary_loss_clip": 0.01213258, + "auxiliary_loss_mlp": 0.01075148, + "balance_loss_clip": 1.06955266, + "balance_loss_mlp": 1.05130577, + "epoch": 0.07731850293100857, + "flos": 24753692327040.0, + "grad_norm": 2.003176771211583, + "language_loss": 0.84145582, + "learning_rate": 3.9765593975128685e-06, + "loss": 0.86433983, + "num_input_tokens_seen": 27331890, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.23815918, + "step": 1286, + "time_per_iteration": 2.534374475479126 + }, + { + "auxiliary_loss_clip": 0.01207151, + "auxiliary_loss_mlp": 0.01059982, + "balance_loss_clip": 1.06154251, + "balance_loss_mlp": 1.03598475, + "epoch": 0.07737862618367654, + "flos": 17565884628480.0, + "grad_norm": 2.627490325040734, + "language_loss": 0.77227926, + "learning_rate": 3.97649990716259e-06, + "loss": 0.7949506, + "num_input_tokens_seen": 27348320, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.2401123, + "step": 1287, + "time_per_iteration": 2.4109344482421875 + }, + { + "auxiliary_loss_clip": 0.01208921, + "auxiliary_loss_mlp": 0.01052839, + "balance_loss_clip": 1.06748915, + "balance_loss_mlp": 1.02954578, + "epoch": 0.0774387494363445, + "flos": 25627147349760.0, + "grad_norm": 1.6798530896974406, + "language_loss": 0.84806848, + "learning_rate": 3.976440341863237e-06, + "loss": 0.87068617, + "num_input_tokens_seen": 27367670, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.23291016, + "step": 1288, + "time_per_iteration": 2.495408058166504 + }, + { + "auxiliary_loss_clip": 0.01207318, + "auxiliary_loss_mlp": 0.0105337, + "balance_loss_clip": 1.06221509, + "balance_loss_mlp": 1.03025544, + "epoch": 0.07749887268901248, + "flos": 12239865648000.0, + "grad_norm": 2.365178405367204, + "language_loss": 0.85365456, + "learning_rate": 3.976380701617068e-06, + "loss": 0.87626147, + "num_input_tokens_seen": 27385485, + "router_z_loss_clip": 1.45019531, + "router_z_loss_mlp": 0.23132324, + "step": 1289, + "time_per_iteration": 2.465898036956787 + }, + { + "auxiliary_loss_clip": 0.01198928, + "auxiliary_loss_mlp": 0.01048851, + "balance_loss_clip": 1.05887926, + "balance_loss_mlp": 1.0255816, + "epoch": 0.07755899594168045, + "flos": 25081736261760.0, + "grad_norm": 16.820086716149223, + "language_loss": 0.85268927, + "learning_rate": 3.976320986426344e-06, + "loss": 0.87516707, + "num_input_tokens_seen": 27405110, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.23278809, + "step": 1290, + "time_per_iteration": 2.5487308502197266 + }, + { + "auxiliary_loss_clip": 0.01199093, + "auxiliary_loss_mlp": 0.01056967, + "balance_loss_clip": 1.06134272, + "balance_loss_mlp": 1.03176594, + "epoch": 0.07761911919434841, + "flos": 14246410176000.0, + "grad_norm": 4.27454324630133, + "language_loss": 0.91479784, + "learning_rate": 3.9762611962933315e-06, + "loss": 0.9373585, + "num_input_tokens_seen": 27422855, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.25231934, + "step": 1291, + "time_per_iteration": 2.4277701377868652 + }, + { + "auxiliary_loss_clip": 0.01101858, + "auxiliary_loss_mlp": 0.01010025, + "balance_loss_clip": 1.04097795, + "balance_loss_mlp": 1.00544763, + "epoch": 0.07767924244701638, + "flos": 67237202954880.0, + "grad_norm": 0.8846200023200761, + "language_loss": 0.65083092, + "learning_rate": 3.9762013312202955e-06, + "loss": 0.6719498, + "num_input_tokens_seen": 27487190, + "router_z_loss_clip": 0.60839844, + "router_z_loss_mlp": 0.04577637, + "step": 1292, + "time_per_iteration": 3.1592979431152344 + }, + { + "auxiliary_loss_clip": 0.01201825, + "auxiliary_loss_mlp": 0.01061685, + "balance_loss_clip": 1.05956125, + "balance_loss_mlp": 1.03631783, + "epoch": 0.07773936569968436, + "flos": 28550635292160.0, + "grad_norm": 1.9949009699050788, + "language_loss": 0.8783772, + "learning_rate": 3.9761413912095075e-06, + "loss": 0.9010123, + "num_input_tokens_seen": 27510465, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.25390625, + "step": 1293, + "time_per_iteration": 2.5139577388763428 + }, + { + "auxiliary_loss_clip": 0.01206681, + "auxiliary_loss_mlp": 0.01066418, + "balance_loss_clip": 1.0633291, + "balance_loss_mlp": 1.0402751, + "epoch": 0.07779948895235232, + "flos": 27490264871040.0, + "grad_norm": 2.7656420098605365, + "language_loss": 0.84934944, + "learning_rate": 3.976081376263239e-06, + "loss": 0.87208045, + "num_input_tokens_seen": 27528645, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.26135254, + "step": 1294, + "time_per_iteration": 2.4858431816101074 + }, + { + "auxiliary_loss_clip": 0.01206364, + "auxiliary_loss_mlp": 0.01057809, + "balance_loss_clip": 1.06422901, + "balance_loss_mlp": 1.0337646, + "epoch": 0.07785961220502029, + "flos": 18223301301120.0, + "grad_norm": 2.5488285777815323, + "language_loss": 0.79359722, + "learning_rate": 3.976021286383768e-06, + "loss": 0.81623894, + "num_input_tokens_seen": 27546165, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.24047852, + "step": 1295, + "time_per_iteration": 2.445600986480713 + }, + { + "auxiliary_loss_clip": 0.01200893, + "auxiliary_loss_mlp": 0.01056452, + "balance_loss_clip": 1.06030369, + "balance_loss_mlp": 1.03200209, + "epoch": 0.07791973545768827, + "flos": 24608218245120.0, + "grad_norm": 2.219281359307061, + "language_loss": 0.88270152, + "learning_rate": 3.975961121573371e-06, + "loss": 0.90527499, + "num_input_tokens_seen": 27566520, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.24462891, + "step": 1296, + "time_per_iteration": 2.4766769409179688 + }, + { + "auxiliary_loss_clip": 0.01210219, + "auxiliary_loss_mlp": 0.01055671, + "balance_loss_clip": 1.066535, + "balance_loss_mlp": 1.03026724, + "epoch": 0.07797985871035623, + "flos": 14282069402880.0, + "grad_norm": 2.457248295250087, + "language_loss": 0.96704024, + "learning_rate": 3.9759008818343305e-06, + "loss": 0.98969913, + "num_input_tokens_seen": 27581960, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.25378418, + "step": 1297, + "time_per_iteration": 2.5910539627075195 + }, + { + "auxiliary_loss_clip": 0.01204637, + "auxiliary_loss_mlp": 0.0105788, + "balance_loss_clip": 1.06108069, + "balance_loss_mlp": 1.03390741, + "epoch": 0.0780399819630242, + "flos": 26610453141120.0, + "grad_norm": 2.085134196279652, + "language_loss": 0.76240659, + "learning_rate": 3.97584056716893e-06, + "loss": 0.78503174, + "num_input_tokens_seen": 27601415, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.23950195, + "step": 1298, + "time_per_iteration": 2.521641731262207 + }, + { + "auxiliary_loss_clip": 0.01205924, + "auxiliary_loss_mlp": 0.0106058, + "balance_loss_clip": 1.06391573, + "balance_loss_mlp": 1.03723931, + "epoch": 0.07810010521569218, + "flos": 21834514016640.0, + "grad_norm": 1.7366042351629773, + "language_loss": 0.80717897, + "learning_rate": 3.9757801775794575e-06, + "loss": 0.82984406, + "num_input_tokens_seen": 27621490, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.23339844, + "step": 1299, + "time_per_iteration": 2.4686591625213623 + }, + { + "auxiliary_loss_clip": 0.01201544, + "auxiliary_loss_mlp": 0.01056615, + "balance_loss_clip": 1.0643332, + "balance_loss_mlp": 1.03239131, + "epoch": 0.07816022846836014, + "flos": 25081233471360.0, + "grad_norm": 1.7935875884084322, + "language_loss": 0.86843783, + "learning_rate": 3.975719713068202e-06, + "loss": 0.89101934, + "num_input_tokens_seen": 27640600, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.24206543, + "step": 1300, + "time_per_iteration": 2.5265190601348877 + }, + { + "auxiliary_loss_clip": 0.01203602, + "auxiliary_loss_mlp": 0.0105093, + "balance_loss_clip": 1.06337905, + "balance_loss_mlp": 1.02674246, + "epoch": 0.0782203517210281, + "flos": 40917515431680.0, + "grad_norm": 2.4194052077953923, + "language_loss": 0.72163188, + "learning_rate": 3.975659173637458e-06, + "loss": 0.7441771, + "num_input_tokens_seen": 27663070, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.24194336, + "step": 1301, + "time_per_iteration": 2.652167558670044 + }, + { + "auxiliary_loss_clip": 0.01206062, + "auxiliary_loss_mlp": 0.01064403, + "balance_loss_clip": 1.06225729, + "balance_loss_mlp": 1.0405848, + "epoch": 0.07828047497369607, + "flos": 41172014269440.0, + "grad_norm": 2.8183401439157865, + "language_loss": 0.70763636, + "learning_rate": 3.97559855928952e-06, + "loss": 0.73034096, + "num_input_tokens_seen": 27686425, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.23840332, + "step": 1302, + "time_per_iteration": 2.613041877746582 + }, + { + "auxiliary_loss_clip": 0.01204778, + "auxiliary_loss_mlp": 0.01057087, + "balance_loss_clip": 1.06278145, + "balance_loss_mlp": 1.03189826, + "epoch": 0.07834059822636405, + "flos": 23508130360320.0, + "grad_norm": 2.1598465768782833, + "language_loss": 0.82158184, + "learning_rate": 3.9755378700266864e-06, + "loss": 0.84420049, + "num_input_tokens_seen": 27704900, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.2520752, + "step": 1303, + "time_per_iteration": 2.4775588512420654 + }, + { + "auxiliary_loss_clip": 0.01199325, + "auxiliary_loss_mlp": 0.01066621, + "balance_loss_clip": 1.05878258, + "balance_loss_mlp": 1.03972757, + "epoch": 0.07840072147903202, + "flos": 20193899293440.0, + "grad_norm": 1.6456142281414858, + "language_loss": 0.74949038, + "learning_rate": 3.9754771058512585e-06, + "loss": 0.7721498, + "num_input_tokens_seen": 27724890, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.26904297, + "step": 1304, + "time_per_iteration": 2.4414191246032715 + }, + { + "auxiliary_loss_clip": 0.01208447, + "auxiliary_loss_mlp": 0.01058385, + "balance_loss_clip": 1.066522, + "balance_loss_mlp": 1.03366101, + "epoch": 0.07846084473169998, + "flos": 21360816432000.0, + "grad_norm": 1.7646108047633737, + "language_loss": 0.76416636, + "learning_rate": 3.975416266765542e-06, + "loss": 0.78683472, + "num_input_tokens_seen": 27743115, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.24731445, + "step": 1305, + "time_per_iteration": 2.455134630203247 + }, + { + "auxiliary_loss_clip": 0.01205596, + "auxiliary_loss_mlp": 0.01063122, + "balance_loss_clip": 1.06305599, + "balance_loss_mlp": 1.03883898, + "epoch": 0.07852096798436796, + "flos": 25410965345280.0, + "grad_norm": 1.629098483595839, + "language_loss": 0.84805733, + "learning_rate": 3.975355352771841e-06, + "loss": 0.87074453, + "num_input_tokens_seen": 27763570, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.24291992, + "step": 1306, + "time_per_iteration": 2.4799892902374268 + }, + { + "auxiliary_loss_clip": 0.01203515, + "auxiliary_loss_mlp": 0.01044748, + "balance_loss_clip": 1.06355536, + "balance_loss_mlp": 1.02240825, + "epoch": 0.07858109123703592, + "flos": 24571481610240.0, + "grad_norm": 2.643130159518014, + "language_loss": 0.90634894, + "learning_rate": 3.975294363872468e-06, + "loss": 0.92883158, + "num_input_tokens_seen": 27780030, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.22338867, + "step": 1307, + "time_per_iteration": 2.4778754711151123 + }, + { + "auxiliary_loss_clip": 0.01202106, + "auxiliary_loss_mlp": 0.01053325, + "balance_loss_clip": 1.06125522, + "balance_loss_mlp": 1.02881575, + "epoch": 0.07864121448970389, + "flos": 20698874645760.0, + "grad_norm": 1.9939595440956999, + "language_loss": 0.83227789, + "learning_rate": 3.975233300069735e-06, + "loss": 0.85483217, + "num_input_tokens_seen": 27796225, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.24523926, + "step": 1308, + "time_per_iteration": 2.4668045043945312 + }, + { + "auxiliary_loss_clip": 0.01208085, + "auxiliary_loss_mlp": 0.01055601, + "balance_loss_clip": 1.06763291, + "balance_loss_mlp": 1.03287959, + "epoch": 0.07870133774237187, + "flos": 22966526113920.0, + "grad_norm": 1.5847631531583242, + "language_loss": 0.77497995, + "learning_rate": 3.975172161365958e-06, + "loss": 0.79761684, + "num_input_tokens_seen": 27815975, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.22741699, + "step": 1309, + "time_per_iteration": 2.4682087898254395 + }, + { + "auxiliary_loss_clip": 0.01207775, + "auxiliary_loss_mlp": 0.0106448, + "balance_loss_clip": 1.06333899, + "balance_loss_mlp": 1.03809905, + "epoch": 0.07876146099503983, + "flos": 18842832103680.0, + "grad_norm": 2.0082766702989407, + "language_loss": 0.80465972, + "learning_rate": 3.975110947763453e-06, + "loss": 0.82738227, + "num_input_tokens_seen": 27832255, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.26416016, + "step": 1310, + "time_per_iteration": 2.462458610534668 + }, + { + "auxiliary_loss_clip": 0.0120249, + "auxiliary_loss_mlp": 0.01054977, + "balance_loss_clip": 1.06633997, + "balance_loss_mlp": 1.03226805, + "epoch": 0.0788215842477078, + "flos": 23805794367360.0, + "grad_norm": 1.7264743272067469, + "language_loss": 0.73152173, + "learning_rate": 3.9750496592645435e-06, + "loss": 0.75409639, + "num_input_tokens_seen": 27852180, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.22692871, + "step": 1311, + "time_per_iteration": 2.4931325912475586 + }, + { + "auxiliary_loss_clip": 0.01201766, + "auxiliary_loss_mlp": 0.01075782, + "balance_loss_clip": 1.0631882, + "balance_loss_mlp": 1.05091488, + "epoch": 0.07888170750037576, + "flos": 21579907438080.0, + "grad_norm": 1.8364580208558006, + "language_loss": 0.85795063, + "learning_rate": 3.974988295871553e-06, + "loss": 0.8807261, + "num_input_tokens_seen": 27871435, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.2487793, + "step": 1312, + "time_per_iteration": 2.486048698425293 + }, + { + "auxiliary_loss_clip": 0.01198115, + "auxiliary_loss_mlp": 0.01060961, + "balance_loss_clip": 1.06114423, + "balance_loss_mlp": 1.0380013, + "epoch": 0.07894183075304374, + "flos": 19864849777920.0, + "grad_norm": 1.7855395657855984, + "language_loss": 0.82241052, + "learning_rate": 3.9749268575868085e-06, + "loss": 0.84500122, + "num_input_tokens_seen": 27890625, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.22961426, + "step": 1313, + "time_per_iteration": 2.5130159854888916 + }, + { + "auxiliary_loss_clip": 0.01209086, + "auxiliary_loss_mlp": 0.01062843, + "balance_loss_clip": 1.0613519, + "balance_loss_mlp": 1.03711748, + "epoch": 0.07900195400571171, + "flos": 16143463071360.0, + "grad_norm": 2.855800177173317, + "language_loss": 0.73567301, + "learning_rate": 3.97486534441264e-06, + "loss": 0.75839227, + "num_input_tokens_seen": 27906530, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.25708008, + "step": 1314, + "time_per_iteration": 2.4317376613616943 + }, + { + "auxiliary_loss_clip": 0.01199829, + "auxiliary_loss_mlp": 0.01058123, + "balance_loss_clip": 1.05983782, + "balance_loss_mlp": 1.03546166, + "epoch": 0.07906207725837967, + "flos": 23730417676800.0, + "grad_norm": 1.639314815226468, + "language_loss": 0.79888576, + "learning_rate": 3.974803756351379e-06, + "loss": 0.82146525, + "num_input_tokens_seen": 27926725, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.22680664, + "step": 1315, + "time_per_iteration": 2.5089874267578125 + }, + { + "auxiliary_loss_clip": 0.01212026, + "auxiliary_loss_mlp": 0.01059675, + "balance_loss_clip": 1.06852245, + "balance_loss_mlp": 1.0344739, + "epoch": 0.07912220051104765, + "flos": 24315905364480.0, + "grad_norm": 1.9124756364668456, + "language_loss": 0.74112993, + "learning_rate": 3.974742093405362e-06, + "loss": 0.76384687, + "num_input_tokens_seen": 27947875, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.25219727, + "step": 1316, + "time_per_iteration": 2.494887590408325 + }, + { + "auxiliary_loss_clip": 0.01201516, + "auxiliary_loss_mlp": 0.0105931, + "balance_loss_clip": 1.05902624, + "balance_loss_mlp": 1.0349679, + "epoch": 0.07918232376371562, + "flos": 18880035615360.0, + "grad_norm": 2.2828511022381868, + "language_loss": 0.65609926, + "learning_rate": 3.974680355576927e-06, + "loss": 0.67870748, + "num_input_tokens_seen": 27965040, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.24353027, + "step": 1317, + "time_per_iteration": 2.438446283340454 + }, + { + "auxiliary_loss_clip": 0.01212331, + "auxiliary_loss_mlp": 0.0106758, + "balance_loss_clip": 1.06444871, + "balance_loss_mlp": 1.0415926, + "epoch": 0.07924244701638358, + "flos": 27376284038400.0, + "grad_norm": 2.370263355433321, + "language_loss": 0.7328558, + "learning_rate": 3.974618542868415e-06, + "loss": 0.75565487, + "num_input_tokens_seen": 27985330, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.2598877, + "step": 1318, + "time_per_iteration": 2.529364585876465 + }, + { + "auxiliary_loss_clip": 0.01203505, + "auxiliary_loss_mlp": 0.01054777, + "balance_loss_clip": 1.06256878, + "balance_loss_mlp": 1.03145957, + "epoch": 0.07930257026905156, + "flos": 25120340403840.0, + "grad_norm": 1.6062557441527365, + "language_loss": 0.90693814, + "learning_rate": 3.97455665528217e-06, + "loss": 0.92952096, + "num_input_tokens_seen": 28007615, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.2331543, + "step": 1319, + "time_per_iteration": 2.5055739879608154 + }, + { + "auxiliary_loss_clip": 0.01213484, + "auxiliary_loss_mlp": 0.01056018, + "balance_loss_clip": 1.06683004, + "balance_loss_mlp": 1.03205645, + "epoch": 0.07936269352171953, + "flos": 21834478103040.0, + "grad_norm": 2.684778961420721, + "language_loss": 0.79990375, + "learning_rate": 3.974494692820539e-06, + "loss": 0.82259876, + "num_input_tokens_seen": 28027765, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.23913574, + "step": 1320, + "time_per_iteration": 2.5428009033203125 + }, + { + "auxiliary_loss_clip": 0.01214712, + "auxiliary_loss_mlp": 0.01056429, + "balance_loss_clip": 1.07307827, + "balance_loss_mlp": 1.03221774, + "epoch": 0.07942281677438749, + "flos": 16939889377920.0, + "grad_norm": 2.0125310166325963, + "language_loss": 0.69370961, + "learning_rate": 3.974432655485872e-06, + "loss": 0.71642101, + "num_input_tokens_seen": 28044225, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.24194336, + "step": 1321, + "time_per_iteration": 2.4270827770233154 + }, + { + "auxiliary_loss_clip": 0.01201526, + "auxiliary_loss_mlp": 0.01053716, + "balance_loss_clip": 1.0634222, + "balance_loss_mlp": 1.03051758, + "epoch": 0.07948294002705546, + "flos": 18986941468800.0, + "grad_norm": 2.0367464620785003, + "language_loss": 0.8406136, + "learning_rate": 3.9743705432805195e-06, + "loss": 0.86316609, + "num_input_tokens_seen": 28062915, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.23181152, + "step": 1322, + "time_per_iteration": 2.448850631713867 + }, + { + "auxiliary_loss_clip": 0.01201988, + "auxiliary_loss_mlp": 0.01063045, + "balance_loss_clip": 1.05934799, + "balance_loss_mlp": 1.03888178, + "epoch": 0.07954306327972344, + "flos": 21653452535040.0, + "grad_norm": 2.5742269720608166, + "language_loss": 0.90186906, + "learning_rate": 3.974308356206838e-06, + "loss": 0.92451936, + "num_input_tokens_seen": 28082175, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.24169922, + "step": 1323, + "time_per_iteration": 2.433849334716797 + }, + { + "auxiliary_loss_clip": 0.01198453, + "auxiliary_loss_mlp": 0.0105793, + "balance_loss_clip": 1.0613935, + "balance_loss_mlp": 1.03402889, + "epoch": 0.0796031865323914, + "flos": 23220270766080.0, + "grad_norm": 2.088857072911598, + "language_loss": 0.82490158, + "learning_rate": 3.974246094267187e-06, + "loss": 0.84746546, + "num_input_tokens_seen": 28102645, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.23913574, + "step": 1324, + "time_per_iteration": 3.8974289894104004 + }, + { + "auxiliary_loss_clip": 0.01200021, + "auxiliary_loss_mlp": 0.01050117, + "balance_loss_clip": 1.05977595, + "balance_loss_mlp": 1.02418923, + "epoch": 0.07966330978505937, + "flos": 23294534135040.0, + "grad_norm": 2.6566533035964985, + "language_loss": 0.79001009, + "learning_rate": 3.974183757463925e-06, + "loss": 0.8125115, + "num_input_tokens_seen": 28122805, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.25915527, + "step": 1325, + "time_per_iteration": 3.9036386013031006 + }, + { + "auxiliary_loss_clip": 0.01204807, + "auxiliary_loss_mlp": 0.01066727, + "balance_loss_clip": 1.06413937, + "balance_loss_mlp": 1.04078698, + "epoch": 0.07972343303772735, + "flos": 18363783392640.0, + "grad_norm": 2.045199224986592, + "language_loss": 0.88516891, + "learning_rate": 3.974121345799418e-06, + "loss": 0.90788424, + "num_input_tokens_seen": 28140530, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.25939941, + "step": 1326, + "time_per_iteration": 2.4452896118164062 + }, + { + "auxiliary_loss_clip": 0.0119425, + "auxiliary_loss_mlp": 0.01051739, + "balance_loss_clip": 1.05758691, + "balance_loss_mlp": 1.02672851, + "epoch": 0.07978355629039531, + "flos": 21762513204480.0, + "grad_norm": 2.374566401133083, + "language_loss": 0.83202791, + "learning_rate": 3.974058859276032e-06, + "loss": 0.85448778, + "num_input_tokens_seen": 28159640, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.25012207, + "step": 1327, + "time_per_iteration": 2.535269021987915 + }, + { + "auxiliary_loss_clip": 0.01208722, + "auxiliary_loss_mlp": 0.01052472, + "balance_loss_clip": 1.06524515, + "balance_loss_mlp": 1.02698541, + "epoch": 0.07984367954306328, + "flos": 18551309322240.0, + "grad_norm": 2.3484043467099784, + "language_loss": 0.78471708, + "learning_rate": 3.9739962978961354e-06, + "loss": 0.807329, + "num_input_tokens_seen": 28177050, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.25488281, + "step": 1328, + "time_per_iteration": 5.401085138320923 + }, + { + "auxiliary_loss_clip": 0.01203681, + "auxiliary_loss_mlp": 0.01053375, + "balance_loss_clip": 1.06342649, + "balance_loss_mlp": 1.02788854, + "epoch": 0.07990380279573125, + "flos": 16904050583040.0, + "grad_norm": 3.2469838280394194, + "language_loss": 0.74242973, + "learning_rate": 3.973933661662101e-06, + "loss": 0.76500034, + "num_input_tokens_seen": 28193245, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.25476074, + "step": 1329, + "time_per_iteration": 2.4459948539733887 + }, + { + "auxiliary_loss_clip": 0.01199574, + "auxiliary_loss_mlp": 0.01062902, + "balance_loss_clip": 1.06019294, + "balance_loss_mlp": 1.03865457, + "epoch": 0.07996392604839922, + "flos": 24098358643200.0, + "grad_norm": 2.130360687162517, + "language_loss": 0.81063598, + "learning_rate": 3.973870950576305e-06, + "loss": 0.83326077, + "num_input_tokens_seen": 28213570, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.24255371, + "step": 1330, + "time_per_iteration": 2.493551254272461 + }, + { + "auxiliary_loss_clip": 0.01206623, + "auxiliary_loss_mlp": 0.01062899, + "balance_loss_clip": 1.06441259, + "balance_loss_mlp": 1.03797269, + "epoch": 0.08002404930106718, + "flos": 14278729438080.0, + "grad_norm": 2.3532602901680875, + "language_loss": 0.88582492, + "learning_rate": 3.9738081646411255e-06, + "loss": 0.9085201, + "num_input_tokens_seen": 28229980, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.24963379, + "step": 1331, + "time_per_iteration": 2.477362632751465 + }, + { + "auxiliary_loss_clip": 0.01210423, + "auxiliary_loss_mlp": 0.01058602, + "balance_loss_clip": 1.06512356, + "balance_loss_mlp": 1.03403282, + "epoch": 0.08008417255373516, + "flos": 40406219285760.0, + "grad_norm": 2.321068338796065, + "language_loss": 0.73424113, + "learning_rate": 3.973745303858942e-06, + "loss": 0.75693142, + "num_input_tokens_seen": 28253840, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.24560547, + "step": 1332, + "time_per_iteration": 2.6203205585479736 + }, + { + "auxiliary_loss_clip": 0.01219092, + "auxiliary_loss_mlp": 0.010564, + "balance_loss_clip": 1.07586837, + "balance_loss_mlp": 1.03274858, + "epoch": 0.08014429580640313, + "flos": 18478913460480.0, + "grad_norm": 3.5126654507561574, + "language_loss": 0.82605529, + "learning_rate": 3.973682368232138e-06, + "loss": 0.8488102, + "num_input_tokens_seen": 28271675, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.2364502, + "step": 1333, + "time_per_iteration": 2.4613161087036133 + }, + { + "auxiliary_loss_clip": 0.01201618, + "auxiliary_loss_mlp": 0.01054047, + "balance_loss_clip": 1.05980587, + "balance_loss_mlp": 1.03034794, + "epoch": 0.0802044190590711, + "flos": 22053461368320.0, + "grad_norm": 3.0079984499156973, + "language_loss": 0.75252867, + "learning_rate": 3.9736193577631015e-06, + "loss": 0.77508527, + "num_input_tokens_seen": 28291850, + "router_z_loss_clip": 1.41796875, + "router_z_loss_mlp": 0.23706055, + "step": 1334, + "time_per_iteration": 2.4881861209869385 + }, + { + "auxiliary_loss_clip": 0.01206082, + "auxiliary_loss_mlp": 0.01060041, + "balance_loss_clip": 1.06715608, + "balance_loss_mlp": 1.03641391, + "epoch": 0.08026454231173906, + "flos": 24572128055040.0, + "grad_norm": 1.8882487579245322, + "language_loss": 0.79916054, + "learning_rate": 3.973556272454221e-06, + "loss": 0.82182181, + "num_input_tokens_seen": 28310780, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.23657227, + "step": 1335, + "time_per_iteration": 2.52423357963562 + }, + { + "auxiliary_loss_clip": 0.01107386, + "auxiliary_loss_mlp": 0.01023055, + "balance_loss_clip": 1.0478189, + "balance_loss_mlp": 1.01826262, + "epoch": 0.08032466556440704, + "flos": 52581841459200.0, + "grad_norm": 0.7478039939417483, + "language_loss": 0.55990607, + "learning_rate": 3.973493112307889e-06, + "loss": 0.58121049, + "num_input_tokens_seen": 28369985, + "router_z_loss_clip": 0.59472656, + "router_z_loss_mlp": 0.0479126, + "step": 1336, + "time_per_iteration": 3.130521774291992 + }, + { + "auxiliary_loss_clip": 0.01205455, + "auxiliary_loss_mlp": 0.01063474, + "balance_loss_clip": 1.06525171, + "balance_loss_mlp": 1.04050231, + "epoch": 0.080384788817075, + "flos": 23842602829440.0, + "grad_norm": 2.011159600336431, + "language_loss": 0.67388451, + "learning_rate": 3.9734298773265005e-06, + "loss": 0.69657379, + "num_input_tokens_seen": 28388670, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.2298584, + "step": 1337, + "time_per_iteration": 2.510448694229126 + }, + { + "auxiliary_loss_clip": 0.0120913, + "auxiliary_loss_mlp": 0.01066406, + "balance_loss_clip": 1.06841683, + "balance_loss_mlp": 1.04326797, + "epoch": 0.08044491206974297, + "flos": 25300719527040.0, + "grad_norm": 1.8451426666817834, + "language_loss": 0.8662653, + "learning_rate": 3.973366567512453e-06, + "loss": 0.88902068, + "num_input_tokens_seen": 28411845, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.23132324, + "step": 1338, + "time_per_iteration": 2.5288970470428467 + }, + { + "auxiliary_loss_clip": 0.01201081, + "auxiliary_loss_mlp": 0.01072311, + "balance_loss_clip": 1.05995536, + "balance_loss_mlp": 1.04594159, + "epoch": 0.08050503532241095, + "flos": 22376549226240.0, + "grad_norm": 2.262144304134684, + "language_loss": 0.87245041, + "learning_rate": 3.973303182868147e-06, + "loss": 0.89518434, + "num_input_tokens_seen": 28427875, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.26391602, + "step": 1339, + "time_per_iteration": 2.4681692123413086 + }, + { + "auxiliary_loss_clip": 0.01199671, + "auxiliary_loss_mlp": 0.01055628, + "balance_loss_clip": 1.06204462, + "balance_loss_mlp": 1.03309774, + "epoch": 0.08056515857507891, + "flos": 18369421827840.0, + "grad_norm": 2.030535074886346, + "language_loss": 0.88788855, + "learning_rate": 3.973239723395988e-06, + "loss": 0.91044152, + "num_input_tokens_seen": 28446615, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.22546387, + "step": 1340, + "time_per_iteration": 2.424318552017212 + }, + { + "auxiliary_loss_clip": 0.01137708, + "auxiliary_loss_mlp": 0.01013337, + "balance_loss_clip": 1.07547903, + "balance_loss_mlp": 1.0095526, + "epoch": 0.08062528182774688, + "flos": 51348130980480.0, + "grad_norm": 0.8790418659672082, + "language_loss": 0.6483478, + "learning_rate": 3.97317618909838e-06, + "loss": 0.66985828, + "num_input_tokens_seen": 28505290, + "router_z_loss_clip": 0.62109375, + "router_z_loss_mlp": 0.0378418, + "step": 1341, + "time_per_iteration": 3.02180552482605 + }, + { + "auxiliary_loss_clip": 0.01213931, + "auxiliary_loss_mlp": 0.01058294, + "balance_loss_clip": 1.06708944, + "balance_loss_mlp": 1.03242612, + "epoch": 0.08068540508041486, + "flos": 17599712261760.0, + "grad_norm": 2.4666191758462155, + "language_loss": 0.89747536, + "learning_rate": 3.973112579977733e-06, + "loss": 0.92019749, + "num_input_tokens_seen": 28522735, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.25854492, + "step": 1342, + "time_per_iteration": 2.432866096496582 + }, + { + "auxiliary_loss_clip": 0.01218318, + "auxiliary_loss_mlp": 0.01060031, + "balance_loss_clip": 1.07311094, + "balance_loss_mlp": 1.03487778, + "epoch": 0.08074552833308282, + "flos": 10561185486720.0, + "grad_norm": 2.373121742905433, + "language_loss": 0.76704586, + "learning_rate": 3.973048896036459e-06, + "loss": 0.78982937, + "num_input_tokens_seen": 28539460, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.25146484, + "step": 1343, + "time_per_iteration": 2.4281842708587646 + }, + { + "auxiliary_loss_clip": 0.0111124, + "auxiliary_loss_mlp": 0.01010472, + "balance_loss_clip": 1.0493027, + "balance_loss_mlp": 1.006657, + "epoch": 0.08080565158575079, + "flos": 60840254954880.0, + "grad_norm": 0.8009009220645079, + "language_loss": 0.57422251, + "learning_rate": 3.972985137276974e-06, + "loss": 0.59543955, + "num_input_tokens_seen": 28599855, + "router_z_loss_clip": 0.61914062, + "router_z_loss_mlp": 0.03808594, + "step": 1344, + "time_per_iteration": 2.9721901416778564 + }, + { + "auxiliary_loss_clip": 0.01210607, + "auxiliary_loss_mlp": 0.01058708, + "balance_loss_clip": 1.06788647, + "balance_loss_mlp": 1.03435338, + "epoch": 0.08086577483841875, + "flos": 18332361970560.0, + "grad_norm": 2.6794680800932635, + "language_loss": 0.86838061, + "learning_rate": 3.972921303701695e-06, + "loss": 0.89107376, + "num_input_tokens_seen": 28617585, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.24353027, + "step": 1345, + "time_per_iteration": 2.4614949226379395 + }, + { + "auxiliary_loss_clip": 0.01199692, + "auxiliary_loss_mlp": 0.01049274, + "balance_loss_clip": 1.06229711, + "balance_loss_mlp": 1.02624297, + "epoch": 0.08092589809108673, + "flos": 21543601766400.0, + "grad_norm": 1.90512342455909, + "language_loss": 0.87542826, + "learning_rate": 3.972857395313042e-06, + "loss": 0.89791793, + "num_input_tokens_seen": 28636355, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.23034668, + "step": 1346, + "time_per_iteration": 2.440563678741455 + }, + { + "auxiliary_loss_clip": 0.01207565, + "auxiliary_loss_mlp": 0.01054481, + "balance_loss_clip": 1.06648862, + "balance_loss_mlp": 1.03085375, + "epoch": 0.0809860213437547, + "flos": 22128012046080.0, + "grad_norm": 1.601135961541481, + "language_loss": 0.92793161, + "learning_rate": 3.972793412113439e-06, + "loss": 0.95055205, + "num_input_tokens_seen": 28656260, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.2364502, + "step": 1347, + "time_per_iteration": 2.601360321044922 + }, + { + "auxiliary_loss_clip": 0.01200705, + "auxiliary_loss_mlp": 0.01072189, + "balance_loss_clip": 1.06111121, + "balance_loss_mlp": 1.04409146, + "epoch": 0.08104614459642266, + "flos": 21725489260800.0, + "grad_norm": 1.737755939595027, + "language_loss": 0.89072871, + "learning_rate": 3.972729354105312e-06, + "loss": 0.91345769, + "num_input_tokens_seen": 28675865, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.28100586, + "step": 1348, + "time_per_iteration": 2.4848477840423584 + }, + { + "auxiliary_loss_clip": 0.01213819, + "auxiliary_loss_mlp": 0.01059246, + "balance_loss_clip": 1.07518137, + "balance_loss_mlp": 1.03578556, + "epoch": 0.08110626784909064, + "flos": 23951878980480.0, + "grad_norm": 1.6246516260573594, + "language_loss": 0.76583421, + "learning_rate": 3.97266522129109e-06, + "loss": 0.78856492, + "num_input_tokens_seen": 28696255, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.23449707, + "step": 1349, + "time_per_iteration": 2.513993263244629 + }, + { + "auxiliary_loss_clip": 0.01201494, + "auxiliary_loss_mlp": 0.01062191, + "balance_loss_clip": 1.06153607, + "balance_loss_mlp": 1.03770518, + "epoch": 0.0811663911017586, + "flos": 19025689265280.0, + "grad_norm": 2.17917874502898, + "language_loss": 0.88763797, + "learning_rate": 3.972601013673205e-06, + "loss": 0.9102748, + "num_input_tokens_seen": 28713905, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.24523926, + "step": 1350, + "time_per_iteration": 2.4377894401550293 + }, + { + "auxiliary_loss_clip": 0.01196907, + "auxiliary_loss_mlp": 0.01063777, + "balance_loss_clip": 1.06080222, + "balance_loss_mlp": 1.03966093, + "epoch": 0.08122651435442657, + "flos": 15341290588800.0, + "grad_norm": 1.9950237876202164, + "language_loss": 0.82298899, + "learning_rate": 3.972536731254092e-06, + "loss": 0.84559584, + "num_input_tokens_seen": 28732075, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.24121094, + "step": 1351, + "time_per_iteration": 2.505598306655884 + }, + { + "auxiliary_loss_clip": 0.01199359, + "auxiliary_loss_mlp": 0.01051019, + "balance_loss_clip": 1.05931902, + "balance_loss_mlp": 1.02613986, + "epoch": 0.08128663760709455, + "flos": 23221563655680.0, + "grad_norm": 2.3773377629840544, + "language_loss": 0.75350702, + "learning_rate": 3.972472374036189e-06, + "loss": 0.77601075, + "num_input_tokens_seen": 28751150, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.24890137, + "step": 1352, + "time_per_iteration": 2.4955077171325684 + }, + { + "auxiliary_loss_clip": 0.01204121, + "auxiliary_loss_mlp": 0.01056259, + "balance_loss_clip": 1.06294882, + "balance_loss_mlp": 1.03123689, + "epoch": 0.08134676085976252, + "flos": 22965628273920.0, + "grad_norm": 1.8874034124692363, + "language_loss": 0.83110988, + "learning_rate": 3.972407942021935e-06, + "loss": 0.85371363, + "num_input_tokens_seen": 28773360, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.25, + "step": 1353, + "time_per_iteration": 2.5010809898376465 + }, + { + "auxiliary_loss_clip": 0.01097622, + "auxiliary_loss_mlp": 0.01020118, + "balance_loss_clip": 1.04179275, + "balance_loss_mlp": 1.01659524, + "epoch": 0.08140688411243048, + "flos": 64322115816960.0, + "grad_norm": 0.8594729588170499, + "language_loss": 0.59714526, + "learning_rate": 3.972343435213775e-06, + "loss": 0.61832261, + "num_input_tokens_seen": 28833390, + "router_z_loss_clip": 0.55761719, + "router_z_loss_mlp": 0.03527832, + "step": 1354, + "time_per_iteration": 3.135298252105713 + }, + { + "auxiliary_loss_clip": 0.01194558, + "auxiliary_loss_mlp": 0.01057598, + "balance_loss_clip": 1.05909109, + "balance_loss_mlp": 1.03418565, + "epoch": 0.08146700736509845, + "flos": 22491858862080.0, + "grad_norm": 1.7967511790550121, + "language_loss": 0.82712853, + "learning_rate": 3.972278853614154e-06, + "loss": 0.84965014, + "num_input_tokens_seen": 28852430, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.234375, + "step": 1355, + "time_per_iteration": 2.56046199798584 + }, + { + "auxiliary_loss_clip": 0.01199773, + "auxiliary_loss_mlp": 0.01059964, + "balance_loss_clip": 1.06075633, + "balance_loss_mlp": 1.03388143, + "epoch": 0.08152713061776642, + "flos": 20447823513600.0, + "grad_norm": 2.105731205345736, + "language_loss": 0.71367383, + "learning_rate": 3.972214197225521e-06, + "loss": 0.73627126, + "num_input_tokens_seen": 28870685, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.26074219, + "step": 1356, + "time_per_iteration": 2.4793968200683594 + }, + { + "auxiliary_loss_clip": 0.01201039, + "auxiliary_loss_mlp": 0.0105526, + "balance_loss_clip": 1.0593549, + "balance_loss_mlp": 1.03038144, + "epoch": 0.08158725387043439, + "flos": 23550218121600.0, + "grad_norm": 2.195118019260158, + "language_loss": 0.70271456, + "learning_rate": 3.972149466050329e-06, + "loss": 0.7252776, + "num_input_tokens_seen": 28889860, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.24865723, + "step": 1357, + "time_per_iteration": 2.473951816558838 + }, + { + "auxiliary_loss_clip": 0.01208372, + "auxiliary_loss_mlp": 0.01065572, + "balance_loss_clip": 1.06545365, + "balance_loss_mlp": 1.03807044, + "epoch": 0.08164737712310235, + "flos": 22017335264640.0, + "grad_norm": 2.4843268689673614, + "language_loss": 0.84260464, + "learning_rate": 3.97208466009103e-06, + "loss": 0.86534405, + "num_input_tokens_seen": 28905865, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.27502441, + "step": 1358, + "time_per_iteration": 2.4494428634643555 + }, + { + "auxiliary_loss_clip": 0.0120593, + "auxiliary_loss_mlp": 0.01057258, + "balance_loss_clip": 1.0639658, + "balance_loss_mlp": 1.03091311, + "epoch": 0.08170750037577033, + "flos": 23367827836800.0, + "grad_norm": 2.0176245770441756, + "language_loss": 1.02482128, + "learning_rate": 3.972019779350084e-06, + "loss": 1.04745317, + "num_input_tokens_seen": 28925250, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.26367188, + "step": 1359, + "time_per_iteration": 2.4917564392089844 + }, + { + "auxiliary_loss_clip": 0.01201961, + "auxiliary_loss_mlp": 0.01052971, + "balance_loss_clip": 1.06187379, + "balance_loss_mlp": 1.02793694, + "epoch": 0.0817676236284383, + "flos": 28397978490240.0, + "grad_norm": 2.2800990110938937, + "language_loss": 0.83316314, + "learning_rate": 3.971954823829951e-06, + "loss": 0.85571247, + "num_input_tokens_seen": 28943445, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.25012207, + "step": 1360, + "time_per_iteration": 2.49775767326355 + }, + { + "auxiliary_loss_clip": 0.0122476, + "auxiliary_loss_mlp": 0.01066457, + "balance_loss_clip": 1.07859027, + "balance_loss_mlp": 1.04123259, + "epoch": 0.08182774688110626, + "flos": 19208905562880.0, + "grad_norm": 2.2133547383856187, + "language_loss": 0.7195847, + "learning_rate": 3.971889793533093e-06, + "loss": 0.74249691, + "num_input_tokens_seen": 28962695, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.25219727, + "step": 1361, + "time_per_iteration": 2.486652135848999 + }, + { + "auxiliary_loss_clip": 0.01202175, + "auxiliary_loss_mlp": 0.01058194, + "balance_loss_clip": 1.06308937, + "balance_loss_mlp": 1.03265965, + "epoch": 0.08188787013377424, + "flos": 22784099915520.0, + "grad_norm": 3.0141707471771215, + "language_loss": 0.76818913, + "learning_rate": 3.971824688461976e-06, + "loss": 0.79079282, + "num_input_tokens_seen": 28982120, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.25537109, + "step": 1362, + "time_per_iteration": 2.478294610977173 + }, + { + "auxiliary_loss_clip": 0.01210949, + "auxiliary_loss_mlp": 0.01055657, + "balance_loss_clip": 1.06978035, + "balance_loss_mlp": 1.03148162, + "epoch": 0.08194799338644221, + "flos": 16468095214080.0, + "grad_norm": 2.346833111701837, + "language_loss": 0.72430897, + "learning_rate": 3.971759508619069e-06, + "loss": 0.74697506, + "num_input_tokens_seen": 28998100, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.24169922, + "step": 1363, + "time_per_iteration": 2.4477202892303467 + }, + { + "auxiliary_loss_clip": 0.01219051, + "auxiliary_loss_mlp": 0.01051483, + "balance_loss_clip": 1.07835317, + "balance_loss_mlp": 1.02498269, + "epoch": 0.08200811663911017, + "flos": 23913633974400.0, + "grad_norm": 2.2866245772640204, + "language_loss": 0.77389604, + "learning_rate": 3.971694254006844e-06, + "loss": 0.79660141, + "num_input_tokens_seen": 29017095, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.26525879, + "step": 1364, + "time_per_iteration": 2.483591318130493 + }, + { + "auxiliary_loss_clip": 0.01207712, + "auxiliary_loss_mlp": 0.01061031, + "balance_loss_clip": 1.06753373, + "balance_loss_mlp": 1.03599691, + "epoch": 0.08206823989177814, + "flos": 17896550256000.0, + "grad_norm": 1.7683591726377252, + "language_loss": 0.81970549, + "learning_rate": 3.971628924627776e-06, + "loss": 0.84239292, + "num_input_tokens_seen": 29037240, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.25036621, + "step": 1365, + "time_per_iteration": 2.5405008792877197 + }, + { + "auxiliary_loss_clip": 0.01201303, + "auxiliary_loss_mlp": 0.01062661, + "balance_loss_clip": 1.06357896, + "balance_loss_mlp": 1.03829479, + "epoch": 0.08212836314444612, + "flos": 22088186841600.0, + "grad_norm": 1.877946550965078, + "language_loss": 0.82006615, + "learning_rate": 3.97156352048434e-06, + "loss": 0.84270585, + "num_input_tokens_seen": 29056250, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.24353027, + "step": 1366, + "time_per_iteration": 2.439049243927002 + }, + { + "auxiliary_loss_clip": 0.01210958, + "auxiliary_loss_mlp": 0.01063581, + "balance_loss_clip": 1.06693673, + "balance_loss_mlp": 1.04056168, + "epoch": 0.08218848639711408, + "flos": 17597485618560.0, + "grad_norm": 1.9597119971682901, + "language_loss": 0.81575108, + "learning_rate": 3.97149804157902e-06, + "loss": 0.83849657, + "num_input_tokens_seen": 29073380, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.23022461, + "step": 1367, + "time_per_iteration": 2.455850601196289 + }, + { + "auxiliary_loss_clip": 0.01206961, + "auxiliary_loss_mlp": 0.01058137, + "balance_loss_clip": 1.06435275, + "balance_loss_mlp": 1.03346038, + "epoch": 0.08224860964978205, + "flos": 17857838373120.0, + "grad_norm": 2.2914901846604008, + "language_loss": 0.83898085, + "learning_rate": 3.9714324879142946e-06, + "loss": 0.86163181, + "num_input_tokens_seen": 29091330, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.2467041, + "step": 1368, + "time_per_iteration": 3.8584437370300293 + }, + { + "auxiliary_loss_clip": 0.01203236, + "auxiliary_loss_mlp": 0.01054463, + "balance_loss_clip": 1.06722164, + "balance_loss_mlp": 1.03107405, + "epoch": 0.08230873290245003, + "flos": 25227533566080.0, + "grad_norm": 1.7210285294301382, + "language_loss": 0.8129757, + "learning_rate": 3.971366859492653e-06, + "loss": 0.83555269, + "num_input_tokens_seen": 29110375, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.23388672, + "step": 1369, + "time_per_iteration": 3.9046788215637207 + }, + { + "auxiliary_loss_clip": 0.01198503, + "auxiliary_loss_mlp": 0.01053813, + "balance_loss_clip": 1.0645659, + "balance_loss_mlp": 1.0320574, + "epoch": 0.08236885615511799, + "flos": 31759935753600.0, + "grad_norm": 2.19976151854757, + "language_loss": 0.74912453, + "learning_rate": 3.971301156316582e-06, + "loss": 0.77164763, + "num_input_tokens_seen": 29129395, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.21765137, + "step": 1370, + "time_per_iteration": 2.5289344787597656 + }, + { + "auxiliary_loss_clip": 0.01206261, + "auxiliary_loss_mlp": 0.01064351, + "balance_loss_clip": 1.06507969, + "balance_loss_mlp": 1.04006851, + "epoch": 0.08242897940778596, + "flos": 23185832601600.0, + "grad_norm": 1.6480378089009955, + "language_loss": 0.743559, + "learning_rate": 3.971235378388573e-06, + "loss": 0.76626509, + "num_input_tokens_seen": 29148650, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.24267578, + "step": 1371, + "time_per_iteration": 3.920104742050171 + }, + { + "auxiliary_loss_clip": 0.01206534, + "auxiliary_loss_mlp": 0.01063199, + "balance_loss_clip": 1.065382, + "balance_loss_mlp": 1.03878486, + "epoch": 0.08248910266045394, + "flos": 34491480393600.0, + "grad_norm": 1.9918516531394495, + "language_loss": 0.7088629, + "learning_rate": 3.971169525711122e-06, + "loss": 0.73156023, + "num_input_tokens_seen": 29170785, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.24401855, + "step": 1372, + "time_per_iteration": 2.555016279220581 + }, + { + "auxiliary_loss_clip": 0.01204045, + "auxiliary_loss_mlp": 0.01058619, + "balance_loss_clip": 1.06321263, + "balance_loss_mlp": 1.03143954, + "epoch": 0.0825492259131219, + "flos": 13436228960640.0, + "grad_norm": 2.5700134617747508, + "language_loss": 0.88156807, + "learning_rate": 3.9711035982867246e-06, + "loss": 0.90419471, + "num_input_tokens_seen": 29185210, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.27160645, + "step": 1373, + "time_per_iteration": 3.802147150039673 + }, + { + "auxiliary_loss_clip": 0.01202969, + "auxiliary_loss_mlp": 0.01066337, + "balance_loss_clip": 1.0619849, + "balance_loss_mlp": 1.03952694, + "epoch": 0.08260934916578987, + "flos": 25812446636160.0, + "grad_norm": 1.7935173819668473, + "language_loss": 0.82426429, + "learning_rate": 3.971037596117882e-06, + "loss": 0.84695733, + "num_input_tokens_seen": 29205210, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.26818848, + "step": 1374, + "time_per_iteration": 2.4998981952667236 + }, + { + "auxiliary_loss_clip": 0.01091194, + "auxiliary_loss_mlp": 0.01032808, + "balance_loss_clip": 1.03620756, + "balance_loss_mlp": 1.02910674, + "epoch": 0.08266947241845783, + "flos": 63460009491840.0, + "grad_norm": 0.8301342971370199, + "language_loss": 0.60714132, + "learning_rate": 3.970971519207095e-06, + "loss": 0.62838137, + "num_input_tokens_seen": 29265350, + "router_z_loss_clip": 0.54980469, + "router_z_loss_mlp": 0.0369873, + "step": 1375, + "time_per_iteration": 3.04308819770813 + }, + { + "auxiliary_loss_clip": 0.01098345, + "auxiliary_loss_mlp": 0.01017548, + "balance_loss_clip": 1.04105759, + "balance_loss_mlp": 1.01350689, + "epoch": 0.08272959567112581, + "flos": 69993704568960.0, + "grad_norm": 0.9083993694213378, + "language_loss": 0.62214082, + "learning_rate": 3.970905367556871e-06, + "loss": 0.64329976, + "num_input_tokens_seen": 29321475, + "router_z_loss_clip": 0.57128906, + "router_z_loss_mlp": 0.04040527, + "step": 1376, + "time_per_iteration": 3.0171079635620117 + }, + { + "auxiliary_loss_clip": 0.01208769, + "auxiliary_loss_mlp": 0.01068791, + "balance_loss_clip": 1.06671727, + "balance_loss_mlp": 1.04517579, + "epoch": 0.08278971892379378, + "flos": 20413205781120.0, + "grad_norm": 2.461519445570273, + "language_loss": 0.82610649, + "learning_rate": 3.970839141169718e-06, + "loss": 0.84888208, + "num_input_tokens_seen": 29341405, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.23632812, + "step": 1377, + "time_per_iteration": 2.482084274291992 + }, + { + "auxiliary_loss_clip": 0.01201481, + "auxiliary_loss_mlp": 0.01060242, + "balance_loss_clip": 1.06324947, + "balance_loss_mlp": 1.03672183, + "epoch": 0.08284984217646174, + "flos": 26250233598720.0, + "grad_norm": 2.8001309035361825, + "language_loss": 0.85128021, + "learning_rate": 3.970772840048147e-06, + "loss": 0.87389749, + "num_input_tokens_seen": 29361955, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.23498535, + "step": 1378, + "time_per_iteration": 2.5309107303619385 + }, + { + "auxiliary_loss_clip": 0.01205449, + "auxiliary_loss_mlp": 0.01069258, + "balance_loss_clip": 1.06400883, + "balance_loss_mlp": 1.04381824, + "epoch": 0.08290996542912972, + "flos": 27194683852800.0, + "grad_norm": 2.70949497517178, + "language_loss": 0.87988257, + "learning_rate": 3.970706464194672e-06, + "loss": 0.90262967, + "num_input_tokens_seen": 29382395, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.25439453, + "step": 1379, + "time_per_iteration": 2.5201337337493896 + }, + { + "auxiliary_loss_clip": 0.01208137, + "auxiliary_loss_mlp": 0.01070012, + "balance_loss_clip": 1.06755781, + "balance_loss_mlp": 1.04657495, + "epoch": 0.08297008868179769, + "flos": 38618191146240.0, + "grad_norm": 2.049352754883285, + "language_loss": 0.78203559, + "learning_rate": 3.970640013611812e-06, + "loss": 0.80481708, + "num_input_tokens_seen": 29404460, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.23449707, + "step": 1380, + "time_per_iteration": 2.6383767127990723 + }, + { + "auxiliary_loss_clip": 0.01215685, + "auxiliary_loss_mlp": 0.01072514, + "balance_loss_clip": 1.07701015, + "balance_loss_mlp": 1.04836202, + "epoch": 0.08303021193446565, + "flos": 19974736460160.0, + "grad_norm": 3.109790464396715, + "language_loss": 0.85951501, + "learning_rate": 3.970573488302083e-06, + "loss": 0.88239694, + "num_input_tokens_seen": 29422675, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.24169922, + "step": 1381, + "time_per_iteration": 2.5575125217437744 + }, + { + "auxiliary_loss_clip": 0.01226058, + "auxiliary_loss_mlp": 0.01070561, + "balance_loss_clip": 1.07698894, + "balance_loss_mlp": 1.04571819, + "epoch": 0.08309033518713363, + "flos": 13662646341120.0, + "grad_norm": 6.248489580738116, + "language_loss": 0.88621294, + "learning_rate": 3.970506888268011e-06, + "loss": 0.90917915, + "num_input_tokens_seen": 29439840, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.24816895, + "step": 1382, + "time_per_iteration": 2.508143663406372 + }, + { + "auxiliary_loss_clip": 0.01207132, + "auxiliary_loss_mlp": 0.01067898, + "balance_loss_clip": 1.06580329, + "balance_loss_mlp": 1.04629767, + "epoch": 0.0831504584398016, + "flos": 17968551068160.0, + "grad_norm": 1.990601183950446, + "language_loss": 0.77720392, + "learning_rate": 3.970440213512121e-06, + "loss": 0.79995424, + "num_input_tokens_seen": 29457360, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.21606445, + "step": 1383, + "time_per_iteration": 2.4148776531219482 + }, + { + "auxiliary_loss_clip": 0.01215763, + "auxiliary_loss_mlp": 0.0107816, + "balance_loss_clip": 1.07052827, + "balance_loss_mlp": 1.05387712, + "epoch": 0.08321058169246956, + "flos": 22601386408320.0, + "grad_norm": 2.0250682992191362, + "language_loss": 0.82680058, + "learning_rate": 3.97037346403694e-06, + "loss": 0.84973985, + "num_input_tokens_seen": 29477040, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.24304199, + "step": 1384, + "time_per_iteration": 2.5076780319213867 + }, + { + "auxiliary_loss_clip": 0.01214985, + "auxiliary_loss_mlp": 0.01067357, + "balance_loss_clip": 1.06740403, + "balance_loss_mlp": 1.04146504, + "epoch": 0.08327070494513754, + "flos": 22850426378880.0, + "grad_norm": 4.00715154863909, + "language_loss": 0.8546921, + "learning_rate": 3.970306639845e-06, + "loss": 0.87751555, + "num_input_tokens_seen": 29492010, + "router_z_loss_clip": 1.47753906, + "router_z_loss_mlp": 0.25891113, + "step": 1385, + "time_per_iteration": 2.4716920852661133 + }, + { + "auxiliary_loss_clip": 0.0121011, + "auxiliary_loss_mlp": 0.01077535, + "balance_loss_clip": 1.06559014, + "balance_loss_mlp": 1.05203652, + "epoch": 0.0833308281978055, + "flos": 22782986593920.0, + "grad_norm": 1.7330466687227186, + "language_loss": 0.69200933, + "learning_rate": 3.970239740938835e-06, + "loss": 0.71488577, + "num_input_tokens_seen": 29511850, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.25500488, + "step": 1386, + "time_per_iteration": 2.4683189392089844 + }, + { + "auxiliary_loss_clip": 0.01207764, + "auxiliary_loss_mlp": 0.01064121, + "balance_loss_clip": 1.06516218, + "balance_loss_mlp": 1.04029155, + "epoch": 0.08339095145047347, + "flos": 20812604083200.0, + "grad_norm": 1.6205562247821668, + "language_loss": 0.81614482, + "learning_rate": 3.97017276732098e-06, + "loss": 0.83886373, + "num_input_tokens_seen": 29531415, + "router_z_loss_clip": 1.42773438, + "router_z_loss_mlp": 0.23828125, + "step": 1387, + "time_per_iteration": 2.444894552230835 + }, + { + "auxiliary_loss_clip": 0.01212233, + "auxiliary_loss_mlp": 0.01074995, + "balance_loss_clip": 1.06715608, + "balance_loss_mlp": 1.04869699, + "epoch": 0.08345107470314143, + "flos": 18515326872960.0, + "grad_norm": 6.062993842752933, + "language_loss": 0.77156186, + "learning_rate": 3.970105718993978e-06, + "loss": 0.79443419, + "num_input_tokens_seen": 29549525, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.26318359, + "step": 1388, + "time_per_iteration": 2.4620039463043213 + }, + { + "auxiliary_loss_clip": 0.01209156, + "auxiliary_loss_mlp": 0.0107004, + "balance_loss_clip": 1.07057786, + "balance_loss_mlp": 1.04461277, + "epoch": 0.08351119795580941, + "flos": 18807567926400.0, + "grad_norm": 2.1185093861634536, + "language_loss": 0.79487669, + "learning_rate": 3.970038595960369e-06, + "loss": 0.81766862, + "num_input_tokens_seen": 29568705, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.25415039, + "step": 1389, + "time_per_iteration": 2.4647130966186523 + }, + { + "auxiliary_loss_clip": 0.01211343, + "auxiliary_loss_mlp": 0.01062645, + "balance_loss_clip": 1.0676651, + "balance_loss_mlp": 1.03871942, + "epoch": 0.08357132120847738, + "flos": 18441817689600.0, + "grad_norm": 2.852646578699914, + "language_loss": 0.87205458, + "learning_rate": 3.969971398222699e-06, + "loss": 0.89479446, + "num_input_tokens_seen": 29585855, + "router_z_loss_clip": 1.43945312, + "router_z_loss_mlp": 0.23913574, + "step": 1390, + "time_per_iteration": 2.4579718112945557 + }, + { + "auxiliary_loss_clip": 0.01202348, + "auxiliary_loss_mlp": 0.01070416, + "balance_loss_clip": 1.06312907, + "balance_loss_mlp": 1.04570413, + "epoch": 0.08363144446114534, + "flos": 25922333318400.0, + "grad_norm": 1.6938822775140312, + "language_loss": 0.86781728, + "learning_rate": 3.969904125783517e-06, + "loss": 0.89054489, + "num_input_tokens_seen": 29607280, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.24719238, + "step": 1391, + "time_per_iteration": 2.5226194858551025 + }, + { + "auxiliary_loss_clip": 0.01214223, + "auxiliary_loss_mlp": 0.01079318, + "balance_loss_clip": 1.06843352, + "balance_loss_mlp": 1.05524969, + "epoch": 0.08369156771381332, + "flos": 18041306065920.0, + "grad_norm": 2.0005079055087123, + "language_loss": 0.87285662, + "learning_rate": 3.969836778645371e-06, + "loss": 0.89579213, + "num_input_tokens_seen": 29624130, + "router_z_loss_clip": 1.45800781, + "router_z_loss_mlp": 0.24072266, + "step": 1392, + "time_per_iteration": 2.450033187866211 + }, + { + "auxiliary_loss_clip": 0.01206417, + "auxiliary_loss_mlp": 0.0106124, + "balance_loss_clip": 1.06571615, + "balance_loss_mlp": 1.0371958, + "epoch": 0.08375169096648129, + "flos": 22675111073280.0, + "grad_norm": 2.672361676249765, + "language_loss": 0.803339, + "learning_rate": 3.969769356810819e-06, + "loss": 0.82601553, + "num_input_tokens_seen": 29643210, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.24060059, + "step": 1393, + "time_per_iteration": 2.462094783782959 + }, + { + "auxiliary_loss_clip": 0.01206367, + "auxiliary_loss_mlp": 0.01051649, + "balance_loss_clip": 1.06806803, + "balance_loss_mlp": 1.02783108, + "epoch": 0.08381181421914925, + "flos": 26103215232000.0, + "grad_norm": 1.7539613340957745, + "language_loss": 0.84972239, + "learning_rate": 3.969701860282415e-06, + "loss": 0.87230253, + "num_input_tokens_seen": 29663920, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.23828125, + "step": 1394, + "time_per_iteration": 2.579692840576172 + }, + { + "auxiliary_loss_clip": 0.01205168, + "auxiliary_loss_mlp": 0.01053973, + "balance_loss_clip": 1.06630707, + "balance_loss_mlp": 1.03007197, + "epoch": 0.08387193747181723, + "flos": 20629782835200.0, + "grad_norm": 4.1048994587439775, + "language_loss": 0.83076102, + "learning_rate": 3.969634289062719e-06, + "loss": 0.85335243, + "num_input_tokens_seen": 29683825, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.23876953, + "step": 1395, + "time_per_iteration": 2.512106418609619 + }, + { + "auxiliary_loss_clip": 0.01208062, + "auxiliary_loss_mlp": 0.01054037, + "balance_loss_clip": 1.0676595, + "balance_loss_mlp": 1.02875316, + "epoch": 0.0839320607244852, + "flos": 13443196199040.0, + "grad_norm": 1.9877859908136974, + "language_loss": 0.82285225, + "learning_rate": 3.969566643154293e-06, + "loss": 0.84547323, + "num_input_tokens_seen": 29698775, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.25292969, + "step": 1396, + "time_per_iteration": 2.562748432159424 + }, + { + "auxiliary_loss_clip": 0.01203928, + "auxiliary_loss_mlp": 0.0106154, + "balance_loss_clip": 1.06756997, + "balance_loss_mlp": 1.03482485, + "epoch": 0.08399218397715316, + "flos": 23477247642240.0, + "grad_norm": 1.8816906760442067, + "language_loss": 0.76853567, + "learning_rate": 3.969498922559703e-06, + "loss": 0.79119033, + "num_input_tokens_seen": 29719430, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.26708984, + "step": 1397, + "time_per_iteration": 2.627530097961426 + }, + { + "auxiliary_loss_clip": 0.01204868, + "auxiliary_loss_mlp": 0.01054099, + "balance_loss_clip": 1.06674457, + "balance_loss_mlp": 1.0289104, + "epoch": 0.08405230722982113, + "flos": 25920717206400.0, + "grad_norm": 4.2513728341882375, + "language_loss": 0.78284842, + "learning_rate": 3.969431127281516e-06, + "loss": 0.80543816, + "num_input_tokens_seen": 29739685, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.25195312, + "step": 1398, + "time_per_iteration": 2.520326852798462 + }, + { + "auxiliary_loss_clip": 0.01195705, + "auxiliary_loss_mlp": 0.01050911, + "balance_loss_clip": 1.06097913, + "balance_loss_mlp": 1.02746284, + "epoch": 0.0841124304824891, + "flos": 17967437746560.0, + "grad_norm": 2.4285718202333837, + "language_loss": 0.95165569, + "learning_rate": 3.969363257322304e-06, + "loss": 0.97412181, + "num_input_tokens_seen": 29756165, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.234375, + "step": 1399, + "time_per_iteration": 2.447671890258789 + }, + { + "auxiliary_loss_clip": 0.01204523, + "auxiliary_loss_mlp": 0.01062327, + "balance_loss_clip": 1.06360745, + "balance_loss_mlp": 1.03533769, + "epoch": 0.08417255373515707, + "flos": 25629661301760.0, + "grad_norm": 10.251994454975573, + "language_loss": 0.81986797, + "learning_rate": 3.96929531268464e-06, + "loss": 0.84253639, + "num_input_tokens_seen": 29776425, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.26989746, + "step": 1400, + "time_per_iteration": 2.511448383331299 + }, + { + "auxiliary_loss_clip": 0.0120248, + "auxiliary_loss_mlp": 0.01061123, + "balance_loss_clip": 1.06376624, + "balance_loss_mlp": 1.0370543, + "epoch": 0.08423267698782504, + "flos": 26249730808320.0, + "grad_norm": 1.9513127067251401, + "language_loss": 0.8689577, + "learning_rate": 3.969227293371099e-06, + "loss": 0.89159369, + "num_input_tokens_seen": 29796440, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.24072266, + "step": 1401, + "time_per_iteration": 2.478240966796875 + }, + { + "auxiliary_loss_clip": 0.01200782, + "auxiliary_loss_mlp": 0.01070664, + "balance_loss_clip": 1.0603013, + "balance_loss_mlp": 1.04239929, + "epoch": 0.08429280024049302, + "flos": 20119707751680.0, + "grad_norm": 1.9704045616831396, + "language_loss": 0.87625778, + "learning_rate": 3.969159199384263e-06, + "loss": 0.89897221, + "num_input_tokens_seen": 29814755, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.28283691, + "step": 1402, + "time_per_iteration": 2.45993971824646 + }, + { + "auxiliary_loss_clip": 0.01197473, + "auxiliary_loss_mlp": 0.01053995, + "balance_loss_clip": 1.05962503, + "balance_loss_mlp": 1.03009331, + "epoch": 0.08435292349316098, + "flos": 42924526836480.0, + "grad_norm": 2.2069101498325123, + "language_loss": 0.89096165, + "learning_rate": 3.9690910307267125e-06, + "loss": 0.91347629, + "num_input_tokens_seen": 29834785, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.23913574, + "step": 1403, + "time_per_iteration": 2.65106201171875 + }, + { + "auxiliary_loss_clip": 0.01198153, + "auxiliary_loss_mlp": 0.0105039, + "balance_loss_clip": 1.05802131, + "balance_loss_mlp": 1.025249, + "epoch": 0.08441304674582895, + "flos": 22857285876480.0, + "grad_norm": 2.4909186099168528, + "language_loss": 0.79855567, + "learning_rate": 3.969022787401033e-06, + "loss": 0.82104111, + "num_input_tokens_seen": 29854695, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.25134277, + "step": 1404, + "time_per_iteration": 2.4838955402374268 + }, + { + "auxiliary_loss_clip": 0.0120749, + "auxiliary_loss_mlp": 0.01066851, + "balance_loss_clip": 1.06512046, + "balance_loss_mlp": 1.04166222, + "epoch": 0.08447316999849692, + "flos": 18697501676160.0, + "grad_norm": 2.091253545398677, + "language_loss": 0.83524948, + "learning_rate": 3.968954469409811e-06, + "loss": 0.85799295, + "num_input_tokens_seen": 29872180, + "router_z_loss_clip": 1.42382812, + "router_z_loss_mlp": 0.25219727, + "step": 1405, + "time_per_iteration": 2.406167984008789 + }, + { + "auxiliary_loss_clip": 0.01196423, + "auxiliary_loss_mlp": 0.01055188, + "balance_loss_clip": 1.0586313, + "balance_loss_mlp": 1.03210855, + "epoch": 0.08453329325116489, + "flos": 25483971738240.0, + "grad_norm": 1.8657827982517738, + "language_loss": 0.80280125, + "learning_rate": 3.968886076755639e-06, + "loss": 0.82531738, + "num_input_tokens_seen": 29893205, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.23059082, + "step": 1406, + "time_per_iteration": 2.6059417724609375 + }, + { + "auxiliary_loss_clip": 0.01208627, + "auxiliary_loss_mlp": 0.01061441, + "balance_loss_clip": 1.06947064, + "balance_loss_mlp": 1.03514314, + "epoch": 0.08459341650383286, + "flos": 20920048640640.0, + "grad_norm": 1.987200781939423, + "language_loss": 0.79433525, + "learning_rate": 3.96881760944111e-06, + "loss": 0.81703591, + "num_input_tokens_seen": 29911970, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.26293945, + "step": 1407, + "time_per_iteration": 2.460848569869995 + }, + { + "auxiliary_loss_clip": 0.01197137, + "auxiliary_loss_mlp": 0.01051373, + "balance_loss_clip": 1.06129181, + "balance_loss_mlp": 1.02774572, + "epoch": 0.08465353975650082, + "flos": 13043079624960.0, + "grad_norm": 2.269469041188757, + "language_loss": 0.91541296, + "learning_rate": 3.968749067468819e-06, + "loss": 0.93789804, + "num_input_tokens_seen": 29929925, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.23632812, + "step": 1408, + "time_per_iteration": 2.419459342956543 + }, + { + "auxiliary_loss_clip": 0.0109807, + "auxiliary_loss_mlp": 0.01100385, + "balance_loss_clip": 1.03854465, + "balance_loss_mlp": 1.09637952, + "epoch": 0.0847136630091688, + "flos": 60877422552960.0, + "grad_norm": 0.8972183193925624, + "language_loss": 0.61828721, + "learning_rate": 3.968680450841368e-06, + "loss": 0.64027184, + "num_input_tokens_seen": 29985950, + "router_z_loss_clip": 0.59472656, + "router_z_loss_mlp": 0.04003906, + "step": 1409, + "time_per_iteration": 3.146244764328003 + }, + { + "auxiliary_loss_clip": 0.01192985, + "auxiliary_loss_mlp": 0.0105825, + "balance_loss_clip": 1.06103086, + "balance_loss_mlp": 1.03493261, + "epoch": 0.08477378626183676, + "flos": 22046530043520.0, + "grad_norm": 7.821905596864955, + "language_loss": 0.86637747, + "learning_rate": 3.968611759561355e-06, + "loss": 0.88888979, + "num_input_tokens_seen": 30004330, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.2331543, + "step": 1410, + "time_per_iteration": 2.5016677379608154 + }, + { + "auxiliary_loss_clip": 0.01206096, + "auxiliary_loss_mlp": 0.01054386, + "balance_loss_clip": 1.06729221, + "balance_loss_mlp": 1.02850604, + "epoch": 0.08483390951450473, + "flos": 16690059308160.0, + "grad_norm": 2.1391350014323813, + "language_loss": 0.74436206, + "learning_rate": 3.968542993631388e-06, + "loss": 0.76696688, + "num_input_tokens_seen": 30022555, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.25866699, + "step": 1411, + "time_per_iteration": 3.8776447772979736 + }, + { + "auxiliary_loss_clip": 0.01087104, + "auxiliary_loss_mlp": 0.01045097, + "balance_loss_clip": 1.03221321, + "balance_loss_mlp": 1.04130054, + "epoch": 0.08489403276717271, + "flos": 51584640082560.0, + "grad_norm": 0.9084918561996181, + "language_loss": 0.56776011, + "learning_rate": 3.968474153054073e-06, + "loss": 0.58908212, + "num_input_tokens_seen": 30077220, + "router_z_loss_clip": 0.54980469, + "router_z_loss_mlp": 0.03796387, + "step": 1412, + "time_per_iteration": 4.406885385513306 + }, + { + "auxiliary_loss_clip": 0.01199172, + "auxiliary_loss_mlp": 0.01058641, + "balance_loss_clip": 1.06226254, + "balance_loss_mlp": 1.03471565, + "epoch": 0.08495415601984067, + "flos": 17092330698240.0, + "grad_norm": 2.28451867857058, + "language_loss": 0.89106607, + "learning_rate": 3.96840523783202e-06, + "loss": 0.91364419, + "num_input_tokens_seen": 30094600, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.23901367, + "step": 1413, + "time_per_iteration": 2.410562515258789 + }, + { + "auxiliary_loss_clip": 0.01200438, + "auxiliary_loss_mlp": 0.01074055, + "balance_loss_clip": 1.06383061, + "balance_loss_mlp": 1.04674459, + "epoch": 0.08501427927250864, + "flos": 23148413608320.0, + "grad_norm": 2.2133147791141976, + "language_loss": 0.88133395, + "learning_rate": 3.968336247967844e-06, + "loss": 0.90407878, + "num_input_tokens_seen": 30114475, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.27307129, + "step": 1414, + "time_per_iteration": 2.455766439437866 + }, + { + "auxiliary_loss_clip": 0.01203404, + "auxiliary_loss_mlp": 0.01067104, + "balance_loss_clip": 1.06575704, + "balance_loss_mlp": 1.0439775, + "epoch": 0.08507440252517662, + "flos": 19063467394560.0, + "grad_norm": 1.8362452560785414, + "language_loss": 0.77615011, + "learning_rate": 3.96826718346416e-06, + "loss": 0.79885519, + "num_input_tokens_seen": 30133350, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.23132324, + "step": 1415, + "time_per_iteration": 3.864198923110962 + }, + { + "auxiliary_loss_clip": 0.01198518, + "auxiliary_loss_mlp": 0.01076625, + "balance_loss_clip": 1.06147945, + "balance_loss_mlp": 1.05390322, + "epoch": 0.08513452577784458, + "flos": 60182296600320.0, + "grad_norm": 1.8346280129634327, + "language_loss": 0.70859593, + "learning_rate": 3.968198044323587e-06, + "loss": 0.73134732, + "num_input_tokens_seen": 30159005, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.22717285, + "step": 1416, + "time_per_iteration": 2.8129255771636963 + }, + { + "auxiliary_loss_clip": 0.01200445, + "auxiliary_loss_mlp": 0.01070968, + "balance_loss_clip": 1.06032288, + "balance_loss_mlp": 1.04556453, + "epoch": 0.08519464903051255, + "flos": 27308485117440.0, + "grad_norm": 2.4134070180272054, + "language_loss": 0.74938095, + "learning_rate": 3.968128830548748e-06, + "loss": 0.77209508, + "num_input_tokens_seen": 30179450, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.25402832, + "step": 1417, + "time_per_iteration": 3.9368903636932373 + }, + { + "auxiliary_loss_clip": 0.01199361, + "auxiliary_loss_mlp": 0.0107953, + "balance_loss_clip": 1.06251407, + "balance_loss_mlp": 1.05414987, + "epoch": 0.08525477228318051, + "flos": 20266438809600.0, + "grad_norm": 2.769602505036375, + "language_loss": 0.8222391, + "learning_rate": 3.968059542142265e-06, + "loss": 0.84502804, + "num_input_tokens_seen": 30197235, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.25378418, + "step": 1418, + "time_per_iteration": 2.4182000160217285 + }, + { + "auxiliary_loss_clip": 0.01113907, + "auxiliary_loss_mlp": 0.01059014, + "balance_loss_clip": 1.05452609, + "balance_loss_mlp": 1.05456781, + "epoch": 0.08531489553584849, + "flos": 67615017183360.0, + "grad_norm": 0.8724435673830797, + "language_loss": 0.5660575, + "learning_rate": 3.9679901791067685e-06, + "loss": 0.58778667, + "num_input_tokens_seen": 30257410, + "router_z_loss_clip": 0.59472656, + "router_z_loss_mlp": 0.04449463, + "step": 1419, + "time_per_iteration": 3.0258078575134277 + }, + { + "auxiliary_loss_clip": 0.01198485, + "auxiliary_loss_mlp": 0.01072471, + "balance_loss_clip": 1.06140816, + "balance_loss_mlp": 1.04790163, + "epoch": 0.08537501878851646, + "flos": 27526965592320.0, + "grad_norm": 2.6494638228349725, + "language_loss": 0.70237064, + "learning_rate": 3.967920741444886e-06, + "loss": 0.72508025, + "num_input_tokens_seen": 30277865, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.24584961, + "step": 1420, + "time_per_iteration": 2.4831650257110596 + }, + { + "auxiliary_loss_clip": 0.01202334, + "auxiliary_loss_mlp": 0.01058646, + "balance_loss_clip": 1.06226146, + "balance_loss_mlp": 1.03432786, + "epoch": 0.08543514204118442, + "flos": 22784243569920.0, + "grad_norm": 1.6978927418043075, + "language_loss": 0.88219619, + "learning_rate": 3.967851229159252e-06, + "loss": 0.90480602, + "num_input_tokens_seen": 30298545, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.24328613, + "step": 1421, + "time_per_iteration": 2.475297689437866 + }, + { + "auxiliary_loss_clip": 0.01092073, + "auxiliary_loss_mlp": 0.01005589, + "balance_loss_clip": 1.03606331, + "balance_loss_mlp": 1.00197673, + "epoch": 0.0854952652938524, + "flos": 60990721027200.0, + "grad_norm": 0.7889749422662427, + "language_loss": 0.6350432, + "learning_rate": 3.967781642252502e-06, + "loss": 0.65601981, + "num_input_tokens_seen": 30361725, + "router_z_loss_clip": 0.56054688, + "router_z_loss_mlp": 0.03607178, + "step": 1422, + "time_per_iteration": 3.047189235687256 + }, + { + "auxiliary_loss_clip": 0.0120799, + "auxiliary_loss_mlp": 0.01056451, + "balance_loss_clip": 1.07412744, + "balance_loss_mlp": 1.03376508, + "epoch": 0.08555538854652037, + "flos": 28038046256640.0, + "grad_norm": 3.019097588168799, + "language_loss": 0.82891262, + "learning_rate": 3.967711980727276e-06, + "loss": 0.85155702, + "num_input_tokens_seen": 30382180, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.22705078, + "step": 1423, + "time_per_iteration": 2.507890224456787 + }, + { + "auxiliary_loss_clip": 0.012038, + "auxiliary_loss_mlp": 0.01059943, + "balance_loss_clip": 1.06485927, + "balance_loss_mlp": 1.03649426, + "epoch": 0.08561551179918833, + "flos": 23509279595520.0, + "grad_norm": 2.773675611265315, + "language_loss": 0.75014472, + "learning_rate": 3.967642244586213e-06, + "loss": 0.77278215, + "num_input_tokens_seen": 30402980, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.23474121, + "step": 1424, + "time_per_iteration": 2.4781386852264404 + }, + { + "auxiliary_loss_clip": 0.01199475, + "auxiliary_loss_mlp": 0.01059389, + "balance_loss_clip": 1.06383371, + "balance_loss_mlp": 1.03596473, + "epoch": 0.08567563505185631, + "flos": 17926930183680.0, + "grad_norm": 2.0047320643931847, + "language_loss": 0.75718474, + "learning_rate": 3.96757243383196e-06, + "loss": 0.77977335, + "num_input_tokens_seen": 30420800, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.23449707, + "step": 1425, + "time_per_iteration": 2.4543917179107666 + }, + { + "auxiliary_loss_clip": 0.0119547, + "auxiliary_loss_mlp": 0.0105823, + "balance_loss_clip": 1.06102037, + "balance_loss_mlp": 1.03492427, + "epoch": 0.08573575830452428, + "flos": 19719519350400.0, + "grad_norm": 2.2747695962344068, + "language_loss": 0.93133831, + "learning_rate": 3.9675025484671624e-06, + "loss": 0.9538753, + "num_input_tokens_seen": 30439620, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.23327637, + "step": 1426, + "time_per_iteration": 2.4567008018493652 + }, + { + "auxiliary_loss_clip": 0.01204956, + "auxiliary_loss_mlp": 0.01071622, + "balance_loss_clip": 1.06587958, + "balance_loss_mlp": 1.04295206, + "epoch": 0.08579588155719224, + "flos": 17931563038080.0, + "grad_norm": 2.815584925095746, + "language_loss": 0.75532305, + "learning_rate": 3.967432588494471e-06, + "loss": 0.77808887, + "num_input_tokens_seen": 30457300, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.28649902, + "step": 1427, + "time_per_iteration": 2.427035331726074 + }, + { + "auxiliary_loss_clip": 0.01193394, + "auxiliary_loss_mlp": 0.01062821, + "balance_loss_clip": 1.05919993, + "balance_loss_mlp": 1.04002786, + "epoch": 0.08585600480986022, + "flos": 16033324993920.0, + "grad_norm": 2.703119144275436, + "language_loss": 0.81933141, + "learning_rate": 3.96736255391654e-06, + "loss": 0.84189355, + "num_input_tokens_seen": 30471580, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.22802734, + "step": 1428, + "time_per_iteration": 2.5102648735046387 + }, + { + "auxiliary_loss_clip": 0.01201478, + "auxiliary_loss_mlp": 0.01074706, + "balance_loss_clip": 1.06291592, + "balance_loss_mlp": 1.04989862, + "epoch": 0.08591612806252819, + "flos": 28657433404800.0, + "grad_norm": 2.456175019579399, + "language_loss": 0.79699409, + "learning_rate": 3.967292444736023e-06, + "loss": 0.81975597, + "num_input_tokens_seen": 30492720, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.24780273, + "step": 1429, + "time_per_iteration": 2.525357484817505 + }, + { + "auxiliary_loss_clip": 0.01201036, + "auxiliary_loss_mlp": 0.01070895, + "balance_loss_clip": 1.06302273, + "balance_loss_mlp": 1.04685092, + "epoch": 0.08597625131519615, + "flos": 20959119659520.0, + "grad_norm": 2.2547076235447734, + "language_loss": 0.88106358, + "learning_rate": 3.967222260955578e-06, + "loss": 0.90378284, + "num_input_tokens_seen": 30509535, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.24035645, + "step": 1430, + "time_per_iteration": 2.420774221420288 + }, + { + "auxiliary_loss_clip": 0.01196972, + "auxiliary_loss_mlp": 0.01081308, + "balance_loss_clip": 1.06447077, + "balance_loss_mlp": 1.05580902, + "epoch": 0.08603637456786412, + "flos": 23256360956160.0, + "grad_norm": 1.7454265530041952, + "language_loss": 0.81810617, + "learning_rate": 3.96715200257787e-06, + "loss": 0.84088898, + "num_input_tokens_seen": 30529490, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.25512695, + "step": 1431, + "time_per_iteration": 2.494596481323242 + }, + { + "auxiliary_loss_clip": 0.01198608, + "auxiliary_loss_mlp": 0.0106721, + "balance_loss_clip": 1.06314373, + "balance_loss_mlp": 1.04265296, + "epoch": 0.0860964978205321, + "flos": 28694170039680.0, + "grad_norm": 1.7748967428398525, + "language_loss": 0.77684844, + "learning_rate": 3.967081669605559e-06, + "loss": 0.79950655, + "num_input_tokens_seen": 30550205, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.24536133, + "step": 1432, + "time_per_iteration": 2.4812567234039307 + }, + { + "auxiliary_loss_clip": 0.01203004, + "auxiliary_loss_mlp": 0.01080437, + "balance_loss_clip": 1.06335962, + "balance_loss_mlp": 1.05495, + "epoch": 0.08615662107320006, + "flos": 19318397195520.0, + "grad_norm": 2.0679300239463236, + "language_loss": 0.72971654, + "learning_rate": 3.967011262041315e-06, + "loss": 0.75255096, + "num_input_tokens_seen": 30568830, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.25488281, + "step": 1433, + "time_per_iteration": 2.4566614627838135 + }, + { + "auxiliary_loss_clip": 0.01210305, + "auxiliary_loss_mlp": 0.01076316, + "balance_loss_clip": 1.0689373, + "balance_loss_mlp": 1.05000603, + "epoch": 0.08621674432586802, + "flos": 15851688894720.0, + "grad_norm": 2.8295827121026833, + "language_loss": 0.85407406, + "learning_rate": 3.9669407798878065e-06, + "loss": 0.87694025, + "num_input_tokens_seen": 30585730, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.26293945, + "step": 1434, + "time_per_iteration": 2.435662269592285 + }, + { + "auxiliary_loss_clip": 0.0121088, + "auxiliary_loss_mlp": 0.01076991, + "balance_loss_clip": 1.07056618, + "balance_loss_mlp": 1.05285168, + "epoch": 0.086276867578536, + "flos": 14100648785280.0, + "grad_norm": 2.3511400990118836, + "language_loss": 0.78628838, + "learning_rate": 3.966870223147707e-06, + "loss": 0.80916715, + "num_input_tokens_seen": 30603180, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.24145508, + "step": 1435, + "time_per_iteration": 2.4750022888183594 + }, + { + "auxiliary_loss_clip": 0.01091993, + "auxiliary_loss_mlp": 0.01075822, + "balance_loss_clip": 1.03649664, + "balance_loss_mlp": 1.07228184, + "epoch": 0.08633699083120397, + "flos": 70184857772160.0, + "grad_norm": 0.9063467722622258, + "language_loss": 0.57865953, + "learning_rate": 3.96679959182369e-06, + "loss": 0.60033762, + "num_input_tokens_seen": 30668895, + "router_z_loss_clip": 0.55566406, + "router_z_loss_mlp": 0.03540039, + "step": 1436, + "time_per_iteration": 3.169522523880005 + }, + { + "auxiliary_loss_clip": 0.01197567, + "auxiliary_loss_mlp": 0.01053668, + "balance_loss_clip": 1.06022012, + "balance_loss_mlp": 1.02970743, + "epoch": 0.08639711408387193, + "flos": 30298874140800.0, + "grad_norm": 2.602610389752907, + "language_loss": 0.69383919, + "learning_rate": 3.966728885918437e-06, + "loss": 0.71635157, + "num_input_tokens_seen": 30688955, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.23937988, + "step": 1437, + "time_per_iteration": 2.497230291366577 + }, + { + "auxiliary_loss_clip": 0.01205621, + "auxiliary_loss_mlp": 0.01055882, + "balance_loss_clip": 1.0662024, + "balance_loss_mlp": 1.03275526, + "epoch": 0.08645723733653991, + "flos": 20297680663680.0, + "grad_norm": 2.864905369634047, + "language_loss": 0.72823513, + "learning_rate": 3.966658105434627e-06, + "loss": 0.7508502, + "num_input_tokens_seen": 30706095, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.23120117, + "step": 1438, + "time_per_iteration": 2.450420379638672 + }, + { + "auxiliary_loss_clip": 0.012134, + "auxiliary_loss_mlp": 0.01053558, + "balance_loss_clip": 1.07848263, + "balance_loss_mlp": 1.02934623, + "epoch": 0.08651736058920788, + "flos": 32890583134080.0, + "grad_norm": 1.7253183176328446, + "language_loss": 0.64184785, + "learning_rate": 3.966587250374945e-06, + "loss": 0.6645174, + "num_input_tokens_seen": 30729025, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.2421875, + "step": 1439, + "time_per_iteration": 2.594395875930786 + }, + { + "auxiliary_loss_clip": 0.01206339, + "auxiliary_loss_mlp": 0.01052635, + "balance_loss_clip": 1.07095289, + "balance_loss_mlp": 1.02744579, + "epoch": 0.08657748384187584, + "flos": 22637368857600.0, + "grad_norm": 2.4959192554178222, + "language_loss": 0.87572509, + "learning_rate": 3.966516320742077e-06, + "loss": 0.89831483, + "num_input_tokens_seen": 30746155, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.25195312, + "step": 1440, + "time_per_iteration": 2.520014524459839 + }, + { + "auxiliary_loss_clip": 0.01204782, + "auxiliary_loss_mlp": 0.01062672, + "balance_loss_clip": 1.06536579, + "balance_loss_mlp": 1.03717315, + "epoch": 0.08663760709454381, + "flos": 23658380951040.0, + "grad_norm": 3.2008092937128816, + "language_loss": 0.83362353, + "learning_rate": 3.9664453165387124e-06, + "loss": 0.85629803, + "num_input_tokens_seen": 30761410, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.25524902, + "step": 1441, + "time_per_iteration": 2.4831130504608154 + }, + { + "auxiliary_loss_clip": 0.01097462, + "auxiliary_loss_mlp": 0.01031269, + "balance_loss_clip": 1.04102159, + "balance_loss_mlp": 1.02660203, + "epoch": 0.08669773034721179, + "flos": 62686564911360.0, + "grad_norm": 0.9879421545480224, + "language_loss": 0.60434192, + "learning_rate": 3.966374237767545e-06, + "loss": 0.62562919, + "num_input_tokens_seen": 30823010, + "router_z_loss_clip": 0.56347656, + "router_z_loss_mlp": 0.04663086, + "step": 1442, + "time_per_iteration": 3.195704698562622 + }, + { + "auxiliary_loss_clip": 0.01203317, + "auxiliary_loss_mlp": 0.01059772, + "balance_loss_clip": 1.06363368, + "balance_loss_mlp": 1.03334332, + "epoch": 0.08675785359987975, + "flos": 20667489137280.0, + "grad_norm": 2.469940134967075, + "language_loss": 0.79184854, + "learning_rate": 3.96630308443127e-06, + "loss": 0.81447935, + "num_input_tokens_seen": 30841980, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.26416016, + "step": 1443, + "time_per_iteration": 2.4991040229797363 + }, + { + "auxiliary_loss_clip": 0.01198653, + "auxiliary_loss_mlp": 0.01059358, + "balance_loss_clip": 1.06061244, + "balance_loss_mlp": 1.03377533, + "epoch": 0.08681797685254772, + "flos": 26941118768640.0, + "grad_norm": 1.6174255450605786, + "language_loss": 0.82485664, + "learning_rate": 3.966231856532584e-06, + "loss": 0.84743679, + "num_input_tokens_seen": 30863280, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.25561523, + "step": 1444, + "time_per_iteration": 2.5499155521392822 + }, + { + "auxiliary_loss_clip": 0.01206975, + "auxiliary_loss_mlp": 0.01060672, + "balance_loss_clip": 1.06644201, + "balance_loss_mlp": 1.03504241, + "epoch": 0.0868781001052157, + "flos": 17712831168000.0, + "grad_norm": 2.176609148768529, + "language_loss": 0.86514753, + "learning_rate": 3.966160554074189e-06, + "loss": 0.887824, + "num_input_tokens_seen": 30881710, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.25634766, + "step": 1445, + "time_per_iteration": 2.532001256942749 + }, + { + "auxiliary_loss_clip": 0.01205382, + "auxiliary_loss_mlp": 0.01056344, + "balance_loss_clip": 1.06827033, + "balance_loss_mlp": 1.03305101, + "epoch": 0.08693822335788366, + "flos": 19896522595200.0, + "grad_norm": 1.8873695084947946, + "language_loss": 0.81448138, + "learning_rate": 3.96608917705879e-06, + "loss": 0.83709872, + "num_input_tokens_seen": 30900225, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.23303223, + "step": 1446, + "time_per_iteration": 2.4773993492126465 + }, + { + "auxiliary_loss_clip": 0.01098621, + "auxiliary_loss_mlp": 0.01024951, + "balance_loss_clip": 1.04262161, + "balance_loss_mlp": 1.02154195, + "epoch": 0.08699834661055163, + "flos": 67023747406080.0, + "grad_norm": 0.7368197163364228, + "language_loss": 0.54776919, + "learning_rate": 3.966017725489091e-06, + "loss": 0.56900495, + "num_input_tokens_seen": 30959580, + "router_z_loss_clip": 0.56054688, + "router_z_loss_mlp": 0.03417969, + "step": 1447, + "time_per_iteration": 3.1316776275634766 + }, + { + "auxiliary_loss_clip": 0.01202385, + "auxiliary_loss_mlp": 0.01058473, + "balance_loss_clip": 1.06789935, + "balance_loss_mlp": 1.03583467, + "epoch": 0.0870584698632196, + "flos": 13480507451520.0, + "grad_norm": 3.5586809945609446, + "language_loss": 0.84322906, + "learning_rate": 3.965946199367804e-06, + "loss": 0.86583769, + "num_input_tokens_seen": 30976775, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.22631836, + "step": 1448, + "time_per_iteration": 2.484738826751709 + }, + { + "auxiliary_loss_clip": 0.01208796, + "auxiliary_loss_mlp": 0.01058614, + "balance_loss_clip": 1.06929147, + "balance_loss_mlp": 1.03545165, + "epoch": 0.08711859311588757, + "flos": 16107013745280.0, + "grad_norm": 3.960077537187749, + "language_loss": 0.80265558, + "learning_rate": 3.965874598697638e-06, + "loss": 0.8253296, + "num_input_tokens_seen": 30990495, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.23193359, + "step": 1449, + "time_per_iteration": 2.453521966934204 + }, + { + "auxiliary_loss_clip": 0.01197374, + "auxiliary_loss_mlp": 0.0105079, + "balance_loss_clip": 1.06387544, + "balance_loss_mlp": 1.02792585, + "epoch": 0.08717871636855554, + "flos": 38472357928320.0, + "grad_norm": 2.127784577052218, + "language_loss": 0.710051, + "learning_rate": 3.965802923481313e-06, + "loss": 0.73253262, + "num_input_tokens_seen": 31014080, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.22875977, + "step": 1450, + "time_per_iteration": 2.6123554706573486 + }, + { + "auxiliary_loss_clip": 0.01212191, + "auxiliary_loss_mlp": 0.01056764, + "balance_loss_clip": 1.07418895, + "balance_loss_mlp": 1.03299332, + "epoch": 0.0872388396212235, + "flos": 17600574188160.0, + "grad_norm": 1.9331595998433733, + "language_loss": 0.83571547, + "learning_rate": 3.965731173721542e-06, + "loss": 0.85840499, + "num_input_tokens_seen": 31031210, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.2376709, + "step": 1451, + "time_per_iteration": 2.450910806655884 + }, + { + "auxiliary_loss_clip": 0.01205051, + "auxiliary_loss_mlp": 0.01053105, + "balance_loss_clip": 1.07044351, + "balance_loss_mlp": 1.03051472, + "epoch": 0.08729896287389148, + "flos": 25259385951360.0, + "grad_norm": 1.8689310535672512, + "language_loss": 0.74428821, + "learning_rate": 3.965659349421049e-06, + "loss": 0.76686972, + "num_input_tokens_seen": 31049710, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.22607422, + "step": 1452, + "time_per_iteration": 2.556621789932251 + }, + { + "auxiliary_loss_clip": 0.01210872, + "auxiliary_loss_mlp": 0.01082123, + "balance_loss_clip": 1.07044196, + "balance_loss_mlp": 1.05448985, + "epoch": 0.08735908612655945, + "flos": 15632454234240.0, + "grad_norm": 3.339265960571473, + "language_loss": 0.79720968, + "learning_rate": 3.965587450582556e-06, + "loss": 0.82013965, + "num_input_tokens_seen": 31066160, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.27636719, + "step": 1453, + "time_per_iteration": 2.434509754180908 + }, + { + "auxiliary_loss_clip": 0.01202068, + "auxiliary_loss_mlp": 0.01070413, + "balance_loss_clip": 1.06711042, + "balance_loss_mlp": 1.0466311, + "epoch": 0.08741920937922741, + "flos": 20339660684160.0, + "grad_norm": 1.9475081504012743, + "language_loss": 0.70810723, + "learning_rate": 3.96551547720879e-06, + "loss": 0.73083204, + "num_input_tokens_seen": 31085270, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.23803711, + "step": 1454, + "time_per_iteration": 2.4425132274627686 + }, + { + "auxiliary_loss_clip": 0.01097813, + "auxiliary_loss_mlp": 0.01045234, + "balance_loss_clip": 1.04331613, + "balance_loss_mlp": 1.04216146, + "epoch": 0.08747933263189539, + "flos": 62819795433600.0, + "grad_norm": 0.7970406603209187, + "language_loss": 0.58578342, + "learning_rate": 3.96544342930248e-06, + "loss": 0.60721385, + "num_input_tokens_seen": 31148445, + "router_z_loss_clip": 0.54492188, + "router_z_loss_mlp": 0.03070068, + "step": 1455, + "time_per_iteration": 5.799927234649658 + }, + { + "auxiliary_loss_clip": 0.01207544, + "auxiliary_loss_mlp": 0.0106606, + "balance_loss_clip": 1.07056808, + "balance_loss_mlp": 1.04189587, + "epoch": 0.08753945588456336, + "flos": 33035877648000.0, + "grad_norm": 2.0412168958739634, + "language_loss": 0.77659547, + "learning_rate": 3.965371306866359e-06, + "loss": 0.79933149, + "num_input_tokens_seen": 31168770, + "router_z_loss_clip": 1.37011719, + "router_z_loss_mlp": 0.24169922, + "step": 1456, + "time_per_iteration": 2.5624372959136963 + }, + { + "auxiliary_loss_clip": 0.01207877, + "auxiliary_loss_mlp": 0.01054063, + "balance_loss_clip": 1.07024717, + "balance_loss_mlp": 1.03030491, + "epoch": 0.08759957913723132, + "flos": 35547182046720.0, + "grad_norm": 3.338585363712448, + "language_loss": 0.72191012, + "learning_rate": 3.96529910990316e-06, + "loss": 0.74452949, + "num_input_tokens_seen": 31189270, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.23754883, + "step": 1457, + "time_per_iteration": 2.582282066345215 + }, + { + "auxiliary_loss_clip": 0.01196907, + "auxiliary_loss_mlp": 0.01054006, + "balance_loss_clip": 1.06428099, + "balance_loss_mlp": 1.0319407, + "epoch": 0.0876597023898993, + "flos": 23911120022400.0, + "grad_norm": 1.6792515080301205, + "language_loss": 0.86261868, + "learning_rate": 3.965226838415622e-06, + "loss": 0.88512778, + "num_input_tokens_seen": 31210385, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.22045898, + "step": 1458, + "time_per_iteration": 2.4854469299316406 + }, + { + "auxiliary_loss_clip": 0.01208288, + "auxiliary_loss_mlp": 0.01066008, + "balance_loss_clip": 1.07103109, + "balance_loss_mlp": 1.04244041, + "epoch": 0.08771982564256726, + "flos": 18114025150080.0, + "grad_norm": 2.735205513768102, + "language_loss": 0.80651152, + "learning_rate": 3.965154492406486e-06, + "loss": 0.82925445, + "num_input_tokens_seen": 31229745, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.23571777, + "step": 1459, + "time_per_iteration": 3.890669345855713 + }, + { + "auxiliary_loss_clip": 0.01209459, + "auxiliary_loss_mlp": 0.01061086, + "balance_loss_clip": 1.0709002, + "balance_loss_mlp": 1.03706527, + "epoch": 0.08777994889523523, + "flos": 17712005155200.0, + "grad_norm": 2.226581614336563, + "language_loss": 0.83930743, + "learning_rate": 3.9650820718784945e-06, + "loss": 0.86201286, + "num_input_tokens_seen": 31248280, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.2401123, + "step": 1460, + "time_per_iteration": 3.8348536491394043 + }, + { + "auxiliary_loss_clip": 0.01210652, + "auxiliary_loss_mlp": 0.01065904, + "balance_loss_clip": 1.07357883, + "balance_loss_mlp": 1.04377866, + "epoch": 0.0878400721479032, + "flos": 12819930382080.0, + "grad_norm": 3.4014910323809793, + "language_loss": 0.80023789, + "learning_rate": 3.965009576834394e-06, + "loss": 0.82300353, + "num_input_tokens_seen": 31262190, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.22119141, + "step": 1461, + "time_per_iteration": 2.5475471019744873 + }, + { + "auxiliary_loss_clip": 0.01203596, + "auxiliary_loss_mlp": 0.01068294, + "balance_loss_clip": 1.06754994, + "balance_loss_mlp": 1.04553747, + "epoch": 0.08790019540057117, + "flos": 26392690938240.0, + "grad_norm": 2.181321235814112, + "language_loss": 0.76379347, + "learning_rate": 3.964937007276932e-06, + "loss": 0.78651237, + "num_input_tokens_seen": 31283690, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.22753906, + "step": 1462, + "time_per_iteration": 2.5298731327056885 + }, + { + "auxiliary_loss_clip": 0.01209731, + "auxiliary_loss_mlp": 0.01061442, + "balance_loss_clip": 1.07130802, + "balance_loss_mlp": 1.03731418, + "epoch": 0.08796031865323914, + "flos": 19134031662720.0, + "grad_norm": 2.127377089195407, + "language_loss": 0.74709493, + "learning_rate": 3.9648643632088634e-06, + "loss": 0.76980668, + "num_input_tokens_seen": 31302505, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.24157715, + "step": 1463, + "time_per_iteration": 2.4597980976104736 + }, + { + "auxiliary_loss_clip": 0.0121057, + "auxiliary_loss_mlp": 0.01068846, + "balance_loss_clip": 1.06769133, + "balance_loss_mlp": 1.04426479, + "epoch": 0.0880204419059071, + "flos": 26064287867520.0, + "grad_norm": 2.11972165020629, + "language_loss": 0.83139777, + "learning_rate": 3.964791644632941e-06, + "loss": 0.85419196, + "num_input_tokens_seen": 31323070, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.24584961, + "step": 1464, + "time_per_iteration": 2.520271062850952 + }, + { + "auxiliary_loss_clip": 0.01197836, + "auxiliary_loss_mlp": 0.01074829, + "balance_loss_clip": 1.06338716, + "balance_loss_mlp": 1.05031919, + "epoch": 0.08808056515857508, + "flos": 22377842115840.0, + "grad_norm": 2.002114445977782, + "language_loss": 0.78403437, + "learning_rate": 3.964718851551923e-06, + "loss": 0.80676103, + "num_input_tokens_seen": 31341880, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.24536133, + "step": 1465, + "time_per_iteration": 2.500349521636963 + }, + { + "auxiliary_loss_clip": 0.01205657, + "auxiliary_loss_mlp": 0.01059255, + "balance_loss_clip": 1.06724668, + "balance_loss_mlp": 1.03748775, + "epoch": 0.08814068841124305, + "flos": 23185293897600.0, + "grad_norm": 2.1417505127602516, + "language_loss": 0.85560822, + "learning_rate": 3.9646459839685675e-06, + "loss": 0.87825733, + "num_input_tokens_seen": 31361995, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.21777344, + "step": 1466, + "time_per_iteration": 2.531421422958374 + }, + { + "auxiliary_loss_clip": 0.01200852, + "auxiliary_loss_mlp": 0.01065421, + "balance_loss_clip": 1.06394124, + "balance_loss_mlp": 1.03990996, + "epoch": 0.08820081166391101, + "flos": 25155281358720.0, + "grad_norm": 2.1677635329895, + "language_loss": 0.83975255, + "learning_rate": 3.964573041885641e-06, + "loss": 0.86241531, + "num_input_tokens_seen": 31381515, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.25500488, + "step": 1467, + "time_per_iteration": 2.5181005001068115 + }, + { + "auxiliary_loss_clip": 0.01197795, + "auxiliary_loss_mlp": 0.01057289, + "balance_loss_clip": 1.06401467, + "balance_loss_mlp": 1.03467536, + "epoch": 0.08826093491657899, + "flos": 22231685675520.0, + "grad_norm": 2.1563304948365527, + "language_loss": 0.75814855, + "learning_rate": 3.964500025305907e-06, + "loss": 0.78069937, + "num_input_tokens_seen": 31400345, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.22619629, + "step": 1468, + "time_per_iteration": 2.4645936489105225 + }, + { + "auxiliary_loss_clip": 0.01195853, + "auxiliary_loss_mlp": 0.01063085, + "balance_loss_clip": 1.0621326, + "balance_loss_mlp": 1.03812289, + "epoch": 0.08832105816924696, + "flos": 22126826897280.0, + "grad_norm": 1.5616251672656027, + "language_loss": 0.80416691, + "learning_rate": 3.9644269342321355e-06, + "loss": 0.8267563, + "num_input_tokens_seen": 31419620, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.24951172, + "step": 1469, + "time_per_iteration": 2.516282320022583 + }, + { + "auxiliary_loss_clip": 0.01201479, + "auxiliary_loss_mlp": 0.01068913, + "balance_loss_clip": 1.06202161, + "balance_loss_mlp": 1.04274619, + "epoch": 0.08838118142191492, + "flos": 17566495159680.0, + "grad_norm": 2.327144648180007, + "language_loss": 0.77803624, + "learning_rate": 3.9643537686670974e-06, + "loss": 0.80074012, + "num_input_tokens_seen": 31437970, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.26184082, + "step": 1470, + "time_per_iteration": 2.4807815551757812 + }, + { + "auxiliary_loss_clip": 0.01191288, + "auxiliary_loss_mlp": 0.01062614, + "balance_loss_clip": 1.05915797, + "balance_loss_mlp": 1.038867, + "epoch": 0.0884413046745829, + "flos": 20777196251520.0, + "grad_norm": 1.734230515321314, + "language_loss": 0.83922189, + "learning_rate": 3.964280528613569e-06, + "loss": 0.86176091, + "num_input_tokens_seen": 31457040, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.23779297, + "step": 1471, + "time_per_iteration": 2.4362294673919678 + }, + { + "auxiliary_loss_clip": 0.01197874, + "auxiliary_loss_mlp": 0.01052072, + "balance_loss_clip": 1.06962514, + "balance_loss_mlp": 1.031497, + "epoch": 0.08850142792725087, + "flos": 22125462180480.0, + "grad_norm": 1.5905376338147459, + "language_loss": 0.83366269, + "learning_rate": 3.964207214074324e-06, + "loss": 0.85616213, + "num_input_tokens_seen": 31477520, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.20581055, + "step": 1472, + "time_per_iteration": 2.5031816959381104 + }, + { + "auxiliary_loss_clip": 0.01199099, + "auxiliary_loss_mlp": 0.01060684, + "balance_loss_clip": 1.06398761, + "balance_loss_mlp": 1.03766441, + "epoch": 0.08856155117991883, + "flos": 22418744728320.0, + "grad_norm": 2.1530923116012315, + "language_loss": 0.82631665, + "learning_rate": 3.964133825052146e-06, + "loss": 0.84891444, + "num_input_tokens_seen": 31495575, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.23046875, + "step": 1473, + "time_per_iteration": 2.451876401901245 + }, + { + "auxiliary_loss_clip": 0.0119456, + "auxiliary_loss_mlp": 0.01055634, + "balance_loss_clip": 1.05883288, + "balance_loss_mlp": 1.03394949, + "epoch": 0.0886216744325868, + "flos": 29937002572800.0, + "grad_norm": 2.124141561428976, + "language_loss": 0.78676975, + "learning_rate": 3.964060361549816e-06, + "loss": 0.80927169, + "num_input_tokens_seen": 31520020, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.21679688, + "step": 1474, + "time_per_iteration": 2.716201066970825 + }, + { + "auxiliary_loss_clip": 0.01195534, + "auxiliary_loss_mlp": 0.01056495, + "balance_loss_clip": 1.06284523, + "balance_loss_mlp": 1.03310621, + "epoch": 0.08868179768525478, + "flos": 23982833525760.0, + "grad_norm": 1.7805426785027154, + "language_loss": 0.79072392, + "learning_rate": 3.963986823570121e-06, + "loss": 0.81324422, + "num_input_tokens_seen": 31539265, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.23388672, + "step": 1475, + "time_per_iteration": 2.533200740814209 + }, + { + "auxiliary_loss_clip": 0.01203007, + "auxiliary_loss_mlp": 0.01049493, + "balance_loss_clip": 1.0666697, + "balance_loss_mlp": 1.02623534, + "epoch": 0.08874192093792274, + "flos": 43177553216640.0, + "grad_norm": 3.0889045650974287, + "language_loss": 0.74130052, + "learning_rate": 3.963913211115848e-06, + "loss": 0.76382548, + "num_input_tokens_seen": 31563425, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.23266602, + "step": 1476, + "time_per_iteration": 2.6491587162017822 + }, + { + "auxiliary_loss_clip": 0.01197171, + "auxiliary_loss_mlp": 0.01059788, + "balance_loss_clip": 1.0628072, + "balance_loss_mlp": 1.03709078, + "epoch": 0.0888020441905907, + "flos": 32852445868800.0, + "grad_norm": 1.579108926129831, + "language_loss": 0.74427301, + "learning_rate": 3.9638395241897895e-06, + "loss": 0.7668426, + "num_input_tokens_seen": 31584525, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.22692871, + "step": 1477, + "time_per_iteration": 2.5545856952667236 + }, + { + "auxiliary_loss_clip": 0.01198076, + "auxiliary_loss_mlp": 0.01054153, + "balance_loss_clip": 1.06460428, + "balance_loss_mlp": 1.03102648, + "epoch": 0.08886216744325869, + "flos": 23149347361920.0, + "grad_norm": 2.363006731408709, + "language_loss": 0.87104106, + "learning_rate": 3.963765762794739e-06, + "loss": 0.89356333, + "num_input_tokens_seen": 31603325, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.23132324, + "step": 1478, + "time_per_iteration": 2.472637176513672 + }, + { + "auxiliary_loss_clip": 0.01195206, + "auxiliary_loss_mlp": 0.01058549, + "balance_loss_clip": 1.06234717, + "balance_loss_mlp": 1.03648293, + "epoch": 0.08892229069592665, + "flos": 23331593992320.0, + "grad_norm": 1.6466814358039719, + "language_loss": 0.77465516, + "learning_rate": 3.963691926933495e-06, + "loss": 0.79719269, + "num_input_tokens_seen": 31624820, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.22070312, + "step": 1479, + "time_per_iteration": 2.4756171703338623 + }, + { + "auxiliary_loss_clip": 0.01197854, + "auxiliary_loss_mlp": 0.01053865, + "balance_loss_clip": 1.06337762, + "balance_loss_mlp": 1.02991557, + "epoch": 0.08898241394859462, + "flos": 26213784272640.0, + "grad_norm": 2.658820208759222, + "language_loss": 0.78002501, + "learning_rate": 3.9636180166088555e-06, + "loss": 0.80254221, + "num_input_tokens_seen": 31646080, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.23962402, + "step": 1480, + "time_per_iteration": 2.485952377319336 + }, + { + "auxiliary_loss_clip": 0.01202819, + "auxiliary_loss_mlp": 0.01060608, + "balance_loss_clip": 1.06480455, + "balance_loss_mlp": 1.03648019, + "epoch": 0.0890425372012626, + "flos": 23550613171200.0, + "grad_norm": 1.630595329833887, + "language_loss": 0.66665184, + "learning_rate": 3.963544031823624e-06, + "loss": 0.68928611, + "num_input_tokens_seen": 31665770, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.24121094, + "step": 1481, + "time_per_iteration": 2.489647626876831 + }, + { + "auxiliary_loss_clip": 0.01198077, + "auxiliary_loss_mlp": 0.01049096, + "balance_loss_clip": 1.06397462, + "balance_loss_mlp": 1.02678013, + "epoch": 0.08910266045393056, + "flos": 23002795872000.0, + "grad_norm": 2.0734504336446453, + "language_loss": 0.96156418, + "learning_rate": 3.9634699725806065e-06, + "loss": 0.98403597, + "num_input_tokens_seen": 31683805, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.22314453, + "step": 1482, + "time_per_iteration": 2.458712339401245 + }, + { + "auxiliary_loss_clip": 0.01201557, + "auxiliary_loss_mlp": 0.01058575, + "balance_loss_clip": 1.06217909, + "balance_loss_mlp": 1.03422022, + "epoch": 0.08916278370659853, + "flos": 31936508035200.0, + "grad_norm": 1.919206755316593, + "language_loss": 0.78589416, + "learning_rate": 3.96339583888261e-06, + "loss": 0.80849552, + "num_input_tokens_seen": 31704630, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.24389648, + "step": 1483, + "time_per_iteration": 2.5421531200408936 + }, + { + "auxiliary_loss_clip": 0.01195925, + "auxiliary_loss_mlp": 0.01073646, + "balance_loss_clip": 1.06367922, + "balance_loss_mlp": 1.0496608, + "epoch": 0.08922290695926649, + "flos": 17530404969600.0, + "grad_norm": 2.3650208286445698, + "language_loss": 0.85577321, + "learning_rate": 3.963321630732448e-06, + "loss": 0.87846893, + "num_input_tokens_seen": 31723255, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.23999023, + "step": 1484, + "time_per_iteration": 2.427931308746338 + }, + { + "auxiliary_loss_clip": 0.01202494, + "auxiliary_loss_mlp": 0.0106077, + "balance_loss_clip": 1.06404817, + "balance_loss_mlp": 1.03684509, + "epoch": 0.08928303021193447, + "flos": 32125075459200.0, + "grad_norm": 1.9954530905201715, + "language_loss": 0.80164367, + "learning_rate": 3.963247348132932e-06, + "loss": 0.82427633, + "num_input_tokens_seen": 31747045, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.23925781, + "step": 1485, + "time_per_iteration": 2.5566043853759766 + }, + { + "auxiliary_loss_clip": 0.01194692, + "auxiliary_loss_mlp": 0.0105702, + "balance_loss_clip": 1.05940056, + "balance_loss_mlp": 1.03361976, + "epoch": 0.08934315346460243, + "flos": 22125210785280.0, + "grad_norm": 1.7555135045991292, + "language_loss": 0.82802629, + "learning_rate": 3.96317299108688e-06, + "loss": 0.8505435, + "num_input_tokens_seen": 31766615, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.23425293, + "step": 1486, + "time_per_iteration": 2.4960782527923584 + }, + { + "auxiliary_loss_clip": 0.01197098, + "auxiliary_loss_mlp": 0.01060564, + "balance_loss_clip": 1.06342125, + "balance_loss_mlp": 1.03737736, + "epoch": 0.0894032767172704, + "flos": 22565583527040.0, + "grad_norm": 1.7999338397907116, + "language_loss": 0.76278764, + "learning_rate": 3.963098559597111e-06, + "loss": 0.78536427, + "num_input_tokens_seen": 31785855, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.23181152, + "step": 1487, + "time_per_iteration": 2.465285062789917 + }, + { + "auxiliary_loss_clip": 0.01209196, + "auxiliary_loss_mlp": 0.01057191, + "balance_loss_clip": 1.07331038, + "balance_loss_mlp": 1.03314638, + "epoch": 0.08946339996993838, + "flos": 20193396503040.0, + "grad_norm": 2.4764106872657026, + "language_loss": 0.82843935, + "learning_rate": 3.963024053666449e-06, + "loss": 0.85110319, + "num_input_tokens_seen": 31804210, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.24047852, + "step": 1488, + "time_per_iteration": 2.4732697010040283 + }, + { + "auxiliary_loss_clip": 0.01202908, + "auxiliary_loss_mlp": 0.01054942, + "balance_loss_clip": 1.06730652, + "balance_loss_mlp": 1.03132677, + "epoch": 0.08952352322260634, + "flos": 48360181104000.0, + "grad_norm": 1.9460135406390646, + "language_loss": 0.71559644, + "learning_rate": 3.962949473297718e-06, + "loss": 0.73817492, + "num_input_tokens_seen": 31826150, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.23632812, + "step": 1489, + "time_per_iteration": 2.7149219512939453 + }, + { + "auxiliary_loss_clip": 0.01195723, + "auxiliary_loss_mlp": 0.01051576, + "balance_loss_clip": 1.06199896, + "balance_loss_mlp": 1.02898574, + "epoch": 0.08958364647527431, + "flos": 31793081028480.0, + "grad_norm": 2.1863152033352047, + "language_loss": 0.89926052, + "learning_rate": 3.962874818493745e-06, + "loss": 0.9217335, + "num_input_tokens_seen": 31848060, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.22595215, + "step": 1490, + "time_per_iteration": 2.5543227195739746 + }, + { + "auxiliary_loss_clip": 0.01200836, + "auxiliary_loss_mlp": 0.01064036, + "balance_loss_clip": 1.06162715, + "balance_loss_mlp": 1.04118371, + "epoch": 0.08964376972794229, + "flos": 23368186972800.0, + "grad_norm": 6.596603341700405, + "language_loss": 0.73184562, + "learning_rate": 3.9628000892573635e-06, + "loss": 0.75449425, + "num_input_tokens_seen": 31870040, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.2286377, + "step": 1491, + "time_per_iteration": 2.5307118892669678 + }, + { + "auxiliary_loss_clip": 0.01206603, + "auxiliary_loss_mlp": 0.01054313, + "balance_loss_clip": 1.07310712, + "balance_loss_mlp": 1.03305829, + "epoch": 0.08970389298061025, + "flos": 23294785530240.0, + "grad_norm": 1.9130579642074608, + "language_loss": 0.77173948, + "learning_rate": 3.9627252855914055e-06, + "loss": 0.79434866, + "num_input_tokens_seen": 31890400, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.21264648, + "step": 1492, + "time_per_iteration": 2.4866671562194824 + }, + { + "auxiliary_loss_clip": 0.01192272, + "auxiliary_loss_mlp": 0.0105135, + "balance_loss_clip": 1.06297159, + "balance_loss_mlp": 1.02868867, + "epoch": 0.08976401623327822, + "flos": 33761703772800.0, + "grad_norm": 2.379727281126403, + "language_loss": 0.70858049, + "learning_rate": 3.962650407498707e-06, + "loss": 0.73101664, + "num_input_tokens_seen": 31913435, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.22680664, + "step": 1493, + "time_per_iteration": 2.5776302814483643 + }, + { + "auxiliary_loss_clip": 0.01198633, + "auxiliary_loss_mlp": 0.01053618, + "balance_loss_clip": 1.06437159, + "balance_loss_mlp": 1.0302527, + "epoch": 0.08982413948594618, + "flos": 23911335504000.0, + "grad_norm": 2.3876053574356955, + "language_loss": 0.87006807, + "learning_rate": 3.962575454982109e-06, + "loss": 0.89259058, + "num_input_tokens_seen": 31932435, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.23376465, + "step": 1494, + "time_per_iteration": 2.475684404373169 + }, + { + "auxiliary_loss_clip": 0.01200696, + "auxiliary_loss_mlp": 0.01062577, + "balance_loss_clip": 1.06754673, + "balance_loss_mlp": 1.0387466, + "epoch": 0.08988426273861416, + "flos": 16837544551680.0, + "grad_norm": 2.040542708183433, + "language_loss": 0.82918078, + "learning_rate": 3.962500428044454e-06, + "loss": 0.8518135, + "num_input_tokens_seen": 31950125, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.23840332, + "step": 1495, + "time_per_iteration": 2.592409372329712 + }, + { + "auxiliary_loss_clip": 0.01203062, + "auxiliary_loss_mlp": 0.0105953, + "balance_loss_clip": 1.06789505, + "balance_loss_mlp": 1.03711891, + "epoch": 0.08994438599128213, + "flos": 14793365548800.0, + "grad_norm": 2.9291143406731375, + "language_loss": 0.7019521, + "learning_rate": 3.962425326688585e-06, + "loss": 0.72457802, + "num_input_tokens_seen": 31968050, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.22399902, + "step": 1496, + "time_per_iteration": 2.4003257751464844 + }, + { + "auxiliary_loss_clip": 0.01193335, + "auxiliary_loss_mlp": 0.01050069, + "balance_loss_clip": 1.06234097, + "balance_loss_mlp": 1.02881432, + "epoch": 0.09000450924395009, + "flos": 17384320356480.0, + "grad_norm": 2.007083464228843, + "language_loss": 0.79474139, + "learning_rate": 3.962350150917351e-06, + "loss": 0.81717539, + "num_input_tokens_seen": 31985675, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.21264648, + "step": 1497, + "time_per_iteration": 2.4634571075439453 + }, + { + "auxiliary_loss_clip": 0.01209765, + "auxiliary_loss_mlp": 0.01058718, + "balance_loss_clip": 1.06887949, + "balance_loss_mlp": 1.03507924, + "epoch": 0.09006463249661807, + "flos": 24280317964800.0, + "grad_norm": 6.916517095675672, + "language_loss": 0.8261497, + "learning_rate": 3.9622749007336035e-06, + "loss": 0.84883451, + "num_input_tokens_seen": 32005180, + "router_z_loss_clip": 1.40917969, + "router_z_loss_mlp": 0.23632812, + "step": 1498, + "time_per_iteration": 3.9027414321899414 + }, + { + "auxiliary_loss_clip": 0.01198438, + "auxiliary_loss_mlp": 0.01061602, + "balance_loss_clip": 1.06631386, + "balance_loss_mlp": 1.04008484, + "epoch": 0.09012475574928604, + "flos": 13661928069120.0, + "grad_norm": 2.9584928411394382, + "language_loss": 0.78854769, + "learning_rate": 3.962199576140195e-06, + "loss": 0.81114811, + "num_input_tokens_seen": 32022970, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.2154541, + "step": 1499, + "time_per_iteration": 3.8322057723999023 + }, + { + "auxiliary_loss_clip": 0.01204116, + "auxiliary_loss_mlp": 0.01058566, + "balance_loss_clip": 1.07285643, + "balance_loss_mlp": 1.03605938, + "epoch": 0.090184879001954, + "flos": 23327751237120.0, + "grad_norm": 1.6171554755915292, + "language_loss": 0.93174148, + "learning_rate": 3.962124177139981e-06, + "loss": 0.95436829, + "num_input_tokens_seen": 32043055, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.22497559, + "step": 1500, + "time_per_iteration": 2.4893667697906494 + }, + { + "auxiliary_loss_clip": 0.01192769, + "auxiliary_loss_mlp": 0.01053833, + "balance_loss_clip": 1.05810142, + "balance_loss_mlp": 1.02908468, + "epoch": 0.09024500225462198, + "flos": 23002688131200.0, + "grad_norm": 5.954036582798612, + "language_loss": 0.74296111, + "learning_rate": 3.962048703735822e-06, + "loss": 0.76542711, + "num_input_tokens_seen": 32061900, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.24768066, + "step": 1501, + "time_per_iteration": 2.4686684608459473 + }, + { + "auxiliary_loss_clip": 0.01089484, + "auxiliary_loss_mlp": 0.01019303, + "balance_loss_clip": 1.03520477, + "balance_loss_mlp": 1.01610839, + "epoch": 0.09030512550728995, + "flos": 62189203242240.0, + "grad_norm": 0.7305077010651958, + "language_loss": 0.58303654, + "learning_rate": 3.96197315593058e-06, + "loss": 0.60412443, + "num_input_tokens_seen": 32122745, + "router_z_loss_clip": 0.54101562, + "router_z_loss_mlp": 0.03198242, + "step": 1502, + "time_per_iteration": 3.093172311782837 + }, + { + "auxiliary_loss_clip": 0.01198384, + "auxiliary_loss_mlp": 0.01049617, + "balance_loss_clip": 1.06526768, + "balance_loss_mlp": 1.02827859, + "epoch": 0.09036524875995791, + "flos": 38800689171840.0, + "grad_norm": 2.573089699193843, + "language_loss": 0.69826865, + "learning_rate": 3.961897533727119e-06, + "loss": 0.7207486, + "num_input_tokens_seen": 32145125, + "router_z_loss_clip": 1.33105469, + "router_z_loss_mlp": 0.21337891, + "step": 1503, + "time_per_iteration": 2.599553346633911 + }, + { + "auxiliary_loss_clip": 0.01193795, + "auxiliary_loss_mlp": 0.010594, + "balance_loss_clip": 1.0596565, + "balance_loss_mlp": 1.03751278, + "epoch": 0.09042537201262588, + "flos": 21690081429120.0, + "grad_norm": 2.18950894897629, + "language_loss": 0.86224914, + "learning_rate": 3.961821837128306e-06, + "loss": 0.88478112, + "num_input_tokens_seen": 32166255, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.21887207, + "step": 1504, + "time_per_iteration": 5.370445013046265 + }, + { + "auxiliary_loss_clip": 0.01207441, + "auxiliary_loss_mlp": 0.01061828, + "balance_loss_clip": 1.069731, + "balance_loss_mlp": 1.03626907, + "epoch": 0.09048549526529386, + "flos": 22267021680000.0, + "grad_norm": 2.5346336968483314, + "language_loss": 0.72237968, + "learning_rate": 3.961746066137014e-06, + "loss": 0.74507236, + "num_input_tokens_seen": 32184010, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.25561523, + "step": 1505, + "time_per_iteration": 2.469116449356079 + }, + { + "auxiliary_loss_clip": 0.01196148, + "auxiliary_loss_mlp": 0.01054983, + "balance_loss_clip": 1.06647682, + "balance_loss_mlp": 1.03285801, + "epoch": 0.09054561851796182, + "flos": 14610939350400.0, + "grad_norm": 2.065412825579389, + "language_loss": 0.81134236, + "learning_rate": 3.961670220756114e-06, + "loss": 0.83385366, + "num_input_tokens_seen": 32201635, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.22131348, + "step": 1506, + "time_per_iteration": 2.445068597793579 + }, + { + "auxiliary_loss_clip": 0.01195488, + "auxiliary_loss_mlp": 0.01056594, + "balance_loss_clip": 1.06417704, + "balance_loss_mlp": 1.03301418, + "epoch": 0.09060574177062979, + "flos": 27636169916160.0, + "grad_norm": 1.889135574536309, + "language_loss": 0.76290119, + "learning_rate": 3.961594300988482e-06, + "loss": 0.78542197, + "num_input_tokens_seen": 32221940, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.23571777, + "step": 1507, + "time_per_iteration": 2.546595811843872 + }, + { + "auxiliary_loss_clip": 0.01102272, + "auxiliary_loss_mlp": 0.01012006, + "balance_loss_clip": 1.04821849, + "balance_loss_mlp": 1.00881147, + "epoch": 0.09066586502329776, + "flos": 66085797513600.0, + "grad_norm": 0.7317612738814225, + "language_loss": 0.57639372, + "learning_rate": 3.961518306836998e-06, + "loss": 0.5975365, + "num_input_tokens_seen": 32276495, + "router_z_loss_clip": 0.54003906, + "router_z_loss_mlp": 0.0319519, + "step": 1508, + "time_per_iteration": 2.9586429595947266 + }, + { + "auxiliary_loss_clip": 0.01194408, + "auxiliary_loss_mlp": 0.01055681, + "balance_loss_clip": 1.06190538, + "balance_loss_mlp": 1.03270888, + "epoch": 0.09072598827596573, + "flos": 18916449027840.0, + "grad_norm": 2.6012104143618866, + "language_loss": 0.85216212, + "learning_rate": 3.961442238304543e-06, + "loss": 0.874663, + "num_input_tokens_seen": 32294130, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.22973633, + "step": 1509, + "time_per_iteration": 2.454474687576294 + }, + { + "auxiliary_loss_clip": 0.01200444, + "auxiliary_loss_mlp": 0.01061663, + "balance_loss_clip": 1.06316459, + "balance_loss_mlp": 1.0374155, + "epoch": 0.0907861115286337, + "flos": 24821742643200.0, + "grad_norm": 2.7381889096031022, + "language_loss": 0.83966064, + "learning_rate": 3.961366095394002e-06, + "loss": 0.86228174, + "num_input_tokens_seen": 32313555, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.24267578, + "step": 1510, + "time_per_iteration": 2.511934995651245 + }, + { + "auxiliary_loss_clip": 0.01198195, + "auxiliary_loss_mlp": 0.01054471, + "balance_loss_clip": 1.06324697, + "balance_loss_mlp": 1.03136778, + "epoch": 0.09084623478130167, + "flos": 21652842003840.0, + "grad_norm": 1.9919797974376006, + "language_loss": 0.8530966, + "learning_rate": 3.961289878108262e-06, + "loss": 0.87562323, + "num_input_tokens_seen": 32331430, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.23095703, + "step": 1511, + "time_per_iteration": 2.454730749130249 + }, + { + "auxiliary_loss_clip": 0.01198058, + "auxiliary_loss_mlp": 0.01049904, + "balance_loss_clip": 1.06753004, + "balance_loss_mlp": 1.02754033, + "epoch": 0.09090635803396964, + "flos": 27639258485760.0, + "grad_norm": 1.962573971016332, + "language_loss": 0.85185814, + "learning_rate": 3.9612135864502135e-06, + "loss": 0.87433779, + "num_input_tokens_seen": 32353705, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.22351074, + "step": 1512, + "time_per_iteration": 2.5285556316375732 + }, + { + "auxiliary_loss_clip": 0.01190232, + "auxiliary_loss_mlp": 0.01050126, + "balance_loss_clip": 1.06180489, + "balance_loss_mlp": 1.02945495, + "epoch": 0.0909664812866376, + "flos": 17669127294720.0, + "grad_norm": 2.394134030326647, + "language_loss": 0.86782396, + "learning_rate": 3.961137220422749e-06, + "loss": 0.8902275, + "num_input_tokens_seen": 32370520, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.20678711, + "step": 1513, + "time_per_iteration": 2.485741376876831 + }, + { + "auxiliary_loss_clip": 0.0118967, + "auxiliary_loss_mlp": 0.01048653, + "balance_loss_clip": 1.06073451, + "balance_loss_mlp": 1.02810121, + "epoch": 0.09102660453930557, + "flos": 23951448017280.0, + "grad_norm": 1.9612964854167159, + "language_loss": 0.86790335, + "learning_rate": 3.961060780028764e-06, + "loss": 0.89028662, + "num_input_tokens_seen": 32389105, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.20544434, + "step": 1514, + "time_per_iteration": 2.4911086559295654 + }, + { + "auxiliary_loss_clip": 0.01190128, + "auxiliary_loss_mlp": 0.01052207, + "balance_loss_clip": 1.06215334, + "balance_loss_mlp": 1.03241873, + "epoch": 0.09108672779197355, + "flos": 25812949426560.0, + "grad_norm": 1.8563694678631568, + "language_loss": 0.90137661, + "learning_rate": 3.960984265271159e-06, + "loss": 0.92379999, + "num_input_tokens_seen": 32408065, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.19775391, + "step": 1515, + "time_per_iteration": 2.5670206546783447 + }, + { + "auxiliary_loss_clip": 0.01193089, + "auxiliary_loss_mlp": 0.01045179, + "balance_loss_clip": 1.06189346, + "balance_loss_mlp": 1.02268422, + "epoch": 0.09114685104464151, + "flos": 29639482220160.0, + "grad_norm": 2.5437263934124257, + "language_loss": 0.85133433, + "learning_rate": 3.9609076761528335e-06, + "loss": 0.87371707, + "num_input_tokens_seen": 32427225, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.22509766, + "step": 1516, + "time_per_iteration": 2.5068519115448 + }, + { + "auxiliary_loss_clip": 0.0120314, + "auxiliary_loss_mlp": 0.01053062, + "balance_loss_clip": 1.06630969, + "balance_loss_mlp": 1.03080535, + "epoch": 0.09120697429730948, + "flos": 33729635905920.0, + "grad_norm": 1.5284980293590598, + "language_loss": 0.8117348, + "learning_rate": 3.960831012676692e-06, + "loss": 0.83429682, + "num_input_tokens_seen": 32450510, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.22253418, + "step": 1517, + "time_per_iteration": 2.555699348449707 + }, + { + "auxiliary_loss_clip": 0.0119785, + "auxiliary_loss_mlp": 0.01065139, + "balance_loss_clip": 1.06404018, + "balance_loss_mlp": 1.04290676, + "epoch": 0.09126709754997746, + "flos": 18401381953920.0, + "grad_norm": 1.964032507992795, + "language_loss": 0.77770561, + "learning_rate": 3.960754274845642e-06, + "loss": 0.80033553, + "num_input_tokens_seen": 32468425, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.22229004, + "step": 1518, + "time_per_iteration": 2.455714225769043 + }, + { + "auxiliary_loss_clip": 0.01192416, + "auxiliary_loss_mlp": 0.01060516, + "balance_loss_clip": 1.06161988, + "balance_loss_mlp": 1.03830683, + "epoch": 0.09132722080264542, + "flos": 22091957769600.0, + "grad_norm": 1.8385223090200813, + "language_loss": 0.8635453, + "learning_rate": 3.960677462662594e-06, + "loss": 0.88607466, + "num_input_tokens_seen": 32487510, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.22229004, + "step": 1519, + "time_per_iteration": 2.5269808769226074 + }, + { + "auxiliary_loss_clip": 0.01189601, + "auxiliary_loss_mlp": 0.01057345, + "balance_loss_clip": 1.05742812, + "balance_loss_mlp": 1.0325613, + "epoch": 0.09138734405531339, + "flos": 21033131633280.0, + "grad_norm": 2.1320553287088284, + "language_loss": 0.72968268, + "learning_rate": 3.96060057613046e-06, + "loss": 0.75215214, + "num_input_tokens_seen": 32507250, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.24768066, + "step": 1520, + "time_per_iteration": 2.5288634300231934 + }, + { + "auxiliary_loss_clip": 0.01192671, + "auxiliary_loss_mlp": 0.01052738, + "balance_loss_clip": 1.05984473, + "balance_loss_mlp": 1.02969503, + "epoch": 0.09144746730798137, + "flos": 20083940784000.0, + "grad_norm": 7.140826216116647, + "language_loss": 0.85881805, + "learning_rate": 3.960523615252156e-06, + "loss": 0.8812722, + "num_input_tokens_seen": 32526045, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.23034668, + "step": 1521, + "time_per_iteration": 2.4286251068115234 + }, + { + "auxiliary_loss_clip": 0.01199863, + "auxiliary_loss_mlp": 0.01057488, + "balance_loss_clip": 1.06421077, + "balance_loss_mlp": 1.03443265, + "epoch": 0.09150759056064933, + "flos": 22778210085120.0, + "grad_norm": 2.4461705754262817, + "language_loss": 0.84211898, + "learning_rate": 3.960446580030599e-06, + "loss": 0.86469245, + "num_input_tokens_seen": 32546575, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.23059082, + "step": 1522, + "time_per_iteration": 2.4694712162017822 + }, + { + "auxiliary_loss_clip": 0.01188816, + "auxiliary_loss_mlp": 0.01059678, + "balance_loss_clip": 1.05999672, + "balance_loss_mlp": 1.03682542, + "epoch": 0.0915677138133173, + "flos": 27564205017600.0, + "grad_norm": 1.8443951754286805, + "language_loss": 0.80904305, + "learning_rate": 3.960369470468711e-06, + "loss": 0.83152807, + "num_input_tokens_seen": 32568795, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.2286377, + "step": 1523, + "time_per_iteration": 2.5210115909576416 + }, + { + "auxiliary_loss_clip": 0.01196416, + "auxiliary_loss_mlp": 0.0106173, + "balance_loss_clip": 1.0643332, + "balance_loss_mlp": 1.03985548, + "epoch": 0.09162783706598528, + "flos": 17674765729920.0, + "grad_norm": 2.192696063271926, + "language_loss": 0.74699426, + "learning_rate": 3.960292286569418e-06, + "loss": 0.76957572, + "num_input_tokens_seen": 32587010, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.21875, + "step": 1524, + "time_per_iteration": 2.4059898853302 + }, + { + "auxiliary_loss_clip": 0.01193499, + "auxiliary_loss_mlp": 0.01055272, + "balance_loss_clip": 1.06203067, + "balance_loss_mlp": 1.03301609, + "epoch": 0.09168796031865324, + "flos": 18478195188480.0, + "grad_norm": 4.111873046902377, + "language_loss": 0.86197114, + "learning_rate": 3.960215028335644e-06, + "loss": 0.8844589, + "num_input_tokens_seen": 32602375, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.22265625, + "step": 1525, + "time_per_iteration": 2.422893762588501 + }, + { + "auxiliary_loss_clip": 0.01196634, + "auxiliary_loss_mlp": 0.0104717, + "balance_loss_clip": 1.06515217, + "balance_loss_mlp": 1.02436507, + "epoch": 0.0917480835713212, + "flos": 29387605075200.0, + "grad_norm": 2.53621996313909, + "language_loss": 0.74632663, + "learning_rate": 3.96013769577032e-06, + "loss": 0.76876467, + "num_input_tokens_seen": 32621460, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.22802734, + "step": 1526, + "time_per_iteration": 2.510282516479492 + }, + { + "auxiliary_loss_clip": 0.01188443, + "auxiliary_loss_mlp": 0.01049187, + "balance_loss_clip": 1.06030369, + "balance_loss_mlp": 1.02781248, + "epoch": 0.09180820682398917, + "flos": 19829262378240.0, + "grad_norm": 2.0122111971909065, + "language_loss": 0.77202845, + "learning_rate": 3.960060288876378e-06, + "loss": 0.7944048, + "num_input_tokens_seen": 32640440, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.21398926, + "step": 1527, + "time_per_iteration": 2.446504592895508 + }, + { + "auxiliary_loss_clip": 0.01195041, + "auxiliary_loss_mlp": 0.01055307, + "balance_loss_clip": 1.06290007, + "balance_loss_mlp": 1.03190589, + "epoch": 0.09186833007665715, + "flos": 23841848643840.0, + "grad_norm": 2.552438540871102, + "language_loss": 0.78729457, + "learning_rate": 3.959982807656753e-06, + "loss": 0.809798, + "num_input_tokens_seen": 32660020, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.23425293, + "step": 1528, + "time_per_iteration": 2.5066792964935303 + }, + { + "auxiliary_loss_clip": 0.01193185, + "auxiliary_loss_mlp": 0.01047269, + "balance_loss_clip": 1.06127238, + "balance_loss_mlp": 1.02598989, + "epoch": 0.09192845332932512, + "flos": 12932726065920.0, + "grad_norm": 3.6121383769686117, + "language_loss": 0.76522654, + "learning_rate": 3.959905252114384e-06, + "loss": 0.78763109, + "num_input_tokens_seen": 32678170, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.21276855, + "step": 1529, + "time_per_iteration": 2.454122304916382 + }, + { + "auxiliary_loss_clip": 0.01191639, + "auxiliary_loss_mlp": 0.01046676, + "balance_loss_clip": 1.05837214, + "balance_loss_mlp": 1.0238471, + "epoch": 0.09198857658199308, + "flos": 24568177559040.0, + "grad_norm": 2.4107521578430946, + "language_loss": 0.82843518, + "learning_rate": 3.959827622252211e-06, + "loss": 0.85081828, + "num_input_tokens_seen": 32697540, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.22839355, + "step": 1530, + "time_per_iteration": 2.479022979736328 + }, + { + "auxiliary_loss_clip": 0.0120453, + "auxiliary_loss_mlp": 0.01058566, + "balance_loss_clip": 1.07321954, + "balance_loss_mlp": 1.03608322, + "epoch": 0.09204869983466106, + "flos": 20266941600000.0, + "grad_norm": 5.477231611842539, + "language_loss": 0.83409595, + "learning_rate": 3.959749918073179e-06, + "loss": 0.85672694, + "num_input_tokens_seen": 32716805, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.22485352, + "step": 1531, + "time_per_iteration": 2.4730939865112305 + }, + { + "auxiliary_loss_clip": 0.01194717, + "auxiliary_loss_mlp": 0.01051982, + "balance_loss_clip": 1.06394553, + "balance_loss_mlp": 1.02986884, + "epoch": 0.09210882308732903, + "flos": 20885646389760.0, + "grad_norm": 1.9794351162292507, + "language_loss": 0.81189686, + "learning_rate": 3.959672139580233e-06, + "loss": 0.83436382, + "num_input_tokens_seen": 32736385, + "router_z_loss_clip": 1.30957031, + "router_z_loss_mlp": 0.22119141, + "step": 1532, + "time_per_iteration": 2.4986045360565186 + }, + { + "auxiliary_loss_clip": 0.01193948, + "auxiliary_loss_mlp": 0.0105103, + "balance_loss_clip": 1.06317854, + "balance_loss_mlp": 1.02858257, + "epoch": 0.09216894633999699, + "flos": 30956326727040.0, + "grad_norm": 2.1881117003841752, + "language_loss": 0.83958447, + "learning_rate": 3.9595942867763235e-06, + "loss": 0.8620342, + "num_input_tokens_seen": 32757140, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.22436523, + "step": 1533, + "time_per_iteration": 2.5364012718200684 + }, + { + "auxiliary_loss_clip": 0.01193637, + "auxiliary_loss_mlp": 0.01048722, + "balance_loss_clip": 1.06213439, + "balance_loss_mlp": 1.02700198, + "epoch": 0.09222906959266497, + "flos": 13151565676800.0, + "grad_norm": 3.0435145002370585, + "language_loss": 0.90194356, + "learning_rate": 3.959516359664402e-06, + "loss": 0.92436713, + "num_input_tokens_seen": 32774860, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.21740723, + "step": 1534, + "time_per_iteration": 2.47128963470459 + }, + { + "auxiliary_loss_clip": 0.01198821, + "auxiliary_loss_mlp": 0.01065913, + "balance_loss_clip": 1.06254578, + "balance_loss_mlp": 1.0412364, + "epoch": 0.09228919284533293, + "flos": 25994477784960.0, + "grad_norm": 2.179736938722815, + "language_loss": 0.75954312, + "learning_rate": 3.959438358247424e-06, + "loss": 0.78219044, + "num_input_tokens_seen": 32795250, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.24707031, + "step": 1535, + "time_per_iteration": 2.516408920288086 + }, + { + "auxiliary_loss_clip": 0.01184884, + "auxiliary_loss_mlp": 0.01045459, + "balance_loss_clip": 1.05839789, + "balance_loss_mlp": 1.02443027, + "epoch": 0.0923493160980009, + "flos": 18660800954880.0, + "grad_norm": 1.7859786318222945, + "language_loss": 0.81394911, + "learning_rate": 3.959360282528346e-06, + "loss": 0.83625257, + "num_input_tokens_seen": 32813805, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.21032715, + "step": 1536, + "time_per_iteration": 2.4552628993988037 + }, + { + "auxiliary_loss_clip": 0.01195585, + "auxiliary_loss_mlp": 0.01051977, + "balance_loss_clip": 1.06595874, + "balance_loss_mlp": 1.03063869, + "epoch": 0.09240943935066886, + "flos": 21140576190720.0, + "grad_norm": 2.0389705676071834, + "language_loss": 0.88846248, + "learning_rate": 3.959282132510131e-06, + "loss": 0.91093808, + "num_input_tokens_seen": 32830960, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.21337891, + "step": 1537, + "time_per_iteration": 2.4786553382873535 + }, + { + "auxiliary_loss_clip": 0.01217438, + "auxiliary_loss_mlp": 0.01067741, + "balance_loss_clip": 1.07936263, + "balance_loss_mlp": 1.04437566, + "epoch": 0.09246956260333684, + "flos": 20592435669120.0, + "grad_norm": 2.096830012866628, + "language_loss": 0.8064028, + "learning_rate": 3.959203908195741e-06, + "loss": 0.82925463, + "num_input_tokens_seen": 32848275, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.23352051, + "step": 1538, + "time_per_iteration": 2.449911594390869 + }, + { + "auxiliary_loss_clip": 0.01106478, + "auxiliary_loss_mlp": 0.01008267, + "balance_loss_clip": 1.05231822, + "balance_loss_mlp": 1.00488114, + "epoch": 0.09252968585600481, + "flos": 67558710614400.0, + "grad_norm": 0.7317806414838182, + "language_loss": 0.57366717, + "learning_rate": 3.959125609588142e-06, + "loss": 0.5948146, + "num_input_tokens_seen": 32917730, + "router_z_loss_clip": 0.54199219, + "router_z_loss_mlp": 0.03384399, + "step": 1539, + "time_per_iteration": 3.1868698596954346 + }, + { + "auxiliary_loss_clip": 0.01189242, + "auxiliary_loss_mlp": 0.01055511, + "balance_loss_clip": 1.05963004, + "balance_loss_mlp": 1.03183651, + "epoch": 0.09258980910867277, + "flos": 17383853479680.0, + "grad_norm": 2.8178646379367613, + "language_loss": 0.67540967, + "learning_rate": 3.959047236690304e-06, + "loss": 0.69785714, + "num_input_tokens_seen": 32934910, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.23657227, + "step": 1540, + "time_per_iteration": 2.4358036518096924 + }, + { + "auxiliary_loss_clip": 0.01192209, + "auxiliary_loss_mlp": 0.01053493, + "balance_loss_clip": 1.06171393, + "balance_loss_mlp": 1.02899528, + "epoch": 0.09264993236134075, + "flos": 19865927185920.0, + "grad_norm": 1.8470233503470923, + "language_loss": 0.83753085, + "learning_rate": 3.958968789505198e-06, + "loss": 0.85998785, + "num_input_tokens_seen": 32953840, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.24499512, + "step": 1541, + "time_per_iteration": 2.4724245071411133 + }, + { + "auxiliary_loss_clip": 0.0108025, + "auxiliary_loss_mlp": 0.01015459, + "balance_loss_clip": 1.02763379, + "balance_loss_mlp": 1.01235092, + "epoch": 0.09271005561400872, + "flos": 62284401262080.0, + "grad_norm": 0.8872601931550986, + "language_loss": 0.61873007, + "learning_rate": 3.9588902680358e-06, + "loss": 0.63968718, + "num_input_tokens_seen": 33011410, + "router_z_loss_clip": 0.52539062, + "router_z_loss_mlp": 0.03109741, + "step": 1542, + "time_per_iteration": 4.504995584487915 + }, + { + "auxiliary_loss_clip": 0.01191662, + "auxiliary_loss_mlp": 0.01067712, + "balance_loss_clip": 1.06012201, + "balance_loss_mlp": 1.0444665, + "epoch": 0.09277017886667668, + "flos": 23329870139520.0, + "grad_norm": 1.6721082890884, + "language_loss": 0.82962084, + "learning_rate": 3.958811672285086e-06, + "loss": 0.85221457, + "num_input_tokens_seen": 33031675, + "router_z_loss_clip": 1.31640625, + "router_z_loss_mlp": 0.23242188, + "step": 1543, + "time_per_iteration": 3.971769332885742 + }, + { + "auxiliary_loss_clip": 0.01213751, + "auxiliary_loss_mlp": 0.01055785, + "balance_loss_clip": 1.08007216, + "balance_loss_mlp": 1.03395748, + "epoch": 0.09283030211934466, + "flos": 54745169875200.0, + "grad_norm": 2.1366548945473998, + "language_loss": 0.72464168, + "learning_rate": 3.958733002256038e-06, + "loss": 0.74733704, + "num_input_tokens_seen": 33056355, + "router_z_loss_clip": 1.33789062, + "router_z_loss_mlp": 0.21813965, + "step": 1544, + "time_per_iteration": 2.773512601852417 + }, + { + "auxiliary_loss_clip": 0.01195127, + "auxiliary_loss_mlp": 0.01053108, + "balance_loss_clip": 1.0622766, + "balance_loss_mlp": 1.03057694, + "epoch": 0.09289042537201263, + "flos": 30334784762880.0, + "grad_norm": 1.623255463760798, + "language_loss": 0.77391148, + "learning_rate": 3.958654257951637e-06, + "loss": 0.79639381, + "num_input_tokens_seen": 33079520, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.22546387, + "step": 1545, + "time_per_iteration": 2.520751476287842 + }, + { + "auxiliary_loss_clip": 0.01183432, + "auxiliary_loss_mlp": 0.01059631, + "balance_loss_clip": 1.05745935, + "balance_loss_mlp": 1.03569341, + "epoch": 0.09295054862468059, + "flos": 17746838369280.0, + "grad_norm": 3.1750617569992174, + "language_loss": 0.74433088, + "learning_rate": 3.9585754393748706e-06, + "loss": 0.76676148, + "num_input_tokens_seen": 33096135, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.23962402, + "step": 1546, + "time_per_iteration": 2.3982391357421875 + }, + { + "auxiliary_loss_clip": 0.01193855, + "auxiliary_loss_mlp": 0.01059476, + "balance_loss_clip": 1.06105578, + "balance_loss_mlp": 1.03662348, + "epoch": 0.09301067187734856, + "flos": 23658021815040.0, + "grad_norm": 2.0225399363958734, + "language_loss": 0.84413826, + "learning_rate": 3.9584965465287275e-06, + "loss": 0.86667162, + "num_input_tokens_seen": 33115245, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.22851562, + "step": 1547, + "time_per_iteration": 3.9145469665527344 + }, + { + "auxiliary_loss_clip": 0.01188979, + "auxiliary_loss_mlp": 0.01052418, + "balance_loss_clip": 1.05897808, + "balance_loss_mlp": 1.0305078, + "epoch": 0.09307079513001654, + "flos": 27527719777920.0, + "grad_norm": 2.0129605614620134, + "language_loss": 0.67655957, + "learning_rate": 3.958417579416199e-06, + "loss": 0.69897354, + "num_input_tokens_seen": 33136640, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.21911621, + "step": 1548, + "time_per_iteration": 2.5042622089385986 + }, + { + "auxiliary_loss_clip": 0.01191306, + "auxiliary_loss_mlp": 0.01056653, + "balance_loss_clip": 1.05845833, + "balance_loss_mlp": 1.03451586, + "epoch": 0.0931309183826845, + "flos": 20627340710400.0, + "grad_norm": 2.2950571862461024, + "language_loss": 0.83356738, + "learning_rate": 3.9583385380402795e-06, + "loss": 0.85604703, + "num_input_tokens_seen": 33155060, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.22155762, + "step": 1549, + "time_per_iteration": 2.451545476913452 + }, + { + "auxiliary_loss_clip": 0.01198242, + "auxiliary_loss_mlp": 0.01048571, + "balance_loss_clip": 1.06767702, + "balance_loss_mlp": 1.02669644, + "epoch": 0.09319104163535247, + "flos": 29020921084800.0, + "grad_norm": 1.5969752426719157, + "language_loss": 0.75404537, + "learning_rate": 3.958259422403966e-06, + "loss": 0.77651346, + "num_input_tokens_seen": 33175420, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.21875, + "step": 1550, + "time_per_iteration": 2.5426790714263916 + }, + { + "auxiliary_loss_clip": 0.01194463, + "auxiliary_loss_mlp": 0.01065635, + "balance_loss_clip": 1.0622201, + "balance_loss_mlp": 1.04118574, + "epoch": 0.09325116488802045, + "flos": 25301545539840.0, + "grad_norm": 2.272486371685719, + "language_loss": 0.8328706, + "learning_rate": 3.95818023251026e-06, + "loss": 0.85547161, + "num_input_tokens_seen": 33194120, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.24462891, + "step": 1551, + "time_per_iteration": 2.503980875015259 + }, + { + "auxiliary_loss_clip": 0.01106491, + "auxiliary_loss_mlp": 0.01006952, + "balance_loss_clip": 1.05199039, + "balance_loss_mlp": 1.00390959, + "epoch": 0.09331128814068841, + "flos": 61536203942400.0, + "grad_norm": 0.8553766683917077, + "language_loss": 0.61824399, + "learning_rate": 3.958100968362163e-06, + "loss": 0.63937843, + "num_input_tokens_seen": 33261080, + "router_z_loss_clip": 0.54394531, + "router_z_loss_mlp": 0.03042603, + "step": 1552, + "time_per_iteration": 3.208505153656006 + }, + { + "auxiliary_loss_clip": 0.01090433, + "auxiliary_loss_mlp": 0.01007372, + "balance_loss_clip": 1.03738523, + "balance_loss_mlp": 1.00403976, + "epoch": 0.09337141139335638, + "flos": 53293700171520.0, + "grad_norm": 0.8322581066716535, + "language_loss": 0.58937854, + "learning_rate": 3.958021629962681e-06, + "loss": 0.61035657, + "num_input_tokens_seen": 33330235, + "router_z_loss_clip": 0.52929688, + "router_z_loss_mlp": 0.0333252, + "step": 1553, + "time_per_iteration": 3.208962917327881 + }, + { + "auxiliary_loss_clip": 0.01186425, + "auxiliary_loss_mlp": 0.01055714, + "balance_loss_clip": 1.05525422, + "balance_loss_mlp": 1.03309989, + "epoch": 0.09343153464602436, + "flos": 23476852592640.0, + "grad_norm": 1.6818095689780355, + "language_loss": 0.87579644, + "learning_rate": 3.957942217314823e-06, + "loss": 0.8982178, + "num_input_tokens_seen": 33349035, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.22631836, + "step": 1554, + "time_per_iteration": 2.5941781997680664 + }, + { + "auxiliary_loss_clip": 0.01186224, + "auxiliary_loss_mlp": 0.01056136, + "balance_loss_clip": 1.0597291, + "balance_loss_mlp": 1.03351045, + "epoch": 0.09349165789869232, + "flos": 19353481804800.0, + "grad_norm": 2.007643266900994, + "language_loss": 0.81457722, + "learning_rate": 3.957862730421599e-06, + "loss": 0.83700085, + "num_input_tokens_seen": 33368060, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.22607422, + "step": 1555, + "time_per_iteration": 2.4612669944763184 + }, + { + "auxiliary_loss_clip": 0.01087666, + "auxiliary_loss_mlp": 0.01006482, + "balance_loss_clip": 1.03647661, + "balance_loss_mlp": 1.00383866, + "epoch": 0.09355178115136029, + "flos": 67502580635520.0, + "grad_norm": 0.8703563272876671, + "language_loss": 0.59610635, + "learning_rate": 3.957783169286024e-06, + "loss": 0.61704791, + "num_input_tokens_seen": 33430825, + "router_z_loss_clip": 0.51074219, + "router_z_loss_mlp": 0.02645874, + "step": 1556, + "time_per_iteration": 3.0920660495758057 + }, + { + "auxiliary_loss_clip": 0.01188806, + "auxiliary_loss_mlp": 0.01057221, + "balance_loss_clip": 1.0604434, + "balance_loss_mlp": 1.0357275, + "epoch": 0.09361190440402825, + "flos": 37341638720640.0, + "grad_norm": 2.0091449675578628, + "language_loss": 0.84217417, + "learning_rate": 3.9577035339111155e-06, + "loss": 0.86463439, + "num_input_tokens_seen": 33454855, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.21496582, + "step": 1557, + "time_per_iteration": 2.606248617172241 + }, + { + "auxiliary_loss_clip": 0.01185074, + "auxiliary_loss_mlp": 0.01059332, + "balance_loss_clip": 1.05736661, + "balance_loss_mlp": 1.03535903, + "epoch": 0.09367202765669623, + "flos": 24899705112960.0, + "grad_norm": 1.8256973428808607, + "language_loss": 0.77885759, + "learning_rate": 3.957623824299893e-06, + "loss": 0.80130172, + "num_input_tokens_seen": 33476000, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.23974609, + "step": 1558, + "time_per_iteration": 2.469417095184326 + }, + { + "auxiliary_loss_clip": 0.01195236, + "auxiliary_loss_mlp": 0.01048776, + "balance_loss_clip": 1.06218934, + "balance_loss_mlp": 1.02591157, + "epoch": 0.0937321509093642, + "flos": 15705568368000.0, + "grad_norm": 1.9589977103688512, + "language_loss": 0.8005482, + "learning_rate": 3.957544040455379e-06, + "loss": 0.82298833, + "num_input_tokens_seen": 33493845, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.2286377, + "step": 1559, + "time_per_iteration": 2.4158167839050293 + }, + { + "auxiliary_loss_clip": 0.01189689, + "auxiliary_loss_mlp": 0.01060169, + "balance_loss_clip": 1.0614512, + "balance_loss_mlp": 1.03803158, + "epoch": 0.09379227416203216, + "flos": 20483698222080.0, + "grad_norm": 2.69980511948815, + "language_loss": 0.76156294, + "learning_rate": 3.957464182380599e-06, + "loss": 0.78406149, + "num_input_tokens_seen": 33510850, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.22167969, + "step": 1560, + "time_per_iteration": 2.420292615890503 + }, + { + "auxiliary_loss_clip": 0.01193134, + "auxiliary_loss_mlp": 0.01055166, + "balance_loss_clip": 1.05992413, + "balance_loss_mlp": 1.03275466, + "epoch": 0.09385239741470014, + "flos": 24352498344960.0, + "grad_norm": 1.647955705443526, + "language_loss": 0.80804366, + "learning_rate": 3.95738425007858e-06, + "loss": 0.83052665, + "num_input_tokens_seen": 33530430, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.22412109, + "step": 1561, + "time_per_iteration": 2.4987735748291016 + }, + { + "auxiliary_loss_clip": 0.01191084, + "auxiliary_loss_mlp": 0.01048815, + "balance_loss_clip": 1.05871403, + "balance_loss_mlp": 1.02720284, + "epoch": 0.0939125206673681, + "flos": 33291489807360.0, + "grad_norm": 2.2970761697480975, + "language_loss": 0.61947155, + "learning_rate": 3.957304243552354e-06, + "loss": 0.64187056, + "num_input_tokens_seen": 33551975, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.21618652, + "step": 1562, + "time_per_iteration": 2.541534662246704 + }, + { + "auxiliary_loss_clip": 0.0118554, + "auxiliary_loss_mlp": 0.01056603, + "balance_loss_clip": 1.05971467, + "balance_loss_mlp": 1.03593254, + "epoch": 0.09397264392003607, + "flos": 19244923925760.0, + "grad_norm": 2.230426107384425, + "language_loss": 0.85279208, + "learning_rate": 3.957224162804956e-06, + "loss": 0.8752135, + "num_input_tokens_seen": 33569850, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.20666504, + "step": 1563, + "time_per_iteration": 2.424421548843384 + }, + { + "auxiliary_loss_clip": 0.01180906, + "auxiliary_loss_mlp": 0.01049754, + "balance_loss_clip": 1.05605793, + "balance_loss_mlp": 1.0291307, + "epoch": 0.09403276717270405, + "flos": 19317930318720.0, + "grad_norm": 1.9363777006906928, + "language_loss": 0.76471841, + "learning_rate": 3.9571440078394205e-06, + "loss": 0.78702497, + "num_input_tokens_seen": 33590510, + "router_z_loss_clip": 1.24804688, + "router_z_loss_mlp": 0.20617676, + "step": 1564, + "time_per_iteration": 2.515565872192383 + }, + { + "auxiliary_loss_clip": 0.01188107, + "auxiliary_loss_mlp": 0.01051657, + "balance_loss_clip": 1.06068516, + "balance_loss_mlp": 1.03058052, + "epoch": 0.09409289042537201, + "flos": 23583471137280.0, + "grad_norm": 1.8942151498038822, + "language_loss": 0.80109024, + "learning_rate": 3.9570637786587895e-06, + "loss": 0.82348788, + "num_input_tokens_seen": 33608810, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.21081543, + "step": 1565, + "time_per_iteration": 2.503408670425415 + }, + { + "auxiliary_loss_clip": 0.01184265, + "auxiliary_loss_mlp": 0.01058988, + "balance_loss_clip": 1.05566347, + "balance_loss_mlp": 1.03761423, + "epoch": 0.09415301367803998, + "flos": 20078446003200.0, + "grad_norm": 1.8959805405957577, + "language_loss": 0.75557101, + "learning_rate": 3.956983475266103e-06, + "loss": 0.77800351, + "num_input_tokens_seen": 33627265, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.21386719, + "step": 1566, + "time_per_iteration": 2.5001778602600098 + }, + { + "auxiliary_loss_clip": 0.01184973, + "auxiliary_loss_mlp": 0.01062158, + "balance_loss_clip": 1.05454171, + "balance_loss_mlp": 1.03991342, + "epoch": 0.09421313693070796, + "flos": 21062075016960.0, + "grad_norm": 2.2016272980647393, + "language_loss": 0.78004026, + "learning_rate": 3.956903097664407e-06, + "loss": 0.80251157, + "num_input_tokens_seen": 33644810, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.22253418, + "step": 1567, + "time_per_iteration": 2.4578940868377686 + }, + { + "auxiliary_loss_clip": 0.01193707, + "auxiliary_loss_mlp": 0.01050194, + "balance_loss_clip": 1.06396103, + "balance_loss_mlp": 1.03020287, + "epoch": 0.09427326018337592, + "flos": 24316156759680.0, + "grad_norm": 1.6607819207201657, + "language_loss": 0.82453376, + "learning_rate": 3.956822645856749e-06, + "loss": 0.84697276, + "num_input_tokens_seen": 33665665, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.19995117, + "step": 1568, + "time_per_iteration": 2.515643835067749 + }, + { + "auxiliary_loss_clip": 0.01193132, + "auxiliary_loss_mlp": 0.01050326, + "balance_loss_clip": 1.06055439, + "balance_loss_mlp": 1.02767634, + "epoch": 0.09433338343604389, + "flos": 20263888944000.0, + "grad_norm": 1.7667203518963703, + "language_loss": 0.76613486, + "learning_rate": 3.9567421198461814e-06, + "loss": 0.78856945, + "num_input_tokens_seen": 33684760, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.22668457, + "step": 1569, + "time_per_iteration": 2.454951763153076 + }, + { + "auxiliary_loss_clip": 0.01180048, + "auxiliary_loss_mlp": 0.01046574, + "balance_loss_clip": 1.05606389, + "balance_loss_mlp": 1.02474666, + "epoch": 0.09439350668871185, + "flos": 12742973493120.0, + "grad_norm": 2.127325354079853, + "language_loss": 0.85567796, + "learning_rate": 3.956661519635756e-06, + "loss": 0.87794411, + "num_input_tokens_seen": 33700750, + "router_z_loss_clip": 1.23828125, + "router_z_loss_mlp": 0.21813965, + "step": 1570, + "time_per_iteration": 2.4222898483276367 + }, + { + "auxiliary_loss_clip": 0.01183038, + "auxiliary_loss_mlp": 0.01046689, + "balance_loss_clip": 1.05725932, + "balance_loss_mlp": 1.02515936, + "epoch": 0.09445362994137983, + "flos": 25962266263680.0, + "grad_norm": 1.7172473155982617, + "language_loss": 0.76587069, + "learning_rate": 3.95658084522853e-06, + "loss": 0.78816795, + "num_input_tokens_seen": 33724430, + "router_z_loss_clip": 1.25878906, + "router_z_loss_mlp": 0.21533203, + "step": 1571, + "time_per_iteration": 2.509368658065796 + }, + { + "auxiliary_loss_clip": 0.01181104, + "auxiliary_loss_mlp": 0.01050446, + "balance_loss_clip": 1.0591265, + "balance_loss_mlp": 1.02999008, + "epoch": 0.0945137531940478, + "flos": 19715353372800.0, + "grad_norm": 1.9052714648999338, + "language_loss": 0.7927947, + "learning_rate": 3.956500096627561e-06, + "loss": 0.81511021, + "num_input_tokens_seen": 33743455, + "router_z_loss_clip": 1.21972656, + "router_z_loss_mlp": 0.20471191, + "step": 1572, + "time_per_iteration": 2.4632761478424072 + }, + { + "auxiliary_loss_clip": 0.01180872, + "auxiliary_loss_mlp": 0.0104671, + "balance_loss_clip": 1.05766058, + "balance_loss_mlp": 1.02534747, + "epoch": 0.09457387644671576, + "flos": 23617047375360.0, + "grad_norm": 1.8911512929147518, + "language_loss": 0.8758719, + "learning_rate": 3.956419273835913e-06, + "loss": 0.89814776, + "num_input_tokens_seen": 33763435, + "router_z_loss_clip": 1.23339844, + "router_z_loss_mlp": 0.21386719, + "step": 1573, + "time_per_iteration": 2.4771111011505127 + }, + { + "auxiliary_loss_clip": 0.01186555, + "auxiliary_loss_mlp": 0.01055426, + "balance_loss_clip": 1.05847716, + "balance_loss_mlp": 1.03158426, + "epoch": 0.09463399969938374, + "flos": 26907291135360.0, + "grad_norm": 4.681056561940266, + "language_loss": 0.81468844, + "learning_rate": 3.95633837685665e-06, + "loss": 0.83710825, + "num_input_tokens_seen": 33784325, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.23852539, + "step": 1574, + "time_per_iteration": 2.5121943950653076 + }, + { + "auxiliary_loss_clip": 0.01188474, + "auxiliary_loss_mlp": 0.01045367, + "balance_loss_clip": 1.06257463, + "balance_loss_mlp": 1.02469647, + "epoch": 0.0946941229520517, + "flos": 23659566099840.0, + "grad_norm": 4.931756860985847, + "language_loss": 0.81062812, + "learning_rate": 3.95625740569284e-06, + "loss": 0.83296657, + "num_input_tokens_seen": 33802510, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.20654297, + "step": 1575, + "time_per_iteration": 2.4705052375793457 + }, + { + "auxiliary_loss_clip": 0.01190101, + "auxiliary_loss_mlp": 0.01056807, + "balance_loss_clip": 1.0652386, + "balance_loss_mlp": 1.03517067, + "epoch": 0.09475424620471967, + "flos": 24134053783680.0, + "grad_norm": 2.180332201955938, + "language_loss": 0.86829185, + "learning_rate": 3.956176360347553e-06, + "loss": 0.8907609, + "num_input_tokens_seen": 33819980, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.21630859, + "step": 1576, + "time_per_iteration": 2.493532419204712 + }, + { + "auxiliary_loss_clip": 0.01091802, + "auxiliary_loss_mlp": 0.01010799, + "balance_loss_clip": 1.0422945, + "balance_loss_mlp": 1.00789356, + "epoch": 0.09481436945738765, + "flos": 68426168065920.0, + "grad_norm": 0.9812389131915629, + "language_loss": 0.65833771, + "learning_rate": 3.956095240823862e-06, + "loss": 0.67936373, + "num_input_tokens_seen": 33878925, + "router_z_loss_clip": 0.49414062, + "router_z_loss_mlp": 0.02905273, + "step": 1577, + "time_per_iteration": 3.0482850074768066 + }, + { + "auxiliary_loss_clip": 0.01180645, + "auxiliary_loss_mlp": 0.01040976, + "balance_loss_clip": 1.05465877, + "balance_loss_mlp": 1.02084172, + "epoch": 0.09487449271005562, + "flos": 16654076858880.0, + "grad_norm": 1.8653896528477076, + "language_loss": 0.79416341, + "learning_rate": 3.956014047124844e-06, + "loss": 0.81637961, + "num_input_tokens_seen": 33897600, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.20117188, + "step": 1578, + "time_per_iteration": 2.4888784885406494 + }, + { + "auxiliary_loss_clip": 0.01177877, + "auxiliary_loss_mlp": 0.01057904, + "balance_loss_clip": 1.05254221, + "balance_loss_mlp": 1.03667283, + "epoch": 0.09493461596272358, + "flos": 24275685110400.0, + "grad_norm": 1.9655429530925026, + "language_loss": 0.77662385, + "learning_rate": 3.955932779253578e-06, + "loss": 0.79898167, + "num_input_tokens_seen": 33917365, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.21228027, + "step": 1579, + "time_per_iteration": 2.488325834274292 + }, + { + "auxiliary_loss_clip": 0.01187384, + "auxiliary_loss_mlp": 0.010567, + "balance_loss_clip": 1.05916905, + "balance_loss_mlp": 1.03449082, + "epoch": 0.09499473921539155, + "flos": 21870173243520.0, + "grad_norm": 2.1860725976370654, + "language_loss": 0.73742521, + "learning_rate": 3.955851437213144e-06, + "loss": 0.759866, + "num_input_tokens_seen": 33936680, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.22216797, + "step": 1580, + "time_per_iteration": 2.4873297214508057 + }, + { + "auxiliary_loss_clip": 0.01178711, + "auxiliary_loss_mlp": 0.01048402, + "balance_loss_clip": 1.05589473, + "balance_loss_mlp": 1.02755201, + "epoch": 0.09505486246805953, + "flos": 33547137880320.0, + "grad_norm": 2.062038799624783, + "language_loss": 0.77669173, + "learning_rate": 3.955770021006627e-06, + "loss": 0.79896283, + "num_input_tokens_seen": 33960685, + "router_z_loss_clip": 1.23046875, + "router_z_loss_mlp": 0.20849609, + "step": 1581, + "time_per_iteration": 2.5432164669036865 + }, + { + "auxiliary_loss_clip": 0.01194616, + "auxiliary_loss_mlp": 0.01065922, + "balance_loss_clip": 1.06690991, + "balance_loss_mlp": 1.04488122, + "epoch": 0.09511498572072749, + "flos": 21215342350080.0, + "grad_norm": 1.9334741398650224, + "language_loss": 0.86831015, + "learning_rate": 3.955688530637116e-06, + "loss": 0.89091551, + "num_input_tokens_seen": 33980015, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.21032715, + "step": 1582, + "time_per_iteration": 2.4719746112823486 + }, + { + "auxiliary_loss_clip": 0.01192213, + "auxiliary_loss_mlp": 0.01048108, + "balance_loss_clip": 1.06432915, + "balance_loss_mlp": 1.02544665, + "epoch": 0.09517510897339546, + "flos": 14611262572800.0, + "grad_norm": 1.9198635426322213, + "language_loss": 0.67360592, + "learning_rate": 3.955606966107699e-06, + "loss": 0.6960091, + "num_input_tokens_seen": 33997705, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.22668457, + "step": 1583, + "time_per_iteration": 2.427664041519165 + }, + { + "auxiliary_loss_clip": 0.01191597, + "auxiliary_loss_mlp": 0.01046288, + "balance_loss_clip": 1.06227326, + "balance_loss_mlp": 1.02305436, + "epoch": 0.09523523222606343, + "flos": 27817339138560.0, + "grad_norm": 1.9605890174783602, + "language_loss": 0.70491922, + "learning_rate": 3.95552532742147e-06, + "loss": 0.72729802, + "num_input_tokens_seen": 34017465, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.23242188, + "step": 1584, + "time_per_iteration": 2.5127434730529785 + }, + { + "auxiliary_loss_clip": 0.01188641, + "auxiliary_loss_mlp": 0.01053707, + "balance_loss_clip": 1.06191564, + "balance_loss_mlp": 1.0335964, + "epoch": 0.0952953554787314, + "flos": 20706272847360.0, + "grad_norm": 1.7767413253163697, + "language_loss": 0.80631769, + "learning_rate": 3.955443614581525e-06, + "loss": 0.82874119, + "num_input_tokens_seen": 34038550, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.20129395, + "step": 1585, + "time_per_iteration": 2.4683308601379395 + }, + { + "auxiliary_loss_clip": 0.01192784, + "auxiliary_loss_mlp": 0.0105461, + "balance_loss_clip": 1.06184506, + "balance_loss_mlp": 1.03080344, + "epoch": 0.09535547873139937, + "flos": 24787627701120.0, + "grad_norm": 2.0226381382439484, + "language_loss": 0.72052467, + "learning_rate": 3.955361827590961e-06, + "loss": 0.7429986, + "num_input_tokens_seen": 34058665, + "router_z_loss_clip": 1.31152344, + "router_z_loss_mlp": 0.23791504, + "step": 1586, + "time_per_iteration": 3.891348123550415 + }, + { + "auxiliary_loss_clip": 0.01085836, + "auxiliary_loss_mlp": 0.01011873, + "balance_loss_clip": 1.03482676, + "balance_loss_mlp": 1.00894964, + "epoch": 0.09541560198406734, + "flos": 71912194905600.0, + "grad_norm": 0.8150379010822921, + "language_loss": 0.55393505, + "learning_rate": 3.955279966452883e-06, + "loss": 0.57491219, + "num_input_tokens_seen": 34109655, + "router_z_loss_clip": 0.50878906, + "router_z_loss_mlp": 0.0291748, + "step": 1587, + "time_per_iteration": 4.227677822113037 + }, + { + "auxiliary_loss_clip": 0.01189227, + "auxiliary_loss_mlp": 0.0105323, + "balance_loss_clip": 1.06103778, + "balance_loss_mlp": 1.03154588, + "epoch": 0.09547572523673531, + "flos": 28982604251520.0, + "grad_norm": 2.7091980972544527, + "language_loss": 0.81033111, + "learning_rate": 3.955198031170391e-06, + "loss": 0.83275568, + "num_input_tokens_seen": 34131115, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.21679688, + "step": 1588, + "time_per_iteration": 2.57045316696167 + }, + { + "auxiliary_loss_clip": 0.01191446, + "auxiliary_loss_mlp": 0.01060979, + "balance_loss_clip": 1.06407332, + "balance_loss_mlp": 1.04018879, + "epoch": 0.09553584848940327, + "flos": 24133910129280.0, + "grad_norm": 1.5797310462083736, + "language_loss": 0.81576109, + "learning_rate": 3.955116021746594e-06, + "loss": 0.83828533, + "num_input_tokens_seen": 34151925, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.20800781, + "step": 1589, + "time_per_iteration": 2.4928579330444336 + }, + { + "auxiliary_loss_clip": 0.01181383, + "auxiliary_loss_mlp": 0.01056129, + "balance_loss_clip": 1.05639994, + "balance_loss_mlp": 1.03288352, + "epoch": 0.09559597174207124, + "flos": 42851376789120.0, + "grad_norm": 2.1468632397476157, + "language_loss": 0.64607364, + "learning_rate": 3.955033938184601e-06, + "loss": 0.66844881, + "num_input_tokens_seen": 34175395, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.23242188, + "step": 1590, + "time_per_iteration": 2.68107271194458 + }, + { + "auxiliary_loss_clip": 0.0118642, + "auxiliary_loss_mlp": 0.01056132, + "balance_loss_clip": 1.06152272, + "balance_loss_mlp": 1.03375673, + "epoch": 0.09565609499473922, + "flos": 32670845683200.0, + "grad_norm": 1.7124511981180757, + "language_loss": 0.83299184, + "learning_rate": 3.954951780487526e-06, + "loss": 0.85541737, + "num_input_tokens_seen": 34197760, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.22387695, + "step": 1591, + "time_per_iteration": 4.027567625045776 + }, + { + "auxiliary_loss_clip": 0.01187481, + "auxiliary_loss_mlp": 0.01056223, + "balance_loss_clip": 1.05838096, + "balance_loss_mlp": 1.03444314, + "epoch": 0.09571621824740718, + "flos": 18478410670080.0, + "grad_norm": 3.0598786894636945, + "language_loss": 0.74192512, + "learning_rate": 3.9548695486584835e-06, + "loss": 0.76436222, + "num_input_tokens_seen": 34215330, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.21777344, + "step": 1592, + "time_per_iteration": 3.798664093017578 + }, + { + "auxiliary_loss_clip": 0.01184901, + "auxiliary_loss_mlp": 0.01053508, + "balance_loss_clip": 1.05902779, + "balance_loss_mlp": 1.03185964, + "epoch": 0.09577634150007515, + "flos": 29387497334400.0, + "grad_norm": 2.056031795183822, + "language_loss": 0.74273622, + "learning_rate": 3.954787242700592e-06, + "loss": 0.76512039, + "num_input_tokens_seen": 34237745, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.21655273, + "step": 1593, + "time_per_iteration": 2.5284006595611572 + }, + { + "auxiliary_loss_clip": 0.01180186, + "auxiliary_loss_mlp": 0.01049207, + "balance_loss_clip": 1.05732286, + "balance_loss_mlp": 1.0279398, + "epoch": 0.09583646475274313, + "flos": 22747830157440.0, + "grad_norm": 1.9285930084682492, + "language_loss": 0.7016294, + "learning_rate": 3.954704862616971e-06, + "loss": 0.72392339, + "num_input_tokens_seen": 34256565, + "router_z_loss_clip": 1.22949219, + "router_z_loss_mlp": 0.21252441, + "step": 1594, + "time_per_iteration": 2.503384590148926 + }, + { + "auxiliary_loss_clip": 0.01186707, + "auxiliary_loss_mlp": 0.0105085, + "balance_loss_clip": 1.05713511, + "balance_loss_mlp": 1.02932072, + "epoch": 0.0958965880054111, + "flos": 23218367345280.0, + "grad_norm": 4.292802506552535, + "language_loss": 0.82587337, + "learning_rate": 3.954622408410747e-06, + "loss": 0.84824896, + "num_input_tokens_seen": 34275970, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.21533203, + "step": 1595, + "time_per_iteration": 2.477755069732666 + }, + { + "auxiliary_loss_clip": 0.01184197, + "auxiliary_loss_mlp": 0.01050258, + "balance_loss_clip": 1.05786109, + "balance_loss_mlp": 1.02789426, + "epoch": 0.09595671125807906, + "flos": 21324438933120.0, + "grad_norm": 2.136617746669241, + "language_loss": 0.84786588, + "learning_rate": 3.954539880085045e-06, + "loss": 0.87021041, + "num_input_tokens_seen": 34295490, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.22351074, + "step": 1596, + "time_per_iteration": 2.4599790573120117 + }, + { + "auxiliary_loss_clip": 0.0119345, + "auxiliary_loss_mlp": 0.01045702, + "balance_loss_clip": 1.06400585, + "balance_loss_mlp": 1.02296865, + "epoch": 0.09601683451074704, + "flos": 39603472185600.0, + "grad_norm": 1.9818996707120093, + "language_loss": 0.69016469, + "learning_rate": 3.9544572776429945e-06, + "loss": 0.71255624, + "num_input_tokens_seen": 34319990, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.22741699, + "step": 1597, + "time_per_iteration": 2.6326210498809814 + }, + { + "auxiliary_loss_clip": 0.01182019, + "auxiliary_loss_mlp": 0.01039049, + "balance_loss_clip": 1.05498409, + "balance_loss_mlp": 1.01776981, + "epoch": 0.096076957763415, + "flos": 23732716147200.0, + "grad_norm": 2.1500902744652586, + "language_loss": 0.75042742, + "learning_rate": 3.954374601087729e-06, + "loss": 0.77263808, + "num_input_tokens_seen": 34339225, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.21276855, + "step": 1598, + "time_per_iteration": 2.4895026683807373 + }, + { + "auxiliary_loss_clip": 0.01188969, + "auxiliary_loss_mlp": 0.01047708, + "balance_loss_clip": 1.06085134, + "balance_loss_mlp": 1.02541566, + "epoch": 0.09613708101608297, + "flos": 34678108483200.0, + "grad_norm": 2.158340041956811, + "language_loss": 0.68585014, + "learning_rate": 3.954291850422382e-06, + "loss": 0.70821679, + "num_input_tokens_seen": 34361020, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.22277832, + "step": 1599, + "time_per_iteration": 2.5896403789520264 + }, + { + "auxiliary_loss_clip": 0.01185433, + "auxiliary_loss_mlp": 0.01051071, + "balance_loss_clip": 1.05868912, + "balance_loss_mlp": 1.02994764, + "epoch": 0.09619720426875093, + "flos": 20740028653440.0, + "grad_norm": 2.2807575491853878, + "language_loss": 0.84371436, + "learning_rate": 3.954209025650093e-06, + "loss": 0.86607933, + "num_input_tokens_seen": 34378630, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.21130371, + "step": 1600, + "time_per_iteration": 2.5549068450927734 + }, + { + "auxiliary_loss_clip": 0.0118705, + "auxiliary_loss_mlp": 0.0105169, + "balance_loss_clip": 1.06082547, + "balance_loss_mlp": 1.0306375, + "epoch": 0.09625732752141891, + "flos": 13042720488960.0, + "grad_norm": 2.015719860578417, + "language_loss": 0.80182254, + "learning_rate": 3.954126126774001e-06, + "loss": 0.82420993, + "num_input_tokens_seen": 34397110, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.21044922, + "step": 1601, + "time_per_iteration": 2.4393882751464844 + }, + { + "auxiliary_loss_clip": 0.01192218, + "auxiliary_loss_mlp": 0.01052466, + "balance_loss_clip": 1.06341922, + "balance_loss_mlp": 1.03031683, + "epoch": 0.09631745077408688, + "flos": 22273629782400.0, + "grad_norm": 5.263250673562286, + "language_loss": 0.82650673, + "learning_rate": 3.954043153797251e-06, + "loss": 0.8489536, + "num_input_tokens_seen": 34414165, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.22155762, + "step": 1602, + "time_per_iteration": 2.4531376361846924 + }, + { + "auxiliary_loss_clip": 0.01192547, + "auxiliary_loss_mlp": 0.01054879, + "balance_loss_clip": 1.0669384, + "balance_loss_mlp": 1.031991, + "epoch": 0.09637757402675484, + "flos": 24754266944640.0, + "grad_norm": 2.0335919004143426, + "language_loss": 0.62964129, + "learning_rate": 3.953960106722989e-06, + "loss": 0.65211558, + "num_input_tokens_seen": 34434445, + "router_z_loss_clip": 1.25683594, + "router_z_loss_mlp": 0.22875977, + "step": 1603, + "time_per_iteration": 2.4920380115509033 + }, + { + "auxiliary_loss_clip": 0.01206134, + "auxiliary_loss_mlp": 0.01052792, + "balance_loss_clip": 1.07573128, + "balance_loss_mlp": 1.02986848, + "epoch": 0.09643769727942282, + "flos": 22525758322560.0, + "grad_norm": 2.366209274533676, + "language_loss": 0.71377122, + "learning_rate": 3.953876985554364e-06, + "loss": 0.73636055, + "num_input_tokens_seen": 34453095, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.22924805, + "step": 1604, + "time_per_iteration": 2.509183883666992 + }, + { + "auxiliary_loss_clip": 0.01181837, + "auxiliary_loss_mlp": 0.01048895, + "balance_loss_clip": 1.05868006, + "balance_loss_mlp": 1.02858126, + "epoch": 0.09649782053209079, + "flos": 30921026636160.0, + "grad_norm": 1.901409953135493, + "language_loss": 0.79713249, + "learning_rate": 3.953793790294527e-06, + "loss": 0.81943977, + "num_input_tokens_seen": 34473680, + "router_z_loss_clip": 1.23144531, + "router_z_loss_mlp": 0.203125, + "step": 1605, + "time_per_iteration": 2.5421414375305176 + }, + { + "auxiliary_loss_clip": 0.01186684, + "auxiliary_loss_mlp": 0.01042888, + "balance_loss_clip": 1.05907452, + "balance_loss_mlp": 1.0214659, + "epoch": 0.09655794378475875, + "flos": 25337635729920.0, + "grad_norm": 1.8688399733931205, + "language_loss": 0.74546093, + "learning_rate": 3.953710520946634e-06, + "loss": 0.76775664, + "num_input_tokens_seen": 34492610, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.21435547, + "step": 1606, + "time_per_iteration": 2.4848830699920654 + }, + { + "auxiliary_loss_clip": 0.01188803, + "auxiliary_loss_mlp": 0.01050101, + "balance_loss_clip": 1.05994868, + "balance_loss_mlp": 1.02875102, + "epoch": 0.09661806703742673, + "flos": 22346061557760.0, + "grad_norm": 1.7587988495429934, + "language_loss": 0.75605476, + "learning_rate": 3.953627177513843e-06, + "loss": 0.77844387, + "num_input_tokens_seen": 34511855, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.21337891, + "step": 1607, + "time_per_iteration": 2.479853391647339 + }, + { + "auxiliary_loss_clip": 0.01187679, + "auxiliary_loss_mlp": 0.01045096, + "balance_loss_clip": 1.06073713, + "balance_loss_mlp": 1.02379346, + "epoch": 0.0966781902900947, + "flos": 17457578144640.0, + "grad_norm": 1.7827312967281073, + "language_loss": 0.86917591, + "learning_rate": 3.953543759999312e-06, + "loss": 0.89150363, + "num_input_tokens_seen": 34528905, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.21313477, + "step": 1608, + "time_per_iteration": 2.4643492698669434 + }, + { + "auxiliary_loss_clip": 0.01193292, + "auxiliary_loss_mlp": 0.01053038, + "balance_loss_clip": 1.06119013, + "balance_loss_mlp": 1.03143764, + "epoch": 0.09673831354276266, + "flos": 36903995412480.0, + "grad_norm": 2.067933006584396, + "language_loss": 0.70883179, + "learning_rate": 3.953460268406207e-06, + "loss": 0.73129511, + "num_input_tokens_seen": 34548480, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.21606445, + "step": 1609, + "time_per_iteration": 2.6154305934906006 + }, + { + "auxiliary_loss_clip": 0.01184548, + "auxiliary_loss_mlp": 0.01054663, + "balance_loss_clip": 1.05847645, + "balance_loss_mlp": 1.03374135, + "epoch": 0.09679843679543064, + "flos": 20701388597760.0, + "grad_norm": 2.695828790437272, + "language_loss": 0.8443718, + "learning_rate": 3.953376702737693e-06, + "loss": 0.86676389, + "num_input_tokens_seen": 34565410, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.20922852, + "step": 1610, + "time_per_iteration": 2.4600415229797363 + }, + { + "auxiliary_loss_clip": 0.01192646, + "auxiliary_loss_mlp": 0.010632, + "balance_loss_clip": 1.06634879, + "balance_loss_mlp": 1.04069352, + "epoch": 0.0968585600480986, + "flos": 23514415240320.0, + "grad_norm": 2.25758310997248, + "language_loss": 0.67008424, + "learning_rate": 3.953293062996939e-06, + "loss": 0.69264269, + "num_input_tokens_seen": 34584840, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.22485352, + "step": 1611, + "time_per_iteration": 2.453350067138672 + }, + { + "auxiliary_loss_clip": 0.01184496, + "auxiliary_loss_mlp": 0.01049711, + "balance_loss_clip": 1.06008458, + "balance_loss_mlp": 1.02944589, + "epoch": 0.09691868330076657, + "flos": 20121072468480.0, + "grad_norm": 2.0186729469036298, + "language_loss": 0.8107686, + "learning_rate": 3.953209349187115e-06, + "loss": 0.83311075, + "num_input_tokens_seen": 34603360, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.20263672, + "step": 1612, + "time_per_iteration": 2.4704878330230713 + }, + { + "auxiliary_loss_clip": 0.01187977, + "auxiliary_loss_mlp": 0.01061575, + "balance_loss_clip": 1.06049824, + "balance_loss_mlp": 1.03980708, + "epoch": 0.09697880655343454, + "flos": 16544692967040.0, + "grad_norm": 2.6837674750277, + "language_loss": 0.8099978, + "learning_rate": 3.953125561311398e-06, + "loss": 0.83249325, + "num_input_tokens_seen": 34620760, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.2175293, + "step": 1613, + "time_per_iteration": 2.5672767162323 + }, + { + "auxiliary_loss_clip": 0.0118456, + "auxiliary_loss_mlp": 0.01061985, + "balance_loss_clip": 1.05880547, + "balance_loss_mlp": 1.03749895, + "epoch": 0.09703892980610251, + "flos": 26104184899200.0, + "grad_norm": 1.8077604267506733, + "language_loss": 0.84253937, + "learning_rate": 3.953041699372964e-06, + "loss": 0.86500478, + "num_input_tokens_seen": 34640695, + "router_z_loss_clip": 1.25683594, + "router_z_loss_mlp": 0.24487305, + "step": 1614, + "time_per_iteration": 2.5787854194641113 + }, + { + "auxiliary_loss_clip": 0.01096667, + "auxiliary_loss_mlp": 0.01021487, + "balance_loss_clip": 1.04226446, + "balance_loss_mlp": 1.01710033, + "epoch": 0.09709905305877048, + "flos": 60443622000000.0, + "grad_norm": 0.7015284184765533, + "language_loss": 0.54644877, + "learning_rate": 3.952957763374992e-06, + "loss": 0.56763029, + "num_input_tokens_seen": 34702395, + "router_z_loss_clip": 0.54394531, + "router_z_loss_mlp": 0.04388428, + "step": 1615, + "time_per_iteration": 3.0860555171966553 + }, + { + "auxiliary_loss_clip": 0.01098384, + "auxiliary_loss_mlp": 0.01009609, + "balance_loss_clip": 1.04125893, + "balance_loss_mlp": 1.00605679, + "epoch": 0.09715917631143844, + "flos": 57639932893440.0, + "grad_norm": 0.7633480226339158, + "language_loss": 0.5825814, + "learning_rate": 3.952873753320666e-06, + "loss": 0.60366136, + "num_input_tokens_seen": 34768910, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 0.03552246, + "step": 1616, + "time_per_iteration": 3.2612924575805664 + }, + { + "auxiliary_loss_clip": 0.01213537, + "auxiliary_loss_mlp": 0.01056894, + "balance_loss_clip": 1.08207202, + "balance_loss_mlp": 1.03307617, + "epoch": 0.09721929956410642, + "flos": 20558212986240.0, + "grad_norm": 2.100189276076155, + "language_loss": 0.68954587, + "learning_rate": 3.952789669213172e-06, + "loss": 0.71225023, + "num_input_tokens_seen": 34787680, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.23828125, + "step": 1617, + "time_per_iteration": 2.4580252170562744 + }, + { + "auxiliary_loss_clip": 0.01189119, + "auxiliary_loss_mlp": 0.01054958, + "balance_loss_clip": 1.06011868, + "balance_loss_mlp": 1.0304482, + "epoch": 0.09727942281677439, + "flos": 27344359825920.0, + "grad_norm": 3.930285149058212, + "language_loss": 0.80710864, + "learning_rate": 3.952705511055698e-06, + "loss": 0.82954943, + "num_input_tokens_seen": 34808330, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.24499512, + "step": 1618, + "time_per_iteration": 2.53057861328125 + }, + { + "auxiliary_loss_clip": 0.01177368, + "auxiliary_loss_mlp": 0.01048417, + "balance_loss_clip": 1.05631983, + "balance_loss_mlp": 1.02839005, + "epoch": 0.09733954606944235, + "flos": 24900028335360.0, + "grad_norm": 1.6963361999483224, + "language_loss": 0.92752761, + "learning_rate": 3.952621278851435e-06, + "loss": 0.94978547, + "num_input_tokens_seen": 34830020, + "router_z_loss_clip": 1.2109375, + "router_z_loss_mlp": 0.20031738, + "step": 1619, + "time_per_iteration": 2.483599901199341 + }, + { + "auxiliary_loss_clip": 0.01180177, + "auxiliary_loss_mlp": 0.01050708, + "balance_loss_clip": 1.06059098, + "balance_loss_mlp": 1.02910733, + "epoch": 0.09739966932211033, + "flos": 31503928544640.0, + "grad_norm": 2.0743140563913727, + "language_loss": 0.88680279, + "learning_rate": 3.9525369726035784e-06, + "loss": 0.90911168, + "num_input_tokens_seen": 34850330, + "router_z_loss_clip": 1.1953125, + "router_z_loss_mlp": 0.21606445, + "step": 1620, + "time_per_iteration": 2.5651304721832275 + }, + { + "auxiliary_loss_clip": 0.01180448, + "auxiliary_loss_mlp": 0.01055543, + "balance_loss_clip": 1.05627251, + "balance_loss_mlp": 1.03232121, + "epoch": 0.0974597925747783, + "flos": 23878764846720.0, + "grad_norm": 4.539777392492211, + "language_loss": 0.77262193, + "learning_rate": 3.952452592315324e-06, + "loss": 0.7949819, + "num_input_tokens_seen": 34871640, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 0.23217773, + "step": 1621, + "time_per_iteration": 2.465806722640991 + }, + { + "auxiliary_loss_clip": 0.0118015, + "auxiliary_loss_mlp": 0.01062814, + "balance_loss_clip": 1.05303729, + "balance_loss_mlp": 1.04070044, + "epoch": 0.09751991582744626, + "flos": 17019575700480.0, + "grad_norm": 2.060583515303023, + "language_loss": 0.77687562, + "learning_rate": 3.952368137989871e-06, + "loss": 0.79930526, + "num_input_tokens_seen": 34888100, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.22106934, + "step": 1622, + "time_per_iteration": 2.519080877304077 + }, + { + "auxiliary_loss_clip": 0.01183704, + "auxiliary_loss_mlp": 0.01055097, + "balance_loss_clip": 1.05753493, + "balance_loss_mlp": 1.03241146, + "epoch": 0.09758003908011423, + "flos": 28402826826240.0, + "grad_norm": 2.0858801622503145, + "language_loss": 0.86200267, + "learning_rate": 3.9522836096304225e-06, + "loss": 0.88439065, + "num_input_tokens_seen": 34910485, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.22668457, + "step": 1623, + "time_per_iteration": 2.488126039505005 + }, + { + "auxiliary_loss_clip": 0.01181541, + "auxiliary_loss_mlp": 0.0106147, + "balance_loss_clip": 1.05848551, + "balance_loss_mlp": 1.03924894, + "epoch": 0.09764016233278221, + "flos": 18144297336960.0, + "grad_norm": 3.1577336174789328, + "language_loss": 0.80187291, + "learning_rate": 3.952199007240184e-06, + "loss": 0.82430303, + "num_input_tokens_seen": 34928615, + "router_z_loss_clip": 1.22949219, + "router_z_loss_mlp": 0.2220459, + "step": 1624, + "time_per_iteration": 2.4834344387054443 + }, + { + "auxiliary_loss_clip": 0.01175371, + "auxiliary_loss_mlp": 0.0104519, + "balance_loss_clip": 1.05116177, + "balance_loss_mlp": 1.02407837, + "epoch": 0.09770028558545017, + "flos": 15265842071040.0, + "grad_norm": 2.337572933833435, + "language_loss": 0.85604864, + "learning_rate": 3.952114330822364e-06, + "loss": 0.87825423, + "num_input_tokens_seen": 34946045, + "router_z_loss_clip": 1.24023438, + "router_z_loss_mlp": 0.21118164, + "step": 1625, + "time_per_iteration": 2.4013426303863525 + }, + { + "auxiliary_loss_clip": 0.01193905, + "auxiliary_loss_mlp": 0.01054807, + "balance_loss_clip": 1.0673008, + "balance_loss_mlp": 1.03362346, + "epoch": 0.09776040883811814, + "flos": 23472435219840.0, + "grad_norm": 2.227796067970631, + "language_loss": 0.85249913, + "learning_rate": 3.952029580380172e-06, + "loss": 0.87498629, + "num_input_tokens_seen": 34962865, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.21191406, + "step": 1626, + "time_per_iteration": 2.5074734687805176 + }, + { + "auxiliary_loss_clip": 0.0118689, + "auxiliary_loss_mlp": 0.01054896, + "balance_loss_clip": 1.06018209, + "balance_loss_mlp": 1.03213859, + "epoch": 0.09782053209078612, + "flos": 24499480798080.0, + "grad_norm": 7.852843289384192, + "language_loss": 0.83297598, + "learning_rate": 3.9519447559168234e-06, + "loss": 0.85539389, + "num_input_tokens_seen": 34983505, + "router_z_loss_clip": 1.26757812, + "router_z_loss_mlp": 0.22766113, + "step": 1627, + "time_per_iteration": 2.478468179702759 + }, + { + "auxiliary_loss_clip": 0.01175343, + "auxiliary_loss_mlp": 0.01058625, + "balance_loss_clip": 1.05347204, + "balance_loss_mlp": 1.03584433, + "epoch": 0.09788065534345408, + "flos": 21580158833280.0, + "grad_norm": 2.006011713076177, + "language_loss": 0.84394902, + "learning_rate": 3.951859857435534e-06, + "loss": 0.86628872, + "num_input_tokens_seen": 35001825, + "router_z_loss_clip": 1.21972656, + "router_z_loss_mlp": 0.22766113, + "step": 1628, + "time_per_iteration": 2.495720624923706 + }, + { + "auxiliary_loss_clip": 0.01177382, + "auxiliary_loss_mlp": 0.0105315, + "balance_loss_clip": 1.05501688, + "balance_loss_mlp": 1.03183556, + "epoch": 0.09794077859612205, + "flos": 23842459175040.0, + "grad_norm": 1.772044148250716, + "language_loss": 0.75667191, + "learning_rate": 3.951774884939523e-06, + "loss": 0.77897722, + "num_input_tokens_seen": 35023075, + "router_z_loss_clip": 1.22265625, + "router_z_loss_mlp": 0.2130127, + "step": 1629, + "time_per_iteration": 2.4803669452667236 + }, + { + "auxiliary_loss_clip": 0.01187167, + "auxiliary_loss_mlp": 0.01055016, + "balance_loss_clip": 1.06218171, + "balance_loss_mlp": 1.03218746, + "epoch": 0.09800090184879003, + "flos": 23659889322240.0, + "grad_norm": 1.9521925408221525, + "language_loss": 0.78387809, + "learning_rate": 3.951689838432013e-06, + "loss": 0.80629992, + "num_input_tokens_seen": 35043480, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.22827148, + "step": 1630, + "time_per_iteration": 3.891793727874756 + }, + { + "auxiliary_loss_clip": 0.01192649, + "auxiliary_loss_mlp": 0.01056085, + "balance_loss_clip": 1.06715536, + "balance_loss_mlp": 1.03304148, + "epoch": 0.09806102510145799, + "flos": 17055773631360.0, + "grad_norm": 2.0558056559054223, + "language_loss": 0.86705869, + "learning_rate": 3.951604717916228e-06, + "loss": 0.88954604, + "num_input_tokens_seen": 35061490, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.23059082, + "step": 1631, + "time_per_iteration": 3.879659652709961 + }, + { + "auxiliary_loss_clip": 0.01184774, + "auxiliary_loss_mlp": 0.01052886, + "balance_loss_clip": 1.06094933, + "balance_loss_mlp": 1.03216743, + "epoch": 0.09812114835412596, + "flos": 23878477537920.0, + "grad_norm": 1.894845211044561, + "language_loss": 0.83658224, + "learning_rate": 3.9515195233953975e-06, + "loss": 0.85895884, + "num_input_tokens_seen": 35079670, + "router_z_loss_clip": 1.23730469, + "router_z_loss_mlp": 0.20715332, + "step": 1632, + "time_per_iteration": 2.539984941482544 + }, + { + "auxiliary_loss_clip": 0.01196208, + "auxiliary_loss_mlp": 0.01051178, + "balance_loss_clip": 1.06901681, + "balance_loss_mlp": 1.03062582, + "epoch": 0.09818127160679392, + "flos": 20595488325120.0, + "grad_norm": 1.6568794018240476, + "language_loss": 0.78874397, + "learning_rate": 3.951434254872751e-06, + "loss": 0.81121784, + "num_input_tokens_seen": 35099205, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.20544434, + "step": 1633, + "time_per_iteration": 2.472787380218506 + }, + { + "auxiliary_loss_clip": 0.0118305, + "auxiliary_loss_mlp": 0.01053513, + "balance_loss_clip": 1.06055486, + "balance_loss_mlp": 1.0323292, + "epoch": 0.0982413948594619, + "flos": 15487339288320.0, + "grad_norm": 2.262237012809251, + "language_loss": 0.73221672, + "learning_rate": 3.951348912351521e-06, + "loss": 0.75458235, + "num_input_tokens_seen": 35115270, + "router_z_loss_clip": 1.22460938, + "router_z_loss_mlp": 0.21179199, + "step": 1634, + "time_per_iteration": 3.923522710800171 + }, + { + "auxiliary_loss_clip": 0.01184327, + "auxiliary_loss_mlp": 0.01059623, + "balance_loss_clip": 1.05612874, + "balance_loss_mlp": 1.03743792, + "epoch": 0.09830151811212987, + "flos": 24207958016640.0, + "grad_norm": 4.388904218913943, + "language_loss": 0.72779244, + "learning_rate": 3.951263495834947e-06, + "loss": 0.75023198, + "num_input_tokens_seen": 35134065, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.22180176, + "step": 1635, + "time_per_iteration": 2.555605173110962 + }, + { + "auxiliary_loss_clip": 0.01190183, + "auxiliary_loss_mlp": 0.01054462, + "balance_loss_clip": 1.06270063, + "balance_loss_mlp": 1.03230095, + "epoch": 0.09836164136479783, + "flos": 20594590485120.0, + "grad_norm": 2.4301812742030484, + "language_loss": 0.78224599, + "learning_rate": 3.951178005326264e-06, + "loss": 0.80469245, + "num_input_tokens_seen": 35154870, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.22180176, + "step": 1636, + "time_per_iteration": 3.8968186378479004 + }, + { + "auxiliary_loss_clip": 0.0118211, + "auxiliary_loss_mlp": 0.01053148, + "balance_loss_clip": 1.05927837, + "balance_loss_mlp": 1.0319165, + "epoch": 0.09842176461746581, + "flos": 19934157070080.0, + "grad_norm": 2.582274200964679, + "language_loss": 0.7000162, + "learning_rate": 3.951092440828715e-06, + "loss": 0.72236884, + "num_input_tokens_seen": 35171850, + "router_z_loss_clip": 1.22851562, + "router_z_loss_mlp": 0.21228027, + "step": 1637, + "time_per_iteration": 2.433790445327759 + }, + { + "auxiliary_loss_clip": 0.01181494, + "auxiliary_loss_mlp": 0.01055613, + "balance_loss_clip": 1.0560782, + "balance_loss_mlp": 1.03345203, + "epoch": 0.09848188787013377, + "flos": 21214659991680.0, + "grad_norm": 2.1305028043885335, + "language_loss": 0.7743609, + "learning_rate": 3.951006802345545e-06, + "loss": 0.79673195, + "num_input_tokens_seen": 35188795, + "router_z_loss_clip": 1.25390625, + "router_z_loss_mlp": 0.22180176, + "step": 1638, + "time_per_iteration": 2.4795734882354736 + }, + { + "auxiliary_loss_clip": 0.01186767, + "auxiliary_loss_mlp": 0.01052287, + "balance_loss_clip": 1.06313884, + "balance_loss_mlp": 1.03043556, + "epoch": 0.09854201112280174, + "flos": 30154226071680.0, + "grad_norm": 1.612135014058666, + "language_loss": 0.72766626, + "learning_rate": 3.950921089880003e-06, + "loss": 0.7500568, + "num_input_tokens_seen": 35212100, + "router_z_loss_clip": 1.23632812, + "router_z_loss_mlp": 0.21875, + "step": 1639, + "time_per_iteration": 2.5255091190338135 + }, + { + "auxiliary_loss_clip": 0.01178397, + "auxiliary_loss_mlp": 0.01044327, + "balance_loss_clip": 1.05406272, + "balance_loss_mlp": 1.02277398, + "epoch": 0.09860213437546972, + "flos": 21795730306560.0, + "grad_norm": 1.771991913682, + "language_loss": 0.88505936, + "learning_rate": 3.950835303435337e-06, + "loss": 0.90728652, + "num_input_tokens_seen": 35230390, + "router_z_loss_clip": 1.24316406, + "router_z_loss_mlp": 0.21557617, + "step": 1640, + "time_per_iteration": 2.5024497509002686 + }, + { + "auxiliary_loss_clip": 0.01182634, + "auxiliary_loss_mlp": 0.01046481, + "balance_loss_clip": 1.05791473, + "balance_loss_mlp": 1.0252378, + "epoch": 0.09866225762813768, + "flos": 21835555511040.0, + "grad_norm": 1.925021233714605, + "language_loss": 0.80330956, + "learning_rate": 3.950749443014801e-06, + "loss": 0.82560074, + "num_input_tokens_seen": 35250405, + "router_z_loss_clip": 1.24707031, + "router_z_loss_mlp": 0.21252441, + "step": 1641, + "time_per_iteration": 2.470980644226074 + }, + { + "auxiliary_loss_clip": 0.01177842, + "auxiliary_loss_mlp": 0.01057411, + "balance_loss_clip": 1.05578876, + "balance_loss_mlp": 1.03448713, + "epoch": 0.09872238088080565, + "flos": 17599855916160.0, + "grad_norm": 3.602588203134168, + "language_loss": 0.86105192, + "learning_rate": 3.95066350862165e-06, + "loss": 0.88340449, + "num_input_tokens_seen": 35262820, + "router_z_loss_clip": 1.22070312, + "router_z_loss_mlp": 0.22937012, + "step": 1642, + "time_per_iteration": 2.5167624950408936 + }, + { + "auxiliary_loss_clip": 0.01194904, + "auxiliary_loss_mlp": 0.01052417, + "balance_loss_clip": 1.06879151, + "balance_loss_mlp": 1.03150749, + "epoch": 0.09878250413347361, + "flos": 27636134002560.0, + "grad_norm": 1.6708706733747032, + "language_loss": 0.81042123, + "learning_rate": 3.950577500259144e-06, + "loss": 0.83289444, + "num_input_tokens_seen": 35284490, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.20898438, + "step": 1643, + "time_per_iteration": 2.507554769515991 + }, + { + "auxiliary_loss_clip": 0.01184327, + "auxiliary_loss_mlp": 0.01072521, + "balance_loss_clip": 1.05826831, + "balance_loss_mlp": 1.05051506, + "epoch": 0.0988426273861416, + "flos": 16544728880640.0, + "grad_norm": 2.049690496104042, + "language_loss": 0.82362473, + "learning_rate": 3.950491417930543e-06, + "loss": 0.84619331, + "num_input_tokens_seen": 35302815, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.22021484, + "step": 1644, + "time_per_iteration": 2.458993434906006 + }, + { + "auxiliary_loss_clip": 0.01177156, + "auxiliary_loss_mlp": 0.01062269, + "balance_loss_clip": 1.05760455, + "balance_loss_mlp": 1.03954744, + "epoch": 0.09890275063880956, + "flos": 21215270522880.0, + "grad_norm": 1.8683913756075163, + "language_loss": 0.68407005, + "learning_rate": 3.9504052616391124e-06, + "loss": 0.70646435, + "num_input_tokens_seen": 35321175, + "router_z_loss_clip": 1.19433594, + "router_z_loss_mlp": 0.22717285, + "step": 1645, + "time_per_iteration": 2.4483489990234375 + }, + { + "auxiliary_loss_clip": 0.01097018, + "auxiliary_loss_mlp": 0.01030456, + "balance_loss_clip": 1.04628015, + "balance_loss_mlp": 1.02731204, + "epoch": 0.09896287389147752, + "flos": 59379372910080.0, + "grad_norm": 1.1330371219677564, + "language_loss": 0.60875034, + "learning_rate": 3.950319031388119e-06, + "loss": 0.63002509, + "num_input_tokens_seen": 35381740, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.03146362, + "step": 1646, + "time_per_iteration": 3.064371109008789 + }, + { + "auxiliary_loss_clip": 0.01176148, + "auxiliary_loss_mlp": 0.01050522, + "balance_loss_clip": 1.05510938, + "balance_loss_mlp": 1.02824187, + "epoch": 0.0990229971441455, + "flos": 29642678530560.0, + "grad_norm": 2.4939265834949143, + "language_loss": 0.73183894, + "learning_rate": 3.950232727180833e-06, + "loss": 0.75410569, + "num_input_tokens_seen": 35403760, + "router_z_loss_clip": 1.20996094, + "router_z_loss_mlp": 0.22277832, + "step": 1647, + "time_per_iteration": 2.564204216003418 + }, + { + "auxiliary_loss_clip": 0.01182046, + "auxiliary_loss_mlp": 0.01063774, + "balance_loss_clip": 1.05924594, + "balance_loss_mlp": 1.04169679, + "epoch": 0.09908312039681347, + "flos": 21834873152640.0, + "grad_norm": 1.8537181899525352, + "language_loss": 0.8405171, + "learning_rate": 3.950146349020525e-06, + "loss": 0.8629753, + "num_input_tokens_seen": 35424050, + "router_z_loss_clip": 1.22753906, + "router_z_loss_mlp": 0.2208252, + "step": 1648, + "time_per_iteration": 2.5187439918518066 + }, + { + "auxiliary_loss_clip": 0.01085595, + "auxiliary_loss_mlp": 0.01013478, + "balance_loss_clip": 1.03471625, + "balance_loss_mlp": 1.00963902, + "epoch": 0.09914324364948143, + "flos": 57564304807680.0, + "grad_norm": 0.7293650139513113, + "language_loss": 0.55669284, + "learning_rate": 3.950059896910473e-06, + "loss": 0.57768357, + "num_input_tokens_seen": 35481690, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.03839111, + "step": 1649, + "time_per_iteration": 3.0135855674743652 + }, + { + "auxiliary_loss_clip": 0.01175595, + "auxiliary_loss_mlp": 0.01046326, + "balance_loss_clip": 1.05397117, + "balance_loss_mlp": 1.02416515, + "epoch": 0.09920336690214941, + "flos": 34123934476800.0, + "grad_norm": 2.314730915756547, + "language_loss": 0.9026956, + "learning_rate": 3.949973370853954e-06, + "loss": 0.92491472, + "num_input_tokens_seen": 35498635, + "router_z_loss_clip": 1.21484375, + "router_z_loss_mlp": 0.22167969, + "step": 1650, + "time_per_iteration": 2.5471696853637695 + }, + { + "auxiliary_loss_clip": 0.01092349, + "auxiliary_loss_mlp": 0.01008546, + "balance_loss_clip": 1.03987837, + "balance_loss_mlp": 1.00529766, + "epoch": 0.09926349015481738, + "flos": 71216428464000.0, + "grad_norm": 0.7950406664583697, + "language_loss": 0.6370616, + "learning_rate": 3.94988677085425e-06, + "loss": 0.65807056, + "num_input_tokens_seen": 35565720, + "router_z_loss_clip": 0.52441406, + "router_z_loss_mlp": 0.03253174, + "step": 1651, + "time_per_iteration": 3.2173213958740234 + }, + { + "auxiliary_loss_clip": 0.0118985, + "auxiliary_loss_mlp": 0.01057535, + "balance_loss_clip": 1.06689596, + "balance_loss_mlp": 1.03551698, + "epoch": 0.09932361340748534, + "flos": 23148700917120.0, + "grad_norm": 1.893818497588409, + "language_loss": 0.8859328, + "learning_rate": 3.949800096914643e-06, + "loss": 0.90840662, + "num_input_tokens_seen": 35586000, + "router_z_loss_clip": 1.22949219, + "router_z_loss_mlp": 0.22033691, + "step": 1652, + "time_per_iteration": 2.518644094467163 + }, + { + "auxiliary_loss_clip": 0.01188352, + "auxiliary_loss_mlp": 0.01055622, + "balance_loss_clip": 1.06333256, + "balance_loss_mlp": 1.03446293, + "epoch": 0.09938373666015332, + "flos": 19828651847040.0, + "grad_norm": 2.8092501964790237, + "language_loss": 0.81825751, + "learning_rate": 3.949713349038422e-06, + "loss": 0.84069723, + "num_input_tokens_seen": 35604355, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.21166992, + "step": 1653, + "time_per_iteration": 2.522204875946045 + }, + { + "auxiliary_loss_clip": 0.0118, + "auxiliary_loss_mlp": 0.0104886, + "balance_loss_clip": 1.05800724, + "balance_loss_mlp": 1.02770042, + "epoch": 0.09944385991282129, + "flos": 22090664880000.0, + "grad_norm": 2.1141596591392124, + "language_loss": 0.79310495, + "learning_rate": 3.949626527228875e-06, + "loss": 0.81539357, + "num_input_tokens_seen": 35625495, + "router_z_loss_clip": 1.21777344, + "router_z_loss_mlp": 0.21179199, + "step": 1654, + "time_per_iteration": 2.563636302947998 + }, + { + "auxiliary_loss_clip": 0.01178041, + "auxiliary_loss_mlp": 0.01063855, + "balance_loss_clip": 1.05836308, + "balance_loss_mlp": 1.04018044, + "epoch": 0.09950398316548925, + "flos": 19828867328640.0, + "grad_norm": 1.8667593993611018, + "language_loss": 0.80854428, + "learning_rate": 3.949539631489295e-06, + "loss": 0.83096325, + "num_input_tokens_seen": 35645030, + "router_z_loss_clip": 1.19726562, + "router_z_loss_mlp": 0.23657227, + "step": 1655, + "time_per_iteration": 2.490210771560669 + }, + { + "auxiliary_loss_clip": 0.01180518, + "auxiliary_loss_mlp": 0.01052506, + "balance_loss_clip": 1.05868673, + "balance_loss_mlp": 1.03165627, + "epoch": 0.09956410641815722, + "flos": 25003701964800.0, + "grad_norm": 1.8709276741517966, + "language_loss": 0.80537498, + "learning_rate": 3.9494526618229765e-06, + "loss": 0.82770526, + "num_input_tokens_seen": 35664305, + "router_z_loss_clip": 1.21679688, + "router_z_loss_mlp": 0.20825195, + "step": 1656, + "time_per_iteration": 2.5353169441223145 + }, + { + "auxiliary_loss_clip": 0.01184347, + "auxiliary_loss_mlp": 0.01054916, + "balance_loss_clip": 1.06132138, + "balance_loss_mlp": 1.03356552, + "epoch": 0.0996242296708252, + "flos": 19317714837120.0, + "grad_norm": 1.8678053309639397, + "language_loss": 0.88699996, + "learning_rate": 3.949365618233217e-06, + "loss": 0.9093926, + "num_input_tokens_seen": 35684060, + "router_z_loss_clip": 1.23144531, + "router_z_loss_mlp": 0.21362305, + "step": 1657, + "time_per_iteration": 2.468991279602051 + }, + { + "auxiliary_loss_clip": 0.01196273, + "auxiliary_loss_mlp": 0.01053869, + "balance_loss_clip": 1.06759191, + "balance_loss_mlp": 1.03180361, + "epoch": 0.09968435292349316, + "flos": 21871609787520.0, + "grad_norm": 2.051563948088427, + "language_loss": 0.84869421, + "learning_rate": 3.9492785007233195e-06, + "loss": 0.87119567, + "num_input_tokens_seen": 35703250, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.22058105, + "step": 1658, + "time_per_iteration": 2.507113218307495 + }, + { + "auxiliary_loss_clip": 0.0110484, + "auxiliary_loss_mlp": 0.01022263, + "balance_loss_clip": 1.05034351, + "balance_loss_mlp": 1.01897275, + "epoch": 0.09974447617616113, + "flos": 65384533313280.0, + "grad_norm": 0.9017778340838054, + "language_loss": 0.60780996, + "learning_rate": 3.949191309296585e-06, + "loss": 0.62908101, + "num_input_tokens_seen": 35762165, + "router_z_loss_clip": 0.54589844, + "router_z_loss_mlp": 0.03295898, + "step": 1659, + "time_per_iteration": 3.1051363945007324 + }, + { + "auxiliary_loss_clip": 0.01196382, + "auxiliary_loss_mlp": 0.01052988, + "balance_loss_clip": 1.07103086, + "balance_loss_mlp": 1.0310533, + "epoch": 0.0998045994288291, + "flos": 23659817495040.0, + "grad_norm": 1.8442367960923862, + "language_loss": 0.85150516, + "learning_rate": 3.949104043956321e-06, + "loss": 0.87399888, + "num_input_tokens_seen": 35781520, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.21936035, + "step": 1660, + "time_per_iteration": 2.515099287033081 + }, + { + "auxiliary_loss_clip": 0.0118454, + "auxiliary_loss_mlp": 0.01060536, + "balance_loss_clip": 1.06383443, + "balance_loss_mlp": 1.03837478, + "epoch": 0.09986472268149707, + "flos": 19609704495360.0, + "grad_norm": 2.0882896916828595, + "language_loss": 0.80210328, + "learning_rate": 3.949016704705836e-06, + "loss": 0.82455409, + "num_input_tokens_seen": 35799565, + "router_z_loss_clip": 1.20800781, + "router_z_loss_mlp": 0.22131348, + "step": 1661, + "time_per_iteration": 2.502528429031372 + }, + { + "auxiliary_loss_clip": 0.01184836, + "auxiliary_loss_mlp": 0.01048637, + "balance_loss_clip": 1.05722117, + "balance_loss_mlp": 1.02694106, + "epoch": 0.09992484593416504, + "flos": 26213317395840.0, + "grad_norm": 2.20374267071747, + "language_loss": 0.83369535, + "learning_rate": 3.948929291548443e-06, + "loss": 0.85602999, + "num_input_tokens_seen": 35821085, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.21679688, + "step": 1662, + "time_per_iteration": 2.525162935256958 + }, + { + "auxiliary_loss_clip": 0.01176683, + "auxiliary_loss_mlp": 0.01059639, + "balance_loss_clip": 1.05501044, + "balance_loss_mlp": 1.0374068, + "epoch": 0.09998496918683301, + "flos": 17493632421120.0, + "grad_norm": 2.211755757812447, + "language_loss": 0.89074743, + "learning_rate": 3.9488418044874546e-06, + "loss": 0.91311061, + "num_input_tokens_seen": 35839840, + "router_z_loss_clip": 1.21679688, + "router_z_loss_mlp": 0.22229004, + "step": 1663, + "time_per_iteration": 2.5491104125976562 + }, + { + "auxiliary_loss_clip": 0.01179214, + "auxiliary_loss_mlp": 0.01049436, + "balance_loss_clip": 1.05582011, + "balance_loss_mlp": 1.02733397, + "epoch": 0.10004509243950098, + "flos": 22784925928320.0, + "grad_norm": 1.7778192694717405, + "language_loss": 0.7041415, + "learning_rate": 3.948754243526191e-06, + "loss": 0.72642803, + "num_input_tokens_seen": 35861545, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.22094727, + "step": 1664, + "time_per_iteration": 2.6017093658447266 + }, + { + "auxiliary_loss_clip": 0.01178578, + "auxiliary_loss_mlp": 0.01047913, + "balance_loss_clip": 1.05736065, + "balance_loss_mlp": 1.02637148, + "epoch": 0.10010521569216894, + "flos": 16253385667200.0, + "grad_norm": 2.4269852993822556, + "language_loss": 0.78861123, + "learning_rate": 3.94866660866797e-06, + "loss": 0.81087613, + "num_input_tokens_seen": 35878295, + "router_z_loss_clip": 1.2109375, + "router_z_loss_mlp": 0.2154541, + "step": 1665, + "time_per_iteration": 2.5305464267730713 + }, + { + "auxiliary_loss_clip": 0.01182277, + "auxiliary_loss_mlp": 0.01059793, + "balance_loss_clip": 1.05868924, + "balance_loss_mlp": 1.03833556, + "epoch": 0.10016533894483691, + "flos": 23402589223680.0, + "grad_norm": 1.805817529460636, + "language_loss": 0.69924039, + "learning_rate": 3.9485788999161165e-06, + "loss": 0.72166109, + "num_input_tokens_seen": 35898990, + "router_z_loss_clip": 1.23632812, + "router_z_loss_mlp": 0.21472168, + "step": 1666, + "time_per_iteration": 2.486348867416382 + }, + { + "auxiliary_loss_clip": 0.011824, + "auxiliary_loss_mlp": 0.01053914, + "balance_loss_clip": 1.05713332, + "balance_loss_mlp": 1.03108501, + "epoch": 0.10022546219750489, + "flos": 19354164163200.0, + "grad_norm": 2.0166655079447073, + "language_loss": 0.7924785, + "learning_rate": 3.948491117273956e-06, + "loss": 0.81484163, + "num_input_tokens_seen": 35916225, + "router_z_loss_clip": 1.25292969, + "router_z_loss_mlp": 0.22802734, + "step": 1667, + "time_per_iteration": 2.452514886856079 + }, + { + "auxiliary_loss_clip": 0.01183136, + "auxiliary_loss_mlp": 0.01059104, + "balance_loss_clip": 1.05867088, + "balance_loss_mlp": 1.03558433, + "epoch": 0.10028558545017285, + "flos": 27085766837760.0, + "grad_norm": 2.3784987895424528, + "language_loss": 0.76931059, + "learning_rate": 3.948403260744817e-06, + "loss": 0.79173303, + "num_input_tokens_seen": 35934630, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.23498535, + "step": 1668, + "time_per_iteration": 2.476109027862549 + }, + { + "auxiliary_loss_clip": 0.01178709, + "auxiliary_loss_mlp": 0.01047978, + "balance_loss_clip": 1.05701268, + "balance_loss_mlp": 1.02579343, + "epoch": 0.10034570870284082, + "flos": 25847136195840.0, + "grad_norm": 1.8391499841219805, + "language_loss": 0.78004038, + "learning_rate": 3.948315330332031e-06, + "loss": 0.80230725, + "num_input_tokens_seen": 35953855, + "router_z_loss_clip": 1.21679688, + "router_z_loss_mlp": 0.22167969, + "step": 1669, + "time_per_iteration": 2.480712413787842 + }, + { + "auxiliary_loss_clip": 0.01187163, + "auxiliary_loss_mlp": 0.01057342, + "balance_loss_clip": 1.05988955, + "balance_loss_mlp": 1.03468001, + "epoch": 0.1004058319555088, + "flos": 26249587153920.0, + "grad_norm": 2.5409063393439038, + "language_loss": 0.85602075, + "learning_rate": 3.948227326038933e-06, + "loss": 0.87846577, + "num_input_tokens_seen": 35974555, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.22680664, + "step": 1670, + "time_per_iteration": 2.4907078742980957 + }, + { + "auxiliary_loss_clip": 0.01173251, + "auxiliary_loss_mlp": 0.01049806, + "balance_loss_clip": 1.05503404, + "balance_loss_mlp": 1.0285989, + "epoch": 0.10046595520817676, + "flos": 25374480105600.0, + "grad_norm": 1.6277511024611184, + "language_loss": 0.76716316, + "learning_rate": 3.9481392478688586e-06, + "loss": 0.78939378, + "num_input_tokens_seen": 35996830, + "router_z_loss_clip": 1.18164062, + "router_z_loss_mlp": 0.21203613, + "step": 1671, + "time_per_iteration": 2.5382633209228516 + }, + { + "auxiliary_loss_clip": 0.01077771, + "auxiliary_loss_mlp": 0.0100736, + "balance_loss_clip": 1.02994633, + "balance_loss_mlp": 1.00441539, + "epoch": 0.10052607846084473, + "flos": 67461821677440.0, + "grad_norm": 0.7735669308177319, + "language_loss": 0.60726571, + "learning_rate": 3.948051095825149e-06, + "loss": 0.62811702, + "num_input_tokens_seen": 36054465, + "router_z_loss_clip": 0.47851562, + "router_z_loss_mlp": 0.02938843, + "step": 1672, + "time_per_iteration": 3.026754379272461 + }, + { + "auxiliary_loss_clip": 0.01183431, + "auxiliary_loss_mlp": 0.0105832, + "balance_loss_clip": 1.05702114, + "balance_loss_mlp": 1.03603947, + "epoch": 0.10058620171351271, + "flos": 21360493209600.0, + "grad_norm": 2.180467849059074, + "language_loss": 0.77447927, + "learning_rate": 3.947962869911147e-06, + "loss": 0.79689682, + "num_input_tokens_seen": 36073480, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.22290039, + "step": 1673, + "time_per_iteration": 2.4519636631011963 + }, + { + "auxiliary_loss_clip": 0.01176357, + "auxiliary_loss_mlp": 0.0105457, + "balance_loss_clip": 1.05206704, + "balance_loss_mlp": 1.0311811, + "epoch": 0.10064632496618067, + "flos": 16800125558400.0, + "grad_norm": 2.339429013093867, + "language_loss": 0.73363781, + "learning_rate": 3.947874570130197e-06, + "loss": 0.75594705, + "num_input_tokens_seen": 36091830, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.23388672, + "step": 1674, + "time_per_iteration": 3.904435634613037 + }, + { + "auxiliary_loss_clip": 0.01172696, + "auxiliary_loss_mlp": 0.01049713, + "balance_loss_clip": 1.0497092, + "balance_loss_mlp": 1.02706325, + "epoch": 0.10070644821884864, + "flos": 23624445576960.0, + "grad_norm": 2.315172985346795, + "language_loss": 0.7974605, + "learning_rate": 3.947786196485649e-06, + "loss": 0.81968457, + "num_input_tokens_seen": 36111400, + "router_z_loss_clip": 1.22851562, + "router_z_loss_mlp": 0.22631836, + "step": 1675, + "time_per_iteration": 3.885525941848755 + }, + { + "auxiliary_loss_clip": 0.01177292, + "auxiliary_loss_mlp": 0.01059538, + "balance_loss_clip": 1.05557799, + "balance_loss_mlp": 1.03672123, + "epoch": 0.1007665714715166, + "flos": 24462564595200.0, + "grad_norm": 2.257145380250523, + "language_loss": 0.8149122, + "learning_rate": 3.947697748980853e-06, + "loss": 0.83728051, + "num_input_tokens_seen": 36129345, + "router_z_loss_clip": 1.21484375, + "router_z_loss_mlp": 0.22814941, + "step": 1676, + "time_per_iteration": 2.570385456085205 + }, + { + "auxiliary_loss_clip": 0.01185088, + "auxiliary_loss_mlp": 0.01055294, + "balance_loss_clip": 1.05897355, + "balance_loss_mlp": 1.03351498, + "epoch": 0.10082669472418458, + "flos": 16799119977600.0, + "grad_norm": 5.237789378050288, + "language_loss": 0.86012638, + "learning_rate": 3.947609227619163e-06, + "loss": 0.88253021, + "num_input_tokens_seen": 36146255, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.21801758, + "step": 1677, + "time_per_iteration": 3.8937466144561768 + }, + { + "auxiliary_loss_clip": 0.0118135, + "auxiliary_loss_mlp": 0.01052776, + "balance_loss_clip": 1.05637181, + "balance_loss_mlp": 1.03124714, + "epoch": 0.10088681797685255, + "flos": 13553513844480.0, + "grad_norm": 8.983505349343677, + "language_loss": 0.8596732, + "learning_rate": 3.947520632403936e-06, + "loss": 0.88201451, + "num_input_tokens_seen": 36164050, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.21533203, + "step": 1678, + "time_per_iteration": 2.4337120056152344 + }, + { + "auxiliary_loss_clip": 0.01189519, + "auxiliary_loss_mlp": 0.01055409, + "balance_loss_clip": 1.06320882, + "balance_loss_mlp": 1.03300941, + "epoch": 0.10094694122952051, + "flos": 25265706744960.0, + "grad_norm": 6.95345366309645, + "language_loss": 0.89986652, + "learning_rate": 3.947431963338532e-06, + "loss": 0.92231584, + "num_input_tokens_seen": 36183530, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.22399902, + "step": 1679, + "time_per_iteration": 3.8939006328582764 + }, + { + "auxiliary_loss_clip": 0.0107421, + "auxiliary_loss_mlp": 0.01006293, + "balance_loss_clip": 1.02651346, + "balance_loss_mlp": 1.00362277, + "epoch": 0.10100706448218849, + "flos": 69854299885440.0, + "grad_norm": 0.7784135717010058, + "language_loss": 0.5302273, + "learning_rate": 3.947343220426312e-06, + "loss": 0.55103236, + "num_input_tokens_seen": 36248550, + "router_z_loss_clip": 0.47705078, + "router_z_loss_mlp": 0.0267334, + "step": 1680, + "time_per_iteration": 3.1868882179260254 + }, + { + "auxiliary_loss_clip": 0.01178232, + "auxiliary_loss_mlp": 0.01051992, + "balance_loss_clip": 1.05542362, + "balance_loss_mlp": 1.03079677, + "epoch": 0.10106718773485646, + "flos": 20007163463040.0, + "grad_norm": 1.7821867350189966, + "language_loss": 0.76755822, + "learning_rate": 3.947254403670641e-06, + "loss": 0.78986049, + "num_input_tokens_seen": 36266065, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.21191406, + "step": 1681, + "time_per_iteration": 2.461228370666504 + }, + { + "auxiliary_loss_clip": 0.01194173, + "auxiliary_loss_mlp": 0.01054512, + "balance_loss_clip": 1.0628593, + "balance_loss_mlp": 1.02953768, + "epoch": 0.10112731098752442, + "flos": 13479825093120.0, + "grad_norm": 2.329298310767083, + "language_loss": 0.93447435, + "learning_rate": 3.947165513074889e-06, + "loss": 0.95696115, + "num_input_tokens_seen": 36280960, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.24963379, + "step": 1682, + "time_per_iteration": 2.435328722000122 + }, + { + "auxiliary_loss_clip": 0.01193342, + "auxiliary_loss_mlp": 0.01046976, + "balance_loss_clip": 1.06621277, + "balance_loss_mlp": 1.02537525, + "epoch": 0.1011874342401924, + "flos": 18515901490560.0, + "grad_norm": 1.9193436908289234, + "language_loss": 0.87872684, + "learning_rate": 3.947076548642425e-06, + "loss": 0.90113008, + "num_input_tokens_seen": 36299010, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.21594238, + "step": 1683, + "time_per_iteration": 2.4518160820007324 + }, + { + "auxiliary_loss_clip": 0.01176473, + "auxiliary_loss_mlp": 0.01054112, + "balance_loss_clip": 1.0564146, + "balance_loss_mlp": 1.03241622, + "epoch": 0.10124755749286037, + "flos": 20702861055360.0, + "grad_norm": 1.779302418486126, + "language_loss": 0.74541831, + "learning_rate": 3.946987510376624e-06, + "loss": 0.76772416, + "num_input_tokens_seen": 36318400, + "router_z_loss_clip": 1.20117188, + "router_z_loss_mlp": 0.21691895, + "step": 1684, + "time_per_iteration": 2.468552827835083 + }, + { + "auxiliary_loss_clip": 0.01085309, + "auxiliary_loss_mlp": 0.0100659, + "balance_loss_clip": 1.03571069, + "balance_loss_mlp": 1.00340748, + "epoch": 0.10130768074552833, + "flos": 56109456247680.0, + "grad_norm": 0.763234925295327, + "language_loss": 0.61154145, + "learning_rate": 3.9468983982808615e-06, + "loss": 0.63246042, + "num_input_tokens_seen": 36381815, + "router_z_loss_clip": 0.49707031, + "router_z_loss_mlp": 0.03189087, + "step": 1685, + "time_per_iteration": 3.159696578979492 + }, + { + "auxiliary_loss_clip": 0.01183403, + "auxiliary_loss_mlp": 0.01052033, + "balance_loss_clip": 1.05907416, + "balance_loss_mlp": 1.02993131, + "epoch": 0.1013678039981963, + "flos": 33402346156800.0, + "grad_norm": 4.7032604538914935, + "language_loss": 0.61295992, + "learning_rate": 3.946809212358516e-06, + "loss": 0.63531429, + "num_input_tokens_seen": 36404320, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.22106934, + "step": 1686, + "time_per_iteration": 2.574004888534546 + }, + { + "auxiliary_loss_clip": 0.01183909, + "auxiliary_loss_mlp": 0.0105483, + "balance_loss_clip": 1.06299114, + "balance_loss_mlp": 1.03244209, + "epoch": 0.10142792725086427, + "flos": 31905338008320.0, + "grad_norm": 2.7472447021845094, + "language_loss": 0.81403208, + "learning_rate": 3.946719952612972e-06, + "loss": 0.8364194, + "num_input_tokens_seen": 36427510, + "router_z_loss_clip": 1.20898438, + "router_z_loss_mlp": 0.22387695, + "step": 1687, + "time_per_iteration": 2.5395169258117676 + }, + { + "auxiliary_loss_clip": 0.01183463, + "auxiliary_loss_mlp": 0.01053807, + "balance_loss_clip": 1.05644846, + "balance_loss_mlp": 1.031515, + "epoch": 0.10148805050353224, + "flos": 28475905046400.0, + "grad_norm": 1.9175046454938998, + "language_loss": 0.72426194, + "learning_rate": 3.94663061904761e-06, + "loss": 0.74663466, + "num_input_tokens_seen": 36448230, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.22277832, + "step": 1688, + "time_per_iteration": 2.5292139053344727 + }, + { + "auxiliary_loss_clip": 0.01178402, + "auxiliary_loss_mlp": 0.01061151, + "balance_loss_clip": 1.05617118, + "balance_loss_mlp": 1.0389303, + "epoch": 0.1015481737562002, + "flos": 25148888737920.0, + "grad_norm": 3.1401577458610386, + "language_loss": 0.87525034, + "learning_rate": 3.94654121166582e-06, + "loss": 0.89764589, + "num_input_tokens_seen": 36464395, + "router_z_loss_clip": 1.22265625, + "router_z_loss_mlp": 0.22229004, + "step": 1689, + "time_per_iteration": 2.451981544494629 + }, + { + "auxiliary_loss_clip": 0.01175959, + "auxiliary_loss_mlp": 0.01050691, + "balance_loss_clip": 1.05305994, + "balance_loss_mlp": 1.03013968, + "epoch": 0.10160829700886818, + "flos": 30882781630080.0, + "grad_norm": 2.22249114273294, + "language_loss": 0.88011682, + "learning_rate": 3.946451730470993e-06, + "loss": 0.90238333, + "num_input_tokens_seen": 36486475, + "router_z_loss_clip": 1.23046875, + "router_z_loss_mlp": 0.20544434, + "step": 1690, + "time_per_iteration": 2.5589380264282227 + }, + { + "auxiliary_loss_clip": 0.01178104, + "auxiliary_loss_mlp": 0.01057253, + "balance_loss_clip": 1.05363715, + "balance_loss_mlp": 1.03583109, + "epoch": 0.10166842026153615, + "flos": 20412020632320.0, + "grad_norm": 2.2359794220115377, + "language_loss": 0.8365249, + "learning_rate": 3.946362175466521e-06, + "loss": 0.85887849, + "num_input_tokens_seen": 36505310, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.21435547, + "step": 1691, + "time_per_iteration": 2.4350814819335938 + }, + { + "auxiliary_loss_clip": 0.01182478, + "auxiliary_loss_mlp": 0.01059523, + "balance_loss_clip": 1.05544865, + "balance_loss_mlp": 1.03638494, + "epoch": 0.10172854351420411, + "flos": 33476968661760.0, + "grad_norm": 1.9951073427972104, + "language_loss": 0.6667577, + "learning_rate": 3.946272546655801e-06, + "loss": 0.68917769, + "num_input_tokens_seen": 36529820, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.23144531, + "step": 1692, + "time_per_iteration": 2.6560349464416504 + }, + { + "auxiliary_loss_clip": 0.01183098, + "auxiliary_loss_mlp": 0.01071162, + "balance_loss_clip": 1.05748391, + "balance_loss_mlp": 1.04994261, + "epoch": 0.1017886667668721, + "flos": 23550325862400.0, + "grad_norm": 5.4021185330707295, + "language_loss": 0.75635678, + "learning_rate": 3.94618284404223e-06, + "loss": 0.77889931, + "num_input_tokens_seen": 36549000, + "router_z_loss_clip": 1.25683594, + "router_z_loss_mlp": 0.21240234, + "step": 1693, + "time_per_iteration": 2.4597909450531006 + }, + { + "auxiliary_loss_clip": 0.01182867, + "auxiliary_loss_mlp": 0.01052987, + "balance_loss_clip": 1.05752277, + "balance_loss_mlp": 1.02988422, + "epoch": 0.10184879001954006, + "flos": 23296078419840.0, + "grad_norm": 1.7760908129949884, + "language_loss": 0.87542534, + "learning_rate": 3.9460930676292105e-06, + "loss": 0.89778388, + "num_input_tokens_seen": 36567515, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.23095703, + "step": 1694, + "time_per_iteration": 2.4833884239196777 + }, + { + "auxiliary_loss_clip": 0.01198244, + "auxiliary_loss_mlp": 0.01054744, + "balance_loss_clip": 1.065346, + "balance_loss_mlp": 1.03131962, + "epoch": 0.10190891327220802, + "flos": 18333116156160.0, + "grad_norm": 2.333574868848367, + "language_loss": 0.79498959, + "learning_rate": 3.946003217420147e-06, + "loss": 0.81751949, + "num_input_tokens_seen": 36586190, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.23425293, + "step": 1695, + "time_per_iteration": 2.4235501289367676 + }, + { + "auxiliary_loss_clip": 0.01184602, + "auxiliary_loss_mlp": 0.01060845, + "balance_loss_clip": 1.05737829, + "balance_loss_mlp": 1.03770638, + "epoch": 0.10196903652487599, + "flos": 26465374108800.0, + "grad_norm": 1.841547795011503, + "language_loss": 0.86477661, + "learning_rate": 3.945913293418447e-06, + "loss": 0.88723111, + "num_input_tokens_seen": 36607495, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.23144531, + "step": 1696, + "time_per_iteration": 2.525050163269043 + }, + { + "auxiliary_loss_clip": 0.01177784, + "auxiliary_loss_mlp": 0.01059507, + "balance_loss_clip": 1.05505741, + "balance_loss_mlp": 1.03739381, + "epoch": 0.10202915977754397, + "flos": 21869526798720.0, + "grad_norm": 2.900406616504431, + "language_loss": 0.8215121, + "learning_rate": 3.945823295627519e-06, + "loss": 0.843885, + "num_input_tokens_seen": 36628555, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.22119141, + "step": 1697, + "time_per_iteration": 2.4658725261688232 + }, + { + "auxiliary_loss_clip": 0.01192626, + "auxiliary_loss_mlp": 0.01053071, + "balance_loss_clip": 1.06228304, + "balance_loss_mlp": 1.02934849, + "epoch": 0.10208928303021193, + "flos": 22309755886080.0, + "grad_norm": 2.1249618633540925, + "language_loss": 0.80712962, + "learning_rate": 3.9457332240507775e-06, + "loss": 0.82958657, + "num_input_tokens_seen": 36646250, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.23730469, + "step": 1698, + "time_per_iteration": 2.5443241596221924 + }, + { + "auxiliary_loss_clip": 0.01184455, + "auxiliary_loss_mlp": 0.01047072, + "balance_loss_clip": 1.05747664, + "balance_loss_mlp": 1.0253284, + "epoch": 0.1021494062828799, + "flos": 22125569921280.0, + "grad_norm": 3.1181288855094285, + "language_loss": 0.76097822, + "learning_rate": 3.945643078691637e-06, + "loss": 0.78329349, + "num_input_tokens_seen": 36666675, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.21765137, + "step": 1699, + "time_per_iteration": 2.467940092086792 + }, + { + "auxiliary_loss_clip": 0.0118214, + "auxiliary_loss_mlp": 0.0105612, + "balance_loss_clip": 1.05768728, + "balance_loss_mlp": 1.03080034, + "epoch": 0.10220952953554788, + "flos": 19646728439040.0, + "grad_norm": 1.8277712314649066, + "language_loss": 0.79973793, + "learning_rate": 3.945552859553516e-06, + "loss": 0.82212055, + "num_input_tokens_seen": 36685225, + "router_z_loss_clip": 1.24316406, + "router_z_loss_mlp": 0.25341797, + "step": 1700, + "time_per_iteration": 2.576227903366089 + }, + { + "auxiliary_loss_clip": 0.01182812, + "auxiliary_loss_mlp": 0.01056262, + "balance_loss_clip": 1.05524135, + "balance_loss_mlp": 1.03138328, + "epoch": 0.10226965278821584, + "flos": 29787290686080.0, + "grad_norm": 1.8606245490780458, + "language_loss": 0.76945949, + "learning_rate": 3.945462566639836e-06, + "loss": 0.79185021, + "num_input_tokens_seen": 36705985, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.24890137, + "step": 1701, + "time_per_iteration": 2.5377204418182373 + }, + { + "auxiliary_loss_clip": 0.01185314, + "auxiliary_loss_mlp": 0.01053918, + "balance_loss_clip": 1.05619967, + "balance_loss_mlp": 1.03119636, + "epoch": 0.10232977604088381, + "flos": 27016818681600.0, + "grad_norm": 2.158850419234796, + "language_loss": 0.77852702, + "learning_rate": 3.945372199954019e-06, + "loss": 0.80091929, + "num_input_tokens_seen": 36725815, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.22705078, + "step": 1702, + "time_per_iteration": 2.5618369579315186 + }, + { + "auxiliary_loss_clip": 0.01188781, + "auxiliary_loss_mlp": 0.01047542, + "balance_loss_clip": 1.06216943, + "balance_loss_mlp": 1.02576256, + "epoch": 0.10238989929355179, + "flos": 20777519473920.0, + "grad_norm": 2.261733020674512, + "language_loss": 0.94665533, + "learning_rate": 3.945281759499494e-06, + "loss": 0.96901864, + "num_input_tokens_seen": 36742345, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.21765137, + "step": 1703, + "time_per_iteration": 2.448547840118408 + }, + { + "auxiliary_loss_clip": 0.01104822, + "auxiliary_loss_mlp": 0.01006681, + "balance_loss_clip": 1.0531981, + "balance_loss_mlp": 1.00354552, + "epoch": 0.10245002254621975, + "flos": 57698322451200.0, + "grad_norm": 0.8742985873370708, + "language_loss": 0.55085313, + "learning_rate": 3.94519124527969e-06, + "loss": 0.57196814, + "num_input_tokens_seen": 36798775, + "router_z_loss_clip": 0.51660156, + "router_z_loss_mlp": 0.03134155, + "step": 1704, + "time_per_iteration": 3.0556089878082275 + }, + { + "auxiliary_loss_clip": 0.01185195, + "auxiliary_loss_mlp": 0.01051664, + "balance_loss_clip": 1.05785418, + "balance_loss_mlp": 1.02906156, + "epoch": 0.10251014579888772, + "flos": 16800125558400.0, + "grad_norm": 2.4734244307073276, + "language_loss": 0.84072423, + "learning_rate": 3.945100657298039e-06, + "loss": 0.86309278, + "num_input_tokens_seen": 36816295, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.22607422, + "step": 1705, + "time_per_iteration": 2.4290413856506348 + }, + { + "auxiliary_loss_clip": 0.01113907, + "auxiliary_loss_mlp": 0.01006708, + "balance_loss_clip": 1.06250691, + "balance_loss_mlp": 1.00306308, + "epoch": 0.1025702690515557, + "flos": 68565500922240.0, + "grad_norm": 1.2540657259243344, + "language_loss": 0.60418642, + "learning_rate": 3.9450099955579765e-06, + "loss": 0.6253925, + "num_input_tokens_seen": 36882030, + "router_z_loss_clip": 0.51464844, + "router_z_loss_mlp": 0.03649902, + "step": 1706, + "time_per_iteration": 3.176647901535034 + }, + { + "auxiliary_loss_clip": 0.01193847, + "auxiliary_loss_mlp": 0.01051283, + "balance_loss_clip": 1.06326866, + "balance_loss_mlp": 1.02897882, + "epoch": 0.10263039230422366, + "flos": 14866623336960.0, + "grad_norm": 2.7160631123736314, + "language_loss": 0.86256939, + "learning_rate": 3.94491926006294e-06, + "loss": 0.88502073, + "num_input_tokens_seen": 36899245, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.22302246, + "step": 1707, + "time_per_iteration": 2.4269235134124756 + }, + { + "auxiliary_loss_clip": 0.01180204, + "auxiliary_loss_mlp": 0.01049265, + "balance_loss_clip": 1.05526376, + "balance_loss_mlp": 1.0271039, + "epoch": 0.10269051555689163, + "flos": 25337599816320.0, + "grad_norm": 1.7242377508476987, + "language_loss": 0.73245537, + "learning_rate": 3.944828450816369e-06, + "loss": 0.75475007, + "num_input_tokens_seen": 36920950, + "router_z_loss_clip": 1.24804688, + "router_z_loss_mlp": 0.22167969, + "step": 1708, + "time_per_iteration": 2.5320401191711426 + }, + { + "auxiliary_loss_clip": 0.01182219, + "auxiliary_loss_mlp": 0.01062882, + "balance_loss_clip": 1.05719817, + "balance_loss_mlp": 1.03949273, + "epoch": 0.10275063880955959, + "flos": 21068826773760.0, + "grad_norm": 1.7237908336123353, + "language_loss": 0.9143272, + "learning_rate": 3.944737567821709e-06, + "loss": 0.93677825, + "num_input_tokens_seen": 36938900, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.23388672, + "step": 1709, + "time_per_iteration": 2.480318546295166 + }, + { + "auxiliary_loss_clip": 0.01185802, + "auxiliary_loss_mlp": 0.01049393, + "balance_loss_clip": 1.05924284, + "balance_loss_mlp": 1.02639723, + "epoch": 0.10281076206222757, + "flos": 30366780802560.0, + "grad_norm": 2.7513900254814128, + "language_loss": 0.88300097, + "learning_rate": 3.944646611082406e-06, + "loss": 0.90535289, + "num_input_tokens_seen": 36957010, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.23022461, + "step": 1710, + "time_per_iteration": 2.529573678970337 + }, + { + "auxiliary_loss_clip": 0.0118422, + "auxiliary_loss_mlp": 0.01053783, + "balance_loss_clip": 1.05794716, + "balance_loss_mlp": 1.03113341, + "epoch": 0.10287088531489554, + "flos": 22418313765120.0, + "grad_norm": 1.9043523787823926, + "language_loss": 0.79247171, + "learning_rate": 3.944555580601908e-06, + "loss": 0.81485176, + "num_input_tokens_seen": 36977690, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.22668457, + "step": 1711, + "time_per_iteration": 2.470608949661255 + }, + { + "auxiliary_loss_clip": 0.01190422, + "auxiliary_loss_mlp": 0.01056033, + "balance_loss_clip": 1.06151235, + "balance_loss_mlp": 1.03238153, + "epoch": 0.1029310085675635, + "flos": 25115994858240.0, + "grad_norm": 1.9234811318788525, + "language_loss": 0.73819041, + "learning_rate": 3.944464476383668e-06, + "loss": 0.76065499, + "num_input_tokens_seen": 36997300, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.2364502, + "step": 1712, + "time_per_iteration": 2.54910945892334 + }, + { + "auxiliary_loss_clip": 0.01180836, + "auxiliary_loss_mlp": 0.01055642, + "balance_loss_clip": 1.05793214, + "balance_loss_mlp": 1.03418446, + "epoch": 0.10299113182023148, + "flos": 19865639877120.0, + "grad_norm": 1.8230427875570847, + "language_loss": 0.8667016, + "learning_rate": 3.94437329843114e-06, + "loss": 0.88906634, + "num_input_tokens_seen": 37016110, + "router_z_loss_clip": 1.22949219, + "router_z_loss_mlp": 0.21472168, + "step": 1713, + "time_per_iteration": 2.4976882934570312 + }, + { + "auxiliary_loss_clip": 0.01186104, + "auxiliary_loss_mlp": 0.01061273, + "balance_loss_clip": 1.05994284, + "balance_loss_mlp": 1.04011309, + "epoch": 0.10305125507289944, + "flos": 20447608032000.0, + "grad_norm": 1.6309141083925713, + "language_loss": 0.7256223, + "learning_rate": 3.944282046747782e-06, + "loss": 0.74809599, + "num_input_tokens_seen": 37036405, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.21154785, + "step": 1714, + "time_per_iteration": 2.5006484985351562 + }, + { + "auxiliary_loss_clip": 0.01191097, + "auxiliary_loss_mlp": 0.01056896, + "balance_loss_clip": 1.06006408, + "balance_loss_mlp": 1.03356624, + "epoch": 0.10311137832556741, + "flos": 26250772302720.0, + "grad_norm": 1.951379628797557, + "language_loss": 0.90863442, + "learning_rate": 3.944190721337053e-06, + "loss": 0.93111438, + "num_input_tokens_seen": 37057580, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.2331543, + "step": 1715, + "time_per_iteration": 2.4925200939178467 + }, + { + "auxiliary_loss_clip": 0.01184828, + "auxiliary_loss_mlp": 0.01060939, + "balance_loss_clip": 1.05826807, + "balance_loss_mlp": 1.03842044, + "epoch": 0.10317150157823539, + "flos": 35298932175360.0, + "grad_norm": 1.8334707486476007, + "language_loss": 0.75985068, + "learning_rate": 3.944099322202418e-06, + "loss": 0.78230834, + "num_input_tokens_seen": 37079120, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.22521973, + "step": 1716, + "time_per_iteration": 2.591581344604492 + }, + { + "auxiliary_loss_clip": 0.01183753, + "auxiliary_loss_mlp": 0.01066954, + "balance_loss_clip": 1.05707812, + "balance_loss_mlp": 1.04410195, + "epoch": 0.10323162483090335, + "flos": 25739943033600.0, + "grad_norm": 2.08511842061253, + "language_loss": 0.84757435, + "learning_rate": 3.944007849347342e-06, + "loss": 0.87008142, + "num_input_tokens_seen": 37099710, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.22827148, + "step": 1717, + "time_per_iteration": 3.984079360961914 + }, + { + "auxiliary_loss_clip": 0.01179932, + "auxiliary_loss_mlp": 0.01055577, + "balance_loss_clip": 1.05826378, + "balance_loss_mlp": 1.03473926, + "epoch": 0.10329174808357132, + "flos": 16289870906880.0, + "grad_norm": 1.9111327335939419, + "language_loss": 0.8242681, + "learning_rate": 3.943916302775292e-06, + "loss": 0.84662324, + "num_input_tokens_seen": 37117775, + "router_z_loss_clip": 1.21777344, + "router_z_loss_mlp": 0.20825195, + "step": 1718, + "time_per_iteration": 3.797858953475952 + }, + { + "auxiliary_loss_clip": 0.01186049, + "auxiliary_loss_mlp": 0.01046375, + "balance_loss_clip": 1.06279659, + "balance_loss_mlp": 1.0238564, + "epoch": 0.10335187133623928, + "flos": 36687166963200.0, + "grad_norm": 1.8799603523102435, + "language_loss": 0.73330295, + "learning_rate": 3.943824682489742e-06, + "loss": 0.75562721, + "num_input_tokens_seen": 37140280, + "router_z_loss_clip": 1.23144531, + "router_z_loss_mlp": 0.2253418, + "step": 1719, + "time_per_iteration": 2.5746867656707764 + }, + { + "auxiliary_loss_clip": 0.01187034, + "auxiliary_loss_mlp": 0.0104927, + "balance_loss_clip": 1.06091213, + "balance_loss_mlp": 1.02858698, + "epoch": 0.10341199458890726, + "flos": 14975648092800.0, + "grad_norm": 1.9302572491453367, + "language_loss": 0.92513967, + "learning_rate": 3.9437329884941665e-06, + "loss": 0.94750285, + "num_input_tokens_seen": 37158350, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.20690918, + "step": 1720, + "time_per_iteration": 2.441253185272217 + }, + { + "auxiliary_loss_clip": 0.01180995, + "auxiliary_loss_mlp": 0.01052357, + "balance_loss_clip": 1.05628777, + "balance_loss_mlp": 1.03008902, + "epoch": 0.10347211784157523, + "flos": 21031587348480.0, + "grad_norm": 1.8560323022434813, + "language_loss": 0.79355091, + "learning_rate": 3.943641220792039e-06, + "loss": 0.81588447, + "num_input_tokens_seen": 37177120, + "router_z_loss_clip": 1.24609375, + "router_z_loss_mlp": 0.22265625, + "step": 1721, + "time_per_iteration": 3.930884599685669 + }, + { + "auxiliary_loss_clip": 0.01190628, + "auxiliary_loss_mlp": 0.0106336, + "balance_loss_clip": 1.06115818, + "balance_loss_mlp": 1.03942323, + "epoch": 0.1035322410942432, + "flos": 19792094780160.0, + "grad_norm": 2.1197117296387233, + "language_loss": 0.8040216, + "learning_rate": 3.9435493793868434e-06, + "loss": 0.82656145, + "num_input_tokens_seen": 37195895, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.23950195, + "step": 1722, + "time_per_iteration": 3.9261343479156494 + }, + { + "auxiliary_loss_clip": 0.01091336, + "auxiliary_loss_mlp": 0.01025595, + "balance_loss_clip": 1.04065418, + "balance_loss_mlp": 1.02285874, + "epoch": 0.10359236434691117, + "flos": 52698874947840.0, + "grad_norm": 0.9764001736648761, + "language_loss": 0.67206055, + "learning_rate": 3.943457464282059e-06, + "loss": 0.69322991, + "num_input_tokens_seen": 37247270, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 0.02737427, + "step": 1723, + "time_per_iteration": 2.8479840755462646 + }, + { + "auxiliary_loss_clip": 0.01196447, + "auxiliary_loss_mlp": 0.01056557, + "balance_loss_clip": 1.06663465, + "balance_loss_mlp": 1.03511143, + "epoch": 0.10365248759957914, + "flos": 18405404277120.0, + "grad_norm": 2.891846248244082, + "language_loss": 0.78187001, + "learning_rate": 3.9433654754811745e-06, + "loss": 0.80440003, + "num_input_tokens_seen": 37265595, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.21447754, + "step": 1724, + "time_per_iteration": 2.449434518814087 + }, + { + "auxiliary_loss_clip": 0.01185939, + "auxiliary_loss_mlp": 0.01054091, + "balance_loss_clip": 1.05910242, + "balance_loss_mlp": 1.03285956, + "epoch": 0.1037126108522471, + "flos": 47553555335040.0, + "grad_norm": 2.1019432023641564, + "language_loss": 0.75064439, + "learning_rate": 3.943273412987676e-06, + "loss": 0.77304465, + "num_input_tokens_seen": 37286660, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.21203613, + "step": 1725, + "time_per_iteration": 2.7032461166381836 + }, + { + "auxiliary_loss_clip": 0.01186638, + "auxiliary_loss_mlp": 0.01055874, + "balance_loss_clip": 1.06140494, + "balance_loss_mlp": 1.03464305, + "epoch": 0.10377273410491508, + "flos": 22816670572800.0, + "grad_norm": 2.744886864218967, + "language_loss": 0.74877363, + "learning_rate": 3.943181276805054e-06, + "loss": 0.77119875, + "num_input_tokens_seen": 37304915, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.21228027, + "step": 1726, + "time_per_iteration": 2.4653146266937256 + }, + { + "auxiliary_loss_clip": 0.01209277, + "auxiliary_loss_mlp": 0.01065525, + "balance_loss_clip": 1.07924366, + "balance_loss_mlp": 1.04305422, + "epoch": 0.10383285735758305, + "flos": 26138694890880.0, + "grad_norm": 3.002670958430351, + "language_loss": 0.73595518, + "learning_rate": 3.9430890669368035e-06, + "loss": 0.75870317, + "num_input_tokens_seen": 37325265, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.22473145, + "step": 1727, + "time_per_iteration": 2.5372867584228516 + }, + { + "auxiliary_loss_clip": 0.01181099, + "auxiliary_loss_mlp": 0.01051563, + "balance_loss_clip": 1.0561161, + "balance_loss_mlp": 1.03018904, + "epoch": 0.10389298061025101, + "flos": 17091791994240.0, + "grad_norm": 2.341035191821147, + "language_loss": 0.84700358, + "learning_rate": 3.942996783386422e-06, + "loss": 0.86933017, + "num_input_tokens_seen": 37341650, + "router_z_loss_clip": 1.24804688, + "router_z_loss_mlp": 0.21362305, + "step": 1728, + "time_per_iteration": 2.459115982055664 + }, + { + "auxiliary_loss_clip": 0.01185101, + "auxiliary_loss_mlp": 0.01049275, + "balance_loss_clip": 1.05958247, + "balance_loss_mlp": 1.02757859, + "epoch": 0.10395310386291898, + "flos": 20776513893120.0, + "grad_norm": 2.173219643100395, + "language_loss": 0.70952684, + "learning_rate": 3.942904426157406e-06, + "loss": 0.73187059, + "num_input_tokens_seen": 37360270, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.21704102, + "step": 1729, + "time_per_iteration": 2.4970641136169434 + }, + { + "auxiliary_loss_clip": 0.01185272, + "auxiliary_loss_mlp": 0.01064285, + "balance_loss_clip": 1.05989349, + "balance_loss_mlp": 1.04082513, + "epoch": 0.10401322711558696, + "flos": 12820540913280.0, + "grad_norm": 2.6324774856451043, + "language_loss": 0.81694186, + "learning_rate": 3.9428119952532605e-06, + "loss": 0.83943748, + "num_input_tokens_seen": 37375225, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.23474121, + "step": 1730, + "time_per_iteration": 2.4345359802246094 + }, + { + "auxiliary_loss_clip": 0.0117969, + "auxiliary_loss_mlp": 0.0105092, + "balance_loss_clip": 1.05596292, + "balance_loss_mlp": 1.03110731, + "epoch": 0.10407335036825492, + "flos": 23184683366400.0, + "grad_norm": 1.9649987658882964, + "language_loss": 0.75763547, + "learning_rate": 3.942719490677489e-06, + "loss": 0.77994162, + "num_input_tokens_seen": 37395165, + "router_z_loss_clip": 1.23632812, + "router_z_loss_mlp": 0.19824219, + "step": 1731, + "time_per_iteration": 2.457520008087158 + }, + { + "auxiliary_loss_clip": 0.01192096, + "auxiliary_loss_mlp": 0.0105028, + "balance_loss_clip": 1.07029152, + "balance_loss_mlp": 1.03020501, + "epoch": 0.10413347362092289, + "flos": 26104184899200.0, + "grad_norm": 1.7668855742364282, + "language_loss": 0.83094144, + "learning_rate": 3.9426269124336e-06, + "loss": 0.85336512, + "num_input_tokens_seen": 37414845, + "router_z_loss_clip": 1.21777344, + "router_z_loss_mlp": 0.20080566, + "step": 1732, + "time_per_iteration": 2.4664907455444336 + }, + { + "auxiliary_loss_clip": 0.01178904, + "auxiliary_loss_mlp": 0.01056422, + "balance_loss_clip": 1.05638433, + "balance_loss_mlp": 1.03726482, + "epoch": 0.10419359687359087, + "flos": 12641059630080.0, + "grad_norm": 2.68757315603035, + "language_loss": 0.83412433, + "learning_rate": 3.942534260525104e-06, + "loss": 0.85647756, + "num_input_tokens_seen": 37432490, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.19152832, + "step": 1733, + "time_per_iteration": 2.4702036380767822 + }, + { + "auxiliary_loss_clip": 0.01184655, + "auxiliary_loss_mlp": 0.01058827, + "balance_loss_clip": 1.0586946, + "balance_loss_mlp": 1.03792953, + "epoch": 0.10425372012625883, + "flos": 12125094716160.0, + "grad_norm": 3.9722910846586257, + "language_loss": 0.76624095, + "learning_rate": 3.942441534955514e-06, + "loss": 0.78867573, + "num_input_tokens_seen": 37449435, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.2088623, + "step": 1734, + "time_per_iteration": 2.420538902282715 + }, + { + "auxiliary_loss_clip": 0.01184411, + "auxiliary_loss_mlp": 0.01054212, + "balance_loss_clip": 1.06078863, + "balance_loss_mlp": 1.03381562, + "epoch": 0.1043138433789268, + "flos": 25337563902720.0, + "grad_norm": 8.601011061445648, + "language_loss": 0.74886847, + "learning_rate": 3.9423487357283465e-06, + "loss": 0.77125478, + "num_input_tokens_seen": 37469105, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.20385742, + "step": 1735, + "time_per_iteration": 2.490955352783203 + }, + { + "auxiliary_loss_clip": 0.01187596, + "auxiliary_loss_mlp": 0.01056153, + "balance_loss_clip": 1.06031013, + "balance_loss_mlp": 1.03511214, + "epoch": 0.10437396663159478, + "flos": 29167149352320.0, + "grad_norm": 1.730524518115856, + "language_loss": 0.78425819, + "learning_rate": 3.94225586284712e-06, + "loss": 0.80669564, + "num_input_tokens_seen": 37490540, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.21020508, + "step": 1736, + "time_per_iteration": 2.5167155265808105 + }, + { + "auxiliary_loss_clip": 0.01179234, + "auxiliary_loss_mlp": 0.0105798, + "balance_loss_clip": 1.05663586, + "balance_loss_mlp": 1.03626049, + "epoch": 0.10443408988426274, + "flos": 25080946162560.0, + "grad_norm": 1.7987251718792492, + "language_loss": 0.70608926, + "learning_rate": 3.942162916315356e-06, + "loss": 0.72846138, + "num_input_tokens_seen": 37511905, + "router_z_loss_clip": 1.22753906, + "router_z_loss_mlp": 0.21716309, + "step": 1737, + "time_per_iteration": 2.5271384716033936 + }, + { + "auxiliary_loss_clip": 0.01182143, + "auxiliary_loss_mlp": 0.01059918, + "balance_loss_clip": 1.05374551, + "balance_loss_mlp": 1.03539634, + "epoch": 0.1044942131369307, + "flos": 26759662237440.0, + "grad_norm": 2.8156764095380113, + "language_loss": 0.81489289, + "learning_rate": 3.942069896136581e-06, + "loss": 0.83731347, + "num_input_tokens_seen": 37533635, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.24499512, + "step": 1738, + "time_per_iteration": 2.5678775310516357 + }, + { + "auxiliary_loss_clip": 0.01182577, + "auxiliary_loss_mlp": 0.01060082, + "balance_loss_clip": 1.05483329, + "balance_loss_mlp": 1.03680038, + "epoch": 0.10455433638959867, + "flos": 18442571875200.0, + "grad_norm": 2.652419882432326, + "language_loss": 0.7484771, + "learning_rate": 3.9419768023143196e-06, + "loss": 0.77090371, + "num_input_tokens_seen": 37552035, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.23291016, + "step": 1739, + "time_per_iteration": 2.455061674118042 + }, + { + "auxiliary_loss_clip": 0.01183552, + "auxiliary_loss_mlp": 0.01057823, + "balance_loss_clip": 1.05988336, + "balance_loss_mlp": 1.03579319, + "epoch": 0.10461445964226665, + "flos": 23218977876480.0, + "grad_norm": 1.6187315979243198, + "language_loss": 0.77198637, + "learning_rate": 3.941883634852104e-06, + "loss": 0.79440016, + "num_input_tokens_seen": 37571540, + "router_z_loss_clip": 1.23632812, + "router_z_loss_mlp": 0.22045898, + "step": 1740, + "time_per_iteration": 2.471780776977539 + }, + { + "auxiliary_loss_clip": 0.01183385, + "auxiliary_loss_mlp": 0.01053654, + "balance_loss_clip": 1.06057239, + "balance_loss_mlp": 1.0331738, + "epoch": 0.10467458289493461, + "flos": 24345243797760.0, + "grad_norm": 2.3591055395113023, + "language_loss": 0.85929841, + "learning_rate": 3.941790393753467e-06, + "loss": 0.88166881, + "num_input_tokens_seen": 37588265, + "router_z_loss_clip": 1.22949219, + "router_z_loss_mlp": 0.20483398, + "step": 1741, + "time_per_iteration": 2.5167689323425293 + }, + { + "auxiliary_loss_clip": 0.01183568, + "auxiliary_loss_mlp": 0.01053532, + "balance_loss_clip": 1.05795503, + "balance_loss_mlp": 1.03077555, + "epoch": 0.10473470614760258, + "flos": 21287953693440.0, + "grad_norm": 16.878759925871993, + "language_loss": 0.748712, + "learning_rate": 3.941697079021942e-06, + "loss": 0.771083, + "num_input_tokens_seen": 37606860, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.22741699, + "step": 1742, + "time_per_iteration": 2.45400071144104 + }, + { + "auxiliary_loss_clip": 0.01186563, + "auxiliary_loss_mlp": 0.01060639, + "balance_loss_clip": 1.0630424, + "balance_loss_mlp": 1.03994453, + "epoch": 0.10479482940027056, + "flos": 21687208341120.0, + "grad_norm": 1.8453738018572836, + "language_loss": 0.87966871, + "learning_rate": 3.94160369066107e-06, + "loss": 0.90214074, + "num_input_tokens_seen": 37625210, + "router_z_loss_clip": 1.23632812, + "router_z_loss_mlp": 0.20678711, + "step": 1743, + "time_per_iteration": 2.554673194885254 + }, + { + "auxiliary_loss_clip": 0.01176409, + "auxiliary_loss_mlp": 0.01048291, + "balance_loss_clip": 1.05402303, + "balance_loss_mlp": 1.02560508, + "epoch": 0.10485495265293852, + "flos": 21573694385280.0, + "grad_norm": 2.217521772393598, + "language_loss": 0.75548661, + "learning_rate": 3.941510228674391e-06, + "loss": 0.77773368, + "num_input_tokens_seen": 37644110, + "router_z_loss_clip": 1.22363281, + "router_z_loss_mlp": 0.22705078, + "step": 1744, + "time_per_iteration": 2.504962921142578 + }, + { + "auxiliary_loss_clip": 0.01194341, + "auxiliary_loss_mlp": 0.01059609, + "balance_loss_clip": 1.06933069, + "balance_loss_mlp": 1.03898609, + "epoch": 0.10491507590560649, + "flos": 37961923708800.0, + "grad_norm": 2.091763835474317, + "language_loss": 0.7906512, + "learning_rate": 3.941416693065451e-06, + "loss": 0.8131907, + "num_input_tokens_seen": 37665800, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.20629883, + "step": 1745, + "time_per_iteration": 2.6574759483337402 + }, + { + "auxiliary_loss_clip": 0.01179184, + "auxiliary_loss_mlp": 0.01065125, + "balance_loss_clip": 1.05413508, + "balance_loss_mlp": 1.04288065, + "epoch": 0.10497519915827447, + "flos": 26396282298240.0, + "grad_norm": 2.1433934500975442, + "language_loss": 0.82995242, + "learning_rate": 3.941323083837794e-06, + "loss": 0.85239553, + "num_input_tokens_seen": 37685095, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.22241211, + "step": 1746, + "time_per_iteration": 2.5027804374694824 + }, + { + "auxiliary_loss_clip": 0.01185154, + "auxiliary_loss_mlp": 0.01073881, + "balance_loss_clip": 1.06153917, + "balance_loss_mlp": 1.05311453, + "epoch": 0.10503532241094243, + "flos": 40662190581120.0, + "grad_norm": 2.8679813421607125, + "language_loss": 0.70075643, + "learning_rate": 3.941229400994971e-06, + "loss": 0.72334671, + "num_input_tokens_seen": 37707445, + "router_z_loss_clip": 1.23632812, + "router_z_loss_mlp": 0.20776367, + "step": 1747, + "time_per_iteration": 2.65671968460083 + }, + { + "auxiliary_loss_clip": 0.01184855, + "auxiliary_loss_mlp": 0.01074162, + "balance_loss_clip": 1.05581963, + "balance_loss_mlp": 1.05032003, + "epoch": 0.1050954456636104, + "flos": 29789409588480.0, + "grad_norm": 2.2016542185734655, + "language_loss": 0.84287238, + "learning_rate": 3.941135644540535e-06, + "loss": 0.86546254, + "num_input_tokens_seen": 37728325, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.23828125, + "step": 1748, + "time_per_iteration": 2.5621936321258545 + }, + { + "auxiliary_loss_clip": 0.01181356, + "auxiliary_loss_mlp": 0.01061477, + "balance_loss_clip": 1.05796373, + "balance_loss_mlp": 1.03938723, + "epoch": 0.10515556891627838, + "flos": 23948754497280.0, + "grad_norm": 1.6946147730815342, + "language_loss": 0.71536386, + "learning_rate": 3.941041814478041e-06, + "loss": 0.73779225, + "num_input_tokens_seen": 37748910, + "router_z_loss_clip": 1.23339844, + "router_z_loss_mlp": 0.22094727, + "step": 1749, + "time_per_iteration": 2.4913716316223145 + }, + { + "auxiliary_loss_clip": 0.01187854, + "auxiliary_loss_mlp": 0.01066765, + "balance_loss_clip": 1.06510663, + "balance_loss_mlp": 1.04457998, + "epoch": 0.10521569216894634, + "flos": 18259606972800.0, + "grad_norm": 2.0230784618251327, + "language_loss": 0.8191694, + "learning_rate": 3.940947910811047e-06, + "loss": 0.84171563, + "num_input_tokens_seen": 37765745, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.22180176, + "step": 1750, + "time_per_iteration": 2.5037777423858643 + }, + { + "auxiliary_loss_clip": 0.0118237, + "auxiliary_loss_mlp": 0.01061011, + "balance_loss_clip": 1.05809414, + "balance_loss_mlp": 1.03931534, + "epoch": 0.10527581542161431, + "flos": 15630909949440.0, + "grad_norm": 2.767171116137461, + "language_loss": 0.92668265, + "learning_rate": 3.940853933543114e-06, + "loss": 0.94911647, + "num_input_tokens_seen": 37780520, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.21704102, + "step": 1751, + "time_per_iteration": 2.4694626331329346 + }, + { + "auxiliary_loss_clip": 0.01191411, + "auxiliary_loss_mlp": 0.01053805, + "balance_loss_clip": 1.06680846, + "balance_loss_mlp": 1.03250194, + "epoch": 0.10533593867428227, + "flos": 18296559089280.0, + "grad_norm": 2.262298263549276, + "language_loss": 0.79708898, + "learning_rate": 3.940759882677805e-06, + "loss": 0.81954116, + "num_input_tokens_seen": 37799515, + "router_z_loss_clip": 1.24707031, + "router_z_loss_mlp": 0.21325684, + "step": 1752, + "time_per_iteration": 2.4590651988983154 + }, + { + "auxiliary_loss_clip": 0.01180636, + "auxiliary_loss_mlp": 0.0105592, + "balance_loss_clip": 1.05787587, + "balance_loss_mlp": 1.03434336, + "epoch": 0.10539606192695025, + "flos": 29023219555200.0, + "grad_norm": 1.9275471564267035, + "language_loss": 0.75832295, + "learning_rate": 3.940665758218686e-06, + "loss": 0.78068852, + "num_input_tokens_seen": 37818695, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.21594238, + "step": 1753, + "time_per_iteration": 2.5234076976776123 + }, + { + "auxiliary_loss_clip": 0.01190136, + "auxiliary_loss_mlp": 0.01061443, + "balance_loss_clip": 1.06025898, + "balance_loss_mlp": 1.03867364, + "epoch": 0.10545618517961822, + "flos": 19969313506560.0, + "grad_norm": 2.556519330861184, + "language_loss": 0.83937919, + "learning_rate": 3.940571560169328e-06, + "loss": 0.86189497, + "num_input_tokens_seen": 37837860, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.2277832, + "step": 1754, + "time_per_iteration": 2.5502731800079346 + }, + { + "auxiliary_loss_clip": 0.01190701, + "auxiliary_loss_mlp": 0.01061962, + "balance_loss_clip": 1.064031, + "balance_loss_mlp": 1.03655863, + "epoch": 0.10551630843228618, + "flos": 16143427157760.0, + "grad_norm": 2.970709957750858, + "language_loss": 0.69432288, + "learning_rate": 3.940477288533302e-06, + "loss": 0.71684951, + "num_input_tokens_seen": 37856260, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.25402832, + "step": 1755, + "time_per_iteration": 2.477832078933716 + }, + { + "auxiliary_loss_clip": 0.01184549, + "auxiliary_loss_mlp": 0.01070247, + "balance_loss_clip": 1.05903554, + "balance_loss_mlp": 1.04700089, + "epoch": 0.10557643168495416, + "flos": 23440115957760.0, + "grad_norm": 2.136497872528758, + "language_loss": 0.76867485, + "learning_rate": 3.940382943314182e-06, + "loss": 0.79122275, + "num_input_tokens_seen": 37876960, + "router_z_loss_clip": 1.25390625, + "router_z_loss_mlp": 0.2322998, + "step": 1756, + "time_per_iteration": 2.512014150619507 + }, + { + "auxiliary_loss_clip": 0.01189871, + "auxiliary_loss_mlp": 0.01062141, + "balance_loss_clip": 1.06405759, + "balance_loss_mlp": 1.04120803, + "epoch": 0.10563655493762213, + "flos": 21799034357760.0, + "grad_norm": 1.7286519892011698, + "language_loss": 0.79823792, + "learning_rate": 3.940288524515547e-06, + "loss": 0.82075799, + "num_input_tokens_seen": 37897070, + "router_z_loss_clip": 1.25683594, + "router_z_loss_mlp": 0.20947266, + "step": 1757, + "time_per_iteration": 2.476814031600952 + }, + { + "auxiliary_loss_clip": 0.0118828, + "auxiliary_loss_mlp": 0.010564, + "balance_loss_clip": 1.06498325, + "balance_loss_mlp": 1.03466785, + "epoch": 0.10569667819029009, + "flos": 53800863275520.0, + "grad_norm": 2.0850087819523684, + "language_loss": 0.78621948, + "learning_rate": 3.940194032140976e-06, + "loss": 0.80866629, + "num_input_tokens_seen": 37923635, + "router_z_loss_clip": 1.23242188, + "router_z_loss_mlp": 0.21728516, + "step": 1758, + "time_per_iteration": 2.7630040645599365 + }, + { + "auxiliary_loss_clip": 0.01187361, + "auxiliary_loss_mlp": 0.01053315, + "balance_loss_clip": 1.05853844, + "balance_loss_mlp": 1.03219104, + "epoch": 0.10575680144295807, + "flos": 22925515760640.0, + "grad_norm": 4.48194190814133, + "language_loss": 0.92173433, + "learning_rate": 3.940099466194054e-06, + "loss": 0.94414109, + "num_input_tokens_seen": 37942650, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.21118164, + "step": 1759, + "time_per_iteration": 2.4942517280578613 + }, + { + "auxiliary_loss_clip": 0.01185461, + "auxiliary_loss_mlp": 0.0105358, + "balance_loss_clip": 1.05773234, + "balance_loss_mlp": 1.03023899, + "epoch": 0.10581692469562604, + "flos": 14136667148160.0, + "grad_norm": 3.599934694585368, + "language_loss": 0.77711511, + "learning_rate": 3.940004826678365e-06, + "loss": 0.79950553, + "num_input_tokens_seen": 37960660, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.23339844, + "step": 1760, + "time_per_iteration": 2.4494166374206543 + }, + { + "auxiliary_loss_clip": 0.01194906, + "auxiliary_loss_mlp": 0.01065859, + "balance_loss_clip": 1.0655756, + "balance_loss_mlp": 1.04196954, + "epoch": 0.105877047948294, + "flos": 25958674903680.0, + "grad_norm": 2.226348708470528, + "language_loss": 0.89130062, + "learning_rate": 3.939910113597498e-06, + "loss": 0.91390824, + "num_input_tokens_seen": 37978625, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.2388916, + "step": 1761, + "time_per_iteration": 3.920287847518921 + }, + { + "auxiliary_loss_clip": 0.01185118, + "auxiliary_loss_mlp": 0.01061554, + "balance_loss_clip": 1.06091607, + "balance_loss_mlp": 1.03972626, + "epoch": 0.10593717120096197, + "flos": 30664768032000.0, + "grad_norm": 2.266845953170032, + "language_loss": 0.7800473, + "learning_rate": 3.9398153269550464e-06, + "loss": 0.80251402, + "num_input_tokens_seen": 38000005, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 0.21838379, + "step": 1762, + "time_per_iteration": 2.620760202407837 + }, + { + "auxiliary_loss_clip": 0.01093089, + "auxiliary_loss_mlp": 0.01015954, + "balance_loss_clip": 1.04547775, + "balance_loss_mlp": 1.01325393, + "epoch": 0.10599729445362994, + "flos": 66436682497920.0, + "grad_norm": 0.7593152799476236, + "language_loss": 0.60489053, + "learning_rate": 3.939720466754602e-06, + "loss": 0.62598097, + "num_input_tokens_seen": 38066165, + "router_z_loss_clip": 0.47558594, + "router_z_loss_mlp": 0.02697754, + "step": 1763, + "time_per_iteration": 4.6364593505859375 + }, + { + "auxiliary_loss_clip": 0.01189949, + "auxiliary_loss_mlp": 0.01056538, + "balance_loss_clip": 1.06194544, + "balance_loss_mlp": 1.03456807, + "epoch": 0.10605741770629791, + "flos": 23948179879680.0, + "grad_norm": 1.7309025660362243, + "language_loss": 0.80077875, + "learning_rate": 3.939625532999763e-06, + "loss": 0.82324362, + "num_input_tokens_seen": 38086150, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.21960449, + "step": 1764, + "time_per_iteration": 2.471067428588867 + }, + { + "auxiliary_loss_clip": 0.01182084, + "auxiliary_loss_mlp": 0.01057946, + "balance_loss_clip": 1.05688274, + "balance_loss_mlp": 1.03464067, + "epoch": 0.10611754095896588, + "flos": 19387524919680.0, + "grad_norm": 2.3965920188276617, + "language_loss": 0.80194771, + "learning_rate": 3.9395305256941314e-06, + "loss": 0.82434797, + "num_input_tokens_seen": 38104205, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.23339844, + "step": 1765, + "time_per_iteration": 3.9541239738464355 + }, + { + "auxiliary_loss_clip": 0.01180461, + "auxiliary_loss_mlp": 0.01058413, + "balance_loss_clip": 1.05547464, + "balance_loss_mlp": 1.0360378, + "epoch": 0.10617766421163385, + "flos": 22237755073920.0, + "grad_norm": 1.9265078560892321, + "language_loss": 0.76807845, + "learning_rate": 3.939435444841306e-06, + "loss": 0.7904672, + "num_input_tokens_seen": 38122005, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.22399902, + "step": 1766, + "time_per_iteration": 4.0132787227630615 + }, + { + "auxiliary_loss_clip": 0.01204335, + "auxiliary_loss_mlp": 0.0106473, + "balance_loss_clip": 1.07561493, + "balance_loss_mlp": 1.04173422, + "epoch": 0.10623778746430182, + "flos": 28404407024640.0, + "grad_norm": 1.6126410126527135, + "language_loss": 0.77352214, + "learning_rate": 3.939340290444895e-06, + "loss": 0.79621279, + "num_input_tokens_seen": 38143365, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.22998047, + "step": 1767, + "time_per_iteration": 2.5724663734436035 + }, + { + "auxiliary_loss_clip": 0.01097684, + "auxiliary_loss_mlp": 0.01005641, + "balance_loss_clip": 1.04990423, + "balance_loss_mlp": 1.00256848, + "epoch": 0.10629791071696978, + "flos": 64234639221120.0, + "grad_norm": 0.6744650725688109, + "language_loss": 0.57913792, + "learning_rate": 3.939245062508506e-06, + "loss": 0.60017121, + "num_input_tokens_seen": 38210035, + "router_z_loss_clip": 0.47851562, + "router_z_loss_mlp": 0.03067017, + "step": 1768, + "time_per_iteration": 3.185056686401367 + }, + { + "auxiliary_loss_clip": 0.01190554, + "auxiliary_loss_mlp": 0.01045413, + "balance_loss_clip": 1.06575799, + "balance_loss_mlp": 1.0249927, + "epoch": 0.10635803396963776, + "flos": 22747578762240.0, + "grad_norm": 1.437059746783594, + "language_loss": 0.86558008, + "learning_rate": 3.939149761035749e-06, + "loss": 0.88793981, + "num_input_tokens_seen": 38231230, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.20422363, + "step": 1769, + "time_per_iteration": 2.513598918914795 + }, + { + "auxiliary_loss_clip": 0.0118723, + "auxiliary_loss_mlp": 0.01057079, + "balance_loss_clip": 1.06035995, + "balance_loss_mlp": 1.03363013, + "epoch": 0.10641815722230573, + "flos": 31395586147200.0, + "grad_norm": 2.194893137882155, + "language_loss": 0.62094569, + "learning_rate": 3.9390543860302395e-06, + "loss": 0.64338881, + "num_input_tokens_seen": 38253890, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.23461914, + "step": 1770, + "time_per_iteration": 2.5527381896972656 + }, + { + "auxiliary_loss_clip": 0.0108641, + "auxiliary_loss_mlp": 0.01008268, + "balance_loss_clip": 1.03937197, + "balance_loss_mlp": 1.00514448, + "epoch": 0.1064782804749737, + "flos": 58552527784320.0, + "grad_norm": 0.8827064195981624, + "language_loss": 0.57034457, + "learning_rate": 3.9389589374955925e-06, + "loss": 0.59129137, + "num_input_tokens_seen": 38304290, + "router_z_loss_clip": 0.47070312, + "router_z_loss_mlp": 0.03125, + "step": 1771, + "time_per_iteration": 2.969391107559204 + }, + { + "auxiliary_loss_clip": 0.01186095, + "auxiliary_loss_mlp": 0.01061003, + "balance_loss_clip": 1.06209755, + "balance_loss_mlp": 1.03948557, + "epoch": 0.10653840372764166, + "flos": 23987825516160.0, + "grad_norm": 1.6770876416908382, + "language_loss": 0.88154781, + "learning_rate": 3.938863415435429e-06, + "loss": 0.90401882, + "num_input_tokens_seen": 38324725, + "router_z_loss_clip": 1.24121094, + "router_z_loss_mlp": 0.21508789, + "step": 1772, + "time_per_iteration": 2.4642930030822754 + }, + { + "auxiliary_loss_clip": 0.01187167, + "auxiliary_loss_mlp": 0.01062213, + "balance_loss_clip": 1.05618644, + "balance_loss_mlp": 1.03815699, + "epoch": 0.10659852698030964, + "flos": 18294655668480.0, + "grad_norm": 3.003480765536802, + "language_loss": 0.76156139, + "learning_rate": 3.93876781985337e-06, + "loss": 0.78405523, + "num_input_tokens_seen": 38340735, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.24072266, + "step": 1773, + "time_per_iteration": 2.4248268604278564 + }, + { + "auxiliary_loss_clip": 0.01187646, + "auxiliary_loss_mlp": 0.01061156, + "balance_loss_clip": 1.06134391, + "balance_loss_mlp": 1.03828001, + "epoch": 0.1066586502329776, + "flos": 32160591031680.0, + "grad_norm": 2.8032366711448202, + "language_loss": 0.82744372, + "learning_rate": 3.938672150753041e-06, + "loss": 0.84993172, + "num_input_tokens_seen": 38361315, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.22888184, + "step": 1774, + "time_per_iteration": 2.5086357593536377 + }, + { + "auxiliary_loss_clip": 0.01190913, + "auxiliary_loss_mlp": 0.01057029, + "balance_loss_clip": 1.06058455, + "balance_loss_mlp": 1.03409326, + "epoch": 0.10671877348564557, + "flos": 17785155202560.0, + "grad_norm": 2.6341064971611385, + "language_loss": 0.76419437, + "learning_rate": 3.9385764081380704e-06, + "loss": 0.78667372, + "num_input_tokens_seen": 38377425, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.22924805, + "step": 1775, + "time_per_iteration": 2.4289352893829346 + }, + { + "auxiliary_loss_clip": 0.01098175, + "auxiliary_loss_mlp": 0.01010678, + "balance_loss_clip": 1.0504452, + "balance_loss_mlp": 1.00736952, + "epoch": 0.10677889673831355, + "flos": 63510177813120.0, + "grad_norm": 0.8578670823462675, + "language_loss": 0.57407361, + "learning_rate": 3.9384805920120876e-06, + "loss": 0.59516215, + "num_input_tokens_seen": 38440275, + "router_z_loss_clip": 0.47753906, + "router_z_loss_mlp": 0.03308105, + "step": 1776, + "time_per_iteration": 3.0932374000549316 + }, + { + "auxiliary_loss_clip": 0.01186007, + "auxiliary_loss_mlp": 0.01057927, + "balance_loss_clip": 1.06122017, + "balance_loss_mlp": 1.03433561, + "epoch": 0.10683901999098151, + "flos": 22017694400640.0, + "grad_norm": 1.808438209078855, + "language_loss": 0.83336276, + "learning_rate": 3.938384702378727e-06, + "loss": 0.85580206, + "num_input_tokens_seen": 38461820, + "router_z_loss_clip": 1.24707031, + "router_z_loss_mlp": 0.23608398, + "step": 1777, + "time_per_iteration": 2.533440351486206 + }, + { + "auxiliary_loss_clip": 0.01180334, + "auxiliary_loss_mlp": 0.0105853, + "balance_loss_clip": 1.05922031, + "balance_loss_mlp": 1.03640473, + "epoch": 0.10689914324364948, + "flos": 25042952551680.0, + "grad_norm": 2.048118707939436, + "language_loss": 0.8744716, + "learning_rate": 3.938288739241625e-06, + "loss": 0.89686024, + "num_input_tokens_seen": 38482235, + "router_z_loss_clip": 1.21191406, + "router_z_loss_mlp": 0.22143555, + "step": 1778, + "time_per_iteration": 2.487856388092041 + }, + { + "auxiliary_loss_clip": 0.01187572, + "auxiliary_loss_mlp": 0.01053353, + "balance_loss_clip": 1.06298804, + "balance_loss_mlp": 1.0306201, + "epoch": 0.10695926649631746, + "flos": 16435129507200.0, + "grad_norm": 2.0257665076196045, + "language_loss": 0.84070408, + "learning_rate": 3.938192702604417e-06, + "loss": 0.8631134, + "num_input_tokens_seen": 38500690, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.22717285, + "step": 1779, + "time_per_iteration": 2.4458470344543457 + }, + { + "auxiliary_loss_clip": 0.01186689, + "auxiliary_loss_mlp": 0.01050372, + "balance_loss_clip": 1.06200075, + "balance_loss_mlp": 1.02961779, + "epoch": 0.10701938974898542, + "flos": 16979211792000.0, + "grad_norm": 2.9534726947054506, + "language_loss": 0.67051661, + "learning_rate": 3.9380965924707495e-06, + "loss": 0.69288719, + "num_input_tokens_seen": 38518405, + "router_z_loss_clip": 1.24707031, + "router_z_loss_mlp": 0.20751953, + "step": 1780, + "time_per_iteration": 2.4262077808380127 + }, + { + "auxiliary_loss_clip": 0.01186176, + "auxiliary_loss_mlp": 0.01051891, + "balance_loss_clip": 1.06193209, + "balance_loss_mlp": 1.02969432, + "epoch": 0.10707951300165339, + "flos": 15888102307200.0, + "grad_norm": 2.172319030163447, + "language_loss": 0.91880614, + "learning_rate": 3.938000408844265e-06, + "loss": 0.94118679, + "num_input_tokens_seen": 38535060, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.22180176, + "step": 1781, + "time_per_iteration": 2.4460020065307617 + }, + { + "auxiliary_loss_clip": 0.01192475, + "auxiliary_loss_mlp": 0.01054437, + "balance_loss_clip": 1.06545651, + "balance_loss_mlp": 1.03325403, + "epoch": 0.10713963625432135, + "flos": 14247164361600.0, + "grad_norm": 2.1751052164173235, + "language_loss": 0.79374957, + "learning_rate": 3.9379041517286105e-06, + "loss": 0.81621867, + "num_input_tokens_seen": 38552855, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.21179199, + "step": 1782, + "time_per_iteration": 2.4301795959472656 + }, + { + "auxiliary_loss_clip": 0.0118641, + "auxiliary_loss_mlp": 0.010516, + "balance_loss_clip": 1.06002831, + "balance_loss_mlp": 1.02974868, + "epoch": 0.10719975950698933, + "flos": 16756780821120.0, + "grad_norm": 1.8931001139359502, + "language_loss": 0.78896296, + "learning_rate": 3.937807821127436e-06, + "loss": 0.81134307, + "num_input_tokens_seen": 38570075, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.21862793, + "step": 1783, + "time_per_iteration": 2.4450523853302 + }, + { + "auxiliary_loss_clip": 0.01198137, + "auxiliary_loss_mlp": 0.01058489, + "balance_loss_clip": 1.06801951, + "balance_loss_mlp": 1.03538585, + "epoch": 0.1072598827596573, + "flos": 22710626645760.0, + "grad_norm": 2.2350201061071355, + "language_loss": 0.86193436, + "learning_rate": 3.937711417044395e-06, + "loss": 0.88450062, + "num_input_tokens_seen": 38587970, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.23095703, + "step": 1784, + "time_per_iteration": 2.446683168411255 + }, + { + "auxiliary_loss_clip": 0.01190791, + "auxiliary_loss_mlp": 0.01052395, + "balance_loss_clip": 1.06258631, + "balance_loss_mlp": 1.02955413, + "epoch": 0.10732000601232526, + "flos": 23258264376960.0, + "grad_norm": 2.201614969100845, + "language_loss": 1.0097369, + "learning_rate": 3.937614939483143e-06, + "loss": 1.03216875, + "num_input_tokens_seen": 38605840, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.22851562, + "step": 1785, + "time_per_iteration": 2.4687702655792236 + }, + { + "auxiliary_loss_clip": 0.01182759, + "auxiliary_loss_mlp": 0.01058614, + "balance_loss_clip": 1.0618242, + "balance_loss_mlp": 1.03757358, + "epoch": 0.10738012926499324, + "flos": 24207060176640.0, + "grad_norm": 1.3824996971130983, + "language_loss": 0.85133809, + "learning_rate": 3.937518388447339e-06, + "loss": 0.87375188, + "num_input_tokens_seen": 38627070, + "router_z_loss_clip": 1.20996094, + "router_z_loss_mlp": 0.21057129, + "step": 1786, + "time_per_iteration": 2.510801315307617 + }, + { + "auxiliary_loss_clip": 0.01181717, + "auxiliary_loss_mlp": 0.01056251, + "balance_loss_clip": 1.05659568, + "balance_loss_mlp": 1.03212345, + "epoch": 0.1074402525176612, + "flos": 20923065383040.0, + "grad_norm": 1.7620413666434305, + "language_loss": 0.78490233, + "learning_rate": 3.937421763940642e-06, + "loss": 0.80728209, + "num_input_tokens_seen": 38645840, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.24133301, + "step": 1787, + "time_per_iteration": 2.4719126224517822 + }, + { + "auxiliary_loss_clip": 0.01192523, + "auxiliary_loss_mlp": 0.01044262, + "balance_loss_clip": 1.06272411, + "balance_loss_mlp": 1.02211285, + "epoch": 0.10750037577032917, + "flos": 16946928443520.0, + "grad_norm": 2.6330724588685097, + "language_loss": 0.82658249, + "learning_rate": 3.937325065966719e-06, + "loss": 0.84895033, + "num_input_tokens_seen": 38664770, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.22143555, + "step": 1788, + "time_per_iteration": 2.428765058517456 + }, + { + "auxiliary_loss_clip": 0.01186501, + "auxiliary_loss_mlp": 0.01063283, + "balance_loss_clip": 1.0599004, + "balance_loss_mlp": 1.04132426, + "epoch": 0.10756049902299715, + "flos": 20266546550400.0, + "grad_norm": 1.7763495553435869, + "language_loss": 0.78196669, + "learning_rate": 3.9372282945292335e-06, + "loss": 0.80446458, + "num_input_tokens_seen": 38683865, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.21984863, + "step": 1789, + "time_per_iteration": 2.4681241512298584 + }, + { + "auxiliary_loss_clip": 0.01188019, + "auxiliary_loss_mlp": 0.01059633, + "balance_loss_clip": 1.06233907, + "balance_loss_mlp": 1.03437209, + "epoch": 0.10762062227566511, + "flos": 23586523793280.0, + "grad_norm": 3.750688819798608, + "language_loss": 0.74510193, + "learning_rate": 3.937131449631859e-06, + "loss": 0.76757842, + "num_input_tokens_seen": 38702485, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.25256348, + "step": 1790, + "time_per_iteration": 2.448842763900757 + }, + { + "auxiliary_loss_clip": 0.0118737, + "auxiliary_loss_mlp": 0.01067912, + "balance_loss_clip": 1.06145, + "balance_loss_mlp": 1.0432235, + "epoch": 0.10768074552833308, + "flos": 24310626065280.0, + "grad_norm": 2.422955662877877, + "language_loss": 0.78268313, + "learning_rate": 3.9370345312782645e-06, + "loss": 0.80523598, + "num_input_tokens_seen": 38722475, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.24694824, + "step": 1791, + "time_per_iteration": 2.495712995529175 + }, + { + "auxiliary_loss_clip": 0.01180181, + "auxiliary_loss_mlp": 0.01056424, + "balance_loss_clip": 1.06035089, + "balance_loss_mlp": 1.03448904, + "epoch": 0.10774086878100106, + "flos": 25299965341440.0, + "grad_norm": 1.6018801543094703, + "language_loss": 0.70886219, + "learning_rate": 3.936937539472126e-06, + "loss": 0.73122823, + "num_input_tokens_seen": 38743285, + "router_z_loss_clip": 1.20019531, + "router_z_loss_mlp": 0.21923828, + "step": 1792, + "time_per_iteration": 2.5234413146972656 + }, + { + "auxiliary_loss_clip": 0.01188502, + "auxiliary_loss_mlp": 0.01050499, + "balance_loss_clip": 1.05898833, + "balance_loss_mlp": 1.02702665, + "epoch": 0.10780099203366902, + "flos": 22054035985920.0, + "grad_norm": 1.8009879927461931, + "language_loss": 0.75791132, + "learning_rate": 3.9368404742171236e-06, + "loss": 0.78030133, + "num_input_tokens_seen": 38763035, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.23486328, + "step": 1793, + "time_per_iteration": 2.4626214504241943 + }, + { + "auxiliary_loss_clip": 0.01184183, + "auxiliary_loss_mlp": 0.0105936, + "balance_loss_clip": 1.06202269, + "balance_loss_mlp": 1.03696036, + "epoch": 0.10786111528633699, + "flos": 22747471021440.0, + "grad_norm": 1.6713239860351665, + "language_loss": 0.85023159, + "learning_rate": 3.936743335516936e-06, + "loss": 0.87266701, + "num_input_tokens_seen": 38784900, + "router_z_loss_clip": 1.22265625, + "router_z_loss_mlp": 0.22399902, + "step": 1794, + "time_per_iteration": 2.479753017425537 + }, + { + "auxiliary_loss_clip": 0.01191242, + "auxiliary_loss_mlp": 0.0105375, + "balance_loss_clip": 1.05979526, + "balance_loss_mlp": 1.02931237, + "epoch": 0.10792123853900495, + "flos": 20851064570880.0, + "grad_norm": 1.7334139168480722, + "language_loss": 0.74558008, + "learning_rate": 3.936646123375246e-06, + "loss": 0.76803005, + "num_input_tokens_seen": 38804695, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.24438477, + "step": 1795, + "time_per_iteration": 2.436511278152466 + }, + { + "auxiliary_loss_clip": 0.01191661, + "auxiliary_loss_mlp": 0.01060748, + "balance_loss_clip": 1.06374788, + "balance_loss_mlp": 1.03764486, + "epoch": 0.10798136179167293, + "flos": 17748705876480.0, + "grad_norm": 2.9919860423442013, + "language_loss": 0.81224382, + "learning_rate": 3.936548837795741e-06, + "loss": 0.83476794, + "num_input_tokens_seen": 38822395, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.2310791, + "step": 1796, + "time_per_iteration": 2.4414970874786377 + }, + { + "auxiliary_loss_clip": 0.01189282, + "auxiliary_loss_mlp": 0.01073267, + "balance_loss_clip": 1.06084085, + "balance_loss_mlp": 1.04880488, + "epoch": 0.1080414850443409, + "flos": 13589639948160.0, + "grad_norm": 2.3787366412648274, + "language_loss": 0.73669767, + "learning_rate": 3.936451478782111e-06, + "loss": 0.75932312, + "num_input_tokens_seen": 38839865, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.24487305, + "step": 1797, + "time_per_iteration": 2.416072130203247 + }, + { + "auxiliary_loss_clip": 0.01188995, + "auxiliary_loss_mlp": 0.01050783, + "balance_loss_clip": 1.06199992, + "balance_loss_mlp": 1.0299933, + "epoch": 0.10810160829700886, + "flos": 16253421580800.0, + "grad_norm": 1.9977362551142988, + "language_loss": 0.8133443, + "learning_rate": 3.936354046338046e-06, + "loss": 0.83574212, + "num_input_tokens_seen": 38857300, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.20776367, + "step": 1798, + "time_per_iteration": 2.406066656112671 + }, + { + "auxiliary_loss_clip": 0.0118418, + "auxiliary_loss_mlp": 0.0105685, + "balance_loss_clip": 1.0570693, + "balance_loss_mlp": 1.03272176, + "epoch": 0.10816173154967684, + "flos": 15158002464000.0, + "grad_norm": 2.320841504268658, + "language_loss": 0.85655296, + "learning_rate": 3.936256540467242e-06, + "loss": 0.87896323, + "num_input_tokens_seen": 38874960, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.24145508, + "step": 1799, + "time_per_iteration": 2.3844923973083496 + }, + { + "auxiliary_loss_clip": 0.0118395, + "auxiliary_loss_mlp": 0.01067211, + "balance_loss_clip": 1.06274509, + "balance_loss_mlp": 1.04509819, + "epoch": 0.10822185480234481, + "flos": 17785334770560.0, + "grad_norm": 2.856720614953574, + "language_loss": 0.77320033, + "learning_rate": 3.9361589611733955e-06, + "loss": 0.79571199, + "num_input_tokens_seen": 38893610, + "router_z_loss_clip": 1.21191406, + "router_z_loss_mlp": 0.22106934, + "step": 1800, + "time_per_iteration": 2.4467709064483643 + }, + { + "auxiliary_loss_clip": 0.01181177, + "auxiliary_loss_mlp": 0.01049383, + "balance_loss_clip": 1.05818748, + "balance_loss_mlp": 1.02815151, + "epoch": 0.10828197805501277, + "flos": 25556654908800.0, + "grad_norm": 1.713405124746458, + "language_loss": 0.72904313, + "learning_rate": 3.9360613084602075e-06, + "loss": 0.75134861, + "num_input_tokens_seen": 38913485, + "router_z_loss_clip": 1.23046875, + "router_z_loss_mlp": 0.21240234, + "step": 1801, + "time_per_iteration": 2.4834368228912354 + }, + { + "auxiliary_loss_clip": 0.01187443, + "auxiliary_loss_mlp": 0.01057796, + "balance_loss_clip": 1.05789495, + "balance_loss_mlp": 1.03449035, + "epoch": 0.10834210130768075, + "flos": 28984435845120.0, + "grad_norm": 1.9051324028361551, + "language_loss": 0.66051471, + "learning_rate": 3.935963582331381e-06, + "loss": 0.68296707, + "num_input_tokens_seen": 38935650, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.2331543, + "step": 1802, + "time_per_iteration": 2.5012922286987305 + }, + { + "auxiliary_loss_clip": 0.01187334, + "auxiliary_loss_mlp": 0.01061494, + "balance_loss_clip": 1.06441498, + "balance_loss_mlp": 1.03948832, + "epoch": 0.10840222456034872, + "flos": 20264212166400.0, + "grad_norm": 1.7508514421509873, + "language_loss": 0.8164264, + "learning_rate": 3.935865782790621e-06, + "loss": 0.83891463, + "num_input_tokens_seen": 38954130, + "router_z_loss_clip": 1.23144531, + "router_z_loss_mlp": 0.22009277, + "step": 1803, + "time_per_iteration": 2.4061975479125977 + }, + { + "auxiliary_loss_clip": 0.01186274, + "auxiliary_loss_mlp": 0.01057804, + "balance_loss_clip": 1.06340504, + "balance_loss_mlp": 1.03485608, + "epoch": 0.10846234781301668, + "flos": 19863054097920.0, + "grad_norm": 1.6071469572723465, + "language_loss": 0.9124763, + "learning_rate": 3.9357679098416365e-06, + "loss": 0.93491709, + "num_input_tokens_seen": 38972905, + "router_z_loss_clip": 1.22949219, + "router_z_loss_mlp": 0.22937012, + "step": 1804, + "time_per_iteration": 3.9171130657196045 + }, + { + "auxiliary_loss_clip": 0.01183354, + "auxiliary_loss_mlp": 0.01056285, + "balance_loss_clip": 1.05859971, + "balance_loss_mlp": 1.03356409, + "epoch": 0.10852247106568465, + "flos": 26469037296000.0, + "grad_norm": 1.847163864132092, + "language_loss": 0.76576668, + "learning_rate": 3.935669963488139e-06, + "loss": 0.78816307, + "num_input_tokens_seen": 38993255, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.22729492, + "step": 1805, + "time_per_iteration": 2.5035014152526855 + }, + { + "auxiliary_loss_clip": 0.01182175, + "auxiliary_loss_mlp": 0.01052852, + "balance_loss_clip": 1.05966997, + "balance_loss_mlp": 1.03202605, + "epoch": 0.10858259431835263, + "flos": 30081506987520.0, + "grad_norm": 2.1427860278587594, + "language_loss": 0.86309117, + "learning_rate": 3.935571943733843e-06, + "loss": 0.88544142, + "num_input_tokens_seen": 39012610, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.20825195, + "step": 1806, + "time_per_iteration": 3.79984974861145 + }, + { + "auxiliary_loss_clip": 0.01181946, + "auxiliary_loss_mlp": 0.0105515, + "balance_loss_clip": 1.05688238, + "balance_loss_mlp": 1.03375161, + "epoch": 0.10864271757102059, + "flos": 19063180085760.0, + "grad_norm": 67.63968034347921, + "language_loss": 0.80810159, + "learning_rate": 3.9354738505824635e-06, + "loss": 0.83047259, + "num_input_tokens_seen": 39030120, + "router_z_loss_clip": 1.25292969, + "router_z_loss_mlp": 0.21398926, + "step": 1807, + "time_per_iteration": 2.4146649837493896 + }, + { + "auxiliary_loss_clip": 0.01183022, + "auxiliary_loss_mlp": 0.01052747, + "balance_loss_clip": 1.05975258, + "balance_loss_mlp": 1.03270769, + "epoch": 0.10870284082368856, + "flos": 24715052271360.0, + "grad_norm": 2.14024703075428, + "language_loss": 0.79164934, + "learning_rate": 3.9353756840377225e-06, + "loss": 0.81400698, + "num_input_tokens_seen": 39049875, + "router_z_loss_clip": 1.23242188, + "router_z_loss_mlp": 0.20031738, + "step": 1808, + "time_per_iteration": 3.835257053375244 + }, + { + "auxiliary_loss_clip": 0.01182526, + "auxiliary_loss_mlp": 0.01053824, + "balance_loss_clip": 1.05948186, + "balance_loss_mlp": 1.03229451, + "epoch": 0.10876296407635654, + "flos": 20627663932800.0, + "grad_norm": 1.6955539303174516, + "language_loss": 0.79472816, + "learning_rate": 3.935277444103342e-06, + "loss": 0.8170917, + "num_input_tokens_seen": 39068935, + "router_z_loss_clip": 1.23046875, + "router_z_loss_mlp": 0.21520996, + "step": 1809, + "time_per_iteration": 2.477532148361206 + }, + { + "auxiliary_loss_clip": 0.01189349, + "auxiliary_loss_mlp": 0.01058877, + "balance_loss_clip": 1.06368685, + "balance_loss_mlp": 1.03731203, + "epoch": 0.1088230873290245, + "flos": 21579835610880.0, + "grad_norm": 2.5207931089440865, + "language_loss": 0.84638882, + "learning_rate": 3.935179130783046e-06, + "loss": 0.86887103, + "num_input_tokens_seen": 39087370, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.21569824, + "step": 1810, + "time_per_iteration": 3.904587745666504 + }, + { + "auxiliary_loss_clip": 0.01186183, + "auxiliary_loss_mlp": 0.01056757, + "balance_loss_clip": 1.05899441, + "balance_loss_mlp": 1.03358293, + "epoch": 0.10888321058169247, + "flos": 26469037296000.0, + "grad_norm": 4.080835820451001, + "language_loss": 0.63546228, + "learning_rate": 3.935080744080564e-06, + "loss": 0.65789163, + "num_input_tokens_seen": 39106635, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.23181152, + "step": 1811, + "time_per_iteration": 2.487563133239746 + }, + { + "auxiliary_loss_clip": 0.01184055, + "auxiliary_loss_mlp": 0.01048243, + "balance_loss_clip": 1.05891836, + "balance_loss_mlp": 1.02608204, + "epoch": 0.10894333383436045, + "flos": 25848608653440.0, + "grad_norm": 2.576541082817046, + "language_loss": 0.74339676, + "learning_rate": 3.934982283999626e-06, + "loss": 0.76571977, + "num_input_tokens_seen": 39126335, + "router_z_loss_clip": 1.25292969, + "router_z_loss_mlp": 0.22143555, + "step": 1812, + "time_per_iteration": 2.513904333114624 + }, + { + "auxiliary_loss_clip": 0.01184661, + "auxiliary_loss_mlp": 0.01056401, + "balance_loss_clip": 1.05975342, + "balance_loss_mlp": 1.03422773, + "epoch": 0.10900345708702841, + "flos": 19537093152000.0, + "grad_norm": 3.3441852156138463, + "language_loss": 0.72591782, + "learning_rate": 3.934883750543966e-06, + "loss": 0.74832845, + "num_input_tokens_seen": 39144820, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.22180176, + "step": 1813, + "time_per_iteration": 2.5619804859161377 + }, + { + "auxiliary_loss_clip": 0.01192139, + "auxiliary_loss_mlp": 0.01049462, + "balance_loss_clip": 1.06865406, + "balance_loss_mlp": 1.02817082, + "epoch": 0.10906358033969638, + "flos": 23623296341760.0, + "grad_norm": 2.0173176484811575, + "language_loss": 0.83055407, + "learning_rate": 3.93478514371732e-06, + "loss": 0.85297012, + "num_input_tokens_seen": 39165945, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.21289062, + "step": 1814, + "time_per_iteration": 2.536464214324951 + }, + { + "auxiliary_loss_clip": 0.01186148, + "auxiliary_loss_mlp": 0.01059691, + "balance_loss_clip": 1.05990469, + "balance_loss_mlp": 1.03817391, + "epoch": 0.10912370359236434, + "flos": 21214731818880.0, + "grad_norm": 2.0543439260084426, + "language_loss": 0.84167635, + "learning_rate": 3.934686463523429e-06, + "loss": 0.86413473, + "num_input_tokens_seen": 39183520, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.21533203, + "step": 1815, + "time_per_iteration": 2.450951099395752 + }, + { + "auxiliary_loss_clip": 0.01187859, + "auxiliary_loss_mlp": 0.01054567, + "balance_loss_clip": 1.06542945, + "balance_loss_mlp": 1.03186917, + "epoch": 0.10918382684503232, + "flos": 13553190622080.0, + "grad_norm": 2.906139096436843, + "language_loss": 0.71583593, + "learning_rate": 3.9345877099660315e-06, + "loss": 0.73826015, + "num_input_tokens_seen": 39201190, + "router_z_loss_clip": 1.22460938, + "router_z_loss_mlp": 0.22705078, + "step": 1816, + "time_per_iteration": 2.507141351699829 + }, + { + "auxiliary_loss_clip": 0.0118815, + "auxiliary_loss_mlp": 0.01058222, + "balance_loss_clip": 1.06185031, + "balance_loss_mlp": 1.03578639, + "epoch": 0.10924395009770028, + "flos": 27964321591680.0, + "grad_norm": 2.3768241578835316, + "language_loss": 0.72851264, + "learning_rate": 3.9344888830488744e-06, + "loss": 0.75097638, + "num_input_tokens_seen": 39221210, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.2244873, + "step": 1817, + "time_per_iteration": 2.4910147190093994 + }, + { + "auxiliary_loss_clip": 0.01186402, + "auxiliary_loss_mlp": 0.01060324, + "balance_loss_clip": 1.06142175, + "balance_loss_mlp": 1.0378406, + "epoch": 0.10930407335036825, + "flos": 25593750679680.0, + "grad_norm": 1.716044392660274, + "language_loss": 0.67247599, + "learning_rate": 3.934389982775706e-06, + "loss": 0.69494331, + "num_input_tokens_seen": 39242025, + "router_z_loss_clip": 1.24707031, + "router_z_loss_mlp": 0.22485352, + "step": 1818, + "time_per_iteration": 2.477154493331909 + }, + { + "auxiliary_loss_clip": 0.0120181, + "auxiliary_loss_mlp": 0.01065212, + "balance_loss_clip": 1.07319188, + "balance_loss_mlp": 1.04292023, + "epoch": 0.10936419660303623, + "flos": 18406194376320.0, + "grad_norm": 2.2737708084379453, + "language_loss": 0.72850502, + "learning_rate": 3.934291009150275e-06, + "loss": 0.75117528, + "num_input_tokens_seen": 39259870, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.22302246, + "step": 1819, + "time_per_iteration": 2.515254497528076 + }, + { + "auxiliary_loss_clip": 0.01187867, + "auxiliary_loss_mlp": 0.01051123, + "balance_loss_clip": 1.06413484, + "balance_loss_mlp": 1.03048778, + "epoch": 0.1094243198557042, + "flos": 23840052963840.0, + "grad_norm": 2.548467956035635, + "language_loss": 0.7373445, + "learning_rate": 3.934191962176335e-06, + "loss": 0.75973439, + "num_input_tokens_seen": 39278500, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.20629883, + "step": 1820, + "time_per_iteration": 2.576469898223877 + }, + { + "auxiliary_loss_clip": 0.01185879, + "auxiliary_loss_mlp": 0.01065269, + "balance_loss_clip": 1.06260228, + "balance_loss_mlp": 1.04117727, + "epoch": 0.10948444310837216, + "flos": 14643940970880.0, + "grad_norm": 2.42486875351252, + "language_loss": 0.82936239, + "learning_rate": 3.934092841857642e-06, + "loss": 0.85187387, + "num_input_tokens_seen": 39294800, + "router_z_loss_clip": 1.23144531, + "router_z_loss_mlp": 0.2409668, + "step": 1821, + "time_per_iteration": 2.5024049282073975 + }, + { + "auxiliary_loss_clip": 0.01183348, + "auxiliary_loss_mlp": 0.01058066, + "balance_loss_clip": 1.05897212, + "balance_loss_mlp": 1.03645337, + "epoch": 0.10954456636104014, + "flos": 27818811596160.0, + "grad_norm": 1.9048753593639502, + "language_loss": 0.76112223, + "learning_rate": 3.933993648197955e-06, + "loss": 0.78353631, + "num_input_tokens_seen": 39314625, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.21606445, + "step": 1822, + "time_per_iteration": 2.566246509552002 + }, + { + "auxiliary_loss_clip": 0.01193837, + "auxiliary_loss_mlp": 0.01050914, + "balance_loss_clip": 1.07044423, + "balance_loss_mlp": 1.0303266, + "epoch": 0.1096046896137081, + "flos": 33620934372480.0, + "grad_norm": 3.204267247818408, + "language_loss": 0.79482162, + "learning_rate": 3.933894381201034e-06, + "loss": 0.81726915, + "num_input_tokens_seen": 39336465, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.20593262, + "step": 1823, + "time_per_iteration": 2.601745367050171 + }, + { + "auxiliary_loss_clip": 0.01190571, + "auxiliary_loss_mlp": 0.01047159, + "balance_loss_clip": 1.06792879, + "balance_loss_mlp": 1.02555776, + "epoch": 0.10966481286637607, + "flos": 26980010219520.0, + "grad_norm": 1.5921558654739183, + "language_loss": 0.79428577, + "learning_rate": 3.933795040870645e-06, + "loss": 0.81666303, + "num_input_tokens_seen": 39357930, + "router_z_loss_clip": 1.22558594, + "router_z_loss_mlp": 0.21594238, + "step": 1824, + "time_per_iteration": 2.5511744022369385 + }, + { + "auxiliary_loss_clip": 0.0118066, + "auxiliary_loss_mlp": 0.01053467, + "balance_loss_clip": 1.06046772, + "balance_loss_mlp": 1.0324862, + "epoch": 0.10972493611904403, + "flos": 23036551678080.0, + "grad_norm": 2.013667328335689, + "language_loss": 0.87826431, + "learning_rate": 3.933695627210554e-06, + "loss": 0.90060556, + "num_input_tokens_seen": 39376380, + "router_z_loss_clip": 1.20214844, + "router_z_loss_mlp": 0.21008301, + "step": 1825, + "time_per_iteration": 2.554647445678711 + }, + { + "auxiliary_loss_clip": 0.01181886, + "auxiliary_loss_mlp": 0.01052387, + "balance_loss_clip": 1.05984485, + "balance_loss_mlp": 1.03100061, + "epoch": 0.10978505937171201, + "flos": 38104632443520.0, + "grad_norm": 1.8709406883526107, + "language_loss": 0.76494467, + "learning_rate": 3.933596140224532e-06, + "loss": 0.78728735, + "num_input_tokens_seen": 39399935, + "router_z_loss_clip": 1.22070312, + "router_z_loss_mlp": 0.21398926, + "step": 1826, + "time_per_iteration": 2.6483936309814453 + }, + { + "auxiliary_loss_clip": 0.01119612, + "auxiliary_loss_mlp": 0.01026489, + "balance_loss_clip": 1.06554103, + "balance_loss_mlp": 1.02326727, + "epoch": 0.10984518262437998, + "flos": 59849694616320.0, + "grad_norm": 0.8552516578351294, + "language_loss": 0.55006325, + "learning_rate": 3.93349657991635e-06, + "loss": 0.57152426, + "num_input_tokens_seen": 39460685, + "router_z_loss_clip": 0.54101562, + "router_z_loss_mlp": 0.03216553, + "step": 1827, + "time_per_iteration": 3.1820261478424072 + }, + { + "auxiliary_loss_clip": 0.01113981, + "auxiliary_loss_mlp": 0.01031038, + "balance_loss_clip": 1.05868626, + "balance_loss_mlp": 1.02773046, + "epoch": 0.10990530587704794, + "flos": 66719837410560.0, + "grad_norm": 0.7446143487817785, + "language_loss": 0.55394483, + "learning_rate": 3.933396946289784e-06, + "loss": 0.57539499, + "num_input_tokens_seen": 39524765, + "router_z_loss_clip": 0.55273438, + "router_z_loss_mlp": 0.03308105, + "step": 1828, + "time_per_iteration": 3.4714419841766357 + }, + { + "auxiliary_loss_clip": 0.01192074, + "auxiliary_loss_mlp": 0.01054148, + "balance_loss_clip": 1.06542313, + "balance_loss_mlp": 1.03135514, + "epoch": 0.10996542912971592, + "flos": 25447199189760.0, + "grad_norm": 2.366513564343421, + "language_loss": 0.84221268, + "learning_rate": 3.933297239348612e-06, + "loss": 0.86467493, + "num_input_tokens_seen": 39543640, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.22802734, + "step": 1829, + "time_per_iteration": 2.757570505142212 + }, + { + "auxiliary_loss_clip": 0.0118429, + "auxiliary_loss_mlp": 0.01047713, + "balance_loss_clip": 1.05946708, + "balance_loss_mlp": 1.02451444, + "epoch": 0.11002555238238389, + "flos": 44018186186880.0, + "grad_norm": 1.977410546719858, + "language_loss": 0.88682127, + "learning_rate": 3.933197459096614e-06, + "loss": 0.9091413, + "num_input_tokens_seen": 39567525, + "router_z_loss_clip": 1.24804688, + "router_z_loss_mlp": 0.23193359, + "step": 1830, + "time_per_iteration": 2.709014415740967 + }, + { + "auxiliary_loss_clip": 0.01098492, + "auxiliary_loss_mlp": 0.01015342, + "balance_loss_clip": 1.04639149, + "balance_loss_mlp": 1.01129532, + "epoch": 0.11008567563505185, + "flos": 54065133590400.0, + "grad_norm": 0.6937518611113406, + "language_loss": 0.55507255, + "learning_rate": 3.9330976055375756e-06, + "loss": 0.57621092, + "num_input_tokens_seen": 39628470, + "router_z_loss_clip": 0.52148438, + "router_z_loss_mlp": 0.04052734, + "step": 1831, + "time_per_iteration": 3.0791521072387695 + }, + { + "auxiliary_loss_clip": 0.01195183, + "auxiliary_loss_mlp": 0.01073827, + "balance_loss_clip": 1.06449389, + "balance_loss_mlp": 1.04918694, + "epoch": 0.11014579888771983, + "flos": 24243150366720.0, + "grad_norm": 2.5176130824680985, + "language_loss": 0.91020775, + "learning_rate": 3.932997678675282e-06, + "loss": 0.93289781, + "num_input_tokens_seen": 39646670, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.24633789, + "step": 1832, + "time_per_iteration": 2.8802671432495117 + }, + { + "auxiliary_loss_clip": 0.01099915, + "auxiliary_loss_mlp": 0.01014957, + "balance_loss_clip": 1.04623842, + "balance_loss_mlp": 1.01183677, + "epoch": 0.1102059221403878, + "flos": 57743965658880.0, + "grad_norm": 0.7157299133288315, + "language_loss": 0.59883869, + "learning_rate": 3.932897678513523e-06, + "loss": 0.61998737, + "num_input_tokens_seen": 39712915, + "router_z_loss_clip": 0.53710938, + "router_z_loss_mlp": 0.03121948, + "step": 1833, + "time_per_iteration": 3.183952808380127 + }, + { + "auxiliary_loss_clip": 0.0118831, + "auxiliary_loss_mlp": 0.01053513, + "balance_loss_clip": 1.06222427, + "balance_loss_mlp": 1.03095841, + "epoch": 0.11026604539305576, + "flos": 16795923667200.0, + "grad_norm": 2.7786908228975196, + "language_loss": 0.80333412, + "learning_rate": 3.93279760505609e-06, + "loss": 0.82575238, + "num_input_tokens_seen": 39730650, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.22570801, + "step": 1834, + "time_per_iteration": 2.4449188709259033 + }, + { + "auxiliary_loss_clip": 0.01197203, + "auxiliary_loss_mlp": 0.01061176, + "balance_loss_clip": 1.07011187, + "balance_loss_mlp": 1.03696465, + "epoch": 0.11032616864572373, + "flos": 23988076911360.0, + "grad_norm": 3.0198672728920433, + "language_loss": 0.90542901, + "learning_rate": 3.932697458306779e-06, + "loss": 0.92801273, + "num_input_tokens_seen": 39751065, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.24243164, + "step": 1835, + "time_per_iteration": 2.4639949798583984 + }, + { + "auxiliary_loss_clip": 0.01189193, + "auxiliary_loss_mlp": 0.01059801, + "balance_loss_clip": 1.06365347, + "balance_loss_mlp": 1.03638804, + "epoch": 0.1103862918983917, + "flos": 19683141851520.0, + "grad_norm": 1.9919861319760503, + "language_loss": 0.63286841, + "learning_rate": 3.932597238269386e-06, + "loss": 0.65535831, + "num_input_tokens_seen": 39769245, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.23425293, + "step": 1836, + "time_per_iteration": 2.43735408782959 + }, + { + "auxiliary_loss_clip": 0.01186733, + "auxiliary_loss_mlp": 0.01066217, + "balance_loss_clip": 1.06054854, + "balance_loss_mlp": 1.04443789, + "epoch": 0.11044641515105967, + "flos": 32160878340480.0, + "grad_norm": 1.9783470981833953, + "language_loss": 0.73088837, + "learning_rate": 3.932496944947711e-06, + "loss": 0.75341785, + "num_input_tokens_seen": 39790830, + "router_z_loss_clip": 1.26269531, + "router_z_loss_mlp": 0.21777344, + "step": 1837, + "time_per_iteration": 2.522085428237915 + }, + { + "auxiliary_loss_clip": 0.01195122, + "auxiliary_loss_mlp": 0.01062323, + "balance_loss_clip": 1.06865132, + "balance_loss_mlp": 1.04024565, + "epoch": 0.11050653840372764, + "flos": 16689233295360.0, + "grad_norm": 1.9938272115300606, + "language_loss": 0.78623354, + "learning_rate": 3.93239657834556e-06, + "loss": 0.80880797, + "num_input_tokens_seen": 39809475, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.22070312, + "step": 1838, + "time_per_iteration": 2.4260034561157227 + }, + { + "auxiliary_loss_clip": 0.01185026, + "auxiliary_loss_mlp": 0.01071231, + "balance_loss_clip": 1.06251884, + "balance_loss_mlp": 1.0488193, + "epoch": 0.11056666165639562, + "flos": 21208877902080.0, + "grad_norm": 2.0725970546691395, + "language_loss": 0.71731806, + "learning_rate": 3.932296138466736e-06, + "loss": 0.73988062, + "num_input_tokens_seen": 39826355, + "router_z_loss_clip": 1.22460938, + "router_z_loss_mlp": 0.22436523, + "step": 1839, + "time_per_iteration": 2.428766965866089 + }, + { + "auxiliary_loss_clip": 0.01190346, + "auxiliary_loss_mlp": 0.01059229, + "balance_loss_clip": 1.06415665, + "balance_loss_mlp": 1.03594768, + "epoch": 0.11062678490906358, + "flos": 19165488998400.0, + "grad_norm": 2.40086742902813, + "language_loss": 0.78536439, + "learning_rate": 3.93219562531505e-06, + "loss": 0.80786014, + "num_input_tokens_seen": 39845335, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.23303223, + "step": 1840, + "time_per_iteration": 2.48101806640625 + }, + { + "auxiliary_loss_clip": 0.01184476, + "auxiliary_loss_mlp": 0.01054292, + "balance_loss_clip": 1.06245184, + "balance_loss_mlp": 1.03115332, + "epoch": 0.11068690816173155, + "flos": 24895287740160.0, + "grad_norm": 1.842686321321798, + "language_loss": 0.87733638, + "learning_rate": 3.932095038894311e-06, + "loss": 0.89972407, + "num_input_tokens_seen": 39865065, + "router_z_loss_clip": 1.21972656, + "router_z_loss_mlp": 0.23156738, + "step": 1841, + "time_per_iteration": 2.4659688472747803 + }, + { + "auxiliary_loss_clip": 0.01188785, + "auxiliary_loss_mlp": 0.0106229, + "balance_loss_clip": 1.06726742, + "balance_loss_mlp": 1.03975964, + "epoch": 0.11074703141439952, + "flos": 16472368932480.0, + "grad_norm": 4.165684606747619, + "language_loss": 0.90856737, + "learning_rate": 3.931994379208334e-06, + "loss": 0.93107814, + "num_input_tokens_seen": 39882780, + "router_z_loss_clip": 1.21484375, + "router_z_loss_mlp": 0.22509766, + "step": 1842, + "time_per_iteration": 2.440635919570923 + }, + { + "auxiliary_loss_clip": 0.01183916, + "auxiliary_loss_mlp": 0.01064935, + "balance_loss_clip": 1.06019127, + "balance_loss_mlp": 1.0440973, + "epoch": 0.11080715466706749, + "flos": 19172420323200.0, + "grad_norm": 2.31124272631711, + "language_loss": 0.8565287, + "learning_rate": 3.931893646260937e-06, + "loss": 0.87901723, + "num_input_tokens_seen": 39900295, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.20825195, + "step": 1843, + "time_per_iteration": 2.4233479499816895 + }, + { + "auxiliary_loss_clip": 0.0119284, + "auxiliary_loss_mlp": 0.01059682, + "balance_loss_clip": 1.07085013, + "balance_loss_mlp": 1.03654361, + "epoch": 0.11086727791973545, + "flos": 27704687109120.0, + "grad_norm": 1.7001281872840375, + "language_loss": 0.74457985, + "learning_rate": 3.931792840055941e-06, + "loss": 0.76710504, + "num_input_tokens_seen": 39922075, + "router_z_loss_clip": 1.22070312, + "router_z_loss_mlp": 0.23132324, + "step": 1844, + "time_per_iteration": 2.480886220932007 + }, + { + "auxiliary_loss_clip": 0.01186465, + "auxiliary_loss_mlp": 0.01059729, + "balance_loss_clip": 1.06161356, + "balance_loss_mlp": 1.03568447, + "epoch": 0.11092740117240343, + "flos": 18514967736960.0, + "grad_norm": 2.007945082522356, + "language_loss": 0.75646842, + "learning_rate": 3.931691960597165e-06, + "loss": 0.77893043, + "num_input_tokens_seen": 39940115, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.24047852, + "step": 1845, + "time_per_iteration": 2.3963053226470947 + }, + { + "auxiliary_loss_clip": 0.01192121, + "auxiliary_loss_mlp": 0.01069709, + "balance_loss_clip": 1.06489122, + "balance_loss_mlp": 1.04777479, + "epoch": 0.1109875244250714, + "flos": 20522446018560.0, + "grad_norm": 2.407286330389352, + "language_loss": 0.7631408, + "learning_rate": 3.9315910078884375e-06, + "loss": 0.78575903, + "num_input_tokens_seen": 39959920, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.21936035, + "step": 1846, + "time_per_iteration": 2.452530860900879 + }, + { + "auxiliary_loss_clip": 0.0119692, + "auxiliary_loss_mlp": 0.01060813, + "balance_loss_clip": 1.06844926, + "balance_loss_mlp": 1.03881836, + "epoch": 0.11104764767773936, + "flos": 14098601710080.0, + "grad_norm": 2.748334890406079, + "language_loss": 0.86165303, + "learning_rate": 3.931489981933584e-06, + "loss": 0.88423038, + "num_input_tokens_seen": 39974755, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.2199707, + "step": 1847, + "time_per_iteration": 3.898207664489746 + }, + { + "auxiliary_loss_clip": 0.01189909, + "auxiliary_loss_mlp": 0.01056827, + "balance_loss_clip": 1.0646404, + "balance_loss_mlp": 1.03453541, + "epoch": 0.11110777093040733, + "flos": 20594518657920.0, + "grad_norm": 3.824500625620375, + "language_loss": 0.7709347, + "learning_rate": 3.931388882736438e-06, + "loss": 0.79340214, + "num_input_tokens_seen": 39993355, + "router_z_loss_clip": 1.25292969, + "router_z_loss_mlp": 0.2232666, + "step": 1848, + "time_per_iteration": 2.4599876403808594 + }, + { + "auxiliary_loss_clip": 0.01196556, + "auxiliary_loss_mlp": 0.01056629, + "balance_loss_clip": 1.07577693, + "balance_loss_mlp": 1.03580356, + "epoch": 0.11116789418307531, + "flos": 21870065502720.0, + "grad_norm": 1.926164052637049, + "language_loss": 0.77903682, + "learning_rate": 3.931287710300832e-06, + "loss": 0.80156869, + "num_input_tokens_seen": 40012410, + "router_z_loss_clip": 1.20898438, + "router_z_loss_mlp": 0.20837402, + "step": 1849, + "time_per_iteration": 3.848999500274658 + }, + { + "auxiliary_loss_clip": 0.01191319, + "auxiliary_loss_mlp": 0.01056872, + "balance_loss_clip": 1.06631088, + "balance_loss_mlp": 1.03499699, + "epoch": 0.11122801743574327, + "flos": 15523106256000.0, + "grad_norm": 2.50795470513141, + "language_loss": 0.72533798, + "learning_rate": 3.931186464630601e-06, + "loss": 0.7478199, + "num_input_tokens_seen": 40029315, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.21875, + "step": 1850, + "time_per_iteration": 2.389335870742798 + }, + { + "auxiliary_loss_clip": 0.01195623, + "auxiliary_loss_mlp": 0.01055049, + "balance_loss_clip": 1.07194185, + "balance_loss_mlp": 1.03342485, + "epoch": 0.11128814068841124, + "flos": 14392279307520.0, + "grad_norm": 2.2355539385046317, + "language_loss": 0.81357384, + "learning_rate": 3.931085145729588e-06, + "loss": 0.83608061, + "num_input_tokens_seen": 40045765, + "router_z_loss_clip": 1.23632812, + "router_z_loss_mlp": 0.21606445, + "step": 1851, + "time_per_iteration": 3.790996551513672 + }, + { + "auxiliary_loss_clip": 0.01195619, + "auxiliary_loss_mlp": 0.01052511, + "balance_loss_clip": 1.07171762, + "balance_loss_mlp": 1.03176868, + "epoch": 0.11134826394107922, + "flos": 16653933204480.0, + "grad_norm": 3.7071066622988966, + "language_loss": 0.88308752, + "learning_rate": 3.930983753601631e-06, + "loss": 0.90556884, + "num_input_tokens_seen": 40061660, + "router_z_loss_clip": 1.24023438, + "router_z_loss_mlp": 0.2076416, + "step": 1852, + "time_per_iteration": 2.3826403617858887 + }, + { + "auxiliary_loss_clip": 0.01194434, + "auxiliary_loss_mlp": 0.01064203, + "balance_loss_clip": 1.06687903, + "balance_loss_mlp": 1.0410645, + "epoch": 0.11140838719374718, + "flos": 16690993061760.0, + "grad_norm": 5.082134115613975, + "language_loss": 0.72880983, + "learning_rate": 3.930882288250578e-06, + "loss": 0.75139618, + "num_input_tokens_seen": 40080180, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.23132324, + "step": 1853, + "time_per_iteration": 3.789536237716675 + }, + { + "auxiliary_loss_clip": 0.01103388, + "auxiliary_loss_mlp": 0.01050491, + "balance_loss_clip": 1.04991293, + "balance_loss_mlp": 1.04739714, + "epoch": 0.11146851044641515, + "flos": 60976355587200.0, + "grad_norm": 0.7821590321078509, + "language_loss": 0.53672385, + "learning_rate": 3.930780749680273e-06, + "loss": 0.55826259, + "num_input_tokens_seen": 40138910, + "router_z_loss_clip": 0.53417969, + "router_z_loss_mlp": 0.03094482, + "step": 1854, + "time_per_iteration": 3.0924324989318848 + }, + { + "auxiliary_loss_clip": 0.01196113, + "auxiliary_loss_mlp": 0.01057488, + "balance_loss_clip": 1.06604242, + "balance_loss_mlp": 1.03398025, + "epoch": 0.11152863369908313, + "flos": 22193835719040.0, + "grad_norm": 2.154289610533035, + "language_loss": 0.85072076, + "learning_rate": 3.9306791378945705e-06, + "loss": 0.8732568, + "num_input_tokens_seen": 40157745, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.23522949, + "step": 1855, + "time_per_iteration": 2.4192278385162354 + }, + { + "auxiliary_loss_clip": 0.01184706, + "auxiliary_loss_mlp": 0.01068114, + "balance_loss_clip": 1.06080246, + "balance_loss_mlp": 1.04741931, + "epoch": 0.11158875695175109, + "flos": 19537524115200.0, + "grad_norm": 4.303645886439197, + "language_loss": 0.81621921, + "learning_rate": 3.9305774528973205e-06, + "loss": 0.83874738, + "num_input_tokens_seen": 40175375, + "router_z_loss_clip": 1.23730469, + "router_z_loss_mlp": 0.20703125, + "step": 1856, + "time_per_iteration": 2.49538516998291 + }, + { + "auxiliary_loss_clip": 0.01188544, + "auxiliary_loss_mlp": 0.01052803, + "balance_loss_clip": 1.06668544, + "balance_loss_mlp": 1.03021264, + "epoch": 0.11164888020441906, + "flos": 25442709989760.0, + "grad_norm": 2.1369348209012644, + "language_loss": 0.83132398, + "learning_rate": 3.93047569469238e-06, + "loss": 0.85373741, + "num_input_tokens_seen": 40195715, + "router_z_loss_clip": 1.21777344, + "router_z_loss_mlp": 0.22595215, + "step": 1857, + "time_per_iteration": 2.485463857650757 + }, + { + "auxiliary_loss_clip": 0.0118215, + "auxiliary_loss_mlp": 0.01046816, + "balance_loss_clip": 1.05821562, + "balance_loss_mlp": 1.02620435, + "epoch": 0.11170900345708702, + "flos": 15632741543040.0, + "grad_norm": 2.2013130569542136, + "language_loss": 0.82818949, + "learning_rate": 3.930373863283608e-06, + "loss": 0.85047913, + "num_input_tokens_seen": 40213975, + "router_z_loss_clip": 1.23925781, + "router_z_loss_mlp": 0.20617676, + "step": 1858, + "time_per_iteration": 2.49849534034729 + }, + { + "auxiliary_loss_clip": 0.0118577, + "auxiliary_loss_mlp": 0.01057054, + "balance_loss_clip": 1.06186557, + "balance_loss_mlp": 1.03542912, + "epoch": 0.111769126709755, + "flos": 23039424766080.0, + "grad_norm": 2.1890668739952215, + "language_loss": 0.9188416, + "learning_rate": 3.930271958674866e-06, + "loss": 0.94126987, + "num_input_tokens_seen": 40233905, + "router_z_loss_clip": 1.23828125, + "router_z_loss_mlp": 0.21630859, + "step": 1859, + "time_per_iteration": 2.5238473415374756 + }, + { + "auxiliary_loss_clip": 0.01190046, + "auxiliary_loss_mlp": 0.01049216, + "balance_loss_clip": 1.06407166, + "balance_loss_mlp": 1.02767479, + "epoch": 0.11182924996242297, + "flos": 20850705434880.0, + "grad_norm": 3.8434776796844696, + "language_loss": 0.81687236, + "learning_rate": 3.930169980870018e-06, + "loss": 0.83926493, + "num_input_tokens_seen": 40252810, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.2154541, + "step": 1860, + "time_per_iteration": 2.53489351272583 + }, + { + "auxiliary_loss_clip": 0.01186384, + "auxiliary_loss_mlp": 0.01066888, + "balance_loss_clip": 1.06239867, + "balance_loss_mlp": 1.04509676, + "epoch": 0.11188937321509093, + "flos": 17455315587840.0, + "grad_norm": 2.6318353823110114, + "language_loss": 0.74987447, + "learning_rate": 3.930067929872931e-06, + "loss": 0.77240723, + "num_input_tokens_seen": 40272000, + "router_z_loss_clip": 1.24023438, + "router_z_loss_mlp": 0.21801758, + "step": 1861, + "time_per_iteration": 2.4214675426483154 + }, + { + "auxiliary_loss_clip": 0.01183851, + "auxiliary_loss_mlp": 0.01051997, + "balance_loss_clip": 1.06229579, + "balance_loss_mlp": 1.0318985, + "epoch": 0.11194949646775891, + "flos": 24095916518400.0, + "grad_norm": 1.862992333738635, + "language_loss": 0.88848972, + "learning_rate": 3.929965805687474e-06, + "loss": 0.91084826, + "num_input_tokens_seen": 40290660, + "router_z_loss_clip": 1.21484375, + "router_z_loss_mlp": 0.2010498, + "step": 1862, + "time_per_iteration": 2.4509942531585693 + }, + { + "auxiliary_loss_clip": 0.01199148, + "auxiliary_loss_mlp": 0.01061474, + "balance_loss_clip": 1.07522535, + "balance_loss_mlp": 1.03998017, + "epoch": 0.11200961972042688, + "flos": 25153880728320.0, + "grad_norm": 2.1639268739721333, + "language_loss": 0.87151945, + "learning_rate": 3.92986360831752e-06, + "loss": 0.8941257, + "num_input_tokens_seen": 40307820, + "router_z_loss_clip": 1.23730469, + "router_z_loss_mlp": 0.21484375, + "step": 1863, + "time_per_iteration": 2.4511051177978516 + }, + { + "auxiliary_loss_clip": 0.01187162, + "auxiliary_loss_mlp": 0.01060113, + "balance_loss_clip": 1.06328762, + "balance_loss_mlp": 1.03655779, + "epoch": 0.11206974297309484, + "flos": 21288312829440.0, + "grad_norm": 2.186867467156109, + "language_loss": 0.64325023, + "learning_rate": 3.929761337766945e-06, + "loss": 0.66572297, + "num_input_tokens_seen": 40327430, + "router_z_loss_clip": 1.23925781, + "router_z_loss_mlp": 0.2355957, + "step": 1864, + "time_per_iteration": 2.4816205501556396 + }, + { + "auxiliary_loss_clip": 0.01185018, + "auxiliary_loss_mlp": 0.01047655, + "balance_loss_clip": 1.06401324, + "balance_loss_mlp": 1.02724624, + "epoch": 0.11212986622576282, + "flos": 18915982151040.0, + "grad_norm": 2.901074988894668, + "language_loss": 0.73864186, + "learning_rate": 3.929658994039627e-06, + "loss": 0.76096857, + "num_input_tokens_seen": 40344545, + "router_z_loss_clip": 1.20898438, + "router_z_loss_mlp": 0.20422363, + "step": 1865, + "time_per_iteration": 2.3886055946350098 + }, + { + "auxiliary_loss_clip": 0.01184317, + "auxiliary_loss_mlp": 0.01058547, + "balance_loss_clip": 1.06204557, + "balance_loss_mlp": 1.03496718, + "epoch": 0.11218998947843078, + "flos": 22054754257920.0, + "grad_norm": 2.731002025040415, + "language_loss": 0.84571975, + "learning_rate": 3.929556577139446e-06, + "loss": 0.86814845, + "num_input_tokens_seen": 40362300, + "router_z_loss_clip": 1.22363281, + "router_z_loss_mlp": 0.23596191, + "step": 1866, + "time_per_iteration": 2.472496747970581 + }, + { + "auxiliary_loss_clip": 0.01184815, + "auxiliary_loss_mlp": 0.0105297, + "balance_loss_clip": 1.06156695, + "balance_loss_mlp": 1.03091598, + "epoch": 0.11225011273109875, + "flos": 24571697091840.0, + "grad_norm": 1.7860786899108596, + "language_loss": 0.81046784, + "learning_rate": 3.929454087070286e-06, + "loss": 0.83284569, + "num_input_tokens_seen": 40384720, + "router_z_loss_clip": 1.23339844, + "router_z_loss_mlp": 0.22070312, + "step": 1867, + "time_per_iteration": 2.492255449295044 + }, + { + "auxiliary_loss_clip": 0.01185519, + "auxiliary_loss_mlp": 0.0106034, + "balance_loss_clip": 1.06178677, + "balance_loss_mlp": 1.03885889, + "epoch": 0.11231023598376672, + "flos": 28438665621120.0, + "grad_norm": 2.2108462995614464, + "language_loss": 0.86385983, + "learning_rate": 3.929351523836035e-06, + "loss": 0.88631845, + "num_input_tokens_seen": 40404000, + "router_z_loss_clip": 1.23632812, + "router_z_loss_mlp": 0.21484375, + "step": 1868, + "time_per_iteration": 2.501051425933838 + }, + { + "auxiliary_loss_clip": 0.01186268, + "auxiliary_loss_mlp": 0.01052508, + "balance_loss_clip": 1.06509018, + "balance_loss_mlp": 1.03152704, + "epoch": 0.1123703592364347, + "flos": 14426466076800.0, + "grad_norm": 2.3773705592304664, + "language_loss": 0.68793786, + "learning_rate": 3.9292488874405795e-06, + "loss": 0.7103256, + "num_input_tokens_seen": 40418665, + "router_z_loss_clip": 1.21191406, + "router_z_loss_mlp": 0.20996094, + "step": 1869, + "time_per_iteration": 2.3945302963256836 + }, + { + "auxiliary_loss_clip": 0.01188833, + "auxiliary_loss_mlp": 0.01060559, + "balance_loss_clip": 1.06031871, + "balance_loss_mlp": 1.03756368, + "epoch": 0.11243048248910266, + "flos": 22236282616320.0, + "grad_norm": 1.6735897400691506, + "language_loss": 0.77115631, + "learning_rate": 3.929146177887814e-06, + "loss": 0.79365021, + "num_input_tokens_seen": 40437870, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.22998047, + "step": 1870, + "time_per_iteration": 2.483640193939209 + }, + { + "auxiliary_loss_clip": 0.01188256, + "auxiliary_loss_mlp": 0.01058036, + "balance_loss_clip": 1.06009007, + "balance_loss_mlp": 1.03508878, + "epoch": 0.11249060574177062, + "flos": 18584167288320.0, + "grad_norm": 1.9404405274050984, + "language_loss": 0.76080585, + "learning_rate": 3.929043395181631e-06, + "loss": 0.78326881, + "num_input_tokens_seen": 40455570, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.22937012, + "step": 1871, + "time_per_iteration": 2.411745309829712 + }, + { + "auxiliary_loss_clip": 0.01184261, + "auxiliary_loss_mlp": 0.01059303, + "balance_loss_clip": 1.06105256, + "balance_loss_mlp": 1.03747594, + "epoch": 0.1125507289944386, + "flos": 22856567604480.0, + "grad_norm": 2.423558454697112, + "language_loss": 0.81886029, + "learning_rate": 3.928940539325929e-06, + "loss": 0.84129596, + "num_input_tokens_seen": 40473600, + "router_z_loss_clip": 1.23046875, + "router_z_loss_mlp": 0.21801758, + "step": 1872, + "time_per_iteration": 2.438199043273926 + }, + { + "auxiliary_loss_clip": 0.01189879, + "auxiliary_loss_mlp": 0.010649, + "balance_loss_clip": 1.0645752, + "balance_loss_mlp": 1.04329908, + "epoch": 0.11261085224710657, + "flos": 19676390094720.0, + "grad_norm": 2.5226969757080577, + "language_loss": 0.8297075, + "learning_rate": 3.9288376103246095e-06, + "loss": 0.85225523, + "num_input_tokens_seen": 40490025, + "router_z_loss_clip": 1.25390625, + "router_z_loss_mlp": 0.21594238, + "step": 1873, + "time_per_iteration": 2.410595178604126 + }, + { + "auxiliary_loss_clip": 0.01195755, + "auxiliary_loss_mlp": 0.01068615, + "balance_loss_clip": 1.06328821, + "balance_loss_mlp": 1.04385483, + "epoch": 0.11267097549977453, + "flos": 26063246373120.0, + "grad_norm": 2.8417668720280522, + "language_loss": 0.92262328, + "learning_rate": 3.928734608181575e-06, + "loss": 0.94526696, + "num_input_tokens_seen": 40511580, + "router_z_loss_clip": 1.32519531, + "router_z_loss_mlp": 0.24780273, + "step": 1874, + "time_per_iteration": 2.4707720279693604 + }, + { + "auxiliary_loss_clip": 0.01193284, + "auxiliary_loss_mlp": 0.01065765, + "balance_loss_clip": 1.06920958, + "balance_loss_mlp": 1.04483223, + "epoch": 0.11273109875244251, + "flos": 21068036674560.0, + "grad_norm": 1.556875652175834, + "language_loss": 0.75269926, + "learning_rate": 3.928631532900729e-06, + "loss": 0.77528989, + "num_input_tokens_seen": 40530155, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 0.20922852, + "step": 1875, + "time_per_iteration": 2.4557414054870605 + }, + { + "auxiliary_loss_clip": 0.01183119, + "auxiliary_loss_mlp": 0.01059081, + "balance_loss_clip": 1.06441569, + "balance_loss_mlp": 1.0391252, + "epoch": 0.11279122200511048, + "flos": 27088999061760.0, + "grad_norm": 1.8955590158554092, + "language_loss": 0.7138747, + "learning_rate": 3.928528384485984e-06, + "loss": 0.73629665, + "num_input_tokens_seen": 40549500, + "router_z_loss_clip": 1.18554688, + "router_z_loss_mlp": 0.19946289, + "step": 1876, + "time_per_iteration": 2.493978261947632 + }, + { + "auxiliary_loss_clip": 0.01193956, + "auxiliary_loss_mlp": 0.01059704, + "balance_loss_clip": 1.06924331, + "balance_loss_mlp": 1.0373404, + "epoch": 0.11285134525777844, + "flos": 20187901722240.0, + "grad_norm": 1.9281380592280692, + "language_loss": 0.76923501, + "learning_rate": 3.9284251629412475e-06, + "loss": 0.79177165, + "num_input_tokens_seen": 40567475, + "router_z_loss_clip": 1.24609375, + "router_z_loss_mlp": 0.22363281, + "step": 1877, + "time_per_iteration": 2.4699158668518066 + }, + { + "auxiliary_loss_clip": 0.01194793, + "auxiliary_loss_mlp": 0.0106374, + "balance_loss_clip": 1.06909144, + "balance_loss_mlp": 1.04148388, + "epoch": 0.11291146851044641, + "flos": 12458453863680.0, + "grad_norm": 2.4679328937811293, + "language_loss": 0.88036323, + "learning_rate": 3.928321868270436e-06, + "loss": 0.9029485, + "num_input_tokens_seen": 40583280, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.22277832, + "step": 1878, + "time_per_iteration": 2.4390881061553955 + }, + { + "auxiliary_loss_clip": 0.01190377, + "auxiliary_loss_mlp": 0.01048285, + "balance_loss_clip": 1.06402993, + "balance_loss_mlp": 1.02704215, + "epoch": 0.11297159176311439, + "flos": 23842315520640.0, + "grad_norm": 2.9388637203931283, + "language_loss": 0.81108385, + "learning_rate": 3.928218500477466e-06, + "loss": 0.83347046, + "num_input_tokens_seen": 40603080, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.21240234, + "step": 1879, + "time_per_iteration": 2.591306447982788 + }, + { + "auxiliary_loss_clip": 0.01195045, + "auxiliary_loss_mlp": 0.01058209, + "balance_loss_clip": 1.0694468, + "balance_loss_mlp": 1.03573775, + "epoch": 0.11303171501578235, + "flos": 29930538124800.0, + "grad_norm": 2.0192602725616906, + "language_loss": 0.70355403, + "learning_rate": 3.928115059566259e-06, + "loss": 0.72608656, + "num_input_tokens_seen": 40623255, + "router_z_loss_clip": 1.25683594, + "router_z_loss_mlp": 0.22460938, + "step": 1880, + "time_per_iteration": 2.5582821369171143 + }, + { + "auxiliary_loss_clip": 0.01191772, + "auxiliary_loss_mlp": 0.01049127, + "balance_loss_clip": 1.06884503, + "balance_loss_mlp": 1.02679944, + "epoch": 0.11309183826845032, + "flos": 16180558842240.0, + "grad_norm": 1.6606275230717267, + "language_loss": 0.72109061, + "learning_rate": 3.928011545540734e-06, + "loss": 0.74349964, + "num_input_tokens_seen": 40641570, + "router_z_loss_clip": 1.23046875, + "router_z_loss_mlp": 0.22338867, + "step": 1881, + "time_per_iteration": 2.4362714290618896 + }, + { + "auxiliary_loss_clip": 0.01189139, + "auxiliary_loss_mlp": 0.01059866, + "balance_loss_clip": 1.0633719, + "balance_loss_mlp": 1.03687048, + "epoch": 0.1131519615211183, + "flos": 12020702814720.0, + "grad_norm": 2.508211555972009, + "language_loss": 0.74343818, + "learning_rate": 3.927907958404819e-06, + "loss": 0.76592815, + "num_input_tokens_seen": 40658775, + "router_z_loss_clip": 1.25683594, + "router_z_loss_mlp": 0.2298584, + "step": 1882, + "time_per_iteration": 2.419015884399414 + }, + { + "auxiliary_loss_clip": 0.01189702, + "auxiliary_loss_mlp": 0.01050264, + "balance_loss_clip": 1.06756926, + "balance_loss_mlp": 1.02729249, + "epoch": 0.11321208477378626, + "flos": 26250125857920.0, + "grad_norm": 2.1955745314378046, + "language_loss": 0.79183459, + "learning_rate": 3.92780429816244e-06, + "loss": 0.81423426, + "num_input_tokens_seen": 40679555, + "router_z_loss_clip": 1.22070312, + "router_z_loss_mlp": 0.2298584, + "step": 1883, + "time_per_iteration": 2.497035264968872 + }, + { + "auxiliary_loss_clip": 0.01187748, + "auxiliary_loss_mlp": 0.01055814, + "balance_loss_clip": 1.06254959, + "balance_loss_mlp": 1.03348637, + "epoch": 0.11327220802645423, + "flos": 13626376583040.0, + "grad_norm": 2.0749813910209247, + "language_loss": 0.77433729, + "learning_rate": 3.927700564817529e-06, + "loss": 0.7967729, + "num_input_tokens_seen": 40697295, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.22351074, + "step": 1884, + "time_per_iteration": 2.44232439994812 + }, + { + "auxiliary_loss_clip": 0.01114194, + "auxiliary_loss_mlp": 0.01007049, + "balance_loss_clip": 1.06183672, + "balance_loss_mlp": 1.00429201, + "epoch": 0.1133323312791222, + "flos": 57191802814080.0, + "grad_norm": 0.7907173062425837, + "language_loss": 0.5518409, + "learning_rate": 3.927596758374019e-06, + "loss": 0.57305336, + "num_input_tokens_seen": 40758095, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 0.02755737, + "step": 1885, + "time_per_iteration": 2.9777650833129883 + }, + { + "auxiliary_loss_clip": 0.01184418, + "auxiliary_loss_mlp": 0.01049255, + "balance_loss_clip": 1.0661056, + "balance_loss_mlp": 1.02770209, + "epoch": 0.11339245453179017, + "flos": 24351708245760.0, + "grad_norm": 1.8856722431251436, + "language_loss": 0.90354276, + "learning_rate": 3.927492878835848e-06, + "loss": 0.9258796, + "num_input_tokens_seen": 40777140, + "router_z_loss_clip": 1.18359375, + "router_z_loss_mlp": 0.21557617, + "step": 1886, + "time_per_iteration": 2.4701128005981445 + }, + { + "auxiliary_loss_clip": 0.01189383, + "auxiliary_loss_mlp": 0.01053907, + "balance_loss_clip": 1.06695771, + "balance_loss_mlp": 1.03227091, + "epoch": 0.11345257778445814, + "flos": 22670693700480.0, + "grad_norm": 2.6334659818191013, + "language_loss": 0.85059243, + "learning_rate": 3.927388926206953e-06, + "loss": 0.87302536, + "num_input_tokens_seen": 40797505, + "router_z_loss_clip": 1.22460938, + "router_z_loss_mlp": 0.21630859, + "step": 1887, + "time_per_iteration": 2.470350742340088 + }, + { + "auxiliary_loss_clip": 0.01188351, + "auxiliary_loss_mlp": 0.01063075, + "balance_loss_clip": 1.06509209, + "balance_loss_mlp": 1.0399127, + "epoch": 0.11351270103712612, + "flos": 20988242611200.0, + "grad_norm": 2.980109031130134, + "language_loss": 0.75807226, + "learning_rate": 3.927284900491277e-06, + "loss": 0.78058648, + "num_input_tokens_seen": 40812970, + "router_z_loss_clip": 1.23144531, + "router_z_loss_mlp": 0.23156738, + "step": 1888, + "time_per_iteration": 2.497922897338867 + }, + { + "auxiliary_loss_clip": 0.01195643, + "auxiliary_loss_mlp": 0.01062849, + "balance_loss_clip": 1.06585979, + "balance_loss_mlp": 1.03801751, + "epoch": 0.11357282428979408, + "flos": 37347923600640.0, + "grad_norm": 2.1259289204438065, + "language_loss": 0.67858362, + "learning_rate": 3.927180801692764e-06, + "loss": 0.70116854, + "num_input_tokens_seen": 40837745, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.24853516, + "step": 1889, + "time_per_iteration": 2.6314826011657715 + }, + { + "auxiliary_loss_clip": 0.01192217, + "auxiliary_loss_mlp": 0.01043657, + "balance_loss_clip": 1.06879735, + "balance_loss_mlp": 1.02165139, + "epoch": 0.11363294754246205, + "flos": 21757018423680.0, + "grad_norm": 2.1984359934564965, + "language_loss": 0.84202963, + "learning_rate": 3.927076629815362e-06, + "loss": 0.86438835, + "num_input_tokens_seen": 40856490, + "router_z_loss_clip": 1.23339844, + "router_z_loss_mlp": 0.21984863, + "step": 1890, + "time_per_iteration": 2.4441962242126465 + }, + { + "auxiliary_loss_clip": 0.01186526, + "auxiliary_loss_mlp": 0.01054401, + "balance_loss_clip": 1.06332445, + "balance_loss_mlp": 1.03266919, + "epoch": 0.11369307079513001, + "flos": 22601637803520.0, + "grad_norm": 2.255830186851208, + "language_loss": 0.64820993, + "learning_rate": 3.926972384863022e-06, + "loss": 0.67061919, + "num_input_tokens_seen": 40874070, + "router_z_loss_clip": 1.23046875, + "router_z_loss_mlp": 0.21740723, + "step": 1891, + "time_per_iteration": 3.894658088684082 + }, + { + "auxiliary_loss_clip": 0.01183762, + "auxiliary_loss_mlp": 0.01050007, + "balance_loss_clip": 1.05816603, + "balance_loss_mlp": 1.02642751, + "epoch": 0.11375319404779799, + "flos": 21944257044480.0, + "grad_norm": 1.9817657816843355, + "language_loss": 0.88424456, + "learning_rate": 3.9268680668396956e-06, + "loss": 0.90658224, + "num_input_tokens_seen": 40892425, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.23571777, + "step": 1892, + "time_per_iteration": 2.4572808742523193 + }, + { + "auxiliary_loss_clip": 0.01188501, + "auxiliary_loss_mlp": 0.01067305, + "balance_loss_clip": 1.06328046, + "balance_loss_mlp": 1.04317653, + "epoch": 0.11381331730046595, + "flos": 26395456285440.0, + "grad_norm": 4.076392798381851, + "language_loss": 0.73058701, + "learning_rate": 3.926763675749339e-06, + "loss": 0.7531451, + "num_input_tokens_seen": 40912190, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.24133301, + "step": 1893, + "time_per_iteration": 3.859575033187866 + }, + { + "auxiliary_loss_clip": 0.01178631, + "auxiliary_loss_mlp": 0.01065096, + "balance_loss_clip": 1.05624199, + "balance_loss_mlp": 1.04009736, + "epoch": 0.11387344055313392, + "flos": 23804716959360.0, + "grad_norm": 2.3208993659616026, + "language_loss": 0.79211354, + "learning_rate": 3.92665921159591e-06, + "loss": 0.81455082, + "num_input_tokens_seen": 40928395, + "router_z_loss_clip": 1.22265625, + "router_z_loss_mlp": 0.25, + "step": 1894, + "time_per_iteration": 2.437756061553955 + }, + { + "auxiliary_loss_clip": 0.01193401, + "auxiliary_loss_mlp": 0.01053501, + "balance_loss_clip": 1.06407833, + "balance_loss_mlp": 1.03029048, + "epoch": 0.1139335638058019, + "flos": 34522865902080.0, + "grad_norm": 2.943026687470092, + "language_loss": 0.80040705, + "learning_rate": 3.926554674383371e-06, + "loss": 0.8228761, + "num_input_tokens_seen": 40946555, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.23217773, + "step": 1895, + "time_per_iteration": 3.972810983657837 + }, + { + "auxiliary_loss_clip": 0.01095687, + "auxiliary_loss_mlp": 0.01009704, + "balance_loss_clip": 1.04295659, + "balance_loss_mlp": 1.00710797, + "epoch": 0.11399368705846986, + "flos": 70587811520640.0, + "grad_norm": 0.8037782117133693, + "language_loss": 0.63357979, + "learning_rate": 3.926450064115686e-06, + "loss": 0.6546337, + "num_input_tokens_seen": 41004910, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 0.02597046, + "step": 1896, + "time_per_iteration": 4.546117544174194 + }, + { + "auxiliary_loss_clip": 0.01188764, + "auxiliary_loss_mlp": 0.01059075, + "balance_loss_clip": 1.06397915, + "balance_loss_mlp": 1.0351615, + "epoch": 0.11405381031113783, + "flos": 21324259365120.0, + "grad_norm": 1.648920542562502, + "language_loss": 0.85023928, + "learning_rate": 3.926345380796821e-06, + "loss": 0.87271762, + "num_input_tokens_seen": 41026385, + "router_z_loss_clip": 1.24804688, + "router_z_loss_mlp": 0.23925781, + "step": 1897, + "time_per_iteration": 2.469836711883545 + }, + { + "auxiliary_loss_clip": 0.01188051, + "auxiliary_loss_mlp": 0.0105671, + "balance_loss_clip": 1.06338894, + "balance_loss_mlp": 1.0348233, + "epoch": 0.11411393356380581, + "flos": 19719627091200.0, + "grad_norm": 2.749906929333753, + "language_loss": 0.79822701, + "learning_rate": 3.9262406244307465e-06, + "loss": 0.82067466, + "num_input_tokens_seen": 41045315, + "router_z_loss_clip": 1.24707031, + "router_z_loss_mlp": 0.21899414, + "step": 1898, + "time_per_iteration": 2.4847757816314697 + }, + { + "auxiliary_loss_clip": 0.01192811, + "auxiliary_loss_mlp": 0.01055044, + "balance_loss_clip": 1.06497955, + "balance_loss_mlp": 1.03138113, + "epoch": 0.11417405681647377, + "flos": 17530440883200.0, + "grad_norm": 2.1558557407178207, + "language_loss": 0.73543322, + "learning_rate": 3.926135795021435e-06, + "loss": 0.7579118, + "num_input_tokens_seen": 41063390, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.23669434, + "step": 1899, + "time_per_iteration": 2.4627418518066406 + }, + { + "auxiliary_loss_clip": 0.01108075, + "auxiliary_loss_mlp": 0.01005564, + "balance_loss_clip": 1.05378556, + "balance_loss_mlp": 1.00276554, + "epoch": 0.11423418006914174, + "flos": 59674666619520.0, + "grad_norm": 0.9124253099875196, + "language_loss": 0.63425136, + "learning_rate": 3.92603089257286e-06, + "loss": 0.65538776, + "num_input_tokens_seen": 41124180, + "router_z_loss_clip": 0.54394531, + "router_z_loss_mlp": 0.02798462, + "step": 1900, + "time_per_iteration": 3.126343250274658 + }, + { + "auxiliary_loss_clip": 0.0118812, + "auxiliary_loss_mlp": 0.01072309, + "balance_loss_clip": 1.06145239, + "balance_loss_mlp": 1.0474062, + "epoch": 0.1142943033218097, + "flos": 22963114321920.0, + "grad_norm": 1.5800194537735135, + "language_loss": 0.78230542, + "learning_rate": 3.925925917089001e-06, + "loss": 0.80490971, + "num_input_tokens_seen": 41143485, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.24890137, + "step": 1901, + "time_per_iteration": 2.5464541912078857 + }, + { + "auxiliary_loss_clip": 0.01189315, + "auxiliary_loss_mlp": 0.01058864, + "balance_loss_clip": 1.06340909, + "balance_loss_mlp": 1.03630996, + "epoch": 0.11435442657447768, + "flos": 18256267008000.0, + "grad_norm": 2.063237014133142, + "language_loss": 0.84084487, + "learning_rate": 3.925820868573839e-06, + "loss": 0.86332667, + "num_input_tokens_seen": 41161695, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.22546387, + "step": 1902, + "time_per_iteration": 2.4593217372894287 + }, + { + "auxiliary_loss_clip": 0.01194645, + "auxiliary_loss_mlp": 0.01050607, + "balance_loss_clip": 1.06539345, + "balance_loss_mlp": 1.02723026, + "epoch": 0.11441454982714565, + "flos": 24061191045120.0, + "grad_norm": 1.7211591217005444, + "language_loss": 0.78086889, + "learning_rate": 3.925715747031356e-06, + "loss": 0.80332148, + "num_input_tokens_seen": 41181715, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.23388672, + "step": 1903, + "time_per_iteration": 2.4670162200927734 + }, + { + "auxiliary_loss_clip": 0.01193273, + "auxiliary_loss_mlp": 0.01045147, + "balance_loss_clip": 1.06624806, + "balance_loss_mlp": 1.02532208, + "epoch": 0.11447467307981361, + "flos": 25337707557120.0, + "grad_norm": 3.3882529487569393, + "language_loss": 0.75664455, + "learning_rate": 3.925610552465539e-06, + "loss": 0.77902877, + "num_input_tokens_seen": 41201770, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.19836426, + "step": 1904, + "time_per_iteration": 2.4945290088653564 + }, + { + "auxiliary_loss_clip": 0.01189237, + "auxiliary_loss_mlp": 0.01059633, + "balance_loss_clip": 1.0677886, + "balance_loss_mlp": 1.03732872, + "epoch": 0.11453479633248159, + "flos": 21726063878400.0, + "grad_norm": 2.3376930513637855, + "language_loss": 0.92228961, + "learning_rate": 3.9255052848803764e-06, + "loss": 0.94477838, + "num_input_tokens_seen": 41220590, + "router_z_loss_clip": 1.21484375, + "router_z_loss_mlp": 0.2232666, + "step": 1905, + "time_per_iteration": 2.437364339828491 + }, + { + "auxiliary_loss_clip": 0.01189533, + "auxiliary_loss_mlp": 0.0104865, + "balance_loss_clip": 1.05776572, + "balance_loss_mlp": 1.0259645, + "epoch": 0.11459491958514956, + "flos": 12969714096000.0, + "grad_norm": 2.663966051125704, + "language_loss": 0.77582741, + "learning_rate": 3.925399944279861e-06, + "loss": 0.79820931, + "num_input_tokens_seen": 41237250, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.22668457, + "step": 1906, + "time_per_iteration": 2.4332520961761475 + }, + { + "auxiliary_loss_clip": 0.01194411, + "auxiliary_loss_mlp": 0.01052627, + "balance_loss_clip": 1.06690252, + "balance_loss_mlp": 1.03008437, + "epoch": 0.11465504283781752, + "flos": 22711273090560.0, + "grad_norm": 2.270087046985886, + "language_loss": 0.81889212, + "learning_rate": 3.925294530667986e-06, + "loss": 0.84136248, + "num_input_tokens_seen": 41256680, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.22570801, + "step": 1907, + "time_per_iteration": 2.5501601696014404 + }, + { + "auxiliary_loss_clip": 0.01197309, + "auxiliary_loss_mlp": 0.01065189, + "balance_loss_clip": 1.07272911, + "balance_loss_mlp": 1.04432702, + "epoch": 0.1147151660904855, + "flos": 23398387332480.0, + "grad_norm": 2.572723586957679, + "language_loss": 0.8491919, + "learning_rate": 3.92518904404875e-06, + "loss": 0.87181687, + "num_input_tokens_seen": 41270955, + "router_z_loss_clip": 1.24609375, + "router_z_loss_mlp": 0.20861816, + "step": 1908, + "time_per_iteration": 2.486970901489258 + }, + { + "auxiliary_loss_clip": 0.01102359, + "auxiliary_loss_mlp": 0.01008359, + "balance_loss_clip": 1.05111408, + "balance_loss_mlp": 1.00546217, + "epoch": 0.11477528934315347, + "flos": 63011843498880.0, + "grad_norm": 0.924046507540169, + "language_loss": 0.61048728, + "learning_rate": 3.925083484426153e-06, + "loss": 0.63159448, + "num_input_tokens_seen": 41319180, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 0.02896118, + "step": 1909, + "time_per_iteration": 2.9890599250793457 + }, + { + "auxiliary_loss_clip": 0.01220267, + "auxiliary_loss_mlp": 0.01055524, + "balance_loss_clip": 1.09005952, + "balance_loss_mlp": 1.03410244, + "epoch": 0.11483541259582143, + "flos": 16325601960960.0, + "grad_norm": 1.978470502284213, + "language_loss": 0.79165137, + "learning_rate": 3.924977851804197e-06, + "loss": 0.81440926, + "num_input_tokens_seen": 41337480, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.2142334, + "step": 1910, + "time_per_iteration": 2.4750618934631348 + }, + { + "auxiliary_loss_clip": 0.01190995, + "auxiliary_loss_mlp": 0.01054269, + "balance_loss_clip": 1.06632102, + "balance_loss_mlp": 1.03217959, + "epoch": 0.1148955358484894, + "flos": 21580410228480.0, + "grad_norm": 2.0550203258676834, + "language_loss": 0.76696646, + "learning_rate": 3.9248721461868875e-06, + "loss": 0.78941905, + "num_input_tokens_seen": 41354650, + "router_z_loss_clip": 1.24707031, + "router_z_loss_mlp": 0.2208252, + "step": 1911, + "time_per_iteration": 2.486989974975586 + }, + { + "auxiliary_loss_clip": 0.01183943, + "auxiliary_loss_mlp": 0.01049446, + "balance_loss_clip": 1.06488729, + "balance_loss_mlp": 1.02845287, + "epoch": 0.11495565910115738, + "flos": 27673696650240.0, + "grad_norm": 1.609415203102407, + "language_loss": 0.79103798, + "learning_rate": 3.9247663675782336e-06, + "loss": 0.8133719, + "num_input_tokens_seen": 41376935, + "router_z_loss_clip": 1.19238281, + "router_z_loss_mlp": 0.20996094, + "step": 1912, + "time_per_iteration": 2.4902617931365967 + }, + { + "auxiliary_loss_clip": 0.01195708, + "auxiliary_loss_mlp": 0.01060912, + "balance_loss_clip": 1.07067871, + "balance_loss_mlp": 1.03830934, + "epoch": 0.11501578235382534, + "flos": 20632368614400.0, + "grad_norm": 1.9312934015893215, + "language_loss": 0.77717978, + "learning_rate": 3.924660515982246e-06, + "loss": 0.79974598, + "num_input_tokens_seen": 41396105, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.22619629, + "step": 1913, + "time_per_iteration": 2.488483190536499 + }, + { + "auxiliary_loss_clip": 0.01212285, + "auxiliary_loss_mlp": 0.01051668, + "balance_loss_clip": 1.0828824, + "balance_loss_mlp": 1.02875626, + "epoch": 0.1150759056064933, + "flos": 19829046896640.0, + "grad_norm": 2.175609361614286, + "language_loss": 0.69896281, + "learning_rate": 3.924554591402939e-06, + "loss": 0.72160232, + "num_input_tokens_seen": 41415600, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.22912598, + "step": 1914, + "time_per_iteration": 2.4770731925964355 + }, + { + "auxiliary_loss_clip": 0.01096576, + "auxiliary_loss_mlp": 0.0102208, + "balance_loss_clip": 1.04396176, + "balance_loss_mlp": 1.0190376, + "epoch": 0.11513602885916129, + "flos": 70045776311040.0, + "grad_norm": 0.8393300425977669, + "language_loss": 0.61042237, + "learning_rate": 3.92444859384433e-06, + "loss": 0.6316089, + "num_input_tokens_seen": 41478760, + "router_z_loss_clip": 0.52539062, + "router_z_loss_mlp": 0.03042603, + "step": 1915, + "time_per_iteration": 3.152925729751587 + }, + { + "auxiliary_loss_clip": 0.01198013, + "auxiliary_loss_mlp": 0.01056923, + "balance_loss_clip": 1.07384241, + "balance_loss_mlp": 1.03506017, + "epoch": 0.11519615211182925, + "flos": 15741730385280.0, + "grad_norm": 2.042945488724922, + "language_loss": 0.93361813, + "learning_rate": 3.924342523310436e-06, + "loss": 0.95616746, + "num_input_tokens_seen": 41495720, + "router_z_loss_clip": 1.24121094, + "router_z_loss_mlp": 0.21875, + "step": 1916, + "time_per_iteration": 2.439920425415039 + }, + { + "auxiliary_loss_clip": 0.01187402, + "auxiliary_loss_mlp": 0.01062031, + "balance_loss_clip": 1.06402516, + "balance_loss_mlp": 1.03779578, + "epoch": 0.11525627536449722, + "flos": 20667632791680.0, + "grad_norm": 2.0948324362899973, + "language_loss": 0.72556424, + "learning_rate": 3.9242363798052806e-06, + "loss": 0.74805856, + "num_input_tokens_seen": 41513585, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.24243164, + "step": 1917, + "time_per_iteration": 2.456393003463745 + }, + { + "auxiliary_loss_clip": 0.01190471, + "auxiliary_loss_mlp": 0.01052733, + "balance_loss_clip": 1.06777334, + "balance_loss_mlp": 1.03004766, + "epoch": 0.1153163986171652, + "flos": 20303283185280.0, + "grad_norm": 1.9567439730716056, + "language_loss": 0.74479192, + "learning_rate": 3.92413016333289e-06, + "loss": 0.76722389, + "num_input_tokens_seen": 41533390, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.22668457, + "step": 1918, + "time_per_iteration": 2.451141119003296 + }, + { + "auxiliary_loss_clip": 0.01196202, + "auxiliary_loss_mlp": 0.01047327, + "balance_loss_clip": 1.06904578, + "balance_loss_mlp": 1.02586913, + "epoch": 0.11537652186983316, + "flos": 17639321984640.0, + "grad_norm": 2.2904805237118473, + "language_loss": 0.86550486, + "learning_rate": 3.92402387389729e-06, + "loss": 0.88794017, + "num_input_tokens_seen": 41551015, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.21435547, + "step": 1919, + "time_per_iteration": 2.4432413578033447 + }, + { + "auxiliary_loss_clip": 0.01185135, + "auxiliary_loss_mlp": 0.0105697, + "balance_loss_clip": 1.06315541, + "balance_loss_mlp": 1.03392696, + "epoch": 0.11543664512250112, + "flos": 21069401391360.0, + "grad_norm": 2.522620003388002, + "language_loss": 0.86272597, + "learning_rate": 3.923917511502512e-06, + "loss": 0.88514698, + "num_input_tokens_seen": 41568055, + "router_z_loss_clip": 1.21777344, + "router_z_loss_mlp": 0.23046875, + "step": 1920, + "time_per_iteration": 2.419025421142578 + }, + { + "auxiliary_loss_clip": 0.01186248, + "auxiliary_loss_mlp": 0.01050371, + "balance_loss_clip": 1.06711924, + "balance_loss_mlp": 1.02813816, + "epoch": 0.11549676837516909, + "flos": 22747542848640.0, + "grad_norm": 2.060225794813142, + "language_loss": 0.79716861, + "learning_rate": 3.923811076152589e-06, + "loss": 0.81953478, + "num_input_tokens_seen": 41587435, + "router_z_loss_clip": 1.19042969, + "router_z_loss_mlp": 0.22229004, + "step": 1921, + "time_per_iteration": 2.517247438430786 + }, + { + "auxiliary_loss_clip": 0.01190785, + "auxiliary_loss_mlp": 0.01061558, + "balance_loss_clip": 1.06567621, + "balance_loss_mlp": 1.03862166, + "epoch": 0.11555689162783707, + "flos": 19168972617600.0, + "grad_norm": 2.325592105648956, + "language_loss": 0.79059345, + "learning_rate": 3.923704567851557e-06, + "loss": 0.81311691, + "num_input_tokens_seen": 41604975, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.22937012, + "step": 1922, + "time_per_iteration": 2.4466567039489746 + }, + { + "auxiliary_loss_clip": 0.01187457, + "auxiliary_loss_mlp": 0.01061609, + "balance_loss_clip": 1.06299043, + "balance_loss_mlp": 1.03960323, + "epoch": 0.11561701488050503, + "flos": 24572056227840.0, + "grad_norm": 1.8227357222004792, + "language_loss": 0.84230769, + "learning_rate": 3.923597986603456e-06, + "loss": 0.86479831, + "num_input_tokens_seen": 41626155, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.2199707, + "step": 1923, + "time_per_iteration": 2.583718776702881 + }, + { + "auxiliary_loss_clip": 0.01186758, + "auxiliary_loss_mlp": 0.0105515, + "balance_loss_clip": 1.0639168, + "balance_loss_mlp": 1.03296542, + "epoch": 0.115677138133173, + "flos": 17092546179840.0, + "grad_norm": 2.016958313354069, + "language_loss": 0.80606806, + "learning_rate": 3.9234913324123264e-06, + "loss": 0.82848716, + "num_input_tokens_seen": 41644805, + "router_z_loss_clip": 1.22753906, + "router_z_loss_mlp": 0.22155762, + "step": 1924, + "time_per_iteration": 2.5182042121887207 + }, + { + "auxiliary_loss_clip": 0.0108882, + "auxiliary_loss_mlp": 0.01008289, + "balance_loss_clip": 1.03958464, + "balance_loss_mlp": 1.0054338, + "epoch": 0.11573726138584098, + "flos": 62703875266560.0, + "grad_norm": 0.8071961780856337, + "language_loss": 0.61223501, + "learning_rate": 3.923384605282212e-06, + "loss": 0.63320613, + "num_input_tokens_seen": 41709345, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 0.02853394, + "step": 1925, + "time_per_iteration": 3.156604051589966 + }, + { + "auxiliary_loss_clip": 0.01184937, + "auxiliary_loss_mlp": 0.01078507, + "balance_loss_clip": 1.06177986, + "balance_loss_mlp": 1.05515432, + "epoch": 0.11579738463850894, + "flos": 22601135013120.0, + "grad_norm": 2.2380564832679575, + "language_loss": 0.75019729, + "learning_rate": 3.923277805217161e-06, + "loss": 0.77283168, + "num_input_tokens_seen": 41730210, + "router_z_loss_clip": 1.23242188, + "router_z_loss_mlp": 0.23352051, + "step": 1926, + "time_per_iteration": 2.4834647178649902 + }, + { + "auxiliary_loss_clip": 0.01206703, + "auxiliary_loss_mlp": 0.01064582, + "balance_loss_clip": 1.07730341, + "balance_loss_mlp": 1.03991807, + "epoch": 0.11585750789117691, + "flos": 21726135705600.0, + "grad_norm": 3.9317380416096785, + "language_loss": 0.73011053, + "learning_rate": 3.923170932221222e-06, + "loss": 0.75282341, + "num_input_tokens_seen": 41750270, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.24658203, + "step": 1927, + "time_per_iteration": 2.4894909858703613 + }, + { + "auxiliary_loss_clip": 0.01184868, + "auxiliary_loss_mlp": 0.01056268, + "balance_loss_clip": 1.06315148, + "balance_loss_mlp": 1.03307009, + "epoch": 0.11591763114384489, + "flos": 26287544851200.0, + "grad_norm": 1.9580901471367556, + "language_loss": 0.86823517, + "learning_rate": 3.92306398629845e-06, + "loss": 0.89064652, + "num_input_tokens_seen": 41772975, + "router_z_loss_clip": 1.21777344, + "router_z_loss_mlp": 0.23205566, + "step": 1928, + "time_per_iteration": 2.515989065170288 + }, + { + "auxiliary_loss_clip": 0.01186509, + "auxiliary_loss_mlp": 0.01061299, + "balance_loss_clip": 1.06336713, + "balance_loss_mlp": 1.03884029, + "epoch": 0.11597775439651285, + "flos": 23000461488000.0, + "grad_norm": 1.6393759572370796, + "language_loss": 0.77364051, + "learning_rate": 3.922956967452898e-06, + "loss": 0.79611862, + "num_input_tokens_seen": 41791765, + "router_z_loss_clip": 1.23046875, + "router_z_loss_mlp": 0.22473145, + "step": 1929, + "time_per_iteration": 2.51916241645813 + }, + { + "auxiliary_loss_clip": 0.01183613, + "auxiliary_loss_mlp": 0.01069383, + "balance_loss_clip": 1.0630517, + "balance_loss_mlp": 1.0474602, + "epoch": 0.11603787764918082, + "flos": 31941715507200.0, + "grad_norm": 1.6691031386676658, + "language_loss": 0.76648128, + "learning_rate": 3.922849875688626e-06, + "loss": 0.78901124, + "num_input_tokens_seen": 41815615, + "router_z_loss_clip": 1.20410156, + "router_z_loss_mlp": 0.21936035, + "step": 1930, + "time_per_iteration": 2.526280164718628 + }, + { + "auxiliary_loss_clip": 0.01193024, + "auxiliary_loss_mlp": 0.01059038, + "balance_loss_clip": 1.06939769, + "balance_loss_mlp": 1.03707981, + "epoch": 0.1160980009018488, + "flos": 22271654534400.0, + "grad_norm": 2.8303197394867405, + "language_loss": 0.71942484, + "learning_rate": 3.922742711009693e-06, + "loss": 0.74194551, + "num_input_tokens_seen": 41834810, + "router_z_loss_clip": 1.23632812, + "router_z_loss_mlp": 0.21960449, + "step": 1931, + "time_per_iteration": 2.4781765937805176 + }, + { + "auxiliary_loss_clip": 0.01186648, + "auxiliary_loss_mlp": 0.01065879, + "balance_loss_clip": 1.06272149, + "balance_loss_mlp": 1.04127455, + "epoch": 0.11615812415451676, + "flos": 22783633038720.0, + "grad_norm": 1.555710701636003, + "language_loss": 0.82179838, + "learning_rate": 3.922635473420164e-06, + "loss": 0.84432364, + "num_input_tokens_seen": 41854975, + "router_z_loss_clip": 1.23925781, + "router_z_loss_mlp": 0.24597168, + "step": 1932, + "time_per_iteration": 2.4688076972961426 + }, + { + "auxiliary_loss_clip": 0.010748, + "auxiliary_loss_mlp": 0.01016263, + "balance_loss_clip": 1.02612329, + "balance_loss_mlp": 1.01228762, + "epoch": 0.11621824740718473, + "flos": 67146096107520.0, + "grad_norm": 0.7711221557781245, + "language_loss": 0.61060119, + "learning_rate": 3.922528162924105e-06, + "loss": 0.63151181, + "num_input_tokens_seen": 41911105, + "router_z_loss_clip": 0.48730469, + "router_z_loss_mlp": 0.03979492, + "step": 1933, + "time_per_iteration": 2.9967122077941895 + }, + { + "auxiliary_loss_clip": 0.01191836, + "auxiliary_loss_mlp": 0.01057775, + "balance_loss_clip": 1.06308532, + "balance_loss_mlp": 1.03624606, + "epoch": 0.11627837065985269, + "flos": 20375930442240.0, + "grad_norm": 2.4040662078130657, + "language_loss": 0.86338305, + "learning_rate": 3.922420779525586e-06, + "loss": 0.88587916, + "num_input_tokens_seen": 41931750, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.21533203, + "step": 1934, + "time_per_iteration": 3.8789405822753906 + }, + { + "auxiliary_loss_clip": 0.01191239, + "auxiliary_loss_mlp": 0.01056049, + "balance_loss_clip": 1.06498206, + "balance_loss_mlp": 1.03325617, + "epoch": 0.11633849391252067, + "flos": 21725812483200.0, + "grad_norm": 2.982952170229443, + "language_loss": 0.66274154, + "learning_rate": 3.9223133232286776e-06, + "loss": 0.6852144, + "num_input_tokens_seen": 41949400, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.22790527, + "step": 1935, + "time_per_iteration": 2.4289934635162354 + }, + { + "auxiliary_loss_clip": 0.01199299, + "auxiliary_loss_mlp": 0.01048897, + "balance_loss_clip": 1.07198894, + "balance_loss_mlp": 1.02840531, + "epoch": 0.11639861716518864, + "flos": 18805341283200.0, + "grad_norm": 1.917545392119737, + "language_loss": 0.75345838, + "learning_rate": 3.922205794037456e-06, + "loss": 0.77594036, + "num_input_tokens_seen": 41968100, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.20483398, + "step": 1936, + "time_per_iteration": 2.480429172515869 + }, + { + "auxiliary_loss_clip": 0.01186636, + "auxiliary_loss_mlp": 0.01053013, + "balance_loss_clip": 1.06247711, + "balance_loss_mlp": 1.03027964, + "epoch": 0.1164587404178566, + "flos": 21214983214080.0, + "grad_norm": 2.3412007852160044, + "language_loss": 0.84258842, + "learning_rate": 3.922098191955998e-06, + "loss": 0.86498487, + "num_input_tokens_seen": 41986375, + "router_z_loss_clip": 1.24121094, + "router_z_loss_mlp": 0.22741699, + "step": 1937, + "time_per_iteration": 3.7651865482330322 + }, + { + "auxiliary_loss_clip": 0.0118102, + "auxiliary_loss_mlp": 0.01057955, + "balance_loss_clip": 1.06066871, + "balance_loss_mlp": 1.03459036, + "epoch": 0.11651886367052458, + "flos": 27818632028160.0, + "grad_norm": 1.9497984443241627, + "language_loss": 0.75984681, + "learning_rate": 3.921990516988384e-06, + "loss": 0.78223652, + "num_input_tokens_seen": 42006055, + "router_z_loss_clip": 1.20507812, + "router_z_loss_mlp": 0.23364258, + "step": 1938, + "time_per_iteration": 2.4985709190368652 + }, + { + "auxiliary_loss_clip": 0.01195526, + "auxiliary_loss_mlp": 0.01058925, + "balance_loss_clip": 1.06608844, + "balance_loss_mlp": 1.03725266, + "epoch": 0.11657898692319255, + "flos": 22889569224960.0, + "grad_norm": 2.4050081825155254, + "language_loss": 0.79739356, + "learning_rate": 3.921882769138696e-06, + "loss": 0.81993806, + "num_input_tokens_seen": 42024995, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.2166748, + "step": 1939, + "time_per_iteration": 5.498942852020264 + }, + { + "auxiliary_loss_clip": 0.01192466, + "auxiliary_loss_mlp": 0.01053582, + "balance_loss_clip": 1.06972313, + "balance_loss_mlp": 1.03180218, + "epoch": 0.11663911017586051, + "flos": 24315905364480.0, + "grad_norm": 3.5975152709648306, + "language_loss": 0.86556131, + "learning_rate": 3.9217749484110215e-06, + "loss": 0.88802177, + "num_input_tokens_seen": 42042640, + "router_z_loss_clip": 1.22753906, + "router_z_loss_mlp": 0.21777344, + "step": 1940, + "time_per_iteration": 2.4888062477111816 + }, + { + "auxiliary_loss_clip": 0.01193317, + "auxiliary_loss_mlp": 0.01066589, + "balance_loss_clip": 1.0719099, + "balance_loss_mlp": 1.04396284, + "epoch": 0.11669923342852849, + "flos": 42340152470400.0, + "grad_norm": 2.1994862656549743, + "language_loss": 0.75912446, + "learning_rate": 3.921667054809449e-06, + "loss": 0.7817235, + "num_input_tokens_seen": 42067005, + "router_z_loss_clip": 1.21386719, + "router_z_loss_mlp": 0.22619629, + "step": 1941, + "time_per_iteration": 2.6706769466400146 + }, + { + "auxiliary_loss_clip": 0.01185528, + "auxiliary_loss_mlp": 0.01059733, + "balance_loss_clip": 1.06250334, + "balance_loss_mlp": 1.03794181, + "epoch": 0.11675935668119646, + "flos": 14642288945280.0, + "grad_norm": 2.353880655396178, + "language_loss": 0.88581467, + "learning_rate": 3.921559088338068e-06, + "loss": 0.9082672, + "num_input_tokens_seen": 42082295, + "router_z_loss_clip": 1.22949219, + "router_z_loss_mlp": 0.21789551, + "step": 1942, + "time_per_iteration": 2.407691478729248 + }, + { + "auxiliary_loss_clip": 0.01183979, + "auxiliary_loss_mlp": 0.01048466, + "balance_loss_clip": 1.06278694, + "balance_loss_mlp": 1.02865386, + "epoch": 0.11681947993386442, + "flos": 35116470063360.0, + "grad_norm": 1.6351889298640134, + "language_loss": 0.67939335, + "learning_rate": 3.921451049000975e-06, + "loss": 0.70171773, + "num_input_tokens_seen": 42105295, + "router_z_loss_clip": 1.2109375, + "router_z_loss_mlp": 0.19824219, + "step": 1943, + "time_per_iteration": 2.572381019592285 + }, + { + "auxiliary_loss_clip": 0.01191513, + "auxiliary_loss_mlp": 0.01052276, + "balance_loss_clip": 1.06851625, + "balance_loss_mlp": 1.03040099, + "epoch": 0.11687960318653239, + "flos": 38983259024640.0, + "grad_norm": 2.5530193219300803, + "language_loss": 0.69156575, + "learning_rate": 3.921342936802265e-06, + "loss": 0.71400368, + "num_input_tokens_seen": 42125520, + "router_z_loss_clip": 1.22851562, + "router_z_loss_mlp": 0.21875, + "step": 1944, + "time_per_iteration": 2.6191697120666504 + }, + { + "auxiliary_loss_clip": 0.01185395, + "auxiliary_loss_mlp": 0.01048685, + "balance_loss_clip": 1.06242657, + "balance_loss_mlp": 1.02850258, + "epoch": 0.11693972643920036, + "flos": 25994980575360.0, + "grad_norm": 2.2601193799865547, + "language_loss": 0.82415724, + "learning_rate": 3.921234751746038e-06, + "loss": 0.84649807, + "num_input_tokens_seen": 42146335, + "router_z_loss_clip": 1.22851562, + "router_z_loss_mlp": 0.20178223, + "step": 1945, + "time_per_iteration": 2.541377544403076 + }, + { + "auxiliary_loss_clip": 0.01185017, + "auxiliary_loss_mlp": 0.0106152, + "balance_loss_clip": 1.06235027, + "balance_loss_mlp": 1.04062295, + "epoch": 0.11699984969186833, + "flos": 27272107618560.0, + "grad_norm": 2.0772031792107204, + "language_loss": 0.76193833, + "learning_rate": 3.9211264938363975e-06, + "loss": 0.78440368, + "num_input_tokens_seen": 42165320, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.2088623, + "step": 1946, + "time_per_iteration": 2.538175106048584 + }, + { + "auxiliary_loss_clip": 0.01183769, + "auxiliary_loss_mlp": 0.0105273, + "balance_loss_clip": 1.06638813, + "balance_loss_mlp": 1.03289318, + "epoch": 0.1170599729445363, + "flos": 15267853232640.0, + "grad_norm": 2.226666062039262, + "language_loss": 0.68632406, + "learning_rate": 3.921018163077448e-06, + "loss": 0.70868897, + "num_input_tokens_seen": 42182955, + "router_z_loss_clip": 1.17382812, + "router_z_loss_mlp": 0.19848633, + "step": 1947, + "time_per_iteration": 2.4734504222869873 + }, + { + "auxiliary_loss_clip": 0.0118525, + "auxiliary_loss_mlp": 0.01066136, + "balance_loss_clip": 1.06411624, + "balance_loss_mlp": 1.04470181, + "epoch": 0.11712009619720427, + "flos": 17164439251200.0, + "grad_norm": 1.7566150438629533, + "language_loss": 0.84641027, + "learning_rate": 3.920909759473295e-06, + "loss": 0.86892414, + "num_input_tokens_seen": 42200760, + "router_z_loss_clip": 1.20996094, + "router_z_loss_mlp": 0.2142334, + "step": 1948, + "time_per_iteration": 2.4548046588897705 + }, + { + "auxiliary_loss_clip": 0.01081948, + "auxiliary_loss_mlp": 0.0100838, + "balance_loss_clip": 1.03325534, + "balance_loss_mlp": 1.00524807, + "epoch": 0.11718021944987224, + "flos": 70940991997440.0, + "grad_norm": 0.8174907776170632, + "language_loss": 0.65099448, + "learning_rate": 3.920801283028054e-06, + "loss": 0.67189777, + "num_input_tokens_seen": 42265745, + "router_z_loss_clip": 0.48632812, + "router_z_loss_mlp": 0.03131104, + "step": 1949, + "time_per_iteration": 3.097900152206421 + }, + { + "auxiliary_loss_clip": 0.01181984, + "auxiliary_loss_mlp": 0.01055557, + "balance_loss_clip": 1.06434345, + "balance_loss_mlp": 1.03525519, + "epoch": 0.1172403427025402, + "flos": 27453456408960.0, + "grad_norm": 1.5474537113667168, + "language_loss": 0.71742463, + "learning_rate": 3.920692733745835e-06, + "loss": 0.73979998, + "num_input_tokens_seen": 42286245, + "router_z_loss_clip": 1.17773438, + "router_z_loss_mlp": 0.20300293, + "step": 1950, + "time_per_iteration": 2.52559494972229 + }, + { + "auxiliary_loss_clip": 0.01186842, + "auxiliary_loss_mlp": 0.01056574, + "balance_loss_clip": 1.06464291, + "balance_loss_mlp": 1.03646398, + "epoch": 0.11730046595520818, + "flos": 15668723992320.0, + "grad_norm": 2.3807653675831957, + "language_loss": 0.7700581, + "learning_rate": 3.920584111630755e-06, + "loss": 0.79249227, + "num_input_tokens_seen": 42302710, + "router_z_loss_clip": 1.22265625, + "router_z_loss_mlp": 0.20117188, + "step": 1951, + "time_per_iteration": 2.409437894821167 + }, + { + "auxiliary_loss_clip": 0.01183011, + "auxiliary_loss_mlp": 0.01066168, + "balance_loss_clip": 1.06330681, + "balance_loss_mlp": 1.04361367, + "epoch": 0.11736058920787615, + "flos": 25630164092160.0, + "grad_norm": 1.9160333950315946, + "language_loss": 0.76251149, + "learning_rate": 3.9204754166869325e-06, + "loss": 0.78500324, + "num_input_tokens_seen": 42324115, + "router_z_loss_clip": 1.19726562, + "router_z_loss_mlp": 0.22558594, + "step": 1952, + "time_per_iteration": 2.545156717300415 + }, + { + "auxiliary_loss_clip": 0.01182046, + "auxiliary_loss_mlp": 0.01062388, + "balance_loss_clip": 1.05964708, + "balance_loss_mlp": 1.04181302, + "epoch": 0.11742071246054411, + "flos": 21434289701760.0, + "grad_norm": 2.0356036051449697, + "language_loss": 0.72050184, + "learning_rate": 3.920366648918491e-06, + "loss": 0.74294615, + "num_input_tokens_seen": 42342505, + "router_z_loss_clip": 1.22363281, + "router_z_loss_mlp": 0.20568848, + "step": 1953, + "time_per_iteration": 2.4657962322235107 + }, + { + "auxiliary_loss_clip": 0.01190738, + "auxiliary_loss_mlp": 0.01054418, + "balance_loss_clip": 1.06536174, + "balance_loss_mlp": 1.03219783, + "epoch": 0.11748083571321208, + "flos": 15997845335040.0, + "grad_norm": 2.3219325414516794, + "language_loss": 0.79593718, + "learning_rate": 3.920257808329552e-06, + "loss": 0.8183887, + "num_input_tokens_seen": 42360525, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.22241211, + "step": 1954, + "time_per_iteration": 2.4826467037200928 + }, + { + "auxiliary_loss_clip": 0.01184776, + "auxiliary_loss_mlp": 0.01062401, + "balance_loss_clip": 1.06201959, + "balance_loss_mlp": 1.04136086, + "epoch": 0.11754095896588006, + "flos": 16180056051840.0, + "grad_norm": 1.9461393511298393, + "language_loss": 0.85639238, + "learning_rate": 3.920148894924246e-06, + "loss": 0.87886417, + "num_input_tokens_seen": 42377045, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.21044922, + "step": 1955, + "time_per_iteration": 2.4278900623321533 + }, + { + "auxiliary_loss_clip": 0.01179343, + "auxiliary_loss_mlp": 0.01047261, + "balance_loss_clip": 1.05780947, + "balance_loss_mlp": 1.02729321, + "epoch": 0.11760108221854802, + "flos": 13261596013440.0, + "grad_norm": 2.2293650737010973, + "language_loss": 0.78311479, + "learning_rate": 3.920039908706701e-06, + "loss": 0.80538088, + "num_input_tokens_seen": 42393960, + "router_z_loss_clip": 1.21582031, + "router_z_loss_mlp": 0.19970703, + "step": 1956, + "time_per_iteration": 2.537099838256836 + }, + { + "auxiliary_loss_clip": 0.01177684, + "auxiliary_loss_mlp": 0.01058266, + "balance_loss_clip": 1.05928326, + "balance_loss_mlp": 1.03665328, + "epoch": 0.11766120547121599, + "flos": 24498439303680.0, + "grad_norm": 2.447786569641914, + "language_loss": 0.80434418, + "learning_rate": 3.91993084968105e-06, + "loss": 0.82670373, + "num_input_tokens_seen": 42413160, + "router_z_loss_clip": 1.18457031, + "router_z_loss_mlp": 0.21618652, + "step": 1957, + "time_per_iteration": 2.464883804321289 + }, + { + "auxiliary_loss_clip": 0.01188337, + "auxiliary_loss_mlp": 0.01050632, + "balance_loss_clip": 1.06617606, + "balance_loss_mlp": 1.02999663, + "epoch": 0.11772132872388397, + "flos": 17784005967360.0, + "grad_norm": 2.5134259051142687, + "language_loss": 0.7831828, + "learning_rate": 3.919821717851428e-06, + "loss": 0.80557251, + "num_input_tokens_seen": 42432590, + "router_z_loss_clip": 1.22070312, + "router_z_loss_mlp": 0.2064209, + "step": 1958, + "time_per_iteration": 2.597043514251709 + }, + { + "auxiliary_loss_clip": 0.01187741, + "auxiliary_loss_mlp": 0.01050069, + "balance_loss_clip": 1.06452703, + "balance_loss_mlp": 1.02770567, + "epoch": 0.11778145197655193, + "flos": 13217030213760.0, + "grad_norm": 1.8575545567626122, + "language_loss": 0.76572728, + "learning_rate": 3.919712513221976e-06, + "loss": 0.78810537, + "num_input_tokens_seen": 42450135, + "router_z_loss_clip": 1.23046875, + "router_z_loss_mlp": 0.22363281, + "step": 1959, + "time_per_iteration": 2.4749984741210938 + }, + { + "auxiliary_loss_clip": 0.01183213, + "auxiliary_loss_mlp": 0.01053438, + "balance_loss_clip": 1.06062984, + "balance_loss_mlp": 1.03335094, + "epoch": 0.1178415752292199, + "flos": 20230204965120.0, + "grad_norm": 2.406168267278588, + "language_loss": 0.6983602, + "learning_rate": 3.919603235796832e-06, + "loss": 0.72072673, + "num_input_tokens_seen": 42470050, + "router_z_loss_clip": 1.22460938, + "router_z_loss_mlp": 0.20068359, + "step": 1960, + "time_per_iteration": 2.453603982925415 + }, + { + "auxiliary_loss_clip": 0.01187102, + "auxiliary_loss_mlp": 0.0105849, + "balance_loss_clip": 1.06300282, + "balance_loss_mlp": 1.03722334, + "epoch": 0.11790169848188788, + "flos": 13040134709760.0, + "grad_norm": 7.490682199824461, + "language_loss": 0.81006312, + "learning_rate": 3.9194938855801406e-06, + "loss": 0.83251905, + "num_input_tokens_seen": 42484335, + "router_z_loss_clip": 1.24023438, + "router_z_loss_mlp": 0.21264648, + "step": 1961, + "time_per_iteration": 2.3988099098205566 + }, + { + "auxiliary_loss_clip": 0.01172068, + "auxiliary_loss_mlp": 0.01057643, + "balance_loss_clip": 1.0576756, + "balance_loss_mlp": 1.03741336, + "epoch": 0.11796182173455584, + "flos": 22265728790400.0, + "grad_norm": 2.213115666809285, + "language_loss": 0.92083538, + "learning_rate": 3.919384462576049e-06, + "loss": 0.94313252, + "num_input_tokens_seen": 42502720, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.20227051, + "step": 1962, + "time_per_iteration": 2.5187933444976807 + }, + { + "auxiliary_loss_clip": 0.01180216, + "auxiliary_loss_mlp": 0.01058326, + "balance_loss_clip": 1.05953324, + "balance_loss_mlp": 1.03826308, + "epoch": 0.1180219449872238, + "flos": 10635017892480.0, + "grad_norm": 2.1472864071323565, + "language_loss": 0.87915313, + "learning_rate": 3.919274966788707e-06, + "loss": 0.90153855, + "num_input_tokens_seen": 42519460, + "router_z_loss_clip": 1.20605469, + "router_z_loss_mlp": 0.20056152, + "step": 1963, + "time_per_iteration": 2.407226800918579 + }, + { + "auxiliary_loss_clip": 0.01181238, + "auxiliary_loss_mlp": 0.01057039, + "balance_loss_clip": 1.05777061, + "balance_loss_mlp": 1.03571248, + "epoch": 0.11808206823989177, + "flos": 20923532259840.0, + "grad_norm": 2.170853619377219, + "language_loss": 0.84279853, + "learning_rate": 3.919165398222265e-06, + "loss": 0.86518133, + "num_input_tokens_seen": 42539420, + "router_z_loss_clip": 1.23632812, + "router_z_loss_mlp": 0.21325684, + "step": 1964, + "time_per_iteration": 2.528550148010254 + }, + { + "auxiliary_loss_clip": 0.01190982, + "auxiliary_loss_mlp": 0.01061281, + "balance_loss_clip": 1.07035255, + "balance_loss_mlp": 1.04144418, + "epoch": 0.11814219149255975, + "flos": 20777770869120.0, + "grad_norm": 1.8200199399355097, + "language_loss": 0.83059585, + "learning_rate": 3.919055756880879e-06, + "loss": 0.85311854, + "num_input_tokens_seen": 42558225, + "router_z_loss_clip": 1.20605469, + "router_z_loss_mlp": 0.19836426, + "step": 1965, + "time_per_iteration": 2.4560887813568115 + }, + { + "auxiliary_loss_clip": 0.01188623, + "auxiliary_loss_mlp": 0.01052864, + "balance_loss_clip": 1.06681359, + "balance_loss_mlp": 1.03215742, + "epoch": 0.11820231474522772, + "flos": 48759938542080.0, + "grad_norm": 1.5720837953281808, + "language_loss": 0.74417281, + "learning_rate": 3.918946042768707e-06, + "loss": 0.76658767, + "num_input_tokens_seen": 42580790, + "router_z_loss_clip": 1.21777344, + "router_z_loss_mlp": 0.20715332, + "step": 1966, + "time_per_iteration": 2.7192277908325195 + }, + { + "auxiliary_loss_clip": 0.01188914, + "auxiliary_loss_mlp": 0.01057538, + "balance_loss_clip": 1.06462359, + "balance_loss_mlp": 1.03664041, + "epoch": 0.11826243799789568, + "flos": 16690598012160.0, + "grad_norm": 6.101603203051054, + "language_loss": 0.7289682, + "learning_rate": 3.918836255889908e-06, + "loss": 0.75143272, + "num_input_tokens_seen": 42597355, + "router_z_loss_clip": 1.24121094, + "router_z_loss_mlp": 0.20898438, + "step": 1967, + "time_per_iteration": 2.424968957901001 + }, + { + "auxiliary_loss_clip": 0.01180235, + "auxiliary_loss_mlp": 0.01051074, + "balance_loss_clip": 1.06026947, + "balance_loss_mlp": 1.03058219, + "epoch": 0.11832256125056366, + "flos": 16909868586240.0, + "grad_norm": 2.2550597159701855, + "language_loss": 0.88201964, + "learning_rate": 3.9187263962486456e-06, + "loss": 0.90433276, + "num_input_tokens_seen": 42616060, + "router_z_loss_clip": 1.20019531, + "router_z_loss_mlp": 0.20507812, + "step": 1968, + "time_per_iteration": 2.5229549407958984 + }, + { + "auxiliary_loss_clip": 0.01183718, + "auxiliary_loss_mlp": 0.01053718, + "balance_loss_clip": 1.06323183, + "balance_loss_mlp": 1.03242755, + "epoch": 0.11838268450323162, + "flos": 22820405587200.0, + "grad_norm": 1.8101216483632725, + "language_loss": 0.66978157, + "learning_rate": 3.918616463849087e-06, + "loss": 0.69215596, + "num_input_tokens_seen": 42636285, + "router_z_loss_clip": 1.20605469, + "router_z_loss_mlp": 0.21313477, + "step": 1969, + "time_per_iteration": 2.5000085830688477 + }, + { + "auxiliary_loss_clip": 0.01183546, + "auxiliary_loss_mlp": 0.01060244, + "balance_loss_clip": 1.06665647, + "balance_loss_mlp": 1.0394299, + "epoch": 0.11844280775589959, + "flos": 33545844990720.0, + "grad_norm": 2.02313176749195, + "language_loss": 0.80777824, + "learning_rate": 3.918506458695399e-06, + "loss": 0.83021617, + "num_input_tokens_seen": 42658320, + "router_z_loss_clip": 1.16992188, + "router_z_loss_mlp": 0.20812988, + "step": 1970, + "time_per_iteration": 2.7318308353424072 + }, + { + "auxiliary_loss_clip": 0.0109026, + "auxiliary_loss_mlp": 0.01015056, + "balance_loss_clip": 1.04309499, + "balance_loss_mlp": 1.01121795, + "epoch": 0.11850293100856757, + "flos": 66350998604160.0, + "grad_norm": 0.81641073812279, + "language_loss": 0.66145873, + "learning_rate": 3.918396380791754e-06, + "loss": 0.68251193, + "num_input_tokens_seen": 42721500, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 0.03839111, + "step": 1971, + "time_per_iteration": 3.0783419609069824 + }, + { + "auxiliary_loss_clip": 0.01178338, + "auxiliary_loss_mlp": 0.01054018, + "balance_loss_clip": 1.05828357, + "balance_loss_mlp": 1.03456283, + "epoch": 0.11856305426123553, + "flos": 24681045070080.0, + "grad_norm": 2.0647210208826507, + "language_loss": 0.79281902, + "learning_rate": 3.918286230142327e-06, + "loss": 0.81514263, + "num_input_tokens_seen": 42739825, + "router_z_loss_clip": 1.19921875, + "router_z_loss_mlp": 0.19470215, + "step": 1972, + "time_per_iteration": 2.5623128414154053 + }, + { + "auxiliary_loss_clip": 0.01178057, + "auxiliary_loss_mlp": 0.01053153, + "balance_loss_clip": 1.05951941, + "balance_loss_mlp": 1.0326612, + "epoch": 0.1186231775139035, + "flos": 24280102483200.0, + "grad_norm": 2.2304273104681664, + "language_loss": 0.72724187, + "learning_rate": 3.918176006751292e-06, + "loss": 0.74955404, + "num_input_tokens_seen": 42758695, + "router_z_loss_clip": 1.18457031, + "router_z_loss_mlp": 0.20495605, + "step": 1973, + "time_per_iteration": 2.495591402053833 + }, + { + "auxiliary_loss_clip": 0.01177632, + "auxiliary_loss_mlp": 0.01047159, + "balance_loss_clip": 1.06132984, + "balance_loss_mlp": 1.0266912, + "epoch": 0.11868330076657148, + "flos": 21757413473280.0, + "grad_norm": 1.6697815283366697, + "language_loss": 0.71981144, + "learning_rate": 3.918065710622832e-06, + "loss": 0.74205935, + "num_input_tokens_seen": 42778510, + "router_z_loss_clip": 1.16210938, + "router_z_loss_mlp": 0.20471191, + "step": 1974, + "time_per_iteration": 2.5494906902313232 + }, + { + "auxiliary_loss_clip": 0.01177271, + "auxiliary_loss_mlp": 0.0104314, + "balance_loss_clip": 1.06008577, + "balance_loss_mlp": 1.02294636, + "epoch": 0.11874342401923944, + "flos": 17193274894080.0, + "grad_norm": 2.1523176895380334, + "language_loss": 0.77658343, + "learning_rate": 3.917955341761128e-06, + "loss": 0.79878747, + "num_input_tokens_seen": 42793995, + "router_z_loss_clip": 1.16992188, + "router_z_loss_mlp": 0.2019043, + "step": 1975, + "time_per_iteration": 2.4031739234924316 + }, + { + "auxiliary_loss_clip": 0.01182387, + "auxiliary_loss_mlp": 0.01060303, + "balance_loss_clip": 1.06513882, + "balance_loss_mlp": 1.04022861, + "epoch": 0.11880354727190741, + "flos": 15229572312960.0, + "grad_norm": 2.5785014544468834, + "language_loss": 0.75360096, + "learning_rate": 3.917844900170364e-06, + "loss": 0.77602792, + "num_input_tokens_seen": 42809000, + "router_z_loss_clip": 1.17382812, + "router_z_loss_mlp": 0.20080566, + "step": 1976, + "time_per_iteration": 2.4856488704681396 + }, + { + "auxiliary_loss_clip": 0.01173983, + "auxiliary_loss_mlp": 0.0104919, + "balance_loss_clip": 1.05800486, + "balance_loss_mlp": 1.02953196, + "epoch": 0.11886367052457537, + "flos": 27309706179840.0, + "grad_norm": 1.6715445460438019, + "language_loss": 0.7547828, + "learning_rate": 3.91773438585473e-06, + "loss": 0.77701449, + "num_input_tokens_seen": 42831585, + "router_z_loss_clip": 1.16113281, + "router_z_loss_mlp": 0.1965332, + "step": 1977, + "time_per_iteration": 4.081012725830078 + }, + { + "auxiliary_loss_clip": 0.01176633, + "auxiliary_loss_mlp": 0.0105272, + "balance_loss_clip": 1.05702555, + "balance_loss_mlp": 1.03299117, + "epoch": 0.11892379377724335, + "flos": 21798280172160.0, + "grad_norm": 2.1904816080718543, + "language_loss": 0.73831499, + "learning_rate": 3.9176237988184165e-06, + "loss": 0.76060855, + "num_input_tokens_seen": 42848420, + "router_z_loss_clip": 1.1953125, + "router_z_loss_mlp": 0.19726562, + "step": 1978, + "time_per_iteration": 2.443931818008423 + }, + { + "auxiliary_loss_clip": 0.01176049, + "auxiliary_loss_mlp": 0.010521, + "balance_loss_clip": 1.05960989, + "balance_loss_mlp": 1.03103566, + "epoch": 0.11898391702991132, + "flos": 13991013498240.0, + "grad_norm": 2.2623645578911185, + "language_loss": 0.7341193, + "learning_rate": 3.917513139065616e-06, + "loss": 0.75640076, + "num_input_tokens_seen": 42866645, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.21032715, + "step": 1979, + "time_per_iteration": 2.484513998031616 + }, + { + "auxiliary_loss_clip": 0.01192466, + "auxiliary_loss_mlp": 0.01054068, + "balance_loss_clip": 1.07106876, + "balance_loss_mlp": 1.03426778, + "epoch": 0.11904404028257928, + "flos": 32234567091840.0, + "grad_norm": 1.6964360612260538, + "language_loss": 0.98650211, + "learning_rate": 3.917402406600525e-06, + "loss": 1.00896752, + "num_input_tokens_seen": 42888515, + "router_z_loss_clip": 1.21484375, + "router_z_loss_mlp": 0.19799805, + "step": 1980, + "time_per_iteration": 3.9370875358581543 + }, + { + "auxiliary_loss_clip": 0.01187158, + "auxiliary_loss_mlp": 0.01053201, + "balance_loss_clip": 1.0657202, + "balance_loss_mlp": 1.03223181, + "epoch": 0.11910416353524726, + "flos": 23586272398080.0, + "grad_norm": 1.8180776901366476, + "language_loss": 0.86364722, + "learning_rate": 3.917291601427342e-06, + "loss": 0.88605082, + "num_input_tokens_seen": 42909035, + "router_z_loss_clip": 1.21582031, + "router_z_loss_mlp": 0.2097168, + "step": 1981, + "time_per_iteration": 2.4718639850616455 + }, + { + "auxiliary_loss_clip": 0.01186337, + "auxiliary_loss_mlp": 0.01059538, + "balance_loss_clip": 1.06539631, + "balance_loss_mlp": 1.03815162, + "epoch": 0.11916428678791523, + "flos": 25333038789120.0, + "grad_norm": 1.8925039035291276, + "language_loss": 0.85394704, + "learning_rate": 3.91718072355027e-06, + "loss": 0.87640584, + "num_input_tokens_seen": 42927555, + "router_z_loss_clip": 1.20898438, + "router_z_loss_mlp": 0.21374512, + "step": 1982, + "time_per_iteration": 5.330406665802002 + }, + { + "auxiliary_loss_clip": 0.01179059, + "auxiliary_loss_mlp": 0.01047783, + "balance_loss_clip": 1.06121504, + "balance_loss_mlp": 1.02884102, + "epoch": 0.11922441004058319, + "flos": 19788431592960.0, + "grad_norm": 2.1589007662676276, + "language_loss": 0.85491353, + "learning_rate": 3.917069772973513e-06, + "loss": 0.87718201, + "num_input_tokens_seen": 42945300, + "router_z_loss_clip": 1.17871094, + "router_z_loss_mlp": 0.18920898, + "step": 1983, + "time_per_iteration": 2.4100399017333984 + }, + { + "auxiliary_loss_clip": 0.0118316, + "auxiliary_loss_mlp": 0.01057867, + "balance_loss_clip": 1.06255198, + "balance_loss_mlp": 1.03715992, + "epoch": 0.11928453329325117, + "flos": 21536347219200.0, + "grad_norm": 3.267872373885702, + "language_loss": 0.77004981, + "learning_rate": 3.916958749701277e-06, + "loss": 0.79246002, + "num_input_tokens_seen": 42961295, + "router_z_loss_clip": 1.20605469, + "router_z_loss_mlp": 0.20690918, + "step": 1984, + "time_per_iteration": 2.4604012966156006 + }, + { + "auxiliary_loss_clip": 0.01186027, + "auxiliary_loss_mlp": 0.0105051, + "balance_loss_clip": 1.06710339, + "balance_loss_mlp": 1.03065002, + "epoch": 0.11934465654591914, + "flos": 20815010294400.0, + "grad_norm": 1.8355679819201558, + "language_loss": 0.83132738, + "learning_rate": 3.9168476537377745e-06, + "loss": 0.85369277, + "num_input_tokens_seen": 42980330, + "router_z_loss_clip": 1.18945312, + "router_z_loss_mlp": 0.19873047, + "step": 1985, + "time_per_iteration": 2.4494869709014893 + }, + { + "auxiliary_loss_clip": 0.01179852, + "auxiliary_loss_mlp": 0.01046417, + "balance_loss_clip": 1.06191993, + "balance_loss_mlp": 1.0262233, + "epoch": 0.1194047797985871, + "flos": 19060486565760.0, + "grad_norm": 1.9963826998407996, + "language_loss": 0.73767024, + "learning_rate": 3.916736485087216e-06, + "loss": 0.75993288, + "num_input_tokens_seen": 42996125, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.20202637, + "step": 1986, + "time_per_iteration": 2.436108350753784 + }, + { + "auxiliary_loss_clip": 0.01181436, + "auxiliary_loss_mlp": 0.01050464, + "balance_loss_clip": 1.06331515, + "balance_loss_mlp": 1.03148532, + "epoch": 0.11946490305125507, + "flos": 27190805184000.0, + "grad_norm": 2.5625105233376284, + "language_loss": 0.7199167, + "learning_rate": 3.916625243753819e-06, + "loss": 0.7422356, + "num_input_tokens_seen": 43014180, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.18981934, + "step": 1987, + "time_per_iteration": 2.485093832015991 + }, + { + "auxiliary_loss_clip": 0.01174741, + "auxiliary_loss_mlp": 0.01054061, + "balance_loss_clip": 1.05710304, + "balance_loss_mlp": 1.03317595, + "epoch": 0.11952502630392305, + "flos": 21140791672320.0, + "grad_norm": 3.167772080944109, + "language_loss": 0.71756911, + "learning_rate": 3.916513929741799e-06, + "loss": 0.7398572, + "num_input_tokens_seen": 43032120, + "router_z_loss_clip": 1.17675781, + "router_z_loss_mlp": 0.2088623, + "step": 1988, + "time_per_iteration": 2.469285011291504 + }, + { + "auxiliary_loss_clip": 0.0117259, + "auxiliary_loss_mlp": 0.01062297, + "balance_loss_clip": 1.05561769, + "balance_loss_mlp": 1.0401355, + "epoch": 0.11958514955659101, + "flos": 22124241118080.0, + "grad_norm": 1.7361515432621484, + "language_loss": 0.80757672, + "learning_rate": 3.91640254305538e-06, + "loss": 0.82992554, + "num_input_tokens_seen": 43052215, + "router_z_loss_clip": 1.16992188, + "router_z_loss_mlp": 0.22180176, + "step": 1989, + "time_per_iteration": 2.4874775409698486 + }, + { + "auxiliary_loss_clip": 0.01181127, + "auxiliary_loss_mlp": 0.01055087, + "balance_loss_clip": 1.06342816, + "balance_loss_mlp": 1.03510714, + "epoch": 0.11964527280925898, + "flos": 17421452040960.0, + "grad_norm": 2.482836404021965, + "language_loss": 0.75333118, + "learning_rate": 3.916291083698784e-06, + "loss": 0.7756933, + "num_input_tokens_seen": 43069720, + "router_z_loss_clip": 1.17773438, + "router_z_loss_mlp": 0.19995117, + "step": 1990, + "time_per_iteration": 2.5566115379333496 + }, + { + "auxiliary_loss_clip": 0.01091035, + "auxiliary_loss_mlp": 0.0103427, + "balance_loss_clip": 1.04264712, + "balance_loss_mlp": 1.03090847, + "epoch": 0.11970539606192696, + "flos": 70679741402880.0, + "grad_norm": 0.8595625265249799, + "language_loss": 0.5521698, + "learning_rate": 3.916179551676238e-06, + "loss": 0.57342279, + "num_input_tokens_seen": 43123130, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 0.03369141, + "step": 1991, + "time_per_iteration": 3.1276206970214844 + }, + { + "auxiliary_loss_clip": 0.01176788, + "auxiliary_loss_mlp": 0.01053959, + "balance_loss_clip": 1.06273007, + "balance_loss_mlp": 1.03448057, + "epoch": 0.11976551931459492, + "flos": 21215019127680.0, + "grad_norm": 2.251931440781551, + "language_loss": 0.77897537, + "learning_rate": 3.916067946991971e-06, + "loss": 0.80128288, + "num_input_tokens_seen": 43140015, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.19482422, + "step": 1992, + "time_per_iteration": 2.4816653728485107 + }, + { + "auxiliary_loss_clip": 0.01179284, + "auxiliary_loss_mlp": 0.01056612, + "balance_loss_clip": 1.05954301, + "balance_loss_mlp": 1.03441536, + "epoch": 0.11982564256726289, + "flos": 25989306226560.0, + "grad_norm": 2.2794339896248843, + "language_loss": 0.78900051, + "learning_rate": 3.915956269650216e-06, + "loss": 0.81135952, + "num_input_tokens_seen": 43160105, + "router_z_loss_clip": 1.19726562, + "router_z_loss_mlp": 0.22192383, + "step": 1993, + "time_per_iteration": 2.5648956298828125 + }, + { + "auxiliary_loss_clip": 0.01171107, + "auxiliary_loss_mlp": 0.01055607, + "balance_loss_clip": 1.05527139, + "balance_loss_mlp": 1.03633118, + "epoch": 0.11988576581993086, + "flos": 21650866755840.0, + "grad_norm": 2.076010023311103, + "language_loss": 0.82263744, + "learning_rate": 3.915844519655208e-06, + "loss": 0.84490454, + "num_input_tokens_seen": 43179835, + "router_z_loss_clip": 1.15820312, + "router_z_loss_mlp": 0.19274902, + "step": 1994, + "time_per_iteration": 2.4781856536865234 + }, + { + "auxiliary_loss_clip": 0.01189625, + "auxiliary_loss_mlp": 0.01056652, + "balance_loss_clip": 1.07093239, + "balance_loss_mlp": 1.03778112, + "epoch": 0.11994588907259883, + "flos": 17857407409920.0, + "grad_norm": 2.5671709893129884, + "language_loss": 0.88499343, + "learning_rate": 3.915732697011183e-06, + "loss": 0.90745628, + "num_input_tokens_seen": 43197210, + "router_z_loss_clip": 1.18652344, + "router_z_loss_mlp": 0.18859863, + "step": 1995, + "time_per_iteration": 2.4650070667266846 + }, + { + "auxiliary_loss_clip": 0.01177918, + "auxiliary_loss_mlp": 0.01056368, + "balance_loss_clip": 1.06063104, + "balance_loss_mlp": 1.0366385, + "epoch": 0.1200060123252668, + "flos": 24462744163200.0, + "grad_norm": 2.260986001373498, + "language_loss": 0.74310756, + "learning_rate": 3.9156208017223825e-06, + "loss": 0.76545036, + "num_input_tokens_seen": 43215050, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.19726562, + "step": 1996, + "time_per_iteration": 2.5221400260925293 + }, + { + "auxiliary_loss_clip": 0.01171365, + "auxiliary_loss_mlp": 0.01049409, + "balance_loss_clip": 1.05571246, + "balance_loss_mlp": 1.02871394, + "epoch": 0.12006613557793476, + "flos": 18732191235840.0, + "grad_norm": 1.9077818989171398, + "language_loss": 0.88282645, + "learning_rate": 3.915508833793048e-06, + "loss": 0.90503424, + "num_input_tokens_seen": 43233900, + "router_z_loss_clip": 1.15527344, + "router_z_loss_mlp": 0.20678711, + "step": 1997, + "time_per_iteration": 2.4876198768615723 + }, + { + "auxiliary_loss_clip": 0.01171052, + "auxiliary_loss_mlp": 0.01067403, + "balance_loss_clip": 1.05507672, + "balance_loss_mlp": 1.04680407, + "epoch": 0.12012625883060274, + "flos": 22267739952000.0, + "grad_norm": 1.8079172210966061, + "language_loss": 0.78845537, + "learning_rate": 3.915396793227428e-06, + "loss": 0.81083989, + "num_input_tokens_seen": 43252105, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.20605469, + "step": 1998, + "time_per_iteration": 2.5590267181396484 + }, + { + "auxiliary_loss_clip": 0.01181731, + "auxiliary_loss_mlp": 0.01054773, + "balance_loss_clip": 1.06423497, + "balance_loss_mlp": 1.03409028, + "epoch": 0.1201863820832707, + "flos": 21758885930880.0, + "grad_norm": 1.828880949535967, + "language_loss": 0.73478043, + "learning_rate": 3.915284680029769e-06, + "loss": 0.75714546, + "num_input_tokens_seen": 43270315, + "router_z_loss_clip": 1.17578125, + "router_z_loss_mlp": 0.20678711, + "step": 1999, + "time_per_iteration": 2.554716110229492 + }, + { + "auxiliary_loss_clip": 0.01175535, + "auxiliary_loss_mlp": 0.0106949, + "balance_loss_clip": 1.05844259, + "balance_loss_mlp": 1.04896259, + "epoch": 0.12024650533593867, + "flos": 21907987286400.0, + "grad_norm": 2.5890868606431674, + "language_loss": 0.73975718, + "learning_rate": 3.915172494204323e-06, + "loss": 0.76220739, + "num_input_tokens_seen": 43289935, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.20532227, + "step": 2000, + "time_per_iteration": 2.5261669158935547 + }, + { + "auxiliary_loss_clip": 0.01175587, + "auxiliary_loss_mlp": 0.01056587, + "balance_loss_clip": 1.05708694, + "balance_loss_mlp": 1.03672636, + "epoch": 0.12030662858860665, + "flos": 21689219502720.0, + "grad_norm": 1.6485754296050776, + "language_loss": 0.84999037, + "learning_rate": 3.915060235755344e-06, + "loss": 0.87231219, + "num_input_tokens_seen": 43309325, + "router_z_loss_clip": 1.18554688, + "router_z_loss_mlp": 0.1986084, + "step": 2001, + "time_per_iteration": 2.472052574157715 + }, + { + "auxiliary_loss_clip": 0.01178575, + "auxiliary_loss_mlp": 0.01057288, + "balance_loss_clip": 1.0606786, + "balance_loss_mlp": 1.03757095, + "epoch": 0.12036675184127461, + "flos": 12933228856320.0, + "grad_norm": 2.0733364063306823, + "language_loss": 0.74273431, + "learning_rate": 3.91494790468709e-06, + "loss": 0.76509291, + "num_input_tokens_seen": 43327010, + "router_z_loss_clip": 1.18164062, + "router_z_loss_mlp": 0.19714355, + "step": 2002, + "time_per_iteration": 2.422980546951294 + }, + { + "auxiliary_loss_clip": 0.01183917, + "auxiliary_loss_mlp": 0.01055649, + "balance_loss_clip": 1.06057322, + "balance_loss_mlp": 1.03406, + "epoch": 0.12042687509394258, + "flos": 20851028657280.0, + "grad_norm": 3.3786529519498023, + "language_loss": 0.77997482, + "learning_rate": 3.9148355010038185e-06, + "loss": 0.80237049, + "num_input_tokens_seen": 43345650, + "router_z_loss_clip": 1.23339844, + "router_z_loss_mlp": 0.21594238, + "step": 2003, + "time_per_iteration": 2.4684314727783203 + }, + { + "auxiliary_loss_clip": 0.01171776, + "auxiliary_loss_mlp": 0.0105182, + "balance_loss_clip": 1.05699122, + "balance_loss_mlp": 1.03092265, + "epoch": 0.12048699834661056, + "flos": 23878513451520.0, + "grad_norm": 1.8473983604357156, + "language_loss": 0.72190118, + "learning_rate": 3.914723024709793e-06, + "loss": 0.74413711, + "num_input_tokens_seen": 43365555, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.20898438, + "step": 2004, + "time_per_iteration": 2.4903388023376465 + }, + { + "auxiliary_loss_clip": 0.01180907, + "auxiliary_loss_mlp": 0.01067821, + "balance_loss_clip": 1.0593009, + "balance_loss_mlp": 1.04366946, + "epoch": 0.12054712159927852, + "flos": 19756363726080.0, + "grad_norm": 1.9236028198142285, + "language_loss": 0.78179264, + "learning_rate": 3.914610475809279e-06, + "loss": 0.80427992, + "num_input_tokens_seen": 43384990, + "router_z_loss_clip": 1.21582031, + "router_z_loss_mlp": 0.24169922, + "step": 2005, + "time_per_iteration": 2.5329763889312744 + }, + { + "auxiliary_loss_clip": 0.01089292, + "auxiliary_loss_mlp": 0.01021352, + "balance_loss_clip": 1.04325747, + "balance_loss_mlp": 1.01808858, + "epoch": 0.12060724485194649, + "flos": 51672763123200.0, + "grad_norm": 0.9365286550070724, + "language_loss": 0.58075488, + "learning_rate": 3.914497854306543e-06, + "loss": 0.6018613, + "num_input_tokens_seen": 43436335, + "router_z_loss_clip": 0.45996094, + "router_z_loss_mlp": 0.03262329, + "step": 2006, + "time_per_iteration": 2.882516860961914 + }, + { + "auxiliary_loss_clip": 0.0118246, + "auxiliary_loss_mlp": 0.01054743, + "balance_loss_clip": 1.06747901, + "balance_loss_mlp": 1.03410816, + "epoch": 0.12066736810461445, + "flos": 18990425088000.0, + "grad_norm": 1.9008504940627093, + "language_loss": 0.76115239, + "learning_rate": 3.9143851602058575e-06, + "loss": 0.78352439, + "num_input_tokens_seen": 43456495, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.2064209, + "step": 2007, + "time_per_iteration": 2.5872926712036133 + }, + { + "auxiliary_loss_clip": 0.01181382, + "auxiliary_loss_mlp": 0.0105673, + "balance_loss_clip": 1.06301999, + "balance_loss_mlp": 1.03595161, + "epoch": 0.12072749135728243, + "flos": 16471973882880.0, + "grad_norm": 2.9002861112890215, + "language_loss": 0.83191264, + "learning_rate": 3.914272393511494e-06, + "loss": 0.8542937, + "num_input_tokens_seen": 43473085, + "router_z_loss_clip": 1.18359375, + "router_z_loss_mlp": 0.20776367, + "step": 2008, + "time_per_iteration": 2.4680802822113037 + }, + { + "auxiliary_loss_clip": 0.01179988, + "auxiliary_loss_mlp": 0.01049305, + "balance_loss_clip": 1.06313884, + "balance_loss_mlp": 1.02858675, + "epoch": 0.1207876146099504, + "flos": 18077108947200.0, + "grad_norm": 2.335274635931841, + "language_loss": 0.83668804, + "learning_rate": 3.91415955422773e-06, + "loss": 0.85898095, + "num_input_tokens_seen": 43491135, + "router_z_loss_clip": 1.16894531, + "router_z_loss_mlp": 0.20751953, + "step": 2009, + "time_per_iteration": 2.4806365966796875 + }, + { + "auxiliary_loss_clip": 0.0118695, + "auxiliary_loss_mlp": 0.01051286, + "balance_loss_clip": 1.06668055, + "balance_loss_mlp": 1.028934, + "epoch": 0.12084773786261836, + "flos": 21871573873920.0, + "grad_norm": 2.2787321176259776, + "language_loss": 0.84071499, + "learning_rate": 3.914046642358844e-06, + "loss": 0.86309737, + "num_input_tokens_seen": 43510440, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.22375488, + "step": 2010, + "time_per_iteration": 2.453023672103882 + }, + { + "auxiliary_loss_clip": 0.01183258, + "auxiliary_loss_mlp": 0.01055003, + "balance_loss_clip": 1.06611013, + "balance_loss_mlp": 1.03409338, + "epoch": 0.12090786111528634, + "flos": 18333044328960.0, + "grad_norm": 1.7131244128382148, + "language_loss": 0.8433736, + "learning_rate": 3.9139336579091174e-06, + "loss": 0.86575621, + "num_input_tokens_seen": 43530145, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.20898438, + "step": 2011, + "time_per_iteration": 2.510286331176758 + }, + { + "auxiliary_loss_clip": 0.01178949, + "auxiliary_loss_mlp": 0.01058814, + "balance_loss_clip": 1.05969763, + "balance_loss_mlp": 1.03800046, + "epoch": 0.1209679843679543, + "flos": 21105850717440.0, + "grad_norm": 2.110822325797608, + "language_loss": 0.96320176, + "learning_rate": 3.913820600882834e-06, + "loss": 0.98557937, + "num_input_tokens_seen": 43549315, + "router_z_loss_clip": 1.19238281, + "router_z_loss_mlp": 0.20812988, + "step": 2012, + "time_per_iteration": 2.464627742767334 + }, + { + "auxiliary_loss_clip": 0.01174151, + "auxiliary_loss_mlp": 0.01047069, + "balance_loss_clip": 1.05978119, + "balance_loss_mlp": 1.02615976, + "epoch": 0.12102810762062227, + "flos": 29241053585280.0, + "grad_norm": 2.0785346756903254, + "language_loss": 0.8060292, + "learning_rate": 3.913707471284283e-06, + "loss": 0.82824141, + "num_input_tokens_seen": 43569240, + "router_z_loss_clip": 1.14355469, + "router_z_loss_mlp": 0.20922852, + "step": 2013, + "time_per_iteration": 2.5753071308135986 + }, + { + "auxiliary_loss_clip": 0.01180797, + "auxiliary_loss_mlp": 0.01048865, + "balance_loss_clip": 1.06008506, + "balance_loss_mlp": 1.02658439, + "epoch": 0.12108823087329025, + "flos": 17930701111680.0, + "grad_norm": 3.8635937275942225, + "language_loss": 0.77244043, + "learning_rate": 3.9135942691177515e-06, + "loss": 0.79473704, + "num_input_tokens_seen": 43587710, + "router_z_loss_clip": 1.20703125, + "router_z_loss_mlp": 0.22277832, + "step": 2014, + "time_per_iteration": 2.428131103515625 + }, + { + "auxiliary_loss_clip": 0.011779, + "auxiliary_loss_mlp": 0.0104707, + "balance_loss_clip": 1.06126833, + "balance_loss_mlp": 1.02622032, + "epoch": 0.12114835412595822, + "flos": 22091850028800.0, + "grad_norm": 2.0954138603405363, + "language_loss": 0.86972308, + "learning_rate": 3.913480994387535e-06, + "loss": 0.89197278, + "num_input_tokens_seen": 43606000, + "router_z_loss_clip": 1.16601562, + "router_z_loss_mlp": 0.20837402, + "step": 2015, + "time_per_iteration": 2.5060226917266846 + }, + { + "auxiliary_loss_clip": 0.01179299, + "auxiliary_loss_mlp": 0.01052567, + "balance_loss_clip": 1.06179595, + "balance_loss_mlp": 1.03119326, + "epoch": 0.12120847737862618, + "flos": 20412343854720.0, + "grad_norm": 1.967003715837108, + "language_loss": 0.69280875, + "learning_rate": 3.913367647097926e-06, + "loss": 0.71512747, + "num_input_tokens_seen": 43624815, + "router_z_loss_clip": 1.17480469, + "router_z_loss_mlp": 0.21386719, + "step": 2016, + "time_per_iteration": 2.420778751373291 + }, + { + "auxiliary_loss_clip": 0.01175939, + "auxiliary_loss_mlp": 0.01052315, + "balance_loss_clip": 1.05801058, + "balance_loss_mlp": 1.02922368, + "epoch": 0.12126860063129415, + "flos": 22309037614080.0, + "grad_norm": 2.4320236845670355, + "language_loss": 0.80387962, + "learning_rate": 3.913254227253225e-06, + "loss": 0.82616222, + "num_input_tokens_seen": 43643960, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.23059082, + "step": 2017, + "time_per_iteration": 2.56735897064209 + }, + { + "auxiliary_loss_clip": 0.01178658, + "auxiliary_loss_mlp": 0.01051318, + "balance_loss_clip": 1.06029582, + "balance_loss_mlp": 1.02925277, + "epoch": 0.12132872388396213, + "flos": 13699275235200.0, + "grad_norm": 2.781365994635666, + "language_loss": 0.6873982, + "learning_rate": 3.913140734857731e-06, + "loss": 0.70969802, + "num_input_tokens_seen": 43662650, + "router_z_loss_clip": 1.18359375, + "router_z_loss_mlp": 0.22058105, + "step": 2018, + "time_per_iteration": 2.4444868564605713 + }, + { + "auxiliary_loss_clip": 0.01186216, + "auxiliary_loss_mlp": 0.01051091, + "balance_loss_clip": 1.06959534, + "balance_loss_mlp": 1.0310992, + "epoch": 0.12138884713663009, + "flos": 26466954307200.0, + "grad_norm": 2.7229108022777235, + "language_loss": 0.72484469, + "learning_rate": 3.91302716991575e-06, + "loss": 0.74721771, + "num_input_tokens_seen": 43684205, + "router_z_loss_clip": 1.16601562, + "router_z_loss_mlp": 0.19995117, + "step": 2019, + "time_per_iteration": 2.5419392585754395 + }, + { + "auxiliary_loss_clip": 0.01182481, + "auxiliary_loss_mlp": 0.01059887, + "balance_loss_clip": 1.06050146, + "balance_loss_mlp": 1.03772616, + "epoch": 0.12144897038929806, + "flos": 26141603892480.0, + "grad_norm": 1.8053495237973733, + "language_loss": 0.92293453, + "learning_rate": 3.912913532431586e-06, + "loss": 0.94535822, + "num_input_tokens_seen": 43706320, + "router_z_loss_clip": 1.21777344, + "router_z_loss_mlp": 0.22167969, + "step": 2020, + "time_per_iteration": 3.968510150909424 + }, + { + "auxiliary_loss_clip": 0.01186171, + "auxiliary_loss_mlp": 0.0105415, + "balance_loss_clip": 1.06771815, + "balance_loss_mlp": 1.03363371, + "epoch": 0.12150909364196603, + "flos": 24717530309760.0, + "grad_norm": 2.455155926716498, + "language_loss": 0.77700615, + "learning_rate": 3.912799822409549e-06, + "loss": 0.79940933, + "num_input_tokens_seen": 43724805, + "router_z_loss_clip": 1.18359375, + "router_z_loss_mlp": 0.2052002, + "step": 2021, + "time_per_iteration": 2.474491834640503 + }, + { + "auxiliary_loss_clip": 0.01193533, + "auxiliary_loss_mlp": 0.01050794, + "balance_loss_clip": 1.07531762, + "balance_loss_mlp": 1.02971756, + "epoch": 0.121569216894634, + "flos": 25186990089600.0, + "grad_norm": 1.962009016660513, + "language_loss": 0.80431044, + "learning_rate": 3.912686039853952e-06, + "loss": 0.82675374, + "num_input_tokens_seen": 43742320, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.21057129, + "step": 2022, + "time_per_iteration": 3.9081220626831055 + }, + { + "auxiliary_loss_clip": 0.01182285, + "auxiliary_loss_mlp": 0.01049919, + "balance_loss_clip": 1.06565619, + "balance_loss_mlp": 1.0293076, + "epoch": 0.12162934014730196, + "flos": 13444094039040.0, + "grad_norm": 1.801797943442884, + "language_loss": 0.85187376, + "learning_rate": 3.912572184769108e-06, + "loss": 0.87419581, + "num_input_tokens_seen": 43760665, + "router_z_loss_clip": 1.16894531, + "router_z_loss_mlp": 0.20617676, + "step": 2023, + "time_per_iteration": 2.4129059314727783 + }, + { + "auxiliary_loss_clip": 0.01185571, + "auxiliary_loss_mlp": 0.01054582, + "balance_loss_clip": 1.06507468, + "balance_loss_mlp": 1.0327189, + "epoch": 0.12168946339996994, + "flos": 16946138344320.0, + "grad_norm": 2.172701211292546, + "language_loss": 0.85779905, + "learning_rate": 3.912458257159335e-06, + "loss": 0.88020051, + "num_input_tokens_seen": 43779020, + "router_z_loss_clip": 1.20605469, + "router_z_loss_mlp": 0.21875, + "step": 2024, + "time_per_iteration": 2.437638998031616 + }, + { + "auxiliary_loss_clip": 0.01172421, + "auxiliary_loss_mlp": 0.01067543, + "balance_loss_clip": 1.0542146, + "balance_loss_mlp": 1.04483366, + "epoch": 0.12174958665263791, + "flos": 29821585196160.0, + "grad_norm": 2.158362164639516, + "language_loss": 0.72035307, + "learning_rate": 3.912344257028954e-06, + "loss": 0.74275267, + "num_input_tokens_seen": 43798850, + "router_z_loss_clip": 1.18164062, + "router_z_loss_mlp": 0.22680664, + "step": 2025, + "time_per_iteration": 5.3924925327301025 + }, + { + "auxiliary_loss_clip": 0.01177734, + "auxiliary_loss_mlp": 0.0104648, + "balance_loss_clip": 1.05800903, + "balance_loss_mlp": 1.02529693, + "epoch": 0.12180970990530587, + "flos": 24641902224000.0, + "grad_norm": 1.673322752085921, + "language_loss": 0.7604233, + "learning_rate": 3.912230184382286e-06, + "loss": 0.78266537, + "num_input_tokens_seen": 43820130, + "router_z_loss_clip": 1.19628906, + "router_z_loss_mlp": 0.21179199, + "step": 2026, + "time_per_iteration": 2.4763431549072266 + }, + { + "auxiliary_loss_clip": 0.01184812, + "auxiliary_loss_mlp": 0.01045854, + "balance_loss_clip": 1.06483376, + "balance_loss_mlp": 1.02480125, + "epoch": 0.12186983315797385, + "flos": 20521691832960.0, + "grad_norm": 2.5494392671334833, + "language_loss": 0.8871671, + "learning_rate": 3.912116039223659e-06, + "loss": 0.90947378, + "num_input_tokens_seen": 43838485, + "router_z_loss_clip": 1.20019531, + "router_z_loss_mlp": 0.21057129, + "step": 2027, + "time_per_iteration": 2.4778783321380615 + }, + { + "auxiliary_loss_clip": 0.01173323, + "auxiliary_loss_mlp": 0.01057112, + "balance_loss_clip": 1.05725861, + "balance_loss_mlp": 1.03622687, + "epoch": 0.12192995641064182, + "flos": 27818344719360.0, + "grad_norm": 1.6137193271623167, + "language_loss": 0.75609362, + "learning_rate": 3.912001821557399e-06, + "loss": 0.77839804, + "num_input_tokens_seen": 43859080, + "router_z_loss_clip": 1.15917969, + "router_z_loss_mlp": 0.20910645, + "step": 2028, + "time_per_iteration": 2.4711415767669678 + }, + { + "auxiliary_loss_clip": 0.01180721, + "auxiliary_loss_mlp": 0.01054531, + "balance_loss_clip": 1.06332612, + "balance_loss_mlp": 1.03233421, + "epoch": 0.12199007966330978, + "flos": 22017119783040.0, + "grad_norm": 2.168773435508853, + "language_loss": 0.76904082, + "learning_rate": 3.911887531387839e-06, + "loss": 0.79139328, + "num_input_tokens_seen": 43879030, + "router_z_loss_clip": 1.17285156, + "router_z_loss_mlp": 0.2220459, + "step": 2029, + "time_per_iteration": 2.4691717624664307 + }, + { + "auxiliary_loss_clip": 0.01185836, + "auxiliary_loss_mlp": 0.01049042, + "balance_loss_clip": 1.06650484, + "balance_loss_mlp": 1.02838302, + "epoch": 0.12205020291597775, + "flos": 23295216493440.0, + "grad_norm": 1.792454724173938, + "language_loss": 0.79032123, + "learning_rate": 3.911773168719313e-06, + "loss": 0.81266999, + "num_input_tokens_seen": 43898505, + "router_z_loss_clip": 1.19140625, + "router_z_loss_mlp": 0.20666504, + "step": 2030, + "time_per_iteration": 2.4419448375701904 + }, + { + "auxiliary_loss_clip": 0.01175364, + "auxiliary_loss_mlp": 0.01050558, + "balance_loss_clip": 1.0612824, + "balance_loss_mlp": 1.02875423, + "epoch": 0.12211032616864573, + "flos": 26031609469440.0, + "grad_norm": 1.9503930273359242, + "language_loss": 0.74201739, + "learning_rate": 3.911658733556155e-06, + "loss": 0.76427662, + "num_input_tokens_seen": 43917945, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.21813965, + "step": 2031, + "time_per_iteration": 2.5038983821868896 + }, + { + "auxiliary_loss_clip": 0.01181409, + "auxiliary_loss_mlp": 0.01048127, + "balance_loss_clip": 1.06525028, + "balance_loss_mlp": 1.02892232, + "epoch": 0.12217044942131369, + "flos": 20410943224320.0, + "grad_norm": 1.8986874109891718, + "language_loss": 0.74855769, + "learning_rate": 3.911544225902707e-06, + "loss": 0.77085304, + "num_input_tokens_seen": 43937385, + "router_z_loss_clip": 1.16210938, + "router_z_loss_mlp": 0.19213867, + "step": 2032, + "time_per_iteration": 2.4296340942382812 + }, + { + "auxiliary_loss_clip": 0.01168366, + "auxiliary_loss_mlp": 0.01043083, + "balance_loss_clip": 1.05640149, + "balance_loss_mlp": 1.0231154, + "epoch": 0.12223057267398166, + "flos": 22857142222080.0, + "grad_norm": 1.575329720341608, + "language_loss": 0.88999069, + "learning_rate": 3.911429645763311e-06, + "loss": 0.91210514, + "num_input_tokens_seen": 43958130, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.19958496, + "step": 2033, + "time_per_iteration": 2.533507823944092 + }, + { + "auxiliary_loss_clip": 0.01178738, + "auxiliary_loss_mlp": 0.01051945, + "balance_loss_clip": 1.06161582, + "balance_loss_mlp": 1.03150034, + "epoch": 0.12229069592664964, + "flos": 20047563285120.0, + "grad_norm": 1.974795743400854, + "language_loss": 0.64961541, + "learning_rate": 3.911314993142311e-06, + "loss": 0.67192227, + "num_input_tokens_seen": 43976800, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.2043457, + "step": 2034, + "time_per_iteration": 2.424577474594116 + }, + { + "auxiliary_loss_clip": 0.0117799, + "auxiliary_loss_mlp": 0.01052709, + "balance_loss_clip": 1.06177664, + "balance_loss_mlp": 1.03107262, + "epoch": 0.1223508191793176, + "flos": 22274240313600.0, + "grad_norm": 1.7801236064433696, + "language_loss": 0.76738751, + "learning_rate": 3.911200268044055e-06, + "loss": 0.78969455, + "num_input_tokens_seen": 43996620, + "router_z_loss_clip": 1.16308594, + "router_z_loss_mlp": 0.21643066, + "step": 2035, + "time_per_iteration": 2.54542875289917 + }, + { + "auxiliary_loss_clip": 0.01175147, + "auxiliary_loss_mlp": 0.0105245, + "balance_loss_clip": 1.05620623, + "balance_loss_mlp": 1.02950239, + "epoch": 0.12241094243198557, + "flos": 21285978445440.0, + "grad_norm": 2.9283925277762326, + "language_loss": 0.71678221, + "learning_rate": 3.911085470472892e-06, + "loss": 0.73905826, + "num_input_tokens_seen": 44016175, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.22949219, + "step": 2036, + "time_per_iteration": 2.473358631134033 + }, + { + "auxiliary_loss_clip": 0.01184297, + "auxiliary_loss_mlp": 0.01054129, + "balance_loss_clip": 1.06789529, + "balance_loss_mlp": 1.03281403, + "epoch": 0.12247106568465355, + "flos": 17382381022080.0, + "grad_norm": 1.6886102117175448, + "language_loss": 0.83306193, + "learning_rate": 3.910970600433178e-06, + "loss": 0.85544622, + "num_input_tokens_seen": 44035060, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.21337891, + "step": 2037, + "time_per_iteration": 2.4965946674346924 + }, + { + "auxiliary_loss_clip": 0.0118325, + "auxiliary_loss_mlp": 0.01059024, + "balance_loss_clip": 1.06183171, + "balance_loss_mlp": 1.03663659, + "epoch": 0.12253118893732151, + "flos": 27045438842880.0, + "grad_norm": 3.260368331169022, + "language_loss": 0.79818046, + "learning_rate": 3.910855657929267e-06, + "loss": 0.82060325, + "num_input_tokens_seen": 44053330, + "router_z_loss_clip": 1.21484375, + "router_z_loss_mlp": 0.22399902, + "step": 2038, + "time_per_iteration": 2.478450059890747 + }, + { + "auxiliary_loss_clip": 0.01087513, + "auxiliary_loss_mlp": 0.01024814, + "balance_loss_clip": 1.04332674, + "balance_loss_mlp": 1.02100563, + "epoch": 0.12259131218998948, + "flos": 53861518368000.0, + "grad_norm": 0.8256730281458312, + "language_loss": 0.5863741, + "learning_rate": 3.910740642965518e-06, + "loss": 0.60749739, + "num_input_tokens_seen": 44107575, + "router_z_loss_clip": 0.44189453, + "router_z_loss_mlp": 0.03808594, + "step": 2039, + "time_per_iteration": 2.9883179664611816 + }, + { + "auxiliary_loss_clip": 0.01176087, + "auxiliary_loss_mlp": 0.01056017, + "balance_loss_clip": 1.05743384, + "balance_loss_mlp": 1.03387964, + "epoch": 0.12265143544265744, + "flos": 17891917401600.0, + "grad_norm": 2.0919969217607726, + "language_loss": 0.80394375, + "learning_rate": 3.910625555546292e-06, + "loss": 0.82626486, + "num_input_tokens_seen": 44126075, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.22143555, + "step": 2040, + "time_per_iteration": 2.4523632526397705 + }, + { + "auxiliary_loss_clip": 0.01177521, + "auxiliary_loss_mlp": 0.01050291, + "balance_loss_clip": 1.06179404, + "balance_loss_mlp": 1.0291549, + "epoch": 0.12271155869532542, + "flos": 21799932197760.0, + "grad_norm": 1.9673644335762077, + "language_loss": 0.82884252, + "learning_rate": 3.910510395675953e-06, + "loss": 0.85112071, + "num_input_tokens_seen": 44145605, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.21118164, + "step": 2041, + "time_per_iteration": 2.488646984100342 + }, + { + "auxiliary_loss_clip": 0.01196238, + "auxiliary_loss_mlp": 0.01049486, + "balance_loss_clip": 1.07479489, + "balance_loss_mlp": 1.02793336, + "epoch": 0.12277168194799339, + "flos": 19828759587840.0, + "grad_norm": 1.7797134265943229, + "language_loss": 0.67614126, + "learning_rate": 3.9103951633588694e-06, + "loss": 0.6985985, + "num_input_tokens_seen": 44164770, + "router_z_loss_clip": 1.21191406, + "router_z_loss_mlp": 0.21557617, + "step": 2042, + "time_per_iteration": 2.4353713989257812 + }, + { + "auxiliary_loss_clip": 0.01174846, + "auxiliary_loss_mlp": 0.01055957, + "balance_loss_clip": 1.05917788, + "balance_loss_mlp": 1.03511918, + "epoch": 0.12283180520066135, + "flos": 23221024951680.0, + "grad_norm": 1.7135947554179423, + "language_loss": 0.8152715, + "learning_rate": 3.910279858599409e-06, + "loss": 0.83757949, + "num_input_tokens_seen": 44184025, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.20849609, + "step": 2043, + "time_per_iteration": 2.495570659637451 + }, + { + "auxiliary_loss_clip": 0.01170362, + "auxiliary_loss_mlp": 0.01047749, + "balance_loss_clip": 1.05344486, + "balance_loss_mlp": 1.02617168, + "epoch": 0.12289192845332933, + "flos": 18588476920320.0, + "grad_norm": 1.8554277097585665, + "language_loss": 0.80373043, + "learning_rate": 3.910164481401946e-06, + "loss": 0.82591152, + "num_input_tokens_seen": 44202950, + "router_z_loss_clip": 1.16796875, + "router_z_loss_mlp": 0.21569824, + "step": 2044, + "time_per_iteration": 2.424705982208252 + }, + { + "auxiliary_loss_clip": 0.01174703, + "auxiliary_loss_mlp": 0.01051979, + "balance_loss_clip": 1.06022143, + "balance_loss_mlp": 1.03068805, + "epoch": 0.1229520517059973, + "flos": 25769532862080.0, + "grad_norm": 1.7942392900390758, + "language_loss": 0.78282231, + "learning_rate": 3.910049031770853e-06, + "loss": 0.80508912, + "num_input_tokens_seen": 44221115, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.21289062, + "step": 2045, + "time_per_iteration": 2.5107040405273438 + }, + { + "auxiliary_loss_clip": 0.01185877, + "auxiliary_loss_mlp": 0.01057435, + "balance_loss_clip": 1.0645715, + "balance_loss_mlp": 1.03653765, + "epoch": 0.12301217495866526, + "flos": 20887154760960.0, + "grad_norm": 1.8527145479078035, + "language_loss": 0.67293584, + "learning_rate": 3.90993350971051e-06, + "loss": 0.69536901, + "num_input_tokens_seen": 44240575, + "router_z_loss_clip": 1.21191406, + "router_z_loss_mlp": 0.20898438, + "step": 2046, + "time_per_iteration": 2.4236183166503906 + }, + { + "auxiliary_loss_clip": 0.01180678, + "auxiliary_loss_mlp": 0.01053764, + "balance_loss_clip": 1.06594861, + "balance_loss_mlp": 1.03345037, + "epoch": 0.12307229821133324, + "flos": 22378811783040.0, + "grad_norm": 2.666661210150538, + "language_loss": 0.73023218, + "learning_rate": 3.909817915225297e-06, + "loss": 0.75257659, + "num_input_tokens_seen": 44257145, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.203125, + "step": 2047, + "time_per_iteration": 2.4763784408569336 + }, + { + "auxiliary_loss_clip": 0.01170744, + "auxiliary_loss_mlp": 0.01058025, + "balance_loss_clip": 1.05648732, + "balance_loss_mlp": 1.03584051, + "epoch": 0.1231324214640012, + "flos": 23367396873600.0, + "grad_norm": 1.849402150865258, + "language_loss": 0.76228768, + "learning_rate": 3.909702248319597e-06, + "loss": 0.78457534, + "num_input_tokens_seen": 44278035, + "router_z_loss_clip": 1.14355469, + "router_z_loss_mlp": 0.22167969, + "step": 2048, + "time_per_iteration": 2.47602915763855 + }, + { + "auxiliary_loss_clip": 0.01174267, + "auxiliary_loss_mlp": 0.01045239, + "balance_loss_clip": 1.06016457, + "balance_loss_mlp": 1.02617717, + "epoch": 0.12319254471666917, + "flos": 23767154311680.0, + "grad_norm": 1.9364011079325383, + "language_loss": 0.84942704, + "learning_rate": 3.909586508997797e-06, + "loss": 0.87162209, + "num_input_tokens_seen": 44296980, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.19067383, + "step": 2049, + "time_per_iteration": 2.521935224533081 + }, + { + "auxiliary_loss_clip": 0.01179562, + "auxiliary_loss_mlp": 0.01049307, + "balance_loss_clip": 1.06230736, + "balance_loss_mlp": 1.02857637, + "epoch": 0.12325266796933713, + "flos": 23550146294400.0, + "grad_norm": 1.7650485256831405, + "language_loss": 0.75522131, + "learning_rate": 3.909470697264285e-06, + "loss": 0.77750999, + "num_input_tokens_seen": 44318005, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.20727539, + "step": 2050, + "time_per_iteration": 2.480121612548828 + }, + { + "auxiliary_loss_clip": 0.01170039, + "auxiliary_loss_mlp": 0.0104901, + "balance_loss_clip": 1.05638814, + "balance_loss_mlp": 1.02973366, + "epoch": 0.12331279122200511, + "flos": 24423996366720.0, + "grad_norm": 2.2672049322989665, + "language_loss": 0.80636513, + "learning_rate": 3.909354813123452e-06, + "loss": 0.82855558, + "num_input_tokens_seen": 44335260, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.19274902, + "step": 2051, + "time_per_iteration": 2.467597246170044 + }, + { + "auxiliary_loss_clip": 0.01168073, + "auxiliary_loss_mlp": 0.01049123, + "balance_loss_clip": 1.05621612, + "balance_loss_mlp": 1.02960849, + "epoch": 0.12337291447467308, + "flos": 25484294960640.0, + "grad_norm": 1.7391973248996375, + "language_loss": 0.80294824, + "learning_rate": 3.909238856579693e-06, + "loss": 0.82512021, + "num_input_tokens_seen": 44355315, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.19506836, + "step": 2052, + "time_per_iteration": 2.54262375831604 + }, + { + "auxiliary_loss_clip": 0.01175274, + "auxiliary_loss_mlp": 0.01055633, + "balance_loss_clip": 1.05915117, + "balance_loss_mlp": 1.03479481, + "epoch": 0.12343303772734104, + "flos": 23550002640000.0, + "grad_norm": 2.405234113743818, + "language_loss": 0.73934376, + "learning_rate": 3.909122827637406e-06, + "loss": 0.76165283, + "num_input_tokens_seen": 44373020, + "router_z_loss_clip": 1.16210938, + "router_z_loss_mlp": 0.20825195, + "step": 2053, + "time_per_iteration": 2.459033489227295 + }, + { + "auxiliary_loss_clip": 0.01186362, + "auxiliary_loss_mlp": 0.01058319, + "balance_loss_clip": 1.06651866, + "balance_loss_mlp": 1.03781521, + "epoch": 0.12349316098000902, + "flos": 47557074867840.0, + "grad_norm": 1.6724238906327553, + "language_loss": 0.74041188, + "learning_rate": 3.909006726300991e-06, + "loss": 0.76285875, + "num_input_tokens_seen": 44397525, + "router_z_loss_clip": 1.19921875, + "router_z_loss_mlp": 0.2052002, + "step": 2054, + "time_per_iteration": 2.6498584747314453 + }, + { + "auxiliary_loss_clip": 0.01166598, + "auxiliary_loss_mlp": 0.01041861, + "balance_loss_clip": 1.05546999, + "balance_loss_mlp": 1.02321625, + "epoch": 0.12355328423267699, + "flos": 25045969294080.0, + "grad_norm": 1.886567423388807, + "language_loss": 0.85193539, + "learning_rate": 3.908890552574849e-06, + "loss": 0.87401998, + "num_input_tokens_seen": 44415890, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.18640137, + "step": 2055, + "time_per_iteration": 2.4949533939361572 + }, + { + "auxiliary_loss_clip": 0.01177184, + "auxiliary_loss_mlp": 0.01049017, + "balance_loss_clip": 1.0618732, + "balance_loss_mlp": 1.0301702, + "epoch": 0.12361340748534495, + "flos": 27709140395520.0, + "grad_norm": 2.000798807971863, + "language_loss": 0.77657199, + "learning_rate": 3.908774306463384e-06, + "loss": 0.79883403, + "num_input_tokens_seen": 44436625, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.18859863, + "step": 2056, + "time_per_iteration": 2.5365352630615234 + }, + { + "auxiliary_loss_clip": 0.01166381, + "auxiliary_loss_mlp": 0.01051781, + "balance_loss_clip": 1.05268323, + "balance_loss_mlp": 1.031587, + "epoch": 0.12367353073801293, + "flos": 26140598311680.0, + "grad_norm": 2.0607071391938256, + "language_loss": 0.82962048, + "learning_rate": 3.908657987971009e-06, + "loss": 0.85180211, + "num_input_tokens_seen": 44455265, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.2019043, + "step": 2057, + "time_per_iteration": 2.5745270252227783 + }, + { + "auxiliary_loss_clip": 0.01170234, + "auxiliary_loss_mlp": 0.01054625, + "balance_loss_clip": 1.05432081, + "balance_loss_mlp": 1.03351331, + "epoch": 0.1237336539906809, + "flos": 25156035544320.0, + "grad_norm": 1.7556208269664773, + "language_loss": 0.78126872, + "learning_rate": 3.90854159710213e-06, + "loss": 0.80351734, + "num_input_tokens_seen": 44475815, + "router_z_loss_clip": 1.15917969, + "router_z_loss_mlp": 0.21130371, + "step": 2058, + "time_per_iteration": 2.6162467002868652 + }, + { + "auxiliary_loss_clip": 0.01172485, + "auxiliary_loss_mlp": 0.01053085, + "balance_loss_clip": 1.05547822, + "balance_loss_mlp": 1.03206801, + "epoch": 0.12379377724334886, + "flos": 15304589867520.0, + "grad_norm": 2.169747722649983, + "language_loss": 0.83926976, + "learning_rate": 3.9084251338611624e-06, + "loss": 0.86152542, + "num_input_tokens_seen": 44494045, + "router_z_loss_clip": 1.17089844, + "router_z_loss_mlp": 0.21020508, + "step": 2059, + "time_per_iteration": 2.5553205013275146 + }, + { + "auxiliary_loss_clip": 0.01178834, + "auxiliary_loss_mlp": 0.01060087, + "balance_loss_clip": 1.0607481, + "balance_loss_mlp": 1.03809273, + "epoch": 0.12385390049601683, + "flos": 21316717509120.0, + "grad_norm": 3.3964964696461872, + "language_loss": 0.81503403, + "learning_rate": 3.908308598252523e-06, + "loss": 0.83742321, + "num_input_tokens_seen": 44509120, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.22009277, + "step": 2060, + "time_per_iteration": 2.4663751125335693 + }, + { + "auxiliary_loss_clip": 0.01173531, + "auxiliary_loss_mlp": 0.01055445, + "balance_loss_clip": 1.05734146, + "balance_loss_mlp": 1.03458381, + "epoch": 0.1239140237486848, + "flos": 15116309752320.0, + "grad_norm": 1.9145868980429093, + "language_loss": 0.86186928, + "learning_rate": 3.9081919902806306e-06, + "loss": 0.88415897, + "num_input_tokens_seen": 44525780, + "router_z_loss_clip": 1.16210938, + "router_z_loss_mlp": 0.20861816, + "step": 2061, + "time_per_iteration": 2.5325095653533936 + }, + { + "auxiliary_loss_clip": 0.01169046, + "auxiliary_loss_mlp": 0.01049782, + "balance_loss_clip": 1.05743349, + "balance_loss_mlp": 1.02930236, + "epoch": 0.12397414700135277, + "flos": 21976791788160.0, + "grad_norm": 2.233319024239588, + "language_loss": 0.84947324, + "learning_rate": 3.908075309949906e-06, + "loss": 0.87166154, + "num_input_tokens_seen": 44543125, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.20495605, + "step": 2062, + "time_per_iteration": 2.4452362060546875 + }, + { + "auxiliary_loss_clip": 0.01173649, + "auxiliary_loss_mlp": 0.01056686, + "balance_loss_clip": 1.05960953, + "balance_loss_mlp": 1.03520417, + "epoch": 0.12403427025402074, + "flos": 13400892956160.0, + "grad_norm": 2.4928887936774995, + "language_loss": 0.78953147, + "learning_rate": 3.907958557264774e-06, + "loss": 0.81183481, + "num_input_tokens_seen": 44560275, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.21472168, + "step": 2063, + "time_per_iteration": 4.021647691726685 + }, + { + "auxiliary_loss_clip": 0.01180571, + "auxiliary_loss_mlp": 0.01051409, + "balance_loss_clip": 1.06445527, + "balance_loss_mlp": 1.02945065, + "epoch": 0.12409439350668872, + "flos": 15304374385920.0, + "grad_norm": 2.1240028398762956, + "language_loss": 0.79028136, + "learning_rate": 3.907841732229663e-06, + "loss": 0.81260121, + "num_input_tokens_seen": 44577640, + "router_z_loss_clip": 1.16113281, + "router_z_loss_mlp": 0.21948242, + "step": 2064, + "time_per_iteration": 2.502113103866577 + }, + { + "auxiliary_loss_clip": 0.01175462, + "auxiliary_loss_mlp": 0.01052461, + "balance_loss_clip": 1.06078255, + "balance_loss_mlp": 1.03274369, + "epoch": 0.12415451675935668, + "flos": 25009376313600.0, + "grad_norm": 2.4354488954153415, + "language_loss": 0.92661345, + "learning_rate": 3.907724834849002e-06, + "loss": 0.94889265, + "num_input_tokens_seen": 44594860, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.19714355, + "step": 2065, + "time_per_iteration": 3.9566469192504883 + }, + { + "auxiliary_loss_clip": 0.01183943, + "auxiliary_loss_mlp": 0.01047735, + "balance_loss_clip": 1.06593418, + "balance_loss_mlp": 1.02712405, + "epoch": 0.12421464001202465, + "flos": 23659673840640.0, + "grad_norm": 1.8923050441280795, + "language_loss": 0.80551696, + "learning_rate": 3.907607865127225e-06, + "loss": 0.82783377, + "num_input_tokens_seen": 44614780, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.20617676, + "step": 2066, + "time_per_iteration": 2.5306103229522705 + }, + { + "auxiliary_loss_clip": 0.01097014, + "auxiliary_loss_mlp": 0.01007973, + "balance_loss_clip": 1.05107522, + "balance_loss_mlp": 1.00547242, + "epoch": 0.12427476326469263, + "flos": 65732904345600.0, + "grad_norm": 0.8705534159881847, + "language_loss": 0.63298762, + "learning_rate": 3.907490823068766e-06, + "loss": 0.65403748, + "num_input_tokens_seen": 44671240, + "router_z_loss_clip": 0.45947266, + "router_z_loss_mlp": 0.0249939, + "step": 2067, + "time_per_iteration": 3.0358715057373047 + }, + { + "auxiliary_loss_clip": 0.01173615, + "auxiliary_loss_mlp": 0.01052506, + "balance_loss_clip": 1.05759859, + "balance_loss_mlp": 1.03201413, + "epoch": 0.12433488651736059, + "flos": 24535427333760.0, + "grad_norm": 1.9278379358205333, + "language_loss": 0.9310317, + "learning_rate": 3.907373708678063e-06, + "loss": 0.95329291, + "num_input_tokens_seen": 44691050, + "router_z_loss_clip": 1.16015625, + "router_z_loss_mlp": 0.2052002, + "step": 2068, + "time_per_iteration": 3.8738460540771484 + }, + { + "auxiliary_loss_clip": 0.01172579, + "auxiliary_loss_mlp": 0.0104795, + "balance_loss_clip": 1.05906129, + "balance_loss_mlp": 1.02965188, + "epoch": 0.12439500977002856, + "flos": 21031659175680.0, + "grad_norm": 1.964196213413863, + "language_loss": 0.8106336, + "learning_rate": 3.9072565219595596e-06, + "loss": 0.83283889, + "num_input_tokens_seen": 44709850, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.1829834, + "step": 2069, + "time_per_iteration": 3.8540165424346924 + }, + { + "auxiliary_loss_clip": 0.01176555, + "auxiliary_loss_mlp": 0.01062512, + "balance_loss_clip": 1.0594939, + "balance_loss_mlp": 1.04168618, + "epoch": 0.12445513302269653, + "flos": 26830621555200.0, + "grad_norm": 2.4993686356935187, + "language_loss": 0.77609432, + "learning_rate": 3.907139262917696e-06, + "loss": 0.79848504, + "num_input_tokens_seen": 44731475, + "router_z_loss_clip": 1.17089844, + "router_z_loss_mlp": 0.20800781, + "step": 2070, + "time_per_iteration": 2.5214595794677734 + }, + { + "auxiliary_loss_clip": 0.01183775, + "auxiliary_loss_mlp": 0.0104678, + "balance_loss_clip": 1.06706822, + "balance_loss_mlp": 1.02601421, + "epoch": 0.1245152562753645, + "flos": 18368919037440.0, + "grad_norm": 4.643938654400888, + "language_loss": 0.80928618, + "learning_rate": 3.907021931556922e-06, + "loss": 0.83159173, + "num_input_tokens_seen": 44749685, + "router_z_loss_clip": 1.16699219, + "router_z_loss_mlp": 0.2076416, + "step": 2071, + "time_per_iteration": 2.434297561645508 + }, + { + "auxiliary_loss_clip": 0.01171813, + "auxiliary_loss_mlp": 0.01050617, + "balance_loss_clip": 1.06102228, + "balance_loss_mlp": 1.02969587, + "epoch": 0.12457537952803246, + "flos": 33107986200960.0, + "grad_norm": 2.98739980154607, + "language_loss": 0.78355211, + "learning_rate": 3.906904527881684e-06, + "loss": 0.80577648, + "num_input_tokens_seen": 44772165, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.20910645, + "step": 2072, + "time_per_iteration": 2.560587167739868 + }, + { + "auxiliary_loss_clip": 0.01174982, + "auxiliary_loss_mlp": 0.0105276, + "balance_loss_clip": 1.06136775, + "balance_loss_mlp": 1.03325689, + "epoch": 0.12463550278070043, + "flos": 22270217990400.0, + "grad_norm": 1.9908173311952726, + "language_loss": 0.75216985, + "learning_rate": 3.9067870518964355e-06, + "loss": 0.77444732, + "num_input_tokens_seen": 44790580, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.19506836, + "step": 2073, + "time_per_iteration": 2.4244887828826904 + }, + { + "auxiliary_loss_clip": 0.0116974, + "auxiliary_loss_mlp": 0.01051223, + "balance_loss_clip": 1.05631948, + "balance_loss_mlp": 1.03082585, + "epoch": 0.12469562603336841, + "flos": 14679025580160.0, + "grad_norm": 2.23029010022001, + "language_loss": 0.90939951, + "learning_rate": 3.906669503605631e-06, + "loss": 0.93160915, + "num_input_tokens_seen": 44806730, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.20397949, + "step": 2074, + "time_per_iteration": 2.5024068355560303 + }, + { + "auxiliary_loss_clip": 0.01172833, + "auxiliary_loss_mlp": 0.01051363, + "balance_loss_clip": 1.05636644, + "balance_loss_mlp": 1.02872515, + "epoch": 0.12475574928603637, + "flos": 24644775312000.0, + "grad_norm": 2.3634905802186625, + "language_loss": 0.8423779, + "learning_rate": 3.906551883013728e-06, + "loss": 0.86461985, + "num_input_tokens_seen": 44825550, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.2265625, + "step": 2075, + "time_per_iteration": 2.5633530616760254 + }, + { + "auxiliary_loss_clip": 0.0117945, + "auxiliary_loss_mlp": 0.01055529, + "balance_loss_clip": 1.06393147, + "balance_loss_mlp": 1.03389239, + "epoch": 0.12481587253870434, + "flos": 21762980081280.0, + "grad_norm": 2.9356835392924783, + "language_loss": 0.73795819, + "learning_rate": 3.9064341901251865e-06, + "loss": 0.76030803, + "num_input_tokens_seen": 44844155, + "router_z_loss_clip": 1.15527344, + "router_z_loss_mlp": 0.21630859, + "step": 2076, + "time_per_iteration": 2.5580289363861084 + }, + { + "auxiliary_loss_clip": 0.01167723, + "auxiliary_loss_mlp": 0.01043526, + "balance_loss_clip": 1.05677271, + "balance_loss_mlp": 1.02377272, + "epoch": 0.12487599579137232, + "flos": 21432529935360.0, + "grad_norm": 1.8794210441510506, + "language_loss": 0.76165766, + "learning_rate": 3.906316424944469e-06, + "loss": 0.78377014, + "num_input_tokens_seen": 44863780, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.19750977, + "step": 2077, + "time_per_iteration": 2.476935386657715 + }, + { + "auxiliary_loss_clip": 0.01171671, + "auxiliary_loss_mlp": 0.01050438, + "balance_loss_clip": 1.05700529, + "balance_loss_mlp": 1.02882564, + "epoch": 0.12493611904404028, + "flos": 16107624276480.0, + "grad_norm": 2.8727077802898653, + "language_loss": 0.8294037, + "learning_rate": 3.906198587476043e-06, + "loss": 0.85162485, + "num_input_tokens_seen": 44881480, + "router_z_loss_clip": 1.14648438, + "router_z_loss_mlp": 0.21606445, + "step": 2078, + "time_per_iteration": 2.4650325775146484 + }, + { + "auxiliary_loss_clip": 0.01184488, + "auxiliary_loss_mlp": 0.01053113, + "balance_loss_clip": 1.06631398, + "balance_loss_mlp": 1.03144073, + "epoch": 0.12499624229670825, + "flos": 21580266574080.0, + "grad_norm": 1.6696488644959473, + "language_loss": 0.75073814, + "learning_rate": 3.906080677724374e-06, + "loss": 0.7731142, + "num_input_tokens_seen": 44900390, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.21679688, + "step": 2079, + "time_per_iteration": 2.454777717590332 + }, + { + "auxiliary_loss_clip": 0.01194042, + "auxiliary_loss_mlp": 0.01055809, + "balance_loss_clip": 1.07533097, + "balance_loss_mlp": 1.0350312, + "epoch": 0.1250563655493762, + "flos": 25699040421120.0, + "grad_norm": 2.5217549763013705, + "language_loss": 0.82988489, + "learning_rate": 3.905962695693935e-06, + "loss": 0.85238338, + "num_input_tokens_seen": 44920375, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.20788574, + "step": 2080, + "time_per_iteration": 2.5594425201416016 + }, + { + "auxiliary_loss_clip": 0.01173699, + "auxiliary_loss_mlp": 0.01055965, + "balance_loss_clip": 1.06095815, + "balance_loss_mlp": 1.03593743, + "epoch": 0.12511648880204418, + "flos": 16909509450240.0, + "grad_norm": 2.3088733916501853, + "language_loss": 0.84919357, + "learning_rate": 3.9058446413892e-06, + "loss": 0.87149012, + "num_input_tokens_seen": 44938415, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.20019531, + "step": 2081, + "time_per_iteration": 2.4848623275756836 + }, + { + "auxiliary_loss_clip": 0.01168726, + "auxiliary_loss_mlp": 0.01046702, + "balance_loss_clip": 1.05582643, + "balance_loss_mlp": 1.02685356, + "epoch": 0.12517661205471217, + "flos": 17567500740480.0, + "grad_norm": 2.031560232498254, + "language_loss": 0.76705813, + "learning_rate": 3.905726514814646e-06, + "loss": 0.78921247, + "num_input_tokens_seen": 44957135, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.1986084, + "step": 2082, + "time_per_iteration": 2.475303888320923 + }, + { + "auxiliary_loss_clip": 0.01188745, + "auxiliary_loss_mlp": 0.01052606, + "balance_loss_clip": 1.06643295, + "balance_loss_mlp": 1.03073168, + "epoch": 0.12523673530738014, + "flos": 16033791870720.0, + "grad_norm": 4.802844678486503, + "language_loss": 0.79151273, + "learning_rate": 3.9056083159747495e-06, + "loss": 0.81392622, + "num_input_tokens_seen": 44974480, + "router_z_loss_clip": 1.22558594, + "router_z_loss_mlp": 0.21875, + "step": 2083, + "time_per_iteration": 2.443634510040283 + }, + { + "auxiliary_loss_clip": 0.01177027, + "auxiliary_loss_mlp": 0.01052091, + "balance_loss_clip": 1.05964744, + "balance_loss_mlp": 1.02908397, + "epoch": 0.1252968585600481, + "flos": 18807747494400.0, + "grad_norm": 2.9818334642055606, + "language_loss": 0.90266573, + "learning_rate": 3.9054900448739966e-06, + "loss": 0.92495692, + "num_input_tokens_seen": 44990310, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.23010254, + "step": 2084, + "time_per_iteration": 2.4462835788726807 + }, + { + "auxiliary_loss_clip": 0.01172764, + "auxiliary_loss_mlp": 0.01051101, + "balance_loss_clip": 1.05854416, + "balance_loss_mlp": 1.03062081, + "epoch": 0.12535698181271607, + "flos": 27271568914560.0, + "grad_norm": 2.1541977193129562, + "language_loss": 0.8022154, + "learning_rate": 3.905371701516869e-06, + "loss": 0.82445407, + "num_input_tokens_seen": 45010720, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.20483398, + "step": 2085, + "time_per_iteration": 2.4997968673706055 + }, + { + "auxiliary_loss_clip": 0.01169134, + "auxiliary_loss_mlp": 0.01050804, + "balance_loss_clip": 1.05672598, + "balance_loss_mlp": 1.02967978, + "epoch": 0.12541710506538403, + "flos": 22054107813120.0, + "grad_norm": 2.390177190335823, + "language_loss": 0.88179719, + "learning_rate": 3.905253285907856e-06, + "loss": 0.90399653, + "num_input_tokens_seen": 45030360, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.21130371, + "step": 2086, + "time_per_iteration": 2.533555030822754 + }, + { + "auxiliary_loss_clip": 0.0116502, + "auxiliary_loss_mlp": 0.01050427, + "balance_loss_clip": 1.05716884, + "balance_loss_mlp": 1.03093624, + "epoch": 0.125477228318052, + "flos": 12603173760000.0, + "grad_norm": 2.404716962400695, + "language_loss": 0.86790133, + "learning_rate": 3.905134798051447e-06, + "loss": 0.89005578, + "num_input_tokens_seen": 45045085, + "router_z_loss_clip": 1.07861328, + "router_z_loss_mlp": 0.19494629, + "step": 2087, + "time_per_iteration": 2.432016372680664 + }, + { + "auxiliary_loss_clip": 0.0117102, + "auxiliary_loss_mlp": 0.01055477, + "balance_loss_clip": 1.05911696, + "balance_loss_mlp": 1.03380442, + "epoch": 0.12553735157071996, + "flos": 23878549365120.0, + "grad_norm": 2.768439788888713, + "language_loss": 0.73626351, + "learning_rate": 3.905016237952136e-06, + "loss": 0.75852847, + "num_input_tokens_seen": 45065145, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.2166748, + "step": 2088, + "time_per_iteration": 2.506577968597412 + }, + { + "auxiliary_loss_clip": 0.01076048, + "auxiliary_loss_mlp": 0.01007847, + "balance_loss_clip": 1.03458953, + "balance_loss_mlp": 1.00554585, + "epoch": 0.12559747482338796, + "flos": 69920841830400.0, + "grad_norm": 0.7627100222274776, + "language_loss": 0.6173377, + "learning_rate": 3.904897605614418e-06, + "loss": 0.63817668, + "num_input_tokens_seen": 45126230, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.02294922, + "step": 2089, + "time_per_iteration": 3.109724998474121 + }, + { + "auxiliary_loss_clip": 0.0116984, + "auxiliary_loss_mlp": 0.0105734, + "balance_loss_clip": 1.05936289, + "balance_loss_mlp": 1.03683567, + "epoch": 0.12565759807605592, + "flos": 24279563779200.0, + "grad_norm": 2.3890455289093806, + "language_loss": 0.7834056, + "learning_rate": 3.904778901042793e-06, + "loss": 0.80567741, + "num_input_tokens_seen": 45145545, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.20507812, + "step": 2090, + "time_per_iteration": 2.546168088912964 + }, + { + "auxiliary_loss_clip": 0.01080102, + "auxiliary_loss_mlp": 0.01004331, + "balance_loss_clip": 1.03826857, + "balance_loss_mlp": 1.00202394, + "epoch": 0.12571772132872389, + "flos": 56451180286080.0, + "grad_norm": 0.7697537357215444, + "language_loss": 0.59358722, + "learning_rate": 3.90466012424176e-06, + "loss": 0.6144315, + "num_input_tokens_seen": 45206845, + "router_z_loss_clip": 0.41894531, + "router_z_loss_mlp": 0.02304077, + "step": 2091, + "time_per_iteration": 2.977024555206299 + }, + { + "auxiliary_loss_clip": 0.01176974, + "auxiliary_loss_mlp": 0.01049452, + "balance_loss_clip": 1.06284845, + "balance_loss_mlp": 1.03021133, + "epoch": 0.12577784458139185, + "flos": 41245846675200.0, + "grad_norm": 1.766416266552201, + "language_loss": 0.62967277, + "learning_rate": 3.904541275215825e-06, + "loss": 0.65193701, + "num_input_tokens_seen": 45228495, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.19226074, + "step": 2092, + "time_per_iteration": 2.630402088165283 + }, + { + "auxiliary_loss_clip": 0.01173457, + "auxiliary_loss_mlp": 0.01058709, + "balance_loss_clip": 1.05628681, + "balance_loss_mlp": 1.03765702, + "epoch": 0.12583796783405982, + "flos": 19755501799680.0, + "grad_norm": 2.056330283750415, + "language_loss": 0.80411768, + "learning_rate": 3.904422353969493e-06, + "loss": 0.82643938, + "num_input_tokens_seen": 45245720, + "router_z_loss_clip": 1.17285156, + "router_z_loss_mlp": 0.21057129, + "step": 2093, + "time_per_iteration": 2.438443899154663 + }, + { + "auxiliary_loss_clip": 0.01167728, + "auxiliary_loss_mlp": 0.01059341, + "balance_loss_clip": 1.0560708, + "balance_loss_mlp": 1.03896821, + "epoch": 0.12589809108672778, + "flos": 22602104680320.0, + "grad_norm": 1.8889624766955389, + "language_loss": 0.75761855, + "learning_rate": 3.904303360507276e-06, + "loss": 0.77988923, + "num_input_tokens_seen": 45265650, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.20397949, + "step": 2094, + "time_per_iteration": 2.4869158267974854 + }, + { + "auxiliary_loss_clip": 0.0116659, + "auxiliary_loss_mlp": 0.01050846, + "balance_loss_clip": 1.05654335, + "balance_loss_mlp": 1.03111649, + "epoch": 0.12595821433939577, + "flos": 45222845541120.0, + "grad_norm": 1.770901226435384, + "language_loss": 0.76971829, + "learning_rate": 3.9041842948336835e-06, + "loss": 0.79189265, + "num_input_tokens_seen": 45287790, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.1973877, + "step": 2095, + "time_per_iteration": 2.7149016857147217 + }, + { + "auxiliary_loss_clip": 0.01187095, + "auxiliary_loss_mlp": 0.0106177, + "balance_loss_clip": 1.06955922, + "balance_loss_mlp": 1.04148066, + "epoch": 0.12601833759206374, + "flos": 14319811618560.0, + "grad_norm": 4.564931129790199, + "language_loss": 0.82807475, + "learning_rate": 3.904065156953232e-06, + "loss": 0.85056341, + "num_input_tokens_seen": 45305720, + "router_z_loss_clip": 1.17578125, + "router_z_loss_mlp": 0.20300293, + "step": 2096, + "time_per_iteration": 2.455821990966797 + }, + { + "auxiliary_loss_clip": 0.01171949, + "auxiliary_loss_mlp": 0.01050553, + "balance_loss_clip": 1.05999947, + "balance_loss_mlp": 1.03081191, + "epoch": 0.1260784608447317, + "flos": 21288241002240.0, + "grad_norm": 2.485528076436857, + "language_loss": 0.75400203, + "learning_rate": 3.903945946870439e-06, + "loss": 0.77622706, + "num_input_tokens_seen": 45325290, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.1973877, + "step": 2097, + "time_per_iteration": 2.442356586456299 + }, + { + "auxiliary_loss_clip": 0.01169093, + "auxiliary_loss_mlp": 0.01057111, + "balance_loss_clip": 1.05738807, + "balance_loss_mlp": 1.03844261, + "epoch": 0.12613858409739967, + "flos": 26251311006720.0, + "grad_norm": 2.0806144382320784, + "language_loss": 0.87172413, + "learning_rate": 3.9038266645898246e-06, + "loss": 0.89398617, + "num_input_tokens_seen": 45344465, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.18664551, + "step": 2098, + "time_per_iteration": 2.5530014038085938 + }, + { + "auxiliary_loss_clip": 0.01174029, + "auxiliary_loss_mlp": 0.01059247, + "balance_loss_clip": 1.05592322, + "balance_loss_mlp": 1.03583407, + "epoch": 0.12619870735006763, + "flos": 21579979265280.0, + "grad_norm": 1.922080881925289, + "language_loss": 0.69500679, + "learning_rate": 3.903707310115912e-06, + "loss": 0.71733958, + "num_input_tokens_seen": 45362465, + "router_z_loss_clip": 1.18164062, + "router_z_loss_mlp": 0.23413086, + "step": 2099, + "time_per_iteration": 2.424910306930542 + }, + { + "auxiliary_loss_clip": 0.01171616, + "auxiliary_loss_mlp": 0.01069427, + "balance_loss_clip": 1.05461943, + "balance_loss_mlp": 1.04584765, + "epoch": 0.1262588306027356, + "flos": 23367037737600.0, + "grad_norm": 2.417899901839665, + "language_loss": 0.82200074, + "learning_rate": 3.903587883453228e-06, + "loss": 0.84441113, + "num_input_tokens_seen": 45382700, + "router_z_loss_clip": 1.16992188, + "router_z_loss_mlp": 0.23571777, + "step": 2100, + "time_per_iteration": 2.497406005859375 + }, + { + "auxiliary_loss_clip": 0.01173088, + "auxiliary_loss_mlp": 0.01051797, + "balance_loss_clip": 1.05856085, + "balance_loss_mlp": 1.03060138, + "epoch": 0.12631895385540357, + "flos": 23949185460480.0, + "grad_norm": 1.8931284945543905, + "language_loss": 0.80326807, + "learning_rate": 3.903468384606302e-06, + "loss": 0.82551688, + "num_input_tokens_seen": 45401005, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.21179199, + "step": 2101, + "time_per_iteration": 2.4520652294158936 + }, + { + "auxiliary_loss_clip": 0.01090939, + "auxiliary_loss_mlp": 0.01028931, + "balance_loss_clip": 1.04795015, + "balance_loss_mlp": 1.02621043, + "epoch": 0.12637907710807156, + "flos": 70282138780800.0, + "grad_norm": 0.711426499444342, + "language_loss": 0.57089579, + "learning_rate": 3.903348813579662e-06, + "loss": 0.59209454, + "num_input_tokens_seen": 45466555, + "router_z_loss_clip": 0.43017578, + "router_z_loss_mlp": 0.02722168, + "step": 2102, + "time_per_iteration": 3.1651198863983154 + }, + { + "auxiliary_loss_clip": 0.01178143, + "auxiliary_loss_mlp": 0.01054399, + "balance_loss_clip": 1.06363344, + "balance_loss_mlp": 1.03412175, + "epoch": 0.12643920036073952, + "flos": 18915084311040.0, + "grad_norm": 2.0634292141810002, + "language_loss": 0.93116498, + "learning_rate": 3.903229170377845e-06, + "loss": 0.95349038, + "num_input_tokens_seen": 45485165, + "router_z_loss_clip": 1.14550781, + "router_z_loss_mlp": 0.20263672, + "step": 2103, + "time_per_iteration": 2.465594530105591 + }, + { + "auxiliary_loss_clip": 0.01165936, + "auxiliary_loss_mlp": 0.01039651, + "balance_loss_clip": 1.05847478, + "balance_loss_mlp": 1.02064943, + "epoch": 0.1264993236134075, + "flos": 27782470010880.0, + "grad_norm": 2.7744544410199503, + "language_loss": 0.77722454, + "learning_rate": 3.903109455005387e-06, + "loss": 0.79928041, + "num_input_tokens_seen": 45504630, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.18994141, + "step": 2104, + "time_per_iteration": 2.552351951599121 + }, + { + "auxiliary_loss_clip": 0.01173761, + "auxiliary_loss_mlp": 0.01053018, + "balance_loss_clip": 1.06090331, + "balance_loss_mlp": 1.03450465, + "epoch": 0.12655944686607545, + "flos": 24754697907840.0, + "grad_norm": 1.7949727566540776, + "language_loss": 0.81359375, + "learning_rate": 3.902989667466828e-06, + "loss": 0.8358615, + "num_input_tokens_seen": 45524885, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.18530273, + "step": 2105, + "time_per_iteration": 2.5503249168395996 + }, + { + "auxiliary_loss_clip": 0.01183284, + "auxiliary_loss_mlp": 0.01060978, + "balance_loss_clip": 1.06432831, + "balance_loss_mlp": 1.03901947, + "epoch": 0.12661957011874342, + "flos": 24133048202880.0, + "grad_norm": 2.258062421962707, + "language_loss": 0.82645023, + "learning_rate": 3.90286980776671e-06, + "loss": 0.84889281, + "num_input_tokens_seen": 45545000, + "router_z_loss_clip": 1.18945312, + "router_z_loss_mlp": 0.21972656, + "step": 2106, + "time_per_iteration": 4.041336536407471 + }, + { + "auxiliary_loss_clip": 0.01169974, + "auxiliary_loss_mlp": 0.01050141, + "balance_loss_clip": 1.06044364, + "balance_loss_mlp": 1.02978015, + "epoch": 0.12667969337141138, + "flos": 24569614103040.0, + "grad_norm": 1.8309252505281253, + "language_loss": 0.73381859, + "learning_rate": 3.902749875909578e-06, + "loss": 0.75601977, + "num_input_tokens_seen": 45564210, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.20349121, + "step": 2107, + "time_per_iteration": 2.589121103286743 + }, + { + "auxiliary_loss_clip": 0.01174467, + "auxiliary_loss_mlp": 0.01048665, + "balance_loss_clip": 1.06206882, + "balance_loss_mlp": 1.02891207, + "epoch": 0.12673981662407935, + "flos": 22961677777920.0, + "grad_norm": 1.9545854161923795, + "language_loss": 0.78915632, + "learning_rate": 3.90262987189998e-06, + "loss": 0.8113876, + "num_input_tokens_seen": 45583030, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.19763184, + "step": 2108, + "time_per_iteration": 3.8557345867156982 + }, + { + "auxiliary_loss_clip": 0.01174054, + "auxiliary_loss_mlp": 0.01043965, + "balance_loss_clip": 1.0606153, + "balance_loss_mlp": 1.02288854, + "epoch": 0.12679993987674734, + "flos": 17274864637440.0, + "grad_norm": 2.0511343589064923, + "language_loss": 0.75586766, + "learning_rate": 3.902509795742467e-06, + "loss": 0.77804786, + "num_input_tokens_seen": 45602265, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.21069336, + "step": 2109, + "time_per_iteration": 2.4684083461761475 + }, + { + "auxiliary_loss_clip": 0.01167437, + "auxiliary_loss_mlp": 0.01048696, + "balance_loss_clip": 1.05797362, + "balance_loss_mlp": 1.02934849, + "epoch": 0.1268600631294153, + "flos": 17275080119040.0, + "grad_norm": 1.9875072871649242, + "language_loss": 0.82852298, + "learning_rate": 3.902389647441592e-06, + "loss": 0.85068429, + "num_input_tokens_seen": 45620595, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.19360352, + "step": 2110, + "time_per_iteration": 2.4678499698638916 + }, + { + "auxiliary_loss_clip": 0.01169275, + "auxiliary_loss_mlp": 0.01050156, + "balance_loss_clip": 1.05792999, + "balance_loss_mlp": 1.02878153, + "epoch": 0.12692018638208327, + "flos": 24061047390720.0, + "grad_norm": 1.8987664098025012, + "language_loss": 0.78643972, + "learning_rate": 3.90226942700191e-06, + "loss": 0.80863404, + "num_input_tokens_seen": 45641140, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.21374512, + "step": 2111, + "time_per_iteration": 3.904179334640503 + }, + { + "auxiliary_loss_clip": 0.01187592, + "auxiliary_loss_mlp": 0.01067297, + "balance_loss_clip": 1.06857932, + "balance_loss_mlp": 1.04440856, + "epoch": 0.12698030963475124, + "flos": 31831900652160.0, + "grad_norm": 2.1821573411455195, + "language_loss": 0.76773274, + "learning_rate": 3.902149134427982e-06, + "loss": 0.79028165, + "num_input_tokens_seen": 45662315, + "router_z_loss_clip": 1.19042969, + "router_z_loss_mlp": 0.22888184, + "step": 2112, + "time_per_iteration": 2.538346767425537 + }, + { + "auxiliary_loss_clip": 0.01177436, + "auxiliary_loss_mlp": 0.01057038, + "balance_loss_clip": 1.06400216, + "balance_loss_mlp": 1.03656936, + "epoch": 0.1270404328874192, + "flos": 25187744275200.0, + "grad_norm": 1.9574535781791407, + "language_loss": 0.85512602, + "learning_rate": 3.902028769724367e-06, + "loss": 0.87747079, + "num_input_tokens_seen": 45680335, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.20483398, + "step": 2113, + "time_per_iteration": 3.9652090072631836 + }, + { + "auxiliary_loss_clip": 0.01170707, + "auxiliary_loss_mlp": 0.01049222, + "balance_loss_clip": 1.05929887, + "balance_loss_mlp": 1.0275017, + "epoch": 0.12710055614008717, + "flos": 15997342544640.0, + "grad_norm": 2.3531772794902897, + "language_loss": 0.73474979, + "learning_rate": 3.9019083328956315e-06, + "loss": 0.75694919, + "num_input_tokens_seen": 45696240, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.21716309, + "step": 2114, + "time_per_iteration": 2.4433202743530273 + }, + { + "auxiliary_loss_clip": 0.01168795, + "auxiliary_loss_mlp": 0.01062086, + "balance_loss_clip": 1.05720556, + "balance_loss_mlp": 1.03972197, + "epoch": 0.12716067939275516, + "flos": 15085642515840.0, + "grad_norm": 1.890619891049986, + "language_loss": 0.8358351, + "learning_rate": 3.901787823946341e-06, + "loss": 0.85814393, + "num_input_tokens_seen": 45713695, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.22375488, + "step": 2115, + "time_per_iteration": 2.4824841022491455 + }, + { + "auxiliary_loss_clip": 0.01166643, + "auxiliary_loss_mlp": 0.01054242, + "balance_loss_clip": 1.05479622, + "balance_loss_mlp": 1.03326094, + "epoch": 0.12722080264542313, + "flos": 28366736636160.0, + "grad_norm": 1.612074225238534, + "language_loss": 0.86424935, + "learning_rate": 3.901667242881065e-06, + "loss": 0.88645816, + "num_input_tokens_seen": 45736655, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.20983887, + "step": 2116, + "time_per_iteration": 2.5495896339416504 + }, + { + "auxiliary_loss_clip": 0.01163403, + "auxiliary_loss_mlp": 0.01050411, + "balance_loss_clip": 1.05433643, + "balance_loss_mlp": 1.03044319, + "epoch": 0.1272809258980911, + "flos": 32379897519360.0, + "grad_norm": 1.7143621453014297, + "language_loss": 0.70807517, + "learning_rate": 3.9015465897043775e-06, + "loss": 0.73021328, + "num_input_tokens_seen": 45758195, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.19970703, + "step": 2117, + "time_per_iteration": 2.584339141845703 + }, + { + "auxiliary_loss_clip": 0.0117685, + "auxiliary_loss_mlp": 0.01057816, + "balance_loss_clip": 1.06269014, + "balance_loss_mlp": 1.03640652, + "epoch": 0.12734104915075906, + "flos": 16034402401920.0, + "grad_norm": 2.2894956335703904, + "language_loss": 0.86833555, + "learning_rate": 3.901425864420852e-06, + "loss": 0.89068222, + "num_input_tokens_seen": 45774280, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.21411133, + "step": 2118, + "time_per_iteration": 2.419067621231079 + }, + { + "auxiliary_loss_clip": 0.01170325, + "auxiliary_loss_mlp": 0.01056866, + "balance_loss_clip": 1.05709076, + "balance_loss_mlp": 1.03673136, + "epoch": 0.12740117240342702, + "flos": 18260325244800.0, + "grad_norm": 2.2417686814670534, + "language_loss": 0.87739074, + "learning_rate": 3.901305067035068e-06, + "loss": 0.89966267, + "num_input_tokens_seen": 45792760, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.20141602, + "step": 2119, + "time_per_iteration": 2.4161691665649414 + }, + { + "auxiliary_loss_clip": 0.01173908, + "auxiliary_loss_mlp": 0.01054135, + "balance_loss_clip": 1.06012297, + "balance_loss_mlp": 1.03321385, + "epoch": 0.127461295656095, + "flos": 12121790664960.0, + "grad_norm": 2.3125142253816664, + "language_loss": 0.8771928, + "learning_rate": 3.901184197551605e-06, + "loss": 0.89947319, + "num_input_tokens_seen": 45804300, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.20910645, + "step": 2120, + "time_per_iteration": 2.4069807529449463 + }, + { + "auxiliary_loss_clip": 0.01172097, + "auxiliary_loss_mlp": 0.01043229, + "balance_loss_clip": 1.06002498, + "balance_loss_mlp": 1.02351213, + "epoch": 0.12752141890876295, + "flos": 23149095966720.0, + "grad_norm": 2.779595321289988, + "language_loss": 0.75332564, + "learning_rate": 3.901063255975046e-06, + "loss": 0.77547884, + "num_input_tokens_seen": 45823780, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.19726562, + "step": 2121, + "time_per_iteration": 2.497260570526123 + }, + { + "auxiliary_loss_clip": 0.01166515, + "auxiliary_loss_mlp": 0.01046412, + "balance_loss_clip": 1.05590069, + "balance_loss_mlp": 1.02633762, + "epoch": 0.12758154216143094, + "flos": 21615997628160.0, + "grad_norm": 2.150727684266073, + "language_loss": 0.83152783, + "learning_rate": 3.900942242309978e-06, + "loss": 0.85365713, + "num_input_tokens_seen": 45840495, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.20080566, + "step": 2122, + "time_per_iteration": 2.4791250228881836 + }, + { + "auxiliary_loss_clip": 0.01169302, + "auxiliary_loss_mlp": 0.01062389, + "balance_loss_clip": 1.0566082, + "balance_loss_mlp": 1.04034722, + "epoch": 0.1276416654140989, + "flos": 15924874855680.0, + "grad_norm": 1.799837707455642, + "language_loss": 0.78846121, + "learning_rate": 3.90082115656099e-06, + "loss": 0.81077814, + "num_input_tokens_seen": 45857735, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.22021484, + "step": 2123, + "time_per_iteration": 2.4769933223724365 + }, + { + "auxiliary_loss_clip": 0.01177994, + "auxiliary_loss_mlp": 0.01056238, + "balance_loss_clip": 1.06292379, + "balance_loss_mlp": 1.03512609, + "epoch": 0.12770178866676687, + "flos": 22382690451840.0, + "grad_norm": 1.690685423348094, + "language_loss": 0.79359591, + "learning_rate": 3.900699998732673e-06, + "loss": 0.81593823, + "num_input_tokens_seen": 45876485, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.21130371, + "step": 2124, + "time_per_iteration": 2.439507246017456 + }, + { + "auxiliary_loss_clip": 0.01171108, + "auxiliary_loss_mlp": 0.01052582, + "balance_loss_clip": 1.05627465, + "balance_loss_mlp": 1.03232789, + "epoch": 0.12776191191943484, + "flos": 21652482867840.0, + "grad_norm": 2.0004881104269234, + "language_loss": 0.75878692, + "learning_rate": 3.900578768829623e-06, + "loss": 0.7810238, + "num_input_tokens_seen": 45894645, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.20251465, + "step": 2125, + "time_per_iteration": 2.4996325969696045 + }, + { + "auxiliary_loss_clip": 0.01172921, + "auxiliary_loss_mlp": 0.01053123, + "balance_loss_clip": 1.05905259, + "balance_loss_mlp": 1.02941179, + "epoch": 0.1278220351721028, + "flos": 25735561574400.0, + "grad_norm": 3.072724374867361, + "language_loss": 0.77895844, + "learning_rate": 3.900457466856434e-06, + "loss": 0.80121887, + "num_input_tokens_seen": 45913755, + "router_z_loss_clip": 1.13769531, + "router_z_loss_mlp": 0.23718262, + "step": 2126, + "time_per_iteration": 2.549510955810547 + }, + { + "auxiliary_loss_clip": 0.01181922, + "auxiliary_loss_mlp": 0.01045694, + "balance_loss_clip": 1.06644237, + "balance_loss_mlp": 1.02582216, + "epoch": 0.12788215842477077, + "flos": 41243224982400.0, + "grad_norm": 1.5006463900065856, + "language_loss": 0.69195122, + "learning_rate": 3.9003360928177085e-06, + "loss": 0.71422738, + "num_input_tokens_seen": 45936095, + "router_z_loss_clip": 1.15527344, + "router_z_loss_mlp": 0.1986084, + "step": 2127, + "time_per_iteration": 2.662203550338745 + }, + { + "auxiliary_loss_clip": 0.01085022, + "auxiliary_loss_mlp": 0.01013735, + "balance_loss_clip": 1.04333329, + "balance_loss_mlp": 1.01118648, + "epoch": 0.12794228167743876, + "flos": 70877430881280.0, + "grad_norm": 0.8578634740715247, + "language_loss": 0.62743837, + "learning_rate": 3.900214646718047e-06, + "loss": 0.64842594, + "num_input_tokens_seen": 46004655, + "router_z_loss_clip": 0.41699219, + "router_z_loss_mlp": 0.02548218, + "step": 2128, + "time_per_iteration": 3.139765977859497 + }, + { + "auxiliary_loss_clip": 0.01172637, + "auxiliary_loss_mlp": 0.01049601, + "balance_loss_clip": 1.05650055, + "balance_loss_mlp": 1.02778614, + "epoch": 0.12800240493010673, + "flos": 16289727252480.0, + "grad_norm": 2.197794440613641, + "language_loss": 0.76972449, + "learning_rate": 3.900093128562056e-06, + "loss": 0.79194683, + "num_input_tokens_seen": 46023610, + "router_z_loss_clip": 1.16210938, + "router_z_loss_mlp": 0.21813965, + "step": 2129, + "time_per_iteration": 2.4553041458129883 + }, + { + "auxiliary_loss_clip": 0.01188932, + "auxiliary_loss_mlp": 0.01051871, + "balance_loss_clip": 1.06609511, + "balance_loss_mlp": 1.02862477, + "epoch": 0.1280625281827747, + "flos": 20631542601600.0, + "grad_norm": 2.06825752998424, + "language_loss": 0.79206121, + "learning_rate": 3.899971538354343e-06, + "loss": 0.81446916, + "num_input_tokens_seen": 46041725, + "router_z_loss_clip": 1.22753906, + "router_z_loss_mlp": 0.23254395, + "step": 2130, + "time_per_iteration": 2.428130865097046 + }, + { + "auxiliary_loss_clip": 0.01177497, + "auxiliary_loss_mlp": 0.01043665, + "balance_loss_clip": 1.06153882, + "balance_loss_mlp": 1.02341092, + "epoch": 0.12812265143544266, + "flos": 22638230784000.0, + "grad_norm": 1.8365618747425798, + "language_loss": 0.70715106, + "learning_rate": 3.899849876099518e-06, + "loss": 0.72936267, + "num_input_tokens_seen": 46061095, + "router_z_loss_clip": 1.15917969, + "router_z_loss_mlp": 0.20239258, + "step": 2131, + "time_per_iteration": 2.502539873123169 + }, + { + "auxiliary_loss_clip": 0.01170572, + "auxiliary_loss_mlp": 0.01054519, + "balance_loss_clip": 1.05673897, + "balance_loss_mlp": 1.03314447, + "epoch": 0.12818277468811062, + "flos": 34714701463680.0, + "grad_norm": 2.2065184218134863, + "language_loss": 0.72422421, + "learning_rate": 3.899728141802197e-06, + "loss": 0.74647516, + "num_input_tokens_seen": 46082670, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.21374512, + "step": 2132, + "time_per_iteration": 2.573441982269287 + }, + { + "auxiliary_loss_clip": 0.01169588, + "auxiliary_loss_mlp": 0.01053254, + "balance_loss_clip": 1.0584538, + "balance_loss_mlp": 1.03215361, + "epoch": 0.1282428979407786, + "flos": 23112107936640.0, + "grad_norm": 2.253679468019247, + "language_loss": 0.81578398, + "learning_rate": 3.8996063354669935e-06, + "loss": 0.83801234, + "num_input_tokens_seen": 46102410, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.21105957, + "step": 2133, + "time_per_iteration": 2.53981876373291 + }, + { + "auxiliary_loss_clip": 0.01178883, + "auxiliary_loss_mlp": 0.01063391, + "balance_loss_clip": 1.05872726, + "balance_loss_mlp": 1.04032445, + "epoch": 0.12830302119344655, + "flos": 20886508316160.0, + "grad_norm": 2.2172243843070585, + "language_loss": 0.80159032, + "learning_rate": 3.899484457098528e-06, + "loss": 0.82401311, + "num_input_tokens_seen": 46121145, + "router_z_loss_clip": 1.20117188, + "router_z_loss_mlp": 0.23071289, + "step": 2134, + "time_per_iteration": 2.438990831375122 + }, + { + "auxiliary_loss_clip": 0.01177816, + "auxiliary_loss_mlp": 0.01059656, + "balance_loss_clip": 1.06064451, + "balance_loss_mlp": 1.03632724, + "epoch": 0.12836314444611455, + "flos": 21397768548480.0, + "grad_norm": 1.948115365154893, + "language_loss": 0.83409691, + "learning_rate": 3.899362506701421e-06, + "loss": 0.85647166, + "num_input_tokens_seen": 46140740, + "router_z_loss_clip": 1.17089844, + "router_z_loss_mlp": 0.23352051, + "step": 2135, + "time_per_iteration": 2.4487361907958984 + }, + { + "auxiliary_loss_clip": 0.0116777, + "auxiliary_loss_mlp": 0.01060721, + "balance_loss_clip": 1.05453682, + "balance_loss_mlp": 1.03640199, + "epoch": 0.1284232676987825, + "flos": 13662466773120.0, + "grad_norm": 2.8551324001208287, + "language_loss": 0.77346158, + "learning_rate": 3.899240484280298e-06, + "loss": 0.79574656, + "num_input_tokens_seen": 46156805, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.24316406, + "step": 2136, + "time_per_iteration": 2.4049203395843506 + }, + { + "auxiliary_loss_clip": 0.01085359, + "auxiliary_loss_mlp": 0.01014217, + "balance_loss_clip": 1.04248357, + "balance_loss_mlp": 1.01183832, + "epoch": 0.12848339095145048, + "flos": 59994737735040.0, + "grad_norm": 0.9071048806692416, + "language_loss": 0.59185159, + "learning_rate": 3.899118389839785e-06, + "loss": 0.61284733, + "num_input_tokens_seen": 46222085, + "router_z_loss_clip": 0.42822266, + "router_z_loss_mlp": 0.02374268, + "step": 2137, + "time_per_iteration": 3.2130796909332275 + }, + { + "auxiliary_loss_clip": 0.01188707, + "auxiliary_loss_mlp": 0.01059456, + "balance_loss_clip": 1.06896794, + "balance_loss_mlp": 1.0385468, + "epoch": 0.12854351420411844, + "flos": 13881378211200.0, + "grad_norm": 2.4458995824086336, + "language_loss": 0.82183313, + "learning_rate": 3.898996223384512e-06, + "loss": 0.84431481, + "num_input_tokens_seen": 46239970, + "router_z_loss_clip": 1.19726562, + "router_z_loss_mlp": 0.20898438, + "step": 2138, + "time_per_iteration": 2.4041483402252197 + }, + { + "auxiliary_loss_clip": 0.01181097, + "auxiliary_loss_mlp": 0.01061004, + "balance_loss_clip": 1.06304646, + "balance_loss_mlp": 1.03650689, + "epoch": 0.1286036374567864, + "flos": 22637943475200.0, + "grad_norm": 2.723220387374835, + "language_loss": 0.79122621, + "learning_rate": 3.898873984919113e-06, + "loss": 0.81364727, + "num_input_tokens_seen": 46257740, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.24487305, + "step": 2139, + "time_per_iteration": 2.444441795349121 + }, + { + "auxiliary_loss_clip": 0.01179403, + "auxiliary_loss_mlp": 0.01047657, + "balance_loss_clip": 1.06113672, + "balance_loss_mlp": 1.02683163, + "epoch": 0.12866376070945437, + "flos": 16324775948160.0, + "grad_norm": 1.8553096505500883, + "language_loss": 0.84840292, + "learning_rate": 3.8987516744482215e-06, + "loss": 0.87067354, + "num_input_tokens_seen": 46275445, + "router_z_loss_clip": 1.18261719, + "router_z_loss_mlp": 0.20837402, + "step": 2140, + "time_per_iteration": 2.4018938541412354 + }, + { + "auxiliary_loss_clip": 0.01170778, + "auxiliary_loss_mlp": 0.01051186, + "balance_loss_clip": 1.05811608, + "balance_loss_mlp": 1.03065848, + "epoch": 0.12872388396212234, + "flos": 11874546374400.0, + "grad_norm": 6.862638170009975, + "language_loss": 0.86141551, + "learning_rate": 3.898629291976476e-06, + "loss": 0.8836351, + "num_input_tokens_seen": 46291710, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.2052002, + "step": 2141, + "time_per_iteration": 2.4115002155303955 + }, + { + "auxiliary_loss_clip": 0.01180258, + "auxiliary_loss_mlp": 0.0105582, + "balance_loss_clip": 1.05776298, + "balance_loss_mlp": 1.03388596, + "epoch": 0.12878400721479033, + "flos": 28366700722560.0, + "grad_norm": 2.672641656913743, + "language_loss": 0.68335426, + "learning_rate": 3.898506837508518e-06, + "loss": 0.705715, + "num_input_tokens_seen": 46311335, + "router_z_loss_clip": 1.22558594, + "router_z_loss_mlp": 0.21911621, + "step": 2142, + "time_per_iteration": 2.4962291717529297 + }, + { + "auxiliary_loss_clip": 0.01181905, + "auxiliary_loss_mlp": 0.01053363, + "balance_loss_clip": 1.06225276, + "balance_loss_mlp": 1.03115475, + "epoch": 0.1288441304674583, + "flos": 25885632597120.0, + "grad_norm": 2.040334951216574, + "language_loss": 0.83487439, + "learning_rate": 3.89838431104899e-06, + "loss": 0.85722709, + "num_input_tokens_seen": 46330985, + "router_z_loss_clip": 1.19628906, + "router_z_loss_mlp": 0.2220459, + "step": 2143, + "time_per_iteration": 2.5283827781677246 + }, + { + "auxiliary_loss_clip": 0.01175985, + "auxiliary_loss_mlp": 0.01054236, + "balance_loss_clip": 1.06098163, + "balance_loss_mlp": 1.03265929, + "epoch": 0.12890425372012626, + "flos": 20813789232000.0, + "grad_norm": 1.6543230261634103, + "language_loss": 0.82066107, + "learning_rate": 3.898261712602539e-06, + "loss": 0.84296334, + "num_input_tokens_seen": 46351295, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.21582031, + "step": 2144, + "time_per_iteration": 2.5395095348358154 + }, + { + "auxiliary_loss_clip": 0.01173213, + "auxiliary_loss_mlp": 0.01052724, + "balance_loss_clip": 1.05535495, + "balance_loss_mlp": 1.02931118, + "epoch": 0.12896437697279423, + "flos": 22565870835840.0, + "grad_norm": 2.0020425944637497, + "language_loss": 0.78073609, + "learning_rate": 3.898139042173813e-06, + "loss": 0.80299544, + "num_input_tokens_seen": 46368600, + "router_z_loss_clip": 1.17871094, + "router_z_loss_mlp": 0.234375, + "step": 2145, + "time_per_iteration": 2.5039055347442627 + }, + { + "auxiliary_loss_clip": 0.01177345, + "auxiliary_loss_mlp": 0.01049666, + "balance_loss_clip": 1.06007922, + "balance_loss_mlp": 1.02786243, + "epoch": 0.1290245002254622, + "flos": 17493776075520.0, + "grad_norm": 2.337449835067227, + "language_loss": 0.82400435, + "learning_rate": 3.898016299767465e-06, + "loss": 0.8462745, + "num_input_tokens_seen": 46387370, + "router_z_loss_clip": 1.17480469, + "router_z_loss_mlp": 0.21789551, + "step": 2146, + "time_per_iteration": 2.421018362045288 + }, + { + "auxiliary_loss_clip": 0.01173337, + "auxiliary_loss_mlp": 0.01050889, + "balance_loss_clip": 1.05725014, + "balance_loss_mlp": 1.02869201, + "epoch": 0.12908462347813016, + "flos": 36315957859200.0, + "grad_norm": 2.063234279310564, + "language_loss": 0.71231097, + "learning_rate": 3.897893485388149e-06, + "loss": 0.73455322, + "num_input_tokens_seen": 46409570, + "router_z_loss_clip": 1.16015625, + "router_z_loss_mlp": 0.22216797, + "step": 2147, + "time_per_iteration": 2.6123132705688477 + }, + { + "auxiliary_loss_clip": 0.01174471, + "auxiliary_loss_mlp": 0.01058149, + "balance_loss_clip": 1.05791736, + "balance_loss_mlp": 1.03578544, + "epoch": 0.12914474673079815, + "flos": 22528703237760.0, + "grad_norm": 2.063994891465849, + "language_loss": 0.71446371, + "learning_rate": 3.897770599040521e-06, + "loss": 0.73678994, + "num_input_tokens_seen": 46429320, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.22351074, + "step": 2148, + "time_per_iteration": 2.4462318420410156 + }, + { + "auxiliary_loss_clip": 0.01176264, + "auxiliary_loss_mlp": 0.01049817, + "balance_loss_clip": 1.06165576, + "balance_loss_mlp": 1.0288837, + "epoch": 0.12920486998346611, + "flos": 21471888263040.0, + "grad_norm": 1.5849833943303913, + "language_loss": 0.79023528, + "learning_rate": 3.897647640729242e-06, + "loss": 0.81249607, + "num_input_tokens_seen": 46450155, + "router_z_loss_clip": 1.14550781, + "router_z_loss_mlp": 0.20947266, + "step": 2149, + "time_per_iteration": 3.9096975326538086 + }, + { + "auxiliary_loss_clip": 0.01172474, + "auxiliary_loss_mlp": 0.01054418, + "balance_loss_clip": 1.05710363, + "balance_loss_mlp": 1.03011072, + "epoch": 0.12926499323613408, + "flos": 27308556944640.0, + "grad_norm": 2.78196462397123, + "language_loss": 0.76528001, + "learning_rate": 3.897524610458975e-06, + "loss": 0.78754896, + "num_input_tokens_seen": 46470280, + "router_z_loss_clip": 1.15527344, + "router_z_loss_mlp": 0.24316406, + "step": 2150, + "time_per_iteration": 2.5094153881073 + }, + { + "auxiliary_loss_clip": 0.01179417, + "auxiliary_loss_mlp": 0.01058025, + "balance_loss_clip": 1.06100607, + "balance_loss_mlp": 1.03611469, + "epoch": 0.12932511648880204, + "flos": 22091131756800.0, + "grad_norm": 4.582834468569645, + "language_loss": 0.70785081, + "learning_rate": 3.8974015082343835e-06, + "loss": 0.73022527, + "num_input_tokens_seen": 46487605, + "router_z_loss_clip": 1.18359375, + "router_z_loss_mlp": 0.21875, + "step": 2151, + "time_per_iteration": 3.8033440113067627 + }, + { + "auxiliary_loss_clip": 0.01173483, + "auxiliary_loss_mlp": 0.01050168, + "balance_loss_clip": 1.06152725, + "balance_loss_mlp": 1.02984333, + "epoch": 0.12938523974147, + "flos": 20302780394880.0, + "grad_norm": 1.9112242485706115, + "language_loss": 0.84049606, + "learning_rate": 3.897278334060137e-06, + "loss": 0.86273253, + "num_input_tokens_seen": 46505100, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.20324707, + "step": 2152, + "time_per_iteration": 2.45802640914917 + }, + { + "auxiliary_loss_clip": 0.01173913, + "auxiliary_loss_mlp": 0.01059761, + "balance_loss_clip": 1.05729318, + "balance_loss_mlp": 1.03903091, + "epoch": 0.12944536299413797, + "flos": 19499961467520.0, + "grad_norm": 1.8023046774394351, + "language_loss": 0.78798044, + "learning_rate": 3.897155087940906e-06, + "loss": 0.8103171, + "num_input_tokens_seen": 46524020, + "router_z_loss_clip": 1.16699219, + "router_z_loss_mlp": 0.20727539, + "step": 2153, + "time_per_iteration": 2.4397802352905273 + }, + { + "auxiliary_loss_clip": 0.01173572, + "auxiliary_loss_mlp": 0.01051006, + "balance_loss_clip": 1.05982184, + "balance_loss_mlp": 1.03068125, + "epoch": 0.12950548624680594, + "flos": 27707919333120.0, + "grad_norm": 2.2436044951092007, + "language_loss": 0.80330473, + "learning_rate": 3.897031769881364e-06, + "loss": 0.82555056, + "num_input_tokens_seen": 46544640, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.20324707, + "step": 2154, + "time_per_iteration": 2.6119260787963867 + }, + { + "auxiliary_loss_clip": 0.01178043, + "auxiliary_loss_mlp": 0.01050067, + "balance_loss_clip": 1.06366479, + "balance_loss_mlp": 1.02841902, + "epoch": 0.12956560949947393, + "flos": 17565740974080.0, + "grad_norm": 3.674460420647549, + "language_loss": 0.83332366, + "learning_rate": 3.896908379886188e-06, + "loss": 0.85560477, + "num_input_tokens_seen": 46561395, + "router_z_loss_clip": 1.14355469, + "router_z_loss_mlp": 0.21643066, + "step": 2155, + "time_per_iteration": 4.128102779388428 + }, + { + "auxiliary_loss_clip": 0.01176534, + "auxiliary_loss_mlp": 0.0105683, + "balance_loss_clip": 1.05689931, + "balance_loss_mlp": 1.03482437, + "epoch": 0.1296257327521419, + "flos": 20740711011840.0, + "grad_norm": 2.680226708141038, + "language_loss": 0.75658977, + "learning_rate": 3.896784917960055e-06, + "loss": 0.77892339, + "num_input_tokens_seen": 46579395, + "router_z_loss_clip": 1.1953125, + "router_z_loss_mlp": 0.22009277, + "step": 2156, + "time_per_iteration": 3.859060287475586 + }, + { + "auxiliary_loss_clip": 0.01170151, + "auxiliary_loss_mlp": 0.01053171, + "balance_loss_clip": 1.05841935, + "balance_loss_mlp": 1.03236878, + "epoch": 0.12968585600480986, + "flos": 16395735265920.0, + "grad_norm": 1.791518623171595, + "language_loss": 0.86801624, + "learning_rate": 3.896661384107648e-06, + "loss": 0.89024949, + "num_input_tokens_seen": 46597090, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.20788574, + "step": 2157, + "time_per_iteration": 2.444108247756958 + }, + { + "auxiliary_loss_clip": 0.01179449, + "auxiliary_loss_mlp": 0.01051934, + "balance_loss_clip": 1.05958271, + "balance_loss_mlp": 1.03007054, + "epoch": 0.12974597925747783, + "flos": 28329533124480.0, + "grad_norm": 2.125882804146285, + "language_loss": 0.81265575, + "learning_rate": 3.896537778333651e-06, + "loss": 0.83496958, + "num_input_tokens_seen": 46617355, + "router_z_loss_clip": 1.19824219, + "router_z_loss_mlp": 0.21862793, + "step": 2158, + "time_per_iteration": 2.4981987476348877 + }, + { + "auxiliary_loss_clip": 0.01176358, + "auxiliary_loss_mlp": 0.0107163, + "balance_loss_clip": 1.05794084, + "balance_loss_mlp": 1.04807401, + "epoch": 0.1298061025101458, + "flos": 9683025782400.0, + "grad_norm": 2.9776725970768356, + "language_loss": 0.74531245, + "learning_rate": 3.896414100642752e-06, + "loss": 0.76779228, + "num_input_tokens_seen": 46633130, + "router_z_loss_clip": 1.18359375, + "router_z_loss_mlp": 0.23547363, + "step": 2159, + "time_per_iteration": 2.4462485313415527 + }, + { + "auxiliary_loss_clip": 0.01166264, + "auxiliary_loss_mlp": 0.01049702, + "balance_loss_clip": 1.05170166, + "balance_loss_mlp": 1.02857816, + "epoch": 0.12986622576281376, + "flos": 27709535445120.0, + "grad_norm": 1.956059270526173, + "language_loss": 0.82793999, + "learning_rate": 3.89629035103964e-06, + "loss": 0.85009962, + "num_input_tokens_seen": 46650575, + "router_z_loss_clip": 1.14550781, + "router_z_loss_mlp": 0.21154785, + "step": 2160, + "time_per_iteration": 2.478937864303589 + }, + { + "auxiliary_loss_clip": 0.01171926, + "auxiliary_loss_mlp": 0.01046507, + "balance_loss_clip": 1.06113958, + "balance_loss_mlp": 1.02514434, + "epoch": 0.12992634901548175, + "flos": 18802719590400.0, + "grad_norm": 2.1027184292468313, + "language_loss": 0.82312065, + "learning_rate": 3.896166529529008e-06, + "loss": 0.84530497, + "num_input_tokens_seen": 46668780, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.21362305, + "step": 2161, + "time_per_iteration": 2.428591251373291 + }, + { + "auxiliary_loss_clip": 0.01170677, + "auxiliary_loss_mlp": 0.01052399, + "balance_loss_clip": 1.0568192, + "balance_loss_mlp": 1.03103673, + "epoch": 0.12998647226814972, + "flos": 29127575543040.0, + "grad_norm": 2.6506499968801935, + "language_loss": 0.82836825, + "learning_rate": 3.896042636115551e-06, + "loss": 0.85059905, + "num_input_tokens_seen": 46687550, + "router_z_loss_clip": 1.13769531, + "router_z_loss_mlp": 0.21374512, + "step": 2162, + "time_per_iteration": 2.4711031913757324 + }, + { + "auxiliary_loss_clip": 0.01200673, + "auxiliary_loss_mlp": 0.01055707, + "balance_loss_clip": 1.07937264, + "balance_loss_mlp": 1.03432047, + "epoch": 0.13004659552081768, + "flos": 19573686132480.0, + "grad_norm": 4.39968989607179, + "language_loss": 0.72745436, + "learning_rate": 3.895918670803968e-06, + "loss": 0.75001818, + "num_input_tokens_seen": 46706730, + "router_z_loss_clip": 1.21484375, + "router_z_loss_mlp": 0.21386719, + "step": 2163, + "time_per_iteration": 2.4356589317321777 + }, + { + "auxiliary_loss_clip": 0.01174511, + "auxiliary_loss_mlp": 0.01058302, + "balance_loss_clip": 1.05604243, + "balance_loss_mlp": 1.03335142, + "epoch": 0.13010671877348565, + "flos": 22490709626880.0, + "grad_norm": 2.3326925446962674, + "language_loss": 0.81287313, + "learning_rate": 3.895794633598958e-06, + "loss": 0.83520126, + "num_input_tokens_seen": 46724250, + "router_z_loss_clip": 1.18554688, + "router_z_loss_mlp": 0.24926758, + "step": 2164, + "time_per_iteration": 2.4191858768463135 + }, + { + "auxiliary_loss_clip": 0.01173397, + "auxiliary_loss_mlp": 0.0105131, + "balance_loss_clip": 1.0551374, + "balance_loss_mlp": 1.0310806, + "epoch": 0.1301668420261536, + "flos": 23878226142720.0, + "grad_norm": 2.7398778822035363, + "language_loss": 0.72110844, + "learning_rate": 3.8956705245052256e-06, + "loss": 0.74335551, + "num_input_tokens_seen": 46744105, + "router_z_loss_clip": 1.18261719, + "router_z_loss_mlp": 0.20239258, + "step": 2165, + "time_per_iteration": 2.453580856323242 + }, + { + "auxiliary_loss_clip": 0.01182374, + "auxiliary_loss_mlp": 0.01046352, + "balance_loss_clip": 1.06411481, + "balance_loss_mlp": 1.0243578, + "epoch": 0.13022696527882158, + "flos": 23150065633920.0, + "grad_norm": 1.6286018294927416, + "language_loss": 0.74879098, + "learning_rate": 3.8955463435274765e-06, + "loss": 0.77107829, + "num_input_tokens_seen": 46764250, + "router_z_loss_clip": 1.18359375, + "router_z_loss_mlp": 0.22009277, + "step": 2166, + "time_per_iteration": 2.4890410900115967 + }, + { + "auxiliary_loss_clip": 0.01174624, + "auxiliary_loss_mlp": 0.01048514, + "balance_loss_clip": 1.05770564, + "balance_loss_mlp": 1.02792704, + "epoch": 0.13028708853148954, + "flos": 26908548111360.0, + "grad_norm": 1.6481808719700293, + "language_loss": 0.83089566, + "learning_rate": 3.895422090670421e-06, + "loss": 0.853127, + "num_input_tokens_seen": 46786865, + "router_z_loss_clip": 1.16894531, + "router_z_loss_mlp": 0.20593262, + "step": 2167, + "time_per_iteration": 2.5388879776000977 + }, + { + "auxiliary_loss_clip": 0.01173721, + "auxiliary_loss_mlp": 0.01060112, + "balance_loss_clip": 1.06054735, + "balance_loss_mlp": 1.03878498, + "epoch": 0.13034721178415754, + "flos": 21251468453760.0, + "grad_norm": 1.7303506225147733, + "language_loss": 0.83347845, + "learning_rate": 3.89529776593877e-06, + "loss": 0.85581684, + "num_input_tokens_seen": 46807030, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.21325684, + "step": 2168, + "time_per_iteration": 2.453659772872925 + }, + { + "auxiliary_loss_clip": 0.01172235, + "auxiliary_loss_mlp": 0.01051151, + "balance_loss_clip": 1.05584741, + "balance_loss_mlp": 1.02946723, + "epoch": 0.1304073350368255, + "flos": 18767239931520.0, + "grad_norm": 1.9880458118002389, + "language_loss": 0.79975146, + "learning_rate": 3.8951733693372375e-06, + "loss": 0.82198536, + "num_input_tokens_seen": 46826280, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.21679688, + "step": 2169, + "time_per_iteration": 2.497026205062866 + }, + { + "auxiliary_loss_clip": 0.0117132, + "auxiliary_loss_mlp": 0.01042768, + "balance_loss_clip": 1.05660701, + "balance_loss_mlp": 1.02094126, + "epoch": 0.13046745828949347, + "flos": 28364653647360.0, + "grad_norm": 3.15893292382564, + "language_loss": 0.6663394, + "learning_rate": 3.8950489008705406e-06, + "loss": 0.68848026, + "num_input_tokens_seen": 46846505, + "router_z_loss_clip": 1.14648438, + "router_z_loss_mlp": 0.21826172, + "step": 2170, + "time_per_iteration": 2.513645887374878 + }, + { + "auxiliary_loss_clip": 0.01169114, + "auxiliary_loss_mlp": 0.01048348, + "balance_loss_clip": 1.05537736, + "balance_loss_mlp": 1.0275104, + "epoch": 0.13052758154216143, + "flos": 29605044055680.0, + "grad_norm": 1.6919257045799339, + "language_loss": 0.6699487, + "learning_rate": 3.8949243605434e-06, + "loss": 0.69212329, + "num_input_tokens_seen": 46867380, + "router_z_loss_clip": 1.13769531, + "router_z_loss_mlp": 0.20825195, + "step": 2171, + "time_per_iteration": 2.623631000518799 + }, + { + "auxiliary_loss_clip": 0.01170564, + "auxiliary_loss_mlp": 0.01057409, + "balance_loss_clip": 1.05459774, + "balance_loss_mlp": 1.03305411, + "epoch": 0.1305877047948294, + "flos": 19390864884480.0, + "grad_norm": 2.8294327338276175, + "language_loss": 0.72334039, + "learning_rate": 3.894799748360537e-06, + "loss": 0.74562013, + "num_input_tokens_seen": 46886810, + "router_z_loss_clip": 1.15820312, + "router_z_loss_mlp": 0.24365234, + "step": 2172, + "time_per_iteration": 2.426788568496704 + }, + { + "auxiliary_loss_clip": 0.01200545, + "auxiliary_loss_mlp": 0.0104818, + "balance_loss_clip": 1.08475637, + "balance_loss_mlp": 1.0286411, + "epoch": 0.13064782804749736, + "flos": 16873527000960.0, + "grad_norm": 1.8928533599888446, + "language_loss": 0.75785697, + "learning_rate": 3.894675064326678e-06, + "loss": 0.78034419, + "num_input_tokens_seen": 46905620, + "router_z_loss_clip": 1.15820312, + "router_z_loss_mlp": 0.19543457, + "step": 2173, + "time_per_iteration": 2.5130300521850586 + }, + { + "auxiliary_loss_clip": 0.01170149, + "auxiliary_loss_mlp": 0.01059989, + "balance_loss_clip": 1.05435288, + "balance_loss_mlp": 1.03721988, + "epoch": 0.13070795130016533, + "flos": 24499085748480.0, + "grad_norm": 2.288925493539164, + "language_loss": 0.70289743, + "learning_rate": 3.894550308446551e-06, + "loss": 0.72519886, + "num_input_tokens_seen": 46925120, + "router_z_loss_clip": 1.15820312, + "router_z_loss_mlp": 0.22766113, + "step": 2174, + "time_per_iteration": 2.483006000518799 + }, + { + "auxiliary_loss_clip": 0.01090311, + "auxiliary_loss_mlp": 0.01015009, + "balance_loss_clip": 1.04738593, + "balance_loss_mlp": 1.01185584, + "epoch": 0.13076807455283332, + "flos": 71054505953280.0, + "grad_norm": 1.3058770410036489, + "language_loss": 0.59019697, + "learning_rate": 3.894425480724886e-06, + "loss": 0.61125016, + "num_input_tokens_seen": 46988195, + "router_z_loss_clip": 0.42919922, + "router_z_loss_mlp": 0.03155518, + "step": 2175, + "time_per_iteration": 3.1964895725250244 + }, + { + "auxiliary_loss_clip": 0.01175957, + "auxiliary_loss_mlp": 0.01051881, + "balance_loss_clip": 1.06175983, + "balance_loss_mlp": 1.03081703, + "epoch": 0.13082819780550128, + "flos": 20264499475200.0, + "grad_norm": 2.168865186792601, + "language_loss": 0.79757434, + "learning_rate": 3.894300581166417e-06, + "loss": 0.81985271, + "num_input_tokens_seen": 47004720, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.21069336, + "step": 2176, + "time_per_iteration": 2.427940607070923 + }, + { + "auxiliary_loss_clip": 0.0118087, + "auxiliary_loss_mlp": 0.01068669, + "balance_loss_clip": 1.06122661, + "balance_loss_mlp": 1.04545867, + "epoch": 0.13088832105816925, + "flos": 34203441231360.0, + "grad_norm": 2.5933663263469833, + "language_loss": 0.7425133, + "learning_rate": 3.894175609775881e-06, + "loss": 0.76500869, + "num_input_tokens_seen": 47024255, + "router_z_loss_clip": 1.19628906, + "router_z_loss_mlp": 0.23217773, + "step": 2177, + "time_per_iteration": 2.561669111251831 + }, + { + "auxiliary_loss_clip": 0.0116903, + "auxiliary_loss_mlp": 0.01054469, + "balance_loss_clip": 1.05530179, + "balance_loss_mlp": 1.03211713, + "epoch": 0.13094844431083721, + "flos": 17894970057600.0, + "grad_norm": 1.928308098800671, + "language_loss": 0.82316768, + "learning_rate": 3.894050566558015e-06, + "loss": 0.84540272, + "num_input_tokens_seen": 47042465, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.22363281, + "step": 2178, + "time_per_iteration": 2.4780466556549072 + }, + { + "auxiliary_loss_clip": 0.01172376, + "auxiliary_loss_mlp": 0.01052991, + "balance_loss_clip": 1.05710459, + "balance_loss_mlp": 1.03179574, + "epoch": 0.13100856756350518, + "flos": 17311313963520.0, + "grad_norm": 2.5114826673573662, + "language_loss": 0.74906087, + "learning_rate": 3.893925451517562e-06, + "loss": 0.7713145, + "num_input_tokens_seen": 47060370, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.21203613, + "step": 2179, + "time_per_iteration": 2.4497933387756348 + }, + { + "auxiliary_loss_clip": 0.01174276, + "auxiliary_loss_mlp": 0.01052055, + "balance_loss_clip": 1.06099617, + "balance_loss_mlp": 1.03126454, + "epoch": 0.13106869081617314, + "flos": 22200551562240.0, + "grad_norm": 3.704937396903676, + "language_loss": 0.84598446, + "learning_rate": 3.893800264659266e-06, + "loss": 0.86824775, + "num_input_tokens_seen": 47081415, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.20788574, + "step": 2180, + "time_per_iteration": 2.4517414569854736 + }, + { + "auxiliary_loss_clip": 0.01179371, + "auxiliary_loss_mlp": 0.01077232, + "balance_loss_clip": 1.06336617, + "balance_loss_mlp": 1.05573833, + "epoch": 0.13112881406884114, + "flos": 21763123735680.0, + "grad_norm": 1.9017985841580605, + "language_loss": 0.89905894, + "learning_rate": 3.8936750059878746e-06, + "loss": 0.92162502, + "num_input_tokens_seen": 47099860, + "router_z_loss_clip": 1.16015625, + "router_z_loss_mlp": 0.21496582, + "step": 2181, + "time_per_iteration": 2.4860944747924805 + }, + { + "auxiliary_loss_clip": 0.01177291, + "auxiliary_loss_mlp": 0.01053894, + "balance_loss_clip": 1.06272519, + "balance_loss_mlp": 1.03360415, + "epoch": 0.1311889373215091, + "flos": 23331091201920.0, + "grad_norm": 2.94673086031055, + "language_loss": 0.68824124, + "learning_rate": 3.893549675508137e-06, + "loss": 0.71055305, + "num_input_tokens_seen": 47118540, + "router_z_loss_clip": 1.14648438, + "router_z_loss_mlp": 0.20288086, + "step": 2182, + "time_per_iteration": 2.484271287918091 + }, + { + "auxiliary_loss_clip": 0.01177215, + "auxiliary_loss_mlp": 0.01061114, + "balance_loss_clip": 1.06045771, + "balance_loss_mlp": 1.03967977, + "epoch": 0.13124906057417707, + "flos": 21467363149440.0, + "grad_norm": 2.1449251353294545, + "language_loss": 0.78854477, + "learning_rate": 3.893424273224806e-06, + "loss": 0.81092805, + "num_input_tokens_seen": 47136710, + "router_z_loss_clip": 1.16796875, + "router_z_loss_mlp": 0.21447754, + "step": 2183, + "time_per_iteration": 2.491933822631836 + }, + { + "auxiliary_loss_clip": 0.01172971, + "auxiliary_loss_mlp": 0.01068114, + "balance_loss_clip": 1.05721903, + "balance_loss_mlp": 1.04412961, + "epoch": 0.13130918382684503, + "flos": 23255319461760.0, + "grad_norm": 1.7321791173778511, + "language_loss": 0.85721982, + "learning_rate": 3.893298799142636e-06, + "loss": 0.87963068, + "num_input_tokens_seen": 47157155, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.23986816, + "step": 2184, + "time_per_iteration": 2.430065155029297 + }, + { + "auxiliary_loss_clip": 0.01175519, + "auxiliary_loss_mlp": 0.01072474, + "balance_loss_clip": 1.05693865, + "balance_loss_mlp": 1.04724967, + "epoch": 0.131369307079513, + "flos": 20850274471680.0, + "grad_norm": 2.0091597324339707, + "language_loss": 0.82605135, + "learning_rate": 3.893173253266387e-06, + "loss": 0.84853125, + "num_input_tokens_seen": 47176820, + "router_z_loss_clip": 1.18554688, + "router_z_loss_mlp": 0.25256348, + "step": 2185, + "time_per_iteration": 2.4644293785095215 + }, + { + "auxiliary_loss_clip": 0.01176755, + "auxiliary_loss_mlp": 0.01057649, + "balance_loss_clip": 1.05743349, + "balance_loss_mlp": 1.03544009, + "epoch": 0.13142943033218096, + "flos": 17858341163520.0, + "grad_norm": 2.3620248982534124, + "language_loss": 0.73183018, + "learning_rate": 3.893047635600818e-06, + "loss": 0.75417423, + "num_input_tokens_seen": 47195855, + "router_z_loss_clip": 1.19335938, + "router_z_loss_mlp": 0.2220459, + "step": 2186, + "time_per_iteration": 2.3900492191314697 + }, + { + "auxiliary_loss_clip": 0.01174104, + "auxiliary_loss_mlp": 0.01051135, + "balance_loss_clip": 1.05861163, + "balance_loss_mlp": 1.02872396, + "epoch": 0.13148955358484893, + "flos": 20996035862400.0, + "grad_norm": 1.9525590871342158, + "language_loss": 0.80035579, + "learning_rate": 3.892921946150693e-06, + "loss": 0.82260817, + "num_input_tokens_seen": 47214535, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.22399902, + "step": 2187, + "time_per_iteration": 2.431260585784912 + }, + { + "auxiliary_loss_clip": 0.01101358, + "auxiliary_loss_mlp": 0.01013713, + "balance_loss_clip": 1.05586696, + "balance_loss_mlp": 1.01047671, + "epoch": 0.13154967683751692, + "flos": 70172467580160.0, + "grad_norm": 0.8434003474163958, + "language_loss": 0.59087688, + "learning_rate": 3.892796184920778e-06, + "loss": 0.61202759, + "num_input_tokens_seen": 47270300, + "router_z_loss_clip": 0.45507812, + "router_z_loss_mlp": 0.03240967, + "step": 2188, + "time_per_iteration": 3.085664749145508 + }, + { + "auxiliary_loss_clip": 0.01188266, + "auxiliary_loss_mlp": 0.0104557, + "balance_loss_clip": 1.07245219, + "balance_loss_mlp": 1.02489901, + "epoch": 0.1316098000901849, + "flos": 20376145923840.0, + "grad_norm": 2.53612934059419, + "language_loss": 0.74161464, + "learning_rate": 3.892670351915842e-06, + "loss": 0.76395297, + "num_input_tokens_seen": 47290720, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.20666504, + "step": 2189, + "time_per_iteration": 2.4442899227142334 + }, + { + "auxiliary_loss_clip": 0.01179954, + "auxiliary_loss_mlp": 0.01048132, + "balance_loss_clip": 1.06578314, + "balance_loss_mlp": 1.02741313, + "epoch": 0.13166992334285285, + "flos": 23221132692480.0, + "grad_norm": 2.9099161071908246, + "language_loss": 0.728513, + "learning_rate": 3.892544447140657e-06, + "loss": 0.75079387, + "num_input_tokens_seen": 47311820, + "router_z_loss_clip": 1.14355469, + "router_z_loss_mlp": 0.20727539, + "step": 2190, + "time_per_iteration": 2.5190982818603516 + }, + { + "auxiliary_loss_clip": 0.01176308, + "auxiliary_loss_mlp": 0.0105442, + "balance_loss_clip": 1.06146562, + "balance_loss_mlp": 1.03274751, + "epoch": 0.13173004659552082, + "flos": 23330947547520.0, + "grad_norm": 2.1703129265479233, + "language_loss": 0.74677968, + "learning_rate": 3.892418470599996e-06, + "loss": 0.76908696, + "num_input_tokens_seen": 47331605, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.21679688, + "step": 2191, + "time_per_iteration": 2.5175342559814453 + }, + { + "auxiliary_loss_clip": 0.01171439, + "auxiliary_loss_mlp": 0.01046894, + "balance_loss_clip": 1.0581181, + "balance_loss_mlp": 1.02587783, + "epoch": 0.13179016984818878, + "flos": 21251504367360.0, + "grad_norm": 1.9474183232443711, + "language_loss": 0.79178804, + "learning_rate": 3.892292422298637e-06, + "loss": 0.8139714, + "num_input_tokens_seen": 47350455, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.21008301, + "step": 2192, + "time_per_iteration": 3.9394185543060303 + }, + { + "auxiliary_loss_clip": 0.01173191, + "auxiliary_loss_mlp": 0.01048677, + "balance_loss_clip": 1.05707932, + "balance_loss_mlp": 1.02791095, + "epoch": 0.13185029310085675, + "flos": 17778690754560.0, + "grad_norm": 1.9539245226852118, + "language_loss": 0.85397625, + "learning_rate": 3.892166302241361e-06, + "loss": 0.87619489, + "num_input_tokens_seen": 47368225, + "router_z_loss_clip": 1.16210938, + "router_z_loss_mlp": 0.2076416, + "step": 2193, + "time_per_iteration": 2.463088274002075 + }, + { + "auxiliary_loss_clip": 0.01101064, + "auxiliary_loss_mlp": 0.01011379, + "balance_loss_clip": 1.05693245, + "balance_loss_mlp": 1.00685453, + "epoch": 0.1319104163535247, + "flos": 69851785933440.0, + "grad_norm": 0.7609811605026496, + "language_loss": 0.54116189, + "learning_rate": 3.8920401104329475e-06, + "loss": 0.56228638, + "num_input_tokens_seen": 47427125, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 0.04522705, + "step": 2194, + "time_per_iteration": 3.0774292945861816 + }, + { + "auxiliary_loss_clip": 0.0118373, + "auxiliary_loss_mlp": 0.01057291, + "balance_loss_clip": 1.06869674, + "balance_loss_mlp": 1.03518927, + "epoch": 0.1319705396061927, + "flos": 25193095401600.0, + "grad_norm": 2.8781209915331063, + "language_loss": 0.71874189, + "learning_rate": 3.891913846878185e-06, + "loss": 0.74115205, + "num_input_tokens_seen": 47450275, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.22070312, + "step": 2195, + "time_per_iteration": 3.8827285766601562 + }, + { + "auxiliary_loss_clip": 0.01182077, + "auxiliary_loss_mlp": 0.01046431, + "balance_loss_clip": 1.06169295, + "balance_loss_mlp": 1.02355504, + "epoch": 0.13203066285886067, + "flos": 20740459616640.0, + "grad_norm": 2.131898044105854, + "language_loss": 0.78404391, + "learning_rate": 3.891787511581859e-06, + "loss": 0.80632895, + "num_input_tokens_seen": 47469155, + "router_z_loss_clip": 1.20410156, + "router_z_loss_mlp": 0.2286377, + "step": 2196, + "time_per_iteration": 2.43465518951416 + }, + { + "auxiliary_loss_clip": 0.01176249, + "auxiliary_loss_mlp": 0.01051731, + "balance_loss_clip": 1.05836058, + "balance_loss_mlp": 1.03100026, + "epoch": 0.13209078611152864, + "flos": 22054395121920.0, + "grad_norm": 2.470886366000082, + "language_loss": 0.75036895, + "learning_rate": 3.89166110454876e-06, + "loss": 0.77264881, + "num_input_tokens_seen": 47488405, + "router_z_loss_clip": 1.17871094, + "router_z_loss_mlp": 0.20727539, + "step": 2197, + "time_per_iteration": 2.4705615043640137 + }, + { + "auxiliary_loss_clip": 0.01184483, + "auxiliary_loss_mlp": 0.01050517, + "balance_loss_clip": 1.06423855, + "balance_loss_mlp": 1.02951229, + "epoch": 0.1321509093641966, + "flos": 16284950743680.0, + "grad_norm": 1.9211972731511093, + "language_loss": 0.79694486, + "learning_rate": 3.891534625783685e-06, + "loss": 0.81929493, + "num_input_tokens_seen": 47505650, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.20996094, + "step": 2198, + "time_per_iteration": 4.017429828643799 + }, + { + "auxiliary_loss_clip": 0.01174299, + "auxiliary_loss_mlp": 0.0105572, + "balance_loss_clip": 1.06140614, + "balance_loss_mlp": 1.03583598, + "epoch": 0.13221103261686457, + "flos": 16983018633600.0, + "grad_norm": 2.188352210425171, + "language_loss": 0.82970905, + "learning_rate": 3.891408075291425e-06, + "loss": 0.85200918, + "num_input_tokens_seen": 47521540, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.19897461, + "step": 2199, + "time_per_iteration": 3.816314697265625 + }, + { + "auxiliary_loss_clip": 0.01186133, + "auxiliary_loss_mlp": 0.01053207, + "balance_loss_clip": 1.06868923, + "balance_loss_mlp": 1.03232145, + "epoch": 0.13227115586953253, + "flos": 34233605677440.0, + "grad_norm": 2.0996559286031995, + "language_loss": 0.69236845, + "learning_rate": 3.8912814530767826e-06, + "loss": 0.71476185, + "num_input_tokens_seen": 47543625, + "router_z_loss_clip": 1.17382812, + "router_z_loss_mlp": 0.2088623, + "step": 2200, + "time_per_iteration": 2.592052698135376 + }, + { + "auxiliary_loss_clip": 0.01179589, + "auxiliary_loss_mlp": 0.01063413, + "balance_loss_clip": 1.0623405, + "balance_loss_mlp": 1.04104924, + "epoch": 0.13233127912220052, + "flos": 20704656735360.0, + "grad_norm": 1.830013175485165, + "language_loss": 0.84723258, + "learning_rate": 3.891154759144557e-06, + "loss": 0.86966264, + "num_input_tokens_seen": 47563740, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.22351074, + "step": 2201, + "time_per_iteration": 2.448230028152466 + }, + { + "auxiliary_loss_clip": 0.0117517, + "auxiliary_loss_mlp": 0.01052792, + "balance_loss_clip": 1.0593617, + "balance_loss_mlp": 1.03177571, + "epoch": 0.1323914023748685, + "flos": 25805048434560.0, + "grad_norm": 2.0596298551808094, + "language_loss": 0.87003762, + "learning_rate": 3.891027993499554e-06, + "loss": 0.89231718, + "num_input_tokens_seen": 47582655, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.20996094, + "step": 2202, + "time_per_iteration": 2.5379042625427246 + }, + { + "auxiliary_loss_clip": 0.01178955, + "auxiliary_loss_mlp": 0.01048359, + "balance_loss_clip": 1.06284666, + "balance_loss_mlp": 1.02854693, + "epoch": 0.13245152562753645, + "flos": 21251540280960.0, + "grad_norm": 3.4941770091031996, + "language_loss": 0.72887039, + "learning_rate": 3.89090115614658e-06, + "loss": 0.75114357, + "num_input_tokens_seen": 47600875, + "router_z_loss_clip": 1.16210938, + "router_z_loss_mlp": 0.19824219, + "step": 2203, + "time_per_iteration": 2.4542624950408936 + }, + { + "auxiliary_loss_clip": 0.01171958, + "auxiliary_loss_mlp": 0.01060929, + "balance_loss_clip": 1.05651855, + "balance_loss_mlp": 1.03920889, + "epoch": 0.13251164888020442, + "flos": 26610955931520.0, + "grad_norm": 2.721708643628308, + "language_loss": 0.7385093, + "learning_rate": 3.890774247090444e-06, + "loss": 0.76083815, + "num_input_tokens_seen": 47619250, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.21716309, + "step": 2204, + "time_per_iteration": 2.6215219497680664 + }, + { + "auxiliary_loss_clip": 0.0119962, + "auxiliary_loss_mlp": 0.01055322, + "balance_loss_clip": 1.08162141, + "balance_loss_mlp": 1.03313684, + "epoch": 0.13257177213287238, + "flos": 29826541272960.0, + "grad_norm": 1.6701822709591372, + "language_loss": 0.7855323, + "learning_rate": 3.89064726633596e-06, + "loss": 0.80808175, + "num_input_tokens_seen": 47639445, + "router_z_loss_clip": 1.17871094, + "router_z_loss_mlp": 0.22180176, + "step": 2205, + "time_per_iteration": 2.5076539516448975 + }, + { + "auxiliary_loss_clip": 0.01173727, + "auxiliary_loss_mlp": 0.01049126, + "balance_loss_clip": 1.06078911, + "balance_loss_mlp": 1.02936101, + "epoch": 0.13263189538554035, + "flos": 21288456483840.0, + "grad_norm": 2.070293470950002, + "language_loss": 0.79126728, + "learning_rate": 3.890520213887941e-06, + "loss": 0.81349581, + "num_input_tokens_seen": 47658740, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.19787598, + "step": 2206, + "time_per_iteration": 2.4594430923461914 + }, + { + "auxiliary_loss_clip": 0.01175431, + "auxiliary_loss_mlp": 0.01053316, + "balance_loss_clip": 1.06068838, + "balance_loss_mlp": 1.03382516, + "epoch": 0.13269201863820831, + "flos": 16874101618560.0, + "grad_norm": 1.9664317809026568, + "language_loss": 0.74376047, + "learning_rate": 3.890393089751208e-06, + "loss": 0.76604795, + "num_input_tokens_seen": 47676880, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.19494629, + "step": 2207, + "time_per_iteration": 2.4275131225585938 + }, + { + "auxiliary_loss_clip": 0.01169748, + "auxiliary_loss_mlp": 0.01049313, + "balance_loss_clip": 1.05859745, + "balance_loss_mlp": 1.0283556, + "epoch": 0.1327521418908763, + "flos": 23768914078080.0, + "grad_norm": 2.0644242995240054, + "language_loss": 0.84060574, + "learning_rate": 3.890265893930578e-06, + "loss": 0.86279631, + "num_input_tokens_seen": 47696635, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.20959473, + "step": 2208, + "time_per_iteration": 2.5023715496063232 + }, + { + "auxiliary_loss_clip": 0.01168437, + "auxiliary_loss_mlp": 0.0105341, + "balance_loss_clip": 1.0606966, + "balance_loss_mlp": 1.03419387, + "epoch": 0.13281226514354427, + "flos": 26505594362880.0, + "grad_norm": 1.708446981032195, + "language_loss": 0.85296637, + "learning_rate": 3.890138626430876e-06, + "loss": 0.87518489, + "num_input_tokens_seen": 47717760, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.19226074, + "step": 2209, + "time_per_iteration": 2.5083611011505127 + }, + { + "auxiliary_loss_clip": 0.01169288, + "auxiliary_loss_mlp": 0.01054807, + "balance_loss_clip": 1.05650747, + "balance_loss_mlp": 1.03562593, + "epoch": 0.13287238839621224, + "flos": 24498762526080.0, + "grad_norm": 2.605903180957004, + "language_loss": 0.81786287, + "learning_rate": 3.890011287256929e-06, + "loss": 0.84010381, + "num_input_tokens_seen": 47737685, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.19189453, + "step": 2210, + "time_per_iteration": 2.525390148162842 + }, + { + "auxiliary_loss_clip": 0.01093385, + "auxiliary_loss_mlp": 0.01038843, + "balance_loss_clip": 1.04995942, + "balance_loss_mlp": 1.03573775, + "epoch": 0.1329325116488802, + "flos": 67694344369920.0, + "grad_norm": 0.7665494297857917, + "language_loss": 0.57974988, + "learning_rate": 3.889883876413563e-06, + "loss": 0.60107219, + "num_input_tokens_seen": 47802415, + "router_z_loss_clip": 0.43408203, + "router_z_loss_mlp": 0.03106689, + "step": 2211, + "time_per_iteration": 3.169757127761841 + }, + { + "auxiliary_loss_clip": 0.01103697, + "auxiliary_loss_mlp": 0.01007057, + "balance_loss_clip": 1.06322193, + "balance_loss_mlp": 1.00387979, + "epoch": 0.13299263490154817, + "flos": 72261894741120.0, + "grad_norm": 0.8044826906747673, + "language_loss": 0.55300903, + "learning_rate": 3.889756393905611e-06, + "loss": 0.57411653, + "num_input_tokens_seen": 47871485, + "router_z_loss_clip": 0.40478516, + "router_z_loss_mlp": 0.03173828, + "step": 2212, + "time_per_iteration": 3.1391689777374268 + }, + { + "auxiliary_loss_clip": 0.01181048, + "auxiliary_loss_mlp": 0.01053851, + "balance_loss_clip": 1.06205475, + "balance_loss_mlp": 1.03222644, + "epoch": 0.13305275815421613, + "flos": 17931275729280.0, + "grad_norm": 22.583212569083802, + "language_loss": 0.74467742, + "learning_rate": 3.889628839737908e-06, + "loss": 0.76702642, + "num_input_tokens_seen": 47888315, + "router_z_loss_clip": 1.19042969, + "router_z_loss_mlp": 0.21618652, + "step": 2213, + "time_per_iteration": 2.4326040744781494 + }, + { + "auxiliary_loss_clip": 0.01164997, + "auxiliary_loss_mlp": 0.01049898, + "balance_loss_clip": 1.05605805, + "balance_loss_mlp": 1.03121805, + "epoch": 0.13311288140688413, + "flos": 22340889999360.0, + "grad_norm": 1.787680470316349, + "language_loss": 0.79196423, + "learning_rate": 3.889501213915291e-06, + "loss": 0.81411314, + "num_input_tokens_seen": 47906600, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.18676758, + "step": 2214, + "time_per_iteration": 2.461132287979126 + }, + { + "auxiliary_loss_clip": 0.0117373, + "auxiliary_loss_mlp": 0.01051548, + "balance_loss_clip": 1.05947614, + "balance_loss_mlp": 1.03080511, + "epoch": 0.1331730046595521, + "flos": 31868888682240.0, + "grad_norm": 1.874786737740801, + "language_loss": 0.68956137, + "learning_rate": 3.889373516442597e-06, + "loss": 0.71181417, + "num_input_tokens_seen": 47927630, + "router_z_loss_clip": 1.14257812, + "router_z_loss_mlp": 0.20739746, + "step": 2215, + "time_per_iteration": 2.541696548461914 + }, + { + "auxiliary_loss_clip": 0.01174262, + "auxiliary_loss_mlp": 0.01046649, + "balance_loss_clip": 1.05821037, + "balance_loss_mlp": 1.02579951, + "epoch": 0.13323312791222006, + "flos": 22566589107840.0, + "grad_norm": 2.066140795522621, + "language_loss": 0.81324673, + "learning_rate": 3.889245747324671e-06, + "loss": 0.83545589, + "num_input_tokens_seen": 47947935, + "router_z_loss_clip": 1.15917969, + "router_z_loss_mlp": 0.20861816, + "step": 2216, + "time_per_iteration": 2.530383825302124 + }, + { + "auxiliary_loss_clip": 0.01173373, + "auxiliary_loss_mlp": 0.01057583, + "balance_loss_clip": 1.05941904, + "balance_loss_mlp": 1.03693581, + "epoch": 0.13329325116488802, + "flos": 15085319293440.0, + "grad_norm": 2.466196079343374, + "language_loss": 0.87264705, + "learning_rate": 3.889117906566356e-06, + "loss": 0.89495659, + "num_input_tokens_seen": 47965515, + "router_z_loss_clip": 1.13769531, + "router_z_loss_mlp": 0.20654297, + "step": 2217, + "time_per_iteration": 2.4204201698303223 + }, + { + "auxiliary_loss_clip": 0.01172028, + "auxiliary_loss_mlp": 0.01055573, + "balance_loss_clip": 1.05924881, + "balance_loss_mlp": 1.03343582, + "epoch": 0.133353374417556, + "flos": 27453671890560.0, + "grad_norm": 2.614793032360565, + "language_loss": 0.73369265, + "learning_rate": 3.888989994172501e-06, + "loss": 0.75596869, + "num_input_tokens_seen": 47985675, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.22131348, + "step": 2218, + "time_per_iteration": 2.484193801879883 + }, + { + "auxiliary_loss_clip": 0.01175401, + "auxiliary_loss_mlp": 0.01049526, + "balance_loss_clip": 1.06159186, + "balance_loss_mlp": 1.02850902, + "epoch": 0.13341349767022395, + "flos": 24094695456000.0, + "grad_norm": 2.089426694985728, + "language_loss": 0.86712372, + "learning_rate": 3.8888620101479565e-06, + "loss": 0.88937294, + "num_input_tokens_seen": 48004985, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.21020508, + "step": 2219, + "time_per_iteration": 2.498058557510376 + }, + { + "auxiliary_loss_clip": 0.01178898, + "auxiliary_loss_mlp": 0.01058977, + "balance_loss_clip": 1.06439352, + "balance_loss_mlp": 1.03941512, + "epoch": 0.13347362092289192, + "flos": 24133335511680.0, + "grad_norm": 3.383042318708491, + "language_loss": 0.77201319, + "learning_rate": 3.888733954497574e-06, + "loss": 0.79439193, + "num_input_tokens_seen": 48024965, + "router_z_loss_clip": 1.14648438, + "router_z_loss_mlp": 0.19555664, + "step": 2220, + "time_per_iteration": 2.4741194248199463 + }, + { + "auxiliary_loss_clip": 0.0119634, + "auxiliary_loss_mlp": 0.01048457, + "balance_loss_clip": 1.07949972, + "balance_loss_mlp": 1.02950239, + "epoch": 0.1335337441755599, + "flos": 18436538390400.0, + "grad_norm": 2.681270520555751, + "language_loss": 0.79000598, + "learning_rate": 3.888605827226212e-06, + "loss": 0.81245399, + "num_input_tokens_seen": 48040890, + "router_z_loss_clip": 1.16796875, + "router_z_loss_mlp": 0.18945312, + "step": 2221, + "time_per_iteration": 2.4581611156463623 + }, + { + "auxiliary_loss_clip": 0.01094795, + "auxiliary_loss_mlp": 0.01033732, + "balance_loss_clip": 1.05381858, + "balance_loss_mlp": 1.03065681, + "epoch": 0.13359386742822787, + "flos": 50611997652480.0, + "grad_norm": 0.9837352835715255, + "language_loss": 0.68981725, + "learning_rate": 3.8884776283387275e-06, + "loss": 0.71110249, + "num_input_tokens_seen": 48091855, + "router_z_loss_clip": 0.40966797, + "router_z_loss_mlp": 0.03070068, + "step": 2222, + "time_per_iteration": 2.9413657188415527 + }, + { + "auxiliary_loss_clip": 0.0117174, + "auxiliary_loss_mlp": 0.01046639, + "balance_loss_clip": 1.06022096, + "balance_loss_mlp": 1.02725589, + "epoch": 0.13365399068089584, + "flos": 22778569221120.0, + "grad_norm": 3.14126609430937, + "language_loss": 0.67724305, + "learning_rate": 3.888349357839982e-06, + "loss": 0.69942689, + "num_input_tokens_seen": 48111350, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.19384766, + "step": 2223, + "time_per_iteration": 2.52824330329895 + }, + { + "auxiliary_loss_clip": 0.01168567, + "auxiliary_loss_mlp": 0.01053307, + "balance_loss_clip": 1.05458474, + "balance_loss_mlp": 1.03169489, + "epoch": 0.1337141139335638, + "flos": 12531603911040.0, + "grad_norm": 3.2638504978663145, + "language_loss": 0.8187381, + "learning_rate": 3.88822101573484e-06, + "loss": 0.84095687, + "num_input_tokens_seen": 48129840, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.21606445, + "step": 2224, + "time_per_iteration": 2.442058801651001 + }, + { + "auxiliary_loss_clip": 0.0117305, + "auxiliary_loss_mlp": 0.01045486, + "balance_loss_clip": 1.05773246, + "balance_loss_mlp": 1.02379036, + "epoch": 0.13377423718623177, + "flos": 23038957889280.0, + "grad_norm": 2.309610223798871, + "language_loss": 0.65602422, + "learning_rate": 3.888092602028167e-06, + "loss": 0.67820966, + "num_input_tokens_seen": 48149240, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.21691895, + "step": 2225, + "time_per_iteration": 2.4729905128479004 + }, + { + "auxiliary_loss_clip": 0.0117398, + "auxiliary_loss_mlp": 0.0105454, + "balance_loss_clip": 1.05855691, + "balance_loss_mlp": 1.03312969, + "epoch": 0.13383436043889974, + "flos": 16216397637120.0, + "grad_norm": 2.3141342609407785, + "language_loss": 0.89955723, + "learning_rate": 3.887964116724835e-06, + "loss": 0.92184246, + "num_input_tokens_seen": 48166330, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.2142334, + "step": 2226, + "time_per_iteration": 2.4272687435150146 + }, + { + "auxiliary_loss_clip": 0.01180256, + "auxiliary_loss_mlp": 0.0104811, + "balance_loss_clip": 1.06502748, + "balance_loss_mlp": 1.0280354, + "epoch": 0.1338944836915677, + "flos": 24279671520000.0, + "grad_norm": 1.972515845626869, + "language_loss": 0.74049056, + "learning_rate": 3.887835559829712e-06, + "loss": 0.76277423, + "num_input_tokens_seen": 48187600, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.20080566, + "step": 2227, + "time_per_iteration": 2.5089192390441895 + }, + { + "auxiliary_loss_clip": 0.01167384, + "auxiliary_loss_mlp": 0.01052011, + "balance_loss_clip": 1.05479586, + "balance_loss_mlp": 1.03142357, + "epoch": 0.1339546069442357, + "flos": 17598742594560.0, + "grad_norm": 2.022758938006512, + "language_loss": 0.85369515, + "learning_rate": 3.8877069313476764e-06, + "loss": 0.87588912, + "num_input_tokens_seen": 48204400, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.20581055, + "step": 2228, + "time_per_iteration": 2.4257590770721436 + }, + { + "auxiliary_loss_clip": 0.01165252, + "auxiliary_loss_mlp": 0.01052838, + "balance_loss_clip": 1.0551008, + "balance_loss_mlp": 1.03127301, + "epoch": 0.13401473019690366, + "flos": 18990065952000.0, + "grad_norm": 2.2021049528150263, + "language_loss": 0.80921209, + "learning_rate": 3.8875782312836054e-06, + "loss": 0.831393, + "num_input_tokens_seen": 48222180, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.2154541, + "step": 2229, + "time_per_iteration": 2.454152822494507 + }, + { + "auxiliary_loss_clip": 0.01174955, + "auxiliary_loss_mlp": 0.01071844, + "balance_loss_clip": 1.06016922, + "balance_loss_mlp": 1.04919457, + "epoch": 0.13407485344957162, + "flos": 26943812288640.0, + "grad_norm": 2.4552014597731557, + "language_loss": 0.74465829, + "learning_rate": 3.887449459642378e-06, + "loss": 0.76712632, + "num_input_tokens_seen": 48243245, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.22619629, + "step": 2230, + "time_per_iteration": 2.4782748222351074 + }, + { + "auxiliary_loss_clip": 0.0118316, + "auxiliary_loss_mlp": 0.01059366, + "balance_loss_clip": 1.06633365, + "balance_loss_mlp": 1.03896928, + "epoch": 0.1341349767022396, + "flos": 20339373375360.0, + "grad_norm": 1.7749117926170794, + "language_loss": 0.80272287, + "learning_rate": 3.8873206164288785e-06, + "loss": 0.82514811, + "num_input_tokens_seen": 48262600, + "router_z_loss_clip": 1.16894531, + "router_z_loss_mlp": 0.20397949, + "step": 2231, + "time_per_iteration": 2.462587833404541 + }, + { + "auxiliary_loss_clip": 0.01174383, + "auxiliary_loss_mlp": 0.01074982, + "balance_loss_clip": 1.0597949, + "balance_loss_mlp": 1.05237961, + "epoch": 0.13419509995490755, + "flos": 29862020931840.0, + "grad_norm": 1.7558151685317187, + "language_loss": 0.72132492, + "learning_rate": 3.887191701647992e-06, + "loss": 0.74381852, + "num_input_tokens_seen": 48285075, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.22607422, + "step": 2232, + "time_per_iteration": 2.5010435581207275 + }, + { + "auxiliary_loss_clip": 0.01181333, + "auxiliary_loss_mlp": 0.01049589, + "balance_loss_clip": 1.06468129, + "balance_loss_mlp": 1.02816749, + "epoch": 0.13425522320757552, + "flos": 26942986275840.0, + "grad_norm": 4.024465999343383, + "language_loss": 0.65593874, + "learning_rate": 3.8870627153046066e-06, + "loss": 0.67824793, + "num_input_tokens_seen": 48301285, + "router_z_loss_clip": 1.16699219, + "router_z_loss_mlp": 0.21435547, + "step": 2233, + "time_per_iteration": 2.4898619651794434 + }, + { + "auxiliary_loss_clip": 0.01172148, + "auxiliary_loss_mlp": 0.01046692, + "balance_loss_clip": 1.05804193, + "balance_loss_mlp": 1.02586603, + "epoch": 0.1343153464602435, + "flos": 15777281871360.0, + "grad_norm": 2.865692898833471, + "language_loss": 0.81811017, + "learning_rate": 3.886933657403615e-06, + "loss": 0.84029853, + "num_input_tokens_seen": 48317835, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.20837402, + "step": 2234, + "time_per_iteration": 2.4309589862823486 + }, + { + "auxiliary_loss_clip": 0.01175681, + "auxiliary_loss_mlp": 0.0105413, + "balance_loss_clip": 1.06143939, + "balance_loss_mlp": 1.03323305, + "epoch": 0.13437546971291148, + "flos": 24314756129280.0, + "grad_norm": 1.9045192606235999, + "language_loss": 0.81927383, + "learning_rate": 3.886804527949909e-06, + "loss": 0.84157193, + "num_input_tokens_seen": 48335670, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.2088623, + "step": 2235, + "time_per_iteration": 3.918654441833496 + }, + { + "auxiliary_loss_clip": 0.01168374, + "auxiliary_loss_mlp": 0.01063092, + "balance_loss_clip": 1.05512834, + "balance_loss_mlp": 1.03957248, + "epoch": 0.13443559296557944, + "flos": 26650673395200.0, + "grad_norm": 1.6873372243328741, + "language_loss": 0.86340141, + "learning_rate": 3.8866753269483864e-06, + "loss": 0.88571614, + "num_input_tokens_seen": 48357805, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.23498535, + "step": 2236, + "time_per_iteration": 2.5461370944976807 + }, + { + "auxiliary_loss_clip": 0.01172562, + "auxiliary_loss_mlp": 0.01052691, + "balance_loss_clip": 1.05846858, + "balance_loss_mlp": 1.03113806, + "epoch": 0.1344957162182474, + "flos": 21796197183360.0, + "grad_norm": 2.304812545155749, + "language_loss": 0.769943, + "learning_rate": 3.886546054403946e-06, + "loss": 0.79219556, + "num_input_tokens_seen": 48377845, + "router_z_loss_clip": 1.14257812, + "router_z_loss_mlp": 0.21557617, + "step": 2237, + "time_per_iteration": 2.4959287643432617 + }, + { + "auxiliary_loss_clip": 0.01173633, + "auxiliary_loss_mlp": 0.01052711, + "balance_loss_clip": 1.05964792, + "balance_loss_mlp": 1.03039551, + "epoch": 0.13455583947091537, + "flos": 19865568049920.0, + "grad_norm": 1.9787225609695844, + "language_loss": 0.7866683, + "learning_rate": 3.886416710321491e-06, + "loss": 0.80893171, + "num_input_tokens_seen": 48394735, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.2232666, + "step": 2238, + "time_per_iteration": 2.4112415313720703 + }, + { + "auxiliary_loss_clip": 0.01170775, + "auxiliary_loss_mlp": 0.01051039, + "balance_loss_clip": 1.05857933, + "balance_loss_mlp": 1.02903295, + "epoch": 0.13461596272358334, + "flos": 30846835094400.0, + "grad_norm": 2.6241524634723596, + "language_loss": 0.6858564, + "learning_rate": 3.886287294705924e-06, + "loss": 0.70807457, + "num_input_tokens_seen": 48414200, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.22009277, + "step": 2239, + "time_per_iteration": 3.9000251293182373 + }, + { + "auxiliary_loss_clip": 0.01174467, + "auxiliary_loss_mlp": 0.01068397, + "balance_loss_clip": 1.05873454, + "balance_loss_mlp": 1.04534209, + "epoch": 0.1346760859762513, + "flos": 12494436312960.0, + "grad_norm": 2.150891662704823, + "language_loss": 0.81579185, + "learning_rate": 3.8861578075621555e-06, + "loss": 0.83822048, + "num_input_tokens_seen": 48431065, + "router_z_loss_clip": 1.15820312, + "router_z_loss_mlp": 0.23034668, + "step": 2240, + "time_per_iteration": 2.454096794128418 + }, + { + "auxiliary_loss_clip": 0.01175553, + "auxiliary_loss_mlp": 0.01051134, + "balance_loss_clip": 1.05970049, + "balance_loss_mlp": 1.03027248, + "epoch": 0.1347362092289193, + "flos": 21836022387840.0, + "grad_norm": 1.8787955529486846, + "language_loss": 0.77910638, + "learning_rate": 3.886028248895093e-06, + "loss": 0.80137324, + "num_input_tokens_seen": 48450335, + "router_z_loss_clip": 1.15820312, + "router_z_loss_mlp": 0.20874023, + "step": 2241, + "time_per_iteration": 2.43579363822937 + }, + { + "auxiliary_loss_clip": 0.01189076, + "auxiliary_loss_mlp": 0.01045085, + "balance_loss_clip": 1.07559669, + "balance_loss_mlp": 1.02638149, + "epoch": 0.13479633248158726, + "flos": 23509459163520.0, + "grad_norm": 1.6812686070016944, + "language_loss": 0.8307904, + "learning_rate": 3.88589861870965e-06, + "loss": 0.85313201, + "num_input_tokens_seen": 48468555, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.18713379, + "step": 2242, + "time_per_iteration": 2.4910802841186523 + }, + { + "auxiliary_loss_clip": 0.01183007, + "auxiliary_loss_mlp": 0.01055498, + "balance_loss_clip": 1.06687593, + "balance_loss_mlp": 1.0336585, + "epoch": 0.13485645573425523, + "flos": 29344332165120.0, + "grad_norm": 2.4817515610802556, + "language_loss": 0.64377689, + "learning_rate": 3.885768917010744e-06, + "loss": 0.66616195, + "num_input_tokens_seen": 48488515, + "router_z_loss_clip": 1.16015625, + "router_z_loss_mlp": 0.21850586, + "step": 2243, + "time_per_iteration": 5.379875421524048 + }, + { + "auxiliary_loss_clip": 0.01165057, + "auxiliary_loss_mlp": 0.01041769, + "balance_loss_clip": 1.05789757, + "balance_loss_mlp": 1.02183735, + "epoch": 0.1349165789869232, + "flos": 28037112503040.0, + "grad_norm": 1.5638330205265787, + "language_loss": 0.7261039, + "learning_rate": 3.8856391438032895e-06, + "loss": 0.74817216, + "num_input_tokens_seen": 48510515, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.19946289, + "step": 2244, + "time_per_iteration": 2.4782018661499023 + }, + { + "auxiliary_loss_clip": 0.0117256, + "auxiliary_loss_mlp": 0.01049171, + "balance_loss_clip": 1.06157851, + "balance_loss_mlp": 1.02996683, + "epoch": 0.13497670223959116, + "flos": 22853730430080.0, + "grad_norm": 2.1082708785557918, + "language_loss": 0.86391997, + "learning_rate": 3.88550929909221e-06, + "loss": 0.88613725, + "num_input_tokens_seen": 48529940, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.19213867, + "step": 2245, + "time_per_iteration": 2.47786808013916 + }, + { + "auxiliary_loss_clip": 0.01169061, + "auxiliary_loss_mlp": 0.01049104, + "balance_loss_clip": 1.06122041, + "balance_loss_mlp": 1.03001904, + "epoch": 0.13503682549225912, + "flos": 16504580453760.0, + "grad_norm": 2.9132763738228653, + "language_loss": 0.78969681, + "learning_rate": 3.88537938288243e-06, + "loss": 0.81187844, + "num_input_tokens_seen": 48548190, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.19067383, + "step": 2246, + "time_per_iteration": 2.4234395027160645 + }, + { + "auxiliary_loss_clip": 0.01109504, + "auxiliary_loss_mlp": 0.01011351, + "balance_loss_clip": 1.06764805, + "balance_loss_mlp": 1.00835311, + "epoch": 0.1350969487449271, + "flos": 70756303242240.0, + "grad_norm": 0.7517409282438079, + "language_loss": 0.60527349, + "learning_rate": 3.885249395178874e-06, + "loss": 0.62648201, + "num_input_tokens_seen": 48613165, + "router_z_loss_clip": 0.41845703, + "router_z_loss_mlp": 0.02999878, + "step": 2247, + "time_per_iteration": 3.162614583969116 + }, + { + "auxiliary_loss_clip": 0.01182016, + "auxiliary_loss_mlp": 0.01059041, + "balance_loss_clip": 1.06441832, + "balance_loss_mlp": 1.03633189, + "epoch": 0.13515707199759508, + "flos": 23075981832960.0, + "grad_norm": 2.0026382624138583, + "language_loss": 0.80833262, + "learning_rate": 3.885119335986473e-06, + "loss": 0.83074319, + "num_input_tokens_seen": 48631705, + "router_z_loss_clip": 1.17578125, + "router_z_loss_mlp": 0.22717285, + "step": 2248, + "time_per_iteration": 2.4624271392822266 + }, + { + "auxiliary_loss_clip": 0.01169424, + "auxiliary_loss_mlp": 0.01047115, + "balance_loss_clip": 1.06082547, + "balance_loss_mlp": 1.02674174, + "epoch": 0.13521719525026304, + "flos": 23186371305600.0, + "grad_norm": 1.9197946424830357, + "language_loss": 0.7692014, + "learning_rate": 3.884989205310157e-06, + "loss": 0.79136682, + "num_input_tokens_seen": 48649740, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.20373535, + "step": 2249, + "time_per_iteration": 2.4928476810455322 + }, + { + "auxiliary_loss_clip": 0.01177597, + "auxiliary_loss_mlp": 0.0105621, + "balance_loss_clip": 1.06609464, + "balance_loss_mlp": 1.03711236, + "epoch": 0.135277318502931, + "flos": 24790931752320.0, + "grad_norm": 1.4623334092487899, + "language_loss": 0.8461144, + "learning_rate": 3.884859003154862e-06, + "loss": 0.86845243, + "num_input_tokens_seen": 48671565, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.19104004, + "step": 2250, + "time_per_iteration": 2.4994618892669678 + }, + { + "auxiliary_loss_clip": 0.01171994, + "auxiliary_loss_mlp": 0.01048052, + "balance_loss_clip": 1.05990529, + "balance_loss_mlp": 1.02695203, + "epoch": 0.13533744175559898, + "flos": 21908525990400.0, + "grad_norm": 2.454513214139303, + "language_loss": 0.82254076, + "learning_rate": 3.884728729525524e-06, + "loss": 0.84474117, + "num_input_tokens_seen": 48690425, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.21105957, + "step": 2251, + "time_per_iteration": 2.4731833934783936 + }, + { + "auxiliary_loss_clip": 0.01177516, + "auxiliary_loss_mlp": 0.01055454, + "balance_loss_clip": 1.06282926, + "balance_loss_mlp": 1.033746, + "epoch": 0.13539756500826694, + "flos": 21211643249280.0, + "grad_norm": 1.8244515418484444, + "language_loss": 0.85804719, + "learning_rate": 3.884598384427084e-06, + "loss": 0.88037694, + "num_input_tokens_seen": 48707505, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.21740723, + "step": 2252, + "time_per_iteration": 2.5019538402557373 + }, + { + "auxiliary_loss_clip": 0.01082659, + "auxiliary_loss_mlp": 0.01068067, + "balance_loss_clip": 1.04148209, + "balance_loss_mlp": 1.06508422, + "epoch": 0.1354576882609349, + "flos": 63242103634560.0, + "grad_norm": 0.7716640202120212, + "language_loss": 0.61771309, + "learning_rate": 3.884467967864485e-06, + "loss": 0.63922036, + "num_input_tokens_seen": 48775895, + "router_z_loss_clip": 0.41162109, + "router_z_loss_mlp": 0.02984619, + "step": 2253, + "time_per_iteration": 3.187117338180542 + }, + { + "auxiliary_loss_clip": 0.01181159, + "auxiliary_loss_mlp": 0.0105754, + "balance_loss_clip": 1.06711078, + "balance_loss_mlp": 1.03812051, + "epoch": 0.1355178115136029, + "flos": 25483037984640.0, + "grad_norm": 2.22941695404117, + "language_loss": 0.89603555, + "learning_rate": 3.884337479842671e-06, + "loss": 0.91842246, + "num_input_tokens_seen": 48798370, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.1940918, + "step": 2254, + "time_per_iteration": 2.5658533573150635 + }, + { + "auxiliary_loss_clip": 0.01175285, + "auxiliary_loss_mlp": 0.01054591, + "balance_loss_clip": 1.0593555, + "balance_loss_mlp": 1.03200138, + "epoch": 0.13557793476627086, + "flos": 21616967295360.0, + "grad_norm": 4.955466177346869, + "language_loss": 0.84534311, + "learning_rate": 3.884206920366591e-06, + "loss": 0.86764193, + "num_input_tokens_seen": 48817955, + "router_z_loss_clip": 1.15820312, + "router_z_loss_mlp": 0.22595215, + "step": 2255, + "time_per_iteration": 2.475494384765625 + }, + { + "auxiliary_loss_clip": 0.01174637, + "auxiliary_loss_mlp": 0.01053404, + "balance_loss_clip": 1.06286681, + "balance_loss_mlp": 1.03332865, + "epoch": 0.13563805801893883, + "flos": 24928253447040.0, + "grad_norm": 2.4737286550338884, + "language_loss": 0.74919009, + "learning_rate": 3.884076289441196e-06, + "loss": 0.77147055, + "num_input_tokens_seen": 48836330, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.20068359, + "step": 2256, + "time_per_iteration": 2.464465618133545 + }, + { + "auxiliary_loss_clip": 0.01187852, + "auxiliary_loss_mlp": 0.01055615, + "balance_loss_clip": 1.06815648, + "balance_loss_mlp": 1.034217, + "epoch": 0.1356981812716068, + "flos": 14750272206720.0, + "grad_norm": 2.0283268878369363, + "language_loss": 0.83294761, + "learning_rate": 3.88394558707144e-06, + "loss": 0.85538232, + "num_input_tokens_seen": 48851890, + "router_z_loss_clip": 1.19824219, + "router_z_loss_mlp": 0.21398926, + "step": 2257, + "time_per_iteration": 2.4574244022369385 + }, + { + "auxiliary_loss_clip": 0.01177445, + "auxiliary_loss_mlp": 0.01048892, + "balance_loss_clip": 1.05822468, + "balance_loss_mlp": 1.02633715, + "epoch": 0.13575830452427476, + "flos": 11108571822720.0, + "grad_norm": 4.288261457311049, + "language_loss": 0.82135403, + "learning_rate": 3.883814813262277e-06, + "loss": 0.84361738, + "num_input_tokens_seen": 48865510, + "router_z_loss_clip": 1.19042969, + "router_z_loss_mlp": 0.22570801, + "step": 2258, + "time_per_iteration": 2.393380641937256 + }, + { + "auxiliary_loss_clip": 0.01169992, + "auxiliary_loss_mlp": 0.01056859, + "balance_loss_clip": 1.05593204, + "balance_loss_mlp": 1.03376818, + "epoch": 0.13581842777694272, + "flos": 17960290940160.0, + "grad_norm": 2.2149513897913016, + "language_loss": 0.82858723, + "learning_rate": 3.883683968018669e-06, + "loss": 0.85085571, + "num_input_tokens_seen": 48882360, + "router_z_loss_clip": 1.14257812, + "router_z_loss_mlp": 0.2310791, + "step": 2259, + "time_per_iteration": 2.4866397380828857 + }, + { + "auxiliary_loss_clip": 0.01167002, + "auxiliary_loss_mlp": 0.01055912, + "balance_loss_clip": 1.05566812, + "balance_loss_mlp": 1.03714871, + "epoch": 0.1358785510296107, + "flos": 22857142222080.0, + "grad_norm": 1.9384755397045423, + "language_loss": 0.73620367, + "learning_rate": 3.8835530513455755e-06, + "loss": 0.75843275, + "num_input_tokens_seen": 48902700, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.1875, + "step": 2260, + "time_per_iteration": 2.474548816680908 + }, + { + "auxiliary_loss_clip": 0.01169138, + "auxiliary_loss_mlp": 0.01055649, + "balance_loss_clip": 1.05685258, + "balance_loss_mlp": 1.03580034, + "epoch": 0.13593867428227868, + "flos": 25739404329600.0, + "grad_norm": 2.785771986096579, + "language_loss": 0.75067115, + "learning_rate": 3.883422063247961e-06, + "loss": 0.77291894, + "num_input_tokens_seen": 48922525, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.1986084, + "step": 2261, + "time_per_iteration": 2.600968360900879 + }, + { + "auxiliary_loss_clip": 0.01167182, + "auxiliary_loss_mlp": 0.01049723, + "balance_loss_clip": 1.05335951, + "balance_loss_mlp": 1.02908778, + "epoch": 0.13599879753494665, + "flos": 31249214225280.0, + "grad_norm": 5.221374440312557, + "language_loss": 0.63188803, + "learning_rate": 3.883291003730794e-06, + "loss": 0.65405703, + "num_input_tokens_seen": 48942510, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.20629883, + "step": 2262, + "time_per_iteration": 2.5496017932891846 + }, + { + "auxiliary_loss_clip": 0.01168589, + "auxiliary_loss_mlp": 0.01049797, + "balance_loss_clip": 1.05551755, + "balance_loss_mlp": 1.02863717, + "epoch": 0.1360589207876146, + "flos": 23915034604800.0, + "grad_norm": 2.422167140122083, + "language_loss": 0.82668871, + "learning_rate": 3.883159872799043e-06, + "loss": 0.84887254, + "num_input_tokens_seen": 48962625, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.21166992, + "step": 2263, + "time_per_iteration": 2.4953250885009766 + }, + { + "auxiliary_loss_clip": 0.0118311, + "auxiliary_loss_mlp": 0.01059632, + "balance_loss_clip": 1.06631494, + "balance_loss_mlp": 1.03722119, + "epoch": 0.13611904404028258, + "flos": 19974197756160.0, + "grad_norm": 1.9148649701417342, + "language_loss": 0.87446225, + "learning_rate": 3.8830286704576815e-06, + "loss": 0.89688969, + "num_input_tokens_seen": 48982525, + "router_z_loss_clip": 1.16796875, + "router_z_loss_mlp": 0.22424316, + "step": 2264, + "time_per_iteration": 2.441588878631592 + }, + { + "auxiliary_loss_clip": 0.0117545, + "auxiliary_loss_mlp": 0.01049206, + "balance_loss_clip": 1.060233, + "balance_loss_mlp": 1.02753425, + "epoch": 0.13617916729295054, + "flos": 15340644144000.0, + "grad_norm": 3.095899794330901, + "language_loss": 0.71239609, + "learning_rate": 3.882897396711683e-06, + "loss": 0.73464262, + "num_input_tokens_seen": 48997605, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.2166748, + "step": 2265, + "time_per_iteration": 2.495781660079956 + }, + { + "auxiliary_loss_clip": 0.01172174, + "auxiliary_loss_mlp": 0.01043079, + "balance_loss_clip": 1.06218565, + "balance_loss_mlp": 1.02343297, + "epoch": 0.1362392905456185, + "flos": 27451445247360.0, + "grad_norm": 2.7211224106895813, + "language_loss": 0.66853893, + "learning_rate": 3.882766051566027e-06, + "loss": 0.69069141, + "num_input_tokens_seen": 49018535, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.1965332, + "step": 2266, + "time_per_iteration": 2.49505615234375 + }, + { + "auxiliary_loss_clip": 0.01169722, + "auxiliary_loss_mlp": 0.01069183, + "balance_loss_clip": 1.05911016, + "balance_loss_mlp": 1.04842854, + "epoch": 0.1362994137982865, + "flos": 25009017177600.0, + "grad_norm": 1.8653583398744857, + "language_loss": 0.7658627, + "learning_rate": 3.882634635025694e-06, + "loss": 0.78825176, + "num_input_tokens_seen": 49038865, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.20739746, + "step": 2267, + "time_per_iteration": 2.481165885925293 + }, + { + "auxiliary_loss_clip": 0.01172725, + "auxiliary_loss_mlp": 0.01049973, + "balance_loss_clip": 1.06088257, + "balance_loss_mlp": 1.02989841, + "epoch": 0.13635953705095447, + "flos": 20303031790080.0, + "grad_norm": 1.9354366985751439, + "language_loss": 0.82378799, + "learning_rate": 3.882503147095667e-06, + "loss": 0.84601498, + "num_input_tokens_seen": 49058010, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.20092773, + "step": 2268, + "time_per_iteration": 2.414043664932251 + }, + { + "auxiliary_loss_clip": 0.01176781, + "auxiliary_loss_mlp": 0.01042421, + "balance_loss_clip": 1.06469011, + "balance_loss_mlp": 1.02229857, + "epoch": 0.13641966030362243, + "flos": 31358418549120.0, + "grad_norm": 1.7989462648142158, + "language_loss": 0.76429433, + "learning_rate": 3.882371587780931e-06, + "loss": 0.78648639, + "num_input_tokens_seen": 49080330, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.20117188, + "step": 2269, + "time_per_iteration": 2.58100962638855 + }, + { + "auxiliary_loss_clip": 0.01186368, + "auxiliary_loss_mlp": 0.01046061, + "balance_loss_clip": 1.07289922, + "balance_loss_mlp": 1.02636755, + "epoch": 0.1364797835562904, + "flos": 20478095700480.0, + "grad_norm": 2.070949563158691, + "language_loss": 0.81510949, + "learning_rate": 3.882239957086477e-06, + "loss": 0.83743376, + "num_input_tokens_seen": 49097035, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.19689941, + "step": 2270, + "time_per_iteration": 2.433173656463623 + }, + { + "auxiliary_loss_clip": 0.01175221, + "auxiliary_loss_mlp": 0.01058027, + "balance_loss_clip": 1.0604378, + "balance_loss_mlp": 1.0375824, + "epoch": 0.13653990680895836, + "flos": 13078343802240.0, + "grad_norm": 3.2509328837925295, + "language_loss": 0.75711673, + "learning_rate": 3.882108255017295e-06, + "loss": 0.77944916, + "num_input_tokens_seen": 49113945, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.20446777, + "step": 2271, + "time_per_iteration": 2.4948959350585938 + }, + { + "auxiliary_loss_clip": 0.01168603, + "auxiliary_loss_mlp": 0.01056027, + "balance_loss_clip": 1.05617332, + "balance_loss_mlp": 1.03497422, + "epoch": 0.13660003006162633, + "flos": 16946712961920.0, + "grad_norm": 2.949099096486864, + "language_loss": 0.80542511, + "learning_rate": 3.881976481578379e-06, + "loss": 0.82767141, + "num_input_tokens_seen": 49132855, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.21057129, + "step": 2272, + "time_per_iteration": 2.4014575481414795 + }, + { + "auxiliary_loss_clip": 0.01084197, + "auxiliary_loss_mlp": 0.01023123, + "balance_loss_clip": 1.04258585, + "balance_loss_mlp": 1.01999664, + "epoch": 0.1366601533142943, + "flos": 68682749892480.0, + "grad_norm": 0.6891843927463582, + "language_loss": 0.60657811, + "learning_rate": 3.8818446367747255e-06, + "loss": 0.62765133, + "num_input_tokens_seen": 49198310, + "router_z_loss_clip": 0.41503906, + "router_z_loss_mlp": 0.03125, + "step": 2273, + "time_per_iteration": 3.1628262996673584 + }, + { + "auxiliary_loss_clip": 0.01168778, + "auxiliary_loss_mlp": 0.01044131, + "balance_loss_clip": 1.05768979, + "balance_loss_mlp": 1.02447295, + "epoch": 0.13672027656696228, + "flos": 19244241567360.0, + "grad_norm": 1.8723769056733048, + "language_loss": 0.77775013, + "learning_rate": 3.881712720611336e-06, + "loss": 0.79987919, + "num_input_tokens_seen": 49217250, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.19665527, + "step": 2274, + "time_per_iteration": 2.4422953128814697 + }, + { + "auxiliary_loss_clip": 0.01171949, + "auxiliary_loss_mlp": 0.01047847, + "balance_loss_clip": 1.05934179, + "balance_loss_mlp": 1.02650881, + "epoch": 0.13678039981963025, + "flos": 24534924543360.0, + "grad_norm": 2.130563391385945, + "language_loss": 0.78631157, + "learning_rate": 3.881580733093211e-06, + "loss": 0.80850947, + "num_input_tokens_seen": 49236615, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.21325684, + "step": 2275, + "time_per_iteration": 2.5416958332061768 + }, + { + "auxiliary_loss_clip": 0.0118064, + "auxiliary_loss_mlp": 0.01041293, + "balance_loss_clip": 1.06804216, + "balance_loss_mlp": 1.02207673, + "epoch": 0.13684052307229821, + "flos": 15669334523520.0, + "grad_norm": 2.299159312073731, + "language_loss": 0.81726301, + "learning_rate": 3.881448674225356e-06, + "loss": 0.83948237, + "num_input_tokens_seen": 49253935, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.19226074, + "step": 2276, + "time_per_iteration": 2.4278156757354736 + }, + { + "auxiliary_loss_clip": 0.01178674, + "auxiliary_loss_mlp": 0.0106101, + "balance_loss_clip": 1.06202006, + "balance_loss_mlp": 1.03844368, + "epoch": 0.13690064632496618, + "flos": 28364689560960.0, + "grad_norm": 2.3345170415870315, + "language_loss": 0.69214499, + "learning_rate": 3.881316544012779e-06, + "loss": 0.71454179, + "num_input_tokens_seen": 49273605, + "router_z_loss_clip": 1.16796875, + "router_z_loss_mlp": 0.22570801, + "step": 2277, + "time_per_iteration": 2.5168166160583496 + }, + { + "auxiliary_loss_clip": 0.01174679, + "auxiliary_loss_mlp": 0.01055586, + "balance_loss_clip": 1.05939949, + "balance_loss_mlp": 1.03475976, + "epoch": 0.13696076957763414, + "flos": 23404779953280.0, + "grad_norm": 2.0692961294040413, + "language_loss": 0.80380177, + "learning_rate": 3.88118434246049e-06, + "loss": 0.8261044, + "num_input_tokens_seen": 49291785, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.20837402, + "step": 2278, + "time_per_iteration": 2.454125165939331 + }, + { + "auxiliary_loss_clip": 0.01173055, + "auxiliary_loss_mlp": 0.01061864, + "balance_loss_clip": 1.06007504, + "balance_loss_mlp": 1.04146719, + "epoch": 0.1370208928303021, + "flos": 37196595601920.0, + "grad_norm": 5.606817429924498, + "language_loss": 0.75656015, + "learning_rate": 3.881052069573502e-06, + "loss": 0.77890933, + "num_input_tokens_seen": 49311405, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.20397949, + "step": 2279, + "time_per_iteration": 4.024616003036499 + }, + { + "auxiliary_loss_clip": 0.0118097, + "auxiliary_loss_mlp": 0.01065612, + "balance_loss_clip": 1.06444132, + "balance_loss_mlp": 1.04528666, + "epoch": 0.13708101608297008, + "flos": 26976311118720.0, + "grad_norm": 2.8552351304027694, + "language_loss": 0.7675845, + "learning_rate": 3.880919725356831e-06, + "loss": 0.79005033, + "num_input_tokens_seen": 49331835, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.20324707, + "step": 2280, + "time_per_iteration": 2.5494532585144043 + }, + { + "auxiliary_loss_clip": 0.01163864, + "auxiliary_loss_mlp": 0.01052369, + "balance_loss_clip": 1.05538094, + "balance_loss_mlp": 1.03361726, + "epoch": 0.13714113933563807, + "flos": 32556864850560.0, + "grad_norm": 1.7945424247343214, + "language_loss": 0.8000493, + "learning_rate": 3.880787309815496e-06, + "loss": 0.82221162, + "num_input_tokens_seen": 49352290, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.1875, + "step": 2281, + "time_per_iteration": 2.541611909866333 + }, + { + "auxiliary_loss_clip": 0.01176371, + "auxiliary_loss_mlp": 0.0107368, + "balance_loss_clip": 1.06084752, + "balance_loss_mlp": 1.0533545, + "epoch": 0.13720126258830603, + "flos": 16101267569280.0, + "grad_norm": 4.582668083821148, + "language_loss": 0.83734387, + "learning_rate": 3.880654822954518e-06, + "loss": 0.85984439, + "num_input_tokens_seen": 49370285, + "router_z_loss_clip": 1.15527344, + "router_z_loss_mlp": 0.20336914, + "step": 2282, + "time_per_iteration": 3.865480899810791 + }, + { + "auxiliary_loss_clip": 0.01172061, + "auxiliary_loss_mlp": 0.01068009, + "balance_loss_clip": 1.05902433, + "balance_loss_mlp": 1.04866076, + "epoch": 0.137261385840974, + "flos": 18953544798720.0, + "grad_norm": 1.5908301085848564, + "language_loss": 0.74032259, + "learning_rate": 3.8805222647789195e-06, + "loss": 0.76272333, + "num_input_tokens_seen": 49389610, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.19335938, + "step": 2283, + "time_per_iteration": 2.5121452808380127 + }, + { + "auxiliary_loss_clip": 0.01177644, + "auxiliary_loss_mlp": 0.01055485, + "balance_loss_clip": 1.06675422, + "balance_loss_mlp": 1.03618526, + "epoch": 0.13732150909364196, + "flos": 23295360147840.0, + "grad_norm": 2.460659352855589, + "language_loss": 0.83862829, + "learning_rate": 3.880389635293729e-06, + "loss": 0.86095965, + "num_input_tokens_seen": 49408390, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.19299316, + "step": 2284, + "time_per_iteration": 2.5200510025024414 + }, + { + "auxiliary_loss_clip": 0.01180423, + "auxiliary_loss_mlp": 0.0105754, + "balance_loss_clip": 1.06316519, + "balance_loss_mlp": 1.03607094, + "epoch": 0.13738163234630993, + "flos": 29351263489920.0, + "grad_norm": 1.7653633584160593, + "language_loss": 0.75167906, + "learning_rate": 3.880256934503974e-06, + "loss": 0.7740587, + "num_input_tokens_seen": 49427725, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.21472168, + "step": 2285, + "time_per_iteration": 2.537308692932129 + }, + { + "auxiliary_loss_clip": 0.01160618, + "auxiliary_loss_mlp": 0.01053726, + "balance_loss_clip": 1.05303347, + "balance_loss_mlp": 1.03485501, + "epoch": 0.1374417555989779, + "flos": 26651319840000.0, + "grad_norm": 1.929396187983727, + "language_loss": 0.74709362, + "learning_rate": 3.880124162414689e-06, + "loss": 0.76923704, + "num_input_tokens_seen": 49449000, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.1887207, + "step": 2286, + "time_per_iteration": 5.388335704803467 + }, + { + "auxiliary_loss_clip": 0.01168939, + "auxiliary_loss_mlp": 0.01050626, + "balance_loss_clip": 1.05538762, + "balance_loss_mlp": 1.029037, + "epoch": 0.1375018788516459, + "flos": 28403401443840.0, + "grad_norm": 2.328064265041781, + "language_loss": 0.86662006, + "learning_rate": 3.879991319030908e-06, + "loss": 0.8888157, + "num_input_tokens_seen": 49468360, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.21582031, + "step": 2287, + "time_per_iteration": 2.529198408126831 + }, + { + "auxiliary_loss_clip": 0.01170946, + "auxiliary_loss_mlp": 0.01046703, + "balance_loss_clip": 1.05775905, + "balance_loss_mlp": 1.02736735, + "epoch": 0.13756200210431385, + "flos": 37413783187200.0, + "grad_norm": 1.7645316172243315, + "language_loss": 0.68450707, + "learning_rate": 3.879858404357666e-06, + "loss": 0.70668358, + "num_input_tokens_seen": 49493450, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.1932373, + "step": 2288, + "time_per_iteration": 2.6110973358154297 + }, + { + "auxiliary_loss_clip": 0.01168363, + "auxiliary_loss_mlp": 0.01054902, + "balance_loss_clip": 1.05686975, + "balance_loss_mlp": 1.03444576, + "epoch": 0.13762212535698182, + "flos": 22711021695360.0, + "grad_norm": 6.240974649445005, + "language_loss": 0.87110114, + "learning_rate": 3.879725418400005e-06, + "loss": 0.89333379, + "num_input_tokens_seen": 49511220, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.2043457, + "step": 2289, + "time_per_iteration": 2.433483839035034 + }, + { + "auxiliary_loss_clip": 0.01163045, + "auxiliary_loss_mlp": 0.0105114, + "balance_loss_clip": 1.052881, + "balance_loss_mlp": 1.03192401, + "epoch": 0.13768224860964978, + "flos": 23952130375680.0, + "grad_norm": 2.0241533387393256, + "language_loss": 0.7430256, + "learning_rate": 3.879592361162969e-06, + "loss": 0.76516742, + "num_input_tokens_seen": 49529820, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.19213867, + "step": 2290, + "time_per_iteration": 2.5005133152008057 + }, + { + "auxiliary_loss_clip": 0.01082778, + "auxiliary_loss_mlp": 0.01015274, + "balance_loss_clip": 1.04294133, + "balance_loss_mlp": 1.01171517, + "epoch": 0.13774237186231775, + "flos": 63590438753280.0, + "grad_norm": 0.7068131162030653, + "language_loss": 0.51623589, + "learning_rate": 3.8794592326516015e-06, + "loss": 0.53721637, + "num_input_tokens_seen": 49595325, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 0.0355835, + "step": 2291, + "time_per_iteration": 3.1368250846862793 + }, + { + "auxiliary_loss_clip": 0.01166745, + "auxiliary_loss_mlp": 0.01049367, + "balance_loss_clip": 1.05569947, + "balance_loss_mlp": 1.02913702, + "epoch": 0.1378024951149857, + "flos": 24279456038400.0, + "grad_norm": 1.8563845707079334, + "language_loss": 0.70985425, + "learning_rate": 3.879326032870952e-06, + "loss": 0.73201537, + "num_input_tokens_seen": 49615850, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.20227051, + "step": 2292, + "time_per_iteration": 2.46901273727417 + }, + { + "auxiliary_loss_clip": 0.01164239, + "auxiliary_loss_mlp": 0.01044124, + "balance_loss_clip": 1.05428326, + "balance_loss_mlp": 1.02465725, + "epoch": 0.13786261836765368, + "flos": 14021537080320.0, + "grad_norm": 2.74347187295306, + "language_loss": 0.80320197, + "learning_rate": 3.879192761826071e-06, + "loss": 0.82528567, + "num_input_tokens_seen": 49631860, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.19458008, + "step": 2293, + "time_per_iteration": 2.4206438064575195 + }, + { + "auxiliary_loss_clip": 0.01167481, + "auxiliary_loss_mlp": 0.01047414, + "balance_loss_clip": 1.05361509, + "balance_loss_mlp": 1.02726746, + "epoch": 0.13792274162032167, + "flos": 28878679226880.0, + "grad_norm": 2.3803830732098588, + "language_loss": 0.78743577, + "learning_rate": 3.879059419522011e-06, + "loss": 0.80958468, + "num_input_tokens_seen": 49652145, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.20141602, + "step": 2294, + "time_per_iteration": 2.5287961959838867 + }, + { + "auxiliary_loss_clip": 0.01169532, + "auxiliary_loss_mlp": 0.01045932, + "balance_loss_clip": 1.05935216, + "balance_loss_mlp": 1.02755022, + "epoch": 0.13798286487298964, + "flos": 21141150808320.0, + "grad_norm": 2.5030454859929514, + "language_loss": 0.79804504, + "learning_rate": 3.878926005963831e-06, + "loss": 0.82019967, + "num_input_tokens_seen": 49669880, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.18383789, + "step": 2295, + "time_per_iteration": 2.4178051948547363 + }, + { + "auxiliary_loss_clip": 0.01169117, + "auxiliary_loss_mlp": 0.01048116, + "balance_loss_clip": 1.05634189, + "balance_loss_mlp": 1.02756476, + "epoch": 0.1380429881256576, + "flos": 22487477402880.0, + "grad_norm": 1.9043056104333613, + "language_loss": 0.78354573, + "learning_rate": 3.878792521156588e-06, + "loss": 0.80571806, + "num_input_tokens_seen": 49687255, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.20544434, + "step": 2296, + "time_per_iteration": 2.447441339492798 + }, + { + "auxiliary_loss_clip": 0.01169719, + "auxiliary_loss_mlp": 0.0106693, + "balance_loss_clip": 1.05860901, + "balance_loss_mlp": 1.04575884, + "epoch": 0.13810311137832557, + "flos": 21393674398080.0, + "grad_norm": 2.1393238699894472, + "language_loss": 0.78905761, + "learning_rate": 3.8786589651053446e-06, + "loss": 0.81142414, + "num_input_tokens_seen": 49706650, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.21179199, + "step": 2297, + "time_per_iteration": 2.434401750564575 + }, + { + "auxiliary_loss_clip": 0.0115976, + "auxiliary_loss_mlp": 0.01050574, + "balance_loss_clip": 1.05181253, + "balance_loss_mlp": 1.03127408, + "epoch": 0.13816323463099353, + "flos": 25989844930560.0, + "grad_norm": 2.296489354762841, + "language_loss": 0.6916312, + "learning_rate": 3.878525337815164e-06, + "loss": 0.71373457, + "num_input_tokens_seen": 49725715, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.19299316, + "step": 2298, + "time_per_iteration": 2.5045506954193115 + }, + { + "auxiliary_loss_clip": 0.01174547, + "auxiliary_loss_mlp": 0.01053048, + "balance_loss_clip": 1.0609535, + "balance_loss_mlp": 1.03311574, + "epoch": 0.1382233578836615, + "flos": 19244313394560.0, + "grad_norm": 1.9361254003982462, + "language_loss": 0.86767477, + "learning_rate": 3.878391639291116e-06, + "loss": 0.88995075, + "num_input_tokens_seen": 49744710, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.19946289, + "step": 2299, + "time_per_iteration": 2.4286367893218994 + }, + { + "auxiliary_loss_clip": 0.01170287, + "auxiliary_loss_mlp": 0.01058037, + "balance_loss_clip": 1.05652094, + "balance_loss_mlp": 1.03626919, + "epoch": 0.1382834811363295, + "flos": 25666290195840.0, + "grad_norm": 2.4966882937832193, + "language_loss": 0.75577193, + "learning_rate": 3.878257869538267e-06, + "loss": 0.77805519, + "num_input_tokens_seen": 49764300, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.21765137, + "step": 2300, + "time_per_iteration": 2.494251251220703 + }, + { + "auxiliary_loss_clip": 0.01167802, + "auxiliary_loss_mlp": 0.01052653, + "balance_loss_clip": 1.05713987, + "balance_loss_mlp": 1.03099275, + "epoch": 0.13834360438899745, + "flos": 19784193788160.0, + "grad_norm": 2.9091615584565322, + "language_loss": 0.82259226, + "learning_rate": 3.878124028561692e-06, + "loss": 0.84479684, + "num_input_tokens_seen": 49778380, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.2166748, + "step": 2301, + "time_per_iteration": 2.476658582687378 + }, + { + "auxiliary_loss_clip": 0.01159987, + "auxiliary_loss_mlp": 0.01048883, + "balance_loss_clip": 1.05211854, + "balance_loss_mlp": 1.0290823, + "epoch": 0.13840372764166542, + "flos": 26651858544000.0, + "grad_norm": 1.9350618944001061, + "language_loss": 0.85560775, + "learning_rate": 3.877990116366466e-06, + "loss": 0.87769651, + "num_input_tokens_seen": 49797460, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.19787598, + "step": 2302, + "time_per_iteration": 2.5903267860412598 + }, + { + "auxiliary_loss_clip": 0.01077914, + "auxiliary_loss_mlp": 0.01014788, + "balance_loss_clip": 1.0379988, + "balance_loss_mlp": 1.01041353, + "epoch": 0.13846385089433338, + "flos": 70510998286080.0, + "grad_norm": 0.7530891069510158, + "language_loss": 0.65621376, + "learning_rate": 3.877856132957667e-06, + "loss": 0.67714083, + "num_input_tokens_seen": 49868005, + "router_z_loss_clip": 0.39892578, + "router_z_loss_mlp": 0.04376221, + "step": 2303, + "time_per_iteration": 3.194581985473633 + }, + { + "auxiliary_loss_clip": 0.01162941, + "auxiliary_loss_mlp": 0.01047988, + "balance_loss_clip": 1.05415905, + "balance_loss_mlp": 1.02651834, + "epoch": 0.13852397414700135, + "flos": 17348732956800.0, + "grad_norm": 1.7859808535244561, + "language_loss": 0.78464663, + "learning_rate": 3.877722078340374e-06, + "loss": 0.8067559, + "num_input_tokens_seen": 49885825, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.21472168, + "step": 2304, + "time_per_iteration": 2.4605047702789307 + }, + { + "auxiliary_loss_clip": 0.01172336, + "auxiliary_loss_mlp": 0.01044196, + "balance_loss_clip": 1.05938995, + "balance_loss_mlp": 1.02453852, + "epoch": 0.13858409739966931, + "flos": 21543781334400.0, + "grad_norm": 1.7605499502833926, + "language_loss": 0.77983803, + "learning_rate": 3.877587952519672e-06, + "loss": 0.80200338, + "num_input_tokens_seen": 49905975, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.19665527, + "step": 2305, + "time_per_iteration": 2.475146532058716 + }, + { + "auxiliary_loss_clip": 0.01159304, + "auxiliary_loss_mlp": 0.01048533, + "balance_loss_clip": 1.05287492, + "balance_loss_mlp": 1.02941167, + "epoch": 0.13864422065233728, + "flos": 21579907438080.0, + "grad_norm": 4.7902185241306485, + "language_loss": 0.87750453, + "learning_rate": 3.877453755500647e-06, + "loss": 0.89958292, + "num_input_tokens_seen": 49925800, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.19140625, + "step": 2306, + "time_per_iteration": 2.5001425743103027 + }, + { + "auxiliary_loss_clip": 0.01080758, + "auxiliary_loss_mlp": 0.0101125, + "balance_loss_clip": 1.04107988, + "balance_loss_mlp": 1.00711894, + "epoch": 0.13870434390500527, + "flos": 53371156872960.0, + "grad_norm": 0.8861501672819871, + "language_loss": 0.59056103, + "learning_rate": 3.877319487288387e-06, + "loss": 0.61148113, + "num_input_tokens_seen": 49977620, + "router_z_loss_clip": 0.39599609, + "router_z_loss_mlp": 0.0413208, + "step": 2307, + "time_per_iteration": 3.1093201637268066 + }, + { + "auxiliary_loss_clip": 0.01176274, + "auxiliary_loss_mlp": 0.01055585, + "balance_loss_clip": 1.05952907, + "balance_loss_mlp": 1.03394866, + "epoch": 0.13876446715767324, + "flos": 22565906749440.0, + "grad_norm": 1.9315205466937115, + "language_loss": 0.79511034, + "learning_rate": 3.877185147887984e-06, + "loss": 0.81742895, + "num_input_tokens_seen": 49996650, + "router_z_loss_clip": 1.16894531, + "router_z_loss_mlp": 0.21643066, + "step": 2308, + "time_per_iteration": 2.4930100440979004 + }, + { + "auxiliary_loss_clip": 0.0116708, + "auxiliary_loss_mlp": 0.010453, + "balance_loss_clip": 1.05736744, + "balance_loss_mlp": 1.02545166, + "epoch": 0.1388245904103412, + "flos": 20705231352960.0, + "grad_norm": 2.0925286315155094, + "language_loss": 0.78736186, + "learning_rate": 3.877050737304533e-06, + "loss": 0.80948567, + "num_input_tokens_seen": 50015640, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.19836426, + "step": 2309, + "time_per_iteration": 2.4444901943206787 + }, + { + "auxiliary_loss_clip": 0.01170679, + "auxiliary_loss_mlp": 0.01053776, + "balance_loss_clip": 1.05687296, + "balance_loss_mlp": 1.03208017, + "epoch": 0.13888471366300917, + "flos": 20554729367040.0, + "grad_norm": 1.9836726194249894, + "language_loss": 0.6791712, + "learning_rate": 3.876916255543129e-06, + "loss": 0.70141578, + "num_input_tokens_seen": 50033500, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.21679688, + "step": 2310, + "time_per_iteration": 2.461958169937134 + }, + { + "auxiliary_loss_clip": 0.01177838, + "auxiliary_loss_mlp": 0.01051908, + "balance_loss_clip": 1.06585097, + "balance_loss_mlp": 1.03052139, + "epoch": 0.13894483691567713, + "flos": 13838033473920.0, + "grad_norm": 1.8705409970698, + "language_loss": 0.83811575, + "learning_rate": 3.8767817026088725e-06, + "loss": 0.86041319, + "num_input_tokens_seen": 50050075, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.21374512, + "step": 2311, + "time_per_iteration": 2.395962953567505 + }, + { + "auxiliary_loss_clip": 0.0117133, + "auxiliary_loss_mlp": 0.01053265, + "balance_loss_clip": 1.05719876, + "balance_loss_mlp": 1.03092492, + "epoch": 0.1390049601683451, + "flos": 28031186759040.0, + "grad_norm": 2.1239730925523643, + "language_loss": 0.82357818, + "learning_rate": 3.876647078506866e-06, + "loss": 0.84582406, + "num_input_tokens_seen": 50070080, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.22351074, + "step": 2312, + "time_per_iteration": 2.4839279651641846 + }, + { + "auxiliary_loss_clip": 0.01181064, + "auxiliary_loss_mlp": 0.01059766, + "balance_loss_clip": 1.06404591, + "balance_loss_mlp": 1.03815353, + "epoch": 0.13906508342101306, + "flos": 26756860976640.0, + "grad_norm": 2.0290120107535072, + "language_loss": 0.86969477, + "learning_rate": 3.876512383242215e-06, + "loss": 0.89210314, + "num_input_tokens_seen": 50090040, + "router_z_loss_clip": 1.16992188, + "router_z_loss_mlp": 0.21606445, + "step": 2313, + "time_per_iteration": 2.4643874168395996 + }, + { + "auxiliary_loss_clip": 0.01184007, + "auxiliary_loss_mlp": 0.01053941, + "balance_loss_clip": 1.06947601, + "balance_loss_mlp": 1.03272176, + "epoch": 0.13912520667368106, + "flos": 24535104111360.0, + "grad_norm": 1.974252159904429, + "language_loss": 0.79611105, + "learning_rate": 3.876377616820024e-06, + "loss": 0.81849051, + "num_input_tokens_seen": 50110595, + "router_z_loss_clip": 1.14648438, + "router_z_loss_mlp": 0.2121582, + "step": 2314, + "time_per_iteration": 2.4786503314971924 + }, + { + "auxiliary_loss_clip": 0.01172841, + "auxiliary_loss_mlp": 0.01046911, + "balance_loss_clip": 1.06185246, + "balance_loss_mlp": 1.02707422, + "epoch": 0.13918532992634902, + "flos": 19383215287680.0, + "grad_norm": 4.646954829550199, + "language_loss": 0.85554707, + "learning_rate": 3.876242779245409e-06, + "loss": 0.87774462, + "num_input_tokens_seen": 50125430, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.19824219, + "step": 2315, + "time_per_iteration": 2.4062557220458984 + }, + { + "auxiliary_loss_clip": 0.01175918, + "auxiliary_loss_mlp": 0.0105585, + "balance_loss_clip": 1.06171906, + "balance_loss_mlp": 1.03453541, + "epoch": 0.139245453179017, + "flos": 21323756574720.0, + "grad_norm": 15.429525783683466, + "language_loss": 0.77245867, + "learning_rate": 3.876107870523477e-06, + "loss": 0.79477638, + "num_input_tokens_seen": 50144120, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.21337891, + "step": 2316, + "time_per_iteration": 2.476229667663574 + }, + { + "auxiliary_loss_clip": 0.01172752, + "auxiliary_loss_mlp": 0.0106479, + "balance_loss_clip": 1.0606941, + "balance_loss_mlp": 1.04295075, + "epoch": 0.13930557643168495, + "flos": 19500607912320.0, + "grad_norm": 1.9870442477794144, + "language_loss": 0.77048188, + "learning_rate": 3.875972890659349e-06, + "loss": 0.79285729, + "num_input_tokens_seen": 50162500, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.21862793, + "step": 2317, + "time_per_iteration": 2.4446897506713867 + }, + { + "auxiliary_loss_clip": 0.01173118, + "auxiliary_loss_mlp": 0.01052018, + "balance_loss_clip": 1.05915678, + "balance_loss_mlp": 1.03141916, + "epoch": 0.13936569968435292, + "flos": 25410821690880.0, + "grad_norm": 2.3422934338061534, + "language_loss": 0.80431831, + "learning_rate": 3.875837839658139e-06, + "loss": 0.82656968, + "num_input_tokens_seen": 50182415, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.20593262, + "step": 2318, + "time_per_iteration": 2.4652116298675537 + }, + { + "auxiliary_loss_clip": 0.0109028, + "auxiliary_loss_mlp": 0.01008257, + "balance_loss_clip": 1.05096853, + "balance_loss_mlp": 1.00558364, + "epoch": 0.13942582293702088, + "flos": 70771063731840.0, + "grad_norm": 0.8575268959187206, + "language_loss": 0.59047103, + "learning_rate": 3.87570271752497e-06, + "loss": 0.61145645, + "num_input_tokens_seen": 50245160, + "router_z_loss_clip": 0.39355469, + "router_z_loss_mlp": 0.0267334, + "step": 2319, + "time_per_iteration": 3.1031277179718018 + }, + { + "auxiliary_loss_clip": 0.01175447, + "auxiliary_loss_mlp": 0.01051509, + "balance_loss_clip": 1.06176758, + "balance_loss_mlp": 1.03043294, + "epoch": 0.13948594618968888, + "flos": 35590885920000.0, + "grad_norm": 2.580135403247294, + "language_loss": 0.65322602, + "learning_rate": 3.875567524264967e-06, + "loss": 0.67549562, + "num_input_tokens_seen": 50268215, + "router_z_loss_clip": 1.13769531, + "router_z_loss_mlp": 0.2109375, + "step": 2320, + "time_per_iteration": 2.582085132598877 + }, + { + "auxiliary_loss_clip": 0.0116281, + "auxiliary_loss_mlp": 0.01051837, + "balance_loss_clip": 1.05406404, + "balance_loss_mlp": 1.03055763, + "epoch": 0.13954606944235684, + "flos": 21105204272640.0, + "grad_norm": 1.6921885502655072, + "language_loss": 0.70781654, + "learning_rate": 3.875432259883256e-06, + "loss": 0.72996294, + "num_input_tokens_seen": 50288575, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.21264648, + "step": 2321, + "time_per_iteration": 2.462427854537964 + }, + { + "auxiliary_loss_clip": 0.01168963, + "auxiliary_loss_mlp": 0.01073232, + "balance_loss_clip": 1.05457282, + "balance_loss_mlp": 1.04860353, + "epoch": 0.1396061926950248, + "flos": 25044425009280.0, + "grad_norm": 1.721496580596936, + "language_loss": 0.86024833, + "learning_rate": 3.875296924384965e-06, + "loss": 0.88267028, + "num_input_tokens_seen": 50308735, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.24597168, + "step": 2322, + "time_per_iteration": 3.8584959506988525 + }, + { + "auxiliary_loss_clip": 0.01161467, + "auxiliary_loss_mlp": 0.01055077, + "balance_loss_clip": 1.05576777, + "balance_loss_mlp": 1.03586006, + "epoch": 0.13966631594769277, + "flos": 37634023428480.0, + "grad_norm": 2.243248557890028, + "language_loss": 0.66963243, + "learning_rate": 3.875161517775226e-06, + "loss": 0.69179785, + "num_input_tokens_seen": 50331025, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.1920166, + "step": 2323, + "time_per_iteration": 2.6328418254852295 + }, + { + "auxiliary_loss_clip": 0.01176383, + "auxiliary_loss_mlp": 0.01055995, + "balance_loss_clip": 1.05970752, + "balance_loss_mlp": 1.03345239, + "epoch": 0.13972643920036074, + "flos": 16690993061760.0, + "grad_norm": 2.2233648817876626, + "language_loss": 0.88985062, + "learning_rate": 3.875026040059175e-06, + "loss": 0.91217434, + "num_input_tokens_seen": 50349725, + "router_z_loss_clip": 1.16601562, + "router_z_loss_mlp": 0.22521973, + "step": 2324, + "time_per_iteration": 2.3982858657836914 + }, + { + "auxiliary_loss_clip": 0.01182279, + "auxiliary_loss_mlp": 0.01060696, + "balance_loss_clip": 1.06662464, + "balance_loss_mlp": 1.03942907, + "epoch": 0.1397865624530287, + "flos": 23331055288320.0, + "grad_norm": 2.3681600969085896, + "language_loss": 0.70673192, + "learning_rate": 3.8748904912419485e-06, + "loss": 0.72916168, + "num_input_tokens_seen": 50367965, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.21289062, + "step": 2325, + "time_per_iteration": 2.508455991744995 + }, + { + "auxiliary_loss_clip": 0.01173796, + "auxiliary_loss_mlp": 0.01057222, + "balance_loss_clip": 1.06355786, + "balance_loss_mlp": 1.03743291, + "epoch": 0.13984668570569667, + "flos": 22778317825920.0, + "grad_norm": 5.060593503477539, + "language_loss": 0.81377685, + "learning_rate": 3.874754871328688e-06, + "loss": 0.83608699, + "num_input_tokens_seen": 50385605, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.19787598, + "step": 2326, + "time_per_iteration": 3.9691359996795654 + }, + { + "auxiliary_loss_clip": 0.01167256, + "auxiliary_loss_mlp": 0.01052052, + "balance_loss_clip": 1.05833447, + "balance_loss_mlp": 1.03295445, + "epoch": 0.13990680895836466, + "flos": 19464553635840.0, + "grad_norm": 1.9854996184030178, + "language_loss": 0.89093983, + "learning_rate": 3.874619180324534e-06, + "loss": 0.91313291, + "num_input_tokens_seen": 50403985, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.19091797, + "step": 2327, + "time_per_iteration": 2.4493892192840576 + }, + { + "auxiliary_loss_clip": 0.01165587, + "auxiliary_loss_mlp": 0.01077492, + "balance_loss_clip": 1.05644798, + "balance_loss_mlp": 1.05457985, + "epoch": 0.13996693221103262, + "flos": 20303283185280.0, + "grad_norm": 2.419846559913227, + "language_loss": 0.85082066, + "learning_rate": 3.874483418234632e-06, + "loss": 0.8732515, + "num_input_tokens_seen": 50421590, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.22924805, + "step": 2328, + "time_per_iteration": 2.566685676574707 + }, + { + "auxiliary_loss_clip": 0.0117639, + "auxiliary_loss_mlp": 0.01057612, + "balance_loss_clip": 1.06117308, + "balance_loss_mlp": 1.03616655, + "epoch": 0.1400270554637006, + "flos": 26617707688320.0, + "grad_norm": 1.6404847443045425, + "language_loss": 0.74044436, + "learning_rate": 3.874347585064131e-06, + "loss": 0.76278436, + "num_input_tokens_seen": 50443945, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.21447754, + "step": 2329, + "time_per_iteration": 2.527442693710327 + }, + { + "auxiliary_loss_clip": 0.01174946, + "auxiliary_loss_mlp": 0.01050354, + "balance_loss_clip": 1.06077278, + "balance_loss_mlp": 1.02982628, + "epoch": 0.14008717871636855, + "flos": 19391475415680.0, + "grad_norm": 2.242093302291733, + "language_loss": 0.7827667, + "learning_rate": 3.874211680818183e-06, + "loss": 0.80501968, + "num_input_tokens_seen": 50462065, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.2052002, + "step": 2330, + "time_per_iteration": 5.327906131744385 + }, + { + "auxiliary_loss_clip": 0.01166291, + "auxiliary_loss_mlp": 0.01053047, + "balance_loss_clip": 1.05358648, + "balance_loss_mlp": 1.03270996, + "epoch": 0.14014730196903652, + "flos": 15304266645120.0, + "grad_norm": 2.0852719346547643, + "language_loss": 0.7219733, + "learning_rate": 3.87407570550194e-06, + "loss": 0.74416673, + "num_input_tokens_seen": 50479565, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.20336914, + "step": 2331, + "time_per_iteration": 2.498131275177002 + }, + { + "auxiliary_loss_clip": 0.01167499, + "auxiliary_loss_mlp": 0.01056469, + "balance_loss_clip": 1.06079221, + "balance_loss_mlp": 1.0363344, + "epoch": 0.14020742522170448, + "flos": 14939701557120.0, + "grad_norm": 1.696866592583868, + "language_loss": 0.72841024, + "learning_rate": 3.873939659120557e-06, + "loss": 0.75064987, + "num_input_tokens_seen": 50497305, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.20129395, + "step": 2332, + "time_per_iteration": 2.4284989833831787 + }, + { + "auxiliary_loss_clip": 0.01084095, + "auxiliary_loss_mlp": 0.0103843, + "balance_loss_clip": 1.04227686, + "balance_loss_mlp": 1.0358938, + "epoch": 0.14026754847437245, + "flos": 48824580044160.0, + "grad_norm": 0.8296780834453786, + "language_loss": 0.56142396, + "learning_rate": 3.873803541679196e-06, + "loss": 0.58264923, + "num_input_tokens_seen": 50549735, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 0.02536011, + "step": 2333, + "time_per_iteration": 2.951795816421509 + }, + { + "auxiliary_loss_clip": 0.01170399, + "auxiliary_loss_mlp": 0.0104869, + "balance_loss_clip": 1.05871987, + "balance_loss_mlp": 1.02823389, + "epoch": 0.14032767172704044, + "flos": 25773267876480.0, + "grad_norm": 2.199537128014302, + "language_loss": 0.82881516, + "learning_rate": 3.873667353183016e-06, + "loss": 0.85100603, + "num_input_tokens_seen": 50570100, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.20446777, + "step": 2334, + "time_per_iteration": 2.474660873413086 + }, + { + "auxiliary_loss_clip": 0.01174235, + "auxiliary_loss_mlp": 0.0104915, + "balance_loss_clip": 1.06203938, + "balance_loss_mlp": 1.02985024, + "epoch": 0.1403877949797084, + "flos": 21216312017280.0, + "grad_norm": 3.714935336840152, + "language_loss": 0.81108189, + "learning_rate": 3.8735310936371825e-06, + "loss": 0.83331573, + "num_input_tokens_seen": 50589185, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.19287109, + "step": 2335, + "time_per_iteration": 2.500685453414917 + }, + { + "auxiliary_loss_clip": 0.01171954, + "auxiliary_loss_mlp": 0.01051946, + "balance_loss_clip": 1.05780101, + "balance_loss_mlp": 1.02824688, + "epoch": 0.14044791823237637, + "flos": 22747973811840.0, + "grad_norm": 1.8440146624662856, + "language_loss": 0.8190484, + "learning_rate": 3.873394763046862e-06, + "loss": 0.84128737, + "num_input_tokens_seen": 50609645, + "router_z_loss_clip": 1.14257812, + "router_z_loss_mlp": 0.23669434, + "step": 2336, + "time_per_iteration": 2.4538838863372803 + }, + { + "auxiliary_loss_clip": 0.01172194, + "auxiliary_loss_mlp": 0.01054063, + "balance_loss_clip": 1.06003368, + "balance_loss_mlp": 1.0337975, + "epoch": 0.14050804148504434, + "flos": 22964443125120.0, + "grad_norm": 2.054298933334128, + "language_loss": 0.8081364, + "learning_rate": 3.873258361417225e-06, + "loss": 0.83039904, + "num_input_tokens_seen": 50628385, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.20251465, + "step": 2337, + "time_per_iteration": 2.479889154434204 + }, + { + "auxiliary_loss_clip": 0.01168337, + "auxiliary_loss_mlp": 0.01051477, + "balance_loss_clip": 1.05433083, + "balance_loss_mlp": 1.03098536, + "epoch": 0.1405681647377123, + "flos": 22200336080640.0, + "grad_norm": 2.067683537374725, + "language_loss": 0.79141641, + "learning_rate": 3.873121888753442e-06, + "loss": 0.81361449, + "num_input_tokens_seen": 50647260, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.20483398, + "step": 2338, + "time_per_iteration": 2.4333972930908203 + }, + { + "auxiliary_loss_clip": 0.0118658, + "auxiliary_loss_mlp": 0.01052886, + "balance_loss_clip": 1.06900144, + "balance_loss_mlp": 1.03135633, + "epoch": 0.14062828799038027, + "flos": 23732787974400.0, + "grad_norm": 2.0451950771677487, + "language_loss": 0.79896045, + "learning_rate": 3.87298534506069e-06, + "loss": 0.8213551, + "num_input_tokens_seen": 50666130, + "router_z_loss_clip": 1.17675781, + "router_z_loss_mlp": 0.21520996, + "step": 2339, + "time_per_iteration": 2.502984046936035 + }, + { + "auxiliary_loss_clip": 0.01171955, + "auxiliary_loss_mlp": 0.010626, + "balance_loss_clip": 1.05918026, + "balance_loss_mlp": 1.04202461, + "epoch": 0.14068841124304826, + "flos": 39202493685120.0, + "grad_norm": 1.8755506689934918, + "language_loss": 0.65537596, + "learning_rate": 3.872848730344146e-06, + "loss": 0.6777215, + "num_input_tokens_seen": 50687440, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.20593262, + "step": 2340, + "time_per_iteration": 2.576653242111206 + }, + { + "auxiliary_loss_clip": 0.01167913, + "auxiliary_loss_mlp": 0.01060005, + "balance_loss_clip": 1.05823851, + "balance_loss_mlp": 1.03896427, + "epoch": 0.14074853449571623, + "flos": 20192283181440.0, + "grad_norm": 2.6783592767936004, + "language_loss": 0.78767228, + "learning_rate": 3.87271204460899e-06, + "loss": 0.80995142, + "num_input_tokens_seen": 50704030, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.21044922, + "step": 2341, + "time_per_iteration": 2.438082218170166 + }, + { + "auxiliary_loss_clip": 0.01175282, + "auxiliary_loss_mlp": 0.01056465, + "balance_loss_clip": 1.06418264, + "balance_loss_mlp": 1.03615165, + "epoch": 0.1408086577483842, + "flos": 18405871153920.0, + "grad_norm": 3.0406299431997086, + "language_loss": 0.80476981, + "learning_rate": 3.8725752878604066e-06, + "loss": 0.82708728, + "num_input_tokens_seen": 50723305, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.20324707, + "step": 2342, + "time_per_iteration": 2.418076276779175 + }, + { + "auxiliary_loss_clip": 0.01173001, + "auxiliary_loss_mlp": 0.01057462, + "balance_loss_clip": 1.06087649, + "balance_loss_mlp": 1.03665996, + "epoch": 0.14086878100105216, + "flos": 25264593423360.0, + "grad_norm": 1.8248545153477136, + "language_loss": 0.77127379, + "learning_rate": 3.87243846010358e-06, + "loss": 0.79357839, + "num_input_tokens_seen": 50743270, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.20812988, + "step": 2343, + "time_per_iteration": 2.5049097537994385 + }, + { + "auxiliary_loss_clip": 0.01090874, + "auxiliary_loss_mlp": 0.01024254, + "balance_loss_clip": 1.04824162, + "balance_loss_mlp": 1.02169752, + "epoch": 0.14092890425372012, + "flos": 65978388869760.0, + "grad_norm": 0.8499790438129137, + "language_loss": 0.61450219, + "learning_rate": 3.872301561343699e-06, + "loss": 0.6356535, + "num_input_tokens_seen": 50802710, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 0.02557373, + "step": 2344, + "time_per_iteration": 3.008615016937256 + }, + { + "auxiliary_loss_clip": 0.01177228, + "auxiliary_loss_mlp": 0.01049509, + "balance_loss_clip": 1.06662369, + "balance_loss_mlp": 1.03045905, + "epoch": 0.1409890275063881, + "flos": 23694973931520.0, + "grad_norm": 1.4841734073326582, + "language_loss": 0.64329672, + "learning_rate": 3.872164591585956e-06, + "loss": 0.66556406, + "num_input_tokens_seen": 50822625, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.19055176, + "step": 2345, + "time_per_iteration": 2.5318996906280518 + }, + { + "auxiliary_loss_clip": 0.01173513, + "auxiliary_loss_mlp": 0.01050539, + "balance_loss_clip": 1.05571771, + "balance_loss_mlp": 1.02821088, + "epoch": 0.14104915075905605, + "flos": 23623152687360.0, + "grad_norm": 2.153090132592846, + "language_loss": 0.73424304, + "learning_rate": 3.8720275508355435e-06, + "loss": 0.75648355, + "num_input_tokens_seen": 50842330, + "router_z_loss_clip": 1.17871094, + "router_z_loss_mlp": 0.22314453, + "step": 2346, + "time_per_iteration": 2.4637110233306885 + }, + { + "auxiliary_loss_clip": 0.01183399, + "auxiliary_loss_mlp": 0.01050426, + "balance_loss_clip": 1.06509042, + "balance_loss_mlp": 1.02857471, + "epoch": 0.14110927401172405, + "flos": 20595165102720.0, + "grad_norm": 1.8230005515703944, + "language_loss": 0.77476132, + "learning_rate": 3.8718904390976585e-06, + "loss": 0.79709953, + "num_input_tokens_seen": 50861035, + "router_z_loss_clip": 1.18359375, + "router_z_loss_mlp": 0.21850586, + "step": 2347, + "time_per_iteration": 2.5007588863372803 + }, + { + "auxiliary_loss_clip": 0.01174966, + "auxiliary_loss_mlp": 0.01054646, + "balance_loss_clip": 1.059919, + "balance_loss_mlp": 1.03484523, + "epoch": 0.141169397264392, + "flos": 28548049512960.0, + "grad_norm": 2.005030326048492, + "language_loss": 0.76891518, + "learning_rate": 3.8717532563775e-06, + "loss": 0.79121137, + "num_input_tokens_seen": 50880105, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.19787598, + "step": 2348, + "time_per_iteration": 2.504535675048828 + }, + { + "auxiliary_loss_clip": 0.01171474, + "auxiliary_loss_mlp": 0.01048263, + "balance_loss_clip": 1.0581491, + "balance_loss_mlp": 1.02818787, + "epoch": 0.14122952051705998, + "flos": 17092258871040.0, + "grad_norm": 1.9971007116237565, + "language_loss": 0.86549354, + "learning_rate": 3.871616002680272e-06, + "loss": 0.8876909, + "num_input_tokens_seen": 50897720, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.20068359, + "step": 2349, + "time_per_iteration": 2.4476912021636963 + }, + { + "auxiliary_loss_clip": 0.01169294, + "auxiliary_loss_mlp": 0.01056313, + "balance_loss_clip": 1.05979586, + "balance_loss_mlp": 1.03619075, + "epoch": 0.14128964376972794, + "flos": 28946801370240.0, + "grad_norm": 1.8973296376996802, + "language_loss": 0.88985872, + "learning_rate": 3.871478678011177e-06, + "loss": 0.91211486, + "num_input_tokens_seen": 50918385, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.2010498, + "step": 2350, + "time_per_iteration": 2.532410144805908 + }, + { + "auxiliary_loss_clip": 0.01177109, + "auxiliary_loss_mlp": 0.01053869, + "balance_loss_clip": 1.06443274, + "balance_loss_mlp": 1.03169632, + "epoch": 0.1413497670223959, + "flos": 18989778643200.0, + "grad_norm": 2.0402648290991636, + "language_loss": 0.8085326, + "learning_rate": 3.871341282375423e-06, + "loss": 0.83084238, + "num_input_tokens_seen": 50938270, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.22180176, + "step": 2351, + "time_per_iteration": 2.5709688663482666 + }, + { + "auxiliary_loss_clip": 0.01186511, + "auxiliary_loss_mlp": 0.01051451, + "balance_loss_clip": 1.06991506, + "balance_loss_mlp": 1.03132808, + "epoch": 0.14140989027506387, + "flos": 29862236413440.0, + "grad_norm": 2.159651217201981, + "language_loss": 0.83606434, + "learning_rate": 3.871203815778219e-06, + "loss": 0.85844409, + "num_input_tokens_seen": 50958155, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.20117188, + "step": 2352, + "time_per_iteration": 2.589590549468994 + }, + { + "auxiliary_loss_clip": 0.01078575, + "auxiliary_loss_mlp": 0.01027743, + "balance_loss_clip": 1.03853798, + "balance_loss_mlp": 1.02543032, + "epoch": 0.14147001352773186, + "flos": 62079532041600.0, + "grad_norm": 0.9089572626327186, + "language_loss": 0.61918873, + "learning_rate": 3.87106627822478e-06, + "loss": 0.64025187, + "num_input_tokens_seen": 51020705, + "router_z_loss_clip": 0.39990234, + "router_z_loss_mlp": 0.02310181, + "step": 2353, + "time_per_iteration": 3.0159192085266113 + }, + { + "auxiliary_loss_clip": 0.01172452, + "auxiliary_loss_mlp": 0.01054009, + "balance_loss_clip": 1.0620656, + "balance_loss_mlp": 1.03462529, + "epoch": 0.14153013678039983, + "flos": 22017514832640.0, + "grad_norm": 2.2402437370002266, + "language_loss": 0.87247658, + "learning_rate": 3.8709286697203196e-06, + "loss": 0.89474118, + "num_input_tokens_seen": 51039995, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.19396973, + "step": 2354, + "time_per_iteration": 2.4623160362243652 + }, + { + "auxiliary_loss_clip": 0.011746, + "auxiliary_loss_mlp": 0.01051424, + "balance_loss_clip": 1.06110227, + "balance_loss_mlp": 1.03000212, + "epoch": 0.1415902600330678, + "flos": 19720093968000.0, + "grad_norm": 2.1222551768947606, + "language_loss": 0.74443859, + "learning_rate": 3.870790990270057e-06, + "loss": 0.76669884, + "num_input_tokens_seen": 51059075, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.21411133, + "step": 2355, + "time_per_iteration": 2.4533538818359375 + }, + { + "auxiliary_loss_clip": 0.01078369, + "auxiliary_loss_mlp": 0.01014829, + "balance_loss_clip": 1.03760076, + "balance_loss_mlp": 1.01249886, + "epoch": 0.14165038328573576, + "flos": 65900929190400.0, + "grad_norm": 0.6993551459271735, + "language_loss": 0.51778674, + "learning_rate": 3.870653239879212e-06, + "loss": 0.5387187, + "num_input_tokens_seen": 51120380, + "router_z_loss_clip": 0.40771484, + "router_z_loss_mlp": 0.02328491, + "step": 2356, + "time_per_iteration": 3.0518555641174316 + }, + { + "auxiliary_loss_clip": 0.01180992, + "auxiliary_loss_mlp": 0.01069526, + "balance_loss_clip": 1.06599307, + "balance_loss_mlp": 1.04880738, + "epoch": 0.14171050653840372, + "flos": 12130158533760.0, + "grad_norm": 2.125148286571444, + "language_loss": 0.70248091, + "learning_rate": 3.8705154185530095e-06, + "loss": 0.72498608, + "num_input_tokens_seen": 51136950, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.20715332, + "step": 2357, + "time_per_iteration": 2.4444284439086914 + }, + { + "auxiliary_loss_clip": 0.01177251, + "auxiliary_loss_mlp": 0.01055686, + "balance_loss_clip": 1.06238842, + "balance_loss_mlp": 1.03592062, + "epoch": 0.1417706297910717, + "flos": 20412487509120.0, + "grad_norm": 1.957022264350142, + "language_loss": 0.81786323, + "learning_rate": 3.870377526296674e-06, + "loss": 0.84019256, + "num_input_tokens_seen": 51155175, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.19750977, + "step": 2358, + "time_per_iteration": 2.5037200450897217 + }, + { + "auxiliary_loss_clip": 0.011757, + "auxiliary_loss_mlp": 0.01058121, + "balance_loss_clip": 1.05982685, + "balance_loss_mlp": 1.0369494, + "epoch": 0.14183075304373965, + "flos": 22380607463040.0, + "grad_norm": 1.9014371872468185, + "language_loss": 0.71251512, + "learning_rate": 3.870239563115436e-06, + "loss": 0.73485327, + "num_input_tokens_seen": 51174500, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.21154785, + "step": 2359, + "time_per_iteration": 2.4700822830200195 + }, + { + "auxiliary_loss_clip": 0.01170095, + "auxiliary_loss_mlp": 0.01045475, + "balance_loss_clip": 1.05858731, + "balance_loss_mlp": 1.02468514, + "epoch": 0.14189087629640765, + "flos": 21580913018880.0, + "grad_norm": 2.5661511395783685, + "language_loss": 0.75937456, + "learning_rate": 3.870101529014526e-06, + "loss": 0.78153026, + "num_input_tokens_seen": 51194270, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.20788574, + "step": 2360, + "time_per_iteration": 2.472383975982666 + }, + { + "auxiliary_loss_clip": 0.01168266, + "auxiliary_loss_mlp": 0.01053739, + "balance_loss_clip": 1.05910408, + "balance_loss_mlp": 1.03270996, + "epoch": 0.1419509995490756, + "flos": 20008564093440.0, + "grad_norm": 3.365755279081745, + "language_loss": 0.81822044, + "learning_rate": 3.869963423999178e-06, + "loss": 0.84044045, + "num_input_tokens_seen": 51211850, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.21020508, + "step": 2361, + "time_per_iteration": 2.5081474781036377 + }, + { + "auxiliary_loss_clip": 0.01175306, + "auxiliary_loss_mlp": 0.01050554, + "balance_loss_clip": 1.06425059, + "balance_loss_mlp": 1.03006172, + "epoch": 0.14201112280174358, + "flos": 31941464112000.0, + "grad_norm": 1.8965795608361917, + "language_loss": 0.74646521, + "learning_rate": 3.86982524807463e-06, + "loss": 0.76872379, + "num_input_tokens_seen": 51233545, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.20483398, + "step": 2362, + "time_per_iteration": 2.5634896755218506 + }, + { + "auxiliary_loss_clip": 0.01172745, + "auxiliary_loss_mlp": 0.01052108, + "balance_loss_clip": 1.06122112, + "balance_loss_mlp": 1.03106809, + "epoch": 0.14207124605441154, + "flos": 41464147582080.0, + "grad_norm": 2.210916631730828, + "language_loss": 0.73940307, + "learning_rate": 3.869687001246122e-06, + "loss": 0.76165164, + "num_input_tokens_seen": 51257615, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.21032715, + "step": 2363, + "time_per_iteration": 2.6696560382843018 + }, + { + "auxiliary_loss_clip": 0.01170955, + "auxiliary_loss_mlp": 0.01057082, + "balance_loss_clip": 1.05921364, + "balance_loss_mlp": 1.03636336, + "epoch": 0.1421313693070795, + "flos": 31905086613120.0, + "grad_norm": 1.7359807182506692, + "language_loss": 0.73052406, + "learning_rate": 3.8695486835188946e-06, + "loss": 0.7528044, + "num_input_tokens_seen": 51279645, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.20715332, + "step": 2364, + "time_per_iteration": 2.552887201309204 + }, + { + "auxiliary_loss_clip": 0.01173401, + "auxiliary_loss_mlp": 0.01058601, + "balance_loss_clip": 1.06361365, + "balance_loss_mlp": 1.03999257, + "epoch": 0.14219149255974747, + "flos": 26871165031680.0, + "grad_norm": 2.4309800422090073, + "language_loss": 0.90833503, + "learning_rate": 3.869410294898195e-06, + "loss": 0.93065506, + "num_input_tokens_seen": 51299775, + "router_z_loss_clip": 1.09912109, + "router_z_loss_mlp": 0.18615723, + "step": 2365, + "time_per_iteration": 2.509094715118408 + }, + { + "auxiliary_loss_clip": 0.01173859, + "auxiliary_loss_mlp": 0.01054511, + "balance_loss_clip": 1.06020963, + "balance_loss_mlp": 1.03302908, + "epoch": 0.14225161581241544, + "flos": 27454426076160.0, + "grad_norm": 2.0074605879672167, + "language_loss": 0.65064317, + "learning_rate": 3.869271835389268e-06, + "loss": 0.67292684, + "num_input_tokens_seen": 51319430, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.21484375, + "step": 2366, + "time_per_iteration": 3.963813543319702 + }, + { + "auxiliary_loss_clip": 0.0116981, + "auxiliary_loss_mlp": 0.01059966, + "balance_loss_clip": 1.06056619, + "balance_loss_mlp": 1.03908026, + "epoch": 0.14231173906508343, + "flos": 10561436881920.0, + "grad_norm": 1.905221572789781, + "language_loss": 0.80540371, + "learning_rate": 3.8691333049973665e-06, + "loss": 0.82770151, + "num_input_tokens_seen": 51336045, + "router_z_loss_clip": 1.09228516, + "router_z_loss_mlp": 0.20874023, + "step": 2367, + "time_per_iteration": 2.438967227935791 + }, + { + "auxiliary_loss_clip": 0.01173312, + "auxiliary_loss_mlp": 0.01060278, + "balance_loss_clip": 1.06239033, + "balance_loss_mlp": 1.03874826, + "epoch": 0.1423718623177514, + "flos": 28360882719360.0, + "grad_norm": 1.8594782611168432, + "language_loss": 0.82722175, + "learning_rate": 3.868994703727742e-06, + "loss": 0.84955764, + "num_input_tokens_seen": 51357030, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.21508789, + "step": 2368, + "time_per_iteration": 2.530721426010132 + }, + { + "auxiliary_loss_clip": 0.01171704, + "auxiliary_loss_mlp": 0.01050943, + "balance_loss_clip": 1.05994987, + "balance_loss_mlp": 1.02956843, + "epoch": 0.14243198557041936, + "flos": 19354235990400.0, + "grad_norm": 2.4448674245519317, + "language_loss": 0.86756968, + "learning_rate": 3.868856031585652e-06, + "loss": 0.88979614, + "num_input_tokens_seen": 51374890, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.21362305, + "step": 2369, + "time_per_iteration": 3.8552985191345215 + }, + { + "auxiliary_loss_clip": 0.01174976, + "auxiliary_loss_mlp": 0.01050149, + "balance_loss_clip": 1.0584048, + "balance_loss_mlp": 1.02971649, + "epoch": 0.14249210882308733, + "flos": 28806857982720.0, + "grad_norm": 1.5470325326727812, + "language_loss": 0.75783509, + "learning_rate": 3.868717288576354e-06, + "loss": 0.7800864, + "num_input_tokens_seen": 51398100, + "router_z_loss_clip": 1.16601562, + "router_z_loss_mlp": 0.20446777, + "step": 2370, + "time_per_iteration": 2.520756244659424 + }, + { + "auxiliary_loss_clip": 0.01165822, + "auxiliary_loss_mlp": 0.01056706, + "balance_loss_clip": 1.05561137, + "balance_loss_mlp": 1.03592813, + "epoch": 0.1425522320757553, + "flos": 21835016807040.0, + "grad_norm": 1.6028800366942555, + "language_loss": 0.82881755, + "learning_rate": 3.868578474705109e-06, + "loss": 0.85104287, + "num_input_tokens_seen": 51418745, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.20788574, + "step": 2371, + "time_per_iteration": 2.5158772468566895 + }, + { + "auxiliary_loss_clip": 0.01177599, + "auxiliary_loss_mlp": 0.01062844, + "balance_loss_clip": 1.06521761, + "balance_loss_mlp": 1.04077804, + "epoch": 0.14261235532842326, + "flos": 17311457617920.0, + "grad_norm": 2.297338449534265, + "language_loss": 0.82787567, + "learning_rate": 3.868439589977181e-06, + "loss": 0.85028005, + "num_input_tokens_seen": 51437455, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.22058105, + "step": 2372, + "time_per_iteration": 2.431661605834961 + }, + { + "auxiliary_loss_clip": 0.01170895, + "auxiliary_loss_mlp": 0.01060235, + "balance_loss_clip": 1.0594908, + "balance_loss_mlp": 1.03741872, + "epoch": 0.14267247858109125, + "flos": 18806741913600.0, + "grad_norm": 3.4021480214541144, + "language_loss": 0.8467291, + "learning_rate": 3.868300634397836e-06, + "loss": 0.86904037, + "num_input_tokens_seen": 51455710, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.22814941, + "step": 2373, + "time_per_iteration": 3.9772794246673584 + }, + { + "auxiliary_loss_clip": 0.01167946, + "auxiliary_loss_mlp": 0.01051147, + "balance_loss_clip": 1.05990076, + "balance_loss_mlp": 1.03294373, + "epoch": 0.14273260183375922, + "flos": 11358904682880.0, + "grad_norm": 2.8098684322479066, + "language_loss": 0.86050642, + "learning_rate": 3.8681616079723445e-06, + "loss": 0.88269734, + "num_input_tokens_seen": 51471270, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.18200684, + "step": 2374, + "time_per_iteration": 3.9821062088012695 + }, + { + "auxiliary_loss_clip": 0.01174699, + "auxiliary_loss_mlp": 0.0105186, + "balance_loss_clip": 1.05894923, + "balance_loss_mlp": 1.03090346, + "epoch": 0.14279272508642718, + "flos": 27567688636800.0, + "grad_norm": 1.6966169259041217, + "language_loss": 0.78950965, + "learning_rate": 3.868022510705977e-06, + "loss": 0.81177527, + "num_input_tokens_seen": 51492705, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.2097168, + "step": 2375, + "time_per_iteration": 2.6325559616088867 + }, + { + "auxiliary_loss_clip": 0.01172707, + "auxiliary_loss_mlp": 0.01065302, + "balance_loss_clip": 1.06236088, + "balance_loss_mlp": 1.04506004, + "epoch": 0.14285284833909515, + "flos": 16252559654400.0, + "grad_norm": 2.4498975015506845, + "language_loss": 0.76673108, + "learning_rate": 3.867883342604009e-06, + "loss": 0.78911114, + "num_input_tokens_seen": 51510780, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.20239258, + "step": 2376, + "time_per_iteration": 2.529299736022949 + }, + { + "auxiliary_loss_clip": 0.01177703, + "auxiliary_loss_mlp": 0.01050363, + "balance_loss_clip": 1.06661534, + "balance_loss_mlp": 1.03060961, + "epoch": 0.1429129715917631, + "flos": 19755609540480.0, + "grad_norm": 1.782362150340794, + "language_loss": 0.93116832, + "learning_rate": 3.867744103671717e-06, + "loss": 0.95344895, + "num_input_tokens_seen": 51531400, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.19763184, + "step": 2377, + "time_per_iteration": 2.4719130992889404 + }, + { + "auxiliary_loss_clip": 0.01172971, + "auxiliary_loss_mlp": 0.01057041, + "balance_loss_clip": 1.06066704, + "balance_loss_mlp": 1.03545213, + "epoch": 0.14297309484443108, + "flos": 21137092571520.0, + "grad_norm": 1.9389576067786993, + "language_loss": 0.91518027, + "learning_rate": 3.867604793914382e-06, + "loss": 0.93748039, + "num_input_tokens_seen": 51548215, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.21594238, + "step": 2378, + "time_per_iteration": 2.4847447872161865 + }, + { + "auxiliary_loss_clip": 0.01176119, + "auxiliary_loss_mlp": 0.01048868, + "balance_loss_clip": 1.06322455, + "balance_loss_mlp": 1.02875686, + "epoch": 0.14303321809709904, + "flos": 23586667447680.0, + "grad_norm": 1.7649523915680674, + "language_loss": 0.7372939, + "learning_rate": 3.8674654133372864e-06, + "loss": 0.75954378, + "num_input_tokens_seen": 51566820, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.2010498, + "step": 2379, + "time_per_iteration": 2.4729063510894775 + }, + { + "auxiliary_loss_clip": 0.01176345, + "auxiliary_loss_mlp": 0.01057649, + "balance_loss_clip": 1.06479299, + "balance_loss_mlp": 1.0382899, + "epoch": 0.14309334134976703, + "flos": 15888281875200.0, + "grad_norm": 1.9356453534063331, + "language_loss": 0.78640944, + "learning_rate": 3.867325961945714e-06, + "loss": 0.80874944, + "num_input_tokens_seen": 51585075, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.19348145, + "step": 2380, + "time_per_iteration": 2.4579081535339355 + }, + { + "auxiliary_loss_clip": 0.01181115, + "auxiliary_loss_mlp": 0.01058137, + "balance_loss_clip": 1.0685643, + "balance_loss_mlp": 1.03856266, + "epoch": 0.143153464602435, + "flos": 16325601960960.0, + "grad_norm": 2.972630708106504, + "language_loss": 0.8846311, + "learning_rate": 3.867186439744955e-06, + "loss": 0.90702361, + "num_input_tokens_seen": 51603185, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.19592285, + "step": 2381, + "time_per_iteration": 2.4259650707244873 + }, + { + "auxiliary_loss_clip": 0.01168144, + "auxiliary_loss_mlp": 0.01048223, + "balance_loss_clip": 1.06041634, + "balance_loss_mlp": 1.02857685, + "epoch": 0.14321358785510296, + "flos": 17092079303040.0, + "grad_norm": 2.538812120522679, + "language_loss": 0.76529378, + "learning_rate": 3.867046846740299e-06, + "loss": 0.78745747, + "num_input_tokens_seen": 51620880, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.19641113, + "step": 2382, + "time_per_iteration": 2.4356563091278076 + }, + { + "auxiliary_loss_clip": 0.01166141, + "auxiliary_loss_mlp": 0.01053891, + "balance_loss_clip": 1.05578864, + "balance_loss_mlp": 1.03423309, + "epoch": 0.14327371110777093, + "flos": 26322916769280.0, + "grad_norm": 1.967988573504724, + "language_loss": 0.76464319, + "learning_rate": 3.866907182937039e-06, + "loss": 0.78684354, + "num_input_tokens_seen": 51640170, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.19665527, + "step": 2383, + "time_per_iteration": 2.5023436546325684 + }, + { + "auxiliary_loss_clip": 0.01171965, + "auxiliary_loss_mlp": 0.01051336, + "balance_loss_clip": 1.06025887, + "balance_loss_mlp": 1.03002119, + "epoch": 0.1433338343604389, + "flos": 18076462502400.0, + "grad_norm": 8.742941655008147, + "language_loss": 0.8747173, + "learning_rate": 3.866767448340471e-06, + "loss": 0.89695024, + "num_input_tokens_seen": 51656580, + "router_z_loss_clip": 1.11669922, + "router_z_loss_mlp": 0.21325684, + "step": 2384, + "time_per_iteration": 2.486863136291504 + }, + { + "auxiliary_loss_clip": 0.01177723, + "auxiliary_loss_mlp": 0.01050869, + "balance_loss_clip": 1.06043351, + "balance_loss_mlp": 1.02910113, + "epoch": 0.14339395761310686, + "flos": 15522783033600.0, + "grad_norm": 3.7967703216856195, + "language_loss": 0.79913956, + "learning_rate": 3.866627642955895e-06, + "loss": 0.82142544, + "num_input_tokens_seen": 51674645, + "router_z_loss_clip": 1.17382812, + "router_z_loss_mlp": 0.21777344, + "step": 2385, + "time_per_iteration": 2.4209108352661133 + }, + { + "auxiliary_loss_clip": 0.01168741, + "auxiliary_loss_mlp": 0.01054521, + "balance_loss_clip": 1.0570631, + "balance_loss_mlp": 1.03491116, + "epoch": 0.14345408086577485, + "flos": 28548767784960.0, + "grad_norm": 1.7170877565781852, + "language_loss": 0.75075918, + "learning_rate": 3.866487766788612e-06, + "loss": 0.77299184, + "num_input_tokens_seen": 51695770, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.19592285, + "step": 2386, + "time_per_iteration": 2.5315067768096924 + }, + { + "auxiliary_loss_clip": 0.01170103, + "auxiliary_loss_mlp": 0.01047771, + "balance_loss_clip": 1.05913591, + "balance_loss_mlp": 1.02793503, + "epoch": 0.14351420411844282, + "flos": 20230061310720.0, + "grad_norm": 2.482880194995332, + "language_loss": 0.78750491, + "learning_rate": 3.866347819843925e-06, + "loss": 0.80968368, + "num_input_tokens_seen": 51714165, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.19824219, + "step": 2387, + "time_per_iteration": 2.4204366207122803 + }, + { + "auxiliary_loss_clip": 0.0117265, + "auxiliary_loss_mlp": 0.01055931, + "balance_loss_clip": 1.06056976, + "balance_loss_mlp": 1.03481901, + "epoch": 0.14357432737111078, + "flos": 19865029345920.0, + "grad_norm": 2.1034663451227975, + "language_loss": 0.81969941, + "learning_rate": 3.866207802127143e-06, + "loss": 0.84198517, + "num_input_tokens_seen": 51734440, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.21130371, + "step": 2388, + "time_per_iteration": 2.493525743484497 + }, + { + "auxiliary_loss_clip": 0.01182459, + "auxiliary_loss_mlp": 0.01047296, + "balance_loss_clip": 1.07097924, + "balance_loss_mlp": 1.02865136, + "epoch": 0.14363445062377875, + "flos": 28256814040320.0, + "grad_norm": 2.3155209368160015, + "language_loss": 0.82564282, + "learning_rate": 3.866067713643573e-06, + "loss": 0.84794039, + "num_input_tokens_seen": 51753730, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.18640137, + "step": 2389, + "time_per_iteration": 2.4948225021362305 + }, + { + "auxiliary_loss_clip": 0.0117182, + "auxiliary_loss_mlp": 0.01051911, + "balance_loss_clip": 1.05691361, + "balance_loss_mlp": 1.03076291, + "epoch": 0.1436945738764467, + "flos": 18186672407040.0, + "grad_norm": 2.707147341977524, + "language_loss": 0.82906699, + "learning_rate": 3.8659275543985285e-06, + "loss": 0.85130429, + "num_input_tokens_seen": 51771195, + "router_z_loss_clip": 1.14941406, + "router_z_loss_mlp": 0.21130371, + "step": 2390, + "time_per_iteration": 2.4524266719818115 + }, + { + "auxiliary_loss_clip": 0.01164494, + "auxiliary_loss_mlp": 0.01049425, + "balance_loss_clip": 1.05510318, + "balance_loss_mlp": 1.02955317, + "epoch": 0.14375469712911468, + "flos": 27307910499840.0, + "grad_norm": 2.35745901375741, + "language_loss": 0.75389421, + "learning_rate": 3.865787324397324e-06, + "loss": 0.7760334, + "num_input_tokens_seen": 51792290, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.19848633, + "step": 2391, + "time_per_iteration": 2.564089775085449 + }, + { + "auxiliary_loss_clip": 0.01088825, + "auxiliary_loss_mlp": 0.01020372, + "balance_loss_clip": 1.04818261, + "balance_loss_mlp": 1.0181694, + "epoch": 0.14381482038178264, + "flos": 56891445287040.0, + "grad_norm": 0.9281071305416384, + "language_loss": 0.61808717, + "learning_rate": 3.865647023645277e-06, + "loss": 0.63917917, + "num_input_tokens_seen": 51843675, + "router_z_loss_clip": 0.40722656, + "router_z_loss_mlp": 0.02206421, + "step": 2392, + "time_per_iteration": 2.9963057041168213 + }, + { + "auxiliary_loss_clip": 0.01172435, + "auxiliary_loss_mlp": 0.01053804, + "balance_loss_clip": 1.0589608, + "balance_loss_mlp": 1.03290653, + "epoch": 0.14387494363445064, + "flos": 14282177143680.0, + "grad_norm": 3.455597165806415, + "language_loss": 0.77166581, + "learning_rate": 3.865506652147709e-06, + "loss": 0.79392821, + "num_input_tokens_seen": 51860285, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.2088623, + "step": 2393, + "time_per_iteration": 2.445890188217163 + }, + { + "auxiliary_loss_clip": 0.01167276, + "auxiliary_loss_mlp": 0.0105325, + "balance_loss_clip": 1.05546832, + "balance_loss_mlp": 1.03356862, + "epoch": 0.1439350668871186, + "flos": 26761493831040.0, + "grad_norm": 1.8491496349946834, + "language_loss": 0.76674056, + "learning_rate": 3.865366209909941e-06, + "loss": 0.78894585, + "num_input_tokens_seen": 51880105, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.19677734, + "step": 2394, + "time_per_iteration": 2.5385825634002686 + }, + { + "auxiliary_loss_clip": 0.01170808, + "auxiliary_loss_mlp": 0.01055932, + "balance_loss_clip": 1.06139684, + "balance_loss_mlp": 1.0355711, + "epoch": 0.14399519013978657, + "flos": 40700040537600.0, + "grad_norm": 1.6205007529961288, + "language_loss": 0.85841244, + "learning_rate": 3.8652256969372994e-06, + "loss": 0.88067985, + "num_input_tokens_seen": 51905175, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.20361328, + "step": 2395, + "time_per_iteration": 2.620251417160034 + }, + { + "auxiliary_loss_clip": 0.01177937, + "auxiliary_loss_mlp": 0.01049764, + "balance_loss_clip": 1.06755114, + "balance_loss_mlp": 1.03072667, + "epoch": 0.14405531339245453, + "flos": 20557530627840.0, + "grad_norm": 1.683186525705686, + "language_loss": 0.83145916, + "learning_rate": 3.865085113235113e-06, + "loss": 0.85373622, + "num_input_tokens_seen": 51924490, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.19030762, + "step": 2396, + "time_per_iteration": 2.5405235290527344 + }, + { + "auxiliary_loss_clip": 0.01162252, + "auxiliary_loss_mlp": 0.01051226, + "balance_loss_clip": 1.05569029, + "balance_loss_mlp": 1.0316875, + "epoch": 0.1441154366451225, + "flos": 19572931946880.0, + "grad_norm": 2.8869929201451914, + "language_loss": 0.82782567, + "learning_rate": 3.864944458808712e-06, + "loss": 0.84996045, + "num_input_tokens_seen": 51940490, + "router_z_loss_clip": 1.06494141, + "router_z_loss_mlp": 0.1953125, + "step": 2397, + "time_per_iteration": 2.4379184246063232 + }, + { + "auxiliary_loss_clip": 0.01179776, + "auxiliary_loss_mlp": 0.01050871, + "balance_loss_clip": 1.06581461, + "balance_loss_mlp": 1.03078473, + "epoch": 0.14417555989779046, + "flos": 18515721922560.0, + "grad_norm": 1.618578779284429, + "language_loss": 0.79986179, + "learning_rate": 3.86480373366343e-06, + "loss": 0.82216829, + "num_input_tokens_seen": 51957910, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.2010498, + "step": 2398, + "time_per_iteration": 2.460925340652466 + }, + { + "auxiliary_loss_clip": 0.01168606, + "auxiliary_loss_mlp": 0.010553, + "balance_loss_clip": 1.060045, + "balance_loss_mlp": 1.03629816, + "epoch": 0.14423568315045843, + "flos": 26031681296640.0, + "grad_norm": 2.5126674509755853, + "language_loss": 0.647892, + "learning_rate": 3.864662937804603e-06, + "loss": 0.67013103, + "num_input_tokens_seen": 51978010, + "router_z_loss_clip": 1.08544922, + "router_z_loss_mlp": 0.19006348, + "step": 2399, + "time_per_iteration": 2.4899351596832275 + }, + { + "auxiliary_loss_clip": 0.01170959, + "auxiliary_loss_mlp": 0.01049628, + "balance_loss_clip": 1.06246519, + "balance_loss_mlp": 1.02962422, + "epoch": 0.14429580640312642, + "flos": 21288743792640.0, + "grad_norm": 1.6932460037887263, + "language_loss": 0.82071197, + "learning_rate": 3.864522071237571e-06, + "loss": 0.8429178, + "num_input_tokens_seen": 51998515, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.20007324, + "step": 2400, + "time_per_iteration": 2.5679430961608887 + }, + { + "auxiliary_loss_clip": 0.01177188, + "auxiliary_loss_mlp": 0.01061056, + "balance_loss_clip": 1.06629241, + "balance_loss_mlp": 1.03993201, + "epoch": 0.14435592965579438, + "flos": 25627865621760.0, + "grad_norm": 2.321296376809683, + "language_loss": 0.7453593, + "learning_rate": 3.864381133967676e-06, + "loss": 0.76774174, + "num_input_tokens_seen": 52019270, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.21118164, + "step": 2401, + "time_per_iteration": 2.4644339084625244 + }, + { + "auxiliary_loss_clip": 0.011673, + "auxiliary_loss_mlp": 0.0105362, + "balance_loss_clip": 1.05782998, + "balance_loss_mlp": 1.03447509, + "epoch": 0.14441605290846235, + "flos": 22965053656320.0, + "grad_norm": 1.9339488783424394, + "language_loss": 0.80672109, + "learning_rate": 3.86424012600026e-06, + "loss": 0.82893032, + "num_input_tokens_seen": 52039315, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.19152832, + "step": 2402, + "time_per_iteration": 2.464542865753174 + }, + { + "auxiliary_loss_clip": 0.01169787, + "auxiliary_loss_mlp": 0.01051096, + "balance_loss_clip": 1.05967832, + "balance_loss_mlp": 1.03140306, + "epoch": 0.14447617616113032, + "flos": 17347655548800.0, + "grad_norm": 3.2718749965830587, + "language_loss": 0.84679967, + "learning_rate": 3.864099047340673e-06, + "loss": 0.86900854, + "num_input_tokens_seen": 52056555, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.19714355, + "step": 2403, + "time_per_iteration": 2.4034388065338135 + }, + { + "auxiliary_loss_clip": 0.0117089, + "auxiliary_loss_mlp": 0.0105587, + "balance_loss_clip": 1.05942106, + "balance_loss_mlp": 1.03531814, + "epoch": 0.14453629941379828, + "flos": 24060185464320.0, + "grad_norm": 1.9072624556389401, + "language_loss": 0.6959523, + "learning_rate": 3.863957897994262e-06, + "loss": 0.71821988, + "num_input_tokens_seen": 52075800, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.20556641, + "step": 2404, + "time_per_iteration": 2.479419708251953 + }, + { + "auxiliary_loss_clip": 0.01163417, + "auxiliary_loss_mlp": 0.01051135, + "balance_loss_clip": 1.05686843, + "balance_loss_mlp": 1.03259778, + "epoch": 0.14459642266646625, + "flos": 14429554646400.0, + "grad_norm": 2.417017668133606, + "language_loss": 0.73926055, + "learning_rate": 3.863816677966381e-06, + "loss": 0.76140606, + "num_input_tokens_seen": 52092585, + "router_z_loss_clip": 1.06591797, + "router_z_loss_mlp": 0.18554688, + "step": 2405, + "time_per_iteration": 2.3946430683135986 + }, + { + "auxiliary_loss_clip": 0.01165145, + "auxiliary_loss_mlp": 0.01049816, + "balance_loss_clip": 1.05760145, + "balance_loss_mlp": 1.02990842, + "epoch": 0.14465654591913424, + "flos": 9867032179200.0, + "grad_norm": 2.4589812509701794, + "language_loss": 0.72710717, + "learning_rate": 3.863675387262386e-06, + "loss": 0.74925673, + "num_input_tokens_seen": 52108990, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.19909668, + "step": 2406, + "time_per_iteration": 2.416861057281494 + }, + { + "auxiliary_loss_clip": 0.01166397, + "auxiliary_loss_mlp": 0.01052623, + "balance_loss_clip": 1.05813956, + "balance_loss_mlp": 1.03152299, + "epoch": 0.1447166691718022, + "flos": 24972926987520.0, + "grad_norm": 2.294378704324732, + "language_loss": 0.75298107, + "learning_rate": 3.8635340258876325e-06, + "loss": 0.77517128, + "num_input_tokens_seen": 52125385, + "router_z_loss_clip": 1.08154297, + "router_z_loss_mlp": 0.21105957, + "step": 2407, + "time_per_iteration": 2.4848296642303467 + }, + { + "auxiliary_loss_clip": 0.0115993, + "auxiliary_loss_mlp": 0.0104651, + "balance_loss_clip": 1.05314469, + "balance_loss_mlp": 1.02763963, + "epoch": 0.14477679242447017, + "flos": 21908023200000.0, + "grad_norm": 2.160943348715536, + "language_loss": 0.79142153, + "learning_rate": 3.8633925938474826e-06, + "loss": 0.81348598, + "num_input_tokens_seen": 52144985, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.18884277, + "step": 2408, + "time_per_iteration": 2.4849672317504883 + }, + { + "auxiliary_loss_clip": 0.01171172, + "auxiliary_loss_mlp": 0.0105778, + "balance_loss_clip": 1.0608983, + "balance_loss_mlp": 1.03510666, + "epoch": 0.14483691567713813, + "flos": 20740746925440.0, + "grad_norm": 1.9699028105775895, + "language_loss": 0.82067633, + "learning_rate": 3.863251091147299e-06, + "loss": 0.84296584, + "num_input_tokens_seen": 52163885, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.22668457, + "step": 2409, + "time_per_iteration": 3.9806997776031494 + }, + { + "auxiliary_loss_clip": 0.01167762, + "auxiliary_loss_mlp": 0.01054739, + "balance_loss_clip": 1.05993378, + "balance_loss_mlp": 1.03531957, + "epoch": 0.1448970389298061, + "flos": 35407705536000.0, + "grad_norm": 2.1987313769601564, + "language_loss": 0.74489605, + "learning_rate": 3.863109517792446e-06, + "loss": 0.76712108, + "num_input_tokens_seen": 52184325, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.19421387, + "step": 2410, + "time_per_iteration": 2.5647571086883545 + }, + { + "auxiliary_loss_clip": 0.01161458, + "auxiliary_loss_mlp": 0.01043923, + "balance_loss_clip": 1.05558395, + "balance_loss_mlp": 1.02586293, + "epoch": 0.14495716218247406, + "flos": 15414368808960.0, + "grad_norm": 1.899137612562064, + "language_loss": 0.8125326, + "learning_rate": 3.8629678737882945e-06, + "loss": 0.83458638, + "num_input_tokens_seen": 52202740, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.18078613, + "step": 2411, + "time_per_iteration": 2.492675304412842 + }, + { + "auxiliary_loss_clip": 0.01168765, + "auxiliary_loss_mlp": 0.0105636, + "balance_loss_clip": 1.05952728, + "balance_loss_mlp": 1.03653574, + "epoch": 0.14501728543514203, + "flos": 33693222493440.0, + "grad_norm": 2.2268577757848753, + "language_loss": 0.70050901, + "learning_rate": 3.862826159140214e-06, + "loss": 0.72276032, + "num_input_tokens_seen": 52223100, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.19824219, + "step": 2412, + "time_per_iteration": 2.534196138381958 + }, + { + "auxiliary_loss_clip": 0.01168491, + "auxiliary_loss_mlp": 0.01042829, + "balance_loss_clip": 1.06227291, + "balance_loss_mlp": 1.023422, + "epoch": 0.14507740868781002, + "flos": 15596112648960.0, + "grad_norm": 2.2262563962408293, + "language_loss": 0.7679975, + "learning_rate": 3.862684373853579e-06, + "loss": 0.79011071, + "num_input_tokens_seen": 52239690, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.19396973, + "step": 2413, + "time_per_iteration": 3.7993576526641846 + }, + { + "auxiliary_loss_clip": 0.01096233, + "auxiliary_loss_mlp": 0.01013217, + "balance_loss_clip": 1.05665648, + "balance_loss_mlp": 1.01049864, + "epoch": 0.145137531940478, + "flos": 66675343438080.0, + "grad_norm": 0.9138939769871566, + "language_loss": 0.58855194, + "learning_rate": 3.8625425179337656e-06, + "loss": 0.60964644, + "num_input_tokens_seen": 52296705, + "router_z_loss_clip": 0.39550781, + "router_z_loss_mlp": 0.02722168, + "step": 2414, + "time_per_iteration": 2.98771071434021 + }, + { + "auxiliary_loss_clip": 0.01090384, + "auxiliary_loss_mlp": 0.01014246, + "balance_loss_clip": 1.05181003, + "balance_loss_mlp": 1.01170719, + "epoch": 0.14519765519314595, + "flos": 67521578929920.0, + "grad_norm": 0.8504381329280241, + "language_loss": 0.6221875, + "learning_rate": 3.862400591386154e-06, + "loss": 0.64323378, + "num_input_tokens_seen": 52361830, + "router_z_loss_clip": 0.38574219, + "router_z_loss_mlp": 0.02539062, + "step": 2415, + "time_per_iteration": 3.0795199871063232 + }, + { + "auxiliary_loss_clip": 0.01165377, + "auxiliary_loss_mlp": 0.01044828, + "balance_loss_clip": 1.05799544, + "balance_loss_mlp": 1.02444363, + "epoch": 0.14525777844581392, + "flos": 17198913329280.0, + "grad_norm": 2.6316420287314988, + "language_loss": 0.72375214, + "learning_rate": 3.8622585942161245e-06, + "loss": 0.74585414, + "num_input_tokens_seen": 52379420, + "router_z_loss_clip": 1.07373047, + "router_z_loss_mlp": 0.20410156, + "step": 2416, + "time_per_iteration": 2.4205548763275146 + }, + { + "auxiliary_loss_clip": 0.01090301, + "auxiliary_loss_mlp": 0.01027917, + "balance_loss_clip": 1.05136299, + "balance_loss_mlp": 1.02523804, + "epoch": 0.14531790169848188, + "flos": 65404609015680.0, + "grad_norm": 0.7145938732234098, + "language_loss": 0.60358107, + "learning_rate": 3.8621165264290635e-06, + "loss": 0.62476325, + "num_input_tokens_seen": 52446290, + "router_z_loss_clip": 0.38964844, + "router_z_loss_mlp": 0.02679443, + "step": 2417, + "time_per_iteration": 5.913217067718506 + }, + { + "auxiliary_loss_clip": 0.01165829, + "auxiliary_loss_mlp": 0.0105881, + "balance_loss_clip": 1.05549979, + "balance_loss_mlp": 1.03692317, + "epoch": 0.14537802495114985, + "flos": 32562467372160.0, + "grad_norm": 2.9551824245625595, + "language_loss": 0.79022777, + "learning_rate": 3.861974388030356e-06, + "loss": 0.81247413, + "num_input_tokens_seen": 52467295, + "router_z_loss_clip": 1.10400391, + "router_z_loss_mlp": 0.21887207, + "step": 2418, + "time_per_iteration": 2.5258398056030273 + }, + { + "auxiliary_loss_clip": 0.01189838, + "auxiliary_loss_mlp": 0.01055419, + "balance_loss_clip": 1.07965088, + "balance_loss_mlp": 1.03568935, + "epoch": 0.1454381482038178, + "flos": 20226685432320.0, + "grad_norm": 2.363230219264198, + "language_loss": 0.71571881, + "learning_rate": 3.861832179025394e-06, + "loss": 0.7381714, + "num_input_tokens_seen": 52487295, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.1973877, + "step": 2419, + "time_per_iteration": 2.5236008167266846 + }, + { + "auxiliary_loss_clip": 0.01168342, + "auxiliary_loss_mlp": 0.01047994, + "balance_loss_clip": 1.06146264, + "balance_loss_mlp": 1.02723956, + "epoch": 0.1454982714564858, + "flos": 22893124671360.0, + "grad_norm": 2.2512458325773848, + "language_loss": 0.89782596, + "learning_rate": 3.861689899419569e-06, + "loss": 0.91998923, + "num_input_tokens_seen": 52504220, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.20751953, + "step": 2420, + "time_per_iteration": 2.5003600120544434 + }, + { + "auxiliary_loss_clip": 0.01174931, + "auxiliary_loss_mlp": 0.01054301, + "balance_loss_clip": 1.06372058, + "balance_loss_mlp": 1.03503633, + "epoch": 0.14555839470915377, + "flos": 20229845829120.0, + "grad_norm": 1.9744537829673847, + "language_loss": 0.83146822, + "learning_rate": 3.861547549218276e-06, + "loss": 0.85376054, + "num_input_tokens_seen": 52521900, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.19262695, + "step": 2421, + "time_per_iteration": 2.459678888320923 + }, + { + "auxiliary_loss_clip": 0.01168458, + "auxiliary_loss_mlp": 0.01051275, + "balance_loss_clip": 1.05938828, + "balance_loss_mlp": 1.03109288, + "epoch": 0.14561851796182174, + "flos": 22236282616320.0, + "grad_norm": 2.163047307752886, + "language_loss": 0.81482565, + "learning_rate": 3.861405128426914e-06, + "loss": 0.83702302, + "num_input_tokens_seen": 52540495, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.20178223, + "step": 2422, + "time_per_iteration": 2.439898729324341 + }, + { + "auxiliary_loss_clip": 0.01083724, + "auxiliary_loss_mlp": 0.01015761, + "balance_loss_clip": 1.04489541, + "balance_loss_mlp": 1.01288176, + "epoch": 0.1456786412144897, + "flos": 52636786289280.0, + "grad_norm": 0.9159621678830908, + "language_loss": 0.63322538, + "learning_rate": 3.861262637050883e-06, + "loss": 0.65422022, + "num_input_tokens_seen": 52603305, + "router_z_loss_clip": 0.38769531, + "router_z_loss_mlp": 0.02880859, + "step": 2423, + "time_per_iteration": 3.094573497772217 + }, + { + "auxiliary_loss_clip": 0.01168332, + "auxiliary_loss_mlp": 0.01050557, + "balance_loss_clip": 1.06196785, + "balance_loss_mlp": 1.03221047, + "epoch": 0.14573876446715767, + "flos": 23221671396480.0, + "grad_norm": 1.6959147588231225, + "language_loss": 0.82755619, + "learning_rate": 3.861120075095585e-06, + "loss": 0.84974509, + "num_input_tokens_seen": 52623435, + "router_z_loss_clip": 1.06396484, + "router_z_loss_mlp": 0.18347168, + "step": 2424, + "time_per_iteration": 2.5667080879211426 + }, + { + "auxiliary_loss_clip": 0.01164615, + "auxiliary_loss_mlp": 0.01049734, + "balance_loss_clip": 1.05817115, + "balance_loss_mlp": 1.03052974, + "epoch": 0.14579888771982563, + "flos": 18114384286080.0, + "grad_norm": 2.4073544990047457, + "language_loss": 0.78717506, + "learning_rate": 3.860977442566429e-06, + "loss": 0.80931854, + "num_input_tokens_seen": 52642255, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.1920166, + "step": 2425, + "time_per_iteration": 2.4832358360290527 + }, + { + "auxiliary_loss_clip": 0.01165735, + "auxiliary_loss_mlp": 0.01047034, + "balance_loss_clip": 1.05900621, + "balance_loss_mlp": 1.02759087, + "epoch": 0.14585901097249362, + "flos": 23001107932800.0, + "grad_norm": 2.5949409640221717, + "language_loss": 0.83825338, + "learning_rate": 3.860834739468821e-06, + "loss": 0.86038107, + "num_input_tokens_seen": 52658700, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.19445801, + "step": 2426, + "time_per_iteration": 2.4450485706329346 + }, + { + "auxiliary_loss_clip": 0.01165565, + "auxiliary_loss_mlp": 0.01046973, + "balance_loss_clip": 1.05976892, + "balance_loss_mlp": 1.02831721, + "epoch": 0.1459191342251616, + "flos": 21908669644800.0, + "grad_norm": 1.8806940769876639, + "language_loss": 0.87204492, + "learning_rate": 3.860691965808173e-06, + "loss": 0.89417028, + "num_input_tokens_seen": 52678140, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.18664551, + "step": 2427, + "time_per_iteration": 2.464453935623169 + }, + { + "auxiliary_loss_clip": 0.01169269, + "auxiliary_loss_mlp": 0.01050182, + "balance_loss_clip": 1.05865455, + "balance_loss_mlp": 1.02990437, + "epoch": 0.14597925747782955, + "flos": 14975504438400.0, + "grad_norm": 2.402953771354378, + "language_loss": 0.66905218, + "learning_rate": 3.8605491215899e-06, + "loss": 0.69124675, + "num_input_tokens_seen": 52696825, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.20263672, + "step": 2428, + "time_per_iteration": 2.4174811840057373 + }, + { + "auxiliary_loss_clip": 0.01166574, + "auxiliary_loss_mlp": 0.01047925, + "balance_loss_clip": 1.05883396, + "balance_loss_mlp": 1.02872074, + "epoch": 0.14603938073049752, + "flos": 21068898600960.0, + "grad_norm": 1.8246504290639807, + "language_loss": 0.83556759, + "learning_rate": 3.860406206819417e-06, + "loss": 0.85771263, + "num_input_tokens_seen": 52715125, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.1920166, + "step": 2429, + "time_per_iteration": 2.5015780925750732 + }, + { + "auxiliary_loss_clip": 0.0117183, + "auxiliary_loss_mlp": 0.01045343, + "balance_loss_clip": 1.06275415, + "balance_loss_mlp": 1.02704489, + "epoch": 0.14609950398316549, + "flos": 19864777950720.0, + "grad_norm": 1.7657092188738033, + "language_loss": 0.79013526, + "learning_rate": 3.860263221502145e-06, + "loss": 0.812307, + "num_input_tokens_seen": 52734015, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.1829834, + "step": 2430, + "time_per_iteration": 2.4183273315429688 + }, + { + "auxiliary_loss_clip": 0.01169242, + "auxiliary_loss_mlp": 0.01072688, + "balance_loss_clip": 1.06011248, + "balance_loss_mlp": 1.05142164, + "epoch": 0.14615962723583345, + "flos": 22418852469120.0, + "grad_norm": 2.222953162719317, + "language_loss": 0.83472204, + "learning_rate": 3.860120165643504e-06, + "loss": 0.85714126, + "num_input_tokens_seen": 52753025, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.21276855, + "step": 2431, + "time_per_iteration": 2.485076904296875 + }, + { + "auxiliary_loss_clip": 0.01172699, + "auxiliary_loss_mlp": 0.01051791, + "balance_loss_clip": 1.06077337, + "balance_loss_mlp": 1.03185952, + "epoch": 0.14621975048850142, + "flos": 22346241125760.0, + "grad_norm": 1.7727922609248594, + "language_loss": 0.79202014, + "learning_rate": 3.859977039248921e-06, + "loss": 0.81426507, + "num_input_tokens_seen": 52773420, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.19921875, + "step": 2432, + "time_per_iteration": 2.430673599243164 + }, + { + "auxiliary_loss_clip": 0.01173809, + "auxiliary_loss_mlp": 0.01057817, + "balance_loss_clip": 1.06258762, + "balance_loss_mlp": 1.03758764, + "epoch": 0.1462798737411694, + "flos": 24389163152640.0, + "grad_norm": 1.9873582742454723, + "language_loss": 0.79534549, + "learning_rate": 3.859833842323822e-06, + "loss": 0.81766176, + "num_input_tokens_seen": 52792870, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.20251465, + "step": 2433, + "time_per_iteration": 2.499192714691162 + }, + { + "auxiliary_loss_clip": 0.01166085, + "auxiliary_loss_mlp": 0.01049957, + "balance_loss_clip": 1.06221342, + "balance_loss_mlp": 1.03014469, + "epoch": 0.14633999699383737, + "flos": 19244672530560.0, + "grad_norm": 3.518702036000044, + "language_loss": 0.77894855, + "learning_rate": 3.859690574873638e-06, + "loss": 0.80110896, + "num_input_tokens_seen": 52811615, + "router_z_loss_clip": 1.03808594, + "router_z_loss_mlp": 0.19812012, + "step": 2434, + "time_per_iteration": 2.426445960998535 + }, + { + "auxiliary_loss_clip": 0.010974, + "auxiliary_loss_mlp": 0.01034383, + "balance_loss_clip": 1.05835211, + "balance_loss_mlp": 1.03177857, + "epoch": 0.14640012024650534, + "flos": 62660638270080.0, + "grad_norm": 0.8687711333041203, + "language_loss": 0.58429754, + "learning_rate": 3.8595472369038e-06, + "loss": 0.60561538, + "num_input_tokens_seen": 52873230, + "router_z_loss_clip": 0.39013672, + "router_z_loss_mlp": 0.02606201, + "step": 2435, + "time_per_iteration": 3.0733470916748047 + }, + { + "auxiliary_loss_clip": 0.01168243, + "auxiliary_loss_mlp": 0.01044538, + "balance_loss_clip": 1.06356716, + "balance_loss_mlp": 1.02627492, + "epoch": 0.1464602434991733, + "flos": 12276243146880.0, + "grad_norm": 2.385607014372293, + "language_loss": 0.88402081, + "learning_rate": 3.859403828419744e-06, + "loss": 0.90614855, + "num_input_tokens_seen": 52889325, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.18249512, + "step": 2436, + "time_per_iteration": 2.419828414916992 + }, + { + "auxiliary_loss_clip": 0.01170389, + "auxiliary_loss_mlp": 0.0104542, + "balance_loss_clip": 1.06006622, + "balance_loss_mlp": 1.02597702, + "epoch": 0.14652036675184127, + "flos": 20922311197440.0, + "grad_norm": 2.753112390007458, + "language_loss": 0.74813825, + "learning_rate": 3.85926034942691e-06, + "loss": 0.77029634, + "num_input_tokens_seen": 52909705, + "router_z_loss_clip": 1.10595703, + "router_z_loss_mlp": 0.19458008, + "step": 2437, + "time_per_iteration": 2.4684979915618896 + }, + { + "auxiliary_loss_clip": 0.01169286, + "auxiliary_loss_mlp": 0.01049331, + "balance_loss_clip": 1.05989945, + "balance_loss_mlp": 1.02720547, + "epoch": 0.14658049000450923, + "flos": 27703681528320.0, + "grad_norm": 2.2357007636938864, + "language_loss": 0.73378903, + "learning_rate": 3.859116799930736e-06, + "loss": 0.75597525, + "num_input_tokens_seen": 52930300, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.22143555, + "step": 2438, + "time_per_iteration": 2.4687278270721436 + }, + { + "auxiliary_loss_clip": 0.01171655, + "auxiliary_loss_mlp": 0.01042347, + "balance_loss_clip": 1.06518579, + "balance_loss_mlp": 1.02380955, + "epoch": 0.14664061325717723, + "flos": 24936513575040.0, + "grad_norm": 1.7788870887396497, + "language_loss": 0.74400258, + "learning_rate": 3.858973179936668e-06, + "loss": 0.76614261, + "num_input_tokens_seen": 52949955, + "router_z_loss_clip": 1.06591797, + "router_z_loss_mlp": 0.1854248, + "step": 2439, + "time_per_iteration": 2.5181453227996826 + }, + { + "auxiliary_loss_clip": 0.011711, + "auxiliary_loss_mlp": 0.01053903, + "balance_loss_clip": 1.0621953, + "balance_loss_mlp": 1.03355443, + "epoch": 0.1467007365098452, + "flos": 40297661406720.0, + "grad_norm": 1.8923869100928878, + "language_loss": 0.74494785, + "learning_rate": 3.85882948945015e-06, + "loss": 0.76719791, + "num_input_tokens_seen": 52972905, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.20336914, + "step": 2440, + "time_per_iteration": 2.6048741340637207 + }, + { + "auxiliary_loss_clip": 0.01175511, + "auxiliary_loss_mlp": 0.0105174, + "balance_loss_clip": 1.06896925, + "balance_loss_mlp": 1.03296459, + "epoch": 0.14676085976251316, + "flos": 26541074021760.0, + "grad_norm": 1.8251664588397574, + "language_loss": 0.83058989, + "learning_rate": 3.85868572847663e-06, + "loss": 0.85286236, + "num_input_tokens_seen": 52994850, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.18786621, + "step": 2441, + "time_per_iteration": 2.5535383224487305 + }, + { + "auxiliary_loss_clip": 0.01178176, + "auxiliary_loss_mlp": 0.01051376, + "balance_loss_clip": 1.06289887, + "balance_loss_mlp": 1.03063345, + "epoch": 0.14682098301518112, + "flos": 23550110380800.0, + "grad_norm": 57.39810442347842, + "language_loss": 0.72200751, + "learning_rate": 3.858541897021563e-06, + "loss": 0.74430299, + "num_input_tokens_seen": 53014740, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.20751953, + "step": 2442, + "time_per_iteration": 2.448782444000244 + }, + { + "auxiliary_loss_clip": 0.01184016, + "auxiliary_loss_mlp": 0.01049789, + "balance_loss_clip": 1.06928742, + "balance_loss_mlp": 1.03022671, + "epoch": 0.1468811062678491, + "flos": 11651073909120.0, + "grad_norm": 8.774879653370402, + "language_loss": 0.8069573, + "learning_rate": 3.8583979950904e-06, + "loss": 0.8292954, + "num_input_tokens_seen": 53029780, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.19543457, + "step": 2443, + "time_per_iteration": 2.50213885307312 + }, + { + "auxiliary_loss_clip": 0.01188965, + "auxiliary_loss_mlp": 0.01055551, + "balance_loss_clip": 1.07959437, + "balance_loss_mlp": 1.03603613, + "epoch": 0.14694122952051705, + "flos": 23002616304000.0, + "grad_norm": 1.8794743816954418, + "language_loss": 0.8326292, + "learning_rate": 3.858254022688599e-06, + "loss": 0.85507441, + "num_input_tokens_seen": 53048620, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.19519043, + "step": 2444, + "time_per_iteration": 2.461459159851074 + }, + { + "auxiliary_loss_clip": 0.0116847, + "auxiliary_loss_mlp": 0.0104988, + "balance_loss_clip": 1.06031859, + "balance_loss_mlp": 1.03122425, + "epoch": 0.14700135277318502, + "flos": 26502972670080.0, + "grad_norm": 2.189590572993196, + "language_loss": 0.70907176, + "learning_rate": 3.85810997982162e-06, + "loss": 0.73125523, + "num_input_tokens_seen": 53070055, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.18652344, + "step": 2445, + "time_per_iteration": 2.473029851913452 + }, + { + "auxiliary_loss_clip": 0.01079457, + "auxiliary_loss_mlp": 0.01028369, + "balance_loss_clip": 1.04032421, + "balance_loss_mlp": 1.02586901, + "epoch": 0.147061476025853, + "flos": 59449434387840.0, + "grad_norm": 0.8319721554400834, + "language_loss": 0.6309582, + "learning_rate": 3.857965866494923e-06, + "loss": 0.65203649, + "num_input_tokens_seen": 53126945, + "router_z_loss_clip": 0.39111328, + "router_z_loss_mlp": 0.0249939, + "step": 2446, + "time_per_iteration": 2.9502007961273193 + }, + { + "auxiliary_loss_clip": 0.01169921, + "auxiliary_loss_mlp": 0.0105004, + "balance_loss_clip": 1.06143785, + "balance_loss_mlp": 1.03028679, + "epoch": 0.14712159927852098, + "flos": 28330897841280.0, + "grad_norm": 1.8045239739796275, + "language_loss": 0.74917579, + "learning_rate": 3.857821682713975e-06, + "loss": 0.77137536, + "num_input_tokens_seen": 53149130, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.19750977, + "step": 2447, + "time_per_iteration": 2.498321533203125 + }, + { + "auxiliary_loss_clip": 0.01164781, + "auxiliary_loss_mlp": 0.01042603, + "balance_loss_clip": 1.05630231, + "balance_loss_mlp": 1.02391124, + "epoch": 0.14718172253118894, + "flos": 27089825074560.0, + "grad_norm": 2.1901974151482855, + "language_loss": 0.84860885, + "learning_rate": 3.857677428484242e-06, + "loss": 0.87068272, + "num_input_tokens_seen": 53167120, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.18688965, + "step": 2448, + "time_per_iteration": 2.4766108989715576 + }, + { + "auxiliary_loss_clip": 0.01090606, + "auxiliary_loss_mlp": 0.01003522, + "balance_loss_clip": 1.05230916, + "balance_loss_mlp": 1.00105155, + "epoch": 0.1472418457838569, + "flos": 66706764860160.0, + "grad_norm": 0.762893014056097, + "language_loss": 0.56827861, + "learning_rate": 3.857533103811195e-06, + "loss": 0.58921993, + "num_input_tokens_seen": 53227945, + "router_z_loss_clip": 0.38232422, + "router_z_loss_mlp": 0.02468872, + "step": 2449, + "time_per_iteration": 3.0637433528900146 + }, + { + "auxiliary_loss_clip": 0.01163662, + "auxiliary_loss_mlp": 0.01057425, + "balance_loss_clip": 1.05737519, + "balance_loss_mlp": 1.03535938, + "epoch": 0.14730196903652487, + "flos": 19573578391680.0, + "grad_norm": 2.163035987985509, + "language_loss": 0.85228699, + "learning_rate": 3.857388708700307e-06, + "loss": 0.87449789, + "num_input_tokens_seen": 53244615, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.2208252, + "step": 2450, + "time_per_iteration": 2.4631309509277344 + }, + { + "auxiliary_loss_clip": 0.01165218, + "auxiliary_loss_mlp": 0.01056073, + "balance_loss_clip": 1.05634737, + "balance_loss_mlp": 1.03627193, + "epoch": 0.14736209228919284, + "flos": 16071031296000.0, + "grad_norm": 1.8614282677138947, + "language_loss": 0.74827969, + "learning_rate": 3.857244243157052e-06, + "loss": 0.77049255, + "num_input_tokens_seen": 53262205, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.19812012, + "step": 2451, + "time_per_iteration": 2.4283676147460938 + }, + { + "auxiliary_loss_clip": 0.01162035, + "auxiliary_loss_mlp": 0.01044531, + "balance_loss_clip": 1.05786276, + "balance_loss_mlp": 1.02664971, + "epoch": 0.1474222155418608, + "flos": 23039460679680.0, + "grad_norm": 1.6756183444490966, + "language_loss": 0.82338721, + "learning_rate": 3.85709970718691e-06, + "loss": 0.8454529, + "num_input_tokens_seen": 53282445, + "router_z_loss_clip": 1.04248047, + "router_z_loss_mlp": 0.17895508, + "step": 2452, + "time_per_iteration": 2.447007417678833 + }, + { + "auxiliary_loss_clip": 0.01166513, + "auxiliary_loss_mlp": 0.0104307, + "balance_loss_clip": 1.06153607, + "balance_loss_mlp": 1.02564168, + "epoch": 0.1474823387945288, + "flos": 17018641946880.0, + "grad_norm": 2.134659893524917, + "language_loss": 0.7430256, + "learning_rate": 3.856955100795361e-06, + "loss": 0.76512146, + "num_input_tokens_seen": 53299060, + "router_z_loss_clip": 1.05029297, + "router_z_loss_mlp": 0.17431641, + "step": 2453, + "time_per_iteration": 2.4466917514801025 + }, + { + "auxiliary_loss_clip": 0.01160578, + "auxiliary_loss_mlp": 0.01053998, + "balance_loss_clip": 1.05221629, + "balance_loss_mlp": 1.03479362, + "epoch": 0.14754246204719676, + "flos": 17895041884800.0, + "grad_norm": 3.579305661477172, + "language_loss": 0.76185441, + "learning_rate": 3.856810423987889e-06, + "loss": 0.78400016, + "num_input_tokens_seen": 53315970, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.19189453, + "step": 2454, + "time_per_iteration": 3.9612064361572266 + }, + { + "auxiliary_loss_clip": 0.01164185, + "auxiliary_loss_mlp": 0.01050309, + "balance_loss_clip": 1.05509591, + "balance_loss_mlp": 1.03139019, + "epoch": 0.14760258529986472, + "flos": 13079097987840.0, + "grad_norm": 2.4321106609544896, + "language_loss": 0.83226228, + "learning_rate": 3.856665676769979e-06, + "loss": 0.85440725, + "num_input_tokens_seen": 53332940, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.18908691, + "step": 2455, + "time_per_iteration": 2.4383187294006348 + }, + { + "auxiliary_loss_clip": 0.01168608, + "auxiliary_loss_mlp": 0.01063368, + "balance_loss_clip": 1.05508614, + "balance_loss_mlp": 1.04456902, + "epoch": 0.1476627085525327, + "flos": 30806399358720.0, + "grad_norm": 2.4899637014173877, + "language_loss": 0.84041643, + "learning_rate": 3.85652085914712e-06, + "loss": 0.86273623, + "num_input_tokens_seen": 53353295, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.18811035, + "step": 2456, + "time_per_iteration": 3.9317924976348877 + }, + { + "auxiliary_loss_clip": 0.01161114, + "auxiliary_loss_mlp": 0.01044773, + "balance_loss_clip": 1.05695021, + "balance_loss_mlp": 1.02648664, + "epoch": 0.14772283180520066, + "flos": 21689434984320.0, + "grad_norm": 1.8512095514522753, + "language_loss": 0.84251118, + "learning_rate": 3.856375971124805e-06, + "loss": 0.86457002, + "num_input_tokens_seen": 53373410, + "router_z_loss_clip": 1.04199219, + "router_z_loss_mlp": 0.18286133, + "step": 2457, + "time_per_iteration": 2.497030735015869 + }, + { + "auxiliary_loss_clip": 0.01160065, + "auxiliary_loss_mlp": 0.01045305, + "balance_loss_clip": 1.05777502, + "balance_loss_mlp": 1.02729249, + "epoch": 0.14778295505786862, + "flos": 18770400328320.0, + "grad_norm": 1.8679026414287943, + "language_loss": 0.75232172, + "learning_rate": 3.856231012708527e-06, + "loss": 0.77437544, + "num_input_tokens_seen": 53391430, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.18017578, + "step": 2458, + "time_per_iteration": 2.444739818572998 + }, + { + "auxiliary_loss_clip": 0.01170239, + "auxiliary_loss_mlp": 0.01048629, + "balance_loss_clip": 1.05723703, + "balance_loss_mlp": 1.02862549, + "epoch": 0.1478430783105366, + "flos": 22893555634560.0, + "grad_norm": 2.036887916846798, + "language_loss": 0.83583885, + "learning_rate": 3.856085983903782e-06, + "loss": 0.85802758, + "num_input_tokens_seen": 53409960, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.19995117, + "step": 2459, + "time_per_iteration": 2.4701976776123047 + }, + { + "auxiliary_loss_clip": 0.01154883, + "auxiliary_loss_mlp": 0.01041402, + "balance_loss_clip": 1.05299306, + "balance_loss_mlp": 1.02355695, + "epoch": 0.14790320156320458, + "flos": 15085319293440.0, + "grad_norm": 2.1162929402863555, + "language_loss": 0.75091201, + "learning_rate": 3.855940884716071e-06, + "loss": 0.77287483, + "num_input_tokens_seen": 53426160, + "router_z_loss_clip": 1.01806641, + "router_z_loss_mlp": 0.17834473, + "step": 2460, + "time_per_iteration": 3.84110164642334 + }, + { + "auxiliary_loss_clip": 0.01169155, + "auxiliary_loss_mlp": 0.01048012, + "balance_loss_clip": 1.05876851, + "balance_loss_mlp": 1.0282228, + "epoch": 0.14796332481587254, + "flos": 26504768350080.0, + "grad_norm": 2.0114439608918384, + "language_loss": 0.8150003, + "learning_rate": 3.855795715150896e-06, + "loss": 0.83717197, + "num_input_tokens_seen": 53448530, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.19775391, + "step": 2461, + "time_per_iteration": 3.94620680809021 + }, + { + "auxiliary_loss_clip": 0.01173147, + "auxiliary_loss_mlp": 0.01049894, + "balance_loss_clip": 1.06514168, + "balance_loss_mlp": 1.0296042, + "epoch": 0.1480234480685405, + "flos": 17563191108480.0, + "grad_norm": 2.863305051974071, + "language_loss": 0.65978456, + "learning_rate": 3.855650475213761e-06, + "loss": 0.68201494, + "num_input_tokens_seen": 53465915, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.20288086, + "step": 2462, + "time_per_iteration": 2.4324138164520264 + }, + { + "auxiliary_loss_clip": 0.01162356, + "auxiliary_loss_mlp": 0.0104772, + "balance_loss_clip": 1.05624044, + "balance_loss_mlp": 1.02723956, + "epoch": 0.14808357132120847, + "flos": 53582203232640.0, + "grad_norm": 1.4487989879497212, + "language_loss": 0.67102504, + "learning_rate": 3.8555051649101745e-06, + "loss": 0.69312578, + "num_input_tokens_seen": 53496055, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.20483398, + "step": 2463, + "time_per_iteration": 2.7787094116210938 + }, + { + "auxiliary_loss_clip": 0.01164195, + "auxiliary_loss_mlp": 0.01050048, + "balance_loss_clip": 1.05602694, + "balance_loss_mlp": 1.02973449, + "epoch": 0.14814369457387644, + "flos": 19829190551040.0, + "grad_norm": 1.914331733695736, + "language_loss": 0.76568234, + "learning_rate": 3.855359784245646e-06, + "loss": 0.78782481, + "num_input_tokens_seen": 53513790, + "router_z_loss_clip": 1.08056641, + "router_z_loss_mlp": 0.203125, + "step": 2464, + "time_per_iteration": 2.474991798400879 + }, + { + "auxiliary_loss_clip": 0.01161674, + "auxiliary_loss_mlp": 0.01047027, + "balance_loss_clip": 1.05746198, + "balance_loss_mlp": 1.02946758, + "epoch": 0.1482038178265444, + "flos": 23914962777600.0, + "grad_norm": 1.7067136118580757, + "language_loss": 0.79882264, + "learning_rate": 3.855214333225688e-06, + "loss": 0.82090974, + "num_input_tokens_seen": 53533410, + "router_z_loss_clip": 1.04296875, + "router_z_loss_mlp": 0.17578125, + "step": 2465, + "time_per_iteration": 2.4758694171905518 + }, + { + "auxiliary_loss_clip": 0.01172257, + "auxiliary_loss_mlp": 0.01045055, + "balance_loss_clip": 1.06137395, + "balance_loss_mlp": 1.02456307, + "epoch": 0.1482639410792124, + "flos": 24170503109760.0, + "grad_norm": 1.5008770596423646, + "language_loss": 0.76024944, + "learning_rate": 3.855068811855817e-06, + "loss": 0.78242254, + "num_input_tokens_seen": 53554775, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.20483398, + "step": 2466, + "time_per_iteration": 2.487135887145996 + }, + { + "auxiliary_loss_clip": 0.0106999, + "auxiliary_loss_mlp": 0.01020987, + "balance_loss_clip": 1.033463, + "balance_loss_mlp": 1.01766658, + "epoch": 0.14832406433188036, + "flos": 66191051341440.0, + "grad_norm": 0.7860806934226963, + "language_loss": 0.60057843, + "learning_rate": 3.854923220141551e-06, + "loss": 0.62148815, + "num_input_tokens_seen": 53609675, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.03323364, + "step": 2467, + "time_per_iteration": 3.0679588317871094 + }, + { + "auxiliary_loss_clip": 0.01161949, + "auxiliary_loss_mlp": 0.01049662, + "balance_loss_clip": 1.05673075, + "balance_loss_mlp": 1.02942014, + "epoch": 0.14838418758454833, + "flos": 25411252654080.0, + "grad_norm": 2.047683367949014, + "language_loss": 0.8768639, + "learning_rate": 3.85477755808841e-06, + "loss": 0.89898002, + "num_input_tokens_seen": 53626950, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.20251465, + "step": 2468, + "time_per_iteration": 2.5289344787597656 + }, + { + "auxiliary_loss_clip": 0.0116599, + "auxiliary_loss_mlp": 0.01050553, + "balance_loss_clip": 1.0584923, + "balance_loss_mlp": 1.02843964, + "epoch": 0.1484443108372163, + "flos": 23289901280640.0, + "grad_norm": 2.330904234345528, + "language_loss": 0.75773251, + "learning_rate": 3.854631825701919e-06, + "loss": 0.77989799, + "num_input_tokens_seen": 53644200, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.22106934, + "step": 2469, + "time_per_iteration": 2.462374687194824 + }, + { + "auxiliary_loss_clip": 0.01163037, + "auxiliary_loss_mlp": 0.01039699, + "balance_loss_clip": 1.05870605, + "balance_loss_mlp": 1.02086389, + "epoch": 0.14850443408988426, + "flos": 14647675985280.0, + "grad_norm": 3.983907632186559, + "language_loss": 0.7573936, + "learning_rate": 3.854486022987603e-06, + "loss": 0.77942091, + "num_input_tokens_seen": 53659650, + "router_z_loss_clip": 1.04492188, + "router_z_loss_mlp": 0.18823242, + "step": 2470, + "time_per_iteration": 2.434196949005127 + }, + { + "auxiliary_loss_clip": 0.0116234, + "auxiliary_loss_mlp": 0.01048013, + "balance_loss_clip": 1.05894566, + "balance_loss_mlp": 1.02888036, + "epoch": 0.14856455734255222, + "flos": 23548314700800.0, + "grad_norm": 2.0455690837194664, + "language_loss": 0.720586, + "learning_rate": 3.8543401499509905e-06, + "loss": 0.74268955, + "num_input_tokens_seen": 53680275, + "router_z_loss_clip": 1.03369141, + "router_z_loss_mlp": 0.19128418, + "step": 2471, + "time_per_iteration": 2.4614903926849365 + }, + { + "auxiliary_loss_clip": 0.01166616, + "auxiliary_loss_mlp": 0.01048297, + "balance_loss_clip": 1.05638099, + "balance_loss_mlp": 1.02818584, + "epoch": 0.1486246805952202, + "flos": 18077288515200.0, + "grad_norm": 43.21423494107486, + "language_loss": 0.89427948, + "learning_rate": 3.854194206597615e-06, + "loss": 0.91642869, + "num_input_tokens_seen": 53698270, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.20117188, + "step": 2472, + "time_per_iteration": 2.456617593765259 + }, + { + "auxiliary_loss_clip": 0.01163713, + "auxiliary_loss_mlp": 0.01057201, + "balance_loss_clip": 1.05529308, + "balance_loss_mlp": 1.03594542, + "epoch": 0.14868480384788818, + "flos": 19353625459200.0, + "grad_norm": 2.432117795572625, + "language_loss": 0.80350888, + "learning_rate": 3.854048192933008e-06, + "loss": 0.82571799, + "num_input_tokens_seen": 53716845, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.21252441, + "step": 2473, + "time_per_iteration": 2.474792957305908 + }, + { + "auxiliary_loss_clip": 0.01167586, + "auxiliary_loss_mlp": 0.0106001, + "balance_loss_clip": 1.05705106, + "balance_loss_mlp": 1.04034007, + "epoch": 0.14874492710055615, + "flos": 22200192426240.0, + "grad_norm": 2.799690824538103, + "language_loss": 0.7755754, + "learning_rate": 3.853902108962709e-06, + "loss": 0.79785138, + "num_input_tokens_seen": 53734970, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.19665527, + "step": 2474, + "time_per_iteration": 2.5624027252197266 + }, + { + "auxiliary_loss_clip": 0.01168834, + "auxiliary_loss_mlp": 0.01059069, + "balance_loss_clip": 1.05680037, + "balance_loss_mlp": 1.03695583, + "epoch": 0.1488050503532241, + "flos": 21103444506240.0, + "grad_norm": 1.9041094176536477, + "language_loss": 0.82620108, + "learning_rate": 3.853755954692255e-06, + "loss": 0.84848011, + "num_input_tokens_seen": 53753415, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.22094727, + "step": 2475, + "time_per_iteration": 2.500009775161743 + }, + { + "auxiliary_loss_clip": 0.01171427, + "auxiliary_loss_mlp": 0.01053136, + "balance_loss_clip": 1.06440282, + "balance_loss_mlp": 1.03444433, + "epoch": 0.14886517360589208, + "flos": 12786569625600.0, + "grad_norm": 2.3429465628210946, + "language_loss": 0.80753356, + "learning_rate": 3.85360973012719e-06, + "loss": 0.82977921, + "num_input_tokens_seen": 53770305, + "router_z_loss_clip": 1.06884766, + "router_z_loss_mlp": 0.18688965, + "step": 2476, + "time_per_iteration": 2.531339645385742 + }, + { + "auxiliary_loss_clip": 0.01161526, + "auxiliary_loss_mlp": 0.01046247, + "balance_loss_clip": 1.05972874, + "balance_loss_mlp": 1.02827013, + "epoch": 0.14892529685856004, + "flos": 29022860419200.0, + "grad_norm": 1.842290843391067, + "language_loss": 0.78042185, + "learning_rate": 3.853463435273058e-06, + "loss": 0.80249959, + "num_input_tokens_seen": 53788895, + "router_z_loss_clip": 1.01660156, + "router_z_loss_mlp": 0.1796875, + "step": 2477, + "time_per_iteration": 2.513329029083252 + }, + { + "auxiliary_loss_clip": 0.01110998, + "auxiliary_loss_mlp": 0.01014726, + "balance_loss_clip": 1.07260001, + "balance_loss_mlp": 1.01209164, + "epoch": 0.148985420111228, + "flos": 61926121054080.0, + "grad_norm": 0.8081501048461297, + "language_loss": 0.60123229, + "learning_rate": 3.853317070135407e-06, + "loss": 0.62248951, + "num_input_tokens_seen": 53850260, + "router_z_loss_clip": 0.38378906, + "router_z_loss_mlp": 0.02633667, + "step": 2478, + "time_per_iteration": 3.1416540145874023 + }, + { + "auxiliary_loss_clip": 0.01160695, + "auxiliary_loss_mlp": 0.01052331, + "balance_loss_clip": 1.05545974, + "balance_loss_mlp": 1.03353143, + "epoch": 0.149045543363896, + "flos": 23915106432000.0, + "grad_norm": 2.1101496155023836, + "language_loss": 0.71247554, + "learning_rate": 3.853170634719787e-06, + "loss": 0.73460585, + "num_input_tokens_seen": 53867520, + "router_z_loss_clip": 1.05322266, + "router_z_loss_mlp": 0.18823242, + "step": 2479, + "time_per_iteration": 2.4710378646850586 + }, + { + "auxiliary_loss_clip": 0.01160348, + "auxiliary_loss_mlp": 0.01061668, + "balance_loss_clip": 1.05450046, + "balance_loss_mlp": 1.03978157, + "epoch": 0.14910566661656396, + "flos": 23654394541440.0, + "grad_norm": 1.9537440977414944, + "language_loss": 0.80789876, + "learning_rate": 3.853024129031751e-06, + "loss": 0.83011889, + "num_input_tokens_seen": 53886620, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.21875, + "step": 2480, + "time_per_iteration": 2.476921319961548 + }, + { + "auxiliary_loss_clip": 0.01164445, + "auxiliary_loss_mlp": 0.01058261, + "balance_loss_clip": 1.0542891, + "balance_loss_mlp": 1.03568256, + "epoch": 0.14916578986923193, + "flos": 20515299212160.0, + "grad_norm": 1.8638661583367733, + "language_loss": 0.84177351, + "learning_rate": 3.852877553076854e-06, + "loss": 0.86400056, + "num_input_tokens_seen": 53902230, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.22583008, + "step": 2481, + "time_per_iteration": 2.428948163986206 + }, + { + "auxiliary_loss_clip": 0.011646, + "auxiliary_loss_mlp": 0.01052895, + "balance_loss_clip": 1.0577805, + "balance_loss_mlp": 1.03243911, + "epoch": 0.1492259131218999, + "flos": 22491822948480.0, + "grad_norm": 2.117316466097552, + "language_loss": 0.77692747, + "learning_rate": 3.8527309068606546e-06, + "loss": 0.79910243, + "num_input_tokens_seen": 53919475, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.20458984, + "step": 2482, + "time_per_iteration": 2.472700834274292 + }, + { + "auxiliary_loss_clip": 0.01176069, + "auxiliary_loss_mlp": 0.01042589, + "balance_loss_clip": 1.06353641, + "balance_loss_mlp": 1.02181077, + "epoch": 0.14928603637456786, + "flos": 23185868515200.0, + "grad_norm": 1.9791712836747988, + "language_loss": 0.78899062, + "learning_rate": 3.852584190388713e-06, + "loss": 0.81117719, + "num_input_tokens_seen": 53939150, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.2076416, + "step": 2483, + "time_per_iteration": 2.4636571407318115 + }, + { + "auxiliary_loss_clip": 0.01172354, + "auxiliary_loss_mlp": 0.01046166, + "balance_loss_clip": 1.06825149, + "balance_loss_mlp": 1.02909553, + "epoch": 0.14934615962723582, + "flos": 21653237053440.0, + "grad_norm": 1.806543576728995, + "language_loss": 0.70342994, + "learning_rate": 3.852437403666595e-06, + "loss": 0.72561514, + "num_input_tokens_seen": 53958735, + "router_z_loss_clip": 1.04052734, + "router_z_loss_mlp": 0.1706543, + "step": 2484, + "time_per_iteration": 2.483806848526001 + }, + { + "auxiliary_loss_clip": 0.0117159, + "auxiliary_loss_mlp": 0.01044414, + "balance_loss_clip": 1.06191826, + "balance_loss_mlp": 1.02398181, + "epoch": 0.1494062828799038, + "flos": 27010066924800.0, + "grad_norm": 9.429366210159753, + "language_loss": 0.84488142, + "learning_rate": 3.852290546699863e-06, + "loss": 0.86704147, + "num_input_tokens_seen": 53975065, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.20446777, + "step": 2485, + "time_per_iteration": 2.450007438659668 + }, + { + "auxiliary_loss_clip": 0.01174794, + "auxiliary_loss_mlp": 0.0104544, + "balance_loss_clip": 1.06497478, + "balance_loss_mlp": 1.02604461, + "epoch": 0.14946640613257178, + "flos": 21214947300480.0, + "grad_norm": 6.2343907589983525, + "language_loss": 0.84672469, + "learning_rate": 3.8521436194940894e-06, + "loss": 0.868927, + "num_input_tokens_seen": 53993330, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.1940918, + "step": 2486, + "time_per_iteration": 2.5159614086151123 + }, + { + "auxiliary_loss_clip": 0.01161025, + "auxiliary_loss_mlp": 0.01044973, + "balance_loss_clip": 1.0585084, + "balance_loss_mlp": 1.02868855, + "epoch": 0.14952652938523975, + "flos": 13370872164480.0, + "grad_norm": 2.421255908977809, + "language_loss": 0.74994618, + "learning_rate": 3.851996622054842e-06, + "loss": 0.77200615, + "num_input_tokens_seen": 54010515, + "router_z_loss_clip": 1.02636719, + "router_z_loss_mlp": 0.16271973, + "step": 2487, + "time_per_iteration": 2.453610420227051 + }, + { + "auxiliary_loss_clip": 0.01166279, + "auxiliary_loss_mlp": 0.01045761, + "balance_loss_clip": 1.06073523, + "balance_loss_mlp": 1.026842, + "epoch": 0.1495866526379077, + "flos": 35517699959040.0, + "grad_norm": 2.89800093370934, + "language_loss": 0.72199756, + "learning_rate": 3.8518495543877e-06, + "loss": 0.74411798, + "num_input_tokens_seen": 54031315, + "router_z_loss_clip": 1.05712891, + "router_z_loss_mlp": 0.18945312, + "step": 2488, + "time_per_iteration": 2.627363443374634 + }, + { + "auxiliary_loss_clip": 0.01165024, + "auxiliary_loss_mlp": 0.01049239, + "balance_loss_clip": 1.05692101, + "balance_loss_mlp": 1.03032029, + "epoch": 0.14964677589057568, + "flos": 17632749795840.0, + "grad_norm": 2.7843063978666396, + "language_loss": 0.71603775, + "learning_rate": 3.851702416498235e-06, + "loss": 0.7381804, + "num_input_tokens_seen": 54045965, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.18920898, + "step": 2489, + "time_per_iteration": 2.4554760456085205 + }, + { + "auxiliary_loss_clip": 0.0116947, + "auxiliary_loss_mlp": 0.01054513, + "balance_loss_clip": 1.06186116, + "balance_loss_mlp": 1.03521323, + "epoch": 0.14970689914324364, + "flos": 20185280029440.0, + "grad_norm": 3.050562706355877, + "language_loss": 0.81859004, + "learning_rate": 3.8515552083920295e-06, + "loss": 0.84082991, + "num_input_tokens_seen": 54059960, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.19299316, + "step": 2490, + "time_per_iteration": 2.466543674468994 + }, + { + "auxiliary_loss_clip": 0.01176696, + "auxiliary_loss_mlp": 0.01050084, + "balance_loss_clip": 1.06663585, + "balance_loss_mlp": 1.03184557, + "epoch": 0.1497670223959116, + "flos": 37228699382400.0, + "grad_norm": 3.982218049899494, + "language_loss": 0.802755, + "learning_rate": 3.851407930074666e-06, + "loss": 0.82502282, + "num_input_tokens_seen": 54079330, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.18261719, + "step": 2491, + "time_per_iteration": 2.557464361190796 + }, + { + "auxiliary_loss_clip": 0.01172743, + "auxiliary_loss_mlp": 0.01048447, + "balance_loss_clip": 1.06308365, + "balance_loss_mlp": 1.02726376, + "epoch": 0.1498271456485796, + "flos": 24455848752000.0, + "grad_norm": 1.8127239617453905, + "language_loss": 0.90781277, + "learning_rate": 3.851260581551727e-06, + "loss": 0.93002474, + "num_input_tokens_seen": 54097555, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.21191406, + "step": 2492, + "time_per_iteration": 2.4653003215789795 + }, + { + "auxiliary_loss_clip": 0.01164165, + "auxiliary_loss_mlp": 0.01053653, + "balance_loss_clip": 1.05986238, + "balance_loss_mlp": 1.03467464, + "epoch": 0.14988726890124757, + "flos": 16253601148800.0, + "grad_norm": 2.0354453785984363, + "language_loss": 0.78573531, + "learning_rate": 3.851113162828802e-06, + "loss": 0.80791354, + "num_input_tokens_seen": 54115600, + "router_z_loss_clip": 1.04248047, + "router_z_loss_mlp": 0.18969727, + "step": 2493, + "time_per_iteration": 2.401815414428711 + }, + { + "auxiliary_loss_clip": 0.01167399, + "auxiliary_loss_mlp": 0.01045515, + "balance_loss_clip": 1.0595212, + "balance_loss_mlp": 1.02520156, + "epoch": 0.14994739215391553, + "flos": 20666555383680.0, + "grad_norm": 1.779662805744728, + "language_loss": 0.80326128, + "learning_rate": 3.85096567391148e-06, + "loss": 0.82539046, + "num_input_tokens_seen": 54135220, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.20324707, + "step": 2494, + "time_per_iteration": 2.4902262687683105 + }, + { + "auxiliary_loss_clip": 0.01163914, + "auxiliary_loss_mlp": 0.01048014, + "balance_loss_clip": 1.05916667, + "balance_loss_mlp": 1.02782059, + "epoch": 0.1500075154065835, + "flos": 70652375239680.0, + "grad_norm": 1.8505681863785597, + "language_loss": 0.66276944, + "learning_rate": 3.850818114805354e-06, + "loss": 0.68488872, + "num_input_tokens_seen": 54161065, + "router_z_loss_clip": 1.04785156, + "router_z_loss_mlp": 0.2019043, + "step": 2495, + "time_per_iteration": 2.8128013610839844 + }, + { + "auxiliary_loss_clip": 0.01085886, + "auxiliary_loss_mlp": 0.01032834, + "balance_loss_clip": 1.0489037, + "balance_loss_mlp": 1.03041959, + "epoch": 0.15006763865925146, + "flos": 68011937447040.0, + "grad_norm": 0.9098154716133738, + "language_loss": 0.59499991, + "learning_rate": 3.850670485516019e-06, + "loss": 0.61618716, + "num_input_tokens_seen": 54225095, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.0241394, + "step": 2496, + "time_per_iteration": 3.056065320968628 + }, + { + "auxiliary_loss_clip": 0.01172994, + "auxiliary_loss_mlp": 0.01053404, + "balance_loss_clip": 1.06242132, + "balance_loss_mlp": 1.03245926, + "epoch": 0.15012776191191943, + "flos": 18916269459840.0, + "grad_norm": 2.2224336776738567, + "language_loss": 0.65220159, + "learning_rate": 3.850522786049075e-06, + "loss": 0.6744656, + "num_input_tokens_seen": 54243750, + "router_z_loss_clip": 1.10498047, + "router_z_loss_mlp": 0.20959473, + "step": 2497, + "time_per_iteration": 2.42714786529541 + }, + { + "auxiliary_loss_clip": 0.01172917, + "auxiliary_loss_mlp": 0.01049473, + "balance_loss_clip": 1.06572461, + "balance_loss_mlp": 1.02999449, + "epoch": 0.1501878851645874, + "flos": 23701330638720.0, + "grad_norm": 1.4951281287620781, + "language_loss": 0.75371301, + "learning_rate": 3.850375016410121e-06, + "loss": 0.77593696, + "num_input_tokens_seen": 54266185, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.19482422, + "step": 2498, + "time_per_iteration": 4.053604364395142 + }, + { + "auxiliary_loss_clip": 0.0117317, + "auxiliary_loss_mlp": 0.01046871, + "balance_loss_clip": 1.0621264, + "balance_loss_mlp": 1.02636731, + "epoch": 0.15024800841725539, + "flos": 20412523422720.0, + "grad_norm": 2.2264282441883814, + "language_loss": 0.72407544, + "learning_rate": 3.850227176604761e-06, + "loss": 0.74627584, + "num_input_tokens_seen": 54283940, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.20507812, + "step": 2499, + "time_per_iteration": 2.464986801147461 + }, + { + "auxiliary_loss_clip": 0.01171739, + "auxiliary_loss_mlp": 0.01051082, + "balance_loss_clip": 1.06343186, + "balance_loss_mlp": 1.03081656, + "epoch": 0.15030813166992335, + "flos": 31831002812160.0, + "grad_norm": 1.9497010982835337, + "language_loss": 0.72177422, + "learning_rate": 3.850079266638601e-06, + "loss": 0.74400246, + "num_input_tokens_seen": 54304830, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.20263672, + "step": 2500, + "time_per_iteration": 3.98498797416687 + }, + { + "auxiliary_loss_clip": 0.01170767, + "auxiliary_loss_mlp": 0.01057612, + "balance_loss_clip": 1.06558418, + "balance_loss_mlp": 1.03738189, + "epoch": 0.15036825492259132, + "flos": 35657822914560.0, + "grad_norm": 2.0215390333545864, + "language_loss": 0.65151751, + "learning_rate": 3.849931286517249e-06, + "loss": 0.6738013, + "num_input_tokens_seen": 54325595, + "router_z_loss_clip": 1.05126953, + "router_z_loss_mlp": 0.20227051, + "step": 2501, + "time_per_iteration": 2.5617477893829346 + }, + { + "auxiliary_loss_clip": 0.01163513, + "auxiliary_loss_mlp": 0.01054292, + "balance_loss_clip": 1.05620551, + "balance_loss_mlp": 1.03244138, + "epoch": 0.15042837817525928, + "flos": 18838163335680.0, + "grad_norm": 2.613269836582647, + "language_loss": 0.83707488, + "learning_rate": 3.849783236246318e-06, + "loss": 0.85925293, + "num_input_tokens_seen": 54342180, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.21862793, + "step": 2502, + "time_per_iteration": 2.5003602504730225 + }, + { + "auxiliary_loss_clip": 0.01164891, + "auxiliary_loss_mlp": 0.01052076, + "balance_loss_clip": 1.05829167, + "balance_loss_mlp": 1.03293133, + "epoch": 0.15048850142792725, + "flos": 19535548867200.0, + "grad_norm": 2.0563095539278438, + "language_loss": 0.77222192, + "learning_rate": 3.849635115831421e-06, + "loss": 0.79439157, + "num_input_tokens_seen": 54360255, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.19140625, + "step": 2503, + "time_per_iteration": 2.440530300140381 + }, + { + "auxiliary_loss_clip": 0.01167347, + "auxiliary_loss_mlp": 0.0104699, + "balance_loss_clip": 1.06318164, + "balance_loss_mlp": 1.02870286, + "epoch": 0.1505486246805952, + "flos": 22017550746240.0, + "grad_norm": 2.1631011332428716, + "language_loss": 0.85488921, + "learning_rate": 3.849486925278176e-06, + "loss": 0.87703252, + "num_input_tokens_seen": 54378260, + "router_z_loss_clip": 1.04101562, + "router_z_loss_mlp": 0.18273926, + "step": 2504, + "time_per_iteration": 3.9600024223327637 + }, + { + "auxiliary_loss_clip": 0.01159189, + "auxiliary_loss_mlp": 0.0105074, + "balance_loss_clip": 1.05588889, + "balance_loss_mlp": 1.0304867, + "epoch": 0.15060874793326318, + "flos": 20743153136640.0, + "grad_norm": 1.6982365771698247, + "language_loss": 0.83197558, + "learning_rate": 3.8493386645922e-06, + "loss": 0.85407495, + "num_input_tokens_seen": 54399745, + "router_z_loss_clip": 1.03320312, + "router_z_loss_mlp": 0.20251465, + "step": 2505, + "time_per_iteration": 3.9692816734313965 + }, + { + "auxiliary_loss_clip": 0.01170369, + "auxiliary_loss_mlp": 0.01051941, + "balance_loss_clip": 1.0634979, + "balance_loss_mlp": 1.03265309, + "epoch": 0.15066887118593117, + "flos": 16471902055680.0, + "grad_norm": 2.3144596937165667, + "language_loss": 0.75797153, + "learning_rate": 3.849190333779117e-06, + "loss": 0.78019458, + "num_input_tokens_seen": 54417105, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.19274902, + "step": 2506, + "time_per_iteration": 2.4233851432800293 + }, + { + "auxiliary_loss_clip": 0.0116921, + "auxiliary_loss_mlp": 0.01050203, + "balance_loss_clip": 1.06123817, + "balance_loss_mlp": 1.03067708, + "epoch": 0.15072899443859913, + "flos": 19859319083520.0, + "grad_norm": 3.3457306552452875, + "language_loss": 0.76238763, + "learning_rate": 3.849041932844552e-06, + "loss": 0.78458178, + "num_input_tokens_seen": 54433920, + "router_z_loss_clip": 1.07958984, + "router_z_loss_mlp": 0.1953125, + "step": 2507, + "time_per_iteration": 2.4687888622283936 + }, + { + "auxiliary_loss_clip": 0.01155743, + "auxiliary_loss_mlp": 0.01047123, + "balance_loss_clip": 1.05255294, + "balance_loss_mlp": 1.02839518, + "epoch": 0.1507891176912671, + "flos": 20776226584320.0, + "grad_norm": 1.7885415207945474, + "language_loss": 0.68992567, + "learning_rate": 3.848893461794131e-06, + "loss": 0.71195436, + "num_input_tokens_seen": 54451540, + "router_z_loss_clip": 1.03320312, + "router_z_loss_mlp": 0.18713379, + "step": 2508, + "time_per_iteration": 2.495272159576416 + }, + { + "auxiliary_loss_clip": 0.01169866, + "auxiliary_loss_mlp": 0.01050562, + "balance_loss_clip": 1.06370568, + "balance_loss_mlp": 1.03190601, + "epoch": 0.15084924094393506, + "flos": 23586631534080.0, + "grad_norm": 1.9285407893828843, + "language_loss": 0.77688515, + "learning_rate": 3.8487449206334845e-06, + "loss": 0.79908943, + "num_input_tokens_seen": 54470800, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.18652344, + "step": 2509, + "time_per_iteration": 2.5043890476226807 + }, + { + "auxiliary_loss_clip": 0.01178211, + "auxiliary_loss_mlp": 0.01050204, + "balance_loss_clip": 1.06553745, + "balance_loss_mlp": 1.02905607, + "epoch": 0.15090936419660303, + "flos": 18911313383040.0, + "grad_norm": 2.631948514071151, + "language_loss": 0.79886711, + "learning_rate": 3.848596309368246e-06, + "loss": 0.82115126, + "num_input_tokens_seen": 54486525, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.21154785, + "step": 2510, + "time_per_iteration": 2.3929524421691895 + }, + { + "auxiliary_loss_clip": 0.0116556, + "auxiliary_loss_mlp": 0.01055912, + "balance_loss_clip": 1.05737734, + "balance_loss_mlp": 1.03544331, + "epoch": 0.150969487449271, + "flos": 17928223073280.0, + "grad_norm": 1.9678991677075153, + "language_loss": 0.73722047, + "learning_rate": 3.8484476280040495e-06, + "loss": 0.75943518, + "num_input_tokens_seen": 54503795, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.20483398, + "step": 2511, + "time_per_iteration": 2.5256848335266113 + }, + { + "auxiliary_loss_clip": 0.01162582, + "auxiliary_loss_mlp": 0.01048722, + "balance_loss_clip": 1.05771303, + "balance_loss_mlp": 1.02819467, + "epoch": 0.151029610701939, + "flos": 24243078539520.0, + "grad_norm": 2.189313613563181, + "language_loss": 0.69415736, + "learning_rate": 3.848298876546534e-06, + "loss": 0.71627039, + "num_input_tokens_seen": 54523025, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.2052002, + "step": 2512, + "time_per_iteration": 2.443308115005493 + }, + { + "auxiliary_loss_clip": 0.01178655, + "auxiliary_loss_mlp": 0.01050284, + "balance_loss_clip": 1.06982565, + "balance_loss_mlp": 1.03036356, + "epoch": 0.15108973395460695, + "flos": 30262496641920.0, + "grad_norm": 2.4312039302609545, + "language_loss": 0.74171233, + "learning_rate": 3.84815005500134e-06, + "loss": 0.76400173, + "num_input_tokens_seen": 54545025, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.19921875, + "step": 2513, + "time_per_iteration": 2.525729179382324 + }, + { + "auxiliary_loss_clip": 0.01072811, + "auxiliary_loss_mlp": 0.01010535, + "balance_loss_clip": 1.03457952, + "balance_loss_mlp": 1.00672638, + "epoch": 0.15114985720727492, + "flos": 60437624428800.0, + "grad_norm": 0.8756511669030824, + "language_loss": 0.64750075, + "learning_rate": 3.84800116337411e-06, + "loss": 0.66833425, + "num_input_tokens_seen": 54604545, + "router_z_loss_clip": 0.38232422, + "router_z_loss_mlp": 0.03808594, + "step": 2514, + "time_per_iteration": 3.0067138671875 + }, + { + "auxiliary_loss_clip": 0.0117029, + "auxiliary_loss_mlp": 0.01045322, + "balance_loss_clip": 1.06444144, + "balance_loss_mlp": 1.02664161, + "epoch": 0.15120998045994288, + "flos": 20521691832960.0, + "grad_norm": 2.707765840639522, + "language_loss": 0.73146605, + "learning_rate": 3.8478522016704916e-06, + "loss": 0.75362217, + "num_input_tokens_seen": 54620590, + "router_z_loss_clip": 1.05810547, + "router_z_loss_mlp": 0.18688965, + "step": 2515, + "time_per_iteration": 2.4746084213256836 + }, + { + "auxiliary_loss_clip": 0.01168353, + "auxiliary_loss_mlp": 0.01046237, + "balance_loss_clip": 1.06141388, + "balance_loss_mlp": 1.02550638, + "epoch": 0.15127010371261085, + "flos": 21178893024000.0, + "grad_norm": 1.8248744793281941, + "language_loss": 0.777897, + "learning_rate": 3.8477031698961325e-06, + "loss": 0.80004293, + "num_input_tokens_seen": 54640410, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.20727539, + "step": 2516, + "time_per_iteration": 2.428680181503296 + }, + { + "auxiliary_loss_clip": 0.01072839, + "auxiliary_loss_mlp": 0.01011984, + "balance_loss_clip": 1.03456616, + "balance_loss_mlp": 1.00860453, + "epoch": 0.1513302269652788, + "flos": 65320648974720.0, + "grad_norm": 0.7251622941462355, + "language_loss": 0.54667735, + "learning_rate": 3.8475540680566835e-06, + "loss": 0.56752563, + "num_input_tokens_seen": 54701430, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.03387451, + "step": 2517, + "time_per_iteration": 3.091168165206909 + }, + { + "auxiliary_loss_clip": 0.01167844, + "auxiliary_loss_mlp": 0.01048315, + "balance_loss_clip": 1.05676997, + "balance_loss_mlp": 1.02770376, + "epoch": 0.15139035021794678, + "flos": 19135827342720.0, + "grad_norm": 2.03244964744351, + "language_loss": 0.78806293, + "learning_rate": 3.8474048961577995e-06, + "loss": 0.81022453, + "num_input_tokens_seen": 54720845, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.20617676, + "step": 2518, + "time_per_iteration": 2.417834997177124 + }, + { + "auxiliary_loss_clip": 0.0118069, + "auxiliary_loss_mlp": 0.01047699, + "balance_loss_clip": 1.06856036, + "balance_loss_mlp": 1.02690911, + "epoch": 0.15145047347061477, + "flos": 26578564842240.0, + "grad_norm": 2.075883597006508, + "language_loss": 0.69933456, + "learning_rate": 3.847255654205137e-06, + "loss": 0.72161841, + "num_input_tokens_seen": 54740495, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.20800781, + "step": 2519, + "time_per_iteration": 2.5125107765197754 + }, + { + "auxiliary_loss_clip": 0.0116429, + "auxiliary_loss_mlp": 0.01051188, + "balance_loss_clip": 1.05651307, + "balance_loss_mlp": 1.03160191, + "epoch": 0.15151059672328274, + "flos": 20302959962880.0, + "grad_norm": 1.8684401555005592, + "language_loss": 0.78774899, + "learning_rate": 3.847106342204354e-06, + "loss": 0.80990374, + "num_input_tokens_seen": 54758415, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.19592285, + "step": 2520, + "time_per_iteration": 2.4226815700531006 + }, + { + "auxiliary_loss_clip": 0.01170055, + "auxiliary_loss_mlp": 0.01057802, + "balance_loss_clip": 1.0596267, + "balance_loss_mlp": 1.03661823, + "epoch": 0.1515707199759507, + "flos": 27228367831680.0, + "grad_norm": 2.1405544281719515, + "language_loss": 0.75087368, + "learning_rate": 3.846956960161114e-06, + "loss": 0.77315223, + "num_input_tokens_seen": 54779355, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.21166992, + "step": 2521, + "time_per_iteration": 2.5189380645751953 + }, + { + "auxiliary_loss_clip": 0.01175785, + "auxiliary_loss_mlp": 0.01057941, + "balance_loss_clip": 1.06170857, + "balance_loss_mlp": 1.03655505, + "epoch": 0.15163084322861867, + "flos": 23587349806080.0, + "grad_norm": 2.239953921437418, + "language_loss": 0.82331586, + "learning_rate": 3.84680750808108e-06, + "loss": 0.84565318, + "num_input_tokens_seen": 54799465, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.21386719, + "step": 2522, + "time_per_iteration": 2.4560580253601074 + }, + { + "auxiliary_loss_clip": 0.01086917, + "auxiliary_loss_mlp": 0.01011523, + "balance_loss_clip": 1.04546452, + "balance_loss_mlp": 1.00905883, + "epoch": 0.15169096648128663, + "flos": 66889622021760.0, + "grad_norm": 0.8264783715249966, + "language_loss": 0.57913285, + "learning_rate": 3.846657985969922e-06, + "loss": 0.60011721, + "num_input_tokens_seen": 54857665, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 0.02462769, + "step": 2523, + "time_per_iteration": 3.156467914581299 + }, + { + "auxiliary_loss_clip": 0.01161261, + "auxiliary_loss_mlp": 0.0106603, + "balance_loss_clip": 1.053321, + "balance_loss_mlp": 1.04290295, + "epoch": 0.1517510897339546, + "flos": 29095435848960.0, + "grad_norm": 1.7327727159088082, + "language_loss": 0.74903107, + "learning_rate": 3.8465083938333066e-06, + "loss": 0.77130401, + "num_input_tokens_seen": 54879895, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.23120117, + "step": 2524, + "time_per_iteration": 2.560215950012207 + }, + { + "auxiliary_loss_clip": 0.01169932, + "auxiliary_loss_mlp": 0.01050465, + "balance_loss_clip": 1.05996025, + "balance_loss_mlp": 1.03015161, + "epoch": 0.1518112129866226, + "flos": 18406553512320.0, + "grad_norm": 1.8219249950618384, + "language_loss": 0.75096464, + "learning_rate": 3.8463587316769085e-06, + "loss": 0.77316856, + "num_input_tokens_seen": 54898245, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.20324707, + "step": 2525, + "time_per_iteration": 2.466134548187256 + }, + { + "auxiliary_loss_clip": 0.01168541, + "auxiliary_loss_mlp": 0.01068114, + "balance_loss_clip": 1.05773759, + "balance_loss_mlp": 1.04569113, + "epoch": 0.15187133623929056, + "flos": 19425410789760.0, + "grad_norm": 2.100918725640241, + "language_loss": 0.79744756, + "learning_rate": 3.846208999506402e-06, + "loss": 0.81981409, + "num_input_tokens_seen": 54917060, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.22424316, + "step": 2526, + "time_per_iteration": 2.4209282398223877 + }, + { + "auxiliary_loss_clip": 0.01160852, + "auxiliary_loss_mlp": 0.0104658, + "balance_loss_clip": 1.05700922, + "balance_loss_mlp": 1.02770901, + "epoch": 0.15193145949195852, + "flos": 17566207850880.0, + "grad_norm": 2.0394568777617925, + "language_loss": 0.85105032, + "learning_rate": 3.846059197327466e-06, + "loss": 0.87312466, + "num_input_tokens_seen": 54936365, + "router_z_loss_clip": 1.03710938, + "router_z_loss_mlp": 0.18859863, + "step": 2527, + "time_per_iteration": 2.4591214656829834 + }, + { + "auxiliary_loss_clip": 0.01162142, + "auxiliary_loss_mlp": 0.01046499, + "balance_loss_clip": 1.05449152, + "balance_loss_mlp": 1.02632892, + "epoch": 0.15199158274462649, + "flos": 36176265866880.0, + "grad_norm": 2.2129066603271563, + "language_loss": 0.69351697, + "learning_rate": 3.845909325145779e-06, + "loss": 0.71560347, + "num_input_tokens_seen": 54961365, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.20166016, + "step": 2528, + "time_per_iteration": 2.5941786766052246 + }, + { + "auxiliary_loss_clip": 0.01178247, + "auxiliary_loss_mlp": 0.01051166, + "balance_loss_clip": 1.06960833, + "balance_loss_mlp": 1.03123379, + "epoch": 0.15205170599729445, + "flos": 23074042498560.0, + "grad_norm": 1.6793695939288251, + "language_loss": 0.87123942, + "learning_rate": 3.845759382967026e-06, + "loss": 0.89353359, + "num_input_tokens_seen": 54980750, + "router_z_loss_clip": 1.08740234, + "router_z_loss_mlp": 0.19934082, + "step": 2529, + "time_per_iteration": 2.488814115524292 + }, + { + "auxiliary_loss_clip": 0.01162214, + "auxiliary_loss_mlp": 0.01047589, + "balance_loss_clip": 1.05692315, + "balance_loss_mlp": 1.02648902, + "epoch": 0.15211182924996242, + "flos": 21908382336000.0, + "grad_norm": 2.243712299231861, + "language_loss": 0.83231318, + "learning_rate": 3.845609370796893e-06, + "loss": 0.85441118, + "num_input_tokens_seen": 54999675, + "router_z_loss_clip": 1.05322266, + "router_z_loss_mlp": 0.21105957, + "step": 2530, + "time_per_iteration": 2.4592084884643555 + }, + { + "auxiliary_loss_clip": 0.01167117, + "auxiliary_loss_mlp": 0.01049602, + "balance_loss_clip": 1.05997777, + "balance_loss_mlp": 1.02840674, + "epoch": 0.15217195250263038, + "flos": 13881521865600.0, + "grad_norm": 1.8441167865664223, + "language_loss": 0.80020761, + "learning_rate": 3.845459288641066e-06, + "loss": 0.82237476, + "num_input_tokens_seen": 55018295, + "router_z_loss_clip": 1.07080078, + "router_z_loss_mlp": 0.21191406, + "step": 2531, + "time_per_iteration": 2.477107286453247 + }, + { + "auxiliary_loss_clip": 0.01166398, + "auxiliary_loss_mlp": 0.01061705, + "balance_loss_clip": 1.05853128, + "balance_loss_mlp": 1.04020023, + "epoch": 0.15223207575529837, + "flos": 24535319592960.0, + "grad_norm": 1.9934151200598342, + "language_loss": 0.78793436, + "learning_rate": 3.8453091365052394e-06, + "loss": 0.81021541, + "num_input_tokens_seen": 55037975, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.21508789, + "step": 2532, + "time_per_iteration": 2.4462287425994873 + }, + { + "auxiliary_loss_clip": 0.01168, + "auxiliary_loss_mlp": 0.01045483, + "balance_loss_clip": 1.06183052, + "balance_loss_mlp": 1.02497864, + "epoch": 0.15229219900796634, + "flos": 25556798563200.0, + "grad_norm": 2.701650720313273, + "language_loss": 0.87656331, + "learning_rate": 3.845158914395105e-06, + "loss": 0.89869809, + "num_input_tokens_seen": 55057135, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.2052002, + "step": 2533, + "time_per_iteration": 2.4752488136291504 + }, + { + "auxiliary_loss_clip": 0.01162946, + "auxiliary_loss_mlp": 0.01053805, + "balance_loss_clip": 1.05510545, + "balance_loss_mlp": 1.03338408, + "epoch": 0.1523523222606343, + "flos": 18217806520320.0, + "grad_norm": 2.737236015070821, + "language_loss": 0.7884866, + "learning_rate": 3.84500862231636e-06, + "loss": 0.81065416, + "num_input_tokens_seen": 55075525, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.2043457, + "step": 2534, + "time_per_iteration": 2.4007344245910645 + }, + { + "auxiliary_loss_clip": 0.01166022, + "auxiliary_loss_mlp": 0.01052262, + "balance_loss_clip": 1.05467427, + "balance_loss_mlp": 1.03066111, + "epoch": 0.15241244551330227, + "flos": 13260087642240.0, + "grad_norm": 2.5613401637829525, + "language_loss": 0.76943445, + "learning_rate": 3.844858260274702e-06, + "loss": 0.79161727, + "num_input_tokens_seen": 55090845, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.21606445, + "step": 2535, + "time_per_iteration": 2.442440986633301 + }, + { + "auxiliary_loss_clip": 0.01167805, + "auxiliary_loss_mlp": 0.01047946, + "balance_loss_clip": 1.05572033, + "balance_loss_mlp": 1.02795506, + "epoch": 0.15247256876597023, + "flos": 19715568854400.0, + "grad_norm": 2.3386002280554963, + "language_loss": 0.77993357, + "learning_rate": 3.844707828275835e-06, + "loss": 0.80209106, + "num_input_tokens_seen": 55108750, + "router_z_loss_clip": 1.12060547, + "router_z_loss_mlp": 0.20007324, + "step": 2536, + "time_per_iteration": 2.4537582397460938 + }, + { + "auxiliary_loss_clip": 0.01168523, + "auxiliary_loss_mlp": 0.01063027, + "balance_loss_clip": 1.06253076, + "balance_loss_mlp": 1.0419271, + "epoch": 0.1525326920186382, + "flos": 20375858615040.0, + "grad_norm": 2.5833788506850093, + "language_loss": 0.75524592, + "learning_rate": 3.844557326325461e-06, + "loss": 0.77756143, + "num_input_tokens_seen": 55126750, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.2109375, + "step": 2537, + "time_per_iteration": 2.4549763202667236 + }, + { + "auxiliary_loss_clip": 0.01173894, + "auxiliary_loss_mlp": 0.01052333, + "balance_loss_clip": 1.06369901, + "balance_loss_mlp": 1.03249669, + "epoch": 0.15259281527130616, + "flos": 13589963170560.0, + "grad_norm": 1.991224073359012, + "language_loss": 0.77684104, + "learning_rate": 3.8444067544292896e-06, + "loss": 0.79910332, + "num_input_tokens_seen": 55144690, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.19824219, + "step": 2538, + "time_per_iteration": 2.400087594985962 + }, + { + "auxiliary_loss_clip": 0.01169454, + "auxiliary_loss_mlp": 0.01042058, + "balance_loss_clip": 1.06335521, + "balance_loss_mlp": 1.02340198, + "epoch": 0.15265293852397416, + "flos": 22860374446080.0, + "grad_norm": 1.5829138439487567, + "language_loss": 0.89889592, + "learning_rate": 3.844256112593029e-06, + "loss": 0.92101109, + "num_input_tokens_seen": 55166055, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.18652344, + "step": 2539, + "time_per_iteration": 2.483166456222534 + }, + { + "auxiliary_loss_clip": 0.01170978, + "auxiliary_loss_mlp": 0.01051992, + "balance_loss_clip": 1.0628587, + "balance_loss_mlp": 1.03208423, + "epoch": 0.15271306177664212, + "flos": 29238108670080.0, + "grad_norm": 1.9728908917548524, + "language_loss": 0.93583763, + "learning_rate": 3.844105400822391e-06, + "loss": 0.9580673, + "num_input_tokens_seen": 55186285, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.19921875, + "step": 2540, + "time_per_iteration": 2.4821527004241943 + }, + { + "auxiliary_loss_clip": 0.01162458, + "auxiliary_loss_mlp": 0.01053456, + "balance_loss_clip": 1.05722427, + "balance_loss_mlp": 1.03367937, + "epoch": 0.1527731850293101, + "flos": 31246269310080.0, + "grad_norm": 1.858970816258888, + "language_loss": 0.75376058, + "learning_rate": 3.843954619123092e-06, + "loss": 0.77591968, + "num_input_tokens_seen": 55207915, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.19775391, + "step": 2541, + "time_per_iteration": 3.922766923904419 + }, + { + "auxiliary_loss_clip": 0.01160215, + "auxiliary_loss_mlp": 0.01055013, + "balance_loss_clip": 1.05398965, + "balance_loss_mlp": 1.03435373, + "epoch": 0.15283330828197805, + "flos": 22382079920640.0, + "grad_norm": 1.5684844299892757, + "language_loss": 0.81133842, + "learning_rate": 3.84380376750085e-06, + "loss": 0.83349073, + "num_input_tokens_seen": 55227860, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.20654297, + "step": 2542, + "time_per_iteration": 2.5268328189849854 + }, + { + "auxiliary_loss_clip": 0.01167836, + "auxiliary_loss_mlp": 0.01058557, + "balance_loss_clip": 1.05923653, + "balance_loss_mlp": 1.03774309, + "epoch": 0.15289343153464602, + "flos": 25520133755520.0, + "grad_norm": 2.1871376695016695, + "language_loss": 0.77788723, + "learning_rate": 3.843652845961383e-06, + "loss": 0.80015117, + "num_input_tokens_seen": 55247330, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.20812988, + "step": 2543, + "time_per_iteration": 2.5104429721832275 + }, + { + "auxiliary_loss_clip": 0.01167825, + "auxiliary_loss_mlp": 0.01053181, + "balance_loss_clip": 1.06100488, + "balance_loss_mlp": 1.03336859, + "epoch": 0.15295355478731398, + "flos": 22710016114560.0, + "grad_norm": 2.1750668858956215, + "language_loss": 0.86454773, + "learning_rate": 3.843501854510416e-06, + "loss": 0.88675779, + "num_input_tokens_seen": 55266195, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.19799805, + "step": 2544, + "time_per_iteration": 3.8784542083740234 + }, + { + "auxiliary_loss_clip": 0.0117102, + "auxiliary_loss_mlp": 0.01066372, + "balance_loss_clip": 1.05556941, + "balance_loss_mlp": 1.04319799, + "epoch": 0.15301367803998198, + "flos": 23251907669760.0, + "grad_norm": 2.655590915539098, + "language_loss": 0.82520306, + "learning_rate": 3.843350793153673e-06, + "loss": 0.84757698, + "num_input_tokens_seen": 55283305, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.23193359, + "step": 2545, + "time_per_iteration": 2.4390158653259277 + }, + { + "auxiliary_loss_clip": 0.01160806, + "auxiliary_loss_mlp": 0.01045148, + "balance_loss_clip": 1.0538969, + "balance_loss_mlp": 1.02485895, + "epoch": 0.15307380129264994, + "flos": 25886279041920.0, + "grad_norm": 2.229747769419813, + "language_loss": 0.71208316, + "learning_rate": 3.843199661896884e-06, + "loss": 0.73414272, + "num_input_tokens_seen": 55303035, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.20288086, + "step": 2546, + "time_per_iteration": 3.9174818992614746 + }, + { + "auxiliary_loss_clip": 0.01173882, + "auxiliary_loss_mlp": 0.01050975, + "balance_loss_clip": 1.06147766, + "balance_loss_mlp": 1.02799153, + "epoch": 0.1531339245453179, + "flos": 46973239205760.0, + "grad_norm": 1.685482289934443, + "language_loss": 0.77932179, + "learning_rate": 3.843048460745779e-06, + "loss": 0.80157048, + "num_input_tokens_seen": 55327570, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.22998047, + "step": 2547, + "time_per_iteration": 2.760423421859741 + }, + { + "auxiliary_loss_clip": 0.01169515, + "auxiliary_loss_mlp": 0.01056318, + "balance_loss_clip": 1.06072259, + "balance_loss_mlp": 1.03542018, + "epoch": 0.15319404779798587, + "flos": 35882049565440.0, + "grad_norm": 2.3256761847855687, + "language_loss": 0.74416447, + "learning_rate": 3.842897189706092e-06, + "loss": 0.76642281, + "num_input_tokens_seen": 55351090, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.20898438, + "step": 2548, + "time_per_iteration": 3.9623866081237793 + }, + { + "auxiliary_loss_clip": 0.01167758, + "auxiliary_loss_mlp": 0.01055975, + "balance_loss_clip": 1.05797648, + "balance_loss_mlp": 1.03456497, + "epoch": 0.15325417105065384, + "flos": 25664638170240.0, + "grad_norm": 1.545672860954831, + "language_loss": 0.80535197, + "learning_rate": 3.842745848783558e-06, + "loss": 0.82758933, + "num_input_tokens_seen": 55371050, + "router_z_loss_clip": 1.09912109, + "router_z_loss_mlp": 0.21411133, + "step": 2549, + "time_per_iteration": 2.498819589614868 + }, + { + "auxiliary_loss_clip": 0.01161969, + "auxiliary_loss_mlp": 0.01059768, + "balance_loss_clip": 1.05468559, + "balance_loss_mlp": 1.03809524, + "epoch": 0.1533142943033218, + "flos": 18770831291520.0, + "grad_norm": 2.960383132623134, + "language_loss": 0.74920589, + "learning_rate": 3.842594437983917e-06, + "loss": 0.77142328, + "num_input_tokens_seen": 55390375, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.21679688, + "step": 2550, + "time_per_iteration": 2.446857452392578 + }, + { + "auxiliary_loss_clip": 0.01166431, + "auxiliary_loss_mlp": 0.01058405, + "balance_loss_clip": 1.05592895, + "balance_loss_mlp": 1.03518295, + "epoch": 0.15337441755598977, + "flos": 23107367341440.0, + "grad_norm": 2.786890893232847, + "language_loss": 0.77377784, + "learning_rate": 3.8424429573129115e-06, + "loss": 0.79602623, + "num_input_tokens_seen": 55408890, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.2322998, + "step": 2551, + "time_per_iteration": 2.4791462421417236 + }, + { + "auxiliary_loss_clip": 0.01085518, + "auxiliary_loss_mlp": 0.01004988, + "balance_loss_clip": 1.04847145, + "balance_loss_mlp": 1.00260997, + "epoch": 0.15343454080865776, + "flos": 59861079227520.0, + "grad_norm": 0.9361310639584941, + "language_loss": 0.56693792, + "learning_rate": 3.842291406776283e-06, + "loss": 0.58784294, + "num_input_tokens_seen": 55463815, + "router_z_loss_clip": 0.37060547, + "router_z_loss_mlp": 0.02377319, + "step": 2552, + "time_per_iteration": 3.0359408855438232 + }, + { + "auxiliary_loss_clip": 0.01166687, + "auxiliary_loss_mlp": 0.01052743, + "balance_loss_clip": 1.05650783, + "balance_loss_mlp": 1.03070092, + "epoch": 0.15349466406132573, + "flos": 11910887959680.0, + "grad_norm": 3.600893799689097, + "language_loss": 0.88924229, + "learning_rate": 3.84213978637978e-06, + "loss": 0.91143656, + "num_input_tokens_seen": 55481050, + "router_z_loss_clip": 1.10302734, + "router_z_loss_mlp": 0.22058105, + "step": 2553, + "time_per_iteration": 2.478260040283203 + }, + { + "auxiliary_loss_clip": 0.01172278, + "auxiliary_loss_mlp": 0.01052276, + "balance_loss_clip": 1.05801475, + "balance_loss_mlp": 1.03007889, + "epoch": 0.1535547873139937, + "flos": 24096922099200.0, + "grad_norm": 1.7278356828131207, + "language_loss": 0.78455544, + "learning_rate": 3.841988096129152e-06, + "loss": 0.80680096, + "num_input_tokens_seen": 55500050, + "router_z_loss_clip": 1.14355469, + "router_z_loss_mlp": 0.2220459, + "step": 2554, + "time_per_iteration": 2.5009138584136963 + }, + { + "auxiliary_loss_clip": 0.01171282, + "auxiliary_loss_mlp": 0.01059568, + "balance_loss_clip": 1.06137586, + "balance_loss_mlp": 1.03610802, + "epoch": 0.15361491056666166, + "flos": 17566459246080.0, + "grad_norm": 2.176658930457561, + "language_loss": 0.77950501, + "learning_rate": 3.841836336030151e-06, + "loss": 0.80181348, + "num_input_tokens_seen": 55518125, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.23474121, + "step": 2555, + "time_per_iteration": 2.435845375061035 + }, + { + "auxiliary_loss_clip": 0.01161239, + "auxiliary_loss_mlp": 0.01057713, + "balance_loss_clip": 1.05430496, + "balance_loss_mlp": 1.03725672, + "epoch": 0.15367503381932962, + "flos": 25046041121280.0, + "grad_norm": 1.5140567578692976, + "language_loss": 0.77347344, + "learning_rate": 3.8416845060885305e-06, + "loss": 0.795663, + "num_input_tokens_seen": 55540960, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.20471191, + "step": 2556, + "time_per_iteration": 2.518151044845581 + }, + { + "auxiliary_loss_clip": 0.01163224, + "auxiliary_loss_mlp": 0.01054396, + "balance_loss_clip": 1.05639815, + "balance_loss_mlp": 1.03207994, + "epoch": 0.15373515707199759, + "flos": 21507332008320.0, + "grad_norm": 1.8603201752194256, + "language_loss": 0.89754629, + "learning_rate": 3.84153260631005e-06, + "loss": 0.9197225, + "num_input_tokens_seen": 55559210, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.22302246, + "step": 2557, + "time_per_iteration": 2.413891553878784 + }, + { + "auxiliary_loss_clip": 0.01171364, + "auxiliary_loss_mlp": 0.0105764, + "balance_loss_clip": 1.05980349, + "balance_loss_mlp": 1.03580105, + "epoch": 0.15379528032466555, + "flos": 25994729180160.0, + "grad_norm": 2.030248325213409, + "language_loss": 0.7041899, + "learning_rate": 3.841380636700468e-06, + "loss": 0.72648001, + "num_input_tokens_seen": 55578925, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.21826172, + "step": 2558, + "time_per_iteration": 2.51639461517334 + }, + { + "auxiliary_loss_clip": 0.01168143, + "auxiliary_loss_mlp": 0.01050591, + "balance_loss_clip": 1.05718398, + "balance_loss_mlp": 1.03006339, + "epoch": 0.15385540357733354, + "flos": 19277315015040.0, + "grad_norm": 1.9998134929017177, + "language_loss": 0.92067939, + "learning_rate": 3.841228597265548e-06, + "loss": 0.94286668, + "num_input_tokens_seen": 55597255, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.20532227, + "step": 2559, + "time_per_iteration": 2.421013593673706 + }, + { + "auxiliary_loss_clip": 0.01175035, + "auxiliary_loss_mlp": 0.01060258, + "balance_loss_clip": 1.06496048, + "balance_loss_mlp": 1.03931332, + "epoch": 0.1539155268300015, + "flos": 28549126920960.0, + "grad_norm": 2.3726364408250245, + "language_loss": 0.63735044, + "learning_rate": 3.841076488011055e-06, + "loss": 0.65970337, + "num_input_tokens_seen": 55619515, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.20947266, + "step": 2560, + "time_per_iteration": 2.516329050064087 + }, + { + "auxiliary_loss_clip": 0.01167012, + "auxiliary_loss_mlp": 0.01048652, + "balance_loss_clip": 1.05703092, + "balance_loss_mlp": 1.02764773, + "epoch": 0.15397565008266947, + "flos": 23547883737600.0, + "grad_norm": 1.7591276831664229, + "language_loss": 0.88191807, + "learning_rate": 3.8409243089427574e-06, + "loss": 0.90407473, + "num_input_tokens_seen": 55640050, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.20996094, + "step": 2561, + "time_per_iteration": 2.4756267070770264 + }, + { + "auxiliary_loss_clip": 0.01173646, + "auxiliary_loss_mlp": 0.01049512, + "balance_loss_clip": 1.06786299, + "balance_loss_mlp": 1.03006935, + "epoch": 0.15403577333533744, + "flos": 17129821518720.0, + "grad_norm": 1.7055349005179206, + "language_loss": 0.83065027, + "learning_rate": 3.840772060066425e-06, + "loss": 0.85288179, + "num_input_tokens_seen": 55658695, + "router_z_loss_clip": 1.05810547, + "router_z_loss_mlp": 0.19433594, + "step": 2562, + "time_per_iteration": 2.4696784019470215 + }, + { + "auxiliary_loss_clip": 0.01177879, + "auxiliary_loss_mlp": 0.01060728, + "balance_loss_clip": 1.06175685, + "balance_loss_mlp": 1.03801894, + "epoch": 0.1540958965880054, + "flos": 17894503180800.0, + "grad_norm": 2.1058594275501017, + "language_loss": 0.74863547, + "learning_rate": 3.840619741387832e-06, + "loss": 0.7710216, + "num_input_tokens_seen": 55676340, + "router_z_loss_clip": 1.16015625, + "router_z_loss_mlp": 0.22729492, + "step": 2563, + "time_per_iteration": 2.418987512588501 + }, + { + "auxiliary_loss_clip": 0.01178697, + "auxiliary_loss_mlp": 0.01046176, + "balance_loss_clip": 1.063954, + "balance_loss_mlp": 1.02555263, + "epoch": 0.15415601984067337, + "flos": 32161057908480.0, + "grad_norm": 2.027738272629844, + "language_loss": 0.76128161, + "learning_rate": 3.8404673529127534e-06, + "loss": 0.78353029, + "num_input_tokens_seen": 55698890, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.20617676, + "step": 2564, + "time_per_iteration": 2.561720848083496 + }, + { + "auxiliary_loss_clip": 0.01180141, + "auxiliary_loss_mlp": 0.01052726, + "balance_loss_clip": 1.07052207, + "balance_loss_mlp": 1.03304458, + "epoch": 0.15421614309334136, + "flos": 24024418496640.0, + "grad_norm": 1.8652270873493526, + "language_loss": 0.70631075, + "learning_rate": 3.840314894646969e-06, + "loss": 0.72863948, + "num_input_tokens_seen": 55718535, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.19702148, + "step": 2565, + "time_per_iteration": 2.488469362258911 + }, + { + "auxiliary_loss_clip": 0.01167083, + "auxiliary_loss_mlp": 0.01052376, + "balance_loss_clip": 1.05978298, + "balance_loss_mlp": 1.03278995, + "epoch": 0.15427626634600933, + "flos": 24386290064640.0, + "grad_norm": 2.092529482262769, + "language_loss": 0.72015858, + "learning_rate": 3.840162366596259e-06, + "loss": 0.74235314, + "num_input_tokens_seen": 55738970, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.19580078, + "step": 2566, + "time_per_iteration": 2.5321261882781982 + }, + { + "auxiliary_loss_clip": 0.01159801, + "auxiliary_loss_mlp": 0.01046771, + "balance_loss_clip": 1.05507445, + "balance_loss_mlp": 1.0274713, + "epoch": 0.1543363895986773, + "flos": 23331522165120.0, + "grad_norm": 1.982798475895305, + "language_loss": 0.84736991, + "learning_rate": 3.840009768766408e-06, + "loss": 0.86943567, + "num_input_tokens_seen": 55759585, + "router_z_loss_clip": 1.04785156, + "router_z_loss_mlp": 0.19311523, + "step": 2567, + "time_per_iteration": 2.4812376499176025 + }, + { + "auxiliary_loss_clip": 0.01167515, + "auxiliary_loss_mlp": 0.01046878, + "balance_loss_clip": 1.0612452, + "balance_loss_mlp": 1.02855599, + "epoch": 0.15439651285134526, + "flos": 24274284480000.0, + "grad_norm": 1.9943526932980822, + "language_loss": 0.77865636, + "learning_rate": 3.839857101163202e-06, + "loss": 0.80080032, + "num_input_tokens_seen": 55779250, + "router_z_loss_clip": 1.06201172, + "router_z_loss_mlp": 0.18322754, + "step": 2568, + "time_per_iteration": 2.495051860809326 + }, + { + "auxiliary_loss_clip": 0.01163044, + "auxiliary_loss_mlp": 0.01044205, + "balance_loss_clip": 1.0575397, + "balance_loss_mlp": 1.02352262, + "epoch": 0.15445663610401322, + "flos": 22456163721600.0, + "grad_norm": 1.9789456777377445, + "language_loss": 0.70462036, + "learning_rate": 3.83970436379243e-06, + "loss": 0.72669286, + "num_input_tokens_seen": 55800470, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.20678711, + "step": 2569, + "time_per_iteration": 2.468111991882324 + }, + { + "auxiliary_loss_clip": 0.01158535, + "auxiliary_loss_mlp": 0.01050619, + "balance_loss_clip": 1.05563903, + "balance_loss_mlp": 1.03094935, + "epoch": 0.1545167593566812, + "flos": 22049510872320.0, + "grad_norm": 1.694214748546642, + "language_loss": 0.76460934, + "learning_rate": 3.839551556659884e-06, + "loss": 0.78670096, + "num_input_tokens_seen": 55817795, + "router_z_loss_clip": 1.03027344, + "router_z_loss_mlp": 0.19665527, + "step": 2570, + "time_per_iteration": 2.5823352336883545 + }, + { + "auxiliary_loss_clip": 0.01163634, + "auxiliary_loss_mlp": 0.01049921, + "balance_loss_clip": 1.05687165, + "balance_loss_mlp": 1.02935696, + "epoch": 0.15457688260934915, + "flos": 19318253541120.0, + "grad_norm": 2.7899001954404326, + "language_loss": 0.77518559, + "learning_rate": 3.839398679771359e-06, + "loss": 0.79732114, + "num_input_tokens_seen": 55836125, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.20556641, + "step": 2571, + "time_per_iteration": 2.469752788543701 + }, + { + "auxiliary_loss_clip": 0.01168097, + "auxiliary_loss_mlp": 0.01052476, + "balance_loss_clip": 1.05968738, + "balance_loss_mlp": 1.03211558, + "epoch": 0.15463700586201715, + "flos": 24133981956480.0, + "grad_norm": 2.5986881033455895, + "language_loss": 0.8280986, + "learning_rate": 3.839245733132652e-06, + "loss": 0.85030431, + "num_input_tokens_seen": 55855280, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.20361328, + "step": 2572, + "time_per_iteration": 2.519909381866455 + }, + { + "auxiliary_loss_clip": 0.0117378, + "auxiliary_loss_mlp": 0.01055512, + "balance_loss_clip": 1.06153131, + "balance_loss_mlp": 1.0364145, + "epoch": 0.1546971291146851, + "flos": 22420935457920.0, + "grad_norm": 1.9116030074621126, + "language_loss": 0.90838605, + "learning_rate": 3.839092716749563e-06, + "loss": 0.9306789, + "num_input_tokens_seen": 55875695, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.19091797, + "step": 2573, + "time_per_iteration": 2.555201292037964 + }, + { + "auxiliary_loss_clip": 0.0117195, + "auxiliary_loss_mlp": 0.01056387, + "balance_loss_clip": 1.06182718, + "balance_loss_mlp": 1.03684831, + "epoch": 0.15475725236735308, + "flos": 17530225401600.0, + "grad_norm": 1.8436051945844028, + "language_loss": 0.70032239, + "learning_rate": 3.838939630627893e-06, + "loss": 0.72260571, + "num_input_tokens_seen": 55894575, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.19543457, + "step": 2574, + "time_per_iteration": 2.451831817626953 + }, + { + "auxiliary_loss_clip": 0.01166242, + "auxiliary_loss_mlp": 0.01049519, + "balance_loss_clip": 1.05780578, + "balance_loss_mlp": 1.02887189, + "epoch": 0.15481737562002104, + "flos": 22561740771840.0, + "grad_norm": 1.840029829867842, + "language_loss": 0.82542503, + "learning_rate": 3.838786474773448e-06, + "loss": 0.8475827, + "num_input_tokens_seen": 55912855, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.20617676, + "step": 2575, + "time_per_iteration": 2.420553684234619 + }, + { + "auxiliary_loss_clip": 0.01174969, + "auxiliary_loss_mlp": 0.01052778, + "balance_loss_clip": 1.06315315, + "balance_loss_mlp": 1.03365672, + "epoch": 0.154877498872689, + "flos": 24900567039360.0, + "grad_norm": 1.7549997462813371, + "language_loss": 0.84262168, + "learning_rate": 3.838633249192036e-06, + "loss": 0.86489916, + "num_input_tokens_seen": 55932375, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.19128418, + "step": 2576, + "time_per_iteration": 2.5310165882110596 + }, + { + "auxiliary_loss_clip": 0.01167127, + "auxiliary_loss_mlp": 0.01048008, + "balance_loss_clip": 1.05864501, + "balance_loss_mlp": 1.02886343, + "epoch": 0.15493762212535697, + "flos": 28147501975680.0, + "grad_norm": 1.7011438482831513, + "language_loss": 0.81805491, + "learning_rate": 3.838479953889465e-06, + "loss": 0.84020621, + "num_input_tokens_seen": 55953970, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.19152832, + "step": 2577, + "time_per_iteration": 2.4957122802734375 + }, + { + "auxiliary_loss_clip": 0.01180341, + "auxiliary_loss_mlp": 0.01052003, + "balance_loss_clip": 1.06998682, + "balance_loss_mlp": 1.03259623, + "epoch": 0.15499774537802496, + "flos": 25411073086080.0, + "grad_norm": 2.4895010743690413, + "language_loss": 0.76486921, + "learning_rate": 3.8383265888715525e-06, + "loss": 0.78719264, + "num_input_tokens_seen": 55973120, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.19421387, + "step": 2578, + "time_per_iteration": 2.4812419414520264 + }, + { + "auxiliary_loss_clip": 0.01170623, + "auxiliary_loss_mlp": 0.01050096, + "balance_loss_clip": 1.06129742, + "balance_loss_mlp": 1.03037846, + "epoch": 0.15505786863069293, + "flos": 22091562720000.0, + "grad_norm": 2.1398748922276796, + "language_loss": 0.8258217, + "learning_rate": 3.83817315414411e-06, + "loss": 0.8480289, + "num_input_tokens_seen": 55993260, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.19714355, + "step": 2579, + "time_per_iteration": 2.4333910942077637 + }, + { + "auxiliary_loss_clip": 0.01171655, + "auxiliary_loss_mlp": 0.01051372, + "balance_loss_clip": 1.06569147, + "balance_loss_mlp": 1.03210735, + "epoch": 0.1551179918833609, + "flos": 18917131386240.0, + "grad_norm": 2.120924672151994, + "language_loss": 0.80537152, + "learning_rate": 3.838019649712958e-06, + "loss": 0.82760173, + "num_input_tokens_seen": 56012130, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.19274902, + "step": 2580, + "time_per_iteration": 2.5249135494232178 + }, + { + "auxiliary_loss_clip": 0.01079162, + "auxiliary_loss_mlp": 0.01016927, + "balance_loss_clip": 1.04257274, + "balance_loss_mlp": 1.01456022, + "epoch": 0.15517811513602886, + "flos": 66239172587520.0, + "grad_norm": 0.8436228900539148, + "language_loss": 0.58849096, + "learning_rate": 3.8378660755839166e-06, + "loss": 0.60945189, + "num_input_tokens_seen": 56079045, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.02368164, + "step": 2581, + "time_per_iteration": 3.187335729598999 + }, + { + "auxiliary_loss_clip": 0.01171203, + "auxiliary_loss_mlp": 0.01045937, + "balance_loss_clip": 1.061674, + "balance_loss_mlp": 1.0260644, + "epoch": 0.15523823838869683, + "flos": 24021078531840.0, + "grad_norm": 1.9611729795563428, + "language_loss": 0.84916669, + "learning_rate": 3.8377124317628095e-06, + "loss": 0.87133813, + "num_input_tokens_seen": 56098745, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.19873047, + "step": 2582, + "time_per_iteration": 2.5144081115722656 + }, + { + "auxiliary_loss_clip": 0.0117033, + "auxiliary_loss_mlp": 0.01064024, + "balance_loss_clip": 1.05977297, + "balance_loss_mlp": 1.04288793, + "epoch": 0.1552983616413648, + "flos": 20485062938880.0, + "grad_norm": 2.226857010579462, + "language_loss": 0.79227561, + "learning_rate": 3.8375587182554625e-06, + "loss": 0.81461912, + "num_input_tokens_seen": 56117655, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.21130371, + "step": 2583, + "time_per_iteration": 2.4671170711517334 + }, + { + "auxiliary_loss_clip": 0.01171227, + "auxiliary_loss_mlp": 0.01051652, + "balance_loss_clip": 1.06168091, + "balance_loss_mlp": 1.03078997, + "epoch": 0.15535848489403276, + "flos": 32123710742400.0, + "grad_norm": 3.1646903734454135, + "language_loss": 0.76105678, + "learning_rate": 3.837404935067705e-06, + "loss": 0.78328556, + "num_input_tokens_seen": 56141960, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.20849609, + "step": 2584, + "time_per_iteration": 3.9880363941192627 + }, + { + "auxiliary_loss_clip": 0.01165613, + "auxiliary_loss_mlp": 0.01045086, + "balance_loss_clip": 1.05941772, + "balance_loss_mlp": 1.02559519, + "epoch": 0.15541860814670075, + "flos": 19098444263040.0, + "grad_norm": 2.049797530388482, + "language_loss": 0.76145059, + "learning_rate": 3.837251082205368e-06, + "loss": 0.78355759, + "num_input_tokens_seen": 56161430, + "router_z_loss_clip": 1.06298828, + "router_z_loss_mlp": 0.19494629, + "step": 2585, + "time_per_iteration": 2.447913885116577 + }, + { + "auxiliary_loss_clip": 0.01162325, + "auxiliary_loss_mlp": 0.01045014, + "balance_loss_clip": 1.05854225, + "balance_loss_mlp": 1.02561855, + "epoch": 0.1554787313993687, + "flos": 19172097100800.0, + "grad_norm": 2.022871632882861, + "language_loss": 0.61957765, + "learning_rate": 3.837097159674286e-06, + "loss": 0.64165103, + "num_input_tokens_seen": 56179390, + "router_z_loss_clip": 1.03857422, + "router_z_loss_mlp": 0.19384766, + "step": 2586, + "time_per_iteration": 2.4236602783203125 + }, + { + "auxiliary_loss_clip": 0.01166627, + "auxiliary_loss_mlp": 0.01043528, + "balance_loss_clip": 1.05619514, + "balance_loss_mlp": 1.02421606, + "epoch": 0.15553885465203668, + "flos": 16143822207360.0, + "grad_norm": 1.7227733835140575, + "language_loss": 0.80940402, + "learning_rate": 3.836943167480296e-06, + "loss": 0.83150554, + "num_input_tokens_seen": 56198020, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.19311523, + "step": 2587, + "time_per_iteration": 2.460238218307495 + }, + { + "auxiliary_loss_clip": 0.01180865, + "auxiliary_loss_mlp": 0.01055872, + "balance_loss_clip": 1.06746984, + "balance_loss_mlp": 1.03253055, + "epoch": 0.15559897790470464, + "flos": 25337779384320.0, + "grad_norm": 2.3211002747669047, + "language_loss": 0.87916267, + "learning_rate": 3.836789105629236e-06, + "loss": 0.90153009, + "num_input_tokens_seen": 56218165, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.23339844, + "step": 2588, + "time_per_iteration": 3.973994016647339 + }, + { + "auxiliary_loss_clip": 0.01170232, + "auxiliary_loss_mlp": 0.01054558, + "balance_loss_clip": 1.06397057, + "balance_loss_mlp": 1.03400648, + "epoch": 0.1556591011573726, + "flos": 23148772744320.0, + "grad_norm": 2.3202510308223943, + "language_loss": 0.64534801, + "learning_rate": 3.83663497412695e-06, + "loss": 0.66759592, + "num_input_tokens_seen": 56237160, + "router_z_loss_clip": 1.06201172, + "router_z_loss_mlp": 0.20544434, + "step": 2589, + "time_per_iteration": 2.4934396743774414 + }, + { + "auxiliary_loss_clip": 0.01176694, + "auxiliary_loss_mlp": 0.01054332, + "balance_loss_clip": 1.06499302, + "balance_loss_mlp": 1.0326364, + "epoch": 0.15571922441004057, + "flos": 25370888745600.0, + "grad_norm": 2.549643108218427, + "language_loss": 0.83204228, + "learning_rate": 3.836480772979281e-06, + "loss": 0.85435253, + "num_input_tokens_seen": 56257610, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.21704102, + "step": 2590, + "time_per_iteration": 4.002323865890503 + }, + { + "auxiliary_loss_clip": 0.01166199, + "auxiliary_loss_mlp": 0.01049128, + "balance_loss_clip": 1.05819511, + "balance_loss_mlp": 1.02944708, + "epoch": 0.15577934766270854, + "flos": 14501375890560.0, + "grad_norm": 2.389439766669923, + "language_loss": 0.79139262, + "learning_rate": 3.836326502192077e-06, + "loss": 0.81354588, + "num_input_tokens_seen": 56275215, + "router_z_loss_clip": 1.08056641, + "router_z_loss_mlp": 0.19689941, + "step": 2591, + "time_per_iteration": 2.40163516998291 + }, + { + "auxiliary_loss_clip": 0.01178981, + "auxiliary_loss_mlp": 0.01055507, + "balance_loss_clip": 1.06986737, + "balance_loss_mlp": 1.03689826, + "epoch": 0.15583947091537653, + "flos": 37414537372800.0, + "grad_norm": 2.217238262734854, + "language_loss": 0.64900619, + "learning_rate": 3.836172161771189e-06, + "loss": 0.67135113, + "num_input_tokens_seen": 56297130, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.18603516, + "step": 2592, + "time_per_iteration": 2.618684768676758 + }, + { + "auxiliary_loss_clip": 0.0117546, + "auxiliary_loss_mlp": 0.01054036, + "balance_loss_clip": 1.06583452, + "balance_loss_mlp": 1.03351998, + "epoch": 0.1558995941680445, + "flos": 21834729498240.0, + "grad_norm": 2.558916859729224, + "language_loss": 0.82106298, + "learning_rate": 3.836017751722467e-06, + "loss": 0.84335792, + "num_input_tokens_seen": 56314995, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.2052002, + "step": 2593, + "time_per_iteration": 3.9370713233947754 + }, + { + "auxiliary_loss_clip": 0.01170456, + "auxiliary_loss_mlp": 0.01050569, + "balance_loss_clip": 1.06396437, + "balance_loss_mlp": 1.03038669, + "epoch": 0.15595971742071246, + "flos": 19792633484160.0, + "grad_norm": 2.232033801732373, + "language_loss": 0.72730219, + "learning_rate": 3.8358632720517695e-06, + "loss": 0.74951243, + "num_input_tokens_seen": 56334005, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.20178223, + "step": 2594, + "time_per_iteration": 2.4991281032562256 + }, + { + "auxiliary_loss_clip": 0.01167644, + "auxiliary_loss_mlp": 0.01042607, + "balance_loss_clip": 1.06291628, + "balance_loss_mlp": 1.02372384, + "epoch": 0.15601984067338043, + "flos": 26722135503360.0, + "grad_norm": 2.615760159104242, + "language_loss": 0.81781846, + "learning_rate": 3.835708722764952e-06, + "loss": 0.839921, + "num_input_tokens_seen": 56353795, + "router_z_loss_clip": 1.04638672, + "router_z_loss_mlp": 0.18884277, + "step": 2595, + "time_per_iteration": 2.5426337718963623 + }, + { + "auxiliary_loss_clip": 0.01163132, + "auxiliary_loss_mlp": 0.01047273, + "balance_loss_clip": 1.05596137, + "balance_loss_mlp": 1.02824676, + "epoch": 0.1560799639260484, + "flos": 18369278173440.0, + "grad_norm": 1.9160701014851949, + "language_loss": 0.86811453, + "learning_rate": 3.835554103867876e-06, + "loss": 0.8902185, + "num_input_tokens_seen": 56373195, + "router_z_loss_clip": 1.07177734, + "router_z_loss_mlp": 0.19030762, + "step": 2596, + "time_per_iteration": 2.48673677444458 + }, + { + "auxiliary_loss_clip": 0.0116889, + "auxiliary_loss_mlp": 0.01044002, + "balance_loss_clip": 1.06478667, + "balance_loss_mlp": 1.02546525, + "epoch": 0.15614008717871636, + "flos": 22598980197120.0, + "grad_norm": 2.206599596958917, + "language_loss": 0.68993098, + "learning_rate": 3.835399415366404e-06, + "loss": 0.71205986, + "num_input_tokens_seen": 56391525, + "router_z_loss_clip": 1.04199219, + "router_z_loss_mlp": 0.18554688, + "step": 2597, + "time_per_iteration": 2.435004949569702 + }, + { + "auxiliary_loss_clip": 0.01167494, + "auxiliary_loss_mlp": 0.01045973, + "balance_loss_clip": 1.06529808, + "balance_loss_mlp": 1.02749586, + "epoch": 0.15620021043138435, + "flos": 22746860490240.0, + "grad_norm": 1.8353775514018884, + "language_loss": 0.79783022, + "learning_rate": 3.8352446572664035e-06, + "loss": 0.819965, + "num_input_tokens_seen": 56410715, + "router_z_loss_clip": 1.02099609, + "router_z_loss_mlp": 0.18493652, + "step": 2598, + "time_per_iteration": 2.5473225116729736 + }, + { + "auxiliary_loss_clip": 0.01158292, + "auxiliary_loss_mlp": 0.0104131, + "balance_loss_clip": 1.05641103, + "balance_loss_mlp": 1.0218904, + "epoch": 0.15626033368405232, + "flos": 13114936782720.0, + "grad_norm": 1.8961840646671788, + "language_loss": 0.82643086, + "learning_rate": 3.8350898295737405e-06, + "loss": 0.84842682, + "num_input_tokens_seen": 56429170, + "router_z_loss_clip": 1.01904297, + "router_z_loss_mlp": 0.19433594, + "step": 2599, + "time_per_iteration": 2.427957057952881 + }, + { + "auxiliary_loss_clip": 0.01181073, + "auxiliary_loss_mlp": 0.01057576, + "balance_loss_clip": 1.06863248, + "balance_loss_mlp": 1.03708375, + "epoch": 0.15632045693672028, + "flos": 16472297105280.0, + "grad_norm": 2.508692560683978, + "language_loss": 0.81645918, + "learning_rate": 3.834934932294287e-06, + "loss": 0.83884573, + "num_input_tokens_seen": 56445685, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.20495605, + "step": 2600, + "time_per_iteration": 2.437466859817505 + }, + { + "auxiliary_loss_clip": 0.01169169, + "auxiliary_loss_mlp": 0.01052761, + "balance_loss_clip": 1.06131315, + "balance_loss_mlp": 1.03385413, + "epoch": 0.15638058018938825, + "flos": 20850346298880.0, + "grad_norm": 2.4109408748152807, + "language_loss": 0.88521123, + "learning_rate": 3.834779965433917e-06, + "loss": 0.90743053, + "num_input_tokens_seen": 56465900, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.18920898, + "step": 2601, + "time_per_iteration": 2.4248194694519043 + }, + { + "auxiliary_loss_clip": 0.01172978, + "auxiliary_loss_mlp": 0.01063264, + "balance_loss_clip": 1.0661515, + "balance_loss_mlp": 1.04204452, + "epoch": 0.1564407034420562, + "flos": 21872220318720.0, + "grad_norm": 1.7171194452631429, + "language_loss": 0.78488159, + "learning_rate": 3.834624928998508e-06, + "loss": 0.807244, + "num_input_tokens_seen": 56485020, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.21228027, + "step": 2602, + "time_per_iteration": 2.4709348678588867 + }, + { + "auxiliary_loss_clip": 0.01166501, + "auxiliary_loss_mlp": 0.01044122, + "balance_loss_clip": 1.06222546, + "balance_loss_mlp": 1.02525139, + "epoch": 0.15650082669472418, + "flos": 21834549930240.0, + "grad_norm": 3.0608370281546504, + "language_loss": 0.73954141, + "learning_rate": 3.8344698229939376e-06, + "loss": 0.76164764, + "num_input_tokens_seen": 56505205, + "router_z_loss_clip": 1.04345703, + "router_z_loss_mlp": 0.1887207, + "step": 2603, + "time_per_iteration": 2.430553436279297 + }, + { + "auxiliary_loss_clip": 0.01170529, + "auxiliary_loss_mlp": 0.01061396, + "balance_loss_clip": 1.06103933, + "balance_loss_mlp": 1.04139256, + "epoch": 0.15656094994739214, + "flos": 13800542653440.0, + "grad_norm": 2.8549045501979764, + "language_loss": 0.88044912, + "learning_rate": 3.8343146474260865e-06, + "loss": 0.90276831, + "num_input_tokens_seen": 56521495, + "router_z_loss_clip": 1.09423828, + "router_z_loss_mlp": 0.19995117, + "step": 2604, + "time_per_iteration": 2.4929611682891846 + }, + { + "auxiliary_loss_clip": 0.01169414, + "auxiliary_loss_mlp": 0.01047466, + "balance_loss_clip": 1.05898333, + "balance_loss_mlp": 1.02795184, + "epoch": 0.15662107320006013, + "flos": 27308197808640.0, + "grad_norm": 1.997831719964711, + "language_loss": 0.85682029, + "learning_rate": 3.834159402300841e-06, + "loss": 0.8789891, + "num_input_tokens_seen": 56540665, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.1953125, + "step": 2605, + "time_per_iteration": 2.475738286972046 + }, + { + "auxiliary_loss_clip": 0.01173082, + "auxiliary_loss_mlp": 0.01056185, + "balance_loss_clip": 1.06169915, + "balance_loss_mlp": 1.03608632, + "epoch": 0.1566811964527281, + "flos": 26685075646080.0, + "grad_norm": 3.0429131545867203, + "language_loss": 0.73155117, + "learning_rate": 3.834004087624087e-06, + "loss": 0.7538439, + "num_input_tokens_seen": 56560805, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.20092773, + "step": 2606, + "time_per_iteration": 2.545529365539551 + }, + { + "auxiliary_loss_clip": 0.01175823, + "auxiliary_loss_mlp": 0.01048851, + "balance_loss_clip": 1.0682646, + "balance_loss_mlp": 1.03068352, + "epoch": 0.15674131970539606, + "flos": 16103422385280.0, + "grad_norm": 2.297003114599753, + "language_loss": 0.76293802, + "learning_rate": 3.8338487034017145e-06, + "loss": 0.7851848, + "num_input_tokens_seen": 56576335, + "router_z_loss_clip": 1.07666016, + "router_z_loss_mlp": 0.18164062, + "step": 2607, + "time_per_iteration": 2.413827419281006 + }, + { + "auxiliary_loss_clip": 0.01176961, + "auxiliary_loss_mlp": 0.01046303, + "balance_loss_clip": 1.07132995, + "balance_loss_mlp": 1.02813518, + "epoch": 0.15680144295806403, + "flos": 19169690889600.0, + "grad_norm": 2.297816302440851, + "language_loss": 0.81974733, + "learning_rate": 3.833693249639615e-06, + "loss": 0.84197992, + "num_input_tokens_seen": 56595880, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.1817627, + "step": 2608, + "time_per_iteration": 2.4942855834960938 + }, + { + "auxiliary_loss_clip": 0.01165568, + "auxiliary_loss_mlp": 0.0105486, + "balance_loss_clip": 1.05787146, + "balance_loss_mlp": 1.03385484, + "epoch": 0.156861566210732, + "flos": 20813430096000.0, + "grad_norm": 1.9281993051016602, + "language_loss": 0.71993899, + "learning_rate": 3.833537726343684e-06, + "loss": 0.74214321, + "num_input_tokens_seen": 56615130, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.20996094, + "step": 2609, + "time_per_iteration": 2.4690232276916504 + }, + { + "auxiliary_loss_clip": 0.01170172, + "auxiliary_loss_mlp": 0.01050278, + "balance_loss_clip": 1.06220114, + "balance_loss_mlp": 1.03071547, + "epoch": 0.15692168946339996, + "flos": 20047922421120.0, + "grad_norm": 1.8694316880994073, + "language_loss": 0.71908277, + "learning_rate": 3.833382133519818e-06, + "loss": 0.74128729, + "num_input_tokens_seen": 56634005, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.19555664, + "step": 2610, + "time_per_iteration": 2.525569438934326 + }, + { + "auxiliary_loss_clip": 0.0116876, + "auxiliary_loss_mlp": 0.01059499, + "balance_loss_clip": 1.05835891, + "balance_loss_mlp": 1.03856587, + "epoch": 0.15698181271606793, + "flos": 21398019943680.0, + "grad_norm": 1.8107761178693391, + "language_loss": 0.72950447, + "learning_rate": 3.833226471173919e-06, + "loss": 0.75178707, + "num_input_tokens_seen": 56653480, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.20922852, + "step": 2611, + "time_per_iteration": 2.468886137008667 + }, + { + "auxiliary_loss_clip": 0.01172064, + "auxiliary_loss_mlp": 0.01049102, + "balance_loss_clip": 1.06340384, + "balance_loss_mlp": 1.02971911, + "epoch": 0.15704193596873592, + "flos": 20845785271680.0, + "grad_norm": 2.163582513724305, + "language_loss": 0.71151471, + "learning_rate": 3.833070739311887e-06, + "loss": 0.73372638, + "num_input_tokens_seen": 56672270, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.19372559, + "step": 2612, + "time_per_iteration": 2.4807279109954834 + }, + { + "auxiliary_loss_clip": 0.01172294, + "auxiliary_loss_mlp": 0.01058276, + "balance_loss_clip": 1.06458616, + "balance_loss_mlp": 1.03859496, + "epoch": 0.15710205922140388, + "flos": 21762908254080.0, + "grad_norm": 1.806446431476777, + "language_loss": 0.75823712, + "learning_rate": 3.83291493793963e-06, + "loss": 0.78054279, + "num_input_tokens_seen": 56691510, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.19677734, + "step": 2613, + "time_per_iteration": 2.4524106979370117 + }, + { + "auxiliary_loss_clip": 0.01164578, + "auxiliary_loss_mlp": 0.01060882, + "balance_loss_clip": 1.05637228, + "balance_loss_mlp": 1.04049754, + "epoch": 0.15716218247407185, + "flos": 25007760201600.0, + "grad_norm": 1.59095306602379, + "language_loss": 0.65859592, + "learning_rate": 3.832759067063055e-06, + "loss": 0.68085051, + "num_input_tokens_seen": 56712230, + "router_z_loss_clip": 1.08251953, + "router_z_loss_mlp": 0.20385742, + "step": 2614, + "time_per_iteration": 2.5492846965789795 + }, + { + "auxiliary_loss_clip": 0.01175559, + "auxiliary_loss_mlp": 0.01058525, + "balance_loss_clip": 1.06537032, + "balance_loss_mlp": 1.03644788, + "epoch": 0.1572223057267398, + "flos": 20191780391040.0, + "grad_norm": 2.2697139395880632, + "language_loss": 0.75302279, + "learning_rate": 3.832603126688072e-06, + "loss": 0.77536362, + "num_input_tokens_seen": 56727490, + "router_z_loss_clip": 1.09912109, + "router_z_loss_mlp": 0.22058105, + "step": 2615, + "time_per_iteration": 2.4287517070770264 + }, + { + "auxiliary_loss_clip": 0.01164627, + "auxiliary_loss_mlp": 0.01054763, + "balance_loss_clip": 1.06113887, + "balance_loss_mlp": 1.03628576, + "epoch": 0.15728242897940778, + "flos": 20959514709120.0, + "grad_norm": 1.629295452606713, + "language_loss": 0.72874838, + "learning_rate": 3.832447116820594e-06, + "loss": 0.75094223, + "num_input_tokens_seen": 56747385, + "router_z_loss_clip": 1.03613281, + "router_z_loss_mlp": 0.18493652, + "step": 2616, + "time_per_iteration": 2.4770662784576416 + }, + { + "auxiliary_loss_clip": 0.01167509, + "auxiliary_loss_mlp": 0.01051901, + "balance_loss_clip": 1.06036401, + "balance_loss_mlp": 1.03170729, + "epoch": 0.15734255223207574, + "flos": 23038275530880.0, + "grad_norm": 3.4324719860899306, + "language_loss": 0.72610974, + "learning_rate": 3.832291037466539e-06, + "loss": 0.74830383, + "num_input_tokens_seen": 56768055, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.20166016, + "step": 2617, + "time_per_iteration": 2.5402607917785645 + }, + { + "auxiliary_loss_clip": 0.01165724, + "auxiliary_loss_mlp": 0.01052234, + "balance_loss_clip": 1.05989468, + "balance_loss_mlp": 1.03076434, + "epoch": 0.15740267548474374, + "flos": 20551281661440.0, + "grad_norm": 2.304897598139724, + "language_loss": 0.74295163, + "learning_rate": 3.8321348886318235e-06, + "loss": 0.76513124, + "num_input_tokens_seen": 56785110, + "router_z_loss_clip": 1.05712891, + "router_z_loss_mlp": 0.21459961, + "step": 2618, + "time_per_iteration": 2.512627601623535 + }, + { + "auxiliary_loss_clip": 0.01172378, + "auxiliary_loss_mlp": 0.01052114, + "balance_loss_clip": 1.06135964, + "balance_loss_mlp": 1.03073978, + "epoch": 0.1574627987374117, + "flos": 22666922772480.0, + "grad_norm": 2.0198412828544186, + "language_loss": 0.78581196, + "learning_rate": 3.8319786703223695e-06, + "loss": 0.80805695, + "num_input_tokens_seen": 56804975, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.21362305, + "step": 2619, + "time_per_iteration": 2.4551148414611816 + }, + { + "auxiliary_loss_clip": 0.01169149, + "auxiliary_loss_mlp": 0.01056739, + "balance_loss_clip": 1.06269169, + "balance_loss_mlp": 1.03757048, + "epoch": 0.15752292199007967, + "flos": 16800664262400.0, + "grad_norm": 2.052517518267784, + "language_loss": 0.76690936, + "learning_rate": 3.831822382544101e-06, + "loss": 0.78916824, + "num_input_tokens_seen": 56822470, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.19165039, + "step": 2620, + "time_per_iteration": 2.4836723804473877 + }, + { + "auxiliary_loss_clip": 0.01171473, + "auxiliary_loss_mlp": 0.01051307, + "balance_loss_clip": 1.06210148, + "balance_loss_mlp": 1.03044486, + "epoch": 0.15758304524274763, + "flos": 29826002568960.0, + "grad_norm": 1.8613739434029744, + "language_loss": 0.7052117, + "learning_rate": 3.831666025302944e-06, + "loss": 0.72743952, + "num_input_tokens_seen": 56842100, + "router_z_loss_clip": 1.09423828, + "router_z_loss_mlp": 0.20874023, + "step": 2621, + "time_per_iteration": 2.501408815383911 + }, + { + "auxiliary_loss_clip": 0.01178005, + "auxiliary_loss_mlp": 0.01052569, + "balance_loss_clip": 1.06820226, + "balance_loss_mlp": 1.03199363, + "epoch": 0.1576431684954156, + "flos": 53577426723840.0, + "grad_norm": 1.970736275555306, + "language_loss": 0.72186965, + "learning_rate": 3.831509598604828e-06, + "loss": 0.74417543, + "num_input_tokens_seen": 56865920, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.20593262, + "step": 2622, + "time_per_iteration": 2.807171106338501 + }, + { + "auxiliary_loss_clip": 0.01170526, + "auxiliary_loss_mlp": 0.01043091, + "balance_loss_clip": 1.06170332, + "balance_loss_mlp": 1.02507854, + "epoch": 0.15770329174808356, + "flos": 20813609664000.0, + "grad_norm": 1.727003986420605, + "language_loss": 0.87779188, + "learning_rate": 3.831353102455684e-06, + "loss": 0.89992809, + "num_input_tokens_seen": 56885265, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.17993164, + "step": 2623, + "time_per_iteration": 2.507289409637451 + }, + { + "auxiliary_loss_clip": 0.0116594, + "auxiliary_loss_mlp": 0.01045252, + "balance_loss_clip": 1.06128168, + "balance_loss_mlp": 1.02702475, + "epoch": 0.15776341500075153, + "flos": 24974004395520.0, + "grad_norm": 1.7500896163546744, + "language_loss": 0.81935358, + "learning_rate": 3.831196536861448e-06, + "loss": 0.84146547, + "num_input_tokens_seen": 56906710, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.18237305, + "step": 2624, + "time_per_iteration": 2.668030261993408 + }, + { + "auxiliary_loss_clip": 0.01173336, + "auxiliary_loss_mlp": 0.01049835, + "balance_loss_clip": 1.06387699, + "balance_loss_mlp": 1.03033233, + "epoch": 0.15782353825341952, + "flos": 21907915459200.0, + "grad_norm": 2.6175757024487774, + "language_loss": 0.79747647, + "learning_rate": 3.831039901828054e-06, + "loss": 0.81970817, + "num_input_tokens_seen": 56924275, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.19506836, + "step": 2625, + "time_per_iteration": 2.5117504596710205 + }, + { + "auxiliary_loss_clip": 0.0118022, + "auxiliary_loss_mlp": 0.01047456, + "balance_loss_clip": 1.07228982, + "balance_loss_mlp": 1.02949071, + "epoch": 0.15788366150608749, + "flos": 26177191292160.0, + "grad_norm": 2.477952698214655, + "language_loss": 0.80726606, + "learning_rate": 3.830883197361445e-06, + "loss": 0.82954282, + "num_input_tokens_seen": 56941525, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.17956543, + "step": 2626, + "time_per_iteration": 2.544029474258423 + }, + { + "auxiliary_loss_clip": 0.01167601, + "auxiliary_loss_mlp": 0.01051689, + "balance_loss_clip": 1.06341684, + "balance_loss_mlp": 1.03193641, + "epoch": 0.15794378475875545, + "flos": 27709822753920.0, + "grad_norm": 1.7534402902017618, + "language_loss": 0.73591036, + "learning_rate": 3.830726423467561e-06, + "loss": 0.75810325, + "num_input_tokens_seen": 56962145, + "router_z_loss_clip": 1.04150391, + "router_z_loss_mlp": 0.19750977, + "step": 2627, + "time_per_iteration": 3.9497780799865723 + }, + { + "auxiliary_loss_clip": 0.01163981, + "auxiliary_loss_mlp": 0.0104826, + "balance_loss_clip": 1.05907834, + "balance_loss_mlp": 1.02896011, + "epoch": 0.15800390801142342, + "flos": 12130158533760.0, + "grad_norm": 4.140511007196983, + "language_loss": 0.84932417, + "learning_rate": 3.830569580152348e-06, + "loss": 0.87144661, + "num_input_tokens_seen": 56977505, + "router_z_loss_clip": 1.04785156, + "router_z_loss_mlp": 0.19311523, + "step": 2628, + "time_per_iteration": 2.486623525619507 + }, + { + "auxiliary_loss_clip": 0.01168031, + "auxiliary_loss_mlp": 0.01041052, + "balance_loss_clip": 1.06402159, + "balance_loss_mlp": 1.02312279, + "epoch": 0.15806403126409138, + "flos": 20704728562560.0, + "grad_norm": 1.7783368061216076, + "language_loss": 0.77133894, + "learning_rate": 3.830412667421752e-06, + "loss": 0.79342973, + "num_input_tokens_seen": 56996770, + "router_z_loss_clip": 1.03955078, + "router_z_loss_mlp": 0.17932129, + "step": 2629, + "time_per_iteration": 2.5283334255218506 + }, + { + "auxiliary_loss_clip": 0.01167521, + "auxiliary_loss_mlp": 0.01048032, + "balance_loss_clip": 1.06121564, + "balance_loss_mlp": 1.0286845, + "epoch": 0.15812415451675935, + "flos": 17821712269440.0, + "grad_norm": 2.1454452052161614, + "language_loss": 0.73433381, + "learning_rate": 3.8302556852817245e-06, + "loss": 0.75648934, + "num_input_tokens_seen": 57014970, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.19348145, + "step": 2630, + "time_per_iteration": 2.4862325191497803 + }, + { + "auxiliary_loss_clip": 0.01162509, + "auxiliary_loss_mlp": 0.01044419, + "balance_loss_clip": 1.05553496, + "balance_loss_mlp": 1.02514243, + "epoch": 0.15818427776942734, + "flos": 20084048524800.0, + "grad_norm": 8.850591113140863, + "language_loss": 0.83918458, + "learning_rate": 3.8300986337382184e-06, + "loss": 0.86125386, + "num_input_tokens_seen": 57034045, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.19250488, + "step": 2631, + "time_per_iteration": 2.5380806922912598 + }, + { + "auxiliary_loss_clip": 0.01165591, + "auxiliary_loss_mlp": 0.01053288, + "balance_loss_clip": 1.05791473, + "balance_loss_mlp": 1.03260565, + "epoch": 0.1582444010220953, + "flos": 21214911386880.0, + "grad_norm": 1.6673831754309862, + "language_loss": 0.78926599, + "learning_rate": 3.8299415127971895e-06, + "loss": 0.81145477, + "num_input_tokens_seen": 57053695, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.20666504, + "step": 2632, + "time_per_iteration": 3.9288971424102783 + }, + { + "auxiliary_loss_clip": 0.01174021, + "auxiliary_loss_mlp": 0.01052511, + "balance_loss_clip": 1.06614137, + "balance_loss_mlp": 1.0333662, + "epoch": 0.15830452427476327, + "flos": 17858341163520.0, + "grad_norm": 1.9402405830050935, + "language_loss": 0.83596396, + "learning_rate": 3.829784322464594e-06, + "loss": 0.85822928, + "num_input_tokens_seen": 57071290, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.19152832, + "step": 2633, + "time_per_iteration": 2.5534284114837646 + }, + { + "auxiliary_loss_clip": 0.01168725, + "auxiliary_loss_mlp": 0.01048213, + "balance_loss_clip": 1.06080008, + "balance_loss_mlp": 1.02878213, + "epoch": 0.15836464752743123, + "flos": 24534960456960.0, + "grad_norm": 1.6995191120375577, + "language_loss": 0.76902354, + "learning_rate": 3.829627062746394e-06, + "loss": 0.79119289, + "num_input_tokens_seen": 57091465, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.19445801, + "step": 2634, + "time_per_iteration": 3.8585429191589355 + }, + { + "auxiliary_loss_clip": 0.01177572, + "auxiliary_loss_mlp": 0.01049565, + "balance_loss_clip": 1.06719053, + "balance_loss_mlp": 1.03053904, + "epoch": 0.1584247707800992, + "flos": 20120821073280.0, + "grad_norm": 2.461013901772709, + "language_loss": 0.89012474, + "learning_rate": 3.829469733648552e-06, + "loss": 0.91239607, + "num_input_tokens_seen": 57110075, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.19030762, + "step": 2635, + "time_per_iteration": 2.427752733230591 + }, + { + "auxiliary_loss_clip": 0.01176051, + "auxiliary_loss_mlp": 0.01060968, + "balance_loss_clip": 1.06682432, + "balance_loss_mlp": 1.04188299, + "epoch": 0.15848489403276717, + "flos": 20375966355840.0, + "grad_norm": 2.348848186612764, + "language_loss": 0.75937033, + "learning_rate": 3.829312335177034e-06, + "loss": 0.78174049, + "num_input_tokens_seen": 57128945, + "router_z_loss_clip": 1.09326172, + "router_z_loss_mlp": 0.1907959, + "step": 2636, + "time_per_iteration": 2.5263707637786865 + }, + { + "auxiliary_loss_clip": 0.01166231, + "auxiliary_loss_mlp": 0.01048113, + "balance_loss_clip": 1.05967379, + "balance_loss_mlp": 1.02812195, + "epoch": 0.15854501728543513, + "flos": 39346890359040.0, + "grad_norm": 2.2438828278563117, + "language_loss": 0.72446167, + "learning_rate": 3.82915486733781e-06, + "loss": 0.74660516, + "num_input_tokens_seen": 57152385, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.19970703, + "step": 2637, + "time_per_iteration": 4.027221441268921 + }, + { + "auxiliary_loss_clip": 0.0115813, + "auxiliary_loss_mlp": 0.01044944, + "balance_loss_clip": 1.05533671, + "balance_loss_mlp": 1.02719426, + "epoch": 0.15860514053810312, + "flos": 24864225454080.0, + "grad_norm": 1.905338470756114, + "language_loss": 0.77793586, + "learning_rate": 3.82899733013685e-06, + "loss": 0.79996657, + "num_input_tokens_seen": 57172620, + "router_z_loss_clip": 1.02929688, + "router_z_loss_mlp": 0.17773438, + "step": 2638, + "time_per_iteration": 2.5271098613739014 + }, + { + "auxiliary_loss_clip": 0.01166442, + "auxiliary_loss_mlp": 0.01061327, + "balance_loss_clip": 1.0574218, + "balance_loss_mlp": 1.04056048, + "epoch": 0.1586652637907711, + "flos": 26177694082560.0, + "grad_norm": 2.0279932239670666, + "language_loss": 0.7579208, + "learning_rate": 3.828839723580128e-06, + "loss": 0.78019845, + "num_input_tokens_seen": 57194680, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.2076416, + "step": 2639, + "time_per_iteration": 2.5944368839263916 + }, + { + "auxiliary_loss_clip": 0.01170946, + "auxiliary_loss_mlp": 0.01061289, + "balance_loss_clip": 1.06259966, + "balance_loss_mlp": 1.04034448, + "epoch": 0.15872538704343905, + "flos": 19792058866560.0, + "grad_norm": 1.8990456223735117, + "language_loss": 0.81390083, + "learning_rate": 3.82868204767362e-06, + "loss": 0.83622313, + "num_input_tokens_seen": 57214675, + "router_z_loss_clip": 1.08349609, + "router_z_loss_mlp": 0.20947266, + "step": 2640, + "time_per_iteration": 2.4928510189056396 + }, + { + "auxiliary_loss_clip": 0.01171749, + "auxiliary_loss_mlp": 0.01051685, + "balance_loss_clip": 1.06621766, + "balance_loss_mlp": 1.03264737, + "epoch": 0.15878551029610702, + "flos": 28475366342400.0, + "grad_norm": 1.520877611486049, + "language_loss": 0.67307788, + "learning_rate": 3.828524302423306e-06, + "loss": 0.69531226, + "num_input_tokens_seen": 57235830, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.19067383, + "step": 2641, + "time_per_iteration": 2.550243854522705 + }, + { + "auxiliary_loss_clip": 0.01179105, + "auxiliary_loss_mlp": 0.0105403, + "balance_loss_clip": 1.06636381, + "balance_loss_mlp": 1.03433633, + "epoch": 0.15884563354877498, + "flos": 24206701040640.0, + "grad_norm": 2.390353106771804, + "language_loss": 0.75554651, + "learning_rate": 3.828366487835167e-06, + "loss": 0.77787787, + "num_input_tokens_seen": 57255970, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.19689941, + "step": 2642, + "time_per_iteration": 2.4766056537628174 + }, + { + "auxiliary_loss_clip": 0.01171139, + "auxiliary_loss_mlp": 0.0105263, + "balance_loss_clip": 1.06581783, + "balance_loss_mlp": 1.03355646, + "epoch": 0.15890575680144295, + "flos": 23949795991680.0, + "grad_norm": 3.1538027351245432, + "language_loss": 0.70269507, + "learning_rate": 3.828208603915186e-06, + "loss": 0.72493279, + "num_input_tokens_seen": 57274435, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.1907959, + "step": 2643, + "time_per_iteration": 2.518160104751587 + }, + { + "auxiliary_loss_clip": 0.01157157, + "auxiliary_loss_mlp": 0.01041085, + "balance_loss_clip": 1.0554862, + "balance_loss_mlp": 1.02301264, + "epoch": 0.15896588005411091, + "flos": 21215019127680.0, + "grad_norm": 2.0913760018907745, + "language_loss": 0.78469402, + "learning_rate": 3.828050650669353e-06, + "loss": 0.80667639, + "num_input_tokens_seen": 57293115, + "router_z_loss_clip": 1.01708984, + "router_z_loss_mlp": 0.18066406, + "step": 2644, + "time_per_iteration": 2.536787748336792 + }, + { + "auxiliary_loss_clip": 0.01157355, + "auxiliary_loss_mlp": 0.01042602, + "balance_loss_clip": 1.0544076, + "balance_loss_mlp": 1.02381504, + "epoch": 0.1590260033067789, + "flos": 24352390604160.0, + "grad_norm": 1.958806882807848, + "language_loss": 0.82487553, + "learning_rate": 3.827892628103657e-06, + "loss": 0.84687513, + "num_input_tokens_seen": 57312565, + "router_z_loss_clip": 1.02832031, + "router_z_loss_mlp": 0.18774414, + "step": 2645, + "time_per_iteration": 2.620619058609009 + }, + { + "auxiliary_loss_clip": 0.01165631, + "auxiliary_loss_mlp": 0.01052821, + "balance_loss_clip": 1.05776298, + "balance_loss_mlp": 1.03266311, + "epoch": 0.15908612655944687, + "flos": 32048944583040.0, + "grad_norm": 2.033367211972131, + "language_loss": 0.70423955, + "learning_rate": 3.827734536224087e-06, + "loss": 0.7264241, + "num_input_tokens_seen": 57333360, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.20166016, + "step": 2646, + "time_per_iteration": 2.532968282699585 + }, + { + "auxiliary_loss_clip": 0.0116198, + "auxiliary_loss_mlp": 0.01054887, + "balance_loss_clip": 1.05873871, + "balance_loss_mlp": 1.03457332, + "epoch": 0.15914624981211484, + "flos": 17785370684160.0, + "grad_norm": 2.236472379381863, + "language_loss": 0.62136394, + "learning_rate": 3.827576375036642e-06, + "loss": 0.64353263, + "num_input_tokens_seen": 57350575, + "router_z_loss_clip": 1.03320312, + "router_z_loss_mlp": 0.203125, + "step": 2647, + "time_per_iteration": 2.464050054550171 + }, + { + "auxiliary_loss_clip": 0.01166787, + "auxiliary_loss_mlp": 0.01046592, + "balance_loss_clip": 1.0632863, + "balance_loss_mlp": 1.02784061, + "epoch": 0.1592063730647828, + "flos": 17712507945600.0, + "grad_norm": 2.2027378845248515, + "language_loss": 0.90119529, + "learning_rate": 3.827418144547318e-06, + "loss": 0.92332911, + "num_input_tokens_seen": 57367570, + "router_z_loss_clip": 1.03466797, + "router_z_loss_mlp": 0.18762207, + "step": 2648, + "time_per_iteration": 2.4089648723602295 + }, + { + "auxiliary_loss_clip": 0.01164171, + "auxiliary_loss_mlp": 0.01044128, + "balance_loss_clip": 1.06327772, + "balance_loss_mlp": 1.02689028, + "epoch": 0.15926649631745077, + "flos": 18803545603200.0, + "grad_norm": 2.0780034501290943, + "language_loss": 0.9161334, + "learning_rate": 3.827259844762114e-06, + "loss": 0.93821639, + "num_input_tokens_seen": 57383980, + "router_z_loss_clip": 1.00683594, + "router_z_loss_mlp": 0.17236328, + "step": 2649, + "time_per_iteration": 2.513737916946411 + }, + { + "auxiliary_loss_clip": 0.01173856, + "auxiliary_loss_mlp": 0.01048519, + "balance_loss_clip": 1.05997419, + "balance_loss_mlp": 1.02849174, + "epoch": 0.15932661957011873, + "flos": 17566243764480.0, + "grad_norm": 2.478052281782943, + "language_loss": 0.71830845, + "learning_rate": 3.827101475687033e-06, + "loss": 0.74053228, + "num_input_tokens_seen": 57400840, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.20019531, + "step": 2650, + "time_per_iteration": 2.4703187942504883 + }, + { + "auxiliary_loss_clip": 0.01171705, + "auxiliary_loss_mlp": 0.0104015, + "balance_loss_clip": 1.06856298, + "balance_loss_mlp": 1.02304363, + "epoch": 0.15938674282278673, + "flos": 13334351011200.0, + "grad_norm": 2.0545693169602197, + "language_loss": 0.71435279, + "learning_rate": 3.826943037328082e-06, + "loss": 0.7364713, + "num_input_tokens_seen": 57419230, + "router_z_loss_clip": 1.03076172, + "router_z_loss_mlp": 0.17102051, + "step": 2651, + "time_per_iteration": 2.619884729385376 + }, + { + "auxiliary_loss_clip": 0.01163508, + "auxiliary_loss_mlp": 0.0104923, + "balance_loss_clip": 1.05648232, + "balance_loss_mlp": 1.03057432, + "epoch": 0.1594468660754547, + "flos": 22488842119680.0, + "grad_norm": 1.9871317302876956, + "language_loss": 0.80116045, + "learning_rate": 3.8267845296912674e-06, + "loss": 0.82328784, + "num_input_tokens_seen": 57439315, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.18652344, + "step": 2652, + "time_per_iteration": 2.487679958343506 + }, + { + "auxiliary_loss_clip": 0.01160947, + "auxiliary_loss_mlp": 0.01046167, + "balance_loss_clip": 1.05971074, + "balance_loss_mlp": 1.02846479, + "epoch": 0.15950698932812266, + "flos": 15007320910080.0, + "grad_norm": 3.027766134804142, + "language_loss": 0.6998769, + "learning_rate": 3.826625952782601e-06, + "loss": 0.72194803, + "num_input_tokens_seen": 57454635, + "router_z_loss_clip": 1.01171875, + "router_z_loss_mlp": 0.17687988, + "step": 2653, + "time_per_iteration": 2.4617538452148438 + }, + { + "auxiliary_loss_clip": 0.01174442, + "auxiliary_loss_mlp": 0.0103979, + "balance_loss_clip": 1.06895459, + "balance_loss_mlp": 1.02077603, + "epoch": 0.15956711258079062, + "flos": 30155052084480.0, + "grad_norm": 2.38097753700263, + "language_loss": 0.77051687, + "learning_rate": 3.826467306608095e-06, + "loss": 0.79265922, + "num_input_tokens_seen": 57476805, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.19006348, + "step": 2654, + "time_per_iteration": 2.516296148300171 + }, + { + "auxiliary_loss_clip": 0.01167581, + "auxiliary_loss_mlp": 0.01040531, + "balance_loss_clip": 1.06367636, + "balance_loss_mlp": 1.02183878, + "epoch": 0.1596272358334586, + "flos": 21032700670080.0, + "grad_norm": 1.9476316817439068, + "language_loss": 0.8204928, + "learning_rate": 3.826308591173765e-06, + "loss": 0.84257388, + "num_input_tokens_seen": 57496400, + "router_z_loss_clip": 1.04052734, + "router_z_loss_mlp": 0.18688965, + "step": 2655, + "time_per_iteration": 2.505373477935791 + }, + { + "auxiliary_loss_clip": 0.01163594, + "auxiliary_loss_mlp": 0.01048256, + "balance_loss_clip": 1.05597162, + "balance_loss_mlp": 1.02989721, + "epoch": 0.15968735908612655, + "flos": 15268032800640.0, + "grad_norm": 2.087683833661472, + "language_loss": 0.74001467, + "learning_rate": 3.826149806485631e-06, + "loss": 0.76213324, + "num_input_tokens_seen": 57513700, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.18371582, + "step": 2656, + "time_per_iteration": 2.454453706741333 + }, + { + "auxiliary_loss_clip": 0.01161024, + "auxiliary_loss_mlp": 0.01044772, + "balance_loss_clip": 1.05841291, + "balance_loss_mlp": 1.02640164, + "epoch": 0.15974748233879452, + "flos": 52665726695040.0, + "grad_norm": 2.0124512950075193, + "language_loss": 0.7822957, + "learning_rate": 3.825990952549713e-06, + "loss": 0.80435365, + "num_input_tokens_seen": 57536180, + "router_z_loss_clip": 1.02587891, + "router_z_loss_mlp": 0.18371582, + "step": 2657, + "time_per_iteration": 2.824191093444824 + }, + { + "auxiliary_loss_clip": 0.01167555, + "auxiliary_loss_mlp": 0.01052599, + "balance_loss_clip": 1.06442523, + "balance_loss_mlp": 1.03364515, + "epoch": 0.1598076055914625, + "flos": 18733232730240.0, + "grad_norm": 1.8965161658119207, + "language_loss": 0.74357438, + "learning_rate": 3.825832029372035e-06, + "loss": 0.76577586, + "num_input_tokens_seen": 57555025, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.18969727, + "step": 2658, + "time_per_iteration": 2.4404664039611816 + }, + { + "auxiliary_loss_clip": 0.01161457, + "auxiliary_loss_mlp": 0.0105311, + "balance_loss_clip": 1.05467653, + "balance_loss_mlp": 1.03165269, + "epoch": 0.15986772884413047, + "flos": 34349238535680.0, + "grad_norm": 1.7772262094698053, + "language_loss": 0.7555089, + "learning_rate": 3.825673036958624e-06, + "loss": 0.77765453, + "num_input_tokens_seen": 57577660, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.21459961, + "step": 2659, + "time_per_iteration": 2.60727596282959 + }, + { + "auxiliary_loss_clip": 0.01162617, + "auxiliary_loss_mlp": 0.01049557, + "balance_loss_clip": 1.05594265, + "balance_loss_mlp": 1.03028131, + "epoch": 0.15992785209679844, + "flos": 22054969739520.0, + "grad_norm": 2.193595676775038, + "language_loss": 0.90502644, + "learning_rate": 3.825513975315508e-06, + "loss": 0.92714822, + "num_input_tokens_seen": 57596335, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.19274902, + "step": 2660, + "time_per_iteration": 2.453984022140503 + }, + { + "auxiliary_loss_clip": 0.01167006, + "auxiliary_loss_mlp": 0.0105, + "balance_loss_clip": 1.06147158, + "balance_loss_mlp": 1.03064096, + "epoch": 0.1599879753494664, + "flos": 33066652625280.0, + "grad_norm": 2.425539651586679, + "language_loss": 0.78019834, + "learning_rate": 3.82535484444872e-06, + "loss": 0.8023684, + "num_input_tokens_seen": 57616830, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.19360352, + "step": 2661, + "time_per_iteration": 2.646239995956421 + }, + { + "auxiliary_loss_clip": 0.01167837, + "auxiliary_loss_mlp": 0.01046528, + "balance_loss_clip": 1.05987322, + "balance_loss_mlp": 1.02708483, + "epoch": 0.16004809860213437, + "flos": 28038010343040.0, + "grad_norm": 1.7299194156466864, + "language_loss": 0.74580371, + "learning_rate": 3.825195644364292e-06, + "loss": 0.76794732, + "num_input_tokens_seen": 57635515, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.19445801, + "step": 2662, + "time_per_iteration": 2.5404107570648193 + }, + { + "auxiliary_loss_clip": 0.01163605, + "auxiliary_loss_mlp": 0.01055852, + "balance_loss_clip": 1.05742955, + "balance_loss_mlp": 1.03651667, + "epoch": 0.16010822185480234, + "flos": 22780113505920.0, + "grad_norm": 1.892912763351362, + "language_loss": 0.82226264, + "learning_rate": 3.825036375068263e-06, + "loss": 0.84445715, + "num_input_tokens_seen": 57654250, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.19335938, + "step": 2663, + "time_per_iteration": 2.5696656703948975 + }, + { + "auxiliary_loss_clip": 0.01169081, + "auxiliary_loss_mlp": 0.01051526, + "balance_loss_clip": 1.05866575, + "balance_loss_mlp": 1.03142691, + "epoch": 0.16016834510747033, + "flos": 20084012611200.0, + "grad_norm": 2.5766788835099885, + "language_loss": 0.797943, + "learning_rate": 3.824877036566672e-06, + "loss": 0.82014906, + "num_input_tokens_seen": 57672645, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.20117188, + "step": 2664, + "time_per_iteration": 2.434004545211792 + }, + { + "auxiliary_loss_clip": 0.01162517, + "auxiliary_loss_mlp": 0.01055176, + "balance_loss_clip": 1.05619073, + "balance_loss_mlp": 1.03550625, + "epoch": 0.1602284683601383, + "flos": 21173829206400.0, + "grad_norm": 1.7335018415549168, + "language_loss": 0.93724376, + "learning_rate": 3.824717628865561e-06, + "loss": 0.95942074, + "num_input_tokens_seen": 57691055, + "router_z_loss_clip": 1.06201172, + "router_z_loss_mlp": 0.19677734, + "step": 2665, + "time_per_iteration": 2.444481134414673 + }, + { + "auxiliary_loss_clip": 0.01172502, + "auxiliary_loss_mlp": 0.01046004, + "balance_loss_clip": 1.05974364, + "balance_loss_mlp": 1.02613211, + "epoch": 0.16028859161280626, + "flos": 14647568244480.0, + "grad_norm": 2.5680469591088007, + "language_loss": 0.85082519, + "learning_rate": 3.824558151970974e-06, + "loss": 0.87301022, + "num_input_tokens_seen": 57707235, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.19885254, + "step": 2666, + "time_per_iteration": 2.387073040008545 + }, + { + "auxiliary_loss_clip": 0.01165067, + "auxiliary_loss_mlp": 0.0104838, + "balance_loss_clip": 1.0591619, + "balance_loss_mlp": 1.03030789, + "epoch": 0.16034871486547422, + "flos": 20990325600000.0, + "grad_norm": 1.8708074949688005, + "language_loss": 0.8152405, + "learning_rate": 3.8243986058889595e-06, + "loss": 0.83737499, + "num_input_tokens_seen": 57724190, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.18066406, + "step": 2667, + "time_per_iteration": 2.565269708633423 + }, + { + "auxiliary_loss_clip": 0.01158596, + "auxiliary_loss_mlp": 0.0105397, + "balance_loss_clip": 1.05516791, + "balance_loss_mlp": 1.03387189, + "epoch": 0.1604088381181422, + "flos": 21397732634880.0, + "grad_norm": 2.1004543856655595, + "language_loss": 0.73680532, + "learning_rate": 3.824238990625567e-06, + "loss": 0.75893092, + "num_input_tokens_seen": 57743620, + "router_z_loss_clip": 1.03417969, + "router_z_loss_mlp": 0.20092773, + "step": 2668, + "time_per_iteration": 2.474177598953247 + }, + { + "auxiliary_loss_clip": 0.01159052, + "auxiliary_loss_mlp": 0.01057125, + "balance_loss_clip": 1.05310404, + "balance_loss_mlp": 1.03561926, + "epoch": 0.16046896137081015, + "flos": 23877040993920.0, + "grad_norm": 1.7655924354392134, + "language_loss": 0.77334398, + "learning_rate": 3.824079306186848e-06, + "loss": 0.79550576, + "num_input_tokens_seen": 57764810, + "router_z_loss_clip": 1.06005859, + "router_z_loss_mlp": 0.21520996, + "step": 2669, + "time_per_iteration": 2.4974749088287354 + }, + { + "auxiliary_loss_clip": 0.01092324, + "auxiliary_loss_mlp": 0.01008033, + "balance_loss_clip": 1.05145967, + "balance_loss_mlp": 1.00559258, + "epoch": 0.16052908462347812, + "flos": 59806709015040.0, + "grad_norm": 0.8000525747105738, + "language_loss": 0.5549174, + "learning_rate": 3.823919552578861e-06, + "loss": 0.57592094, + "num_input_tokens_seen": 57824390, + "router_z_loss_clip": 0.40771484, + "router_z_loss_mlp": 0.02441406, + "step": 2670, + "time_per_iteration": 2.940316677093506 + }, + { + "auxiliary_loss_clip": 0.01164201, + "auxiliary_loss_mlp": 0.01045817, + "balance_loss_clip": 1.05670691, + "balance_loss_mlp": 1.0268867, + "epoch": 0.1605892078761461, + "flos": 18296559089280.0, + "grad_norm": 2.0951588836856576, + "language_loss": 0.77512038, + "learning_rate": 3.82375972980766e-06, + "loss": 0.79722059, + "num_input_tokens_seen": 57843665, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.18945312, + "step": 2671, + "time_per_iteration": 3.8173556327819824 + }, + { + "auxiliary_loss_clip": 0.01166902, + "auxiliary_loss_mlp": 0.01048335, + "balance_loss_clip": 1.05929983, + "balance_loss_mlp": 1.0296905, + "epoch": 0.16064933112881408, + "flos": 32160734686080.0, + "grad_norm": 1.8757762961964755, + "language_loss": 0.65009266, + "learning_rate": 3.8235998378793086e-06, + "loss": 0.67224503, + "num_input_tokens_seen": 57863305, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.18652344, + "step": 2672, + "time_per_iteration": 2.617610454559326 + }, + { + "auxiliary_loss_clip": 0.01170535, + "auxiliary_loss_mlp": 0.01042392, + "balance_loss_clip": 1.06146026, + "balance_loss_mlp": 1.02137518, + "epoch": 0.16070945438148204, + "flos": 19828795501440.0, + "grad_norm": 2.6206184911951738, + "language_loss": 0.85470629, + "learning_rate": 3.8234398767998675e-06, + "loss": 0.87683558, + "num_input_tokens_seen": 57883025, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.21020508, + "step": 2673, + "time_per_iteration": 2.4715230464935303 + }, + { + "auxiliary_loss_clip": 0.0116923, + "auxiliary_loss_mlp": 0.01051337, + "balance_loss_clip": 1.06089842, + "balance_loss_mlp": 1.03209698, + "epoch": 0.16076957763415, + "flos": 18913144976640.0, + "grad_norm": 2.468676350189091, + "language_loss": 0.73310947, + "learning_rate": 3.823279846575403e-06, + "loss": 0.75531518, + "num_input_tokens_seen": 57901430, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.19250488, + "step": 2674, + "time_per_iteration": 2.4937758445739746 + }, + { + "auxiliary_loss_clip": 0.01163656, + "auxiliary_loss_mlp": 0.01043686, + "balance_loss_clip": 1.05843616, + "balance_loss_mlp": 1.02288437, + "epoch": 0.16082970088681797, + "flos": 16764358590720.0, + "grad_norm": 1.7705910456416183, + "language_loss": 0.8459143, + "learning_rate": 3.823119747211986e-06, + "loss": 0.86798775, + "num_input_tokens_seen": 57919550, + "router_z_loss_clip": 1.05224609, + "router_z_loss_mlp": 0.20800781, + "step": 2675, + "time_per_iteration": 2.4585378170013428 + }, + { + "auxiliary_loss_clip": 0.01175198, + "auxiliary_loss_mlp": 0.01043137, + "balance_loss_clip": 1.06628823, + "balance_loss_mlp": 1.02276421, + "epoch": 0.16088982413948594, + "flos": 35150261783040.0, + "grad_norm": 1.804934114044033, + "language_loss": 0.82422376, + "learning_rate": 3.822959578715685e-06, + "loss": 0.84640706, + "num_input_tokens_seen": 57939890, + "router_z_loss_clip": 1.08935547, + "router_z_loss_mlp": 0.20373535, + "step": 2676, + "time_per_iteration": 4.007381200790405 + }, + { + "auxiliary_loss_clip": 0.01168152, + "auxiliary_loss_mlp": 0.01049554, + "balance_loss_clip": 1.06305432, + "balance_loss_mlp": 1.03149402, + "epoch": 0.1609499473921539, + "flos": 18625105814400.0, + "grad_norm": 2.5337754589119967, + "language_loss": 0.73248982, + "learning_rate": 3.822799341092573e-06, + "loss": 0.75466686, + "num_input_tokens_seen": 57957410, + "router_z_loss_clip": 1.05322266, + "router_z_loss_mlp": 0.18066406, + "step": 2677, + "time_per_iteration": 3.9944753646850586 + }, + { + "auxiliary_loss_clip": 0.01160553, + "auxiliary_loss_mlp": 0.01040994, + "balance_loss_clip": 1.05713058, + "balance_loss_mlp": 1.02243352, + "epoch": 0.1610100706448219, + "flos": 33145728416640.0, + "grad_norm": 1.7906060214899102, + "language_loss": 0.76322329, + "learning_rate": 3.822639034348728e-06, + "loss": 0.78523874, + "num_input_tokens_seen": 57977900, + "router_z_loss_clip": 1.03417969, + "router_z_loss_mlp": 0.18566895, + "step": 2678, + "time_per_iteration": 2.5367510318756104 + }, + { + "auxiliary_loss_clip": 0.01163028, + "auxiliary_loss_mlp": 0.01047338, + "balance_loss_clip": 1.05668783, + "balance_loss_mlp": 1.02773952, + "epoch": 0.16107019389748986, + "flos": 34676707852800.0, + "grad_norm": 1.9753910458066775, + "language_loss": 0.7091974, + "learning_rate": 3.822478658490228e-06, + "loss": 0.73130107, + "num_input_tokens_seen": 57998210, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.19604492, + "step": 2679, + "time_per_iteration": 2.605087995529175 + }, + { + "auxiliary_loss_clip": 0.01089359, + "auxiliary_loss_mlp": 0.01012471, + "balance_loss_clip": 1.05054939, + "balance_loss_mlp": 1.01045632, + "epoch": 0.16113031715015783, + "flos": 65713403260800.0, + "grad_norm": 0.7745068437912385, + "language_loss": 0.51784343, + "learning_rate": 3.822318213523154e-06, + "loss": 0.53886175, + "num_input_tokens_seen": 58059420, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 0.02017212, + "step": 2680, + "time_per_iteration": 4.48240852355957 + }, + { + "auxiliary_loss_clip": 0.01155893, + "auxiliary_loss_mlp": 0.01045674, + "balance_loss_clip": 1.04939318, + "balance_loss_mlp": 1.02496696, + "epoch": 0.1611904404028258, + "flos": 20810413353600.0, + "grad_norm": 1.6495834624866088, + "language_loss": 0.80761099, + "learning_rate": 3.8221576994535925e-06, + "loss": 0.82962668, + "num_input_tokens_seen": 58078370, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.20715332, + "step": 2681, + "time_per_iteration": 2.477506637573242 + }, + { + "auxiliary_loss_clip": 0.01158345, + "auxiliary_loss_mlp": 0.01052301, + "balance_loss_clip": 1.05656791, + "balance_loss_mlp": 1.03394294, + "epoch": 0.16125056365549376, + "flos": 27013335062400.0, + "grad_norm": 1.8468066299428485, + "language_loss": 0.69152653, + "learning_rate": 3.821997116287627e-06, + "loss": 0.71363294, + "num_input_tokens_seen": 58097395, + "router_z_loss_clip": 1.01660156, + "router_z_loss_mlp": 0.18371582, + "step": 2682, + "time_per_iteration": 2.5239200592041016 + }, + { + "auxiliary_loss_clip": 0.01166474, + "auxiliary_loss_mlp": 0.0105903, + "balance_loss_clip": 1.05982876, + "balance_loss_mlp": 1.03691721, + "epoch": 0.16131068690816172, + "flos": 19276524915840.0, + "grad_norm": 1.8161858214528779, + "language_loss": 0.87488282, + "learning_rate": 3.821836464031348e-06, + "loss": 0.89713788, + "num_input_tokens_seen": 58115630, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.22094727, + "step": 2683, + "time_per_iteration": 2.4209342002868652 + }, + { + "auxiliary_loss_clip": 0.01167742, + "auxiliary_loss_mlp": 0.01065152, + "balance_loss_clip": 1.05945146, + "balance_loss_mlp": 1.04270458, + "epoch": 0.16137081016082971, + "flos": 35337931367040.0, + "grad_norm": 1.6712425795271968, + "language_loss": 0.74370748, + "learning_rate": 3.821675742690849e-06, + "loss": 0.76603639, + "num_input_tokens_seen": 58138655, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.22460938, + "step": 2684, + "time_per_iteration": 2.598288059234619 + }, + { + "auxiliary_loss_clip": 0.01179289, + "auxiliary_loss_mlp": 0.01044921, + "balance_loss_clip": 1.07037663, + "balance_loss_mlp": 1.02487016, + "epoch": 0.16143093341349768, + "flos": 34235257703040.0, + "grad_norm": 2.445710787633824, + "language_loss": 0.70508039, + "learning_rate": 3.821514952272223e-06, + "loss": 0.72732246, + "num_input_tokens_seen": 58157440, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.20031738, + "step": 2685, + "time_per_iteration": 2.549142599105835 + }, + { + "auxiliary_loss_clip": 0.01158768, + "auxiliary_loss_mlp": 0.01054718, + "balance_loss_clip": 1.05366158, + "balance_loss_mlp": 1.03504801, + "epoch": 0.16149105666616564, + "flos": 27999262546560.0, + "grad_norm": 7.731820108524943, + "language_loss": 0.71588862, + "learning_rate": 3.821354092781567e-06, + "loss": 0.73802346, + "num_input_tokens_seen": 58176660, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.19665527, + "step": 2686, + "time_per_iteration": 2.487992525100708 + }, + { + "auxiliary_loss_clip": 0.01167122, + "auxiliary_loss_mlp": 0.01049519, + "balance_loss_clip": 1.06114447, + "balance_loss_mlp": 1.03019547, + "epoch": 0.1615511799188336, + "flos": 19422214479360.0, + "grad_norm": 2.037888280958594, + "language_loss": 0.81599087, + "learning_rate": 3.821193164224981e-06, + "loss": 0.8381573, + "num_input_tokens_seen": 58195085, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.1932373, + "step": 2687, + "time_per_iteration": 2.433211326599121 + }, + { + "auxiliary_loss_clip": 0.01164406, + "auxiliary_loss_mlp": 0.01053392, + "balance_loss_clip": 1.05037093, + "balance_loss_mlp": 1.03114748, + "epoch": 0.16161130317150157, + "flos": 22854915578880.0, + "grad_norm": 1.7003693893726057, + "language_loss": 0.71851468, + "learning_rate": 3.821032166608568e-06, + "loss": 0.74069262, + "num_input_tokens_seen": 58213540, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.22241211, + "step": 2688, + "time_per_iteration": 2.479790687561035 + }, + { + "auxiliary_loss_clip": 0.01167527, + "auxiliary_loss_mlp": 0.01047032, + "balance_loss_clip": 1.06097317, + "balance_loss_mlp": 1.02887607, + "epoch": 0.16167142642416954, + "flos": 26110577520000.0, + "grad_norm": 1.7591717555128823, + "language_loss": 0.75922728, + "learning_rate": 3.8208710999384325e-06, + "loss": 0.7813729, + "num_input_tokens_seen": 58236995, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.18151855, + "step": 2689, + "time_per_iteration": 2.602839231491089 + }, + { + "auxiliary_loss_clip": 0.01164633, + "auxiliary_loss_mlp": 0.01048562, + "balance_loss_clip": 1.06032896, + "balance_loss_mlp": 1.02833247, + "epoch": 0.1617315496768375, + "flos": 22779646629120.0, + "grad_norm": 2.322564422721349, + "language_loss": 0.87576675, + "learning_rate": 3.820709964220683e-06, + "loss": 0.89789873, + "num_input_tokens_seen": 58257230, + "router_z_loss_clip": 1.04394531, + "router_z_loss_mlp": 0.20214844, + "step": 2690, + "time_per_iteration": 2.5941624641418457 + }, + { + "auxiliary_loss_clip": 0.01162444, + "auxiliary_loss_mlp": 0.0104307, + "balance_loss_clip": 1.05850673, + "balance_loss_mlp": 1.02548647, + "epoch": 0.1617916729295055, + "flos": 22017299351040.0, + "grad_norm": 1.594075450043217, + "language_loss": 0.88255197, + "learning_rate": 3.8205487594614284e-06, + "loss": 0.90460718, + "num_input_tokens_seen": 58277080, + "router_z_loss_clip": 1.04052734, + "router_z_loss_mlp": 0.17590332, + "step": 2691, + "time_per_iteration": 2.4816362857818604 + }, + { + "auxiliary_loss_clip": 0.01165699, + "auxiliary_loss_mlp": 0.01053334, + "balance_loss_clip": 1.05342102, + "balance_loss_mlp": 1.03173327, + "epoch": 0.16185179618217346, + "flos": 23438248450560.0, + "grad_norm": 2.483381977049743, + "language_loss": 0.82103628, + "learning_rate": 3.820387485666784e-06, + "loss": 0.84322661, + "num_input_tokens_seen": 58294815, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.21606445, + "step": 2692, + "time_per_iteration": 2.4898509979248047 + }, + { + "auxiliary_loss_clip": 0.01175324, + "auxiliary_loss_mlp": 0.01048503, + "balance_loss_clip": 1.06187844, + "balance_loss_mlp": 1.02820218, + "epoch": 0.16191191943484143, + "flos": 25666110627840.0, + "grad_norm": 2.232185564875728, + "language_loss": 0.81498134, + "learning_rate": 3.820226142842862e-06, + "loss": 0.8372196, + "num_input_tokens_seen": 58313215, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.203125, + "step": 2693, + "time_per_iteration": 2.4458811283111572 + }, + { + "auxiliary_loss_clip": 0.01157496, + "auxiliary_loss_mlp": 0.01053202, + "balance_loss_clip": 1.05504525, + "balance_loss_mlp": 1.03517711, + "epoch": 0.1619720426875094, + "flos": 23477355383040.0, + "grad_norm": 1.536393738615454, + "language_loss": 0.84046924, + "learning_rate": 3.820064730995783e-06, + "loss": 0.86257619, + "num_input_tokens_seen": 58333215, + "router_z_loss_clip": 1.02539062, + "router_z_loss_mlp": 0.18005371, + "step": 2694, + "time_per_iteration": 2.4639880657196045 + }, + { + "auxiliary_loss_clip": 0.01156851, + "auxiliary_loss_mlp": 0.01053127, + "balance_loss_clip": 1.05017328, + "balance_loss_mlp": 1.03304052, + "epoch": 0.16203216594017736, + "flos": 24133658734080.0, + "grad_norm": 1.9775988498920285, + "language_loss": 0.69656473, + "learning_rate": 3.819903250131667e-06, + "loss": 0.71866453, + "num_input_tokens_seen": 58351160, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.20092773, + "step": 2695, + "time_per_iteration": 2.52994441986084 + }, + { + "auxiliary_loss_clip": 0.01164768, + "auxiliary_loss_mlp": 0.01055629, + "balance_loss_clip": 1.05641234, + "balance_loss_mlp": 1.03552985, + "epoch": 0.16209228919284532, + "flos": 22340889999360.0, + "grad_norm": 2.111612090825439, + "language_loss": 0.82715344, + "learning_rate": 3.819741700256637e-06, + "loss": 0.84935749, + "num_input_tokens_seen": 58368505, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.2010498, + "step": 2696, + "time_per_iteration": 2.4553117752075195 + }, + { + "auxiliary_loss_clip": 0.01166499, + "auxiliary_loss_mlp": 0.01052328, + "balance_loss_clip": 1.05450141, + "balance_loss_mlp": 1.03114462, + "epoch": 0.1621524124455133, + "flos": 15815131827840.0, + "grad_norm": 5.269832521880821, + "language_loss": 0.88706756, + "learning_rate": 3.8195800813768194e-06, + "loss": 0.90925586, + "num_input_tokens_seen": 58385085, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.21166992, + "step": 2697, + "time_per_iteration": 2.4279870986938477 + }, + { + "auxiliary_loss_clip": 0.01159222, + "auxiliary_loss_mlp": 0.01046812, + "balance_loss_clip": 1.05770779, + "balance_loss_mlp": 1.02889442, + "epoch": 0.16221253569818128, + "flos": 30186688988160.0, + "grad_norm": 1.4464211174628216, + "language_loss": 0.80873907, + "learning_rate": 3.819418393498343e-06, + "loss": 0.8307994, + "num_input_tokens_seen": 58406985, + "router_z_loss_clip": 1.01464844, + "router_z_loss_mlp": 0.17895508, + "step": 2698, + "time_per_iteration": 2.533766746520996 + }, + { + "auxiliary_loss_clip": 0.01159085, + "auxiliary_loss_mlp": 0.01049562, + "balance_loss_clip": 1.05588841, + "balance_loss_mlp": 1.03014302, + "epoch": 0.16227265895084925, + "flos": 24605991601920.0, + "grad_norm": 1.6969690650032152, + "language_loss": 0.77652544, + "learning_rate": 3.819256636627339e-06, + "loss": 0.79861188, + "num_input_tokens_seen": 58426205, + "router_z_loss_clip": 1.03271484, + "router_z_loss_mlp": 0.19421387, + "step": 2699, + "time_per_iteration": 2.4577178955078125 + }, + { + "auxiliary_loss_clip": 0.01161692, + "auxiliary_loss_mlp": 0.01043979, + "balance_loss_clip": 1.05637002, + "balance_loss_mlp": 1.02600193, + "epoch": 0.1623327822035172, + "flos": 19573326996480.0, + "grad_norm": 2.2158784254263493, + "language_loss": 0.85930383, + "learning_rate": 3.81909481076994e-06, + "loss": 0.88136053, + "num_input_tokens_seen": 58443830, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.17980957, + "step": 2700, + "time_per_iteration": 2.4761338233947754 + }, + { + "auxiliary_loss_clip": 0.01156818, + "auxiliary_loss_mlp": 0.01051998, + "balance_loss_clip": 1.05320668, + "balance_loss_mlp": 1.03141081, + "epoch": 0.16239290545618518, + "flos": 26468462678400.0, + "grad_norm": 1.7059917461646634, + "language_loss": 0.8064636, + "learning_rate": 3.818932915932284e-06, + "loss": 0.82855177, + "num_input_tokens_seen": 58464405, + "router_z_loss_clip": 1.03564453, + "router_z_loss_mlp": 0.20568848, + "step": 2701, + "time_per_iteration": 2.4690334796905518 + }, + { + "auxiliary_loss_clip": 0.01161311, + "auxiliary_loss_mlp": 0.010461, + "balance_loss_clip": 1.05723262, + "balance_loss_mlp": 1.02719307, + "epoch": 0.16245302870885314, + "flos": 15851940289920.0, + "grad_norm": 1.7484208694955383, + "language_loss": 0.73141259, + "learning_rate": 3.818770952120511e-06, + "loss": 0.75348663, + "num_input_tokens_seen": 58483295, + "router_z_loss_clip": 1.04101562, + "router_z_loss_mlp": 0.18920898, + "step": 2702, + "time_per_iteration": 2.4373395442962646 + }, + { + "auxiliary_loss_clip": 0.01163346, + "auxiliary_loss_mlp": 0.01052573, + "balance_loss_clip": 1.0545733, + "balance_loss_mlp": 1.03215241, + "epoch": 0.1625131519615211, + "flos": 14756521173120.0, + "grad_norm": 2.290413091948648, + "language_loss": 0.72753012, + "learning_rate": 3.81860891934076e-06, + "loss": 0.74968928, + "num_input_tokens_seen": 58501205, + "router_z_loss_clip": 1.08740234, + "router_z_loss_mlp": 0.20422363, + "step": 2703, + "time_per_iteration": 2.417246103286743 + }, + { + "auxiliary_loss_clip": 0.01160493, + "auxiliary_loss_mlp": 0.01048992, + "balance_loss_clip": 1.05243814, + "balance_loss_mlp": 1.02854753, + "epoch": 0.1625732752141891, + "flos": 28220508368640.0, + "grad_norm": 2.162269200874188, + "language_loss": 0.70681274, + "learning_rate": 3.818446817599176e-06, + "loss": 0.72890759, + "num_input_tokens_seen": 58522315, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.2043457, + "step": 2704, + "time_per_iteration": 2.5255138874053955 + }, + { + "auxiliary_loss_clip": 0.01111209, + "auxiliary_loss_mlp": 0.01010955, + "balance_loss_clip": 1.06879735, + "balance_loss_mlp": 1.00881243, + "epoch": 0.16263339846685707, + "flos": 67327947688320.0, + "grad_norm": 0.7789231700065744, + "language_loss": 0.53370368, + "learning_rate": 3.818284646901907e-06, + "loss": 0.55492532, + "num_input_tokens_seen": 58586695, + "router_z_loss_clip": 0.42333984, + "router_z_loss_mlp": 0.02145386, + "step": 2705, + "time_per_iteration": 3.097663402557373 + }, + { + "auxiliary_loss_clip": 0.01166395, + "auxiliary_loss_mlp": 0.01048788, + "balance_loss_clip": 1.05908585, + "balance_loss_mlp": 1.02963161, + "epoch": 0.16269352171952503, + "flos": 14319165173760.0, + "grad_norm": 2.450126036340926, + "language_loss": 0.75504231, + "learning_rate": 3.818122407255102e-06, + "loss": 0.77719414, + "num_input_tokens_seen": 58602435, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.19140625, + "step": 2706, + "time_per_iteration": 2.406470537185669 + }, + { + "auxiliary_loss_clip": 0.01160222, + "auxiliary_loss_mlp": 0.01048748, + "balance_loss_clip": 1.05361843, + "balance_loss_mlp": 1.02879214, + "epoch": 0.162753644972193, + "flos": 28361205941760.0, + "grad_norm": 1.8855199580630972, + "language_loss": 0.72424728, + "learning_rate": 3.817960098664914e-06, + "loss": 0.74633694, + "num_input_tokens_seen": 58621275, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.19946289, + "step": 2707, + "time_per_iteration": 2.4825315475463867 + }, + { + "auxiliary_loss_clip": 0.01165744, + "auxiliary_loss_mlp": 0.0105207, + "balance_loss_clip": 1.05834019, + "balance_loss_mlp": 1.0331043, + "epoch": 0.16281376822486096, + "flos": 19937856170880.0, + "grad_norm": 18.001602525340473, + "language_loss": 0.84109062, + "learning_rate": 3.817797721137495e-06, + "loss": 0.86326879, + "num_input_tokens_seen": 58637550, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.1895752, + "step": 2708, + "time_per_iteration": 2.4721126556396484 + }, + { + "auxiliary_loss_clip": 0.01163186, + "auxiliary_loss_mlp": 0.01047508, + "balance_loss_clip": 1.05371058, + "balance_loss_mlp": 1.02652717, + "epoch": 0.16287389147752893, + "flos": 21251719848960.0, + "grad_norm": 2.419252388419528, + "language_loss": 0.86394417, + "learning_rate": 3.817635274679006e-06, + "loss": 0.88605106, + "num_input_tokens_seen": 58654135, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.2097168, + "step": 2709, + "time_per_iteration": 2.4996447563171387 + }, + { + "auxiliary_loss_clip": 0.01158558, + "auxiliary_loss_mlp": 0.01052088, + "balance_loss_clip": 1.05347061, + "balance_loss_mlp": 1.03315759, + "epoch": 0.1629340147301969, + "flos": 19244672530560.0, + "grad_norm": 2.0929297148000927, + "language_loss": 0.91548902, + "learning_rate": 3.817472759295605e-06, + "loss": 0.93759549, + "num_input_tokens_seen": 58674320, + "router_z_loss_clip": 1.05126953, + "router_z_loss_mlp": 0.18933105, + "step": 2710, + "time_per_iteration": 2.466003179550171 + }, + { + "auxiliary_loss_clip": 0.01162991, + "auxiliary_loss_mlp": 0.0105507, + "balance_loss_clip": 1.05858755, + "balance_loss_mlp": 1.0355916, + "epoch": 0.16299413798286488, + "flos": 21249816428160.0, + "grad_norm": 2.113065606125459, + "language_loss": 0.81526333, + "learning_rate": 3.817310174993453e-06, + "loss": 0.83744395, + "num_input_tokens_seen": 58691000, + "router_z_loss_clip": 1.04296875, + "router_z_loss_mlp": 0.19482422, + "step": 2711, + "time_per_iteration": 2.4034674167633057 + }, + { + "auxiliary_loss_clip": 0.01167291, + "auxiliary_loss_mlp": 0.01046069, + "balance_loss_clip": 1.0564059, + "balance_loss_mlp": 1.02611303, + "epoch": 0.16305426123553285, + "flos": 18770579896320.0, + "grad_norm": 2.3668673908749165, + "language_loss": 0.81619334, + "learning_rate": 3.817147521778719e-06, + "loss": 0.83832693, + "num_input_tokens_seen": 58710230, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.19946289, + "step": 2712, + "time_per_iteration": 2.490447759628296 + }, + { + "auxiliary_loss_clip": 0.01171956, + "auxiliary_loss_mlp": 0.01057141, + "balance_loss_clip": 1.0620414, + "balance_loss_mlp": 1.03757906, + "epoch": 0.16311438448820081, + "flos": 22087648137600.0, + "grad_norm": 1.9309363801111161, + "language_loss": 0.76957178, + "learning_rate": 3.816984799657568e-06, + "loss": 0.79186273, + "num_input_tokens_seen": 58728610, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.19580078, + "step": 2713, + "time_per_iteration": 2.4669857025146484 + }, + { + "auxiliary_loss_clip": 0.01163531, + "auxiliary_loss_mlp": 0.01053901, + "balance_loss_clip": 1.06196761, + "balance_loss_mlp": 1.03450537, + "epoch": 0.16317450774086878, + "flos": 16467700164480.0, + "grad_norm": 2.11142947301526, + "language_loss": 0.78935039, + "learning_rate": 3.8168220086361715e-06, + "loss": 0.81152475, + "num_input_tokens_seen": 58744385, + "router_z_loss_clip": 1.01708984, + "router_z_loss_mlp": 0.19396973, + "step": 2714, + "time_per_iteration": 3.9798851013183594 + }, + { + "auxiliary_loss_clip": 0.01160065, + "auxiliary_loss_mlp": 0.01054041, + "balance_loss_clip": 1.05454993, + "balance_loss_mlp": 1.03587306, + "epoch": 0.16323463099353674, + "flos": 24352929308160.0, + "grad_norm": 1.6028435477058618, + "language_loss": 0.78260696, + "learning_rate": 3.816659148720702e-06, + "loss": 0.804748, + "num_input_tokens_seen": 58763905, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.1817627, + "step": 2715, + "time_per_iteration": 2.47440505027771 + }, + { + "auxiliary_loss_clip": 0.01161236, + "auxiliary_loss_mlp": 0.01047393, + "balance_loss_clip": 1.05447936, + "balance_loss_mlp": 1.02916646, + "epoch": 0.1632947542462047, + "flos": 24900782520960.0, + "grad_norm": 3.2282721535385295, + "language_loss": 0.81469852, + "learning_rate": 3.816496219917336e-06, + "loss": 0.83678484, + "num_input_tokens_seen": 58785580, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.18225098, + "step": 2716, + "time_per_iteration": 2.496508836746216 + }, + { + "auxiliary_loss_clip": 0.01163561, + "auxiliary_loss_mlp": 0.0105463, + "balance_loss_clip": 1.05825794, + "balance_loss_mlp": 1.03690326, + "epoch": 0.1633548774988727, + "flos": 24900279730560.0, + "grad_norm": 2.363548125096083, + "language_loss": 0.86462641, + "learning_rate": 3.816333222232251e-06, + "loss": 0.88680828, + "num_input_tokens_seen": 58806075, + "router_z_loss_clip": 1.05322266, + "router_z_loss_mlp": 0.17724609, + "step": 2717, + "time_per_iteration": 2.5083093643188477 + }, + { + "auxiliary_loss_clip": 0.01163453, + "auxiliary_loss_mlp": 0.01045063, + "balance_loss_clip": 1.05868959, + "balance_loss_mlp": 1.02647829, + "epoch": 0.16341500075154067, + "flos": 30441798357120.0, + "grad_norm": 1.9720340129910703, + "language_loss": 0.76490068, + "learning_rate": 3.816170155671629e-06, + "loss": 0.78698575, + "num_input_tokens_seen": 58827405, + "router_z_loss_clip": 1.04736328, + "router_z_loss_mlp": 0.18579102, + "step": 2718, + "time_per_iteration": 2.529970169067383 + }, + { + "auxiliary_loss_clip": 0.01163599, + "auxiliary_loss_mlp": 0.01044198, + "balance_loss_clip": 1.0559752, + "balance_loss_mlp": 1.02693629, + "epoch": 0.16347512400420863, + "flos": 22784530878720.0, + "grad_norm": 2.072339139813738, + "language_loss": 0.73830378, + "learning_rate": 3.816007020241652e-06, + "loss": 0.7603817, + "num_input_tokens_seen": 58847205, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.17272949, + "step": 2719, + "time_per_iteration": 3.889589548110962 + }, + { + "auxiliary_loss_clip": 0.01166164, + "auxiliary_loss_mlp": 0.01042599, + "balance_loss_clip": 1.06003714, + "balance_loss_mlp": 1.02472997, + "epoch": 0.1635352472568766, + "flos": 22633274707200.0, + "grad_norm": 1.9598844988501407, + "language_loss": 0.72066748, + "learning_rate": 3.815843815948507e-06, + "loss": 0.74275512, + "num_input_tokens_seen": 58866865, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.17871094, + "step": 2720, + "time_per_iteration": 3.8531432151794434 + }, + { + "auxiliary_loss_clip": 0.01176834, + "auxiliary_loss_mlp": 0.01043107, + "balance_loss_clip": 1.07050586, + "balance_loss_mlp": 1.02367592, + "epoch": 0.16359537050954456, + "flos": 15522998515200.0, + "grad_norm": 2.5640714790820507, + "language_loss": 0.74739522, + "learning_rate": 3.8156805427983824e-06, + "loss": 0.76959467, + "num_input_tokens_seen": 58885200, + "router_z_loss_clip": 1.06396484, + "router_z_loss_mlp": 0.1940918, + "step": 2721, + "time_per_iteration": 2.4830422401428223 + }, + { + "auxiliary_loss_clip": 0.01175354, + "auxiliary_loss_mlp": 0.01046795, + "balance_loss_clip": 1.06372595, + "balance_loss_mlp": 1.02657723, + "epoch": 0.16365549376221253, + "flos": 22090162089600.0, + "grad_norm": 1.875162821328467, + "language_loss": 0.79613519, + "learning_rate": 3.8155172007974695e-06, + "loss": 0.81835675, + "num_input_tokens_seen": 58906385, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.20214844, + "step": 2722, + "time_per_iteration": 2.6040170192718506 + }, + { + "auxiliary_loss_clip": 0.01167707, + "auxiliary_loss_mlp": 0.01049736, + "balance_loss_clip": 1.05649436, + "balance_loss_mlp": 1.02776623, + "epoch": 0.1637156170148805, + "flos": 24060400945920.0, + "grad_norm": 2.686013807839059, + "language_loss": 0.85031402, + "learning_rate": 3.8153537899519624e-06, + "loss": 0.87248838, + "num_input_tokens_seen": 58925040, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.21948242, + "step": 2723, + "time_per_iteration": 2.589461326599121 + }, + { + "auxiliary_loss_clip": 0.01158783, + "auxiliary_loss_mlp": 0.01043171, + "balance_loss_clip": 1.05483055, + "balance_loss_mlp": 1.02362049, + "epoch": 0.1637757402675485, + "flos": 26685362954880.0, + "grad_norm": 2.2045076627620097, + "language_loss": 0.70957452, + "learning_rate": 3.815190310268058e-06, + "loss": 0.73159409, + "num_input_tokens_seen": 58944790, + "router_z_loss_clip": 1.04003906, + "router_z_loss_mlp": 0.19567871, + "step": 2724, + "time_per_iteration": 3.934870719909668 + }, + { + "auxiliary_loss_clip": 0.01171275, + "auxiliary_loss_mlp": 0.01046235, + "balance_loss_clip": 1.06779563, + "balance_loss_mlp": 1.02869916, + "epoch": 0.16383586352021645, + "flos": 16106941918080.0, + "grad_norm": 2.7172534498865857, + "language_loss": 0.70765054, + "learning_rate": 3.815026761751955e-06, + "loss": 0.72982562, + "num_input_tokens_seen": 58962500, + "router_z_loss_clip": 1.03515625, + "router_z_loss_mlp": 0.17529297, + "step": 2725, + "time_per_iteration": 2.431617021560669 + }, + { + "auxiliary_loss_clip": 0.01165455, + "auxiliary_loss_mlp": 0.01047206, + "balance_loss_clip": 1.06399608, + "balance_loss_mlp": 1.02934837, + "epoch": 0.16389598677288442, + "flos": 19165991788800.0, + "grad_norm": 3.319255539296229, + "language_loss": 0.88352859, + "learning_rate": 3.814863144409855e-06, + "loss": 0.90565515, + "num_input_tokens_seen": 58980355, + "router_z_loss_clip": 1.01367188, + "router_z_loss_mlp": 0.17871094, + "step": 2726, + "time_per_iteration": 2.498790740966797 + }, + { + "auxiliary_loss_clip": 0.01166762, + "auxiliary_loss_mlp": 0.01051184, + "balance_loss_clip": 1.05850613, + "balance_loss_mlp": 1.03226566, + "epoch": 0.16395611002555238, + "flos": 21507008785920.0, + "grad_norm": 1.842216743482477, + "language_loss": 0.74289072, + "learning_rate": 3.814699458247963e-06, + "loss": 0.7650702, + "num_input_tokens_seen": 58999505, + "router_z_loss_clip": 1.08349609, + "router_z_loss_mlp": 0.18920898, + "step": 2727, + "time_per_iteration": 2.4639365673065186 + }, + { + "auxiliary_loss_clip": 0.01161894, + "auxiliary_loss_mlp": 0.01053205, + "balance_loss_clip": 1.05955386, + "balance_loss_mlp": 1.0360868, + "epoch": 0.16401623327822035, + "flos": 21470918595840.0, + "grad_norm": 1.620540686549532, + "language_loss": 0.82403076, + "learning_rate": 3.8145357032724855e-06, + "loss": 0.84618175, + "num_input_tokens_seen": 59017930, + "router_z_loss_clip": 1.02246094, + "router_z_loss_mlp": 0.17126465, + "step": 2728, + "time_per_iteration": 2.503378391265869 + }, + { + "auxiliary_loss_clip": 0.01178602, + "auxiliary_loss_mlp": 0.01050053, + "balance_loss_clip": 1.06898701, + "balance_loss_mlp": 1.03035927, + "epoch": 0.1640763565308883, + "flos": 13626232928640.0, + "grad_norm": 2.259284010664894, + "language_loss": 0.85160911, + "learning_rate": 3.814371879489633e-06, + "loss": 0.87389565, + "num_input_tokens_seen": 59035130, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.19689941, + "step": 2729, + "time_per_iteration": 2.4196815490722656 + }, + { + "auxiliary_loss_clip": 0.01163249, + "auxiliary_loss_mlp": 0.01042114, + "balance_loss_clip": 1.05720162, + "balance_loss_mlp": 1.02472103, + "epoch": 0.16413647978355628, + "flos": 15451464579840.0, + "grad_norm": 3.0744897165779657, + "language_loss": 0.72848964, + "learning_rate": 3.814207986905616e-06, + "loss": 0.7505433, + "num_input_tokens_seen": 59053080, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.1739502, + "step": 2730, + "time_per_iteration": 2.463578939437866 + }, + { + "auxiliary_loss_clip": 0.0116488, + "auxiliary_loss_mlp": 0.01055383, + "balance_loss_clip": 1.0552876, + "balance_loss_mlp": 1.03253055, + "epoch": 0.16419660303622427, + "flos": 45878682015360.0, + "grad_norm": 1.5712909555921404, + "language_loss": 0.74336547, + "learning_rate": 3.814044025526651e-06, + "loss": 0.76556814, + "num_input_tokens_seen": 59075610, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.2286377, + "step": 2731, + "time_per_iteration": 2.706688165664673 + }, + { + "auxiliary_loss_clip": 0.01172737, + "auxiliary_loss_mlp": 0.01049948, + "balance_loss_clip": 1.06176305, + "balance_loss_mlp": 1.02986121, + "epoch": 0.16425672628889224, + "flos": 18952826526720.0, + "grad_norm": 2.386431108524353, + "language_loss": 0.79382843, + "learning_rate": 3.8138799953589548e-06, + "loss": 0.8160553, + "num_input_tokens_seen": 59094555, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.20092773, + "step": 2732, + "time_per_iteration": 2.4924442768096924 + }, + { + "auxiliary_loss_clip": 0.01166087, + "auxiliary_loss_mlp": 0.01049119, + "balance_loss_clip": 1.05847466, + "balance_loss_mlp": 1.02960443, + "epoch": 0.1643168495415602, + "flos": 24312996362880.0, + "grad_norm": 2.0801623164107466, + "language_loss": 0.69440627, + "learning_rate": 3.8137158964087473e-06, + "loss": 0.71655834, + "num_input_tokens_seen": 59113515, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.1953125, + "step": 2733, + "time_per_iteration": 2.56801176071167 + }, + { + "auxiliary_loss_clip": 0.01167157, + "auxiliary_loss_mlp": 0.01053072, + "balance_loss_clip": 1.05897963, + "balance_loss_mlp": 1.03044605, + "epoch": 0.16437697279422817, + "flos": 26428421992320.0, + "grad_norm": 1.8549498976593002, + "language_loss": 0.80810535, + "learning_rate": 3.8135517286822508e-06, + "loss": 0.8303076, + "num_input_tokens_seen": 59133275, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.22631836, + "step": 2734, + "time_per_iteration": 2.6352291107177734 + }, + { + "auxiliary_loss_clip": 0.01176054, + "auxiliary_loss_mlp": 0.01053019, + "balance_loss_clip": 1.06745601, + "balance_loss_mlp": 1.0336951, + "epoch": 0.16443709604689613, + "flos": 34532239351680.0, + "grad_norm": 2.9408065230891207, + "language_loss": 0.82244718, + "learning_rate": 3.8133874921856914e-06, + "loss": 0.84473795, + "num_input_tokens_seen": 59154095, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.19335938, + "step": 2735, + "time_per_iteration": 2.5335376262664795 + }, + { + "auxiliary_loss_clip": 0.01164636, + "auxiliary_loss_mlp": 0.01042525, + "balance_loss_clip": 1.05909169, + "balance_loss_mlp": 1.02456009, + "epoch": 0.1644972192995641, + "flos": 23258048895360.0, + "grad_norm": 3.222528146950043, + "language_loss": 0.78538704, + "learning_rate": 3.813223186925296e-06, + "loss": 0.80745864, + "num_input_tokens_seen": 59173795, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.1796875, + "step": 2736, + "time_per_iteration": 2.475353956222534 + }, + { + "auxiliary_loss_clip": 0.01165383, + "auxiliary_loss_mlp": 0.01049895, + "balance_loss_clip": 1.05843258, + "balance_loss_mlp": 1.03078532, + "epoch": 0.1645573425522321, + "flos": 26979543342720.0, + "grad_norm": 1.6769689444876061, + "language_loss": 0.81352079, + "learning_rate": 3.8130588129072964e-06, + "loss": 0.83567357, + "num_input_tokens_seen": 59191610, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.19104004, + "step": 2737, + "time_per_iteration": 2.4683804512023926 + }, + { + "auxiliary_loss_clip": 0.0116236, + "auxiliary_loss_mlp": 0.01057502, + "balance_loss_clip": 1.0534054, + "balance_loss_mlp": 1.03717637, + "epoch": 0.16461746580490005, + "flos": 28731768600960.0, + "grad_norm": 1.8571777722105036, + "language_loss": 0.87055415, + "learning_rate": 3.8128943701379246e-06, + "loss": 0.89275277, + "num_input_tokens_seen": 59213000, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.20324707, + "step": 2738, + "time_per_iteration": 2.5484373569488525 + }, + { + "auxiliary_loss_clip": 0.01163062, + "auxiliary_loss_mlp": 0.0105315, + "balance_loss_clip": 1.05540156, + "balance_loss_mlp": 1.03437448, + "epoch": 0.16467758905756802, + "flos": 24930156867840.0, + "grad_norm": 1.6552151757933518, + "language_loss": 0.7224977, + "learning_rate": 3.8127298586234167e-06, + "loss": 0.74465984, + "num_input_tokens_seen": 59232340, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.18774414, + "step": 2739, + "time_per_iteration": 2.5031509399414062 + }, + { + "auxiliary_loss_clip": 0.01168701, + "auxiliary_loss_mlp": 0.01049034, + "balance_loss_clip": 1.0616523, + "balance_loss_mlp": 1.02935266, + "epoch": 0.16473771231023598, + "flos": 24826519152000.0, + "grad_norm": 1.7800239398516573, + "language_loss": 0.81654382, + "learning_rate": 3.8125652783700104e-06, + "loss": 0.83872116, + "num_input_tokens_seen": 59253950, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.19677734, + "step": 2740, + "time_per_iteration": 2.5101358890533447 + }, + { + "auxiliary_loss_clip": 0.01170292, + "auxiliary_loss_mlp": 0.01051576, + "balance_loss_clip": 1.05897975, + "balance_loss_mlp": 1.02964139, + "epoch": 0.16479783556290395, + "flos": 39896072375040.0, + "grad_norm": 2.5403949766630722, + "language_loss": 0.69618666, + "learning_rate": 3.8124006293839475e-06, + "loss": 0.71840531, + "num_input_tokens_seen": 59275545, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.21960449, + "step": 2741, + "time_per_iteration": 2.6198341846466064 + }, + { + "auxiliary_loss_clip": 0.01169335, + "auxiliary_loss_mlp": 0.01045862, + "balance_loss_clip": 1.06177068, + "balance_loss_mlp": 1.02714634, + "epoch": 0.16485795881557191, + "flos": 19897061299200.0, + "grad_norm": 1.8998020950167482, + "language_loss": 0.79839849, + "learning_rate": 3.812235911671472e-06, + "loss": 0.8205505, + "num_input_tokens_seen": 59293480, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.18725586, + "step": 2742, + "time_per_iteration": 2.4722931385040283 + }, + { + "auxiliary_loss_clip": 0.01168614, + "auxiliary_loss_mlp": 0.01049748, + "balance_loss_clip": 1.06138122, + "balance_loss_mlp": 1.02948189, + "epoch": 0.16491808206823988, + "flos": 20556129997440.0, + "grad_norm": 2.0892257184517184, + "language_loss": 0.84864658, + "learning_rate": 3.8120711252388274e-06, + "loss": 0.87083018, + "num_input_tokens_seen": 59313435, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.20275879, + "step": 2743, + "time_per_iteration": 2.457597255706787 + }, + { + "auxiliary_loss_clip": 0.01156003, + "auxiliary_loss_mlp": 0.01049841, + "balance_loss_clip": 1.05229664, + "balance_loss_mlp": 1.02875257, + "epoch": 0.16497820532090787, + "flos": 23800802376960.0, + "grad_norm": 4.213523379167426, + "language_loss": 0.85657191, + "learning_rate": 3.811906270092265e-06, + "loss": 0.87863034, + "num_input_tokens_seen": 59331535, + "router_z_loss_clip": 1.03613281, + "router_z_loss_mlp": 0.2109375, + "step": 2744, + "time_per_iteration": 2.5273542404174805 + }, + { + "auxiliary_loss_clip": 0.01160018, + "auxiliary_loss_mlp": 0.01044931, + "balance_loss_clip": 1.05782497, + "balance_loss_mlp": 1.0270859, + "epoch": 0.16503832857357584, + "flos": 25482642935040.0, + "grad_norm": 1.8836799432212956, + "language_loss": 0.83008969, + "learning_rate": 3.811741346238036e-06, + "loss": 0.85213912, + "num_input_tokens_seen": 59350680, + "router_z_loss_clip": 1.02246094, + "router_z_loss_mlp": 0.17858887, + "step": 2745, + "time_per_iteration": 2.6091511249542236 + }, + { + "auxiliary_loss_clip": 0.01166637, + "auxiliary_loss_mlp": 0.01055759, + "balance_loss_clip": 1.05891967, + "balance_loss_mlp": 1.03698349, + "epoch": 0.1650984518262438, + "flos": 17676058619520.0, + "grad_norm": 1.93795589871227, + "language_loss": 0.77055001, + "learning_rate": 3.8115763536823923e-06, + "loss": 0.79277396, + "num_input_tokens_seen": 59367020, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.18774414, + "step": 2746, + "time_per_iteration": 2.5062880516052246 + }, + { + "auxiliary_loss_clip": 0.01166485, + "auxiliary_loss_mlp": 0.01050091, + "balance_loss_clip": 1.05768073, + "balance_loss_mlp": 1.02877665, + "epoch": 0.16515857507891177, + "flos": 18698327688960.0, + "grad_norm": 1.5391739023980837, + "language_loss": 0.8096422, + "learning_rate": 3.811411292431592e-06, + "loss": 0.83180797, + "num_input_tokens_seen": 59386075, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.21313477, + "step": 2747, + "time_per_iteration": 2.419409990310669 + }, + { + "auxiliary_loss_clip": 0.01170115, + "auxiliary_loss_mlp": 0.01046818, + "balance_loss_clip": 1.06246436, + "balance_loss_mlp": 1.02733898, + "epoch": 0.16521869833157973, + "flos": 15010481306880.0, + "grad_norm": 2.311893408503191, + "language_loss": 0.69345814, + "learning_rate": 3.8112461624918945e-06, + "loss": 0.71562743, + "num_input_tokens_seen": 59402690, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.19470215, + "step": 2748, + "time_per_iteration": 2.4218873977661133 + }, + { + "auxiliary_loss_clip": 0.01164813, + "auxiliary_loss_mlp": 0.01048301, + "balance_loss_clip": 1.05852091, + "balance_loss_mlp": 1.02971649, + "epoch": 0.1652788215842477, + "flos": 22121152548480.0, + "grad_norm": 1.9561834548072243, + "language_loss": 0.8775087, + "learning_rate": 3.811080963869561e-06, + "loss": 0.8996399, + "num_input_tokens_seen": 59421130, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.18579102, + "step": 2749, + "time_per_iteration": 2.4680137634277344 + }, + { + "auxiliary_loss_clip": 0.01166354, + "auxiliary_loss_mlp": 0.01058006, + "balance_loss_clip": 1.05638075, + "balance_loss_mlp": 1.03631043, + "epoch": 0.16533894483691566, + "flos": 18333080242560.0, + "grad_norm": 2.7806823198227804, + "language_loss": 0.79397804, + "learning_rate": 3.8109156965708557e-06, + "loss": 0.81622165, + "num_input_tokens_seen": 59438970, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.21691895, + "step": 2750, + "time_per_iteration": 2.4085826873779297 + }, + { + "auxiliary_loss_clip": 0.01175709, + "auxiliary_loss_mlp": 0.01045963, + "balance_loss_clip": 1.06566143, + "balance_loss_mlp": 1.02642477, + "epoch": 0.16539906808958366, + "flos": 22382115834240.0, + "grad_norm": 1.7805615765625016, + "language_loss": 0.94775975, + "learning_rate": 3.8107503606020455e-06, + "loss": 0.96997643, + "num_input_tokens_seen": 59458510, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.1953125, + "step": 2751, + "time_per_iteration": 2.445665121078491 + }, + { + "auxiliary_loss_clip": 0.0116297, + "auxiliary_loss_mlp": 0.01049754, + "balance_loss_clip": 1.05926251, + "balance_loss_mlp": 1.03096676, + "epoch": 0.16545919134225162, + "flos": 22711093522560.0, + "grad_norm": 2.3866176497397307, + "language_loss": 0.70913374, + "learning_rate": 3.8105849559693997e-06, + "loss": 0.73126101, + "num_input_tokens_seen": 59477110, + "router_z_loss_clip": 1.03759766, + "router_z_loss_mlp": 0.18774414, + "step": 2752, + "time_per_iteration": 2.46828556060791 + }, + { + "auxiliary_loss_clip": 0.01084709, + "auxiliary_loss_mlp": 0.01010804, + "balance_loss_clip": 1.04772258, + "balance_loss_mlp": 1.00799322, + "epoch": 0.1655193145949196, + "flos": 67802974076160.0, + "grad_norm": 0.7681572308127149, + "language_loss": 0.54122221, + "learning_rate": 3.810419482679192e-06, + "loss": 0.56217736, + "num_input_tokens_seen": 59541155, + "router_z_loss_clip": 0.36962891, + "router_z_loss_mlp": 0.02810669, + "step": 2753, + "time_per_iteration": 3.1383588314056396 + }, + { + "auxiliary_loss_clip": 0.01169291, + "auxiliary_loss_mlp": 0.01053166, + "balance_loss_clip": 1.05912709, + "balance_loss_mlp": 1.03269756, + "epoch": 0.16557943784758755, + "flos": 24280389792000.0, + "grad_norm": 2.4678673491095515, + "language_loss": 0.75330931, + "learning_rate": 3.8102539407376954e-06, + "loss": 0.7755338, + "num_input_tokens_seen": 59561155, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.20458984, + "step": 2754, + "time_per_iteration": 2.459768295288086 + }, + { + "auxiliary_loss_clip": 0.01181143, + "auxiliary_loss_mlp": 0.01060519, + "balance_loss_clip": 1.06571066, + "balance_loss_mlp": 1.03726077, + "epoch": 0.16563956110025552, + "flos": 20083617561600.0, + "grad_norm": 2.563683136363922, + "language_loss": 0.86832774, + "learning_rate": 3.810088330151188e-06, + "loss": 0.89074439, + "num_input_tokens_seen": 59580460, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.23254395, + "step": 2755, + "time_per_iteration": 2.425025463104248 + }, + { + "auxiliary_loss_clip": 0.01168328, + "auxiliary_loss_mlp": 0.01058514, + "balance_loss_clip": 1.05844617, + "balance_loss_mlp": 1.03785515, + "epoch": 0.16569968435292348, + "flos": 28034454896640.0, + "grad_norm": 2.0640312746611515, + "language_loss": 0.73568094, + "learning_rate": 3.80992265092595e-06, + "loss": 0.75794935, + "num_input_tokens_seen": 59600025, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.2064209, + "step": 2756, + "time_per_iteration": 2.5221757888793945 + }, + { + "auxiliary_loss_clip": 0.01170946, + "auxiliary_loss_mlp": 0.0105111, + "balance_loss_clip": 1.06564522, + "balance_loss_mlp": 1.03126216, + "epoch": 0.16575980760559147, + "flos": 26250233598720.0, + "grad_norm": 1.5689132352003519, + "language_loss": 0.75207615, + "learning_rate": 3.8097569030682636e-06, + "loss": 0.7742967, + "num_input_tokens_seen": 59620600, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.19848633, + "step": 2757, + "time_per_iteration": 2.472801446914673 + }, + { + "auxiliary_loss_clip": 0.011854, + "auxiliary_loss_mlp": 0.01053403, + "balance_loss_clip": 1.07455945, + "balance_loss_mlp": 1.03423393, + "epoch": 0.16581993085825944, + "flos": 26943955943040.0, + "grad_norm": 1.971762405501438, + "language_loss": 0.84567922, + "learning_rate": 3.8095910865844137e-06, + "loss": 0.86806726, + "num_input_tokens_seen": 59641385, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.19165039, + "step": 2758, + "time_per_iteration": 3.9923646450042725 + }, + { + "auxiliary_loss_clip": 0.01168401, + "auxiliary_loss_mlp": 0.0105394, + "balance_loss_clip": 1.06082392, + "balance_loss_mlp": 1.03520036, + "epoch": 0.1658800541109274, + "flos": 21653632103040.0, + "grad_norm": 2.1373066984290015, + "language_loss": 0.79272652, + "learning_rate": 3.809425201480689e-06, + "loss": 0.81494993, + "num_input_tokens_seen": 59659865, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.18737793, + "step": 2759, + "time_per_iteration": 2.4830634593963623 + }, + { + "auxiliary_loss_clip": 0.01163539, + "auxiliary_loss_mlp": 0.01045323, + "balance_loss_clip": 1.05698788, + "balance_loss_mlp": 1.02529585, + "epoch": 0.16594017736359537, + "flos": 16435488643200.0, + "grad_norm": 2.449304576368254, + "language_loss": 0.7486943, + "learning_rate": 3.8092592477633793e-06, + "loss": 0.77078289, + "num_input_tokens_seen": 59678780, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.20019531, + "step": 2760, + "time_per_iteration": 2.375859260559082 + }, + { + "auxiliary_loss_clip": 0.01177991, + "auxiliary_loss_mlp": 0.01046197, + "balance_loss_clip": 1.06751537, + "balance_loss_mlp": 1.0264318, + "epoch": 0.16600030061626334, + "flos": 22637297030400.0, + "grad_norm": 2.3884977232071813, + "language_loss": 0.73123986, + "learning_rate": 3.8090932254387774e-06, + "loss": 0.75348175, + "num_input_tokens_seen": 59698795, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.19750977, + "step": 2761, + "time_per_iteration": 3.8390395641326904 + }, + { + "auxiliary_loss_clip": 0.01169503, + "auxiliary_loss_mlp": 0.01054501, + "balance_loss_clip": 1.0589726, + "balance_loss_mlp": 1.03392553, + "epoch": 0.1660604238689313, + "flos": 26396569607040.0, + "grad_norm": 1.786095972133225, + "language_loss": 0.88654649, + "learning_rate": 3.8089271345131788e-06, + "loss": 0.90878654, + "num_input_tokens_seen": 59718795, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.20581055, + "step": 2762, + "time_per_iteration": 2.4688801765441895 + }, + { + "auxiliary_loss_clip": 0.01175339, + "auxiliary_loss_mlp": 0.01055492, + "balance_loss_clip": 1.06475508, + "balance_loss_mlp": 1.03490472, + "epoch": 0.16612054712159927, + "flos": 23039999383680.0, + "grad_norm": 2.58976344958366, + "language_loss": 0.88132447, + "learning_rate": 3.8087609749928822e-06, + "loss": 0.90363282, + "num_input_tokens_seen": 59737555, + "router_z_loss_clip": 1.10693359, + "router_z_loss_mlp": 0.20605469, + "step": 2763, + "time_per_iteration": 2.5019171237945557 + }, + { + "auxiliary_loss_clip": 0.0109123, + "auxiliary_loss_mlp": 0.01010294, + "balance_loss_clip": 1.05141532, + "balance_loss_mlp": 1.00796294, + "epoch": 0.16618067037426726, + "flos": 59241225202560.0, + "grad_norm": 0.7729845516192466, + "language_loss": 0.59783089, + "learning_rate": 3.8085947468841885e-06, + "loss": 0.61884612, + "num_input_tokens_seen": 59800915, + "router_z_loss_clip": 0.39794922, + "router_z_loss_mlp": 0.02328491, + "step": 2764, + "time_per_iteration": 4.590064525604248 + }, + { + "auxiliary_loss_clip": 0.01192555, + "auxiliary_loss_mlp": 0.01050872, + "balance_loss_clip": 1.07929945, + "balance_loss_mlp": 1.02910435, + "epoch": 0.16624079362693522, + "flos": 27198813916800.0, + "grad_norm": 1.8422339288856013, + "language_loss": 0.8234027, + "learning_rate": 3.808428450193401e-06, + "loss": 0.84583706, + "num_input_tokens_seen": 59822910, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.2175293, + "step": 2765, + "time_per_iteration": 2.5014805793762207 + }, + { + "auxiliary_loss_clip": 0.01178287, + "auxiliary_loss_mlp": 0.01053155, + "balance_loss_clip": 1.06376851, + "balance_loss_mlp": 1.03126836, + "epoch": 0.1663009168796032, + "flos": 10925068216320.0, + "grad_norm": 2.5423819621198933, + "language_loss": 0.69638348, + "learning_rate": 3.8082620849268244e-06, + "loss": 0.71869791, + "num_input_tokens_seen": 59838805, + "router_z_loss_clip": 1.14550781, + "router_z_loss_mlp": 0.21875, + "step": 2766, + "time_per_iteration": 2.4680511951446533 + }, + { + "auxiliary_loss_clip": 0.01166956, + "auxiliary_loss_mlp": 0.01046386, + "balance_loss_clip": 1.06250668, + "balance_loss_mlp": 1.02688313, + "epoch": 0.16636104013227115, + "flos": 17894431353600.0, + "grad_norm": 2.3319024216089392, + "language_loss": 0.88513988, + "learning_rate": 3.808095651090769e-06, + "loss": 0.90727329, + "num_input_tokens_seen": 59855345, + "router_z_loss_clip": 1.04492188, + "router_z_loss_mlp": 0.19506836, + "step": 2767, + "time_per_iteration": 3.954676389694214 + }, + { + "auxiliary_loss_clip": 0.01098039, + "auxiliary_loss_mlp": 0.01005598, + "balance_loss_clip": 1.06100082, + "balance_loss_mlp": 1.00320745, + "epoch": 0.16642116338493912, + "flos": 66726050463360.0, + "grad_norm": 0.695198730267389, + "language_loss": 0.52878821, + "learning_rate": 3.8079291486915447e-06, + "loss": 0.5498246, + "num_input_tokens_seen": 59917710, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.02389526, + "step": 2768, + "time_per_iteration": 3.1680498123168945 + }, + { + "auxiliary_loss_clip": 0.01184609, + "auxiliary_loss_mlp": 0.0105021, + "balance_loss_clip": 1.07156909, + "balance_loss_mlp": 1.02945554, + "epoch": 0.16648128663760708, + "flos": 19026048401280.0, + "grad_norm": 2.9722927897175837, + "language_loss": 0.85174549, + "learning_rate": 3.8077625777354667e-06, + "loss": 0.87409365, + "num_input_tokens_seen": 59935105, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.20751953, + "step": 2769, + "time_per_iteration": 2.5151989459991455 + }, + { + "auxiliary_loss_clip": 0.01087167, + "auxiliary_loss_mlp": 0.01009525, + "balance_loss_clip": 1.04910767, + "balance_loss_mlp": 1.00696778, + "epoch": 0.16654140989027508, + "flos": 70134976759680.0, + "grad_norm": 0.8078503565573208, + "language_loss": 0.57416463, + "learning_rate": 3.80759593822885e-06, + "loss": 0.59513152, + "num_input_tokens_seen": 59984085, + "router_z_loss_clip": 0.38037109, + "router_z_loss_mlp": 0.02557373, + "step": 2770, + "time_per_iteration": 2.950734853744507 + }, + { + "auxiliary_loss_clip": 0.01088447, + "auxiliary_loss_mlp": 0.01004624, + "balance_loss_clip": 1.04834962, + "balance_loss_mlp": 1.00212336, + "epoch": 0.16660153314294304, + "flos": 70272406195200.0, + "grad_norm": 0.8653413305126852, + "language_loss": 0.56247842, + "learning_rate": 3.807429230178015e-06, + "loss": 0.58340913, + "num_input_tokens_seen": 60043470, + "router_z_loss_clip": 0.40087891, + "router_z_loss_mlp": 0.0249939, + "step": 2771, + "time_per_iteration": 2.977987051010132 + }, + { + "auxiliary_loss_clip": 0.01173046, + "auxiliary_loss_mlp": 0.01056417, + "balance_loss_clip": 1.06386948, + "balance_loss_mlp": 1.03544784, + "epoch": 0.166661656395611, + "flos": 23075048079360.0, + "grad_norm": 2.410569025969651, + "language_loss": 0.70462418, + "learning_rate": 3.8072624535892817e-06, + "loss": 0.72691882, + "num_input_tokens_seen": 60063045, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.2097168, + "step": 2772, + "time_per_iteration": 2.465188980102539 + }, + { + "auxiliary_loss_clip": 0.01162214, + "auxiliary_loss_mlp": 0.01050641, + "balance_loss_clip": 1.05648959, + "balance_loss_mlp": 1.03070915, + "epoch": 0.16672177964827897, + "flos": 28366341586560.0, + "grad_norm": 2.1025250359388683, + "language_loss": 0.86041492, + "learning_rate": 3.807095608468975e-06, + "loss": 0.88254344, + "num_input_tokens_seen": 60081945, + "router_z_loss_clip": 1.05810547, + "router_z_loss_mlp": 0.19921875, + "step": 2773, + "time_per_iteration": 2.50079607963562 + }, + { + "auxiliary_loss_clip": 0.01170724, + "auxiliary_loss_mlp": 0.01048117, + "balance_loss_clip": 1.06487107, + "balance_loss_mlp": 1.02898431, + "epoch": 0.16678190290094694, + "flos": 19091010147840.0, + "grad_norm": 2.677222503553018, + "language_loss": 0.822245, + "learning_rate": 3.8069286948234224e-06, + "loss": 0.84443343, + "num_input_tokens_seen": 60096820, + "router_z_loss_clip": 1.05810547, + "router_z_loss_mlp": 0.19128418, + "step": 2774, + "time_per_iteration": 2.3992221355438232 + }, + { + "auxiliary_loss_clip": 0.01172636, + "auxiliary_loss_mlp": 0.01045243, + "balance_loss_clip": 1.06240249, + "balance_loss_mlp": 1.02490604, + "epoch": 0.1668420261536149, + "flos": 21799106184960.0, + "grad_norm": 2.2900092772080125, + "language_loss": 0.8323915, + "learning_rate": 3.806761712658952e-06, + "loss": 0.85457027, + "num_input_tokens_seen": 60116140, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.20349121, + "step": 2775, + "time_per_iteration": 2.508772373199463 + }, + { + "auxiliary_loss_clip": 0.01163688, + "auxiliary_loss_mlp": 0.01055335, + "balance_loss_clip": 1.05803561, + "balance_loss_mlp": 1.03484261, + "epoch": 0.16690214940628287, + "flos": 19062533640960.0, + "grad_norm": 2.037647320743538, + "language_loss": 0.80850142, + "learning_rate": 3.806594661981897e-06, + "loss": 0.83069164, + "num_input_tokens_seen": 60134235, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.20495605, + "step": 2776, + "time_per_iteration": 2.4250640869140625 + }, + { + "auxiliary_loss_clip": 0.01164113, + "auxiliary_loss_mlp": 0.01048101, + "balance_loss_clip": 1.06081796, + "balance_loss_mlp": 1.02808642, + "epoch": 0.16696227265895086, + "flos": 18588548747520.0, + "grad_norm": 1.9039094780440928, + "language_loss": 0.80421841, + "learning_rate": 3.8064275427985906e-06, + "loss": 0.82634056, + "num_input_tokens_seen": 60153275, + "router_z_loss_clip": 1.03320312, + "router_z_loss_mlp": 0.19995117, + "step": 2777, + "time_per_iteration": 2.456496477127075 + }, + { + "auxiliary_loss_clip": 0.01169316, + "auxiliary_loss_mlp": 0.01051398, + "balance_loss_clip": 1.06317055, + "balance_loss_mlp": 1.03139448, + "epoch": 0.16702239591161883, + "flos": 23294139085440.0, + "grad_norm": 1.6126643355811254, + "language_loss": 0.85369802, + "learning_rate": 3.806260355115371e-06, + "loss": 0.87590516, + "num_input_tokens_seen": 60173215, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.19995117, + "step": 2778, + "time_per_iteration": 2.4741687774658203 + }, + { + "auxiliary_loss_clip": 0.01169762, + "auxiliary_loss_mlp": 0.01051315, + "balance_loss_clip": 1.05821753, + "balance_loss_mlp": 1.0310142, + "epoch": 0.1670825191642868, + "flos": 24425648392320.0, + "grad_norm": 2.232685237566431, + "language_loss": 0.74269712, + "learning_rate": 3.8060930989385778e-06, + "loss": 0.7649079, + "num_input_tokens_seen": 60190515, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.203125, + "step": 2779, + "time_per_iteration": 2.5424013137817383 + }, + { + "auxiliary_loss_clip": 0.0117971, + "auxiliary_loss_mlp": 0.01052857, + "balance_loss_clip": 1.0696938, + "balance_loss_mlp": 1.03284156, + "epoch": 0.16714264241695476, + "flos": 26797512193920.0, + "grad_norm": 3.132491983549076, + "language_loss": 0.65040731, + "learning_rate": 3.805925774274554e-06, + "loss": 0.67273295, + "num_input_tokens_seen": 60211655, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.1998291, + "step": 2780, + "time_per_iteration": 2.546401262283325 + }, + { + "auxiliary_loss_clip": 0.01179627, + "auxiliary_loss_mlp": 0.01048457, + "balance_loss_clip": 1.07169378, + "balance_loss_mlp": 1.02814388, + "epoch": 0.16720276566962272, + "flos": 21835304115840.0, + "grad_norm": 2.2865381652441825, + "language_loss": 0.78181076, + "learning_rate": 3.805758381129643e-06, + "loss": 0.80409157, + "num_input_tokens_seen": 60230860, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.20324707, + "step": 2781, + "time_per_iteration": 2.5087780952453613 + }, + { + "auxiliary_loss_clip": 0.01168354, + "auxiliary_loss_mlp": 0.01052422, + "balance_loss_clip": 1.05872393, + "balance_loss_mlp": 1.03313434, + "epoch": 0.1672628889222907, + "flos": 21470415805440.0, + "grad_norm": 1.6855054267846235, + "language_loss": 0.75546217, + "learning_rate": 3.805590919510193e-06, + "loss": 0.77766991, + "num_input_tokens_seen": 60250535, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.19287109, + "step": 2782, + "time_per_iteration": 2.439422845840454 + }, + { + "auxiliary_loss_clip": 0.01175813, + "auxiliary_loss_mlp": 0.01060112, + "balance_loss_clip": 1.06400156, + "balance_loss_mlp": 1.03811789, + "epoch": 0.16732301217495865, + "flos": 30774008269440.0, + "grad_norm": 5.2176573103082085, + "language_loss": 0.67950249, + "learning_rate": 3.8054233894225547e-06, + "loss": 0.70186174, + "num_input_tokens_seen": 60269530, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.21984863, + "step": 2783, + "time_per_iteration": 2.5386362075805664 + }, + { + "auxiliary_loss_clip": 0.01165369, + "auxiliary_loss_mlp": 0.01056312, + "balance_loss_clip": 1.0581373, + "balance_loss_mlp": 1.03480721, + "epoch": 0.16738313542762664, + "flos": 23474625949440.0, + "grad_norm": 2.0061807950283113, + "language_loss": 0.70206797, + "learning_rate": 3.805255790873081e-06, + "loss": 0.72428483, + "num_input_tokens_seen": 60289900, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.21496582, + "step": 2784, + "time_per_iteration": 2.500037431716919 + }, + { + "auxiliary_loss_clip": 0.01168809, + "auxiliary_loss_mlp": 0.01056836, + "balance_loss_clip": 1.05935001, + "balance_loss_mlp": 1.03480577, + "epoch": 0.1674432586802946, + "flos": 29789086366080.0, + "grad_norm": 2.1911921465706485, + "language_loss": 0.60699797, + "learning_rate": 3.805088123868126e-06, + "loss": 0.62925446, + "num_input_tokens_seen": 60310025, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.22033691, + "step": 2785, + "time_per_iteration": 2.5571625232696533 + }, + { + "auxiliary_loss_clip": 0.0108381, + "auxiliary_loss_mlp": 0.01012771, + "balance_loss_clip": 1.04644489, + "balance_loss_mlp": 1.00969863, + "epoch": 0.16750338193296258, + "flos": 66136073575680.0, + "grad_norm": 1.4063758171194491, + "language_loss": 0.58803213, + "learning_rate": 3.8049203884140492e-06, + "loss": 0.60899794, + "num_input_tokens_seen": 60377800, + "router_z_loss_clip": 0.37353516, + "router_z_loss_mlp": 0.0307312, + "step": 2786, + "time_per_iteration": 3.2115061283111572 + }, + { + "auxiliary_loss_clip": 0.01183027, + "auxiliary_loss_mlp": 0.01047434, + "balance_loss_clip": 1.07231021, + "balance_loss_mlp": 1.02758574, + "epoch": 0.16756350518563054, + "flos": 25696777864320.0, + "grad_norm": 2.356847023333848, + "language_loss": 0.76263601, + "learning_rate": 3.80475258451721e-06, + "loss": 0.7849406, + "num_input_tokens_seen": 60398215, + "router_z_loss_clip": 1.10595703, + "router_z_loss_mlp": 0.1986084, + "step": 2787, + "time_per_iteration": 2.5194780826568604 + }, + { + "auxiliary_loss_clip": 0.011783, + "auxiliary_loss_mlp": 0.01046678, + "balance_loss_clip": 1.06987083, + "balance_loss_mlp": 1.02771151, + "epoch": 0.1676236284382985, + "flos": 23836102467840.0, + "grad_norm": 2.2245926570386643, + "language_loss": 0.7745502, + "learning_rate": 3.804584712183972e-06, + "loss": 0.79680002, + "num_input_tokens_seen": 60416910, + "router_z_loss_clip": 1.08544922, + "router_z_loss_mlp": 0.18969727, + "step": 2788, + "time_per_iteration": 2.5270140171051025 + }, + { + "auxiliary_loss_clip": 0.01099224, + "auxiliary_loss_mlp": 0.01011603, + "balance_loss_clip": 1.06196606, + "balance_loss_mlp": 1.00923395, + "epoch": 0.16768375169096647, + "flos": 59874902985600.0, + "grad_norm": 0.8628495035260084, + "language_loss": 0.59379965, + "learning_rate": 3.8044167714207013e-06, + "loss": 0.61490798, + "num_input_tokens_seen": 60468660, + "router_z_loss_clip": 0.37255859, + "router_z_loss_mlp": 0.02368164, + "step": 2789, + "time_per_iteration": 3.011103630065918 + }, + { + "auxiliary_loss_clip": 0.01162485, + "auxiliary_loss_mlp": 0.01053107, + "balance_loss_clip": 1.05550671, + "balance_loss_mlp": 1.03231716, + "epoch": 0.16774387494363446, + "flos": 38435657207040.0, + "grad_norm": 1.7097614322261987, + "language_loss": 0.70124108, + "learning_rate": 3.804248762233765e-06, + "loss": 0.72339702, + "num_input_tokens_seen": 60492370, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.20776367, + "step": 2790, + "time_per_iteration": 2.7275702953338623 + }, + { + "auxiliary_loss_clip": 0.01170708, + "auxiliary_loss_mlp": 0.01052801, + "balance_loss_clip": 1.06414652, + "balance_loss_mlp": 1.03409684, + "epoch": 0.16780399819630243, + "flos": 22637620252800.0, + "grad_norm": 1.585714629487615, + "language_loss": 0.79564297, + "learning_rate": 3.8040806846295356e-06, + "loss": 0.81787801, + "num_input_tokens_seen": 60512655, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.18713379, + "step": 2791, + "time_per_iteration": 2.479203939437866 + }, + { + "auxiliary_loss_clip": 0.01169096, + "auxiliary_loss_mlp": 0.01047396, + "balance_loss_clip": 1.06073427, + "balance_loss_mlp": 1.02603364, + "epoch": 0.1678641214489704, + "flos": 32891516887680.0, + "grad_norm": 1.699555655302135, + "language_loss": 0.71420735, + "learning_rate": 3.8039125386143853e-06, + "loss": 0.73637235, + "num_input_tokens_seen": 60533090, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.21362305, + "step": 2792, + "time_per_iteration": 2.622072458267212 + }, + { + "auxiliary_loss_clip": 0.01166483, + "auxiliary_loss_mlp": 0.01044429, + "balance_loss_clip": 1.05920029, + "balance_loss_mlp": 1.0253675, + "epoch": 0.16792424470163836, + "flos": 19974916028160.0, + "grad_norm": 1.9185641821387391, + "language_loss": 0.71787089, + "learning_rate": 3.803744324194691e-06, + "loss": 0.73997998, + "num_input_tokens_seen": 60553190, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.1907959, + "step": 2793, + "time_per_iteration": 2.521392822265625 + }, + { + "auxiliary_loss_clip": 0.01165142, + "auxiliary_loss_mlp": 0.01058008, + "balance_loss_clip": 1.05840337, + "balance_loss_mlp": 1.03681242, + "epoch": 0.16798436795430632, + "flos": 19719878486400.0, + "grad_norm": 2.1666588495981354, + "language_loss": 0.77110183, + "learning_rate": 3.803576041376831e-06, + "loss": 0.79333329, + "num_input_tokens_seen": 60571995, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.21203613, + "step": 2794, + "time_per_iteration": 2.491339921951294 + }, + { + "auxiliary_loss_clip": 0.01172191, + "auxiliary_loss_mlp": 0.01052469, + "balance_loss_clip": 1.06457639, + "balance_loss_mlp": 1.03369355, + "epoch": 0.1680444912069743, + "flos": 28104839596800.0, + "grad_norm": 3.3146572093289866, + "language_loss": 0.71668124, + "learning_rate": 3.803407690167187e-06, + "loss": 0.73892784, + "num_input_tokens_seen": 60591275, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.18774414, + "step": 2795, + "time_per_iteration": 2.542673349380493 + }, + { + "auxiliary_loss_clip": 0.01162659, + "auxiliary_loss_mlp": 0.01046485, + "balance_loss_clip": 1.05594897, + "balance_loss_mlp": 1.02716064, + "epoch": 0.16810461445964225, + "flos": 18075205526400.0, + "grad_norm": 2.056059695755885, + "language_loss": 0.83866316, + "learning_rate": 3.803239270572142e-06, + "loss": 0.86075461, + "num_input_tokens_seen": 60609235, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.1932373, + "step": 2796, + "time_per_iteration": 2.475125551223755 + }, + { + "auxiliary_loss_clip": 0.01177339, + "auxiliary_loss_mlp": 0.01054453, + "balance_loss_clip": 1.06806946, + "balance_loss_mlp": 1.03374684, + "epoch": 0.16816473771231025, + "flos": 23878657105920.0, + "grad_norm": 1.965116385272388, + "language_loss": 0.81406546, + "learning_rate": 3.8030707825980838e-06, + "loss": 0.83638346, + "num_input_tokens_seen": 60629880, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.20690918, + "step": 2797, + "time_per_iteration": 2.504565715789795 + }, + { + "auxiliary_loss_clip": 0.01166958, + "auxiliary_loss_mlp": 0.01046633, + "balance_loss_clip": 1.06483865, + "balance_loss_mlp": 1.02978826, + "epoch": 0.1682248609649782, + "flos": 22783597125120.0, + "grad_norm": 1.840980580707778, + "language_loss": 0.75145459, + "learning_rate": 3.802902226251401e-06, + "loss": 0.77359051, + "num_input_tokens_seen": 60651175, + "router_z_loss_clip": 1.02001953, + "router_z_loss_mlp": 0.16833496, + "step": 2798, + "time_per_iteration": 2.570742607116699 + }, + { + "auxiliary_loss_clip": 0.01167021, + "auxiliary_loss_mlp": 0.01052287, + "balance_loss_clip": 1.06082988, + "balance_loss_mlp": 1.03407156, + "epoch": 0.16828498421764618, + "flos": 20705123612160.0, + "grad_norm": 1.5984186169250607, + "language_loss": 0.79676604, + "learning_rate": 3.8027336015384845e-06, + "loss": 0.81895912, + "num_input_tokens_seen": 60670210, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.18212891, + "step": 2799, + "time_per_iteration": 2.482811450958252 + }, + { + "auxiliary_loss_clip": 0.01171283, + "auxiliary_loss_mlp": 0.0105563, + "balance_loss_clip": 1.06183267, + "balance_loss_mlp": 1.0350064, + "epoch": 0.16834510747031414, + "flos": 29420606695680.0, + "grad_norm": 3.3666169313979153, + "language_loss": 0.71177554, + "learning_rate": 3.8025649084657296e-06, + "loss": 0.73404467, + "num_input_tokens_seen": 60690895, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.20617676, + "step": 2800, + "time_per_iteration": 2.5647642612457275 + }, + { + "auxiliary_loss_clip": 0.01170307, + "auxiliary_loss_mlp": 0.0104642, + "balance_loss_clip": 1.06368542, + "balance_loss_mlp": 1.02615428, + "epoch": 0.1684052307229821, + "flos": 18145374744960.0, + "grad_norm": 1.8175379226103945, + "language_loss": 0.83471382, + "learning_rate": 3.8023961470395326e-06, + "loss": 0.85688108, + "num_input_tokens_seen": 60708280, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.20251465, + "step": 2801, + "time_per_iteration": 4.029038667678833 + }, + { + "auxiliary_loss_clip": 0.01162325, + "auxiliary_loss_mlp": 0.01055158, + "balance_loss_clip": 1.05627763, + "balance_loss_mlp": 1.03582191, + "epoch": 0.16846535397565007, + "flos": 16574929240320.0, + "grad_norm": 2.359907862927847, + "language_loss": 0.82852328, + "learning_rate": 3.8022273172662933e-06, + "loss": 0.85069811, + "num_input_tokens_seen": 60724150, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.19335938, + "step": 2802, + "time_per_iteration": 2.5138099193573 + }, + { + "auxiliary_loss_clip": 0.01179633, + "auxiliary_loss_mlp": 0.01049048, + "balance_loss_clip": 1.07075071, + "balance_loss_mlp": 1.02871132, + "epoch": 0.16852547722831807, + "flos": 30408868563840.0, + "grad_norm": 2.014964703787428, + "language_loss": 0.81147766, + "learning_rate": 3.802058419152413e-06, + "loss": 0.83376443, + "num_input_tokens_seen": 60746485, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.20349121, + "step": 2803, + "time_per_iteration": 2.546863079071045 + }, + { + "auxiliary_loss_clip": 0.01167708, + "auxiliary_loss_mlp": 0.01051347, + "balance_loss_clip": 1.06570029, + "balance_loss_mlp": 1.03271461, + "epoch": 0.16858560048098603, + "flos": 33507420416640.0, + "grad_norm": 2.321114134719098, + "language_loss": 0.76650107, + "learning_rate": 3.801889452704297e-06, + "loss": 0.78869158, + "num_input_tokens_seen": 60762875, + "router_z_loss_clip": 1.02001953, + "router_z_loss_mlp": 0.1862793, + "step": 2804, + "time_per_iteration": 2.5729737281799316 + }, + { + "auxiliary_loss_clip": 0.01079513, + "auxiliary_loss_mlp": 0.01071564, + "balance_loss_clip": 1.04033542, + "balance_loss_mlp": 1.06807399, + "epoch": 0.168645723733654, + "flos": 67370502326400.0, + "grad_norm": 0.9014162360324911, + "language_loss": 0.55377293, + "learning_rate": 3.8017204179283526e-06, + "loss": 0.57528365, + "num_input_tokens_seen": 60825510, + "router_z_loss_clip": 0.39160156, + "router_z_loss_mlp": 0.03485107, + "step": 2805, + "time_per_iteration": 4.389646053314209 + }, + { + "auxiliary_loss_clip": 0.01156507, + "auxiliary_loss_mlp": 0.01038136, + "balance_loss_clip": 1.05398071, + "balance_loss_mlp": 1.02126765, + "epoch": 0.16870584698632196, + "flos": 21324618501120.0, + "grad_norm": 1.859413855199377, + "language_loss": 0.73340273, + "learning_rate": 3.8015513148309892e-06, + "loss": 0.75534916, + "num_input_tokens_seen": 60844440, + "router_z_loss_clip": 1.02539062, + "router_z_loss_mlp": 0.16870117, + "step": 2806, + "time_per_iteration": 2.503408432006836 + }, + { + "auxiliary_loss_clip": 0.01175909, + "auxiliary_loss_mlp": 0.01050575, + "balance_loss_clip": 1.0693363, + "balance_loss_mlp": 1.03182375, + "epoch": 0.16876597023898993, + "flos": 20740746925440.0, + "grad_norm": 2.0907891714663394, + "language_loss": 0.70085973, + "learning_rate": 3.80138214341862e-06, + "loss": 0.72312456, + "num_input_tokens_seen": 60863210, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.18762207, + "step": 2807, + "time_per_iteration": 3.851527690887451 + }, + { + "auxiliary_loss_clip": 0.01173102, + "auxiliary_loss_mlp": 0.01051015, + "balance_loss_clip": 1.06513333, + "balance_loss_mlp": 1.03145289, + "epoch": 0.1688260934916579, + "flos": 20303498666880.0, + "grad_norm": 2.633582251389188, + "language_loss": 0.70398742, + "learning_rate": 3.8012129036976587e-06, + "loss": 0.72622859, + "num_input_tokens_seen": 60882510, + "router_z_loss_clip": 1.07861328, + "router_z_loss_mlp": 0.19567871, + "step": 2808, + "time_per_iteration": 2.5108895301818848 + }, + { + "auxiliary_loss_clip": 0.01169013, + "auxiliary_loss_mlp": 0.01045002, + "balance_loss_clip": 1.06250691, + "balance_loss_mlp": 1.0249511, + "epoch": 0.16888621674432586, + "flos": 20340702178560.0, + "grad_norm": 2.87443374296815, + "language_loss": 0.79871434, + "learning_rate": 3.8010435956745236e-06, + "loss": 0.82085449, + "num_input_tokens_seen": 60901105, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.20068359, + "step": 2809, + "time_per_iteration": 2.4422590732574463 + }, + { + "auxiliary_loss_clip": 0.01172055, + "auxiliary_loss_mlp": 0.01045573, + "balance_loss_clip": 1.06248617, + "balance_loss_mlp": 1.0266428, + "epoch": 0.16894633999699385, + "flos": 16244802316800.0, + "grad_norm": 2.121992424410405, + "language_loss": 0.87998128, + "learning_rate": 3.8008742193556358e-06, + "loss": 0.90215755, + "num_input_tokens_seen": 60915340, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.18945312, + "step": 2810, + "time_per_iteration": 2.4624295234680176 + }, + { + "auxiliary_loss_clip": 0.01169522, + "auxiliary_loss_mlp": 0.01051282, + "balance_loss_clip": 1.0618583, + "balance_loss_mlp": 1.0314337, + "epoch": 0.16900646324966181, + "flos": 19610171372160.0, + "grad_norm": 3.5241487095860378, + "language_loss": 0.92346919, + "learning_rate": 3.800704774747416e-06, + "loss": 0.94567728, + "num_input_tokens_seen": 60933735, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.19848633, + "step": 2811, + "time_per_iteration": 3.956430673599243 + }, + { + "auxiliary_loss_clip": 0.01168737, + "auxiliary_loss_mlp": 0.01057169, + "balance_loss_clip": 1.06303668, + "balance_loss_mlp": 1.03920436, + "epoch": 0.16906658650232978, + "flos": 22018089450240.0, + "grad_norm": 2.1798294743644338, + "language_loss": 0.78721476, + "learning_rate": 3.800535261856291e-06, + "loss": 0.80947387, + "num_input_tokens_seen": 60953105, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.17993164, + "step": 2812, + "time_per_iteration": 2.4862515926361084 + }, + { + "auxiliary_loss_clip": 0.01162546, + "auxiliary_loss_mlp": 0.01058002, + "balance_loss_clip": 1.06034875, + "balance_loss_mlp": 1.03982234, + "epoch": 0.16912670975499774, + "flos": 11763690024960.0, + "grad_norm": 3.0483934117436347, + "language_loss": 0.74990141, + "learning_rate": 3.8003656806886887e-06, + "loss": 0.77210695, + "num_input_tokens_seen": 60969150, + "router_z_loss_clip": 1.02246094, + "router_z_loss_mlp": 0.1817627, + "step": 2813, + "time_per_iteration": 2.495248794555664 + }, + { + "auxiliary_loss_clip": 0.01166744, + "auxiliary_loss_mlp": 0.01052572, + "balance_loss_clip": 1.058321, + "balance_loss_mlp": 1.03359389, + "epoch": 0.1691868330076657, + "flos": 17161386595200.0, + "grad_norm": 2.027566591727545, + "language_loss": 0.69222307, + "learning_rate": 3.8001960312510396e-06, + "loss": 0.71441627, + "num_input_tokens_seen": 60982825, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.18969727, + "step": 2814, + "time_per_iteration": 2.3926703929901123 + }, + { + "auxiliary_loss_clip": 0.01169254, + "auxiliary_loss_mlp": 0.01046643, + "balance_loss_clip": 1.06407142, + "balance_loss_mlp": 1.02826142, + "epoch": 0.16924695626033368, + "flos": 22416553998720.0, + "grad_norm": 2.0594194410526616, + "language_loss": 0.61570084, + "learning_rate": 3.800026313549776e-06, + "loss": 0.63785982, + "num_input_tokens_seen": 61000875, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.18383789, + "step": 2815, + "time_per_iteration": 2.5017824172973633 + }, + { + "auxiliary_loss_clip": 0.0117026, + "auxiliary_loss_mlp": 0.010518, + "balance_loss_clip": 1.06287169, + "balance_loss_mlp": 1.03345406, + "epoch": 0.16930707951300164, + "flos": 25739655724800.0, + "grad_norm": 1.9784195914557092, + "language_loss": 0.81603897, + "learning_rate": 3.7998565275913342e-06, + "loss": 0.83825958, + "num_input_tokens_seen": 61021940, + "router_z_loss_clip": 1.07373047, + "router_z_loss_mlp": 0.18359375, + "step": 2816, + "time_per_iteration": 2.4735825061798096 + }, + { + "auxiliary_loss_clip": 0.01172225, + "auxiliary_loss_mlp": 0.01046395, + "balance_loss_clip": 1.0653559, + "balance_loss_mlp": 1.02795351, + "epoch": 0.16936720276566963, + "flos": 22747040058240.0, + "grad_norm": 2.14076326373426, + "language_loss": 0.86985779, + "learning_rate": 3.799686673382153e-06, + "loss": 0.89204401, + "num_input_tokens_seen": 61040285, + "router_z_loss_clip": 1.06884766, + "router_z_loss_mlp": 0.18444824, + "step": 2817, + "time_per_iteration": 2.52280855178833 + }, + { + "auxiliary_loss_clip": 0.01163553, + "auxiliary_loss_mlp": 0.01051327, + "balance_loss_clip": 1.06111681, + "balance_loss_mlp": 1.032444, + "epoch": 0.1694273260183376, + "flos": 19573973441280.0, + "grad_norm": 2.0350340167754988, + "language_loss": 0.81549919, + "learning_rate": 3.799516750928672e-06, + "loss": 0.83764803, + "num_input_tokens_seen": 61059020, + "router_z_loss_clip": 1.02441406, + "router_z_loss_mlp": 0.18884277, + "step": 2818, + "time_per_iteration": 2.4386894702911377 + }, + { + "auxiliary_loss_clip": 0.01158813, + "auxiliary_loss_mlp": 0.01050215, + "balance_loss_clip": 1.05412078, + "balance_loss_mlp": 1.03060555, + "epoch": 0.16948744927100556, + "flos": 12457843332480.0, + "grad_norm": 2.4796882415389505, + "language_loss": 0.80759859, + "learning_rate": 3.799346760237336e-06, + "loss": 0.82968885, + "num_input_tokens_seen": 61074245, + "router_z_loss_clip": 1.04785156, + "router_z_loss_mlp": 0.19604492, + "step": 2819, + "time_per_iteration": 2.4567015171051025 + }, + { + "auxiliary_loss_clip": 0.01084742, + "auxiliary_loss_mlp": 0.01036901, + "balance_loss_clip": 1.04688263, + "balance_loss_mlp": 1.0343318, + "epoch": 0.16954757252367353, + "flos": 71291694435840.0, + "grad_norm": 0.943413150877215, + "language_loss": 0.61007071, + "learning_rate": 3.7991767013145902e-06, + "loss": 0.63128722, + "num_input_tokens_seen": 61127080, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 0.0256958, + "step": 2820, + "time_per_iteration": 3.0125436782836914 + }, + { + "auxiliary_loss_clip": 0.011604, + "auxiliary_loss_mlp": 0.01052824, + "balance_loss_clip": 1.0551846, + "balance_loss_mlp": 1.03434706, + "epoch": 0.1696076957763415, + "flos": 29606516513280.0, + "grad_norm": 1.9375347679176484, + "language_loss": 0.78411162, + "learning_rate": 3.7990065741668844e-06, + "loss": 0.8062439, + "num_input_tokens_seen": 61146955, + "router_z_loss_clip": 1.05224609, + "router_z_loss_mlp": 0.18493652, + "step": 2821, + "time_per_iteration": 2.547450542449951 + }, + { + "auxiliary_loss_clip": 0.01165516, + "auxiliary_loss_mlp": 0.01070037, + "balance_loss_clip": 1.05732012, + "balance_loss_mlp": 1.04896092, + "epoch": 0.16966781902900946, + "flos": 24388588535040.0, + "grad_norm": 2.4119981912222768, + "language_loss": 0.78368372, + "learning_rate": 3.7988363788006685e-06, + "loss": 0.80603921, + "num_input_tokens_seen": 61166605, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.21081543, + "step": 2822, + "time_per_iteration": 2.4624814987182617 + }, + { + "auxiliary_loss_clip": 0.01161082, + "auxiliary_loss_mlp": 0.01051361, + "balance_loss_clip": 1.05869222, + "balance_loss_mlp": 1.03357482, + "epoch": 0.16972794228167745, + "flos": 23038814234880.0, + "grad_norm": 2.4876348147836302, + "language_loss": 0.75206888, + "learning_rate": 3.7986661152223967e-06, + "loss": 0.77419335, + "num_input_tokens_seen": 61186535, + "router_z_loss_clip": 1.02441406, + "router_z_loss_mlp": 0.17773438, + "step": 2823, + "time_per_iteration": 2.5351781845092773 + }, + { + "auxiliary_loss_clip": 0.01164334, + "auxiliary_loss_mlp": 0.01056483, + "balance_loss_clip": 1.05948794, + "balance_loss_mlp": 1.03807759, + "epoch": 0.16978806553434542, + "flos": 35228691129600.0, + "grad_norm": 2.1006638645065365, + "language_loss": 0.59812915, + "learning_rate": 3.7984957834385257e-06, + "loss": 0.62033731, + "num_input_tokens_seen": 61208965, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.18408203, + "step": 2824, + "time_per_iteration": 2.5933680534362793 + }, + { + "auxiliary_loss_clip": 0.01165276, + "auxiliary_loss_mlp": 0.01055124, + "balance_loss_clip": 1.05716646, + "balance_loss_mlp": 1.03554988, + "epoch": 0.16984818878701338, + "flos": 32014290936960.0, + "grad_norm": 2.442565285147317, + "language_loss": 0.73349166, + "learning_rate": 3.7983253834555144e-06, + "loss": 0.75569558, + "num_input_tokens_seen": 61230670, + "router_z_loss_clip": 1.08056641, + "router_z_loss_mlp": 0.19567871, + "step": 2825, + "time_per_iteration": 2.5832571983337402 + }, + { + "auxiliary_loss_clip": 0.01162621, + "auxiliary_loss_mlp": 0.0105556, + "balance_loss_clip": 1.05241942, + "balance_loss_mlp": 1.03492427, + "epoch": 0.16990831203968135, + "flos": 22818609907200.0, + "grad_norm": 2.3748262633057653, + "language_loss": 0.85913086, + "learning_rate": 3.7981549152798245e-06, + "loss": 0.88131261, + "num_input_tokens_seen": 61249510, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.20605469, + "step": 2826, + "time_per_iteration": 2.4394028186798096 + }, + { + "auxiliary_loss_clip": 0.01169357, + "auxiliary_loss_mlp": 0.01054345, + "balance_loss_clip": 1.06085491, + "balance_loss_mlp": 1.03540277, + "epoch": 0.1699684352923493, + "flos": 23039604334080.0, + "grad_norm": 1.6653194833129683, + "language_loss": 0.82502639, + "learning_rate": 3.7979843789179196e-06, + "loss": 0.8472634, + "num_input_tokens_seen": 61269440, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.18945312, + "step": 2827, + "time_per_iteration": 2.47910475730896 + }, + { + "auxiliary_loss_clip": 0.01174881, + "auxiliary_loss_mlp": 0.01056543, + "balance_loss_clip": 1.06400955, + "balance_loss_mlp": 1.03713536, + "epoch": 0.17002855854501728, + "flos": 21434110133760.0, + "grad_norm": 1.6520918404442981, + "language_loss": 0.73927116, + "learning_rate": 3.797813774376267e-06, + "loss": 0.76158535, + "num_input_tokens_seen": 61288195, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.19421387, + "step": 2828, + "time_per_iteration": 2.4736032485961914 + }, + { + "auxiliary_loss_clip": 0.01087852, + "auxiliary_loss_mlp": 0.0101532, + "balance_loss_clip": 1.05078089, + "balance_loss_mlp": 1.01312947, + "epoch": 0.17008868179768524, + "flos": 71453509205760.0, + "grad_norm": 0.7723995455039315, + "language_loss": 0.56440008, + "learning_rate": 3.797643101661336e-06, + "loss": 0.58543175, + "num_input_tokens_seen": 61350850, + "router_z_loss_clip": 0.37158203, + "router_z_loss_mlp": 0.02191162, + "step": 2829, + "time_per_iteration": 3.1494274139404297 + }, + { + "auxiliary_loss_clip": 0.01159786, + "auxiliary_loss_mlp": 0.01052587, + "balance_loss_clip": 1.0559094, + "balance_loss_mlp": 1.0337162, + "epoch": 0.17014880505035324, + "flos": 24900315644160.0, + "grad_norm": 1.8063306296012056, + "language_loss": 0.82984442, + "learning_rate": 3.7974723607795983e-06, + "loss": 0.85196817, + "num_input_tokens_seen": 61370765, + "router_z_loss_clip": 1.03857422, + "router_z_loss_mlp": 0.1887207, + "step": 2830, + "time_per_iteration": 2.472898244857788 + }, + { + "auxiliary_loss_clip": 0.01164145, + "auxiliary_loss_mlp": 0.01046411, + "balance_loss_clip": 1.0566442, + "balance_loss_mlp": 1.02676535, + "epoch": 0.1702089283030212, + "flos": 29862415981440.0, + "grad_norm": 2.213441095369385, + "language_loss": 0.78328657, + "learning_rate": 3.797301551737529e-06, + "loss": 0.80539215, + "num_input_tokens_seen": 61388935, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.19641113, + "step": 2831, + "time_per_iteration": 2.5434908866882324 + }, + { + "auxiliary_loss_clip": 0.01167857, + "auxiliary_loss_mlp": 0.01051041, + "balance_loss_clip": 1.05786967, + "balance_loss_mlp": 1.031896, + "epoch": 0.17026905155568917, + "flos": 17744180762880.0, + "grad_norm": 2.7219254303152005, + "language_loss": 0.79336047, + "learning_rate": 3.7971306745416044e-06, + "loss": 0.81554943, + "num_input_tokens_seen": 61407350, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.19140625, + "step": 2832, + "time_per_iteration": 2.4262475967407227 + }, + { + "auxiliary_loss_clip": 0.01166765, + "auxiliary_loss_mlp": 0.01057179, + "balance_loss_clip": 1.05602193, + "balance_loss_mlp": 1.03868961, + "epoch": 0.17032917480835713, + "flos": 23148665003520.0, + "grad_norm": 1.7961414026759133, + "language_loss": 0.88910055, + "learning_rate": 3.7969597291983046e-06, + "loss": 0.91133994, + "num_input_tokens_seen": 61429010, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.18493652, + "step": 2833, + "time_per_iteration": 2.4968554973602295 + }, + { + "auxiliary_loss_clip": 0.01171559, + "auxiliary_loss_mlp": 0.01045797, + "balance_loss_clip": 1.06348419, + "balance_loss_mlp": 1.02748632, + "epoch": 0.1703892980610251, + "flos": 39202565512320.0, + "grad_norm": 2.60157280219337, + "language_loss": 0.72087216, + "learning_rate": 3.7967887157141115e-06, + "loss": 0.74304575, + "num_input_tokens_seen": 61450040, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.18310547, + "step": 2834, + "time_per_iteration": 2.624086380004883 + }, + { + "auxiliary_loss_clip": 0.01166987, + "auxiliary_loss_mlp": 0.01054688, + "balance_loss_clip": 1.06040788, + "balance_loss_mlp": 1.03734314, + "epoch": 0.17044942131369306, + "flos": 23039101543680.0, + "grad_norm": 1.811209729059828, + "language_loss": 0.86349273, + "learning_rate": 3.7966176340955106e-06, + "loss": 0.88570952, + "num_input_tokens_seen": 61468585, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.17346191, + "step": 2835, + "time_per_iteration": 2.554450273513794 + }, + { + "auxiliary_loss_clip": 0.01172573, + "auxiliary_loss_mlp": 0.0104848, + "balance_loss_clip": 1.06476271, + "balance_loss_mlp": 1.02791679, + "epoch": 0.17050954456636103, + "flos": 17054983532160.0, + "grad_norm": 3.0767273194033753, + "language_loss": 0.73848629, + "learning_rate": 3.796446484348989e-06, + "loss": 0.76069689, + "num_input_tokens_seen": 61486330, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.20568848, + "step": 2836, + "time_per_iteration": 2.400575876235962 + }, + { + "auxiliary_loss_clip": 0.0116478, + "auxiliary_loss_mlp": 0.01049165, + "balance_loss_clip": 1.0557611, + "balance_loss_mlp": 1.0285058, + "epoch": 0.17056966781902902, + "flos": 16836969934080.0, + "grad_norm": 2.4895000943458854, + "language_loss": 0.80007935, + "learning_rate": 3.796275266481036e-06, + "loss": 0.82221884, + "num_input_tokens_seen": 61503950, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.2064209, + "step": 2837, + "time_per_iteration": 2.4227302074432373 + }, + { + "auxiliary_loss_clip": 0.0116147, + "auxiliary_loss_mlp": 0.01047648, + "balance_loss_clip": 1.06082046, + "balance_loss_mlp": 1.02921844, + "epoch": 0.17062979107169698, + "flos": 17712543859200.0, + "grad_norm": 1.9311679169194824, + "language_loss": 0.83284587, + "learning_rate": 3.7961039804981456e-06, + "loss": 0.85493708, + "num_input_tokens_seen": 61523550, + "router_z_loss_clip": 1.00830078, + "router_z_loss_mlp": 0.18432617, + "step": 2838, + "time_per_iteration": 2.4225540161132812 + }, + { + "auxiliary_loss_clip": 0.01162483, + "auxiliary_loss_mlp": 0.01049976, + "balance_loss_clip": 1.0551672, + "balance_loss_mlp": 1.0316658, + "epoch": 0.17068991432436495, + "flos": 22525040050560.0, + "grad_norm": 1.8082339234334248, + "language_loss": 0.93549627, + "learning_rate": 3.795932626406812e-06, + "loss": 0.95762086, + "num_input_tokens_seen": 61542720, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.18310547, + "step": 2839, + "time_per_iteration": 2.5026004314422607 + }, + { + "auxiliary_loss_clip": 0.01160316, + "auxiliary_loss_mlp": 0.01044138, + "balance_loss_clip": 1.05367208, + "balance_loss_mlp": 1.02411056, + "epoch": 0.17075003757703291, + "flos": 25882939077120.0, + "grad_norm": 1.9461963445349235, + "language_loss": 0.83743596, + "learning_rate": 3.7957612042135336e-06, + "loss": 0.85948062, + "num_input_tokens_seen": 61563040, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.20031738, + "step": 2840, + "time_per_iteration": 2.463623046875 + }, + { + "auxiliary_loss_clip": 0.01166622, + "auxiliary_loss_mlp": 0.01046468, + "balance_loss_clip": 1.05794442, + "balance_loss_mlp": 1.02653599, + "epoch": 0.17081016082970088, + "flos": 20120713332480.0, + "grad_norm": 1.939212087924408, + "language_loss": 0.75984359, + "learning_rate": 3.79558971392481e-06, + "loss": 0.78197449, + "num_input_tokens_seen": 61581890, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.19934082, + "step": 2841, + "time_per_iteration": 2.4568867683410645 + }, + { + "auxiliary_loss_clip": 0.01161697, + "auxiliary_loss_mlp": 0.01052258, + "balance_loss_clip": 1.05495834, + "balance_loss_mlp": 1.03333902, + "epoch": 0.17087028408236885, + "flos": 24936477661440.0, + "grad_norm": 2.444642072590888, + "language_loss": 0.7690872, + "learning_rate": 3.7954181555471443e-06, + "loss": 0.79122669, + "num_input_tokens_seen": 61602095, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.18908691, + "step": 2842, + "time_per_iteration": 2.440037250518799 + }, + { + "auxiliary_loss_clip": 0.01159698, + "auxiliary_loss_mlp": 0.01045589, + "balance_loss_clip": 1.05666018, + "balance_loss_mlp": 1.02708793, + "epoch": 0.17093040733503684, + "flos": 19057864872960.0, + "grad_norm": 1.9770165080722213, + "language_loss": 0.85826886, + "learning_rate": 3.795246529087043e-06, + "loss": 0.8803218, + "num_input_tokens_seen": 61620400, + "router_z_loss_clip": 1.02978516, + "router_z_loss_mlp": 0.18505859, + "step": 2843, + "time_per_iteration": 2.5033464431762695 + }, + { + "auxiliary_loss_clip": 0.01154937, + "auxiliary_loss_mlp": 0.01045482, + "balance_loss_clip": 1.05479574, + "balance_loss_mlp": 1.02738619, + "epoch": 0.1709905305877048, + "flos": 13078954333440.0, + "grad_norm": 2.1918773020316555, + "language_loss": 0.68435872, + "learning_rate": 3.7950748345510126e-06, + "loss": 0.7063629, + "num_input_tokens_seen": 61637680, + "router_z_loss_clip": 1.00244141, + "router_z_loss_mlp": 0.18115234, + "step": 2844, + "time_per_iteration": 2.3941571712493896 + }, + { + "auxiliary_loss_clip": 0.01164744, + "auxiliary_loss_mlp": 0.01056074, + "balance_loss_clip": 1.05946338, + "balance_loss_mlp": 1.03663146, + "epoch": 0.17105065384037277, + "flos": 19209336526080.0, + "grad_norm": 2.02986399727168, + "language_loss": 0.78957915, + "learning_rate": 3.7949030719455646e-06, + "loss": 0.81178737, + "num_input_tokens_seen": 61655630, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.19445801, + "step": 2845, + "time_per_iteration": 3.8615057468414307 + }, + { + "auxiliary_loss_clip": 0.01163194, + "auxiliary_loss_mlp": 0.01042209, + "balance_loss_clip": 1.06046438, + "balance_loss_mlp": 1.0246737, + "epoch": 0.17111077709304073, + "flos": 18515183218560.0, + "grad_norm": 2.639540418061884, + "language_loss": 0.78428864, + "learning_rate": 3.7947312412772127e-06, + "loss": 0.80634266, + "num_input_tokens_seen": 61673475, + "router_z_loss_clip": 1.02783203, + "router_z_loss_mlp": 0.17541504, + "step": 2846, + "time_per_iteration": 2.418354034423828 + }, + { + "auxiliary_loss_clip": 0.01154771, + "auxiliary_loss_mlp": 0.01043929, + "balance_loss_clip": 1.05234122, + "balance_loss_mlp": 1.02583265, + "epoch": 0.1711709003457087, + "flos": 25082670015360.0, + "grad_norm": 1.994102594492144, + "language_loss": 0.80012369, + "learning_rate": 3.794559342552472e-06, + "loss": 0.82211065, + "num_input_tokens_seen": 61693370, + "router_z_loss_clip": 1.02539062, + "router_z_loss_mlp": 0.1809082, + "step": 2847, + "time_per_iteration": 4.042368412017822 + }, + { + "auxiliary_loss_clip": 0.01153781, + "auxiliary_loss_mlp": 0.01047759, + "balance_loss_clip": 1.04914904, + "balance_loss_mlp": 1.02894795, + "epoch": 0.17123102359837666, + "flos": 17566387418880.0, + "grad_norm": 2.699683321631975, + "language_loss": 0.87372291, + "learning_rate": 3.7943873757778614e-06, + "loss": 0.89573836, + "num_input_tokens_seen": 61710820, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.18811035, + "step": 2848, + "time_per_iteration": 2.503988742828369 + }, + { + "auxiliary_loss_clip": 0.01158353, + "auxiliary_loss_mlp": 0.01049017, + "balance_loss_clip": 1.05263638, + "balance_loss_mlp": 1.0304085, + "epoch": 0.17129114685104463, + "flos": 26173635845760.0, + "grad_norm": 5.949969811057779, + "language_loss": 0.74892902, + "learning_rate": 3.794215340959902e-06, + "loss": 0.77100277, + "num_input_tokens_seen": 61729855, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.18615723, + "step": 2849, + "time_per_iteration": 2.4939584732055664 + }, + { + "auxiliary_loss_clip": 0.01093251, + "auxiliary_loss_mlp": 0.0103537, + "balance_loss_clip": 1.05798841, + "balance_loss_mlp": 1.03347421, + "epoch": 0.17135127010371262, + "flos": 69269710037760.0, + "grad_norm": 0.8066610255456632, + "language_loss": 0.57489991, + "learning_rate": 3.7940432381051163e-06, + "loss": 0.59618616, + "num_input_tokens_seen": 61790290, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.0189209, + "step": 2850, + "time_per_iteration": 3.0622105598449707 + }, + { + "auxiliary_loss_clip": 0.01157892, + "auxiliary_loss_mlp": 0.01042313, + "balance_loss_clip": 1.05831075, + "balance_loss_mlp": 1.02419305, + "epoch": 0.1714113933563806, + "flos": 23550110380800.0, + "grad_norm": 2.5522973377882976, + "language_loss": 0.80849963, + "learning_rate": 3.793871067220031e-06, + "loss": 0.83050168, + "num_input_tokens_seen": 61809265, + "router_z_loss_clip": 0.99609375, + "router_z_loss_mlp": 0.18115234, + "step": 2851, + "time_per_iteration": 3.993166923522949 + }, + { + "auxiliary_loss_clip": 0.01156997, + "auxiliary_loss_mlp": 0.01059024, + "balance_loss_clip": 1.05530953, + "balance_loss_mlp": 1.03892493, + "epoch": 0.17147151660904855, + "flos": 21142443697920.0, + "grad_norm": 3.612133814052211, + "language_loss": 0.93387645, + "learning_rate": 3.7936988283111764e-06, + "loss": 0.95603669, + "num_input_tokens_seen": 61828980, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.20092773, + "step": 2852, + "time_per_iteration": 2.462440252304077 + }, + { + "auxiliary_loss_clip": 0.01159553, + "auxiliary_loss_mlp": 0.01055299, + "balance_loss_clip": 1.05489004, + "balance_loss_mlp": 1.03438985, + "epoch": 0.17153163986171652, + "flos": 18624890332800.0, + "grad_norm": 2.126934262822852, + "language_loss": 0.6945169, + "learning_rate": 3.7935265213850817e-06, + "loss": 0.71666545, + "num_input_tokens_seen": 61847915, + "router_z_loss_clip": 1.04736328, + "router_z_loss_mlp": 0.20910645, + "step": 2853, + "time_per_iteration": 2.4635586738586426 + }, + { + "auxiliary_loss_clip": 0.01160856, + "auxiliary_loss_mlp": 0.01050637, + "balance_loss_clip": 1.05652976, + "balance_loss_mlp": 1.03236246, + "epoch": 0.17159176311438448, + "flos": 18223265387520.0, + "grad_norm": 2.42102902284124, + "language_loss": 0.6681577, + "learning_rate": 3.7933541464482815e-06, + "loss": 0.69027257, + "num_input_tokens_seen": 61865570, + "router_z_loss_clip": 1.04199219, + "router_z_loss_mlp": 0.18286133, + "step": 2854, + "time_per_iteration": 3.8164563179016113 + }, + { + "auxiliary_loss_clip": 0.01161659, + "auxiliary_loss_mlp": 0.01044914, + "balance_loss_clip": 1.05635011, + "balance_loss_mlp": 1.02673459, + "epoch": 0.17165188636705245, + "flos": 20738987159040.0, + "grad_norm": 1.6986786520106785, + "language_loss": 0.8902626, + "learning_rate": 3.7931817035073124e-06, + "loss": 0.91232824, + "num_input_tokens_seen": 61883340, + "router_z_loss_clip": 1.05322266, + "router_z_loss_mlp": 0.1817627, + "step": 2855, + "time_per_iteration": 2.470252752304077 + }, + { + "auxiliary_loss_clip": 0.01162652, + "auxiliary_loss_mlp": 0.01056431, + "balance_loss_clip": 1.05808449, + "balance_loss_mlp": 1.03890729, + "epoch": 0.17171200961972044, + "flos": 24899884680960.0, + "grad_norm": 3.170219218682776, + "language_loss": 0.83584565, + "learning_rate": 3.7930091925687134e-06, + "loss": 0.85803652, + "num_input_tokens_seen": 61900610, + "router_z_loss_clip": 1.04492188, + "router_z_loss_mlp": 0.17529297, + "step": 2856, + "time_per_iteration": 2.491340160369873 + }, + { + "auxiliary_loss_clip": 0.01168966, + "auxiliary_loss_mlp": 0.01050178, + "balance_loss_clip": 1.06245852, + "balance_loss_mlp": 1.03204632, + "epoch": 0.1717721328723884, + "flos": 20157234485760.0, + "grad_norm": 2.0762728572118245, + "language_loss": 0.8645407, + "learning_rate": 3.792836613639026e-06, + "loss": 0.88673216, + "num_input_tokens_seen": 61916795, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.18139648, + "step": 2857, + "time_per_iteration": 2.461442232131958 + }, + { + "auxiliary_loss_clip": 0.01160666, + "auxiliary_loss_mlp": 0.01060237, + "balance_loss_clip": 1.05715382, + "balance_loss_mlp": 1.04159307, + "epoch": 0.17183225612505637, + "flos": 23361650697600.0, + "grad_norm": 2.3562871291643392, + "language_loss": 0.78137696, + "learning_rate": 3.7926639667247947e-06, + "loss": 0.80358601, + "num_input_tokens_seen": 61936665, + "router_z_loss_clip": 1.03466797, + "router_z_loss_mlp": 0.18634033, + "step": 2858, + "time_per_iteration": 2.4442994594573975 + }, + { + "auxiliary_loss_clip": 0.01170249, + "auxiliary_loss_mlp": 0.01061424, + "balance_loss_clip": 1.05883837, + "balance_loss_mlp": 1.03875077, + "epoch": 0.17189237937772434, + "flos": 18114240631680.0, + "grad_norm": 6.550504695405818, + "language_loss": 0.77244973, + "learning_rate": 3.7924912518325663e-06, + "loss": 0.79476649, + "num_input_tokens_seen": 61954415, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.22668457, + "step": 2859, + "time_per_iteration": 2.4398622512817383 + }, + { + "auxiliary_loss_clip": 0.01161418, + "auxiliary_loss_mlp": 0.01050003, + "balance_loss_clip": 1.05996227, + "balance_loss_mlp": 1.03067923, + "epoch": 0.1719525026303923, + "flos": 23258408031360.0, + "grad_norm": 2.1320899918838156, + "language_loss": 0.77272916, + "learning_rate": 3.7923184689688902e-06, + "loss": 0.79484332, + "num_input_tokens_seen": 61973940, + "router_z_loss_clip": 1.01220703, + "router_z_loss_mlp": 0.1932373, + "step": 2860, + "time_per_iteration": 2.427614450454712 + }, + { + "auxiliary_loss_clip": 0.0116078, + "auxiliary_loss_mlp": 0.01045761, + "balance_loss_clip": 1.05447733, + "balance_loss_mlp": 1.02666342, + "epoch": 0.17201262588306027, + "flos": 20810413353600.0, + "grad_norm": 2.2003410581865426, + "language_loss": 0.82103503, + "learning_rate": 3.792145618140317e-06, + "loss": 0.84310043, + "num_input_tokens_seen": 61991845, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.19091797, + "step": 2861, + "time_per_iteration": 2.4595413208007812 + }, + { + "auxiliary_loss_clip": 0.01168497, + "auxiliary_loss_mlp": 0.01048183, + "balance_loss_clip": 1.06410778, + "balance_loss_mlp": 1.02961016, + "epoch": 0.17207274913572823, + "flos": 20375858615040.0, + "grad_norm": 2.168807984191274, + "language_loss": 0.85757202, + "learning_rate": 3.7919726993534038e-06, + "loss": 0.87973881, + "num_input_tokens_seen": 62009395, + "router_z_loss_clip": 1.04492188, + "router_z_loss_mlp": 0.18554688, + "step": 2862, + "time_per_iteration": 2.4411075115203857 + }, + { + "auxiliary_loss_clip": 0.01162356, + "auxiliary_loss_mlp": 0.01043571, + "balance_loss_clip": 1.06052577, + "balance_loss_mlp": 1.02675033, + "epoch": 0.17213287238839622, + "flos": 26797727675520.0, + "grad_norm": 1.9940376360731542, + "language_loss": 0.77887052, + "learning_rate": 3.7917997126147054e-06, + "loss": 0.80092978, + "num_input_tokens_seen": 62029005, + "router_z_loss_clip": 1.01757812, + "router_z_loss_mlp": 0.16809082, + "step": 2863, + "time_per_iteration": 2.497830390930176 + }, + { + "auxiliary_loss_clip": 0.0117097, + "auxiliary_loss_mlp": 0.0104784, + "balance_loss_clip": 1.06399202, + "balance_loss_mlp": 1.02945769, + "epoch": 0.1721929956410642, + "flos": 26030819370240.0, + "grad_norm": 1.9704232633584815, + "language_loss": 0.72671402, + "learning_rate": 3.7916266579307823e-06, + "loss": 0.74890208, + "num_input_tokens_seen": 62048730, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.18395996, + "step": 2864, + "time_per_iteration": 2.479860544204712 + }, + { + "auxiliary_loss_clip": 0.01165758, + "auxiliary_loss_mlp": 0.01046966, + "balance_loss_clip": 1.060619, + "balance_loss_mlp": 1.02870345, + "epoch": 0.17225311889373215, + "flos": 22273091078400.0, + "grad_norm": 1.7235511529625205, + "language_loss": 0.72535467, + "learning_rate": 3.7914535353081973e-06, + "loss": 0.74748194, + "num_input_tokens_seen": 62069000, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.18249512, + "step": 2865, + "time_per_iteration": 2.4732067584991455 + }, + { + "auxiliary_loss_clip": 0.01165462, + "auxiliary_loss_mlp": 0.01047818, + "balance_loss_clip": 1.06227326, + "balance_loss_mlp": 1.02959085, + "epoch": 0.17231324214640012, + "flos": 21287774125440.0, + "grad_norm": 3.660585183191867, + "language_loss": 0.78643143, + "learning_rate": 3.7912803447535145e-06, + "loss": 0.80856425, + "num_input_tokens_seen": 62086750, + "router_z_loss_clip": 1.03222656, + "router_z_loss_mlp": 0.18225098, + "step": 2866, + "time_per_iteration": 2.434224843978882 + }, + { + "auxiliary_loss_clip": 0.01158078, + "auxiliary_loss_mlp": 0.01056071, + "balance_loss_clip": 1.05336607, + "balance_loss_mlp": 1.03370726, + "epoch": 0.17237336539906808, + "flos": 19680735640320.0, + "grad_norm": 1.6777249432164871, + "language_loss": 0.79982048, + "learning_rate": 3.7911070862733016e-06, + "loss": 0.82196194, + "num_input_tokens_seen": 62106240, + "router_z_loss_clip": 1.04736328, + "router_z_loss_mlp": 0.22363281, + "step": 2867, + "time_per_iteration": 2.4294698238372803 + }, + { + "auxiliary_loss_clip": 0.01162667, + "auxiliary_loss_mlp": 0.01049014, + "balance_loss_clip": 1.05877638, + "balance_loss_mlp": 1.02799773, + "epoch": 0.17243348865173605, + "flos": 17529650784000.0, + "grad_norm": 1.668390352432253, + "language_loss": 0.79697859, + "learning_rate": 3.7909337598741276e-06, + "loss": 0.81909537, + "num_input_tokens_seen": 62124895, + "router_z_loss_clip": 1.03808594, + "router_z_loss_mlp": 0.21020508, + "step": 2868, + "time_per_iteration": 2.40922212600708 + }, + { + "auxiliary_loss_clip": 0.01166068, + "auxiliary_loss_mlp": 0.01052673, + "balance_loss_clip": 1.05717385, + "balance_loss_mlp": 1.03425503, + "epoch": 0.17249361190440402, + "flos": 18259858368000.0, + "grad_norm": 1.9318421356661777, + "language_loss": 0.84118229, + "learning_rate": 3.7907603655625674e-06, + "loss": 0.8633697, + "num_input_tokens_seen": 62143510, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.18432617, + "step": 2869, + "time_per_iteration": 2.577376127243042 + }, + { + "auxiliary_loss_clip": 0.01173035, + "auxiliary_loss_mlp": 0.01050286, + "balance_loss_clip": 1.06441021, + "balance_loss_mlp": 1.03120017, + "epoch": 0.172553735157072, + "flos": 21174367910400.0, + "grad_norm": 2.047131497445026, + "language_loss": 0.77060449, + "learning_rate": 3.7905869033451932e-06, + "loss": 0.79283768, + "num_input_tokens_seen": 62162285, + "router_z_loss_clip": 1.08544922, + "router_z_loss_mlp": 0.19091797, + "step": 2870, + "time_per_iteration": 2.5847785472869873 + }, + { + "auxiliary_loss_clip": 0.01168057, + "auxiliary_loss_mlp": 0.01041499, + "balance_loss_clip": 1.06569719, + "balance_loss_mlp": 1.02482116, + "epoch": 0.17261385840973997, + "flos": 22273270646400.0, + "grad_norm": 3.110378499685793, + "language_loss": 0.76900816, + "learning_rate": 3.7904133732285857e-06, + "loss": 0.79110372, + "num_input_tokens_seen": 62180970, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.16687012, + "step": 2871, + "time_per_iteration": 2.525669574737549 + }, + { + "auxiliary_loss_clip": 0.01164601, + "auxiliary_loss_mlp": 0.01041066, + "balance_loss_clip": 1.06012225, + "balance_loss_mlp": 1.02252936, + "epoch": 0.17267398166240794, + "flos": 27922233830400.0, + "grad_norm": 2.677445940114608, + "language_loss": 0.74105632, + "learning_rate": 3.7902397752193228e-06, + "loss": 0.76311302, + "num_input_tokens_seen": 62198965, + "router_z_loss_clip": 1.04492188, + "router_z_loss_mlp": 0.1854248, + "step": 2872, + "time_per_iteration": 2.488973379135132 + }, + { + "auxiliary_loss_clip": 0.01160662, + "auxiliary_loss_mlp": 0.0104659, + "balance_loss_clip": 1.05925202, + "balance_loss_mlp": 1.02818418, + "epoch": 0.1727341049150759, + "flos": 21945118970880.0, + "grad_norm": 1.7535540485636272, + "language_loss": 0.82247579, + "learning_rate": 3.790066109323988e-06, + "loss": 0.84454834, + "num_input_tokens_seen": 62219890, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.18408203, + "step": 2873, + "time_per_iteration": 2.4815776348114014 + }, + { + "auxiliary_loss_clip": 0.01158289, + "auxiliary_loss_mlp": 0.01044999, + "balance_loss_clip": 1.05418158, + "balance_loss_mlp": 1.02553225, + "epoch": 0.17279422816774387, + "flos": 18107883924480.0, + "grad_norm": 3.1690931510294518, + "language_loss": 0.75273961, + "learning_rate": 3.7898923755491678e-06, + "loss": 0.77477247, + "num_input_tokens_seen": 62237140, + "router_z_loss_clip": 1.04101562, + "router_z_loss_mlp": 0.19458008, + "step": 2874, + "time_per_iteration": 2.40584135055542 + }, + { + "auxiliary_loss_clip": 0.01157627, + "auxiliary_loss_mlp": 0.010469, + "balance_loss_clip": 1.0537535, + "balance_loss_mlp": 1.02754068, + "epoch": 0.17285435142041183, + "flos": 21835447770240.0, + "grad_norm": 2.143149851137838, + "language_loss": 0.8140592, + "learning_rate": 3.7897185739014487e-06, + "loss": 0.83610445, + "num_input_tokens_seen": 62255405, + "router_z_loss_clip": 1.04150391, + "router_z_loss_mlp": 0.19360352, + "step": 2875, + "time_per_iteration": 2.4477639198303223 + }, + { + "auxiliary_loss_clip": 0.0116072, + "auxiliary_loss_mlp": 0.01053612, + "balance_loss_clip": 1.05512059, + "balance_loss_mlp": 1.0344429, + "epoch": 0.17291447467307983, + "flos": 18368452160640.0, + "grad_norm": 3.225248795824281, + "language_loss": 0.87243915, + "learning_rate": 3.7895447043874217e-06, + "loss": 0.89458251, + "num_input_tokens_seen": 62271280, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.19165039, + "step": 2876, + "time_per_iteration": 2.404787063598633 + }, + { + "auxiliary_loss_clip": 0.01166134, + "auxiliary_loss_mlp": 0.01037804, + "balance_loss_clip": 1.06318545, + "balance_loss_mlp": 1.02025652, + "epoch": 0.1729745979257478, + "flos": 18624638937600.0, + "grad_norm": 1.722738789599503, + "language_loss": 0.84141564, + "learning_rate": 3.789370767013681e-06, + "loss": 0.86345506, + "num_input_tokens_seen": 62289140, + "router_z_loss_clip": 1.03027344, + "router_z_loss_mlp": 0.17553711, + "step": 2877, + "time_per_iteration": 2.5172109603881836 + }, + { + "auxiliary_loss_clip": 0.01159328, + "auxiliary_loss_mlp": 0.01043655, + "balance_loss_clip": 1.05539322, + "balance_loss_mlp": 1.02482033, + "epoch": 0.17303472117841576, + "flos": 22998234844800.0, + "grad_norm": 2.0752690972061822, + "language_loss": 0.79210234, + "learning_rate": 3.7891967617868204e-06, + "loss": 0.81413215, + "num_input_tokens_seen": 62307490, + "router_z_loss_clip": 1.03955078, + "router_z_loss_mlp": 0.18835449, + "step": 2878, + "time_per_iteration": 2.4406819343566895 + }, + { + "auxiliary_loss_clip": 0.01157251, + "auxiliary_loss_mlp": 0.01045164, + "balance_loss_clip": 1.05530894, + "balance_loss_mlp": 1.027807, + "epoch": 0.17309484443108372, + "flos": 25664386775040.0, + "grad_norm": 1.83340876205575, + "language_loss": 0.70596433, + "learning_rate": 3.78902268871344e-06, + "loss": 0.72798848, + "num_input_tokens_seen": 62328570, + "router_z_loss_clip": 1.02050781, + "router_z_loss_mlp": 0.17358398, + "step": 2879, + "time_per_iteration": 2.540616750717163 + }, + { + "auxiliary_loss_clip": 0.01158261, + "auxiliary_loss_mlp": 0.01046365, + "balance_loss_clip": 1.05559731, + "balance_loss_mlp": 1.02873349, + "epoch": 0.1731549676837517, + "flos": 13552903313280.0, + "grad_norm": 2.1766872144367246, + "language_loss": 0.82856041, + "learning_rate": 3.78884854780014e-06, + "loss": 0.85060668, + "num_input_tokens_seen": 62345735, + "router_z_loss_clip": 1.02685547, + "router_z_loss_mlp": 0.1763916, + "step": 2880, + "time_per_iteration": 2.444145441055298 + }, + { + "auxiliary_loss_clip": 0.01163757, + "auxiliary_loss_mlp": 0.01045936, + "balance_loss_clip": 1.05886388, + "balance_loss_mlp": 1.02673173, + "epoch": 0.17321509093641965, + "flos": 22857070394880.0, + "grad_norm": 2.2293948754718564, + "language_loss": 0.80614847, + "learning_rate": 3.7886743390535236e-06, + "loss": 0.8282454, + "num_input_tokens_seen": 62365525, + "router_z_loss_clip": 1.05029297, + "router_z_loss_mlp": 0.1920166, + "step": 2881, + "time_per_iteration": 2.5257010459899902 + }, + { + "auxiliary_loss_clip": 0.01161301, + "auxiliary_loss_mlp": 0.01039232, + "balance_loss_clip": 1.05956852, + "balance_loss_mlp": 1.02320981, + "epoch": 0.17327521418908762, + "flos": 24352785653760.0, + "grad_norm": 2.400418083909313, + "language_loss": 0.77443421, + "learning_rate": 3.788500062480197e-06, + "loss": 0.79643953, + "num_input_tokens_seen": 62385160, + "router_z_loss_clip": 1.01757812, + "router_z_loss_mlp": 0.16015625, + "step": 2882, + "time_per_iteration": 2.494091510772705 + }, + { + "auxiliary_loss_clip": 0.01161088, + "auxiliary_loss_mlp": 0.01051015, + "balance_loss_clip": 1.06162655, + "balance_loss_mlp": 1.03337169, + "epoch": 0.1733353374417556, + "flos": 33105651816960.0, + "grad_norm": 1.9722438168904062, + "language_loss": 0.76430935, + "learning_rate": 3.788325718086769e-06, + "loss": 0.78643036, + "num_input_tokens_seen": 62405280, + "router_z_loss_clip": 0.99511719, + "router_z_loss_mlp": 0.1763916, + "step": 2883, + "time_per_iteration": 2.5283684730529785 + }, + { + "auxiliary_loss_clip": 0.01159879, + "auxiliary_loss_mlp": 0.01043326, + "balance_loss_clip": 1.05789852, + "balance_loss_mlp": 1.02576613, + "epoch": 0.17339546069442358, + "flos": 24388947671040.0, + "grad_norm": 2.3614796591392087, + "language_loss": 0.85933852, + "learning_rate": 3.7881513058798503e-06, + "loss": 0.8813706, + "num_input_tokens_seen": 62423665, + "router_z_loss_clip": 1.02001953, + "router_z_loss_mlp": 0.17565918, + "step": 2884, + "time_per_iteration": 2.458441734313965 + }, + { + "auxiliary_loss_clip": 0.01163433, + "auxiliary_loss_mlp": 0.0104338, + "balance_loss_clip": 1.05958796, + "balance_loss_mlp": 1.02632117, + "epoch": 0.17345558394709154, + "flos": 27454174680960.0, + "grad_norm": 1.6472069523172206, + "language_loss": 0.74032927, + "learning_rate": 3.787976825866055e-06, + "loss": 0.76239741, + "num_input_tokens_seen": 62445170, + "router_z_loss_clip": 1.03955078, + "router_z_loss_mlp": 0.17041016, + "step": 2885, + "time_per_iteration": 2.4959261417388916 + }, + { + "auxiliary_loss_clip": 0.01152823, + "auxiliary_loss_mlp": 0.01039486, + "balance_loss_clip": 1.05565667, + "balance_loss_mlp": 1.02301121, + "epoch": 0.1735157071997595, + "flos": 24682158391680.0, + "grad_norm": 1.5836892208664164, + "language_loss": 0.70519602, + "learning_rate": 3.7878022780519998e-06, + "loss": 0.72711915, + "num_input_tokens_seen": 62466135, + "router_z_loss_clip": 0.97167969, + "router_z_loss_mlp": 0.16479492, + "step": 2886, + "time_per_iteration": 2.4736924171447754 + }, + { + "auxiliary_loss_clip": 0.01161448, + "auxiliary_loss_mlp": 0.0104516, + "balance_loss_clip": 1.05735898, + "balance_loss_mlp": 1.02715957, + "epoch": 0.17357583045242747, + "flos": 21688932193920.0, + "grad_norm": 2.788611359249366, + "language_loss": 0.69378382, + "learning_rate": 3.7876276624443024e-06, + "loss": 0.71584994, + "num_input_tokens_seen": 62483910, + "router_z_loss_clip": 1.04101562, + "router_z_loss_mlp": 0.17993164, + "step": 2887, + "time_per_iteration": 2.4744086265563965 + }, + { + "auxiliary_loss_clip": 0.01163633, + "auxiliary_loss_mlp": 0.01050298, + "balance_loss_clip": 1.06095445, + "balance_loss_mlp": 1.03335834, + "epoch": 0.17363595370509544, + "flos": 15375728753280.0, + "grad_norm": 1.8083335096045774, + "language_loss": 0.85344601, + "learning_rate": 3.787452979049585e-06, + "loss": 0.87558532, + "num_input_tokens_seen": 62501530, + "router_z_loss_clip": 1.02636719, + "router_z_loss_mlp": 0.16943359, + "step": 2888, + "time_per_iteration": 2.412654161453247 + }, + { + "auxiliary_loss_clip": 0.01163847, + "auxiliary_loss_mlp": 0.01048138, + "balance_loss_clip": 1.06157207, + "balance_loss_mlp": 1.02945781, + "epoch": 0.1736960769577634, + "flos": 23440941970560.0, + "grad_norm": 2.1471916940219358, + "language_loss": 0.78477377, + "learning_rate": 3.7872782278744718e-06, + "loss": 0.80689359, + "num_input_tokens_seen": 62521295, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.18664551, + "step": 2889, + "time_per_iteration": 4.027343511581421 + }, + { + "auxiliary_loss_clip": 0.01164574, + "auxiliary_loss_mlp": 0.01043654, + "balance_loss_clip": 1.06533742, + "balance_loss_mlp": 1.0253433, + "epoch": 0.1737562002104314, + "flos": 18587830475520.0, + "grad_norm": 2.7335572114733377, + "language_loss": 0.83844209, + "learning_rate": 3.7871034089255883e-06, + "loss": 0.86052442, + "num_input_tokens_seen": 62539615, + "router_z_loss_clip": 0.99267578, + "router_z_loss_mlp": 0.18310547, + "step": 2890, + "time_per_iteration": 2.441721200942993 + }, + { + "auxiliary_loss_clip": 0.01172428, + "auxiliary_loss_mlp": 0.0105286, + "balance_loss_clip": 1.06627035, + "balance_loss_mlp": 1.0345974, + "epoch": 0.17381632346309936, + "flos": 15998060816640.0, + "grad_norm": 2.2180482565898294, + "language_loss": 0.82218623, + "learning_rate": 3.7869285222095653e-06, + "loss": 0.84443915, + "num_input_tokens_seen": 62556820, + "router_z_loss_clip": 1.06201172, + "router_z_loss_mlp": 0.18273926, + "step": 2891, + "time_per_iteration": 3.7496540546417236 + }, + { + "auxiliary_loss_clip": 0.01158804, + "auxiliary_loss_mlp": 0.01045045, + "balance_loss_clip": 1.05606687, + "balance_loss_mlp": 1.02669907, + "epoch": 0.17387644671576732, + "flos": 13369830670080.0, + "grad_norm": 2.091428754599029, + "language_loss": 0.81472319, + "learning_rate": 3.7867535677330334e-06, + "loss": 0.83676171, + "num_input_tokens_seen": 62572450, + "router_z_loss_clip": 1.02783203, + "router_z_loss_mlp": 0.18347168, + "step": 2892, + "time_per_iteration": 2.5734126567840576 + }, + { + "auxiliary_loss_clip": 0.01167088, + "auxiliary_loss_mlp": 0.01057602, + "balance_loss_clip": 1.06465244, + "balance_loss_mlp": 1.03879046, + "epoch": 0.1739365699684353, + "flos": 26615516958720.0, + "grad_norm": 2.122446870190439, + "language_loss": 0.74752653, + "learning_rate": 3.786578545502627e-06, + "loss": 0.76977336, + "num_input_tokens_seen": 62592580, + "router_z_loss_clip": 1.02441406, + "router_z_loss_mlp": 0.18811035, + "step": 2893, + "time_per_iteration": 2.6279735565185547 + }, + { + "auxiliary_loss_clip": 0.01161623, + "auxiliary_loss_mlp": 0.01054532, + "balance_loss_clip": 1.05941248, + "balance_loss_mlp": 1.03675807, + "epoch": 0.17399669322110325, + "flos": 23367971491200.0, + "grad_norm": 2.047118877068774, + "language_loss": 0.82824826, + "learning_rate": 3.7864034555249828e-06, + "loss": 0.85040981, + "num_input_tokens_seen": 62611220, + "router_z_loss_clip": 1.02148438, + "router_z_loss_mlp": 0.17785645, + "step": 2894, + "time_per_iteration": 2.5339951515197754 + }, + { + "auxiliary_loss_clip": 0.01156345, + "auxiliary_loss_mlp": 0.01046979, + "balance_loss_clip": 1.05452442, + "balance_loss_mlp": 1.02668929, + "epoch": 0.17405681647377122, + "flos": 22054107813120.0, + "grad_norm": 7.6743186257841876, + "language_loss": 0.74008203, + "learning_rate": 3.786228297806741e-06, + "loss": 0.76211524, + "num_input_tokens_seen": 62629185, + "router_z_loss_clip": 1.01806641, + "router_z_loss_mlp": 0.20288086, + "step": 2895, + "time_per_iteration": 3.8778929710388184 + }, + { + "auxiliary_loss_clip": 0.01071582, + "auxiliary_loss_mlp": 0.01068879, + "balance_loss_clip": 1.03704667, + "balance_loss_mlp": 1.06671274, + "epoch": 0.1741169397264392, + "flos": 61457559114240.0, + "grad_norm": 0.9064644702035101, + "language_loss": 0.62783778, + "learning_rate": 3.7860530723545435e-06, + "loss": 0.6492424, + "num_input_tokens_seen": 62691895, + "router_z_loss_clip": 0.34521484, + "router_z_loss_mlp": 0.021698, + "step": 2896, + "time_per_iteration": 3.129000663757324 + }, + { + "auxiliary_loss_clip": 0.01159096, + "auxiliary_loss_mlp": 0.01037451, + "balance_loss_clip": 1.05390835, + "balance_loss_mlp": 1.0195334, + "epoch": 0.17417706297910718, + "flos": 27017680608000.0, + "grad_norm": 1.8349081175681878, + "language_loss": 0.75747502, + "learning_rate": 3.785877779175034e-06, + "loss": 0.77944046, + "num_input_tokens_seen": 62713790, + "router_z_loss_clip": 1.05126953, + "router_z_loss_mlp": 0.17919922, + "step": 2897, + "time_per_iteration": 2.5405492782592773 + }, + { + "auxiliary_loss_clip": 0.01152687, + "auxiliary_loss_mlp": 0.01040921, + "balance_loss_clip": 1.05367053, + "balance_loss_mlp": 1.02306342, + "epoch": 0.17423718623177514, + "flos": 33508856960640.0, + "grad_norm": 1.864984646078847, + "language_loss": 0.69560432, + "learning_rate": 3.7857024182748606e-06, + "loss": 0.71754038, + "num_input_tokens_seen": 62736285, + "router_z_loss_clip": 0.99023438, + "router_z_loss_mlp": 0.17858887, + "step": 2898, + "time_per_iteration": 4.119635343551636 + }, + { + "auxiliary_loss_clip": 0.01169926, + "auxiliary_loss_mlp": 0.01046036, + "balance_loss_clip": 1.06339073, + "balance_loss_mlp": 1.02779722, + "epoch": 0.1742973094844431, + "flos": 27198634348800.0, + "grad_norm": 2.4020878499377174, + "language_loss": 0.76295853, + "learning_rate": 3.7855269896606717e-06, + "loss": 0.78511816, + "num_input_tokens_seen": 62756240, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.18249512, + "step": 2899, + "time_per_iteration": 2.491580009460449 + }, + { + "auxiliary_loss_clip": 0.01159343, + "auxiliary_loss_mlp": 0.01048657, + "balance_loss_clip": 1.05951929, + "balance_loss_mlp": 1.03033412, + "epoch": 0.17435743273711107, + "flos": 22710734386560.0, + "grad_norm": 1.9839459650954827, + "language_loss": 0.72873044, + "learning_rate": 3.785351493339121e-06, + "loss": 0.7508105, + "num_input_tokens_seen": 62775910, + "router_z_loss_clip": 0.99804688, + "router_z_loss_mlp": 0.18334961, + "step": 2900, + "time_per_iteration": 2.5192127227783203 + }, + { + "auxiliary_loss_clip": 0.01159279, + "auxiliary_loss_mlp": 0.01051997, + "balance_loss_clip": 1.0581367, + "balance_loss_mlp": 1.03511691, + "epoch": 0.17441755598977904, + "flos": 41646466039680.0, + "grad_norm": 2.1504314442755206, + "language_loss": 0.69836575, + "learning_rate": 3.785175929316863e-06, + "loss": 0.72047848, + "num_input_tokens_seen": 62799385, + "router_z_loss_clip": 1.01123047, + "router_z_loss_mlp": 0.16882324, + "step": 2901, + "time_per_iteration": 2.6183671951293945 + }, + { + "auxiliary_loss_clip": 0.01164546, + "auxiliary_loss_mlp": 0.01056466, + "balance_loss_clip": 1.05887151, + "balance_loss_mlp": 1.03831005, + "epoch": 0.174477679242447, + "flos": 26287077974400.0, + "grad_norm": 1.9525340895515269, + "language_loss": 0.76385134, + "learning_rate": 3.7850002976005543e-06, + "loss": 0.78606153, + "num_input_tokens_seen": 62819380, + "router_z_loss_clip": 1.05712891, + "router_z_loss_mlp": 0.18139648, + "step": 2902, + "time_per_iteration": 2.520383834838867 + }, + { + "auxiliary_loss_clip": 0.01159088, + "auxiliary_loss_mlp": 0.01065862, + "balance_loss_clip": 1.05521393, + "balance_loss_mlp": 1.04807591, + "epoch": 0.174537802495115, + "flos": 17858412990720.0, + "grad_norm": 1.9610872163002913, + "language_loss": 0.81502831, + "learning_rate": 3.7848245981968558e-06, + "loss": 0.83727777, + "num_input_tokens_seen": 62836205, + "router_z_loss_clip": 1.03955078, + "router_z_loss_mlp": 0.17785645, + "step": 2903, + "time_per_iteration": 2.4278724193573 + }, + { + "auxiliary_loss_clip": 0.01166338, + "auxiliary_loss_mlp": 0.0106037, + "balance_loss_clip": 1.06219649, + "balance_loss_mlp": 1.04251218, + "epoch": 0.17459792574778296, + "flos": 16940715390720.0, + "grad_norm": 1.98629725143916, + "language_loss": 0.73496521, + "learning_rate": 3.784648831112429e-06, + "loss": 0.75723231, + "num_input_tokens_seen": 62854045, + "router_z_loss_clip": 1.03955078, + "router_z_loss_mlp": 0.17858887, + "step": 2904, + "time_per_iteration": 2.537201404571533 + }, + { + "auxiliary_loss_clip": 0.01153316, + "auxiliary_loss_mlp": 0.01052224, + "balance_loss_clip": 1.05229604, + "balance_loss_mlp": 1.03445017, + "epoch": 0.17465804900045093, + "flos": 25520026014720.0, + "grad_norm": 2.056867593040317, + "language_loss": 0.64600563, + "learning_rate": 3.7844729963539406e-06, + "loss": 0.66806102, + "num_input_tokens_seen": 62873075, + "router_z_loss_clip": 1.00976562, + "router_z_loss_mlp": 0.17749023, + "step": 2905, + "time_per_iteration": 2.50915789604187 + }, + { + "auxiliary_loss_clip": 0.01167162, + "auxiliary_loss_mlp": 0.01058968, + "balance_loss_clip": 1.05837071, + "balance_loss_mlp": 1.03911924, + "epoch": 0.1747181722531189, + "flos": 24129708238080.0, + "grad_norm": 2.1487602775887016, + "language_loss": 0.79596031, + "learning_rate": 3.7842970939280566e-06, + "loss": 0.81822157, + "num_input_tokens_seen": 62892675, + "router_z_loss_clip": 1.08740234, + "router_z_loss_mlp": 0.19848633, + "step": 2906, + "time_per_iteration": 2.501891613006592 + }, + { + "auxiliary_loss_clip": 0.01163667, + "auxiliary_loss_mlp": 0.01062067, + "balance_loss_clip": 1.05979633, + "balance_loss_mlp": 1.04343498, + "epoch": 0.17477829550578686, + "flos": 17748813617280.0, + "grad_norm": 1.8880480205577905, + "language_loss": 0.81230009, + "learning_rate": 3.784121123841449e-06, + "loss": 0.83455741, + "num_input_tokens_seen": 62910675, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.18615723, + "step": 2907, + "time_per_iteration": 2.4204490184783936 + }, + { + "auxiliary_loss_clip": 0.01157527, + "auxiliary_loss_mlp": 0.010548, + "balance_loss_clip": 1.05416071, + "balance_loss_mlp": 1.03672838, + "epoch": 0.17483841875845482, + "flos": 15377344865280.0, + "grad_norm": 2.266213644204116, + "language_loss": 0.81487608, + "learning_rate": 3.7839450861007886e-06, + "loss": 0.83699942, + "num_input_tokens_seen": 62928130, + "router_z_loss_clip": 1.03369141, + "router_z_loss_mlp": 0.18066406, + "step": 2908, + "time_per_iteration": 2.4922378063201904 + }, + { + "auxiliary_loss_clip": 0.01159186, + "auxiliary_loss_mlp": 0.01054232, + "balance_loss_clip": 1.05558431, + "balance_loss_mlp": 1.03517032, + "epoch": 0.17489854201112282, + "flos": 17163254102400.0, + "grad_norm": 4.257172986544591, + "language_loss": 0.79896879, + "learning_rate": 3.7837689807127518e-06, + "loss": 0.82110298, + "num_input_tokens_seen": 62944290, + "router_z_loss_clip": 1.03662109, + "router_z_loss_mlp": 0.19055176, + "step": 2909, + "time_per_iteration": 2.39442777633667 + }, + { + "auxiliary_loss_clip": 0.01156399, + "auxiliary_loss_mlp": 0.0105451, + "balance_loss_clip": 1.05241632, + "balance_loss_mlp": 1.03449488, + "epoch": 0.17495866526379078, + "flos": 19755286318080.0, + "grad_norm": 2.5808343727788956, + "language_loss": 0.76788926, + "learning_rate": 3.783592807684017e-06, + "loss": 0.78999841, + "num_input_tokens_seen": 62963505, + "router_z_loss_clip": 1.03955078, + "router_z_loss_mlp": 0.19995117, + "step": 2910, + "time_per_iteration": 2.478598117828369 + }, + { + "auxiliary_loss_clip": 0.01160512, + "auxiliary_loss_mlp": 0.0105559, + "balance_loss_clip": 1.05639827, + "balance_loss_mlp": 1.03538382, + "epoch": 0.17501878851645875, + "flos": 28511133310080.0, + "grad_norm": 2.292913404162713, + "language_loss": 0.86975527, + "learning_rate": 3.7834165670212645e-06, + "loss": 0.89191628, + "num_input_tokens_seen": 62985020, + "router_z_loss_clip": 1.04150391, + "router_z_loss_mlp": 0.2019043, + "step": 2911, + "time_per_iteration": 2.5065550804138184 + }, + { + "auxiliary_loss_clip": 0.01152832, + "auxiliary_loss_mlp": 0.01047899, + "balance_loss_clip": 1.05013251, + "balance_loss_mlp": 1.02831256, + "epoch": 0.1750789117691267, + "flos": 17931203902080.0, + "grad_norm": 2.1972029220293035, + "language_loss": 0.90193516, + "learning_rate": 3.7832402587311764e-06, + "loss": 0.92394245, + "num_input_tokens_seen": 63001745, + "router_z_loss_clip": 1.02587891, + "router_z_loss_mlp": 0.19580078, + "step": 2912, + "time_per_iteration": 2.4813790321350098 + }, + { + "auxiliary_loss_clip": 0.01158993, + "auxiliary_loss_mlp": 0.0104352, + "balance_loss_clip": 1.05501842, + "balance_loss_mlp": 1.02281356, + "epoch": 0.17513903502179468, + "flos": 18259427404800.0, + "grad_norm": 1.8771232637614654, + "language_loss": 0.72259575, + "learning_rate": 3.783063882820439e-06, + "loss": 0.74462086, + "num_input_tokens_seen": 63019750, + "router_z_loss_clip": 1.04199219, + "router_z_loss_mlp": 0.20703125, + "step": 2913, + "time_per_iteration": 2.414123773574829 + }, + { + "auxiliary_loss_clip": 0.01166197, + "auxiliary_loss_mlp": 0.01045714, + "balance_loss_clip": 1.06076825, + "balance_loss_mlp": 1.02690244, + "epoch": 0.17519915827446264, + "flos": 20704728562560.0, + "grad_norm": 1.727522568290937, + "language_loss": 0.69099939, + "learning_rate": 3.782887439295741e-06, + "loss": 0.71311849, + "num_input_tokens_seen": 63039500, + "router_z_loss_clip": 1.05419922, + "router_z_loss_mlp": 0.18811035, + "step": 2914, + "time_per_iteration": 2.4804112911224365 + }, + { + "auxiliary_loss_clip": 0.01167297, + "auxiliary_loss_mlp": 0.01044299, + "balance_loss_clip": 1.06382918, + "balance_loss_mlp": 1.02620316, + "epoch": 0.1752592815271306, + "flos": 20523415685760.0, + "grad_norm": 1.9995445461711623, + "language_loss": 0.93340147, + "learning_rate": 3.782710928163772e-06, + "loss": 0.95551741, + "num_input_tokens_seen": 63059785, + "router_z_loss_clip": 1.03466797, + "router_z_loss_mlp": 0.18115234, + "step": 2915, + "time_per_iteration": 2.5034549236297607 + }, + { + "auxiliary_loss_clip": 0.01154812, + "auxiliary_loss_mlp": 0.01047248, + "balance_loss_clip": 1.0570159, + "balance_loss_mlp": 1.02832913, + "epoch": 0.1753194047797986, + "flos": 21799178012160.0, + "grad_norm": 1.8084175474358193, + "language_loss": 0.80662262, + "learning_rate": 3.782534349431226e-06, + "loss": 0.8286432, + "num_input_tokens_seen": 63079385, + "router_z_loss_clip": 0.97851562, + "router_z_loss_mlp": 0.18933105, + "step": 2916, + "time_per_iteration": 2.539243698120117 + }, + { + "auxiliary_loss_clip": 0.01158655, + "auxiliary_loss_mlp": 0.01057097, + "balance_loss_clip": 1.05463243, + "balance_loss_mlp": 1.0391326, + "epoch": 0.17537952803246656, + "flos": 20668351063680.0, + "grad_norm": 1.775009657881267, + "language_loss": 0.74121821, + "learning_rate": 3.782357703104799e-06, + "loss": 0.76337576, + "num_input_tokens_seen": 63098970, + "router_z_loss_clip": 1.04101562, + "router_z_loss_mlp": 0.1796875, + "step": 2917, + "time_per_iteration": 2.439239740371704 + }, + { + "auxiliary_loss_clip": 0.01154468, + "auxiliary_loss_mlp": 0.01051249, + "balance_loss_clip": 1.05409884, + "balance_loss_mlp": 1.03157926, + "epoch": 0.17543965128513453, + "flos": 23295072839040.0, + "grad_norm": 2.262003765164305, + "language_loss": 0.77610612, + "learning_rate": 3.7821809891911897e-06, + "loss": 0.79816329, + "num_input_tokens_seen": 63118750, + "router_z_loss_clip": 1.00341797, + "router_z_loss_mlp": 0.19677734, + "step": 2918, + "time_per_iteration": 2.485947608947754 + }, + { + "auxiliary_loss_clip": 0.01171077, + "auxiliary_loss_mlp": 0.01043276, + "balance_loss_clip": 1.0645597, + "balance_loss_mlp": 1.02401209, + "epoch": 0.1754997745378025, + "flos": 29095615416960.0, + "grad_norm": 2.1732836295578775, + "language_loss": 0.73883742, + "learning_rate": 3.782004207697098e-06, + "loss": 0.76098096, + "num_input_tokens_seen": 63136865, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.19274902, + "step": 2919, + "time_per_iteration": 2.662188768386841 + }, + { + "auxiliary_loss_clip": 0.01164364, + "auxiliary_loss_mlp": 0.01049084, + "balance_loss_clip": 1.05698371, + "balance_loss_mlp": 1.03134537, + "epoch": 0.17555989779047046, + "flos": 30371844620160.0, + "grad_norm": 2.951286007992475, + "language_loss": 0.74582911, + "learning_rate": 3.781827358629228e-06, + "loss": 0.76796365, + "num_input_tokens_seen": 63158325, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.17736816, + "step": 2920, + "time_per_iteration": 2.6837270259857178 + }, + { + "auxiliary_loss_clip": 0.0115848, + "auxiliary_loss_mlp": 0.01039101, + "balance_loss_clip": 1.0563544, + "balance_loss_mlp": 1.02146971, + "epoch": 0.17562002104313842, + "flos": 23287746464640.0, + "grad_norm": 2.7014630338053207, + "language_loss": 0.79671407, + "learning_rate": 3.7816504419942873e-06, + "loss": 0.81868988, + "num_input_tokens_seen": 63173115, + "router_z_loss_clip": 1.02099609, + "router_z_loss_mlp": 0.17651367, + "step": 2921, + "time_per_iteration": 2.4308855533599854 + }, + { + "auxiliary_loss_clip": 0.01166975, + "auxiliary_loss_mlp": 0.01046499, + "balance_loss_clip": 1.05800998, + "balance_loss_mlp": 1.02697253, + "epoch": 0.1756801442958064, + "flos": 24790500789120.0, + "grad_norm": 1.7379027829020133, + "language_loss": 0.87796587, + "learning_rate": 3.7814734577989823e-06, + "loss": 0.90010065, + "num_input_tokens_seen": 63192880, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.1953125, + "step": 2922, + "time_per_iteration": 2.5210440158843994 + }, + { + "auxiliary_loss_clip": 0.01167563, + "auxiliary_loss_mlp": 0.01045344, + "balance_loss_clip": 1.06376886, + "balance_loss_mlp": 1.02659225, + "epoch": 0.17574026754847438, + "flos": 25771651764480.0, + "grad_norm": 4.7727882714098895, + "language_loss": 0.62522554, + "learning_rate": 3.7812964060500253e-06, + "loss": 0.6473546, + "num_input_tokens_seen": 63214395, + "router_z_loss_clip": 1.03613281, + "router_z_loss_mlp": 0.1875, + "step": 2923, + "time_per_iteration": 2.494910717010498 + }, + { + "auxiliary_loss_clip": 0.01163863, + "auxiliary_loss_mlp": 0.01047394, + "balance_loss_clip": 1.05793488, + "balance_loss_mlp": 1.02749813, + "epoch": 0.17580039080114235, + "flos": 17456608477440.0, + "grad_norm": 2.9807263121852596, + "language_loss": 0.8069706, + "learning_rate": 3.78111928675413e-06, + "loss": 0.8290832, + "num_input_tokens_seen": 63231020, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.19897461, + "step": 2924, + "time_per_iteration": 2.4837019443511963 + }, + { + "auxiliary_loss_clip": 0.01162525, + "auxiliary_loss_mlp": 0.01062136, + "balance_loss_clip": 1.05471754, + "balance_loss_mlp": 1.04158401, + "epoch": 0.1758605140538103, + "flos": 14864648088960.0, + "grad_norm": 4.644746481503436, + "language_loss": 0.70905274, + "learning_rate": 3.7809420999180126e-06, + "loss": 0.73129934, + "num_input_tokens_seen": 63246245, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.20556641, + "step": 2925, + "time_per_iteration": 2.4054925441741943 + }, + { + "auxiliary_loss_clip": 0.01157559, + "auxiliary_loss_mlp": 0.01047955, + "balance_loss_clip": 1.05631566, + "balance_loss_mlp": 1.03009748, + "epoch": 0.17592063730647828, + "flos": 23004268329600.0, + "grad_norm": 2.467607909309787, + "language_loss": 0.72030783, + "learning_rate": 3.7807648455483934e-06, + "loss": 0.74236298, + "num_input_tokens_seen": 63267790, + "router_z_loss_clip": 1.01171875, + "router_z_loss_mlp": 0.1784668, + "step": 2926, + "time_per_iteration": 2.5679519176483154 + }, + { + "auxiliary_loss_clip": 0.01158979, + "auxiliary_loss_mlp": 0.01043209, + "balance_loss_clip": 1.05341911, + "balance_loss_mlp": 1.02233493, + "epoch": 0.17598076055914624, + "flos": 20741501111040.0, + "grad_norm": 3.1186377490180397, + "language_loss": 0.85122454, + "learning_rate": 3.7805875236519918e-06, + "loss": 0.87324643, + "num_input_tokens_seen": 63286830, + "router_z_loss_clip": 1.05517578, + "router_z_loss_mlp": 0.20898438, + "step": 2927, + "time_per_iteration": 2.4626970291137695 + }, + { + "auxiliary_loss_clip": 0.01154645, + "auxiliary_loss_mlp": 0.01050846, + "balance_loss_clip": 1.05260122, + "balance_loss_mlp": 1.03309608, + "epoch": 0.1760408838118142, + "flos": 34092441227520.0, + "grad_norm": 2.0028815635692685, + "language_loss": 0.72080386, + "learning_rate": 3.7804101342355336e-06, + "loss": 0.74285877, + "num_input_tokens_seen": 63308870, + "router_z_loss_clip": 1.02001953, + "router_z_loss_mlp": 0.17749023, + "step": 2928, + "time_per_iteration": 2.6317694187164307 + }, + { + "auxiliary_loss_clip": 0.01162064, + "auxiliary_loss_mlp": 0.0104834, + "balance_loss_clip": 1.0610379, + "balance_loss_mlp": 1.02921832, + "epoch": 0.1761010070644822, + "flos": 24168384207360.0, + "grad_norm": 5.242391882850927, + "language_loss": 0.83296907, + "learning_rate": 3.780232677305744e-06, + "loss": 0.85507309, + "num_input_tokens_seen": 63329005, + "router_z_loss_clip": 1.01074219, + "router_z_loss_mlp": 0.19104004, + "step": 2929, + "time_per_iteration": 2.5071520805358887 + }, + { + "auxiliary_loss_clip": 0.01157201, + "auxiliary_loss_mlp": 0.01041772, + "balance_loss_clip": 1.05317497, + "balance_loss_mlp": 1.02359247, + "epoch": 0.17616113031715017, + "flos": 26576697335040.0, + "grad_norm": 1.57930475217738, + "language_loss": 0.79345042, + "learning_rate": 3.7800551528693535e-06, + "loss": 0.81544012, + "num_input_tokens_seen": 63349390, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.18188477, + "step": 2930, + "time_per_iteration": 2.486985921859741 + }, + { + "auxiliary_loss_clip": 0.01170531, + "auxiliary_loss_mlp": 0.01045844, + "balance_loss_clip": 1.06576717, + "balance_loss_mlp": 1.02610302, + "epoch": 0.17622125356981813, + "flos": 25666685245440.0, + "grad_norm": 2.652854837864945, + "language_loss": 0.76610792, + "learning_rate": 3.7798775609330927e-06, + "loss": 0.78827167, + "num_input_tokens_seen": 63368835, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.1973877, + "step": 2931, + "time_per_iteration": 2.4676554203033447 + }, + { + "auxiliary_loss_clip": 0.01167719, + "auxiliary_loss_mlp": 0.0104122, + "balance_loss_clip": 1.06253982, + "balance_loss_mlp": 1.02319551, + "epoch": 0.1762813768224861, + "flos": 16508530949760.0, + "grad_norm": 2.2702072799357285, + "language_loss": 0.74827248, + "learning_rate": 3.779699901503696e-06, + "loss": 0.7703619, + "num_input_tokens_seen": 63385220, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.18029785, + "step": 2932, + "time_per_iteration": 3.8718316555023193 + }, + { + "auxiliary_loss_clip": 0.01167787, + "auxiliary_loss_mlp": 0.01046731, + "balance_loss_clip": 1.05756676, + "balance_loss_mlp": 1.02662039, + "epoch": 0.17634150007515406, + "flos": 11211850402560.0, + "grad_norm": 6.524046418725844, + "language_loss": 0.89814758, + "learning_rate": 3.7795221745879016e-06, + "loss": 0.92029285, + "num_input_tokens_seen": 63400865, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.20117188, + "step": 2933, + "time_per_iteration": 2.4344825744628906 + }, + { + "auxiliary_loss_clip": 0.0116099, + "auxiliary_loss_mlp": 0.01060621, + "balance_loss_clip": 1.05791724, + "balance_loss_mlp": 1.04034352, + "epoch": 0.17640162332782203, + "flos": 23659925235840.0, + "grad_norm": 1.8468672425101964, + "language_loss": 0.88254154, + "learning_rate": 3.779344380192448e-06, + "loss": 0.90475762, + "num_input_tokens_seen": 63421390, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.20275879, + "step": 2934, + "time_per_iteration": 2.5397582054138184 + }, + { + "auxiliary_loss_clip": 0.01157154, + "auxiliary_loss_mlp": 0.01044649, + "balance_loss_clip": 1.05651045, + "balance_loss_mlp": 1.02666044, + "epoch": 0.17646174658049, + "flos": 53796984606720.0, + "grad_norm": 1.608603825778404, + "language_loss": 0.70855701, + "learning_rate": 3.779166518324077e-06, + "loss": 0.73057508, + "num_input_tokens_seen": 63444715, + "router_z_loss_clip": 1.00585938, + "router_z_loss_mlp": 0.18005371, + "step": 2935, + "time_per_iteration": 4.09400200843811 + }, + { + "auxiliary_loss_clip": 0.01164532, + "auxiliary_loss_mlp": 0.01040903, + "balance_loss_clip": 1.05519581, + "balance_loss_mlp": 1.02204359, + "epoch": 0.17652186983315798, + "flos": 24243868638720.0, + "grad_norm": 2.2858699764181734, + "language_loss": 0.69869924, + "learning_rate": 3.7789885889895325e-06, + "loss": 0.72075361, + "num_input_tokens_seen": 63465525, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.18847656, + "step": 2936, + "time_per_iteration": 2.4508750438690186 + }, + { + "auxiliary_loss_clip": 0.01178193, + "auxiliary_loss_mlp": 0.01041413, + "balance_loss_clip": 1.07303178, + "balance_loss_mlp": 1.02410412, + "epoch": 0.17658199308582595, + "flos": 27454282421760.0, + "grad_norm": 4.197220321473445, + "language_loss": 0.71710885, + "learning_rate": 3.7788105921955634e-06, + "loss": 0.73930484, + "num_input_tokens_seen": 63485815, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.17297363, + "step": 2937, + "time_per_iteration": 2.5726001262664795 + }, + { + "auxiliary_loss_clip": 0.01163726, + "auxiliary_loss_mlp": 0.01043971, + "balance_loss_clip": 1.05793023, + "balance_loss_mlp": 1.02384853, + "epoch": 0.17664211633849392, + "flos": 22418672901120.0, + "grad_norm": 2.3159833047957683, + "language_loss": 0.75600302, + "learning_rate": 3.7786325279489184e-06, + "loss": 0.77807999, + "num_input_tokens_seen": 63503905, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.20129395, + "step": 2938, + "time_per_iteration": 3.8443310260772705 + }, + { + "auxiliary_loss_clip": 0.0116376, + "auxiliary_loss_mlp": 0.01040715, + "balance_loss_clip": 1.05904675, + "balance_loss_mlp": 1.0227387, + "epoch": 0.17670223959116188, + "flos": 24715124098560.0, + "grad_norm": 2.4070079701384812, + "language_loss": 0.70750576, + "learning_rate": 3.7784543962563495e-06, + "loss": 0.72955048, + "num_input_tokens_seen": 63521985, + "router_z_loss_clip": 1.04541016, + "router_z_loss_mlp": 0.17980957, + "step": 2939, + "time_per_iteration": 2.5084266662597656 + }, + { + "auxiliary_loss_clip": 0.01161791, + "auxiliary_loss_mlp": 0.01043937, + "balance_loss_clip": 1.05785453, + "balance_loss_mlp": 1.02530432, + "epoch": 0.17676236284382985, + "flos": 22527051212160.0, + "grad_norm": 2.5281742435313426, + "language_loss": 0.74059761, + "learning_rate": 3.7782761971246115e-06, + "loss": 0.7626549, + "num_input_tokens_seen": 63539830, + "router_z_loss_clip": 1.04101562, + "router_z_loss_mlp": 0.18652344, + "step": 2940, + "time_per_iteration": 2.493492603302002 + }, + { + "auxiliary_loss_clip": 0.01167406, + "auxiliary_loss_mlp": 0.01046125, + "balance_loss_clip": 1.06199157, + "balance_loss_mlp": 1.02599096, + "epoch": 0.1768224860964978, + "flos": 12385160161920.0, + "grad_norm": 2.133663064889865, + "language_loss": 0.85664755, + "learning_rate": 3.7780979305604616e-06, + "loss": 0.87878287, + "num_input_tokens_seen": 63555495, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.20129395, + "step": 2941, + "time_per_iteration": 2.613586902618408 + }, + { + "auxiliary_loss_clip": 0.01169399, + "auxiliary_loss_mlp": 0.01041013, + "balance_loss_clip": 1.06284404, + "balance_loss_mlp": 1.02248764, + "epoch": 0.1768826093491658, + "flos": 24353360271360.0, + "grad_norm": 2.2227215507325133, + "language_loss": 0.77117932, + "learning_rate": 3.7779195965706607e-06, + "loss": 0.7932834, + "num_input_tokens_seen": 63575290, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.18505859, + "step": 2942, + "time_per_iteration": 3.9109795093536377 + }, + { + "auxiliary_loss_clip": 0.01162868, + "auxiliary_loss_mlp": 0.01045063, + "balance_loss_clip": 1.05745292, + "balance_loss_mlp": 1.02600169, + "epoch": 0.17694273260183377, + "flos": 23587062497280.0, + "grad_norm": 1.863912076954168, + "language_loss": 0.8067162, + "learning_rate": 3.77774119516197e-06, + "loss": 0.82879555, + "num_input_tokens_seen": 63594670, + "router_z_loss_clip": 1.05322266, + "router_z_loss_mlp": 0.19055176, + "step": 2943, + "time_per_iteration": 2.4930195808410645 + }, + { + "auxiliary_loss_clip": 0.01158915, + "auxiliary_loss_mlp": 0.01053385, + "balance_loss_clip": 1.0519557, + "balance_loss_mlp": 1.03309536, + "epoch": 0.17700285585450173, + "flos": 26760991040640.0, + "grad_norm": 2.0969601664942235, + "language_loss": 0.80949366, + "learning_rate": 3.777562726341155e-06, + "loss": 0.83161664, + "num_input_tokens_seen": 63614780, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.20288086, + "step": 2944, + "time_per_iteration": 2.5315628051757812 + }, + { + "auxiliary_loss_clip": 0.01158408, + "auxiliary_loss_mlp": 0.01057859, + "balance_loss_clip": 1.05399394, + "balance_loss_mlp": 1.0391314, + "epoch": 0.1770629791071697, + "flos": 42776323320960.0, + "grad_norm": 3.1322546760837535, + "language_loss": 0.73782271, + "learning_rate": 3.7773841901149835e-06, + "loss": 0.75998533, + "num_input_tokens_seen": 63637190, + "router_z_loss_clip": 1.04394531, + "router_z_loss_mlp": 0.18713379, + "step": 2945, + "time_per_iteration": 2.7149300575256348 + }, + { + "auxiliary_loss_clip": 0.01166832, + "auxiliary_loss_mlp": 0.01046399, + "balance_loss_clip": 1.06552136, + "balance_loss_mlp": 1.0284586, + "epoch": 0.17712310235983766, + "flos": 17345572560000.0, + "grad_norm": 3.9280425948358086, + "language_loss": 0.77936053, + "learning_rate": 3.7772055864902256e-06, + "loss": 0.80149287, + "num_input_tokens_seen": 63652140, + "router_z_loss_clip": 1.01416016, + "router_z_loss_mlp": 0.17944336, + "step": 2946, + "time_per_iteration": 2.452354907989502 + }, + { + "auxiliary_loss_clip": 0.01160568, + "auxiliary_loss_mlp": 0.01047165, + "balance_loss_clip": 1.05748379, + "balance_loss_mlp": 1.02879524, + "epoch": 0.17718322561250563, + "flos": 23878477537920.0, + "grad_norm": 1.864009188605328, + "language_loss": 0.76127565, + "learning_rate": 3.7770269154736535e-06, + "loss": 0.78335297, + "num_input_tokens_seen": 63671700, + "router_z_loss_clip": 1.02929688, + "router_z_loss_mlp": 0.18371582, + "step": 2947, + "time_per_iteration": 2.532858371734619 + }, + { + "auxiliary_loss_clip": 0.01155078, + "auxiliary_loss_mlp": 0.01047254, + "balance_loss_clip": 1.05175734, + "balance_loss_mlp": 1.02813315, + "epoch": 0.1772433488651736, + "flos": 36466352104320.0, + "grad_norm": 2.4478999473786827, + "language_loss": 0.72427523, + "learning_rate": 3.7768481770720424e-06, + "loss": 0.74629861, + "num_input_tokens_seen": 63691685, + "router_z_loss_clip": 1.03222656, + "router_z_loss_mlp": 0.19116211, + "step": 2948, + "time_per_iteration": 2.5736148357391357 + }, + { + "auxiliary_loss_clip": 0.01159069, + "auxiliary_loss_mlp": 0.0104409, + "balance_loss_clip": 1.0568496, + "balance_loss_mlp": 1.0257082, + "epoch": 0.1773034721178416, + "flos": 26684716510080.0, + "grad_norm": 2.0207673082571587, + "language_loss": 0.82015705, + "learning_rate": 3.776669371292171e-06, + "loss": 0.8421886, + "num_input_tokens_seen": 63711720, + "router_z_loss_clip": 1.02197266, + "router_z_loss_mlp": 0.18383789, + "step": 2949, + "time_per_iteration": 2.4957776069641113 + }, + { + "auxiliary_loss_clip": 0.01082487, + "auxiliary_loss_mlp": 0.01028739, + "balance_loss_clip": 1.04234052, + "balance_loss_mlp": 1.02541339, + "epoch": 0.17736359537050955, + "flos": 57117467617920.0, + "grad_norm": 0.7828372037287521, + "language_loss": 0.65048438, + "learning_rate": 3.7764904981408186e-06, + "loss": 0.67159665, + "num_input_tokens_seen": 63776280, + "router_z_loss_clip": 0.40136719, + "router_z_loss_mlp": 0.03323364, + "step": 2950, + "time_per_iteration": 3.1506900787353516 + }, + { + "auxiliary_loss_clip": 0.01151185, + "auxiliary_loss_mlp": 0.01044187, + "balance_loss_clip": 1.05142164, + "balance_loss_mlp": 1.02558994, + "epoch": 0.17742371862317752, + "flos": 27198203385600.0, + "grad_norm": 1.8899806383959845, + "language_loss": 0.84002805, + "learning_rate": 3.7763115576247686e-06, + "loss": 0.86198181, + "num_input_tokens_seen": 63797535, + "router_z_loss_clip": 0.99853516, + "router_z_loss_mlp": 0.18603516, + "step": 2951, + "time_per_iteration": 2.5563478469848633 + }, + { + "auxiliary_loss_clip": 0.01160606, + "auxiliary_loss_mlp": 0.01048081, + "balance_loss_clip": 1.05281866, + "balance_loss_mlp": 1.02912688, + "epoch": 0.17748384187584548, + "flos": 20959694277120.0, + "grad_norm": 2.4350498414510087, + "language_loss": 0.79960549, + "learning_rate": 3.776132549750806e-06, + "loss": 0.82169241, + "num_input_tokens_seen": 63817045, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.1895752, + "step": 2952, + "time_per_iteration": 2.430652618408203 + }, + { + "auxiliary_loss_clip": 0.0117563, + "auxiliary_loss_mlp": 0.01049439, + "balance_loss_clip": 1.06806779, + "balance_loss_mlp": 1.0300076, + "epoch": 0.17754396512851345, + "flos": 25009986844800.0, + "grad_norm": 3.0317882534039886, + "language_loss": 0.79453063, + "learning_rate": 3.7759534745257194e-06, + "loss": 0.81678128, + "num_input_tokens_seen": 63837665, + "router_z_loss_clip": 1.07666016, + "router_z_loss_mlp": 0.19421387, + "step": 2953, + "time_per_iteration": 2.50266695022583 + }, + { + "auxiliary_loss_clip": 0.01169341, + "auxiliary_loss_mlp": 0.01046029, + "balance_loss_clip": 1.06496429, + "balance_loss_mlp": 1.02842128, + "epoch": 0.1776040883811814, + "flos": 32051566275840.0, + "grad_norm": 3.7087776017671623, + "language_loss": 0.87805724, + "learning_rate": 3.7757743319562994e-06, + "loss": 0.90021098, + "num_input_tokens_seen": 63858455, + "router_z_loss_clip": 1.04296875, + "router_z_loss_mlp": 0.17602539, + "step": 2954, + "time_per_iteration": 2.5297939777374268 + }, + { + "auxiliary_loss_clip": 0.01165486, + "auxiliary_loss_mlp": 0.01052746, + "balance_loss_clip": 1.06218159, + "balance_loss_mlp": 1.03386378, + "epoch": 0.17766421163384938, + "flos": 21574125348480.0, + "grad_norm": 1.8543408711837652, + "language_loss": 0.85307062, + "learning_rate": 3.7755951220493386e-06, + "loss": 0.8752529, + "num_input_tokens_seen": 63876935, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.18896484, + "step": 2955, + "time_per_iteration": 2.509911298751831 + }, + { + "auxiliary_loss_clip": 0.01164136, + "auxiliary_loss_mlp": 0.01041106, + "balance_loss_clip": 1.05907774, + "balance_loss_mlp": 1.02320051, + "epoch": 0.17772433488651737, + "flos": 22419319345920.0, + "grad_norm": 1.7071738643874381, + "language_loss": 0.71375436, + "learning_rate": 3.7754158448116327e-06, + "loss": 0.73580676, + "num_input_tokens_seen": 63896815, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.17907715, + "step": 2956, + "time_per_iteration": 2.46793532371521 + }, + { + "auxiliary_loss_clip": 0.01158195, + "auxiliary_loss_mlp": 0.01063432, + "balance_loss_clip": 1.05666828, + "balance_loss_mlp": 1.04323769, + "epoch": 0.17778445813918534, + "flos": 25629445820160.0, + "grad_norm": 2.8884957263680464, + "language_loss": 0.83078068, + "learning_rate": 3.7752365002499795e-06, + "loss": 0.85299689, + "num_input_tokens_seen": 63916140, + "router_z_loss_clip": 1.01513672, + "router_z_loss_mlp": 0.20202637, + "step": 2957, + "time_per_iteration": 2.512965202331543 + }, + { + "auxiliary_loss_clip": 0.01159924, + "auxiliary_loss_mlp": 0.01042218, + "balance_loss_clip": 1.05854774, + "balance_loss_mlp": 1.02487302, + "epoch": 0.1778445813918533, + "flos": 25628871202560.0, + "grad_norm": 1.6473335859542246, + "language_loss": 0.74962431, + "learning_rate": 3.7750570883711807e-06, + "loss": 0.77164567, + "num_input_tokens_seen": 63935220, + "router_z_loss_clip": 1.01416016, + "router_z_loss_mlp": 0.17333984, + "step": 2958, + "time_per_iteration": 2.4912893772125244 + }, + { + "auxiliary_loss_clip": 0.01169877, + "auxiliary_loss_mlp": 0.01046939, + "balance_loss_clip": 1.06557441, + "balance_loss_mlp": 1.02935576, + "epoch": 0.17790470464452127, + "flos": 22345522853760.0, + "grad_norm": 2.716065553994003, + "language_loss": 0.80145043, + "learning_rate": 3.7748776091820397e-06, + "loss": 0.82361859, + "num_input_tokens_seen": 63954550, + "router_z_loss_clip": 1.04394531, + "router_z_loss_mlp": 0.17590332, + "step": 2959, + "time_per_iteration": 2.506988763809204 + }, + { + "auxiliary_loss_clip": 0.01164905, + "auxiliary_loss_mlp": 0.01057954, + "balance_loss_clip": 1.05664372, + "balance_loss_mlp": 1.03773642, + "epoch": 0.17796482789718923, + "flos": 18765875214720.0, + "grad_norm": 2.924802665187508, + "language_loss": 0.51912677, + "learning_rate": 3.774698062689362e-06, + "loss": 0.54135537, + "num_input_tokens_seen": 63972425, + "router_z_loss_clip": 1.08251953, + "router_z_loss_mlp": 0.20227051, + "step": 2960, + "time_per_iteration": 2.5009982585906982 + }, + { + "auxiliary_loss_clip": 0.01162536, + "auxiliary_loss_mlp": 0.01050134, + "balance_loss_clip": 1.05734336, + "balance_loss_mlp": 1.031883, + "epoch": 0.1780249511498572, + "flos": 23440941970560.0, + "grad_norm": 2.0336786113883956, + "language_loss": 0.89444005, + "learning_rate": 3.7745184488999548e-06, + "loss": 0.91656673, + "num_input_tokens_seen": 63992165, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.18261719, + "step": 2961, + "time_per_iteration": 2.5423314571380615 + }, + { + "auxiliary_loss_clip": 0.0116795, + "auxiliary_loss_mlp": 0.0105209, + "balance_loss_clip": 1.06017852, + "balance_loss_mlp": 1.03227735, + "epoch": 0.1780850744025252, + "flos": 23367468700800.0, + "grad_norm": 1.78586261073792, + "language_loss": 0.78762507, + "learning_rate": 3.774338767820631e-06, + "loss": 0.80982548, + "num_input_tokens_seen": 64013470, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.19812012, + "step": 2962, + "time_per_iteration": 2.463353157043457 + }, + { + "auxiliary_loss_clip": 0.01169759, + "auxiliary_loss_mlp": 0.01048282, + "balance_loss_clip": 1.06552315, + "balance_loss_mlp": 1.02941084, + "epoch": 0.17814519765519315, + "flos": 13771994319360.0, + "grad_norm": 2.5585161246798758, + "language_loss": 0.74646062, + "learning_rate": 3.774159019458203e-06, + "loss": 0.768641, + "num_input_tokens_seen": 64030975, + "router_z_loss_clip": 1.04199219, + "router_z_loss_mlp": 0.1887207, + "step": 2963, + "time_per_iteration": 2.4964566230773926 + }, + { + "auxiliary_loss_clip": 0.01173444, + "auxiliary_loss_mlp": 0.01047722, + "balance_loss_clip": 1.06447756, + "balance_loss_mlp": 1.02805269, + "epoch": 0.17820532090786112, + "flos": 21976396738560.0, + "grad_norm": 1.720875596259532, + "language_loss": 0.78737676, + "learning_rate": 3.7739792038194877e-06, + "loss": 0.80958837, + "num_input_tokens_seen": 64050075, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.19677734, + "step": 2964, + "time_per_iteration": 2.468442916870117 + }, + { + "auxiliary_loss_clip": 0.01164523, + "auxiliary_loss_mlp": 0.01054079, + "balance_loss_clip": 1.05946934, + "balance_loss_mlp": 1.03525591, + "epoch": 0.17826544416052909, + "flos": 24790752184320.0, + "grad_norm": 1.701353725383452, + "language_loss": 0.80890578, + "learning_rate": 3.7737993209113027e-06, + "loss": 0.83109182, + "num_input_tokens_seen": 64071920, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.18823242, + "step": 2965, + "time_per_iteration": 2.4964258670806885 + }, + { + "auxiliary_loss_clip": 0.01159989, + "auxiliary_loss_mlp": 0.0105056, + "balance_loss_clip": 1.0561738, + "balance_loss_mlp": 1.03221393, + "epoch": 0.17832556741319705, + "flos": 13879582531200.0, + "grad_norm": 2.9873681210480867, + "language_loss": 0.94834298, + "learning_rate": 3.7736193707404698e-06, + "loss": 0.97044849, + "num_input_tokens_seen": 64086835, + "router_z_loss_clip": 1.03808594, + "router_z_loss_mlp": 0.18334961, + "step": 2966, + "time_per_iteration": 2.388728141784668 + }, + { + "auxiliary_loss_clip": 0.01160533, + "auxiliary_loss_mlp": 0.01048525, + "balance_loss_clip": 1.05515885, + "balance_loss_mlp": 1.02828372, + "epoch": 0.17838569066586502, + "flos": 36641703323520.0, + "grad_norm": 2.4126945964111988, + "language_loss": 0.72883779, + "learning_rate": 3.7734393533138127e-06, + "loss": 0.75092834, + "num_input_tokens_seen": 64107360, + "router_z_loss_clip": 1.05322266, + "router_z_loss_mlp": 0.20251465, + "step": 2967, + "time_per_iteration": 2.6056439876556396 + }, + { + "auxiliary_loss_clip": 0.01158572, + "auxiliary_loss_mlp": 0.0104771, + "balance_loss_clip": 1.05796814, + "balance_loss_mlp": 1.03009117, + "epoch": 0.17844581391853298, + "flos": 18727271072640.0, + "grad_norm": 2.290759579091356, + "language_loss": 0.77302426, + "learning_rate": 3.773259268638157e-06, + "loss": 0.7950871, + "num_input_tokens_seen": 64124690, + "router_z_loss_clip": 1.00488281, + "router_z_loss_mlp": 0.17614746, + "step": 2968, + "time_per_iteration": 2.4216256141662598 + }, + { + "auxiliary_loss_clip": 0.0115759, + "auxiliary_loss_mlp": 0.010432, + "balance_loss_clip": 1.05544269, + "balance_loss_mlp": 1.02615261, + "epoch": 0.17850593717120097, + "flos": 27378259286400.0, + "grad_norm": 1.9108520394227184, + "language_loss": 0.75669622, + "learning_rate": 3.7730791167203333e-06, + "loss": 0.77870411, + "num_input_tokens_seen": 64146315, + "router_z_loss_clip": 1.02246094, + "router_z_loss_mlp": 0.17041016, + "step": 2969, + "time_per_iteration": 2.637403726577759 + }, + { + "auxiliary_loss_clip": 0.01088586, + "auxiliary_loss_mlp": 0.01022202, + "balance_loss_clip": 1.04960752, + "balance_loss_mlp": 1.01893532, + "epoch": 0.17856606042386894, + "flos": 66996025084800.0, + "grad_norm": 0.8638023445460005, + "language_loss": 0.69016528, + "learning_rate": 3.772898897567171e-06, + "loss": 0.71127313, + "num_input_tokens_seen": 64210875, + "router_z_loss_clip": 0.38964844, + "router_z_loss_mlp": 0.03262329, + "step": 2970, + "time_per_iteration": 3.1645987033843994 + }, + { + "auxiliary_loss_clip": 0.0116386, + "auxiliary_loss_mlp": 0.01048503, + "balance_loss_clip": 1.05797517, + "balance_loss_mlp": 1.02983534, + "epoch": 0.1786261836765369, + "flos": 36977001805440.0, + "grad_norm": 2.2169261971251073, + "language_loss": 0.67576969, + "learning_rate": 3.772718611185505e-06, + "loss": 0.69789326, + "num_input_tokens_seen": 64230740, + "router_z_loss_clip": 1.05712891, + "router_z_loss_mlp": 0.18664551, + "step": 2971, + "time_per_iteration": 2.6143689155578613 + }, + { + "auxiliary_loss_clip": 0.01163333, + "auxiliary_loss_mlp": 0.01044039, + "balance_loss_clip": 1.05770564, + "balance_loss_mlp": 1.02442944, + "epoch": 0.17868630692920487, + "flos": 24825441744000.0, + "grad_norm": 1.8617899828998516, + "language_loss": 0.89897323, + "learning_rate": 3.7725382575821717e-06, + "loss": 0.92104697, + "num_input_tokens_seen": 64252300, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.19628906, + "step": 2972, + "time_per_iteration": 2.4674108028411865 + }, + { + "auxiliary_loss_clip": 0.01159331, + "auxiliary_loss_mlp": 0.01055599, + "balance_loss_clip": 1.05576634, + "balance_loss_mlp": 1.03703797, + "epoch": 0.17874643018187283, + "flos": 16981977139200.0, + "grad_norm": 5.0169549260375295, + "language_loss": 0.88366348, + "learning_rate": 3.77235783676401e-06, + "loss": 0.9058128, + "num_input_tokens_seen": 64270105, + "router_z_loss_clip": 1.03466797, + "router_z_loss_mlp": 0.1854248, + "step": 2973, + "time_per_iteration": 2.465486526489258 + }, + { + "auxiliary_loss_clip": 0.01162314, + "auxiliary_loss_mlp": 0.01044708, + "balance_loss_clip": 1.06170428, + "balance_loss_mlp": 1.02708936, + "epoch": 0.1788065534345408, + "flos": 21032233793280.0, + "grad_norm": 2.7489836294591323, + "language_loss": 0.76148492, + "learning_rate": 3.7721773487378615e-06, + "loss": 0.78355515, + "num_input_tokens_seen": 64287250, + "router_z_loss_clip": 1.00683594, + "router_z_loss_mlp": 0.17626953, + "step": 2974, + "time_per_iteration": 2.4637768268585205 + }, + { + "auxiliary_loss_clip": 0.0115535, + "auxiliary_loss_mlp": 0.01046159, + "balance_loss_clip": 1.05433607, + "balance_loss_mlp": 1.02741933, + "epoch": 0.17886667668720876, + "flos": 23987717775360.0, + "grad_norm": 2.654145362008018, + "language_loss": 0.74837899, + "learning_rate": 3.7719967935105705e-06, + "loss": 0.77039409, + "num_input_tokens_seen": 64307140, + "router_z_loss_clip": 1.00976562, + "router_z_loss_mlp": 0.18725586, + "step": 2975, + "time_per_iteration": 3.988898515701294 + }, + { + "auxiliary_loss_clip": 0.01156195, + "auxiliary_loss_mlp": 0.01047147, + "balance_loss_clip": 1.05496645, + "balance_loss_mlp": 1.02915883, + "epoch": 0.17892679993987676, + "flos": 25739476156800.0, + "grad_norm": 5.648050628841725, + "language_loss": 0.73205292, + "learning_rate": 3.7718161710889833e-06, + "loss": 0.75408638, + "num_input_tokens_seen": 64328760, + "router_z_loss_clip": 1.01220703, + "router_z_loss_mlp": 0.17980957, + "step": 2976, + "time_per_iteration": 2.4771780967712402 + }, + { + "auxiliary_loss_clip": 0.01151003, + "auxiliary_loss_mlp": 0.01054018, + "balance_loss_clip": 1.05293036, + "balance_loss_mlp": 1.03692913, + "epoch": 0.17898692319254472, + "flos": 25699686865920.0, + "grad_norm": 2.0981984190429213, + "language_loss": 0.77113998, + "learning_rate": 3.7716354814799495e-06, + "loss": 0.79319024, + "num_input_tokens_seen": 64348800, + "router_z_loss_clip": 0.98144531, + "router_z_loss_mlp": 0.17059326, + "step": 2977, + "time_per_iteration": 2.4964141845703125 + }, + { + "auxiliary_loss_clip": 0.0115837, + "auxiliary_loss_mlp": 0.01048149, + "balance_loss_clip": 1.05900097, + "balance_loss_mlp": 1.03100717, + "epoch": 0.1790470464452127, + "flos": 19317786664320.0, + "grad_norm": 2.2990071942958705, + "language_loss": 0.80283678, + "learning_rate": 3.7714547246903203e-06, + "loss": 0.82490194, + "num_input_tokens_seen": 64367955, + "router_z_loss_clip": 0.99316406, + "router_z_loss_mlp": 0.17114258, + "step": 2978, + "time_per_iteration": 2.4588265419006348 + }, + { + "auxiliary_loss_clip": 0.01158591, + "auxiliary_loss_mlp": 0.01041117, + "balance_loss_clip": 1.05569577, + "balance_loss_mlp": 1.02313972, + "epoch": 0.17910716969788065, + "flos": 30044267562240.0, + "grad_norm": 1.5160465376866268, + "language_loss": 0.76457274, + "learning_rate": 3.7712739007269508e-06, + "loss": 0.78656977, + "num_input_tokens_seen": 64389805, + "router_z_loss_clip": 1.02929688, + "router_z_loss_mlp": 0.1796875, + "step": 2979, + "time_per_iteration": 4.009858131408691 + }, + { + "auxiliary_loss_clip": 0.01159618, + "auxiliary_loss_mlp": 0.01043848, + "balance_loss_clip": 1.05929589, + "balance_loss_mlp": 1.02585971, + "epoch": 0.17916729295054862, + "flos": 19427709260160.0, + "grad_norm": 1.9277626922315134, + "language_loss": 0.68658179, + "learning_rate": 3.7710930095966976e-06, + "loss": 0.70861638, + "num_input_tokens_seen": 64408220, + "router_z_loss_clip": 1.00244141, + "router_z_loss_mlp": 0.17993164, + "step": 2980, + "time_per_iteration": 2.4292032718658447 + }, + { + "auxiliary_loss_clip": 0.01165311, + "auxiliary_loss_mlp": 0.01044394, + "balance_loss_clip": 1.06158137, + "balance_loss_mlp": 1.02421153, + "epoch": 0.17922741620321658, + "flos": 14611549881600.0, + "grad_norm": 2.8432781732732346, + "language_loss": 0.7091372, + "learning_rate": 3.7709120513064196e-06, + "loss": 0.73123431, + "num_input_tokens_seen": 64426380, + "router_z_loss_clip": 1.03515625, + "router_z_loss_mlp": 0.20178223, + "step": 2981, + "time_per_iteration": 2.4263479709625244 + }, + { + "auxiliary_loss_clip": 0.01171748, + "auxiliary_loss_mlp": 0.01052628, + "balance_loss_clip": 1.06913161, + "balance_loss_mlp": 1.03466356, + "epoch": 0.17928753945588458, + "flos": 17165301177600.0, + "grad_norm": 4.166325441657953, + "language_loss": 0.81966531, + "learning_rate": 3.7707310258629796e-06, + "loss": 0.84190905, + "num_input_tokens_seen": 64444355, + "router_z_loss_clip": 1.02490234, + "router_z_loss_mlp": 0.1796875, + "step": 2982, + "time_per_iteration": 3.857269287109375 + }, + { + "auxiliary_loss_clip": 0.0115554, + "auxiliary_loss_mlp": 0.01043421, + "balance_loss_clip": 1.05582869, + "balance_loss_mlp": 1.02457368, + "epoch": 0.17934766270855254, + "flos": 31395622060800.0, + "grad_norm": 1.7393067968222262, + "language_loss": 0.82800746, + "learning_rate": 3.7705499332732413e-06, + "loss": 0.8499971, + "num_input_tokens_seen": 64467800, + "router_z_loss_clip": 0.99658203, + "router_z_loss_mlp": 0.18835449, + "step": 2983, + "time_per_iteration": 2.542217969894409 + }, + { + "auxiliary_loss_clip": 0.01154087, + "auxiliary_loss_mlp": 0.0105152, + "balance_loss_clip": 1.05154645, + "balance_loss_mlp": 1.03194547, + "epoch": 0.1794077859612205, + "flos": 20814184281600.0, + "grad_norm": 2.4745187615629334, + "language_loss": 0.854231, + "learning_rate": 3.7703687735440718e-06, + "loss": 0.87628704, + "num_input_tokens_seen": 64487230, + "router_z_loss_clip": 1.02490234, + "router_z_loss_mlp": 0.19580078, + "step": 2984, + "time_per_iteration": 2.452892541885376 + }, + { + "auxiliary_loss_clip": 0.01155879, + "auxiliary_loss_mlp": 0.01047567, + "balance_loss_clip": 1.05287266, + "balance_loss_mlp": 1.02825451, + "epoch": 0.17946790921388847, + "flos": 28986447006720.0, + "grad_norm": 1.4311249388160134, + "language_loss": 0.8945272, + "learning_rate": 3.7701875466823416e-06, + "loss": 0.91656166, + "num_input_tokens_seen": 64509165, + "router_z_loss_clip": 1.02978516, + "router_z_loss_mlp": 0.1932373, + "step": 2985, + "time_per_iteration": 2.5497689247131348 + }, + { + "auxiliary_loss_clip": 0.01155584, + "auxiliary_loss_mlp": 0.0104294, + "balance_loss_clip": 1.05745482, + "balance_loss_mlp": 1.0273118, + "epoch": 0.17952803246655644, + "flos": 20737406960640.0, + "grad_norm": 2.354865542741756, + "language_loss": 0.69479465, + "learning_rate": 3.770006252694922e-06, + "loss": 0.71677995, + "num_input_tokens_seen": 64527940, + "router_z_loss_clip": 0.98193359, + "router_z_loss_mlp": 0.15649414, + "step": 2986, + "time_per_iteration": 3.8726742267608643 + }, + { + "auxiliary_loss_clip": 0.01154907, + "auxiliary_loss_mlp": 0.01049702, + "balance_loss_clip": 1.0537945, + "balance_loss_mlp": 1.03188014, + "epoch": 0.1795881557192244, + "flos": 28255988027520.0, + "grad_norm": 2.4877910720656264, + "language_loss": 0.7765429, + "learning_rate": 3.769824891588688e-06, + "loss": 0.79858899, + "num_input_tokens_seen": 64545230, + "router_z_loss_clip": 1.01074219, + "router_z_loss_mlp": 0.17834473, + "step": 2987, + "time_per_iteration": 2.5490071773529053 + }, + { + "auxiliary_loss_clip": 0.01168912, + "auxiliary_loss_mlp": 0.01043716, + "balance_loss_clip": 1.0620625, + "balance_loss_mlp": 1.02454734, + "epoch": 0.17964827897189237, + "flos": 18552027594240.0, + "grad_norm": 2.0572160057567697, + "language_loss": 0.77801424, + "learning_rate": 3.7696434633705164e-06, + "loss": 0.80014056, + "num_input_tokens_seen": 64563820, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.19177246, + "step": 2988, + "time_per_iteration": 2.439409017562866 + }, + { + "auxiliary_loss_clip": 0.01102563, + "auxiliary_loss_mlp": 0.01010839, + "balance_loss_clip": 1.06457019, + "balance_loss_mlp": 1.00827599, + "epoch": 0.17970840222456036, + "flos": 58165088711040.0, + "grad_norm": 0.7634550664904225, + "language_loss": 0.62689477, + "learning_rate": 3.7694619680472875e-06, + "loss": 0.64802879, + "num_input_tokens_seen": 64621315, + "router_z_loss_clip": 0.37988281, + "router_z_loss_mlp": 0.02566528, + "step": 2989, + "time_per_iteration": 3.0326626300811768 + }, + { + "auxiliary_loss_clip": 0.01168222, + "auxiliary_loss_mlp": 0.01036831, + "balance_loss_clip": 1.06604004, + "balance_loss_mlp": 1.02003479, + "epoch": 0.17976852547722832, + "flos": 20300805146880.0, + "grad_norm": 3.322124132476105, + "language_loss": 0.71291625, + "learning_rate": 3.7692804056258837e-06, + "loss": 0.73496675, + "num_input_tokens_seen": 64639885, + "router_z_loss_clip": 1.02246094, + "router_z_loss_mlp": 0.16809082, + "step": 2990, + "time_per_iteration": 2.506276845932007 + }, + { + "auxiliary_loss_clip": 0.01158225, + "auxiliary_loss_mlp": 0.01038571, + "balance_loss_clip": 1.05759943, + "balance_loss_mlp": 1.02211976, + "epoch": 0.1798286487298963, + "flos": 39669367685760.0, + "grad_norm": 2.2205040781009155, + "language_loss": 0.69037968, + "learning_rate": 3.7690987761131893e-06, + "loss": 0.71234763, + "num_input_tokens_seen": 64661220, + "router_z_loss_clip": 1.00585938, + "router_z_loss_mlp": 0.16455078, + "step": 2991, + "time_per_iteration": 2.6397159099578857 + }, + { + "auxiliary_loss_clip": 0.01162634, + "auxiliary_loss_mlp": 0.0104116, + "balance_loss_clip": 1.06272769, + "balance_loss_mlp": 1.02389884, + "epoch": 0.17988877198256426, + "flos": 25520313323520.0, + "grad_norm": 1.8055318837157568, + "language_loss": 0.82791102, + "learning_rate": 3.7689170795160924e-06, + "loss": 0.84994888, + "num_input_tokens_seen": 64682530, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 0.17260742, + "step": 2992, + "time_per_iteration": 2.532520055770874 + }, + { + "auxiliary_loss_clip": 0.01154894, + "auxiliary_loss_mlp": 0.01035418, + "balance_loss_clip": 1.05851197, + "balance_loss_mlp": 1.0191164, + "epoch": 0.17994889523523222, + "flos": 18807496099200.0, + "grad_norm": 2.3974426586468187, + "language_loss": 0.82265961, + "learning_rate": 3.7687353158414822e-06, + "loss": 0.84456277, + "num_input_tokens_seen": 64701025, + "router_z_loss_clip": 0.96337891, + "router_z_loss_mlp": 0.16290283, + "step": 2993, + "time_per_iteration": 2.4523026943206787 + }, + { + "auxiliary_loss_clip": 0.01158793, + "auxiliary_loss_mlp": 0.0104036, + "balance_loss_clip": 1.05884099, + "balance_loss_mlp": 1.02411187, + "epoch": 0.18000901848790019, + "flos": 21104450087040.0, + "grad_norm": 1.6333311367642003, + "language_loss": 0.78205419, + "learning_rate": 3.7685534850962517e-06, + "loss": 0.80404574, + "num_input_tokens_seen": 64719570, + "router_z_loss_clip": 1.00097656, + "router_z_loss_mlp": 0.16247559, + "step": 2994, + "time_per_iteration": 2.439887285232544 + }, + { + "auxiliary_loss_clip": 0.01168706, + "auxiliary_loss_mlp": 0.01045413, + "balance_loss_clip": 1.06758213, + "balance_loss_mlp": 1.02905703, + "epoch": 0.18006914174056818, + "flos": 19646441130240.0, + "grad_norm": 2.9909148285041325, + "language_loss": 0.80901659, + "learning_rate": 3.768371587287296e-06, + "loss": 0.83115774, + "num_input_tokens_seen": 64738110, + "router_z_loss_clip": 1.01074219, + "router_z_loss_mlp": 0.16345215, + "step": 2995, + "time_per_iteration": 2.4657809734344482 + }, + { + "auxiliary_loss_clip": 0.01158573, + "auxiliary_loss_mlp": 0.01042036, + "balance_loss_clip": 1.05847728, + "balance_loss_mlp": 1.02687311, + "epoch": 0.18012926499323614, + "flos": 19499889640320.0, + "grad_norm": 2.070040239988586, + "language_loss": 0.84286308, + "learning_rate": 3.768189622421512e-06, + "loss": 0.86486924, + "num_input_tokens_seen": 64756345, + "router_z_loss_clip": 1.00097656, + "router_z_loss_mlp": 0.15161133, + "step": 2996, + "time_per_iteration": 2.414238929748535 + }, + { + "auxiliary_loss_clip": 0.01148234, + "auxiliary_loss_mlp": 0.01037245, + "balance_loss_clip": 1.05232048, + "balance_loss_mlp": 1.02158093, + "epoch": 0.1801893882459041, + "flos": 19464553635840.0, + "grad_norm": 1.6233001098132098, + "language_loss": 0.87910408, + "learning_rate": 3.7680075905058006e-06, + "loss": 0.90095884, + "num_input_tokens_seen": 64776375, + "router_z_loss_clip": 0.95800781, + "router_z_loss_mlp": 0.15661621, + "step": 2997, + "time_per_iteration": 2.533074378967285 + }, + { + "auxiliary_loss_clip": 0.01154336, + "auxiliary_loss_mlp": 0.01041797, + "balance_loss_clip": 1.05101049, + "balance_loss_mlp": 1.02475023, + "epoch": 0.18024951149857207, + "flos": 26870590414080.0, + "grad_norm": 1.8441853391658385, + "language_loss": 0.84852481, + "learning_rate": 3.7678254915470643e-06, + "loss": 0.87048614, + "num_input_tokens_seen": 64796210, + "router_z_loss_clip": 1.03369141, + "router_z_loss_mlp": 0.1706543, + "step": 2998, + "time_per_iteration": 2.4857771396636963 + }, + { + "auxiliary_loss_clip": 0.01154396, + "auxiliary_loss_mlp": 0.01040449, + "balance_loss_clip": 1.05686283, + "balance_loss_mlp": 1.02420056, + "epoch": 0.18030963475124004, + "flos": 30226621933440.0, + "grad_norm": 2.0449517972319184, + "language_loss": 0.84173858, + "learning_rate": 3.7676433255522084e-06, + "loss": 0.86368698, + "num_input_tokens_seen": 64818590, + "router_z_loss_clip": 0.97509766, + "router_z_loss_mlp": 0.16247559, + "step": 2999, + "time_per_iteration": 2.5163753032684326 + }, + { + "auxiliary_loss_clip": 0.01156741, + "auxiliary_loss_mlp": 0.01049016, + "balance_loss_clip": 1.05572259, + "balance_loss_mlp": 1.03144503, + "epoch": 0.180369758003908, + "flos": 22307493329280.0, + "grad_norm": 2.621151232343526, + "language_loss": 0.7526294, + "learning_rate": 3.76746109252814e-06, + "loss": 0.77468693, + "num_input_tokens_seen": 64838350, + "router_z_loss_clip": 1.01074219, + "router_z_loss_mlp": 0.17565918, + "step": 3000, + "time_per_iteration": 2.46413516998291 + }, + { + "auxiliary_loss_clip": 0.01160033, + "auxiliary_loss_mlp": 0.01057452, + "balance_loss_clip": 1.06242681, + "balance_loss_mlp": 1.04025054, + "epoch": 0.18042988125657597, + "flos": 23732033788800.0, + "grad_norm": 2.1123188522183085, + "language_loss": 0.71053851, + "learning_rate": 3.76727879248177e-06, + "loss": 0.73271334, + "num_input_tokens_seen": 64858065, + "router_z_loss_clip": 0.97558594, + "router_z_loss_mlp": 0.17211914, + "step": 3001, + "time_per_iteration": 2.489471197128296 + }, + { + "auxiliary_loss_clip": 0.01151385, + "auxiliary_loss_mlp": 0.01051006, + "balance_loss_clip": 1.0509454, + "balance_loss_mlp": 1.03198063, + "epoch": 0.18049000450924396, + "flos": 24093582134400.0, + "grad_norm": 2.2203440754895123, + "language_loss": 0.88491476, + "learning_rate": 3.767096425420011e-06, + "loss": 0.90693867, + "num_input_tokens_seen": 64877305, + "router_z_loss_clip": 1.00439453, + "router_z_loss_mlp": 0.19030762, + "step": 3002, + "time_per_iteration": 2.4661264419555664 + }, + { + "auxiliary_loss_clip": 0.01153583, + "auxiliary_loss_mlp": 0.01045277, + "balance_loss_clip": 1.05547285, + "balance_loss_mlp": 1.02909994, + "epoch": 0.18055012776191193, + "flos": 22163168482560.0, + "grad_norm": 1.8314988290692173, + "language_loss": 0.80569768, + "learning_rate": 3.7669139913497788e-06, + "loss": 0.82768625, + "num_input_tokens_seen": 64896955, + "router_z_loss_clip": 0.98144531, + "router_z_loss_mlp": 0.16174316, + "step": 3003, + "time_per_iteration": 2.4561266899108887 + }, + { + "auxiliary_loss_clip": 0.01151845, + "auxiliary_loss_mlp": 0.01049187, + "balance_loss_clip": 1.05108285, + "balance_loss_mlp": 1.03072178, + "epoch": 0.1806102510145799, + "flos": 28913512440960.0, + "grad_norm": 1.9877903217298616, + "language_loss": 0.67365992, + "learning_rate": 3.7667314902779907e-06, + "loss": 0.69567025, + "num_input_tokens_seen": 64917080, + "router_z_loss_clip": 1.0078125, + "router_z_loss_mlp": 0.18481445, + "step": 3004, + "time_per_iteration": 2.494863748550415 + }, + { + "auxiliary_loss_clip": 0.0115718, + "auxiliary_loss_mlp": 0.01045614, + "balance_loss_clip": 1.05692673, + "balance_loss_mlp": 1.02868629, + "epoch": 0.18067037426724786, + "flos": 19025689265280.0, + "grad_norm": 2.0113926936115263, + "language_loss": 0.85381019, + "learning_rate": 3.7665489222115677e-06, + "loss": 0.8758381, + "num_input_tokens_seen": 64935215, + "router_z_loss_clip": 1.00195312, + "router_z_loss_mlp": 0.16943359, + "step": 3005, + "time_per_iteration": 2.508819103240967 + }, + { + "auxiliary_loss_clip": 0.01147972, + "auxiliary_loss_mlp": 0.01045023, + "balance_loss_clip": 1.05128741, + "balance_loss_mlp": 1.02823877, + "epoch": 0.18073049751991582, + "flos": 27453635976960.0, + "grad_norm": 2.033857154917852, + "language_loss": 0.83074617, + "learning_rate": 3.766366287157432e-06, + "loss": 0.85267615, + "num_input_tokens_seen": 64956275, + "router_z_loss_clip": 0.96777344, + "router_z_loss_mlp": 0.16796875, + "step": 3006, + "time_per_iteration": 2.4951984882354736 + }, + { + "auxiliary_loss_clip": 0.01151374, + "auxiliary_loss_mlp": 0.01057548, + "balance_loss_clip": 1.04977298, + "balance_loss_mlp": 1.04029822, + "epoch": 0.1807906207725838, + "flos": 28729039167360.0, + "grad_norm": 1.7989712370605393, + "language_loss": 0.77184784, + "learning_rate": 3.7661835851225103e-06, + "loss": 0.79393709, + "num_input_tokens_seen": 64979390, + "router_z_loss_clip": 1.01464844, + "router_z_loss_mlp": 0.17236328, + "step": 3007, + "time_per_iteration": 2.5122838020324707 + }, + { + "auxiliary_loss_clip": 0.01102596, + "auxiliary_loss_mlp": 0.01039433, + "balance_loss_clip": 1.06511807, + "balance_loss_mlp": 1.03706634, + "epoch": 0.18085074402525175, + "flos": 64466515468800.0, + "grad_norm": 0.8042704671042522, + "language_loss": 0.57007658, + "learning_rate": 3.7660008161137294e-06, + "loss": 0.59149688, + "num_input_tokens_seen": 65043135, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 0.02368164, + "step": 3008, + "time_per_iteration": 3.2454662322998047 + }, + { + "auxiliary_loss_clip": 0.01151715, + "auxiliary_loss_mlp": 0.01047229, + "balance_loss_clip": 1.05404937, + "balance_loss_mlp": 1.02970529, + "epoch": 0.18091086727791975, + "flos": 23476960333440.0, + "grad_norm": 2.058299919653002, + "language_loss": 0.67362547, + "learning_rate": 3.765817980138021e-06, + "loss": 0.69561487, + "num_input_tokens_seen": 65062845, + "router_z_loss_clip": 0.97558594, + "router_z_loss_mlp": 0.17529297, + "step": 3009, + "time_per_iteration": 2.4939911365509033 + }, + { + "auxiliary_loss_clip": 0.01153373, + "auxiliary_loss_mlp": 0.01037888, + "balance_loss_clip": 1.0547719, + "balance_loss_mlp": 1.02244997, + "epoch": 0.1809709905305877, + "flos": 24170467196160.0, + "grad_norm": 2.1053317926469544, + "language_loss": 0.76048005, + "learning_rate": 3.7656350772023177e-06, + "loss": 0.78239262, + "num_input_tokens_seen": 65082110, + "router_z_loss_clip": 0.98681641, + "router_z_loss_mlp": 0.15441895, + "step": 3010, + "time_per_iteration": 2.501181125640869 + }, + { + "auxiliary_loss_clip": 0.01157383, + "auxiliary_loss_mlp": 0.01036921, + "balance_loss_clip": 1.06063962, + "balance_loss_mlp": 1.02165627, + "epoch": 0.18103111378325568, + "flos": 21650902669440.0, + "grad_norm": 1.5768938008392726, + "language_loss": 0.66890669, + "learning_rate": 3.7654521073135553e-06, + "loss": 0.69084972, + "num_input_tokens_seen": 65101985, + "router_z_loss_clip": 0.96728516, + "router_z_loss_mlp": 0.15270996, + "step": 3011, + "time_per_iteration": 2.46818470954895 + }, + { + "auxiliary_loss_clip": 0.01144245, + "auxiliary_loss_mlp": 0.0104183, + "balance_loss_clip": 1.04940653, + "balance_loss_mlp": 1.02659476, + "epoch": 0.18109123703592364, + "flos": 53686918356480.0, + "grad_norm": 1.851564174231148, + "language_loss": 0.71349299, + "learning_rate": 3.7652690704786723e-06, + "loss": 0.73535371, + "num_input_tokens_seen": 65129295, + "router_z_loss_clip": 0.94921875, + "router_z_loss_mlp": 0.15222168, + "step": 3012, + "time_per_iteration": 2.7545053958892822 + }, + { + "auxiliary_loss_clip": 0.01159508, + "auxiliary_loss_mlp": 0.01051262, + "balance_loss_clip": 1.06092167, + "balance_loss_mlp": 1.03485858, + "epoch": 0.1811513602885916, + "flos": 35845564325760.0, + "grad_norm": 2.246109380205062, + "language_loss": 0.62316114, + "learning_rate": 3.765085966704609e-06, + "loss": 0.64526886, + "num_input_tokens_seen": 65150625, + "router_z_loss_clip": 0.98583984, + "router_z_loss_mlp": 0.16394043, + "step": 3013, + "time_per_iteration": 2.5612125396728516 + }, + { + "auxiliary_loss_clip": 0.01149508, + "auxiliary_loss_mlp": 0.01050437, + "balance_loss_clip": 1.05171394, + "balance_loss_mlp": 1.03466535, + "epoch": 0.18121148354125957, + "flos": 23732572492800.0, + "grad_norm": 1.8172735843903915, + "language_loss": 0.75832117, + "learning_rate": 3.764902795998309e-06, + "loss": 0.78032064, + "num_input_tokens_seen": 65170880, + "router_z_loss_clip": 0.97753906, + "router_z_loss_mlp": 0.15771484, + "step": 3014, + "time_per_iteration": 2.5383217334747314 + }, + { + "auxiliary_loss_clip": 0.01156858, + "auxiliary_loss_mlp": 0.01047939, + "balance_loss_clip": 1.05628514, + "balance_loss_mlp": 1.02950931, + "epoch": 0.18127160679392756, + "flos": 28728320895360.0, + "grad_norm": 2.095919198194839, + "language_loss": 0.66257387, + "learning_rate": 3.7647195583667184e-06, + "loss": 0.68462181, + "num_input_tokens_seen": 65192530, + "router_z_loss_clip": 1.00537109, + "router_z_loss_mlp": 0.1842041, + "step": 3015, + "time_per_iteration": 2.516963005065918 + }, + { + "auxiliary_loss_clip": 0.01149543, + "auxiliary_loss_mlp": 0.01044114, + "balance_loss_clip": 1.05357647, + "balance_loss_mlp": 1.02651834, + "epoch": 0.18133173004659553, + "flos": 20485062938880.0, + "grad_norm": 2.303949679527855, + "language_loss": 0.77751523, + "learning_rate": 3.764536253816785e-06, + "loss": 0.79945183, + "num_input_tokens_seen": 65211675, + "router_z_loss_clip": 0.95996094, + "router_z_loss_mlp": 0.17590332, + "step": 3016, + "time_per_iteration": 2.488112211227417 + }, + { + "auxiliary_loss_clip": 0.0115927, + "auxiliary_loss_mlp": 0.01044266, + "balance_loss_clip": 1.0589149, + "balance_loss_mlp": 1.02711213, + "epoch": 0.1813918532992635, + "flos": 22852078404480.0, + "grad_norm": 2.0165441098408343, + "language_loss": 0.83675003, + "learning_rate": 3.7643528823554602e-06, + "loss": 0.85878539, + "num_input_tokens_seen": 65231185, + "router_z_loss_clip": 1.00341797, + "router_z_loss_mlp": 0.17163086, + "step": 3017, + "time_per_iteration": 2.4337143898010254 + }, + { + "auxiliary_loss_clip": 0.01144496, + "auxiliary_loss_mlp": 0.01038232, + "balance_loss_clip": 1.04822028, + "balance_loss_mlp": 1.02197218, + "epoch": 0.18145197655193146, + "flos": 36065122208640.0, + "grad_norm": 1.979175193203173, + "language_loss": 0.67186987, + "learning_rate": 3.764169443989697e-06, + "loss": 0.69369715, + "num_input_tokens_seen": 65251645, + "router_z_loss_clip": 0.96386719, + "router_z_loss_mlp": 0.16247559, + "step": 3018, + "time_per_iteration": 4.094010829925537 + }, + { + "auxiliary_loss_clip": 0.01148701, + "auxiliary_loss_mlp": 0.0103498, + "balance_loss_clip": 1.05001581, + "balance_loss_mlp": 1.01812387, + "epoch": 0.18151209980459942, + "flos": 24023951619840.0, + "grad_norm": 2.002274433988276, + "language_loss": 0.76122677, + "learning_rate": 3.7639859387264518e-06, + "loss": 0.78306353, + "num_input_tokens_seen": 65271125, + "router_z_loss_clip": 0.98779297, + "router_z_loss_mlp": 0.1685791, + "step": 3019, + "time_per_iteration": 2.4593188762664795 + }, + { + "auxiliary_loss_clip": 0.01153093, + "auxiliary_loss_mlp": 0.0104041, + "balance_loss_clip": 1.05434752, + "balance_loss_mlp": 1.02336299, + "epoch": 0.1815722230572674, + "flos": 23951627585280.0, + "grad_norm": 2.996834430120976, + "language_loss": 0.81893492, + "learning_rate": 3.7638023665726834e-06, + "loss": 0.84086996, + "num_input_tokens_seen": 65290600, + "router_z_loss_clip": 0.98730469, + "router_z_loss_mlp": 0.17028809, + "step": 3020, + "time_per_iteration": 2.487938642501831 + }, + { + "auxiliary_loss_clip": 0.01154639, + "auxiliary_loss_mlp": 0.01037187, + "balance_loss_clip": 1.05667567, + "balance_loss_mlp": 1.02006817, + "epoch": 0.18163234630993536, + "flos": 24386469632640.0, + "grad_norm": 2.3097487317929764, + "language_loss": 0.77540833, + "learning_rate": 3.763618727535352e-06, + "loss": 0.79732656, + "num_input_tokens_seen": 65311040, + "router_z_loss_clip": 0.98046875, + "router_z_loss_mlp": 0.17138672, + "step": 3021, + "time_per_iteration": 2.4613254070281982 + }, + { + "auxiliary_loss_clip": 0.01146963, + "auxiliary_loss_mlp": 0.01037014, + "balance_loss_clip": 1.04981279, + "balance_loss_mlp": 1.02040851, + "epoch": 0.18169246956260335, + "flos": 24681332378880.0, + "grad_norm": 1.8041281761431895, + "language_loss": 0.85108793, + "learning_rate": 3.763435021621422e-06, + "loss": 0.87292767, + "num_input_tokens_seen": 65332115, + "router_z_loss_clip": 0.97119141, + "router_z_loss_mlp": 0.1661377, + "step": 3022, + "time_per_iteration": 2.508436918258667 + }, + { + "auxiliary_loss_clip": 0.01154588, + "auxiliary_loss_mlp": 0.01039893, + "balance_loss_clip": 1.05309296, + "balance_loss_mlp": 1.02166581, + "epoch": 0.1817525928152713, + "flos": 24243294021120.0, + "grad_norm": 2.369095485375672, + "language_loss": 0.69612217, + "learning_rate": 3.763251248837859e-06, + "loss": 0.71806699, + "num_input_tokens_seen": 65352210, + "router_z_loss_clip": 1.01464844, + "router_z_loss_mlp": 0.18225098, + "step": 3023, + "time_per_iteration": 3.8644917011260986 + }, + { + "auxiliary_loss_clip": 0.01144303, + "auxiliary_loss_mlp": 0.01036243, + "balance_loss_clip": 1.04811835, + "balance_loss_mlp": 1.01973295, + "epoch": 0.18181271606793928, + "flos": 16472081623680.0, + "grad_norm": 1.8732819172259638, + "language_loss": 0.74284834, + "learning_rate": 3.7630674091916317e-06, + "loss": 0.7646538, + "num_input_tokens_seen": 65370600, + "router_z_loss_clip": 0.96044922, + "router_z_loss_mlp": 0.16516113, + "step": 3024, + "time_per_iteration": 2.4267427921295166 + }, + { + "auxiliary_loss_clip": 0.01148546, + "auxiliary_loss_mlp": 0.01042066, + "balance_loss_clip": 1.04872787, + "balance_loss_mlp": 1.02468514, + "epoch": 0.18187283932060724, + "flos": 18581042805120.0, + "grad_norm": 2.585880993396532, + "language_loss": 0.8834312, + "learning_rate": 3.7628835026897123e-06, + "loss": 0.90533733, + "num_input_tokens_seen": 65387270, + "router_z_loss_clip": 0.99804688, + "router_z_loss_mlp": 0.17382812, + "step": 3025, + "time_per_iteration": 2.453734874725342 + }, + { + "auxiliary_loss_clip": 0.01151283, + "auxiliary_loss_mlp": 0.01047759, + "balance_loss_clip": 1.05245256, + "balance_loss_mlp": 1.0303781, + "epoch": 0.1819329625732752, + "flos": 20266833859200.0, + "grad_norm": 2.3004001394682634, + "language_loss": 0.78928661, + "learning_rate": 3.7626995293390735e-06, + "loss": 0.81127703, + "num_input_tokens_seen": 65406550, + "router_z_loss_clip": 0.98925781, + "router_z_loss_mlp": 0.17370605, + "step": 3026, + "time_per_iteration": 3.8335800170898438 + }, + { + "auxiliary_loss_clip": 0.01151487, + "auxiliary_loss_mlp": 0.01051131, + "balance_loss_clip": 1.04941487, + "balance_loss_mlp": 1.03360724, + "epoch": 0.18199308582594317, + "flos": 25915186512000.0, + "grad_norm": 1.6386034561622431, + "language_loss": 0.76308703, + "learning_rate": 3.762515489146692e-06, + "loss": 0.78511322, + "num_input_tokens_seen": 65425955, + "router_z_loss_clip": 1.01953125, + "router_z_loss_mlp": 0.1751709, + "step": 3027, + "time_per_iteration": 2.5406174659729004 + }, + { + "auxiliary_loss_clip": 0.01153131, + "auxiliary_loss_mlp": 0.01044139, + "balance_loss_clip": 1.05072451, + "balance_loss_mlp": 1.02679455, + "epoch": 0.18205320907861114, + "flos": 15377524433280.0, + "grad_norm": 1.9728000221178106, + "language_loss": 0.85284293, + "learning_rate": 3.762331382119546e-06, + "loss": 0.87481558, + "num_input_tokens_seen": 65442820, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.17346191, + "step": 3028, + "time_per_iteration": 2.4601399898529053 + }, + { + "auxiliary_loss_clip": 0.0114933, + "auxiliary_loss_mlp": 0.01040551, + "balance_loss_clip": 1.05248618, + "balance_loss_mlp": 1.02425551, + "epoch": 0.18211333233127913, + "flos": 25624310175360.0, + "grad_norm": 2.205245381208791, + "language_loss": 0.82392156, + "learning_rate": 3.7621472082646183e-06, + "loss": 0.84582031, + "num_input_tokens_seen": 65461825, + "router_z_loss_clip": 0.96777344, + "router_z_loss_mlp": 0.16296387, + "step": 3029, + "time_per_iteration": 3.9329662322998047 + }, + { + "auxiliary_loss_clip": 0.01159669, + "auxiliary_loss_mlp": 0.01039884, + "balance_loss_clip": 1.05946541, + "balance_loss_mlp": 1.02271819, + "epoch": 0.1821734555839471, + "flos": 14976007228800.0, + "grad_norm": 2.0006750327952716, + "language_loss": 0.77946472, + "learning_rate": 3.761962967588891e-06, + "loss": 0.80146027, + "num_input_tokens_seen": 65479480, + "router_z_loss_clip": 1.00097656, + "router_z_loss_mlp": 0.17138672, + "step": 3030, + "time_per_iteration": 2.450349807739258 + }, + { + "auxiliary_loss_clip": 0.011581, + "auxiliary_loss_mlp": 0.01040313, + "balance_loss_clip": 1.05583286, + "balance_loss_mlp": 1.02307498, + "epoch": 0.18223357883661506, + "flos": 20194007034240.0, + "grad_norm": 2.779033260901093, + "language_loss": 0.85091901, + "learning_rate": 3.761778660099352e-06, + "loss": 0.87290311, + "num_input_tokens_seen": 65497775, + "router_z_loss_clip": 1.02294922, + "router_z_loss_mlp": 0.17260742, + "step": 3031, + "time_per_iteration": 2.5089187622070312 + }, + { + "auxiliary_loss_clip": 0.01149747, + "auxiliary_loss_mlp": 0.01039045, + "balance_loss_clip": 1.05092227, + "balance_loss_mlp": 1.02285635, + "epoch": 0.18229370208928303, + "flos": 15231978524160.0, + "grad_norm": 1.8875180747827893, + "language_loss": 0.80052513, + "learning_rate": 3.76159428580299e-06, + "loss": 0.82241303, + "num_input_tokens_seen": 65516505, + "router_z_loss_clip": 0.98632812, + "router_z_loss_mlp": 0.16186523, + "step": 3032, + "time_per_iteration": 2.47259783744812 + }, + { + "auxiliary_loss_clip": 0.01160664, + "auxiliary_loss_mlp": 0.01053006, + "balance_loss_clip": 1.05397177, + "balance_loss_mlp": 1.0351361, + "epoch": 0.182353825341951, + "flos": 23840483927040.0, + "grad_norm": 2.000546577916837, + "language_loss": 0.81387967, + "learning_rate": 3.761409844706795e-06, + "loss": 0.83601636, + "num_input_tokens_seen": 65536160, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.17871094, + "step": 3033, + "time_per_iteration": 2.458508014678955 + }, + { + "auxiliary_loss_clip": 0.01089235, + "auxiliary_loss_mlp": 0.0100628, + "balance_loss_clip": 1.05233502, + "balance_loss_mlp": 1.00331521, + "epoch": 0.18241394859461896, + "flos": 61190957393280.0, + "grad_norm": 0.8835532017213774, + "language_loss": 0.63489032, + "learning_rate": 3.7612253368177625e-06, + "loss": 0.65584552, + "num_input_tokens_seen": 65589375, + "router_z_loss_clip": 0.36865234, + "router_z_loss_mlp": 0.02966309, + "step": 3034, + "time_per_iteration": 3.058443307876587 + }, + { + "auxiliary_loss_clip": 0.01151106, + "auxiliary_loss_mlp": 0.01038445, + "balance_loss_clip": 1.05296469, + "balance_loss_mlp": 1.02269769, + "epoch": 0.18247407184728695, + "flos": 18471694826880.0, + "grad_norm": 1.8195304564867127, + "language_loss": 0.79485011, + "learning_rate": 3.7610407621428893e-06, + "loss": 0.81674564, + "num_input_tokens_seen": 65606720, + "router_z_loss_clip": 0.98193359, + "router_z_loss_mlp": 0.15771484, + "step": 3035, + "time_per_iteration": 2.444540500640869 + }, + { + "auxiliary_loss_clip": 0.01146635, + "auxiliary_loss_mlp": 0.0103987, + "balance_loss_clip": 1.05008054, + "balance_loss_mlp": 1.02356219, + "epoch": 0.18253419509995492, + "flos": 21795191602560.0, + "grad_norm": 2.2971590240274806, + "language_loss": 0.8499769, + "learning_rate": 3.7608561206891735e-06, + "loss": 0.87184203, + "num_input_tokens_seen": 65625495, + "router_z_loss_clip": 0.96435547, + "router_z_loss_mlp": 0.16333008, + "step": 3036, + "time_per_iteration": 2.57944917678833 + }, + { + "auxiliary_loss_clip": 0.01147987, + "auxiliary_loss_mlp": 0.01040627, + "balance_loss_clip": 1.05206084, + "balance_loss_mlp": 1.02422333, + "epoch": 0.18259431835262288, + "flos": 20149764456960.0, + "grad_norm": 2.215473204361665, + "language_loss": 0.80304116, + "learning_rate": 3.760671412463617e-06, + "loss": 0.82492733, + "num_input_tokens_seen": 65643515, + "router_z_loss_clip": 0.95947266, + "router_z_loss_mlp": 0.1640625, + "step": 3037, + "time_per_iteration": 2.423521041870117 + }, + { + "auxiliary_loss_clip": 0.01149493, + "auxiliary_loss_mlp": 0.01041783, + "balance_loss_clip": 1.05205989, + "balance_loss_mlp": 1.02391374, + "epoch": 0.18265444160529085, + "flos": 16981653916800.0, + "grad_norm": 3.2532488982898955, + "language_loss": 0.79900324, + "learning_rate": 3.7604866374732246e-06, + "loss": 0.82091606, + "num_input_tokens_seen": 65658155, + "router_z_loss_clip": 0.97412109, + "router_z_loss_mlp": 0.17858887, + "step": 3038, + "time_per_iteration": 2.5048627853393555 + }, + { + "auxiliary_loss_clip": 0.01145506, + "auxiliary_loss_mlp": 0.01039469, + "balance_loss_clip": 1.04770446, + "balance_loss_mlp": 1.02316093, + "epoch": 0.1827145648579588, + "flos": 34423250509440.0, + "grad_norm": 2.3655510510932816, + "language_loss": 0.67571032, + "learning_rate": 3.7603017957250023e-06, + "loss": 0.69756007, + "num_input_tokens_seen": 65679310, + "router_z_loss_clip": 0.97753906, + "router_z_loss_mlp": 0.16308594, + "step": 3039, + "time_per_iteration": 2.6046602725982666 + }, + { + "auxiliary_loss_clip": 0.01153127, + "auxiliary_loss_mlp": 0.01043644, + "balance_loss_clip": 1.05405962, + "balance_loss_mlp": 1.02678764, + "epoch": 0.18277468811062678, + "flos": 53287017264000.0, + "grad_norm": 1.8273681488791835, + "language_loss": 0.73843569, + "learning_rate": 3.7601168872259593e-06, + "loss": 0.76040339, + "num_input_tokens_seen": 65705235, + "router_z_loss_clip": 0.99072266, + "router_z_loss_mlp": 0.16845703, + "step": 3040, + "time_per_iteration": 2.8174619674682617 + }, + { + "auxiliary_loss_clip": 0.01164499, + "auxiliary_loss_mlp": 0.01043429, + "balance_loss_clip": 1.06239176, + "balance_loss_mlp": 1.02635813, + "epoch": 0.18283481136329474, + "flos": 31650659602560.0, + "grad_norm": 2.088471461328592, + "language_loss": 0.60538602, + "learning_rate": 3.7599319119831075e-06, + "loss": 0.62746531, + "num_input_tokens_seen": 65727575, + "router_z_loss_clip": 1.02050781, + "router_z_loss_mlp": 0.17053223, + "step": 3041, + "time_per_iteration": 2.5429630279541016 + }, + { + "auxiliary_loss_clip": 0.0115213, + "auxiliary_loss_mlp": 0.01054172, + "balance_loss_clip": 1.05353546, + "balance_loss_mlp": 1.03729236, + "epoch": 0.18289493461596273, + "flos": 53137664513280.0, + "grad_norm": 2.2109162668352376, + "language_loss": 0.59954673, + "learning_rate": 3.7597468700034616e-06, + "loss": 0.62160981, + "num_input_tokens_seen": 65751370, + "router_z_loss_clip": 0.98632812, + "router_z_loss_mlp": 0.16882324, + "step": 3042, + "time_per_iteration": 2.7561256885528564 + }, + { + "auxiliary_loss_clip": 0.01155167, + "auxiliary_loss_mlp": 0.01041444, + "balance_loss_clip": 1.05769134, + "balance_loss_mlp": 1.02512467, + "epoch": 0.1829550578686307, + "flos": 25589369220480.0, + "grad_norm": 2.68975223918603, + "language_loss": 0.87409484, + "learning_rate": 3.7595617612940374e-06, + "loss": 0.89606094, + "num_input_tokens_seen": 65771040, + "router_z_loss_clip": 0.97460938, + "router_z_loss_mlp": 0.16320801, + "step": 3043, + "time_per_iteration": 2.4963791370391846 + }, + { + "auxiliary_loss_clip": 0.0115059, + "auxiliary_loss_mlp": 0.01048759, + "balance_loss_clip": 1.04956937, + "balance_loss_mlp": 1.03108001, + "epoch": 0.18301518112129866, + "flos": 22601422321920.0, + "grad_norm": 3.2088264840816203, + "language_loss": 0.70717502, + "learning_rate": 3.7593765858618552e-06, + "loss": 0.72916847, + "num_input_tokens_seen": 65789345, + "router_z_loss_clip": 1.00878906, + "router_z_loss_mlp": 0.17663574, + "step": 3044, + "time_per_iteration": 2.508791923522949 + }, + { + "auxiliary_loss_clip": 0.01154376, + "auxiliary_loss_mlp": 0.01048269, + "balance_loss_clip": 1.04980373, + "balance_loss_mlp": 1.02967286, + "epoch": 0.18307530437396663, + "flos": 34020799551360.0, + "grad_norm": 2.214325591913864, + "language_loss": 0.64110744, + "learning_rate": 3.7591913437139365e-06, + "loss": 0.66313386, + "num_input_tokens_seen": 65810990, + "router_z_loss_clip": 1.04394531, + "router_z_loss_mlp": 0.18579102, + "step": 3045, + "time_per_iteration": 2.545560121536255 + }, + { + "auxiliary_loss_clip": 0.01152836, + "auxiliary_loss_mlp": 0.01050249, + "balance_loss_clip": 1.05400014, + "balance_loss_mlp": 1.03336954, + "epoch": 0.1831354276266346, + "flos": 21279765392640.0, + "grad_norm": 2.381534626295854, + "language_loss": 0.78819811, + "learning_rate": 3.7590060348573066e-06, + "loss": 0.81022894, + "num_input_tokens_seen": 65827230, + "router_z_loss_clip": 0.98925781, + "router_z_loss_mlp": 0.1685791, + "step": 3046, + "time_per_iteration": 2.4755945205688477 + }, + { + "auxiliary_loss_clip": 0.01159885, + "auxiliary_loss_mlp": 0.01044841, + "balance_loss_clip": 1.05672467, + "balance_loss_mlp": 1.02684045, + "epoch": 0.18319555087930256, + "flos": 21032952065280.0, + "grad_norm": 2.4232084909157625, + "language_loss": 0.79049128, + "learning_rate": 3.7588206592989903e-06, + "loss": 0.8125385, + "num_input_tokens_seen": 65845900, + "router_z_loss_clip": 1.03271484, + "router_z_loss_mlp": 0.18005371, + "step": 3047, + "time_per_iteration": 2.4683051109313965 + }, + { + "auxiliary_loss_clip": 0.01149129, + "auxiliary_loss_mlp": 0.01041139, + "balance_loss_clip": 1.0520997, + "balance_loss_mlp": 1.02428269, + "epoch": 0.18325567413197055, + "flos": 34382958428160.0, + "grad_norm": 2.040603217041269, + "language_loss": 0.8068974, + "learning_rate": 3.7586352170460194e-06, + "loss": 0.82880008, + "num_input_tokens_seen": 65868730, + "router_z_loss_clip": 0.96923828, + "router_z_loss_mlp": 0.1685791, + "step": 3048, + "time_per_iteration": 2.557957410812378 + }, + { + "auxiliary_loss_clip": 0.01150224, + "auxiliary_loss_mlp": 0.01053084, + "balance_loss_clip": 1.04876065, + "balance_loss_mlp": 1.03277087, + "epoch": 0.18331579738463852, + "flos": 20558464381440.0, + "grad_norm": 2.117073631125672, + "language_loss": 0.86434484, + "learning_rate": 3.758449708105424e-06, + "loss": 0.88637787, + "num_input_tokens_seen": 65888420, + "router_z_loss_clip": 1.01367188, + "router_z_loss_mlp": 0.20324707, + "step": 3049, + "time_per_iteration": 2.4388256072998047 + }, + { + "auxiliary_loss_clip": 0.01156047, + "auxiliary_loss_mlp": 0.01039576, + "balance_loss_clip": 1.05206048, + "balance_loss_mlp": 1.02156317, + "epoch": 0.18337592063730648, + "flos": 19607872901760.0, + "grad_norm": 2.294593202558269, + "language_loss": 0.773265, + "learning_rate": 3.75826413248424e-06, + "loss": 0.79522121, + "num_input_tokens_seen": 65905840, + "router_z_loss_clip": 1.04101562, + "router_z_loss_mlp": 0.18017578, + "step": 3050, + "time_per_iteration": 2.5174660682678223 + }, + { + "auxiliary_loss_clip": 0.01144563, + "auxiliary_loss_mlp": 0.01039987, + "balance_loss_clip": 1.04626131, + "balance_loss_mlp": 1.02338147, + "epoch": 0.18343604388997445, + "flos": 20850885002880.0, + "grad_norm": 2.0610751695333014, + "language_loss": 0.99377209, + "learning_rate": 3.7580784901895035e-06, + "loss": 1.01561761, + "num_input_tokens_seen": 65922845, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.16601562, + "step": 3051, + "time_per_iteration": 2.4783709049224854 + }, + { + "auxiliary_loss_clip": 0.0115862, + "auxiliary_loss_mlp": 0.0103472, + "balance_loss_clip": 1.05975437, + "balance_loss_mlp": 1.01769686, + "epoch": 0.1834961671426424, + "flos": 24394370624640.0, + "grad_norm": 2.3911384248014236, + "language_loss": 0.86216974, + "learning_rate": 3.7578927812282542e-06, + "loss": 0.88410318, + "num_input_tokens_seen": 65945555, + "router_z_loss_clip": 0.98828125, + "router_z_loss_mlp": 0.17028809, + "step": 3052, + "time_per_iteration": 2.7331080436706543 + }, + { + "auxiliary_loss_clip": 0.01149063, + "auxiliary_loss_mlp": 0.01043374, + "balance_loss_clip": 1.05305576, + "balance_loss_mlp": 1.02661276, + "epoch": 0.18355629039531038, + "flos": 21251612108160.0, + "grad_norm": 2.0998492279867276, + "language_loss": 0.73409814, + "learning_rate": 3.7577070056075356e-06, + "loss": 0.75602245, + "num_input_tokens_seen": 65963965, + "router_z_loss_clip": 0.96044922, + "router_z_loss_mlp": 0.16760254, + "step": 3053, + "time_per_iteration": 2.4473512172698975 + }, + { + "auxiliary_loss_clip": 0.01164088, + "auxiliary_loss_mlp": 0.01046081, + "balance_loss_clip": 1.06001246, + "balance_loss_mlp": 1.02842629, + "epoch": 0.18361641364797834, + "flos": 28656499651200.0, + "grad_norm": 1.8518683280270596, + "language_loss": 0.61483812, + "learning_rate": 3.7575211633343902e-06, + "loss": 0.63693988, + "num_input_tokens_seen": 65985965, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.17651367, + "step": 3054, + "time_per_iteration": 2.592808485031128 + }, + { + "auxiliary_loss_clip": 0.01153624, + "auxiliary_loss_mlp": 0.01041362, + "balance_loss_clip": 1.05249906, + "balance_loss_mlp": 1.02416039, + "epoch": 0.18367653690064634, + "flos": 20918827578240.0, + "grad_norm": 1.9577808014397042, + "language_loss": 0.78835517, + "learning_rate": 3.7573352544158663e-06, + "loss": 0.81030512, + "num_input_tokens_seen": 66005645, + "router_z_loss_clip": 1.01318359, + "router_z_loss_mlp": 0.17199707, + "step": 3055, + "time_per_iteration": 2.564776659011841 + }, + { + "auxiliary_loss_clip": 0.01153896, + "auxiliary_loss_mlp": 0.0104674, + "balance_loss_clip": 1.05835915, + "balance_loss_mlp": 1.0306468, + "epoch": 0.1837366601533143, + "flos": 28765596234240.0, + "grad_norm": 2.5602815368695024, + "language_loss": 0.70085192, + "learning_rate": 3.757149278859014e-06, + "loss": 0.72285831, + "num_input_tokens_seen": 66025675, + "router_z_loss_clip": 0.95556641, + "router_z_loss_mlp": 0.1607666, + "step": 3056, + "time_per_iteration": 2.530442237854004 + }, + { + "auxiliary_loss_clip": 0.0115372, + "auxiliary_loss_mlp": 0.01039227, + "balance_loss_clip": 1.04986882, + "balance_loss_mlp": 1.02257299, + "epoch": 0.18379678340598227, + "flos": 21251432540160.0, + "grad_norm": 2.3894247130174375, + "language_loss": 0.8052929, + "learning_rate": 3.7569632366708842e-06, + "loss": 0.82722235, + "num_input_tokens_seen": 66046125, + "router_z_loss_clip": 1.03955078, + "router_z_loss_mlp": 0.16650391, + "step": 3057, + "time_per_iteration": 2.4743807315826416 + }, + { + "auxiliary_loss_clip": 0.01155765, + "auxiliary_loss_mlp": 0.01043905, + "balance_loss_clip": 1.0497539, + "balance_loss_mlp": 1.02423525, + "epoch": 0.18385690665865023, + "flos": 20449619193600.0, + "grad_norm": 2.330562008834925, + "language_loss": 0.828511, + "learning_rate": 3.756777127858533e-06, + "loss": 0.85050774, + "num_input_tokens_seen": 66064375, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.19677734, + "step": 3058, + "time_per_iteration": 2.48612380027771 + }, + { + "auxiliary_loss_clip": 0.01177127, + "auxiliary_loss_mlp": 0.01046892, + "balance_loss_clip": 1.06998444, + "balance_loss_mlp": 1.02976131, + "epoch": 0.1839170299113182, + "flos": 26140562398080.0, + "grad_norm": 2.4117682193051713, + "language_loss": 0.86100364, + "learning_rate": 3.756590952429017e-06, + "loss": 0.8832438, + "num_input_tokens_seen": 66084590, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.17150879, + "step": 3059, + "time_per_iteration": 2.603208303451538 + }, + { + "auxiliary_loss_clip": 0.011568, + "auxiliary_loss_mlp": 0.01036632, + "balance_loss_clip": 1.05515802, + "balance_loss_mlp": 1.02046704, + "epoch": 0.18397715316398616, + "flos": 31758032332800.0, + "grad_norm": 1.8736150173452615, + "language_loss": 0.72955501, + "learning_rate": 3.756404710389396e-06, + "loss": 0.75148934, + "num_input_tokens_seen": 66107105, + "router_z_loss_clip": 1.01416016, + "router_z_loss_mlp": 0.16162109, + "step": 3060, + "time_per_iteration": 2.541069269180298 + }, + { + "auxiliary_loss_clip": 0.01161348, + "auxiliary_loss_mlp": 0.01036884, + "balance_loss_clip": 1.06011891, + "balance_loss_mlp": 1.01921749, + "epoch": 0.18403727641665413, + "flos": 24611989173120.0, + "grad_norm": 5.352667037296347, + "language_loss": 0.72700536, + "learning_rate": 3.7562184017467323e-06, + "loss": 0.74898762, + "num_input_tokens_seen": 66129295, + "router_z_loss_clip": 1.01269531, + "router_z_loss_mlp": 0.17663574, + "step": 3061, + "time_per_iteration": 2.545485258102417 + }, + { + "auxiliary_loss_clip": 0.0115824, + "auxiliary_loss_mlp": 0.01041863, + "balance_loss_clip": 1.05881381, + "balance_loss_mlp": 1.02424359, + "epoch": 0.18409739966932212, + "flos": 23439900476160.0, + "grad_norm": 1.7621074158570922, + "language_loss": 0.81631589, + "learning_rate": 3.7560320265080906e-06, + "loss": 0.83831692, + "num_input_tokens_seen": 66146910, + "router_z_loss_clip": 0.99560547, + "router_z_loss_mlp": 0.17626953, + "step": 3062, + "time_per_iteration": 4.013848304748535 + }, + { + "auxiliary_loss_clip": 0.01160856, + "auxiliary_loss_mlp": 0.01040095, + "balance_loss_clip": 1.05968869, + "balance_loss_mlp": 1.02278531, + "epoch": 0.18415752292199009, + "flos": 21872112577920.0, + "grad_norm": 1.9384192857862523, + "language_loss": 0.7295875, + "learning_rate": 3.7558455846805383e-06, + "loss": 0.75159705, + "num_input_tokens_seen": 66165370, + "router_z_loss_clip": 1.01123047, + "router_z_loss_mlp": 0.17297363, + "step": 3063, + "time_per_iteration": 2.488481044769287 + }, + { + "auxiliary_loss_clip": 0.01159273, + "auxiliary_loss_mlp": 0.01050388, + "balance_loss_clip": 1.05632448, + "balance_loss_mlp": 1.03492618, + "epoch": 0.18421764617465805, + "flos": 25410678036480.0, + "grad_norm": 1.8587449720425506, + "language_loss": 0.65501863, + "learning_rate": 3.7556590762711463e-06, + "loss": 0.6771152, + "num_input_tokens_seen": 66186210, + "router_z_loss_clip": 1.03076172, + "router_z_loss_mlp": 0.15466309, + "step": 3064, + "time_per_iteration": 2.5140624046325684 + }, + { + "auxiliary_loss_clip": 0.01160502, + "auxiliary_loss_mlp": 0.01044762, + "balance_loss_clip": 1.06033981, + "balance_loss_mlp": 1.02784646, + "epoch": 0.18427776942732602, + "flos": 27198131558400.0, + "grad_norm": 2.0276569325879175, + "language_loss": 0.68581742, + "learning_rate": 3.7554725012869853e-06, + "loss": 0.70787013, + "num_input_tokens_seen": 66204800, + "router_z_loss_clip": 1.00097656, + "router_z_loss_mlp": 0.16931152, + "step": 3065, + "time_per_iteration": 2.5698721408843994 + }, + { + "auxiliary_loss_clip": 0.01153674, + "auxiliary_loss_mlp": 0.01042455, + "balance_loss_clip": 1.05387175, + "balance_loss_mlp": 1.02515793, + "epoch": 0.18433789267999398, + "flos": 27852351920640.0, + "grad_norm": 2.1734716397825076, + "language_loss": 0.73218733, + "learning_rate": 3.7552858597351318e-06, + "loss": 0.7541486, + "num_input_tokens_seen": 66222195, + "router_z_loss_clip": 0.99853516, + "router_z_loss_mlp": 0.17285156, + "step": 3066, + "time_per_iteration": 2.5399160385131836 + }, + { + "auxiliary_loss_clip": 0.01155811, + "auxiliary_loss_mlp": 0.01044771, + "balance_loss_clip": 1.05417883, + "balance_loss_mlp": 1.02827263, + "epoch": 0.18439801593266195, + "flos": 17856940533120.0, + "grad_norm": 2.450548293790169, + "language_loss": 0.81868839, + "learning_rate": 3.7550991516226622e-06, + "loss": 0.84069425, + "num_input_tokens_seen": 66239505, + "router_z_loss_clip": 1.01611328, + "router_z_loss_mlp": 0.16503906, + "step": 3067, + "time_per_iteration": 2.6133816242218018 + }, + { + "auxiliary_loss_clip": 0.01097429, + "auxiliary_loss_mlp": 0.01013786, + "balance_loss_clip": 1.06152701, + "balance_loss_mlp": 1.01161349, + "epoch": 0.18445813918532994, + "flos": 56389522590720.0, + "grad_norm": 0.8201050634974879, + "language_loss": 0.59737229, + "learning_rate": 3.754912376956657e-06, + "loss": 0.6184845, + "num_input_tokens_seen": 66295695, + "router_z_loss_clip": 0.35986328, + "router_z_loss_mlp": 0.02172852, + "step": 3068, + "time_per_iteration": 4.367095708847046 + }, + { + "auxiliary_loss_clip": 0.01153702, + "auxiliary_loss_mlp": 0.01045039, + "balance_loss_clip": 1.05746579, + "balance_loss_mlp": 1.02861238, + "epoch": 0.1845182624379979, + "flos": 20957180325120.0, + "grad_norm": 1.8236524592743077, + "language_loss": 0.76150465, + "learning_rate": 3.7547255357441987e-06, + "loss": 0.78349209, + "num_input_tokens_seen": 66315315, + "router_z_loss_clip": 0.96240234, + "router_z_loss_mlp": 0.16430664, + "step": 3069, + "time_per_iteration": 2.4436216354370117 + }, + { + "auxiliary_loss_clip": 0.01159803, + "auxiliary_loss_mlp": 0.01042548, + "balance_loss_clip": 1.05934322, + "balance_loss_mlp": 1.02572715, + "epoch": 0.18457838569066587, + "flos": 20485170679680.0, + "grad_norm": 2.6660894134732733, + "language_loss": 0.84601688, + "learning_rate": 3.7545386279923718e-06, + "loss": 0.86804044, + "num_input_tokens_seen": 66333675, + "router_z_loss_clip": 1.00390625, + "router_z_loss_mlp": 0.16821289, + "step": 3070, + "time_per_iteration": 3.878571033477783 + }, + { + "auxiliary_loss_clip": 0.01159083, + "auxiliary_loss_mlp": 0.01042536, + "balance_loss_clip": 1.0570761, + "balance_loss_mlp": 1.02510726, + "epoch": 0.18463850894333383, + "flos": 25010022758400.0, + "grad_norm": 2.1854522134606214, + "language_loss": 0.779477, + "learning_rate": 3.754351653708265e-06, + "loss": 0.80149317, + "num_input_tokens_seen": 66354075, + "router_z_loss_clip": 1.01953125, + "router_z_loss_mlp": 0.17419434, + "step": 3071, + "time_per_iteration": 2.541389226913452 + }, + { + "auxiliary_loss_clip": 0.01151027, + "auxiliary_loss_mlp": 0.01045372, + "balance_loss_clip": 1.05064988, + "balance_loss_mlp": 1.02812266, + "epoch": 0.1846986321960018, + "flos": 16800628348800.0, + "grad_norm": 2.3524289820440987, + "language_loss": 0.77678883, + "learning_rate": 3.7541646128989674e-06, + "loss": 0.79875278, + "num_input_tokens_seen": 66372520, + "router_z_loss_clip": 1.00341797, + "router_z_loss_mlp": 0.17272949, + "step": 3072, + "time_per_iteration": 2.5069830417633057 + }, + { + "auxiliary_loss_clip": 0.01158597, + "auxiliary_loss_mlp": 0.01040845, + "balance_loss_clip": 1.05693746, + "balance_loss_mlp": 1.02183151, + "epoch": 0.18475875544866976, + "flos": 20814327936000.0, + "grad_norm": 8.365894742003384, + "language_loss": 0.86473382, + "learning_rate": 3.7539775055715715e-06, + "loss": 0.88672829, + "num_input_tokens_seen": 66390745, + "router_z_loss_clip": 1.01708984, + "router_z_loss_mlp": 0.19018555, + "step": 3073, + "time_per_iteration": 3.8966968059539795 + }, + { + "auxiliary_loss_clip": 0.01150201, + "auxiliary_loss_mlp": 0.01042221, + "balance_loss_clip": 1.05016756, + "balance_loss_mlp": 1.02611566, + "epoch": 0.18481887870133773, + "flos": 22601422321920.0, + "grad_norm": 2.4975110570775665, + "language_loss": 0.91754681, + "learning_rate": 3.7537903317331732e-06, + "loss": 0.93947101, + "num_input_tokens_seen": 66410525, + "router_z_loss_clip": 1.00244141, + "router_z_loss_mlp": 0.16113281, + "step": 3074, + "time_per_iteration": 2.496763229370117 + }, + { + "auxiliary_loss_clip": 0.01147853, + "auxiliary_loss_mlp": 0.01042876, + "balance_loss_clip": 1.04770553, + "balance_loss_mlp": 1.02452946, + "epoch": 0.18487900195400572, + "flos": 29458815788160.0, + "grad_norm": 1.9652346936213756, + "language_loss": 0.65048814, + "learning_rate": 3.75360309139087e-06, + "loss": 0.67239547, + "num_input_tokens_seen": 66432535, + "router_z_loss_clip": 1.00097656, + "router_z_loss_mlp": 0.18347168, + "step": 3075, + "time_per_iteration": 2.5574543476104736 + }, + { + "auxiliary_loss_clip": 0.01154936, + "auxiliary_loss_mlp": 0.01045095, + "balance_loss_clip": 1.0561738, + "balance_loss_mlp": 1.02815557, + "epoch": 0.1849391252066737, + "flos": 20628777254400.0, + "grad_norm": 2.130003724415306, + "language_loss": 0.72554672, + "learning_rate": 3.753415784551761e-06, + "loss": 0.74754703, + "num_input_tokens_seen": 66450620, + "router_z_loss_clip": 0.98828125, + "router_z_loss_mlp": 0.16943359, + "step": 3076, + "time_per_iteration": 2.4345786571502686 + }, + { + "auxiliary_loss_clip": 0.01157664, + "auxiliary_loss_mlp": 0.01040792, + "balance_loss_clip": 1.05355179, + "balance_loss_mlp": 1.02443695, + "epoch": 0.18499924845934165, + "flos": 14428549065600.0, + "grad_norm": 3.360915277510373, + "language_loss": 0.80911899, + "learning_rate": 3.7532284112229507e-06, + "loss": 0.83110356, + "num_input_tokens_seen": 66467865, + "router_z_loss_clip": 1.04199219, + "router_z_loss_mlp": 0.16345215, + "step": 3077, + "time_per_iteration": 2.4990601539611816 + }, + { + "auxiliary_loss_clip": 0.01145302, + "auxiliary_loss_mlp": 0.01045052, + "balance_loss_clip": 1.04825437, + "balance_loss_mlp": 1.02805293, + "epoch": 0.18505937171200962, + "flos": 23727652329600.0, + "grad_norm": 2.9462375817449487, + "language_loss": 0.78884149, + "learning_rate": 3.7530409714115424e-06, + "loss": 0.810745, + "num_input_tokens_seen": 66486245, + "router_z_loss_clip": 0.97119141, + "router_z_loss_mlp": 0.16992188, + "step": 3078, + "time_per_iteration": 2.4511237144470215 + }, + { + "auxiliary_loss_clip": 0.01153368, + "auxiliary_loss_mlp": 0.01053889, + "balance_loss_clip": 1.05247855, + "balance_loss_mlp": 1.03719997, + "epoch": 0.18511949496467758, + "flos": 25957489754880.0, + "grad_norm": 2.760312318167773, + "language_loss": 0.77693713, + "learning_rate": 3.7528534651246453e-06, + "loss": 0.79900968, + "num_input_tokens_seen": 66506510, + "router_z_loss_clip": 1.0078125, + "router_z_loss_mlp": 0.16699219, + "step": 3079, + "time_per_iteration": 2.538581132888794 + }, + { + "auxiliary_loss_clip": 0.01151612, + "auxiliary_loss_mlp": 0.01036002, + "balance_loss_clip": 1.05209816, + "balance_loss_mlp": 1.01931334, + "epoch": 0.18517961821734555, + "flos": 42413553912960.0, + "grad_norm": 1.823802021109051, + "language_loss": 0.8166945, + "learning_rate": 3.752665892369369e-06, + "loss": 0.83857059, + "num_input_tokens_seen": 66530960, + "router_z_loss_clip": 0.99414062, + "router_z_loss_mlp": 0.16711426, + "step": 3080, + "time_per_iteration": 2.6628592014312744 + }, + { + "auxiliary_loss_clip": 0.0115463, + "auxiliary_loss_mlp": 0.01036586, + "balance_loss_clip": 1.05370879, + "balance_loss_mlp": 1.01959848, + "epoch": 0.18523974147001354, + "flos": 24097568544000.0, + "grad_norm": 2.42979225003604, + "language_loss": 0.74266684, + "learning_rate": 3.7524782531528266e-06, + "loss": 0.76457906, + "num_input_tokens_seen": 66550275, + "router_z_loss_clip": 1.00878906, + "router_z_loss_mlp": 0.16992188, + "step": 3081, + "time_per_iteration": 2.52400541305542 + }, + { + "auxiliary_loss_clip": 0.01171338, + "auxiliary_loss_mlp": 0.01050228, + "balance_loss_clip": 1.06787264, + "balance_loss_mlp": 1.03214431, + "epoch": 0.1852998647226815, + "flos": 27375278457600.0, + "grad_norm": 3.0501541312427802, + "language_loss": 0.71828467, + "learning_rate": 3.7522905474821334e-06, + "loss": 0.74050033, + "num_input_tokens_seen": 66569040, + "router_z_loss_clip": 1.03613281, + "router_z_loss_mlp": 0.1809082, + "step": 3082, + "time_per_iteration": 2.5012779235839844 + }, + { + "auxiliary_loss_clip": 0.0115496, + "auxiliary_loss_mlp": 0.01041541, + "balance_loss_clip": 1.05424941, + "balance_loss_mlp": 1.02394569, + "epoch": 0.18535998797534947, + "flos": 18332757020160.0, + "grad_norm": 2.148022626038916, + "language_loss": 0.69504321, + "learning_rate": 3.752102775364407e-06, + "loss": 0.71700823, + "num_input_tokens_seen": 66587775, + "router_z_loss_clip": 1.00732422, + "router_z_loss_mlp": 0.17578125, + "step": 3083, + "time_per_iteration": 2.4712178707122803 + }, + { + "auxiliary_loss_clip": 0.01151651, + "auxiliary_loss_mlp": 0.01044786, + "balance_loss_clip": 1.0547266, + "balance_loss_mlp": 1.02822804, + "epoch": 0.18542011122801744, + "flos": 37845859887360.0, + "grad_norm": 2.2010021225862335, + "language_loss": 0.68679076, + "learning_rate": 3.751914936806767e-06, + "loss": 0.70875514, + "num_input_tokens_seen": 66610800, + "router_z_loss_clip": 0.97021484, + "router_z_loss_mlp": 0.16552734, + "step": 3084, + "time_per_iteration": 2.5697591304779053 + }, + { + "auxiliary_loss_clip": 0.01152482, + "auxiliary_loss_mlp": 0.01040846, + "balance_loss_clip": 1.05235803, + "balance_loss_mlp": 1.02382255, + "epoch": 0.1854802344806854, + "flos": 25186128163200.0, + "grad_norm": 1.8493661512432549, + "language_loss": 0.77565598, + "learning_rate": 3.7517270318163377e-06, + "loss": 0.7975893, + "num_input_tokens_seen": 66630960, + "router_z_loss_clip": 1.00195312, + "router_z_loss_mlp": 0.17028809, + "step": 3085, + "time_per_iteration": 2.487088203430176 + }, + { + "auxiliary_loss_clip": 0.01152304, + "auxiliary_loss_mlp": 0.01042001, + "balance_loss_clip": 1.05437636, + "balance_loss_mlp": 1.02565765, + "epoch": 0.18554035773335337, + "flos": 26684788337280.0, + "grad_norm": 1.749545221489817, + "language_loss": 0.73273551, + "learning_rate": 3.751539060400244e-06, + "loss": 0.75467861, + "num_input_tokens_seen": 66650585, + "router_z_loss_clip": 0.97900391, + "router_z_loss_mlp": 0.16333008, + "step": 3086, + "time_per_iteration": 2.488421678543091 + }, + { + "auxiliary_loss_clip": 0.01158129, + "auxiliary_loss_mlp": 0.01044646, + "balance_loss_clip": 1.05906487, + "balance_loss_mlp": 1.02670538, + "epoch": 0.18560048098602133, + "flos": 22346887570560.0, + "grad_norm": 2.3640772853422156, + "language_loss": 0.70211792, + "learning_rate": 3.7513510225656132e-06, + "loss": 0.72414565, + "num_input_tokens_seen": 66670045, + "router_z_loss_clip": 0.99121094, + "router_z_loss_mlp": 0.17932129, + "step": 3087, + "time_per_iteration": 2.4568676948547363 + }, + { + "auxiliary_loss_clip": 0.01157984, + "auxiliary_loss_mlp": 0.01045068, + "balance_loss_clip": 1.05763054, + "balance_loss_mlp": 1.02744937, + "epoch": 0.18566060423868933, + "flos": 17748526308480.0, + "grad_norm": 2.5416708910302352, + "language_loss": 0.72729355, + "learning_rate": 3.7511629183195764e-06, + "loss": 0.74932408, + "num_input_tokens_seen": 66688790, + "router_z_loss_clip": 1.00244141, + "router_z_loss_mlp": 0.17626953, + "step": 3088, + "time_per_iteration": 2.423305034637451 + }, + { + "auxiliary_loss_clip": 0.01153433, + "auxiliary_loss_mlp": 0.01041616, + "balance_loss_clip": 1.05607569, + "balance_loss_mlp": 1.02511716, + "epoch": 0.1857207274913573, + "flos": 24677274142080.0, + "grad_norm": 1.9121797302960477, + "language_loss": 0.92011797, + "learning_rate": 3.7509747476692663e-06, + "loss": 0.94206846, + "num_input_tokens_seen": 66708090, + "router_z_loss_clip": 0.97314453, + "router_z_loss_mlp": 0.16491699, + "step": 3089, + "time_per_iteration": 2.5893044471740723 + }, + { + "auxiliary_loss_clip": 0.01151633, + "auxiliary_loss_mlp": 0.01038384, + "balance_loss_clip": 1.05423498, + "balance_loss_mlp": 1.02106285, + "epoch": 0.18578085074402526, + "flos": 28147825198080.0, + "grad_norm": 3.5281828106951676, + "language_loss": 0.58181244, + "learning_rate": 3.7507865106218176e-06, + "loss": 0.60371256, + "num_input_tokens_seen": 66727320, + "router_z_loss_clip": 0.97558594, + "router_z_loss_mlp": 0.1730957, + "step": 3090, + "time_per_iteration": 2.488291025161743 + }, + { + "auxiliary_loss_clip": 0.0115236, + "auxiliary_loss_mlp": 0.01038289, + "balance_loss_clip": 1.05599856, + "balance_loss_mlp": 1.02174306, + "epoch": 0.18584097399669322, + "flos": 23951878980480.0, + "grad_norm": 1.9522773636143962, + "language_loss": 0.81548369, + "learning_rate": 3.7505982071843695e-06, + "loss": 0.83739018, + "num_input_tokens_seen": 66747505, + "router_z_loss_clip": 0.96435547, + "router_z_loss_mlp": 0.16540527, + "step": 3091, + "time_per_iteration": 2.463465690612793 + }, + { + "auxiliary_loss_clip": 0.01163537, + "auxiliary_loss_mlp": 0.01044634, + "balance_loss_clip": 1.06207442, + "balance_loss_mlp": 1.02789724, + "epoch": 0.18590109724936119, + "flos": 17201678676480.0, + "grad_norm": 2.4915988755554053, + "language_loss": 0.84264243, + "learning_rate": 3.7504098373640617e-06, + "loss": 0.86472404, + "num_input_tokens_seen": 66766425, + "router_z_loss_clip": 1.01416016, + "router_z_loss_mlp": 0.16723633, + "step": 3092, + "time_per_iteration": 2.4223318099975586 + }, + { + "auxiliary_loss_clip": 0.01169494, + "auxiliary_loss_mlp": 0.01045722, + "balance_loss_clip": 1.06485605, + "balance_loss_mlp": 1.02835321, + "epoch": 0.18596122050202915, + "flos": 17234644383360.0, + "grad_norm": 2.897834406351222, + "language_loss": 0.93159264, + "learning_rate": 3.750221401168038e-06, + "loss": 0.95374483, + "num_input_tokens_seen": 66781130, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.17382812, + "step": 3093, + "time_per_iteration": 2.4477486610412598 + }, + { + "auxiliary_loss_clip": 0.01161652, + "auxiliary_loss_mlp": 0.01038442, + "balance_loss_clip": 1.06219244, + "balance_loss_mlp": 1.02171695, + "epoch": 0.18602134375469712, + "flos": 19020733188480.0, + "grad_norm": 2.6253537115481618, + "language_loss": 0.77076709, + "learning_rate": 3.750032898603443e-06, + "loss": 0.792768, + "num_input_tokens_seen": 66797535, + "router_z_loss_clip": 0.99414062, + "router_z_loss_mlp": 0.16723633, + "step": 3094, + "time_per_iteration": 2.4564695358276367 + }, + { + "auxiliary_loss_clip": 0.01149265, + "auxiliary_loss_mlp": 0.01047046, + "balance_loss_clip": 1.05131698, + "balance_loss_mlp": 1.03125131, + "epoch": 0.1860814670073651, + "flos": 50950094417280.0, + "grad_norm": 1.647421589706744, + "language_loss": 0.70015615, + "learning_rate": 3.749844329677425e-06, + "loss": 0.72211921, + "num_input_tokens_seen": 66821720, + "router_z_loss_clip": 0.97802734, + "router_z_loss_mlp": 0.15795898, + "step": 3095, + "time_per_iteration": 2.8362791538238525 + }, + { + "auxiliary_loss_clip": 0.01157034, + "auxiliary_loss_mlp": 0.01048067, + "balance_loss_clip": 1.05622661, + "balance_loss_mlp": 1.02997136, + "epoch": 0.18614159026003307, + "flos": 19390972625280.0, + "grad_norm": 2.0210815244947598, + "language_loss": 0.80772328, + "learning_rate": 3.749655694397135e-06, + "loss": 0.82977426, + "num_input_tokens_seen": 66839060, + "router_z_loss_clip": 1.00830078, + "router_z_loss_mlp": 0.1809082, + "step": 3096, + "time_per_iteration": 2.4494760036468506 + }, + { + "auxiliary_loss_clip": 0.01153872, + "auxiliary_loss_mlp": 0.01052532, + "balance_loss_clip": 1.05308437, + "balance_loss_mlp": 1.03330374, + "epoch": 0.18620171351270104, + "flos": 21798782962560.0, + "grad_norm": 2.419628318855134, + "language_loss": 0.75270021, + "learning_rate": 3.7494669927697255e-06, + "loss": 0.7747643, + "num_input_tokens_seen": 66857760, + "router_z_loss_clip": 1.0078125, + "router_z_loss_mlp": 0.19238281, + "step": 3097, + "time_per_iteration": 2.5023064613342285 + }, + { + "auxiliary_loss_clip": 0.01155313, + "auxiliary_loss_mlp": 0.01049762, + "balance_loss_clip": 1.0579921, + "balance_loss_mlp": 1.03207183, + "epoch": 0.186261836765369, + "flos": 16362877299840.0, + "grad_norm": 4.111847980190869, + "language_loss": 0.67047369, + "learning_rate": 3.749278224802352e-06, + "loss": 0.69252443, + "num_input_tokens_seen": 66876460, + "router_z_loss_clip": 0.97265625, + "router_z_loss_mlp": 0.17687988, + "step": 3098, + "time_per_iteration": 2.442586898803711 + }, + { + "auxiliary_loss_clip": 0.01159911, + "auxiliary_loss_mlp": 0.01046731, + "balance_loss_clip": 1.05771375, + "balance_loss_mlp": 1.02766943, + "epoch": 0.18632196001803697, + "flos": 23370054480000.0, + "grad_norm": 1.8679843607173559, + "language_loss": 0.69754869, + "learning_rate": 3.7490893905021733e-06, + "loss": 0.7196151, + "num_input_tokens_seen": 66897960, + "router_z_loss_clip": 1.02197266, + "router_z_loss_mlp": 0.1907959, + "step": 3099, + "time_per_iteration": 2.531442403793335 + }, + { + "auxiliary_loss_clip": 0.01155885, + "auxiliary_loss_mlp": 0.01051675, + "balance_loss_clip": 1.0539031, + "balance_loss_mlp": 1.0324707, + "epoch": 0.18638208327070493, + "flos": 22492002516480.0, + "grad_norm": 1.7835123340133978, + "language_loss": 0.71826875, + "learning_rate": 3.7489004898763494e-06, + "loss": 0.74034441, + "num_input_tokens_seen": 66917675, + "router_z_loss_clip": 1.01904297, + "router_z_loss_mlp": 0.1920166, + "step": 3100, + "time_per_iteration": 2.457277774810791 + }, + { + "auxiliary_loss_clip": 0.01158015, + "auxiliary_loss_mlp": 0.01071196, + "balance_loss_clip": 1.05286574, + "balance_loss_mlp": 1.05064464, + "epoch": 0.18644220652337293, + "flos": 29165245931520.0, + "grad_norm": 1.999605134137686, + "language_loss": 0.80139315, + "learning_rate": 3.7487115229320444e-06, + "loss": 0.82368529, + "num_input_tokens_seen": 66936000, + "router_z_loss_clip": 1.05126953, + "router_z_loss_mlp": 0.20532227, + "step": 3101, + "time_per_iteration": 2.5205886363983154 + }, + { + "auxiliary_loss_clip": 0.01157583, + "auxiliary_loss_mlp": 0.01042245, + "balance_loss_clip": 1.05833578, + "balance_loss_mlp": 1.02480507, + "epoch": 0.1865023297760409, + "flos": 24243796811520.0, + "grad_norm": 2.3131618313884195, + "language_loss": 0.76949346, + "learning_rate": 3.7485224896764222e-06, + "loss": 0.79149175, + "num_input_tokens_seen": 66955700, + "router_z_loss_clip": 0.9921875, + "router_z_loss_mlp": 0.17431641, + "step": 3102, + "time_per_iteration": 2.4892640113830566 + }, + { + "auxiliary_loss_clip": 0.01157781, + "auxiliary_loss_mlp": 0.01049076, + "balance_loss_clip": 1.05482841, + "balance_loss_mlp": 1.02968049, + "epoch": 0.18656245302870886, + "flos": 19128716449920.0, + "grad_norm": 7.210443591648422, + "language_loss": 0.76491249, + "learning_rate": 3.7483333901166525e-06, + "loss": 0.78698111, + "num_input_tokens_seen": 66972815, + "router_z_loss_clip": 1.02978516, + "router_z_loss_mlp": 0.19396973, + "step": 3103, + "time_per_iteration": 2.4948012828826904 + }, + { + "auxiliary_loss_clip": 0.01162976, + "auxiliary_loss_mlp": 0.0104404, + "balance_loss_clip": 1.0608952, + "balance_loss_mlp": 1.02692199, + "epoch": 0.18662257628137682, + "flos": 17786088956160.0, + "grad_norm": 1.9445882703036454, + "language_loss": 0.78835189, + "learning_rate": 3.7481442242599054e-06, + "loss": 0.81042206, + "num_input_tokens_seen": 66992280, + "router_z_loss_clip": 1.01953125, + "router_z_loss_mlp": 0.17114258, + "step": 3104, + "time_per_iteration": 2.4507193565368652 + }, + { + "auxiliary_loss_clip": 0.01162679, + "auxiliary_loss_mlp": 0.01043177, + "balance_loss_clip": 1.06229484, + "balance_loss_mlp": 1.02678609, + "epoch": 0.1866826995340448, + "flos": 24024382583040.0, + "grad_norm": 1.8221723677607984, + "language_loss": 0.8547858, + "learning_rate": 3.747954992113354e-06, + "loss": 0.87684441, + "num_input_tokens_seen": 67012220, + "router_z_loss_clip": 1.00390625, + "router_z_loss_mlp": 0.16381836, + "step": 3105, + "time_per_iteration": 2.4908223152160645 + }, + { + "auxiliary_loss_clip": 0.0115794, + "auxiliary_loss_mlp": 0.0105214, + "balance_loss_clip": 1.05110216, + "balance_loss_mlp": 1.03098047, + "epoch": 0.18674282278671275, + "flos": 26141244756480.0, + "grad_norm": 2.0224465892294634, + "language_loss": 0.86671066, + "learning_rate": 3.7477656936841742e-06, + "loss": 0.88881147, + "num_input_tokens_seen": 67032030, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.21179199, + "step": 3106, + "time_per_iteration": 3.934141159057617 + }, + { + "auxiliary_loss_clip": 0.01161897, + "auxiliary_loss_mlp": 0.0104736, + "balance_loss_clip": 1.05613112, + "balance_loss_mlp": 1.02788115, + "epoch": 0.18680294603938072, + "flos": 19201938324480.0, + "grad_norm": 2.013579578163249, + "language_loss": 0.7823087, + "learning_rate": 3.7475763289795445e-06, + "loss": 0.80440128, + "num_input_tokens_seen": 67048920, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.19482422, + "step": 3107, + "time_per_iteration": 2.4272336959838867 + }, + { + "auxiliary_loss_clip": 0.01162585, + "auxiliary_loss_mlp": 0.01050436, + "balance_loss_clip": 1.05875301, + "balance_loss_mlp": 1.03202975, + "epoch": 0.1868630692920487, + "flos": 28544889116160.0, + "grad_norm": 3.128338265936563, + "language_loss": 0.74250323, + "learning_rate": 3.7473868980066446e-06, + "loss": 0.76463342, + "num_input_tokens_seen": 67068645, + "router_z_loss_clip": 1.03955078, + "router_z_loss_mlp": 0.18395996, + "step": 3108, + "time_per_iteration": 2.5618534088134766 + }, + { + "auxiliary_loss_clip": 0.01157953, + "auxiliary_loss_mlp": 0.0104707, + "balance_loss_clip": 1.05421448, + "balance_loss_mlp": 1.0288074, + "epoch": 0.18692319254471668, + "flos": 17238020261760.0, + "grad_norm": 2.5950860586258973, + "language_loss": 0.74582142, + "learning_rate": 3.747197400772658e-06, + "loss": 0.76787162, + "num_input_tokens_seen": 67087075, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.18261719, + "step": 3109, + "time_per_iteration": 2.4078660011291504 + }, + { + "auxiliary_loss_clip": 0.01157783, + "auxiliary_loss_mlp": 0.01042179, + "balance_loss_clip": 1.05779457, + "balance_loss_mlp": 1.02423847, + "epoch": 0.18698331579738464, + "flos": 23185186156800.0, + "grad_norm": 1.701624564170491, + "language_loss": 0.84439969, + "learning_rate": 3.747007837284772e-06, + "loss": 0.86639935, + "num_input_tokens_seen": 67108040, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 0.17938232, + "step": 3110, + "time_per_iteration": 2.4666621685028076 + }, + { + "auxiliary_loss_clip": 0.01164935, + "auxiliary_loss_mlp": 0.01042551, + "balance_loss_clip": 1.06379128, + "balance_loss_mlp": 1.02475858, + "epoch": 0.1870434390500526, + "flos": 25516721963520.0, + "grad_norm": 1.67254650862628, + "language_loss": 0.84556055, + "learning_rate": 3.7468182075501737e-06, + "loss": 0.86763549, + "num_input_tokens_seen": 67127605, + "router_z_loss_clip": 1.01025391, + "router_z_loss_mlp": 0.17803955, + "step": 3111, + "time_per_iteration": 2.480797290802002 + }, + { + "auxiliary_loss_clip": 0.01153816, + "auxiliary_loss_mlp": 0.01041864, + "balance_loss_clip": 1.05394554, + "balance_loss_mlp": 1.02251649, + "epoch": 0.18710356230272057, + "flos": 19500823393920.0, + "grad_norm": 1.7717657611570483, + "language_loss": 0.76824546, + "learning_rate": 3.7466285115760536e-06, + "loss": 0.79020226, + "num_input_tokens_seen": 67145785, + "router_z_loss_clip": 0.99951172, + "router_z_loss_mlp": 0.19348145, + "step": 3112, + "time_per_iteration": 3.844424247741699 + }, + { + "auxiliary_loss_clip": 0.01158449, + "auxiliary_loss_mlp": 0.01046836, + "balance_loss_clip": 1.05533671, + "balance_loss_mlp": 1.02919269, + "epoch": 0.18716368555538854, + "flos": 26760847386240.0, + "grad_norm": 1.8867944149458717, + "language_loss": 0.64768511, + "learning_rate": 3.7464387493696046e-06, + "loss": 0.66973799, + "num_input_tokens_seen": 67165930, + "router_z_loss_clip": 1.03076172, + "router_z_loss_mlp": 0.17651367, + "step": 3113, + "time_per_iteration": 3.9597485065460205 + }, + { + "auxiliary_loss_clip": 0.01163747, + "auxiliary_loss_mlp": 0.01047025, + "balance_loss_clip": 1.05697513, + "balance_loss_mlp": 1.02863157, + "epoch": 0.1872238088080565, + "flos": 25189827264000.0, + "grad_norm": 2.2005052419583384, + "language_loss": 0.81669551, + "learning_rate": 3.746248920938024e-06, + "loss": 0.83880329, + "num_input_tokens_seen": 67185830, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.18383789, + "step": 3114, + "time_per_iteration": 2.4751429557800293 + }, + { + "auxiliary_loss_clip": 0.01161837, + "auxiliary_loss_mlp": 0.01063729, + "balance_loss_clip": 1.05566025, + "balance_loss_mlp": 1.04237843, + "epoch": 0.1872839320607245, + "flos": 24134305178880.0, + "grad_norm": 2.1835441418986146, + "language_loss": 0.57967913, + "learning_rate": 3.74605902628851e-06, + "loss": 0.60193479, + "num_input_tokens_seen": 67206930, + "router_z_loss_clip": 1.06201172, + "router_z_loss_mlp": 0.21350098, + "step": 3115, + "time_per_iteration": 2.51440167427063 + }, + { + "auxiliary_loss_clip": 0.01172957, + "auxiliary_loss_mlp": 0.01046284, + "balance_loss_clip": 1.06899989, + "balance_loss_mlp": 1.02843881, + "epoch": 0.18734405531339246, + "flos": 21173793292800.0, + "grad_norm": 2.0145931434170494, + "language_loss": 0.71411109, + "learning_rate": 3.745869065428261e-06, + "loss": 0.73630357, + "num_input_tokens_seen": 67226290, + "router_z_loss_clip": 1.03808594, + "router_z_loss_mlp": 0.1784668, + "step": 3116, + "time_per_iteration": 2.590212345123291 + }, + { + "auxiliary_loss_clip": 0.01155891, + "auxiliary_loss_mlp": 0.01031956, + "balance_loss_clip": 1.05740857, + "balance_loss_mlp": 1.01505256, + "epoch": 0.18740417856606043, + "flos": 17237697039360.0, + "grad_norm": 1.9419508757879889, + "language_loss": 0.79020679, + "learning_rate": 3.7456790383644833e-06, + "loss": 0.81208521, + "num_input_tokens_seen": 67244410, + "router_z_loss_clip": 0.98632812, + "router_z_loss_mlp": 0.16906738, + "step": 3117, + "time_per_iteration": 3.877627372741699 + }, + { + "auxiliary_loss_clip": 0.0115577, + "auxiliary_loss_mlp": 0.01047404, + "balance_loss_clip": 1.05515265, + "balance_loss_mlp": 1.02884269, + "epoch": 0.1874643018187284, + "flos": 32558049999360.0, + "grad_norm": 1.7384164497362877, + "language_loss": 0.84067684, + "learning_rate": 3.745488945104381e-06, + "loss": 0.86270857, + "num_input_tokens_seen": 67264470, + "router_z_loss_clip": 1.00683594, + "router_z_loss_mlp": 0.18566895, + "step": 3118, + "time_per_iteration": 2.54611873626709 + }, + { + "auxiliary_loss_clip": 0.01159967, + "auxiliary_loss_mlp": 0.01043193, + "balance_loss_clip": 1.05860972, + "balance_loss_mlp": 1.02603865, + "epoch": 0.18752442507139636, + "flos": 23258156636160.0, + "grad_norm": 1.7312282190888821, + "language_loss": 0.76402593, + "learning_rate": 3.7452987856551636e-06, + "loss": 0.78605753, + "num_input_tokens_seen": 67284315, + "router_z_loss_clip": 1.01269531, + "router_z_loss_mlp": 0.17150879, + "step": 3119, + "time_per_iteration": 2.4978744983673096 + }, + { + "auxiliary_loss_clip": 0.01157426, + "auxiliary_loss_mlp": 0.01045401, + "balance_loss_clip": 1.05576909, + "balance_loss_mlp": 1.02860403, + "epoch": 0.18758454832406432, + "flos": 21760933006080.0, + "grad_norm": 2.190048307568936, + "language_loss": 0.82182258, + "learning_rate": 3.7451085600240406e-06, + "loss": 0.84385085, + "num_input_tokens_seen": 67302780, + "router_z_loss_clip": 1.01611328, + "router_z_loss_mlp": 0.16784668, + "step": 3120, + "time_per_iteration": 2.49843168258667 + }, + { + "auxiliary_loss_clip": 0.01165831, + "auxiliary_loss_mlp": 0.01040726, + "balance_loss_clip": 1.06516814, + "balance_loss_mlp": 1.02355945, + "epoch": 0.1876446715767323, + "flos": 29570210841600.0, + "grad_norm": 1.6963486203368536, + "language_loss": 0.85138619, + "learning_rate": 3.7449182682182263e-06, + "loss": 0.87345177, + "num_input_tokens_seen": 67323405, + "router_z_loss_clip": 1.00683594, + "router_z_loss_mlp": 0.17175293, + "step": 3121, + "time_per_iteration": 2.533318281173706 + }, + { + "auxiliary_loss_clip": 0.01156425, + "auxiliary_loss_mlp": 0.01046715, + "balance_loss_clip": 1.05456233, + "balance_loss_mlp": 1.02902412, + "epoch": 0.18770479482940028, + "flos": 30339992234880.0, + "grad_norm": 2.2065971284276524, + "language_loss": 0.70812285, + "learning_rate": 3.744727910244937e-06, + "loss": 0.73015434, + "num_input_tokens_seen": 67345800, + "router_z_loss_clip": 1.01757812, + "router_z_loss_mlp": 0.17687988, + "step": 3122, + "time_per_iteration": 2.570207357406616 + }, + { + "auxiliary_loss_clip": 0.01151671, + "auxiliary_loss_mlp": 0.0104261, + "balance_loss_clip": 1.0522449, + "balance_loss_mlp": 1.02342951, + "epoch": 0.18776491808206824, + "flos": 14465357527680.0, + "grad_norm": 2.174388859038416, + "language_loss": 0.70744795, + "learning_rate": 3.7445374861113905e-06, + "loss": 0.72939074, + "num_input_tokens_seen": 67363575, + "router_z_loss_clip": 0.99511719, + "router_z_loss_mlp": 0.19189453, + "step": 3123, + "time_per_iteration": 2.4787580966949463 + }, + { + "auxiliary_loss_clip": 0.01149232, + "auxiliary_loss_mlp": 0.01042175, + "balance_loss_clip": 1.0502429, + "balance_loss_mlp": 1.02469885, + "epoch": 0.1878250413347362, + "flos": 24498547044480.0, + "grad_norm": 1.95586261947996, + "language_loss": 0.74246264, + "learning_rate": 3.7443469958248066e-06, + "loss": 0.76437664, + "num_input_tokens_seen": 67381765, + "router_z_loss_clip": 0.99023438, + "router_z_loss_mlp": 0.17468262, + "step": 3124, + "time_per_iteration": 2.5906291007995605 + }, + { + "auxiliary_loss_clip": 0.01160009, + "auxiliary_loss_mlp": 0.01048518, + "balance_loss_clip": 1.05675244, + "balance_loss_mlp": 1.02909863, + "epoch": 0.18788516458740417, + "flos": 39786185692800.0, + "grad_norm": 2.312039388812417, + "language_loss": 0.80629206, + "learning_rate": 3.7441564393924106e-06, + "loss": 0.82837731, + "num_input_tokens_seen": 67405000, + "router_z_loss_clip": 1.03271484, + "router_z_loss_mlp": 0.19421387, + "step": 3125, + "time_per_iteration": 2.61539363861084 + }, + { + "auxiliary_loss_clip": 0.01084598, + "auxiliary_loss_mlp": 0.01024074, + "balance_loss_clip": 1.0479641, + "balance_loss_mlp": 1.02114487, + "epoch": 0.18794528784007214, + "flos": 64699250664960.0, + "grad_norm": 0.9833676180884123, + "language_loss": 0.63548267, + "learning_rate": 3.7439658168214273e-06, + "loss": 0.65656936, + "num_input_tokens_seen": 67467140, + "router_z_loss_clip": 0.36669922, + "router_z_loss_mlp": 0.02929688, + "step": 3126, + "time_per_iteration": 3.1212611198425293 + }, + { + "auxiliary_loss_clip": 0.01156526, + "auxiliary_loss_mlp": 0.01047083, + "balance_loss_clip": 1.05885839, + "balance_loss_mlp": 1.02777088, + "epoch": 0.1880054110927401, + "flos": 28622061486720.0, + "grad_norm": 1.910542248843876, + "language_loss": 0.8175205, + "learning_rate": 3.7437751281190857e-06, + "loss": 0.83955657, + "num_input_tokens_seen": 67487980, + "router_z_loss_clip": 0.97558594, + "router_z_loss_mlp": 0.19311523, + "step": 3127, + "time_per_iteration": 2.509775161743164 + }, + { + "auxiliary_loss_clip": 0.01078711, + "auxiliary_loss_mlp": 0.01007316, + "balance_loss_clip": 1.04341578, + "balance_loss_mlp": 1.00448799, + "epoch": 0.1880655343454081, + "flos": 64488958490880.0, + "grad_norm": 0.7717833693211222, + "language_loss": 0.61921656, + "learning_rate": 3.7435843732926164e-06, + "loss": 0.64007688, + "num_input_tokens_seen": 67552500, + "router_z_loss_clip": 0.35302734, + "router_z_loss_mlp": 0.02828979, + "step": 3128, + "time_per_iteration": 3.176584005355835 + }, + { + "auxiliary_loss_clip": 0.01164521, + "auxiliary_loss_mlp": 0.01047431, + "balance_loss_clip": 1.05905259, + "balance_loss_mlp": 1.02839315, + "epoch": 0.18812565759807606, + "flos": 32124464928000.0, + "grad_norm": 2.483615771850661, + "language_loss": 0.71250117, + "learning_rate": 3.7433935523492536e-06, + "loss": 0.73462069, + "num_input_tokens_seen": 67573295, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.19042969, + "step": 3129, + "time_per_iteration": 2.5422966480255127 + }, + { + "auxiliary_loss_clip": 0.0115923, + "auxiliary_loss_mlp": 0.01051889, + "balance_loss_clip": 1.05774033, + "balance_loss_mlp": 1.0323385, + "epoch": 0.18818578085074403, + "flos": 20624539449600.0, + "grad_norm": 2.3374128475333906, + "language_loss": 0.85390866, + "learning_rate": 3.7432026652962314e-06, + "loss": 0.87601984, + "num_input_tokens_seen": 67590010, + "router_z_loss_clip": 1.01464844, + "router_z_loss_mlp": 0.1953125, + "step": 3130, + "time_per_iteration": 2.4440126419067383 + }, + { + "auxiliary_loss_clip": 0.01174346, + "auxiliary_loss_mlp": 0.01046982, + "balance_loss_clip": 1.0686276, + "balance_loss_mlp": 1.02842152, + "epoch": 0.188245904103412, + "flos": 28840506048000.0, + "grad_norm": 2.0102870817178453, + "language_loss": 0.76611948, + "learning_rate": 3.7430117121407897e-06, + "loss": 0.78833282, + "num_input_tokens_seen": 67611110, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.18579102, + "step": 3131, + "time_per_iteration": 2.536390542984009 + }, + { + "auxiliary_loss_clip": 0.01158823, + "auxiliary_loss_mlp": 0.01056139, + "balance_loss_clip": 1.05919576, + "balance_loss_mlp": 1.03714848, + "epoch": 0.18830602735607996, + "flos": 29420319386880.0, + "grad_norm": 2.425240198003169, + "language_loss": 0.8153646, + "learning_rate": 3.74282069289017e-06, + "loss": 0.83751416, + "num_input_tokens_seen": 67631990, + "router_z_loss_clip": 0.99609375, + "router_z_loss_mlp": 0.19006348, + "step": 3132, + "time_per_iteration": 2.541295289993286 + }, + { + "auxiliary_loss_clip": 0.01163895, + "auxiliary_loss_mlp": 0.01049171, + "balance_loss_clip": 1.05996072, + "balance_loss_mlp": 1.03133762, + "epoch": 0.18836615060874792, + "flos": 28872933050880.0, + "grad_norm": 2.4357212505745487, + "language_loss": 0.79827619, + "learning_rate": 3.742629607551614e-06, + "loss": 0.82040685, + "num_input_tokens_seen": 67650490, + "router_z_loss_clip": 1.03955078, + "router_z_loss_mlp": 0.17822266, + "step": 3133, + "time_per_iteration": 2.5418341159820557 + }, + { + "auxiliary_loss_clip": 0.01167252, + "auxiliary_loss_mlp": 0.01057578, + "balance_loss_clip": 1.06121302, + "balance_loss_mlp": 1.03889835, + "epoch": 0.18842627386141592, + "flos": 22601673717120.0, + "grad_norm": 5.980194432342123, + "language_loss": 0.82989383, + "learning_rate": 3.7424384561323698e-06, + "loss": 0.85214221, + "num_input_tokens_seen": 67668860, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.18701172, + "step": 3134, + "time_per_iteration": 2.49641752243042 + }, + { + "auxiliary_loss_clip": 0.0115368, + "auxiliary_loss_mlp": 0.01054932, + "balance_loss_clip": 1.05463123, + "balance_loss_mlp": 1.03715777, + "epoch": 0.18848639711408388, + "flos": 24573600512640.0, + "grad_norm": 3.7461493737419738, + "language_loss": 0.82806855, + "learning_rate": 3.742247238639684e-06, + "loss": 0.85015464, + "num_input_tokens_seen": 67690220, + "router_z_loss_clip": 0.99023438, + "router_z_loss_mlp": 0.17773438, + "step": 3135, + "time_per_iteration": 2.5162317752838135 + }, + { + "auxiliary_loss_clip": 0.01162582, + "auxiliary_loss_mlp": 0.01051123, + "balance_loss_clip": 1.05797064, + "balance_loss_mlp": 1.03296709, + "epoch": 0.18854652036675185, + "flos": 34166920078080.0, + "grad_norm": 2.224920946899104, + "language_loss": 0.78830314, + "learning_rate": 3.7420559550808083e-06, + "loss": 0.81044018, + "num_input_tokens_seen": 67709820, + "router_z_loss_clip": 1.04492188, + "router_z_loss_mlp": 0.18151855, + "step": 3136, + "time_per_iteration": 2.5764894485473633 + }, + { + "auxiliary_loss_clip": 0.01166509, + "auxiliary_loss_mlp": 0.0105397, + "balance_loss_clip": 1.06215286, + "balance_loss_mlp": 1.03524256, + "epoch": 0.1886066436194198, + "flos": 24200236592640.0, + "grad_norm": 1.9361102018206338, + "language_loss": 0.81143498, + "learning_rate": 3.741864605462996e-06, + "loss": 0.83363974, + "num_input_tokens_seen": 67729490, + "router_z_loss_clip": 1.04394531, + "router_z_loss_mlp": 0.18725586, + "step": 3137, + "time_per_iteration": 2.496570348739624 + }, + { + "auxiliary_loss_clip": 0.011614, + "auxiliary_loss_mlp": 0.0105813, + "balance_loss_clip": 1.05965459, + "balance_loss_mlp": 1.04086804, + "epoch": 0.18866676687208778, + "flos": 21251109317760.0, + "grad_norm": 2.8606130979308775, + "language_loss": 0.80827868, + "learning_rate": 3.741673189793504e-06, + "loss": 0.83047396, + "num_input_tokens_seen": 67749665, + "router_z_loss_clip": 1.01855469, + "router_z_loss_mlp": 0.17260742, + "step": 3138, + "time_per_iteration": 2.5507445335388184 + }, + { + "auxiliary_loss_clip": 0.01168745, + "auxiliary_loss_mlp": 0.01062869, + "balance_loss_clip": 1.05961919, + "balance_loss_mlp": 1.0438199, + "epoch": 0.18872689012475574, + "flos": 37308673013760.0, + "grad_norm": 3.260332625546315, + "language_loss": 0.6361227, + "learning_rate": 3.7414817080795896e-06, + "loss": 0.65843886, + "num_input_tokens_seen": 67776230, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.19030762, + "step": 3139, + "time_per_iteration": 2.6303482055664062 + }, + { + "auxiliary_loss_clip": 0.01161579, + "auxiliary_loss_mlp": 0.01049304, + "balance_loss_clip": 1.05824316, + "balance_loss_mlp": 1.03026664, + "epoch": 0.1887870133774237, + "flos": 21652303299840.0, + "grad_norm": 1.8653954368250933, + "language_loss": 0.71704859, + "learning_rate": 3.741290160328514e-06, + "loss": 0.73915744, + "num_input_tokens_seen": 67795080, + "router_z_loss_clip": 1.03320312, + "router_z_loss_mlp": 0.19018555, + "step": 3140, + "time_per_iteration": 2.4961447715759277 + }, + { + "auxiliary_loss_clip": 0.01165306, + "auxiliary_loss_mlp": 0.01049318, + "balance_loss_clip": 1.06156325, + "balance_loss_mlp": 1.03055501, + "epoch": 0.1888471366300917, + "flos": 15924659374080.0, + "grad_norm": 15.190975494822705, + "language_loss": 0.87231398, + "learning_rate": 3.7410985465475412e-06, + "loss": 0.89446026, + "num_input_tokens_seen": 67813110, + "router_z_loss_clip": 1.03710938, + "router_z_loss_mlp": 0.18762207, + "step": 3141, + "time_per_iteration": 2.430264472961426 + }, + { + "auxiliary_loss_clip": 0.01166406, + "auxiliary_loss_mlp": 0.01040388, + "balance_loss_clip": 1.06064343, + "balance_loss_mlp": 1.0212791, + "epoch": 0.18890725988275966, + "flos": 18551955767040.0, + "grad_norm": 2.611301167369163, + "language_loss": 0.77291739, + "learning_rate": 3.7409068667439378e-06, + "loss": 0.79498529, + "num_input_tokens_seen": 67831070, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.19104004, + "step": 3142, + "time_per_iteration": 2.501706838607788 + }, + { + "auxiliary_loss_clip": 0.01156232, + "auxiliary_loss_mlp": 0.01059246, + "balance_loss_clip": 1.05478811, + "balance_loss_mlp": 1.04014874, + "epoch": 0.18896738313542763, + "flos": 28840865184000.0, + "grad_norm": 1.7680405355126503, + "language_loss": 0.78547204, + "learning_rate": 3.740715120924971e-06, + "loss": 0.80762684, + "num_input_tokens_seen": 67852170, + "router_z_loss_clip": 1.01464844, + "router_z_loss_mlp": 0.19091797, + "step": 3143, + "time_per_iteration": 2.560868263244629 + }, + { + "auxiliary_loss_clip": 0.01157941, + "auxiliary_loss_mlp": 0.0106545, + "balance_loss_clip": 1.05403066, + "balance_loss_mlp": 1.04532766, + "epoch": 0.1890275063880956, + "flos": 22412747157120.0, + "grad_norm": 2.2869750887208764, + "language_loss": 0.7147603, + "learning_rate": 3.740523309097912e-06, + "loss": 0.73699421, + "num_input_tokens_seen": 67869945, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.20117188, + "step": 3144, + "time_per_iteration": 2.553685188293457 + }, + { + "auxiliary_loss_clip": 0.01165504, + "auxiliary_loss_mlp": 0.01046009, + "balance_loss_clip": 1.05886197, + "balance_loss_mlp": 1.02680421, + "epoch": 0.18908762964076356, + "flos": 24243904552320.0, + "grad_norm": 3.1049903978426063, + "language_loss": 0.74197018, + "learning_rate": 3.7403314312700356e-06, + "loss": 0.76408529, + "num_input_tokens_seen": 67890240, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.1920166, + "step": 3145, + "time_per_iteration": 2.5267693996429443 + }, + { + "auxiliary_loss_clip": 0.01167191, + "auxiliary_loss_mlp": 0.01048341, + "balance_loss_clip": 1.0649209, + "balance_loss_mlp": 1.03141332, + "epoch": 0.18914775289343153, + "flos": 16982910892800.0, + "grad_norm": 2.252623364096915, + "language_loss": 0.76576477, + "learning_rate": 3.740139487448616e-06, + "loss": 0.78792012, + "num_input_tokens_seen": 67907825, + "router_z_loss_clip": 1.02246094, + "router_z_loss_mlp": 0.16931152, + "step": 3146, + "time_per_iteration": 2.502612352371216 + }, + { + "auxiliary_loss_clip": 0.01163581, + "auxiliary_loss_mlp": 0.01052237, + "balance_loss_clip": 1.05886352, + "balance_loss_mlp": 1.03386712, + "epoch": 0.1892078761460995, + "flos": 21543781334400.0, + "grad_norm": 1.8482239522079134, + "language_loss": 0.78932756, + "learning_rate": 3.7399474776409326e-06, + "loss": 0.81148577, + "num_input_tokens_seen": 67926670, + "router_z_loss_clip": 1.04638672, + "router_z_loss_mlp": 0.18359375, + "step": 3147, + "time_per_iteration": 2.5011775493621826 + }, + { + "auxiliary_loss_clip": 0.01164093, + "auxiliary_loss_mlp": 0.01048325, + "balance_loss_clip": 1.05970919, + "balance_loss_mlp": 1.030146, + "epoch": 0.18926799939876748, + "flos": 23001538896000.0, + "grad_norm": 2.741652799273365, + "language_loss": 0.67087615, + "learning_rate": 3.739755401854267e-06, + "loss": 0.69300032, + "num_input_tokens_seen": 67943645, + "router_z_loss_clip": 1.04443359, + "router_z_loss_mlp": 0.18164062, + "step": 3148, + "time_per_iteration": 2.5066986083984375 + }, + { + "auxiliary_loss_clip": 0.01164843, + "auxiliary_loss_mlp": 0.0103901, + "balance_loss_clip": 1.06059837, + "balance_loss_mlp": 1.02099717, + "epoch": 0.18932812265143545, + "flos": 22273019251200.0, + "grad_norm": 2.218619562709211, + "language_loss": 0.75949389, + "learning_rate": 3.739563260095902e-06, + "loss": 0.78153241, + "num_input_tokens_seen": 67962345, + "router_z_loss_clip": 1.04248047, + "router_z_loss_mlp": 0.18005371, + "step": 3149, + "time_per_iteration": 2.4501988887786865 + }, + { + "auxiliary_loss_clip": 0.01157943, + "auxiliary_loss_mlp": 0.01040975, + "balance_loss_clip": 1.05624342, + "balance_loss_mlp": 1.02358198, + "epoch": 0.1893882459041034, + "flos": 18624423456000.0, + "grad_norm": 1.9855676483226234, + "language_loss": 0.80613303, + "learning_rate": 3.7393710523731245e-06, + "loss": 0.8281222, + "num_input_tokens_seen": 67979760, + "router_z_loss_clip": 1.01806641, + "router_z_loss_mlp": 0.1739502, + "step": 3150, + "time_per_iteration": 3.8682377338409424 + }, + { + "auxiliary_loss_clip": 0.01165876, + "auxiliary_loss_mlp": 0.01047664, + "balance_loss_clip": 1.06264329, + "balance_loss_mlp": 1.02949667, + "epoch": 0.18944836915677138, + "flos": 22892981016960.0, + "grad_norm": 2.3043941642592665, + "language_loss": 0.85300457, + "learning_rate": 3.7391787786932215e-06, + "loss": 0.87513995, + "num_input_tokens_seen": 67996895, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.18164062, + "step": 3151, + "time_per_iteration": 2.4801981449127197 + }, + { + "auxiliary_loss_clip": 0.01159066, + "auxiliary_loss_mlp": 0.01049447, + "balance_loss_clip": 1.0562619, + "balance_loss_mlp": 1.03145862, + "epoch": 0.18950849240943934, + "flos": 26796542526720.0, + "grad_norm": 1.9872158901634371, + "language_loss": 0.74177825, + "learning_rate": 3.7389864390634857e-06, + "loss": 0.76386338, + "num_input_tokens_seen": 68018365, + "router_z_loss_clip": 1.02734375, + "router_z_loss_mlp": 0.17980957, + "step": 3152, + "time_per_iteration": 2.485748052597046 + }, + { + "auxiliary_loss_clip": 0.01163399, + "auxiliary_loss_mlp": 0.01054335, + "balance_loss_clip": 1.05711412, + "balance_loss_mlp": 1.03434408, + "epoch": 0.1895686156621073, + "flos": 24971239048320.0, + "grad_norm": 2.0721803452542096, + "language_loss": 0.75642312, + "learning_rate": 3.738794033491209e-06, + "loss": 0.77860045, + "num_input_tokens_seen": 68037985, + "router_z_loss_clip": 1.06396484, + "router_z_loss_mlp": 0.1998291, + "step": 3153, + "time_per_iteration": 2.4923665523529053 + }, + { + "auxiliary_loss_clip": 0.01163714, + "auxiliary_loss_mlp": 0.01050831, + "balance_loss_clip": 1.0573771, + "balance_loss_mlp": 1.03162622, + "epoch": 0.1896287389147753, + "flos": 21944544353280.0, + "grad_norm": 2.0277210485182757, + "language_loss": 0.79948187, + "learning_rate": 3.7386015619836887e-06, + "loss": 0.82162732, + "num_input_tokens_seen": 68057975, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.19213867, + "step": 3154, + "time_per_iteration": 2.4419689178466797 + }, + { + "auxiliary_loss_clip": 0.01165196, + "auxiliary_loss_mlp": 0.01057177, + "balance_loss_clip": 1.05693054, + "balance_loss_mlp": 1.0378654, + "epoch": 0.18968886216744327, + "flos": 18179058723840.0, + "grad_norm": 2.4579911843570006, + "language_loss": 0.72523105, + "learning_rate": 3.738409024548223e-06, + "loss": 0.74745482, + "num_input_tokens_seen": 68074175, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.19299316, + "step": 3155, + "time_per_iteration": 2.4486804008483887 + }, + { + "auxiliary_loss_clip": 0.01168477, + "auxiliary_loss_mlp": 0.01047247, + "balance_loss_clip": 1.06320095, + "balance_loss_mlp": 1.02859068, + "epoch": 0.18974898542011123, + "flos": 20412487509120.0, + "grad_norm": 1.9651355418575296, + "language_loss": 0.73732311, + "learning_rate": 3.7382164211921136e-06, + "loss": 0.7594803, + "num_input_tokens_seen": 68095230, + "router_z_loss_clip": 1.05126953, + "router_z_loss_mlp": 0.18664551, + "step": 3156, + "time_per_iteration": 3.8542370796203613 + }, + { + "auxiliary_loss_clip": 0.01164994, + "auxiliary_loss_mlp": 0.01044046, + "balance_loss_clip": 1.06077695, + "balance_loss_mlp": 1.02628326, + "epoch": 0.1898091086727792, + "flos": 23985024255360.0, + "grad_norm": 1.8049669602498182, + "language_loss": 0.68243098, + "learning_rate": 3.7380237519226623e-06, + "loss": 0.70452142, + "num_input_tokens_seen": 68113805, + "router_z_loss_clip": 1.04296875, + "router_z_loss_mlp": 0.1776123, + "step": 3157, + "time_per_iteration": 3.9543585777282715 + }, + { + "auxiliary_loss_clip": 0.01162751, + "auxiliary_loss_mlp": 0.01045321, + "balance_loss_clip": 1.05664134, + "balance_loss_mlp": 1.02720165, + "epoch": 0.18986923192544716, + "flos": 27637067756160.0, + "grad_norm": 2.1255911720443, + "language_loss": 0.79748571, + "learning_rate": 3.737831016747176e-06, + "loss": 0.81956643, + "num_input_tokens_seen": 68133190, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.18127441, + "step": 3158, + "time_per_iteration": 2.5643062591552734 + }, + { + "auxiliary_loss_clip": 0.01169089, + "auxiliary_loss_mlp": 0.01054596, + "balance_loss_clip": 1.05916858, + "balance_loss_mlp": 1.03487849, + "epoch": 0.18992935517811513, + "flos": 25484151306240.0, + "grad_norm": 4.027511459794902, + "language_loss": 0.72400069, + "learning_rate": 3.737638215672964e-06, + "loss": 0.74623758, + "num_input_tokens_seen": 68152330, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.19714355, + "step": 3159, + "time_per_iteration": 2.5065090656280518 + }, + { + "auxiliary_loss_clip": 0.01160875, + "auxiliary_loss_mlp": 0.0105078, + "balance_loss_clip": 1.0583967, + "balance_loss_mlp": 1.03105092, + "epoch": 0.1899894784307831, + "flos": 17420805596160.0, + "grad_norm": 2.1074550634708107, + "language_loss": 0.85686833, + "learning_rate": 3.7374453487073366e-06, + "loss": 0.87898487, + "num_input_tokens_seen": 68170185, + "router_z_loss_clip": 1.02441406, + "router_z_loss_mlp": 0.19726562, + "step": 3160, + "time_per_iteration": 2.535827159881592 + }, + { + "auxiliary_loss_clip": 0.01157203, + "auxiliary_loss_mlp": 0.0104531, + "balance_loss_clip": 1.05726182, + "balance_loss_mlp": 1.02814412, + "epoch": 0.19004960168345109, + "flos": 27492240119040.0, + "grad_norm": 2.5381295346746553, + "language_loss": 0.73178059, + "learning_rate": 3.7372524158576074e-06, + "loss": 0.75380564, + "num_input_tokens_seen": 68191665, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 0.17163086, + "step": 3161, + "time_per_iteration": 3.9595518112182617 + }, + { + "auxiliary_loss_clip": 0.01161223, + "auxiliary_loss_mlp": 0.01054557, + "balance_loss_clip": 1.058532, + "balance_loss_mlp": 1.03632998, + "epoch": 0.19010972493611905, + "flos": 38654676385920.0, + "grad_norm": 2.2708336634608326, + "language_loss": 0.80813366, + "learning_rate": 3.7370594171310926e-06, + "loss": 0.83029139, + "num_input_tokens_seen": 68214635, + "router_z_loss_clip": 1.02783203, + "router_z_loss_mlp": 0.18237305, + "step": 3162, + "time_per_iteration": 2.6121506690979004 + }, + { + "auxiliary_loss_clip": 0.01163387, + "auxiliary_loss_mlp": 0.01044438, + "balance_loss_clip": 1.06028903, + "balance_loss_mlp": 1.02648544, + "epoch": 0.19016984818878702, + "flos": 19244744357760.0, + "grad_norm": 3.388955181286507, + "language_loss": 0.75718009, + "learning_rate": 3.73686635253511e-06, + "loss": 0.77925831, + "num_input_tokens_seen": 68232150, + "router_z_loss_clip": 1.02978516, + "router_z_loss_mlp": 0.17944336, + "step": 3163, + "time_per_iteration": 2.4750165939331055 + }, + { + "auxiliary_loss_clip": 0.011602, + "auxiliary_loss_mlp": 0.01041859, + "balance_loss_clip": 1.05912638, + "balance_loss_mlp": 1.02353609, + "epoch": 0.19022997144145498, + "flos": 37596891744000.0, + "grad_norm": 1.608630877847844, + "language_loss": 0.74414313, + "learning_rate": 3.736673222076982e-06, + "loss": 0.76616371, + "num_input_tokens_seen": 68253370, + "router_z_loss_clip": 1.00927734, + "router_z_loss_mlp": 0.18334961, + "step": 3164, + "time_per_iteration": 2.711678981781006 + }, + { + "auxiliary_loss_clip": 0.01159661, + "auxiliary_loss_mlp": 0.01038742, + "balance_loss_clip": 1.05828023, + "balance_loss_mlp": 1.02097964, + "epoch": 0.19029009469412295, + "flos": 61530921665280.0, + "grad_norm": 1.4837620896657666, + "language_loss": 0.66726464, + "learning_rate": 3.7364800257640313e-06, + "loss": 0.68924868, + "num_input_tokens_seen": 68278895, + "router_z_loss_clip": 1.01513672, + "router_z_loss_mlp": 0.1776123, + "step": 3165, + "time_per_iteration": 2.9033236503601074 + }, + { + "auxiliary_loss_clip": 0.01158536, + "auxiliary_loss_mlp": 0.0104597, + "balance_loss_clip": 1.05792332, + "balance_loss_mlp": 1.0274924, + "epoch": 0.1903502179467909, + "flos": 13954851480960.0, + "grad_norm": 2.169442716796661, + "language_loss": 0.74958932, + "learning_rate": 3.7362867636035835e-06, + "loss": 0.7716344, + "num_input_tokens_seen": 68294880, + "router_z_loss_clip": 1.00683594, + "router_z_loss_mlp": 0.18481445, + "step": 3166, + "time_per_iteration": 2.399388074874878 + }, + { + "auxiliary_loss_clip": 0.01103117, + "auxiliary_loss_mlp": 0.01038818, + "balance_loss_clip": 1.06186867, + "balance_loss_mlp": 1.03635001, + "epoch": 0.1904103411994589, + "flos": 66899641916160.0, + "grad_norm": 0.7917623877476389, + "language_loss": 0.50333846, + "learning_rate": 3.736093435602968e-06, + "loss": 0.5247578, + "num_input_tokens_seen": 68359665, + "router_z_loss_clip": 0.41210938, + "router_z_loss_mlp": 0.0246582, + "step": 3167, + "time_per_iteration": 3.1270487308502197 + }, + { + "auxiliary_loss_clip": 0.01168627, + "auxiliary_loss_mlp": 0.01048165, + "balance_loss_clip": 1.06760037, + "balance_loss_mlp": 1.0305692, + "epoch": 0.19047046445212687, + "flos": 21908741472000.0, + "grad_norm": 2.446384164090972, + "language_loss": 0.74648881, + "learning_rate": 3.7359000417695156e-06, + "loss": 0.76865673, + "num_input_tokens_seen": 68378950, + "router_z_loss_clip": 1.01074219, + "router_z_loss_mlp": 0.17602539, + "step": 3168, + "time_per_iteration": 2.4488637447357178 + }, + { + "auxiliary_loss_clip": 0.01078794, + "auxiliary_loss_mlp": 0.01018798, + "balance_loss_clip": 1.04180682, + "balance_loss_mlp": 1.01657808, + "epoch": 0.19053058770479483, + "flos": 59255156701440.0, + "grad_norm": 0.8703626196523621, + "language_loss": 0.60066199, + "learning_rate": 3.73570658211056e-06, + "loss": 0.62163794, + "num_input_tokens_seen": 68434235, + "router_z_loss_clip": 0.37011719, + "router_z_loss_mlp": 0.0222168, + "step": 3169, + "time_per_iteration": 3.0046679973602295 + }, + { + "auxiliary_loss_clip": 0.01161916, + "auxiliary_loss_mlp": 0.01048603, + "balance_loss_clip": 1.05716133, + "balance_loss_mlp": 1.03111458, + "epoch": 0.1905907109574628, + "flos": 23951304362880.0, + "grad_norm": 2.2298537964797376, + "language_loss": 0.78377664, + "learning_rate": 3.735513056633436e-06, + "loss": 0.8058818, + "num_input_tokens_seen": 68453830, + "router_z_loss_clip": 1.04785156, + "router_z_loss_mlp": 0.17504883, + "step": 3170, + "time_per_iteration": 2.4683334827423096 + }, + { + "auxiliary_loss_clip": 0.01155171, + "auxiliary_loss_mlp": 0.01043857, + "balance_loss_clip": 1.05668545, + "balance_loss_mlp": 1.02659583, + "epoch": 0.19065083421013077, + "flos": 20812316774400.0, + "grad_norm": 2.0481178412567407, + "language_loss": 0.78211927, + "learning_rate": 3.7353194653454834e-06, + "loss": 0.80410957, + "num_input_tokens_seen": 68473005, + "router_z_loss_clip": 0.98583984, + "router_z_loss_mlp": 0.17272949, + "step": 3171, + "time_per_iteration": 2.45198655128479 + }, + { + "auxiliary_loss_clip": 0.01167705, + "auxiliary_loss_mlp": 0.01045155, + "balance_loss_clip": 1.06366789, + "balance_loss_mlp": 1.02752352, + "epoch": 0.19071095746279873, + "flos": 31284981192960.0, + "grad_norm": 2.397824756476315, + "language_loss": 0.78516459, + "learning_rate": 3.7351258082540426e-06, + "loss": 0.80729318, + "num_input_tokens_seen": 68493470, + "router_z_loss_clip": 1.04101562, + "router_z_loss_mlp": 0.17626953, + "step": 3172, + "time_per_iteration": 2.5349223613739014 + }, + { + "auxiliary_loss_clip": 0.0115195, + "auxiliary_loss_mlp": 0.01048682, + "balance_loss_clip": 1.05138731, + "balance_loss_mlp": 1.03146839, + "epoch": 0.1907710807154667, + "flos": 14356117290240.0, + "grad_norm": 1.6190535871622718, + "language_loss": 0.80477893, + "learning_rate": 3.7349320853664576e-06, + "loss": 0.82678533, + "num_input_tokens_seen": 68511290, + "router_z_loss_clip": 1.00634766, + "router_z_loss_mlp": 0.17211914, + "step": 3173, + "time_per_iteration": 2.503418445587158 + }, + { + "auxiliary_loss_clip": 0.01161369, + "auxiliary_loss_mlp": 0.01054392, + "balance_loss_clip": 1.0576427, + "balance_loss_mlp": 1.03708315, + "epoch": 0.1908312039681347, + "flos": 26907039740160.0, + "grad_norm": 1.5372287880954993, + "language_loss": 0.78484201, + "learning_rate": 3.7347382966900735e-06, + "loss": 0.80699956, + "num_input_tokens_seen": 68532575, + "router_z_loss_clip": 1.03710938, + "router_z_loss_mlp": 0.1730957, + "step": 3174, + "time_per_iteration": 2.5017359256744385 + }, + { + "auxiliary_loss_clip": 0.01163187, + "auxiliary_loss_mlp": 0.01043409, + "balance_loss_clip": 1.06144261, + "balance_loss_mlp": 1.02625525, + "epoch": 0.19089132722080265, + "flos": 14494695960960.0, + "grad_norm": 1.7053710374768694, + "language_loss": 0.81511366, + "learning_rate": 3.7345444422322395e-06, + "loss": 0.83717966, + "num_input_tokens_seen": 68548760, + "router_z_loss_clip": 1.01708984, + "router_z_loss_mlp": 0.17150879, + "step": 3175, + "time_per_iteration": 2.432756185531616 + }, + { + "auxiliary_loss_clip": 0.01163904, + "auxiliary_loss_mlp": 0.01060904, + "balance_loss_clip": 1.06385803, + "balance_loss_mlp": 1.04295087, + "epoch": 0.19095145047347062, + "flos": 13952876232960.0, + "grad_norm": 2.8433428238105773, + "language_loss": 0.85507619, + "learning_rate": 3.7343505220003067e-06, + "loss": 0.87732428, + "num_input_tokens_seen": 68563100, + "router_z_loss_clip": 1.00146484, + "router_z_loss_mlp": 0.17956543, + "step": 3176, + "time_per_iteration": 2.416712522506714 + }, + { + "auxiliary_loss_clip": 0.01155708, + "auxiliary_loss_mlp": 0.01053599, + "balance_loss_clip": 1.05306625, + "balance_loss_mlp": 1.03447771, + "epoch": 0.19101157372613858, + "flos": 25301832848640.0, + "grad_norm": 2.086619311598954, + "language_loss": 0.81596243, + "learning_rate": 3.7341565360016285e-06, + "loss": 0.83805549, + "num_input_tokens_seen": 68581650, + "router_z_loss_clip": 1.02685547, + "router_z_loss_mlp": 0.19116211, + "step": 3177, + "time_per_iteration": 2.541424036026001 + }, + { + "auxiliary_loss_clip": 0.01147903, + "auxiliary_loss_mlp": 0.01047093, + "balance_loss_clip": 1.0498358, + "balance_loss_mlp": 1.02973628, + "epoch": 0.19107169697880655, + "flos": 20558212986240.0, + "grad_norm": 2.7027054163204434, + "language_loss": 0.74937075, + "learning_rate": 3.73396248424356e-06, + "loss": 0.7713207, + "num_input_tokens_seen": 68600360, + "router_z_loss_clip": 0.97998047, + "router_z_loss_mlp": 0.17346191, + "step": 3178, + "time_per_iteration": 2.4662728309631348 + }, + { + "auxiliary_loss_clip": 0.01149347, + "auxiliary_loss_mlp": 0.01037762, + "balance_loss_clip": 1.0495286, + "balance_loss_mlp": 1.0214541, + "epoch": 0.19113182023147451, + "flos": 22163204396160.0, + "grad_norm": 2.04914077515132, + "language_loss": 0.81243992, + "learning_rate": 3.7337683667334606e-06, + "loss": 0.83431101, + "num_input_tokens_seen": 68617885, + "router_z_loss_clip": 0.99853516, + "router_z_loss_mlp": 0.16308594, + "step": 3179, + "time_per_iteration": 2.5278918743133545 + }, + { + "auxiliary_loss_clip": 0.01163424, + "auxiliary_loss_mlp": 0.01041237, + "balance_loss_clip": 1.06197429, + "balance_loss_mlp": 1.02453589, + "epoch": 0.19119194348414248, + "flos": 18581796990720.0, + "grad_norm": 4.896999882518952, + "language_loss": 0.79911005, + "learning_rate": 3.733574183478691e-06, + "loss": 0.82115662, + "num_input_tokens_seen": 68634550, + "router_z_loss_clip": 1.01513672, + "router_z_loss_mlp": 0.16711426, + "step": 3180, + "time_per_iteration": 2.420437812805176 + }, + { + "auxiliary_loss_clip": 0.01153734, + "auxiliary_loss_mlp": 0.01041695, + "balance_loss_clip": 1.05394387, + "balance_loss_mlp": 1.02420747, + "epoch": 0.19125206673681047, + "flos": 19026623018880.0, + "grad_norm": 3.005170849602274, + "language_loss": 0.79620194, + "learning_rate": 3.733379934486615e-06, + "loss": 0.81815624, + "num_input_tokens_seen": 68651895, + "router_z_loss_clip": 0.99804688, + "router_z_loss_mlp": 0.17492676, + "step": 3181, + "time_per_iteration": 2.5145199298858643 + }, + { + "auxiliary_loss_clip": 0.0115255, + "auxiliary_loss_mlp": 0.01042676, + "balance_loss_clip": 1.05166054, + "balance_loss_mlp": 1.02574825, + "epoch": 0.19131218998947844, + "flos": 21690153256320.0, + "grad_norm": 1.7568200539504488, + "language_loss": 0.73720014, + "learning_rate": 3.7331856197645973e-06, + "loss": 0.75915241, + "num_input_tokens_seen": 68671500, + "router_z_loss_clip": 1.00830078, + "router_z_loss_mlp": 0.16918945, + "step": 3182, + "time_per_iteration": 2.496990442276001 + }, + { + "auxiliary_loss_clip": 0.01159851, + "auxiliary_loss_mlp": 0.01041294, + "balance_loss_clip": 1.0589323, + "balance_loss_mlp": 1.02409244, + "epoch": 0.1913723132421464, + "flos": 18442500048000.0, + "grad_norm": 1.9526860272753157, + "language_loss": 0.65075618, + "learning_rate": 3.7329912393200084e-06, + "loss": 0.67276764, + "num_input_tokens_seen": 68690570, + "router_z_loss_clip": 1.00976562, + "router_z_loss_mlp": 0.17205811, + "step": 3183, + "time_per_iteration": 2.5168423652648926 + }, + { + "auxiliary_loss_clip": 0.01161765, + "auxiliary_loss_mlp": 0.01043014, + "balance_loss_clip": 1.06055355, + "balance_loss_mlp": 1.02518022, + "epoch": 0.19143243649481437, + "flos": 27160102033920.0, + "grad_norm": 1.72218631293055, + "language_loss": 0.73557305, + "learning_rate": 3.7327967931602173e-06, + "loss": 0.75762081, + "num_input_tokens_seen": 68709735, + "router_z_loss_clip": 1.01220703, + "router_z_loss_mlp": 0.17834473, + "step": 3184, + "time_per_iteration": 2.5112783908843994 + }, + { + "auxiliary_loss_clip": 0.01161472, + "auxiliary_loss_mlp": 0.01046793, + "balance_loss_clip": 1.05813432, + "balance_loss_mlp": 1.02808952, + "epoch": 0.19149255974748233, + "flos": 21718952985600.0, + "grad_norm": 11.159960665694141, + "language_loss": 0.88355708, + "learning_rate": 3.732602281292598e-06, + "loss": 0.90563977, + "num_input_tokens_seen": 68727565, + "router_z_loss_clip": 1.03417969, + "router_z_loss_mlp": 0.18701172, + "step": 3185, + "time_per_iteration": 2.517291784286499 + }, + { + "auxiliary_loss_clip": 0.01159786, + "auxiliary_loss_mlp": 0.01041054, + "balance_loss_clip": 1.06257927, + "balance_loss_mlp": 1.02410209, + "epoch": 0.1915526830001503, + "flos": 22963293889920.0, + "grad_norm": 2.5511173307049804, + "language_loss": 0.73329836, + "learning_rate": 3.7324077037245267e-06, + "loss": 0.75530672, + "num_input_tokens_seen": 68748110, + "router_z_loss_clip": 0.97119141, + "router_z_loss_mlp": 0.16943359, + "step": 3186, + "time_per_iteration": 2.482332229614258 + }, + { + "auxiliary_loss_clip": 0.01162648, + "auxiliary_loss_mlp": 0.01040533, + "balance_loss_clip": 1.06019163, + "balance_loss_mlp": 1.02120876, + "epoch": 0.1916128062528183, + "flos": 26140741966080.0, + "grad_norm": 1.8819167265897647, + "language_loss": 0.83822823, + "learning_rate": 3.7322130604633825e-06, + "loss": 0.86026007, + "num_input_tokens_seen": 68769765, + "router_z_loss_clip": 1.02441406, + "router_z_loss_mlp": 0.1932373, + "step": 3187, + "time_per_iteration": 2.5888476371765137 + }, + { + "auxiliary_loss_clip": 0.0107792, + "auxiliary_loss_mlp": 0.01011131, + "balance_loss_clip": 1.04165268, + "balance_loss_mlp": 1.00874352, + "epoch": 0.19167292950548626, + "flos": 54925767457920.0, + "grad_norm": 0.8640236082428165, + "language_loss": 0.55876434, + "learning_rate": 3.732018351516544e-06, + "loss": 0.57965481, + "num_input_tokens_seen": 68826815, + "router_z_loss_clip": 0.36279297, + "router_z_loss_mlp": 0.02386475, + "step": 3188, + "time_per_iteration": 3.1859312057495117 + }, + { + "auxiliary_loss_clip": 0.01152857, + "auxiliary_loss_mlp": 0.0104622, + "balance_loss_clip": 1.05474067, + "balance_loss_mlp": 1.02912498, + "epoch": 0.19173305275815422, + "flos": 29935601942400.0, + "grad_norm": 1.8874870933137264, + "language_loss": 0.70196474, + "learning_rate": 3.731823576891397e-06, + "loss": 0.72395551, + "num_input_tokens_seen": 68847585, + "router_z_loss_clip": 0.98046875, + "router_z_loss_mlp": 0.17089844, + "step": 3189, + "time_per_iteration": 2.551548719406128 + }, + { + "auxiliary_loss_clip": 0.01145848, + "auxiliary_loss_mlp": 0.01034625, + "balance_loss_clip": 1.05097318, + "balance_loss_mlp": 1.01910377, + "epoch": 0.1917931760108222, + "flos": 24752471264640.0, + "grad_norm": 2.2562943612556277, + "language_loss": 0.74009818, + "learning_rate": 3.7316287365953266e-06, + "loss": 0.76190293, + "num_input_tokens_seen": 68866620, + "router_z_loss_clip": 0.94873047, + "router_z_loss_mlp": 0.15515137, + "step": 3190, + "time_per_iteration": 2.5362322330474854 + }, + { + "auxiliary_loss_clip": 0.01154821, + "auxiliary_loss_mlp": 0.01055104, + "balance_loss_clip": 1.05687249, + "balance_loss_mlp": 1.03836739, + "epoch": 0.19185329926349015, + "flos": 18843550375680.0, + "grad_norm": 2.43028602713493, + "language_loss": 0.84162313, + "learning_rate": 3.73143383063572e-06, + "loss": 0.86372232, + "num_input_tokens_seen": 68885515, + "router_z_loss_clip": 0.97998047, + "router_z_loss_mlp": 0.1673584, + "step": 3191, + "time_per_iteration": 2.4806532859802246 + }, + { + "auxiliary_loss_clip": 0.01148415, + "auxiliary_loss_mlp": 0.01047459, + "balance_loss_clip": 1.05263138, + "balance_loss_mlp": 1.03166389, + "epoch": 0.19191342251615812, + "flos": 22086858038400.0, + "grad_norm": 1.8229399578167818, + "language_loss": 0.88991898, + "learning_rate": 3.73123885901997e-06, + "loss": 0.91187775, + "num_input_tokens_seen": 68903225, + "router_z_loss_clip": 0.95849609, + "router_z_loss_mlp": 0.15820312, + "step": 3192, + "time_per_iteration": 2.470977306365967 + }, + { + "auxiliary_loss_clip": 0.01165655, + "auxiliary_loss_mlp": 0.01048238, + "balance_loss_clip": 1.06508029, + "balance_loss_mlp": 1.03018975, + "epoch": 0.19197354576882608, + "flos": 22199115018240.0, + "grad_norm": 1.798220769009592, + "language_loss": 0.74732906, + "learning_rate": 3.7310438217554687e-06, + "loss": 0.76946795, + "num_input_tokens_seen": 68922860, + "router_z_loss_clip": 1.00488281, + "router_z_loss_mlp": 0.18041992, + "step": 3193, + "time_per_iteration": 3.9263906478881836 + }, + { + "auxiliary_loss_clip": 0.01156443, + "auxiliary_loss_mlp": 0.01044706, + "balance_loss_clip": 1.05578017, + "balance_loss_mlp": 1.02744436, + "epoch": 0.19203366902149407, + "flos": 24896185580160.0, + "grad_norm": 2.0923664235213533, + "language_loss": 0.74892962, + "learning_rate": 3.730848718849612e-06, + "loss": 0.77094114, + "num_input_tokens_seen": 68943000, + "router_z_loss_clip": 1.00683594, + "router_z_loss_mlp": 0.17272949, + "step": 3194, + "time_per_iteration": 2.5445406436920166 + }, + { + "auxiliary_loss_clip": 0.01086654, + "auxiliary_loss_mlp": 0.01017228, + "balance_loss_clip": 1.05140722, + "balance_loss_mlp": 1.0148077, + "epoch": 0.19209379227416204, + "flos": 68416722789120.0, + "grad_norm": 0.7833813576867028, + "language_loss": 0.68452096, + "learning_rate": 3.7306535503097985e-06, + "loss": 0.70555973, + "num_input_tokens_seen": 69000255, + "router_z_loss_clip": 0.35253906, + "router_z_loss_mlp": 0.02420044, + "step": 3195, + "time_per_iteration": 3.0752947330474854 + }, + { + "auxiliary_loss_clip": 0.01164014, + "auxiliary_loss_mlp": 0.0105301, + "balance_loss_clip": 1.06342912, + "balance_loss_mlp": 1.03645217, + "epoch": 0.19215391552683, + "flos": 22055185221120.0, + "grad_norm": 3.989434684869215, + "language_loss": 0.72814453, + "learning_rate": 3.730458316143429e-06, + "loss": 0.75031477, + "num_input_tokens_seen": 69019665, + "router_z_loss_clip": 1.00488281, + "router_z_loss_mlp": 0.16564941, + "step": 3196, + "time_per_iteration": 2.463172435760498 + }, + { + "auxiliary_loss_clip": 0.01161512, + "auxiliary_loss_mlp": 0.01043628, + "balance_loss_clip": 1.06430697, + "balance_loss_mlp": 1.02734399, + "epoch": 0.19221403877949797, + "flos": 20302959962880.0, + "grad_norm": 2.005814365775389, + "language_loss": 0.83446586, + "learning_rate": 3.7302630163579068e-06, + "loss": 0.8565172, + "num_input_tokens_seen": 69039055, + "router_z_loss_clip": 0.97216797, + "router_z_loss_mlp": 0.16271973, + "step": 3197, + "time_per_iteration": 2.4647629261016846 + }, + { + "auxiliary_loss_clip": 0.01153768, + "auxiliary_loss_mlp": 0.0104793, + "balance_loss_clip": 1.05469894, + "balance_loss_mlp": 1.03053761, + "epoch": 0.19227416203216594, + "flos": 23185329811200.0, + "grad_norm": 2.327324353297063, + "language_loss": 0.80354011, + "learning_rate": 3.7300676509606373e-06, + "loss": 0.82555711, + "num_input_tokens_seen": 69056370, + "router_z_loss_clip": 0.99121094, + "router_z_loss_mlp": 0.1739502, + "step": 3198, + "time_per_iteration": 2.5320632457733154 + }, + { + "auxiliary_loss_clip": 0.01151038, + "auxiliary_loss_mlp": 0.01049616, + "balance_loss_clip": 1.0507443, + "balance_loss_mlp": 1.03147292, + "epoch": 0.1923342852848339, + "flos": 25776607841280.0, + "grad_norm": 2.1172092281702475, + "language_loss": 0.78974557, + "learning_rate": 3.729872219959029e-06, + "loss": 0.81175208, + "num_input_tokens_seen": 69075915, + "router_z_loss_clip": 1.00292969, + "router_z_loss_mlp": 0.18151855, + "step": 3199, + "time_per_iteration": 3.9036788940429688 + }, + { + "auxiliary_loss_clip": 0.01159734, + "auxiliary_loss_mlp": 0.01047364, + "balance_loss_clip": 1.06006241, + "balance_loss_mlp": 1.0309968, + "epoch": 0.19239440853750187, + "flos": 17128349061120.0, + "grad_norm": 2.2141891029446894, + "language_loss": 0.83686769, + "learning_rate": 3.7296767233604934e-06, + "loss": 0.85893869, + "num_input_tokens_seen": 69094145, + "router_z_loss_clip": 0.99707031, + "router_z_loss_mlp": 0.16345215, + "step": 3200, + "time_per_iteration": 2.42010760307312 + }, + { + "auxiliary_loss_clip": 0.01154021, + "auxiliary_loss_mlp": 0.01050676, + "balance_loss_clip": 1.0546093, + "balance_loss_mlp": 1.0338676, + "epoch": 0.19245453179016986, + "flos": 16435093593600.0, + "grad_norm": 1.8896141363259256, + "language_loss": 0.79494578, + "learning_rate": 3.729481161172443e-06, + "loss": 0.81699276, + "num_input_tokens_seen": 69111110, + "router_z_loss_clip": 0.99414062, + "router_z_loss_mlp": 0.16821289, + "step": 3201, + "time_per_iteration": 3.8426880836486816 + }, + { + "auxiliary_loss_clip": 0.01152173, + "auxiliary_loss_mlp": 0.01038395, + "balance_loss_clip": 1.05172229, + "balance_loss_mlp": 1.02200341, + "epoch": 0.19251465504283782, + "flos": 20230276792320.0, + "grad_norm": 2.262369296783004, + "language_loss": 0.69379377, + "learning_rate": 3.7292855334022927e-06, + "loss": 0.71569949, + "num_input_tokens_seen": 69130280, + "router_z_loss_clip": 1.00585938, + "router_z_loss_mlp": 0.16394043, + "step": 3202, + "time_per_iteration": 2.455768585205078 + }, + { + "auxiliary_loss_clip": 0.01154593, + "auxiliary_loss_mlp": 0.01041228, + "balance_loss_clip": 1.05640507, + "balance_loss_mlp": 1.02276301, + "epoch": 0.1925747782955058, + "flos": 19464374067840.0, + "grad_norm": 1.8915343018934672, + "language_loss": 0.91274464, + "learning_rate": 3.7290898400574627e-06, + "loss": 0.93470287, + "num_input_tokens_seen": 69149570, + "router_z_loss_clip": 0.98095703, + "router_z_loss_mlp": 0.18469238, + "step": 3203, + "time_per_iteration": 2.449732780456543 + }, + { + "auxiliary_loss_clip": 0.01155431, + "auxiliary_loss_mlp": 0.01040571, + "balance_loss_clip": 1.05621207, + "balance_loss_mlp": 1.0229876, + "epoch": 0.19263490154817375, + "flos": 17785586165760.0, + "grad_norm": 2.3306768877467423, + "language_loss": 0.81740355, + "learning_rate": 3.7288940811453725e-06, + "loss": 0.83936352, + "num_input_tokens_seen": 69168190, + "router_z_loss_clip": 0.99169922, + "router_z_loss_mlp": 0.17578125, + "step": 3204, + "time_per_iteration": 3.8176419734954834 + }, + { + "auxiliary_loss_clip": 0.01155453, + "auxiliary_loss_mlp": 0.01041776, + "balance_loss_clip": 1.0567807, + "balance_loss_mlp": 1.02463412, + "epoch": 0.19269502480084172, + "flos": 17457075354240.0, + "grad_norm": 1.9680122162553502, + "language_loss": 0.75851011, + "learning_rate": 3.7286982566734454e-06, + "loss": 0.78048253, + "num_input_tokens_seen": 69186950, + "router_z_loss_clip": 0.98730469, + "router_z_loss_mlp": 0.17150879, + "step": 3205, + "time_per_iteration": 2.4627747535705566 + }, + { + "auxiliary_loss_clip": 0.01161077, + "auxiliary_loss_mlp": 0.01043567, + "balance_loss_clip": 1.06082082, + "balance_loss_mlp": 1.02669835, + "epoch": 0.19275514805350968, + "flos": 21506901045120.0, + "grad_norm": 2.5227532071355365, + "language_loss": 0.83491313, + "learning_rate": 3.728502366649107e-06, + "loss": 0.85695952, + "num_input_tokens_seen": 69204850, + "router_z_loss_clip": 1.00292969, + "router_z_loss_mlp": 0.16870117, + "step": 3206, + "time_per_iteration": 2.5366032123565674 + }, + { + "auxiliary_loss_clip": 0.01077054, + "auxiliary_loss_mlp": 0.01012517, + "balance_loss_clip": 1.04103768, + "balance_loss_mlp": 1.01054382, + "epoch": 0.19281527130617768, + "flos": 47695979738880.0, + "grad_norm": 0.8418182474797765, + "language_loss": 0.60592431, + "learning_rate": 3.728306411079786e-06, + "loss": 0.62682003, + "num_input_tokens_seen": 69259200, + "router_z_loss_clip": 0.35986328, + "router_z_loss_mlp": 0.01971436, + "step": 3207, + "time_per_iteration": 2.907240867614746 + }, + { + "auxiliary_loss_clip": 0.0115675, + "auxiliary_loss_mlp": 0.01047147, + "balance_loss_clip": 1.05597651, + "balance_loss_mlp": 1.03006423, + "epoch": 0.19287539455884564, + "flos": 11801252672640.0, + "grad_norm": 3.495299483177343, + "language_loss": 0.7509104, + "learning_rate": 3.7281103899729125e-06, + "loss": 0.7729494, + "num_input_tokens_seen": 69275835, + "router_z_loss_clip": 1.00732422, + "router_z_loss_mlp": 0.1706543, + "step": 3208, + "time_per_iteration": 2.4863836765289307 + }, + { + "auxiliary_loss_clip": 0.01158735, + "auxiliary_loss_mlp": 0.01045813, + "balance_loss_clip": 1.05538702, + "balance_loss_mlp": 1.02763367, + "epoch": 0.1929355178115136, + "flos": 20631434860800.0, + "grad_norm": 1.9507780626180744, + "language_loss": 0.60351801, + "learning_rate": 3.7279143033359195e-06, + "loss": 0.62556344, + "num_input_tokens_seen": 69294810, + "router_z_loss_clip": 1.03320312, + "router_z_loss_mlp": 0.1817627, + "step": 3209, + "time_per_iteration": 2.4498276710510254 + }, + { + "auxiliary_loss_clip": 0.01152066, + "auxiliary_loss_mlp": 0.01053435, + "balance_loss_clip": 1.05042124, + "balance_loss_mlp": 1.03290761, + "epoch": 0.19299564106418157, + "flos": 40807916058240.0, + "grad_norm": 2.0374111835888824, + "language_loss": 0.79984677, + "learning_rate": 3.727718151176243e-06, + "loss": 0.8219018, + "num_input_tokens_seen": 69316065, + "router_z_loss_clip": 1.01611328, + "router_z_loss_mlp": 0.2052002, + "step": 3210, + "time_per_iteration": 2.6887643337249756 + }, + { + "auxiliary_loss_clip": 0.01151486, + "auxiliary_loss_mlp": 0.01039185, + "balance_loss_clip": 1.05285323, + "balance_loss_mlp": 1.02284098, + "epoch": 0.19305576431684954, + "flos": 11361418634880.0, + "grad_norm": 2.092585656949012, + "language_loss": 0.82878649, + "learning_rate": 3.7275219335013217e-06, + "loss": 0.85069323, + "num_input_tokens_seen": 69332900, + "router_z_loss_clip": 0.98632812, + "router_z_loss_mlp": 0.16357422, + "step": 3211, + "time_per_iteration": 2.416015863418579 + }, + { + "auxiliary_loss_clip": 0.01074649, + "auxiliary_loss_mlp": 0.01012983, + "balance_loss_clip": 1.03935742, + "balance_loss_mlp": 1.01033974, + "epoch": 0.1931158875695175, + "flos": 54511895975040.0, + "grad_norm": 0.9705886789212921, + "language_loss": 0.63653708, + "learning_rate": 3.7273256503185953e-06, + "loss": 0.65741336, + "num_input_tokens_seen": 69382535, + "router_z_loss_clip": 0.35400391, + "router_z_loss_mlp": 0.02645874, + "step": 3212, + "time_per_iteration": 2.9461660385131836 + }, + { + "auxiliary_loss_clip": 0.01151483, + "auxiliary_loss_mlp": 0.0104858, + "balance_loss_clip": 1.05359137, + "balance_loss_mlp": 1.03217649, + "epoch": 0.19317601082218547, + "flos": 19828436365440.0, + "grad_norm": 1.7220950812287745, + "language_loss": 0.76175374, + "learning_rate": 3.7271293016355074e-06, + "loss": 0.78375435, + "num_input_tokens_seen": 69400600, + "router_z_loss_clip": 0.97900391, + "router_z_loss_mlp": 0.16394043, + "step": 3213, + "time_per_iteration": 2.5883097648620605 + }, + { + "auxiliary_loss_clip": 0.01164925, + "auxiliary_loss_mlp": 0.01049505, + "balance_loss_clip": 1.06194186, + "balance_loss_mlp": 1.03187406, + "epoch": 0.19323613407485346, + "flos": 13152068467200.0, + "grad_norm": 1.872155866837776, + "language_loss": 0.7109493, + "learning_rate": 3.726932887459503e-06, + "loss": 0.73309362, + "num_input_tokens_seen": 69417350, + "router_z_loss_clip": 1.02832031, + "router_z_loss_mlp": 0.17651367, + "step": 3214, + "time_per_iteration": 2.509460687637329 + }, + { + "auxiliary_loss_clip": 0.0115745, + "auxiliary_loss_mlp": 0.01048724, + "balance_loss_clip": 1.05490565, + "balance_loss_mlp": 1.03068709, + "epoch": 0.19329625732752143, + "flos": 14027247342720.0, + "grad_norm": 2.5732854581665343, + "language_loss": 0.75579917, + "learning_rate": 3.72673640779803e-06, + "loss": 0.77786094, + "num_input_tokens_seen": 69431845, + "router_z_loss_clip": 1.02636719, + "router_z_loss_mlp": 0.18054199, + "step": 3215, + "time_per_iteration": 2.4096014499664307 + }, + { + "auxiliary_loss_clip": 0.011489, + "auxiliary_loss_mlp": 0.01050824, + "balance_loss_clip": 1.05267811, + "balance_loss_mlp": 1.03309774, + "epoch": 0.1933563805801894, + "flos": 23441732069760.0, + "grad_norm": 2.7072245981643115, + "language_loss": 0.88612175, + "learning_rate": 3.72653986265854e-06, + "loss": 0.90811902, + "num_input_tokens_seen": 69453275, + "router_z_loss_clip": 0.96191406, + "router_z_loss_mlp": 0.17724609, + "step": 3216, + "time_per_iteration": 2.53922176361084 + }, + { + "auxiliary_loss_clip": 0.01157561, + "auxiliary_loss_mlp": 0.0104451, + "balance_loss_clip": 1.05923271, + "balance_loss_mlp": 1.02863097, + "epoch": 0.19341650383285736, + "flos": 20485314334080.0, + "grad_norm": 2.0158121439152135, + "language_loss": 0.80462301, + "learning_rate": 3.726343252048485e-06, + "loss": 0.82664371, + "num_input_tokens_seen": 69471830, + "router_z_loss_clip": 0.98339844, + "router_z_loss_mlp": 0.15881348, + "step": 3217, + "time_per_iteration": 2.4773948192596436 + }, + { + "auxiliary_loss_clip": 0.01172914, + "auxiliary_loss_mlp": 0.01049193, + "balance_loss_clip": 1.06522965, + "balance_loss_mlp": 1.0304172, + "epoch": 0.19347662708552532, + "flos": 17858484817920.0, + "grad_norm": 2.3849164073356817, + "language_loss": 0.6240325, + "learning_rate": 3.7261465759753206e-06, + "loss": 0.64625353, + "num_input_tokens_seen": 69489320, + "router_z_loss_clip": 1.07568359, + "router_z_loss_mlp": 0.18774414, + "step": 3218, + "time_per_iteration": 2.440972328186035 + }, + { + "auxiliary_loss_clip": 0.01157609, + "auxiliary_loss_mlp": 0.01043821, + "balance_loss_clip": 1.05586517, + "balance_loss_mlp": 1.02720332, + "epoch": 0.1935367503381933, + "flos": 18187247024640.0, + "grad_norm": 1.674565865832766, + "language_loss": 0.80133849, + "learning_rate": 3.7259498344465053e-06, + "loss": 0.82335281, + "num_input_tokens_seen": 69506665, + "router_z_loss_clip": 1.01757812, + "router_z_loss_mlp": 0.1661377, + "step": 3219, + "time_per_iteration": 2.4150524139404297 + }, + { + "auxiliary_loss_clip": 0.01168274, + "auxiliary_loss_mlp": 0.01048318, + "balance_loss_clip": 1.06759691, + "balance_loss_mlp": 1.03104496, + "epoch": 0.19359687359086128, + "flos": 15957122290560.0, + "grad_norm": 2.7237643133918086, + "language_loss": 0.86406064, + "learning_rate": 3.7257530274694993e-06, + "loss": 0.88622653, + "num_input_tokens_seen": 69523835, + "router_z_loss_clip": 1.00634766, + "router_z_loss_mlp": 0.17272949, + "step": 3220, + "time_per_iteration": 2.433570623397827 + }, + { + "auxiliary_loss_clip": 0.01170486, + "auxiliary_loss_mlp": 0.01036754, + "balance_loss_clip": 1.07246637, + "balance_loss_mlp": 1.02238941, + "epoch": 0.19365699684352924, + "flos": 21215198695680.0, + "grad_norm": 2.4755158609111816, + "language_loss": 0.84109807, + "learning_rate": 3.725556155051766e-06, + "loss": 0.8631705, + "num_input_tokens_seen": 69542620, + "router_z_loss_clip": 0.97900391, + "router_z_loss_mlp": 0.14361572, + "step": 3221, + "time_per_iteration": 2.4502620697021484 + }, + { + "auxiliary_loss_clip": 0.01157687, + "auxiliary_loss_mlp": 0.0104182, + "balance_loss_clip": 1.06200612, + "balance_loss_mlp": 1.02621603, + "epoch": 0.1937171200961972, + "flos": 17311098481920.0, + "grad_norm": 2.1110754125112234, + "language_loss": 0.85935986, + "learning_rate": 3.7253592172007702e-06, + "loss": 0.88135493, + "num_input_tokens_seen": 69561130, + "router_z_loss_clip": 0.95654297, + "router_z_loss_mlp": 0.15612793, + "step": 3222, + "time_per_iteration": 2.4667534828186035 + }, + { + "auxiliary_loss_clip": 0.01156287, + "auxiliary_loss_mlp": 0.01040581, + "balance_loss_clip": 1.05761051, + "balance_loss_mlp": 1.02358127, + "epoch": 0.19377724334886517, + "flos": 22635968227200.0, + "grad_norm": 4.037195685747347, + "language_loss": 0.78430974, + "learning_rate": 3.72516221392398e-06, + "loss": 0.80627835, + "num_input_tokens_seen": 69580425, + "router_z_loss_clip": 0.98681641, + "router_z_loss_mlp": 0.16992188, + "step": 3223, + "time_per_iteration": 2.4469380378723145 + }, + { + "auxiliary_loss_clip": 0.01153777, + "auxiliary_loss_mlp": 0.01041813, + "balance_loss_clip": 1.05814576, + "balance_loss_mlp": 1.0251832, + "epoch": 0.19383736660153314, + "flos": 15077813351040.0, + "grad_norm": 1.8403498692931042, + "language_loss": 0.75829428, + "learning_rate": 3.7249651452288653e-06, + "loss": 0.78025019, + "num_input_tokens_seen": 69597085, + "router_z_loss_clip": 0.95605469, + "router_z_loss_mlp": 0.16638184, + "step": 3224, + "time_per_iteration": 2.4661502838134766 + }, + { + "auxiliary_loss_clip": 0.01158568, + "auxiliary_loss_mlp": 0.0104262, + "balance_loss_clip": 1.06062222, + "balance_loss_mlp": 1.02612174, + "epoch": 0.1938974898542011, + "flos": 47119934350080.0, + "grad_norm": 2.6076076570086437, + "language_loss": 0.71220434, + "learning_rate": 3.7247680111229e-06, + "loss": 0.73421621, + "num_input_tokens_seen": 69618885, + "router_z_loss_clip": 0.97900391, + "router_z_loss_mlp": 0.16491699, + "step": 3225, + "time_per_iteration": 2.7059900760650635 + }, + { + "auxiliary_loss_clip": 0.01157313, + "auxiliary_loss_mlp": 0.01044122, + "balance_loss_clip": 1.05802989, + "balance_loss_mlp": 1.02756357, + "epoch": 0.19395761310686907, + "flos": 25812554376960.0, + "grad_norm": 2.500343267647936, + "language_loss": 0.69568044, + "learning_rate": 3.7245708116135585e-06, + "loss": 0.71769476, + "num_input_tokens_seen": 69638200, + "router_z_loss_clip": 0.99121094, + "router_z_loss_mlp": 0.16564941, + "step": 3226, + "time_per_iteration": 2.5158441066741943 + }, + { + "auxiliary_loss_clip": 0.01157822, + "auxiliary_loss_mlp": 0.01041165, + "balance_loss_clip": 1.05866218, + "balance_loss_mlp": 1.02364159, + "epoch": 0.19401773635953706, + "flos": 23039604334080.0, + "grad_norm": 2.7852829486016284, + "language_loss": 0.76129544, + "learning_rate": 3.7243735467083193e-06, + "loss": 0.78328526, + "num_input_tokens_seen": 69657550, + "router_z_loss_clip": 0.9921875, + "router_z_loss_mlp": 0.17529297, + "step": 3227, + "time_per_iteration": 2.5200436115264893 + }, + { + "auxiliary_loss_clip": 0.01157441, + "auxiliary_loss_mlp": 0.0104669, + "balance_loss_clip": 1.05312872, + "balance_loss_mlp": 1.03014421, + "epoch": 0.19407785961220503, + "flos": 15920780705280.0, + "grad_norm": 2.4687391415190607, + "language_loss": 0.69044292, + "learning_rate": 3.724176216414662e-06, + "loss": 0.71248424, + "num_input_tokens_seen": 69675005, + "router_z_loss_clip": 1.04443359, + "router_z_loss_mlp": 0.16540527, + "step": 3228, + "time_per_iteration": 2.4966747760772705 + }, + { + "auxiliary_loss_clip": 0.01155506, + "auxiliary_loss_mlp": 0.01042812, + "balance_loss_clip": 1.05654287, + "balance_loss_mlp": 1.02634907, + "epoch": 0.194137982864873, + "flos": 25921722787200.0, + "grad_norm": 2.7086283191335006, + "language_loss": 0.74131209, + "learning_rate": 3.72397882074007e-06, + "loss": 0.76329523, + "num_input_tokens_seen": 69696455, + "router_z_loss_clip": 0.99072266, + "router_z_loss_mlp": 0.16455078, + "step": 3229, + "time_per_iteration": 2.520679473876953 + }, + { + "auxiliary_loss_clip": 0.01159343, + "auxiliary_loss_mlp": 0.01043269, + "balance_loss_clip": 1.06095576, + "balance_loss_mlp": 1.02733064, + "epoch": 0.19419810611754096, + "flos": 13261344618240.0, + "grad_norm": 2.2918614247821023, + "language_loss": 0.65702641, + "learning_rate": 3.7237813596920285e-06, + "loss": 0.67905259, + "num_input_tokens_seen": 69714245, + "router_z_loss_clip": 0.98291016, + "router_z_loss_mlp": 0.15942383, + "step": 3230, + "time_per_iteration": 2.5065410137176514 + }, + { + "auxiliary_loss_clip": 0.01154853, + "auxiliary_loss_mlp": 0.01041925, + "balance_loss_clip": 1.05733943, + "balance_loss_mlp": 1.0248903, + "epoch": 0.19425822937020892, + "flos": 15705568368000.0, + "grad_norm": 2.321378982026431, + "language_loss": 0.81784087, + "learning_rate": 3.7235838332780254e-06, + "loss": 0.83980864, + "num_input_tokens_seen": 69731515, + "router_z_loss_clip": 0.97509766, + "router_z_loss_mlp": 0.17053223, + "step": 3231, + "time_per_iteration": 2.4436941146850586 + }, + { + "auxiliary_loss_clip": 0.01170273, + "auxiliary_loss_mlp": 0.01042441, + "balance_loss_clip": 1.06886053, + "balance_loss_mlp": 1.02467883, + "epoch": 0.1943183526228769, + "flos": 23105392093440.0, + "grad_norm": 2.345738809487913, + "language_loss": 0.86822367, + "learning_rate": 3.72338624150555e-06, + "loss": 0.89035076, + "num_input_tokens_seen": 69748885, + "router_z_loss_clip": 1.01269531, + "router_z_loss_mlp": 0.1776123, + "step": 3232, + "time_per_iteration": 2.5236406326293945 + }, + { + "auxiliary_loss_clip": 0.01153951, + "auxiliary_loss_mlp": 0.01042885, + "balance_loss_clip": 1.05734181, + "balance_loss_mlp": 1.02620721, + "epoch": 0.19437847587554485, + "flos": 24712610146560.0, + "grad_norm": 1.8729512444623542, + "language_loss": 0.85388219, + "learning_rate": 3.723188584382096e-06, + "loss": 0.87585062, + "num_input_tokens_seen": 69767540, + "router_z_loss_clip": 0.96630859, + "router_z_loss_mlp": 0.16674805, + "step": 3233, + "time_per_iteration": 2.4768075942993164 + }, + { + "auxiliary_loss_clip": 0.01156764, + "auxiliary_loss_mlp": 0.01054161, + "balance_loss_clip": 1.05575657, + "balance_loss_mlp": 1.03616047, + "epoch": 0.19443859912821285, + "flos": 23116130259840.0, + "grad_norm": 1.9506984878767022, + "language_loss": 0.89170825, + "learning_rate": 3.722990861915158e-06, + "loss": 0.91381752, + "num_input_tokens_seen": 69789340, + "router_z_loss_clip": 1.00927734, + "router_z_loss_mlp": 0.17980957, + "step": 3234, + "time_per_iteration": 2.5238325595855713 + }, + { + "auxiliary_loss_clip": 0.01163175, + "auxiliary_loss_mlp": 0.01045013, + "balance_loss_clip": 1.0585233, + "balance_loss_mlp": 1.02812076, + "epoch": 0.1944987223808808, + "flos": 15084385539840.0, + "grad_norm": 2.6496639544899425, + "language_loss": 0.78664815, + "learning_rate": 3.722793074112234e-06, + "loss": 0.80873001, + "num_input_tokens_seen": 69806470, + "router_z_loss_clip": 1.04736328, + "router_z_loss_mlp": 0.16894531, + "step": 3235, + "time_per_iteration": 2.431875228881836 + }, + { + "auxiliary_loss_clip": 0.01157088, + "auxiliary_loss_mlp": 0.01055554, + "balance_loss_clip": 1.05822396, + "balance_loss_mlp": 1.03754103, + "epoch": 0.19455884563354878, + "flos": 17126876603520.0, + "grad_norm": 1.9106902970388489, + "language_loss": 0.79297817, + "learning_rate": 3.7225952209808233e-06, + "loss": 0.8151046, + "num_input_tokens_seen": 69822655, + "router_z_loss_clip": 0.98779297, + "router_z_loss_mlp": 0.18017578, + "step": 3236, + "time_per_iteration": 2.4946482181549072 + }, + { + "auxiliary_loss_clip": 0.01168964, + "auxiliary_loss_mlp": 0.01040741, + "balance_loss_clip": 1.06953287, + "balance_loss_mlp": 1.0236342, + "epoch": 0.19461896888621674, + "flos": 20193396503040.0, + "grad_norm": 1.9568000017401657, + "language_loss": 0.75422966, + "learning_rate": 3.72239730252843e-06, + "loss": 0.77632672, + "num_input_tokens_seen": 69841895, + "router_z_loss_clip": 0.99316406, + "router_z_loss_mlp": 0.17102051, + "step": 3237, + "time_per_iteration": 4.1178672313690186 + }, + { + "auxiliary_loss_clip": 0.01160503, + "auxiliary_loss_mlp": 0.01048581, + "balance_loss_clip": 1.05787921, + "balance_loss_mlp": 1.03209412, + "epoch": 0.1946790921388847, + "flos": 25301365971840.0, + "grad_norm": 1.7944653078365465, + "language_loss": 0.74757504, + "learning_rate": 3.7221993187625583e-06, + "loss": 0.7696659, + "num_input_tokens_seen": 69862220, + "router_z_loss_clip": 1.02587891, + "router_z_loss_mlp": 0.16467285, + "step": 3238, + "time_per_iteration": 2.4933416843414307 + }, + { + "auxiliary_loss_clip": 0.01168717, + "auxiliary_loss_mlp": 0.01043336, + "balance_loss_clip": 1.06642318, + "balance_loss_mlp": 1.02544212, + "epoch": 0.19473921539155267, + "flos": 20193396503040.0, + "grad_norm": 1.8590220096971675, + "language_loss": 0.73499858, + "learning_rate": 3.7220012696907155e-06, + "loss": 0.75711918, + "num_input_tokens_seen": 69881830, + "router_z_loss_clip": 1.02392578, + "router_z_loss_mlp": 0.17907715, + "step": 3239, + "time_per_iteration": 2.4790091514587402 + }, + { + "auxiliary_loss_clip": 0.01154523, + "auxiliary_loss_mlp": 0.01042282, + "balance_loss_clip": 1.05676055, + "balance_loss_mlp": 1.02562869, + "epoch": 0.19479933864422067, + "flos": 20887549810560.0, + "grad_norm": 2.0950548885514797, + "language_loss": 0.73715675, + "learning_rate": 3.721803155320412e-06, + "loss": 0.75912482, + "num_input_tokens_seen": 69900515, + "router_z_loss_clip": 0.9765625, + "router_z_loss_mlp": 0.16674805, + "step": 3240, + "time_per_iteration": 2.454690933227539 + }, + { + "auxiliary_loss_clip": 0.01163211, + "auxiliary_loss_mlp": 0.01040798, + "balance_loss_clip": 1.06142974, + "balance_loss_mlp": 1.02456212, + "epoch": 0.19485946189688863, + "flos": 23295072839040.0, + "grad_norm": 2.9674514114206114, + "language_loss": 0.67091656, + "learning_rate": 3.7216049756591606e-06, + "loss": 0.69295669, + "num_input_tokens_seen": 69920060, + "router_z_loss_clip": 1.01904297, + "router_z_loss_mlp": 0.16235352, + "step": 3241, + "time_per_iteration": 2.6191420555114746 + }, + { + "auxiliary_loss_clip": 0.01158442, + "auxiliary_loss_mlp": 0.01041747, + "balance_loss_clip": 1.06061745, + "balance_loss_mlp": 1.02520132, + "epoch": 0.1949195851495566, + "flos": 23295036925440.0, + "grad_norm": 1.3414564158091447, + "language_loss": 0.83117712, + "learning_rate": 3.7214067307144754e-06, + "loss": 0.85317904, + "num_input_tokens_seen": 69939820, + "router_z_loss_clip": 0.97851562, + "router_z_loss_mlp": 0.16564941, + "step": 3242, + "time_per_iteration": 2.47253155708313 + }, + { + "auxiliary_loss_clip": 0.01074842, + "auxiliary_loss_mlp": 0.01012792, + "balance_loss_clip": 1.04154658, + "balance_loss_mlp": 1.00959158, + "epoch": 0.19497970840222456, + "flos": 64962871557120.0, + "grad_norm": 0.8506710351103023, + "language_loss": 0.57639337, + "learning_rate": 3.721208420493875e-06, + "loss": 0.59726971, + "num_input_tokens_seen": 70002145, + "router_z_loss_clip": 0.33300781, + "router_z_loss_mlp": 0.03201294, + "step": 3243, + "time_per_iteration": 4.523228883743286 + }, + { + "auxiliary_loss_clip": 0.01153227, + "auxiliary_loss_mlp": 0.01042317, + "balance_loss_clip": 1.0539012, + "balance_loss_mlp": 1.02494812, + "epoch": 0.19503983165489253, + "flos": 19644717277440.0, + "grad_norm": 2.029410405066854, + "language_loss": 0.83680356, + "learning_rate": 3.7210100450048784e-06, + "loss": 0.85875905, + "num_input_tokens_seen": 70020510, + "router_z_loss_clip": 0.9921875, + "router_z_loss_mlp": 0.17358398, + "step": 3244, + "time_per_iteration": 2.535921573638916 + }, + { + "auxiliary_loss_clip": 0.01158498, + "auxiliary_loss_mlp": 0.01057296, + "balance_loss_clip": 1.0596447, + "balance_loss_mlp": 1.03891397, + "epoch": 0.1950999549075605, + "flos": 21141976821120.0, + "grad_norm": 1.6856757103808424, + "language_loss": 0.76776898, + "learning_rate": 3.7208116042550088e-06, + "loss": 0.78992695, + "num_input_tokens_seen": 70040760, + "router_z_loss_clip": 0.99023438, + "router_z_loss_mlp": 0.18371582, + "step": 3245, + "time_per_iteration": 3.9730477333068848 + }, + { + "auxiliary_loss_clip": 0.01157977, + "auxiliary_loss_mlp": 0.01048198, + "balance_loss_clip": 1.05780602, + "balance_loss_mlp": 1.0289216, + "epoch": 0.19516007816022846, + "flos": 20884820376960.0, + "grad_norm": 2.5264013545412545, + "language_loss": 0.84053338, + "learning_rate": 3.7206130982517906e-06, + "loss": 0.86259508, + "num_input_tokens_seen": 70058720, + "router_z_loss_clip": 1.00292969, + "router_z_loss_mlp": 0.19274902, + "step": 3246, + "time_per_iteration": 2.4550132751464844 + }, + { + "auxiliary_loss_clip": 0.01163612, + "auxiliary_loss_mlp": 0.01046111, + "balance_loss_clip": 1.06303334, + "balance_loss_mlp": 1.02908754, + "epoch": 0.19522020141289645, + "flos": 16910515031040.0, + "grad_norm": 2.434663859956937, + "language_loss": 0.76203209, + "learning_rate": 3.7204145270027514e-06, + "loss": 0.78412938, + "num_input_tokens_seen": 70076470, + "router_z_loss_clip": 1.00488281, + "router_z_loss_mlp": 0.17016602, + "step": 3247, + "time_per_iteration": 3.9206466674804688 + }, + { + "auxiliary_loss_clip": 0.01168157, + "auxiliary_loss_mlp": 0.01048085, + "balance_loss_clip": 1.06639695, + "balance_loss_mlp": 1.0302515, + "epoch": 0.19528032466556441, + "flos": 26724829023360.0, + "grad_norm": 1.5251897764508036, + "language_loss": 0.75428611, + "learning_rate": 3.720215890515421e-06, + "loss": 0.77644855, + "num_input_tokens_seen": 70096220, + "router_z_loss_clip": 1.01708984, + "router_z_loss_mlp": 0.17834473, + "step": 3248, + "time_per_iteration": 2.5066235065460205 + }, + { + "auxiliary_loss_clip": 0.01165394, + "auxiliary_loss_mlp": 0.0104557, + "balance_loss_clip": 1.06404757, + "balance_loss_mlp": 1.02843964, + "epoch": 0.19534044791823238, + "flos": 21032808410880.0, + "grad_norm": 2.142341522881688, + "language_loss": 0.7843374, + "learning_rate": 3.7200171887973316e-06, + "loss": 0.80644703, + "num_input_tokens_seen": 70114800, + "router_z_loss_clip": 1.01416016, + "router_z_loss_mlp": 0.17138672, + "step": 3249, + "time_per_iteration": 2.492013692855835 + }, + { + "auxiliary_loss_clip": 0.01162956, + "auxiliary_loss_mlp": 0.01050767, + "balance_loss_clip": 1.06087947, + "balance_loss_mlp": 1.03212261, + "epoch": 0.19540057117090034, + "flos": 22344050396160.0, + "grad_norm": 1.630870464196987, + "language_loss": 0.73138297, + "learning_rate": 3.7198184218560176e-06, + "loss": 0.75352019, + "num_input_tokens_seen": 70134930, + "router_z_loss_clip": 1.02099609, + "router_z_loss_mlp": 0.18652344, + "step": 3250, + "time_per_iteration": 2.4687185287475586 + }, + { + "auxiliary_loss_clip": 0.01165071, + "auxiliary_loss_mlp": 0.01039178, + "balance_loss_clip": 1.06715429, + "balance_loss_mlp": 1.02334726, + "epoch": 0.1954606944235683, + "flos": 20301631159680.0, + "grad_norm": 2.484602018998457, + "language_loss": 0.79208481, + "learning_rate": 3.719619589699017e-06, + "loss": 0.81412733, + "num_input_tokens_seen": 70152045, + "router_z_loss_clip": 0.97900391, + "router_z_loss_mlp": 0.15808105, + "step": 3251, + "time_per_iteration": 2.581005096435547 + }, + { + "auxiliary_loss_clip": 0.01164879, + "auxiliary_loss_mlp": 0.01042497, + "balance_loss_clip": 1.0616734, + "balance_loss_mlp": 1.02510428, + "epoch": 0.19552081767623627, + "flos": 17346865449600.0, + "grad_norm": 2.17059055379221, + "language_loss": 0.83931911, + "learning_rate": 3.7194206923338695e-06, + "loss": 0.86139286, + "num_input_tokens_seen": 70169240, + "router_z_loss_clip": 1.03320312, + "router_z_loss_mlp": 0.17382812, + "step": 3252, + "time_per_iteration": 2.502826452255249 + }, + { + "auxiliary_loss_clip": 0.01166772, + "auxiliary_loss_mlp": 0.01049314, + "balance_loss_clip": 1.0605464, + "balance_loss_mlp": 1.03024018, + "epoch": 0.19558094092890424, + "flos": 31977626129280.0, + "grad_norm": 1.941883065133969, + "language_loss": 0.73858249, + "learning_rate": 3.719221729768117e-06, + "loss": 0.76074338, + "num_input_tokens_seen": 70192690, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.19067383, + "step": 3253, + "time_per_iteration": 2.659471035003662 + }, + { + "auxiliary_loss_clip": 0.01169946, + "auxiliary_loss_mlp": 0.01041966, + "balance_loss_clip": 1.06453347, + "balance_loss_mlp": 1.02408433, + "epoch": 0.19564106418157223, + "flos": 22268889187200.0, + "grad_norm": 1.9017545458343366, + "language_loss": 0.7635082, + "learning_rate": 3.7190227020093037e-06, + "loss": 0.78562731, + "num_input_tokens_seen": 70209685, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.17883301, + "step": 3254, + "time_per_iteration": 2.480678081512451 + }, + { + "auxiliary_loss_clip": 0.01099018, + "auxiliary_loss_mlp": 0.01004903, + "balance_loss_clip": 1.06378841, + "balance_loss_mlp": 1.00196171, + "epoch": 0.1957011874342402, + "flos": 54364554385920.0, + "grad_norm": 0.776706301277216, + "language_loss": 0.55260408, + "learning_rate": 3.7188236090649774e-06, + "loss": 0.57364333, + "num_input_tokens_seen": 70265050, + "router_z_loss_clip": 0.35253906, + "router_z_loss_mlp": 0.02941895, + "step": 3255, + "time_per_iteration": 3.115544080734253 + }, + { + "auxiliary_loss_clip": 0.01167171, + "auxiliary_loss_mlp": 0.0104216, + "balance_loss_clip": 1.06553388, + "balance_loss_mlp": 1.02451658, + "epoch": 0.19576131068690816, + "flos": 16506699356160.0, + "grad_norm": 2.484460686803608, + "language_loss": 0.70303249, + "learning_rate": 3.718624450942688e-06, + "loss": 0.72512579, + "num_input_tokens_seen": 70281830, + "router_z_loss_clip": 1.01708984, + "router_z_loss_mlp": 0.1763916, + "step": 3256, + "time_per_iteration": 2.4038217067718506 + }, + { + "auxiliary_loss_clip": 0.01160175, + "auxiliary_loss_mlp": 0.01042467, + "balance_loss_clip": 1.06024814, + "balance_loss_mlp": 1.0245502, + "epoch": 0.19582143393957613, + "flos": 14719676797440.0, + "grad_norm": 2.422417297620508, + "language_loss": 0.80426466, + "learning_rate": 3.718425227649987e-06, + "loss": 0.82629114, + "num_input_tokens_seen": 70297420, + "router_z_loss_clip": 0.99951172, + "router_z_loss_mlp": 0.17907715, + "step": 3257, + "time_per_iteration": 2.455838441848755 + }, + { + "auxiliary_loss_clip": 0.01160061, + "auxiliary_loss_mlp": 0.01051221, + "balance_loss_clip": 1.05882978, + "balance_loss_mlp": 1.03256452, + "epoch": 0.1958815571922441, + "flos": 24425504737920.0, + "grad_norm": 1.7451739680083116, + "language_loss": 0.75222373, + "learning_rate": 3.7182259391944292e-06, + "loss": 0.77433652, + "num_input_tokens_seen": 70319210, + "router_z_loss_clip": 1.01123047, + "router_z_loss_mlp": 0.18652344, + "step": 3258, + "time_per_iteration": 2.4920544624328613 + }, + { + "auxiliary_loss_clip": 0.01160653, + "auxiliary_loss_mlp": 0.01041027, + "balance_loss_clip": 1.05808342, + "balance_loss_mlp": 1.0232048, + "epoch": 0.19594168044491206, + "flos": 24900279730560.0, + "grad_norm": 1.7538122064972197, + "language_loss": 0.74512661, + "learning_rate": 3.7180265855835714e-06, + "loss": 0.76714337, + "num_input_tokens_seen": 70339045, + "router_z_loss_clip": 1.02587891, + "router_z_loss_mlp": 0.17810059, + "step": 3259, + "time_per_iteration": 2.5974087715148926 + }, + { + "auxiliary_loss_clip": 0.01160829, + "auxiliary_loss_mlp": 0.01046324, + "balance_loss_clip": 1.05725217, + "balance_loss_mlp": 1.02514005, + "epoch": 0.19600180369758005, + "flos": 12057008486400.0, + "grad_norm": 2.243291004831082, + "language_loss": 0.77434731, + "learning_rate": 3.7178271668249735e-06, + "loss": 0.79641885, + "num_input_tokens_seen": 70356505, + "router_z_loss_clip": 1.03466797, + "router_z_loss_mlp": 0.21166992, + "step": 3260, + "time_per_iteration": 2.4136767387390137 + }, + { + "auxiliary_loss_clip": 0.01161413, + "auxiliary_loss_mlp": 0.01045064, + "balance_loss_clip": 1.06027257, + "balance_loss_mlp": 1.02718306, + "epoch": 0.19606192695024802, + "flos": 20850202644480.0, + "grad_norm": 2.296034476933425, + "language_loss": 0.81678677, + "learning_rate": 3.7176276829261975e-06, + "loss": 0.83885157, + "num_input_tokens_seen": 70375410, + "router_z_loss_clip": 1.01171875, + "router_z_loss_mlp": 0.17871094, + "step": 3261, + "time_per_iteration": 2.4806432723999023 + }, + { + "auxiliary_loss_clip": 0.01160584, + "auxiliary_loss_mlp": 0.01047738, + "balance_loss_clip": 1.06065273, + "balance_loss_mlp": 1.02734149, + "epoch": 0.19612205020291598, + "flos": 28475509996800.0, + "grad_norm": 1.7908442459499168, + "language_loss": 0.77185214, + "learning_rate": 3.717428133894807e-06, + "loss": 0.79393536, + "num_input_tokens_seen": 70396315, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 0.20397949, + "step": 3262, + "time_per_iteration": 2.526193618774414 + }, + { + "auxiliary_loss_clip": 0.01160759, + "auxiliary_loss_mlp": 0.01046194, + "balance_loss_clip": 1.06133318, + "balance_loss_mlp": 1.02945721, + "epoch": 0.19618217345558395, + "flos": 25556618995200.0, + "grad_norm": 1.6273736835992856, + "language_loss": 0.86755776, + "learning_rate": 3.71722851973837e-06, + "loss": 0.88962734, + "num_input_tokens_seen": 70417945, + "router_z_loss_clip": 0.99511719, + "router_z_loss_mlp": 0.16760254, + "step": 3263, + "time_per_iteration": 2.646919012069702 + }, + { + "auxiliary_loss_clip": 0.01155995, + "auxiliary_loss_mlp": 0.01044956, + "balance_loss_clip": 1.056252, + "balance_loss_mlp": 1.02748036, + "epoch": 0.1962422967082519, + "flos": 25264413855360.0, + "grad_norm": 1.8540448762463444, + "language_loss": 0.73773038, + "learning_rate": 3.717028840464455e-06, + "loss": 0.75974, + "num_input_tokens_seen": 70438690, + "router_z_loss_clip": 0.99658203, + "router_z_loss_mlp": 0.17480469, + "step": 3264, + "time_per_iteration": 2.460317373275757 + }, + { + "auxiliary_loss_clip": 0.01158031, + "auxiliary_loss_mlp": 0.01048861, + "balance_loss_clip": 1.0595845, + "balance_loss_mlp": 1.03202856, + "epoch": 0.19630241996091988, + "flos": 18807352444800.0, + "grad_norm": 2.1900736643831364, + "language_loss": 0.7868799, + "learning_rate": 3.7168290960806344e-06, + "loss": 0.80894876, + "num_input_tokens_seen": 70455385, + "router_z_loss_clip": 0.98486328, + "router_z_loss_mlp": 0.16845703, + "step": 3265, + "time_per_iteration": 2.472031831741333 + }, + { + "auxiliary_loss_clip": 0.01072458, + "auxiliary_loss_mlp": 0.01016382, + "balance_loss_clip": 1.03837287, + "balance_loss_mlp": 1.01417029, + "epoch": 0.19636254321358784, + "flos": 62321137896960.0, + "grad_norm": 0.7835421089004396, + "language_loss": 0.53392619, + "learning_rate": 3.716629286594483e-06, + "loss": 0.55481452, + "num_input_tokens_seen": 70514280, + "router_z_loss_clip": 0.34082031, + "router_z_loss_mlp": 0.02212524, + "step": 3266, + "time_per_iteration": 3.1067631244659424 + }, + { + "auxiliary_loss_clip": 0.01155904, + "auxiliary_loss_mlp": 0.01046162, + "balance_loss_clip": 1.05254519, + "balance_loss_mlp": 1.02631426, + "epoch": 0.19642266646625584, + "flos": 21069329564160.0, + "grad_norm": 2.191000701489342, + "language_loss": 0.80260432, + "learning_rate": 3.7164294120135767e-06, + "loss": 0.82462502, + "num_input_tokens_seen": 70531800, + "router_z_loss_clip": 1.03320312, + "router_z_loss_mlp": 0.1986084, + "step": 3267, + "time_per_iteration": 2.4713027477264404 + }, + { + "auxiliary_loss_clip": 0.01152636, + "auxiliary_loss_mlp": 0.01046704, + "balance_loss_clip": 1.05508935, + "balance_loss_mlp": 1.02941823, + "epoch": 0.1964827897189238, + "flos": 14538651229440.0, + "grad_norm": 2.687552425862218, + "language_loss": 0.86911714, + "learning_rate": 3.7162294723454953e-06, + "loss": 0.89111054, + "num_input_tokens_seen": 70550615, + "router_z_loss_clip": 0.9765625, + "router_z_loss_mlp": 0.17285156, + "step": 3268, + "time_per_iteration": 2.4796359539031982 + }, + { + "auxiliary_loss_clip": 0.01149455, + "auxiliary_loss_mlp": 0.01040927, + "balance_loss_clip": 1.05281901, + "balance_loss_mlp": 1.02342677, + "epoch": 0.19654291297159177, + "flos": 19244636616960.0, + "grad_norm": 2.716991676561058, + "language_loss": 0.68650877, + "learning_rate": 3.7160294675978197e-06, + "loss": 0.70841259, + "num_input_tokens_seen": 70568690, + "router_z_loss_clip": 0.96484375, + "router_z_loss_mlp": 0.17492676, + "step": 3269, + "time_per_iteration": 2.502263307571411 + }, + { + "auxiliary_loss_clip": 0.01160331, + "auxiliary_loss_mlp": 0.01057387, + "balance_loss_clip": 1.05845881, + "balance_loss_mlp": 1.03874254, + "epoch": 0.19660303622425973, + "flos": 25775710001280.0, + "grad_norm": 3.0456681684902187, + "language_loss": 0.8086338, + "learning_rate": 3.715829397778135e-06, + "loss": 0.8308109, + "num_input_tokens_seen": 70588665, + "router_z_loss_clip": 1.01806641, + "router_z_loss_mlp": 0.18652344, + "step": 3270, + "time_per_iteration": 2.496384382247925 + }, + { + "auxiliary_loss_clip": 0.01148638, + "auxiliary_loss_mlp": 0.01040129, + "balance_loss_clip": 1.05075502, + "balance_loss_mlp": 1.02349889, + "epoch": 0.1966631594769277, + "flos": 20595093275520.0, + "grad_norm": 3.048652069854845, + "language_loss": 0.83954531, + "learning_rate": 3.715629262894028e-06, + "loss": 0.86143303, + "num_input_tokens_seen": 70606900, + "router_z_loss_clip": 0.97900391, + "router_z_loss_mlp": 0.1661377, + "step": 3271, + "time_per_iteration": 2.479567766189575 + }, + { + "auxiliary_loss_clip": 0.01151087, + "auxiliary_loss_mlp": 0.01054393, + "balance_loss_clip": 1.05458307, + "balance_loss_mlp": 1.03597486, + "epoch": 0.19672328272959566, + "flos": 23623188600960.0, + "grad_norm": 4.937199174303627, + "language_loss": 0.80499381, + "learning_rate": 3.715429062953087e-06, + "loss": 0.8270486, + "num_input_tokens_seen": 70625955, + "router_z_loss_clip": 0.96435547, + "router_z_loss_mlp": 0.1842041, + "step": 3272, + "time_per_iteration": 2.454641103744507 + }, + { + "auxiliary_loss_clip": 0.01159841, + "auxiliary_loss_mlp": 0.01046024, + "balance_loss_clip": 1.05917668, + "balance_loss_mlp": 1.02763057, + "epoch": 0.19678340598226365, + "flos": 23110922787840.0, + "grad_norm": 2.3428310036178908, + "language_loss": 0.81132317, + "learning_rate": 3.7152287979629043e-06, + "loss": 0.83338189, + "num_input_tokens_seen": 70646090, + "router_z_loss_clip": 1.00585938, + "router_z_loss_mlp": 0.18383789, + "step": 3273, + "time_per_iteration": 2.510143518447876 + }, + { + "auxiliary_loss_clip": 0.01162219, + "auxiliary_loss_mlp": 0.0105098, + "balance_loss_clip": 1.05994189, + "balance_loss_mlp": 1.03346789, + "epoch": 0.19684352923493162, + "flos": 24534852716160.0, + "grad_norm": 1.7742733018773371, + "language_loss": 0.77809477, + "learning_rate": 3.7150284679310735e-06, + "loss": 0.80022675, + "num_input_tokens_seen": 70666065, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.17504883, + "step": 3274, + "time_per_iteration": 2.46297025680542 + }, + { + "auxiliary_loss_clip": 0.01160042, + "auxiliary_loss_mlp": 0.01045102, + "balance_loss_clip": 1.05966663, + "balance_loss_mlp": 1.02695882, + "epoch": 0.19690365248759958, + "flos": 21796448578560.0, + "grad_norm": 4.103301166017665, + "language_loss": 0.8137536, + "learning_rate": 3.7148280728651914e-06, + "loss": 0.83580506, + "num_input_tokens_seen": 70681580, + "router_z_loss_clip": 1.00292969, + "router_z_loss_mlp": 0.18151855, + "step": 3275, + "time_per_iteration": 2.510551691055298 + }, + { + "auxiliary_loss_clip": 0.01160685, + "auxiliary_loss_mlp": 0.01046626, + "balance_loss_clip": 1.05881143, + "balance_loss_mlp": 1.02818418, + "epoch": 0.19696377574026755, + "flos": 19056643810560.0, + "grad_norm": 2.412845004202286, + "language_loss": 0.80988395, + "learning_rate": 3.7146276127728563e-06, + "loss": 0.8319571, + "num_input_tokens_seen": 70697745, + "router_z_loss_clip": 1.01953125, + "router_z_loss_mlp": 0.18432617, + "step": 3276, + "time_per_iteration": 2.441861152648926 + }, + { + "auxiliary_loss_clip": 0.01155243, + "auxiliary_loss_mlp": 0.01036025, + "balance_loss_clip": 1.05667901, + "balance_loss_mlp": 1.01859617, + "epoch": 0.19702389899293551, + "flos": 22820656982400.0, + "grad_norm": 2.18540654407471, + "language_loss": 0.89306659, + "learning_rate": 3.7144270876616713e-06, + "loss": 0.91497922, + "num_input_tokens_seen": 70715110, + "router_z_loss_clip": 0.98486328, + "router_z_loss_mlp": 0.17419434, + "step": 3277, + "time_per_iteration": 2.476245641708374 + }, + { + "auxiliary_loss_clip": 0.01165769, + "auxiliary_loss_mlp": 0.01050485, + "balance_loss_clip": 1.06350255, + "balance_loss_mlp": 1.03097034, + "epoch": 0.19708402224560348, + "flos": 22894237992960.0, + "grad_norm": 2.1546855743304842, + "language_loss": 0.62215209, + "learning_rate": 3.714226497539239e-06, + "loss": 0.64431465, + "num_input_tokens_seen": 70734715, + "router_z_loss_clip": 1.02294922, + "router_z_loss_mlp": 0.19519043, + "step": 3278, + "time_per_iteration": 2.496976137161255 + }, + { + "auxiliary_loss_clip": 0.01168725, + "auxiliary_loss_mlp": 0.0104822, + "balance_loss_clip": 1.06501293, + "balance_loss_mlp": 1.03011227, + "epoch": 0.19714414549827144, + "flos": 25662519267840.0, + "grad_norm": 2.137122930880953, + "language_loss": 0.74024105, + "learning_rate": 3.714025842413166e-06, + "loss": 0.76241052, + "num_input_tokens_seen": 70752650, + "router_z_loss_clip": 1.03808594, + "router_z_loss_mlp": 0.1809082, + "step": 3279, + "time_per_iteration": 2.4995245933532715 + }, + { + "auxiliary_loss_clip": 0.01164104, + "auxiliary_loss_mlp": 0.01049364, + "balance_loss_clip": 1.06028533, + "balance_loss_mlp": 1.03173316, + "epoch": 0.19720426875093944, + "flos": 23915824704000.0, + "grad_norm": 1.6199471698577141, + "language_loss": 0.82505894, + "learning_rate": 3.713825122291061e-06, + "loss": 0.8471936, + "num_input_tokens_seen": 70772365, + "router_z_loss_clip": 1.03710938, + "router_z_loss_mlp": 0.1763916, + "step": 3280, + "time_per_iteration": 4.046282768249512 + }, + { + "auxiliary_loss_clip": 0.01175071, + "auxiliary_loss_mlp": 0.01043378, + "balance_loss_clip": 1.07214355, + "balance_loss_mlp": 1.0265218, + "epoch": 0.1972643920036074, + "flos": 13881952828800.0, + "grad_norm": 2.462372119855318, + "language_loss": 0.77772892, + "learning_rate": 3.713624337180536e-06, + "loss": 0.79991335, + "num_input_tokens_seen": 70790340, + "router_z_loss_clip": 1.02978516, + "router_z_loss_mlp": 0.16845703, + "step": 3281, + "time_per_iteration": 2.4905409812927246 + }, + { + "auxiliary_loss_clip": 0.01156814, + "auxiliary_loss_mlp": 0.01041847, + "balance_loss_clip": 1.06071889, + "balance_loss_mlp": 1.02567065, + "epoch": 0.19732451525627537, + "flos": 19863592801920.0, + "grad_norm": 2.0905477399267958, + "language_loss": 0.795349, + "learning_rate": 3.7134234870892045e-06, + "loss": 0.81733567, + "num_input_tokens_seen": 70809295, + "router_z_loss_clip": 0.9609375, + "router_z_loss_mlp": 0.16162109, + "step": 3282, + "time_per_iteration": 2.507620334625244 + }, + { + "auxiliary_loss_clip": 0.01159919, + "auxiliary_loss_mlp": 0.01050629, + "balance_loss_clip": 1.05696678, + "balance_loss_mlp": 1.03144789, + "epoch": 0.19738463850894333, + "flos": 24973429777920.0, + "grad_norm": 2.245816439196547, + "language_loss": 0.72351867, + "learning_rate": 3.7132225720246826e-06, + "loss": 0.74562413, + "num_input_tokens_seen": 70828765, + "router_z_loss_clip": 1.03027344, + "router_z_loss_mlp": 0.19189453, + "step": 3283, + "time_per_iteration": 2.4804296493530273 + }, + { + "auxiliary_loss_clip": 0.01159785, + "auxiliary_loss_mlp": 0.01042039, + "balance_loss_clip": 1.05758119, + "balance_loss_mlp": 1.02546906, + "epoch": 0.1974447617616113, + "flos": 18368883123840.0, + "grad_norm": 1.9737757409671892, + "language_loss": 0.78728044, + "learning_rate": 3.7130215919945886e-06, + "loss": 0.80929869, + "num_input_tokens_seen": 70846805, + "router_z_loss_clip": 1.02148438, + "router_z_loss_mlp": 0.16577148, + "step": 3284, + "time_per_iteration": 2.479580879211426 + }, + { + "auxiliary_loss_clip": 0.01163305, + "auxiliary_loss_mlp": 0.01043835, + "balance_loss_clip": 1.05804539, + "balance_loss_mlp": 1.02498841, + "epoch": 0.19750488501427926, + "flos": 22892945103360.0, + "grad_norm": 1.9680456728115352, + "language_loss": 0.8651253, + "learning_rate": 3.7128205470065445e-06, + "loss": 0.88719666, + "num_input_tokens_seen": 70863805, + "router_z_loss_clip": 1.05126953, + "router_z_loss_mlp": 0.18835449, + "step": 3285, + "time_per_iteration": 2.4542267322540283 + }, + { + "auxiliary_loss_clip": 0.01162393, + "auxiliary_loss_mlp": 0.01046019, + "balance_loss_clip": 1.06321168, + "balance_loss_mlp": 1.02837586, + "epoch": 0.19756500826694723, + "flos": 21871502046720.0, + "grad_norm": 2.2072555850541304, + "language_loss": 0.88382769, + "learning_rate": 3.712619437068174e-06, + "loss": 0.9059118, + "num_input_tokens_seen": 70882660, + "router_z_loss_clip": 0.99072266, + "router_z_loss_mlp": 0.1763916, + "step": 3286, + "time_per_iteration": 3.945709466934204 + }, + { + "auxiliary_loss_clip": 0.01163813, + "auxiliary_loss_mlp": 0.01046334, + "balance_loss_clip": 1.06219935, + "balance_loss_mlp": 1.02656949, + "epoch": 0.19762513151961522, + "flos": 15158972131200.0, + "grad_norm": 2.0664975296488812, + "language_loss": 0.7813344, + "learning_rate": 3.712418262187102e-06, + "loss": 0.8034358, + "num_input_tokens_seen": 70898765, + "router_z_loss_clip": 1.01708984, + "router_z_loss_mlp": 0.19763184, + "step": 3287, + "time_per_iteration": 2.4377782344818115 + }, + { + "auxiliary_loss_clip": 0.01165057, + "auxiliary_loss_mlp": 0.01045719, + "balance_loss_clip": 1.06083584, + "balance_loss_mlp": 1.02708626, + "epoch": 0.1976852547722832, + "flos": 16979175878400.0, + "grad_norm": 2.278440075258507, + "language_loss": 0.81504083, + "learning_rate": 3.7122170223709584e-06, + "loss": 0.83714855, + "num_input_tokens_seen": 70916370, + "router_z_loss_clip": 1.04296875, + "router_z_loss_mlp": 0.18640137, + "step": 3288, + "time_per_iteration": 2.4802563190460205 + }, + { + "auxiliary_loss_clip": 0.011564, + "auxiliary_loss_mlp": 0.01049849, + "balance_loss_clip": 1.06093526, + "balance_loss_mlp": 1.03096652, + "epoch": 0.19774537802495115, + "flos": 20302924049280.0, + "grad_norm": 1.7725939524787127, + "language_loss": 0.72982246, + "learning_rate": 3.712015717627374e-06, + "loss": 0.75188494, + "num_input_tokens_seen": 70934870, + "router_z_loss_clip": 0.95458984, + "router_z_loss_mlp": 0.18884277, + "step": 3289, + "time_per_iteration": 3.8942697048187256 + }, + { + "auxiliary_loss_clip": 0.01158751, + "auxiliary_loss_mlp": 0.01049385, + "balance_loss_clip": 1.0574491, + "balance_loss_mlp": 1.03120589, + "epoch": 0.19780550127761912, + "flos": 27235478724480.0, + "grad_norm": 2.1270713566182664, + "language_loss": 0.79639328, + "learning_rate": 3.7118143479639813e-06, + "loss": 0.81847465, + "num_input_tokens_seen": 70955140, + "router_z_loss_clip": 1.01513672, + "router_z_loss_mlp": 0.18188477, + "step": 3290, + "time_per_iteration": 4.089313745498657 + }, + { + "auxiliary_loss_clip": 0.01093017, + "auxiliary_loss_mlp": 0.010167, + "balance_loss_clip": 1.05836248, + "balance_loss_mlp": 1.01436627, + "epoch": 0.19786562453028708, + "flos": 63550972684800.0, + "grad_norm": 0.8985771837903299, + "language_loss": 0.60310113, + "learning_rate": 3.711612913388418e-06, + "loss": 0.62419832, + "num_input_tokens_seen": 71012005, + "router_z_loss_clip": 0.34667969, + "router_z_loss_mlp": 0.02334595, + "step": 3291, + "time_per_iteration": 3.1046295166015625 + }, + { + "auxiliary_loss_clip": 0.01164737, + "auxiliary_loss_mlp": 0.0104236, + "balance_loss_clip": 1.06012976, + "balance_loss_mlp": 1.02357244, + "epoch": 0.19792574778295505, + "flos": 26286647011200.0, + "grad_norm": 1.8395800907783777, + "language_loss": 0.81065279, + "learning_rate": 3.7114114139083204e-06, + "loss": 0.8327238, + "num_input_tokens_seen": 71031140, + "router_z_loss_clip": 1.04736328, + "router_z_loss_mlp": 0.18786621, + "step": 3292, + "time_per_iteration": 2.5035791397094727 + }, + { + "auxiliary_loss_clip": 0.01158794, + "auxiliary_loss_mlp": 0.01044931, + "balance_loss_clip": 1.06169367, + "balance_loss_mlp": 1.02784801, + "epoch": 0.19798587103562304, + "flos": 19938107566080.0, + "grad_norm": 1.8352624378804008, + "language_loss": 0.81688708, + "learning_rate": 3.7112098495313313e-06, + "loss": 0.83892435, + "num_input_tokens_seen": 71050250, + "router_z_loss_clip": 0.97021484, + "router_z_loss_mlp": 0.17077637, + "step": 3293, + "time_per_iteration": 2.4798266887664795 + }, + { + "auxiliary_loss_clip": 0.01170908, + "auxiliary_loss_mlp": 0.01049776, + "balance_loss_clip": 1.06433654, + "balance_loss_mlp": 1.03071451, + "epoch": 0.198045994288291, + "flos": 20120282369280.0, + "grad_norm": 2.0900496106512763, + "language_loss": 0.60528296, + "learning_rate": 3.711008220265093e-06, + "loss": 0.62748981, + "num_input_tokens_seen": 71068665, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.19067383, + "step": 3294, + "time_per_iteration": 2.517575740814209 + }, + { + "auxiliary_loss_clip": 0.01156051, + "auxiliary_loss_mlp": 0.01042828, + "balance_loss_clip": 1.05664241, + "balance_loss_mlp": 1.02642441, + "epoch": 0.19810611754095897, + "flos": 17967653228160.0, + "grad_norm": 2.286607151491656, + "language_loss": 0.87259692, + "learning_rate": 3.710806526117251e-06, + "loss": 0.89458573, + "num_input_tokens_seen": 71085320, + "router_z_loss_clip": 0.99365234, + "router_z_loss_mlp": 0.1640625, + "step": 3295, + "time_per_iteration": 2.473865032196045 + }, + { + "auxiliary_loss_clip": 0.01157108, + "auxiliary_loss_mlp": 0.01051233, + "balance_loss_clip": 1.05833602, + "balance_loss_mlp": 1.0346508, + "epoch": 0.19816624079362694, + "flos": 15084996071040.0, + "grad_norm": 15.87419151164237, + "language_loss": 0.80604428, + "learning_rate": 3.7106047670954544e-06, + "loss": 0.82812768, + "num_input_tokens_seen": 71102020, + "router_z_loss_clip": 0.98632812, + "router_z_loss_mlp": 0.16589355, + "step": 3296, + "time_per_iteration": 2.481410503387451 + }, + { + "auxiliary_loss_clip": 0.01157367, + "auxiliary_loss_mlp": 0.01043756, + "balance_loss_clip": 1.05515575, + "balance_loss_mlp": 1.02450418, + "epoch": 0.1982263640462949, + "flos": 24900315644160.0, + "grad_norm": 2.6286114551339628, + "language_loss": 0.68205512, + "learning_rate": 3.710402943207354e-06, + "loss": 0.70406634, + "num_input_tokens_seen": 71123390, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.19250488, + "step": 3297, + "time_per_iteration": 2.620194673538208 + }, + { + "auxiliary_loss_clip": 0.01153963, + "auxiliary_loss_mlp": 0.01045699, + "balance_loss_clip": 1.05808616, + "balance_loss_mlp": 1.02678072, + "epoch": 0.19828648729896287, + "flos": 20376181837440.0, + "grad_norm": 1.7422341564683936, + "language_loss": 0.81238115, + "learning_rate": 3.7102010544606016e-06, + "loss": 0.83437777, + "num_input_tokens_seen": 71141800, + "router_z_loss_clip": 0.95947266, + "router_z_loss_mlp": 0.18908691, + "step": 3298, + "time_per_iteration": 2.500786542892456 + }, + { + "auxiliary_loss_clip": 0.01165398, + "auxiliary_loss_mlp": 0.01045907, + "balance_loss_clip": 1.06155229, + "balance_loss_mlp": 1.02593982, + "epoch": 0.19834661055163083, + "flos": 18880035615360.0, + "grad_norm": 2.1577311504599628, + "language_loss": 0.85355347, + "learning_rate": 3.7099991008628544e-06, + "loss": 0.8756665, + "num_input_tokens_seen": 71159505, + "router_z_loss_clip": 1.03857422, + "router_z_loss_mlp": 0.19995117, + "step": 3299, + "time_per_iteration": 2.4445674419403076 + }, + { + "auxiliary_loss_clip": 0.01075577, + "auxiliary_loss_mlp": 0.01017323, + "balance_loss_clip": 1.04242861, + "balance_loss_mlp": 1.0150907, + "epoch": 0.19840673380429882, + "flos": 60259184640000.0, + "grad_norm": 0.7695084557065534, + "language_loss": 0.53240496, + "learning_rate": 3.7097970824217706e-06, + "loss": 0.55333394, + "num_input_tokens_seen": 71223265, + "router_z_loss_clip": 0.33154297, + "router_z_loss_mlp": 0.02236938, + "step": 3300, + "time_per_iteration": 3.1045165061950684 + }, + { + "auxiliary_loss_clip": 0.01159914, + "auxiliary_loss_mlp": 0.01049291, + "balance_loss_clip": 1.05954587, + "balance_loss_mlp": 1.03098035, + "epoch": 0.1984668570569668, + "flos": 19902017376000.0, + "grad_norm": 1.8979307324816928, + "language_loss": 0.7368933, + "learning_rate": 3.7095949991450093e-06, + "loss": 0.75898534, + "num_input_tokens_seen": 71242385, + "router_z_loss_clip": 1.00341797, + "router_z_loss_mlp": 0.1829834, + "step": 3301, + "time_per_iteration": 2.5435149669647217 + }, + { + "auxiliary_loss_clip": 0.01167572, + "auxiliary_loss_mlp": 0.01036952, + "balance_loss_clip": 1.06691504, + "balance_loss_mlp": 1.02017951, + "epoch": 0.19852698030963475, + "flos": 15630766295040.0, + "grad_norm": 3.059017349502366, + "language_loss": 0.87815416, + "learning_rate": 3.709392851040235e-06, + "loss": 0.90019941, + "num_input_tokens_seen": 71258990, + "router_z_loss_clip": 1.00537109, + "router_z_loss_mlp": 0.16784668, + "step": 3302, + "time_per_iteration": 2.4876227378845215 + }, + { + "auxiliary_loss_clip": 0.01164244, + "auxiliary_loss_mlp": 0.01051526, + "balance_loss_clip": 1.06217551, + "balance_loss_mlp": 1.03385901, + "epoch": 0.19858710356230272, + "flos": 43143007311360.0, + "grad_norm": 3.1619653980844684, + "language_loss": 0.73517179, + "learning_rate": 3.709190638115111e-06, + "loss": 0.75732952, + "num_input_tokens_seen": 71282770, + "router_z_loss_clip": 1.01953125, + "router_z_loss_mlp": 0.17675781, + "step": 3303, + "time_per_iteration": 2.711080551147461 + }, + { + "auxiliary_loss_clip": 0.01156444, + "auxiliary_loss_mlp": 0.01040029, + "balance_loss_clip": 1.0586884, + "balance_loss_mlp": 1.02342272, + "epoch": 0.19864722681497068, + "flos": 35144084643840.0, + "grad_norm": 1.8150780780971332, + "language_loss": 0.75276923, + "learning_rate": 3.7089883603773084e-06, + "loss": 0.77473396, + "num_input_tokens_seen": 71301410, + "router_z_loss_clip": 0.97753906, + "router_z_loss_mlp": 0.16601562, + "step": 3304, + "time_per_iteration": 2.5541019439697266 + }, + { + "auxiliary_loss_clip": 0.01152831, + "auxiliary_loss_mlp": 0.01039827, + "balance_loss_clip": 1.05522943, + "balance_loss_mlp": 1.022542, + "epoch": 0.19870735006763865, + "flos": 19426200888960.0, + "grad_norm": 1.749491141091487, + "language_loss": 0.86263835, + "learning_rate": 3.7087860178344955e-06, + "loss": 0.88456488, + "num_input_tokens_seen": 71319670, + "router_z_loss_clip": 0.9765625, + "router_z_loss_mlp": 0.17285156, + "step": 3305, + "time_per_iteration": 2.490151882171631 + }, + { + "auxiliary_loss_clip": 0.0115745, + "auxiliary_loss_mlp": 0.01042497, + "balance_loss_clip": 1.05553508, + "balance_loss_mlp": 1.02550936, + "epoch": 0.19876747332030664, + "flos": 23547380947200.0, + "grad_norm": 1.5296150105257318, + "language_loss": 0.6823653, + "learning_rate": 3.7085836104943445e-06, + "loss": 0.70436478, + "num_input_tokens_seen": 71339850, + "router_z_loss_clip": 1.01904297, + "router_z_loss_mlp": 0.1697998, + "step": 3306, + "time_per_iteration": 2.5109188556671143 + }, + { + "auxiliary_loss_clip": 0.01158861, + "auxiliary_loss_mlp": 0.01035458, + "balance_loss_clip": 1.06098413, + "balance_loss_mlp": 1.02010345, + "epoch": 0.1988275965729746, + "flos": 19829406032640.0, + "grad_norm": 3.9412774102116095, + "language_loss": 0.76499259, + "learning_rate": 3.7083811383645332e-06, + "loss": 0.78693581, + "num_input_tokens_seen": 71359795, + "router_z_loss_clip": 0.97900391, + "router_z_loss_mlp": 0.15368652, + "step": 3307, + "time_per_iteration": 2.5056443214416504 + }, + { + "auxiliary_loss_clip": 0.011562, + "auxiliary_loss_mlp": 0.01046964, + "balance_loss_clip": 1.05886364, + "balance_loss_mlp": 1.02951157, + "epoch": 0.19888771982564257, + "flos": 23513625141120.0, + "grad_norm": 1.9036986013387325, + "language_loss": 0.75887167, + "learning_rate": 3.708178601452737e-06, + "loss": 0.78090334, + "num_input_tokens_seen": 71378885, + "router_z_loss_clip": 0.97265625, + "router_z_loss_mlp": 0.17456055, + "step": 3308, + "time_per_iteration": 2.5284388065338135 + }, + { + "auxiliary_loss_clip": 0.01152452, + "auxiliary_loss_mlp": 0.01038111, + "balance_loss_clip": 1.05394077, + "balance_loss_mlp": 1.02073002, + "epoch": 0.19894784307831054, + "flos": 18150510389760.0, + "grad_norm": 1.9293454891436943, + "language_loss": 0.76146042, + "learning_rate": 3.7079759997666374e-06, + "loss": 0.78336596, + "num_input_tokens_seen": 71397285, + "router_z_loss_clip": 0.98632812, + "router_z_loss_mlp": 0.17382812, + "step": 3309, + "time_per_iteration": 2.5618374347686768 + }, + { + "auxiliary_loss_clip": 0.01154291, + "auxiliary_loss_mlp": 0.01049846, + "balance_loss_clip": 1.05591929, + "balance_loss_mlp": 1.03110671, + "epoch": 0.1990079663309785, + "flos": 24276044246400.0, + "grad_norm": 1.5086893192326802, + "language_loss": 0.87911212, + "learning_rate": 3.707773333313917e-06, + "loss": 0.90115356, + "num_input_tokens_seen": 71415775, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.18737793, + "step": 3310, + "time_per_iteration": 2.4656713008880615 + }, + { + "auxiliary_loss_clip": 0.01151779, + "auxiliary_loss_mlp": 0.01041439, + "balance_loss_clip": 1.05374765, + "balance_loss_mlp": 1.02393925, + "epoch": 0.19906808958364647, + "flos": 34897666366080.0, + "grad_norm": 2.762728665259189, + "language_loss": 0.64322072, + "learning_rate": 3.70757060210226e-06, + "loss": 0.66515291, + "num_input_tokens_seen": 71437315, + "router_z_loss_clip": 0.97949219, + "router_z_loss_mlp": 0.17480469, + "step": 3311, + "time_per_iteration": 2.593506097793579 + }, + { + "auxiliary_loss_clip": 0.01156512, + "auxiliary_loss_mlp": 0.01041487, + "balance_loss_clip": 1.05596876, + "balance_loss_mlp": 1.02461934, + "epoch": 0.19912821283631443, + "flos": 24024885373440.0, + "grad_norm": 2.520505827599716, + "language_loss": 0.74142754, + "learning_rate": 3.707367806139355e-06, + "loss": 0.76340753, + "num_input_tokens_seen": 71456320, + "router_z_loss_clip": 1.00585938, + "router_z_loss_mlp": 0.16870117, + "step": 3312, + "time_per_iteration": 2.571169853210449 + }, + { + "auxiliary_loss_clip": 0.01161212, + "auxiliary_loss_mlp": 0.01041527, + "balance_loss_clip": 1.06267691, + "balance_loss_mlp": 1.02462304, + "epoch": 0.19918833608898243, + "flos": 19859031774720.0, + "grad_norm": 2.2209984529789937, + "language_loss": 0.83821046, + "learning_rate": 3.7071649454328915e-06, + "loss": 0.8602379, + "num_input_tokens_seen": 71475360, + "router_z_loss_clip": 0.98339844, + "router_z_loss_mlp": 0.16906738, + "step": 3313, + "time_per_iteration": 2.4372012615203857 + }, + { + "auxiliary_loss_clip": 0.01155965, + "auxiliary_loss_mlp": 0.01040511, + "balance_loss_clip": 1.05685568, + "balance_loss_mlp": 1.02295136, + "epoch": 0.1992484593416504, + "flos": 29095794984960.0, + "grad_norm": 2.49112687814157, + "language_loss": 0.81050038, + "learning_rate": 3.7069620199905625e-06, + "loss": 0.83246517, + "num_input_tokens_seen": 71496155, + "router_z_loss_clip": 0.9921875, + "router_z_loss_mlp": 0.17565918, + "step": 3314, + "time_per_iteration": 2.5295937061309814 + }, + { + "auxiliary_loss_clip": 0.01150096, + "auxiliary_loss_mlp": 0.01042915, + "balance_loss_clip": 1.05329442, + "balance_loss_mlp": 1.02644026, + "epoch": 0.19930858259431836, + "flos": 23295001011840.0, + "grad_norm": 1.5724904513843567, + "language_loss": 0.87540835, + "learning_rate": 3.7067590298200627e-06, + "loss": 0.89733851, + "num_input_tokens_seen": 71517295, + "router_z_loss_clip": 0.96777344, + "router_z_loss_mlp": 0.16479492, + "step": 3315, + "time_per_iteration": 2.5188474655151367 + }, + { + "auxiliary_loss_clip": 0.01160384, + "auxiliary_loss_mlp": 0.01047141, + "balance_loss_clip": 1.05968857, + "balance_loss_mlp": 1.0289979, + "epoch": 0.19936870584698632, + "flos": 25378825651200.0, + "grad_norm": 1.6020373731558804, + "language_loss": 0.70969254, + "learning_rate": 3.7065559749290892e-06, + "loss": 0.73176777, + "num_input_tokens_seen": 71540000, + "router_z_loss_clip": 1.0078125, + "router_z_loss_mlp": 0.18139648, + "step": 3316, + "time_per_iteration": 2.541616678237915 + }, + { + "auxiliary_loss_clip": 0.01074755, + "auxiliary_loss_mlp": 0.01016317, + "balance_loss_clip": 1.03924561, + "balance_loss_mlp": 1.01370907, + "epoch": 0.1994288290996543, + "flos": 62168053109760.0, + "grad_norm": 0.8372733822337641, + "language_loss": 0.66309369, + "learning_rate": 3.706352855325342e-06, + "loss": 0.68400443, + "num_input_tokens_seen": 71607880, + "router_z_loss_clip": 0.35498047, + "router_z_loss_mlp": 0.02609253, + "step": 3317, + "time_per_iteration": 3.185817241668701 + }, + { + "auxiliary_loss_clip": 0.01162835, + "auxiliary_loss_mlp": 0.01045909, + "balance_loss_clip": 1.05834699, + "balance_loss_mlp": 1.0273838, + "epoch": 0.19948895235232225, + "flos": 19025832919680.0, + "grad_norm": 2.456929318500539, + "language_loss": 0.74293303, + "learning_rate": 3.7061496710165233e-06, + "loss": 0.76502049, + "num_input_tokens_seen": 71625695, + "router_z_loss_clip": 1.04443359, + "router_z_loss_mlp": 0.18505859, + "step": 3318, + "time_per_iteration": 2.4418535232543945 + }, + { + "auxiliary_loss_clip": 0.01149808, + "auxiliary_loss_mlp": 0.01043356, + "balance_loss_clip": 1.05396998, + "balance_loss_mlp": 1.02565348, + "epoch": 0.19954907560499022, + "flos": 37815803182080.0, + "grad_norm": 3.2890086773616005, + "language_loss": 0.78568888, + "learning_rate": 3.7059464220103385e-06, + "loss": 0.80762047, + "num_input_tokens_seen": 71648520, + "router_z_loss_clip": 0.95947266, + "router_z_loss_mlp": 0.17700195, + "step": 3319, + "time_per_iteration": 2.644857406616211 + }, + { + "auxiliary_loss_clip": 0.01155091, + "auxiliary_loss_mlp": 0.01046911, + "balance_loss_clip": 1.05552018, + "balance_loss_mlp": 1.02633572, + "epoch": 0.1996091988576582, + "flos": 49565199594240.0, + "grad_norm": 2.663732623986451, + "language_loss": 0.75129247, + "learning_rate": 3.7057431083144945e-06, + "loss": 0.77331245, + "num_input_tokens_seen": 71672185, + "router_z_loss_clip": 0.99658203, + "router_z_loss_mlp": 0.20568848, + "step": 3320, + "time_per_iteration": 2.7172670364379883 + }, + { + "auxiliary_loss_clip": 0.01150134, + "auxiliary_loss_mlp": 0.01042993, + "balance_loss_clip": 1.05256248, + "balance_loss_mlp": 1.02469492, + "epoch": 0.19966932211032618, + "flos": 22635788659200.0, + "grad_norm": 1.5861977050024896, + "language_loss": 0.80232632, + "learning_rate": 3.705539729936701e-06, + "loss": 0.82425749, + "num_input_tokens_seen": 71692890, + "router_z_loss_clip": 0.97460938, + "router_z_loss_mlp": 0.1829834, + "step": 3321, + "time_per_iteration": 2.600606918334961 + }, + { + "auxiliary_loss_clip": 0.01073358, + "auxiliary_loss_mlp": 0.01019518, + "balance_loss_clip": 1.03812945, + "balance_loss_mlp": 1.01736307, + "epoch": 0.19972944536299414, + "flos": 54082117745280.0, + "grad_norm": 0.873508863305582, + "language_loss": 0.6511997, + "learning_rate": 3.7053362868846696e-06, + "loss": 0.6721285, + "num_input_tokens_seen": 71745815, + "router_z_loss_clip": 0.35205078, + "router_z_loss_mlp": 0.02157593, + "step": 3322, + "time_per_iteration": 2.863391399383545 + }, + { + "auxiliary_loss_clip": 0.01076423, + "auxiliary_loss_mlp": 0.01021416, + "balance_loss_clip": 1.03982306, + "balance_loss_mlp": 1.01906514, + "epoch": 0.1997895686156621, + "flos": 69355031817600.0, + "grad_norm": 0.7863109701039425, + "language_loss": 0.56995624, + "learning_rate": 3.7051327791661153e-06, + "loss": 0.59093457, + "num_input_tokens_seen": 71806915, + "router_z_loss_clip": 0.36621094, + "router_z_loss_mlp": 0.02349854, + "step": 3323, + "time_per_iteration": 4.605191707611084 + }, + { + "auxiliary_loss_clip": 0.01158921, + "auxiliary_loss_mlp": 0.01037304, + "balance_loss_clip": 1.06101394, + "balance_loss_mlp": 1.01980376, + "epoch": 0.19984969186833007, + "flos": 18552063507840.0, + "grad_norm": 2.374260706676508, + "language_loss": 0.80641288, + "learning_rate": 3.7049292067887555e-06, + "loss": 0.8283751, + "num_input_tokens_seen": 71824645, + "router_z_loss_clip": 0.97900391, + "router_z_loss_mlp": 0.17504883, + "step": 3324, + "time_per_iteration": 2.4636662006378174 + }, + { + "auxiliary_loss_clip": 0.01157765, + "auxiliary_loss_mlp": 0.01037946, + "balance_loss_clip": 1.05887127, + "balance_loss_mlp": 1.01949263, + "epoch": 0.19990981512099804, + "flos": 26429678968320.0, + "grad_norm": 2.0252241558209225, + "language_loss": 0.53945887, + "learning_rate": 3.7047255697603092e-06, + "loss": 0.56141603, + "num_input_tokens_seen": 71845125, + "router_z_loss_clip": 0.99023438, + "router_z_loss_mlp": 0.18457031, + "step": 3325, + "time_per_iteration": 2.5145621299743652 + }, + { + "auxiliary_loss_clip": 0.0115465, + "auxiliary_loss_mlp": 0.0103752, + "balance_loss_clip": 1.05672407, + "balance_loss_mlp": 1.0206517, + "epoch": 0.19996993837366603, + "flos": 16325997010560.0, + "grad_norm": 3.1923231979643885, + "language_loss": 0.85896039, + "learning_rate": 3.7045218680884984e-06, + "loss": 0.88088208, + "num_input_tokens_seen": 71863500, + "router_z_loss_clip": 0.97802734, + "router_z_loss_mlp": 0.16870117, + "step": 3326, + "time_per_iteration": 2.4564085006713867 + }, + { + "auxiliary_loss_clip": 0.01158259, + "auxiliary_loss_mlp": 0.01044108, + "balance_loss_clip": 1.06141067, + "balance_loss_mlp": 1.02753758, + "epoch": 0.200030061626334, + "flos": 20844169159680.0, + "grad_norm": 2.3298471020518545, + "language_loss": 0.72137177, + "learning_rate": 3.7043181017810476e-06, + "loss": 0.74339545, + "num_input_tokens_seen": 71881845, + "router_z_loss_clip": 0.96777344, + "router_z_loss_mlp": 0.16577148, + "step": 3327, + "time_per_iteration": 2.467495918273926 + }, + { + "auxiliary_loss_clip": 0.01154399, + "auxiliary_loss_mlp": 0.0103962, + "balance_loss_clip": 1.0535897, + "balance_loss_mlp": 1.0201174, + "epoch": 0.20009018487900196, + "flos": 23762629198080.0, + "grad_norm": 1.7614345334378059, + "language_loss": 0.76747608, + "learning_rate": 3.7041142708456833e-06, + "loss": 0.78941631, + "num_input_tokens_seen": 71900940, + "router_z_loss_clip": 1.00683594, + "router_z_loss_mlp": 0.19506836, + "step": 3328, + "time_per_iteration": 2.4990897178649902 + }, + { + "auxiliary_loss_clip": 0.01145973, + "auxiliary_loss_mlp": 0.01040779, + "balance_loss_clip": 1.05228484, + "balance_loss_mlp": 1.02313638, + "epoch": 0.20015030813166992, + "flos": 28111555440000.0, + "grad_norm": 1.9373518450867384, + "language_loss": 0.69394195, + "learning_rate": 3.7039103752901353e-06, + "loss": 0.71580946, + "num_input_tokens_seen": 71921925, + "router_z_loss_clip": 0.93652344, + "router_z_loss_mlp": 0.17651367, + "step": 3329, + "time_per_iteration": 2.639834403991699 + }, + { + "auxiliary_loss_clip": 0.01163301, + "auxiliary_loss_mlp": 0.01047388, + "balance_loss_clip": 1.06088901, + "balance_loss_mlp": 1.02769506, + "epoch": 0.2002104313843379, + "flos": 26067160955520.0, + "grad_norm": 1.7731651425786807, + "language_loss": 0.8149147, + "learning_rate": 3.7037064151221353e-06, + "loss": 0.83702159, + "num_input_tokens_seen": 71941855, + "router_z_loss_clip": 1.02392578, + "router_z_loss_mlp": 0.19702148, + "step": 3330, + "time_per_iteration": 4.034168004989624 + }, + { + "auxiliary_loss_clip": 0.0115208, + "auxiliary_loss_mlp": 0.01039202, + "balance_loss_clip": 1.05367613, + "balance_loss_mlp": 1.01984286, + "epoch": 0.20027055463700585, + "flos": 22966633854720.0, + "grad_norm": 3.767735494611521, + "language_loss": 0.76693994, + "learning_rate": 3.703502390349417e-06, + "loss": 0.78885281, + "num_input_tokens_seen": 71960915, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.19372559, + "step": 3331, + "time_per_iteration": 4.041628360748291 + }, + { + "auxiliary_loss_clip": 0.0116308, + "auxiliary_loss_mlp": 0.01045418, + "balance_loss_clip": 1.05793762, + "balance_loss_mlp": 1.02682114, + "epoch": 0.20033067788967382, + "flos": 17165660313600.0, + "grad_norm": 2.995963542326873, + "language_loss": 0.79340839, + "learning_rate": 3.7032983009797176e-06, + "loss": 0.8154934, + "num_input_tokens_seen": 71979220, + "router_z_loss_clip": 1.05224609, + "router_z_loss_mlp": 0.18579102, + "step": 3332, + "time_per_iteration": 2.4423606395721436 + }, + { + "auxiliary_loss_clip": 0.0108556, + "auxiliary_loss_mlp": 0.01024091, + "balance_loss_clip": 1.05000675, + "balance_loss_mlp": 1.0214175, + "epoch": 0.2003908011423418, + "flos": 60825566292480.0, + "grad_norm": 1.3885675568167193, + "language_loss": 0.61963248, + "learning_rate": 3.703094147020776e-06, + "loss": 0.64072895, + "num_input_tokens_seen": 72033950, + "router_z_loss_clip": 0.35449219, + "router_z_loss_mlp": 0.0267334, + "step": 3333, + "time_per_iteration": 4.41703987121582 + }, + { + "auxiliary_loss_clip": 0.01155911, + "auxiliary_loss_mlp": 0.01045259, + "balance_loss_clip": 1.05545306, + "balance_loss_mlp": 1.02725816, + "epoch": 0.20045092439500978, + "flos": 24206234163840.0, + "grad_norm": 2.31455765153217, + "language_loss": 0.81659007, + "learning_rate": 3.7028899284803334e-06, + "loss": 0.83860171, + "num_input_tokens_seen": 72051395, + "router_z_loss_clip": 1.00634766, + "router_z_loss_mlp": 0.17993164, + "step": 3334, + "time_per_iteration": 2.5153801441192627 + }, + { + "auxiliary_loss_clip": 0.01171572, + "auxiliary_loss_mlp": 0.01044621, + "balance_loss_clip": 1.06668115, + "balance_loss_mlp": 1.0251658, + "epoch": 0.20051104764767774, + "flos": 29387605075200.0, + "grad_norm": 2.8309607409995334, + "language_loss": 0.74067765, + "learning_rate": 3.702685645366134e-06, + "loss": 0.76283956, + "num_input_tokens_seen": 72071305, + "router_z_loss_clip": 1.04931641, + "router_z_loss_mlp": 0.19470215, + "step": 3335, + "time_per_iteration": 2.5463950634002686 + }, + { + "auxiliary_loss_clip": 0.01159929, + "auxiliary_loss_mlp": 0.01051967, + "balance_loss_clip": 1.05954909, + "balance_loss_mlp": 1.03472924, + "epoch": 0.2005711709003457, + "flos": 23513804709120.0, + "grad_norm": 2.01060370347702, + "language_loss": 0.79949206, + "learning_rate": 3.7024812976859243e-06, + "loss": 0.82161105, + "num_input_tokens_seen": 72090165, + "router_z_loss_clip": 1.00390625, + "router_z_loss_mlp": 0.17248535, + "step": 3336, + "time_per_iteration": 2.524510622024536 + }, + { + "auxiliary_loss_clip": 0.01170004, + "auxiliary_loss_mlp": 0.01045758, + "balance_loss_clip": 1.06360424, + "balance_loss_mlp": 1.02645755, + "epoch": 0.20063129415301367, + "flos": 22523388024960.0, + "grad_norm": 2.205455829098083, + "language_loss": 0.77775437, + "learning_rate": 3.7022768854474532e-06, + "loss": 0.79991198, + "num_input_tokens_seen": 72107210, + "router_z_loss_clip": 1.06103516, + "router_z_loss_mlp": 0.19287109, + "step": 3337, + "time_per_iteration": 2.440890312194824 + }, + { + "auxiliary_loss_clip": 0.01163213, + "auxiliary_loss_mlp": 0.01045639, + "balance_loss_clip": 1.06287384, + "balance_loss_mlp": 1.02700686, + "epoch": 0.20069141740568164, + "flos": 25958243940480.0, + "grad_norm": 2.9767695799805125, + "language_loss": 0.69029748, + "learning_rate": 3.7020724086584724e-06, + "loss": 0.71238601, + "num_input_tokens_seen": 72126315, + "router_z_loss_clip": 1.00439453, + "router_z_loss_mlp": 0.1862793, + "step": 3338, + "time_per_iteration": 2.4877820014953613 + }, + { + "auxiliary_loss_clip": 0.01159472, + "auxiliary_loss_mlp": 0.01053669, + "balance_loss_clip": 1.05964708, + "balance_loss_mlp": 1.03613353, + "epoch": 0.2007515406583496, + "flos": 24790608529920.0, + "grad_norm": 3.160452036847369, + "language_loss": 0.69276011, + "learning_rate": 3.701867867326735e-06, + "loss": 0.71489155, + "num_input_tokens_seen": 72146470, + "router_z_loss_clip": 0.99853516, + "router_z_loss_mlp": 0.17553711, + "step": 3339, + "time_per_iteration": 2.4981865882873535 + }, + { + "auxiliary_loss_clip": 0.01164535, + "auxiliary_loss_mlp": 0.01040952, + "balance_loss_clip": 1.05898273, + "balance_loss_mlp": 1.02327323, + "epoch": 0.2008116639110176, + "flos": 37925582123520.0, + "grad_norm": 3.6887468185172025, + "language_loss": 0.66343129, + "learning_rate": 3.7016632614599974e-06, + "loss": 0.68548614, + "num_input_tokens_seen": 72166600, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.17675781, + "step": 3340, + "time_per_iteration": 2.619109630584717 + }, + { + "auxiliary_loss_clip": 0.01168764, + "auxiliary_loss_mlp": 0.01046084, + "balance_loss_clip": 1.06081724, + "balance_loss_mlp": 1.02677226, + "epoch": 0.20087178716368556, + "flos": 20740531443840.0, + "grad_norm": 2.4919980797108745, + "language_loss": 0.74169815, + "learning_rate": 3.701458591066019e-06, + "loss": 0.76384664, + "num_input_tokens_seen": 72185160, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.1932373, + "step": 3341, + "time_per_iteration": 2.665916681289673 + }, + { + "auxiliary_loss_clip": 0.01162793, + "auxiliary_loss_mlp": 0.01046377, + "balance_loss_clip": 1.06390727, + "balance_loss_mlp": 1.02941322, + "epoch": 0.20093191041635353, + "flos": 23842279607040.0, + "grad_norm": 2.5881818285537044, + "language_loss": 0.7156437, + "learning_rate": 3.70125385615256e-06, + "loss": 0.73773539, + "num_input_tokens_seen": 72205160, + "router_z_loss_clip": 0.98828125, + "router_z_loss_mlp": 0.16931152, + "step": 3342, + "time_per_iteration": 2.560091972351074 + }, + { + "auxiliary_loss_clip": 0.01162214, + "auxiliary_loss_mlp": 0.01047723, + "balance_loss_clip": 1.06014633, + "balance_loss_mlp": 1.02947223, + "epoch": 0.2009920336690215, + "flos": 21792067119360.0, + "grad_norm": 1.8776344312214348, + "language_loss": 0.7250104, + "learning_rate": 3.701049056727384e-06, + "loss": 0.74710977, + "num_input_tokens_seen": 72223555, + "router_z_loss_clip": 1.02050781, + "router_z_loss_mlp": 0.18249512, + "step": 3343, + "time_per_iteration": 2.4657514095306396 + }, + { + "auxiliary_loss_clip": 0.0116333, + "auxiliary_loss_mlp": 0.01054255, + "balance_loss_clip": 1.0617888, + "balance_loss_mlp": 1.03532469, + "epoch": 0.20105215692168946, + "flos": 26359222440960.0, + "grad_norm": 2.274937207971017, + "language_loss": 0.81056142, + "learning_rate": 3.7008441927982574e-06, + "loss": 0.83273733, + "num_input_tokens_seen": 72242465, + "router_z_loss_clip": 1.01367188, + "router_z_loss_mlp": 0.18933105, + "step": 3344, + "time_per_iteration": 2.4842092990875244 + }, + { + "auxiliary_loss_clip": 0.01167352, + "auxiliary_loss_mlp": 0.0104337, + "balance_loss_clip": 1.06776893, + "balance_loss_mlp": 1.02581048, + "epoch": 0.20111228017435742, + "flos": 18807280617600.0, + "grad_norm": 2.2007716482163215, + "language_loss": 0.83245099, + "learning_rate": 3.700639264372948e-06, + "loss": 0.85455817, + "num_input_tokens_seen": 72260655, + "router_z_loss_clip": 0.99560547, + "router_z_loss_mlp": 0.17565918, + "step": 3345, + "time_per_iteration": 2.4295246601104736 + }, + { + "auxiliary_loss_clip": 0.01152077, + "auxiliary_loss_mlp": 0.01037291, + "balance_loss_clip": 1.05684233, + "balance_loss_mlp": 1.02119827, + "epoch": 0.20117240342702541, + "flos": 19975059682560.0, + "grad_norm": 1.655607245852601, + "language_loss": 0.67712498, + "learning_rate": 3.7004342714592283e-06, + "loss": 0.69901866, + "num_input_tokens_seen": 72279055, + "router_z_loss_clip": 0.95166016, + "router_z_loss_mlp": 0.16088867, + "step": 3346, + "time_per_iteration": 2.4810097217559814 + }, + { + "auxiliary_loss_clip": 0.01162946, + "auxiliary_loss_mlp": 0.01047774, + "balance_loss_clip": 1.06211209, + "balance_loss_mlp": 1.03036964, + "epoch": 0.20123252667969338, + "flos": 23142703345920.0, + "grad_norm": 2.2210246210302342, + "language_loss": 0.73423952, + "learning_rate": 3.70022921406487e-06, + "loss": 0.75634676, + "num_input_tokens_seen": 72297895, + "router_z_loss_clip": 1.00878906, + "router_z_loss_mlp": 0.17419434, + "step": 3347, + "time_per_iteration": 2.459527015686035 + }, + { + "auxiliary_loss_clip": 0.01161148, + "auxiliary_loss_mlp": 0.01048787, + "balance_loss_clip": 1.06101894, + "balance_loss_mlp": 1.03274155, + "epoch": 0.20129264993236134, + "flos": 23221671396480.0, + "grad_norm": 1.8908644113703814, + "language_loss": 0.86819756, + "learning_rate": 3.70002409219765e-06, + "loss": 0.89029694, + "num_input_tokens_seen": 72318385, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 0.16052246, + "step": 3348, + "time_per_iteration": 2.5108559131622314 + }, + { + "auxiliary_loss_clip": 0.01155802, + "auxiliary_loss_mlp": 0.01039289, + "balance_loss_clip": 1.05770898, + "balance_loss_mlp": 1.02025092, + "epoch": 0.2013527731850293, + "flos": 21871466133120.0, + "grad_norm": 1.8375619688127611, + "language_loss": 0.7108624, + "learning_rate": 3.699818905865346e-06, + "loss": 0.7328133, + "num_input_tokens_seen": 72338235, + "router_z_loss_clip": 0.98095703, + "router_z_loss_mlp": 0.19042969, + "step": 3349, + "time_per_iteration": 2.4931528568267822 + }, + { + "auxiliary_loss_clip": 0.01158925, + "auxiliary_loss_mlp": 0.01053135, + "balance_loss_clip": 1.05877209, + "balance_loss_mlp": 1.03220165, + "epoch": 0.20141289643769728, + "flos": 18040803275520.0, + "grad_norm": 1.7260096422401068, + "language_loss": 0.71630609, + "learning_rate": 3.6996136550757377e-06, + "loss": 0.73842669, + "num_input_tokens_seen": 72357825, + "router_z_loss_clip": 1.00146484, + "router_z_loss_mlp": 0.20947266, + "step": 3350, + "time_per_iteration": 2.49743390083313 + }, + { + "auxiliary_loss_clip": 0.01172568, + "auxiliary_loss_mlp": 0.01043883, + "balance_loss_clip": 1.06672931, + "balance_loss_mlp": 1.02466679, + "epoch": 0.20147301969036524, + "flos": 23951412103680.0, + "grad_norm": 2.4280372501561707, + "language_loss": 0.7554177, + "learning_rate": 3.69940833983661e-06, + "loss": 0.77758223, + "num_input_tokens_seen": 72376335, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.19226074, + "step": 3351, + "time_per_iteration": 2.487107276916504 + }, + { + "auxiliary_loss_clip": 0.01166074, + "auxiliary_loss_mlp": 0.01045223, + "balance_loss_clip": 1.06399369, + "balance_loss_mlp": 1.0257206, + "epoch": 0.2015331429430332, + "flos": 25588471380480.0, + "grad_norm": 1.7262834371381792, + "language_loss": 0.80544221, + "learning_rate": 3.699202960155748e-06, + "loss": 0.82755524, + "num_input_tokens_seen": 72395440, + "router_z_loss_clip": 1.02099609, + "router_z_loss_mlp": 0.19494629, + "step": 3352, + "time_per_iteration": 2.5139572620391846 + }, + { + "auxiliary_loss_clip": 0.0116, + "auxiliary_loss_mlp": 0.01048717, + "balance_loss_clip": 1.06177342, + "balance_loss_mlp": 1.0309782, + "epoch": 0.2015932661957012, + "flos": 26724972677760.0, + "grad_norm": 1.9735270070902162, + "language_loss": 0.80296862, + "learning_rate": 3.6989975160409396e-06, + "loss": 0.82505572, + "num_input_tokens_seen": 72414670, + "router_z_loss_clip": 0.98242188, + "router_z_loss_mlp": 0.17736816, + "step": 3353, + "time_per_iteration": 2.550574779510498 + }, + { + "auxiliary_loss_clip": 0.01156284, + "auxiliary_loss_mlp": 0.01044543, + "balance_loss_clip": 1.06080723, + "balance_loss_mlp": 1.02675724, + "epoch": 0.20165338944836916, + "flos": 15633136592640.0, + "grad_norm": 2.409544251280276, + "language_loss": 0.90286851, + "learning_rate": 3.6987920074999747e-06, + "loss": 0.92487681, + "num_input_tokens_seen": 72432210, + "router_z_loss_clip": 0.95507812, + "router_z_loss_mlp": 0.17797852, + "step": 3354, + "time_per_iteration": 2.4960525035858154 + }, + { + "auxiliary_loss_clip": 0.01114013, + "auxiliary_loss_mlp": 0.01009605, + "balance_loss_clip": 1.07813048, + "balance_loss_mlp": 1.00558758, + "epoch": 0.20171351270103713, + "flos": 57912529207680.0, + "grad_norm": 0.8393177081514914, + "language_loss": 0.55847514, + "learning_rate": 3.6985864345406465e-06, + "loss": 0.57971126, + "num_input_tokens_seen": 72489225, + "router_z_loss_clip": 0.35888672, + "router_z_loss_mlp": 0.04019165, + "step": 3355, + "time_per_iteration": 3.0930819511413574 + }, + { + "auxiliary_loss_clip": 0.01157987, + "auxiliary_loss_mlp": 0.0104585, + "balance_loss_clip": 1.06233001, + "balance_loss_mlp": 1.02885079, + "epoch": 0.2017736359537051, + "flos": 20814363849600.0, + "grad_norm": 1.8653308082863127, + "language_loss": 0.84006822, + "learning_rate": 3.698380797170751e-06, + "loss": 0.86210656, + "num_input_tokens_seen": 72508715, + "router_z_loss_clip": 0.95751953, + "router_z_loss_mlp": 0.17004395, + "step": 3356, + "time_per_iteration": 2.666821241378784 + }, + { + "auxiliary_loss_clip": 0.01173759, + "auxiliary_loss_mlp": 0.01043646, + "balance_loss_clip": 1.06922448, + "balance_loss_mlp": 1.02348733, + "epoch": 0.20183375920637306, + "flos": 17092043389440.0, + "grad_norm": 26.76466986410674, + "language_loss": 0.69757533, + "learning_rate": 3.698175095398085e-06, + "loss": 0.71974945, + "num_input_tokens_seen": 72525135, + "router_z_loss_clip": 1.04541016, + "router_z_loss_mlp": 0.20141602, + "step": 3357, + "time_per_iteration": 2.4438469409942627 + }, + { + "auxiliary_loss_clip": 0.01169095, + "auxiliary_loss_mlp": 0.01040978, + "balance_loss_clip": 1.06642652, + "balance_loss_mlp": 1.02200007, + "epoch": 0.20189388245904102, + "flos": 18661339658880.0, + "grad_norm": 1.8172089121335928, + "language_loss": 0.72261906, + "learning_rate": 3.6979693292304493e-06, + "loss": 0.7447198, + "num_input_tokens_seen": 72543690, + "router_z_loss_clip": 1.02734375, + "router_z_loss_mlp": 0.18969727, + "step": 3358, + "time_per_iteration": 2.5170841217041016 + }, + { + "auxiliary_loss_clip": 0.0116204, + "auxiliary_loss_mlp": 0.01049185, + "balance_loss_clip": 1.06319082, + "balance_loss_mlp": 1.03234076, + "epoch": 0.20195400571170902, + "flos": 16797539779200.0, + "grad_norm": 2.1866827605679617, + "language_loss": 0.83277553, + "learning_rate": 3.6977634986756463e-06, + "loss": 0.85488784, + "num_input_tokens_seen": 72560725, + "router_z_loss_clip": 0.98925781, + "router_z_loss_mlp": 0.16833496, + "step": 3359, + "time_per_iteration": 2.4235801696777344 + }, + { + "auxiliary_loss_clip": 0.0108993, + "auxiliary_loss_mlp": 0.01011763, + "balance_loss_clip": 1.05327642, + "balance_loss_mlp": 1.00939107, + "epoch": 0.20201412896437698, + "flos": 67174716268800.0, + "grad_norm": 0.7855792761385297, + "language_loss": 0.58973128, + "learning_rate": 3.697557603741482e-06, + "loss": 0.61074817, + "num_input_tokens_seen": 72621940, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.02371216, + "step": 3360, + "time_per_iteration": 3.055143117904663 + }, + { + "auxiliary_loss_clip": 0.01160291, + "auxiliary_loss_mlp": 0.01046668, + "balance_loss_clip": 1.05957758, + "balance_loss_mlp": 1.02739203, + "epoch": 0.20207425221704495, + "flos": 21325013550720.0, + "grad_norm": 29.14854319884494, + "language_loss": 0.62387836, + "learning_rate": 3.697351644435763e-06, + "loss": 0.64594787, + "num_input_tokens_seen": 72639135, + "router_z_loss_clip": 1.00683594, + "router_z_loss_mlp": 0.19262695, + "step": 3361, + "time_per_iteration": 2.4447336196899414 + }, + { + "auxiliary_loss_clip": 0.01160561, + "auxiliary_loss_mlp": 0.01055953, + "balance_loss_clip": 1.06240511, + "balance_loss_mlp": 1.03872705, + "epoch": 0.2021343754697129, + "flos": 22527158952960.0, + "grad_norm": 3.304239580060446, + "language_loss": 0.757514, + "learning_rate": 3.6971456207662993e-06, + "loss": 0.77967912, + "num_input_tokens_seen": 72658525, + "router_z_loss_clip": 0.98193359, + "router_z_loss_mlp": 0.17224121, + "step": 3362, + "time_per_iteration": 2.528531551361084 + }, + { + "auxiliary_loss_clip": 0.0115833, + "auxiliary_loss_mlp": 0.01041988, + "balance_loss_clip": 1.05935752, + "balance_loss_mlp": 1.02445281, + "epoch": 0.20219449872238088, + "flos": 19062785036160.0, + "grad_norm": 6.773860976668842, + "language_loss": 0.76407295, + "learning_rate": 3.6969395327409035e-06, + "loss": 0.78607613, + "num_input_tokens_seen": 72678085, + "router_z_loss_clip": 0.98925781, + "router_z_loss_mlp": 0.17529297, + "step": 3363, + "time_per_iteration": 2.4838485717773438 + }, + { + "auxiliary_loss_clip": 0.01155701, + "auxiliary_loss_mlp": 0.01044134, + "balance_loss_clip": 1.05720508, + "balance_loss_mlp": 1.02756393, + "epoch": 0.20225462197504884, + "flos": 24717027519360.0, + "grad_norm": 2.179183935570395, + "language_loss": 0.74934638, + "learning_rate": 3.696733380367391e-06, + "loss": 0.77134478, + "num_input_tokens_seen": 72698695, + "router_z_loss_clip": 0.98535156, + "router_z_loss_mlp": 0.16577148, + "step": 3364, + "time_per_iteration": 2.513310670852661 + }, + { + "auxiliary_loss_clip": 0.01158362, + "auxiliary_loss_mlp": 0.01041844, + "balance_loss_clip": 1.05780959, + "balance_loss_mlp": 1.02250838, + "epoch": 0.2023147452277168, + "flos": 22018304931840.0, + "grad_norm": 6.628028816046487, + "language_loss": 0.71515137, + "learning_rate": 3.6965271636535783e-06, + "loss": 0.73715341, + "num_input_tokens_seen": 72717880, + "router_z_loss_clip": 1.00585938, + "router_z_loss_mlp": 0.19335938, + "step": 3365, + "time_per_iteration": 3.95620059967041 + }, + { + "auxiliary_loss_clip": 0.01169909, + "auxiliary_loss_mlp": 0.01046624, + "balance_loss_clip": 1.06709254, + "balance_loss_mlp": 1.02875495, + "epoch": 0.2023748684803848, + "flos": 17745365911680.0, + "grad_norm": 1.882979370022273, + "language_loss": 0.85349035, + "learning_rate": 3.696320882607286e-06, + "loss": 0.87565565, + "num_input_tokens_seen": 72736410, + "router_z_loss_clip": 1.02832031, + "router_z_loss_mlp": 0.17871094, + "step": 3366, + "time_per_iteration": 2.437572956085205 + }, + { + "auxiliary_loss_clip": 0.01161802, + "auxiliary_loss_mlp": 0.01042075, + "balance_loss_clip": 1.05989003, + "balance_loss_mlp": 1.02391958, + "epoch": 0.20243499173305277, + "flos": 31138932493440.0, + "grad_norm": 1.700792121496185, + "language_loss": 0.69201899, + "learning_rate": 3.696114537236335e-06, + "loss": 0.7140578, + "num_input_tokens_seen": 72758295, + "router_z_loss_clip": 1.02001953, + "router_z_loss_mlp": 0.18164062, + "step": 3367, + "time_per_iteration": 2.5664730072021484 + }, + { + "auxiliary_loss_clip": 0.01166532, + "auxiliary_loss_mlp": 0.0103967, + "balance_loss_clip": 1.06353664, + "balance_loss_mlp": 1.01938081, + "epoch": 0.20249511498572073, + "flos": 33839235279360.0, + "grad_norm": 2.559201250083683, + "language_loss": 0.6849162, + "learning_rate": 3.6959081275485512e-06, + "loss": 0.7069782, + "num_input_tokens_seen": 72782495, + "router_z_loss_clip": 1.02783203, + "router_z_loss_mlp": 0.20275879, + "step": 3368, + "time_per_iteration": 2.6536436080932617 + }, + { + "auxiliary_loss_clip": 0.01161166, + "auxiliary_loss_mlp": 0.0105157, + "balance_loss_clip": 1.06204748, + "balance_loss_mlp": 1.03098261, + "epoch": 0.2025552382383887, + "flos": 21215629658880.0, + "grad_norm": 1.7852922075651534, + "language_loss": 0.77611345, + "learning_rate": 3.6957016535517615e-06, + "loss": 0.79824084, + "num_input_tokens_seen": 72801885, + "router_z_loss_clip": 0.99121094, + "router_z_loss_mlp": 0.20593262, + "step": 3369, + "time_per_iteration": 2.6369450092315674 + }, + { + "auxiliary_loss_clip": 0.01167045, + "auxiliary_loss_mlp": 0.010562, + "balance_loss_clip": 1.06278968, + "balance_loss_mlp": 1.03776991, + "epoch": 0.20261536149105666, + "flos": 14647388676480.0, + "grad_norm": 3.0628469890901386, + "language_loss": 0.64845777, + "learning_rate": 3.695495115253795e-06, + "loss": 0.67069024, + "num_input_tokens_seen": 72816990, + "router_z_loss_clip": 1.04394531, + "router_z_loss_mlp": 0.18432617, + "step": 3370, + "time_per_iteration": 2.4642934799194336 + }, + { + "auxiliary_loss_clip": 0.01095082, + "auxiliary_loss_mlp": 0.01008201, + "balance_loss_clip": 1.05948114, + "balance_loss_mlp": 1.00550723, + "epoch": 0.20267548474372463, + "flos": 66783649921920.0, + "grad_norm": 0.6899788940515759, + "language_loss": 0.58115554, + "learning_rate": 3.6952885126624834e-06, + "loss": 0.60218835, + "num_input_tokens_seen": 72879240, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.02694702, + "step": 3371, + "time_per_iteration": 3.1903798580169678 + }, + { + "auxiliary_loss_clip": 0.01162414, + "auxiliary_loss_mlp": 0.01042636, + "balance_loss_clip": 1.05975175, + "balance_loss_mlp": 1.02514768, + "epoch": 0.2027356079963926, + "flos": 24680793674880.0, + "grad_norm": 1.714360000435387, + "language_loss": 0.91653001, + "learning_rate": 3.6950818457856617e-06, + "loss": 0.93858051, + "num_input_tokens_seen": 72899030, + "router_z_loss_clip": 1.02636719, + "router_z_loss_mlp": 0.17504883, + "step": 3372, + "time_per_iteration": 2.5100936889648438 + }, + { + "auxiliary_loss_clip": 0.01164758, + "auxiliary_loss_mlp": 0.01045727, + "balance_loss_clip": 1.06324303, + "balance_loss_mlp": 1.02668953, + "epoch": 0.20279573124906058, + "flos": 26392762765440.0, + "grad_norm": 1.8937323209377206, + "language_loss": 0.78082049, + "learning_rate": 3.694875114631167e-06, + "loss": 0.80292535, + "num_input_tokens_seen": 72919190, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.19042969, + "step": 3373, + "time_per_iteration": 3.965785264968872 + }, + { + "auxiliary_loss_clip": 0.01161759, + "auxiliary_loss_mlp": 0.010377, + "balance_loss_clip": 1.06431365, + "balance_loss_mlp": 1.02030742, + "epoch": 0.20285585450172855, + "flos": 33799984692480.0, + "grad_norm": 1.943517921969392, + "language_loss": 0.71408647, + "learning_rate": 3.6946683192068377e-06, + "loss": 0.73608106, + "num_input_tokens_seen": 72939720, + "router_z_loss_clip": 0.97460938, + "router_z_loss_mlp": 0.17407227, + "step": 3374, + "time_per_iteration": 2.639469623565674 + }, + { + "auxiliary_loss_clip": 0.01100262, + "auxiliary_loss_mlp": 0.01004123, + "balance_loss_clip": 1.06515229, + "balance_loss_mlp": 1.00192988, + "epoch": 0.20291597775439651, + "flos": 71164823598720.0, + "grad_norm": 0.9814522445280993, + "language_loss": 0.62469018, + "learning_rate": 3.694461459520516e-06, + "loss": 0.64573407, + "num_input_tokens_seen": 73000015, + "router_z_loss_clip": 0.35058594, + "router_z_loss_mlp": 0.02194214, + "step": 3375, + "time_per_iteration": 4.565025568008423 + }, + { + "auxiliary_loss_clip": 0.01159951, + "auxiliary_loss_mlp": 0.01039439, + "balance_loss_clip": 1.06215572, + "balance_loss_mlp": 1.02261889, + "epoch": 0.20297610100706448, + "flos": 19494287118720.0, + "grad_norm": 1.6370980408579945, + "language_loss": 0.82381558, + "learning_rate": 3.6942545355800463e-06, + "loss": 0.84580952, + "num_input_tokens_seen": 73017675, + "router_z_loss_clip": 0.97753906, + "router_z_loss_mlp": 0.16833496, + "step": 3376, + "time_per_iteration": 2.4367759227752686 + }, + { + "auxiliary_loss_clip": 0.01165855, + "auxiliary_loss_mlp": 0.01036273, + "balance_loss_clip": 1.06354547, + "balance_loss_mlp": 1.01781917, + "epoch": 0.20303622425973245, + "flos": 25044245441280.0, + "grad_norm": 1.9055045993750783, + "language_loss": 0.81668103, + "learning_rate": 3.6940475473932743e-06, + "loss": 0.83870232, + "num_input_tokens_seen": 73036135, + "router_z_loss_clip": 1.02197266, + "router_z_loss_mlp": 0.18469238, + "step": 3377, + "time_per_iteration": 3.9788880348205566 + }, + { + "auxiliary_loss_clip": 0.01160941, + "auxiliary_loss_mlp": 0.01048892, + "balance_loss_clip": 1.06190002, + "balance_loss_mlp": 1.03020024, + "epoch": 0.2030963475124004, + "flos": 21979988098560.0, + "grad_norm": 2.0095932514720856, + "language_loss": 0.76751995, + "learning_rate": 3.69384049496805e-06, + "loss": 0.78961825, + "num_input_tokens_seen": 73054075, + "router_z_loss_clip": 0.99023438, + "router_z_loss_mlp": 0.18664551, + "step": 3378, + "time_per_iteration": 2.488682985305786 + }, + { + "auxiliary_loss_clip": 0.01158118, + "auxiliary_loss_mlp": 0.01045162, + "balance_loss_clip": 1.05807137, + "balance_loss_mlp": 1.02686334, + "epoch": 0.2031564707650684, + "flos": 19500392430720.0, + "grad_norm": 1.8121718576699428, + "language_loss": 0.79560769, + "learning_rate": 3.6936333783122242e-06, + "loss": 0.81764042, + "num_input_tokens_seen": 73073530, + "router_z_loss_clip": 1.00097656, + "router_z_loss_mlp": 0.18286133, + "step": 3379, + "time_per_iteration": 2.5140464305877686 + }, + { + "auxiliary_loss_clip": 0.01152103, + "auxiliary_loss_mlp": 0.0103755, + "balance_loss_clip": 1.0574882, + "balance_loss_mlp": 1.02063441, + "epoch": 0.20321659401773637, + "flos": 22747075971840.0, + "grad_norm": 1.7043507710462829, + "language_loss": 0.86805046, + "learning_rate": 3.6934261974336505e-06, + "loss": 0.889947, + "num_input_tokens_seen": 73092820, + "router_z_loss_clip": 0.94580078, + "router_z_loss_mlp": 0.16906738, + "step": 3380, + "time_per_iteration": 2.6340560913085938 + }, + { + "auxiliary_loss_clip": 0.01162432, + "auxiliary_loss_mlp": 0.01049704, + "balance_loss_clip": 1.06432438, + "balance_loss_mlp": 1.0315485, + "epoch": 0.20327671727040433, + "flos": 22455840499200.0, + "grad_norm": 5.225040409831457, + "language_loss": 0.75150704, + "learning_rate": 3.693218952340186e-06, + "loss": 0.77362841, + "num_input_tokens_seen": 73113385, + "router_z_loss_clip": 0.97998047, + "router_z_loss_mlp": 0.18151855, + "step": 3381, + "time_per_iteration": 2.55961537361145 + }, + { + "auxiliary_loss_clip": 0.01162742, + "auxiliary_loss_mlp": 0.01045156, + "balance_loss_clip": 1.06203985, + "balance_loss_mlp": 1.02748895, + "epoch": 0.2033368405230723, + "flos": 19535010163200.0, + "grad_norm": 1.8203382127217527, + "language_loss": 0.79349005, + "learning_rate": 3.6930116430396895e-06, + "loss": 0.81556904, + "num_input_tokens_seen": 73131195, + "router_z_loss_clip": 1.00634766, + "router_z_loss_mlp": 0.17663574, + "step": 3382, + "time_per_iteration": 2.466155767440796 + }, + { + "auxiliary_loss_clip": 0.01161596, + "auxiliary_loss_mlp": 0.01039385, + "balance_loss_clip": 1.05861139, + "balance_loss_mlp": 1.01969182, + "epoch": 0.20339696377574026, + "flos": 13809233744640.0, + "grad_norm": 2.478996234360016, + "language_loss": 0.80158997, + "learning_rate": 3.6928042695400214e-06, + "loss": 0.82359982, + "num_input_tokens_seen": 73148850, + "router_z_loss_clip": 1.03173828, + "router_z_loss_mlp": 0.19689941, + "step": 3383, + "time_per_iteration": 2.5090575218200684 + }, + { + "auxiliary_loss_clip": 0.0115746, + "auxiliary_loss_mlp": 0.01038372, + "balance_loss_clip": 1.05930281, + "balance_loss_mlp": 1.01988268, + "epoch": 0.20345708702840823, + "flos": 20339409288960.0, + "grad_norm": 2.285322311734635, + "language_loss": 0.74176204, + "learning_rate": 3.6925968318490464e-06, + "loss": 0.76372039, + "num_input_tokens_seen": 73166775, + "router_z_loss_clip": 0.98095703, + "router_z_loss_mlp": 0.18505859, + "step": 3384, + "time_per_iteration": 2.4558804035186768 + }, + { + "auxiliary_loss_clip": 0.01162807, + "auxiliary_loss_mlp": 0.010526, + "balance_loss_clip": 1.05877614, + "balance_loss_mlp": 1.03188145, + "epoch": 0.2035172102810762, + "flos": 20333950421760.0, + "grad_norm": 3.133789492916298, + "language_loss": 0.76609993, + "learning_rate": 3.6923893299746293e-06, + "loss": 0.78825402, + "num_input_tokens_seen": 73183215, + "router_z_loss_clip": 1.04101562, + "router_z_loss_mlp": 0.20715332, + "step": 3385, + "time_per_iteration": 2.4475347995758057 + }, + { + "auxiliary_loss_clip": 0.01163182, + "auxiliary_loss_mlp": 0.01062584, + "balance_loss_clip": 1.06254017, + "balance_loss_mlp": 1.04314053, + "epoch": 0.2035773335337442, + "flos": 23330983461120.0, + "grad_norm": 1.8165369633523474, + "language_loss": 0.6852181, + "learning_rate": 3.692181763924639e-06, + "loss": 0.70747578, + "num_input_tokens_seen": 73203290, + "router_z_loss_clip": 1.00683594, + "router_z_loss_mlp": 0.19433594, + "step": 3386, + "time_per_iteration": 2.481396198272705 + }, + { + "auxiliary_loss_clip": 0.01160325, + "auxiliary_loss_mlp": 0.01060673, + "balance_loss_clip": 1.05870271, + "balance_loss_mlp": 1.04013324, + "epoch": 0.20363745678641215, + "flos": 28330287310080.0, + "grad_norm": 1.4774396580836227, + "language_loss": 0.81201208, + "learning_rate": 3.691974133706947e-06, + "loss": 0.83422208, + "num_input_tokens_seen": 73226185, + "router_z_loss_clip": 1.01855469, + "router_z_loss_mlp": 0.20556641, + "step": 3387, + "time_per_iteration": 2.5597896575927734 + }, + { + "auxiliary_loss_clip": 0.01164389, + "auxiliary_loss_mlp": 0.01050126, + "balance_loss_clip": 1.06467783, + "balance_loss_mlp": 1.03142178, + "epoch": 0.20369758003908012, + "flos": 18915658928640.0, + "grad_norm": 2.2917151114200367, + "language_loss": 0.80056828, + "learning_rate": 3.6917664393294262e-06, + "loss": 0.82271338, + "num_input_tokens_seen": 73243300, + "router_z_loss_clip": 0.99658203, + "router_z_loss_mlp": 0.18695068, + "step": 3388, + "time_per_iteration": 2.4048120975494385 + }, + { + "auxiliary_loss_clip": 0.01164799, + "auxiliary_loss_mlp": 0.01041153, + "balance_loss_clip": 1.06434512, + "balance_loss_mlp": 1.02246118, + "epoch": 0.20375770329174808, + "flos": 19206499351680.0, + "grad_norm": 1.9036054761088481, + "language_loss": 0.71996844, + "learning_rate": 3.6915586807999527e-06, + "loss": 0.74202794, + "num_input_tokens_seen": 73261490, + "router_z_loss_clip": 1.00390625, + "router_z_loss_mlp": 0.18676758, + "step": 3389, + "time_per_iteration": 2.446556806564331 + }, + { + "auxiliary_loss_clip": 0.01161294, + "auxiliary_loss_mlp": 0.0104514, + "balance_loss_clip": 1.06340992, + "balance_loss_mlp": 1.02686524, + "epoch": 0.20381782654441605, + "flos": 19391008538880.0, + "grad_norm": 2.1418788428468876, + "language_loss": 0.87627435, + "learning_rate": 3.691350858126404e-06, + "loss": 0.89833868, + "num_input_tokens_seen": 73280180, + "router_z_loss_clip": 0.97802734, + "router_z_loss_mlp": 0.18286133, + "step": 3390, + "time_per_iteration": 2.4404609203338623 + }, + { + "auxiliary_loss_clip": 0.01166567, + "auxiliary_loss_mlp": 0.01046774, + "balance_loss_clip": 1.06678987, + "balance_loss_mlp": 1.02859461, + "epoch": 0.203877949797084, + "flos": 24827704300800.0, + "grad_norm": 2.090659812294117, + "language_loss": 0.70423245, + "learning_rate": 3.691142971316662e-06, + "loss": 0.72636586, + "num_input_tokens_seen": 73300680, + "router_z_loss_clip": 0.99804688, + "router_z_loss_mlp": 0.18164062, + "step": 3391, + "time_per_iteration": 2.4833364486694336 + }, + { + "auxiliary_loss_clip": 0.0116755, + "auxiliary_loss_mlp": 0.01045807, + "balance_loss_clip": 1.06782556, + "balance_loss_mlp": 1.02846813, + "epoch": 0.20393807304975198, + "flos": 18003707504640.0, + "grad_norm": 2.431798080690232, + "language_loss": 0.86554176, + "learning_rate": 3.6909350203786086e-06, + "loss": 0.88767534, + "num_input_tokens_seen": 73316760, + "router_z_loss_clip": 0.99707031, + "router_z_loss_mlp": 0.17340088, + "step": 3392, + "time_per_iteration": 2.4476425647735596 + }, + { + "auxiliary_loss_clip": 0.01158369, + "auxiliary_loss_mlp": 0.01049762, + "balance_loss_clip": 1.05768991, + "balance_loss_mlp": 1.03254867, + "epoch": 0.20399819630241997, + "flos": 24206988349440.0, + "grad_norm": 1.6814547972115097, + "language_loss": 0.8064639, + "learning_rate": 3.69072700532013e-06, + "loss": 0.82854521, + "num_input_tokens_seen": 73339385, + "router_z_loss_clip": 1.00732422, + "router_z_loss_mlp": 0.17199707, + "step": 3393, + "time_per_iteration": 2.7288262844085693 + }, + { + "auxiliary_loss_clip": 0.0116115, + "auxiliary_loss_mlp": 0.01041502, + "balance_loss_clip": 1.06097317, + "balance_loss_mlp": 1.02455091, + "epoch": 0.20405831955508794, + "flos": 20777124424320.0, + "grad_norm": 1.87462764057069, + "language_loss": 0.86065137, + "learning_rate": 3.6905189261491137e-06, + "loss": 0.88267791, + "num_input_tokens_seen": 73357235, + "router_z_loss_clip": 1.00195312, + "router_z_loss_mlp": 0.16943359, + "step": 3394, + "time_per_iteration": 2.5194668769836426 + }, + { + "auxiliary_loss_clip": 0.01165962, + "auxiliary_loss_mlp": 0.01040833, + "balance_loss_clip": 1.06722665, + "balance_loss_mlp": 1.02441752, + "epoch": 0.2041184428077559, + "flos": 15486908325120.0, + "grad_norm": 3.019314647584289, + "language_loss": 0.83789247, + "learning_rate": 3.69031078287345e-06, + "loss": 0.85996044, + "num_input_tokens_seen": 73374435, + "router_z_loss_clip": 0.98779297, + "router_z_loss_mlp": 0.16430664, + "step": 3395, + "time_per_iteration": 2.4830169677734375 + }, + { + "auxiliary_loss_clip": 0.01169481, + "auxiliary_loss_mlp": 0.0103631, + "balance_loss_clip": 1.06584907, + "balance_loss_mlp": 1.01798797, + "epoch": 0.20417856606042387, + "flos": 15588463052160.0, + "grad_norm": 2.2965829368107347, + "language_loss": 0.83774412, + "learning_rate": 3.690102575501033e-06, + "loss": 0.85980207, + "num_input_tokens_seen": 73391025, + "router_z_loss_clip": 1.03662109, + "router_z_loss_mlp": 0.18334961, + "step": 3396, + "time_per_iteration": 2.427245855331421 + }, + { + "auxiliary_loss_clip": 0.01162753, + "auxiliary_loss_mlp": 0.01042971, + "balance_loss_clip": 1.06401706, + "balance_loss_mlp": 1.02408814, + "epoch": 0.20423868931309183, + "flos": 24279348297600.0, + "grad_norm": 1.7136490362048087, + "language_loss": 0.77424604, + "learning_rate": 3.6898943040397556e-06, + "loss": 0.79630327, + "num_input_tokens_seen": 73409270, + "router_z_loss_clip": 0.98632812, + "router_z_loss_mlp": 0.1887207, + "step": 3397, + "time_per_iteration": 2.5204310417175293 + }, + { + "auxiliary_loss_clip": 0.01171245, + "auxiliary_loss_mlp": 0.01040495, + "balance_loss_clip": 1.07051134, + "balance_loss_mlp": 1.02367473, + "epoch": 0.2042988125657598, + "flos": 18614870438400.0, + "grad_norm": 2.625219361985122, + "language_loss": 0.87620449, + "learning_rate": 3.689685968497518e-06, + "loss": 0.89832187, + "num_input_tokens_seen": 73425225, + "router_z_loss_clip": 1.00683594, + "router_z_loss_mlp": 0.16833496, + "step": 3398, + "time_per_iteration": 2.476414680480957 + }, + { + "auxiliary_loss_clip": 0.01172338, + "auxiliary_loss_mlp": 0.01044428, + "balance_loss_clip": 1.06805921, + "balance_loss_mlp": 1.02599871, + "epoch": 0.2043589358184278, + "flos": 17851230270720.0, + "grad_norm": 2.1668874009346797, + "language_loss": 0.78167516, + "learning_rate": 3.6894775688822186e-06, + "loss": 0.80384284, + "num_input_tokens_seen": 73440940, + "router_z_loss_clip": 1.04199219, + "router_z_loss_mlp": 0.18432617, + "step": 3399, + "time_per_iteration": 2.4734749794006348 + }, + { + "auxiliary_loss_clip": 0.01158807, + "auxiliary_loss_mlp": 0.01043749, + "balance_loss_clip": 1.05785108, + "balance_loss_mlp": 1.02546239, + "epoch": 0.20441905907109575, + "flos": 21435223455360.0, + "grad_norm": 2.214347978722733, + "language_loss": 0.7603972, + "learning_rate": 3.6892691052017603e-06, + "loss": 0.78242278, + "num_input_tokens_seen": 73458805, + "router_z_loss_clip": 1.00878906, + "router_z_loss_mlp": 0.18286133, + "step": 3400, + "time_per_iteration": 2.51758074760437 + }, + { + "auxiliary_loss_clip": 0.01161948, + "auxiliary_loss_mlp": 0.01045967, + "balance_loss_clip": 1.06420314, + "balance_loss_mlp": 1.02785897, + "epoch": 0.20447918232376372, + "flos": 27707703851520.0, + "grad_norm": 2.0235774714981427, + "language_loss": 0.79474455, + "learning_rate": 3.6890605774640487e-06, + "loss": 0.81682372, + "num_input_tokens_seen": 73479380, + "router_z_loss_clip": 0.97705078, + "router_z_loss_mlp": 0.18115234, + "step": 3401, + "time_per_iteration": 2.516388416290283 + }, + { + "auxiliary_loss_clip": 0.01183744, + "auxiliary_loss_mlp": 0.01042331, + "balance_loss_clip": 1.07726789, + "balance_loss_mlp": 1.02323413, + "epoch": 0.20453930557643168, + "flos": 30524214113280.0, + "grad_norm": 1.595581953080248, + "language_loss": 0.69580758, + "learning_rate": 3.688851985676991e-06, + "loss": 0.7180683, + "num_input_tokens_seen": 73505105, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.1907959, + "step": 3402, + "time_per_iteration": 2.6277079582214355 + }, + { + "auxiliary_loss_clip": 0.01169833, + "auxiliary_loss_mlp": 0.01042989, + "balance_loss_clip": 1.06742525, + "balance_loss_mlp": 1.02420211, + "epoch": 0.20459942882909965, + "flos": 18987767481600.0, + "grad_norm": 2.7972501836825385, + "language_loss": 0.80943358, + "learning_rate": 3.688643329848496e-06, + "loss": 0.8315618, + "num_input_tokens_seen": 73523700, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.18774414, + "step": 3403, + "time_per_iteration": 2.4524145126342773 + }, + { + "auxiliary_loss_clip": 0.01168857, + "auxiliary_loss_mlp": 0.01041875, + "balance_loss_clip": 1.06793189, + "balance_loss_mlp": 1.02433956, + "epoch": 0.20465955208176762, + "flos": 20339050152960.0, + "grad_norm": 2.0517288643811016, + "language_loss": 0.8390497, + "learning_rate": 3.6884346099864772e-06, + "loss": 0.861157, + "num_input_tokens_seen": 73542625, + "router_z_loss_clip": 1.00976562, + "router_z_loss_mlp": 0.17541504, + "step": 3404, + "time_per_iteration": 2.521969795227051 + }, + { + "auxiliary_loss_clip": 0.01162031, + "auxiliary_loss_mlp": 0.01049717, + "balance_loss_clip": 1.06025171, + "balance_loss_mlp": 1.03129888, + "epoch": 0.20471967533443558, + "flos": 21251288885760.0, + "grad_norm": 1.7868084198742349, + "language_loss": 0.86081481, + "learning_rate": 3.6882258260988487e-06, + "loss": 0.88293231, + "num_input_tokens_seen": 73561450, + "router_z_loss_clip": 1.01855469, + "router_z_loss_mlp": 0.18408203, + "step": 3405, + "time_per_iteration": 2.675058364868164 + }, + { + "auxiliary_loss_clip": 0.0116581, + "auxiliary_loss_mlp": 0.01041302, + "balance_loss_clip": 1.0662365, + "balance_loss_mlp": 1.02425563, + "epoch": 0.20477979858710357, + "flos": 14501555458560.0, + "grad_norm": 2.407170871537502, + "language_loss": 0.84280694, + "learning_rate": 3.6880169781935276e-06, + "loss": 0.86487806, + "num_input_tokens_seen": 73577155, + "router_z_loss_clip": 0.99707031, + "router_z_loss_mlp": 0.1706543, + "step": 3406, + "time_per_iteration": 2.551884889602661 + }, + { + "auxiliary_loss_clip": 0.01159071, + "auxiliary_loss_mlp": 0.01039249, + "balance_loss_clip": 1.06198001, + "balance_loss_mlp": 1.02139127, + "epoch": 0.20483992183977154, + "flos": 11400310085760.0, + "grad_norm": 3.429194568616399, + "language_loss": 0.6779874, + "learning_rate": 3.6878080662784336e-06, + "loss": 0.6999706, + "num_input_tokens_seen": 73594900, + "router_z_loss_clip": 0.97070312, + "router_z_loss_mlp": 0.1784668, + "step": 3407, + "time_per_iteration": 2.4412810802459717 + }, + { + "auxiliary_loss_clip": 0.01161647, + "auxiliary_loss_mlp": 0.01050117, + "balance_loss_clip": 1.06140089, + "balance_loss_mlp": 1.03128171, + "epoch": 0.2049000450924395, + "flos": 19060271084160.0, + "grad_norm": 2.9170286610847223, + "language_loss": 0.84123307, + "learning_rate": 3.6875990903614886e-06, + "loss": 0.86335075, + "num_input_tokens_seen": 73613810, + "router_z_loss_clip": 1.00097656, + "router_z_loss_mlp": 0.18847656, + "step": 3408, + "time_per_iteration": 3.960547685623169 + }, + { + "auxiliary_loss_clip": 0.01169027, + "auxiliary_loss_mlp": 0.01042819, + "balance_loss_clip": 1.06553769, + "balance_loss_mlp": 1.02509868, + "epoch": 0.20496016834510747, + "flos": 14574561851520.0, + "grad_norm": 2.2284730320511956, + "language_loss": 0.64686489, + "learning_rate": 3.6873900504506166e-06, + "loss": 0.66898334, + "num_input_tokens_seen": 73631495, + "router_z_loss_clip": 1.03515625, + "router_z_loss_mlp": 0.17718506, + "step": 3409, + "time_per_iteration": 2.463644504547119 + }, + { + "auxiliary_loss_clip": 0.01162793, + "auxiliary_loss_mlp": 0.01045884, + "balance_loss_clip": 1.06026089, + "balance_loss_mlp": 1.02778864, + "epoch": 0.20502029159777543, + "flos": 22126647329280.0, + "grad_norm": 1.433941271423194, + "language_loss": 0.80454117, + "learning_rate": 3.687180946553745e-06, + "loss": 0.82662797, + "num_input_tokens_seen": 73652840, + "router_z_loss_clip": 1.02587891, + "router_z_loss_mlp": 0.18103027, + "step": 3410, + "time_per_iteration": 2.578707456588745 + }, + { + "auxiliary_loss_clip": 0.01163088, + "auxiliary_loss_mlp": 0.01062371, + "balance_loss_clip": 1.06192076, + "balance_loss_mlp": 1.0423677, + "epoch": 0.2050804148504434, + "flos": 25367907916800.0, + "grad_norm": 2.4071835093039122, + "language_loss": 0.7663027, + "learning_rate": 3.686971778678803e-06, + "loss": 0.78855729, + "num_input_tokens_seen": 73672150, + "router_z_loss_clip": 1.01074219, + "router_z_loss_mlp": 0.19995117, + "step": 3411, + "time_per_iteration": 2.51547908782959 + }, + { + "auxiliary_loss_clip": 0.01168233, + "auxiliary_loss_mlp": 0.01041843, + "balance_loss_clip": 1.06786418, + "balance_loss_mlp": 1.02518928, + "epoch": 0.2051405381031114, + "flos": 23620171858560.0, + "grad_norm": 1.9380166186158747, + "language_loss": 0.73752075, + "learning_rate": 3.686762546833722e-06, + "loss": 0.7596215, + "num_input_tokens_seen": 73691940, + "router_z_loss_clip": 1.00341797, + "router_z_loss_mlp": 0.16662598, + "step": 3412, + "time_per_iteration": 2.4713916778564453 + }, + { + "auxiliary_loss_clip": 0.01171649, + "auxiliary_loss_mlp": 0.0104759, + "balance_loss_clip": 1.06621194, + "balance_loss_mlp": 1.02917218, + "epoch": 0.20520066135577936, + "flos": 19565533745280.0, + "grad_norm": 2.488422382155914, + "language_loss": 0.7790764, + "learning_rate": 3.6865532510264362e-06, + "loss": 0.80126882, + "num_input_tokens_seen": 73709080, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.18408203, + "step": 3413, + "time_per_iteration": 2.423025369644165 + }, + { + "auxiliary_loss_clip": 0.01170078, + "auxiliary_loss_mlp": 0.01045473, + "balance_loss_clip": 1.07281685, + "balance_loss_mlp": 1.02727032, + "epoch": 0.20526078460844732, + "flos": 17676345928320.0, + "grad_norm": 2.528004011793142, + "language_loss": 0.84600306, + "learning_rate": 3.6863438912648823e-06, + "loss": 0.86815864, + "num_input_tokens_seen": 73727670, + "router_z_loss_clip": 0.97265625, + "router_z_loss_mlp": 0.18212891, + "step": 3414, + "time_per_iteration": 2.4503824710845947 + }, + { + "auxiliary_loss_clip": 0.01178233, + "auxiliary_loss_mlp": 0.0104443, + "balance_loss_clip": 1.07515705, + "balance_loss_mlp": 1.02621531, + "epoch": 0.2053209078611153, + "flos": 21500328856320.0, + "grad_norm": 1.8521261201819008, + "language_loss": 0.80745178, + "learning_rate": 3.6861344675569986e-06, + "loss": 0.82967842, + "num_input_tokens_seen": 73747170, + "router_z_loss_clip": 1.03222656, + "router_z_loss_mlp": 0.18206787, + "step": 3415, + "time_per_iteration": 2.4714314937591553 + }, + { + "auxiliary_loss_clip": 0.01170268, + "auxiliary_loss_mlp": 0.01038445, + "balance_loss_clip": 1.07110202, + "balance_loss_mlp": 1.02254272, + "epoch": 0.20538103111378325, + "flos": 25663524848640.0, + "grad_norm": 1.8796536257859207, + "language_loss": 0.73275006, + "learning_rate": 3.6859249799107275e-06, + "loss": 0.75483716, + "num_input_tokens_seen": 73767690, + "router_z_loss_clip": 0.99072266, + "router_z_loss_mlp": 0.15905762, + "step": 3416, + "time_per_iteration": 3.932217836380005 + }, + { + "auxiliary_loss_clip": 0.01183729, + "auxiliary_loss_mlp": 0.01046726, + "balance_loss_clip": 1.07833409, + "balance_loss_mlp": 1.02846277, + "epoch": 0.20544115436645122, + "flos": 23148952312320.0, + "grad_norm": 2.28813975973073, + "language_loss": 0.78727698, + "learning_rate": 3.6857154283340115e-06, + "loss": 0.80958152, + "num_input_tokens_seen": 73786900, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.18249512, + "step": 3417, + "time_per_iteration": 2.4884114265441895 + }, + { + "auxiliary_loss_clip": 0.01169654, + "auxiliary_loss_mlp": 0.01050267, + "balance_loss_clip": 1.06930137, + "balance_loss_mlp": 1.03164625, + "epoch": 0.20550127761911918, + "flos": 19390433921280.0, + "grad_norm": 2.40842409619666, + "language_loss": 0.87223226, + "learning_rate": 3.685505812834798e-06, + "loss": 0.89443141, + "num_input_tokens_seen": 73804515, + "router_z_loss_clip": 1.00341797, + "router_z_loss_mlp": 0.18603516, + "step": 3418, + "time_per_iteration": 2.6163864135742188 + }, + { + "auxiliary_loss_clip": 0.01161364, + "auxiliary_loss_mlp": 0.01041062, + "balance_loss_clip": 1.06174707, + "balance_loss_mlp": 1.02277541, + "epoch": 0.20556140087178718, + "flos": 22893124671360.0, + "grad_norm": 2.048071875379097, + "language_loss": 0.62208581, + "learning_rate": 3.685296133421035e-06, + "loss": 0.64411002, + "num_input_tokens_seen": 73822910, + "router_z_loss_clip": 0.99560547, + "router_z_loss_mlp": 0.18286133, + "step": 3419, + "time_per_iteration": 3.868243932723999 + }, + { + "auxiliary_loss_clip": 0.01165058, + "auxiliary_loss_mlp": 0.01056419, + "balance_loss_clip": 1.06177366, + "balance_loss_mlp": 1.03572452, + "epoch": 0.20562152412445514, + "flos": 19789652655360.0, + "grad_norm": 1.9228579196216522, + "language_loss": 0.86417246, + "learning_rate": 3.685086390100674e-06, + "loss": 0.88638729, + "num_input_tokens_seen": 73841160, + "router_z_loss_clip": 1.03222656, + "router_z_loss_mlp": 0.20703125, + "step": 3420, + "time_per_iteration": 2.478262424468994 + }, + { + "auxiliary_loss_clip": 0.01165756, + "auxiliary_loss_mlp": 0.0104238, + "balance_loss_clip": 1.0648365, + "balance_loss_mlp": 1.02445114, + "epoch": 0.2056816473771231, + "flos": 31501989210240.0, + "grad_norm": 2.2020085538826035, + "language_loss": 0.70832264, + "learning_rate": 3.684876582881668e-06, + "loss": 0.73040402, + "num_input_tokens_seen": 73862795, + "router_z_loss_clip": 1.00927734, + "router_z_loss_mlp": 0.17932129, + "step": 3421, + "time_per_iteration": 4.080257415771484 + }, + { + "auxiliary_loss_clip": 0.01186185, + "auxiliary_loss_mlp": 0.01044385, + "balance_loss_clip": 1.0811646, + "balance_loss_mlp": 1.02596688, + "epoch": 0.20574177062979107, + "flos": 23258372117760.0, + "grad_norm": 2.0416720989795865, + "language_loss": 0.71090668, + "learning_rate": 3.6846667117719732e-06, + "loss": 0.73321235, + "num_input_tokens_seen": 73881525, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.1842041, + "step": 3422, + "time_per_iteration": 2.5330183506011963 + }, + { + "auxiliary_loss_clip": 0.01100299, + "auxiliary_loss_mlp": 0.0103619, + "balance_loss_clip": 1.06348395, + "balance_loss_mlp": 1.03419912, + "epoch": 0.20580189388245904, + "flos": 70312518708480.0, + "grad_norm": 0.7514630612190366, + "language_loss": 0.55507922, + "learning_rate": 3.684456776779548e-06, + "loss": 0.57644415, + "num_input_tokens_seen": 73937775, + "router_z_loss_clip": 0.36816406, + "router_z_loss_mlp": 0.01992798, + "step": 3423, + "time_per_iteration": 3.132570743560791 + }, + { + "auxiliary_loss_clip": 0.01164809, + "auxiliary_loss_mlp": 0.01043295, + "balance_loss_clip": 1.06353235, + "balance_loss_mlp": 1.02419734, + "epoch": 0.205862017135127, + "flos": 30737846252160.0, + "grad_norm": 19.950661698972933, + "language_loss": 0.71665967, + "learning_rate": 3.684246777912353e-06, + "loss": 0.73874068, + "num_input_tokens_seen": 73958250, + "router_z_loss_clip": 1.01171875, + "router_z_loss_mlp": 0.19091797, + "step": 3424, + "time_per_iteration": 2.674783945083618 + }, + { + "auxiliary_loss_clip": 0.01168161, + "auxiliary_loss_mlp": 0.01046324, + "balance_loss_clip": 1.06913996, + "balance_loss_mlp": 1.02850246, + "epoch": 0.20592214038779497, + "flos": 21324546673920.0, + "grad_norm": 2.8483614291874435, + "language_loss": 0.75336981, + "learning_rate": 3.684036715178351e-06, + "loss": 0.77551472, + "num_input_tokens_seen": 73977775, + "router_z_loss_clip": 0.99121094, + "router_z_loss_mlp": 0.17834473, + "step": 3425, + "time_per_iteration": 2.507972002029419 + }, + { + "auxiliary_loss_clip": 0.01183674, + "auxiliary_loss_mlp": 0.01058238, + "balance_loss_clip": 1.08223116, + "balance_loss_mlp": 1.03986788, + "epoch": 0.20598226364046296, + "flos": 22891652213760.0, + "grad_norm": 3.3751986000818386, + "language_loss": 0.88385057, + "learning_rate": 3.683826588585508e-06, + "loss": 0.90626973, + "num_input_tokens_seen": 73996590, + "router_z_loss_clip": 1.01367188, + "router_z_loss_mlp": 0.18347168, + "step": 3426, + "time_per_iteration": 2.4851744174957275 + }, + { + "auxiliary_loss_clip": 0.01166268, + "auxiliary_loss_mlp": 0.01047649, + "balance_loss_clip": 1.06760228, + "balance_loss_mlp": 1.02932668, + "epoch": 0.20604238689313092, + "flos": 23878549365120.0, + "grad_norm": 1.7864115801958056, + "language_loss": 0.76748484, + "learning_rate": 3.6836163981417926e-06, + "loss": 0.78962398, + "num_input_tokens_seen": 74015935, + "router_z_loss_clip": 0.98681641, + "router_z_loss_mlp": 0.18310547, + "step": 3427, + "time_per_iteration": 2.4919421672821045 + }, + { + "auxiliary_loss_clip": 0.01167196, + "auxiliary_loss_mlp": 0.01051882, + "balance_loss_clip": 1.0634079, + "balance_loss_mlp": 1.03299904, + "epoch": 0.2061025101457989, + "flos": 22491535639680.0, + "grad_norm": 2.8986098045558375, + "language_loss": 0.73864007, + "learning_rate": 3.683406143855174e-06, + "loss": 0.76083088, + "num_input_tokens_seen": 74036575, + "router_z_loss_clip": 1.03808594, + "router_z_loss_mlp": 0.18884277, + "step": 3428, + "time_per_iteration": 2.492506742477417 + }, + { + "auxiliary_loss_clip": 0.01176832, + "auxiliary_loss_mlp": 0.01050611, + "balance_loss_clip": 1.07107115, + "balance_loss_mlp": 1.03094137, + "epoch": 0.20616263339846685, + "flos": 22778928357120.0, + "grad_norm": 1.9247866224351498, + "language_loss": 0.73939466, + "learning_rate": 3.6831958257336256e-06, + "loss": 0.7616691, + "num_input_tokens_seen": 74055365, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.19665527, + "step": 3429, + "time_per_iteration": 2.5753049850463867 + }, + { + "auxiliary_loss_clip": 0.01181633, + "auxiliary_loss_mlp": 0.0105213, + "balance_loss_clip": 1.07632101, + "balance_loss_mlp": 1.03333068, + "epoch": 0.20622275665113482, + "flos": 20882198684160.0, + "grad_norm": 1.8658108727172504, + "language_loss": 0.85222936, + "learning_rate": 3.6829854437851237e-06, + "loss": 0.87456703, + "num_input_tokens_seen": 74074875, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.18798828, + "step": 3430, + "time_per_iteration": 2.47727370262146 + }, + { + "auxiliary_loss_clip": 0.01168576, + "auxiliary_loss_mlp": 0.01050588, + "balance_loss_clip": 1.06610882, + "balance_loss_mlp": 1.03177691, + "epoch": 0.20628287990380278, + "flos": 19354415558400.0, + "grad_norm": 1.8477447241281286, + "language_loss": 0.69173825, + "learning_rate": 3.6827749980176444e-06, + "loss": 0.71392989, + "num_input_tokens_seen": 74094505, + "router_z_loss_clip": 1.02392578, + "router_z_loss_mlp": 0.18798828, + "step": 3431, + "time_per_iteration": 2.454298973083496 + }, + { + "auxiliary_loss_clip": 0.01094232, + "auxiliary_loss_mlp": 0.01025534, + "balance_loss_clip": 1.05976772, + "balance_loss_mlp": 1.02297699, + "epoch": 0.20634300315647078, + "flos": 71517932248320.0, + "grad_norm": 0.812247721819573, + "language_loss": 0.60255265, + "learning_rate": 3.6825644884391693e-06, + "loss": 0.62375033, + "num_input_tokens_seen": 74158500, + "router_z_loss_clip": 0.34423828, + "router_z_loss_mlp": 0.02560425, + "step": 3432, + "time_per_iteration": 3.2130672931671143 + }, + { + "auxiliary_loss_clip": 0.01168868, + "auxiliary_loss_mlp": 0.01045709, + "balance_loss_clip": 1.06874061, + "balance_loss_mlp": 1.02775574, + "epoch": 0.20640312640913874, + "flos": 21723944976000.0, + "grad_norm": 1.7356971334677973, + "language_loss": 0.72544158, + "learning_rate": 3.682353915057679e-06, + "loss": 0.74758738, + "num_input_tokens_seen": 74176685, + "router_z_loss_clip": 1.00146484, + "router_z_loss_mlp": 0.17944336, + "step": 3433, + "time_per_iteration": 2.476231575012207 + }, + { + "auxiliary_loss_clip": 0.01167332, + "auxiliary_loss_mlp": 0.01062602, + "balance_loss_clip": 1.0635494, + "balance_loss_mlp": 1.04175246, + "epoch": 0.2064632496618067, + "flos": 20554621626240.0, + "grad_norm": 1.6912150662189034, + "language_loss": 0.86881566, + "learning_rate": 3.6821432778811604e-06, + "loss": 0.89111507, + "num_input_tokens_seen": 74194935, + "router_z_loss_clip": 1.03710938, + "router_z_loss_mlp": 0.20849609, + "step": 3434, + "time_per_iteration": 2.4546139240264893 + }, + { + "auxiliary_loss_clip": 0.01172076, + "auxiliary_loss_mlp": 0.01045824, + "balance_loss_clip": 1.06553173, + "balance_loss_mlp": 1.02751327, + "epoch": 0.20652337291447467, + "flos": 29823273135360.0, + "grad_norm": 1.7467750696825914, + "language_loss": 0.69091272, + "learning_rate": 3.6819325769176004e-06, + "loss": 0.71309173, + "num_input_tokens_seen": 74215400, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.1829834, + "step": 3435, + "time_per_iteration": 2.511589765548706 + }, + { + "auxiliary_loss_clip": 0.01166501, + "auxiliary_loss_mlp": 0.01044414, + "balance_loss_clip": 1.06490004, + "balance_loss_mlp": 1.02499461, + "epoch": 0.20658349616714264, + "flos": 26213640618240.0, + "grad_norm": 1.8445536777531264, + "language_loss": 0.89191496, + "learning_rate": 3.681721812174988e-06, + "loss": 0.91402411, + "num_input_tokens_seen": 74234090, + "router_z_loss_clip": 1.01708984, + "router_z_loss_mlp": 0.1940918, + "step": 3436, + "time_per_iteration": 2.5104503631591797 + }, + { + "auxiliary_loss_clip": 0.01167403, + "auxiliary_loss_mlp": 0.01038845, + "balance_loss_clip": 1.0664078, + "balance_loss_mlp": 1.02078497, + "epoch": 0.2066436194198106, + "flos": 25994370044160.0, + "grad_norm": 1.9391758801259904, + "language_loss": 0.76544344, + "learning_rate": 3.6815109836613163e-06, + "loss": 0.78750587, + "num_input_tokens_seen": 74253345, + "router_z_loss_clip": 1.00830078, + "router_z_loss_mlp": 0.18066406, + "step": 3437, + "time_per_iteration": 2.4926350116729736 + }, + { + "auxiliary_loss_clip": 0.01169156, + "auxiliary_loss_mlp": 0.01042229, + "balance_loss_clip": 1.06734681, + "balance_loss_mlp": 1.02468157, + "epoch": 0.20670374267247857, + "flos": 21361067827200.0, + "grad_norm": 2.3154462331180903, + "language_loss": 0.77449059, + "learning_rate": 3.6813000913845795e-06, + "loss": 0.79660445, + "num_input_tokens_seen": 74271615, + "router_z_loss_clip": 1.01855469, + "router_z_loss_mlp": 0.17553711, + "step": 3438, + "time_per_iteration": 2.483689785003662 + }, + { + "auxiliary_loss_clip": 0.01089843, + "auxiliary_loss_mlp": 0.01012699, + "balance_loss_clip": 1.05302763, + "balance_loss_mlp": 1.0107944, + "epoch": 0.20676386592514656, + "flos": 66383281952640.0, + "grad_norm": 0.8370155008888308, + "language_loss": 0.67080176, + "learning_rate": 3.6810891353527747e-06, + "loss": 0.69182718, + "num_input_tokens_seen": 74331390, + "router_z_loss_clip": 0.36816406, + "router_z_loss_mlp": 0.01901245, + "step": 3439, + "time_per_iteration": 3.0716826915740967 + }, + { + "auxiliary_loss_clip": 0.01165062, + "auxiliary_loss_mlp": 0.01036151, + "balance_loss_clip": 1.06243134, + "balance_loss_mlp": 1.01831722, + "epoch": 0.20682398917781453, + "flos": 17274577328640.0, + "grad_norm": 2.5201974079546616, + "language_loss": 0.84082341, + "learning_rate": 3.6808781155739014e-06, + "loss": 0.86283553, + "num_input_tokens_seen": 74347335, + "router_z_loss_clip": 1.02587891, + "router_z_loss_mlp": 0.17822266, + "step": 3440, + "time_per_iteration": 2.426816701889038 + }, + { + "auxiliary_loss_clip": 0.01167061, + "auxiliary_loss_mlp": 0.01046722, + "balance_loss_clip": 1.06379628, + "balance_loss_mlp": 1.02910304, + "epoch": 0.2068841124304825, + "flos": 18077288515200.0, + "grad_norm": 2.0611458793966406, + "language_loss": 0.85257196, + "learning_rate": 3.6806670320559614e-06, + "loss": 0.87470984, + "num_input_tokens_seen": 74366310, + "router_z_loss_clip": 1.03222656, + "router_z_loss_mlp": 0.17614746, + "step": 3441, + "time_per_iteration": 2.433518409729004 + }, + { + "auxiliary_loss_clip": 0.01164866, + "auxiliary_loss_mlp": 0.01047804, + "balance_loss_clip": 1.06644058, + "balance_loss_mlp": 1.03027987, + "epoch": 0.20694423568315046, + "flos": 27347017432320.0, + "grad_norm": 1.8737899642468145, + "language_loss": 0.85786021, + "learning_rate": 3.680455884806959e-06, + "loss": 0.87998688, + "num_input_tokens_seen": 74387100, + "router_z_loss_clip": 0.98535156, + "router_z_loss_mlp": 0.17529297, + "step": 3442, + "time_per_iteration": 2.6707606315612793 + }, + { + "auxiliary_loss_clip": 0.01170653, + "auxiliary_loss_mlp": 0.01046491, + "balance_loss_clip": 1.06584525, + "balance_loss_mlp": 1.02752507, + "epoch": 0.20700435893581842, + "flos": 20229845829120.0, + "grad_norm": 2.211394694595543, + "language_loss": 0.73270983, + "learning_rate": 3.6802446738349014e-06, + "loss": 0.75488126, + "num_input_tokens_seen": 74404460, + "router_z_loss_clip": 1.05078125, + "router_z_loss_mlp": 0.1895752, + "step": 3443, + "time_per_iteration": 2.4921722412109375 + }, + { + "auxiliary_loss_clip": 0.01165631, + "auxiliary_loss_mlp": 0.010416, + "balance_loss_clip": 1.06578493, + "balance_loss_mlp": 1.02498269, + "epoch": 0.2070644821884864, + "flos": 20631111638400.0, + "grad_norm": 1.7359848225186527, + "language_loss": 0.85265738, + "learning_rate": 3.680033399147797e-06, + "loss": 0.87472969, + "num_input_tokens_seen": 74423790, + "router_z_loss_clip": 0.99853516, + "router_z_loss_mlp": 0.1661377, + "step": 3444, + "time_per_iteration": 2.4928252696990967 + }, + { + "auxiliary_loss_clip": 0.01091204, + "auxiliary_loss_mlp": 0.01009647, + "balance_loss_clip": 1.05657089, + "balance_loss_mlp": 1.00701499, + "epoch": 0.20712460544115438, + "flos": 65941077617280.0, + "grad_norm": 0.6872872141028934, + "language_loss": 0.57081473, + "learning_rate": 3.6798220607536585e-06, + "loss": 0.59182322, + "num_input_tokens_seen": 74488130, + "router_z_loss_clip": 0.34619141, + "router_z_loss_mlp": 0.02630615, + "step": 3445, + "time_per_iteration": 3.0678062438964844 + }, + { + "auxiliary_loss_clip": 0.01161157, + "auxiliary_loss_mlp": 0.01044706, + "balance_loss_clip": 1.06201661, + "balance_loss_mlp": 1.02666998, + "epoch": 0.20718472869382235, + "flos": 19425734012160.0, + "grad_norm": 1.585829881754175, + "language_loss": 0.78012371, + "learning_rate": 3.6796106586604987e-06, + "loss": 0.80218232, + "num_input_tokens_seen": 74506720, + "router_z_loss_clip": 0.99023438, + "router_z_loss_mlp": 0.18041992, + "step": 3446, + "time_per_iteration": 2.4764556884765625 + }, + { + "auxiliary_loss_clip": 0.01167501, + "auxiliary_loss_mlp": 0.01049125, + "balance_loss_clip": 1.06191516, + "balance_loss_mlp": 1.02816796, + "epoch": 0.2072448519464903, + "flos": 24499049834880.0, + "grad_norm": 2.247111067077458, + "language_loss": 0.6194011, + "learning_rate": 3.679399192876334e-06, + "loss": 0.64156735, + "num_input_tokens_seen": 74525330, + "router_z_loss_clip": 1.05517578, + "router_z_loss_mlp": 0.2097168, + "step": 3447, + "time_per_iteration": 2.541168451309204 + }, + { + "auxiliary_loss_clip": 0.01159416, + "auxiliary_loss_mlp": 0.01056635, + "balance_loss_clip": 1.0569942, + "balance_loss_mlp": 1.03771663, + "epoch": 0.20730497519915828, + "flos": 23075694524160.0, + "grad_norm": 1.678182595058284, + "language_loss": 0.86122668, + "learning_rate": 3.679187663409184e-06, + "loss": 0.88338715, + "num_input_tokens_seen": 74544535, + "router_z_loss_clip": 1.02539062, + "router_z_loss_mlp": 0.18908691, + "step": 3448, + "time_per_iteration": 2.5117225646972656 + }, + { + "auxiliary_loss_clip": 0.01159972, + "auxiliary_loss_mlp": 0.01052182, + "balance_loss_clip": 1.05979323, + "balance_loss_mlp": 1.03053403, + "epoch": 0.20736509845182624, + "flos": 21069042255360.0, + "grad_norm": 3.0572498162591955, + "language_loss": 0.75112998, + "learning_rate": 3.6789760702670696e-06, + "loss": 0.77325153, + "num_input_tokens_seen": 74562300, + "router_z_loss_clip": 1.00195312, + "router_z_loss_mlp": 0.21655273, + "step": 3449, + "time_per_iteration": 2.4639410972595215 + }, + { + "auxiliary_loss_clip": 0.0116298, + "auxiliary_loss_mlp": 0.01047411, + "balance_loss_clip": 1.05676389, + "balance_loss_mlp": 1.02774119, + "epoch": 0.2074252217044942, + "flos": 17633288499840.0, + "grad_norm": 2.5903216932099746, + "language_loss": 0.76474977, + "learning_rate": 3.6787644134580134e-06, + "loss": 0.78685361, + "num_input_tokens_seen": 74580080, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.19677734, + "step": 3450, + "time_per_iteration": 2.4310810565948486 + }, + { + "auxiliary_loss_clip": 0.01165422, + "auxiliary_loss_mlp": 0.01043343, + "balance_loss_clip": 1.06218183, + "balance_loss_mlp": 1.02472281, + "epoch": 0.20748534495716217, + "flos": 23546985897600.0, + "grad_norm": 3.1265108568340465, + "language_loss": 0.82392222, + "learning_rate": 3.6785526929900436e-06, + "loss": 0.84600985, + "num_input_tokens_seen": 74598980, + "router_z_loss_clip": 1.03222656, + "router_z_loss_mlp": 0.1862793, + "step": 3451, + "time_per_iteration": 2.4645230770111084 + }, + { + "auxiliary_loss_clip": 0.01080328, + "auxiliary_loss_mlp": 0.01006982, + "balance_loss_clip": 1.04396379, + "balance_loss_mlp": 1.00496125, + "epoch": 0.20754546820983016, + "flos": 52252935598080.0, + "grad_norm": 0.7898894916928935, + "language_loss": 0.56527263, + "learning_rate": 3.6783409088711875e-06, + "loss": 0.58614576, + "num_input_tokens_seen": 74655275, + "router_z_loss_clip": 0.36279297, + "router_z_loss_mlp": 0.02020264, + "step": 3452, + "time_per_iteration": 4.375219345092773 + }, + { + "auxiliary_loss_clip": 0.01163143, + "auxiliary_loss_mlp": 0.01046961, + "balance_loss_clip": 1.05934691, + "balance_loss_mlp": 1.02781618, + "epoch": 0.20760559146249813, + "flos": 20412379768320.0, + "grad_norm": 1.9895489413193899, + "language_loss": 0.87643504, + "learning_rate": 3.6781290611094755e-06, + "loss": 0.89853609, + "num_input_tokens_seen": 74674560, + "router_z_loss_clip": 1.03955078, + "router_z_loss_mlp": 0.19152832, + "step": 3453, + "time_per_iteration": 2.5654759407043457 + }, + { + "auxiliary_loss_clip": 0.01163913, + "auxiliary_loss_mlp": 0.01059699, + "balance_loss_clip": 1.06012511, + "balance_loss_mlp": 1.03703761, + "epoch": 0.2076657147151661, + "flos": 23186012169600.0, + "grad_norm": 1.728709728507241, + "language_loss": 0.80204976, + "learning_rate": 3.6779171497129407e-06, + "loss": 0.82428592, + "num_input_tokens_seen": 74694500, + "router_z_loss_clip": 1.03759766, + "router_z_loss_mlp": 0.22668457, + "step": 3454, + "time_per_iteration": 2.578691005706787 + }, + { + "auxiliary_loss_clip": 0.01163235, + "auxiliary_loss_mlp": 0.01049282, + "balance_loss_clip": 1.06013405, + "balance_loss_mlp": 1.03060222, + "epoch": 0.20772583796783406, + "flos": 18293219124480.0, + "grad_norm": 5.485637823532227, + "language_loss": 0.75918663, + "learning_rate": 3.6777051746896202e-06, + "loss": 0.78131181, + "num_input_tokens_seen": 74710485, + "router_z_loss_clip": 1.03222656, + "router_z_loss_mlp": 0.18664551, + "step": 3455, + "time_per_iteration": 2.4118077754974365 + }, + { + "auxiliary_loss_clip": 0.01165442, + "auxiliary_loss_mlp": 0.01051596, + "balance_loss_clip": 1.06423104, + "balance_loss_mlp": 1.03357172, + "epoch": 0.20778596122050202, + "flos": 17602800831360.0, + "grad_norm": 1.9479372143723666, + "language_loss": 0.80411553, + "learning_rate": 3.6774931360475516e-06, + "loss": 0.82628596, + "num_input_tokens_seen": 74727450, + "router_z_loss_clip": 1.01074219, + "router_z_loss_mlp": 0.18029785, + "step": 3456, + "time_per_iteration": 2.4479966163635254 + }, + { + "auxiliary_loss_clip": 0.01171666, + "auxiliary_loss_mlp": 0.01047658, + "balance_loss_clip": 1.06782317, + "balance_loss_mlp": 1.02784503, + "epoch": 0.20784608447317, + "flos": 23805578885760.0, + "grad_norm": 1.9653184464042368, + "language_loss": 0.78361928, + "learning_rate": 3.6772810337947745e-06, + "loss": 0.8058126, + "num_input_tokens_seen": 74746725, + "router_z_loss_clip": 1.03857422, + "router_z_loss_mlp": 0.19812012, + "step": 3457, + "time_per_iteration": 2.4609122276306152 + }, + { + "auxiliary_loss_clip": 0.01172139, + "auxiliary_loss_mlp": 0.01048881, + "balance_loss_clip": 1.0644747, + "balance_loss_mlp": 1.02773321, + "epoch": 0.20790620772583795, + "flos": 17639286071040.0, + "grad_norm": 1.8084731822536642, + "language_loss": 0.83096802, + "learning_rate": 3.677068867939333e-06, + "loss": 0.85317826, + "num_input_tokens_seen": 74765255, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.21142578, + "step": 3458, + "time_per_iteration": 2.410353183746338 + }, + { + "auxiliary_loss_clip": 0.01170287, + "auxiliary_loss_mlp": 0.01034779, + "balance_loss_clip": 1.06806564, + "balance_loss_mlp": 1.01695728, + "epoch": 0.20796633097850595, + "flos": 27673481168640.0, + "grad_norm": 2.223811796992361, + "language_loss": 0.75966382, + "learning_rate": 3.676856638489272e-06, + "loss": 0.78171456, + "num_input_tokens_seen": 74785710, + "router_z_loss_clip": 1.02294922, + "router_z_loss_mlp": 0.17822266, + "step": 3459, + "time_per_iteration": 2.5105676651000977 + }, + { + "auxiliary_loss_clip": 0.0116316, + "auxiliary_loss_mlp": 0.01035843, + "balance_loss_clip": 1.06392622, + "balance_loss_mlp": 1.01815271, + "epoch": 0.2080264542311739, + "flos": 19245606284160.0, + "grad_norm": 3.039698376860617, + "language_loss": 0.77343804, + "learning_rate": 3.6766443454526382e-06, + "loss": 0.7954281, + "num_input_tokens_seen": 74804490, + "router_z_loss_clip": 0.99121094, + "router_z_loss_mlp": 0.17687988, + "step": 3460, + "time_per_iteration": 3.954251289367676 + }, + { + "auxiliary_loss_clip": 0.01169032, + "auxiliary_loss_mlp": 0.01046216, + "balance_loss_clip": 1.064888, + "balance_loss_mlp": 1.02750039, + "epoch": 0.20808657748384188, + "flos": 27525924097920.0, + "grad_norm": 2.4417678176291275, + "language_loss": 0.75769401, + "learning_rate": 3.6764319888374836e-06, + "loss": 0.77984649, + "num_input_tokens_seen": 74826340, + "router_z_loss_clip": 1.04101562, + "router_z_loss_mlp": 0.18713379, + "step": 3461, + "time_per_iteration": 2.5224714279174805 + }, + { + "auxiliary_loss_clip": 0.01162669, + "auxiliary_loss_mlp": 0.01041567, + "balance_loss_clip": 1.05755126, + "balance_loss_mlp": 1.02223158, + "epoch": 0.20814670073650984, + "flos": 26906931999360.0, + "grad_norm": 1.9466866592628809, + "language_loss": 0.88403928, + "learning_rate": 3.6762195686518604e-06, + "loss": 0.90608168, + "num_input_tokens_seen": 74844960, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.19311523, + "step": 3462, + "time_per_iteration": 2.559718132019043 + }, + { + "auxiliary_loss_clip": 0.01083033, + "auxiliary_loss_mlp": 0.01004575, + "balance_loss_clip": 1.04591513, + "balance_loss_mlp": 1.00275159, + "epoch": 0.2082068239891778, + "flos": 70175735717760.0, + "grad_norm": 0.7731816163691833, + "language_loss": 0.58983892, + "learning_rate": 3.6760070849038226e-06, + "loss": 0.61071503, + "num_input_tokens_seen": 74909075, + "router_z_loss_clip": 0.37158203, + "router_z_loss_mlp": 0.01821899, + "step": 3463, + "time_per_iteration": 3.194880962371826 + }, + { + "auxiliary_loss_clip": 0.01162729, + "auxiliary_loss_mlp": 0.0105155, + "balance_loss_clip": 1.0592742, + "balance_loss_mlp": 1.03164232, + "epoch": 0.20826694724184577, + "flos": 24608074590720.0, + "grad_norm": 2.9613589763500343, + "language_loss": 0.65945017, + "learning_rate": 3.675794537601429e-06, + "loss": 0.68159306, + "num_input_tokens_seen": 74928125, + "router_z_loss_clip": 1.03320312, + "router_z_loss_mlp": 0.19921875, + "step": 3464, + "time_per_iteration": 3.93798565864563 + }, + { + "auxiliary_loss_clip": 0.01174489, + "auxiliary_loss_mlp": 0.01048016, + "balance_loss_clip": 1.0668323, + "balance_loss_mlp": 1.02748823, + "epoch": 0.20832707049451377, + "flos": 12892829034240.0, + "grad_norm": 3.4867753173835014, + "language_loss": 0.8387146, + "learning_rate": 3.6755819267527373e-06, + "loss": 0.86093974, + "num_input_tokens_seen": 74945090, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.20507812, + "step": 3465, + "time_per_iteration": 3.9425299167633057 + }, + { + "auxiliary_loss_clip": 0.01169004, + "auxiliary_loss_mlp": 0.01052475, + "balance_loss_clip": 1.06502843, + "balance_loss_mlp": 1.03303218, + "epoch": 0.20838719374718173, + "flos": 22198827709440.0, + "grad_norm": 2.2943929502677687, + "language_loss": 0.82226288, + "learning_rate": 3.6753692523658113e-06, + "loss": 0.84447765, + "num_input_tokens_seen": 74963630, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.19445801, + "step": 3466, + "time_per_iteration": 2.476823568344116 + }, + { + "auxiliary_loss_clip": 0.01162593, + "auxiliary_loss_mlp": 0.01043311, + "balance_loss_clip": 1.06097364, + "balance_loss_mlp": 1.02715242, + "epoch": 0.2084473169998497, + "flos": 15158648908800.0, + "grad_norm": 4.481673593859506, + "language_loss": 0.82049167, + "learning_rate": 3.675156514448716e-06, + "loss": 0.84255064, + "num_input_tokens_seen": 74981875, + "router_z_loss_clip": 1.01757812, + "router_z_loss_mlp": 0.1618042, + "step": 3467, + "time_per_iteration": 2.4417808055877686 + }, + { + "auxiliary_loss_clip": 0.01154554, + "auxiliary_loss_mlp": 0.01043399, + "balance_loss_clip": 1.05818903, + "balance_loss_mlp": 1.02595878, + "epoch": 0.20850744025251766, + "flos": 17456788045440.0, + "grad_norm": 5.329586845998867, + "language_loss": 0.81745708, + "learning_rate": 3.674943713009518e-06, + "loss": 0.83943665, + "num_input_tokens_seen": 74999155, + "router_z_loss_clip": 0.96289062, + "router_z_loss_mlp": 0.17443848, + "step": 3468, + "time_per_iteration": 2.477384090423584 + }, + { + "auxiliary_loss_clip": 0.01159222, + "auxiliary_loss_mlp": 0.01049156, + "balance_loss_clip": 1.05608439, + "balance_loss_mlp": 1.02825856, + "epoch": 0.20856756350518563, + "flos": 25698968593920.0, + "grad_norm": 6.719393938140305, + "language_loss": 0.89881665, + "learning_rate": 3.6747308480562856e-06, + "loss": 0.92090046, + "num_input_tokens_seen": 75017850, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.20910645, + "step": 3469, + "time_per_iteration": 2.5496292114257812 + }, + { + "auxiliary_loss_clip": 0.01166015, + "auxiliary_loss_mlp": 0.01051718, + "balance_loss_clip": 1.06284046, + "balance_loss_mlp": 1.03254867, + "epoch": 0.2086276867578536, + "flos": 37889060970240.0, + "grad_norm": 1.7067413121152408, + "language_loss": 0.76765472, + "learning_rate": 3.674517919597092e-06, + "loss": 0.789832, + "num_input_tokens_seen": 75039270, + "router_z_loss_clip": 1.03173828, + "router_z_loss_mlp": 0.19177246, + "step": 3470, + "time_per_iteration": 2.6012375354766846 + }, + { + "auxiliary_loss_clip": 0.01155141, + "auxiliary_loss_mlp": 0.01046515, + "balance_loss_clip": 1.05662024, + "balance_loss_mlp": 1.02806139, + "epoch": 0.20868781001052156, + "flos": 25557049958400.0, + "grad_norm": 2.3422933410871254, + "language_loss": 0.75728965, + "learning_rate": 3.674304927640011e-06, + "loss": 0.77930617, + "num_input_tokens_seen": 75059350, + "router_z_loss_clip": 0.98632812, + "router_z_loss_mlp": 0.18444824, + "step": 3471, + "time_per_iteration": 2.5135672092437744 + }, + { + "auxiliary_loss_clip": 0.01161947, + "auxiliary_loss_mlp": 0.01055329, + "balance_loss_clip": 1.05683291, + "balance_loss_mlp": 1.03492081, + "epoch": 0.20874793326318955, + "flos": 27529192235520.0, + "grad_norm": 1.7214098087232652, + "language_loss": 0.75672555, + "learning_rate": 3.67409187219312e-06, + "loss": 0.77889824, + "num_input_tokens_seen": 75080150, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.20397949, + "step": 3472, + "time_per_iteration": 2.5524544715881348 + }, + { + "auxiliary_loss_clip": 0.01161721, + "auxiliary_loss_mlp": 0.01050933, + "balance_loss_clip": 1.0585475, + "balance_loss_mlp": 1.0326823, + "epoch": 0.20880805651585752, + "flos": 18548795370240.0, + "grad_norm": 1.9680225782658092, + "language_loss": 0.8432765, + "learning_rate": 3.6738787532644966e-06, + "loss": 0.86540306, + "num_input_tokens_seen": 75097920, + "router_z_loss_clip": 1.03320312, + "router_z_loss_mlp": 0.18261719, + "step": 3473, + "time_per_iteration": 2.520655870437622 + }, + { + "auxiliary_loss_clip": 0.01076155, + "auxiliary_loss_mlp": 0.01007538, + "balance_loss_clip": 1.03906131, + "balance_loss_mlp": 1.00560439, + "epoch": 0.20886817976852548, + "flos": 65946644225280.0, + "grad_norm": 0.8816442649228263, + "language_loss": 0.63655031, + "learning_rate": 3.6736655708622235e-06, + "loss": 0.65738726, + "num_input_tokens_seen": 75152410, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 0.01931763, + "step": 3474, + "time_per_iteration": 2.974874496459961 + }, + { + "auxiliary_loss_clip": 0.0116464, + "auxiliary_loss_mlp": 0.01044661, + "balance_loss_clip": 1.06257606, + "balance_loss_mlp": 1.02675593, + "epoch": 0.20892830302119345, + "flos": 36539178929280.0, + "grad_norm": 1.9128141464659376, + "language_loss": 0.69832206, + "learning_rate": 3.6734523249943844e-06, + "loss": 0.72041512, + "num_input_tokens_seen": 75173265, + "router_z_loss_clip": 1.02001953, + "router_z_loss_mlp": 0.17919922, + "step": 3475, + "time_per_iteration": 2.619974136352539 + }, + { + "auxiliary_loss_clip": 0.01169265, + "auxiliary_loss_mlp": 0.01048365, + "balance_loss_clip": 1.06474459, + "balance_loss_mlp": 1.02976835, + "epoch": 0.2089884262738614, + "flos": 20956749361920.0, + "grad_norm": 1.4879807491643824, + "language_loss": 0.6980046, + "learning_rate": 3.673239015669065e-06, + "loss": 0.72018093, + "num_input_tokens_seen": 75193640, + "router_z_loss_clip": 1.04541016, + "router_z_loss_mlp": 0.18615723, + "step": 3476, + "time_per_iteration": 2.5247962474823 + }, + { + "auxiliary_loss_clip": 0.01156174, + "auxiliary_loss_mlp": 0.01043986, + "balance_loss_clip": 1.05772555, + "balance_loss_mlp": 1.02627158, + "epoch": 0.20904854952652938, + "flos": 22784028088320.0, + "grad_norm": 2.0324369033658773, + "language_loss": 0.89493698, + "learning_rate": 3.6730256428943544e-06, + "loss": 0.9169386, + "num_input_tokens_seen": 75212545, + "router_z_loss_clip": 0.98486328, + "router_z_loss_mlp": 0.17724609, + "step": 3477, + "time_per_iteration": 2.4907267093658447 + }, + { + "auxiliary_loss_clip": 0.01161216, + "auxiliary_loss_mlp": 0.01048377, + "balance_loss_clip": 1.06040168, + "balance_loss_mlp": 1.02957737, + "epoch": 0.20910867277919734, + "flos": 27303277645440.0, + "grad_norm": 2.3307767923101994, + "language_loss": 0.67849422, + "learning_rate": 3.672812206678344e-06, + "loss": 0.70059013, + "num_input_tokens_seen": 75230865, + "router_z_loss_clip": 1.0078125, + "router_z_loss_mlp": 0.18798828, + "step": 3478, + "time_per_iteration": 2.6180613040924072 + }, + { + "auxiliary_loss_clip": 0.01159265, + "auxiliary_loss_mlp": 0.01042629, + "balance_loss_clip": 1.05967665, + "balance_loss_mlp": 1.02386534, + "epoch": 0.20916879603186533, + "flos": 14319237000960.0, + "grad_norm": 4.416257496524331, + "language_loss": 0.84766567, + "learning_rate": 3.672598707029127e-06, + "loss": 0.86968458, + "num_input_tokens_seen": 75248285, + "router_z_loss_clip": 0.99511719, + "router_z_loss_mlp": 0.1875, + "step": 3479, + "time_per_iteration": 2.412824869155884 + }, + { + "auxiliary_loss_clip": 0.01164818, + "auxiliary_loss_mlp": 0.01053017, + "balance_loss_clip": 1.06059384, + "balance_loss_mlp": 1.03456318, + "epoch": 0.2092289192845333, + "flos": 22273019251200.0, + "grad_norm": 2.4042366037537537, + "language_loss": 0.73805237, + "learning_rate": 3.6723851439548003e-06, + "loss": 0.76023066, + "num_input_tokens_seen": 75266310, + "router_z_loss_clip": 1.04394531, + "router_z_loss_mlp": 0.18457031, + "step": 3480, + "time_per_iteration": 2.4591445922851562 + }, + { + "auxiliary_loss_clip": 0.01153874, + "auxiliary_loss_mlp": 0.01044523, + "balance_loss_clip": 1.05539525, + "balance_loss_mlp": 1.02846527, + "epoch": 0.20928904253720126, + "flos": 14830712714880.0, + "grad_norm": 2.117158986003747, + "language_loss": 0.7562694, + "learning_rate": 3.67217151746346e-06, + "loss": 0.77825338, + "num_input_tokens_seen": 75284175, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.16064453, + "step": 3481, + "time_per_iteration": 2.420421838760376 + }, + { + "auxiliary_loss_clip": 0.01157905, + "auxiliary_loss_mlp": 0.01045201, + "balance_loss_clip": 1.05688059, + "balance_loss_mlp": 1.02723598, + "epoch": 0.20934916578986923, + "flos": 23259162216960.0, + "grad_norm": 1.7568222121643136, + "language_loss": 0.85341567, + "learning_rate": 3.671957827563209e-06, + "loss": 0.87544674, + "num_input_tokens_seen": 75303465, + "router_z_loss_clip": 1.01171875, + "router_z_loss_mlp": 0.1796875, + "step": 3482, + "time_per_iteration": 2.449253797531128 + }, + { + "auxiliary_loss_clip": 0.01155747, + "auxiliary_loss_mlp": 0.01046336, + "balance_loss_clip": 1.05725539, + "balance_loss_mlp": 1.02801335, + "epoch": 0.2094092890425372, + "flos": 32014398677760.0, + "grad_norm": 2.053368943638206, + "language_loss": 0.70697498, + "learning_rate": 3.6717440742621494e-06, + "loss": 0.7289958, + "num_input_tokens_seen": 75325290, + "router_z_loss_clip": 0.98242188, + "router_z_loss_mlp": 0.18322754, + "step": 3483, + "time_per_iteration": 2.5675270557403564 + }, + { + "auxiliary_loss_clip": 0.01165556, + "auxiliary_loss_mlp": 0.01051373, + "balance_loss_clip": 1.06115901, + "balance_loss_mlp": 1.03313375, + "epoch": 0.20946941229520516, + "flos": 20010647082240.0, + "grad_norm": 1.8651044554863019, + "language_loss": 0.75203729, + "learning_rate": 3.6715302575683865e-06, + "loss": 0.77420658, + "num_input_tokens_seen": 75343895, + "router_z_loss_clip": 1.04492188, + "router_z_loss_mlp": 0.18237305, + "step": 3484, + "time_per_iteration": 2.481050968170166 + }, + { + "auxiliary_loss_clip": 0.0115968, + "auxiliary_loss_mlp": 0.01047029, + "balance_loss_clip": 1.05979896, + "balance_loss_mlp": 1.02741933, + "epoch": 0.20952953554787315, + "flos": 30740072895360.0, + "grad_norm": 1.8083797753110844, + "language_loss": 0.70400727, + "learning_rate": 3.6713163774900292e-06, + "loss": 0.7260744, + "num_input_tokens_seen": 75367100, + "router_z_loss_clip": 0.99902344, + "router_z_loss_mlp": 0.19604492, + "step": 3485, + "time_per_iteration": 2.5492730140686035 + }, + { + "auxiliary_loss_clip": 0.01161295, + "auxiliary_loss_mlp": 0.01050308, + "balance_loss_clip": 1.05928361, + "balance_loss_mlp": 1.02974439, + "epoch": 0.20958965880054112, + "flos": 27049209770880.0, + "grad_norm": 1.977888878385832, + "language_loss": 0.82962579, + "learning_rate": 3.6711024340351875e-06, + "loss": 0.85174179, + "num_input_tokens_seen": 75389925, + "router_z_loss_clip": 1.02050781, + "router_z_loss_mlp": 0.20556641, + "step": 3486, + "time_per_iteration": 2.4995298385620117 + }, + { + "auxiliary_loss_clip": 0.01159812, + "auxiliary_loss_mlp": 0.01048974, + "balance_loss_clip": 1.05933261, + "balance_loss_mlp": 1.03164113, + "epoch": 0.20964978205320908, + "flos": 34204123589760.0, + "grad_norm": 1.8911358494284969, + "language_loss": 0.87262994, + "learning_rate": 3.6708884272119737e-06, + "loss": 0.89471781, + "num_input_tokens_seen": 75408575, + "router_z_loss_clip": 1.00439453, + "router_z_loss_mlp": 0.17333984, + "step": 3487, + "time_per_iteration": 2.569094657897949 + }, + { + "auxiliary_loss_clip": 0.01159392, + "auxiliary_loss_mlp": 0.0104868, + "balance_loss_clip": 1.0587697, + "balance_loss_mlp": 1.03026259, + "epoch": 0.20970990530587705, + "flos": 23477391296640.0, + "grad_norm": 1.9750608963061473, + "language_loss": 0.72164357, + "learning_rate": 3.670674357028504e-06, + "loss": 0.74372423, + "num_input_tokens_seen": 75427155, + "router_z_loss_clip": 1.0078125, + "router_z_loss_mlp": 0.18432617, + "step": 3488, + "time_per_iteration": 2.4955391883850098 + }, + { + "auxiliary_loss_clip": 0.011563, + "auxiliary_loss_mlp": 0.01048373, + "balance_loss_clip": 1.05525732, + "balance_loss_mlp": 1.03070629, + "epoch": 0.209770028558545, + "flos": 18551452976640.0, + "grad_norm": 2.8046481173399935, + "language_loss": 0.80811942, + "learning_rate": 3.6704602234928945e-06, + "loss": 0.8301661, + "num_input_tokens_seen": 75444450, + "router_z_loss_clip": 1.01171875, + "router_z_loss_mlp": 0.17663574, + "step": 3489, + "time_per_iteration": 2.453219413757324 + }, + { + "auxiliary_loss_clip": 0.01165777, + "auxiliary_loss_mlp": 0.01043452, + "balance_loss_clip": 1.06199026, + "balance_loss_mlp": 1.02609527, + "epoch": 0.20983015181121298, + "flos": 21617003208960.0, + "grad_norm": 1.7730412439432435, + "language_loss": 0.72924572, + "learning_rate": 3.670246026613266e-06, + "loss": 0.75133801, + "num_input_tokens_seen": 75462625, + "router_z_loss_clip": 1.03662109, + "router_z_loss_mlp": 0.17346191, + "step": 3490, + "time_per_iteration": 2.487276792526245 + }, + { + "auxiliary_loss_clip": 0.01156278, + "auxiliary_loss_mlp": 0.01046883, + "balance_loss_clip": 1.0606842, + "balance_loss_mlp": 1.03027701, + "epoch": 0.20989027506388094, + "flos": 16614718531200.0, + "grad_norm": 2.2311488893180855, + "language_loss": 0.70807004, + "learning_rate": 3.6700317663977415e-06, + "loss": 0.73010164, + "num_input_tokens_seen": 75480640, + "router_z_loss_clip": 0.95556641, + "router_z_loss_mlp": 0.16601562, + "step": 3491, + "time_per_iteration": 2.586390733718872 + }, + { + "auxiliary_loss_clip": 0.01161005, + "auxiliary_loss_mlp": 0.01041659, + "balance_loss_clip": 1.05899787, + "balance_loss_mlp": 1.02271676, + "epoch": 0.20995039831654894, + "flos": 23216823060480.0, + "grad_norm": 2.8145911459279365, + "language_loss": 0.79557163, + "learning_rate": 3.669817442854444e-06, + "loss": 0.81759828, + "num_input_tokens_seen": 75494900, + "router_z_loss_clip": 1.02050781, + "router_z_loss_mlp": 0.18933105, + "step": 3492, + "time_per_iteration": 2.4802043437957764 + }, + { + "auxiliary_loss_clip": 0.01163383, + "auxiliary_loss_mlp": 0.0104582, + "balance_loss_clip": 1.06063378, + "balance_loss_mlp": 1.02748513, + "epoch": 0.2100105215692169, + "flos": 18147493647360.0, + "grad_norm": 1.8071914385156411, + "language_loss": 0.87264478, + "learning_rate": 3.669603055991502e-06, + "loss": 0.89473689, + "num_input_tokens_seen": 75513370, + "router_z_loss_clip": 1.02734375, + "router_z_loss_mlp": 0.18334961, + "step": 3493, + "time_per_iteration": 2.4502573013305664 + }, + { + "auxiliary_loss_clip": 0.01167016, + "auxiliary_loss_mlp": 0.01047048, + "balance_loss_clip": 1.06676054, + "balance_loss_mlp": 1.03094244, + "epoch": 0.21007064482188487, + "flos": 15961611490560.0, + "grad_norm": 2.0963547254680077, + "language_loss": 0.6868239, + "learning_rate": 3.6693886058170455e-06, + "loss": 0.70896453, + "num_input_tokens_seen": 75532480, + "router_z_loss_clip": 1.00341797, + "router_z_loss_mlp": 0.16113281, + "step": 3494, + "time_per_iteration": 2.4442100524902344 + }, + { + "auxiliary_loss_clip": 0.01164414, + "auxiliary_loss_mlp": 0.01040008, + "balance_loss_clip": 1.06235647, + "balance_loss_mlp": 1.02306867, + "epoch": 0.21013076807455283, + "flos": 32234315696640.0, + "grad_norm": 1.93627849949988, + "language_loss": 0.78773385, + "learning_rate": 3.6691740923392053e-06, + "loss": 0.80977809, + "num_input_tokens_seen": 75552745, + "router_z_loss_clip": 1.02050781, + "router_z_loss_mlp": 0.16931152, + "step": 3495, + "time_per_iteration": 2.5225489139556885 + }, + { + "auxiliary_loss_clip": 0.01167537, + "auxiliary_loss_mlp": 0.01048549, + "balance_loss_clip": 1.06362319, + "balance_loss_mlp": 1.03052425, + "epoch": 0.2101908913272208, + "flos": 23696625957120.0, + "grad_norm": 1.9435988487522968, + "language_loss": 0.77517688, + "learning_rate": 3.668959515566116e-06, + "loss": 0.79733771, + "num_input_tokens_seen": 75574355, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.18017578, + "step": 3496, + "time_per_iteration": 3.9123218059539795 + }, + { + "auxiliary_loss_clip": 0.01162117, + "auxiliary_loss_mlp": 0.01047265, + "balance_loss_clip": 1.05899477, + "balance_loss_mlp": 1.02821517, + "epoch": 0.21025101457988876, + "flos": 20375786787840.0, + "grad_norm": 1.8199317820843446, + "language_loss": 0.82177281, + "learning_rate": 3.668744875505915e-06, + "loss": 0.84386659, + "num_input_tokens_seen": 75592215, + "router_z_loss_clip": 1.03076172, + "router_z_loss_mlp": 0.19055176, + "step": 3497, + "time_per_iteration": 2.4366724491119385 + }, + { + "auxiliary_loss_clip": 0.01160806, + "auxiliary_loss_mlp": 0.01052936, + "balance_loss_clip": 1.0594275, + "balance_loss_mlp": 1.03282523, + "epoch": 0.21031113783255675, + "flos": 25775638174080.0, + "grad_norm": 1.7616579477341485, + "language_loss": 0.6757257, + "learning_rate": 3.668530172166741e-06, + "loss": 0.6978631, + "num_input_tokens_seen": 75610740, + "router_z_loss_clip": 1.01416016, + "router_z_loss_mlp": 0.2010498, + "step": 3498, + "time_per_iteration": 2.4884800910949707 + }, + { + "auxiliary_loss_clip": 0.01161743, + "auxiliary_loss_mlp": 0.0105628, + "balance_loss_clip": 1.0572629, + "balance_loss_mlp": 1.03678918, + "epoch": 0.21037126108522472, + "flos": 22018197191040.0, + "grad_norm": 1.7531937054243498, + "language_loss": 0.80533421, + "learning_rate": 3.6683154055567352e-06, + "loss": 0.82751447, + "num_input_tokens_seen": 75631005, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.19482422, + "step": 3499, + "time_per_iteration": 2.441237449645996 + }, + { + "auxiliary_loss_clip": 0.01158983, + "auxiliary_loss_mlp": 0.01044319, + "balance_loss_clip": 1.05932021, + "balance_loss_mlp": 1.0272963, + "epoch": 0.21043138433789269, + "flos": 25334403505920.0, + "grad_norm": 1.7376918280433016, + "language_loss": 0.78270805, + "learning_rate": 3.668100575684043e-06, + "loss": 0.80474108, + "num_input_tokens_seen": 75650655, + "router_z_loss_clip": 0.99609375, + "router_z_loss_mlp": 0.17016602, + "step": 3500, + "time_per_iteration": 2.4981765747070312 + }, + { + "auxiliary_loss_clip": 0.01166318, + "auxiliary_loss_mlp": 0.0103839, + "balance_loss_clip": 1.06576872, + "balance_loss_mlp": 1.02073503, + "epoch": 0.21049150759056065, + "flos": 25556654908800.0, + "grad_norm": 2.373723702522283, + "language_loss": 0.73819929, + "learning_rate": 3.6678856825568094e-06, + "loss": 0.7602464, + "num_input_tokens_seen": 75669895, + "router_z_loss_clip": 1.00537109, + "router_z_loss_mlp": 0.17651367, + "step": 3501, + "time_per_iteration": 2.514224052429199 + }, + { + "auxiliary_loss_clip": 0.01163073, + "auxiliary_loss_mlp": 0.01039832, + "balance_loss_clip": 1.06209576, + "balance_loss_mlp": 1.02207017, + "epoch": 0.21055163084322862, + "flos": 24495602129280.0, + "grad_norm": 1.5816338172836986, + "language_loss": 0.75426537, + "learning_rate": 3.667670726183183e-06, + "loss": 0.77629441, + "num_input_tokens_seen": 75689535, + "router_z_loss_clip": 1.00878906, + "router_z_loss_mlp": 0.17773438, + "step": 3502, + "time_per_iteration": 2.595104455947876 + }, + { + "auxiliary_loss_clip": 0.01158426, + "auxiliary_loss_mlp": 0.01043262, + "balance_loss_clip": 1.05735612, + "balance_loss_mlp": 1.02532125, + "epoch": 0.21061175409589658, + "flos": 25739045193600.0, + "grad_norm": 1.874471088208166, + "language_loss": 0.77292514, + "learning_rate": 3.667455706571316e-06, + "loss": 0.79494202, + "num_input_tokens_seen": 75709265, + "router_z_loss_clip": 1.00927734, + "router_z_loss_mlp": 0.17944336, + "step": 3503, + "time_per_iteration": 3.9688565731048584 + }, + { + "auxiliary_loss_clip": 0.01178024, + "auxiliary_loss_mlp": 0.01047372, + "balance_loss_clip": 1.06741285, + "balance_loss_mlp": 1.0265336, + "epoch": 0.21067187734856455, + "flos": 18989168112000.0, + "grad_norm": 3.162060166259222, + "language_loss": 0.78018469, + "learning_rate": 3.6672406237293617e-06, + "loss": 0.80243862, + "num_input_tokens_seen": 75727050, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.20849609, + "step": 3504, + "time_per_iteration": 2.4744153022766113 + }, + { + "auxiliary_loss_clip": 0.0117213, + "auxiliary_loss_mlp": 0.01052182, + "balance_loss_clip": 1.06627321, + "balance_loss_mlp": 1.03359699, + "epoch": 0.21073200060123254, + "flos": 24681368292480.0, + "grad_norm": 1.6172906259682083, + "language_loss": 0.76846009, + "learning_rate": 3.6670254776654754e-06, + "loss": 0.79070318, + "num_input_tokens_seen": 75747175, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.18566895, + "step": 3505, + "time_per_iteration": 2.570902109146118 + }, + { + "auxiliary_loss_clip": 0.01158333, + "auxiliary_loss_mlp": 0.01049681, + "balance_loss_clip": 1.06198335, + "balance_loss_mlp": 1.03189468, + "epoch": 0.2107921238539005, + "flos": 28549342402560.0, + "grad_norm": 2.189546554304731, + "language_loss": 0.64189494, + "learning_rate": 3.6668102683878163e-06, + "loss": 0.66397512, + "num_input_tokens_seen": 75767690, + "router_z_loss_clip": 0.96435547, + "router_z_loss_mlp": 0.17773438, + "step": 3506, + "time_per_iteration": 2.5404202938079834 + }, + { + "auxiliary_loss_clip": 0.01164734, + "auxiliary_loss_mlp": 0.01050813, + "balance_loss_clip": 1.06439281, + "balance_loss_mlp": 1.0326097, + "epoch": 0.21085224710656847, + "flos": 25885848078720.0, + "grad_norm": 1.6302784421034107, + "language_loss": 0.81796587, + "learning_rate": 3.6665949959045443e-06, + "loss": 0.84012139, + "num_input_tokens_seen": 75787255, + "router_z_loss_clip": 1.00390625, + "router_z_loss_mlp": 0.18212891, + "step": 3507, + "time_per_iteration": 2.570936918258667 + }, + { + "auxiliary_loss_clip": 0.01159007, + "auxiliary_loss_mlp": 0.01046698, + "balance_loss_clip": 1.0597688, + "balance_loss_mlp": 1.02909088, + "epoch": 0.21091237035923643, + "flos": 14976294537600.0, + "grad_norm": 1.7270333283437582, + "language_loss": 0.7540068, + "learning_rate": 3.666379660223824e-06, + "loss": 0.77606386, + "num_input_tokens_seen": 75805890, + "router_z_loss_clip": 0.99169922, + "router_z_loss_mlp": 0.17626953, + "step": 3508, + "time_per_iteration": 5.273413419723511 + }, + { + "auxiliary_loss_clip": 0.01162789, + "auxiliary_loss_mlp": 0.01041386, + "balance_loss_clip": 1.06086254, + "balance_loss_mlp": 1.02357614, + "epoch": 0.2109724936119044, + "flos": 16362518163840.0, + "grad_norm": 4.2340283196670665, + "language_loss": 0.85118818, + "learning_rate": 3.6661642613538192e-06, + "loss": 0.87322986, + "num_input_tokens_seen": 75821620, + "router_z_loss_clip": 1.01806641, + "router_z_loss_mlp": 0.17810059, + "step": 3509, + "time_per_iteration": 2.429049491882324 + }, + { + "auxiliary_loss_clip": 0.0115728, + "auxiliary_loss_mlp": 0.01040497, + "balance_loss_clip": 1.0549016, + "balance_loss_mlp": 1.02185249, + "epoch": 0.21103261686457236, + "flos": 31502492000640.0, + "grad_norm": 1.7707354316985537, + "language_loss": 0.68216693, + "learning_rate": 3.6659487993026987e-06, + "loss": 0.70414472, + "num_input_tokens_seen": 75842490, + "router_z_loss_clip": 1.02539062, + "router_z_loss_mlp": 0.18640137, + "step": 3510, + "time_per_iteration": 2.536109447479248 + }, + { + "auxiliary_loss_clip": 0.01157646, + "auxiliary_loss_mlp": 0.01042456, + "balance_loss_clip": 1.05601943, + "balance_loss_mlp": 1.02466989, + "epoch": 0.21109274011724033, + "flos": 27344072517120.0, + "grad_norm": 1.550922009754195, + "language_loss": 0.72127342, + "learning_rate": 3.6657332740786327e-06, + "loss": 0.74327445, + "num_input_tokens_seen": 75865985, + "router_z_loss_clip": 1.01513672, + "router_z_loss_mlp": 0.17785645, + "step": 3511, + "time_per_iteration": 2.503917694091797 + }, + { + "auxiliary_loss_clip": 0.01158283, + "auxiliary_loss_mlp": 0.01041127, + "balance_loss_clip": 1.05485821, + "balance_loss_mlp": 1.02087379, + "epoch": 0.21115286336990832, + "flos": 17820383466240.0, + "grad_norm": 3.033306033087995, + "language_loss": 0.69429821, + "learning_rate": 3.665517685689794e-06, + "loss": 0.71629232, + "num_input_tokens_seen": 75882745, + "router_z_loss_clip": 1.03466797, + "router_z_loss_mlp": 0.20263672, + "step": 3512, + "time_per_iteration": 2.456707239151001 + }, + { + "auxiliary_loss_clip": 0.01152855, + "auxiliary_loss_mlp": 0.01049228, + "balance_loss_clip": 1.05209148, + "balance_loss_mlp": 1.03019035, + "epoch": 0.2112129866225763, + "flos": 27197987904000.0, + "grad_norm": 2.7683188599722315, + "language_loss": 0.73000813, + "learning_rate": 3.6653020341443584e-06, + "loss": 0.75202894, + "num_input_tokens_seen": 75904305, + "router_z_loss_clip": 1.00634766, + "router_z_loss_mlp": 0.19018555, + "step": 3513, + "time_per_iteration": 2.5842018127441406 + }, + { + "auxiliary_loss_clip": 0.01155262, + "auxiliary_loss_mlp": 0.01038066, + "balance_loss_clip": 1.05670524, + "balance_loss_mlp": 1.02085209, + "epoch": 0.21127310987524425, + "flos": 23731279603200.0, + "grad_norm": 1.794879149983025, + "language_loss": 0.74298346, + "learning_rate": 3.665086319450502e-06, + "loss": 0.76491678, + "num_input_tokens_seen": 75923710, + "router_z_loss_clip": 0.98583984, + "router_z_loss_mlp": 0.17224121, + "step": 3514, + "time_per_iteration": 2.492415189743042 + }, + { + "auxiliary_loss_clip": 0.0116373, + "auxiliary_loss_mlp": 0.01044345, + "balance_loss_clip": 1.05881834, + "balance_loss_mlp": 1.02664256, + "epoch": 0.21133323312791222, + "flos": 18332505624960.0, + "grad_norm": 1.926211584591557, + "language_loss": 0.77478182, + "learning_rate": 3.6648705416164062e-06, + "loss": 0.7968626, + "num_input_tokens_seen": 75942625, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.17712402, + "step": 3515, + "time_per_iteration": 2.5200159549713135 + }, + { + "auxiliary_loss_clip": 0.01173298, + "auxiliary_loss_mlp": 0.01042286, + "balance_loss_clip": 1.0697068, + "balance_loss_mlp": 1.02386796, + "epoch": 0.21139335638058018, + "flos": 17931203902080.0, + "grad_norm": 2.2655710045449213, + "language_loss": 0.682652, + "learning_rate": 3.6646547006502518e-06, + "loss": 0.70480776, + "num_input_tokens_seen": 75959930, + "router_z_loss_clip": 1.03515625, + "router_z_loss_mlp": 0.18408203, + "step": 3516, + "time_per_iteration": 2.4748117923736572 + }, + { + "auxiliary_loss_clip": 0.01157585, + "auxiliary_loss_mlp": 0.01055731, + "balance_loss_clip": 1.05549943, + "balance_loss_mlp": 1.03643107, + "epoch": 0.21145347963324815, + "flos": 24572092141440.0, + "grad_norm": 3.848441898538415, + "language_loss": 0.8490169, + "learning_rate": 3.664438796560225e-06, + "loss": 0.87115002, + "num_input_tokens_seen": 75980335, + "router_z_loss_clip": 1.02099609, + "router_z_loss_mlp": 0.19299316, + "step": 3517, + "time_per_iteration": 2.469799757003784 + }, + { + "auxiliary_loss_clip": 0.01147667, + "auxiliary_loss_mlp": 0.01036942, + "balance_loss_clip": 1.04960656, + "balance_loss_mlp": 1.01950192, + "epoch": 0.21151360288591614, + "flos": 35845959375360.0, + "grad_norm": 3.839420911139628, + "language_loss": 0.62617713, + "learning_rate": 3.664222829354512e-06, + "loss": 0.64802319, + "num_input_tokens_seen": 76002095, + "router_z_loss_clip": 0.98144531, + "router_z_loss_mlp": 0.17443848, + "step": 3518, + "time_per_iteration": 2.610708236694336 + }, + { + "auxiliary_loss_clip": 0.01153058, + "auxiliary_loss_mlp": 0.01055078, + "balance_loss_clip": 1.0540005, + "balance_loss_mlp": 1.03646922, + "epoch": 0.2115737261385841, + "flos": 24641579001600.0, + "grad_norm": 2.0414545926640293, + "language_loss": 0.89405614, + "learning_rate": 3.664006799041303e-06, + "loss": 0.91613752, + "num_input_tokens_seen": 76020425, + "router_z_loss_clip": 0.99072266, + "router_z_loss_mlp": 0.18603516, + "step": 3519, + "time_per_iteration": 2.473093032836914 + }, + { + "auxiliary_loss_clip": 0.01160926, + "auxiliary_loss_mlp": 0.0105565, + "balance_loss_clip": 1.05843413, + "balance_loss_mlp": 1.03744674, + "epoch": 0.21163384939125207, + "flos": 25226887121280.0, + "grad_norm": 1.596889430931742, + "language_loss": 0.80904818, + "learning_rate": 3.6637907056287886e-06, + "loss": 0.83121395, + "num_input_tokens_seen": 76041210, + "router_z_loss_clip": 1.02392578, + "router_z_loss_mlp": 0.18212891, + "step": 3520, + "time_per_iteration": 2.4900014400482178 + }, + { + "auxiliary_loss_clip": 0.01146987, + "auxiliary_loss_mlp": 0.01047726, + "balance_loss_clip": 1.05079937, + "balance_loss_mlp": 1.03086996, + "epoch": 0.21169397264392004, + "flos": 26067520091520.0, + "grad_norm": 1.5502484201706235, + "language_loss": 0.76259643, + "learning_rate": 3.6635745491251642e-06, + "loss": 0.78454351, + "num_input_tokens_seen": 76062685, + "router_z_loss_clip": 0.96044922, + "router_z_loss_mlp": 0.1685791, + "step": 3521, + "time_per_iteration": 2.4789907932281494 + }, + { + "auxiliary_loss_clip": 0.0115591, + "auxiliary_loss_mlp": 0.01041271, + "balance_loss_clip": 1.05693066, + "balance_loss_mlp": 1.02482021, + "epoch": 0.211754095896588, + "flos": 23108265181440.0, + "grad_norm": 2.3829556109628816, + "language_loss": 0.75872469, + "learning_rate": 3.663358329538626e-06, + "loss": 0.78069651, + "num_input_tokens_seen": 76082300, + "router_z_loss_clip": 0.99023438, + "router_z_loss_mlp": 0.16455078, + "step": 3522, + "time_per_iteration": 2.47289776802063 + }, + { + "auxiliary_loss_clip": 0.01148583, + "auxiliary_loss_mlp": 0.01048669, + "balance_loss_clip": 1.05150223, + "balance_loss_mlp": 1.03081167, + "epoch": 0.21181421914925597, + "flos": 27922341571200.0, + "grad_norm": 2.014324226476917, + "language_loss": 0.70359021, + "learning_rate": 3.663142046877374e-06, + "loss": 0.72556269, + "num_input_tokens_seen": 76101135, + "router_z_loss_clip": 0.97119141, + "router_z_loss_mlp": 0.17858887, + "step": 3523, + "time_per_iteration": 2.58575177192688 + }, + { + "auxiliary_loss_clip": 0.01156381, + "auxiliary_loss_mlp": 0.0105236, + "balance_loss_clip": 1.0579077, + "balance_loss_mlp": 1.03587341, + "epoch": 0.21187434240192393, + "flos": 17128636369920.0, + "grad_norm": 2.6748604496264097, + "language_loss": 0.76977754, + "learning_rate": 3.6629257011496085e-06, + "loss": 0.79186493, + "num_input_tokens_seen": 76119320, + "router_z_loss_clip": 0.98535156, + "router_z_loss_mlp": 0.16473389, + "step": 3524, + "time_per_iteration": 2.477851152420044 + }, + { + "auxiliary_loss_clip": 0.01158311, + "auxiliary_loss_mlp": 0.01042908, + "balance_loss_clip": 1.05418181, + "balance_loss_mlp": 1.0253129, + "epoch": 0.21193446565459192, + "flos": 22347318533760.0, + "grad_norm": 2.0590254585810275, + "language_loss": 0.81426418, + "learning_rate": 3.6627092923635338e-06, + "loss": 0.83627641, + "num_input_tokens_seen": 76137445, + "router_z_loss_clip": 1.04101562, + "router_z_loss_mlp": 0.17602539, + "step": 3525, + "time_per_iteration": 2.5074007511138916 + }, + { + "auxiliary_loss_clip": 0.01147095, + "auxiliary_loss_mlp": 0.01044166, + "balance_loss_clip": 1.04938126, + "balance_loss_mlp": 1.02647567, + "epoch": 0.2119945889072599, + "flos": 27199316707200.0, + "grad_norm": 2.2352353611928066, + "language_loss": 0.75050044, + "learning_rate": 3.662492820527356e-06, + "loss": 0.77241307, + "num_input_tokens_seen": 76159500, + "router_z_loss_clip": 0.97753906, + "router_z_loss_mlp": 0.17687988, + "step": 3526, + "time_per_iteration": 2.5231826305389404 + }, + { + "auxiliary_loss_clip": 0.01156574, + "auxiliary_loss_mlp": 0.01040585, + "balance_loss_clip": 1.05639768, + "balance_loss_mlp": 1.02272785, + "epoch": 0.21205471215992786, + "flos": 20991869884800.0, + "grad_norm": 5.045079896418253, + "language_loss": 0.76722431, + "learning_rate": 3.662276285649284e-06, + "loss": 0.78919595, + "num_input_tokens_seen": 76177990, + "router_z_loss_clip": 1.00097656, + "router_z_loss_mlp": 0.17858887, + "step": 3527, + "time_per_iteration": 2.485335111618042 + }, + { + "auxiliary_loss_clip": 0.01148803, + "auxiliary_loss_mlp": 0.01047844, + "balance_loss_clip": 1.05151224, + "balance_loss_mlp": 1.02985561, + "epoch": 0.21211483541259582, + "flos": 20777663128320.0, + "grad_norm": 2.265213548964816, + "language_loss": 0.77777046, + "learning_rate": 3.662059687737528e-06, + "loss": 0.79973692, + "num_input_tokens_seen": 76197125, + "router_z_loss_clip": 0.97363281, + "router_z_loss_mlp": 0.17980957, + "step": 3528, + "time_per_iteration": 2.6296257972717285 + }, + { + "auxiliary_loss_clip": 0.01157679, + "auxiliary_loss_mlp": 0.01048046, + "balance_loss_clip": 1.05778599, + "balance_loss_mlp": 1.03107047, + "epoch": 0.21217495866526379, + "flos": 18989994124800.0, + "grad_norm": 2.4868796961577524, + "language_loss": 0.81667995, + "learning_rate": 3.6618430268003024e-06, + "loss": 0.83873719, + "num_input_tokens_seen": 76216215, + "router_z_loss_clip": 0.99853516, + "router_z_loss_mlp": 0.16967773, + "step": 3529, + "time_per_iteration": 2.476470947265625 + }, + { + "auxiliary_loss_clip": 0.01163368, + "auxiliary_loss_mlp": 0.01052259, + "balance_loss_clip": 1.06134617, + "balance_loss_mlp": 1.03401995, + "epoch": 0.21223508191793175, + "flos": 20667309569280.0, + "grad_norm": 3.7535615683411288, + "language_loss": 0.76890498, + "learning_rate": 3.6616263028458235e-06, + "loss": 0.79106128, + "num_input_tokens_seen": 76237010, + "router_z_loss_clip": 1.02099609, + "router_z_loss_mlp": 0.18237305, + "step": 3530, + "time_per_iteration": 2.527785539627075 + }, + { + "auxiliary_loss_clip": 0.01158039, + "auxiliary_loss_mlp": 0.01040123, + "balance_loss_clip": 1.06108904, + "balance_loss_mlp": 1.02325451, + "epoch": 0.21229520517059972, + "flos": 21616464504960.0, + "grad_norm": 2.246369314393998, + "language_loss": 0.83400762, + "learning_rate": 3.661409515882308e-06, + "loss": 0.85598922, + "num_input_tokens_seen": 76255965, + "router_z_loss_clip": 0.96923828, + "router_z_loss_mlp": 0.1685791, + "step": 3531, + "time_per_iteration": 2.5018718242645264 + }, + { + "auxiliary_loss_clip": 0.0115757, + "auxiliary_loss_mlp": 0.01042249, + "balance_loss_clip": 1.05767488, + "balance_loss_mlp": 1.02302074, + "epoch": 0.2123553284232677, + "flos": 13991049411840.0, + "grad_norm": 2.176291398356205, + "language_loss": 0.73528028, + "learning_rate": 3.661192665917977e-06, + "loss": 0.75727844, + "num_input_tokens_seen": 76272150, + "router_z_loss_clip": 0.99951172, + "router_z_loss_mlp": 0.19238281, + "step": 3532, + "time_per_iteration": 2.48685359954834 + }, + { + "auxiliary_loss_clip": 0.0114786, + "auxiliary_loss_mlp": 0.01040591, + "balance_loss_clip": 1.05044401, + "balance_loss_mlp": 1.02231669, + "epoch": 0.21241545167593567, + "flos": 18296774570880.0, + "grad_norm": 1.7897944345641237, + "language_loss": 0.73662722, + "learning_rate": 3.660975752961054e-06, + "loss": 0.75851166, + "num_input_tokens_seen": 76291425, + "router_z_loss_clip": 0.97509766, + "router_z_loss_mlp": 0.1829834, + "step": 3533, + "time_per_iteration": 2.4806506633758545 + }, + { + "auxiliary_loss_clip": 0.01147908, + "auxiliary_loss_mlp": 0.01040705, + "balance_loss_clip": 1.04963303, + "balance_loss_mlp": 1.02310967, + "epoch": 0.21247557492860364, + "flos": 34713121265280.0, + "grad_norm": 2.281675904161424, + "language_loss": 0.70627874, + "learning_rate": 3.6607587770197634e-06, + "loss": 0.72816485, + "num_input_tokens_seen": 76313975, + "router_z_loss_clip": 0.98242188, + "router_z_loss_mlp": 0.17602539, + "step": 3534, + "time_per_iteration": 2.626075029373169 + }, + { + "auxiliary_loss_clip": 0.01150692, + "auxiliary_loss_mlp": 0.01040456, + "balance_loss_clip": 1.0522449, + "balance_loss_mlp": 1.02256298, + "epoch": 0.2125356981812716, + "flos": 22053820504320.0, + "grad_norm": 2.3439909461228567, + "language_loss": 0.72086275, + "learning_rate": 3.6605417381023346e-06, + "loss": 0.74277419, + "num_input_tokens_seen": 76330955, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.17895508, + "step": 3535, + "time_per_iteration": 2.4861679077148438 + }, + { + "auxiliary_loss_clip": 0.01147888, + "auxiliary_loss_mlp": 0.01054542, + "balance_loss_clip": 1.05159748, + "balance_loss_mlp": 1.03592193, + "epoch": 0.21259582143393957, + "flos": 28548336821760.0, + "grad_norm": 2.3548951635675475, + "language_loss": 0.70417607, + "learning_rate": 3.660324636216996e-06, + "loss": 0.72620034, + "num_input_tokens_seen": 76352680, + "router_z_loss_clip": 0.96435547, + "router_z_loss_mlp": 0.18640137, + "step": 3536, + "time_per_iteration": 2.495279550552368 + }, + { + "auxiliary_loss_clip": 0.01159454, + "auxiliary_loss_mlp": 0.01051585, + "balance_loss_clip": 1.05661547, + "balance_loss_mlp": 1.03285718, + "epoch": 0.21265594468660753, + "flos": 20120892900480.0, + "grad_norm": 4.125745175026513, + "language_loss": 0.8791157, + "learning_rate": 3.660107471371981e-06, + "loss": 0.9012261, + "num_input_tokens_seen": 76370750, + "router_z_loss_clip": 1.02832031, + "router_z_loss_mlp": 0.18725586, + "step": 3537, + "time_per_iteration": 2.4732513427734375 + }, + { + "auxiliary_loss_clip": 0.01151013, + "auxiliary_loss_mlp": 0.01041783, + "balance_loss_clip": 1.05428946, + "balance_loss_mlp": 1.02472448, + "epoch": 0.21271606793927553, + "flos": 23076161400960.0, + "grad_norm": 2.331640195439954, + "language_loss": 0.80227041, + "learning_rate": 3.659890243575524e-06, + "loss": 0.82419842, + "num_input_tokens_seen": 76390610, + "router_z_loss_clip": 0.96630859, + "router_z_loss_mlp": 0.17053223, + "step": 3538, + "time_per_iteration": 2.6331052780151367 + }, + { + "auxiliary_loss_clip": 0.01151837, + "auxiliary_loss_mlp": 0.01037048, + "balance_loss_clip": 1.05438733, + "balance_loss_mlp": 1.02011991, + "epoch": 0.2127761911919435, + "flos": 26388201738240.0, + "grad_norm": 1.673188458061968, + "language_loss": 0.87460911, + "learning_rate": 3.659672952835863e-06, + "loss": 0.89649796, + "num_input_tokens_seen": 76408860, + "router_z_loss_clip": 0.97509766, + "router_z_loss_mlp": 0.16931152, + "step": 3539, + "time_per_iteration": 3.967968225479126 + }, + { + "auxiliary_loss_clip": 0.01157968, + "auxiliary_loss_mlp": 0.01047455, + "balance_loss_clip": 1.05948424, + "balance_loss_mlp": 1.03006268, + "epoch": 0.21283631444461146, + "flos": 20228265630720.0, + "grad_norm": 2.246729240852891, + "language_loss": 0.58093566, + "learning_rate": 3.659455599161237e-06, + "loss": 0.60298991, + "num_input_tokens_seen": 76424980, + "router_z_loss_clip": 0.98583984, + "router_z_loss_mlp": 0.1739502, + "step": 3540, + "time_per_iteration": 2.5156807899475098 + }, + { + "auxiliary_loss_clip": 0.01158432, + "auxiliary_loss_mlp": 0.01038518, + "balance_loss_clip": 1.06077015, + "balance_loss_mlp": 1.0207082, + "epoch": 0.21289643769727942, + "flos": 13516992691200.0, + "grad_norm": 3.626686731938822, + "language_loss": 0.75578433, + "learning_rate": 3.659238182559888e-06, + "loss": 0.77775389, + "num_input_tokens_seen": 76443135, + "router_z_loss_clip": 0.97705078, + "router_z_loss_mlp": 0.17810059, + "step": 3541, + "time_per_iteration": 2.4493534564971924 + }, + { + "auxiliary_loss_clip": 0.01153078, + "auxiliary_loss_mlp": 0.01042608, + "balance_loss_clip": 1.05736852, + "balance_loss_mlp": 1.02608573, + "epoch": 0.2129565609499474, + "flos": 24827021942400.0, + "grad_norm": 2.0697985781067882, + "language_loss": 0.69431651, + "learning_rate": 3.6590207030400615e-06, + "loss": 0.71627337, + "num_input_tokens_seen": 76462470, + "router_z_loss_clip": 0.95654297, + "router_z_loss_mlp": 0.1652832, + "step": 3542, + "time_per_iteration": 2.4820401668548584 + }, + { + "auxiliary_loss_clip": 0.01155231, + "auxiliary_loss_mlp": 0.01038124, + "balance_loss_clip": 1.06028318, + "balance_loss_mlp": 1.02205467, + "epoch": 0.21301668420261535, + "flos": 23659242877440.0, + "grad_norm": 2.2452278173744227, + "language_loss": 0.75908649, + "learning_rate": 3.658803160610004e-06, + "loss": 0.78102005, + "num_input_tokens_seen": 76481995, + "router_z_loss_clip": 0.94970703, + "router_z_loss_mlp": 0.16064453, + "step": 3543, + "time_per_iteration": 2.4742817878723145 + }, + { + "auxiliary_loss_clip": 0.01155395, + "auxiliary_loss_mlp": 0.01038671, + "balance_loss_clip": 1.06045568, + "balance_loss_mlp": 1.02163601, + "epoch": 0.21307680745528332, + "flos": 16362805472640.0, + "grad_norm": 1.9659768032475742, + "language_loss": 0.66519767, + "learning_rate": 3.6585855552779634e-06, + "loss": 0.68713838, + "num_input_tokens_seen": 76500245, + "router_z_loss_clip": 0.95068359, + "router_z_loss_mlp": 0.17028809, + "step": 3544, + "time_per_iteration": 2.4313244819641113 + }, + { + "auxiliary_loss_clip": 0.01153577, + "auxiliary_loss_mlp": 0.01038407, + "balance_loss_clip": 1.05591381, + "balance_loss_mlp": 1.02180147, + "epoch": 0.2131369307079513, + "flos": 19099054794240.0, + "grad_norm": 1.830442438891638, + "language_loss": 0.71028602, + "learning_rate": 3.6583678870521934e-06, + "loss": 0.73220587, + "num_input_tokens_seen": 76519535, + "router_z_loss_clip": 0.9765625, + "router_z_loss_mlp": 0.16601562, + "step": 3545, + "time_per_iteration": 2.4812543392181396 + }, + { + "auxiliary_loss_clip": 0.01152318, + "auxiliary_loss_mlp": 0.01055097, + "balance_loss_clip": 1.05388165, + "balance_loss_mlp": 1.03748941, + "epoch": 0.21319705396061928, + "flos": 30372275583360.0, + "grad_norm": 1.8300518837319575, + "language_loss": 0.72039056, + "learning_rate": 3.658150155940946e-06, + "loss": 0.74246466, + "num_input_tokens_seen": 76542065, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.17614746, + "step": 3546, + "time_per_iteration": 2.5342559814453125 + }, + { + "auxiliary_loss_clip": 0.01150006, + "auxiliary_loss_mlp": 0.01041821, + "balance_loss_clip": 1.05332994, + "balance_loss_mlp": 1.02495253, + "epoch": 0.21325717721328724, + "flos": 21756192410880.0, + "grad_norm": 1.8349664072378284, + "language_loss": 0.8085804, + "learning_rate": 3.657932361952479e-06, + "loss": 0.83049864, + "num_input_tokens_seen": 76560540, + "router_z_loss_clip": 0.96533203, + "router_z_loss_mlp": 0.16882324, + "step": 3547, + "time_per_iteration": 3.8722915649414062 + }, + { + "auxiliary_loss_clip": 0.0115205, + "auxiliary_loss_mlp": 0.01045985, + "balance_loss_clip": 1.05140781, + "balance_loss_mlp": 1.027794, + "epoch": 0.2133173004659552, + "flos": 28730870760960.0, + "grad_norm": 4.1156845052802975, + "language_loss": 0.74989402, + "learning_rate": 3.6577145050950504e-06, + "loss": 0.77187437, + "num_input_tokens_seen": 76581760, + "router_z_loss_clip": 1.00585938, + "router_z_loss_mlp": 0.18200684, + "step": 3548, + "time_per_iteration": 2.5210254192352295 + }, + { + "auxiliary_loss_clip": 0.01162875, + "auxiliary_loss_mlp": 0.01049156, + "balance_loss_clip": 1.06230164, + "balance_loss_mlp": 1.03034508, + "epoch": 0.21337742371862317, + "flos": 16837077674880.0, + "grad_norm": 2.463783440551933, + "language_loss": 0.7428968, + "learning_rate": 3.657496585376922e-06, + "loss": 0.76501715, + "num_input_tokens_seen": 76599940, + "router_z_loss_clip": 1.00537109, + "router_z_loss_mlp": 0.18811035, + "step": 3549, + "time_per_iteration": 2.4725489616394043 + }, + { + "auxiliary_loss_clip": 0.01154817, + "auxiliary_loss_mlp": 0.01053611, + "balance_loss_clip": 1.05617476, + "balance_loss_mlp": 1.03446603, + "epoch": 0.21343754697129114, + "flos": 24424930120320.0, + "grad_norm": 1.6300401778597058, + "language_loss": 0.80945152, + "learning_rate": 3.657278602806357e-06, + "loss": 0.83153582, + "num_input_tokens_seen": 76619580, + "router_z_loss_clip": 0.98730469, + "router_z_loss_mlp": 0.19140625, + "step": 3550, + "time_per_iteration": 2.4764602184295654 + }, + { + "auxiliary_loss_clip": 0.01153584, + "auxiliary_loss_mlp": 0.01045571, + "balance_loss_clip": 1.05717289, + "balance_loss_mlp": 1.02911985, + "epoch": 0.21349767022395913, + "flos": 19277817805440.0, + "grad_norm": 1.7300249325596437, + "language_loss": 0.87822986, + "learning_rate": 3.657060557391621e-06, + "loss": 0.90022141, + "num_input_tokens_seen": 76638195, + "router_z_loss_clip": 0.96484375, + "router_z_loss_mlp": 0.16455078, + "step": 3551, + "time_per_iteration": 3.9973912239074707 + }, + { + "auxiliary_loss_clip": 0.01154129, + "auxiliary_loss_mlp": 0.0104894, + "balance_loss_clip": 1.05747771, + "balance_loss_mlp": 1.03160715, + "epoch": 0.2135577934766271, + "flos": 17347547808000.0, + "grad_norm": 1.9163208295535457, + "language_loss": 0.83291537, + "learning_rate": 3.656842449140983e-06, + "loss": 0.85494602, + "num_input_tokens_seen": 76656695, + "router_z_loss_clip": 0.96728516, + "router_z_loss_mlp": 0.17321777, + "step": 3552, + "time_per_iteration": 4.035155296325684 + }, + { + "auxiliary_loss_clip": 0.01152073, + "auxiliary_loss_mlp": 0.01046941, + "balance_loss_clip": 1.05598986, + "balance_loss_mlp": 1.02984595, + "epoch": 0.21361791672929506, + "flos": 24057204635520.0, + "grad_norm": 1.7629363879547897, + "language_loss": 0.76796699, + "learning_rate": 3.656624278062713e-06, + "loss": 0.78995705, + "num_input_tokens_seen": 76677430, + "router_z_loss_clip": 0.95996094, + "router_z_loss_mlp": 0.1708374, + "step": 3553, + "time_per_iteration": 2.4975972175598145 + }, + { + "auxiliary_loss_clip": 0.01160588, + "auxiliary_loss_mlp": 0.0104405, + "balance_loss_clip": 1.06378949, + "balance_loss_mlp": 1.02836227, + "epoch": 0.21367803998196302, + "flos": 22162306556160.0, + "grad_norm": 1.8799340546588454, + "language_loss": 0.72632229, + "learning_rate": 3.6564060441650843e-06, + "loss": 0.74836874, + "num_input_tokens_seen": 76697615, + "router_z_loss_clip": 0.96826172, + "router_z_loss_mlp": 0.15686035, + "step": 3554, + "time_per_iteration": 2.486274003982544 + }, + { + "auxiliary_loss_clip": 0.01152989, + "auxiliary_loss_mlp": 0.01036912, + "balance_loss_clip": 1.05547643, + "balance_loss_mlp": 1.02012694, + "epoch": 0.213738163234631, + "flos": 20886867452160.0, + "grad_norm": 2.194434818271903, + "language_loss": 0.67649299, + "learning_rate": 3.6561877474563724e-06, + "loss": 0.69839197, + "num_input_tokens_seen": 76715685, + "router_z_loss_clip": 0.97509766, + "router_z_loss_mlp": 0.16784668, + "step": 3555, + "time_per_iteration": 2.503243923187256 + }, + { + "auxiliary_loss_clip": 0.01165124, + "auxiliary_loss_mlp": 0.01037264, + "balance_loss_clip": 1.06436813, + "balance_loss_mlp": 1.01975214, + "epoch": 0.21379828648729896, + "flos": 28403114135040.0, + "grad_norm": 2.2072591841495295, + "language_loss": 0.64928973, + "learning_rate": 3.6559693879448553e-06, + "loss": 0.67131364, + "num_input_tokens_seen": 76735405, + "router_z_loss_clip": 1.00732422, + "router_z_loss_mlp": 0.1751709, + "step": 3556, + "time_per_iteration": 2.5577659606933594 + }, + { + "auxiliary_loss_clip": 0.01156422, + "auxiliary_loss_mlp": 0.01045241, + "balance_loss_clip": 1.05938697, + "balance_loss_mlp": 1.02764559, + "epoch": 0.21385840973996692, + "flos": 25479662106240.0, + "grad_norm": 1.9079833549556895, + "language_loss": 0.72586358, + "learning_rate": 3.6557509656388125e-06, + "loss": 0.7478801, + "num_input_tokens_seen": 76754395, + "router_z_loss_clip": 0.96923828, + "router_z_loss_mlp": 0.17614746, + "step": 3557, + "time_per_iteration": 2.5341999530792236 + }, + { + "auxiliary_loss_clip": 0.01152155, + "auxiliary_loss_mlp": 0.01047057, + "balance_loss_clip": 1.0512681, + "balance_loss_mlp": 1.02782905, + "epoch": 0.2139185329926349, + "flos": 28074280101120.0, + "grad_norm": 1.8018983813736578, + "language_loss": 0.67334163, + "learning_rate": 3.655532480546528e-06, + "loss": 0.69533378, + "num_input_tokens_seen": 76777210, + "router_z_loss_clip": 1.0078125, + "router_z_loss_mlp": 0.19238281, + "step": 3558, + "time_per_iteration": 2.5608460903167725 + }, + { + "auxiliary_loss_clip": 0.01159349, + "auxiliary_loss_mlp": 0.01045855, + "balance_loss_clip": 1.0537262, + "balance_loss_mlp": 1.02766371, + "epoch": 0.21397865624530288, + "flos": 19608698914560.0, + "grad_norm": 1.8746327347934706, + "language_loss": 0.80057323, + "learning_rate": 3.655313932676286e-06, + "loss": 0.82262528, + "num_input_tokens_seen": 76795830, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.18188477, + "step": 3559, + "time_per_iteration": 2.4823670387268066 + }, + { + "auxiliary_loss_clip": 0.01154002, + "auxiliary_loss_mlp": 0.01050037, + "balance_loss_clip": 1.05627012, + "balance_loss_mlp": 1.03235841, + "epoch": 0.21403877949797084, + "flos": 24681476033280.0, + "grad_norm": 1.8361229627981963, + "language_loss": 0.67665541, + "learning_rate": 3.655095322036373e-06, + "loss": 0.69869578, + "num_input_tokens_seen": 76814700, + "router_z_loss_clip": 0.97753906, + "router_z_loss_mlp": 0.17675781, + "step": 3560, + "time_per_iteration": 2.5190792083740234 + }, + { + "auxiliary_loss_clip": 0.01156808, + "auxiliary_loss_mlp": 0.01045913, + "balance_loss_clip": 1.05676603, + "balance_loss_mlp": 1.02782845, + "epoch": 0.2140989027506388, + "flos": 19861150677120.0, + "grad_norm": 2.6924448675368042, + "language_loss": 0.72970366, + "learning_rate": 3.65487664863508e-06, + "loss": 0.75173092, + "num_input_tokens_seen": 76833400, + "router_z_loss_clip": 1.00048828, + "router_z_loss_mlp": 0.18078613, + "step": 3561, + "time_per_iteration": 2.4428045749664307 + }, + { + "auxiliary_loss_clip": 0.01154011, + "auxiliary_loss_mlp": 0.01051247, + "balance_loss_clip": 1.05398679, + "balance_loss_mlp": 1.03321075, + "epoch": 0.21415902600330677, + "flos": 19135324552320.0, + "grad_norm": 2.2146886162245822, + "language_loss": 0.77759629, + "learning_rate": 3.654657912480698e-06, + "loss": 0.79964888, + "num_input_tokens_seen": 76850645, + "router_z_loss_clip": 1.00048828, + "router_z_loss_mlp": 0.18041992, + "step": 3562, + "time_per_iteration": 2.4134562015533447 + }, + { + "auxiliary_loss_clip": 0.01146932, + "auxiliary_loss_mlp": 0.01044772, + "balance_loss_clip": 1.05176485, + "balance_loss_mlp": 1.02737951, + "epoch": 0.21421914925597474, + "flos": 22272624201600.0, + "grad_norm": 2.4621682240940608, + "language_loss": 0.84907436, + "learning_rate": 3.6544391135815237e-06, + "loss": 0.87099147, + "num_input_tokens_seen": 76870135, + "router_z_loss_clip": 0.95166016, + "router_z_loss_mlp": 0.17382812, + "step": 3563, + "time_per_iteration": 2.4996371269226074 + }, + { + "auxiliary_loss_clip": 0.0115302, + "auxiliary_loss_mlp": 0.01038569, + "balance_loss_clip": 1.05563521, + "balance_loss_mlp": 1.02221394, + "epoch": 0.2142792725086427, + "flos": 33875109987840.0, + "grad_norm": 1.833737518505503, + "language_loss": 0.76551402, + "learning_rate": 3.6542202519458507e-06, + "loss": 0.78742993, + "num_input_tokens_seen": 76893905, + "router_z_loss_clip": 0.97412109, + "router_z_loss_mlp": 0.16333008, + "step": 3564, + "time_per_iteration": 2.5745978355407715 + }, + { + "auxiliary_loss_clip": 0.0115028, + "auxiliary_loss_mlp": 0.01048921, + "balance_loss_clip": 1.05393171, + "balance_loss_mlp": 1.03015757, + "epoch": 0.2143393957613107, + "flos": 19860216923520.0, + "grad_norm": 1.8647304049501745, + "language_loss": 0.88430512, + "learning_rate": 3.654001327581981e-06, + "loss": 0.90629709, + "num_input_tokens_seen": 76914205, + "router_z_loss_clip": 0.96337891, + "router_z_loss_mlp": 0.18762207, + "step": 3565, + "time_per_iteration": 2.5897912979125977 + }, + { + "auxiliary_loss_clip": 0.01088376, + "auxiliary_loss_mlp": 0.01018846, + "balance_loss_clip": 1.05091715, + "balance_loss_mlp": 1.01629484, + "epoch": 0.21439951901397866, + "flos": 68530093090560.0, + "grad_norm": 0.8397367398260221, + "language_loss": 0.52240241, + "learning_rate": 3.653782340498215e-06, + "loss": 0.54347467, + "num_input_tokens_seen": 76975650, + "router_z_loss_clip": 0.37451172, + "router_z_loss_mlp": 0.0255127, + "step": 3566, + "time_per_iteration": 3.0468087196350098 + }, + { + "auxiliary_loss_clip": 0.01150566, + "auxiliary_loss_mlp": 0.01039445, + "balance_loss_clip": 1.05678201, + "balance_loss_mlp": 1.02343535, + "epoch": 0.21445964226664663, + "flos": 19682998197120.0, + "grad_norm": 2.436390103843091, + "language_loss": 0.67708051, + "learning_rate": 3.6535632907028566e-06, + "loss": 0.69898069, + "num_input_tokens_seen": 76992615, + "router_z_loss_clip": 0.93847656, + "router_z_loss_mlp": 0.16003418, + "step": 3567, + "time_per_iteration": 2.4715676307678223 + }, + { + "auxiliary_loss_clip": 0.01148827, + "auxiliary_loss_mlp": 0.01037975, + "balance_loss_clip": 1.05583978, + "balance_loss_mlp": 1.0218581, + "epoch": 0.2145197655193146, + "flos": 31107259676160.0, + "grad_norm": 1.6800951655246803, + "language_loss": 0.74210167, + "learning_rate": 3.6533441782042126e-06, + "loss": 0.76396966, + "num_input_tokens_seen": 77017005, + "router_z_loss_clip": 0.92871094, + "router_z_loss_mlp": 0.16113281, + "step": 3568, + "time_per_iteration": 2.5406811237335205 + }, + { + "auxiliary_loss_clip": 0.01161243, + "auxiliary_loss_mlp": 0.01050743, + "balance_loss_clip": 1.0650965, + "balance_loss_mlp": 1.03339863, + "epoch": 0.21457988877198256, + "flos": 20120785159680.0, + "grad_norm": 2.194713294351235, + "language_loss": 0.77829832, + "learning_rate": 3.6531250030105917e-06, + "loss": 0.80041814, + "num_input_tokens_seen": 77034990, + "router_z_loss_clip": 0.96142578, + "router_z_loss_mlp": 0.17346191, + "step": 3569, + "time_per_iteration": 2.440977096557617 + }, + { + "auxiliary_loss_clip": 0.01162127, + "auxiliary_loss_mlp": 0.01048551, + "balance_loss_clip": 1.05809295, + "balance_loss_mlp": 1.02823734, + "epoch": 0.21464001202465052, + "flos": 18588045957120.0, + "grad_norm": 2.2989928881620494, + "language_loss": 0.70184112, + "learning_rate": 3.6529057651303053e-06, + "loss": 0.72394782, + "num_input_tokens_seen": 77052610, + "router_z_loss_clip": 1.04101562, + "router_z_loss_mlp": 0.20336914, + "step": 3570, + "time_per_iteration": 2.4429757595062256 + }, + { + "auxiliary_loss_clip": 0.01160868, + "auxiliary_loss_mlp": 0.01047245, + "balance_loss_clip": 1.06031406, + "balance_loss_mlp": 1.02918482, + "epoch": 0.21470013527731852, + "flos": 21835160461440.0, + "grad_norm": 2.490839969029944, + "language_loss": 0.78674972, + "learning_rate": 3.6526864645716666e-06, + "loss": 0.80883086, + "num_input_tokens_seen": 77072475, + "router_z_loss_clip": 1.00439453, + "router_z_loss_mlp": 0.18066406, + "step": 3571, + "time_per_iteration": 2.513396739959717 + }, + { + "auxiliary_loss_clip": 0.01153598, + "auxiliary_loss_mlp": 0.01044115, + "balance_loss_clip": 1.05583704, + "balance_loss_mlp": 1.02518487, + "epoch": 0.21476025852998648, + "flos": 17603195880960.0, + "grad_norm": 4.140999375945946, + "language_loss": 0.82491142, + "learning_rate": 3.652467101342991e-06, + "loss": 0.8468886, + "num_input_tokens_seen": 77089930, + "router_z_loss_clip": 0.97705078, + "router_z_loss_mlp": 0.18945312, + "step": 3572, + "time_per_iteration": 2.712993860244751 + }, + { + "auxiliary_loss_clip": 0.01154232, + "auxiliary_loss_mlp": 0.01044401, + "balance_loss_clip": 1.05260253, + "balance_loss_mlp": 1.02479148, + "epoch": 0.21482038178265445, + "flos": 24828135264000.0, + "grad_norm": 2.908331114454304, + "language_loss": 0.64687145, + "learning_rate": 3.652247675452598e-06, + "loss": 0.66885775, + "num_input_tokens_seen": 77108970, + "router_z_loss_clip": 1.01660156, + "router_z_loss_mlp": 0.19616699, + "step": 3573, + "time_per_iteration": 2.5757052898406982 + }, + { + "auxiliary_loss_clip": 0.01144684, + "auxiliary_loss_mlp": 0.01039634, + "balance_loss_clip": 1.05053842, + "balance_loss_mlp": 1.02277803, + "epoch": 0.2148805050353224, + "flos": 23258228463360.0, + "grad_norm": 1.8479654514680888, + "language_loss": 0.75539082, + "learning_rate": 3.652028186908807e-06, + "loss": 0.77723396, + "num_input_tokens_seen": 77126045, + "router_z_loss_clip": 0.94091797, + "router_z_loss_mlp": 0.1685791, + "step": 3574, + "time_per_iteration": 2.4698486328125 + }, + { + "auxiliary_loss_clip": 0.01151721, + "auxiliary_loss_mlp": 0.01041713, + "balance_loss_clip": 1.05445135, + "balance_loss_mlp": 1.02393937, + "epoch": 0.21494062828799038, + "flos": 21321098968320.0, + "grad_norm": 2.068777152096239, + "language_loss": 0.72263026, + "learning_rate": 3.6518086357199416e-06, + "loss": 0.74456465, + "num_input_tokens_seen": 77144600, + "router_z_loss_clip": 0.97119141, + "router_z_loss_mlp": 0.17773438, + "step": 3575, + "time_per_iteration": 2.4892406463623047 + }, + { + "auxiliary_loss_clip": 0.01159435, + "auxiliary_loss_mlp": 0.01039058, + "balance_loss_clip": 1.06139207, + "balance_loss_mlp": 1.02264309, + "epoch": 0.21500075154065834, + "flos": 18843334894080.0, + "grad_norm": 2.1394252687955224, + "language_loss": 0.68087399, + "learning_rate": 3.6515890218943277e-06, + "loss": 0.70285892, + "num_input_tokens_seen": 77162965, + "router_z_loss_clip": 0.97900391, + "router_z_loss_mlp": 0.16418457, + "step": 3576, + "time_per_iteration": 2.457207441329956 + }, + { + "auxiliary_loss_clip": 0.0115551, + "auxiliary_loss_mlp": 0.01042547, + "balance_loss_clip": 1.05441701, + "balance_loss_mlp": 1.02353323, + "epoch": 0.2150608747933263, + "flos": 18441997257600.0, + "grad_norm": 2.2348600527265776, + "language_loss": 0.88397479, + "learning_rate": 3.651369345440292e-06, + "loss": 0.90595537, + "num_input_tokens_seen": 77179960, + "router_z_loss_clip": 1.01220703, + "router_z_loss_mlp": 0.18994141, + "step": 3577, + "time_per_iteration": 2.429809093475342 + }, + { + "auxiliary_loss_clip": 0.01082611, + "auxiliary_loss_mlp": 0.01003641, + "balance_loss_clip": 1.04755843, + "balance_loss_mlp": 1.00143242, + "epoch": 0.2151209980459943, + "flos": 66598242894720.0, + "grad_norm": 0.8096298796154354, + "language_loss": 0.56183153, + "learning_rate": 3.6511496063661654e-06, + "loss": 0.58269405, + "num_input_tokens_seen": 77239500, + "router_z_loss_clip": 0.35009766, + "router_z_loss_mlp": 0.02209473, + "step": 3578, + "time_per_iteration": 3.026616334915161 + }, + { + "auxiliary_loss_clip": 0.01151146, + "auxiliary_loss_mlp": 0.01053712, + "balance_loss_clip": 1.05400729, + "balance_loss_mlp": 1.0348891, + "epoch": 0.21518112129866226, + "flos": 21575885114880.0, + "grad_norm": 1.7598728685367084, + "language_loss": 0.88637203, + "learning_rate": 3.6509298046802807e-06, + "loss": 0.90842068, + "num_input_tokens_seen": 77254680, + "router_z_loss_clip": 0.97070312, + "router_z_loss_mlp": 0.18811035, + "step": 3579, + "time_per_iteration": 2.4796066284179688 + }, + { + "auxiliary_loss_clip": 0.01153799, + "auxiliary_loss_mlp": 0.01047255, + "balance_loss_clip": 1.05480576, + "balance_loss_mlp": 1.0290395, + "epoch": 0.21524124455133023, + "flos": 20047635112320.0, + "grad_norm": 2.193575255999153, + "language_loss": 0.77889276, + "learning_rate": 3.650709940390972e-06, + "loss": 0.80090332, + "num_input_tokens_seen": 77274060, + "router_z_loss_clip": 0.99072266, + "router_z_loss_mlp": 0.18200684, + "step": 3580, + "time_per_iteration": 2.476644992828369 + }, + { + "auxiliary_loss_clip": 0.01156989, + "auxiliary_loss_mlp": 0.01042132, + "balance_loss_clip": 1.06098843, + "balance_loss_mlp": 1.02432227, + "epoch": 0.2153013678039982, + "flos": 23951807153280.0, + "grad_norm": 2.0132490653607245, + "language_loss": 0.72628701, + "learning_rate": 3.6504900135065775e-06, + "loss": 0.74827814, + "num_input_tokens_seen": 77293255, + "router_z_loss_clip": 0.96142578, + "router_z_loss_mlp": 0.17810059, + "step": 3581, + "time_per_iteration": 2.5030558109283447 + }, + { + "auxiliary_loss_clip": 0.01150373, + "auxiliary_loss_mlp": 0.0104842, + "balance_loss_clip": 1.0528723, + "balance_loss_mlp": 1.02723694, + "epoch": 0.21536149105666616, + "flos": 20594841880320.0, + "grad_norm": 2.566251731776629, + "language_loss": 0.71076947, + "learning_rate": 3.6502700240354357e-06, + "loss": 0.73275733, + "num_input_tokens_seen": 77312390, + "router_z_loss_clip": 0.97558594, + "router_z_loss_mlp": 0.21203613, + "step": 3582, + "time_per_iteration": 2.4655418395996094 + }, + { + "auxiliary_loss_clip": 0.01168516, + "auxiliary_loss_mlp": 0.01044083, + "balance_loss_clip": 1.06889904, + "balance_loss_mlp": 1.0258317, + "epoch": 0.21542161430933413, + "flos": 12860042895360.0, + "grad_norm": 2.69013978505843, + "language_loss": 0.84279263, + "learning_rate": 3.650049971985889e-06, + "loss": 0.86491865, + "num_input_tokens_seen": 77330985, + "router_z_loss_clip": 0.99609375, + "router_z_loss_mlp": 0.18249512, + "step": 3583, + "time_per_iteration": 3.893671989440918 + }, + { + "auxiliary_loss_clip": 0.01157987, + "auxiliary_loss_mlp": 0.0105188, + "balance_loss_clip": 1.05570626, + "balance_loss_mlp": 1.03380799, + "epoch": 0.21548173756200212, + "flos": 26103933504000.0, + "grad_norm": 2.4070837888531393, + "language_loss": 0.83204734, + "learning_rate": 3.6498298573662824e-06, + "loss": 0.854146, + "num_input_tokens_seen": 77350770, + "router_z_loss_clip": 1.02246094, + "router_z_loss_mlp": 0.1809082, + "step": 3584, + "time_per_iteration": 2.5255014896392822 + }, + { + "auxiliary_loss_clip": 0.01151052, + "auxiliary_loss_mlp": 0.01049239, + "balance_loss_clip": 1.05438876, + "balance_loss_mlp": 1.03069043, + "epoch": 0.21554186081467008, + "flos": 22163779013760.0, + "grad_norm": 2.2090547170801695, + "language_loss": 0.89905965, + "learning_rate": 3.6496096801849625e-06, + "loss": 0.92106259, + "num_input_tokens_seen": 77370510, + "router_z_loss_clip": 0.96630859, + "router_z_loss_mlp": 0.18554688, + "step": 3585, + "time_per_iteration": 2.5019896030426025 + }, + { + "auxiliary_loss_clip": 0.01173042, + "auxiliary_loss_mlp": 0.01042817, + "balance_loss_clip": 1.07169437, + "balance_loss_mlp": 1.0246973, + "epoch": 0.21560198406733805, + "flos": 22966741595520.0, + "grad_norm": 1.938708403048773, + "language_loss": 0.74722689, + "learning_rate": 3.649389440450277e-06, + "loss": 0.76938546, + "num_input_tokens_seen": 77390645, + "router_z_loss_clip": 1.01367188, + "router_z_loss_mlp": 0.18115234, + "step": 3586, + "time_per_iteration": 2.5008091926574707 + }, + { + "auxiliary_loss_clip": 0.01153021, + "auxiliary_loss_mlp": 0.01049803, + "balance_loss_clip": 1.0532006, + "balance_loss_mlp": 1.03248215, + "epoch": 0.215662107320006, + "flos": 22784064001920.0, + "grad_norm": 2.219376889020928, + "language_loss": 0.82837266, + "learning_rate": 3.6491691381705804e-06, + "loss": 0.85040092, + "num_input_tokens_seen": 77409655, + "router_z_loss_clip": 0.99755859, + "router_z_loss_mlp": 0.17321777, + "step": 3587, + "time_per_iteration": 2.4856905937194824 + }, + { + "auxiliary_loss_clip": 0.01154144, + "auxiliary_loss_mlp": 0.01040257, + "balance_loss_clip": 1.05333662, + "balance_loss_mlp": 1.02200592, + "epoch": 0.21572223057267398, + "flos": 30883859038080.0, + "grad_norm": 1.6773396301299932, + "language_loss": 0.75676322, + "learning_rate": 3.648948773354224e-06, + "loss": 0.77870727, + "num_input_tokens_seen": 77430560, + "router_z_loss_clip": 1.0078125, + "router_z_loss_mlp": 0.18249512, + "step": 3588, + "time_per_iteration": 2.5696234703063965 + }, + { + "auxiliary_loss_clip": 0.01156348, + "auxiliary_loss_mlp": 0.01040393, + "balance_loss_clip": 1.05810916, + "balance_loss_mlp": 1.02239203, + "epoch": 0.21578235382534194, + "flos": 26910487445760.0, + "grad_norm": 1.9209127115653764, + "language_loss": 0.81155765, + "learning_rate": 3.6487283460095643e-06, + "loss": 0.83352512, + "num_input_tokens_seen": 77455000, + "router_z_loss_clip": 0.98388672, + "router_z_loss_mlp": 0.18017578, + "step": 3589, + "time_per_iteration": 2.5653369426727295 + }, + { + "auxiliary_loss_clip": 0.01156592, + "auxiliary_loss_mlp": 0.01041235, + "balance_loss_clip": 1.0565176, + "balance_loss_mlp": 1.02410507, + "epoch": 0.2158424770780099, + "flos": 24425720219520.0, + "grad_norm": 2.3128461818862798, + "language_loss": 0.7291503, + "learning_rate": 3.648507856144961e-06, + "loss": 0.75112855, + "num_input_tokens_seen": 77475075, + "router_z_loss_clip": 0.99902344, + "router_z_loss_mlp": 0.17126465, + "step": 3590, + "time_per_iteration": 2.569495439529419 + }, + { + "auxiliary_loss_clip": 0.01159569, + "auxiliary_loss_mlp": 0.01044777, + "balance_loss_clip": 1.05652022, + "balance_loss_mlp": 1.02569151, + "epoch": 0.2159026003306779, + "flos": 23949975559680.0, + "grad_norm": 2.233907476712203, + "language_loss": 0.83933562, + "learning_rate": 3.648287303768775e-06, + "loss": 0.86137903, + "num_input_tokens_seen": 77495945, + "router_z_loss_clip": 1.02929688, + "router_z_loss_mlp": 0.1907959, + "step": 3591, + "time_per_iteration": 3.8537964820861816 + }, + { + "auxiliary_loss_clip": 0.01160461, + "auxiliary_loss_mlp": 0.01052595, + "balance_loss_clip": 1.05819666, + "balance_loss_mlp": 1.03043437, + "epoch": 0.21596272358334587, + "flos": 30040963511040.0, + "grad_norm": 2.022634182979991, + "language_loss": 0.69524199, + "learning_rate": 3.6480666888893686e-06, + "loss": 0.71737254, + "num_input_tokens_seen": 77517140, + "router_z_loss_clip": 1.02294922, + "router_z_loss_mlp": 0.22167969, + "step": 3592, + "time_per_iteration": 2.542127847671509 + }, + { + "auxiliary_loss_clip": 0.01162129, + "auxiliary_loss_mlp": 0.01053468, + "balance_loss_clip": 1.06041455, + "balance_loss_mlp": 1.03394151, + "epoch": 0.21602284683601383, + "flos": 20376217751040.0, + "grad_norm": 3.814286668654528, + "language_loss": 0.84659249, + "learning_rate": 3.647846011515108e-06, + "loss": 0.86874843, + "num_input_tokens_seen": 77536085, + "router_z_loss_clip": 1.01806641, + "router_z_loss_mlp": 0.19543457, + "step": 3593, + "time_per_iteration": 2.4314892292022705 + }, + { + "auxiliary_loss_clip": 0.01181249, + "auxiliary_loss_mlp": 0.01051452, + "balance_loss_clip": 1.07551229, + "balance_loss_mlp": 1.03174639, + "epoch": 0.2160829700886818, + "flos": 20777339905920.0, + "grad_norm": 2.4943119931645104, + "language_loss": 0.75199139, + "learning_rate": 3.6476252716543625e-06, + "loss": 0.7743184, + "num_input_tokens_seen": 77553675, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.19702148, + "step": 3594, + "time_per_iteration": 3.9612975120544434 + }, + { + "auxiliary_loss_clip": 0.01157006, + "auxiliary_loss_mlp": 0.0105078, + "balance_loss_clip": 1.05938601, + "balance_loss_mlp": 1.03117001, + "epoch": 0.21614309334134976, + "flos": 22309755886080.0, + "grad_norm": 1.5967988018766461, + "language_loss": 0.80667454, + "learning_rate": 3.6474044693155007e-06, + "loss": 0.8287524, + "num_input_tokens_seen": 77573360, + "router_z_loss_clip": 0.97558594, + "router_z_loss_mlp": 0.19628906, + "step": 3595, + "time_per_iteration": 3.932941436767578 + }, + { + "auxiliary_loss_clip": 0.01159646, + "auxiliary_loss_mlp": 0.01044453, + "balance_loss_clip": 1.05738926, + "balance_loss_mlp": 1.02443767, + "epoch": 0.21620321659401773, + "flos": 19609524927360.0, + "grad_norm": 8.443970178769934, + "language_loss": 0.78359067, + "learning_rate": 3.647183604506897e-06, + "loss": 0.80563164, + "num_input_tokens_seen": 77591865, + "router_z_loss_clip": 1.02148438, + "router_z_loss_mlp": 0.20019531, + "step": 3596, + "time_per_iteration": 2.4696807861328125 + }, + { + "auxiliary_loss_clip": 0.01168426, + "auxiliary_loss_mlp": 0.01047064, + "balance_loss_clip": 1.06652558, + "balance_loss_mlp": 1.02975488, + "epoch": 0.2162633398466857, + "flos": 18844555956480.0, + "grad_norm": 1.4958009681637512, + "language_loss": 0.83377135, + "learning_rate": 3.6469626772369253e-06, + "loss": 0.85592622, + "num_input_tokens_seen": 77611600, + "router_z_loss_clip": 1.01953125, + "router_z_loss_mlp": 0.17321777, + "step": 3597, + "time_per_iteration": 2.4803764820098877 + }, + { + "auxiliary_loss_clip": 0.01163138, + "auxiliary_loss_mlp": 0.01044169, + "balance_loss_clip": 1.06280303, + "balance_loss_mlp": 1.02536964, + "epoch": 0.21632346309935369, + "flos": 18768820129920.0, + "grad_norm": 1.6189485354114213, + "language_loss": 0.80750299, + "learning_rate": 3.6467416875139642e-06, + "loss": 0.82957608, + "num_input_tokens_seen": 77630665, + "router_z_loss_clip": 1.00244141, + "router_z_loss_mlp": 0.18811035, + "step": 3598, + "time_per_iteration": 2.4850564002990723 + }, + { + "auxiliary_loss_clip": 0.01167383, + "auxiliary_loss_mlp": 0.01054359, + "balance_loss_clip": 1.06145048, + "balance_loss_mlp": 1.03466594, + "epoch": 0.21638358635202165, + "flos": 26324173745280.0, + "grad_norm": 2.0096882442906936, + "language_loss": 0.81868911, + "learning_rate": 3.6465206353463934e-06, + "loss": 0.8409065, + "num_input_tokens_seen": 77650835, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.19689941, + "step": 3599, + "time_per_iteration": 2.5177536010742188 + }, + { + "auxiliary_loss_clip": 0.0116315, + "auxiliary_loss_mlp": 0.01035746, + "balance_loss_clip": 1.06331801, + "balance_loss_mlp": 1.01822209, + "epoch": 0.21644370960468962, + "flos": 20740854666240.0, + "grad_norm": 6.2871634832631615, + "language_loss": 0.76507115, + "learning_rate": 3.6462995207425947e-06, + "loss": 0.78706002, + "num_input_tokens_seen": 77669000, + "router_z_loss_clip": 0.99707031, + "router_z_loss_mlp": 0.1751709, + "step": 3600, + "time_per_iteration": 2.568666458129883 + }, + { + "auxiliary_loss_clip": 0.01158493, + "auxiliary_loss_mlp": 0.01046502, + "balance_loss_clip": 1.0590415, + "balance_loss_mlp": 1.02987242, + "epoch": 0.21650383285735758, + "flos": 23952238116480.0, + "grad_norm": 2.047280527844908, + "language_loss": 0.79764819, + "learning_rate": 3.6460783437109533e-06, + "loss": 0.81969821, + "num_input_tokens_seen": 77688745, + "router_z_loss_clip": 0.99462891, + "router_z_loss_mlp": 0.16638184, + "step": 3601, + "time_per_iteration": 2.6368331909179688 + }, + { + "auxiliary_loss_clip": 0.01168349, + "auxiliary_loss_mlp": 0.01049868, + "balance_loss_clip": 1.06742513, + "balance_loss_mlp": 1.03191519, + "epoch": 0.21656395611002555, + "flos": 23696087253120.0, + "grad_norm": 3.5838689369283396, + "language_loss": 0.83635449, + "learning_rate": 3.6458571042598565e-06, + "loss": 0.85853666, + "num_input_tokens_seen": 77708445, + "router_z_loss_clip": 1.00927734, + "router_z_loss_mlp": 0.17944336, + "step": 3602, + "time_per_iteration": 2.5412092208862305 + }, + { + "auxiliary_loss_clip": 0.01156237, + "auxiliary_loss_mlp": 0.01054108, + "balance_loss_clip": 1.056288, + "balance_loss_mlp": 1.03414059, + "epoch": 0.2166240793626935, + "flos": 20666052593280.0, + "grad_norm": 1.8001648201399136, + "language_loss": 0.74920779, + "learning_rate": 3.645635802397693e-06, + "loss": 0.77131128, + "num_input_tokens_seen": 77728465, + "router_z_loss_clip": 0.99951172, + "router_z_loss_mlp": 0.19970703, + "step": 3603, + "time_per_iteration": 2.485795021057129 + }, + { + "auxiliary_loss_clip": 0.0115811, + "auxiliary_loss_mlp": 0.01043421, + "balance_loss_clip": 1.05979109, + "balance_loss_mlp": 1.02564728, + "epoch": 0.2166842026153615, + "flos": 21580410228480.0, + "grad_norm": 2.2991039355593874, + "language_loss": 0.73761922, + "learning_rate": 3.645414438132855e-06, + "loss": 0.75963455, + "num_input_tokens_seen": 77746735, + "router_z_loss_clip": 0.98339844, + "router_z_loss_mlp": 0.1776123, + "step": 3604, + "time_per_iteration": 2.4790172576904297 + }, + { + "auxiliary_loss_clip": 0.01170245, + "auxiliary_loss_mlp": 0.01046702, + "balance_loss_clip": 1.07184756, + "balance_loss_mlp": 1.02772415, + "epoch": 0.21674432586802947, + "flos": 25629948610560.0, + "grad_norm": 1.6607706647526648, + "language_loss": 0.79770368, + "learning_rate": 3.6451930114737366e-06, + "loss": 0.81987321, + "num_input_tokens_seen": 77768105, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.18981934, + "step": 3605, + "time_per_iteration": 2.503242254257202 + }, + { + "auxiliary_loss_clip": 0.0108756, + "auxiliary_loss_mlp": 0.01013978, + "balance_loss_clip": 1.05313194, + "balance_loss_mlp": 1.01174617, + "epoch": 0.21680444912069743, + "flos": 56417783616000.0, + "grad_norm": 0.692263529182314, + "language_loss": 0.58314991, + "learning_rate": 3.6449715224287347e-06, + "loss": 0.60416526, + "num_input_tokens_seen": 77833750, + "router_z_loss_clip": 0.34472656, + "router_z_loss_mlp": 0.02230835, + "step": 3606, + "time_per_iteration": 3.162691354751587 + }, + { + "auxiliary_loss_clip": 0.01168147, + "auxiliary_loss_mlp": 0.01049636, + "balance_loss_clip": 1.06420672, + "balance_loss_mlp": 1.03056288, + "epoch": 0.2168645723733654, + "flos": 23878944414720.0, + "grad_norm": 2.0499161151168663, + "language_loss": 0.72968721, + "learning_rate": 3.644749971006248e-06, + "loss": 0.75186503, + "num_input_tokens_seen": 77853780, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.1907959, + "step": 3607, + "time_per_iteration": 2.4863100051879883 + }, + { + "auxiliary_loss_clip": 0.01167269, + "auxiliary_loss_mlp": 0.01050872, + "balance_loss_clip": 1.06064248, + "balance_loss_mlp": 1.03034413, + "epoch": 0.21692469562603336, + "flos": 16946174257920.0, + "grad_norm": 1.8414002498602888, + "language_loss": 0.75909245, + "learning_rate": 3.6445283572146765e-06, + "loss": 0.78127384, + "num_input_tokens_seen": 77872575, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.20532227, + "step": 3608, + "time_per_iteration": 2.4575021266937256 + }, + { + "auxiliary_loss_clip": 0.01158913, + "auxiliary_loss_mlp": 0.01047928, + "balance_loss_clip": 1.05648601, + "balance_loss_mlp": 1.0302372, + "epoch": 0.21698481887870133, + "flos": 25119047514240.0, + "grad_norm": 1.9347100522893317, + "language_loss": 0.74668854, + "learning_rate": 3.6443066810624255e-06, + "loss": 0.76875699, + "num_input_tokens_seen": 77892700, + "router_z_loss_clip": 1.02392578, + "router_z_loss_mlp": 0.17687988, + "step": 3609, + "time_per_iteration": 2.515239715576172 + }, + { + "auxiliary_loss_clip": 0.01161051, + "auxiliary_loss_mlp": 0.01048152, + "balance_loss_clip": 1.05897033, + "balance_loss_mlp": 1.02996063, + "epoch": 0.2170449421313693, + "flos": 17894682748800.0, + "grad_norm": 3.3557137621794357, + "language_loss": 0.88670737, + "learning_rate": 3.6440849425579e-06, + "loss": 0.90879941, + "num_input_tokens_seen": 77911060, + "router_z_loss_clip": 1.02050781, + "router_z_loss_mlp": 0.18188477, + "step": 3610, + "time_per_iteration": 2.513906717300415 + }, + { + "auxiliary_loss_clip": 0.01160507, + "auxiliary_loss_mlp": 0.01039826, + "balance_loss_clip": 1.06041551, + "balance_loss_mlp": 1.02161086, + "epoch": 0.2171050653840373, + "flos": 22638446265600.0, + "grad_norm": 1.736628964977447, + "language_loss": 0.77602208, + "learning_rate": 3.6438631417095095e-06, + "loss": 0.79802537, + "num_input_tokens_seen": 77929930, + "router_z_loss_clip": 1.00097656, + "router_z_loss_mlp": 0.18225098, + "step": 3611, + "time_per_iteration": 2.4903600215911865 + }, + { + "auxiliary_loss_clip": 0.01162027, + "auxiliary_loss_mlp": 0.01043855, + "balance_loss_clip": 1.06367993, + "balance_loss_mlp": 1.02630758, + "epoch": 0.21716518863670525, + "flos": 19499997381120.0, + "grad_norm": 2.420527411289181, + "language_loss": 0.63438624, + "learning_rate": 3.6436412785256637e-06, + "loss": 0.65644503, + "num_input_tokens_seen": 77949060, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.17553711, + "step": 3612, + "time_per_iteration": 2.4829392433166504 + }, + { + "auxiliary_loss_clip": 0.01168153, + "auxiliary_loss_mlp": 0.01047909, + "balance_loss_clip": 1.06539631, + "balance_loss_mlp": 1.03008699, + "epoch": 0.21722531188937322, + "flos": 19792022952960.0, + "grad_norm": 1.7308091478301877, + "language_loss": 0.75974238, + "learning_rate": 3.643419353014776e-06, + "loss": 0.78190297, + "num_input_tokens_seen": 77967920, + "router_z_loss_clip": 1.02880859, + "router_z_loss_mlp": 0.17822266, + "step": 3613, + "time_per_iteration": 2.4306180477142334 + }, + { + "auxiliary_loss_clip": 0.01158473, + "auxiliary_loss_mlp": 0.01046793, + "balance_loss_clip": 1.05950022, + "balance_loss_mlp": 1.02777898, + "epoch": 0.21728543514204118, + "flos": 13334386924800.0, + "grad_norm": 1.9636040203330243, + "language_loss": 0.70849174, + "learning_rate": 3.643197365185261e-06, + "loss": 0.73054439, + "num_input_tokens_seen": 77985330, + "router_z_loss_clip": 0.98876953, + "router_z_loss_mlp": 0.19018555, + "step": 3614, + "time_per_iteration": 2.5532193183898926 + }, + { + "auxiliary_loss_clip": 0.01166897, + "auxiliary_loss_mlp": 0.01048565, + "balance_loss_clip": 1.06633437, + "balance_loss_mlp": 1.03005171, + "epoch": 0.21734555839470915, + "flos": 15231870783360.0, + "grad_norm": 3.7799913398589764, + "language_loss": 0.73424971, + "learning_rate": 3.6429753150455378e-06, + "loss": 0.75640434, + "num_input_tokens_seen": 78003105, + "router_z_loss_clip": 1.00634766, + "router_z_loss_mlp": 0.18518066, + "step": 3615, + "time_per_iteration": 2.447277784347534 + }, + { + "auxiliary_loss_clip": 0.01162669, + "auxiliary_loss_mlp": 0.01049772, + "balance_loss_clip": 1.05718923, + "balance_loss_mlp": 1.02980447, + "epoch": 0.2174056816473771, + "flos": 19973982274560.0, + "grad_norm": 3.084382343706433, + "language_loss": 0.8964783, + "learning_rate": 3.6427532026040263e-06, + "loss": 0.91860271, + "num_input_tokens_seen": 78019655, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.19958496, + "step": 3616, + "time_per_iteration": 2.4796228408813477 + }, + { + "auxiliary_loss_clip": 0.01161575, + "auxiliary_loss_mlp": 0.01042854, + "balance_loss_clip": 1.05887127, + "balance_loss_mlp": 1.02404284, + "epoch": 0.21746580490004508, + "flos": 16687293960960.0, + "grad_norm": 2.402174609208369, + "language_loss": 0.81066561, + "learning_rate": 3.642531027869148e-06, + "loss": 0.83270991, + "num_input_tokens_seen": 78036025, + "router_z_loss_clip": 1.02685547, + "router_z_loss_mlp": 0.18811035, + "step": 3617, + "time_per_iteration": 2.464496374130249 + }, + { + "auxiliary_loss_clip": 0.01160319, + "auxiliary_loss_mlp": 0.01045018, + "balance_loss_clip": 1.05967617, + "balance_loss_mlp": 1.02735162, + "epoch": 0.21752592815271307, + "flos": 25772298209280.0, + "grad_norm": 1.6574986633246565, + "language_loss": 0.75685859, + "learning_rate": 3.642308790849329e-06, + "loss": 0.77891195, + "num_input_tokens_seen": 78055645, + "router_z_loss_clip": 1.0078125, + "router_z_loss_mlp": 0.17687988, + "step": 3618, + "time_per_iteration": 2.6261043548583984 + }, + { + "auxiliary_loss_clip": 0.01162692, + "auxiliary_loss_mlp": 0.0106038, + "balance_loss_clip": 1.0609777, + "balance_loss_mlp": 1.04017401, + "epoch": 0.21758605140538104, + "flos": 11254692349440.0, + "grad_norm": 1.910197515704789, + "language_loss": 0.69067323, + "learning_rate": 3.642086491552996e-06, + "loss": 0.71290392, + "num_input_tokens_seen": 78071660, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.20214844, + "step": 3619, + "time_per_iteration": 2.4950437545776367 + }, + { + "auxiliary_loss_clip": 0.0116839, + "auxiliary_loss_mlp": 0.01054072, + "balance_loss_clip": 1.06413269, + "balance_loss_mlp": 1.03517771, + "epoch": 0.217646174658049, + "flos": 19242625455360.0, + "grad_norm": 1.8358468982936722, + "language_loss": 0.78400242, + "learning_rate": 3.641864129988579e-06, + "loss": 0.80622697, + "num_input_tokens_seen": 78091265, + "router_z_loss_clip": 1.04199219, + "router_z_loss_mlp": 0.18896484, + "step": 3620, + "time_per_iteration": 2.5677754878997803 + }, + { + "auxiliary_loss_clip": 0.01152902, + "auxiliary_loss_mlp": 0.01039173, + "balance_loss_clip": 1.05725563, + "balance_loss_mlp": 1.02176881, + "epoch": 0.21770629791071697, + "flos": 21945083057280.0, + "grad_norm": 3.8847228772629254, + "language_loss": 0.80207956, + "learning_rate": 3.641641706164509e-06, + "loss": 0.82400036, + "num_input_tokens_seen": 78110095, + "router_z_loss_clip": 0.95605469, + "router_z_loss_mlp": 0.1739502, + "step": 3621, + "time_per_iteration": 2.4991085529327393 + }, + { + "auxiliary_loss_clip": 0.01158581, + "auxiliary_loss_mlp": 0.01044959, + "balance_loss_clip": 1.06013334, + "balance_loss_mlp": 1.02649319, + "epoch": 0.21776642116338493, + "flos": 24936764970240.0, + "grad_norm": 1.8630642984031704, + "language_loss": 0.87620294, + "learning_rate": 3.641419220089221e-06, + "loss": 0.89823836, + "num_input_tokens_seen": 78129475, + "router_z_loss_clip": 0.98486328, + "router_z_loss_mlp": 0.18469238, + "step": 3622, + "time_per_iteration": 2.6177256107330322 + }, + { + "auxiliary_loss_clip": 0.01163668, + "auxiliary_loss_mlp": 0.01044412, + "balance_loss_clip": 1.06026244, + "balance_loss_mlp": 1.02457523, + "epoch": 0.2178265444160529, + "flos": 17821317219840.0, + "grad_norm": 1.9885391945945208, + "language_loss": 0.76970518, + "learning_rate": 3.641196671771152e-06, + "loss": 0.79178596, + "num_input_tokens_seen": 78146880, + "router_z_loss_clip": 1.03417969, + "router_z_loss_mlp": 0.19824219, + "step": 3623, + "time_per_iteration": 2.41582989692688 + }, + { + "auxiliary_loss_clip": 0.01165591, + "auxiliary_loss_mlp": 0.01049339, + "balance_loss_clip": 1.06137848, + "balance_loss_mlp": 1.02976513, + "epoch": 0.2178866676687209, + "flos": 17712902995200.0, + "grad_norm": 2.4444996541114503, + "language_loss": 0.84763467, + "learning_rate": 3.640974061218741e-06, + "loss": 0.86978394, + "num_input_tokens_seen": 78165065, + "router_z_loss_clip": 1.04101562, + "router_z_loss_mlp": 0.19580078, + "step": 3624, + "time_per_iteration": 2.4873430728912354 + }, + { + "auxiliary_loss_clip": 0.0116985, + "auxiliary_loss_mlp": 0.01054787, + "balance_loss_clip": 1.06665623, + "balance_loss_mlp": 1.03635705, + "epoch": 0.21794679092138886, + "flos": 16945851035520.0, + "grad_norm": 2.787171033486248, + "language_loss": 0.77671105, + "learning_rate": 3.640751388440429e-06, + "loss": 0.79895741, + "num_input_tokens_seen": 78180005, + "router_z_loss_clip": 1.03271484, + "router_z_loss_mlp": 0.18432617, + "step": 3625, + "time_per_iteration": 2.4422521591186523 + }, + { + "auxiliary_loss_clip": 0.01079612, + "auxiliary_loss_mlp": 0.01003695, + "balance_loss_clip": 1.04565954, + "balance_loss_mlp": 1.00169218, + "epoch": 0.21800691417405682, + "flos": 63718566566400.0, + "grad_norm": 0.8167097650351473, + "language_loss": 0.60712469, + "learning_rate": 3.64052865344466e-06, + "loss": 0.62795782, + "num_input_tokens_seen": 78245350, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.02001953, + "step": 3626, + "time_per_iteration": 4.714742422103882 + }, + { + "auxiliary_loss_clip": 0.01166115, + "auxiliary_loss_mlp": 0.01045273, + "balance_loss_clip": 1.06419337, + "balance_loss_mlp": 1.02578187, + "epoch": 0.21806703742672479, + "flos": 21616392677760.0, + "grad_norm": 2.0876388628668177, + "language_loss": 0.90409714, + "learning_rate": 3.6403058562398795e-06, + "loss": 0.92621112, + "num_input_tokens_seen": 78264165, + "router_z_loss_clip": 1.01855469, + "router_z_loss_mlp": 0.19494629, + "step": 3627, + "time_per_iteration": 2.495319366455078 + }, + { + "auxiliary_loss_clip": 0.01163215, + "auxiliary_loss_mlp": 0.01048751, + "balance_loss_clip": 1.06399941, + "balance_loss_mlp": 1.02933168, + "epoch": 0.21812716067939275, + "flos": 19354882435200.0, + "grad_norm": 1.6808021274796845, + "language_loss": 0.7349152, + "learning_rate": 3.6400829968345365e-06, + "loss": 0.75703484, + "num_input_tokens_seen": 78283745, + "router_z_loss_clip": 0.99267578, + "router_z_loss_mlp": 0.19421387, + "step": 3628, + "time_per_iteration": 2.437824010848999 + }, + { + "auxiliary_loss_clip": 0.01168369, + "auxiliary_loss_mlp": 0.01044987, + "balance_loss_clip": 1.06775379, + "balance_loss_mlp": 1.0270462, + "epoch": 0.21818728393206072, + "flos": 23548063305600.0, + "grad_norm": 7.876007104882234, + "language_loss": 0.76798785, + "learning_rate": 3.6398600752370826e-06, + "loss": 0.79012138, + "num_input_tokens_seen": 78302900, + "router_z_loss_clip": 1.00634766, + "router_z_loss_mlp": 0.17944336, + "step": 3629, + "time_per_iteration": 2.5680782794952393 + }, + { + "auxiliary_loss_clip": 0.01157295, + "auxiliary_loss_mlp": 0.01037966, + "balance_loss_clip": 1.05882001, + "balance_loss_mlp": 1.02085912, + "epoch": 0.21824740718472868, + "flos": 30225652266240.0, + "grad_norm": 1.5653165936372506, + "language_loss": 0.71080565, + "learning_rate": 3.63963709145597e-06, + "loss": 0.73275828, + "num_input_tokens_seen": 78326470, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.17089844, + "step": 3630, + "time_per_iteration": 2.5942749977111816 + }, + { + "auxiliary_loss_clip": 0.0115209, + "auxiliary_loss_mlp": 0.01040924, + "balance_loss_clip": 1.05711627, + "balance_loss_mlp": 1.02472329, + "epoch": 0.21830753043739667, + "flos": 26134672567680.0, + "grad_norm": 2.4412813739415076, + "language_loss": 0.76719201, + "learning_rate": 3.6394140454996544e-06, + "loss": 0.7891221, + "num_input_tokens_seen": 78345810, + "router_z_loss_clip": 0.94970703, + "router_z_loss_mlp": 0.1619873, + "step": 3631, + "time_per_iteration": 2.538722515106201 + }, + { + "auxiliary_loss_clip": 0.01155639, + "auxiliary_loss_mlp": 0.01036492, + "balance_loss_clip": 1.05531263, + "balance_loss_mlp": 1.01928997, + "epoch": 0.21836765369006464, + "flos": 21720712752000.0, + "grad_norm": 2.302290960439957, + "language_loss": 0.75052929, + "learning_rate": 3.639190937376594e-06, + "loss": 0.77245057, + "num_input_tokens_seen": 78364085, + "router_z_loss_clip": 1.00390625, + "router_z_loss_mlp": 0.17211914, + "step": 3632, + "time_per_iteration": 2.4582924842834473 + }, + { + "auxiliary_loss_clip": 0.01164759, + "auxiliary_loss_mlp": 0.01045459, + "balance_loss_clip": 1.06496656, + "balance_loss_mlp": 1.0286628, + "epoch": 0.2184277769427326, + "flos": 19937604775680.0, + "grad_norm": 2.292036245312926, + "language_loss": 0.84012663, + "learning_rate": 3.638967767095249e-06, + "loss": 0.86222887, + "num_input_tokens_seen": 78381385, + "router_z_loss_clip": 0.99902344, + "router_z_loss_mlp": 0.16796875, + "step": 3633, + "time_per_iteration": 2.472424268722534 + }, + { + "auxiliary_loss_clip": 0.01152474, + "auxiliary_loss_mlp": 0.01047143, + "balance_loss_clip": 1.05528808, + "balance_loss_mlp": 1.02971447, + "epoch": 0.21848790019540057, + "flos": 20340235301760.0, + "grad_norm": 1.6890073710453295, + "language_loss": 0.81634659, + "learning_rate": 3.6387445346640823e-06, + "loss": 0.83834279, + "num_input_tokens_seen": 78400500, + "router_z_loss_clip": 0.97265625, + "router_z_loss_mlp": 0.17431641, + "step": 3634, + "time_per_iteration": 3.8598997592926025 + }, + { + "auxiliary_loss_clip": 0.01158425, + "auxiliary_loss_mlp": 0.01050901, + "balance_loss_clip": 1.05650568, + "balance_loss_mlp": 1.03136265, + "epoch": 0.21854802344806853, + "flos": 15450818135040.0, + "grad_norm": 1.7695780505219978, + "language_loss": 0.75249594, + "learning_rate": 3.638521240091558e-06, + "loss": 0.77458918, + "num_input_tokens_seen": 78418340, + "router_z_loss_clip": 1.02050781, + "router_z_loss_mlp": 0.19543457, + "step": 3635, + "time_per_iteration": 2.4752349853515625 + }, + { + "auxiliary_loss_clip": 0.01181686, + "auxiliary_loss_mlp": 0.0104884, + "balance_loss_clip": 1.08071232, + "balance_loss_mlp": 1.03205562, + "epoch": 0.2186081467007365, + "flos": 16320717711360.0, + "grad_norm": 1.9208908876501665, + "language_loss": 0.88143587, + "learning_rate": 3.6382978833861445e-06, + "loss": 0.90374106, + "num_input_tokens_seen": 78434375, + "router_z_loss_clip": 1.00927734, + "router_z_loss_mlp": 0.16784668, + "step": 3636, + "time_per_iteration": 2.4327738285064697 + }, + { + "auxiliary_loss_clip": 0.01170607, + "auxiliary_loss_mlp": 0.01049052, + "balance_loss_clip": 1.06815147, + "balance_loss_mlp": 1.03139675, + "epoch": 0.2186682699534045, + "flos": 21689255416320.0, + "grad_norm": 2.1498367073992175, + "language_loss": 0.76170969, + "learning_rate": 3.638074464556311e-06, + "loss": 0.78390628, + "num_input_tokens_seen": 78451735, + "router_z_loss_clip": 1.02441406, + "router_z_loss_mlp": 0.1763916, + "step": 3637, + "time_per_iteration": 3.893695116043091 + }, + { + "auxiliary_loss_clip": 0.01165152, + "auxiliary_loss_mlp": 0.0104032, + "balance_loss_clip": 1.06388688, + "balance_loss_mlp": 1.02199781, + "epoch": 0.21872839320607246, + "flos": 17739260599680.0, + "grad_norm": 2.85782176481264, + "language_loss": 0.90661252, + "learning_rate": 3.63785098361053e-06, + "loss": 0.92866725, + "num_input_tokens_seen": 78462730, + "router_z_loss_clip": 1.01220703, + "router_z_loss_mlp": 0.18334961, + "step": 3638, + "time_per_iteration": 3.9417614936828613 + }, + { + "auxiliary_loss_clip": 0.01154594, + "auxiliary_loss_mlp": 0.01048727, + "balance_loss_clip": 1.05684638, + "balance_loss_mlp": 1.03098893, + "epoch": 0.21878851645874042, + "flos": 18652289431680.0, + "grad_norm": 3.285671010521439, + "language_loss": 0.89553499, + "learning_rate": 3.637627440557275e-06, + "loss": 0.91756821, + "num_input_tokens_seen": 78476300, + "router_z_loss_clip": 0.97802734, + "router_z_loss_mlp": 0.17736816, + "step": 3639, + "time_per_iteration": 2.4089713096618652 + }, + { + "auxiliary_loss_clip": 0.01154925, + "auxiliary_loss_mlp": 0.01040882, + "balance_loss_clip": 1.05579436, + "balance_loss_mlp": 1.02406156, + "epoch": 0.2188486397114084, + "flos": 25557301353600.0, + "grad_norm": 1.922744602801638, + "language_loss": 0.79453367, + "learning_rate": 3.637403835405024e-06, + "loss": 0.81649178, + "num_input_tokens_seen": 78496135, + "router_z_loss_clip": 0.99267578, + "router_z_loss_mlp": 0.16833496, + "step": 3640, + "time_per_iteration": 2.618380069732666 + }, + { + "auxiliary_loss_clip": 0.01157597, + "auxiliary_loss_mlp": 0.01049003, + "balance_loss_clip": 1.05886233, + "balance_loss_mlp": 1.03016758, + "epoch": 0.21890876296407635, + "flos": 17892061056000.0, + "grad_norm": 2.278786153650627, + "language_loss": 0.72002584, + "learning_rate": 3.637180168162255e-06, + "loss": 0.74209183, + "num_input_tokens_seen": 78513855, + "router_z_loss_clip": 0.98828125, + "router_z_loss_mlp": 0.18835449, + "step": 3641, + "time_per_iteration": 2.448443651199341 + }, + { + "auxiliary_loss_clip": 0.01164006, + "auxiliary_loss_mlp": 0.01040992, + "balance_loss_clip": 1.0634799, + "balance_loss_mlp": 1.02410018, + "epoch": 0.21896888621674432, + "flos": 17749100926080.0, + "grad_norm": 3.241240936281432, + "language_loss": 0.80997396, + "learning_rate": 3.63695643883745e-06, + "loss": 0.83202392, + "num_input_tokens_seen": 78531740, + "router_z_loss_clip": 1.00488281, + "router_z_loss_mlp": 0.16894531, + "step": 3642, + "time_per_iteration": 2.4644060134887695 + }, + { + "auxiliary_loss_clip": 0.01165421, + "auxiliary_loss_mlp": 0.01049952, + "balance_loss_clip": 1.06554675, + "balance_loss_mlp": 1.03152204, + "epoch": 0.21902900946941228, + "flos": 23076161400960.0, + "grad_norm": 1.6946609301603544, + "language_loss": 0.72021854, + "learning_rate": 3.6367326474390928e-06, + "loss": 0.74237227, + "num_input_tokens_seen": 78549600, + "router_z_loss_clip": 0.99853516, + "router_z_loss_mlp": 0.18432617, + "step": 3643, + "time_per_iteration": 2.4803965091705322 + }, + { + "auxiliary_loss_clip": 0.01159496, + "auxiliary_loss_mlp": 0.01048879, + "balance_loss_clip": 1.05868196, + "balance_loss_mlp": 1.02980566, + "epoch": 0.21908913272208028, + "flos": 48178545004800.0, + "grad_norm": 1.7774780371315613, + "language_loss": 0.68393779, + "learning_rate": 3.6365087939756696e-06, + "loss": 0.70602155, + "num_input_tokens_seen": 78573350, + "router_z_loss_clip": 1.0078125, + "router_z_loss_mlp": 0.1907959, + "step": 3644, + "time_per_iteration": 2.7192296981811523 + }, + { + "auxiliary_loss_clip": 0.01168741, + "auxiliary_loss_mlp": 0.01046015, + "balance_loss_clip": 1.06381965, + "balance_loss_mlp": 1.02785969, + "epoch": 0.21914925597474824, + "flos": 22236749493120.0, + "grad_norm": 3.166092900862215, + "language_loss": 0.78227359, + "learning_rate": 3.636284878455669e-06, + "loss": 0.80442113, + "num_input_tokens_seen": 78591005, + "router_z_loss_clip": 1.04931641, + "router_z_loss_mlp": 0.18139648, + "step": 3645, + "time_per_iteration": 2.452258825302124 + }, + { + "auxiliary_loss_clip": 0.01160947, + "auxiliary_loss_mlp": 0.01051986, + "balance_loss_clip": 1.06447983, + "balance_loss_mlp": 1.03516543, + "epoch": 0.2192093792274162, + "flos": 22125605834880.0, + "grad_norm": 1.5730240967651914, + "language_loss": 0.82322991, + "learning_rate": 3.636060900887582e-06, + "loss": 0.84535915, + "num_input_tokens_seen": 78610645, + "router_z_loss_clip": 0.96484375, + "router_z_loss_mlp": 0.16809082, + "step": 3646, + "time_per_iteration": 2.530583620071411 + }, + { + "auxiliary_loss_clip": 0.01152893, + "auxiliary_loss_mlp": 0.01036825, + "balance_loss_clip": 1.05796957, + "balance_loss_mlp": 1.02007592, + "epoch": 0.21926950248008417, + "flos": 15669442264320.0, + "grad_norm": 1.9088893771878699, + "language_loss": 0.83274615, + "learning_rate": 3.635836861279901e-06, + "loss": 0.85464334, + "num_input_tokens_seen": 78628340, + "router_z_loss_clip": 0.94873047, + "router_z_loss_mlp": 0.16760254, + "step": 3647, + "time_per_iteration": 2.471332550048828 + }, + { + "auxiliary_loss_clip": 0.01153827, + "auxiliary_loss_mlp": 0.01048227, + "balance_loss_clip": 1.05750704, + "balance_loss_mlp": 1.03108478, + "epoch": 0.21932962573275214, + "flos": 30262496641920.0, + "grad_norm": 1.6103814038802815, + "language_loss": 0.72521335, + "learning_rate": 3.635612759641123e-06, + "loss": 0.74723393, + "num_input_tokens_seen": 78649355, + "router_z_loss_clip": 0.96386719, + "router_z_loss_mlp": 0.17138672, + "step": 3648, + "time_per_iteration": 2.587805986404419 + }, + { + "auxiliary_loss_clip": 0.01157213, + "auxiliary_loss_mlp": 0.0104294, + "balance_loss_clip": 1.05573106, + "balance_loss_mlp": 1.02338994, + "epoch": 0.2193897489854201, + "flos": 10780132838400.0, + "grad_norm": 2.3994701170344452, + "language_loss": 0.74501783, + "learning_rate": 3.635388595979745e-06, + "loss": 0.76701939, + "num_input_tokens_seen": 78664915, + "router_z_loss_clip": 1.01416016, + "router_z_loss_mlp": 0.19555664, + "step": 3649, + "time_per_iteration": 2.4426798820495605 + }, + { + "auxiliary_loss_clip": 0.01153529, + "auxiliary_loss_mlp": 0.01046095, + "balance_loss_clip": 1.0579102, + "balance_loss_mlp": 1.02886915, + "epoch": 0.21944987223808807, + "flos": 19133313390720.0, + "grad_norm": 2.0499523070133665, + "language_loss": 0.86394036, + "learning_rate": 3.635164370304267e-06, + "loss": 0.88593656, + "num_input_tokens_seen": 78681475, + "router_z_loss_clip": 0.95654297, + "router_z_loss_mlp": 0.17211914, + "step": 3650, + "time_per_iteration": 2.4795241355895996 + }, + { + "auxiliary_loss_clip": 0.01152942, + "auxiliary_loss_mlp": 0.01043948, + "balance_loss_clip": 1.05430961, + "balance_loss_mlp": 1.02568555, + "epoch": 0.21950999549075606, + "flos": 22711093522560.0, + "grad_norm": 2.089896084671301, + "language_loss": 0.83652806, + "learning_rate": 3.6349400826231927e-06, + "loss": 0.85849702, + "num_input_tokens_seen": 78702300, + "router_z_loss_clip": 0.98632812, + "router_z_loss_mlp": 0.18273926, + "step": 3651, + "time_per_iteration": 2.6671998500823975 + }, + { + "auxiliary_loss_clip": 0.01153793, + "auxiliary_loss_mlp": 0.01047154, + "balance_loss_clip": 1.05549645, + "balance_loss_mlp": 1.02970815, + "epoch": 0.21957011874342403, + "flos": 10561329141120.0, + "grad_norm": 2.1318970464604563, + "language_loss": 0.74634397, + "learning_rate": 3.634715732945027e-06, + "loss": 0.76835346, + "num_input_tokens_seen": 78720230, + "router_z_loss_clip": 0.98291016, + "router_z_loss_mlp": 0.17449951, + "step": 3652, + "time_per_iteration": 2.5355753898620605 + }, + { + "auxiliary_loss_clip": 0.01097406, + "auxiliary_loss_mlp": 0.01025553, + "balance_loss_clip": 1.0607748, + "balance_loss_mlp": 1.02322555, + "epoch": 0.219630241996092, + "flos": 65747913252480.0, + "grad_norm": 0.7339172456805746, + "language_loss": 0.51516342, + "learning_rate": 3.6344913212782764e-06, + "loss": 0.53639305, + "num_input_tokens_seen": 78780200, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 0.02328491, + "step": 3653, + "time_per_iteration": 3.087923288345337 + }, + { + "auxiliary_loss_clip": 0.01159535, + "auxiliary_loss_mlp": 0.0105029, + "balance_loss_clip": 1.06181169, + "balance_loss_mlp": 1.03285003, + "epoch": 0.21969036524875996, + "flos": 23696518216320.0, + "grad_norm": 2.3061267496039743, + "language_loss": 0.75673473, + "learning_rate": 3.6342668476314514e-06, + "loss": 0.77883303, + "num_input_tokens_seen": 78800575, + "router_z_loss_clip": 0.97753906, + "router_z_loss_mlp": 0.17431641, + "step": 3654, + "time_per_iteration": 2.515925645828247 + }, + { + "auxiliary_loss_clip": 0.01167725, + "auxiliary_loss_mlp": 0.01044338, + "balance_loss_clip": 1.06775641, + "balance_loss_mlp": 1.02631402, + "epoch": 0.21975048850142792, + "flos": 19640910435840.0, + "grad_norm": 3.498408617446454, + "language_loss": 0.72756463, + "learning_rate": 3.634042312013064e-06, + "loss": 0.74968529, + "num_input_tokens_seen": 78819585, + "router_z_loss_clip": 1.00097656, + "router_z_loss_mlp": 0.18017578, + "step": 3655, + "time_per_iteration": 2.488386869430542 + }, + { + "auxiliary_loss_clip": 0.01161991, + "auxiliary_loss_mlp": 0.01040977, + "balance_loss_clip": 1.06261981, + "balance_loss_mlp": 1.0240252, + "epoch": 0.21981061175409589, + "flos": 22448550038400.0, + "grad_norm": 1.6573665168211733, + "language_loss": 0.80852139, + "learning_rate": 3.6338177144316276e-06, + "loss": 0.83055103, + "num_input_tokens_seen": 78837330, + "router_z_loss_clip": 0.99316406, + "router_z_loss_mlp": 0.16943359, + "step": 3656, + "time_per_iteration": 2.4907572269439697 + }, + { + "auxiliary_loss_clip": 0.01164542, + "auxiliary_loss_mlp": 0.0103686, + "balance_loss_clip": 1.06618476, + "balance_loss_mlp": 1.01995635, + "epoch": 0.21987073500676388, + "flos": 18151049093760.0, + "grad_norm": 2.2767347233901587, + "language_loss": 0.84970105, + "learning_rate": 3.63359305489566e-06, + "loss": 0.87171507, + "num_input_tokens_seen": 78854955, + "router_z_loss_clip": 0.98388672, + "router_z_loss_mlp": 0.16906738, + "step": 3657, + "time_per_iteration": 2.4522218704223633 + }, + { + "auxiliary_loss_clip": 0.0116318, + "auxiliary_loss_mlp": 0.01041532, + "balance_loss_clip": 1.06190455, + "balance_loss_mlp": 1.02365065, + "epoch": 0.21993085825943184, + "flos": 25626177682560.0, + "grad_norm": 1.621041102448477, + "language_loss": 0.80776477, + "learning_rate": 3.6333683334136803e-06, + "loss": 0.82981193, + "num_input_tokens_seen": 78874965, + "router_z_loss_clip": 1.01220703, + "router_z_loss_mlp": 0.17895508, + "step": 3658, + "time_per_iteration": 2.5891098976135254 + }, + { + "auxiliary_loss_clip": 0.01100182, + "auxiliary_loss_mlp": 0.01006935, + "balance_loss_clip": 1.06529725, + "balance_loss_mlp": 1.00442231, + "epoch": 0.2199909815120998, + "flos": 70923217743360.0, + "grad_norm": 0.7846579236454648, + "language_loss": 0.58229971, + "learning_rate": 3.6331435499942095e-06, + "loss": 0.60337085, + "num_input_tokens_seen": 78937740, + "router_z_loss_clip": 0.34863281, + "router_z_loss_mlp": 0.02511597, + "step": 3659, + "time_per_iteration": 3.214437246322632 + }, + { + "auxiliary_loss_clip": 0.01154872, + "auxiliary_loss_mlp": 0.01036919, + "balance_loss_clip": 1.05960107, + "balance_loss_mlp": 1.01951444, + "epoch": 0.22005110476476777, + "flos": 21543529939200.0, + "grad_norm": 2.1187541444346327, + "language_loss": 0.74671477, + "learning_rate": 3.632918704645772e-06, + "loss": 0.76863271, + "num_input_tokens_seen": 78955055, + "router_z_loss_clip": 0.95361328, + "router_z_loss_mlp": 0.17407227, + "step": 3660, + "time_per_iteration": 2.498208999633789 + }, + { + "auxiliary_loss_clip": 0.01155789, + "auxiliary_loss_mlp": 0.01040879, + "balance_loss_clip": 1.05635905, + "balance_loss_mlp": 1.0232594, + "epoch": 0.22011122801743574, + "flos": 22054502862720.0, + "grad_norm": 2.172408615504924, + "language_loss": 0.81574667, + "learning_rate": 3.632693797376893e-06, + "loss": 0.83771336, + "num_input_tokens_seen": 78974895, + "router_z_loss_clip": 0.99560547, + "router_z_loss_mlp": 0.17602539, + "step": 3661, + "time_per_iteration": 2.507784605026245 + }, + { + "auxiliary_loss_clip": 0.01173086, + "auxiliary_loss_mlp": 0.01044779, + "balance_loss_clip": 1.07313466, + "balance_loss_mlp": 1.02885294, + "epoch": 0.2201713512701037, + "flos": 26687589598080.0, + "grad_norm": 1.8596686007306797, + "language_loss": 0.73222607, + "learning_rate": 3.632468828196102e-06, + "loss": 0.75440472, + "num_input_tokens_seen": 78994990, + "router_z_loss_clip": 1.00097656, + "router_z_loss_mlp": 0.15917969, + "step": 3662, + "time_per_iteration": 2.5321412086486816 + }, + { + "auxiliary_loss_clip": 0.01159342, + "auxiliary_loss_mlp": 0.01049146, + "balance_loss_clip": 1.06350088, + "balance_loss_mlp": 1.03318357, + "epoch": 0.22023147452277167, + "flos": 22162198815360.0, + "grad_norm": 1.4401497830121708, + "language_loss": 0.77978492, + "learning_rate": 3.632243797111929e-06, + "loss": 0.80186981, + "num_input_tokens_seen": 79014405, + "router_z_loss_clip": 0.95849609, + "router_z_loss_mlp": 0.15966797, + "step": 3663, + "time_per_iteration": 2.6538403034210205 + }, + { + "auxiliary_loss_clip": 0.01158955, + "auxiliary_loss_mlp": 0.01045658, + "balance_loss_clip": 1.06083918, + "balance_loss_mlp": 1.02858686, + "epoch": 0.22029159777543966, + "flos": 22523280284160.0, + "grad_norm": 1.823568473010468, + "language_loss": 0.80757928, + "learning_rate": 3.632018704132908e-06, + "loss": 0.82962549, + "num_input_tokens_seen": 79032375, + "router_z_loss_clip": 0.98144531, + "router_z_loss_mlp": 0.1706543, + "step": 3664, + "time_per_iteration": 2.5969033241271973 + }, + { + "auxiliary_loss_clip": 0.01166011, + "auxiliary_loss_mlp": 0.01042281, + "balance_loss_clip": 1.06251717, + "balance_loss_mlp": 1.02321959, + "epoch": 0.22035172102810763, + "flos": 13042469093760.0, + "grad_norm": 2.462484358599596, + "language_loss": 0.76644439, + "learning_rate": 3.6317935492675742e-06, + "loss": 0.78852731, + "num_input_tokens_seen": 79049635, + "router_z_loss_clip": 1.03466797, + "router_z_loss_mlp": 0.19055176, + "step": 3665, + "time_per_iteration": 2.5461971759796143 + }, + { + "auxiliary_loss_clip": 0.01158703, + "auxiliary_loss_mlp": 0.01052133, + "balance_loss_clip": 1.06216145, + "balance_loss_mlp": 1.03499067, + "epoch": 0.2204118442807756, + "flos": 12165817760640.0, + "grad_norm": 3.025090930278757, + "language_loss": 0.98559648, + "learning_rate": 3.631568332524466e-06, + "loss": 1.00770485, + "num_input_tokens_seen": 79062890, + "router_z_loss_clip": 0.96533203, + "router_z_loss_mlp": 0.17138672, + "step": 3666, + "time_per_iteration": 2.487170934677124 + }, + { + "auxiliary_loss_clip": 0.01152748, + "auxiliary_loss_mlp": 0.01042153, + "balance_loss_clip": 1.05690658, + "balance_loss_mlp": 1.02495146, + "epoch": 0.22047196753344356, + "flos": 40108806673920.0, + "grad_norm": 1.664005799565827, + "language_loss": 0.80620372, + "learning_rate": 3.631343053912122e-06, + "loss": 0.82815272, + "num_input_tokens_seen": 79085495, + "router_z_loss_clip": 0.95898438, + "router_z_loss_mlp": 0.17199707, + "step": 3667, + "time_per_iteration": 2.6747944355010986 + }, + { + "auxiliary_loss_clip": 0.0116461, + "auxiliary_loss_mlp": 0.01047982, + "balance_loss_clip": 1.06486201, + "balance_loss_mlp": 1.02919447, + "epoch": 0.22053209078611152, + "flos": 20701137202560.0, + "grad_norm": 1.662721224058661, + "language_loss": 0.77352417, + "learning_rate": 3.631117713439087e-06, + "loss": 0.79565012, + "num_input_tokens_seen": 79101820, + "router_z_loss_clip": 0.99804688, + "router_z_loss_mlp": 0.18786621, + "step": 3668, + "time_per_iteration": 2.498032331466675 + }, + { + "auxiliary_loss_clip": 0.01161965, + "auxiliary_loss_mlp": 0.01045778, + "balance_loss_clip": 1.06304514, + "balance_loss_mlp": 1.02856398, + "epoch": 0.2205922140387795, + "flos": 24716309247360.0, + "grad_norm": 1.7333801392651356, + "language_loss": 0.71357977, + "learning_rate": 3.630892311113904e-06, + "loss": 0.73565722, + "num_input_tokens_seen": 79123320, + "router_z_loss_clip": 0.98925781, + "router_z_loss_mlp": 0.17224121, + "step": 3669, + "time_per_iteration": 3.9924347400665283 + }, + { + "auxiliary_loss_clip": 0.01166596, + "auxiliary_loss_mlp": 0.0104009, + "balance_loss_clip": 1.06852567, + "balance_loss_mlp": 1.02359152, + "epoch": 0.22065233729144745, + "flos": 23477247642240.0, + "grad_norm": 2.1241894100959766, + "language_loss": 0.85819608, + "learning_rate": 3.6306668469451215e-06, + "loss": 0.88026297, + "num_input_tokens_seen": 79141615, + "router_z_loss_clip": 0.98095703, + "router_z_loss_mlp": 0.16491699, + "step": 3670, + "time_per_iteration": 2.539273738861084 + }, + { + "auxiliary_loss_clip": 0.01170838, + "auxiliary_loss_mlp": 0.01042769, + "balance_loss_clip": 1.0685966, + "balance_loss_mlp": 1.02559137, + "epoch": 0.22071246054411545, + "flos": 35225566646400.0, + "grad_norm": 2.3872442294093847, + "language_loss": 0.76962435, + "learning_rate": 3.6304413209412886e-06, + "loss": 0.79176044, + "num_input_tokens_seen": 79164910, + "router_z_loss_clip": 1.02197266, + "router_z_loss_mlp": 0.171875, + "step": 3671, + "time_per_iteration": 2.6021671295166016 + }, + { + "auxiliary_loss_clip": 0.01156398, + "auxiliary_loss_mlp": 0.01042566, + "balance_loss_clip": 1.05915499, + "balance_loss_mlp": 1.02575803, + "epoch": 0.2207725837967834, + "flos": 18150294908160.0, + "grad_norm": 1.997951005322033, + "language_loss": 0.80887055, + "learning_rate": 3.6302157331109573e-06, + "loss": 0.83086014, + "num_input_tokens_seen": 79179685, + "router_z_loss_clip": 0.97265625, + "router_z_loss_mlp": 0.16821289, + "step": 3672, + "time_per_iteration": 2.5131418704986572 + }, + { + "auxiliary_loss_clip": 0.011675, + "auxiliary_loss_mlp": 0.01058868, + "balance_loss_clip": 1.06634998, + "balance_loss_mlp": 1.04199982, + "epoch": 0.22083270704945138, + "flos": 20479675898880.0, + "grad_norm": 2.004085094964945, + "language_loss": 0.73309743, + "learning_rate": 3.629990083462682e-06, + "loss": 0.75536108, + "num_input_tokens_seen": 79196285, + "router_z_loss_clip": 1.01171875, + "router_z_loss_mlp": 0.16882324, + "step": 3673, + "time_per_iteration": 2.451914072036743 + }, + { + "auxiliary_loss_clip": 0.01164144, + "auxiliary_loss_mlp": 0.01044971, + "balance_loss_clip": 1.06575155, + "balance_loss_mlp": 1.02525353, + "epoch": 0.22089283030211934, + "flos": 34125801984000.0, + "grad_norm": 2.2438690556301255, + "language_loss": 0.77108485, + "learning_rate": 3.6297643720050203e-06, + "loss": 0.793176, + "num_input_tokens_seen": 79216060, + "router_z_loss_clip": 0.98339844, + "router_z_loss_mlp": 0.19702148, + "step": 3674, + "time_per_iteration": 2.5986862182617188 + }, + { + "auxiliary_loss_clip": 0.0117765, + "auxiliary_loss_mlp": 0.01045671, + "balance_loss_clip": 1.07597852, + "balance_loss_mlp": 1.0275985, + "epoch": 0.2209529535547873, + "flos": 18077216688000.0, + "grad_norm": 2.2794133121730353, + "language_loss": 0.74357706, + "learning_rate": 3.6295385987465293e-06, + "loss": 0.76581025, + "num_input_tokens_seen": 79235145, + "router_z_loss_clip": 1.01708984, + "router_z_loss_mlp": 0.1809082, + "step": 3675, + "time_per_iteration": 2.491406202316284 + }, + { + "auxiliary_loss_clip": 0.01163653, + "auxiliary_loss_mlp": 0.01044024, + "balance_loss_clip": 1.06346011, + "balance_loss_mlp": 1.02701306, + "epoch": 0.22101307680745527, + "flos": 27235335070080.0, + "grad_norm": 1.664407257912672, + "language_loss": 0.80268466, + "learning_rate": 3.629312763695772e-06, + "loss": 0.82476139, + "num_input_tokens_seen": 79256960, + "router_z_loss_clip": 1.00195312, + "router_z_loss_mlp": 0.17016602, + "step": 3676, + "time_per_iteration": 2.6425108909606934 + }, + { + "auxiliary_loss_clip": 0.0117292, + "auxiliary_loss_mlp": 0.01050477, + "balance_loss_clip": 1.07051885, + "balance_loss_mlp": 1.03323948, + "epoch": 0.22107320006012326, + "flos": 16543256423040.0, + "grad_norm": 2.255338078356786, + "language_loss": 0.7584874, + "learning_rate": 3.6290868668613107e-06, + "loss": 0.78072131, + "num_input_tokens_seen": 79274860, + "router_z_loss_clip": 1.02441406, + "router_z_loss_mlp": 0.17248535, + "step": 3677, + "time_per_iteration": 3.8803634643554688 + }, + { + "auxiliary_loss_clip": 0.01165722, + "auxiliary_loss_mlp": 0.01048073, + "balance_loss_clip": 1.06843007, + "balance_loss_mlp": 1.03156233, + "epoch": 0.22113332331279123, + "flos": 22054466949120.0, + "grad_norm": 1.7725769816576844, + "language_loss": 0.83168983, + "learning_rate": 3.628860908251712e-06, + "loss": 0.85382783, + "num_input_tokens_seen": 79294005, + "router_z_loss_clip": 0.97412109, + "router_z_loss_mlp": 0.16516113, + "step": 3678, + "time_per_iteration": 2.517212390899658 + }, + { + "auxiliary_loss_clip": 0.01165343, + "auxiliary_loss_mlp": 0.01045922, + "balance_loss_clip": 1.06716609, + "balance_loss_mlp": 1.02878022, + "epoch": 0.2211934465654592, + "flos": 26612787525120.0, + "grad_norm": 2.0877207384795615, + "language_loss": 0.89242333, + "learning_rate": 3.6286348878755452e-06, + "loss": 0.914536, + "num_input_tokens_seen": 79314005, + "router_z_loss_clip": 0.98095703, + "router_z_loss_mlp": 0.17150879, + "step": 3679, + "time_per_iteration": 2.5964667797088623 + }, + { + "auxiliary_loss_clip": 0.01165711, + "auxiliary_loss_mlp": 0.01052719, + "balance_loss_clip": 1.06455541, + "balance_loss_mlp": 1.03455102, + "epoch": 0.22125356981812716, + "flos": 16360363347840.0, + "grad_norm": 2.346456060635005, + "language_loss": 0.86696047, + "learning_rate": 3.6284088057413803e-06, + "loss": 0.88914472, + "num_input_tokens_seen": 79331030, + "router_z_loss_clip": 1.01123047, + "router_z_loss_mlp": 0.1817627, + "step": 3680, + "time_per_iteration": 2.5137832164764404 + }, + { + "auxiliary_loss_clip": 0.01155761, + "auxiliary_loss_mlp": 0.0104723, + "balance_loss_clip": 1.06072235, + "balance_loss_mlp": 1.0302664, + "epoch": 0.22131369307079513, + "flos": 21651118151040.0, + "grad_norm": 2.0481937404872417, + "language_loss": 0.81957424, + "learning_rate": 3.6281826618577894e-06, + "loss": 0.84160417, + "num_input_tokens_seen": 79348560, + "router_z_loss_clip": 0.94921875, + "router_z_loss_mlp": 0.16955566, + "step": 3681, + "time_per_iteration": 5.230751037597656 + }, + { + "auxiliary_loss_clip": 0.01155841, + "auxiliary_loss_mlp": 0.01042001, + "balance_loss_clip": 1.06039596, + "balance_loss_mlp": 1.02520442, + "epoch": 0.2213738163234631, + "flos": 19609524927360.0, + "grad_norm": 2.5402458269050485, + "language_loss": 0.79896927, + "learning_rate": 3.62795645623335e-06, + "loss": 0.82094765, + "num_input_tokens_seen": 79367175, + "router_z_loss_clip": 0.95507812, + "router_z_loss_mlp": 0.16784668, + "step": 3682, + "time_per_iteration": 2.4907891750335693 + }, + { + "auxiliary_loss_clip": 0.01165653, + "auxiliary_loss_mlp": 0.01047681, + "balance_loss_clip": 1.0640198, + "balance_loss_mlp": 1.02939487, + "epoch": 0.22143393957613106, + "flos": 23623404082560.0, + "grad_norm": 1.7077947192435738, + "language_loss": 0.77422982, + "learning_rate": 3.627730188876638e-06, + "loss": 0.79636323, + "num_input_tokens_seen": 79388435, + "router_z_loss_clip": 1.01513672, + "router_z_loss_mlp": 0.1829834, + "step": 3683, + "time_per_iteration": 2.4746882915496826 + }, + { + "auxiliary_loss_clip": 0.01168277, + "auxiliary_loss_mlp": 0.01042525, + "balance_loss_clip": 1.06712687, + "balance_loss_mlp": 1.0249778, + "epoch": 0.22149406282879905, + "flos": 26177801823360.0, + "grad_norm": 2.278258230887239, + "language_loss": 0.72377491, + "learning_rate": 3.627503859796234e-06, + "loss": 0.74588299, + "num_input_tokens_seen": 79407910, + "router_z_loss_clip": 1.01123047, + "router_z_loss_mlp": 0.17541504, + "step": 3684, + "time_per_iteration": 2.5209364891052246 + }, + { + "auxiliary_loss_clip": 0.01169723, + "auxiliary_loss_mlp": 0.01046205, + "balance_loss_clip": 1.07031131, + "balance_loss_mlp": 1.02793026, + "epoch": 0.221554186081467, + "flos": 14538758970240.0, + "grad_norm": 1.95919487794081, + "language_loss": 0.80135554, + "learning_rate": 3.6272774690007207e-06, + "loss": 0.82351482, + "num_input_tokens_seen": 79424020, + "router_z_loss_clip": 0.99316406, + "router_z_loss_mlp": 0.18261719, + "step": 3685, + "time_per_iteration": 2.476942539215088 + }, + { + "auxiliary_loss_clip": 0.01157502, + "auxiliary_loss_mlp": 0.01043577, + "balance_loss_clip": 1.06154871, + "balance_loss_mlp": 1.02740073, + "epoch": 0.22161430933413498, + "flos": 22238257864320.0, + "grad_norm": 1.3715514403414069, + "language_loss": 0.87576938, + "learning_rate": 3.6270510164986823e-06, + "loss": 0.89778012, + "num_input_tokens_seen": 79445605, + "router_z_loss_clip": 0.95898438, + "router_z_loss_mlp": 0.16186523, + "step": 3686, + "time_per_iteration": 2.5646636486053467 + }, + { + "auxiliary_loss_clip": 0.01149016, + "auxiliary_loss_mlp": 0.0104009, + "balance_loss_clip": 1.05284524, + "balance_loss_mlp": 1.02235222, + "epoch": 0.22167443258680294, + "flos": 23476529370240.0, + "grad_norm": 2.0656265730401686, + "language_loss": 0.77604073, + "learning_rate": 3.626824502298707e-06, + "loss": 0.79793179, + "num_input_tokens_seen": 79463850, + "router_z_loss_clip": 0.9609375, + "router_z_loss_mlp": 0.17736816, + "step": 3687, + "time_per_iteration": 2.6352462768554688 + }, + { + "auxiliary_loss_clip": 0.01164111, + "auxiliary_loss_mlp": 0.01049608, + "balance_loss_clip": 1.06238377, + "balance_loss_mlp": 1.03099966, + "epoch": 0.2217345558394709, + "flos": 23221132692480.0, + "grad_norm": 1.866761815030276, + "language_loss": 0.8474133, + "learning_rate": 3.626597926409383e-06, + "loss": 0.86955047, + "num_input_tokens_seen": 79482845, + "router_z_loss_clip": 1.01757812, + "router_z_loss_mlp": 0.18615723, + "step": 3688, + "time_per_iteration": 2.4777796268463135 + }, + { + "auxiliary_loss_clip": 0.01167437, + "auxiliary_loss_mlp": 0.01047595, + "balance_loss_clip": 1.06239271, + "balance_loss_mlp": 1.02961802, + "epoch": 0.22179467909213887, + "flos": 20011078045440.0, + "grad_norm": 3.414275016548136, + "language_loss": 0.81293309, + "learning_rate": 3.6263712888393027e-06, + "loss": 0.83508348, + "num_input_tokens_seen": 79501550, + "router_z_loss_clip": 1.04980469, + "router_z_loss_mlp": 0.17980957, + "step": 3689, + "time_per_iteration": 2.4785847663879395 + }, + { + "auxiliary_loss_clip": 0.0116142, + "auxiliary_loss_mlp": 0.01047472, + "balance_loss_clip": 1.06325769, + "balance_loss_mlp": 1.02985275, + "epoch": 0.22185480234480687, + "flos": 19683034110720.0, + "grad_norm": 6.9409727184037315, + "language_loss": 0.70039696, + "learning_rate": 3.626144589597061e-06, + "loss": 0.72248584, + "num_input_tokens_seen": 79519680, + "router_z_loss_clip": 0.98144531, + "router_z_loss_mlp": 0.17614746, + "step": 3690, + "time_per_iteration": 2.509774923324585 + }, + { + "auxiliary_loss_clip": 0.01157851, + "auxiliary_loss_mlp": 0.01042118, + "balance_loss_clip": 1.05661893, + "balance_loss_mlp": 1.02156651, + "epoch": 0.22191492559747483, + "flos": 21981316901760.0, + "grad_norm": 1.8387293171973824, + "language_loss": 0.72413534, + "learning_rate": 3.6259178286912528e-06, + "loss": 0.746135, + "num_input_tokens_seen": 79539000, + "router_z_loss_clip": 1.01171875, + "router_z_loss_mlp": 0.20556641, + "step": 3691, + "time_per_iteration": 2.531125783920288 + }, + { + "auxiliary_loss_clip": 0.01158968, + "auxiliary_loss_mlp": 0.01049437, + "balance_loss_clip": 1.06263828, + "balance_loss_mlp": 1.02957702, + "epoch": 0.2219750488501428, + "flos": 23222066446080.0, + "grad_norm": 1.921952712263278, + "language_loss": 0.72158605, + "learning_rate": 3.625691006130477e-06, + "loss": 0.74367011, + "num_input_tokens_seen": 79559695, + "router_z_loss_clip": 0.96484375, + "router_z_loss_mlp": 0.19873047, + "step": 3692, + "time_per_iteration": 2.4926984310150146 + }, + { + "auxiliary_loss_clip": 0.01162271, + "auxiliary_loss_mlp": 0.01045663, + "balance_loss_clip": 1.06165743, + "balance_loss_mlp": 1.02772176, + "epoch": 0.22203517210281076, + "flos": 22453685683200.0, + "grad_norm": 1.63474098485646, + "language_loss": 0.87501836, + "learning_rate": 3.6254641219233362e-06, + "loss": 0.89709771, + "num_input_tokens_seen": 79579095, + "router_z_loss_clip": 1.00537109, + "router_z_loss_mlp": 0.17932129, + "step": 3693, + "time_per_iteration": 2.480164051055908 + }, + { + "auxiliary_loss_clip": 0.01161974, + "auxiliary_loss_mlp": 0.01041919, + "balance_loss_clip": 1.06397891, + "balance_loss_mlp": 1.02556372, + "epoch": 0.22209529535547873, + "flos": 17564555825280.0, + "grad_norm": 2.0298366135870363, + "language_loss": 0.85517937, + "learning_rate": 3.6252371760784325e-06, + "loss": 0.87721825, + "num_input_tokens_seen": 79596430, + "router_z_loss_clip": 0.97949219, + "router_z_loss_mlp": 0.16357422, + "step": 3694, + "time_per_iteration": 2.462979793548584 + }, + { + "auxiliary_loss_clip": 0.01162854, + "auxiliary_loss_mlp": 0.01040388, + "balance_loss_clip": 1.05957246, + "balance_loss_mlp": 1.02292418, + "epoch": 0.2221554186081467, + "flos": 21469015175040.0, + "grad_norm": 3.5928446270140757, + "language_loss": 0.69845319, + "learning_rate": 3.6250101686043725e-06, + "loss": 0.72048563, + "num_input_tokens_seen": 79615825, + "router_z_loss_clip": 1.03222656, + "router_z_loss_mlp": 0.17468262, + "step": 3695, + "time_per_iteration": 2.4888782501220703 + }, + { + "auxiliary_loss_clip": 0.01162209, + "auxiliary_loss_mlp": 0.01040138, + "balance_loss_clip": 1.06941652, + "balance_loss_mlp": 1.02454543, + "epoch": 0.22221554186081466, + "flos": 27673445255040.0, + "grad_norm": 1.730712709906323, + "language_loss": 0.71950459, + "learning_rate": 3.6247830995097637e-06, + "loss": 0.74152803, + "num_input_tokens_seen": 79637875, + "router_z_loss_clip": 0.92773438, + "router_z_loss_mlp": 0.15576172, + "step": 3696, + "time_per_iteration": 2.5304155349731445 + }, + { + "auxiliary_loss_clip": 0.0116088, + "auxiliary_loss_mlp": 0.01044029, + "balance_loss_clip": 1.06259847, + "balance_loss_mlp": 1.02644515, + "epoch": 0.22227566511348265, + "flos": 25958926298880.0, + "grad_norm": 1.7049409521607386, + "language_loss": 0.87521005, + "learning_rate": 3.624555968803217e-06, + "loss": 0.89725912, + "num_input_tokens_seen": 79656970, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.17590332, + "step": 3697, + "time_per_iteration": 2.5103683471679688 + }, + { + "auxiliary_loss_clip": 0.0115132, + "auxiliary_loss_mlp": 0.01041818, + "balance_loss_clip": 1.05788124, + "balance_loss_mlp": 1.0261184, + "epoch": 0.22233578836615062, + "flos": 39203678833920.0, + "grad_norm": 1.701368152099383, + "language_loss": 0.66542685, + "learning_rate": 3.624328776493346e-06, + "loss": 0.68735826, + "num_input_tokens_seen": 79680275, + "router_z_loss_clip": 0.93408203, + "router_z_loss_mlp": 0.15692139, + "step": 3698, + "time_per_iteration": 2.6638882160186768 + }, + { + "auxiliary_loss_clip": 0.01157076, + "auxiliary_loss_mlp": 0.01040388, + "balance_loss_clip": 1.05602908, + "balance_loss_mlp": 1.02225649, + "epoch": 0.22239591161881858, + "flos": 36283782251520.0, + "grad_norm": 2.0971655180165176, + "language_loss": 0.820952, + "learning_rate": 3.6241015225887637e-06, + "loss": 0.84292662, + "num_input_tokens_seen": 79701255, + "router_z_loss_clip": 1.01074219, + "router_z_loss_mlp": 0.18139648, + "step": 3699, + "time_per_iteration": 2.590402126312256 + }, + { + "auxiliary_loss_clip": 0.01162347, + "auxiliary_loss_mlp": 0.01043955, + "balance_loss_clip": 1.06327176, + "balance_loss_mlp": 1.02662206, + "epoch": 0.22245603487148655, + "flos": 19719591177600.0, + "grad_norm": 1.4964381670520188, + "language_loss": 0.79685807, + "learning_rate": 3.62387420709809e-06, + "loss": 0.81892109, + "num_input_tokens_seen": 79721315, + "router_z_loss_clip": 0.99072266, + "router_z_loss_mlp": 0.17321777, + "step": 3700, + "time_per_iteration": 2.5614469051361084 + }, + { + "auxiliary_loss_clip": 0.01173527, + "auxiliary_loss_mlp": 0.01040891, + "balance_loss_clip": 1.06961226, + "balance_loss_mlp": 1.02252066, + "epoch": 0.2225161581241545, + "flos": 46280450615040.0, + "grad_norm": 2.058246783493184, + "language_loss": 0.72085059, + "learning_rate": 3.623646830029943e-06, + "loss": 0.74299479, + "num_input_tokens_seen": 79742705, + "router_z_loss_clip": 1.03955078, + "router_z_loss_mlp": 0.18371582, + "step": 3701, + "time_per_iteration": 2.679603099822998 + }, + { + "auxiliary_loss_clip": 0.01150114, + "auxiliary_loss_mlp": 0.01041535, + "balance_loss_clip": 1.05408168, + "balance_loss_mlp": 1.02471447, + "epoch": 0.22257628137682248, + "flos": 23696194993920.0, + "grad_norm": 2.2820643505045886, + "language_loss": 0.80093139, + "learning_rate": 3.6234193913929454e-06, + "loss": 0.82284784, + "num_input_tokens_seen": 79763000, + "router_z_loss_clip": 0.95996094, + "router_z_loss_mlp": 0.16796875, + "step": 3702, + "time_per_iteration": 2.504053831100464 + }, + { + "auxiliary_loss_clip": 0.01143961, + "auxiliary_loss_mlp": 0.01037615, + "balance_loss_clip": 1.05282021, + "balance_loss_mlp": 1.0207231, + "epoch": 0.22263640462949044, + "flos": 19353984595200.0, + "grad_norm": 1.7836044198919154, + "language_loss": 0.78344721, + "learning_rate": 3.623191891195723e-06, + "loss": 0.80526298, + "num_input_tokens_seen": 79781335, + "router_z_loss_clip": 0.91113281, + "router_z_loss_mlp": 0.16906738, + "step": 3703, + "time_per_iteration": 2.522432327270508 + }, + { + "auxiliary_loss_clip": 0.01164482, + "auxiliary_loss_mlp": 0.01046893, + "balance_loss_clip": 1.06317139, + "balance_loss_mlp": 1.02752113, + "epoch": 0.22269652788215843, + "flos": 20776047016320.0, + "grad_norm": 1.931664455194049, + "language_loss": 0.74186802, + "learning_rate": 3.6229643294469005e-06, + "loss": 0.76398182, + "num_input_tokens_seen": 79800150, + "router_z_loss_clip": 1.01220703, + "router_z_loss_mlp": 0.19372559, + "step": 3704, + "time_per_iteration": 2.52146053314209 + }, + { + "auxiliary_loss_clip": 0.01153738, + "auxiliary_loss_mlp": 0.01049166, + "balance_loss_clip": 1.05901527, + "balance_loss_mlp": 1.03202379, + "epoch": 0.2227566511348264, + "flos": 47958843467520.0, + "grad_norm": 1.8353555448730947, + "language_loss": 0.64197242, + "learning_rate": 3.6227367061551074e-06, + "loss": 0.66400146, + "num_input_tokens_seen": 79822390, + "router_z_loss_clip": 0.94677734, + "router_z_loss_mlp": 0.17126465, + "step": 3705, + "time_per_iteration": 2.7076430320739746 + }, + { + "auxiliary_loss_clip": 0.0108747, + "auxiliary_loss_mlp": 0.01026567, + "balance_loss_clip": 1.05379486, + "balance_loss_mlp": 1.02464211, + "epoch": 0.22281677438749437, + "flos": 66218953230720.0, + "grad_norm": 1.2738055724304147, + "language_loss": 0.6520763, + "learning_rate": 3.6225090213289766e-06, + "loss": 0.6732167, + "num_input_tokens_seen": 79873350, + "router_z_loss_clip": 0.33642578, + "router_z_loss_mlp": 0.01922607, + "step": 3706, + "time_per_iteration": 2.949679136276245 + }, + { + "auxiliary_loss_clip": 0.01165561, + "auxiliary_loss_mlp": 0.01042032, + "balance_loss_clip": 1.0664593, + "balance_loss_mlp": 1.02570069, + "epoch": 0.22287689764016233, + "flos": 21871609787520.0, + "grad_norm": 2.2719494586173634, + "language_loss": 0.80527985, + "learning_rate": 3.622281274977141e-06, + "loss": 0.8273558, + "num_input_tokens_seen": 79891715, + "router_z_loss_clip": 0.99169922, + "router_z_loss_mlp": 0.16333008, + "step": 3707, + "time_per_iteration": 2.476025342941284 + }, + { + "auxiliary_loss_clip": 0.01162148, + "auxiliary_loss_mlp": 0.01037318, + "balance_loss_clip": 1.06640959, + "balance_loss_mlp": 1.02117741, + "epoch": 0.2229370208928303, + "flos": 27672475587840.0, + "grad_norm": 1.8304949360845204, + "language_loss": 0.78644061, + "learning_rate": 3.6220534671082367e-06, + "loss": 0.80843526, + "num_input_tokens_seen": 79911175, + "router_z_loss_clip": 0.95849609, + "router_z_loss_mlp": 0.16125488, + "step": 3708, + "time_per_iteration": 2.5180716514587402 + }, + { + "auxiliary_loss_clip": 0.01169242, + "auxiliary_loss_mlp": 0.01043886, + "balance_loss_clip": 1.06988072, + "balance_loss_mlp": 1.02691031, + "epoch": 0.22299714414549826, + "flos": 30154657034880.0, + "grad_norm": 2.351904336470259, + "language_loss": 0.80006927, + "learning_rate": 3.6218255977309024e-06, + "loss": 0.82220054, + "num_input_tokens_seen": 79931875, + "router_z_loss_clip": 0.99316406, + "router_z_loss_mlp": 0.16967773, + "step": 3709, + "time_per_iteration": 2.555669069290161 + }, + { + "auxiliary_loss_clip": 0.01160868, + "auxiliary_loss_mlp": 0.01044299, + "balance_loss_clip": 1.06223369, + "balance_loss_mlp": 1.02745438, + "epoch": 0.22305726739816625, + "flos": 23143134309120.0, + "grad_norm": 1.912667752081982, + "language_loss": 0.68810594, + "learning_rate": 3.6215976668537787e-06, + "loss": 0.71015757, + "num_input_tokens_seen": 79952445, + "router_z_loss_clip": 0.98583984, + "router_z_loss_mlp": 0.16845703, + "step": 3710, + "time_per_iteration": 2.568822145462036 + }, + { + "auxiliary_loss_clip": 0.01161353, + "auxiliary_loss_mlp": 0.01050106, + "balance_loss_clip": 1.06000137, + "balance_loss_mlp": 1.03209305, + "epoch": 0.22311739065083422, + "flos": 19172061187200.0, + "grad_norm": 2.010558975508737, + "language_loss": 0.907785, + "learning_rate": 3.6213696744855096e-06, + "loss": 0.92989963, + "num_input_tokens_seen": 79971030, + "router_z_loss_clip": 1.01367188, + "router_z_loss_mlp": 0.18005371, + "step": 3711, + "time_per_iteration": 2.49103045463562 + }, + { + "auxiliary_loss_clip": 0.01161064, + "auxiliary_loss_mlp": 0.01049329, + "balance_loss_clip": 1.0625062, + "balance_loss_mlp": 1.03044581, + "epoch": 0.22317751390350218, + "flos": 13617757319040.0, + "grad_norm": 2.251224903598254, + "language_loss": 0.89493108, + "learning_rate": 3.6211416206347395e-06, + "loss": 0.91703504, + "num_input_tokens_seen": 79982085, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.1887207, + "step": 3712, + "time_per_iteration": 3.9688313007354736 + }, + { + "auxiliary_loss_clip": 0.01162147, + "auxiliary_loss_mlp": 0.01059496, + "balance_loss_clip": 1.06421614, + "balance_loss_mlp": 1.03991032, + "epoch": 0.22323763715617015, + "flos": 11029065068160.0, + "grad_norm": 2.9913310990782542, + "language_loss": 0.74861443, + "learning_rate": 3.620913505310117e-06, + "loss": 0.77083087, + "num_input_tokens_seen": 79997460, + "router_z_loss_clip": 0.97998047, + "router_z_loss_mlp": 0.19592285, + "step": 3713, + "time_per_iteration": 2.475771188735962 + }, + { + "auxiliary_loss_clip": 0.01159947, + "auxiliary_loss_mlp": 0.01044211, + "balance_loss_clip": 1.06204963, + "balance_loss_mlp": 1.02703249, + "epoch": 0.22329776040883811, + "flos": 41351531466240.0, + "grad_norm": 1.7847601934813504, + "language_loss": 0.62695408, + "learning_rate": 3.6206853285202917e-06, + "loss": 0.64899564, + "num_input_tokens_seen": 80022450, + "router_z_loss_clip": 0.98046875, + "router_z_loss_mlp": 0.17175293, + "step": 3714, + "time_per_iteration": 2.6779685020446777 + }, + { + "auxiliary_loss_clip": 0.01165148, + "auxiliary_loss_mlp": 0.0104068, + "balance_loss_clip": 1.06324553, + "balance_loss_mlp": 1.02351403, + "epoch": 0.22335788366150608, + "flos": 25119478477440.0, + "grad_norm": 1.932551584254267, + "language_loss": 0.79030657, + "learning_rate": 3.6204570902739164e-06, + "loss": 0.81236482, + "num_input_tokens_seen": 80042100, + "router_z_loss_clip": 1.01806641, + "router_z_loss_mlp": 0.17175293, + "step": 3715, + "time_per_iteration": 2.502610921859741 + }, + { + "auxiliary_loss_clip": 0.01164728, + "auxiliary_loss_mlp": 0.0105276, + "balance_loss_clip": 1.06524432, + "balance_loss_mlp": 1.03533149, + "epoch": 0.22341800691417404, + "flos": 16983377769600.0, + "grad_norm": 3.8462780863985917, + "language_loss": 0.77353936, + "learning_rate": 3.620228790579645e-06, + "loss": 0.79571426, + "num_input_tokens_seen": 80059690, + "router_z_loss_clip": 0.99316406, + "router_z_loss_mlp": 0.17431641, + "step": 3716, + "time_per_iteration": 2.425630807876587 + }, + { + "auxiliary_loss_clip": 0.01170084, + "auxiliary_loss_mlp": 0.01046713, + "balance_loss_clip": 1.06911027, + "balance_loss_mlp": 1.02921295, + "epoch": 0.22347813016684204, + "flos": 14136738975360.0, + "grad_norm": 2.170092849185032, + "language_loss": 0.78896523, + "learning_rate": 3.6200004294461367e-06, + "loss": 0.81113321, + "num_input_tokens_seen": 80076060, + "router_z_loss_clip": 1.00878906, + "router_z_loss_mlp": 0.17492676, + "step": 3717, + "time_per_iteration": 2.4482059478759766 + }, + { + "auxiliary_loss_clip": 0.01164291, + "auxiliary_loss_mlp": 0.01042954, + "balance_loss_clip": 1.06456256, + "balance_loss_mlp": 1.02514458, + "epoch": 0.22353825341951, + "flos": 23583147914880.0, + "grad_norm": 2.0162026125216506, + "language_loss": 0.67657137, + "learning_rate": 3.6197720068820497e-06, + "loss": 0.6986438, + "num_input_tokens_seen": 80094760, + "router_z_loss_clip": 0.99658203, + "router_z_loss_mlp": 0.17810059, + "step": 3718, + "time_per_iteration": 2.5091466903686523 + }, + { + "auxiliary_loss_clip": 0.0116823, + "auxiliary_loss_mlp": 0.01041473, + "balance_loss_clip": 1.06783414, + "balance_loss_mlp": 1.02280474, + "epoch": 0.22359837667217797, + "flos": 29824206888960.0, + "grad_norm": 1.566180189873668, + "language_loss": 0.80488837, + "learning_rate": 3.619543522896045e-06, + "loss": 0.82698536, + "num_input_tokens_seen": 80114475, + "router_z_loss_clip": 1.00488281, + "router_z_loss_mlp": 0.18664551, + "step": 3719, + "time_per_iteration": 2.5557022094726562 + }, + { + "auxiliary_loss_clip": 0.01170132, + "auxiliary_loss_mlp": 0.0105228, + "balance_loss_clip": 1.06589365, + "balance_loss_mlp": 1.0333612, + "epoch": 0.22365849992484593, + "flos": 17603088140160.0, + "grad_norm": 1.8530249513290409, + "language_loss": 0.86644369, + "learning_rate": 3.6193149774967885e-06, + "loss": 0.88866782, + "num_input_tokens_seen": 80132920, + "router_z_loss_clip": 1.04199219, + "router_z_loss_mlp": 0.18908691, + "step": 3720, + "time_per_iteration": 2.4485764503479004 + }, + { + "auxiliary_loss_clip": 0.01165541, + "auxiliary_loss_mlp": 0.01037347, + "balance_loss_clip": 1.06966758, + "balance_loss_mlp": 1.0201329, + "epoch": 0.2237186231775139, + "flos": 22710949868160.0, + "grad_norm": 1.6845012002527646, + "language_loss": 0.74427873, + "learning_rate": 3.619086370692945e-06, + "loss": 0.76630759, + "num_input_tokens_seen": 80152845, + "router_z_loss_clip": 0.95849609, + "router_z_loss_mlp": 0.17211914, + "step": 3721, + "time_per_iteration": 4.002334356307983 + }, + { + "auxiliary_loss_clip": 0.01166, + "auxiliary_loss_mlp": 0.01043147, + "balance_loss_clip": 1.06555557, + "balance_loss_mlp": 1.02507508, + "epoch": 0.22377874643018186, + "flos": 13371518609280.0, + "grad_norm": 2.3313912558303125, + "language_loss": 0.79120517, + "learning_rate": 3.6188577024931844e-06, + "loss": 0.81329668, + "num_input_tokens_seen": 80170680, + "router_z_loss_clip": 1.00390625, + "router_z_loss_mlp": 0.18078613, + "step": 3722, + "time_per_iteration": 2.528686046600342 + }, + { + "auxiliary_loss_clip": 0.01165833, + "auxiliary_loss_mlp": 0.01038791, + "balance_loss_clip": 1.07098246, + "balance_loss_mlp": 1.02300739, + "epoch": 0.22383886968284986, + "flos": 17894970057600.0, + "grad_norm": 2.1191457955715984, + "language_loss": 0.82628459, + "learning_rate": 3.618628972906178e-06, + "loss": 0.84833086, + "num_input_tokens_seen": 80189030, + "router_z_loss_clip": 0.94921875, + "router_z_loss_mlp": 0.15783691, + "step": 3723, + "time_per_iteration": 2.473240375518799 + }, + { + "auxiliary_loss_clip": 0.01168932, + "auxiliary_loss_mlp": 0.01045579, + "balance_loss_clip": 1.06978285, + "balance_loss_mlp": 1.02853215, + "epoch": 0.22389899293551782, + "flos": 23879123982720.0, + "grad_norm": 8.929998492344001, + "language_loss": 0.84591019, + "learning_rate": 3.6184001819405984e-06, + "loss": 0.86805534, + "num_input_tokens_seen": 80208365, + "router_z_loss_clip": 0.9921875, + "router_z_loss_mlp": 0.17041016, + "step": 3724, + "time_per_iteration": 3.9735193252563477 + }, + { + "auxiliary_loss_clip": 0.01165048, + "auxiliary_loss_mlp": 0.01036592, + "balance_loss_clip": 1.06769013, + "balance_loss_mlp": 1.01909232, + "epoch": 0.2239591161881858, + "flos": 27272430840960.0, + "grad_norm": 1.8619571147250116, + "language_loss": 0.79300845, + "learning_rate": 3.618171329605121e-06, + "loss": 0.81502485, + "num_input_tokens_seen": 80228685, + "router_z_loss_clip": 0.97412109, + "router_z_loss_mlp": 0.1751709, + "step": 3725, + "time_per_iteration": 3.9770615100860596 + }, + { + "auxiliary_loss_clip": 0.01161671, + "auxiliary_loss_mlp": 0.0104238, + "balance_loss_clip": 1.06566596, + "balance_loss_mlp": 1.02484441, + "epoch": 0.22401923944085375, + "flos": 22236857233920.0, + "grad_norm": 1.8187665650485187, + "language_loss": 0.77522194, + "learning_rate": 3.6179424159084254e-06, + "loss": 0.79726243, + "num_input_tokens_seen": 80247635, + "router_z_loss_clip": 0.96142578, + "router_z_loss_mlp": 0.17541504, + "step": 3726, + "time_per_iteration": 2.511930227279663 + }, + { + "auxiliary_loss_clip": 0.01179373, + "auxiliary_loss_mlp": 0.01053885, + "balance_loss_clip": 1.07237864, + "balance_loss_mlp": 1.03416729, + "epoch": 0.22407936269352172, + "flos": 12053668521600.0, + "grad_norm": 2.882146226986126, + "language_loss": 0.72480929, + "learning_rate": 3.6177134408591914e-06, + "loss": 0.74714184, + "num_input_tokens_seen": 80260045, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.1973877, + "step": 3727, + "time_per_iteration": 2.4575271606445312 + }, + { + "auxiliary_loss_clip": 0.0117212, + "auxiliary_loss_mlp": 0.01045932, + "balance_loss_clip": 1.06924093, + "balance_loss_mlp": 1.02629769, + "epoch": 0.22413948594618968, + "flos": 19353553632000.0, + "grad_norm": 2.332729258541123, + "language_loss": 0.86927688, + "learning_rate": 3.6174844044661013e-06, + "loss": 0.89145732, + "num_input_tokens_seen": 80277680, + "router_z_loss_clip": 1.03027344, + "router_z_loss_mlp": 0.19628906, + "step": 3728, + "time_per_iteration": 2.493227243423462 + }, + { + "auxiliary_loss_clip": 0.01162318, + "auxiliary_loss_mlp": 0.0104854, + "balance_loss_clip": 1.06439567, + "balance_loss_mlp": 1.02895427, + "epoch": 0.22419960919885765, + "flos": 24170000319360.0, + "grad_norm": 2.311277014928317, + "language_loss": 0.80542237, + "learning_rate": 3.6172553067378406e-06, + "loss": 0.82753098, + "num_input_tokens_seen": 80294795, + "router_z_loss_clip": 0.97900391, + "router_z_loss_mlp": 0.19580078, + "step": 3729, + "time_per_iteration": 2.4918811321258545 + }, + { + "auxiliary_loss_clip": 0.01154221, + "auxiliary_loss_mlp": 0.01048391, + "balance_loss_clip": 1.06051731, + "balance_loss_mlp": 1.03213131, + "epoch": 0.22425973245152564, + "flos": 27378977558400.0, + "grad_norm": 1.9686090547449162, + "language_loss": 0.86967683, + "learning_rate": 3.6170261476830964e-06, + "loss": 0.89170289, + "num_input_tokens_seen": 80315425, + "router_z_loss_clip": 0.93652344, + "router_z_loss_mlp": 0.16271973, + "step": 3730, + "time_per_iteration": 2.650315761566162 + }, + { + "auxiliary_loss_clip": 0.01167411, + "auxiliary_loss_mlp": 0.01041595, + "balance_loss_clip": 1.07119536, + "balance_loss_mlp": 1.02472699, + "epoch": 0.2243198557041936, + "flos": 13735652734080.0, + "grad_norm": 1.8182249169382305, + "language_loss": 0.72808266, + "learning_rate": 3.616796927310559e-06, + "loss": 0.75017273, + "num_input_tokens_seen": 80333905, + "router_z_loss_clip": 0.96240234, + "router_z_loss_mlp": 0.1685791, + "step": 3731, + "time_per_iteration": 2.496516704559326 + }, + { + "auxiliary_loss_clip": 0.01165546, + "auxiliary_loss_mlp": 0.0104201, + "balance_loss_clip": 1.06760001, + "balance_loss_mlp": 1.024701, + "epoch": 0.22437997895686157, + "flos": 19530700531200.0, + "grad_norm": 1.8300086324912057, + "language_loss": 0.75033945, + "learning_rate": 3.6165676456289195e-06, + "loss": 0.77241498, + "num_input_tokens_seen": 80352165, + "router_z_loss_clip": 0.97900391, + "router_z_loss_mlp": 0.17297363, + "step": 3732, + "time_per_iteration": 2.555774450302124 + }, + { + "auxiliary_loss_clip": 0.01159855, + "auxiliary_loss_mlp": 0.0105724, + "balance_loss_clip": 1.06232762, + "balance_loss_mlp": 1.03960919, + "epoch": 0.22444010220952954, + "flos": 23696230907520.0, + "grad_norm": 2.4243411285903846, + "language_loss": 0.88149065, + "learning_rate": 3.616338302646873e-06, + "loss": 0.90366161, + "num_input_tokens_seen": 80371305, + "router_z_loss_clip": 0.97509766, + "router_z_loss_mlp": 0.1763916, + "step": 3733, + "time_per_iteration": 2.515941619873047 + }, + { + "auxiliary_loss_clip": 0.01160508, + "auxiliary_loss_mlp": 0.0105328, + "balance_loss_clip": 1.06325412, + "balance_loss_mlp": 1.03303814, + "epoch": 0.2245002254621975, + "flos": 22382905933440.0, + "grad_norm": 1.6565020601685474, + "language_loss": 0.84594893, + "learning_rate": 3.6161088983731166e-06, + "loss": 0.86808681, + "num_input_tokens_seen": 80391020, + "router_z_loss_clip": 0.97265625, + "router_z_loss_mlp": 0.20227051, + "step": 3734, + "time_per_iteration": 2.541254997253418 + }, + { + "auxiliary_loss_clip": 0.01167055, + "auxiliary_loss_mlp": 0.01054013, + "balance_loss_clip": 1.06733024, + "balance_loss_mlp": 1.03617907, + "epoch": 0.22456034871486547, + "flos": 26942303917440.0, + "grad_norm": 1.5837701543551834, + "language_loss": 0.76672959, + "learning_rate": 3.6158794328163482e-06, + "loss": 0.78894025, + "num_input_tokens_seen": 80411365, + "router_z_loss_clip": 0.99658203, + "router_z_loss_mlp": 0.17822266, + "step": 3735, + "time_per_iteration": 2.5344059467315674 + }, + { + "auxiliary_loss_clip": 0.01173017, + "auxiliary_loss_mlp": 0.0104, + "balance_loss_clip": 1.07877779, + "balance_loss_mlp": 1.023597, + "epoch": 0.22462047196753343, + "flos": 28983538005120.0, + "grad_norm": 2.142579334807675, + "language_loss": 0.84472883, + "learning_rate": 3.6156499059852702e-06, + "loss": 0.86685908, + "num_input_tokens_seen": 80431075, + "router_z_loss_clip": 0.94238281, + "router_z_loss_mlp": 0.16394043, + "step": 3736, + "time_per_iteration": 2.657240152359009 + }, + { + "auxiliary_loss_clip": 0.01167461, + "auxiliary_loss_mlp": 0.01050609, + "balance_loss_clip": 1.06508708, + "balance_loss_mlp": 1.03338385, + "epoch": 0.22468059522020142, + "flos": 20011329440640.0, + "grad_norm": 1.6586030053350758, + "language_loss": 0.86603224, + "learning_rate": 3.615420317888586e-06, + "loss": 0.88821292, + "num_input_tokens_seen": 80449240, + "router_z_loss_clip": 1.02441406, + "router_z_loss_mlp": 0.17224121, + "step": 3737, + "time_per_iteration": 2.470970869064331 + }, + { + "auxiliary_loss_clip": 0.01170237, + "auxiliary_loss_mlp": 0.01047638, + "balance_loss_clip": 1.07094073, + "balance_loss_mlp": 1.02900517, + "epoch": 0.2247407184728694, + "flos": 29314239546240.0, + "grad_norm": 1.8006436920299689, + "language_loss": 0.78831327, + "learning_rate": 3.6151906685350006e-06, + "loss": 0.81049204, + "num_input_tokens_seen": 80467900, + "router_z_loss_clip": 0.9921875, + "router_z_loss_mlp": 0.18640137, + "step": 3738, + "time_per_iteration": 2.5179264545440674 + }, + { + "auxiliary_loss_clip": 0.01163402, + "auxiliary_loss_mlp": 0.01041638, + "balance_loss_clip": 1.06388533, + "balance_loss_mlp": 1.02499604, + "epoch": 0.22480084172553735, + "flos": 22310366417280.0, + "grad_norm": 1.7189033270655858, + "language_loss": 0.76454854, + "learning_rate": 3.614960957933224e-06, + "loss": 0.78659886, + "num_input_tokens_seen": 80487100, + "router_z_loss_clip": 0.99511719, + "router_z_loss_mlp": 0.16638184, + "step": 3739, + "time_per_iteration": 2.48457670211792 + }, + { + "auxiliary_loss_clip": 0.01159682, + "auxiliary_loss_mlp": 0.01044446, + "balance_loss_clip": 1.06301272, + "balance_loss_mlp": 1.02675569, + "epoch": 0.22486096497820532, + "flos": 25591272641280.0, + "grad_norm": 1.8877050101861532, + "language_loss": 0.74541485, + "learning_rate": 3.6147311860919655e-06, + "loss": 0.76745617, + "num_input_tokens_seen": 80508625, + "router_z_loss_clip": 0.96582031, + "router_z_loss_mlp": 0.17675781, + "step": 3740, + "time_per_iteration": 2.49074649810791 + }, + { + "auxiliary_loss_clip": 0.01156661, + "auxiliary_loss_mlp": 0.01042433, + "balance_loss_clip": 1.06026745, + "balance_loss_mlp": 1.02498031, + "epoch": 0.22492108823087328, + "flos": 17639824775040.0, + "grad_norm": 1.8766154784333244, + "language_loss": 0.76096421, + "learning_rate": 3.614501353019939e-06, + "loss": 0.78295517, + "num_input_tokens_seen": 80527345, + "router_z_loss_clip": 0.96386719, + "router_z_loss_mlp": 0.17456055, + "step": 3741, + "time_per_iteration": 2.472752809524536 + }, + { + "auxiliary_loss_clip": 0.0115499, + "auxiliary_loss_mlp": 0.01039807, + "balance_loss_clip": 1.05894732, + "balance_loss_mlp": 1.02277148, + "epoch": 0.22498121148354125, + "flos": 16034653797120.0, + "grad_norm": 1.731641344444913, + "language_loss": 0.86880791, + "learning_rate": 3.6142714587258592e-06, + "loss": 0.89075583, + "num_input_tokens_seen": 80545545, + "router_z_loss_clip": 0.96044922, + "router_z_loss_mlp": 0.17028809, + "step": 3742, + "time_per_iteration": 2.479907274246216 + }, + { + "auxiliary_loss_clip": 0.01159622, + "auxiliary_loss_mlp": 0.01049442, + "balance_loss_clip": 1.06376004, + "balance_loss_mlp": 1.0315609, + "epoch": 0.22504133473620924, + "flos": 24023772051840.0, + "grad_norm": 2.382886685448981, + "language_loss": 0.81985247, + "learning_rate": 3.614041503218444e-06, + "loss": 0.84194314, + "num_input_tokens_seen": 80565040, + "router_z_loss_clip": 0.95800781, + "router_z_loss_mlp": 0.17883301, + "step": 3743, + "time_per_iteration": 2.5034420490264893 + }, + { + "auxiliary_loss_clip": 0.01157092, + "auxiliary_loss_mlp": 0.01044649, + "balance_loss_clip": 1.0592196, + "balance_loss_mlp": 1.0262785, + "epoch": 0.2251014579888772, + "flos": 16763963541120.0, + "grad_norm": 2.821195257039331, + "language_loss": 0.64075053, + "learning_rate": 3.6138114865064134e-06, + "loss": 0.66276795, + "num_input_tokens_seen": 80582815, + "router_z_loss_clip": 0.98095703, + "router_z_loss_mlp": 0.18383789, + "step": 3744, + "time_per_iteration": 2.5527703762054443 + }, + { + "auxiliary_loss_clip": 0.01157556, + "auxiliary_loss_mlp": 0.01047657, + "balance_loss_clip": 1.05924797, + "balance_loss_mlp": 1.02829778, + "epoch": 0.22516158124154517, + "flos": 13991013498240.0, + "grad_norm": 2.677543863930208, + "language_loss": 0.76492441, + "learning_rate": 3.613581408598489e-06, + "loss": 0.78697658, + "num_input_tokens_seen": 80600865, + "router_z_loss_clip": 0.98291016, + "router_z_loss_mlp": 0.19360352, + "step": 3745, + "time_per_iteration": 2.4728806018829346 + }, + { + "auxiliary_loss_clip": 0.01162451, + "auxiliary_loss_mlp": 0.01050901, + "balance_loss_clip": 1.06277752, + "balance_loss_mlp": 1.03317416, + "epoch": 0.22522170449421314, + "flos": 14390016750720.0, + "grad_norm": 1.7556399337344484, + "language_loss": 0.80658674, + "learning_rate": 3.6133512695033965e-06, + "loss": 0.82872027, + "num_input_tokens_seen": 80617455, + "router_z_loss_clip": 0.99804688, + "router_z_loss_mlp": 0.17736816, + "step": 3746, + "time_per_iteration": 2.425463914871216 + }, + { + "auxiliary_loss_clip": 0.01170611, + "auxiliary_loss_mlp": 0.01045882, + "balance_loss_clip": 1.0686785, + "balance_loss_mlp": 1.02823925, + "epoch": 0.2252818277468811, + "flos": 23805542972160.0, + "grad_norm": 2.56020488664346, + "language_loss": 0.86074609, + "learning_rate": 3.613121069229862e-06, + "loss": 0.88291103, + "num_input_tokens_seen": 80635125, + "router_z_loss_clip": 1.02050781, + "router_z_loss_mlp": 0.1763916, + "step": 3747, + "time_per_iteration": 2.4818508625030518 + }, + { + "auxiliary_loss_clip": 0.01159439, + "auxiliary_loss_mlp": 0.01030909, + "balance_loss_clip": 1.06120706, + "balance_loss_mlp": 1.01466084, + "epoch": 0.22534195099954907, + "flos": 24718033100160.0, + "grad_norm": 1.8219193491168784, + "language_loss": 0.7639755, + "learning_rate": 3.6128908077866145e-06, + "loss": 0.78587902, + "num_input_tokens_seen": 80656370, + "router_z_loss_clip": 0.98291016, + "router_z_loss_mlp": 0.16259766, + "step": 3748, + "time_per_iteration": 2.4888765811920166 + }, + { + "auxiliary_loss_clip": 0.01164215, + "auxiliary_loss_mlp": 0.01046871, + "balance_loss_clip": 1.06604123, + "balance_loss_mlp": 1.02915621, + "epoch": 0.22540207425221703, + "flos": 21032341534080.0, + "grad_norm": 1.71617187863237, + "language_loss": 0.79447722, + "learning_rate": 3.6126604851823864e-06, + "loss": 0.81658804, + "num_input_tokens_seen": 80676495, + "router_z_loss_clip": 0.98291016, + "router_z_loss_mlp": 0.17712402, + "step": 3749, + "time_per_iteration": 2.5767197608947754 + }, + { + "auxiliary_loss_clip": 0.01162851, + "auxiliary_loss_mlp": 0.01039542, + "balance_loss_clip": 1.0686233, + "balance_loss_mlp": 1.02250648, + "epoch": 0.22546219750488503, + "flos": 19390362094080.0, + "grad_norm": 2.3308638151164702, + "language_loss": 0.79659605, + "learning_rate": 3.6124301014259108e-06, + "loss": 0.81861991, + "num_input_tokens_seen": 80694755, + "router_z_loss_clip": 0.94238281, + "router_z_loss_mlp": 0.17041016, + "step": 3750, + "time_per_iteration": 2.479919910430908 + }, + { + "auxiliary_loss_clip": 0.01162616, + "auxiliary_loss_mlp": 0.01062255, + "balance_loss_clip": 1.06080627, + "balance_loss_mlp": 1.04377735, + "epoch": 0.225522320757553, + "flos": 25192628524800.0, + "grad_norm": 2.040522014782336, + "language_loss": 0.82228553, + "learning_rate": 3.6121996565259244e-06, + "loss": 0.84453416, + "num_input_tokens_seen": 80713670, + "router_z_loss_clip": 1.01708984, + "router_z_loss_mlp": 0.18469238, + "step": 3751, + "time_per_iteration": 2.5275979042053223 + }, + { + "auxiliary_loss_clip": 0.0116717, + "auxiliary_loss_mlp": 0.01043815, + "balance_loss_clip": 1.06632674, + "balance_loss_mlp": 1.02645826, + "epoch": 0.22558244401022096, + "flos": 17163110448000.0, + "grad_norm": 1.8817618810567565, + "language_loss": 0.83548355, + "learning_rate": 3.611969150491165e-06, + "loss": 0.85759342, + "num_input_tokens_seen": 80731450, + "router_z_loss_clip": 1.00976562, + "router_z_loss_mlp": 0.17358398, + "step": 3752, + "time_per_iteration": 2.4329898357391357 + }, + { + "auxiliary_loss_clip": 0.01157407, + "auxiliary_loss_mlp": 0.01060034, + "balance_loss_clip": 1.0594151, + "balance_loss_mlp": 1.04040074, + "epoch": 0.22564256726288892, + "flos": 15231008856960.0, + "grad_norm": 1.845206439287618, + "language_loss": 0.78951901, + "learning_rate": 3.611738583330375e-06, + "loss": 0.81169343, + "num_input_tokens_seen": 80748415, + "router_z_loss_clip": 0.97998047, + "router_z_loss_mlp": 0.19628906, + "step": 3753, + "time_per_iteration": 2.43949031829834 + }, + { + "auxiliary_loss_clip": 0.01156335, + "auxiliary_loss_mlp": 0.01042717, + "balance_loss_clip": 1.05729759, + "balance_loss_mlp": 1.02413249, + "epoch": 0.2257026905155569, + "flos": 34568652764160.0, + "grad_norm": 2.0254525892107065, + "language_loss": 0.78616154, + "learning_rate": 3.611507955052295e-06, + "loss": 0.80815208, + "num_input_tokens_seen": 80770835, + "router_z_loss_clip": 0.99023438, + "router_z_loss_mlp": 0.18591309, + "step": 3754, + "time_per_iteration": 2.5944738388061523 + }, + { + "auxiliary_loss_clip": 0.0116007, + "auxiliary_loss_mlp": 0.0105094, + "balance_loss_clip": 1.06287599, + "balance_loss_mlp": 1.03327298, + "epoch": 0.22576281376822485, + "flos": 19938430788480.0, + "grad_norm": 1.8440119354229885, + "language_loss": 0.70488882, + "learning_rate": 3.6112772656656727e-06, + "loss": 0.72699898, + "num_input_tokens_seen": 80787840, + "router_z_loss_clip": 0.97216797, + "router_z_loss_mlp": 0.17675781, + "step": 3755, + "time_per_iteration": 3.8990659713745117 + }, + { + "auxiliary_loss_clip": 0.01178008, + "auxiliary_loss_mlp": 0.01057544, + "balance_loss_clip": 1.07332563, + "balance_loss_mlp": 1.03945959, + "epoch": 0.22582293702089282, + "flos": 24602005192320.0, + "grad_norm": 3.804379989006645, + "language_loss": 0.7758311, + "learning_rate": 3.6110465151792547e-06, + "loss": 0.79818666, + "num_input_tokens_seen": 80806335, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.18078613, + "step": 3756, + "time_per_iteration": 2.490563154220581 + }, + { + "auxiliary_loss_clip": 0.01170543, + "auxiliary_loss_mlp": 0.01047632, + "balance_loss_clip": 1.0694499, + "balance_loss_mlp": 1.02981019, + "epoch": 0.2258830602735608, + "flos": 23035438356480.0, + "grad_norm": 1.9818279561412564, + "language_loss": 0.82195848, + "learning_rate": 3.6108157036017916e-06, + "loss": 0.84414029, + "num_input_tokens_seen": 80825355, + "router_z_loss_clip": 1.00927734, + "router_z_loss_mlp": 0.17822266, + "step": 3757, + "time_per_iteration": 2.512388229370117 + }, + { + "auxiliary_loss_clip": 0.01160864, + "auxiliary_loss_mlp": 0.01048503, + "balance_loss_clip": 1.05930281, + "balance_loss_mlp": 1.02993011, + "epoch": 0.22594318352622877, + "flos": 22158427887360.0, + "grad_norm": 1.8615573460352466, + "language_loss": 0.72871304, + "learning_rate": 3.6105848309420358e-06, + "loss": 0.75080669, + "num_input_tokens_seen": 80842570, + "router_z_loss_clip": 1.01513672, + "router_z_loss_mlp": 0.18566895, + "step": 3758, + "time_per_iteration": 2.4775474071502686 + }, + { + "auxiliary_loss_clip": 0.01168977, + "auxiliary_loss_mlp": 0.01056461, + "balance_loss_clip": 1.06406975, + "balance_loss_mlp": 1.03726828, + "epoch": 0.22600330677889674, + "flos": 20594303176320.0, + "grad_norm": 2.1206183262211966, + "language_loss": 0.77393818, + "learning_rate": 3.6103538972087412e-06, + "loss": 0.79619253, + "num_input_tokens_seen": 80858745, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.19189453, + "step": 3759, + "time_per_iteration": 2.4994282722473145 + }, + { + "auxiliary_loss_clip": 0.0117264, + "auxiliary_loss_mlp": 0.01043352, + "balance_loss_clip": 1.06890285, + "balance_loss_mlp": 1.02518487, + "epoch": 0.2260634300315647, + "flos": 35659798162560.0, + "grad_norm": 1.8208659000664316, + "language_loss": 0.78340411, + "learning_rate": 3.6101229024106655e-06, + "loss": 0.80556405, + "num_input_tokens_seen": 80880085, + "router_z_loss_clip": 1.03613281, + "router_z_loss_mlp": 0.18164062, + "step": 3760, + "time_per_iteration": 2.638490676879883 + }, + { + "auxiliary_loss_clip": 0.01112244, + "auxiliary_loss_mlp": 0.01007876, + "balance_loss_clip": 1.0768888, + "balance_loss_mlp": 1.00569153, + "epoch": 0.22612355328423267, + "flos": 72090455126400.0, + "grad_norm": 0.9472536294653654, + "language_loss": 0.60082328, + "learning_rate": 3.609891846556569e-06, + "loss": 0.62202448, + "num_input_tokens_seen": 80937660, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 0.02185059, + "step": 3761, + "time_per_iteration": 3.049152135848999 + }, + { + "auxiliary_loss_clip": 0.01169429, + "auxiliary_loss_mlp": 0.01041695, + "balance_loss_clip": 1.0690937, + "balance_loss_mlp": 1.023754, + "epoch": 0.22618367653690064, + "flos": 22783776693120.0, + "grad_norm": 2.5029292566873127, + "language_loss": 0.77527821, + "learning_rate": 3.609660729655211e-06, + "loss": 0.79738939, + "num_input_tokens_seen": 80956265, + "router_z_loss_clip": 1.00585938, + "router_z_loss_mlp": 0.17932129, + "step": 3762, + "time_per_iteration": 2.574373960494995 + }, + { + "auxiliary_loss_clip": 0.01170872, + "auxiliary_loss_mlp": 0.01041516, + "balance_loss_clip": 1.07125306, + "balance_loss_mlp": 1.02319384, + "epoch": 0.22624379978956863, + "flos": 20448254476800.0, + "grad_norm": 2.718835396187585, + "language_loss": 0.78716218, + "learning_rate": 3.6094295517153573e-06, + "loss": 0.809286, + "num_input_tokens_seen": 80975185, + "router_z_loss_clip": 0.99609375, + "router_z_loss_mlp": 0.18322754, + "step": 3763, + "time_per_iteration": 2.4433462619781494 + }, + { + "auxiliary_loss_clip": 0.01167606, + "auxiliary_loss_mlp": 0.01057938, + "balance_loss_clip": 1.06173575, + "balance_loss_mlp": 1.03814936, + "epoch": 0.2263039230422366, + "flos": 17494314779520.0, + "grad_norm": 1.6458600206071217, + "language_loss": 0.91309202, + "learning_rate": 3.6091983127457743e-06, + "loss": 0.93534744, + "num_input_tokens_seen": 80992830, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.19787598, + "step": 3764, + "time_per_iteration": 3.8312690258026123 + }, + { + "auxiliary_loss_clip": 0.01161296, + "auxiliary_loss_mlp": 0.01052799, + "balance_loss_clip": 1.06330693, + "balance_loss_mlp": 1.03465545, + "epoch": 0.22636404629490456, + "flos": 28329748606080.0, + "grad_norm": 2.0734069511701834, + "language_loss": 0.75294381, + "learning_rate": 3.6089670127552293e-06, + "loss": 0.77508479, + "num_input_tokens_seen": 81013675, + "router_z_loss_clip": 0.97900391, + "router_z_loss_mlp": 0.18139648, + "step": 3765, + "time_per_iteration": 2.5224978923797607 + }, + { + "auxiliary_loss_clip": 0.01160353, + "auxiliary_loss_mlp": 0.01043037, + "balance_loss_clip": 1.06392312, + "balance_loss_mlp": 1.02625239, + "epoch": 0.22642416954757252, + "flos": 17489143221120.0, + "grad_norm": 2.1981401188579004, + "language_loss": 0.89823711, + "learning_rate": 3.608735651752494e-06, + "loss": 0.92027092, + "num_input_tokens_seen": 81030345, + "router_z_loss_clip": 0.96435547, + "router_z_loss_mlp": 0.16796875, + "step": 3766, + "time_per_iteration": 2.427642822265625 + }, + { + "auxiliary_loss_clip": 0.01158922, + "auxiliary_loss_mlp": 0.01040555, + "balance_loss_clip": 1.06349325, + "balance_loss_mlp": 1.02313876, + "epoch": 0.2264842928002405, + "flos": 24384530298240.0, + "grad_norm": 1.5618918913112099, + "language_loss": 0.74614662, + "learning_rate": 3.6085042297463417e-06, + "loss": 0.76814139, + "num_input_tokens_seen": 81051000, + "router_z_loss_clip": 0.95410156, + "router_z_loss_mlp": 0.17407227, + "step": 3767, + "time_per_iteration": 2.528438091278076 + }, + { + "auxiliary_loss_clip": 0.01159348, + "auxiliary_loss_mlp": 0.01041809, + "balance_loss_clip": 1.06039441, + "balance_loss_mlp": 1.02383208, + "epoch": 0.22654441605290845, + "flos": 19830519354240.0, + "grad_norm": 1.4648867767933391, + "language_loss": 0.71697736, + "learning_rate": 3.6082727467455477e-06, + "loss": 0.738989, + "num_input_tokens_seen": 81071205, + "router_z_loss_clip": 0.98876953, + "router_z_loss_mlp": 0.17980957, + "step": 3768, + "time_per_iteration": 3.989532709121704 + }, + { + "auxiliary_loss_clip": 0.01180963, + "auxiliary_loss_mlp": 0.01059944, + "balance_loss_clip": 1.08059335, + "balance_loss_mlp": 1.04069161, + "epoch": 0.22660453930557642, + "flos": 27454569730560.0, + "grad_norm": 1.625810721430517, + "language_loss": 0.79159015, + "learning_rate": 3.6080412027588905e-06, + "loss": 0.8139993, + "num_input_tokens_seen": 81091880, + "router_z_loss_clip": 1.00439453, + "router_z_loss_mlp": 0.19226074, + "step": 3769, + "time_per_iteration": 2.5225942134857178 + }, + { + "auxiliary_loss_clip": 0.01171, + "auxiliary_loss_mlp": 0.01043639, + "balance_loss_clip": 1.06930876, + "balance_loss_mlp": 1.02526915, + "epoch": 0.2266646625582444, + "flos": 23988148738560.0, + "grad_norm": 1.9194422577311037, + "language_loss": 0.68759042, + "learning_rate": 3.6078095977951488e-06, + "loss": 0.70973682, + "num_input_tokens_seen": 81113290, + "router_z_loss_clip": 1.01611328, + "router_z_loss_mlp": 0.18359375, + "step": 3770, + "time_per_iteration": 2.543726682662964 + }, + { + "auxiliary_loss_clip": 0.011704, + "auxiliary_loss_mlp": 0.01049078, + "balance_loss_clip": 1.06901526, + "balance_loss_mlp": 1.03155422, + "epoch": 0.22672478581091238, + "flos": 26028054023040.0, + "grad_norm": 7.009108922650328, + "language_loss": 0.80285954, + "learning_rate": 3.6075779318631067e-06, + "loss": 0.82505429, + "num_input_tokens_seen": 81133535, + "router_z_loss_clip": 1.01318359, + "router_z_loss_mlp": 0.1751709, + "step": 3771, + "time_per_iteration": 2.5799832344055176 + }, + { + "auxiliary_loss_clip": 0.01164541, + "auxiliary_loss_mlp": 0.01051081, + "balance_loss_clip": 1.06716704, + "balance_loss_mlp": 1.0337832, + "epoch": 0.22678490906358034, + "flos": 23841812730240.0, + "grad_norm": 1.5223476581285957, + "language_loss": 0.78946781, + "learning_rate": 3.6073462049715486e-06, + "loss": 0.81162405, + "num_input_tokens_seen": 81154650, + "router_z_loss_clip": 0.97265625, + "router_z_loss_mlp": 0.17297363, + "step": 3772, + "time_per_iteration": 2.5278120040893555 + }, + { + "auxiliary_loss_clip": 0.01101366, + "auxiliary_loss_mlp": 0.01007972, + "balance_loss_clip": 1.06640291, + "balance_loss_mlp": 1.00603819, + "epoch": 0.2268450323162483, + "flos": 65048088574080.0, + "grad_norm": 0.6503842359429629, + "language_loss": 0.54400241, + "learning_rate": 3.607114417129261e-06, + "loss": 0.56509578, + "num_input_tokens_seen": 81221240, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 0.01931763, + "step": 3773, + "time_per_iteration": 3.2061767578125 + }, + { + "auxiliary_loss_clip": 0.01162665, + "auxiliary_loss_mlp": 0.01036534, + "balance_loss_clip": 1.06699157, + "balance_loss_mlp": 1.01899791, + "epoch": 0.22690515556891627, + "flos": 22526081544960.0, + "grad_norm": 1.768367381396106, + "language_loss": 0.70627874, + "learning_rate": 3.6068825683450334e-06, + "loss": 0.72827077, + "num_input_tokens_seen": 81241520, + "router_z_loss_clip": 0.95605469, + "router_z_loss_mlp": 0.17541504, + "step": 3774, + "time_per_iteration": 2.4810335636138916 + }, + { + "auxiliary_loss_clip": 0.0115667, + "auxiliary_loss_mlp": 0.01042087, + "balance_loss_clip": 1.05859518, + "balance_loss_mlp": 1.02468228, + "epoch": 0.22696527882158424, + "flos": 18223444955520.0, + "grad_norm": 1.9689012719634564, + "language_loss": 0.7439099, + "learning_rate": 3.606650658627658e-06, + "loss": 0.76589751, + "num_input_tokens_seen": 81256825, + "router_z_loss_clip": 0.98144531, + "router_z_loss_mlp": 0.17407227, + "step": 3775, + "time_per_iteration": 2.518113136291504 + }, + { + "auxiliary_loss_clip": 0.01162483, + "auxiliary_loss_mlp": 0.0104479, + "balance_loss_clip": 1.06398273, + "balance_loss_mlp": 1.02820849, + "epoch": 0.22702540207425223, + "flos": 17019252478080.0, + "grad_norm": 2.076799829556623, + "language_loss": 0.82326835, + "learning_rate": 3.606418687985928e-06, + "loss": 0.84534109, + "num_input_tokens_seen": 81275695, + "router_z_loss_clip": 0.98535156, + "router_z_loss_mlp": 0.16589355, + "step": 3776, + "time_per_iteration": 2.5104498863220215 + }, + { + "auxiliary_loss_clip": 0.01162606, + "auxiliary_loss_mlp": 0.01047313, + "balance_loss_clip": 1.06129146, + "balance_loss_mlp": 1.02959859, + "epoch": 0.2270855253269202, + "flos": 21325731822720.0, + "grad_norm": 2.3354646776982704, + "language_loss": 0.82384515, + "learning_rate": 3.606186656428641e-06, + "loss": 0.84594429, + "num_input_tokens_seen": 81294920, + "router_z_loss_clip": 1.01367188, + "router_z_loss_mlp": 0.17712402, + "step": 3777, + "time_per_iteration": 2.4847910404205322 + }, + { + "auxiliary_loss_clip": 0.01153162, + "auxiliary_loss_mlp": 0.0104178, + "balance_loss_clip": 1.05663466, + "balance_loss_mlp": 1.0244112, + "epoch": 0.22714564857958816, + "flos": 23550469516800.0, + "grad_norm": 2.037867177279833, + "language_loss": 0.72272623, + "learning_rate": 3.6059545639645955e-06, + "loss": 0.74467558, + "num_input_tokens_seen": 81314275, + "router_z_loss_clip": 0.96582031, + "router_z_loss_mlp": 0.17358398, + "step": 3778, + "time_per_iteration": 2.5264651775360107 + }, + { + "auxiliary_loss_clip": 0.01154212, + "auxiliary_loss_mlp": 0.01038383, + "balance_loss_clip": 1.0558852, + "balance_loss_mlp": 1.02060878, + "epoch": 0.22720577183225613, + "flos": 25989880844160.0, + "grad_norm": 2.071288957373733, + "language_loss": 0.64313555, + "learning_rate": 3.605722410602591e-06, + "loss": 0.66506147, + "num_input_tokens_seen": 81333890, + "router_z_loss_clip": 0.98291016, + "router_z_loss_mlp": 0.17773438, + "step": 3779, + "time_per_iteration": 2.5494983196258545 + }, + { + "auxiliary_loss_clip": 0.01153433, + "auxiliary_loss_mlp": 0.01048138, + "balance_loss_clip": 1.05866444, + "balance_loss_mlp": 1.03149676, + "epoch": 0.2272658950849241, + "flos": 20814076540800.0, + "grad_norm": 1.7483104734807984, + "language_loss": 0.70985401, + "learning_rate": 3.6054901963514323e-06, + "loss": 0.7318697, + "num_input_tokens_seen": 81353640, + "router_z_loss_clip": 0.94824219, + "router_z_loss_mlp": 0.16638184, + "step": 3780, + "time_per_iteration": 2.4808993339538574 + }, + { + "auxiliary_loss_clip": 0.01158581, + "auxiliary_loss_mlp": 0.01045226, + "balance_loss_clip": 1.06296301, + "balance_loss_mlp": 1.02695107, + "epoch": 0.22732601833759206, + "flos": 23909324342400.0, + "grad_norm": 1.953221685790195, + "language_loss": 0.89831913, + "learning_rate": 3.6052579212199246e-06, + "loss": 0.92035723, + "num_input_tokens_seen": 81371595, + "router_z_loss_clip": 0.95605469, + "router_z_loss_mlp": 0.18273926, + "step": 3781, + "time_per_iteration": 2.4961302280426025 + }, + { + "auxiliary_loss_clip": 0.01160111, + "auxiliary_loss_mlp": 0.01046183, + "balance_loss_clip": 1.05954432, + "balance_loss_mlp": 1.02843285, + "epoch": 0.22738614159026002, + "flos": 15924407978880.0, + "grad_norm": 2.378508910153326, + "language_loss": 0.74826789, + "learning_rate": 3.6050255852168753e-06, + "loss": 0.77033091, + "num_input_tokens_seen": 81388435, + "router_z_loss_clip": 1.00488281, + "router_z_loss_mlp": 0.17736816, + "step": 3782, + "time_per_iteration": 2.4849448204040527 + }, + { + "auxiliary_loss_clip": 0.01158081, + "auxiliary_loss_mlp": 0.01045478, + "balance_loss_clip": 1.06216788, + "balance_loss_mlp": 1.02914596, + "epoch": 0.22744626484292801, + "flos": 24205515891840.0, + "grad_norm": 2.002151270706542, + "language_loss": 0.82582867, + "learning_rate": 3.604793188351095e-06, + "loss": 0.84786427, + "num_input_tokens_seen": 81410195, + "router_z_loss_clip": 0.95898438, + "router_z_loss_mlp": 0.16333008, + "step": 3783, + "time_per_iteration": 2.5088279247283936 + }, + { + "auxiliary_loss_clip": 0.01155629, + "auxiliary_loss_mlp": 0.01043373, + "balance_loss_clip": 1.05816495, + "balance_loss_mlp": 1.02539611, + "epoch": 0.22750638809559598, + "flos": 24791614110720.0, + "grad_norm": 1.8304577817395453, + "language_loss": 0.7569803, + "learning_rate": 3.6045607306313964e-06, + "loss": 0.77897024, + "num_input_tokens_seen": 81430060, + "router_z_loss_clip": 0.97460938, + "router_z_loss_mlp": 0.1796875, + "step": 3784, + "time_per_iteration": 2.493295431137085 + }, + { + "auxiliary_loss_clip": 0.01156678, + "auxiliary_loss_mlp": 0.01042795, + "balance_loss_clip": 1.06078613, + "balance_loss_mlp": 1.02556944, + "epoch": 0.22756651134826394, + "flos": 22236498097920.0, + "grad_norm": 1.839519432440929, + "language_loss": 0.71110928, + "learning_rate": 3.604328212066594e-06, + "loss": 0.73310405, + "num_input_tokens_seen": 81447375, + "router_z_loss_clip": 0.95898438, + "router_z_loss_mlp": 0.17224121, + "step": 3785, + "time_per_iteration": 2.511303663253784 + }, + { + "auxiliary_loss_clip": 0.0109229, + "auxiliary_loss_mlp": 0.01004704, + "balance_loss_clip": 1.05720305, + "balance_loss_mlp": 1.00239456, + "epoch": 0.2276266346009319, + "flos": 62707466626560.0, + "grad_norm": 0.8354731417406378, + "language_loss": 0.61882734, + "learning_rate": 3.6040956326655047e-06, + "loss": 0.63979727, + "num_input_tokens_seen": 81505235, + "router_z_loss_clip": 0.35107422, + "router_z_loss_mlp": 0.02310181, + "step": 3786, + "time_per_iteration": 3.081270694732666 + }, + { + "auxiliary_loss_clip": 0.01157722, + "auxiliary_loss_mlp": 0.01045994, + "balance_loss_clip": 1.05804944, + "balance_loss_mlp": 1.02808845, + "epoch": 0.22768675785359987, + "flos": 18613936684800.0, + "grad_norm": 2.5819375333975865, + "language_loss": 0.87527859, + "learning_rate": 3.6038629924369486e-06, + "loss": 0.89731574, + "num_input_tokens_seen": 81518685, + "router_z_loss_clip": 0.99707031, + "router_z_loss_mlp": 0.17907715, + "step": 3787, + "time_per_iteration": 2.481170892715454 + }, + { + "auxiliary_loss_clip": 0.01149269, + "auxiliary_loss_mlp": 0.01038244, + "balance_loss_clip": 1.05470991, + "balance_loss_mlp": 1.02140009, + "epoch": 0.22774688110626784, + "flos": 26870195364480.0, + "grad_norm": 2.024248488143491, + "language_loss": 0.72876871, + "learning_rate": 3.6036302913897474e-06, + "loss": 0.75064385, + "num_input_tokens_seen": 81538940, + "router_z_loss_clip": 0.94628906, + "router_z_loss_mlp": 0.16833496, + "step": 3788, + "time_per_iteration": 2.507741928100586 + }, + { + "auxiliary_loss_clip": 0.01153722, + "auxiliary_loss_mlp": 0.0103409, + "balance_loss_clip": 1.05765617, + "balance_loss_mlp": 1.01738858, + "epoch": 0.2278070043589358, + "flos": 15553593924480.0, + "grad_norm": 2.6889814617051098, + "language_loss": 0.67598927, + "learning_rate": 3.6033975295327243e-06, + "loss": 0.69786739, + "num_input_tokens_seen": 81555525, + "router_z_loss_clip": 0.96191406, + "router_z_loss_mlp": 0.16699219, + "step": 3789, + "time_per_iteration": 2.441486120223999 + }, + { + "auxiliary_loss_clip": 0.01160649, + "auxiliary_loss_mlp": 0.01046144, + "balance_loss_clip": 1.0632664, + "balance_loss_mlp": 1.02877498, + "epoch": 0.2278671276116038, + "flos": 22416805393920.0, + "grad_norm": 1.8481585008114778, + "language_loss": 0.75962538, + "learning_rate": 3.6031647068747065e-06, + "loss": 0.78169334, + "num_input_tokens_seen": 81576305, + "router_z_loss_clip": 0.97363281, + "router_z_loss_mlp": 0.17382812, + "step": 3790, + "time_per_iteration": 2.4919121265411377 + }, + { + "auxiliary_loss_clip": 0.01151337, + "auxiliary_loss_mlp": 0.0104465, + "balance_loss_clip": 1.05480397, + "balance_loss_mlp": 1.02711415, + "epoch": 0.22792725086427176, + "flos": 20631363033600.0, + "grad_norm": 3.0433374125147155, + "language_loss": 0.9144696, + "learning_rate": 3.602931823424522e-06, + "loss": 0.93642944, + "num_input_tokens_seen": 81594115, + "router_z_loss_clip": 0.96630859, + "router_z_loss_mlp": 0.17541504, + "step": 3791, + "time_per_iteration": 2.5406627655029297 + }, + { + "auxiliary_loss_clip": 0.0115307, + "auxiliary_loss_mlp": 0.01038224, + "balance_loss_clip": 1.05462956, + "balance_loss_mlp": 1.02044952, + "epoch": 0.22798737411693973, + "flos": 31428946903680.0, + "grad_norm": 3.1578776248632234, + "language_loss": 0.82529354, + "learning_rate": 3.6026988791910026e-06, + "loss": 0.84720647, + "num_input_tokens_seen": 81615355, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.17773438, + "step": 3792, + "time_per_iteration": 2.537308692932129 + }, + { + "auxiliary_loss_clip": 0.01087725, + "auxiliary_loss_mlp": 0.01010545, + "balance_loss_clip": 1.0511924, + "balance_loss_mlp": 1.00841427, + "epoch": 0.2280474973696077, + "flos": 52396685827200.0, + "grad_norm": 1.148262207583307, + "language_loss": 0.65605974, + "learning_rate": 3.602465874182981e-06, + "loss": 0.67704242, + "num_input_tokens_seen": 81662075, + "router_z_loss_clip": 0.36621094, + "router_z_loss_mlp": 0.02130127, + "step": 3793, + "time_per_iteration": 2.827017068862915 + }, + { + "auxiliary_loss_clip": 0.01165446, + "auxiliary_loss_mlp": 0.01053108, + "balance_loss_clip": 1.06285977, + "balance_loss_mlp": 1.03484488, + "epoch": 0.22810762062227566, + "flos": 26396066816640.0, + "grad_norm": 2.0391702689182547, + "language_loss": 0.77695048, + "learning_rate": 3.602232808409293e-06, + "loss": 0.79913604, + "num_input_tokens_seen": 81681625, + "router_z_loss_clip": 1.02636719, + "router_z_loss_mlp": 0.18286133, + "step": 3794, + "time_per_iteration": 2.507894515991211 + }, + { + "auxiliary_loss_clip": 0.01158441, + "auxiliary_loss_mlp": 0.01043212, + "balance_loss_clip": 1.06089139, + "balance_loss_mlp": 1.02614141, + "epoch": 0.22816774387494362, + "flos": 25630271832960.0, + "grad_norm": 1.6919152251798022, + "language_loss": 0.80436367, + "learning_rate": 3.6019996818787755e-06, + "loss": 0.82638019, + "num_input_tokens_seen": 81701170, + "router_z_loss_clip": 0.97509766, + "router_z_loss_mlp": 0.17089844, + "step": 3795, + "time_per_iteration": 2.4969422817230225 + }, + { + "auxiliary_loss_clip": 0.01162165, + "auxiliary_loss_mlp": 0.01048564, + "balance_loss_clip": 1.06629562, + "balance_loss_mlp": 1.03107619, + "epoch": 0.22822786712761162, + "flos": 22451602694400.0, + "grad_norm": 1.6083554595025726, + "language_loss": 0.77014399, + "learning_rate": 3.6017664946002704e-06, + "loss": 0.79225129, + "num_input_tokens_seen": 81721265, + "router_z_loss_clip": 0.95849609, + "router_z_loss_mlp": 0.17492676, + "step": 3796, + "time_per_iteration": 2.4821505546569824 + }, + { + "auxiliary_loss_clip": 0.01157663, + "auxiliary_loss_mlp": 0.01043268, + "balance_loss_clip": 1.0592761, + "balance_loss_mlp": 1.0264957, + "epoch": 0.22828799038027958, + "flos": 12202554395520.0, + "grad_norm": 2.4134394697481674, + "language_loss": 0.9548254, + "learning_rate": 3.6015332465826188e-06, + "loss": 0.97683465, + "num_input_tokens_seen": 81736565, + "router_z_loss_clip": 0.98388672, + "router_z_loss_mlp": 0.16772461, + "step": 3797, + "time_per_iteration": 2.432433843612671 + }, + { + "auxiliary_loss_clip": 0.01158143, + "auxiliary_loss_mlp": 0.01041554, + "balance_loss_clip": 1.06263089, + "balance_loss_mlp": 1.02473402, + "epoch": 0.22834811363294755, + "flos": 22085708803200.0, + "grad_norm": 1.623663088700137, + "language_loss": 0.81037641, + "learning_rate": 3.601299937834666e-06, + "loss": 0.83237338, + "num_input_tokens_seen": 81756240, + "router_z_loss_clip": 0.95507812, + "router_z_loss_mlp": 0.16821289, + "step": 3798, + "time_per_iteration": 2.49991774559021 + }, + { + "auxiliary_loss_clip": 0.01154087, + "auxiliary_loss_mlp": 0.01039893, + "balance_loss_clip": 1.05574489, + "balance_loss_mlp": 1.02190411, + "epoch": 0.2284082368856155, + "flos": 24860634094080.0, + "grad_norm": 1.9872602850023413, + "language_loss": 0.78699309, + "learning_rate": 3.6010665683652596e-06, + "loss": 0.8089329, + "num_input_tokens_seen": 81775720, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.17980957, + "step": 3799, + "time_per_iteration": 4.179476976394653 + }, + { + "auxiliary_loss_clip": 0.01171653, + "auxiliary_loss_mlp": 0.01048595, + "balance_loss_clip": 1.0733043, + "balance_loss_mlp": 1.03122628, + "epoch": 0.22846836013828348, + "flos": 23292882109440.0, + "grad_norm": 1.856452927933955, + "language_loss": 0.75190997, + "learning_rate": 3.6008331381832484e-06, + "loss": 0.77411246, + "num_input_tokens_seen": 81795830, + "router_z_loss_clip": 0.98291016, + "router_z_loss_mlp": 0.17358398, + "step": 3800, + "time_per_iteration": 2.5387320518493652 + }, + { + "auxiliary_loss_clip": 0.0115265, + "auxiliary_loss_mlp": 0.01038837, + "balance_loss_clip": 1.05759501, + "balance_loss_mlp": 1.02294612, + "epoch": 0.22852848339095144, + "flos": 27416288810880.0, + "grad_norm": 1.8106016264490534, + "language_loss": 0.64096594, + "learning_rate": 3.600599647297484e-06, + "loss": 0.66288078, + "num_input_tokens_seen": 81815745, + "router_z_loss_clip": 0.95068359, + "router_z_loss_mlp": 0.15905762, + "step": 3801, + "time_per_iteration": 2.5262653827667236 + }, + { + "auxiliary_loss_clip": 0.01148969, + "auxiliary_loss_mlp": 0.01037049, + "balance_loss_clip": 1.05590439, + "balance_loss_mlp": 1.02140915, + "epoch": 0.2285886066436194, + "flos": 26321157002880.0, + "grad_norm": 1.615761115504479, + "language_loss": 0.81869942, + "learning_rate": 3.60036609571682e-06, + "loss": 0.8405596, + "num_input_tokens_seen": 81835155, + "router_z_loss_clip": 0.93017578, + "router_z_loss_mlp": 0.15637207, + "step": 3802, + "time_per_iteration": 2.5638394355773926 + }, + { + "auxiliary_loss_clip": 0.01150521, + "auxiliary_loss_mlp": 0.01048289, + "balance_loss_clip": 1.05445898, + "balance_loss_mlp": 1.0307889, + "epoch": 0.2286487298962874, + "flos": 29716475022720.0, + "grad_norm": 1.7141049405765536, + "language_loss": 0.79280818, + "learning_rate": 3.600132483450114e-06, + "loss": 0.81479633, + "num_input_tokens_seen": 81855655, + "router_z_loss_clip": 0.96142578, + "router_z_loss_mlp": 0.1751709, + "step": 3803, + "time_per_iteration": 2.5750935077667236 + }, + { + "auxiliary_loss_clip": 0.01158442, + "auxiliary_loss_mlp": 0.01047274, + "balance_loss_clip": 1.05691206, + "balance_loss_mlp": 1.02903497, + "epoch": 0.22870885314895537, + "flos": 21287199507840.0, + "grad_norm": 1.8057424624594045, + "language_loss": 0.85283518, + "learning_rate": 3.5998988105062235e-06, + "loss": 0.87489235, + "num_input_tokens_seen": 81876385, + "router_z_loss_clip": 1.01660156, + "router_z_loss_mlp": 0.18225098, + "step": 3804, + "time_per_iteration": 2.523197889328003 + }, + { + "auxiliary_loss_clip": 0.01169026, + "auxiliary_loss_mlp": 0.01041891, + "balance_loss_clip": 1.06807184, + "balance_loss_mlp": 1.0247246, + "epoch": 0.22876897640162333, + "flos": 14939450161920.0, + "grad_norm": 2.318884283847781, + "language_loss": 0.76124758, + "learning_rate": 3.59966507689401e-06, + "loss": 0.78335673, + "num_input_tokens_seen": 81893225, + "router_z_loss_clip": 1.01025391, + "router_z_loss_mlp": 0.17163086, + "step": 3805, + "time_per_iteration": 2.4555184841156006 + }, + { + "auxiliary_loss_clip": 0.01165178, + "auxiliary_loss_mlp": 0.01048227, + "balance_loss_clip": 1.0642451, + "balance_loss_mlp": 1.02972603, + "epoch": 0.2288290996542913, + "flos": 18113917409280.0, + "grad_norm": 2.2488637366711957, + "language_loss": 0.79013705, + "learning_rate": 3.5994312826223363e-06, + "loss": 0.81227112, + "num_input_tokens_seen": 81911350, + "router_z_loss_clip": 1.00976562, + "router_z_loss_mlp": 0.18493652, + "step": 3806, + "time_per_iteration": 2.465956687927246 + }, + { + "auxiliary_loss_clip": 0.01161833, + "auxiliary_loss_mlp": 0.01055629, + "balance_loss_clip": 1.06166911, + "balance_loss_mlp": 1.03692508, + "epoch": 0.22888922290695926, + "flos": 39855457071360.0, + "grad_norm": 2.117495113142631, + "language_loss": 0.69847292, + "learning_rate": 3.5991974277000684e-06, + "loss": 0.72064757, + "num_input_tokens_seen": 81935420, + "router_z_loss_clip": 1.00292969, + "router_z_loss_mlp": 0.18713379, + "step": 3807, + "time_per_iteration": 2.6168465614318848 + }, + { + "auxiliary_loss_clip": 0.01165845, + "auxiliary_loss_mlp": 0.01054487, + "balance_loss_clip": 1.06317854, + "balance_loss_mlp": 1.03596163, + "epoch": 0.22894934615962723, + "flos": 23403774372480.0, + "grad_norm": 2.5766191001712615, + "language_loss": 0.64860773, + "learning_rate": 3.5989635121360733e-06, + "loss": 0.67081106, + "num_input_tokens_seen": 81953845, + "router_z_loss_clip": 1.02587891, + "router_z_loss_mlp": 0.18518066, + "step": 3808, + "time_per_iteration": 3.8830909729003906 + }, + { + "auxiliary_loss_clip": 0.01161306, + "auxiliary_loss_mlp": 0.01049072, + "balance_loss_clip": 1.06246245, + "balance_loss_mlp": 1.03145254, + "epoch": 0.22900946941229522, + "flos": 18843011671680.0, + "grad_norm": 5.6275497219393, + "language_loss": 0.74818397, + "learning_rate": 3.598729535939222e-06, + "loss": 0.77028775, + "num_input_tokens_seen": 81972100, + "router_z_loss_clip": 0.98925781, + "router_z_loss_mlp": 0.17614746, + "step": 3809, + "time_per_iteration": 2.477208375930786 + }, + { + "auxiliary_loss_clip": 0.01173313, + "auxiliary_loss_mlp": 0.01046485, + "balance_loss_clip": 1.07299101, + "balance_loss_mlp": 1.02971244, + "epoch": 0.22906959266496318, + "flos": 22929394429440.0, + "grad_norm": 1.5894925229514283, + "language_loss": 0.81609046, + "learning_rate": 3.5984954991183862e-06, + "loss": 0.83828843, + "num_input_tokens_seen": 81992760, + "router_z_loss_clip": 1.00341797, + "router_z_loss_mlp": 0.16760254, + "step": 3810, + "time_per_iteration": 2.5049846172332764 + }, + { + "auxiliary_loss_clip": 0.01154608, + "auxiliary_loss_mlp": 0.01050307, + "balance_loss_clip": 1.05735183, + "balance_loss_mlp": 1.0321039, + "epoch": 0.22912971591763115, + "flos": 19354523299200.0, + "grad_norm": 2.8928919694762203, + "language_loss": 0.786484, + "learning_rate": 3.598261401682441e-06, + "loss": 0.80853325, + "num_input_tokens_seen": 82009080, + "router_z_loss_clip": 0.97265625, + "router_z_loss_mlp": 0.18188477, + "step": 3811, + "time_per_iteration": 3.9999074935913086 + }, + { + "auxiliary_loss_clip": 0.01156571, + "auxiliary_loss_mlp": 0.01042625, + "balance_loss_clip": 1.06038487, + "balance_loss_mlp": 1.02529192, + "epoch": 0.22918983917029911, + "flos": 19933546538880.0, + "grad_norm": 1.6640854583476172, + "language_loss": 0.8292222, + "learning_rate": 3.5980272436402632e-06, + "loss": 0.85121417, + "num_input_tokens_seen": 82026705, + "router_z_loss_clip": 0.9609375, + "router_z_loss_mlp": 0.17321777, + "step": 3812, + "time_per_iteration": 2.4777705669403076 + }, + { + "auxiliary_loss_clip": 0.01163694, + "auxiliary_loss_mlp": 0.01051929, + "balance_loss_clip": 1.06155992, + "balance_loss_mlp": 1.03477478, + "epoch": 0.22924996242296708, + "flos": 16690885320960.0, + "grad_norm": 3.1238618180625575, + "language_loss": 0.83121055, + "learning_rate": 3.5977930250007324e-06, + "loss": 0.85336673, + "num_input_tokens_seen": 82043245, + "router_z_loss_clip": 1.02001953, + "router_z_loss_mlp": 0.17138672, + "step": 3813, + "time_per_iteration": 2.537266254425049 + }, + { + "auxiliary_loss_clip": 0.01161038, + "auxiliary_loss_mlp": 0.01047545, + "balance_loss_clip": 1.06172943, + "balance_loss_mlp": 1.03122568, + "epoch": 0.22931008567563504, + "flos": 33036164956800.0, + "grad_norm": 2.6205354380578356, + "language_loss": 0.700984, + "learning_rate": 3.5975587457727298e-06, + "loss": 0.72306991, + "num_input_tokens_seen": 82066870, + "router_z_loss_clip": 0.99169922, + "router_z_loss_mlp": 0.16320801, + "step": 3814, + "time_per_iteration": 2.6028940677642822 + }, + { + "auxiliary_loss_clip": 0.01156022, + "auxiliary_loss_mlp": 0.01046499, + "balance_loss_clip": 1.06011701, + "balance_loss_mlp": 1.02961898, + "epoch": 0.229370208928303, + "flos": 23330696152320.0, + "grad_norm": 2.6416228035158813, + "language_loss": 0.67150354, + "learning_rate": 3.597324405965139e-06, + "loss": 0.69352877, + "num_input_tokens_seen": 82083180, + "router_z_loss_clip": 0.95898438, + "router_z_loss_mlp": 0.16882324, + "step": 3815, + "time_per_iteration": 2.500624179840088 + }, + { + "auxiliary_loss_clip": 0.01161433, + "auxiliary_loss_mlp": 0.01045576, + "balance_loss_clip": 1.06432438, + "balance_loss_mlp": 1.02920866, + "epoch": 0.229430332180971, + "flos": 28617213150720.0, + "grad_norm": 1.6681651619132898, + "language_loss": 0.83224201, + "learning_rate": 3.597090005586848e-06, + "loss": 0.85431206, + "num_input_tokens_seen": 82102950, + "router_z_loss_clip": 0.97167969, + "router_z_loss_mlp": 0.16333008, + "step": 3816, + "time_per_iteration": 2.561598300933838 + }, + { + "auxiliary_loss_clip": 0.01164478, + "auxiliary_loss_mlp": 0.01047544, + "balance_loss_clip": 1.06488776, + "balance_loss_mlp": 1.02951908, + "epoch": 0.22949045543363897, + "flos": 17238199829760.0, + "grad_norm": 2.0666143674000894, + "language_loss": 0.87343961, + "learning_rate": 3.596855544646742e-06, + "loss": 0.89555979, + "num_input_tokens_seen": 82119510, + "router_z_loss_clip": 0.99658203, + "router_z_loss_mlp": 0.18017578, + "step": 3817, + "time_per_iteration": 2.459993362426758 + }, + { + "auxiliary_loss_clip": 0.01166376, + "auxiliary_loss_mlp": 0.01049086, + "balance_loss_clip": 1.06396961, + "balance_loss_mlp": 1.03178859, + "epoch": 0.22955057868630693, + "flos": 27489438858240.0, + "grad_norm": 1.7067349532675076, + "language_loss": 0.7479822, + "learning_rate": 3.5966210231537154e-06, + "loss": 0.77013683, + "num_input_tokens_seen": 82140095, + "router_z_loss_clip": 1.02441406, + "router_z_loss_mlp": 0.17297363, + "step": 3818, + "time_per_iteration": 2.591017961502075 + }, + { + "auxiliary_loss_clip": 0.01159641, + "auxiliary_loss_mlp": 0.01043277, + "balance_loss_clip": 1.06167269, + "balance_loss_mlp": 1.02581298, + "epoch": 0.2296107019389749, + "flos": 23476421629440.0, + "grad_norm": 1.8599106047335368, + "language_loss": 0.74555683, + "learning_rate": 3.596386441116659e-06, + "loss": 0.76758599, + "num_input_tokens_seen": 82159510, + "router_z_loss_clip": 0.97949219, + "router_z_loss_mlp": 0.17468262, + "step": 3819, + "time_per_iteration": 2.5369434356689453 + }, + { + "auxiliary_loss_clip": 0.01154702, + "auxiliary_loss_mlp": 0.01042566, + "balance_loss_clip": 1.05922282, + "balance_loss_mlp": 1.02587676, + "epoch": 0.22967082519164286, + "flos": 31285160760960.0, + "grad_norm": 2.108688236033096, + "language_loss": 0.80757463, + "learning_rate": 3.5961517985444684e-06, + "loss": 0.82954729, + "num_input_tokens_seen": 82179580, + "router_z_loss_clip": 0.95458984, + "router_z_loss_mlp": 0.16699219, + "step": 3820, + "time_per_iteration": 2.6868386268615723 + }, + { + "auxiliary_loss_clip": 0.01160686, + "auxiliary_loss_mlp": 0.01050807, + "balance_loss_clip": 1.0595839, + "balance_loss_mlp": 1.03224635, + "epoch": 0.22973094844431083, + "flos": 14642935390080.0, + "grad_norm": 2.7918239678765833, + "language_loss": 0.68838835, + "learning_rate": 3.595917095446042e-06, + "loss": 0.71050334, + "num_input_tokens_seen": 82195585, + "router_z_loss_clip": 1.01220703, + "router_z_loss_mlp": 0.18566895, + "step": 3821, + "time_per_iteration": 2.538726568222046 + }, + { + "auxiliary_loss_clip": 0.01160143, + "auxiliary_loss_mlp": 0.01040713, + "balance_loss_clip": 1.0623343, + "balance_loss_mlp": 1.02353537, + "epoch": 0.2297910716969788, + "flos": 22823853292800.0, + "grad_norm": 1.6487899970627369, + "language_loss": 0.82993865, + "learning_rate": 3.5956823318302796e-06, + "loss": 0.85194719, + "num_input_tokens_seen": 82217530, + "router_z_loss_clip": 0.97802734, + "router_z_loss_mlp": 0.171875, + "step": 3822, + "time_per_iteration": 2.587218999862671 + }, + { + "auxiliary_loss_clip": 0.0115811, + "auxiliary_loss_mlp": 0.01039579, + "balance_loss_clip": 1.06144166, + "balance_loss_mlp": 1.02216291, + "epoch": 0.2298511949496468, + "flos": 23039029716480.0, + "grad_norm": 1.836085157417302, + "language_loss": 0.66558719, + "learning_rate": 3.5954475077060833e-06, + "loss": 0.68756413, + "num_input_tokens_seen": 82237980, + "router_z_loss_clip": 0.96728516, + "router_z_loss_mlp": 0.17419434, + "step": 3823, + "time_per_iteration": 2.601776361465454 + }, + { + "auxiliary_loss_clip": 0.01085963, + "auxiliary_loss_mlp": 0.01005575, + "balance_loss_clip": 1.05085754, + "balance_loss_mlp": 1.00344968, + "epoch": 0.22991131820231475, + "flos": 66890914911360.0, + "grad_norm": 0.8026310065163665, + "language_loss": 0.56762803, + "learning_rate": 3.595212623082357e-06, + "loss": 0.58854342, + "num_input_tokens_seen": 82301785, + "router_z_loss_clip": 0.35058594, + "router_z_loss_mlp": 0.02127075, + "step": 3824, + "time_per_iteration": 3.176443576812744 + }, + { + "auxiliary_loss_clip": 0.01151608, + "auxiliary_loss_mlp": 0.0104337, + "balance_loss_clip": 1.05626225, + "balance_loss_mlp": 1.02510762, + "epoch": 0.22997144145498272, + "flos": 17887248633600.0, + "grad_norm": 1.9906835565383305, + "language_loss": 0.72984159, + "learning_rate": 3.594977677968009e-06, + "loss": 0.75179136, + "num_input_tokens_seen": 82317355, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.18261719, + "step": 3825, + "time_per_iteration": 2.575723171234131 + }, + { + "auxiliary_loss_clip": 0.01159387, + "auxiliary_loss_mlp": 0.01047069, + "balance_loss_clip": 1.06015968, + "balance_loss_mlp": 1.02933085, + "epoch": 0.23003156470765068, + "flos": 24676843178880.0, + "grad_norm": 2.285787716603154, + "language_loss": 0.87890923, + "learning_rate": 3.5947426723719473e-06, + "loss": 0.9009738, + "num_input_tokens_seen": 82336645, + "router_z_loss_clip": 0.99169922, + "router_z_loss_mlp": 0.17736816, + "step": 3826, + "time_per_iteration": 2.567401170730591 + }, + { + "auxiliary_loss_clip": 0.01163238, + "auxiliary_loss_mlp": 0.01041793, + "balance_loss_clip": 1.06178069, + "balance_loss_mlp": 1.02389967, + "epoch": 0.23009168796031865, + "flos": 15814126247040.0, + "grad_norm": 2.673156459399536, + "language_loss": 0.81510395, + "learning_rate": 3.594507606303083e-06, + "loss": 0.83715427, + "num_input_tokens_seen": 82354225, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.17895508, + "step": 3827, + "time_per_iteration": 2.434699296951294 + }, + { + "auxiliary_loss_clip": 0.01146395, + "auxiliary_loss_mlp": 0.01045746, + "balance_loss_clip": 1.05385804, + "balance_loss_mlp": 1.02865171, + "epoch": 0.2301518112129866, + "flos": 16212842190720.0, + "grad_norm": 1.913568947098553, + "language_loss": 0.86939901, + "learning_rate": 3.5942724797703314e-06, + "loss": 0.89132047, + "num_input_tokens_seen": 82370240, + "router_z_loss_clip": 0.92626953, + "router_z_loss_mlp": 0.17102051, + "step": 3828, + "time_per_iteration": 2.464400053024292 + }, + { + "auxiliary_loss_clip": 0.01170243, + "auxiliary_loss_mlp": 0.01046517, + "balance_loss_clip": 1.07173014, + "balance_loss_mlp": 1.02899313, + "epoch": 0.2302119344656546, + "flos": 20595452411520.0, + "grad_norm": 2.25245392613583, + "language_loss": 0.70558029, + "learning_rate": 3.594037292782607e-06, + "loss": 0.72774792, + "num_input_tokens_seen": 82389145, + "router_z_loss_clip": 0.98388672, + "router_z_loss_mlp": 0.17529297, + "step": 3829, + "time_per_iteration": 2.457972764968872 + }, + { + "auxiliary_loss_clip": 0.0116124, + "auxiliary_loss_mlp": 0.01050185, + "balance_loss_clip": 1.06775534, + "balance_loss_mlp": 1.03229213, + "epoch": 0.23027205771832257, + "flos": 26796901662720.0, + "grad_norm": 1.5345112225562856, + "language_loss": 0.84685946, + "learning_rate": 3.5938020453488293e-06, + "loss": 0.86897373, + "num_input_tokens_seen": 82409185, + "router_z_loss_clip": 0.93505859, + "router_z_loss_mlp": 0.17871094, + "step": 3830, + "time_per_iteration": 2.514909029006958 + }, + { + "auxiliary_loss_clip": 0.0116072, + "auxiliary_loss_mlp": 0.01049179, + "balance_loss_clip": 1.06145501, + "balance_loss_mlp": 1.03313363, + "epoch": 0.23033218097099054, + "flos": 43873143068160.0, + "grad_norm": 1.7247775861313317, + "language_loss": 0.67400157, + "learning_rate": 3.5935667374779177e-06, + "loss": 0.69610053, + "num_input_tokens_seen": 82432070, + "router_z_loss_clip": 0.99072266, + "router_z_loss_mlp": 0.16040039, + "step": 3831, + "time_per_iteration": 2.64463472366333 + }, + { + "auxiliary_loss_clip": 0.01176221, + "auxiliary_loss_mlp": 0.01046671, + "balance_loss_clip": 1.07661343, + "balance_loss_mlp": 1.02993369, + "epoch": 0.2303923042236585, + "flos": 26067663745920.0, + "grad_norm": 2.123218560563812, + "language_loss": 0.74636823, + "learning_rate": 3.5933313691787957e-06, + "loss": 0.76859713, + "num_input_tokens_seen": 82450625, + "router_z_loss_clip": 0.99560547, + "router_z_loss_mlp": 0.1673584, + "step": 3832, + "time_per_iteration": 2.495807409286499 + }, + { + "auxiliary_loss_clip": 0.0115797, + "auxiliary_loss_mlp": 0.01046743, + "balance_loss_clip": 1.06273246, + "balance_loss_mlp": 1.02912426, + "epoch": 0.23045242747632647, + "flos": 18296379521280.0, + "grad_norm": 2.7290025651522667, + "language_loss": 0.86852229, + "learning_rate": 3.593095940460389e-06, + "loss": 0.89056945, + "num_input_tokens_seen": 82468575, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.17614746, + "step": 3833, + "time_per_iteration": 2.442239284515381 + }, + { + "auxiliary_loss_clip": 0.01159997, + "auxiliary_loss_mlp": 0.01041832, + "balance_loss_clip": 1.06488824, + "balance_loss_mlp": 1.02535725, + "epoch": 0.23051255072899443, + "flos": 25520528805120.0, + "grad_norm": 1.702556289148058, + "language_loss": 0.75015211, + "learning_rate": 3.592860451331624e-06, + "loss": 0.77217036, + "num_input_tokens_seen": 82488655, + "router_z_loss_clip": 0.95019531, + "router_z_loss_mlp": 0.16479492, + "step": 3834, + "time_per_iteration": 2.5079452991485596 + }, + { + "auxiliary_loss_clip": 0.01146298, + "auxiliary_loss_mlp": 0.01052041, + "balance_loss_clip": 1.05308211, + "balance_loss_mlp": 1.03377795, + "epoch": 0.2305726739816624, + "flos": 21215198695680.0, + "grad_norm": 1.8010279320217264, + "language_loss": 0.86388224, + "learning_rate": 3.592624901801432e-06, + "loss": 0.88586557, + "num_input_tokens_seen": 82507220, + "router_z_loss_clip": 0.93212891, + "router_z_loss_mlp": 0.18286133, + "step": 3835, + "time_per_iteration": 2.4734227657318115 + }, + { + "auxiliary_loss_clip": 0.01159251, + "auxiliary_loss_mlp": 0.01046031, + "balance_loss_clip": 1.06022489, + "balance_loss_mlp": 1.02853084, + "epoch": 0.2306327972343304, + "flos": 23331127115520.0, + "grad_norm": 2.099268957899418, + "language_loss": 0.82058382, + "learning_rate": 3.5923892918787432e-06, + "loss": 0.84263659, + "num_input_tokens_seen": 82527920, + "router_z_loss_clip": 0.99072266, + "router_z_loss_mlp": 0.1751709, + "step": 3836, + "time_per_iteration": 2.59615159034729 + }, + { + "auxiliary_loss_clip": 0.01166868, + "auxiliary_loss_mlp": 0.01045142, + "balance_loss_clip": 1.06715834, + "balance_loss_mlp": 1.02821481, + "epoch": 0.23069292048699835, + "flos": 20666734951680.0, + "grad_norm": 1.5716471625895105, + "language_loss": 0.79770076, + "learning_rate": 3.5921536215724934e-06, + "loss": 0.81982088, + "num_input_tokens_seen": 82549040, + "router_z_loss_clip": 0.99658203, + "router_z_loss_mlp": 0.16918945, + "step": 3837, + "time_per_iteration": 2.5143253803253174 + }, + { + "auxiliary_loss_clip": 0.01103137, + "auxiliary_loss_mlp": 0.01019011, + "balance_loss_clip": 1.06755161, + "balance_loss_mlp": 1.01714587, + "epoch": 0.23075304373966632, + "flos": 70454832393600.0, + "grad_norm": 0.9054741215790042, + "language_loss": 0.65373802, + "learning_rate": 3.5919178908916184e-06, + "loss": 0.67495954, + "num_input_tokens_seen": 82604070, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.01864624, + "step": 3838, + "time_per_iteration": 3.0087361335754395 + }, + { + "auxiliary_loss_clip": 0.01155925, + "auxiliary_loss_mlp": 0.0104945, + "balance_loss_clip": 1.06214976, + "balance_loss_mlp": 1.03355908, + "epoch": 0.23081316699233428, + "flos": 16617986668800.0, + "grad_norm": 1.885393171635673, + "language_loss": 0.75582969, + "learning_rate": 3.591682099845058e-06, + "loss": 0.77788341, + "num_input_tokens_seen": 82619665, + "router_z_loss_clip": 0.93701172, + "router_z_loss_mlp": 0.15893555, + "step": 3839, + "time_per_iteration": 2.460496187210083 + }, + { + "auxiliary_loss_clip": 0.01152733, + "auxiliary_loss_mlp": 0.01047738, + "balance_loss_clip": 1.05647385, + "balance_loss_mlp": 1.02976108, + "epoch": 0.23087329024500225, + "flos": 13298081253120.0, + "grad_norm": 2.1135111844962937, + "language_loss": 0.68721652, + "learning_rate": 3.591446248441752e-06, + "loss": 0.70922124, + "num_input_tokens_seen": 82637530, + "router_z_loss_clip": 0.96386719, + "router_z_loss_mlp": 0.17980957, + "step": 3840, + "time_per_iteration": 2.560317039489746 + }, + { + "auxiliary_loss_clip": 0.01167753, + "auxiliary_loss_mlp": 0.01048339, + "balance_loss_clip": 1.06777692, + "balance_loss_mlp": 1.0295639, + "epoch": 0.23093341349767021, + "flos": 17785729820160.0, + "grad_norm": 1.988327318437386, + "language_loss": 0.79286861, + "learning_rate": 3.591210336690645e-06, + "loss": 0.81502956, + "num_input_tokens_seen": 82656130, + "router_z_loss_clip": 0.99951172, + "router_z_loss_mlp": 0.18786621, + "step": 3841, + "time_per_iteration": 2.494222402572632 + }, + { + "auxiliary_loss_clip": 0.01157216, + "auxiliary_loss_mlp": 0.0104255, + "balance_loss_clip": 1.06049836, + "balance_loss_mlp": 1.02744675, + "epoch": 0.23099353675033818, + "flos": 23988076911360.0, + "grad_norm": 1.7687317078318097, + "language_loss": 0.83186138, + "learning_rate": 3.590974364600683e-06, + "loss": 0.85385907, + "num_input_tokens_seen": 82675295, + "router_z_loss_clip": 0.96728516, + "router_z_loss_mlp": 0.15100098, + "step": 3842, + "time_per_iteration": 2.5238516330718994 + }, + { + "auxiliary_loss_clip": 0.01153529, + "auxiliary_loss_mlp": 0.01043176, + "balance_loss_clip": 1.05838823, + "balance_loss_mlp": 1.02575958, + "epoch": 0.23105366000300617, + "flos": 35995168471680.0, + "grad_norm": 1.5170985587745949, + "language_loss": 0.66559142, + "learning_rate": 3.5907383321808135e-06, + "loss": 0.68755847, + "num_input_tokens_seen": 82703260, + "router_z_loss_clip": 0.95019531, + "router_z_loss_mlp": 0.17431641, + "step": 3843, + "time_per_iteration": 4.148302316665649 + }, + { + "auxiliary_loss_clip": 0.01151729, + "auxiliary_loss_mlp": 0.01045544, + "balance_loss_clip": 1.05985332, + "balance_loss_mlp": 1.02890253, + "epoch": 0.23111378325567414, + "flos": 31245335556480.0, + "grad_norm": 1.6731970917670584, + "language_loss": 0.77468443, + "learning_rate": 3.590502239439987e-06, + "loss": 0.79665709, + "num_input_tokens_seen": 82725060, + "router_z_loss_clip": 0.91796875, + "router_z_loss_mlp": 0.16638184, + "step": 3844, + "time_per_iteration": 2.573371648788452 + }, + { + "auxiliary_loss_clip": 0.01154651, + "auxiliary_loss_mlp": 0.01049391, + "balance_loss_clip": 1.05704618, + "balance_loss_mlp": 1.02965021, + "epoch": 0.2311739065083421, + "flos": 19208223204480.0, + "grad_norm": 1.6478799414837377, + "language_loss": 0.78304124, + "learning_rate": 3.590266086387156e-06, + "loss": 0.80508167, + "num_input_tokens_seen": 82742960, + "router_z_loss_clip": 0.97558594, + "router_z_loss_mlp": 0.1973877, + "step": 3845, + "time_per_iteration": 2.488835096359253 + }, + { + "auxiliary_loss_clip": 0.01153279, + "auxiliary_loss_mlp": 0.0103792, + "balance_loss_clip": 1.06174612, + "balance_loss_mlp": 1.02254236, + "epoch": 0.23123402976101007, + "flos": 23360178240000.0, + "grad_norm": 2.3176445456303445, + "language_loss": 0.76663059, + "learning_rate": 3.590029873031276e-06, + "loss": 0.78854257, + "num_input_tokens_seen": 82760205, + "router_z_loss_clip": 0.91455078, + "router_z_loss_mlp": 0.15362549, + "step": 3846, + "time_per_iteration": 2.4953784942626953 + }, + { + "auxiliary_loss_clip": 0.01160455, + "auxiliary_loss_mlp": 0.0104597, + "balance_loss_clip": 1.0635097, + "balance_loss_mlp": 1.02934051, + "epoch": 0.23129415301367803, + "flos": 13735365425280.0, + "grad_norm": 2.047546376064053, + "language_loss": 0.69480658, + "learning_rate": 3.589793599381304e-06, + "loss": 0.71687084, + "num_input_tokens_seen": 82778590, + "router_z_loss_clip": 0.96923828, + "router_z_loss_mlp": 0.16625977, + "step": 3847, + "time_per_iteration": 2.457710027694702 + }, + { + "auxiliary_loss_clip": 0.01109975, + "auxiliary_loss_mlp": 0.01015089, + "balance_loss_clip": 1.07343125, + "balance_loss_mlp": 1.01304173, + "epoch": 0.231354276266346, + "flos": 69737015001600.0, + "grad_norm": 0.7917082865559307, + "language_loss": 0.6103071, + "learning_rate": 3.589557265446198e-06, + "loss": 0.63155782, + "num_input_tokens_seen": 82833925, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.02047729, + "step": 3848, + "time_per_iteration": 3.104973316192627 + }, + { + "auxiliary_loss_clip": 0.01149974, + "auxiliary_loss_mlp": 0.01046736, + "balance_loss_clip": 1.05400074, + "balance_loss_mlp": 1.02812719, + "epoch": 0.231414399519014, + "flos": 18835900778880.0, + "grad_norm": 2.0970232458101363, + "language_loss": 0.78350341, + "learning_rate": 3.589320871234923e-06, + "loss": 0.80547059, + "num_input_tokens_seen": 82850625, + "router_z_loss_clip": 0.95947266, + "router_z_loss_mlp": 0.18603516, + "step": 3849, + "time_per_iteration": 2.476198673248291 + }, + { + "auxiliary_loss_clip": 0.0117532, + "auxiliary_loss_mlp": 0.01041011, + "balance_loss_clip": 1.07638109, + "balance_loss_mlp": 1.02414298, + "epoch": 0.23147452277168196, + "flos": 36135470995200.0, + "grad_norm": 2.5144152431796494, + "language_loss": 0.71430552, + "learning_rate": 3.5890844167564405e-06, + "loss": 0.73646879, + "num_input_tokens_seen": 82872105, + "router_z_loss_clip": 0.98974609, + "router_z_loss_mlp": 0.16864014, + "step": 3850, + "time_per_iteration": 2.5829944610595703 + }, + { + "auxiliary_loss_clip": 0.011647, + "auxiliary_loss_mlp": 0.01037411, + "balance_loss_clip": 1.06856942, + "balance_loss_mlp": 1.02081752, + "epoch": 0.23153464602434992, + "flos": 20812927305600.0, + "grad_norm": 2.2365745206006196, + "language_loss": 0.7646594, + "learning_rate": 3.588847902019718e-06, + "loss": 0.78668058, + "num_input_tokens_seen": 82890595, + "router_z_loss_clip": 0.96240234, + "router_z_loss_mlp": 0.16577148, + "step": 3851, + "time_per_iteration": 2.4725708961486816 + }, + { + "auxiliary_loss_clip": 0.01153173, + "auxiliary_loss_mlp": 0.01040955, + "balance_loss_clip": 1.05919373, + "balance_loss_mlp": 1.02411044, + "epoch": 0.2315947692770179, + "flos": 19939256801280.0, + "grad_norm": 1.686911747112298, + "language_loss": 0.69907105, + "learning_rate": 3.588611327033723e-06, + "loss": 0.72101241, + "num_input_tokens_seen": 82908910, + "router_z_loss_clip": 0.93945312, + "router_z_loss_mlp": 0.16833496, + "step": 3852, + "time_per_iteration": 4.060085773468018 + }, + { + "auxiliary_loss_clip": 0.01158347, + "auxiliary_loss_mlp": 0.01043397, + "balance_loss_clip": 1.06125426, + "balance_loss_mlp": 1.02613568, + "epoch": 0.23165489252968585, + "flos": 12855553695360.0, + "grad_norm": 5.173779062611918, + "language_loss": 0.67641819, + "learning_rate": 3.588374691807428e-06, + "loss": 0.69843554, + "num_input_tokens_seen": 82925405, + "router_z_loss_clip": 0.96972656, + "router_z_loss_mlp": 0.17248535, + "step": 3853, + "time_per_iteration": 2.463052749633789 + }, + { + "auxiliary_loss_clip": 0.01162204, + "auxiliary_loss_mlp": 0.01038338, + "balance_loss_clip": 1.06306887, + "balance_loss_mlp": 1.02044463, + "epoch": 0.23171501578235382, + "flos": 30628282792320.0, + "grad_norm": 1.7051602318921815, + "language_loss": 0.7990157, + "learning_rate": 3.5881379963498053e-06, + "loss": 0.82102108, + "num_input_tokens_seen": 82945615, + "router_z_loss_clip": 0.99267578, + "router_z_loss_mlp": 0.17895508, + "step": 3854, + "time_per_iteration": 4.013747215270996 + }, + { + "auxiliary_loss_clip": 0.01161955, + "auxiliary_loss_mlp": 0.01050044, + "balance_loss_clip": 1.05770993, + "balance_loss_mlp": 1.03144777, + "epoch": 0.23177513903502178, + "flos": 23842782397440.0, + "grad_norm": 5.443862814787617, + "language_loss": 0.65315741, + "learning_rate": 3.587901240669831e-06, + "loss": 0.67527735, + "num_input_tokens_seen": 82967570, + "router_z_loss_clip": 1.04296875, + "router_z_loss_mlp": 0.18603516, + "step": 3855, + "time_per_iteration": 2.482537031173706 + }, + { + "auxiliary_loss_clip": 0.01163576, + "auxiliary_loss_mlp": 0.01046076, + "balance_loss_clip": 1.06366467, + "balance_loss_mlp": 1.02945828, + "epoch": 0.23183526228768978, + "flos": 29570282668800.0, + "grad_norm": 3.1097384450573298, + "language_loss": 0.70856249, + "learning_rate": 3.5876644247764815e-06, + "loss": 0.73065901, + "num_input_tokens_seen": 82987435, + "router_z_loss_clip": 0.99755859, + "router_z_loss_mlp": 0.16625977, + "step": 3856, + "time_per_iteration": 3.9450793266296387 + }, + { + "auxiliary_loss_clip": 0.01155833, + "auxiliary_loss_mlp": 0.01035602, + "balance_loss_clip": 1.06219137, + "balance_loss_mlp": 1.02024829, + "epoch": 0.23189538554035774, + "flos": 34458694254720.0, + "grad_norm": 1.6391517411179548, + "language_loss": 0.77381283, + "learning_rate": 3.5874275486787387e-06, + "loss": 0.79572719, + "num_input_tokens_seen": 83010505, + "router_z_loss_clip": 0.93652344, + "router_z_loss_mlp": 0.15356445, + "step": 3857, + "time_per_iteration": 2.5651612281799316 + }, + { + "auxiliary_loss_clip": 0.01161216, + "auxiliary_loss_mlp": 0.01048771, + "balance_loss_clip": 1.061234, + "balance_loss_mlp": 1.03052044, + "epoch": 0.2319555087930257, + "flos": 18003815245440.0, + "grad_norm": 2.5287427296609484, + "language_loss": 0.91221052, + "learning_rate": 3.587190612385584e-06, + "loss": 0.93431044, + "num_input_tokens_seen": 83026705, + "router_z_loss_clip": 0.99902344, + "router_z_loss_mlp": 0.18249512, + "step": 3858, + "time_per_iteration": 2.4171783924102783 + }, + { + "auxiliary_loss_clip": 0.01156338, + "auxiliary_loss_mlp": 0.01043353, + "balance_loss_clip": 1.06351066, + "balance_loss_mlp": 1.02647889, + "epoch": 0.23201563204569367, + "flos": 23143852581120.0, + "grad_norm": 1.971342034897987, + "language_loss": 0.76615405, + "learning_rate": 3.5869536159060026e-06, + "loss": 0.78815091, + "num_input_tokens_seen": 83046500, + "router_z_loss_clip": 0.92773438, + "router_z_loss_mlp": 0.16876221, + "step": 3859, + "time_per_iteration": 2.5693137645721436 + }, + { + "auxiliary_loss_clip": 0.01152811, + "auxiliary_loss_mlp": 0.01036467, + "balance_loss_clip": 1.05710697, + "balance_loss_mlp": 1.01938426, + "epoch": 0.23207575529836164, + "flos": 20667991927680.0, + "grad_norm": 1.716014679550892, + "language_loss": 0.84216487, + "learning_rate": 3.58671655924898e-06, + "loss": 0.8640576, + "num_input_tokens_seen": 83065280, + "router_z_loss_clip": 0.95605469, + "router_z_loss_mlp": 0.1708374, + "step": 3860, + "time_per_iteration": 2.4532153606414795 + }, + { + "auxiliary_loss_clip": 0.01159563, + "auxiliary_loss_mlp": 0.01045929, + "balance_loss_clip": 1.06029058, + "balance_loss_mlp": 1.02816653, + "epoch": 0.2321358785510296, + "flos": 16472189364480.0, + "grad_norm": 1.932591364301357, + "language_loss": 0.82875538, + "learning_rate": 3.586479442423508e-06, + "loss": 0.85081029, + "num_input_tokens_seen": 83082310, + "router_z_loss_clip": 0.99169922, + "router_z_loss_mlp": 0.1776123, + "step": 3861, + "time_per_iteration": 2.502225160598755 + }, + { + "auxiliary_loss_clip": 0.01170016, + "auxiliary_loss_mlp": 0.01044623, + "balance_loss_clip": 1.07090247, + "balance_loss_mlp": 1.02780247, + "epoch": 0.2321960018036976, + "flos": 21616320850560.0, + "grad_norm": 1.7352237875895318, + "language_loss": 0.85513598, + "learning_rate": 3.586242265438576e-06, + "loss": 0.87728238, + "num_input_tokens_seen": 83102065, + "router_z_loss_clip": 0.98974609, + "router_z_loss_mlp": 0.16821289, + "step": 3862, + "time_per_iteration": 2.4860787391662598 + }, + { + "auxiliary_loss_clip": 0.01161875, + "auxiliary_loss_mlp": 0.01043905, + "balance_loss_clip": 1.06810832, + "balance_loss_mlp": 1.02863431, + "epoch": 0.23225612505636556, + "flos": 22271474966400.0, + "grad_norm": 1.5209692438839046, + "language_loss": 0.74843228, + "learning_rate": 3.5860050283031773e-06, + "loss": 0.77049005, + "num_input_tokens_seen": 83121445, + "router_z_loss_clip": 0.93701172, + "router_z_loss_mlp": 0.15270996, + "step": 3863, + "time_per_iteration": 2.499694585800171 + }, + { + "auxiliary_loss_clip": 0.01155898, + "auxiliary_loss_mlp": 0.01042908, + "balance_loss_clip": 1.06270683, + "balance_loss_mlp": 1.02695823, + "epoch": 0.23231624830903352, + "flos": 17052325925760.0, + "grad_norm": 1.6633802537214635, + "language_loss": 0.74589449, + "learning_rate": 3.58576773102631e-06, + "loss": 0.76788259, + "num_input_tokens_seen": 83138175, + "router_z_loss_clip": 0.93115234, + "router_z_loss_mlp": 0.15966797, + "step": 3864, + "time_per_iteration": 2.4436511993408203 + }, + { + "auxiliary_loss_clip": 0.01153722, + "auxiliary_loss_mlp": 0.0103904, + "balance_loss_clip": 1.05824053, + "balance_loss_mlp": 1.02229166, + "epoch": 0.2323763715617015, + "flos": 34640043045120.0, + "grad_norm": 2.0886357976286463, + "language_loss": 0.70862067, + "learning_rate": 3.5855303736169714e-06, + "loss": 0.73054826, + "num_input_tokens_seen": 83161975, + "router_z_loss_clip": 0.95458984, + "router_z_loss_mlp": 0.1673584, + "step": 3865, + "time_per_iteration": 2.7121734619140625 + }, + { + "auxiliary_loss_clip": 0.01173675, + "auxiliary_loss_mlp": 0.01047212, + "balance_loss_clip": 1.06870961, + "balance_loss_mlp": 1.02880645, + "epoch": 0.23243649481436945, + "flos": 25551698832000.0, + "grad_norm": 1.8060380411023926, + "language_loss": 0.94906718, + "learning_rate": 3.5852929560841617e-06, + "loss": 0.9712761, + "num_input_tokens_seen": 83180905, + "router_z_loss_clip": 1.05029297, + "router_z_loss_mlp": 0.18383789, + "step": 3866, + "time_per_iteration": 2.5777575969696045 + }, + { + "auxiliary_loss_clip": 0.01156044, + "auxiliary_loss_mlp": 0.0104469, + "balance_loss_clip": 1.06260228, + "balance_loss_mlp": 1.0288353, + "epoch": 0.23249661806703742, + "flos": 20483482740480.0, + "grad_norm": 3.535760515734554, + "language_loss": 0.73556697, + "learning_rate": 3.5850554784368846e-06, + "loss": 0.75757432, + "num_input_tokens_seen": 83196390, + "router_z_loss_clip": 0.93457031, + "router_z_loss_mlp": 0.15856934, + "step": 3867, + "time_per_iteration": 2.562391757965088 + }, + { + "auxiliary_loss_clip": 0.01154625, + "auxiliary_loss_mlp": 0.01045128, + "balance_loss_clip": 1.05815279, + "balance_loss_mlp": 1.02762842, + "epoch": 0.23255674131970538, + "flos": 20376612800640.0, + "grad_norm": 1.684818615607505, + "language_loss": 0.82515955, + "learning_rate": 3.584817940684145e-06, + "loss": 0.84715712, + "num_input_tokens_seen": 83216165, + "router_z_loss_clip": 0.96386719, + "router_z_loss_mlp": 0.17492676, + "step": 3868, + "time_per_iteration": 2.498121976852417 + }, + { + "auxiliary_loss_clip": 0.01163119, + "auxiliary_loss_mlp": 0.01039894, + "balance_loss_clip": 1.06651199, + "balance_loss_mlp": 1.02324045, + "epoch": 0.23261686457237338, + "flos": 17056096853760.0, + "grad_norm": 1.6597724593273362, + "language_loss": 0.72967172, + "learning_rate": 3.58458034283495e-06, + "loss": 0.75170189, + "num_input_tokens_seen": 83233845, + "router_z_loss_clip": 0.96630859, + "router_z_loss_mlp": 0.16650391, + "step": 3869, + "time_per_iteration": 2.5695102214813232 + }, + { + "auxiliary_loss_clip": 0.01163064, + "auxiliary_loss_mlp": 0.01048657, + "balance_loss_clip": 1.0677712, + "balance_loss_mlp": 1.03257525, + "epoch": 0.23267698782504134, + "flos": 29169878785920.0, + "grad_norm": 1.7071419421644096, + "language_loss": 0.79216719, + "learning_rate": 3.5843426848983097e-06, + "loss": 0.81428444, + "num_input_tokens_seen": 83254930, + "router_z_loss_clip": 0.95263672, + "router_z_loss_mlp": 0.16088867, + "step": 3870, + "time_per_iteration": 2.563793420791626 + }, + { + "auxiliary_loss_clip": 0.01163702, + "auxiliary_loss_mlp": 0.01044831, + "balance_loss_clip": 1.06513834, + "balance_loss_mlp": 1.02766514, + "epoch": 0.2327371110777093, + "flos": 21174655219200.0, + "grad_norm": 2.392429190314773, + "language_loss": 0.70683372, + "learning_rate": 3.5841049668832357e-06, + "loss": 0.72891903, + "num_input_tokens_seen": 83272095, + "router_z_loss_clip": 0.98583984, + "router_z_loss_mlp": 0.17163086, + "step": 3871, + "time_per_iteration": 2.54011869430542 + }, + { + "auxiliary_loss_clip": 0.01173462, + "auxiliary_loss_mlp": 0.01053319, + "balance_loss_clip": 1.07457781, + "balance_loss_mlp": 1.03431749, + "epoch": 0.23279723433037727, + "flos": 24863112132480.0, + "grad_norm": 2.14444843480305, + "language_loss": 0.69148552, + "learning_rate": 3.5838671887987433e-06, + "loss": 0.71375334, + "num_input_tokens_seen": 83290980, + "router_z_loss_clip": 0.98974609, + "router_z_loss_mlp": 0.19006348, + "step": 3872, + "time_per_iteration": 2.6156129837036133 + }, + { + "auxiliary_loss_clip": 0.0116566, + "auxiliary_loss_mlp": 0.01047883, + "balance_loss_clip": 1.06218696, + "balance_loss_mlp": 1.02964413, + "epoch": 0.23285735758304524, + "flos": 38800617344640.0, + "grad_norm": 1.5314525411869868, + "language_loss": 0.77508688, + "learning_rate": 3.5836293506538474e-06, + "loss": 0.79722232, + "num_input_tokens_seen": 83315175, + "router_z_loss_clip": 1.03417969, + "router_z_loss_mlp": 0.18249512, + "step": 3873, + "time_per_iteration": 2.607174873352051 + }, + { + "auxiliary_loss_clip": 0.0112651, + "auxiliary_loss_mlp": 0.01017132, + "balance_loss_clip": 1.09002972, + "balance_loss_mlp": 1.01452994, + "epoch": 0.2329174808357132, + "flos": 53944113692160.0, + "grad_norm": 0.8515272532661018, + "language_loss": 0.60597783, + "learning_rate": 3.5833914524575687e-06, + "loss": 0.62741423, + "num_input_tokens_seen": 83372060, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 0.02606201, + "step": 3874, + "time_per_iteration": 2.9999072551727295 + }, + { + "auxiliary_loss_clip": 0.01157718, + "auxiliary_loss_mlp": 0.01048551, + "balance_loss_clip": 1.06048751, + "balance_loss_mlp": 1.03137338, + "epoch": 0.23297760408838117, + "flos": 21216024708480.0, + "grad_norm": 2.763145938775669, + "language_loss": 0.80431104, + "learning_rate": 3.583153494218927e-06, + "loss": 0.82637376, + "num_input_tokens_seen": 83389795, + "router_z_loss_clip": 0.97216797, + "router_z_loss_mlp": 0.171875, + "step": 3875, + "time_per_iteration": 2.4715044498443604 + }, + { + "auxiliary_loss_clip": 0.0116122, + "auxiliary_loss_mlp": 0.01039694, + "balance_loss_clip": 1.06656539, + "balance_loss_mlp": 1.02429855, + "epoch": 0.23303772734104916, + "flos": 28403006394240.0, + "grad_norm": 1.592800776726731, + "language_loss": 0.61346745, + "learning_rate": 3.5829154759469464e-06, + "loss": 0.63547659, + "num_input_tokens_seen": 83410005, + "router_z_loss_clip": 0.94628906, + "router_z_loss_mlp": 0.15411377, + "step": 3876, + "time_per_iteration": 2.531109094619751 + }, + { + "auxiliary_loss_clip": 0.01166963, + "auxiliary_loss_mlp": 0.01045141, + "balance_loss_clip": 1.06858492, + "balance_loss_mlp": 1.02730703, + "epoch": 0.23309785059371713, + "flos": 24314720215680.0, + "grad_norm": 1.9907205960223215, + "language_loss": 0.70941997, + "learning_rate": 3.5826773976506523e-06, + "loss": 0.73154104, + "num_input_tokens_seen": 83430250, + "router_z_loss_clip": 0.98291016, + "router_z_loss_mlp": 0.17834473, + "step": 3877, + "time_per_iteration": 2.4736671447753906 + }, + { + "auxiliary_loss_clip": 0.01175551, + "auxiliary_loss_mlp": 0.01049403, + "balance_loss_clip": 1.07481265, + "balance_loss_mlp": 1.03156877, + "epoch": 0.2331579738463851, + "flos": 15992925171840.0, + "grad_norm": 2.5097413712039494, + "language_loss": 0.81422824, + "learning_rate": 3.582439259339073e-06, + "loss": 0.83647776, + "num_input_tokens_seen": 83447950, + "router_z_loss_clip": 1.00488281, + "router_z_loss_mlp": 0.1784668, + "step": 3878, + "time_per_iteration": 2.4289090633392334 + }, + { + "auxiliary_loss_clip": 0.01162173, + "auxiliary_loss_mlp": 0.01045689, + "balance_loss_clip": 1.06317878, + "balance_loss_mlp": 1.02705681, + "epoch": 0.23321809709905306, + "flos": 36426957863040.0, + "grad_norm": 1.6069997430735792, + "language_loss": 0.75383812, + "learning_rate": 3.5822010610212374e-06, + "loss": 0.7759167, + "num_input_tokens_seen": 83467785, + "router_z_loss_clip": 0.98974609, + "router_z_loss_mlp": 0.18615723, + "step": 3879, + "time_per_iteration": 2.580007553100586 + }, + { + "auxiliary_loss_clip": 0.01158919, + "auxiliary_loss_mlp": 0.01051843, + "balance_loss_clip": 1.05907238, + "balance_loss_mlp": 1.03297234, + "epoch": 0.23327822035172102, + "flos": 21324762155520.0, + "grad_norm": 2.3054924445290723, + "language_loss": 0.89624566, + "learning_rate": 3.5819628027061795e-06, + "loss": 0.91835332, + "num_input_tokens_seen": 83485390, + "router_z_loss_clip": 0.99804688, + "router_z_loss_mlp": 0.1887207, + "step": 3880, + "time_per_iteration": 2.475895404815674 + }, + { + "auxiliary_loss_clip": 0.0117752, + "auxiliary_loss_mlp": 0.01052387, + "balance_loss_clip": 1.07579112, + "balance_loss_mlp": 1.03551877, + "epoch": 0.233338343604389, + "flos": 19171881619200.0, + "grad_norm": 1.5877635749565453, + "language_loss": 0.7165513, + "learning_rate": 3.5817244844029334e-06, + "loss": 0.73885047, + "num_input_tokens_seen": 83504890, + "router_z_loss_clip": 1.01708984, + "router_z_loss_mlp": 0.16870117, + "step": 3881, + "time_per_iteration": 2.5053796768188477 + }, + { + "auxiliary_loss_clip": 0.0115996, + "auxiliary_loss_mlp": 0.01048134, + "balance_loss_clip": 1.06280875, + "balance_loss_mlp": 1.02940631, + "epoch": 0.23339846685705698, + "flos": 26908368543360.0, + "grad_norm": 1.699469518446959, + "language_loss": 0.68280149, + "learning_rate": 3.581486106120537e-06, + "loss": 0.7048825, + "num_input_tokens_seen": 83526475, + "router_z_loss_clip": 0.97119141, + "router_z_loss_mlp": 0.18713379, + "step": 3882, + "time_per_iteration": 2.5392649173736572 + }, + { + "auxiliary_loss_clip": 0.01172521, + "auxiliary_loss_mlp": 0.0105085, + "balance_loss_clip": 1.07156706, + "balance_loss_mlp": 1.03330266, + "epoch": 0.23345859010972494, + "flos": 32343160884480.0, + "grad_norm": 1.9208459871284467, + "language_loss": 0.7696299, + "learning_rate": 3.5812476678680287e-06, + "loss": 0.79186362, + "num_input_tokens_seen": 83546620, + "router_z_loss_clip": 1.00878906, + "router_z_loss_mlp": 0.17541504, + "step": 3883, + "time_per_iteration": 2.578145742416382 + }, + { + "auxiliary_loss_clip": 0.01108328, + "auxiliary_loss_mlp": 0.01028457, + "balance_loss_clip": 1.07377958, + "balance_loss_mlp": 1.02643049, + "epoch": 0.2335187133623929, + "flos": 58484229050880.0, + "grad_norm": 0.7981350583039054, + "language_loss": 0.59130645, + "learning_rate": 3.58100916965445e-06, + "loss": 0.61267424, + "num_input_tokens_seen": 83616160, + "router_z_loss_clip": 0.34521484, + "router_z_loss_mlp": 0.02026367, + "step": 3884, + "time_per_iteration": 3.3510189056396484 + }, + { + "auxiliary_loss_clip": 0.01155731, + "auxiliary_loss_mlp": 0.01037442, + "balance_loss_clip": 1.05960751, + "balance_loss_mlp": 1.02161074, + "epoch": 0.23357883661506088, + "flos": 24502317972480.0, + "grad_norm": 2.20418114554595, + "language_loss": 0.8028909, + "learning_rate": 3.5807706114888455e-06, + "loss": 0.82482266, + "num_input_tokens_seen": 83636795, + "router_z_loss_clip": 0.96191406, + "router_z_loss_mlp": 0.15820312, + "step": 3885, + "time_per_iteration": 2.5145864486694336 + }, + { + "auxiliary_loss_clip": 0.01162844, + "auxiliary_loss_mlp": 0.01038734, + "balance_loss_clip": 1.06670499, + "balance_loss_mlp": 1.02159142, + "epoch": 0.23363895986772884, + "flos": 18948516894720.0, + "grad_norm": 2.127113209267429, + "language_loss": 0.88169938, + "learning_rate": 3.580531993380261e-06, + "loss": 0.90371513, + "num_input_tokens_seen": 83654050, + "router_z_loss_clip": 0.96142578, + "router_z_loss_mlp": 0.17138672, + "step": 3886, + "time_per_iteration": 2.5064890384674072 + }, + { + "auxiliary_loss_clip": 0.0116863, + "auxiliary_loss_mlp": 0.01039281, + "balance_loss_clip": 1.07052922, + "balance_loss_mlp": 1.02272296, + "epoch": 0.2336990831203968, + "flos": 31686821619840.0, + "grad_norm": 1.8756329700275056, + "language_loss": 0.73467183, + "learning_rate": 3.5802933153377445e-06, + "loss": 0.75675094, + "num_input_tokens_seen": 83673720, + "router_z_loss_clip": 0.98144531, + "router_z_loss_mlp": 0.16552734, + "step": 3887, + "time_per_iteration": 3.962700605392456 + }, + { + "auxiliary_loss_clip": 0.01164444, + "auxiliary_loss_mlp": 0.01043154, + "balance_loss_clip": 1.0651052, + "balance_loss_mlp": 1.02627397, + "epoch": 0.23375920637306477, + "flos": 27709750926720.0, + "grad_norm": 1.8181963574286566, + "language_loss": 0.83971608, + "learning_rate": 3.5800545773703475e-06, + "loss": 0.86179209, + "num_input_tokens_seen": 83693470, + "router_z_loss_clip": 0.99365234, + "router_z_loss_mlp": 0.16882324, + "step": 3888, + "time_per_iteration": 2.5970616340637207 + }, + { + "auxiliary_loss_clip": 0.01158448, + "auxiliary_loss_mlp": 0.01058764, + "balance_loss_clip": 1.0588913, + "balance_loss_mlp": 1.04100168, + "epoch": 0.23381932962573276, + "flos": 17675627656320.0, + "grad_norm": 4.632902663661644, + "language_loss": 0.87741721, + "learning_rate": 3.5798157794871225e-06, + "loss": 0.8995893, + "num_input_tokens_seen": 83711620, + "router_z_loss_clip": 0.99511719, + "router_z_loss_mlp": 0.1776123, + "step": 3889, + "time_per_iteration": 2.562196731567383 + }, + { + "auxiliary_loss_clip": 0.0115607, + "auxiliary_loss_mlp": 0.01040673, + "balance_loss_clip": 1.05848384, + "balance_loss_mlp": 1.02405477, + "epoch": 0.23387945287840073, + "flos": 14390842763520.0, + "grad_norm": 2.4241774529881197, + "language_loss": 0.76843899, + "learning_rate": 3.579576921697125e-06, + "loss": 0.79040641, + "num_input_tokens_seen": 83727890, + "router_z_loss_clip": 0.97753906, + "router_z_loss_mlp": 0.1661377, + "step": 3890, + "time_per_iteration": 2.458508014678955 + }, + { + "auxiliary_loss_clip": 0.01160861, + "auxiliary_loss_mlp": 0.01047483, + "balance_loss_clip": 1.0618732, + "balance_loss_mlp": 1.03009057, + "epoch": 0.2339395761310687, + "flos": 46097988503040.0, + "grad_norm": 1.8676374855963789, + "language_loss": 0.72966731, + "learning_rate": 3.579338004009412e-06, + "loss": 0.75175071, + "num_input_tokens_seen": 83749370, + "router_z_loss_clip": 0.98925781, + "router_z_loss_mlp": 0.1739502, + "step": 3891, + "time_per_iteration": 2.6859617233276367 + }, + { + "auxiliary_loss_clip": 0.01161082, + "auxiliary_loss_mlp": 0.01041329, + "balance_loss_clip": 1.0665158, + "balance_loss_mlp": 1.02449703, + "epoch": 0.23399969938373666, + "flos": 22382044007040.0, + "grad_norm": 1.8326866691623913, + "language_loss": 0.82753408, + "learning_rate": 3.5790990264330433e-06, + "loss": 0.84955817, + "num_input_tokens_seen": 83769560, + "router_z_loss_clip": 0.94628906, + "router_z_loss_mlp": 0.16809082, + "step": 3892, + "time_per_iteration": 2.4823195934295654 + }, + { + "auxiliary_loss_clip": 0.01161089, + "auxiliary_loss_mlp": 0.01042903, + "balance_loss_clip": 1.06178975, + "balance_loss_mlp": 1.02546322, + "epoch": 0.23405982263640462, + "flos": 43508542066560.0, + "grad_norm": 1.5245969904531744, + "language_loss": 0.65107238, + "learning_rate": 3.578859988977082e-06, + "loss": 0.67311239, + "num_input_tokens_seen": 83795635, + "router_z_loss_clip": 0.99267578, + "router_z_loss_mlp": 0.17443848, + "step": 3893, + "time_per_iteration": 2.7553577423095703 + }, + { + "auxiliary_loss_clip": 0.01171334, + "auxiliary_loss_mlp": 0.0103865, + "balance_loss_clip": 1.07392526, + "balance_loss_mlp": 1.02107882, + "epoch": 0.2341199458890726, + "flos": 22564685687040.0, + "grad_norm": 1.8519812554557307, + "language_loss": 0.79046005, + "learning_rate": 3.5786208916505916e-06, + "loss": 0.81255984, + "num_input_tokens_seen": 83814090, + "router_z_loss_clip": 0.97363281, + "router_z_loss_mlp": 0.17565918, + "step": 3894, + "time_per_iteration": 2.516016960144043 + }, + { + "auxiliary_loss_clip": 0.01153954, + "auxiliary_loss_mlp": 0.01038933, + "balance_loss_clip": 1.05977774, + "balance_loss_mlp": 1.02314973, + "epoch": 0.23418006914174055, + "flos": 25633970933760.0, + "grad_norm": 1.9478930875260143, + "language_loss": 0.82282394, + "learning_rate": 3.5783817344626383e-06, + "loss": 0.84475279, + "num_input_tokens_seen": 83836870, + "router_z_loss_clip": 0.94140625, + "router_z_loss_mlp": 0.15771484, + "step": 3895, + "time_per_iteration": 2.514538288116455 + }, + { + "auxiliary_loss_clip": 0.0115607, + "auxiliary_loss_mlp": 0.01043952, + "balance_loss_clip": 1.0600518, + "balance_loss_mlp": 1.02683306, + "epoch": 0.23424019239440855, + "flos": 13545936074880.0, + "grad_norm": 1.902345713419344, + "language_loss": 0.8027432, + "learning_rate": 3.578142517422292e-06, + "loss": 0.82474339, + "num_input_tokens_seen": 83853275, + "router_z_loss_clip": 0.96142578, + "router_z_loss_mlp": 0.17102051, + "step": 3896, + "time_per_iteration": 3.8810372352600098 + }, + { + "auxiliary_loss_clip": 0.01162373, + "auxiliary_loss_mlp": 0.01041093, + "balance_loss_clip": 1.06261837, + "balance_loss_mlp": 1.02269936, + "epoch": 0.2343003156470765, + "flos": 22419498913920.0, + "grad_norm": 1.5789578718758777, + "language_loss": 0.83146131, + "learning_rate": 3.577903240538623e-06, + "loss": 0.85349596, + "num_input_tokens_seen": 83872340, + "router_z_loss_clip": 0.99804688, + "router_z_loss_mlp": 0.18383789, + "step": 3897, + "time_per_iteration": 2.4867801666259766 + }, + { + "auxiliary_loss_clip": 0.01158404, + "auxiliary_loss_mlp": 0.010462, + "balance_loss_clip": 1.05830479, + "balance_loss_mlp": 1.02843738, + "epoch": 0.23436043889974448, + "flos": 14790815683200.0, + "grad_norm": 1.810313192307975, + "language_loss": 0.79230791, + "learning_rate": 3.577663903820705e-06, + "loss": 0.81435394, + "num_input_tokens_seen": 83888795, + "router_z_loss_clip": 1.00097656, + "router_z_loss_mlp": 0.1776123, + "step": 3898, + "time_per_iteration": 3.8654985427856445 + }, + { + "auxiliary_loss_clip": 0.0115576, + "auxiliary_loss_mlp": 0.01047759, + "balance_loss_clip": 1.06125534, + "balance_loss_mlp": 1.03054571, + "epoch": 0.23442056215241244, + "flos": 22965700101120.0, + "grad_norm": 2.10967797523088, + "language_loss": 0.74171424, + "learning_rate": 3.577424507277614e-06, + "loss": 0.76374948, + "num_input_tokens_seen": 83906820, + "router_z_loss_clip": 0.94482422, + "router_z_loss_mlp": 0.17199707, + "step": 3899, + "time_per_iteration": 2.459822177886963 + }, + { + "auxiliary_loss_clip": 0.01155882, + "auxiliary_loss_mlp": 0.01049084, + "balance_loss_clip": 1.05923343, + "balance_loss_mlp": 1.03117919, + "epoch": 0.2344806854050804, + "flos": 23071887682560.0, + "grad_norm": 1.6532665834315365, + "language_loss": 0.75346875, + "learning_rate": 3.5771850509184277e-06, + "loss": 0.77551842, + "num_input_tokens_seen": 83926370, + "router_z_loss_clip": 0.96630859, + "router_z_loss_mlp": 0.17919922, + "step": 3900, + "time_per_iteration": 3.894078493118286 + }, + { + "auxiliary_loss_clip": 0.01157909, + "auxiliary_loss_mlp": 0.01053716, + "balance_loss_clip": 1.06249261, + "balance_loss_mlp": 1.03485703, + "epoch": 0.23454080865774837, + "flos": 16327074418560.0, + "grad_norm": 2.509604137688346, + "language_loss": 0.66819489, + "learning_rate": 3.5769455347522256e-06, + "loss": 0.69031119, + "num_input_tokens_seen": 83944600, + "router_z_loss_clip": 0.95507812, + "router_z_loss_mlp": 0.18847656, + "step": 3901, + "time_per_iteration": 2.4212160110473633 + }, + { + "auxiliary_loss_clip": 0.01086353, + "auxiliary_loss_mlp": 0.01017497, + "balance_loss_clip": 1.05152285, + "balance_loss_mlp": 1.01503277, + "epoch": 0.23460093191041637, + "flos": 67760958142080.0, + "grad_norm": 0.763718520421567, + "language_loss": 0.58226085, + "learning_rate": 3.576705958788091e-06, + "loss": 0.60329938, + "num_input_tokens_seen": 84005100, + "router_z_loss_clip": 0.34863281, + "router_z_loss_mlp": 0.02462769, + "step": 3902, + "time_per_iteration": 3.036947250366211 + }, + { + "auxiliary_loss_clip": 0.01160553, + "auxiliary_loss_mlp": 0.01048761, + "balance_loss_clip": 1.06250596, + "balance_loss_mlp": 1.03112984, + "epoch": 0.23466105516308433, + "flos": 20077619990400.0, + "grad_norm": 1.8857045572186144, + "language_loss": 0.8020817, + "learning_rate": 3.576466323035108e-06, + "loss": 0.82417482, + "num_input_tokens_seen": 84023775, + "router_z_loss_clip": 0.98046875, + "router_z_loss_mlp": 0.1762085, + "step": 3903, + "time_per_iteration": 2.4654438495635986 + }, + { + "auxiliary_loss_clip": 0.01156825, + "auxiliary_loss_mlp": 0.0104112, + "balance_loss_clip": 1.05709362, + "balance_loss_mlp": 1.02364373, + "epoch": 0.2347211784157523, + "flos": 24535714642560.0, + "grad_norm": 2.066869849306528, + "language_loss": 0.81957674, + "learning_rate": 3.5762266275023645e-06, + "loss": 0.84155613, + "num_input_tokens_seen": 84042605, + "router_z_loss_clip": 0.99755859, + "router_z_loss_mlp": 0.17480469, + "step": 3904, + "time_per_iteration": 2.535811185836792 + }, + { + "auxiliary_loss_clip": 0.01153266, + "auxiliary_loss_mlp": 0.010617, + "balance_loss_clip": 1.05631304, + "balance_loss_mlp": 1.04236412, + "epoch": 0.23478130166842026, + "flos": 23805040181760.0, + "grad_norm": 1.9402127876367528, + "language_loss": 0.71850908, + "learning_rate": 3.57598687219895e-06, + "loss": 0.7406587, + "num_input_tokens_seen": 84061520, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.19335938, + "step": 3905, + "time_per_iteration": 2.5621225833892822 + }, + { + "auxiliary_loss_clip": 0.01160014, + "auxiliary_loss_mlp": 0.01037979, + "balance_loss_clip": 1.0631578, + "balance_loss_mlp": 1.02139735, + "epoch": 0.23484142492108823, + "flos": 24093618048000.0, + "grad_norm": 1.7098676227347545, + "language_loss": 0.71053314, + "learning_rate": 3.5757470571339543e-06, + "loss": 0.73251307, + "num_input_tokens_seen": 84081800, + "router_z_loss_clip": 0.96826172, + "router_z_loss_mlp": 0.16589355, + "step": 3906, + "time_per_iteration": 2.5181057453155518 + }, + { + "auxiliary_loss_clip": 0.01156767, + "auxiliary_loss_mlp": 0.01036944, + "balance_loss_clip": 1.0541414, + "balance_loss_mlp": 1.01792979, + "epoch": 0.2349015481737562, + "flos": 29095830898560.0, + "grad_norm": 1.9618876685643467, + "language_loss": 0.73505574, + "learning_rate": 3.575507182316473e-06, + "loss": 0.75699288, + "num_input_tokens_seen": 84102340, + "router_z_loss_clip": 1.02539062, + "router_z_loss_mlp": 0.19006348, + "step": 3907, + "time_per_iteration": 2.5538556575775146 + }, + { + "auxiliary_loss_clip": 0.01155587, + "auxiliary_loss_mlp": 0.01048921, + "balance_loss_clip": 1.05834699, + "balance_loss_mlp": 1.03133762, + "epoch": 0.23496167142642416, + "flos": 18916305373440.0, + "grad_norm": 1.674911766076113, + "language_loss": 0.72805196, + "learning_rate": 3.575267247755601e-06, + "loss": 0.75009704, + "num_input_tokens_seen": 84120370, + "router_z_loss_clip": 0.97167969, + "router_z_loss_mlp": 0.17565918, + "step": 3908, + "time_per_iteration": 2.575852155685425 + }, + { + "auxiliary_loss_clip": 0.01089812, + "auxiliary_loss_mlp": 0.01006351, + "balance_loss_clip": 1.05398941, + "balance_loss_mlp": 1.00415194, + "epoch": 0.23502179467909215, + "flos": 55868062896000.0, + "grad_norm": 1.0284873133180097, + "language_loss": 0.73383623, + "learning_rate": 3.5750272534604367e-06, + "loss": 0.75479788, + "num_input_tokens_seen": 84165515, + "router_z_loss_clip": 0.35791016, + "router_z_loss_mlp": 0.02200317, + "step": 3909, + "time_per_iteration": 2.8399269580841064 + }, + { + "auxiliary_loss_clip": 0.01160284, + "auxiliary_loss_mlp": 0.01041676, + "balance_loss_clip": 1.0637517, + "balance_loss_mlp": 1.02515399, + "epoch": 0.23508191793176011, + "flos": 23401763210880.0, + "grad_norm": 1.5935073391500667, + "language_loss": 0.87990683, + "learning_rate": 3.5747871994400822e-06, + "loss": 0.9019264, + "num_input_tokens_seen": 84184540, + "router_z_loss_clip": 0.96484375, + "router_z_loss_mlp": 0.16516113, + "step": 3910, + "time_per_iteration": 2.5212581157684326 + }, + { + "auxiliary_loss_clip": 0.01166306, + "auxiliary_loss_mlp": 0.01038768, + "balance_loss_clip": 1.06644154, + "balance_loss_mlp": 1.02176833, + "epoch": 0.23514204118442808, + "flos": 20047671025920.0, + "grad_norm": 2.681335925943631, + "language_loss": 0.76782954, + "learning_rate": 3.5745470857036386e-06, + "loss": 0.78988028, + "num_input_tokens_seen": 84202025, + "router_z_loss_clip": 0.99951172, + "router_z_loss_mlp": 0.16992188, + "step": 3911, + "time_per_iteration": 2.5781664848327637 + }, + { + "auxiliary_loss_clip": 0.01157138, + "auxiliary_loss_mlp": 0.01042056, + "balance_loss_clip": 1.06383383, + "balance_loss_mlp": 1.02656507, + "epoch": 0.23520216443709605, + "flos": 21580589796480.0, + "grad_norm": 1.5408558452708498, + "language_loss": 0.81684709, + "learning_rate": 3.5743069122602122e-06, + "loss": 0.83883905, + "num_input_tokens_seen": 84221895, + "router_z_loss_clip": 0.93212891, + "router_z_loss_mlp": 0.15496826, + "step": 3912, + "time_per_iteration": 2.489284038543701 + }, + { + "auxiliary_loss_clip": 0.01151515, + "auxiliary_loss_mlp": 0.0105797, + "balance_loss_clip": 1.05645132, + "balance_loss_mlp": 1.03976715, + "epoch": 0.235262287689764, + "flos": 23185796688000.0, + "grad_norm": 1.8851167139303984, + "language_loss": 0.71282542, + "learning_rate": 3.574066679118909e-06, + "loss": 0.73492026, + "num_input_tokens_seen": 84240455, + "router_z_loss_clip": 0.95166016, + "router_z_loss_mlp": 0.18212891, + "step": 3913, + "time_per_iteration": 2.4981191158294678 + }, + { + "auxiliary_loss_clip": 0.01166197, + "auxiliary_loss_mlp": 0.01054391, + "balance_loss_clip": 1.06279635, + "balance_loss_mlp": 1.03464961, + "epoch": 0.23532241094243198, + "flos": 23185222070400.0, + "grad_norm": 1.6999706708676583, + "language_loss": 0.76516616, + "learning_rate": 3.57382638628884e-06, + "loss": 0.78737199, + "num_input_tokens_seen": 84261605, + "router_z_loss_clip": 1.03369141, + "router_z_loss_mlp": 0.1973877, + "step": 3914, + "time_per_iteration": 2.521068572998047 + }, + { + "auxiliary_loss_clip": 0.01156314, + "auxiliary_loss_mlp": 0.01039068, + "balance_loss_clip": 1.06027627, + "balance_loss_mlp": 1.02168703, + "epoch": 0.23538253419509997, + "flos": 17019324305280.0, + "grad_norm": 2.382251359434108, + "language_loss": 0.89934278, + "learning_rate": 3.5735860337791174e-06, + "loss": 0.9212966, + "num_input_tokens_seen": 84278675, + "router_z_loss_clip": 0.95996094, + "router_z_loss_mlp": 0.17382812, + "step": 3915, + "time_per_iteration": 2.54433012008667 + }, + { + "auxiliary_loss_clip": 0.01113817, + "auxiliary_loss_mlp": 0.01002562, + "balance_loss_clip": 1.07929277, + "balance_loss_mlp": 1.00039256, + "epoch": 0.23544265744776793, + "flos": 63448588967040.0, + "grad_norm": 0.8110767363182411, + "language_loss": 0.59407973, + "learning_rate": 3.573345621598854e-06, + "loss": 0.61524349, + "num_input_tokens_seen": 84329765, + "router_z_loss_clip": 0.34521484, + "router_z_loss_mlp": 0.021698, + "step": 3916, + "time_per_iteration": 3.0234928131103516 + }, + { + "auxiliary_loss_clip": 0.0110507, + "auxiliary_loss_mlp": 0.01009191, + "balance_loss_clip": 1.07044172, + "balance_loss_mlp": 1.00705457, + "epoch": 0.2355027807004359, + "flos": 70515343831680.0, + "grad_norm": 0.8523591834089571, + "language_loss": 0.49490187, + "learning_rate": 3.5731051497571675e-06, + "loss": 0.5160445, + "num_input_tokens_seen": 84393680, + "router_z_loss_clip": 0.34619141, + "router_z_loss_mlp": 0.02133179, + "step": 3917, + "time_per_iteration": 3.1107470989227295 + }, + { + "auxiliary_loss_clip": 0.01160022, + "auxiliary_loss_mlp": 0.01058156, + "balance_loss_clip": 1.05893588, + "balance_loss_mlp": 1.04045355, + "epoch": 0.23556290395310386, + "flos": 21434289701760.0, + "grad_norm": 2.0994877345610123, + "language_loss": 0.76573992, + "learning_rate": 3.5728646182631756e-06, + "loss": 0.78792173, + "num_input_tokens_seen": 84412640, + "router_z_loss_clip": 1.01123047, + "router_z_loss_mlp": 0.17687988, + "step": 3918, + "time_per_iteration": 2.4814038276672363 + }, + { + "auxiliary_loss_clip": 0.01160889, + "auxiliary_loss_mlp": 0.01044314, + "balance_loss_clip": 1.05966806, + "balance_loss_mlp": 1.02767277, + "epoch": 0.23562302720577183, + "flos": 18186421011840.0, + "grad_norm": 6.865626834340814, + "language_loss": 0.69171762, + "learning_rate": 3.5726240271259995e-06, + "loss": 0.71376967, + "num_input_tokens_seen": 84431605, + "router_z_loss_clip": 1.01171875, + "router_z_loss_mlp": 0.1663208, + "step": 3919, + "time_per_iteration": 2.4652388095855713 + }, + { + "auxiliary_loss_clip": 0.01162912, + "auxiliary_loss_mlp": 0.01036782, + "balance_loss_clip": 1.06640077, + "balance_loss_mlp": 1.01999688, + "epoch": 0.2356831504584398, + "flos": 33730497832320.0, + "grad_norm": 2.2131912291672458, + "language_loss": 0.70200276, + "learning_rate": 3.5723833763547634e-06, + "loss": 0.72399974, + "num_input_tokens_seen": 84454210, + "router_z_loss_clip": 0.96484375, + "router_z_loss_mlp": 0.16796875, + "step": 3920, + "time_per_iteration": 2.6849474906921387 + }, + { + "auxiliary_loss_clip": 0.01162053, + "auxiliary_loss_mlp": 0.01046984, + "balance_loss_clip": 1.0661391, + "balance_loss_mlp": 1.03077185, + "epoch": 0.23574327371110776, + "flos": 24932778560640.0, + "grad_norm": 2.028580246053085, + "language_loss": 0.77467984, + "learning_rate": 3.5721426659585916e-06, + "loss": 0.79677022, + "num_input_tokens_seen": 84475540, + "router_z_loss_clip": 0.95800781, + "router_z_loss_mlp": 0.16210938, + "step": 3921, + "time_per_iteration": 2.487290143966675 + }, + { + "auxiliary_loss_clip": 0.01158646, + "auxiliary_loss_mlp": 0.01044327, + "balance_loss_clip": 1.06168175, + "balance_loss_mlp": 1.02726793, + "epoch": 0.23580339696377575, + "flos": 17822107319040.0, + "grad_norm": 2.734312648102551, + "language_loss": 0.75150764, + "learning_rate": 3.571901895946612e-06, + "loss": 0.77353728, + "num_input_tokens_seen": 84494580, + "router_z_loss_clip": 0.97021484, + "router_z_loss_mlp": 0.17077637, + "step": 3922, + "time_per_iteration": 2.431321144104004 + }, + { + "auxiliary_loss_clip": 0.01170432, + "auxiliary_loss_mlp": 0.01042718, + "balance_loss_clip": 1.07222497, + "balance_loss_mlp": 1.0272212, + "epoch": 0.23586352021644372, + "flos": 26286611097600.0, + "grad_norm": 1.925211500547173, + "language_loss": 0.80223179, + "learning_rate": 3.571661066327956e-06, + "loss": 0.82436323, + "num_input_tokens_seen": 84513850, + "router_z_loss_clip": 0.98388672, + "router_z_loss_mlp": 0.1550293, + "step": 3923, + "time_per_iteration": 2.5063235759735107 + }, + { + "auxiliary_loss_clip": 0.01155353, + "auxiliary_loss_mlp": 0.01052371, + "balance_loss_clip": 1.05742383, + "balance_loss_mlp": 1.03515708, + "epoch": 0.23592364346911168, + "flos": 14246697484800.0, + "grad_norm": 1.8090789047307656, + "language_loss": 0.74464881, + "learning_rate": 3.571420177111754e-06, + "loss": 0.76672614, + "num_input_tokens_seen": 84532315, + "router_z_loss_clip": 0.97851562, + "router_z_loss_mlp": 0.17199707, + "step": 3924, + "time_per_iteration": 2.4305946826934814 + }, + { + "auxiliary_loss_clip": 0.011663, + "auxiliary_loss_mlp": 0.01048935, + "balance_loss_clip": 1.06602788, + "balance_loss_mlp": 1.03281832, + "epoch": 0.23598376672177965, + "flos": 18587938216320.0, + "grad_norm": 1.5575351547104526, + "language_loss": 0.83029771, + "learning_rate": 3.5711792283071416e-06, + "loss": 0.85245013, + "num_input_tokens_seen": 84550970, + "router_z_loss_clip": 1.00195312, + "router_z_loss_mlp": 0.16113281, + "step": 3925, + "time_per_iteration": 2.437426805496216 + }, + { + "auxiliary_loss_clip": 0.0115919, + "auxiliary_loss_mlp": 0.01048798, + "balance_loss_clip": 1.06128454, + "balance_loss_mlp": 1.03182316, + "epoch": 0.2360438899744476, + "flos": 22675542036480.0, + "grad_norm": 1.9044935101195763, + "language_loss": 0.59620172, + "learning_rate": 3.5709382199232564e-06, + "loss": 0.6182816, + "num_input_tokens_seen": 84571655, + "router_z_loss_clip": 0.97851562, + "router_z_loss_mlp": 0.1697998, + "step": 3926, + "time_per_iteration": 2.4547042846679688 + }, + { + "auxiliary_loss_clip": 0.01153831, + "auxiliary_loss_mlp": 0.01041287, + "balance_loss_clip": 1.06035173, + "balance_loss_mlp": 1.02548623, + "epoch": 0.23610401322711558, + "flos": 29570139014400.0, + "grad_norm": 1.7760645159789825, + "language_loss": 0.71441495, + "learning_rate": 3.570697151969235e-06, + "loss": 0.73636603, + "num_input_tokens_seen": 84593130, + "router_z_loss_clip": 0.93457031, + "router_z_loss_mlp": 0.15814209, + "step": 3927, + "time_per_iteration": 2.5312085151672363 + }, + { + "auxiliary_loss_clip": 0.01156754, + "auxiliary_loss_mlp": 0.01046293, + "balance_loss_clip": 1.06054008, + "balance_loss_mlp": 1.03095102, + "epoch": 0.23616413647978354, + "flos": 17858520731520.0, + "grad_norm": 1.8673600559373955, + "language_loss": 0.75207478, + "learning_rate": 3.570456024454221e-06, + "loss": 0.77410531, + "num_input_tokens_seen": 84612410, + "router_z_loss_clip": 0.96142578, + "router_z_loss_mlp": 0.15325928, + "step": 3928, + "time_per_iteration": 2.4947714805603027 + }, + { + "auxiliary_loss_clip": 0.01156384, + "auxiliary_loss_mlp": 0.01045844, + "balance_loss_clip": 1.05888534, + "balance_loss_mlp": 1.02823639, + "epoch": 0.23622425973245154, + "flos": 11034847157760.0, + "grad_norm": 2.9390476091157085, + "language_loss": 0.8163631, + "learning_rate": 3.5702148373873576e-06, + "loss": 0.83838534, + "num_input_tokens_seen": 84627610, + "router_z_loss_clip": 0.97558594, + "router_z_loss_mlp": 0.17614746, + "step": 3929, + "time_per_iteration": 2.463343620300293 + }, + { + "auxiliary_loss_clip": 0.01169171, + "auxiliary_loss_mlp": 0.01052168, + "balance_loss_clip": 1.06432319, + "balance_loss_mlp": 1.03320217, + "epoch": 0.2362843829851195, + "flos": 23404061681280.0, + "grad_norm": 2.031512963017421, + "language_loss": 0.72086418, + "learning_rate": 3.569973590777789e-06, + "loss": 0.74307752, + "num_input_tokens_seen": 84648415, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.18969727, + "step": 3930, + "time_per_iteration": 2.488124370574951 + }, + { + "auxiliary_loss_clip": 0.0116066, + "auxiliary_loss_mlp": 0.01047359, + "balance_loss_clip": 1.05923605, + "balance_loss_mlp": 1.02904892, + "epoch": 0.23634450623778747, + "flos": 39529855261440.0, + "grad_norm": 1.811423863292384, + "language_loss": 0.74228692, + "learning_rate": 3.569732284634665e-06, + "loss": 0.7643671, + "num_input_tokens_seen": 84670080, + "router_z_loss_clip": 1.01318359, + "router_z_loss_mlp": 0.1829834, + "step": 3931, + "time_per_iteration": 4.147634744644165 + }, + { + "auxiliary_loss_clip": 0.01163966, + "auxiliary_loss_mlp": 0.01050981, + "balance_loss_clip": 1.06391704, + "balance_loss_mlp": 1.03230071, + "epoch": 0.23640462949045543, + "flos": 24207167917440.0, + "grad_norm": 2.2206461231091277, + "language_loss": 0.80900705, + "learning_rate": 3.569490918967136e-06, + "loss": 0.83115655, + "num_input_tokens_seen": 84686465, + "router_z_loss_clip": 1.00097656, + "router_z_loss_mlp": 0.18676758, + "step": 3932, + "time_per_iteration": 2.519989252090454 + }, + { + "auxiliary_loss_clip": 0.01171323, + "auxiliary_loss_mlp": 0.01035838, + "balance_loss_clip": 1.0742861, + "balance_loss_mlp": 1.02111626, + "epoch": 0.2364647527431234, + "flos": 26177622255360.0, + "grad_norm": 1.5005072360612473, + "language_loss": 0.85898179, + "learning_rate": 3.5692494937843537e-06, + "loss": 0.88105345, + "num_input_tokens_seen": 84708825, + "router_z_loss_clip": 0.97021484, + "router_z_loss_mlp": 0.1472168, + "step": 3933, + "time_per_iteration": 2.5979981422424316 + }, + { + "auxiliary_loss_clip": 0.01170082, + "auxiliary_loss_mlp": 0.01039371, + "balance_loss_clip": 1.07103682, + "balance_loss_mlp": 1.02163315, + "epoch": 0.23652487599579136, + "flos": 22637009721600.0, + "grad_norm": 2.141708985120048, + "language_loss": 0.83211911, + "learning_rate": 3.5690080090954727e-06, + "loss": 0.85421365, + "num_input_tokens_seen": 84726165, + "router_z_loss_clip": 0.98925781, + "router_z_loss_mlp": 0.17724609, + "step": 3934, + "time_per_iteration": 2.512845516204834 + }, + { + "auxiliary_loss_clip": 0.01166682, + "auxiliary_loss_mlp": 0.01044619, + "balance_loss_clip": 1.06693375, + "balance_loss_mlp": 1.0273453, + "epoch": 0.23658499924845935, + "flos": 21762261809280.0, + "grad_norm": 1.6443267805439714, + "language_loss": 0.78554118, + "learning_rate": 3.5687664649096515e-06, + "loss": 0.8076542, + "num_input_tokens_seen": 84745815, + "router_z_loss_clip": 0.99560547, + "router_z_loss_mlp": 0.17272949, + "step": 3935, + "time_per_iteration": 2.4615073204040527 + }, + { + "auxiliary_loss_clip": 0.01160989, + "auxiliary_loss_mlp": 0.01046921, + "balance_loss_clip": 1.06295633, + "balance_loss_mlp": 1.02963579, + "epoch": 0.23664512250112732, + "flos": 21798998444160.0, + "grad_norm": 1.7509323512840957, + "language_loss": 0.79510748, + "learning_rate": 3.5685248612360487e-06, + "loss": 0.81718653, + "num_input_tokens_seen": 84765415, + "router_z_loss_clip": 0.97949219, + "router_z_loss_mlp": 0.17285156, + "step": 3936, + "time_per_iteration": 2.4903812408447266 + }, + { + "auxiliary_loss_clip": 0.01161392, + "auxiliary_loss_mlp": 0.01039643, + "balance_loss_clip": 1.0617317, + "balance_loss_mlp": 1.02227437, + "epoch": 0.23670524575379528, + "flos": 22637871648000.0, + "grad_norm": 1.728545523392625, + "language_loss": 0.79335642, + "learning_rate": 3.568283198083826e-06, + "loss": 0.8153668, + "num_input_tokens_seen": 84787080, + "router_z_loss_clip": 0.99609375, + "router_z_loss_mlp": 0.17382812, + "step": 3937, + "time_per_iteration": 2.517737865447998 + }, + { + "auxiliary_loss_clip": 0.01157668, + "auxiliary_loss_mlp": 0.0103979, + "balance_loss_clip": 1.06416106, + "balance_loss_mlp": 1.02423334, + "epoch": 0.23676536900646325, + "flos": 16725000263040.0, + "grad_norm": 1.8876608106721637, + "language_loss": 0.85033357, + "learning_rate": 3.568041475462147e-06, + "loss": 0.87230814, + "num_input_tokens_seen": 84805395, + "router_z_loss_clip": 0.93505859, + "router_z_loss_mlp": 0.15533447, + "step": 3938, + "time_per_iteration": 2.492176055908203 + }, + { + "auxiliary_loss_clip": 0.0116076, + "auxiliary_loss_mlp": 0.01047598, + "balance_loss_clip": 1.06621194, + "balance_loss_mlp": 1.03078914, + "epoch": 0.23682549225913122, + "flos": 11135611785600.0, + "grad_norm": 2.235020706961057, + "language_loss": 0.93664992, + "learning_rate": 3.5677996933801785e-06, + "loss": 0.95873344, + "num_input_tokens_seen": 84818090, + "router_z_loss_clip": 0.94433594, + "router_z_loss_mlp": 0.16796875, + "step": 3939, + "time_per_iteration": 2.3895041942596436 + }, + { + "auxiliary_loss_clip": 0.01162473, + "auxiliary_loss_mlp": 0.01049567, + "balance_loss_clip": 1.05927014, + "balance_loss_mlp": 1.0318172, + "epoch": 0.23688561551179918, + "flos": 22559226819840.0, + "grad_norm": 1.6282809648787053, + "language_loss": 0.82549655, + "learning_rate": 3.567557851847088e-06, + "loss": 0.84761697, + "num_input_tokens_seen": 84837695, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.17736816, + "step": 3940, + "time_per_iteration": 3.871469497680664 + }, + { + "auxiliary_loss_clip": 0.01163257, + "auxiliary_loss_mlp": 0.01041717, + "balance_loss_clip": 1.06200826, + "balance_loss_mlp": 1.02434826, + "epoch": 0.23694573876446715, + "flos": 18514895909760.0, + "grad_norm": 34.43155101148817, + "language_loss": 0.8960216, + "learning_rate": 3.5673159508720464e-06, + "loss": 0.91807139, + "num_input_tokens_seen": 84854630, + "router_z_loss_clip": 1.01269531, + "router_z_loss_mlp": 0.17370605, + "step": 3941, + "time_per_iteration": 3.8359251022338867 + }, + { + "auxiliary_loss_clip": 0.01156246, + "auxiliary_loss_mlp": 0.01041734, + "balance_loss_clip": 1.05762649, + "balance_loss_mlp": 1.02325618, + "epoch": 0.23700586201713514, + "flos": 15335723980800.0, + "grad_norm": 2.97548433548067, + "language_loss": 0.85022175, + "learning_rate": 3.5670739904642274e-06, + "loss": 0.87220144, + "num_input_tokens_seen": 84871805, + "router_z_loss_clip": 0.98583984, + "router_z_loss_mlp": 0.18481445, + "step": 3942, + "time_per_iteration": 2.450885534286499 + }, + { + "auxiliary_loss_clip": 0.01170479, + "auxiliary_loss_mlp": 0.01047847, + "balance_loss_clip": 1.07084072, + "balance_loss_mlp": 1.02926278, + "epoch": 0.2370659852698031, + "flos": 23947605262080.0, + "grad_norm": 2.159521619753979, + "language_loss": 0.80616939, + "learning_rate": 3.5668319706328065e-06, + "loss": 0.82835269, + "num_input_tokens_seen": 84889815, + "router_z_loss_clip": 0.99658203, + "router_z_loss_mlp": 0.18591309, + "step": 3943, + "time_per_iteration": 3.9300239086151123 + }, + { + "auxiliary_loss_clip": 0.01162801, + "auxiliary_loss_mlp": 0.01042809, + "balance_loss_clip": 1.06076837, + "balance_loss_mlp": 1.02433181, + "epoch": 0.23712610852247107, + "flos": 15332527670400.0, + "grad_norm": 3.4047540997640144, + "language_loss": 0.66898161, + "learning_rate": 3.566589891386959e-06, + "loss": 0.69103777, + "num_input_tokens_seen": 84904380, + "router_z_loss_clip": 1.02050781, + "router_z_loss_mlp": 0.18469238, + "step": 3944, + "time_per_iteration": 2.5764100551605225 + }, + { + "auxiliary_loss_clip": 0.01155762, + "auxiliary_loss_mlp": 0.010428, + "balance_loss_clip": 1.05656409, + "balance_loss_mlp": 1.02433479, + "epoch": 0.23718623177513903, + "flos": 19682567233920.0, + "grad_norm": 1.7750601318403239, + "language_loss": 0.75002313, + "learning_rate": 3.566347752735866e-06, + "loss": 0.77200878, + "num_input_tokens_seen": 84922935, + "router_z_loss_clip": 0.9921875, + "router_z_loss_mlp": 0.18469238, + "step": 3945, + "time_per_iteration": 2.476266384124756 + }, + { + "auxiliary_loss_clip": 0.01165318, + "auxiliary_loss_mlp": 0.01041938, + "balance_loss_clip": 1.06418204, + "balance_loss_mlp": 1.02485502, + "epoch": 0.237246355027807, + "flos": 24973322037120.0, + "grad_norm": 1.5373187477382761, + "language_loss": 0.638762, + "learning_rate": 3.5661055546887094e-06, + "loss": 0.66083449, + "num_input_tokens_seen": 84943685, + "router_z_loss_clip": 1.01025391, + "router_z_loss_mlp": 0.17089844, + "step": 3946, + "time_per_iteration": 2.56978440284729 + }, + { + "auxiliary_loss_clip": 0.01153048, + "auxiliary_loss_mlp": 0.01041927, + "balance_loss_clip": 1.05522215, + "balance_loss_mlp": 1.02330625, + "epoch": 0.23730647828047496, + "flos": 15377416692480.0, + "grad_norm": 2.2821726893743564, + "language_loss": 0.77871585, + "learning_rate": 3.5658632972546734e-06, + "loss": 0.80066562, + "num_input_tokens_seen": 84959505, + "router_z_loss_clip": 0.97753906, + "router_z_loss_mlp": 0.1862793, + "step": 3947, + "time_per_iteration": 2.4352951049804688 + }, + { + "auxiliary_loss_clip": 0.01154729, + "auxiliary_loss_mlp": 0.01045465, + "balance_loss_clip": 1.0565412, + "balance_loss_mlp": 1.02698708, + "epoch": 0.23736660153314296, + "flos": 28150662372480.0, + "grad_norm": 1.6759742979496681, + "language_loss": 0.80783689, + "learning_rate": 3.565620980442944e-06, + "loss": 0.82983881, + "num_input_tokens_seen": 84982130, + "router_z_loss_clip": 0.98046875, + "router_z_loss_mlp": 0.18481445, + "step": 3948, + "time_per_iteration": 2.6170170307159424 + }, + { + "auxiliary_loss_clip": 0.01159059, + "auxiliary_loss_mlp": 0.01044307, + "balance_loss_clip": 1.06039584, + "balance_loss_mlp": 1.02668762, + "epoch": 0.23742672478581092, + "flos": 22086570729600.0, + "grad_norm": 1.8094210709433682, + "language_loss": 0.80561566, + "learning_rate": 3.5653786042627107e-06, + "loss": 0.8276493, + "num_input_tokens_seen": 85000640, + "router_z_loss_clip": 0.98486328, + "router_z_loss_mlp": 0.1763916, + "step": 3949, + "time_per_iteration": 2.4797725677490234 + }, + { + "auxiliary_loss_clip": 0.01155012, + "auxiliary_loss_mlp": 0.01043765, + "balance_loss_clip": 1.05384636, + "balance_loss_mlp": 1.02546656, + "epoch": 0.2374868480384789, + "flos": 19537093152000.0, + "grad_norm": 1.5973632854691064, + "language_loss": 0.73186636, + "learning_rate": 3.565136168723163e-06, + "loss": 0.75385416, + "num_input_tokens_seen": 85018970, + "router_z_loss_clip": 1.01220703, + "router_z_loss_mlp": 0.18286133, + "step": 3950, + "time_per_iteration": 2.547656536102295 + }, + { + "auxiliary_loss_clip": 0.01165299, + "auxiliary_loss_mlp": 0.01038035, + "balance_loss_clip": 1.0675019, + "balance_loss_mlp": 1.02181113, + "epoch": 0.23754697129114685, + "flos": 19422501788160.0, + "grad_norm": 2.6861380565049906, + "language_loss": 0.73316216, + "learning_rate": 3.564893673833495e-06, + "loss": 0.75519556, + "num_input_tokens_seen": 85035905, + "router_z_loss_clip": 0.97802734, + "router_z_loss_mlp": 0.16223145, + "step": 3951, + "time_per_iteration": 2.5192208290100098 + }, + { + "auxiliary_loss_clip": 0.01167051, + "auxiliary_loss_mlp": 0.0103968, + "balance_loss_clip": 1.06423378, + "balance_loss_mlp": 1.02082121, + "epoch": 0.23760709454381482, + "flos": 19501002961920.0, + "grad_norm": 1.7924091534346103, + "language_loss": 0.73763818, + "learning_rate": 3.564651119602903e-06, + "loss": 0.75970548, + "num_input_tokens_seen": 85054560, + "router_z_loss_clip": 1.02929688, + "router_z_loss_mlp": 0.18847656, + "step": 3952, + "time_per_iteration": 2.4725027084350586 + }, + { + "auxiliary_loss_clip": 0.01158088, + "auxiliary_loss_mlp": 0.01048963, + "balance_loss_clip": 1.05659199, + "balance_loss_mlp": 1.03117657, + "epoch": 0.23766721779648278, + "flos": 27636600879360.0, + "grad_norm": 1.5623819439346995, + "language_loss": 0.70857036, + "learning_rate": 3.564408506040583e-06, + "loss": 0.73064089, + "num_input_tokens_seen": 85074425, + "router_z_loss_clip": 1.01367188, + "router_z_loss_mlp": 0.17785645, + "step": 3953, + "time_per_iteration": 2.499495267868042 + }, + { + "auxiliary_loss_clip": 0.01164105, + "auxiliary_loss_mlp": 0.01050778, + "balance_loss_clip": 1.06219172, + "balance_loss_mlp": 1.03152561, + "epoch": 0.23772734104915075, + "flos": 23404348990080.0, + "grad_norm": 1.9366121309160231, + "language_loss": 0.81542206, + "learning_rate": 3.5641658331557356e-06, + "loss": 0.83757085, + "num_input_tokens_seen": 85092865, + "router_z_loss_clip": 1.01953125, + "router_z_loss_mlp": 0.19262695, + "step": 3954, + "time_per_iteration": 2.497485637664795 + }, + { + "auxiliary_loss_clip": 0.01166959, + "auxiliary_loss_mlp": 0.01045434, + "balance_loss_clip": 1.06328201, + "balance_loss_mlp": 1.02610993, + "epoch": 0.23778746430181874, + "flos": 15705496540800.0, + "grad_norm": 30.55833347285654, + "language_loss": 0.65982747, + "learning_rate": 3.5639231009575634e-06, + "loss": 0.6819514, + "num_input_tokens_seen": 85110175, + "router_z_loss_clip": 1.03613281, + "router_z_loss_mlp": 0.1932373, + "step": 3955, + "time_per_iteration": 2.474949836730957 + }, + { + "auxiliary_loss_clip": 0.01166702, + "auxiliary_loss_mlp": 0.0105031, + "balance_loss_clip": 1.06871009, + "balance_loss_mlp": 1.03220236, + "epoch": 0.2378475875544867, + "flos": 19426452284160.0, + "grad_norm": 1.4026666643682775, + "language_loss": 0.84102833, + "learning_rate": 3.5636803094552704e-06, + "loss": 0.8631984, + "num_input_tokens_seen": 85129925, + "router_z_loss_clip": 0.97998047, + "router_z_loss_mlp": 0.18115234, + "step": 3956, + "time_per_iteration": 2.5646235942840576 + }, + { + "auxiliary_loss_clip": 0.01160298, + "auxiliary_loss_mlp": 0.01041052, + "balance_loss_clip": 1.06554556, + "balance_loss_mlp": 1.02394509, + "epoch": 0.23790771080715467, + "flos": 22268565964800.0, + "grad_norm": 7.552098347420211, + "language_loss": 0.84690976, + "learning_rate": 3.5634374586580635e-06, + "loss": 0.86892325, + "num_input_tokens_seen": 85147755, + "router_z_loss_clip": 0.94677734, + "router_z_loss_mlp": 0.17114258, + "step": 3957, + "time_per_iteration": 2.5761520862579346 + }, + { + "auxiliary_loss_clip": 0.01159444, + "auxiliary_loss_mlp": 0.0104255, + "balance_loss_clip": 1.06011093, + "balance_loss_mlp": 1.02645683, + "epoch": 0.23796783405982264, + "flos": 20047311889920.0, + "grad_norm": 2.1219604914050314, + "language_loss": 0.70573258, + "learning_rate": 3.563194548575151e-06, + "loss": 0.72775251, + "num_input_tokens_seen": 85165270, + "router_z_loss_clip": 0.99414062, + "router_z_loss_mlp": 0.16088867, + "step": 3958, + "time_per_iteration": 2.634958267211914 + }, + { + "auxiliary_loss_clip": 0.01157532, + "auxiliary_loss_mlp": 0.01040942, + "balance_loss_clip": 1.05898142, + "balance_loss_mlp": 1.02359068, + "epoch": 0.2380279573124906, + "flos": 14245943299200.0, + "grad_norm": 2.4907807666601287, + "language_loss": 0.6692729, + "learning_rate": 3.562951579215745e-06, + "loss": 0.6912576, + "num_input_tokens_seen": 85181555, + "router_z_loss_clip": 0.98486328, + "router_z_loss_mlp": 0.17352295, + "step": 3959, + "time_per_iteration": 2.4641294479370117 + }, + { + "auxiliary_loss_clip": 0.01162648, + "auxiliary_loss_mlp": 0.01042793, + "balance_loss_clip": 1.06357861, + "balance_loss_mlp": 1.02615142, + "epoch": 0.23808808056515857, + "flos": 21179180332800.0, + "grad_norm": 1.9381956395443678, + "language_loss": 0.72925138, + "learning_rate": 3.5627085505890586e-06, + "loss": 0.75130582, + "num_input_tokens_seen": 85199455, + "router_z_loss_clip": 0.99023438, + "router_z_loss_mlp": 0.16625977, + "step": 3960, + "time_per_iteration": 2.506117820739746 + }, + { + "auxiliary_loss_clip": 0.01163761, + "auxiliary_loss_mlp": 0.01044209, + "balance_loss_clip": 1.06349671, + "balance_loss_mlp": 1.02626824, + "epoch": 0.23814820381782653, + "flos": 22528308188160.0, + "grad_norm": 1.6033677019698795, + "language_loss": 0.74228746, + "learning_rate": 3.562465462704307e-06, + "loss": 0.7643671, + "num_input_tokens_seen": 85219170, + "router_z_loss_clip": 1.00244141, + "router_z_loss_mlp": 0.17956543, + "step": 3961, + "time_per_iteration": 2.490171194076538 + }, + { + "auxiliary_loss_clip": 0.01152531, + "auxiliary_loss_mlp": 0.01052987, + "balance_loss_clip": 1.05302382, + "balance_loss_mlp": 1.03387809, + "epoch": 0.23820832707049452, + "flos": 22304332932480.0, + "grad_norm": 2.555329356884934, + "language_loss": 0.66091204, + "learning_rate": 3.5622223155707085e-06, + "loss": 0.68296719, + "num_input_tokens_seen": 85238480, + "router_z_loss_clip": 0.99560547, + "router_z_loss_mlp": 0.19116211, + "step": 3962, + "time_per_iteration": 2.5076191425323486 + }, + { + "auxiliary_loss_clip": 0.01164467, + "auxiliary_loss_mlp": 0.01053303, + "balance_loss_clip": 1.06518888, + "balance_loss_mlp": 1.0355885, + "epoch": 0.2382684503231625, + "flos": 24864225454080.0, + "grad_norm": 1.674741828167429, + "language_loss": 0.74736339, + "learning_rate": 3.561979109197483e-06, + "loss": 0.76954114, + "num_input_tokens_seen": 85259180, + "router_z_loss_clip": 0.99169922, + "router_z_loss_mlp": 0.17712402, + "step": 3963, + "time_per_iteration": 2.506150007247925 + }, + { + "auxiliary_loss_clip": 0.01167104, + "auxiliary_loss_mlp": 0.01045453, + "balance_loss_clip": 1.06600904, + "balance_loss_mlp": 1.02781034, + "epoch": 0.23832857357583045, + "flos": 21871609787520.0, + "grad_norm": 2.181036710193518, + "language_loss": 0.77398628, + "learning_rate": 3.5617358435938538e-06, + "loss": 0.79611182, + "num_input_tokens_seen": 85278550, + "router_z_loss_clip": 1.01074219, + "router_z_loss_mlp": 0.17651367, + "step": 3964, + "time_per_iteration": 2.4794459342956543 + }, + { + "auxiliary_loss_clip": 0.01157691, + "auxiliary_loss_mlp": 0.01044834, + "balance_loss_clip": 1.06149232, + "balance_loss_mlp": 1.02752519, + "epoch": 0.23838869682849842, + "flos": 21288061434240.0, + "grad_norm": 23.157784348036383, + "language_loss": 0.70962524, + "learning_rate": 3.561492518769045e-06, + "loss": 0.73165047, + "num_input_tokens_seen": 85297345, + "router_z_loss_clip": 0.96191406, + "router_z_loss_mlp": 0.1730957, + "step": 3965, + "time_per_iteration": 2.5146353244781494 + }, + { + "auxiliary_loss_clip": 0.01150147, + "auxiliary_loss_mlp": 0.01048731, + "balance_loss_clip": 1.05519485, + "balance_loss_mlp": 1.03118396, + "epoch": 0.23844882008116638, + "flos": 16180594755840.0, + "grad_norm": 1.89008456483417, + "language_loss": 0.78329015, + "learning_rate": 3.561249134732282e-06, + "loss": 0.8052789, + "num_input_tokens_seen": 85315105, + "router_z_loss_clip": 0.95019531, + "router_z_loss_mlp": 0.17541504, + "step": 3966, + "time_per_iteration": 2.471378803253174 + }, + { + "auxiliary_loss_clip": 0.01153845, + "auxiliary_loss_mlp": 0.01043848, + "balance_loss_clip": 1.05900049, + "balance_loss_mlp": 1.02723062, + "epoch": 0.23850894333383435, + "flos": 21069724613760.0, + "grad_norm": 1.7194209881420073, + "language_loss": 0.68458056, + "learning_rate": 3.561005691492797e-06, + "loss": 0.70655751, + "num_input_tokens_seen": 85334735, + "router_z_loss_clip": 0.94873047, + "router_z_loss_mlp": 0.16625977, + "step": 3967, + "time_per_iteration": 2.539947032928467 + }, + { + "auxiliary_loss_clip": 0.01154131, + "auxiliary_loss_mlp": 0.01051559, + "balance_loss_clip": 1.05687261, + "balance_loss_mlp": 1.03411913, + "epoch": 0.23856906658650234, + "flos": 17201606849280.0, + "grad_norm": 1.7942432117123406, + "language_loss": 0.67702764, + "learning_rate": 3.5607621890598185e-06, + "loss": 0.69908452, + "num_input_tokens_seen": 85352875, + "router_z_loss_clip": 0.97167969, + "router_z_loss_mlp": 0.17431641, + "step": 3968, + "time_per_iteration": 2.4796290397644043 + }, + { + "auxiliary_loss_clip": 0.01158349, + "auxiliary_loss_mlp": 0.01043047, + "balance_loss_clip": 1.06096792, + "balance_loss_mlp": 1.02567863, + "epoch": 0.2386291898391703, + "flos": 29494223619840.0, + "grad_norm": 1.9864933201512853, + "language_loss": 0.7717247, + "learning_rate": 3.5605186274425823e-06, + "loss": 0.79373872, + "num_input_tokens_seen": 85372205, + "router_z_loss_clip": 0.97314453, + "router_z_loss_mlp": 0.17370605, + "step": 3969, + "time_per_iteration": 2.5281026363372803 + }, + { + "auxiliary_loss_clip": 0.01160263, + "auxiliary_loss_mlp": 0.01038908, + "balance_loss_clip": 1.06457663, + "balance_loss_mlp": 1.02184916, + "epoch": 0.23868931309183827, + "flos": 21142443697920.0, + "grad_norm": 1.9913379568532128, + "language_loss": 0.76144838, + "learning_rate": 3.5602750066503225e-06, + "loss": 0.78344005, + "num_input_tokens_seen": 85389705, + "router_z_loss_clip": 0.95703125, + "router_z_loss_mlp": 0.17053223, + "step": 3970, + "time_per_iteration": 2.465121030807495 + }, + { + "auxiliary_loss_clip": 0.0115916, + "auxiliary_loss_mlp": 0.01045728, + "balance_loss_clip": 1.06162417, + "balance_loss_mlp": 1.02754879, + "epoch": 0.23874943634450624, + "flos": 25659394784640.0, + "grad_norm": 1.96145010252458, + "language_loss": 0.85134852, + "learning_rate": 3.5600313266922793e-06, + "loss": 0.87339735, + "num_input_tokens_seen": 85407855, + "router_z_loss_clip": 0.97509766, + "router_z_loss_mlp": 0.18188477, + "step": 3971, + "time_per_iteration": 2.5998735427856445 + }, + { + "auxiliary_loss_clip": 0.01084317, + "auxiliary_loss_mlp": 0.01021994, + "balance_loss_clip": 1.0500356, + "balance_loss_mlp": 1.0194968, + "epoch": 0.2388095595971742, + "flos": 58986618624000.0, + "grad_norm": 0.753721661909446, + "language_loss": 0.62752545, + "learning_rate": 3.5597875875776915e-06, + "loss": 0.64858854, + "num_input_tokens_seen": 85470885, + "router_z_loss_clip": 0.34277344, + "router_z_loss_mlp": 0.0249939, + "step": 3972, + "time_per_iteration": 3.172828435897827 + }, + { + "auxiliary_loss_clip": 0.01158944, + "auxiliary_loss_mlp": 0.01043944, + "balance_loss_clip": 1.06164837, + "balance_loss_mlp": 1.02490592, + "epoch": 0.23886968284984217, + "flos": 16800341040000.0, + "grad_norm": 3.6046602253719326, + "language_loss": 0.81278014, + "learning_rate": 3.5595437893158013e-06, + "loss": 0.83480906, + "num_input_tokens_seen": 85488460, + "router_z_loss_clip": 0.97216797, + "router_z_loss_mlp": 0.19018555, + "step": 3973, + "time_per_iteration": 2.459292411804199 + }, + { + "auxiliary_loss_clip": 0.01155175, + "auxiliary_loss_mlp": 0.0104957, + "balance_loss_clip": 1.05952549, + "balance_loss_mlp": 1.03041291, + "epoch": 0.23892980610251013, + "flos": 22382654538240.0, + "grad_norm": 1.5465423269204266, + "language_loss": 0.7935009, + "learning_rate": 3.5592999319158546e-06, + "loss": 0.81554842, + "num_input_tokens_seen": 85508590, + "router_z_loss_clip": 0.95556641, + "router_z_loss_mlp": 0.19152832, + "step": 3974, + "time_per_iteration": 2.521951913833618 + }, + { + "auxiliary_loss_clip": 0.01159979, + "auxiliary_loss_mlp": 0.01058741, + "balance_loss_clip": 1.06082058, + "balance_loss_mlp": 1.03866601, + "epoch": 0.23898992935517813, + "flos": 12823198519680.0, + "grad_norm": 1.962435587625986, + "language_loss": 0.84842902, + "learning_rate": 3.5590560153870984e-06, + "loss": 0.8706162, + "num_input_tokens_seen": 85525970, + "router_z_loss_clip": 0.99267578, + "router_z_loss_mlp": 0.20068359, + "step": 3975, + "time_per_iteration": 3.916417360305786 + }, + { + "auxiliary_loss_clip": 0.01158359, + "auxiliary_loss_mlp": 0.01042886, + "balance_loss_clip": 1.06177545, + "balance_loss_mlp": 1.02500439, + "epoch": 0.2390500526078461, + "flos": 22345666508160.0, + "grad_norm": 1.988184802894768, + "language_loss": 0.83744532, + "learning_rate": 3.5588120397387816e-06, + "loss": 0.85945785, + "num_input_tokens_seen": 85543700, + "router_z_loss_clip": 0.96728516, + "router_z_loss_mlp": 0.17858887, + "step": 3976, + "time_per_iteration": 2.4734268188476562 + }, + { + "auxiliary_loss_clip": 0.01156206, + "auxiliary_loss_mlp": 0.01033248, + "balance_loss_clip": 1.06289208, + "balance_loss_mlp": 1.01797748, + "epoch": 0.23911017586051406, + "flos": 22635142214400.0, + "grad_norm": 1.7482725516799646, + "language_loss": 0.74849367, + "learning_rate": 3.5585680049801566e-06, + "loss": 0.77038825, + "num_input_tokens_seen": 85562765, + "router_z_loss_clip": 0.93310547, + "router_z_loss_mlp": 0.15270996, + "step": 3977, + "time_per_iteration": 2.5297985076904297 + }, + { + "auxiliary_loss_clip": 0.01164842, + "auxiliary_loss_mlp": 0.01046153, + "balance_loss_clip": 1.06574297, + "balance_loss_mlp": 1.02830696, + "epoch": 0.23917029911318202, + "flos": 23653281219840.0, + "grad_norm": 3.0776328437167955, + "language_loss": 0.72203952, + "learning_rate": 3.5583239111204764e-06, + "loss": 0.74414951, + "num_input_tokens_seen": 85581755, + "router_z_loss_clip": 0.99169922, + "router_z_loss_mlp": 0.1784668, + "step": 3978, + "time_per_iteration": 2.503661632537842 + }, + { + "auxiliary_loss_clip": 0.01172452, + "auxiliary_loss_mlp": 0.01045883, + "balance_loss_clip": 1.07241583, + "balance_loss_mlp": 1.02860999, + "epoch": 0.23923042236585, + "flos": 22783597125120.0, + "grad_norm": 2.0839195358676186, + "language_loss": 0.79210526, + "learning_rate": 3.558079758168997e-06, + "loss": 0.81428862, + "num_input_tokens_seen": 85599455, + "router_z_loss_clip": 0.99951172, + "router_z_loss_mlp": 0.17272949, + "step": 3979, + "time_per_iteration": 2.5409207344055176 + }, + { + "auxiliary_loss_clip": 0.01160034, + "auxiliary_loss_mlp": 0.01053573, + "balance_loss_clip": 1.06224847, + "balance_loss_mlp": 1.0354054, + "epoch": 0.23929054561851795, + "flos": 28147717457280.0, + "grad_norm": 1.6467856662187499, + "language_loss": 0.81760442, + "learning_rate": 3.557835546134977e-06, + "loss": 0.83974046, + "num_input_tokens_seen": 85619970, + "router_z_loss_clip": 0.97851562, + "router_z_loss_mlp": 0.18188477, + "step": 3980, + "time_per_iteration": 2.5405168533325195 + }, + { + "auxiliary_loss_clip": 0.0115952, + "auxiliary_loss_mlp": 0.01037288, + "balance_loss_clip": 1.06496215, + "balance_loss_mlp": 1.02001464, + "epoch": 0.23935066887118592, + "flos": 21686525982720.0, + "grad_norm": 1.8650221143160348, + "language_loss": 0.83853245, + "learning_rate": 3.5575912750276775e-06, + "loss": 0.86050057, + "num_input_tokens_seen": 85638850, + "router_z_loss_clip": 0.94628906, + "router_z_loss_mlp": 0.17285156, + "step": 3981, + "time_per_iteration": 2.48457670211792 + }, + { + "auxiliary_loss_clip": 0.0116646, + "auxiliary_loss_mlp": 0.01048278, + "balance_loss_clip": 1.06588316, + "balance_loss_mlp": 1.02951431, + "epoch": 0.2394107921238539, + "flos": 32122274198400.0, + "grad_norm": 1.742602523828159, + "language_loss": 0.76699501, + "learning_rate": 3.5573469448563607e-06, + "loss": 0.78914237, + "num_input_tokens_seen": 85656285, + "router_z_loss_clip": 1.00390625, + "router_z_loss_mlp": 0.18762207, + "step": 3982, + "time_per_iteration": 2.500091075897217 + }, + { + "auxiliary_loss_clip": 0.01165826, + "auxiliary_loss_mlp": 0.01048488, + "balance_loss_clip": 1.07079649, + "balance_loss_mlp": 1.03190625, + "epoch": 0.23947091537652188, + "flos": 17019180650880.0, + "grad_norm": 2.4419478896740214, + "language_loss": 0.77998835, + "learning_rate": 3.5571025556302915e-06, + "loss": 0.80213153, + "num_input_tokens_seen": 85673020, + "router_z_loss_clip": 0.95068359, + "router_z_loss_mlp": 0.16577148, + "step": 3983, + "time_per_iteration": 2.575242042541504 + }, + { + "auxiliary_loss_clip": 0.01155749, + "auxiliary_loss_mlp": 0.01047186, + "balance_loss_clip": 1.05892813, + "balance_loss_mlp": 1.02953064, + "epoch": 0.23953103862918984, + "flos": 20593584904320.0, + "grad_norm": 1.9506827732589247, + "language_loss": 0.73317748, + "learning_rate": 3.556858107358737e-06, + "loss": 0.75520682, + "num_input_tokens_seen": 85692565, + "router_z_loss_clip": 0.96923828, + "router_z_loss_mlp": 0.17651367, + "step": 3984, + "time_per_iteration": 3.883979320526123 + }, + { + "auxiliary_loss_clip": 0.01162729, + "auxiliary_loss_mlp": 0.0104769, + "balance_loss_clip": 1.06352651, + "balance_loss_mlp": 1.0308578, + "epoch": 0.2395911618818578, + "flos": 20704405340160.0, + "grad_norm": 2.5352641107721654, + "language_loss": 0.78611171, + "learning_rate": 3.5566136000509674e-06, + "loss": 0.80821598, + "num_input_tokens_seen": 85709730, + "router_z_loss_clip": 0.9921875, + "router_z_loss_mlp": 0.16833496, + "step": 3985, + "time_per_iteration": 3.908886194229126 + }, + { + "auxiliary_loss_clip": 0.01162537, + "auxiliary_loss_mlp": 0.01058067, + "balance_loss_clip": 1.0644275, + "balance_loss_mlp": 1.03960156, + "epoch": 0.23965128513452577, + "flos": 27053519402880.0, + "grad_norm": 1.8243475253328392, + "language_loss": 0.7339958, + "learning_rate": 3.556369033716254e-06, + "loss": 0.7562018, + "num_input_tokens_seen": 85730045, + "router_z_loss_clip": 0.98046875, + "router_z_loss_mlp": 0.18457031, + "step": 3986, + "time_per_iteration": 2.5409536361694336 + }, + { + "auxiliary_loss_clip": 0.01158415, + "auxiliary_loss_mlp": 0.01054672, + "balance_loss_clip": 1.05921376, + "balance_loss_mlp": 1.03652847, + "epoch": 0.23971140838719374, + "flos": 23144319457920.0, + "grad_norm": 2.0922505632776045, + "language_loss": 0.88016409, + "learning_rate": 3.556124408363871e-06, + "loss": 0.90229499, + "num_input_tokens_seen": 85747590, + "router_z_loss_clip": 0.99121094, + "router_z_loss_mlp": 0.18127441, + "step": 3987, + "time_per_iteration": 3.8452064990997314 + }, + { + "auxiliary_loss_clip": 0.0116112, + "auxiliary_loss_mlp": 0.01045166, + "balance_loss_clip": 1.06770003, + "balance_loss_mlp": 1.02974653, + "epoch": 0.23977153163986173, + "flos": 18034554309120.0, + "grad_norm": 2.520378586021324, + "language_loss": 0.82995194, + "learning_rate": 3.5558797240030945e-06, + "loss": 0.85201478, + "num_input_tokens_seen": 85763460, + "router_z_loss_clip": 0.93359375, + "router_z_loss_mlp": 0.15423584, + "step": 3988, + "time_per_iteration": 2.416109323501587 + }, + { + "auxiliary_loss_clip": 0.0116025, + "auxiliary_loss_mlp": 0.0103874, + "balance_loss_clip": 1.06403601, + "balance_loss_mlp": 1.02134156, + "epoch": 0.2398316548925297, + "flos": 18113378705280.0, + "grad_norm": 1.6634654362933448, + "language_loss": 0.85365415, + "learning_rate": 3.5556349806432035e-06, + "loss": 0.87564397, + "num_input_tokens_seen": 85782050, + "router_z_loss_clip": 0.96240234, + "router_z_loss_mlp": 0.17401123, + "step": 3989, + "time_per_iteration": 2.430194616317749 + }, + { + "auxiliary_loss_clip": 0.01159264, + "auxiliary_loss_mlp": 0.01046205, + "balance_loss_clip": 1.06316519, + "balance_loss_mlp": 1.0291338, + "epoch": 0.23989177814519766, + "flos": 12567730014720.0, + "grad_norm": 2.688781721097779, + "language_loss": 0.84681249, + "learning_rate": 3.555390178293477e-06, + "loss": 0.86886716, + "num_input_tokens_seen": 85797400, + "router_z_loss_clip": 0.96044922, + "router_z_loss_mlp": 0.1706543, + "step": 3990, + "time_per_iteration": 2.4337124824523926 + }, + { + "auxiliary_loss_clip": 0.0115958, + "auxiliary_loss_mlp": 0.01044886, + "balance_loss_clip": 1.06251097, + "balance_loss_mlp": 1.0288527, + "epoch": 0.23995190139786562, + "flos": 25264593423360.0, + "grad_norm": 1.5327061771999875, + "language_loss": 0.75577766, + "learning_rate": 3.5551453169631994e-06, + "loss": 0.77782232, + "num_input_tokens_seen": 85818995, + "router_z_loss_clip": 0.96972656, + "router_z_loss_mlp": 0.16027832, + "step": 3991, + "time_per_iteration": 2.500211477279663 + }, + { + "auxiliary_loss_clip": 0.01081881, + "auxiliary_loss_mlp": 0.01005715, + "balance_loss_clip": 1.0483923, + "balance_loss_mlp": 1.00370026, + "epoch": 0.2400120246505336, + "flos": 61960379650560.0, + "grad_norm": 0.9143492223218251, + "language_loss": 0.63757539, + "learning_rate": 3.554900396661656e-06, + "loss": 0.65845138, + "num_input_tokens_seen": 85876695, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 0.0201416, + "step": 3992, + "time_per_iteration": 3.047124147415161 + }, + { + "auxiliary_loss_clip": 0.01090585, + "auxiliary_loss_mlp": 0.01006394, + "balance_loss_clip": 1.05761862, + "balance_loss_mlp": 1.00403619, + "epoch": 0.24007214790320155, + "flos": 66708560540160.0, + "grad_norm": 0.8110483843328994, + "language_loss": 0.62946057, + "learning_rate": 3.5546554173981334e-06, + "loss": 0.65043032, + "num_input_tokens_seen": 85940990, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.02355957, + "step": 3993, + "time_per_iteration": 3.1489012241363525 + }, + { + "auxiliary_loss_clip": 0.01162122, + "auxiliary_loss_mlp": 0.01063801, + "balance_loss_clip": 1.06256962, + "balance_loss_mlp": 1.04361928, + "epoch": 0.24013227115586952, + "flos": 25809070757760.0, + "grad_norm": 1.699819289245678, + "language_loss": 0.77131689, + "learning_rate": 3.5544103791819218e-06, + "loss": 0.79357606, + "num_input_tokens_seen": 85961165, + "router_z_loss_clip": 0.99560547, + "router_z_loss_mlp": 0.2019043, + "step": 3994, + "time_per_iteration": 2.5906155109405518 + }, + { + "auxiliary_loss_clip": 0.01160593, + "auxiliary_loss_mlp": 0.01056944, + "balance_loss_clip": 1.06104553, + "balance_loss_mlp": 1.03809667, + "epoch": 0.2401923944085375, + "flos": 25557480921600.0, + "grad_norm": 1.8102719293129146, + "language_loss": 0.78340268, + "learning_rate": 3.5541652820223124e-06, + "loss": 0.80557805, + "num_input_tokens_seen": 85982710, + "router_z_loss_clip": 0.99462891, + "router_z_loss_mlp": 0.18847656, + "step": 3995, + "time_per_iteration": 2.5299270153045654 + }, + { + "auxiliary_loss_clip": 0.01093802, + "auxiliary_loss_mlp": 0.01005339, + "balance_loss_clip": 1.06102324, + "balance_loss_mlp": 1.00330687, + "epoch": 0.24025251766120548, + "flos": 54941138478720.0, + "grad_norm": 0.9109082865435668, + "language_loss": 0.63518679, + "learning_rate": 3.5539201259286006e-06, + "loss": 0.65617824, + "num_input_tokens_seen": 86046935, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.02032471, + "step": 3996, + "time_per_iteration": 3.1358180046081543 + }, + { + "auxiliary_loss_clip": 0.01167079, + "auxiliary_loss_mlp": 0.01042364, + "balance_loss_clip": 1.06664062, + "balance_loss_mlp": 1.02505445, + "epoch": 0.24031264091387344, + "flos": 20631075724800.0, + "grad_norm": 2.62699688612979, + "language_loss": 0.69606799, + "learning_rate": 3.5536749109100808e-06, + "loss": 0.71816242, + "num_input_tokens_seen": 86064355, + "router_z_loss_clip": 1.00390625, + "router_z_loss_mlp": 0.1730957, + "step": 3997, + "time_per_iteration": 2.4716238975524902 + }, + { + "auxiliary_loss_clip": 0.01160343, + "auxiliary_loss_mlp": 0.0104432, + "balance_loss_clip": 1.06212568, + "balance_loss_mlp": 1.02734506, + "epoch": 0.2403727641665414, + "flos": 20886256920960.0, + "grad_norm": 1.7897163567265675, + "language_loss": 0.87288415, + "learning_rate": 3.5534296369760535e-06, + "loss": 0.89493078, + "num_input_tokens_seen": 86081340, + "router_z_loss_clip": 0.98144531, + "router_z_loss_mlp": 0.16967773, + "step": 3998, + "time_per_iteration": 2.5120861530303955 + }, + { + "auxiliary_loss_clip": 0.01160147, + "auxiliary_loss_mlp": 0.0104405, + "balance_loss_clip": 1.05668187, + "balance_loss_mlp": 1.02744365, + "epoch": 0.24043288741920937, + "flos": 22820046451200.0, + "grad_norm": 1.5809110337267183, + "language_loss": 0.75950563, + "learning_rate": 3.5531843041358183e-06, + "loss": 0.78154755, + "num_input_tokens_seen": 86102260, + "router_z_loss_clip": 1.03417969, + "router_z_loss_mlp": 0.1661377, + "step": 3999, + "time_per_iteration": 2.5838401317596436 + }, + { + "auxiliary_loss_clip": 0.01168822, + "auxiliary_loss_mlp": 0.01041974, + "balance_loss_clip": 1.07026196, + "balance_loss_mlp": 1.02568984, + "epoch": 0.24049301067187734, + "flos": 27959652823680.0, + "grad_norm": 2.0272753134847292, + "language_loss": 0.72291112, + "learning_rate": 3.552938912398679e-06, + "loss": 0.74501908, + "num_input_tokens_seen": 86123400, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.1628418, + "step": 4000, + "time_per_iteration": 2.581805944442749 + }, + { + "auxiliary_loss_clip": 0.01179172, + "auxiliary_loss_mlp": 0.01039299, + "balance_loss_clip": 1.07507992, + "balance_loss_mlp": 1.02225184, + "epoch": 0.24055313392454533, + "flos": 27451409333760.0, + "grad_norm": 1.9998591605547582, + "language_loss": 0.66706961, + "learning_rate": 3.5526934617739397e-06, + "loss": 0.68925434, + "num_input_tokens_seen": 86144060, + "router_z_loss_clip": 1.04101562, + "router_z_loss_mlp": 0.17053223, + "step": 4001, + "time_per_iteration": 2.5085389614105225 + }, + { + "auxiliary_loss_clip": 0.01160631, + "auxiliary_loss_mlp": 0.01039015, + "balance_loss_clip": 1.06457353, + "balance_loss_mlp": 1.02178884, + "epoch": 0.2406132571772133, + "flos": 25556618995200.0, + "grad_norm": 1.7527192781520184, + "language_loss": 0.82729948, + "learning_rate": 3.5524479522709095e-06, + "loss": 0.84929591, + "num_input_tokens_seen": 86163005, + "router_z_loss_clip": 0.96044922, + "router_z_loss_mlp": 0.17211914, + "step": 4002, + "time_per_iteration": 2.490490436553955 + }, + { + "auxiliary_loss_clip": 0.01175948, + "auxiliary_loss_mlp": 0.01038822, + "balance_loss_clip": 1.07696307, + "balance_loss_mlp": 1.02260995, + "epoch": 0.24067338042988126, + "flos": 24791398629120.0, + "grad_norm": 2.004087294048114, + "language_loss": 0.82690835, + "learning_rate": 3.552202383898897e-06, + "loss": 0.84905601, + "num_input_tokens_seen": 86182580, + "router_z_loss_clip": 0.99023438, + "router_z_loss_mlp": 0.16223145, + "step": 4003, + "time_per_iteration": 2.4955694675445557 + }, + { + "auxiliary_loss_clip": 0.0115686, + "auxiliary_loss_mlp": 0.01039284, + "balance_loss_clip": 1.06052065, + "balance_loss_mlp": 1.02257109, + "epoch": 0.24073350368254923, + "flos": 21177923356800.0, + "grad_norm": 2.2625723906062962, + "language_loss": 0.87203002, + "learning_rate": 3.551956756667215e-06, + "loss": 0.89399153, + "num_input_tokens_seen": 86200665, + "router_z_loss_clip": 0.96386719, + "router_z_loss_mlp": 0.1673584, + "step": 4004, + "time_per_iteration": 2.468013286590576 + }, + { + "auxiliary_loss_clip": 0.01160001, + "auxiliary_loss_mlp": 0.010496, + "balance_loss_clip": 1.06009889, + "balance_loss_mlp": 1.0322907, + "epoch": 0.2407936269352172, + "flos": 22494300986880.0, + "grad_norm": 2.505923275681979, + "language_loss": 0.78425932, + "learning_rate": 3.551711070585177e-06, + "loss": 0.80635536, + "num_input_tokens_seen": 86221640, + "router_z_loss_clip": 0.99902344, + "router_z_loss_mlp": 0.1730957, + "step": 4005, + "time_per_iteration": 2.513181447982788 + }, + { + "auxiliary_loss_clip": 0.01160075, + "auxiliary_loss_mlp": 0.01045386, + "balance_loss_clip": 1.06191969, + "balance_loss_mlp": 1.02731442, + "epoch": 0.24085375018788516, + "flos": 18551129754240.0, + "grad_norm": 1.5832299585387803, + "language_loss": 0.79008967, + "learning_rate": 3.5514653256620995e-06, + "loss": 0.81214428, + "num_input_tokens_seen": 86240795, + "router_z_loss_clip": 0.98144531, + "router_z_loss_mlp": 0.18066406, + "step": 4006, + "time_per_iteration": 2.438950300216675 + }, + { + "auxiliary_loss_clip": 0.01173531, + "auxiliary_loss_mlp": 0.01040761, + "balance_loss_clip": 1.0692215, + "balance_loss_mlp": 1.02255774, + "epoch": 0.24091387344055312, + "flos": 24170539023360.0, + "grad_norm": 1.7659590387341448, + "language_loss": 0.71992075, + "learning_rate": 3.551219521907302e-06, + "loss": 0.7420637, + "num_input_tokens_seen": 86262000, + "router_z_loss_clip": 1.04345703, + "router_z_loss_mlp": 0.18200684, + "step": 4007, + "time_per_iteration": 2.5941946506500244 + }, + { + "auxiliary_loss_clip": 0.01163614, + "auxiliary_loss_mlp": 0.01044607, + "balance_loss_clip": 1.06933236, + "balance_loss_mlp": 1.02833533, + "epoch": 0.24097399669322112, + "flos": 11036319615360.0, + "grad_norm": 2.790653680624622, + "language_loss": 0.76290852, + "learning_rate": 3.5509736593301042e-06, + "loss": 0.78499073, + "num_input_tokens_seen": 86279680, + "router_z_loss_clip": 0.94189453, + "router_z_loss_mlp": 0.16271973, + "step": 4008, + "time_per_iteration": 2.4469611644744873 + }, + { + "auxiliary_loss_clip": 0.01165445, + "auxiliary_loss_mlp": 0.01038303, + "balance_loss_clip": 1.06491828, + "balance_loss_mlp": 1.02067161, + "epoch": 0.24103411994588908, + "flos": 17165085696000.0, + "grad_norm": 3.001032060327389, + "language_loss": 0.74991035, + "learning_rate": 3.5507277379398295e-06, + "loss": 0.77194786, + "num_input_tokens_seen": 86297180, + "router_z_loss_clip": 1.00585938, + "router_z_loss_mlp": 0.17626953, + "step": 4009, + "time_per_iteration": 2.459841728210449 + }, + { + "auxiliary_loss_clip": 0.01162495, + "auxiliary_loss_mlp": 0.01044898, + "balance_loss_clip": 1.06536865, + "balance_loss_mlp": 1.0290904, + "epoch": 0.24109424319855705, + "flos": 20667956014080.0, + "grad_norm": 1.8905514105197123, + "language_loss": 0.80086178, + "learning_rate": 3.550481757745804e-06, + "loss": 0.8229357, + "num_input_tokens_seen": 86317660, + "router_z_loss_clip": 0.97216797, + "router_z_loss_mlp": 0.1583252, + "step": 4010, + "time_per_iteration": 2.45600962638855 + }, + { + "auxiliary_loss_clip": 0.01161981, + "auxiliary_loss_mlp": 0.01050819, + "balance_loss_clip": 1.06338382, + "balance_loss_mlp": 1.03156686, + "epoch": 0.241154366451225, + "flos": 28181796485760.0, + "grad_norm": 1.9326500186169884, + "language_loss": 0.7066505, + "learning_rate": 3.5502357187573555e-06, + "loss": 0.72877842, + "num_input_tokens_seen": 86338325, + "router_z_loss_clip": 0.98583984, + "router_z_loss_mlp": 0.19226074, + "step": 4011, + "time_per_iteration": 2.4913012981414795 + }, + { + "auxiliary_loss_clip": 0.01161771, + "auxiliary_loss_mlp": 0.01042061, + "balance_loss_clip": 1.06389999, + "balance_loss_mlp": 1.02527642, + "epoch": 0.24121448970389298, + "flos": 21689722293120.0, + "grad_norm": 1.5307724586083709, + "language_loss": 0.69214249, + "learning_rate": 3.5499896209838118e-06, + "loss": 0.71418077, + "num_input_tokens_seen": 86357615, + "router_z_loss_clip": 0.97900391, + "router_z_loss_mlp": 0.16784668, + "step": 4012, + "time_per_iteration": 2.478198289871216 + }, + { + "auxiliary_loss_clip": 0.01175113, + "auxiliary_loss_mlp": 0.01042075, + "balance_loss_clip": 1.07660472, + "balance_loss_mlp": 1.02270317, + "epoch": 0.24127461295656094, + "flos": 39676191269760.0, + "grad_norm": 2.1986311337274222, + "language_loss": 0.73548234, + "learning_rate": 3.5497434644345073e-06, + "loss": 0.75765419, + "num_input_tokens_seen": 86380355, + "router_z_loss_clip": 0.98535156, + "router_z_loss_mlp": 0.19372559, + "step": 4013, + "time_per_iteration": 2.632720708847046 + }, + { + "auxiliary_loss_clip": 0.01185873, + "auxiliary_loss_mlp": 0.01040926, + "balance_loss_clip": 1.08253741, + "balance_loss_mlp": 1.02391458, + "epoch": 0.2413347362092289, + "flos": 19135863256320.0, + "grad_norm": 1.8898072827949526, + "language_loss": 0.87866855, + "learning_rate": 3.5494972491187753e-06, + "loss": 0.90093648, + "num_input_tokens_seen": 86399125, + "router_z_loss_clip": 1.03320312, + "router_z_loss_mlp": 0.17004395, + "step": 4014, + "time_per_iteration": 2.544177532196045 + }, + { + "auxiliary_loss_clip": 0.01158543, + "auxiliary_loss_mlp": 0.01045113, + "balance_loss_clip": 1.05684447, + "balance_loss_mlp": 1.02655244, + "epoch": 0.2413948594618969, + "flos": 26939430829440.0, + "grad_norm": 2.5318916097597914, + "language_loss": 0.9494772, + "learning_rate": 3.549250975045952e-06, + "loss": 0.97151375, + "num_input_tokens_seen": 86418625, + "router_z_loss_clip": 1.01708984, + "router_z_loss_mlp": 0.18554688, + "step": 4015, + "time_per_iteration": 2.511340618133545 + }, + { + "auxiliary_loss_clip": 0.01162218, + "auxiliary_loss_mlp": 0.01042245, + "balance_loss_clip": 1.06302392, + "balance_loss_mlp": 1.02513802, + "epoch": 0.24145498271456486, + "flos": 25228108183680.0, + "grad_norm": 1.6927926660696866, + "language_loss": 0.82460958, + "learning_rate": 3.5490046422253768e-06, + "loss": 0.84665424, + "num_input_tokens_seen": 86438375, + "router_z_loss_clip": 0.99121094, + "router_z_loss_mlp": 0.17102051, + "step": 4016, + "time_per_iteration": 2.5439536571502686 + }, + { + "auxiliary_loss_clip": 0.01150472, + "auxiliary_loss_mlp": 0.01045276, + "balance_loss_clip": 1.05686545, + "balance_loss_mlp": 1.02831221, + "epoch": 0.24151510596723283, + "flos": 40661759617920.0, + "grad_norm": 1.764418257238623, + "language_loss": 0.68824744, + "learning_rate": 3.54875825066639e-06, + "loss": 0.71020496, + "num_input_tokens_seen": 86463230, + "router_z_loss_clip": 0.93652344, + "router_z_loss_mlp": 0.16967773, + "step": 4017, + "time_per_iteration": 2.7092902660369873 + }, + { + "auxiliary_loss_clip": 0.01162109, + "auxiliary_loss_mlp": 0.01055452, + "balance_loss_clip": 1.06105947, + "balance_loss_mlp": 1.03567493, + "epoch": 0.2415752292199008, + "flos": 18146667634560.0, + "grad_norm": 1.8177927123759967, + "language_loss": 0.84934247, + "learning_rate": 3.5485118003783353e-06, + "loss": 0.87151808, + "num_input_tokens_seen": 86481230, + "router_z_loss_clip": 1.01123047, + "router_z_loss_mlp": 0.19763184, + "step": 4018, + "time_per_iteration": 3.9273533821105957 + }, + { + "auxiliary_loss_clip": 0.01085196, + "auxiliary_loss_mlp": 0.01014881, + "balance_loss_clip": 1.05089092, + "balance_loss_mlp": 1.01241946, + "epoch": 0.24163535247256876, + "flos": 67288409792640.0, + "grad_norm": 0.814575356958419, + "language_loss": 0.60594714, + "learning_rate": 3.548265291370558e-06, + "loss": 0.62694794, + "num_input_tokens_seen": 86541260, + "router_z_loss_clip": 0.34277344, + "router_z_loss_mlp": 0.02459717, + "step": 4019, + "time_per_iteration": 3.14817214012146 + }, + { + "auxiliary_loss_clip": 0.01170108, + "auxiliary_loss_mlp": 0.01046327, + "balance_loss_clip": 1.06729484, + "balance_loss_mlp": 1.02906537, + "epoch": 0.24169547572523672, + "flos": 24929941386240.0, + "grad_norm": 1.9446396693775523, + "language_loss": 0.73636127, + "learning_rate": 3.5480187236524055e-06, + "loss": 0.75852555, + "num_input_tokens_seen": 86559580, + "router_z_loss_clip": 1.02783203, + "router_z_loss_mlp": 0.17260742, + "step": 4020, + "time_per_iteration": 2.545562982559204 + }, + { + "auxiliary_loss_clip": 0.01165688, + "auxiliary_loss_mlp": 0.01037135, + "balance_loss_clip": 1.06818223, + "balance_loss_mlp": 1.02019572, + "epoch": 0.24175559897790472, + "flos": 18728312567040.0, + "grad_norm": 2.7728017500360815, + "language_loss": 0.81513405, + "learning_rate": 3.5477720972332285e-06, + "loss": 0.83716226, + "num_input_tokens_seen": 86577560, + "router_z_loss_clip": 0.97460938, + "router_z_loss_mlp": 0.16943359, + "step": 4021, + "time_per_iteration": 2.4848718643188477 + }, + { + "auxiliary_loss_clip": 0.01170497, + "auxiliary_loss_mlp": 0.01064732, + "balance_loss_clip": 1.06514513, + "balance_loss_mlp": 1.0435487, + "epoch": 0.24181572223057268, + "flos": 23039281111680.0, + "grad_norm": 1.9412038865613719, + "language_loss": 0.76086587, + "learning_rate": 3.547525412122378e-06, + "loss": 0.78321815, + "num_input_tokens_seen": 86595350, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.21203613, + "step": 4022, + "time_per_iteration": 2.56955623626709 + }, + { + "auxiliary_loss_clip": 0.01176771, + "auxiliary_loss_mlp": 0.01044424, + "balance_loss_clip": 1.06971955, + "balance_loss_mlp": 1.02577984, + "epoch": 0.24187584548324065, + "flos": 20376145923840.0, + "grad_norm": 1.9524194717401544, + "language_loss": 0.75420219, + "learning_rate": 3.5472786683292083e-06, + "loss": 0.7764141, + "num_input_tokens_seen": 86614805, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.18652344, + "step": 4023, + "time_per_iteration": 2.5391523838043213 + }, + { + "auxiliary_loss_clip": 0.01161726, + "auxiliary_loss_mlp": 0.01047214, + "balance_loss_clip": 1.06483364, + "balance_loss_mlp": 1.02957058, + "epoch": 0.2419359687359086, + "flos": 21397517153280.0, + "grad_norm": 2.1335180477315348, + "language_loss": 0.82443559, + "learning_rate": 3.5470318658630766e-06, + "loss": 0.84652495, + "num_input_tokens_seen": 86633700, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.17651367, + "step": 4024, + "time_per_iteration": 2.487692356109619 + }, + { + "auxiliary_loss_clip": 0.01169454, + "auxiliary_loss_mlp": 0.01047689, + "balance_loss_clip": 1.0727725, + "balance_loss_mlp": 1.02954531, + "epoch": 0.24199609198857658, + "flos": 18369385914240.0, + "grad_norm": 2.044273159794767, + "language_loss": 0.86120826, + "learning_rate": 3.5467850047333424e-06, + "loss": 0.8833797, + "num_input_tokens_seen": 86650905, + "router_z_loss_clip": 0.96728516, + "router_z_loss_mlp": 0.18151855, + "step": 4025, + "time_per_iteration": 2.4248249530792236 + }, + { + "auxiliary_loss_clip": 0.01172413, + "auxiliary_loss_mlp": 0.01044194, + "balance_loss_clip": 1.07241249, + "balance_loss_mlp": 1.02577662, + "epoch": 0.24205621524124454, + "flos": 19463871277440.0, + "grad_norm": 1.9962217541709086, + "language_loss": 0.7136519, + "learning_rate": 3.546538084949365e-06, + "loss": 0.73581791, + "num_input_tokens_seen": 86669185, + "router_z_loss_clip": 1.00097656, + "router_z_loss_mlp": 0.18432617, + "step": 4026, + "time_per_iteration": 2.479154109954834 + }, + { + "auxiliary_loss_clip": 0.01159526, + "auxiliary_loss_mlp": 0.01042003, + "balance_loss_clip": 1.06398773, + "balance_loss_mlp": 1.02519417, + "epoch": 0.2421163384939125, + "flos": 14976330451200.0, + "grad_norm": 1.8990016650859023, + "language_loss": 0.64303327, + "learning_rate": 3.546291106520509e-06, + "loss": 0.66504854, + "num_input_tokens_seen": 86686805, + "router_z_loss_clip": 0.95654297, + "router_z_loss_mlp": 0.16784668, + "step": 4027, + "time_per_iteration": 4.000975847244263 + }, + { + "auxiliary_loss_clip": 0.01169866, + "auxiliary_loss_mlp": 0.01041334, + "balance_loss_clip": 1.06827736, + "balance_loss_mlp": 1.02495527, + "epoch": 0.2421764617465805, + "flos": 18662057930880.0, + "grad_norm": 2.3554033006897632, + "language_loss": 0.70533717, + "learning_rate": 3.5460440694561388e-06, + "loss": 0.72744918, + "num_input_tokens_seen": 86705520, + "router_z_loss_clip": 1.01660156, + "router_z_loss_mlp": 0.16381836, + "step": 4028, + "time_per_iteration": 2.4384641647338867 + }, + { + "auxiliary_loss_clip": 0.01103585, + "auxiliary_loss_mlp": 0.01013835, + "balance_loss_clip": 1.07057881, + "balance_loss_mlp": 1.01107824, + "epoch": 0.24223658499924847, + "flos": 64347327164160.0, + "grad_norm": 0.8519022869777354, + "language_loss": 0.55326635, + "learning_rate": 3.545796973765623e-06, + "loss": 0.57444048, + "num_input_tokens_seen": 86767320, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 0.02755737, + "step": 4029, + "time_per_iteration": 3.067269802093506 + }, + { + "auxiliary_loss_clip": 0.01163997, + "auxiliary_loss_mlp": 0.01047095, + "balance_loss_clip": 1.06517744, + "balance_loss_mlp": 1.02853394, + "epoch": 0.24229670825191643, + "flos": 25775243124480.0, + "grad_norm": 1.5433070796437252, + "language_loss": 0.74147689, + "learning_rate": 3.54554981945833e-06, + "loss": 0.76358783, + "num_input_tokens_seen": 86788110, + "router_z_loss_clip": 0.98876953, + "router_z_loss_mlp": 0.18579102, + "step": 4030, + "time_per_iteration": 2.4826650619506836 + }, + { + "auxiliary_loss_clip": 0.01166035, + "auxiliary_loss_mlp": 0.01052182, + "balance_loss_clip": 1.06611633, + "balance_loss_mlp": 1.03443205, + "epoch": 0.2423568315045844, + "flos": 20667094087680.0, + "grad_norm": 1.7995808778412967, + "language_loss": 0.76819676, + "learning_rate": 3.5453026065436343e-06, + "loss": 0.79037893, + "num_input_tokens_seen": 86807640, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 0.17773438, + "step": 4031, + "time_per_iteration": 4.008038282394409 + }, + { + "auxiliary_loss_clip": 0.01178025, + "auxiliary_loss_mlp": 0.010459, + "balance_loss_clip": 1.07359529, + "balance_loss_mlp": 1.02822161, + "epoch": 0.24241695475725236, + "flos": 22416805393920.0, + "grad_norm": 2.135656893796346, + "language_loss": 0.65256989, + "learning_rate": 3.5450553350309083e-06, + "loss": 0.6748091, + "num_input_tokens_seen": 86826795, + "router_z_loss_clip": 1.04492188, + "router_z_loss_mlp": 0.17651367, + "step": 4032, + "time_per_iteration": 2.4713051319122314 + }, + { + "auxiliary_loss_clip": 0.01167758, + "auxiliary_loss_mlp": 0.01049601, + "balance_loss_clip": 1.06953883, + "balance_loss_mlp": 1.03146958, + "epoch": 0.24247707800992033, + "flos": 17128995505920.0, + "grad_norm": 5.12569471308584, + "language_loss": 0.81421316, + "learning_rate": 3.5448080049295286e-06, + "loss": 0.83638674, + "num_input_tokens_seen": 86843175, + "router_z_loss_clip": 0.98095703, + "router_z_loss_mlp": 0.18127441, + "step": 4033, + "time_per_iteration": 2.419600486755371 + }, + { + "auxiliary_loss_clip": 0.0116865, + "auxiliary_loss_mlp": 0.01051715, + "balance_loss_clip": 1.06839514, + "balance_loss_mlp": 1.03417909, + "epoch": 0.2425372012625883, + "flos": 31613743399680.0, + "grad_norm": 1.9566836331534792, + "language_loss": 0.69093776, + "learning_rate": 3.5445606162488754e-06, + "loss": 0.71314132, + "num_input_tokens_seen": 86863185, + "router_z_loss_clip": 1.00146484, + "router_z_loss_mlp": 0.17541504, + "step": 4034, + "time_per_iteration": 2.584465503692627 + }, + { + "auxiliary_loss_clip": 0.01159731, + "auxiliary_loss_mlp": 0.01039032, + "balance_loss_clip": 1.06105804, + "balance_loss_mlp": 1.02051854, + "epoch": 0.24259732451525629, + "flos": 16326032924160.0, + "grad_norm": 2.243822309452869, + "language_loss": 0.96666437, + "learning_rate": 3.5443131689983283e-06, + "loss": 0.98865199, + "num_input_tokens_seen": 86880040, + "router_z_loss_clip": 0.98535156, + "router_z_loss_mlp": 0.18505859, + "step": 4035, + "time_per_iteration": 2.450596332550049 + }, + { + "auxiliary_loss_clip": 0.01163654, + "auxiliary_loss_mlp": 0.01048383, + "balance_loss_clip": 1.06707287, + "balance_loss_mlp": 1.03246868, + "epoch": 0.24265744776792425, + "flos": 22856639431680.0, + "grad_norm": 1.71820255384571, + "language_loss": 0.78257149, + "learning_rate": 3.5440656631872715e-06, + "loss": 0.80469191, + "num_input_tokens_seen": 86900610, + "router_z_loss_clip": 0.96630859, + "router_z_loss_mlp": 0.15899658, + "step": 4036, + "time_per_iteration": 2.5200791358947754 + }, + { + "auxiliary_loss_clip": 0.01162359, + "auxiliary_loss_mlp": 0.01060081, + "balance_loss_clip": 1.06203961, + "balance_loss_mlp": 1.03998196, + "epoch": 0.24271757102059222, + "flos": 21871573873920.0, + "grad_norm": 1.8291643559391508, + "language_loss": 0.74466574, + "learning_rate": 3.5438180988250898e-06, + "loss": 0.76689005, + "num_input_tokens_seen": 86919385, + "router_z_loss_clip": 1.00195312, + "router_z_loss_mlp": 0.20092773, + "step": 4037, + "time_per_iteration": 2.4945528507232666 + }, + { + "auxiliary_loss_clip": 0.01177981, + "auxiliary_loss_mlp": 0.01044641, + "balance_loss_clip": 1.07540584, + "balance_loss_mlp": 1.02611613, + "epoch": 0.24277769427326018, + "flos": 19208582340480.0, + "grad_norm": 2.116752004009889, + "language_loss": 0.76288188, + "learning_rate": 3.543570475921171e-06, + "loss": 0.78510809, + "num_input_tokens_seen": 86938885, + "router_z_loss_clip": 1.02636719, + "router_z_loss_mlp": 0.18530273, + "step": 4038, + "time_per_iteration": 2.497976303100586 + }, + { + "auxiliary_loss_clip": 0.01169145, + "auxiliary_loss_mlp": 0.01050515, + "balance_loss_clip": 1.06680715, + "balance_loss_mlp": 1.03201342, + "epoch": 0.24283781752592815, + "flos": 19499889640320.0, + "grad_norm": 1.8913046129574944, + "language_loss": 0.72102499, + "learning_rate": 3.543322794484905e-06, + "loss": 0.74322158, + "num_input_tokens_seen": 86957705, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.18493652, + "step": 4039, + "time_per_iteration": 2.5104761123657227 + }, + { + "auxiliary_loss_clip": 0.01162344, + "auxiliary_loss_mlp": 0.01045422, + "balance_loss_clip": 1.06403875, + "balance_loss_mlp": 1.02848208, + "epoch": 0.2428979407785961, + "flos": 19902196944000.0, + "grad_norm": 1.851903377767678, + "language_loss": 0.78437191, + "learning_rate": 3.5430750545256843e-06, + "loss": 0.80644965, + "num_input_tokens_seen": 86975845, + "router_z_loss_clip": 0.98291016, + "router_z_loss_mlp": 0.16943359, + "step": 4040, + "time_per_iteration": 2.5081937313079834 + }, + { + "auxiliary_loss_clip": 0.01164688, + "auxiliary_loss_mlp": 0.0104189, + "balance_loss_clip": 1.06795299, + "balance_loss_mlp": 1.02623844, + "epoch": 0.2429580640312641, + "flos": 24715878284160.0, + "grad_norm": 1.8243511504653802, + "language_loss": 0.80560887, + "learning_rate": 3.5428272560529027e-06, + "loss": 0.82767463, + "num_input_tokens_seen": 86994800, + "router_z_loss_clip": 0.96630859, + "router_z_loss_mlp": 0.15649414, + "step": 4041, + "time_per_iteration": 2.5090644359588623 + }, + { + "auxiliary_loss_clip": 0.01165192, + "auxiliary_loss_mlp": 0.01047187, + "balance_loss_clip": 1.0669663, + "balance_loss_mlp": 1.03085494, + "epoch": 0.24301818728393207, + "flos": 25630343660160.0, + "grad_norm": 7.029984697364883, + "language_loss": 0.77012062, + "learning_rate": 3.542579399075957e-06, + "loss": 0.79224443, + "num_input_tokens_seen": 87016845, + "router_z_loss_clip": 0.98242188, + "router_z_loss_mlp": 0.16345215, + "step": 4042, + "time_per_iteration": 2.5120480060577393 + }, + { + "auxiliary_loss_clip": 0.01156947, + "auxiliary_loss_mlp": 0.01035274, + "balance_loss_clip": 1.06188905, + "balance_loss_mlp": 1.02005053, + "epoch": 0.24307831053660003, + "flos": 26141388410880.0, + "grad_norm": 1.778724381448577, + "language_loss": 0.81665814, + "learning_rate": 3.542331483604246e-06, + "loss": 0.83858025, + "num_input_tokens_seen": 87036270, + "router_z_loss_clip": 0.95068359, + "router_z_loss_mlp": 0.15234375, + "step": 4043, + "time_per_iteration": 2.5548532009124756 + }, + { + "auxiliary_loss_clip": 0.01168143, + "auxiliary_loss_mlp": 0.01041761, + "balance_loss_clip": 1.06640649, + "balance_loss_mlp": 1.02348614, + "epoch": 0.243138433789268, + "flos": 14972415868800.0, + "grad_norm": 2.2199015553809374, + "language_loss": 0.72854173, + "learning_rate": 3.5420835096471706e-06, + "loss": 0.75064075, + "num_input_tokens_seen": 87049920, + "router_z_loss_clip": 1.01757812, + "router_z_loss_mlp": 0.18286133, + "step": 4044, + "time_per_iteration": 2.586176872253418 + }, + { + "auxiliary_loss_clip": 0.0116988, + "auxiliary_loss_mlp": 0.01053542, + "balance_loss_clip": 1.07156181, + "balance_loss_mlp": 1.03384864, + "epoch": 0.24319855704193596, + "flos": 25191694771200.0, + "grad_norm": 1.9241937189991016, + "language_loss": 0.83226168, + "learning_rate": 3.5418354772141337e-06, + "loss": 0.85449594, + "num_input_tokens_seen": 87068230, + "router_z_loss_clip": 0.98339844, + "router_z_loss_mlp": 0.19677734, + "step": 4045, + "time_per_iteration": 2.502931833267212 + }, + { + "auxiliary_loss_clip": 0.01171046, + "auxiliary_loss_mlp": 0.0104906, + "balance_loss_clip": 1.07089424, + "balance_loss_mlp": 1.03158426, + "epoch": 0.24325868029460393, + "flos": 22127221946880.0, + "grad_norm": 1.5469501369475207, + "language_loss": 0.86654508, + "learning_rate": 3.541587386314541e-06, + "loss": 0.88874614, + "num_input_tokens_seen": 87086435, + "router_z_loss_clip": 1.00244141, + "router_z_loss_mlp": 0.17462158, + "step": 4046, + "time_per_iteration": 2.4685182571411133 + }, + { + "auxiliary_loss_clip": 0.01159374, + "auxiliary_loss_mlp": 0.01042793, + "balance_loss_clip": 1.06286836, + "balance_loss_mlp": 1.02524567, + "epoch": 0.2433188035472719, + "flos": 23582106420480.0, + "grad_norm": 1.7814279919827083, + "language_loss": 0.72666901, + "learning_rate": 3.5413392369578e-06, + "loss": 0.74869072, + "num_input_tokens_seen": 87105340, + "router_z_loss_clip": 0.96484375, + "router_z_loss_mlp": 0.17565918, + "step": 4047, + "time_per_iteration": 2.4769670963287354 + }, + { + "auxiliary_loss_clip": 0.01166749, + "auxiliary_loss_mlp": 0.01043748, + "balance_loss_clip": 1.06687546, + "balance_loss_mlp": 1.02553344, + "epoch": 0.2433789267999399, + "flos": 24462815990400.0, + "grad_norm": 3.830788280784194, + "language_loss": 0.72600937, + "learning_rate": 3.5410910291533213e-06, + "loss": 0.74811429, + "num_input_tokens_seen": 87125780, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 0.18225098, + "step": 4048, + "time_per_iteration": 2.499579906463623 + }, + { + "auxiliary_loss_clip": 0.01167032, + "auxiliary_loss_mlp": 0.0105158, + "balance_loss_clip": 1.06818676, + "balance_loss_mlp": 1.03483117, + "epoch": 0.24343905005260785, + "flos": 16727909264640.0, + "grad_norm": 2.3666310480878567, + "language_loss": 0.72816491, + "learning_rate": 3.5408427629105155e-06, + "loss": 0.75035101, + "num_input_tokens_seen": 87144470, + "router_z_loss_clip": 0.98779297, + "router_z_loss_mlp": 0.16748047, + "step": 4049, + "time_per_iteration": 2.449763298034668 + }, + { + "auxiliary_loss_clip": 0.01155263, + "auxiliary_loss_mlp": 0.01054044, + "balance_loss_clip": 1.05958271, + "balance_loss_mlp": 1.03528035, + "epoch": 0.24349917330527582, + "flos": 20043756443520.0, + "grad_norm": 1.6859021862301966, + "language_loss": 0.73777032, + "learning_rate": 3.5405944382387985e-06, + "loss": 0.75986344, + "num_input_tokens_seen": 87162830, + "router_z_loss_clip": 0.95605469, + "router_z_loss_mlp": 0.18774414, + "step": 4050, + "time_per_iteration": 2.4680287837982178 + }, + { + "auxiliary_loss_clip": 0.0115926, + "auxiliary_loss_mlp": 0.0104736, + "balance_loss_clip": 1.06503785, + "balance_loss_mlp": 1.03123116, + "epoch": 0.24355929655794378, + "flos": 17420554200960.0, + "grad_norm": 3.9166558599779933, + "language_loss": 0.75372034, + "learning_rate": 3.5403460551475854e-06, + "loss": 0.77578652, + "num_input_tokens_seen": 87180905, + "router_z_loss_clip": 0.94384766, + "router_z_loss_mlp": 0.16137695, + "step": 4051, + "time_per_iteration": 2.4292235374450684 + }, + { + "auxiliary_loss_clip": 0.01154741, + "auxiliary_loss_mlp": 0.01044341, + "balance_loss_clip": 1.05900252, + "balance_loss_mlp": 1.02687693, + "epoch": 0.24361941981061175, + "flos": 25410929431680.0, + "grad_norm": 2.1432820873165976, + "language_loss": 0.70463467, + "learning_rate": 3.540097613646296e-06, + "loss": 0.72662544, + "num_input_tokens_seen": 87202290, + "router_z_loss_clip": 0.95751953, + "router_z_loss_mlp": 0.17468262, + "step": 4052, + "time_per_iteration": 2.5534541606903076 + }, + { + "auxiliary_loss_clip": 0.01164039, + "auxiliary_loss_mlp": 0.01050605, + "balance_loss_clip": 1.06482923, + "balance_loss_mlp": 1.03273547, + "epoch": 0.2436795430632797, + "flos": 22820800636800.0, + "grad_norm": 1.709787378729045, + "language_loss": 0.81293571, + "learning_rate": 3.539849113744351e-06, + "loss": 0.83508217, + "num_input_tokens_seen": 87221650, + "router_z_loss_clip": 0.99121094, + "router_z_loss_mlp": 0.17871094, + "step": 4053, + "time_per_iteration": 2.5067713260650635 + }, + { + "auxiliary_loss_clip": 0.0116538, + "auxiliary_loss_mlp": 0.01035147, + "balance_loss_clip": 1.06681967, + "balance_loss_mlp": 1.01785016, + "epoch": 0.2437396663159477, + "flos": 15157786982400.0, + "grad_norm": 1.6797193923408957, + "language_loss": 0.78109121, + "learning_rate": 3.539600555451172e-06, + "loss": 0.80309647, + "num_input_tokens_seen": 87238515, + "router_z_loss_clip": 0.98583984, + "router_z_loss_mlp": 0.17297363, + "step": 4054, + "time_per_iteration": 2.5052363872528076 + }, + { + "auxiliary_loss_clip": 0.01159468, + "auxiliary_loss_mlp": 0.01046361, + "balance_loss_clip": 1.06338596, + "balance_loss_mlp": 1.02994561, + "epoch": 0.24379978956861567, + "flos": 22091131756800.0, + "grad_norm": 1.6490018187294122, + "language_loss": 0.84313977, + "learning_rate": 3.5393519387761866e-06, + "loss": 0.86519808, + "num_input_tokens_seen": 87256290, + "router_z_loss_clip": 0.96142578, + "router_z_loss_mlp": 0.16418457, + "step": 4055, + "time_per_iteration": 2.5502822399139404 + }, + { + "auxiliary_loss_clip": 0.01168688, + "auxiliary_loss_mlp": 0.01047245, + "balance_loss_clip": 1.06726933, + "balance_loss_mlp": 1.02911282, + "epoch": 0.24385991282128364, + "flos": 31467766527360.0, + "grad_norm": 4.33434688480209, + "language_loss": 0.55104077, + "learning_rate": 3.5391032637288217e-06, + "loss": 0.57320011, + "num_input_tokens_seen": 87277085, + "router_z_loss_clip": 1.01367188, + "router_z_loss_mlp": 0.18139648, + "step": 4056, + "time_per_iteration": 2.584890842437744 + }, + { + "auxiliary_loss_clip": 0.01168691, + "auxiliary_loss_mlp": 0.0104498, + "balance_loss_clip": 1.06755733, + "balance_loss_mlp": 1.02711105, + "epoch": 0.2439200360739516, + "flos": 23838795987840.0, + "grad_norm": 2.3249770776255323, + "language_loss": 0.80156392, + "learning_rate": 3.538854530318506e-06, + "loss": 0.82370067, + "num_input_tokens_seen": 87293020, + "router_z_loss_clip": 1.01220703, + "router_z_loss_mlp": 0.1786499, + "step": 4057, + "time_per_iteration": 2.579329252243042 + }, + { + "auxiliary_loss_clip": 0.01161361, + "auxiliary_loss_mlp": 0.01043229, + "balance_loss_clip": 1.06541824, + "balance_loss_mlp": 1.02603936, + "epoch": 0.24398015932661957, + "flos": 19169978198400.0, + "grad_norm": 1.718662141329938, + "language_loss": 0.79679358, + "learning_rate": 3.538605738554673e-06, + "loss": 0.81883943, + "num_input_tokens_seen": 87311445, + "router_z_loss_clip": 0.95800781, + "router_z_loss_mlp": 0.17175293, + "step": 4058, + "time_per_iteration": 2.493870735168457 + }, + { + "auxiliary_loss_clip": 0.01158435, + "auxiliary_loss_mlp": 0.010452, + "balance_loss_clip": 1.05776691, + "balance_loss_mlp": 1.02677059, + "epoch": 0.24404028257928753, + "flos": 25262474520960.0, + "grad_norm": 1.6800487239140518, + "language_loss": 0.85677099, + "learning_rate": 3.538356888446756e-06, + "loss": 0.87880731, + "num_input_tokens_seen": 87332055, + "router_z_loss_clip": 1.00634766, + "router_z_loss_mlp": 0.1842041, + "step": 4059, + "time_per_iteration": 2.5215883255004883 + }, + { + "auxiliary_loss_clip": 0.01161089, + "auxiliary_loss_mlp": 0.01043087, + "balance_loss_clip": 1.06617451, + "balance_loss_mlp": 1.02691078, + "epoch": 0.2441004058319555, + "flos": 26467600752000.0, + "grad_norm": 1.6053532464140479, + "language_loss": 0.73971695, + "learning_rate": 3.5381079800041913e-06, + "loss": 0.76175869, + "num_input_tokens_seen": 87351295, + "router_z_loss_clip": 0.94824219, + "router_z_loss_mlp": 0.16186523, + "step": 4060, + "time_per_iteration": 2.5418083667755127 + }, + { + "auxiliary_loss_clip": 0.01166657, + "auxiliary_loss_mlp": 0.01062622, + "balance_loss_clip": 1.06369662, + "balance_loss_mlp": 1.04046106, + "epoch": 0.2441605290846235, + "flos": 26760524163840.0, + "grad_norm": 2.3108281233181596, + "language_loss": 0.73266196, + "learning_rate": 3.5378590132364182e-06, + "loss": 0.7549547, + "num_input_tokens_seen": 87370650, + "router_z_loss_clip": 1.02832031, + "router_z_loss_mlp": 0.22167969, + "step": 4061, + "time_per_iteration": 2.57745099067688 + }, + { + "auxiliary_loss_clip": 0.01164497, + "auxiliary_loss_mlp": 0.01048111, + "balance_loss_clip": 1.06781471, + "balance_loss_mlp": 1.03167236, + "epoch": 0.24422065233729146, + "flos": 21105850717440.0, + "grad_norm": 1.8729777217377979, + "language_loss": 0.76281589, + "learning_rate": 3.5376099881528768e-06, + "loss": 0.78494197, + "num_input_tokens_seen": 87389020, + "router_z_loss_clip": 0.96826172, + "router_z_loss_mlp": 0.16430664, + "step": 4062, + "time_per_iteration": 3.8922226428985596 + }, + { + "auxiliary_loss_clip": 0.01162088, + "auxiliary_loss_mlp": 0.01054234, + "balance_loss_clip": 1.06526875, + "balance_loss_mlp": 1.03483868, + "epoch": 0.24428077558995942, + "flos": 25263156879360.0, + "grad_norm": 1.5242305079732295, + "language_loss": 0.85165131, + "learning_rate": 3.537360904763011e-06, + "loss": 0.87381458, + "num_input_tokens_seen": 87409695, + "router_z_loss_clip": 0.96923828, + "router_z_loss_mlp": 0.19384766, + "step": 4063, + "time_per_iteration": 2.5793778896331787 + }, + { + "auxiliary_loss_clip": 0.01176889, + "auxiliary_loss_mlp": 0.0104653, + "balance_loss_clip": 1.07419109, + "balance_loss_mlp": 1.02888703, + "epoch": 0.24434089884262739, + "flos": 20485278420480.0, + "grad_norm": 2.5088672199689865, + "language_loss": 0.68326783, + "learning_rate": 3.5371117630762656e-06, + "loss": 0.70550203, + "num_input_tokens_seen": 87428250, + "router_z_loss_clip": 1.02734375, + "router_z_loss_mlp": 0.17626953, + "step": 4064, + "time_per_iteration": 2.4943389892578125 + }, + { + "auxiliary_loss_clip": 0.01161878, + "auxiliary_loss_mlp": 0.01042649, + "balance_loss_clip": 1.06198645, + "balance_loss_mlp": 1.02542365, + "epoch": 0.24440102209529535, + "flos": 23621895711360.0, + "grad_norm": 1.690056703835077, + "language_loss": 0.7032361, + "learning_rate": 3.536862563102088e-06, + "loss": 0.72528136, + "num_input_tokens_seen": 87449380, + "router_z_loss_clip": 0.99804688, + "router_z_loss_mlp": 0.17211914, + "step": 4065, + "time_per_iteration": 2.547224283218384 + }, + { + "auxiliary_loss_clip": 0.01165931, + "auxiliary_loss_mlp": 0.01049465, + "balance_loss_clip": 1.06332469, + "balance_loss_mlp": 1.02971172, + "epoch": 0.24446114534796332, + "flos": 20554729367040.0, + "grad_norm": 1.72540062171109, + "language_loss": 0.84225941, + "learning_rate": 3.5366133048499282e-06, + "loss": 0.86441338, + "num_input_tokens_seen": 87465365, + "router_z_loss_clip": 1.02685547, + "router_z_loss_mlp": 0.19763184, + "step": 4066, + "time_per_iteration": 2.7035887241363525 + }, + { + "auxiliary_loss_clip": 0.01084371, + "auxiliary_loss_mlp": 0.01008169, + "balance_loss_clip": 1.05036581, + "balance_loss_mlp": 1.00537634, + "epoch": 0.24452126860063128, + "flos": 60389575009920.0, + "grad_norm": 0.7312559393153121, + "language_loss": 0.52265692, + "learning_rate": 3.5363639883292374e-06, + "loss": 0.54358232, + "num_input_tokens_seen": 87522525, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.02792358, + "step": 4067, + "time_per_iteration": 3.0213308334350586 + }, + { + "auxiliary_loss_clip": 0.01159883, + "auxiliary_loss_mlp": 0.01045565, + "balance_loss_clip": 1.06136, + "balance_loss_mlp": 1.02785015, + "epoch": 0.24458139185329927, + "flos": 15121660878720.0, + "grad_norm": 2.769074624440973, + "language_loss": 0.72972071, + "learning_rate": 3.5361146135494706e-06, + "loss": 0.75177515, + "num_input_tokens_seen": 87539170, + "router_z_loss_clip": 0.98681641, + "router_z_loss_mlp": 0.17712402, + "step": 4068, + "time_per_iteration": 2.4521918296813965 + }, + { + "auxiliary_loss_clip": 0.01159668, + "auxiliary_loss_mlp": 0.01046907, + "balance_loss_clip": 1.06252623, + "balance_loss_mlp": 1.02872801, + "epoch": 0.24464151510596724, + "flos": 27998723842560.0, + "grad_norm": 1.6917502141885838, + "language_loss": 0.77767885, + "learning_rate": 3.5358651805200835e-06, + "loss": 0.79974461, + "num_input_tokens_seen": 87558875, + "router_z_loss_clip": 0.97119141, + "router_z_loss_mlp": 0.1817627, + "step": 4069, + "time_per_iteration": 2.624419689178467 + }, + { + "auxiliary_loss_clip": 0.0115746, + "auxiliary_loss_mlp": 0.0104761, + "balance_loss_clip": 1.06023502, + "balance_loss_mlp": 1.02881122, + "epoch": 0.2447016383586352, + "flos": 19792884879360.0, + "grad_norm": 1.726777486011305, + "language_loss": 0.80727518, + "learning_rate": 3.5356156892505347e-06, + "loss": 0.82932585, + "num_input_tokens_seen": 87576485, + "router_z_loss_clip": 0.97119141, + "router_z_loss_mlp": 0.18786621, + "step": 4070, + "time_per_iteration": 3.880272150039673 + }, + { + "auxiliary_loss_clip": 0.01157497, + "auxiliary_loss_mlp": 0.0104624, + "balance_loss_clip": 1.05776668, + "balance_loss_mlp": 1.02834678, + "epoch": 0.24476176161130317, + "flos": 26067340523520.0, + "grad_norm": 1.4625153343448856, + "language_loss": 0.84599209, + "learning_rate": 3.5353661397502854e-06, + "loss": 0.86802948, + "num_input_tokens_seen": 87598620, + "router_z_loss_clip": 0.99707031, + "router_z_loss_mlp": 0.17883301, + "step": 4071, + "time_per_iteration": 2.5174334049224854 + }, + { + "auxiliary_loss_clip": 0.01166138, + "auxiliary_loss_mlp": 0.01048288, + "balance_loss_clip": 1.06180239, + "balance_loss_mlp": 1.02950048, + "epoch": 0.24482188486397113, + "flos": 18843550375680.0, + "grad_norm": 1.7821045965243474, + "language_loss": 0.79879022, + "learning_rate": 3.535116532028798e-06, + "loss": 0.82093441, + "num_input_tokens_seen": 87616595, + "router_z_loss_clip": 1.04492188, + "router_z_loss_mlp": 0.18774414, + "step": 4072, + "time_per_iteration": 3.9193108081817627 + }, + { + "auxiliary_loss_clip": 0.01162922, + "auxiliary_loss_mlp": 0.01046215, + "balance_loss_clip": 1.06677413, + "balance_loss_mlp": 1.03068817, + "epoch": 0.2448820081166391, + "flos": 21251791676160.0, + "grad_norm": 1.5060130472984106, + "language_loss": 0.70191061, + "learning_rate": 3.5348668660955382e-06, + "loss": 0.72400194, + "num_input_tokens_seen": 87635755, + "router_z_loss_clip": 0.96240234, + "router_z_loss_mlp": 0.15533447, + "step": 4073, + "time_per_iteration": 2.4525206089019775 + }, + { + "auxiliary_loss_clip": 0.01158245, + "auxiliary_loss_mlp": 0.01053522, + "balance_loss_clip": 1.06355262, + "balance_loss_mlp": 1.03577173, + "epoch": 0.2449421313693071, + "flos": 23950586090880.0, + "grad_norm": 2.295734842649942, + "language_loss": 0.67595547, + "learning_rate": 3.5346171419599728e-06, + "loss": 0.69807315, + "num_input_tokens_seen": 87652885, + "router_z_loss_clip": 0.9453125, + "router_z_loss_mlp": 0.17749023, + "step": 4074, + "time_per_iteration": 3.8831417560577393 + }, + { + "auxiliary_loss_clip": 0.01084225, + "auxiliary_loss_mlp": 0.01011626, + "balance_loss_clip": 1.04959154, + "balance_loss_mlp": 1.00885463, + "epoch": 0.24500225462197506, + "flos": 60687669980160.0, + "grad_norm": 0.8987624754254766, + "language_loss": 0.68688625, + "learning_rate": 3.5343673596315718e-06, + "loss": 0.70784473, + "num_input_tokens_seen": 87713220, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.02770996, + "step": 4075, + "time_per_iteration": 3.191643238067627 + }, + { + "auxiliary_loss_clip": 0.0115604, + "auxiliary_loss_mlp": 0.01051454, + "balance_loss_clip": 1.06121063, + "balance_loss_mlp": 1.03297615, + "epoch": 0.24506237787464302, + "flos": 26284204886400.0, + "grad_norm": 1.8234794546736426, + "language_loss": 0.79633403, + "learning_rate": 3.5341175191198063e-06, + "loss": 0.81840891, + "num_input_tokens_seen": 87732680, + "router_z_loss_clip": 0.94873047, + "router_z_loss_mlp": 0.18481445, + "step": 4076, + "time_per_iteration": 2.5057759284973145 + }, + { + "auxiliary_loss_clip": 0.01172165, + "auxiliary_loss_mlp": 0.01052684, + "balance_loss_clip": 1.06791139, + "balance_loss_mlp": 1.03334832, + "epoch": 0.245122501127311, + "flos": 20552287242240.0, + "grad_norm": 1.902205694472396, + "language_loss": 0.82153416, + "learning_rate": 3.533867620434151e-06, + "loss": 0.84378266, + "num_input_tokens_seen": 87751880, + "router_z_loss_clip": 1.04150391, + "router_z_loss_mlp": 0.19335938, + "step": 4077, + "time_per_iteration": 2.566998243331909 + }, + { + "auxiliary_loss_clip": 0.01162135, + "auxiliary_loss_mlp": 0.0104974, + "balance_loss_clip": 1.06323659, + "balance_loss_mlp": 1.03021312, + "epoch": 0.24518262437997895, + "flos": 29132603447040.0, + "grad_norm": 1.7853229450530443, + "language_loss": 0.62133062, + "learning_rate": 3.533617663584082e-06, + "loss": 0.64344931, + "num_input_tokens_seen": 87771795, + "router_z_loss_clip": 0.98925781, + "router_z_loss_mlp": 0.19555664, + "step": 4078, + "time_per_iteration": 2.549222946166992 + }, + { + "auxiliary_loss_clip": 0.01162277, + "auxiliary_loss_mlp": 0.01040071, + "balance_loss_clip": 1.06519878, + "balance_loss_mlp": 1.02332199, + "epoch": 0.24524274763264692, + "flos": 23476924419840.0, + "grad_norm": 1.598988159494396, + "language_loss": 0.75374264, + "learning_rate": 3.5333676485790765e-06, + "loss": 0.77576607, + "num_input_tokens_seen": 87793640, + "router_z_loss_clip": 0.97070312, + "router_z_loss_mlp": 0.16748047, + "step": 4079, + "time_per_iteration": 2.5886218547821045 + }, + { + "auxiliary_loss_clip": 0.0116375, + "auxiliary_loss_mlp": 0.01043562, + "balance_loss_clip": 1.06790662, + "balance_loss_mlp": 1.0250845, + "epoch": 0.24530287088531488, + "flos": 17201175886080.0, + "grad_norm": 1.8339250495073716, + "language_loss": 0.75354624, + "learning_rate": 3.5331175754286173e-06, + "loss": 0.77561939, + "num_input_tokens_seen": 87812390, + "router_z_loss_clip": 0.95800781, + "router_z_loss_mlp": 0.18481445, + "step": 4080, + "time_per_iteration": 2.5516507625579834 + }, + { + "auxiliary_loss_clip": 0.0115042, + "auxiliary_loss_mlp": 0.01043272, + "balance_loss_clip": 1.05759549, + "balance_loss_mlp": 1.02643919, + "epoch": 0.24536299413798288, + "flos": 14867449349760.0, + "grad_norm": 1.7358328288248999, + "language_loss": 0.83328629, + "learning_rate": 3.532867444142186e-06, + "loss": 0.85522324, + "num_input_tokens_seen": 87830640, + "router_z_loss_clip": 0.92724609, + "router_z_loss_mlp": 0.16821289, + "step": 4081, + "time_per_iteration": 2.441330909729004 + }, + { + "auxiliary_loss_clip": 0.01159165, + "auxiliary_loss_mlp": 0.01050258, + "balance_loss_clip": 1.06220031, + "balance_loss_mlp": 1.03374755, + "epoch": 0.24542311739065084, + "flos": 35262051886080.0, + "grad_norm": 1.7525112250750472, + "language_loss": 0.73568833, + "learning_rate": 3.532617254729267e-06, + "loss": 0.75778258, + "num_input_tokens_seen": 87850450, + "router_z_loss_clip": 0.97021484, + "router_z_loss_mlp": 0.16516113, + "step": 4082, + "time_per_iteration": 2.582491397857666 + }, + { + "auxiliary_loss_clip": 0.01166791, + "auxiliary_loss_mlp": 0.01044958, + "balance_loss_clip": 1.07152081, + "balance_loss_mlp": 1.02956772, + "epoch": 0.2454832406433188, + "flos": 21503130117120.0, + "grad_norm": 1.5472923218763086, + "language_loss": 0.72033668, + "learning_rate": 3.5323670071993485e-06, + "loss": 0.74245417, + "num_input_tokens_seen": 87868810, + "router_z_loss_clip": 0.95263672, + "router_z_loss_mlp": 0.15393066, + "step": 4083, + "time_per_iteration": 2.5244009494781494 + }, + { + "auxiliary_loss_clip": 0.01159146, + "auxiliary_loss_mlp": 0.0104532, + "balance_loss_clip": 1.06036353, + "balance_loss_mlp": 1.02672338, + "epoch": 0.24554336389598677, + "flos": 14756664827520.0, + "grad_norm": 1.960080033521119, + "language_loss": 0.7475242, + "learning_rate": 3.532116701561919e-06, + "loss": 0.76956892, + "num_input_tokens_seen": 87885685, + "router_z_loss_clip": 0.98681641, + "router_z_loss_mlp": 0.18591309, + "step": 4084, + "time_per_iteration": 2.476088285446167 + }, + { + "auxiliary_loss_clip": 0.01154162, + "auxiliary_loss_mlp": 0.01042903, + "balance_loss_clip": 1.05978298, + "balance_loss_mlp": 1.02522445, + "epoch": 0.24560348714865474, + "flos": 14976402278400.0, + "grad_norm": 1.7743102524090522, + "language_loss": 0.85220909, + "learning_rate": 3.531866337826471e-06, + "loss": 0.87417972, + "num_input_tokens_seen": 87903715, + "router_z_loss_clip": 0.94433594, + "router_z_loss_mlp": 0.17675781, + "step": 4085, + "time_per_iteration": 2.476012945175171 + }, + { + "auxiliary_loss_clip": 0.01160092, + "auxiliary_loss_mlp": 0.01051156, + "balance_loss_clip": 1.061854, + "balance_loss_mlp": 1.03354883, + "epoch": 0.2456636104013227, + "flos": 22675326554880.0, + "grad_norm": 2.1208113571952873, + "language_loss": 0.79040438, + "learning_rate": 3.5316159160024982e-06, + "loss": 0.81251681, + "num_input_tokens_seen": 87923375, + "router_z_loss_clip": 0.98291016, + "router_z_loss_mlp": 0.17614746, + "step": 4086, + "time_per_iteration": 2.512223958969116 + }, + { + "auxiliary_loss_clip": 0.01154543, + "auxiliary_loss_mlp": 0.01044142, + "balance_loss_clip": 1.05994523, + "balance_loss_mlp": 1.02699995, + "epoch": 0.2457237336539907, + "flos": 27417869009280.0, + "grad_norm": 1.5208160431204332, + "language_loss": 0.75025821, + "learning_rate": 3.531365436099496e-06, + "loss": 0.77224505, + "num_input_tokens_seen": 87943115, + "router_z_loss_clip": 0.94580078, + "router_z_loss_mlp": 0.17126465, + "step": 4087, + "time_per_iteration": 2.521217107772827 + }, + { + "auxiliary_loss_clip": 0.01159167, + "auxiliary_loss_mlp": 0.01047421, + "balance_loss_clip": 1.06301475, + "balance_loss_mlp": 1.02968252, + "epoch": 0.24578385690665866, + "flos": 20412379768320.0, + "grad_norm": 2.328429709099073, + "language_loss": 0.7984674, + "learning_rate": 3.5311148981269635e-06, + "loss": 0.82053328, + "num_input_tokens_seen": 87959505, + "router_z_loss_clip": 0.96191406, + "router_z_loss_mlp": 0.17749023, + "step": 4088, + "time_per_iteration": 2.433931827545166 + }, + { + "auxiliary_loss_clip": 0.01149567, + "auxiliary_loss_mlp": 0.01035904, + "balance_loss_clip": 1.05721128, + "balance_loss_mlp": 1.02017987, + "epoch": 0.24584398015932662, + "flos": 23915393740800.0, + "grad_norm": 1.4804304034628917, + "language_loss": 0.77363569, + "learning_rate": 3.5308643020944e-06, + "loss": 0.79549038, + "num_input_tokens_seen": 87979725, + "router_z_loss_clip": 0.92431641, + "router_z_loss_mlp": 0.15722656, + "step": 4089, + "time_per_iteration": 2.5561983585357666 + }, + { + "auxiliary_loss_clip": 0.01158964, + "auxiliary_loss_mlp": 0.01047541, + "balance_loss_clip": 1.06233799, + "balance_loss_mlp": 1.02971935, + "epoch": 0.2459041034119946, + "flos": 41496359103360.0, + "grad_norm": 1.81919493965081, + "language_loss": 0.8115707, + "learning_rate": 3.530613648011309e-06, + "loss": 0.83363575, + "num_input_tokens_seen": 87998270, + "router_z_loss_clip": 0.96582031, + "router_z_loss_mlp": 0.17834473, + "step": 4090, + "time_per_iteration": 2.617936134338379 + }, + { + "auxiliary_loss_clip": 0.01155248, + "auxiliary_loss_mlp": 0.01052154, + "balance_loss_clip": 1.05690849, + "balance_loss_mlp": 1.03393877, + "epoch": 0.24596422666466256, + "flos": 19936814676480.0, + "grad_norm": 1.8878390613669578, + "language_loss": 0.73588026, + "learning_rate": 3.5303629358871946e-06, + "loss": 0.75795418, + "num_input_tokens_seen": 88016760, + "router_z_loss_clip": 0.98339844, + "router_z_loss_mlp": 0.18237305, + "step": 4091, + "time_per_iteration": 2.4708120822906494 + }, + { + "auxiliary_loss_clip": 0.01165058, + "auxiliary_loss_mlp": 0.01044498, + "balance_loss_clip": 1.07153702, + "balance_loss_mlp": 1.02849984, + "epoch": 0.24602434991733052, + "flos": 21544391865600.0, + "grad_norm": 1.9092603788450586, + "language_loss": 0.76874566, + "learning_rate": 3.5301121657315653e-06, + "loss": 0.79084122, + "num_input_tokens_seen": 88036465, + "router_z_loss_clip": 0.93554688, + "router_z_loss_mlp": 0.15991211, + "step": 4092, + "time_per_iteration": 2.4580841064453125 + }, + { + "auxiliary_loss_clip": 0.01157962, + "auxiliary_loss_mlp": 0.0104065, + "balance_loss_clip": 1.05818319, + "balance_loss_mlp": 1.02360308, + "epoch": 0.24608447316999849, + "flos": 23185078416000.0, + "grad_norm": 4.1666410999435435, + "language_loss": 0.81482124, + "learning_rate": 3.5298613375539287e-06, + "loss": 0.83680737, + "num_input_tokens_seen": 88053270, + "router_z_loss_clip": 0.99707031, + "router_z_loss_mlp": 0.17016602, + "step": 4093, + "time_per_iteration": 2.5822300910949707 + }, + { + "auxiliary_loss_clip": 0.01158321, + "auxiliary_loss_mlp": 0.0104945, + "balance_loss_clip": 1.05926609, + "balance_loss_mlp": 1.03078222, + "epoch": 0.24614459642266648, + "flos": 19641951930240.0, + "grad_norm": 1.8632043274906385, + "language_loss": 0.871562, + "learning_rate": 3.529610451363797e-06, + "loss": 0.89363974, + "num_input_tokens_seen": 88072305, + "router_z_loss_clip": 0.99169922, + "router_z_loss_mlp": 0.18652344, + "step": 4094, + "time_per_iteration": 2.4491920471191406 + }, + { + "auxiliary_loss_clip": 0.01075861, + "auxiliary_loss_mlp": 0.01033294, + "balance_loss_clip": 1.04156137, + "balance_loss_mlp": 1.03121424, + "epoch": 0.24620471967533444, + "flos": 61739816186880.0, + "grad_norm": 0.7634589186079679, + "language_loss": 0.57503384, + "learning_rate": 3.5293595071706833e-06, + "loss": 0.59612536, + "num_input_tokens_seen": 88137995, + "router_z_loss_clip": 0.34277344, + "router_z_loss_mlp": 0.02078247, + "step": 4095, + "time_per_iteration": 3.1372883319854736 + }, + { + "auxiliary_loss_clip": 0.01078115, + "auxiliary_loss_mlp": 0.01016187, + "balance_loss_clip": 1.04482138, + "balance_loss_mlp": 1.01414812, + "epoch": 0.2462648429280024, + "flos": 69154436315520.0, + "grad_norm": 0.6410457798130051, + "language_loss": 0.56234848, + "learning_rate": 3.5291085049841042e-06, + "loss": 0.58329153, + "num_input_tokens_seen": 88208490, + "router_z_loss_clip": 0.33300781, + "router_z_loss_mlp": 0.02038574, + "step": 4096, + "time_per_iteration": 3.1711266040802 + }, + { + "auxiliary_loss_clip": 0.01158788, + "auxiliary_loss_mlp": 0.01039013, + "balance_loss_clip": 1.06368124, + "balance_loss_mlp": 1.02281249, + "epoch": 0.24632496618067037, + "flos": 29459605887360.0, + "grad_norm": 1.8010893034278912, + "language_loss": 0.77452999, + "learning_rate": 3.5288574448135773e-06, + "loss": 0.79650795, + "num_input_tokens_seen": 88228050, + "router_z_loss_clip": 0.95117188, + "router_z_loss_mlp": 0.16174316, + "step": 4097, + "time_per_iteration": 2.5257561206817627 + }, + { + "auxiliary_loss_clip": 0.01157949, + "auxiliary_loss_mlp": 0.01049741, + "balance_loss_clip": 1.05722964, + "balance_loss_mlp": 1.02959466, + "epoch": 0.24638508943333834, + "flos": 24316444068480.0, + "grad_norm": 1.7208329812130774, + "language_loss": 0.76177919, + "learning_rate": 3.5286063266686235e-06, + "loss": 0.78385603, + "num_input_tokens_seen": 88248090, + "router_z_loss_clip": 1.00732422, + "router_z_loss_mlp": 0.20153809, + "step": 4098, + "time_per_iteration": 2.494203567504883 + }, + { + "auxiliary_loss_clip": 0.01160979, + "auxiliary_loss_mlp": 0.01048536, + "balance_loss_clip": 1.06197572, + "balance_loss_mlp": 1.03194177, + "epoch": 0.2464452126860063, + "flos": 26613254401920.0, + "grad_norm": 2.5603517751679963, + "language_loss": 0.67990136, + "learning_rate": 3.528355150558764e-06, + "loss": 0.70199656, + "num_input_tokens_seen": 88267545, + "router_z_loss_clip": 0.99023438, + "router_z_loss_mlp": 0.1661377, + "step": 4099, + "time_per_iteration": 2.5245301723480225 + }, + { + "auxiliary_loss_clip": 0.01152936, + "auxiliary_loss_mlp": 0.01047313, + "balance_loss_clip": 1.05979252, + "balance_loss_mlp": 1.03029013, + "epoch": 0.24650533593867427, + "flos": 31212405763200.0, + "grad_norm": 2.4113202972893193, + "language_loss": 0.65779406, + "learning_rate": 3.5281039164935237e-06, + "loss": 0.67979658, + "num_input_tokens_seen": 88289785, + "router_z_loss_clip": 0.93212891, + "router_z_loss_mlp": 0.17016602, + "step": 4100, + "time_per_iteration": 2.540879726409912 + }, + { + "auxiliary_loss_clip": 0.01078429, + "auxiliary_loss_mlp": 0.01010264, + "balance_loss_clip": 1.04474401, + "balance_loss_mlp": 1.00813031, + "epoch": 0.24656545919134226, + "flos": 68494002900480.0, + "grad_norm": 0.7135597593764286, + "language_loss": 0.61522245, + "learning_rate": 3.5278526244824304e-06, + "loss": 0.63610935, + "num_input_tokens_seen": 88357320, + "router_z_loss_clip": 0.33691406, + "router_z_loss_mlp": 0.0213623, + "step": 4101, + "time_per_iteration": 3.185302257537842 + }, + { + "auxiliary_loss_clip": 0.011562, + "auxiliary_loss_mlp": 0.01043199, + "balance_loss_clip": 1.06035185, + "balance_loss_mlp": 1.025401, + "epoch": 0.24662558244401023, + "flos": 20084192179200.0, + "grad_norm": 1.5489138258818853, + "language_loss": 0.73609775, + "learning_rate": 3.527601274535012e-06, + "loss": 0.75809169, + "num_input_tokens_seen": 88377040, + "router_z_loss_clip": 0.95751953, + "router_z_loss_mlp": 0.17797852, + "step": 4102, + "time_per_iteration": 2.4855496883392334 + }, + { + "auxiliary_loss_clip": 0.01164896, + "auxiliary_loss_mlp": 0.01038554, + "balance_loss_clip": 1.06476092, + "balance_loss_mlp": 1.02142406, + "epoch": 0.2466857056966782, + "flos": 30701361012480.0, + "grad_norm": 2.283597255108338, + "language_loss": 0.76208717, + "learning_rate": 3.5273498666608004e-06, + "loss": 0.78412169, + "num_input_tokens_seen": 88395085, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 0.17150879, + "step": 4103, + "time_per_iteration": 2.536311388015747 + }, + { + "auxiliary_loss_clip": 0.01163928, + "auxiliary_loss_mlp": 0.01042252, + "balance_loss_clip": 1.06698799, + "balance_loss_mlp": 1.0244298, + "epoch": 0.24674582894934616, + "flos": 22528523669760.0, + "grad_norm": 2.2590400722764947, + "language_loss": 0.78380549, + "learning_rate": 3.5270984008693288e-06, + "loss": 0.80586731, + "num_input_tokens_seen": 88413205, + "router_z_loss_clip": 0.96923828, + "router_z_loss_mlp": 0.17822266, + "step": 4104, + "time_per_iteration": 2.4719345569610596 + }, + { + "auxiliary_loss_clip": 0.01160898, + "auxiliary_loss_mlp": 0.01043042, + "balance_loss_clip": 1.06650102, + "balance_loss_mlp": 1.0237186, + "epoch": 0.24680595220201412, + "flos": 20704297599360.0, + "grad_norm": 1.8022161243895243, + "language_loss": 0.8384189, + "learning_rate": 3.526846877170133e-06, + "loss": 0.86045831, + "num_input_tokens_seen": 88431525, + "router_z_loss_clip": 0.94433594, + "router_z_loss_mlp": 0.1932373, + "step": 4105, + "time_per_iteration": 2.5636563301086426 + }, + { + "auxiliary_loss_clip": 0.01158829, + "auxiliary_loss_mlp": 0.01048259, + "balance_loss_clip": 1.06244493, + "balance_loss_mlp": 1.03208196, + "epoch": 0.2468660754546821, + "flos": 21831174051840.0, + "grad_norm": 1.9626049351974957, + "language_loss": 0.76233554, + "learning_rate": 3.52659529557275e-06, + "loss": 0.78440636, + "num_input_tokens_seen": 88451210, + "router_z_loss_clip": 0.96386719, + "router_z_loss_mlp": 0.16162109, + "step": 4106, + "time_per_iteration": 3.9214398860931396 + }, + { + "auxiliary_loss_clip": 0.01164952, + "auxiliary_loss_mlp": 0.01044306, + "balance_loss_clip": 1.06982493, + "balance_loss_mlp": 1.02579236, + "epoch": 0.24692619870735008, + "flos": 15267709578240.0, + "grad_norm": 2.306921875899284, + "language_loss": 0.72542465, + "learning_rate": 3.5263436560867205e-06, + "loss": 0.74751723, + "num_input_tokens_seen": 88467790, + "router_z_loss_clip": 0.95068359, + "router_z_loss_mlp": 0.18505859, + "step": 4107, + "time_per_iteration": 2.431368827819824 + }, + { + "auxiliary_loss_clip": 0.01170311, + "auxiliary_loss_mlp": 0.01050249, + "balance_loss_clip": 1.07174814, + "balance_loss_mlp": 1.03228402, + "epoch": 0.24698632196001805, + "flos": 29680097523840.0, + "grad_norm": 1.5981261730721512, + "language_loss": 0.65485275, + "learning_rate": 3.526091958721587e-06, + "loss": 0.67705834, + "num_input_tokens_seen": 88490330, + "router_z_loss_clip": 0.98632812, + "router_z_loss_mlp": 0.1796875, + "step": 4108, + "time_per_iteration": 2.5648818016052246 + }, + { + "auxiliary_loss_clip": 0.01158251, + "auxiliary_loss_mlp": 0.010427, + "balance_loss_clip": 1.06069016, + "balance_loss_mlp": 1.02437806, + "epoch": 0.247046445212686, + "flos": 39165469741440.0, + "grad_norm": 1.7976012175346987, + "language_loss": 0.72894776, + "learning_rate": 3.5258402034868936e-06, + "loss": 0.75095725, + "num_input_tokens_seen": 88512435, + "router_z_loss_clip": 0.9765625, + "router_z_loss_mlp": 0.18322754, + "step": 4109, + "time_per_iteration": 2.641995668411255 + }, + { + "auxiliary_loss_clip": 0.01157156, + "auxiliary_loss_mlp": 0.01051539, + "balance_loss_clip": 1.05775785, + "balance_loss_mlp": 1.03362155, + "epoch": 0.24710656846535398, + "flos": 22998845376000.0, + "grad_norm": 2.3595754505783777, + "language_loss": 0.7928834, + "learning_rate": 3.5255883903921866e-06, + "loss": 0.81497037, + "num_input_tokens_seen": 88529780, + "router_z_loss_clip": 0.99365234, + "router_z_loss_mlp": 0.17919922, + "step": 4110, + "time_per_iteration": 2.4697206020355225 + }, + { + "auxiliary_loss_clip": 0.01156318, + "auxiliary_loss_mlp": 0.01035919, + "balance_loss_clip": 1.06017661, + "balance_loss_mlp": 1.01858616, + "epoch": 0.24716669171802194, + "flos": 26432803451520.0, + "grad_norm": 2.069373770661675, + "language_loss": 0.80799949, + "learning_rate": 3.5253365194470144e-06, + "loss": 0.8299219, + "num_input_tokens_seen": 88547200, + "router_z_loss_clip": 0.9609375, + "router_z_loss_mlp": 0.17321777, + "step": 4111, + "time_per_iteration": 2.4959585666656494 + }, + { + "auxiliary_loss_clip": 0.01160589, + "auxiliary_loss_mlp": 0.01048978, + "balance_loss_clip": 1.06200516, + "balance_loss_mlp": 1.03243184, + "epoch": 0.2472268149706899, + "flos": 23329870139520.0, + "grad_norm": 1.9811447885611735, + "language_loss": 0.7535823, + "learning_rate": 3.5250845906609294e-06, + "loss": 0.77567792, + "num_input_tokens_seen": 88566415, + "router_z_loss_clip": 0.98583984, + "router_z_loss_mlp": 0.16552734, + "step": 4112, + "time_per_iteration": 2.613935947418213 + }, + { + "auxiliary_loss_clip": 0.01159574, + "auxiliary_loss_mlp": 0.01058758, + "balance_loss_clip": 1.0588181, + "balance_loss_mlp": 1.04098403, + "epoch": 0.24728693822335787, + "flos": 23768734510080.0, + "grad_norm": 1.9216928158478919, + "language_loss": 0.82502258, + "learning_rate": 3.5248326040434835e-06, + "loss": 0.84720594, + "num_input_tokens_seen": 88585225, + "router_z_loss_clip": 1.00683594, + "router_z_loss_mlp": 0.17773438, + "step": 4113, + "time_per_iteration": 2.4744248390197754 + }, + { + "auxiliary_loss_clip": 0.01155231, + "auxiliary_loss_mlp": 0.01041004, + "balance_loss_clip": 1.05865002, + "balance_loss_mlp": 1.02332556, + "epoch": 0.24734706147602586, + "flos": 19317499355520.0, + "grad_norm": 2.083480564402251, + "language_loss": 0.87228405, + "learning_rate": 3.5245805596042322e-06, + "loss": 0.8942464, + "num_input_tokens_seen": 88603280, + "router_z_loss_clip": 0.96728516, + "router_z_loss_mlp": 0.17675781, + "step": 4114, + "time_per_iteration": 3.925828695297241 + }, + { + "auxiliary_loss_clip": 0.01155667, + "auxiliary_loss_mlp": 0.0104143, + "balance_loss_clip": 1.06003952, + "balance_loss_mlp": 1.02465677, + "epoch": 0.24740718472869383, + "flos": 28036932935040.0, + "grad_norm": 1.5843395210241369, + "language_loss": 0.75369954, + "learning_rate": 3.524328457352734e-06, + "loss": 0.77567053, + "num_input_tokens_seen": 88624925, + "router_z_loss_clip": 0.95654297, + "router_z_loss_mlp": 0.16784668, + "step": 4115, + "time_per_iteration": 2.532344341278076 + }, + { + "auxiliary_loss_clip": 0.01078593, + "auxiliary_loss_mlp": 0.01011737, + "balance_loss_clip": 1.04487324, + "balance_loss_mlp": 1.00942695, + "epoch": 0.2474673079813618, + "flos": 68107569408000.0, + "grad_norm": 0.6651022629060691, + "language_loss": 0.58208668, + "learning_rate": 3.5240762972985475e-06, + "loss": 0.60298997, + "num_input_tokens_seen": 88691475, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 0.02307129, + "step": 4116, + "time_per_iteration": 4.621931552886963 + }, + { + "auxiliary_loss_clip": 0.01158408, + "auxiliary_loss_mlp": 0.01050028, + "balance_loss_clip": 1.05848026, + "balance_loss_mlp": 1.0321703, + "epoch": 0.24752743123402976, + "flos": 29462119839360.0, + "grad_norm": 1.4094492231620028, + "language_loss": 0.83569741, + "learning_rate": 3.523824079451235e-06, + "loss": 0.85778177, + "num_input_tokens_seen": 88713425, + "router_z_loss_clip": 0.99902344, + "router_z_loss_mlp": 0.1784668, + "step": 4117, + "time_per_iteration": 4.056411266326904 + }, + { + "auxiliary_loss_clip": 0.01085552, + "auxiliary_loss_mlp": 0.0100912, + "balance_loss_clip": 1.05094218, + "balance_loss_mlp": 1.00694418, + "epoch": 0.24758755448669773, + "flos": 58350459824640.0, + "grad_norm": 0.9086586313690334, + "language_loss": 0.63462472, + "learning_rate": 3.5235718038203602e-06, + "loss": 0.6555714, + "num_input_tokens_seen": 88769995, + "router_z_loss_clip": 0.34667969, + "router_z_loss_mlp": 0.02175903, + "step": 4118, + "time_per_iteration": 2.971768379211426 + }, + { + "auxiliary_loss_clip": 0.01153739, + "auxiliary_loss_mlp": 0.01052695, + "balance_loss_clip": 1.0579834, + "balance_loss_mlp": 1.03543377, + "epoch": 0.2476476777393657, + "flos": 20484416494080.0, + "grad_norm": 1.5731659901075332, + "language_loss": 0.79569232, + "learning_rate": 3.523319470415491e-06, + "loss": 0.81775665, + "num_input_tokens_seen": 88789970, + "router_z_loss_clip": 0.95703125, + "router_z_loss_mlp": 0.17260742, + "step": 4119, + "time_per_iteration": 2.502105712890625 + }, + { + "auxiliary_loss_clip": 0.01162373, + "auxiliary_loss_mlp": 0.01047241, + "balance_loss_clip": 1.06358004, + "balance_loss_mlp": 1.0301106, + "epoch": 0.24770780099203366, + "flos": 20485853038080.0, + "grad_norm": 1.6916725950784555, + "language_loss": 0.74600005, + "learning_rate": 3.5230670792461943e-06, + "loss": 0.76809615, + "num_input_tokens_seen": 88810000, + "router_z_loss_clip": 0.98730469, + "router_z_loss_mlp": 0.17138672, + "step": 4120, + "time_per_iteration": 2.500265598297119 + }, + { + "auxiliary_loss_clip": 0.01174737, + "auxiliary_loss_mlp": 0.01051891, + "balance_loss_clip": 1.07371521, + "balance_loss_mlp": 1.03356814, + "epoch": 0.24776792424470165, + "flos": 15153405523200.0, + "grad_norm": 2.4544557070464283, + "language_loss": 0.88437027, + "learning_rate": 3.522814630322041e-06, + "loss": 0.90663654, + "num_input_tokens_seen": 88827515, + "router_z_loss_clip": 1.01074219, + "router_z_loss_mlp": 0.18347168, + "step": 4121, + "time_per_iteration": 2.492316246032715 + }, + { + "auxiliary_loss_clip": 0.01164292, + "auxiliary_loss_mlp": 0.0104745, + "balance_loss_clip": 1.06586802, + "balance_loss_mlp": 1.02899623, + "epoch": 0.2478280474973696, + "flos": 21725453347200.0, + "grad_norm": 1.9358342464425653, + "language_loss": 0.69539666, + "learning_rate": 3.5225621236526045e-06, + "loss": 0.71751404, + "num_input_tokens_seen": 88845025, + "router_z_loss_clip": 0.98583984, + "router_z_loss_mlp": 0.18444824, + "step": 4122, + "time_per_iteration": 2.51352858543396 + }, + { + "auxiliary_loss_clip": 0.01172902, + "auxiliary_loss_mlp": 0.01041851, + "balance_loss_clip": 1.0712142, + "balance_loss_mlp": 1.0228014, + "epoch": 0.24788817075003758, + "flos": 20412200200320.0, + "grad_norm": 1.9720470904426934, + "language_loss": 0.80258143, + "learning_rate": 3.5223095592474596e-06, + "loss": 0.82472903, + "num_input_tokens_seen": 88861740, + "router_z_loss_clip": 1.01855469, + "router_z_loss_mlp": 0.19042969, + "step": 4123, + "time_per_iteration": 2.496201992034912 + }, + { + "auxiliary_loss_clip": 0.01162526, + "auxiliary_loss_mlp": 0.01047855, + "balance_loss_clip": 1.06599188, + "balance_loss_mlp": 1.03140378, + "epoch": 0.24794829400270554, + "flos": 22594455083520.0, + "grad_norm": 2.0817316714997993, + "language_loss": 0.74895304, + "learning_rate": 3.5220569371161846e-06, + "loss": 0.77105689, + "num_input_tokens_seen": 88879740, + "router_z_loss_clip": 0.96582031, + "router_z_loss_mlp": 0.16442871, + "step": 4124, + "time_per_iteration": 2.508317232131958 + }, + { + "auxiliary_loss_clip": 0.01152505, + "auxiliary_loss_mlp": 0.01039445, + "balance_loss_clip": 1.06045353, + "balance_loss_mlp": 1.02355409, + "epoch": 0.2480084172553735, + "flos": 39676047615360.0, + "grad_norm": 1.42316172151276, + "language_loss": 0.73830158, + "learning_rate": 3.521804257268357e-06, + "loss": 0.760221, + "num_input_tokens_seen": 88904095, + "router_z_loss_clip": 0.92089844, + "router_z_loss_mlp": 0.15881348, + "step": 4125, + "time_per_iteration": 2.6496663093566895 + }, + { + "auxiliary_loss_clip": 0.01162352, + "auxiliary_loss_mlp": 0.01048037, + "balance_loss_clip": 1.06298161, + "balance_loss_mlp": 1.02997661, + "epoch": 0.24806854050804147, + "flos": 22053712763520.0, + "grad_norm": 2.128192341006253, + "language_loss": 0.69711149, + "learning_rate": 3.5215515197135595e-06, + "loss": 0.71921539, + "num_input_tokens_seen": 88920740, + "router_z_loss_clip": 0.99414062, + "router_z_loss_mlp": 0.18078613, + "step": 4126, + "time_per_iteration": 2.4643948078155518 + }, + { + "auxiliary_loss_clip": 0.01154491, + "auxiliary_loss_mlp": 0.01051143, + "balance_loss_clip": 1.05904794, + "balance_loss_mlp": 1.03283215, + "epoch": 0.24812866376070947, + "flos": 15486764670720.0, + "grad_norm": 3.5972422819204883, + "language_loss": 0.81312871, + "learning_rate": 3.5212987244613764e-06, + "loss": 0.83518505, + "num_input_tokens_seen": 88938510, + "router_z_loss_clip": 0.95507812, + "router_z_loss_mlp": 0.18310547, + "step": 4127, + "time_per_iteration": 2.477674722671509 + }, + { + "auxiliary_loss_clip": 0.01164741, + "auxiliary_loss_mlp": 0.01051433, + "balance_loss_clip": 1.06684184, + "balance_loss_mlp": 1.03435063, + "epoch": 0.24818878701337743, + "flos": 14757419013120.0, + "grad_norm": 2.309762160040026, + "language_loss": 0.83899081, + "learning_rate": 3.5210458715213927e-06, + "loss": 0.86115253, + "num_input_tokens_seen": 88955235, + "router_z_loss_clip": 0.97851562, + "router_z_loss_mlp": 0.17089844, + "step": 4128, + "time_per_iteration": 2.4473366737365723 + }, + { + "auxiliary_loss_clip": 0.01163095, + "auxiliary_loss_mlp": 0.01051185, + "balance_loss_clip": 1.06444979, + "balance_loss_mlp": 1.03381658, + "epoch": 0.2482489102660454, + "flos": 27089501852160.0, + "grad_norm": 2.0512575572916676, + "language_loss": 0.65643388, + "learning_rate": 3.5207929609031973e-06, + "loss": 0.67857671, + "num_input_tokens_seen": 88975210, + "router_z_loss_clip": 0.98583984, + "router_z_loss_mlp": 0.17358398, + "step": 4129, + "time_per_iteration": 2.5901100635528564 + }, + { + "auxiliary_loss_clip": 0.01167895, + "auxiliary_loss_mlp": 0.01048237, + "balance_loss_clip": 1.0687449, + "balance_loss_mlp": 1.02916384, + "epoch": 0.24830903351871336, + "flos": 26467528924800.0, + "grad_norm": 3.209998020091185, + "language_loss": 0.75604165, + "learning_rate": 3.5205399926163806e-06, + "loss": 0.77820289, + "num_input_tokens_seen": 88996120, + "router_z_loss_clip": 0.9921875, + "router_z_loss_mlp": 0.19067383, + "step": 4130, + "time_per_iteration": 2.503894567489624 + }, + { + "auxiliary_loss_clip": 0.0117419, + "auxiliary_loss_mlp": 0.01051751, + "balance_loss_clip": 1.07414722, + "balance_loss_mlp": 1.03333282, + "epoch": 0.24836915677138133, + "flos": 10228436870400.0, + "grad_norm": 2.2234254187903373, + "language_loss": 0.77041751, + "learning_rate": 3.520286966670535e-06, + "loss": 0.79267693, + "num_input_tokens_seen": 89008685, + "router_z_loss_clip": 1.00097656, + "router_z_loss_mlp": 0.18408203, + "step": 4131, + "time_per_iteration": 2.4486286640167236 + }, + { + "auxiliary_loss_clip": 0.01159766, + "auxiliary_loss_mlp": 0.01043569, + "balance_loss_clip": 1.06395674, + "balance_loss_mlp": 1.02661777, + "epoch": 0.2484292800240493, + "flos": 30080429579520.0, + "grad_norm": 1.5667279736999258, + "language_loss": 0.83914816, + "learning_rate": 3.520033883075255e-06, + "loss": 0.8611815, + "num_input_tokens_seen": 89031160, + "router_z_loss_clip": 0.95898438, + "router_z_loss_mlp": 0.16943359, + "step": 4132, + "time_per_iteration": 2.5663139820098877 + }, + { + "auxiliary_loss_clip": 0.01159357, + "auxiliary_loss_mlp": 0.01041656, + "balance_loss_clip": 1.06424022, + "balance_loss_mlp": 1.02344084, + "epoch": 0.24848940327671726, + "flos": 13442944803840.0, + "grad_norm": 1.632457655744136, + "language_loss": 0.71239698, + "learning_rate": 3.5197807418401386e-06, + "loss": 0.73440707, + "num_input_tokens_seen": 89047235, + "router_z_loss_clip": 0.95166016, + "router_z_loss_mlp": 0.18225098, + "step": 4133, + "time_per_iteration": 2.506679058074951 + }, + { + "auxiliary_loss_clip": 0.01174789, + "auxiliary_loss_mlp": 0.01044022, + "balance_loss_clip": 1.07241726, + "balance_loss_mlp": 1.02382779, + "epoch": 0.24854952652938525, + "flos": 19970247260160.0, + "grad_norm": 2.784683236064753, + "language_loss": 0.61668694, + "learning_rate": 3.5195275429747834e-06, + "loss": 0.63887501, + "num_input_tokens_seen": 89064790, + "router_z_loss_clip": 1.02246094, + "router_z_loss_mlp": 0.20202637, + "step": 4134, + "time_per_iteration": 2.562060594558716 + }, + { + "auxiliary_loss_clip": 0.01157503, + "auxiliary_loss_mlp": 0.01039804, + "balance_loss_clip": 1.06075692, + "balance_loss_mlp": 1.0228045, + "epoch": 0.24860964978205322, + "flos": 18150187167360.0, + "grad_norm": 1.9088289257521274, + "language_loss": 0.78366613, + "learning_rate": 3.5192742864887914e-06, + "loss": 0.80563921, + "num_input_tokens_seen": 89083250, + "router_z_loss_clip": 0.96777344, + "router_z_loss_mlp": 0.16992188, + "step": 4135, + "time_per_iteration": 2.4393980503082275 + }, + { + "auxiliary_loss_clip": 0.01168578, + "auxiliary_loss_mlp": 0.0103596, + "balance_loss_clip": 1.07242537, + "balance_loss_mlp": 1.0198189, + "epoch": 0.24866977303472118, + "flos": 11728641329280.0, + "grad_norm": 2.434331988714916, + "language_loss": 0.82864678, + "learning_rate": 3.5190209723917662e-06, + "loss": 0.85069215, + "num_input_tokens_seen": 89100905, + "router_z_loss_clip": 0.96044922, + "router_z_loss_mlp": 0.16156006, + "step": 4136, + "time_per_iteration": 2.5140626430511475 + }, + { + "auxiliary_loss_clip": 0.01169581, + "auxiliary_loss_mlp": 0.01049895, + "balance_loss_clip": 1.06743121, + "balance_loss_mlp": 1.03247809, + "epoch": 0.24872989628738915, + "flos": 34823582565120.0, + "grad_norm": 1.7919372651225647, + "language_loss": 0.71029222, + "learning_rate": 3.518767600693314e-06, + "loss": 0.73248696, + "num_input_tokens_seen": 89122630, + "router_z_loss_clip": 1.02294922, + "router_z_loss_mlp": 0.17419434, + "step": 4137, + "time_per_iteration": 2.5719153881073 + }, + { + "auxiliary_loss_clip": 0.0115588, + "auxiliary_loss_mlp": 0.01044894, + "balance_loss_clip": 1.05691385, + "balance_loss_mlp": 1.02813327, + "epoch": 0.2487900195400571, + "flos": 13699347062400.0, + "grad_norm": 1.8457503583099488, + "language_loss": 0.66539168, + "learning_rate": 3.518514171403042e-06, + "loss": 0.68739945, + "num_input_tokens_seen": 89141050, + "router_z_loss_clip": 0.99023438, + "router_z_loss_mlp": 0.16760254, + "step": 4138, + "time_per_iteration": 2.413674831390381 + }, + { + "auxiliary_loss_clip": 0.01165622, + "auxiliary_loss_mlp": 0.01037651, + "balance_loss_clip": 1.06979465, + "balance_loss_mlp": 1.02137327, + "epoch": 0.24885014279272508, + "flos": 25337815297920.0, + "grad_norm": 1.873000478844512, + "language_loss": 0.84022754, + "learning_rate": 3.51826068453056e-06, + "loss": 0.86226022, + "num_input_tokens_seen": 89160810, + "router_z_loss_clip": 0.95703125, + "router_z_loss_mlp": 0.16265869, + "step": 4139, + "time_per_iteration": 2.4995369911193848 + }, + { + "auxiliary_loss_clip": 0.01162083, + "auxiliary_loss_mlp": 0.01041101, + "balance_loss_clip": 1.06409049, + "balance_loss_mlp": 1.02286243, + "epoch": 0.24891026604539307, + "flos": 20631434860800.0, + "grad_norm": 1.485747215852392, + "language_loss": 0.78466117, + "learning_rate": 3.518007140085481e-06, + "loss": 0.80669308, + "num_input_tokens_seen": 89180610, + "router_z_loss_clip": 0.97998047, + "router_z_loss_mlp": 0.18237305, + "step": 4140, + "time_per_iteration": 2.4497907161712646 + }, + { + "auxiliary_loss_clip": 0.01094992, + "auxiliary_loss_mlp": 0.01013684, + "balance_loss_clip": 1.06189108, + "balance_loss_mlp": 1.01143074, + "epoch": 0.24897038929806103, + "flos": 66960294030720.0, + "grad_norm": 0.8319578486280161, + "language_loss": 0.60992604, + "learning_rate": 3.51775353807742e-06, + "loss": 0.6310128, + "num_input_tokens_seen": 89241880, + "router_z_loss_clip": 0.33154297, + "router_z_loss_mlp": 0.02252197, + "step": 4141, + "time_per_iteration": 3.127885580062866 + }, + { + "auxiliary_loss_clip": 0.01158043, + "auxiliary_loss_mlp": 0.01046789, + "balance_loss_clip": 1.06055486, + "balance_loss_mlp": 1.02920532, + "epoch": 0.249030512550729, + "flos": 36392555612160.0, + "grad_norm": 2.0532904381020587, + "language_loss": 0.73251551, + "learning_rate": 3.5174998785159913e-06, + "loss": 0.75456381, + "num_input_tokens_seen": 89263340, + "router_z_loss_clip": 0.97412109, + "router_z_loss_mlp": 0.17602539, + "step": 4142, + "time_per_iteration": 2.6369903087615967 + }, + { + "auxiliary_loss_clip": 0.01167501, + "auxiliary_loss_mlp": 0.01040202, + "balance_loss_clip": 1.06907225, + "balance_loss_mlp": 1.02302432, + "epoch": 0.24909063580339696, + "flos": 20154576879360.0, + "grad_norm": 1.8563298983327943, + "language_loss": 0.81307429, + "learning_rate": 3.5172461614108157e-06, + "loss": 0.83515131, + "num_input_tokens_seen": 89282870, + "router_z_loss_clip": 0.98535156, + "router_z_loss_mlp": 0.171875, + "step": 4143, + "time_per_iteration": 2.4659042358398438 + }, + { + "auxiliary_loss_clip": 0.01154051, + "auxiliary_loss_mlp": 0.01037823, + "balance_loss_clip": 1.06065929, + "balance_loss_mlp": 1.02178931, + "epoch": 0.24915075905606493, + "flos": 26396569607040.0, + "grad_norm": 1.770447879004783, + "language_loss": 0.59364927, + "learning_rate": 3.5169923867715137e-06, + "loss": 0.61556798, + "num_input_tokens_seen": 89303830, + "router_z_loss_clip": 0.93310547, + "router_z_loss_mlp": 0.16040039, + "step": 4144, + "time_per_iteration": 2.509331703186035 + }, + { + "auxiliary_loss_clip": 0.01155125, + "auxiliary_loss_mlp": 0.01041618, + "balance_loss_clip": 1.05923998, + "balance_loss_mlp": 1.02445173, + "epoch": 0.2492108823087329, + "flos": 27527216987520.0, + "grad_norm": 2.2353835565166085, + "language_loss": 0.78769261, + "learning_rate": 3.516738554607708e-06, + "loss": 0.80966002, + "num_input_tokens_seen": 89324350, + "router_z_loss_clip": 0.95898438, + "router_z_loss_mlp": 0.17175293, + "step": 4145, + "time_per_iteration": 2.570991277694702 + }, + { + "auxiliary_loss_clip": 0.01162853, + "auxiliary_loss_mlp": 0.01050596, + "balance_loss_clip": 1.05830753, + "balance_loss_mlp": 1.03065193, + "epoch": 0.24927100556140086, + "flos": 16691388111360.0, + "grad_norm": 2.127256705590814, + "language_loss": 0.65119672, + "learning_rate": 3.5164846649290253e-06, + "loss": 0.67333114, + "num_input_tokens_seen": 89342875, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.19958496, + "step": 4146, + "time_per_iteration": 2.510448694229126 + }, + { + "auxiliary_loss_clip": 0.0110011, + "auxiliary_loss_mlp": 0.01014611, + "balance_loss_clip": 1.06501544, + "balance_loss_mlp": 1.01250648, + "epoch": 0.24933112881406885, + "flos": 62772464286720.0, + "grad_norm": 0.9477694240001451, + "language_loss": 0.67397571, + "learning_rate": 3.5162307177450915e-06, + "loss": 0.6951229, + "num_input_tokens_seen": 89404925, + "router_z_loss_clip": 0.35009766, + "router_z_loss_mlp": 0.02105713, + "step": 4147, + "time_per_iteration": 3.2327158451080322 + }, + { + "auxiliary_loss_clip": 0.01154548, + "auxiliary_loss_mlp": 0.01049021, + "balance_loss_clip": 1.05942452, + "balance_loss_mlp": 1.03168845, + "epoch": 0.24939125206673682, + "flos": 26651894457600.0, + "grad_norm": 1.7566741246553257, + "language_loss": 0.89027607, + "learning_rate": 3.5159767130655366e-06, + "loss": 0.91231173, + "num_input_tokens_seen": 89425090, + "router_z_loss_clip": 0.95214844, + "router_z_loss_mlp": 0.17321777, + "step": 4148, + "time_per_iteration": 4.0861427783966064 + }, + { + "auxiliary_loss_clip": 0.01164726, + "auxiliary_loss_mlp": 0.01046747, + "balance_loss_clip": 1.06392574, + "balance_loss_mlp": 1.02654099, + "epoch": 0.24945137531940478, + "flos": 20704333512960.0, + "grad_norm": 2.3541091988293132, + "language_loss": 0.67902255, + "learning_rate": 3.5157226508999935e-06, + "loss": 0.7011373, + "num_input_tokens_seen": 89442615, + "router_z_loss_clip": 1.00976562, + "router_z_loss_mlp": 0.20227051, + "step": 4149, + "time_per_iteration": 2.446950674057007 + }, + { + "auxiliary_loss_clip": 0.01177945, + "auxiliary_loss_mlp": 0.01052901, + "balance_loss_clip": 1.07783151, + "balance_loss_mlp": 1.03554416, + "epoch": 0.24951149857207275, + "flos": 23768662682880.0, + "grad_norm": 1.6645530634992822, + "language_loss": 0.71480978, + "learning_rate": 3.515468531258095e-06, + "loss": 0.73711824, + "num_input_tokens_seen": 89463025, + "router_z_loss_clip": 1.00244141, + "router_z_loss_mlp": 0.17352295, + "step": 4150, + "time_per_iteration": 2.5068233013153076 + }, + { + "auxiliary_loss_clip": 0.01156547, + "auxiliary_loss_mlp": 0.01045258, + "balance_loss_clip": 1.05913579, + "balance_loss_mlp": 1.02750754, + "epoch": 0.2495716218247407, + "flos": 15664881237120.0, + "grad_norm": 1.8083958340724964, + "language_loss": 0.72788119, + "learning_rate": 3.515214354149478e-06, + "loss": 0.74989927, + "num_input_tokens_seen": 89480225, + "router_z_loss_clip": 0.97509766, + "router_z_loss_mlp": 0.17749023, + "step": 4151, + "time_per_iteration": 2.43355131149292 + }, + { + "auxiliary_loss_clip": 0.01164844, + "auxiliary_loss_mlp": 0.01052271, + "balance_loss_clip": 1.06225204, + "balance_loss_mlp": 1.03391314, + "epoch": 0.24963174507740868, + "flos": 24052499953920.0, + "grad_norm": 2.8814586789066974, + "language_loss": 0.63385415, + "learning_rate": 3.514960119583781e-06, + "loss": 0.65602529, + "num_input_tokens_seen": 89496985, + "router_z_loss_clip": 1.02587891, + "router_z_loss_mlp": 0.18347168, + "step": 4152, + "time_per_iteration": 2.509474277496338 + }, + { + "auxiliary_loss_clip": 0.01152815, + "auxiliary_loss_mlp": 0.01041573, + "balance_loss_clip": 1.05969715, + "balance_loss_mlp": 1.02460909, + "epoch": 0.24969186833007664, + "flos": 21799501234560.0, + "grad_norm": 2.858580353352988, + "language_loss": 0.76943266, + "learning_rate": 3.514705827570645e-06, + "loss": 0.79137653, + "num_input_tokens_seen": 89514420, + "router_z_loss_clip": 0.93212891, + "router_z_loss_mlp": 0.16967773, + "step": 4153, + "time_per_iteration": 2.4511406421661377 + }, + { + "auxiliary_loss_clip": 0.01155751, + "auxiliary_loss_mlp": 0.01042272, + "balance_loss_clip": 1.06177521, + "balance_loss_mlp": 1.02489161, + "epoch": 0.24975199158274464, + "flos": 19938143479680.0, + "grad_norm": 1.9359543411730022, + "language_loss": 0.7613821, + "learning_rate": 3.514451478119711e-06, + "loss": 0.78336227, + "num_input_tokens_seen": 89532925, + "router_z_loss_clip": 0.93994141, + "router_z_loss_mlp": 0.17382812, + "step": 4154, + "time_per_iteration": 2.5146470069885254 + }, + { + "auxiliary_loss_clip": 0.01168319, + "auxiliary_loss_mlp": 0.01044821, + "balance_loss_clip": 1.06716728, + "balance_loss_mlp": 1.0250206, + "epoch": 0.2498121148354126, + "flos": 25338389915520.0, + "grad_norm": 2.0534421896319794, + "language_loss": 0.70589006, + "learning_rate": 3.5141970712406258e-06, + "loss": 0.72802138, + "num_input_tokens_seen": 89552855, + "router_z_loss_clip": 1.01269531, + "router_z_loss_mlp": 0.19812012, + "step": 4155, + "time_per_iteration": 2.460134744644165 + }, + { + "auxiliary_loss_clip": 0.01157421, + "auxiliary_loss_mlp": 0.01047402, + "balance_loss_clip": 1.05928254, + "balance_loss_mlp": 1.02977061, + "epoch": 0.24987223808808057, + "flos": 20558787603840.0, + "grad_norm": 1.5972097298630832, + "language_loss": 0.75441813, + "learning_rate": 3.513942606943036e-06, + "loss": 0.77646643, + "num_input_tokens_seen": 89572830, + "router_z_loss_clip": 0.98193359, + "router_z_loss_mlp": 0.1763916, + "step": 4156, + "time_per_iteration": 2.485534191131592 + }, + { + "auxiliary_loss_clip": 0.01160191, + "auxiliary_loss_mlp": 0.01040365, + "balance_loss_clip": 1.06528115, + "balance_loss_mlp": 1.02415228, + "epoch": 0.24993236134074853, + "flos": 19749037351680.0, + "grad_norm": 2.5499807146374978, + "language_loss": 0.7769962, + "learning_rate": 3.513688085236591e-06, + "loss": 0.79900181, + "num_input_tokens_seen": 89590345, + "router_z_loss_clip": 0.94921875, + "router_z_loss_mlp": 0.16210938, + "step": 4157, + "time_per_iteration": 2.4722530841827393 + }, + { + "auxiliary_loss_clip": 0.01155628, + "auxiliary_loss_mlp": 0.01041116, + "balance_loss_clip": 1.05983019, + "balance_loss_mlp": 1.02354479, + "epoch": 0.2499924845934165, + "flos": 18770292587520.0, + "grad_norm": 2.1484613352821826, + "language_loss": 0.81314027, + "learning_rate": 3.513433506130942e-06, + "loss": 0.83510774, + "num_input_tokens_seen": 89610295, + "router_z_loss_clip": 0.95849609, + "router_z_loss_mlp": 0.17578125, + "step": 4158, + "time_per_iteration": 3.9544031620025635 + }, + { + "auxiliary_loss_clip": 0.01158933, + "auxiliary_loss_mlp": 0.01038865, + "balance_loss_clip": 1.0603385, + "balance_loss_mlp": 1.02168703, + "epoch": 0.25005260784608446, + "flos": 16872198197760.0, + "grad_norm": 1.8199135899502055, + "language_loss": 0.75790405, + "learning_rate": 3.5131788696357427e-06, + "loss": 0.77988207, + "num_input_tokens_seen": 89627795, + "router_z_loss_clip": 0.98632812, + "router_z_loss_mlp": 0.171875, + "step": 4159, + "time_per_iteration": 3.9674012660980225 + }, + { + "auxiliary_loss_clip": 0.01159639, + "auxiliary_loss_mlp": 0.01040682, + "balance_loss_clip": 1.05911911, + "balance_loss_mlp": 1.02220476, + "epoch": 0.2501127310987524, + "flos": 22124923476480.0, + "grad_norm": 1.635447013669922, + "language_loss": 0.72102386, + "learning_rate": 3.512924175760649e-06, + "loss": 0.74302703, + "num_input_tokens_seen": 89648090, + "router_z_loss_clip": 1.00585938, + "router_z_loss_mlp": 0.18493652, + "step": 4160, + "time_per_iteration": 2.4719579219818115 + }, + { + "auxiliary_loss_clip": 0.0108471, + "auxiliary_loss_mlp": 0.01005911, + "balance_loss_clip": 1.05133295, + "balance_loss_mlp": 1.00373292, + "epoch": 0.2501728543514204, + "flos": 69458061980160.0, + "grad_norm": 0.7434408365810727, + "language_loss": 0.567433, + "learning_rate": 3.5126694245153186e-06, + "loss": 0.58833921, + "num_input_tokens_seen": 89710345, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.02178955, + "step": 4161, + "time_per_iteration": 3.139643430709839 + }, + { + "auxiliary_loss_clip": 0.01165156, + "auxiliary_loss_mlp": 0.01057246, + "balance_loss_clip": 1.06143141, + "balance_loss_mlp": 1.03664696, + "epoch": 0.25023297760408836, + "flos": 16289978647680.0, + "grad_norm": 1.845302096283586, + "language_loss": 0.81118071, + "learning_rate": 3.5124146159094125e-06, + "loss": 0.83340472, + "num_input_tokens_seen": 89729390, + "router_z_loss_clip": 1.03710938, + "router_z_loss_mlp": 0.20605469, + "step": 4162, + "time_per_iteration": 3.967043161392212 + }, + { + "auxiliary_loss_clip": 0.0116007, + "auxiliary_loss_mlp": 0.01044722, + "balance_loss_clip": 1.05817688, + "balance_loss_mlp": 1.02697217, + "epoch": 0.2502931008567563, + "flos": 12237998140800.0, + "grad_norm": 2.699668184930343, + "language_loss": 0.87958258, + "learning_rate": 3.5121597499525927e-06, + "loss": 0.90163046, + "num_input_tokens_seen": 89742805, + "router_z_loss_clip": 1.01855469, + "router_z_loss_mlp": 0.1776123, + "step": 4163, + "time_per_iteration": 2.4191641807556152 + }, + { + "auxiliary_loss_clip": 0.01157099, + "auxiliary_loss_mlp": 0.01040412, + "balance_loss_clip": 1.05723858, + "balance_loss_mlp": 1.02257824, + "epoch": 0.25035322410942434, + "flos": 23181882105600.0, + "grad_norm": 1.7754862313294892, + "language_loss": 0.82961863, + "learning_rate": 3.5119048266545232e-06, + "loss": 0.85159373, + "num_input_tokens_seen": 89761145, + "router_z_loss_clip": 0.99853516, + "router_z_loss_mlp": 0.17834473, + "step": 4164, + "time_per_iteration": 2.4845988750457764 + }, + { + "auxiliary_loss_clip": 0.01154791, + "auxiliary_loss_mlp": 0.01049002, + "balance_loss_clip": 1.0626992, + "balance_loss_mlp": 1.03284907, + "epoch": 0.2504133473620923, + "flos": 20917534688640.0, + "grad_norm": 2.483128829243156, + "language_loss": 0.74055541, + "learning_rate": 3.5116498460248716e-06, + "loss": 0.76259327, + "num_input_tokens_seen": 89780905, + "router_z_loss_clip": 0.92089844, + "router_z_loss_mlp": 0.16149902, + "step": 4165, + "time_per_iteration": 2.585763692855835 + }, + { + "auxiliary_loss_clip": 0.01153764, + "auxiliary_loss_mlp": 0.01048733, + "balance_loss_clip": 1.05425143, + "balance_loss_mlp": 1.03017187, + "epoch": 0.2504734706147603, + "flos": 20776549806720.0, + "grad_norm": 1.7918764358409571, + "language_loss": 0.74354255, + "learning_rate": 3.5113948080733062e-06, + "loss": 0.7655676, + "num_input_tokens_seen": 89799230, + "router_z_loss_clip": 0.99462891, + "router_z_loss_mlp": 0.1854248, + "step": 4166, + "time_per_iteration": 2.500399112701416 + }, + { + "auxiliary_loss_clip": 0.01151515, + "auxiliary_loss_mlp": 0.01046302, + "balance_loss_clip": 1.0572257, + "balance_loss_mlp": 1.02958894, + "epoch": 0.25053359386742824, + "flos": 24349373861760.0, + "grad_norm": 1.7467449916079367, + "language_loss": 0.82149827, + "learning_rate": 3.5111397128094973e-06, + "loss": 0.84347641, + "num_input_tokens_seen": 89818240, + "router_z_loss_clip": 0.94287109, + "router_z_loss_mlp": 0.16723633, + "step": 4167, + "time_per_iteration": 2.5031774044036865 + }, + { + "auxiliary_loss_clip": 0.01150487, + "auxiliary_loss_mlp": 0.01045683, + "balance_loss_clip": 1.05481911, + "balance_loss_mlp": 1.02823019, + "epoch": 0.2505937171200962, + "flos": 21214336769280.0, + "grad_norm": 1.9186287364758978, + "language_loss": 0.79516876, + "learning_rate": 3.51088456024312e-06, + "loss": 0.81713051, + "num_input_tokens_seen": 89834485, + "router_z_loss_clip": 0.95654297, + "router_z_loss_mlp": 0.17456055, + "step": 4168, + "time_per_iteration": 2.518582582473755 + }, + { + "auxiliary_loss_clip": 0.01165952, + "auxiliary_loss_mlp": 0.01056958, + "balance_loss_clip": 1.05925477, + "balance_loss_mlp": 1.03565502, + "epoch": 0.25065384037276417, + "flos": 41427231379200.0, + "grad_norm": 2.8652132167392743, + "language_loss": 0.69539881, + "learning_rate": 3.510629350383849e-06, + "loss": 0.71762788, + "num_input_tokens_seen": 89855645, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.2130127, + "step": 4169, + "time_per_iteration": 2.6301753520965576 + }, + { + "auxiliary_loss_clip": 0.01156268, + "auxiliary_loss_mlp": 0.01055279, + "balance_loss_clip": 1.05811346, + "balance_loss_mlp": 1.03813684, + "epoch": 0.25071396362543213, + "flos": 26102389219200.0, + "grad_norm": 1.7464378450792584, + "language_loss": 0.77283907, + "learning_rate": 3.510374083241361e-06, + "loss": 0.79495448, + "num_input_tokens_seen": 89874895, + "router_z_loss_clip": 0.98193359, + "router_z_loss_mlp": 0.17138672, + "step": 4170, + "time_per_iteration": 2.5416736602783203 + }, + { + "auxiliary_loss_clip": 0.01157684, + "auxiliary_loss_mlp": 0.01043299, + "balance_loss_clip": 1.06072581, + "balance_loss_mlp": 1.02678847, + "epoch": 0.2507740868781001, + "flos": 19098982967040.0, + "grad_norm": 2.3592810621353073, + "language_loss": 0.76339877, + "learning_rate": 3.5101187588253368e-06, + "loss": 0.78540862, + "num_input_tokens_seen": 89891700, + "router_z_loss_clip": 0.97021484, + "router_z_loss_mlp": 0.16516113, + "step": 4171, + "time_per_iteration": 2.4780120849609375 + }, + { + "auxiliary_loss_clip": 0.01088733, + "auxiliary_loss_mlp": 0.01016864, + "balance_loss_clip": 1.05463696, + "balance_loss_mlp": 1.01470971, + "epoch": 0.25083421013076806, + "flos": 64341868296960.0, + "grad_norm": 0.8456366429444446, + "language_loss": 0.60024613, + "learning_rate": 3.509863377145458e-06, + "loss": 0.62130213, + "num_input_tokens_seen": 89955775, + "router_z_loss_clip": 0.34082031, + "router_z_loss_mlp": 0.02154541, + "step": 4172, + "time_per_iteration": 3.1423892974853516 + }, + { + "auxiliary_loss_clip": 0.01162148, + "auxiliary_loss_mlp": 0.01044081, + "balance_loss_clip": 1.0651809, + "balance_loss_mlp": 1.02679586, + "epoch": 0.25089433338343603, + "flos": 24279599692800.0, + "grad_norm": 1.4999133636116946, + "language_loss": 0.79097283, + "learning_rate": 3.509607938211409e-06, + "loss": 0.81303507, + "num_input_tokens_seen": 89977150, + "router_z_loss_clip": 0.97070312, + "router_z_loss_mlp": 0.17285156, + "step": 4173, + "time_per_iteration": 2.52085018157959 + }, + { + "auxiliary_loss_clip": 0.0115633, + "auxiliary_loss_mlp": 0.0104707, + "balance_loss_clip": 1.05895853, + "balance_loss_mlp": 1.02954638, + "epoch": 0.250954456636104, + "flos": 14721472477440.0, + "grad_norm": 2.197498383736976, + "language_loss": 0.83306694, + "learning_rate": 3.509352442032875e-06, + "loss": 0.85510093, + "num_input_tokens_seen": 89994925, + "router_z_loss_clip": 0.97460938, + "router_z_loss_mlp": 0.17529297, + "step": 4174, + "time_per_iteration": 2.4450523853302 + }, + { + "auxiliary_loss_clip": 0.01162001, + "auxiliary_loss_mlp": 0.01041365, + "balance_loss_clip": 1.06320405, + "balance_loss_mlp": 1.02363861, + "epoch": 0.25101457988877196, + "flos": 22273593868800.0, + "grad_norm": 2.4610829908411826, + "language_loss": 0.71297038, + "learning_rate": 3.509096888619545e-06, + "loss": 0.73500407, + "num_input_tokens_seen": 90013235, + "router_z_loss_clip": 0.98828125, + "router_z_loss_mlp": 0.17724609, + "step": 4175, + "time_per_iteration": 2.45527720451355 + }, + { + "auxiliary_loss_clip": 0.01155825, + "auxiliary_loss_mlp": 0.01041168, + "balance_loss_clip": 1.05577421, + "balance_loss_mlp": 1.02306032, + "epoch": 0.2510747031414399, + "flos": 25188929424000.0, + "grad_norm": 2.0929588380658304, + "language_loss": 0.80791026, + "learning_rate": 3.50884127798111e-06, + "loss": 0.82988018, + "num_input_tokens_seen": 90032150, + "router_z_loss_clip": 1.00195312, + "router_z_loss_mlp": 0.18115234, + "step": 4176, + "time_per_iteration": 2.5267844200134277 + }, + { + "auxiliary_loss_clip": 0.01159871, + "auxiliary_loss_mlp": 0.01043295, + "balance_loss_clip": 1.06071901, + "balance_loss_mlp": 1.02435255, + "epoch": 0.25113482639410795, + "flos": 20704189858560.0, + "grad_norm": 2.2509913917002033, + "language_loss": 0.82652903, + "learning_rate": 3.5085856101272623e-06, + "loss": 0.84856069, + "num_input_tokens_seen": 90049085, + "router_z_loss_clip": 0.99121094, + "router_z_loss_mlp": 0.1895752, + "step": 4177, + "time_per_iteration": 2.4405109882354736 + }, + { + "auxiliary_loss_clip": 0.01163558, + "auxiliary_loss_mlp": 0.01046667, + "balance_loss_clip": 1.06814027, + "balance_loss_mlp": 1.02926219, + "epoch": 0.2511949496467759, + "flos": 21506936958720.0, + "grad_norm": 2.015318244980063, + "language_loss": 0.8262167, + "learning_rate": 3.508329885067698e-06, + "loss": 0.84831893, + "num_input_tokens_seen": 90067695, + "router_z_loss_clip": 0.95361328, + "router_z_loss_mlp": 0.17431641, + "step": 4178, + "time_per_iteration": 2.588589668273926 + }, + { + "auxiliary_loss_clip": 0.01149638, + "auxiliary_loss_mlp": 0.01047918, + "balance_loss_clip": 1.0560261, + "balance_loss_mlp": 1.03134811, + "epoch": 0.2512550728994439, + "flos": 20701999128960.0, + "grad_norm": 2.160979802604384, + "language_loss": 0.7579459, + "learning_rate": 3.508074102812112e-06, + "loss": 0.77992147, + "num_input_tokens_seen": 90083890, + "router_z_loss_clip": 0.93652344, + "router_z_loss_mlp": 0.16577148, + "step": 4179, + "time_per_iteration": 2.4593982696533203 + }, + { + "auxiliary_loss_clip": 0.01156587, + "auxiliary_loss_mlp": 0.01052283, + "balance_loss_clip": 1.05781376, + "balance_loss_mlp": 1.03337657, + "epoch": 0.25131519615211184, + "flos": 18478626151680.0, + "grad_norm": 5.683909622521087, + "language_loss": 0.69928432, + "learning_rate": 3.507818263370206e-06, + "loss": 0.72137308, + "num_input_tokens_seen": 90100995, + "router_z_loss_clip": 0.98730469, + "router_z_loss_mlp": 0.18896484, + "step": 4180, + "time_per_iteration": 2.52925705909729 + }, + { + "auxiliary_loss_clip": 0.01159015, + "auxiliary_loss_mlp": 0.01046057, + "balance_loss_clip": 1.06334472, + "balance_loss_mlp": 1.02929628, + "epoch": 0.2513753194047798, + "flos": 20484955198080.0, + "grad_norm": 1.8009330429511394, + "language_loss": 0.85975391, + "learning_rate": 3.5075623667516796e-06, + "loss": 0.88180459, + "num_input_tokens_seen": 90120365, + "router_z_loss_clip": 0.95654297, + "router_z_loss_mlp": 0.16772461, + "step": 4181, + "time_per_iteration": 2.454995632171631 + }, + { + "auxiliary_loss_clip": 0.01152957, + "auxiliary_loss_mlp": 0.01043172, + "balance_loss_clip": 1.05809855, + "balance_loss_mlp": 1.02600551, + "epoch": 0.25143544265744777, + "flos": 37670077704960.0, + "grad_norm": 2.2816724600827203, + "language_loss": 0.68981946, + "learning_rate": 3.507306412966238e-06, + "loss": 0.71178079, + "num_input_tokens_seen": 90142610, + "router_z_loss_clip": 0.94824219, + "router_z_loss_mlp": 0.17163086, + "step": 4182, + "time_per_iteration": 2.621509313583374 + }, + { + "auxiliary_loss_clip": 0.01075987, + "auxiliary_loss_mlp": 0.01001466, + "balance_loss_clip": 1.04351854, + "balance_loss_mlp": 0.99955261, + "epoch": 0.25149556591011574, + "flos": 69367457923200.0, + "grad_norm": 0.8438751954638879, + "language_loss": 0.7011013, + "learning_rate": 3.5070504020235853e-06, + "loss": 0.72187585, + "num_input_tokens_seen": 90200555, + "router_z_loss_clip": 0.32470703, + "router_z_loss_mlp": 0.019104, + "step": 4183, + "time_per_iteration": 3.119410991668701 + }, + { + "auxiliary_loss_clip": 0.01152182, + "auxiliary_loss_mlp": 0.01040823, + "balance_loss_clip": 1.05408323, + "balance_loss_mlp": 1.02241683, + "epoch": 0.2515556891627837, + "flos": 13990402967040.0, + "grad_norm": 1.7264352961336886, + "language_loss": 0.74216789, + "learning_rate": 3.506794333933431e-06, + "loss": 0.76409793, + "num_input_tokens_seen": 90218120, + "router_z_loss_clip": 0.97851562, + "router_z_loss_mlp": 0.18395996, + "step": 4184, + "time_per_iteration": 2.4262590408325195 + }, + { + "auxiliary_loss_clip": 0.01168177, + "auxiliary_loss_mlp": 0.01052, + "balance_loss_clip": 1.06995726, + "balance_loss_mlp": 1.0348103, + "epoch": 0.25161581241545167, + "flos": 22163527618560.0, + "grad_norm": 2.72296104053349, + "language_loss": 0.83499753, + "learning_rate": 3.506538208705484e-06, + "loss": 0.85719931, + "num_input_tokens_seen": 90236790, + "router_z_loss_clip": 0.98144531, + "router_z_loss_mlp": 0.17175293, + "step": 4185, + "time_per_iteration": 2.529611587524414 + }, + { + "auxiliary_loss_clip": 0.01087255, + "auxiliary_loss_mlp": 0.0100409, + "balance_loss_clip": 1.05525255, + "balance_loss_mlp": 1.00234056, + "epoch": 0.25167593566811963, + "flos": 69358407696000.0, + "grad_norm": 0.7854716009668494, + "language_loss": 0.61509079, + "learning_rate": 3.5062820263494574e-06, + "loss": 0.63600421, + "num_input_tokens_seen": 90297070, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 0.01751709, + "step": 4186, + "time_per_iteration": 3.015244245529175 + }, + { + "auxiliary_loss_clip": 0.01155599, + "auxiliary_loss_mlp": 0.0103882, + "balance_loss_clip": 1.05910742, + "balance_loss_mlp": 1.0209868, + "epoch": 0.2517360589207876, + "flos": 13261452359040.0, + "grad_norm": 1.8830986817610826, + "language_loss": 0.79120415, + "learning_rate": 3.5060257868750656e-06, + "loss": 0.81314838, + "num_input_tokens_seen": 90315255, + "router_z_loss_clip": 0.96533203, + "router_z_loss_mlp": 0.1784668, + "step": 4187, + "time_per_iteration": 2.4628636837005615 + }, + { + "auxiliary_loss_clip": 0.01166463, + "auxiliary_loss_mlp": 0.01057343, + "balance_loss_clip": 1.06964672, + "balance_loss_mlp": 1.03952169, + "epoch": 0.25179618217345556, + "flos": 20376828282240.0, + "grad_norm": 1.6286561190206217, + "language_loss": 0.79870737, + "learning_rate": 3.5057694902920244e-06, + "loss": 0.82094544, + "num_input_tokens_seen": 90334990, + "router_z_loss_clip": 0.96972656, + "router_z_loss_mlp": 0.17797852, + "step": 4188, + "time_per_iteration": 2.551236152648926 + }, + { + "auxiliary_loss_clip": 0.01152427, + "auxiliary_loss_mlp": 0.01043436, + "balance_loss_clip": 1.05720389, + "balance_loss_mlp": 1.02660418, + "epoch": 0.25185630542612353, + "flos": 27664718250240.0, + "grad_norm": 1.8155738427298111, + "language_loss": 0.74609959, + "learning_rate": 3.5055131366100534e-06, + "loss": 0.76805824, + "num_input_tokens_seen": 90351825, + "router_z_loss_clip": 0.95263672, + "router_z_loss_mlp": 0.16821289, + "step": 4189, + "time_per_iteration": 2.5209567546844482 + }, + { + "auxiliary_loss_clip": 0.01148999, + "auxiliary_loss_mlp": 0.01046335, + "balance_loss_clip": 1.05665815, + "balance_loss_mlp": 1.02907324, + "epoch": 0.25191642867879155, + "flos": 20996430912000.0, + "grad_norm": 1.9153381847597777, + "language_loss": 0.84483093, + "learning_rate": 3.5052567258388745e-06, + "loss": 0.86678421, + "num_input_tokens_seen": 90369860, + "router_z_loss_clip": 0.92431641, + "router_z_loss_mlp": 0.17272949, + "step": 4190, + "time_per_iteration": 2.6139657497406006 + }, + { + "auxiliary_loss_clip": 0.01151776, + "auxiliary_loss_mlp": 0.01045134, + "balance_loss_clip": 1.05662847, + "balance_loss_mlp": 1.02554739, + "epoch": 0.2519765519314595, + "flos": 21105671149440.0, + "grad_norm": 2.0640662289698586, + "language_loss": 0.7523663, + "learning_rate": 3.5050002579882082e-06, + "loss": 0.77433538, + "num_input_tokens_seen": 90389245, + "router_z_loss_clip": 0.95166016, + "router_z_loss_mlp": 0.19580078, + "step": 4191, + "time_per_iteration": 2.5183558464050293 + }, + { + "auxiliary_loss_clip": 0.01079818, + "auxiliary_loss_mlp": 0.0100751, + "balance_loss_clip": 1.04814219, + "balance_loss_mlp": 1.00576401, + "epoch": 0.2520366751841275, + "flos": 62744993360640.0, + "grad_norm": 0.7372325291673688, + "language_loss": 0.57188034, + "learning_rate": 3.5047437330677823e-06, + "loss": 0.59275365, + "num_input_tokens_seen": 90456735, + "router_z_loss_clip": 0.31689453, + "router_z_loss_mlp": 0.01748657, + "step": 4192, + "time_per_iteration": 4.6232709884643555 + }, + { + "auxiliary_loss_clip": 0.01154431, + "auxiliary_loss_mlp": 0.01046987, + "balance_loss_clip": 1.05850697, + "balance_loss_mlp": 1.02910519, + "epoch": 0.25209679843679544, + "flos": 22230716008320.0, + "grad_norm": 1.7454130547688649, + "language_loss": 0.76096523, + "learning_rate": 3.504487151087323e-06, + "loss": 0.78297937, + "num_input_tokens_seen": 90474165, + "router_z_loss_clip": 0.95849609, + "router_z_loss_mlp": 0.17883301, + "step": 4193, + "time_per_iteration": 2.48897123336792 + }, + { + "auxiliary_loss_clip": 0.0116257, + "auxiliary_loss_mlp": 0.01044021, + "balance_loss_clip": 1.06422043, + "balance_loss_mlp": 1.02681923, + "epoch": 0.2521569216894634, + "flos": 12166643773440.0, + "grad_norm": 2.271893267000244, + "language_loss": 0.84171224, + "learning_rate": 3.5042305120565598e-06, + "loss": 0.86377811, + "num_input_tokens_seen": 90491660, + "router_z_loss_clip": 0.98388672, + "router_z_loss_mlp": 0.17224121, + "step": 4194, + "time_per_iteration": 2.4697134494781494 + }, + { + "auxiliary_loss_clip": 0.01182588, + "auxiliary_loss_mlp": 0.01048458, + "balance_loss_clip": 1.07856297, + "balance_loss_mlp": 1.03158998, + "epoch": 0.2522170449421314, + "flos": 23699786353920.0, + "grad_norm": 1.4216079562127208, + "language_loss": 0.88436514, + "learning_rate": 3.5039738159852253e-06, + "loss": 0.90667564, + "num_input_tokens_seen": 90514025, + "router_z_loss_clip": 1.04101562, + "router_z_loss_mlp": 0.1685791, + "step": 4195, + "time_per_iteration": 2.513444662094116 + }, + { + "auxiliary_loss_clip": 0.01150163, + "auxiliary_loss_mlp": 0.01044389, + "balance_loss_clip": 1.05319548, + "balance_loss_mlp": 1.02448082, + "epoch": 0.25227716819479934, + "flos": 20955456472320.0, + "grad_norm": 1.8911560250501802, + "language_loss": 0.85352147, + "learning_rate": 3.503717062883053e-06, + "loss": 0.875467, + "num_input_tokens_seen": 90533530, + "router_z_loss_clip": 0.96923828, + "router_z_loss_mlp": 0.19897461, + "step": 4196, + "time_per_iteration": 2.53324818611145 + }, + { + "auxiliary_loss_clip": 0.01157758, + "auxiliary_loss_mlp": 0.01044692, + "balance_loss_clip": 1.05948973, + "balance_loss_mlp": 1.02763343, + "epoch": 0.2523372914474673, + "flos": 23331342597120.0, + "grad_norm": 1.6714768762540124, + "language_loss": 0.83678985, + "learning_rate": 3.5034602527597786e-06, + "loss": 0.85881436, + "num_input_tokens_seen": 90554025, + "router_z_loss_clip": 0.98291016, + "router_z_loss_mlp": 0.17053223, + "step": 4197, + "time_per_iteration": 2.513490676879883 + }, + { + "auxiliary_loss_clip": 0.01155645, + "auxiliary_loss_mlp": 0.01049046, + "balance_loss_clip": 1.05815697, + "balance_loss_mlp": 1.02947211, + "epoch": 0.25239741470013527, + "flos": 36970321875840.0, + "grad_norm": 1.7674985675331976, + "language_loss": 0.72624159, + "learning_rate": 3.5032033856251405e-06, + "loss": 0.74828851, + "num_input_tokens_seen": 90576930, + "router_z_loss_clip": 0.97509766, + "router_z_loss_mlp": 0.19567871, + "step": 4198, + "time_per_iteration": 2.634012222290039 + }, + { + "auxiliary_loss_clip": 0.01151236, + "auxiliary_loss_mlp": 0.01049572, + "balance_loss_clip": 1.05259371, + "balance_loss_mlp": 1.03090417, + "epoch": 0.25245753795280323, + "flos": 18515757836160.0, + "grad_norm": 4.600745611205045, + "language_loss": 0.77546978, + "learning_rate": 3.50294646148888e-06, + "loss": 0.79747784, + "num_input_tokens_seen": 90595710, + "router_z_loss_clip": 0.98583984, + "router_z_loss_mlp": 0.18664551, + "step": 4199, + "time_per_iteration": 2.475839853286743 + }, + { + "auxiliary_loss_clip": 0.01167213, + "auxiliary_loss_mlp": 0.01046942, + "balance_loss_clip": 1.06694043, + "balance_loss_mlp": 1.03008556, + "epoch": 0.2525176612054712, + "flos": 32344884737280.0, + "grad_norm": 2.7621018829392363, + "language_loss": 0.73364866, + "learning_rate": 3.502689480360739e-06, + "loss": 0.75579011, + "num_input_tokens_seen": 90617945, + "router_z_loss_clip": 1.00341797, + "router_z_loss_mlp": 0.16845703, + "step": 4200, + "time_per_iteration": 2.5754806995391846 + }, + { + "auxiliary_loss_clip": 0.01157625, + "auxiliary_loss_mlp": 0.01051223, + "balance_loss_clip": 1.05485559, + "balance_loss_mlp": 1.03422415, + "epoch": 0.25257778445813917, + "flos": 45258217459200.0, + "grad_norm": 1.6048022132614816, + "language_loss": 0.82795417, + "learning_rate": 3.5024324422504616e-06, + "loss": 0.8500427, + "num_input_tokens_seen": 90640855, + "router_z_loss_clip": 1.02880859, + "router_z_loss_mlp": 0.17016602, + "step": 4201, + "time_per_iteration": 4.2020955085754395 + }, + { + "auxiliary_loss_clip": 0.01161293, + "auxiliary_loss_mlp": 0.01048284, + "balance_loss_clip": 1.06132722, + "balance_loss_mlp": 1.03050995, + "epoch": 0.25263790771080713, + "flos": 23367791923200.0, + "grad_norm": 1.7276711579280997, + "language_loss": 0.75337815, + "learning_rate": 3.5021753471677965e-06, + "loss": 0.77547395, + "num_input_tokens_seen": 90661350, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 0.17773438, + "step": 4202, + "time_per_iteration": 2.6008718013763428 + }, + { + "auxiliary_loss_clip": 0.01167864, + "auxiliary_loss_mlp": 0.01041029, + "balance_loss_clip": 1.06996894, + "balance_loss_mlp": 1.02429223, + "epoch": 0.25269803096347515, + "flos": 18515039564160.0, + "grad_norm": 1.8839224066527844, + "language_loss": 0.7356568, + "learning_rate": 3.501918195122491e-06, + "loss": 0.75774574, + "num_input_tokens_seen": 90680540, + "router_z_loss_clip": 0.97753906, + "router_z_loss_mlp": 0.16741943, + "step": 4203, + "time_per_iteration": 3.872321605682373 + }, + { + "auxiliary_loss_clip": 0.01159181, + "auxiliary_loss_mlp": 0.01045537, + "balance_loss_clip": 1.06175959, + "balance_loss_mlp": 1.0289433, + "epoch": 0.2527581542161431, + "flos": 24610552629120.0, + "grad_norm": 1.8104439445066998, + "language_loss": 0.77593577, + "learning_rate": 3.501660986124297e-06, + "loss": 0.79798287, + "num_input_tokens_seen": 90703460, + "router_z_loss_clip": 0.97363281, + "router_z_loss_mlp": 0.16595459, + "step": 4204, + "time_per_iteration": 2.507899284362793 + }, + { + "auxiliary_loss_clip": 0.01163291, + "auxiliary_loss_mlp": 0.01051878, + "balance_loss_clip": 1.06490254, + "balance_loss_mlp": 1.03394866, + "epoch": 0.2528182774688111, + "flos": 12641275111680.0, + "grad_norm": 2.248200550666839, + "language_loss": 0.72046608, + "learning_rate": 3.5014037201829684e-06, + "loss": 0.74261779, + "num_input_tokens_seen": 90718815, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.17932129, + "step": 4205, + "time_per_iteration": 3.8197712898254395 + }, + { + "auxiliary_loss_clip": 0.01145432, + "auxiliary_loss_mlp": 0.01037921, + "balance_loss_clip": 1.05507588, + "balance_loss_mlp": 1.02209044, + "epoch": 0.25287840072147905, + "flos": 46936789879680.0, + "grad_norm": 1.44641732787052, + "language_loss": 0.75963581, + "learning_rate": 3.50114639730826e-06, + "loss": 0.78146935, + "num_input_tokens_seen": 90742125, + "router_z_loss_clip": 0.90478516, + "router_z_loss_mlp": 0.1583252, + "step": 4206, + "time_per_iteration": 2.727931499481201 + }, + { + "auxiliary_loss_clip": 0.01147867, + "auxiliary_loss_mlp": 0.01043169, + "balance_loss_clip": 1.05306923, + "balance_loss_mlp": 1.02706409, + "epoch": 0.252938523974147, + "flos": 18879712392960.0, + "grad_norm": 2.126933358050079, + "language_loss": 0.78856099, + "learning_rate": 3.5008890175099296e-06, + "loss": 0.81047136, + "num_input_tokens_seen": 90760785, + "router_z_loss_clip": 0.94726562, + "router_z_loss_mlp": 0.16088867, + "step": 4207, + "time_per_iteration": 2.5582492351531982 + }, + { + "auxiliary_loss_clip": 0.01152053, + "auxiliary_loss_mlp": 0.01050443, + "balance_loss_clip": 1.05831504, + "balance_loss_mlp": 1.03362226, + "epoch": 0.252998647226815, + "flos": 21434720664960.0, + "grad_norm": 1.5386842545098403, + "language_loss": 0.76118535, + "learning_rate": 3.5006315807977375e-06, + "loss": 0.78321028, + "num_input_tokens_seen": 90780045, + "router_z_loss_clip": 0.93701172, + "router_z_loss_mlp": 0.16821289, + "step": 4208, + "time_per_iteration": 2.53332781791687 + }, + { + "auxiliary_loss_clip": 0.01149617, + "auxiliary_loss_mlp": 0.01040049, + "balance_loss_clip": 1.0567857, + "balance_loss_mlp": 1.02356195, + "epoch": 0.25305877047948294, + "flos": 25442171285760.0, + "grad_norm": 1.9842535618133017, + "language_loss": 0.69847739, + "learning_rate": 3.5003740871814456e-06, + "loss": 0.72037411, + "num_input_tokens_seen": 90797980, + "router_z_loss_clip": 0.92724609, + "router_z_loss_mlp": 0.16491699, + "step": 4209, + "time_per_iteration": 2.5255308151245117 + }, + { + "auxiliary_loss_clip": 0.01089474, + "auxiliary_loss_mlp": 0.01008272, + "balance_loss_clip": 1.05636775, + "balance_loss_mlp": 1.00605488, + "epoch": 0.2531188937321509, + "flos": 60185603629440.0, + "grad_norm": 0.7947272509771331, + "language_loss": 0.551566, + "learning_rate": 3.5001165366708175e-06, + "loss": 0.5725435, + "num_input_tokens_seen": 90864865, + "router_z_loss_clip": 0.33154297, + "router_z_loss_mlp": 0.02218628, + "step": 4210, + "time_per_iteration": 3.245244264602661 + }, + { + "auxiliary_loss_clip": 0.01155094, + "auxiliary_loss_mlp": 0.010358, + "balance_loss_clip": 1.05883527, + "balance_loss_mlp": 1.01967072, + "epoch": 0.25317901698481887, + "flos": 19682387665920.0, + "grad_norm": 1.7206941930919295, + "language_loss": 0.80251271, + "learning_rate": 3.4998589292756204e-06, + "loss": 0.82442164, + "num_input_tokens_seen": 90882885, + "router_z_loss_clip": 0.96386719, + "router_z_loss_mlp": 0.16137695, + "step": 4211, + "time_per_iteration": 2.464367389678955 + }, + { + "auxiliary_loss_clip": 0.01151586, + "auxiliary_loss_mlp": 0.01041451, + "balance_loss_clip": 1.05824327, + "balance_loss_mlp": 1.02548933, + "epoch": 0.25323914023748684, + "flos": 24424355502720.0, + "grad_norm": 1.5086453866496765, + "language_loss": 0.7837044, + "learning_rate": 3.499601265005622e-06, + "loss": 0.80563474, + "num_input_tokens_seen": 90902985, + "router_z_loss_clip": 0.93359375, + "router_z_loss_mlp": 0.15948486, + "step": 4212, + "time_per_iteration": 2.5141069889068604 + }, + { + "auxiliary_loss_clip": 0.01149455, + "auxiliary_loss_mlp": 0.01051127, + "balance_loss_clip": 1.05370033, + "balance_loss_mlp": 1.03232777, + "epoch": 0.2532992634901548, + "flos": 25447450584960.0, + "grad_norm": 3.1783946302815056, + "language_loss": 0.53731221, + "learning_rate": 3.4993435438705938e-06, + "loss": 0.55931807, + "num_input_tokens_seen": 90923550, + "router_z_loss_clip": 0.95751953, + "router_z_loss_mlp": 0.18811035, + "step": 4213, + "time_per_iteration": 2.676110029220581 + }, + { + "auxiliary_loss_clip": 0.01156226, + "auxiliary_loss_mlp": 0.01045394, + "balance_loss_clip": 1.06203389, + "balance_loss_mlp": 1.02790594, + "epoch": 0.25335938674282277, + "flos": 18880538405760.0, + "grad_norm": 3.2159812944088486, + "language_loss": 0.6484679, + "learning_rate": 3.499085765880308e-06, + "loss": 0.67048407, + "num_input_tokens_seen": 90943260, + "router_z_loss_clip": 0.94238281, + "router_z_loss_mlp": 0.17480469, + "step": 4214, + "time_per_iteration": 2.512389898300171 + }, + { + "auxiliary_loss_clip": 0.01088917, + "auxiliary_loss_mlp": 0.01004961, + "balance_loss_clip": 1.05616724, + "balance_loss_mlp": 1.00329196, + "epoch": 0.25341950999549073, + "flos": 53062649936640.0, + "grad_norm": 0.8576629677423823, + "language_loss": 0.57995182, + "learning_rate": 3.4988279310445396e-06, + "loss": 0.60089064, + "num_input_tokens_seen": 90996295, + "router_z_loss_clip": 0.32763672, + "router_z_loss_mlp": 0.01672363, + "step": 4215, + "time_per_iteration": 2.836705207824707 + }, + { + "auxiliary_loss_clip": 0.01154207, + "auxiliary_loss_mlp": 0.01042414, + "balance_loss_clip": 1.06100929, + "balance_loss_mlp": 1.02558112, + "epoch": 0.2534796332481587, + "flos": 39020247054720.0, + "grad_norm": 1.7226113568934553, + "language_loss": 0.83614969, + "learning_rate": 3.498570039373066e-06, + "loss": 0.85811591, + "num_input_tokens_seen": 91017545, + "router_z_loss_clip": 0.93164062, + "router_z_loss_mlp": 0.16821289, + "step": 4216, + "time_per_iteration": 2.661470413208008 + }, + { + "auxiliary_loss_clip": 0.011582, + "auxiliary_loss_mlp": 0.01036316, + "balance_loss_clip": 1.06384218, + "balance_loss_mlp": 1.01914966, + "epoch": 0.2535397565008267, + "flos": 23586990670080.0, + "grad_norm": 4.1813062940303904, + "language_loss": 0.80182731, + "learning_rate": 3.498312090875666e-06, + "loss": 0.82377249, + "num_input_tokens_seen": 91037715, + "router_z_loss_clip": 0.94335938, + "router_z_loss_mlp": 0.17163086, + "step": 4217, + "time_per_iteration": 2.4615414142608643 + }, + { + "auxiliary_loss_clip": 0.01149617, + "auxiliary_loss_mlp": 0.01040795, + "balance_loss_clip": 1.05609691, + "balance_loss_mlp": 1.02521443, + "epoch": 0.2535998797534947, + "flos": 19281373251840.0, + "grad_norm": 2.6937787469256183, + "language_loss": 0.74579501, + "learning_rate": 3.4980540855621218e-06, + "loss": 0.76769912, + "num_input_tokens_seen": 91055295, + "router_z_loss_clip": 0.93652344, + "router_z_loss_mlp": 0.15576172, + "step": 4218, + "time_per_iteration": 2.4806740283966064 + }, + { + "auxiliary_loss_clip": 0.01151978, + "auxiliary_loss_mlp": 0.0104121, + "balance_loss_clip": 1.05603325, + "balance_loss_mlp": 1.02437758, + "epoch": 0.25366000300616265, + "flos": 24024382583040.0, + "grad_norm": 1.6454882252933472, + "language_loss": 0.74489427, + "learning_rate": 3.4977960234422167e-06, + "loss": 0.76682615, + "num_input_tokens_seen": 91075485, + "router_z_loss_clip": 0.95996094, + "router_z_loss_mlp": 0.16845703, + "step": 4219, + "time_per_iteration": 2.4780356884002686 + }, + { + "auxiliary_loss_clip": 0.01155354, + "auxiliary_loss_mlp": 0.01055148, + "balance_loss_clip": 1.05906045, + "balance_loss_mlp": 1.03600335, + "epoch": 0.2537201262588306, + "flos": 16289368116480.0, + "grad_norm": 1.8751417809906528, + "language_loss": 0.8152225, + "learning_rate": 3.497537904525736e-06, + "loss": 0.83732754, + "num_input_tokens_seen": 91093620, + "router_z_loss_clip": 0.96289062, + "router_z_loss_mlp": 0.19128418, + "step": 4220, + "time_per_iteration": 2.4745776653289795 + }, + { + "auxiliary_loss_clip": 0.0115852, + "auxiliary_loss_mlp": 0.01047691, + "balance_loss_clip": 1.06281519, + "balance_loss_mlp": 1.02980971, + "epoch": 0.2537802495114986, + "flos": 23294677789440.0, + "grad_norm": 2.120654799101651, + "language_loss": 0.71219409, + "learning_rate": 3.497279728822468e-06, + "loss": 0.73425615, + "num_input_tokens_seen": 91114110, + "router_z_loss_clip": 0.95751953, + "router_z_loss_mlp": 0.17895508, + "step": 4221, + "time_per_iteration": 2.4781062602996826 + }, + { + "auxiliary_loss_clip": 0.01166967, + "auxiliary_loss_mlp": 0.01039524, + "balance_loss_clip": 1.06960166, + "balance_loss_mlp": 1.02248931, + "epoch": 0.25384037276416654, + "flos": 17639142416640.0, + "grad_norm": 1.6143844293744125, + "language_loss": 0.61693639, + "learning_rate": 3.497021496342202e-06, + "loss": 0.63900137, + "num_input_tokens_seen": 91133135, + "router_z_loss_clip": 0.97509766, + "router_z_loss_mlp": 0.17016602, + "step": 4222, + "time_per_iteration": 2.485895872116089 + }, + { + "auxiliary_loss_clip": 0.0115623, + "auxiliary_loss_mlp": 0.01049835, + "balance_loss_clip": 1.0580163, + "balance_loss_mlp": 1.03258538, + "epoch": 0.2539004960168345, + "flos": 21507044699520.0, + "grad_norm": 1.717775328684787, + "language_loss": 0.74803972, + "learning_rate": 3.496763207094731e-06, + "loss": 0.77010036, + "num_input_tokens_seen": 91151805, + "router_z_loss_clip": 0.98242188, + "router_z_loss_mlp": 0.17236328, + "step": 4223, + "time_per_iteration": 2.5265896320343018 + }, + { + "auxiliary_loss_clip": 0.01158691, + "auxiliary_loss_mlp": 0.01039141, + "balance_loss_clip": 1.06418371, + "balance_loss_mlp": 1.0229764, + "epoch": 0.2539606192695025, + "flos": 23950909313280.0, + "grad_norm": 1.5306627029934567, + "language_loss": 0.80026126, + "learning_rate": 3.49650486108985e-06, + "loss": 0.82223964, + "num_input_tokens_seen": 91172270, + "router_z_loss_clip": 0.94580078, + "router_z_loss_mlp": 0.16162109, + "step": 4224, + "time_per_iteration": 2.5109148025512695 + }, + { + "auxiliary_loss_clip": 0.01159015, + "auxiliary_loss_mlp": 0.01043325, + "balance_loss_clip": 1.06333756, + "balance_loss_mlp": 1.02685654, + "epoch": 0.25402074252217044, + "flos": 24169784837760.0, + "grad_norm": 1.4744311059866624, + "language_loss": 0.77809644, + "learning_rate": 3.496246458337354e-06, + "loss": 0.80011982, + "num_input_tokens_seen": 91192080, + "router_z_loss_clip": 0.95800781, + "router_z_loss_mlp": 0.16461182, + "step": 4225, + "time_per_iteration": 2.543273687362671 + }, + { + "auxiliary_loss_clip": 0.01152572, + "auxiliary_loss_mlp": 0.01054885, + "balance_loss_clip": 1.05720019, + "balance_loss_mlp": 1.03727818, + "epoch": 0.2540808657748384, + "flos": 22303758314880.0, + "grad_norm": 1.9155489514220303, + "language_loss": 0.84681016, + "learning_rate": 3.4959879988470426e-06, + "loss": 0.86888468, + "num_input_tokens_seen": 91211450, + "router_z_loss_clip": 0.95361328, + "router_z_loss_mlp": 0.17614746, + "step": 4226, + "time_per_iteration": 2.5265400409698486 + }, + { + "auxiliary_loss_clip": 0.0114789, + "auxiliary_loss_mlp": 0.01045555, + "balance_loss_clip": 1.05506158, + "balance_loss_mlp": 1.02862716, + "epoch": 0.25414098902750637, + "flos": 27599541022080.0, + "grad_norm": 1.5007723424750008, + "language_loss": 0.71136081, + "learning_rate": 3.4957294826287164e-06, + "loss": 0.73329532, + "num_input_tokens_seen": 91231835, + "router_z_loss_clip": 0.92724609, + "router_z_loss_mlp": 0.16931152, + "step": 4227, + "time_per_iteration": 2.5443508625030518 + }, + { + "auxiliary_loss_clip": 0.01082896, + "auxiliary_loss_mlp": 0.01020718, + "balance_loss_clip": 1.05010605, + "balance_loss_mlp": 1.01881051, + "epoch": 0.25420111228017434, + "flos": 58170834887040.0, + "grad_norm": 0.9938032159579038, + "language_loss": 0.61922812, + "learning_rate": 3.4954709096921785e-06, + "loss": 0.64026427, + "num_input_tokens_seen": 91288755, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.01907349, + "step": 4228, + "time_per_iteration": 2.926722288131714 + }, + { + "auxiliary_loss_clip": 0.01150894, + "auxiliary_loss_mlp": 0.01041123, + "balance_loss_clip": 1.05414152, + "balance_loss_mlp": 1.02352774, + "epoch": 0.2542612355328423, + "flos": 11464409905920.0, + "grad_norm": 2.7536152219111605, + "language_loss": 0.86085582, + "learning_rate": 3.4952122800472336e-06, + "loss": 0.88277596, + "num_input_tokens_seen": 91302485, + "router_z_loss_clip": 0.96630859, + "router_z_loss_mlp": 0.17602539, + "step": 4229, + "time_per_iteration": 2.3977091312408447 + }, + { + "auxiliary_loss_clip": 0.01159698, + "auxiliary_loss_mlp": 0.01046073, + "balance_loss_clip": 1.06409323, + "balance_loss_mlp": 1.02897859, + "epoch": 0.2543213587855103, + "flos": 22965879669120.0, + "grad_norm": 2.1744462789494556, + "language_loss": 0.77392054, + "learning_rate": 3.4949535937036892e-06, + "loss": 0.79597819, + "num_input_tokens_seen": 91321120, + "router_z_loss_clip": 0.95654297, + "router_z_loss_mlp": 0.17102051, + "step": 4230, + "time_per_iteration": 2.5214216709136963 + }, + { + "auxiliary_loss_clip": 0.01154276, + "auxiliary_loss_mlp": 0.01044508, + "balance_loss_clip": 1.05961895, + "balance_loss_mlp": 1.02648389, + "epoch": 0.2543814820381783, + "flos": 18253178438400.0, + "grad_norm": 2.0523911622052617, + "language_loss": 0.75291181, + "learning_rate": 3.4946948506713544e-06, + "loss": 0.7748996, + "num_input_tokens_seen": 91338575, + "router_z_loss_clip": 0.9453125, + "router_z_loss_mlp": 0.18017578, + "step": 4231, + "time_per_iteration": 2.440443277359009 + }, + { + "auxiliary_loss_clip": 0.01153463, + "auxiliary_loss_mlp": 0.01036868, + "balance_loss_clip": 1.05994582, + "balance_loss_mlp": 1.02008331, + "epoch": 0.25444160529084625, + "flos": 15632705629440.0, + "grad_norm": 2.0527265953814067, + "language_loss": 0.73795271, + "learning_rate": 3.4944360509600416e-06, + "loss": 0.75985605, + "num_input_tokens_seen": 91357355, + "router_z_loss_clip": 0.93457031, + "router_z_loss_mlp": 0.16784668, + "step": 4232, + "time_per_iteration": 2.4617209434509277 + }, + { + "auxiliary_loss_clip": 0.01154749, + "auxiliary_loss_mlp": 0.0104262, + "balance_loss_clip": 1.06087661, + "balance_loss_mlp": 1.02475023, + "epoch": 0.2545017285435142, + "flos": 24601610142720.0, + "grad_norm": 1.7965106213703024, + "language_loss": 0.87017131, + "learning_rate": 3.4941771945795637e-06, + "loss": 0.89214504, + "num_input_tokens_seen": 91376515, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.17883301, + "step": 4233, + "time_per_iteration": 2.4978339672088623 + }, + { + "auxiliary_loss_clip": 0.0114782, + "auxiliary_loss_mlp": 0.01034594, + "balance_loss_clip": 1.05906379, + "balance_loss_mlp": 1.01950169, + "epoch": 0.2545618517961822, + "flos": 24679069822080.0, + "grad_norm": 1.6635962182411586, + "language_loss": 0.74739099, + "learning_rate": 3.493918281539737e-06, + "loss": 0.76921511, + "num_input_tokens_seen": 91397595, + "router_z_loss_clip": 0.88818359, + "router_z_loss_mlp": 0.15087891, + "step": 4234, + "time_per_iteration": 2.5092897415161133 + }, + { + "auxiliary_loss_clip": 0.01153318, + "auxiliary_loss_mlp": 0.01041447, + "balance_loss_clip": 1.05611992, + "balance_loss_mlp": 1.02505541, + "epoch": 0.25462197504885015, + "flos": 23915106432000.0, + "grad_norm": 1.518115132668292, + "language_loss": 0.74637955, + "learning_rate": 3.493659311850379e-06, + "loss": 0.76832724, + "num_input_tokens_seen": 91417775, + "router_z_loss_clip": 0.97119141, + "router_z_loss_mlp": 0.16394043, + "step": 4235, + "time_per_iteration": 3.9299964904785156 + }, + { + "auxiliary_loss_clip": 0.01163449, + "auxiliary_loss_mlp": 0.01047005, + "balance_loss_clip": 1.06091392, + "balance_loss_mlp": 1.0270493, + "epoch": 0.2546820983015181, + "flos": 24789387467520.0, + "grad_norm": 2.0213472459513557, + "language_loss": 0.64798677, + "learning_rate": 3.4934002855213106e-06, + "loss": 0.67009127, + "num_input_tokens_seen": 91437665, + "router_z_loss_clip": 1.02636719, + "router_z_loss_mlp": 0.19970703, + "step": 4236, + "time_per_iteration": 2.517857789993286 + }, + { + "auxiliary_loss_clip": 0.01163829, + "auxiliary_loss_mlp": 0.01033408, + "balance_loss_clip": 1.06864631, + "balance_loss_mlp": 1.01783931, + "epoch": 0.2547422215541861, + "flos": 18734130570240.0, + "grad_norm": 2.1421280991864284, + "language_loss": 0.66652858, + "learning_rate": 3.493141202562354e-06, + "loss": 0.688501, + "num_input_tokens_seen": 91456705, + "router_z_loss_clip": 0.95166016, + "router_z_loss_mlp": 0.15563965, + "step": 4237, + "time_per_iteration": 2.47536039352417 + }, + { + "auxiliary_loss_clip": 0.01157373, + "auxiliary_loss_mlp": 0.01048338, + "balance_loss_clip": 1.06141579, + "balance_loss_mlp": 1.0309217, + "epoch": 0.25480234480685404, + "flos": 21032449274880.0, + "grad_norm": 2.118492350593305, + "language_loss": 0.75344706, + "learning_rate": 3.492882062983333e-06, + "loss": 0.77550411, + "num_input_tokens_seen": 91475535, + "router_z_loss_clip": 0.96044922, + "router_z_loss_mlp": 0.17419434, + "step": 4238, + "time_per_iteration": 2.505932569503784 + }, + { + "auxiliary_loss_clip": 0.01158604, + "auxiliary_loss_mlp": 0.01052038, + "balance_loss_clip": 1.06222486, + "balance_loss_mlp": 1.03199935, + "epoch": 0.254862468059522, + "flos": 25082167224960.0, + "grad_norm": 1.8491003906335086, + "language_loss": 0.80802172, + "learning_rate": 3.492622866794074e-06, + "loss": 0.83012819, + "num_input_tokens_seen": 91499140, + "router_z_loss_clip": 0.96533203, + "router_z_loss_mlp": 0.20031738, + "step": 4239, + "time_per_iteration": 2.5833332538604736 + }, + { + "auxiliary_loss_clip": 0.0115298, + "auxiliary_loss_mlp": 0.01044623, + "balance_loss_clip": 1.0603987, + "balance_loss_mlp": 1.02608562, + "epoch": 0.25492259131219, + "flos": 20558392554240.0, + "grad_norm": 1.881292766937472, + "language_loss": 0.77775073, + "learning_rate": 3.492363614004407e-06, + "loss": 0.79972678, + "num_input_tokens_seen": 91518335, + "router_z_loss_clip": 0.92529297, + "router_z_loss_mlp": 0.18530273, + "step": 4240, + "time_per_iteration": 2.4753470420837402 + }, + { + "auxiliary_loss_clip": 0.01160544, + "auxiliary_loss_mlp": 0.01037672, + "balance_loss_clip": 1.0593214, + "balance_loss_mlp": 1.0194571, + "epoch": 0.25498271456485794, + "flos": 25042485674880.0, + "grad_norm": 1.778308229932774, + "language_loss": 0.82994676, + "learning_rate": 3.492104304624162e-06, + "loss": 0.85192895, + "num_input_tokens_seen": 91537655, + "router_z_loss_clip": 1.01171875, + "router_z_loss_mlp": 0.18200684, + "step": 4241, + "time_per_iteration": 2.5806002616882324 + }, + { + "auxiliary_loss_clip": 0.01155737, + "auxiliary_loss_mlp": 0.01047776, + "balance_loss_clip": 1.05999124, + "balance_loss_mlp": 1.03094411, + "epoch": 0.2550428378175259, + "flos": 26178412354560.0, + "grad_norm": 1.7217279379070993, + "language_loss": 0.73859751, + "learning_rate": 3.4918449386631725e-06, + "loss": 0.76063263, + "num_input_tokens_seen": 91557545, + "router_z_loss_clip": 0.95605469, + "router_z_loss_mlp": 0.16833496, + "step": 4242, + "time_per_iteration": 2.525207996368408 + }, + { + "auxiliary_loss_clip": 0.01161586, + "auxiliary_loss_mlp": 0.01038486, + "balance_loss_clip": 1.06414592, + "balance_loss_mlp": 1.02165389, + "epoch": 0.2551029610701939, + "flos": 15267170874240.0, + "grad_norm": 5.332420197608134, + "language_loss": 0.73308253, + "learning_rate": 3.491585516131273e-06, + "loss": 0.7550832, + "num_input_tokens_seen": 91574405, + "router_z_loss_clip": 0.97363281, + "router_z_loss_mlp": 0.16845703, + "step": 4243, + "time_per_iteration": 2.4291255474090576 + }, + { + "auxiliary_loss_clip": 0.01160985, + "auxiliary_loss_mlp": 0.01045805, + "balance_loss_clip": 1.06302238, + "balance_loss_mlp": 1.02810299, + "epoch": 0.2551630843228619, + "flos": 18112193556480.0, + "grad_norm": 1.6424582018882974, + "language_loss": 0.81502068, + "learning_rate": 3.491326037038301e-06, + "loss": 0.83708858, + "num_input_tokens_seen": 91593755, + "router_z_loss_clip": 0.98046875, + "router_z_loss_mlp": 0.17700195, + "step": 4244, + "time_per_iteration": 2.461782693862915 + }, + { + "auxiliary_loss_clip": 0.01107954, + "auxiliary_loss_mlp": 0.01011309, + "balance_loss_clip": 1.07311046, + "balance_loss_mlp": 1.00935125, + "epoch": 0.25522320757552985, + "flos": 70520192167680.0, + "grad_norm": 0.6918237822342245, + "language_loss": 0.57717657, + "learning_rate": 3.4910665013940967e-06, + "loss": 0.59836918, + "num_input_tokens_seen": 91660335, + "router_z_loss_clip": 0.34863281, + "router_z_loss_mlp": 0.01956177, + "step": 4245, + "time_per_iteration": 4.629671096801758 + }, + { + "auxiliary_loss_clip": 0.01162765, + "auxiliary_loss_mlp": 0.01044062, + "balance_loss_clip": 1.06451917, + "balance_loss_mlp": 1.0271101, + "epoch": 0.2552833308281978, + "flos": 22893088757760.0, + "grad_norm": 2.2080815919694645, + "language_loss": 0.65069324, + "learning_rate": 3.4908069092085015e-06, + "loss": 0.6727615, + "num_input_tokens_seen": 91678500, + "router_z_loss_clip": 0.98291016, + "router_z_loss_mlp": 0.16967773, + "step": 4246, + "time_per_iteration": 3.9383201599121094 + }, + { + "auxiliary_loss_clip": 0.01151983, + "auxiliary_loss_mlp": 0.01046049, + "balance_loss_clip": 1.0606699, + "balance_loss_mlp": 1.03048015, + "epoch": 0.2553434540808658, + "flos": 22053605022720.0, + "grad_norm": 2.2203794046813936, + "language_loss": 0.81481624, + "learning_rate": 3.4905472604913585e-06, + "loss": 0.83679652, + "num_input_tokens_seen": 91696430, + "router_z_loss_clip": 0.91357422, + "router_z_loss_mlp": 0.15563965, + "step": 4247, + "time_per_iteration": 2.4743728637695312 + }, + { + "auxiliary_loss_clip": 0.01159991, + "auxiliary_loss_mlp": 0.01047097, + "balance_loss_clip": 1.05647039, + "balance_loss_mlp": 1.02783263, + "epoch": 0.25540357733353375, + "flos": 16544190176640.0, + "grad_norm": 2.1777557084228, + "language_loss": 0.83620834, + "learning_rate": 3.490287555252514e-06, + "loss": 0.85827923, + "num_input_tokens_seen": 91713270, + "router_z_loss_clip": 1.03515625, + "router_z_loss_mlp": 0.19262695, + "step": 4248, + "time_per_iteration": 2.458237648010254 + }, + { + "auxiliary_loss_clip": 0.01162265, + "auxiliary_loss_mlp": 0.01053104, + "balance_loss_clip": 1.06259036, + "balance_loss_mlp": 1.03599787, + "epoch": 0.2554637005862017, + "flos": 17565022702080.0, + "grad_norm": 3.635071379339731, + "language_loss": 0.84388983, + "learning_rate": 3.4900277935018166e-06, + "loss": 0.86604351, + "num_input_tokens_seen": 91728865, + "router_z_loss_clip": 0.99804688, + "router_z_loss_mlp": 0.17102051, + "step": 4249, + "time_per_iteration": 3.848259687423706 + }, + { + "auxiliary_loss_clip": 0.01083157, + "auxiliary_loss_mlp": 0.0101664, + "balance_loss_clip": 1.05069017, + "balance_loss_mlp": 1.01469672, + "epoch": 0.2555238238388697, + "flos": 72244763953920.0, + "grad_norm": 0.7517772861869748, + "language_loss": 0.56268501, + "learning_rate": 3.489767975249115e-06, + "loss": 0.58368301, + "num_input_tokens_seen": 91787470, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.01940918, + "step": 4250, + "time_per_iteration": 3.077364683151245 + }, + { + "auxiliary_loss_clip": 0.01157012, + "auxiliary_loss_mlp": 0.01041071, + "balance_loss_clip": 1.05896235, + "balance_loss_mlp": 1.02304649, + "epoch": 0.25558394709153764, + "flos": 24389414547840.0, + "grad_norm": 2.0024752804256796, + "language_loss": 0.80493248, + "learning_rate": 3.4895081005042632e-06, + "loss": 0.82691336, + "num_input_tokens_seen": 91805640, + "router_z_loss_clip": 0.98095703, + "router_z_loss_mlp": 0.18029785, + "step": 4251, + "time_per_iteration": 2.5716500282287598 + }, + { + "auxiliary_loss_clip": 0.01094619, + "auxiliary_loss_mlp": 0.01006727, + "balance_loss_clip": 1.06137514, + "balance_loss_mlp": 1.00486493, + "epoch": 0.2556440703442056, + "flos": 69231213636480.0, + "grad_norm": 0.8024415000028945, + "language_loss": 0.66129375, + "learning_rate": 3.4892481692771146e-06, + "loss": 0.68230724, + "num_input_tokens_seen": 91869695, + "router_z_loss_clip": 0.33251953, + "router_z_loss_mlp": 0.01861572, + "step": 4252, + "time_per_iteration": 3.113947629928589 + }, + { + "auxiliary_loss_clip": 0.01152895, + "auxiliary_loss_mlp": 0.01042963, + "balance_loss_clip": 1.05965352, + "balance_loss_mlp": 1.02752519, + "epoch": 0.2557041935968736, + "flos": 24863902231680.0, + "grad_norm": 1.7542376776219648, + "language_loss": 0.74021703, + "learning_rate": 3.4889881815775267e-06, + "loss": 0.76217562, + "num_input_tokens_seen": 91889920, + "router_z_loss_clip": 0.93359375, + "router_z_loss_mlp": 0.15441895, + "step": 4253, + "time_per_iteration": 2.5324811935424805 + }, + { + "auxiliary_loss_clip": 0.01153193, + "auxiliary_loss_mlp": 0.01042709, + "balance_loss_clip": 1.05903709, + "balance_loss_mlp": 1.0267117, + "epoch": 0.25576431684954154, + "flos": 22492110257280.0, + "grad_norm": 2.487509906357571, + "language_loss": 0.72644585, + "learning_rate": 3.488728137415357e-06, + "loss": 0.74840486, + "num_input_tokens_seen": 91908665, + "router_z_loss_clip": 0.94042969, + "router_z_loss_mlp": 0.15979004, + "step": 4254, + "time_per_iteration": 2.458298444747925 + }, + { + "auxiliary_loss_clip": 0.01156355, + "auxiliary_loss_mlp": 0.01042006, + "balance_loss_clip": 1.0615921, + "balance_loss_mlp": 1.02401733, + "epoch": 0.2558244401022095, + "flos": 19826748426240.0, + "grad_norm": 1.8064051058597081, + "language_loss": 0.81213528, + "learning_rate": 3.4884680368004675e-06, + "loss": 0.8341189, + "num_input_tokens_seen": 91927855, + "router_z_loss_clip": 0.94921875, + "router_z_loss_mlp": 0.17980957, + "step": 4255, + "time_per_iteration": 2.4625492095947266 + }, + { + "auxiliary_loss_clip": 0.01155597, + "auxiliary_loss_mlp": 0.01046734, + "balance_loss_clip": 1.0626905, + "balance_loss_mlp": 1.02859044, + "epoch": 0.2558845633548775, + "flos": 23220486247680.0, + "grad_norm": 1.601711687066102, + "language_loss": 0.85397208, + "learning_rate": 3.488207879742721e-06, + "loss": 0.87599534, + "num_input_tokens_seen": 91948500, + "router_z_loss_clip": 0.92919922, + "router_z_loss_mlp": 0.18121338, + "step": 4256, + "time_per_iteration": 2.534919023513794 + }, + { + "auxiliary_loss_clip": 0.01155153, + "auxiliary_loss_mlp": 0.01054676, + "balance_loss_clip": 1.05687404, + "balance_loss_mlp": 1.03560305, + "epoch": 0.2559446866075455, + "flos": 16837867774080.0, + "grad_norm": 1.8633860411740586, + "language_loss": 0.74715483, + "learning_rate": 3.4879476662519826e-06, + "loss": 0.76925313, + "num_input_tokens_seen": 91968375, + "router_z_loss_clip": 0.98388672, + "router_z_loss_mlp": 0.19067383, + "step": 4257, + "time_per_iteration": 2.500875949859619 + }, + { + "auxiliary_loss_clip": 0.01090744, + "auxiliary_loss_mlp": 0.01016088, + "balance_loss_clip": 1.05724573, + "balance_loss_mlp": 1.0141449, + "epoch": 0.25600480986021346, + "flos": 57593786895360.0, + "grad_norm": 0.7965234166577067, + "language_loss": 0.65260589, + "learning_rate": 3.4876873963381196e-06, + "loss": 0.67367423, + "num_input_tokens_seen": 92028490, + "router_z_loss_clip": 0.33496094, + "router_z_loss_mlp": 0.01940918, + "step": 4258, + "time_per_iteration": 3.113797187805176 + }, + { + "auxiliary_loss_clip": 0.01160939, + "auxiliary_loss_mlp": 0.01046534, + "balance_loss_clip": 1.06658792, + "balance_loss_mlp": 1.02861714, + "epoch": 0.2560649331128814, + "flos": 27819529868160.0, + "grad_norm": 1.805394086227396, + "language_loss": 0.76700723, + "learning_rate": 3.4874270700110013e-06, + "loss": 0.78908199, + "num_input_tokens_seen": 92048060, + "router_z_loss_clip": 0.94287109, + "router_z_loss_mlp": 0.17919922, + "step": 4259, + "time_per_iteration": 2.571430206298828 + }, + { + "auxiliary_loss_clip": 0.01099391, + "auxiliary_loss_mlp": 0.01008894, + "balance_loss_clip": 1.06654406, + "balance_loss_mlp": 1.00714183, + "epoch": 0.2561250563655494, + "flos": 70950509101440.0, + "grad_norm": 0.7893968834088004, + "language_loss": 0.58453584, + "learning_rate": 3.4871666872804994e-06, + "loss": 0.60561866, + "num_input_tokens_seen": 92118180, + "router_z_loss_clip": 0.32861328, + "router_z_loss_mlp": 0.01750183, + "step": 4260, + "time_per_iteration": 3.1590335369110107 + }, + { + "auxiliary_loss_clip": 0.0115566, + "auxiliary_loss_mlp": 0.01045332, + "balance_loss_clip": 1.0586648, + "balance_loss_mlp": 1.02845192, + "epoch": 0.25618517961821735, + "flos": 27012329481600.0, + "grad_norm": 4.4828028973388845, + "language_loss": 0.76576942, + "learning_rate": 3.4869062481564875e-06, + "loss": 0.78777933, + "num_input_tokens_seen": 92137570, + "router_z_loss_clip": 0.97070312, + "router_z_loss_mlp": 0.16870117, + "step": 4261, + "time_per_iteration": 2.5014288425445557 + }, + { + "auxiliary_loss_clip": 0.0116582, + "auxiliary_loss_mlp": 0.01037208, + "balance_loss_clip": 1.07110381, + "balance_loss_mlp": 1.02177, + "epoch": 0.2562453028708853, + "flos": 23068296322560.0, + "grad_norm": 2.0265172612324944, + "language_loss": 0.83016884, + "learning_rate": 3.486645752648842e-06, + "loss": 0.85219914, + "num_input_tokens_seen": 92157625, + "router_z_loss_clip": 0.94726562, + "router_z_loss_mlp": 0.15441895, + "step": 4262, + "time_per_iteration": 2.616581916809082 + }, + { + "auxiliary_loss_clip": 0.01163767, + "auxiliary_loss_mlp": 0.01044605, + "balance_loss_clip": 1.06375492, + "balance_loss_mlp": 1.02724767, + "epoch": 0.2563054261235533, + "flos": 15120942606720.0, + "grad_norm": 2.2692645659278505, + "language_loss": 0.73184162, + "learning_rate": 3.4863852007674405e-06, + "loss": 0.75392532, + "num_input_tokens_seen": 92175350, + "router_z_loss_clip": 0.99804688, + "router_z_loss_mlp": 0.17358398, + "step": 4263, + "time_per_iteration": 2.417226552963257 + }, + { + "auxiliary_loss_clip": 0.0116661, + "auxiliary_loss_mlp": 0.01053907, + "balance_loss_clip": 1.0751667, + "balance_loss_mlp": 1.03695512, + "epoch": 0.25636554937622125, + "flos": 27854865872640.0, + "grad_norm": 1.7212603205902568, + "language_loss": 0.82527393, + "learning_rate": 3.486124592522163e-06, + "loss": 0.8474791, + "num_input_tokens_seen": 92196070, + "router_z_loss_clip": 0.91455078, + "router_z_loss_mlp": 0.16943359, + "step": 4264, + "time_per_iteration": 2.5387942790985107 + }, + { + "auxiliary_loss_clip": 0.01161353, + "auxiliary_loss_mlp": 0.01043201, + "balance_loss_clip": 1.06679595, + "balance_loss_mlp": 1.02597511, + "epoch": 0.2564256726288892, + "flos": 28906509288960.0, + "grad_norm": 2.4867312227318608, + "language_loss": 0.74777651, + "learning_rate": 3.4858639279228924e-06, + "loss": 0.76982206, + "num_input_tokens_seen": 92216310, + "router_z_loss_clip": 0.94628906, + "router_z_loss_mlp": 0.17236328, + "step": 4265, + "time_per_iteration": 2.5394248962402344 + }, + { + "auxiliary_loss_clip": 0.01151687, + "auxiliary_loss_mlp": 0.01039256, + "balance_loss_clip": 1.05717826, + "balance_loss_mlp": 1.02311528, + "epoch": 0.2564857958815572, + "flos": 18514931823360.0, + "grad_norm": 1.6761042502838848, + "language_loss": 0.81627697, + "learning_rate": 3.485603206979513e-06, + "loss": 0.83818638, + "num_input_tokens_seen": 92234510, + "router_z_loss_clip": 0.94482422, + "router_z_loss_mlp": 0.16125488, + "step": 4266, + "time_per_iteration": 2.487581491470337 + }, + { + "auxiliary_loss_clip": 0.01145138, + "auxiliary_loss_mlp": 0.01037876, + "balance_loss_clip": 1.05469918, + "balance_loss_mlp": 1.02154493, + "epoch": 0.25654591913422514, + "flos": 25808280658560.0, + "grad_norm": 1.8923455549889514, + "language_loss": 0.79360002, + "learning_rate": 3.4853424297019103e-06, + "loss": 0.81543016, + "num_input_tokens_seen": 92254070, + "router_z_loss_clip": 0.90478516, + "router_z_loss_mlp": 0.16320801, + "step": 4267, + "time_per_iteration": 2.512259006500244 + }, + { + "auxiliary_loss_clip": 0.01146368, + "auxiliary_loss_mlp": 0.01041229, + "balance_loss_clip": 1.05772829, + "balance_loss_mlp": 1.02482605, + "epoch": 0.2566060423868931, + "flos": 19099665325440.0, + "grad_norm": 1.5224657581768997, + "language_loss": 0.79053158, + "learning_rate": 3.4850815960999736e-06, + "loss": 0.81240755, + "num_input_tokens_seen": 92275060, + "router_z_loss_clip": 0.88574219, + "router_z_loss_mlp": 0.16394043, + "step": 4268, + "time_per_iteration": 2.5281927585601807 + }, + { + "auxiliary_loss_clip": 0.01153019, + "auxiliary_loss_mlp": 0.01040048, + "balance_loss_clip": 1.05863225, + "balance_loss_mlp": 1.02379954, + "epoch": 0.25666616563956113, + "flos": 23842674656640.0, + "grad_norm": 1.7717580195450178, + "language_loss": 0.6814931, + "learning_rate": 3.484820706183595e-06, + "loss": 0.70342374, + "num_input_tokens_seen": 92293610, + "router_z_loss_clip": 0.94335938, + "router_z_loss_mlp": 0.16235352, + "step": 4269, + "time_per_iteration": 2.5186285972595215 + }, + { + "auxiliary_loss_clip": 0.01159782, + "auxiliary_loss_mlp": 0.01044177, + "balance_loss_clip": 1.06519353, + "balance_loss_mlp": 1.02740383, + "epoch": 0.2567262888922291, + "flos": 14604259420800.0, + "grad_norm": 2.7958083106142273, + "language_loss": 0.7924872, + "learning_rate": 3.484559759962666e-06, + "loss": 0.8145268, + "num_input_tokens_seen": 92308305, + "router_z_loss_clip": 0.94580078, + "router_z_loss_mlp": 0.16772461, + "step": 4270, + "time_per_iteration": 2.4961354732513428 + }, + { + "auxiliary_loss_clip": 0.0116214, + "auxiliary_loss_mlp": 0.0104258, + "balance_loss_clip": 1.06147635, + "balance_loss_mlp": 1.02305353, + "epoch": 0.25678641214489706, + "flos": 32923117877760.0, + "grad_norm": 1.9545069807183524, + "language_loss": 0.67609203, + "learning_rate": 3.4842987574470816e-06, + "loss": 0.69813919, + "num_input_tokens_seen": 92329875, + "router_z_loss_clip": 1.00732422, + "router_z_loss_mlp": 0.19543457, + "step": 4271, + "time_per_iteration": 2.5595719814300537 + }, + { + "auxiliary_loss_clip": 0.01164401, + "auxiliary_loss_mlp": 0.01045098, + "balance_loss_clip": 1.06474304, + "balance_loss_mlp": 1.02665663, + "epoch": 0.256846535397565, + "flos": 24098933260800.0, + "grad_norm": 2.340138801388028, + "language_loss": 0.87635005, + "learning_rate": 3.4840376986467403e-06, + "loss": 0.89844501, + "num_input_tokens_seen": 92348780, + "router_z_loss_clip": 0.99658203, + "router_z_loss_mlp": 0.18432617, + "step": 4272, + "time_per_iteration": 2.604855537414551 + }, + { + "auxiliary_loss_clip": 0.01157535, + "auxiliary_loss_mlp": 0.01044625, + "balance_loss_clip": 1.06153774, + "balance_loss_mlp": 1.02636242, + "epoch": 0.256906658650233, + "flos": 19718441942400.0, + "grad_norm": 1.8061935150422013, + "language_loss": 0.820508, + "learning_rate": 3.483776583571541e-06, + "loss": 0.84252965, + "num_input_tokens_seen": 92368175, + "router_z_loss_clip": 0.95898438, + "router_z_loss_mlp": 0.18261719, + "step": 4273, + "time_per_iteration": 2.4528658390045166 + }, + { + "auxiliary_loss_clip": 0.01156529, + "auxiliary_loss_mlp": 0.01041124, + "balance_loss_clip": 1.06587183, + "balance_loss_mlp": 1.02435136, + "epoch": 0.25696678190290095, + "flos": 22926018551040.0, + "grad_norm": 1.4919397294762924, + "language_loss": 0.77292836, + "learning_rate": 3.4835154122313846e-06, + "loss": 0.79490483, + "num_input_tokens_seen": 92387755, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.16772461, + "step": 4274, + "time_per_iteration": 2.557929754257202 + }, + { + "auxiliary_loss_clip": 0.01155611, + "auxiliary_loss_mlp": 0.01041212, + "balance_loss_clip": 1.06089795, + "balance_loss_mlp": 1.02373564, + "epoch": 0.2570269051555689, + "flos": 27307838672640.0, + "grad_norm": 1.5651585513047486, + "language_loss": 0.84451735, + "learning_rate": 3.4832541846361743e-06, + "loss": 0.8664856, + "num_input_tokens_seen": 92409850, + "router_z_loss_clip": 0.94726562, + "router_z_loss_mlp": 0.17468262, + "step": 4275, + "time_per_iteration": 2.55161714553833 + }, + { + "auxiliary_loss_clip": 0.01155967, + "auxiliary_loss_mlp": 0.01038137, + "balance_loss_clip": 1.06222653, + "balance_loss_mlp": 1.0209589, + "epoch": 0.2570870284082369, + "flos": 27563414918400.0, + "grad_norm": 4.657110270532674, + "language_loss": 0.78853035, + "learning_rate": 3.4829929007958175e-06, + "loss": 0.81047136, + "num_input_tokens_seen": 92431250, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.17175293, + "step": 4276, + "time_per_iteration": 2.591639280319214 + }, + { + "auxiliary_loss_clip": 0.01158935, + "auxiliary_loss_mlp": 0.0104385, + "balance_loss_clip": 1.06624055, + "balance_loss_mlp": 1.02782774, + "epoch": 0.25714715166090485, + "flos": 28730834847360.0, + "grad_norm": 1.735080558076673, + "language_loss": 0.79287648, + "learning_rate": 3.4827315607202214e-06, + "loss": 0.81490427, + "num_input_tokens_seen": 92452065, + "router_z_loss_clip": 0.92773438, + "router_z_loss_mlp": 0.16003418, + "step": 4277, + "time_per_iteration": 2.5380990505218506 + }, + { + "auxiliary_loss_clip": 0.01161624, + "auxiliary_loss_mlp": 0.01036913, + "balance_loss_clip": 1.06809449, + "balance_loss_mlp": 1.02098703, + "epoch": 0.2572072749135728, + "flos": 20116152305280.0, + "grad_norm": 2.422364207572655, + "language_loss": 0.78644443, + "learning_rate": 3.482470164419295e-06, + "loss": 0.80842984, + "num_input_tokens_seen": 92470025, + "router_z_loss_clip": 0.93505859, + "router_z_loss_mlp": 0.15917969, + "step": 4278, + "time_per_iteration": 4.066589593887329 + }, + { + "auxiliary_loss_clip": 0.01156117, + "auxiliary_loss_mlp": 0.0103704, + "balance_loss_clip": 1.06194782, + "balance_loss_mlp": 1.02067828, + "epoch": 0.2572673981662408, + "flos": 26030855283840.0, + "grad_norm": 1.9635404367645994, + "language_loss": 0.74549568, + "learning_rate": 3.482208711902952e-06, + "loss": 0.76742721, + "num_input_tokens_seen": 92489825, + "router_z_loss_clip": 0.93994141, + "router_z_loss_mlp": 0.16363525, + "step": 4279, + "time_per_iteration": 2.5470592975616455 + }, + { + "auxiliary_loss_clip": 0.01159193, + "auxiliary_loss_mlp": 0.01050754, + "balance_loss_clip": 1.06461, + "balance_loss_mlp": 1.03354049, + "epoch": 0.25732752141890874, + "flos": 16106618695680.0, + "grad_norm": 2.3296282760048084, + "language_loss": 0.85145855, + "learning_rate": 3.4819472031811065e-06, + "loss": 0.87355798, + "num_input_tokens_seen": 92507270, + "router_z_loss_clip": 0.9453125, + "router_z_loss_mlp": 0.17224121, + "step": 4280, + "time_per_iteration": 2.4827840328216553 + }, + { + "auxiliary_loss_clip": 0.01155088, + "auxiliary_loss_mlp": 0.01042715, + "balance_loss_clip": 1.06129694, + "balance_loss_mlp": 1.02565598, + "epoch": 0.2573876446715767, + "flos": 22524429519360.0, + "grad_norm": 3.6506759122388797, + "language_loss": 0.79541469, + "learning_rate": 3.4816856382636744e-06, + "loss": 0.81739271, + "num_input_tokens_seen": 92526300, + "router_z_loss_clip": 0.93847656, + "router_z_loss_mlp": 0.1706543, + "step": 4281, + "time_per_iteration": 2.5245957374572754 + }, + { + "auxiliary_loss_clip": 0.01151344, + "auxiliary_loss_mlp": 0.01045929, + "balance_loss_clip": 1.05804515, + "balance_loss_mlp": 1.02736759, + "epoch": 0.2574477679242447, + "flos": 23950837486080.0, + "grad_norm": 2.028741491701517, + "language_loss": 0.87329435, + "learning_rate": 3.4814240171605737e-06, + "loss": 0.89526707, + "num_input_tokens_seen": 92546465, + "router_z_loss_clip": 0.93261719, + "router_z_loss_mlp": 0.18554688, + "step": 4282, + "time_per_iteration": 2.567286968231201 + }, + { + "auxiliary_loss_clip": 0.01162335, + "auxiliary_loss_mlp": 0.0104101, + "balance_loss_clip": 1.06447458, + "balance_loss_mlp": 1.02486897, + "epoch": 0.2575078911769127, + "flos": 21981711951360.0, + "grad_norm": 1.564270119025012, + "language_loss": 0.70508242, + "learning_rate": 3.4811623398817267e-06, + "loss": 0.72711587, + "num_input_tokens_seen": 92567260, + "router_z_loss_clip": 0.97949219, + "router_z_loss_mlp": 0.16137695, + "step": 4283, + "time_per_iteration": 2.485328197479248 + }, + { + "auxiliary_loss_clip": 0.01152097, + "auxiliary_loss_mlp": 0.01042812, + "balance_loss_clip": 1.06403029, + "balance_loss_mlp": 1.0268023, + "epoch": 0.25756801442958066, + "flos": 21945406279680.0, + "grad_norm": 2.0477008670160886, + "language_loss": 0.80993271, + "learning_rate": 3.4809006064370553e-06, + "loss": 0.83188176, + "num_input_tokens_seen": 92585425, + "router_z_loss_clip": 0.87988281, + "router_z_loss_mlp": 0.15991211, + "step": 4284, + "time_per_iteration": 2.469609260559082 + }, + { + "auxiliary_loss_clip": 0.01149941, + "auxiliary_loss_mlp": 0.01037579, + "balance_loss_clip": 1.0586977, + "balance_loss_mlp": 1.02239215, + "epoch": 0.2576281376822486, + "flos": 35261980058880.0, + "grad_norm": 2.2129066162263697, + "language_loss": 0.70245832, + "learning_rate": 3.4806388168364835e-06, + "loss": 0.72433352, + "num_input_tokens_seen": 92604770, + "router_z_loss_clip": 0.91308594, + "router_z_loss_mlp": 0.15185547, + "step": 4285, + "time_per_iteration": 2.6274189949035645 + }, + { + "auxiliary_loss_clip": 0.01152974, + "auxiliary_loss_mlp": 0.0103929, + "balance_loss_clip": 1.06110978, + "balance_loss_mlp": 1.02382898, + "epoch": 0.2576882609349166, + "flos": 14132285688960.0, + "grad_norm": 1.738196999690063, + "language_loss": 0.58205664, + "learning_rate": 3.4803769710899402e-06, + "loss": 0.60397923, + "num_input_tokens_seen": 92622635, + "router_z_loss_clip": 0.91845703, + "router_z_loss_mlp": 0.15478516, + "step": 4286, + "time_per_iteration": 2.462076425552368 + }, + { + "auxiliary_loss_clip": 0.01159966, + "auxiliary_loss_mlp": 0.01046171, + "balance_loss_clip": 1.0636903, + "balance_loss_mlp": 1.03007793, + "epoch": 0.25774838418758456, + "flos": 23258336204160.0, + "grad_norm": 2.021662942936262, + "language_loss": 0.64281881, + "learning_rate": 3.480115069207354e-06, + "loss": 0.66488016, + "num_input_tokens_seen": 92642960, + "router_z_loss_clip": 0.96386719, + "router_z_loss_mlp": 0.16101074, + "step": 4287, + "time_per_iteration": 2.4767582416534424 + }, + { + "auxiliary_loss_clip": 0.01160784, + "auxiliary_loss_mlp": 0.01038895, + "balance_loss_clip": 1.06505442, + "balance_loss_mlp": 1.0215385, + "epoch": 0.2578085074402525, + "flos": 22601745544320.0, + "grad_norm": 1.9640289195854586, + "language_loss": 0.72128445, + "learning_rate": 3.4798531111986557e-06, + "loss": 0.74328125, + "num_input_tokens_seen": 92662455, + "router_z_loss_clip": 0.95849609, + "router_z_loss_mlp": 0.17358398, + "step": 4288, + "time_per_iteration": 3.852013349533081 + }, + { + "auxiliary_loss_clip": 0.01153127, + "auxiliary_loss_mlp": 0.01039739, + "balance_loss_clip": 1.06195998, + "balance_loss_mlp": 1.02445602, + "epoch": 0.2578686306929205, + "flos": 24571840746240.0, + "grad_norm": 1.4998087143197112, + "language_loss": 0.77314329, + "learning_rate": 3.4795910970737786e-06, + "loss": 0.79507196, + "num_input_tokens_seen": 92683520, + "router_z_loss_clip": 0.91259766, + "router_z_loss_mlp": 0.152771, + "step": 4289, + "time_per_iteration": 2.5557169914245605 + }, + { + "auxiliary_loss_clip": 0.01145155, + "auxiliary_loss_mlp": 0.0103808, + "balance_loss_clip": 1.0540992, + "balance_loss_mlp": 1.02151012, + "epoch": 0.25792875394558845, + "flos": 18113953322880.0, + "grad_norm": 2.074053487605364, + "language_loss": 0.85314596, + "learning_rate": 3.4793290268426592e-06, + "loss": 0.8749783, + "num_input_tokens_seen": 92701450, + "router_z_loss_clip": 0.91015625, + "router_z_loss_mlp": 0.16577148, + "step": 4290, + "time_per_iteration": 4.05975866317749 + }, + { + "auxiliary_loss_clip": 0.01157115, + "auxiliary_loss_mlp": 0.01045858, + "balance_loss_clip": 1.0625174, + "balance_loss_mlp": 1.0283705, + "epoch": 0.2579888771982564, + "flos": 17712902995200.0, + "grad_norm": 1.703897340512748, + "language_loss": 0.72570282, + "learning_rate": 3.4790669005152354e-06, + "loss": 0.74773258, + "num_input_tokens_seen": 92720355, + "router_z_loss_clip": 0.94628906, + "router_z_loss_mlp": 0.17480469, + "step": 4291, + "time_per_iteration": 2.4867026805877686 + }, + { + "auxiliary_loss_clip": 0.01156915, + "auxiliary_loss_mlp": 0.01040043, + "balance_loss_clip": 1.06344724, + "balance_loss_mlp": 1.02323449, + "epoch": 0.2580490004509244, + "flos": 16434878112000.0, + "grad_norm": 2.4888699829130774, + "language_loss": 0.80833375, + "learning_rate": 3.4788047181014458e-06, + "loss": 0.83030337, + "num_input_tokens_seen": 92736755, + "router_z_loss_clip": 0.93554688, + "router_z_loss_mlp": 0.16784668, + "step": 4292, + "time_per_iteration": 3.896181106567383 + }, + { + "auxiliary_loss_clip": 0.01155381, + "auxiliary_loss_mlp": 0.01042789, + "balance_loss_clip": 1.06030416, + "balance_loss_mlp": 1.02621937, + "epoch": 0.25810912370359235, + "flos": 33835141128960.0, + "grad_norm": 2.6269306940963686, + "language_loss": 0.6777097, + "learning_rate": 3.4785424796112337e-06, + "loss": 0.69969141, + "num_input_tokens_seen": 92757655, + "router_z_loss_clip": 0.95117188, + "router_z_loss_mlp": 0.16564941, + "step": 4293, + "time_per_iteration": 2.5877606868743896 + }, + { + "auxiliary_loss_clip": 0.01164004, + "auxiliary_loss_mlp": 0.01043847, + "balance_loss_clip": 1.06993735, + "balance_loss_mlp": 1.02838516, + "epoch": 0.2581692469562603, + "flos": 25192197561600.0, + "grad_norm": 1.782490273326548, + "language_loss": 0.75042605, + "learning_rate": 3.478280185054542e-06, + "loss": 0.77250457, + "num_input_tokens_seen": 92776100, + "router_z_loss_clip": 0.94091797, + "router_z_loss_mlp": 0.15447998, + "step": 4294, + "time_per_iteration": 2.4983646869659424 + }, + { + "auxiliary_loss_clip": 0.01150618, + "auxiliary_loss_mlp": 0.01045244, + "balance_loss_clip": 1.05999207, + "balance_loss_mlp": 1.02872193, + "epoch": 0.2582293702089283, + "flos": 34932212271360.0, + "grad_norm": 2.0481178897801224, + "language_loss": 0.80899835, + "learning_rate": 3.478017834441318e-06, + "loss": 0.83095706, + "num_input_tokens_seen": 92798880, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.16516113, + "step": 4295, + "time_per_iteration": 2.6042535305023193 + }, + { + "auxiliary_loss_clip": 0.01162535, + "auxiliary_loss_mlp": 0.01048443, + "balance_loss_clip": 1.06583714, + "balance_loss_mlp": 1.03102624, + "epoch": 0.2582894934615963, + "flos": 26833746038400.0, + "grad_norm": 3.0139379287899186, + "language_loss": 0.72756696, + "learning_rate": 3.4777554277815096e-06, + "loss": 0.7496767, + "num_input_tokens_seen": 92817750, + "router_z_loss_clip": 0.96728516, + "router_z_loss_mlp": 0.17419434, + "step": 4296, + "time_per_iteration": 2.5234036445617676 + }, + { + "auxiliary_loss_clip": 0.01152811, + "auxiliary_loss_mlp": 0.01040173, + "balance_loss_clip": 1.05991352, + "balance_loss_mlp": 1.02332914, + "epoch": 0.25834961671426426, + "flos": 23515241253120.0, + "grad_norm": 1.5574633347371036, + "language_loss": 0.86857575, + "learning_rate": 3.477492965085067e-06, + "loss": 0.89050555, + "num_input_tokens_seen": 92837995, + "router_z_loss_clip": 0.92822266, + "router_z_loss_mlp": 0.16845703, + "step": 4297, + "time_per_iteration": 2.5095055103302 + }, + { + "auxiliary_loss_clip": 0.01151429, + "auxiliary_loss_mlp": 0.01049891, + "balance_loss_clip": 1.05667317, + "balance_loss_mlp": 1.03388143, + "epoch": 0.25840973996693223, + "flos": 22451028076800.0, + "grad_norm": 1.872868877855777, + "language_loss": 0.84629458, + "learning_rate": 3.477230446361943e-06, + "loss": 0.86830783, + "num_input_tokens_seen": 92857245, + "router_z_loss_clip": 0.94726562, + "router_z_loss_mlp": 0.16015625, + "step": 4298, + "time_per_iteration": 2.535548210144043 + }, + { + "auxiliary_loss_clip": 0.01148366, + "auxiliary_loss_mlp": 0.01040038, + "balance_loss_clip": 1.05597925, + "balance_loss_mlp": 1.0221808, + "epoch": 0.2584698632196002, + "flos": 11290854366720.0, + "grad_norm": 2.3061125998202985, + "language_loss": 0.83241498, + "learning_rate": 3.4769678716220927e-06, + "loss": 0.85429901, + "num_input_tokens_seen": 92873265, + "router_z_loss_clip": 0.92382812, + "router_z_loss_mlp": 0.17858887, + "step": 4299, + "time_per_iteration": 2.515231132507324 + }, + { + "auxiliary_loss_clip": 0.01146253, + "auxiliary_loss_mlp": 0.01037194, + "balance_loss_clip": 1.05706263, + "balance_loss_mlp": 1.0221082, + "epoch": 0.25852998647226816, + "flos": 17929982839680.0, + "grad_norm": 5.6125306668895085, + "language_loss": 0.82694507, + "learning_rate": 3.4767052408754726e-06, + "loss": 0.84877944, + "num_input_tokens_seen": 92890880, + "router_z_loss_clip": 0.89160156, + "router_z_loss_mlp": 0.15093994, + "step": 4300, + "time_per_iteration": 2.5636911392211914 + }, + { + "auxiliary_loss_clip": 0.01158467, + "auxiliary_loss_mlp": 0.01045833, + "balance_loss_clip": 1.06112075, + "balance_loss_mlp": 1.02870297, + "epoch": 0.2585901097249361, + "flos": 33256117889280.0, + "grad_norm": 3.084759664234966, + "language_loss": 0.67475361, + "learning_rate": 3.4764425541320417e-06, + "loss": 0.69679654, + "num_input_tokens_seen": 92910770, + "router_z_loss_clip": 0.97265625, + "router_z_loss_mlp": 0.17138672, + "step": 4301, + "time_per_iteration": 2.572791814804077 + }, + { + "auxiliary_loss_clip": 0.01156516, + "auxiliary_loss_mlp": 0.01043593, + "balance_loss_clip": 1.06101489, + "balance_loss_mlp": 1.02712989, + "epoch": 0.2586502329776041, + "flos": 18441278985600.0, + "grad_norm": 2.976471492057617, + "language_loss": 0.81666255, + "learning_rate": 3.4761798114017617e-06, + "loss": 0.83866358, + "num_input_tokens_seen": 92929520, + "router_z_loss_clip": 0.95507812, + "router_z_loss_mlp": 0.16479492, + "step": 4302, + "time_per_iteration": 2.591787815093994 + }, + { + "auxiliary_loss_clip": 0.01154921, + "auxiliary_loss_mlp": 0.01044899, + "balance_loss_clip": 1.06151962, + "balance_loss_mlp": 1.02884126, + "epoch": 0.25871035623027205, + "flos": 17968120104960.0, + "grad_norm": 1.9210906121169, + "language_loss": 0.92260665, + "learning_rate": 3.475917012694595e-06, + "loss": 0.94460487, + "num_input_tokens_seen": 92947890, + "router_z_loss_clip": 0.93359375, + "router_z_loss_mlp": 0.16064453, + "step": 4303, + "time_per_iteration": 2.4541773796081543 + }, + { + "auxiliary_loss_clip": 0.01153588, + "auxiliary_loss_mlp": 0.01042448, + "balance_loss_clip": 1.06162965, + "balance_loss_mlp": 1.02666473, + "epoch": 0.25877047948294, + "flos": 27777729415680.0, + "grad_norm": 2.3631672353142292, + "language_loss": 0.67315781, + "learning_rate": 3.475654158020507e-06, + "loss": 0.69511825, + "num_input_tokens_seen": 92967690, + "router_z_loss_clip": 0.91894531, + "router_z_loss_mlp": 0.15753174, + "step": 4304, + "time_per_iteration": 2.601391077041626 + }, + { + "auxiliary_loss_clip": 0.01152271, + "auxiliary_loss_mlp": 0.01045497, + "balance_loss_clip": 1.05872822, + "balance_loss_mlp": 1.02940392, + "epoch": 0.258830602735608, + "flos": 27125843437440.0, + "grad_norm": 2.0753937417153203, + "language_loss": 0.72674108, + "learning_rate": 3.4753912473894657e-06, + "loss": 0.74871874, + "num_input_tokens_seen": 92986830, + "router_z_loss_clip": 0.93603516, + "router_z_loss_mlp": 0.16101074, + "step": 4305, + "time_per_iteration": 2.6331260204315186 + }, + { + "auxiliary_loss_clip": 0.01152815, + "auxiliary_loss_mlp": 0.01046355, + "balance_loss_clip": 1.0579226, + "balance_loss_mlp": 1.02952266, + "epoch": 0.25889072598827595, + "flos": 17891486438400.0, + "grad_norm": 3.345248133317033, + "language_loss": 0.75743711, + "learning_rate": 3.4751282808114403e-06, + "loss": 0.77942884, + "num_input_tokens_seen": 93002740, + "router_z_loss_clip": 0.94873047, + "router_z_loss_mlp": 0.16833496, + "step": 4306, + "time_per_iteration": 2.423133373260498 + }, + { + "auxiliary_loss_clip": 0.0108888, + "auxiliary_loss_mlp": 0.01014451, + "balance_loss_clip": 1.05563712, + "balance_loss_mlp": 1.01250196, + "epoch": 0.2589508492409439, + "flos": 53934955724160.0, + "grad_norm": 0.8280482662243644, + "language_loss": 0.57068253, + "learning_rate": 3.474865258296403e-06, + "loss": 0.59171587, + "num_input_tokens_seen": 93058645, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.01947021, + "step": 4307, + "time_per_iteration": 3.0464000701904297 + }, + { + "auxiliary_loss_clip": 0.01163121, + "auxiliary_loss_mlp": 0.01039131, + "balance_loss_clip": 1.07092631, + "balance_loss_mlp": 1.02353871, + "epoch": 0.2590109724936119, + "flos": 22125785402880.0, + "grad_norm": 1.6852320090129997, + "language_loss": 0.71570992, + "learning_rate": 3.474602179854327e-06, + "loss": 0.73773241, + "num_input_tokens_seen": 93077140, + "router_z_loss_clip": 0.92236328, + "router_z_loss_mlp": 0.15600586, + "step": 4308, + "time_per_iteration": 2.4639580249786377 + }, + { + "auxiliary_loss_clip": 0.01156854, + "auxiliary_loss_mlp": 0.01043343, + "balance_loss_clip": 1.06273913, + "balance_loss_mlp": 1.0271306, + "epoch": 0.2590710957462799, + "flos": 13474294398720.0, + "grad_norm": 1.963972224745695, + "language_loss": 0.84584045, + "learning_rate": 3.4743390454951886e-06, + "loss": 0.86784238, + "num_input_tokens_seen": 93093580, + "router_z_loss_clip": 0.94091797, + "router_z_loss_mlp": 0.16210938, + "step": 4309, + "time_per_iteration": 2.487161159515381 + }, + { + "auxiliary_loss_clip": 0.01151567, + "auxiliary_loss_mlp": 0.01045765, + "balance_loss_clip": 1.06121111, + "balance_loss_mlp": 1.03091776, + "epoch": 0.25913121899894787, + "flos": 22307098279680.0, + "grad_norm": 1.5478415736949183, + "language_loss": 0.84873325, + "learning_rate": 3.474075855228966e-06, + "loss": 0.87070656, + "num_input_tokens_seen": 93112345, + "router_z_loss_clip": 0.90429688, + "router_z_loss_mlp": 0.14849854, + "step": 4310, + "time_per_iteration": 2.453198194503784 + }, + { + "auxiliary_loss_clip": 0.01159378, + "auxiliary_loss_mlp": 0.01047873, + "balance_loss_clip": 1.0648793, + "balance_loss_mlp": 1.03170872, + "epoch": 0.25919134225161583, + "flos": 25811728364160.0, + "grad_norm": 2.954620565198068, + "language_loss": 0.77336162, + "learning_rate": 3.473812609065639e-06, + "loss": 0.79543412, + "num_input_tokens_seen": 93131545, + "router_z_loss_clip": 0.9453125, + "router_z_loss_mlp": 0.16162109, + "step": 4311, + "time_per_iteration": 2.5525264739990234 + }, + { + "auxiliary_loss_clip": 0.01154662, + "auxiliary_loss_mlp": 0.0104467, + "balance_loss_clip": 1.0610863, + "balance_loss_mlp": 1.0289942, + "epoch": 0.2592514655042838, + "flos": 31212262108800.0, + "grad_norm": 1.8242731156393104, + "language_loss": 0.73028719, + "learning_rate": 3.4735493070151904e-06, + "loss": 0.75228053, + "num_input_tokens_seen": 93150730, + "router_z_loss_clip": 0.93554688, + "router_z_loss_mlp": 0.15661621, + "step": 4312, + "time_per_iteration": 2.543579578399658 + }, + { + "auxiliary_loss_clip": 0.01154329, + "auxiliary_loss_mlp": 0.01040093, + "balance_loss_clip": 1.0627445, + "balance_loss_mlp": 1.02435708, + "epoch": 0.25931158875695176, + "flos": 18474998878080.0, + "grad_norm": 1.7392926880358603, + "language_loss": 0.69846129, + "learning_rate": 3.4732859490876044e-06, + "loss": 0.72040558, + "num_input_tokens_seen": 93167895, + "router_z_loss_clip": 0.91552734, + "router_z_loss_mlp": 0.15734863, + "step": 4313, + "time_per_iteration": 2.461186408996582 + }, + { + "auxiliary_loss_clip": 0.01148313, + "auxiliary_loss_mlp": 0.01050198, + "balance_loss_clip": 1.05780935, + "balance_loss_mlp": 1.03396153, + "epoch": 0.2593717120096197, + "flos": 19207935895680.0, + "grad_norm": 1.5910384283624097, + "language_loss": 0.8028236, + "learning_rate": 3.473022535292867e-06, + "loss": 0.82480872, + "num_input_tokens_seen": 93187650, + "router_z_loss_clip": 0.90527344, + "router_z_loss_mlp": 0.16223145, + "step": 4314, + "time_per_iteration": 2.5741827487945557 + }, + { + "auxiliary_loss_clip": 0.01155616, + "auxiliary_loss_mlp": 0.01045253, + "balance_loss_clip": 1.06066632, + "balance_loss_mlp": 1.02824175, + "epoch": 0.2594318352622877, + "flos": 31248100903680.0, + "grad_norm": 2.208344929073346, + "language_loss": 0.67000556, + "learning_rate": 3.472759065640968e-06, + "loss": 0.69201428, + "num_input_tokens_seen": 93207370, + "router_z_loss_clip": 0.94921875, + "router_z_loss_mlp": 0.17016602, + "step": 4315, + "time_per_iteration": 2.588083267211914 + }, + { + "auxiliary_loss_clip": 0.01142313, + "auxiliary_loss_mlp": 0.01050353, + "balance_loss_clip": 1.05262566, + "balance_loss_mlp": 1.03430796, + "epoch": 0.25949195851495566, + "flos": 22237144542720.0, + "grad_norm": 1.5752859366983218, + "language_loss": 0.79248559, + "learning_rate": 3.4724955401418976e-06, + "loss": 0.81441236, + "num_input_tokens_seen": 93227925, + "router_z_loss_clip": 0.89648438, + "router_z_loss_mlp": 0.16040039, + "step": 4316, + "time_per_iteration": 2.508314847946167 + }, + { + "auxiliary_loss_clip": 0.01150057, + "auxiliary_loss_mlp": 0.01045915, + "balance_loss_clip": 1.05610991, + "balance_loss_mlp": 1.02823591, + "epoch": 0.2595520817676236, + "flos": 28075716645120.0, + "grad_norm": 1.6056214718115793, + "language_loss": 0.77929801, + "learning_rate": 3.4722319588056487e-06, + "loss": 0.80125773, + "num_input_tokens_seen": 93250020, + "router_z_loss_clip": 0.93945312, + "router_z_loss_mlp": 0.17663574, + "step": 4317, + "time_per_iteration": 2.5815656185150146 + }, + { + "auxiliary_loss_clip": 0.01156042, + "auxiliary_loss_mlp": 0.0105419, + "balance_loss_clip": 1.06351733, + "balance_loss_mlp": 1.03752494, + "epoch": 0.2596122050202916, + "flos": 20190954378240.0, + "grad_norm": 2.3968907884251975, + "language_loss": 0.77612376, + "learning_rate": 3.4719683216422163e-06, + "loss": 0.79822612, + "num_input_tokens_seen": 93269070, + "router_z_loss_clip": 0.92578125, + "router_z_loss_mlp": 0.16674805, + "step": 4318, + "time_per_iteration": 2.5402116775512695 + }, + { + "auxiliary_loss_clip": 0.01147341, + "auxiliary_loss_mlp": 0.01037527, + "balance_loss_clip": 1.05653858, + "balance_loss_mlp": 1.0210402, + "epoch": 0.25967232827295955, + "flos": 22527949052160.0, + "grad_norm": 1.6395784168848355, + "language_loss": 0.76551819, + "learning_rate": 3.471704628661598e-06, + "loss": 0.78736687, + "num_input_tokens_seen": 93290250, + "router_z_loss_clip": 0.90771484, + "router_z_loss_mlp": 0.16479492, + "step": 4319, + "time_per_iteration": 2.629988431930542 + }, + { + "auxiliary_loss_clip": 0.0115383, + "auxiliary_loss_mlp": 0.01038475, + "balance_loss_clip": 1.06180286, + "balance_loss_mlp": 1.02259636, + "epoch": 0.2597324515256275, + "flos": 21068252156160.0, + "grad_norm": 3.8718267885946958, + "language_loss": 0.76469272, + "learning_rate": 3.4714408798737925e-06, + "loss": 0.78661579, + "num_input_tokens_seen": 93310090, + "router_z_loss_clip": 0.91992188, + "router_z_loss_mlp": 0.15893555, + "step": 4320, + "time_per_iteration": 2.48516583442688 + }, + { + "auxiliary_loss_clip": 0.01157033, + "auxiliary_loss_mlp": 0.01038447, + "balance_loss_clip": 1.06261659, + "balance_loss_mlp": 1.02228177, + "epoch": 0.2597925747782955, + "flos": 22050013662720.0, + "grad_norm": 1.4650143741987964, + "language_loss": 0.70874488, + "learning_rate": 3.471177075288801e-06, + "loss": 0.73069978, + "num_input_tokens_seen": 93329570, + "router_z_loss_clip": 0.94384766, + "router_z_loss_mlp": 0.16162109, + "step": 4321, + "time_per_iteration": 2.50260066986084 + }, + { + "auxiliary_loss_clip": 0.01155505, + "auxiliary_loss_mlp": 0.01039274, + "balance_loss_clip": 1.06062627, + "balance_loss_mlp": 1.02289486, + "epoch": 0.2598526980309635, + "flos": 19536949497600.0, + "grad_norm": 2.585462477365725, + "language_loss": 0.74647558, + "learning_rate": 3.4709132149166277e-06, + "loss": 0.76842332, + "num_input_tokens_seen": 93347920, + "router_z_loss_clip": 0.94824219, + "router_z_loss_mlp": 0.16381836, + "step": 4322, + "time_per_iteration": 2.4369008541107178 + }, + { + "auxiliary_loss_clip": 0.01166842, + "auxiliary_loss_mlp": 0.01039523, + "balance_loss_clip": 1.07204378, + "balance_loss_mlp": 1.02317929, + "epoch": 0.25991282128363147, + "flos": 24495207079680.0, + "grad_norm": 9.257020809168111, + "language_loss": 0.73766208, + "learning_rate": 3.470649298767278e-06, + "loss": 0.75972569, + "num_input_tokens_seen": 93367145, + "router_z_loss_clip": 0.94726562, + "router_z_loss_mlp": 0.16333008, + "step": 4323, + "time_per_iteration": 3.9325428009033203 + }, + { + "auxiliary_loss_clip": 0.01157339, + "auxiliary_loss_mlp": 0.01042099, + "balance_loss_clip": 1.05940318, + "balance_loss_mlp": 1.0249567, + "epoch": 0.25997294453629943, + "flos": 24201457655040.0, + "grad_norm": 2.0637493507539335, + "language_loss": 0.67326313, + "learning_rate": 3.4703853268507597e-06, + "loss": 0.69525754, + "num_input_tokens_seen": 93386555, + "router_z_loss_clip": 0.97851562, + "router_z_loss_mlp": 0.17138672, + "step": 4324, + "time_per_iteration": 2.517759084701538 + }, + { + "auxiliary_loss_clip": 0.01155588, + "auxiliary_loss_mlp": 0.01038967, + "balance_loss_clip": 1.06400156, + "balance_loss_mlp": 1.02401805, + "epoch": 0.2600330677889674, + "flos": 31431460855680.0, + "grad_norm": 1.8247206389174575, + "language_loss": 0.70372647, + "learning_rate": 3.470121299177082e-06, + "loss": 0.72567201, + "num_input_tokens_seen": 93405590, + "router_z_loss_clip": 0.91552734, + "router_z_loss_mlp": 0.14941406, + "step": 4325, + "time_per_iteration": 2.660820960998535 + }, + { + "auxiliary_loss_clip": 0.01151287, + "auxiliary_loss_mlp": 0.0103351, + "balance_loss_clip": 1.05896807, + "balance_loss_mlp": 1.017524, + "epoch": 0.26009319104163536, + "flos": 32266527217920.0, + "grad_norm": 1.9629274343092429, + "language_loss": 0.73105711, + "learning_rate": 3.469857215756257e-06, + "loss": 0.75290507, + "num_input_tokens_seen": 93424750, + "router_z_loss_clip": 0.92382812, + "router_z_loss_mlp": 0.15966797, + "step": 4326, + "time_per_iteration": 2.6229615211486816 + }, + { + "auxiliary_loss_clip": 0.01151714, + "auxiliary_loss_mlp": 0.01037047, + "balance_loss_clip": 1.06174576, + "balance_loss_mlp": 1.02182436, + "epoch": 0.26015331429430333, + "flos": 26286754752000.0, + "grad_norm": 1.9888479660570066, + "language_loss": 0.86647379, + "learning_rate": 3.4695930765982997e-06, + "loss": 0.88836139, + "num_input_tokens_seen": 93443465, + "router_z_loss_clip": 0.89941406, + "router_z_loss_mlp": 0.15222168, + "step": 4327, + "time_per_iteration": 2.5652072429656982 + }, + { + "auxiliary_loss_clip": 0.01156105, + "auxiliary_loss_mlp": 0.01051437, + "balance_loss_clip": 1.06082177, + "balance_loss_mlp": 1.03349614, + "epoch": 0.2602134375469713, + "flos": 21142335957120.0, + "grad_norm": 1.4607059263020978, + "language_loss": 0.80483228, + "learning_rate": 3.4693288817132255e-06, + "loss": 0.82690769, + "num_input_tokens_seen": 93462580, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.17932129, + "step": 4328, + "time_per_iteration": 2.748392105102539 + }, + { + "auxiliary_loss_clip": 0.01151825, + "auxiliary_loss_mlp": 0.01038035, + "balance_loss_clip": 1.0598228, + "balance_loss_mlp": 1.02233481, + "epoch": 0.26027356079963926, + "flos": 25921327737600.0, + "grad_norm": 1.488498630046163, + "language_loss": 0.87616003, + "learning_rate": 3.4690646311110525e-06, + "loss": 0.89805865, + "num_input_tokens_seen": 93482790, + "router_z_loss_clip": 0.92089844, + "router_z_loss_mlp": 0.15686035, + "step": 4329, + "time_per_iteration": 2.62408447265625 + }, + { + "auxiliary_loss_clip": 0.01148645, + "auxiliary_loss_mlp": 0.01039089, + "balance_loss_clip": 1.05860806, + "balance_loss_mlp": 1.0234251, + "epoch": 0.2603336840523072, + "flos": 26359222440960.0, + "grad_norm": 1.8561980735030523, + "language_loss": 0.77710092, + "learning_rate": 3.468800324801802e-06, + "loss": 0.79897827, + "num_input_tokens_seen": 93498795, + "router_z_loss_clip": 0.89941406, + "router_z_loss_mlp": 0.15661621, + "step": 4330, + "time_per_iteration": 2.54168963432312 + }, + { + "auxiliary_loss_clip": 0.01152126, + "auxiliary_loss_mlp": 0.01050173, + "balance_loss_clip": 1.05883145, + "balance_loss_mlp": 1.03291154, + "epoch": 0.2603938073049752, + "flos": 23513661054720.0, + "grad_norm": 1.4553000435047267, + "language_loss": 0.75325614, + "learning_rate": 3.4685359627954958e-06, + "loss": 0.77527916, + "num_input_tokens_seen": 93518335, + "router_z_loss_clip": 0.93212891, + "router_z_loss_mlp": 0.17260742, + "step": 4331, + "time_per_iteration": 2.5129454135894775 + }, + { + "auxiliary_loss_clip": 0.01162291, + "auxiliary_loss_mlp": 0.01042144, + "balance_loss_clip": 1.06750417, + "balance_loss_mlp": 1.02601528, + "epoch": 0.26045393055764315, + "flos": 25374300537600.0, + "grad_norm": 1.4115321654027941, + "language_loss": 0.6921705, + "learning_rate": 3.4682715451021584e-06, + "loss": 0.7142148, + "num_input_tokens_seen": 93539170, + "router_z_loss_clip": 0.94873047, + "router_z_loss_mlp": 0.16137695, + "step": 4332, + "time_per_iteration": 3.948504686355591 + }, + { + "auxiliary_loss_clip": 0.01157099, + "auxiliary_loss_mlp": 0.01051352, + "balance_loss_clip": 1.06134653, + "balance_loss_mlp": 1.03229058, + "epoch": 0.2605140538103111, + "flos": 27635272076160.0, + "grad_norm": 2.181747496384416, + "language_loss": 0.80345643, + "learning_rate": 3.4680070717318174e-06, + "loss": 0.82554096, + "num_input_tokens_seen": 93558480, + "router_z_loss_clip": 0.95556641, + "router_z_loss_mlp": 0.19067383, + "step": 4333, + "time_per_iteration": 2.5738956928253174 + }, + { + "auxiliary_loss_clip": 0.01157386, + "auxiliary_loss_mlp": 0.01048594, + "balance_loss_clip": 1.06412339, + "balance_loss_mlp": 1.032107, + "epoch": 0.2605741770629791, + "flos": 13769839503360.0, + "grad_norm": 1.81993275168395, + "language_loss": 0.80542874, + "learning_rate": 3.467742542694501e-06, + "loss": 0.82748854, + "num_input_tokens_seen": 93575220, + "router_z_loss_clip": 0.93212891, + "router_z_loss_mlp": 0.16479492, + "step": 4334, + "time_per_iteration": 3.8558802604675293 + }, + { + "auxiliary_loss_clip": 0.01148822, + "auxiliary_loss_mlp": 0.01044905, + "balance_loss_clip": 1.05734527, + "balance_loss_mlp": 1.02612996, + "epoch": 0.26063430031564705, + "flos": 26031681296640.0, + "grad_norm": 1.9296349637147758, + "language_loss": 0.80123878, + "learning_rate": 3.46747795800024e-06, + "loss": 0.82317603, + "num_input_tokens_seen": 93597015, + "router_z_loss_clip": 0.91455078, + "router_z_loss_mlp": 0.18786621, + "step": 4335, + "time_per_iteration": 2.540286064147949 + }, + { + "auxiliary_loss_clip": 0.01095522, + "auxiliary_loss_mlp": 0.01016777, + "balance_loss_clip": 1.06096363, + "balance_loss_mlp": 1.01482511, + "epoch": 0.26069442356831507, + "flos": 62443809820800.0, + "grad_norm": 0.8409651528388551, + "language_loss": 0.60815775, + "learning_rate": 3.467213317659068e-06, + "loss": 0.62928075, + "num_input_tokens_seen": 93657775, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 0.01953125, + "step": 4336, + "time_per_iteration": 4.67948317527771 + }, + { + "auxiliary_loss_clip": 0.0115731, + "auxiliary_loss_mlp": 0.01047002, + "balance_loss_clip": 1.06214952, + "balance_loss_mlp": 1.03039598, + "epoch": 0.26075454682098304, + "flos": 13626376583040.0, + "grad_norm": 2.056076039842931, + "language_loss": 0.77185452, + "learning_rate": 3.46694862168102e-06, + "loss": 0.79389763, + "num_input_tokens_seen": 93676145, + "router_z_loss_clip": 0.95068359, + "router_z_loss_mlp": 0.16601562, + "step": 4337, + "time_per_iteration": 2.63812518119812 + }, + { + "auxiliary_loss_clip": 0.01160923, + "auxiliary_loss_mlp": 0.01042434, + "balance_loss_clip": 1.06530929, + "balance_loss_mlp": 1.0247314, + "epoch": 0.260814670073651, + "flos": 12126531260160.0, + "grad_norm": 1.924465606436881, + "language_loss": 0.74720418, + "learning_rate": 3.4666838700761334e-06, + "loss": 0.76923776, + "num_input_tokens_seen": 93692480, + "router_z_loss_clip": 0.95556641, + "router_z_loss_mlp": 0.17712402, + "step": 4338, + "time_per_iteration": 2.7590742111206055 + }, + { + "auxiliary_loss_clip": 0.0117129, + "auxiliary_loss_mlp": 0.01046542, + "balance_loss_clip": 1.0710988, + "balance_loss_mlp": 1.02939939, + "epoch": 0.26087479332631897, + "flos": 15122522805120.0, + "grad_norm": 2.6414615958741647, + "language_loss": 0.80705178, + "learning_rate": 3.466419062854447e-06, + "loss": 0.82923007, + "num_input_tokens_seen": 93710165, + "router_z_loss_clip": 1.00292969, + "router_z_loss_mlp": 0.17138672, + "step": 4339, + "time_per_iteration": 2.602385997772217 + }, + { + "auxiliary_loss_clip": 0.01154857, + "auxiliary_loss_mlp": 0.01037224, + "balance_loss_clip": 1.06187522, + "balance_loss_mlp": 1.02175057, + "epoch": 0.26093491657898693, + "flos": 24680937329280.0, + "grad_norm": 2.0819780761758206, + "language_loss": 0.76930702, + "learning_rate": 3.4661542000260033e-06, + "loss": 0.79122788, + "num_input_tokens_seen": 93730185, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 0.15466309, + "step": 4340, + "time_per_iteration": 2.5308268070220947 + }, + { + "auxiliary_loss_clip": 0.01156022, + "auxiliary_loss_mlp": 0.01040328, + "balance_loss_clip": 1.06184506, + "balance_loss_mlp": 1.02422249, + "epoch": 0.2609950398316549, + "flos": 25116138512640.0, + "grad_norm": 1.6249744402707007, + "language_loss": 0.82790422, + "learning_rate": 3.465889281600845e-06, + "loss": 0.8498677, + "num_input_tokens_seen": 93747690, + "router_z_loss_clip": 0.94042969, + "router_z_loss_mlp": 0.16101074, + "step": 4341, + "time_per_iteration": 2.488779067993164 + }, + { + "auxiliary_loss_clip": 0.01153976, + "auxiliary_loss_mlp": 0.01040061, + "balance_loss_clip": 1.06090522, + "balance_loss_mlp": 1.02313304, + "epoch": 0.26105516308432286, + "flos": 28548588216960.0, + "grad_norm": 3.2523495170034162, + "language_loss": 0.76839912, + "learning_rate": 3.4656243075890183e-06, + "loss": 0.79033947, + "num_input_tokens_seen": 93767405, + "router_z_loss_clip": 0.92919922, + "router_z_loss_mlp": 0.16918945, + "step": 4342, + "time_per_iteration": 2.5331380367279053 + }, + { + "auxiliary_loss_clip": 0.011547, + "auxiliary_loss_mlp": 0.01031387, + "balance_loss_clip": 1.06158209, + "balance_loss_mlp": 1.0147934, + "epoch": 0.2611152863369908, + "flos": 39530609447040.0, + "grad_norm": 1.6990471431082064, + "language_loss": 0.65951592, + "learning_rate": 3.4653592780005707e-06, + "loss": 0.68137681, + "num_input_tokens_seen": 93789950, + "router_z_loss_clip": 0.93164062, + "router_z_loss_mlp": 0.16589355, + "step": 4343, + "time_per_iteration": 2.614473581314087 + }, + { + "auxiliary_loss_clip": 0.01159162, + "auxiliary_loss_mlp": 0.0104241, + "balance_loss_clip": 1.06318092, + "balance_loss_mlp": 1.02535117, + "epoch": 0.2611754095896588, + "flos": 13735329511680.0, + "grad_norm": 2.002825017807114, + "language_loss": 0.73312724, + "learning_rate": 3.465094192845553e-06, + "loss": 0.75514299, + "num_input_tokens_seen": 93807835, + "router_z_loss_clip": 0.95996094, + "router_z_loss_mlp": 0.17053223, + "step": 4344, + "time_per_iteration": 2.4614338874816895 + }, + { + "auxiliary_loss_clip": 0.0115578, + "auxiliary_loss_mlp": 0.01040057, + "balance_loss_clip": 1.06267667, + "balance_loss_mlp": 1.02345121, + "epoch": 0.26123553284232676, + "flos": 21506649649920.0, + "grad_norm": 2.2811263146967615, + "language_loss": 0.86931139, + "learning_rate": 3.4648290521340165e-06, + "loss": 0.8912698, + "num_input_tokens_seen": 93825670, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 0.1661377, + "step": 4345, + "time_per_iteration": 2.4462709426879883 + }, + { + "auxiliary_loss_clip": 0.01154149, + "auxiliary_loss_mlp": 0.01042628, + "balance_loss_clip": 1.06228423, + "balance_loss_mlp": 1.02651048, + "epoch": 0.2612956560949947, + "flos": 21139786091520.0, + "grad_norm": 2.1323799077817784, + "language_loss": 0.76442993, + "learning_rate": 3.464563855876015e-06, + "loss": 0.7863977, + "num_input_tokens_seen": 93844045, + "router_z_loss_clip": 0.91845703, + "router_z_loss_mlp": 0.16119385, + "step": 4346, + "time_per_iteration": 2.4956140518188477 + }, + { + "auxiliary_loss_clip": 0.01153357, + "auxiliary_loss_mlp": 0.01042543, + "balance_loss_clip": 1.05980301, + "balance_loss_mlp": 1.02571094, + "epoch": 0.2613557793476627, + "flos": 25119011600640.0, + "grad_norm": 1.5536589875778193, + "language_loss": 0.75676948, + "learning_rate": 3.464298604081606e-06, + "loss": 0.77872849, + "num_input_tokens_seen": 93864380, + "router_z_loss_clip": 0.93652344, + "router_z_loss_mlp": 0.16821289, + "step": 4347, + "time_per_iteration": 2.4910266399383545 + }, + { + "auxiliary_loss_clip": 0.01156379, + "auxiliary_loss_mlp": 0.01039706, + "balance_loss_clip": 1.06375527, + "balance_loss_mlp": 1.02402997, + "epoch": 0.26141590260033065, + "flos": 26067699659520.0, + "grad_norm": 1.4924812343813798, + "language_loss": 0.73492748, + "learning_rate": 3.4640332967608476e-06, + "loss": 0.75688827, + "num_input_tokens_seen": 93885475, + "router_z_loss_clip": 0.92675781, + "router_z_loss_mlp": 0.15686035, + "step": 4348, + "time_per_iteration": 2.5418202877044678 + }, + { + "auxiliary_loss_clip": 0.01159648, + "auxiliary_loss_mlp": 0.01046257, + "balance_loss_clip": 1.06398082, + "balance_loss_mlp": 1.02994871, + "epoch": 0.2614760258529987, + "flos": 25701518459520.0, + "grad_norm": 2.4399180556057773, + "language_loss": 0.91074896, + "learning_rate": 3.463767933923799e-06, + "loss": 0.93280804, + "num_input_tokens_seen": 93905545, + "router_z_loss_clip": 0.95703125, + "router_z_loss_mlp": 0.16308594, + "step": 4349, + "time_per_iteration": 2.4707424640655518 + }, + { + "auxiliary_loss_clip": 0.01170403, + "auxiliary_loss_mlp": 0.01036527, + "balance_loss_clip": 1.07630396, + "balance_loss_mlp": 1.02058911, + "epoch": 0.26153614910566664, + "flos": 17457147181440.0, + "grad_norm": 1.7701298954687628, + "language_loss": 0.79955947, + "learning_rate": 3.463502515580524e-06, + "loss": 0.82162881, + "num_input_tokens_seen": 93924185, + "router_z_loss_clip": 0.94091797, + "router_z_loss_mlp": 0.15942383, + "step": 4350, + "time_per_iteration": 2.515550374984741 + }, + { + "auxiliary_loss_clip": 0.01155772, + "auxiliary_loss_mlp": 0.01043085, + "balance_loss_clip": 1.06711674, + "balance_loss_mlp": 1.02719402, + "epoch": 0.2615962723583346, + "flos": 17712831168000.0, + "grad_norm": 1.9292119582857614, + "language_loss": 0.62217009, + "learning_rate": 3.4632370417410866e-06, + "loss": 0.64415866, + "num_input_tokens_seen": 93942825, + "router_z_loss_clip": 0.88574219, + "router_z_loss_mlp": 0.15905762, + "step": 4351, + "time_per_iteration": 2.4182024002075195 + }, + { + "auxiliary_loss_clip": 0.01156097, + "auxiliary_loss_mlp": 0.01039868, + "balance_loss_clip": 1.06070864, + "balance_loss_mlp": 1.02357185, + "epoch": 0.26165639561100257, + "flos": 23257725672960.0, + "grad_norm": 2.1013699555786816, + "language_loss": 0.83533096, + "learning_rate": 3.462971512415555e-06, + "loss": 0.85729063, + "num_input_tokens_seen": 93962045, + "router_z_loss_clip": 0.95410156, + "router_z_loss_mlp": 0.1628418, + "step": 4352, + "time_per_iteration": 2.479573965072632 + }, + { + "auxiliary_loss_clip": 0.01103037, + "auxiliary_loss_mlp": 0.01007891, + "balance_loss_clip": 1.07092607, + "balance_loss_mlp": 1.00527728, + "epoch": 0.26171651886367053, + "flos": 66737970800640.0, + "grad_norm": 0.7956140278928338, + "language_loss": 0.70573747, + "learning_rate": 3.462705927613996e-06, + "loss": 0.72684681, + "num_input_tokens_seen": 94021175, + "router_z_loss_clip": 0.32128906, + "router_z_loss_mlp": 0.02612305, + "step": 4353, + "time_per_iteration": 3.0958807468414307 + }, + { + "auxiliary_loss_clip": 0.01156667, + "auxiliary_loss_mlp": 0.01044555, + "balance_loss_clip": 1.06525922, + "balance_loss_mlp": 1.02746081, + "epoch": 0.2617766421163385, + "flos": 22349581090560.0, + "grad_norm": 1.5999077406454696, + "language_loss": 0.77488327, + "learning_rate": 3.4624402873464816e-06, + "loss": 0.7968955, + "num_input_tokens_seen": 94043370, + "router_z_loss_clip": 0.91357422, + "router_z_loss_mlp": 0.17077637, + "step": 4354, + "time_per_iteration": 2.561365842819214 + }, + { + "auxiliary_loss_clip": 0.01165145, + "auxiliary_loss_mlp": 0.01049843, + "balance_loss_clip": 1.06218052, + "balance_loss_mlp": 1.03206897, + "epoch": 0.26183676536900646, + "flos": 26067125041920.0, + "grad_norm": 2.0878948560662947, + "language_loss": 0.68197799, + "learning_rate": 3.462174591623085e-06, + "loss": 0.70412785, + "num_input_tokens_seen": 94063510, + "router_z_loss_clip": 1.02880859, + "router_z_loss_mlp": 0.17773438, + "step": 4355, + "time_per_iteration": 2.49849796295166 + }, + { + "auxiliary_loss_clip": 0.01152225, + "auxiliary_loss_mlp": 0.01039808, + "balance_loss_clip": 1.05892849, + "balance_loss_mlp": 1.02224827, + "epoch": 0.26189688862167443, + "flos": 20996466825600.0, + "grad_norm": 2.449405634555188, + "language_loss": 0.67693329, + "learning_rate": 3.4619088404538815e-06, + "loss": 0.69885361, + "num_input_tokens_seen": 94083865, + "router_z_loss_clip": 0.93457031, + "router_z_loss_mlp": 0.17547607, + "step": 4356, + "time_per_iteration": 2.612471103668213 + }, + { + "auxiliary_loss_clip": 0.01095115, + "auxiliary_loss_mlp": 0.01006347, + "balance_loss_clip": 1.06168711, + "balance_loss_mlp": 1.00443697, + "epoch": 0.2619570118743424, + "flos": 65798261141760.0, + "grad_norm": 0.6799796242352584, + "language_loss": 0.53135604, + "learning_rate": 3.4616430338489487e-06, + "loss": 0.55237067, + "num_input_tokens_seen": 94144095, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 0.019104, + "step": 4357, + "time_per_iteration": 3.048231840133667 + }, + { + "auxiliary_loss_clip": 0.01162048, + "auxiliary_loss_mlp": 0.0104651, + "balance_loss_clip": 1.06454396, + "balance_loss_mlp": 1.02968907, + "epoch": 0.26201713512701036, + "flos": 28766817296640.0, + "grad_norm": 1.9316911934761487, + "language_loss": 0.84009147, + "learning_rate": 3.4613771718183654e-06, + "loss": 0.86217701, + "num_input_tokens_seen": 94163035, + "router_z_loss_clip": 0.97558594, + "router_z_loss_mlp": 0.16833496, + "step": 4358, + "time_per_iteration": 2.5804014205932617 + }, + { + "auxiliary_loss_clip": 0.01162575, + "auxiliary_loss_mlp": 0.01043663, + "balance_loss_clip": 1.06297541, + "balance_loss_mlp": 1.02479219, + "epoch": 0.2620772583796783, + "flos": 26432516142720.0, + "grad_norm": 2.873194713028282, + "language_loss": 0.67510748, + "learning_rate": 3.4611112543722127e-06, + "loss": 0.6971699, + "num_input_tokens_seen": 94182520, + "router_z_loss_clip": 0.99609375, + "router_z_loss_mlp": 0.18884277, + "step": 4359, + "time_per_iteration": 2.5232186317443848 + }, + { + "auxiliary_loss_clip": 0.01147535, + "auxiliary_loss_mlp": 0.0104383, + "balance_loss_clip": 1.05371809, + "balance_loss_mlp": 1.02715302, + "epoch": 0.2621373816323463, + "flos": 20156552127360.0, + "grad_norm": 2.366977857593587, + "language_loss": 0.78552079, + "learning_rate": 3.4608452815205757e-06, + "loss": 0.80743444, + "num_input_tokens_seen": 94201795, + "router_z_loss_clip": 0.93603516, + "router_z_loss_mlp": 0.16674805, + "step": 4360, + "time_per_iteration": 2.496776819229126 + }, + { + "auxiliary_loss_clip": 0.01145316, + "auxiliary_loss_mlp": 0.0104424, + "balance_loss_clip": 1.05477464, + "balance_loss_mlp": 1.02818215, + "epoch": 0.26219750488501425, + "flos": 28621235473920.0, + "grad_norm": 1.6367249566388369, + "language_loss": 0.6809321, + "learning_rate": 3.4605792532735387e-06, + "loss": 0.70282769, + "num_input_tokens_seen": 94222390, + "router_z_loss_clip": 0.90576172, + "router_z_loss_mlp": 0.16052246, + "step": 4361, + "time_per_iteration": 2.6106033325195312 + }, + { + "auxiliary_loss_clip": 0.01153482, + "auxiliary_loss_mlp": 0.01055245, + "balance_loss_clip": 1.05702949, + "balance_loss_mlp": 1.03716087, + "epoch": 0.2622576281376823, + "flos": 15042549173760.0, + "grad_norm": 2.061809501118536, + "language_loss": 0.84002197, + "learning_rate": 3.46031316964119e-06, + "loss": 0.86210918, + "num_input_tokens_seen": 94239980, + "router_z_loss_clip": 0.96435547, + "router_z_loss_mlp": 0.18103027, + "step": 4362, + "time_per_iteration": 2.4863953590393066 + }, + { + "auxiliary_loss_clip": 0.01147118, + "auxiliary_loss_mlp": 0.01045117, + "balance_loss_clip": 1.05571759, + "balance_loss_mlp": 1.02712846, + "epoch": 0.26231775139035024, + "flos": 26396174557440.0, + "grad_norm": 1.8027058114698407, + "language_loss": 0.65145594, + "learning_rate": 3.4600470306336197e-06, + "loss": 0.67337829, + "num_input_tokens_seen": 94260715, + "router_z_loss_clip": 0.91455078, + "router_z_loss_mlp": 0.17993164, + "step": 4363, + "time_per_iteration": 2.602970838546753 + }, + { + "auxiliary_loss_clip": 0.01085063, + "auxiliary_loss_mlp": 0.01006702, + "balance_loss_clip": 1.05266905, + "balance_loss_mlp": 1.00477386, + "epoch": 0.2623778746430182, + "flos": 65408918647680.0, + "grad_norm": 0.9018088414971456, + "language_loss": 0.6107384, + "learning_rate": 3.4597808362609194e-06, + "loss": 0.63165605, + "num_input_tokens_seen": 94321285, + "router_z_loss_clip": 0.32373047, + "router_z_loss_mlp": 0.01925659, + "step": 4364, + "time_per_iteration": 3.1703972816467285 + }, + { + "auxiliary_loss_clip": 0.01158856, + "auxiliary_loss_mlp": 0.01048532, + "balance_loss_clip": 1.0624702, + "balance_loss_mlp": 1.02974463, + "epoch": 0.26243799789568617, + "flos": 12604215254400.0, + "grad_norm": 2.4166519236375383, + "language_loss": 0.71845287, + "learning_rate": 3.459514586533184e-06, + "loss": 0.7405268, + "num_input_tokens_seen": 94335420, + "router_z_loss_clip": 0.96337891, + "router_z_loss_mlp": 0.18786621, + "step": 4365, + "time_per_iteration": 2.626319646835327 + }, + { + "auxiliary_loss_clip": 0.01155985, + "auxiliary_loss_mlp": 0.0105182, + "balance_loss_clip": 1.0597223, + "balance_loss_mlp": 1.03488052, + "epoch": 0.26249812114835414, + "flos": 28623821253120.0, + "grad_norm": 1.4913253957083181, + "language_loss": 0.77169001, + "learning_rate": 3.459248281460509e-06, + "loss": 0.79376805, + "num_input_tokens_seen": 94357440, + "router_z_loss_clip": 0.96240234, + "router_z_loss_mlp": 0.16943359, + "step": 4366, + "time_per_iteration": 2.545684337615967 + }, + { + "auxiliary_loss_clip": 0.01151046, + "auxiliary_loss_mlp": 0.01047438, + "balance_loss_clip": 1.05774903, + "balance_loss_mlp": 1.03054583, + "epoch": 0.2625582444010221, + "flos": 14465393441280.0, + "grad_norm": 1.702186038804238, + "language_loss": 0.75687861, + "learning_rate": 3.4589819210529927e-06, + "loss": 0.77886349, + "num_input_tokens_seen": 94375690, + "router_z_loss_clip": 0.93261719, + "router_z_loss_mlp": 0.16906738, + "step": 4367, + "time_per_iteration": 3.862793445587158 + }, + { + "auxiliary_loss_clip": 0.01146902, + "auxiliary_loss_mlp": 0.01041089, + "balance_loss_clip": 1.05556893, + "balance_loss_mlp": 1.0246855, + "epoch": 0.26261836765369007, + "flos": 16613174246400.0, + "grad_norm": 1.6495903668176253, + "language_loss": 0.69473624, + "learning_rate": 3.458715505320736e-06, + "loss": 0.71661615, + "num_input_tokens_seen": 94393190, + "router_z_loss_clip": 0.91210938, + "router_z_loss_mlp": 0.1640625, + "step": 4368, + "time_per_iteration": 2.470045804977417 + }, + { + "auxiliary_loss_clip": 0.01158867, + "auxiliary_loss_mlp": 0.01045348, + "balance_loss_clip": 1.06686354, + "balance_loss_mlp": 1.02766955, + "epoch": 0.26267849090635803, + "flos": 20519932066560.0, + "grad_norm": 2.1365330584890887, + "language_loss": 0.78681242, + "learning_rate": 3.458449034273841e-06, + "loss": 0.80885458, + "num_input_tokens_seen": 94410975, + "router_z_loss_clip": 0.92089844, + "router_z_loss_mlp": 0.17675781, + "step": 4369, + "time_per_iteration": 2.445021390914917 + }, + { + "auxiliary_loss_clip": 0.01145741, + "auxiliary_loss_mlp": 0.01039529, + "balance_loss_clip": 1.05365229, + "balance_loss_mlp": 1.02191043, + "epoch": 0.262738614159026, + "flos": 21323936142720.0, + "grad_norm": 3.4353082106161072, + "language_loss": 0.83301592, + "learning_rate": 3.4581825079224133e-06, + "loss": 0.85486853, + "num_input_tokens_seen": 94429985, + "router_z_loss_clip": 0.92089844, + "router_z_loss_mlp": 0.17626953, + "step": 4370, + "time_per_iteration": 2.5557281970977783 + }, + { + "auxiliary_loss_clip": 0.01164941, + "auxiliary_loss_mlp": 0.01057064, + "balance_loss_clip": 1.06810975, + "balance_loss_mlp": 1.03650069, + "epoch": 0.26279873741169396, + "flos": 17603590930560.0, + "grad_norm": 1.6711425760426195, + "language_loss": 0.71257234, + "learning_rate": 3.4579159262765575e-06, + "loss": 0.73479235, + "num_input_tokens_seen": 94448660, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.20544434, + "step": 4371, + "time_per_iteration": 2.4403200149536133 + }, + { + "auxiliary_loss_clip": 0.01091127, + "auxiliary_loss_mlp": 0.01006379, + "balance_loss_clip": 1.0569247, + "balance_loss_mlp": 1.00425994, + "epoch": 0.2628588606643619, + "flos": 60949746587520.0, + "grad_norm": 0.693850094964315, + "language_loss": 0.56472188, + "learning_rate": 3.457649289346384e-06, + "loss": 0.58569694, + "num_input_tokens_seen": 94515630, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 0.0211792, + "step": 4372, + "time_per_iteration": 3.186619281768799 + }, + { + "auxiliary_loss_clip": 0.01152907, + "auxiliary_loss_mlp": 0.01038211, + "balance_loss_clip": 1.06184244, + "balance_loss_mlp": 1.02158189, + "epoch": 0.2629189839170299, + "flos": 27016315891200.0, + "grad_norm": 1.7349754411313503, + "language_loss": 0.7796067, + "learning_rate": 3.4573825971420042e-06, + "loss": 0.80151784, + "num_input_tokens_seen": 94535385, + "router_z_loss_clip": 0.90966797, + "router_z_loss_mlp": 0.16625977, + "step": 4373, + "time_per_iteration": 2.5106897354125977 + }, + { + "auxiliary_loss_clip": 0.01151451, + "auxiliary_loss_mlp": 0.01038653, + "balance_loss_clip": 1.05876255, + "balance_loss_mlp": 1.02230299, + "epoch": 0.26297910716969786, + "flos": 17019863009280.0, + "grad_norm": 2.194282926844303, + "language_loss": 0.71744514, + "learning_rate": 3.4571158496735294e-06, + "loss": 0.73934615, + "num_input_tokens_seen": 94552650, + "router_z_loss_clip": 0.92724609, + "router_z_loss_mlp": 0.16357422, + "step": 4374, + "time_per_iteration": 2.4549038410186768 + }, + { + "auxiliary_loss_clip": 0.01154849, + "auxiliary_loss_mlp": 0.01044871, + "balance_loss_clip": 1.06236517, + "balance_loss_mlp": 1.0272994, + "epoch": 0.2630392304223659, + "flos": 24897370728960.0, + "grad_norm": 1.5738441804386532, + "language_loss": 0.80840313, + "learning_rate": 3.4568490469510756e-06, + "loss": 0.83040035, + "num_input_tokens_seen": 94574075, + "router_z_loss_clip": 0.92578125, + "router_z_loss_mlp": 0.17565918, + "step": 4375, + "time_per_iteration": 2.4965593814849854 + }, + { + "auxiliary_loss_clip": 0.01150231, + "auxiliary_loss_mlp": 0.01040749, + "balance_loss_clip": 1.05811524, + "balance_loss_mlp": 1.02433336, + "epoch": 0.26309935367503384, + "flos": 32854026067200.0, + "grad_norm": 1.7180938425393357, + "language_loss": 0.6599195, + "learning_rate": 3.4565821889847603e-06, + "loss": 0.68182933, + "num_input_tokens_seen": 94594255, + "router_z_loss_clip": 0.92089844, + "router_z_loss_mlp": 0.1640625, + "step": 4376, + "time_per_iteration": 3.9649147987365723 + }, + { + "auxiliary_loss_clip": 0.01152149, + "auxiliary_loss_mlp": 0.01043727, + "balance_loss_clip": 1.05718827, + "balance_loss_mlp": 1.02647758, + "epoch": 0.2631594769277018, + "flos": 15887958652800.0, + "grad_norm": 1.9900001374541436, + "language_loss": 0.69685113, + "learning_rate": 3.4563152757847026e-06, + "loss": 0.71880996, + "num_input_tokens_seen": 94611410, + "router_z_loss_clip": 0.94921875, + "router_z_loss_mlp": 0.17248535, + "step": 4377, + "time_per_iteration": 4.0509867668151855 + }, + { + "auxiliary_loss_clip": 0.01160809, + "auxiliary_loss_mlp": 0.01052234, + "balance_loss_clip": 1.06384587, + "balance_loss_mlp": 1.0340066, + "epoch": 0.2632196001803698, + "flos": 50804943557760.0, + "grad_norm": 1.5800891844478113, + "language_loss": 0.79028714, + "learning_rate": 3.4560483073610233e-06, + "loss": 0.81241757, + "num_input_tokens_seen": 94636575, + "router_z_loss_clip": 0.96923828, + "router_z_loss_mlp": 0.18200684, + "step": 4378, + "time_per_iteration": 2.714839458465576 + }, + { + "auxiliary_loss_clip": 0.01159433, + "auxiliary_loss_mlp": 0.01051109, + "balance_loss_clip": 1.06342411, + "balance_loss_mlp": 1.03562415, + "epoch": 0.26327972343303774, + "flos": 13733031041280.0, + "grad_norm": 2.053535311367432, + "language_loss": 0.76409185, + "learning_rate": 3.455781283723846e-06, + "loss": 0.78619719, + "num_input_tokens_seen": 94654345, + "router_z_loss_clip": 0.95947266, + "router_z_loss_mlp": 0.15466309, + "step": 4379, + "time_per_iteration": 2.4990744590759277 + }, + { + "auxiliary_loss_clip": 0.01162621, + "auxiliary_loss_mlp": 0.01041679, + "balance_loss_clip": 1.06439233, + "balance_loss_mlp": 1.0223074, + "epoch": 0.2633398466857057, + "flos": 23769057732480.0, + "grad_norm": 2.0475898722238184, + "language_loss": 0.77468479, + "learning_rate": 3.4555142048832975e-06, + "loss": 0.79672778, + "num_input_tokens_seen": 94673985, + "router_z_loss_clip": 0.98242188, + "router_z_loss_mlp": 0.19384766, + "step": 4380, + "time_per_iteration": 3.8972058296203613 + }, + { + "auxiliary_loss_clip": 0.01154316, + "auxiliary_loss_mlp": 0.01041237, + "balance_loss_clip": 1.05821168, + "balance_loss_mlp": 1.02420199, + "epoch": 0.26339996993837367, + "flos": 27600223380480.0, + "grad_norm": 1.850940502236154, + "language_loss": 0.63999945, + "learning_rate": 3.4552470708495036e-06, + "loss": 0.661955, + "num_input_tokens_seen": 94693145, + "router_z_loss_clip": 0.96191406, + "router_z_loss_mlp": 0.17016602, + "step": 4381, + "time_per_iteration": 2.4964680671691895 + }, + { + "auxiliary_loss_clip": 0.01157886, + "auxiliary_loss_mlp": 0.01045675, + "balance_loss_clip": 1.06314111, + "balance_loss_mlp": 1.02848494, + "epoch": 0.26346009319104163, + "flos": 16946317912320.0, + "grad_norm": 1.8798634209312364, + "language_loss": 0.82503319, + "learning_rate": 3.454979881632595e-06, + "loss": 0.84706879, + "num_input_tokens_seen": 94710185, + "router_z_loss_clip": 0.94775391, + "router_z_loss_mlp": 0.17211914, + "step": 4382, + "time_per_iteration": 2.432081460952759 + }, + { + "auxiliary_loss_clip": 0.01171533, + "auxiliary_loss_mlp": 0.01048533, + "balance_loss_clip": 1.07230997, + "balance_loss_mlp": 1.0305686, + "epoch": 0.2635202164437096, + "flos": 37232218915200.0, + "grad_norm": 5.498101465540578, + "language_loss": 0.69766098, + "learning_rate": 3.4547126372427035e-06, + "loss": 0.71986163, + "num_input_tokens_seen": 94730280, + "router_z_loss_clip": 0.99169922, + "router_z_loss_mlp": 0.1796875, + "step": 4383, + "time_per_iteration": 2.641979455947876 + }, + { + "auxiliary_loss_clip": 0.01154629, + "auxiliary_loss_mlp": 0.01042714, + "balance_loss_clip": 1.06155086, + "balance_loss_mlp": 1.027771, + "epoch": 0.26358033969637756, + "flos": 20996359084800.0, + "grad_norm": 1.6962723507401225, + "language_loss": 0.69355059, + "learning_rate": 3.4544453376899638e-06, + "loss": 0.71552402, + "num_input_tokens_seen": 94748560, + "router_z_loss_clip": 0.93066406, + "router_z_loss_mlp": 0.1494751, + "step": 4384, + "time_per_iteration": 2.449968099594116 + }, + { + "auxiliary_loss_clip": 0.01151875, + "auxiliary_loss_mlp": 0.01038709, + "balance_loss_clip": 1.06015325, + "balance_loss_mlp": 1.02201986, + "epoch": 0.26364046294904553, + "flos": 27746092512000.0, + "grad_norm": 2.530306751749778, + "language_loss": 0.70168984, + "learning_rate": 3.45417798298451e-06, + "loss": 0.72359574, + "num_input_tokens_seen": 94767570, + "router_z_loss_clip": 0.91748047, + "router_z_loss_mlp": 0.16687012, + "step": 4385, + "time_per_iteration": 2.480849504470825 + }, + { + "auxiliary_loss_clip": 0.01156714, + "auxiliary_loss_mlp": 0.01044225, + "balance_loss_clip": 1.06427991, + "balance_loss_mlp": 1.0275234, + "epoch": 0.2637005862017135, + "flos": 22893088757760.0, + "grad_norm": 1.9251151868551135, + "language_loss": 0.85258996, + "learning_rate": 3.453910573136482e-06, + "loss": 0.8745994, + "num_input_tokens_seen": 94784985, + "router_z_loss_clip": 0.92382812, + "router_z_loss_mlp": 0.16699219, + "step": 4386, + "time_per_iteration": 2.456925630569458 + }, + { + "auxiliary_loss_clip": 0.01156479, + "auxiliary_loss_mlp": 0.01045221, + "balance_loss_clip": 1.06127048, + "balance_loss_mlp": 1.0257417, + "epoch": 0.26376070945438146, + "flos": 15048834053760.0, + "grad_norm": 2.126703172671324, + "language_loss": 0.77438843, + "learning_rate": 3.4536431081560196e-06, + "loss": 0.79640543, + "num_input_tokens_seen": 94802545, + "router_z_loss_clip": 0.95214844, + "router_z_loss_mlp": 0.19470215, + "step": 4387, + "time_per_iteration": 2.4065823554992676 + }, + { + "auxiliary_loss_clip": 0.01158239, + "auxiliary_loss_mlp": 0.0104126, + "balance_loss_clip": 1.06514597, + "balance_loss_mlp": 1.0247016, + "epoch": 0.2638208327070494, + "flos": 21141833166720.0, + "grad_norm": 2.9286351615153166, + "language_loss": 0.75884604, + "learning_rate": 3.453375588053264e-06, + "loss": 0.78084099, + "num_input_tokens_seen": 94820730, + "router_z_loss_clip": 0.93066406, + "router_z_loss_mlp": 0.16564941, + "step": 4388, + "time_per_iteration": 2.4284822940826416 + }, + { + "auxiliary_loss_clip": 0.01155944, + "auxiliary_loss_mlp": 0.01037886, + "balance_loss_clip": 1.0635525, + "balance_loss_mlp": 1.02117288, + "epoch": 0.26388095595971744, + "flos": 21725597001600.0, + "grad_norm": 1.969170454400773, + "language_loss": 0.86159354, + "learning_rate": 3.4531080128383617e-06, + "loss": 0.88353181, + "num_input_tokens_seen": 94839175, + "router_z_loss_clip": 0.92285156, + "router_z_loss_mlp": 0.16711426, + "step": 4389, + "time_per_iteration": 2.428389072418213 + }, + { + "auxiliary_loss_clip": 0.01082819, + "auxiliary_loss_mlp": 0.01022055, + "balance_loss_clip": 1.04982662, + "balance_loss_mlp": 1.01990032, + "epoch": 0.2639410792123854, + "flos": 65515537192320.0, + "grad_norm": 0.8132745612416803, + "language_loss": 0.60327923, + "learning_rate": 3.452840382521457e-06, + "loss": 0.62432796, + "num_input_tokens_seen": 94898865, + "router_z_loss_clip": 0.33056641, + "router_z_loss_mlp": 0.02154541, + "step": 4390, + "time_per_iteration": 3.2164804935455322 + }, + { + "auxiliary_loss_clip": 0.01161876, + "auxiliary_loss_mlp": 0.0103584, + "balance_loss_clip": 1.06626868, + "balance_loss_mlp": 1.01864982, + "epoch": 0.2640012024650534, + "flos": 23948574929280.0, + "grad_norm": 1.663558308017254, + "language_loss": 0.77487499, + "learning_rate": 3.4525726971127e-06, + "loss": 0.79685217, + "num_input_tokens_seen": 94917490, + "router_z_loss_clip": 0.95654297, + "router_z_loss_mlp": 0.171875, + "step": 4391, + "time_per_iteration": 2.5049688816070557 + }, + { + "auxiliary_loss_clip": 0.01078231, + "auxiliary_loss_mlp": 0.01008101, + "balance_loss_clip": 1.04597306, + "balance_loss_mlp": 1.00637567, + "epoch": 0.26406132571772134, + "flos": 56441163369600.0, + "grad_norm": 0.8216986294774421, + "language_loss": 0.58717895, + "learning_rate": 3.45230495662224e-06, + "loss": 0.60804224, + "num_input_tokens_seen": 94969065, + "router_z_loss_clip": 0.32275391, + "router_z_loss_mlp": 0.01727295, + "step": 4392, + "time_per_iteration": 3.088405132293701 + }, + { + "auxiliary_loss_clip": 0.01152386, + "auxiliary_loss_mlp": 0.01043555, + "balance_loss_clip": 1.05847657, + "balance_loss_mlp": 1.02625811, + "epoch": 0.2641214489703893, + "flos": 22090557139200.0, + "grad_norm": 1.7342805818402702, + "language_loss": 0.68840003, + "learning_rate": 3.4520371610602306e-06, + "loss": 0.71035945, + "num_input_tokens_seen": 94988540, + "router_z_loss_clip": 0.93896484, + "router_z_loss_mlp": 0.1730957, + "step": 4393, + "time_per_iteration": 2.497659683227539 + }, + { + "auxiliary_loss_clip": 0.01162239, + "auxiliary_loss_mlp": 0.01043354, + "balance_loss_clip": 1.06296158, + "balance_loss_mlp": 1.02528167, + "epoch": 0.26418157222305727, + "flos": 16544764794240.0, + "grad_norm": 1.78253317120197, + "language_loss": 0.83856291, + "learning_rate": 3.4517693104368267e-06, + "loss": 0.86061883, + "num_input_tokens_seen": 95004810, + "router_z_loss_clip": 0.99414062, + "router_z_loss_mlp": 0.18066406, + "step": 4394, + "time_per_iteration": 2.4693708419799805 + }, + { + "auxiliary_loss_clip": 0.01160017, + "auxiliary_loss_mlp": 0.01044493, + "balance_loss_clip": 1.06119847, + "balance_loss_mlp": 1.0260278, + "epoch": 0.26424169547572524, + "flos": 18002486442240.0, + "grad_norm": 3.0690476041975026, + "language_loss": 0.69857073, + "learning_rate": 3.4515014047621856e-06, + "loss": 0.72061574, + "num_input_tokens_seen": 95024085, + "router_z_loss_clip": 0.98876953, + "router_z_loss_mlp": 0.18481445, + "step": 4395, + "time_per_iteration": 2.453514337539673 + }, + { + "auxiliary_loss_clip": 0.01166329, + "auxiliary_loss_mlp": 0.01042757, + "balance_loss_clip": 1.06765914, + "balance_loss_mlp": 1.02549529, + "epoch": 0.2643018187283932, + "flos": 16983162288000.0, + "grad_norm": 1.8982093148282309, + "language_loss": 0.86572766, + "learning_rate": 3.4512334440464655e-06, + "loss": 0.88781852, + "num_input_tokens_seen": 95042515, + "router_z_loss_clip": 0.98828125, + "router_z_loss_mlp": 0.17272949, + "step": 4396, + "time_per_iteration": 2.445416212081909 + }, + { + "auxiliary_loss_clip": 0.01095403, + "auxiliary_loss_mlp": 0.01020488, + "balance_loss_clip": 1.06224871, + "balance_loss_mlp": 1.01790142, + "epoch": 0.26436194198106117, + "flos": 59664359416320.0, + "grad_norm": 0.7928362243525467, + "language_loss": 0.55034006, + "learning_rate": 3.4509654282998277e-06, + "loss": 0.57149893, + "num_input_tokens_seen": 95094835, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.02590942, + "step": 4397, + "time_per_iteration": 2.8271331787109375 + }, + { + "auxiliary_loss_clip": 0.0116101, + "auxiliary_loss_mlp": 0.01048463, + "balance_loss_clip": 1.0668869, + "balance_loss_mlp": 1.03183389, + "epoch": 0.26442206523372913, + "flos": 32921322197760.0, + "grad_norm": 2.4532522266676735, + "language_loss": 0.78358567, + "learning_rate": 3.450697357532435e-06, + "loss": 0.80568039, + "num_input_tokens_seen": 95113480, + "router_z_loss_clip": 0.94042969, + "router_z_loss_mlp": 0.16625977, + "step": 4398, + "time_per_iteration": 2.5587337017059326 + }, + { + "auxiliary_loss_clip": 0.01163621, + "auxiliary_loss_mlp": 0.01042205, + "balance_loss_clip": 1.06775498, + "balance_loss_mlp": 1.02509832, + "epoch": 0.2644821884863971, + "flos": 21031300039680.0, + "grad_norm": 1.6833077128171854, + "language_loss": 0.6746695, + "learning_rate": 3.4504292317544534e-06, + "loss": 0.69672775, + "num_input_tokens_seen": 95132580, + "router_z_loss_clip": 0.95898438, + "router_z_loss_mlp": 0.17114258, + "step": 4399, + "time_per_iteration": 2.445450782775879 + }, + { + "auxiliary_loss_clip": 0.01166588, + "auxiliary_loss_mlp": 0.01039046, + "balance_loss_clip": 1.07612598, + "balance_loss_mlp": 1.02359629, + "epoch": 0.26454231173906506, + "flos": 20776801201920.0, + "grad_norm": 1.660637830577397, + "language_loss": 0.86608732, + "learning_rate": 3.4501610509760504e-06, + "loss": 0.88814366, + "num_input_tokens_seen": 95152375, + "router_z_loss_clip": 0.90478516, + "router_z_loss_mlp": 0.15454102, + "step": 4400, + "time_per_iteration": 2.4758543968200684 + }, + { + "auxiliary_loss_clip": 0.01165082, + "auxiliary_loss_mlp": 0.01037925, + "balance_loss_clip": 1.07028496, + "balance_loss_mlp": 1.02043724, + "epoch": 0.264602434991733, + "flos": 16618669027200.0, + "grad_norm": 1.7412617721133266, + "language_loss": 0.75526524, + "learning_rate": 3.4498928152073944e-06, + "loss": 0.77729529, + "num_input_tokens_seen": 95170265, + "router_z_loss_clip": 0.94824219, + "router_z_loss_mlp": 0.17468262, + "step": 4401, + "time_per_iteration": 2.4163155555725098 + }, + { + "auxiliary_loss_clip": 0.01156586, + "auxiliary_loss_mlp": 0.01051192, + "balance_loss_clip": 1.05650067, + "balance_loss_mlp": 1.03196371, + "epoch": 0.26466255824440105, + "flos": 19062677295360.0, + "grad_norm": 1.6359114038463034, + "language_loss": 0.87744391, + "learning_rate": 3.4496245244586577e-06, + "loss": 0.89952171, + "num_input_tokens_seen": 95188655, + "router_z_loss_clip": 1.00097656, + "router_z_loss_mlp": 0.19213867, + "step": 4402, + "time_per_iteration": 2.5154778957366943 + }, + { + "auxiliary_loss_clip": 0.01154748, + "auxiliary_loss_mlp": 0.01045173, + "balance_loss_clip": 1.06030416, + "balance_loss_mlp": 1.0276134, + "epoch": 0.264722681497069, + "flos": 22638554006400.0, + "grad_norm": 1.6108589153634996, + "language_loss": 0.78181386, + "learning_rate": 3.4493561787400137e-06, + "loss": 0.8038131, + "num_input_tokens_seen": 95209615, + "router_z_loss_clip": 0.94433594, + "router_z_loss_mlp": 0.17553711, + "step": 4403, + "time_per_iteration": 2.5523200035095215 + }, + { + "auxiliary_loss_clip": 0.01171586, + "auxiliary_loss_mlp": 0.01039247, + "balance_loss_clip": 1.07233799, + "balance_loss_mlp": 1.02146137, + "epoch": 0.264782804749737, + "flos": 22492253911680.0, + "grad_norm": 1.9523060567134058, + "language_loss": 0.87806928, + "learning_rate": 3.4490877780616387e-06, + "loss": 0.9001776, + "num_input_tokens_seen": 95227810, + "router_z_loss_clip": 0.99169922, + "router_z_loss_mlp": 0.17773438, + "step": 4404, + "time_per_iteration": 2.473094940185547 + }, + { + "auxiliary_loss_clip": 0.01173065, + "auxiliary_loss_mlp": 0.01039054, + "balance_loss_clip": 1.07357049, + "balance_loss_mlp": 1.02238846, + "epoch": 0.26484292800240494, + "flos": 16800269212800.0, + "grad_norm": 1.6715384518131795, + "language_loss": 0.76281714, + "learning_rate": 3.448819322433709e-06, + "loss": 0.7849384, + "num_input_tokens_seen": 95245890, + "router_z_loss_clip": 0.99560547, + "router_z_loss_mlp": 0.16662598, + "step": 4405, + "time_per_iteration": 2.575856924057007 + }, + { + "auxiliary_loss_clip": 0.01155786, + "auxiliary_loss_mlp": 0.01039962, + "balance_loss_clip": 1.06212032, + "balance_loss_mlp": 1.02243805, + "epoch": 0.2649030512550729, + "flos": 20449583280000.0, + "grad_norm": 1.787900025033788, + "language_loss": 0.70091081, + "learning_rate": 3.4485508118664066e-06, + "loss": 0.72286832, + "num_input_tokens_seen": 95264955, + "router_z_loss_clip": 0.93603516, + "router_z_loss_mlp": 0.17504883, + "step": 4406, + "time_per_iteration": 2.5355591773986816 + }, + { + "auxiliary_loss_clip": 0.01158555, + "auxiliary_loss_mlp": 0.01042491, + "balance_loss_clip": 1.06616235, + "balance_loss_mlp": 1.02680302, + "epoch": 0.2649631745077409, + "flos": 22416123035520.0, + "grad_norm": 1.6592719286175865, + "language_loss": 0.8375473, + "learning_rate": 3.448282246369912e-06, + "loss": 0.85955775, + "num_input_tokens_seen": 95284245, + "router_z_loss_clip": 0.92382812, + "router_z_loss_mlp": 0.15686035, + "step": 4407, + "time_per_iteration": 2.4808647632598877 + }, + { + "auxiliary_loss_clip": 0.01153284, + "auxiliary_loss_mlp": 0.01032696, + "balance_loss_clip": 1.0597949, + "balance_loss_mlp": 1.015661, + "epoch": 0.26502329776040884, + "flos": 35116110927360.0, + "grad_norm": 1.8033989263309342, + "language_loss": 0.76145357, + "learning_rate": 3.4480136259544084e-06, + "loss": 0.78331339, + "num_input_tokens_seen": 95307125, + "router_z_loss_clip": 0.93554688, + "router_z_loss_mlp": 0.17028809, + "step": 4408, + "time_per_iteration": 2.5853312015533447 + }, + { + "auxiliary_loss_clip": 0.01153512, + "auxiliary_loss_mlp": 0.01038666, + "balance_loss_clip": 1.06249976, + "balance_loss_mlp": 1.02210808, + "epoch": 0.2650834210130768, + "flos": 38687498438400.0, + "grad_norm": 1.8040927545028704, + "language_loss": 0.70942014, + "learning_rate": 3.447744950630084e-06, + "loss": 0.73134196, + "num_input_tokens_seen": 95329150, + "router_z_loss_clip": 0.91113281, + "router_z_loss_mlp": 0.16564941, + "step": 4409, + "time_per_iteration": 2.631739377975464 + }, + { + "auxiliary_loss_clip": 0.01168689, + "auxiliary_loss_mlp": 0.01040419, + "balance_loss_clip": 1.07172394, + "balance_loss_mlp": 1.02282345, + "epoch": 0.26514354426574477, + "flos": 24716847951360.0, + "grad_norm": 2.0844935609500426, + "language_loss": 0.73653913, + "learning_rate": 3.4474762204071253e-06, + "loss": 0.75863016, + "num_input_tokens_seen": 95349880, + "router_z_loss_clip": 0.96923828, + "router_z_loss_mlp": 0.17602539, + "step": 4410, + "time_per_iteration": 3.971182346343994 + }, + { + "auxiliary_loss_clip": 0.01162073, + "auxiliary_loss_mlp": 0.01050551, + "balance_loss_clip": 1.06404448, + "balance_loss_mlp": 1.03388596, + "epoch": 0.26520366751841273, + "flos": 20340055733760.0, + "grad_norm": 1.826224883840025, + "language_loss": 0.7336607, + "learning_rate": 3.4472074352957244e-06, + "loss": 0.7557869, + "num_input_tokens_seen": 95368570, + "router_z_loss_clip": 0.98046875, + "router_z_loss_mlp": 0.16674805, + "step": 4411, + "time_per_iteration": 2.4533050060272217 + }, + { + "auxiliary_loss_clip": 0.01151862, + "auxiliary_loss_mlp": 0.01045525, + "balance_loss_clip": 1.05877173, + "balance_loss_mlp": 1.02875185, + "epoch": 0.2652637907710807, + "flos": 22343870828160.0, + "grad_norm": 1.90372534647472, + "language_loss": 0.82105404, + "learning_rate": 3.446938595306071e-06, + "loss": 0.84302789, + "num_input_tokens_seen": 95387065, + "router_z_loss_clip": 0.93066406, + "router_z_loss_mlp": 0.16766357, + "step": 4412, + "time_per_iteration": 2.6491506099700928 + }, + { + "auxiliary_loss_clip": 0.01149968, + "auxiliary_loss_mlp": 0.01053874, + "balance_loss_clip": 1.05475187, + "balance_loss_mlp": 1.03682697, + "epoch": 0.26532391402374866, + "flos": 19354235990400.0, + "grad_norm": 2.9080024667317295, + "language_loss": 0.74576128, + "learning_rate": 3.4466697004483622e-06, + "loss": 0.76779974, + "num_input_tokens_seen": 95406345, + "router_z_loss_clip": 0.95263672, + "router_z_loss_mlp": 0.17053223, + "step": 4413, + "time_per_iteration": 2.615644931793213 + }, + { + "auxiliary_loss_clip": 0.01087076, + "auxiliary_loss_mlp": 0.01006797, + "balance_loss_clip": 1.05519509, + "balance_loss_mlp": 1.00480366, + "epoch": 0.26538403727641663, + "flos": 44787611422080.0, + "grad_norm": 0.8660519850570848, + "language_loss": 0.56946999, + "learning_rate": 3.446400750732793e-06, + "loss": 0.59040868, + "num_input_tokens_seen": 95463595, + "router_z_loss_clip": 0.31884766, + "router_z_loss_mlp": 0.01992798, + "step": 4414, + "time_per_iteration": 3.0129778385162354 + }, + { + "auxiliary_loss_clip": 0.01150179, + "auxiliary_loss_mlp": 0.01045649, + "balance_loss_clip": 1.05863166, + "balance_loss_mlp": 1.02934098, + "epoch": 0.26544416052908465, + "flos": 28182119708160.0, + "grad_norm": 1.560861888808813, + "language_loss": 0.74463183, + "learning_rate": 3.4461317461695625e-06, + "loss": 0.76659006, + "num_input_tokens_seen": 95484115, + "router_z_loss_clip": 0.91601562, + "router_z_loss_mlp": 0.16320801, + "step": 4415, + "time_per_iteration": 2.6041440963745117 + }, + { + "auxiliary_loss_clip": 0.01159769, + "auxiliary_loss_mlp": 0.01046559, + "balance_loss_clip": 1.0609895, + "balance_loss_mlp": 1.02696073, + "epoch": 0.2655042837817526, + "flos": 17565274097280.0, + "grad_norm": 2.2038936915792426, + "language_loss": 0.87071937, + "learning_rate": 3.4458626867688707e-06, + "loss": 0.89278269, + "num_input_tokens_seen": 95501435, + "router_z_loss_clip": 0.98876953, + "router_z_loss_mlp": 0.19592285, + "step": 4416, + "time_per_iteration": 2.4391348361968994 + }, + { + "auxiliary_loss_clip": 0.01159318, + "auxiliary_loss_mlp": 0.01046042, + "balance_loss_clip": 1.06180084, + "balance_loss_mlp": 1.02693272, + "epoch": 0.2655644070344206, + "flos": 23404636298880.0, + "grad_norm": 1.6606222085722888, + "language_loss": 0.76388812, + "learning_rate": 3.4455935725409217e-06, + "loss": 0.78594172, + "num_input_tokens_seen": 95520135, + "router_z_loss_clip": 0.97509766, + "router_z_loss_mlp": 0.19116211, + "step": 4417, + "time_per_iteration": 2.4629294872283936 + }, + { + "auxiliary_loss_clip": 0.01161555, + "auxiliary_loss_mlp": 0.01040357, + "balance_loss_clip": 1.06677461, + "balance_loss_mlp": 1.02074718, + "epoch": 0.26562453028708854, + "flos": 26468462678400.0, + "grad_norm": 1.5494443172336836, + "language_loss": 0.79929966, + "learning_rate": 3.4453244034959196e-06, + "loss": 0.82131875, + "num_input_tokens_seen": 95541705, + "router_z_loss_clip": 0.94775391, + "router_z_loss_mlp": 0.19616699, + "step": 4418, + "time_per_iteration": 2.5435709953308105 + }, + { + "auxiliary_loss_clip": 0.01156428, + "auxiliary_loss_mlp": 0.01051252, + "balance_loss_clip": 1.0605371, + "balance_loss_mlp": 1.03296494, + "epoch": 0.2656846535397565, + "flos": 19207576759680.0, + "grad_norm": 3.9309935839276866, + "language_loss": 0.67062259, + "learning_rate": 3.445055179644071e-06, + "loss": 0.69269937, + "num_input_tokens_seen": 95560300, + "router_z_loss_clip": 0.95849609, + "router_z_loss_mlp": 0.1829834, + "step": 4419, + "time_per_iteration": 3.912182092666626 + }, + { + "auxiliary_loss_clip": 0.01159051, + "auxiliary_loss_mlp": 0.01049686, + "balance_loss_clip": 1.06409192, + "balance_loss_mlp": 1.0309341, + "epoch": 0.2657447767924245, + "flos": 30551325903360.0, + "grad_norm": 1.866584419560639, + "language_loss": 0.79496121, + "learning_rate": 3.444785900995585e-06, + "loss": 0.81704855, + "num_input_tokens_seen": 95580150, + "router_z_loss_clip": 0.94921875, + "router_z_loss_mlp": 0.18725586, + "step": 4420, + "time_per_iteration": 3.930955410003662 + }, + { + "auxiliary_loss_clip": 0.01167942, + "auxiliary_loss_mlp": 0.01053642, + "balance_loss_clip": 1.06618965, + "balance_loss_mlp": 1.03324521, + "epoch": 0.26580490004509244, + "flos": 20922742160640.0, + "grad_norm": 2.6310848641750164, + "language_loss": 0.81956935, + "learning_rate": 3.444516567560673e-06, + "loss": 0.84178519, + "num_input_tokens_seen": 95597570, + "router_z_loss_clip": 1.01757812, + "router_z_loss_mlp": 0.20422363, + "step": 4421, + "time_per_iteration": 2.474233627319336 + }, + { + "auxiliary_loss_clip": 0.01153123, + "auxiliary_loss_mlp": 0.01044187, + "balance_loss_clip": 1.05949998, + "balance_loss_mlp": 1.02634192, + "epoch": 0.2658650232977604, + "flos": 43945682584320.0, + "grad_norm": 1.5430512541690002, + "language_loss": 0.66142321, + "learning_rate": 3.444247179349548e-06, + "loss": 0.68339628, + "num_input_tokens_seen": 95619415, + "router_z_loss_clip": 0.93603516, + "router_z_loss_mlp": 0.1784668, + "step": 4422, + "time_per_iteration": 2.670156240463257 + }, + { + "auxiliary_loss_clip": 0.01157091, + "auxiliary_loss_mlp": 0.01044122, + "balance_loss_clip": 1.05970347, + "balance_loss_mlp": 1.02693176, + "epoch": 0.26592514655042837, + "flos": 29716439109120.0, + "grad_norm": 2.021398938059039, + "language_loss": 0.74332803, + "learning_rate": 3.4439777363724252e-06, + "loss": 0.76534021, + "num_input_tokens_seen": 95639155, + "router_z_loss_clip": 0.97412109, + "router_z_loss_mlp": 0.17175293, + "step": 4423, + "time_per_iteration": 3.885779857635498 + }, + { + "auxiliary_loss_clip": 0.01155824, + "auxiliary_loss_mlp": 0.010426, + "balance_loss_clip": 1.05895555, + "balance_loss_mlp": 1.02512407, + "epoch": 0.26598526980309634, + "flos": 46677730014720.0, + "grad_norm": 1.8039311042230313, + "language_loss": 0.7822938, + "learning_rate": 3.443708238639522e-06, + "loss": 0.80427808, + "num_input_tokens_seen": 95663320, + "router_z_loss_clip": 0.96923828, + "router_z_loss_mlp": 0.17468262, + "step": 4424, + "time_per_iteration": 2.6764333248138428 + }, + { + "auxiliary_loss_clip": 0.01162057, + "auxiliary_loss_mlp": 0.01045178, + "balance_loss_clip": 1.06441748, + "balance_loss_mlp": 1.02720141, + "epoch": 0.2660453930557643, + "flos": 11509442582400.0, + "grad_norm": 1.9461050319080573, + "language_loss": 0.79071468, + "learning_rate": 3.4434386861610573e-06, + "loss": 0.81278706, + "num_input_tokens_seen": 95680260, + "router_z_loss_clip": 0.97607422, + "router_z_loss_mlp": 0.17980957, + "step": 4425, + "time_per_iteration": 2.4241931438446045 + }, + { + "auxiliary_loss_clip": 0.01160062, + "auxiliary_loss_mlp": 0.01040192, + "balance_loss_clip": 1.06766915, + "balance_loss_mlp": 1.02405095, + "epoch": 0.26610551630843227, + "flos": 24791578197120.0, + "grad_norm": 1.6663940590173176, + "language_loss": 0.80581367, + "learning_rate": 3.4431690789472532e-06, + "loss": 0.82781619, + "num_input_tokens_seen": 95701140, + "router_z_loss_clip": 0.92431641, + "router_z_loss_mlp": 0.16137695, + "step": 4426, + "time_per_iteration": 2.5832345485687256 + }, + { + "auxiliary_loss_clip": 0.01158515, + "auxiliary_loss_mlp": 0.01068406, + "balance_loss_clip": 1.0624634, + "balance_loss_mlp": 1.047544, + "epoch": 0.26616563956110023, + "flos": 27636385397760.0, + "grad_norm": 4.092439344199869, + "language_loss": 0.7707392, + "learning_rate": 3.442899417008333e-06, + "loss": 0.79300845, + "num_input_tokens_seen": 95722060, + "router_z_loss_clip": 0.9609375, + "router_z_loss_mlp": 0.20874023, + "step": 4427, + "time_per_iteration": 2.548891305923462 + }, + { + "auxiliary_loss_clip": 0.01155939, + "auxiliary_loss_mlp": 0.01037284, + "balance_loss_clip": 1.06416488, + "balance_loss_mlp": 1.02034402, + "epoch": 0.26622576281376825, + "flos": 28362893880960.0, + "grad_norm": 1.5671373454909647, + "language_loss": 0.76830602, + "learning_rate": 3.4426297003545227e-06, + "loss": 0.79023826, + "num_input_tokens_seen": 95742495, + "router_z_loss_clip": 0.91650391, + "router_z_loss_mlp": 0.16943359, + "step": 4428, + "time_per_iteration": 2.4929165840148926 + }, + { + "auxiliary_loss_clip": 0.01161535, + "auxiliary_loss_mlp": 0.01034142, + "balance_loss_clip": 1.06394172, + "balance_loss_mlp": 1.01757228, + "epoch": 0.2662858860664362, + "flos": 18041341979520.0, + "grad_norm": 1.821122252062506, + "language_loss": 0.83016825, + "learning_rate": 3.4423599289960495e-06, + "loss": 0.85212505, + "num_input_tokens_seen": 95761510, + "router_z_loss_clip": 0.97558594, + "router_z_loss_mlp": 0.16577148, + "step": 4429, + "time_per_iteration": 2.4563229084014893 + }, + { + "auxiliary_loss_clip": 0.01158155, + "auxiliary_loss_mlp": 0.0104155, + "balance_loss_clip": 1.06352973, + "balance_loss_mlp": 1.02395463, + "epoch": 0.2663460093191042, + "flos": 22745818995840.0, + "grad_norm": 4.781748943343107, + "language_loss": 0.72257388, + "learning_rate": 3.442090102943143e-06, + "loss": 0.74457097, + "num_input_tokens_seen": 95782385, + "router_z_loss_clip": 0.94677734, + "router_z_loss_mlp": 0.17590332, + "step": 4430, + "time_per_iteration": 2.4511115550994873 + }, + { + "auxiliary_loss_clip": 0.01174471, + "auxiliary_loss_mlp": 0.01050674, + "balance_loss_clip": 1.07429218, + "balance_loss_mlp": 1.03111184, + "epoch": 0.26640613257177215, + "flos": 16508782344960.0, + "grad_norm": 2.273137806520729, + "language_loss": 0.81484944, + "learning_rate": 3.441820222206035e-06, + "loss": 0.83710086, + "num_input_tokens_seen": 95800595, + "router_z_loss_clip": 1.00244141, + "router_z_loss_mlp": 0.19555664, + "step": 4431, + "time_per_iteration": 2.438014030456543 + }, + { + "auxiliary_loss_clip": 0.01166516, + "auxiliary_loss_mlp": 0.0104718, + "balance_loss_clip": 1.06682074, + "balance_loss_mlp": 1.02940583, + "epoch": 0.2664662558244401, + "flos": 23075945919360.0, + "grad_norm": 2.123055560720813, + "language_loss": 0.76754308, + "learning_rate": 3.44155028679496e-06, + "loss": 0.78968, + "num_input_tokens_seen": 95818480, + "router_z_loss_clip": 0.99707031, + "router_z_loss_mlp": 0.1776123, + "step": 4432, + "time_per_iteration": 2.4510393142700195 + }, + { + "auxiliary_loss_clip": 0.01163054, + "auxiliary_loss_mlp": 0.01042268, + "balance_loss_clip": 1.06761444, + "balance_loss_mlp": 1.0245775, + "epoch": 0.2665263790771081, + "flos": 23769273214080.0, + "grad_norm": 2.676876344274313, + "language_loss": 0.82847011, + "learning_rate": 3.441280296720154e-06, + "loss": 0.85052335, + "num_input_tokens_seen": 95837205, + "router_z_loss_clip": 0.95507812, + "router_z_loss_mlp": 0.17700195, + "step": 4433, + "time_per_iteration": 2.47103214263916 + }, + { + "auxiliary_loss_clip": 0.01160634, + "auxiliary_loss_mlp": 0.01046542, + "balance_loss_clip": 1.06740379, + "balance_loss_mlp": 1.02795744, + "epoch": 0.26658650232977604, + "flos": 28001273708160.0, + "grad_norm": 1.880330794073099, + "language_loss": 0.7678057, + "learning_rate": 3.441010251991854e-06, + "loss": 0.78987741, + "num_input_tokens_seen": 95858395, + "router_z_loss_clip": 0.93212891, + "router_z_loss_mlp": 0.18579102, + "step": 4434, + "time_per_iteration": 2.4897994995117188 + }, + { + "auxiliary_loss_clip": 0.01168073, + "auxiliary_loss_mlp": 0.01041719, + "balance_loss_clip": 1.07182968, + "balance_loss_mlp": 1.0249579, + "epoch": 0.266646625582444, + "flos": 22163635359360.0, + "grad_norm": 1.9676924938532387, + "language_loss": 0.82658482, + "learning_rate": 3.440740152620301e-06, + "loss": 0.84868276, + "num_input_tokens_seen": 95877875, + "router_z_loss_clip": 0.96191406, + "router_z_loss_mlp": 0.16748047, + "step": 4435, + "time_per_iteration": 2.4696853160858154 + }, + { + "auxiliary_loss_clip": 0.01163688, + "auxiliary_loss_mlp": 0.01053425, + "balance_loss_clip": 1.06594992, + "balance_loss_mlp": 1.03435183, + "epoch": 0.266706748835112, + "flos": 27853537069440.0, + "grad_norm": 2.672049518194134, + "language_loss": 0.8770414, + "learning_rate": 3.4404699986157376e-06, + "loss": 0.89921248, + "num_input_tokens_seen": 95895820, + "router_z_loss_clip": 0.9765625, + "router_z_loss_mlp": 0.19067383, + "step": 4436, + "time_per_iteration": 2.4824230670928955 + }, + { + "auxiliary_loss_clip": 0.01163231, + "auxiliary_loss_mlp": 0.01052138, + "balance_loss_clip": 1.06346643, + "balance_loss_mlp": 1.03381526, + "epoch": 0.26676687208777994, + "flos": 25812123413760.0, + "grad_norm": 1.42998579048001, + "language_loss": 0.78965127, + "learning_rate": 3.440199789988407e-06, + "loss": 0.81180501, + "num_input_tokens_seen": 95918025, + "router_z_loss_clip": 0.99755859, + "router_z_loss_mlp": 0.18322754, + "step": 4437, + "time_per_iteration": 2.503089189529419 + }, + { + "auxiliary_loss_clip": 0.01160376, + "auxiliary_loss_mlp": 0.01044047, + "balance_loss_clip": 1.06641412, + "balance_loss_mlp": 1.02725029, + "epoch": 0.2668269953404479, + "flos": 36064583504640.0, + "grad_norm": 2.6455763930520275, + "language_loss": 0.64375049, + "learning_rate": 3.439929526748556e-06, + "loss": 0.66579473, + "num_input_tokens_seen": 95937725, + "router_z_loss_clip": 0.93994141, + "router_z_loss_mlp": 0.16784668, + "step": 4438, + "time_per_iteration": 2.5715017318725586 + }, + { + "auxiliary_loss_clip": 0.01159963, + "auxiliary_loss_mlp": 0.01044971, + "balance_loss_clip": 1.06279159, + "balance_loss_mlp": 1.02809119, + "epoch": 0.26688711859311587, + "flos": 26570987072640.0, + "grad_norm": 2.0777096403808666, + "language_loss": 0.76164937, + "learning_rate": 3.4396592089064334e-06, + "loss": 0.78369874, + "num_input_tokens_seen": 95956335, + "router_z_loss_clip": 0.97216797, + "router_z_loss_mlp": 0.16870117, + "step": 4439, + "time_per_iteration": 2.5803308486938477 + }, + { + "auxiliary_loss_clip": 0.01162771, + "auxiliary_loss_mlp": 0.0103848, + "balance_loss_clip": 1.06497693, + "balance_loss_mlp": 1.0193826, + "epoch": 0.26694724184578383, + "flos": 26761565658240.0, + "grad_norm": 1.8448745861678877, + "language_loss": 0.71590281, + "learning_rate": 3.4393888364722897e-06, + "loss": 0.73791534, + "num_input_tokens_seen": 95977135, + "router_z_loss_clip": 0.97705078, + "router_z_loss_mlp": 0.19091797, + "step": 4440, + "time_per_iteration": 2.4929888248443604 + }, + { + "auxiliary_loss_clip": 0.01165537, + "auxiliary_loss_mlp": 0.01044445, + "balance_loss_clip": 1.06636763, + "balance_loss_mlp": 1.02587259, + "epoch": 0.2670073650984518, + "flos": 20959586536320.0, + "grad_norm": 1.9392894300656698, + "language_loss": 0.66863608, + "learning_rate": 3.439118409456376e-06, + "loss": 0.69073594, + "num_input_tokens_seen": 95995435, + "router_z_loss_clip": 0.99121094, + "router_z_loss_mlp": 0.18566895, + "step": 4441, + "time_per_iteration": 2.4617979526519775 + }, + { + "auxiliary_loss_clip": 0.01159095, + "auxiliary_loss_mlp": 0.01047456, + "balance_loss_clip": 1.06389666, + "balance_loss_mlp": 1.02889466, + "epoch": 0.2670674883511198, + "flos": 28366054277760.0, + "grad_norm": 1.5525290913807825, + "language_loss": 0.76218629, + "learning_rate": 3.4388479278689486e-06, + "loss": 0.78425181, + "num_input_tokens_seen": 96016340, + "router_z_loss_clip": 0.95117188, + "router_z_loss_mlp": 0.18566895, + "step": 4442, + "time_per_iteration": 2.5064616203308105 + }, + { + "auxiliary_loss_clip": 0.01107283, + "auxiliary_loss_mlp": 0.01014692, + "balance_loss_clip": 1.07483745, + "balance_loss_mlp": 1.01257312, + "epoch": 0.2671276116037878, + "flos": 58971319430400.0, + "grad_norm": 0.935977298741216, + "language_loss": 0.61263585, + "learning_rate": 3.4385773917202637e-06, + "loss": 0.63385558, + "num_input_tokens_seen": 96071205, + "router_z_loss_clip": 0.32470703, + "router_z_loss_mlp": 0.02120972, + "step": 4443, + "time_per_iteration": 2.963641881942749 + }, + { + "auxiliary_loss_clip": 0.01160891, + "auxiliary_loss_mlp": 0.01041855, + "balance_loss_clip": 1.06253576, + "balance_loss_mlp": 1.02486777, + "epoch": 0.26718773485645575, + "flos": 43945072053120.0, + "grad_norm": 1.534274631556346, + "language_loss": 0.76479983, + "learning_rate": 3.4383068010205793e-06, + "loss": 0.78682727, + "num_input_tokens_seen": 96094240, + "router_z_loss_clip": 0.98291016, + "router_z_loss_mlp": 0.16992188, + "step": 4444, + "time_per_iteration": 2.652909994125366 + }, + { + "auxiliary_loss_clip": 0.01160274, + "auxiliary_loss_mlp": 0.01041741, + "balance_loss_clip": 1.06160021, + "balance_loss_mlp": 1.0225122, + "epoch": 0.2672478581091237, + "flos": 25228323665280.0, + "grad_norm": 1.6374558842053237, + "language_loss": 0.80460024, + "learning_rate": 3.438036155780158e-06, + "loss": 0.8266204, + "num_input_tokens_seen": 96114105, + "router_z_loss_clip": 0.98632812, + "router_z_loss_mlp": 0.19226074, + "step": 4445, + "time_per_iteration": 2.5007333755493164 + }, + { + "auxiliary_loss_clip": 0.01161506, + "auxiliary_loss_mlp": 0.01042371, + "balance_loss_clip": 1.06349874, + "balance_loss_mlp": 1.02338135, + "epoch": 0.2673079813617917, + "flos": 15268176455040.0, + "grad_norm": 2.254721207246422, + "language_loss": 0.89551252, + "learning_rate": 3.43776545600926e-06, + "loss": 0.91755128, + "num_input_tokens_seen": 96132140, + "router_z_loss_clip": 0.97900391, + "router_z_loss_mlp": 0.18994141, + "step": 4446, + "time_per_iteration": 2.4422223567962646 + }, + { + "auxiliary_loss_clip": 0.01156335, + "auxiliary_loss_mlp": 0.01045277, + "balance_loss_clip": 1.05866075, + "balance_loss_mlp": 1.02759814, + "epoch": 0.26736810461445965, + "flos": 25812733944960.0, + "grad_norm": 1.7798482431135803, + "language_loss": 0.68357587, + "learning_rate": 3.437494701718153e-06, + "loss": 0.70559198, + "num_input_tokens_seen": 96152090, + "router_z_loss_clip": 0.97802734, + "router_z_loss_mlp": 0.17687988, + "step": 4447, + "time_per_iteration": 2.5315117835998535 + }, + { + "auxiliary_loss_clip": 0.01163871, + "auxiliary_loss_mlp": 0.01040006, + "balance_loss_clip": 1.06429839, + "balance_loss_mlp": 1.0217669, + "epoch": 0.2674282278671276, + "flos": 24312709054080.0, + "grad_norm": 1.9986543161861507, + "language_loss": 0.83457643, + "learning_rate": 3.4372238929171026e-06, + "loss": 0.85661519, + "num_input_tokens_seen": 96170015, + "router_z_loss_clip": 0.99560547, + "router_z_loss_mlp": 0.18225098, + "step": 4448, + "time_per_iteration": 2.4939208030700684 + }, + { + "auxiliary_loss_clip": 0.01155094, + "auxiliary_loss_mlp": 0.0105031, + "balance_loss_clip": 1.0598309, + "balance_loss_mlp": 1.03215432, + "epoch": 0.2674883511197956, + "flos": 22815521337600.0, + "grad_norm": 1.9350558521970138, + "language_loss": 0.84179175, + "learning_rate": 3.436953029616378e-06, + "loss": 0.86384571, + "num_input_tokens_seen": 96188065, + "router_z_loss_clip": 0.95263672, + "router_z_loss_mlp": 0.18164062, + "step": 4449, + "time_per_iteration": 2.4638776779174805 + }, + { + "auxiliary_loss_clip": 0.01170493, + "auxiliary_loss_mlp": 0.01049702, + "balance_loss_clip": 1.06460476, + "balance_loss_mlp": 1.02992487, + "epoch": 0.26754847437246354, + "flos": 25370170473600.0, + "grad_norm": 1.7268106101473955, + "language_loss": 0.83902591, + "learning_rate": 3.4366821118262506e-06, + "loss": 0.86122787, + "num_input_tokens_seen": 96205780, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.19763184, + "step": 4450, + "time_per_iteration": 2.465824604034424 + }, + { + "auxiliary_loss_clip": 0.01166556, + "auxiliary_loss_mlp": 0.01048193, + "balance_loss_clip": 1.06767058, + "balance_loss_mlp": 1.03052676, + "epoch": 0.2676085976251315, + "flos": 20230420446720.0, + "grad_norm": 1.7367039789899918, + "language_loss": 0.81099653, + "learning_rate": 3.4364111395569937e-06, + "loss": 0.83314395, + "num_input_tokens_seen": 96224990, + "router_z_loss_clip": 0.98974609, + "router_z_loss_mlp": 0.17675781, + "step": 4451, + "time_per_iteration": 2.492567777633667 + }, + { + "auxiliary_loss_clip": 0.01161652, + "auxiliary_loss_mlp": 0.01043844, + "balance_loss_clip": 1.06572437, + "balance_loss_mlp": 1.02740502, + "epoch": 0.26766872087779947, + "flos": 28038225824640.0, + "grad_norm": 1.5975098758676862, + "language_loss": 0.86659694, + "learning_rate": 3.436140112818882e-06, + "loss": 0.88865191, + "num_input_tokens_seen": 96245345, + "router_z_loss_clip": 0.95898438, + "router_z_loss_mlp": 0.16430664, + "step": 4452, + "time_per_iteration": 2.627645254135132 + }, + { + "auxiliary_loss_clip": 0.01155815, + "auxiliary_loss_mlp": 0.01042424, + "balance_loss_clip": 1.05852795, + "balance_loss_mlp": 1.0244472, + "epoch": 0.26772884413046744, + "flos": 18325179250560.0, + "grad_norm": 2.447658334403046, + "language_loss": 0.83237535, + "learning_rate": 3.435869031622194e-06, + "loss": 0.85435772, + "num_input_tokens_seen": 96259000, + "router_z_loss_clip": 0.97265625, + "router_z_loss_mlp": 0.1796875, + "step": 4453, + "time_per_iteration": 3.950470447540283 + }, + { + "auxiliary_loss_clip": 0.01159023, + "auxiliary_loss_mlp": 0.01055548, + "balance_loss_clip": 1.06156015, + "balance_loss_mlp": 1.036713, + "epoch": 0.2677889673831354, + "flos": 22127509255680.0, + "grad_norm": 1.9105475413667845, + "language_loss": 0.79655397, + "learning_rate": 3.435597895977208e-06, + "loss": 0.81869972, + "num_input_tokens_seen": 96277000, + "router_z_loss_clip": 0.97509766, + "router_z_loss_mlp": 0.18835449, + "step": 4454, + "time_per_iteration": 2.560041666030884 + }, + { + "auxiliary_loss_clip": 0.01162435, + "auxiliary_loss_mlp": 0.01042337, + "balance_loss_clip": 1.06413698, + "balance_loss_mlp": 1.02501631, + "epoch": 0.2678490906358034, + "flos": 23729699404800.0, + "grad_norm": 1.6455340133575593, + "language_loss": 0.72289479, + "learning_rate": 3.435326705894206e-06, + "loss": 0.74494261, + "num_input_tokens_seen": 96297010, + "router_z_loss_clip": 0.98291016, + "router_z_loss_mlp": 0.17333984, + "step": 4455, + "time_per_iteration": 2.5052073001861572 + }, + { + "auxiliary_loss_clip": 0.01167279, + "auxiliary_loss_mlp": 0.01037558, + "balance_loss_clip": 1.07225454, + "balance_loss_mlp": 1.02098775, + "epoch": 0.2679092138884714, + "flos": 21762872340480.0, + "grad_norm": 2.4092425098977146, + "language_loss": 0.73687422, + "learning_rate": 3.435055461383471e-06, + "loss": 0.75892258, + "num_input_tokens_seen": 96315780, + "router_z_loss_clip": 0.95068359, + "router_z_loss_mlp": 0.16577148, + "step": 4456, + "time_per_iteration": 2.4490878582000732 + }, + { + "auxiliary_loss_clip": 0.01160109, + "auxiliary_loss_mlp": 0.01042332, + "balance_loss_clip": 1.06179166, + "balance_loss_mlp": 1.02487993, + "epoch": 0.26796933714113935, + "flos": 19861186590720.0, + "grad_norm": 2.8588241282698257, + "language_loss": 0.71081185, + "learning_rate": 3.4347841624552896e-06, + "loss": 0.73283625, + "num_input_tokens_seen": 96333465, + "router_z_loss_clip": 0.98339844, + "router_z_loss_mlp": 0.17456055, + "step": 4457, + "time_per_iteration": 2.4984867572784424 + }, + { + "auxiliary_loss_clip": 0.01158301, + "auxiliary_loss_mlp": 0.01045965, + "balance_loss_clip": 1.06329262, + "balance_loss_mlp": 1.02858436, + "epoch": 0.2680294603938073, + "flos": 20047886507520.0, + "grad_norm": 1.6679162670492524, + "language_loss": 0.79174149, + "learning_rate": 3.4345128091199493e-06, + "loss": 0.81378424, + "num_input_tokens_seen": 96352005, + "router_z_loss_clip": 0.94970703, + "router_z_loss_mlp": 0.17382812, + "step": 4458, + "time_per_iteration": 2.466826915740967 + }, + { + "auxiliary_loss_clip": 0.01104966, + "auxiliary_loss_mlp": 0.01007333, + "balance_loss_clip": 1.07171464, + "balance_loss_mlp": 1.00519967, + "epoch": 0.2680895836464753, + "flos": 72113763052800.0, + "grad_norm": 1.0617385829636503, + "language_loss": 0.58653605, + "learning_rate": 3.434241401387739e-06, + "loss": 0.6076591, + "num_input_tokens_seen": 96406265, + "router_z_loss_clip": 0.33251953, + "router_z_loss_mlp": 0.02134705, + "step": 4459, + "time_per_iteration": 3.0519535541534424 + }, + { + "auxiliary_loss_clip": 0.01149742, + "auxiliary_loss_mlp": 0.01041265, + "balance_loss_clip": 1.05500686, + "balance_loss_mlp": 1.02406335, + "epoch": 0.26814970689914325, + "flos": 20449044576000.0, + "grad_norm": 1.7997494675280417, + "language_loss": 0.85106134, + "learning_rate": 3.4339699392689507e-06, + "loss": 0.87297142, + "num_input_tokens_seen": 96425225, + "router_z_loss_clip": 0.94677734, + "router_z_loss_mlp": 0.17199707, + "step": 4460, + "time_per_iteration": 2.492992877960205 + }, + { + "auxiliary_loss_clip": 0.01151078, + "auxiliary_loss_mlp": 0.01038578, + "balance_loss_clip": 1.05727243, + "balance_loss_mlp": 1.02084005, + "epoch": 0.2682098301518112, + "flos": 17566674727680.0, + "grad_norm": 1.8241002448783494, + "language_loss": 0.6839183, + "learning_rate": 3.4336984227738796e-06, + "loss": 0.70581484, + "num_input_tokens_seen": 96443780, + "router_z_loss_clip": 0.93798828, + "router_z_loss_mlp": 0.17724609, + "step": 4461, + "time_per_iteration": 2.472630739212036 + }, + { + "auxiliary_loss_clip": 0.01152758, + "auxiliary_loss_mlp": 0.01051552, + "balance_loss_clip": 1.05807507, + "balance_loss_mlp": 1.03394437, + "epoch": 0.2682699534044792, + "flos": 18333259810560.0, + "grad_norm": 1.50831192424539, + "language_loss": 0.67425168, + "learning_rate": 3.43342685191282e-06, + "loss": 0.69629478, + "num_input_tokens_seen": 96464530, + "router_z_loss_clip": 0.94677734, + "router_z_loss_mlp": 0.17608643, + "step": 4462, + "time_per_iteration": 2.496488332748413 + }, + { + "auxiliary_loss_clip": 0.01161631, + "auxiliary_loss_mlp": 0.01046675, + "balance_loss_clip": 1.06646466, + "balance_loss_mlp": 1.02829254, + "epoch": 0.26833007665714714, + "flos": 25301294144640.0, + "grad_norm": 1.6570062774145404, + "language_loss": 0.69435209, + "learning_rate": 3.4331552266960705e-06, + "loss": 0.71643513, + "num_input_tokens_seen": 96483345, + "router_z_loss_clip": 0.95166016, + "router_z_loss_mlp": 0.18371582, + "step": 4463, + "time_per_iteration": 3.9027678966522217 + }, + { + "auxiliary_loss_clip": 0.01158236, + "auxiliary_loss_mlp": 0.01041219, + "balance_loss_clip": 1.06056261, + "balance_loss_mlp": 1.02338505, + "epoch": 0.2683901999098151, + "flos": 16099759198080.0, + "grad_norm": 2.813232549683041, + "language_loss": 0.77890176, + "learning_rate": 3.432883547133931e-06, + "loss": 0.80089629, + "num_input_tokens_seen": 96498305, + "router_z_loss_clip": 0.9765625, + "router_z_loss_mlp": 0.17834473, + "step": 4464, + "time_per_iteration": 3.835132598876953 + }, + { + "auxiliary_loss_clip": 0.0115812, + "auxiliary_loss_mlp": 0.01046752, + "balance_loss_clip": 1.06199694, + "balance_loss_mlp": 1.02853727, + "epoch": 0.2684503231624831, + "flos": 27308054154240.0, + "grad_norm": 1.85724843716857, + "language_loss": 0.70724511, + "learning_rate": 3.432611813236704e-06, + "loss": 0.72929382, + "num_input_tokens_seen": 96519740, + "router_z_loss_clip": 0.96142578, + "router_z_loss_mlp": 0.18212891, + "step": 4465, + "time_per_iteration": 2.520078182220459 + }, + { + "auxiliary_loss_clip": 0.01084069, + "auxiliary_loss_mlp": 0.01011652, + "balance_loss_clip": 1.05233002, + "balance_loss_mlp": 1.00960207, + "epoch": 0.26851044641515104, + "flos": 71858007239040.0, + "grad_norm": 0.6845062871079324, + "language_loss": 0.5312556, + "learning_rate": 3.4323400250146943e-06, + "loss": 0.55221277, + "num_input_tokens_seen": 96588870, + "router_z_loss_clip": 0.31689453, + "router_z_loss_mlp": 0.02050781, + "step": 4466, + "time_per_iteration": 3.232393264770508 + }, + { + "auxiliary_loss_clip": 0.01154976, + "auxiliary_loss_mlp": 0.01051226, + "balance_loss_clip": 1.05988133, + "balance_loss_mlp": 1.03341603, + "epoch": 0.268570569667819, + "flos": 18733771434240.0, + "grad_norm": 2.1309469948243183, + "language_loss": 0.74255949, + "learning_rate": 3.4320681824782057e-06, + "loss": 0.76462156, + "num_input_tokens_seen": 96605100, + "router_z_loss_clip": 0.95166016, + "router_z_loss_mlp": 0.17810059, + "step": 4467, + "time_per_iteration": 3.9001030921936035 + }, + { + "auxiliary_loss_clip": 0.01162232, + "auxiliary_loss_mlp": 0.01052746, + "balance_loss_clip": 1.0656271, + "balance_loss_mlp": 1.03480518, + "epoch": 0.268630692920487, + "flos": 18178376365440.0, + "grad_norm": 3.2619903610322023, + "language_loss": 0.80610859, + "learning_rate": 3.4317962856375493e-06, + "loss": 0.8282584, + "num_input_tokens_seen": 96621410, + "router_z_loss_clip": 0.96533203, + "router_z_loss_mlp": 0.17944336, + "step": 4468, + "time_per_iteration": 2.480234384536743 + }, + { + "auxiliary_loss_clip": 0.01110308, + "auxiliary_loss_mlp": 0.01006041, + "balance_loss_clip": 1.07619023, + "balance_loss_mlp": 1.00387454, + "epoch": 0.268690816173155, + "flos": 68731768978560.0, + "grad_norm": 0.8437307299240968, + "language_loss": 0.59499717, + "learning_rate": 3.4315243345030334e-06, + "loss": 0.61616069, + "num_input_tokens_seen": 96684810, + "router_z_loss_clip": 0.34130859, + "router_z_loss_mlp": 0.02166748, + "step": 4469, + "time_per_iteration": 3.169896364212036 + }, + { + "auxiliary_loss_clip": 0.01158315, + "auxiliary_loss_mlp": 0.01044423, + "balance_loss_clip": 1.06319737, + "balance_loss_mlp": 1.02590942, + "epoch": 0.26875093942582295, + "flos": 23293636295040.0, + "grad_norm": 2.2049149799128105, + "language_loss": 0.81860596, + "learning_rate": 3.431252329084972e-06, + "loss": 0.84063333, + "num_input_tokens_seen": 96701920, + "router_z_loss_clip": 0.95068359, + "router_z_loss_mlp": 0.18518066, + "step": 4470, + "time_per_iteration": 2.4685006141662598 + }, + { + "auxiliary_loss_clip": 0.01148363, + "auxiliary_loss_mlp": 0.01038978, + "balance_loss_clip": 1.05754268, + "balance_loss_mlp": 1.02177584, + "epoch": 0.2688110626784909, + "flos": 21543458112000.0, + "grad_norm": 2.2443139497088977, + "language_loss": 0.82965237, + "learning_rate": 3.4309802693936786e-06, + "loss": 0.85152578, + "num_input_tokens_seen": 96721260, + "router_z_loss_clip": 0.90820312, + "router_z_loss_mlp": 0.171875, + "step": 4471, + "time_per_iteration": 2.497969150543213 + }, + { + "auxiliary_loss_clip": 0.0115295, + "auxiliary_loss_mlp": 0.01044048, + "balance_loss_clip": 1.06209457, + "balance_loss_mlp": 1.02647662, + "epoch": 0.2688711859311589, + "flos": 28400600183040.0, + "grad_norm": 2.125778014477328, + "language_loss": 0.69451118, + "learning_rate": 3.43070815543947e-06, + "loss": 0.71648121, + "num_input_tokens_seen": 96740385, + "router_z_loss_clip": 0.90771484, + "router_z_loss_mlp": 0.17565918, + "step": 4472, + "time_per_iteration": 2.5237464904785156 + }, + { + "auxiliary_loss_clip": 0.01152231, + "auxiliary_loss_mlp": 0.01042132, + "balance_loss_clip": 1.05927539, + "balance_loss_mlp": 1.02402449, + "epoch": 0.26893130918382685, + "flos": 25994944661760.0, + "grad_norm": 1.7619776304997823, + "language_loss": 0.67857188, + "learning_rate": 3.4304359872326656e-06, + "loss": 0.70051551, + "num_input_tokens_seen": 96761860, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 0.18115234, + "step": 4473, + "time_per_iteration": 2.5827138423919678 + }, + { + "auxiliary_loss_clip": 0.01152774, + "auxiliary_loss_mlp": 0.01044831, + "balance_loss_clip": 1.06040597, + "balance_loss_mlp": 1.02827239, + "epoch": 0.2689914324364948, + "flos": 20339624770560.0, + "grad_norm": 1.6599419897783567, + "language_loss": 0.82790148, + "learning_rate": 3.4301637647835843e-06, + "loss": 0.84987748, + "num_input_tokens_seen": 96781890, + "router_z_loss_clip": 0.92431641, + "router_z_loss_mlp": 0.16564941, + "step": 4474, + "time_per_iteration": 2.5275919437408447 + }, + { + "auxiliary_loss_clip": 0.01152983, + "auxiliary_loss_mlp": 0.01049869, + "balance_loss_clip": 1.06133974, + "balance_loss_mlp": 1.03326285, + "epoch": 0.2690515556891628, + "flos": 19464553635840.0, + "grad_norm": 1.762510800019504, + "language_loss": 0.70172763, + "learning_rate": 3.4298914881025494e-06, + "loss": 0.72375619, + "num_input_tokens_seen": 96800390, + "router_z_loss_clip": 0.91601562, + "router_z_loss_mlp": 0.1661377, + "step": 4475, + "time_per_iteration": 2.6197996139526367 + }, + { + "auxiliary_loss_clip": 0.01160196, + "auxiliary_loss_mlp": 0.01039817, + "balance_loss_clip": 1.06371951, + "balance_loss_mlp": 1.02321064, + "epoch": 0.26911167894183075, + "flos": 18146631720960.0, + "grad_norm": 2.605771500150048, + "language_loss": 0.73450816, + "learning_rate": 3.4296191571998863e-06, + "loss": 0.75650823, + "num_input_tokens_seen": 96816685, + "router_z_loss_clip": 0.96484375, + "router_z_loss_mlp": 0.1661377, + "step": 4476, + "time_per_iteration": 2.562223434448242 + }, + { + "auxiliary_loss_clip": 0.01161694, + "auxiliary_loss_mlp": 0.01041338, + "balance_loss_clip": 1.06880307, + "balance_loss_mlp": 1.02499425, + "epoch": 0.2691718021944987, + "flos": 19975131509760.0, + "grad_norm": 1.6761692533019477, + "language_loss": 0.80528951, + "learning_rate": 3.429346772085922e-06, + "loss": 0.8273198, + "num_input_tokens_seen": 96836285, + "router_z_loss_clip": 0.92919922, + "router_z_loss_mlp": 0.16345215, + "step": 4477, + "time_per_iteration": 2.4459784030914307 + }, + { + "auxiliary_loss_clip": 0.0115534, + "auxiliary_loss_mlp": 0.01046213, + "balance_loss_clip": 1.06056881, + "balance_loss_mlp": 1.02908254, + "epoch": 0.2692319254471667, + "flos": 37447215770880.0, + "grad_norm": 1.6418655941320603, + "language_loss": 0.65133834, + "learning_rate": 3.429074332770984e-06, + "loss": 0.67335391, + "num_input_tokens_seen": 96857745, + "router_z_loss_clip": 0.94824219, + "router_z_loss_mlp": 0.17138672, + "step": 4478, + "time_per_iteration": 2.65516996383667 + }, + { + "auxiliary_loss_clip": 0.01160015, + "auxiliary_loss_mlp": 0.01046689, + "balance_loss_clip": 1.06580639, + "balance_loss_mlp": 1.0303812, + "epoch": 0.26929204869983464, + "flos": 22127796564480.0, + "grad_norm": 1.8084458398389656, + "language_loss": 0.81025195, + "learning_rate": 3.4288018392654047e-06, + "loss": 0.83231902, + "num_input_tokens_seen": 96877295, + "router_z_loss_clip": 0.94091797, + "router_z_loss_mlp": 0.16308594, + "step": 4479, + "time_per_iteration": 2.4597291946411133 + }, + { + "auxiliary_loss_clip": 0.01153439, + "auxiliary_loss_mlp": 0.01049186, + "balance_loss_clip": 1.05735779, + "balance_loss_mlp": 1.03263998, + "epoch": 0.2693521719525026, + "flos": 19792813052160.0, + "grad_norm": 2.2591067202036776, + "language_loss": 0.80639911, + "learning_rate": 3.4285292915795166e-06, + "loss": 0.82842535, + "num_input_tokens_seen": 96896160, + "router_z_loss_clip": 0.9609375, + "router_z_loss_mlp": 0.1652832, + "step": 4480, + "time_per_iteration": 2.5601582527160645 + }, + { + "auxiliary_loss_clip": 0.01145245, + "auxiliary_loss_mlp": 0.01037987, + "balance_loss_clip": 1.05463815, + "balance_loss_mlp": 1.02188206, + "epoch": 0.2694122952051706, + "flos": 20994383836800.0, + "grad_norm": 1.6998850446017189, + "language_loss": 0.77512658, + "learning_rate": 3.4282566897236543e-06, + "loss": 0.79695886, + "num_input_tokens_seen": 96915410, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.16125488, + "step": 4481, + "time_per_iteration": 2.4723291397094727 + }, + { + "auxiliary_loss_clip": 0.01159167, + "auxiliary_loss_mlp": 0.01045287, + "balance_loss_clip": 1.06340146, + "balance_loss_mlp": 1.02797771, + "epoch": 0.2694724184578386, + "flos": 25849291011840.0, + "grad_norm": 1.8269551904470616, + "language_loss": 0.73795915, + "learning_rate": 3.4279840337081547e-06, + "loss": 0.76000369, + "num_input_tokens_seen": 96937865, + "router_z_loss_clip": 0.95800781, + "router_z_loss_mlp": 0.17297363, + "step": 4482, + "time_per_iteration": 2.524484157562256 + }, + { + "auxiliary_loss_clip": 0.01159399, + "auxiliary_loss_mlp": 0.01041855, + "balance_loss_clip": 1.06529629, + "balance_loss_mlp": 1.02476001, + "epoch": 0.26953254171050656, + "flos": 21726961718400.0, + "grad_norm": 1.9693554144576295, + "language_loss": 0.72342825, + "learning_rate": 3.4277113235433584e-06, + "loss": 0.74544084, + "num_input_tokens_seen": 96957710, + "router_z_loss_clip": 0.94042969, + "router_z_loss_mlp": 0.17102051, + "step": 4483, + "time_per_iteration": 2.4485180377960205 + }, + { + "auxiliary_loss_clip": 0.01153482, + "auxiliary_loss_mlp": 0.01047552, + "balance_loss_clip": 1.05681372, + "balance_loss_mlp": 1.02927744, + "epoch": 0.2695926649631745, + "flos": 19682926369920.0, + "grad_norm": 2.757282744525307, + "language_loss": 0.87016904, + "learning_rate": 3.427438559239605e-06, + "loss": 0.89217937, + "num_input_tokens_seen": 96975890, + "router_z_loss_clip": 0.96630859, + "router_z_loss_mlp": 0.18273926, + "step": 4484, + "time_per_iteration": 2.493175506591797 + }, + { + "auxiliary_loss_clip": 0.01156372, + "auxiliary_loss_mlp": 0.01049338, + "balance_loss_clip": 1.05963635, + "balance_loss_mlp": 1.03256559, + "epoch": 0.2696527882158425, + "flos": 32886596724480.0, + "grad_norm": 1.635136471984027, + "language_loss": 0.66583729, + "learning_rate": 3.427165740807239e-06, + "loss": 0.68789434, + "num_input_tokens_seen": 96998595, + "router_z_loss_clip": 0.96679688, + "router_z_loss_mlp": 0.16748047, + "step": 4485, + "time_per_iteration": 2.5987696647644043 + }, + { + "auxiliary_loss_clip": 0.01155928, + "auxiliary_loss_mlp": 0.01047583, + "balance_loss_clip": 1.06027102, + "balance_loss_mlp": 1.03041673, + "epoch": 0.26971291146851045, + "flos": 12124843320960.0, + "grad_norm": 2.3631876781884738, + "language_loss": 0.72613704, + "learning_rate": 3.426892868256604e-06, + "loss": 0.74817216, + "num_input_tokens_seen": 97013715, + "router_z_loss_clip": 0.95800781, + "router_z_loss_mlp": 0.17163086, + "step": 4486, + "time_per_iteration": 2.470759868621826 + }, + { + "auxiliary_loss_clip": 0.01165714, + "auxiliary_loss_mlp": 0.01042881, + "balance_loss_clip": 1.06697583, + "balance_loss_mlp": 1.02614355, + "epoch": 0.2697730347211784, + "flos": 22634459856000.0, + "grad_norm": 2.0460186298994474, + "language_loss": 0.84684181, + "learning_rate": 3.4266199415980495e-06, + "loss": 0.86892772, + "num_input_tokens_seen": 97031570, + "router_z_loss_clip": 0.98730469, + "router_z_loss_mlp": 0.16723633, + "step": 4487, + "time_per_iteration": 2.5217926502227783 + }, + { + "auxiliary_loss_clip": 0.01161285, + "auxiliary_loss_mlp": 0.01047646, + "balance_loss_clip": 1.06450844, + "balance_loss_mlp": 1.02971745, + "epoch": 0.2698331579738464, + "flos": 23513050523520.0, + "grad_norm": 2.016040702047063, + "language_loss": 0.71540177, + "learning_rate": 3.4263469608419234e-06, + "loss": 0.73749113, + "num_input_tokens_seen": 97049815, + "router_z_loss_clip": 0.96777344, + "router_z_loss_mlp": 0.17932129, + "step": 4488, + "time_per_iteration": 2.614685535430908 + }, + { + "auxiliary_loss_clip": 0.01159536, + "auxiliary_loss_mlp": 0.01047182, + "balance_loss_clip": 1.06444025, + "balance_loss_mlp": 1.02995646, + "epoch": 0.26989328122651435, + "flos": 24641040297600.0, + "grad_norm": 1.8572349567567443, + "language_loss": 0.84020019, + "learning_rate": 3.426073925998578e-06, + "loss": 0.86226737, + "num_input_tokens_seen": 97067570, + "router_z_loss_clip": 0.94921875, + "router_z_loss_mlp": 0.17224121, + "step": 4489, + "time_per_iteration": 2.5660881996154785 + }, + { + "auxiliary_loss_clip": 0.01165882, + "auxiliary_loss_mlp": 0.01053909, + "balance_loss_clip": 1.06803823, + "balance_loss_mlp": 1.03627789, + "epoch": 0.2699534044791823, + "flos": 10772555068800.0, + "grad_norm": 2.5782879795099145, + "language_loss": 0.9032799, + "learning_rate": 3.4258008370783656e-06, + "loss": 0.92547786, + "num_input_tokens_seen": 97082180, + "router_z_loss_clip": 0.97998047, + "router_z_loss_mlp": 0.17626953, + "step": 4490, + "time_per_iteration": 2.4481868743896484 + }, + { + "auxiliary_loss_clip": 0.01155854, + "auxiliary_loss_mlp": 0.01039784, + "balance_loss_clip": 1.0634861, + "balance_loss_mlp": 1.02298784, + "epoch": 0.2700135277318503, + "flos": 36171597098880.0, + "grad_norm": 1.789805879542429, + "language_loss": 0.73420286, + "learning_rate": 3.4255276940916434e-06, + "loss": 0.75615919, + "num_input_tokens_seen": 97103470, + "router_z_loss_clip": 0.92382812, + "router_z_loss_mlp": 0.16784668, + "step": 4491, + "time_per_iteration": 2.639054536819458 + }, + { + "auxiliary_loss_clip": 0.01161656, + "auxiliary_loss_mlp": 0.01043633, + "balance_loss_clip": 1.06504011, + "balance_loss_mlp": 1.02598453, + "epoch": 0.27007365098451824, + "flos": 17418614866560.0, + "grad_norm": 2.4876466696202955, + "language_loss": 0.74613112, + "learning_rate": 3.4252544970487676e-06, + "loss": 0.76818395, + "num_input_tokens_seen": 97118100, + "router_z_loss_clip": 0.96484375, + "router_z_loss_mlp": 0.17645264, + "step": 4492, + "time_per_iteration": 2.526050329208374 + }, + { + "auxiliary_loss_clip": 0.01157679, + "auxiliary_loss_mlp": 0.01044665, + "balance_loss_clip": 1.0627811, + "balance_loss_mlp": 1.02763057, + "epoch": 0.2701337742371862, + "flos": 23185688947200.0, + "grad_norm": 1.7480299783749218, + "language_loss": 0.89397359, + "learning_rate": 3.4249812459600986e-06, + "loss": 0.91599703, + "num_input_tokens_seen": 97136765, + "router_z_loss_clip": 0.94921875, + "router_z_loss_mlp": 0.17053223, + "step": 4493, + "time_per_iteration": 2.5118420124053955 + }, + { + "auxiliary_loss_clip": 0.01160681, + "auxiliary_loss_mlp": 0.01045142, + "balance_loss_clip": 1.0654223, + "balance_loss_mlp": 1.02841699, + "epoch": 0.2701938974898542, + "flos": 24389450461440.0, + "grad_norm": 1.6323432758154255, + "language_loss": 0.71418351, + "learning_rate": 3.424707940835998e-06, + "loss": 0.73624176, + "num_input_tokens_seen": 97157470, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.16748047, + "step": 4494, + "time_per_iteration": 2.5138649940490723 + }, + { + "auxiliary_loss_clip": 0.01159598, + "auxiliary_loss_mlp": 0.01040051, + "balance_loss_clip": 1.06680441, + "balance_loss_mlp": 1.02348113, + "epoch": 0.2702540207425222, + "flos": 26214322976640.0, + "grad_norm": 1.9200924404536617, + "language_loss": 0.86376852, + "learning_rate": 3.42443458168683e-06, + "loss": 0.88576502, + "num_input_tokens_seen": 97176905, + "router_z_loss_clip": 0.92822266, + "router_z_loss_mlp": 0.16564941, + "step": 4495, + "time_per_iteration": 2.5068581104278564 + }, + { + "auxiliary_loss_clip": 0.01155791, + "auxiliary_loss_mlp": 0.01041823, + "balance_loss_clip": 1.06226707, + "balance_loss_mlp": 1.02440631, + "epoch": 0.27031414399519016, + "flos": 22926377687040.0, + "grad_norm": 1.8224823343403223, + "language_loss": 0.76818037, + "learning_rate": 3.424161168522959e-06, + "loss": 0.79015648, + "num_input_tokens_seen": 97196380, + "router_z_loss_clip": 0.93652344, + "router_z_loss_mlp": 0.1739502, + "step": 4496, + "time_per_iteration": 3.9038655757904053 + }, + { + "auxiliary_loss_clip": 0.01095997, + "auxiliary_loss_mlp": 0.01007136, + "balance_loss_clip": 1.06400478, + "balance_loss_mlp": 1.00532138, + "epoch": 0.2703742672478581, + "flos": 63019780404480.0, + "grad_norm": 0.7225610021166506, + "language_loss": 0.50185645, + "learning_rate": 3.423887701354754e-06, + "loss": 0.52288771, + "num_input_tokens_seen": 97260100, + "router_z_loss_clip": 0.31933594, + "router_z_loss_mlp": 0.01815796, + "step": 4497, + "time_per_iteration": 3.1627395153045654 + }, + { + "auxiliary_loss_clip": 0.01166366, + "auxiliary_loss_mlp": 0.01041906, + "balance_loss_clip": 1.07179022, + "balance_loss_mlp": 1.02565789, + "epoch": 0.2704343905005261, + "flos": 18840820942080.0, + "grad_norm": 1.9974581130202538, + "language_loss": 0.72433513, + "learning_rate": 3.4236141801925847e-06, + "loss": 0.74641788, + "num_input_tokens_seen": 97277935, + "router_z_loss_clip": 0.94482422, + "router_z_loss_mlp": 0.16247559, + "step": 4498, + "time_per_iteration": 2.494513511657715 + }, + { + "auxiliary_loss_clip": 0.01092806, + "auxiliary_loss_mlp": 0.01004928, + "balance_loss_clip": 1.06114447, + "balance_loss_mlp": 1.00320554, + "epoch": 0.27049451375319405, + "flos": 71233412618880.0, + "grad_norm": 0.7591633970342051, + "language_loss": 0.59190071, + "learning_rate": 3.4233406050468237e-06, + "loss": 0.61287808, + "num_input_tokens_seen": 97338845, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.01724243, + "step": 4499, + "time_per_iteration": 3.1352157592773438 + }, + { + "auxiliary_loss_clip": 0.01158726, + "auxiliary_loss_mlp": 0.01038675, + "balance_loss_clip": 1.06606746, + "balance_loss_mlp": 1.02143693, + "epoch": 0.270554637005862, + "flos": 24278594112000.0, + "grad_norm": 1.885319110574437, + "language_loss": 0.73671591, + "learning_rate": 3.4230669759278438e-06, + "loss": 0.75868994, + "num_input_tokens_seen": 97356640, + "router_z_loss_clip": 0.92578125, + "router_z_loss_mlp": 0.17236328, + "step": 4500, + "time_per_iteration": 2.5354161262512207 + }, + { + "auxiliary_loss_clip": 0.01163966, + "auxiliary_loss_mlp": 0.01045995, + "balance_loss_clip": 1.06759834, + "balance_loss_mlp": 1.02869821, + "epoch": 0.27061476025853, + "flos": 17632318832640.0, + "grad_norm": 2.8661176955241023, + "language_loss": 0.80864429, + "learning_rate": 3.4227932928460215e-06, + "loss": 0.83074385, + "num_input_tokens_seen": 97372585, + "router_z_loss_clip": 0.96289062, + "router_z_loss_mlp": 0.17279053, + "step": 4501, + "time_per_iteration": 2.5453460216522217 + }, + { + "auxiliary_loss_clip": 0.01158621, + "auxiliary_loss_mlp": 0.01054243, + "balance_loss_clip": 1.06253397, + "balance_loss_mlp": 1.03490674, + "epoch": 0.27067488351119795, + "flos": 22710123855360.0, + "grad_norm": 1.7500476903792932, + "language_loss": 0.72602093, + "learning_rate": 3.422519555811735e-06, + "loss": 0.74814951, + "num_input_tokens_seen": 97393315, + "router_z_loss_clip": 0.96142578, + "router_z_loss_mlp": 0.19348145, + "step": 4502, + "time_per_iteration": 2.512805938720703 + }, + { + "auxiliary_loss_clip": 0.01163676, + "auxiliary_loss_mlp": 0.01039141, + "balance_loss_clip": 1.0649929, + "balance_loss_mlp": 1.01997256, + "epoch": 0.2707350067638659, + "flos": 41719616087040.0, + "grad_norm": 1.9086587415911003, + "language_loss": 0.68163645, + "learning_rate": 3.4222457648353642e-06, + "loss": 0.7036646, + "num_input_tokens_seen": 97417860, + "router_z_loss_clip": 0.98632812, + "router_z_loss_mlp": 0.19152832, + "step": 4503, + "time_per_iteration": 2.6747403144836426 + }, + { + "auxiliary_loss_clip": 0.01159355, + "auxiliary_loss_mlp": 0.01039727, + "balance_loss_clip": 1.06577909, + "balance_loss_mlp": 1.02303755, + "epoch": 0.2707951300165339, + "flos": 20193037367040.0, + "grad_norm": 1.9040073819278662, + "language_loss": 0.68000579, + "learning_rate": 3.4219719199272918e-06, + "loss": 0.70199656, + "num_input_tokens_seen": 97436780, + "router_z_loss_clip": 0.93603516, + "router_z_loss_mlp": 0.16674805, + "step": 4504, + "time_per_iteration": 2.490752935409546 + }, + { + "auxiliary_loss_clip": 0.01159736, + "auxiliary_loss_mlp": 0.01046473, + "balance_loss_clip": 1.06727147, + "balance_loss_mlp": 1.03032029, + "epoch": 0.27085525326920185, + "flos": 21433966479360.0, + "grad_norm": 1.4664874298669845, + "language_loss": 0.75422621, + "learning_rate": 3.421698021097902e-06, + "loss": 0.77628827, + "num_input_tokens_seen": 97456190, + "router_z_loss_clip": 0.92431641, + "router_z_loss_mlp": 0.16149902, + "step": 4505, + "time_per_iteration": 2.553131341934204 + }, + { + "auxiliary_loss_clip": 0.01169559, + "auxiliary_loss_mlp": 0.01050498, + "balance_loss_clip": 1.0690676, + "balance_loss_mlp": 1.03222322, + "epoch": 0.2709153765218698, + "flos": 17675232606720.0, + "grad_norm": 1.9422092786841942, + "language_loss": 0.73779976, + "learning_rate": 3.42142406835758e-06, + "loss": 0.76000029, + "num_input_tokens_seen": 97474545, + "router_z_loss_clip": 1.00537109, + "router_z_loss_mlp": 0.18273926, + "step": 4506, + "time_per_iteration": 3.8521811962127686 + }, + { + "auxiliary_loss_clip": 0.01157008, + "auxiliary_loss_mlp": 0.01041789, + "balance_loss_clip": 1.0616641, + "balance_loss_mlp": 1.02393198, + "epoch": 0.2709754997745378, + "flos": 24456243801600.0, + "grad_norm": 2.1109059789319793, + "language_loss": 0.80699623, + "learning_rate": 3.421150061716715e-06, + "loss": 0.82898426, + "num_input_tokens_seen": 97494520, + "router_z_loss_clip": 0.95410156, + "router_z_loss_mlp": 0.17858887, + "step": 4507, + "time_per_iteration": 3.87424898147583 + }, + { + "auxiliary_loss_clip": 0.01086135, + "auxiliary_loss_mlp": 0.01003292, + "balance_loss_clip": 1.05461693, + "balance_loss_mlp": 1.00160217, + "epoch": 0.2710356230272058, + "flos": 65210798206080.0, + "grad_norm": 0.7480096887808111, + "language_loss": 0.50862634, + "learning_rate": 3.420876001185698e-06, + "loss": 0.52952063, + "num_input_tokens_seen": 97552455, + "router_z_loss_clip": 0.31494141, + "router_z_loss_mlp": 0.01693726, + "step": 4508, + "time_per_iteration": 2.987159013748169 + }, + { + "auxiliary_loss_clip": 0.01159996, + "auxiliary_loss_mlp": 0.01040064, + "balance_loss_clip": 1.0679127, + "balance_loss_mlp": 1.02373195, + "epoch": 0.27109574627987376, + "flos": 25484438615040.0, + "grad_norm": 1.8457114870752929, + "language_loss": 0.74572873, + "learning_rate": 3.4206018867749197e-06, + "loss": 0.76772928, + "num_input_tokens_seen": 97572650, + "router_z_loss_clip": 0.92138672, + "router_z_loss_mlp": 0.16333008, + "step": 4509, + "time_per_iteration": 2.5494816303253174 + }, + { + "auxiliary_loss_clip": 0.01157415, + "auxiliary_loss_mlp": 0.01040401, + "balance_loss_clip": 1.06774688, + "balance_loss_mlp": 1.02474904, + "epoch": 0.2711558695325417, + "flos": 19682782715520.0, + "grad_norm": 2.8184254126093298, + "language_loss": 0.72011745, + "learning_rate": 3.4203277184947757e-06, + "loss": 0.74209559, + "num_input_tokens_seen": 97591150, + "router_z_loss_clip": 0.89697266, + "router_z_loss_mlp": 0.15661621, + "step": 4510, + "time_per_iteration": 3.851834774017334 + }, + { + "auxiliary_loss_clip": 0.01165514, + "auxiliary_loss_mlp": 0.01037891, + "balance_loss_clip": 1.07015538, + "balance_loss_mlp": 1.02095103, + "epoch": 0.2712159927852097, + "flos": 18587758648320.0, + "grad_norm": 2.4043195321725817, + "language_loss": 0.70267296, + "learning_rate": 3.4200534963556627e-06, + "loss": 0.72470695, + "num_input_tokens_seen": 97607410, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.16943359, + "step": 4511, + "time_per_iteration": 2.469528913497925 + }, + { + "auxiliary_loss_clip": 0.01163417, + "auxiliary_loss_mlp": 0.01043972, + "balance_loss_clip": 1.06606424, + "balance_loss_mlp": 1.02649558, + "epoch": 0.27127611603787766, + "flos": 25630235919360.0, + "grad_norm": 2.151841751657216, + "language_loss": 0.80616236, + "learning_rate": 3.419779220367979e-06, + "loss": 0.82823622, + "num_input_tokens_seen": 97626870, + "router_z_loss_clip": 0.97412109, + "router_z_loss_mlp": 0.17468262, + "step": 4512, + "time_per_iteration": 2.532804012298584 + }, + { + "auxiliary_loss_clip": 0.01153649, + "auxiliary_loss_mlp": 0.01036556, + "balance_loss_clip": 1.06268942, + "balance_loss_mlp": 1.02143979, + "epoch": 0.2713362392905456, + "flos": 23148952312320.0, + "grad_norm": 1.5153679736988457, + "language_loss": 0.80628419, + "learning_rate": 3.419504890542124e-06, + "loss": 0.82818627, + "num_input_tokens_seen": 97646595, + "router_z_loss_clip": 0.91015625, + "router_z_loss_mlp": 0.15124512, + "step": 4513, + "time_per_iteration": 2.6171092987060547 + }, + { + "auxiliary_loss_clip": 0.01163452, + "auxiliary_loss_mlp": 0.01044061, + "balance_loss_clip": 1.06566453, + "balance_loss_mlp": 1.02766943, + "epoch": 0.2713963625432136, + "flos": 18366045949440.0, + "grad_norm": 1.8850145680238746, + "language_loss": 0.88047314, + "learning_rate": 3.4192305068885026e-06, + "loss": 0.90254831, + "num_input_tokens_seen": 97665485, + "router_z_loss_clip": 0.97802734, + "router_z_loss_mlp": 0.16394043, + "step": 4514, + "time_per_iteration": 2.4868218898773193 + }, + { + "auxiliary_loss_clip": 0.01157087, + "auxiliary_loss_mlp": 0.01043047, + "balance_loss_clip": 1.06445527, + "balance_loss_mlp": 1.02563095, + "epoch": 0.27145648579588155, + "flos": 22491751121280.0, + "grad_norm": 1.5937672485254895, + "language_loss": 0.9192307, + "learning_rate": 3.418956069417517e-06, + "loss": 0.94123203, + "num_input_tokens_seen": 97683800, + "router_z_loss_clip": 0.92675781, + "router_z_loss_mlp": 0.17419434, + "step": 4515, + "time_per_iteration": 2.4944887161254883 + }, + { + "auxiliary_loss_clip": 0.0116352, + "auxiliary_loss_mlp": 0.01051445, + "balance_loss_clip": 1.06551361, + "balance_loss_mlp": 1.03196645, + "epoch": 0.2715166090485495, + "flos": 19239177749760.0, + "grad_norm": 2.6039363658655836, + "language_loss": 0.74260187, + "learning_rate": 3.4186815781395756e-06, + "loss": 0.76475149, + "num_input_tokens_seen": 97700505, + "router_z_loss_clip": 0.98095703, + "router_z_loss_mlp": 0.19482422, + "step": 4516, + "time_per_iteration": 2.4646997451782227 + }, + { + "auxiliary_loss_clip": 0.01152838, + "auxiliary_loss_mlp": 0.01045511, + "balance_loss_clip": 1.05903482, + "balance_loss_mlp": 1.02780855, + "epoch": 0.2715767323012175, + "flos": 17709598944000.0, + "grad_norm": 1.8036459799472935, + "language_loss": 0.76015407, + "learning_rate": 3.4184070330650866e-06, + "loss": 0.78213751, + "num_input_tokens_seen": 97717410, + "router_z_loss_clip": 0.93701172, + "router_z_loss_mlp": 0.17700195, + "step": 4517, + "time_per_iteration": 2.44779372215271 + }, + { + "auxiliary_loss_clip": 0.01157918, + "auxiliary_loss_mlp": 0.01042213, + "balance_loss_clip": 1.06542659, + "balance_loss_mlp": 1.02455807, + "epoch": 0.27163685555388545, + "flos": 22382834106240.0, + "grad_norm": 2.6993993153399773, + "language_loss": 0.77451628, + "learning_rate": 3.4181324342044607e-06, + "loss": 0.79651761, + "num_input_tokens_seen": 97734545, + "router_z_loss_clip": 0.92431641, + "router_z_loss_mlp": 0.17651367, + "step": 4518, + "time_per_iteration": 2.490269660949707 + }, + { + "auxiliary_loss_clip": 0.01155252, + "auxiliary_loss_mlp": 0.01039054, + "balance_loss_clip": 1.06258285, + "balance_loss_mlp": 1.02390838, + "epoch": 0.2716969788065534, + "flos": 22346708002560.0, + "grad_norm": 1.745537679054592, + "language_loss": 0.68467867, + "learning_rate": 3.41785778156811e-06, + "loss": 0.70662171, + "num_input_tokens_seen": 97754000, + "router_z_loss_clip": 0.92626953, + "router_z_loss_mlp": 0.15142822, + "step": 4519, + "time_per_iteration": 2.4472641944885254 + }, + { + "auxiliary_loss_clip": 0.01153131, + "auxiliary_loss_mlp": 0.01036446, + "balance_loss_clip": 1.05959535, + "balance_loss_mlp": 1.02017426, + "epoch": 0.2717571020592214, + "flos": 25228467319680.0, + "grad_norm": 1.9622482534701615, + "language_loss": 0.75780094, + "learning_rate": 3.417583075166451e-06, + "loss": 0.7796967, + "num_input_tokens_seen": 97772080, + "router_z_loss_clip": 0.93652344, + "router_z_loss_mlp": 0.16271973, + "step": 4520, + "time_per_iteration": 2.507308006286621 + }, + { + "auxiliary_loss_clip": 0.01156639, + "auxiliary_loss_mlp": 0.01050329, + "balance_loss_clip": 1.06134582, + "balance_loss_mlp": 1.03133893, + "epoch": 0.2718172253118894, + "flos": 20189769229440.0, + "grad_norm": 2.1837591483947834, + "language_loss": 0.76382601, + "learning_rate": 3.4173083150099e-06, + "loss": 0.78589565, + "num_input_tokens_seen": 97789370, + "router_z_loss_clip": 0.95361328, + "router_z_loss_mlp": 0.18994141, + "step": 4521, + "time_per_iteration": 2.4968466758728027 + }, + { + "auxiliary_loss_clip": 0.01157186, + "auxiliary_loss_mlp": 0.01044476, + "balance_loss_clip": 1.0617187, + "balance_loss_mlp": 1.02705932, + "epoch": 0.27187734856455736, + "flos": 14319129260160.0, + "grad_norm": 2.7757157454530246, + "language_loss": 0.75149488, + "learning_rate": 3.417033501108875e-06, + "loss": 0.77351153, + "num_input_tokens_seen": 97807385, + "router_z_loss_clip": 0.95605469, + "router_z_loss_mlp": 0.17419434, + "step": 4522, + "time_per_iteration": 2.4508676528930664 + }, + { + "auxiliary_loss_clip": 0.01157717, + "auxiliary_loss_mlp": 0.0103773, + "balance_loss_clip": 1.06260598, + "balance_loss_mlp": 1.02082598, + "epoch": 0.27193747181722533, + "flos": 21107682311040.0, + "grad_norm": 2.092022862466892, + "language_loss": 0.72818017, + "learning_rate": 3.416758633473798e-06, + "loss": 0.75013465, + "num_input_tokens_seen": 97827930, + "router_z_loss_clip": 0.95117188, + "router_z_loss_mlp": 0.16894531, + "step": 4523, + "time_per_iteration": 2.480290651321411 + }, + { + "auxiliary_loss_clip": 0.01151761, + "auxiliary_loss_mlp": 0.0103632, + "balance_loss_clip": 1.06127584, + "balance_loss_mlp": 1.01958311, + "epoch": 0.2719975950698933, + "flos": 19682782715520.0, + "grad_norm": 1.490783343275982, + "language_loss": 0.74200404, + "learning_rate": 3.4164837121150915e-06, + "loss": 0.76388478, + "num_input_tokens_seen": 97847440, + "router_z_loss_clip": 0.90478516, + "router_z_loss_mlp": 0.16748047, + "step": 4524, + "time_per_iteration": 2.4788479804992676 + }, + { + "auxiliary_loss_clip": 0.0117309, + "auxiliary_loss_mlp": 0.01036932, + "balance_loss_clip": 1.0745554, + "balance_loss_mlp": 1.02055216, + "epoch": 0.27205771832256126, + "flos": 24754482426240.0, + "grad_norm": 1.5386958097958698, + "language_loss": 0.76349115, + "learning_rate": 3.4162087370431803e-06, + "loss": 0.78559136, + "num_input_tokens_seen": 97867620, + "router_z_loss_clip": 0.98535156, + "router_z_loss_mlp": 0.16381836, + "step": 4525, + "time_per_iteration": 2.589061975479126 + }, + { + "auxiliary_loss_clip": 0.01155668, + "auxiliary_loss_mlp": 0.01046222, + "balance_loss_clip": 1.06527543, + "balance_loss_mlp": 1.02961648, + "epoch": 0.2721178415752292, + "flos": 21755581879680.0, + "grad_norm": 1.9791924277423363, + "language_loss": 0.81346297, + "learning_rate": 3.4159337082684926e-06, + "loss": 0.83548188, + "num_input_tokens_seen": 97884345, + "router_z_loss_clip": 0.90429688, + "router_z_loss_mlp": 0.16589355, + "step": 4526, + "time_per_iteration": 2.5356414318084717 + }, + { + "auxiliary_loss_clip": 0.01160126, + "auxiliary_loss_mlp": 0.01043605, + "balance_loss_clip": 1.06298757, + "balance_loss_mlp": 1.0254252, + "epoch": 0.2721779648278972, + "flos": 12676826597760.0, + "grad_norm": 3.1923838038737897, + "language_loss": 0.77935588, + "learning_rate": 3.4156586258014566e-06, + "loss": 0.80139315, + "num_input_tokens_seen": 97901500, + "router_z_loss_clip": 0.97167969, + "router_z_loss_mlp": 0.1817627, + "step": 4527, + "time_per_iteration": 2.513779401779175 + }, + { + "auxiliary_loss_clip": 0.01156755, + "auxiliary_loss_mlp": 0.01043814, + "balance_loss_clip": 1.0645709, + "balance_loss_mlp": 1.02621913, + "epoch": 0.27223808808056515, + "flos": 16253206099200.0, + "grad_norm": 2.1337835226142214, + "language_loss": 0.82128567, + "learning_rate": 3.415383489652503e-06, + "loss": 0.84329134, + "num_input_tokens_seen": 97917800, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.17590332, + "step": 4528, + "time_per_iteration": 2.5049543380737305 + }, + { + "auxiliary_loss_clip": 0.01153634, + "auxiliary_loss_mlp": 0.01043305, + "balance_loss_clip": 1.06244659, + "balance_loss_mlp": 1.02776027, + "epoch": 0.2722982113332331, + "flos": 27745805203200.0, + "grad_norm": 2.03975692978881, + "language_loss": 0.77436996, + "learning_rate": 3.4151082998320666e-06, + "loss": 0.79633933, + "num_input_tokens_seen": 97937225, + "router_z_loss_clip": 0.91210938, + "router_z_loss_mlp": 0.15551758, + "step": 4529, + "time_per_iteration": 2.5871670246124268 + }, + { + "auxiliary_loss_clip": 0.01160653, + "auxiliary_loss_mlp": 0.01046541, + "balance_loss_clip": 1.0664022, + "balance_loss_mlp": 1.03099597, + "epoch": 0.2723583345859011, + "flos": 21726243446400.0, + "grad_norm": 2.144708695167077, + "language_loss": 0.82455963, + "learning_rate": 3.4148330563505805e-06, + "loss": 0.84663159, + "num_input_tokens_seen": 97956845, + "router_z_loss_clip": 0.94287109, + "router_z_loss_mlp": 0.15551758, + "step": 4530, + "time_per_iteration": 2.5850913524627686 + }, + { + "auxiliary_loss_clip": 0.01165762, + "auxiliary_loss_mlp": 0.01041176, + "balance_loss_clip": 1.06778753, + "balance_loss_mlp": 1.02415299, + "epoch": 0.27241845783856905, + "flos": 17347260499200.0, + "grad_norm": 2.056711729611813, + "language_loss": 0.92109573, + "learning_rate": 3.4145577592184838e-06, + "loss": 0.94316506, + "num_input_tokens_seen": 97972465, + "router_z_loss_clip": 0.97802734, + "router_z_loss_mlp": 0.17028809, + "step": 4531, + "time_per_iteration": 2.8940377235412598 + }, + { + "auxiliary_loss_clip": 0.0115973, + "auxiliary_loss_mlp": 0.01043399, + "balance_loss_clip": 1.06292415, + "balance_loss_mlp": 1.02661407, + "epoch": 0.272478581091237, + "flos": 24754302858240.0, + "grad_norm": 3.4104134451316135, + "language_loss": 0.76309597, + "learning_rate": 3.4142824084462155e-06, + "loss": 0.78512728, + "num_input_tokens_seen": 97990770, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.16772461, + "step": 4532, + "time_per_iteration": 2.620234251022339 + }, + { + "auxiliary_loss_clip": 0.01156815, + "auxiliary_loss_mlp": 0.01036552, + "balance_loss_clip": 1.06610513, + "balance_loss_mlp": 1.02088785, + "epoch": 0.272538704343905, + "flos": 17890624512000.0, + "grad_norm": 4.9992033328018985, + "language_loss": 0.88649774, + "learning_rate": 3.4140070040442162e-06, + "loss": 0.90843141, + "num_input_tokens_seen": 98005775, + "router_z_loss_clip": 0.90673828, + "router_z_loss_mlp": 0.15673828, + "step": 4533, + "time_per_iteration": 2.7565953731536865 + }, + { + "auxiliary_loss_clip": 0.01159376, + "auxiliary_loss_mlp": 0.0103544, + "balance_loss_clip": 1.06978488, + "balance_loss_mlp": 1.02012205, + "epoch": 0.272598827596573, + "flos": 22932016122240.0, + "grad_norm": 1.9701918325313608, + "language_loss": 0.71532142, + "learning_rate": 3.413731546022929e-06, + "loss": 0.73726964, + "num_input_tokens_seen": 98025750, + "router_z_loss_clip": 0.89599609, + "router_z_loss_mlp": 0.15307617, + "step": 4534, + "time_per_iteration": 2.7761805057525635 + }, + { + "auxiliary_loss_clip": 0.01154614, + "auxiliary_loss_mlp": 0.01041206, + "balance_loss_clip": 1.06164718, + "balance_loss_mlp": 1.02412367, + "epoch": 0.27265895084924097, + "flos": 24238409771520.0, + "grad_norm": 1.950852624516575, + "language_loss": 0.91530037, + "learning_rate": 3.4134560343928005e-06, + "loss": 0.9372586, + "num_input_tokens_seen": 98044955, + "router_z_loss_clip": 0.93017578, + "router_z_loss_mlp": 0.17102051, + "step": 4535, + "time_per_iteration": 2.92952561378479 + }, + { + "auxiliary_loss_clip": 0.01161985, + "auxiliary_loss_mlp": 0.01046621, + "balance_loss_clip": 1.06515288, + "balance_loss_mlp": 1.02876377, + "epoch": 0.27271907410190893, + "flos": 27013155494400.0, + "grad_norm": 2.596506345996419, + "language_loss": 0.72775763, + "learning_rate": 3.4131804691642778e-06, + "loss": 0.74984372, + "num_input_tokens_seen": 98065860, + "router_z_loss_clip": 0.96777344, + "router_z_loss_mlp": 0.17871094, + "step": 4536, + "time_per_iteration": 2.7040791511535645 + }, + { + "auxiliary_loss_clip": 0.01160065, + "auxiliary_loss_mlp": 0.01040302, + "balance_loss_clip": 1.06527746, + "balance_loss_mlp": 1.02365398, + "epoch": 0.2727791973545769, + "flos": 34452588942720.0, + "grad_norm": 1.7569928230751686, + "language_loss": 0.71480495, + "learning_rate": 3.41290485034781e-06, + "loss": 0.73680866, + "num_input_tokens_seen": 98085450, + "router_z_loss_clip": 0.94824219, + "router_z_loss_mlp": 0.16656494, + "step": 4537, + "time_per_iteration": 2.7582955360412598 + }, + { + "auxiliary_loss_clip": 0.01151613, + "auxiliary_loss_mlp": 0.01039004, + "balance_loss_clip": 1.05948639, + "balance_loss_mlp": 1.02229118, + "epoch": 0.27283932060724486, + "flos": 15041723160960.0, + "grad_norm": 2.305330315196112, + "language_loss": 0.7871201, + "learning_rate": 3.4126291779538485e-06, + "loss": 0.80902624, + "num_input_tokens_seen": 98099115, + "router_z_loss_clip": 0.92285156, + "router_z_loss_mlp": 0.16711426, + "step": 4538, + "time_per_iteration": 2.5148630142211914 + }, + { + "auxiliary_loss_clip": 0.0115556, + "auxiliary_loss_mlp": 0.01045725, + "balance_loss_clip": 1.06035423, + "balance_loss_mlp": 1.02935743, + "epoch": 0.2728994438599128, + "flos": 21652411040640.0, + "grad_norm": 1.8650119546552149, + "language_loss": 0.90288043, + "learning_rate": 3.412353451992847e-06, + "loss": 0.92489326, + "num_input_tokens_seen": 98118415, + "router_z_loss_clip": 0.95214844, + "router_z_loss_mlp": 0.16369629, + "step": 4539, + "time_per_iteration": 2.6490862369537354 + }, + { + "auxiliary_loss_clip": 0.01158088, + "auxiliary_loss_mlp": 0.01041316, + "balance_loss_clip": 1.06405401, + "balance_loss_mlp": 1.02280283, + "epoch": 0.2729595671125808, + "flos": 17488424949120.0, + "grad_norm": 2.1997066886594223, + "language_loss": 0.87963188, + "learning_rate": 3.4120776724752607e-06, + "loss": 0.90162593, + "num_input_tokens_seen": 98136300, + "router_z_loss_clip": 0.94091797, + "router_z_loss_mlp": 0.18518066, + "step": 4540, + "time_per_iteration": 4.078470230102539 + }, + { + "auxiliary_loss_clip": 0.01157376, + "auxiliary_loss_mlp": 0.01036762, + "balance_loss_clip": 1.0631932, + "balance_loss_mlp": 1.02074015, + "epoch": 0.27301969036524876, + "flos": 19318145800320.0, + "grad_norm": 1.9998048582120262, + "language_loss": 0.81829989, + "learning_rate": 3.4118018394115476e-06, + "loss": 0.84024131, + "num_input_tokens_seen": 98154580, + "router_z_loss_clip": 0.94238281, + "router_z_loss_mlp": 0.16027832, + "step": 4541, + "time_per_iteration": 2.5355842113494873 + }, + { + "auxiliary_loss_clip": 0.01155779, + "auxiliary_loss_mlp": 0.01044455, + "balance_loss_clip": 1.06325841, + "balance_loss_mlp": 1.02790868, + "epoch": 0.2730798136179167, + "flos": 21065666376960.0, + "grad_norm": 1.9561910987637274, + "language_loss": 0.79709989, + "learning_rate": 3.4115259528121678e-06, + "loss": 0.81910217, + "num_input_tokens_seen": 98173115, + "router_z_loss_clip": 0.92431641, + "router_z_loss_mlp": 0.16564941, + "step": 4542, + "time_per_iteration": 2.546630382537842 + }, + { + "auxiliary_loss_clip": 0.01161245, + "auxiliary_loss_mlp": 0.01041059, + "balance_loss_clip": 1.0692693, + "balance_loss_mlp": 1.02509689, + "epoch": 0.2731399368705847, + "flos": 19171737964800.0, + "grad_norm": 2.4513939583682407, + "language_loss": 0.89522624, + "learning_rate": 3.411250012687582e-06, + "loss": 0.91724932, + "num_input_tokens_seen": 98190260, + "router_z_loss_clip": 0.91943359, + "router_z_loss_mlp": 0.15966797, + "step": 4543, + "time_per_iteration": 2.7319209575653076 + }, + { + "auxiliary_loss_clip": 0.01157273, + "auxiliary_loss_mlp": 0.01055063, + "balance_loss_clip": 1.06048107, + "balance_loss_mlp": 1.03581119, + "epoch": 0.27320006012325265, + "flos": 18290130554880.0, + "grad_norm": 1.9765003370991967, + "language_loss": 0.63686049, + "learning_rate": 3.410974019048255e-06, + "loss": 0.65898383, + "num_input_tokens_seen": 98207115, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.19262695, + "step": 4544, + "time_per_iteration": 2.5036098957061768 + }, + { + "auxiliary_loss_clip": 0.01156999, + "auxiliary_loss_mlp": 0.01043563, + "balance_loss_clip": 1.06560087, + "balance_loss_mlp": 1.02615809, + "epoch": 0.2732601833759206, + "flos": 34860929731200.0, + "grad_norm": 1.7444924301878082, + "language_loss": 0.69921553, + "learning_rate": 3.410697971904651e-06, + "loss": 0.72122109, + "num_input_tokens_seen": 98230610, + "router_z_loss_clip": 0.91259766, + "router_z_loss_mlp": 0.17407227, + "step": 4545, + "time_per_iteration": 2.5575125217437744 + }, + { + "auxiliary_loss_clip": 0.01125267, + "auxiliary_loss_mlp": 0.01028212, + "balance_loss_clip": 1.09106898, + "balance_loss_mlp": 1.02593803, + "epoch": 0.2733203066285886, + "flos": 53910824762880.0, + "grad_norm": 0.7323003758240697, + "language_loss": 0.61649168, + "learning_rate": 3.4104218712672383e-06, + "loss": 0.63802648, + "num_input_tokens_seen": 98293585, + "router_z_loss_clip": 0.34130859, + "router_z_loss_mlp": 0.0227356, + "step": 4546, + "time_per_iteration": 3.1084110736846924 + }, + { + "auxiliary_loss_clip": 0.01157644, + "auxiliary_loss_mlp": 0.01048456, + "balance_loss_clip": 1.06307697, + "balance_loss_mlp": 1.03102791, + "epoch": 0.2733804298812566, + "flos": 20660378244480.0, + "grad_norm": 2.0212708501479963, + "language_loss": 0.6481216, + "learning_rate": 3.410145717146488e-06, + "loss": 0.67018259, + "num_input_tokens_seen": 98311680, + "router_z_loss_clip": 0.94433594, + "router_z_loss_mlp": 0.17431641, + "step": 4547, + "time_per_iteration": 2.429996967315674 + }, + { + "auxiliary_loss_clip": 0.01154159, + "auxiliary_loss_mlp": 0.01036785, + "balance_loss_clip": 1.06425333, + "balance_loss_mlp": 1.02138329, + "epoch": 0.27344055313392457, + "flos": 25884339707520.0, + "grad_norm": 3.186561339862469, + "language_loss": 0.77316082, + "learning_rate": 3.4098695095528694e-06, + "loss": 0.79507023, + "num_input_tokens_seen": 98330770, + "router_z_loss_clip": 0.89941406, + "router_z_loss_mlp": 0.15393066, + "step": 4548, + "time_per_iteration": 2.584162950515747 + }, + { + "auxiliary_loss_clip": 0.01150603, + "auxiliary_loss_mlp": 0.0104778, + "balance_loss_clip": 1.06070781, + "balance_loss_mlp": 1.03216338, + "epoch": 0.27350067638659253, + "flos": 22929753565440.0, + "grad_norm": 1.8346698499919118, + "language_loss": 0.82461911, + "learning_rate": 3.4095932484968585e-06, + "loss": 0.84660292, + "num_input_tokens_seen": 98349860, + "router_z_loss_clip": 0.89892578, + "router_z_loss_mlp": 0.15594482, + "step": 4549, + "time_per_iteration": 2.4787023067474365 + }, + { + "auxiliary_loss_clip": 0.01153093, + "auxiliary_loss_mlp": 0.01044132, + "balance_loss_clip": 1.05826342, + "balance_loss_mlp": 1.02613139, + "epoch": 0.2735607996392605, + "flos": 16574821499520.0, + "grad_norm": 2.4029686809016875, + "language_loss": 0.70898783, + "learning_rate": 3.4093169339889305e-06, + "loss": 0.73096007, + "num_input_tokens_seen": 98367040, + "router_z_loss_clip": 0.94726562, + "router_z_loss_mlp": 0.18005371, + "step": 4550, + "time_per_iteration": 3.8399128913879395 + }, + { + "auxiliary_loss_clip": 0.01156949, + "auxiliary_loss_mlp": 0.0103656, + "balance_loss_clip": 1.06566799, + "balance_loss_mlp": 1.02146864, + "epoch": 0.27362092289192846, + "flos": 19645291895040.0, + "grad_norm": 1.9889478360585593, + "language_loss": 0.7889359, + "learning_rate": 3.409040566039563e-06, + "loss": 0.81087101, + "num_input_tokens_seen": 98384010, + "router_z_loss_clip": 0.91357422, + "router_z_loss_mlp": 0.15087891, + "step": 4551, + "time_per_iteration": 4.056796312332153 + }, + { + "auxiliary_loss_clip": 0.0115932, + "auxiliary_loss_mlp": 0.01040881, + "balance_loss_clip": 1.06434107, + "balance_loss_mlp": 1.02445388, + "epoch": 0.27368104614459643, + "flos": 17639142416640.0, + "grad_norm": 2.195252364434188, + "language_loss": 0.70410782, + "learning_rate": 3.4087641446592362e-06, + "loss": 0.7261098, + "num_input_tokens_seen": 98399625, + "router_z_loss_clip": 0.94970703, + "router_z_loss_mlp": 0.16442871, + "step": 4552, + "time_per_iteration": 2.4539196491241455 + }, + { + "auxiliary_loss_clip": 0.01154869, + "auxiliary_loss_mlp": 0.0103875, + "balance_loss_clip": 1.06178641, + "balance_loss_mlp": 1.02216756, + "epoch": 0.2737411693972644, + "flos": 21580015178880.0, + "grad_norm": 3.5977174279596067, + "language_loss": 0.71917969, + "learning_rate": 3.408487669858431e-06, + "loss": 0.74111593, + "num_input_tokens_seen": 98417310, + "router_z_loss_clip": 0.93115234, + "router_z_loss_mlp": 0.16589355, + "step": 4553, + "time_per_iteration": 2.5090701580047607 + }, + { + "auxiliary_loss_clip": 0.01151244, + "auxiliary_loss_mlp": 0.01035756, + "balance_loss_clip": 1.05990505, + "balance_loss_mlp": 1.01905465, + "epoch": 0.27380129264993236, + "flos": 25484043565440.0, + "grad_norm": 1.8518464956439873, + "language_loss": 0.59584153, + "learning_rate": 3.4082111416476337e-06, + "loss": 0.61771154, + "num_input_tokens_seen": 98438670, + "router_z_loss_clip": 0.91308594, + "router_z_loss_mlp": 0.16687012, + "step": 4554, + "time_per_iteration": 3.9832935333251953 + }, + { + "auxiliary_loss_clip": 0.01166756, + "auxiliary_loss_mlp": 0.01039075, + "balance_loss_clip": 1.07028353, + "balance_loss_mlp": 1.02267766, + "epoch": 0.2738614159026003, + "flos": 18661196004480.0, + "grad_norm": 1.7153443374956807, + "language_loss": 0.74161685, + "learning_rate": 3.4079345600373275e-06, + "loss": 0.76367521, + "num_input_tokens_seen": 98456060, + "router_z_loss_clip": 0.96484375, + "router_z_loss_mlp": 0.16387939, + "step": 4555, + "time_per_iteration": 2.6961240768432617 + }, + { + "auxiliary_loss_clip": 0.01155288, + "auxiliary_loss_mlp": 0.01037045, + "balance_loss_clip": 1.06279767, + "balance_loss_mlp": 1.02092767, + "epoch": 0.2739215391552683, + "flos": 23477139901440.0, + "grad_norm": 2.0418188760934806, + "language_loss": 0.78097486, + "learning_rate": 3.407657925038002e-06, + "loss": 0.80289817, + "num_input_tokens_seen": 98473765, + "router_z_loss_clip": 0.92626953, + "router_z_loss_mlp": 0.16125488, + "step": 4556, + "time_per_iteration": 2.499403238296509 + }, + { + "auxiliary_loss_clip": 0.01164017, + "auxiliary_loss_mlp": 0.01049457, + "balance_loss_clip": 1.06050789, + "balance_loss_mlp": 1.03069353, + "epoch": 0.27398166240793626, + "flos": 17128636369920.0, + "grad_norm": 1.8151794540650936, + "language_loss": 0.8214215, + "learning_rate": 3.4073812366601473e-06, + "loss": 0.84355617, + "num_input_tokens_seen": 98490590, + "router_z_loss_clip": 1.03515625, + "router_z_loss_mlp": 0.18762207, + "step": 4557, + "time_per_iteration": 2.4378209114074707 + }, + { + "auxiliary_loss_clip": 0.01153936, + "auxiliary_loss_mlp": 0.01042224, + "balance_loss_clip": 1.05919731, + "balance_loss_mlp": 1.02641678, + "epoch": 0.2740417856606042, + "flos": 23404744039680.0, + "grad_norm": 1.8247608949356553, + "language_loss": 0.72909486, + "learning_rate": 3.4071044949142547e-06, + "loss": 0.75105643, + "num_input_tokens_seen": 98510590, + "router_z_loss_clip": 0.94628906, + "router_z_loss_mlp": 0.15808105, + "step": 4558, + "time_per_iteration": 2.439122438430786 + }, + { + "auxiliary_loss_clip": 0.01151429, + "auxiliary_loss_mlp": 0.01058338, + "balance_loss_clip": 1.05811477, + "balance_loss_mlp": 1.03993249, + "epoch": 0.2741019089132722, + "flos": 12780428400000.0, + "grad_norm": 2.18970853380732, + "language_loss": 0.67960739, + "learning_rate": 3.406827699810819e-06, + "loss": 0.70170498, + "num_input_tokens_seen": 98527875, + "router_z_loss_clip": 0.93310547, + "router_z_loss_mlp": 0.1842041, + "step": 4559, + "time_per_iteration": 2.4216554164886475 + }, + { + "auxiliary_loss_clip": 0.01174531, + "auxiliary_loss_mlp": 0.0104645, + "balance_loss_clip": 1.07893515, + "balance_loss_mlp": 1.03057098, + "epoch": 0.27416203216594015, + "flos": 20631542601600.0, + "grad_norm": 1.7795170859835745, + "language_loss": 0.72289234, + "learning_rate": 3.4065508513603353e-06, + "loss": 0.74510217, + "num_input_tokens_seen": 98547575, + "router_z_loss_clip": 0.95703125, + "router_z_loss_mlp": 0.15869141, + "step": 4560, + "time_per_iteration": 2.518585205078125 + }, + { + "auxiliary_loss_clip": 0.0114936, + "auxiliary_loss_mlp": 0.0104092, + "balance_loss_clip": 1.05649126, + "balance_loss_mlp": 1.02473128, + "epoch": 0.27422215541860817, + "flos": 26541576812160.0, + "grad_norm": 1.6579620631611764, + "language_loss": 0.8181234, + "learning_rate": 3.406273949573303e-06, + "loss": 0.84002626, + "num_input_tokens_seen": 98566290, + "router_z_loss_clip": 0.92822266, + "router_z_loss_mlp": 0.16186523, + "step": 4561, + "time_per_iteration": 2.5645971298217773 + }, + { + "auxiliary_loss_clip": 0.01155484, + "auxiliary_loss_mlp": 0.01043534, + "balance_loss_clip": 1.06090355, + "balance_loss_mlp": 1.02750063, + "epoch": 0.27428227867127614, + "flos": 23331163029120.0, + "grad_norm": 1.8043498030526588, + "language_loss": 0.75489736, + "learning_rate": 3.4059969944602214e-06, + "loss": 0.77688754, + "num_input_tokens_seen": 98586255, + "router_z_loss_clip": 0.94580078, + "router_z_loss_mlp": 0.16027832, + "step": 4562, + "time_per_iteration": 2.4728097915649414 + }, + { + "auxiliary_loss_clip": 0.01152879, + "auxiliary_loss_mlp": 0.01037157, + "balance_loss_clip": 1.06038034, + "balance_loss_mlp": 1.02070642, + "epoch": 0.2743424019239441, + "flos": 23035115134080.0, + "grad_norm": 1.8143080268781426, + "language_loss": 0.74654895, + "learning_rate": 3.4057199860315928e-06, + "loss": 0.76844931, + "num_input_tokens_seen": 98606030, + "router_z_loss_clip": 0.92431641, + "router_z_loss_mlp": 0.16467285, + "step": 4563, + "time_per_iteration": 2.491982936859131 + }, + { + "auxiliary_loss_clip": 0.01164964, + "auxiliary_loss_mlp": 0.0104752, + "balance_loss_clip": 1.06493044, + "balance_loss_mlp": 1.02860153, + "epoch": 0.27440252517661207, + "flos": 21981101420160.0, + "grad_norm": 1.9084594891930664, + "language_loss": 0.62377179, + "learning_rate": 3.4054429242979213e-06, + "loss": 0.64589661, + "num_input_tokens_seen": 98625225, + "router_z_loss_clip": 1.00048828, + "router_z_loss_mlp": 0.18920898, + "step": 4564, + "time_per_iteration": 2.474724054336548 + }, + { + "auxiliary_loss_clip": 0.01162204, + "auxiliary_loss_mlp": 0.01041844, + "balance_loss_clip": 1.06944561, + "balance_loss_mlp": 1.02469027, + "epoch": 0.27446264842928003, + "flos": 40187451502080.0, + "grad_norm": 1.5701219007522804, + "language_loss": 0.78744453, + "learning_rate": 3.4051658092697135e-06, + "loss": 0.80948508, + "num_input_tokens_seen": 98649470, + "router_z_loss_clip": 0.92919922, + "router_z_loss_mlp": 0.17156982, + "step": 4565, + "time_per_iteration": 2.610731840133667 + }, + { + "auxiliary_loss_clip": 0.01151501, + "auxiliary_loss_mlp": 0.01046257, + "balance_loss_clip": 1.05969882, + "balance_loss_mlp": 1.02874494, + "epoch": 0.274522771681948, + "flos": 13479681438720.0, + "grad_norm": 2.153410984439584, + "language_loss": 0.68157512, + "learning_rate": 3.404888640957477e-06, + "loss": 0.70355272, + "num_input_tokens_seen": 98666915, + "router_z_loss_clip": 0.91796875, + "router_z_loss_mlp": 0.1751709, + "step": 4566, + "time_per_iteration": 2.485661268234253 + }, + { + "auxiliary_loss_clip": 0.01157826, + "auxiliary_loss_mlp": 0.01042675, + "balance_loss_clip": 1.06751168, + "balance_loss_mlp": 1.02742839, + "epoch": 0.27458289493461596, + "flos": 28622133313920.0, + "grad_norm": 1.8580153245944306, + "language_loss": 0.61376244, + "learning_rate": 3.404611419371723e-06, + "loss": 0.63576746, + "num_input_tokens_seen": 98688240, + "router_z_loss_clip": 0.90332031, + "router_z_loss_mlp": 0.15234375, + "step": 4567, + "time_per_iteration": 2.501220226287842 + }, + { + "auxiliary_loss_clip": 0.01157118, + "auxiliary_loss_mlp": 0.01041067, + "balance_loss_clip": 1.06304812, + "balance_loss_mlp": 1.02374566, + "epoch": 0.2746430181872839, + "flos": 20119815492480.0, + "grad_norm": 2.035749803469314, + "language_loss": 0.82463092, + "learning_rate": 3.4043341445229627e-06, + "loss": 0.84661275, + "num_input_tokens_seen": 98708245, + "router_z_loss_clip": 0.94091797, + "router_z_loss_mlp": 0.1730957, + "step": 4568, + "time_per_iteration": 2.4925665855407715 + }, + { + "auxiliary_loss_clip": 0.01152382, + "auxiliary_loss_mlp": 0.01044107, + "balance_loss_clip": 1.05909073, + "balance_loss_mlp": 1.02578473, + "epoch": 0.2747031414399519, + "flos": 20193468330240.0, + "grad_norm": 1.8266591597348458, + "language_loss": 0.68647957, + "learning_rate": 3.4040568164217117e-06, + "loss": 0.70844436, + "num_input_tokens_seen": 98724575, + "router_z_loss_clip": 0.93261719, + "router_z_loss_mlp": 0.18322754, + "step": 4569, + "time_per_iteration": 2.7557177543640137 + }, + { + "auxiliary_loss_clip": 0.01158359, + "auxiliary_loss_mlp": 0.01055663, + "balance_loss_clip": 1.06342673, + "balance_loss_mlp": 1.03579068, + "epoch": 0.27476326469261986, + "flos": 13516346246400.0, + "grad_norm": 2.242896386235205, + "language_loss": 0.70962346, + "learning_rate": 3.4037794350784848e-06, + "loss": 0.73176366, + "num_input_tokens_seen": 98740700, + "router_z_loss_clip": 0.94970703, + "router_z_loss_mlp": 0.19873047, + "step": 4570, + "time_per_iteration": 2.5248701572418213 + }, + { + "auxiliary_loss_clip": 0.01101292, + "auxiliary_loss_mlp": 0.01041603, + "balance_loss_clip": 1.06820214, + "balance_loss_mlp": 1.03957987, + "epoch": 0.2748233879452878, + "flos": 65937127121280.0, + "grad_norm": 0.72919622394182, + "language_loss": 0.55806702, + "learning_rate": 3.4035020005038014e-06, + "loss": 0.57949597, + "num_input_tokens_seen": 98803030, + "router_z_loss_clip": 0.33105469, + "router_z_loss_mlp": 0.02023315, + "step": 4571, + "time_per_iteration": 3.219947099685669 + }, + { + "auxiliary_loss_clip": 0.01158556, + "auxiliary_loss_mlp": 0.01048056, + "balance_loss_clip": 1.06171858, + "balance_loss_mlp": 1.03049636, + "epoch": 0.2748835111979558, + "flos": 17384212615680.0, + "grad_norm": 1.9706120026678835, + "language_loss": 0.76941216, + "learning_rate": 3.4032245127081812e-06, + "loss": 0.79147828, + "num_input_tokens_seen": 98820505, + "router_z_loss_clip": 0.96826172, + "router_z_loss_mlp": 0.17565918, + "step": 4572, + "time_per_iteration": 2.4437291622161865 + }, + { + "auxiliary_loss_clip": 0.01150388, + "auxiliary_loss_mlp": 0.01036081, + "balance_loss_clip": 1.06216359, + "balance_loss_mlp": 1.02166259, + "epoch": 0.27494363445062375, + "flos": 23587565287680.0, + "grad_norm": 1.519104756518543, + "language_loss": 0.81271583, + "learning_rate": 3.402946971702147e-06, + "loss": 0.83458054, + "num_input_tokens_seen": 98842150, + "router_z_loss_clip": 0.88183594, + "router_z_loss_mlp": 0.144104, + "step": 4573, + "time_per_iteration": 2.471224069595337 + }, + { + "auxiliary_loss_clip": 0.01153524, + "auxiliary_loss_mlp": 0.01036178, + "balance_loss_clip": 1.06342304, + "balance_loss_mlp": 1.01956046, + "epoch": 0.2750037577032918, + "flos": 17164582905600.0, + "grad_norm": 1.687862320188441, + "language_loss": 0.79080141, + "learning_rate": 3.402669377496223e-06, + "loss": 0.81269836, + "num_input_tokens_seen": 98861050, + "router_z_loss_clip": 0.90136719, + "router_z_loss_mlp": 0.16625977, + "step": 4574, + "time_per_iteration": 2.4487829208374023 + }, + { + "auxiliary_loss_clip": 0.01158034, + "auxiliary_loss_mlp": 0.01049228, + "balance_loss_clip": 1.06453943, + "balance_loss_mlp": 1.03286099, + "epoch": 0.27506388095595974, + "flos": 24491903028480.0, + "grad_norm": 1.8302141627725623, + "language_loss": 0.7419703, + "learning_rate": 3.402391730100936e-06, + "loss": 0.76404297, + "num_input_tokens_seen": 98879695, + "router_z_loss_clip": 0.93457031, + "router_z_loss_mlp": 0.16345215, + "step": 4575, + "time_per_iteration": 2.4502713680267334 + }, + { + "auxiliary_loss_clip": 0.01165544, + "auxiliary_loss_mlp": 0.01039833, + "balance_loss_clip": 1.07184768, + "balance_loss_mlp": 1.02419329, + "epoch": 0.2751240042086277, + "flos": 38764706722560.0, + "grad_norm": 1.6799151881623053, + "language_loss": 0.71352971, + "learning_rate": 3.402114029526814e-06, + "loss": 0.73558348, + "num_input_tokens_seen": 98902035, + "router_z_loss_clip": 0.93505859, + "router_z_loss_mlp": 0.15637207, + "step": 4576, + "time_per_iteration": 2.6256678104400635 + }, + { + "auxiliary_loss_clip": 0.01156883, + "auxiliary_loss_mlp": 0.01037517, + "balance_loss_clip": 1.06230986, + "balance_loss_mlp": 1.02057695, + "epoch": 0.27518412746129567, + "flos": 26907039740160.0, + "grad_norm": 1.6138594700684012, + "language_loss": 0.73260403, + "learning_rate": 3.4018362757843866e-06, + "loss": 0.75454801, + "num_input_tokens_seen": 98921835, + "router_z_loss_clip": 0.94580078, + "router_z_loss_mlp": 0.16918945, + "step": 4577, + "time_per_iteration": 2.491060733795166 + }, + { + "auxiliary_loss_clip": 0.01165403, + "auxiliary_loss_mlp": 0.0104309, + "balance_loss_clip": 1.06967473, + "balance_loss_mlp": 1.0256381, + "epoch": 0.27524425071396363, + "flos": 24900531125760.0, + "grad_norm": 2.0938529284857816, + "language_loss": 0.76160467, + "learning_rate": 3.401558468884188e-06, + "loss": 0.7836895, + "num_input_tokens_seen": 98939610, + "router_z_loss_clip": 0.95605469, + "router_z_loss_mlp": 0.17468262, + "step": 4578, + "time_per_iteration": 2.5308573246002197 + }, + { + "auxiliary_loss_clip": 0.01174643, + "auxiliary_loss_mlp": 0.01042748, + "balance_loss_clip": 1.07656717, + "balance_loss_mlp": 1.02376986, + "epoch": 0.2753043739666316, + "flos": 26288047641600.0, + "grad_norm": 1.4802846346459728, + "language_loss": 0.66592848, + "learning_rate": 3.4012806088367516e-06, + "loss": 0.68810236, + "num_input_tokens_seen": 98962250, + "router_z_loss_clip": 0.97998047, + "router_z_loss_mlp": 0.18981934, + "step": 4579, + "time_per_iteration": 2.5370469093322754 + }, + { + "auxiliary_loss_clip": 0.0115794, + "auxiliary_loss_mlp": 0.01047665, + "balance_loss_clip": 1.0627811, + "balance_loss_mlp": 1.02915192, + "epoch": 0.27536449721929956, + "flos": 24206772867840.0, + "grad_norm": 1.9365713287055686, + "language_loss": 0.7991901, + "learning_rate": 3.4010026956526137e-06, + "loss": 0.82124615, + "num_input_tokens_seen": 98981845, + "router_z_loss_clip": 0.95068359, + "router_z_loss_mlp": 0.18505859, + "step": 4580, + "time_per_iteration": 2.5018177032470703 + }, + { + "auxiliary_loss_clip": 0.01159808, + "auxiliary_loss_mlp": 0.01044399, + "balance_loss_clip": 1.06712592, + "balance_loss_mlp": 1.02626777, + "epoch": 0.27542462047196753, + "flos": 19537272720000.0, + "grad_norm": 2.717430168030054, + "language_loss": 0.67476404, + "learning_rate": 3.4007247293423137e-06, + "loss": 0.69680613, + "num_input_tokens_seen": 99001855, + "router_z_loss_clip": 0.92675781, + "router_z_loss_mlp": 0.18127441, + "step": 4581, + "time_per_iteration": 2.422701120376587 + }, + { + "auxiliary_loss_clip": 0.01165242, + "auxiliary_loss_mlp": 0.01041385, + "balance_loss_clip": 1.06725526, + "balance_loss_mlp": 1.02533901, + "epoch": 0.2754847437246355, + "flos": 14319165173760.0, + "grad_norm": 1.825639167605315, + "language_loss": 0.78239542, + "learning_rate": 3.400446709916392e-06, + "loss": 0.80446172, + "num_input_tokens_seen": 99019880, + "router_z_loss_clip": 0.98046875, + "router_z_loss_mlp": 0.16052246, + "step": 4582, + "time_per_iteration": 2.574432611465454 + }, + { + "auxiliary_loss_clip": 0.01164135, + "auxiliary_loss_mlp": 0.01036538, + "balance_loss_clip": 1.07109773, + "balance_loss_mlp": 1.02176762, + "epoch": 0.27554486697730346, + "flos": 18838773866880.0, + "grad_norm": 1.5894363088178582, + "language_loss": 0.84415084, + "learning_rate": 3.4001686373853895e-06, + "loss": 0.86615759, + "num_input_tokens_seen": 99037570, + "router_z_loss_clip": 0.93066406, + "router_z_loss_mlp": 0.14770508, + "step": 4583, + "time_per_iteration": 2.5594046115875244 + }, + { + "auxiliary_loss_clip": 0.01157783, + "auxiliary_loss_mlp": 0.01039373, + "balance_loss_clip": 1.06372499, + "balance_loss_mlp": 1.02361345, + "epoch": 0.2756049902299714, + "flos": 22382295402240.0, + "grad_norm": 1.6872889782162526, + "language_loss": 0.67324942, + "learning_rate": 3.3998905117598528e-06, + "loss": 0.69522101, + "num_input_tokens_seen": 99056875, + "router_z_loss_clip": 0.94042969, + "router_z_loss_mlp": 0.15771484, + "step": 4584, + "time_per_iteration": 3.956195831298828 + }, + { + "auxiliary_loss_clip": 0.01153569, + "auxiliary_loss_mlp": 0.0104001, + "balance_loss_clip": 1.06092238, + "balance_loss_mlp": 1.02419078, + "epoch": 0.2756651134826394, + "flos": 19573901614080.0, + "grad_norm": 1.68472566605311, + "language_loss": 0.77239573, + "learning_rate": 3.399612333050327e-06, + "loss": 0.79433155, + "num_input_tokens_seen": 99074685, + "router_z_loss_clip": 0.92626953, + "router_z_loss_mlp": 0.15808105, + "step": 4585, + "time_per_iteration": 2.4821531772613525 + }, + { + "auxiliary_loss_clip": 0.01163265, + "auxiliary_loss_mlp": 0.01039343, + "balance_loss_clip": 1.06844258, + "balance_loss_mlp": 1.02212954, + "epoch": 0.27572523673530736, + "flos": 23586559706880.0, + "grad_norm": 1.7325876287471953, + "language_loss": 0.71790707, + "learning_rate": 3.399334101267362e-06, + "loss": 0.73993313, + "num_input_tokens_seen": 99095300, + "router_z_loss_clip": 0.94873047, + "router_z_loss_mlp": 0.17224121, + "step": 4586, + "time_per_iteration": 2.469520330429077 + }, + { + "auxiliary_loss_clip": 0.01164153, + "auxiliary_loss_mlp": 0.01035243, + "balance_loss_clip": 1.07018614, + "balance_loss_mlp": 1.01898313, + "epoch": 0.2757853599879754, + "flos": 22820118278400.0, + "grad_norm": 1.51390598516971, + "language_loss": 0.80448043, + "learning_rate": 3.3990558164215073e-06, + "loss": 0.82647443, + "num_input_tokens_seen": 99115965, + "router_z_loss_clip": 0.93945312, + "router_z_loss_mlp": 0.16259766, + "step": 4587, + "time_per_iteration": 2.526575803756714 + }, + { + "auxiliary_loss_clip": 0.01157793, + "auxiliary_loss_mlp": 0.01041857, + "balance_loss_clip": 1.06667089, + "balance_loss_mlp": 1.02550173, + "epoch": 0.27584548324064334, + "flos": 18551704371840.0, + "grad_norm": 1.8416137512183701, + "language_loss": 0.82765114, + "learning_rate": 3.398777478523316e-06, + "loss": 0.84964764, + "num_input_tokens_seen": 99134265, + "router_z_loss_clip": 0.91162109, + "router_z_loss_mlp": 0.16345215, + "step": 4588, + "time_per_iteration": 2.450838804244995 + }, + { + "auxiliary_loss_clip": 0.01150053, + "auxiliary_loss_mlp": 0.01037699, + "balance_loss_clip": 1.05952859, + "balance_loss_mlp": 1.02177286, + "epoch": 0.2759056064933113, + "flos": 23769883745280.0, + "grad_norm": 1.3732478746108672, + "language_loss": 0.75472021, + "learning_rate": 3.398499087583342e-06, + "loss": 0.77659774, + "num_input_tokens_seen": 99156185, + "router_z_loss_clip": 0.90380859, + "router_z_loss_mlp": 0.15942383, + "step": 4589, + "time_per_iteration": 2.5189836025238037 + }, + { + "auxiliary_loss_clip": 0.01155543, + "auxiliary_loss_mlp": 0.01040219, + "balance_loss_clip": 1.06483436, + "balance_loss_mlp": 1.02339876, + "epoch": 0.27596572974597927, + "flos": 24281898163200.0, + "grad_norm": 4.0451699484030055, + "language_loss": 0.88793766, + "learning_rate": 3.398220643612143e-06, + "loss": 0.90989524, + "num_input_tokens_seen": 99176735, + "router_z_loss_clip": 0.90771484, + "router_z_loss_mlp": 0.16821289, + "step": 4590, + "time_per_iteration": 2.498063087463379 + }, + { + "auxiliary_loss_clip": 0.01158892, + "auxiliary_loss_mlp": 0.01045719, + "balance_loss_clip": 1.06477082, + "balance_loss_mlp": 1.02871966, + "epoch": 0.27602585299864724, + "flos": 35040985632000.0, + "grad_norm": 1.503250440619027, + "language_loss": 0.71258843, + "learning_rate": 3.397942146620277e-06, + "loss": 0.73463458, + "num_input_tokens_seen": 99199765, + "router_z_loss_clip": 0.94140625, + "router_z_loss_mlp": 0.17016602, + "step": 4591, + "time_per_iteration": 2.604139566421509 + }, + { + "auxiliary_loss_clip": 0.01156674, + "auxiliary_loss_mlp": 0.01045093, + "balance_loss_clip": 1.06381094, + "balance_loss_mlp": 1.02844, + "epoch": 0.2760859762513152, + "flos": 24309405002880.0, + "grad_norm": 1.8685535769156394, + "language_loss": 0.80177438, + "learning_rate": 3.3976635966183046e-06, + "loss": 0.82379204, + "num_input_tokens_seen": 99218435, + "router_z_loss_clip": 0.92871094, + "router_z_loss_mlp": 0.16638184, + "step": 4592, + "time_per_iteration": 2.4735329151153564 + }, + { + "auxiliary_loss_clip": 0.01104997, + "auxiliary_loss_mlp": 0.01011718, + "balance_loss_clip": 1.07011461, + "balance_loss_mlp": 1.00966454, + "epoch": 0.27614609950398317, + "flos": 71260739890560.0, + "grad_norm": 0.7082931308861735, + "language_loss": 0.61623502, + "learning_rate": 3.3973849936167886e-06, + "loss": 0.63740218, + "num_input_tokens_seen": 99276200, + "router_z_loss_clip": 0.34912109, + "router_z_loss_mlp": 0.02053833, + "step": 4593, + "time_per_iteration": 4.457723140716553 + }, + { + "auxiliary_loss_clip": 0.01154483, + "auxiliary_loss_mlp": 0.01040208, + "balance_loss_clip": 1.0615027, + "balance_loss_mlp": 1.02451408, + "epoch": 0.27620622275665113, + "flos": 29674854138240.0, + "grad_norm": 1.8534972950511603, + "language_loss": 0.77612686, + "learning_rate": 3.3971063376262937e-06, + "loss": 0.79807377, + "num_input_tokens_seen": 99297625, + "router_z_loss_clip": 0.93066406, + "router_z_loss_mlp": 0.15679932, + "step": 4594, + "time_per_iteration": 4.020566940307617 + }, + { + "auxiliary_loss_clip": 0.01155495, + "auxiliary_loss_mlp": 0.01045208, + "balance_loss_clip": 1.06258965, + "balance_loss_mlp": 1.02742231, + "epoch": 0.2762663460093191, + "flos": 15378063137280.0, + "grad_norm": 1.3953251443291066, + "language_loss": 0.91717798, + "learning_rate": 3.3968276286573866e-06, + "loss": 0.93918502, + "num_input_tokens_seen": 99315790, + "router_z_loss_clip": 0.92871094, + "router_z_loss_mlp": 0.17773438, + "step": 4595, + "time_per_iteration": 2.5595123767852783 + }, + { + "auxiliary_loss_clip": 0.01157588, + "auxiliary_loss_mlp": 0.01054197, + "balance_loss_clip": 1.06038082, + "balance_loss_mlp": 1.03633952, + "epoch": 0.27632646926198706, + "flos": 20704082117760.0, + "grad_norm": 2.287187928568091, + "language_loss": 0.69794977, + "learning_rate": 3.3965488667206353e-06, + "loss": 0.72006762, + "num_input_tokens_seen": 99334615, + "router_z_loss_clip": 0.97119141, + "router_z_loss_mlp": 0.17858887, + "step": 4596, + "time_per_iteration": 2.529863119125366 + }, + { + "auxiliary_loss_clip": 0.01159603, + "auxiliary_loss_mlp": 0.01048394, + "balance_loss_clip": 1.05865932, + "balance_loss_mlp": 1.03082228, + "epoch": 0.276386592514655, + "flos": 32813374849920.0, + "grad_norm": 2.2662372025879303, + "language_loss": 0.63544846, + "learning_rate": 3.3962700518266113e-06, + "loss": 0.65752846, + "num_input_tokens_seen": 99356685, + "router_z_loss_clip": 1.00878906, + "router_z_loss_mlp": 0.17565918, + "step": 4597, + "time_per_iteration": 2.5468358993530273 + }, + { + "auxiliary_loss_clip": 0.01162933, + "auxiliary_loss_mlp": 0.01044953, + "balance_loss_clip": 1.06939089, + "balance_loss_mlp": 1.02871644, + "epoch": 0.276446715767323, + "flos": 18551704371840.0, + "grad_norm": 1.8789238161728512, + "language_loss": 0.8623448, + "learning_rate": 3.395991183985887e-06, + "loss": 0.88442361, + "num_input_tokens_seen": 99374810, + "router_z_loss_clip": 0.93701172, + "router_z_loss_mlp": 0.16235352, + "step": 4598, + "time_per_iteration": 3.780994176864624 + }, + { + "auxiliary_loss_clip": 0.01155996, + "auxiliary_loss_mlp": 0.01047141, + "balance_loss_clip": 1.06157231, + "balance_loss_mlp": 1.02961731, + "epoch": 0.27650683901999096, + "flos": 22819615488000.0, + "grad_norm": 3.1258699002586128, + "language_loss": 0.80420142, + "learning_rate": 3.395712263209037e-06, + "loss": 0.82623279, + "num_input_tokens_seen": 99391290, + "router_z_loss_clip": 0.94384766, + "router_z_loss_mlp": 0.1751709, + "step": 4599, + "time_per_iteration": 2.44681715965271 + }, + { + "auxiliary_loss_clip": 0.01160589, + "auxiliary_loss_mlp": 0.01051373, + "balance_loss_clip": 1.06300974, + "balance_loss_mlp": 1.03454041, + "epoch": 0.276566962272659, + "flos": 21361534704000.0, + "grad_norm": 1.6279807842541765, + "language_loss": 0.78892261, + "learning_rate": 3.395433289506639e-06, + "loss": 0.81104219, + "num_input_tokens_seen": 99409120, + "router_z_loss_clip": 0.97705078, + "router_z_loss_mlp": 0.16827393, + "step": 4600, + "time_per_iteration": 2.5814807415008545 + }, + { + "auxiliary_loss_clip": 0.0115424, + "auxiliary_loss_mlp": 0.01046751, + "balance_loss_clip": 1.05856884, + "balance_loss_mlp": 1.02984762, + "epoch": 0.27662708552532694, + "flos": 17710604524800.0, + "grad_norm": 2.0260123044943628, + "language_loss": 0.7366057, + "learning_rate": 3.3951542628892694e-06, + "loss": 0.75861561, + "num_input_tokens_seen": 99426180, + "router_z_loss_clip": 0.95605469, + "router_z_loss_mlp": 0.16906738, + "step": 4601, + "time_per_iteration": 2.5087034702301025 + }, + { + "auxiliary_loss_clip": 0.01150806, + "auxiliary_loss_mlp": 0.01052761, + "balance_loss_clip": 1.05843186, + "balance_loss_mlp": 1.03361619, + "epoch": 0.2766872087779949, + "flos": 21252725429760.0, + "grad_norm": 1.801764733180055, + "language_loss": 0.80069917, + "learning_rate": 3.3948751833675113e-06, + "loss": 0.82273483, + "num_input_tokens_seen": 99447720, + "router_z_loss_clip": 0.92333984, + "router_z_loss_mlp": 0.19140625, + "step": 4602, + "time_per_iteration": 2.486783742904663 + }, + { + "auxiliary_loss_clip": 0.01165376, + "auxiliary_loss_mlp": 0.01054381, + "balance_loss_clip": 1.06624532, + "balance_loss_mlp": 1.03587937, + "epoch": 0.2767473320306629, + "flos": 12931900053120.0, + "grad_norm": 2.271248374176035, + "language_loss": 0.77350676, + "learning_rate": 3.3945960509519455e-06, + "loss": 0.79570425, + "num_input_tokens_seen": 99464720, + "router_z_loss_clip": 0.99169922, + "router_z_loss_mlp": 0.18493652, + "step": 4603, + "time_per_iteration": 2.4439544677734375 + }, + { + "auxiliary_loss_clip": 0.01152547, + "auxiliary_loss_mlp": 0.01043591, + "balance_loss_clip": 1.06315255, + "balance_loss_mlp": 1.02812386, + "epoch": 0.27680745528333084, + "flos": 15012851604480.0, + "grad_norm": 1.5949387413156213, + "language_loss": 0.81481701, + "learning_rate": 3.3943168656531585e-06, + "loss": 0.8367784, + "num_input_tokens_seen": 99482310, + "router_z_loss_clip": 0.89404297, + "router_z_loss_mlp": 0.15454102, + "step": 4604, + "time_per_iteration": 2.3765368461608887 + }, + { + "auxiliary_loss_clip": 0.01157356, + "auxiliary_loss_mlp": 0.01040237, + "balance_loss_clip": 1.06115472, + "balance_loss_mlp": 1.02345288, + "epoch": 0.2768675785359988, + "flos": 22637835734400.0, + "grad_norm": 1.8143772227393986, + "language_loss": 0.70094585, + "learning_rate": 3.3940376274817363e-06, + "loss": 0.72292179, + "num_input_tokens_seen": 99501255, + "router_z_loss_clip": 0.96142578, + "router_z_loss_mlp": 0.16784668, + "step": 4605, + "time_per_iteration": 2.5485141277313232 + }, + { + "auxiliary_loss_clip": 0.01096408, + "auxiliary_loss_mlp": 0.01005251, + "balance_loss_clip": 1.06325483, + "balance_loss_mlp": 1.00343895, + "epoch": 0.27692770178866677, + "flos": 66130542881280.0, + "grad_norm": 0.7047440599720982, + "language_loss": 0.5714246, + "learning_rate": 3.3937583364482673e-06, + "loss": 0.5924412, + "num_input_tokens_seen": 99568925, + "router_z_loss_clip": 0.33154297, + "router_z_loss_mlp": 0.01812744, + "step": 4606, + "time_per_iteration": 3.219698190689087 + }, + { + "auxiliary_loss_clip": 0.01152735, + "auxiliary_loss_mlp": 0.01046941, + "balance_loss_clip": 1.05782664, + "balance_loss_mlp": 1.02929807, + "epoch": 0.27698782504133473, + "flos": 26464979059200.0, + "grad_norm": 1.9421853055882587, + "language_loss": 0.70001197, + "learning_rate": 3.3934789925633424e-06, + "loss": 0.72200871, + "num_input_tokens_seen": 99588455, + "router_z_loss_clip": 0.94824219, + "router_z_loss_mlp": 0.1763916, + "step": 4607, + "time_per_iteration": 2.8288259506225586 + }, + { + "auxiliary_loss_clip": 0.01145074, + "auxiliary_loss_mlp": 0.01038733, + "balance_loss_clip": 1.05440784, + "balance_loss_mlp": 1.02267551, + "epoch": 0.2770479482940027, + "flos": 25884806584320.0, + "grad_norm": 1.5694387058470356, + "language_loss": 0.70116687, + "learning_rate": 3.393199595837555e-06, + "loss": 0.72300494, + "num_input_tokens_seen": 99609355, + "router_z_loss_clip": 0.90722656, + "router_z_loss_mlp": 0.16040039, + "step": 4608, + "time_per_iteration": 2.7720956802368164 + }, + { + "auxiliary_loss_clip": 0.01154045, + "auxiliary_loss_mlp": 0.01039276, + "balance_loss_clip": 1.0585562, + "balance_loss_mlp": 1.02311087, + "epoch": 0.27710807154667066, + "flos": 22857249962880.0, + "grad_norm": 2.6884466158080906, + "language_loss": 0.72870743, + "learning_rate": 3.392920146281499e-06, + "loss": 0.75064063, + "num_input_tokens_seen": 99628780, + "router_z_loss_clip": 0.95507812, + "router_z_loss_mlp": 0.16162109, + "step": 4609, + "time_per_iteration": 2.615469217300415 + }, + { + "auxiliary_loss_clip": 0.01166512, + "auxiliary_loss_mlp": 0.01048623, + "balance_loss_clip": 1.06775391, + "balance_loss_mlp": 1.03131354, + "epoch": 0.27716819479933863, + "flos": 17711071401600.0, + "grad_norm": 2.4038087656639893, + "language_loss": 0.84271151, + "learning_rate": 3.3926406439057714e-06, + "loss": 0.86486292, + "num_input_tokens_seen": 99644545, + "router_z_loss_clip": 0.98779297, + "router_z_loss_mlp": 0.17297363, + "step": 4610, + "time_per_iteration": 2.557807445526123 + }, + { + "auxiliary_loss_clip": 0.01159004, + "auxiliary_loss_mlp": 0.01046627, + "balance_loss_clip": 1.0596211, + "balance_loss_mlp": 1.02909112, + "epoch": 0.2772283180520066, + "flos": 19646046080640.0, + "grad_norm": 2.3193194431235793, + "language_loss": 0.69252568, + "learning_rate": 3.3923610887209705e-06, + "loss": 0.71458197, + "num_input_tokens_seen": 99663125, + "router_z_loss_clip": 0.99414062, + "router_z_loss_mlp": 0.17541504, + "step": 4611, + "time_per_iteration": 2.9575142860412598 + }, + { + "auxiliary_loss_clip": 0.01153096, + "auxiliary_loss_mlp": 0.01037532, + "balance_loss_clip": 1.06395435, + "balance_loss_mlp": 1.02138472, + "epoch": 0.27728844130467456, + "flos": 21032628842880.0, + "grad_norm": 2.3479213441210267, + "language_loss": 0.73298603, + "learning_rate": 3.392081480737698e-06, + "loss": 0.75489235, + "num_input_tokens_seen": 99682645, + "router_z_loss_clip": 0.89160156, + "router_z_loss_mlp": 0.16143799, + "step": 4612, + "time_per_iteration": 2.441990852355957 + }, + { + "auxiliary_loss_clip": 0.01151561, + "auxiliary_loss_mlp": 0.01048698, + "balance_loss_clip": 1.05541432, + "balance_loss_mlp": 1.03150821, + "epoch": 0.2773485645573425, + "flos": 18989204025600.0, + "grad_norm": 2.0267721695739476, + "language_loss": 0.66758502, + "learning_rate": 3.3918018199665563e-06, + "loss": 0.68958759, + "num_input_tokens_seen": 99700520, + "router_z_loss_clip": 0.96191406, + "router_z_loss_mlp": 0.17211914, + "step": 4613, + "time_per_iteration": 2.4875078201293945 + }, + { + "auxiliary_loss_clip": 0.01145821, + "auxiliary_loss_mlp": 0.01047326, + "balance_loss_clip": 1.05531049, + "balance_loss_mlp": 1.02998114, + "epoch": 0.27740868781001055, + "flos": 21468440557440.0, + "grad_norm": 1.7724254254356866, + "language_loss": 0.79502094, + "learning_rate": 3.39152210641815e-06, + "loss": 0.81695235, + "num_input_tokens_seen": 99720355, + "router_z_loss_clip": 0.90478516, + "router_z_loss_mlp": 0.17346191, + "step": 4614, + "time_per_iteration": 2.473731517791748 + }, + { + "auxiliary_loss_clip": 0.01157206, + "auxiliary_loss_mlp": 0.01046613, + "balance_loss_clip": 1.06138706, + "balance_loss_mlp": 1.02980494, + "epoch": 0.2774688110626785, + "flos": 19827825834240.0, + "grad_norm": 2.958347629888012, + "language_loss": 0.79729009, + "learning_rate": 3.3912423401030865e-06, + "loss": 0.81932819, + "num_input_tokens_seen": 99736090, + "router_z_loss_clip": 0.95751953, + "router_z_loss_mlp": 0.16821289, + "step": 4615, + "time_per_iteration": 2.4314932823181152 + }, + { + "auxiliary_loss_clip": 0.01157767, + "auxiliary_loss_mlp": 0.01049266, + "balance_loss_clip": 1.05841231, + "balance_loss_mlp": 1.03209984, + "epoch": 0.2775289343153465, + "flos": 18216226321920.0, + "grad_norm": 2.729072180357466, + "language_loss": 0.63627243, + "learning_rate": 3.3909625210319735e-06, + "loss": 0.65834272, + "num_input_tokens_seen": 99751805, + "router_z_loss_clip": 0.99267578, + "router_z_loss_mlp": 0.17150879, + "step": 4616, + "time_per_iteration": 2.3770792484283447 + }, + { + "auxiliary_loss_clip": 0.01146099, + "auxiliary_loss_mlp": 0.01044015, + "balance_loss_clip": 1.05360281, + "balance_loss_mlp": 1.0277195, + "epoch": 0.27758905756801444, + "flos": 16472476673280.0, + "grad_norm": 1.9072758203231925, + "language_loss": 0.82399064, + "learning_rate": 3.3906826492154226e-06, + "loss": 0.84589183, + "num_input_tokens_seen": 99770610, + "router_z_loss_clip": 0.92578125, + "router_z_loss_mlp": 0.16308594, + "step": 4617, + "time_per_iteration": 2.5146121978759766 + }, + { + "auxiliary_loss_clip": 0.01156192, + "auxiliary_loss_mlp": 0.01051331, + "balance_loss_clip": 1.05948043, + "balance_loss_mlp": 1.03492832, + "epoch": 0.2776491808206824, + "flos": 18728240739840.0, + "grad_norm": 4.628416187177325, + "language_loss": 0.77700168, + "learning_rate": 3.3904027246640458e-06, + "loss": 0.79907691, + "num_input_tokens_seen": 99787305, + "router_z_loss_clip": 0.96679688, + "router_z_loss_mlp": 0.16394043, + "step": 4618, + "time_per_iteration": 2.3967552185058594 + }, + { + "auxiliary_loss_clip": 0.01158573, + "auxiliary_loss_mlp": 0.01037764, + "balance_loss_clip": 1.06424189, + "balance_loss_mlp": 1.02278495, + "epoch": 0.27770930407335037, + "flos": 28038189911040.0, + "grad_norm": 2.0633751259621183, + "language_loss": 0.84931725, + "learning_rate": 3.390122747388459e-06, + "loss": 0.87128061, + "num_input_tokens_seen": 99808940, + "router_z_loss_clip": 0.94482422, + "router_z_loss_mlp": 0.14990234, + "step": 4619, + "time_per_iteration": 2.507927656173706 + }, + { + "auxiliary_loss_clip": 0.01156267, + "auxiliary_loss_mlp": 0.01044883, + "balance_loss_clip": 1.06360948, + "balance_loss_mlp": 1.02964783, + "epoch": 0.27776942732601834, + "flos": 23549823072000.0, + "grad_norm": 1.4180854780862597, + "language_loss": 0.76979017, + "learning_rate": 3.3898427173992778e-06, + "loss": 0.79180169, + "num_input_tokens_seen": 99829575, + "router_z_loss_clip": 0.92626953, + "router_z_loss_mlp": 0.15222168, + "step": 4620, + "time_per_iteration": 2.4629316329956055 + }, + { + "auxiliary_loss_clip": 0.01153387, + "auxiliary_loss_mlp": 0.01044764, + "balance_loss_clip": 1.06033766, + "balance_loss_mlp": 1.02774096, + "epoch": 0.2778295505786863, + "flos": 23908713811200.0, + "grad_norm": 2.042294189791394, + "language_loss": 0.78553754, + "learning_rate": 3.389562634707122e-06, + "loss": 0.80751896, + "num_input_tokens_seen": 99847575, + "router_z_loss_clip": 0.92919922, + "router_z_loss_mlp": 0.17004395, + "step": 4621, + "time_per_iteration": 2.525141954421997 + }, + { + "auxiliary_loss_clip": 0.01152438, + "auxiliary_loss_mlp": 0.0104769, + "balance_loss_clip": 1.05907404, + "balance_loss_mlp": 1.03047609, + "epoch": 0.27788967383135427, + "flos": 25554571920000.0, + "grad_norm": 2.027937565786898, + "language_loss": 0.88019776, + "learning_rate": 3.389282499322611e-06, + "loss": 0.90219903, + "num_input_tokens_seen": 99864995, + "router_z_loss_clip": 0.93359375, + "router_z_loss_mlp": 0.17224121, + "step": 4622, + "time_per_iteration": 2.4443671703338623 + }, + { + "auxiliary_loss_clip": 0.01146752, + "auxiliary_loss_mlp": 0.01048258, + "balance_loss_clip": 1.05407786, + "balance_loss_mlp": 1.03124714, + "epoch": 0.27794979708402223, + "flos": 16252631481600.0, + "grad_norm": 2.025770098331241, + "language_loss": 0.81441796, + "learning_rate": 3.389002311256369e-06, + "loss": 0.83636808, + "num_input_tokens_seen": 99881540, + "router_z_loss_clip": 0.92724609, + "router_z_loss_mlp": 0.17016602, + "step": 4623, + "time_per_iteration": 2.541577100753784 + }, + { + "auxiliary_loss_clip": 0.01158098, + "auxiliary_loss_mlp": 0.01042534, + "balance_loss_clip": 1.06381631, + "balance_loss_mlp": 1.02610743, + "epoch": 0.2780099203366902, + "flos": 20667632791680.0, + "grad_norm": 2.1947115163740407, + "language_loss": 0.81559134, + "learning_rate": 3.3887220705190204e-06, + "loss": 0.83759773, + "num_input_tokens_seen": 99899595, + "router_z_loss_clip": 0.94189453, + "router_z_loss_mlp": 0.16430664, + "step": 4624, + "time_per_iteration": 2.498076915740967 + }, + { + "auxiliary_loss_clip": 0.01152266, + "auxiliary_loss_mlp": 0.01055381, + "balance_loss_clip": 1.06053984, + "balance_loss_mlp": 1.03660524, + "epoch": 0.27807004358935816, + "flos": 17739583822080.0, + "grad_norm": 2.102379268377637, + "language_loss": 0.76626867, + "learning_rate": 3.388441777121191e-06, + "loss": 0.78834522, + "num_input_tokens_seen": 99913020, + "router_z_loss_clip": 0.91699219, + "router_z_loss_mlp": 0.18786621, + "step": 4625, + "time_per_iteration": 3.0678939819335938 + }, + { + "auxiliary_loss_clip": 0.01148806, + "auxiliary_loss_mlp": 0.01042026, + "balance_loss_clip": 1.05595767, + "balance_loss_mlp": 1.02459812, + "epoch": 0.2781301668420261, + "flos": 16727119165440.0, + "grad_norm": 3.6004556817586524, + "language_loss": 0.70063347, + "learning_rate": 3.388161431073511e-06, + "loss": 0.72254181, + "num_input_tokens_seen": 99931405, + "router_z_loss_clip": 0.93017578, + "router_z_loss_mlp": 0.17431641, + "step": 4626, + "time_per_iteration": 2.5708649158477783 + }, + { + "auxiliary_loss_clip": 0.01163552, + "auxiliary_loss_mlp": 0.01043788, + "balance_loss_clip": 1.06439841, + "balance_loss_mlp": 1.02593005, + "epoch": 0.27819029009469415, + "flos": 13844749317120.0, + "grad_norm": 2.5577067181134137, + "language_loss": 0.92068523, + "learning_rate": 3.38788103238661e-06, + "loss": 0.94275862, + "num_input_tokens_seen": 99948100, + "router_z_loss_clip": 0.99121094, + "router_z_loss_mlp": 0.17858887, + "step": 4627, + "time_per_iteration": 3.923457622528076 + }, + { + "auxiliary_loss_clip": 0.01159622, + "auxiliary_loss_mlp": 0.01041394, + "balance_loss_clip": 1.0633471, + "balance_loss_mlp": 1.02503884, + "epoch": 0.2782504133473621, + "flos": 27089286370560.0, + "grad_norm": 1.9670859543273869, + "language_loss": 0.8580718, + "learning_rate": 3.387600581071121e-06, + "loss": 0.88008201, + "num_input_tokens_seen": 99966470, + "router_z_loss_clip": 0.96386719, + "router_z_loss_mlp": 0.16369629, + "step": 4628, + "time_per_iteration": 2.563157796859741 + }, + { + "auxiliary_loss_clip": 0.0115087, + "auxiliary_loss_mlp": 0.01039709, + "balance_loss_clip": 1.05943227, + "balance_loss_mlp": 1.02417636, + "epoch": 0.2783105366000301, + "flos": 21068826773760.0, + "grad_norm": 1.470618794955019, + "language_loss": 0.79282975, + "learning_rate": 3.387320077137679e-06, + "loss": 0.81473553, + "num_input_tokens_seen": 99985930, + "router_z_loss_clip": 0.91601562, + "router_z_loss_mlp": 0.15515137, + "step": 4629, + "time_per_iteration": 2.5128822326660156 + }, + { + "auxiliary_loss_clip": 0.01151991, + "auxiliary_loss_mlp": 0.01037363, + "balance_loss_clip": 1.06277776, + "balance_loss_mlp": 1.02186584, + "epoch": 0.27837065985269804, + "flos": 26501823434880.0, + "grad_norm": 1.5765387644108695, + "language_loss": 0.84464228, + "learning_rate": 3.3870395205969208e-06, + "loss": 0.86653578, + "num_input_tokens_seen": 100006235, + "router_z_loss_clip": 0.89208984, + "router_z_loss_mlp": 0.15490723, + "step": 4630, + "time_per_iteration": 2.5767459869384766 + }, + { + "auxiliary_loss_clip": 0.01156221, + "auxiliary_loss_mlp": 0.01039805, + "balance_loss_clip": 1.06174684, + "balance_loss_mlp": 1.02198339, + "epoch": 0.278430783105366, + "flos": 20223201813120.0, + "grad_norm": 2.068882865094573, + "language_loss": 0.81755936, + "learning_rate": 3.386758911459485e-06, + "loss": 0.83951962, + "num_input_tokens_seen": 100023655, + "router_z_loss_clip": 0.94482422, + "router_z_loss_mlp": 0.17834473, + "step": 4631, + "time_per_iteration": 2.496702194213867 + }, + { + "auxiliary_loss_clip": 0.01169076, + "auxiliary_loss_mlp": 0.01048115, + "balance_loss_clip": 1.07131159, + "balance_loss_mlp": 1.03226066, + "epoch": 0.278490906358034, + "flos": 25592888753280.0, + "grad_norm": 1.7402702260648255, + "language_loss": 0.71096158, + "learning_rate": 3.3864782497360126e-06, + "loss": 0.73313349, + "num_input_tokens_seen": 100043280, + "router_z_loss_clip": 0.97802734, + "router_z_loss_mlp": 0.15856934, + "step": 4632, + "time_per_iteration": 2.532935380935669 + }, + { + "auxiliary_loss_clip": 0.01153666, + "auxiliary_loss_mlp": 0.01044827, + "balance_loss_clip": 1.0627234, + "balance_loss_mlp": 1.02890682, + "epoch": 0.27855102961070194, + "flos": 16171544528640.0, + "grad_norm": 1.8360012260859626, + "language_loss": 0.82551646, + "learning_rate": 3.386197535437145e-06, + "loss": 0.8475014, + "num_input_tokens_seen": 100057690, + "router_z_loss_clip": 0.90966797, + "router_z_loss_mlp": 0.15924072, + "step": 4633, + "time_per_iteration": 2.4683725833892822 + }, + { + "auxiliary_loss_clip": 0.0116383, + "auxiliary_loss_mlp": 0.010412, + "balance_loss_clip": 1.06861317, + "balance_loss_mlp": 1.02278185, + "epoch": 0.2786111528633699, + "flos": 22927598749440.0, + "grad_norm": 1.768051268951884, + "language_loss": 0.87638855, + "learning_rate": 3.385916768573529e-06, + "loss": 0.89843881, + "num_input_tokens_seen": 100075875, + "router_z_loss_clip": 0.95214844, + "router_z_loss_mlp": 0.1842041, + "step": 4634, + "time_per_iteration": 2.5662450790405273 + }, + { + "auxiliary_loss_clip": 0.01160065, + "auxiliary_loss_mlp": 0.01044809, + "balance_loss_clip": 1.0660224, + "balance_loss_mlp": 1.02548468, + "epoch": 0.27867127611603787, + "flos": 23404205335680.0, + "grad_norm": 1.6889574857335452, + "language_loss": 0.77150869, + "learning_rate": 3.38563594915581e-06, + "loss": 0.79355741, + "num_input_tokens_seen": 100092930, + "router_z_loss_clip": 0.93994141, + "router_z_loss_mlp": 0.19311523, + "step": 4635, + "time_per_iteration": 2.52392578125 + }, + { + "auxiliary_loss_clip": 0.01154201, + "auxiliary_loss_mlp": 0.01042162, + "balance_loss_clip": 1.05910707, + "balance_loss_mlp": 1.02500761, + "epoch": 0.27873139936870583, + "flos": 19829010983040.0, + "grad_norm": 1.5476184912491637, + "language_loss": 0.6552763, + "learning_rate": 3.385355077194637e-06, + "loss": 0.67723989, + "num_input_tokens_seen": 100110790, + "router_z_loss_clip": 0.95019531, + "router_z_loss_mlp": 0.17150879, + "step": 4636, + "time_per_iteration": 2.5278491973876953 + }, + { + "auxiliary_loss_clip": 0.0115399, + "auxiliary_loss_mlp": 0.01042173, + "balance_loss_clip": 1.05999959, + "balance_loss_mlp": 1.02467346, + "epoch": 0.2787915226213738, + "flos": 17707659609600.0, + "grad_norm": 2.5117198686568405, + "language_loss": 0.83610409, + "learning_rate": 3.3850741527006604e-06, + "loss": 0.85806572, + "num_input_tokens_seen": 100126970, + "router_z_loss_clip": 0.94042969, + "router_z_loss_mlp": 0.17504883, + "step": 4637, + "time_per_iteration": 5.818854570388794 + }, + { + "auxiliary_loss_clip": 0.01153549, + "auxiliary_loss_mlp": 0.01048104, + "balance_loss_clip": 1.0620085, + "balance_loss_mlp": 1.03203475, + "epoch": 0.27885164587404176, + "flos": 22090557139200.0, + "grad_norm": 1.6895031593926428, + "language_loss": 0.76022255, + "learning_rate": 3.384793175684533e-06, + "loss": 0.78223908, + "num_input_tokens_seen": 100146720, + "router_z_loss_clip": 0.91503906, + "router_z_loss_mlp": 0.16064453, + "step": 4638, + "time_per_iteration": 2.609042167663574 + }, + { + "auxiliary_loss_clip": 0.01159578, + "auxiliary_loss_mlp": 0.0104918, + "balance_loss_clip": 1.0641377, + "balance_loss_mlp": 1.03240728, + "epoch": 0.27891176912670973, + "flos": 19207684500480.0, + "grad_norm": 1.959056559717524, + "language_loss": 0.71472967, + "learning_rate": 3.38451214615691e-06, + "loss": 0.73681724, + "num_input_tokens_seen": 100165920, + "router_z_loss_clip": 0.95507812, + "router_z_loss_mlp": 0.16784668, + "step": 4639, + "time_per_iteration": 2.6005117893218994 + }, + { + "auxiliary_loss_clip": 0.01151178, + "auxiliary_loss_mlp": 0.01041666, + "balance_loss_clip": 1.05883038, + "balance_loss_mlp": 1.02413058, + "epoch": 0.27897189237937775, + "flos": 27600007898880.0, + "grad_norm": 2.0190995295645453, + "language_loss": 0.65976834, + "learning_rate": 3.384231064128447e-06, + "loss": 0.68169677, + "num_input_tokens_seen": 100185525, + "router_z_loss_clip": 0.92236328, + "router_z_loss_mlp": 0.17553711, + "step": 4640, + "time_per_iteration": 2.683730125427246 + }, + { + "auxiliary_loss_clip": 0.01157532, + "auxiliary_loss_mlp": 0.01039296, + "balance_loss_clip": 1.06446147, + "balance_loss_mlp": 1.02317929, + "epoch": 0.2790320156320457, + "flos": 21178210665600.0, + "grad_norm": 3.7480506497034463, + "language_loss": 0.72163343, + "learning_rate": 3.383949929609804e-06, + "loss": 0.74360174, + "num_input_tokens_seen": 100204850, + "router_z_loss_clip": 0.93017578, + "router_z_loss_mlp": 0.16107178, + "step": 4641, + "time_per_iteration": 4.067570209503174 + }, + { + "auxiliary_loss_clip": 0.01172235, + "auxiliary_loss_mlp": 0.01039928, + "balance_loss_clip": 1.07333028, + "balance_loss_mlp": 1.02188015, + "epoch": 0.2790921388847137, + "flos": 22783920347520.0, + "grad_norm": 1.9604203324708611, + "language_loss": 0.75444388, + "learning_rate": 3.383668742611641e-06, + "loss": 0.77656555, + "num_input_tokens_seen": 100224520, + "router_z_loss_clip": 0.98876953, + "router_z_loss_mlp": 0.18054199, + "step": 4642, + "time_per_iteration": 3.134434700012207 + }, + { + "auxiliary_loss_clip": 0.01160594, + "auxiliary_loss_mlp": 0.01044094, + "balance_loss_clip": 1.06537938, + "balance_loss_mlp": 1.02629602, + "epoch": 0.27915226213738165, + "flos": 23400649889280.0, + "grad_norm": 1.860303209296636, + "language_loss": 0.85825527, + "learning_rate": 3.3833875031446205e-06, + "loss": 0.88030207, + "num_input_tokens_seen": 100243935, + "router_z_loss_clip": 0.95214844, + "router_z_loss_mlp": 0.17810059, + "step": 4643, + "time_per_iteration": 2.736388921737671 + }, + { + "auxiliary_loss_clip": 0.01162558, + "auxiliary_loss_mlp": 0.01046543, + "balance_loss_clip": 1.0697422, + "balance_loss_mlp": 1.02960372, + "epoch": 0.2792123853900496, + "flos": 22747794243840.0, + "grad_norm": 2.0192359550640213, + "language_loss": 0.83143765, + "learning_rate": 3.383106211219407e-06, + "loss": 0.85352868, + "num_input_tokens_seen": 100262290, + "router_z_loss_clip": 0.92919922, + "router_z_loss_mlp": 0.16931152, + "step": 4644, + "time_per_iteration": 2.7006330490112305 + }, + { + "auxiliary_loss_clip": 0.01156625, + "auxiliary_loss_mlp": 0.01039093, + "balance_loss_clip": 1.06179178, + "balance_loss_mlp": 1.02198684, + "epoch": 0.2792725086427176, + "flos": 15049372757760.0, + "grad_norm": 1.8962208852799107, + "language_loss": 0.79090315, + "learning_rate": 3.3828248668466673e-06, + "loss": 0.81286031, + "num_input_tokens_seen": 100280015, + "router_z_loss_clip": 0.94824219, + "router_z_loss_mlp": 0.17102051, + "step": 4645, + "time_per_iteration": 2.688990354537964 + }, + { + "auxiliary_loss_clip": 0.01125628, + "auxiliary_loss_mlp": 0.01010336, + "balance_loss_clip": 1.09016776, + "balance_loss_mlp": 1.00846791, + "epoch": 0.27933263189538554, + "flos": 62544861757440.0, + "grad_norm": 0.7793713631739245, + "language_loss": 0.6226595, + "learning_rate": 3.3825434700370705e-06, + "loss": 0.64401913, + "num_input_tokens_seen": 100338935, + "router_z_loss_clip": 0.35498047, + "router_z_loss_mlp": 0.01867676, + "step": 4646, + "time_per_iteration": 3.5282270908355713 + }, + { + "auxiliary_loss_clip": 0.0115479, + "auxiliary_loss_mlp": 0.01036999, + "balance_loss_clip": 1.06498146, + "balance_loss_mlp": 1.02189529, + "epoch": 0.2793927551480535, + "flos": 25118365155840.0, + "grad_norm": 1.9121172331540608, + "language_loss": 0.89363331, + "learning_rate": 3.3822620208012865e-06, + "loss": 0.91555125, + "num_input_tokens_seen": 100359905, + "router_z_loss_clip": 0.89941406, + "router_z_loss_mlp": 0.15100098, + "step": 4647, + "time_per_iteration": 2.798480272293091 + }, + { + "auxiliary_loss_clip": 0.0115245, + "auxiliary_loss_mlp": 0.01049282, + "balance_loss_clip": 1.05779159, + "balance_loss_mlp": 1.03094769, + "epoch": 0.27945287840072147, + "flos": 21324582587520.0, + "grad_norm": 1.6407555492805033, + "language_loss": 0.86854672, + "learning_rate": 3.381980519149988e-06, + "loss": 0.89056402, + "num_input_tokens_seen": 100376955, + "router_z_loss_clip": 0.9453125, + "router_z_loss_mlp": 0.18334961, + "step": 4648, + "time_per_iteration": 2.5645222663879395 + }, + { + "auxiliary_loss_clip": 0.01154192, + "auxiliary_loss_mlp": 0.01035426, + "balance_loss_clip": 1.06055093, + "balance_loss_mlp": 1.01946402, + "epoch": 0.27951300165338944, + "flos": 27450547407360.0, + "grad_norm": 4.131898144291193, + "language_loss": 0.7301721, + "learning_rate": 3.38169896509385e-06, + "loss": 0.75206828, + "num_input_tokens_seen": 100397545, + "router_z_loss_clip": 0.93701172, + "router_z_loss_mlp": 0.15942383, + "step": 4649, + "time_per_iteration": 2.578186511993408 + }, + { + "auxiliary_loss_clip": 0.01153756, + "auxiliary_loss_mlp": 0.01040308, + "balance_loss_clip": 1.06205082, + "balance_loss_mlp": 1.02233112, + "epoch": 0.2795731249060574, + "flos": 15159008044800.0, + "grad_norm": 2.219037180206272, + "language_loss": 0.80205601, + "learning_rate": 3.381417358643549e-06, + "loss": 0.82399666, + "num_input_tokens_seen": 100415080, + "router_z_loss_clip": 0.91601562, + "router_z_loss_mlp": 0.1796875, + "step": 4650, + "time_per_iteration": 2.458258867263794 + }, + { + "auxiliary_loss_clip": 0.01134013, + "auxiliary_loss_mlp": 0.01013777, + "balance_loss_clip": 1.09919167, + "balance_loss_mlp": 1.01140487, + "epoch": 0.27963324815872537, + "flos": 60120103178880.0, + "grad_norm": 0.8413681185392031, + "language_loss": 0.58851552, + "learning_rate": 3.3811356998097624e-06, + "loss": 0.60999334, + "num_input_tokens_seen": 100471105, + "router_z_loss_clip": 0.34814453, + "router_z_loss_mlp": 0.02371216, + "step": 4651, + "time_per_iteration": 3.1229641437530518 + }, + { + "auxiliary_loss_clip": 0.01159784, + "auxiliary_loss_mlp": 0.01044816, + "balance_loss_clip": 1.06395221, + "balance_loss_mlp": 1.02618337, + "epoch": 0.27969337141139333, + "flos": 21765960910080.0, + "grad_norm": 1.7343838570442909, + "language_loss": 0.74042463, + "learning_rate": 3.3808539886031726e-06, + "loss": 0.7624706, + "num_input_tokens_seen": 100492520, + "router_z_loss_clip": 0.95800781, + "router_z_loss_mlp": 0.1862793, + "step": 4652, + "time_per_iteration": 2.53515625 + }, + { + "auxiliary_loss_clip": 0.01160154, + "auxiliary_loss_mlp": 0.01048446, + "balance_loss_clip": 1.06391907, + "balance_loss_mlp": 1.03124404, + "epoch": 0.27975349466406135, + "flos": 39851398834560.0, + "grad_norm": 2.2496153457737242, + "language_loss": 0.79716265, + "learning_rate": 3.380572225034461e-06, + "loss": 0.81924868, + "num_input_tokens_seen": 100512870, + "router_z_loss_clip": 0.96191406, + "router_z_loss_mlp": 0.17199707, + "step": 4653, + "time_per_iteration": 2.610215425491333 + }, + { + "auxiliary_loss_clip": 0.0116073, + "auxiliary_loss_mlp": 0.01042353, + "balance_loss_clip": 1.06693506, + "balance_loss_mlp": 1.02516294, + "epoch": 0.2798136179167293, + "flos": 21579799697280.0, + "grad_norm": 2.06978652955113, + "language_loss": 0.78663969, + "learning_rate": 3.380290409114312e-06, + "loss": 0.80867052, + "num_input_tokens_seen": 100531655, + "router_z_loss_clip": 0.93701172, + "router_z_loss_mlp": 0.171875, + "step": 4654, + "time_per_iteration": 2.5094635486602783 + }, + { + "auxiliary_loss_clip": 0.01165087, + "auxiliary_loss_mlp": 0.01041922, + "balance_loss_clip": 1.06796789, + "balance_loss_mlp": 1.02425575, + "epoch": 0.2798737411693973, + "flos": 21537676022400.0, + "grad_norm": 2.1755099097821766, + "language_loss": 0.81316626, + "learning_rate": 3.3800085408534127e-06, + "loss": 0.83523631, + "num_input_tokens_seen": 100548005, + "router_z_loss_clip": 0.97021484, + "router_z_loss_mlp": 0.17675781, + "step": 4655, + "time_per_iteration": 2.469851493835449 + }, + { + "auxiliary_loss_clip": 0.01153282, + "auxiliary_loss_mlp": 0.0105229, + "balance_loss_clip": 1.05862093, + "balance_loss_mlp": 1.03418219, + "epoch": 0.27993386442206525, + "flos": 26981051713920.0, + "grad_norm": 1.607335058543615, + "language_loss": 0.8155182, + "learning_rate": 3.3797266202624506e-06, + "loss": 0.83757395, + "num_input_tokens_seen": 100567980, + "router_z_loss_clip": 0.94628906, + "router_z_loss_mlp": 0.18103027, + "step": 4656, + "time_per_iteration": 2.5639491081237793 + }, + { + "auxiliary_loss_clip": 0.01158261, + "auxiliary_loss_mlp": 0.01042752, + "balance_loss_clip": 1.06467295, + "balance_loss_mlp": 1.02559805, + "epoch": 0.2799939876747332, + "flos": 24349876652160.0, + "grad_norm": 1.6147808726304749, + "language_loss": 0.83424401, + "learning_rate": 3.3794446473521176e-06, + "loss": 0.8562541, + "num_input_tokens_seen": 100588630, + "router_z_loss_clip": 0.93603516, + "router_z_loss_mlp": 0.17150879, + "step": 4657, + "time_per_iteration": 2.469367504119873 + }, + { + "auxiliary_loss_clip": 0.01159465, + "auxiliary_loss_mlp": 0.01051292, + "balance_loss_clip": 1.0632906, + "balance_loss_mlp": 1.03225482, + "epoch": 0.2800541109274012, + "flos": 33656988648960.0, + "grad_norm": 2.110046044695754, + "language_loss": 0.63594425, + "learning_rate": 3.379162622133105e-06, + "loss": 0.65805179, + "num_input_tokens_seen": 100608775, + "router_z_loss_clip": 0.9609375, + "router_z_loss_mlp": 0.19042969, + "step": 4658, + "time_per_iteration": 2.568964719772339 + }, + { + "auxiliary_loss_clip": 0.01159993, + "auxiliary_loss_mlp": 0.01046235, + "balance_loss_clip": 1.06519318, + "balance_loss_mlp": 1.02896178, + "epoch": 0.28011423418006914, + "flos": 21614417429760.0, + "grad_norm": 2.701931015901796, + "language_loss": 0.7806592, + "learning_rate": 3.3788805446161073e-06, + "loss": 0.8027215, + "num_input_tokens_seen": 100627975, + "router_z_loss_clip": 0.94775391, + "router_z_loss_mlp": 0.17272949, + "step": 4659, + "time_per_iteration": 2.469757080078125 + }, + { + "auxiliary_loss_clip": 0.01165603, + "auxiliary_loss_mlp": 0.01050984, + "balance_loss_clip": 1.06890011, + "balance_loss_mlp": 1.03388906, + "epoch": 0.2801743574327371, + "flos": 23112431159040.0, + "grad_norm": 2.7189006553711397, + "language_loss": 0.79806328, + "learning_rate": 3.3785984148118215e-06, + "loss": 0.82022917, + "num_input_tokens_seen": 100645430, + "router_z_loss_clip": 0.96728516, + "router_z_loss_mlp": 0.17089844, + "step": 4660, + "time_per_iteration": 2.482682704925537 + }, + { + "auxiliary_loss_clip": 0.01164758, + "auxiliary_loss_mlp": 0.01043777, + "balance_loss_clip": 1.06920075, + "balance_loss_mlp": 1.0269444, + "epoch": 0.2802344806854051, + "flos": 12641418766080.0, + "grad_norm": 2.1826282169704903, + "language_loss": 0.80406225, + "learning_rate": 3.3783162327309453e-06, + "loss": 0.82614768, + "num_input_tokens_seen": 100663775, + "router_z_loss_clip": 0.95556641, + "router_z_loss_mlp": 0.16827393, + "step": 4661, + "time_per_iteration": 2.6774561405181885 + }, + { + "auxiliary_loss_clip": 0.01168881, + "auxiliary_loss_mlp": 0.01054684, + "balance_loss_clip": 1.07358825, + "balance_loss_mlp": 1.03772068, + "epoch": 0.28029460393807304, + "flos": 37267878142080.0, + "grad_norm": 1.6358822059292482, + "language_loss": 0.79106158, + "learning_rate": 3.3780339983841794e-06, + "loss": 0.81329721, + "num_input_tokens_seen": 100686085, + "router_z_loss_clip": 0.95263672, + "router_z_loss_mlp": 0.16967773, + "step": 4662, + "time_per_iteration": 2.6536900997161865 + }, + { + "auxiliary_loss_clip": 0.0116157, + "auxiliary_loss_mlp": 0.01045463, + "balance_loss_clip": 1.06316805, + "balance_loss_mlp": 1.02685416, + "epoch": 0.280354727190741, + "flos": 20741106061440.0, + "grad_norm": 1.699537102314809, + "language_loss": 0.69694018, + "learning_rate": 3.377751711782227e-06, + "loss": 0.71901047, + "num_input_tokens_seen": 100705135, + "router_z_loss_clip": 0.98486328, + "router_z_loss_mlp": 0.18615723, + "step": 4663, + "time_per_iteration": 2.420146942138672 + }, + { + "auxiliary_loss_clip": 0.01156774, + "auxiliary_loss_mlp": 0.01054226, + "balance_loss_clip": 1.0602088, + "balance_loss_mlp": 1.03429437, + "epoch": 0.28041485044340897, + "flos": 21471026336640.0, + "grad_norm": 1.9988197347039138, + "language_loss": 0.77741539, + "learning_rate": 3.377469372935791e-06, + "loss": 0.79952544, + "num_input_tokens_seen": 100724960, + "router_z_loss_clip": 0.96533203, + "router_z_loss_mlp": 0.19934082, + "step": 4664, + "time_per_iteration": 2.4943273067474365 + }, + { + "auxiliary_loss_clip": 0.01152945, + "auxiliary_loss_mlp": 0.01046254, + "balance_loss_clip": 1.06279731, + "balance_loss_mlp": 1.0298506, + "epoch": 0.28047497369607693, + "flos": 14794263388800.0, + "grad_norm": 2.1351604496867336, + "language_loss": 0.79552114, + "learning_rate": 3.377186981855578e-06, + "loss": 0.81751311, + "num_input_tokens_seen": 100741995, + "router_z_loss_clip": 0.90136719, + "router_z_loss_mlp": 0.1640625, + "step": 4665, + "time_per_iteration": 2.430943250656128 + }, + { + "auxiliary_loss_clip": 0.01154419, + "auxiliary_loss_mlp": 0.01049146, + "balance_loss_clip": 1.0600481, + "balance_loss_mlp": 1.03159881, + "epoch": 0.2805350969487449, + "flos": 23070738447360.0, + "grad_norm": 1.9506008299326627, + "language_loss": 0.81290847, + "learning_rate": 3.3769045385522968e-06, + "loss": 0.83494413, + "num_input_tokens_seen": 100758985, + "router_z_loss_clip": 0.94287109, + "router_z_loss_mlp": 0.17529297, + "step": 4666, + "time_per_iteration": 2.5022850036621094 + }, + { + "auxiliary_loss_clip": 0.01155304, + "auxiliary_loss_mlp": 0.01061466, + "balance_loss_clip": 1.06039739, + "balance_loss_mlp": 1.04264283, + "epoch": 0.2805952202014129, + "flos": 20479855466880.0, + "grad_norm": 1.8959211457108243, + "language_loss": 0.8482222, + "learning_rate": 3.376622043036658e-06, + "loss": 0.87038994, + "num_input_tokens_seen": 100777820, + "router_z_loss_clip": 0.94824219, + "router_z_loss_mlp": 0.18823242, + "step": 4667, + "time_per_iteration": 2.5227746963500977 + }, + { + "auxiliary_loss_clip": 0.01167038, + "auxiliary_loss_mlp": 0.01047862, + "balance_loss_clip": 1.06721067, + "balance_loss_mlp": 1.03071928, + "epoch": 0.2806553434540809, + "flos": 27417330305280.0, + "grad_norm": 1.7508781258413253, + "language_loss": 0.79315293, + "learning_rate": 3.376339495319373e-06, + "loss": 0.8153019, + "num_input_tokens_seen": 100798205, + "router_z_loss_clip": 0.99902344, + "router_z_loss_mlp": 0.17138672, + "step": 4668, + "time_per_iteration": 2.6000008583068848 + }, + { + "auxiliary_loss_clip": 0.0115632, + "auxiliary_loss_mlp": 0.01047561, + "balance_loss_clip": 1.05811834, + "balance_loss_mlp": 1.02911973, + "epoch": 0.28071546670674885, + "flos": 26505019745280.0, + "grad_norm": 1.5330559979186795, + "language_loss": 0.76139271, + "learning_rate": 3.3760568954111563e-06, + "loss": 0.78343153, + "num_input_tokens_seen": 100819800, + "router_z_loss_clip": 0.98193359, + "router_z_loss_mlp": 0.18432617, + "step": 4669, + "time_per_iteration": 2.5010175704956055 + }, + { + "auxiliary_loss_clip": 0.01155756, + "auxiliary_loss_mlp": 0.01045643, + "balance_loss_clip": 1.06270838, + "balance_loss_mlp": 1.02887034, + "epoch": 0.2807755899594168, + "flos": 20558679863040.0, + "grad_norm": 2.066335218810402, + "language_loss": 0.7903744, + "learning_rate": 3.375774243322725e-06, + "loss": 0.81238842, + "num_input_tokens_seen": 100837880, + "router_z_loss_clip": 0.93066406, + "router_z_loss_mlp": 0.16772461, + "step": 4670, + "time_per_iteration": 3.8483545780181885 + }, + { + "auxiliary_loss_clip": 0.01159122, + "auxiliary_loss_mlp": 0.01049672, + "balance_loss_clip": 1.06046915, + "balance_loss_mlp": 1.03090894, + "epoch": 0.2808357132120848, + "flos": 24313319585280.0, + "grad_norm": 1.8529513318653323, + "language_loss": 0.79274338, + "learning_rate": 3.3754915390647955e-06, + "loss": 0.81483138, + "num_input_tokens_seen": 100856350, + "router_z_loss_clip": 0.98681641, + "router_z_loss_mlp": 0.18762207, + "step": 4671, + "time_per_iteration": 2.473536491394043 + }, + { + "auxiliary_loss_clip": 0.01159551, + "auxiliary_loss_mlp": 0.01037182, + "balance_loss_clip": 1.06852913, + "balance_loss_mlp": 1.02163672, + "epoch": 0.28089583646475275, + "flos": 26432408401920.0, + "grad_norm": 1.764707385310303, + "language_loss": 0.74976945, + "learning_rate": 3.37520878264809e-06, + "loss": 0.77173674, + "num_input_tokens_seen": 100876135, + "router_z_loss_clip": 0.90917969, + "router_z_loss_mlp": 0.15551758, + "step": 4672, + "time_per_iteration": 2.485004425048828 + }, + { + "auxiliary_loss_clip": 0.01167612, + "auxiliary_loss_mlp": 0.01045809, + "balance_loss_clip": 1.0687201, + "balance_loss_mlp": 1.02687836, + "epoch": 0.2809559597174207, + "flos": 23111820627840.0, + "grad_norm": 3.889735281348478, + "language_loss": 0.75363767, + "learning_rate": 3.3749259740833286e-06, + "loss": 0.77577186, + "num_input_tokens_seen": 100894790, + "router_z_loss_clip": 0.98876953, + "router_z_loss_mlp": 0.18908691, + "step": 4673, + "time_per_iteration": 2.4491231441497803 + }, + { + "auxiliary_loss_clip": 0.01172255, + "auxiliary_loss_mlp": 0.01044963, + "balance_loss_clip": 1.0741632, + "balance_loss_mlp": 1.02783263, + "epoch": 0.2810160829700887, + "flos": 20923496346240.0, + "grad_norm": 1.8145142871373168, + "language_loss": 0.72610265, + "learning_rate": 3.374643113381237e-06, + "loss": 0.7482748, + "num_input_tokens_seen": 100915100, + "router_z_loss_clip": 0.98144531, + "router_z_loss_mlp": 0.17138672, + "step": 4674, + "time_per_iteration": 2.511155128479004 + }, + { + "auxiliary_loss_clip": 0.01159927, + "auxiliary_loss_mlp": 0.01040735, + "balance_loss_clip": 1.06445193, + "balance_loss_mlp": 1.02263904, + "epoch": 0.28107620622275664, + "flos": 14355901808640.0, + "grad_norm": 2.32283107015432, + "language_loss": 0.78004003, + "learning_rate": 3.374360200552541e-06, + "loss": 0.80204666, + "num_input_tokens_seen": 100932795, + "router_z_loss_clip": 0.95507812, + "router_z_loss_mlp": 0.18103027, + "step": 4675, + "time_per_iteration": 2.6955206394195557 + }, + { + "auxiliary_loss_clip": 0.01157121, + "auxiliary_loss_mlp": 0.01042368, + "balance_loss_clip": 1.06140876, + "balance_loss_mlp": 1.02480912, + "epoch": 0.2811363294754246, + "flos": 20919078973440.0, + "grad_norm": 1.8070907576877753, + "language_loss": 0.70242989, + "learning_rate": 3.374077235607968e-06, + "loss": 0.72442472, + "num_input_tokens_seen": 100950505, + "router_z_loss_clip": 0.95703125, + "router_z_loss_mlp": 0.17578125, + "step": 4676, + "time_per_iteration": 2.462278127670288 + }, + { + "auxiliary_loss_clip": 0.01154757, + "auxiliary_loss_mlp": 0.01040787, + "balance_loss_clip": 1.06664336, + "balance_loss_mlp": 1.02424097, + "epoch": 0.28119645272809257, + "flos": 20594841880320.0, + "grad_norm": 1.8604827521064289, + "language_loss": 0.70383298, + "learning_rate": 3.3737942185582487e-06, + "loss": 0.72578847, + "num_input_tokens_seen": 100968790, + "router_z_loss_clip": 0.88134766, + "router_z_loss_mlp": 0.16552734, + "step": 4677, + "time_per_iteration": 2.4599251747131348 + }, + { + "auxiliary_loss_clip": 0.01152052, + "auxiliary_loss_mlp": 0.01046319, + "balance_loss_clip": 1.05859005, + "balance_loss_mlp": 1.02718556, + "epoch": 0.28125657598076054, + "flos": 25337420248320.0, + "grad_norm": 2.0248991546738226, + "language_loss": 0.63661242, + "learning_rate": 3.3735111494141153e-06, + "loss": 0.65859616, + "num_input_tokens_seen": 100990205, + "router_z_loss_clip": 0.93505859, + "router_z_loss_mlp": 0.19152832, + "step": 4678, + "time_per_iteration": 2.5819411277770996 + }, + { + "auxiliary_loss_clip": 0.01152107, + "auxiliary_loss_mlp": 0.01044549, + "balance_loss_clip": 1.05902028, + "balance_loss_mlp": 1.02758515, + "epoch": 0.2813166992334285, + "flos": 24827093769600.0, + "grad_norm": 1.5538353345562237, + "language_loss": 0.70869946, + "learning_rate": 3.3732280281863013e-06, + "loss": 0.73066598, + "num_input_tokens_seen": 101009815, + "router_z_loss_clip": 0.93115234, + "router_z_loss_mlp": 0.16967773, + "step": 4679, + "time_per_iteration": 2.5135319232940674 + }, + { + "auxiliary_loss_clip": 0.01148117, + "auxiliary_loss_mlp": 0.01041941, + "balance_loss_clip": 1.05507326, + "balance_loss_mlp": 1.02442944, + "epoch": 0.2813768224860965, + "flos": 21760753438080.0, + "grad_norm": 1.954097549834098, + "language_loss": 0.74860418, + "learning_rate": 3.3729448548855422e-06, + "loss": 0.77050471, + "num_input_tokens_seen": 101026780, + "router_z_loss_clip": 0.93066406, + "router_z_loss_mlp": 0.17504883, + "step": 4680, + "time_per_iteration": 5.4449028968811035 + }, + { + "auxiliary_loss_clip": 0.01155109, + "auxiliary_loss_mlp": 0.01040654, + "balance_loss_clip": 1.05827427, + "balance_loss_mlp": 1.02375031, + "epoch": 0.2814369457387645, + "flos": 24316803204480.0, + "grad_norm": 1.5777268108306908, + "language_loss": 0.77292389, + "learning_rate": 3.3726616295225774e-06, + "loss": 0.79488146, + "num_input_tokens_seen": 101046215, + "router_z_loss_clip": 0.96826172, + "router_z_loss_mlp": 0.16918945, + "step": 4681, + "time_per_iteration": 2.5182034969329834 + }, + { + "auxiliary_loss_clip": 0.01153593, + "auxiliary_loss_mlp": 0.01040482, + "balance_loss_clip": 1.05735493, + "balance_loss_mlp": 1.02271986, + "epoch": 0.28149706899143245, + "flos": 18515326872960.0, + "grad_norm": 2.0212847252274977, + "language_loss": 0.7438997, + "learning_rate": 3.372378352108146e-06, + "loss": 0.76584041, + "num_input_tokens_seen": 101063365, + "router_z_loss_clip": 0.96191406, + "router_z_loss_mlp": 0.17773438, + "step": 4682, + "time_per_iteration": 2.5026354789733887 + }, + { + "auxiliary_loss_clip": 0.01152609, + "auxiliary_loss_mlp": 0.01044725, + "balance_loss_clip": 1.06127596, + "balance_loss_mlp": 1.02858424, + "epoch": 0.2815571922441004, + "flos": 24863255786880.0, + "grad_norm": 1.6696824374420813, + "language_loss": 0.80711734, + "learning_rate": 3.3720950226529894e-06, + "loss": 0.82909071, + "num_input_tokens_seen": 101083835, + "router_z_loss_clip": 0.91357422, + "router_z_loss_mlp": 0.16137695, + "step": 4683, + "time_per_iteration": 2.509338855743408 + }, + { + "auxiliary_loss_clip": 0.01161943, + "auxiliary_loss_mlp": 0.01047383, + "balance_loss_clip": 1.06596744, + "balance_loss_mlp": 1.02945411, + "epoch": 0.2816173154967684, + "flos": 19901622326400.0, + "grad_norm": 1.6258933476744772, + "language_loss": 0.76258147, + "learning_rate": 3.371811641167852e-06, + "loss": 0.78467476, + "num_input_tokens_seen": 101101740, + "router_z_loss_clip": 0.95898438, + "router_z_loss_mlp": 0.17944336, + "step": 4684, + "time_per_iteration": 3.832430124282837 + }, + { + "auxiliary_loss_clip": 0.01150427, + "auxiliary_loss_mlp": 0.01036272, + "balance_loss_clip": 1.05742884, + "balance_loss_mlp": 1.01959491, + "epoch": 0.28167743874943635, + "flos": 17491333950720.0, + "grad_norm": 2.012314946331915, + "language_loss": 0.76300192, + "learning_rate": 3.3715282076634807e-06, + "loss": 0.78486896, + "num_input_tokens_seen": 101120480, + "router_z_loss_clip": 0.93017578, + "router_z_loss_mlp": 0.16674805, + "step": 4685, + "time_per_iteration": 2.4168541431427 + }, + { + "auxiliary_loss_clip": 0.01148158, + "auxiliary_loss_mlp": 0.01043689, + "balance_loss_clip": 1.05729949, + "balance_loss_mlp": 1.02692819, + "epoch": 0.2817375620021043, + "flos": 25302120157440.0, + "grad_norm": 1.619161899938788, + "language_loss": 0.76079994, + "learning_rate": 3.3712447221506218e-06, + "loss": 0.78271842, + "num_input_tokens_seen": 101142910, + "router_z_loss_clip": 0.90966797, + "router_z_loss_mlp": 0.16748047, + "step": 4686, + "time_per_iteration": 2.5440497398376465 + }, + { + "auxiliary_loss_clip": 0.01150932, + "auxiliary_loss_mlp": 0.01049896, + "balance_loss_clip": 1.05641901, + "balance_loss_mlp": 1.03102517, + "epoch": 0.2817976852547723, + "flos": 18693227957760.0, + "grad_norm": 4.49763279020504, + "language_loss": 0.63575977, + "learning_rate": 3.370961184640025e-06, + "loss": 0.65776807, + "num_input_tokens_seen": 101160030, + "router_z_loss_clip": 0.94580078, + "router_z_loss_mlp": 0.18884277, + "step": 4687, + "time_per_iteration": 2.680602788925171 + }, + { + "auxiliary_loss_clip": 0.01155575, + "auxiliary_loss_mlp": 0.01045885, + "balance_loss_clip": 1.05866551, + "balance_loss_mlp": 1.0288744, + "epoch": 0.28185780850744024, + "flos": 22742263549440.0, + "grad_norm": 2.0965230832274266, + "language_loss": 0.76883245, + "learning_rate": 3.3706775951424433e-06, + "loss": 0.79084706, + "num_input_tokens_seen": 101177675, + "router_z_loss_clip": 0.96923828, + "router_z_loss_mlp": 0.17016602, + "step": 4688, + "time_per_iteration": 2.4998273849487305 + }, + { + "auxiliary_loss_clip": 0.01153589, + "auxiliary_loss_mlp": 0.01040908, + "balance_loss_clip": 1.06051993, + "balance_loss_mlp": 1.02417135, + "epoch": 0.2819179317601082, + "flos": 14933919467520.0, + "grad_norm": 2.0028199658891066, + "language_loss": 0.78915477, + "learning_rate": 3.37039395366863e-06, + "loss": 0.81109977, + "num_input_tokens_seen": 101192225, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 0.16760254, + "step": 4689, + "time_per_iteration": 2.3992061614990234 + }, + { + "auxiliary_loss_clip": 0.01147939, + "auxiliary_loss_mlp": 0.01039083, + "balance_loss_clip": 1.05483425, + "balance_loss_mlp": 1.02254879, + "epoch": 0.2819780550127762, + "flos": 23145325038720.0, + "grad_norm": 1.6682879377866582, + "language_loss": 0.78356683, + "learning_rate": 3.37011026022934e-06, + "loss": 0.80543709, + "num_input_tokens_seen": 101210870, + "router_z_loss_clip": 0.93066406, + "router_z_loss_mlp": 0.1652832, + "step": 4690, + "time_per_iteration": 2.5226731300354004 + }, + { + "auxiliary_loss_clip": 0.01154279, + "auxiliary_loss_mlp": 0.01050628, + "balance_loss_clip": 1.05922675, + "balance_loss_mlp": 1.03277063, + "epoch": 0.28203817826544414, + "flos": 21616356764160.0, + "grad_norm": 1.8153003650244177, + "language_loss": 0.87102747, + "learning_rate": 3.369826514835332e-06, + "loss": 0.89307654, + "num_input_tokens_seen": 101229965, + "router_z_loss_clip": 0.95068359, + "router_z_loss_mlp": 0.17858887, + "step": 4691, + "time_per_iteration": 2.474034309387207 + }, + { + "auxiliary_loss_clip": 0.01153434, + "auxiliary_loss_mlp": 0.01053573, + "balance_loss_clip": 1.05450559, + "balance_loss_mlp": 1.03502417, + "epoch": 0.2820983015181121, + "flos": 24026788794240.0, + "grad_norm": 2.1587410854045186, + "language_loss": 0.82156491, + "learning_rate": 3.3695427174973654e-06, + "loss": 0.84363496, + "num_input_tokens_seen": 101250980, + "router_z_loss_clip": 0.98925781, + "router_z_loss_mlp": 0.1854248, + "step": 4692, + "time_per_iteration": 2.5072643756866455 + }, + { + "auxiliary_loss_clip": 0.01156509, + "auxiliary_loss_mlp": 0.01042145, + "balance_loss_clip": 1.06208467, + "balance_loss_mlp": 1.02484763, + "epoch": 0.2821584247707801, + "flos": 30007925976960.0, + "grad_norm": 1.580735098457692, + "language_loss": 0.74709737, + "learning_rate": 3.3692588682262022e-06, + "loss": 0.76908386, + "num_input_tokens_seen": 101273335, + "router_z_loss_clip": 0.9453125, + "router_z_loss_mlp": 0.1730957, + "step": 4693, + "time_per_iteration": 2.5981709957122803 + }, + { + "auxiliary_loss_clip": 0.01156913, + "auxiliary_loss_mlp": 0.01039222, + "balance_loss_clip": 1.06012058, + "balance_loss_mlp": 1.02241385, + "epoch": 0.2822185480234481, + "flos": 21396762967680.0, + "grad_norm": 1.6477605718690809, + "language_loss": 0.77459025, + "learning_rate": 3.3689749670326046e-06, + "loss": 0.7965517, + "num_input_tokens_seen": 101292110, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.16809082, + "step": 4694, + "time_per_iteration": 2.525750160217285 + }, + { + "auxiliary_loss_clip": 0.01149217, + "auxiliary_loss_mlp": 0.01041507, + "balance_loss_clip": 1.05783832, + "balance_loss_mlp": 1.02444863, + "epoch": 0.28227867127611606, + "flos": 27452809964160.0, + "grad_norm": 2.07308164584265, + "language_loss": 0.6699335, + "learning_rate": 3.3686910139273392e-06, + "loss": 0.69184071, + "num_input_tokens_seen": 101312815, + "router_z_loss_clip": 0.91357422, + "router_z_loss_mlp": 0.1706543, + "step": 4695, + "time_per_iteration": 2.5421388149261475 + }, + { + "auxiliary_loss_clip": 0.01161617, + "auxiliary_loss_mlp": 0.01050087, + "balance_loss_clip": 1.06509185, + "balance_loss_mlp": 1.03100121, + "epoch": 0.282338794528784, + "flos": 22593736811520.0, + "grad_norm": 2.492609701338834, + "language_loss": 0.75731897, + "learning_rate": 3.3684070089211736e-06, + "loss": 0.77943599, + "num_input_tokens_seen": 101329045, + "router_z_loss_clip": 0.96582031, + "router_z_loss_mlp": 0.19091797, + "step": 4696, + "time_per_iteration": 2.5243477821350098 + }, + { + "auxiliary_loss_clip": 0.01165114, + "auxiliary_loss_mlp": 0.01045198, + "balance_loss_clip": 1.06791544, + "balance_loss_mlp": 1.02877069, + "epoch": 0.282398917781452, + "flos": 42010923386880.0, + "grad_norm": 1.5437283213544486, + "language_loss": 0.62810248, + "learning_rate": 3.368122952024877e-06, + "loss": 0.65020561, + "num_input_tokens_seen": 101352715, + "router_z_loss_clip": 0.97119141, + "router_z_loss_mlp": 0.16418457, + "step": 4697, + "time_per_iteration": 2.694425106048584 + }, + { + "auxiliary_loss_clip": 0.01151378, + "auxiliary_loss_mlp": 0.01036601, + "balance_loss_clip": 1.06020498, + "balance_loss_mlp": 1.02105582, + "epoch": 0.28245904103411995, + "flos": 23224724052480.0, + "grad_norm": 1.6223773848917673, + "language_loss": 0.73506761, + "learning_rate": 3.3678388432492214e-06, + "loss": 0.7569474, + "num_input_tokens_seen": 101374640, + "router_z_loss_clip": 0.91113281, + "router_z_loss_mlp": 0.15551758, + "step": 4698, + "time_per_iteration": 2.653388261795044 + }, + { + "auxiliary_loss_clip": 0.01147395, + "auxiliary_loss_mlp": 0.0103977, + "balance_loss_clip": 1.05727029, + "balance_loss_mlp": 1.02404642, + "epoch": 0.2825191642867879, + "flos": 25374623760000.0, + "grad_norm": 1.761927897354075, + "language_loss": 0.74737787, + "learning_rate": 3.3675546826049788e-06, + "loss": 0.76924944, + "num_input_tokens_seen": 101393595, + "router_z_loss_clip": 0.90087891, + "router_z_loss_mlp": 0.15716553, + "step": 4699, + "time_per_iteration": 2.538597822189331 + }, + { + "auxiliary_loss_clip": 0.01156585, + "auxiliary_loss_mlp": 0.01038875, + "balance_loss_clip": 1.06157601, + "balance_loss_mlp": 1.02088618, + "epoch": 0.2825792875394559, + "flos": 17236799199360.0, + "grad_norm": 2.830643196862939, + "language_loss": 0.80069494, + "learning_rate": 3.3672704701029265e-06, + "loss": 0.8226496, + "num_input_tokens_seen": 101409265, + "router_z_loss_clip": 0.95068359, + "router_z_loss_mlp": 0.17993164, + "step": 4700, + "time_per_iteration": 2.5249085426330566 + }, + { + "auxiliary_loss_clip": 0.01154004, + "auxiliary_loss_mlp": 0.01044039, + "balance_loss_clip": 1.0639267, + "balance_loss_mlp": 1.02879179, + "epoch": 0.28263941079212385, + "flos": 26723967096960.0, + "grad_norm": 1.8052236411339586, + "language_loss": 0.81786782, + "learning_rate": 3.3669862057538402e-06, + "loss": 0.83984828, + "num_input_tokens_seen": 101428365, + "router_z_loss_clip": 0.90087891, + "router_z_loss_mlp": 0.15246582, + "step": 4701, + "time_per_iteration": 2.595005512237549 + }, + { + "auxiliary_loss_clip": 0.01158649, + "auxiliary_loss_mlp": 0.01041067, + "balance_loss_clip": 1.06613207, + "balance_loss_mlp": 1.02540302, + "epoch": 0.2826995340447918, + "flos": 25921327737600.0, + "grad_norm": 3.1944188718894315, + "language_loss": 0.73360133, + "learning_rate": 3.3667018895685004e-06, + "loss": 0.75559855, + "num_input_tokens_seen": 101447280, + "router_z_loss_clip": 0.92480469, + "router_z_loss_mlp": 0.15673828, + "step": 4702, + "time_per_iteration": 2.6386725902557373 + }, + { + "auxiliary_loss_clip": 0.01149314, + "auxiliary_loss_mlp": 0.0103936, + "balance_loss_clip": 1.05935287, + "balance_loss_mlp": 1.02245605, + "epoch": 0.2827596572974598, + "flos": 22379709623040.0, + "grad_norm": 4.935960185492025, + "language_loss": 0.7837373, + "learning_rate": 3.3664175215576886e-06, + "loss": 0.80562401, + "num_input_tokens_seen": 101465435, + "router_z_loss_clip": 0.89990234, + "router_z_loss_mlp": 0.16906738, + "step": 4703, + "time_per_iteration": 2.5066473484039307 + }, + { + "auxiliary_loss_clip": 0.01153662, + "auxiliary_loss_mlp": 0.01047654, + "balance_loss_clip": 1.06000745, + "balance_loss_mlp": 1.02993989, + "epoch": 0.28281978055012774, + "flos": 33547137880320.0, + "grad_norm": 1.9154982281310124, + "language_loss": 0.69436944, + "learning_rate": 3.3661331017321867e-06, + "loss": 0.71638262, + "num_input_tokens_seen": 101486355, + "router_z_loss_clip": 0.93652344, + "router_z_loss_mlp": 0.17712402, + "step": 4704, + "time_per_iteration": 2.5736165046691895 + }, + { + "auxiliary_loss_clip": 0.01153292, + "auxiliary_loss_mlp": 0.01039318, + "balance_loss_clip": 1.06229711, + "balance_loss_mlp": 1.02196169, + "epoch": 0.2828799038027957, + "flos": 23440870143360.0, + "grad_norm": 2.22664391241379, + "language_loss": 0.70531195, + "learning_rate": 3.3658486301027807e-06, + "loss": 0.72723806, + "num_input_tokens_seen": 101505875, + "router_z_loss_clip": 0.90869141, + "router_z_loss_mlp": 0.17346191, + "step": 4705, + "time_per_iteration": 2.5006561279296875 + }, + { + "auxiliary_loss_clip": 0.01103299, + "auxiliary_loss_mlp": 0.01014403, + "balance_loss_clip": 1.07048023, + "balance_loss_mlp": 1.01233149, + "epoch": 0.2829400270554637, + "flos": 69873690251520.0, + "grad_norm": 0.719505952462226, + "language_loss": 0.59257096, + "learning_rate": 3.3655641066802577e-06, + "loss": 0.61374795, + "num_input_tokens_seen": 101565045, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 0.02072144, + "step": 4706, + "time_per_iteration": 3.1721999645233154 + }, + { + "auxiliary_loss_clip": 0.01150354, + "auxiliary_loss_mlp": 0.01039983, + "balance_loss_clip": 1.06210661, + "balance_loss_mlp": 1.02485561, + "epoch": 0.2830001503081317, + "flos": 24789028331520.0, + "grad_norm": 1.4655554039462402, + "language_loss": 0.82140374, + "learning_rate": 3.365279531475407e-06, + "loss": 0.84330714, + "num_input_tokens_seen": 101585825, + "router_z_loss_clip": 0.88330078, + "router_z_loss_mlp": 0.15124512, + "step": 4707, + "time_per_iteration": 2.5604920387268066 + }, + { + "auxiliary_loss_clip": 0.01147418, + "auxiliary_loss_mlp": 0.01036342, + "balance_loss_clip": 1.05286932, + "balance_loss_mlp": 1.01852024, + "epoch": 0.28306027356079966, + "flos": 27669387018240.0, + "grad_norm": 1.4643599907455394, + "language_loss": 0.80502069, + "learning_rate": 3.36499490449902e-06, + "loss": 0.82685828, + "num_input_tokens_seen": 101606105, + "router_z_loss_clip": 0.9453125, + "router_z_loss_mlp": 0.17822266, + "step": 4708, + "time_per_iteration": 2.494471549987793 + }, + { + "auxiliary_loss_clip": 0.01086947, + "auxiliary_loss_mlp": 0.01005675, + "balance_loss_clip": 1.05518293, + "balance_loss_mlp": 1.00359166, + "epoch": 0.2831203968134676, + "flos": 60527938199040.0, + "grad_norm": 0.8798951129199218, + "language_loss": 0.62844783, + "learning_rate": 3.3647102257618895e-06, + "loss": 0.64937401, + "num_input_tokens_seen": 101656875, + "router_z_loss_clip": 0.31738281, + "router_z_loss_mlp": 0.02084351, + "step": 4709, + "time_per_iteration": 2.991544008255005 + }, + { + "auxiliary_loss_clip": 0.01142212, + "auxiliary_loss_mlp": 0.01038747, + "balance_loss_clip": 1.05289781, + "balance_loss_mlp": 1.02236724, + "epoch": 0.2831805200661356, + "flos": 22054790171520.0, + "grad_norm": 1.4560271924345256, + "language_loss": 0.74098933, + "learning_rate": 3.3644254952748103e-06, + "loss": 0.76279891, + "num_input_tokens_seen": 101676225, + "router_z_loss_clip": 0.89306641, + "router_z_loss_mlp": 0.16369629, + "step": 4710, + "time_per_iteration": 2.480055332183838 + }, + { + "auxiliary_loss_clip": 0.01146209, + "auxiliary_loss_mlp": 0.01052606, + "balance_loss_clip": 1.05377352, + "balance_loss_mlp": 1.03411698, + "epoch": 0.28324064331880355, + "flos": 22600668136320.0, + "grad_norm": 1.8665953167580365, + "language_loss": 0.79031062, + "learning_rate": 3.364140713048579e-06, + "loss": 0.81229877, + "num_input_tokens_seen": 101693710, + "router_z_loss_clip": 0.92529297, + "router_z_loss_mlp": 0.18475342, + "step": 4711, + "time_per_iteration": 2.502692937850952 + }, + { + "auxiliary_loss_clip": 0.01161253, + "auxiliary_loss_mlp": 0.01040905, + "balance_loss_clip": 1.0638361, + "balance_loss_mlp": 1.02378678, + "epoch": 0.2833007665714715, + "flos": 30404127968640.0, + "grad_norm": 2.012366617145813, + "language_loss": 0.70463026, + "learning_rate": 3.363855879093996e-06, + "loss": 0.72665185, + "num_input_tokens_seen": 101714010, + "router_z_loss_clip": 0.97363281, + "router_z_loss_mlp": 0.17138672, + "step": 4712, + "time_per_iteration": 2.5411746501922607 + }, + { + "auxiliary_loss_clip": 0.01161037, + "auxiliary_loss_mlp": 0.01047104, + "balance_loss_clip": 1.06671715, + "balance_loss_mlp": 1.02978253, + "epoch": 0.2833608898241395, + "flos": 23549499849600.0, + "grad_norm": 2.491985523907309, + "language_loss": 0.81728804, + "learning_rate": 3.3635709934218605e-06, + "loss": 0.83936942, + "num_input_tokens_seen": 101732995, + "router_z_loss_clip": 0.94482422, + "router_z_loss_mlp": 0.17321777, + "step": 4713, + "time_per_iteration": 2.6309335231781006 + }, + { + "auxiliary_loss_clip": 0.0115153, + "auxiliary_loss_mlp": 0.01040636, + "balance_loss_clip": 1.0600487, + "balance_loss_mlp": 1.02329147, + "epoch": 0.28342101307680745, + "flos": 20266726118400.0, + "grad_norm": 2.3847469939256745, + "language_loss": 0.75505531, + "learning_rate": 3.3632860560429766e-06, + "loss": 0.77697706, + "num_input_tokens_seen": 101751385, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.17346191, + "step": 4714, + "time_per_iteration": 4.192798852920532 + }, + { + "auxiliary_loss_clip": 0.01149409, + "auxiliary_loss_mlp": 0.0104656, + "balance_loss_clip": 1.05593359, + "balance_loss_mlp": 1.02970409, + "epoch": 0.2834811363294754, + "flos": 30847050576000.0, + "grad_norm": 1.5134935405416332, + "language_loss": 0.78273594, + "learning_rate": 3.3630010669681494e-06, + "loss": 0.80469561, + "num_input_tokens_seen": 101773825, + "router_z_loss_clip": 0.93457031, + "router_z_loss_mlp": 0.16845703, + "step": 4715, + "time_per_iteration": 2.50993275642395 + }, + { + "auxiliary_loss_clip": 0.01148041, + "auxiliary_loss_mlp": 0.01039667, + "balance_loss_clip": 1.05710161, + "balance_loss_mlp": 1.02302516, + "epoch": 0.2835412595821434, + "flos": 22711021695360.0, + "grad_norm": 2.0950232779335156, + "language_loss": 0.7373684, + "learning_rate": 3.3627160262081845e-06, + "loss": 0.75924551, + "num_input_tokens_seen": 101791920, + "router_z_loss_clip": 0.90820312, + "router_z_loss_mlp": 0.16644287, + "step": 4716, + "time_per_iteration": 2.4784655570983887 + }, + { + "auxiliary_loss_clip": 0.01158412, + "auxiliary_loss_mlp": 0.01048924, + "balance_loss_clip": 1.05920219, + "balance_loss_mlp": 1.02919447, + "epoch": 0.28360138283481134, + "flos": 18077719478400.0, + "grad_norm": 2.5834795672932818, + "language_loss": 0.74263966, + "learning_rate": 3.3624309337738917e-06, + "loss": 0.76471305, + "num_input_tokens_seen": 101809515, + "router_z_loss_clip": 0.9921875, + "router_z_loss_mlp": 0.19726562, + "step": 4717, + "time_per_iteration": 2.4195032119750977 + }, + { + "auxiliary_loss_clip": 0.01152953, + "auxiliary_loss_mlp": 0.01045653, + "balance_loss_clip": 1.05635953, + "balance_loss_mlp": 1.02848709, + "epoch": 0.2836615060874793, + "flos": 17854785717120.0, + "grad_norm": 1.6491491028118013, + "language_loss": 0.66729605, + "learning_rate": 3.3621457896760813e-06, + "loss": 0.68928212, + "num_input_tokens_seen": 101827735, + "router_z_loss_clip": 0.96582031, + "router_z_loss_mlp": 0.17163086, + "step": 4718, + "time_per_iteration": 2.4742329120635986 + }, + { + "auxiliary_loss_clip": 0.01155536, + "auxiliary_loss_mlp": 0.01046876, + "balance_loss_clip": 1.05750847, + "balance_loss_mlp": 1.02883995, + "epoch": 0.2837216293401473, + "flos": 25740302169600.0, + "grad_norm": 2.208096604484872, + "language_loss": 0.72598386, + "learning_rate": 3.361860593925566e-06, + "loss": 0.74800801, + "num_input_tokens_seen": 101845970, + "router_z_loss_clip": 0.97949219, + "router_z_loss_mlp": 0.18041992, + "step": 4719, + "time_per_iteration": 2.5874369144439697 + }, + { + "auxiliary_loss_clip": 0.01149092, + "auxiliary_loss_mlp": 0.01039844, + "balance_loss_clip": 1.05883288, + "balance_loss_mlp": 1.0232141, + "epoch": 0.2837817525928153, + "flos": 20923532259840.0, + "grad_norm": 1.6689673823849425, + "language_loss": 0.80030805, + "learning_rate": 3.3615753465331605e-06, + "loss": 0.82219744, + "num_input_tokens_seen": 101865040, + "router_z_loss_clip": 0.90283203, + "router_z_loss_mlp": 0.16601562, + "step": 4720, + "time_per_iteration": 2.5309829711914062 + }, + { + "auxiliary_loss_clip": 0.01155, + "auxiliary_loss_mlp": 0.01044601, + "balance_loss_clip": 1.05998087, + "balance_loss_mlp": 1.02667212, + "epoch": 0.28384187584548326, + "flos": 18916700423040.0, + "grad_norm": 1.8663308255697664, + "language_loss": 0.7948603, + "learning_rate": 3.3612900475096817e-06, + "loss": 0.81685638, + "num_input_tokens_seen": 101883735, + "router_z_loss_clip": 0.95019531, + "router_z_loss_mlp": 0.17932129, + "step": 4721, + "time_per_iteration": 2.5657265186309814 + }, + { + "auxiliary_loss_clip": 0.01149986, + "auxiliary_loss_mlp": 0.01036307, + "balance_loss_clip": 1.05694473, + "balance_loss_mlp": 1.01986802, + "epoch": 0.2839019990981512, + "flos": 27343964776320.0, + "grad_norm": 2.358914639929054, + "language_loss": 0.82797897, + "learning_rate": 3.3610046968659474e-06, + "loss": 0.84984183, + "num_input_tokens_seen": 101903025, + "router_z_loss_clip": 0.93017578, + "router_z_loss_mlp": 0.16430664, + "step": 4722, + "time_per_iteration": 2.57597279548645 + }, + { + "auxiliary_loss_clip": 0.01149556, + "auxiliary_loss_mlp": 0.01039722, + "balance_loss_clip": 1.05706775, + "balance_loss_mlp": 1.02336597, + "epoch": 0.2839621223508192, + "flos": 18114312458880.0, + "grad_norm": 1.8517560699886673, + "language_loss": 0.70221734, + "learning_rate": 3.3607192946127785e-06, + "loss": 0.72411013, + "num_input_tokens_seen": 101922255, + "router_z_loss_clip": 0.92431641, + "router_z_loss_mlp": 0.16345215, + "step": 4723, + "time_per_iteration": 4.016258239746094 + }, + { + "auxiliary_loss_clip": 0.01149889, + "auxiliary_loss_mlp": 0.01042712, + "balance_loss_clip": 1.05838978, + "balance_loss_mlp": 1.02512884, + "epoch": 0.28402224560348716, + "flos": 26358360514560.0, + "grad_norm": 1.441284707481104, + "language_loss": 0.78676605, + "learning_rate": 3.360433840760998e-06, + "loss": 0.8086921, + "num_input_tokens_seen": 101943100, + "router_z_loss_clip": 0.91455078, + "router_z_loss_mlp": 0.17578125, + "step": 4724, + "time_per_iteration": 3.8902275562286377 + }, + { + "auxiliary_loss_clip": 0.01148891, + "auxiliary_loss_mlp": 0.01044764, + "balance_loss_clip": 1.05581093, + "balance_loss_mlp": 1.02808714, + "epoch": 0.2840823688561551, + "flos": 24060795995520.0, + "grad_norm": 1.6540044156136304, + "language_loss": 0.92380726, + "learning_rate": 3.36014833532143e-06, + "loss": 0.9457438, + "num_input_tokens_seen": 101963160, + "router_z_loss_clip": 0.93115234, + "router_z_loss_mlp": 0.16674805, + "step": 4725, + "time_per_iteration": 2.5069220066070557 + }, + { + "auxiliary_loss_clip": 0.01156224, + "auxiliary_loss_mlp": 0.01043704, + "balance_loss_clip": 1.05981219, + "balance_loss_mlp": 1.0250001, + "epoch": 0.2841424921088231, + "flos": 29459821368960.0, + "grad_norm": 1.5842987200031795, + "language_loss": 0.88642693, + "learning_rate": 3.3598627783049e-06, + "loss": 0.90842617, + "num_input_tokens_seen": 101984300, + "router_z_loss_clip": 0.96386719, + "router_z_loss_mlp": 0.18713379, + "step": 4726, + "time_per_iteration": 2.6441619396209717 + }, + { + "auxiliary_loss_clip": 0.01145701, + "auxiliary_loss_mlp": 0.0104323, + "balance_loss_clip": 1.05312121, + "balance_loss_mlp": 1.02611113, + "epoch": 0.28420261536149105, + "flos": 48100367053440.0, + "grad_norm": 5.055943546280309, + "language_loss": 0.78804231, + "learning_rate": 3.359577169722238e-06, + "loss": 0.80993164, + "num_input_tokens_seen": 102005765, + "router_z_loss_clip": 0.92578125, + "router_z_loss_mlp": 0.17114258, + "step": 4727, + "time_per_iteration": 2.9084792137145996 + }, + { + "auxiliary_loss_clip": 0.01147134, + "auxiliary_loss_mlp": 0.01038397, + "balance_loss_clip": 1.05693507, + "balance_loss_mlp": 1.02307844, + "epoch": 0.284262738614159, + "flos": 25666146541440.0, + "grad_norm": 3.6686018348006852, + "language_loss": 0.66747963, + "learning_rate": 3.3592915095842733e-06, + "loss": 0.68933493, + "num_input_tokens_seen": 102022755, + "router_z_loss_clip": 0.90185547, + "router_z_loss_mlp": 0.15319824, + "step": 4728, + "time_per_iteration": 3.9332079887390137 + }, + { + "auxiliary_loss_clip": 0.01140191, + "auxiliary_loss_mlp": 0.01044029, + "balance_loss_clip": 1.04875028, + "balance_loss_mlp": 1.02618289, + "epoch": 0.284322861866827, + "flos": 19718980646400.0, + "grad_norm": 1.7373764147468023, + "language_loss": 0.76465261, + "learning_rate": 3.3590057979018386e-06, + "loss": 0.78649479, + "num_input_tokens_seen": 102041850, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.17834473, + "step": 4729, + "time_per_iteration": 2.503312349319458 + }, + { + "auxiliary_loss_clip": 0.01150883, + "auxiliary_loss_mlp": 0.01045967, + "balance_loss_clip": 1.05531895, + "balance_loss_mlp": 1.02893209, + "epoch": 0.28438298511949495, + "flos": 23915250086400.0, + "grad_norm": 2.0968514993171423, + "language_loss": 0.66631943, + "learning_rate": 3.3587200346857674e-06, + "loss": 0.68828797, + "num_input_tokens_seen": 102059500, + "router_z_loss_clip": 0.95458984, + "router_z_loss_mlp": 0.17028809, + "step": 4730, + "time_per_iteration": 2.5121877193450928 + }, + { + "auxiliary_loss_clip": 0.01154846, + "auxiliary_loss_mlp": 0.01040258, + "balance_loss_clip": 1.06319427, + "balance_loss_mlp": 1.02300823, + "epoch": 0.2844431083721629, + "flos": 26067340523520.0, + "grad_norm": 1.78783755010944, + "language_loss": 0.74333358, + "learning_rate": 3.3584342199468965e-06, + "loss": 0.76528454, + "num_input_tokens_seen": 102080460, + "router_z_loss_clip": 0.91699219, + "router_z_loss_mlp": 0.17248535, + "step": 4731, + "time_per_iteration": 2.483304977416992 + }, + { + "auxiliary_loss_clip": 0.01157207, + "auxiliary_loss_mlp": 0.01037371, + "balance_loss_clip": 1.06218433, + "balance_loss_mlp": 1.02075291, + "epoch": 0.2845032316248309, + "flos": 25810435474560.0, + "grad_norm": 1.8044242340411416, + "language_loss": 0.83660281, + "learning_rate": 3.3581483536960638e-06, + "loss": 0.85854852, + "num_input_tokens_seen": 102100950, + "router_z_loss_clip": 0.95019531, + "router_z_loss_mlp": 0.16601562, + "step": 4732, + "time_per_iteration": 2.5315794944763184 + }, + { + "auxiliary_loss_clip": 0.01149819, + "auxiliary_loss_mlp": 0.01043946, + "balance_loss_clip": 1.0556823, + "balance_loss_mlp": 1.02600455, + "epoch": 0.2845633548774989, + "flos": 19823192979840.0, + "grad_norm": 1.7692203342732515, + "language_loss": 0.79109043, + "learning_rate": 3.357862435944109e-06, + "loss": 0.8130281, + "num_input_tokens_seen": 102119345, + "router_z_loss_clip": 0.94238281, + "router_z_loss_mlp": 0.17932129, + "step": 4733, + "time_per_iteration": 2.473893880844116 + }, + { + "auxiliary_loss_clip": 0.01155096, + "auxiliary_loss_mlp": 0.01043579, + "balance_loss_clip": 1.05865109, + "balance_loss_mlp": 1.02655613, + "epoch": 0.28462347813016686, + "flos": 23182815859200.0, + "grad_norm": 3.0950119994471836, + "language_loss": 0.71497941, + "learning_rate": 3.357576466701875e-06, + "loss": 0.73696613, + "num_input_tokens_seen": 102139050, + "router_z_loss_clip": 0.96435547, + "router_z_loss_mlp": 0.17016602, + "step": 4734, + "time_per_iteration": 2.4806408882141113 + }, + { + "auxiliary_loss_clip": 0.01143242, + "auxiliary_loss_mlp": 0.01033027, + "balance_loss_clip": 1.05215716, + "balance_loss_mlp": 1.01663613, + "epoch": 0.2846836013828348, + "flos": 18660477732480.0, + "grad_norm": 2.5914151213259404, + "language_loss": 0.74559432, + "learning_rate": 3.3572904459802056e-06, + "loss": 0.76735699, + "num_input_tokens_seen": 102157935, + "router_z_loss_clip": 0.91015625, + "router_z_loss_mlp": 0.16394043, + "step": 4735, + "time_per_iteration": 2.4152591228485107 + }, + { + "auxiliary_loss_clip": 0.01155103, + "auxiliary_loss_mlp": 0.01044298, + "balance_loss_clip": 1.06371176, + "balance_loss_mlp": 1.02899122, + "epoch": 0.2847437246355028, + "flos": 14173511523840.0, + "grad_norm": 1.7229861873205834, + "language_loss": 0.79501903, + "learning_rate": 3.357004373789946e-06, + "loss": 0.81701297, + "num_input_tokens_seen": 102175325, + "router_z_loss_clip": 0.91503906, + "router_z_loss_mlp": 0.1529541, + "step": 4736, + "time_per_iteration": 2.4780139923095703 + }, + { + "auxiliary_loss_clip": 0.01149829, + "auxiliary_loss_mlp": 0.01043069, + "balance_loss_clip": 1.05720806, + "balance_loss_mlp": 1.02560496, + "epoch": 0.28480384788817076, + "flos": 29278364837760.0, + "grad_norm": 2.66677009106252, + "language_loss": 0.60183179, + "learning_rate": 3.3567182501419453e-06, + "loss": 0.62376082, + "num_input_tokens_seen": 102196625, + "router_z_loss_clip": 0.92626953, + "router_z_loss_mlp": 0.17468262, + "step": 4737, + "time_per_iteration": 2.4873781204223633 + }, + { + "auxiliary_loss_clip": 0.01144565, + "auxiliary_loss_mlp": 0.01037586, + "balance_loss_clip": 1.05575728, + "balance_loss_mlp": 1.02165997, + "epoch": 0.2848639711408387, + "flos": 22601314581120.0, + "grad_norm": 1.9218054039975336, + "language_loss": 0.86481988, + "learning_rate": 3.356432075047052e-06, + "loss": 0.88664138, + "num_input_tokens_seen": 102214975, + "router_z_loss_clip": 0.88769531, + "router_z_loss_mlp": 0.15917969, + "step": 4738, + "time_per_iteration": 2.4554216861724854 + }, + { + "auxiliary_loss_clip": 0.01158581, + "auxiliary_loss_mlp": 0.01046373, + "balance_loss_clip": 1.06365585, + "balance_loss_mlp": 1.02831292, + "epoch": 0.2849240943935067, + "flos": 17599460866560.0, + "grad_norm": 2.8520186244575365, + "language_loss": 0.89822948, + "learning_rate": 3.356145848516118e-06, + "loss": 0.92027903, + "num_input_tokens_seen": 102231885, + "router_z_loss_clip": 0.94921875, + "router_z_loss_mlp": 0.18078613, + "step": 4739, + "time_per_iteration": 2.412466049194336 + }, + { + "auxiliary_loss_clip": 0.01151718, + "auxiliary_loss_mlp": 0.01042036, + "balance_loss_clip": 1.05958486, + "balance_loss_mlp": 1.02515638, + "epoch": 0.28498421764617465, + "flos": 24862573428480.0, + "grad_norm": 1.5003481077348229, + "language_loss": 0.72141242, + "learning_rate": 3.355859570559998e-06, + "loss": 0.74334997, + "num_input_tokens_seen": 102252725, + "router_z_loss_clip": 0.92041016, + "router_z_loss_mlp": 0.16870117, + "step": 4740, + "time_per_iteration": 2.887754440307617 + }, + { + "auxiliary_loss_clip": 0.01157398, + "auxiliary_loss_mlp": 0.01036678, + "balance_loss_clip": 1.06452465, + "balance_loss_mlp": 1.02109158, + "epoch": 0.2850443408988426, + "flos": 22782555630720.0, + "grad_norm": 1.9912648940105533, + "language_loss": 0.77793294, + "learning_rate": 3.3555732411895477e-06, + "loss": 0.79987371, + "num_input_tokens_seen": 102271730, + "router_z_loss_clip": 0.92871094, + "router_z_loss_mlp": 0.15582275, + "step": 4741, + "time_per_iteration": 2.5389490127563477 + }, + { + "auxiliary_loss_clip": 0.01149381, + "auxiliary_loss_mlp": 0.01042132, + "balance_loss_clip": 1.05229151, + "balance_loss_mlp": 1.02462053, + "epoch": 0.2851044641515106, + "flos": 18844053166080.0, + "grad_norm": 1.817183656042844, + "language_loss": 0.7610516, + "learning_rate": 3.3552868604156235e-06, + "loss": 0.78296673, + "num_input_tokens_seen": 102291325, + "router_z_loss_clip": 0.96972656, + "router_z_loss_mlp": 0.17504883, + "step": 4742, + "time_per_iteration": 2.496976375579834 + }, + { + "auxiliary_loss_clip": 0.01148149, + "auxiliary_loss_mlp": 0.01048227, + "balance_loss_clip": 1.05244756, + "balance_loss_mlp": 1.02953553, + "epoch": 0.28516458740417855, + "flos": 18880502492160.0, + "grad_norm": 3.217588417615996, + "language_loss": 0.57615018, + "learning_rate": 3.355000428249086e-06, + "loss": 0.59811389, + "num_input_tokens_seen": 102309000, + "router_z_loss_clip": 0.95703125, + "router_z_loss_mlp": 0.18701172, + "step": 4743, + "time_per_iteration": 2.458667039871216 + }, + { + "auxiliary_loss_clip": 0.01152768, + "auxiliary_loss_mlp": 0.01047558, + "balance_loss_clip": 1.05751371, + "balance_loss_mlp": 1.03040409, + "epoch": 0.2852247106568465, + "flos": 25299821687040.0, + "grad_norm": 1.6038683225357848, + "language_loss": 0.73791361, + "learning_rate": 3.354713944700797e-06, + "loss": 0.7599169, + "num_input_tokens_seen": 102329240, + "router_z_loss_clip": 0.95263672, + "router_z_loss_mlp": 0.17138672, + "step": 4744, + "time_per_iteration": 2.5547680854797363 + }, + { + "auxiliary_loss_clip": 0.01145475, + "auxiliary_loss_mlp": 0.01045455, + "balance_loss_clip": 1.05300927, + "balance_loss_mlp": 1.02887321, + "epoch": 0.2852848339095145, + "flos": 11655383541120.0, + "grad_norm": 2.6850077245362307, + "language_loss": 0.77565598, + "learning_rate": 3.3544274097816185e-06, + "loss": 0.79756534, + "num_input_tokens_seen": 102344440, + "router_z_loss_clip": 0.92382812, + "router_z_loss_mlp": 0.16589355, + "step": 4745, + "time_per_iteration": 2.3950841426849365 + }, + { + "auxiliary_loss_clip": 0.01145422, + "auxiliary_loss_mlp": 0.01038125, + "balance_loss_clip": 1.05948806, + "balance_loss_mlp": 1.02279496, + "epoch": 0.2853449571621825, + "flos": 12933228856320.0, + "grad_norm": 5.273203139481866, + "language_loss": 0.82916594, + "learning_rate": 3.3541408235024173e-06, + "loss": 0.85100144, + "num_input_tokens_seen": 102360985, + "router_z_loss_clip": 0.85839844, + "router_z_loss_mlp": 0.15319824, + "step": 4746, + "time_per_iteration": 2.4802377223968506 + }, + { + "auxiliary_loss_clip": 0.01156641, + "auxiliary_loss_mlp": 0.01043833, + "balance_loss_clip": 1.05827975, + "balance_loss_mlp": 1.02573705, + "epoch": 0.28540508041485046, + "flos": 20010575255040.0, + "grad_norm": 1.9427803312498033, + "language_loss": 0.79319561, + "learning_rate": 3.3538541858740604e-06, + "loss": 0.81520033, + "num_input_tokens_seen": 102380320, + "router_z_loss_clip": 0.98193359, + "router_z_loss_mlp": 0.1809082, + "step": 4747, + "time_per_iteration": 2.5769879817962646 + }, + { + "auxiliary_loss_clip": 0.01071517, + "auxiliary_loss_mlp": 0.01016578, + "balance_loss_clip": 1.03956199, + "balance_loss_mlp": 1.01440871, + "epoch": 0.28546520366751843, + "flos": 68139349966080.0, + "grad_norm": 0.7767913474718067, + "language_loss": 0.60447437, + "learning_rate": 3.3535674969074173e-06, + "loss": 0.6253553, + "num_input_tokens_seen": 102439140, + "router_z_loss_clip": 0.31982422, + "router_z_loss_mlp": 0.021698, + "step": 4748, + "time_per_iteration": 3.126694679260254 + }, + { + "auxiliary_loss_clip": 0.01147396, + "auxiliary_loss_mlp": 0.0103988, + "balance_loss_clip": 1.05552673, + "balance_loss_mlp": 1.02360785, + "epoch": 0.2855253269201864, + "flos": 13251540205440.0, + "grad_norm": 2.17463146166897, + "language_loss": 0.80226433, + "learning_rate": 3.3532807566133592e-06, + "loss": 0.82413709, + "num_input_tokens_seen": 102450990, + "router_z_loss_clip": 0.91845703, + "router_z_loss_mlp": 0.16271973, + "step": 4749, + "time_per_iteration": 2.3871994018554688 + }, + { + "auxiliary_loss_clip": 0.01145785, + "auxiliary_loss_mlp": 0.01039005, + "balance_loss_clip": 1.05409551, + "balance_loss_mlp": 1.02235198, + "epoch": 0.28558545017285436, + "flos": 28620876337920.0, + "grad_norm": 1.860132295277328, + "language_loss": 0.7068429, + "learning_rate": 3.3529939650027587e-06, + "loss": 0.72869074, + "num_input_tokens_seen": 102471820, + "router_z_loss_clip": 0.91650391, + "router_z_loss_mlp": 0.16650391, + "step": 4750, + "time_per_iteration": 2.5094480514526367 + }, + { + "auxiliary_loss_clip": 0.01149386, + "auxiliary_loss_mlp": 0.01042081, + "balance_loss_clip": 1.05746865, + "balance_loss_mlp": 1.02584505, + "epoch": 0.2856455734255223, + "flos": 34130470752000.0, + "grad_norm": 1.501272214206233, + "language_loss": 0.82224411, + "learning_rate": 3.3527071220864917e-06, + "loss": 0.84415877, + "num_input_tokens_seen": 102492625, + "router_z_loss_clip": 0.91894531, + "router_z_loss_mlp": 0.16229248, + "step": 4751, + "time_per_iteration": 2.564694404602051 + }, + { + "auxiliary_loss_clip": 0.0114506, + "auxiliary_loss_mlp": 0.01044761, + "balance_loss_clip": 1.05372608, + "balance_loss_mlp": 1.02666545, + "epoch": 0.2857056966781903, + "flos": 39786149779200.0, + "grad_norm": 1.712886981140037, + "language_loss": 0.80171168, + "learning_rate": 3.3524202278754353e-06, + "loss": 0.82360989, + "num_input_tokens_seen": 102514145, + "router_z_loss_clip": 0.91503906, + "router_z_loss_mlp": 0.18103027, + "step": 4752, + "time_per_iteration": 2.610971212387085 + }, + { + "auxiliary_loss_clip": 0.01152855, + "auxiliary_loss_mlp": 0.01052155, + "balance_loss_clip": 1.06007361, + "balance_loss_mlp": 1.03388059, + "epoch": 0.28576581993085826, + "flos": 21872292145920.0, + "grad_norm": 1.7549962626937137, + "language_loss": 0.78431737, + "learning_rate": 3.3521332823804676e-06, + "loss": 0.80636752, + "num_input_tokens_seen": 102532365, + "router_z_loss_clip": 0.92871094, + "router_z_loss_mlp": 0.18261719, + "step": 4753, + "time_per_iteration": 2.7070910930633545 + }, + { + "auxiliary_loss_clip": 0.01148783, + "auxiliary_loss_mlp": 0.01047607, + "balance_loss_clip": 1.05276155, + "balance_loss_mlp": 1.02767563, + "epoch": 0.2858259431835262, + "flos": 19091656592640.0, + "grad_norm": 2.2351956267695474, + "language_loss": 0.9006474, + "learning_rate": 3.3518462856124704e-06, + "loss": 0.92261124, + "num_input_tokens_seen": 102548425, + "router_z_loss_clip": 0.95996094, + "router_z_loss_mlp": 0.19946289, + "step": 4754, + "time_per_iteration": 2.6005561351776123 + }, + { + "auxiliary_loss_clip": 0.01146377, + "auxiliary_loss_mlp": 0.01050664, + "balance_loss_clip": 1.05242586, + "balance_loss_mlp": 1.0343802, + "epoch": 0.2858860664361942, + "flos": 20334309557760.0, + "grad_norm": 2.694616530750597, + "language_loss": 0.82521731, + "learning_rate": 3.3515592375823267e-06, + "loss": 0.84718776, + "num_input_tokens_seen": 102566370, + "router_z_loss_clip": 0.94042969, + "router_z_loss_mlp": 0.16259766, + "step": 4755, + "time_per_iteration": 2.433850049972534 + }, + { + "auxiliary_loss_clip": 0.01159073, + "auxiliary_loss_mlp": 0.01039675, + "balance_loss_clip": 1.06220496, + "balance_loss_mlp": 1.02292609, + "epoch": 0.28594618968886215, + "flos": 24461738582400.0, + "grad_norm": 1.6671788519651587, + "language_loss": 0.83648515, + "learning_rate": 3.351272138300922e-06, + "loss": 0.85847259, + "num_input_tokens_seen": 102588715, + "router_z_loss_clip": 0.96972656, + "router_z_loss_mlp": 0.16748047, + "step": 4756, + "time_per_iteration": 2.5711236000061035 + }, + { + "auxiliary_loss_clip": 0.0108344, + "auxiliary_loss_mlp": 0.01015845, + "balance_loss_clip": 1.0529815, + "balance_loss_mlp": 1.01287055, + "epoch": 0.2860063129415301, + "flos": 71652850709760.0, + "grad_norm": 0.87033169990569, + "language_loss": 0.61043119, + "learning_rate": 3.350984987779142e-06, + "loss": 0.63142401, + "num_input_tokens_seen": 102656715, + "router_z_loss_clip": 0.30517578, + "router_z_loss_mlp": 0.0296936, + "step": 4757, + "time_per_iteration": 3.201768636703491 + }, + { + "auxiliary_loss_clip": 0.01147458, + "auxiliary_loss_mlp": 0.01060715, + "balance_loss_clip": 1.05483925, + "balance_loss_mlp": 1.04229748, + "epoch": 0.2860664361941981, + "flos": 20558679863040.0, + "grad_norm": 1.984498345095006, + "language_loss": 0.65807903, + "learning_rate": 3.3506977860278756e-06, + "loss": 0.6801607, + "num_input_tokens_seen": 102676545, + "router_z_loss_clip": 0.92675781, + "router_z_loss_mlp": 0.1842041, + "step": 4758, + "time_per_iteration": 3.87138295173645 + }, + { + "auxiliary_loss_clip": 0.01158463, + "auxiliary_loss_mlp": 0.01040044, + "balance_loss_clip": 1.06103587, + "balance_loss_mlp": 1.02288997, + "epoch": 0.2861265594468661, + "flos": 35996389534080.0, + "grad_norm": 1.4263426911371844, + "language_loss": 0.63009787, + "learning_rate": 3.3504105330580143e-06, + "loss": 0.65208298, + "num_input_tokens_seen": 102702875, + "router_z_loss_clip": 0.97509766, + "router_z_loss_mlp": 0.17150879, + "step": 4759, + "time_per_iteration": 2.651353597640991 + }, + { + "auxiliary_loss_clip": 0.01148904, + "auxiliary_loss_mlp": 0.01043924, + "balance_loss_clip": 1.05722582, + "balance_loss_mlp": 1.02738953, + "epoch": 0.28618668269953407, + "flos": 20047419630720.0, + "grad_norm": 1.7119021114212296, + "language_loss": 0.74569416, + "learning_rate": 3.3501232288804496e-06, + "loss": 0.76762247, + "num_input_tokens_seen": 102723160, + "router_z_loss_clip": 0.91650391, + "router_z_loss_mlp": 0.16540527, + "step": 4760, + "time_per_iteration": 2.535264253616333 + }, + { + "auxiliary_loss_clip": 0.0115598, + "auxiliary_loss_mlp": 0.01037891, + "balance_loss_clip": 1.06434512, + "balance_loss_mlp": 1.02232242, + "epoch": 0.28624680595220203, + "flos": 24971849579520.0, + "grad_norm": 2.174856769406307, + "language_loss": 0.7283355, + "learning_rate": 3.3498358735060773e-06, + "loss": 0.75027418, + "num_input_tokens_seen": 102743855, + "router_z_loss_clip": 0.91650391, + "router_z_loss_mlp": 0.15563965, + "step": 4761, + "time_per_iteration": 2.6360504627227783 + }, + { + "auxiliary_loss_clip": 0.0115619, + "auxiliary_loss_mlp": 0.01045425, + "balance_loss_clip": 1.06111109, + "balance_loss_mlp": 1.02886677, + "epoch": 0.28630692920487, + "flos": 22492253911680.0, + "grad_norm": 2.095186301221263, + "language_loss": 0.75041282, + "learning_rate": 3.349548466945793e-06, + "loss": 0.77242899, + "num_input_tokens_seen": 102761370, + "router_z_loss_clip": 0.95117188, + "router_z_loss_mlp": 0.16564941, + "step": 4762, + "time_per_iteration": 2.5014047622680664 + }, + { + "auxiliary_loss_clip": 0.01150503, + "auxiliary_loss_mlp": 0.01045732, + "balance_loss_clip": 1.05965304, + "balance_loss_mlp": 1.02938807, + "epoch": 0.28636705245753796, + "flos": 21249888255360.0, + "grad_norm": 1.514210849031298, + "language_loss": 0.76376933, + "learning_rate": 3.349261009210496e-06, + "loss": 0.78573167, + "num_input_tokens_seen": 102780885, + "router_z_loss_clip": 0.90869141, + "router_z_loss_mlp": 0.16345215, + "step": 4763, + "time_per_iteration": 2.585995674133301 + }, + { + "auxiliary_loss_clip": 0.01146561, + "auxiliary_loss_mlp": 0.01038533, + "balance_loss_clip": 1.05387628, + "balance_loss_mlp": 1.02141464, + "epoch": 0.28642717571020593, + "flos": 24095772864000.0, + "grad_norm": 2.1373856771832522, + "language_loss": 0.77197969, + "learning_rate": 3.348973500311086e-06, + "loss": 0.79383063, + "num_input_tokens_seen": 102801000, + "router_z_loss_clip": 0.92773438, + "router_z_loss_mlp": 0.17114258, + "step": 4764, + "time_per_iteration": 2.5228829383850098 + }, + { + "auxiliary_loss_clip": 0.01148344, + "auxiliary_loss_mlp": 0.01060132, + "balance_loss_clip": 1.05344605, + "balance_loss_mlp": 1.0395205, + "epoch": 0.2864872989628739, + "flos": 22601386408320.0, + "grad_norm": 1.7720661812930554, + "language_loss": 0.71181107, + "learning_rate": 3.348685940258466e-06, + "loss": 0.73389584, + "num_input_tokens_seen": 102820230, + "router_z_loss_clip": 0.94873047, + "router_z_loss_mlp": 0.20605469, + "step": 4765, + "time_per_iteration": 2.5792412757873535 + }, + { + "auxiliary_loss_clip": 0.01154502, + "auxiliary_loss_mlp": 0.01041469, + "balance_loss_clip": 1.06290817, + "balance_loss_mlp": 1.02513182, + "epoch": 0.28654742221554186, + "flos": 32745073138560.0, + "grad_norm": 1.460436502922682, + "language_loss": 0.76078093, + "learning_rate": 3.3483983290635395e-06, + "loss": 0.78274059, + "num_input_tokens_seen": 102842670, + "router_z_loss_clip": 0.91503906, + "router_z_loss_mlp": 0.16326904, + "step": 4766, + "time_per_iteration": 3.100102663040161 + }, + { + "auxiliary_loss_clip": 0.01149642, + "auxiliary_loss_mlp": 0.01040736, + "balance_loss_clip": 1.05938017, + "balance_loss_mlp": 1.02409458, + "epoch": 0.2866075454682098, + "flos": 26981626331520.0, + "grad_norm": 1.523601765823936, + "language_loss": 0.77803826, + "learning_rate": 3.348110666737214e-06, + "loss": 0.79994196, + "num_input_tokens_seen": 102864480, + "router_z_loss_clip": 0.90234375, + "router_z_loss_mlp": 0.16650391, + "step": 4767, + "time_per_iteration": 5.489803314208984 + }, + { + "auxiliary_loss_clip": 0.01148924, + "auxiliary_loss_mlp": 0.01043496, + "balance_loss_clip": 1.05634451, + "balance_loss_mlp": 1.02678251, + "epoch": 0.2866676687208778, + "flos": 23253847004160.0, + "grad_norm": 2.37774647393145, + "language_loss": 0.6500904, + "learning_rate": 3.3478229532903956e-06, + "loss": 0.67201459, + "num_input_tokens_seen": 102883740, + "router_z_loss_clip": 0.92578125, + "router_z_loss_mlp": 0.16711426, + "step": 4768, + "time_per_iteration": 2.497546672821045 + }, + { + "auxiliary_loss_clip": 0.01149517, + "auxiliary_loss_mlp": 0.01049097, + "balance_loss_clip": 1.05333889, + "balance_loss_mlp": 1.03213358, + "epoch": 0.28672779197354575, + "flos": 21579727870080.0, + "grad_norm": 1.4582495980196901, + "language_loss": 0.70672256, + "learning_rate": 3.3475351887339967e-06, + "loss": 0.72870868, + "num_input_tokens_seen": 102902945, + "router_z_loss_clip": 0.96142578, + "router_z_loss_mlp": 0.16967773, + "step": 4769, + "time_per_iteration": 2.517855644226074 + }, + { + "auxiliary_loss_clip": 0.01142251, + "auxiliary_loss_mlp": 0.01035579, + "balance_loss_clip": 1.04974008, + "balance_loss_mlp": 1.02031422, + "epoch": 0.2867879152262137, + "flos": 19865568049920.0, + "grad_norm": 1.8294844488132755, + "language_loss": 0.75260949, + "learning_rate": 3.3472473730789288e-06, + "loss": 0.77438778, + "num_input_tokens_seen": 102922405, + "router_z_loss_clip": 0.92431641, + "router_z_loss_mlp": 0.15252686, + "step": 4770, + "time_per_iteration": 2.5979318618774414 + }, + { + "auxiliary_loss_clip": 0.01145702, + "auxiliary_loss_mlp": 0.01043797, + "balance_loss_clip": 1.05257964, + "balance_loss_mlp": 1.02782261, + "epoch": 0.2868480384788817, + "flos": 28213325648640.0, + "grad_norm": 2.1936506310999726, + "language_loss": 0.67771232, + "learning_rate": 3.3469595063361045e-06, + "loss": 0.69960725, + "num_input_tokens_seen": 102938980, + "router_z_loss_clip": 0.93310547, + "router_z_loss_mlp": 0.15966797, + "step": 4771, + "time_per_iteration": 4.081476449966431 + }, + { + "auxiliary_loss_clip": 0.01087724, + "auxiliary_loss_mlp": 0.010176, + "balance_loss_clip": 1.05620074, + "balance_loss_mlp": 1.01588678, + "epoch": 0.2869081617315497, + "flos": 65424286690560.0, + "grad_norm": 0.7848256379796362, + "language_loss": 0.56859529, + "learning_rate": 3.3466715885164414e-06, + "loss": 0.58964854, + "num_input_tokens_seen": 103000405, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.01712036, + "step": 4772, + "time_per_iteration": 3.073179006576538 + }, + { + "auxiliary_loss_clip": 0.01151131, + "auxiliary_loss_mlp": 0.01047597, + "balance_loss_clip": 1.05575323, + "balance_loss_mlp": 1.02954888, + "epoch": 0.28696828498421767, + "flos": 18660729127680.0, + "grad_norm": 3.3303774252836362, + "language_loss": 0.83334982, + "learning_rate": 3.346383619630856e-06, + "loss": 0.85533714, + "num_input_tokens_seen": 103017970, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.18054199, + "step": 4773, + "time_per_iteration": 2.5375301837921143 + }, + { + "auxiliary_loss_clip": 0.01149865, + "auxiliary_loss_mlp": 0.01040405, + "balance_loss_clip": 1.05374658, + "balance_loss_mlp": 1.02284503, + "epoch": 0.28702840823688563, + "flos": 23659745667840.0, + "grad_norm": 2.5063614633182247, + "language_loss": 0.78340948, + "learning_rate": 3.34609559969027e-06, + "loss": 0.80531216, + "num_input_tokens_seen": 103036385, + "router_z_loss_clip": 0.9609375, + "router_z_loss_mlp": 0.17565918, + "step": 4774, + "time_per_iteration": 2.5942835807800293 + }, + { + "auxiliary_loss_clip": 0.01158797, + "auxiliary_loss_mlp": 0.01042875, + "balance_loss_clip": 1.06248581, + "balance_loss_mlp": 1.02525568, + "epoch": 0.2870885314895536, + "flos": 13804744544640.0, + "grad_norm": 2.0274852920788353, + "language_loss": 0.73625737, + "learning_rate": 3.3458075287056034e-06, + "loss": 0.75827414, + "num_input_tokens_seen": 103052170, + "router_z_loss_clip": 0.96240234, + "router_z_loss_mlp": 0.17602539, + "step": 4775, + "time_per_iteration": 2.5130820274353027 + }, + { + "auxiliary_loss_clip": 0.01150106, + "auxiliary_loss_mlp": 0.010416, + "balance_loss_clip": 1.05505097, + "balance_loss_mlp": 1.02569747, + "epoch": 0.28714865474222157, + "flos": 17786771314560.0, + "grad_norm": 2.041020674476741, + "language_loss": 0.88338268, + "learning_rate": 3.34551940668778e-06, + "loss": 0.90529972, + "num_input_tokens_seen": 103070510, + "router_z_loss_clip": 0.95068359, + "router_z_loss_mlp": 0.15905762, + "step": 4776, + "time_per_iteration": 2.671571731567383 + }, + { + "auxiliary_loss_clip": 0.01149566, + "auxiliary_loss_mlp": 0.01036792, + "balance_loss_clip": 1.05550539, + "balance_loss_mlp": 1.02104402, + "epoch": 0.28720877799488953, + "flos": 15997486199040.0, + "grad_norm": 2.027564976653952, + "language_loss": 0.73927319, + "learning_rate": 3.345231233647726e-06, + "loss": 0.76113677, + "num_input_tokens_seen": 103089590, + "router_z_loss_clip": 0.93945312, + "router_z_loss_mlp": 0.15759277, + "step": 4777, + "time_per_iteration": 2.9120826721191406 + }, + { + "auxiliary_loss_clip": 0.01155655, + "auxiliary_loss_mlp": 0.01051721, + "balance_loss_clip": 1.0591414, + "balance_loss_mlp": 1.03249264, + "epoch": 0.2872689012475575, + "flos": 20923137210240.0, + "grad_norm": 2.6385890095639017, + "language_loss": 0.79890186, + "learning_rate": 3.3449430095963696e-06, + "loss": 0.8209756, + "num_input_tokens_seen": 103109080, + "router_z_loss_clip": 0.96582031, + "router_z_loss_mlp": 0.19226074, + "step": 4778, + "time_per_iteration": 2.8509681224823 + }, + { + "auxiliary_loss_clip": 0.01148674, + "auxiliary_loss_mlp": 0.01049737, + "balance_loss_clip": 1.0573566, + "balance_loss_mlp": 1.03221309, + "epoch": 0.28732902450022546, + "flos": 21325121291520.0, + "grad_norm": 1.5619083651458694, + "language_loss": 0.73600191, + "learning_rate": 3.3446547345446386e-06, + "loss": 0.75798601, + "num_input_tokens_seen": 103127755, + "router_z_loss_clip": 0.91308594, + "router_z_loss_mlp": 0.1751709, + "step": 4779, + "time_per_iteration": 2.59550142288208 + }, + { + "auxiliary_loss_clip": 0.01155595, + "auxiliary_loss_mlp": 0.0103815, + "balance_loss_clip": 1.05998445, + "balance_loss_mlp": 1.02068615, + "epoch": 0.2873891477528934, + "flos": 20850382212480.0, + "grad_norm": 1.5832929277749725, + "language_loss": 0.76247573, + "learning_rate": 3.3443664085034656e-06, + "loss": 0.78441322, + "num_input_tokens_seen": 103147035, + "router_z_loss_clip": 0.95654297, + "router_z_loss_mlp": 0.17468262, + "step": 4780, + "time_per_iteration": 2.860764265060425 + }, + { + "auxiliary_loss_clip": 0.01146262, + "auxiliary_loss_mlp": 0.01041177, + "balance_loss_clip": 1.05513489, + "balance_loss_mlp": 1.025805, + "epoch": 0.2874492710055614, + "flos": 17420051410560.0, + "grad_norm": 1.864104665688712, + "language_loss": 0.80821443, + "learning_rate": 3.344078031483784e-06, + "loss": 0.83008879, + "num_input_tokens_seen": 103165410, + "router_z_loss_clip": 0.91064453, + "router_z_loss_mlp": 0.15356445, + "step": 4781, + "time_per_iteration": 2.840653896331787 + }, + { + "auxiliary_loss_clip": 0.01151844, + "auxiliary_loss_mlp": 0.01044229, + "balance_loss_clip": 1.05502069, + "balance_loss_mlp": 1.02578771, + "epoch": 0.28750939425822936, + "flos": 13406818700160.0, + "grad_norm": 2.274740446636763, + "language_loss": 0.86260277, + "learning_rate": 3.3437896034965283e-06, + "loss": 0.88456351, + "num_input_tokens_seen": 103183710, + "router_z_loss_clip": 0.96777344, + "router_z_loss_mlp": 0.18444824, + "step": 4782, + "time_per_iteration": 2.7759406566619873 + }, + { + "auxiliary_loss_clip": 0.01152271, + "auxiliary_loss_mlp": 0.01046949, + "balance_loss_clip": 1.05840039, + "balance_loss_mlp": 1.0290792, + "epoch": 0.2875695175108973, + "flos": 21870029589120.0, + "grad_norm": 2.611580289753306, + "language_loss": 0.71233046, + "learning_rate": 3.3435011245526357e-06, + "loss": 0.73432261, + "num_input_tokens_seen": 103203790, + "router_z_loss_clip": 0.93798828, + "router_z_loss_mlp": 0.17883301, + "step": 4783, + "time_per_iteration": 2.504380464553833 + }, + { + "auxiliary_loss_clip": 0.01147171, + "auxiliary_loss_mlp": 0.01055532, + "balance_loss_clip": 1.0557133, + "balance_loss_mlp": 1.03688788, + "epoch": 0.2876296407635653, + "flos": 26245457089920.0, + "grad_norm": 1.6404143712955053, + "language_loss": 0.76804185, + "learning_rate": 3.343212594663047e-06, + "loss": 0.79006886, + "num_input_tokens_seen": 103223925, + "router_z_loss_clip": 0.91357422, + "router_z_loss_mlp": 0.18640137, + "step": 4784, + "time_per_iteration": 2.5351040363311768 + }, + { + "auxiliary_loss_clip": 0.01147079, + "auxiliary_loss_mlp": 0.01044871, + "balance_loss_clip": 1.05607605, + "balance_loss_mlp": 1.02758622, + "epoch": 0.28768976401623325, + "flos": 25373654092800.0, + "grad_norm": 1.7387899455863531, + "language_loss": 0.75753474, + "learning_rate": 3.3429240138387015e-06, + "loss": 0.77945429, + "num_input_tokens_seen": 103244760, + "router_z_loss_clip": 0.91015625, + "router_z_loss_mlp": 0.17285156, + "step": 4785, + "time_per_iteration": 2.526028871536255 + }, + { + "auxiliary_loss_clip": 0.01152205, + "auxiliary_loss_mlp": 0.01050468, + "balance_loss_clip": 1.0573622, + "balance_loss_mlp": 1.03311157, + "epoch": 0.28774988726890127, + "flos": 30664372982400.0, + "grad_norm": 2.1439321464847136, + "language_loss": 0.83072758, + "learning_rate": 3.3426353820905425e-06, + "loss": 0.85275424, + "num_input_tokens_seen": 103261995, + "router_z_loss_clip": 0.94775391, + "router_z_loss_mlp": 0.17358398, + "step": 4786, + "time_per_iteration": 2.552471399307251 + }, + { + "auxiliary_loss_clip": 0.01157836, + "auxiliary_loss_mlp": 0.010393, + "balance_loss_clip": 1.06221879, + "balance_loss_mlp": 1.02387393, + "epoch": 0.28781001052156924, + "flos": 20595452411520.0, + "grad_norm": 1.9872105314544735, + "language_loss": 0.79952347, + "learning_rate": 3.342346699429516e-06, + "loss": 0.82149482, + "num_input_tokens_seen": 103279780, + "router_z_loss_clip": 0.95507812, + "router_z_loss_mlp": 0.1541748, + "step": 4787, + "time_per_iteration": 2.4643261432647705 + }, + { + "auxiliary_loss_clip": 0.0114945, + "auxiliary_loss_mlp": 0.01039725, + "balance_loss_clip": 1.05539036, + "balance_loss_mlp": 1.02333391, + "epoch": 0.2878701337742372, + "flos": 26542330997760.0, + "grad_norm": 1.7693683112214602, + "language_loss": 0.83485174, + "learning_rate": 3.3420579658665677e-06, + "loss": 0.85674345, + "num_input_tokens_seen": 103300580, + "router_z_loss_clip": 0.94091797, + "router_z_loss_mlp": 0.16369629, + "step": 4788, + "time_per_iteration": 2.609036445617676 + }, + { + "auxiliary_loss_clip": 0.01157012, + "auxiliary_loss_mlp": 0.01044101, + "balance_loss_clip": 1.06294775, + "balance_loss_mlp": 1.02737546, + "epoch": 0.28793025702690517, + "flos": 28146855530880.0, + "grad_norm": 2.1355588732100004, + "language_loss": 0.73466319, + "learning_rate": 3.3417691814126468e-06, + "loss": 0.75667429, + "num_input_tokens_seen": 103320430, + "router_z_loss_clip": 0.93896484, + "router_z_loss_mlp": 0.16711426, + "step": 4789, + "time_per_iteration": 2.5873260498046875 + }, + { + "auxiliary_loss_clip": 0.01144754, + "auxiliary_loss_mlp": 0.01046895, + "balance_loss_clip": 1.05499959, + "balance_loss_mlp": 1.03089726, + "epoch": 0.28799038027957313, + "flos": 23805471144960.0, + "grad_norm": 1.7778860621201027, + "language_loss": 0.83586866, + "learning_rate": 3.341480346078704e-06, + "loss": 0.85778517, + "num_input_tokens_seen": 103337695, + "router_z_loss_clip": 0.89697266, + "router_z_loss_mlp": 0.15979004, + "step": 4790, + "time_per_iteration": 2.496371030807495 + }, + { + "auxiliary_loss_clip": 0.01153418, + "auxiliary_loss_mlp": 0.01055054, + "balance_loss_clip": 1.05938339, + "balance_loss_mlp": 1.03571832, + "epoch": 0.2880505035322411, + "flos": 22344122223360.0, + "grad_norm": 1.6663547668200638, + "language_loss": 0.78138208, + "learning_rate": 3.3411914598756922e-06, + "loss": 0.8034668, + "num_input_tokens_seen": 103357010, + "router_z_loss_clip": 0.93994141, + "router_z_loss_mlp": 0.19335938, + "step": 4791, + "time_per_iteration": 2.490234375 + }, + { + "auxiliary_loss_clip": 0.01152121, + "auxiliary_loss_mlp": 0.01042671, + "balance_loss_clip": 1.05481851, + "balance_loss_mlp": 1.02651846, + "epoch": 0.28811062678490906, + "flos": 18004246208640.0, + "grad_norm": 1.9324193890701586, + "language_loss": 0.70277727, + "learning_rate": 3.3409025228145654e-06, + "loss": 0.72472513, + "num_input_tokens_seen": 103375600, + "router_z_loss_clip": 0.97265625, + "router_z_loss_mlp": 0.16131592, + "step": 4792, + "time_per_iteration": 2.4314754009246826 + }, + { + "auxiliary_loss_clip": 0.01155977, + "auxiliary_loss_mlp": 0.01041302, + "balance_loss_clip": 1.06064343, + "balance_loss_mlp": 1.02528059, + "epoch": 0.28817075003757703, + "flos": 22090880361600.0, + "grad_norm": 1.6668815007113917, + "language_loss": 0.79465711, + "learning_rate": 3.3406135349062812e-06, + "loss": 0.81662989, + "num_input_tokens_seen": 103395225, + "router_z_loss_clip": 0.95263672, + "router_z_loss_mlp": 0.16015625, + "step": 4793, + "time_per_iteration": 2.492180824279785 + }, + { + "auxiliary_loss_clip": 0.0114807, + "auxiliary_loss_mlp": 0.0103691, + "balance_loss_clip": 1.05907905, + "balance_loss_mlp": 1.02094769, + "epoch": 0.288230873290245, + "flos": 41683130847360.0, + "grad_norm": 1.8539712468233849, + "language_loss": 0.78118283, + "learning_rate": 3.340324496161797e-06, + "loss": 0.80303264, + "num_input_tokens_seen": 103417245, + "router_z_loss_clip": 0.89013672, + "router_z_loss_mlp": 0.1595459, + "step": 4794, + "time_per_iteration": 2.6962194442749023 + }, + { + "auxiliary_loss_clip": 0.01162742, + "auxiliary_loss_mlp": 0.01041802, + "balance_loss_clip": 1.06746125, + "balance_loss_mlp": 1.02457666, + "epoch": 0.28829099654291296, + "flos": 18624423456000.0, + "grad_norm": 2.2111228084663184, + "language_loss": 0.8272472, + "learning_rate": 3.340035406592074e-06, + "loss": 0.84929264, + "num_input_tokens_seen": 103435500, + "router_z_loss_clip": 0.95166016, + "router_z_loss_mlp": 0.17248535, + "step": 4795, + "time_per_iteration": 2.522261381149292 + }, + { + "auxiliary_loss_clip": 0.01146382, + "auxiliary_loss_mlp": 0.01041939, + "balance_loss_clip": 1.05876112, + "balance_loss_mlp": 1.02667475, + "epoch": 0.2883511197955809, + "flos": 24674832017280.0, + "grad_norm": 1.960003341362746, + "language_loss": 0.74783319, + "learning_rate": 3.339746266208074e-06, + "loss": 0.76971638, + "num_input_tokens_seen": 103451040, + "router_z_loss_clip": 0.87548828, + "router_z_loss_mlp": 0.15258789, + "step": 4796, + "time_per_iteration": 2.6122334003448486 + }, + { + "auxiliary_loss_clip": 0.01158658, + "auxiliary_loss_mlp": 0.01043108, + "balance_loss_clip": 1.06140685, + "balance_loss_mlp": 1.02469039, + "epoch": 0.2884112430482489, + "flos": 23112143850240.0, + "grad_norm": 2.126540756032974, + "language_loss": 0.72775018, + "learning_rate": 3.3394570750207614e-06, + "loss": 0.74976784, + "num_input_tokens_seen": 103471330, + "router_z_loss_clip": 0.97314453, + "router_z_loss_mlp": 0.1842041, + "step": 4797, + "time_per_iteration": 2.4997072219848633 + }, + { + "auxiliary_loss_clip": 0.01156096, + "auxiliary_loss_mlp": 0.01036145, + "balance_loss_clip": 1.06093121, + "balance_loss_mlp": 1.01947916, + "epoch": 0.28847136630091685, + "flos": 16873347432960.0, + "grad_norm": 4.048950293168508, + "language_loss": 0.74966377, + "learning_rate": 3.3391678330411017e-06, + "loss": 0.77158618, + "num_input_tokens_seen": 103488060, + "router_z_loss_clip": 0.95117188, + "router_z_loss_mlp": 0.16662598, + "step": 4798, + "time_per_iteration": 2.4141533374786377 + }, + { + "auxiliary_loss_clip": 0.01161268, + "auxiliary_loss_mlp": 0.01043309, + "balance_loss_clip": 1.06443667, + "balance_loss_mlp": 1.02507019, + "epoch": 0.2885314895535849, + "flos": 25657527277440.0, + "grad_norm": 2.7340308965460918, + "language_loss": 0.65087533, + "learning_rate": 3.3388785402800642e-06, + "loss": 0.67292112, + "num_input_tokens_seen": 103503600, + "router_z_loss_clip": 0.96728516, + "router_z_loss_mlp": 0.18225098, + "step": 4799, + "time_per_iteration": 2.5305724143981934 + }, + { + "auxiliary_loss_clip": 0.01160191, + "auxiliary_loss_mlp": 0.01047312, + "balance_loss_clip": 1.06563246, + "balance_loss_mlp": 1.03072953, + "epoch": 0.28859161280625284, + "flos": 21107251347840.0, + "grad_norm": 1.7680378011588356, + "language_loss": 0.82594955, + "learning_rate": 3.3385891967486178e-06, + "loss": 0.84802461, + "num_input_tokens_seen": 103524195, + "router_z_loss_clip": 0.9453125, + "router_z_loss_mlp": 0.16577148, + "step": 4800, + "time_per_iteration": 2.6292927265167236 + }, + { + "auxiliary_loss_clip": 0.01144065, + "auxiliary_loss_mlp": 0.01036638, + "balance_loss_clip": 1.05628729, + "balance_loss_mlp": 1.02130771, + "epoch": 0.2886517360589208, + "flos": 26469540086400.0, + "grad_norm": 2.874133044467484, + "language_loss": 0.90889692, + "learning_rate": 3.3382998024577347e-06, + "loss": 0.930704, + "num_input_tokens_seen": 103545235, + "router_z_loss_clip": 0.87744141, + "router_z_loss_mlp": 0.15319824, + "step": 4801, + "time_per_iteration": 3.916444778442383 + }, + { + "auxiliary_loss_clip": 0.01149103, + "auxiliary_loss_mlp": 0.01034751, + "balance_loss_clip": 1.05708003, + "balance_loss_mlp": 1.01819265, + "epoch": 0.28871185931158877, + "flos": 25265275781760.0, + "grad_norm": 2.2229817341008364, + "language_loss": 0.73949105, + "learning_rate": 3.33801035741839e-06, + "loss": 0.76132953, + "num_input_tokens_seen": 103563305, + "router_z_loss_clip": 0.91943359, + "router_z_loss_mlp": 0.16540527, + "step": 4802, + "time_per_iteration": 2.471604585647583 + }, + { + "auxiliary_loss_clip": 0.01086435, + "auxiliary_loss_mlp": 0.01009836, + "balance_loss_clip": 1.05535293, + "balance_loss_mlp": 1.00817025, + "epoch": 0.28877198256425674, + "flos": 66665431284480.0, + "grad_norm": 0.7862881713172688, + "language_loss": 0.62958872, + "learning_rate": 3.337720861641558e-06, + "loss": 0.65055144, + "num_input_tokens_seen": 103625025, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.01664734, + "step": 4803, + "time_per_iteration": 3.0521185398101807 + }, + { + "auxiliary_loss_clip": 0.01158214, + "auxiliary_loss_mlp": 0.01045182, + "balance_loss_clip": 1.06718349, + "balance_loss_mlp": 1.02879071, + "epoch": 0.2888321058169247, + "flos": 20303031790080.0, + "grad_norm": 1.8022320400350211, + "language_loss": 0.70684075, + "learning_rate": 3.3374313151382165e-06, + "loss": 0.72887468, + "num_input_tokens_seen": 103644235, + "router_z_loss_clip": 0.90869141, + "router_z_loss_mlp": 0.16381836, + "step": 4804, + "time_per_iteration": 2.4559407234191895 + }, + { + "auxiliary_loss_clip": 0.01158856, + "auxiliary_loss_mlp": 0.01039949, + "balance_loss_clip": 1.06301963, + "balance_loss_mlp": 1.02221012, + "epoch": 0.28889222906959267, + "flos": 25516721963520.0, + "grad_norm": 1.9652721587830706, + "language_loss": 0.678339, + "learning_rate": 3.337141717919346e-06, + "loss": 0.70032704, + "num_input_tokens_seen": 103664700, + "router_z_loss_clip": 0.95800781, + "router_z_loss_mlp": 0.17749023, + "step": 4805, + "time_per_iteration": 2.511138439178467 + }, + { + "auxiliary_loss_clip": 0.01158928, + "auxiliary_loss_mlp": 0.01043459, + "balance_loss_clip": 1.06543279, + "balance_loss_mlp": 1.02746069, + "epoch": 0.28895235232226063, + "flos": 32671312560000.0, + "grad_norm": 1.5327694414976147, + "language_loss": 0.69639313, + "learning_rate": 3.3368520699959272e-06, + "loss": 0.71841693, + "num_input_tokens_seen": 103686595, + "router_z_loss_clip": 0.93457031, + "router_z_loss_mlp": 0.16003418, + "step": 4806, + "time_per_iteration": 2.590449333190918 + }, + { + "auxiliary_loss_clip": 0.01144115, + "auxiliary_loss_mlp": 0.01044442, + "balance_loss_clip": 1.05458724, + "balance_loss_mlp": 1.02793169, + "epoch": 0.2890124755749286, + "flos": 29714679342720.0, + "grad_norm": 1.5525981146230832, + "language_loss": 0.71197951, + "learning_rate": 3.3365623713789443e-06, + "loss": 0.73386508, + "num_input_tokens_seen": 103707525, + "router_z_loss_clip": 0.89550781, + "router_z_loss_mlp": 0.16516113, + "step": 4807, + "time_per_iteration": 2.5381710529327393 + }, + { + "auxiliary_loss_clip": 0.01159912, + "auxiliary_loss_mlp": 0.0104057, + "balance_loss_clip": 1.06615257, + "balance_loss_mlp": 1.02329648, + "epoch": 0.28907259882759656, + "flos": 22674464628480.0, + "grad_norm": 2.125188521497987, + "language_loss": 0.81640625, + "learning_rate": 3.336272622079382e-06, + "loss": 0.83841109, + "num_input_tokens_seen": 103727905, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.17260742, + "step": 4808, + "time_per_iteration": 2.698655128479004 + }, + { + "auxiliary_loss_clip": 0.01149614, + "auxiliary_loss_mlp": 0.01044592, + "balance_loss_clip": 1.06089246, + "balance_loss_mlp": 1.02840304, + "epoch": 0.2891327220802645, + "flos": 22566050403840.0, + "grad_norm": 1.8963839199529178, + "language_loss": 0.78477114, + "learning_rate": 3.3359828221082276e-06, + "loss": 0.80671322, + "num_input_tokens_seen": 103748335, + "router_z_loss_clip": 0.88769531, + "router_z_loss_mlp": 0.16186523, + "step": 4809, + "time_per_iteration": 2.4995152950286865 + }, + { + "auxiliary_loss_clip": 0.01152516, + "auxiliary_loss_mlp": 0.01055499, + "balance_loss_clip": 1.05392504, + "balance_loss_mlp": 1.03702164, + "epoch": 0.2891928453329325, + "flos": 21652806090240.0, + "grad_norm": 2.6185847699664215, + "language_loss": 0.78729647, + "learning_rate": 3.3356929714764714e-06, + "loss": 0.80937666, + "num_input_tokens_seen": 103767020, + "router_z_loss_clip": 0.98583984, + "router_z_loss_mlp": 0.18469238, + "step": 4810, + "time_per_iteration": 3.813676595687866 + }, + { + "auxiliary_loss_clip": 0.01151363, + "auxiliary_loss_mlp": 0.01037979, + "balance_loss_clip": 1.06060946, + "balance_loss_mlp": 1.02195191, + "epoch": 0.28925296858560046, + "flos": 23222102359680.0, + "grad_norm": 1.5644810582565027, + "language_loss": 0.76897156, + "learning_rate": 3.3354030701951032e-06, + "loss": 0.790865, + "num_input_tokens_seen": 103786355, + "router_z_loss_clip": 0.90771484, + "router_z_loss_mlp": 0.16027832, + "step": 4811, + "time_per_iteration": 3.9298171997070312 + }, + { + "auxiliary_loss_clip": 0.01153166, + "auxiliary_loss_mlp": 0.01044786, + "balance_loss_clip": 1.06219149, + "balance_loss_mlp": 1.02747726, + "epoch": 0.2893130918382685, + "flos": 28621666437120.0, + "grad_norm": 1.3915774654323647, + "language_loss": 0.77080786, + "learning_rate": 3.335113118275117e-06, + "loss": 0.79278743, + "num_input_tokens_seen": 103809345, + "router_z_loss_clip": 0.90966797, + "router_z_loss_mlp": 0.17297363, + "step": 4812, + "time_per_iteration": 2.560129404067993 + }, + { + "auxiliary_loss_clip": 0.01092849, + "auxiliary_loss_mlp": 0.01014655, + "balance_loss_clip": 1.05823207, + "balance_loss_mlp": 1.01263428, + "epoch": 0.28937321509093644, + "flos": 72301288982400.0, + "grad_norm": 0.8684214517157387, + "language_loss": 0.6028806, + "learning_rate": 3.3348231157275085e-06, + "loss": 0.62395573, + "num_input_tokens_seen": 103871180, + "router_z_loss_clip": 0.34667969, + "router_z_loss_mlp": 0.02023315, + "step": 4813, + "time_per_iteration": 3.2850427627563477 + }, + { + "auxiliary_loss_clip": 0.01147986, + "auxiliary_loss_mlp": 0.01038436, + "balance_loss_clip": 1.05737233, + "balance_loss_mlp": 1.02193725, + "epoch": 0.2894333383436044, + "flos": 16216397637120.0, + "grad_norm": 2.0543238386113383, + "language_loss": 0.821926, + "learning_rate": 3.3345330625632725e-06, + "loss": 0.84379023, + "num_input_tokens_seen": 103889040, + "router_z_loss_clip": 0.90527344, + "router_z_loss_mlp": 0.16503906, + "step": 4814, + "time_per_iteration": 2.4151110649108887 + }, + { + "auxiliary_loss_clip": 0.01147603, + "auxiliary_loss_mlp": 0.01054234, + "balance_loss_clip": 1.05300224, + "balance_loss_mlp": 1.03703177, + "epoch": 0.2894934615962724, + "flos": 24828278918400.0, + "grad_norm": 1.72559003759506, + "language_loss": 0.73002231, + "learning_rate": 3.3342429587934094e-06, + "loss": 0.75204062, + "num_input_tokens_seen": 103910380, + "router_z_loss_clip": 0.94628906, + "router_z_loss_mlp": 0.17199707, + "step": 4815, + "time_per_iteration": 2.477792501449585 + }, + { + "auxiliary_loss_clip": 0.01141386, + "auxiliary_loss_mlp": 0.01042467, + "balance_loss_clip": 1.05305111, + "balance_loss_mlp": 1.02726734, + "epoch": 0.28955358484894034, + "flos": 20449978329600.0, + "grad_norm": 1.4358121498564411, + "language_loss": 0.7101571, + "learning_rate": 3.3339528044289198e-06, + "loss": 0.73199564, + "num_input_tokens_seen": 103929955, + "router_z_loss_clip": 0.88378906, + "router_z_loss_mlp": 0.15197754, + "step": 4816, + "time_per_iteration": 3.8478097915649414 + }, + { + "auxiliary_loss_clip": 0.01151359, + "auxiliary_loss_mlp": 0.01049381, + "balance_loss_clip": 1.05396771, + "balance_loss_mlp": 1.03201246, + "epoch": 0.2896137081016083, + "flos": 22565188477440.0, + "grad_norm": 2.2789355154021114, + "language_loss": 0.74638665, + "learning_rate": 3.3336625994808055e-06, + "loss": 0.76839405, + "num_input_tokens_seen": 103948020, + "router_z_loss_clip": 0.97314453, + "router_z_loss_mlp": 0.17358398, + "step": 4817, + "time_per_iteration": 2.452488422393799 + }, + { + "auxiliary_loss_clip": 0.01154452, + "auxiliary_loss_mlp": 0.01050394, + "balance_loss_clip": 1.05962253, + "balance_loss_mlp": 1.03324008, + "epoch": 0.28967383135427627, + "flos": 26687948734080.0, + "grad_norm": 2.16697894254464, + "language_loss": 0.76160681, + "learning_rate": 3.3333723439600723e-06, + "loss": 0.78365529, + "num_input_tokens_seen": 103968740, + "router_z_loss_clip": 0.94921875, + "router_z_loss_mlp": 0.17150879, + "step": 4818, + "time_per_iteration": 2.576723337173462 + }, + { + "auxiliary_loss_clip": 0.01142558, + "auxiliary_loss_mlp": 0.01042746, + "balance_loss_clip": 1.04952121, + "balance_loss_mlp": 1.02572286, + "epoch": 0.28973395460694423, + "flos": 15558262692480.0, + "grad_norm": 2.0397328776029484, + "language_loss": 0.80121219, + "learning_rate": 3.3330820378777263e-06, + "loss": 0.82306528, + "num_input_tokens_seen": 103986005, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 0.17028809, + "step": 4819, + "time_per_iteration": 2.430250883102417 + }, + { + "auxiliary_loss_clip": 0.01152278, + "auxiliary_loss_mlp": 0.0104023, + "balance_loss_clip": 1.0545603, + "balance_loss_mlp": 1.02225363, + "epoch": 0.2897940778596122, + "flos": 18697465762560.0, + "grad_norm": 1.7410055707980245, + "language_loss": 0.78771895, + "learning_rate": 3.332791681244776e-06, + "loss": 0.80964404, + "num_input_tokens_seen": 104005070, + "router_z_loss_clip": 0.97753906, + "router_z_loss_mlp": 0.17993164, + "step": 4820, + "time_per_iteration": 2.624973773956299 + }, + { + "auxiliary_loss_clip": 0.01155332, + "auxiliary_loss_mlp": 0.01036004, + "balance_loss_clip": 1.06023896, + "balance_loss_mlp": 1.01902819, + "epoch": 0.28985420111228016, + "flos": 18770292587520.0, + "grad_norm": 2.1223092140337707, + "language_loss": 0.73102605, + "learning_rate": 3.332501274072231e-06, + "loss": 0.75293946, + "num_input_tokens_seen": 104022945, + "router_z_loss_clip": 0.95068359, + "router_z_loss_mlp": 0.1697998, + "step": 4821, + "time_per_iteration": 2.451611042022705 + }, + { + "auxiliary_loss_clip": 0.01147694, + "auxiliary_loss_mlp": 0.01041806, + "balance_loss_clip": 1.05549407, + "balance_loss_mlp": 1.02474713, + "epoch": 0.28991432436494813, + "flos": 23069840607360.0, + "grad_norm": 1.879883055205485, + "language_loss": 0.72078526, + "learning_rate": 3.332210816371104e-06, + "loss": 0.74268019, + "num_input_tokens_seen": 104042080, + "router_z_loss_clip": 0.92138672, + "router_z_loss_mlp": 0.17053223, + "step": 4822, + "time_per_iteration": 2.51192307472229 + }, + { + "auxiliary_loss_clip": 0.01142807, + "auxiliary_loss_mlp": 0.0104607, + "balance_loss_clip": 1.05094767, + "balance_loss_mlp": 1.02963114, + "epoch": 0.2899744476176161, + "flos": 17603195880960.0, + "grad_norm": 1.8930515554678091, + "language_loss": 0.6632911, + "learning_rate": 3.3319203081524102e-06, + "loss": 0.68517983, + "num_input_tokens_seen": 104060975, + "router_z_loss_clip": 0.91894531, + "router_z_loss_mlp": 0.16418457, + "step": 4823, + "time_per_iteration": 2.4099204540252686 + }, + { + "auxiliary_loss_clip": 0.0113733, + "auxiliary_loss_mlp": 0.01036352, + "balance_loss_clip": 1.04723489, + "balance_loss_mlp": 1.0202831, + "epoch": 0.29003457087028406, + "flos": 22309360836480.0, + "grad_norm": 1.8154237705511114, + "language_loss": 0.81022799, + "learning_rate": 3.331629749427164e-06, + "loss": 0.83196485, + "num_input_tokens_seen": 104081395, + "router_z_loss_clip": 0.90087891, + "router_z_loss_mlp": 0.16064453, + "step": 4824, + "time_per_iteration": 2.5296220779418945 + }, + { + "auxiliary_loss_clip": 0.01144744, + "auxiliary_loss_mlp": 0.01042033, + "balance_loss_clip": 1.0513978, + "balance_loss_mlp": 1.02441359, + "epoch": 0.2900946941229521, + "flos": 21944975316480.0, + "grad_norm": 2.0544651517655033, + "language_loss": 0.72856122, + "learning_rate": 3.331339140206385e-06, + "loss": 0.75042897, + "num_input_tokens_seen": 104099995, + "router_z_loss_clip": 0.93359375, + "router_z_loss_mlp": 0.17602539, + "step": 4825, + "time_per_iteration": 2.4849445819854736 + }, + { + "auxiliary_loss_clip": 0.01142377, + "auxiliary_loss_mlp": 0.01039559, + "balance_loss_clip": 1.05028927, + "balance_loss_mlp": 1.0222615, + "epoch": 0.29015481737562004, + "flos": 17932173569280.0, + "grad_norm": 3.0406799682581562, + "language_loss": 0.73191512, + "learning_rate": 3.331048480501092e-06, + "loss": 0.75373447, + "num_input_tokens_seen": 104118930, + "router_z_loss_clip": 0.92089844, + "router_z_loss_mlp": 0.17285156, + "step": 4826, + "time_per_iteration": 2.512916326522827 + }, + { + "auxiliary_loss_clip": 0.0114564, + "auxiliary_loss_mlp": 0.01043158, + "balance_loss_clip": 1.05072451, + "balance_loss_mlp": 1.02699316, + "epoch": 0.290214940628288, + "flos": 22783525297920.0, + "grad_norm": 2.549911091100014, + "language_loss": 0.68382567, + "learning_rate": 3.3307577703223073e-06, + "loss": 0.70571363, + "num_input_tokens_seen": 104136940, + "router_z_loss_clip": 0.94824219, + "router_z_loss_mlp": 0.16162109, + "step": 4827, + "time_per_iteration": 2.4690213203430176 + }, + { + "auxiliary_loss_clip": 0.01145401, + "auxiliary_loss_mlp": 0.01038329, + "balance_loss_clip": 1.05119884, + "balance_loss_mlp": 1.0211513, + "epoch": 0.290275063880956, + "flos": 20006481104640.0, + "grad_norm": 2.1589655372978607, + "language_loss": 0.79897285, + "learning_rate": 3.3304670096810545e-06, + "loss": 0.82081014, + "num_input_tokens_seen": 104154280, + "router_z_loss_clip": 0.94140625, + "router_z_loss_mlp": 0.17175293, + "step": 4828, + "time_per_iteration": 2.4779863357543945 + }, + { + "auxiliary_loss_clip": 0.0114946, + "auxiliary_loss_mlp": 0.01046491, + "balance_loss_clip": 1.05844831, + "balance_loss_mlp": 1.02951515, + "epoch": 0.29033518713362394, + "flos": 22053605022720.0, + "grad_norm": 1.8182030520333181, + "language_loss": 0.80531502, + "learning_rate": 3.33017619858836e-06, + "loss": 0.82727456, + "num_input_tokens_seen": 104172605, + "router_z_loss_clip": 0.90917969, + "router_z_loss_mlp": 0.16967773, + "step": 4829, + "time_per_iteration": 2.4537384510040283 + }, + { + "auxiliary_loss_clip": 0.01140653, + "auxiliary_loss_mlp": 0.01044318, + "balance_loss_clip": 1.05126381, + "balance_loss_mlp": 1.02762842, + "epoch": 0.2903953103862919, + "flos": 25630056351360.0, + "grad_norm": 1.5403561311767477, + "language_loss": 0.82516658, + "learning_rate": 3.329885337055249e-06, + "loss": 0.84701627, + "num_input_tokens_seen": 104194120, + "router_z_loss_clip": 0.89257812, + "router_z_loss_mlp": 0.16699219, + "step": 4830, + "time_per_iteration": 2.5832395553588867 + }, + { + "auxiliary_loss_clip": 0.01143533, + "auxiliary_loss_mlp": 0.01046544, + "balance_loss_clip": 1.05002165, + "balance_loss_mlp": 1.02927029, + "epoch": 0.29045543363895987, + "flos": 16945851035520.0, + "grad_norm": 2.4280116995390153, + "language_loss": 0.78965896, + "learning_rate": 3.3295944250927546e-06, + "loss": 0.81155968, + "num_input_tokens_seen": 104210875, + "router_z_loss_clip": 0.93505859, + "router_z_loss_mlp": 0.17272949, + "step": 4831, + "time_per_iteration": 2.452075481414795 + }, + { + "auxiliary_loss_clip": 0.01149566, + "auxiliary_loss_mlp": 0.01041044, + "balance_loss_clip": 1.0592016, + "balance_loss_mlp": 1.02530813, + "epoch": 0.29051555689162784, + "flos": 26395492199040.0, + "grad_norm": 1.7396823960578607, + "language_loss": 0.73917079, + "learning_rate": 3.3293034627119055e-06, + "loss": 0.76107693, + "num_input_tokens_seen": 104229875, + "router_z_loss_clip": 0.90380859, + "router_z_loss_mlp": 0.15722656, + "step": 4832, + "time_per_iteration": 2.5402321815490723 + }, + { + "auxiliary_loss_clip": 0.01137041, + "auxiliary_loss_mlp": 0.01034394, + "balance_loss_clip": 1.04641783, + "balance_loss_mlp": 1.01957655, + "epoch": 0.2905756801442958, + "flos": 21103875469440.0, + "grad_norm": 1.6937927453924124, + "language_loss": 0.75905645, + "learning_rate": 3.329012449923736e-06, + "loss": 0.78077078, + "num_input_tokens_seen": 104250405, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.14807129, + "step": 4833, + "time_per_iteration": 2.462918281555176 + }, + { + "auxiliary_loss_clip": 0.01141526, + "auxiliary_loss_mlp": 0.01042192, + "balance_loss_clip": 1.05206239, + "balance_loss_mlp": 1.02514482, + "epoch": 0.29063580339696377, + "flos": 15706071158400.0, + "grad_norm": 1.9388686470629148, + "language_loss": 0.64694262, + "learning_rate": 3.3287213867392813e-06, + "loss": 0.66877973, + "num_input_tokens_seen": 104269185, + "router_z_loss_clip": 0.89404297, + "router_z_loss_mlp": 0.17053223, + "step": 4834, + "time_per_iteration": 2.6835386753082275 + }, + { + "auxiliary_loss_clip": 0.01143125, + "auxiliary_loss_mlp": 0.01038965, + "balance_loss_clip": 1.05062485, + "balance_loss_mlp": 1.02355099, + "epoch": 0.29069592664963173, + "flos": 24644990793600.0, + "grad_norm": 1.6882764169281088, + "language_loss": 0.71729726, + "learning_rate": 3.3284302731695783e-06, + "loss": 0.7391181, + "num_input_tokens_seen": 104289400, + "router_z_loss_clip": 0.92578125, + "router_z_loss_mlp": 0.15405273, + "step": 4835, + "time_per_iteration": 2.4791152477264404 + }, + { + "auxiliary_loss_clip": 0.01142791, + "auxiliary_loss_mlp": 0.01036253, + "balance_loss_clip": 1.05201292, + "balance_loss_mlp": 1.0216738, + "epoch": 0.2907560499022997, + "flos": 24973753000320.0, + "grad_norm": 1.6937375369517513, + "language_loss": 0.7958653, + "learning_rate": 3.3281391092256668e-06, + "loss": 0.81765574, + "num_input_tokens_seen": 104310485, + "router_z_loss_clip": 0.90771484, + "router_z_loss_mlp": 0.14587402, + "step": 4836, + "time_per_iteration": 2.53774094581604 + }, + { + "auxiliary_loss_clip": 0.01151681, + "auxiliary_loss_mlp": 0.01042369, + "balance_loss_clip": 1.0626111, + "balance_loss_mlp": 1.02658594, + "epoch": 0.29081617315496766, + "flos": 18657496903680.0, + "grad_norm": 2.2370900603472625, + "language_loss": 0.81470603, + "learning_rate": 3.3278478949185865e-06, + "loss": 0.83664656, + "num_input_tokens_seen": 104327330, + "router_z_loss_clip": 0.89111328, + "router_z_loss_mlp": 0.15771484, + "step": 4837, + "time_per_iteration": 2.49094820022583 + }, + { + "auxiliary_loss_clip": 0.01148218, + "auxiliary_loss_mlp": 0.01038116, + "balance_loss_clip": 1.05605292, + "balance_loss_mlp": 1.02203488, + "epoch": 0.2908762964076356, + "flos": 35331035955840.0, + "grad_norm": 1.954634849968805, + "language_loss": 0.6719262, + "learning_rate": 3.327556630259381e-06, + "loss": 0.6937896, + "num_input_tokens_seen": 104350350, + "router_z_loss_clip": 0.92138672, + "router_z_loss_mlp": 0.1607666, + "step": 4838, + "time_per_iteration": 2.5620710849761963 + }, + { + "auxiliary_loss_clip": 0.01148622, + "auxiliary_loss_mlp": 0.01049676, + "balance_loss_clip": 1.05273247, + "balance_loss_mlp": 1.03188944, + "epoch": 0.29093641966030365, + "flos": 23076305055360.0, + "grad_norm": 1.694256843961293, + "language_loss": 0.71837413, + "learning_rate": 3.327265315259095e-06, + "loss": 0.74035704, + "num_input_tokens_seen": 104369995, + "router_z_loss_clip": 0.95849609, + "router_z_loss_mlp": 0.17797852, + "step": 4839, + "time_per_iteration": 2.4734742641448975 + }, + { + "auxiliary_loss_clip": 0.01153638, + "auxiliary_loss_mlp": 0.01038127, + "balance_loss_clip": 1.06181169, + "balance_loss_mlp": 1.02253413, + "epoch": 0.2909965429129716, + "flos": 35955415094400.0, + "grad_norm": 2.0184213182077757, + "language_loss": 0.75987214, + "learning_rate": 3.326973949928776e-06, + "loss": 0.78178978, + "num_input_tokens_seen": 104392285, + "router_z_loss_clip": 0.91796875, + "router_z_loss_mlp": 0.15594482, + "step": 4840, + "time_per_iteration": 2.62597918510437 + }, + { + "auxiliary_loss_clip": 0.01146479, + "auxiliary_loss_mlp": 0.01045937, + "balance_loss_clip": 1.05620372, + "balance_loss_mlp": 1.02949774, + "epoch": 0.2910566661656396, + "flos": 30880231764480.0, + "grad_norm": 2.1561553497589054, + "language_loss": 0.60539711, + "learning_rate": 3.326682534279471e-06, + "loss": 0.6273213, + "num_input_tokens_seen": 104412640, + "router_z_loss_clip": 0.90332031, + "router_z_loss_mlp": 0.16430664, + "step": 4841, + "time_per_iteration": 2.5681328773498535 + }, + { + "auxiliary_loss_clip": 0.01150375, + "auxiliary_loss_mlp": 0.01046651, + "balance_loss_clip": 1.05394959, + "balance_loss_mlp": 1.02893686, + "epoch": 0.29111678941830754, + "flos": 30010188533760.0, + "grad_norm": 1.4309414921592845, + "language_loss": 0.71283388, + "learning_rate": 3.326391068322232e-06, + "loss": 0.73480415, + "num_input_tokens_seen": 104435245, + "router_z_loss_clip": 0.96386719, + "router_z_loss_mlp": 0.17724609, + "step": 4842, + "time_per_iteration": 2.5586845874786377 + }, + { + "auxiliary_loss_clip": 0.01146478, + "auxiliary_loss_mlp": 0.01037761, + "balance_loss_clip": 1.05461216, + "balance_loss_mlp": 1.02276444, + "epoch": 0.2911769126709755, + "flos": 22857393617280.0, + "grad_norm": 1.4782892152582339, + "language_loss": 0.73186183, + "learning_rate": 3.3260995520681098e-06, + "loss": 0.75370419, + "num_input_tokens_seen": 104455395, + "router_z_loss_clip": 0.91894531, + "router_z_loss_mlp": 0.14990234, + "step": 4843, + "time_per_iteration": 2.50913143157959 + }, + { + "auxiliary_loss_clip": 0.01144296, + "auxiliary_loss_mlp": 0.0103787, + "balance_loss_clip": 1.05294347, + "balance_loss_mlp": 1.02254558, + "epoch": 0.2912370359236435, + "flos": 21650507619840.0, + "grad_norm": 2.1709082880508377, + "language_loss": 0.58698708, + "learning_rate": 3.3258079855281602e-06, + "loss": 0.60880876, + "num_input_tokens_seen": 104473350, + "router_z_loss_clip": 0.91357422, + "router_z_loss_mlp": 0.15325928, + "step": 4844, + "time_per_iteration": 4.087522029876709 + }, + { + "auxiliary_loss_clip": 0.01156167, + "auxiliary_loss_mlp": 0.01040987, + "balance_loss_clip": 1.06159878, + "balance_loss_mlp": 1.0235821, + "epoch": 0.29129715917631144, + "flos": 22893340152960.0, + "grad_norm": 1.8327221933375797, + "language_loss": 0.86734921, + "learning_rate": 3.3255163687134396e-06, + "loss": 0.88932073, + "num_input_tokens_seen": 104492265, + "router_z_loss_clip": 0.9453125, + "router_z_loss_mlp": 0.17407227, + "step": 4845, + "time_per_iteration": 2.4598283767700195 + }, + { + "auxiliary_loss_clip": 0.01153758, + "auxiliary_loss_mlp": 0.01047758, + "balance_loss_clip": 1.06027591, + "balance_loss_mlp": 1.03079438, + "epoch": 0.2913572824289794, + "flos": 22674464628480.0, + "grad_norm": 1.787279678297868, + "language_loss": 0.66981399, + "learning_rate": 3.3252247016350046e-06, + "loss": 0.69182914, + "num_input_tokens_seen": 104510755, + "router_z_loss_clip": 0.93359375, + "router_z_loss_mlp": 0.16967773, + "step": 4846, + "time_per_iteration": 2.4678547382354736 + }, + { + "auxiliary_loss_clip": 0.01155151, + "auxiliary_loss_mlp": 0.01038036, + "balance_loss_clip": 1.0644418, + "balance_loss_mlp": 1.02200842, + "epoch": 0.29141740568164737, + "flos": 23107403255040.0, + "grad_norm": 1.9755362273575034, + "language_loss": 0.7059865, + "learning_rate": 3.3249329843039166e-06, + "loss": 0.72791833, + "num_input_tokens_seen": 104530830, + "router_z_loss_clip": 0.90771484, + "router_z_loss_mlp": 0.16021729, + "step": 4847, + "time_per_iteration": 2.652432918548584 + }, + { + "auxiliary_loss_clip": 0.01153266, + "auxiliary_loss_mlp": 0.01033454, + "balance_loss_clip": 1.05814767, + "balance_loss_mlp": 1.01671696, + "epoch": 0.29147752893431533, + "flos": 23587026583680.0, + "grad_norm": 1.4842578092949443, + "language_loss": 0.73884368, + "learning_rate": 3.324641216731237e-06, + "loss": 0.76071095, + "num_input_tokens_seen": 104550115, + "router_z_loss_clip": 0.95214844, + "router_z_loss_mlp": 0.1673584, + "step": 4848, + "time_per_iteration": 2.5195207595825195 + }, + { + "auxiliary_loss_clip": 0.01152573, + "auxiliary_loss_mlp": 0.01045842, + "balance_loss_clip": 1.05866337, + "balance_loss_mlp": 1.02880657, + "epoch": 0.2915376521869833, + "flos": 20591968792320.0, + "grad_norm": 2.520508412550984, + "language_loss": 0.76820964, + "learning_rate": 3.3243493989280295e-06, + "loss": 0.7901938, + "num_input_tokens_seen": 104566255, + "router_z_loss_clip": 0.93945312, + "router_z_loss_mlp": 0.17041016, + "step": 4849, + "time_per_iteration": 2.46817946434021 + }, + { + "auxiliary_loss_clip": 0.01158416, + "auxiliary_loss_mlp": 0.01041458, + "balance_loss_clip": 1.06087208, + "balance_loss_mlp": 1.02442312, + "epoch": 0.29159777543965126, + "flos": 20811490761600.0, + "grad_norm": 1.571824382481699, + "language_loss": 0.79073668, + "learning_rate": 3.3240575309053596e-06, + "loss": 0.81273544, + "num_input_tokens_seen": 104585235, + "router_z_loss_clip": 0.97460938, + "router_z_loss_mlp": 0.17028809, + "step": 4850, + "time_per_iteration": 2.457874059677124 + }, + { + "auxiliary_loss_clip": 0.01148505, + "auxiliary_loss_mlp": 0.01042478, + "balance_loss_clip": 1.05874383, + "balance_loss_mlp": 1.02510953, + "epoch": 0.29165789869231923, + "flos": 24244155947520.0, + "grad_norm": 1.9512324572131021, + "language_loss": 0.75832516, + "learning_rate": 3.323765612674296e-06, + "loss": 0.78023499, + "num_input_tokens_seen": 104605315, + "router_z_loss_clip": 0.89794922, + "router_z_loss_mlp": 0.17358398, + "step": 4851, + "time_per_iteration": 2.5023891925811768 + }, + { + "auxiliary_loss_clip": 0.01143787, + "auxiliary_loss_mlp": 0.01049833, + "balance_loss_clip": 1.05378997, + "balance_loss_mlp": 1.03376365, + "epoch": 0.29171802194498725, + "flos": 28949925853440.0, + "grad_norm": 1.5903638963216136, + "language_loss": 0.77519923, + "learning_rate": 3.3234736442459078e-06, + "loss": 0.79713541, + "num_input_tokens_seen": 104626055, + "router_z_loss_clip": 0.90039062, + "router_z_loss_mlp": 0.16064453, + "step": 4852, + "time_per_iteration": 2.5819411277770996 + }, + { + "auxiliary_loss_clip": 0.01145911, + "auxiliary_loss_mlp": 0.01057799, + "balance_loss_clip": 1.05347109, + "balance_loss_mlp": 1.03939295, + "epoch": 0.2917781451976552, + "flos": 22598226011520.0, + "grad_norm": 5.739072309901991, + "language_loss": 0.77908552, + "learning_rate": 3.3231816256312665e-06, + "loss": 0.80112267, + "num_input_tokens_seen": 104646005, + "router_z_loss_clip": 0.92285156, + "router_z_loss_mlp": 0.1842041, + "step": 4853, + "time_per_iteration": 2.549678325653076 + }, + { + "auxiliary_loss_clip": 0.01155691, + "auxiliary_loss_mlp": 0.01052274, + "balance_loss_clip": 1.05618167, + "balance_loss_mlp": 1.0350486, + "epoch": 0.2918382684503232, + "flos": 21574448570880.0, + "grad_norm": 9.02654687631161, + "language_loss": 0.88353568, + "learning_rate": 3.322889556841445e-06, + "loss": 0.90561533, + "num_input_tokens_seen": 104661620, + "router_z_loss_clip": 0.99414062, + "router_z_loss_mlp": 0.17236328, + "step": 4854, + "time_per_iteration": 5.333487510681152 + }, + { + "auxiliary_loss_clip": 0.01156761, + "auxiliary_loss_mlp": 0.01055032, + "balance_loss_clip": 1.06486619, + "balance_loss_mlp": 1.03611362, + "epoch": 0.29189839170299114, + "flos": 24353503925760.0, + "grad_norm": 1.672732315870503, + "language_loss": 0.86381721, + "learning_rate": 3.322597437887519e-06, + "loss": 0.88593513, + "num_input_tokens_seen": 104681445, + "router_z_loss_clip": 0.91943359, + "router_z_loss_mlp": 0.18920898, + "step": 4855, + "time_per_iteration": 2.491788387298584 + }, + { + "auxiliary_loss_clip": 0.0106945, + "auxiliary_loss_mlp": 0.01006945, + "balance_loss_clip": 1.03772664, + "balance_loss_mlp": 1.00479007, + "epoch": 0.2919585149556591, + "flos": 71316726215040.0, + "grad_norm": 0.84504079293719, + "language_loss": 0.60188359, + "learning_rate": 3.322305268780566e-06, + "loss": 0.62264752, + "num_input_tokens_seen": 104747945, + "router_z_loss_clip": 0.31689453, + "router_z_loss_mlp": 0.02154541, + "step": 4856, + "time_per_iteration": 3.2146198749542236 + }, + { + "auxiliary_loss_clip": 0.01149277, + "auxiliary_loss_mlp": 0.01043685, + "balance_loss_clip": 1.05808306, + "balance_loss_mlp": 1.02812779, + "epoch": 0.2920186382083271, + "flos": 15633208419840.0, + "grad_norm": 2.046838837858727, + "language_loss": 0.67866403, + "learning_rate": 3.322013049531664e-06, + "loss": 0.70059359, + "num_input_tokens_seen": 104766225, + "router_z_loss_clip": 0.91113281, + "router_z_loss_mlp": 0.15563965, + "step": 4857, + "time_per_iteration": 2.4637293815612793 + }, + { + "auxiliary_loss_clip": 0.01152449, + "auxiliary_loss_mlp": 0.01051819, + "balance_loss_clip": 1.05770791, + "balance_loss_mlp": 1.03586864, + "epoch": 0.29207876146099504, + "flos": 28366018364160.0, + "grad_norm": 1.8615414841957025, + "language_loss": 0.83685863, + "learning_rate": 3.321720780151895e-06, + "loss": 0.85890132, + "num_input_tokens_seen": 104785345, + "router_z_loss_clip": 0.94726562, + "router_z_loss_mlp": 0.15942383, + "step": 4858, + "time_per_iteration": 2.555842161178589 + }, + { + "auxiliary_loss_clip": 0.0115765, + "auxiliary_loss_mlp": 0.01048658, + "balance_loss_clip": 1.06124043, + "balance_loss_mlp": 1.03240955, + "epoch": 0.292138884713663, + "flos": 21870963342720.0, + "grad_norm": 2.0656779714947255, + "language_loss": 0.77664459, + "learning_rate": 3.321428460652342e-06, + "loss": 0.79870772, + "num_input_tokens_seen": 104804560, + "router_z_loss_clip": 0.96435547, + "router_z_loss_mlp": 0.16223145, + "step": 4859, + "time_per_iteration": 2.4246678352355957 + }, + { + "auxiliary_loss_clip": 0.01164948, + "auxiliary_loss_mlp": 0.01043777, + "balance_loss_clip": 1.06608891, + "balance_loss_mlp": 1.0265156, + "epoch": 0.29219900796633097, + "flos": 20992552243200.0, + "grad_norm": 2.961584803120683, + "language_loss": 0.68649095, + "learning_rate": 3.3211360910440885e-06, + "loss": 0.70857823, + "num_input_tokens_seen": 104821105, + "router_z_loss_clip": 0.98974609, + "router_z_loss_mlp": 0.17260742, + "step": 4860, + "time_per_iteration": 4.169926881790161 + }, + { + "auxiliary_loss_clip": 0.01154231, + "auxiliary_loss_mlp": 0.0104103, + "balance_loss_clip": 1.06470716, + "balance_loss_mlp": 1.02606916, + "epoch": 0.29225913121899894, + "flos": 35004608133120.0, + "grad_norm": 2.237880865281748, + "language_loss": 0.75436056, + "learning_rate": 3.320843671338222e-06, + "loss": 0.77631313, + "num_input_tokens_seen": 104841440, + "router_z_loss_clip": 0.89404297, + "router_z_loss_mlp": 0.1496582, + "step": 4861, + "time_per_iteration": 2.5811948776245117 + }, + { + "auxiliary_loss_clip": 0.01171158, + "auxiliary_loss_mlp": 0.0104861, + "balance_loss_clip": 1.07546735, + "balance_loss_mlp": 1.03294611, + "epoch": 0.2923192544716669, + "flos": 13515663888000.0, + "grad_norm": 1.8348107808521086, + "language_loss": 0.91443986, + "learning_rate": 3.320551201545832e-06, + "loss": 0.93663758, + "num_input_tokens_seen": 104858210, + "router_z_loss_clip": 0.95703125, + "router_z_loss_mlp": 0.15649414, + "step": 4862, + "time_per_iteration": 2.420309543609619 + }, + { + "auxiliary_loss_clip": 0.0114908, + "auxiliary_loss_mlp": 0.01042281, + "balance_loss_clip": 1.05825377, + "balance_loss_mlp": 1.02730834, + "epoch": 0.29237937772433487, + "flos": 19463512141440.0, + "grad_norm": 2.33241559257528, + "language_loss": 0.73324245, + "learning_rate": 3.320258681678008e-06, + "loss": 0.75515604, + "num_input_tokens_seen": 104875620, + "router_z_loss_clip": 0.90820312, + "router_z_loss_mlp": 0.14978027, + "step": 4863, + "time_per_iteration": 2.4315359592437744 + }, + { + "auxiliary_loss_clip": 0.01148265, + "auxiliary_loss_mlp": 0.01036574, + "balance_loss_clip": 1.06173193, + "balance_loss_mlp": 1.02198911, + "epoch": 0.29243950097700283, + "flos": 20850597694080.0, + "grad_norm": 1.8395434156939037, + "language_loss": 0.78040427, + "learning_rate": 3.319966111745842e-06, + "loss": 0.80225265, + "num_input_tokens_seen": 104894600, + "router_z_loss_clip": 0.86572266, + "router_z_loss_mlp": 0.14593506, + "step": 4864, + "time_per_iteration": 2.4777188301086426 + }, + { + "auxiliary_loss_clip": 0.01152903, + "auxiliary_loss_mlp": 0.01047411, + "balance_loss_clip": 1.05997276, + "balance_loss_mlp": 1.03040028, + "epoch": 0.29249962422967085, + "flos": 23584225322880.0, + "grad_norm": 1.6773819097122287, + "language_loss": 0.81726873, + "learning_rate": 3.319673491760429e-06, + "loss": 0.8392719, + "num_input_tokens_seen": 104914530, + "router_z_loss_clip": 0.92822266, + "router_z_loss_mlp": 0.17016602, + "step": 4865, + "time_per_iteration": 2.5349972248077393 + }, + { + "auxiliary_loss_clip": 0.01159171, + "auxiliary_loss_mlp": 0.01046618, + "balance_loss_clip": 1.06665635, + "balance_loss_mlp": 1.02995205, + "epoch": 0.2925597474823388, + "flos": 22273342473600.0, + "grad_norm": 1.9388341464340204, + "language_loss": 0.85309422, + "learning_rate": 3.3193808217328645e-06, + "loss": 0.87515211, + "num_input_tokens_seen": 104933460, + "router_z_loss_clip": 0.92529297, + "router_z_loss_mlp": 0.16650391, + "step": 4866, + "time_per_iteration": 2.5025646686553955 + }, + { + "auxiliary_loss_clip": 0.01152486, + "auxiliary_loss_mlp": 0.01037097, + "balance_loss_clip": 1.06374454, + "balance_loss_mlp": 1.02167165, + "epoch": 0.2926198707350068, + "flos": 34456108475520.0, + "grad_norm": 2.1520891466886454, + "language_loss": 0.75433606, + "learning_rate": 3.3190881016742476e-06, + "loss": 0.77623188, + "num_input_tokens_seen": 104954495, + "router_z_loss_clip": 0.88671875, + "router_z_loss_mlp": 0.15429688, + "step": 4867, + "time_per_iteration": 2.564605712890625 + }, + { + "auxiliary_loss_clip": 0.01146082, + "auxiliary_loss_mlp": 0.0104476, + "balance_loss_clip": 1.05272067, + "balance_loss_mlp": 1.02814269, + "epoch": 0.29267999398767475, + "flos": 20704153944960.0, + "grad_norm": 1.7422057183296733, + "language_loss": 0.72937965, + "learning_rate": 3.3187953315956776e-06, + "loss": 0.75128806, + "num_input_tokens_seen": 104971915, + "router_z_loss_clip": 0.93212891, + "router_z_loss_mlp": 0.16601562, + "step": 4868, + "time_per_iteration": 2.443575382232666 + }, + { + "auxiliary_loss_clip": 0.01152284, + "auxiliary_loss_mlp": 0.01034576, + "balance_loss_clip": 1.06239402, + "balance_loss_mlp": 1.01922202, + "epoch": 0.2927401172403427, + "flos": 18368667642240.0, + "grad_norm": 1.386024614912941, + "language_loss": 0.74499261, + "learning_rate": 3.3185025115082566e-06, + "loss": 0.7668612, + "num_input_tokens_seen": 104991335, + "router_z_loss_clip": 0.89892578, + "router_z_loss_mlp": 0.15368652, + "step": 4869, + "time_per_iteration": 2.4378178119659424 + }, + { + "auxiliary_loss_clip": 0.01150242, + "auxiliary_loss_mlp": 0.01037883, + "balance_loss_clip": 1.05896258, + "balance_loss_mlp": 1.02151608, + "epoch": 0.2928002404930107, + "flos": 26104041244800.0, + "grad_norm": 1.4707231306445638, + "language_loss": 0.76627123, + "learning_rate": 3.318209641423088e-06, + "loss": 0.78815246, + "num_input_tokens_seen": 105012015, + "router_z_loss_clip": 0.91259766, + "router_z_loss_mlp": 0.16369629, + "step": 4870, + "time_per_iteration": 2.476855754852295 + }, + { + "auxiliary_loss_clip": 0.01157613, + "auxiliary_loss_mlp": 0.01047737, + "balance_loss_clip": 1.05953944, + "balance_loss_mlp": 1.03039241, + "epoch": 0.29286036374567864, + "flos": 21324726241920.0, + "grad_norm": 2.857348125851349, + "language_loss": 0.67893273, + "learning_rate": 3.3179167213512777e-06, + "loss": 0.70098621, + "num_input_tokens_seen": 105031460, + "router_z_loss_clip": 0.98144531, + "router_z_loss_mlp": 0.17346191, + "step": 4871, + "time_per_iteration": 2.442491292953491 + }, + { + "auxiliary_loss_clip": 0.01145387, + "auxiliary_loss_mlp": 0.01037965, + "balance_loss_clip": 1.05475402, + "balance_loss_mlp": 1.02304614, + "epoch": 0.2929204869983466, + "flos": 29569492569600.0, + "grad_norm": 2.0534049809762047, + "language_loss": 0.76884484, + "learning_rate": 3.317623751303933e-06, + "loss": 0.79067832, + "num_input_tokens_seen": 105052965, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.14923096, + "step": 4872, + "time_per_iteration": 2.5074820518493652 + }, + { + "auxiliary_loss_clip": 0.01151804, + "auxiliary_loss_mlp": 0.0104093, + "balance_loss_clip": 1.05601978, + "balance_loss_mlp": 1.02317929, + "epoch": 0.2929806102510146, + "flos": 19058259922560.0, + "grad_norm": 1.8490058977538277, + "language_loss": 0.73667765, + "learning_rate": 3.317330731292164e-06, + "loss": 0.758605, + "num_input_tokens_seen": 105071840, + "router_z_loss_clip": 0.95849609, + "router_z_loss_mlp": 0.17749023, + "step": 4873, + "time_per_iteration": 2.4459941387176514 + }, + { + "auxiliary_loss_clip": 0.01149037, + "auxiliary_loss_mlp": 0.01042039, + "balance_loss_clip": 1.0554316, + "balance_loss_mlp": 1.02503955, + "epoch": 0.29304073350368254, + "flos": 21944221130880.0, + "grad_norm": 2.4028798555387705, + "language_loss": 0.78091192, + "learning_rate": 3.3170376613270812e-06, + "loss": 0.80282271, + "num_input_tokens_seen": 105089445, + "router_z_loss_clip": 0.93701172, + "router_z_loss_mlp": 0.16992188, + "step": 4874, + "time_per_iteration": 2.6760783195495605 + }, + { + "auxiliary_loss_clip": 0.01156848, + "auxiliary_loss_mlp": 0.01044181, + "balance_loss_clip": 1.06124067, + "balance_loss_mlp": 1.02702713, + "epoch": 0.2931008567563505, + "flos": 15450818135040.0, + "grad_norm": 5.098135460790677, + "language_loss": 0.77639574, + "learning_rate": 3.3167445414197985e-06, + "loss": 0.798406, + "num_input_tokens_seen": 105106210, + "router_z_loss_clip": 0.95605469, + "router_z_loss_mlp": 0.17163086, + "step": 4875, + "time_per_iteration": 2.449298143386841 + }, + { + "auxiliary_loss_clip": 0.01148666, + "auxiliary_loss_mlp": 0.0103609, + "balance_loss_clip": 1.05627418, + "balance_loss_mlp": 1.02016342, + "epoch": 0.29316098000901847, + "flos": 16983162288000.0, + "grad_norm": 1.6240845436983273, + "language_loss": 0.69266963, + "learning_rate": 3.316451371581431e-06, + "loss": 0.71451718, + "num_input_tokens_seen": 105124200, + "router_z_loss_clip": 0.92431641, + "router_z_loss_mlp": 0.15917969, + "step": 4876, + "time_per_iteration": 2.420743227005005 + }, + { + "auxiliary_loss_clip": 0.01150758, + "auxiliary_loss_mlp": 0.01057274, + "balance_loss_clip": 1.05680597, + "balance_loss_mlp": 1.03998923, + "epoch": 0.29322110326168643, + "flos": 16357705741440.0, + "grad_norm": 2.1661919409074564, + "language_loss": 0.82268143, + "learning_rate": 3.316158151823096e-06, + "loss": 0.84476173, + "num_input_tokens_seen": 105140400, + "router_z_loss_clip": 0.93945312, + "router_z_loss_mlp": 0.17285156, + "step": 4877, + "time_per_iteration": 2.4912755489349365 + }, + { + "auxiliary_loss_clip": 0.01157744, + "auxiliary_loss_mlp": 0.0103925, + "balance_loss_clip": 1.06142557, + "balance_loss_mlp": 1.02287054, + "epoch": 0.29328122651435445, + "flos": 13990869843840.0, + "grad_norm": 1.9537078955113778, + "language_loss": 0.67689013, + "learning_rate": 3.315864882155911e-06, + "loss": 0.69886005, + "num_input_tokens_seen": 105157535, + "router_z_loss_clip": 0.96386719, + "router_z_loss_mlp": 0.16381836, + "step": 4878, + "time_per_iteration": 2.40283203125 + }, + { + "auxiliary_loss_clip": 0.01143766, + "auxiliary_loss_mlp": 0.01051433, + "balance_loss_clip": 1.05279016, + "balance_loss_mlp": 1.03412986, + "epoch": 0.2933413497670224, + "flos": 25264593423360.0, + "grad_norm": 2.6846203915420874, + "language_loss": 0.73659182, + "learning_rate": 3.3155715625909982e-06, + "loss": 0.75854379, + "num_input_tokens_seen": 105175185, + "router_z_loss_clip": 0.91064453, + "router_z_loss_mlp": 0.1729126, + "step": 4879, + "time_per_iteration": 2.5446901321411133 + }, + { + "auxiliary_loss_clip": 0.01155828, + "auxiliary_loss_mlp": 0.01053866, + "balance_loss_clip": 1.05725026, + "balance_loss_mlp": 1.03499484, + "epoch": 0.2934014730196904, + "flos": 32123746656000.0, + "grad_norm": 2.173887813270735, + "language_loss": 0.66377431, + "learning_rate": 3.3152781931394803e-06, + "loss": 0.68587124, + "num_input_tokens_seen": 105194540, + "router_z_loss_clip": 0.98583984, + "router_z_loss_mlp": 0.1887207, + "step": 4880, + "time_per_iteration": 2.59209942817688 + }, + { + "auxiliary_loss_clip": 0.01151193, + "auxiliary_loss_mlp": 0.01051619, + "balance_loss_clip": 1.0560286, + "balance_loss_mlp": 1.03434515, + "epoch": 0.29346159627235835, + "flos": 24352498344960.0, + "grad_norm": 4.879806389094521, + "language_loss": 0.70323658, + "learning_rate": 3.314984773812481e-06, + "loss": 0.72526473, + "num_input_tokens_seen": 105213215, + "router_z_loss_clip": 0.95068359, + "router_z_loss_mlp": 0.17272949, + "step": 4881, + "time_per_iteration": 2.5027809143066406 + }, + { + "auxiliary_loss_clip": 0.01157714, + "auxiliary_loss_mlp": 0.01038684, + "balance_loss_clip": 1.06521058, + "balance_loss_mlp": 1.02169037, + "epoch": 0.2935217195250263, + "flos": 22746752749440.0, + "grad_norm": 1.5765250610512769, + "language_loss": 0.83274287, + "learning_rate": 3.314691304621127e-06, + "loss": 0.85470688, + "num_input_tokens_seen": 105231585, + "router_z_loss_clip": 0.92529297, + "router_z_loss_mlp": 0.16986084, + "step": 4882, + "time_per_iteration": 2.4477975368499756 + }, + { + "auxiliary_loss_clip": 0.01148736, + "auxiliary_loss_mlp": 0.01043003, + "balance_loss_clip": 1.05312037, + "balance_loss_mlp": 1.02567029, + "epoch": 0.2935818427776943, + "flos": 21725561088000.0, + "grad_norm": 2.280122529622165, + "language_loss": 0.71632153, + "learning_rate": 3.314397785576548e-06, + "loss": 0.73823893, + "num_input_tokens_seen": 105250120, + "router_z_loss_clip": 0.95605469, + "router_z_loss_mlp": 0.17321777, + "step": 4883, + "time_per_iteration": 2.4878387451171875 + }, + { + "auxiliary_loss_clip": 0.01149017, + "auxiliary_loss_mlp": 0.01039365, + "balance_loss_clip": 1.05361962, + "balance_loss_mlp": 1.0222466, + "epoch": 0.29364196603036224, + "flos": 23804968354560.0, + "grad_norm": 2.089047026059425, + "language_loss": 0.91939372, + "learning_rate": 3.3141042166898726e-06, + "loss": 0.94127756, + "num_input_tokens_seen": 105266065, + "router_z_loss_clip": 0.95410156, + "router_z_loss_mlp": 0.17114258, + "step": 4884, + "time_per_iteration": 2.4462146759033203 + }, + { + "auxiliary_loss_clip": 0.01161264, + "auxiliary_loss_mlp": 0.01039284, + "balance_loss_clip": 1.06742811, + "balance_loss_mlp": 1.02330947, + "epoch": 0.2937020892830302, + "flos": 23470064922240.0, + "grad_norm": 3.572413202885037, + "language_loss": 0.73393226, + "learning_rate": 3.313810597972234e-06, + "loss": 0.7559377, + "num_input_tokens_seen": 105282155, + "router_z_loss_clip": 0.93847656, + "router_z_loss_mlp": 0.15991211, + "step": 4885, + "time_per_iteration": 2.4910991191864014 + }, + { + "auxiliary_loss_clip": 0.01146024, + "auxiliary_loss_mlp": 0.0104179, + "balance_loss_clip": 1.05579054, + "balance_loss_mlp": 1.02597117, + "epoch": 0.2937622125356982, + "flos": 24272740195200.0, + "grad_norm": 1.8420807671527213, + "language_loss": 0.85051841, + "learning_rate": 3.3135169294347655e-06, + "loss": 0.87239653, + "num_input_tokens_seen": 105299225, + "router_z_loss_clip": 0.90136719, + "router_z_loss_mlp": 0.15814209, + "step": 4886, + "time_per_iteration": 2.417187213897705 + }, + { + "auxiliary_loss_clip": 0.01154796, + "auxiliary_loss_mlp": 0.01036343, + "balance_loss_clip": 1.06099033, + "balance_loss_mlp": 1.02024937, + "epoch": 0.29382233578836614, + "flos": 20662461233280.0, + "grad_norm": 2.449705567081194, + "language_loss": 0.76959932, + "learning_rate": 3.313223211088603e-06, + "loss": 0.79151076, + "num_input_tokens_seen": 105315710, + "router_z_loss_clip": 0.93847656, + "router_z_loss_mlp": 0.16088867, + "step": 4887, + "time_per_iteration": 2.4345526695251465 + }, + { + "auxiliary_loss_clip": 0.01147968, + "auxiliary_loss_mlp": 0.01058983, + "balance_loss_clip": 1.05458581, + "balance_loss_mlp": 1.04054189, + "epoch": 0.2938824590410341, + "flos": 16545052103040.0, + "grad_norm": 2.058747171760711, + "language_loss": 0.79132974, + "learning_rate": 3.3129294429448855e-06, + "loss": 0.81339926, + "num_input_tokens_seen": 105333505, + "router_z_loss_clip": 0.93310547, + "router_z_loss_mlp": 0.18444824, + "step": 4888, + "time_per_iteration": 2.804368734359741 + }, + { + "auxiliary_loss_clip": 0.01146696, + "auxiliary_loss_mlp": 0.0104051, + "balance_loss_clip": 1.05303311, + "balance_loss_mlp": 1.02509618, + "epoch": 0.29394258229370207, + "flos": 37925474382720.0, + "grad_norm": 1.7554125961729377, + "language_loss": 0.55550289, + "learning_rate": 3.3126356250147517e-06, + "loss": 0.57737494, + "num_input_tokens_seen": 105355605, + "router_z_loss_clip": 0.93505859, + "router_z_loss_mlp": 0.15429688, + "step": 4889, + "time_per_iteration": 4.037097692489624 + }, + { + "auxiliary_loss_clip": 0.01156018, + "auxiliary_loss_mlp": 0.01039186, + "balance_loss_clip": 1.06006598, + "balance_loss_mlp": 1.02190089, + "epoch": 0.29400270554637004, + "flos": 20044690197120.0, + "grad_norm": 3.3229745810965916, + "language_loss": 0.8445338, + "learning_rate": 3.3123417573093434e-06, + "loss": 0.86648583, + "num_input_tokens_seen": 105374225, + "router_z_loss_clip": 0.95898438, + "router_z_loss_mlp": 0.17272949, + "step": 4890, + "time_per_iteration": 2.417645215988159 + }, + { + "auxiliary_loss_clip": 0.01149911, + "auxiliary_loss_mlp": 0.01048477, + "balance_loss_clip": 1.05566251, + "balance_loss_mlp": 1.03169215, + "epoch": 0.294062828799038, + "flos": 15266380775040.0, + "grad_norm": 1.7002143827590763, + "language_loss": 0.7223165, + "learning_rate": 3.3120478398398046e-06, + "loss": 0.74430031, + "num_input_tokens_seen": 105391565, + "router_z_loss_clip": 0.94189453, + "router_z_loss_mlp": 0.16772461, + "step": 4891, + "time_per_iteration": 2.4576187133789062 + }, + { + "auxiliary_loss_clip": 0.01150753, + "auxiliary_loss_mlp": 0.01042632, + "balance_loss_clip": 1.0561173, + "balance_loss_mlp": 1.02532268, + "epoch": 0.294122952051706, + "flos": 22747147799040.0, + "grad_norm": 1.637898164378624, + "language_loss": 0.77138692, + "learning_rate": 3.3117538726172797e-06, + "loss": 0.79332078, + "num_input_tokens_seen": 105409840, + "router_z_loss_clip": 0.94726562, + "router_z_loss_mlp": 0.1730957, + "step": 4892, + "time_per_iteration": 2.5045580863952637 + }, + { + "auxiliary_loss_clip": 0.01144837, + "auxiliary_loss_mlp": 0.01036401, + "balance_loss_clip": 1.05268598, + "balance_loss_mlp": 1.01984298, + "epoch": 0.294183075304374, + "flos": 24972891073920.0, + "grad_norm": 1.8168997376935552, + "language_loss": 0.77823228, + "learning_rate": 3.3114598556529164e-06, + "loss": 0.80004466, + "num_input_tokens_seen": 105428645, + "router_z_loss_clip": 0.92089844, + "router_z_loss_mlp": 0.16552734, + "step": 4893, + "time_per_iteration": 2.6292428970336914 + }, + { + "auxiliary_loss_clip": 0.01144982, + "auxiliary_loss_mlp": 0.01041246, + "balance_loss_clip": 1.05304921, + "balance_loss_mlp": 1.025594, + "epoch": 0.29424319855704195, + "flos": 30952986762240.0, + "grad_norm": 1.822788901880527, + "language_loss": 0.85195553, + "learning_rate": 3.311165788957864e-06, + "loss": 0.8738178, + "num_input_tokens_seen": 105447480, + "router_z_loss_clip": 0.91943359, + "router_z_loss_mlp": 0.15673828, + "step": 4894, + "time_per_iteration": 2.5906245708465576 + }, + { + "auxiliary_loss_clip": 0.01149193, + "auxiliary_loss_mlp": 0.01036801, + "balance_loss_clip": 1.05399418, + "balance_loss_mlp": 1.02100563, + "epoch": 0.2943033218097099, + "flos": 15231583474560.0, + "grad_norm": 2.0865551868121632, + "language_loss": 0.90332675, + "learning_rate": 3.310871672543274e-06, + "loss": 0.92518669, + "num_input_tokens_seen": 105464600, + "router_z_loss_clip": 0.95214844, + "router_z_loss_mlp": 0.15820312, + "step": 4895, + "time_per_iteration": 2.433682680130005 + }, + { + "auxiliary_loss_clip": 0.0115539, + "auxiliary_loss_mlp": 0.01045601, + "balance_loss_clip": 1.05867553, + "balance_loss_mlp": 1.02825558, + "epoch": 0.2943634450623779, + "flos": 21725884310400.0, + "grad_norm": 1.8437138022829425, + "language_loss": 0.86661398, + "learning_rate": 3.3105775064202982e-06, + "loss": 0.88862389, + "num_input_tokens_seen": 105481510, + "router_z_loss_clip": 0.96826172, + "router_z_loss_mlp": 0.17346191, + "step": 4896, + "time_per_iteration": 2.436283826828003 + }, + { + "auxiliary_loss_clip": 0.01148625, + "auxiliary_loss_mlp": 0.01055208, + "balance_loss_clip": 1.05369401, + "balance_loss_mlp": 1.03792286, + "epoch": 0.29442356831504585, + "flos": 22602104680320.0, + "grad_norm": 1.8382690698692066, + "language_loss": 0.7367034, + "learning_rate": 3.3102832906000924e-06, + "loss": 0.7587418, + "num_input_tokens_seen": 105501390, + "router_z_loss_clip": 0.95068359, + "router_z_loss_mlp": 0.17260742, + "step": 4897, + "time_per_iteration": 2.513638973236084 + }, + { + "auxiliary_loss_clip": 0.01148751, + "auxiliary_loss_mlp": 0.01040728, + "balance_loss_clip": 1.05154991, + "balance_loss_mlp": 1.02270317, + "epoch": 0.2944836915677138, + "flos": 20011401267840.0, + "grad_norm": 1.9839861111933026, + "language_loss": 0.74266344, + "learning_rate": 3.309989025093813e-06, + "loss": 0.76455826, + "num_input_tokens_seen": 105519600, + "router_z_loss_clip": 0.97119141, + "router_z_loss_mlp": 0.18017578, + "step": 4898, + "time_per_iteration": 5.27046012878418 + }, + { + "auxiliary_loss_clip": 0.01167208, + "auxiliary_loss_mlp": 0.01046017, + "balance_loss_clip": 1.06769347, + "balance_loss_mlp": 1.02677703, + "epoch": 0.2945438148203818, + "flos": 20045875345920.0, + "grad_norm": 2.679796559517322, + "language_loss": 0.70756376, + "learning_rate": 3.309694709912618e-06, + "loss": 0.72969592, + "num_input_tokens_seen": 105535970, + "router_z_loss_clip": 0.99658203, + "router_z_loss_mlp": 0.19250488, + "step": 4899, + "time_per_iteration": 2.4231767654418945 + }, + { + "auxiliary_loss_clip": 0.01156541, + "auxiliary_loss_mlp": 0.01046515, + "balance_loss_clip": 1.0615145, + "balance_loss_mlp": 1.02963531, + "epoch": 0.29460393807304974, + "flos": 23733542160000.0, + "grad_norm": 2.58102982752021, + "language_loss": 0.7918781, + "learning_rate": 3.3094003450676685e-06, + "loss": 0.81390864, + "num_input_tokens_seen": 105556735, + "router_z_loss_clip": 0.94970703, + "router_z_loss_mlp": 0.16882324, + "step": 4900, + "time_per_iteration": 2.9629194736480713 + }, + { + "auxiliary_loss_clip": 0.01147346, + "auxiliary_loss_mlp": 0.01042661, + "balance_loss_clip": 1.0524838, + "balance_loss_mlp": 1.02638888, + "epoch": 0.2946640613257177, + "flos": 14976079056000.0, + "grad_norm": 1.9410416761664808, + "language_loss": 0.80778265, + "learning_rate": 3.3091059305701268e-06, + "loss": 0.82968271, + "num_input_tokens_seen": 105574875, + "router_z_loss_clip": 0.94824219, + "router_z_loss_mlp": 0.1628418, + "step": 4901, + "time_per_iteration": 2.486470937728882 + }, + { + "auxiliary_loss_clip": 0.01146799, + "auxiliary_loss_mlp": 0.0103547, + "balance_loss_clip": 1.05750322, + "balance_loss_mlp": 1.01995516, + "epoch": 0.2947241845783857, + "flos": 24243904552320.0, + "grad_norm": 2.0445503346792346, + "language_loss": 0.57993472, + "learning_rate": 3.308811466431157e-06, + "loss": 0.60175741, + "num_input_tokens_seen": 105594225, + "router_z_loss_clip": 0.89453125, + "router_z_loss_mlp": 0.1552124, + "step": 4902, + "time_per_iteration": 2.4994070529937744 + }, + { + "auxiliary_loss_clip": 0.0115003, + "auxiliary_loss_mlp": 0.01046132, + "balance_loss_clip": 1.05777431, + "balance_loss_mlp": 1.02904916, + "epoch": 0.29478430783105364, + "flos": 19938394874880.0, + "grad_norm": 1.6434171945813074, + "language_loss": 0.76073092, + "learning_rate": 3.308516952661925e-06, + "loss": 0.78269255, + "num_input_tokens_seen": 105614000, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.1706543, + "step": 4903, + "time_per_iteration": 2.460458993911743 + }, + { + "auxiliary_loss_clip": 0.01151075, + "auxiliary_loss_mlp": 0.01040005, + "balance_loss_clip": 1.05907631, + "balance_loss_mlp": 1.02255249, + "epoch": 0.2948444310837216, + "flos": 27381347856000.0, + "grad_norm": 1.7250188070375754, + "language_loss": 0.61976576, + "learning_rate": 3.3082223892736e-06, + "loss": 0.64167655, + "num_input_tokens_seen": 105634575, + "router_z_loss_clip": 0.91943359, + "router_z_loss_mlp": 0.17443848, + "step": 4904, + "time_per_iteration": 3.9136617183685303 + }, + { + "auxiliary_loss_clip": 0.01155903, + "auxiliary_loss_mlp": 0.01044665, + "balance_loss_clip": 1.05805516, + "balance_loss_mlp": 1.02804708, + "epoch": 0.2949045543363896, + "flos": 23405462311680.0, + "grad_norm": 1.6843896435654209, + "language_loss": 0.73422146, + "learning_rate": 3.3079277762773496e-06, + "loss": 0.75622708, + "num_input_tokens_seen": 105654385, + "router_z_loss_clip": 0.97851562, + "router_z_loss_mlp": 0.16589355, + "step": 4905, + "time_per_iteration": 2.4660165309906006 + }, + { + "auxiliary_loss_clip": 0.01147182, + "auxiliary_loss_mlp": 0.01049692, + "balance_loss_clip": 1.05429173, + "balance_loss_mlp": 1.03117859, + "epoch": 0.2949646775890576, + "flos": 23951483930880.0, + "grad_norm": 1.7265922479214106, + "language_loss": 0.81529438, + "learning_rate": 3.3076331136843476e-06, + "loss": 0.83726317, + "num_input_tokens_seen": 105673570, + "router_z_loss_clip": 0.92724609, + "router_z_loss_mlp": 0.18505859, + "step": 4906, + "time_per_iteration": 2.493208646774292 + }, + { + "auxiliary_loss_clip": 0.01139136, + "auxiliary_loss_mlp": 0.01045089, + "balance_loss_clip": 1.04915571, + "balance_loss_mlp": 1.02745795, + "epoch": 0.29502480084172555, + "flos": 22784315397120.0, + "grad_norm": 2.2169065175156772, + "language_loss": 0.8772006, + "learning_rate": 3.3073384015057667e-06, + "loss": 0.89904279, + "num_input_tokens_seen": 105691940, + "router_z_loss_clip": 0.90039062, + "router_z_loss_mlp": 0.17614746, + "step": 4907, + "time_per_iteration": 2.506267547607422 + }, + { + "auxiliary_loss_clip": 0.01156066, + "auxiliary_loss_mlp": 0.010434, + "balance_loss_clip": 1.06017327, + "balance_loss_mlp": 1.02605534, + "epoch": 0.2950849240943935, + "flos": 19646656611840.0, + "grad_norm": 6.66448085102141, + "language_loss": 0.82101989, + "learning_rate": 3.307043639752782e-06, + "loss": 0.8430146, + "num_input_tokens_seen": 105709825, + "router_z_loss_clip": 0.95849609, + "router_z_loss_mlp": 0.17346191, + "step": 4908, + "time_per_iteration": 2.465794086456299 + }, + { + "auxiliary_loss_clip": 0.01073035, + "auxiliary_loss_mlp": 0.01015398, + "balance_loss_clip": 1.04139686, + "balance_loss_mlp": 1.0130322, + "epoch": 0.2951450473470615, + "flos": 71002829260800.0, + "grad_norm": 0.7690334332404062, + "language_loss": 0.57229829, + "learning_rate": 3.3067488284365728e-06, + "loss": 0.59318268, + "num_input_tokens_seen": 105766880, + "router_z_loss_clip": 0.31689453, + "router_z_loss_mlp": 0.02365112, + "step": 4909, + "time_per_iteration": 2.964507818222046 + }, + { + "auxiliary_loss_clip": 0.01161889, + "auxiliary_loss_mlp": 0.01044043, + "balance_loss_clip": 1.0691452, + "balance_loss_mlp": 1.02834308, + "epoch": 0.29520517059972945, + "flos": 22966310632320.0, + "grad_norm": 3.204573503240577, + "language_loss": 0.86643577, + "learning_rate": 3.3064539675683163e-06, + "loss": 0.88849509, + "num_input_tokens_seen": 105786875, + "router_z_loss_clip": 0.92773438, + "router_z_loss_mlp": 0.15698242, + "step": 4910, + "time_per_iteration": 2.491466760635376 + }, + { + "auxiliary_loss_clip": 0.01148352, + "auxiliary_loss_mlp": 0.01037353, + "balance_loss_clip": 1.05895996, + "balance_loss_mlp": 1.02222526, + "epoch": 0.2952652938523974, + "flos": 20485673470080.0, + "grad_norm": 1.7618325323763495, + "language_loss": 0.72855681, + "learning_rate": 3.3061590571591946e-06, + "loss": 0.75041389, + "num_input_tokens_seen": 105805315, + "router_z_loss_clip": 0.89453125, + "router_z_loss_mlp": 0.15136719, + "step": 4911, + "time_per_iteration": 2.561394453048706 + }, + { + "auxiliary_loss_clip": 0.01154066, + "auxiliary_loss_mlp": 0.01041452, + "balance_loss_clip": 1.06125045, + "balance_loss_mlp": 1.02526367, + "epoch": 0.2953254171050654, + "flos": 19646584784640.0, + "grad_norm": 1.6656700330746201, + "language_loss": 0.9002316, + "learning_rate": 3.3058640972203904e-06, + "loss": 0.92218679, + "num_input_tokens_seen": 105825125, + "router_z_loss_clip": 0.92675781, + "router_z_loss_mlp": 0.16186523, + "step": 4912, + "time_per_iteration": 2.47904372215271 + }, + { + "auxiliary_loss_clip": 0.01150074, + "auxiliary_loss_mlp": 0.01048816, + "balance_loss_clip": 1.05835271, + "balance_loss_mlp": 1.03236485, + "epoch": 0.29538554035773334, + "flos": 22747973811840.0, + "grad_norm": 1.4543884529240658, + "language_loss": 0.83315825, + "learning_rate": 3.3055690877630894e-06, + "loss": 0.85514718, + "num_input_tokens_seen": 105846085, + "router_z_loss_clip": 0.91699219, + "router_z_loss_mlp": 0.16455078, + "step": 4913, + "time_per_iteration": 2.6614649295806885 + }, + { + "auxiliary_loss_clip": 0.01154031, + "auxiliary_loss_mlp": 0.01044252, + "balance_loss_clip": 1.06151962, + "balance_loss_mlp": 1.02844501, + "epoch": 0.2954456636104013, + "flos": 21871861182720.0, + "grad_norm": 1.7993719715390437, + "language_loss": 0.77055317, + "learning_rate": 3.3052740287984765e-06, + "loss": 0.79253602, + "num_input_tokens_seen": 105865400, + "router_z_loss_clip": 0.92382812, + "router_z_loss_mlp": 0.15808105, + "step": 4914, + "time_per_iteration": 2.6971652507781982 + }, + { + "auxiliary_loss_clip": 0.01156218, + "auxiliary_loss_mlp": 0.01042126, + "balance_loss_clip": 1.06567609, + "balance_loss_mlp": 1.02529359, + "epoch": 0.2955057868630693, + "flos": 40442560871040.0, + "grad_norm": 1.9824407623941247, + "language_loss": 0.81726861, + "learning_rate": 3.3049789203377424e-06, + "loss": 0.839252, + "num_input_tokens_seen": 105887920, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.16821289, + "step": 4915, + "time_per_iteration": 2.7135982513427734 + }, + { + "auxiliary_loss_clip": 0.01148885, + "auxiliary_loss_mlp": 0.01038957, + "balance_loss_clip": 1.05520368, + "balance_loss_mlp": 1.02262568, + "epoch": 0.29556591011573724, + "flos": 22564506119040.0, + "grad_norm": 1.951834741233008, + "language_loss": 0.84823692, + "learning_rate": 3.3046837623920772e-06, + "loss": 0.8701154, + "num_input_tokens_seen": 105904035, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.16333008, + "step": 4916, + "time_per_iteration": 2.638031005859375 + }, + { + "auxiliary_loss_clip": 0.01139585, + "auxiliary_loss_mlp": 0.01039702, + "balance_loss_clip": 1.05004704, + "balance_loss_mlp": 1.02258372, + "epoch": 0.2956260333684052, + "flos": 22089300163200.0, + "grad_norm": 1.9105510626554567, + "language_loss": 0.69521177, + "learning_rate": 3.3043885549726723e-06, + "loss": 0.71700466, + "num_input_tokens_seen": 105922685, + "router_z_loss_clip": 0.89599609, + "router_z_loss_mlp": 0.17102051, + "step": 4917, + "time_per_iteration": 2.569218158721924 + }, + { + "auxiliary_loss_clip": 0.01149101, + "auxiliary_loss_mlp": 0.01046733, + "balance_loss_clip": 1.05661392, + "balance_loss_mlp": 1.02850568, + "epoch": 0.2956861566210732, + "flos": 16435488643200.0, + "grad_norm": 2.1184749729915318, + "language_loss": 0.91072583, + "learning_rate": 3.3040932980907226e-06, + "loss": 0.93268424, + "num_input_tokens_seen": 105940425, + "router_z_loss_clip": 0.92431641, + "router_z_loss_mlp": 0.18225098, + "step": 4918, + "time_per_iteration": 2.595144033432007 + }, + { + "auxiliary_loss_clip": 0.01143389, + "auxiliary_loss_mlp": 0.01046396, + "balance_loss_clip": 1.05183959, + "balance_loss_mlp": 1.02911043, + "epoch": 0.2957462798737412, + "flos": 25812087500160.0, + "grad_norm": 2.115312303350786, + "language_loss": 0.72844929, + "learning_rate": 3.303797991757425e-06, + "loss": 0.75034714, + "num_input_tokens_seen": 105960550, + "router_z_loss_clip": 0.91552734, + "router_z_loss_mlp": 0.17297363, + "step": 4919, + "time_per_iteration": 3.092984914779663 + }, + { + "auxiliary_loss_clip": 0.01143789, + "auxiliary_loss_mlp": 0.01044481, + "balance_loss_clip": 1.05414462, + "balance_loss_mlp": 1.02842307, + "epoch": 0.29580640312640916, + "flos": 16690849407360.0, + "grad_norm": 2.194554067569275, + "language_loss": 0.75700569, + "learning_rate": 3.3035026359839763e-06, + "loss": 0.7788884, + "num_input_tokens_seen": 105978820, + "router_z_loss_clip": 0.89648438, + "router_z_loss_mlp": 0.16052246, + "step": 4920, + "time_per_iteration": 2.731037139892578 + }, + { + "auxiliary_loss_clip": 0.0115424, + "auxiliary_loss_mlp": 0.01050278, + "balance_loss_clip": 1.0581634, + "balance_loss_mlp": 1.03339815, + "epoch": 0.2958665263790771, + "flos": 23945594100480.0, + "grad_norm": 2.9849241549546224, + "language_loss": 0.68833727, + "learning_rate": 3.3032072307815774e-06, + "loss": 0.71038246, + "num_input_tokens_seen": 105997545, + "router_z_loss_clip": 0.9609375, + "router_z_loss_mlp": 0.16882324, + "step": 4921, + "time_per_iteration": 2.856712818145752 + }, + { + "auxiliary_loss_clip": 0.0114902, + "auxiliary_loss_mlp": 0.01052042, + "balance_loss_clip": 1.05384207, + "balance_loss_mlp": 1.03170514, + "epoch": 0.2959266496317451, + "flos": 18478410670080.0, + "grad_norm": 1.9410639009991988, + "language_loss": 0.7473135, + "learning_rate": 3.3029117761614298e-06, + "loss": 0.76932406, + "num_input_tokens_seen": 106015320, + "router_z_loss_clip": 0.95361328, + "router_z_loss_mlp": 0.20324707, + "step": 4922, + "time_per_iteration": 3.009937047958374 + }, + { + "auxiliary_loss_clip": 0.01153084, + "auxiliary_loss_mlp": 0.0103918, + "balance_loss_clip": 1.05548406, + "balance_loss_mlp": 1.02135789, + "epoch": 0.29598677288441305, + "flos": 25957489754880.0, + "grad_norm": 1.8934642472174086, + "language_loss": 0.76641321, + "learning_rate": 3.302616272134737e-06, + "loss": 0.78833586, + "num_input_tokens_seen": 106034555, + "router_z_loss_clip": 0.9765625, + "router_z_loss_mlp": 0.17834473, + "step": 4923, + "time_per_iteration": 2.7058796882629395 + }, + { + "auxiliary_loss_clip": 0.01152744, + "auxiliary_loss_mlp": 0.0106, + "balance_loss_clip": 1.06058919, + "balance_loss_mlp": 1.0409385, + "epoch": 0.296046896137081, + "flos": 25155999630720.0, + "grad_norm": 1.6380709124209227, + "language_loss": 0.86549461, + "learning_rate": 3.3023207187127042e-06, + "loss": 0.887622, + "num_input_tokens_seen": 106054200, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.19067383, + "step": 4924, + "time_per_iteration": 2.7590548992156982 + }, + { + "auxiliary_loss_clip": 0.01146101, + "auxiliary_loss_mlp": 0.01036722, + "balance_loss_clip": 1.0529573, + "balance_loss_mlp": 1.01899588, + "epoch": 0.296107019389749, + "flos": 21761148487680.0, + "grad_norm": 1.5768876224689028, + "language_loss": 0.81839818, + "learning_rate": 3.3020251159065396e-06, + "loss": 0.84022641, + "num_input_tokens_seen": 106074700, + "router_z_loss_clip": 0.93066406, + "router_z_loss_mlp": 0.17736816, + "step": 4925, + "time_per_iteration": 2.7505898475646973 + }, + { + "auxiliary_loss_clip": 0.01143323, + "auxiliary_loss_mlp": 0.01045458, + "balance_loss_clip": 1.05231488, + "balance_loss_mlp": 1.02838755, + "epoch": 0.29616714264241695, + "flos": 17960039544960.0, + "grad_norm": 7.21475659739793, + "language_loss": 0.85828197, + "learning_rate": 3.301729463727452e-06, + "loss": 0.88016975, + "num_input_tokens_seen": 106091415, + "router_z_loss_clip": 0.90966797, + "router_z_loss_mlp": 0.17053223, + "step": 4926, + "time_per_iteration": 2.539757013320923 + }, + { + "auxiliary_loss_clip": 0.01152711, + "auxiliary_loss_mlp": 0.01036593, + "balance_loss_clip": 1.05639505, + "balance_loss_mlp": 1.02034438, + "epoch": 0.2962272658950849, + "flos": 15012779777280.0, + "grad_norm": 2.125798663140117, + "language_loss": 0.86087024, + "learning_rate": 3.3014337621866527e-06, + "loss": 0.88276327, + "num_input_tokens_seen": 106109135, + "router_z_loss_clip": 0.96240234, + "router_z_loss_mlp": 0.16259766, + "step": 4927, + "time_per_iteration": 2.580981969833374 + }, + { + "auxiliary_loss_clip": 0.01157909, + "auxiliary_loss_mlp": 0.01038704, + "balance_loss_clip": 1.06477189, + "balance_loss_mlp": 1.02274156, + "epoch": 0.2962873891477529, + "flos": 14720861946240.0, + "grad_norm": 1.7239642807389775, + "language_loss": 0.80883574, + "learning_rate": 3.3011380112953553e-06, + "loss": 0.83080184, + "num_input_tokens_seen": 106125750, + "router_z_loss_clip": 0.93115234, + "router_z_loss_mlp": 0.15979004, + "step": 4928, + "time_per_iteration": 2.6415302753448486 + }, + { + "auxiliary_loss_clip": 0.01160436, + "auxiliary_loss_mlp": 0.01042143, + "balance_loss_clip": 1.06204796, + "balance_loss_mlp": 1.02227128, + "epoch": 0.29634751240042084, + "flos": 26723787528960.0, + "grad_norm": 2.5862546362987735, + "language_loss": 0.72835052, + "learning_rate": 3.300842211064773e-06, + "loss": 0.75037634, + "num_input_tokens_seen": 106142835, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.19873047, + "step": 4929, + "time_per_iteration": 2.495162010192871 + }, + { + "auxiliary_loss_clip": 0.01152243, + "auxiliary_loss_mlp": 0.01046061, + "balance_loss_clip": 1.05607629, + "balance_loss_mlp": 1.0278337, + "epoch": 0.2964076356530888, + "flos": 14571293713920.0, + "grad_norm": 3.316744402125808, + "language_loss": 0.72168654, + "learning_rate": 3.3005463615061246e-06, + "loss": 0.74366963, + "num_input_tokens_seen": 106160680, + "router_z_loss_clip": 0.95996094, + "router_z_loss_mlp": 0.18212891, + "step": 4930, + "time_per_iteration": 2.4258062839508057 + }, + { + "auxiliary_loss_clip": 0.01068397, + "auxiliary_loss_mlp": 0.0100527, + "balance_loss_clip": 1.03625631, + "balance_loss_mlp": 1.00297201, + "epoch": 0.29646775890575683, + "flos": 63104315063040.0, + "grad_norm": 0.8127223085715218, + "language_loss": 0.60633153, + "learning_rate": 3.3002504626306275e-06, + "loss": 0.62706816, + "num_input_tokens_seen": 106224415, + "router_z_loss_clip": 0.32128906, + "router_z_loss_mlp": 0.02297974, + "step": 4931, + "time_per_iteration": 3.0192694664001465 + }, + { + "auxiliary_loss_clip": 0.01076766, + "auxiliary_loss_mlp": 0.0100906, + "balance_loss_clip": 1.04414332, + "balance_loss_mlp": 1.00656271, + "epoch": 0.2965278821584248, + "flos": 63067686168960.0, + "grad_norm": 0.7375701562312961, + "language_loss": 0.52331656, + "learning_rate": 3.2999545144495023e-06, + "loss": 0.54417479, + "num_input_tokens_seen": 106279140, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.02496338, + "step": 4932, + "time_per_iteration": 4.3396172523498535 + }, + { + "auxiliary_loss_clip": 0.01148911, + "auxiliary_loss_mlp": 0.0104775, + "balance_loss_clip": 1.05519354, + "balance_loss_mlp": 1.03089345, + "epoch": 0.29658800541109276, + "flos": 23768734510080.0, + "grad_norm": 5.07122081916455, + "language_loss": 0.81823719, + "learning_rate": 3.299658516973972e-06, + "loss": 0.84020376, + "num_input_tokens_seen": 106298190, + "router_z_loss_clip": 0.93798828, + "router_z_loss_mlp": 0.1685791, + "step": 4933, + "time_per_iteration": 2.527318000793457 + }, + { + "auxiliary_loss_clip": 0.0115824, + "auxiliary_loss_mlp": 0.01033836, + "balance_loss_clip": 1.06804538, + "balance_loss_mlp": 1.01756358, + "epoch": 0.2966481286637607, + "flos": 23988543788160.0, + "grad_norm": 1.883038759724732, + "language_loss": 0.7535277, + "learning_rate": 3.299362470215261e-06, + "loss": 0.77544844, + "num_input_tokens_seen": 106319065, + "router_z_loss_clip": 0.90234375, + "router_z_loss_mlp": 0.16271973, + "step": 4934, + "time_per_iteration": 2.4958460330963135 + }, + { + "auxiliary_loss_clip": 0.01151656, + "auxiliary_loss_mlp": 0.01048449, + "balance_loss_clip": 1.0575676, + "balance_loss_mlp": 1.03140163, + "epoch": 0.2967082519164287, + "flos": 17165157523200.0, + "grad_norm": 2.054004203523822, + "language_loss": 0.62388533, + "learning_rate": 3.299066374184594e-06, + "loss": 0.64588642, + "num_input_tokens_seen": 106338040, + "router_z_loss_clip": 0.93945312, + "router_z_loss_mlp": 0.17041016, + "step": 4935, + "time_per_iteration": 2.4255571365356445 + }, + { + "auxiliary_loss_clip": 0.01142651, + "auxiliary_loss_mlp": 0.01042046, + "balance_loss_clip": 1.05357218, + "balance_loss_mlp": 1.0259769, + "epoch": 0.29676837516909665, + "flos": 29387712816000.0, + "grad_norm": 1.4833368679433645, + "language_loss": 0.79580605, + "learning_rate": 3.2987702288932e-06, + "loss": 0.81765294, + "num_input_tokens_seen": 106358900, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.1607666, + "step": 4936, + "time_per_iteration": 2.5107855796813965 + }, + { + "auxiliary_loss_clip": 0.01156258, + "auxiliary_loss_mlp": 0.01046103, + "balance_loss_clip": 1.0619061, + "balance_loss_mlp": 1.02816176, + "epoch": 0.2968284984217646, + "flos": 34751222616960.0, + "grad_norm": 1.8207889556550132, + "language_loss": 0.74244511, + "learning_rate": 3.298474034352309e-06, + "loss": 0.76446867, + "num_input_tokens_seen": 106381805, + "router_z_loss_clip": 0.94433594, + "router_z_loss_mlp": 0.17932129, + "step": 4937, + "time_per_iteration": 2.6070444583892822 + }, + { + "auxiliary_loss_clip": 0.01153846, + "auxiliary_loss_mlp": 0.01043297, + "balance_loss_clip": 1.06270623, + "balance_loss_mlp": 1.02650046, + "epoch": 0.2968886216744326, + "flos": 21544104556800.0, + "grad_norm": 1.8500146553112404, + "language_loss": 0.78352487, + "learning_rate": 3.2981777905731526e-06, + "loss": 0.80549634, + "num_input_tokens_seen": 106402365, + "router_z_loss_clip": 0.91113281, + "router_z_loss_mlp": 0.16796875, + "step": 4938, + "time_per_iteration": 2.48563289642334 + }, + { + "auxiliary_loss_clip": 0.01158329, + "auxiliary_loss_mlp": 0.01051273, + "balance_loss_clip": 1.0595268, + "balance_loss_mlp": 1.03386879, + "epoch": 0.29694874492710055, + "flos": 12787323811200.0, + "grad_norm": 2.2371113083940823, + "language_loss": 0.76855218, + "learning_rate": 3.297881497566964e-06, + "loss": 0.79064822, + "num_input_tokens_seen": 106419800, + "router_z_loss_clip": 0.98779297, + "router_z_loss_mlp": 0.17407227, + "step": 4939, + "time_per_iteration": 2.3966562747955322 + }, + { + "auxiliary_loss_clip": 0.01156455, + "auxiliary_loss_mlp": 0.01042276, + "balance_loss_clip": 1.06006777, + "balance_loss_mlp": 1.02596259, + "epoch": 0.2970088681797685, + "flos": 24569973239040.0, + "grad_norm": 1.6340010713868678, + "language_loss": 0.78134388, + "learning_rate": 3.297585155344979e-06, + "loss": 0.8033312, + "num_input_tokens_seen": 106440300, + "router_z_loss_clip": 0.96484375, + "router_z_loss_mlp": 0.16314697, + "step": 4940, + "time_per_iteration": 2.565507411956787 + }, + { + "auxiliary_loss_clip": 0.0116783, + "auxiliary_loss_mlp": 0.01040217, + "balance_loss_clip": 1.07135689, + "balance_loss_mlp": 1.02185869, + "epoch": 0.2970689914324365, + "flos": 23659171050240.0, + "grad_norm": 1.409632446003147, + "language_loss": 0.75315678, + "learning_rate": 3.297288763918435e-06, + "loss": 0.77523726, + "num_input_tokens_seen": 106460035, + "router_z_loss_clip": 0.96582031, + "router_z_loss_mlp": 0.18371582, + "step": 4941, + "time_per_iteration": 2.716204881668091 + }, + { + "auxiliary_loss_clip": 0.01161402, + "auxiliary_loss_mlp": 0.01056918, + "balance_loss_clip": 1.06245863, + "balance_loss_mlp": 1.03903651, + "epoch": 0.29712911468510445, + "flos": 39670301439360.0, + "grad_norm": 2.7820415442538162, + "language_loss": 0.74216199, + "learning_rate": 3.2969923232985712e-06, + "loss": 0.76434517, + "num_input_tokens_seen": 106481095, + "router_z_loss_clip": 0.98974609, + "router_z_loss_mlp": 0.17895508, + "step": 4942, + "time_per_iteration": 4.0119287967681885 + }, + { + "auxiliary_loss_clip": 0.01169281, + "auxiliary_loss_mlp": 0.01043436, + "balance_loss_clip": 1.07045043, + "balance_loss_mlp": 1.02541161, + "epoch": 0.2971892379377724, + "flos": 26395312631040.0, + "grad_norm": 2.2040746162971305, + "language_loss": 0.70462859, + "learning_rate": 3.2966958334966287e-06, + "loss": 0.72675574, + "num_input_tokens_seen": 106501590, + "router_z_loss_clip": 0.98925781, + "router_z_loss_mlp": 0.18017578, + "step": 4943, + "time_per_iteration": 2.4841132164001465 + }, + { + "auxiliary_loss_clip": 0.01165006, + "auxiliary_loss_mlp": 0.01041262, + "balance_loss_clip": 1.06773591, + "balance_loss_mlp": 1.02384555, + "epoch": 0.2972493611904404, + "flos": 17603195880960.0, + "grad_norm": 2.889437517124799, + "language_loss": 0.79874825, + "learning_rate": 3.2963992945238497e-06, + "loss": 0.82081097, + "num_input_tokens_seen": 106519430, + "router_z_loss_clip": 0.97216797, + "router_z_loss_mlp": 0.17419434, + "step": 4944, + "time_per_iteration": 2.4932687282562256 + }, + { + "auxiliary_loss_clip": 0.01160649, + "auxiliary_loss_mlp": 0.0104272, + "balance_loss_clip": 1.0715723, + "balance_loss_mlp": 1.02727008, + "epoch": 0.2973094844431084, + "flos": 20412774817920.0, + "grad_norm": 2.165756737519095, + "language_loss": 0.8363744, + "learning_rate": 3.2961027063914795e-06, + "loss": 0.85840803, + "num_input_tokens_seen": 106535870, + "router_z_loss_clip": 0.89111328, + "router_z_loss_mlp": 0.15441895, + "step": 4945, + "time_per_iteration": 2.4701361656188965 + }, + { + "auxiliary_loss_clip": 0.01152542, + "auxiliary_loss_mlp": 0.01047509, + "balance_loss_clip": 1.06315315, + "balance_loss_mlp": 1.03140366, + "epoch": 0.29736960769577636, + "flos": 17493488766720.0, + "grad_norm": 2.7161504661434708, + "language_loss": 0.66850781, + "learning_rate": 3.2958060691107654e-06, + "loss": 0.69050831, + "num_input_tokens_seen": 106553560, + "router_z_loss_clip": 0.89453125, + "router_z_loss_mlp": 0.16101074, + "step": 4946, + "time_per_iteration": 2.525394916534424 + }, + { + "auxiliary_loss_clip": 0.01155237, + "auxiliary_loss_mlp": 0.01035032, + "balance_loss_clip": 1.06258333, + "balance_loss_mlp": 1.01914132, + "epoch": 0.2974297309484443, + "flos": 26103969417600.0, + "grad_norm": 1.8665481074560257, + "language_loss": 0.73672324, + "learning_rate": 3.2955093826929547e-06, + "loss": 0.75862592, + "num_input_tokens_seen": 106574115, + "router_z_loss_clip": 0.92675781, + "router_z_loss_mlp": 0.15869141, + "step": 4947, + "time_per_iteration": 2.490772247314453 + }, + { + "auxiliary_loss_clip": 0.01157441, + "auxiliary_loss_mlp": 0.01044909, + "balance_loss_clip": 1.05886555, + "balance_loss_mlp": 1.02690792, + "epoch": 0.2974898542011123, + "flos": 25666433850240.0, + "grad_norm": 1.9689938439622, + "language_loss": 0.73506409, + "learning_rate": 3.2952126471492985e-06, + "loss": 0.75708759, + "num_input_tokens_seen": 106593070, + "router_z_loss_clip": 0.98681641, + "router_z_loss_mlp": 0.17993164, + "step": 4948, + "time_per_iteration": 3.8404223918914795 + }, + { + "auxiliary_loss_clip": 0.01151174, + "auxiliary_loss_mlp": 0.01033946, + "balance_loss_clip": 1.06097126, + "balance_loss_mlp": 1.01806748, + "epoch": 0.29754997745378026, + "flos": 18661339658880.0, + "grad_norm": 1.9191528930440764, + "language_loss": 0.83309132, + "learning_rate": 3.2949158624910497e-06, + "loss": 0.8549425, + "num_input_tokens_seen": 106610695, + "router_z_loss_clip": 0.90234375, + "router_z_loss_mlp": 0.15869141, + "step": 4949, + "time_per_iteration": 2.497042179107666 + }, + { + "auxiliary_loss_clip": 0.01139992, + "auxiliary_loss_mlp": 0.01056608, + "balance_loss_clip": 1.05044317, + "balance_loss_mlp": 1.03789818, + "epoch": 0.2976101007064482, + "flos": 22274599449600.0, + "grad_norm": 3.1722654396579775, + "language_loss": 0.7117976, + "learning_rate": 3.2946190287294603e-06, + "loss": 0.73376358, + "num_input_tokens_seen": 106631300, + "router_z_loss_clip": 0.89550781, + "router_z_loss_mlp": 0.18713379, + "step": 4950, + "time_per_iteration": 2.504631757736206 + }, + { + "auxiliary_loss_clip": 0.01149368, + "auxiliary_loss_mlp": 0.01047115, + "balance_loss_clip": 1.05867219, + "balance_loss_mlp": 1.03116441, + "epoch": 0.2976702239591162, + "flos": 21945657674880.0, + "grad_norm": 3.0343116017834184, + "language_loss": 0.8222841, + "learning_rate": 3.294322145875789e-06, + "loss": 0.84424895, + "num_input_tokens_seen": 106650065, + "router_z_loss_clip": 0.90722656, + "router_z_loss_mlp": 0.1595459, + "step": 4951, + "time_per_iteration": 2.511155366897583 + }, + { + "auxiliary_loss_clip": 0.01146434, + "auxiliary_loss_mlp": 0.01035453, + "balance_loss_clip": 1.05324864, + "balance_loss_mlp": 1.01914573, + "epoch": 0.29773034721178415, + "flos": 24637197542400.0, + "grad_norm": 2.9216159351735023, + "language_loss": 0.74248862, + "learning_rate": 3.2940252139412912e-06, + "loss": 0.76430756, + "num_input_tokens_seen": 106668230, + "router_z_loss_clip": 0.93115234, + "router_z_loss_mlp": 0.16320801, + "step": 4952, + "time_per_iteration": 2.473228931427002 + }, + { + "auxiliary_loss_clip": 0.01148625, + "auxiliary_loss_mlp": 0.01038412, + "balance_loss_clip": 1.05647206, + "balance_loss_mlp": 1.02118587, + "epoch": 0.2977904704644521, + "flos": 20557566541440.0, + "grad_norm": 1.7240193367522576, + "language_loss": 0.83884394, + "learning_rate": 3.293728232937228e-06, + "loss": 0.86071426, + "num_input_tokens_seen": 106687785, + "router_z_loss_clip": 0.91992188, + "router_z_loss_mlp": 0.17224121, + "step": 4953, + "time_per_iteration": 2.476499080657959 + }, + { + "auxiliary_loss_clip": 0.01153011, + "auxiliary_loss_mlp": 0.01039203, + "balance_loss_clip": 1.06147623, + "balance_loss_mlp": 1.02364588, + "epoch": 0.2978505937171201, + "flos": 18916449027840.0, + "grad_norm": 1.9996988740100514, + "language_loss": 0.73839259, + "learning_rate": 3.2934312028748597e-06, + "loss": 0.7603147, + "num_input_tokens_seen": 106706875, + "router_z_loss_clip": 0.91552734, + "router_z_loss_mlp": 0.15551758, + "step": 4954, + "time_per_iteration": 2.587108612060547 + }, + { + "auxiliary_loss_clip": 0.01147399, + "auxiliary_loss_mlp": 0.01037263, + "balance_loss_clip": 1.05589783, + "balance_loss_mlp": 1.0216707, + "epoch": 0.29791071696978805, + "flos": 19317750750720.0, + "grad_norm": 1.8572409260212537, + "language_loss": 0.7551769, + "learning_rate": 3.293134123765452e-06, + "loss": 0.77702355, + "num_input_tokens_seen": 106725105, + "router_z_loss_clip": 0.91455078, + "router_z_loss_mlp": 0.15588379, + "step": 4955, + "time_per_iteration": 2.5312390327453613 + }, + { + "auxiliary_loss_clip": 0.01150832, + "auxiliary_loss_mlp": 0.01032658, + "balance_loss_clip": 1.05694091, + "balance_loss_mlp": 1.01654696, + "epoch": 0.297970840222456, + "flos": 18806813740800.0, + "grad_norm": 2.5222878486244387, + "language_loss": 0.72601366, + "learning_rate": 3.2928369956202684e-06, + "loss": 0.74784857, + "num_input_tokens_seen": 106744780, + "router_z_loss_clip": 0.93896484, + "router_z_loss_mlp": 0.16119385, + "step": 4956, + "time_per_iteration": 2.4826903343200684 + }, + { + "auxiliary_loss_clip": 0.01158184, + "auxiliary_loss_mlp": 0.01041144, + "balance_loss_clip": 1.05891323, + "balance_loss_mlp": 1.02418065, + "epoch": 0.298030963475124, + "flos": 22852760762880.0, + "grad_norm": 1.6707617006220798, + "language_loss": 0.79460841, + "learning_rate": 3.2925398184505754e-06, + "loss": 0.81660169, + "num_input_tokens_seen": 106764670, + "router_z_loss_clip": 0.99267578, + "router_z_loss_mlp": 0.1697998, + "step": 4957, + "time_per_iteration": 2.620781898498535 + }, + { + "auxiliary_loss_clip": 0.01172754, + "auxiliary_loss_mlp": 0.01044824, + "balance_loss_clip": 1.07619691, + "balance_loss_mlp": 1.02776504, + "epoch": 0.298091086727792, + "flos": 21868485304320.0, + "grad_norm": 1.5904584018552383, + "language_loss": 0.70675504, + "learning_rate": 3.2922425922676437e-06, + "loss": 0.72893071, + "num_input_tokens_seen": 106783695, + "router_z_loss_clip": 0.96582031, + "router_z_loss_mlp": 0.17053223, + "step": 4958, + "time_per_iteration": 2.488320827484131 + }, + { + "auxiliary_loss_clip": 0.01151499, + "auxiliary_loss_mlp": 0.01038175, + "balance_loss_clip": 1.06221128, + "balance_loss_mlp": 1.02201009, + "epoch": 0.29815120998045996, + "flos": 21175014355200.0, + "grad_norm": 1.6097857300414902, + "language_loss": 0.79011571, + "learning_rate": 3.291945317082743e-06, + "loss": 0.81201243, + "num_input_tokens_seen": 106803150, + "router_z_loss_clip": 0.89257812, + "router_z_loss_mlp": 0.16174316, + "step": 4959, + "time_per_iteration": 2.586901903152466 + }, + { + "auxiliary_loss_clip": 0.01153508, + "auxiliary_loss_mlp": 0.01045391, + "balance_loss_clip": 1.06239843, + "balance_loss_mlp": 1.02926159, + "epoch": 0.29821133323312793, + "flos": 19896271200000.0, + "grad_norm": 1.7432633565646358, + "language_loss": 0.79711449, + "learning_rate": 3.291647992907147e-06, + "loss": 0.81910348, + "num_input_tokens_seen": 106820705, + "router_z_loss_clip": 0.91113281, + "router_z_loss_mlp": 0.16149902, + "step": 4960, + "time_per_iteration": 2.494821786880493 + }, + { + "auxiliary_loss_clip": 0.01147652, + "auxiliary_loss_mlp": 0.01037397, + "balance_loss_clip": 1.05435467, + "balance_loss_mlp": 1.0200758, + "epoch": 0.2982714564857959, + "flos": 12750766744320.0, + "grad_norm": 2.6747189544008236, + "language_loss": 0.73967779, + "learning_rate": 3.291350619752129e-06, + "loss": 0.76152825, + "num_input_tokens_seen": 106837335, + "router_z_loss_clip": 0.93359375, + "router_z_loss_mlp": 0.17333984, + "step": 4961, + "time_per_iteration": 2.4373772144317627 + }, + { + "auxiliary_loss_clip": 0.01156027, + "auxiliary_loss_mlp": 0.01038209, + "balance_loss_clip": 1.06452978, + "balance_loss_mlp": 1.02275944, + "epoch": 0.29833157973846386, + "flos": 22271905929600.0, + "grad_norm": 2.086523605916897, + "language_loss": 0.62257761, + "learning_rate": 3.291053197628967e-06, + "loss": 0.64451993, + "num_input_tokens_seen": 106856250, + "router_z_loss_clip": 0.91455078, + "router_z_loss_mlp": 0.15454102, + "step": 4962, + "time_per_iteration": 2.5104193687438965 + }, + { + "auxiliary_loss_clip": 0.01143311, + "auxiliary_loss_mlp": 0.01039588, + "balance_loss_clip": 1.05369234, + "balance_loss_mlp": 1.02336335, + "epoch": 0.2983917029911318, + "flos": 15372999319680.0, + "grad_norm": 1.7810354622169131, + "language_loss": 0.82864034, + "learning_rate": 3.2907557265489375e-06, + "loss": 0.85046929, + "num_input_tokens_seen": 106873370, + "router_z_loss_clip": 0.89550781, + "router_z_loss_mlp": 0.16204834, + "step": 4963, + "time_per_iteration": 2.4467453956604004 + }, + { + "auxiliary_loss_clip": 0.01147336, + "auxiliary_loss_mlp": 0.01032094, + "balance_loss_clip": 1.05877614, + "balance_loss_mlp": 1.01597667, + "epoch": 0.2984518262437998, + "flos": 15377632174080.0, + "grad_norm": 2.3239421595851475, + "language_loss": 0.66439754, + "learning_rate": 3.290458206523322e-06, + "loss": 0.68619186, + "num_input_tokens_seen": 106890330, + "router_z_loss_clip": 0.88525391, + "router_z_loss_mlp": 0.16125488, + "step": 4964, + "time_per_iteration": 2.440098524093628 + }, + { + "auxiliary_loss_clip": 0.01139961, + "auxiliary_loss_mlp": 0.01034145, + "balance_loss_clip": 1.05145216, + "balance_loss_mlp": 1.02003658, + "epoch": 0.29851194949646775, + "flos": 18108458542080.0, + "grad_norm": 1.7419367436063062, + "language_loss": 0.71258271, + "learning_rate": 3.2901606375634015e-06, + "loss": 0.73432374, + "num_input_tokens_seen": 106909190, + "router_z_loss_clip": 0.88623047, + "router_z_loss_mlp": 0.14117432, + "step": 4965, + "time_per_iteration": 2.4990477561950684 + }, + { + "auxiliary_loss_clip": 0.01158612, + "auxiliary_loss_mlp": 0.01046483, + "balance_loss_clip": 1.06716907, + "balance_loss_mlp": 1.02979386, + "epoch": 0.2985720727491357, + "flos": 22018233104640.0, + "grad_norm": 2.6719075309788534, + "language_loss": 0.66291928, + "learning_rate": 3.289863019680461e-06, + "loss": 0.68497026, + "num_input_tokens_seen": 106927825, + "router_z_loss_clip": 0.91552734, + "router_z_loss_mlp": 0.16674805, + "step": 4966, + "time_per_iteration": 2.4500839710235596 + }, + { + "auxiliary_loss_clip": 0.01147128, + "auxiliary_loss_mlp": 0.01041723, + "balance_loss_clip": 1.05670214, + "balance_loss_mlp": 1.02579677, + "epoch": 0.2986321960018037, + "flos": 13041355772160.0, + "grad_norm": 2.934681489588105, + "language_loss": 0.7413317, + "learning_rate": 3.289565352885785e-06, + "loss": 0.76322019, + "num_input_tokens_seen": 106943155, + "router_z_loss_clip": 0.90429688, + "router_z_loss_mlp": 0.15930176, + "step": 4967, + "time_per_iteration": 2.4348316192626953 + }, + { + "auxiliary_loss_clip": 0.01143955, + "auxiliary_loss_mlp": 0.01032688, + "balance_loss_clip": 1.05455375, + "balance_loss_mlp": 1.01772761, + "epoch": 0.29869231925447165, + "flos": 14465034305280.0, + "grad_norm": 3.028704900859775, + "language_loss": 0.71416211, + "learning_rate": 3.2892676371906614e-06, + "loss": 0.73592854, + "num_input_tokens_seen": 106960295, + "router_z_loss_clip": 0.89355469, + "router_z_loss_mlp": 0.14971924, + "step": 4968, + "time_per_iteration": 2.570920705795288 + }, + { + "auxiliary_loss_clip": 0.01149928, + "auxiliary_loss_mlp": 0.01029692, + "balance_loss_clip": 1.05895162, + "balance_loss_mlp": 1.01398015, + "epoch": 0.2987524425071396, + "flos": 31650228639360.0, + "grad_norm": 1.6308172646396373, + "language_loss": 0.76429117, + "learning_rate": 3.2889698726063805e-06, + "loss": 0.78608733, + "num_input_tokens_seen": 106982870, + "router_z_loss_clip": 0.91064453, + "router_z_loss_mlp": 0.15710449, + "step": 4969, + "time_per_iteration": 2.5976243019104004 + }, + { + "auxiliary_loss_clip": 0.01138559, + "auxiliary_loss_mlp": 0.01033676, + "balance_loss_clip": 1.04964435, + "balance_loss_mlp": 1.01946664, + "epoch": 0.2988125657598076, + "flos": 21433427775360.0, + "grad_norm": 1.8165521273725767, + "language_loss": 0.70127726, + "learning_rate": 3.2886720591442327e-06, + "loss": 0.72299957, + "num_input_tokens_seen": 107002405, + "router_z_loss_clip": 0.88964844, + "router_z_loss_mlp": 0.14221191, + "step": 4970, + "time_per_iteration": 2.5399694442749023 + }, + { + "auxiliary_loss_clip": 0.01145226, + "auxiliary_loss_mlp": 0.01038763, + "balance_loss_clip": 1.05090356, + "balance_loss_mlp": 1.02153766, + "epoch": 0.2988726890124756, + "flos": 18076965292800.0, + "grad_norm": 2.3105486738828778, + "language_loss": 0.84933168, + "learning_rate": 3.2883741968155103e-06, + "loss": 0.87117159, + "num_input_tokens_seen": 107017310, + "router_z_loss_clip": 0.94335938, + "router_z_loss_mlp": 0.17224121, + "step": 4971, + "time_per_iteration": 2.490216016769409 + }, + { + "auxiliary_loss_clip": 0.01143663, + "auxiliary_loss_mlp": 0.0104624, + "balance_loss_clip": 1.05478168, + "balance_loss_mlp": 1.03020608, + "epoch": 0.29893281226514357, + "flos": 21755653706880.0, + "grad_norm": 1.8134558928370688, + "language_loss": 0.79614854, + "learning_rate": 3.2880762856315107e-06, + "loss": 0.81804764, + "num_input_tokens_seen": 107034645, + "router_z_loss_clip": 0.88867188, + "router_z_loss_mlp": 0.16040039, + "step": 4972, + "time_per_iteration": 2.493217706680298 + }, + { + "auxiliary_loss_clip": 0.01154174, + "auxiliary_loss_mlp": 0.01042303, + "balance_loss_clip": 1.06396389, + "balance_loss_mlp": 1.02651989, + "epoch": 0.29899293551781153, + "flos": 16836718538880.0, + "grad_norm": 1.9872464151748916, + "language_loss": 0.85487074, + "learning_rate": 3.2877783256035285e-06, + "loss": 0.87683547, + "num_input_tokens_seen": 107051125, + "router_z_loss_clip": 0.90332031, + "router_z_loss_mlp": 0.15783691, + "step": 4973, + "time_per_iteration": 2.5023534297943115 + }, + { + "auxiliary_loss_clip": 0.01151325, + "auxiliary_loss_mlp": 0.01034437, + "balance_loss_clip": 1.06510425, + "balance_loss_mlp": 1.01893973, + "epoch": 0.2990530587704795, + "flos": 11729215946880.0, + "grad_norm": 1.7273935807726832, + "language_loss": 0.77858829, + "learning_rate": 3.287480316742863e-06, + "loss": 0.80044591, + "num_input_tokens_seen": 107068815, + "router_z_loss_clip": 0.86328125, + "router_z_loss_mlp": 0.1550293, + "step": 4974, + "time_per_iteration": 2.4842658042907715 + }, + { + "auxiliary_loss_clip": 0.01145857, + "auxiliary_loss_mlp": 0.01044724, + "balance_loss_clip": 1.05403399, + "balance_loss_mlp": 1.02910709, + "epoch": 0.29911318202314746, + "flos": 28039877850240.0, + "grad_norm": 1.7973454402637439, + "language_loss": 0.72630584, + "learning_rate": 3.287182259060815e-06, + "loss": 0.74821162, + "num_input_tokens_seen": 107090420, + "router_z_loss_clip": 0.91796875, + "router_z_loss_mlp": 0.15594482, + "step": 4975, + "time_per_iteration": 4.021557807922363 + }, + { + "auxiliary_loss_clip": 0.0114302, + "auxiliary_loss_mlp": 0.01049331, + "balance_loss_clip": 1.05402875, + "balance_loss_mlp": 1.03136575, + "epoch": 0.2991733052758154, + "flos": 18733555952640.0, + "grad_norm": 2.6306541525296603, + "language_loss": 0.76447642, + "learning_rate": 3.286884152568687e-06, + "loss": 0.78639996, + "num_input_tokens_seen": 107107255, + "router_z_loss_clip": 0.88916016, + "router_z_loss_mlp": 0.1796875, + "step": 4976, + "time_per_iteration": 2.459341049194336 + }, + { + "auxiliary_loss_clip": 0.01135555, + "auxiliary_loss_mlp": 0.01041669, + "balance_loss_clip": 1.04914212, + "balance_loss_mlp": 1.0264461, + "epoch": 0.2992334285284834, + "flos": 15559160532480.0, + "grad_norm": 2.099384475958494, + "language_loss": 0.86062634, + "learning_rate": 3.2865859972777827e-06, + "loss": 0.88239861, + "num_input_tokens_seen": 107123840, + "router_z_loss_clip": 0.86328125, + "router_z_loss_mlp": 0.15234375, + "step": 4977, + "time_per_iteration": 2.4793882369995117 + }, + { + "auxiliary_loss_clip": 0.01141057, + "auxiliary_loss_mlp": 0.01036886, + "balance_loss_clip": 1.0521915, + "balance_loss_mlp": 1.02166343, + "epoch": 0.29929355178115136, + "flos": 21797561900160.0, + "grad_norm": 1.6252979446937048, + "language_loss": 0.68751585, + "learning_rate": 3.2862877931994088e-06, + "loss": 0.70929527, + "num_input_tokens_seen": 107143475, + "router_z_loss_clip": 0.88916016, + "router_z_loss_mlp": 0.15222168, + "step": 4978, + "time_per_iteration": 2.5782010555267334 + }, + { + "auxiliary_loss_clip": 0.01150782, + "auxiliary_loss_mlp": 0.01033235, + "balance_loss_clip": 1.06088948, + "balance_loss_mlp": 1.01704633, + "epoch": 0.2993536750338193, + "flos": 21178533888000.0, + "grad_norm": 6.321236826167139, + "language_loss": 0.76814497, + "learning_rate": 3.2859895403448726e-06, + "loss": 0.78998518, + "num_input_tokens_seen": 107161725, + "router_z_loss_clip": 0.89941406, + "router_z_loss_mlp": 0.1619873, + "step": 4979, + "time_per_iteration": 2.537292003631592 + }, + { + "auxiliary_loss_clip": 0.01138228, + "auxiliary_loss_mlp": 0.0103984, + "balance_loss_clip": 1.04872036, + "balance_loss_mlp": 1.02433145, + "epoch": 0.2994137982864873, + "flos": 32122130544000.0, + "grad_norm": 1.7334708005130945, + "language_loss": 0.68498409, + "learning_rate": 3.285691238725484e-06, + "loss": 0.7067647, + "num_input_tokens_seen": 107183935, + "router_z_loss_clip": 0.89404297, + "router_z_loss_mlp": 0.15527344, + "step": 4980, + "time_per_iteration": 2.643342971801758 + }, + { + "auxiliary_loss_clip": 0.01148454, + "auxiliary_loss_mlp": 0.01044248, + "balance_loss_clip": 1.05979919, + "balance_loss_mlp": 1.02867925, + "epoch": 0.29947392153915525, + "flos": 21105419754240.0, + "grad_norm": 2.055006011170986, + "language_loss": 0.73819995, + "learning_rate": 3.285392888352555e-06, + "loss": 0.76012701, + "num_input_tokens_seen": 107204285, + "router_z_loss_clip": 0.88623047, + "router_z_loss_mlp": 0.15563965, + "step": 4981, + "time_per_iteration": 2.6121299266815186 + }, + { + "auxiliary_loss_clip": 0.01149851, + "auxiliary_loss_mlp": 0.01042792, + "balance_loss_clip": 1.05709422, + "balance_loss_mlp": 1.02684188, + "epoch": 0.2995340447918232, + "flos": 21542632099200.0, + "grad_norm": 1.498460449398006, + "language_loss": 0.86500484, + "learning_rate": 3.2850944892373987e-06, + "loss": 0.8869313, + "num_input_tokens_seen": 107225265, + "router_z_loss_clip": 0.92773438, + "router_z_loss_mlp": 0.15942383, + "step": 4982, + "time_per_iteration": 2.4925472736358643 + }, + { + "auxiliary_loss_clip": 0.01150165, + "auxiliary_loss_mlp": 0.01040758, + "balance_loss_clip": 1.05713248, + "balance_loss_mlp": 1.02341354, + "epoch": 0.2995941680444912, + "flos": 16725143917440.0, + "grad_norm": 4.1493987656307, + "language_loss": 0.86399287, + "learning_rate": 3.2847960413913307e-06, + "loss": 0.88590217, + "num_input_tokens_seen": 107241335, + "router_z_loss_clip": 0.93017578, + "router_z_loss_mlp": 0.17346191, + "step": 4983, + "time_per_iteration": 2.4317636489868164 + }, + { + "auxiliary_loss_clip": 0.01152004, + "auxiliary_loss_mlp": 0.01042207, + "balance_loss_clip": 1.06153691, + "balance_loss_mlp": 1.02724051, + "epoch": 0.2996542912971592, + "flos": 20923496346240.0, + "grad_norm": 1.7820072998650582, + "language_loss": 0.78157604, + "learning_rate": 3.284497544825668e-06, + "loss": 0.80351812, + "num_input_tokens_seen": 107259375, + "router_z_loss_clip": 0.90478516, + "router_z_loss_mlp": 0.14959717, + "step": 4984, + "time_per_iteration": 2.547470808029175 + }, + { + "auxiliary_loss_clip": 0.01149617, + "auxiliary_loss_mlp": 0.01040888, + "balance_loss_clip": 1.05880094, + "balance_loss_mlp": 1.02489042, + "epoch": 0.29971441454982717, + "flos": 25079868754560.0, + "grad_norm": 1.6280092328746238, + "language_loss": 0.78777337, + "learning_rate": 3.2841989995517303e-06, + "loss": 0.80967844, + "num_input_tokens_seen": 107279890, + "router_z_loss_clip": 0.90869141, + "router_z_loss_mlp": 0.15979004, + "step": 4985, + "time_per_iteration": 3.8959643840789795 + }, + { + "auxiliary_loss_clip": 0.0115883, + "auxiliary_loss_mlp": 0.01042107, + "balance_loss_clip": 1.06373715, + "balance_loss_mlp": 1.02452409, + "epoch": 0.29977453780249513, + "flos": 52555911840000.0, + "grad_norm": 1.9497032308674649, + "language_loss": 0.72051483, + "learning_rate": 3.283900405580837e-06, + "loss": 0.74252415, + "num_input_tokens_seen": 107303430, + "router_z_loss_clip": 0.95117188, + "router_z_loss_mlp": 0.17578125, + "step": 4986, + "time_per_iteration": 4.086217641830444 + }, + { + "auxiliary_loss_clip": 0.01151713, + "auxiliary_loss_mlp": 0.01045015, + "balance_loss_clip": 1.05619442, + "balance_loss_mlp": 1.02855229, + "epoch": 0.2998346610551631, + "flos": 22237144542720.0, + "grad_norm": 1.90183684306072, + "language_loss": 0.73242116, + "learning_rate": 3.283601762924312e-06, + "loss": 0.75438845, + "num_input_tokens_seen": 107323700, + "router_z_loss_clip": 0.95458984, + "router_z_loss_mlp": 0.16442871, + "step": 4987, + "time_per_iteration": 2.455564498901367 + }, + { + "auxiliary_loss_clip": 0.01138504, + "auxiliary_loss_mlp": 0.01043579, + "balance_loss_clip": 1.05118108, + "balance_loss_mlp": 1.02722311, + "epoch": 0.29989478430783106, + "flos": 16873203778560.0, + "grad_norm": 1.7730625549882169, + "language_loss": 0.80459142, + "learning_rate": 3.2833030715934793e-06, + "loss": 0.82641226, + "num_input_tokens_seen": 107341965, + "router_z_loss_clip": 0.87304688, + "router_z_loss_mlp": 0.16357422, + "step": 4988, + "time_per_iteration": 2.4166877269744873 + }, + { + "auxiliary_loss_clip": 0.01143021, + "auxiliary_loss_mlp": 0.01047415, + "balance_loss_clip": 1.05187488, + "balance_loss_mlp": 1.03069007, + "epoch": 0.29995490756049903, + "flos": 23768878164480.0, + "grad_norm": 1.6818595926580704, + "language_loss": 0.70406783, + "learning_rate": 3.2830043315996658e-06, + "loss": 0.72597218, + "num_input_tokens_seen": 107362615, + "router_z_loss_clip": 0.91113281, + "router_z_loss_mlp": 0.16729736, + "step": 4989, + "time_per_iteration": 2.4915611743927 + }, + { + "auxiliary_loss_clip": 0.01147734, + "auxiliary_loss_mlp": 0.01042141, + "balance_loss_clip": 1.05510926, + "balance_loss_mlp": 1.02549982, + "epoch": 0.300015030813167, + "flos": 14465321614080.0, + "grad_norm": 1.9867231977653161, + "language_loss": 0.850618, + "learning_rate": 3.282705542954199e-06, + "loss": 0.87251681, + "num_input_tokens_seen": 107378980, + "router_z_loss_clip": 0.92724609, + "router_z_loss_mlp": 0.16650391, + "step": 4990, + "time_per_iteration": 2.4733705520629883 + }, + { + "auxiliary_loss_clip": 0.01163206, + "auxiliary_loss_mlp": 0.0103623, + "balance_loss_clip": 1.06522918, + "balance_loss_mlp": 1.01949263, + "epoch": 0.30007515406583496, + "flos": 25191982080000.0, + "grad_norm": 1.8074793763905175, + "language_loss": 0.66873962, + "learning_rate": 3.28240670566841e-06, + "loss": 0.69073397, + "num_input_tokens_seen": 107397640, + "router_z_loss_clip": 0.98095703, + "router_z_loss_mlp": 0.16748047, + "step": 4991, + "time_per_iteration": 2.6449146270751953 + }, + { + "auxiliary_loss_clip": 0.01153262, + "auxiliary_loss_mlp": 0.01039477, + "balance_loss_clip": 1.05656254, + "balance_loss_mlp": 1.02221549, + "epoch": 0.3001352773185029, + "flos": 19391188106880.0, + "grad_norm": 1.824649041371445, + "language_loss": 0.79072833, + "learning_rate": 3.28210781975363e-06, + "loss": 0.81265575, + "num_input_tokens_seen": 107416020, + "router_z_loss_clip": 0.96826172, + "router_z_loss_mlp": 0.17248535, + "step": 4992, + "time_per_iteration": 3.837033987045288 + }, + { + "auxiliary_loss_clip": 0.01147777, + "auxiliary_loss_mlp": 0.01039694, + "balance_loss_clip": 1.05676436, + "balance_loss_mlp": 1.02364862, + "epoch": 0.3001954005711709, + "flos": 21543853161600.0, + "grad_norm": 3.8844386558381263, + "language_loss": 0.82704711, + "learning_rate": 3.281808885221193e-06, + "loss": 0.84892184, + "num_input_tokens_seen": 107436340, + "router_z_loss_clip": 0.91015625, + "router_z_loss_mlp": 0.16052246, + "step": 4993, + "time_per_iteration": 2.868649482727051 + }, + { + "auxiliary_loss_clip": 0.01147211, + "auxiliary_loss_mlp": 0.01049078, + "balance_loss_clip": 1.05290508, + "balance_loss_mlp": 1.03132749, + "epoch": 0.30025552382383885, + "flos": 17384320356480.0, + "grad_norm": 3.3099609013605766, + "language_loss": 0.8641023, + "learning_rate": 3.2815099020824345e-06, + "loss": 0.88606524, + "num_input_tokens_seen": 107454585, + "router_z_loss_clip": 0.94238281, + "router_z_loss_mlp": 0.17749023, + "step": 4994, + "time_per_iteration": 2.462301015853882 + }, + { + "auxiliary_loss_clip": 0.01147037, + "auxiliary_loss_mlp": 0.01036092, + "balance_loss_clip": 1.05611968, + "balance_loss_mlp": 1.01925921, + "epoch": 0.3003156470765068, + "flos": 29533330552320.0, + "grad_norm": 1.6205736806637938, + "language_loss": 0.81174421, + "learning_rate": 3.2812108703486924e-06, + "loss": 0.83357555, + "num_input_tokens_seen": 107477180, + "router_z_loss_clip": 0.90917969, + "router_z_loss_mlp": 0.16833496, + "step": 4995, + "time_per_iteration": 2.4977610111236572 + }, + { + "auxiliary_loss_clip": 0.01140478, + "auxiliary_loss_mlp": 0.01038041, + "balance_loss_clip": 1.05292559, + "balance_loss_mlp": 1.02178061, + "epoch": 0.3003757703291748, + "flos": 43646402465280.0, + "grad_norm": 1.7306229616698885, + "language_loss": 0.67463547, + "learning_rate": 3.2809117900313055e-06, + "loss": 0.69642067, + "num_input_tokens_seen": 107500250, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.16259766, + "step": 4996, + "time_per_iteration": 2.6965086460113525 + }, + { + "auxiliary_loss_clip": 0.01140364, + "auxiliary_loss_mlp": 0.01038337, + "balance_loss_clip": 1.05002379, + "balance_loss_mlp": 1.0212903, + "epoch": 0.30043589358184275, + "flos": 22528380015360.0, + "grad_norm": 1.7376774085451667, + "language_loss": 0.7510941, + "learning_rate": 3.280612661141615e-06, + "loss": 0.77288109, + "num_input_tokens_seen": 107520070, + "router_z_loss_clip": 0.90380859, + "router_z_loss_mlp": 0.1706543, + "step": 4997, + "time_per_iteration": 2.4791903495788574 + }, + { + "auxiliary_loss_clip": 0.01141941, + "auxiliary_loss_mlp": 0.01045132, + "balance_loss_clip": 1.05326307, + "balance_loss_mlp": 1.03007603, + "epoch": 0.30049601683451077, + "flos": 20995892208000.0, + "grad_norm": 1.8281408856682377, + "language_loss": 0.77559119, + "learning_rate": 3.2803134836909646e-06, + "loss": 0.79746193, + "num_input_tokens_seen": 107539285, + "router_z_loss_clip": 0.88769531, + "router_z_loss_mlp": 0.1505127, + "step": 4998, + "time_per_iteration": 2.470233917236328 + }, + { + "auxiliary_loss_clip": 0.01150095, + "auxiliary_loss_mlp": 0.01041296, + "balance_loss_clip": 1.06296742, + "balance_loss_mlp": 1.02612078, + "epoch": 0.30055614008717874, + "flos": 23916004272000.0, + "grad_norm": 1.8479030699944978, + "language_loss": 0.73449475, + "learning_rate": 3.2800142576906985e-06, + "loss": 0.75640869, + "num_input_tokens_seen": 107560260, + "router_z_loss_clip": 0.87060547, + "router_z_loss_mlp": 0.15179443, + "step": 4999, + "time_per_iteration": 2.475071430206299 + }, + { + "auxiliary_loss_clip": 0.01146916, + "auxiliary_loss_mlp": 0.01043522, + "balance_loss_clip": 1.05625927, + "balance_loss_mlp": 1.02759588, + "epoch": 0.3006162633398467, + "flos": 19169798630400.0, + "grad_norm": 1.8230775342283, + "language_loss": 0.75909531, + "learning_rate": 3.2797149831521626e-06, + "loss": 0.78099978, + "num_input_tokens_seen": 107579260, + "router_z_loss_clip": 0.90576172, + "router_z_loss_mlp": 0.15905762, + "step": 5000, + "time_per_iteration": 2.4718587398529053 + }, + { + "auxiliary_loss_clip": 0.01152591, + "auxiliary_loss_mlp": 0.01039863, + "balance_loss_clip": 1.06416154, + "balance_loss_mlp": 1.02571249, + "epoch": 0.30067638659251467, + "flos": 14679241061760.0, + "grad_norm": 2.07621430749592, + "language_loss": 0.82466888, + "learning_rate": 3.2794156600867073e-06, + "loss": 0.84659344, + "num_input_tokens_seen": 107595245, + "router_z_loss_clip": 0.88476562, + "router_z_loss_mlp": 0.14147949, + "step": 5001, + "time_per_iteration": 2.442460775375366 + }, + { + "auxiliary_loss_clip": 0.01158028, + "auxiliary_loss_mlp": 0.01044106, + "balance_loss_clip": 1.06724536, + "balance_loss_mlp": 1.02709496, + "epoch": 0.30073650984518263, + "flos": 23368007404800.0, + "grad_norm": 1.6571796079027883, + "language_loss": 0.80514526, + "learning_rate": 3.2791162885056815e-06, + "loss": 0.82716656, + "num_input_tokens_seen": 107613985, + "router_z_loss_clip": 0.90771484, + "router_z_loss_mlp": 0.17016602, + "step": 5002, + "time_per_iteration": 2.476581573486328 + }, + { + "auxiliary_loss_clip": 0.01154063, + "auxiliary_loss_mlp": 0.01037117, + "balance_loss_clip": 1.06008768, + "balance_loss_mlp": 1.02097583, + "epoch": 0.3007966330978506, + "flos": 22966633854720.0, + "grad_norm": 2.142257689620785, + "language_loss": 0.71295655, + "learning_rate": 3.2788168684204376e-06, + "loss": 0.73486841, + "num_input_tokens_seen": 107631435, + "router_z_loss_clip": 0.94042969, + "router_z_loss_mlp": 0.16125488, + "step": 5003, + "time_per_iteration": 2.485670804977417 + }, + { + "auxiliary_loss_clip": 0.01161376, + "auxiliary_loss_mlp": 0.01043223, + "balance_loss_clip": 1.0686636, + "balance_loss_mlp": 1.02705848, + "epoch": 0.30085675635051856, + "flos": 27818452460160.0, + "grad_norm": 1.9440050689111792, + "language_loss": 0.70446908, + "learning_rate": 3.27851739984233e-06, + "loss": 0.72651505, + "num_input_tokens_seen": 107650530, + "router_z_loss_clip": 0.92626953, + "router_z_loss_mlp": 0.16149902, + "step": 5004, + "time_per_iteration": 2.526655435562134 + }, + { + "auxiliary_loss_clip": 0.01152767, + "auxiliary_loss_mlp": 0.01043186, + "balance_loss_clip": 1.05994129, + "balance_loss_mlp": 1.02690148, + "epoch": 0.3009168796031865, + "flos": 10882729059840.0, + "grad_norm": 2.5492156042815877, + "language_loss": 0.81211782, + "learning_rate": 3.278217882782715e-06, + "loss": 0.83407736, + "num_input_tokens_seen": 107662240, + "router_z_loss_clip": 0.92822266, + "router_z_loss_mlp": 0.1628418, + "step": 5005, + "time_per_iteration": 2.4067234992980957 + }, + { + "auxiliary_loss_clip": 0.01148385, + "auxiliary_loss_mlp": 0.01046208, + "balance_loss_clip": 1.05813909, + "balance_loss_mlp": 1.02893484, + "epoch": 0.3009770028558545, + "flos": 23805399317760.0, + "grad_norm": 2.448654509482912, + "language_loss": 0.74901885, + "learning_rate": 3.2779183172529497e-06, + "loss": 0.77096486, + "num_input_tokens_seen": 107680330, + "router_z_loss_clip": 0.90283203, + "router_z_loss_mlp": 0.17248535, + "step": 5006, + "time_per_iteration": 2.5494778156280518 + }, + { + "auxiliary_loss_clip": 0.01145669, + "auxiliary_loss_mlp": 0.01046715, + "balance_loss_clip": 1.05508935, + "balance_loss_mlp": 1.03043139, + "epoch": 0.30103712610852246, + "flos": 26468211283200.0, + "grad_norm": 2.811739341118976, + "language_loss": 0.71569788, + "learning_rate": 3.2776187032643932e-06, + "loss": 0.73762178, + "num_input_tokens_seen": 107700020, + "router_z_loss_clip": 0.90576172, + "router_z_loss_mlp": 0.16271973, + "step": 5007, + "time_per_iteration": 2.897076368331909 + }, + { + "auxiliary_loss_clip": 0.01150197, + "auxiliary_loss_mlp": 0.01038301, + "balance_loss_clip": 1.05850482, + "balance_loss_mlp": 1.02137375, + "epoch": 0.3010972493611904, + "flos": 22856459863680.0, + "grad_norm": 2.220262283649207, + "language_loss": 0.76324058, + "learning_rate": 3.2773190408284075e-06, + "loss": 0.78512561, + "num_input_tokens_seen": 107718575, + "router_z_loss_clip": 0.91699219, + "router_z_loss_mlp": 0.16931152, + "step": 5008, + "time_per_iteration": 2.4836089611053467 + }, + { + "auxiliary_loss_clip": 0.0115113, + "auxiliary_loss_mlp": 0.0103997, + "balance_loss_clip": 1.06195664, + "balance_loss_mlp": 1.02415061, + "epoch": 0.3011573726138584, + "flos": 24053685102720.0, + "grad_norm": 1.9787981257994873, + "language_loss": 0.84615779, + "learning_rate": 3.2770193299563564e-06, + "loss": 0.86806881, + "num_input_tokens_seen": 107738635, + "router_z_loss_clip": 0.89257812, + "router_z_loss_mlp": 0.15808105, + "step": 5009, + "time_per_iteration": 2.5372908115386963 + }, + { + "auxiliary_loss_clip": 0.011487, + "auxiliary_loss_mlp": 0.01041329, + "balance_loss_clip": 1.05426073, + "balance_loss_mlp": 1.02341187, + "epoch": 0.30121749586652635, + "flos": 20259687052800.0, + "grad_norm": 1.8586892511151465, + "language_loss": 0.8412745, + "learning_rate": 3.276719570659604e-06, + "loss": 0.8631748, + "num_input_tokens_seen": 107753415, + "router_z_loss_clip": 0.94580078, + "router_z_loss_mlp": 0.17907715, + "step": 5010, + "time_per_iteration": 2.4667646884918213 + }, + { + "auxiliary_loss_clip": 0.01145983, + "auxiliary_loss_mlp": 0.01039011, + "balance_loss_clip": 1.05670357, + "balance_loss_mlp": 1.02399063, + "epoch": 0.3012776191191944, + "flos": 26943058103040.0, + "grad_norm": 2.787413035115638, + "language_loss": 0.85187054, + "learning_rate": 3.2764197629495176e-06, + "loss": 0.87372047, + "num_input_tokens_seen": 107773840, + "router_z_loss_clip": 0.89208984, + "router_z_loss_mlp": 0.15039062, + "step": 5011, + "time_per_iteration": 2.477128505706787 + }, + { + "auxiliary_loss_clip": 0.01156273, + "auxiliary_loss_mlp": 0.01040985, + "balance_loss_clip": 1.06006026, + "balance_loss_mlp": 1.02419996, + "epoch": 0.30133774237186234, + "flos": 20412307941120.0, + "grad_norm": 2.696369751553641, + "language_loss": 0.7223739, + "learning_rate": 3.2761199068374656e-06, + "loss": 0.7443465, + "num_input_tokens_seen": 107792020, + "router_z_loss_clip": 0.96240234, + "router_z_loss_mlp": 0.16796875, + "step": 5012, + "time_per_iteration": 2.436245918273926 + }, + { + "auxiliary_loss_clip": 0.01142162, + "auxiliary_loss_mlp": 0.01041871, + "balance_loss_clip": 1.05157804, + "balance_loss_mlp": 1.02595687, + "epoch": 0.3013978656245303, + "flos": 19792453916160.0, + "grad_norm": 2.4581733264880223, + "language_loss": 0.87822795, + "learning_rate": 3.275820002334819e-06, + "loss": 0.90006828, + "num_input_tokens_seen": 107809595, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.15917969, + "step": 5013, + "time_per_iteration": 2.461606740951538 + }, + { + "auxiliary_loss_clip": 0.0114951, + "auxiliary_loss_mlp": 0.01044121, + "balance_loss_clip": 1.05634069, + "balance_loss_mlp": 1.02626371, + "epoch": 0.30145798887719827, + "flos": 16249650652800.0, + "grad_norm": 3.0764670494639748, + "language_loss": 0.82875669, + "learning_rate": 3.2755200494529496e-06, + "loss": 0.85069299, + "num_input_tokens_seen": 107827230, + "router_z_loss_clip": 0.93164062, + "router_z_loss_mlp": 0.17858887, + "step": 5014, + "time_per_iteration": 2.446850299835205 + }, + { + "auxiliary_loss_clip": 0.01138471, + "auxiliary_loss_mlp": 0.01038047, + "balance_loss_clip": 1.05142975, + "balance_loss_mlp": 1.02231097, + "epoch": 0.30151811212986623, + "flos": 24571733005440.0, + "grad_norm": 1.5920891967270159, + "language_loss": 0.684196, + "learning_rate": 3.2752200482032323e-06, + "loss": 0.70596111, + "num_input_tokens_seen": 107847195, + "router_z_loss_clip": 0.87011719, + "router_z_loss_mlp": 0.15734863, + "step": 5015, + "time_per_iteration": 2.533893346786499 + }, + { + "auxiliary_loss_clip": 0.01141308, + "auxiliary_loss_mlp": 0.010419, + "balance_loss_clip": 1.05145323, + "balance_loss_mlp": 1.02558029, + "epoch": 0.3015782353825342, + "flos": 21872076664320.0, + "grad_norm": 2.2430026973633055, + "language_loss": 0.7422936, + "learning_rate": 3.2749199985970436e-06, + "loss": 0.76412559, + "num_input_tokens_seen": 107866420, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.1630249, + "step": 5016, + "time_per_iteration": 2.467158079147339 + }, + { + "auxiliary_loss_clip": 0.01148852, + "auxiliary_loss_mlp": 0.01041683, + "balance_loss_clip": 1.05537152, + "balance_loss_mlp": 1.02514851, + "epoch": 0.30163835863520216, + "flos": 28769331248640.0, + "grad_norm": 1.6334177302285031, + "language_loss": 0.65491027, + "learning_rate": 3.2746199006457603e-06, + "loss": 0.67681563, + "num_input_tokens_seen": 107889090, + "router_z_loss_clip": 0.93457031, + "router_z_loss_mlp": 0.16540527, + "step": 5017, + "time_per_iteration": 2.503697395324707 + }, + { + "auxiliary_loss_clip": 0.01143301, + "auxiliary_loss_mlp": 0.01047129, + "balance_loss_clip": 1.05239511, + "balance_loss_mlp": 1.03061867, + "epoch": 0.30169848188787013, + "flos": 22966202891520.0, + "grad_norm": 2.123138241214775, + "language_loss": 0.68779302, + "learning_rate": 3.2743197543607628e-06, + "loss": 0.70969737, + "num_input_tokens_seen": 107907520, + "router_z_loss_clip": 0.90917969, + "router_z_loss_mlp": 0.16516113, + "step": 5018, + "time_per_iteration": 3.935662031173706 + }, + { + "auxiliary_loss_clip": 0.01144629, + "auxiliary_loss_mlp": 0.01042247, + "balance_loss_clip": 1.05721271, + "balance_loss_mlp": 1.02777481, + "epoch": 0.3017586051405381, + "flos": 21835268202240.0, + "grad_norm": 1.9971312151586293, + "language_loss": 0.78548431, + "learning_rate": 3.2740195597534327e-06, + "loss": 0.80735302, + "num_input_tokens_seen": 107925650, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.14477539, + "step": 5019, + "time_per_iteration": 2.43660831451416 + }, + { + "auxiliary_loss_clip": 0.01144417, + "auxiliary_loss_mlp": 0.01043654, + "balance_loss_clip": 1.05444002, + "balance_loss_mlp": 1.02806139, + "epoch": 0.30181872839320606, + "flos": 22160403135360.0, + "grad_norm": 1.9592693281614018, + "language_loss": 0.69600397, + "learning_rate": 3.2737193168351527e-06, + "loss": 0.71788472, + "num_input_tokens_seen": 107943975, + "router_z_loss_clip": 0.89941406, + "router_z_loss_mlp": 0.15576172, + "step": 5020, + "time_per_iteration": 2.9656822681427 + }, + { + "auxiliary_loss_clip": 0.0115054, + "auxiliary_loss_mlp": 0.01048662, + "balance_loss_clip": 1.05695295, + "balance_loss_mlp": 1.03247333, + "epoch": 0.301878851645874, + "flos": 18114168804480.0, + "grad_norm": 1.9297138178660393, + "language_loss": 0.78674054, + "learning_rate": 3.2734190256173085e-06, + "loss": 0.80873257, + "num_input_tokens_seen": 107962950, + "router_z_loss_clip": 0.93603516, + "router_z_loss_mlp": 0.16186523, + "step": 5021, + "time_per_iteration": 2.4891886711120605 + }, + { + "auxiliary_loss_clip": 0.01143368, + "auxiliary_loss_mlp": 0.01041238, + "balance_loss_clip": 1.05166793, + "balance_loss_mlp": 1.02493, + "epoch": 0.301938974898542, + "flos": 17602226213760.0, + "grad_norm": 2.1012268355184194, + "language_loss": 0.76227975, + "learning_rate": 3.2731186861112877e-06, + "loss": 0.7841258, + "num_input_tokens_seen": 107979700, + "router_z_loss_clip": 0.91503906, + "router_z_loss_mlp": 0.16320801, + "step": 5022, + "time_per_iteration": 2.4400854110717773 + }, + { + "auxiliary_loss_clip": 0.011477, + "auxiliary_loss_mlp": 0.01043652, + "balance_loss_clip": 1.05558658, + "balance_loss_mlp": 1.02733243, + "epoch": 0.30199909815120995, + "flos": 11181219079680.0, + "grad_norm": 2.0819747872735324, + "language_loss": 0.69152403, + "learning_rate": 3.2728182983284793e-06, + "loss": 0.7134375, + "num_input_tokens_seen": 107996645, + "router_z_loss_clip": 0.92236328, + "router_z_loss_mlp": 0.16308594, + "step": 5023, + "time_per_iteration": 2.399531126022339 + }, + { + "auxiliary_loss_clip": 0.01166819, + "auxiliary_loss_mlp": 0.01036594, + "balance_loss_clip": 1.07026136, + "balance_loss_mlp": 1.02022672, + "epoch": 0.302059221403878, + "flos": 21907843632000.0, + "grad_norm": 2.5944995123080794, + "language_loss": 0.71778786, + "learning_rate": 3.2725178622802724e-06, + "loss": 0.73982203, + "num_input_tokens_seen": 108015020, + "router_z_loss_clip": 0.96582031, + "router_z_loss_mlp": 0.16369629, + "step": 5024, + "time_per_iteration": 2.5048413276672363 + }, + { + "auxiliary_loss_clip": 0.01143668, + "auxiliary_loss_mlp": 0.01042621, + "balance_loss_clip": 1.05486703, + "balance_loss_mlp": 1.02622974, + "epoch": 0.30211934465654594, + "flos": 26396390039040.0, + "grad_norm": 1.7948881512530301, + "language_loss": 0.74508607, + "learning_rate": 3.272217377978061e-06, + "loss": 0.76694894, + "num_input_tokens_seen": 108036430, + "router_z_loss_clip": 0.88867188, + "router_z_loss_mlp": 0.16394043, + "step": 5025, + "time_per_iteration": 2.538731575012207 + }, + { + "auxiliary_loss_clip": 0.01160561, + "auxiliary_loss_mlp": 0.0104631, + "balance_loss_clip": 1.06994915, + "balance_loss_mlp": 1.0310812, + "epoch": 0.3021794679092139, + "flos": 23400470321280.0, + "grad_norm": 1.773184879251294, + "language_loss": 0.66998804, + "learning_rate": 3.2719168454332387e-06, + "loss": 0.69205666, + "num_input_tokens_seen": 108054250, + "router_z_loss_clip": 0.90771484, + "router_z_loss_mlp": 0.15228271, + "step": 5026, + "time_per_iteration": 2.489671468734741 + }, + { + "auxiliary_loss_clip": 0.01153635, + "auxiliary_loss_mlp": 0.01047003, + "balance_loss_clip": 1.06280923, + "balance_loss_mlp": 1.03037357, + "epoch": 0.30223959116188187, + "flos": 20260979942400.0, + "grad_norm": 3.6226627081793326, + "language_loss": 0.85435796, + "learning_rate": 3.2716162646572034e-06, + "loss": 0.87636429, + "num_input_tokens_seen": 108071495, + "router_z_loss_clip": 0.90869141, + "router_z_loss_mlp": 0.16625977, + "step": 5027, + "time_per_iteration": 2.4513566493988037 + }, + { + "auxiliary_loss_clip": 0.01139731, + "auxiliary_loss_mlp": 0.0103889, + "balance_loss_clip": 1.05173695, + "balance_loss_mlp": 1.02400041, + "epoch": 0.30229971441454984, + "flos": 26687840993280.0, + "grad_norm": 1.709350848590461, + "language_loss": 0.7876507, + "learning_rate": 3.271315635661351e-06, + "loss": 0.80943692, + "num_input_tokens_seen": 108092135, + "router_z_loss_clip": 0.87841797, + "router_z_loss_mlp": 0.14904785, + "step": 5028, + "time_per_iteration": 2.504373550415039 + }, + { + "auxiliary_loss_clip": 0.01172593, + "auxiliary_loss_mlp": 0.0104539, + "balance_loss_clip": 1.07544899, + "balance_loss_mlp": 1.02831888, + "epoch": 0.3023598376672178, + "flos": 34345323953280.0, + "grad_norm": 2.161621852452035, + "language_loss": 0.77306449, + "learning_rate": 3.2710149584570826e-06, + "loss": 0.79524434, + "num_input_tokens_seen": 108112945, + "router_z_loss_clip": 0.97167969, + "router_z_loss_mlp": 0.1706543, + "step": 5029, + "time_per_iteration": 5.4090306758880615 + }, + { + "auxiliary_loss_clip": 0.01160149, + "auxiliary_loss_mlp": 0.01036941, + "balance_loss_clip": 1.06656396, + "balance_loss_mlp": 1.01969159, + "epoch": 0.30241996091988577, + "flos": 23112143850240.0, + "grad_norm": 2.1456022047894594, + "language_loss": 0.81894076, + "learning_rate": 3.2707142330557993e-06, + "loss": 0.84091163, + "num_input_tokens_seen": 108130325, + "router_z_loss_clip": 0.93554688, + "router_z_loss_mlp": 0.17248535, + "step": 5030, + "time_per_iteration": 2.4518628120422363 + }, + { + "auxiliary_loss_clip": 0.01153395, + "auxiliary_loss_mlp": 0.01045506, + "balance_loss_clip": 1.05946803, + "balance_loss_mlp": 1.02816081, + "epoch": 0.30248008417255373, + "flos": 19390002958080.0, + "grad_norm": 2.7607487315342176, + "language_loss": 0.69940901, + "learning_rate": 3.270413459468905e-06, + "loss": 0.721398, + "num_input_tokens_seen": 108150300, + "router_z_loss_clip": 0.94042969, + "router_z_loss_mlp": 0.17346191, + "step": 5031, + "time_per_iteration": 2.4552676677703857 + }, + { + "auxiliary_loss_clip": 0.01149783, + "auxiliary_loss_mlp": 0.01038229, + "balance_loss_clip": 1.05867934, + "balance_loss_mlp": 1.0218854, + "epoch": 0.3025402074252217, + "flos": 23769704177280.0, + "grad_norm": 2.0407541501697666, + "language_loss": 0.82522118, + "learning_rate": 3.2701126377078047e-06, + "loss": 0.84710133, + "num_input_tokens_seen": 108170330, + "router_z_loss_clip": 0.91113281, + "router_z_loss_mlp": 0.16333008, + "step": 5032, + "time_per_iteration": 2.563871145248413 + }, + { + "auxiliary_loss_clip": 0.01165625, + "auxiliary_loss_mlp": 0.01045482, + "balance_loss_clip": 1.06889665, + "balance_loss_mlp": 1.02738631, + "epoch": 0.30260033067788966, + "flos": 25994118648960.0, + "grad_norm": 2.191955808447729, + "language_loss": 0.73542404, + "learning_rate": 3.269811767783906e-06, + "loss": 0.75753516, + "num_input_tokens_seen": 108191265, + "router_z_loss_clip": 0.96777344, + "router_z_loss_mlp": 0.18096924, + "step": 5033, + "time_per_iteration": 2.8554062843322754 + }, + { + "auxiliary_loss_clip": 0.01145538, + "auxiliary_loss_mlp": 0.01055401, + "balance_loss_clip": 1.05398059, + "balance_loss_mlp": 1.03629196, + "epoch": 0.3026604539305576, + "flos": 25374551932800.0, + "grad_norm": 1.849359003067634, + "language_loss": 0.74296319, + "learning_rate": 3.2695108497086185e-06, + "loss": 0.76497257, + "num_input_tokens_seen": 108211615, + "router_z_loss_clip": 0.91601562, + "router_z_loss_mlp": 0.19116211, + "step": 5034, + "time_per_iteration": 2.5029633045196533 + }, + { + "auxiliary_loss_clip": 0.01146121, + "auxiliary_loss_mlp": 0.01033634, + "balance_loss_clip": 1.05549788, + "balance_loss_mlp": 1.01733816, + "epoch": 0.3027205771832256, + "flos": 25812733944960.0, + "grad_norm": 1.8697906029313522, + "language_loss": 0.72126627, + "learning_rate": 3.269209883493352e-06, + "loss": 0.74306381, + "num_input_tokens_seen": 108231080, + "router_z_loss_clip": 0.90722656, + "router_z_loss_mlp": 0.16296387, + "step": 5035, + "time_per_iteration": 3.879612684249878 + }, + { + "auxiliary_loss_clip": 0.01151567, + "auxiliary_loss_mlp": 0.01043922, + "balance_loss_clip": 1.05835474, + "balance_loss_mlp": 1.02850819, + "epoch": 0.30278070043589356, + "flos": 27344539393920.0, + "grad_norm": 1.9363498589418182, + "language_loss": 0.87648523, + "learning_rate": 3.2689088691495196e-06, + "loss": 0.89844006, + "num_input_tokens_seen": 108251125, + "router_z_loss_clip": 0.93164062, + "router_z_loss_mlp": 0.1541748, + "step": 5036, + "time_per_iteration": 2.5363073348999023 + }, + { + "auxiliary_loss_clip": 0.01143357, + "auxiliary_loss_mlp": 0.01049115, + "balance_loss_clip": 1.05398333, + "balance_loss_mlp": 1.03134131, + "epoch": 0.3028408236885616, + "flos": 24786227070720.0, + "grad_norm": 3.166800748351767, + "language_loss": 0.7790736, + "learning_rate": 3.268607806688536e-06, + "loss": 0.80099833, + "num_input_tokens_seen": 108272545, + "router_z_loss_clip": 0.89160156, + "router_z_loss_mlp": 0.17773438, + "step": 5037, + "time_per_iteration": 2.536630392074585 + }, + { + "auxiliary_loss_clip": 0.01157934, + "auxiliary_loss_mlp": 0.01042855, + "balance_loss_clip": 1.06534767, + "balance_loss_mlp": 1.02590299, + "epoch": 0.30290094694122954, + "flos": 12932474670720.0, + "grad_norm": 2.752988053924723, + "language_loss": 0.77746218, + "learning_rate": 3.268306696121816e-06, + "loss": 0.79947007, + "num_input_tokens_seen": 108289725, + "router_z_loss_clip": 0.92724609, + "router_z_loss_mlp": 0.16955566, + "step": 5038, + "time_per_iteration": 2.461935520172119 + }, + { + "auxiliary_loss_clip": 0.0114623, + "auxiliary_loss_mlp": 0.01038678, + "balance_loss_clip": 1.05768108, + "balance_loss_mlp": 1.02335954, + "epoch": 0.3029610701938975, + "flos": 25916443488000.0, + "grad_norm": 3.065576432528307, + "language_loss": 0.73952961, + "learning_rate": 3.2680055374607804e-06, + "loss": 0.76137871, + "num_input_tokens_seen": 108310690, + "router_z_loss_clip": 0.88476562, + "router_z_loss_mlp": 0.1529541, + "step": 5039, + "time_per_iteration": 2.5044972896575928 + }, + { + "auxiliary_loss_clip": 0.01151705, + "auxiliary_loss_mlp": 0.01047173, + "balance_loss_clip": 1.06231928, + "balance_loss_mlp": 1.03079319, + "epoch": 0.3030211934465655, + "flos": 21980993679360.0, + "grad_norm": 1.882802375205703, + "language_loss": 0.79481614, + "learning_rate": 3.267704330716847e-06, + "loss": 0.81680495, + "num_input_tokens_seen": 108328905, + "router_z_loss_clip": 0.89404297, + "router_z_loss_mlp": 0.16381836, + "step": 5040, + "time_per_iteration": 2.5086541175842285 + }, + { + "auxiliary_loss_clip": 0.01153024, + "auxiliary_loss_mlp": 0.0104133, + "balance_loss_clip": 1.06366336, + "balance_loss_mlp": 1.02627349, + "epoch": 0.30308131669923344, + "flos": 20991977625600.0, + "grad_norm": 2.0873194986939847, + "language_loss": 0.81742048, + "learning_rate": 3.267403075901438e-06, + "loss": 0.83936399, + "num_input_tokens_seen": 108346680, + "router_z_loss_clip": 0.89208984, + "router_z_loss_mlp": 0.15057373, + "step": 5041, + "time_per_iteration": 2.447192907333374 + }, + { + "auxiliary_loss_clip": 0.01121954, + "auxiliary_loss_mlp": 0.01025007, + "balance_loss_clip": 1.08958173, + "balance_loss_mlp": 1.02305746, + "epoch": 0.3031414399519014, + "flos": 60548875827840.0, + "grad_norm": 0.764563749610593, + "language_loss": 0.5948081, + "learning_rate": 3.267101773025978e-06, + "loss": 0.61627775, + "num_input_tokens_seen": 108413885, + "router_z_loss_clip": 0.32373047, + "router_z_loss_mlp": 0.01947021, + "step": 5042, + "time_per_iteration": 3.2021074295043945 + }, + { + "auxiliary_loss_clip": 0.01157739, + "auxiliary_loss_mlp": 0.01037596, + "balance_loss_clip": 1.06567669, + "balance_loss_mlp": 1.02162218, + "epoch": 0.30320156320456937, + "flos": 21907664064000.0, + "grad_norm": 1.714578558379304, + "language_loss": 0.7148332, + "learning_rate": 3.266800422101892e-06, + "loss": 0.73678654, + "num_input_tokens_seen": 108433640, + "router_z_loss_clip": 0.92041016, + "router_z_loss_mlp": 0.1595459, + "step": 5043, + "time_per_iteration": 2.530597686767578 + }, + { + "auxiliary_loss_clip": 0.01150865, + "auxiliary_loss_mlp": 0.01030726, + "balance_loss_clip": 1.06116068, + "balance_loss_mlp": 1.01502657, + "epoch": 0.30326168645723733, + "flos": 21652770176640.0, + "grad_norm": 1.80899212981621, + "language_loss": 0.70473665, + "learning_rate": 3.266499023140606e-06, + "loss": 0.72655261, + "num_input_tokens_seen": 108452640, + "router_z_loss_clip": 0.89697266, + "router_z_loss_mlp": 0.15698242, + "step": 5044, + "time_per_iteration": 2.443999767303467 + }, + { + "auxiliary_loss_clip": 0.01149393, + "auxiliary_loss_mlp": 0.01040754, + "balance_loss_clip": 1.06010079, + "balance_loss_mlp": 1.0251255, + "epoch": 0.3033218097099053, + "flos": 21871286565120.0, + "grad_norm": 1.3590610779979069, + "language_loss": 0.77448201, + "learning_rate": 3.2661975761535513e-06, + "loss": 0.7963835, + "num_input_tokens_seen": 108472470, + "router_z_loss_clip": 0.89355469, + "router_z_loss_mlp": 0.15637207, + "step": 5045, + "time_per_iteration": 2.5771379470825195 + }, + { + "auxiliary_loss_clip": 0.01151538, + "auxiliary_loss_mlp": 0.01037468, + "balance_loss_clip": 1.06059754, + "balance_loss_mlp": 1.02028966, + "epoch": 0.30338193296257326, + "flos": 27089717333760.0, + "grad_norm": 2.3079731153268677, + "language_loss": 0.72190338, + "learning_rate": 3.2658960811521564e-06, + "loss": 0.74379349, + "num_input_tokens_seen": 108493025, + "router_z_loss_clip": 0.90917969, + "router_z_loss_mlp": 0.171875, + "step": 5046, + "time_per_iteration": 2.762289047241211 + }, + { + "auxiliary_loss_clip": 0.01147298, + "auxiliary_loss_mlp": 0.01036052, + "balance_loss_clip": 1.05493712, + "balance_loss_mlp": 1.01739526, + "epoch": 0.30344205621524123, + "flos": 19534363718400.0, + "grad_norm": 1.6639503681963477, + "language_loss": 0.80621862, + "learning_rate": 3.2655945381478564e-06, + "loss": 0.8280521, + "num_input_tokens_seen": 108513480, + "router_z_loss_clip": 0.92382812, + "router_z_loss_mlp": 0.18652344, + "step": 5047, + "time_per_iteration": 2.5029287338256836 + }, + { + "auxiliary_loss_clip": 0.01147071, + "auxiliary_loss_mlp": 0.01055298, + "balance_loss_clip": 1.05463767, + "balance_loss_mlp": 1.0379647, + "epoch": 0.3035021794679092, + "flos": 23910976368000.0, + "grad_norm": 2.5444402266989923, + "language_loss": 0.72089738, + "learning_rate": 3.265292947152084e-06, + "loss": 0.74292111, + "num_input_tokens_seen": 108533155, + "router_z_loss_clip": 0.92285156, + "router_z_loss_mlp": 0.17333984, + "step": 5048, + "time_per_iteration": 2.4797685146331787 + }, + { + "auxiliary_loss_clip": 0.01147882, + "auxiliary_loss_mlp": 0.01036664, + "balance_loss_clip": 1.05630517, + "balance_loss_mlp": 1.02131033, + "epoch": 0.30356230272057716, + "flos": 16143606725760.0, + "grad_norm": 1.818169867333547, + "language_loss": 0.75878203, + "learning_rate": 3.2649913081762763e-06, + "loss": 0.78062749, + "num_input_tokens_seen": 108551900, + "router_z_loss_clip": 0.91503906, + "router_z_loss_mlp": 0.15356445, + "step": 5049, + "time_per_iteration": 2.45664381980896 + }, + { + "auxiliary_loss_clip": 0.01157419, + "auxiliary_loss_mlp": 0.01038635, + "balance_loss_clip": 1.06361198, + "balance_loss_mlp": 1.0234592, + "epoch": 0.3036224259732452, + "flos": 28914697589760.0, + "grad_norm": 1.5376558769655069, + "language_loss": 0.81933343, + "learning_rate": 3.2646896212318717e-06, + "loss": 0.84129399, + "num_input_tokens_seen": 108574005, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.15185547, + "step": 5050, + "time_per_iteration": 2.537187099456787 + }, + { + "auxiliary_loss_clip": 0.0115231, + "auxiliary_loss_mlp": 0.01041753, + "balance_loss_clip": 1.06171823, + "balance_loss_mlp": 1.0246588, + "epoch": 0.30368254922591315, + "flos": 21105599322240.0, + "grad_norm": 2.717700350284904, + "language_loss": 0.73587024, + "learning_rate": 3.2643878863303106e-06, + "loss": 0.75781095, + "num_input_tokens_seen": 108592715, + "router_z_loss_clip": 0.90576172, + "router_z_loss_mlp": 0.1706543, + "step": 5051, + "time_per_iteration": 2.482675075531006 + }, + { + "auxiliary_loss_clip": 0.01150629, + "auxiliary_loss_mlp": 0.01035795, + "balance_loss_clip": 1.06090879, + "balance_loss_mlp": 1.02120984, + "epoch": 0.3037426724785811, + "flos": 23002293081600.0, + "grad_norm": 2.873227639869455, + "language_loss": 0.76446676, + "learning_rate": 3.264086103483033e-06, + "loss": 0.78633094, + "num_input_tokens_seen": 108611770, + "router_z_loss_clip": 0.89697266, + "router_z_loss_mlp": 0.14599609, + "step": 5052, + "time_per_iteration": 2.46034574508667 + }, + { + "auxiliary_loss_clip": 0.01149155, + "auxiliary_loss_mlp": 0.01039938, + "balance_loss_clip": 1.05711532, + "balance_loss_mlp": 1.02385628, + "epoch": 0.3038027957312491, + "flos": 15632705629440.0, + "grad_norm": 2.2007243914424626, + "language_loss": 0.82454222, + "learning_rate": 3.2637842727014836e-06, + "loss": 0.84643316, + "num_input_tokens_seen": 108629070, + "router_z_loss_clip": 0.91992188, + "router_z_loss_mlp": 0.16082764, + "step": 5053, + "time_per_iteration": 2.633629560470581 + }, + { + "auxiliary_loss_clip": 0.01150017, + "auxiliary_loss_mlp": 0.01046502, + "balance_loss_clip": 1.05497718, + "balance_loss_mlp": 1.02937162, + "epoch": 0.30386291898391704, + "flos": 12713994195840.0, + "grad_norm": 1.6619798298110646, + "language_loss": 0.71031404, + "learning_rate": 3.2634823939971083e-06, + "loss": 0.73227924, + "num_input_tokens_seen": 108646315, + "router_z_loss_clip": 0.95019531, + "router_z_loss_mlp": 0.17126465, + "step": 5054, + "time_per_iteration": 2.4255404472351074 + }, + { + "auxiliary_loss_clip": 0.01155478, + "auxiliary_loss_mlp": 0.01044742, + "balance_loss_clip": 1.06403422, + "balance_loss_mlp": 1.02835035, + "epoch": 0.303923042236585, + "flos": 26359437922560.0, + "grad_norm": 1.8810599974054314, + "language_loss": 0.69142044, + "learning_rate": 3.2631804673813545e-06, + "loss": 0.71342266, + "num_input_tokens_seen": 108665920, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.16394043, + "step": 5055, + "time_per_iteration": 2.542083501815796 + }, + { + "auxiliary_loss_clip": 0.0114999, + "auxiliary_loss_mlp": 0.01038437, + "balance_loss_clip": 1.05877733, + "balance_loss_mlp": 1.02242732, + "epoch": 0.30398316548925297, + "flos": 19719232041600.0, + "grad_norm": 3.4955344660317813, + "language_loss": 0.67567301, + "learning_rate": 3.2628784928656707e-06, + "loss": 0.69755721, + "num_input_tokens_seen": 108683485, + "router_z_loss_clip": 0.91210938, + "router_z_loss_mlp": 0.16015625, + "step": 5056, + "time_per_iteration": 2.474222183227539 + }, + { + "auxiliary_loss_clip": 0.01152015, + "auxiliary_loss_mlp": 0.01045911, + "balance_loss_clip": 1.0609467, + "balance_loss_mlp": 1.02978849, + "epoch": 0.30404328874192094, + "flos": 24239846315520.0, + "grad_norm": 1.7676985473739861, + "language_loss": 0.82567871, + "learning_rate": 3.262576470461507e-06, + "loss": 0.84765792, + "num_input_tokens_seen": 108702700, + "router_z_loss_clip": 0.91064453, + "router_z_loss_mlp": 0.16131592, + "step": 5057, + "time_per_iteration": 2.5191545486450195 + }, + { + "auxiliary_loss_clip": 0.01140283, + "auxiliary_loss_mlp": 0.01051933, + "balance_loss_clip": 1.05161786, + "balance_loss_mlp": 1.03394461, + "epoch": 0.3041034119945889, + "flos": 24498942094080.0, + "grad_norm": 2.3528605823484035, + "language_loss": 0.8911773, + "learning_rate": 3.2622744001803176e-06, + "loss": 0.91309941, + "num_input_tokens_seen": 108721860, + "router_z_loss_clip": 0.88720703, + "router_z_loss_mlp": 0.17993164, + "step": 5058, + "time_per_iteration": 2.5957019329071045 + }, + { + "auxiliary_loss_clip": 0.01152794, + "auxiliary_loss_mlp": 0.01045357, + "balance_loss_clip": 1.06117189, + "balance_loss_mlp": 1.02902555, + "epoch": 0.30416353524725687, + "flos": 28288881907200.0, + "grad_norm": 1.852600150442258, + "language_loss": 0.71701431, + "learning_rate": 3.2619722820335564e-06, + "loss": 0.73899585, + "num_input_tokens_seen": 108743215, + "router_z_loss_clip": 0.91552734, + "router_z_loss_mlp": 0.16320801, + "step": 5059, + "time_per_iteration": 2.7468373775482178 + }, + { + "auxiliary_loss_clip": 0.01151544, + "auxiliary_loss_mlp": 0.01047655, + "balance_loss_clip": 1.06134331, + "balance_loss_mlp": 1.03047657, + "epoch": 0.30422365849992483, + "flos": 23660392112640.0, + "grad_norm": 1.643785556506754, + "language_loss": 0.7318064, + "learning_rate": 3.26167011603268e-06, + "loss": 0.75379837, + "num_input_tokens_seen": 108765505, + "router_z_loss_clip": 0.90283203, + "router_z_loss_mlp": 0.17175293, + "step": 5060, + "time_per_iteration": 2.8193023204803467 + }, + { + "auxiliary_loss_clip": 0.01146371, + "auxiliary_loss_mlp": 0.01039912, + "balance_loss_clip": 1.05574036, + "balance_loss_mlp": 1.02300835, + "epoch": 0.3042837817525928, + "flos": 22998773548800.0, + "grad_norm": 1.8664217176088218, + "language_loss": 0.77000123, + "learning_rate": 3.2613679021891463e-06, + "loss": 0.79186404, + "num_input_tokens_seen": 108783370, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.16894531, + "step": 5061, + "time_per_iteration": 4.196758508682251 + }, + { + "auxiliary_loss_clip": 0.01153628, + "auxiliary_loss_mlp": 0.01049136, + "balance_loss_clip": 1.06109059, + "balance_loss_mlp": 1.02918053, + "epoch": 0.30434390500526076, + "flos": 22082332924800.0, + "grad_norm": 2.0515470026389253, + "language_loss": 0.82232678, + "learning_rate": 3.261065640514415e-06, + "loss": 0.84435445, + "num_input_tokens_seen": 108797430, + "router_z_loss_clip": 0.92431641, + "router_z_loss_mlp": 0.19958496, + "step": 5062, + "time_per_iteration": 2.645975112915039 + }, + { + "auxiliary_loss_clip": 0.01146066, + "auxiliary_loss_mlp": 0.0104783, + "balance_loss_clip": 1.05343413, + "balance_loss_mlp": 1.03185606, + "epoch": 0.3044040282579287, + "flos": 25483504861440.0, + "grad_norm": 2.095567738124567, + "language_loss": 0.74937272, + "learning_rate": 3.2607633310199483e-06, + "loss": 0.7713117, + "num_input_tokens_seen": 108816945, + "router_z_loss_clip": 0.92675781, + "router_z_loss_mlp": 0.15966797, + "step": 5063, + "time_per_iteration": 2.511848211288452 + }, + { + "auxiliary_loss_clip": 0.0115205, + "auxiliary_loss_mlp": 0.0104051, + "balance_loss_clip": 1.06078255, + "balance_loss_mlp": 1.02314126, + "epoch": 0.30446415151059675, + "flos": 21945478106880.0, + "grad_norm": 1.6995473828558247, + "language_loss": 0.83917844, + "learning_rate": 3.26046097371721e-06, + "loss": 0.86110401, + "num_input_tokens_seen": 108836615, + "router_z_loss_clip": 0.91210938, + "router_z_loss_mlp": 0.17382812, + "step": 5064, + "time_per_iteration": 2.518739700317383 + }, + { + "auxiliary_loss_clip": 0.01151051, + "auxiliary_loss_mlp": 0.01038882, + "balance_loss_clip": 1.0598855, + "balance_loss_mlp": 1.02156043, + "epoch": 0.3045242747632647, + "flos": 16435416816000.0, + "grad_norm": 2.152541228615868, + "language_loss": 0.76443684, + "learning_rate": 3.2601585686176655e-06, + "loss": 0.78633618, + "num_input_tokens_seen": 108855165, + "router_z_loss_clip": 0.91162109, + "router_z_loss_mlp": 0.1730957, + "step": 5065, + "time_per_iteration": 2.450239419937134 + }, + { + "auxiliary_loss_clip": 0.01163791, + "auxiliary_loss_mlp": 0.01045458, + "balance_loss_clip": 1.06477475, + "balance_loss_mlp": 1.02801776, + "epoch": 0.3045843980159327, + "flos": 31540341957120.0, + "grad_norm": 1.6041856860349162, + "language_loss": 0.61863649, + "learning_rate": 3.2598561157327814e-06, + "loss": 0.64072901, + "num_input_tokens_seen": 108874690, + "router_z_loss_clip": 0.98876953, + "router_z_loss_mlp": 0.17443848, + "step": 5066, + "time_per_iteration": 2.57863712310791 + }, + { + "auxiliary_loss_clip": 0.01160229, + "auxiliary_loss_mlp": 0.01044859, + "balance_loss_clip": 1.06649756, + "balance_loss_mlp": 1.02853906, + "epoch": 0.30464452126860064, + "flos": 17853636481920.0, + "grad_norm": 1.9020504526200923, + "language_loss": 0.82902366, + "learning_rate": 3.2595536150740265e-06, + "loss": 0.85107452, + "num_input_tokens_seen": 108893140, + "router_z_loss_clip": 0.93847656, + "router_z_loss_mlp": 0.16320801, + "step": 5067, + "time_per_iteration": 2.441462278366089 + }, + { + "auxiliary_loss_clip": 0.01142303, + "auxiliary_loss_mlp": 0.01040346, + "balance_loss_clip": 1.05467224, + "balance_loss_mlp": 1.02525449, + "epoch": 0.3047046445212686, + "flos": 20631398947200.0, + "grad_norm": 2.208931418812665, + "language_loss": 0.63449562, + "learning_rate": 3.259251066652873e-06, + "loss": 0.65632212, + "num_input_tokens_seen": 108911880, + "router_z_loss_clip": 0.87646484, + "router_z_loss_mlp": 0.15106201, + "step": 5068, + "time_per_iteration": 2.4985220432281494 + }, + { + "auxiliary_loss_clip": 0.01146151, + "auxiliary_loss_mlp": 0.01032988, + "balance_loss_clip": 1.05715156, + "balance_loss_mlp": 1.01691282, + "epoch": 0.3047647677739366, + "flos": 21287594557440.0, + "grad_norm": 1.7946540003717828, + "language_loss": 0.7527535, + "learning_rate": 3.258948470480793e-06, + "loss": 0.77454489, + "num_input_tokens_seen": 108930440, + "router_z_loss_clip": 0.89111328, + "router_z_loss_mlp": 0.16070557, + "step": 5069, + "time_per_iteration": 2.524085760116577 + }, + { + "auxiliary_loss_clip": 0.01149418, + "auxiliary_loss_mlp": 0.0105112, + "balance_loss_clip": 1.06184888, + "balance_loss_mlp": 1.03406143, + "epoch": 0.30482489102660454, + "flos": 20995928121600.0, + "grad_norm": 2.005628554728748, + "language_loss": 0.75382817, + "learning_rate": 3.258645826569261e-06, + "loss": 0.77583361, + "num_input_tokens_seen": 108949125, + "router_z_loss_clip": 0.87597656, + "router_z_loss_mlp": 0.17053223, + "step": 5070, + "time_per_iteration": 2.5041232109069824 + }, + { + "auxiliary_loss_clip": 0.0116642, + "auxiliary_loss_mlp": 0.01041251, + "balance_loss_clip": 1.07079172, + "balance_loss_mlp": 1.02400136, + "epoch": 0.3048850142792725, + "flos": 26290812988800.0, + "grad_norm": 1.8186541281516428, + "language_loss": 0.81166267, + "learning_rate": 3.2583431349297527e-06, + "loss": 0.8337394, + "num_input_tokens_seen": 108972190, + "router_z_loss_clip": 0.95605469, + "router_z_loss_mlp": 0.17236328, + "step": 5071, + "time_per_iteration": 2.5266242027282715 + }, + { + "auxiliary_loss_clip": 0.01151555, + "auxiliary_loss_mlp": 0.01050143, + "balance_loss_clip": 1.05712581, + "balance_loss_mlp": 1.03095055, + "epoch": 0.30494513753194047, + "flos": 22346241125760.0, + "grad_norm": 1.7153977570281533, + "language_loss": 0.7571705, + "learning_rate": 3.2580403955737467e-06, + "loss": 0.77918756, + "num_input_tokens_seen": 108990325, + "router_z_loss_clip": 0.9453125, + "router_z_loss_mlp": 0.19189453, + "step": 5072, + "time_per_iteration": 3.9442696571350098 + }, + { + "auxiliary_loss_clip": 0.01147721, + "auxiliary_loss_mlp": 0.01044594, + "balance_loss_clip": 1.05779803, + "balance_loss_mlp": 1.02826214, + "epoch": 0.30500526078460843, + "flos": 19537667769600.0, + "grad_norm": 2.8417476902278773, + "language_loss": 0.71107388, + "learning_rate": 3.257737608512723e-06, + "loss": 0.732997, + "num_input_tokens_seen": 109009505, + "router_z_loss_clip": 0.89941406, + "router_z_loss_mlp": 0.16320801, + "step": 5073, + "time_per_iteration": 4.167989492416382 + }, + { + "auxiliary_loss_clip": 0.01150104, + "auxiliary_loss_mlp": 0.01043707, + "balance_loss_clip": 1.05457747, + "balance_loss_mlp": 1.02619505, + "epoch": 0.3050653840372764, + "flos": 14465321614080.0, + "grad_norm": 2.3831876446199405, + "language_loss": 0.76179671, + "learning_rate": 3.257434773758163e-06, + "loss": 0.78373486, + "num_input_tokens_seen": 109026350, + "router_z_loss_clip": 0.95410156, + "router_z_loss_mlp": 0.17492676, + "step": 5074, + "time_per_iteration": 2.4291911125183105 + }, + { + "auxiliary_loss_clip": 0.01151697, + "auxiliary_loss_mlp": 0.0104078, + "balance_loss_clip": 1.06012964, + "balance_loss_mlp": 1.02443671, + "epoch": 0.30512550728994436, + "flos": 24243796811520.0, + "grad_norm": 2.080119126812702, + "language_loss": 0.74450517, + "learning_rate": 3.25713189132155e-06, + "loss": 0.76642996, + "num_input_tokens_seen": 109044165, + "router_z_loss_clip": 0.91552734, + "router_z_loss_mlp": 0.16345215, + "step": 5075, + "time_per_iteration": 2.5080583095550537 + }, + { + "auxiliary_loss_clip": 0.01159082, + "auxiliary_loss_mlp": 0.01042472, + "balance_loss_clip": 1.06622243, + "balance_loss_mlp": 1.02473402, + "epoch": 0.30518563054261233, + "flos": 16360542915840.0, + "grad_norm": 2.352270297352086, + "language_loss": 0.75779092, + "learning_rate": 3.2568289612143703e-06, + "loss": 0.77980649, + "num_input_tokens_seen": 109060665, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 0.17736816, + "step": 5076, + "time_per_iteration": 2.418264389038086 + }, + { + "auxiliary_loss_clip": 0.01150382, + "auxiliary_loss_mlp": 0.0103695, + "balance_loss_clip": 1.05869603, + "balance_loss_mlp": 1.02098823, + "epoch": 0.30524575379528035, + "flos": 21579584215680.0, + "grad_norm": 1.6558259830789746, + "language_loss": 0.79526365, + "learning_rate": 3.25652598344811e-06, + "loss": 0.817137, + "num_input_tokens_seen": 109080035, + "router_z_loss_clip": 0.91699219, + "router_z_loss_mlp": 0.1595459, + "step": 5077, + "time_per_iteration": 2.4887309074401855 + }, + { + "auxiliary_loss_clip": 0.01145913, + "auxiliary_loss_mlp": 0.01032915, + "balance_loss_clip": 1.05874181, + "balance_loss_mlp": 1.01832366, + "epoch": 0.3053058770479483, + "flos": 16545231671040.0, + "grad_norm": 1.7001644909189861, + "language_loss": 0.74809444, + "learning_rate": 3.256222958034259e-06, + "loss": 0.7698828, + "num_input_tokens_seen": 109097385, + "router_z_loss_clip": 0.87158203, + "router_z_loss_mlp": 0.14599609, + "step": 5078, + "time_per_iteration": 3.9468812942504883 + }, + { + "auxiliary_loss_clip": 0.01140914, + "auxiliary_loss_mlp": 0.01052137, + "balance_loss_clip": 1.0532341, + "balance_loss_mlp": 1.03439808, + "epoch": 0.3053660003006163, + "flos": 12312907954560.0, + "grad_norm": 2.264275996138001, + "language_loss": 0.67056245, + "learning_rate": 3.255919884984307e-06, + "loss": 0.69249296, + "num_input_tokens_seen": 109115495, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.1776123, + "step": 5079, + "time_per_iteration": 2.4351704120635986 + }, + { + "auxiliary_loss_clip": 0.01147796, + "auxiliary_loss_mlp": 0.0103906, + "balance_loss_clip": 1.0568006, + "balance_loss_mlp": 1.02297902, + "epoch": 0.30542612355328425, + "flos": 23112287504640.0, + "grad_norm": 1.9666156088331275, + "language_loss": 0.79811865, + "learning_rate": 3.2556167643097477e-06, + "loss": 0.81998718, + "num_input_tokens_seen": 109134235, + "router_z_loss_clip": 0.90966797, + "router_z_loss_mlp": 0.16088867, + "step": 5080, + "time_per_iteration": 2.4720444679260254 + }, + { + "auxiliary_loss_clip": 0.01154901, + "auxiliary_loss_mlp": 0.01036622, + "balance_loss_clip": 1.0620259, + "balance_loss_mlp": 1.02086246, + "epoch": 0.3054862468059522, + "flos": 24389450461440.0, + "grad_norm": 2.9228168044047114, + "language_loss": 0.80724347, + "learning_rate": 3.255313596022074e-06, + "loss": 0.82915866, + "num_input_tokens_seen": 109152760, + "router_z_loss_clip": 0.92724609, + "router_z_loss_mlp": 0.15759277, + "step": 5081, + "time_per_iteration": 2.517587423324585 + }, + { + "auxiliary_loss_clip": 0.01161199, + "auxiliary_loss_mlp": 0.01041554, + "balance_loss_clip": 1.07018435, + "balance_loss_mlp": 1.02540755, + "epoch": 0.3055463700586202, + "flos": 29386096704000.0, + "grad_norm": 1.6431658970615135, + "language_loss": 0.71732509, + "learning_rate": 3.255010380132783e-06, + "loss": 0.73935258, + "num_input_tokens_seen": 109173925, + "router_z_loss_clip": 0.91064453, + "router_z_loss_mlp": 0.16131592, + "step": 5082, + "time_per_iteration": 2.5396015644073486 + }, + { + "auxiliary_loss_clip": 0.01151242, + "auxiliary_loss_mlp": 0.01044917, + "balance_loss_clip": 1.05594015, + "balance_loss_mlp": 1.0271194, + "epoch": 0.30560649331128814, + "flos": 25591775431680.0, + "grad_norm": 1.893092605537779, + "language_loss": 0.73015493, + "learning_rate": 3.2547071166533736e-06, + "loss": 0.75211656, + "num_input_tokens_seen": 109192510, + "router_z_loss_clip": 0.95361328, + "router_z_loss_mlp": 0.17797852, + "step": 5083, + "time_per_iteration": 2.5996365547180176 + }, + { + "auxiliary_loss_clip": 0.01164268, + "auxiliary_loss_mlp": 0.01040543, + "balance_loss_clip": 1.06857502, + "balance_loss_mlp": 1.02306652, + "epoch": 0.3056666165639561, + "flos": 19128321400320.0, + "grad_norm": 2.029674107728799, + "language_loss": 0.70901537, + "learning_rate": 3.254403805595344e-06, + "loss": 0.73106349, + "num_input_tokens_seen": 109210885, + "router_z_loss_clip": 0.95703125, + "router_z_loss_mlp": 0.17480469, + "step": 5084, + "time_per_iteration": 2.4491541385650635 + }, + { + "auxiliary_loss_clip": 0.01174113, + "auxiliary_loss_mlp": 0.01034275, + "balance_loss_clip": 1.07857263, + "balance_loss_mlp": 1.01746607, + "epoch": 0.30572673981662407, + "flos": 15523860441600.0, + "grad_norm": 1.8435060221254913, + "language_loss": 0.78082228, + "learning_rate": 3.2541004469701962e-06, + "loss": 0.80290616, + "num_input_tokens_seen": 109229180, + "router_z_loss_clip": 0.95458984, + "router_z_loss_mlp": 0.16821289, + "step": 5085, + "time_per_iteration": 2.4582159519195557 + }, + { + "auxiliary_loss_clip": 0.01149676, + "auxiliary_loss_mlp": 0.01039725, + "balance_loss_clip": 1.05821276, + "balance_loss_mlp": 1.02334559, + "epoch": 0.30578686306929204, + "flos": 21506541909120.0, + "grad_norm": 1.7341588526849616, + "language_loss": 0.78550541, + "learning_rate": 3.2537970407894342e-06, + "loss": 0.80739945, + "num_input_tokens_seen": 109249510, + "router_z_loss_clip": 0.91455078, + "router_z_loss_mlp": 0.16369629, + "step": 5086, + "time_per_iteration": 2.941755771636963 + }, + { + "auxiliary_loss_clip": 0.01167546, + "auxiliary_loss_mlp": 0.01045745, + "balance_loss_clip": 1.07379532, + "balance_loss_mlp": 1.0283289, + "epoch": 0.30584698632196, + "flos": 20954271323520.0, + "grad_norm": 1.9058170688679943, + "language_loss": 0.76702368, + "learning_rate": 3.253493587064563e-06, + "loss": 0.78915656, + "num_input_tokens_seen": 109268200, + "router_z_loss_clip": 0.93945312, + "router_z_loss_mlp": 0.17425537, + "step": 5087, + "time_per_iteration": 2.5072436332702637 + }, + { + "auxiliary_loss_clip": 0.01162387, + "auxiliary_loss_mlp": 0.01039321, + "balance_loss_clip": 1.06957221, + "balance_loss_mlp": 1.02174997, + "epoch": 0.30590710957462797, + "flos": 24681116897280.0, + "grad_norm": 3.774207930524722, + "language_loss": 0.72409451, + "learning_rate": 3.2531900858070885e-06, + "loss": 0.74611163, + "num_input_tokens_seen": 109288370, + "router_z_loss_clip": 0.92871094, + "router_z_loss_mlp": 0.17565918, + "step": 5088, + "time_per_iteration": 2.560868501663208 + }, + { + "auxiliary_loss_clip": 0.01158603, + "auxiliary_loss_mlp": 0.01039076, + "balance_loss_clip": 1.06201839, + "balance_loss_mlp": 1.02227962, + "epoch": 0.30596723282729593, + "flos": 17086907744640.0, + "grad_norm": 2.6797290829574076, + "language_loss": 0.792934, + "learning_rate": 3.252886537028521e-06, + "loss": 0.81491077, + "num_input_tokens_seen": 109306730, + "router_z_loss_clip": 0.96435547, + "router_z_loss_mlp": 0.16784668, + "step": 5089, + "time_per_iteration": 2.492558479309082 + }, + { + "auxiliary_loss_clip": 0.01165488, + "auxiliary_loss_mlp": 0.01045367, + "balance_loss_clip": 1.07182479, + "balance_loss_mlp": 1.02835608, + "epoch": 0.30602735607996395, + "flos": 22857106308480.0, + "grad_norm": 1.814631948853059, + "language_loss": 0.773399, + "learning_rate": 3.2525829407403703e-06, + "loss": 0.79550761, + "num_input_tokens_seen": 109327360, + "router_z_loss_clip": 0.93603516, + "router_z_loss_mlp": 0.17028809, + "step": 5090, + "time_per_iteration": 2.553398847579956 + }, + { + "auxiliary_loss_clip": 0.01156052, + "auxiliary_loss_mlp": 0.01050498, + "balance_loss_clip": 1.06248295, + "balance_loss_mlp": 1.03385627, + "epoch": 0.3060874793326319, + "flos": 29861482227840.0, + "grad_norm": 2.465205518782285, + "language_loss": 0.76396257, + "learning_rate": 3.2522792969541488e-06, + "loss": 0.78602803, + "num_input_tokens_seen": 109348135, + "router_z_loss_clip": 0.93603516, + "router_z_loss_mlp": 0.16638184, + "step": 5091, + "time_per_iteration": 2.5915510654449463 + }, + { + "auxiliary_loss_clip": 0.01154646, + "auxiliary_loss_mlp": 0.01039486, + "balance_loss_clip": 1.05954814, + "balance_loss_mlp": 1.02297544, + "epoch": 0.3061476025852999, + "flos": 20448577699200.0, + "grad_norm": 1.8926374186838741, + "language_loss": 0.72210348, + "learning_rate": 3.2519756056813705e-06, + "loss": 0.74404478, + "num_input_tokens_seen": 109366220, + "router_z_loss_clip": 0.95117188, + "router_z_loss_mlp": 0.16516113, + "step": 5092, + "time_per_iteration": 2.4587676525115967 + }, + { + "auxiliary_loss_clip": 0.01157445, + "auxiliary_loss_mlp": 0.01043163, + "balance_loss_clip": 1.06331897, + "balance_loss_mlp": 1.02771342, + "epoch": 0.30620772583796785, + "flos": 19391475415680.0, + "grad_norm": 2.3005217857797753, + "language_loss": 0.82565701, + "learning_rate": 3.2516718669335522e-06, + "loss": 0.84766304, + "num_input_tokens_seen": 109385260, + "router_z_loss_clip": 0.94042969, + "router_z_loss_mlp": 0.15441895, + "step": 5093, + "time_per_iteration": 2.4409706592559814 + }, + { + "auxiliary_loss_clip": 0.01152152, + "auxiliary_loss_mlp": 0.01043633, + "balance_loss_clip": 1.05984163, + "balance_loss_mlp": 1.0276829, + "epoch": 0.3062678490906358, + "flos": 24024562151040.0, + "grad_norm": 2.057387059471377, + "language_loss": 0.75133675, + "learning_rate": 3.2513680807222114e-06, + "loss": 0.77329463, + "num_input_tokens_seen": 109405025, + "router_z_loss_clip": 0.92285156, + "router_z_loss_mlp": 0.1595459, + "step": 5094, + "time_per_iteration": 2.4813480377197266 + }, + { + "auxiliary_loss_clip": 0.01157708, + "auxiliary_loss_mlp": 0.0104358, + "balance_loss_clip": 1.06454229, + "balance_loss_mlp": 1.02749884, + "epoch": 0.3063279723433038, + "flos": 19754639873280.0, + "grad_norm": 2.0248600788711806, + "language_loss": 0.7589376, + "learning_rate": 3.251064247058868e-06, + "loss": 0.78095043, + "num_input_tokens_seen": 109422465, + "router_z_loss_clip": 0.93212891, + "router_z_loss_mlp": 0.16094971, + "step": 5095, + "time_per_iteration": 2.5407135486602783 + }, + { + "auxiliary_loss_clip": 0.01157783, + "auxiliary_loss_mlp": 0.01043508, + "balance_loss_clip": 1.06945109, + "balance_loss_mlp": 1.02774906, + "epoch": 0.30638809559597174, + "flos": 22450022496000.0, + "grad_norm": 4.217339450395093, + "language_loss": 0.80363208, + "learning_rate": 3.250760365955042e-06, + "loss": 0.82564497, + "num_input_tokens_seen": 109440575, + "router_z_loss_clip": 0.88330078, + "router_z_loss_mlp": 0.15771484, + "step": 5096, + "time_per_iteration": 2.4792370796203613 + }, + { + "auxiliary_loss_clip": 0.01151516, + "auxiliary_loss_mlp": 0.01038281, + "balance_loss_clip": 1.05858326, + "balance_loss_mlp": 1.02235508, + "epoch": 0.3064482188486397, + "flos": 17165157523200.0, + "grad_norm": 2.071041357881467, + "language_loss": 0.81655973, + "learning_rate": 3.250456437422258e-06, + "loss": 0.8384577, + "num_input_tokens_seen": 109459050, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 0.15930176, + "step": 5097, + "time_per_iteration": 2.4298813343048096 + }, + { + "auxiliary_loss_clip": 0.01166087, + "auxiliary_loss_mlp": 0.01043115, + "balance_loss_clip": 1.07121134, + "balance_loss_mlp": 1.02537632, + "epoch": 0.3065083421013077, + "flos": 23768483114880.0, + "grad_norm": 2.9179351765772594, + "language_loss": 0.7810533, + "learning_rate": 3.250152461472041e-06, + "loss": 0.80314535, + "num_input_tokens_seen": 109475860, + "router_z_loss_clip": 0.94824219, + "router_z_loss_mlp": 0.17724609, + "step": 5098, + "time_per_iteration": 2.4792964458465576 + }, + { + "auxiliary_loss_clip": 0.0115478, + "auxiliary_loss_mlp": 0.01041087, + "balance_loss_clip": 1.0632447, + "balance_loss_mlp": 1.02558982, + "epoch": 0.30656846535397564, + "flos": 26431833784320.0, + "grad_norm": 1.853514081808921, + "language_loss": 0.84151399, + "learning_rate": 3.249848438115917e-06, + "loss": 0.8634727, + "num_input_tokens_seen": 109494760, + "router_z_loss_clip": 0.91552734, + "router_z_loss_mlp": 0.1552124, + "step": 5099, + "time_per_iteration": 2.501483917236328 + }, + { + "auxiliary_loss_clip": 0.01152176, + "auxiliary_loss_mlp": 0.01048106, + "balance_loss_clip": 1.0585506, + "balance_loss_mlp": 1.0310235, + "epoch": 0.3066285886066436, + "flos": 26651786716800.0, + "grad_norm": 1.920955090527403, + "language_loss": 0.85477901, + "learning_rate": 3.2495443673654148e-06, + "loss": 0.87678182, + "num_input_tokens_seen": 109516480, + "router_z_loss_clip": 0.93603516, + "router_z_loss_mlp": 0.17077637, + "step": 5100, + "time_per_iteration": 2.882011651992798 + }, + { + "auxiliary_loss_clip": 0.01154527, + "auxiliary_loss_mlp": 0.01037857, + "balance_loss_clip": 1.06183875, + "balance_loss_mlp": 1.0197258, + "epoch": 0.30668871185931157, + "flos": 15049947375360.0, + "grad_norm": 2.3724680694312013, + "language_loss": 0.79305267, + "learning_rate": 3.249240249232065e-06, + "loss": 0.81497651, + "num_input_tokens_seen": 109534615, + "router_z_loss_clip": 0.92675781, + "router_z_loss_mlp": 0.18115234, + "step": 5101, + "time_per_iteration": 2.470465898513794 + }, + { + "auxiliary_loss_clip": 0.01160413, + "auxiliary_loss_mlp": 0.01051923, + "balance_loss_clip": 1.06538129, + "balance_loss_mlp": 1.03377974, + "epoch": 0.30674883511197953, + "flos": 20082109190400.0, + "grad_norm": 1.561219930626924, + "language_loss": 0.80214185, + "learning_rate": 3.2489360837273998e-06, + "loss": 0.82426518, + "num_input_tokens_seen": 109554040, + "router_z_loss_clip": 0.95068359, + "router_z_loss_mlp": 0.18139648, + "step": 5102, + "time_per_iteration": 2.543151617050171 + }, + { + "auxiliary_loss_clip": 0.01153706, + "auxiliary_loss_mlp": 0.01042871, + "balance_loss_clip": 1.06235194, + "balance_loss_mlp": 1.02515662, + "epoch": 0.30680895836464755, + "flos": 22893807029760.0, + "grad_norm": 1.996035676068977, + "language_loss": 0.88824594, + "learning_rate": 3.2486318708629532e-06, + "loss": 0.91021168, + "num_input_tokens_seen": 109574345, + "router_z_loss_clip": 0.91357422, + "router_z_loss_mlp": 0.17712402, + "step": 5103, + "time_per_iteration": 2.4577245712280273 + }, + { + "auxiliary_loss_clip": 0.01159574, + "auxiliary_loss_mlp": 0.01046928, + "balance_loss_clip": 1.06654215, + "balance_loss_mlp": 1.03014326, + "epoch": 0.3068690816173155, + "flos": 23696159080320.0, + "grad_norm": 1.8778884954495874, + "language_loss": 0.73658121, + "learning_rate": 3.2483276106502607e-06, + "loss": 0.75864625, + "num_input_tokens_seen": 109593670, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 0.16784668, + "step": 5104, + "time_per_iteration": 2.5322272777557373 + }, + { + "auxiliary_loss_clip": 0.01154085, + "auxiliary_loss_mlp": 0.01043169, + "balance_loss_clip": 1.05864453, + "balance_loss_mlp": 1.02602696, + "epoch": 0.3069292048699835, + "flos": 23551044134400.0, + "grad_norm": 1.8499343608456356, + "language_loss": 0.72866231, + "learning_rate": 3.2480233031008605e-06, + "loss": 0.75063485, + "num_input_tokens_seen": 109613385, + "router_z_loss_clip": 0.95458984, + "router_z_loss_mlp": 0.17150879, + "step": 5105, + "time_per_iteration": 3.869065761566162 + }, + { + "auxiliary_loss_clip": 0.01166235, + "auxiliary_loss_mlp": 0.01044013, + "balance_loss_clip": 1.07301676, + "balance_loss_mlp": 1.02678692, + "epoch": 0.30698932812265145, + "flos": 24531656405760.0, + "grad_norm": 3.217652920679291, + "language_loss": 0.87014169, + "learning_rate": 3.2477189482262916e-06, + "loss": 0.8922441, + "num_input_tokens_seen": 109632395, + "router_z_loss_clip": 0.93310547, + "router_z_loss_mlp": 0.17236328, + "step": 5106, + "time_per_iteration": 2.5888636112213135 + }, + { + "auxiliary_loss_clip": 0.01164015, + "auxiliary_loss_mlp": 0.01048402, + "balance_loss_clip": 1.06661654, + "balance_loss_mlp": 1.03053308, + "epoch": 0.3070494513753194, + "flos": 20996430912000.0, + "grad_norm": 2.333939393151208, + "language_loss": 0.71738899, + "learning_rate": 3.2474145460380945e-06, + "loss": 0.73951316, + "num_input_tokens_seen": 109651380, + "router_z_loss_clip": 0.97558594, + "router_z_loss_mlp": 0.17871094, + "step": 5107, + "time_per_iteration": 2.556987762451172 + }, + { + "auxiliary_loss_clip": 0.01157835, + "auxiliary_loss_mlp": 0.01050854, + "balance_loss_clip": 1.06704307, + "balance_loss_mlp": 1.03321147, + "epoch": 0.3071095746279874, + "flos": 19025940660480.0, + "grad_norm": 3.369756618846619, + "language_loss": 0.72102916, + "learning_rate": 3.247110096547814e-06, + "loss": 0.74311602, + "num_input_tokens_seen": 109670240, + "router_z_loss_clip": 0.90673828, + "router_z_loss_mlp": 0.1763916, + "step": 5108, + "time_per_iteration": 2.5101847648620605 + }, + { + "auxiliary_loss_clip": 0.01164172, + "auxiliary_loss_mlp": 0.01037873, + "balance_loss_clip": 1.07082748, + "balance_loss_mlp": 1.02170217, + "epoch": 0.30716969788065535, + "flos": 21215521918080.0, + "grad_norm": 1.516569930264392, + "language_loss": 0.85869741, + "learning_rate": 3.2468055997669926e-06, + "loss": 0.88071787, + "num_input_tokens_seen": 109690810, + "router_z_loss_clip": 0.93261719, + "router_z_loss_mlp": 0.16143799, + "step": 5109, + "time_per_iteration": 2.5024960041046143 + }, + { + "auxiliary_loss_clip": 0.01155915, + "auxiliary_loss_mlp": 0.01036348, + "balance_loss_clip": 1.06252468, + "balance_loss_mlp": 1.02065456, + "epoch": 0.3072298211333233, + "flos": 25772765086080.0, + "grad_norm": 2.1089235888706463, + "language_loss": 0.67552316, + "learning_rate": 3.2465010557071788e-06, + "loss": 0.69744575, + "num_input_tokens_seen": 109711145, + "router_z_loss_clip": 0.93408203, + "router_z_loss_mlp": 0.15704346, + "step": 5110, + "time_per_iteration": 2.6217751502990723 + }, + { + "auxiliary_loss_clip": 0.01148135, + "auxiliary_loss_mlp": 0.01030851, + "balance_loss_clip": 1.06024528, + "balance_loss_mlp": 1.01580644, + "epoch": 0.3072899443859913, + "flos": 25848931875840.0, + "grad_norm": 1.5903575664498635, + "language_loss": 0.76856232, + "learning_rate": 3.246196464379919e-06, + "loss": 0.79035211, + "num_input_tokens_seen": 109731425, + "router_z_loss_clip": 0.87988281, + "router_z_loss_mlp": 0.15039062, + "step": 5111, + "time_per_iteration": 2.717653751373291 + }, + { + "auxiliary_loss_clip": 0.01146792, + "auxiliary_loss_mlp": 0.01038963, + "balance_loss_clip": 1.05632341, + "balance_loss_mlp": 1.02176666, + "epoch": 0.30735006763865924, + "flos": 25922800195200.0, + "grad_norm": 2.1779346087218414, + "language_loss": 0.67075551, + "learning_rate": 3.245891825796765e-06, + "loss": 0.69261301, + "num_input_tokens_seen": 109752720, + "router_z_loss_clip": 0.90478516, + "router_z_loss_mlp": 0.17193604, + "step": 5112, + "time_per_iteration": 2.6782307624816895 + }, + { + "auxiliary_loss_clip": 0.01154118, + "auxiliary_loss_mlp": 0.01045066, + "balance_loss_clip": 1.05800045, + "balance_loss_mlp": 1.02526522, + "epoch": 0.3074101908913272, + "flos": 30917004312960.0, + "grad_norm": 2.04637434885995, + "language_loss": 0.79534459, + "learning_rate": 3.2455871399692678e-06, + "loss": 0.81733644, + "num_input_tokens_seen": 109772840, + "router_z_loss_clip": 0.96191406, + "router_z_loss_mlp": 0.19787598, + "step": 5113, + "time_per_iteration": 2.9861247539520264 + }, + { + "auxiliary_loss_clip": 0.01156407, + "auxiliary_loss_mlp": 0.01040862, + "balance_loss_clip": 1.06145024, + "balance_loss_mlp": 1.02449489, + "epoch": 0.30747031414399517, + "flos": 18401058731520.0, + "grad_norm": 2.2637432364328376, + "language_loss": 0.76504719, + "learning_rate": 3.2452824069089815e-06, + "loss": 0.78701985, + "num_input_tokens_seen": 109790150, + "router_z_loss_clip": 0.94873047, + "router_z_loss_mlp": 0.16369629, + "step": 5114, + "time_per_iteration": 2.591888904571533 + }, + { + "auxiliary_loss_clip": 0.01157785, + "auxiliary_loss_mlp": 0.01033707, + "balance_loss_clip": 1.06677127, + "balance_loss_mlp": 1.01614761, + "epoch": 0.30753043739666314, + "flos": 22633166966400.0, + "grad_norm": 3.389332633889315, + "language_loss": 0.61951506, + "learning_rate": 3.2449776266274623e-06, + "loss": 0.64142996, + "num_input_tokens_seen": 109807985, + "router_z_loss_clip": 0.90917969, + "router_z_loss_mlp": 0.17553711, + "step": 5115, + "time_per_iteration": 3.9656739234924316 + }, + { + "auxiliary_loss_clip": 0.01151231, + "auxiliary_loss_mlp": 0.01044019, + "balance_loss_clip": 1.05918205, + "balance_loss_mlp": 1.0273416, + "epoch": 0.3075905606493311, + "flos": 27344072517120.0, + "grad_norm": 2.1514489844773457, + "language_loss": 0.82612908, + "learning_rate": 3.2446727991362657e-06, + "loss": 0.84808159, + "num_input_tokens_seen": 109825920, + "router_z_loss_clip": 0.92041016, + "router_z_loss_mlp": 0.16674805, + "step": 5116, + "time_per_iteration": 3.8798787593841553 + }, + { + "auxiliary_loss_clip": 0.01146335, + "auxiliary_loss_mlp": 0.0106086, + "balance_loss_clip": 1.05702484, + "balance_loss_mlp": 1.04157209, + "epoch": 0.3076506839019991, + "flos": 22090808534400.0, + "grad_norm": 2.2867580766783653, + "language_loss": 0.76444793, + "learning_rate": 3.244367924446952e-06, + "loss": 0.78651989, + "num_input_tokens_seen": 109846220, + "router_z_loss_clip": 0.89306641, + "router_z_loss_mlp": 0.19299316, + "step": 5117, + "time_per_iteration": 2.5210185050964355 + }, + { + "auxiliary_loss_clip": 0.01156881, + "auxiliary_loss_mlp": 0.0104271, + "balance_loss_clip": 1.06431389, + "balance_loss_mlp": 1.02538872, + "epoch": 0.3077108071546671, + "flos": 21289533891840.0, + "grad_norm": 3.621012182583853, + "language_loss": 0.71759409, + "learning_rate": 3.2440630025710826e-06, + "loss": 0.73958999, + "num_input_tokens_seen": 109863870, + "router_z_loss_clip": 0.92382812, + "router_z_loss_mlp": 0.17321777, + "step": 5118, + "time_per_iteration": 2.484755516052246 + }, + { + "auxiliary_loss_clip": 0.0115877, + "auxiliary_loss_mlp": 0.01040887, + "balance_loss_clip": 1.06799388, + "balance_loss_mlp": 1.02496099, + "epoch": 0.30777093040733505, + "flos": 21430985650560.0, + "grad_norm": 1.6969392057267012, + "language_loss": 0.74285305, + "learning_rate": 3.243758033520219e-06, + "loss": 0.76484966, + "num_input_tokens_seen": 109883500, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.15917969, + "step": 5119, + "time_per_iteration": 2.4679372310638428 + }, + { + "auxiliary_loss_clip": 0.0116381, + "auxiliary_loss_mlp": 0.0105239, + "balance_loss_clip": 1.06952012, + "balance_loss_mlp": 1.03401995, + "epoch": 0.307831053660003, + "flos": 23149275534720.0, + "grad_norm": 2.296723241208475, + "language_loss": 0.80399704, + "learning_rate": 3.243453017305926e-06, + "loss": 0.826159, + "num_input_tokens_seen": 109904620, + "router_z_loss_clip": 0.94238281, + "router_z_loss_mlp": 0.18383789, + "step": 5120, + "time_per_iteration": 2.5713515281677246 + }, + { + "auxiliary_loss_clip": 0.01151558, + "auxiliary_loss_mlp": 0.01047281, + "balance_loss_clip": 1.06218028, + "balance_loss_mlp": 1.03121114, + "epoch": 0.307891176912671, + "flos": 17019755268480.0, + "grad_norm": 1.810162471220395, + "language_loss": 0.79947966, + "learning_rate": 3.24314795393977e-06, + "loss": 0.82146806, + "num_input_tokens_seen": 109922275, + "router_z_loss_clip": 0.89257812, + "router_z_loss_mlp": 0.1607666, + "step": 5121, + "time_per_iteration": 3.862185001373291 + }, + { + "auxiliary_loss_clip": 0.01153195, + "auxiliary_loss_mlp": 0.01035674, + "balance_loss_clip": 1.06220746, + "balance_loss_mlp": 1.01998019, + "epoch": 0.30795130016533895, + "flos": 27705046245120.0, + "grad_norm": 1.5631999067718625, + "language_loss": 0.82937437, + "learning_rate": 3.242842843433319e-06, + "loss": 0.85126305, + "num_input_tokens_seen": 109944265, + "router_z_loss_clip": 0.91064453, + "router_z_loss_mlp": 0.15692139, + "step": 5122, + "time_per_iteration": 2.502624988555908 + }, + { + "auxiliary_loss_clip": 0.01094239, + "auxiliary_loss_mlp": 0.01016662, + "balance_loss_clip": 1.06375837, + "balance_loss_mlp": 1.01476634, + "epoch": 0.3080114234180069, + "flos": 69058699591680.0, + "grad_norm": 0.7938190709020839, + "language_loss": 0.58596206, + "learning_rate": 3.242537685798143e-06, + "loss": 0.6070711, + "num_input_tokens_seen": 110014160, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.0189209, + "step": 5123, + "time_per_iteration": 3.2648003101348877 + }, + { + "auxiliary_loss_clip": 0.01154742, + "auxiliary_loss_mlp": 0.01036051, + "balance_loss_clip": 1.0601815, + "balance_loss_mlp": 1.01819348, + "epoch": 0.3080715466706749, + "flos": 24060221377920.0, + "grad_norm": 2.1565766797401897, + "language_loss": 0.83381939, + "learning_rate": 3.242232481045813e-06, + "loss": 0.85572726, + "num_input_tokens_seen": 110034865, + "router_z_loss_clip": 0.94433594, + "router_z_loss_mlp": 0.17858887, + "step": 5124, + "time_per_iteration": 2.500314712524414 + }, + { + "auxiliary_loss_clip": 0.01160966, + "auxiliary_loss_mlp": 0.0104425, + "balance_loss_clip": 1.06735587, + "balance_loss_mlp": 1.02777541, + "epoch": 0.30813166992334284, + "flos": 25848680480640.0, + "grad_norm": 2.006434067858237, + "language_loss": 0.79626715, + "learning_rate": 3.2419272291879035e-06, + "loss": 0.81831932, + "num_input_tokens_seen": 110052930, + "router_z_loss_clip": 0.93701172, + "router_z_loss_mlp": 0.16467285, + "step": 5125, + "time_per_iteration": 2.7369720935821533 + }, + { + "auxiliary_loss_clip": 0.01151891, + "auxiliary_loss_mlp": 0.01041125, + "balance_loss_clip": 1.05740142, + "balance_loss_mlp": 1.02305329, + "epoch": 0.3081917931760108, + "flos": 20449619193600.0, + "grad_norm": 2.6095670098781723, + "language_loss": 0.64652038, + "learning_rate": 3.241621930235989e-06, + "loss": 0.66845053, + "num_input_tokens_seen": 110071765, + "router_z_loss_clip": 0.94482422, + "router_z_loss_mlp": 0.18066406, + "step": 5126, + "time_per_iteration": 2.6222519874572754 + }, + { + "auxiliary_loss_clip": 0.01157391, + "auxiliary_loss_mlp": 0.01039877, + "balance_loss_clip": 1.06855106, + "balance_loss_mlp": 1.02339053, + "epoch": 0.3082519164286788, + "flos": 22166257052160.0, + "grad_norm": 1.7743466775088932, + "language_loss": 0.86904025, + "learning_rate": 3.241316584201646e-06, + "loss": 0.89101291, + "num_input_tokens_seen": 110092660, + "router_z_loss_clip": 0.88769531, + "router_z_loss_mlp": 0.16479492, + "step": 5127, + "time_per_iteration": 2.5415217876434326 + }, + { + "auxiliary_loss_clip": 0.01154352, + "auxiliary_loss_mlp": 0.01041705, + "balance_loss_clip": 1.06406116, + "balance_loss_mlp": 1.02505112, + "epoch": 0.30831203968134674, + "flos": 28913404700160.0, + "grad_norm": 1.996634633341862, + "language_loss": 0.68768144, + "learning_rate": 3.2410111910964538e-06, + "loss": 0.70964205, + "num_input_tokens_seen": 110114960, + "router_z_loss_clip": 0.90283203, + "router_z_loss_mlp": 0.16662598, + "step": 5128, + "time_per_iteration": 2.5648908615112305 + }, + { + "auxiliary_loss_clip": 0.01148618, + "auxiliary_loss_mlp": 0.0104397, + "balance_loss_clip": 1.05709958, + "balance_loss_mlp": 1.02657735, + "epoch": 0.3083721629340147, + "flos": 25667726739840.0, + "grad_norm": 1.7326736250574908, + "language_loss": 0.71317172, + "learning_rate": 3.240705750931993e-06, + "loss": 0.73509753, + "num_input_tokens_seen": 110135750, + "router_z_loss_clip": 0.91552734, + "router_z_loss_mlp": 0.17407227, + "step": 5129, + "time_per_iteration": 2.6369521617889404 + }, + { + "auxiliary_loss_clip": 0.01087756, + "auxiliary_loss_mlp": 0.0100908, + "balance_loss_clip": 1.05789018, + "balance_loss_mlp": 1.00678205, + "epoch": 0.3084322861866827, + "flos": 68212679581440.0, + "grad_norm": 0.8251637560353164, + "language_loss": 0.59153235, + "learning_rate": 3.240400263719846e-06, + "loss": 0.61250073, + "num_input_tokens_seen": 110189480, + "router_z_loss_clip": 0.29833984, + "router_z_loss_mlp": 0.02297974, + "step": 5130, + "time_per_iteration": 3.0755131244659424 + }, + { + "auxiliary_loss_clip": 0.01160395, + "auxiliary_loss_mlp": 0.01041708, + "balance_loss_clip": 1.06617296, + "balance_loss_mlp": 1.02480364, + "epoch": 0.3084924094393507, + "flos": 20296495514880.0, + "grad_norm": 2.58988954258064, + "language_loss": 0.72723925, + "learning_rate": 3.2400947294715957e-06, + "loss": 0.74926031, + "num_input_tokens_seen": 110206445, + "router_z_loss_clip": 0.94335938, + "router_z_loss_mlp": 0.16906738, + "step": 5131, + "time_per_iteration": 2.452043294906616 + }, + { + "auxiliary_loss_clip": 0.01156306, + "auxiliary_loss_mlp": 0.01037613, + "balance_loss_clip": 1.06277061, + "balance_loss_mlp": 1.02196121, + "epoch": 0.30855253269201866, + "flos": 23949831905280.0, + "grad_norm": 1.5435010385366699, + "language_loss": 0.71018064, + "learning_rate": 3.2397891481988303e-06, + "loss": 0.73211986, + "num_input_tokens_seen": 110226845, + "router_z_loss_clip": 0.93554688, + "router_z_loss_mlp": 0.15649414, + "step": 5132, + "time_per_iteration": 2.47123646736145 + }, + { + "auxiliary_loss_clip": 0.01145267, + "auxiliary_loss_mlp": 0.01040861, + "balance_loss_clip": 1.05858254, + "balance_loss_mlp": 1.02529192, + "epoch": 0.3086126559446866, + "flos": 19281876042240.0, + "grad_norm": 2.52892107984952, + "language_loss": 0.89901841, + "learning_rate": 3.239483519913136e-06, + "loss": 0.92087972, + "num_input_tokens_seen": 110244095, + "router_z_loss_clip": 0.86572266, + "router_z_loss_mlp": 0.15563965, + "step": 5133, + "time_per_iteration": 2.482381820678711 + }, + { + "auxiliary_loss_clip": 0.01158916, + "auxiliary_loss_mlp": 0.01047793, + "balance_loss_clip": 1.06579256, + "balance_loss_mlp": 1.03072274, + "epoch": 0.3086727791973546, + "flos": 33760770019200.0, + "grad_norm": 1.9975028993980386, + "language_loss": 0.67148626, + "learning_rate": 3.239177844626102e-06, + "loss": 0.69355333, + "num_input_tokens_seen": 110264240, + "router_z_loss_clip": 0.93115234, + "router_z_loss_mlp": 0.1706543, + "step": 5134, + "time_per_iteration": 2.584843158721924 + }, + { + "auxiliary_loss_clip": 0.01152654, + "auxiliary_loss_mlp": 0.01044654, + "balance_loss_clip": 1.05874074, + "balance_loss_mlp": 1.02776217, + "epoch": 0.30873290245002255, + "flos": 16034151006720.0, + "grad_norm": 2.0223673135386178, + "language_loss": 0.82898504, + "learning_rate": 3.2388721223493197e-06, + "loss": 0.85095811, + "num_input_tokens_seen": 110282450, + "router_z_loss_clip": 0.93945312, + "router_z_loss_mlp": 0.16894531, + "step": 5135, + "time_per_iteration": 2.5234363079071045 + }, + { + "auxiliary_loss_clip": 0.0108065, + "auxiliary_loss_mlp": 0.01006927, + "balance_loss_clip": 1.05103421, + "balance_loss_mlp": 1.00495088, + "epoch": 0.3087930257026905, + "flos": 65048304055680.0, + "grad_norm": 0.7006597524765578, + "language_loss": 0.55311406, + "learning_rate": 3.2385663530943824e-06, + "loss": 0.57398981, + "num_input_tokens_seen": 110343715, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.01974487, + "step": 5136, + "time_per_iteration": 3.150080680847168 + }, + { + "auxiliary_loss_clip": 0.01147358, + "auxiliary_loss_mlp": 0.01040729, + "balance_loss_clip": 1.0559597, + "balance_loss_mlp": 1.02440906, + "epoch": 0.3088531489553585, + "flos": 74738829824640.0, + "grad_norm": 2.8311377161090765, + "language_loss": 0.76068485, + "learning_rate": 3.2382605368728852e-06, + "loss": 0.78256571, + "num_input_tokens_seen": 110368430, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.16320801, + "step": 5137, + "time_per_iteration": 2.991583824157715 + }, + { + "auxiliary_loss_clip": 0.01154339, + "auxiliary_loss_mlp": 0.01043874, + "balance_loss_clip": 1.06247568, + "balance_loss_mlp": 1.02759027, + "epoch": 0.30891327220802645, + "flos": 21142300043520.0, + "grad_norm": 2.5980937607213472, + "language_loss": 0.79919076, + "learning_rate": 3.237954673696424e-06, + "loss": 0.82117283, + "num_input_tokens_seen": 110386735, + "router_z_loss_clip": 0.91796875, + "router_z_loss_mlp": 0.16271973, + "step": 5138, + "time_per_iteration": 2.8944034576416016 + }, + { + "auxiliary_loss_clip": 0.01166126, + "auxiliary_loss_mlp": 0.01042622, + "balance_loss_clip": 1.06905043, + "balance_loss_mlp": 1.02537203, + "epoch": 0.3089733954606944, + "flos": 25664494515840.0, + "grad_norm": 1.9902721517065334, + "language_loss": 0.81425124, + "learning_rate": 3.2376487635765983e-06, + "loss": 0.83633876, + "num_input_tokens_seen": 110406820, + "router_z_loss_clip": 0.96972656, + "router_z_loss_mlp": 0.17248535, + "step": 5139, + "time_per_iteration": 2.5199077129364014 + }, + { + "auxiliary_loss_clip": 0.01153585, + "auxiliary_loss_mlp": 0.01039821, + "balance_loss_clip": 1.05873549, + "balance_loss_mlp": 1.0220592, + "epoch": 0.3090335187133624, + "flos": 19427350124160.0, + "grad_norm": 1.8504456255590678, + "language_loss": 0.76876676, + "learning_rate": 3.2373428065250067e-06, + "loss": 0.79070079, + "num_input_tokens_seen": 110424225, + "router_z_loss_clip": 0.94824219, + "router_z_loss_mlp": 0.17749023, + "step": 5140, + "time_per_iteration": 2.565221071243286 + }, + { + "auxiliary_loss_clip": 0.01141758, + "auxiliary_loss_mlp": 0.01043005, + "balance_loss_clip": 1.05609417, + "balance_loss_mlp": 1.02747154, + "epoch": 0.30909364196603034, + "flos": 20011329440640.0, + "grad_norm": 2.445559659724317, + "language_loss": 0.79036307, + "learning_rate": 3.237036802553252e-06, + "loss": 0.81221068, + "num_input_tokens_seen": 110443310, + "router_z_loss_clip": 0.85595703, + "router_z_loss_mlp": 0.15527344, + "step": 5141, + "time_per_iteration": 2.497023344039917 + }, + { + "auxiliary_loss_clip": 0.01150026, + "auxiliary_loss_mlp": 0.01048252, + "balance_loss_clip": 1.05762911, + "balance_loss_mlp": 1.03114498, + "epoch": 0.3091537652186983, + "flos": 19677575243520.0, + "grad_norm": 2.838381317257382, + "language_loss": 0.86997676, + "learning_rate": 3.2367307516729377e-06, + "loss": 0.89195955, + "num_input_tokens_seen": 110460215, + "router_z_loss_clip": 0.92431641, + "router_z_loss_mlp": 0.17089844, + "step": 5142, + "time_per_iteration": 2.4490253925323486 + }, + { + "auxiliary_loss_clip": 0.01148256, + "auxiliary_loss_mlp": 0.01044609, + "balance_loss_clip": 1.05733156, + "balance_loss_mlp": 1.02863503, + "epoch": 0.3092138884713663, + "flos": 17020042577280.0, + "grad_norm": 2.250046675251591, + "language_loss": 0.79471564, + "learning_rate": 3.23642465389567e-06, + "loss": 0.81664431, + "num_input_tokens_seen": 110479385, + "router_z_loss_clip": 0.90966797, + "router_z_loss_mlp": 0.159729, + "step": 5143, + "time_per_iteration": 2.4681408405303955 + }, + { + "auxiliary_loss_clip": 0.01139312, + "auxiliary_loss_mlp": 0.0104176, + "balance_loss_clip": 1.0501157, + "balance_loss_mlp": 1.02507067, + "epoch": 0.3092740117240343, + "flos": 25009986844800.0, + "grad_norm": 2.4174794751645003, + "language_loss": 0.72102511, + "learning_rate": 3.236118509233055e-06, + "loss": 0.74283588, + "num_input_tokens_seen": 110499885, + "router_z_loss_clip": 0.89306641, + "router_z_loss_mlp": 0.16687012, + "step": 5144, + "time_per_iteration": 2.526038646697998 + }, + { + "auxiliary_loss_clip": 0.01144462, + "auxiliary_loss_mlp": 0.01051839, + "balance_loss_clip": 1.05076003, + "balance_loss_mlp": 1.0334096, + "epoch": 0.30933413497670226, + "flos": 25590410714880.0, + "grad_norm": 1.6901674465958647, + "language_loss": 0.74312866, + "learning_rate": 3.235812317696702e-06, + "loss": 0.76509178, + "num_input_tokens_seen": 110519690, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.18432617, + "step": 5145, + "time_per_iteration": 2.487581729888916 + }, + { + "auxiliary_loss_clip": 0.0114551, + "auxiliary_loss_mlp": 0.01039893, + "balance_loss_clip": 1.05532289, + "balance_loss_mlp": 1.02323925, + "epoch": 0.3093942582293702, + "flos": 24389665943040.0, + "grad_norm": 1.859224131912782, + "language_loss": 0.76584733, + "learning_rate": 3.2355060792982224e-06, + "loss": 0.78770137, + "num_input_tokens_seen": 110540520, + "router_z_loss_clip": 0.90185547, + "router_z_loss_mlp": 0.16650391, + "step": 5146, + "time_per_iteration": 2.5220773220062256 + }, + { + "auxiliary_loss_clip": 0.01144346, + "auxiliary_loss_mlp": 0.01044223, + "balance_loss_clip": 1.05389333, + "balance_loss_mlp": 1.02714062, + "epoch": 0.3094543814820382, + "flos": 19646441130240.0, + "grad_norm": 2.0545363533653367, + "language_loss": 0.66156971, + "learning_rate": 3.2351997940492286e-06, + "loss": 0.68345541, + "num_input_tokens_seen": 110557950, + "router_z_loss_clip": 0.90332031, + "router_z_loss_mlp": 0.17077637, + "step": 5147, + "time_per_iteration": 2.423020124435425 + }, + { + "auxiliary_loss_clip": 0.01156096, + "auxiliary_loss_mlp": 0.01043309, + "balance_loss_clip": 1.06233513, + "balance_loss_mlp": 1.02759671, + "epoch": 0.30951450473470615, + "flos": 25663812157440.0, + "grad_norm": 1.7594016325541049, + "language_loss": 0.74684465, + "learning_rate": 3.2348934619613346e-06, + "loss": 0.76883864, + "num_input_tokens_seen": 110578215, + "router_z_loss_clip": 0.93798828, + "router_z_loss_mlp": 0.15710449, + "step": 5148, + "time_per_iteration": 3.8912744522094727 + }, + { + "auxiliary_loss_clip": 0.01156382, + "auxiliary_loss_mlp": 0.01048183, + "balance_loss_clip": 1.06060529, + "balance_loss_mlp": 1.03068328, + "epoch": 0.3095746279873741, + "flos": 12020415505920.0, + "grad_norm": 2.0874670801027544, + "language_loss": 0.72241199, + "learning_rate": 3.2345870830461567e-06, + "loss": 0.74445772, + "num_input_tokens_seen": 110592990, + "router_z_loss_clip": 0.95751953, + "router_z_loss_mlp": 0.17504883, + "step": 5149, + "time_per_iteration": 2.4274137020111084 + }, + { + "auxiliary_loss_clip": 0.01147342, + "auxiliary_loss_mlp": 0.01046306, + "balance_loss_clip": 1.05425322, + "balance_loss_mlp": 1.02837634, + "epoch": 0.3096347512400421, + "flos": 23623044946560.0, + "grad_norm": 1.9799572632856448, + "language_loss": 0.84718335, + "learning_rate": 3.2342806573153132e-06, + "loss": 0.86911982, + "num_input_tokens_seen": 110612130, + "router_z_loss_clip": 0.93017578, + "router_z_loss_mlp": 0.17932129, + "step": 5150, + "time_per_iteration": 2.459991693496704 + }, + { + "auxiliary_loss_clip": 0.01152783, + "auxiliary_loss_mlp": 0.01047077, + "balance_loss_clip": 1.05797946, + "balance_loss_mlp": 1.02952933, + "epoch": 0.30969487449271005, + "flos": 22529313768960.0, + "grad_norm": 1.7604682964983795, + "language_loss": 0.78574973, + "learning_rate": 3.233974184780424e-06, + "loss": 0.80774838, + "num_input_tokens_seen": 110632045, + "router_z_loss_clip": 0.94824219, + "router_z_loss_mlp": 0.17553711, + "step": 5151, + "time_per_iteration": 2.6590402126312256 + }, + { + "auxiliary_loss_clip": 0.01158484, + "auxiliary_loss_mlp": 0.01041279, + "balance_loss_clip": 1.06439614, + "balance_loss_mlp": 1.02395749, + "epoch": 0.309754997745378, + "flos": 15267925059840.0, + "grad_norm": 2.2175365186172398, + "language_loss": 0.67302746, + "learning_rate": 3.2336676654531084e-06, + "loss": 0.69502509, + "num_input_tokens_seen": 110649340, + "router_z_loss_clip": 0.94189453, + "router_z_loss_mlp": 0.17297363, + "step": 5152, + "time_per_iteration": 2.697222948074341 + }, + { + "auxiliary_loss_clip": 0.01151511, + "auxiliary_loss_mlp": 0.01053407, + "balance_loss_clip": 1.05922389, + "balance_loss_mlp": 1.03436959, + "epoch": 0.309815120998046, + "flos": 26979291947520.0, + "grad_norm": 1.984385373226155, + "language_loss": 0.82420915, + "learning_rate": 3.2333610993449926e-06, + "loss": 0.8462584, + "num_input_tokens_seen": 110668450, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.19030762, + "step": 5153, + "time_per_iteration": 2.5772335529327393 + }, + { + "auxiliary_loss_clip": 0.01152608, + "auxiliary_loss_mlp": 0.01048442, + "balance_loss_clip": 1.05903268, + "balance_loss_mlp": 1.03197956, + "epoch": 0.30987524425071394, + "flos": 21143161969920.0, + "grad_norm": 2.866087930912306, + "language_loss": 0.73809534, + "learning_rate": 3.2330544864676997e-06, + "loss": 0.76010585, + "num_input_tokens_seen": 110689410, + "router_z_loss_clip": 0.93701172, + "router_z_loss_mlp": 0.16455078, + "step": 5154, + "time_per_iteration": 2.5265588760375977 + }, + { + "auxiliary_loss_clip": 0.01158239, + "auxiliary_loss_mlp": 0.01036744, + "balance_loss_clip": 1.06616032, + "balance_loss_mlp": 1.02069879, + "epoch": 0.3099353675033819, + "flos": 15268284195840.0, + "grad_norm": 2.0520161657189537, + "language_loss": 0.75645798, + "learning_rate": 3.232747826832858e-06, + "loss": 0.77840781, + "num_input_tokens_seen": 110707350, + "router_z_loss_clip": 0.92138672, + "router_z_loss_mlp": 0.16052246, + "step": 5155, + "time_per_iteration": 2.5011367797851562 + }, + { + "auxiliary_loss_clip": 0.0114997, + "auxiliary_loss_mlp": 0.01048131, + "balance_loss_clip": 1.05731297, + "balance_loss_mlp": 1.03077412, + "epoch": 0.30999549075604993, + "flos": 15413794191360.0, + "grad_norm": 1.7841977409483079, + "language_loss": 0.78724432, + "learning_rate": 3.232441120452094e-06, + "loss": 0.80922532, + "num_input_tokens_seen": 110724910, + "router_z_loss_clip": 0.92578125, + "router_z_loss_mlp": 0.17370605, + "step": 5156, + "time_per_iteration": 2.439455032348633 + }, + { + "auxiliary_loss_clip": 0.0115154, + "auxiliary_loss_mlp": 0.01048452, + "balance_loss_clip": 1.05429304, + "balance_loss_mlp": 1.02930701, + "epoch": 0.3100556140087179, + "flos": 23184539712000.0, + "grad_norm": 2.2129647222896693, + "language_loss": 0.75155926, + "learning_rate": 3.23213436733704e-06, + "loss": 0.77355909, + "num_input_tokens_seen": 110744010, + "router_z_loss_clip": 0.97363281, + "router_z_loss_mlp": 0.19152832, + "step": 5157, + "time_per_iteration": 2.501838445663452 + }, + { + "auxiliary_loss_clip": 0.01145748, + "auxiliary_loss_mlp": 0.01044954, + "balance_loss_clip": 1.05256796, + "balance_loss_mlp": 1.02942109, + "epoch": 0.31011573726138586, + "flos": 25742169676800.0, + "grad_norm": 3.882134988615825, + "language_loss": 0.69460177, + "learning_rate": 3.231827567499327e-06, + "loss": 0.71650875, + "num_input_tokens_seen": 110765835, + "router_z_loss_clip": 0.93212891, + "router_z_loss_mlp": 0.15527344, + "step": 5158, + "time_per_iteration": 2.5835587978363037 + }, + { + "auxiliary_loss_clip": 0.01151633, + "auxiliary_loss_mlp": 0.01041573, + "balance_loss_clip": 1.05848861, + "balance_loss_mlp": 1.02650452, + "epoch": 0.3101758605140538, + "flos": 20011329440640.0, + "grad_norm": 1.9998244884237333, + "language_loss": 0.84833217, + "learning_rate": 3.2315207209505896e-06, + "loss": 0.87026429, + "num_input_tokens_seen": 110784655, + "router_z_loss_clip": 0.93212891, + "router_z_loss_mlp": 0.15063477, + "step": 5159, + "time_per_iteration": 3.8693017959594727 + }, + { + "auxiliary_loss_clip": 0.01148004, + "auxiliary_loss_mlp": 0.01040171, + "balance_loss_clip": 1.05577803, + "balance_loss_mlp": 1.0230999, + "epoch": 0.3102359837667218, + "flos": 19135683688320.0, + "grad_norm": 2.068719250909651, + "language_loss": 0.85291129, + "learning_rate": 3.231213827702462e-06, + "loss": 0.87479305, + "num_input_tokens_seen": 110802545, + "router_z_loss_clip": 0.92236328, + "router_z_loss_mlp": 0.1706543, + "step": 5160, + "time_per_iteration": 3.85690975189209 + }, + { + "auxiliary_loss_clip": 0.01151389, + "auxiliary_loss_mlp": 0.01040916, + "balance_loss_clip": 1.06148732, + "balance_loss_mlp": 1.02459657, + "epoch": 0.31029610701938976, + "flos": 22265405568000.0, + "grad_norm": 2.7374592369805706, + "language_loss": 0.75546247, + "learning_rate": 3.230906887766584e-06, + "loss": 0.77738547, + "num_input_tokens_seen": 110820265, + "router_z_loss_clip": 0.89746094, + "router_z_loss_mlp": 0.16320801, + "step": 5161, + "time_per_iteration": 2.4388034343719482 + }, + { + "auxiliary_loss_clip": 0.01151007, + "auxiliary_loss_mlp": 0.0103596, + "balance_loss_clip": 1.05684352, + "balance_loss_mlp": 1.01968813, + "epoch": 0.3103562302720577, + "flos": 20805349536000.0, + "grad_norm": 1.929480530824901, + "language_loss": 0.81739426, + "learning_rate": 3.2305999011545924e-06, + "loss": 0.83926392, + "num_input_tokens_seen": 110836195, + "router_z_loss_clip": 0.94140625, + "router_z_loss_mlp": 0.16271973, + "step": 5162, + "time_per_iteration": 2.4180853366851807 + }, + { + "auxiliary_loss_clip": 0.01142563, + "auxiliary_loss_mlp": 0.01047176, + "balance_loss_clip": 1.05257893, + "balance_loss_mlp": 1.0306778, + "epoch": 0.3104163535247257, + "flos": 22344158136960.0, + "grad_norm": 1.900734257588648, + "language_loss": 0.83003402, + "learning_rate": 3.2302928678781295e-06, + "loss": 0.85193145, + "num_input_tokens_seen": 110856420, + "router_z_loss_clip": 0.89990234, + "router_z_loss_mlp": 0.16503906, + "step": 5163, + "time_per_iteration": 2.46256160736084 + }, + { + "auxiliary_loss_clip": 0.01156641, + "auxiliary_loss_mlp": 0.01047174, + "balance_loss_clip": 1.06312096, + "balance_loss_mlp": 1.030056, + "epoch": 0.31047647677739365, + "flos": 21689363157120.0, + "grad_norm": 2.910995464467983, + "language_loss": 0.7610116, + "learning_rate": 3.2299857879488376e-06, + "loss": 0.78304976, + "num_input_tokens_seen": 110876650, + "router_z_loss_clip": 0.93457031, + "router_z_loss_mlp": 0.17114258, + "step": 5164, + "time_per_iteration": 2.7997608184814453 + }, + { + "auxiliary_loss_clip": 0.01151615, + "auxiliary_loss_mlp": 0.01040542, + "balance_loss_clip": 1.05778885, + "balance_loss_mlp": 1.02368569, + "epoch": 0.3105366000300616, + "flos": 18917275040640.0, + "grad_norm": 1.7557256564076968, + "language_loss": 0.74921167, + "learning_rate": 3.2296786613783626e-06, + "loss": 0.7711333, + "num_input_tokens_seen": 110894445, + "router_z_loss_clip": 0.93798828, + "router_z_loss_mlp": 0.16851807, + "step": 5165, + "time_per_iteration": 3.971229076385498 + }, + { + "auxiliary_loss_clip": 0.01154334, + "auxiliary_loss_mlp": 0.0104197, + "balance_loss_clip": 1.0591464, + "balance_loss_mlp": 1.02553093, + "epoch": 0.3105967232827296, + "flos": 18260397072000.0, + "grad_norm": 1.6479168758622589, + "language_loss": 0.76067626, + "learning_rate": 3.229371488178348e-06, + "loss": 0.78263927, + "num_input_tokens_seen": 110912855, + "router_z_loss_clip": 0.95117188, + "router_z_loss_mlp": 0.16418457, + "step": 5166, + "time_per_iteration": 2.4540305137634277 + }, + { + "auxiliary_loss_clip": 0.01151373, + "auxiliary_loss_mlp": 0.0103853, + "balance_loss_clip": 1.05789709, + "balance_loss_mlp": 1.02182925, + "epoch": 0.31065684653539755, + "flos": 17672144037120.0, + "grad_norm": 2.84005344344375, + "language_loss": 0.73718423, + "learning_rate": 3.229064268360444e-06, + "loss": 0.75908327, + "num_input_tokens_seen": 110928025, + "router_z_loss_clip": 0.93457031, + "router_z_loss_mlp": 0.16699219, + "step": 5167, + "time_per_iteration": 2.3994579315185547 + }, + { + "auxiliary_loss_clip": 0.01088356, + "auxiliary_loss_mlp": 0.01009128, + "balance_loss_clip": 1.05728078, + "balance_loss_mlp": 1.0071907, + "epoch": 0.3107169697880655, + "flos": 68531996511360.0, + "grad_norm": 0.7358008052343818, + "language_loss": 0.52974421, + "learning_rate": 3.2287570019362997e-06, + "loss": 0.55071902, + "num_input_tokens_seen": 110992215, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.01934814, + "step": 5168, + "time_per_iteration": 3.135601043701172 + }, + { + "auxiliary_loss_clip": 0.01154176, + "auxiliary_loss_mlp": 0.01045589, + "balance_loss_clip": 1.06090856, + "balance_loss_mlp": 1.02807713, + "epoch": 0.3107770930407335, + "flos": 13188733274880.0, + "grad_norm": 1.9119150178210056, + "language_loss": 0.78914189, + "learning_rate": 3.2284496889175668e-06, + "loss": 0.81113958, + "num_input_tokens_seen": 111010400, + "router_z_loss_clip": 0.93359375, + "router_z_loss_mlp": 0.1751709, + "step": 5169, + "time_per_iteration": 2.4431874752044678 + }, + { + "auxiliary_loss_clip": 0.01151164, + "auxiliary_loss_mlp": 0.01037689, + "balance_loss_clip": 1.05758142, + "balance_loss_mlp": 1.02147615, + "epoch": 0.3108372162934015, + "flos": 31580849520000.0, + "grad_norm": 1.8154064020040444, + "language_loss": 0.64026958, + "learning_rate": 3.2281423293158986e-06, + "loss": 0.66215813, + "num_input_tokens_seen": 111033960, + "router_z_loss_clip": 0.93603516, + "router_z_loss_mlp": 0.16210938, + "step": 5170, + "time_per_iteration": 2.5496020317077637 + }, + { + "auxiliary_loss_clip": 0.01154807, + "auxiliary_loss_mlp": 0.01046697, + "balance_loss_clip": 1.05973589, + "balance_loss_mlp": 1.02993608, + "epoch": 0.31089733954606946, + "flos": 28729829266560.0, + "grad_norm": 2.291890639871084, + "language_loss": 0.77428126, + "learning_rate": 3.22783492314295e-06, + "loss": 0.7962963, + "num_input_tokens_seen": 111053265, + "router_z_loss_clip": 0.95068359, + "router_z_loss_mlp": 0.16760254, + "step": 5171, + "time_per_iteration": 2.5075652599334717 + }, + { + "auxiliary_loss_clip": 0.01140393, + "auxiliary_loss_mlp": 0.01046716, + "balance_loss_clip": 1.05014277, + "balance_loss_mlp": 1.03067088, + "epoch": 0.3109574627987374, + "flos": 19683249592320.0, + "grad_norm": 1.8010369514205342, + "language_loss": 0.84264559, + "learning_rate": 3.2275274704103785e-06, + "loss": 0.86451674, + "num_input_tokens_seen": 111071130, + "router_z_loss_clip": 0.90283203, + "router_z_loss_mlp": 0.16040039, + "step": 5172, + "time_per_iteration": 2.464465856552124 + }, + { + "auxiliary_loss_clip": 0.01148235, + "auxiliary_loss_mlp": 0.0104897, + "balance_loss_clip": 1.05590272, + "balance_loss_mlp": 1.03129137, + "epoch": 0.3110175860514054, + "flos": 14683981656960.0, + "grad_norm": 2.3212771659907347, + "language_loss": 0.84202188, + "learning_rate": 3.227219971129842e-06, + "loss": 0.863994, + "num_input_tokens_seen": 111089560, + "router_z_loss_clip": 0.92236328, + "router_z_loss_mlp": 0.17675781, + "step": 5173, + "time_per_iteration": 2.434530258178711 + }, + { + "auxiliary_loss_clip": 0.01144877, + "auxiliary_loss_mlp": 0.01040466, + "balance_loss_clip": 1.05479527, + "balance_loss_mlp": 1.02480745, + "epoch": 0.31107770930407336, + "flos": 25739655724800.0, + "grad_norm": 1.6253912358149365, + "language_loss": 0.83333391, + "learning_rate": 3.226912425313001e-06, + "loss": 0.8551873, + "num_input_tokens_seen": 111109960, + "router_z_loss_clip": 0.90039062, + "router_z_loss_mlp": 0.15655518, + "step": 5174, + "time_per_iteration": 2.552323341369629 + }, + { + "auxiliary_loss_clip": 0.01157128, + "auxiliary_loss_mlp": 0.01041248, + "balance_loss_clip": 1.06326783, + "balance_loss_mlp": 1.0249759, + "epoch": 0.3111378325567413, + "flos": 19208259118080.0, + "grad_norm": 3.39481814682953, + "language_loss": 0.85336423, + "learning_rate": 3.2266048329715183e-06, + "loss": 0.87534797, + "num_input_tokens_seen": 111127960, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.16271973, + "step": 5175, + "time_per_iteration": 2.4911272525787354 + }, + { + "auxiliary_loss_clip": 0.01142769, + "auxiliary_loss_mlp": 0.0103992, + "balance_loss_clip": 1.05587673, + "balance_loss_mlp": 1.02365994, + "epoch": 0.3111979558094093, + "flos": 23696374561920.0, + "grad_norm": 1.7556632685618223, + "language_loss": 0.83498919, + "learning_rate": 3.2262971941170575e-06, + "loss": 0.85681611, + "num_input_tokens_seen": 111146730, + "router_z_loss_clip": 0.86865234, + "router_z_loss_mlp": 0.16259766, + "step": 5176, + "time_per_iteration": 2.4523425102233887 + }, + { + "auxiliary_loss_clip": 0.01147182, + "auxiliary_loss_mlp": 0.01041289, + "balance_loss_clip": 1.05746746, + "balance_loss_mlp": 1.02434993, + "epoch": 0.31125807906207725, + "flos": 21033023892480.0, + "grad_norm": 1.9167978511576054, + "language_loss": 0.80387974, + "learning_rate": 3.2259895087612837e-06, + "loss": 0.82576448, + "num_input_tokens_seen": 111166295, + "router_z_loss_clip": 0.89746094, + "router_z_loss_mlp": 0.16955566, + "step": 5177, + "time_per_iteration": 2.8193657398223877 + }, + { + "auxiliary_loss_clip": 0.01153323, + "auxiliary_loss_mlp": 0.01042628, + "balance_loss_clip": 1.06428087, + "balance_loss_mlp": 1.02682066, + "epoch": 0.3113182023147452, + "flos": 23076628277760.0, + "grad_norm": 1.65487349316754, + "language_loss": 0.80938607, + "learning_rate": 3.2256817769158657e-06, + "loss": 0.83134562, + "num_input_tokens_seen": 111185665, + "router_z_loss_clip": 0.89111328, + "router_z_loss_mlp": 0.15820312, + "step": 5178, + "time_per_iteration": 2.5437734127044678 + }, + { + "auxiliary_loss_clip": 0.01147096, + "auxiliary_loss_mlp": 0.0104307, + "balance_loss_clip": 1.05589366, + "balance_loss_mlp": 1.02740622, + "epoch": 0.3113783255674132, + "flos": 11838994888320.0, + "grad_norm": 2.527627010342039, + "language_loss": 0.81268167, + "learning_rate": 3.225373998592471e-06, + "loss": 0.83458328, + "num_input_tokens_seen": 111201615, + "router_z_loss_clip": 0.91210938, + "router_z_loss_mlp": 0.15661621, + "step": 5179, + "time_per_iteration": 2.4631893634796143 + }, + { + "auxiliary_loss_clip": 0.0114564, + "auxiliary_loss_mlp": 0.01044646, + "balance_loss_clip": 1.05645621, + "balance_loss_mlp": 1.02857709, + "epoch": 0.31143844882008115, + "flos": 16289547684480.0, + "grad_norm": 3.521958091459207, + "language_loss": 0.78426534, + "learning_rate": 3.2250661738027715e-06, + "loss": 0.8061682, + "num_input_tokens_seen": 111220515, + "router_z_loss_clip": 0.89160156, + "router_z_loss_mlp": 0.16064453, + "step": 5180, + "time_per_iteration": 2.55692720413208 + }, + { + "auxiliary_loss_clip": 0.01147567, + "auxiliary_loss_mlp": 0.01035911, + "balance_loss_clip": 1.0582397, + "balance_loss_mlp": 1.01961517, + "epoch": 0.3114985720727491, + "flos": 23217792727680.0, + "grad_norm": 1.8693930831189973, + "language_loss": 0.83024132, + "learning_rate": 3.22475830255844e-06, + "loss": 0.85207617, + "num_input_tokens_seen": 111240395, + "router_z_loss_clip": 0.89306641, + "router_z_loss_mlp": 0.16296387, + "step": 5181, + "time_per_iteration": 2.5179457664489746 + }, + { + "auxiliary_loss_clip": 0.01143092, + "auxiliary_loss_mlp": 0.01040856, + "balance_loss_clip": 1.0540818, + "balance_loss_mlp": 1.02547848, + "epoch": 0.3115586953254171, + "flos": 30044626698240.0, + "grad_norm": 1.7595302266091055, + "language_loss": 0.74541295, + "learning_rate": 3.2244503848711516e-06, + "loss": 0.76725245, + "num_input_tokens_seen": 111261100, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.15362549, + "step": 5182, + "time_per_iteration": 2.5398662090301514 + }, + { + "auxiliary_loss_clip": 0.01143191, + "auxiliary_loss_mlp": 0.01049368, + "balance_loss_clip": 1.05071354, + "balance_loss_mlp": 1.03304863, + "epoch": 0.3116188185780851, + "flos": 25666326109440.0, + "grad_norm": 1.9687965598924235, + "language_loss": 0.706141, + "learning_rate": 3.2241424207525815e-06, + "loss": 0.72806656, + "num_input_tokens_seen": 111281320, + "router_z_loss_clip": 0.92431641, + "router_z_loss_mlp": 0.16320801, + "step": 5183, + "time_per_iteration": 2.547497272491455 + }, + { + "auxiliary_loss_clip": 0.01068528, + "auxiliary_loss_mlp": 0.01006177, + "balance_loss_clip": 1.03722668, + "balance_loss_mlp": 1.00425816, + "epoch": 0.31167894183075306, + "flos": 69510058917120.0, + "grad_norm": 1.0505982522778214, + "language_loss": 0.59603542, + "learning_rate": 3.223834410214408e-06, + "loss": 0.61678243, + "num_input_tokens_seen": 111341405, + "router_z_loss_clip": 0.31298828, + "router_z_loss_mlp": 0.01916504, + "step": 5184, + "time_per_iteration": 3.097728729248047 + }, + { + "auxiliary_loss_clip": 0.01143406, + "auxiliary_loss_mlp": 0.01044559, + "balance_loss_clip": 1.05370903, + "balance_loss_mlp": 1.02944338, + "epoch": 0.31173906508342103, + "flos": 14939845211520.0, + "grad_norm": 8.939139399915646, + "language_loss": 0.70080411, + "learning_rate": 3.223526353268311e-06, + "loss": 0.72268379, + "num_input_tokens_seen": 111358975, + "router_z_loss_clip": 0.89697266, + "router_z_loss_mlp": 0.15106201, + "step": 5185, + "time_per_iteration": 2.442099094390869 + }, + { + "auxiliary_loss_clip": 0.01144589, + "auxiliary_loss_mlp": 0.01049575, + "balance_loss_clip": 1.05311871, + "balance_loss_mlp": 1.03364873, + "epoch": 0.311799188336089, + "flos": 16176033728640.0, + "grad_norm": 2.439875588684215, + "language_loss": 0.64201659, + "learning_rate": 3.2232182499259725e-06, + "loss": 0.66395819, + "num_input_tokens_seen": 111375845, + "router_z_loss_clip": 0.91455078, + "router_z_loss_mlp": 0.15905762, + "step": 5186, + "time_per_iteration": 2.5095996856689453 + }, + { + "auxiliary_loss_clip": 0.01153304, + "auxiliary_loss_mlp": 0.01047188, + "balance_loss_clip": 1.05723619, + "balance_loss_mlp": 1.02968836, + "epoch": 0.31185931158875696, + "flos": 25009627708800.0, + "grad_norm": 2.4539652404309, + "language_loss": 0.86660957, + "learning_rate": 3.2229101001990747e-06, + "loss": 0.88861448, + "num_input_tokens_seen": 111394150, + "router_z_loss_clip": 0.96044922, + "router_z_loss_mlp": 0.17504883, + "step": 5187, + "time_per_iteration": 2.481003761291504 + }, + { + "auxiliary_loss_clip": 0.01155242, + "auxiliary_loss_mlp": 0.01043454, + "balance_loss_clip": 1.0625546, + "balance_loss_mlp": 1.02691984, + "epoch": 0.3119194348414249, + "flos": 37232901273600.0, + "grad_norm": 1.4834379027314837, + "language_loss": 0.62992233, + "learning_rate": 3.2226019040993036e-06, + "loss": 0.65190929, + "num_input_tokens_seen": 111418355, + "router_z_loss_clip": 0.92626953, + "router_z_loss_mlp": 0.1652832, + "step": 5188, + "time_per_iteration": 2.569291114807129 + }, + { + "auxiliary_loss_clip": 0.01147616, + "auxiliary_loss_mlp": 0.01042121, + "balance_loss_clip": 1.0576508, + "balance_loss_mlp": 1.02642143, + "epoch": 0.3119795580940929, + "flos": 15012779777280.0, + "grad_norm": 3.2912587981183665, + "language_loss": 0.82382154, + "learning_rate": 3.222293661638346e-06, + "loss": 0.84571898, + "num_input_tokens_seen": 111435445, + "router_z_loss_clip": 0.90039062, + "router_z_loss_mlp": 0.15710449, + "step": 5189, + "time_per_iteration": 2.4467530250549316 + }, + { + "auxiliary_loss_clip": 0.01140169, + "auxiliary_loss_mlp": 0.01033846, + "balance_loss_clip": 1.04955971, + "balance_loss_mlp": 1.01752591, + "epoch": 0.31203968134676086, + "flos": 15998168557440.0, + "grad_norm": 1.917466951264988, + "language_loss": 0.79166639, + "learning_rate": 3.22198537282789e-06, + "loss": 0.81340659, + "num_input_tokens_seen": 111453430, + "router_z_loss_clip": 0.90722656, + "router_z_loss_mlp": 0.16308594, + "step": 5190, + "time_per_iteration": 2.418668270111084 + }, + { + "auxiliary_loss_clip": 0.01149265, + "auxiliary_loss_mlp": 0.01035531, + "balance_loss_clip": 1.05729938, + "balance_loss_mlp": 1.01889515, + "epoch": 0.3120998045994288, + "flos": 23837359443840.0, + "grad_norm": 1.527914247456703, + "language_loss": 0.75275874, + "learning_rate": 3.2216770376796262e-06, + "loss": 0.7746067, + "num_input_tokens_seen": 111475325, + "router_z_loss_clip": 0.91943359, + "router_z_loss_mlp": 0.1663208, + "step": 5191, + "time_per_iteration": 2.8353281021118164 + }, + { + "auxiliary_loss_clip": 0.01086078, + "auxiliary_loss_mlp": 0.0100435, + "balance_loss_clip": 1.05440664, + "balance_loss_mlp": 1.0024097, + "epoch": 0.3121599278520968, + "flos": 69184205712000.0, + "grad_norm": 0.8441021459160407, + "language_loss": 0.63929784, + "learning_rate": 3.221368656205247e-06, + "loss": 0.66020215, + "num_input_tokens_seen": 111533960, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.01937866, + "step": 5192, + "time_per_iteration": 3.157318353652954 + }, + { + "auxiliary_loss_clip": 0.01145645, + "auxiliary_loss_mlp": 0.01053764, + "balance_loss_clip": 1.05084753, + "balance_loss_mlp": 1.03404641, + "epoch": 0.31222005110476475, + "flos": 23806368984960.0, + "grad_norm": 2.3726601279846378, + "language_loss": 0.80158478, + "learning_rate": 3.221060228416446e-06, + "loss": 0.82357883, + "num_input_tokens_seen": 111554055, + "router_z_loss_clip": 0.94873047, + "router_z_loss_mlp": 0.19702148, + "step": 5193, + "time_per_iteration": 3.872516632080078 + }, + { + "auxiliary_loss_clip": 0.01150271, + "auxiliary_loss_mlp": 0.01044173, + "balance_loss_clip": 1.05598032, + "balance_loss_mlp": 1.02735305, + "epoch": 0.3122801743574327, + "flos": 25226132935680.0, + "grad_norm": 3.1331440884986166, + "language_loss": 0.72789514, + "learning_rate": 3.2207517543249183e-06, + "loss": 0.7498396, + "num_input_tokens_seen": 111574305, + "router_z_loss_clip": 0.94335938, + "router_z_loss_mlp": 0.16821289, + "step": 5194, + "time_per_iteration": 2.5124075412750244 + }, + { + "auxiliary_loss_clip": 0.01147392, + "auxiliary_loss_mlp": 0.01041361, + "balance_loss_clip": 1.05702996, + "balance_loss_mlp": 1.02576852, + "epoch": 0.3123402976101007, + "flos": 22966490200320.0, + "grad_norm": 1.460404558128154, + "language_loss": 0.76517701, + "learning_rate": 3.2204432339423616e-06, + "loss": 0.78706455, + "num_input_tokens_seen": 111595680, + "router_z_loss_clip": 0.90332031, + "router_z_loss_mlp": 0.15600586, + "step": 5195, + "time_per_iteration": 2.4679207801818848 + }, + { + "auxiliary_loss_clip": 0.01150823, + "auxiliary_loss_mlp": 0.01043409, + "balance_loss_clip": 1.05775809, + "balance_loss_mlp": 1.02741075, + "epoch": 0.3124004208627687, + "flos": 25192089820800.0, + "grad_norm": 2.912412065300203, + "language_loss": 0.78080618, + "learning_rate": 3.220134667280476e-06, + "loss": 0.80274844, + "num_input_tokens_seen": 111618135, + "router_z_loss_clip": 0.93115234, + "router_z_loss_mlp": 0.16015625, + "step": 5196, + "time_per_iteration": 2.576793909072876 + }, + { + "auxiliary_loss_clip": 0.01091139, + "auxiliary_loss_mlp": 0.01006459, + "balance_loss_clip": 1.05965245, + "balance_loss_mlp": 1.00462937, + "epoch": 0.31246054411543667, + "flos": 67485165517440.0, + "grad_norm": 0.7683832960906742, + "language_loss": 0.54747432, + "learning_rate": 3.2198260543509613e-06, + "loss": 0.56845033, + "num_input_tokens_seen": 111682220, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.01831055, + "step": 5197, + "time_per_iteration": 3.1167800426483154 + }, + { + "auxiliary_loss_clip": 0.01146206, + "auxiliary_loss_mlp": 0.01037231, + "balance_loss_clip": 1.0580802, + "balance_loss_mlp": 1.02124453, + "epoch": 0.31252066736810463, + "flos": 17858520731520.0, + "grad_norm": 2.384470298380855, + "language_loss": 0.66414535, + "learning_rate": 3.21951739516552e-06, + "loss": 0.68597966, + "num_input_tokens_seen": 111700815, + "router_z_loss_clip": 0.88183594, + "router_z_loss_mlp": 0.15985107, + "step": 5198, + "time_per_iteration": 2.571773052215576 + }, + { + "auxiliary_loss_clip": 0.01151843, + "auxiliary_loss_mlp": 0.01041569, + "balance_loss_clip": 1.05961752, + "balance_loss_mlp": 1.0245223, + "epoch": 0.3125807906207726, + "flos": 18475034791680.0, + "grad_norm": 2.507087290888745, + "language_loss": 0.69293588, + "learning_rate": 3.219208689735857e-06, + "loss": 0.71487004, + "num_input_tokens_seen": 111718195, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.17041016, + "step": 5199, + "time_per_iteration": 2.412696361541748 + }, + { + "auxiliary_loss_clip": 0.01146145, + "auxiliary_loss_mlp": 0.01044233, + "balance_loss_clip": 1.05629683, + "balance_loss_mlp": 1.0280087, + "epoch": 0.31264091387344056, + "flos": 18946541646720.0, + "grad_norm": 1.7678324731087458, + "language_loss": 0.78909057, + "learning_rate": 3.2188999380736785e-06, + "loss": 0.81099439, + "num_input_tokens_seen": 111734440, + "router_z_loss_clip": 0.89794922, + "router_z_loss_mlp": 0.16210938, + "step": 5200, + "time_per_iteration": 2.4414942264556885 + }, + { + "auxiliary_loss_clip": 0.01142673, + "auxiliary_loss_mlp": 0.01033046, + "balance_loss_clip": 1.0553174, + "balance_loss_mlp": 1.01787043, + "epoch": 0.3127010371261085, + "flos": 21468512384640.0, + "grad_norm": 1.9973123455812796, + "language_loss": 0.83607507, + "learning_rate": 3.2185911401906917e-06, + "loss": 0.85783219, + "num_input_tokens_seen": 111751960, + "router_z_loss_clip": 0.87304688, + "router_z_loss_mlp": 0.1517334, + "step": 5201, + "time_per_iteration": 2.4628233909606934 + }, + { + "auxiliary_loss_clip": 0.01145647, + "auxiliary_loss_mlp": 0.01046771, + "balance_loss_clip": 1.05625904, + "balance_loss_mlp": 1.03035045, + "epoch": 0.3127611603787765, + "flos": 15336047203200.0, + "grad_norm": 2.217857442084615, + "language_loss": 0.69298583, + "learning_rate": 3.2182822960986072e-06, + "loss": 0.71491003, + "num_input_tokens_seen": 111769585, + "router_z_loss_clip": 0.89306641, + "router_z_loss_mlp": 0.16424561, + "step": 5202, + "time_per_iteration": 2.5383007526397705 + }, + { + "auxiliary_loss_clip": 0.01155077, + "auxiliary_loss_mlp": 0.01040761, + "balance_loss_clip": 1.06174052, + "balance_loss_mlp": 1.02616942, + "epoch": 0.31282128363144446, + "flos": 17602980399360.0, + "grad_norm": 2.3829391165703258, + "language_loss": 0.84336376, + "learning_rate": 3.2179734058091358e-06, + "loss": 0.86532223, + "num_input_tokens_seen": 111787880, + "router_z_loss_clip": 0.93408203, + "router_z_loss_mlp": 0.14611816, + "step": 5203, + "time_per_iteration": 5.711359262466431 + }, + { + "auxiliary_loss_clip": 0.01146426, + "auxiliary_loss_mlp": 0.01048926, + "balance_loss_clip": 1.05398703, + "balance_loss_mlp": 1.03086638, + "epoch": 0.3128814068841124, + "flos": 26756753235840.0, + "grad_norm": 2.176817314998807, + "language_loss": 0.60809684, + "learning_rate": 3.2176644693339913e-06, + "loss": 0.6300503, + "num_input_tokens_seen": 111805950, + "router_z_loss_clip": 0.92529297, + "router_z_loss_mlp": 0.18054199, + "step": 5204, + "time_per_iteration": 2.521411180496216 + }, + { + "auxiliary_loss_clip": 0.01143679, + "auxiliary_loss_mlp": 0.0104024, + "balance_loss_clip": 1.05355072, + "balance_loss_mlp": 1.02603006, + "epoch": 0.3129415301367804, + "flos": 22272372806400.0, + "grad_norm": 1.721529733467438, + "language_loss": 0.66050494, + "learning_rate": 3.217355486684887e-06, + "loss": 0.68234408, + "num_input_tokens_seen": 111826135, + "router_z_loss_clip": 0.90234375, + "router_z_loss_mlp": 0.14221191, + "step": 5205, + "time_per_iteration": 2.4784014225006104 + }, + { + "auxiliary_loss_clip": 0.01139103, + "auxiliary_loss_mlp": 0.01048734, + "balance_loss_clip": 1.0488795, + "balance_loss_mlp": 1.03061438, + "epoch": 0.31300165338944835, + "flos": 26464907232000.0, + "grad_norm": 1.5880104574984193, + "language_loss": 0.76874459, + "learning_rate": 3.2170464578735414e-06, + "loss": 0.79062295, + "num_input_tokens_seen": 111844700, + "router_z_loss_clip": 0.90283203, + "router_z_loss_mlp": 0.18103027, + "step": 5206, + "time_per_iteration": 2.514345169067383 + }, + { + "auxiliary_loss_clip": 0.0113739, + "auxiliary_loss_mlp": 0.01036595, + "balance_loss_clip": 1.04949522, + "balance_loss_mlp": 1.02137232, + "epoch": 0.3130617766421163, + "flos": 21944652094080.0, + "grad_norm": 2.28033886457569, + "language_loss": 0.8310678, + "learning_rate": 3.216737382911672e-06, + "loss": 0.85280764, + "num_input_tokens_seen": 111861585, + "router_z_loss_clip": 0.87841797, + "router_z_loss_mlp": 0.15222168, + "step": 5207, + "time_per_iteration": 2.4276788234710693 + }, + { + "auxiliary_loss_clip": 0.0113371, + "auxiliary_loss_mlp": 0.01041442, + "balance_loss_clip": 1.04751539, + "balance_loss_mlp": 1.02710712, + "epoch": 0.3131218998947843, + "flos": 23292774368640.0, + "grad_norm": 4.886348666548161, + "language_loss": 0.71174657, + "learning_rate": 3.216428261810999e-06, + "loss": 0.7334981, + "num_input_tokens_seen": 111882950, + "router_z_loss_clip": 0.86035156, + "router_z_loss_mlp": 0.14318848, + "step": 5208, + "time_per_iteration": 3.869601249694824 + }, + { + "auxiliary_loss_clip": 0.0114034, + "auxiliary_loss_mlp": 0.01044912, + "balance_loss_clip": 1.05006516, + "balance_loss_mlp": 1.0283891, + "epoch": 0.3131820231474523, + "flos": 21139642437120.0, + "grad_norm": 1.9261144407423094, + "language_loss": 0.74467218, + "learning_rate": 3.2161190945832445e-06, + "loss": 0.76652473, + "num_input_tokens_seen": 111901640, + "router_z_loss_clip": 0.90234375, + "router_z_loss_mlp": 0.16516113, + "step": 5209, + "time_per_iteration": 2.4455764293670654 + }, + { + "auxiliary_loss_clip": 0.01140703, + "auxiliary_loss_mlp": 0.01041405, + "balance_loss_clip": 1.0491693, + "balance_loss_mlp": 1.02611077, + "epoch": 0.31324214640012027, + "flos": 23909863046400.0, + "grad_norm": 1.748708379528902, + "language_loss": 0.77405393, + "learning_rate": 3.2158098812401325e-06, + "loss": 0.79587507, + "num_input_tokens_seen": 111919615, + "router_z_loss_clip": 0.91552734, + "router_z_loss_mlp": 0.15283203, + "step": 5210, + "time_per_iteration": 2.4527742862701416 + }, + { + "auxiliary_loss_clip": 0.01138384, + "auxiliary_loss_mlp": 0.01039447, + "balance_loss_clip": 1.05199051, + "balance_loss_mlp": 1.02423549, + "epoch": 0.31330226965278823, + "flos": 22236929061120.0, + "grad_norm": 8.019793938217335, + "language_loss": 0.79223835, + "learning_rate": 3.2155006217933874e-06, + "loss": 0.8140167, + "num_input_tokens_seen": 111938485, + "router_z_loss_clip": 0.86279297, + "router_z_loss_mlp": 0.15197754, + "step": 5211, + "time_per_iteration": 2.4703774452209473 + }, + { + "auxiliary_loss_clip": 0.01159499, + "auxiliary_loss_mlp": 0.01034524, + "balance_loss_clip": 1.06747174, + "balance_loss_mlp": 1.01986718, + "epoch": 0.3133623929054562, + "flos": 19753993428480.0, + "grad_norm": 1.9629922400516524, + "language_loss": 0.79585993, + "learning_rate": 3.2151913162547367e-06, + "loss": 0.81780016, + "num_input_tokens_seen": 111956425, + "router_z_loss_clip": 0.91992188, + "router_z_loss_mlp": 0.14642334, + "step": 5212, + "time_per_iteration": 2.4729268550872803 + }, + { + "auxiliary_loss_clip": 0.01150468, + "auxiliary_loss_mlp": 0.01050249, + "balance_loss_clip": 1.05865562, + "balance_loss_mlp": 1.03372633, + "epoch": 0.31342251615812416, + "flos": 27162256849920.0, + "grad_norm": 2.7810554436919297, + "language_loss": 0.7089805, + "learning_rate": 3.2148819646359097e-06, + "loss": 0.73098767, + "num_input_tokens_seen": 111975915, + "router_z_loss_clip": 0.91845703, + "router_z_loss_mlp": 0.16540527, + "step": 5213, + "time_per_iteration": 2.4755935668945312 + }, + { + "auxiliary_loss_clip": 0.01153973, + "auxiliary_loss_mlp": 0.01037158, + "balance_loss_clip": 1.06313658, + "balance_loss_mlp": 1.02208948, + "epoch": 0.31348263941079213, + "flos": 20229809915520.0, + "grad_norm": 4.431724246968537, + "language_loss": 0.77973008, + "learning_rate": 3.2145725669486374e-06, + "loss": 0.80164135, + "num_input_tokens_seen": 111995055, + "router_z_loss_clip": 0.90917969, + "router_z_loss_mlp": 0.15075684, + "step": 5214, + "time_per_iteration": 2.492896556854248 + }, + { + "auxiliary_loss_clip": 0.01134052, + "auxiliary_loss_mlp": 0.01034651, + "balance_loss_clip": 1.04858017, + "balance_loss_mlp": 1.01957726, + "epoch": 0.3135427626634601, + "flos": 24607643627520.0, + "grad_norm": 1.8713264456655774, + "language_loss": 0.82452369, + "learning_rate": 3.2142631232046517e-06, + "loss": 0.84621078, + "num_input_tokens_seen": 112015830, + "router_z_loss_clip": 0.85498047, + "router_z_loss_mlp": 0.15081787, + "step": 5215, + "time_per_iteration": 2.5258967876434326 + }, + { + "auxiliary_loss_clip": 0.01152562, + "auxiliary_loss_mlp": 0.01034801, + "balance_loss_clip": 1.06259394, + "balance_loss_mlp": 1.01865971, + "epoch": 0.31360288591612806, + "flos": 20959873845120.0, + "grad_norm": 2.002781604302811, + "language_loss": 0.79280508, + "learning_rate": 3.213953633415686e-06, + "loss": 0.81467873, + "num_input_tokens_seen": 112035065, + "router_z_loss_clip": 0.89892578, + "router_z_loss_mlp": 0.16125488, + "step": 5216, + "time_per_iteration": 2.7991833686828613 + }, + { + "auxiliary_loss_clip": 0.01138249, + "auxiliary_loss_mlp": 0.01047039, + "balance_loss_clip": 1.04756677, + "balance_loss_mlp": 1.02950311, + "epoch": 0.313663009168796, + "flos": 26980513009920.0, + "grad_norm": 1.8445171947331103, + "language_loss": 0.68671972, + "learning_rate": 3.213644097593477e-06, + "loss": 0.70857263, + "num_input_tokens_seen": 112058405, + "router_z_loss_clip": 0.90771484, + "router_z_loss_mlp": 0.17541504, + "step": 5217, + "time_per_iteration": 2.514206647872925 + }, + { + "auxiliary_loss_clip": 0.01147449, + "auxiliary_loss_mlp": 0.01042717, + "balance_loss_clip": 1.05430162, + "balance_loss_mlp": 1.02682006, + "epoch": 0.313723132421464, + "flos": 18040911016320.0, + "grad_norm": 1.8873404273490115, + "language_loss": 0.80666429, + "learning_rate": 3.2133345157497624e-06, + "loss": 0.82856596, + "num_input_tokens_seen": 112076420, + "router_z_loss_clip": 0.93115234, + "router_z_loss_mlp": 0.15887451, + "step": 5218, + "time_per_iteration": 2.464121103286743 + }, + { + "auxiliary_loss_clip": 0.01146516, + "auxiliary_loss_mlp": 0.01040748, + "balance_loss_clip": 1.05481875, + "balance_loss_mlp": 1.02433276, + "epoch": 0.31378325567413196, + "flos": 22488913946880.0, + "grad_norm": 2.6131152207646466, + "language_loss": 0.6813184, + "learning_rate": 3.2130248878962813e-06, + "loss": 0.70319098, + "num_input_tokens_seen": 112090775, + "router_z_loss_clip": 0.91845703, + "router_z_loss_mlp": 0.16418457, + "step": 5219, + "time_per_iteration": 2.440965175628662 + }, + { + "auxiliary_loss_clip": 0.01145103, + "auxiliary_loss_mlp": 0.01041327, + "balance_loss_clip": 1.05613256, + "balance_loss_mlp": 1.02629447, + "epoch": 0.3138433789267999, + "flos": 22419247518720.0, + "grad_norm": 2.127919817531186, + "language_loss": 0.79673487, + "learning_rate": 3.2127152140447747e-06, + "loss": 0.8185991, + "num_input_tokens_seen": 112110980, + "router_z_loss_clip": 0.89013672, + "router_z_loss_mlp": 0.15039062, + "step": 5220, + "time_per_iteration": 2.536029577255249 + }, + { + "auxiliary_loss_clip": 0.01140644, + "auxiliary_loss_mlp": 0.01037451, + "balance_loss_clip": 1.05253577, + "balance_loss_mlp": 1.0221926, + "epoch": 0.3139035021794679, + "flos": 13005912026880.0, + "grad_norm": 1.8186273528532229, + "language_loss": 0.72747636, + "learning_rate": 3.212405494206986e-06, + "loss": 0.74925727, + "num_input_tokens_seen": 112129020, + "router_z_loss_clip": 0.88134766, + "router_z_loss_mlp": 0.15258789, + "step": 5221, + "time_per_iteration": 2.4101955890655518 + }, + { + "auxiliary_loss_clip": 0.01135839, + "auxiliary_loss_mlp": 0.01037854, + "balance_loss_clip": 1.04892099, + "balance_loss_mlp": 1.02277982, + "epoch": 0.31396362543213585, + "flos": 16945994689920.0, + "grad_norm": 2.0212512628562274, + "language_loss": 0.82125652, + "learning_rate": 3.2120957283946588e-06, + "loss": 0.84299344, + "num_input_tokens_seen": 112147865, + "router_z_loss_clip": 0.86914062, + "router_z_loss_mlp": 0.1506958, + "step": 5222, + "time_per_iteration": 2.468538761138916 + }, + { + "auxiliary_loss_clip": 0.01156421, + "auxiliary_loss_mlp": 0.01039846, + "balance_loss_clip": 1.063519, + "balance_loss_mlp": 1.02316833, + "epoch": 0.31402374868480387, + "flos": 20156731695360.0, + "grad_norm": 2.049143524862699, + "language_loss": 0.70160961, + "learning_rate": 3.2117859166195407e-06, + "loss": 0.72357231, + "num_input_tokens_seen": 112166745, + "router_z_loss_clip": 0.92919922, + "router_z_loss_mlp": 0.16699219, + "step": 5223, + "time_per_iteration": 2.4296271800994873 + }, + { + "auxiliary_loss_clip": 0.01147231, + "auxiliary_loss_mlp": 0.01031886, + "balance_loss_clip": 1.05788374, + "balance_loss_mlp": 1.01697278, + "epoch": 0.31408387193747184, + "flos": 21251073404160.0, + "grad_norm": 1.558050903391408, + "language_loss": 0.8025763, + "learning_rate": 3.211476058893379e-06, + "loss": 0.82436752, + "num_input_tokens_seen": 112185895, + "router_z_loss_clip": 0.89355469, + "router_z_loss_mlp": 0.14916992, + "step": 5224, + "time_per_iteration": 2.4952049255371094 + }, + { + "auxiliary_loss_clip": 0.01159759, + "auxiliary_loss_mlp": 0.01045845, + "balance_loss_clip": 1.06371069, + "balance_loss_mlp": 1.02933466, + "epoch": 0.3141439951901398, + "flos": 27484267299840.0, + "grad_norm": 2.11202231084487, + "language_loss": 0.57698286, + "learning_rate": 3.2111661552279243e-06, + "loss": 0.59903896, + "num_input_tokens_seen": 112204465, + "router_z_loss_clip": 0.96044922, + "router_z_loss_mlp": 0.16503906, + "step": 5225, + "time_per_iteration": 2.5259413719177246 + }, + { + "auxiliary_loss_clip": 0.01149064, + "auxiliary_loss_mlp": 0.01034208, + "balance_loss_clip": 1.06112325, + "balance_loss_mlp": 1.01937294, + "epoch": 0.31420411844280777, + "flos": 17852235851520.0, + "grad_norm": 1.8687444254829184, + "language_loss": 0.81462514, + "learning_rate": 3.2108562056349273e-06, + "loss": 0.83645785, + "num_input_tokens_seen": 112221635, + "router_z_loss_clip": 0.87841797, + "router_z_loss_mlp": 0.14825439, + "step": 5226, + "time_per_iteration": 2.4721579551696777 + }, + { + "auxiliary_loss_clip": 0.011434, + "auxiliary_loss_mlp": 0.01058491, + "balance_loss_clip": 1.0524745, + "balance_loss_mlp": 1.04014444, + "epoch": 0.31426424169547573, + "flos": 21616967295360.0, + "grad_norm": 2.5314383189422487, + "language_loss": 0.74086368, + "learning_rate": 3.210546210126141e-06, + "loss": 0.76288259, + "num_input_tokens_seen": 112241240, + "router_z_loss_clip": 0.90869141, + "router_z_loss_mlp": 0.18359375, + "step": 5227, + "time_per_iteration": 2.4675328731536865 + }, + { + "auxiliary_loss_clip": 0.01152261, + "auxiliary_loss_mlp": 0.01037172, + "balance_loss_clip": 1.06025839, + "balance_loss_mlp": 1.02135277, + "epoch": 0.3143243649481437, + "flos": 30920631586560.0, + "grad_norm": 2.076341563755556, + "language_loss": 0.67763174, + "learning_rate": 3.2102361687133213e-06, + "loss": 0.69952607, + "num_input_tokens_seen": 112262350, + "router_z_loss_clip": 0.92041016, + "router_z_loss_mlp": 0.1583252, + "step": 5228, + "time_per_iteration": 2.514986753463745 + }, + { + "auxiliary_loss_clip": 0.0114776, + "auxiliary_loss_mlp": 0.01040662, + "balance_loss_clip": 1.05691218, + "balance_loss_mlp": 1.02614248, + "epoch": 0.31438448820081166, + "flos": 22821411168000.0, + "grad_norm": 1.9012030852060051, + "language_loss": 0.79698932, + "learning_rate": 3.2099260814082254e-06, + "loss": 0.81887352, + "num_input_tokens_seen": 112283710, + "router_z_loss_clip": 0.90966797, + "router_z_loss_mlp": 0.1451416, + "step": 5229, + "time_per_iteration": 2.5783166885375977 + }, + { + "auxiliary_loss_clip": 0.01150111, + "auxiliary_loss_mlp": 0.01033745, + "balance_loss_clip": 1.06160927, + "balance_loss_mlp": 1.0181402, + "epoch": 0.3144446114534796, + "flos": 23292127923840.0, + "grad_norm": 1.8442628922268307, + "language_loss": 0.69800848, + "learning_rate": 3.209615948222611e-06, + "loss": 0.71984696, + "num_input_tokens_seen": 112304285, + "router_z_loss_clip": 0.88476562, + "router_z_loss_mlp": 0.15600586, + "step": 5230, + "time_per_iteration": 2.7369563579559326 + }, + { + "auxiliary_loss_clip": 0.0114504, + "auxiliary_loss_mlp": 0.01042733, + "balance_loss_clip": 1.05204368, + "balance_loss_mlp": 1.0261631, + "epoch": 0.3145047347061476, + "flos": 31355976424320.0, + "grad_norm": 1.7523139193522368, + "language_loss": 0.79577583, + "learning_rate": 3.209305769168239e-06, + "loss": 0.81765354, + "num_input_tokens_seen": 112325110, + "router_z_loss_clip": 0.92919922, + "router_z_loss_mlp": 0.16564941, + "step": 5231, + "time_per_iteration": 2.4980626106262207 + }, + { + "auxiliary_loss_clip": 0.01150012, + "auxiliary_loss_mlp": 0.01041176, + "balance_loss_clip": 1.06051314, + "balance_loss_mlp": 1.02420056, + "epoch": 0.31456485795881556, + "flos": 10889552643840.0, + "grad_norm": 2.0379822865586026, + "language_loss": 0.85186875, + "learning_rate": 3.2089955442568704e-06, + "loss": 0.87378061, + "num_input_tokens_seen": 112339855, + "router_z_loss_clip": 0.89453125, + "router_z_loss_mlp": 0.16955566, + "step": 5232, + "time_per_iteration": 2.454901933670044 + }, + { + "auxiliary_loss_clip": 0.01136865, + "auxiliary_loss_mlp": 0.01054701, + "balance_loss_clip": 1.04908276, + "balance_loss_mlp": 1.03730857, + "epoch": 0.3146249812114835, + "flos": 17092438439040.0, + "grad_norm": 1.823647033446072, + "language_loss": 0.80186015, + "learning_rate": 3.2086852735002692e-06, + "loss": 0.82377577, + "num_input_tokens_seen": 112358480, + "router_z_loss_clip": 0.87792969, + "router_z_loss_mlp": 0.1739502, + "step": 5233, + "time_per_iteration": 2.454796314239502 + }, + { + "auxiliary_loss_clip": 0.01149306, + "auxiliary_loss_mlp": 0.01038843, + "balance_loss_clip": 1.05725408, + "balance_loss_mlp": 1.02282143, + "epoch": 0.3146851044641515, + "flos": 55291442889600.0, + "grad_norm": 7.779587654689428, + "language_loss": 0.7092222, + "learning_rate": 3.2083749569102024e-06, + "loss": 0.73110366, + "num_input_tokens_seen": 112382350, + "router_z_loss_clip": 0.92138672, + "router_z_loss_mlp": 0.16015625, + "step": 5234, + "time_per_iteration": 2.7726471424102783 + }, + { + "auxiliary_loss_clip": 0.01140784, + "auxiliary_loss_mlp": 0.01038726, + "balance_loss_clip": 1.05077124, + "balance_loss_mlp": 1.02147686, + "epoch": 0.31474522771681945, + "flos": 27015884928000.0, + "grad_norm": 1.867094189355821, + "language_loss": 0.72266787, + "learning_rate": 3.2080645944984356e-06, + "loss": 0.74446303, + "num_input_tokens_seen": 112400260, + "router_z_loss_clip": 0.90087891, + "router_z_loss_mlp": 0.17248535, + "step": 5235, + "time_per_iteration": 2.5532922744750977 + }, + { + "auxiliary_loss_clip": 0.01150021, + "auxiliary_loss_mlp": 0.01037036, + "balance_loss_clip": 1.05811787, + "balance_loss_mlp": 1.02174091, + "epoch": 0.3148053509694875, + "flos": 21251935330560.0, + "grad_norm": 1.8694806610070236, + "language_loss": 0.79041201, + "learning_rate": 3.2077541862767384e-06, + "loss": 0.81228256, + "num_input_tokens_seen": 112419400, + "router_z_loss_clip": 0.91845703, + "router_z_loss_mlp": 0.1529541, + "step": 5236, + "time_per_iteration": 3.8634862899780273 + }, + { + "auxiliary_loss_clip": 0.01153467, + "auxiliary_loss_mlp": 0.01039248, + "balance_loss_clip": 1.05821288, + "balance_loss_mlp": 1.02221322, + "epoch": 0.31486547422215544, + "flos": 31248675521280.0, + "grad_norm": 1.6309148671166938, + "language_loss": 0.75689852, + "learning_rate": 3.207443732256881e-06, + "loss": 0.7788257, + "num_input_tokens_seen": 112440825, + "router_z_loss_clip": 0.95263672, + "router_z_loss_mlp": 0.17028809, + "step": 5237, + "time_per_iteration": 2.5501341819763184 + }, + { + "auxiliary_loss_clip": 0.01144497, + "auxiliary_loss_mlp": 0.01040153, + "balance_loss_clip": 1.05647707, + "balance_loss_mlp": 1.02554417, + "epoch": 0.3149255974748234, + "flos": 19828615933440.0, + "grad_norm": 2.0046629372571485, + "language_loss": 0.79859829, + "learning_rate": 3.2071332324506372e-06, + "loss": 0.82044476, + "num_input_tokens_seen": 112459180, + "router_z_loss_clip": 0.87939453, + "router_z_loss_mlp": 0.1461792, + "step": 5238, + "time_per_iteration": 2.461418390274048 + }, + { + "auxiliary_loss_clip": 0.01087469, + "auxiliary_loss_mlp": 0.01007224, + "balance_loss_clip": 1.05705166, + "balance_loss_mlp": 1.00506973, + "epoch": 0.31498572072749137, + "flos": 67683965339520.0, + "grad_norm": 0.8329879492019097, + "language_loss": 0.67928243, + "learning_rate": 3.2068226868697795e-06, + "loss": 0.70022941, + "num_input_tokens_seen": 112516680, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.02157593, + "step": 5239, + "time_per_iteration": 3.100184679031372 + }, + { + "auxiliary_loss_clip": 0.01150897, + "auxiliary_loss_mlp": 0.0104603, + "balance_loss_clip": 1.05696416, + "balance_loss_mlp": 1.02788651, + "epoch": 0.31504584398015933, + "flos": 19793136274560.0, + "grad_norm": 2.4469504028822513, + "language_loss": 0.82641214, + "learning_rate": 3.2065120955260846e-06, + "loss": 0.8483814, + "num_input_tokens_seen": 112535895, + "router_z_loss_clip": 0.93945312, + "router_z_loss_mlp": 0.18127441, + "step": 5240, + "time_per_iteration": 2.46512508392334 + }, + { + "auxiliary_loss_clip": 0.01147262, + "auxiliary_loss_mlp": 0.01045653, + "balance_loss_clip": 1.05785012, + "balance_loss_mlp": 1.02828407, + "epoch": 0.3151059672328273, + "flos": 26615409217920.0, + "grad_norm": 2.151381388580013, + "language_loss": 0.81339711, + "learning_rate": 3.2062014584313302e-06, + "loss": 0.83532619, + "num_input_tokens_seen": 112557490, + "router_z_loss_clip": 0.89306641, + "router_z_loss_mlp": 0.17382812, + "step": 5241, + "time_per_iteration": 2.5222814083099365 + }, + { + "auxiliary_loss_clip": 0.01151878, + "auxiliary_loss_mlp": 0.01036602, + "balance_loss_clip": 1.06316447, + "balance_loss_mlp": 1.02084231, + "epoch": 0.31516609048549526, + "flos": 24204438483840.0, + "grad_norm": 1.6241594723995272, + "language_loss": 0.74300319, + "learning_rate": 3.2058907755972956e-06, + "loss": 0.76488799, + "num_input_tokens_seen": 112577075, + "router_z_loss_clip": 0.88574219, + "router_z_loss_mlp": 0.15771484, + "step": 5242, + "time_per_iteration": 2.781240940093994 + }, + { + "auxiliary_loss_clip": 0.01147633, + "auxiliary_loss_mlp": 0.01038051, + "balance_loss_clip": 1.05762744, + "balance_loss_mlp": 1.02093232, + "epoch": 0.31522621373816323, + "flos": 25958710817280.0, + "grad_norm": 1.9071429358161944, + "language_loss": 0.73622632, + "learning_rate": 3.2055800470357626e-06, + "loss": 0.75808311, + "num_input_tokens_seen": 112597620, + "router_z_loss_clip": 0.89990234, + "router_z_loss_mlp": 0.17102051, + "step": 5243, + "time_per_iteration": 2.763390302658081 + }, + { + "auxiliary_loss_clip": 0.01140225, + "auxiliary_loss_mlp": 0.01041395, + "balance_loss_clip": 1.05030704, + "balance_loss_mlp": 1.02580798, + "epoch": 0.3152863369908312, + "flos": 21908813299200.0, + "grad_norm": 1.8453393531950804, + "language_loss": 0.64541143, + "learning_rate": 3.205269272758513e-06, + "loss": 0.66722763, + "num_input_tokens_seen": 112617150, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.15594482, + "step": 5244, + "time_per_iteration": 2.4544570446014404 + }, + { + "auxiliary_loss_clip": 0.01148605, + "auxiliary_loss_mlp": 0.01043963, + "balance_loss_clip": 1.05596638, + "balance_loss_mlp": 1.02872753, + "epoch": 0.31534646024349916, + "flos": 16281072074880.0, + "grad_norm": 2.8914316497846575, + "language_loss": 0.91646278, + "learning_rate": 3.2049584527773313e-06, + "loss": 0.93838841, + "num_input_tokens_seen": 112631090, + "router_z_loss_clip": 0.92578125, + "router_z_loss_mlp": 0.15234375, + "step": 5245, + "time_per_iteration": 2.507179021835327 + }, + { + "auxiliary_loss_clip": 0.0115069, + "auxiliary_loss_mlp": 0.01045408, + "balance_loss_clip": 1.05705106, + "balance_loss_mlp": 1.02895761, + "epoch": 0.3154065834961671, + "flos": 24717243000960.0, + "grad_norm": 2.170084447000311, + "language_loss": 0.75359833, + "learning_rate": 3.2046475871040048e-06, + "loss": 0.77555931, + "num_input_tokens_seen": 112651220, + "router_z_loss_clip": 0.93554688, + "router_z_loss_mlp": 0.16467285, + "step": 5246, + "time_per_iteration": 3.934692144393921 + }, + { + "auxiliary_loss_clip": 0.01137829, + "auxiliary_loss_mlp": 0.0104288, + "balance_loss_clip": 1.04642034, + "balance_loss_mlp": 1.02667952, + "epoch": 0.3154667067488351, + "flos": 35371148469120.0, + "grad_norm": 1.599079053441663, + "language_loss": 0.61988521, + "learning_rate": 3.204336675750321e-06, + "loss": 0.64169228, + "num_input_tokens_seen": 112671560, + "router_z_loss_clip": 0.91308594, + "router_z_loss_mlp": 0.1619873, + "step": 5247, + "time_per_iteration": 4.111630439758301 + }, + { + "auxiliary_loss_clip": 0.01144071, + "auxiliary_loss_mlp": 0.01038845, + "balance_loss_clip": 1.05193782, + "balance_loss_mlp": 1.02291834, + "epoch": 0.31552683000150306, + "flos": 17456464823040.0, + "grad_norm": 2.114652490252292, + "language_loss": 0.82563555, + "learning_rate": 3.2040257187280693e-06, + "loss": 0.84746474, + "num_input_tokens_seen": 112689790, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.15917969, + "step": 5248, + "time_per_iteration": 2.471238613128662 + }, + { + "auxiliary_loss_clip": 0.01140733, + "auxiliary_loss_mlp": 0.01057295, + "balance_loss_clip": 1.05087113, + "balance_loss_mlp": 1.03791201, + "epoch": 0.3155869532541711, + "flos": 18405763413120.0, + "grad_norm": 1.7456136050408244, + "language_loss": 0.84602511, + "learning_rate": 3.2037147160490423e-06, + "loss": 0.86800539, + "num_input_tokens_seen": 112708265, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.19396973, + "step": 5249, + "time_per_iteration": 2.4176647663116455 + }, + { + "auxiliary_loss_clip": 0.01147087, + "auxiliary_loss_mlp": 0.01047206, + "balance_loss_clip": 1.05231309, + "balance_loss_mlp": 1.02939653, + "epoch": 0.31564707650683904, + "flos": 21579763783680.0, + "grad_norm": 2.246700851899081, + "language_loss": 0.85642201, + "learning_rate": 3.2034036677250322e-06, + "loss": 0.87836492, + "num_input_tokens_seen": 112727820, + "router_z_loss_clip": 0.94873047, + "router_z_loss_mlp": 0.17810059, + "step": 5250, + "time_per_iteration": 2.443371534347534 + }, + { + "auxiliary_loss_clip": 0.01149918, + "auxiliary_loss_mlp": 0.01046752, + "balance_loss_clip": 1.05819058, + "balance_loss_mlp": 1.02964556, + "epoch": 0.315707199759507, + "flos": 21030976817280.0, + "grad_norm": 2.256135284797609, + "language_loss": 0.68902814, + "learning_rate": 3.203092573767835e-06, + "loss": 0.7109949, + "num_input_tokens_seen": 112743140, + "router_z_loss_clip": 0.91699219, + "router_z_loss_mlp": 0.17102051, + "step": 5251, + "time_per_iteration": 2.4540135860443115 + }, + { + "auxiliary_loss_clip": 0.01149563, + "auxiliary_loss_mlp": 0.01044909, + "balance_loss_clip": 1.0597508, + "balance_loss_mlp": 1.02853, + "epoch": 0.31576732301217497, + "flos": 26828861788800.0, + "grad_norm": 1.798201837223645, + "language_loss": 0.79001099, + "learning_rate": 3.202781434189246e-06, + "loss": 0.81195569, + "num_input_tokens_seen": 112764705, + "router_z_loss_clip": 0.89746094, + "router_z_loss_mlp": 0.16369629, + "step": 5252, + "time_per_iteration": 2.6793086528778076 + }, + { + "auxiliary_loss_clip": 0.01151781, + "auxiliary_loss_mlp": 0.01045632, + "balance_loss_clip": 1.06203759, + "balance_loss_mlp": 1.02875769, + "epoch": 0.31582744626484294, + "flos": 22711165349760.0, + "grad_norm": 2.2609380427348182, + "language_loss": 0.74140251, + "learning_rate": 3.202470249001066e-06, + "loss": 0.76337659, + "num_input_tokens_seen": 112785310, + "router_z_loss_clip": 0.89697266, + "router_z_loss_mlp": 0.16888428, + "step": 5253, + "time_per_iteration": 4.013114929199219 + }, + { + "auxiliary_loss_clip": 0.01145556, + "auxiliary_loss_mlp": 0.01038921, + "balance_loss_clip": 1.05170155, + "balance_loss_mlp": 1.02216053, + "epoch": 0.3158875695175109, + "flos": 23951914894080.0, + "grad_norm": 1.9710295360341321, + "language_loss": 0.73297489, + "learning_rate": 3.2021590182150924e-06, + "loss": 0.75481969, + "num_input_tokens_seen": 112802905, + "router_z_loss_clip": 0.93896484, + "router_z_loss_mlp": 0.16772461, + "step": 5254, + "time_per_iteration": 2.8372395038604736 + }, + { + "auxiliary_loss_clip": 0.01145808, + "auxiliary_loss_mlp": 0.01046053, + "balance_loss_clip": 1.05314863, + "balance_loss_mlp": 1.03013873, + "epoch": 0.31594769277017887, + "flos": 13261883322240.0, + "grad_norm": 1.7280547624033062, + "language_loss": 0.77532238, + "learning_rate": 3.201847741843128e-06, + "loss": 0.79724097, + "num_input_tokens_seen": 112820305, + "router_z_loss_clip": 0.92724609, + "router_z_loss_mlp": 0.15924072, + "step": 5255, + "time_per_iteration": 2.521998405456543 + }, + { + "auxiliary_loss_clip": 0.01143457, + "auxiliary_loss_mlp": 0.01046608, + "balance_loss_clip": 1.05374897, + "balance_loss_mlp": 1.02886987, + "epoch": 0.31600781602284683, + "flos": 23368258800000.0, + "grad_norm": 1.835547362870954, + "language_loss": 0.77854133, + "learning_rate": 3.2015364198969772e-06, + "loss": 0.80044198, + "num_input_tokens_seen": 112841185, + "router_z_loss_clip": 0.89697266, + "router_z_loss_mlp": 0.17749023, + "step": 5256, + "time_per_iteration": 2.434910535812378 + }, + { + "auxiliary_loss_clip": 0.01148471, + "auxiliary_loss_mlp": 0.01045167, + "balance_loss_clip": 1.06099844, + "balance_loss_mlp": 1.03024197, + "epoch": 0.3160679392755148, + "flos": 19828580019840.0, + "grad_norm": 1.5578523686885872, + "language_loss": 0.71381354, + "learning_rate": 3.2012250523884453e-06, + "loss": 0.7357499, + "num_input_tokens_seen": 112860570, + "router_z_loss_clip": 0.87451172, + "router_z_loss_mlp": 0.14923096, + "step": 5257, + "time_per_iteration": 2.455003499984741 + }, + { + "auxiliary_loss_clip": 0.01163171, + "auxiliary_loss_mlp": 0.01039109, + "balance_loss_clip": 1.06994247, + "balance_loss_mlp": 1.02171671, + "epoch": 0.31612806252818276, + "flos": 20193216935040.0, + "grad_norm": 2.0519994399100754, + "language_loss": 0.76643109, + "learning_rate": 3.2009136393293393e-06, + "loss": 0.78845394, + "num_input_tokens_seen": 112877975, + "router_z_loss_clip": 0.93164062, + "router_z_loss_mlp": 0.17382812, + "step": 5258, + "time_per_iteration": 2.447801351547241 + }, + { + "auxiliary_loss_clip": 0.0115586, + "auxiliary_loss_mlp": 0.01044621, + "balance_loss_clip": 1.06226659, + "balance_loss_mlp": 1.02795553, + "epoch": 0.31618818578085073, + "flos": 24235967646720.0, + "grad_norm": 1.9413823841098727, + "language_loss": 0.72806954, + "learning_rate": 3.200602180731467e-06, + "loss": 0.75007439, + "num_input_tokens_seen": 112896170, + "router_z_loss_clip": 0.93603516, + "router_z_loss_mlp": 0.16662598, + "step": 5259, + "time_per_iteration": 2.564215898513794 + }, + { + "auxiliary_loss_clip": 0.01158491, + "auxiliary_loss_mlp": 0.01044914, + "balance_loss_clip": 1.06447649, + "balance_loss_mlp": 1.02921414, + "epoch": 0.3162483090335187, + "flos": 25081844002560.0, + "grad_norm": 1.8739615717293974, + "language_loss": 0.6664089, + "learning_rate": 3.20029067660664e-06, + "loss": 0.68844295, + "num_input_tokens_seen": 112916180, + "router_z_loss_clip": 0.93847656, + "router_z_loss_mlp": 0.15686035, + "step": 5260, + "time_per_iteration": 2.476919651031494 + }, + { + "auxiliary_loss_clip": 0.01149024, + "auxiliary_loss_mlp": 0.01034988, + "balance_loss_clip": 1.05613613, + "balance_loss_mlp": 1.01916885, + "epoch": 0.31630843228618666, + "flos": 26323383646080.0, + "grad_norm": 2.3771979619271075, + "language_loss": 0.7259928, + "learning_rate": 3.1999791269666706e-06, + "loss": 0.74783283, + "num_input_tokens_seen": 112936745, + "router_z_loss_clip": 0.92822266, + "router_z_loss_mlp": 0.15820312, + "step": 5261, + "time_per_iteration": 2.500648021697998 + }, + { + "auxiliary_loss_clip": 0.01088594, + "auxiliary_loss_mlp": 0.01006008, + "balance_loss_clip": 1.05837405, + "balance_loss_mlp": 1.00386786, + "epoch": 0.3163685555388547, + "flos": 66758441552640.0, + "grad_norm": 0.7454849536944109, + "language_loss": 0.50565875, + "learning_rate": 3.1996675318233716e-06, + "loss": 0.52660471, + "num_input_tokens_seen": 112994845, + "router_z_loss_clip": 0.30224609, + "router_z_loss_mlp": 0.02139282, + "step": 5262, + "time_per_iteration": 3.091625452041626 + }, + { + "auxiliary_loss_clip": 0.01148249, + "auxiliary_loss_mlp": 0.01039446, + "balance_loss_clip": 1.05658126, + "balance_loss_mlp": 1.02341259, + "epoch": 0.31642867879152264, + "flos": 25995662933760.0, + "grad_norm": 1.7913590444845846, + "language_loss": 0.85262489, + "learning_rate": 3.19935589118856e-06, + "loss": 0.87450182, + "num_input_tokens_seen": 113015125, + "router_z_loss_clip": 0.91796875, + "router_z_loss_mlp": 0.16027832, + "step": 5263, + "time_per_iteration": 2.5089943408966064 + }, + { + "auxiliary_loss_clip": 0.01144096, + "auxiliary_loss_mlp": 0.01040122, + "balance_loss_clip": 1.05480027, + "balance_loss_mlp": 1.02510142, + "epoch": 0.3164888020441906, + "flos": 25774955815680.0, + "grad_norm": 1.6324085670367394, + "language_loss": 0.82294738, + "learning_rate": 3.1990442050740535e-06, + "loss": 0.84478951, + "num_input_tokens_seen": 113035535, + "router_z_loss_clip": 0.89355469, + "router_z_loss_mlp": 0.15014648, + "step": 5264, + "time_per_iteration": 2.4830098152160645 + }, + { + "auxiliary_loss_clip": 0.01154303, + "auxiliary_loss_mlp": 0.01035351, + "balance_loss_clip": 1.0627439, + "balance_loss_mlp": 1.01853073, + "epoch": 0.3165489252968586, + "flos": 19756220071680.0, + "grad_norm": 1.7468881018689486, + "language_loss": 0.79326057, + "learning_rate": 3.19873247349167e-06, + "loss": 0.81515718, + "num_input_tokens_seen": 113052720, + "router_z_loss_clip": 0.91455078, + "router_z_loss_mlp": 0.16821289, + "step": 5265, + "time_per_iteration": 2.4688005447387695 + }, + { + "auxiliary_loss_clip": 0.01147778, + "auxiliary_loss_mlp": 0.01040112, + "balance_loss_clip": 1.05683398, + "balance_loss_mlp": 1.02356577, + "epoch": 0.31660904854952654, + "flos": 23183929180800.0, + "grad_norm": 1.6253876666554457, + "language_loss": 0.74748921, + "learning_rate": 3.1984206964532307e-06, + "loss": 0.76936805, + "num_input_tokens_seen": 113071435, + "router_z_loss_clip": 0.90917969, + "router_z_loss_mlp": 0.16564941, + "step": 5266, + "time_per_iteration": 2.4839231967926025 + }, + { + "auxiliary_loss_clip": 0.01163816, + "auxiliary_loss_mlp": 0.01039412, + "balance_loss_clip": 1.06770003, + "balance_loss_mlp": 1.02325916, + "epoch": 0.3166691718021945, + "flos": 20408501099520.0, + "grad_norm": 2.3888945453039305, + "language_loss": 0.79589891, + "learning_rate": 3.1981088739705585e-06, + "loss": 0.81793118, + "num_input_tokens_seen": 113088645, + "router_z_loss_clip": 0.96142578, + "router_z_loss_mlp": 0.16149902, + "step": 5267, + "time_per_iteration": 2.470773220062256 + }, + { + "auxiliary_loss_clip": 0.01077612, + "auxiliary_loss_mlp": 0.0100413, + "balance_loss_clip": 1.04566622, + "balance_loss_mlp": 1.0019958, + "epoch": 0.31672929505486247, + "flos": 70144781172480.0, + "grad_norm": 0.736784612136107, + "language_loss": 0.57813752, + "learning_rate": 3.197797006055478e-06, + "loss": 0.59895492, + "num_input_tokens_seen": 113152775, + "router_z_loss_clip": 0.31982422, + "router_z_loss_mlp": 0.0213623, + "step": 5268, + "time_per_iteration": 3.3148226737976074 + }, + { + "auxiliary_loss_clip": 0.01146512, + "auxiliary_loss_mlp": 0.01037927, + "balance_loss_clip": 1.05678749, + "balance_loss_mlp": 1.02178574, + "epoch": 0.31678941830753043, + "flos": 14355758154240.0, + "grad_norm": 2.4381669812966695, + "language_loss": 0.73001862, + "learning_rate": 3.197485092719815e-06, + "loss": 0.75186306, + "num_input_tokens_seen": 113171410, + "router_z_loss_clip": 0.89697266, + "router_z_loss_mlp": 0.16125488, + "step": 5269, + "time_per_iteration": 2.5177619457244873 + }, + { + "auxiliary_loss_clip": 0.01156774, + "auxiliary_loss_mlp": 0.01049866, + "balance_loss_clip": 1.06115103, + "balance_loss_mlp": 1.03357029, + "epoch": 0.3168495415601984, + "flos": 22747722416640.0, + "grad_norm": 1.8180410550371369, + "language_loss": 0.79774451, + "learning_rate": 3.1971731339753973e-06, + "loss": 0.81981087, + "num_input_tokens_seen": 113189965, + "router_z_loss_clip": 0.95556641, + "router_z_loss_mlp": 0.16308594, + "step": 5270, + "time_per_iteration": 2.4795820713043213 + }, + { + "auxiliary_loss_clip": 0.01155598, + "auxiliary_loss_mlp": 0.01045555, + "balance_loss_clip": 1.06273222, + "balance_loss_mlp": 1.02857912, + "epoch": 0.31690966481286637, + "flos": 20115254465280.0, + "grad_norm": 2.012443607344736, + "language_loss": 0.79180443, + "learning_rate": 3.1968611298340545e-06, + "loss": 0.81381595, + "num_input_tokens_seen": 113206355, + "router_z_loss_clip": 0.92822266, + "router_z_loss_mlp": 0.16986084, + "step": 5271, + "time_per_iteration": 2.445511817932129 + }, + { + "auxiliary_loss_clip": 0.01139803, + "auxiliary_loss_mlp": 0.01039938, + "balance_loss_clip": 1.05008006, + "balance_loss_mlp": 1.02337956, + "epoch": 0.31696978806553433, + "flos": 21178928937600.0, + "grad_norm": 2.0028199225727272, + "language_loss": 0.72898912, + "learning_rate": 3.1965490803076173e-06, + "loss": 0.75078654, + "num_input_tokens_seen": 113225440, + "router_z_loss_clip": 0.89697266, + "router_z_loss_mlp": 0.16552734, + "step": 5272, + "time_per_iteration": 2.514052629470825 + }, + { + "auxiliary_loss_clip": 0.01147645, + "auxiliary_loss_mlp": 0.01045812, + "balance_loss_clip": 1.05124629, + "balance_loss_mlp": 1.0273819, + "epoch": 0.3170299113182023, + "flos": 42997030439040.0, + "grad_norm": 2.5675528497178846, + "language_loss": 0.69494665, + "learning_rate": 3.1962369854079194e-06, + "loss": 0.71688122, + "num_input_tokens_seen": 113248840, + "router_z_loss_clip": 0.96289062, + "router_z_loss_mlp": 0.18432617, + "step": 5273, + "time_per_iteration": 2.6188418865203857 + }, + { + "auxiliary_loss_clip": 0.01144289, + "auxiliary_loss_mlp": 0.01035069, + "balance_loss_clip": 1.05438638, + "balance_loss_mlp": 1.01888037, + "epoch": 0.31709003457087026, + "flos": 24460158384000.0, + "grad_norm": 1.7578128460626141, + "language_loss": 0.673334, + "learning_rate": 3.195924845146795e-06, + "loss": 0.69512755, + "num_input_tokens_seen": 113269630, + "router_z_loss_clip": 0.89892578, + "router_z_loss_mlp": 0.16210938, + "step": 5274, + "time_per_iteration": 2.466686487197876 + }, + { + "auxiliary_loss_clip": 0.01139669, + "auxiliary_loss_mlp": 0.01041526, + "balance_loss_clip": 1.05353999, + "balance_loss_mlp": 1.02481318, + "epoch": 0.3171501578235382, + "flos": 24135310759680.0, + "grad_norm": 1.4538736594960302, + "language_loss": 0.80795777, + "learning_rate": 3.195612659536081e-06, + "loss": 0.82976973, + "num_input_tokens_seen": 113291200, + "router_z_loss_clip": 0.86083984, + "router_z_loss_mlp": 0.16723633, + "step": 5275, + "time_per_iteration": 2.4614620208740234 + }, + { + "auxiliary_loss_clip": 0.01154842, + "auxiliary_loss_mlp": 0.01042778, + "balance_loss_clip": 1.06264615, + "balance_loss_mlp": 1.02629089, + "epoch": 0.31721028107620625, + "flos": 18879712392960.0, + "grad_norm": 2.1108499933004854, + "language_loss": 0.72868919, + "learning_rate": 3.1953004285876147e-06, + "loss": 0.75066543, + "num_input_tokens_seen": 113310170, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.16479492, + "step": 5276, + "time_per_iteration": 2.4379453659057617 + }, + { + "auxiliary_loss_clip": 0.01139737, + "auxiliary_loss_mlp": 0.0104073, + "balance_loss_clip": 1.05157852, + "balance_loss_mlp": 1.02530432, + "epoch": 0.3172704043288742, + "flos": 23147874904320.0, + "grad_norm": 1.6324946014564232, + "language_loss": 0.7812922, + "learning_rate": 3.194988152313236e-06, + "loss": 0.80309683, + "num_input_tokens_seen": 113331140, + "router_z_loss_clip": 0.88183594, + "router_z_loss_mlp": 0.1541748, + "step": 5277, + "time_per_iteration": 2.4658453464508057 + }, + { + "auxiliary_loss_clip": 0.01149207, + "auxiliary_loss_mlp": 0.01043187, + "balance_loss_clip": 1.05559826, + "balance_loss_mlp": 1.02566338, + "epoch": 0.3173305275815422, + "flos": 17858520731520.0, + "grad_norm": 1.9141129780274182, + "language_loss": 0.79203308, + "learning_rate": 3.1946758307247878e-06, + "loss": 0.81395704, + "num_input_tokens_seen": 113350030, + "router_z_loss_clip": 0.93505859, + "router_z_loss_mlp": 0.1751709, + "step": 5278, + "time_per_iteration": 2.4485039710998535 + }, + { + "auxiliary_loss_clip": 0.01087477, + "auxiliary_loss_mlp": 0.01010636, + "balance_loss_clip": 1.05648077, + "balance_loss_mlp": 1.00803399, + "epoch": 0.31739065083421014, + "flos": 59973476883840.0, + "grad_norm": 0.871333277961018, + "language_loss": 0.62795687, + "learning_rate": 3.1943634638341114e-06, + "loss": 0.64893806, + "num_input_tokens_seen": 113395820, + "router_z_loss_clip": 0.30908203, + "router_z_loss_mlp": 0.02606201, + "step": 5279, + "time_per_iteration": 4.228985071182251 + }, + { + "auxiliary_loss_clip": 0.01152376, + "auxiliary_loss_mlp": 0.01042114, + "balance_loss_clip": 1.05685282, + "balance_loss_mlp": 1.02486432, + "epoch": 0.3174507740868781, + "flos": 23800981944960.0, + "grad_norm": 1.517977132569175, + "language_loss": 0.81175244, + "learning_rate": 3.194051051653053e-06, + "loss": 0.83369732, + "num_input_tokens_seen": 113416835, + "router_z_loss_clip": 0.95556641, + "router_z_loss_mlp": 0.17248535, + "step": 5280, + "time_per_iteration": 2.5275449752807617 + }, + { + "auxiliary_loss_clip": 0.01144798, + "auxiliary_loss_mlp": 0.01044152, + "balance_loss_clip": 1.05625343, + "balance_loss_mlp": 1.02880955, + "epoch": 0.31751089733954607, + "flos": 27638899349760.0, + "grad_norm": 1.8584914758807454, + "language_loss": 0.78317237, + "learning_rate": 3.19373859419346e-06, + "loss": 0.80506194, + "num_input_tokens_seen": 113440850, + "router_z_loss_clip": 0.88623047, + "router_z_loss_mlp": 0.15368652, + "step": 5281, + "time_per_iteration": 2.90924334526062 + }, + { + "auxiliary_loss_clip": 0.01147321, + "auxiliary_loss_mlp": 0.01040038, + "balance_loss_clip": 1.05654752, + "balance_loss_mlp": 1.02327764, + "epoch": 0.31757102059221404, + "flos": 23769273214080.0, + "grad_norm": 1.6451176059066333, + "language_loss": 0.78415197, + "learning_rate": 3.193426091467179e-06, + "loss": 0.80602556, + "num_input_tokens_seen": 113461000, + "router_z_loss_clip": 0.90673828, + "router_z_loss_mlp": 0.16778564, + "step": 5282, + "time_per_iteration": 2.4930596351623535 + }, + { + "auxiliary_loss_clip": 0.01151519, + "auxiliary_loss_mlp": 0.01040941, + "balance_loss_clip": 1.05901313, + "balance_loss_mlp": 1.02488327, + "epoch": 0.317631143844882, + "flos": 25264521596160.0, + "grad_norm": 3.351202110763531, + "language_loss": 0.67334354, + "learning_rate": 3.193113543486061e-06, + "loss": 0.69526809, + "num_input_tokens_seen": 113480820, + "router_z_loss_clip": 0.92480469, + "router_z_loss_mlp": 0.16040039, + "step": 5283, + "time_per_iteration": 2.5729191303253174 + }, + { + "auxiliary_loss_clip": 0.01070767, + "auxiliary_loss_mlp": 0.01005066, + "balance_loss_clip": 1.0379281, + "balance_loss_mlp": 1.00302196, + "epoch": 0.31769126709754997, + "flos": 55825939221120.0, + "grad_norm": 0.7306635533349009, + "language_loss": 0.52752954, + "learning_rate": 3.192800950261958e-06, + "loss": 0.54828787, + "num_input_tokens_seen": 113536910, + "router_z_loss_clip": 0.32861328, + "router_z_loss_mlp": 0.02044678, + "step": 5284, + "time_per_iteration": 3.0895862579345703 + }, + { + "auxiliary_loss_clip": 0.01152704, + "auxiliary_loss_mlp": 0.01045658, + "balance_loss_clip": 1.0573895, + "balance_loss_mlp": 1.02884936, + "epoch": 0.31775139035021793, + "flos": 16690562098560.0, + "grad_norm": 1.9925411543090001, + "language_loss": 0.70507592, + "learning_rate": 3.1924883118067235e-06, + "loss": 0.72705948, + "num_input_tokens_seen": 113555480, + "router_z_loss_clip": 0.95263672, + "router_z_loss_mlp": 0.16796875, + "step": 5285, + "time_per_iteration": 2.4793877601623535 + }, + { + "auxiliary_loss_clip": 0.01077071, + "auxiliary_loss_mlp": 0.01009397, + "balance_loss_clip": 1.04231524, + "balance_loss_mlp": 1.00740886, + "epoch": 0.3178115136028859, + "flos": 64227241019520.0, + "grad_norm": 0.8277911168739733, + "language_loss": 0.60517865, + "learning_rate": 3.1921756281322123e-06, + "loss": 0.62604332, + "num_input_tokens_seen": 113616790, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.01986694, + "step": 5286, + "time_per_iteration": 3.081153392791748 + }, + { + "auxiliary_loss_clip": 0.01152635, + "auxiliary_loss_mlp": 0.01041629, + "balance_loss_clip": 1.06058812, + "balance_loss_mlp": 1.02501154, + "epoch": 0.31787163685555386, + "flos": 18697465762560.0, + "grad_norm": 1.856581777900336, + "language_loss": 0.72370148, + "learning_rate": 3.1918628992502826e-06, + "loss": 0.74564409, + "num_input_tokens_seen": 113635320, + "router_z_loss_clip": 0.92041016, + "router_z_loss_mlp": 0.16638184, + "step": 5287, + "time_per_iteration": 2.516442060470581 + }, + { + "auxiliary_loss_clip": 0.01151121, + "auxiliary_loss_mlp": 0.01048182, + "balance_loss_clip": 1.05750084, + "balance_loss_mlp": 1.03124273, + "epoch": 0.31793176010822183, + "flos": 21324762155520.0, + "grad_norm": 1.9133996576494745, + "language_loss": 0.75255966, + "learning_rate": 3.191550125172792e-06, + "loss": 0.77455264, + "num_input_tokens_seen": 113654000, + "router_z_loss_clip": 0.93554688, + "router_z_loss_mlp": 0.16918945, + "step": 5288, + "time_per_iteration": 2.453158140182495 + }, + { + "auxiliary_loss_clip": 0.01139013, + "auxiliary_loss_mlp": 0.01037493, + "balance_loss_clip": 1.05061364, + "balance_loss_mlp": 1.02322376, + "epoch": 0.31799188336088985, + "flos": 20958688696320.0, + "grad_norm": 1.7625760043282666, + "language_loss": 0.87724739, + "learning_rate": 3.1912373059116007e-06, + "loss": 0.89901245, + "num_input_tokens_seen": 113672375, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.14276123, + "step": 5289, + "time_per_iteration": 4.016849517822266 + }, + { + "auxiliary_loss_clip": 0.01148943, + "auxiliary_loss_mlp": 0.01035555, + "balance_loss_clip": 1.06195378, + "balance_loss_mlp": 1.02123785, + "epoch": 0.3180520066135578, + "flos": 22491930689280.0, + "grad_norm": 1.5316749154779268, + "language_loss": 0.67868567, + "learning_rate": 3.190924441478572e-06, + "loss": 0.70053065, + "num_input_tokens_seen": 113692385, + "router_z_loss_clip": 0.87011719, + "router_z_loss_mlp": 0.14318848, + "step": 5290, + "time_per_iteration": 2.5199038982391357 + }, + { + "auxiliary_loss_clip": 0.01151868, + "auxiliary_loss_mlp": 0.01039711, + "balance_loss_clip": 1.06064153, + "balance_loss_mlp": 1.0238682, + "epoch": 0.3181121298662258, + "flos": 27235335070080.0, + "grad_norm": 2.0046697401703875, + "language_loss": 0.79929161, + "learning_rate": 3.1906115318855687e-06, + "loss": 0.8212074, + "num_input_tokens_seen": 113712145, + "router_z_loss_clip": 0.91308594, + "router_z_loss_mlp": 0.1583252, + "step": 5291, + "time_per_iteration": 3.8706393241882324 + }, + { + "auxiliary_loss_clip": 0.01144572, + "auxiliary_loss_mlp": 0.01038485, + "balance_loss_clip": 1.05443835, + "balance_loss_mlp": 1.02193892, + "epoch": 0.31817225311889374, + "flos": 23180158252800.0, + "grad_norm": 2.1618975235209885, + "language_loss": 0.80302978, + "learning_rate": 3.1902985771444577e-06, + "loss": 0.82486033, + "num_input_tokens_seen": 113731435, + "router_z_loss_clip": 0.90039062, + "router_z_loss_mlp": 0.16552734, + "step": 5292, + "time_per_iteration": 2.621354103088379 + }, + { + "auxiliary_loss_clip": 0.01137608, + "auxiliary_loss_mlp": 0.01032634, + "balance_loss_clip": 1.05191195, + "balance_loss_mlp": 1.01850164, + "epoch": 0.3182323763715617, + "flos": 23258803080960.0, + "grad_norm": 1.6487144461987107, + "language_loss": 0.74996793, + "learning_rate": 3.1899855772671043e-06, + "loss": 0.7716704, + "num_input_tokens_seen": 113750825, + "router_z_loss_clip": 0.85742188, + "router_z_loss_mlp": 0.14129639, + "step": 5293, + "time_per_iteration": 2.850597858428955 + }, + { + "auxiliary_loss_clip": 0.01143623, + "auxiliary_loss_mlp": 0.01040915, + "balance_loss_clip": 1.05731392, + "balance_loss_mlp": 1.02723014, + "epoch": 0.3182924996242297, + "flos": 29016683280000.0, + "grad_norm": 1.9204404503276067, + "language_loss": 0.74651074, + "learning_rate": 3.189672532265379e-06, + "loss": 0.76835608, + "num_input_tokens_seen": 113770010, + "router_z_loss_clip": 0.86328125, + "router_z_loss_mlp": 0.13684082, + "step": 5294, + "time_per_iteration": 2.5450313091278076 + }, + { + "auxiliary_loss_clip": 0.0114403, + "auxiliary_loss_mlp": 0.01035663, + "balance_loss_clip": 1.05129099, + "balance_loss_mlp": 1.01896238, + "epoch": 0.31835262287689764, + "flos": 20449188230400.0, + "grad_norm": 2.222360490674481, + "language_loss": 0.76154977, + "learning_rate": 3.189359442151152e-06, + "loss": 0.78334671, + "num_input_tokens_seen": 113788640, + "router_z_loss_clip": 0.92822266, + "router_z_loss_mlp": 0.16699219, + "step": 5295, + "time_per_iteration": 2.44014310836792 + }, + { + "auxiliary_loss_clip": 0.01148399, + "auxiliary_loss_mlp": 0.01039533, + "balance_loss_clip": 1.05454326, + "balance_loss_mlp": 1.02398801, + "epoch": 0.3184127461295656, + "flos": 25119478477440.0, + "grad_norm": 5.204070332551236, + "language_loss": 0.69715977, + "learning_rate": 3.189046306936296e-06, + "loss": 0.71903908, + "num_input_tokens_seen": 113809515, + "router_z_loss_clip": 0.93896484, + "router_z_loss_mlp": 0.15539551, + "step": 5296, + "time_per_iteration": 3.900543689727783 + }, + { + "auxiliary_loss_clip": 0.01144145, + "auxiliary_loss_mlp": 0.0104065, + "balance_loss_clip": 1.05566335, + "balance_loss_mlp": 1.02553999, + "epoch": 0.31847286938223357, + "flos": 25551231955200.0, + "grad_norm": 1.7812722441846676, + "language_loss": 0.7772373, + "learning_rate": 3.1887331266326846e-06, + "loss": 0.79908532, + "num_input_tokens_seen": 113829770, + "router_z_loss_clip": 0.88427734, + "router_z_loss_mlp": 0.15100098, + "step": 5297, + "time_per_iteration": 2.600579261779785 + }, + { + "auxiliary_loss_clip": 0.01135095, + "auxiliary_loss_mlp": 0.01034402, + "balance_loss_clip": 1.04680848, + "balance_loss_mlp": 1.01849914, + "epoch": 0.31853299263490154, + "flos": 27782470010880.0, + "grad_norm": 1.9475727650580918, + "language_loss": 0.7953757, + "learning_rate": 3.1884199012521942e-06, + "loss": 0.81707066, + "num_input_tokens_seen": 113849320, + "router_z_loss_clip": 0.88232422, + "router_z_loss_mlp": 0.15905762, + "step": 5298, + "time_per_iteration": 2.5228309631347656 + }, + { + "auxiliary_loss_clip": 0.0114137, + "auxiliary_loss_mlp": 0.01041709, + "balance_loss_clip": 1.04807103, + "balance_loss_mlp": 1.02633107, + "epoch": 0.3185931158875695, + "flos": 22706747976960.0, + "grad_norm": 1.8839480300731808, + "language_loss": 0.74100685, + "learning_rate": 3.1881066308067016e-06, + "loss": 0.76283765, + "num_input_tokens_seen": 113867860, + "router_z_loss_clip": 0.93212891, + "router_z_loss_mlp": 0.15393066, + "step": 5299, + "time_per_iteration": 2.59956693649292 + }, + { + "auxiliary_loss_clip": 0.01149083, + "auxiliary_loss_mlp": 0.01046991, + "balance_loss_clip": 1.05547154, + "balance_loss_mlp": 1.03115368, + "epoch": 0.31865323914023747, + "flos": 24571517523840.0, + "grad_norm": 2.0987126184827916, + "language_loss": 0.78504288, + "learning_rate": 3.1877933153080873e-06, + "loss": 0.80700362, + "num_input_tokens_seen": 113886375, + "router_z_loss_clip": 0.93603516, + "router_z_loss_mlp": 0.15838623, + "step": 5300, + "time_per_iteration": 2.5322234630584717 + }, + { + "auxiliary_loss_clip": 0.01147112, + "auxiliary_loss_mlp": 0.01042692, + "balance_loss_clip": 1.05716348, + "balance_loss_mlp": 1.02553749, + "epoch": 0.31871336239290543, + "flos": 18186564666240.0, + "grad_norm": 1.883792602083301, + "language_loss": 0.83854151, + "learning_rate": 3.1874799547682304e-06, + "loss": 0.86043954, + "num_input_tokens_seen": 113904065, + "router_z_loss_clip": 0.89941406, + "router_z_loss_mlp": 0.17163086, + "step": 5301, + "time_per_iteration": 2.495746612548828 + }, + { + "auxiliary_loss_clip": 0.0114822, + "auxiliary_loss_mlp": 0.01045712, + "balance_loss_clip": 1.06212735, + "balance_loss_mlp": 1.0295589, + "epoch": 0.31877348564557345, + "flos": 21826756679040.0, + "grad_norm": 2.3954283116146997, + "language_loss": 0.77229655, + "learning_rate": 3.187166549199015e-06, + "loss": 0.79423589, + "num_input_tokens_seen": 113918415, + "router_z_loss_clip": 0.86132812, + "router_z_loss_mlp": 0.16162109, + "step": 5302, + "time_per_iteration": 2.522315502166748 + }, + { + "auxiliary_loss_clip": 0.01134535, + "auxiliary_loss_mlp": 0.01039678, + "balance_loss_clip": 1.04928935, + "balance_loss_mlp": 1.02391815, + "epoch": 0.3188336088982414, + "flos": 22015252275840.0, + "grad_norm": 1.7411539530432545, + "language_loss": 0.7941795, + "learning_rate": 3.1868530986123255e-06, + "loss": 0.81592166, + "num_input_tokens_seen": 113938135, + "router_z_loss_clip": 0.85302734, + "router_z_loss_mlp": 0.1574707, + "step": 5303, + "time_per_iteration": 2.552158832550049 + }, + { + "auxiliary_loss_clip": 0.01146458, + "auxiliary_loss_mlp": 0.01044251, + "balance_loss_clip": 1.05184829, + "balance_loss_mlp": 1.02751422, + "epoch": 0.3188937321509094, + "flos": 20047886507520.0, + "grad_norm": 2.7669923234746787, + "language_loss": 0.72848314, + "learning_rate": 3.186539603020047e-06, + "loss": 0.75039023, + "num_input_tokens_seen": 113957125, + "router_z_loss_clip": 0.94726562, + "router_z_loss_mlp": 0.1673584, + "step": 5304, + "time_per_iteration": 2.4834372997283936 + }, + { + "auxiliary_loss_clip": 0.01152031, + "auxiliary_loss_mlp": 0.01039525, + "balance_loss_clip": 1.06465983, + "balance_loss_mlp": 1.0245707, + "epoch": 0.31895385540357735, + "flos": 25848105863040.0, + "grad_norm": 1.918935362688439, + "language_loss": 0.71909201, + "learning_rate": 3.186226062434068e-06, + "loss": 0.74100757, + "num_input_tokens_seen": 113974875, + "router_z_loss_clip": 0.87402344, + "router_z_loss_mlp": 0.1494751, + "step": 5305, + "time_per_iteration": 2.5108234882354736 + }, + { + "auxiliary_loss_clip": 0.01146424, + "auxiliary_loss_mlp": 0.01041021, + "balance_loss_clip": 1.05556738, + "balance_loss_mlp": 1.02560711, + "epoch": 0.3190139786562453, + "flos": 23477714519040.0, + "grad_norm": 1.839469564860509, + "language_loss": 0.64012527, + "learning_rate": 3.1859124768662778e-06, + "loss": 0.66199976, + "num_input_tokens_seen": 113994450, + "router_z_loss_clip": 0.90820312, + "router_z_loss_mlp": 0.1541748, + "step": 5306, + "time_per_iteration": 2.9017837047576904 + }, + { + "auxiliary_loss_clip": 0.01146669, + "auxiliary_loss_mlp": 0.01043914, + "balance_loss_clip": 1.05556929, + "balance_loss_mlp": 1.02782106, + "epoch": 0.3190741019089133, + "flos": 29095543589760.0, + "grad_norm": 2.8843276868678696, + "language_loss": 0.79670131, + "learning_rate": 3.1855988463285678e-06, + "loss": 0.81860715, + "num_input_tokens_seen": 114013945, + "router_z_loss_clip": 0.91162109, + "router_z_loss_mlp": 0.16088867, + "step": 5307, + "time_per_iteration": 2.9268057346343994 + }, + { + "auxiliary_loss_clip": 0.01139358, + "auxiliary_loss_mlp": 0.01042264, + "balance_loss_clip": 1.05195689, + "balance_loss_mlp": 1.0257889, + "epoch": 0.31913422516158124, + "flos": 17129534209920.0, + "grad_norm": 1.721273290992854, + "language_loss": 0.77324468, + "learning_rate": 3.1852851708328308e-06, + "loss": 0.79506087, + "num_input_tokens_seen": 114031375, + "router_z_loss_clip": 0.87451172, + "router_z_loss_mlp": 0.16448975, + "step": 5308, + "time_per_iteration": 2.474259614944458 + }, + { + "auxiliary_loss_clip": 0.01152503, + "auxiliary_loss_mlp": 0.01046302, + "balance_loss_clip": 1.05560458, + "balance_loss_mlp": 1.02734804, + "epoch": 0.3191943484142492, + "flos": 16069846147200.0, + "grad_norm": 2.2641357344888644, + "language_loss": 0.74526131, + "learning_rate": 3.184971450390961e-06, + "loss": 0.76724935, + "num_input_tokens_seen": 114048465, + "router_z_loss_clip": 0.96777344, + "router_z_loss_mlp": 0.1895752, + "step": 5309, + "time_per_iteration": 2.4843764305114746 + }, + { + "auxiliary_loss_clip": 0.0114573, + "auxiliary_loss_mlp": 0.01033524, + "balance_loss_clip": 1.05720019, + "balance_loss_mlp": 1.0191946, + "epoch": 0.3192544716669172, + "flos": 22966166977920.0, + "grad_norm": 1.964207720164094, + "language_loss": 0.82065368, + "learning_rate": 3.184657685014856e-06, + "loss": 0.84244627, + "num_input_tokens_seen": 114068415, + "router_z_loss_clip": 0.88574219, + "router_z_loss_mlp": 0.14324951, + "step": 5310, + "time_per_iteration": 2.574244737625122 + }, + { + "auxiliary_loss_clip": 0.0113659, + "auxiliary_loss_mlp": 0.010356, + "balance_loss_clip": 1.04882288, + "balance_loss_mlp": 1.02071118, + "epoch": 0.31931459491958514, + "flos": 26870339018880.0, + "grad_norm": 1.4473915687625147, + "language_loss": 0.78716362, + "learning_rate": 3.184343874716412e-06, + "loss": 0.80888557, + "num_input_tokens_seen": 114088565, + "router_z_loss_clip": 0.87695312, + "router_z_loss_mlp": 0.14892578, + "step": 5311, + "time_per_iteration": 2.6334033012390137 + }, + { + "auxiliary_loss_clip": 0.01147367, + "auxiliary_loss_mlp": 0.01036911, + "balance_loss_clip": 1.05751157, + "balance_loss_mlp": 1.0211637, + "epoch": 0.3193747181722531, + "flos": 21836525178240.0, + "grad_norm": 1.9309620870352744, + "language_loss": 0.84639561, + "learning_rate": 3.1840300195075295e-06, + "loss": 0.86823839, + "num_input_tokens_seen": 114107160, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.15734863, + "step": 5312, + "time_per_iteration": 2.644432783126831 + }, + { + "auxiliary_loss_clip": 0.01143467, + "auxiliary_loss_mlp": 0.01043029, + "balance_loss_clip": 1.05116463, + "balance_loss_mlp": 1.02625668, + "epoch": 0.31943484142492107, + "flos": 18324999682560.0, + "grad_norm": 2.399417663534226, + "language_loss": 0.79203033, + "learning_rate": 3.1837161194001102e-06, + "loss": 0.81389529, + "num_input_tokens_seen": 114123420, + "router_z_loss_clip": 0.92138672, + "router_z_loss_mlp": 0.16772461, + "step": 5313, + "time_per_iteration": 2.608565330505371 + }, + { + "auxiliary_loss_clip": 0.01147147, + "auxiliary_loss_mlp": 0.01033609, + "balance_loss_clip": 1.05763221, + "balance_loss_mlp": 1.01801705, + "epoch": 0.31949496467758903, + "flos": 21615818060160.0, + "grad_norm": 2.945497628175462, + "language_loss": 0.86915636, + "learning_rate": 3.183402174406057e-06, + "loss": 0.89096391, + "num_input_tokens_seen": 114139230, + "router_z_loss_clip": 0.89599609, + "router_z_loss_mlp": 0.15588379, + "step": 5314, + "time_per_iteration": 2.663417339324951 + }, + { + "auxiliary_loss_clip": 0.01141063, + "auxiliary_loss_mlp": 0.01040435, + "balance_loss_clip": 1.05176783, + "balance_loss_mlp": 1.0240202, + "epoch": 0.31955508793025705, + "flos": 21760214734080.0, + "grad_norm": 2.331433776273153, + "language_loss": 0.80279112, + "learning_rate": 3.1830881845372747e-06, + "loss": 0.82460606, + "num_input_tokens_seen": 114159290, + "router_z_loss_clip": 0.89208984, + "router_z_loss_mlp": 0.16418457, + "step": 5315, + "time_per_iteration": 2.5305306911468506 + }, + { + "auxiliary_loss_clip": 0.01152617, + "auxiliary_loss_mlp": 0.01044605, + "balance_loss_clip": 1.06081653, + "balance_loss_mlp": 1.02808845, + "epoch": 0.319615211182925, + "flos": 17164331510400.0, + "grad_norm": 2.0326982407141503, + "language_loss": 0.67746925, + "learning_rate": 3.18277414980567e-06, + "loss": 0.69944155, + "num_input_tokens_seen": 114177655, + "router_z_loss_clip": 0.91699219, + "router_z_loss_mlp": 0.16516113, + "step": 5316, + "time_per_iteration": 2.4495022296905518 + }, + { + "auxiliary_loss_clip": 0.01135864, + "auxiliary_loss_mlp": 0.01035485, + "balance_loss_clip": 1.04732132, + "balance_loss_mlp": 1.02140045, + "epoch": 0.319675334435593, + "flos": 28112812416000.0, + "grad_norm": 1.6340359724885516, + "language_loss": 0.69150209, + "learning_rate": 3.1824600702231515e-06, + "loss": 0.71321559, + "num_input_tokens_seen": 114200880, + "router_z_loss_clip": 0.88476562, + "router_z_loss_mlp": 0.140625, + "step": 5317, + "time_per_iteration": 2.5433349609375 + }, + { + "auxiliary_loss_clip": 0.01061391, + "auxiliary_loss_mlp": 0.01004937, + "balance_loss_clip": 1.03040004, + "balance_loss_mlp": 1.00313747, + "epoch": 0.31973545768826095, + "flos": 69501119408640.0, + "grad_norm": 0.7314574269315646, + "language_loss": 0.53082442, + "learning_rate": 3.182145945801628e-06, + "loss": 0.55148768, + "num_input_tokens_seen": 114267145, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.01794434, + "step": 5318, + "time_per_iteration": 3.2330174446105957 + }, + { + "auxiliary_loss_clip": 0.01136814, + "auxiliary_loss_mlp": 0.01041039, + "balance_loss_clip": 1.0489347, + "balance_loss_mlp": 1.02599478, + "epoch": 0.3197955809409289, + "flos": 13699203408000.0, + "grad_norm": 14.169656725670905, + "language_loss": 0.83864808, + "learning_rate": 3.181831776553012e-06, + "loss": 0.86042666, + "num_input_tokens_seen": 114284630, + "router_z_loss_clip": 0.87841797, + "router_z_loss_mlp": 0.1505127, + "step": 5319, + "time_per_iteration": 2.4315788745880127 + }, + { + "auxiliary_loss_clip": 0.01132018, + "auxiliary_loss_mlp": 0.01042206, + "balance_loss_clip": 1.04545081, + "balance_loss_mlp": 1.02690578, + "epoch": 0.3198557041935969, + "flos": 33218124278400.0, + "grad_norm": 1.8765240667342835, + "language_loss": 0.63574743, + "learning_rate": 3.1815175624892165e-06, + "loss": 0.65748966, + "num_input_tokens_seen": 114305830, + "router_z_loss_clip": 0.86474609, + "router_z_loss_mlp": 0.15289307, + "step": 5320, + "time_per_iteration": 2.637056827545166 + }, + { + "auxiliary_loss_clip": 0.01141762, + "auxiliary_loss_mlp": 0.01043608, + "balance_loss_clip": 1.0501132, + "balance_loss_mlp": 1.02653742, + "epoch": 0.31991582744626484, + "flos": 23732033788800.0, + "grad_norm": 2.3547745707326695, + "language_loss": 0.70657831, + "learning_rate": 3.1812033036221567e-06, + "loss": 0.72843194, + "num_input_tokens_seen": 114325165, + "router_z_loss_clip": 0.91650391, + "router_z_loss_mlp": 0.1706543, + "step": 5321, + "time_per_iteration": 2.8063528537750244 + }, + { + "auxiliary_loss_clip": 0.01162689, + "auxiliary_loss_mlp": 0.01063578, + "balance_loss_clip": 1.06441236, + "balance_loss_mlp": 1.0465188, + "epoch": 0.3199759506989328, + "flos": 18550842445440.0, + "grad_norm": 2.4694392504244633, + "language_loss": 0.86763698, + "learning_rate": 3.180888999963749e-06, + "loss": 0.88989961, + "num_input_tokens_seen": 114341310, + "router_z_loss_clip": 0.98242188, + "router_z_loss_mlp": 0.1706543, + "step": 5322, + "time_per_iteration": 3.914607048034668 + }, + { + "auxiliary_loss_clip": 0.011426, + "auxiliary_loss_mlp": 0.0103436, + "balance_loss_clip": 1.05432343, + "balance_loss_mlp": 1.01912534, + "epoch": 0.3200360739516008, + "flos": 22418888382720.0, + "grad_norm": 1.9210516494735121, + "language_loss": 0.83281034, + "learning_rate": 3.1805746515259123e-06, + "loss": 0.85457993, + "num_input_tokens_seen": 114360355, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.15216064, + "step": 5323, + "time_per_iteration": 2.4542276859283447 + }, + { + "auxiliary_loss_clip": 0.01143829, + "auxiliary_loss_mlp": 0.01044915, + "balance_loss_clip": 1.05440784, + "balance_loss_mlp": 1.02679527, + "epoch": 0.32009619720426874, + "flos": 20595236929920.0, + "grad_norm": 1.7840479626203571, + "language_loss": 0.77718723, + "learning_rate": 3.1802602583205663e-06, + "loss": 0.79907465, + "num_input_tokens_seen": 114379220, + "router_z_loss_clip": 0.89404297, + "router_z_loss_mlp": 0.18115234, + "step": 5324, + "time_per_iteration": 2.4461183547973633 + }, + { + "auxiliary_loss_clip": 0.01148821, + "auxiliary_loss_mlp": 0.01035594, + "balance_loss_clip": 1.05745673, + "balance_loss_mlp": 1.02001357, + "epoch": 0.3201563204569367, + "flos": 18147637301760.0, + "grad_norm": 1.773774388591054, + "language_loss": 0.80356276, + "learning_rate": 3.1799458203596333e-06, + "loss": 0.82540691, + "num_input_tokens_seen": 114396365, + "router_z_loss_clip": 0.91308594, + "router_z_loss_mlp": 0.15563965, + "step": 5325, + "time_per_iteration": 2.498856782913208 + }, + { + "auxiliary_loss_clip": 0.01145287, + "auxiliary_loss_mlp": 0.01042188, + "balance_loss_clip": 1.05631804, + "balance_loss_mlp": 1.02692366, + "epoch": 0.32021644370960467, + "flos": 31684235840640.0, + "grad_norm": 1.6634434492105166, + "language_loss": 0.75009191, + "learning_rate": 3.179631337655037e-06, + "loss": 0.7719667, + "num_input_tokens_seen": 114416780, + "router_z_loss_clip": 0.88916016, + "router_z_loss_mlp": 0.15246582, + "step": 5326, + "time_per_iteration": 2.6052236557006836 + }, + { + "auxiliary_loss_clip": 0.0114387, + "auxiliary_loss_mlp": 0.01040715, + "balance_loss_clip": 1.05569851, + "balance_loss_mlp": 1.02540874, + "epoch": 0.32027656696227264, + "flos": 26865921646080.0, + "grad_norm": 1.4541981801120503, + "language_loss": 0.8107962, + "learning_rate": 3.179316810218701e-06, + "loss": 0.83264208, + "num_input_tokens_seen": 114437405, + "router_z_loss_clip": 0.88085938, + "router_z_loss_mlp": 0.1529541, + "step": 5327, + "time_per_iteration": 2.491551399230957 + }, + { + "auxiliary_loss_clip": 0.01143379, + "auxiliary_loss_mlp": 0.01037362, + "balance_loss_clip": 1.05112851, + "balance_loss_mlp": 1.02112591, + "epoch": 0.32033669021494066, + "flos": 24169928492160.0, + "grad_norm": 1.4348647224052886, + "language_loss": 0.77630991, + "learning_rate": 3.179002238062554e-06, + "loss": 0.79811734, + "num_input_tokens_seen": 114458505, + "router_z_loss_clip": 0.92285156, + "router_z_loss_mlp": 0.16223145, + "step": 5328, + "time_per_iteration": 2.502359390258789 + }, + { + "auxiliary_loss_clip": 0.0114274, + "auxiliary_loss_mlp": 0.01044804, + "balance_loss_clip": 1.05303764, + "balance_loss_mlp": 1.02856731, + "epoch": 0.3203968134676086, + "flos": 24460768915200.0, + "grad_norm": 1.6337111731227851, + "language_loss": 0.73338735, + "learning_rate": 3.178687621198524e-06, + "loss": 0.75526285, + "num_input_tokens_seen": 114479050, + "router_z_loss_clip": 0.89697266, + "router_z_loss_mlp": 0.16223145, + "step": 5329, + "time_per_iteration": 2.5105133056640625 + }, + { + "auxiliary_loss_clip": 0.01139131, + "auxiliary_loss_mlp": 0.01035208, + "balance_loss_clip": 1.0555687, + "balance_loss_mlp": 1.02090859, + "epoch": 0.3204569367202766, + "flos": 18004713085440.0, + "grad_norm": 7.995719909179644, + "language_loss": 0.70782077, + "learning_rate": 3.1783729596385415e-06, + "loss": 0.72956413, + "num_input_tokens_seen": 114497415, + "router_z_loss_clip": 0.83496094, + "router_z_loss_mlp": 0.14300537, + "step": 5330, + "time_per_iteration": 2.4530582427978516 + }, + { + "auxiliary_loss_clip": 0.01147594, + "auxiliary_loss_mlp": 0.01046473, + "balance_loss_clip": 1.05238867, + "balance_loss_mlp": 1.02850819, + "epoch": 0.32051705997294455, + "flos": 30589678650240.0, + "grad_norm": 1.6818335453814879, + "language_loss": 0.80021858, + "learning_rate": 3.1780582533945376e-06, + "loss": 0.82215917, + "num_input_tokens_seen": 114518785, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.1796875, + "step": 5331, + "time_per_iteration": 2.614802598953247 + }, + { + "auxiliary_loss_clip": 0.0106497, + "auxiliary_loss_mlp": 0.01009228, + "balance_loss_clip": 1.03346741, + "balance_loss_mlp": 1.00739789, + "epoch": 0.3205771832256125, + "flos": 68417979765120.0, + "grad_norm": 0.8296061165347872, + "language_loss": 0.57808793, + "learning_rate": 3.177743502478447e-06, + "loss": 0.59882981, + "num_input_tokens_seen": 114577710, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.01828003, + "step": 5332, + "time_per_iteration": 3.0496487617492676 + }, + { + "auxiliary_loss_clip": 0.01152701, + "auxiliary_loss_mlp": 0.01044122, + "balance_loss_clip": 1.06181157, + "balance_loss_mlp": 1.02798665, + "epoch": 0.3206373064782805, + "flos": 30443953173120.0, + "grad_norm": 1.6327902847329239, + "language_loss": 0.73252404, + "learning_rate": 3.177428706902205e-06, + "loss": 0.75449228, + "num_input_tokens_seen": 114598640, + "router_z_loss_clip": 0.90917969, + "router_z_loss_mlp": 0.16137695, + "step": 5333, + "time_per_iteration": 5.603939533233643 + }, + { + "auxiliary_loss_clip": 0.01151803, + "auxiliary_loss_mlp": 0.01038655, + "balance_loss_clip": 1.06172776, + "balance_loss_mlp": 1.02282405, + "epoch": 0.32069742973094845, + "flos": 22054502862720.0, + "grad_norm": 1.8242925188544044, + "language_loss": 0.70599884, + "learning_rate": 3.1771138666777485e-06, + "loss": 0.72790337, + "num_input_tokens_seen": 114618780, + "router_z_loss_clip": 0.90087891, + "router_z_loss_mlp": 0.15844727, + "step": 5334, + "time_per_iteration": 2.561962366104126 + }, + { + "auxiliary_loss_clip": 0.01136052, + "auxiliary_loss_mlp": 0.01041262, + "balance_loss_clip": 1.04670262, + "balance_loss_mlp": 1.02534747, + "epoch": 0.3207575529836164, + "flos": 22054000072320.0, + "grad_norm": 1.9578996772635193, + "language_loss": 0.77077252, + "learning_rate": 3.1767989818170156e-06, + "loss": 0.79254568, + "num_input_tokens_seen": 114637525, + "router_z_loss_clip": 0.89355469, + "router_z_loss_mlp": 0.15893555, + "step": 5335, + "time_per_iteration": 2.513974666595459 + }, + { + "auxiliary_loss_clip": 0.01138511, + "auxiliary_loss_mlp": 0.01041088, + "balance_loss_clip": 1.05031133, + "balance_loss_mlp": 1.0251143, + "epoch": 0.3208176762362844, + "flos": 34057536186240.0, + "grad_norm": 1.4673266730473766, + "language_loss": 0.68371588, + "learning_rate": 3.1764840523319477e-06, + "loss": 0.70551181, + "num_input_tokens_seen": 114659705, + "router_z_loss_clip": 0.88085938, + "router_z_loss_mlp": 0.15979004, + "step": 5336, + "time_per_iteration": 2.626838445663452 + }, + { + "auxiliary_loss_clip": 0.01148243, + "auxiliary_loss_mlp": 0.01038164, + "balance_loss_clip": 1.05762827, + "balance_loss_mlp": 1.02249956, + "epoch": 0.32087779948895234, + "flos": 21798711135360.0, + "grad_norm": 6.998694331160597, + "language_loss": 0.78968334, + "learning_rate": 3.176169078234487e-06, + "loss": 0.8115474, + "num_input_tokens_seen": 114678340, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.15661621, + "step": 5337, + "time_per_iteration": 2.462761402130127 + }, + { + "auxiliary_loss_clip": 0.01142708, + "auxiliary_loss_mlp": 0.01035439, + "balance_loss_clip": 1.05797756, + "balance_loss_mlp": 1.02104485, + "epoch": 0.3209379227416203, + "flos": 21434110133760.0, + "grad_norm": 1.598500097848184, + "language_loss": 0.74001235, + "learning_rate": 3.1758540595365766e-06, + "loss": 0.76179385, + "num_input_tokens_seen": 114696980, + "router_z_loss_clip": 0.84667969, + "router_z_loss_mlp": 0.14398193, + "step": 5338, + "time_per_iteration": 2.5380122661590576 + }, + { + "auxiliary_loss_clip": 0.01136371, + "auxiliary_loss_mlp": 0.01038019, + "balance_loss_clip": 1.04560161, + "balance_loss_mlp": 1.02199674, + "epoch": 0.3209980459942883, + "flos": 25849075530240.0, + "grad_norm": 3.336455510817833, + "language_loss": 0.627002, + "learning_rate": 3.1755389962501626e-06, + "loss": 0.64874589, + "num_input_tokens_seen": 114717330, + "router_z_loss_clip": 0.90771484, + "router_z_loss_mlp": 0.16027832, + "step": 5339, + "time_per_iteration": 2.5181283950805664 + }, + { + "auxiliary_loss_clip": 0.01146555, + "auxiliary_loss_mlp": 0.01039973, + "balance_loss_clip": 1.05789196, + "balance_loss_mlp": 1.02452397, + "epoch": 0.32105816924695624, + "flos": 19099162535040.0, + "grad_norm": 2.2729346982449776, + "language_loss": 0.81842613, + "learning_rate": 3.175223888387192e-06, + "loss": 0.84029144, + "num_input_tokens_seen": 114736320, + "router_z_loss_clip": 0.88671875, + "router_z_loss_mlp": 0.15460205, + "step": 5340, + "time_per_iteration": 3.8170158863067627 + }, + { + "auxiliary_loss_clip": 0.01144147, + "auxiliary_loss_mlp": 0.01040803, + "balance_loss_clip": 1.05576992, + "balance_loss_mlp": 1.02550888, + "epoch": 0.3211182924996242, + "flos": 16581860565120.0, + "grad_norm": 1.9895304000512875, + "language_loss": 0.76562977, + "learning_rate": 3.1749087359596137e-06, + "loss": 0.78747928, + "num_input_tokens_seen": 114754575, + "router_z_loss_clip": 0.88427734, + "router_z_loss_mlp": 0.15289307, + "step": 5341, + "time_per_iteration": 2.4528138637542725 + }, + { + "auxiliary_loss_clip": 0.01135539, + "auxiliary_loss_mlp": 0.01041641, + "balance_loss_clip": 1.0501039, + "balance_loss_mlp": 1.02529752, + "epoch": 0.3211784157522922, + "flos": 22672202071680.0, + "grad_norm": 1.623249641261877, + "language_loss": 0.78953344, + "learning_rate": 3.1745935389793786e-06, + "loss": 0.81130528, + "num_input_tokens_seen": 114773590, + "router_z_loss_clip": 0.85546875, + "router_z_loss_mlp": 0.16333008, + "step": 5342, + "time_per_iteration": 2.4309775829315186 + }, + { + "auxiliary_loss_clip": 0.01153357, + "auxiliary_loss_mlp": 0.01043978, + "balance_loss_clip": 1.05889547, + "balance_loss_mlp": 1.02720499, + "epoch": 0.3212385390049602, + "flos": 20558787603840.0, + "grad_norm": 2.9098106667396753, + "language_loss": 0.74658799, + "learning_rate": 3.174278297458438e-06, + "loss": 0.7685613, + "num_input_tokens_seen": 114790775, + "router_z_loss_clip": 0.94482422, + "router_z_loss_mlp": 0.16784668, + "step": 5343, + "time_per_iteration": 2.4478306770324707 + }, + { + "auxiliary_loss_clip": 0.01142692, + "auxiliary_loss_mlp": 0.01034103, + "balance_loss_clip": 1.05312347, + "balance_loss_mlp": 1.01849878, + "epoch": 0.32129866225762815, + "flos": 24791147233920.0, + "grad_norm": 1.687213589187269, + "language_loss": 0.82660323, + "learning_rate": 3.173963011408748e-06, + "loss": 0.84837121, + "num_input_tokens_seen": 114809835, + "router_z_loss_clip": 0.89648438, + "router_z_loss_mlp": 0.15600586, + "step": 5344, + "time_per_iteration": 2.5404787063598633 + }, + { + "auxiliary_loss_clip": 0.01147901, + "auxiliary_loss_mlp": 0.01034432, + "balance_loss_clip": 1.05836415, + "balance_loss_mlp": 1.01898205, + "epoch": 0.3213587855102961, + "flos": 18366871962240.0, + "grad_norm": 2.098370465336332, + "language_loss": 0.79256749, + "learning_rate": 3.173647680842262e-06, + "loss": 0.8143909, + "num_input_tokens_seen": 114826505, + "router_z_loss_clip": 0.89599609, + "router_z_loss_mlp": 0.15435791, + "step": 5345, + "time_per_iteration": 2.438218593597412 + }, + { + "auxiliary_loss_clip": 0.01140225, + "auxiliary_loss_mlp": 0.01036612, + "balance_loss_clip": 1.05085325, + "balance_loss_mlp": 1.02144861, + "epoch": 0.3214189087629641, + "flos": 27015992668800.0, + "grad_norm": 1.7816019576947502, + "language_loss": 0.83149582, + "learning_rate": 3.1733323057709384e-06, + "loss": 0.85326415, + "num_input_tokens_seen": 114846140, + "router_z_loss_clip": 0.89257812, + "router_z_loss_mlp": 0.15161133, + "step": 5346, + "time_per_iteration": 2.5986578464508057 + }, + { + "auxiliary_loss_clip": 0.01151641, + "auxiliary_loss_mlp": 0.01035486, + "balance_loss_clip": 1.05940592, + "balance_loss_mlp": 1.0195055, + "epoch": 0.32147903201563205, + "flos": 23148269953920.0, + "grad_norm": 1.479657602811351, + "language_loss": 0.81802881, + "learning_rate": 3.1730168862067366e-06, + "loss": 0.83990008, + "num_input_tokens_seen": 114866660, + "router_z_loss_clip": 0.92236328, + "router_z_loss_mlp": 0.15985107, + "step": 5347, + "time_per_iteration": 2.744142532348633 + }, + { + "auxiliary_loss_clip": 0.01136417, + "auxiliary_loss_mlp": 0.01039087, + "balance_loss_clip": 1.0489012, + "balance_loss_mlp": 1.02257621, + "epoch": 0.3215391552683, + "flos": 16580747243520.0, + "grad_norm": 2.0887064290534054, + "language_loss": 0.80050689, + "learning_rate": 3.1727014221616164e-06, + "loss": 0.82226193, + "num_input_tokens_seen": 114882820, + "router_z_loss_clip": 0.87646484, + "router_z_loss_mlp": 0.1652832, + "step": 5348, + "time_per_iteration": 2.471748113632202 + }, + { + "auxiliary_loss_clip": 0.01145167, + "auxiliary_loss_mlp": 0.01047732, + "balance_loss_clip": 1.0533694, + "balance_loss_mlp": 1.03166294, + "epoch": 0.321599278520968, + "flos": 17821820010240.0, + "grad_norm": 2.5418115758067588, + "language_loss": 0.85703659, + "learning_rate": 3.172385913647542e-06, + "loss": 0.87896562, + "num_input_tokens_seen": 114900745, + "router_z_loss_clip": 0.91845703, + "router_z_loss_mlp": 0.16064453, + "step": 5349, + "time_per_iteration": 2.5006701946258545 + }, + { + "auxiliary_loss_clip": 0.01149817, + "auxiliary_loss_mlp": 0.01042255, + "balance_loss_clip": 1.05766273, + "balance_loss_mlp": 1.02580392, + "epoch": 0.32165940177363594, + "flos": 16251769555200.0, + "grad_norm": 1.6876183837229874, + "language_loss": 0.80556828, + "learning_rate": 3.172070360676475e-06, + "loss": 0.82748902, + "num_input_tokens_seen": 114917940, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.16455078, + "step": 5350, + "time_per_iteration": 2.4153006076812744 + }, + { + "auxiliary_loss_clip": 0.01142405, + "auxiliary_loss_mlp": 0.01040551, + "balance_loss_clip": 1.05252802, + "balance_loss_mlp": 1.02531588, + "epoch": 0.3217195250263039, + "flos": 27599900158080.0, + "grad_norm": 2.2521352164295996, + "language_loss": 0.79914773, + "learning_rate": 3.1717547632603828e-06, + "loss": 0.82097727, + "num_input_tokens_seen": 114937735, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.15228271, + "step": 5351, + "time_per_iteration": 2.5715606212615967 + }, + { + "auxiliary_loss_clip": 0.01145835, + "auxiliary_loss_mlp": 0.01042878, + "balance_loss_clip": 1.05797005, + "balance_loss_mlp": 1.02676058, + "epoch": 0.3217796482789719, + "flos": 21470595373440.0, + "grad_norm": 1.7618023029358958, + "language_loss": 0.75791085, + "learning_rate": 3.1714391214112326e-06, + "loss": 0.77979797, + "num_input_tokens_seen": 114956630, + "router_z_loss_clip": 0.87890625, + "router_z_loss_mlp": 0.16113281, + "step": 5352, + "time_per_iteration": 2.4641668796539307 + }, + { + "auxiliary_loss_clip": 0.01140398, + "auxiliary_loss_mlp": 0.01039564, + "balance_loss_clip": 1.05109334, + "balance_loss_mlp": 1.02341676, + "epoch": 0.32183977153163984, + "flos": 21215593745280.0, + "grad_norm": 2.2253222649585713, + "language_loss": 0.81947261, + "learning_rate": 3.1711234351409933e-06, + "loss": 0.84127218, + "num_input_tokens_seen": 114976470, + "router_z_loss_clip": 0.89208984, + "router_z_loss_mlp": 0.16143799, + "step": 5353, + "time_per_iteration": 2.502073287963867 + }, + { + "auxiliary_loss_clip": 0.01132068, + "auxiliary_loss_mlp": 0.01045688, + "balance_loss_clip": 1.04764795, + "balance_loss_mlp": 1.02843845, + "epoch": 0.3218998947843078, + "flos": 24608182331520.0, + "grad_norm": 1.9958716684030802, + "language_loss": 0.73203743, + "learning_rate": 3.1708077044616365e-06, + "loss": 0.75381505, + "num_input_tokens_seen": 114996710, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.17254639, + "step": 5354, + "time_per_iteration": 2.4692766666412354 + }, + { + "auxiliary_loss_clip": 0.01138861, + "auxiliary_loss_mlp": 0.01039348, + "balance_loss_clip": 1.04993761, + "balance_loss_mlp": 1.02350485, + "epoch": 0.3219600180369758, + "flos": 22270577126400.0, + "grad_norm": 2.2198533789647197, + "language_loss": 0.83868963, + "learning_rate": 3.1704919293851334e-06, + "loss": 0.86047173, + "num_input_tokens_seen": 115015775, + "router_z_loss_clip": 0.89013672, + "router_z_loss_mlp": 0.15856934, + "step": 5355, + "time_per_iteration": 2.4822652339935303 + }, + { + "auxiliary_loss_clip": 0.01146199, + "auxiliary_loss_mlp": 0.01056515, + "balance_loss_clip": 1.05417991, + "balance_loss_mlp": 1.03901482, + "epoch": 0.3220201412896438, + "flos": 14939126939520.0, + "grad_norm": 1.8154736085712497, + "language_loss": 0.71309978, + "learning_rate": 3.1701761099234597e-06, + "loss": 0.73512691, + "num_input_tokens_seen": 115034265, + "router_z_loss_clip": 0.91992188, + "router_z_loss_mlp": 0.17492676, + "step": 5356, + "time_per_iteration": 2.4331095218658447 + }, + { + "auxiliary_loss_clip": 0.0115246, + "auxiliary_loss_mlp": 0.0104908, + "balance_loss_clip": 1.05418015, + "balance_loss_mlp": 1.03173566, + "epoch": 0.32208026454231176, + "flos": 22667389649280.0, + "grad_norm": 2.5380505048950686, + "language_loss": 0.68252659, + "learning_rate": 3.1698602460885903e-06, + "loss": 0.70454192, + "num_input_tokens_seen": 115051945, + "router_z_loss_clip": 0.98193359, + "router_z_loss_mlp": 0.17333984, + "step": 5357, + "time_per_iteration": 2.5250093936920166 + }, + { + "auxiliary_loss_clip": 0.01103588, + "auxiliary_loss_mlp": 0.01007875, + "balance_loss_clip": 1.07112873, + "balance_loss_mlp": 1.005193, + "epoch": 0.3221403877949797, + "flos": 64605130053120.0, + "grad_norm": 0.7055522753603655, + "language_loss": 0.58255398, + "learning_rate": 3.1695443378925035e-06, + "loss": 0.60366857, + "num_input_tokens_seen": 115119090, + "router_z_loss_clip": 0.32470703, + "router_z_loss_mlp": 0.02685547, + "step": 5358, + "time_per_iteration": 3.1733481884002686 + }, + { + "auxiliary_loss_clip": 0.01148994, + "auxiliary_loss_mlp": 0.01050694, + "balance_loss_clip": 1.05462623, + "balance_loss_mlp": 1.033957, + "epoch": 0.3222005110476477, + "flos": 20157019004160.0, + "grad_norm": 5.237830875311917, + "language_loss": 0.83462983, + "learning_rate": 3.1692283853471777e-06, + "loss": 0.85662675, + "num_input_tokens_seen": 115137755, + "router_z_loss_clip": 0.94287109, + "router_z_loss_mlp": 0.1673584, + "step": 5359, + "time_per_iteration": 2.4566054344177246 + }, + { + "auxiliary_loss_clip": 0.01146098, + "auxiliary_loss_mlp": 0.0103806, + "balance_loss_clip": 1.05596459, + "balance_loss_mlp": 1.02303982, + "epoch": 0.32226063430031565, + "flos": 22674177319680.0, + "grad_norm": 2.793320604288175, + "language_loss": 0.79695618, + "learning_rate": 3.168912388464595e-06, + "loss": 0.81879777, + "num_input_tokens_seen": 115158150, + "router_z_loss_clip": 0.90185547, + "router_z_loss_mlp": 0.15026855, + "step": 5360, + "time_per_iteration": 2.8097376823425293 + }, + { + "auxiliary_loss_clip": 0.01095723, + "auxiliary_loss_mlp": 0.01003898, + "balance_loss_clip": 1.06499147, + "balance_loss_mlp": 1.00175798, + "epoch": 0.3223207575529836, + "flos": 63828525075840.0, + "grad_norm": 0.65043859040599, + "language_loss": 0.56954348, + "learning_rate": 3.168596347256737e-06, + "loss": 0.59053969, + "num_input_tokens_seen": 115212755, + "router_z_loss_clip": 0.30712891, + "router_z_loss_mlp": 0.02139282, + "step": 5361, + "time_per_iteration": 2.9460813999176025 + }, + { + "auxiliary_loss_clip": 0.01148977, + "auxiliary_loss_mlp": 0.01044816, + "balance_loss_clip": 1.05907321, + "balance_loss_mlp": 1.02831745, + "epoch": 0.3223808808056516, + "flos": 26870123537280.0, + "grad_norm": 1.9641075862858248, + "language_loss": 0.71448576, + "learning_rate": 3.168280261735588e-06, + "loss": 0.73642361, + "num_input_tokens_seen": 115233090, + "router_z_loss_clip": 0.89892578, + "router_z_loss_mlp": 0.16491699, + "step": 5362, + "time_per_iteration": 2.503737211227417 + }, + { + "auxiliary_loss_clip": 0.01143871, + "auxiliary_loss_mlp": 0.01044766, + "balance_loss_clip": 1.05557942, + "balance_loss_mlp": 1.03072321, + "epoch": 0.32244100405831955, + "flos": 26761350176640.0, + "grad_norm": 1.749400996699699, + "language_loss": 0.73871309, + "learning_rate": 3.167964131913135e-06, + "loss": 0.76059949, + "num_input_tokens_seen": 115252645, + "router_z_loss_clip": 0.88330078, + "router_z_loss_mlp": 0.14056396, + "step": 5363, + "time_per_iteration": 2.5058093070983887 + }, + { + "auxiliary_loss_clip": 0.01147285, + "auxiliary_loss_mlp": 0.01044433, + "balance_loss_clip": 1.05211782, + "balance_loss_mlp": 1.02858984, + "epoch": 0.3225011273109875, + "flos": 23803029020160.0, + "grad_norm": 2.553969589988457, + "language_loss": 0.76625049, + "learning_rate": 3.167647957801365e-06, + "loss": 0.78816766, + "num_input_tokens_seen": 115269085, + "router_z_loss_clip": 0.95068359, + "router_z_loss_mlp": 0.15844727, + "step": 5364, + "time_per_iteration": 2.4365532398223877 + }, + { + "auxiliary_loss_clip": 0.01155395, + "auxiliary_loss_mlp": 0.01038791, + "balance_loss_clip": 1.06232619, + "balance_loss_mlp": 1.02310324, + "epoch": 0.3225612505636555, + "flos": 17274505501440.0, + "grad_norm": 2.2427355706224383, + "language_loss": 0.76808888, + "learning_rate": 3.1673317394122672e-06, + "loss": 0.79003072, + "num_input_tokens_seen": 115286470, + "router_z_loss_clip": 0.93017578, + "router_z_loss_mlp": 0.15673828, + "step": 5365, + "time_per_iteration": 2.460050344467163 + }, + { + "auxiliary_loss_clip": 0.0115039, + "auxiliary_loss_mlp": 0.01044134, + "balance_loss_clip": 1.06109083, + "balance_loss_mlp": 1.0287199, + "epoch": 0.32262137381632344, + "flos": 23366247638400.0, + "grad_norm": 1.9729740833015514, + "language_loss": 0.76670831, + "learning_rate": 3.1670154767578333e-06, + "loss": 0.78865355, + "num_input_tokens_seen": 115307000, + "router_z_loss_clip": 0.89208984, + "router_z_loss_mlp": 0.15405273, + "step": 5366, + "time_per_iteration": 3.9388394355773926 + }, + { + "auxiliary_loss_clip": 0.01145983, + "auxiliary_loss_mlp": 0.01044969, + "balance_loss_clip": 1.05431247, + "balance_loss_mlp": 1.02884007, + "epoch": 0.3226814970689914, + "flos": 23258803080960.0, + "grad_norm": 1.9435217529015956, + "language_loss": 0.72255063, + "learning_rate": 3.166699169850055e-06, + "loss": 0.74446017, + "num_input_tokens_seen": 115325925, + "router_z_loss_clip": 0.91748047, + "router_z_loss_mlp": 0.16137695, + "step": 5367, + "time_per_iteration": 2.4281511306762695 + }, + { + "auxiliary_loss_clip": 0.01138325, + "auxiliary_loss_mlp": 0.01035756, + "balance_loss_clip": 1.05106437, + "balance_loss_mlp": 1.0211525, + "epoch": 0.32274162032165943, + "flos": 16395196561920.0, + "grad_norm": 11.526940492525261, + "language_loss": 0.74140644, + "learning_rate": 3.1663828187009274e-06, + "loss": 0.76314723, + "num_input_tokens_seen": 115343705, + "router_z_loss_clip": 0.87207031, + "router_z_loss_mlp": 0.14611816, + "step": 5368, + "time_per_iteration": 2.4990346431732178 + }, + { + "auxiliary_loss_clip": 0.01140513, + "auxiliary_loss_mlp": 0.0103486, + "balance_loss_clip": 1.05528903, + "balance_loss_mlp": 1.01995277, + "epoch": 0.3228017435743274, + "flos": 27855081354240.0, + "grad_norm": 2.918209117529622, + "language_loss": 0.78878015, + "learning_rate": 3.1660664233224467e-06, + "loss": 0.81053388, + "num_input_tokens_seen": 115364170, + "router_z_loss_clip": 0.85253906, + "router_z_loss_mlp": 0.14916992, + "step": 5369, + "time_per_iteration": 2.4895100593566895 + }, + { + "auxiliary_loss_clip": 0.01135369, + "auxiliary_loss_mlp": 0.01033501, + "balance_loss_clip": 1.05156755, + "balance_loss_mlp": 1.01908886, + "epoch": 0.32286186682699536, + "flos": 19608770741760.0, + "grad_norm": 1.9963412103397125, + "language_loss": 0.83352661, + "learning_rate": 3.16574998372661e-06, + "loss": 0.85521531, + "num_input_tokens_seen": 115382495, + "router_z_loss_clip": 0.83886719, + "router_z_loss_mlp": 0.14416504, + "step": 5370, + "time_per_iteration": 2.474339008331299 + }, + { + "auxiliary_loss_clip": 0.01149018, + "auxiliary_loss_mlp": 0.01037104, + "balance_loss_clip": 1.0586642, + "balance_loss_mlp": 1.02215552, + "epoch": 0.3229219900796633, + "flos": 24134017870080.0, + "grad_norm": 3.7886356344061785, + "language_loss": 0.83154583, + "learning_rate": 3.1654334999254177e-06, + "loss": 0.85340703, + "num_input_tokens_seen": 115399450, + "router_z_loss_clip": 0.90332031, + "router_z_loss_mlp": 0.14953613, + "step": 5371, + "time_per_iteration": 2.4475603103637695 + }, + { + "auxiliary_loss_clip": 0.01142724, + "auxiliary_loss_mlp": 0.01044267, + "balance_loss_clip": 1.05169082, + "balance_loss_mlp": 1.02748239, + "epoch": 0.3229821133323313, + "flos": 17748705876480.0, + "grad_norm": 3.338190414523831, + "language_loss": 0.88618326, + "learning_rate": 3.1651169719308695e-06, + "loss": 0.90805316, + "num_input_tokens_seen": 115417700, + "router_z_loss_clip": 0.91064453, + "router_z_loss_mlp": 0.16772461, + "step": 5372, + "time_per_iteration": 2.4774374961853027 + }, + { + "auxiliary_loss_clip": 0.01137669, + "auxiliary_loss_mlp": 0.0103968, + "balance_loss_clip": 1.05111349, + "balance_loss_mlp": 1.02445674, + "epoch": 0.32304223658499925, + "flos": 22346025644160.0, + "grad_norm": 20.665094803989316, + "language_loss": 0.72913671, + "learning_rate": 3.1648003997549694e-06, + "loss": 0.75091016, + "num_input_tokens_seen": 115435840, + "router_z_loss_clip": 0.86523438, + "router_z_loss_mlp": 0.15246582, + "step": 5373, + "time_per_iteration": 2.666637420654297 + }, + { + "auxiliary_loss_clip": 0.0114034, + "auxiliary_loss_mlp": 0.01035369, + "balance_loss_clip": 1.05543566, + "balance_loss_mlp": 1.02056289, + "epoch": 0.3231023598376672, + "flos": 18478302929280.0, + "grad_norm": 2.2058822304166497, + "language_loss": 0.81264734, + "learning_rate": 3.1644837834097214e-06, + "loss": 0.83440447, + "num_input_tokens_seen": 115454210, + "router_z_loss_clip": 0.84814453, + "router_z_loss_mlp": 0.14794922, + "step": 5374, + "time_per_iteration": 2.621082067489624 + }, + { + "auxiliary_loss_clip": 0.01136915, + "auxiliary_loss_mlp": 0.01037525, + "balance_loss_clip": 1.04994965, + "balance_loss_mlp": 1.02256429, + "epoch": 0.3231624830903352, + "flos": 27636313570560.0, + "grad_norm": 2.1138741396968026, + "language_loss": 0.87108755, + "learning_rate": 3.1641671229071317e-06, + "loss": 0.89283192, + "num_input_tokens_seen": 115471785, + "router_z_loss_clip": 0.86962891, + "router_z_loss_mlp": 0.1494751, + "step": 5375, + "time_per_iteration": 2.4534380435943604 + }, + { + "auxiliary_loss_clip": 0.01153232, + "auxiliary_loss_mlp": 0.01035248, + "balance_loss_clip": 1.06317663, + "balance_loss_mlp": 1.0188334, + "epoch": 0.32322260634300315, + "flos": 21726423014400.0, + "grad_norm": 2.058789417521758, + "language_loss": 0.76446271, + "learning_rate": 3.1638504182592076e-06, + "loss": 0.78634751, + "num_input_tokens_seen": 115491405, + "router_z_loss_clip": 0.90234375, + "router_z_loss_mlp": 0.16418457, + "step": 5376, + "time_per_iteration": 2.592787265777588 + }, + { + "auxiliary_loss_clip": 0.01131756, + "auxiliary_loss_mlp": 0.0103473, + "balance_loss_clip": 1.04633045, + "balance_loss_mlp": 1.02106881, + "epoch": 0.3232827295956711, + "flos": 22637656166400.0, + "grad_norm": 1.499410651918161, + "language_loss": 0.66879094, + "learning_rate": 3.1635336694779594e-06, + "loss": 0.69045573, + "num_input_tokens_seen": 115511555, + "router_z_loss_clip": 0.85400391, + "router_z_loss_mlp": 0.13671875, + "step": 5377, + "time_per_iteration": 3.8860747814178467 + }, + { + "auxiliary_loss_clip": 0.01139216, + "auxiliary_loss_mlp": 0.01045082, + "balance_loss_clip": 1.05103767, + "balance_loss_mlp": 1.02824938, + "epoch": 0.3233428528483391, + "flos": 26322593546880.0, + "grad_norm": 1.6048092795538487, + "language_loss": 0.72094738, + "learning_rate": 3.1632168765753982e-06, + "loss": 0.74279034, + "num_input_tokens_seen": 115532860, + "router_z_loss_clip": 0.88183594, + "router_z_loss_mlp": 0.16833496, + "step": 5378, + "time_per_iteration": 2.4931397438049316 + }, + { + "auxiliary_loss_clip": 0.01145894, + "auxiliary_loss_mlp": 0.01034609, + "balance_loss_clip": 1.05584311, + "balance_loss_mlp": 1.01955271, + "epoch": 0.32340297610100704, + "flos": 28585217111040.0, + "grad_norm": 1.971010269256266, + "language_loss": 0.82104504, + "learning_rate": 3.1629000395635357e-06, + "loss": 0.84285003, + "num_input_tokens_seen": 115553850, + "router_z_loss_clip": 0.89990234, + "router_z_loss_mlp": 0.1505127, + "step": 5379, + "time_per_iteration": 2.633704423904419 + }, + { + "auxiliary_loss_clip": 0.01138805, + "auxiliary_loss_mlp": 0.01035715, + "balance_loss_clip": 1.04974174, + "balance_loss_mlp": 1.02126694, + "epoch": 0.323463099353675, + "flos": 30773792787840.0, + "grad_norm": 1.952435773462618, + "language_loss": 0.78524733, + "learning_rate": 3.162583158454388e-06, + "loss": 0.80699253, + "num_input_tokens_seen": 115575530, + "router_z_loss_clip": 0.89013672, + "router_z_loss_mlp": 0.14440918, + "step": 5380, + "time_per_iteration": 2.6330177783966064 + }, + { + "auxiliary_loss_clip": 0.01136284, + "auxiliary_loss_mlp": 0.01040797, + "balance_loss_clip": 1.04891276, + "balance_loss_mlp": 1.02604508, + "epoch": 0.32352322260634303, + "flos": 25228610974080.0, + "grad_norm": 1.763581747257545, + "language_loss": 0.77616751, + "learning_rate": 3.1622662332599697e-06, + "loss": 0.79793835, + "num_input_tokens_seen": 115594885, + "router_z_loss_clip": 0.87451172, + "router_z_loss_mlp": 0.14758301, + "step": 5381, + "time_per_iteration": 2.5233426094055176 + }, + { + "auxiliary_loss_clip": 0.01129115, + "auxiliary_loss_mlp": 0.01037083, + "balance_loss_clip": 1.04543447, + "balance_loss_mlp": 1.02305233, + "epoch": 0.323583345859011, + "flos": 23330480670720.0, + "grad_norm": 1.812443667438591, + "language_loss": 0.7186057, + "learning_rate": 3.1619492639922998e-06, + "loss": 0.74026769, + "num_input_tokens_seen": 115614080, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.14019775, + "step": 5382, + "time_per_iteration": 2.478925943374634 + }, + { + "auxiliary_loss_clip": 0.01138085, + "auxiliary_loss_mlp": 0.01047437, + "balance_loss_clip": 1.04792464, + "balance_loss_mlp": 1.03108191, + "epoch": 0.32364346911167896, + "flos": 26207499392640.0, + "grad_norm": 2.324328782806388, + "language_loss": 0.70907903, + "learning_rate": 3.1616322506633964e-06, + "loss": 0.73093426, + "num_input_tokens_seen": 115632820, + "router_z_loss_clip": 0.90087891, + "router_z_loss_mlp": 0.16357422, + "step": 5383, + "time_per_iteration": 2.52343487739563 + }, + { + "auxiliary_loss_clip": 0.01137133, + "auxiliary_loss_mlp": 0.01046876, + "balance_loss_clip": 1.05105472, + "balance_loss_mlp": 1.03126562, + "epoch": 0.3237035923643469, + "flos": 23695764030720.0, + "grad_norm": 1.9022315888306434, + "language_loss": 0.7836988, + "learning_rate": 3.161315193285283e-06, + "loss": 0.80553889, + "num_input_tokens_seen": 115652860, + "router_z_loss_clip": 0.86035156, + "router_z_loss_mlp": 0.15612793, + "step": 5384, + "time_per_iteration": 3.930454969406128 + }, + { + "auxiliary_loss_clip": 0.01143199, + "auxiliary_loss_mlp": 0.01041081, + "balance_loss_clip": 1.05396914, + "balance_loss_mlp": 1.02451146, + "epoch": 0.3237637156170149, + "flos": 14428728633600.0, + "grad_norm": 2.525246264449695, + "language_loss": 0.75374579, + "learning_rate": 3.16099809186998e-06, + "loss": 0.77558863, + "num_input_tokens_seen": 115670940, + "router_z_loss_clip": 0.89160156, + "router_z_loss_mlp": 0.16589355, + "step": 5385, + "time_per_iteration": 2.442979335784912 + }, + { + "auxiliary_loss_clip": 0.01138017, + "auxiliary_loss_mlp": 0.01035468, + "balance_loss_clip": 1.05072594, + "balance_loss_mlp": 1.02020359, + "epoch": 0.32382383886968286, + "flos": 31062981185280.0, + "grad_norm": 1.9426959287033492, + "language_loss": 0.71766073, + "learning_rate": 3.1606809464295145e-06, + "loss": 0.73939556, + "num_input_tokens_seen": 115691155, + "router_z_loss_clip": 0.87353516, + "router_z_loss_mlp": 0.15252686, + "step": 5386, + "time_per_iteration": 2.884058952331543 + }, + { + "auxiliary_loss_clip": 0.01138636, + "auxiliary_loss_mlp": 0.01036675, + "balance_loss_clip": 1.04692829, + "balance_loss_mlp": 1.02085555, + "epoch": 0.3238839621223508, + "flos": 23256935573760.0, + "grad_norm": 2.160451601003301, + "language_loss": 0.94450653, + "learning_rate": 3.1603637569759095e-06, + "loss": 0.9662596, + "num_input_tokens_seen": 115710340, + "router_z_loss_clip": 0.91748047, + "router_z_loss_mlp": 0.1583252, + "step": 5387, + "time_per_iteration": 2.5964128971099854 + }, + { + "auxiliary_loss_clip": 0.01152534, + "auxiliary_loss_mlp": 0.01042461, + "balance_loss_clip": 1.06161106, + "balance_loss_mlp": 1.02583146, + "epoch": 0.3239440853750188, + "flos": 22964658606720.0, + "grad_norm": 2.486970069766145, + "language_loss": 0.77465117, + "learning_rate": 3.1600465235211956e-06, + "loss": 0.79660106, + "num_input_tokens_seen": 115726745, + "router_z_loss_clip": 0.90917969, + "router_z_loss_mlp": 0.16625977, + "step": 5388, + "time_per_iteration": 2.5513627529144287 + }, + { + "auxiliary_loss_clip": 0.01137869, + "auxiliary_loss_mlp": 0.01032521, + "balance_loss_clip": 1.04881716, + "balance_loss_mlp": 1.01728618, + "epoch": 0.32400420862768675, + "flos": 36246614653440.0, + "grad_norm": 2.0316691338634407, + "language_loss": 0.71705341, + "learning_rate": 3.1597292460774006e-06, + "loss": 0.73875737, + "num_input_tokens_seen": 115749385, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.15234375, + "step": 5389, + "time_per_iteration": 2.9732580184936523 + }, + { + "auxiliary_loss_clip": 0.01136377, + "auxiliary_loss_mlp": 0.01033375, + "balance_loss_clip": 1.04935455, + "balance_loss_mlp": 1.01756823, + "epoch": 0.3240643318803547, + "flos": 21616500418560.0, + "grad_norm": 1.8167812381150747, + "language_loss": 0.80905116, + "learning_rate": 3.159411924656557e-06, + "loss": 0.83074868, + "num_input_tokens_seen": 115768105, + "router_z_loss_clip": 0.86914062, + "router_z_loss_mlp": 0.15808105, + "step": 5390, + "time_per_iteration": 2.5333738327026367 + }, + { + "auxiliary_loss_clip": 0.01141454, + "auxiliary_loss_mlp": 0.01041972, + "balance_loss_clip": 1.05363977, + "balance_loss_mlp": 1.026618, + "epoch": 0.3241244551330227, + "flos": 23295611543040.0, + "grad_norm": 2.432134136623934, + "language_loss": 0.73057222, + "learning_rate": 3.1590945592706967e-06, + "loss": 0.75240648, + "num_input_tokens_seen": 115787340, + "router_z_loss_clip": 0.87792969, + "router_z_loss_mlp": 0.15350342, + "step": 5391, + "time_per_iteration": 2.453504800796509 + }, + { + "auxiliary_loss_clip": 0.01135071, + "auxiliary_loss_mlp": 0.01037387, + "balance_loss_clip": 1.04966879, + "balance_loss_mlp": 1.02172339, + "epoch": 0.32418457838569065, + "flos": 14097236993280.0, + "grad_norm": 2.120454241595208, + "language_loss": 0.77124435, + "learning_rate": 3.158777149931855e-06, + "loss": 0.79296893, + "num_input_tokens_seen": 115805565, + "router_z_loss_clip": 0.85400391, + "router_z_loss_mlp": 0.15649414, + "step": 5392, + "time_per_iteration": 2.4271469116210938 + }, + { + "auxiliary_loss_clip": 0.01138779, + "auxiliary_loss_mlp": 0.01040881, + "balance_loss_clip": 1.04792237, + "balance_loss_mlp": 1.02367926, + "epoch": 0.3242447016383586, + "flos": 29752672953600.0, + "grad_norm": 6.972318557489506, + "language_loss": 0.62482309, + "learning_rate": 3.158459696652067e-06, + "loss": 0.64661968, + "num_input_tokens_seen": 115826725, + "router_z_loss_clip": 0.90869141, + "router_z_loss_mlp": 0.17199707, + "step": 5393, + "time_per_iteration": 2.5240495204925537 + }, + { + "auxiliary_loss_clip": 0.01145139, + "auxiliary_loss_mlp": 0.01038093, + "balance_loss_clip": 1.05693316, + "balance_loss_mlp": 1.0225842, + "epoch": 0.3243048248910266, + "flos": 24351205455360.0, + "grad_norm": 1.9343806592928348, + "language_loss": 0.82887161, + "learning_rate": 3.158142199443371e-06, + "loss": 0.85070395, + "num_input_tokens_seen": 115846955, + "router_z_loss_clip": 0.88232422, + "router_z_loss_mlp": 0.1550293, + "step": 5394, + "time_per_iteration": 2.4895057678222656 + }, + { + "auxiliary_loss_clip": 0.01140105, + "auxiliary_loss_mlp": 0.01047232, + "balance_loss_clip": 1.05466485, + "balance_loss_mlp": 1.03311169, + "epoch": 0.3243649481436946, + "flos": 24353037048960.0, + "grad_norm": 1.9788382991697098, + "language_loss": 0.82026196, + "learning_rate": 3.1578246583178076e-06, + "loss": 0.84213531, + "num_input_tokens_seen": 115865975, + "router_z_loss_clip": 0.85302734, + "router_z_loss_mlp": 0.14117432, + "step": 5395, + "time_per_iteration": 2.5159480571746826 + }, + { + "auxiliary_loss_clip": 0.01135397, + "auxiliary_loss_mlp": 0.01048821, + "balance_loss_clip": 1.05229306, + "balance_loss_mlp": 1.03257275, + "epoch": 0.32442507139636256, + "flos": 22925228451840.0, + "grad_norm": 3.442113110019837, + "language_loss": 0.83169937, + "learning_rate": 3.157507073287417e-06, + "loss": 0.85354155, + "num_input_tokens_seen": 115884950, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.16247559, + "step": 5396, + "time_per_iteration": 2.480055093765259 + }, + { + "auxiliary_loss_clip": 0.0115425, + "auxiliary_loss_mlp": 0.01038424, + "balance_loss_clip": 1.06078744, + "balance_loss_mlp": 1.02266443, + "epoch": 0.32448519464903053, + "flos": 22200192426240.0, + "grad_norm": 2.0494083076129472, + "language_loss": 0.75621432, + "learning_rate": 3.1571894443642414e-06, + "loss": 0.77814102, + "num_input_tokens_seen": 115904170, + "router_z_loss_clip": 0.93457031, + "router_z_loss_mlp": 0.15771484, + "step": 5397, + "time_per_iteration": 2.466346025466919 + }, + { + "auxiliary_loss_clip": 0.01146967, + "auxiliary_loss_mlp": 0.01035727, + "balance_loss_clip": 1.05983114, + "balance_loss_mlp": 1.02019429, + "epoch": 0.3245453179016985, + "flos": 18838450644480.0, + "grad_norm": 2.3018458844282113, + "language_loss": 0.67064238, + "learning_rate": 3.1568717715603263e-06, + "loss": 0.69246924, + "num_input_tokens_seen": 115919255, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.15545654, + "step": 5398, + "time_per_iteration": 2.4797918796539307 + }, + { + "auxiliary_loss_clip": 0.01146907, + "auxiliary_loss_mlp": 0.01030835, + "balance_loss_clip": 1.0594629, + "balance_loss_mlp": 1.01587427, + "epoch": 0.32460544115436646, + "flos": 21178390233600.0, + "grad_norm": 1.4727461911041047, + "language_loss": 0.72988242, + "learning_rate": 3.156554054887718e-06, + "loss": 0.75165987, + "num_input_tokens_seen": 115938535, + "router_z_loss_clip": 0.87451172, + "router_z_loss_mlp": 0.1496582, + "step": 5399, + "time_per_iteration": 2.439918041229248 + }, + { + "auxiliary_loss_clip": 0.01147837, + "auxiliary_loss_mlp": 0.01033102, + "balance_loss_clip": 1.0611732, + "balance_loss_mlp": 1.01788473, + "epoch": 0.3246655644070344, + "flos": 21981137333760.0, + "grad_norm": 4.535095725690787, + "language_loss": 0.71386701, + "learning_rate": 3.1562362943584645e-06, + "loss": 0.73567641, + "num_input_tokens_seen": 115955005, + "router_z_loss_clip": 0.86621094, + "router_z_loss_mlp": 0.15203857, + "step": 5400, + "time_per_iteration": 2.451572895050049 + }, + { + "auxiliary_loss_clip": 0.01155038, + "auxiliary_loss_mlp": 0.01031967, + "balance_loss_clip": 1.06541657, + "balance_loss_mlp": 1.01741195, + "epoch": 0.3247256876597024, + "flos": 32159729105280.0, + "grad_norm": 2.0889475909654203, + "language_loss": 0.79567873, + "learning_rate": 3.155918489984614e-06, + "loss": 0.81754875, + "num_input_tokens_seen": 115975305, + "router_z_loss_clip": 0.89794922, + "router_z_loss_mlp": 0.14550781, + "step": 5401, + "time_per_iteration": 2.5449376106262207 + }, + { + "auxiliary_loss_clip": 0.01142611, + "auxiliary_loss_mlp": 0.01039893, + "balance_loss_clip": 1.05238008, + "balance_loss_mlp": 1.02329922, + "epoch": 0.32478581091237035, + "flos": 20997544233600.0, + "grad_norm": 1.6150693537888072, + "language_loss": 0.87713194, + "learning_rate": 3.1556006417782196e-06, + "loss": 0.89895695, + "num_input_tokens_seen": 115994810, + "router_z_loss_clip": 0.90185547, + "router_z_loss_mlp": 0.16589355, + "step": 5402, + "time_per_iteration": 2.446453332901001 + }, + { + "auxiliary_loss_clip": 0.01135707, + "auxiliary_loss_mlp": 0.01034455, + "balance_loss_clip": 1.05119681, + "balance_loss_mlp": 1.01950586, + "epoch": 0.3248459341650383, + "flos": 17924990849280.0, + "grad_norm": 2.4503926588058493, + "language_loss": 0.84548962, + "learning_rate": 3.155282749751332e-06, + "loss": 0.8671912, + "num_input_tokens_seen": 116011095, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.14935303, + "step": 5403, + "time_per_iteration": 2.9421544075012207 + }, + { + "auxiliary_loss_clip": 0.011368, + "auxiliary_loss_mlp": 0.01042608, + "balance_loss_clip": 1.05294847, + "balance_loss_mlp": 1.02896476, + "epoch": 0.3249060574177063, + "flos": 24535606901760.0, + "grad_norm": 2.4842009826795377, + "language_loss": 0.86895931, + "learning_rate": 3.154964813916007e-06, + "loss": 0.89075339, + "num_input_tokens_seen": 116028805, + "router_z_loss_clip": 0.83935547, + "router_z_loss_mlp": 0.13647461, + "step": 5404, + "time_per_iteration": 2.4958760738372803 + }, + { + "auxiliary_loss_clip": 0.01145048, + "auxiliary_loss_mlp": 0.01031514, + "balance_loss_clip": 1.06078243, + "balance_loss_mlp": 1.01670802, + "epoch": 0.32496618067037425, + "flos": 25994765093760.0, + "grad_norm": 1.6650167214688407, + "language_loss": 0.72935307, + "learning_rate": 3.1546468342843008e-06, + "loss": 0.75111866, + "num_input_tokens_seen": 116047765, + "router_z_loss_clip": 0.84326172, + "router_z_loss_mlp": 0.14794922, + "step": 5405, + "time_per_iteration": 2.516759157180786 + }, + { + "auxiliary_loss_clip": 0.01133816, + "auxiliary_loss_mlp": 0.01046667, + "balance_loss_clip": 1.0494144, + "balance_loss_mlp": 1.02988279, + "epoch": 0.3250263039230422, + "flos": 19573757959680.0, + "grad_norm": 1.6121457523512133, + "language_loss": 0.82876396, + "learning_rate": 3.1543288108682707e-06, + "loss": 0.85056877, + "num_input_tokens_seen": 116068385, + "router_z_loss_clip": 0.84375, + "router_z_loss_mlp": 0.16784668, + "step": 5406, + "time_per_iteration": 2.4841837882995605 + }, + { + "auxiliary_loss_clip": 0.01138339, + "auxiliary_loss_mlp": 0.0104142, + "balance_loss_clip": 1.05244303, + "balance_loss_mlp": 1.02521992, + "epoch": 0.3250864271757102, + "flos": 16763640318720.0, + "grad_norm": 2.112417554316483, + "language_loss": 0.87988126, + "learning_rate": 3.1540107436799764e-06, + "loss": 0.9016788, + "num_input_tokens_seen": 116085350, + "router_z_loss_clip": 0.85888672, + "router_z_loss_mlp": 0.1619873, + "step": 5407, + "time_per_iteration": 2.4180212020874023 + }, + { + "auxiliary_loss_clip": 0.01141012, + "auxiliary_loss_mlp": 0.01033117, + "balance_loss_clip": 1.05460143, + "balance_loss_mlp": 1.01847816, + "epoch": 0.3251465504283782, + "flos": 27819458040960.0, + "grad_norm": 2.209179316996062, + "language_loss": 0.69802082, + "learning_rate": 3.153692632731479e-06, + "loss": 0.71976209, + "num_input_tokens_seen": 116107560, + "router_z_loss_clip": 0.86474609, + "router_z_loss_mlp": 0.1463623, + "step": 5408, + "time_per_iteration": 2.480318546295166 + }, + { + "auxiliary_loss_clip": 0.01161013, + "auxiliary_loss_mlp": 0.01031472, + "balance_loss_clip": 1.06686306, + "balance_loss_mlp": 1.0165292, + "epoch": 0.32520667368104617, + "flos": 19063144172160.0, + "grad_norm": 2.010080645012375, + "language_loss": 0.776981, + "learning_rate": 3.153374478034841e-06, + "loss": 0.79890585, + "num_input_tokens_seen": 116125980, + "router_z_loss_clip": 0.94238281, + "router_z_loss_mlp": 0.14941406, + "step": 5409, + "time_per_iteration": 2.419428825378418 + }, + { + "auxiliary_loss_clip": 0.01148307, + "auxiliary_loss_mlp": 0.01044358, + "balance_loss_clip": 1.05830216, + "balance_loss_mlp": 1.02948046, + "epoch": 0.32526679693371413, + "flos": 29382146208000.0, + "grad_norm": 1.8353412602812564, + "language_loss": 0.8329109, + "learning_rate": 3.1530562796021285e-06, + "loss": 0.85483748, + "num_input_tokens_seen": 116146530, + "router_z_loss_clip": 0.90039062, + "router_z_loss_mlp": 0.14868164, + "step": 5410, + "time_per_iteration": 3.885981798171997 + }, + { + "auxiliary_loss_clip": 0.01140687, + "auxiliary_loss_mlp": 0.01031606, + "balance_loss_clip": 1.0563159, + "balance_loss_mlp": 1.01715827, + "epoch": 0.3253269201863821, + "flos": 20704513080960.0, + "grad_norm": 1.8946908271078111, + "language_loss": 0.71147096, + "learning_rate": 3.152738037445405e-06, + "loss": 0.73319393, + "num_input_tokens_seen": 116165695, + "router_z_loss_clip": 0.84521484, + "router_z_loss_mlp": 0.14440918, + "step": 5411, + "time_per_iteration": 2.4105281829833984 + }, + { + "auxiliary_loss_clip": 0.01136211, + "auxiliary_loss_mlp": 0.01043338, + "balance_loss_clip": 1.04929626, + "balance_loss_mlp": 1.02942586, + "epoch": 0.32538704343905006, + "flos": 29094142959360.0, + "grad_norm": 1.600989832017656, + "language_loss": 0.83296323, + "learning_rate": 3.1524197515767403e-06, + "loss": 0.85475874, + "num_input_tokens_seen": 116185375, + "router_z_loss_clip": 0.86914062, + "router_z_loss_mlp": 0.13916016, + "step": 5412, + "time_per_iteration": 2.5229508876800537 + }, + { + "auxiliary_loss_clip": 0.01141292, + "auxiliary_loss_mlp": 0.01040748, + "balance_loss_clip": 1.05204034, + "balance_loss_mlp": 1.02534628, + "epoch": 0.325447166691718, + "flos": 24676124906880.0, + "grad_norm": 1.600989306149282, + "language_loss": 0.81167525, + "learning_rate": 3.152101422008203e-06, + "loss": 0.83349568, + "num_input_tokens_seen": 116204335, + "router_z_loss_clip": 0.89257812, + "router_z_loss_mlp": 0.15393066, + "step": 5413, + "time_per_iteration": 2.435950756072998 + }, + { + "auxiliary_loss_clip": 0.01132441, + "auxiliary_loss_mlp": 0.01038255, + "balance_loss_clip": 1.04781556, + "balance_loss_mlp": 1.02266216, + "epoch": 0.325507289944386, + "flos": 21543134889600.0, + "grad_norm": 1.6122014351970448, + "language_loss": 0.76973307, + "learning_rate": 3.151783048751864e-06, + "loss": 0.79144007, + "num_input_tokens_seen": 116222840, + "router_z_loss_clip": 0.84716797, + "router_z_loss_mlp": 0.15576172, + "step": 5414, + "time_per_iteration": 2.479360342025757 + }, + { + "auxiliary_loss_clip": 0.01079166, + "auxiliary_loss_mlp": 0.01005, + "balance_loss_clip": 1.04685354, + "balance_loss_mlp": 1.00311637, + "epoch": 0.32556741319705396, + "flos": 71518722347520.0, + "grad_norm": 0.9026426874511717, + "language_loss": 0.6395191, + "learning_rate": 3.1514646318197965e-06, + "loss": 0.66036075, + "num_input_tokens_seen": 116274940, + "router_z_loss_clip": 0.32324219, + "router_z_loss_mlp": 0.01882935, + "step": 5415, + "time_per_iteration": 2.999108076095581 + }, + { + "auxiliary_loss_clip": 0.01145472, + "auxiliary_loss_mlp": 0.01033814, + "balance_loss_clip": 1.05875087, + "balance_loss_mlp": 1.01885343, + "epoch": 0.3256275364497219, + "flos": 23732428838400.0, + "grad_norm": 1.41436688692094, + "language_loss": 0.74086094, + "learning_rate": 3.151146171224075e-06, + "loss": 0.76265383, + "num_input_tokens_seen": 116297300, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.14953613, + "step": 5416, + "time_per_iteration": 2.9920494556427 + }, + { + "auxiliary_loss_clip": 0.01065678, + "auxiliary_loss_mlp": 0.01003643, + "balance_loss_clip": 1.03450406, + "balance_loss_mlp": 1.00184298, + "epoch": 0.3256876597023899, + "flos": 67289199891840.0, + "grad_norm": 0.7726209457726679, + "language_loss": 0.57911551, + "learning_rate": 3.1508276669767757e-06, + "loss": 0.59980869, + "num_input_tokens_seen": 116362370, + "router_z_loss_clip": 0.31103516, + "router_z_loss_mlp": 0.01800537, + "step": 5417, + "time_per_iteration": 3.158628225326538 + }, + { + "auxiliary_loss_clip": 0.0107248, + "auxiliary_loss_mlp": 0.01006012, + "balance_loss_clip": 1.04026437, + "balance_loss_mlp": 1.00407517, + "epoch": 0.32574778295505785, + "flos": 71282323964160.0, + "grad_norm": 0.8137970211159612, + "language_loss": 0.63478386, + "learning_rate": 3.150509119089975e-06, + "loss": 0.65556878, + "num_input_tokens_seen": 116430365, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.01937866, + "step": 5418, + "time_per_iteration": 3.1632208824157715 + }, + { + "auxiliary_loss_clip": 0.0113955, + "auxiliary_loss_mlp": 0.0104253, + "balance_loss_clip": 1.05482697, + "balance_loss_mlp": 1.0286783, + "epoch": 0.3258079062077258, + "flos": 20776370238720.0, + "grad_norm": 2.057377263826721, + "language_loss": 0.69515967, + "learning_rate": 3.1501905275757537e-06, + "loss": 0.71698046, + "num_input_tokens_seen": 116447525, + "router_z_loss_clip": 0.84716797, + "router_z_loss_mlp": 0.1385498, + "step": 5419, + "time_per_iteration": 2.5065455436706543 + }, + { + "auxiliary_loss_clip": 0.01140312, + "auxiliary_loss_mlp": 0.0104427, + "balance_loss_clip": 1.05346847, + "balance_loss_mlp": 1.02737772, + "epoch": 0.3258680294603938, + "flos": 22235456603520.0, + "grad_norm": 1.6792891272579906, + "language_loss": 0.77190626, + "learning_rate": 3.1498718924461926e-06, + "loss": 0.79375207, + "num_input_tokens_seen": 116466310, + "router_z_loss_clip": 0.86767578, + "router_z_loss_mlp": 0.16882324, + "step": 5420, + "time_per_iteration": 2.4334471225738525 + }, + { + "auxiliary_loss_clip": 0.01141015, + "auxiliary_loss_mlp": 0.01042405, + "balance_loss_clip": 1.0527544, + "balance_loss_mlp": 1.02687252, + "epoch": 0.3259281527130618, + "flos": 26979974305920.0, + "grad_norm": 1.6260959545394071, + "language_loss": 0.80151337, + "learning_rate": 3.1495532137133736e-06, + "loss": 0.82334757, + "num_input_tokens_seen": 116487825, + "router_z_loss_clip": 0.88232422, + "router_z_loss_mlp": 0.15539551, + "step": 5421, + "time_per_iteration": 5.285635232925415 + }, + { + "auxiliary_loss_clip": 0.01136103, + "auxiliary_loss_mlp": 0.01035629, + "balance_loss_clip": 1.05298662, + "balance_loss_mlp": 1.02239656, + "epoch": 0.32598827596572977, + "flos": 26214251149440.0, + "grad_norm": 1.5494356115456076, + "language_loss": 0.75362104, + "learning_rate": 3.149234491389381e-06, + "loss": 0.77533841, + "num_input_tokens_seen": 116509950, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.13232422, + "step": 5422, + "time_per_iteration": 2.5132524967193604 + }, + { + "auxiliary_loss_clip": 0.01140012, + "auxiliary_loss_mlp": 0.01045297, + "balance_loss_clip": 1.05382204, + "balance_loss_mlp": 1.03024626, + "epoch": 0.32604839921839773, + "flos": 17639752947840.0, + "grad_norm": 5.612304560932837, + "language_loss": 0.6278615, + "learning_rate": 3.1489157254863026e-06, + "loss": 0.64971459, + "num_input_tokens_seen": 116527695, + "router_z_loss_clip": 0.86181641, + "router_z_loss_mlp": 0.15057373, + "step": 5423, + "time_per_iteration": 2.4288170337677 + }, + { + "auxiliary_loss_clip": 0.01126564, + "auxiliary_loss_mlp": 0.01035708, + "balance_loss_clip": 1.04628551, + "balance_loss_mlp": 1.02341723, + "epoch": 0.3261085224710657, + "flos": 23622721724160.0, + "grad_norm": 1.6039216372071377, + "language_loss": 0.74767351, + "learning_rate": 3.148596916016224e-06, + "loss": 0.76929617, + "num_input_tokens_seen": 116547800, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.1227417, + "step": 5424, + "time_per_iteration": 2.5258491039276123 + }, + { + "auxiliary_loss_clip": 0.01131136, + "auxiliary_loss_mlp": 0.01040597, + "balance_loss_clip": 1.04957473, + "balance_loss_mlp": 1.02765691, + "epoch": 0.32616864572373366, + "flos": 23260455106560.0, + "grad_norm": 1.616797131433075, + "language_loss": 0.77110934, + "learning_rate": 3.1482780629912355e-06, + "loss": 0.79282665, + "num_input_tokens_seen": 116568460, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.1293335, + "step": 5425, + "time_per_iteration": 2.5210788249969482 + }, + { + "auxiliary_loss_clip": 0.01138239, + "auxiliary_loss_mlp": 0.0103852, + "balance_loss_clip": 1.05011964, + "balance_loss_mlp": 1.02287388, + "epoch": 0.32622876897640163, + "flos": 25593427457280.0, + "grad_norm": 2.527030830849103, + "language_loss": 0.78284192, + "learning_rate": 3.147959166423428e-06, + "loss": 0.80460954, + "num_input_tokens_seen": 116588705, + "router_z_loss_clip": 0.88183594, + "router_z_loss_mlp": 0.15631104, + "step": 5426, + "time_per_iteration": 2.51786470413208 + }, + { + "auxiliary_loss_clip": 0.01133844, + "auxiliary_loss_mlp": 0.01037239, + "balance_loss_clip": 1.04783475, + "balance_loss_mlp": 1.02231431, + "epoch": 0.3262888922290696, + "flos": 22418996123520.0, + "grad_norm": 3.124974455480868, + "language_loss": 0.7440322, + "learning_rate": 3.147640226324893e-06, + "loss": 0.76574302, + "num_input_tokens_seen": 116608845, + "router_z_loss_clip": 0.85986328, + "router_z_loss_mlp": 0.14929199, + "step": 5427, + "time_per_iteration": 2.463909149169922 + }, + { + "auxiliary_loss_clip": 0.01139424, + "auxiliary_loss_mlp": 0.01044478, + "balance_loss_clip": 1.05252838, + "balance_loss_mlp": 1.02834868, + "epoch": 0.32634901548173756, + "flos": 19718908819200.0, + "grad_norm": 2.4069846454559185, + "language_loss": 0.79052168, + "learning_rate": 3.1473212427077266e-06, + "loss": 0.8123607, + "num_input_tokens_seen": 116628145, + "router_z_loss_clip": 0.86816406, + "router_z_loss_mlp": 0.16113281, + "step": 5428, + "time_per_iteration": 4.185624599456787 + }, + { + "auxiliary_loss_clip": 0.01132528, + "auxiliary_loss_mlp": 0.01040822, + "balance_loss_clip": 1.04897451, + "balance_loss_mlp": 1.02689838, + "epoch": 0.3264091387344055, + "flos": 16142924367360.0, + "grad_norm": 1.7008637522467422, + "language_loss": 0.7098887, + "learning_rate": 3.147002215584023e-06, + "loss": 0.73162216, + "num_input_tokens_seen": 116646920, + "router_z_loss_clip": 0.83496094, + "router_z_loss_mlp": 0.13909912, + "step": 5429, + "time_per_iteration": 2.4772539138793945 + }, + { + "auxiliary_loss_clip": 0.01142475, + "auxiliary_loss_mlp": 0.01036813, + "balance_loss_clip": 1.05882001, + "balance_loss_mlp": 1.02326488, + "epoch": 0.3264692619870735, + "flos": 16399075230720.0, + "grad_norm": 1.6768378756141151, + "language_loss": 0.78815985, + "learning_rate": 3.146683144965881e-06, + "loss": 0.80995274, + "num_input_tokens_seen": 116665100, + "router_z_loss_clip": 0.83544922, + "router_z_loss_mlp": 0.13543701, + "step": 5430, + "time_per_iteration": 2.4276034832000732 + }, + { + "auxiliary_loss_clip": 0.01139288, + "auxiliary_loss_mlp": 0.01048993, + "balance_loss_clip": 1.05254221, + "balance_loss_mlp": 1.03163612, + "epoch": 0.32652938523974145, + "flos": 22382331315840.0, + "grad_norm": 2.1224042884508947, + "language_loss": 0.84542334, + "learning_rate": 3.146364030865399e-06, + "loss": 0.86730617, + "num_input_tokens_seen": 116682205, + "router_z_loss_clip": 0.86816406, + "router_z_loss_mlp": 0.17346191, + "step": 5431, + "time_per_iteration": 2.465571403503418 + }, + { + "auxiliary_loss_clip": 0.01131634, + "auxiliary_loss_mlp": 0.01045812, + "balance_loss_clip": 1.04791951, + "balance_loss_mlp": 1.02965903, + "epoch": 0.3265895084924094, + "flos": 21908059113600.0, + "grad_norm": 1.9293302813456346, + "language_loss": 0.71006161, + "learning_rate": 3.146044873294678e-06, + "loss": 0.73183608, + "num_input_tokens_seen": 116702575, + "router_z_loss_clip": 0.83837891, + "router_z_loss_mlp": 0.16137695, + "step": 5432, + "time_per_iteration": 2.4302964210510254 + }, + { + "auxiliary_loss_clip": 0.01137874, + "auxiliary_loss_mlp": 0.01033813, + "balance_loss_clip": 1.0516274, + "balance_loss_mlp": 1.02016973, + "epoch": 0.3266496317450774, + "flos": 16067152627200.0, + "grad_norm": 1.4856184287176433, + "language_loss": 0.84119457, + "learning_rate": 3.1457256722658203e-06, + "loss": 0.86291134, + "num_input_tokens_seen": 116720885, + "router_z_loss_clip": 0.86230469, + "router_z_loss_mlp": 0.13635254, + "step": 5433, + "time_per_iteration": 2.447416067123413 + }, + { + "auxiliary_loss_clip": 0.01136269, + "auxiliary_loss_mlp": 0.01051909, + "balance_loss_clip": 1.05239201, + "balance_loss_mlp": 1.03525496, + "epoch": 0.3267097549977454, + "flos": 22528236360960.0, + "grad_norm": 1.4558364302773883, + "language_loss": 0.85893393, + "learning_rate": 3.145406427790931e-06, + "loss": 0.88081568, + "num_input_tokens_seen": 116740395, + "router_z_loss_clip": 0.83886719, + "router_z_loss_mlp": 0.16650391, + "step": 5434, + "time_per_iteration": 2.4374027252197266 + }, + { + "auxiliary_loss_clip": 0.01139282, + "auxiliary_loss_mlp": 0.01036045, + "balance_loss_clip": 1.05244958, + "balance_loss_mlp": 1.02122736, + "epoch": 0.32676987825041337, + "flos": 27270419679360.0, + "grad_norm": 2.321376740412662, + "language_loss": 0.87943959, + "learning_rate": 3.1450871398821147e-06, + "loss": 0.90119284, + "num_input_tokens_seen": 116758870, + "router_z_loss_clip": 0.86865234, + "router_z_loss_mlp": 0.14807129, + "step": 5435, + "time_per_iteration": 2.491486072540283 + }, + { + "auxiliary_loss_clip": 0.01143328, + "auxiliary_loss_mlp": 0.0103524, + "balance_loss_clip": 1.05753613, + "balance_loss_mlp": 1.02160263, + "epoch": 0.32683000150308134, + "flos": 11508257433600.0, + "grad_norm": 2.3620192195640297, + "language_loss": 0.76223016, + "learning_rate": 3.144767808551479e-06, + "loss": 0.78401589, + "num_input_tokens_seen": 116773440, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.13653564, + "step": 5436, + "time_per_iteration": 2.3798534870147705 + }, + { + "auxiliary_loss_clip": 0.01150804, + "auxiliary_loss_mlp": 0.01031817, + "balance_loss_clip": 1.06680703, + "balance_loss_mlp": 1.01847124, + "epoch": 0.3268901247557493, + "flos": 25630200005760.0, + "grad_norm": 1.6955477711368618, + "language_loss": 0.72166741, + "learning_rate": 3.144448433811134e-06, + "loss": 0.74349362, + "num_input_tokens_seen": 116794375, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.13342285, + "step": 5437, + "time_per_iteration": 2.510999917984009 + }, + { + "auxiliary_loss_clip": 0.01143459, + "auxiliary_loss_mlp": 0.01039728, + "balance_loss_clip": 1.05236053, + "balance_loss_mlp": 1.02331305, + "epoch": 0.32695024800841727, + "flos": 24860849575680.0, + "grad_norm": 1.594961067441614, + "language_loss": 0.63729274, + "learning_rate": 3.144129015673189e-06, + "loss": 0.65912461, + "num_input_tokens_seen": 116815095, + "router_z_loss_clip": 0.91064453, + "router_z_loss_mlp": 0.16418457, + "step": 5438, + "time_per_iteration": 2.4511947631835938 + }, + { + "auxiliary_loss_clip": 0.01148328, + "auxiliary_loss_mlp": 0.01037357, + "balance_loss_clip": 1.06490016, + "balance_loss_mlp": 1.02298641, + "epoch": 0.32701037126108523, + "flos": 28839249072000.0, + "grad_norm": 1.5705081500936988, + "language_loss": 0.74640751, + "learning_rate": 3.1438095541497576e-06, + "loss": 0.76826435, + "num_input_tokens_seen": 116836630, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.14373779, + "step": 5439, + "time_per_iteration": 2.5717334747314453 + }, + { + "auxiliary_loss_clip": 0.01146447, + "auxiliary_loss_mlp": 0.01046901, + "balance_loss_clip": 1.05811059, + "balance_loss_mlp": 1.03093863, + "epoch": 0.3270704945137532, + "flos": 27965075777280.0, + "grad_norm": 2.0209375864996666, + "language_loss": 0.74795389, + "learning_rate": 3.1434900492529527e-06, + "loss": 0.76988739, + "num_input_tokens_seen": 116856880, + "router_z_loss_clip": 0.88378906, + "router_z_loss_mlp": 0.15966797, + "step": 5440, + "time_per_iteration": 2.4943127632141113 + }, + { + "auxiliary_loss_clip": 0.01138357, + "auxiliary_loss_mlp": 0.01041931, + "balance_loss_clip": 1.05419707, + "balance_loss_mlp": 1.02912152, + "epoch": 0.32713061776642116, + "flos": 23690700213120.0, + "grad_norm": 2.0585564827125755, + "language_loss": 0.84880745, + "learning_rate": 3.1431705009948914e-06, + "loss": 0.8706103, + "num_input_tokens_seen": 116873770, + "router_z_loss_clip": 0.84082031, + "router_z_loss_mlp": 0.12811279, + "step": 5441, + "time_per_iteration": 2.780086040496826 + }, + { + "auxiliary_loss_clip": 0.01135959, + "auxiliary_loss_mlp": 0.010394, + "balance_loss_clip": 1.05131912, + "balance_loss_mlp": 1.02535677, + "epoch": 0.3271907410190891, + "flos": 22455625017600.0, + "grad_norm": 2.076967546865615, + "language_loss": 0.86917353, + "learning_rate": 3.1428509093876897e-06, + "loss": 0.89092714, + "num_input_tokens_seen": 116891225, + "router_z_loss_clip": 0.84570312, + "router_z_loss_mlp": 0.14050293, + "step": 5442, + "time_per_iteration": 2.56367564201355 + }, + { + "auxiliary_loss_clip": 0.01143572, + "auxiliary_loss_mlp": 0.01041159, + "balance_loss_clip": 1.05720019, + "balance_loss_mlp": 1.02573347, + "epoch": 0.3272508642717571, + "flos": 22820118278400.0, + "grad_norm": 1.7273183038768802, + "language_loss": 0.77526808, + "learning_rate": 3.1425312744434668e-06, + "loss": 0.79711545, + "num_input_tokens_seen": 116912300, + "router_z_loss_clip": 0.86328125, + "router_z_loss_mlp": 0.15429688, + "step": 5443, + "time_per_iteration": 2.5432028770446777 + }, + { + "auxiliary_loss_clip": 0.01137411, + "auxiliary_loss_mlp": 0.01045887, + "balance_loss_clip": 1.0488615, + "balance_loss_mlp": 1.02982938, + "epoch": 0.32731098752442506, + "flos": 11801360413440.0, + "grad_norm": 2.1767241769082935, + "language_loss": 0.8142519, + "learning_rate": 3.142211596174343e-06, + "loss": 0.83608484, + "num_input_tokens_seen": 116929425, + "router_z_loss_clip": 0.88427734, + "router_z_loss_mlp": 0.16064453, + "step": 5444, + "time_per_iteration": 2.4488730430603027 + }, + { + "auxiliary_loss_clip": 0.01137691, + "auxiliary_loss_mlp": 0.01045659, + "balance_loss_clip": 1.05066514, + "balance_loss_mlp": 1.0311451, + "epoch": 0.327371110777093, + "flos": 21027780506880.0, + "grad_norm": 4.681291561548711, + "language_loss": 0.59197176, + "learning_rate": 3.1418918745924423e-06, + "loss": 0.61380517, + "num_input_tokens_seen": 116948255, + "router_z_loss_clip": 0.87060547, + "router_z_loss_mlp": 0.14508057, + "step": 5445, + "time_per_iteration": 2.47882342338562 + }, + { + "auxiliary_loss_clip": 0.01148362, + "auxiliary_loss_mlp": 0.01047753, + "balance_loss_clip": 1.06292272, + "balance_loss_mlp": 1.03275657, + "epoch": 0.327431234029761, + "flos": 19062102677760.0, + "grad_norm": 2.2609415588949267, + "language_loss": 0.88214362, + "learning_rate": 3.1415721097098865e-06, + "loss": 0.90410477, + "num_input_tokens_seen": 116964905, + "router_z_loss_clip": 0.85449219, + "router_z_loss_mlp": 0.14990234, + "step": 5446, + "time_per_iteration": 2.463756561279297 + }, + { + "auxiliary_loss_clip": 0.011485, + "auxiliary_loss_mlp": 0.01043624, + "balance_loss_clip": 1.05826473, + "balance_loss_mlp": 1.02662492, + "epoch": 0.32749135728242895, + "flos": 25849219184640.0, + "grad_norm": 1.6847115129866574, + "language_loss": 0.79061174, + "learning_rate": 3.141252301538802e-06, + "loss": 0.81253302, + "num_input_tokens_seen": 116983650, + "router_z_loss_clip": 0.90234375, + "router_z_loss_mlp": 0.17004395, + "step": 5447, + "time_per_iteration": 2.559117078781128 + }, + { + "auxiliary_loss_clip": 0.01147165, + "auxiliary_loss_mlp": 0.01043256, + "balance_loss_clip": 1.06138325, + "balance_loss_mlp": 1.02977383, + "epoch": 0.327551480535097, + "flos": 20120533764480.0, + "grad_norm": 6.255531956006023, + "language_loss": 0.73612148, + "learning_rate": 3.1409324500913157e-06, + "loss": 0.75802571, + "num_input_tokens_seen": 117003265, + "router_z_loss_clip": 0.85888672, + "router_z_loss_mlp": 0.1348877, + "step": 5448, + "time_per_iteration": 2.5087506771087646 + }, + { + "auxiliary_loss_clip": 0.01130613, + "auxiliary_loss_mlp": 0.01040457, + "balance_loss_clip": 1.04597628, + "balance_loss_mlp": 1.02540112, + "epoch": 0.32761160378776494, + "flos": 28803553931520.0, + "grad_norm": 1.4529273276676482, + "language_loss": 0.67198551, + "learning_rate": 3.1406125553795567e-06, + "loss": 0.69369614, + "num_input_tokens_seen": 117025370, + "router_z_loss_clip": 0.84716797, + "router_z_loss_mlp": 0.1505127, + "step": 5449, + "time_per_iteration": 2.5437726974487305 + }, + { + "auxiliary_loss_clip": 0.0114624, + "auxiliary_loss_mlp": 0.01040798, + "balance_loss_clip": 1.05978727, + "balance_loss_mlp": 1.02684474, + "epoch": 0.3276717270404329, + "flos": 26937778803840.0, + "grad_norm": 1.504764779173413, + "language_loss": 0.65668821, + "learning_rate": 3.1402926174156556e-06, + "loss": 0.67855859, + "num_input_tokens_seen": 117044350, + "router_z_loss_clip": 0.86376953, + "router_z_loss_mlp": 0.13946533, + "step": 5450, + "time_per_iteration": 2.4803991317749023 + }, + { + "auxiliary_loss_clip": 0.01133358, + "auxiliary_loss_mlp": 0.01042157, + "balance_loss_clip": 1.04677033, + "balance_loss_mlp": 1.02766716, + "epoch": 0.32773185029310087, + "flos": 25338425829120.0, + "grad_norm": 2.078602099965288, + "language_loss": 0.77732432, + "learning_rate": 3.1399726362117437e-06, + "loss": 0.79907948, + "num_input_tokens_seen": 117064450, + "router_z_loss_clip": 0.86572266, + "router_z_loss_mlp": 0.14501953, + "step": 5451, + "time_per_iteration": 2.5001378059387207 + }, + { + "auxiliary_loss_clip": 0.01140312, + "auxiliary_loss_mlp": 0.01035611, + "balance_loss_clip": 1.05263495, + "balance_loss_mlp": 1.02017307, + "epoch": 0.32779197354576883, + "flos": 26391721271040.0, + "grad_norm": 3.1546831145782885, + "language_loss": 0.70484281, + "learning_rate": 3.1396526117799555e-06, + "loss": 0.72660208, + "num_input_tokens_seen": 117083060, + "router_z_loss_clip": 0.87597656, + "router_z_loss_mlp": 0.15441895, + "step": 5452, + "time_per_iteration": 2.4740395545959473 + }, + { + "auxiliary_loss_clip": 0.01139069, + "auxiliary_loss_mlp": 0.0103688, + "balance_loss_clip": 1.05490983, + "balance_loss_mlp": 1.02320051, + "epoch": 0.3278520967984368, + "flos": 24899381890560.0, + "grad_norm": 1.5218436419740415, + "language_loss": 0.78619736, + "learning_rate": 3.1393325441324256e-06, + "loss": 0.80795681, + "num_input_tokens_seen": 117101860, + "router_z_loss_clip": 0.84179688, + "router_z_loss_mlp": 0.13671875, + "step": 5453, + "time_per_iteration": 3.914910316467285 + }, + { + "auxiliary_loss_clip": 0.01139653, + "auxiliary_loss_mlp": 0.01035086, + "balance_loss_clip": 1.05247939, + "balance_loss_mlp": 1.02044153, + "epoch": 0.32791222005110476, + "flos": 29752996176000.0, + "grad_norm": 1.9883351509583205, + "language_loss": 0.75487018, + "learning_rate": 3.1390124332812916e-06, + "loss": 0.77661765, + "num_input_tokens_seen": 117123100, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.14642334, + "step": 5454, + "time_per_iteration": 2.615030288696289 + }, + { + "auxiliary_loss_clip": 0.01144183, + "auxiliary_loss_mlp": 0.01034973, + "balance_loss_clip": 1.06031525, + "balance_loss_mlp": 1.02185988, + "epoch": 0.32797234330377273, + "flos": 16508064072960.0, + "grad_norm": 1.8841618655789922, + "language_loss": 0.76932585, + "learning_rate": 3.1386922792386924e-06, + "loss": 0.79111743, + "num_input_tokens_seen": 117140515, + "router_z_loss_clip": 0.83935547, + "router_z_loss_mlp": 0.13128662, + "step": 5455, + "time_per_iteration": 2.745044469833374 + }, + { + "auxiliary_loss_clip": 0.0115502, + "auxiliary_loss_mlp": 0.01035472, + "balance_loss_clip": 1.06552577, + "balance_loss_mlp": 1.01975989, + "epoch": 0.3280324665564407, + "flos": 26577918397440.0, + "grad_norm": 52.67164565714303, + "language_loss": 0.74152416, + "learning_rate": 3.138372082016768e-06, + "loss": 0.76342905, + "num_input_tokens_seen": 117161485, + "router_z_loss_clip": 0.89550781, + "router_z_loss_mlp": 0.15710449, + "step": 5456, + "time_per_iteration": 2.49654483795166 + }, + { + "auxiliary_loss_clip": 0.01145176, + "auxiliary_loss_mlp": 0.01040011, + "balance_loss_clip": 1.05931234, + "balance_loss_mlp": 1.02614129, + "epoch": 0.32809258980910866, + "flos": 22929969047040.0, + "grad_norm": 1.4553909104103426, + "language_loss": 0.78345424, + "learning_rate": 3.1380518416276596e-06, + "loss": 0.80530608, + "num_input_tokens_seen": 117181870, + "router_z_loss_clip": 0.85791016, + "router_z_loss_mlp": 0.13867188, + "step": 5457, + "time_per_iteration": 2.466085195541382 + }, + { + "auxiliary_loss_clip": 0.01148945, + "auxiliary_loss_mlp": 0.01037396, + "balance_loss_clip": 1.05531311, + "balance_loss_mlp": 1.02281618, + "epoch": 0.3281527130617766, + "flos": 22783848520320.0, + "grad_norm": 2.0929124783173862, + "language_loss": 0.79279697, + "learning_rate": 3.1377315580835115e-06, + "loss": 0.81466031, + "num_input_tokens_seen": 117201380, + "router_z_loss_clip": 0.93652344, + "router_z_loss_mlp": 0.14575195, + "step": 5458, + "time_per_iteration": 2.523689031600952 + }, + { + "auxiliary_loss_clip": 0.01145606, + "auxiliary_loss_mlp": 0.01036065, + "balance_loss_clip": 1.05963349, + "balance_loss_mlp": 1.02122307, + "epoch": 0.3282128363144446, + "flos": 21250678354560.0, + "grad_norm": 1.9367917931840166, + "language_loss": 0.72976112, + "learning_rate": 3.1374112313964686e-06, + "loss": 0.75157785, + "num_input_tokens_seen": 117221040, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.14855957, + "step": 5459, + "time_per_iteration": 2.448806047439575 + }, + { + "auxiliary_loss_clip": 0.01145819, + "auxiliary_loss_mlp": 0.01047997, + "balance_loss_clip": 1.05651283, + "balance_loss_mlp": 1.03208852, + "epoch": 0.32827295956711255, + "flos": 30843064166400.0, + "grad_norm": 1.8158929825307286, + "language_loss": 0.83801299, + "learning_rate": 3.1370908615786783e-06, + "loss": 0.85995114, + "num_input_tokens_seen": 117241395, + "router_z_loss_clip": 0.89306641, + "router_z_loss_mlp": 0.15899658, + "step": 5460, + "time_per_iteration": 2.5523831844329834 + }, + { + "auxiliary_loss_clip": 0.0115253, + "auxiliary_loss_mlp": 0.0103264, + "balance_loss_clip": 1.06450081, + "balance_loss_mlp": 1.01893687, + "epoch": 0.3283330828197806, + "flos": 25915006944000.0, + "grad_norm": 2.3854333808560457, + "language_loss": 0.77024066, + "learning_rate": 3.136770448642288e-06, + "loss": 0.79209244, + "num_input_tokens_seen": 117259340, + "router_z_loss_clip": 0.87939453, + "router_z_loss_mlp": 0.13702393, + "step": 5461, + "time_per_iteration": 2.4835469722747803 + }, + { + "auxiliary_loss_clip": 0.01147705, + "auxiliary_loss_mlp": 0.01033669, + "balance_loss_clip": 1.0608685, + "balance_loss_mlp": 1.01767063, + "epoch": 0.32839320607244854, + "flos": 38582065042560.0, + "grad_norm": 2.121839043937196, + "language_loss": 0.63254678, + "learning_rate": 3.1364499925994484e-06, + "loss": 0.65436053, + "num_input_tokens_seen": 117282375, + "router_z_loss_clip": 0.86767578, + "router_z_loss_mlp": 0.16003418, + "step": 5462, + "time_per_iteration": 2.662609577178955 + }, + { + "auxiliary_loss_clip": 0.01144102, + "auxiliary_loss_mlp": 0.01033263, + "balance_loss_clip": 1.06056166, + "balance_loss_mlp": 1.01978004, + "epoch": 0.3284533293251165, + "flos": 26650888876800.0, + "grad_norm": 2.7771649302594157, + "language_loss": 0.77926999, + "learning_rate": 3.1361294934623115e-06, + "loss": 0.80104363, + "num_input_tokens_seen": 117303830, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.13500977, + "step": 5463, + "time_per_iteration": 2.4845383167266846 + }, + { + "auxiliary_loss_clip": 0.0114029, + "auxiliary_loss_mlp": 0.01035041, + "balance_loss_clip": 1.05428839, + "balance_loss_mlp": 1.02027071, + "epoch": 0.32851345257778447, + "flos": 15304158904320.0, + "grad_norm": 1.8175114768014975, + "language_loss": 0.69907671, + "learning_rate": 3.1358089512430303e-06, + "loss": 0.72082996, + "num_input_tokens_seen": 117320665, + "router_z_loss_clip": 0.86035156, + "router_z_loss_mlp": 0.14764404, + "step": 5464, + "time_per_iteration": 5.266142845153809 + }, + { + "auxiliary_loss_clip": 0.0113708, + "auxiliary_loss_mlp": 0.01042076, + "balance_loss_clip": 1.05457973, + "balance_loss_mlp": 1.0259825, + "epoch": 0.32857357583045244, + "flos": 23513732881920.0, + "grad_norm": 1.8062872888308124, + "language_loss": 0.7289263, + "learning_rate": 3.1354883659537594e-06, + "loss": 0.75071776, + "num_input_tokens_seen": 117339795, + "router_z_loss_clip": 0.82421875, + "router_z_loss_mlp": 0.16088867, + "step": 5465, + "time_per_iteration": 2.4969165325164795 + }, + { + "auxiliary_loss_clip": 0.01142471, + "auxiliary_loss_mlp": 0.01039008, + "balance_loss_clip": 1.05416346, + "balance_loss_mlp": 1.0246197, + "epoch": 0.3286336990831204, + "flos": 20995209849600.0, + "grad_norm": 1.4452340414992093, + "language_loss": 0.82896042, + "learning_rate": 3.1351677376066567e-06, + "loss": 0.85077524, + "num_input_tokens_seen": 117359525, + "router_z_loss_clip": 0.88378906, + "router_z_loss_mlp": 0.14379883, + "step": 5466, + "time_per_iteration": 2.419948101043701 + }, + { + "auxiliary_loss_clip": 0.0113674, + "auxiliary_loss_mlp": 0.01045644, + "balance_loss_clip": 1.05049658, + "balance_loss_mlp": 1.0294075, + "epoch": 0.32869382233578837, + "flos": 23658811914240.0, + "grad_norm": 2.1373723575368273, + "language_loss": 0.79770803, + "learning_rate": 3.134847066213879e-06, + "loss": 0.81953186, + "num_input_tokens_seen": 117380320, + "router_z_loss_clip": 0.86328125, + "router_z_loss_mlp": 0.16235352, + "step": 5467, + "time_per_iteration": 2.875986337661743 + }, + { + "auxiliary_loss_clip": 0.01141605, + "auxiliary_loss_mlp": 0.01030149, + "balance_loss_clip": 1.05536413, + "balance_loss_mlp": 1.01564145, + "epoch": 0.32875394558845633, + "flos": 25336522408320.0, + "grad_norm": 1.769751107273817, + "language_loss": 0.74717915, + "learning_rate": 3.134526351787587e-06, + "loss": 0.76889664, + "num_input_tokens_seen": 117400695, + "router_z_loss_clip": 0.86230469, + "router_z_loss_mlp": 0.1451416, + "step": 5468, + "time_per_iteration": 2.5262036323547363 + }, + { + "auxiliary_loss_clip": 0.01141213, + "auxiliary_loss_mlp": 0.01041951, + "balance_loss_clip": 1.05201268, + "balance_loss_mlp": 1.02465427, + "epoch": 0.3288140688411243, + "flos": 14903108576640.0, + "grad_norm": 1.8245761628599995, + "language_loss": 0.78690141, + "learning_rate": 3.134205594339942e-06, + "loss": 0.80873305, + "num_input_tokens_seen": 117418800, + "router_z_loss_clip": 0.89257812, + "router_z_loss_mlp": 0.17297363, + "step": 5469, + "time_per_iteration": 2.4420504570007324 + }, + { + "auxiliary_loss_clip": 0.0115006, + "auxiliary_loss_mlp": 0.01033628, + "balance_loss_clip": 1.06359005, + "balance_loss_mlp": 1.0194366, + "epoch": 0.32887419209379226, + "flos": 18551345235840.0, + "grad_norm": 1.8478276445093298, + "language_loss": 0.81615114, + "learning_rate": 3.133884793883107e-06, + "loss": 0.83798802, + "num_input_tokens_seen": 117438220, + "router_z_loss_clip": 0.86425781, + "router_z_loss_mlp": 0.14202881, + "step": 5470, + "time_per_iteration": 2.446578025817871 + }, + { + "auxiliary_loss_clip": 0.01143311, + "auxiliary_loss_mlp": 0.0104041, + "balance_loss_clip": 1.05431783, + "balance_loss_mlp": 1.02474594, + "epoch": 0.3289343153464602, + "flos": 48105610439040.0, + "grad_norm": 1.858301085064625, + "language_loss": 0.67922318, + "learning_rate": 3.1335639504292478e-06, + "loss": 0.70106041, + "num_input_tokens_seen": 117462560, + "router_z_loss_clip": 0.88916016, + "router_z_loss_mlp": 0.15661621, + "step": 5471, + "time_per_iteration": 2.7707509994506836 + }, + { + "auxiliary_loss_clip": 0.01150042, + "auxiliary_loss_mlp": 0.01040919, + "balance_loss_clip": 1.05896401, + "balance_loss_mlp": 1.02368081, + "epoch": 0.3289944385991282, + "flos": 27600295207680.0, + "grad_norm": 2.2848844986878794, + "language_loss": 0.64886695, + "learning_rate": 3.1332430639905288e-06, + "loss": 0.67077655, + "num_input_tokens_seen": 117483665, + "router_z_loss_clip": 0.91064453, + "router_z_loss_mlp": 0.17248535, + "step": 5472, + "time_per_iteration": 3.8751566410064697 + }, + { + "auxiliary_loss_clip": 0.01145438, + "auxiliary_loss_mlp": 0.01048106, + "balance_loss_clip": 1.05407572, + "balance_loss_mlp": 1.03153563, + "epoch": 0.32905456185179616, + "flos": 20120318282880.0, + "grad_norm": 1.9629630170914554, + "language_loss": 0.88638711, + "learning_rate": 3.13292213457912e-06, + "loss": 0.90832257, + "num_input_tokens_seen": 117503565, + "router_z_loss_clip": 0.91503906, + "router_z_loss_mlp": 0.16558838, + "step": 5473, + "time_per_iteration": 2.4234585762023926 + }, + { + "auxiliary_loss_clip": 0.01143759, + "auxiliary_loss_mlp": 0.01041096, + "balance_loss_clip": 1.05467629, + "balance_loss_mlp": 1.02440643, + "epoch": 0.3291146851044642, + "flos": 23180230080000.0, + "grad_norm": 1.9492168815989488, + "language_loss": 0.7844941, + "learning_rate": 3.1326011622071903e-06, + "loss": 0.8063426, + "num_input_tokens_seen": 117521460, + "router_z_loss_clip": 0.89160156, + "router_z_loss_mlp": 0.16674805, + "step": 5474, + "time_per_iteration": 2.4450302124023438 + }, + { + "auxiliary_loss_clip": 0.01071011, + "auxiliary_loss_mlp": 0.01010957, + "balance_loss_clip": 1.04012084, + "balance_loss_mlp": 1.00906134, + "epoch": 0.32917480835713214, + "flos": 67621912594560.0, + "grad_norm": 0.8182161509797878, + "language_loss": 0.6024453, + "learning_rate": 3.132280146886911e-06, + "loss": 0.62326503, + "num_input_tokens_seen": 117580550, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.01895142, + "step": 5475, + "time_per_iteration": 3.022726058959961 + }, + { + "auxiliary_loss_clip": 0.01151158, + "auxiliary_loss_mlp": 0.01049354, + "balance_loss_clip": 1.05514479, + "balance_loss_mlp": 1.03192568, + "epoch": 0.3292349316098001, + "flos": 27964537073280.0, + "grad_norm": 2.5733429667667624, + "language_loss": 0.76776326, + "learning_rate": 3.131959088630455e-06, + "loss": 0.78976834, + "num_input_tokens_seen": 117600645, + "router_z_loss_clip": 0.96044922, + "router_z_loss_mlp": 0.17431641, + "step": 5476, + "time_per_iteration": 2.4719607830047607 + }, + { + "auxiliary_loss_clip": 0.01144255, + "auxiliary_loss_mlp": 0.01044387, + "balance_loss_clip": 1.05705285, + "balance_loss_mlp": 1.03006983, + "epoch": 0.3292950548624681, + "flos": 20263673462400.0, + "grad_norm": 4.016245377584737, + "language_loss": 0.746297, + "learning_rate": 3.131637987449997e-06, + "loss": 0.76818335, + "num_input_tokens_seen": 117618880, + "router_z_loss_clip": 0.87158203, + "router_z_loss_mlp": 0.14324951, + "step": 5477, + "time_per_iteration": 2.419682502746582 + }, + { + "auxiliary_loss_clip": 0.01135429, + "auxiliary_loss_mlp": 0.01036362, + "balance_loss_clip": 1.05194569, + "balance_loss_mlp": 1.02293301, + "epoch": 0.32935517811513604, + "flos": 20812999132800.0, + "grad_norm": 2.0036654499181012, + "language_loss": 0.76113796, + "learning_rate": 3.131316843357713e-06, + "loss": 0.78285587, + "num_input_tokens_seen": 117636445, + "router_z_loss_clip": 0.83496094, + "router_z_loss_mlp": 0.13415527, + "step": 5478, + "time_per_iteration": 2.481252670288086 + }, + { + "auxiliary_loss_clip": 0.01142711, + "auxiliary_loss_mlp": 0.01036705, + "balance_loss_clip": 1.05747259, + "balance_loss_mlp": 1.02257836, + "epoch": 0.329415301367804, + "flos": 18441853603200.0, + "grad_norm": 1.781781746734893, + "language_loss": 0.80570334, + "learning_rate": 3.1309956563657807e-06, + "loss": 0.82749748, + "num_input_tokens_seen": 117653105, + "router_z_loss_clip": 0.85107422, + "router_z_loss_mlp": 0.14099121, + "step": 5479, + "time_per_iteration": 2.424428939819336 + }, + { + "auxiliary_loss_clip": 0.01063598, + "auxiliary_loss_mlp": 0.01008487, + "balance_loss_clip": 1.03290391, + "balance_loss_mlp": 1.00665474, + "epoch": 0.32947542462047197, + "flos": 66323024887680.0, + "grad_norm": 0.7426858461858156, + "language_loss": 0.56517482, + "learning_rate": 3.1306744264863804e-06, + "loss": 0.58589566, + "num_input_tokens_seen": 117719225, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.01834106, + "step": 5480, + "time_per_iteration": 3.3316519260406494 + }, + { + "auxiliary_loss_clip": 0.01139629, + "auxiliary_loss_mlp": 0.01047009, + "balance_loss_clip": 1.05306661, + "balance_loss_mlp": 1.03207207, + "epoch": 0.32953554787313993, + "flos": 23221599569280.0, + "grad_norm": 1.6922664774451623, + "language_loss": 0.77276754, + "learning_rate": 3.1303531537316915e-06, + "loss": 0.79463398, + "num_input_tokens_seen": 117738725, + "router_z_loss_clip": 0.86572266, + "router_z_loss_mlp": 0.14929199, + "step": 5481, + "time_per_iteration": 2.4460785388946533 + }, + { + "auxiliary_loss_clip": 0.01153085, + "auxiliary_loss_mlp": 0.01037183, + "balance_loss_clip": 1.06062376, + "balance_loss_mlp": 1.02228236, + "epoch": 0.3295956711258079, + "flos": 27009492307200.0, + "grad_norm": 1.5282667814144555, + "language_loss": 0.78213876, + "learning_rate": 3.130031838113899e-06, + "loss": 0.80404145, + "num_input_tokens_seen": 117757765, + "router_z_loss_clip": 0.92529297, + "router_z_loss_mlp": 0.14904785, + "step": 5482, + "time_per_iteration": 2.5174012184143066 + }, + { + "auxiliary_loss_clip": 0.01145301, + "auxiliary_loss_mlp": 0.01041064, + "balance_loss_clip": 1.05543494, + "balance_loss_mlp": 1.02474427, + "epoch": 0.32965579437847586, + "flos": 19171702051200.0, + "grad_norm": 3.3114226995763207, + "language_loss": 0.73462498, + "learning_rate": 3.129710479645185e-06, + "loss": 0.75648862, + "num_input_tokens_seen": 117776810, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.16314697, + "step": 5483, + "time_per_iteration": 2.4989571571350098 + }, + { + "auxiliary_loss_clip": 0.01139549, + "auxiliary_loss_mlp": 0.01040624, + "balance_loss_clip": 1.05296087, + "balance_loss_mlp": 1.02621198, + "epoch": 0.32971591763114383, + "flos": 30482521401600.0, + "grad_norm": 2.0371712422193706, + "language_loss": 0.75583923, + "learning_rate": 3.1293890783377366e-06, + "loss": 0.77764094, + "num_input_tokens_seen": 117797730, + "router_z_loss_clip": 0.86621094, + "router_z_loss_mlp": 0.14404297, + "step": 5484, + "time_per_iteration": 2.5991756916046143 + }, + { + "auxiliary_loss_clip": 0.01138023, + "auxiliary_loss_mlp": 0.01047767, + "balance_loss_clip": 1.0521909, + "balance_loss_mlp": 1.03244853, + "epoch": 0.3297760408838118, + "flos": 16289583598080.0, + "grad_norm": 1.8182754238027505, + "language_loss": 0.71674931, + "learning_rate": 3.129067634203742e-06, + "loss": 0.73860717, + "num_input_tokens_seen": 117815365, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.15313721, + "step": 5485, + "time_per_iteration": 2.4861087799072266 + }, + { + "auxiliary_loss_clip": 0.01143452, + "auxiliary_loss_mlp": 0.01039837, + "balance_loss_clip": 1.05636752, + "balance_loss_mlp": 1.02509689, + "epoch": 0.32983616413647976, + "flos": 29530924341120.0, + "grad_norm": 1.647878990717961, + "language_loss": 0.8080554, + "learning_rate": 3.128746147255388e-06, + "loss": 0.82988822, + "num_input_tokens_seen": 117836095, + "router_z_loss_clip": 0.87060547, + "router_z_loss_mlp": 0.14746094, + "step": 5486, + "time_per_iteration": 2.592427968978882 + }, + { + "auxiliary_loss_clip": 0.0114261, + "auxiliary_loss_mlp": 0.01043817, + "balance_loss_clip": 1.05230474, + "balance_loss_mlp": 1.02839124, + "epoch": 0.3298962873891478, + "flos": 20631398947200.0, + "grad_norm": 1.9574171343954603, + "language_loss": 0.84270394, + "learning_rate": 3.1284246175048683e-06, + "loss": 0.86456817, + "num_input_tokens_seen": 117854655, + "router_z_loss_clip": 0.90234375, + "router_z_loss_mlp": 0.15441895, + "step": 5487, + "time_per_iteration": 2.449272632598877 + }, + { + "auxiliary_loss_clip": 0.01156037, + "auxiliary_loss_mlp": 0.01046653, + "balance_loss_clip": 1.06587231, + "balance_loss_mlp": 1.03021431, + "epoch": 0.32995641064181574, + "flos": 14976007228800.0, + "grad_norm": 3.298239457060564, + "language_loss": 0.74649477, + "learning_rate": 3.1281030449643735e-06, + "loss": 0.76852167, + "num_input_tokens_seen": 117873300, + "router_z_loss_clip": 0.90087891, + "router_z_loss_mlp": 0.16430664, + "step": 5488, + "time_per_iteration": 2.515875816345215 + }, + { + "auxiliary_loss_clip": 0.01148446, + "auxiliary_loss_mlp": 0.01042547, + "balance_loss_clip": 1.05888188, + "balance_loss_mlp": 1.02661514, + "epoch": 0.3300165338944837, + "flos": 18661447399680.0, + "grad_norm": 2.7272146053082094, + "language_loss": 0.72858059, + "learning_rate": 3.127781429646098e-06, + "loss": 0.75049055, + "num_input_tokens_seen": 117891540, + "router_z_loss_clip": 0.89599609, + "router_z_loss_mlp": 0.15924072, + "step": 5489, + "time_per_iteration": 2.4503743648529053 + }, + { + "auxiliary_loss_clip": 0.01141603, + "auxiliary_loss_mlp": 0.01036731, + "balance_loss_clip": 1.05519974, + "balance_loss_mlp": 1.02157366, + "epoch": 0.3300766571471517, + "flos": 25583730785280.0, + "grad_norm": 2.855848721731686, + "language_loss": 0.88669455, + "learning_rate": 3.127459771562238e-06, + "loss": 0.9084779, + "num_input_tokens_seen": 117907690, + "router_z_loss_clip": 0.86376953, + "router_z_loss_mlp": 0.15142822, + "step": 5490, + "time_per_iteration": 2.6166348457336426 + }, + { + "auxiliary_loss_clip": 0.01158716, + "auxiliary_loss_mlp": 0.01037931, + "balance_loss_clip": 1.0681982, + "balance_loss_mlp": 1.02320278, + "epoch": 0.33013678039981964, + "flos": 11363501623680.0, + "grad_norm": 2.8543927663260242, + "language_loss": 0.83128273, + "learning_rate": 3.1271380707249907e-06, + "loss": 0.85324913, + "num_input_tokens_seen": 117925640, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.14715576, + "step": 5491, + "time_per_iteration": 2.4487709999084473 + }, + { + "auxiliary_loss_clip": 0.01140962, + "auxiliary_loss_mlp": 0.01037425, + "balance_loss_clip": 1.05525374, + "balance_loss_mlp": 1.02276218, + "epoch": 0.3301969036524876, + "flos": 24821203939200.0, + "grad_norm": 1.8403652543367552, + "language_loss": 0.77767712, + "learning_rate": 3.126816327146554e-06, + "loss": 0.79946101, + "num_input_tokens_seen": 117944525, + "router_z_loss_clip": 0.85693359, + "router_z_loss_mlp": 0.14678955, + "step": 5492, + "time_per_iteration": 2.625765800476074 + }, + { + "auxiliary_loss_clip": 0.01149516, + "auxiliary_loss_mlp": 0.01042151, + "balance_loss_clip": 1.05993688, + "balance_loss_mlp": 1.02479374, + "epoch": 0.33025702690515557, + "flos": 15961144613760.0, + "grad_norm": 2.020069255021196, + "language_loss": 0.74797207, + "learning_rate": 3.12649454083913e-06, + "loss": 0.76988876, + "num_input_tokens_seen": 117962515, + "router_z_loss_clip": 0.89648438, + "router_z_loss_mlp": 0.17340088, + "step": 5493, + "time_per_iteration": 2.613210439682007 + }, + { + "auxiliary_loss_clip": 0.01115612, + "auxiliary_loss_mlp": 0.01015475, + "balance_loss_clip": 1.08463764, + "balance_loss_mlp": 1.01334167, + "epoch": 0.33031715015782354, + "flos": 59416755989760.0, + "grad_norm": 0.786673298029854, + "language_loss": 0.53918368, + "learning_rate": 3.12617271181492e-06, + "loss": 0.56049454, + "num_input_tokens_seen": 118018780, + "router_z_loss_clip": 0.30957031, + "router_z_loss_mlp": 0.02133179, + "step": 5494, + "time_per_iteration": 3.13948392868042 + }, + { + "auxiliary_loss_clip": 0.01147798, + "auxiliary_loss_mlp": 0.01035836, + "balance_loss_clip": 1.06007516, + "balance_loss_mlp": 1.02001691, + "epoch": 0.3303772734104915, + "flos": 23184360144000.0, + "grad_norm": 1.6116635264607853, + "language_loss": 0.86846566, + "learning_rate": 3.1258508400861276e-06, + "loss": 0.890302, + "num_input_tokens_seen": 118038610, + "router_z_loss_clip": 0.87841797, + "router_z_loss_mlp": 0.1583252, + "step": 5495, + "time_per_iteration": 3.0209221839904785 + }, + { + "auxiliary_loss_clip": 0.01141622, + "auxiliary_loss_mlp": 0.01044117, + "balance_loss_clip": 1.05080593, + "balance_loss_mlp": 1.02649808, + "epoch": 0.33043739666315947, + "flos": 33071896010880.0, + "grad_norm": 2.4972852656238134, + "language_loss": 0.73540413, + "learning_rate": 3.1255289256649587e-06, + "loss": 0.75726146, + "num_input_tokens_seen": 118055905, + "router_z_loss_clip": 0.90869141, + "router_z_loss_mlp": 0.17626953, + "step": 5496, + "time_per_iteration": 2.620426654815674 + }, + { + "auxiliary_loss_clip": 0.01141298, + "auxiliary_loss_mlp": 0.01034465, + "balance_loss_clip": 1.05255246, + "balance_loss_mlp": 1.01905179, + "epoch": 0.33049751991582743, + "flos": 24895431394560.0, + "grad_norm": 2.192565106931306, + "language_loss": 0.73083699, + "learning_rate": 3.1252069685636196e-06, + "loss": 0.75259471, + "num_input_tokens_seen": 118073695, + "router_z_loss_clip": 0.88720703, + "router_z_loss_mlp": 0.15405273, + "step": 5497, + "time_per_iteration": 3.8828587532043457 + }, + { + "auxiliary_loss_clip": 0.01144792, + "auxiliary_loss_mlp": 0.01037645, + "balance_loss_clip": 1.05388367, + "balance_loss_mlp": 1.02219582, + "epoch": 0.3305576431684954, + "flos": 29460575554560.0, + "grad_norm": 3.0130853867149545, + "language_loss": 0.80373383, + "learning_rate": 3.124884968794321e-06, + "loss": 0.82555819, + "num_input_tokens_seen": 118094030, + "router_z_loss_clip": 0.90869141, + "router_z_loss_mlp": 0.15435791, + "step": 5498, + "time_per_iteration": 2.5246427059173584 + }, + { + "auxiliary_loss_clip": 0.01142444, + "auxiliary_loss_mlp": 0.01043079, + "balance_loss_clip": 1.05177546, + "balance_loss_mlp": 1.02727163, + "epoch": 0.33061776642116336, + "flos": 22632305040000.0, + "grad_norm": 2.6144075892975662, + "language_loss": 0.76160371, + "learning_rate": 3.12456292636927e-06, + "loss": 0.78345883, + "num_input_tokens_seen": 118111665, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.15795898, + "step": 5499, + "time_per_iteration": 2.462568521499634 + }, + { + "auxiliary_loss_clip": 0.01143426, + "auxiliary_loss_mlp": 0.0103767, + "balance_loss_clip": 1.05380225, + "balance_loss_mlp": 1.02214885, + "epoch": 0.3306778896738313, + "flos": 25776320532480.0, + "grad_norm": 1.5330808110153291, + "language_loss": 0.79364389, + "learning_rate": 3.124240841300681e-06, + "loss": 0.81545484, + "num_input_tokens_seen": 118132435, + "router_z_loss_clip": 0.89648438, + "router_z_loss_mlp": 0.1550293, + "step": 5500, + "time_per_iteration": 2.463780164718628 + }, + { + "auxiliary_loss_clip": 0.0115265, + "auxiliary_loss_mlp": 0.01035506, + "balance_loss_clip": 1.06217337, + "balance_loss_mlp": 1.01940036, + "epoch": 0.33073801292649935, + "flos": 36940552479360.0, + "grad_norm": 2.764429288616422, + "language_loss": 0.66660511, + "learning_rate": 3.1239187136007665e-06, + "loss": 0.6884867, + "num_input_tokens_seen": 118155255, + "router_z_loss_clip": 0.90429688, + "router_z_loss_mlp": 0.16113281, + "step": 5501, + "time_per_iteration": 2.5831663608551025 + }, + { + "auxiliary_loss_clip": 0.011459, + "auxiliary_loss_mlp": 0.01042284, + "balance_loss_clip": 1.05597293, + "balance_loss_mlp": 1.02489102, + "epoch": 0.3307981361791673, + "flos": 12967738848000.0, + "grad_norm": 2.4402799883323296, + "language_loss": 0.7730999, + "learning_rate": 3.1235965432817417e-06, + "loss": 0.79498178, + "num_input_tokens_seen": 118169865, + "router_z_loss_clip": 0.89892578, + "router_z_loss_mlp": 0.1739502, + "step": 5502, + "time_per_iteration": 2.3903188705444336 + }, + { + "auxiliary_loss_clip": 0.01150327, + "auxiliary_loss_mlp": 0.01044306, + "balance_loss_clip": 1.05911422, + "balance_loss_mlp": 1.02873755, + "epoch": 0.3308582594318353, + "flos": 25374372364800.0, + "grad_norm": 1.568190738449007, + "language_loss": 0.72234082, + "learning_rate": 3.123274330355824e-06, + "loss": 0.74428713, + "num_input_tokens_seen": 118190760, + "router_z_loss_clip": 0.91162109, + "router_z_loss_mlp": 0.15588379, + "step": 5503, + "time_per_iteration": 2.5012340545654297 + }, + { + "auxiliary_loss_clip": 0.0115027, + "auxiliary_loss_mlp": 0.01036377, + "balance_loss_clip": 1.06291616, + "balance_loss_mlp": 1.02049851, + "epoch": 0.33091838268450324, + "flos": 26468570419200.0, + "grad_norm": 1.9510302133377317, + "language_loss": 0.75467861, + "learning_rate": 3.12295207483523e-06, + "loss": 0.77654505, + "num_input_tokens_seen": 118213620, + "router_z_loss_clip": 0.87451172, + "router_z_loss_mlp": 0.15863037, + "step": 5504, + "time_per_iteration": 2.520761489868164 + }, + { + "auxiliary_loss_clip": 0.01147285, + "auxiliary_loss_mlp": 0.01037426, + "balance_loss_clip": 1.05957389, + "balance_loss_mlp": 1.02248323, + "epoch": 0.3309785059371712, + "flos": 24971167221120.0, + "grad_norm": 1.8287620394681487, + "language_loss": 0.69801706, + "learning_rate": 3.1226297767321816e-06, + "loss": 0.71986413, + "num_input_tokens_seen": 118235010, + "router_z_loss_clip": 0.87792969, + "router_z_loss_mlp": 0.14941406, + "step": 5505, + "time_per_iteration": 2.5487425327301025 + }, + { + "auxiliary_loss_clip": 0.01139553, + "auxiliary_loss_mlp": 0.01044353, + "balance_loss_clip": 1.05344594, + "balance_loss_mlp": 1.02926099, + "epoch": 0.3310386291898392, + "flos": 20446710192000.0, + "grad_norm": 1.7415871861194236, + "language_loss": 0.82038993, + "learning_rate": 3.122307436058899e-06, + "loss": 0.84222895, + "num_input_tokens_seen": 118255820, + "router_z_loss_clip": 0.86083984, + "router_z_loss_mlp": 0.15087891, + "step": 5506, + "time_per_iteration": 2.511711597442627 + }, + { + "auxiliary_loss_clip": 0.01145326, + "auxiliary_loss_mlp": 0.01051241, + "balance_loss_clip": 1.05539155, + "balance_loss_mlp": 1.03370571, + "epoch": 0.33109875244250714, + "flos": 23182672204800.0, + "grad_norm": 1.8738261369932967, + "language_loss": 0.79296839, + "learning_rate": 3.121985052827606e-06, + "loss": 0.81493407, + "num_input_tokens_seen": 118274160, + "router_z_loss_clip": 0.89941406, + "router_z_loss_mlp": 0.17541504, + "step": 5507, + "time_per_iteration": 3.8792896270751953 + }, + { + "auxiliary_loss_clip": 0.01145213, + "auxiliary_loss_mlp": 0.01038622, + "balance_loss_clip": 1.05802667, + "balance_loss_mlp": 1.02323222, + "epoch": 0.3311588756951751, + "flos": 24168384207360.0, + "grad_norm": 1.5527841891818568, + "language_loss": 0.71405894, + "learning_rate": 3.1216626270505274e-06, + "loss": 0.7358973, + "num_input_tokens_seen": 118294385, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.15393066, + "step": 5508, + "time_per_iteration": 2.957165479660034 + }, + { + "auxiliary_loss_clip": 0.01153894, + "auxiliary_loss_mlp": 0.01033612, + "balance_loss_clip": 1.06840849, + "balance_loss_mlp": 1.01912761, + "epoch": 0.33121899894784307, + "flos": 28145742209280.0, + "grad_norm": 1.9292594340085296, + "language_loss": 0.72020495, + "learning_rate": 3.12134015873989e-06, + "loss": 0.74208009, + "num_input_tokens_seen": 118313105, + "router_z_loss_clip": 0.85449219, + "router_z_loss_mlp": 0.14501953, + "step": 5509, + "time_per_iteration": 3.9287264347076416 + }, + { + "auxiliary_loss_clip": 0.01142004, + "auxiliary_loss_mlp": 0.01034354, + "balance_loss_clip": 1.05574512, + "balance_loss_mlp": 1.01945853, + "epoch": 0.33127912220051103, + "flos": 29567660976000.0, + "grad_norm": 4.2956950365585, + "language_loss": 0.73042369, + "learning_rate": 3.121017647907921e-06, + "loss": 0.75218725, + "num_input_tokens_seen": 118335250, + "router_z_loss_clip": 0.86328125, + "router_z_loss_mlp": 0.14904785, + "step": 5510, + "time_per_iteration": 2.53505277633667 + }, + { + "auxiliary_loss_clip": 0.01137464, + "auxiliary_loss_mlp": 0.01039425, + "balance_loss_clip": 1.05246949, + "balance_loss_mlp": 1.02439272, + "epoch": 0.331339245453179, + "flos": 14428836374400.0, + "grad_norm": 2.4761989247615346, + "language_loss": 0.87852401, + "learning_rate": 3.1206950945668508e-06, + "loss": 0.90029287, + "num_input_tokens_seen": 118351470, + "router_z_loss_clip": 0.84960938, + "router_z_loss_mlp": 0.15014648, + "step": 5511, + "time_per_iteration": 2.438929796218872 + }, + { + "auxiliary_loss_clip": 0.01136168, + "auxiliary_loss_mlp": 0.01031453, + "balance_loss_clip": 1.05529833, + "balance_loss_mlp": 1.01784515, + "epoch": 0.33139936870584696, + "flos": 20887118847360.0, + "grad_norm": 1.632678424247874, + "language_loss": 0.72930151, + "learning_rate": 3.12037249872891e-06, + "loss": 0.75097775, + "num_input_tokens_seen": 118370970, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.13604736, + "step": 5512, + "time_per_iteration": 2.4758381843566895 + }, + { + "auxiliary_loss_clip": 0.01139759, + "auxiliary_loss_mlp": 0.01038261, + "balance_loss_clip": 1.05595911, + "balance_loss_mlp": 1.02333593, + "epoch": 0.33145949195851493, + "flos": 36284356869120.0, + "grad_norm": 1.827615859968814, + "language_loss": 0.72019172, + "learning_rate": 3.1200498604063317e-06, + "loss": 0.74197185, + "num_input_tokens_seen": 118393125, + "router_z_loss_clip": 0.83789062, + "router_z_loss_mlp": 0.14916992, + "step": 5513, + "time_per_iteration": 2.6086292266845703 + }, + { + "auxiliary_loss_clip": 0.01143715, + "auxiliary_loss_mlp": 0.01036114, + "balance_loss_clip": 1.05239224, + "balance_loss_mlp": 1.01966333, + "epoch": 0.33151961521118295, + "flos": 14279735018880.0, + "grad_norm": 1.9697370353473054, + "language_loss": 0.68630862, + "learning_rate": 3.1197271796113507e-06, + "loss": 0.70810694, + "num_input_tokens_seen": 118410860, + "router_z_loss_clip": 0.91308594, + "router_z_loss_mlp": 0.16442871, + "step": 5514, + "time_per_iteration": 2.504812002182007 + }, + { + "auxiliary_loss_clip": 0.01141811, + "auxiliary_loss_mlp": 0.01041591, + "balance_loss_clip": 1.05370724, + "balance_loss_mlp": 1.02394783, + "epoch": 0.3315797384638509, + "flos": 20774323163520.0, + "grad_norm": 2.550282096285393, + "language_loss": 0.66523176, + "learning_rate": 3.1194044563562026e-06, + "loss": 0.68706584, + "num_input_tokens_seen": 118429570, + "router_z_loss_clip": 0.88232422, + "router_z_loss_mlp": 0.1763916, + "step": 5515, + "time_per_iteration": 3.8345656394958496 + }, + { + "auxiliary_loss_clip": 0.01141621, + "auxiliary_loss_mlp": 0.01032593, + "balance_loss_clip": 1.05442798, + "balance_loss_mlp": 1.01741767, + "epoch": 0.3316398617165189, + "flos": 24679464871680.0, + "grad_norm": 1.6294244425668485, + "language_loss": 0.69192266, + "learning_rate": 3.1190816906531257e-06, + "loss": 0.71366483, + "num_input_tokens_seen": 118450285, + "router_z_loss_clip": 0.87207031, + "router_z_loss_mlp": 0.15185547, + "step": 5516, + "time_per_iteration": 2.480407476425171 + }, + { + "auxiliary_loss_clip": 0.01137601, + "auxiliary_loss_mlp": 0.01037909, + "balance_loss_clip": 1.04917943, + "balance_loss_mlp": 1.02285242, + "epoch": 0.33169998496918685, + "flos": 18587974129920.0, + "grad_norm": 1.914879303723082, + "language_loss": 0.80715466, + "learning_rate": 3.118758882514359e-06, + "loss": 0.82890975, + "num_input_tokens_seen": 118468270, + "router_z_loss_clip": 0.88427734, + "router_z_loss_mlp": 0.15063477, + "step": 5517, + "time_per_iteration": 2.4901282787323 + }, + { + "auxiliary_loss_clip": 0.01149186, + "auxiliary_loss_mlp": 0.01034382, + "balance_loss_clip": 1.06559837, + "balance_loss_mlp": 1.0199461, + "epoch": 0.3317601082218548, + "flos": 20193647898240.0, + "grad_norm": 1.7841308013668054, + "language_loss": 0.74625015, + "learning_rate": 3.118436031952143e-06, + "loss": 0.76808584, + "num_input_tokens_seen": 118486615, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.14440918, + "step": 5518, + "time_per_iteration": 2.4276676177978516 + }, + { + "auxiliary_loss_clip": 0.0107262, + "auxiliary_loss_mlp": 0.01002979, + "balance_loss_clip": 1.04189789, + "balance_loss_mlp": 1.00084186, + "epoch": 0.3318202314745228, + "flos": 68974703637120.0, + "grad_norm": 0.6259556404026129, + "language_loss": 0.54336774, + "learning_rate": 3.1181131389787206e-06, + "loss": 0.56412375, + "num_input_tokens_seen": 118553580, + "router_z_loss_clip": 0.30761719, + "router_z_loss_mlp": 0.0213623, + "step": 5519, + "time_per_iteration": 3.1885719299316406 + }, + { + "auxiliary_loss_clip": 0.01135208, + "auxiliary_loss_mlp": 0.01037481, + "balance_loss_clip": 1.04968786, + "balance_loss_mlp": 1.02134061, + "epoch": 0.33188035472719074, + "flos": 21500113374720.0, + "grad_norm": 2.21471656952799, + "language_loss": 0.78566372, + "learning_rate": 3.117790203606336e-06, + "loss": 0.80739057, + "num_input_tokens_seen": 118570280, + "router_z_loss_clip": 0.85449219, + "router_z_loss_mlp": 0.16125488, + "step": 5520, + "time_per_iteration": 2.463176965713501 + }, + { + "auxiliary_loss_clip": 0.01139221, + "auxiliary_loss_mlp": 0.0103558, + "balance_loss_clip": 1.05416679, + "balance_loss_mlp": 1.02084589, + "epoch": 0.3319404779798587, + "flos": 28870490926080.0, + "grad_norm": 2.109653176646682, + "language_loss": 0.76278591, + "learning_rate": 3.1174672258472344e-06, + "loss": 0.78453392, + "num_input_tokens_seen": 118590455, + "router_z_loss_clip": 0.85107422, + "router_z_loss_mlp": 0.14727783, + "step": 5521, + "time_per_iteration": 2.9491279125213623 + }, + { + "auxiliary_loss_clip": 0.01136174, + "auxiliary_loss_mlp": 0.01042786, + "balance_loss_clip": 1.0482924, + "balance_loss_mlp": 1.02648997, + "epoch": 0.33200060123252667, + "flos": 23076915586560.0, + "grad_norm": 2.277870058133752, + "language_loss": 0.69938326, + "learning_rate": 3.117144205713664e-06, + "loss": 0.72117287, + "num_input_tokens_seen": 118609495, + "router_z_loss_clip": 0.87988281, + "router_z_loss_mlp": 0.16308594, + "step": 5522, + "time_per_iteration": 2.503687858581543 + }, + { + "auxiliary_loss_clip": 0.01134655, + "auxiliary_loss_mlp": 0.01036076, + "balance_loss_clip": 1.05039132, + "balance_loss_mlp": 1.02149701, + "epoch": 0.33206072448519464, + "flos": 21142479611520.0, + "grad_norm": 1.7364995675036798, + "language_loss": 0.73893905, + "learning_rate": 3.1168211432178735e-06, + "loss": 0.76064628, + "num_input_tokens_seen": 118628720, + "router_z_loss_clip": 0.84228516, + "router_z_loss_mlp": 0.14569092, + "step": 5523, + "time_per_iteration": 2.491178274154663 + }, + { + "auxiliary_loss_clip": 0.01136132, + "auxiliary_loss_mlp": 0.0103543, + "balance_loss_clip": 1.05243134, + "balance_loss_mlp": 1.0204277, + "epoch": 0.3321208477378626, + "flos": 13079097987840.0, + "grad_norm": 1.7403817501412453, + "language_loss": 0.82048911, + "learning_rate": 3.116498038372114e-06, + "loss": 0.84220475, + "num_input_tokens_seen": 118645955, + "router_z_loss_clip": 0.83691406, + "router_z_loss_mlp": 0.15002441, + "step": 5524, + "time_per_iteration": 2.4524214267730713 + }, + { + "auxiliary_loss_clip": 0.01132792, + "auxiliary_loss_mlp": 0.01041239, + "balance_loss_clip": 1.04835844, + "balance_loss_mlp": 1.02756524, + "epoch": 0.33218097099053057, + "flos": 21215414177280.0, + "grad_norm": 1.6048116421524778, + "language_loss": 0.82947034, + "learning_rate": 3.116174891188636e-06, + "loss": 0.85121065, + "num_input_tokens_seen": 118665605, + "router_z_loss_clip": 0.84375, + "router_z_loss_mlp": 0.13677979, + "step": 5525, + "time_per_iteration": 2.511319160461426 + }, + { + "auxiliary_loss_clip": 0.01067152, + "auxiliary_loss_mlp": 0.01009302, + "balance_loss_clip": 1.03647113, + "balance_loss_mlp": 1.00701618, + "epoch": 0.33224109424319853, + "flos": 64348979189760.0, + "grad_norm": 0.7611583630876451, + "language_loss": 0.52606773, + "learning_rate": 3.1158517016796945e-06, + "loss": 0.54683232, + "num_input_tokens_seen": 118728155, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 0.02285767, + "step": 5526, + "time_per_iteration": 3.066577672958374 + }, + { + "auxiliary_loss_clip": 0.01143084, + "auxiliary_loss_mlp": 0.01057884, + "balance_loss_clip": 1.05377102, + "balance_loss_mlp": 1.04022872, + "epoch": 0.33230121749586655, + "flos": 17346003523200.0, + "grad_norm": 2.8433167449645333, + "language_loss": 0.77775919, + "learning_rate": 3.1155284698575445e-06, + "loss": 0.79976887, + "num_input_tokens_seen": 118743955, + "router_z_loss_clip": 0.89355469, + "router_z_loss_mlp": 0.17651367, + "step": 5527, + "time_per_iteration": 2.4815239906311035 + }, + { + "auxiliary_loss_clip": 0.01164773, + "auxiliary_loss_mlp": 0.01042879, + "balance_loss_clip": 1.0741508, + "balance_loss_mlp": 1.0283469, + "epoch": 0.3323613407485345, + "flos": 20997041443200.0, + "grad_norm": 1.945785559539204, + "language_loss": 0.71870244, + "learning_rate": 3.1152051957344434e-06, + "loss": 0.74077898, + "num_input_tokens_seen": 118763275, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.14538574, + "step": 5528, + "time_per_iteration": 2.52488374710083 + }, + { + "auxiliary_loss_clip": 0.0113636, + "auxiliary_loss_mlp": 0.010438, + "balance_loss_clip": 1.04855144, + "balance_loss_mlp": 1.02731323, + "epoch": 0.3324214640012025, + "flos": 13152535344000.0, + "grad_norm": 1.7528658435168383, + "language_loss": 0.82642537, + "learning_rate": 3.1148818793226497e-06, + "loss": 0.84822702, + "num_input_tokens_seen": 118781110, + "router_z_loss_clip": 0.87646484, + "router_z_loss_mlp": 0.16479492, + "step": 5529, + "time_per_iteration": 2.4362146854400635 + }, + { + "auxiliary_loss_clip": 0.01148468, + "auxiliary_loss_mlp": 0.01044955, + "balance_loss_clip": 1.05387092, + "balance_loss_mlp": 1.02929044, + "epoch": 0.33248158725387045, + "flos": 22273522041600.0, + "grad_norm": 2.4624716139649157, + "language_loss": 0.69515127, + "learning_rate": 3.114558520634423e-06, + "loss": 0.71708548, + "num_input_tokens_seen": 118800620, + "router_z_loss_clip": 0.94580078, + "router_z_loss_mlp": 0.15667725, + "step": 5530, + "time_per_iteration": 2.430126190185547 + }, + { + "auxiliary_loss_clip": 0.01141673, + "auxiliary_loss_mlp": 0.01044843, + "balance_loss_clip": 1.0524385, + "balance_loss_mlp": 1.02888048, + "epoch": 0.3325417105065384, + "flos": 20740998320640.0, + "grad_norm": 3.0665962312067223, + "language_loss": 0.76322663, + "learning_rate": 3.1142351196820256e-06, + "loss": 0.78509176, + "num_input_tokens_seen": 118818725, + "router_z_loss_clip": 0.89208984, + "router_z_loss_mlp": 0.15966797, + "step": 5531, + "time_per_iteration": 2.4480485916137695 + }, + { + "auxiliary_loss_clip": 0.01147726, + "auxiliary_loss_mlp": 0.01035897, + "balance_loss_clip": 1.056476, + "balance_loss_mlp": 1.01994646, + "epoch": 0.3326018337592064, + "flos": 24790536702720.0, + "grad_norm": 1.8862396407264002, + "language_loss": 0.73186857, + "learning_rate": 3.1139116764777206e-06, + "loss": 0.75370479, + "num_input_tokens_seen": 118839390, + "router_z_loss_clip": 0.91210938, + "router_z_loss_mlp": 0.1595459, + "step": 5532, + "time_per_iteration": 2.5443813800811768 + }, + { + "auxiliary_loss_clip": 0.01146804, + "auxiliary_loss_mlp": 0.01032521, + "balance_loss_clip": 1.06024218, + "balance_loss_mlp": 1.01805544, + "epoch": 0.33266195701187434, + "flos": 14501699112960.0, + "grad_norm": 2.138552989785087, + "language_loss": 0.65826845, + "learning_rate": 3.1135881910337735e-06, + "loss": 0.6800617, + "num_input_tokens_seen": 118856275, + "router_z_loss_clip": 0.86523438, + "router_z_loss_mlp": 0.14459229, + "step": 5533, + "time_per_iteration": 2.4473791122436523 + }, + { + "auxiliary_loss_clip": 0.01139424, + "auxiliary_loss_mlp": 0.01037174, + "balance_loss_clip": 1.05202508, + "balance_loss_mlp": 1.02223718, + "epoch": 0.3327220802645423, + "flos": 15304410299520.0, + "grad_norm": 2.009467716886563, + "language_loss": 0.71175826, + "learning_rate": 3.113264663362451e-06, + "loss": 0.73352426, + "num_input_tokens_seen": 118873830, + "router_z_loss_clip": 0.87451172, + "router_z_loss_mlp": 0.14935303, + "step": 5534, + "time_per_iteration": 2.6918294429779053 + }, + { + "auxiliary_loss_clip": 0.01150464, + "auxiliary_loss_mlp": 0.01037377, + "balance_loss_clip": 1.06404042, + "balance_loss_mlp": 1.02215409, + "epoch": 0.3327822035172103, + "flos": 23477534951040.0, + "grad_norm": 1.9682888611189953, + "language_loss": 0.66654652, + "learning_rate": 3.1129410934760204e-06, + "loss": 0.68842494, + "num_input_tokens_seen": 118891560, + "router_z_loss_clip": 0.86425781, + "router_z_loss_mlp": 0.15216064, + "step": 5535, + "time_per_iteration": 2.6792311668395996 + }, + { + "auxiliary_loss_clip": 0.0115021, + "auxiliary_loss_mlp": 0.01040488, + "balance_loss_clip": 1.06201625, + "balance_loss_mlp": 1.02570581, + "epoch": 0.33284232676987824, + "flos": 25374516019200.0, + "grad_norm": 3.2041237322584384, + "language_loss": 0.7261861, + "learning_rate": 3.1126174813867517e-06, + "loss": 0.74809307, + "num_input_tokens_seen": 118910260, + "router_z_loss_clip": 0.88134766, + "router_z_loss_mlp": 0.14764404, + "step": 5536, + "time_per_iteration": 2.537656545639038 + }, + { + "auxiliary_loss_clip": 0.01154271, + "auxiliary_loss_mlp": 0.0103689, + "balance_loss_clip": 1.06716835, + "balance_loss_mlp": 1.0218513, + "epoch": 0.3329024500225462, + "flos": 23694363400320.0, + "grad_norm": 2.580469535533384, + "language_loss": 0.8188436, + "learning_rate": 3.112293827106917e-06, + "loss": 0.84075522, + "num_input_tokens_seen": 118929985, + "router_z_loss_clip": 0.87011719, + "router_z_loss_mlp": 0.15032959, + "step": 5537, + "time_per_iteration": 2.51682448387146 + }, + { + "auxiliary_loss_clip": 0.01151107, + "auxiliary_loss_mlp": 0.01039499, + "balance_loss_clip": 1.06318963, + "balance_loss_mlp": 1.02418065, + "epoch": 0.33296257327521417, + "flos": 31723163205120.0, + "grad_norm": 2.2148463072034614, + "language_loss": 0.71800745, + "learning_rate": 3.111970130648789e-06, + "loss": 0.73991352, + "num_input_tokens_seen": 118951355, + "router_z_loss_clip": 0.87792969, + "router_z_loss_mlp": 0.15319824, + "step": 5538, + "time_per_iteration": 2.6336417198181152 + }, + { + "auxiliary_loss_clip": 0.01140023, + "auxiliary_loss_mlp": 0.01029741, + "balance_loss_clip": 1.05564046, + "balance_loss_mlp": 1.01481581, + "epoch": 0.33302269652788213, + "flos": 22744705674240.0, + "grad_norm": 1.8690184493440523, + "language_loss": 0.7433306, + "learning_rate": 3.1116463920246424e-06, + "loss": 0.76502818, + "num_input_tokens_seen": 118970910, + "router_z_loss_clip": 0.84326172, + "router_z_loss_mlp": 0.14916992, + "step": 5539, + "time_per_iteration": 2.4709482192993164 + }, + { + "auxiliary_loss_clip": 0.01155648, + "auxiliary_loss_mlp": 0.01043972, + "balance_loss_clip": 1.06293082, + "balance_loss_mlp": 1.02882016, + "epoch": 0.33308281978055015, + "flos": 11473747441920.0, + "grad_norm": 2.165249336393262, + "language_loss": 0.70958984, + "learning_rate": 3.1113226112467527e-06, + "loss": 0.7315861, + "num_input_tokens_seen": 118989200, + "router_z_loss_clip": 0.92773438, + "router_z_loss_mlp": 0.15148926, + "step": 5540, + "time_per_iteration": 3.8938469886779785 + }, + { + "auxiliary_loss_clip": 0.01148525, + "auxiliary_loss_mlp": 0.01035448, + "balance_loss_clip": 1.05989778, + "balance_loss_mlp": 1.0203557, + "epoch": 0.3331429430332181, + "flos": 38213693112960.0, + "grad_norm": 1.5452441135606516, + "language_loss": 0.6099605, + "learning_rate": 3.1109987883273983e-06, + "loss": 0.63180023, + "num_input_tokens_seen": 119011030, + "router_z_loss_clip": 0.88623047, + "router_z_loss_mlp": 0.15087891, + "step": 5541, + "time_per_iteration": 2.6066460609436035 + }, + { + "auxiliary_loss_clip": 0.01145716, + "auxiliary_loss_mlp": 0.01039281, + "balance_loss_clip": 1.05677176, + "balance_loss_mlp": 1.0231998, + "epoch": 0.3332030662858861, + "flos": 22528667324160.0, + "grad_norm": 1.8910951553956352, + "language_loss": 0.68422794, + "learning_rate": 3.1106749232788584e-06, + "loss": 0.70607793, + "num_input_tokens_seen": 119030620, + "router_z_loss_clip": 0.88916016, + "router_z_loss_mlp": 0.16082764, + "step": 5542, + "time_per_iteration": 2.4707577228546143 + }, + { + "auxiliary_loss_clip": 0.01141677, + "auxiliary_loss_mlp": 0.01037247, + "balance_loss_clip": 1.05537546, + "balance_loss_mlp": 1.02315664, + "epoch": 0.33326318953855405, + "flos": 15997773507840.0, + "grad_norm": 1.7179535787650881, + "language_loss": 0.75528216, + "learning_rate": 3.110351016113414e-06, + "loss": 0.77707142, + "num_input_tokens_seen": 119048015, + "router_z_loss_clip": 0.86279297, + "router_z_loss_mlp": 0.14074707, + "step": 5543, + "time_per_iteration": 2.416982650756836 + }, + { + "auxiliary_loss_clip": 0.01146216, + "auxiliary_loss_mlp": 0.01038287, + "balance_loss_clip": 1.05746102, + "balance_loss_mlp": 1.02335048, + "epoch": 0.333323312791222, + "flos": 25593535198080.0, + "grad_norm": 2.4582904744516965, + "language_loss": 0.74469388, + "learning_rate": 3.110027066843348e-06, + "loss": 0.76653886, + "num_input_tokens_seen": 119066280, + "router_z_loss_clip": 0.88818359, + "router_z_loss_mlp": 0.14953613, + "step": 5544, + "time_per_iteration": 2.5287818908691406 + }, + { + "auxiliary_loss_clip": 0.01142416, + "auxiliary_loss_mlp": 0.01031454, + "balance_loss_clip": 1.05692947, + "balance_loss_mlp": 1.01689219, + "epoch": 0.33338343604389, + "flos": 25119550304640.0, + "grad_norm": 1.859924220128471, + "language_loss": 0.70706928, + "learning_rate": 3.1097030754809456e-06, + "loss": 0.72880793, + "num_input_tokens_seen": 119087680, + "router_z_loss_clip": 0.85546875, + "router_z_loss_mlp": 0.14569092, + "step": 5545, + "time_per_iteration": 2.4831721782684326 + }, + { + "auxiliary_loss_clip": 0.01139564, + "auxiliary_loss_mlp": 0.01033054, + "balance_loss_clip": 1.05561101, + "balance_loss_mlp": 1.01870692, + "epoch": 0.33344355929655795, + "flos": 16947287579520.0, + "grad_norm": 1.6206969452920814, + "language_loss": 0.68895596, + "learning_rate": 3.1093790420384894e-06, + "loss": 0.71068215, + "num_input_tokens_seen": 119105820, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.14355469, + "step": 5546, + "time_per_iteration": 2.4664247035980225 + }, + { + "auxiliary_loss_clip": 0.01141332, + "auxiliary_loss_mlp": 0.01036219, + "balance_loss_clip": 1.05101514, + "balance_loss_mlp": 1.02118659, + "epoch": 0.3335036825492259, + "flos": 27889591345920.0, + "grad_norm": 1.8412608011433766, + "language_loss": 0.65054137, + "learning_rate": 3.1090549665282702e-06, + "loss": 0.67231691, + "num_input_tokens_seen": 119126630, + "router_z_loss_clip": 0.90332031, + "router_z_loss_mlp": 0.15014648, + "step": 5547, + "time_per_iteration": 2.5698659420013428 + }, + { + "auxiliary_loss_clip": 0.01144646, + "auxiliary_loss_mlp": 0.01032167, + "balance_loss_clip": 1.05877948, + "balance_loss_mlp": 1.01878595, + "epoch": 0.3335638058018939, + "flos": 16179553261440.0, + "grad_norm": 2.142774110517164, + "language_loss": 0.85237843, + "learning_rate": 3.1087308489625742e-06, + "loss": 0.87414658, + "num_input_tokens_seen": 119143375, + "router_z_loss_clip": 0.85888672, + "router_z_loss_mlp": 0.13391113, + "step": 5548, + "time_per_iteration": 2.754169225692749 + }, + { + "auxiliary_loss_clip": 0.01151486, + "auxiliary_loss_mlp": 0.01035891, + "balance_loss_clip": 1.06193852, + "balance_loss_mlp": 1.0198102, + "epoch": 0.33362392905456184, + "flos": 39896108288640.0, + "grad_norm": 2.550894629473506, + "language_loss": 0.74786294, + "learning_rate": 3.1084066893536945e-06, + "loss": 0.76973671, + "num_input_tokens_seen": 119166450, + "router_z_loss_clip": 0.89550781, + "router_z_loss_mlp": 0.16088867, + "step": 5549, + "time_per_iteration": 2.632197856903076 + }, + { + "auxiliary_loss_clip": 0.01139435, + "auxiliary_loss_mlp": 0.01038137, + "balance_loss_clip": 1.05191731, + "balance_loss_mlp": 1.02201962, + "epoch": 0.3336840523072298, + "flos": 44271212567040.0, + "grad_norm": 2.082466929869048, + "language_loss": 0.67963028, + "learning_rate": 3.108082487713921e-06, + "loss": 0.701406, + "num_input_tokens_seen": 119189645, + "router_z_loss_clip": 0.87402344, + "router_z_loss_mlp": 0.16113281, + "step": 5550, + "time_per_iteration": 2.653421640396118 + }, + { + "auxiliary_loss_clip": 0.01133318, + "auxiliary_loss_mlp": 0.0104073, + "balance_loss_clip": 1.04793692, + "balance_loss_mlp": 1.02669346, + "epoch": 0.33374417555989777, + "flos": 15085678429440.0, + "grad_norm": 1.77462261135081, + "language_loss": 0.60510039, + "learning_rate": 3.1077582440555495e-06, + "loss": 0.62684089, + "num_input_tokens_seen": 119208045, + "router_z_loss_clip": 0.85351562, + "router_z_loss_mlp": 0.14031982, + "step": 5551, + "time_per_iteration": 2.534597635269165 + }, + { + "auxiliary_loss_clip": 0.01139894, + "auxiliary_loss_mlp": 0.01042609, + "balance_loss_clip": 1.0527029, + "balance_loss_mlp": 1.02733839, + "epoch": 0.33380429881256574, + "flos": 15849174942720.0, + "grad_norm": 1.8023737305152774, + "language_loss": 0.70428401, + "learning_rate": 3.1074339583908746e-06, + "loss": 0.72610903, + "num_input_tokens_seen": 119224910, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.15270996, + "step": 5552, + "time_per_iteration": 5.3116419315338135 + }, + { + "auxiliary_loss_clip": 0.01144719, + "auxiliary_loss_mlp": 0.01036196, + "balance_loss_clip": 1.05880034, + "balance_loss_mlp": 1.02184963, + "epoch": 0.33386442206523376, + "flos": 13480327883520.0, + "grad_norm": 5.907680801694374, + "language_loss": 0.82774174, + "learning_rate": 3.107109630732192e-06, + "loss": 0.84955084, + "num_input_tokens_seen": 119243290, + "router_z_loss_clip": 0.85888672, + "router_z_loss_mlp": 0.14337158, + "step": 5553, + "time_per_iteration": 2.48319411277771 + }, + { + "auxiliary_loss_clip": 0.01147086, + "auxiliary_loss_mlp": 0.01038315, + "balance_loss_clip": 1.05712938, + "balance_loss_mlp": 1.02134526, + "epoch": 0.3339245453179017, + "flos": 16690669839360.0, + "grad_norm": 2.0821510301410493, + "language_loss": 0.8059907, + "learning_rate": 3.1067852610918017e-06, + "loss": 0.82784474, + "num_input_tokens_seen": 119261195, + "router_z_loss_clip": 0.90039062, + "router_z_loss_mlp": 0.16967773, + "step": 5554, + "time_per_iteration": 2.4491965770721436 + }, + { + "auxiliary_loss_clip": 0.01145718, + "auxiliary_loss_mlp": 0.01040727, + "balance_loss_clip": 1.0600009, + "balance_loss_mlp": 1.0263679, + "epoch": 0.3339846685705697, + "flos": 24610624456320.0, + "grad_norm": 1.6052781935718967, + "language_loss": 0.81453288, + "learning_rate": 3.1064608494820032e-06, + "loss": 0.83639729, + "num_input_tokens_seen": 119282845, + "router_z_loss_clip": 0.85839844, + "router_z_loss_mlp": 0.14367676, + "step": 5555, + "time_per_iteration": 2.5075244903564453 + }, + { + "auxiliary_loss_clip": 0.01134087, + "auxiliary_loss_mlp": 0.01039573, + "balance_loss_clip": 1.04985213, + "balance_loss_mlp": 1.02418303, + "epoch": 0.33404479182323765, + "flos": 30953812775040.0, + "grad_norm": 1.6856053640705835, + "language_loss": 0.74418944, + "learning_rate": 3.106136395915099e-06, + "loss": 0.765926, + "num_input_tokens_seen": 119304430, + "router_z_loss_clip": 0.84277344, + "router_z_loss_mlp": 0.15374756, + "step": 5556, + "time_per_iteration": 2.5995728969573975 + }, + { + "auxiliary_loss_clip": 0.01155527, + "auxiliary_loss_mlp": 0.0103951, + "balance_loss_clip": 1.07039165, + "balance_loss_mlp": 1.0247817, + "epoch": 0.3341049150759056, + "flos": 23513301918720.0, + "grad_norm": 2.073095204248628, + "language_loss": 0.82434672, + "learning_rate": 3.105811900403391e-06, + "loss": 0.84629709, + "num_input_tokens_seen": 119323830, + "router_z_loss_clip": 0.85058594, + "router_z_loss_mlp": 0.14733887, + "step": 5557, + "time_per_iteration": 2.4937291145324707 + }, + { + "auxiliary_loss_clip": 0.01148229, + "auxiliary_loss_mlp": 0.01040312, + "balance_loss_clip": 1.05666232, + "balance_loss_mlp": 1.02495813, + "epoch": 0.3341650383285736, + "flos": 24026824707840.0, + "grad_norm": 1.5842470089474774, + "language_loss": 0.80358529, + "learning_rate": 3.1054873629591855e-06, + "loss": 0.82547069, + "num_input_tokens_seen": 119346340, + "router_z_loss_clip": 0.91455078, + "router_z_loss_mlp": 0.15350342, + "step": 5558, + "time_per_iteration": 2.578524589538574 + }, + { + "auxiliary_loss_clip": 0.01139474, + "auxiliary_loss_mlp": 0.01032668, + "balance_loss_clip": 1.05361426, + "balance_loss_mlp": 1.01898837, + "epoch": 0.33422516158124155, + "flos": 24901967669760.0, + "grad_norm": 1.9656551903756783, + "language_loss": 0.81202841, + "learning_rate": 3.105162783594788e-06, + "loss": 0.83374989, + "num_input_tokens_seen": 119367285, + "router_z_loss_clip": 0.85986328, + "router_z_loss_mlp": 0.13677979, + "step": 5559, + "time_per_iteration": 4.151397943496704 + }, + { + "auxiliary_loss_clip": 0.01143556, + "auxiliary_loss_mlp": 0.01037973, + "balance_loss_clip": 1.05908298, + "balance_loss_mlp": 1.02364433, + "epoch": 0.3342852848339095, + "flos": 18333403464960.0, + "grad_norm": 8.145760360271364, + "language_loss": 0.71557647, + "learning_rate": 3.1048381623225074e-06, + "loss": 0.73739183, + "num_input_tokens_seen": 119385370, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.14331055, + "step": 5560, + "time_per_iteration": 2.69102144241333 + }, + { + "auxiliary_loss_clip": 0.0114187, + "auxiliary_loss_mlp": 0.01042997, + "balance_loss_clip": 1.05059552, + "balance_loss_mlp": 1.02690363, + "epoch": 0.3343454080865775, + "flos": 30046530119040.0, + "grad_norm": 1.803739328225121, + "language_loss": 0.74881244, + "learning_rate": 3.1045134991546526e-06, + "loss": 0.77066112, + "num_input_tokens_seen": 119409150, + "router_z_loss_clip": 0.91308594, + "router_z_loss_mlp": 0.16088867, + "step": 5561, + "time_per_iteration": 2.567941665649414 + }, + { + "auxiliary_loss_clip": 0.01132753, + "auxiliary_loss_mlp": 0.01038138, + "balance_loss_clip": 1.04701114, + "balance_loss_mlp": 1.02324867, + "epoch": 0.33440553133924544, + "flos": 16398823835520.0, + "grad_norm": 1.6778270965882627, + "language_loss": 0.69356012, + "learning_rate": 3.1041887941035355e-06, + "loss": 0.71526903, + "num_input_tokens_seen": 119426475, + "router_z_loss_clip": 0.85693359, + "router_z_loss_mlp": 0.14874268, + "step": 5562, + "time_per_iteration": 2.494380474090576 + }, + { + "auxiliary_loss_clip": 0.01136825, + "auxiliary_loss_mlp": 0.01046486, + "balance_loss_clip": 1.04969287, + "balance_loss_mlp": 1.03215742, + "epoch": 0.3344656545919134, + "flos": 24242072958720.0, + "grad_norm": 1.6029200018091925, + "language_loss": 0.64729202, + "learning_rate": 3.1038640471814685e-06, + "loss": 0.66912514, + "num_input_tokens_seen": 119446900, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.14331055, + "step": 5563, + "time_per_iteration": 2.5280637741088867 + }, + { + "auxiliary_loss_clip": 0.01136578, + "auxiliary_loss_mlp": 0.01042806, + "balance_loss_clip": 1.04936504, + "balance_loss_mlp": 1.02717149, + "epoch": 0.3345257778445814, + "flos": 52118843149440.0, + "grad_norm": 2.619828452262571, + "language_loss": 0.74484313, + "learning_rate": 3.103539258400766e-06, + "loss": 0.76663703, + "num_input_tokens_seen": 119470945, + "router_z_loss_clip": 0.87158203, + "router_z_loss_mlp": 0.15631104, + "step": 5564, + "time_per_iteration": 2.743419885635376 + }, + { + "auxiliary_loss_clip": 0.01050956, + "auxiliary_loss_mlp": 0.01007358, + "balance_loss_clip": 1.02208769, + "balance_loss_mlp": 1.00549865, + "epoch": 0.33458590109724934, + "flos": 68048602254720.0, + "grad_norm": 0.7777713966203946, + "language_loss": 0.5545873, + "learning_rate": 3.103214427773745e-06, + "loss": 0.5751704, + "num_input_tokens_seen": 119529925, + "router_z_loss_clip": 0.28857422, + "router_z_loss_mlp": 0.01858521, + "step": 5565, + "time_per_iteration": 3.041335105895996 + }, + { + "auxiliary_loss_clip": 0.01138979, + "auxiliary_loss_mlp": 0.01047386, + "balance_loss_clip": 1.05449128, + "balance_loss_mlp": 1.0311023, + "epoch": 0.3346460243499173, + "flos": 37414788768000.0, + "grad_norm": 2.1343713216075497, + "language_loss": 0.64829445, + "learning_rate": 3.102889555312721e-06, + "loss": 0.67015815, + "num_input_tokens_seen": 119550700, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.16308594, + "step": 5566, + "time_per_iteration": 2.5582058429718018 + }, + { + "auxiliary_loss_clip": 0.01141042, + "auxiliary_loss_mlp": 0.01046693, + "balance_loss_clip": 1.05268574, + "balance_loss_mlp": 1.03126132, + "epoch": 0.3347061476025853, + "flos": 18697358021760.0, + "grad_norm": 3.1659852266673347, + "language_loss": 0.77226675, + "learning_rate": 3.102564641030016e-06, + "loss": 0.79414415, + "num_input_tokens_seen": 119569295, + "router_z_loss_clip": 0.88232422, + "router_z_loss_mlp": 0.15423584, + "step": 5567, + "time_per_iteration": 2.412616729736328 + }, + { + "auxiliary_loss_clip": 0.0114697, + "auxiliary_loss_mlp": 0.01042324, + "balance_loss_clip": 1.05581021, + "balance_loss_mlp": 1.02643371, + "epoch": 0.3347662708552533, + "flos": 13917827537280.0, + "grad_norm": 4.824631272655634, + "language_loss": 0.76574242, + "learning_rate": 3.102239684937949e-06, + "loss": 0.78763539, + "num_input_tokens_seen": 119587375, + "router_z_loss_clip": 0.91113281, + "router_z_loss_mlp": 0.15893555, + "step": 5568, + "time_per_iteration": 2.3749589920043945 + }, + { + "auxiliary_loss_clip": 0.01141685, + "auxiliary_loss_mlp": 0.01045057, + "balance_loss_clip": 1.05485845, + "balance_loss_mlp": 1.03012645, + "epoch": 0.33482639410792125, + "flos": 19750402068480.0, + "grad_norm": 2.150834163352438, + "language_loss": 0.70724452, + "learning_rate": 3.101914687048842e-06, + "loss": 0.72911191, + "num_input_tokens_seen": 119604530, + "router_z_loss_clip": 0.86914062, + "router_z_loss_mlp": 0.14923096, + "step": 5569, + "time_per_iteration": 2.434924602508545 + }, + { + "auxiliary_loss_clip": 0.01148927, + "auxiliary_loss_mlp": 0.0103408, + "balance_loss_clip": 1.05853975, + "balance_loss_mlp": 1.01820135, + "epoch": 0.3348865173605892, + "flos": 16102991422080.0, + "grad_norm": 1.9755999603648013, + "language_loss": 0.90045089, + "learning_rate": 3.10158964737502e-06, + "loss": 0.92228097, + "num_input_tokens_seen": 119621025, + "router_z_loss_clip": 0.90283203, + "router_z_loss_mlp": 0.15881348, + "step": 5570, + "time_per_iteration": 2.399505376815796 + }, + { + "auxiliary_loss_clip": 0.01157649, + "auxiliary_loss_mlp": 0.01036942, + "balance_loss_clip": 1.06873858, + "balance_loss_mlp": 1.02230358, + "epoch": 0.3349466406132572, + "flos": 25008945350400.0, + "grad_norm": 1.7684516536948038, + "language_loss": 0.79859847, + "learning_rate": 3.101264565928808e-06, + "loss": 0.82054436, + "num_input_tokens_seen": 119641725, + "router_z_loss_clip": 0.88916016, + "router_z_loss_mlp": 0.1463623, + "step": 5571, + "time_per_iteration": 2.540355682373047 + }, + { + "auxiliary_loss_clip": 0.01091713, + "auxiliary_loss_mlp": 0.01012674, + "balance_loss_clip": 1.0619328, + "balance_loss_mlp": 1.01097846, + "epoch": 0.33500676386592515, + "flos": 54319991564160.0, + "grad_norm": 0.8937061530370011, + "language_loss": 0.55910188, + "learning_rate": 3.1009394427225335e-06, + "loss": 0.58014572, + "num_input_tokens_seen": 119693560, + "router_z_loss_clip": 0.29833984, + "router_z_loss_mlp": 0.01696777, + "step": 5572, + "time_per_iteration": 3.0152106285095215 + }, + { + "auxiliary_loss_clip": 0.01149315, + "auxiliary_loss_mlp": 0.010453, + "balance_loss_clip": 1.06366825, + "balance_loss_mlp": 1.03051233, + "epoch": 0.3350668871185931, + "flos": 26797332625920.0, + "grad_norm": 4.025160693084559, + "language_loss": 0.77834201, + "learning_rate": 3.1006142777685257e-06, + "loss": 0.8002882, + "num_input_tokens_seen": 119712935, + "router_z_loss_clip": 0.85546875, + "router_z_loss_mlp": 0.14776611, + "step": 5573, + "time_per_iteration": 2.8871302604675293 + }, + { + "auxiliary_loss_clip": 0.0114254, + "auxiliary_loss_mlp": 0.01045853, + "balance_loss_clip": 1.05577588, + "balance_loss_mlp": 1.02981949, + "epoch": 0.3351270103712611, + "flos": 33510508986240.0, + "grad_norm": 2.1026784299153407, + "language_loss": 0.72345662, + "learning_rate": 3.1002890710791133e-06, + "loss": 0.74534053, + "num_input_tokens_seen": 119731680, + "router_z_loss_clip": 0.86865234, + "router_z_loss_mlp": 0.16015625, + "step": 5574, + "time_per_iteration": 2.531754970550537 + }, + { + "auxiliary_loss_clip": 0.0114487, + "auxiliary_loss_mlp": 0.01033099, + "balance_loss_clip": 1.06131232, + "balance_loss_mlp": 1.0188055, + "epoch": 0.33518713362392905, + "flos": 26506240807680.0, + "grad_norm": 1.7574148524234932, + "language_loss": 0.87980735, + "learning_rate": 3.0999638226666287e-06, + "loss": 0.90158707, + "num_input_tokens_seen": 119752155, + "router_z_loss_clip": 0.83496094, + "router_z_loss_mlp": 0.14294434, + "step": 5575, + "time_per_iteration": 2.618161916732788 + }, + { + "auxiliary_loss_clip": 0.01142184, + "auxiliary_loss_mlp": 0.0104789, + "balance_loss_clip": 1.05150795, + "balance_loss_mlp": 1.02969873, + "epoch": 0.335247256876597, + "flos": 17232345912960.0, + "grad_norm": 3.8060605864016206, + "language_loss": 0.82957727, + "learning_rate": 3.0996385325434063e-06, + "loss": 0.85147798, + "num_input_tokens_seen": 119769195, + "router_z_loss_clip": 0.90673828, + "router_z_loss_mlp": 0.18188477, + "step": 5576, + "time_per_iteration": 2.446073293685913 + }, + { + "auxiliary_loss_clip": 0.01140517, + "auxiliary_loss_mlp": 0.01036982, + "balance_loss_clip": 1.05217814, + "balance_loss_mlp": 1.02130592, + "epoch": 0.335307380129265, + "flos": 25629373992960.0, + "grad_norm": 2.9940133122150248, + "language_loss": 0.7367509, + "learning_rate": 3.0993132007217806e-06, + "loss": 0.75852585, + "num_input_tokens_seen": 119786810, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.15673828, + "step": 5577, + "time_per_iteration": 2.5211989879608154 + }, + { + "auxiliary_loss_clip": 0.01146391, + "auxiliary_loss_mlp": 0.01037694, + "balance_loss_clip": 1.05823767, + "balance_loss_mlp": 1.02161241, + "epoch": 0.33536750338193294, + "flos": 19680089195520.0, + "grad_norm": 2.251262831552823, + "language_loss": 0.81913149, + "learning_rate": 3.0989878272140883e-06, + "loss": 0.8409723, + "num_input_tokens_seen": 119805395, + "router_z_loss_clip": 0.88183594, + "router_z_loss_mlp": 0.16064453, + "step": 5578, + "time_per_iteration": 2.4295389652252197 + }, + { + "auxiliary_loss_clip": 0.01140238, + "auxiliary_loss_mlp": 0.0104221, + "balance_loss_clip": 1.0579927, + "balance_loss_mlp": 1.0276134, + "epoch": 0.3354276266346009, + "flos": 18332613365760.0, + "grad_norm": 2.105603226526878, + "language_loss": 0.71566713, + "learning_rate": 3.0986624120326676e-06, + "loss": 0.73749161, + "num_input_tokens_seen": 119823135, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.14581299, + "step": 5579, + "time_per_iteration": 2.4148917198181152 + }, + { + "auxiliary_loss_clip": 0.01139093, + "auxiliary_loss_mlp": 0.01034103, + "balance_loss_clip": 1.0516876, + "balance_loss_mlp": 1.01904678, + "epoch": 0.3354877498872689, + "flos": 17858556645120.0, + "grad_norm": 2.20541697093684, + "language_loss": 0.81572056, + "learning_rate": 3.0983369551898573e-06, + "loss": 0.83745247, + "num_input_tokens_seen": 119842265, + "router_z_loss_clip": 0.87402344, + "router_z_loss_mlp": 0.15045166, + "step": 5580, + "time_per_iteration": 2.484102249145508 + }, + { + "auxiliary_loss_clip": 0.01137977, + "auxiliary_loss_mlp": 0.0103039, + "balance_loss_clip": 1.05165148, + "balance_loss_mlp": 1.01497602, + "epoch": 0.3355478731399369, + "flos": 24717745791360.0, + "grad_norm": 1.7057955728307228, + "language_loss": 0.77987325, + "learning_rate": 3.0980114566980003e-06, + "loss": 0.80155694, + "num_input_tokens_seen": 119862500, + "router_z_loss_clip": 0.86279297, + "router_z_loss_mlp": 0.15429688, + "step": 5581, + "time_per_iteration": 2.4834392070770264 + }, + { + "auxiliary_loss_clip": 0.0114373, + "auxiliary_loss_mlp": 0.01045014, + "balance_loss_clip": 1.05325127, + "balance_loss_mlp": 1.02768064, + "epoch": 0.33560799639260486, + "flos": 16873886136960.0, + "grad_norm": 2.4384643664953285, + "language_loss": 0.74652225, + "learning_rate": 3.0976859165694384e-06, + "loss": 0.76840973, + "num_input_tokens_seen": 119880160, + "router_z_loss_clip": 0.90478516, + "router_z_loss_mlp": 0.17321777, + "step": 5582, + "time_per_iteration": 3.9143519401550293 + }, + { + "auxiliary_loss_clip": 0.01142795, + "auxiliary_loss_mlp": 0.0104168, + "balance_loss_clip": 1.05398679, + "balance_loss_mlp": 1.0257535, + "epoch": 0.3356681196452728, + "flos": 18333511205760.0, + "grad_norm": 1.6782621555028836, + "language_loss": 0.82292181, + "learning_rate": 3.0973603348165166e-06, + "loss": 0.84476656, + "num_input_tokens_seen": 119899040, + "router_z_loss_clip": 0.88720703, + "router_z_loss_mlp": 0.15942383, + "step": 5583, + "time_per_iteration": 2.4601924419403076 + }, + { + "auxiliary_loss_clip": 0.01140892, + "auxiliary_loss_mlp": 0.01043134, + "balance_loss_clip": 1.05435228, + "balance_loss_mlp": 1.02830446, + "epoch": 0.3357282428979408, + "flos": 34750612085760.0, + "grad_norm": 1.9965194531635526, + "language_loss": 0.7751531, + "learning_rate": 3.097034711451581e-06, + "loss": 0.79699337, + "num_input_tokens_seen": 119921120, + "router_z_loss_clip": 0.86523438, + "router_z_loss_mlp": 0.14819336, + "step": 5584, + "time_per_iteration": 2.5824780464172363 + }, + { + "auxiliary_loss_clip": 0.0113906, + "auxiliary_loss_mlp": 0.01040846, + "balance_loss_clip": 1.05042017, + "balance_loss_mlp": 1.02527714, + "epoch": 0.33578836615060875, + "flos": 21580087006080.0, + "grad_norm": 1.5401739480835612, + "language_loss": 0.75887984, + "learning_rate": 3.0967090464869795e-06, + "loss": 0.78067893, + "num_input_tokens_seen": 119940165, + "router_z_loss_clip": 0.88574219, + "router_z_loss_mlp": 0.15576172, + "step": 5585, + "time_per_iteration": 2.473992347717285 + }, + { + "auxiliary_loss_clip": 0.01136143, + "auxiliary_loss_mlp": 0.01039451, + "balance_loss_clip": 1.05079389, + "balance_loss_mlp": 1.02346504, + "epoch": 0.3358484894032767, + "flos": 24530291688960.0, + "grad_norm": 3.7637493910416446, + "language_loss": 0.777385, + "learning_rate": 3.0963833399350608e-06, + "loss": 0.79914093, + "num_input_tokens_seen": 119959730, + "router_z_loss_clip": 0.85253906, + "router_z_loss_mlp": 0.16003418, + "step": 5586, + "time_per_iteration": 2.9009287357330322 + }, + { + "auxiliary_loss_clip": 0.01149559, + "auxiliary_loss_mlp": 0.01043712, + "balance_loss_clip": 1.05815029, + "balance_loss_mlp": 1.02594948, + "epoch": 0.3359086126559447, + "flos": 22455589104000.0, + "grad_norm": 1.7414566274315517, + "language_loss": 0.80788338, + "learning_rate": 3.0960575918081756e-06, + "loss": 0.8298161, + "num_input_tokens_seen": 119979315, + "router_z_loss_clip": 0.91455078, + "router_z_loss_mlp": 0.17773438, + "step": 5587, + "time_per_iteration": 2.44661808013916 + }, + { + "auxiliary_loss_clip": 0.01142761, + "auxiliary_loss_mlp": 0.01037787, + "balance_loss_clip": 1.05712795, + "balance_loss_mlp": 1.02380335, + "epoch": 0.33596873590861265, + "flos": 16543687386240.0, + "grad_norm": 2.4187199671414863, + "language_loss": 0.67242885, + "learning_rate": 3.095731802118677e-06, + "loss": 0.69423437, + "num_input_tokens_seen": 119996140, + "router_z_loss_clip": 0.85742188, + "router_z_loss_mlp": 0.13977051, + "step": 5588, + "time_per_iteration": 2.4806571006774902 + }, + { + "auxiliary_loss_clip": 0.01137256, + "auxiliary_loss_mlp": 0.01050131, + "balance_loss_clip": 1.05127883, + "balance_loss_mlp": 1.03180885, + "epoch": 0.3360288591612806, + "flos": 31175812782720.0, + "grad_norm": 1.9849213210215515, + "language_loss": 0.70310974, + "learning_rate": 3.095405970878919e-06, + "loss": 0.72498363, + "num_input_tokens_seen": 120017720, + "router_z_loss_clip": 0.85888672, + "router_z_loss_mlp": 0.18334961, + "step": 5589, + "time_per_iteration": 2.5553271770477295 + }, + { + "auxiliary_loss_clip": 0.01145613, + "auxiliary_loss_mlp": 0.01038838, + "balance_loss_clip": 1.05570638, + "balance_loss_mlp": 1.02250659, + "epoch": 0.3360889824139486, + "flos": 23696913265920.0, + "grad_norm": 4.9945889240520795, + "language_loss": 0.67076218, + "learning_rate": 3.0950800981012567e-06, + "loss": 0.69260669, + "num_input_tokens_seen": 120036335, + "router_z_loss_clip": 0.89990234, + "router_z_loss_mlp": 0.16333008, + "step": 5590, + "time_per_iteration": 2.5075063705444336 + }, + { + "auxiliary_loss_clip": 0.01136152, + "auxiliary_loss_mlp": 0.0104164, + "balance_loss_clip": 1.05066955, + "balance_loss_mlp": 1.02510619, + "epoch": 0.33614910566661654, + "flos": 19318109886720.0, + "grad_norm": 2.0585648131631524, + "language_loss": 0.73012209, + "learning_rate": 3.094754183798047e-06, + "loss": 0.75190002, + "num_input_tokens_seen": 120056120, + "router_z_loss_clip": 0.85498047, + "router_z_loss_mlp": 0.1652832, + "step": 5591, + "time_per_iteration": 2.5085649490356445 + }, + { + "auxiliary_loss_clip": 0.01130096, + "auxiliary_loss_mlp": 0.01034485, + "balance_loss_clip": 1.04636037, + "balance_loss_mlp": 1.01957226, + "epoch": 0.3362092289192845, + "flos": 16472261191680.0, + "grad_norm": 2.5775026055609693, + "language_loss": 0.695822, + "learning_rate": 3.0944282279816493e-06, + "loss": 0.7174679, + "num_input_tokens_seen": 120073650, + "router_z_loss_clip": 0.83740234, + "router_z_loss_mlp": 0.14904785, + "step": 5592, + "time_per_iteration": 2.5027835369110107 + }, + { + "auxiliary_loss_clip": 0.01138093, + "auxiliary_loss_mlp": 0.01035305, + "balance_loss_clip": 1.05214882, + "balance_loss_mlp": 1.02086258, + "epoch": 0.33626935217195253, + "flos": 24243581329920.0, + "grad_norm": 2.238525654047721, + "language_loss": 0.76795852, + "learning_rate": 3.094102230664423e-06, + "loss": 0.78969252, + "num_input_tokens_seen": 120093260, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.14453125, + "step": 5593, + "time_per_iteration": 2.629727363586426 + }, + { + "auxiliary_loss_clip": 0.011352, + "auxiliary_loss_mlp": 0.01034098, + "balance_loss_clip": 1.04629338, + "balance_loss_mlp": 1.01644325, + "epoch": 0.3363294754246205, + "flos": 19718765164800.0, + "grad_norm": 2.4521743990577245, + "language_loss": 0.7298075, + "learning_rate": 3.093776191858731e-06, + "loss": 0.75150049, + "num_input_tokens_seen": 120111830, + "router_z_loss_clip": 0.88964844, + "router_z_loss_mlp": 0.17663574, + "step": 5594, + "time_per_iteration": 3.9841818809509277 + }, + { + "auxiliary_loss_clip": 0.01138914, + "auxiliary_loss_mlp": 0.01037642, + "balance_loss_clip": 1.05074394, + "balance_loss_mlp": 1.02129841, + "epoch": 0.33638959867728846, + "flos": 22596286677120.0, + "grad_norm": 2.0507347329989605, + "language_loss": 0.79794347, + "learning_rate": 3.0934501115769363e-06, + "loss": 0.81970894, + "num_input_tokens_seen": 120130470, + "router_z_loss_clip": 0.88183594, + "router_z_loss_mlp": 0.16345215, + "step": 5595, + "time_per_iteration": 2.636387348175049 + }, + { + "auxiliary_loss_clip": 0.01139284, + "auxiliary_loss_mlp": 0.01038874, + "balance_loss_clip": 1.05116212, + "balance_loss_mlp": 1.02427101, + "epoch": 0.3364497219299564, + "flos": 20994742972800.0, + "grad_norm": 1.7477457761215776, + "language_loss": 0.81713718, + "learning_rate": 3.0931239898314037e-06, + "loss": 0.83891875, + "num_input_tokens_seen": 120150735, + "router_z_loss_clip": 0.88134766, + "router_z_loss_mlp": 0.14599609, + "step": 5596, + "time_per_iteration": 3.864307403564453 + }, + { + "auxiliary_loss_clip": 0.01144855, + "auxiliary_loss_mlp": 0.01039106, + "balance_loss_clip": 1.05494857, + "balance_loss_mlp": 1.02431822, + "epoch": 0.3365098451826244, + "flos": 25228610974080.0, + "grad_norm": 1.6652538610387455, + "language_loss": 0.7585296, + "learning_rate": 3.0927978266344995e-06, + "loss": 0.78036928, + "num_input_tokens_seen": 120173230, + "router_z_loss_clip": 0.89941406, + "router_z_loss_mlp": 0.14788818, + "step": 5597, + "time_per_iteration": 2.5211222171783447 + }, + { + "auxiliary_loss_clip": 0.01140581, + "auxiliary_loss_mlp": 0.01036751, + "balance_loss_clip": 1.05695915, + "balance_loss_mlp": 1.02182651, + "epoch": 0.33656996843529235, + "flos": 24571697091840.0, + "grad_norm": 1.788844780148863, + "language_loss": 0.78792173, + "learning_rate": 3.0924716219985916e-06, + "loss": 0.80969501, + "num_input_tokens_seen": 120191860, + "router_z_loss_clip": 0.83740234, + "router_z_loss_mlp": 0.14929199, + "step": 5598, + "time_per_iteration": 2.5645601749420166 + }, + { + "auxiliary_loss_clip": 0.01147998, + "auxiliary_loss_mlp": 0.01037517, + "balance_loss_clip": 1.05627823, + "balance_loss_mlp": 1.02106571, + "epoch": 0.3366300916879603, + "flos": 44091120752640.0, + "grad_norm": 1.5786222692879046, + "language_loss": 0.64834023, + "learning_rate": 3.0921453759360514e-06, + "loss": 0.6701954, + "num_input_tokens_seen": 120219195, + "router_z_loss_clip": 0.91650391, + "router_z_loss_mlp": 0.16455078, + "step": 5599, + "time_per_iteration": 2.969515323638916 + }, + { + "auxiliary_loss_clip": 0.01146069, + "auxiliary_loss_mlp": 0.01047088, + "balance_loss_clip": 1.05335677, + "balance_loss_mlp": 1.02945673, + "epoch": 0.3366902149406283, + "flos": 13879869840000.0, + "grad_norm": 2.6359026728898027, + "language_loss": 0.82343769, + "learning_rate": 3.091819088459249e-06, + "loss": 0.84536922, + "num_input_tokens_seen": 120232950, + "router_z_loss_clip": 0.92626953, + "router_z_loss_mlp": 0.17626953, + "step": 5600, + "time_per_iteration": 2.5046112537384033 + }, + { + "auxiliary_loss_clip": 0.01142132, + "auxiliary_loss_mlp": 0.01046328, + "balance_loss_clip": 1.0517993, + "balance_loss_mlp": 1.02898312, + "epoch": 0.33675033819329625, + "flos": 16253098358400.0, + "grad_norm": 2.142957557053804, + "language_loss": 0.8268162, + "learning_rate": 3.0914927595805573e-06, + "loss": 0.84870082, + "num_input_tokens_seen": 120248865, + "router_z_loss_clip": 0.90332031, + "router_z_loss_mlp": 0.17358398, + "step": 5601, + "time_per_iteration": 2.425103187561035 + }, + { + "auxiliary_loss_clip": 0.01135344, + "auxiliary_loss_mlp": 0.01040916, + "balance_loss_clip": 1.05327642, + "balance_loss_mlp": 1.02446485, + "epoch": 0.3368104614459642, + "flos": 17055809544960.0, + "grad_norm": 1.9957290020793266, + "language_loss": 0.83164728, + "learning_rate": 3.0911663893123507e-06, + "loss": 0.85340977, + "num_input_tokens_seen": 120267820, + "router_z_loss_clip": 0.81982422, + "router_z_loss_mlp": 0.16455078, + "step": 5602, + "time_per_iteration": 3.8999900817871094 + }, + { + "auxiliary_loss_clip": 0.01146832, + "auxiliary_loss_mlp": 0.01051151, + "balance_loss_clip": 1.05930901, + "balance_loss_mlp": 1.03595161, + "epoch": 0.3368705846986322, + "flos": 17858628472320.0, + "grad_norm": 1.8000352256570822, + "language_loss": 0.6980449, + "learning_rate": 3.0908399776670048e-06, + "loss": 0.7200247, + "num_input_tokens_seen": 120286540, + "router_z_loss_clip": 0.87451172, + "router_z_loss_mlp": 0.15197754, + "step": 5603, + "time_per_iteration": 2.6018621921539307 + }, + { + "auxiliary_loss_clip": 0.01141811, + "auxiliary_loss_mlp": 0.01043984, + "balance_loss_clip": 1.05164874, + "balance_loss_mlp": 1.02725923, + "epoch": 0.33693070795130015, + "flos": 22929502170240.0, + "grad_norm": 2.1073372057931086, + "language_loss": 0.83086485, + "learning_rate": 3.090513524656898e-06, + "loss": 0.85272282, + "num_input_tokens_seen": 120307305, + "router_z_loss_clip": 0.90136719, + "router_z_loss_mlp": 0.1673584, + "step": 5604, + "time_per_iteration": 2.513306140899658 + }, + { + "auxiliary_loss_clip": 0.01142815, + "auxiliary_loss_mlp": 0.01045219, + "balance_loss_clip": 1.05399477, + "balance_loss_mlp": 1.02883959, + "epoch": 0.3369908312039681, + "flos": 22017443005440.0, + "grad_norm": 1.823143308702107, + "language_loss": 0.73718518, + "learning_rate": 3.090187030294409e-06, + "loss": 0.75906551, + "num_input_tokens_seen": 120327845, + "router_z_loss_clip": 0.88769531, + "router_z_loss_mlp": 0.16375732, + "step": 5605, + "time_per_iteration": 2.497511386871338 + }, + { + "auxiliary_loss_clip": 0.01142228, + "auxiliary_loss_mlp": 0.01043337, + "balance_loss_clip": 1.05077052, + "balance_loss_mlp": 1.02686834, + "epoch": 0.33705095445663613, + "flos": 11801970944640.0, + "grad_norm": 3.944888330199194, + "language_loss": 0.83346057, + "learning_rate": 3.089860494591919e-06, + "loss": 0.85531628, + "num_input_tokens_seen": 120343255, + "router_z_loss_clip": 0.91455078, + "router_z_loss_mlp": 0.16485596, + "step": 5606, + "time_per_iteration": 2.4168293476104736 + }, + { + "auxiliary_loss_clip": 0.01149973, + "auxiliary_loss_mlp": 0.01042869, + "balance_loss_clip": 1.06233883, + "balance_loss_mlp": 1.02745557, + "epoch": 0.3371110777093041, + "flos": 25046400257280.0, + "grad_norm": 1.6856443875417115, + "language_loss": 0.67836273, + "learning_rate": 3.089533917561809e-06, + "loss": 0.70029116, + "num_input_tokens_seen": 120361745, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.1541748, + "step": 5607, + "time_per_iteration": 2.470445394515991 + }, + { + "auxiliary_loss_clip": 0.01143431, + "auxiliary_loss_mlp": 0.01041767, + "balance_loss_clip": 1.05202901, + "balance_loss_mlp": 1.02517343, + "epoch": 0.33717120096197206, + "flos": 26579031719040.0, + "grad_norm": 1.829141602329256, + "language_loss": 0.71277261, + "learning_rate": 3.089207299216464e-06, + "loss": 0.73462456, + "num_input_tokens_seen": 120380565, + "router_z_loss_clip": 0.91455078, + "router_z_loss_mlp": 0.16601562, + "step": 5608, + "time_per_iteration": 2.5530717372894287 + }, + { + "auxiliary_loss_clip": 0.0114095, + "auxiliary_loss_mlp": 0.0104128, + "balance_loss_clip": 1.05248237, + "balance_loss_mlp": 1.02641428, + "epoch": 0.33723132421464, + "flos": 15158541168000.0, + "grad_norm": 2.0401868785442794, + "language_loss": 0.79329622, + "learning_rate": 3.088880639568269e-06, + "loss": 0.81511849, + "num_input_tokens_seen": 120399235, + "router_z_loss_clip": 0.88476562, + "router_z_loss_mlp": 0.14868164, + "step": 5609, + "time_per_iteration": 2.4121625423431396 + }, + { + "auxiliary_loss_clip": 0.01144241, + "auxiliary_loss_mlp": 0.01044834, + "balance_loss_clip": 1.05385745, + "balance_loss_mlp": 1.0274055, + "epoch": 0.337291447467308, + "flos": 23436093634560.0, + "grad_norm": 1.9067574704980161, + "language_loss": 0.8265152, + "learning_rate": 3.0885539386296114e-06, + "loss": 0.84840596, + "num_input_tokens_seen": 120420095, + "router_z_loss_clip": 0.90429688, + "router_z_loss_mlp": 0.17443848, + "step": 5610, + "time_per_iteration": 2.45977783203125 + }, + { + "auxiliary_loss_clip": 0.01142711, + "auxiliary_loss_mlp": 0.01043932, + "balance_loss_clip": 1.05554485, + "balance_loss_mlp": 1.02675438, + "epoch": 0.33735157071997596, + "flos": 17238163916160.0, + "grad_norm": 1.8728485344862875, + "language_loss": 0.82226002, + "learning_rate": 3.088227196412879e-06, + "loss": 0.84412646, + "num_input_tokens_seen": 120437690, + "router_z_loss_clip": 0.87255859, + "router_z_loss_mlp": 0.17175293, + "step": 5611, + "time_per_iteration": 2.4196853637695312 + }, + { + "auxiliary_loss_clip": 0.01153891, + "auxiliary_loss_mlp": 0.01053814, + "balance_loss_clip": 1.05997515, + "balance_loss_mlp": 1.0358609, + "epoch": 0.3374116939726439, + "flos": 28257388657920.0, + "grad_norm": 1.6949313745580523, + "language_loss": 0.79611778, + "learning_rate": 3.0879004129304626e-06, + "loss": 0.81819487, + "num_input_tokens_seen": 120459240, + "router_z_loss_clip": 0.93994141, + "router_z_loss_mlp": 0.17919922, + "step": 5612, + "time_per_iteration": 2.649928092956543 + }, + { + "auxiliary_loss_clip": 0.01150394, + "auxiliary_loss_mlp": 0.01045654, + "balance_loss_clip": 1.05633068, + "balance_loss_mlp": 1.02927494, + "epoch": 0.3374718172253119, + "flos": 35919396731520.0, + "grad_norm": 2.125574799473138, + "language_loss": 0.70041502, + "learning_rate": 3.087573588194753e-06, + "loss": 0.72237551, + "num_input_tokens_seen": 120481090, + "router_z_loss_clip": 0.94042969, + "router_z_loss_mlp": 0.16381836, + "step": 5613, + "time_per_iteration": 2.7540981769561768 + }, + { + "auxiliary_loss_clip": 0.01151102, + "auxiliary_loss_mlp": 0.01040006, + "balance_loss_clip": 1.06049585, + "balance_loss_mlp": 1.02349508, + "epoch": 0.33753194047797985, + "flos": 18186672407040.0, + "grad_norm": 3.0013388505564706, + "language_loss": 0.79570681, + "learning_rate": 3.087246722218144e-06, + "loss": 0.81761789, + "num_input_tokens_seen": 120500045, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.16503906, + "step": 5614, + "time_per_iteration": 2.445892572402954 + }, + { + "auxiliary_loss_clip": 0.0115252, + "auxiliary_loss_mlp": 0.01046921, + "balance_loss_clip": 1.06265497, + "balance_loss_mlp": 1.02920628, + "epoch": 0.3375920637306478, + "flos": 23148916398720.0, + "grad_norm": 2.0138854507333455, + "language_loss": 0.91513038, + "learning_rate": 3.086919815013031e-06, + "loss": 0.93712473, + "num_input_tokens_seen": 120521125, + "router_z_loss_clip": 0.89794922, + "router_z_loss_mlp": 0.17712402, + "step": 5615, + "time_per_iteration": 2.4558918476104736 + }, + { + "auxiliary_loss_clip": 0.01142295, + "auxiliary_loss_mlp": 0.01049858, + "balance_loss_clip": 1.05279636, + "balance_loss_mlp": 1.03344321, + "epoch": 0.3376521869833158, + "flos": 23112215677440.0, + "grad_norm": 1.837720875161895, + "language_loss": 0.81097633, + "learning_rate": 3.086592866591809e-06, + "loss": 0.8328979, + "num_input_tokens_seen": 120539180, + "router_z_loss_clip": 0.89453125, + "router_z_loss_mlp": 0.16418457, + "step": 5616, + "time_per_iteration": 2.505790948867798 + }, + { + "auxiliary_loss_clip": 0.01151563, + "auxiliary_loss_mlp": 0.01049148, + "balance_loss_clip": 1.05380797, + "balance_loss_mlp": 1.03049183, + "epoch": 0.33771231023598375, + "flos": 19274585581440.0, + "grad_norm": 1.7634024684874343, + "language_loss": 0.84271097, + "learning_rate": 3.0862658769668774e-06, + "loss": 0.86471808, + "num_input_tokens_seen": 120556280, + "router_z_loss_clip": 0.97851562, + "router_z_loss_mlp": 0.18652344, + "step": 5617, + "time_per_iteration": 2.4650471210479736 + }, + { + "auxiliary_loss_clip": 0.01150012, + "auxiliary_loss_mlp": 0.01037777, + "balance_loss_clip": 1.0575372, + "balance_loss_mlp": 1.02201796, + "epoch": 0.3377724334886517, + "flos": 18150187167360.0, + "grad_norm": 1.7474806163904244, + "language_loss": 0.80386043, + "learning_rate": 3.0859388461506343e-06, + "loss": 0.82573831, + "num_input_tokens_seen": 120575395, + "router_z_loss_clip": 0.92480469, + "router_z_loss_mlp": 0.15771484, + "step": 5618, + "time_per_iteration": 2.497312068939209 + }, + { + "auxiliary_loss_clip": 0.01159259, + "auxiliary_loss_mlp": 0.01038422, + "balance_loss_clip": 1.06400812, + "balance_loss_mlp": 1.02222109, + "epoch": 0.3378325567413197, + "flos": 25775997310080.0, + "grad_norm": 1.9537683320950185, + "language_loss": 0.71029758, + "learning_rate": 3.085611774155481e-06, + "loss": 0.73227435, + "num_input_tokens_seen": 120596075, + "router_z_loss_clip": 0.95117188, + "router_z_loss_mlp": 0.16186523, + "step": 5619, + "time_per_iteration": 2.4611504077911377 + }, + { + "auxiliary_loss_clip": 0.0114298, + "auxiliary_loss_mlp": 0.01045432, + "balance_loss_clip": 1.05523038, + "balance_loss_mlp": 1.02983928, + "epoch": 0.3378926799939877, + "flos": 21317112558720.0, + "grad_norm": 2.9994361574452624, + "language_loss": 0.7044785, + "learning_rate": 3.085284660993821e-06, + "loss": 0.72636259, + "num_input_tokens_seen": 120614195, + "router_z_loss_clip": 0.87744141, + "router_z_loss_mlp": 0.15576172, + "step": 5620, + "time_per_iteration": 2.5128438472747803 + }, + { + "auxiliary_loss_clip": 0.01142749, + "auxiliary_loss_mlp": 0.01043873, + "balance_loss_clip": 1.05618811, + "balance_loss_mlp": 1.02794695, + "epoch": 0.33795280324665566, + "flos": 24900028335360.0, + "grad_norm": 1.7297202270682217, + "language_loss": 0.6807825, + "learning_rate": 3.084957506678058e-06, + "loss": 0.7026487, + "num_input_tokens_seen": 120634475, + "router_z_loss_clip": 0.86474609, + "router_z_loss_mlp": 0.15930176, + "step": 5621, + "time_per_iteration": 2.464953899383545 + }, + { + "auxiliary_loss_clip": 0.01150299, + "auxiliary_loss_mlp": 0.01042035, + "balance_loss_clip": 1.06365967, + "balance_loss_mlp": 1.02641845, + "epoch": 0.33801292649932363, + "flos": 24753943722240.0, + "grad_norm": 3.4190410625576626, + "language_loss": 0.82887161, + "learning_rate": 3.0846303112205975e-06, + "loss": 0.85079503, + "num_input_tokens_seen": 120654980, + "router_z_loss_clip": 0.86523438, + "router_z_loss_mlp": 0.15612793, + "step": 5622, + "time_per_iteration": 2.489082098007202 + }, + { + "auxiliary_loss_clip": 0.01143553, + "auxiliary_loss_mlp": 0.01041245, + "balance_loss_clip": 1.05730236, + "balance_loss_mlp": 1.0260694, + "epoch": 0.3380730497519916, + "flos": 26723967096960.0, + "grad_norm": 1.6031346422057964, + "language_loss": 0.73544115, + "learning_rate": 3.0843030746338464e-06, + "loss": 0.75728917, + "num_input_tokens_seen": 120676245, + "router_z_loss_clip": 0.86230469, + "router_z_loss_mlp": 0.15161133, + "step": 5623, + "time_per_iteration": 2.4725608825683594 + }, + { + "auxiliary_loss_clip": 0.01077319, + "auxiliary_loss_mlp": 0.01007185, + "balance_loss_clip": 1.0474143, + "balance_loss_mlp": 1.00525725, + "epoch": 0.33813317300465956, + "flos": 70035756416640.0, + "grad_norm": 0.742994490959183, + "language_loss": 0.54908496, + "learning_rate": 3.083975796930215e-06, + "loss": 0.56993008, + "num_input_tokens_seen": 120741965, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.01928711, + "step": 5624, + "time_per_iteration": 3.231135368347168 + }, + { + "auxiliary_loss_clip": 0.01149708, + "auxiliary_loss_mlp": 0.01043283, + "balance_loss_clip": 1.0598067, + "balance_loss_mlp": 1.02603304, + "epoch": 0.3381932962573275, + "flos": 24097317148800.0, + "grad_norm": 3.5986195251124036, + "language_loss": 0.72814655, + "learning_rate": 3.083648478122111e-06, + "loss": 0.75007641, + "num_input_tokens_seen": 120760410, + "router_z_loss_clip": 0.89794922, + "router_z_loss_mlp": 0.17260742, + "step": 5625, + "time_per_iteration": 2.493117094039917 + }, + { + "auxiliary_loss_clip": 0.01146386, + "auxiliary_loss_mlp": 0.01048487, + "balance_loss_clip": 1.05317783, + "balance_loss_mlp": 1.03079605, + "epoch": 0.3382534195099955, + "flos": 19278248768640.0, + "grad_norm": 2.1006600791403667, + "language_loss": 0.70618641, + "learning_rate": 3.0833211182219497e-06, + "loss": 0.72813517, + "num_input_tokens_seen": 120777705, + "router_z_loss_clip": 0.93115234, + "router_z_loss_mlp": 0.17687988, + "step": 5626, + "time_per_iteration": 4.296574115753174 + }, + { + "auxiliary_loss_clip": 0.01141762, + "auxiliary_loss_mlp": 0.01043129, + "balance_loss_clip": 1.05587733, + "balance_loss_mlp": 1.02561736, + "epoch": 0.33831354276266346, + "flos": 25226240676480.0, + "grad_norm": 1.780094554452308, + "language_loss": 0.80943739, + "learning_rate": 3.0829937172421425e-06, + "loss": 0.83128631, + "num_input_tokens_seen": 120798660, + "router_z_loss_clip": 0.85791016, + "router_z_loss_mlp": 0.17492676, + "step": 5627, + "time_per_iteration": 2.6210250854492188 + }, + { + "auxiliary_loss_clip": 0.01150669, + "auxiliary_loss_mlp": 0.01043929, + "balance_loss_clip": 1.05934238, + "balance_loss_mlp": 1.02680993, + "epoch": 0.3383736660153314, + "flos": 23112000195840.0, + "grad_norm": 2.296224718480286, + "language_loss": 0.80361843, + "learning_rate": 3.0826662751951055e-06, + "loss": 0.82556444, + "num_input_tokens_seen": 120816705, + "router_z_loss_clip": 0.91357422, + "router_z_loss_mlp": 0.17114258, + "step": 5628, + "time_per_iteration": 2.4826600551605225 + }, + { + "auxiliary_loss_clip": 0.01150515, + "auxiliary_loss_mlp": 0.01045385, + "balance_loss_clip": 1.05908179, + "balance_loss_mlp": 1.0286839, + "epoch": 0.3384337892679994, + "flos": 23477139901440.0, + "grad_norm": 2.5014473659303227, + "language_loss": 0.77416623, + "learning_rate": 3.082338792093254e-06, + "loss": 0.79612517, + "num_input_tokens_seen": 120835375, + "router_z_loss_clip": 0.91503906, + "router_z_loss_mlp": 0.16699219, + "step": 5629, + "time_per_iteration": 2.504014730453491 + }, + { + "auxiliary_loss_clip": 0.01152559, + "auxiliary_loss_mlp": 0.01041759, + "balance_loss_clip": 1.05794489, + "balance_loss_mlp": 1.02374685, + "epoch": 0.33849391252066735, + "flos": 19425805839360.0, + "grad_norm": 2.6430717843904765, + "language_loss": 0.84488595, + "learning_rate": 3.0820112679490074e-06, + "loss": 0.8668291, + "num_input_tokens_seen": 120854260, + "router_z_loss_clip": 0.94726562, + "router_z_loss_mlp": 0.18029785, + "step": 5630, + "time_per_iteration": 2.469987154006958 + }, + { + "auxiliary_loss_clip": 0.01149173, + "auxiliary_loss_mlp": 0.01042111, + "balance_loss_clip": 1.05837774, + "balance_loss_mlp": 1.02643538, + "epoch": 0.3385540357733353, + "flos": 21064840364160.0, + "grad_norm": 3.2967527658331344, + "language_loss": 0.71834922, + "learning_rate": 3.0816837027747857e-06, + "loss": 0.74026203, + "num_input_tokens_seen": 120871590, + "router_z_loss_clip": 0.90771484, + "router_z_loss_mlp": 0.15686035, + "step": 5631, + "time_per_iteration": 2.5413260459899902 + }, + { + "auxiliary_loss_clip": 0.01095924, + "auxiliary_loss_mlp": 0.010103, + "balance_loss_clip": 1.06479299, + "balance_loss_mlp": 1.00829411, + "epoch": 0.3386141590260033, + "flos": 69208013450880.0, + "grad_norm": 0.8555686348125343, + "language_loss": 0.56145018, + "learning_rate": 3.0813560965830084e-06, + "loss": 0.58251244, + "num_input_tokens_seen": 120925550, + "router_z_loss_clip": 0.31152344, + "router_z_loss_mlp": 0.02005005, + "step": 5632, + "time_per_iteration": 3.098846197128296 + }, + { + "auxiliary_loss_clip": 0.01146658, + "auxiliary_loss_mlp": 0.01045592, + "balance_loss_clip": 1.05671704, + "balance_loss_mlp": 1.02730525, + "epoch": 0.3386742822786713, + "flos": 25519487310720.0, + "grad_norm": 1.7665342303329474, + "language_loss": 0.80284733, + "learning_rate": 3.0810284493861005e-06, + "loss": 0.82476985, + "num_input_tokens_seen": 120947620, + "router_z_loss_clip": 0.90039062, + "router_z_loss_mlp": 0.18286133, + "step": 5633, + "time_per_iteration": 2.493216037750244 + }, + { + "auxiliary_loss_clip": 0.01144706, + "auxiliary_loss_mlp": 0.01033363, + "balance_loss_clip": 1.05618775, + "balance_loss_mlp": 1.01744306, + "epoch": 0.33873440553133927, + "flos": 23623116773760.0, + "grad_norm": 2.0916440101723226, + "language_loss": 0.59194887, + "learning_rate": 3.0807007611964855e-06, + "loss": 0.61372948, + "num_input_tokens_seen": 120965205, + "router_z_loss_clip": 0.88574219, + "router_z_loss_mlp": 0.15936279, + "step": 5634, + "time_per_iteration": 2.444373846054077 + }, + { + "auxiliary_loss_clip": 0.01152077, + "auxiliary_loss_mlp": 0.01043126, + "balance_loss_clip": 1.06137896, + "balance_loss_mlp": 1.02627027, + "epoch": 0.33879452878400723, + "flos": 17088882992640.0, + "grad_norm": 1.8423581506687385, + "language_loss": 0.9270401, + "learning_rate": 3.080373032026589e-06, + "loss": 0.94899213, + "num_input_tokens_seen": 120983560, + "router_z_loss_clip": 0.90771484, + "router_z_loss_mlp": 0.16870117, + "step": 5635, + "time_per_iteration": 2.448404312133789 + }, + { + "auxiliary_loss_clip": 0.0114658, + "auxiliary_loss_mlp": 0.01041139, + "balance_loss_clip": 1.05899024, + "balance_loss_mlp": 1.02344847, + "epoch": 0.3388546520366752, + "flos": 15742053607680.0, + "grad_norm": 1.7260622671144363, + "language_loss": 0.75561285, + "learning_rate": 3.0800452618888386e-06, + "loss": 0.77749002, + "num_input_tokens_seen": 121001400, + "router_z_loss_clip": 0.87548828, + "router_z_loss_mlp": 0.17687988, + "step": 5636, + "time_per_iteration": 2.4051921367645264 + }, + { + "auxiliary_loss_clip": 0.0113997, + "auxiliary_loss_mlp": 0.01038338, + "balance_loss_clip": 1.05299687, + "balance_loss_mlp": 1.02223253, + "epoch": 0.33891477528934316, + "flos": 22418744728320.0, + "grad_norm": 1.6585007953338404, + "language_loss": 0.83464479, + "learning_rate": 3.0797174507956637e-06, + "loss": 0.85642785, + "num_input_tokens_seen": 121021760, + "router_z_loss_clip": 0.86962891, + "router_z_loss_mlp": 0.16113281, + "step": 5637, + "time_per_iteration": 3.9953324794769287 + }, + { + "auxiliary_loss_clip": 0.01145457, + "auxiliary_loss_mlp": 0.01045992, + "balance_loss_clip": 1.05606985, + "balance_loss_mlp": 1.0275743, + "epoch": 0.3389748985420111, + "flos": 17274828723840.0, + "grad_norm": 2.444636496142331, + "language_loss": 0.69729316, + "learning_rate": 3.079389598759495e-06, + "loss": 0.7192077, + "num_input_tokens_seen": 121041070, + "router_z_loss_clip": 0.89404297, + "router_z_loss_mlp": 0.1842041, + "step": 5638, + "time_per_iteration": 2.58418869972229 + }, + { + "auxiliary_loss_clip": 0.01141443, + "auxiliary_loss_mlp": 0.0105272, + "balance_loss_clip": 1.05301762, + "balance_loss_mlp": 1.03471899, + "epoch": 0.3390350217946791, + "flos": 27744979190400.0, + "grad_norm": 1.7195104658307818, + "language_loss": 0.81026715, + "learning_rate": 3.079061705792765e-06, + "loss": 0.83220875, + "num_input_tokens_seen": 121060890, + "router_z_loss_clip": 0.88476562, + "router_z_loss_mlp": 0.18005371, + "step": 5639, + "time_per_iteration": 2.8572463989257812 + }, + { + "auxiliary_loss_clip": 0.01148855, + "auxiliary_loss_mlp": 0.01042641, + "balance_loss_clip": 1.05643499, + "balance_loss_mlp": 1.02565384, + "epoch": 0.33909514504734706, + "flos": 20339804338560.0, + "grad_norm": 2.005330585717586, + "language_loss": 0.67217129, + "learning_rate": 3.078733771907907e-06, + "loss": 0.69408625, + "num_input_tokens_seen": 121079135, + "router_z_loss_clip": 0.92382812, + "router_z_loss_mlp": 0.16992188, + "step": 5640, + "time_per_iteration": 4.006258726119995 + }, + { + "auxiliary_loss_clip": 0.01144956, + "auxiliary_loss_mlp": 0.01038256, + "balance_loss_clip": 1.05737567, + "balance_loss_mlp": 1.02199614, + "epoch": 0.339155268300015, + "flos": 14830030356480.0, + "grad_norm": 1.7737470645915405, + "language_loss": 0.6976698, + "learning_rate": 3.0784057971173554e-06, + "loss": 0.71950197, + "num_input_tokens_seen": 121097685, + "router_z_loss_clip": 0.87646484, + "router_z_loss_mlp": 0.16271973, + "step": 5641, + "time_per_iteration": 2.4973106384277344 + }, + { + "auxiliary_loss_clip": 0.0114439, + "auxiliary_loss_mlp": 0.01051215, + "balance_loss_clip": 1.05381298, + "balance_loss_mlp": 1.0345614, + "epoch": 0.339215391552683, + "flos": 26067951054720.0, + "grad_norm": 1.9407559880044043, + "language_loss": 0.87611067, + "learning_rate": 3.0780777814335483e-06, + "loss": 0.89806676, + "num_input_tokens_seen": 121115640, + "router_z_loss_clip": 0.90478516, + "router_z_loss_mlp": 0.16662598, + "step": 5642, + "time_per_iteration": 2.598093032836914 + }, + { + "auxiliary_loss_clip": 0.01142069, + "auxiliary_loss_mlp": 0.010444, + "balance_loss_clip": 1.05574346, + "balance_loss_mlp": 1.02989805, + "epoch": 0.33927551480535095, + "flos": 14574705505920.0, + "grad_norm": 1.908811288805054, + "language_loss": 0.83898938, + "learning_rate": 3.077749724868924e-06, + "loss": 0.86085403, + "num_input_tokens_seen": 121132485, + "router_z_loss_clip": 0.86279297, + "router_z_loss_mlp": 0.14483643, + "step": 5643, + "time_per_iteration": 2.4847846031188965 + }, + { + "auxiliary_loss_clip": 0.01143859, + "auxiliary_loss_mlp": 0.01048778, + "balance_loss_clip": 1.05313563, + "balance_loss_mlp": 1.03325677, + "epoch": 0.3393356380580189, + "flos": 23805578885760.0, + "grad_norm": 1.567832287463894, + "language_loss": 0.76896644, + "learning_rate": 3.077421627435922e-06, + "loss": 0.79089284, + "num_input_tokens_seen": 121152935, + "router_z_loss_clip": 0.90722656, + "router_z_loss_mlp": 0.15527344, + "step": 5644, + "time_per_iteration": 2.455538034439087 + }, + { + "auxiliary_loss_clip": 0.01142742, + "auxiliary_loss_mlp": 0.01046695, + "balance_loss_clip": 1.0535233, + "balance_loss_mlp": 1.0301609, + "epoch": 0.3393957613106869, + "flos": 17347871030400.0, + "grad_norm": 7.416908636426381, + "language_loss": 0.63384068, + "learning_rate": 3.0770934891469832e-06, + "loss": 0.65573502, + "num_input_tokens_seen": 121169835, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.1652832, + "step": 5645, + "time_per_iteration": 2.4211902618408203 + }, + { + "auxiliary_loss_clip": 0.01142317, + "auxiliary_loss_mlp": 0.01039294, + "balance_loss_clip": 1.05503345, + "balance_loss_mlp": 1.02497149, + "epoch": 0.3394558845633549, + "flos": 28433960939520.0, + "grad_norm": 1.924692122581499, + "language_loss": 0.76430249, + "learning_rate": 3.076765310014552e-06, + "loss": 0.78611863, + "num_input_tokens_seen": 121190290, + "router_z_loss_clip": 0.87255859, + "router_z_loss_mlp": 0.14324951, + "step": 5646, + "time_per_iteration": 2.4984333515167236 + }, + { + "auxiliary_loss_clip": 0.01155177, + "auxiliary_loss_mlp": 0.01042573, + "balance_loss_clip": 1.06002092, + "balance_loss_mlp": 1.02528799, + "epoch": 0.33951600781602287, + "flos": 22086929865600.0, + "grad_norm": 2.3786585904279884, + "language_loss": 0.78963596, + "learning_rate": 3.0764370900510727e-06, + "loss": 0.81161344, + "num_input_tokens_seen": 121209060, + "router_z_loss_clip": 0.95117188, + "router_z_loss_mlp": 0.17272949, + "step": 5647, + "time_per_iteration": 3.902188539505005 + }, + { + "auxiliary_loss_clip": 0.01147046, + "auxiliary_loss_mlp": 0.01041877, + "balance_loss_clip": 1.05711651, + "balance_loss_mlp": 1.02566409, + "epoch": 0.33957613106869083, + "flos": 23878262056320.0, + "grad_norm": 2.0626208485425015, + "language_loss": 0.77371383, + "learning_rate": 3.0761088292689904e-06, + "loss": 0.7956031, + "num_input_tokens_seen": 121227480, + "router_z_loss_clip": 0.89941406, + "router_z_loss_mlp": 0.16223145, + "step": 5648, + "time_per_iteration": 2.473952054977417 + }, + { + "auxiliary_loss_clip": 0.01082676, + "auxiliary_loss_mlp": 0.01004288, + "balance_loss_clip": 1.04983282, + "balance_loss_mlp": 1.00201738, + "epoch": 0.3396362543213588, + "flos": 71242642414080.0, + "grad_norm": 0.7761043410932692, + "language_loss": 0.56356543, + "learning_rate": 3.075780527680754e-06, + "loss": 0.58443511, + "num_input_tokens_seen": 121291305, + "router_z_loss_clip": 0.32861328, + "router_z_loss_mlp": 0.02270508, + "step": 5649, + "time_per_iteration": 3.0922014713287354 + }, + { + "auxiliary_loss_clip": 0.01151544, + "auxiliary_loss_mlp": 0.01045194, + "balance_loss_clip": 1.06392956, + "balance_loss_mlp": 1.02969098, + "epoch": 0.33969637757402676, + "flos": 25921615046400.0, + "grad_norm": 1.8040727289241938, + "language_loss": 0.85374659, + "learning_rate": 3.0754521852988117e-06, + "loss": 0.875714, + "num_input_tokens_seen": 121312740, + "router_z_loss_clip": 0.87646484, + "router_z_loss_mlp": 0.15509033, + "step": 5650, + "time_per_iteration": 2.4736249446868896 + }, + { + "auxiliary_loss_clip": 0.01141637, + "auxiliary_loss_mlp": 0.01028908, + "balance_loss_clip": 1.05556822, + "balance_loss_mlp": 1.0143404, + "epoch": 0.33975650082669473, + "flos": 35261728663680.0, + "grad_norm": 1.9065331484646404, + "language_loss": 0.70894825, + "learning_rate": 3.0751238021356152e-06, + "loss": 0.73065376, + "num_input_tokens_seen": 121334220, + "router_z_loss_clip": 0.86132812, + "router_z_loss_mlp": 0.14569092, + "step": 5651, + "time_per_iteration": 2.9294495582580566 + }, + { + "auxiliary_loss_clip": 0.01142988, + "auxiliary_loss_mlp": 0.01041223, + "balance_loss_clip": 1.05631304, + "balance_loss_mlp": 1.02509415, + "epoch": 0.3398166240793627, + "flos": 16647001879680.0, + "grad_norm": 1.994237400740013, + "language_loss": 0.8080914, + "learning_rate": 3.074795378203616e-06, + "loss": 0.82993346, + "num_input_tokens_seen": 121351870, + "router_z_loss_clip": 0.86669922, + "router_z_loss_mlp": 0.16137695, + "step": 5652, + "time_per_iteration": 2.440117359161377 + }, + { + "auxiliary_loss_clip": 0.01146131, + "auxiliary_loss_mlp": 0.01041079, + "balance_loss_clip": 1.05643344, + "balance_loss_mlp": 1.02506912, + "epoch": 0.33987674733203066, + "flos": 24062196625920.0, + "grad_norm": 2.2782718192843645, + "language_loss": 0.77062821, + "learning_rate": 3.0744669135152685e-06, + "loss": 0.79250038, + "num_input_tokens_seen": 121373400, + "router_z_loss_clip": 0.89648438, + "router_z_loss_mlp": 0.16003418, + "step": 5653, + "time_per_iteration": 2.5240249633789062 + }, + { + "auxiliary_loss_clip": 0.0114352, + "auxiliary_loss_mlp": 0.0103564, + "balance_loss_clip": 1.05505753, + "balance_loss_mlp": 1.02014256, + "epoch": 0.3399368705846986, + "flos": 13250678279040.0, + "grad_norm": 3.63909600020138, + "language_loss": 0.85490167, + "learning_rate": 3.0741384080830278e-06, + "loss": 0.87669331, + "num_input_tokens_seen": 121385225, + "router_z_loss_clip": 0.88476562, + "router_z_loss_mlp": 0.1550293, + "step": 5654, + "time_per_iteration": 2.4245152473449707 + }, + { + "auxiliary_loss_clip": 0.01144886, + "auxiliary_loss_mlp": 0.01040287, + "balance_loss_clip": 1.05756378, + "balance_loss_mlp": 1.02431297, + "epoch": 0.3399969938373666, + "flos": 27012832272000.0, + "grad_norm": 2.242549347507802, + "language_loss": 0.6580683, + "learning_rate": 3.073809861919351e-06, + "loss": 0.67992002, + "num_input_tokens_seen": 121404735, + "router_z_loss_clip": 0.87255859, + "router_z_loss_mlp": 0.15979004, + "step": 5655, + "time_per_iteration": 2.452134132385254 + }, + { + "auxiliary_loss_clip": 0.0114966, + "auxiliary_loss_mlp": 0.01044007, + "balance_loss_clip": 1.05974376, + "balance_loss_mlp": 1.02900457, + "epoch": 0.34005711709003456, + "flos": 28550096588160.0, + "grad_norm": 1.4311897165510545, + "language_loss": 0.76445991, + "learning_rate": 3.073481275036697e-06, + "loss": 0.78639656, + "num_input_tokens_seen": 121426780, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.14996338, + "step": 5656, + "time_per_iteration": 2.5209553241729736 + }, + { + "auxiliary_loss_clip": 0.01145727, + "auxiliary_loss_mlp": 0.01041147, + "balance_loss_clip": 1.05363226, + "balance_loss_mlp": 1.02474332, + "epoch": 0.3401172403427025, + "flos": 21617003208960.0, + "grad_norm": 8.273858936932628, + "language_loss": 0.83067346, + "learning_rate": 3.073152647447525e-06, + "loss": 0.85254216, + "num_input_tokens_seen": 121447245, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.1640625, + "step": 5657, + "time_per_iteration": 2.46239972114563 + }, + { + "auxiliary_loss_clip": 0.01154964, + "auxiliary_loss_mlp": 0.01041509, + "balance_loss_clip": 1.06809866, + "balance_loss_mlp": 1.02691221, + "epoch": 0.3401773635953705, + "flos": 25885776251520.0, + "grad_norm": 1.9032047711300855, + "language_loss": 0.85412771, + "learning_rate": 3.0728239791642976e-06, + "loss": 0.87609243, + "num_input_tokens_seen": 121468165, + "router_z_loss_clip": 0.86865234, + "router_z_loss_mlp": 0.14587402, + "step": 5658, + "time_per_iteration": 2.5125417709350586 + }, + { + "auxiliary_loss_clip": 0.01079095, + "auxiliary_loss_mlp": 0.01005137, + "balance_loss_clip": 1.04749191, + "balance_loss_mlp": 1.0032146, + "epoch": 0.3402374868480385, + "flos": 65507995336320.0, + "grad_norm": 0.8178810186522942, + "language_loss": 0.60014546, + "learning_rate": 3.072495270199477e-06, + "loss": 0.62098777, + "num_input_tokens_seen": 121523795, + "router_z_loss_clip": 0.31689453, + "router_z_loss_mlp": 0.01919556, + "step": 5659, + "time_per_iteration": 3.01814341545105 + }, + { + "auxiliary_loss_clip": 0.01135409, + "auxiliary_loss_mlp": 0.01037954, + "balance_loss_clip": 1.051355, + "balance_loss_mlp": 1.02250493, + "epoch": 0.34029761010070647, + "flos": 24060580513920.0, + "grad_norm": 1.921253209436872, + "language_loss": 0.67527986, + "learning_rate": 3.0721665205655284e-06, + "loss": 0.6970135, + "num_input_tokens_seen": 121542950, + "router_z_loss_clip": 0.84082031, + "router_z_loss_mlp": 0.15441895, + "step": 5660, + "time_per_iteration": 2.514860153198242 + }, + { + "auxiliary_loss_clip": 0.01141827, + "auxiliary_loss_mlp": 0.01043339, + "balance_loss_clip": 1.05408382, + "balance_loss_mlp": 1.02716863, + "epoch": 0.34035773335337444, + "flos": 27599720590080.0, + "grad_norm": 2.4388297847964693, + "language_loss": 0.67353541, + "learning_rate": 3.071837730274918e-06, + "loss": 0.69538713, + "num_input_tokens_seen": 121562765, + "router_z_loss_clip": 0.87695312, + "router_z_loss_mlp": 0.16168213, + "step": 5661, + "time_per_iteration": 2.5186045169830322 + }, + { + "auxiliary_loss_clip": 0.0113378, + "auxiliary_loss_mlp": 0.01037631, + "balance_loss_clip": 1.04896271, + "balance_loss_mlp": 1.0222888, + "epoch": 0.3404178566060424, + "flos": 20812783651200.0, + "grad_norm": 1.9802302362070061, + "language_loss": 0.78588426, + "learning_rate": 3.071508899340113e-06, + "loss": 0.80759835, + "num_input_tokens_seen": 121581610, + "router_z_loss_clip": 0.84765625, + "router_z_loss_mlp": 0.15344238, + "step": 5662, + "time_per_iteration": 2.4988784790039062 + }, + { + "auxiliary_loss_clip": 0.01140436, + "auxiliary_loss_mlp": 0.01040426, + "balance_loss_clip": 1.0535233, + "balance_loss_mlp": 1.02388, + "epoch": 0.34047797985871037, + "flos": 26833566470400.0, + "grad_norm": 1.8971332932529632, + "language_loss": 0.7388382, + "learning_rate": 3.0711800277735833e-06, + "loss": 0.76064682, + "num_input_tokens_seen": 121601885, + "router_z_loss_clip": 0.87011719, + "router_z_loss_mlp": 0.16540527, + "step": 5663, + "time_per_iteration": 2.5040476322174072 + }, + { + "auxiliary_loss_clip": 0.01129849, + "auxiliary_loss_mlp": 0.01034821, + "balance_loss_clip": 1.0475173, + "balance_loss_mlp": 1.02086711, + "epoch": 0.34053810311137833, + "flos": 19682639061120.0, + "grad_norm": 1.8464378431662498, + "language_loss": 0.86238062, + "learning_rate": 3.0708511155877997e-06, + "loss": 0.8840273, + "num_input_tokens_seen": 121621335, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.13964844, + "step": 5664, + "time_per_iteration": 2.771418809890747 + }, + { + "auxiliary_loss_clip": 0.01137265, + "auxiliary_loss_mlp": 0.01035486, + "balance_loss_clip": 1.05087817, + "balance_loss_mlp": 1.02101433, + "epoch": 0.3405982263640463, + "flos": 21725740656000.0, + "grad_norm": 1.896944343058256, + "language_loss": 0.68627858, + "learning_rate": 3.070522162795235e-06, + "loss": 0.70800602, + "num_input_tokens_seen": 121641310, + "router_z_loss_clip": 0.86523438, + "router_z_loss_mlp": 0.14477539, + "step": 5665, + "time_per_iteration": 2.624218702316284 + }, + { + "auxiliary_loss_clip": 0.01138732, + "auxiliary_loss_mlp": 0.01042872, + "balance_loss_clip": 1.05116796, + "balance_loss_mlp": 1.02514505, + "epoch": 0.34065834961671426, + "flos": 18041629288320.0, + "grad_norm": 2.7274007172024515, + "language_loss": 0.73502743, + "learning_rate": 3.0701931694083626e-06, + "loss": 0.75684345, + "num_input_tokens_seen": 121659625, + "router_z_loss_clip": 0.87597656, + "router_z_loss_mlp": 0.17736816, + "step": 5666, + "time_per_iteration": 2.4577524662017822 + }, + { + "auxiliary_loss_clip": 0.01148826, + "auxiliary_loss_mlp": 0.01037425, + "balance_loss_clip": 1.05765104, + "balance_loss_mlp": 1.02195215, + "epoch": 0.3407184728693822, + "flos": 21397337585280.0, + "grad_norm": 4.733067843115504, + "language_loss": 0.73032302, + "learning_rate": 3.0698641354396576e-06, + "loss": 0.75218558, + "num_input_tokens_seen": 121679205, + "router_z_loss_clip": 0.91210938, + "router_z_loss_mlp": 0.15478516, + "step": 5667, + "time_per_iteration": 2.4606611728668213 + }, + { + "auxiliary_loss_clip": 0.01076858, + "auxiliary_loss_mlp": 0.01009074, + "balance_loss_clip": 1.04500854, + "balance_loss_mlp": 1.00691009, + "epoch": 0.3407785961220502, + "flos": 68688101018880.0, + "grad_norm": 0.8380902313208607, + "language_loss": 0.63240623, + "learning_rate": 3.069535060901597e-06, + "loss": 0.6532656, + "num_input_tokens_seen": 121751085, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 0.02166748, + "step": 5668, + "time_per_iteration": 3.30633544921875 + }, + { + "auxiliary_loss_clip": 0.01142568, + "auxiliary_loss_mlp": 0.01049832, + "balance_loss_clip": 1.05503464, + "balance_loss_mlp": 1.03299999, + "epoch": 0.34083871937471816, + "flos": 14064379027200.0, + "grad_norm": 2.1405954908525984, + "language_loss": 0.71843201, + "learning_rate": 3.0692059458066596e-06, + "loss": 0.74035597, + "num_input_tokens_seen": 121768565, + "router_z_loss_clip": 0.87451172, + "router_z_loss_mlp": 0.16845703, + "step": 5669, + "time_per_iteration": 2.4621641635894775 + }, + { + "auxiliary_loss_clip": 0.01152032, + "auxiliary_loss_mlp": 0.01034819, + "balance_loss_clip": 1.06264162, + "balance_loss_mlp": 1.01992965, + "epoch": 0.3408988426273861, + "flos": 17085435287040.0, + "grad_norm": 2.8434101463507684, + "language_loss": 0.80740827, + "learning_rate": 3.0688767901673265e-06, + "loss": 0.8292768, + "num_input_tokens_seen": 121784925, + "router_z_loss_clip": 0.89306641, + "router_z_loss_mlp": 0.14880371, + "step": 5670, + "time_per_iteration": 3.9475901126861572 + }, + { + "auxiliary_loss_clip": 0.01154813, + "auxiliary_loss_mlp": 0.01039176, + "balance_loss_clip": 1.0647347, + "balance_loss_mlp": 1.02397728, + "epoch": 0.3409589658800541, + "flos": 24024562151040.0, + "grad_norm": 1.8981306702376224, + "language_loss": 0.77266276, + "learning_rate": 3.068547593996078e-06, + "loss": 0.79460263, + "num_input_tokens_seen": 121804425, + "router_z_loss_clip": 0.90039062, + "router_z_loss_mlp": 0.15197754, + "step": 5671, + "time_per_iteration": 2.5049548149108887 + }, + { + "auxiliary_loss_clip": 0.01163081, + "auxiliary_loss_mlp": 0.01042473, + "balance_loss_clip": 1.07448888, + "balance_loss_mlp": 1.02571177, + "epoch": 0.34101908913272205, + "flos": 21142012734720.0, + "grad_norm": 2.3477150754102776, + "language_loss": 0.73910272, + "learning_rate": 3.0682183573053974e-06, + "loss": 0.76115829, + "num_input_tokens_seen": 121825145, + "router_z_loss_clip": 0.88623047, + "router_z_loss_mlp": 0.16772461, + "step": 5672, + "time_per_iteration": 2.548161268234253 + }, + { + "auxiliary_loss_clip": 0.01139403, + "auxiliary_loss_mlp": 0.01038969, + "balance_loss_clip": 1.0510335, + "balance_loss_mlp": 1.02422249, + "epoch": 0.3410792123853901, + "flos": 15702012921600.0, + "grad_norm": 1.879080749698063, + "language_loss": 0.73654187, + "learning_rate": 3.06788908010777e-06, + "loss": 0.75832564, + "num_input_tokens_seen": 121842185, + "router_z_loss_clip": 0.88330078, + "router_z_loss_mlp": 0.14770508, + "step": 5673, + "time_per_iteration": 2.5643603801727295 + }, + { + "auxiliary_loss_clip": 0.01139776, + "auxiliary_loss_mlp": 0.01035661, + "balance_loss_clip": 1.05349922, + "balance_loss_mlp": 1.02048528, + "epoch": 0.34113933563805804, + "flos": 23036012974080.0, + "grad_norm": 2.03601928103123, + "language_loss": 0.7966755, + "learning_rate": 3.067559762415682e-06, + "loss": 0.81842983, + "num_input_tokens_seen": 121862260, + "router_z_loss_clip": 0.86132812, + "router_z_loss_mlp": 0.15185547, + "step": 5674, + "time_per_iteration": 2.4748995304107666 + }, + { + "auxiliary_loss_clip": 0.01070449, + "auxiliary_loss_mlp": 0.01003511, + "balance_loss_clip": 1.04046166, + "balance_loss_mlp": 1.00167203, + "epoch": 0.341199458890726, + "flos": 69614235336960.0, + "grad_norm": 0.7912303098866116, + "language_loss": 0.56080997, + "learning_rate": 3.0672304042416198e-06, + "loss": 0.58154953, + "num_input_tokens_seen": 121923560, + "router_z_loss_clip": 0.29980469, + "router_z_loss_mlp": 0.01837158, + "step": 5675, + "time_per_iteration": 3.2362210750579834 + }, + { + "auxiliary_loss_clip": 0.01134951, + "auxiliary_loss_mlp": 0.0104093, + "balance_loss_clip": 1.05081701, + "balance_loss_mlp": 1.02545631, + "epoch": 0.34125958214339397, + "flos": 22346348866560.0, + "grad_norm": 1.9851661460067644, + "language_loss": 0.78991985, + "learning_rate": 3.0669010055980734e-06, + "loss": 0.81167865, + "num_input_tokens_seen": 121943515, + "router_z_loss_clip": 0.84228516, + "router_z_loss_mlp": 0.15478516, + "step": 5676, + "time_per_iteration": 2.4807448387145996 + }, + { + "auxiliary_loss_clip": 0.01145439, + "auxiliary_loss_mlp": 0.0104431, + "balance_loss_clip": 1.05413544, + "balance_loss_mlp": 1.02777565, + "epoch": 0.34131970539606193, + "flos": 21871933009920.0, + "grad_norm": 1.7428658712106757, + "language_loss": 0.85849106, + "learning_rate": 3.0665715664975357e-06, + "loss": 0.8803885, + "num_input_tokens_seen": 121962540, + "router_z_loss_clip": 0.91259766, + "router_z_loss_mlp": 0.16540527, + "step": 5677, + "time_per_iteration": 2.708488702774048 + }, + { + "auxiliary_loss_clip": 0.01144886, + "auxiliary_loss_mlp": 0.01040128, + "balance_loss_clip": 1.05719697, + "balance_loss_mlp": 1.0240109, + "epoch": 0.3413798286487299, + "flos": 24935723475840.0, + "grad_norm": 1.8961965756783727, + "language_loss": 0.798473, + "learning_rate": 3.0662420869524966e-06, + "loss": 0.82032323, + "num_input_tokens_seen": 121979830, + "router_z_loss_clip": 0.87646484, + "router_z_loss_mlp": 0.16119385, + "step": 5678, + "time_per_iteration": 2.7954230308532715 + }, + { + "auxiliary_loss_clip": 0.01138587, + "auxiliary_loss_mlp": 0.01033937, + "balance_loss_clip": 1.05139554, + "balance_loss_mlp": 1.01895785, + "epoch": 0.34143995190139786, + "flos": 25374372364800.0, + "grad_norm": 2.014621950482532, + "language_loss": 0.74734068, + "learning_rate": 3.0659125669754506e-06, + "loss": 0.76906592, + "num_input_tokens_seen": 121999055, + "router_z_loss_clip": 0.87207031, + "router_z_loss_mlp": 0.14959717, + "step": 5679, + "time_per_iteration": 2.538250684738159 + }, + { + "auxiliary_loss_clip": 0.01069618, + "auxiliary_loss_mlp": 0.01006852, + "balance_loss_clip": 1.03898907, + "balance_loss_mlp": 1.00492644, + "epoch": 0.34150007515406583, + "flos": 67782578129280.0, + "grad_norm": 0.7273111290805031, + "language_loss": 0.5943923, + "learning_rate": 3.0655830065788923e-06, + "loss": 0.61515707, + "num_input_tokens_seen": 122067015, + "router_z_loss_clip": 0.30615234, + "router_z_loss_mlp": 0.01922607, + "step": 5680, + "time_per_iteration": 4.641901254653931 + }, + { + "auxiliary_loss_clip": 0.01133655, + "auxiliary_loss_mlp": 0.01030273, + "balance_loss_clip": 1.04987311, + "balance_loss_mlp": 1.01570582, + "epoch": 0.3415601984067338, + "flos": 20302421258880.0, + "grad_norm": 1.676243882539545, + "language_loss": 0.72186935, + "learning_rate": 3.0652534057753206e-06, + "loss": 0.7435087, + "num_input_tokens_seen": 122085295, + "router_z_loss_clip": 0.83789062, + "router_z_loss_mlp": 0.14562988, + "step": 5681, + "time_per_iteration": 2.6028807163238525 + }, + { + "auxiliary_loss_clip": 0.01136481, + "auxiliary_loss_mlp": 0.01039164, + "balance_loss_clip": 1.05196583, + "balance_loss_mlp": 1.02400017, + "epoch": 0.34162032165940176, + "flos": 26031178506240.0, + "grad_norm": 2.060436983543673, + "language_loss": 0.71133393, + "learning_rate": 3.064923764577233e-06, + "loss": 0.7330904, + "num_input_tokens_seen": 122104020, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.15161133, + "step": 5682, + "time_per_iteration": 3.1507952213287354 + }, + { + "auxiliary_loss_clip": 0.01150778, + "auxiliary_loss_mlp": 0.0104233, + "balance_loss_clip": 1.05957735, + "balance_loss_mlp": 1.02550972, + "epoch": 0.3416804449120697, + "flos": 28803338449920.0, + "grad_norm": 1.682718803857829, + "language_loss": 0.84335679, + "learning_rate": 3.0645940829971295e-06, + "loss": 0.8652879, + "num_input_tokens_seen": 122125080, + "router_z_loss_clip": 0.91259766, + "router_z_loss_mlp": 0.16833496, + "step": 5683, + "time_per_iteration": 4.464916229248047 + }, + { + "auxiliary_loss_clip": 0.01149736, + "auxiliary_loss_mlp": 0.01048703, + "balance_loss_clip": 1.05977821, + "balance_loss_mlp": 1.03167975, + "epoch": 0.3417405681647377, + "flos": 22601601889920.0, + "grad_norm": 1.6601777292220063, + "language_loss": 0.70665389, + "learning_rate": 3.0642643610475116e-06, + "loss": 0.72863829, + "num_input_tokens_seen": 122146350, + "router_z_loss_clip": 0.89892578, + "router_z_loss_mlp": 0.17028809, + "step": 5684, + "time_per_iteration": 2.8335118293762207 + }, + { + "auxiliary_loss_clip": 0.01133739, + "auxiliary_loss_mlp": 0.01034406, + "balance_loss_clip": 1.05009055, + "balance_loss_mlp": 1.01987481, + "epoch": 0.34180069141740566, + "flos": 24716237420160.0, + "grad_norm": 1.5806581257614496, + "language_loss": 0.7525757, + "learning_rate": 3.0639345987408823e-06, + "loss": 0.77425718, + "num_input_tokens_seen": 122168085, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.14532471, + "step": 5685, + "time_per_iteration": 2.9869048595428467 + }, + { + "auxiliary_loss_clip": 0.01134176, + "auxiliary_loss_mlp": 0.01039273, + "balance_loss_clip": 1.04974997, + "balance_loss_mlp": 1.02471757, + "epoch": 0.3418608146700737, + "flos": 30518755246080.0, + "grad_norm": 1.9146727942703385, + "language_loss": 0.70121801, + "learning_rate": 3.0636047960897468e-06, + "loss": 0.72295249, + "num_input_tokens_seen": 122191040, + "router_z_loss_clip": 0.84326172, + "router_z_loss_mlp": 0.14562988, + "step": 5686, + "time_per_iteration": 2.8179116249084473 + }, + { + "auxiliary_loss_clip": 0.01143719, + "auxiliary_loss_mlp": 0.01038093, + "balance_loss_clip": 1.05654693, + "balance_loss_mlp": 1.02281022, + "epoch": 0.34192093792274164, + "flos": 15122343237120.0, + "grad_norm": 10.462761432643404, + "language_loss": 0.77607071, + "learning_rate": 3.06327495310661e-06, + "loss": 0.79788888, + "num_input_tokens_seen": 122209225, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.15289307, + "step": 5687, + "time_per_iteration": 2.492269277572632 + }, + { + "auxiliary_loss_clip": 0.01135798, + "auxiliary_loss_mlp": 0.01040076, + "balance_loss_clip": 1.05191493, + "balance_loss_mlp": 1.02476966, + "epoch": 0.3419810611754096, + "flos": 13187799521280.0, + "grad_norm": 2.011329599290932, + "language_loss": 0.86665964, + "learning_rate": 3.062945069803981e-06, + "loss": 0.88841832, + "num_input_tokens_seen": 122226160, + "router_z_loss_clip": 0.83935547, + "router_z_loss_mlp": 0.1529541, + "step": 5688, + "time_per_iteration": 2.516814947128296 + }, + { + "auxiliary_loss_clip": 0.0115346, + "auxiliary_loss_mlp": 0.01041776, + "balance_loss_clip": 1.05876327, + "balance_loss_mlp": 1.02490759, + "epoch": 0.34204118442807757, + "flos": 19536267139200.0, + "grad_norm": 2.04048672967033, + "language_loss": 0.79993379, + "learning_rate": 3.0626151461943684e-06, + "loss": 0.82188618, + "num_input_tokens_seen": 122243115, + "router_z_loss_clip": 0.94628906, + "router_z_loss_mlp": 0.1685791, + "step": 5689, + "time_per_iteration": 2.5458741188049316 + }, + { + "auxiliary_loss_clip": 0.01148362, + "auxiliary_loss_mlp": 0.01037763, + "balance_loss_clip": 1.06034303, + "balance_loss_mlp": 1.02177072, + "epoch": 0.34210130768074554, + "flos": 15194846839680.0, + "grad_norm": 1.8230352618324497, + "language_loss": 0.74059165, + "learning_rate": 3.0622851822902834e-06, + "loss": 0.7624529, + "num_input_tokens_seen": 122261105, + "router_z_loss_clip": 0.87988281, + "router_z_loss_mlp": 0.15985107, + "step": 5690, + "time_per_iteration": 4.058058261871338 + }, + { + "auxiliary_loss_clip": 0.01147057, + "auxiliary_loss_mlp": 0.01037656, + "balance_loss_clip": 1.05702424, + "balance_loss_mlp": 1.02295709, + "epoch": 0.3421614309334135, + "flos": 24936226266240.0, + "grad_norm": 1.858727814873542, + "language_loss": 0.75675827, + "learning_rate": 3.061955178104237e-06, + "loss": 0.77860534, + "num_input_tokens_seen": 122279995, + "router_z_loss_clip": 0.90039062, + "router_z_loss_mlp": 0.14697266, + "step": 5691, + "time_per_iteration": 2.471987724304199 + }, + { + "auxiliary_loss_clip": 0.01133773, + "auxiliary_loss_mlp": 0.0103659, + "balance_loss_clip": 1.0497992, + "balance_loss_mlp": 1.02191556, + "epoch": 0.34222155418608147, + "flos": 21908633731200.0, + "grad_norm": 1.775861721564695, + "language_loss": 0.68766725, + "learning_rate": 3.0616251336487447e-06, + "loss": 0.70937091, + "num_input_tokens_seen": 122299070, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.14666748, + "step": 5692, + "time_per_iteration": 2.6225507259368896 + }, + { + "auxiliary_loss_clip": 0.01147546, + "auxiliary_loss_mlp": 0.01037654, + "balance_loss_clip": 1.05819046, + "balance_loss_mlp": 1.02141809, + "epoch": 0.34228167743874943, + "flos": 18114061063680.0, + "grad_norm": 2.93025355762431, + "language_loss": 0.73023218, + "learning_rate": 3.06129504893632e-06, + "loss": 0.75208414, + "num_input_tokens_seen": 122316800, + "router_z_loss_clip": 0.89453125, + "router_z_loss_mlp": 0.16235352, + "step": 5693, + "time_per_iteration": 2.422879219055176 + }, + { + "auxiliary_loss_clip": 0.01135898, + "auxiliary_loss_mlp": 0.010354, + "balance_loss_clip": 1.05099487, + "balance_loss_mlp": 1.02117801, + "epoch": 0.3423418006914174, + "flos": 21288600138240.0, + "grad_norm": 1.871870715358856, + "language_loss": 0.74996829, + "learning_rate": 3.0609649239794813e-06, + "loss": 0.77168131, + "num_input_tokens_seen": 122335275, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.14221191, + "step": 5694, + "time_per_iteration": 2.608856201171875 + }, + { + "auxiliary_loss_clip": 0.01137008, + "auxiliary_loss_mlp": 0.01049478, + "balance_loss_clip": 1.05262411, + "balance_loss_mlp": 1.03319407, + "epoch": 0.34240192394408536, + "flos": 19823480288640.0, + "grad_norm": 2.1135014589022885, + "language_loss": 0.79840398, + "learning_rate": 3.060634758790747e-06, + "loss": 0.82026887, + "num_input_tokens_seen": 122353215, + "router_z_loss_clip": 0.84423828, + "router_z_loss_mlp": 0.1628418, + "step": 5695, + "time_per_iteration": 2.4629809856414795 + }, + { + "auxiliary_loss_clip": 0.0114207, + "auxiliary_loss_mlp": 0.01037444, + "balance_loss_clip": 1.05618703, + "balance_loss_mlp": 1.02247095, + "epoch": 0.3424620471967533, + "flos": 24535535074560.0, + "grad_norm": 1.7326331341583852, + "language_loss": 0.73255444, + "learning_rate": 3.060304553382635e-06, + "loss": 0.75434959, + "num_input_tokens_seen": 122372495, + "router_z_loss_clip": 0.85839844, + "router_z_loss_mlp": 0.1496582, + "step": 5696, + "time_per_iteration": 2.5637271404266357 + }, + { + "auxiliary_loss_clip": 0.01145998, + "auxiliary_loss_mlp": 0.0104529, + "balance_loss_clip": 1.05850101, + "balance_loss_mlp": 1.03009081, + "epoch": 0.3425221704494213, + "flos": 25848895962240.0, + "grad_norm": 2.6046139437197193, + "language_loss": 0.71273434, + "learning_rate": 3.0599743077676685e-06, + "loss": 0.73464721, + "num_input_tokens_seen": 122394600, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.15197754, + "step": 5697, + "time_per_iteration": 2.515141487121582 + }, + { + "auxiliary_loss_clip": 0.01139026, + "auxiliary_loss_mlp": 0.0102985, + "balance_loss_clip": 1.05434477, + "balance_loss_mlp": 1.01506817, + "epoch": 0.34258229370208926, + "flos": 21540513196800.0, + "grad_norm": 1.9075176923351198, + "language_loss": 0.82107836, + "learning_rate": 3.05964402195837e-06, + "loss": 0.84276712, + "num_input_tokens_seen": 122414700, + "router_z_loss_clip": 0.84765625, + "router_z_loss_mlp": 0.14788818, + "step": 5698, + "time_per_iteration": 2.590841293334961 + }, + { + "auxiliary_loss_clip": 0.01148259, + "auxiliary_loss_mlp": 0.01048095, + "balance_loss_clip": 1.06065905, + "balance_loss_mlp": 1.03141785, + "epoch": 0.3426424169547573, + "flos": 23652778429440.0, + "grad_norm": 3.810682568762181, + "language_loss": 0.68954223, + "learning_rate": 3.0593136959672645e-06, + "loss": 0.71150577, + "num_input_tokens_seen": 122432760, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.16699219, + "step": 5699, + "time_per_iteration": 2.466679334640503 + }, + { + "auxiliary_loss_clip": 0.0113402, + "auxiliary_loss_mlp": 0.01034378, + "balance_loss_clip": 1.04911304, + "balance_loss_mlp": 1.02006745, + "epoch": 0.34270254020742524, + "flos": 24644883052800.0, + "grad_norm": 2.338975723650239, + "language_loss": 0.72624779, + "learning_rate": 3.058983329806877e-06, + "loss": 0.74793184, + "num_input_tokens_seen": 122449105, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.14312744, + "step": 5700, + "time_per_iteration": 2.5007517337799072 + }, + { + "auxiliary_loss_clip": 0.01136407, + "auxiliary_loss_mlp": 0.01041026, + "balance_loss_clip": 1.05143499, + "balance_loss_mlp": 1.02515936, + "epoch": 0.3427626634600932, + "flos": 20996754134400.0, + "grad_norm": 1.7909725153066065, + "language_loss": 0.82244974, + "learning_rate": 3.0586529234897354e-06, + "loss": 0.8442241, + "num_input_tokens_seen": 122468700, + "router_z_loss_clip": 0.84814453, + "router_z_loss_mlp": 0.15869141, + "step": 5701, + "time_per_iteration": 2.437412977218628 + }, + { + "auxiliary_loss_clip": 0.01138966, + "auxiliary_loss_mlp": 0.01050853, + "balance_loss_clip": 1.05159044, + "balance_loss_mlp": 1.03309035, + "epoch": 0.3428227867127612, + "flos": 21433786911360.0, + "grad_norm": 1.6283458879773547, + "language_loss": 0.71211588, + "learning_rate": 3.0583224770283694e-06, + "loss": 0.73401403, + "num_input_tokens_seen": 122488160, + "router_z_loss_clip": 0.87304688, + "router_z_loss_mlp": 0.17773438, + "step": 5702, + "time_per_iteration": 2.44708251953125 + }, + { + "auxiliary_loss_clip": 0.01078023, + "auxiliary_loss_mlp": 0.01005562, + "balance_loss_clip": 1.04650617, + "balance_loss_mlp": 1.0033921, + "epoch": 0.34288290996542914, + "flos": 55731782695680.0, + "grad_norm": 0.7838808207307834, + "language_loss": 0.57373631, + "learning_rate": 3.057991990435309e-06, + "loss": 0.59457219, + "num_input_tokens_seen": 122542890, + "router_z_loss_clip": 0.31494141, + "router_z_loss_mlp": 0.021698, + "step": 5703, + "time_per_iteration": 2.9684982299804688 + }, + { + "auxiliary_loss_clip": 0.01143249, + "auxiliary_loss_mlp": 0.01040923, + "balance_loss_clip": 1.05471301, + "balance_loss_mlp": 1.02450788, + "epoch": 0.3429430332180971, + "flos": 20156803522560.0, + "grad_norm": 1.8018427223990092, + "language_loss": 0.74761951, + "learning_rate": 3.057661463723086e-06, + "loss": 0.76946127, + "num_input_tokens_seen": 122561770, + "router_z_loss_clip": 0.88525391, + "router_z_loss_mlp": 0.1640625, + "step": 5704, + "time_per_iteration": 2.638335943222046 + }, + { + "auxiliary_loss_clip": 0.01148365, + "auxiliary_loss_mlp": 0.01040373, + "balance_loss_clip": 1.06039548, + "balance_loss_mlp": 1.02635396, + "epoch": 0.34300315647076507, + "flos": 17965857548160.0, + "grad_norm": 2.415181413674256, + "language_loss": 0.72920245, + "learning_rate": 3.0573308969042346e-06, + "loss": 0.75108981, + "num_input_tokens_seen": 122580580, + "router_z_loss_clip": 0.87890625, + "router_z_loss_mlp": 0.14025879, + "step": 5705, + "time_per_iteration": 2.469698429107666 + }, + { + "auxiliary_loss_clip": 0.01140266, + "auxiliary_loss_mlp": 0.01036032, + "balance_loss_clip": 1.05459785, + "balance_loss_mlp": 1.02133381, + "epoch": 0.34306327972343303, + "flos": 22086822124800.0, + "grad_norm": 1.9400644888974756, + "language_loss": 0.79827797, + "learning_rate": 3.057000289991289e-06, + "loss": 0.82004094, + "num_input_tokens_seen": 122599810, + "router_z_loss_clip": 0.85693359, + "router_z_loss_mlp": 0.14697266, + "step": 5706, + "time_per_iteration": 2.457613468170166 + }, + { + "auxiliary_loss_clip": 0.01138837, + "auxiliary_loss_mlp": 0.01037832, + "balance_loss_clip": 1.04906249, + "balance_loss_mlp": 1.0223285, + "epoch": 0.343123402976101, + "flos": 18442679616000.0, + "grad_norm": 2.1916873393078995, + "language_loss": 0.82581633, + "learning_rate": 3.056669642996787e-06, + "loss": 0.84758306, + "num_input_tokens_seen": 122616035, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.1550293, + "step": 5707, + "time_per_iteration": 2.4576711654663086 + }, + { + "auxiliary_loss_clip": 0.01148711, + "auxiliary_loss_mlp": 0.01035413, + "balance_loss_clip": 1.06091547, + "balance_loss_mlp": 1.02058303, + "epoch": 0.34318352622876896, + "flos": 17163685065600.0, + "grad_norm": 1.4920470918548092, + "language_loss": 0.75067466, + "learning_rate": 3.056338955933266e-06, + "loss": 0.77251589, + "num_input_tokens_seen": 122633785, + "router_z_loss_clip": 0.87890625, + "router_z_loss_mlp": 0.1484375, + "step": 5708, + "time_per_iteration": 2.4047493934631348 + }, + { + "auxiliary_loss_clip": 0.01140729, + "auxiliary_loss_mlp": 0.01042804, + "balance_loss_clip": 1.05592394, + "balance_loss_mlp": 1.02652049, + "epoch": 0.34324364948143693, + "flos": 26688164215680.0, + "grad_norm": 1.8499149401086727, + "language_loss": 0.81267697, + "learning_rate": 3.0560082288132662e-06, + "loss": 0.83451229, + "num_input_tokens_seen": 122652100, + "router_z_loss_clip": 0.84716797, + "router_z_loss_mlp": 0.16271973, + "step": 5709, + "time_per_iteration": 2.5223703384399414 + }, + { + "auxiliary_loss_clip": 0.01134795, + "auxiliary_loss_mlp": 0.01038508, + "balance_loss_clip": 1.04922295, + "balance_loss_mlp": 1.02262926, + "epoch": 0.3433037727341049, + "flos": 21251576194560.0, + "grad_norm": 2.6986399364138833, + "language_loss": 0.792068, + "learning_rate": 3.055677461649329e-06, + "loss": 0.81380105, + "num_input_tokens_seen": 122669720, + "router_z_loss_clip": 0.85449219, + "router_z_loss_mlp": 0.15893555, + "step": 5710, + "time_per_iteration": 2.5467629432678223 + }, + { + "auxiliary_loss_clip": 0.01148005, + "auxiliary_loss_mlp": 0.01037047, + "balance_loss_clip": 1.05840254, + "balance_loss_mlp": 1.02081108, + "epoch": 0.34336389598677286, + "flos": 20629423699200.0, + "grad_norm": 1.877784669276676, + "language_loss": 0.69907475, + "learning_rate": 3.055346654453996e-06, + "loss": 0.72092521, + "num_input_tokens_seen": 122688715, + "router_z_loss_clip": 0.89648438, + "router_z_loss_mlp": 0.16259766, + "step": 5711, + "time_per_iteration": 2.484752655029297 + }, + { + "auxiliary_loss_clip": 0.01132848, + "auxiliary_loss_mlp": 0.01036314, + "balance_loss_clip": 1.04820848, + "balance_loss_mlp": 1.02135324, + "epoch": 0.3434240192394409, + "flos": 14538579402240.0, + "grad_norm": 2.228397718688446, + "language_loss": 0.67667866, + "learning_rate": 3.055015807239812e-06, + "loss": 0.69837034, + "num_input_tokens_seen": 122706970, + "router_z_loss_clip": 0.84667969, + "router_z_loss_mlp": 0.1496582, + "step": 5712, + "time_per_iteration": 2.4832851886749268 + }, + { + "auxiliary_loss_clip": 0.01086956, + "auxiliary_loss_mlp": 0.01005614, + "balance_loss_clip": 1.05685759, + "balance_loss_mlp": 1.00355136, + "epoch": 0.34348414249210885, + "flos": 58051538841600.0, + "grad_norm": 0.8644407993062894, + "language_loss": 0.58094621, + "learning_rate": 3.0546849200193226e-06, + "loss": 0.60187197, + "num_input_tokens_seen": 122758095, + "router_z_loss_clip": 0.30126953, + "router_z_loss_mlp": 0.02062988, + "step": 5713, + "time_per_iteration": 4.517830848693848 + }, + { + "auxiliary_loss_clip": 0.01143577, + "auxiliary_loss_mlp": 0.01036666, + "balance_loss_clip": 1.05498779, + "balance_loss_mlp": 1.0215565, + "epoch": 0.3435442657447768, + "flos": 20704441253760.0, + "grad_norm": 1.6201673126171607, + "language_loss": 0.81149113, + "learning_rate": 3.054353992805076e-06, + "loss": 0.83329356, + "num_input_tokens_seen": 122777815, + "router_z_loss_clip": 0.88720703, + "router_z_loss_mlp": 0.15093994, + "step": 5714, + "time_per_iteration": 2.493122100830078 + }, + { + "auxiliary_loss_clip": 0.01139484, + "auxiliary_loss_mlp": 0.01049628, + "balance_loss_clip": 1.05286884, + "balance_loss_mlp": 1.03241444, + "epoch": 0.3436043889974448, + "flos": 22930256355840.0, + "grad_norm": 2.9218113350147115, + "language_loss": 0.7245537, + "learning_rate": 3.05402302560962e-06, + "loss": 0.74644482, + "num_input_tokens_seen": 122797555, + "router_z_loss_clip": 0.86669922, + "router_z_loss_mlp": 0.17211914, + "step": 5715, + "time_per_iteration": 2.515801429748535 + }, + { + "auxiliary_loss_clip": 0.0109008, + "auxiliary_loss_mlp": 0.01005083, + "balance_loss_clip": 1.05797887, + "balance_loss_mlp": 1.00242174, + "epoch": 0.34366451225011274, + "flos": 58403285752320.0, + "grad_norm": 0.8919197721087387, + "language_loss": 0.65906882, + "learning_rate": 3.053692018445505e-06, + "loss": 0.68002045, + "num_input_tokens_seen": 122863955, + "router_z_loss_clip": 0.32080078, + "router_z_loss_mlp": 0.02661133, + "step": 5716, + "time_per_iteration": 3.117772340774536 + }, + { + "auxiliary_loss_clip": 0.01140662, + "auxiliary_loss_mlp": 0.01040418, + "balance_loss_clip": 1.05482602, + "balance_loss_mlp": 1.02571344, + "epoch": 0.3437246355027807, + "flos": 15596292216960.0, + "grad_norm": 1.9239903210125282, + "language_loss": 0.74461699, + "learning_rate": 3.0533609713252838e-06, + "loss": 0.76642781, + "num_input_tokens_seen": 122883000, + "router_z_loss_clip": 0.85839844, + "router_z_loss_mlp": 0.14703369, + "step": 5717, + "time_per_iteration": 2.6813879013061523 + }, + { + "auxiliary_loss_clip": 0.0114631, + "auxiliary_loss_mlp": 0.0103776, + "balance_loss_clip": 1.05962539, + "balance_loss_mlp": 1.02310908, + "epoch": 0.34378475875544867, + "flos": 27672260106240.0, + "grad_norm": 1.881484299626364, + "language_loss": 0.75595403, + "learning_rate": 3.0530298842615077e-06, + "loss": 0.77779472, + "num_input_tokens_seen": 122903265, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.1463623, + "step": 5718, + "time_per_iteration": 2.54461407661438 + }, + { + "auxiliary_loss_clip": 0.01143716, + "auxiliary_loss_mlp": 0.01043852, + "balance_loss_clip": 1.05434275, + "balance_loss_mlp": 1.02870607, + "epoch": 0.34384488200811664, + "flos": 31431496769280.0, + "grad_norm": 1.9218544552654484, + "language_loss": 0.6388979, + "learning_rate": 3.052698757266734e-06, + "loss": 0.66077352, + "num_input_tokens_seen": 122923860, + "router_z_loss_clip": 0.89257812, + "router_z_loss_mlp": 0.15142822, + "step": 5719, + "time_per_iteration": 2.5746755599975586 + }, + { + "auxiliary_loss_clip": 0.01142684, + "auxiliary_loss_mlp": 0.01037784, + "balance_loss_clip": 1.05250394, + "balance_loss_mlp": 1.02108264, + "epoch": 0.3439050052607846, + "flos": 24899920594560.0, + "grad_norm": 1.9002700317241559, + "language_loss": 0.73770165, + "learning_rate": 3.0523675903535183e-06, + "loss": 0.7595064, + "num_input_tokens_seen": 122945305, + "router_z_loss_clip": 0.90234375, + "router_z_loss_mlp": 0.16711426, + "step": 5720, + "time_per_iteration": 2.5913095474243164 + }, + { + "auxiliary_loss_clip": 0.01145615, + "auxiliary_loss_mlp": 0.01036839, + "balance_loss_clip": 1.05683732, + "balance_loss_mlp": 1.0213362, + "epoch": 0.34396512851345257, + "flos": 18150079426560.0, + "grad_norm": 1.8389597077906086, + "language_loss": 0.74218655, + "learning_rate": 3.0520363835344173e-06, + "loss": 0.76401109, + "num_input_tokens_seen": 122962535, + "router_z_loss_clip": 0.88769531, + "router_z_loss_mlp": 0.1550293, + "step": 5721, + "time_per_iteration": 2.4962499141693115 + }, + { + "auxiliary_loss_clip": 0.01145823, + "auxiliary_loss_mlp": 0.01047269, + "balance_loss_clip": 1.05887651, + "balance_loss_mlp": 1.03232586, + "epoch": 0.34402525176612053, + "flos": 16034438315520.0, + "grad_norm": 2.19154698191607, + "language_loss": 0.79824454, + "learning_rate": 3.051705136821992e-06, + "loss": 0.82017547, + "num_input_tokens_seen": 122979750, + "router_z_loss_clip": 0.86865234, + "router_z_loss_mlp": 0.1494751, + "step": 5722, + "time_per_iteration": 2.441927671432495 + }, + { + "auxiliary_loss_clip": 0.01135432, + "auxiliary_loss_mlp": 0.01035829, + "balance_loss_clip": 1.04932177, + "balance_loss_mlp": 1.02124369, + "epoch": 0.3440853750187885, + "flos": 21178641628800.0, + "grad_norm": 1.7934847571893273, + "language_loss": 0.81911898, + "learning_rate": 3.051373850228801e-06, + "loss": 0.84083164, + "num_input_tokens_seen": 122998955, + "router_z_loss_clip": 0.86083984, + "router_z_loss_mlp": 0.14581299, + "step": 5723, + "time_per_iteration": 2.500272035598755 + }, + { + "auxiliary_loss_clip": 0.01151568, + "auxiliary_loss_mlp": 0.01045998, + "balance_loss_clip": 1.06541681, + "balance_loss_mlp": 1.03071535, + "epoch": 0.34414549827145646, + "flos": 12677868092160.0, + "grad_norm": 2.2357195820552174, + "language_loss": 0.81496584, + "learning_rate": 3.0510425237674096e-06, + "loss": 0.83694148, + "num_input_tokens_seen": 123016165, + "router_z_loss_clip": 0.86035156, + "router_z_loss_mlp": 0.152771, + "step": 5724, + "time_per_iteration": 3.918818950653076 + }, + { + "auxiliary_loss_clip": 0.01138899, + "auxiliary_loss_mlp": 0.01039267, + "balance_loss_clip": 1.0506444, + "balance_loss_mlp": 1.02425265, + "epoch": 0.3442056215241244, + "flos": 31284514316160.0, + "grad_norm": 11.672815604362205, + "language_loss": 0.69277024, + "learning_rate": 3.05071115745038e-06, + "loss": 0.71455193, + "num_input_tokens_seen": 123036900, + "router_z_loss_clip": 0.88134766, + "router_z_loss_mlp": 0.15008545, + "step": 5725, + "time_per_iteration": 2.5619263648986816 + }, + { + "auxiliary_loss_clip": 0.01148557, + "auxiliary_loss_mlp": 0.01043358, + "balance_loss_clip": 1.05532789, + "balance_loss_mlp": 1.02637076, + "epoch": 0.34426574477679245, + "flos": 23367289132800.0, + "grad_norm": 1.5130293884381354, + "language_loss": 0.69035125, + "learning_rate": 3.0503797512902773e-06, + "loss": 0.71227044, + "num_input_tokens_seen": 123057480, + "router_z_loss_clip": 0.93261719, + "router_z_loss_mlp": 0.16986084, + "step": 5726, + "time_per_iteration": 3.886091947555542 + }, + { + "auxiliary_loss_clip": 0.01144745, + "auxiliary_loss_mlp": 0.01044269, + "balance_loss_clip": 1.0568862, + "balance_loss_mlp": 1.0295285, + "epoch": 0.3443258680294604, + "flos": 24535427333760.0, + "grad_norm": 1.865812193149626, + "language_loss": 0.73404235, + "learning_rate": 3.0500483052996703e-06, + "loss": 0.75593245, + "num_input_tokens_seen": 123076890, + "router_z_loss_clip": 0.87792969, + "router_z_loss_mlp": 0.14727783, + "step": 5727, + "time_per_iteration": 2.4880642890930176 + }, + { + "auxiliary_loss_clip": 0.0113295, + "auxiliary_loss_mlp": 0.01047907, + "balance_loss_clip": 1.04804921, + "balance_loss_mlp": 1.03251708, + "epoch": 0.3443859912821284, + "flos": 20230133137920.0, + "grad_norm": 2.2566499079122697, + "language_loss": 0.88285315, + "learning_rate": 3.0497168194911257e-06, + "loss": 0.90466166, + "num_input_tokens_seen": 123092530, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.15380859, + "step": 5728, + "time_per_iteration": 2.439397096633911 + }, + { + "auxiliary_loss_clip": 0.01135954, + "auxiliary_loss_mlp": 0.01047448, + "balance_loss_clip": 1.05117989, + "balance_loss_mlp": 1.03299451, + "epoch": 0.34444611453479634, + "flos": 24316515895680.0, + "grad_norm": 2.030626594940743, + "language_loss": 0.70148575, + "learning_rate": 3.0493852938772143e-06, + "loss": 0.72331983, + "num_input_tokens_seen": 123110560, + "router_z_loss_clip": 0.84716797, + "router_z_loss_mlp": 0.14447021, + "step": 5729, + "time_per_iteration": 2.7870421409606934 + }, + { + "auxiliary_loss_clip": 0.01135683, + "auxiliary_loss_mlp": 0.01041972, + "balance_loss_clip": 1.04950345, + "balance_loss_mlp": 1.0255692, + "epoch": 0.3445062377874643, + "flos": 16983413683200.0, + "grad_norm": 2.0589707089569593, + "language_loss": 0.74140143, + "learning_rate": 3.0490537284705078e-06, + "loss": 0.76317799, + "num_input_tokens_seen": 123128655, + "router_z_loss_clip": 0.86279297, + "router_z_loss_mlp": 0.1640625, + "step": 5730, + "time_per_iteration": 2.430353879928589 + }, + { + "auxiliary_loss_clip": 0.01139896, + "auxiliary_loss_mlp": 0.01051104, + "balance_loss_clip": 1.05370569, + "balance_loss_mlp": 1.03596401, + "epoch": 0.3445663610401323, + "flos": 20302708567680.0, + "grad_norm": 2.427727421555626, + "language_loss": 0.79790515, + "learning_rate": 3.048722123283578e-06, + "loss": 0.81981516, + "num_input_tokens_seen": 123145130, + "router_z_loss_clip": 0.86279297, + "router_z_loss_mlp": 0.15136719, + "step": 5731, + "time_per_iteration": 2.512727737426758 + }, + { + "auxiliary_loss_clip": 0.01148401, + "auxiliary_loss_mlp": 0.01042669, + "balance_loss_clip": 1.0602088, + "balance_loss_mlp": 1.02783954, + "epoch": 0.34462648429280024, + "flos": 15888102307200.0, + "grad_norm": 1.9170040804326955, + "language_loss": 0.78427941, + "learning_rate": 3.0483904783290006e-06, + "loss": 0.80619013, + "num_input_tokens_seen": 123162265, + "router_z_loss_clip": 0.88037109, + "router_z_loss_mlp": 0.14819336, + "step": 5732, + "time_per_iteration": 2.5170910358428955 + }, + { + "auxiliary_loss_clip": 0.01076016, + "auxiliary_loss_mlp": 0.0103297, + "balance_loss_clip": 1.0460763, + "balance_loss_mlp": 1.03111088, + "epoch": 0.3446866075454682, + "flos": 59311035285120.0, + "grad_norm": 0.754264778853484, + "language_loss": 0.53533626, + "learning_rate": 3.0480587936193505e-06, + "loss": 0.55642611, + "num_input_tokens_seen": 123218620, + "router_z_loss_clip": 0.29931641, + "router_z_loss_mlp": 0.01858521, + "step": 5733, + "time_per_iteration": 4.4898035526275635 + }, + { + "auxiliary_loss_clip": 0.01141502, + "auxiliary_loss_mlp": 0.01044279, + "balance_loss_clip": 1.05543411, + "balance_loss_mlp": 1.02859139, + "epoch": 0.34474673079813617, + "flos": 22343799000960.0, + "grad_norm": 1.7112508226146859, + "language_loss": 0.83305651, + "learning_rate": 3.047727069167207e-06, + "loss": 0.85491431, + "num_input_tokens_seen": 123237325, + "router_z_loss_clip": 0.85986328, + "router_z_loss_mlp": 0.15698242, + "step": 5734, + "time_per_iteration": 2.4633023738861084 + }, + { + "auxiliary_loss_clip": 0.01136101, + "auxiliary_loss_mlp": 0.01037759, + "balance_loss_clip": 1.05010998, + "balance_loss_mlp": 1.02304292, + "epoch": 0.34480685405080413, + "flos": 27670141203840.0, + "grad_norm": 1.9352768249459524, + "language_loss": 0.92710185, + "learning_rate": 3.0473953049851478e-06, + "loss": 0.94884038, + "num_input_tokens_seen": 123258650, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.14727783, + "step": 5735, + "time_per_iteration": 2.531832218170166 + }, + { + "auxiliary_loss_clip": 0.01141912, + "auxiliary_loss_mlp": 0.01049316, + "balance_loss_clip": 1.05273545, + "balance_loss_mlp": 1.03096986, + "epoch": 0.3448669773034721, + "flos": 22456020067200.0, + "grad_norm": 1.6994048698266324, + "language_loss": 0.76570249, + "learning_rate": 3.0470635010857533e-06, + "loss": 0.78761482, + "num_input_tokens_seen": 123277155, + "router_z_loss_clip": 0.89257812, + "router_z_loss_mlp": 0.18359375, + "step": 5736, + "time_per_iteration": 2.4771652221679688 + }, + { + "auxiliary_loss_clip": 0.0114133, + "auxiliary_loss_mlp": 0.01041609, + "balance_loss_clip": 1.05375814, + "balance_loss_mlp": 1.02663589, + "epoch": 0.34492710055614006, + "flos": 24936190352640.0, + "grad_norm": 2.2674964809114906, + "language_loss": 0.78948569, + "learning_rate": 3.0467316574816064e-06, + "loss": 0.81131506, + "num_input_tokens_seen": 123297640, + "router_z_loss_clip": 0.87597656, + "router_z_loss_mlp": 0.14978027, + "step": 5737, + "time_per_iteration": 2.5988271236419678 + }, + { + "auxiliary_loss_clip": 0.0114344, + "auxiliary_loss_mlp": 0.01037553, + "balance_loss_clip": 1.05292678, + "balance_loss_mlp": 1.0213052, + "epoch": 0.34498722380880803, + "flos": 20120821073280.0, + "grad_norm": 2.308168868825699, + "language_loss": 0.71726459, + "learning_rate": 3.0463997741852893e-06, + "loss": 0.73907453, + "num_input_tokens_seen": 123314370, + "router_z_loss_clip": 0.90527344, + "router_z_loss_mlp": 0.16247559, + "step": 5738, + "time_per_iteration": 2.5006332397460938 + }, + { + "auxiliary_loss_clip": 0.01151596, + "auxiliary_loss_mlp": 0.01041151, + "balance_loss_clip": 1.06026924, + "balance_loss_mlp": 1.02479506, + "epoch": 0.34504734706147605, + "flos": 28438126917120.0, + "grad_norm": 2.1583388082726898, + "language_loss": 0.81555468, + "learning_rate": 3.046067851209389e-06, + "loss": 0.83748215, + "num_input_tokens_seen": 123336085, + "router_z_loss_clip": 0.91357422, + "router_z_loss_mlp": 0.16357422, + "step": 5739, + "time_per_iteration": 2.564995050430298 + }, + { + "auxiliary_loss_clip": 0.01139144, + "auxiliary_loss_mlp": 0.01034155, + "balance_loss_clip": 1.05247462, + "balance_loss_mlp": 1.01867008, + "epoch": 0.345107470314144, + "flos": 22674464628480.0, + "grad_norm": 2.054100476418273, + "language_loss": 0.82505178, + "learning_rate": 3.0457358885664898e-06, + "loss": 0.84678471, + "num_input_tokens_seen": 123354460, + "router_z_loss_clip": 0.86767578, + "router_z_loss_mlp": 0.15472412, + "step": 5740, + "time_per_iteration": 2.5187954902648926 + }, + { + "auxiliary_loss_clip": 0.01143706, + "auxiliary_loss_mlp": 0.01032577, + "balance_loss_clip": 1.05680728, + "balance_loss_mlp": 1.01594746, + "epoch": 0.345167593566812, + "flos": 20630716588800.0, + "grad_norm": 2.257371723355799, + "language_loss": 0.77026272, + "learning_rate": 3.045403886269181e-06, + "loss": 0.79202557, + "num_input_tokens_seen": 123373420, + "router_z_loss_clip": 0.86914062, + "router_z_loss_mlp": 0.16650391, + "step": 5741, + "time_per_iteration": 2.485518217086792 + }, + { + "auxiliary_loss_clip": 0.01155081, + "auxiliary_loss_mlp": 0.01034605, + "balance_loss_clip": 1.06560659, + "balance_loss_mlp": 1.01913166, + "epoch": 0.34522771681947995, + "flos": 26214358890240.0, + "grad_norm": 1.5492911096161797, + "language_loss": 0.77183014, + "learning_rate": 3.045071844330053e-06, + "loss": 0.79372704, + "num_input_tokens_seen": 123394730, + "router_z_loss_clip": 0.89453125, + "router_z_loss_mlp": 0.15466309, + "step": 5742, + "time_per_iteration": 2.7385127544403076 + }, + { + "auxiliary_loss_clip": 0.0114475, + "auxiliary_loss_mlp": 0.01041817, + "balance_loss_clip": 1.05461085, + "balance_loss_mlp": 1.02565217, + "epoch": 0.3452878400721479, + "flos": 19062354072960.0, + "grad_norm": 1.9877118924574877, + "language_loss": 0.76619554, + "learning_rate": 3.0447397627616955e-06, + "loss": 0.7880612, + "num_input_tokens_seen": 123412895, + "router_z_loss_clip": 0.90087891, + "router_z_loss_mlp": 0.16156006, + "step": 5743, + "time_per_iteration": 2.5105226039886475 + }, + { + "auxiliary_loss_clip": 0.01138693, + "auxiliary_loss_mlp": 0.01035213, + "balance_loss_clip": 1.05165434, + "balance_loss_mlp": 1.02029991, + "epoch": 0.3453479633248159, + "flos": 27929739772800.0, + "grad_norm": 1.6992398206966755, + "language_loss": 0.70125329, + "learning_rate": 3.0444076415767016e-06, + "loss": 0.72299236, + "num_input_tokens_seen": 123432320, + "router_z_loss_clip": 0.87011719, + "router_z_loss_mlp": 0.14916992, + "step": 5744, + "time_per_iteration": 2.551891565322876 + }, + { + "auxiliary_loss_clip": 0.01134092, + "auxiliary_loss_mlp": 0.01039138, + "balance_loss_clip": 1.04975998, + "balance_loss_mlp": 1.0224725, + "epoch": 0.34540808657748384, + "flos": 19606113135360.0, + "grad_norm": 1.7637421386859293, + "language_loss": 0.8013972, + "learning_rate": 3.044075480787665e-06, + "loss": 0.82312948, + "num_input_tokens_seen": 123450980, + "router_z_loss_clip": 0.84277344, + "router_z_loss_mlp": 0.16674805, + "step": 5745, + "time_per_iteration": 2.4288904666900635 + }, + { + "auxiliary_loss_clip": 0.01155744, + "auxiliary_loss_mlp": 0.01041595, + "balance_loss_clip": 1.06321621, + "balance_loss_mlp": 1.02484655, + "epoch": 0.3454682098301518, + "flos": 20411661496320.0, + "grad_norm": 2.2176379931522003, + "language_loss": 0.8930887, + "learning_rate": 3.043743280407182e-06, + "loss": 0.91506213, + "num_input_tokens_seen": 123469365, + "router_z_loss_clip": 0.92382812, + "router_z_loss_mlp": 0.16748047, + "step": 5746, + "time_per_iteration": 2.441101312637329 + }, + { + "auxiliary_loss_clip": 0.01144472, + "auxiliary_loss_mlp": 0.01039203, + "balance_loss_clip": 1.05286121, + "balance_loss_mlp": 1.02275169, + "epoch": 0.34552833308281977, + "flos": 21325121291520.0, + "grad_norm": 2.2646412714721134, + "language_loss": 0.64247024, + "learning_rate": 3.043411040447849e-06, + "loss": 0.664307, + "num_input_tokens_seen": 123489425, + "router_z_loss_clip": 0.91601562, + "router_z_loss_mlp": 0.16455078, + "step": 5747, + "time_per_iteration": 2.4393796920776367 + }, + { + "auxiliary_loss_clip": 0.0114452, + "auxiliary_loss_mlp": 0.01040622, + "balance_loss_clip": 1.05506754, + "balance_loss_mlp": 1.02561378, + "epoch": 0.34558845633548774, + "flos": 36243633824640.0, + "grad_norm": 1.5773275743950135, + "language_loss": 0.72788286, + "learning_rate": 3.043078760922264e-06, + "loss": 0.74973428, + "num_input_tokens_seen": 123509970, + "router_z_loss_clip": 0.89355469, + "router_z_loss_mlp": 0.15014648, + "step": 5748, + "time_per_iteration": 2.5716662406921387 + }, + { + "auxiliary_loss_clip": 0.0113923, + "auxiliary_loss_mlp": 0.01032301, + "balance_loss_clip": 1.05587304, + "balance_loss_mlp": 1.01854444, + "epoch": 0.3456485795881557, + "flos": 22450561200000.0, + "grad_norm": 1.6044299549690215, + "language_loss": 0.75516087, + "learning_rate": 3.042746441843029e-06, + "loss": 0.77687621, + "num_input_tokens_seen": 123531055, + "router_z_loss_clip": 0.83349609, + "router_z_loss_mlp": 0.13757324, + "step": 5749, + "time_per_iteration": 2.4864394664764404 + }, + { + "auxiliary_loss_clip": 0.01080016, + "auxiliary_loss_mlp": 0.0101815, + "balance_loss_clip": 1.04912102, + "balance_loss_mlp": 1.01631081, + "epoch": 0.34570870284082367, + "flos": 62004299005440.0, + "grad_norm": 0.8869393544924012, + "language_loss": 0.62687576, + "learning_rate": 3.0424140832227437e-06, + "loss": 0.64785743, + "num_input_tokens_seen": 123584720, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.01837158, + "step": 5750, + "time_per_iteration": 2.9427947998046875 + }, + { + "auxiliary_loss_clip": 0.01136533, + "auxiliary_loss_mlp": 0.01029353, + "balance_loss_clip": 1.0539391, + "balance_loss_mlp": 1.01501226, + "epoch": 0.34576882609349163, + "flos": 22782196494720.0, + "grad_norm": 2.256053772868347, + "language_loss": 0.80742407, + "learning_rate": 3.042081685074012e-06, + "loss": 0.82908297, + "num_input_tokens_seen": 123604465, + "router_z_loss_clip": 0.82568359, + "router_z_loss_mlp": 0.14349365, + "step": 5751, + "time_per_iteration": 2.4175000190734863 + }, + { + "auxiliary_loss_clip": 0.01137048, + "auxiliary_loss_mlp": 0.01044588, + "balance_loss_clip": 1.0499084, + "balance_loss_mlp": 1.02934086, + "epoch": 0.34582894934615965, + "flos": 12348818576640.0, + "grad_norm": 2.506124979229315, + "language_loss": 0.84177411, + "learning_rate": 3.041749247409439e-06, + "loss": 0.86359042, + "num_input_tokens_seen": 123622320, + "router_z_loss_clip": 0.87255859, + "router_z_loss_mlp": 0.15258789, + "step": 5752, + "time_per_iteration": 2.445352792739868 + }, + { + "auxiliary_loss_clip": 0.01071713, + "auxiliary_loss_mlp": 0.01003069, + "balance_loss_clip": 1.04199839, + "balance_loss_mlp": 1.0012269, + "epoch": 0.3458890725988276, + "flos": 70167691071360.0, + "grad_norm": 0.7311490901906995, + "language_loss": 0.63089859, + "learning_rate": 3.0414167702416296e-06, + "loss": 0.65164638, + "num_input_tokens_seen": 123678010, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.0184021, + "step": 5753, + "time_per_iteration": 2.9561564922332764 + }, + { + "auxiliary_loss_clip": 0.01148872, + "auxiliary_loss_mlp": 0.01040062, + "balance_loss_clip": 1.06120718, + "balance_loss_mlp": 1.02437437, + "epoch": 0.3459491958514956, + "flos": 17092582093440.0, + "grad_norm": 2.443284275635767, + "language_loss": 0.70927429, + "learning_rate": 3.0410842535831914e-06, + "loss": 0.73116362, + "num_input_tokens_seen": 123696830, + "router_z_loss_clip": 0.87695312, + "router_z_loss_mlp": 0.15673828, + "step": 5754, + "time_per_iteration": 2.4611356258392334 + }, + { + "auxiliary_loss_clip": 0.01153099, + "auxiliary_loss_mlp": 0.01039338, + "balance_loss_clip": 1.059829, + "balance_loss_mlp": 1.02360249, + "epoch": 0.34600931910416355, + "flos": 16650952375680.0, + "grad_norm": 1.7401081608777071, + "language_loss": 0.72849345, + "learning_rate": 3.0407516974467343e-06, + "loss": 0.75041777, + "num_input_tokens_seen": 123714360, + "router_z_loss_clip": 0.93115234, + "router_z_loss_mlp": 0.15734863, + "step": 5755, + "time_per_iteration": 2.4375178813934326 + }, + { + "auxiliary_loss_clip": 0.01142214, + "auxiliary_loss_mlp": 0.01035457, + "balance_loss_clip": 1.05694246, + "balance_loss_mlp": 1.02040672, + "epoch": 0.3460694423568315, + "flos": 38546190334080.0, + "grad_norm": 1.444178306431455, + "language_loss": 0.72360313, + "learning_rate": 3.040419101844869e-06, + "loss": 0.74537992, + "num_input_tokens_seen": 123739250, + "router_z_loss_clip": 0.85253906, + "router_z_loss_mlp": 0.15039062, + "step": 5756, + "time_per_iteration": 4.335761308670044 + }, + { + "auxiliary_loss_clip": 0.01072583, + "auxiliary_loss_mlp": 0.01009681, + "balance_loss_clip": 1.04303312, + "balance_loss_mlp": 1.00766051, + "epoch": 0.3461295656094995, + "flos": 72081479704320.0, + "grad_norm": 0.713624574776363, + "language_loss": 0.62551534, + "learning_rate": 3.040086466790207e-06, + "loss": 0.64633793, + "num_input_tokens_seen": 123802845, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 0.02020264, + "step": 5757, + "time_per_iteration": 3.067472457885742 + }, + { + "auxiliary_loss_clip": 0.01082684, + "auxiliary_loss_mlp": 0.01007172, + "balance_loss_clip": 1.05339992, + "balance_loss_mlp": 1.00536883, + "epoch": 0.34618968886216744, + "flos": 65460089571840.0, + "grad_norm": 0.8199611695672643, + "language_loss": 0.59249204, + "learning_rate": 3.039753792295362e-06, + "loss": 0.61339056, + "num_input_tokens_seen": 123861805, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 0.01803589, + "step": 5758, + "time_per_iteration": 3.0659677982330322 + }, + { + "auxiliary_loss_clip": 0.01139779, + "auxiliary_loss_mlp": 0.01053323, + "balance_loss_clip": 1.05363107, + "balance_loss_mlp": 1.03793335, + "epoch": 0.3462498121148354, + "flos": 23472542960640.0, + "grad_norm": 1.6254095329011957, + "language_loss": 0.71507275, + "learning_rate": 3.0394210783729487e-06, + "loss": 0.7370038, + "num_input_tokens_seen": 123881820, + "router_z_loss_clip": 0.86132812, + "router_z_loss_mlp": 0.15393066, + "step": 5759, + "time_per_iteration": 2.474591016769409 + }, + { + "auxiliary_loss_clip": 0.01143509, + "auxiliary_loss_mlp": 0.01059138, + "balance_loss_clip": 1.05570185, + "balance_loss_mlp": 1.04172194, + "epoch": 0.3463099353675034, + "flos": 24170790418560.0, + "grad_norm": 2.0091761285776104, + "language_loss": 0.83755583, + "learning_rate": 3.0390883250355836e-06, + "loss": 0.8595823, + "num_input_tokens_seen": 123903700, + "router_z_loss_clip": 0.87695312, + "router_z_loss_mlp": 0.17407227, + "step": 5760, + "time_per_iteration": 2.4606685638427734 + }, + { + "auxiliary_loss_clip": 0.01064624, + "auxiliary_loss_mlp": 0.01011175, + "balance_loss_clip": 1.03525639, + "balance_loss_mlp": 1.00947046, + "epoch": 0.34637005862017134, + "flos": 63700609766400.0, + "grad_norm": 0.8227863659093394, + "language_loss": 0.5650115, + "learning_rate": 3.0387555322958865e-06, + "loss": 0.58576953, + "num_input_tokens_seen": 123960075, + "router_z_loss_clip": 0.29345703, + "router_z_loss_mlp": 0.01708984, + "step": 5761, + "time_per_iteration": 3.1467506885528564 + }, + { + "auxiliary_loss_clip": 0.01135314, + "auxiliary_loss_mlp": 0.01045734, + "balance_loss_clip": 1.05033636, + "balance_loss_mlp": 1.03045106, + "epoch": 0.3464301818728393, + "flos": 13145532192000.0, + "grad_norm": 2.5798238734712093, + "language_loss": 0.95332283, + "learning_rate": 3.038422700166474e-06, + "loss": 0.9751333, + "num_input_tokens_seen": 123975805, + "router_z_loss_clip": 0.84960938, + "router_z_loss_mlp": 0.15283203, + "step": 5762, + "time_per_iteration": 2.4445598125457764 + }, + { + "auxiliary_loss_clip": 0.01149629, + "auxiliary_loss_mlp": 0.01046104, + "balance_loss_clip": 1.06084037, + "balance_loss_mlp": 1.02887821, + "epoch": 0.34649030512550727, + "flos": 29315173299840.0, + "grad_norm": 1.6562589457859807, + "language_loss": 0.69824064, + "learning_rate": 3.0380898286599692e-06, + "loss": 0.72019798, + "num_input_tokens_seen": 123997530, + "router_z_loss_clip": 0.88720703, + "router_z_loss_mlp": 0.17224121, + "step": 5763, + "time_per_iteration": 2.5031981468200684 + }, + { + "auxiliary_loss_clip": 0.01153251, + "auxiliary_loss_mlp": 0.01047298, + "balance_loss_clip": 1.06033278, + "balance_loss_mlp": 1.03002501, + "epoch": 0.34655042837817523, + "flos": 23730884553600.0, + "grad_norm": 1.6456379383516888, + "language_loss": 0.83863044, + "learning_rate": 3.0377569177889945e-06, + "loss": 0.86063588, + "num_input_tokens_seen": 124016375, + "router_z_loss_clip": 0.92919922, + "router_z_loss_mlp": 0.17285156, + "step": 5764, + "time_per_iteration": 2.5371153354644775 + }, + { + "auxiliary_loss_clip": 0.01142131, + "auxiliary_loss_mlp": 0.01036378, + "balance_loss_clip": 1.05629635, + "balance_loss_mlp": 1.02173305, + "epoch": 0.34661055163084326, + "flos": 22054215553920.0, + "grad_norm": 2.3191170035034596, + "language_loss": 0.67290485, + "learning_rate": 3.0374239675661722e-06, + "loss": 0.69468993, + "num_input_tokens_seen": 124033975, + "router_z_loss_clip": 0.85791016, + "router_z_loss_mlp": 0.14642334, + "step": 5765, + "time_per_iteration": 2.465261697769165 + }, + { + "auxiliary_loss_clip": 0.01151166, + "auxiliary_loss_mlp": 0.01039088, + "balance_loss_clip": 1.06416082, + "balance_loss_mlp": 1.02390063, + "epoch": 0.3466706748835112, + "flos": 21799213925760.0, + "grad_norm": 1.7911143297280585, + "language_loss": 0.77067161, + "learning_rate": 3.03709097800413e-06, + "loss": 0.79257417, + "num_input_tokens_seen": 124051930, + "router_z_loss_clip": 0.87011719, + "router_z_loss_mlp": 0.15209961, + "step": 5766, + "time_per_iteration": 2.5608928203582764 + }, + { + "auxiliary_loss_clip": 0.01138766, + "auxiliary_loss_mlp": 0.01034989, + "balance_loss_clip": 1.05308175, + "balance_loss_mlp": 1.02069545, + "epoch": 0.3467307981361792, + "flos": 19461680547840.0, + "grad_norm": 1.4800237088148818, + "language_loss": 0.73580587, + "learning_rate": 3.0367579491154943e-06, + "loss": 0.75754333, + "num_input_tokens_seen": 124071220, + "router_z_loss_clip": 0.85644531, + "router_z_loss_mlp": 0.14312744, + "step": 5767, + "time_per_iteration": 3.8450911045074463 + }, + { + "auxiliary_loss_clip": 0.011507, + "auxiliary_loss_mlp": 0.01038069, + "balance_loss_clip": 1.06134152, + "balance_loss_mlp": 1.02163029, + "epoch": 0.34679092138884715, + "flos": 24827452905600.0, + "grad_norm": 1.9108226414895852, + "language_loss": 0.78060991, + "learning_rate": 3.036424880912893e-06, + "loss": 0.80249763, + "num_input_tokens_seen": 124090140, + "router_z_loss_clip": 0.89404297, + "router_z_loss_mlp": 0.16430664, + "step": 5768, + "time_per_iteration": 2.5806896686553955 + }, + { + "auxiliary_loss_clip": 0.01062037, + "auxiliary_loss_mlp": 0.0101071, + "balance_loss_clip": 1.03302455, + "balance_loss_mlp": 1.00896358, + "epoch": 0.3468510446415151, + "flos": 63236070149760.0, + "grad_norm": 0.7725045717398928, + "language_loss": 0.57497936, + "learning_rate": 3.036091773408956e-06, + "loss": 0.59570682, + "num_input_tokens_seen": 124152025, + "router_z_loss_clip": 0.29052734, + "router_z_loss_mlp": 0.01748657, + "step": 5769, + "time_per_iteration": 3.3680787086486816 + }, + { + "auxiliary_loss_clip": 0.01153128, + "auxiliary_loss_mlp": 0.01040368, + "balance_loss_clip": 1.05563498, + "balance_loss_mlp": 1.02257013, + "epoch": 0.3469111678941831, + "flos": 12120713256960.0, + "grad_norm": 2.962463180960872, + "language_loss": 0.85950947, + "learning_rate": 3.0357586266163154e-06, + "loss": 0.88144445, + "num_input_tokens_seen": 124165795, + "router_z_loss_clip": 0.97363281, + "router_z_loss_mlp": 0.17785645, + "step": 5770, + "time_per_iteration": 3.7969613075256348 + }, + { + "auxiliary_loss_clip": 0.01065732, + "auxiliary_loss_mlp": 0.01006093, + "balance_loss_clip": 1.03623259, + "balance_loss_mlp": 1.00422192, + "epoch": 0.34697129114685105, + "flos": 65934110378880.0, + "grad_norm": 0.7681739964380128, + "language_loss": 0.59861839, + "learning_rate": 3.0354254405476036e-06, + "loss": 0.61933666, + "num_input_tokens_seen": 124222925, + "router_z_loss_clip": 0.29541016, + "router_z_loss_mlp": 0.01867676, + "step": 5771, + "time_per_iteration": 2.8644707202911377 + }, + { + "auxiliary_loss_clip": 0.01145495, + "auxiliary_loss_mlp": 0.01044105, + "balance_loss_clip": 1.05861986, + "balance_loss_mlp": 1.02937675, + "epoch": 0.347031414399519, + "flos": 34454205054720.0, + "grad_norm": 1.7355600458004812, + "language_loss": 0.71932125, + "learning_rate": 3.0350922152154557e-06, + "loss": 0.74121726, + "num_input_tokens_seen": 124240915, + "router_z_loss_clip": 0.87011719, + "router_z_loss_mlp": 0.14727783, + "step": 5772, + "time_per_iteration": 2.540659189224243 + }, + { + "auxiliary_loss_clip": 0.0113827, + "auxiliary_loss_mlp": 0.01039203, + "balance_loss_clip": 1.05031168, + "balance_loss_mlp": 1.02367043, + "epoch": 0.347091537652187, + "flos": 26944135511040.0, + "grad_norm": 1.4167261774556743, + "language_loss": 0.76248384, + "learning_rate": 3.034758950632507e-06, + "loss": 0.7842586, + "num_input_tokens_seen": 124262770, + "router_z_loss_clip": 0.87988281, + "router_z_loss_mlp": 0.15539551, + "step": 5773, + "time_per_iteration": 2.575561046600342 + }, + { + "auxiliary_loss_clip": 0.01149846, + "auxiliary_loss_mlp": 0.01046887, + "balance_loss_clip": 1.05660558, + "balance_loss_mlp": 1.03044796, + "epoch": 0.34715166090485494, + "flos": 21142228216320.0, + "grad_norm": 2.238332405535623, + "language_loss": 0.70340139, + "learning_rate": 3.034425646811396e-06, + "loss": 0.72536874, + "num_input_tokens_seen": 124280950, + "router_z_loss_clip": 0.93261719, + "router_z_loss_mlp": 0.16442871, + "step": 5774, + "time_per_iteration": 2.4068095684051514 + }, + { + "auxiliary_loss_clip": 0.01138181, + "auxiliary_loss_mlp": 0.01040316, + "balance_loss_clip": 1.05393863, + "balance_loss_mlp": 1.02586746, + "epoch": 0.3472117841575229, + "flos": 23478001827840.0, + "grad_norm": 1.647296611393969, + "language_loss": 0.76083094, + "learning_rate": 3.0340923037647602e-06, + "loss": 0.7826159, + "num_input_tokens_seen": 124299540, + "router_z_loss_clip": 0.84277344, + "router_z_loss_mlp": 0.14465332, + "step": 5775, + "time_per_iteration": 2.463754892349243 + }, + { + "auxiliary_loss_clip": 0.0114201, + "auxiliary_loss_mlp": 0.01045706, + "balance_loss_clip": 1.05091584, + "balance_loss_mlp": 1.02846229, + "epoch": 0.34727190741019087, + "flos": 17492806408320.0, + "grad_norm": 2.491978142403422, + "language_loss": 0.77249813, + "learning_rate": 3.0337589215052404e-06, + "loss": 0.7943753, + "num_input_tokens_seen": 124316285, + "router_z_loss_clip": 0.91162109, + "router_z_loss_mlp": 0.17254639, + "step": 5776, + "time_per_iteration": 2.402470111846924 + }, + { + "auxiliary_loss_clip": 0.01076651, + "auxiliary_loss_mlp": 0.01007822, + "balance_loss_clip": 1.04807293, + "balance_loss_mlp": 1.00595641, + "epoch": 0.34733203066285884, + "flos": 65265491640960.0, + "grad_norm": 0.8378319117241327, + "language_loss": 0.63362265, + "learning_rate": 3.033425500045478e-06, + "loss": 0.6544674, + "num_input_tokens_seen": 124376650, + "router_z_loss_clip": 0.28613281, + "router_z_loss_mlp": 0.01864624, + "step": 5777, + "time_per_iteration": 4.5294482707977295 + }, + { + "auxiliary_loss_clip": 0.01139212, + "auxiliary_loss_mlp": 0.01044232, + "balance_loss_clip": 1.05080247, + "balance_loss_mlp": 1.02825797, + "epoch": 0.3473921539155268, + "flos": 28658726294400.0, + "grad_norm": 2.0251918132479934, + "language_loss": 0.65112245, + "learning_rate": 3.033092039398119e-06, + "loss": 0.67295688, + "num_input_tokens_seen": 124396475, + "router_z_loss_clip": 0.88378906, + "router_z_loss_mlp": 0.15966797, + "step": 5778, + "time_per_iteration": 2.596310615539551 + }, + { + "auxiliary_loss_clip": 0.01139497, + "auxiliary_loss_mlp": 0.01049787, + "balance_loss_clip": 1.0503583, + "balance_loss_mlp": 1.03427804, + "epoch": 0.3474522771681948, + "flos": 40836895355520.0, + "grad_norm": 1.8216248375219493, + "language_loss": 0.7156992, + "learning_rate": 3.0327585395758046e-06, + "loss": 0.73759198, + "num_input_tokens_seen": 124416480, + "router_z_loss_clip": 0.89111328, + "router_z_loss_mlp": 0.1550293, + "step": 5779, + "time_per_iteration": 2.6117067337036133 + }, + { + "auxiliary_loss_clip": 0.01143121, + "auxiliary_loss_mlp": 0.01041468, + "balance_loss_clip": 1.05203891, + "balance_loss_mlp": 1.02607822, + "epoch": 0.3475124004208628, + "flos": 24608577381120.0, + "grad_norm": 1.901182902434772, + "language_loss": 0.6217196, + "learning_rate": 3.0324250005911837e-06, + "loss": 0.64356554, + "num_input_tokens_seen": 124435950, + "router_z_loss_clip": 0.90966797, + "router_z_loss_mlp": 0.15380859, + "step": 5780, + "time_per_iteration": 2.6277012825012207 + }, + { + "auxiliary_loss_clip": 0.01135325, + "auxiliary_loss_mlp": 0.01040623, + "balance_loss_clip": 1.04891551, + "balance_loss_mlp": 1.02597857, + "epoch": 0.34757252367353075, + "flos": 22711309004160.0, + "grad_norm": 1.7310763135588523, + "language_loss": 0.7177099, + "learning_rate": 3.0320914224569033e-06, + "loss": 0.73946935, + "num_input_tokens_seen": 124455410, + "router_z_loss_clip": 0.86376953, + "router_z_loss_mlp": 0.14642334, + "step": 5781, + "time_per_iteration": 2.770644426345825 + }, + { + "auxiliary_loss_clip": 0.01142823, + "auxiliary_loss_mlp": 0.01054154, + "balance_loss_clip": 1.05266583, + "balance_loss_mlp": 1.03577769, + "epoch": 0.3476326469261987, + "flos": 19828184970240.0, + "grad_norm": 2.2788984375980967, + "language_loss": 0.77271128, + "learning_rate": 3.031757805185612e-06, + "loss": 0.79468101, + "num_input_tokens_seen": 124474870, + "router_z_loss_clip": 0.90039062, + "router_z_loss_mlp": 0.18365479, + "step": 5782, + "time_per_iteration": 2.4412529468536377 + }, + { + "auxiliary_loss_clip": 0.01148871, + "auxiliary_loss_mlp": 0.01038513, + "balance_loss_clip": 1.0618484, + "balance_loss_mlp": 1.02240229, + "epoch": 0.3476927701788667, + "flos": 19938107566080.0, + "grad_norm": 2.104719813726109, + "language_loss": 0.62319744, + "learning_rate": 3.0314241487899622e-06, + "loss": 0.64507133, + "num_input_tokens_seen": 124494105, + "router_z_loss_clip": 0.87060547, + "router_z_loss_mlp": 0.16113281, + "step": 5783, + "time_per_iteration": 2.421504020690918 + }, + { + "auxiliary_loss_clip": 0.01139004, + "auxiliary_loss_mlp": 0.0104214, + "balance_loss_clip": 1.05443954, + "balance_loss_mlp": 1.02567744, + "epoch": 0.34775289343153465, + "flos": 20735108490240.0, + "grad_norm": 1.7004421019859597, + "language_loss": 0.88533974, + "learning_rate": 3.031090453282605e-06, + "loss": 0.90715122, + "num_input_tokens_seen": 124512030, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.16467285, + "step": 5784, + "time_per_iteration": 2.4557294845581055 + }, + { + "auxiliary_loss_clip": 0.01154517, + "auxiliary_loss_mlp": 0.01035372, + "balance_loss_clip": 1.06820202, + "balance_loss_mlp": 1.02015448, + "epoch": 0.3478130166842026, + "flos": 19354846521600.0, + "grad_norm": 2.0325564677156254, + "language_loss": 0.81779277, + "learning_rate": 3.0307567186761946e-06, + "loss": 0.8396917, + "num_input_tokens_seen": 124530980, + "router_z_loss_clip": 0.86279297, + "router_z_loss_mlp": 0.15228271, + "step": 5785, + "time_per_iteration": 2.459528923034668 + }, + { + "auxiliary_loss_clip": 0.01145741, + "auxiliary_loss_mlp": 0.0103454, + "balance_loss_clip": 1.05870712, + "balance_loss_mlp": 1.01993656, + "epoch": 0.3478731399368706, + "flos": 22051198811520.0, + "grad_norm": 1.7251209986662503, + "language_loss": 0.80291396, + "learning_rate": 3.0304229449833862e-06, + "loss": 0.82471681, + "num_input_tokens_seen": 124549330, + "router_z_loss_clip": 0.87158203, + "router_z_loss_mlp": 0.14624023, + "step": 5786, + "time_per_iteration": 2.4419422149658203 + }, + { + "auxiliary_loss_clip": 0.01145447, + "auxiliary_loss_mlp": 0.01036007, + "balance_loss_clip": 1.0583477, + "balance_loss_mlp": 1.02021217, + "epoch": 0.34793326318953854, + "flos": 18041449720320.0, + "grad_norm": 1.5935887009997496, + "language_loss": 0.74635208, + "learning_rate": 3.030089132216836e-06, + "loss": 0.76816654, + "num_input_tokens_seen": 124567200, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.15795898, + "step": 5787, + "time_per_iteration": 2.390232563018799 + }, + { + "auxiliary_loss_clip": 0.01148432, + "auxiliary_loss_mlp": 0.01044124, + "balance_loss_clip": 1.05804503, + "balance_loss_mlp": 1.02925897, + "epoch": 0.3479933864422065, + "flos": 29314670509440.0, + "grad_norm": 1.948181232740787, + "language_loss": 0.81287038, + "learning_rate": 3.029755280389203e-06, + "loss": 0.83479595, + "num_input_tokens_seen": 124587025, + "router_z_loss_clip": 0.90380859, + "router_z_loss_mlp": 0.14886475, + "step": 5788, + "time_per_iteration": 2.553356647491455 + }, + { + "auxiliary_loss_clip": 0.0115264, + "auxiliary_loss_mlp": 0.01040493, + "balance_loss_clip": 1.06036603, + "balance_loss_mlp": 1.02449524, + "epoch": 0.3480535096948745, + "flos": 20120713332480.0, + "grad_norm": 2.7750626913571916, + "language_loss": 0.85821295, + "learning_rate": 3.029421389513147e-06, + "loss": 0.8801443, + "num_input_tokens_seen": 124605860, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.15991211, + "step": 5789, + "time_per_iteration": 2.414586305618286 + }, + { + "auxiliary_loss_clip": 0.01150334, + "auxiliary_loss_mlp": 0.01051003, + "balance_loss_clip": 1.05843306, + "balance_loss_mlp": 1.03535068, + "epoch": 0.34811363294754244, + "flos": 18548974938240.0, + "grad_norm": 1.8788685646178052, + "language_loss": 0.84823507, + "learning_rate": 3.029087459601328e-06, + "loss": 0.87024844, + "num_input_tokens_seen": 124624270, + "router_z_loss_clip": 0.91845703, + "router_z_loss_mlp": 0.15649414, + "step": 5790, + "time_per_iteration": 2.4204046726226807 + }, + { + "auxiliary_loss_clip": 0.01149746, + "auxiliary_loss_mlp": 0.01042516, + "balance_loss_clip": 1.06214368, + "balance_loss_mlp": 1.02682757, + "epoch": 0.3481737562002104, + "flos": 26870303105280.0, + "grad_norm": 6.701047853207177, + "language_loss": 0.81123137, + "learning_rate": 3.0287534906664097e-06, + "loss": 0.83315396, + "num_input_tokens_seen": 124644005, + "router_z_loss_clip": 0.87597656, + "router_z_loss_mlp": 0.15673828, + "step": 5791, + "time_per_iteration": 2.467336893081665 + }, + { + "auxiliary_loss_clip": 0.01146817, + "auxiliary_loss_mlp": 0.01037192, + "balance_loss_clip": 1.05708504, + "balance_loss_mlp": 1.0214808, + "epoch": 0.3482338794528784, + "flos": 28908664104960.0, + "grad_norm": 1.8745101278028045, + "language_loss": 0.77895594, + "learning_rate": 3.028419482721056e-06, + "loss": 0.80079603, + "num_input_tokens_seen": 124663020, + "router_z_loss_clip": 0.89746094, + "router_z_loss_mlp": 0.15698242, + "step": 5792, + "time_per_iteration": 2.5074265003204346 + }, + { + "auxiliary_loss_clip": 0.01149758, + "auxiliary_loss_mlp": 0.01031747, + "balance_loss_clip": 1.06171393, + "balance_loss_mlp": 1.01700139, + "epoch": 0.3482940027055464, + "flos": 22200767043840.0, + "grad_norm": 1.6291771941502087, + "language_loss": 0.81361616, + "learning_rate": 3.0280854357779325e-06, + "loss": 0.83543128, + "num_input_tokens_seen": 124682975, + "router_z_loss_clip": 0.88134766, + "router_z_loss_mlp": 0.14746094, + "step": 5793, + "time_per_iteration": 2.4300291538238525 + }, + { + "auxiliary_loss_clip": 0.01159191, + "auxiliary_loss_mlp": 0.01051075, + "balance_loss_clip": 1.06506133, + "balance_loss_mlp": 1.03464198, + "epoch": 0.34835412595821436, + "flos": 20302708567680.0, + "grad_norm": 2.1369604520269383, + "language_loss": 0.76204669, + "learning_rate": 3.027751349849706e-06, + "loss": 0.78414935, + "num_input_tokens_seen": 124701340, + "router_z_loss_clip": 0.94189453, + "router_z_loss_mlp": 0.16436768, + "step": 5794, + "time_per_iteration": 2.4512577056884766 + }, + { + "auxiliary_loss_clip": 0.01147106, + "auxiliary_loss_mlp": 0.01039904, + "balance_loss_clip": 1.06093824, + "balance_loss_mlp": 1.02497911, + "epoch": 0.3484142492108823, + "flos": 20449691020800.0, + "grad_norm": 2.689887317986063, + "language_loss": 0.57213449, + "learning_rate": 3.0274172249490456e-06, + "loss": 0.59400463, + "num_input_tokens_seen": 124719165, + "router_z_loss_clip": 0.86083984, + "router_z_loss_mlp": 0.14923096, + "step": 5795, + "time_per_iteration": 2.8134469985961914 + }, + { + "auxiliary_loss_clip": 0.01145535, + "auxiliary_loss_mlp": 0.01035019, + "balance_loss_clip": 1.05826604, + "balance_loss_mlp": 1.02010632, + "epoch": 0.3484743724635503, + "flos": 24352929308160.0, + "grad_norm": 1.6646029939021851, + "language_loss": 0.82315969, + "learning_rate": 3.0270830610886213e-06, + "loss": 0.84496522, + "num_input_tokens_seen": 124738670, + "router_z_loss_clip": 0.87353516, + "router_z_loss_mlp": 0.14916992, + "step": 5796, + "time_per_iteration": 2.4575729370117188 + }, + { + "auxiliary_loss_clip": 0.0113375, + "auxiliary_loss_mlp": 0.01029829, + "balance_loss_clip": 1.05113029, + "balance_loss_mlp": 1.01555943, + "epoch": 0.34853449571621825, + "flos": 24353001135360.0, + "grad_norm": 1.8292535139276398, + "language_loss": 0.83974993, + "learning_rate": 3.0267488582811033e-06, + "loss": 0.86138576, + "num_input_tokens_seen": 124758760, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.1427002, + "step": 5797, + "time_per_iteration": 2.4653337001800537 + }, + { + "auxiliary_loss_clip": 0.01136747, + "auxiliary_loss_mlp": 0.01038992, + "balance_loss_clip": 1.05167127, + "balance_loss_mlp": 1.0235548, + "epoch": 0.3485946189688862, + "flos": 27267690245760.0, + "grad_norm": 1.7480366348547376, + "language_loss": 0.73361963, + "learning_rate": 3.026414616539167e-06, + "loss": 0.75537705, + "num_input_tokens_seen": 124777765, + "router_z_loss_clip": 0.84960938, + "router_z_loss_mlp": 0.15435791, + "step": 5798, + "time_per_iteration": 2.511367082595825 + }, + { + "auxiliary_loss_clip": 0.01145541, + "auxiliary_loss_mlp": 0.01052135, + "balance_loss_clip": 1.05704391, + "balance_loss_mlp": 1.03381252, + "epoch": 0.3486547422215542, + "flos": 20156695781760.0, + "grad_norm": 2.1417790476447593, + "language_loss": 0.76328802, + "learning_rate": 3.026080335875485e-06, + "loss": 0.78526473, + "num_input_tokens_seen": 124796775, + "router_z_loss_clip": 0.88525391, + "router_z_loss_mlp": 0.18322754, + "step": 5799, + "time_per_iteration": 3.8783576488494873 + }, + { + "auxiliary_loss_clip": 0.01145657, + "auxiliary_loss_mlp": 0.01040742, + "balance_loss_clip": 1.0583806, + "balance_loss_mlp": 1.02662146, + "epoch": 0.34871486547422215, + "flos": 20230348619520.0, + "grad_norm": 3.1989903714815413, + "language_loss": 0.75668061, + "learning_rate": 3.025746016302734e-06, + "loss": 0.77854455, + "num_input_tokens_seen": 124815825, + "router_z_loss_clip": 0.87255859, + "router_z_loss_mlp": 0.14117432, + "step": 5800, + "time_per_iteration": 2.4695940017700195 + }, + { + "auxiliary_loss_clip": 0.01161296, + "auxiliary_loss_mlp": 0.01037122, + "balance_loss_clip": 1.06805348, + "balance_loss_mlp": 1.02106428, + "epoch": 0.3487749887268901, + "flos": 44053234882560.0, + "grad_norm": 1.6987721709384578, + "language_loss": 0.67264628, + "learning_rate": 3.025411657833591e-06, + "loss": 0.6946305, + "num_input_tokens_seen": 124838420, + "router_z_loss_clip": 0.93261719, + "router_z_loss_mlp": 0.16052246, + "step": 5801, + "time_per_iteration": 2.6996614933013916 + }, + { + "auxiliary_loss_clip": 0.01144795, + "auxiliary_loss_mlp": 0.01043359, + "balance_loss_clip": 1.05950189, + "balance_loss_mlp": 1.0287683, + "epoch": 0.3488351119795581, + "flos": 23295144666240.0, + "grad_norm": 1.9952008827021699, + "language_loss": 0.76519954, + "learning_rate": 3.025077260480735e-06, + "loss": 0.78708112, + "num_input_tokens_seen": 124857320, + "router_z_loss_clip": 0.85400391, + "router_z_loss_mlp": 0.14611816, + "step": 5802, + "time_per_iteration": 2.4481050968170166 + }, + { + "auxiliary_loss_clip": 0.01140873, + "auxiliary_loss_mlp": 0.01041411, + "balance_loss_clip": 1.05748022, + "balance_loss_mlp": 1.02633119, + "epoch": 0.34889523523222604, + "flos": 19934839428480.0, + "grad_norm": 4.201724737449475, + "language_loss": 0.79421127, + "learning_rate": 3.0247428242568474e-06, + "loss": 0.81603414, + "num_input_tokens_seen": 124875685, + "router_z_loss_clip": 0.83251953, + "router_z_loss_mlp": 0.15087891, + "step": 5803, + "time_per_iteration": 2.4534974098205566 + }, + { + "auxiliary_loss_clip": 0.01145239, + "auxiliary_loss_mlp": 0.01036746, + "balance_loss_clip": 1.05597162, + "balance_loss_mlp": 1.02202368, + "epoch": 0.348955358484894, + "flos": 30446179816320.0, + "grad_norm": 1.973550442424928, + "language_loss": 0.6798526, + "learning_rate": 3.0244083491746085e-06, + "loss": 0.70167249, + "num_input_tokens_seen": 124895960, + "router_z_loss_clip": 0.89306641, + "router_z_loss_mlp": 0.1472168, + "step": 5804, + "time_per_iteration": 2.524277448654175 + }, + { + "auxiliary_loss_clip": 0.01140152, + "auxiliary_loss_mlp": 0.0104147, + "balance_loss_clip": 1.05580902, + "balance_loss_mlp": 1.02661026, + "epoch": 0.349015481737562, + "flos": 17999972490240.0, + "grad_norm": 1.98753235525373, + "language_loss": 0.76127148, + "learning_rate": 3.024073835246702e-06, + "loss": 0.78308773, + "num_input_tokens_seen": 124914140, + "router_z_loss_clip": 0.84326172, + "router_z_loss_mlp": 0.14855957, + "step": 5805, + "time_per_iteration": 2.4491493701934814 + }, + { + "auxiliary_loss_clip": 0.01142924, + "auxiliary_loss_mlp": 0.01038891, + "balance_loss_clip": 1.05686641, + "balance_loss_mlp": 1.02342916, + "epoch": 0.34907560499023, + "flos": 27198490694400.0, + "grad_norm": 2.065281113393938, + "language_loss": 0.67398763, + "learning_rate": 3.023739282485814e-06, + "loss": 0.69580579, + "num_input_tokens_seen": 124934180, + "router_z_loss_clip": 0.86083984, + "router_z_loss_mlp": 0.15460205, + "step": 5806, + "time_per_iteration": 2.473522186279297 + }, + { + "auxiliary_loss_clip": 0.01149761, + "auxiliary_loss_mlp": 0.0103835, + "balance_loss_clip": 1.06220794, + "balance_loss_mlp": 1.02341938, + "epoch": 0.34913572824289796, + "flos": 30226873328640.0, + "grad_norm": 1.589619475031644, + "language_loss": 0.71821785, + "learning_rate": 3.023404690904629e-06, + "loss": 0.74009895, + "num_input_tokens_seen": 124956060, + "router_z_loss_clip": 0.87548828, + "router_z_loss_mlp": 0.14929199, + "step": 5807, + "time_per_iteration": 2.5366549491882324 + }, + { + "auxiliary_loss_clip": 0.01141601, + "auxiliary_loss_mlp": 0.01040868, + "balance_loss_clip": 1.05209041, + "balance_loss_mlp": 1.02442861, + "epoch": 0.3491958514955659, + "flos": 29971907614080.0, + "grad_norm": 1.867264872658206, + "language_loss": 0.73730314, + "learning_rate": 3.0230700605158364e-06, + "loss": 0.75912786, + "num_input_tokens_seen": 124976070, + "router_z_loss_clip": 0.89453125, + "router_z_loss_mlp": 0.16418457, + "step": 5808, + "time_per_iteration": 2.9009156227111816 + }, + { + "auxiliary_loss_clip": 0.01153246, + "auxiliary_loss_mlp": 0.01043652, + "balance_loss_clip": 1.07088947, + "balance_loss_mlp": 1.02865541, + "epoch": 0.3492559747482339, + "flos": 22783273902720.0, + "grad_norm": 1.5083918770836358, + "language_loss": 0.84351456, + "learning_rate": 3.0227353913321238e-06, + "loss": 0.86548352, + "num_input_tokens_seen": 124996995, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.14996338, + "step": 5809, + "time_per_iteration": 2.496373414993286 + }, + { + "auxiliary_loss_clip": 0.01140209, + "auxiliary_loss_mlp": 0.01035226, + "balance_loss_clip": 1.05835009, + "balance_loss_mlp": 1.0212189, + "epoch": 0.34931609800090185, + "flos": 26068022881920.0, + "grad_norm": 1.9622745510422346, + "language_loss": 0.80641294, + "learning_rate": 3.0224006833661835e-06, + "loss": 0.82816732, + "num_input_tokens_seen": 125015600, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.14013672, + "step": 5810, + "time_per_iteration": 3.859407901763916 + }, + { + "auxiliary_loss_clip": 0.01142264, + "auxiliary_loss_mlp": 0.0104591, + "balance_loss_clip": 1.0557462, + "balance_loss_mlp": 1.03145552, + "epoch": 0.3493762212535698, + "flos": 29242023252480.0, + "grad_norm": 1.6814574166900633, + "language_loss": 0.75152326, + "learning_rate": 3.0220659366307057e-06, + "loss": 0.77340496, + "num_input_tokens_seen": 125035290, + "router_z_loss_clip": 0.86474609, + "router_z_loss_mlp": 0.14459229, + "step": 5811, + "time_per_iteration": 2.493598699569702 + }, + { + "auxiliary_loss_clip": 0.01137086, + "auxiliary_loss_mlp": 0.01037715, + "balance_loss_clip": 1.05004179, + "balance_loss_mlp": 1.022856, + "epoch": 0.3494363445062378, + "flos": 27126058919040.0, + "grad_norm": 1.4879449827673115, + "language_loss": 0.80044997, + "learning_rate": 3.021731151138386e-06, + "loss": 0.82219803, + "num_input_tokens_seen": 125057130, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.14862061, + "step": 5812, + "time_per_iteration": 2.531526803970337 + }, + { + "auxiliary_loss_clip": 0.01147388, + "auxiliary_loss_mlp": 0.01040248, + "balance_loss_clip": 1.05683565, + "balance_loss_mlp": 1.02457213, + "epoch": 0.34949646775890575, + "flos": 12276207233280.0, + "grad_norm": 1.9695105538548188, + "language_loss": 0.69841242, + "learning_rate": 3.021396326901918e-06, + "loss": 0.72028887, + "num_input_tokens_seen": 125073720, + "router_z_loss_clip": 0.90478516, + "router_z_loss_mlp": 0.15679932, + "step": 5813, + "time_per_iteration": 2.4197137355804443 + }, + { + "auxiliary_loss_clip": 0.0113082, + "auxiliary_loss_mlp": 0.01035083, + "balance_loss_clip": 1.04701352, + "balance_loss_mlp": 1.02003849, + "epoch": 0.3495565910115737, + "flos": 17165516659200.0, + "grad_norm": 1.9331608672224336, + "language_loss": 0.76614124, + "learning_rate": 3.0210614639339998e-06, + "loss": 0.78780025, + "num_input_tokens_seen": 125090635, + "router_z_loss_clip": 0.83740234, + "router_z_loss_mlp": 0.15039062, + "step": 5814, + "time_per_iteration": 3.9047601222991943 + }, + { + "auxiliary_loss_clip": 0.0114682, + "auxiliary_loss_mlp": 0.01041192, + "balance_loss_clip": 1.05989218, + "balance_loss_mlp": 1.02487183, + "epoch": 0.3496167142642417, + "flos": 26465661417600.0, + "grad_norm": 1.6056651717897914, + "language_loss": 0.8454529, + "learning_rate": 3.020726562247328e-06, + "loss": 0.86733305, + "num_input_tokens_seen": 125110070, + "router_z_loss_clip": 0.86962891, + "router_z_loss_mlp": 0.16320801, + "step": 5815, + "time_per_iteration": 2.4661154747009277 + }, + { + "auxiliary_loss_clip": 0.01135836, + "auxiliary_loss_mlp": 0.01036561, + "balance_loss_clip": 1.04901922, + "balance_loss_mlp": 1.02255976, + "epoch": 0.34967683751690964, + "flos": 17414843938560.0, + "grad_norm": 2.505533779120067, + "language_loss": 0.77393001, + "learning_rate": 3.0203916218546024e-06, + "loss": 0.79565394, + "num_input_tokens_seen": 125125730, + "router_z_loss_clip": 0.86865234, + "router_z_loss_mlp": 0.14013672, + "step": 5816, + "time_per_iteration": 2.459810256958008 + }, + { + "auxiliary_loss_clip": 0.01149761, + "auxiliary_loss_mlp": 0.01049789, + "balance_loss_clip": 1.05991113, + "balance_loss_mlp": 1.03306425, + "epoch": 0.3497369607695776, + "flos": 22600021691520.0, + "grad_norm": 1.9002852873154625, + "language_loss": 0.58750576, + "learning_rate": 3.0200566427685246e-06, + "loss": 0.60950124, + "num_input_tokens_seen": 125146195, + "router_z_loss_clip": 0.89892578, + "router_z_loss_mlp": 0.16723633, + "step": 5817, + "time_per_iteration": 2.5855135917663574 + }, + { + "auxiliary_loss_clip": 0.01068069, + "auxiliary_loss_mlp": 0.01007021, + "balance_loss_clip": 1.0395751, + "balance_loss_mlp": 1.00513434, + "epoch": 0.34979708402224563, + "flos": 68529374818560.0, + "grad_norm": 0.8742040983683609, + "language_loss": 0.59861147, + "learning_rate": 3.0197216250017975e-06, + "loss": 0.61936235, + "num_input_tokens_seen": 125207790, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.01889038, + "step": 5818, + "time_per_iteration": 3.138105630874634 + }, + { + "auxiliary_loss_clip": 0.01140291, + "auxiliary_loss_mlp": 0.01041672, + "balance_loss_clip": 1.0559516, + "balance_loss_mlp": 1.02603757, + "epoch": 0.3498572072749136, + "flos": 18989634988800.0, + "grad_norm": 1.6883918904521003, + "language_loss": 0.83619106, + "learning_rate": 3.019386568567123e-06, + "loss": 0.85801071, + "num_input_tokens_seen": 125226220, + "router_z_loss_clip": 0.84326172, + "router_z_loss_mlp": 0.15643311, + "step": 5819, + "time_per_iteration": 2.4154274463653564 + }, + { + "auxiliary_loss_clip": 0.01145161, + "auxiliary_loss_mlp": 0.01031919, + "balance_loss_clip": 1.05945516, + "balance_loss_mlp": 1.0175606, + "epoch": 0.34991733052758156, + "flos": 27818883423360.0, + "grad_norm": 1.7879001835397896, + "language_loss": 0.70816886, + "learning_rate": 3.0190514734772083e-06, + "loss": 0.72993958, + "num_input_tokens_seen": 125247485, + "router_z_loss_clip": 0.85791016, + "router_z_loss_mlp": 0.14349365, + "step": 5820, + "time_per_iteration": 4.19676661491394 + }, + { + "auxiliary_loss_clip": 0.01146396, + "auxiliary_loss_mlp": 0.01040735, + "balance_loss_clip": 1.05711675, + "balance_loss_mlp": 1.02619147, + "epoch": 0.3499774537802495, + "flos": 33584197737600.0, + "grad_norm": 2.1665998668430415, + "language_loss": 0.70727277, + "learning_rate": 3.018716339744759e-06, + "loss": 0.7291441, + "num_input_tokens_seen": 125268625, + "router_z_loss_clip": 0.89208984, + "router_z_loss_mlp": 0.14532471, + "step": 5821, + "time_per_iteration": 2.5867812633514404 + }, + { + "auxiliary_loss_clip": 0.01148757, + "auxiliary_loss_mlp": 0.01044518, + "balance_loss_clip": 1.05761719, + "balance_loss_mlp": 1.02784014, + "epoch": 0.3500375770329175, + "flos": 23476744851840.0, + "grad_norm": 2.3774228696383757, + "language_loss": 0.73542988, + "learning_rate": 3.0183811673824842e-06, + "loss": 0.7573626, + "num_input_tokens_seen": 125287530, + "router_z_loss_clip": 0.91113281, + "router_z_loss_mlp": 0.16662598, + "step": 5822, + "time_per_iteration": 2.464564085006714 + }, + { + "auxiliary_loss_clip": 0.01140678, + "auxiliary_loss_mlp": 0.01036311, + "balance_loss_clip": 1.05340648, + "balance_loss_mlp": 1.02018213, + "epoch": 0.35009770028558546, + "flos": 19026048401280.0, + "grad_norm": 1.5393520072848008, + "language_loss": 0.78002143, + "learning_rate": 3.018045956403094e-06, + "loss": 0.80179131, + "num_input_tokens_seen": 125307020, + "router_z_loss_clip": 0.87304688, + "router_z_loss_mlp": 0.16125488, + "step": 5823, + "time_per_iteration": 2.497824192047119 + }, + { + "auxiliary_loss_clip": 0.01058825, + "auxiliary_loss_mlp": 0.01005458, + "balance_loss_clip": 1.02963388, + "balance_loss_mlp": 1.00345528, + "epoch": 0.3501578235382534, + "flos": 68351868783360.0, + "grad_norm": 0.7161812770569156, + "language_loss": 0.59264362, + "learning_rate": 3.017710706819298e-06, + "loss": 0.61328644, + "num_input_tokens_seen": 125370445, + "router_z_loss_clip": 0.29199219, + "router_z_loss_mlp": 0.02001953, + "step": 5824, + "time_per_iteration": 3.101832866668701 + }, + { + "auxiliary_loss_clip": 0.01138426, + "auxiliary_loss_mlp": 0.01037033, + "balance_loss_clip": 1.05226088, + "balance_loss_mlp": 1.0212611, + "epoch": 0.3502179467909214, + "flos": 21250893836160.0, + "grad_norm": 2.153055088715128, + "language_loss": 0.85056174, + "learning_rate": 3.017375418643811e-06, + "loss": 0.87231636, + "num_input_tokens_seen": 125388900, + "router_z_loss_clip": 0.86181641, + "router_z_loss_mlp": 0.15783691, + "step": 5825, + "time_per_iteration": 2.462773323059082 + }, + { + "auxiliary_loss_clip": 0.01142821, + "auxiliary_loss_mlp": 0.01046281, + "balance_loss_clip": 1.05586326, + "balance_loss_mlp": 1.03044438, + "epoch": 0.35027807004358935, + "flos": 11942955826560.0, + "grad_norm": 2.326751405532776, + "language_loss": 0.83585095, + "learning_rate": 3.0170400918893464e-06, + "loss": 0.85774201, + "num_input_tokens_seen": 125402675, + "router_z_loss_clip": 0.86962891, + "router_z_loss_mlp": 0.1585083, + "step": 5826, + "time_per_iteration": 2.3813765048980713 + }, + { + "auxiliary_loss_clip": 0.01150186, + "auxiliary_loss_mlp": 0.01047286, + "balance_loss_clip": 1.05807531, + "balance_loss_mlp": 1.03209913, + "epoch": 0.3503381932962573, + "flos": 21470918595840.0, + "grad_norm": 1.5910913477613844, + "language_loss": 0.8093462, + "learning_rate": 3.0167047265686186e-06, + "loss": 0.83132088, + "num_input_tokens_seen": 125421360, + "router_z_loss_clip": 0.92041016, + "router_z_loss_mlp": 0.15197754, + "step": 5827, + "time_per_iteration": 2.501481056213379 + }, + { + "auxiliary_loss_clip": 0.01148431, + "auxiliary_loss_mlp": 0.01034791, + "balance_loss_clip": 1.06313157, + "balance_loss_mlp": 1.02016401, + "epoch": 0.3503983165489253, + "flos": 21251109317760.0, + "grad_norm": 2.693831392156157, + "language_loss": 0.70910424, + "learning_rate": 3.0163693226943467e-06, + "loss": 0.73093647, + "num_input_tokens_seen": 125440000, + "router_z_loss_clip": 0.85253906, + "router_z_loss_mlp": 0.1463623, + "step": 5828, + "time_per_iteration": 2.4253060817718506 + }, + { + "auxiliary_loss_clip": 0.01146017, + "auxiliary_loss_mlp": 0.01048489, + "balance_loss_clip": 1.05712032, + "balance_loss_mlp": 1.03101289, + "epoch": 0.35045843980159325, + "flos": 27815723026560.0, + "grad_norm": 1.8222904436611163, + "language_loss": 0.79897064, + "learning_rate": 3.016033880279248e-06, + "loss": 0.8209157, + "num_input_tokens_seen": 125460390, + "router_z_loss_clip": 0.88916016, + "router_z_loss_mlp": 0.17456055, + "step": 5829, + "time_per_iteration": 2.5284156799316406 + }, + { + "auxiliary_loss_clip": 0.01145767, + "auxiliary_loss_mlp": 0.01049049, + "balance_loss_clip": 1.05624366, + "balance_loss_mlp": 1.03228855, + "epoch": 0.3505185630542612, + "flos": 25921148169600.0, + "grad_norm": 1.8497542063285184, + "language_loss": 0.72402859, + "learning_rate": 3.0156983993360417e-06, + "loss": 0.74597681, + "num_input_tokens_seen": 125478410, + "router_z_loss_clip": 0.89550781, + "router_z_loss_mlp": 0.16748047, + "step": 5830, + "time_per_iteration": 2.476227045059204 + }, + { + "auxiliary_loss_clip": 0.01131833, + "auxiliary_loss_mlp": 0.01034442, + "balance_loss_clip": 1.04703951, + "balance_loss_mlp": 1.01940393, + "epoch": 0.35057868630692923, + "flos": 20521763660160.0, + "grad_norm": 3.1497242242907166, + "language_loss": 0.88659281, + "learning_rate": 3.0153628798774513e-06, + "loss": 0.90825558, + "num_input_tokens_seen": 125495975, + "router_z_loss_clip": 0.84814453, + "router_z_loss_mlp": 0.15032959, + "step": 5831, + "time_per_iteration": 2.4679207801818848 + }, + { + "auxiliary_loss_clip": 0.01141596, + "auxiliary_loss_mlp": 0.01042601, + "balance_loss_clip": 1.05499995, + "balance_loss_mlp": 1.02783108, + "epoch": 0.3506388095595972, + "flos": 20448649526400.0, + "grad_norm": 2.26293229361629, + "language_loss": 0.78152239, + "learning_rate": 3.0150273219161985e-06, + "loss": 0.8033644, + "num_input_tokens_seen": 125515035, + "router_z_loss_clip": 0.86621094, + "router_z_loss_mlp": 0.14770508, + "step": 5832, + "time_per_iteration": 2.419485092163086 + }, + { + "auxiliary_loss_clip": 0.01146585, + "auxiliary_loss_mlp": 0.01047417, + "balance_loss_clip": 1.05334973, + "balance_loss_mlp": 1.03106165, + "epoch": 0.35069893281226516, + "flos": 23109665811840.0, + "grad_norm": 1.9118193111614514, + "language_loss": 0.70863718, + "learning_rate": 3.014691725465008e-06, + "loss": 0.73057717, + "num_input_tokens_seen": 125535555, + "router_z_loss_clip": 0.93359375, + "router_z_loss_mlp": 0.16369629, + "step": 5833, + "time_per_iteration": 2.738558053970337 + }, + { + "auxiliary_loss_clip": 0.01132777, + "auxiliary_loss_mlp": 0.01041693, + "balance_loss_clip": 1.04896212, + "balance_loss_mlp": 1.0269407, + "epoch": 0.35075905606493313, + "flos": 27271999877760.0, + "grad_norm": 1.3826980488233471, + "language_loss": 0.81078255, + "learning_rate": 3.014356090536606e-06, + "loss": 0.83252722, + "num_input_tokens_seen": 125558195, + "router_z_loss_clip": 0.83886719, + "router_z_loss_mlp": 0.14752197, + "step": 5834, + "time_per_iteration": 2.673908233642578 + }, + { + "auxiliary_loss_clip": 0.01132337, + "auxiliary_loss_mlp": 0.01041672, + "balance_loss_clip": 1.04741454, + "balance_loss_mlp": 1.02565026, + "epoch": 0.3508191793176011, + "flos": 19128608709120.0, + "grad_norm": 1.9842137186513529, + "language_loss": 0.83749092, + "learning_rate": 3.0140204171437183e-06, + "loss": 0.859231, + "num_input_tokens_seen": 125575375, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.16027832, + "step": 5835, + "time_per_iteration": 2.468362808227539 + }, + { + "auxiliary_loss_clip": 0.01145911, + "auxiliary_loss_mlp": 0.01044627, + "balance_loss_clip": 1.06100917, + "balance_loss_mlp": 1.02992845, + "epoch": 0.35087930257026906, + "flos": 25557588662400.0, + "grad_norm": 1.3897981259315302, + "language_loss": 0.76393461, + "learning_rate": 3.0136847052990754e-06, + "loss": 0.78583997, + "num_input_tokens_seen": 125596745, + "router_z_loss_clip": 0.84960938, + "router_z_loss_mlp": 0.14697266, + "step": 5836, + "time_per_iteration": 2.4770755767822266 + }, + { + "auxiliary_loss_clip": 0.01139897, + "auxiliary_loss_mlp": 0.01042563, + "balance_loss_clip": 1.05431795, + "balance_loss_mlp": 1.02742314, + "epoch": 0.350939425822937, + "flos": 18004246208640.0, + "grad_norm": 1.8953093083976462, + "language_loss": 0.77620018, + "learning_rate": 3.0133489550154074e-06, + "loss": 0.79802477, + "num_input_tokens_seen": 125613980, + "router_z_loss_clip": 0.85595703, + "router_z_loss_mlp": 0.15155029, + "step": 5837, + "time_per_iteration": 2.427199602127075 + }, + { + "auxiliary_loss_clip": 0.01138869, + "auxiliary_loss_mlp": 0.01043448, + "balance_loss_clip": 1.0540731, + "balance_loss_mlp": 1.02854681, + "epoch": 0.350999549075605, + "flos": 22273198819200.0, + "grad_norm": 2.1046601425518654, + "language_loss": 0.67507553, + "learning_rate": 3.0130131663054442e-06, + "loss": 0.6968987, + "num_input_tokens_seen": 125632100, + "router_z_loss_clip": 0.84765625, + "router_z_loss_mlp": 0.14904785, + "step": 5838, + "time_per_iteration": 2.4087841510772705 + }, + { + "auxiliary_loss_clip": 0.01140892, + "auxiliary_loss_mlp": 0.01038243, + "balance_loss_clip": 1.05381322, + "balance_loss_mlp": 1.0221734, + "epoch": 0.35105967232827295, + "flos": 14392279307520.0, + "grad_norm": 2.071858512431883, + "language_loss": 0.83763301, + "learning_rate": 3.0126773391819215e-06, + "loss": 0.85942435, + "num_input_tokens_seen": 125649190, + "router_z_loss_clip": 0.87011719, + "router_z_loss_mlp": 0.16064453, + "step": 5839, + "time_per_iteration": 2.4166393280029297 + }, + { + "auxiliary_loss_clip": 0.01134306, + "auxiliary_loss_mlp": 0.01040943, + "balance_loss_clip": 1.04636991, + "balance_loss_mlp": 1.02514815, + "epoch": 0.3511197955809409, + "flos": 25082346792960.0, + "grad_norm": 1.73911872504306, + "language_loss": 0.58610803, + "learning_rate": 3.012341473657572e-06, + "loss": 0.60786051, + "num_input_tokens_seen": 125668680, + "router_z_loss_clip": 0.87939453, + "router_z_loss_mlp": 0.15808105, + "step": 5840, + "time_per_iteration": 2.465359687805176 + }, + { + "auxiliary_loss_clip": 0.01134264, + "auxiliary_loss_mlp": 0.01040329, + "balance_loss_clip": 1.04726303, + "balance_loss_mlp": 1.02453399, + "epoch": 0.3511799188336089, + "flos": 25884160139520.0, + "grad_norm": 3.331278220595362, + "language_loss": 0.87895799, + "learning_rate": 3.0120055697451322e-06, + "loss": 0.90070397, + "num_input_tokens_seen": 125686935, + "router_z_loss_clip": 0.86962891, + "router_z_loss_mlp": 0.15795898, + "step": 5841, + "time_per_iteration": 2.4965481758117676 + }, + { + "auxiliary_loss_clip": 0.01145989, + "auxiliary_loss_mlp": 0.01043562, + "balance_loss_clip": 1.0567534, + "balance_loss_mlp": 1.02568102, + "epoch": 0.35124004208627685, + "flos": 20083725302400.0, + "grad_norm": 1.7142401020253628, + "language_loss": 0.75239992, + "learning_rate": 3.0116696274573406e-06, + "loss": 0.77429545, + "num_input_tokens_seen": 125707180, + "router_z_loss_clip": 0.89208984, + "router_z_loss_mlp": 0.17883301, + "step": 5842, + "time_per_iteration": 2.4449336528778076 + }, + { + "auxiliary_loss_clip": 0.01135253, + "auxiliary_loss_mlp": 0.01044947, + "balance_loss_clip": 1.04803789, + "balance_loss_mlp": 1.02956867, + "epoch": 0.3513001653389448, + "flos": 17783431349760.0, + "grad_norm": 2.397609890035879, + "language_loss": 0.69079399, + "learning_rate": 3.0113336468069346e-06, + "loss": 0.71259594, + "num_input_tokens_seen": 125722780, + "router_z_loss_clip": 0.87304688, + "router_z_loss_mlp": 0.15386963, + "step": 5843, + "time_per_iteration": 3.9269638061523438 + }, + { + "auxiliary_loss_clip": 0.01140697, + "auxiliary_loss_mlp": 0.01044626, + "balance_loss_clip": 1.05416393, + "balance_loss_mlp": 1.0280081, + "epoch": 0.3513602885916128, + "flos": 29387138198400.0, + "grad_norm": 3.306198806332479, + "language_loss": 0.6562264, + "learning_rate": 3.010997627806655e-06, + "loss": 0.67807972, + "num_input_tokens_seen": 125742110, + "router_z_loss_clip": 0.86523438, + "router_z_loss_mlp": 0.16625977, + "step": 5844, + "time_per_iteration": 2.5110247135162354 + }, + { + "auxiliary_loss_clip": 0.01138824, + "auxiliary_loss_mlp": 0.01041127, + "balance_loss_clip": 1.05188441, + "balance_loss_mlp": 1.02417541, + "epoch": 0.3514204118442808, + "flos": 16179876483840.0, + "grad_norm": 2.2037798585243347, + "language_loss": 0.75560075, + "learning_rate": 3.010661570469245e-06, + "loss": 0.77740026, + "num_input_tokens_seen": 125759980, + "router_z_loss_clip": 0.87060547, + "router_z_loss_mlp": 0.16967773, + "step": 5845, + "time_per_iteration": 2.4762866497039795 + }, + { + "auxiliary_loss_clip": 0.01140496, + "auxiliary_loss_mlp": 0.01042284, + "balance_loss_clip": 1.05699956, + "balance_loss_mlp": 1.02663827, + "epoch": 0.35148053509694877, + "flos": 23834665923840.0, + "grad_norm": 2.6235626339346174, + "language_loss": 0.7274965, + "learning_rate": 3.0103254748074465e-06, + "loss": 0.74932432, + "num_input_tokens_seen": 125772660, + "router_z_loss_clip": 0.83447266, + "router_z_loss_mlp": 0.15631104, + "step": 5846, + "time_per_iteration": 2.4930472373962402 + }, + { + "auxiliary_loss_clip": 0.01141503, + "auxiliary_loss_mlp": 0.01048633, + "balance_loss_clip": 1.05317152, + "balance_loss_mlp": 1.0327071, + "epoch": 0.35154065834961673, + "flos": 20991295267200.0, + "grad_norm": 1.608354396291718, + "language_loss": 0.75599265, + "learning_rate": 3.0099893408340046e-06, + "loss": 0.77789402, + "num_input_tokens_seen": 125791935, + "router_z_loss_clip": 0.88330078, + "router_z_loss_mlp": 0.15917969, + "step": 5847, + "time_per_iteration": 2.948540449142456 + }, + { + "auxiliary_loss_clip": 0.01140909, + "auxiliary_loss_mlp": 0.01033323, + "balance_loss_clip": 1.05393052, + "balance_loss_mlp": 1.01767099, + "epoch": 0.3516007816022847, + "flos": 33255471444480.0, + "grad_norm": 2.2259002634003155, + "language_loss": 0.72664636, + "learning_rate": 3.009653168561666e-06, + "loss": 0.74838865, + "num_input_tokens_seen": 125813455, + "router_z_loss_clip": 0.86865234, + "router_z_loss_mlp": 0.15649414, + "step": 5848, + "time_per_iteration": 2.6091413497924805 + }, + { + "auxiliary_loss_clip": 0.01146855, + "auxiliary_loss_mlp": 0.01055808, + "balance_loss_clip": 1.05790389, + "balance_loss_mlp": 1.03790307, + "epoch": 0.35166090485495266, + "flos": 11726953390080.0, + "grad_norm": 2.320106311530468, + "language_loss": 0.89903402, + "learning_rate": 3.009316958003178e-06, + "loss": 0.92106068, + "num_input_tokens_seen": 125827660, + "router_z_loss_clip": 0.88916016, + "router_z_loss_mlp": 0.17895508, + "step": 5849, + "time_per_iteration": 2.432454824447632 + }, + { + "auxiliary_loss_clip": 0.01155737, + "auxiliary_loss_mlp": 0.01037701, + "balance_loss_clip": 1.06805491, + "balance_loss_mlp": 1.02195323, + "epoch": 0.3517210281076206, + "flos": 22638446265600.0, + "grad_norm": 1.8327125664536683, + "language_loss": 0.74838567, + "learning_rate": 3.0089807091712897e-06, + "loss": 0.77032012, + "num_input_tokens_seen": 125846655, + "router_z_loss_clip": 0.87597656, + "router_z_loss_mlp": 0.1574707, + "step": 5850, + "time_per_iteration": 2.48321270942688 + }, + { + "auxiliary_loss_clip": 0.01139139, + "auxiliary_loss_mlp": 0.01038625, + "balance_loss_clip": 1.05535865, + "balance_loss_mlp": 1.02248442, + "epoch": 0.3517811513602886, + "flos": 21322750993920.0, + "grad_norm": 1.5363063197891456, + "language_loss": 0.75752985, + "learning_rate": 3.0086444220787515e-06, + "loss": 0.77930748, + "num_input_tokens_seen": 125866290, + "router_z_loss_clip": 0.83691406, + "router_z_loss_mlp": 0.16149902, + "step": 5851, + "time_per_iteration": 2.4814624786376953 + }, + { + "auxiliary_loss_clip": 0.01153354, + "auxiliary_loss_mlp": 0.01046892, + "balance_loss_clip": 1.06121016, + "balance_loss_mlp": 1.02967834, + "epoch": 0.35184127461295656, + "flos": 21032880238080.0, + "grad_norm": 1.9505722460936445, + "language_loss": 0.87482733, + "learning_rate": 3.0083080967383165e-06, + "loss": 0.89682978, + "num_input_tokens_seen": 125884620, + "router_z_loss_clip": 0.92138672, + "router_z_loss_mlp": 0.17211914, + "step": 5852, + "time_per_iteration": 2.491025447845459 + }, + { + "auxiliary_loss_clip": 0.01137056, + "auxiliary_loss_mlp": 0.01034127, + "balance_loss_clip": 1.05275679, + "balance_loss_mlp": 1.0190767, + "epoch": 0.3519013978656245, + "flos": 22455265881600.0, + "grad_norm": 3.4742360993352044, + "language_loss": 0.67790449, + "learning_rate": 3.007971733162737e-06, + "loss": 0.69961631, + "num_input_tokens_seen": 125902430, + "router_z_loss_clip": 0.84179688, + "router_z_loss_mlp": 0.1505127, + "step": 5853, + "time_per_iteration": 3.953540086746216 + }, + { + "auxiliary_loss_clip": 0.01139257, + "auxiliary_loss_mlp": 0.01035931, + "balance_loss_clip": 1.0528388, + "balance_loss_mlp": 1.02002835, + "epoch": 0.3519615211182925, + "flos": 13115295918720.0, + "grad_norm": 1.7398632831330028, + "language_loss": 0.81147599, + "learning_rate": 3.0076353313647686e-06, + "loss": 0.83322793, + "num_input_tokens_seen": 125920570, + "router_z_loss_clip": 0.86474609, + "router_z_loss_mlp": 0.15905762, + "step": 5854, + "time_per_iteration": 2.410994529724121 + }, + { + "auxiliary_loss_clip": 0.01136946, + "auxiliary_loss_mlp": 0.01043954, + "balance_loss_clip": 1.05388319, + "balance_loss_mlp": 1.02750349, + "epoch": 0.35202164437096045, + "flos": 19135144984320.0, + "grad_norm": 1.7902208708039555, + "language_loss": 0.73266172, + "learning_rate": 3.0072988913571666e-06, + "loss": 0.75447071, + "num_input_tokens_seen": 125939800, + "router_z_loss_clip": 0.83105469, + "router_z_loss_mlp": 0.16455078, + "step": 5855, + "time_per_iteration": 2.4050889015197754 + }, + { + "auxiliary_loss_clip": 0.01142558, + "auxiliary_loss_mlp": 0.01050051, + "balance_loss_clip": 1.05556273, + "balance_loss_mlp": 1.0347985, + "epoch": 0.3520817676236284, + "flos": 26542187343360.0, + "grad_norm": 2.0441888458402375, + "language_loss": 0.71019077, + "learning_rate": 3.006962413152691e-06, + "loss": 0.73211688, + "num_input_tokens_seen": 125958720, + "router_z_loss_clip": 0.87011719, + "router_z_loss_mlp": 0.15252686, + "step": 5856, + "time_per_iteration": 2.4801723957061768 + }, + { + "auxiliary_loss_clip": 0.0114328, + "auxiliary_loss_mlp": 0.01050558, + "balance_loss_clip": 1.0545125, + "balance_loss_mlp": 1.03425074, + "epoch": 0.3521418908762964, + "flos": 44893472803200.0, + "grad_norm": 1.7830441187557309, + "language_loss": 0.61867833, + "learning_rate": 3.0066258967640987e-06, + "loss": 0.64061671, + "num_input_tokens_seen": 125984310, + "router_z_loss_clip": 0.88720703, + "router_z_loss_mlp": 0.16308594, + "step": 5857, + "time_per_iteration": 4.077128648757935 + }, + { + "auxiliary_loss_clip": 0.01148807, + "auxiliary_loss_mlp": 0.01038989, + "balance_loss_clip": 1.06223416, + "balance_loss_mlp": 1.0231576, + "epoch": 0.3522020141289644, + "flos": 20187398931840.0, + "grad_norm": 1.79379918291294, + "language_loss": 0.73388451, + "learning_rate": 3.006289342204152e-06, + "loss": 0.75576246, + "num_input_tokens_seen": 126002410, + "router_z_loss_clip": 0.86572266, + "router_z_loss_mlp": 0.15820312, + "step": 5858, + "time_per_iteration": 2.4670560359954834 + }, + { + "auxiliary_loss_clip": 0.01135171, + "auxiliary_loss_mlp": 0.01038849, + "balance_loss_clip": 1.04785633, + "balance_loss_mlp": 1.0232923, + "epoch": 0.35226213738163237, + "flos": 27563917708800.0, + "grad_norm": 1.8137291123834696, + "language_loss": 0.76124984, + "learning_rate": 3.0059527494856126e-06, + "loss": 0.7829901, + "num_input_tokens_seen": 126022490, + "router_z_loss_clip": 0.87451172, + "router_z_loss_mlp": 0.15563965, + "step": 5859, + "time_per_iteration": 2.823500394821167 + }, + { + "auxiliary_loss_clip": 0.0114676, + "auxiliary_loss_mlp": 0.01042431, + "balance_loss_clip": 1.05651367, + "balance_loss_mlp": 1.0257293, + "epoch": 0.35232226063430033, + "flos": 22966310632320.0, + "grad_norm": 1.821218437990977, + "language_loss": 0.72387004, + "learning_rate": 3.0056161186212435e-06, + "loss": 0.74576193, + "num_input_tokens_seen": 126042895, + "router_z_loss_clip": 0.90185547, + "router_z_loss_mlp": 0.16687012, + "step": 5860, + "time_per_iteration": 2.589226245880127 + }, + { + "auxiliary_loss_clip": 0.0114073, + "auxiliary_loss_mlp": 0.01039437, + "balance_loss_clip": 1.05175245, + "balance_loss_mlp": 1.02261662, + "epoch": 0.3523823838869683, + "flos": 19168290259200.0, + "grad_norm": 2.325418403285692, + "language_loss": 0.66219711, + "learning_rate": 3.005279449623811e-06, + "loss": 0.68399876, + "num_input_tokens_seen": 126060130, + "router_z_loss_clip": 0.88964844, + "router_z_loss_mlp": 0.16833496, + "step": 5861, + "time_per_iteration": 2.461960554122925 + }, + { + "auxiliary_loss_clip": 0.01138921, + "auxiliary_loss_mlp": 0.01035, + "balance_loss_clip": 1.05432606, + "balance_loss_mlp": 1.01986659, + "epoch": 0.35244250713963626, + "flos": 17930988420480.0, + "grad_norm": 1.9497812475880234, + "language_loss": 0.66573197, + "learning_rate": 3.0049427425060815e-06, + "loss": 0.68747121, + "num_input_tokens_seen": 126077850, + "router_z_loss_clip": 0.84570312, + "router_z_loss_mlp": 0.15130615, + "step": 5862, + "time_per_iteration": 2.426698684692383 + }, + { + "auxiliary_loss_clip": 0.01148492, + "auxiliary_loss_mlp": 0.01046501, + "balance_loss_clip": 1.05888319, + "balance_loss_mlp": 1.02971601, + "epoch": 0.35250263039230423, + "flos": 21432529935360.0, + "grad_norm": 1.8718721490666683, + "language_loss": 0.76421523, + "learning_rate": 3.0046059972808215e-06, + "loss": 0.78616512, + "num_input_tokens_seen": 126095985, + "router_z_loss_clip": 0.89746094, + "router_z_loss_mlp": 0.16772461, + "step": 5863, + "time_per_iteration": 3.8420891761779785 + }, + { + "auxiliary_loss_clip": 0.01148867, + "auxiliary_loss_mlp": 0.0103843, + "balance_loss_clip": 1.0604068, + "balance_loss_mlp": 1.02355301, + "epoch": 0.3525627536449722, + "flos": 27416863428480.0, + "grad_norm": 1.7055701800768701, + "language_loss": 0.7507897, + "learning_rate": 3.0042692139608024e-06, + "loss": 0.77266264, + "num_input_tokens_seen": 126116070, + "router_z_loss_clip": 0.88476562, + "router_z_loss_mlp": 0.14868164, + "step": 5864, + "time_per_iteration": 2.5015130043029785 + }, + { + "auxiliary_loss_clip": 0.01141377, + "auxiliary_loss_mlp": 0.01043895, + "balance_loss_clip": 1.05630219, + "balance_loss_mlp": 1.02854097, + "epoch": 0.35262287689764016, + "flos": 24789818430720.0, + "grad_norm": 2.5979030695921934, + "language_loss": 0.79780459, + "learning_rate": 3.003932392558793e-06, + "loss": 0.81965733, + "num_input_tokens_seen": 126135205, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.15344238, + "step": 5865, + "time_per_iteration": 2.4896700382232666 + }, + { + "auxiliary_loss_clip": 0.01147065, + "auxiliary_loss_mlp": 0.01041785, + "balance_loss_clip": 1.05770421, + "balance_loss_mlp": 1.02598417, + "epoch": 0.3526830001503081, + "flos": 17821604528640.0, + "grad_norm": 3.085322878189111, + "language_loss": 0.81507516, + "learning_rate": 3.0035955330875677e-06, + "loss": 0.83696365, + "num_input_tokens_seen": 126151895, + "router_z_loss_clip": 0.89257812, + "router_z_loss_mlp": 0.15814209, + "step": 5866, + "time_per_iteration": 2.531691551208496 + }, + { + "auxiliary_loss_clip": 0.01149897, + "auxiliary_loss_mlp": 0.01039419, + "balance_loss_clip": 1.0574801, + "balance_loss_mlp": 1.02230012, + "epoch": 0.3527431234029761, + "flos": 18078114528000.0, + "grad_norm": 2.1134124786564445, + "language_loss": 0.83741748, + "learning_rate": 3.0032586355598986e-06, + "loss": 0.85931069, + "num_input_tokens_seen": 126168515, + "router_z_loss_clip": 0.92333984, + "router_z_loss_mlp": 0.17114258, + "step": 5867, + "time_per_iteration": 2.411742925643921 + }, + { + "auxiliary_loss_clip": 0.01138259, + "auxiliary_loss_mlp": 0.01048814, + "balance_loss_clip": 1.0502708, + "balance_loss_mlp": 1.03249419, + "epoch": 0.35280324665564405, + "flos": 19427350124160.0, + "grad_norm": 1.8747576604924057, + "language_loss": 0.73876089, + "learning_rate": 3.0029216999885613e-06, + "loss": 0.76063162, + "num_input_tokens_seen": 126186460, + "router_z_loss_clip": 0.87939453, + "router_z_loss_mlp": 0.16314697, + "step": 5868, + "time_per_iteration": 2.441516876220703 + }, + { + "auxiliary_loss_clip": 0.01146942, + "auxiliary_loss_mlp": 0.01042381, + "balance_loss_clip": 1.05816889, + "balance_loss_mlp": 1.02615702, + "epoch": 0.352863369908312, + "flos": 21504027957120.0, + "grad_norm": 2.4817705881936667, + "language_loss": 0.61526495, + "learning_rate": 3.0025847263863327e-06, + "loss": 0.63715816, + "num_input_tokens_seen": 126206170, + "router_z_loss_clip": 0.88769531, + "router_z_loss_mlp": 0.16235352, + "step": 5869, + "time_per_iteration": 2.5335469245910645 + }, + { + "auxiliary_loss_clip": 0.0114019, + "auxiliary_loss_mlp": 0.01053106, + "balance_loss_clip": 1.0534631, + "balance_loss_mlp": 1.03564215, + "epoch": 0.35292349316098, + "flos": 22309504490880.0, + "grad_norm": 2.0331772375534243, + "language_loss": 0.74684358, + "learning_rate": 3.0022477147659917e-06, + "loss": 0.76877654, + "num_input_tokens_seen": 126225605, + "router_z_loss_clip": 0.86572266, + "router_z_loss_mlp": 0.17456055, + "step": 5870, + "time_per_iteration": 2.468923330307007 + }, + { + "auxiliary_loss_clip": 0.01141845, + "auxiliary_loss_mlp": 0.01036342, + "balance_loss_clip": 1.05602396, + "balance_loss_mlp": 1.02089238, + "epoch": 0.352983616413648, + "flos": 33109745967360.0, + "grad_norm": 1.5600320111356953, + "language_loss": 0.71786976, + "learning_rate": 3.001910665140316e-06, + "loss": 0.73965162, + "num_input_tokens_seen": 126250230, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.15454102, + "step": 5871, + "time_per_iteration": 2.626437187194824 + }, + { + "auxiliary_loss_clip": 0.01142171, + "auxiliary_loss_mlp": 0.0103891, + "balance_loss_clip": 1.05634975, + "balance_loss_mlp": 1.02488506, + "epoch": 0.35304373966631597, + "flos": 18696603836160.0, + "grad_norm": 2.224850440403325, + "language_loss": 0.73671651, + "learning_rate": 3.0015735775220873e-06, + "loss": 0.75852728, + "num_input_tokens_seen": 126268315, + "router_z_loss_clip": 0.85839844, + "router_z_loss_mlp": 0.14019775, + "step": 5872, + "time_per_iteration": 2.4539403915405273 + }, + { + "auxiliary_loss_clip": 0.0113745, + "auxiliary_loss_mlp": 0.01038757, + "balance_loss_clip": 1.05365944, + "balance_loss_mlp": 1.02492321, + "epoch": 0.35310386291898394, + "flos": 23364954748800.0, + "grad_norm": 1.975200221426962, + "language_loss": 0.82506764, + "learning_rate": 3.001236451924089e-06, + "loss": 0.84682971, + "num_input_tokens_seen": 126288390, + "router_z_loss_clip": 0.83740234, + "router_z_loss_mlp": 0.1383667, + "step": 5873, + "time_per_iteration": 2.8782031536102295 + }, + { + "auxiliary_loss_clip": 0.01143242, + "auxiliary_loss_mlp": 0.01042685, + "balance_loss_clip": 1.05443656, + "balance_loss_mlp": 1.02634192, + "epoch": 0.3531639861716519, + "flos": 24461954064000.0, + "grad_norm": 1.9170716539219805, + "language_loss": 0.65631163, + "learning_rate": 3.000899288359104e-06, + "loss": 0.67817092, + "num_input_tokens_seen": 126305750, + "router_z_loss_clip": 0.88769531, + "router_z_loss_mlp": 0.16357422, + "step": 5874, + "time_per_iteration": 2.497403860092163 + }, + { + "auxiliary_loss_clip": 0.01092412, + "auxiliary_loss_mlp": 0.01004419, + "balance_loss_clip": 1.06249595, + "balance_loss_mlp": 1.0025804, + "epoch": 0.35322410942431987, + "flos": 70312446881280.0, + "grad_norm": 0.7626062595168159, + "language_loss": 0.61475372, + "learning_rate": 3.000562086839917e-06, + "loss": 0.63572204, + "num_input_tokens_seen": 126362495, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.01837158, + "step": 5875, + "time_per_iteration": 2.983210325241089 + }, + { + "auxiliary_loss_clip": 0.0114781, + "auxiliary_loss_mlp": 0.01043845, + "balance_loss_clip": 1.06199574, + "balance_loss_mlp": 1.02902722, + "epoch": 0.35328423267698783, + "flos": 19820894509440.0, + "grad_norm": 1.7893713186549067, + "language_loss": 0.80055755, + "learning_rate": 3.0002248473793163e-06, + "loss": 0.82247412, + "num_input_tokens_seen": 126378320, + "router_z_loss_clip": 0.85888672, + "router_z_loss_mlp": 0.14813232, + "step": 5876, + "time_per_iteration": 2.510321855545044 + }, + { + "auxiliary_loss_clip": 0.01083507, + "auxiliary_loss_mlp": 0.01003495, + "balance_loss_clip": 1.05490088, + "balance_loss_mlp": 1.00179291, + "epoch": 0.3533443559296558, + "flos": 60826356391680.0, + "grad_norm": 0.672047457089192, + "language_loss": 0.56739366, + "learning_rate": 2.999887569990088e-06, + "loss": 0.58826369, + "num_input_tokens_seen": 126442735, + "router_z_loss_clip": 0.28613281, + "router_z_loss_mlp": 0.01702881, + "step": 5877, + "time_per_iteration": 3.136626958847046 + }, + { + "auxiliary_loss_clip": 0.01145212, + "auxiliary_loss_mlp": 0.01039139, + "balance_loss_clip": 1.05812502, + "balance_loss_mlp": 1.02290225, + "epoch": 0.35340447918232376, + "flos": 24755775315840.0, + "grad_norm": 1.535784851099588, + "language_loss": 0.72216761, + "learning_rate": 2.999550254685024e-06, + "loss": 0.74401116, + "num_input_tokens_seen": 126463090, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.16235352, + "step": 5878, + "time_per_iteration": 2.4840054512023926 + }, + { + "auxiliary_loss_clip": 0.01144604, + "auxiliary_loss_mlp": 0.01040604, + "balance_loss_clip": 1.05616355, + "balance_loss_mlp": 1.02571487, + "epoch": 0.3534646024349917, + "flos": 21796304924160.0, + "grad_norm": 1.9396122776761135, + "language_loss": 0.78553325, + "learning_rate": 2.9992129014769136e-06, + "loss": 0.80738533, + "num_input_tokens_seen": 126482105, + "router_z_loss_clip": 0.88427734, + "router_z_loss_mlp": 0.14886475, + "step": 5879, + "time_per_iteration": 2.418060779571533 + }, + { + "auxiliary_loss_clip": 0.01146652, + "auxiliary_loss_mlp": 0.01051332, + "balance_loss_clip": 1.05545592, + "balance_loss_mlp": 1.03280652, + "epoch": 0.3535247256876597, + "flos": 20012119539840.0, + "grad_norm": 2.3299057962027008, + "language_loss": 0.63377535, + "learning_rate": 2.9988755103785493e-06, + "loss": 0.65575516, + "num_input_tokens_seen": 126502125, + "router_z_loss_clip": 0.91113281, + "router_z_loss_mlp": 0.18530273, + "step": 5880, + "time_per_iteration": 2.482011318206787 + }, + { + "auxiliary_loss_clip": 0.01143926, + "auxiliary_loss_mlp": 0.01036325, + "balance_loss_clip": 1.057253, + "balance_loss_mlp": 1.02013683, + "epoch": 0.35358484894032766, + "flos": 18187929383040.0, + "grad_norm": 2.129313839718241, + "language_loss": 0.6608566, + "learning_rate": 2.998538081402727e-06, + "loss": 0.68265915, + "num_input_tokens_seen": 126521950, + "router_z_loss_clip": 0.86621094, + "router_z_loss_mlp": 0.16174316, + "step": 5881, + "time_per_iteration": 2.515552043914795 + }, + { + "auxiliary_loss_clip": 0.01139992, + "auxiliary_loss_mlp": 0.01029876, + "balance_loss_clip": 1.05824506, + "balance_loss_mlp": 1.01589322, + "epoch": 0.3536449721929956, + "flos": 22820369673600.0, + "grad_norm": 1.446279841608754, + "language_loss": 0.75656974, + "learning_rate": 2.998200614562239e-06, + "loss": 0.77826834, + "num_input_tokens_seen": 126542445, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.13977051, + "step": 5882, + "time_per_iteration": 2.472038984298706 + }, + { + "auxiliary_loss_clip": 0.01139419, + "auxiliary_loss_mlp": 0.01042618, + "balance_loss_clip": 1.05356991, + "balance_loss_mlp": 1.02621508, + "epoch": 0.3537050954456636, + "flos": 26432336574720.0, + "grad_norm": 2.0497186404059464, + "language_loss": 0.70679009, + "learning_rate": 2.9978631098698847e-06, + "loss": 0.7286104, + "num_input_tokens_seen": 126560690, + "router_z_loss_clip": 0.85888672, + "router_z_loss_mlp": 0.16375732, + "step": 5883, + "time_per_iteration": 2.4809558391571045 + }, + { + "auxiliary_loss_clip": 0.01139931, + "auxiliary_loss_mlp": 0.01038001, + "balance_loss_clip": 1.0508846, + "balance_loss_mlp": 1.0223372, + "epoch": 0.3537652186983316, + "flos": 17197153562880.0, + "grad_norm": 2.312269360249894, + "language_loss": 0.78507161, + "learning_rate": 2.9975255673384614e-06, + "loss": 0.80685097, + "num_input_tokens_seen": 126577620, + "router_z_loss_clip": 0.89013672, + "router_z_loss_mlp": 0.15649414, + "step": 5884, + "time_per_iteration": 2.539923906326294 + }, + { + "auxiliary_loss_clip": 0.01142188, + "auxiliary_loss_mlp": 0.01038625, + "balance_loss_clip": 1.05546236, + "balance_loss_mlp": 1.02387857, + "epoch": 0.3538253419509996, + "flos": 19536769929600.0, + "grad_norm": 2.2114383815745238, + "language_loss": 0.75815171, + "learning_rate": 2.9971879869807673e-06, + "loss": 0.7799598, + "num_input_tokens_seen": 126596235, + "router_z_loss_clip": 0.86816406, + "router_z_loss_mlp": 0.14746094, + "step": 5885, + "time_per_iteration": 2.4810307025909424 + }, + { + "auxiliary_loss_clip": 0.01142681, + "auxiliary_loss_mlp": 0.01036736, + "balance_loss_clip": 1.05497336, + "balance_loss_mlp": 1.02093446, + "epoch": 0.35388546520366754, + "flos": 12128578335360.0, + "grad_norm": 2.789198403194387, + "language_loss": 0.83151215, + "learning_rate": 2.996850368809606e-06, + "loss": 0.85330635, + "num_input_tokens_seen": 126612830, + "router_z_loss_clip": 0.87695312, + "router_z_loss_mlp": 0.15802002, + "step": 5886, + "time_per_iteration": 4.553324937820435 + }, + { + "auxiliary_loss_clip": 0.01140844, + "auxiliary_loss_mlp": 0.01037278, + "balance_loss_clip": 1.05597591, + "balance_loss_mlp": 1.02144718, + "epoch": 0.3539455884563355, + "flos": 19678149861120.0, + "grad_norm": 1.984478071567638, + "language_loss": 0.77998877, + "learning_rate": 2.9965127128377787e-06, + "loss": 0.80177003, + "num_input_tokens_seen": 126630910, + "router_z_loss_clip": 0.84912109, + "router_z_loss_mlp": 0.15820312, + "step": 5887, + "time_per_iteration": 2.591614246368408 + }, + { + "auxiliary_loss_clip": 0.01146405, + "auxiliary_loss_mlp": 0.01037521, + "balance_loss_clip": 1.06097031, + "balance_loss_mlp": 1.02269101, + "epoch": 0.35400571170900347, + "flos": 18072045129600.0, + "grad_norm": 2.7357927614882485, + "language_loss": 0.65813255, + "learning_rate": 2.996175019078089e-06, + "loss": 0.67997181, + "num_input_tokens_seen": 126648365, + "router_z_loss_clip": 0.85498047, + "router_z_loss_mlp": 0.14831543, + "step": 5888, + "time_per_iteration": 2.4276301860809326 + }, + { + "auxiliary_loss_clip": 0.01144283, + "auxiliary_loss_mlp": 0.0103306, + "balance_loss_clip": 1.05916774, + "balance_loss_mlp": 1.01878452, + "epoch": 0.35406583496167143, + "flos": 26068058795520.0, + "grad_norm": 1.8367485978821108, + "language_loss": 0.77247709, + "learning_rate": 2.9958372875433437e-06, + "loss": 0.79425049, + "num_input_tokens_seen": 126667500, + "router_z_loss_clip": 0.85058594, + "router_z_loss_mlp": 0.14263916, + "step": 5889, + "time_per_iteration": 2.5115818977355957 + }, + { + "auxiliary_loss_clip": 0.01142497, + "auxiliary_loss_mlp": 0.01042422, + "balance_loss_clip": 1.0585233, + "balance_loss_mlp": 1.02671027, + "epoch": 0.3541259582143394, + "flos": 19792453916160.0, + "grad_norm": 1.794500438552229, + "language_loss": 0.80729812, + "learning_rate": 2.9954995182463478e-06, + "loss": 0.82914734, + "num_input_tokens_seen": 126686820, + "router_z_loss_clip": 0.84033203, + "router_z_loss_mlp": 0.15710449, + "step": 5890, + "time_per_iteration": 2.4460947513580322 + }, + { + "auxiliary_loss_clip": 0.01139185, + "auxiliary_loss_mlp": 0.01034838, + "balance_loss_clip": 1.05462849, + "balance_loss_mlp": 1.02136755, + "epoch": 0.35418608146700736, + "flos": 24022084112640.0, + "grad_norm": 1.5463834367447307, + "language_loss": 0.79917622, + "learning_rate": 2.99516171119991e-06, + "loss": 0.82091641, + "num_input_tokens_seen": 126706965, + "router_z_loss_clip": 0.84570312, + "router_z_loss_mlp": 0.13476562, + "step": 5891, + "time_per_iteration": 2.543992042541504 + }, + { + "auxiliary_loss_clip": 0.0114763, + "auxiliary_loss_mlp": 0.01036547, + "balance_loss_clip": 1.06128132, + "balance_loss_mlp": 1.02151513, + "epoch": 0.35424620471967533, + "flos": 12385770693120.0, + "grad_norm": 2.0012050018585748, + "language_loss": 0.72856057, + "learning_rate": 2.9948238664168415e-06, + "loss": 0.75040233, + "num_input_tokens_seen": 126724015, + "router_z_loss_clip": 0.86279297, + "router_z_loss_mlp": 0.15039062, + "step": 5892, + "time_per_iteration": 2.5010595321655273 + }, + { + "auxiliary_loss_clip": 0.01146381, + "auxiliary_loss_mlp": 0.0103807, + "balance_loss_clip": 1.05984259, + "balance_loss_mlp": 1.02268004, + "epoch": 0.3543063279723433, + "flos": 19673624747520.0, + "grad_norm": 2.5325596099116088, + "language_loss": 0.66986793, + "learning_rate": 2.9944859839099518e-06, + "loss": 0.6917125, + "num_input_tokens_seen": 126737565, + "router_z_loss_clip": 0.86474609, + "router_z_loss_mlp": 0.15393066, + "step": 5893, + "time_per_iteration": 2.4965128898620605 + }, + { + "auxiliary_loss_clip": 0.01141468, + "auxiliary_loss_mlp": 0.01038816, + "balance_loss_clip": 1.05611801, + "balance_loss_mlp": 1.02355683, + "epoch": 0.35436645122501126, + "flos": 21909208348800.0, + "grad_norm": 1.7144987312563449, + "language_loss": 0.69650042, + "learning_rate": 2.9941480636920533e-06, + "loss": 0.7183032, + "num_input_tokens_seen": 126756095, + "router_z_loss_clip": 0.85302734, + "router_z_loss_mlp": 0.15246582, + "step": 5894, + "time_per_iteration": 2.509770393371582 + }, + { + "auxiliary_loss_clip": 0.01138659, + "auxiliary_loss_mlp": 0.01034387, + "balance_loss_clip": 1.05470836, + "balance_loss_mlp": 1.02017701, + "epoch": 0.3544265744776792, + "flos": 21719527603200.0, + "grad_norm": 1.8334498791672402, + "language_loss": 0.74703336, + "learning_rate": 2.9938101057759615e-06, + "loss": 0.76876378, + "num_input_tokens_seen": 126775455, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.14208984, + "step": 5895, + "time_per_iteration": 2.5542654991149902 + }, + { + "auxiliary_loss_clip": 0.01144692, + "auxiliary_loss_mlp": 0.01039126, + "balance_loss_clip": 1.05914497, + "balance_loss_mlp": 1.02457094, + "epoch": 0.3544866977303472, + "flos": 21213223447680.0, + "grad_norm": 1.8571726169615055, + "language_loss": 0.83578146, + "learning_rate": 2.993472110174491e-06, + "loss": 0.85761958, + "num_input_tokens_seen": 126792320, + "router_z_loss_clip": 0.85595703, + "router_z_loss_mlp": 0.14562988, + "step": 5896, + "time_per_iteration": 3.9033360481262207 + }, + { + "auxiliary_loss_clip": 0.01134815, + "auxiliary_loss_mlp": 0.01052536, + "balance_loss_clip": 1.05078101, + "balance_loss_mlp": 1.03691971, + "epoch": 0.35454682098301515, + "flos": 29311402371840.0, + "grad_norm": 2.0420805929589547, + "language_loss": 0.70219076, + "learning_rate": 2.9931340769004576e-06, + "loss": 0.72406429, + "num_input_tokens_seen": 126813680, + "router_z_loss_clip": 0.84082031, + "router_z_loss_mlp": 0.15600586, + "step": 5897, + "time_per_iteration": 2.6510069370269775 + }, + { + "auxiliary_loss_clip": 0.01141116, + "auxiliary_loss_mlp": 0.01035067, + "balance_loss_clip": 1.05582929, + "balance_loss_mlp": 1.02034414, + "epoch": 0.3546069442356832, + "flos": 24316587722880.0, + "grad_norm": 1.6558857980234385, + "language_loss": 0.81396866, + "learning_rate": 2.9927960059666816e-06, + "loss": 0.83573049, + "num_input_tokens_seen": 126834395, + "router_z_loss_clip": 0.85351562, + "router_z_loss_mlp": 0.1472168, + "step": 5898, + "time_per_iteration": 2.6808977127075195 + }, + { + "auxiliary_loss_clip": 0.01148305, + "auxiliary_loss_mlp": 0.01038469, + "balance_loss_clip": 1.0624733, + "balance_loss_mlp": 1.02483165, + "epoch": 0.35466706748835114, + "flos": 22857285876480.0, + "grad_norm": 1.5446357708182223, + "language_loss": 0.74356556, + "learning_rate": 2.9924578973859804e-06, + "loss": 0.76543331, + "num_input_tokens_seen": 126855145, + "router_z_loss_clip": 0.85888672, + "router_z_loss_mlp": 0.13641357, + "step": 5899, + "time_per_iteration": 2.723754644393921 + }, + { + "auxiliary_loss_clip": 0.01140473, + "auxiliary_loss_mlp": 0.01036079, + "balance_loss_clip": 1.05503511, + "balance_loss_mlp": 1.02136815, + "epoch": 0.3547271907410191, + "flos": 28330107742080.0, + "grad_norm": 1.8500740549918733, + "language_loss": 0.7956149, + "learning_rate": 2.9921197511711763e-06, + "loss": 0.81738043, + "num_input_tokens_seen": 126873790, + "router_z_loss_clip": 0.85351562, + "router_z_loss_mlp": 0.14703369, + "step": 5900, + "time_per_iteration": 2.837468385696411 + }, + { + "auxiliary_loss_clip": 0.01145524, + "auxiliary_loss_mlp": 0.01037998, + "balance_loss_clip": 1.06025422, + "balance_loss_mlp": 1.02209508, + "epoch": 0.35478731399368707, + "flos": 23514092017920.0, + "grad_norm": 1.8200496227662795, + "language_loss": 0.81226301, + "learning_rate": 2.991781567335093e-06, + "loss": 0.83409822, + "num_input_tokens_seen": 126892865, + "router_z_loss_clip": 0.85302734, + "router_z_loss_mlp": 0.15905762, + "step": 5901, + "time_per_iteration": 3.895824909210205 + }, + { + "auxiliary_loss_clip": 0.01147455, + "auxiliary_loss_mlp": 0.01035459, + "balance_loss_clip": 1.0593245, + "balance_loss_mlp": 1.02080798, + "epoch": 0.35484743724635504, + "flos": 18624315715200.0, + "grad_norm": 2.3545271775736842, + "language_loss": 0.7565006, + "learning_rate": 2.9914433458905525e-06, + "loss": 0.77832973, + "num_input_tokens_seen": 126911935, + "router_z_loss_clip": 0.88183594, + "router_z_loss_mlp": 0.14672852, + "step": 5902, + "time_per_iteration": 2.4962286949157715 + }, + { + "auxiliary_loss_clip": 0.01141204, + "auxiliary_loss_mlp": 0.01036887, + "balance_loss_clip": 1.05594122, + "balance_loss_mlp": 1.02301133, + "epoch": 0.354907560499023, + "flos": 17384499924480.0, + "grad_norm": 1.84223106079665, + "language_loss": 0.70754266, + "learning_rate": 2.991105086850381e-06, + "loss": 0.72932357, + "num_input_tokens_seen": 126930040, + "router_z_loss_clip": 0.85302734, + "router_z_loss_mlp": 0.13861084, + "step": 5903, + "time_per_iteration": 2.4365642070770264 + }, + { + "auxiliary_loss_clip": 0.01146458, + "auxiliary_loss_mlp": 0.01042935, + "balance_loss_clip": 1.05448651, + "balance_loss_mlp": 1.02758121, + "epoch": 0.35496768375169097, + "flos": 19208546426880.0, + "grad_norm": 2.5432804403252813, + "language_loss": 0.74968195, + "learning_rate": 2.9907667902274053e-06, + "loss": 0.77157593, + "num_input_tokens_seen": 126948390, + "router_z_loss_clip": 0.91943359, + "router_z_loss_mlp": 0.15356445, + "step": 5904, + "time_per_iteration": 2.4282314777374268 + }, + { + "auxiliary_loss_clip": 0.01140787, + "auxiliary_loss_mlp": 0.01036174, + "balance_loss_clip": 1.0542537, + "balance_loss_mlp": 1.02176178, + "epoch": 0.35502780700435893, + "flos": 18332792933760.0, + "grad_norm": 2.241676171334329, + "language_loss": 0.78626096, + "learning_rate": 2.9904284560344536e-06, + "loss": 0.80803061, + "num_input_tokens_seen": 126964905, + "router_z_loss_clip": 0.86523438, + "router_z_loss_mlp": 0.14404297, + "step": 5905, + "time_per_iteration": 2.427565336227417 + }, + { + "auxiliary_loss_clip": 0.01127604, + "auxiliary_loss_mlp": 0.01043867, + "balance_loss_clip": 1.04798365, + "balance_loss_mlp": 1.0295794, + "epoch": 0.3550879302570269, + "flos": 15448555578240.0, + "grad_norm": 1.9888527361772594, + "language_loss": 0.72616792, + "learning_rate": 2.990090084284356e-06, + "loss": 0.7478826, + "num_input_tokens_seen": 126982000, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.14300537, + "step": 5906, + "time_per_iteration": 2.458446502685547 + }, + { + "auxiliary_loss_clip": 0.01146987, + "auxiliary_loss_mlp": 0.01040096, + "balance_loss_clip": 1.0573231, + "balance_loss_mlp": 1.02394295, + "epoch": 0.35514805350969486, + "flos": 21979197999360.0, + "grad_norm": 2.2000127098901046, + "language_loss": 0.7504856, + "learning_rate": 2.9897516749899426e-06, + "loss": 0.77235639, + "num_input_tokens_seen": 126998390, + "router_z_loss_clip": 0.89697266, + "router_z_loss_mlp": 0.16162109, + "step": 5907, + "time_per_iteration": 3.8449790477752686 + }, + { + "auxiliary_loss_clip": 0.01131538, + "auxiliary_loss_mlp": 0.01040776, + "balance_loss_clip": 1.04784214, + "balance_loss_mlp": 1.02470684, + "epoch": 0.3552081767623628, + "flos": 29861949104640.0, + "grad_norm": 2.4656627494196948, + "language_loss": 0.75539207, + "learning_rate": 2.989413228164047e-06, + "loss": 0.77711523, + "num_input_tokens_seen": 127020220, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.16064453, + "step": 5908, + "time_per_iteration": 2.5163888931274414 + }, + { + "auxiliary_loss_clip": 0.01147123, + "auxiliary_loss_mlp": 0.01040488, + "balance_loss_clip": 1.06098044, + "balance_loss_mlp": 1.0260402, + "epoch": 0.3552683000150308, + "flos": 26432264747520.0, + "grad_norm": 2.0953815406764726, + "language_loss": 0.68979633, + "learning_rate": 2.989074743819502e-06, + "loss": 0.71167254, + "num_input_tokens_seen": 127038585, + "router_z_loss_clip": 0.86181641, + "router_z_loss_mlp": 0.14440918, + "step": 5909, + "time_per_iteration": 2.511106491088867 + }, + { + "auxiliary_loss_clip": 0.01146184, + "auxiliary_loss_mlp": 0.01039666, + "balance_loss_clip": 1.06319499, + "balance_loss_mlp": 1.02567065, + "epoch": 0.35532842326769876, + "flos": 19785989468160.0, + "grad_norm": 1.782181083242686, + "language_loss": 0.78236639, + "learning_rate": 2.988736221969144e-06, + "loss": 0.80422485, + "num_input_tokens_seen": 127056215, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.14001465, + "step": 5910, + "time_per_iteration": 2.428523540496826 + }, + { + "auxiliary_loss_clip": 0.01148848, + "auxiliary_loss_mlp": 0.01036094, + "balance_loss_clip": 1.06075215, + "balance_loss_mlp": 1.02117443, + "epoch": 0.3553885465203668, + "flos": 17239277237760.0, + "grad_norm": 1.7062827604400628, + "language_loss": 0.71058667, + "learning_rate": 2.98839766262581e-06, + "loss": 0.73243606, + "num_input_tokens_seen": 127075825, + "router_z_loss_clip": 0.88037109, + "router_z_loss_mlp": 0.14929199, + "step": 5911, + "time_per_iteration": 2.4479453563690186 + }, + { + "auxiliary_loss_clip": 0.01135772, + "auxiliary_loss_mlp": 0.01038097, + "balance_loss_clip": 1.05366528, + "balance_loss_mlp": 1.02333343, + "epoch": 0.35544866977303474, + "flos": 14934350430720.0, + "grad_norm": 2.1908694738987213, + "language_loss": 0.86990201, + "learning_rate": 2.9880590658023366e-06, + "loss": 0.89164072, + "num_input_tokens_seen": 127091205, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.14764404, + "step": 5912, + "time_per_iteration": 2.409292221069336 + }, + { + "auxiliary_loss_clip": 0.01137871, + "auxiliary_loss_mlp": 0.01032598, + "balance_loss_clip": 1.05613422, + "balance_loss_mlp": 1.01855493, + "epoch": 0.3555087930257027, + "flos": 19756040503680.0, + "grad_norm": 2.9169074551626033, + "language_loss": 0.76841259, + "learning_rate": 2.9877204315115646e-06, + "loss": 0.79011726, + "num_input_tokens_seen": 127109210, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.14044189, + "step": 5913, + "time_per_iteration": 2.9466724395751953 + }, + { + "auxiliary_loss_clip": 0.01141178, + "auxiliary_loss_mlp": 0.01036798, + "balance_loss_clip": 1.05838156, + "balance_loss_mlp": 1.0226059, + "epoch": 0.3555689162783707, + "flos": 21068252156160.0, + "grad_norm": 1.3522770703961133, + "language_loss": 0.82688129, + "learning_rate": 2.9873817597663353e-06, + "loss": 0.84866107, + "num_input_tokens_seen": 127128400, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.14190674, + "step": 5914, + "time_per_iteration": 2.5817718505859375 + }, + { + "auxiliary_loss_clip": 0.01141782, + "auxiliary_loss_mlp": 0.01046223, + "balance_loss_clip": 1.05644321, + "balance_loss_mlp": 1.03120828, + "epoch": 0.35562903953103864, + "flos": 33069633454080.0, + "grad_norm": 2.3847442815662467, + "language_loss": 0.70931196, + "learning_rate": 2.98704305057949e-06, + "loss": 0.73119193, + "num_input_tokens_seen": 127149965, + "router_z_loss_clip": 0.85351562, + "router_z_loss_mlp": 0.15008545, + "step": 5915, + "time_per_iteration": 2.6154584884643555 + }, + { + "auxiliary_loss_clip": 0.01141963, + "auxiliary_loss_mlp": 0.01035388, + "balance_loss_clip": 1.0567838, + "balance_loss_mlp": 1.02108288, + "epoch": 0.3556891627837066, + "flos": 20557853850240.0, + "grad_norm": 1.710657164950283, + "language_loss": 0.76191437, + "learning_rate": 2.9867043039638737e-06, + "loss": 0.78368789, + "num_input_tokens_seen": 127169865, + "router_z_loss_clip": 0.85205078, + "router_z_loss_mlp": 0.14306641, + "step": 5916, + "time_per_iteration": 2.5063743591308594 + }, + { + "auxiliary_loss_clip": 0.01141627, + "auxiliary_loss_mlp": 0.01035693, + "balance_loss_clip": 1.05572438, + "balance_loss_mlp": 1.02206779, + "epoch": 0.35574928603637457, + "flos": 20703327932160.0, + "grad_norm": 4.096400815680272, + "language_loss": 0.88465631, + "learning_rate": 2.986365519932332e-06, + "loss": 0.90642941, + "num_input_tokens_seen": 127188075, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.1362915, + "step": 5917, + "time_per_iteration": 2.446692943572998 + }, + { + "auxiliary_loss_clip": 0.01142758, + "auxiliary_loss_mlp": 0.0103455, + "balance_loss_clip": 1.05844915, + "balance_loss_mlp": 1.02044201, + "epoch": 0.35580940928904253, + "flos": 15194595444480.0, + "grad_norm": 2.1667075970957623, + "language_loss": 0.74540496, + "learning_rate": 2.98602669849771e-06, + "loss": 0.76717806, + "num_input_tokens_seen": 127206065, + "router_z_loss_clip": 0.84228516, + "router_z_loss_mlp": 0.14105225, + "step": 5918, + "time_per_iteration": 2.497922658920288 + }, + { + "auxiliary_loss_clip": 0.01096333, + "auxiliary_loss_mlp": 0.01005012, + "balance_loss_clip": 1.06654835, + "balance_loss_mlp": 1.0030663, + "epoch": 0.3558695325417105, + "flos": 58639145431680.0, + "grad_norm": 0.9361003094566411, + "language_loss": 0.63926244, + "learning_rate": 2.985687839672857e-06, + "loss": 0.66027594, + "num_input_tokens_seen": 127257885, + "router_z_loss_clip": 0.29785156, + "router_z_loss_mlp": 0.01947021, + "step": 5919, + "time_per_iteration": 2.8172872066497803 + }, + { + "auxiliary_loss_clip": 0.01143438, + "auxiliary_loss_mlp": 0.01030554, + "balance_loss_clip": 1.05742693, + "balance_loss_mlp": 1.01644564, + "epoch": 0.35592965579437846, + "flos": 22018233104640.0, + "grad_norm": 2.4494181697710133, + "language_loss": 0.74144173, + "learning_rate": 2.9853489434706223e-06, + "loss": 0.76318163, + "num_input_tokens_seen": 127275550, + "router_z_loss_clip": 0.86035156, + "router_z_loss_mlp": 0.14117432, + "step": 5920, + "time_per_iteration": 2.4297382831573486 + }, + { + "auxiliary_loss_clip": 0.01141352, + "auxiliary_loss_mlp": 0.01033232, + "balance_loss_clip": 1.05835938, + "balance_loss_mlp": 1.01982069, + "epoch": 0.35598977904704643, + "flos": 23367684182400.0, + "grad_norm": 1.9843077959952091, + "language_loss": 0.77048302, + "learning_rate": 2.985010009903857e-06, + "loss": 0.79222888, + "num_input_tokens_seen": 127295110, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.13409424, + "step": 5921, + "time_per_iteration": 2.489473342895508 + }, + { + "auxiliary_loss_clip": 0.01139852, + "auxiliary_loss_mlp": 0.01032488, + "balance_loss_clip": 1.05686247, + "balance_loss_mlp": 1.01882672, + "epoch": 0.3560499022997144, + "flos": 17785334770560.0, + "grad_norm": 2.593844230066171, + "language_loss": 0.68081683, + "learning_rate": 2.9846710389854133e-06, + "loss": 0.70254028, + "num_input_tokens_seen": 127312865, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.13671875, + "step": 5922, + "time_per_iteration": 2.469182014465332 + }, + { + "auxiliary_loss_clip": 0.01145135, + "auxiliary_loss_mlp": 0.0103504, + "balance_loss_clip": 1.06059992, + "balance_loss_mlp": 1.02108693, + "epoch": 0.35611002555238236, + "flos": 20740459616640.0, + "grad_norm": 2.77763812383432, + "language_loss": 0.79000664, + "learning_rate": 2.9843320307281454e-06, + "loss": 0.81180835, + "num_input_tokens_seen": 127331710, + "router_z_loss_clip": 0.84521484, + "router_z_loss_mlp": 0.1395874, + "step": 5923, + "time_per_iteration": 2.4713022708892822 + }, + { + "auxiliary_loss_clip": 0.01151452, + "auxiliary_loss_mlp": 0.01033382, + "balance_loss_clip": 1.06759727, + "balance_loss_mlp": 1.01992345, + "epoch": 0.3561701488050504, + "flos": 19462219251840.0, + "grad_norm": 2.1296785215458613, + "language_loss": 0.85725117, + "learning_rate": 2.983992985144908e-06, + "loss": 0.87909949, + "num_input_tokens_seen": 127350950, + "router_z_loss_clip": 0.83789062, + "router_z_loss_mlp": 0.13482666, + "step": 5924, + "time_per_iteration": 2.429004669189453 + }, + { + "auxiliary_loss_clip": 0.01138857, + "auxiliary_loss_mlp": 0.01038613, + "balance_loss_clip": 1.05562222, + "balance_loss_mlp": 1.02385497, + "epoch": 0.35623027205771834, + "flos": 30774942023040.0, + "grad_norm": 2.2258175495253507, + "language_loss": 0.77253425, + "learning_rate": 2.9836539022485578e-06, + "loss": 0.79430902, + "num_input_tokens_seen": 127369385, + "router_z_loss_clip": 0.83105469, + "router_z_loss_mlp": 0.14770508, + "step": 5925, + "time_per_iteration": 2.5009920597076416 + }, + { + "auxiliary_loss_clip": 0.01137592, + "auxiliary_loss_mlp": 0.01035936, + "balance_loss_clip": 1.05516064, + "balance_loss_mlp": 1.02201867, + "epoch": 0.3562903953103863, + "flos": 16981079299200.0, + "grad_norm": 1.856690801134581, + "language_loss": 0.75880545, + "learning_rate": 2.9833147820519535e-06, + "loss": 0.7805407, + "num_input_tokens_seen": 127386965, + "router_z_loss_clip": 0.82421875, + "router_z_loss_mlp": 0.13922119, + "step": 5926, + "time_per_iteration": 2.6217756271362305 + }, + { + "auxiliary_loss_clip": 0.01151664, + "auxiliary_loss_mlp": 0.01038093, + "balance_loss_clip": 1.06512547, + "balance_loss_mlp": 1.02291775, + "epoch": 0.3563505185630543, + "flos": 23839837482240.0, + "grad_norm": 2.4142533948392093, + "language_loss": 0.69716287, + "learning_rate": 2.9829756245679544e-06, + "loss": 0.71906042, + "num_input_tokens_seen": 127406075, + "router_z_loss_clip": 0.86474609, + "router_z_loss_mlp": 0.15179443, + "step": 5927, + "time_per_iteration": 2.7452828884124756 + }, + { + "auxiliary_loss_clip": 0.01138009, + "auxiliary_loss_mlp": 0.01038152, + "balance_loss_clip": 1.05460191, + "balance_loss_mlp": 1.02413249, + "epoch": 0.35641064181572224, + "flos": 22273450214400.0, + "grad_norm": 1.9122965418458553, + "language_loss": 0.7996074, + "learning_rate": 2.9826364298094212e-06, + "loss": 0.82136905, + "num_input_tokens_seen": 127425350, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.14013672, + "step": 5928, + "time_per_iteration": 2.4793574810028076 + }, + { + "auxiliary_loss_clip": 0.011468, + "auxiliary_loss_mlp": 0.01035818, + "balance_loss_clip": 1.06241679, + "balance_loss_mlp": 1.02144122, + "epoch": 0.3564707650683902, + "flos": 23001251587200.0, + "grad_norm": 4.910594400233363, + "language_loss": 0.81973624, + "learning_rate": 2.982297197789215e-06, + "loss": 0.84156239, + "num_input_tokens_seen": 127446335, + "router_z_loss_clip": 0.84375, + "router_z_loss_mlp": 0.14367676, + "step": 5929, + "time_per_iteration": 2.4743316173553467 + }, + { + "auxiliary_loss_clip": 0.01135811, + "auxiliary_loss_mlp": 0.01032323, + "balance_loss_clip": 1.05373979, + "balance_loss_mlp": 1.01888824, + "epoch": 0.35653088832105817, + "flos": 14684268965760.0, + "grad_norm": 1.6272177271923587, + "language_loss": 0.70679796, + "learning_rate": 2.981957928520201e-06, + "loss": 0.72847927, + "num_input_tokens_seen": 127462795, + "router_z_loss_clip": 0.82080078, + "router_z_loss_mlp": 0.13433838, + "step": 5930, + "time_per_iteration": 3.9413633346557617 + }, + { + "auxiliary_loss_clip": 0.01142986, + "auxiliary_loss_mlp": 0.01042993, + "balance_loss_clip": 1.05592716, + "balance_loss_mlp": 1.02681625, + "epoch": 0.35659101157372614, + "flos": 23477068074240.0, + "grad_norm": 2.465487742861928, + "language_loss": 0.67876995, + "learning_rate": 2.981618622015244e-06, + "loss": 0.70062977, + "num_input_tokens_seen": 127482675, + "router_z_loss_clip": 0.87158203, + "router_z_loss_mlp": 0.16149902, + "step": 5931, + "time_per_iteration": 2.446303367614746 + }, + { + "auxiliary_loss_clip": 0.01139436, + "auxiliary_loss_mlp": 0.01038183, + "balance_loss_clip": 1.05486917, + "balance_loss_mlp": 1.02392554, + "epoch": 0.3566511348263941, + "flos": 26578672583040.0, + "grad_norm": 1.9338311974714113, + "language_loss": 0.673388, + "learning_rate": 2.981279278287211e-06, + "loss": 0.6951642, + "num_input_tokens_seen": 127502275, + "router_z_loss_clip": 0.84667969, + "router_z_loss_mlp": 0.14251709, + "step": 5932, + "time_per_iteration": 2.544729471206665 + }, + { + "auxiliary_loss_clip": 0.0113241, + "auxiliary_loss_mlp": 0.01030028, + "balance_loss_clip": 1.05221653, + "balance_loss_mlp": 1.01643252, + "epoch": 0.35671125807906207, + "flos": 13115008609920.0, + "grad_norm": 2.134726673867565, + "language_loss": 0.78970599, + "learning_rate": 2.980939897348969e-06, + "loss": 0.81133038, + "num_input_tokens_seen": 127520195, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.13598633, + "step": 5933, + "time_per_iteration": 2.5132791996002197 + }, + { + "auxiliary_loss_clip": 0.01139432, + "auxiliary_loss_mlp": 0.01042169, + "balance_loss_clip": 1.0533576, + "balance_loss_mlp": 1.02809668, + "epoch": 0.35677138133173003, + "flos": 33000577557120.0, + "grad_norm": 1.5187782169752877, + "language_loss": 0.69754052, + "learning_rate": 2.980600479213388e-06, + "loss": 0.71935654, + "num_input_tokens_seen": 127544495, + "router_z_loss_clip": 0.86132812, + "router_z_loss_mlp": 0.14068604, + "step": 5934, + "time_per_iteration": 2.555983543395996 + }, + { + "auxiliary_loss_clip": 0.0113754, + "auxiliary_loss_mlp": 0.01036706, + "balance_loss_clip": 1.04912734, + "balance_loss_mlp": 1.02110171, + "epoch": 0.356831504584398, + "flos": 20777842696320.0, + "grad_norm": 1.766693174907817, + "language_loss": 0.71059453, + "learning_rate": 2.9802610238933384e-06, + "loss": 0.732337, + "num_input_tokens_seen": 127563810, + "router_z_loss_clip": 0.88525391, + "router_z_loss_mlp": 0.15606689, + "step": 5935, + "time_per_iteration": 2.4494338035583496 + }, + { + "auxiliary_loss_clip": 0.01139919, + "auxiliary_loss_mlp": 0.01036519, + "balance_loss_clip": 1.05517411, + "balance_loss_mlp": 1.02189791, + "epoch": 0.35689162783706596, + "flos": 12165566365440.0, + "grad_norm": 2.1796189843542892, + "language_loss": 0.78463304, + "learning_rate": 2.979921531401692e-06, + "loss": 0.80639744, + "num_input_tokens_seen": 127579065, + "router_z_loss_clip": 0.84814453, + "router_z_loss_mlp": 0.14630127, + "step": 5936, + "time_per_iteration": 2.444303274154663 + }, + { + "auxiliary_loss_clip": 0.01131179, + "auxiliary_loss_mlp": 0.01034664, + "balance_loss_clip": 1.04931223, + "balance_loss_mlp": 1.02026331, + "epoch": 0.356951751089734, + "flos": 23841489507840.0, + "grad_norm": 1.5620538655385354, + "language_loss": 0.64679217, + "learning_rate": 2.9795820017513242e-06, + "loss": 0.66845065, + "num_input_tokens_seen": 127599105, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.14404297, + "step": 5937, + "time_per_iteration": 2.4936702251434326 + }, + { + "auxiliary_loss_clip": 0.01145209, + "auxiliary_loss_mlp": 0.01035952, + "balance_loss_clip": 1.05817425, + "balance_loss_mlp": 1.0207175, + "epoch": 0.35701187434240195, + "flos": 11722176881280.0, + "grad_norm": 2.667259748340806, + "language_loss": 0.78974319, + "learning_rate": 2.9792424349551073e-06, + "loss": 0.81155479, + "num_input_tokens_seen": 127614940, + "router_z_loss_clip": 0.86962891, + "router_z_loss_mlp": 0.15246582, + "step": 5938, + "time_per_iteration": 2.4977829456329346 + }, + { + "auxiliary_loss_clip": 0.01144187, + "auxiliary_loss_mlp": 0.01036561, + "balance_loss_clip": 1.05965781, + "balance_loss_mlp": 1.02248192, + "epoch": 0.3570719975950699, + "flos": 24898879100160.0, + "grad_norm": 2.8678249315096744, + "language_loss": 0.80420876, + "learning_rate": 2.9789028310259202e-06, + "loss": 0.82601625, + "num_input_tokens_seen": 127634960, + "router_z_loss_clip": 0.84570312, + "router_z_loss_mlp": 0.14093018, + "step": 5939, + "time_per_iteration": 4.243799448013306 + }, + { + "auxiliary_loss_clip": 0.01150125, + "auxiliary_loss_mlp": 0.01034495, + "balance_loss_clip": 1.05815339, + "balance_loss_mlp": 1.01930761, + "epoch": 0.3571321208477379, + "flos": 25994836920960.0, + "grad_norm": 1.9699039417552215, + "language_loss": 0.78971171, + "learning_rate": 2.9785631899766395e-06, + "loss": 0.81155789, + "num_input_tokens_seen": 127654545, + "router_z_loss_clip": 0.92089844, + "router_z_loss_mlp": 0.15179443, + "step": 5940, + "time_per_iteration": 2.511401414871216 + }, + { + "auxiliary_loss_clip": 0.01132323, + "auxiliary_loss_mlp": 0.01033196, + "balance_loss_clip": 1.04850245, + "balance_loss_mlp": 1.01766944, + "epoch": 0.35719224410040584, + "flos": 14501663199360.0, + "grad_norm": 1.9256506551614079, + "language_loss": 0.72477877, + "learning_rate": 2.9782235118201443e-06, + "loss": 0.74643397, + "num_input_tokens_seen": 127672320, + "router_z_loss_clip": 0.83837891, + "router_z_loss_mlp": 0.15509033, + "step": 5941, + "time_per_iteration": 2.501847505569458 + }, + { + "auxiliary_loss_clip": 0.01139662, + "auxiliary_loss_mlp": 0.01041318, + "balance_loss_clip": 1.0544219, + "balance_loss_mlp": 1.02472377, + "epoch": 0.3572523673530738, + "flos": 31175453646720.0, + "grad_norm": 2.304692050433971, + "language_loss": 0.64689475, + "learning_rate": 2.9778837965693154e-06, + "loss": 0.66870463, + "num_input_tokens_seen": 127693315, + "router_z_loss_clip": 0.85253906, + "router_z_loss_mlp": 0.16607666, + "step": 5942, + "time_per_iteration": 2.542701482772827 + }, + { + "auxiliary_loss_clip": 0.01136202, + "auxiliary_loss_mlp": 0.01039462, + "balance_loss_clip": 1.05246174, + "balance_loss_mlp": 1.0239464, + "epoch": 0.3573124906057418, + "flos": 15851976203520.0, + "grad_norm": 2.5196852639414056, + "language_loss": 0.73809409, + "learning_rate": 2.9775440442370354e-06, + "loss": 0.75985074, + "num_input_tokens_seen": 127711570, + "router_z_loss_clip": 0.83837891, + "router_z_loss_mlp": 0.1552124, + "step": 5943, + "time_per_iteration": 2.422698974609375 + }, + { + "auxiliary_loss_clip": 0.0107795, + "auxiliary_loss_mlp": 0.01004687, + "balance_loss_clip": 1.04894531, + "balance_loss_mlp": 1.00316525, + "epoch": 0.35737261385840974, + "flos": 60822729118080.0, + "grad_norm": 0.9312978901503338, + "language_loss": 0.60688615, + "learning_rate": 2.9772042548361867e-06, + "loss": 0.62771249, + "num_input_tokens_seen": 127772475, + "router_z_loss_clip": 0.29003906, + "router_z_loss_mlp": 0.01521301, + "step": 5944, + "time_per_iteration": 3.1678426265716553 + }, + { + "auxiliary_loss_clip": 0.01140361, + "auxiliary_loss_mlp": 0.01032772, + "balance_loss_clip": 1.05814016, + "balance_loss_mlp": 1.01849103, + "epoch": 0.3574327371110777, + "flos": 18843765857280.0, + "grad_norm": 1.8711556184660505, + "language_loss": 0.72530401, + "learning_rate": 2.976864428379655e-06, + "loss": 0.74703538, + "num_input_tokens_seen": 127790940, + "router_z_loss_clip": 0.82226562, + "router_z_loss_mlp": 0.1428833, + "step": 5945, + "time_per_iteration": 3.804718494415283 + }, + { + "auxiliary_loss_clip": 0.01133582, + "auxiliary_loss_mlp": 0.01042059, + "balance_loss_clip": 1.05069876, + "balance_loss_mlp": 1.02777815, + "epoch": 0.35749286036374567, + "flos": 23549679417600.0, + "grad_norm": 1.7794557192373477, + "language_loss": 0.81140792, + "learning_rate": 2.976524564880326e-06, + "loss": 0.83316433, + "num_input_tokens_seen": 127808275, + "router_z_loss_clip": 0.82910156, + "router_z_loss_mlp": 0.14282227, + "step": 5946, + "time_per_iteration": 2.420393705368042 + }, + { + "auxiliary_loss_clip": 0.01149569, + "auxiliary_loss_mlp": 0.01046775, + "balance_loss_clip": 1.06334615, + "balance_loss_mlp": 1.03110468, + "epoch": 0.35755298361641363, + "flos": 21105491581440.0, + "grad_norm": 1.3368854920331465, + "language_loss": 0.69045532, + "learning_rate": 2.9761846643510882e-06, + "loss": 0.71241879, + "num_input_tokens_seen": 127828840, + "router_z_loss_clip": 0.86181641, + "router_z_loss_mlp": 0.15661621, + "step": 5947, + "time_per_iteration": 2.4616434574127197 + }, + { + "auxiliary_loss_clip": 0.01134467, + "auxiliary_loss_mlp": 0.01055152, + "balance_loss_clip": 1.05322778, + "balance_loss_mlp": 1.03880799, + "epoch": 0.3576131068690816, + "flos": 19245031666560.0, + "grad_norm": 1.7751744311796032, + "language_loss": 0.75816339, + "learning_rate": 2.9758447268048297e-06, + "loss": 0.78005952, + "num_input_tokens_seen": 127846240, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.16333008, + "step": 5948, + "time_per_iteration": 2.4048218727111816 + }, + { + "auxiliary_loss_clip": 0.01130889, + "auxiliary_loss_mlp": 0.01040004, + "balance_loss_clip": 1.04739797, + "balance_loss_mlp": 1.02574623, + "epoch": 0.35767323012174956, + "flos": 28654703971200.0, + "grad_norm": 4.049596286696511, + "language_loss": 0.70692557, + "learning_rate": 2.9755047522544415e-06, + "loss": 0.72863448, + "num_input_tokens_seen": 127866880, + "router_z_loss_clip": 0.83544922, + "router_z_loss_mlp": 0.14251709, + "step": 5949, + "time_per_iteration": 2.485452890396118 + }, + { + "auxiliary_loss_clip": 0.01144844, + "auxiliary_loss_mlp": 0.0103799, + "balance_loss_clip": 1.06163716, + "balance_loss_mlp": 1.02363729, + "epoch": 0.35773335337441753, + "flos": 17085363459840.0, + "grad_norm": 2.061806547035636, + "language_loss": 0.77430898, + "learning_rate": 2.9751647407128154e-06, + "loss": 0.79613733, + "num_input_tokens_seen": 127883560, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.14337158, + "step": 5950, + "time_per_iteration": 3.8078861236572266 + }, + { + "auxiliary_loss_clip": 0.01141485, + "auxiliary_loss_mlp": 0.01035012, + "balance_loss_clip": 1.05585337, + "balance_loss_mlp": 1.0200448, + "epoch": 0.35779347662708555, + "flos": 15888605097600.0, + "grad_norm": 1.654989485023394, + "language_loss": 0.72495949, + "learning_rate": 2.9748246921928445e-06, + "loss": 0.74672437, + "num_input_tokens_seen": 127902330, + "router_z_loss_clip": 0.85644531, + "router_z_loss_mlp": 0.14971924, + "step": 5951, + "time_per_iteration": 2.422715187072754 + }, + { + "auxiliary_loss_clip": 0.01135238, + "auxiliary_loss_mlp": 0.0104387, + "balance_loss_clip": 1.04935801, + "balance_loss_mlp": 1.02704906, + "epoch": 0.3578535998797535, + "flos": 28658834035200.0, + "grad_norm": 2.3284342206333095, + "language_loss": 0.70256841, + "learning_rate": 2.9744846067074236e-06, + "loss": 0.72435951, + "num_input_tokens_seen": 127922325, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.16809082, + "step": 5952, + "time_per_iteration": 2.8243179321289062 + }, + { + "auxiliary_loss_clip": 0.01137376, + "auxiliary_loss_mlp": 0.01034414, + "balance_loss_clip": 1.05498314, + "balance_loss_mlp": 1.02104425, + "epoch": 0.3579137231324215, + "flos": 37852432076160.0, + "grad_norm": 1.8711121936347235, + "language_loss": 0.69665766, + "learning_rate": 2.974144484269449e-06, + "loss": 0.71837556, + "num_input_tokens_seen": 127942635, + "router_z_loss_clip": 0.82470703, + "router_z_loss_mlp": 0.13348389, + "step": 5953, + "time_per_iteration": 2.5834076404571533 + }, + { + "auxiliary_loss_clip": 0.01153354, + "auxiliary_loss_mlp": 0.01031099, + "balance_loss_clip": 1.07022977, + "balance_loss_mlp": 1.01687694, + "epoch": 0.35797384638508944, + "flos": 22346851656960.0, + "grad_norm": 1.6961128543633943, + "language_loss": 0.66435683, + "learning_rate": 2.9738043248918175e-06, + "loss": 0.68620133, + "num_input_tokens_seen": 127962520, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.14221191, + "step": 5954, + "time_per_iteration": 2.510937452316284 + }, + { + "auxiliary_loss_clip": 0.01134869, + "auxiliary_loss_mlp": 0.01035473, + "balance_loss_clip": 1.0526495, + "balance_loss_mlp": 1.02142382, + "epoch": 0.3580339696377574, + "flos": 13589711775360.0, + "grad_norm": 1.8296450938857545, + "language_loss": 0.75028294, + "learning_rate": 2.9734641285874282e-06, + "loss": 0.77198637, + "num_input_tokens_seen": 127981180, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.14025879, + "step": 5955, + "time_per_iteration": 2.4069833755493164 + }, + { + "auxiliary_loss_clip": 0.01129464, + "auxiliary_loss_mlp": 0.01035619, + "balance_loss_clip": 1.04896092, + "balance_loss_mlp": 1.02069449, + "epoch": 0.3580940928904254, + "flos": 23768231719680.0, + "grad_norm": 1.7637907709962957, + "language_loss": 0.76108092, + "learning_rate": 2.973123895369182e-06, + "loss": 0.78273177, + "num_input_tokens_seen": 127999725, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.14941406, + "step": 5956, + "time_per_iteration": 2.464806318283081 + }, + { + "auxiliary_loss_clip": 0.0113357, + "auxiliary_loss_mlp": 0.01032095, + "balance_loss_clip": 1.05370378, + "balance_loss_mlp": 1.01846302, + "epoch": 0.35815421614309334, + "flos": 19463871277440.0, + "grad_norm": 1.6988311768276736, + "language_loss": 0.73477221, + "learning_rate": 2.9727836252499805e-06, + "loss": 0.75642884, + "num_input_tokens_seen": 128018885, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.1362915, + "step": 5957, + "time_per_iteration": 2.4101250171661377 + }, + { + "auxiliary_loss_clip": 0.01143967, + "auxiliary_loss_mlp": 0.0103366, + "balance_loss_clip": 1.06155825, + "balance_loss_mlp": 1.02073169, + "epoch": 0.3582143393957613, + "flos": 23368186972800.0, + "grad_norm": 1.860980943251136, + "language_loss": 0.71650678, + "learning_rate": 2.972443318242726e-06, + "loss": 0.73828304, + "num_input_tokens_seen": 128037875, + "router_z_loss_clip": 0.82421875, + "router_z_loss_mlp": 0.12939453, + "step": 5958, + "time_per_iteration": 2.5098979473114014 + }, + { + "auxiliary_loss_clip": 0.01131687, + "auxiliary_loss_mlp": 0.01029709, + "balance_loss_clip": 1.05213094, + "balance_loss_mlp": 1.01676881, + "epoch": 0.35827446264842927, + "flos": 26323275905280.0, + "grad_norm": 3.369038881581994, + "language_loss": 0.88536608, + "learning_rate": 2.972102974360324e-06, + "loss": 0.90698004, + "num_input_tokens_seen": 128056045, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.12939453, + "step": 5959, + "time_per_iteration": 2.487675666809082 + }, + { + "auxiliary_loss_clip": 0.0112902, + "auxiliary_loss_mlp": 0.01033062, + "balance_loss_clip": 1.04918289, + "balance_loss_mlp": 1.01933479, + "epoch": 0.35833458590109724, + "flos": 30446610779520.0, + "grad_norm": 1.6373242048261503, + "language_loss": 0.58214521, + "learning_rate": 2.971762593615679e-06, + "loss": 0.60376608, + "num_input_tokens_seen": 128077815, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.13726807, + "step": 5960, + "time_per_iteration": 2.6144702434539795 + }, + { + "auxiliary_loss_clip": 0.01137446, + "auxiliary_loss_mlp": 0.01037708, + "balance_loss_clip": 1.05378485, + "balance_loss_mlp": 1.02235365, + "epoch": 0.3583947091537652, + "flos": 14829886702080.0, + "grad_norm": 2.1675064731615197, + "language_loss": 0.76344788, + "learning_rate": 2.9714221760216993e-06, + "loss": 0.78519934, + "num_input_tokens_seen": 128095460, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.15356445, + "step": 5961, + "time_per_iteration": 2.507870674133301 + }, + { + "auxiliary_loss_clip": 0.01142114, + "auxiliary_loss_mlp": 0.01034746, + "balance_loss_clip": 1.05808949, + "balance_loss_mlp": 1.02019644, + "epoch": 0.35845483240643317, + "flos": 34240644743040.0, + "grad_norm": 1.9935176797530132, + "language_loss": 0.7072084, + "learning_rate": 2.971081721591294e-06, + "loss": 0.72897696, + "num_input_tokens_seen": 128118605, + "router_z_loss_clip": 0.84033203, + "router_z_loss_mlp": 0.14550781, + "step": 5962, + "time_per_iteration": 2.599658489227295 + }, + { + "auxiliary_loss_clip": 0.01144948, + "auxiliary_loss_mlp": 0.01036613, + "balance_loss_clip": 1.06321645, + "balance_loss_mlp": 1.02358317, + "epoch": 0.35851495565910113, + "flos": 20960089326720.0, + "grad_norm": 1.664794906416684, + "language_loss": 0.74698025, + "learning_rate": 2.9707412303373716e-06, + "loss": 0.76879585, + "num_input_tokens_seen": 128139205, + "router_z_loss_clip": 0.81738281, + "router_z_loss_mlp": 0.13031006, + "step": 5963, + "time_per_iteration": 2.4574272632598877 + }, + { + "auxiliary_loss_clip": 0.01140831, + "auxiliary_loss_mlp": 0.01043493, + "balance_loss_clip": 1.05653834, + "balance_loss_mlp": 1.02924764, + "epoch": 0.35857507891176915, + "flos": 22309863626880.0, + "grad_norm": 1.823658465255433, + "language_loss": 0.78462374, + "learning_rate": 2.9704007022728447e-06, + "loss": 0.80646694, + "num_input_tokens_seen": 128158765, + "router_z_loss_clip": 0.84423828, + "router_z_loss_mlp": 0.14257812, + "step": 5964, + "time_per_iteration": 2.5265769958496094 + }, + { + "auxiliary_loss_clip": 0.01153912, + "auxiliary_loss_mlp": 0.01034818, + "balance_loss_clip": 1.06678629, + "balance_loss_mlp": 1.02008367, + "epoch": 0.3586352021644371, + "flos": 23367863750400.0, + "grad_norm": 1.7468196458107714, + "language_loss": 0.66500241, + "learning_rate": 2.970060137410626e-06, + "loss": 0.68688965, + "num_input_tokens_seen": 128177850, + "router_z_loss_clip": 0.87207031, + "router_z_loss_mlp": 0.14746094, + "step": 5965, + "time_per_iteration": 2.688416004180908 + }, + { + "auxiliary_loss_clip": 0.01154122, + "auxiliary_loss_mlp": 0.0103952, + "balance_loss_clip": 1.06962729, + "balance_loss_mlp": 1.02506542, + "epoch": 0.3586953254171051, + "flos": 27849227437440.0, + "grad_norm": 1.6597914955664412, + "language_loss": 0.78663993, + "learning_rate": 2.9697195357636294e-06, + "loss": 0.80857635, + "num_input_tokens_seen": 128196925, + "router_z_loss_clip": 0.84521484, + "router_z_loss_mlp": 0.14459229, + "step": 5966, + "time_per_iteration": 2.6736655235290527 + }, + { + "auxiliary_loss_clip": 0.01136309, + "auxiliary_loss_mlp": 0.01037412, + "balance_loss_clip": 1.05364263, + "balance_loss_mlp": 1.02267146, + "epoch": 0.35875544866977305, + "flos": 19500500171520.0, + "grad_norm": 1.996657111543041, + "language_loss": 0.9105984, + "learning_rate": 2.9693788973447715e-06, + "loss": 0.93233562, + "num_input_tokens_seen": 128213955, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.14733887, + "step": 5967, + "time_per_iteration": 2.4469196796417236 + }, + { + "auxiliary_loss_clip": 0.01144752, + "auxiliary_loss_mlp": 0.01051967, + "balance_loss_clip": 1.05730557, + "balance_loss_mlp": 1.03400254, + "epoch": 0.358815571922441, + "flos": 21471134077440.0, + "grad_norm": 2.2097900462132585, + "language_loss": 0.80346262, + "learning_rate": 2.9690382221669682e-06, + "loss": 0.82542986, + "num_input_tokens_seen": 128232980, + "router_z_loss_clip": 0.87451172, + "router_z_loss_mlp": 0.1796875, + "step": 5968, + "time_per_iteration": 2.459371328353882 + }, + { + "auxiliary_loss_clip": 0.01141631, + "auxiliary_loss_mlp": 0.01053412, + "balance_loss_clip": 1.05360067, + "balance_loss_mlp": 1.03603148, + "epoch": 0.358875695175109, + "flos": 21835411856640.0, + "grad_norm": 2.1732510234989197, + "language_loss": 0.8428424, + "learning_rate": 2.9686975102431384e-06, + "loss": 0.86479282, + "num_input_tokens_seen": 128252795, + "router_z_loss_clip": 0.87939453, + "router_z_loss_mlp": 0.17382812, + "step": 5969, + "time_per_iteration": 2.459486484527588 + }, + { + "auxiliary_loss_clip": 0.01134124, + "auxiliary_loss_mlp": 0.01030841, + "balance_loss_clip": 1.05224299, + "balance_loss_mlp": 1.01715541, + "epoch": 0.35893581842777694, + "flos": 32011633330560.0, + "grad_norm": 1.8759561486858274, + "language_loss": 0.72323942, + "learning_rate": 2.968356761586202e-06, + "loss": 0.74488902, + "num_input_tokens_seen": 128273115, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.13690186, + "step": 5970, + "time_per_iteration": 2.5229196548461914 + }, + { + "auxiliary_loss_clip": 0.01137227, + "auxiliary_loss_mlp": 0.01044351, + "balance_loss_clip": 1.05392313, + "balance_loss_mlp": 1.02817464, + "epoch": 0.3589959416804449, + "flos": 20485817124480.0, + "grad_norm": 1.6008992847700876, + "language_loss": 0.79726571, + "learning_rate": 2.9680159762090805e-06, + "loss": 0.81908154, + "num_input_tokens_seen": 128292220, + "router_z_loss_clip": 0.83300781, + "router_z_loss_mlp": 0.16186523, + "step": 5971, + "time_per_iteration": 2.421015501022339 + }, + { + "auxiliary_loss_clip": 0.01134479, + "auxiliary_loss_mlp": 0.01040873, + "balance_loss_clip": 1.04882693, + "balance_loss_mlp": 1.02561414, + "epoch": 0.3590560649331129, + "flos": 16180666583040.0, + "grad_norm": 3.3794068434234603, + "language_loss": 0.78764248, + "learning_rate": 2.967675154124696e-06, + "loss": 0.80939603, + "num_input_tokens_seen": 128310305, + "router_z_loss_clip": 0.85693359, + "router_z_loss_mlp": 0.15234375, + "step": 5972, + "time_per_iteration": 2.414687156677246 + }, + { + "auxiliary_loss_clip": 0.01137284, + "auxiliary_loss_mlp": 0.01035482, + "balance_loss_clip": 1.05348694, + "balance_loss_mlp": 1.02172565, + "epoch": 0.35911618818578084, + "flos": 20375391738240.0, + "grad_norm": 2.0184539502309646, + "language_loss": 0.81481087, + "learning_rate": 2.9673342953459722e-06, + "loss": 0.83653855, + "num_input_tokens_seen": 128328305, + "router_z_loss_clip": 0.83789062, + "router_z_loss_mlp": 0.13781738, + "step": 5973, + "time_per_iteration": 3.937230110168457 + }, + { + "auxiliary_loss_clip": 0.01069396, + "auxiliary_loss_mlp": 0.01001839, + "balance_loss_clip": 1.04125547, + "balance_loss_mlp": 1.0002234, + "epoch": 0.3591763114384488, + "flos": 41236691685120.0, + "grad_norm": 0.9278549716252019, + "language_loss": 0.56751966, + "learning_rate": 2.9669933998858355e-06, + "loss": 0.58823198, + "num_input_tokens_seen": 128378380, + "router_z_loss_clip": 0.28076172, + "router_z_loss_mlp": 0.01615906, + "step": 5974, + "time_per_iteration": 2.9035227298736572 + }, + { + "auxiliary_loss_clip": 0.01143878, + "auxiliary_loss_mlp": 0.01043834, + "balance_loss_clip": 1.058532, + "balance_loss_mlp": 1.02803874, + "epoch": 0.35923643469111677, + "flos": 18695454600960.0, + "grad_norm": 2.6407844204117166, + "language_loss": 0.69076228, + "learning_rate": 2.9666524677572114e-06, + "loss": 0.71263939, + "num_input_tokens_seen": 128394315, + "router_z_loss_clip": 0.85351562, + "router_z_loss_mlp": 0.15783691, + "step": 5975, + "time_per_iteration": 2.43460750579834 + }, + { + "auxiliary_loss_clip": 0.01147053, + "auxiliary_loss_mlp": 0.01034996, + "balance_loss_clip": 1.06571198, + "balance_loss_mlp": 1.02165043, + "epoch": 0.35929655794378473, + "flos": 25009950931200.0, + "grad_norm": 1.666068296520682, + "language_loss": 0.80365694, + "learning_rate": 2.96631149897303e-06, + "loss": 0.82547748, + "num_input_tokens_seen": 128414515, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.13342285, + "step": 5976, + "time_per_iteration": 2.4603068828582764 + }, + { + "auxiliary_loss_clip": 0.01138142, + "auxiliary_loss_mlp": 0.01038806, + "balance_loss_clip": 1.05474007, + "balance_loss_mlp": 1.02348793, + "epoch": 0.35935668119645275, + "flos": 14975576265600.0, + "grad_norm": 1.9031623130326214, + "language_loss": 0.78810525, + "learning_rate": 2.9659704935462194e-06, + "loss": 0.80987471, + "num_input_tokens_seen": 128430615, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.15332031, + "step": 5977, + "time_per_iteration": 2.4062883853912354 + }, + { + "auxiliary_loss_clip": 0.01137665, + "auxiliary_loss_mlp": 0.0105188, + "balance_loss_clip": 1.05199981, + "balance_loss_mlp": 1.03761089, + "epoch": 0.3594168044491207, + "flos": 21178138838400.0, + "grad_norm": 1.81095642390537, + "language_loss": 0.80231172, + "learning_rate": 2.9656294514897102e-06, + "loss": 0.82420719, + "num_input_tokens_seen": 128449480, + "router_z_loss_clip": 0.85693359, + "router_z_loss_mlp": 0.1427002, + "step": 5978, + "time_per_iteration": 2.4172475337982178 + }, + { + "auxiliary_loss_clip": 0.01139236, + "auxiliary_loss_mlp": 0.01034379, + "balance_loss_clip": 1.05647159, + "balance_loss_mlp": 1.01984167, + "epoch": 0.3594769277017887, + "flos": 27672152365440.0, + "grad_norm": 1.5540112184800252, + "language_loss": 0.67416018, + "learning_rate": 2.965288372816436e-06, + "loss": 0.69589639, + "num_input_tokens_seen": 128471465, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.14501953, + "step": 5979, + "time_per_iteration": 2.825150728225708 + }, + { + "auxiliary_loss_clip": 0.01140374, + "auxiliary_loss_mlp": 0.01036346, + "balance_loss_clip": 1.0551585, + "balance_loss_mlp": 1.02158237, + "epoch": 0.35953705095445665, + "flos": 23002328995200.0, + "grad_norm": 1.9804520312397118, + "language_loss": 0.67270261, + "learning_rate": 2.9649472575393296e-06, + "loss": 0.69446981, + "num_input_tokens_seen": 128490645, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.14752197, + "step": 5980, + "time_per_iteration": 2.4422993659973145 + }, + { + "auxiliary_loss_clip": 0.01141558, + "auxiliary_loss_mlp": 0.01048566, + "balance_loss_clip": 1.05350971, + "balance_loss_mlp": 1.03038621, + "epoch": 0.3595971742071246, + "flos": 25513992529920.0, + "grad_norm": 1.7881717319331405, + "language_loss": 0.71525371, + "learning_rate": 2.964606105671327e-06, + "loss": 0.73715496, + "num_input_tokens_seen": 128510225, + "router_z_loss_clip": 0.87988281, + "router_z_loss_mlp": 0.18164062, + "step": 5981, + "time_per_iteration": 2.567004442214966 + }, + { + "auxiliary_loss_clip": 0.01150308, + "auxiliary_loss_mlp": 0.01033734, + "balance_loss_clip": 1.06401467, + "balance_loss_mlp": 1.01759887, + "epoch": 0.3596572974597926, + "flos": 29862559635840.0, + "grad_norm": 2.7859029823843806, + "language_loss": 0.71396571, + "learning_rate": 2.9642649172253635e-06, + "loss": 0.73580617, + "num_input_tokens_seen": 128530195, + "router_z_loss_clip": 0.86328125, + "router_z_loss_mlp": 0.16143799, + "step": 5982, + "time_per_iteration": 2.5968844890594482 + }, + { + "auxiliary_loss_clip": 0.01138551, + "auxiliary_loss_mlp": 0.01034998, + "balance_loss_clip": 1.05860722, + "balance_loss_mlp": 1.02151585, + "epoch": 0.35971742071246054, + "flos": 23112538899840.0, + "grad_norm": 1.7740915904962493, + "language_loss": 0.76341367, + "learning_rate": 2.9639236922143786e-06, + "loss": 0.7851491, + "num_input_tokens_seen": 128549990, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.13494873, + "step": 5983, + "time_per_iteration": 3.9442906379699707 + }, + { + "auxiliary_loss_clip": 0.01142402, + "auxiliary_loss_mlp": 0.01045809, + "balance_loss_clip": 1.05643117, + "balance_loss_mlp": 1.02956724, + "epoch": 0.3597775439651285, + "flos": 16725359399040.0, + "grad_norm": 2.3726763438162353, + "language_loss": 0.76173186, + "learning_rate": 2.96358243065131e-06, + "loss": 0.78361392, + "num_input_tokens_seen": 128567925, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.16229248, + "step": 5984, + "time_per_iteration": 2.411978006362915 + }, + { + "auxiliary_loss_clip": 0.01136356, + "auxiliary_loss_mlp": 0.01036516, + "balance_loss_clip": 1.05260491, + "balance_loss_mlp": 1.02144217, + "epoch": 0.3598376672177965, + "flos": 19719483436800.0, + "grad_norm": 1.73719721474691, + "language_loss": 0.86564338, + "learning_rate": 2.9632411325490993e-06, + "loss": 0.88737214, + "num_input_tokens_seen": 128585655, + "router_z_loss_clip": 0.83789062, + "router_z_loss_mlp": 0.15057373, + "step": 5985, + "time_per_iteration": 2.4438211917877197 + }, + { + "auxiliary_loss_clip": 0.01142695, + "auxiliary_loss_mlp": 0.01038922, + "balance_loss_clip": 1.06122506, + "balance_loss_mlp": 1.02375901, + "epoch": 0.35989779047046444, + "flos": 17311529445120.0, + "grad_norm": 1.5125835151382556, + "language_loss": 0.72888529, + "learning_rate": 2.9628997979206884e-06, + "loss": 0.75070143, + "num_input_tokens_seen": 128604820, + "router_z_loss_clip": 0.81494141, + "router_z_loss_mlp": 0.1517334, + "step": 5986, + "time_per_iteration": 2.41601300239563 + }, + { + "auxiliary_loss_clip": 0.01143506, + "auxiliary_loss_mlp": 0.01043091, + "balance_loss_clip": 1.05363202, + "balance_loss_mlp": 1.02767706, + "epoch": 0.3599579137231324, + "flos": 22711237176960.0, + "grad_norm": 1.9729246833315972, + "language_loss": 0.73704696, + "learning_rate": 2.9625584267790204e-06, + "loss": 0.75891292, + "num_input_tokens_seen": 128623070, + "router_z_loss_clip": 0.89892578, + "router_z_loss_mlp": 0.15429688, + "step": 5987, + "time_per_iteration": 2.496981382369995 + }, + { + "auxiliary_loss_clip": 0.01146063, + "auxiliary_loss_mlp": 0.01036881, + "balance_loss_clip": 1.05984783, + "balance_loss_mlp": 1.02134824, + "epoch": 0.36001803697580037, + "flos": 20959873845120.0, + "grad_norm": 2.0139300720478803, + "language_loss": 0.70009774, + "learning_rate": 2.9622170191370404e-06, + "loss": 0.72192723, + "num_input_tokens_seen": 128642430, + "router_z_loss_clip": 0.86279297, + "router_z_loss_mlp": 0.15527344, + "step": 5988, + "time_per_iteration": 2.4063000679016113 + }, + { + "auxiliary_loss_clip": 0.01148105, + "auxiliary_loss_mlp": 0.01041801, + "balance_loss_clip": 1.06063271, + "balance_loss_mlp": 1.02689409, + "epoch": 0.36007816022846834, + "flos": 20485565729280.0, + "grad_norm": 1.758257126721296, + "language_loss": 0.73280269, + "learning_rate": 2.9618755750076953e-06, + "loss": 0.75470173, + "num_input_tokens_seen": 128661285, + "router_z_loss_clip": 0.87451172, + "router_z_loss_mlp": 0.14898682, + "step": 5989, + "time_per_iteration": 3.820918321609497 + }, + { + "auxiliary_loss_clip": 0.01137677, + "auxiliary_loss_mlp": 0.01044853, + "balance_loss_clip": 1.05211878, + "balance_loss_mlp": 1.02941597, + "epoch": 0.36013828348113636, + "flos": 28001237794560.0, + "grad_norm": 1.5483445508747604, + "language_loss": 0.7996242, + "learning_rate": 2.961534094403931e-06, + "loss": 0.82144958, + "num_input_tokens_seen": 128682210, + "router_z_loss_clip": 0.85546875, + "router_z_loss_mlp": 0.15441895, + "step": 5990, + "time_per_iteration": 2.4997291564941406 + }, + { + "auxiliary_loss_clip": 0.01142076, + "auxiliary_loss_mlp": 0.01034548, + "balance_loss_clip": 1.05621266, + "balance_loss_mlp": 1.01926494, + "epoch": 0.3601984067338043, + "flos": 20082181017600.0, + "grad_norm": 1.6954801065414304, + "language_loss": 0.8387183, + "learning_rate": 2.961192577338698e-06, + "loss": 0.86048448, + "num_input_tokens_seen": 128700445, + "router_z_loss_clip": 0.85839844, + "router_z_loss_mlp": 0.15283203, + "step": 5991, + "time_per_iteration": 2.594167709350586 + }, + { + "auxiliary_loss_clip": 0.0115652, + "auxiliary_loss_mlp": 0.0104036, + "balance_loss_clip": 1.06688952, + "balance_loss_mlp": 1.0253818, + "epoch": 0.3602585299864723, + "flos": 18617599872000.0, + "grad_norm": 5.311782496155776, + "language_loss": 0.75903738, + "learning_rate": 2.9608510238249463e-06, + "loss": 0.78100622, + "num_input_tokens_seen": 128716855, + "router_z_loss_clip": 0.89501953, + "router_z_loss_mlp": 0.14978027, + "step": 5992, + "time_per_iteration": 2.660306692123413 + }, + { + "auxiliary_loss_clip": 0.01139313, + "auxiliary_loss_mlp": 0.01041161, + "balance_loss_clip": 1.05533493, + "balance_loss_mlp": 1.02584219, + "epoch": 0.36031865323914025, + "flos": 19573003774080.0, + "grad_norm": 1.835051318318955, + "language_loss": 0.77506477, + "learning_rate": 2.960509433875627e-06, + "loss": 0.79686952, + "num_input_tokens_seen": 128735835, + "router_z_loss_clip": 0.84033203, + "router_z_loss_mlp": 0.15307617, + "step": 5993, + "time_per_iteration": 2.3922338485717773 + }, + { + "auxiliary_loss_clip": 0.01156231, + "auxiliary_loss_mlp": 0.01040761, + "balance_loss_clip": 1.06820416, + "balance_loss_mlp": 1.02558565, + "epoch": 0.3603787764918082, + "flos": 17490615678720.0, + "grad_norm": 3.2878725727449782, + "language_loss": 0.74669576, + "learning_rate": 2.9601678075036943e-06, + "loss": 0.76866573, + "num_input_tokens_seen": 128752465, + "router_z_loss_clip": 0.88037109, + "router_z_loss_mlp": 0.15185547, + "step": 5994, + "time_per_iteration": 2.490992307662964 + }, + { + "auxiliary_loss_clip": 0.0114723, + "auxiliary_loss_mlp": 0.01036216, + "balance_loss_clip": 1.0603174, + "balance_loss_mlp": 1.02164841, + "epoch": 0.3604388997444762, + "flos": 15523393564800.0, + "grad_norm": 1.8546425062309773, + "language_loss": 0.68915808, + "learning_rate": 2.9598261447221024e-06, + "loss": 0.71099257, + "num_input_tokens_seen": 128770865, + "router_z_loss_clip": 0.86914062, + "router_z_loss_mlp": 0.14581299, + "step": 5995, + "time_per_iteration": 3.8922929763793945 + }, + { + "auxiliary_loss_clip": 0.0114007, + "auxiliary_loss_mlp": 0.01047127, + "balance_loss_clip": 1.05170012, + "balance_loss_mlp": 1.03071189, + "epoch": 0.36049902299714415, + "flos": 17310883000320.0, + "grad_norm": 1.9461589199367637, + "language_loss": 0.82498831, + "learning_rate": 2.9594844455438057e-06, + "loss": 0.84686029, + "num_input_tokens_seen": 128789730, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.1640625, + "step": 5996, + "time_per_iteration": 2.4231550693511963 + }, + { + "auxiliary_loss_clip": 0.01141779, + "auxiliary_loss_mlp": 0.01036815, + "balance_loss_clip": 1.05450487, + "balance_loss_mlp": 1.02153862, + "epoch": 0.3605591462498121, + "flos": 17056025026560.0, + "grad_norm": 1.7766929679242494, + "language_loss": 0.73474181, + "learning_rate": 2.959142709981763e-06, + "loss": 0.75652772, + "num_input_tokens_seen": 128806610, + "router_z_loss_clip": 0.87304688, + "router_z_loss_mlp": 0.152771, + "step": 5997, + "time_per_iteration": 2.3715176582336426 + }, + { + "auxiliary_loss_clip": 0.01151906, + "auxiliary_loss_mlp": 0.0103623, + "balance_loss_clip": 1.06751776, + "balance_loss_mlp": 1.02175856, + "epoch": 0.3606192695024801, + "flos": 16836862193280.0, + "grad_norm": 2.2518463407455123, + "language_loss": 0.68762684, + "learning_rate": 2.9588009380489337e-06, + "loss": 0.70950818, + "num_input_tokens_seen": 128824830, + "router_z_loss_clip": 0.84277344, + "router_z_loss_mlp": 0.14477539, + "step": 5998, + "time_per_iteration": 2.423496723175049 + }, + { + "auxiliary_loss_clip": 0.01143648, + "auxiliary_loss_mlp": 0.0103207, + "balance_loss_clip": 1.05935252, + "balance_loss_mlp": 1.01786578, + "epoch": 0.36067939275514804, + "flos": 12129655743360.0, + "grad_norm": 2.670756101804645, + "language_loss": 0.7677027, + "learning_rate": 2.9584591297582758e-06, + "loss": 0.78945994, + "num_input_tokens_seen": 128838170, + "router_z_loss_clip": 0.84326172, + "router_z_loss_mlp": 0.14190674, + "step": 5999, + "time_per_iteration": 2.3741586208343506 + }, + { + "auxiliary_loss_clip": 0.01143675, + "auxiliary_loss_mlp": 0.01044783, + "balance_loss_clip": 1.05922461, + "balance_loss_mlp": 1.0304358, + "epoch": 0.360739516007816, + "flos": 18041449720320.0, + "grad_norm": 1.7727357476716836, + "language_loss": 0.78012413, + "learning_rate": 2.9581172851227516e-06, + "loss": 0.80200869, + "num_input_tokens_seen": 128855625, + "router_z_loss_clip": 0.84423828, + "router_z_loss_mlp": 0.14349365, + "step": 6000, + "time_per_iteration": 2.4128339290618896 + }, + { + "auxiliary_loss_clip": 0.01143601, + "auxiliary_loss_mlp": 0.01034013, + "balance_loss_clip": 1.05947661, + "balance_loss_mlp": 1.0200237, + "epoch": 0.360799639260484, + "flos": 18549800951040.0, + "grad_norm": 1.69376900870277, + "language_loss": 0.78120804, + "learning_rate": 2.9577754041553243e-06, + "loss": 0.80298412, + "num_input_tokens_seen": 128873540, + "router_z_loss_clip": 0.84130859, + "router_z_loss_mlp": 0.13970947, + "step": 6001, + "time_per_iteration": 2.394282817840576 + }, + { + "auxiliary_loss_clip": 0.01141628, + "auxiliary_loss_mlp": 0.01032845, + "balance_loss_clip": 1.06070185, + "balance_loss_mlp": 1.01939261, + "epoch": 0.36085976251315194, + "flos": 19682028529920.0, + "grad_norm": 1.939031239983963, + "language_loss": 0.8375091, + "learning_rate": 2.9574334868689575e-06, + "loss": 0.85925388, + "num_input_tokens_seen": 128889925, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.13458252, + "step": 6002, + "time_per_iteration": 2.4377553462982178 + }, + { + "auxiliary_loss_clip": 0.01139826, + "auxiliary_loss_mlp": 0.01035915, + "balance_loss_clip": 1.0579679, + "balance_loss_mlp": 1.02070355, + "epoch": 0.3609198857658199, + "flos": 24198943703040.0, + "grad_norm": 2.13420925205481, + "language_loss": 0.90908742, + "learning_rate": 2.9570915332766165e-06, + "loss": 0.93084484, + "num_input_tokens_seen": 128906890, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.15209961, + "step": 6003, + "time_per_iteration": 2.4125401973724365 + }, + { + "auxiliary_loss_clip": 0.01096992, + "auxiliary_loss_mlp": 0.01009709, + "balance_loss_clip": 1.06720138, + "balance_loss_mlp": 1.00650501, + "epoch": 0.3609800090184879, + "flos": 57115995160320.0, + "grad_norm": 0.8848461674815039, + "language_loss": 0.53354043, + "learning_rate": 2.9567495433912693e-06, + "loss": 0.55460745, + "num_input_tokens_seen": 128965940, + "router_z_loss_clip": 0.29785156, + "router_z_loss_mlp": 0.03201294, + "step": 6004, + "time_per_iteration": 3.0037357807159424 + }, + { + "auxiliary_loss_clip": 0.01141196, + "auxiliary_loss_mlp": 0.01037434, + "balance_loss_clip": 1.05523086, + "balance_loss_mlp": 1.02141273, + "epoch": 0.3610401322711559, + "flos": 20811239366400.0, + "grad_norm": 1.7529225138613953, + "language_loss": 0.77814883, + "learning_rate": 2.956407517225883e-06, + "loss": 0.79993522, + "num_input_tokens_seen": 128985835, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.16003418, + "step": 6005, + "time_per_iteration": 2.893916368484497 + }, + { + "auxiliary_loss_clip": 0.01145696, + "auxiliary_loss_mlp": 0.01039527, + "balance_loss_clip": 1.06013274, + "balance_loss_mlp": 1.02532887, + "epoch": 0.36110025552382385, + "flos": 13699167494400.0, + "grad_norm": 1.9615359817236568, + "language_loss": 0.78675032, + "learning_rate": 2.956065454793429e-06, + "loss": 0.80860251, + "num_input_tokens_seen": 129003120, + "router_z_loss_clip": 0.85644531, + "router_z_loss_mlp": 0.14196777, + "step": 6006, + "time_per_iteration": 2.4257969856262207 + }, + { + "auxiliary_loss_clip": 0.01152957, + "auxiliary_loss_mlp": 0.01039109, + "balance_loss_clip": 1.06469011, + "balance_loss_mlp": 1.02246714, + "epoch": 0.3611603787764918, + "flos": 22455014486400.0, + "grad_norm": 1.8139759474711117, + "language_loss": 0.84576213, + "learning_rate": 2.955723356106876e-06, + "loss": 0.86768281, + "num_input_tokens_seen": 129021645, + "router_z_loss_clip": 0.88183594, + "router_z_loss_mlp": 0.16638184, + "step": 6007, + "time_per_iteration": 2.4629569053649902 + }, + { + "auxiliary_loss_clip": 0.0115508, + "auxiliary_loss_mlp": 0.01037887, + "balance_loss_clip": 1.06390476, + "balance_loss_mlp": 1.02165115, + "epoch": 0.3612205020291598, + "flos": 20886651970560.0, + "grad_norm": 2.0527274713499777, + "language_loss": 0.72288966, + "learning_rate": 2.955381221179198e-06, + "loss": 0.7448194, + "num_input_tokens_seen": 129038375, + "router_z_loss_clip": 0.91210938, + "router_z_loss_mlp": 0.16223145, + "step": 6008, + "time_per_iteration": 2.47023606300354 + }, + { + "auxiliary_loss_clip": 0.01140546, + "auxiliary_loss_mlp": 0.01031912, + "balance_loss_clip": 1.0558393, + "balance_loss_mlp": 1.01757109, + "epoch": 0.36128062528182775, + "flos": 15741981780480.0, + "grad_norm": 2.1550416648240485, + "language_loss": 0.82772559, + "learning_rate": 2.955039050023368e-06, + "loss": 0.84945011, + "num_input_tokens_seen": 129056235, + "router_z_loss_clip": 0.84716797, + "router_z_loss_mlp": 0.14324951, + "step": 6009, + "time_per_iteration": 2.4843955039978027 + }, + { + "auxiliary_loss_clip": 0.01146579, + "auxiliary_loss_mlp": 0.01041223, + "balance_loss_clip": 1.05956411, + "balance_loss_mlp": 1.02607143, + "epoch": 0.3613407485344957, + "flos": 16764502245120.0, + "grad_norm": 2.1563256597709453, + "language_loss": 0.76362848, + "learning_rate": 2.954696842652362e-06, + "loss": 0.78550655, + "num_input_tokens_seen": 129072405, + "router_z_loss_clip": 0.87060547, + "router_z_loss_mlp": 0.15161133, + "step": 6010, + "time_per_iteration": 2.448234796524048 + }, + { + "auxiliary_loss_clip": 0.01138464, + "auxiliary_loss_mlp": 0.01036831, + "balance_loss_clip": 1.05396676, + "balance_loss_mlp": 1.0224843, + "epoch": 0.3614008717871637, + "flos": 20371189847040.0, + "grad_norm": 1.61173646625619, + "language_loss": 0.83087927, + "learning_rate": 2.9543545990791554e-06, + "loss": 0.85263228, + "num_input_tokens_seen": 129090225, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.14355469, + "step": 6011, + "time_per_iteration": 2.426934003829956 + }, + { + "auxiliary_loss_clip": 0.01140978, + "auxiliary_loss_mlp": 0.01040421, + "balance_loss_clip": 1.05311513, + "balance_loss_mlp": 1.02507901, + "epoch": 0.36146099503983165, + "flos": 22776665800320.0, + "grad_norm": 1.8932696997277039, + "language_loss": 0.6242218, + "learning_rate": 2.954012319316727e-06, + "loss": 0.64603573, + "num_input_tokens_seen": 129107685, + "router_z_loss_clip": 0.87939453, + "router_z_loss_mlp": 0.15344238, + "step": 6012, + "time_per_iteration": 2.50384521484375 + }, + { + "auxiliary_loss_clip": 0.01139447, + "auxiliary_loss_mlp": 0.01057145, + "balance_loss_clip": 1.05462778, + "balance_loss_mlp": 1.04112303, + "epoch": 0.3615211182924996, + "flos": 22996654646400.0, + "grad_norm": 1.7359519808551813, + "language_loss": 0.83733135, + "learning_rate": 2.9536700033780565e-06, + "loss": 0.85929728, + "num_input_tokens_seen": 129125315, + "router_z_loss_clip": 0.84716797, + "router_z_loss_mlp": 0.16003418, + "step": 6013, + "time_per_iteration": 2.4180166721343994 + }, + { + "auxiliary_loss_clip": 0.01141713, + "auxiliary_loss_mlp": 0.01037639, + "balance_loss_clip": 1.05611968, + "balance_loss_mlp": 1.02214217, + "epoch": 0.3615812415451676, + "flos": 16648079287680.0, + "grad_norm": 1.828519081212836, + "language_loss": 0.9168579, + "learning_rate": 2.9533276512761228e-06, + "loss": 0.93865144, + "num_input_tokens_seen": 129141600, + "router_z_loss_clip": 0.85644531, + "router_z_loss_mlp": 0.1550293, + "step": 6014, + "time_per_iteration": 2.459699869155884 + }, + { + "auxiliary_loss_clip": 0.01155916, + "auxiliary_loss_mlp": 0.0103908, + "balance_loss_clip": 1.06795287, + "balance_loss_mlp": 1.02382135, + "epoch": 0.36164136479783554, + "flos": 21320093387520.0, + "grad_norm": 1.9269460601978516, + "language_loss": 0.73098528, + "learning_rate": 2.95298526302391e-06, + "loss": 0.75293529, + "num_input_tokens_seen": 129160665, + "router_z_loss_clip": 0.87841797, + "router_z_loss_mlp": 0.15270996, + "step": 6015, + "time_per_iteration": 2.437039375305176 + }, + { + "auxiliary_loss_clip": 0.01147452, + "auxiliary_loss_mlp": 0.01037946, + "balance_loss_clip": 1.05997872, + "balance_loss_mlp": 1.02237749, + "epoch": 0.3617014880505035, + "flos": 24169569356160.0, + "grad_norm": 1.9263007573765853, + "language_loss": 0.6549803, + "learning_rate": 2.9526428386344e-06, + "loss": 0.67683434, + "num_input_tokens_seen": 129179220, + "router_z_loss_clip": 0.87451172, + "router_z_loss_mlp": 0.15563965, + "step": 6016, + "time_per_iteration": 2.483914613723755 + }, + { + "auxiliary_loss_clip": 0.01152953, + "auxiliary_loss_mlp": 0.01040692, + "balance_loss_clip": 1.06611872, + "balance_loss_mlp": 1.02373481, + "epoch": 0.3617616113031715, + "flos": 39014824101120.0, + "grad_norm": 2.4118581129632095, + "language_loss": 0.7165091, + "learning_rate": 2.9523003781205785e-06, + "loss": 0.73844558, + "num_input_tokens_seen": 129200385, + "router_z_loss_clip": 0.86865234, + "router_z_loss_mlp": 0.16949463, + "step": 6017, + "time_per_iteration": 4.010053873062134 + }, + { + "auxiliary_loss_clip": 0.01146114, + "auxiliary_loss_mlp": 0.01038209, + "balance_loss_clip": 1.05719066, + "balance_loss_mlp": 1.02324247, + "epoch": 0.3618217345558395, + "flos": 12130840892160.0, + "grad_norm": 1.9420609528508554, + "language_loss": 0.73511755, + "learning_rate": 2.9519578814954307e-06, + "loss": 0.75696075, + "num_input_tokens_seen": 129217395, + "router_z_loss_clip": 0.89013672, + "router_z_loss_mlp": 0.14953613, + "step": 6018, + "time_per_iteration": 2.77236008644104 + }, + { + "auxiliary_loss_clip": 0.01137397, + "auxiliary_loss_mlp": 0.01037145, + "balance_loss_clip": 1.05633354, + "balance_loss_mlp": 1.02207661, + "epoch": 0.36188185780850746, + "flos": 24935005203840.0, + "grad_norm": 1.5850875224332215, + "language_loss": 0.69262266, + "learning_rate": 2.9516153487719448e-06, + "loss": 0.7143681, + "num_input_tokens_seen": 129238940, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.15075684, + "step": 6019, + "time_per_iteration": 2.5811405181884766 + }, + { + "auxiliary_loss_clip": 0.01140531, + "auxiliary_loss_mlp": 0.01046493, + "balance_loss_clip": 1.05285645, + "balance_loss_mlp": 1.02888608, + "epoch": 0.3619419810611754, + "flos": 20958832350720.0, + "grad_norm": 1.579742622196752, + "language_loss": 0.76612902, + "learning_rate": 2.95127277996311e-06, + "loss": 0.78799927, + "num_input_tokens_seen": 129258240, + "router_z_loss_clip": 0.87695312, + "router_z_loss_mlp": 0.17602539, + "step": 6020, + "time_per_iteration": 2.436840534210205 + }, + { + "auxiliary_loss_clip": 0.01151877, + "auxiliary_loss_mlp": 0.01044942, + "balance_loss_clip": 1.06466711, + "balance_loss_mlp": 1.02809739, + "epoch": 0.3620021043138434, + "flos": 22528882805760.0, + "grad_norm": 2.8789606325645125, + "language_loss": 0.73975945, + "learning_rate": 2.9509301750819156e-06, + "loss": 0.76172757, + "num_input_tokens_seen": 129279040, + "router_z_loss_clip": 0.87304688, + "router_z_loss_mlp": 0.16833496, + "step": 6021, + "time_per_iteration": 2.4791109561920166 + }, + { + "auxiliary_loss_clip": 0.01146156, + "auxiliary_loss_mlp": 0.01033747, + "balance_loss_clip": 1.06021285, + "balance_loss_mlp": 1.01963234, + "epoch": 0.36206222756651135, + "flos": 15596687266560.0, + "grad_norm": 1.9830972250150978, + "language_loss": 0.80945212, + "learning_rate": 2.9505875341413533e-06, + "loss": 0.83125114, + "num_input_tokens_seen": 129295415, + "router_z_loss_clip": 0.85888672, + "router_z_loss_mlp": 0.14117432, + "step": 6022, + "time_per_iteration": 2.413814067840576 + }, + { + "auxiliary_loss_clip": 0.01137574, + "auxiliary_loss_mlp": 0.01036619, + "balance_loss_clip": 1.05659306, + "balance_loss_mlp": 1.02314246, + "epoch": 0.3621223508191793, + "flos": 23587170238080.0, + "grad_norm": 1.894961309339498, + "language_loss": 0.81631213, + "learning_rate": 2.950244857154417e-06, + "loss": 0.83805412, + "num_input_tokens_seen": 129312620, + "router_z_loss_clip": 0.80957031, + "router_z_loss_mlp": 0.13482666, + "step": 6023, + "time_per_iteration": 2.498884439468384 + }, + { + "auxiliary_loss_clip": 0.01138524, + "auxiliary_loss_mlp": 0.010363, + "balance_loss_clip": 1.0519799, + "balance_loss_mlp": 1.02111316, + "epoch": 0.3621824740718473, + "flos": 22309899540480.0, + "grad_norm": 1.7208224365140266, + "language_loss": 0.79868817, + "learning_rate": 2.9499021441341e-06, + "loss": 0.82043636, + "num_input_tokens_seen": 129331825, + "router_z_loss_clip": 0.86621094, + "router_z_loss_mlp": 0.15185547, + "step": 6024, + "time_per_iteration": 2.4772167205810547 + }, + { + "auxiliary_loss_clip": 0.01138691, + "auxiliary_loss_mlp": 0.01030445, + "balance_loss_clip": 1.05656302, + "balance_loss_mlp": 1.01635396, + "epoch": 0.36224259732451525, + "flos": 16763640318720.0, + "grad_norm": 1.8368896432396438, + "language_loss": 0.74404526, + "learning_rate": 2.9495593950933997e-06, + "loss": 0.76573658, + "num_input_tokens_seen": 129350400, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.14099121, + "step": 6025, + "time_per_iteration": 2.4451119899749756 + }, + { + "auxiliary_loss_clip": 0.01145639, + "auxiliary_loss_mlp": 0.01034191, + "balance_loss_clip": 1.06190252, + "balance_loss_mlp": 1.01987433, + "epoch": 0.3623027205771832, + "flos": 23149742411520.0, + "grad_norm": 1.699323972895484, + "language_loss": 0.73165715, + "learning_rate": 2.9492166100453107e-06, + "loss": 0.75345546, + "num_input_tokens_seen": 129371155, + "router_z_loss_clip": 0.83691406, + "router_z_loss_mlp": 0.14331055, + "step": 6026, + "time_per_iteration": 3.8708879947662354 + }, + { + "auxiliary_loss_clip": 0.01142635, + "auxiliary_loss_mlp": 0.01044546, + "balance_loss_clip": 1.05390966, + "balance_loss_mlp": 1.02841747, + "epoch": 0.3623628438298512, + "flos": 28549162834560.0, + "grad_norm": 2.076884154263978, + "language_loss": 0.79006386, + "learning_rate": 2.948873789002833e-06, + "loss": 0.81193572, + "num_input_tokens_seen": 129391230, + "router_z_loss_clip": 0.88818359, + "router_z_loss_mlp": 0.16125488, + "step": 6027, + "time_per_iteration": 2.5473313331604004 + }, + { + "auxiliary_loss_clip": 0.01144606, + "auxiliary_loss_mlp": 0.01054787, + "balance_loss_clip": 1.05691934, + "balance_loss_mlp": 1.03602362, + "epoch": 0.36242296708251914, + "flos": 25484941405440.0, + "grad_norm": 1.7756281021004519, + "language_loss": 0.6749922, + "learning_rate": 2.9485309319789667e-06, + "loss": 0.6969862, + "num_input_tokens_seen": 129410065, + "router_z_loss_clip": 0.87646484, + "router_z_loss_mlp": 0.18762207, + "step": 6028, + "time_per_iteration": 2.4689698219299316 + }, + { + "auxiliary_loss_clip": 0.01156139, + "auxiliary_loss_mlp": 0.01041938, + "balance_loss_clip": 1.06728864, + "balance_loss_mlp": 1.02750182, + "epoch": 0.3624830903351871, + "flos": 16290373697280.0, + "grad_norm": 1.8919074689332351, + "language_loss": 0.85551238, + "learning_rate": 2.9481880389867117e-06, + "loss": 0.87749314, + "num_input_tokens_seen": 129428655, + "router_z_loss_clip": 0.88818359, + "router_z_loss_mlp": 0.14428711, + "step": 6029, + "time_per_iteration": 2.428347587585449 + }, + { + "auxiliary_loss_clip": 0.01150372, + "auxiliary_loss_mlp": 0.01036445, + "balance_loss_clip": 1.06365538, + "balance_loss_mlp": 1.02133548, + "epoch": 0.36254321358785513, + "flos": 18296307694080.0, + "grad_norm": 1.839794488451332, + "language_loss": 0.72953528, + "learning_rate": 2.9478451100390714e-06, + "loss": 0.75140345, + "num_input_tokens_seen": 129447845, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.15106201, + "step": 6030, + "time_per_iteration": 2.4218618869781494 + }, + { + "auxiliary_loss_clip": 0.01148472, + "auxiliary_loss_mlp": 0.01042528, + "balance_loss_clip": 1.06028438, + "balance_loss_mlp": 1.0244441, + "epoch": 0.3626033368405231, + "flos": 14865294533760.0, + "grad_norm": 3.000750815832992, + "language_loss": 0.74542665, + "learning_rate": 2.94750214514905e-06, + "loss": 0.76733661, + "num_input_tokens_seen": 129463275, + "router_z_loss_clip": 0.88232422, + "router_z_loss_mlp": 0.18078613, + "step": 6031, + "time_per_iteration": 2.5152411460876465 + }, + { + "auxiliary_loss_clip": 0.01146237, + "auxiliary_loss_mlp": 0.01031659, + "balance_loss_clip": 1.06166148, + "balance_loss_mlp": 1.01670969, + "epoch": 0.36266346009319106, + "flos": 22306595489280.0, + "grad_norm": 1.7678006889386375, + "language_loss": 0.73257935, + "learning_rate": 2.9471591443296516e-06, + "loss": 0.75435835, + "num_input_tokens_seen": 129483205, + "router_z_loss_clip": 0.84667969, + "router_z_loss_mlp": 0.1494751, + "step": 6032, + "time_per_iteration": 2.7906177043914795 + }, + { + "auxiliary_loss_clip": 0.01152698, + "auxiliary_loss_mlp": 0.01041118, + "balance_loss_clip": 1.06569147, + "balance_loss_mlp": 1.02600217, + "epoch": 0.362723583345859, + "flos": 18222331633920.0, + "grad_norm": 2.1934671814553504, + "language_loss": 0.78065169, + "learning_rate": 2.946816107593884e-06, + "loss": 0.80258977, + "num_input_tokens_seen": 129499885, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.15112305, + "step": 6033, + "time_per_iteration": 3.875554323196411 + }, + { + "auxiliary_loss_clip": 0.01090405, + "auxiliary_loss_mlp": 0.01013387, + "balance_loss_clip": 1.06131434, + "balance_loss_mlp": 1.0115962, + "epoch": 0.362783706598527, + "flos": 68499174458880.0, + "grad_norm": 0.7838630899216866, + "language_loss": 0.64721847, + "learning_rate": 2.9464730349547547e-06, + "loss": 0.6682564, + "num_input_tokens_seen": 129561885, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 0.01791382, + "step": 6034, + "time_per_iteration": 3.149599313735962 + }, + { + "auxiliary_loss_clip": 0.01133808, + "auxiliary_loss_mlp": 0.01034867, + "balance_loss_clip": 1.05061245, + "balance_loss_mlp": 1.01942945, + "epoch": 0.36284382985119495, + "flos": 26576589594240.0, + "grad_norm": 1.8602764449196236, + "language_loss": 0.89749002, + "learning_rate": 2.946129926425273e-06, + "loss": 0.91917676, + "num_input_tokens_seen": 129582325, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.15454102, + "step": 6035, + "time_per_iteration": 2.5262019634246826 + }, + { + "auxiliary_loss_clip": 0.01138174, + "auxiliary_loss_mlp": 0.01039711, + "balance_loss_clip": 1.05103767, + "balance_loss_mlp": 1.02373672, + "epoch": 0.3629039531038629, + "flos": 20156767608960.0, + "grad_norm": 2.1066311373981215, + "language_loss": 0.7351172, + "learning_rate": 2.9457867820184496e-06, + "loss": 0.75689608, + "num_input_tokens_seen": 129600350, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.15979004, + "step": 6036, + "time_per_iteration": 2.4677820205688477 + }, + { + "auxiliary_loss_clip": 0.01142254, + "auxiliary_loss_mlp": 0.01035337, + "balance_loss_clip": 1.05355239, + "balance_loss_mlp": 1.01813507, + "epoch": 0.3629640763565309, + "flos": 18625716345600.0, + "grad_norm": 1.9406369858582575, + "language_loss": 0.76113534, + "learning_rate": 2.945443601747297e-06, + "loss": 0.78291124, + "num_input_tokens_seen": 129618425, + "router_z_loss_clip": 0.88574219, + "router_z_loss_mlp": 0.17211914, + "step": 6037, + "time_per_iteration": 2.3763530254364014 + }, + { + "auxiliary_loss_clip": 0.01143076, + "auxiliary_loss_mlp": 0.01046312, + "balance_loss_clip": 1.05960727, + "balance_loss_mlp": 1.03055298, + "epoch": 0.36302419960919885, + "flos": 19571459489280.0, + "grad_norm": 1.6649492458563708, + "language_loss": 0.78658426, + "learning_rate": 2.945100385624828e-06, + "loss": 0.80847812, + "num_input_tokens_seen": 129636750, + "router_z_loss_clip": 0.83496094, + "router_z_loss_mlp": 0.15765381, + "step": 6038, + "time_per_iteration": 3.821969509124756 + }, + { + "auxiliary_loss_clip": 0.01089091, + "auxiliary_loss_mlp": 0.01003999, + "balance_loss_clip": 1.05968404, + "balance_loss_mlp": 1.00221646, + "epoch": 0.3630843228618668, + "flos": 63797606444160.0, + "grad_norm": 0.8276193661226082, + "language_loss": 0.63397765, + "learning_rate": 2.9447571336640573e-06, + "loss": 0.65490854, + "num_input_tokens_seen": 129699030, + "router_z_loss_clip": 0.29394531, + "router_z_loss_mlp": 0.01782227, + "step": 6039, + "time_per_iteration": 3.129504442214966 + }, + { + "auxiliary_loss_clip": 0.01141026, + "auxiliary_loss_mlp": 0.01042689, + "balance_loss_clip": 1.05731869, + "balance_loss_mlp": 1.02818131, + "epoch": 0.3631444461145348, + "flos": 21835160461440.0, + "grad_norm": 2.2120432604389806, + "language_loss": 0.71301067, + "learning_rate": 2.944413845878002e-06, + "loss": 0.7348479, + "num_input_tokens_seen": 129717135, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.14508057, + "step": 6040, + "time_per_iteration": 2.4216132164001465 + }, + { + "auxiliary_loss_clip": 0.01150906, + "auxiliary_loss_mlp": 0.01037965, + "balance_loss_clip": 1.06253958, + "balance_loss_mlp": 1.02268267, + "epoch": 0.36320456936720275, + "flos": 21722041555200.0, + "grad_norm": 2.684731959620188, + "language_loss": 0.81410819, + "learning_rate": 2.9440705222796783e-06, + "loss": 0.83599693, + "num_input_tokens_seen": 129735940, + "router_z_loss_clip": 0.88427734, + "router_z_loss_mlp": 0.1529541, + "step": 6041, + "time_per_iteration": 2.4663619995117188 + }, + { + "auxiliary_loss_clip": 0.01153737, + "auxiliary_loss_mlp": 0.01034683, + "balance_loss_clip": 1.06551886, + "balance_loss_mlp": 1.01868522, + "epoch": 0.3632646926198707, + "flos": 17019072910080.0, + "grad_norm": 2.019803741526476, + "language_loss": 0.84095883, + "learning_rate": 2.943727162882107e-06, + "loss": 0.86284304, + "num_input_tokens_seen": 129752790, + "router_z_loss_clip": 0.88330078, + "router_z_loss_mlp": 0.15991211, + "step": 6042, + "time_per_iteration": 2.455211877822876 + }, + { + "auxiliary_loss_clip": 0.01145558, + "auxiliary_loss_mlp": 0.0104843, + "balance_loss_clip": 1.06010592, + "balance_loss_mlp": 1.03349948, + "epoch": 0.36332481587253873, + "flos": 23331163029120.0, + "grad_norm": 1.6572572563701347, + "language_loss": 0.78267241, + "learning_rate": 2.9433837676983064e-06, + "loss": 0.80461228, + "num_input_tokens_seen": 129773655, + "router_z_loss_clip": 0.85449219, + "router_z_loss_mlp": 0.14923096, + "step": 6043, + "time_per_iteration": 2.4483108520507812 + }, + { + "auxiliary_loss_clip": 0.01142277, + "auxiliary_loss_mlp": 0.01035981, + "balance_loss_clip": 1.05977929, + "balance_loss_mlp": 1.02022147, + "epoch": 0.3633849391252067, + "flos": 10743539857920.0, + "grad_norm": 1.9342665892758184, + "language_loss": 0.65758002, + "learning_rate": 2.943040336741298e-06, + "loss": 0.67936265, + "num_input_tokens_seen": 129791605, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.1574707, + "step": 6044, + "time_per_iteration": 2.826594591140747 + }, + { + "auxiliary_loss_clip": 0.01132598, + "auxiliary_loss_mlp": 0.01036099, + "balance_loss_clip": 1.05016041, + "balance_loss_mlp": 1.02137637, + "epoch": 0.36344506237787466, + "flos": 25849147357440.0, + "grad_norm": 1.716127844534665, + "language_loss": 0.81144905, + "learning_rate": 2.9426968700241066e-06, + "loss": 0.83313602, + "num_input_tokens_seen": 129811075, + "router_z_loss_clip": 0.82470703, + "router_z_loss_mlp": 0.14733887, + "step": 6045, + "time_per_iteration": 2.5287575721740723 + }, + { + "auxiliary_loss_clip": 0.01144639, + "auxiliary_loss_mlp": 0.01047074, + "balance_loss_clip": 1.05908251, + "balance_loss_mlp": 1.03208327, + "epoch": 0.3635051856305426, + "flos": 30154046503680.0, + "grad_norm": 2.206809128962494, + "language_loss": 0.64909279, + "learning_rate": 2.942353367559755e-06, + "loss": 0.6710099, + "num_input_tokens_seen": 129833755, + "router_z_loss_clip": 0.85546875, + "router_z_loss_mlp": 0.14984131, + "step": 6046, + "time_per_iteration": 2.5008485317230225 + }, + { + "auxiliary_loss_clip": 0.01139427, + "auxiliary_loss_mlp": 0.01045322, + "balance_loss_clip": 1.05652249, + "balance_loss_mlp": 1.03012264, + "epoch": 0.3635653088832106, + "flos": 22198396746240.0, + "grad_norm": 1.5031698038963504, + "language_loss": 0.77453279, + "learning_rate": 2.9420098293612692e-06, + "loss": 0.79638034, + "num_input_tokens_seen": 129854475, + "router_z_loss_clip": 0.82861328, + "router_z_loss_mlp": 0.15197754, + "step": 6047, + "time_per_iteration": 2.5002074241638184 + }, + { + "auxiliary_loss_clip": 0.0115168, + "auxiliary_loss_mlp": 0.01046351, + "balance_loss_clip": 1.06181848, + "balance_loss_mlp": 1.02997208, + "epoch": 0.36362543213587856, + "flos": 24787053083520.0, + "grad_norm": 1.8287515580817142, + "language_loss": 0.79713535, + "learning_rate": 2.9416662554416767e-06, + "loss": 0.8191157, + "num_input_tokens_seen": 129873530, + "router_z_loss_clip": 0.89892578, + "router_z_loss_mlp": 0.16381836, + "step": 6048, + "time_per_iteration": 2.485734224319458 + }, + { + "auxiliary_loss_clip": 0.01080127, + "auxiliary_loss_mlp": 0.01006312, + "balance_loss_clip": 1.05132294, + "balance_loss_mlp": 1.00479627, + "epoch": 0.3636855553885465, + "flos": 62526369231360.0, + "grad_norm": 0.756363361766298, + "language_loss": 0.52542305, + "learning_rate": 2.9413226458140054e-06, + "loss": 0.54628742, + "num_input_tokens_seen": 129940400, + "router_z_loss_clip": 0.28759766, + "router_z_loss_mlp": 0.01515198, + "step": 6049, + "time_per_iteration": 3.204970121383667 + }, + { + "auxiliary_loss_clip": 0.01144682, + "auxiliary_loss_mlp": 0.01042705, + "balance_loss_clip": 1.05680573, + "balance_loss_mlp": 1.02663589, + "epoch": 0.3637456786412145, + "flos": 24060652341120.0, + "grad_norm": 2.0440739762538227, + "language_loss": 0.86114824, + "learning_rate": 2.9409790004912845e-06, + "loss": 0.88302207, + "num_input_tokens_seen": 129958635, + "router_z_loss_clip": 0.87939453, + "router_z_loss_mlp": 0.1607666, + "step": 6050, + "time_per_iteration": 2.482787609100342 + }, + { + "auxiliary_loss_clip": 0.01147191, + "auxiliary_loss_mlp": 0.01038236, + "balance_loss_clip": 1.06471872, + "balance_loss_mlp": 1.02412176, + "epoch": 0.36380580189388245, + "flos": 16691495852160.0, + "grad_norm": 1.8327730431608293, + "language_loss": 0.78222752, + "learning_rate": 2.940635319486546e-06, + "loss": 0.8040818, + "num_input_tokens_seen": 129977685, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.14111328, + "step": 6051, + "time_per_iteration": 2.4983949661254883 + }, + { + "auxiliary_loss_clip": 0.01141972, + "auxiliary_loss_mlp": 0.01032699, + "balance_loss_clip": 1.05672169, + "balance_loss_mlp": 1.0182687, + "epoch": 0.3638659251465504, + "flos": 25114091437440.0, + "grad_norm": 1.8891051060340522, + "language_loss": 0.82568765, + "learning_rate": 2.940291602812822e-06, + "loss": 0.84743434, + "num_input_tokens_seen": 129997530, + "router_z_loss_clip": 0.85253906, + "router_z_loss_mlp": 0.14428711, + "step": 6052, + "time_per_iteration": 2.4784209728240967 + }, + { + "auxiliary_loss_clip": 0.01146025, + "auxiliary_loss_mlp": 0.0103514, + "balance_loss_clip": 1.06351459, + "balance_loss_mlp": 1.02152634, + "epoch": 0.3639260483992184, + "flos": 23003011353600.0, + "grad_norm": 1.5140449272635739, + "language_loss": 0.72475111, + "learning_rate": 2.939947850483145e-06, + "loss": 0.74656278, + "num_input_tokens_seen": 130017955, + "router_z_loss_clip": 0.82568359, + "router_z_loss_mlp": 0.1362915, + "step": 6053, + "time_per_iteration": 2.54194712638855 + }, + { + "auxiliary_loss_clip": 0.0108607, + "auxiliary_loss_mlp": 0.01004899, + "balance_loss_clip": 1.05755901, + "balance_loss_mlp": 1.0033313, + "epoch": 0.36398617165188635, + "flos": 70716011160960.0, + "grad_norm": 0.7857505553589816, + "language_loss": 0.61221182, + "learning_rate": 2.9396040625105532e-06, + "loss": 0.63312155, + "num_input_tokens_seen": 130074275, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.01564026, + "step": 6054, + "time_per_iteration": 3.0723729133605957 + }, + { + "auxiliary_loss_clip": 0.01148008, + "auxiliary_loss_mlp": 0.01037756, + "balance_loss_clip": 1.0618422, + "balance_loss_mlp": 1.02230608, + "epoch": 0.3640462949045543, + "flos": 22235456603520.0, + "grad_norm": 1.8038522866254512, + "language_loss": 0.75583088, + "learning_rate": 2.9392602389080802e-06, + "loss": 0.7776885, + "num_input_tokens_seen": 130091375, + "router_z_loss_clip": 0.86132812, + "router_z_loss_mlp": 0.15460205, + "step": 6055, + "time_per_iteration": 2.4764809608459473 + }, + { + "auxiliary_loss_clip": 0.01148745, + "auxiliary_loss_mlp": 0.01038356, + "balance_loss_clip": 1.0618, + "balance_loss_mlp": 1.02326381, + "epoch": 0.3641064181572223, + "flos": 21543529939200.0, + "grad_norm": 1.57781682915308, + "language_loss": 0.75172323, + "learning_rate": 2.938916379688765e-06, + "loss": 0.77359426, + "num_input_tokens_seen": 130111595, + "router_z_loss_clip": 0.86914062, + "router_z_loss_mlp": 0.15112305, + "step": 6056, + "time_per_iteration": 2.472020387649536 + }, + { + "auxiliary_loss_clip": 0.01140812, + "auxiliary_loss_mlp": 0.0103719, + "balance_loss_clip": 1.05694985, + "balance_loss_mlp": 1.02289653, + "epoch": 0.3641665414098903, + "flos": 22273306560000.0, + "grad_norm": 2.348054257076952, + "language_loss": 0.80381918, + "learning_rate": 2.9385724848656468e-06, + "loss": 0.82559925, + "num_input_tokens_seen": 130131440, + "router_z_loss_clip": 0.83935547, + "router_z_loss_mlp": 0.14294434, + "step": 6057, + "time_per_iteration": 2.80053973197937 + }, + { + "auxiliary_loss_clip": 0.01146391, + "auxiliary_loss_mlp": 0.01037644, + "balance_loss_clip": 1.06274414, + "balance_loss_mlp": 1.02279031, + "epoch": 0.36422666466255826, + "flos": 28329676778880.0, + "grad_norm": 2.0167529019557526, + "language_loss": 0.80412662, + "learning_rate": 2.9382285544517647e-06, + "loss": 0.82596695, + "num_input_tokens_seen": 130151375, + "router_z_loss_clip": 0.83691406, + "router_z_loss_mlp": 0.1484375, + "step": 6058, + "time_per_iteration": 2.6164426803588867 + }, + { + "auxiliary_loss_clip": 0.01141626, + "auxiliary_loss_mlp": 0.01034871, + "balance_loss_clip": 1.05831349, + "balance_loss_mlp": 1.02011299, + "epoch": 0.36428678791522623, + "flos": 24170503109760.0, + "grad_norm": 2.3275683754404257, + "language_loss": 0.8489421, + "learning_rate": 2.9378845884601636e-06, + "loss": 0.87070704, + "num_input_tokens_seen": 130169960, + "router_z_loss_clip": 0.83349609, + "router_z_loss_mlp": 0.14758301, + "step": 6059, + "time_per_iteration": 2.4444963932037354 + }, + { + "auxiliary_loss_clip": 0.01137131, + "auxiliary_loss_mlp": 0.01038805, + "balance_loss_clip": 1.05321693, + "balance_loss_mlp": 1.02356994, + "epoch": 0.3643469111678942, + "flos": 22528451842560.0, + "grad_norm": 1.581151106913848, + "language_loss": 0.88138664, + "learning_rate": 2.937540586903884e-06, + "loss": 0.90314591, + "num_input_tokens_seen": 130189800, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.15246582, + "step": 6060, + "time_per_iteration": 3.920064687728882 + }, + { + "auxiliary_loss_clip": 0.01137343, + "auxiliary_loss_mlp": 0.01050561, + "balance_loss_clip": 1.05247903, + "balance_loss_mlp": 1.03346634, + "epoch": 0.36440703442056216, + "flos": 19426595938560.0, + "grad_norm": 1.8963761203991607, + "language_loss": 0.68004531, + "learning_rate": 2.937196549795971e-06, + "loss": 0.70192432, + "num_input_tokens_seen": 130206370, + "router_z_loss_clip": 0.84765625, + "router_z_loss_mlp": 0.17089844, + "step": 6061, + "time_per_iteration": 2.4310433864593506 + }, + { + "auxiliary_loss_clip": 0.01138531, + "auxiliary_loss_mlp": 0.01038648, + "balance_loss_clip": 1.05318308, + "balance_loss_mlp": 1.02180421, + "epoch": 0.3644671576732301, + "flos": 18040515966720.0, + "grad_norm": 63.84859938277605, + "language_loss": 0.75297642, + "learning_rate": 2.9368524771494718e-06, + "loss": 0.77474821, + "num_input_tokens_seen": 130224445, + "router_z_loss_clip": 0.85302734, + "router_z_loss_mlp": 0.16833496, + "step": 6062, + "time_per_iteration": 2.4815964698791504 + }, + { + "auxiliary_loss_clip": 0.01143172, + "auxiliary_loss_mlp": 0.01038825, + "balance_loss_clip": 1.05863714, + "balance_loss_mlp": 1.02164078, + "epoch": 0.3645272809258981, + "flos": 21542811667200.0, + "grad_norm": 1.6167701389070643, + "language_loss": 0.72287393, + "learning_rate": 2.936508368977432e-06, + "loss": 0.74469388, + "num_input_tokens_seen": 130245380, + "router_z_loss_clip": 0.84667969, + "router_z_loss_mlp": 0.17181396, + "step": 6063, + "time_per_iteration": 2.5126726627349854 + }, + { + "auxiliary_loss_clip": 0.01142756, + "auxiliary_loss_mlp": 0.01054495, + "balance_loss_clip": 1.05637968, + "balance_loss_mlp": 1.03834224, + "epoch": 0.36458740417856605, + "flos": 22746860490240.0, + "grad_norm": 2.8088694170793413, + "language_loss": 0.67901611, + "learning_rate": 2.936164225292901e-06, + "loss": 0.70098859, + "num_input_tokens_seen": 130265575, + "router_z_loss_clip": 0.86376953, + "router_z_loss_mlp": 0.16149902, + "step": 6064, + "time_per_iteration": 2.4598090648651123 + }, + { + "auxiliary_loss_clip": 0.01149261, + "auxiliary_loss_mlp": 0.01051535, + "balance_loss_clip": 1.06115341, + "balance_loss_mlp": 1.03652012, + "epoch": 0.364647527431234, + "flos": 26140670138880.0, + "grad_norm": 2.009619539358502, + "language_loss": 0.74861276, + "learning_rate": 2.9358200461089297e-06, + "loss": 0.7706207, + "num_input_tokens_seen": 130286195, + "router_z_loss_clip": 0.88037109, + "router_z_loss_mlp": 0.15020752, + "step": 6065, + "time_per_iteration": 2.559462547302246 + }, + { + "auxiliary_loss_clip": 0.01150565, + "auxiliary_loss_mlp": 0.01041946, + "balance_loss_clip": 1.06247151, + "balance_loss_mlp": 1.02556682, + "epoch": 0.364707650683902, + "flos": 31029907737600.0, + "grad_norm": 1.9469759888447942, + "language_loss": 0.75414532, + "learning_rate": 2.9354758314385676e-06, + "loss": 0.77607048, + "num_input_tokens_seen": 130306095, + "router_z_loss_clip": 0.88085938, + "router_z_loss_mlp": 0.16369629, + "step": 6066, + "time_per_iteration": 2.504987955093384 + }, + { + "auxiliary_loss_clip": 0.01135736, + "auxiliary_loss_mlp": 0.01041779, + "balance_loss_clip": 1.0526638, + "balance_loss_mlp": 1.02760458, + "epoch": 0.36476777393656995, + "flos": 19572896033280.0, + "grad_norm": 2.177217081233208, + "language_loss": 0.77144819, + "learning_rate": 2.9351315812948684e-06, + "loss": 0.79322338, + "num_input_tokens_seen": 130324685, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.1418457, + "step": 6067, + "time_per_iteration": 2.4521536827087402 + }, + { + "auxiliary_loss_clip": 0.01144297, + "auxiliary_loss_mlp": 0.01043727, + "balance_loss_clip": 1.06412423, + "balance_loss_mlp": 1.03008318, + "epoch": 0.3648278971892379, + "flos": 17748849530880.0, + "grad_norm": 1.8178177628936754, + "language_loss": 0.70924491, + "learning_rate": 2.934787295690886e-06, + "loss": 0.73112512, + "num_input_tokens_seen": 130343855, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.13635254, + "step": 6068, + "time_per_iteration": 2.4997987747192383 + }, + { + "auxiliary_loss_clip": 0.01150507, + "auxiliary_loss_mlp": 0.01039436, + "balance_loss_clip": 1.06512642, + "balance_loss_mlp": 1.02434969, + "epoch": 0.3648880204419059, + "flos": 17931167988480.0, + "grad_norm": 1.770726591295836, + "language_loss": 0.73861563, + "learning_rate": 2.9344429746396755e-06, + "loss": 0.76051509, + "num_input_tokens_seen": 130362320, + "router_z_loss_clip": 0.85400391, + "router_z_loss_mlp": 0.15087891, + "step": 6069, + "time_per_iteration": 3.8928167819976807 + }, + { + "auxiliary_loss_clip": 0.01143987, + "auxiliary_loss_mlp": 0.0103565, + "balance_loss_clip": 1.05779767, + "balance_loss_mlp": 1.02065301, + "epoch": 0.3649481436945739, + "flos": 22638266697600.0, + "grad_norm": 1.8182148043819268, + "language_loss": 0.66163117, + "learning_rate": 2.9340986181542945e-06, + "loss": 0.68342757, + "num_input_tokens_seen": 130383165, + "router_z_loss_clip": 0.86230469, + "router_z_loss_mlp": 0.14990234, + "step": 6070, + "time_per_iteration": 2.867170572280884 + }, + { + "auxiliary_loss_clip": 0.01137523, + "auxiliary_loss_mlp": 0.01032874, + "balance_loss_clip": 1.05618954, + "balance_loss_mlp": 1.01834798, + "epoch": 0.36500826694724187, + "flos": 21579656042880.0, + "grad_norm": 1.5981454259026202, + "language_loss": 0.74436921, + "learning_rate": 2.9337542262477994e-06, + "loss": 0.76607311, + "num_input_tokens_seen": 130402425, + "router_z_loss_clip": 0.81347656, + "router_z_loss_mlp": 0.14520264, + "step": 6071, + "time_per_iteration": 2.549093008041382 + }, + { + "auxiliary_loss_clip": 0.01136724, + "auxiliary_loss_mlp": 0.01034264, + "balance_loss_clip": 1.05427361, + "balance_loss_mlp": 1.01927388, + "epoch": 0.36506839019990983, + "flos": 13772533023360.0, + "grad_norm": 1.9188731413736957, + "language_loss": 0.88724631, + "learning_rate": 2.9334097989332506e-06, + "loss": 0.90895617, + "num_input_tokens_seen": 130419440, + "router_z_loss_clip": 0.82421875, + "router_z_loss_mlp": 0.14990234, + "step": 6072, + "time_per_iteration": 2.4218335151672363 + }, + { + "auxiliary_loss_clip": 0.01144067, + "auxiliary_loss_mlp": 0.01034765, + "balance_loss_clip": 1.05967045, + "balance_loss_mlp": 1.02096677, + "epoch": 0.3651285134525778, + "flos": 17274972378240.0, + "grad_norm": 2.0038806018579316, + "language_loss": 0.73112363, + "learning_rate": 2.9330653362237094e-06, + "loss": 0.75291193, + "num_input_tokens_seen": 130438495, + "router_z_loss_clip": 0.84423828, + "router_z_loss_mlp": 0.13793945, + "step": 6073, + "time_per_iteration": 2.455622673034668 + }, + { + "auxiliary_loss_clip": 0.01132001, + "auxiliary_loss_mlp": 0.01035715, + "balance_loss_clip": 1.04897738, + "balance_loss_mlp": 1.02038503, + "epoch": 0.36518863670524576, + "flos": 21907987286400.0, + "grad_norm": 2.4102023896853866, + "language_loss": 0.67506659, + "learning_rate": 2.932720838132236e-06, + "loss": 0.69674379, + "num_input_tokens_seen": 130455575, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.15344238, + "step": 6074, + "time_per_iteration": 2.504356861114502 + }, + { + "auxiliary_loss_clip": 0.0113209, + "auxiliary_loss_mlp": 0.01042865, + "balance_loss_clip": 1.04999506, + "balance_loss_mlp": 1.02608013, + "epoch": 0.3652487599579137, + "flos": 27122180250240.0, + "grad_norm": 1.4862872093872448, + "language_loss": 0.72954959, + "learning_rate": 2.9323763046718954e-06, + "loss": 0.75129914, + "num_input_tokens_seen": 130476385, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.16784668, + "step": 6075, + "time_per_iteration": 2.53548264503479 + }, + { + "auxiliary_loss_clip": 0.01141174, + "auxiliary_loss_mlp": 0.01043492, + "balance_loss_clip": 1.05495393, + "balance_loss_mlp": 1.02830493, + "epoch": 0.3653088832105817, + "flos": 19755573626880.0, + "grad_norm": 2.12447286029495, + "language_loss": 0.89074349, + "learning_rate": 2.9320317358557524e-06, + "loss": 0.91259021, + "num_input_tokens_seen": 130493630, + "router_z_loss_clip": 0.86181641, + "router_z_loss_mlp": 0.15179443, + "step": 6076, + "time_per_iteration": 3.873009443283081 + }, + { + "auxiliary_loss_clip": 0.01138709, + "auxiliary_loss_mlp": 0.01038813, + "balance_loss_clip": 1.05682659, + "balance_loss_mlp": 1.02257657, + "epoch": 0.36536900646324966, + "flos": 13115008609920.0, + "grad_norm": 2.87700473671538, + "language_loss": 0.69580817, + "learning_rate": 2.931687131696872e-06, + "loss": 0.71758336, + "num_input_tokens_seen": 130510735, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.16235352, + "step": 6077, + "time_per_iteration": 2.437439203262329 + }, + { + "auxiliary_loss_clip": 0.01092795, + "auxiliary_loss_mlp": 0.01003789, + "balance_loss_clip": 1.06241262, + "balance_loss_mlp": 1.00197411, + "epoch": 0.3654291297159176, + "flos": 71100472383360.0, + "grad_norm": 0.7524311155417072, + "language_loss": 0.61747026, + "learning_rate": 2.9313424922083224e-06, + "loss": 0.63843608, + "num_input_tokens_seen": 130577050, + "router_z_loss_clip": 0.30371094, + "router_z_loss_mlp": 0.01812744, + "step": 6078, + "time_per_iteration": 3.1421585083007812 + }, + { + "auxiliary_loss_clip": 0.01134075, + "auxiliary_loss_mlp": 0.01039093, + "balance_loss_clip": 1.05250192, + "balance_loss_mlp": 1.02511013, + "epoch": 0.3654892529685856, + "flos": 23617478338560.0, + "grad_norm": 2.2892684701605495, + "language_loss": 0.78667569, + "learning_rate": 2.930997817403173e-06, + "loss": 0.80840737, + "num_input_tokens_seen": 130593780, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.13995361, + "step": 6079, + "time_per_iteration": 2.438263416290283 + }, + { + "auxiliary_loss_clip": 0.01138453, + "auxiliary_loss_mlp": 0.01037131, + "balance_loss_clip": 1.05465424, + "balance_loss_mlp": 1.0218122, + "epoch": 0.36554937622125355, + "flos": 43470799850880.0, + "grad_norm": 4.751261748662675, + "language_loss": 0.62954247, + "learning_rate": 2.9306531072944913e-06, + "loss": 0.6512984, + "num_input_tokens_seen": 130615510, + "router_z_loss_clip": 0.83740234, + "router_z_loss_mlp": 0.15319824, + "step": 6080, + "time_per_iteration": 2.627885580062866 + }, + { + "auxiliary_loss_clip": 0.01138643, + "auxiliary_loss_mlp": 0.01043135, + "balance_loss_clip": 1.05231273, + "balance_loss_mlp": 1.02667212, + "epoch": 0.3656094994739215, + "flos": 23294641875840.0, + "grad_norm": 2.587208726720031, + "language_loss": 0.67728937, + "learning_rate": 2.930308361895352e-06, + "loss": 0.69910717, + "num_input_tokens_seen": 130635410, + "router_z_loss_clip": 0.86279297, + "router_z_loss_mlp": 0.16467285, + "step": 6081, + "time_per_iteration": 2.4698264598846436 + }, + { + "auxiliary_loss_clip": 0.01150956, + "auxiliary_loss_mlp": 0.01046837, + "balance_loss_clip": 1.06281221, + "balance_loss_mlp": 1.0309943, + "epoch": 0.3656696227265895, + "flos": 24571984400640.0, + "grad_norm": 2.1084226046720604, + "language_loss": 0.74896896, + "learning_rate": 2.9299635812188257e-06, + "loss": 0.77094698, + "num_input_tokens_seen": 130657725, + "router_z_loss_clip": 0.88183594, + "router_z_loss_mlp": 0.15844727, + "step": 6082, + "time_per_iteration": 2.546999454498291 + }, + { + "auxiliary_loss_clip": 0.01144908, + "auxiliary_loss_mlp": 0.01031572, + "balance_loss_clip": 1.05988026, + "balance_loss_mlp": 1.01805973, + "epoch": 0.3657297459792575, + "flos": 27928375056000.0, + "grad_norm": 1.7387107681656335, + "language_loss": 0.83000344, + "learning_rate": 2.929618765277987e-06, + "loss": 0.85176826, + "num_input_tokens_seen": 130678360, + "router_z_loss_clip": 0.84960938, + "router_z_loss_mlp": 0.13513184, + "step": 6083, + "time_per_iteration": 4.168022632598877 + }, + { + "auxiliary_loss_clip": 0.01112166, + "auxiliary_loss_mlp": 0.01010288, + "balance_loss_clip": 1.08083963, + "balance_loss_mlp": 1.00759149, + "epoch": 0.36578986923192547, + "flos": 67392622126080.0, + "grad_norm": 0.80703728463931, + "language_loss": 0.59246802, + "learning_rate": 2.9292739140859125e-06, + "loss": 0.61369258, + "num_input_tokens_seen": 130742110, + "router_z_loss_clip": 0.31347656, + "router_z_loss_mlp": 0.02694702, + "step": 6084, + "time_per_iteration": 3.1487579345703125 + }, + { + "auxiliary_loss_clip": 0.01146932, + "auxiliary_loss_mlp": 0.01037327, + "balance_loss_clip": 1.06343234, + "balance_loss_mlp": 1.02326059, + "epoch": 0.36584999248459343, + "flos": 20227511445120.0, + "grad_norm": 1.8413191558160755, + "language_loss": 0.73089802, + "learning_rate": 2.9289290276556767e-06, + "loss": 0.75274062, + "num_input_tokens_seen": 130759870, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.14080811, + "step": 6085, + "time_per_iteration": 2.5305635929107666 + }, + { + "auxiliary_loss_clip": 0.01142009, + "auxiliary_loss_mlp": 0.01040063, + "balance_loss_clip": 1.05724883, + "balance_loss_mlp": 1.02544212, + "epoch": 0.3659101157372614, + "flos": 19062461813760.0, + "grad_norm": 2.628475377450944, + "language_loss": 0.78463709, + "learning_rate": 2.9285841060003604e-06, + "loss": 0.80645788, + "num_input_tokens_seen": 130778510, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.1461792, + "step": 6086, + "time_per_iteration": 2.4440128803253174 + }, + { + "auxiliary_loss_clip": 0.01134234, + "auxiliary_loss_mlp": 0.01034679, + "balance_loss_clip": 1.05373597, + "balance_loss_mlp": 1.02049887, + "epoch": 0.36597023898992936, + "flos": 30810708990720.0, + "grad_norm": 1.955458913608912, + "language_loss": 0.7666139, + "learning_rate": 2.9282391491330416e-06, + "loss": 0.78830302, + "num_input_tokens_seen": 130798535, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.14178467, + "step": 6087, + "time_per_iteration": 2.5952236652374268 + }, + { + "auxiliary_loss_clip": 0.01150226, + "auxiliary_loss_mlp": 0.01036636, + "balance_loss_clip": 1.06418014, + "balance_loss_mlp": 1.02169323, + "epoch": 0.36603036224259733, + "flos": 20521799573760.0, + "grad_norm": 2.2376032639603656, + "language_loss": 0.71245146, + "learning_rate": 2.9278941570668002e-06, + "loss": 0.73432004, + "num_input_tokens_seen": 130816655, + "router_z_loss_clip": 0.85986328, + "router_z_loss_mlp": 0.14935303, + "step": 6088, + "time_per_iteration": 2.4999473094940186 + }, + { + "auxiliary_loss_clip": 0.01146659, + "auxiliary_loss_mlp": 0.01040217, + "balance_loss_clip": 1.05547702, + "balance_loss_mlp": 1.02387893, + "epoch": 0.3660904854952653, + "flos": 38329397798400.0, + "grad_norm": 1.886948026622534, + "language_loss": 0.80105674, + "learning_rate": 2.92754912981472e-06, + "loss": 0.82292557, + "num_input_tokens_seen": 130841225, + "router_z_loss_clip": 0.91162109, + "router_z_loss_mlp": 0.16339111, + "step": 6089, + "time_per_iteration": 2.6268177032470703 + }, + { + "auxiliary_loss_clip": 0.01134783, + "auxiliary_loss_mlp": 0.0104177, + "balance_loss_clip": 1.05290866, + "balance_loss_mlp": 1.02564061, + "epoch": 0.36615060874793326, + "flos": 21835555511040.0, + "grad_norm": 1.9440636636354092, + "language_loss": 0.71081507, + "learning_rate": 2.927204067389884e-06, + "loss": 0.73258054, + "num_input_tokens_seen": 130861050, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.16125488, + "step": 6090, + "time_per_iteration": 2.4562294483184814 + }, + { + "auxiliary_loss_clip": 0.01146198, + "auxiliary_loss_mlp": 0.01044616, + "balance_loss_clip": 1.06512499, + "balance_loss_mlp": 1.0310322, + "epoch": 0.3662107320006012, + "flos": 16581537342720.0, + "grad_norm": 1.841304630790669, + "language_loss": 0.74378073, + "learning_rate": 2.9268589698053763e-06, + "loss": 0.7656889, + "num_input_tokens_seen": 130879775, + "router_z_loss_clip": 0.80957031, + "router_z_loss_mlp": 0.13574219, + "step": 6091, + "time_per_iteration": 2.449990749359131 + }, + { + "auxiliary_loss_clip": 0.01142236, + "auxiliary_loss_mlp": 0.0103594, + "balance_loss_clip": 1.06005049, + "balance_loss_mlp": 1.0214982, + "epoch": 0.3662708552532692, + "flos": 20958365473920.0, + "grad_norm": 1.7799625607390392, + "language_loss": 0.72597277, + "learning_rate": 2.926513837074284e-06, + "loss": 0.74775457, + "num_input_tokens_seen": 130898070, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.14434814, + "step": 6092, + "time_per_iteration": 2.4431231021881104 + }, + { + "auxiliary_loss_clip": 0.0113518, + "auxiliary_loss_mlp": 0.0104438, + "balance_loss_clip": 1.05127025, + "balance_loss_mlp": 1.02856064, + "epoch": 0.36633097850593715, + "flos": 21902707987200.0, + "grad_norm": 2.0902687851545205, + "language_loss": 0.78216803, + "learning_rate": 2.9261686692096942e-06, + "loss": 0.80396366, + "num_input_tokens_seen": 130915250, + "router_z_loss_clip": 0.83789062, + "router_z_loss_mlp": 0.15820312, + "step": 6093, + "time_per_iteration": 2.505195379257202 + }, + { + "auxiliary_loss_clip": 0.01137227, + "auxiliary_loss_mlp": 0.01033634, + "balance_loss_clip": 1.05440307, + "balance_loss_mlp": 1.01962721, + "epoch": 0.3663911017586051, + "flos": 32854133808000.0, + "grad_norm": 1.85660906732875, + "language_loss": 0.74605715, + "learning_rate": 2.925823466224696e-06, + "loss": 0.76776582, + "num_input_tokens_seen": 130936995, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.14007568, + "step": 6094, + "time_per_iteration": 2.5752696990966797 + }, + { + "auxiliary_loss_clip": 0.01141912, + "auxiliary_loss_mlp": 0.01046655, + "balance_loss_clip": 1.05807924, + "balance_loss_mlp": 1.03137279, + "epoch": 0.3664512250112731, + "flos": 27271748482560.0, + "grad_norm": 1.741223040883979, + "language_loss": 0.79459029, + "learning_rate": 2.9254782281323785e-06, + "loss": 0.81647599, + "num_input_tokens_seen": 130957970, + "router_z_loss_clip": 0.83886719, + "router_z_loss_mlp": 0.1529541, + "step": 6095, + "time_per_iteration": 2.534660816192627 + }, + { + "auxiliary_loss_clip": 0.01136076, + "auxiliary_loss_mlp": 0.01034606, + "balance_loss_clip": 1.05162072, + "balance_loss_mlp": 1.01868582, + "epoch": 0.3665113482639411, + "flos": 17784436930560.0, + "grad_norm": 2.507629507957303, + "language_loss": 0.73303878, + "learning_rate": 2.925132954945834e-06, + "loss": 0.7547456, + "num_input_tokens_seen": 130974915, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.15924072, + "step": 6096, + "time_per_iteration": 2.953124761581421 + }, + { + "auxiliary_loss_clip": 0.01142094, + "auxiliary_loss_mlp": 0.01039536, + "balance_loss_clip": 1.05713677, + "balance_loss_mlp": 1.02332377, + "epoch": 0.36657147151660907, + "flos": 27854614477440.0, + "grad_norm": 2.4393401219822457, + "language_loss": 0.66921639, + "learning_rate": 2.924787646678155e-06, + "loss": 0.69103265, + "num_input_tokens_seen": 130995745, + "router_z_loss_clip": 0.84912109, + "router_z_loss_mlp": 0.16235352, + "step": 6097, + "time_per_iteration": 2.533393383026123 + }, + { + "auxiliary_loss_clip": 0.01138978, + "auxiliary_loss_mlp": 0.01039152, + "balance_loss_clip": 1.0545392, + "balance_loss_mlp": 1.02464414, + "epoch": 0.36663159476927704, + "flos": 25374013228800.0, + "grad_norm": 1.8165249804290355, + "language_loss": 0.7769019, + "learning_rate": 2.9244423033424365e-06, + "loss": 0.79868323, + "num_input_tokens_seen": 131015545, + "router_z_loss_clip": 0.84375, + "router_z_loss_mlp": 0.1451416, + "step": 6098, + "time_per_iteration": 2.5255675315856934 + }, + { + "auxiliary_loss_clip": 0.01136098, + "auxiliary_loss_mlp": 0.01043395, + "balance_loss_clip": 1.05556631, + "balance_loss_mlp": 1.02835119, + "epoch": 0.366691718021945, + "flos": 21357225072000.0, + "grad_norm": 1.988348694001035, + "language_loss": 0.73500127, + "learning_rate": 2.9240969249517723e-06, + "loss": 0.75679624, + "num_input_tokens_seen": 131033990, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.15045166, + "step": 6099, + "time_per_iteration": 2.4688098430633545 + }, + { + "auxiliary_loss_clip": 0.01132388, + "auxiliary_loss_mlp": 0.01040561, + "balance_loss_clip": 1.05200553, + "balance_loss_mlp": 1.02675104, + "epoch": 0.36675184127461297, + "flos": 16800376953600.0, + "grad_norm": 1.8094269742911466, + "language_loss": 0.84444726, + "learning_rate": 2.9237515115192602e-06, + "loss": 0.86617678, + "num_input_tokens_seen": 131050710, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.13806152, + "step": 6100, + "time_per_iteration": 2.388108491897583 + }, + { + "auxiliary_loss_clip": 0.01146584, + "auxiliary_loss_mlp": 0.01037371, + "balance_loss_clip": 1.05788445, + "balance_loss_mlp": 1.02232659, + "epoch": 0.36681196452728093, + "flos": 21906514828800.0, + "grad_norm": 1.8071922109727976, + "language_loss": 0.70787108, + "learning_rate": 2.9234060630579992e-06, + "loss": 0.72971058, + "num_input_tokens_seen": 131071435, + "router_z_loss_clip": 0.88720703, + "router_z_loss_mlp": 0.15045166, + "step": 6101, + "time_per_iteration": 2.506707191467285 + }, + { + "auxiliary_loss_clip": 0.01139156, + "auxiliary_loss_mlp": 0.01046698, + "balance_loss_clip": 1.05479193, + "balance_loss_mlp": 1.03117728, + "epoch": 0.3668720877799489, + "flos": 17712436118400.0, + "grad_norm": 2.2040082190578976, + "language_loss": 0.76227498, + "learning_rate": 2.9230605795810865e-06, + "loss": 0.78413349, + "num_input_tokens_seen": 131088775, + "router_z_loss_clip": 0.84326172, + "router_z_loss_mlp": 0.15515137, + "step": 6102, + "time_per_iteration": 2.384476661682129 + }, + { + "auxiliary_loss_clip": 0.01148501, + "auxiliary_loss_mlp": 0.01036323, + "balance_loss_clip": 1.05948067, + "balance_loss_mlp": 1.02018237, + "epoch": 0.36693221103261686, + "flos": 47045455499520.0, + "grad_norm": 1.5272833513704904, + "language_loss": 0.69879216, + "learning_rate": 2.922715061101625e-06, + "loss": 0.72064042, + "num_input_tokens_seen": 131112800, + "router_z_loss_clip": 0.89013672, + "router_z_loss_mlp": 0.16162109, + "step": 6103, + "time_per_iteration": 2.7004098892211914 + }, + { + "auxiliary_loss_clip": 0.01161333, + "auxiliary_loss_mlp": 0.01036153, + "balance_loss_clip": 1.07571149, + "balance_loss_mlp": 1.02108526, + "epoch": 0.3669923342852848, + "flos": 15960929132160.0, + "grad_norm": 1.8898279971157768, + "language_loss": 0.71290416, + "learning_rate": 2.922369507632716e-06, + "loss": 0.73487902, + "num_input_tokens_seen": 131131150, + "router_z_loss_clip": 0.85693359, + "router_z_loss_mlp": 0.15075684, + "step": 6104, + "time_per_iteration": 3.859743356704712 + }, + { + "auxiliary_loss_clip": 0.01146731, + "auxiliary_loss_mlp": 0.01035142, + "balance_loss_clip": 1.06165242, + "balance_loss_mlp": 1.01982331, + "epoch": 0.3670524575379528, + "flos": 19974485064960.0, + "grad_norm": 1.8956663347959744, + "language_loss": 0.81349969, + "learning_rate": 2.9220239191874617e-06, + "loss": 0.83531845, + "num_input_tokens_seen": 131150365, + "router_z_loss_clip": 0.85107422, + "router_z_loss_mlp": 0.15319824, + "step": 6105, + "time_per_iteration": 2.432812452316284 + }, + { + "auxiliary_loss_clip": 0.01144761, + "auxiliary_loss_mlp": 0.01037668, + "balance_loss_clip": 1.05701923, + "balance_loss_mlp": 1.02187324, + "epoch": 0.36711258079062076, + "flos": 25702955003520.0, + "grad_norm": 2.0071005727682665, + "language_loss": 0.80188334, + "learning_rate": 2.9216782957789692e-06, + "loss": 0.8237077, + "num_input_tokens_seen": 131169310, + "router_z_loss_clip": 0.87890625, + "router_z_loss_mlp": 0.15795898, + "step": 6106, + "time_per_iteration": 2.484081983566284 + }, + { + "auxiliary_loss_clip": 0.01080671, + "auxiliary_loss_mlp": 0.01011123, + "balance_loss_clip": 1.05181122, + "balance_loss_mlp": 1.00935316, + "epoch": 0.3671727040432887, + "flos": 60772743342720.0, + "grad_norm": 0.6926076758523105, + "language_loss": 0.59123796, + "learning_rate": 2.9213326374203426e-06, + "loss": 0.61215591, + "num_input_tokens_seen": 131232900, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.0177002, + "step": 6107, + "time_per_iteration": 3.138075351715088 + }, + { + "auxiliary_loss_clip": 0.01143686, + "auxiliary_loss_mlp": 0.01031922, + "balance_loss_clip": 1.05979204, + "balance_loss_mlp": 1.01783156, + "epoch": 0.3672328272959567, + "flos": 18661303745280.0, + "grad_norm": 1.5813560846217187, + "language_loss": 0.74229443, + "learning_rate": 2.92098694412469e-06, + "loss": 0.76405048, + "num_input_tokens_seen": 131250920, + "router_z_loss_clip": 0.83886719, + "router_z_loss_mlp": 0.14086914, + "step": 6108, + "time_per_iteration": 2.501448392868042 + }, + { + "auxiliary_loss_clip": 0.01140363, + "auxiliary_loss_mlp": 0.01036024, + "balance_loss_clip": 1.05586386, + "balance_loss_mlp": 1.02161193, + "epoch": 0.3672929505486247, + "flos": 15049049535360.0, + "grad_norm": 9.444169930873603, + "language_loss": 0.73119485, + "learning_rate": 2.9206412159051213e-06, + "loss": 0.75295877, + "num_input_tokens_seen": 131267910, + "router_z_loss_clip": 0.84570312, + "router_z_loss_mlp": 0.14422607, + "step": 6109, + "time_per_iteration": 2.897991418838501 + }, + { + "auxiliary_loss_clip": 0.01147075, + "auxiliary_loss_mlp": 0.01037309, + "balance_loss_clip": 1.06112444, + "balance_loss_mlp": 1.02263403, + "epoch": 0.3673530738012927, + "flos": 20589347099520.0, + "grad_norm": 2.095616419937579, + "language_loss": 0.53333807, + "learning_rate": 2.920295452774744e-06, + "loss": 0.55518186, + "num_input_tokens_seen": 131287150, + "router_z_loss_clip": 0.85986328, + "router_z_loss_mlp": 0.14678955, + "step": 6110, + "time_per_iteration": 2.580660820007324 + }, + { + "auxiliary_loss_clip": 0.01154251, + "auxiliary_loss_mlp": 0.01039692, + "balance_loss_clip": 1.07086945, + "balance_loss_mlp": 1.0239923, + "epoch": 0.36741319705396064, + "flos": 21689830033920.0, + "grad_norm": 1.66802805055545, + "language_loss": 0.80518782, + "learning_rate": 2.919949654746672e-06, + "loss": 0.82712734, + "num_input_tokens_seen": 131308225, + "router_z_loss_clip": 0.83300781, + "router_z_loss_mlp": 0.15686035, + "step": 6111, + "time_per_iteration": 2.6194257736206055 + }, + { + "auxiliary_loss_clip": 0.011393, + "auxiliary_loss_mlp": 0.01038112, + "balance_loss_clip": 1.05676556, + "balance_loss_mlp": 1.02366364, + "epoch": 0.3674733203066286, + "flos": 29862200499840.0, + "grad_norm": 1.7488600112652768, + "language_loss": 0.72326636, + "learning_rate": 2.9196038218340163e-06, + "loss": 0.74504042, + "num_input_tokens_seen": 131332115, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.14447021, + "step": 6112, + "time_per_iteration": 3.9774389266967773 + }, + { + "auxiliary_loss_clip": 0.01147549, + "auxiliary_loss_mlp": 0.0104757, + "balance_loss_clip": 1.06426644, + "balance_loss_mlp": 1.03270483, + "epoch": 0.36753344355929657, + "flos": 18257021193600.0, + "grad_norm": 1.6401515238505593, + "language_loss": 0.85144144, + "learning_rate": 2.919257954049892e-06, + "loss": 0.87339264, + "num_input_tokens_seen": 131351885, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.14874268, + "step": 6113, + "time_per_iteration": 2.511011838912964 + }, + { + "auxiliary_loss_clip": 0.01145606, + "auxiliary_loss_mlp": 0.01044891, + "balance_loss_clip": 1.06041551, + "balance_loss_mlp": 1.02832079, + "epoch": 0.36759356681196453, + "flos": 25301150490240.0, + "grad_norm": 1.965242026983093, + "language_loss": 0.7865299, + "learning_rate": 2.918912051407413e-06, + "loss": 0.80843484, + "num_input_tokens_seen": 131370245, + "router_z_loss_clip": 0.85205078, + "router_z_loss_mlp": 0.16564941, + "step": 6114, + "time_per_iteration": 2.5242700576782227 + }, + { + "auxiliary_loss_clip": 0.01151595, + "auxiliary_loss_mlp": 0.01049835, + "balance_loss_clip": 1.06177092, + "balance_loss_mlp": 1.03209698, + "epoch": 0.3676536900646325, + "flos": 21032952065280.0, + "grad_norm": 1.6983552546097993, + "language_loss": 0.67466372, + "learning_rate": 2.918566113919698e-06, + "loss": 0.69667804, + "num_input_tokens_seen": 131388115, + "router_z_loss_clip": 0.89746094, + "router_z_loss_mlp": 0.17736816, + "step": 6115, + "time_per_iteration": 2.498042583465576 + }, + { + "auxiliary_loss_clip": 0.01143569, + "auxiliary_loss_mlp": 0.01045775, + "balance_loss_clip": 1.05873466, + "balance_loss_mlp": 1.02996802, + "epoch": 0.36771381331730046, + "flos": 16288506190080.0, + "grad_norm": 2.5039316938152054, + "language_loss": 0.76504886, + "learning_rate": 2.9182201415998636e-06, + "loss": 0.78694224, + "num_input_tokens_seen": 131404595, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.15808105, + "step": 6116, + "time_per_iteration": 2.5337445735931396 + }, + { + "auxiliary_loss_clip": 0.01148708, + "auxiliary_loss_mlp": 0.0103897, + "balance_loss_clip": 1.06413507, + "balance_loss_mlp": 1.02483165, + "epoch": 0.36777393656996843, + "flos": 22309971367680.0, + "grad_norm": 2.0536058146099636, + "language_loss": 0.62776732, + "learning_rate": 2.9178741344610286e-06, + "loss": 0.64964408, + "num_input_tokens_seen": 131423760, + "router_z_loss_clip": 0.84667969, + "router_z_loss_mlp": 0.14135742, + "step": 6117, + "time_per_iteration": 2.505363941192627 + }, + { + "auxiliary_loss_clip": 0.01154453, + "auxiliary_loss_mlp": 0.01036002, + "balance_loss_clip": 1.06847978, + "balance_loss_mlp": 1.02075481, + "epoch": 0.3678340598226364, + "flos": 26834069260800.0, + "grad_norm": 1.8882859914086227, + "language_loss": 0.73837948, + "learning_rate": 2.9175280925163156e-06, + "loss": 0.76028407, + "num_input_tokens_seen": 131444955, + "router_z_loss_clip": 0.85986328, + "router_z_loss_mlp": 0.15246582, + "step": 6118, + "time_per_iteration": 2.551510810852051 + }, + { + "auxiliary_loss_clip": 0.01153487, + "auxiliary_loss_mlp": 0.01042307, + "balance_loss_clip": 1.06496787, + "balance_loss_mlp": 1.02621329, + "epoch": 0.36789418307530436, + "flos": 21761723105280.0, + "grad_norm": 1.5624661620294968, + "language_loss": 0.72447622, + "learning_rate": 2.9171820157788445e-06, + "loss": 0.74643409, + "num_input_tokens_seen": 131465720, + "router_z_loss_clip": 0.88574219, + "router_z_loss_mlp": 0.16088867, + "step": 6119, + "time_per_iteration": 2.5726397037506104 + }, + { + "auxiliary_loss_clip": 0.01146066, + "auxiliary_loss_mlp": 0.01035652, + "balance_loss_clip": 1.06237364, + "balance_loss_mlp": 1.01960027, + "epoch": 0.3679543063279723, + "flos": 15924192497280.0, + "grad_norm": 1.8805854817055065, + "language_loss": 0.80562079, + "learning_rate": 2.9168359042617404e-06, + "loss": 0.82743806, + "num_input_tokens_seen": 131483080, + "router_z_loss_clip": 0.83691406, + "router_z_loss_mlp": 0.1605835, + "step": 6120, + "time_per_iteration": 3.998577833175659 + }, + { + "auxiliary_loss_clip": 0.01146608, + "auxiliary_loss_mlp": 0.01042405, + "balance_loss_clip": 1.06193519, + "balance_loss_mlp": 1.02700329, + "epoch": 0.3680144295806403, + "flos": 24275541456000.0, + "grad_norm": 2.1921934343181086, + "language_loss": 0.6481483, + "learning_rate": 2.916489757978126e-06, + "loss": 0.67003846, + "num_input_tokens_seen": 131502545, + "router_z_loss_clip": 0.84521484, + "router_z_loss_mlp": 0.15405273, + "step": 6121, + "time_per_iteration": 2.636168956756592 + }, + { + "auxiliary_loss_clip": 0.01159006, + "auxiliary_loss_mlp": 0.01041147, + "balance_loss_clip": 1.07235289, + "balance_loss_mlp": 1.025841, + "epoch": 0.36807455283330826, + "flos": 26104148985600.0, + "grad_norm": 2.064828753138313, + "language_loss": 0.71245843, + "learning_rate": 2.9161435769411286e-06, + "loss": 0.73446, + "num_input_tokens_seen": 131522155, + "router_z_loss_clip": 0.86572266, + "router_z_loss_mlp": 0.1529541, + "step": 6122, + "time_per_iteration": 3.1062519550323486 + }, + { + "auxiliary_loss_clip": 0.01137461, + "auxiliary_loss_mlp": 0.01037378, + "balance_loss_clip": 1.0577662, + "balance_loss_mlp": 1.02174926, + "epoch": 0.3681346760859763, + "flos": 24644990793600.0, + "grad_norm": 1.7610039699418323, + "language_loss": 0.69287997, + "learning_rate": 2.915797361163875e-06, + "loss": 0.71462834, + "num_input_tokens_seen": 131543865, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.15637207, + "step": 6123, + "time_per_iteration": 2.720991373062134 + }, + { + "auxiliary_loss_clip": 0.01154739, + "auxiliary_loss_mlp": 0.01039526, + "balance_loss_clip": 1.06647038, + "balance_loss_mlp": 1.02306283, + "epoch": 0.36819479933864424, + "flos": 23878369797120.0, + "grad_norm": 1.950310148902883, + "language_loss": 0.73736668, + "learning_rate": 2.9154511106594933e-06, + "loss": 0.75930929, + "num_input_tokens_seen": 131562155, + "router_z_loss_clip": 0.88232422, + "router_z_loss_mlp": 0.16455078, + "step": 6124, + "time_per_iteration": 2.5595104694366455 + }, + { + "auxiliary_loss_clip": 0.01144676, + "auxiliary_loss_mlp": 0.01052327, + "balance_loss_clip": 1.05885959, + "balance_loss_mlp": 1.03409958, + "epoch": 0.3682549225913122, + "flos": 25553997302400.0, + "grad_norm": 2.191087460740284, + "language_loss": 0.74715018, + "learning_rate": 2.915104825441114e-06, + "loss": 0.76912022, + "num_input_tokens_seen": 131581695, + "router_z_loss_clip": 0.85839844, + "router_z_loss_mlp": 0.18225098, + "step": 6125, + "time_per_iteration": 3.0634636878967285 + }, + { + "auxiliary_loss_clip": 0.01155373, + "auxiliary_loss_mlp": 0.01049781, + "balance_loss_clip": 1.06589258, + "balance_loss_mlp": 1.03273439, + "epoch": 0.36831504584398017, + "flos": 16946605221120.0, + "grad_norm": 1.7429684896535214, + "language_loss": 0.78191209, + "learning_rate": 2.9147585055218686e-06, + "loss": 0.80396366, + "num_input_tokens_seen": 131599465, + "router_z_loss_clip": 0.89501953, + "router_z_loss_mlp": 0.17041016, + "step": 6126, + "time_per_iteration": 2.7180402278900146 + }, + { + "auxiliary_loss_clip": 0.01163054, + "auxiliary_loss_mlp": 0.01043848, + "balance_loss_clip": 1.0734427, + "balance_loss_mlp": 1.02729011, + "epoch": 0.36837516909664814, + "flos": 19865065259520.0, + "grad_norm": 2.1457553799053217, + "language_loss": 0.65626097, + "learning_rate": 2.914412150914888e-06, + "loss": 0.67833, + "num_input_tokens_seen": 131618330, + "router_z_loss_clip": 0.89697266, + "router_z_loss_mlp": 0.16552734, + "step": 6127, + "time_per_iteration": 4.133497476577759 + }, + { + "auxiliary_loss_clip": 0.01153641, + "auxiliary_loss_mlp": 0.01040934, + "balance_loss_clip": 1.06773162, + "balance_loss_mlp": 1.02510285, + "epoch": 0.3684352923493161, + "flos": 37626984362880.0, + "grad_norm": 2.1166730467164894, + "language_loss": 0.70460153, + "learning_rate": 2.9140657616333074e-06, + "loss": 0.72654724, + "num_input_tokens_seen": 131638960, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.15844727, + "step": 6128, + "time_per_iteration": 2.620004177093506 + }, + { + "auxiliary_loss_clip": 0.0115817, + "auxiliary_loss_mlp": 0.01038557, + "balance_loss_clip": 1.07287908, + "balance_loss_mlp": 1.02316082, + "epoch": 0.36849541560198407, + "flos": 14465501182080.0, + "grad_norm": 1.7893551847342009, + "language_loss": 0.75328934, + "learning_rate": 2.9137193376902614e-06, + "loss": 0.77525663, + "num_input_tokens_seen": 131657440, + "router_z_loss_clip": 0.85302734, + "router_z_loss_mlp": 0.15386963, + "step": 6129, + "time_per_iteration": 2.440532922744751 + }, + { + "auxiliary_loss_clip": 0.01152491, + "auxiliary_loss_mlp": 0.01037065, + "balance_loss_clip": 1.06579185, + "balance_loss_mlp": 1.02165723, + "epoch": 0.36855553885465203, + "flos": 25770753924480.0, + "grad_norm": 1.6746146864159874, + "language_loss": 0.84739846, + "learning_rate": 2.9133728790988868e-06, + "loss": 0.86929405, + "num_input_tokens_seen": 131678035, + "router_z_loss_clip": 0.86767578, + "router_z_loss_mlp": 0.1539917, + "step": 6130, + "time_per_iteration": 2.5035722255706787 + }, + { + "auxiliary_loss_clip": 0.01097472, + "auxiliary_loss_mlp": 0.01011185, + "balance_loss_clip": 1.06778002, + "balance_loss_mlp": 1.00936699, + "epoch": 0.36861566210732, + "flos": 65049417377280.0, + "grad_norm": 0.8054969939082125, + "language_loss": 0.60282415, + "learning_rate": 2.913026385872321e-06, + "loss": 0.62391073, + "num_input_tokens_seen": 131742470, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.01818848, + "step": 6131, + "time_per_iteration": 3.1947388648986816 + }, + { + "auxiliary_loss_clip": 0.01149308, + "auxiliary_loss_mlp": 0.01040576, + "balance_loss_clip": 1.06394434, + "balance_loss_mlp": 1.02429199, + "epoch": 0.36867578535998796, + "flos": 30954495133440.0, + "grad_norm": 1.5739004268224817, + "language_loss": 0.72865188, + "learning_rate": 2.9126798580237034e-06, + "loss": 0.75055063, + "num_input_tokens_seen": 131764570, + "router_z_loss_clip": 0.85302734, + "router_z_loss_mlp": 0.16271973, + "step": 6132, + "time_per_iteration": 2.5551493167877197 + }, + { + "auxiliary_loss_clip": 0.01149085, + "auxiliary_loss_mlp": 0.01042526, + "balance_loss_clip": 1.06174231, + "balance_loss_mlp": 1.02661121, + "epoch": 0.3687359086126559, + "flos": 28837956182400.0, + "grad_norm": 1.68290976660055, + "language_loss": 0.73996139, + "learning_rate": 2.9123332955661736e-06, + "loss": 0.76187754, + "num_input_tokens_seen": 131785720, + "router_z_loss_clip": 0.87304688, + "router_z_loss_mlp": 0.15905762, + "step": 6133, + "time_per_iteration": 2.5096800327301025 + }, + { + "auxiliary_loss_clip": 0.01139691, + "auxiliary_loss_mlp": 0.01050597, + "balance_loss_clip": 1.05892909, + "balance_loss_mlp": 1.03424144, + "epoch": 0.3687960318653239, + "flos": 21396798881280.0, + "grad_norm": 1.5352109473346836, + "language_loss": 0.71334553, + "learning_rate": 2.911986698512874e-06, + "loss": 0.73524845, + "num_input_tokens_seen": 131804430, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.16345215, + "step": 6134, + "time_per_iteration": 2.4944190979003906 + }, + { + "auxiliary_loss_clip": 0.01149765, + "auxiliary_loss_mlp": 0.01043867, + "balance_loss_clip": 1.06209207, + "balance_loss_mlp": 1.02830982, + "epoch": 0.36885615511799186, + "flos": 20266043760000.0, + "grad_norm": 1.860410386988983, + "language_loss": 0.75482541, + "learning_rate": 2.9116400668769477e-06, + "loss": 0.77676177, + "num_input_tokens_seen": 131822060, + "router_z_loss_clip": 0.87695312, + "router_z_loss_mlp": 0.15539551, + "step": 6135, + "time_per_iteration": 2.4705281257629395 + }, + { + "auxiliary_loss_clip": 0.01098876, + "auxiliary_loss_mlp": 0.01006171, + "balance_loss_clip": 1.06900895, + "balance_loss_mlp": 1.00421035, + "epoch": 0.3689162783706599, + "flos": 63088836301440.0, + "grad_norm": 0.8111951666605541, + "language_loss": 0.58820164, + "learning_rate": 2.9112934006715376e-06, + "loss": 0.6092521, + "num_input_tokens_seen": 131880715, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 0.0196228, + "step": 6136, + "time_per_iteration": 3.0229005813598633 + }, + { + "auxiliary_loss_clip": 0.01143258, + "auxiliary_loss_mlp": 0.01036503, + "balance_loss_clip": 1.06022382, + "balance_loss_mlp": 1.02188742, + "epoch": 0.36897640162332784, + "flos": 10961984419200.0, + "grad_norm": 2.2390413735140178, + "language_loss": 0.79211605, + "learning_rate": 2.9109466999097918e-06, + "loss": 0.8139137, + "num_input_tokens_seen": 131895850, + "router_z_loss_clip": 0.83056641, + "router_z_loss_mlp": 0.14605713, + "step": 6137, + "time_per_iteration": 2.522144317626953 + }, + { + "auxiliary_loss_clip": 0.01144137, + "auxiliary_loss_mlp": 0.01040318, + "balance_loss_clip": 1.06088877, + "balance_loss_mlp": 1.02460659, + "epoch": 0.3690365248759958, + "flos": 20704297599360.0, + "grad_norm": 2.179521350494057, + "language_loss": 0.74045658, + "learning_rate": 2.9105999646048552e-06, + "loss": 0.76230109, + "num_input_tokens_seen": 131915775, + "router_z_loss_clip": 0.83300781, + "router_z_loss_mlp": 0.15722656, + "step": 6138, + "time_per_iteration": 2.5426249504089355 + }, + { + "auxiliary_loss_clip": 0.011499, + "auxiliary_loss_mlp": 0.0103874, + "balance_loss_clip": 1.05932605, + "balance_loss_mlp": 1.02277756, + "epoch": 0.3690966481286638, + "flos": 31826369957760.0, + "grad_norm": 1.8903622829405338, + "language_loss": 0.65533435, + "learning_rate": 2.9102531947698764e-06, + "loss": 0.6772207, + "num_input_tokens_seen": 131935715, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.1595459, + "step": 6139, + "time_per_iteration": 2.543428421020508 + }, + { + "auxiliary_loss_clip": 0.01137591, + "auxiliary_loss_mlp": 0.01039253, + "balance_loss_clip": 1.05603671, + "balance_loss_mlp": 1.02400064, + "epoch": 0.36915677138133174, + "flos": 13114936782720.0, + "grad_norm": 2.0585790469721634, + "language_loss": 0.71333981, + "learning_rate": 2.909906390418006e-06, + "loss": 0.73510826, + "num_input_tokens_seen": 131954120, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.15252686, + "step": 6140, + "time_per_iteration": 2.4853570461273193 + }, + { + "auxiliary_loss_clip": 0.01094975, + "auxiliary_loss_mlp": 0.01005221, + "balance_loss_clip": 1.06478155, + "balance_loss_mlp": 1.00350904, + "epoch": 0.3692168946339997, + "flos": 68686879956480.0, + "grad_norm": 0.7482610977944404, + "language_loss": 0.59291834, + "learning_rate": 2.9095595515623934e-06, + "loss": 0.61392027, + "num_input_tokens_seen": 132017485, + "router_z_loss_clip": 0.30224609, + "router_z_loss_mlp": 0.0171051, + "step": 6141, + "time_per_iteration": 3.2974233627319336 + }, + { + "auxiliary_loss_clip": 0.01140597, + "auxiliary_loss_mlp": 0.01039893, + "balance_loss_clip": 1.05676341, + "balance_loss_mlp": 1.02464652, + "epoch": 0.36927701788666767, + "flos": 22017873968640.0, + "grad_norm": 1.7360704209884834, + "language_loss": 0.75458491, + "learning_rate": 2.909212678216192e-06, + "loss": 0.77638984, + "num_input_tokens_seen": 132036760, + "router_z_loss_clip": 0.83935547, + "router_z_loss_mlp": 0.15234375, + "step": 6142, + "time_per_iteration": 2.4818904399871826 + }, + { + "auxiliary_loss_clip": 0.01147104, + "auxiliary_loss_mlp": 0.01033913, + "balance_loss_clip": 1.0631299, + "balance_loss_mlp": 1.01934576, + "epoch": 0.36933714113933563, + "flos": 21835591424640.0, + "grad_norm": 1.8345703119526764, + "language_loss": 0.77029753, + "learning_rate": 2.908865770392555e-06, + "loss": 0.7921077, + "num_input_tokens_seen": 132056935, + "router_z_loss_clip": 0.83886719, + "router_z_loss_mlp": 0.14575195, + "step": 6143, + "time_per_iteration": 2.4851300716400146 + }, + { + "auxiliary_loss_clip": 0.01136776, + "auxiliary_loss_mlp": 0.01028871, + "balance_loss_clip": 1.05552149, + "balance_loss_mlp": 1.01517367, + "epoch": 0.3693972643920036, + "flos": 23691705793920.0, + "grad_norm": 1.5803780449431295, + "language_loss": 0.81909764, + "learning_rate": 2.9085188281046364e-06, + "loss": 0.84075415, + "num_input_tokens_seen": 132077285, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.13708496, + "step": 6144, + "time_per_iteration": 2.5290398597717285 + }, + { + "auxiliary_loss_clip": 0.01149735, + "auxiliary_loss_mlp": 0.01037654, + "balance_loss_clip": 1.06540406, + "balance_loss_mlp": 1.02316427, + "epoch": 0.36945738764467156, + "flos": 22856747172480.0, + "grad_norm": 13.744645897391058, + "language_loss": 0.77553207, + "learning_rate": 2.908171851365593e-06, + "loss": 0.79740596, + "num_input_tokens_seen": 132095520, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.1449585, + "step": 6145, + "time_per_iteration": 2.5059945583343506 + }, + { + "auxiliary_loss_clip": 0.01142765, + "auxiliary_loss_mlp": 0.01030842, + "balance_loss_clip": 1.05858421, + "balance_loss_mlp": 1.01634645, + "epoch": 0.36951751089733953, + "flos": 16615939593600.0, + "grad_norm": 1.7693594193074458, + "language_loss": 0.76879072, + "learning_rate": 2.9078248401885815e-06, + "loss": 0.79052681, + "num_input_tokens_seen": 132112810, + "router_z_loss_clip": 0.84130859, + "router_z_loss_mlp": 0.1449585, + "step": 6146, + "time_per_iteration": 2.463278293609619 + }, + { + "auxiliary_loss_clip": 0.01147779, + "auxiliary_loss_mlp": 0.0104477, + "balance_loss_clip": 1.05857253, + "balance_loss_mlp": 1.02883208, + "epoch": 0.3695776341500075, + "flos": 18914545607040.0, + "grad_norm": 1.722441399219568, + "language_loss": 0.80716711, + "learning_rate": 2.907477794586761e-06, + "loss": 0.82909262, + "num_input_tokens_seen": 132131615, + "router_z_loss_clip": 0.89306641, + "router_z_loss_mlp": 0.1595459, + "step": 6147, + "time_per_iteration": 2.4540276527404785 + }, + { + "auxiliary_loss_clip": 0.01140855, + "auxiliary_loss_mlp": 0.01036057, + "balance_loss_clip": 1.05645633, + "balance_loss_mlp": 1.02215672, + "epoch": 0.36963775740267546, + "flos": 20808474019200.0, + "grad_norm": 1.7825341358688407, + "language_loss": 0.83532065, + "learning_rate": 2.9071307145732926e-06, + "loss": 0.85708976, + "num_input_tokens_seen": 132149585, + "router_z_loss_clip": 0.84228516, + "router_z_loss_mlp": 0.13916016, + "step": 6148, + "time_per_iteration": 3.868255853652954 + }, + { + "auxiliary_loss_clip": 0.01141094, + "auxiliary_loss_mlp": 0.01032217, + "balance_loss_clip": 1.05913866, + "balance_loss_mlp": 1.0174588, + "epoch": 0.3696978806553435, + "flos": 26061881656320.0, + "grad_norm": 2.2557774809270783, + "language_loss": 0.74752343, + "learning_rate": 2.9067836001613357e-06, + "loss": 0.76925659, + "num_input_tokens_seen": 132165555, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.14758301, + "step": 6149, + "time_per_iteration": 2.459756374359131 + }, + { + "auxiliary_loss_clip": 0.01145419, + "auxiliary_loss_mlp": 0.01040079, + "balance_loss_clip": 1.05698919, + "balance_loss_mlp": 1.02313948, + "epoch": 0.36975800390801145, + "flos": 26833925606400.0, + "grad_norm": 1.7856825599816433, + "language_loss": 0.7051605, + "learning_rate": 2.906436451364054e-06, + "loss": 0.72701544, + "num_input_tokens_seen": 132185100, + "router_z_loss_clip": 0.88378906, + "router_z_loss_mlp": 0.16943359, + "step": 6150, + "time_per_iteration": 2.5432982444763184 + }, + { + "auxiliary_loss_clip": 0.01138221, + "auxiliary_loss_mlp": 0.01038515, + "balance_loss_clip": 1.05453753, + "balance_loss_mlp": 1.02401292, + "epoch": 0.3698181271606794, + "flos": 21142623265920.0, + "grad_norm": 2.5434644548291288, + "language_loss": 0.81740701, + "learning_rate": 2.906089268194611e-06, + "loss": 0.83917439, + "num_input_tokens_seen": 132203930, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.1449585, + "step": 6151, + "time_per_iteration": 2.4877758026123047 + }, + { + "auxiliary_loss_clip": 0.01082458, + "auxiliary_loss_mlp": 0.01005725, + "balance_loss_clip": 1.05249262, + "balance_loss_mlp": 1.00391912, + "epoch": 0.3698782504133474, + "flos": 66742639568640.0, + "grad_norm": 0.7774011895987907, + "language_loss": 0.63117886, + "learning_rate": 2.9057420506661726e-06, + "loss": 0.65206069, + "num_input_tokens_seen": 132263845, + "router_z_loss_clip": 0.30029297, + "router_z_loss_mlp": 0.01806641, + "step": 6152, + "time_per_iteration": 3.1783857345581055 + }, + { + "auxiliary_loss_clip": 0.01137555, + "auxiliary_loss_mlp": 0.01032138, + "balance_loss_clip": 1.05671382, + "balance_loss_mlp": 1.01823175, + "epoch": 0.36993837366601534, + "flos": 24311523905280.0, + "grad_norm": 2.114311160151698, + "language_loss": 0.69984877, + "learning_rate": 2.9053947987919044e-06, + "loss": 0.7215457, + "num_input_tokens_seen": 132282350, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.13903809, + "step": 6153, + "time_per_iteration": 2.4957125186920166 + }, + { + "auxiliary_loss_clip": 0.01145325, + "auxiliary_loss_mlp": 0.01039273, + "balance_loss_clip": 1.05680799, + "balance_loss_mlp": 1.02369225, + "epoch": 0.3699984969186833, + "flos": 24349194293760.0, + "grad_norm": 2.24671553957636, + "language_loss": 0.72616422, + "learning_rate": 2.9050475125849755e-06, + "loss": 0.74801016, + "num_input_tokens_seen": 132301930, + "router_z_loss_clip": 0.88623047, + "router_z_loss_mlp": 0.15576172, + "step": 6154, + "time_per_iteration": 2.6923093795776367 + }, + { + "auxiliary_loss_clip": 0.01136525, + "auxiliary_loss_mlp": 0.0103748, + "balance_loss_clip": 1.05352092, + "balance_loss_mlp": 1.02110076, + "epoch": 0.37005862017135127, + "flos": 19829154637440.0, + "grad_norm": 1.7186064997063877, + "language_loss": 0.67864031, + "learning_rate": 2.9047001920585534e-06, + "loss": 0.70038038, + "num_input_tokens_seen": 132320915, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.16369629, + "step": 6155, + "time_per_iteration": 2.5231571197509766 + }, + { + "auxiliary_loss_clip": 0.01145039, + "auxiliary_loss_mlp": 0.01038813, + "balance_loss_clip": 1.05796957, + "balance_loss_mlp": 1.0239718, + "epoch": 0.37011874342401924, + "flos": 19573793873280.0, + "grad_norm": 1.778075692762706, + "language_loss": 0.6759252, + "learning_rate": 2.9043528372258097e-06, + "loss": 0.69776368, + "num_input_tokens_seen": 132340415, + "router_z_loss_clip": 0.87011719, + "router_z_loss_mlp": 0.1484375, + "step": 6156, + "time_per_iteration": 3.904984951019287 + }, + { + "auxiliary_loss_clip": 0.01138198, + "auxiliary_loss_mlp": 0.01035583, + "balance_loss_clip": 1.05693817, + "balance_loss_mlp": 1.02226686, + "epoch": 0.3701788666766872, + "flos": 20374350243840.0, + "grad_norm": 1.756268400271267, + "language_loss": 0.82409763, + "learning_rate": 2.904005448099916e-06, + "loss": 0.84583545, + "num_input_tokens_seen": 132358600, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.13330078, + "step": 6157, + "time_per_iteration": 2.5173990726470947 + }, + { + "auxiliary_loss_clip": 0.01141933, + "auxiliary_loss_mlp": 0.0103563, + "balance_loss_clip": 1.05674028, + "balance_loss_mlp": 1.02002537, + "epoch": 0.37023898992935517, + "flos": 15340931452800.0, + "grad_norm": 3.7751709124084702, + "language_loss": 0.7702505, + "learning_rate": 2.9036580246940444e-06, + "loss": 0.79202616, + "num_input_tokens_seen": 132373160, + "router_z_loss_clip": 0.85302734, + "router_z_loss_mlp": 0.15612793, + "step": 6158, + "time_per_iteration": 2.496581792831421 + }, + { + "auxiliary_loss_clip": 0.0114633, + "auxiliary_loss_mlp": 0.01035937, + "balance_loss_clip": 1.06030297, + "balance_loss_mlp": 1.02014208, + "epoch": 0.37029911318202313, + "flos": 19573937527680.0, + "grad_norm": 2.0705795792050052, + "language_loss": 0.68336427, + "learning_rate": 2.9033105670213708e-06, + "loss": 0.70518696, + "num_input_tokens_seen": 132392345, + "router_z_loss_clip": 0.85986328, + "router_z_loss_mlp": 0.15795898, + "step": 6159, + "time_per_iteration": 2.643796920776367 + }, + { + "auxiliary_loss_clip": 0.01145679, + "auxiliary_loss_mlp": 0.0104076, + "balance_loss_clip": 1.06311691, + "balance_loss_mlp": 1.02715874, + "epoch": 0.3703592364346911, + "flos": 26213353309440.0, + "grad_norm": 1.7692139939949938, + "language_loss": 0.71025908, + "learning_rate": 2.9029630750950697e-06, + "loss": 0.73212349, + "num_input_tokens_seen": 132412620, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.13592529, + "step": 6160, + "time_per_iteration": 2.4876935482025146 + }, + { + "auxiliary_loss_clip": 0.01134493, + "auxiliary_loss_mlp": 0.01030909, + "balance_loss_clip": 1.05297065, + "balance_loss_mlp": 1.01745629, + "epoch": 0.37041935968735906, + "flos": 20048317470720.0, + "grad_norm": 1.5998493296047573, + "language_loss": 0.7901119, + "learning_rate": 2.9026155489283176e-06, + "loss": 0.81176597, + "num_input_tokens_seen": 132431570, + "router_z_loss_clip": 0.81494141, + "router_z_loss_mlp": 0.13458252, + "step": 6161, + "time_per_iteration": 2.4636056423187256 + }, + { + "auxiliary_loss_clip": 0.01143417, + "auxiliary_loss_mlp": 0.01034992, + "balance_loss_clip": 1.06053329, + "balance_loss_mlp": 1.01968002, + "epoch": 0.3704794829400271, + "flos": 24133802388480.0, + "grad_norm": 2.0056128296542948, + "language_loss": 0.7933526, + "learning_rate": 2.902267988534295e-06, + "loss": 0.81513667, + "num_input_tokens_seen": 132451525, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.15307617, + "step": 6162, + "time_per_iteration": 2.467259645462036 + }, + { + "auxiliary_loss_clip": 0.01139175, + "auxiliary_loss_mlp": 0.01038628, + "balance_loss_clip": 1.05530882, + "balance_loss_mlp": 1.02433515, + "epoch": 0.37053960619269505, + "flos": 14866874732160.0, + "grad_norm": 1.8233583794419335, + "language_loss": 0.79267412, + "learning_rate": 2.9019203939261783e-06, + "loss": 0.81445217, + "num_input_tokens_seen": 132469875, + "router_z_loss_clip": 0.83837891, + "router_z_loss_mlp": 0.14300537, + "step": 6163, + "time_per_iteration": 2.462405204772949 + }, + { + "auxiliary_loss_clip": 0.01141675, + "auxiliary_loss_mlp": 0.01037868, + "balance_loss_clip": 1.05829215, + "balance_loss_mlp": 1.02301455, + "epoch": 0.370599729445363, + "flos": 21361498790400.0, + "grad_norm": 1.7318921454519445, + "language_loss": 0.68544793, + "learning_rate": 2.9015727651171507e-06, + "loss": 0.70724332, + "num_input_tokens_seen": 132488360, + "router_z_loss_clip": 0.83349609, + "router_z_loss_mlp": 0.14855957, + "step": 6164, + "time_per_iteration": 2.417050838470459 + }, + { + "auxiliary_loss_clip": 0.01159919, + "auxiliary_loss_mlp": 0.01036594, + "balance_loss_clip": 1.07308078, + "balance_loss_mlp": 1.02159119, + "epoch": 0.370659852698031, + "flos": 26829041356800.0, + "grad_norm": 3.9837795170317363, + "language_loss": 0.8419112, + "learning_rate": 2.9012251021203935e-06, + "loss": 0.86387634, + "num_input_tokens_seen": 132508630, + "router_z_loss_clip": 0.86816406, + "router_z_loss_mlp": 0.15008545, + "step": 6165, + "time_per_iteration": 3.909517526626587 + }, + { + "auxiliary_loss_clip": 0.01144836, + "auxiliary_loss_mlp": 0.01039151, + "balance_loss_clip": 1.05847776, + "balance_loss_mlp": 1.0222826, + "epoch": 0.37071997595069894, + "flos": 19099018880640.0, + "grad_norm": 1.959605369027804, + "language_loss": 0.69347751, + "learning_rate": 2.9008774049490896e-06, + "loss": 0.71531737, + "num_input_tokens_seen": 132527465, + "router_z_loss_clip": 0.86474609, + "router_z_loss_mlp": 0.16882324, + "step": 6166, + "time_per_iteration": 2.5360937118530273 + }, + { + "auxiliary_loss_clip": 0.01094215, + "auxiliary_loss_mlp": 0.01001687, + "balance_loss_clip": 1.06311214, + "balance_loss_mlp": 0.99986571, + "epoch": 0.3707800992033669, + "flos": 52178384920320.0, + "grad_norm": 0.7934764468602612, + "language_loss": 0.56912184, + "learning_rate": 2.9005296736164244e-06, + "loss": 0.59008086, + "num_input_tokens_seen": 132579940, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.01821899, + "step": 6167, + "time_per_iteration": 3.070145845413208 + }, + { + "auxiliary_loss_clip": 0.01135552, + "auxiliary_loss_mlp": 0.01037162, + "balance_loss_clip": 1.05450988, + "balance_loss_mlp": 1.02321446, + "epoch": 0.3708402224560349, + "flos": 19901837808000.0, + "grad_norm": 2.2820028243096075, + "language_loss": 0.75241709, + "learning_rate": 2.900181908135584e-06, + "loss": 0.77414429, + "num_input_tokens_seen": 132598390, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.1395874, + "step": 6168, + "time_per_iteration": 2.4923391342163086 + }, + { + "auxiliary_loss_clip": 0.01142983, + "auxiliary_loss_mlp": 0.01038252, + "balance_loss_clip": 1.06078422, + "balance_loss_mlp": 1.0228802, + "epoch": 0.37090034570870284, + "flos": 20007630339840.0, + "grad_norm": 1.7721336478052172, + "language_loss": 0.73800194, + "learning_rate": 2.899834108519755e-06, + "loss": 0.75981426, + "num_input_tokens_seen": 132616920, + "router_z_loss_clip": 0.82226562, + "router_z_loss_mlp": 0.15374756, + "step": 6169, + "time_per_iteration": 2.4858012199401855 + }, + { + "auxiliary_loss_clip": 0.01135435, + "auxiliary_loss_mlp": 0.01031729, + "balance_loss_clip": 1.05540478, + "balance_loss_mlp": 1.01827621, + "epoch": 0.3709604689613708, + "flos": 24134700228480.0, + "grad_norm": 1.438986036188636, + "language_loss": 0.79540658, + "learning_rate": 2.899486274782127e-06, + "loss": 0.81707823, + "num_input_tokens_seen": 132637660, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.13433838, + "step": 6170, + "time_per_iteration": 2.508697271347046 + }, + { + "auxiliary_loss_clip": 0.01134071, + "auxiliary_loss_mlp": 0.01041593, + "balance_loss_clip": 1.05213952, + "balance_loss_mlp": 1.02623868, + "epoch": 0.37102059221403877, + "flos": 23876071326720.0, + "grad_norm": 2.655493368688731, + "language_loss": 0.76120949, + "learning_rate": 2.8991384069358885e-06, + "loss": 0.78296614, + "num_input_tokens_seen": 132657635, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.15344238, + "step": 6171, + "time_per_iteration": 3.9251842498779297 + }, + { + "auxiliary_loss_clip": 0.01139791, + "auxiliary_loss_mlp": 0.01036261, + "balance_loss_clip": 1.05710196, + "balance_loss_mlp": 1.0215863, + "epoch": 0.37108071546670673, + "flos": 14501268149760.0, + "grad_norm": 1.9567807080867103, + "language_loss": 0.80374742, + "learning_rate": 2.898790504994232e-06, + "loss": 0.82550794, + "num_input_tokens_seen": 132674455, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.14672852, + "step": 6172, + "time_per_iteration": 2.4277474880218506 + }, + { + "auxiliary_loss_clip": 0.01148544, + "auxiliary_loss_mlp": 0.01041108, + "balance_loss_clip": 1.06185555, + "balance_loss_mlp": 1.025599, + "epoch": 0.3711408387193747, + "flos": 34562619279360.0, + "grad_norm": 1.9460608400754131, + "language_loss": 0.59750986, + "learning_rate": 2.89844256897035e-06, + "loss": 0.61940634, + "num_input_tokens_seen": 132695140, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.1550293, + "step": 6173, + "time_per_iteration": 2.5481784343719482 + }, + { + "auxiliary_loss_clip": 0.01141521, + "auxiliary_loss_mlp": 0.01036984, + "balance_loss_clip": 1.05897439, + "balance_loss_mlp": 1.02305436, + "epoch": 0.37120096197204266, + "flos": 17310703432320.0, + "grad_norm": 2.42482637005384, + "language_loss": 0.80754888, + "learning_rate": 2.898094598877435e-06, + "loss": 0.8293339, + "num_input_tokens_seen": 132712470, + "router_z_loss_clip": 0.82568359, + "router_z_loss_mlp": 0.13928223, + "step": 6174, + "time_per_iteration": 2.427290201187134 + }, + { + "auxiliary_loss_clip": 0.0113661, + "auxiliary_loss_mlp": 0.01034428, + "balance_loss_clip": 1.05787086, + "balance_loss_mlp": 1.02085042, + "epoch": 0.37126108522471063, + "flos": 30664049760000.0, + "grad_norm": 1.9371654790619728, + "language_loss": 0.8003695, + "learning_rate": 2.8977465947286826e-06, + "loss": 0.8220799, + "num_input_tokens_seen": 132732945, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.13586426, + "step": 6175, + "time_per_iteration": 2.4947800636291504 + }, + { + "auxiliary_loss_clip": 0.0113319, + "auxiliary_loss_mlp": 0.01041513, + "balance_loss_clip": 1.05277491, + "balance_loss_mlp": 1.02726722, + "epoch": 0.37132120847737865, + "flos": 25155640494720.0, + "grad_norm": 2.266219840569549, + "language_loss": 0.88508493, + "learning_rate": 2.89739855653729e-06, + "loss": 0.90683198, + "num_input_tokens_seen": 132752470, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.14251709, + "step": 6176, + "time_per_iteration": 2.5160088539123535 + }, + { + "auxiliary_loss_clip": 0.01138343, + "auxiliary_loss_mlp": 0.01050835, + "balance_loss_clip": 1.0561142, + "balance_loss_mlp": 1.03525484, + "epoch": 0.3713813317300466, + "flos": 21213474842880.0, + "grad_norm": 1.6720562837681006, + "language_loss": 0.73357171, + "learning_rate": 2.8970504843164546e-06, + "loss": 0.75546354, + "num_input_tokens_seen": 132771485, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.15600586, + "step": 6177, + "time_per_iteration": 2.434762716293335 + }, + { + "auxiliary_loss_clip": 0.01142012, + "auxiliary_loss_mlp": 0.01043664, + "balance_loss_clip": 1.05834651, + "balance_loss_mlp": 1.02915084, + "epoch": 0.3714414549827146, + "flos": 21616644072960.0, + "grad_norm": 1.821717526317077, + "language_loss": 0.75267744, + "learning_rate": 2.896702378079374e-06, + "loss": 0.77453423, + "num_input_tokens_seen": 132791465, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.14501953, + "step": 6178, + "time_per_iteration": 2.495715379714966 + }, + { + "auxiliary_loss_clip": 0.01155076, + "auxiliary_loss_mlp": 0.01037783, + "balance_loss_clip": 1.07406282, + "balance_loss_mlp": 1.02393043, + "epoch": 0.37150157823538255, + "flos": 19972294335360.0, + "grad_norm": 1.826275206690274, + "language_loss": 0.72179323, + "learning_rate": 2.8963542378392502e-06, + "loss": 0.74372184, + "num_input_tokens_seen": 132810160, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.13861084, + "step": 6179, + "time_per_iteration": 2.4134202003479004 + }, + { + "auxiliary_loss_clip": 0.01140604, + "auxiliary_loss_mlp": 0.01035963, + "balance_loss_clip": 1.0553987, + "balance_loss_mlp": 1.02032292, + "epoch": 0.3715617014880505, + "flos": 24860562266880.0, + "grad_norm": 2.6282783071868896, + "language_loss": 0.69820106, + "learning_rate": 2.896006063609283e-06, + "loss": 0.71996665, + "num_input_tokens_seen": 132831265, + "router_z_loss_clip": 0.85205078, + "router_z_loss_mlp": 0.15649414, + "step": 6180, + "time_per_iteration": 2.710148572921753 + }, + { + "auxiliary_loss_clip": 0.01132417, + "auxiliary_loss_mlp": 0.0103483, + "balance_loss_clip": 1.05242157, + "balance_loss_mlp": 1.02068615, + "epoch": 0.3716218247407185, + "flos": 20449080489600.0, + "grad_norm": 1.6353705444410553, + "language_loss": 0.77485538, + "learning_rate": 2.8956578554026767e-06, + "loss": 0.7965278, + "num_input_tokens_seen": 132850005, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.14154053, + "step": 6181, + "time_per_iteration": 2.5102667808532715 + }, + { + "auxiliary_loss_clip": 0.0113554, + "auxiliary_loss_mlp": 0.01035696, + "balance_loss_clip": 1.0546639, + "balance_loss_mlp": 1.02059841, + "epoch": 0.37168194799338644, + "flos": 24133479166080.0, + "grad_norm": 1.8553958297223865, + "language_loss": 0.79105926, + "learning_rate": 2.8953096132326343e-06, + "loss": 0.81277162, + "num_input_tokens_seen": 132865790, + "router_z_loss_clip": 0.80957031, + "router_z_loss_mlp": 0.15093994, + "step": 6182, + "time_per_iteration": 2.518470048904419 + }, + { + "auxiliary_loss_clip": 0.01102001, + "auxiliary_loss_mlp": 0.0101096, + "balance_loss_clip": 1.07110238, + "balance_loss_mlp": 1.0091095, + "epoch": 0.3717420712460544, + "flos": 67408926900480.0, + "grad_norm": 0.7846942163614989, + "language_loss": 0.57515121, + "learning_rate": 2.894961337112362e-06, + "loss": 0.59628081, + "num_input_tokens_seen": 132921775, + "router_z_loss_clip": 0.30908203, + "router_z_loss_mlp": 0.01849365, + "step": 6183, + "time_per_iteration": 3.061079978942871 + }, + { + "auxiliary_loss_clip": 0.0114012, + "auxiliary_loss_mlp": 0.01040493, + "balance_loss_clip": 1.05473328, + "balance_loss_mlp": 1.02453053, + "epoch": 0.37180219449872237, + "flos": 22376908362240.0, + "grad_norm": 1.7860367833518869, + "language_loss": 0.7719866, + "learning_rate": 2.894613027055066e-06, + "loss": 0.79379272, + "num_input_tokens_seen": 132941060, + "router_z_loss_clip": 0.85351562, + "router_z_loss_mlp": 0.15966797, + "step": 6184, + "time_per_iteration": 2.496931314468384 + }, + { + "auxiliary_loss_clip": 0.0113383, + "auxiliary_loss_mlp": 0.0103763, + "balance_loss_clip": 1.05440068, + "balance_loss_mlp": 1.02207351, + "epoch": 0.37186231775139034, + "flos": 21869885934720.0, + "grad_norm": 1.8659488018234491, + "language_loss": 0.72758621, + "learning_rate": 2.894264683073954e-06, + "loss": 0.74930084, + "num_input_tokens_seen": 132961850, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.15563965, + "step": 6185, + "time_per_iteration": 2.539966344833374 + }, + { + "auxiliary_loss_clip": 0.01135364, + "auxiliary_loss_mlp": 0.01030874, + "balance_loss_clip": 1.05571437, + "balance_loss_mlp": 1.01618195, + "epoch": 0.3719224410040583, + "flos": 22415225195520.0, + "grad_norm": 1.506422262113325, + "language_loss": 0.77100223, + "learning_rate": 2.8939163051822363e-06, + "loss": 0.79266459, + "num_input_tokens_seen": 132981625, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.14691162, + "step": 6186, + "time_per_iteration": 2.5275304317474365 + }, + { + "auxiliary_loss_clip": 0.01143482, + "auxiliary_loss_mlp": 0.0104513, + "balance_loss_clip": 1.05608177, + "balance_loss_mlp": 1.02921557, + "epoch": 0.37198256425672627, + "flos": 25151223121920.0, + "grad_norm": 2.4683153750915516, + "language_loss": 0.83386225, + "learning_rate": 2.8935678933931224e-06, + "loss": 0.85574836, + "num_input_tokens_seen": 133001225, + "router_z_loss_clip": 0.87451172, + "router_z_loss_mlp": 0.15905762, + "step": 6187, + "time_per_iteration": 2.4951095581054688 + }, + { + "auxiliary_loss_clip": 0.01145129, + "auxiliary_loss_mlp": 0.01037851, + "balance_loss_clip": 1.0628109, + "balance_loss_mlp": 1.02392697, + "epoch": 0.37204268750939423, + "flos": 21138313633920.0, + "grad_norm": 1.8699847369569385, + "language_loss": 0.84970868, + "learning_rate": 2.893219447719824e-06, + "loss": 0.87153846, + "num_input_tokens_seen": 133018820, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.13922119, + "step": 6188, + "time_per_iteration": 2.456242084503174 + }, + { + "auxiliary_loss_clip": 0.01140474, + "auxiliary_loss_mlp": 0.01031988, + "balance_loss_clip": 1.059201, + "balance_loss_mlp": 1.0167408, + "epoch": 0.37210281076206225, + "flos": 21506829217920.0, + "grad_norm": 1.6872150032510842, + "language_loss": 0.65492189, + "learning_rate": 2.8928709681755548e-06, + "loss": 0.67664647, + "num_input_tokens_seen": 133040205, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.15246582, + "step": 6189, + "time_per_iteration": 2.495457649230957 + }, + { + "auxiliary_loss_clip": 0.01149568, + "auxiliary_loss_mlp": 0.01039756, + "balance_loss_clip": 1.06450224, + "balance_loss_mlp": 1.02394903, + "epoch": 0.3721629340147302, + "flos": 17347835116800.0, + "grad_norm": 2.1368883372628287, + "language_loss": 0.84163713, + "learning_rate": 2.8925224547735293e-06, + "loss": 0.8635304, + "num_input_tokens_seen": 133058095, + "router_z_loss_clip": 0.85058594, + "router_z_loss_mlp": 0.15814209, + "step": 6190, + "time_per_iteration": 3.898698329925537 + }, + { + "auxiliary_loss_clip": 0.01143853, + "auxiliary_loss_mlp": 0.01032123, + "balance_loss_clip": 1.05869055, + "balance_loss_mlp": 1.01735282, + "epoch": 0.3722230572673982, + "flos": 16432400073600.0, + "grad_norm": 3.354240277654322, + "language_loss": 0.88961017, + "learning_rate": 2.8921739075269633e-06, + "loss": 0.91136992, + "num_input_tokens_seen": 133071530, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.14782715, + "step": 6191, + "time_per_iteration": 2.417283535003662 + }, + { + "auxiliary_loss_clip": 0.01139035, + "auxiliary_loss_mlp": 0.01034689, + "balance_loss_clip": 1.05292559, + "balance_loss_mlp": 1.0176661, + "epoch": 0.37228318052006615, + "flos": 22674716023680.0, + "grad_norm": 1.5313846446881696, + "language_loss": 0.73783934, + "learning_rate": 2.891825326449073e-06, + "loss": 0.75957656, + "num_input_tokens_seen": 133091410, + "router_z_loss_clip": 0.86181641, + "router_z_loss_mlp": 0.17034912, + "step": 6192, + "time_per_iteration": 2.4889447689056396 + }, + { + "auxiliary_loss_clip": 0.01129068, + "auxiliary_loss_mlp": 0.01033763, + "balance_loss_clip": 1.0475173, + "balance_loss_mlp": 1.019804, + "epoch": 0.3723433037727341, + "flos": 25265491263360.0, + "grad_norm": 11.74170244133507, + "language_loss": 0.79698831, + "learning_rate": 2.8914767115530766e-06, + "loss": 0.81861663, + "num_input_tokens_seen": 133110365, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.13964844, + "step": 6193, + "time_per_iteration": 2.5309338569641113 + }, + { + "auxiliary_loss_clip": 0.01137177, + "auxiliary_loss_mlp": 0.01039959, + "balance_loss_clip": 1.05293298, + "balance_loss_mlp": 1.02494502, + "epoch": 0.3724034270254021, + "flos": 10524664333440.0, + "grad_norm": 1.8132420654356336, + "language_loss": 0.84102333, + "learning_rate": 2.891128062852194e-06, + "loss": 0.8627947, + "num_input_tokens_seen": 133128255, + "router_z_loss_clip": 0.84179688, + "router_z_loss_mlp": 0.15020752, + "step": 6194, + "time_per_iteration": 2.669740676879883 + }, + { + "auxiliary_loss_clip": 0.01144854, + "auxiliary_loss_mlp": 0.01036649, + "balance_loss_clip": 1.05778992, + "balance_loss_mlp": 1.02154505, + "epoch": 0.37246355027807004, + "flos": 20266223328000.0, + "grad_norm": 2.2234288978502184, + "language_loss": 0.77571326, + "learning_rate": 2.890779380359646e-06, + "loss": 0.79752827, + "num_input_tokens_seen": 133143975, + "router_z_loss_clip": 0.87060547, + "router_z_loss_mlp": 0.15112305, + "step": 6195, + "time_per_iteration": 2.55413556098938 + }, + { + "auxiliary_loss_clip": 0.01137502, + "auxiliary_loss_mlp": 0.0103348, + "balance_loss_clip": 1.05629444, + "balance_loss_mlp": 1.01868653, + "epoch": 0.372523673530738, + "flos": 19500571998720.0, + "grad_norm": 1.5301872582560498, + "language_loss": 0.79193747, + "learning_rate": 2.890430664088655e-06, + "loss": 0.81364727, + "num_input_tokens_seen": 133162935, + "router_z_loss_clip": 0.81201172, + "router_z_loss_mlp": 0.14782715, + "step": 6196, + "time_per_iteration": 2.4963958263397217 + }, + { + "auxiliary_loss_clip": 0.01142271, + "auxiliary_loss_mlp": 0.01050611, + "balance_loss_clip": 1.05573034, + "balance_loss_mlp": 1.03522098, + "epoch": 0.372583796783406, + "flos": 16764250849920.0, + "grad_norm": 1.9847636787844083, + "language_loss": 0.83191419, + "learning_rate": 2.890081914052443e-06, + "loss": 0.85384297, + "num_input_tokens_seen": 133181180, + "router_z_loss_clip": 0.86572266, + "router_z_loss_mlp": 0.15393066, + "step": 6197, + "time_per_iteration": 2.575139284133911 + }, + { + "auxiliary_loss_clip": 0.01154601, + "auxiliary_loss_mlp": 0.01035319, + "balance_loss_clip": 1.06899047, + "balance_loss_mlp": 1.01991773, + "epoch": 0.37264392003607394, + "flos": 22637979388800.0, + "grad_norm": 1.9023418438449096, + "language_loss": 0.64861256, + "learning_rate": 2.889733130264237e-06, + "loss": 0.67051172, + "num_input_tokens_seen": 133199615, + "router_z_loss_clip": 0.85644531, + "router_z_loss_mlp": 0.15405273, + "step": 6198, + "time_per_iteration": 3.8599853515625 + }, + { + "auxiliary_loss_clip": 0.0113534, + "auxiliary_loss_mlp": 0.01045003, + "balance_loss_clip": 1.05427706, + "balance_loss_mlp": 1.03096044, + "epoch": 0.3727040432887419, + "flos": 19973120348160.0, + "grad_norm": 1.4596996687031634, + "language_loss": 0.74213719, + "learning_rate": 2.889384312737261e-06, + "loss": 0.76394057, + "num_input_tokens_seen": 133219650, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.14031982, + "step": 6199, + "time_per_iteration": 2.505981683731079 + }, + { + "auxiliary_loss_clip": 0.01136212, + "auxiliary_loss_mlp": 0.01036447, + "balance_loss_clip": 1.05393505, + "balance_loss_mlp": 1.02223146, + "epoch": 0.37276416654140987, + "flos": 63899122279680.0, + "grad_norm": 1.8181224517907866, + "language_loss": 0.8040936, + "learning_rate": 2.889035461484742e-06, + "loss": 0.82582021, + "num_input_tokens_seen": 133245675, + "router_z_loss_clip": 0.82275391, + "router_z_loss_mlp": 0.14215088, + "step": 6200, + "time_per_iteration": 2.896754026412964 + }, + { + "auxiliary_loss_clip": 0.0113814, + "auxiliary_loss_mlp": 0.01040642, + "balance_loss_clip": 1.05659199, + "balance_loss_mlp": 1.02591968, + "epoch": 0.37282428979407783, + "flos": 39785970211200.0, + "grad_norm": 1.8185049017029695, + "language_loss": 0.60051054, + "learning_rate": 2.88868657651991e-06, + "loss": 0.6222983, + "num_input_tokens_seen": 133266905, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.14697266, + "step": 6201, + "time_per_iteration": 2.6587226390838623 + }, + { + "auxiliary_loss_clip": 0.01143781, + "auxiliary_loss_mlp": 0.01040101, + "balance_loss_clip": 1.05936348, + "balance_loss_mlp": 1.02487767, + "epoch": 0.37288441304674586, + "flos": 22709046447360.0, + "grad_norm": 1.8695020141827676, + "language_loss": 0.73087823, + "learning_rate": 2.8883376578559934e-06, + "loss": 0.75271702, + "num_input_tokens_seen": 133286865, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.15222168, + "step": 6202, + "time_per_iteration": 2.564011573791504 + }, + { + "auxiliary_loss_clip": 0.0113621, + "auxiliary_loss_mlp": 0.01038283, + "balance_loss_clip": 1.05460048, + "balance_loss_mlp": 1.02330995, + "epoch": 0.3729445362994138, + "flos": 18770292587520.0, + "grad_norm": 2.005095895736158, + "language_loss": 0.74302268, + "learning_rate": 2.8879887055062243e-06, + "loss": 0.76476765, + "num_input_tokens_seen": 133305295, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.14978027, + "step": 6203, + "time_per_iteration": 2.5005502700805664 + }, + { + "auxiliary_loss_clip": 0.01130057, + "auxiliary_loss_mlp": 0.01032109, + "balance_loss_clip": 1.05054748, + "balance_loss_mlp": 1.01922226, + "epoch": 0.3730046595520818, + "flos": 22456199635200.0, + "grad_norm": 1.6646806317351028, + "language_loss": 0.82157648, + "learning_rate": 2.8876397194838353e-06, + "loss": 0.84319818, + "num_input_tokens_seen": 133324625, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.12902832, + "step": 6204, + "time_per_iteration": 2.509221315383911 + }, + { + "auxiliary_loss_clip": 0.01141786, + "auxiliary_loss_mlp": 0.01038651, + "balance_loss_clip": 1.05752218, + "balance_loss_mlp": 1.02376187, + "epoch": 0.37306478280474975, + "flos": 24316372241280.0, + "grad_norm": 1.6367857238911188, + "language_loss": 0.75517297, + "learning_rate": 2.8872906998020577e-06, + "loss": 0.77697736, + "num_input_tokens_seen": 133344625, + "router_z_loss_clip": 0.84326172, + "router_z_loss_mlp": 0.14868164, + "step": 6205, + "time_per_iteration": 2.5336856842041016 + }, + { + "auxiliary_loss_clip": 0.01133181, + "auxiliary_loss_mlp": 0.0103914, + "balance_loss_clip": 1.05181551, + "balance_loss_mlp": 1.02351797, + "epoch": 0.3731249060574177, + "flos": 15815167741440.0, + "grad_norm": 1.8553685603197618, + "language_loss": 0.78176308, + "learning_rate": 2.886941646474128e-06, + "loss": 0.80348623, + "num_input_tokens_seen": 133363605, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.15637207, + "step": 6206, + "time_per_iteration": 2.47904634475708 + }, + { + "auxiliary_loss_clip": 0.01146172, + "auxiliary_loss_mlp": 0.01041076, + "balance_loss_clip": 1.0611223, + "balance_loss_mlp": 1.02516127, + "epoch": 0.3731850293100857, + "flos": 19828077229440.0, + "grad_norm": 2.717782057553567, + "language_loss": 0.93435103, + "learning_rate": 2.886592559513283e-06, + "loss": 0.95622349, + "num_input_tokens_seen": 133379405, + "router_z_loss_clip": 0.85107422, + "router_z_loss_mlp": 0.15905762, + "step": 6207, + "time_per_iteration": 2.4422433376312256 + }, + { + "auxiliary_loss_clip": 0.01137716, + "auxiliary_loss_mlp": 0.01028925, + "balance_loss_clip": 1.05463219, + "balance_loss_mlp": 1.01573455, + "epoch": 0.37324515256275365, + "flos": 19062354072960.0, + "grad_norm": 6.022335097681071, + "language_loss": 0.83167493, + "learning_rate": 2.886243438932759e-06, + "loss": 0.85334134, + "num_input_tokens_seen": 133397585, + "router_z_loss_clip": 0.83056641, + "router_z_loss_mlp": 0.13195801, + "step": 6208, + "time_per_iteration": 4.0829668045043945 + }, + { + "auxiliary_loss_clip": 0.01135826, + "auxiliary_loss_mlp": 0.01035136, + "balance_loss_clip": 1.05332255, + "balance_loss_mlp": 1.01850593, + "epoch": 0.3733052758154216, + "flos": 20704333512960.0, + "grad_norm": 2.0213616314336305, + "language_loss": 0.73479414, + "learning_rate": 2.8858942847457953e-06, + "loss": 0.75650382, + "num_input_tokens_seen": 133415365, + "router_z_loss_clip": 0.82421875, + "router_z_loss_mlp": 0.16625977, + "step": 6209, + "time_per_iteration": 2.5030622482299805 + }, + { + "auxiliary_loss_clip": 0.0113212, + "auxiliary_loss_mlp": 0.01035041, + "balance_loss_clip": 1.05128551, + "balance_loss_mlp": 1.01924014, + "epoch": 0.3733653990680896, + "flos": 20193504243840.0, + "grad_norm": 1.4637736696397918, + "language_loss": 0.70267928, + "learning_rate": 2.8855450969656305e-06, + "loss": 0.72435087, + "num_input_tokens_seen": 133435700, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.15783691, + "step": 6210, + "time_per_iteration": 2.503016233444214 + }, + { + "auxiliary_loss_clip": 0.01138753, + "auxiliary_loss_mlp": 0.0103309, + "balance_loss_clip": 1.05388534, + "balance_loss_mlp": 1.01706839, + "epoch": 0.37342552232075754, + "flos": 20339660684160.0, + "grad_norm": 1.68406822345081, + "language_loss": 0.77944076, + "learning_rate": 2.8851958756055073e-06, + "loss": 0.80115926, + "num_input_tokens_seen": 133455180, + "router_z_loss_clip": 0.84912109, + "router_z_loss_mlp": 0.16027832, + "step": 6211, + "time_per_iteration": 2.4802234172821045 + }, + { + "auxiliary_loss_clip": 0.01138844, + "auxiliary_loss_mlp": 0.01038972, + "balance_loss_clip": 1.05471969, + "balance_loss_mlp": 1.02393985, + "epoch": 0.3734856455734255, + "flos": 35517879527040.0, + "grad_norm": 1.8830176791173316, + "language_loss": 0.73483431, + "learning_rate": 2.884846620678668e-06, + "loss": 0.75661242, + "num_input_tokens_seen": 133476715, + "router_z_loss_clip": 0.84130859, + "router_z_loss_mlp": 0.15020752, + "step": 6212, + "time_per_iteration": 2.622039794921875 + }, + { + "auxiliary_loss_clip": 0.01146237, + "auxiliary_loss_mlp": 0.01049948, + "balance_loss_clip": 1.05672669, + "balance_loss_mlp": 1.03218579, + "epoch": 0.37354576882609347, + "flos": 21142300043520.0, + "grad_norm": 1.952614629853574, + "language_loss": 0.82227921, + "learning_rate": 2.884497332198356e-06, + "loss": 0.84424102, + "num_input_tokens_seen": 133494550, + "router_z_loss_clip": 0.89599609, + "router_z_loss_mlp": 0.1776123, + "step": 6213, + "time_per_iteration": 2.4507253170013428 + }, + { + "auxiliary_loss_clip": 0.01137723, + "auxiliary_loss_mlp": 0.01046511, + "balance_loss_clip": 1.05480587, + "balance_loss_mlp": 1.02935708, + "epoch": 0.37360589207876144, + "flos": 21506793304320.0, + "grad_norm": 2.169928293937184, + "language_loss": 0.786134, + "learning_rate": 2.8841480101778167e-06, + "loss": 0.80797637, + "num_input_tokens_seen": 133512640, + "router_z_loss_clip": 0.82910156, + "router_z_loss_mlp": 0.17175293, + "step": 6214, + "time_per_iteration": 2.445815324783325 + }, + { + "auxiliary_loss_clip": 0.01135867, + "auxiliary_loss_mlp": 0.01048197, + "balance_loss_clip": 1.05429995, + "balance_loss_mlp": 1.03268242, + "epoch": 0.37366601533142946, + "flos": 38435800861440.0, + "grad_norm": 1.73910350684607, + "language_loss": 0.85143417, + "learning_rate": 2.883798654630296e-06, + "loss": 0.8732748, + "num_input_tokens_seen": 133535540, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.15509033, + "step": 6215, + "time_per_iteration": 3.972517490386963 + }, + { + "auxiliary_loss_clip": 0.01144197, + "auxiliary_loss_mlp": 0.01045189, + "balance_loss_clip": 1.05796289, + "balance_loss_mlp": 1.02890491, + "epoch": 0.3737261385840974, + "flos": 18441171244800.0, + "grad_norm": 1.944844508586362, + "language_loss": 0.6808641, + "learning_rate": 2.8834492655690423e-06, + "loss": 0.70275795, + "num_input_tokens_seen": 133555795, + "router_z_loss_clip": 0.86328125, + "router_z_loss_mlp": 0.1628418, + "step": 6216, + "time_per_iteration": 2.4592111110687256 + }, + { + "auxiliary_loss_clip": 0.01136916, + "auxiliary_loss_mlp": 0.01039148, + "balance_loss_clip": 1.05325079, + "balance_loss_mlp": 1.02373469, + "epoch": 0.3737862618367654, + "flos": 22929861306240.0, + "grad_norm": 2.5789827834757255, + "language_loss": 0.65864885, + "learning_rate": 2.883099843007303e-06, + "loss": 0.68040949, + "num_input_tokens_seen": 133575905, + "router_z_loss_clip": 0.83789062, + "router_z_loss_mlp": 0.15405273, + "step": 6217, + "time_per_iteration": 2.50559401512146 + }, + { + "auxiliary_loss_clip": 0.01145693, + "auxiliary_loss_mlp": 0.01042653, + "balance_loss_clip": 1.05872619, + "balance_loss_mlp": 1.02677393, + "epoch": 0.37384638508943335, + "flos": 15409664127360.0, + "grad_norm": 1.7218171870426722, + "language_loss": 0.80411923, + "learning_rate": 2.88275038695833e-06, + "loss": 0.82600272, + "num_input_tokens_seen": 133592585, + "router_z_loss_clip": 0.86962891, + "router_z_loss_mlp": 0.15887451, + "step": 6218, + "time_per_iteration": 2.413288116455078 + }, + { + "auxiliary_loss_clip": 0.01139139, + "auxiliary_loss_mlp": 0.01036436, + "balance_loss_clip": 1.05895531, + "balance_loss_mlp": 1.02248883, + "epoch": 0.3739065083421013, + "flos": 24280820755200.0, + "grad_norm": 1.363758878539105, + "language_loss": 0.78655535, + "learning_rate": 2.8824008974353736e-06, + "loss": 0.8083111, + "num_input_tokens_seen": 133615070, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.13952637, + "step": 6219, + "time_per_iteration": 2.499311685562134 + }, + { + "auxiliary_loss_clip": 0.01151374, + "auxiliary_loss_mlp": 0.01041612, + "balance_loss_clip": 1.06964481, + "balance_loss_mlp": 1.02700329, + "epoch": 0.3739666315947693, + "flos": 23002831785600.0, + "grad_norm": 1.6301706539033058, + "language_loss": 0.77020937, + "learning_rate": 2.8820513744516866e-06, + "loss": 0.79213923, + "num_input_tokens_seen": 133633490, + "router_z_loss_clip": 0.81738281, + "router_z_loss_mlp": 0.14605713, + "step": 6220, + "time_per_iteration": 2.5872249603271484 + }, + { + "auxiliary_loss_clip": 0.01140273, + "auxiliary_loss_mlp": 0.01042797, + "balance_loss_clip": 1.0576973, + "balance_loss_mlp": 1.02778304, + "epoch": 0.37402675484743725, + "flos": 19391116279680.0, + "grad_norm": 1.6063431210965122, + "language_loss": 0.8313024, + "learning_rate": 2.8817018180205235e-06, + "loss": 0.85313308, + "num_input_tokens_seen": 133653425, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.15014648, + "step": 6221, + "time_per_iteration": 2.5826642513275146 + }, + { + "auxiliary_loss_clip": 0.01134546, + "auxiliary_loss_mlp": 0.01042352, + "balance_loss_clip": 1.05343485, + "balance_loss_mlp": 1.02783835, + "epoch": 0.3740868781001052, + "flos": 17126158331520.0, + "grad_norm": 1.7059029335674816, + "language_loss": 0.7617262, + "learning_rate": 2.8813522281551387e-06, + "loss": 0.78349519, + "num_input_tokens_seen": 133670220, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.14508057, + "step": 6222, + "time_per_iteration": 2.4109878540039062 + }, + { + "auxiliary_loss_clip": 0.01140423, + "auxiliary_loss_mlp": 0.01034181, + "balance_loss_clip": 1.05860424, + "balance_loss_mlp": 1.01889896, + "epoch": 0.3741470013527732, + "flos": 20043505048320.0, + "grad_norm": 1.761308858808159, + "language_loss": 0.70743746, + "learning_rate": 2.881002604868789e-06, + "loss": 0.72918355, + "num_input_tokens_seen": 133688910, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.15283203, + "step": 6223, + "time_per_iteration": 2.5148017406463623 + }, + { + "auxiliary_loss_clip": 0.01140361, + "auxiliary_loss_mlp": 0.01044974, + "balance_loss_clip": 1.05730987, + "balance_loss_mlp": 1.02813017, + "epoch": 0.37420712460544114, + "flos": 36897279569280.0, + "grad_norm": 1.9711403902463196, + "language_loss": 0.68744028, + "learning_rate": 2.8806529481747325e-06, + "loss": 0.7092936, + "num_input_tokens_seen": 133708690, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.16845703, + "step": 6224, + "time_per_iteration": 2.6604907512664795 + }, + { + "auxiliary_loss_clip": 0.01136588, + "auxiliary_loss_mlp": 0.0103817, + "balance_loss_clip": 1.05539632, + "balance_loss_mlp": 1.02295923, + "epoch": 0.3742672478581091, + "flos": 22201198007040.0, + "grad_norm": 1.6533731842281851, + "language_loss": 0.70159662, + "learning_rate": 2.880303258086228e-06, + "loss": 0.72334421, + "num_input_tokens_seen": 133728095, + "router_z_loss_clip": 0.81201172, + "router_z_loss_mlp": 0.15209961, + "step": 6225, + "time_per_iteration": 2.471632480621338 + }, + { + "auxiliary_loss_clip": 0.01136194, + "auxiliary_loss_mlp": 0.01045061, + "balance_loss_clip": 1.05344892, + "balance_loss_mlp": 1.02876556, + "epoch": 0.3743273711107771, + "flos": 24681547860480.0, + "grad_norm": 1.9245179341722929, + "language_loss": 0.7887361, + "learning_rate": 2.879953534616536e-06, + "loss": 0.81054866, + "num_input_tokens_seen": 133745590, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.16308594, + "step": 6226, + "time_per_iteration": 2.464395761489868 + }, + { + "auxiliary_loss_clip": 0.01139043, + "auxiliary_loss_mlp": 0.01039199, + "balance_loss_clip": 1.05414975, + "balance_loss_mlp": 1.02347493, + "epoch": 0.37438749436344504, + "flos": 24459619680000.0, + "grad_norm": 2.0208472862176543, + "language_loss": 0.67983937, + "learning_rate": 2.879603777778917e-06, + "loss": 0.70162177, + "num_input_tokens_seen": 133766155, + "router_z_loss_clip": 0.84912109, + "router_z_loss_mlp": 0.1572876, + "step": 6227, + "time_per_iteration": 2.544893980026245 + }, + { + "auxiliary_loss_clip": 0.01137637, + "auxiliary_loss_mlp": 0.01034925, + "balance_loss_clip": 1.05711651, + "balance_loss_mlp": 1.0202626, + "epoch": 0.374447617616113, + "flos": 21798747048960.0, + "grad_norm": 1.7208800983534298, + "language_loss": 0.82966542, + "learning_rate": 2.879253987586635e-06, + "loss": 0.85139108, + "num_input_tokens_seen": 133783185, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.14660645, + "step": 6228, + "time_per_iteration": 2.4880805015563965 + }, + { + "auxiliary_loss_clip": 0.01135097, + "auxiliary_loss_mlp": 0.01039981, + "balance_loss_clip": 1.05239081, + "balance_loss_mlp": 1.02447236, + "epoch": 0.374507740868781, + "flos": 17968191932160.0, + "grad_norm": 1.8032470437872954, + "language_loss": 0.74741834, + "learning_rate": 2.8789041640529535e-06, + "loss": 0.76916915, + "num_input_tokens_seen": 133800975, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.15515137, + "step": 6229, + "time_per_iteration": 2.44742751121521 + }, + { + "auxiliary_loss_clip": 0.01138867, + "auxiliary_loss_mlp": 0.01047048, + "balance_loss_clip": 1.05120254, + "balance_loss_mlp": 1.03088379, + "epoch": 0.374567864121449, + "flos": 16105828596480.0, + "grad_norm": 1.88836561416239, + "language_loss": 0.83366579, + "learning_rate": 2.8785543071911383e-06, + "loss": 0.85552502, + "num_input_tokens_seen": 133818020, + "router_z_loss_clip": 0.87695312, + "router_z_loss_mlp": 0.16162109, + "step": 6230, + "time_per_iteration": 2.460150718688965 + }, + { + "auxiliary_loss_clip": 0.01140926, + "auxiliary_loss_mlp": 0.01040754, + "balance_loss_clip": 1.05772138, + "balance_loss_mlp": 1.02497113, + "epoch": 0.37462798737411696, + "flos": 25773160135680.0, + "grad_norm": 1.7611355985074495, + "language_loss": 0.7320714, + "learning_rate": 2.878204417014456e-06, + "loss": 0.75388819, + "num_input_tokens_seen": 133840690, + "router_z_loss_clip": 0.83251953, + "router_z_loss_mlp": 0.15783691, + "step": 6231, + "time_per_iteration": 2.5453543663024902 + }, + { + "auxiliary_loss_clip": 0.01154131, + "auxiliary_loss_mlp": 0.01043103, + "balance_loss_clip": 1.06665337, + "balance_loss_mlp": 1.02736187, + "epoch": 0.3746881106267849, + "flos": 16654507822080.0, + "grad_norm": 2.597218300517543, + "language_loss": 0.73987389, + "learning_rate": 2.8778544935361735e-06, + "loss": 0.76184618, + "num_input_tokens_seen": 133858350, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.15740967, + "step": 6232, + "time_per_iteration": 2.4598031044006348 + }, + { + "auxiliary_loss_clip": 0.01149655, + "auxiliary_loss_mlp": 0.01031031, + "balance_loss_clip": 1.06391025, + "balance_loss_mlp": 1.01584375, + "epoch": 0.3747482338794529, + "flos": 26177981391360.0, + "grad_norm": 1.9713296714573392, + "language_loss": 0.77074558, + "learning_rate": 2.877504536769561e-06, + "loss": 0.79255241, + "num_input_tokens_seen": 133879775, + "router_z_loss_clip": 0.85839844, + "router_z_loss_mlp": 0.1517334, + "step": 6233, + "time_per_iteration": 2.5224838256835938 + }, + { + "auxiliary_loss_clip": 0.01140634, + "auxiliary_loss_mlp": 0.01038279, + "balance_loss_clip": 1.05712891, + "balance_loss_mlp": 1.02290094, + "epoch": 0.37480835713212085, + "flos": 12021061950720.0, + "grad_norm": 2.013882134117437, + "language_loss": 0.69250166, + "learning_rate": 2.8771545467278883e-06, + "loss": 0.7142908, + "num_input_tokens_seen": 133898295, + "router_z_loss_clip": 0.83447266, + "router_z_loss_mlp": 0.15374756, + "step": 6234, + "time_per_iteration": 4.113081216812134 + }, + { + "auxiliary_loss_clip": 0.01133086, + "auxiliary_loss_mlp": 0.01038278, + "balance_loss_clip": 1.05357003, + "balance_loss_mlp": 1.02458644, + "epoch": 0.3748684803847888, + "flos": 19679263182720.0, + "grad_norm": 1.9761958934757833, + "language_loss": 0.82499552, + "learning_rate": 2.8768045234244276e-06, + "loss": 0.84670919, + "num_input_tokens_seen": 133915230, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.13677979, + "step": 6235, + "time_per_iteration": 2.4745495319366455 + }, + { + "auxiliary_loss_clip": 0.01160264, + "auxiliary_loss_mlp": 0.01034596, + "balance_loss_clip": 1.07385743, + "balance_loss_mlp": 1.01980186, + "epoch": 0.3749286036374568, + "flos": 20521189042560.0, + "grad_norm": 1.8179930027391962, + "language_loss": 0.78082925, + "learning_rate": 2.8764544668724517e-06, + "loss": 0.80277777, + "num_input_tokens_seen": 133934110, + "router_z_loss_clip": 0.86376953, + "router_z_loss_mlp": 0.14782715, + "step": 6236, + "time_per_iteration": 2.5420339107513428 + }, + { + "auxiliary_loss_clip": 0.01141244, + "auxiliary_loss_mlp": 0.01042828, + "balance_loss_clip": 1.05517972, + "balance_loss_mlp": 1.02557874, + "epoch": 0.37498872689012475, + "flos": 20704620821760.0, + "grad_norm": 2.4679512890107578, + "language_loss": 0.73254508, + "learning_rate": 2.876104377085234e-06, + "loss": 0.75438577, + "num_input_tokens_seen": 133952395, + "router_z_loss_clip": 0.86132812, + "router_z_loss_mlp": 0.17236328, + "step": 6237, + "time_per_iteration": 2.492914915084839 + }, + { + "auxiliary_loss_clip": 0.01136441, + "auxiliary_loss_mlp": 0.01036173, + "balance_loss_clip": 1.05123293, + "balance_loss_mlp": 1.02055693, + "epoch": 0.3750488501427927, + "flos": 21574843620480.0, + "grad_norm": 1.9564623258814922, + "language_loss": 0.93401903, + "learning_rate": 2.8757542540760508e-06, + "loss": 0.95574516, + "num_input_tokens_seen": 133969635, + "router_z_loss_clip": 0.85253906, + "router_z_loss_mlp": 0.15612793, + "step": 6238, + "time_per_iteration": 2.4574923515319824 + }, + { + "auxiliary_loss_clip": 0.0113991, + "auxiliary_loss_mlp": 0.0103588, + "balance_loss_clip": 1.05719662, + "balance_loss_mlp": 1.01970887, + "epoch": 0.3751089733954607, + "flos": 15923869274880.0, + "grad_norm": 2.3208582407553555, + "language_loss": 0.70623267, + "learning_rate": 2.8754040978581777e-06, + "loss": 0.72799057, + "num_input_tokens_seen": 133987215, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.16149902, + "step": 6239, + "time_per_iteration": 2.4149227142333984 + }, + { + "auxiliary_loss_clip": 0.01151703, + "auxiliary_loss_mlp": 0.01038848, + "balance_loss_clip": 1.06390595, + "balance_loss_mlp": 1.02288544, + "epoch": 0.37516909664812864, + "flos": 36284644177920.0, + "grad_norm": 1.6208061157131903, + "language_loss": 0.65855539, + "learning_rate": 2.875053908444895e-06, + "loss": 0.68046093, + "num_input_tokens_seen": 134009250, + "router_z_loss_clip": 0.87841797, + "router_z_loss_mlp": 0.15966797, + "step": 6240, + "time_per_iteration": 2.586733818054199 + }, + { + "auxiliary_loss_clip": 0.01139071, + "auxiliary_loss_mlp": 0.01034677, + "balance_loss_clip": 1.05458641, + "balance_loss_mlp": 1.01964521, + "epoch": 0.3752292199007966, + "flos": 13515915283200.0, + "grad_norm": 2.3967364215022164, + "language_loss": 0.75637597, + "learning_rate": 2.8747036858494795e-06, + "loss": 0.77811342, + "num_input_tokens_seen": 134026875, + "router_z_loss_clip": 0.84570312, + "router_z_loss_mlp": 0.15032959, + "step": 6241, + "time_per_iteration": 2.431288719177246 + }, + { + "auxiliary_loss_clip": 0.01147923, + "auxiliary_loss_mlp": 0.01041244, + "balance_loss_clip": 1.06233597, + "balance_loss_mlp": 1.02450752, + "epoch": 0.3752893431534646, + "flos": 27198095644800.0, + "grad_norm": 2.2116140863239, + "language_loss": 0.83800274, + "learning_rate": 2.874353430085213e-06, + "loss": 0.85989439, + "num_input_tokens_seen": 134047185, + "router_z_loss_clip": 0.85693359, + "router_z_loss_mlp": 0.1673584, + "step": 6242, + "time_per_iteration": 3.926534414291382 + }, + { + "auxiliary_loss_clip": 0.01136617, + "auxiliary_loss_mlp": 0.01045534, + "balance_loss_clip": 1.05194831, + "balance_loss_mlp": 1.02965546, + "epoch": 0.3753494664061326, + "flos": 30007674581760.0, + "grad_norm": 2.065741954715924, + "language_loss": 0.68243647, + "learning_rate": 2.8740031411653766e-06, + "loss": 0.70425802, + "num_input_tokens_seen": 134067330, + "router_z_loss_clip": 0.84716797, + "router_z_loss_mlp": 0.15869141, + "step": 6243, + "time_per_iteration": 2.5275087356567383 + }, + { + "auxiliary_loss_clip": 0.01134355, + "auxiliary_loss_mlp": 0.01053638, + "balance_loss_clip": 1.05121386, + "balance_loss_mlp": 1.03398037, + "epoch": 0.37540958965880056, + "flos": 24461954064000.0, + "grad_norm": 1.7641169791317775, + "language_loss": 0.83733577, + "learning_rate": 2.8736528191032535e-06, + "loss": 0.85921574, + "num_input_tokens_seen": 134085525, + "router_z_loss_clip": 0.83105469, + "router_z_loss_mlp": 0.19677734, + "step": 6244, + "time_per_iteration": 2.49234676361084 + }, + { + "auxiliary_loss_clip": 0.01136523, + "auxiliary_loss_mlp": 0.01040414, + "balance_loss_clip": 1.05533266, + "balance_loss_mlp": 1.02335525, + "epoch": 0.3754697129114685, + "flos": 16508387295360.0, + "grad_norm": 2.634618197121922, + "language_loss": 0.83189672, + "learning_rate": 2.8733024639121277e-06, + "loss": 0.85366607, + "num_input_tokens_seen": 134101855, + "router_z_loss_clip": 0.81201172, + "router_z_loss_mlp": 0.17077637, + "step": 6245, + "time_per_iteration": 2.4664547443389893 + }, + { + "auxiliary_loss_clip": 0.01138657, + "auxiliary_loss_mlp": 0.01041844, + "balance_loss_clip": 1.05348492, + "balance_loss_mlp": 1.02563131, + "epoch": 0.3755298361641365, + "flos": 19390900798080.0, + "grad_norm": 2.2334337669610145, + "language_loss": 0.64247781, + "learning_rate": 2.8729520756052853e-06, + "loss": 0.6642828, + "num_input_tokens_seen": 134119360, + "router_z_loss_clip": 0.85107422, + "router_z_loss_mlp": 0.16210938, + "step": 6246, + "time_per_iteration": 2.467985153198242 + }, + { + "auxiliary_loss_clip": 0.01149899, + "auxiliary_loss_mlp": 0.01044757, + "balance_loss_clip": 1.05844641, + "balance_loss_mlp": 1.02788889, + "epoch": 0.37558995941680445, + "flos": 14720395069440.0, + "grad_norm": 2.052889887549674, + "language_loss": 0.74951679, + "learning_rate": 2.8726016541960124e-06, + "loss": 0.77146333, + "num_input_tokens_seen": 134137475, + "router_z_loss_clip": 0.91455078, + "router_z_loss_mlp": 0.16870117, + "step": 6247, + "time_per_iteration": 2.7570762634277344 + }, + { + "auxiliary_loss_clip": 0.01142824, + "auxiliary_loss_mlp": 0.01042413, + "balance_loss_clip": 1.05755353, + "balance_loss_mlp": 1.02672529, + "epoch": 0.3756500826694724, + "flos": 21689901861120.0, + "grad_norm": 4.4164190352194845, + "language_loss": 0.54870963, + "learning_rate": 2.872251199697598e-06, + "loss": 0.57056201, + "num_input_tokens_seen": 134154580, + "router_z_loss_clip": 0.85253906, + "router_z_loss_mlp": 0.15686035, + "step": 6248, + "time_per_iteration": 2.4927806854248047 + }, + { + "auxiliary_loss_clip": 0.01136744, + "auxiliary_loss_mlp": 0.01039674, + "balance_loss_clip": 1.05289936, + "balance_loss_mlp": 1.02417636, + "epoch": 0.3757102059221404, + "flos": 26505666190080.0, + "grad_norm": 2.0307125183949317, + "language_loss": 0.84405142, + "learning_rate": 2.8719007121233297e-06, + "loss": 0.86581564, + "num_input_tokens_seen": 134174285, + "router_z_loss_clip": 0.83837891, + "router_z_loss_mlp": 0.1550293, + "step": 6249, + "time_per_iteration": 2.6149404048919678 + }, + { + "auxiliary_loss_clip": 0.01141489, + "auxiliary_loss_mlp": 0.01029605, + "balance_loss_clip": 1.05880606, + "balance_loss_mlp": 1.01545525, + "epoch": 0.37577032917480835, + "flos": 37338083274240.0, + "grad_norm": 1.7100181448726663, + "language_loss": 0.67846578, + "learning_rate": 2.8715501914864993e-06, + "loss": 0.70017672, + "num_input_tokens_seen": 134195940, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.14154053, + "step": 6250, + "time_per_iteration": 2.657538890838623 + }, + { + "auxiliary_loss_clip": 0.01146078, + "auxiliary_loss_mlp": 0.01046703, + "balance_loss_clip": 1.05929136, + "balance_loss_mlp": 1.03264856, + "epoch": 0.3758304524274763, + "flos": 21908597817600.0, + "grad_norm": 2.316495402652882, + "language_loss": 0.77643788, + "learning_rate": 2.8711996378003987e-06, + "loss": 0.79836571, + "num_input_tokens_seen": 134212235, + "router_z_loss_clip": 0.86865234, + "router_z_loss_mlp": 0.140625, + "step": 6251, + "time_per_iteration": 2.4807918071746826 + }, + { + "auxiliary_loss_clip": 0.01149096, + "auxiliary_loss_mlp": 0.01035511, + "balance_loss_clip": 1.06553078, + "balance_loss_mlp": 1.02035904, + "epoch": 0.3758905756801443, + "flos": 36569343375360.0, + "grad_norm": 2.308445686097898, + "language_loss": 0.58322924, + "learning_rate": 2.8708490510783203e-06, + "loss": 0.6050753, + "num_input_tokens_seen": 134233810, + "router_z_loss_clip": 0.83691406, + "router_z_loss_mlp": 0.15136719, + "step": 6252, + "time_per_iteration": 4.008868932723999 + }, + { + "auxiliary_loss_clip": 0.01147914, + "auxiliary_loss_mlp": 0.01039089, + "balance_loss_clip": 1.06096959, + "balance_loss_mlp": 1.02304351, + "epoch": 0.37595069893281224, + "flos": 24528783317760.0, + "grad_norm": 2.882147446057616, + "language_loss": 0.89947253, + "learning_rate": 2.8704984313335584e-06, + "loss": 0.92134261, + "num_input_tokens_seen": 134252020, + "router_z_loss_clip": 0.86865234, + "router_z_loss_mlp": 0.16052246, + "step": 6253, + "time_per_iteration": 2.5346357822418213 + }, + { + "auxiliary_loss_clip": 0.01138708, + "auxiliary_loss_mlp": 0.01034662, + "balance_loss_clip": 1.05747294, + "balance_loss_mlp": 1.02005303, + "epoch": 0.3760108221854802, + "flos": 16435021766400.0, + "grad_norm": 1.9246056372312725, + "language_loss": 0.77171606, + "learning_rate": 2.8701477785794097e-06, + "loss": 0.79344976, + "num_input_tokens_seen": 134269495, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.14593506, + "step": 6254, + "time_per_iteration": 2.429716110229492 + }, + { + "auxiliary_loss_clip": 0.01136053, + "auxiliary_loss_mlp": 0.01040701, + "balance_loss_clip": 1.05149555, + "balance_loss_mlp": 1.02502537, + "epoch": 0.37607094543814823, + "flos": 13771742924160.0, + "grad_norm": 2.3373440234261404, + "language_loss": 0.62036878, + "learning_rate": 2.869797092829169e-06, + "loss": 0.64213634, + "num_input_tokens_seen": 134287035, + "router_z_loss_clip": 0.84375, + "router_z_loss_mlp": 0.15679932, + "step": 6255, + "time_per_iteration": 2.4781627655029297 + }, + { + "auxiliary_loss_clip": 0.01137997, + "auxiliary_loss_mlp": 0.01038725, + "balance_loss_clip": 1.0524596, + "balance_loss_mlp": 1.02293026, + "epoch": 0.3761310686908162, + "flos": 19857918453120.0, + "grad_norm": 3.9138752418351443, + "language_loss": 0.73758286, + "learning_rate": 2.869446374096135e-06, + "loss": 0.75935012, + "num_input_tokens_seen": 134304840, + "router_z_loss_clip": 0.85546875, + "router_z_loss_mlp": 0.15808105, + "step": 6256, + "time_per_iteration": 2.4478867053985596 + }, + { + "auxiliary_loss_clip": 0.01145967, + "auxiliary_loss_mlp": 0.01039018, + "balance_loss_clip": 1.06027412, + "balance_loss_mlp": 1.0226624, + "epoch": 0.37619119194348416, + "flos": 12750802657920.0, + "grad_norm": 1.8598847131215657, + "language_loss": 0.7043519, + "learning_rate": 2.8690956223936088e-06, + "loss": 0.72620171, + "num_input_tokens_seen": 134323180, + "router_z_loss_clip": 0.85644531, + "router_z_loss_mlp": 0.16345215, + "step": 6257, + "time_per_iteration": 2.552274227142334 + }, + { + "auxiliary_loss_clip": 0.01137482, + "auxiliary_loss_mlp": 0.01033277, + "balance_loss_clip": 1.05295181, + "balance_loss_mlp": 1.01885307, + "epoch": 0.3762513151961521, + "flos": 17530548624000.0, + "grad_norm": 1.911527159764755, + "language_loss": 0.84552187, + "learning_rate": 2.868744837734889e-06, + "loss": 0.86722952, + "num_input_tokens_seen": 134341390, + "router_z_loss_clip": 0.84521484, + "router_z_loss_mlp": 0.14428711, + "step": 6258, + "time_per_iteration": 2.4290449619293213 + }, + { + "auxiliary_loss_clip": 0.0114844, + "auxiliary_loss_mlp": 0.01041344, + "balance_loss_clip": 1.06314778, + "balance_loss_mlp": 1.02716959, + "epoch": 0.3763114384488201, + "flos": 23617406511360.0, + "grad_norm": 1.565094290699664, + "language_loss": 0.80768257, + "learning_rate": 2.868394020133277e-06, + "loss": 0.82958043, + "num_input_tokens_seen": 134360425, + "router_z_loss_clip": 0.85400391, + "router_z_loss_mlp": 0.14178467, + "step": 6259, + "time_per_iteration": 3.8606059551239014 + }, + { + "auxiliary_loss_clip": 0.01148433, + "auxiliary_loss_mlp": 0.0104665, + "balance_loss_clip": 1.05893755, + "balance_loss_mlp": 1.02977037, + "epoch": 0.37637156170148806, + "flos": 25406978935680.0, + "grad_norm": 2.237934902714874, + "language_loss": 0.71518654, + "learning_rate": 2.8680431696020783e-06, + "loss": 0.73713738, + "num_input_tokens_seen": 134379775, + "router_z_loss_clip": 0.89550781, + "router_z_loss_mlp": 0.16870117, + "step": 6260, + "time_per_iteration": 2.7145817279815674 + }, + { + "auxiliary_loss_clip": 0.01146025, + "auxiliary_loss_mlp": 0.01034769, + "balance_loss_clip": 1.0585444, + "balance_loss_mlp": 1.01861668, + "epoch": 0.376431684954156, + "flos": 23440906056960.0, + "grad_norm": 1.528545423515787, + "language_loss": 0.78534353, + "learning_rate": 2.867692286154594e-06, + "loss": 0.80715144, + "num_input_tokens_seen": 134400315, + "router_z_loss_clip": 0.87402344, + "router_z_loss_mlp": 0.16162109, + "step": 6261, + "time_per_iteration": 2.473079204559326 + }, + { + "auxiliary_loss_clip": 0.01142994, + "auxiliary_loss_mlp": 0.0104071, + "balance_loss_clip": 1.05675435, + "balance_loss_mlp": 1.02539206, + "epoch": 0.376491808206824, + "flos": 34204482725760.0, + "grad_norm": 1.6463833459456616, + "language_loss": 0.80621451, + "learning_rate": 2.867341369804132e-06, + "loss": 0.82805157, + "num_input_tokens_seen": 134422875, + "router_z_loss_clip": 0.86230469, + "router_z_loss_mlp": 0.15319824, + "step": 6262, + "time_per_iteration": 2.559083938598633 + }, + { + "auxiliary_loss_clip": 0.01144348, + "auxiliary_loss_mlp": 0.01036607, + "balance_loss_clip": 1.0627687, + "balance_loss_mlp": 1.02183723, + "epoch": 0.37655193145949195, + "flos": 35185669614720.0, + "grad_norm": 1.6537244635939525, + "language_loss": 0.80407524, + "learning_rate": 2.866990420563998e-06, + "loss": 0.82588482, + "num_input_tokens_seen": 134443025, + "router_z_loss_clip": 0.81494141, + "router_z_loss_mlp": 0.14764404, + "step": 6263, + "time_per_iteration": 2.598660707473755 + }, + { + "auxiliary_loss_clip": 0.01145349, + "auxiliary_loss_mlp": 0.01041358, + "balance_loss_clip": 1.05976701, + "balance_loss_mlp": 1.02575374, + "epoch": 0.3766120547121599, + "flos": 16761844638720.0, + "grad_norm": 2.9747845500661425, + "language_loss": 0.7943117, + "learning_rate": 2.866639438447501e-06, + "loss": 0.8161788, + "num_input_tokens_seen": 134460945, + "router_z_loss_clip": 0.85595703, + "router_z_loss_mlp": 0.15600586, + "step": 6264, + "time_per_iteration": 2.4735512733459473 + }, + { + "auxiliary_loss_clip": 0.01138096, + "auxiliary_loss_mlp": 0.01052757, + "balance_loss_clip": 1.0529561, + "balance_loss_mlp": 1.03718865, + "epoch": 0.3766721779648279, + "flos": 23550361776000.0, + "grad_norm": 3.2483640525772595, + "language_loss": 0.73930621, + "learning_rate": 2.8662884234679497e-06, + "loss": 0.76121473, + "num_input_tokens_seen": 134480440, + "router_z_loss_clip": 0.85205078, + "router_z_loss_mlp": 0.15576172, + "step": 6265, + "time_per_iteration": 2.4674594402313232 + }, + { + "auxiliary_loss_clip": 0.01140014, + "auxiliary_loss_mlp": 0.01037177, + "balance_loss_clip": 1.05598831, + "balance_loss_mlp": 1.02305079, + "epoch": 0.37673230121749585, + "flos": 29129191655040.0, + "grad_norm": 1.5705287982467284, + "language_loss": 0.68714905, + "learning_rate": 2.865937375638654e-06, + "loss": 0.70892096, + "num_input_tokens_seen": 134501110, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.14111328, + "step": 6266, + "time_per_iteration": 2.5304274559020996 + }, + { + "auxiliary_loss_clip": 0.01148554, + "auxiliary_loss_mlp": 0.01048698, + "balance_loss_clip": 1.05891407, + "balance_loss_mlp": 1.03234267, + "epoch": 0.3767924244701638, + "flos": 28146783703680.0, + "grad_norm": 6.386690832202742, + "language_loss": 0.6291095, + "learning_rate": 2.8655862949729264e-06, + "loss": 0.65108204, + "num_input_tokens_seen": 134522460, + "router_z_loss_clip": 0.89697266, + "router_z_loss_mlp": 0.16357422, + "step": 6267, + "time_per_iteration": 2.5004916191101074 + }, + { + "auxiliary_loss_clip": 0.01083235, + "auxiliary_loss_mlp": 0.01004062, + "balance_loss_clip": 1.05257201, + "balance_loss_mlp": 1.00218737, + "epoch": 0.37685254772283183, + "flos": 60797197526400.0, + "grad_norm": 0.7209332055003025, + "language_loss": 0.58917534, + "learning_rate": 2.8652351814840795e-06, + "loss": 0.61004835, + "num_input_tokens_seen": 134589545, + "router_z_loss_clip": 0.30615234, + "router_z_loss_mlp": 0.01872253, + "step": 6268, + "time_per_iteration": 3.2135283946990967 + }, + { + "auxiliary_loss_clip": 0.01134894, + "auxiliary_loss_mlp": 0.01039929, + "balance_loss_clip": 1.05061078, + "balance_loss_mlp": 1.02474201, + "epoch": 0.3769126709754998, + "flos": 26032543223040.0, + "grad_norm": 1.5579099826012188, + "language_loss": 0.65059519, + "learning_rate": 2.8648840351854283e-06, + "loss": 0.67234337, + "num_input_tokens_seen": 134610550, + "router_z_loss_clip": 0.84277344, + "router_z_loss_mlp": 0.15185547, + "step": 6269, + "time_per_iteration": 2.4801058769226074 + }, + { + "auxiliary_loss_clip": 0.01158446, + "auxiliary_loss_mlp": 0.01039792, + "balance_loss_clip": 1.07656693, + "balance_loss_mlp": 1.02369249, + "epoch": 0.37697279422816776, + "flos": 23579879777280.0, + "grad_norm": 1.5620990864220667, + "language_loss": 0.70814621, + "learning_rate": 2.8645328560902874e-06, + "loss": 0.73012859, + "num_input_tokens_seen": 134630485, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.16094971, + "step": 6270, + "time_per_iteration": 2.640460252761841 + }, + { + "auxiliary_loss_clip": 0.01072558, + "auxiliary_loss_mlp": 0.01003867, + "balance_loss_clip": 1.04326808, + "balance_loss_mlp": 1.00209951, + "epoch": 0.3770329174808357, + "flos": 64745935367040.0, + "grad_norm": 0.7125783460681338, + "language_loss": 0.56143647, + "learning_rate": 2.8641816442119746e-06, + "loss": 0.58220077, + "num_input_tokens_seen": 134693510, + "router_z_loss_clip": 0.29248047, + "router_z_loss_mlp": 0.01766968, + "step": 6271, + "time_per_iteration": 3.122831344604492 + }, + { + "auxiliary_loss_clip": 0.01139426, + "auxiliary_loss_mlp": 0.01039311, + "balance_loss_clip": 1.05644166, + "balance_loss_mlp": 1.02178764, + "epoch": 0.3770930407335037, + "flos": 21835304115840.0, + "grad_norm": 1.7634736926951697, + "language_loss": 0.79873359, + "learning_rate": 2.8638303995638066e-06, + "loss": 0.82052094, + "num_input_tokens_seen": 134713115, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.1751709, + "step": 6272, + "time_per_iteration": 2.524305820465088 + }, + { + "auxiliary_loss_clip": 0.01134473, + "auxiliary_loss_mlp": 0.01032794, + "balance_loss_clip": 1.05311608, + "balance_loss_mlp": 1.01850033, + "epoch": 0.37715316398617166, + "flos": 22747901984640.0, + "grad_norm": 1.9681763038949633, + "language_loss": 0.74244595, + "learning_rate": 2.863479122159103e-06, + "loss": 0.76411861, + "num_input_tokens_seen": 134732635, + "router_z_loss_clip": 0.81347656, + "router_z_loss_mlp": 0.14294434, + "step": 6273, + "time_per_iteration": 2.750018358230591 + }, + { + "auxiliary_loss_clip": 0.01135738, + "auxiliary_loss_mlp": 0.01040653, + "balance_loss_clip": 1.05418408, + "balance_loss_mlp": 1.02625275, + "epoch": 0.3772132872388396, + "flos": 18914581520640.0, + "grad_norm": 1.537844401386751, + "language_loss": 0.71926796, + "learning_rate": 2.8631278120111858e-06, + "loss": 0.74103189, + "num_input_tokens_seen": 134750695, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.14416504, + "step": 6274, + "time_per_iteration": 2.4890341758728027 + }, + { + "auxiliary_loss_clip": 0.01140002, + "auxiliary_loss_mlp": 0.01043172, + "balance_loss_clip": 1.05415392, + "balance_loss_mlp": 1.02834868, + "epoch": 0.3772734104915076, + "flos": 17346219004800.0, + "grad_norm": 2.1063526975418885, + "language_loss": 0.83972883, + "learning_rate": 2.8627764691333742e-06, + "loss": 0.86156058, + "num_input_tokens_seen": 134768935, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.14837646, + "step": 6275, + "time_per_iteration": 2.45776629447937 + }, + { + "auxiliary_loss_clip": 0.01132684, + "auxiliary_loss_mlp": 0.01033247, + "balance_loss_clip": 1.05224192, + "balance_loss_mlp": 1.01984239, + "epoch": 0.37733353374417555, + "flos": 32342370785280.0, + "grad_norm": 1.3923274473733322, + "language_loss": 0.7529068, + "learning_rate": 2.8624250935389935e-06, + "loss": 0.77456611, + "num_input_tokens_seen": 134791260, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.13421631, + "step": 6276, + "time_per_iteration": 2.5700902938842773 + }, + { + "auxiliary_loss_clip": 0.01138102, + "auxiliary_loss_mlp": 0.01035924, + "balance_loss_clip": 1.05467832, + "balance_loss_mlp": 1.02053952, + "epoch": 0.3773936569968435, + "flos": 23360681030400.0, + "grad_norm": 2.313401583824041, + "language_loss": 0.86137468, + "learning_rate": 2.862073685241366e-06, + "loss": 0.88311493, + "num_input_tokens_seen": 134808350, + "router_z_loss_clip": 0.83349609, + "router_z_loss_mlp": 0.15386963, + "step": 6277, + "time_per_iteration": 2.507476329803467 + }, + { + "auxiliary_loss_clip": 0.01141834, + "auxiliary_loss_mlp": 0.01036273, + "balance_loss_clip": 1.05846667, + "balance_loss_mlp": 1.02165186, + "epoch": 0.3774537802495115, + "flos": 21466788531840.0, + "grad_norm": 1.7985563147316963, + "language_loss": 0.78484964, + "learning_rate": 2.861722244253818e-06, + "loss": 0.80663067, + "num_input_tokens_seen": 134826005, + "router_z_loss_clip": 0.83300781, + "router_z_loss_mlp": 0.14605713, + "step": 6278, + "time_per_iteration": 4.0210490226745605 + }, + { + "auxiliary_loss_clip": 0.01140337, + "auxiliary_loss_mlp": 0.01040868, + "balance_loss_clip": 1.05416214, + "balance_loss_mlp": 1.0243938, + "epoch": 0.37751390350217945, + "flos": 24973717086720.0, + "grad_norm": 1.7465268563093583, + "language_loss": 0.8294642, + "learning_rate": 2.8613707705896767e-06, + "loss": 0.85127628, + "num_input_tokens_seen": 134844995, + "router_z_loss_clip": 0.86132812, + "router_z_loss_mlp": 0.16479492, + "step": 6279, + "time_per_iteration": 2.5214056968688965 + }, + { + "auxiliary_loss_clip": 0.01146596, + "auxiliary_loss_mlp": 0.01036955, + "balance_loss_clip": 1.06096232, + "balance_loss_mlp": 1.02251875, + "epoch": 0.3775740267548474, + "flos": 27819098904960.0, + "grad_norm": 1.9022955270920876, + "language_loss": 0.75000811, + "learning_rate": 2.861019264262269e-06, + "loss": 0.77184367, + "num_input_tokens_seen": 134865285, + "router_z_loss_clip": 0.85595703, + "router_z_loss_mlp": 0.14440918, + "step": 6280, + "time_per_iteration": 2.503361940383911 + }, + { + "auxiliary_loss_clip": 0.0113591, + "auxiliary_loss_mlp": 0.01039844, + "balance_loss_clip": 1.05559361, + "balance_loss_mlp": 1.02627766, + "epoch": 0.3776341500075154, + "flos": 22565224391040.0, + "grad_norm": 1.498640060081602, + "language_loss": 0.76071846, + "learning_rate": 2.8606677252849242e-06, + "loss": 0.78247607, + "num_input_tokens_seen": 134886535, + "router_z_loss_clip": 0.80273438, + "router_z_loss_mlp": 0.13562012, + "step": 6281, + "time_per_iteration": 2.5042176246643066 + }, + { + "auxiliary_loss_clip": 0.01134791, + "auxiliary_loss_mlp": 0.010348, + "balance_loss_clip": 1.05263698, + "balance_loss_mlp": 1.02023244, + "epoch": 0.3776942732601834, + "flos": 23077238808960.0, + "grad_norm": 1.3952786113357059, + "language_loss": 0.84106201, + "learning_rate": 2.860316153670974e-06, + "loss": 0.86275792, + "num_input_tokens_seen": 134907435, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.14569092, + "step": 6282, + "time_per_iteration": 2.495856523513794 + }, + { + "auxiliary_loss_clip": 0.01141278, + "auxiliary_loss_mlp": 0.01032291, + "balance_loss_clip": 1.05745387, + "balance_loss_mlp": 1.01760423, + "epoch": 0.37775439651285136, + "flos": 21724411852800.0, + "grad_norm": 3.2892069077021655, + "language_loss": 0.6959312, + "learning_rate": 2.8599645494337484e-06, + "loss": 0.71766686, + "num_input_tokens_seen": 134925360, + "router_z_loss_clip": 0.83837891, + "router_z_loss_mlp": 0.14685059, + "step": 6283, + "time_per_iteration": 2.449894428253174 + }, + { + "auxiliary_loss_clip": 0.01153688, + "auxiliary_loss_mlp": 0.01038302, + "balance_loss_clip": 1.07174063, + "balance_loss_mlp": 1.02325797, + "epoch": 0.37781451976551933, + "flos": 23987753688960.0, + "grad_norm": 4.937420272347549, + "language_loss": 0.76394343, + "learning_rate": 2.859612912586581e-06, + "loss": 0.78586334, + "num_input_tokens_seen": 134944205, + "router_z_loss_clip": 0.81982422, + "router_z_loss_mlp": 0.15032959, + "step": 6284, + "time_per_iteration": 2.4753262996673584 + }, + { + "auxiliary_loss_clip": 0.01144488, + "auxiliary_loss_mlp": 0.01036642, + "balance_loss_clip": 1.05712414, + "balance_loss_mlp": 1.02074599, + "epoch": 0.3778746430181873, + "flos": 13727967223680.0, + "grad_norm": 2.003606279718256, + "language_loss": 0.86208636, + "learning_rate": 2.8592612431428055e-06, + "loss": 0.88389766, + "num_input_tokens_seen": 134960255, + "router_z_loss_clip": 0.87304688, + "router_z_loss_mlp": 0.15887451, + "step": 6285, + "time_per_iteration": 2.418863534927368 + }, + { + "auxiliary_loss_clip": 0.01138262, + "auxiliary_loss_mlp": 0.01039488, + "balance_loss_clip": 1.054268, + "balance_loss_mlp": 1.0236094, + "epoch": 0.37793476627085526, + "flos": 19460495399040.0, + "grad_norm": 1.7870980242658, + "language_loss": 0.84802711, + "learning_rate": 2.858909541115758e-06, + "loss": 0.86980462, + "num_input_tokens_seen": 134978605, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.15875244, + "step": 6286, + "time_per_iteration": 3.9537439346313477 + }, + { + "auxiliary_loss_clip": 0.01132177, + "auxiliary_loss_mlp": 0.0104277, + "balance_loss_clip": 1.05065167, + "balance_loss_mlp": 1.02686763, + "epoch": 0.3779948895235232, + "flos": 10707018704640.0, + "grad_norm": 1.9651188107113644, + "language_loss": 0.82040739, + "learning_rate": 2.858557806518775e-06, + "loss": 0.84215689, + "num_input_tokens_seen": 134995020, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.15905762, + "step": 6287, + "time_per_iteration": 2.4454376697540283 + }, + { + "auxiliary_loss_clip": 0.01128218, + "auxiliary_loss_mlp": 0.01036726, + "balance_loss_clip": 1.04734945, + "balance_loss_mlp": 1.02120543, + "epoch": 0.3780550127761912, + "flos": 22310007281280.0, + "grad_norm": 3.087397838997304, + "language_loss": 0.73652321, + "learning_rate": 2.8582060393651927e-06, + "loss": 0.75817269, + "num_input_tokens_seen": 135012620, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.1552124, + "step": 6288, + "time_per_iteration": 2.46781849861145 + }, + { + "auxiliary_loss_clip": 0.01141174, + "auxiliary_loss_mlp": 0.01035765, + "balance_loss_clip": 1.05662084, + "balance_loss_mlp": 1.02044702, + "epoch": 0.37811513602885916, + "flos": 28950644125440.0, + "grad_norm": 2.261830413104225, + "language_loss": 0.75474823, + "learning_rate": 2.857854239668352e-06, + "loss": 0.77651763, + "num_input_tokens_seen": 135033365, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.15307617, + "step": 6289, + "time_per_iteration": 2.5212042331695557 + }, + { + "auxiliary_loss_clip": 0.01135973, + "auxiliary_loss_mlp": 0.01041208, + "balance_loss_clip": 1.05213213, + "balance_loss_mlp": 1.02526963, + "epoch": 0.3781752592815271, + "flos": 23112933949440.0, + "grad_norm": 1.7626017232416176, + "language_loss": 0.73805761, + "learning_rate": 2.857502407441593e-06, + "loss": 0.7598294, + "num_input_tokens_seen": 135052185, + "router_z_loss_clip": 0.83886719, + "router_z_loss_mlp": 0.15942383, + "step": 6290, + "time_per_iteration": 2.5068600177764893 + }, + { + "auxiliary_loss_clip": 0.01141334, + "auxiliary_loss_mlp": 0.0103803, + "balance_loss_clip": 1.05393147, + "balance_loss_mlp": 1.02123368, + "epoch": 0.3782353825341951, + "flos": 19755932762880.0, + "grad_norm": 2.6908985913657815, + "language_loss": 0.80084836, + "learning_rate": 2.8571505426982566e-06, + "loss": 0.82264203, + "num_input_tokens_seen": 135070425, + "router_z_loss_clip": 0.87451172, + "router_z_loss_mlp": 0.16784668, + "step": 6291, + "time_per_iteration": 2.4455459117889404 + }, + { + "auxiliary_loss_clip": 0.01147923, + "auxiliary_loss_mlp": 0.0103695, + "balance_loss_clip": 1.06302178, + "balance_loss_mlp": 1.02089274, + "epoch": 0.37829550578686305, + "flos": 22050839675520.0, + "grad_norm": 3.334490974279266, + "language_loss": 0.75611484, + "learning_rate": 2.8567986454516854e-06, + "loss": 0.77796358, + "num_input_tokens_seen": 135090525, + "router_z_loss_clip": 0.85058594, + "router_z_loss_mlp": 0.1607666, + "step": 6292, + "time_per_iteration": 2.531200885772705 + }, + { + "auxiliary_loss_clip": 0.01143658, + "auxiliary_loss_mlp": 0.01046845, + "balance_loss_clip": 1.05780554, + "balance_loss_mlp": 1.03050148, + "epoch": 0.378355629039531, + "flos": 16470357770880.0, + "grad_norm": 1.7851856966116268, + "language_loss": 0.6979835, + "learning_rate": 2.856446715715224e-06, + "loss": 0.71988857, + "num_input_tokens_seen": 135109575, + "router_z_loss_clip": 0.85839844, + "router_z_loss_mlp": 0.16357422, + "step": 6293, + "time_per_iteration": 2.5084238052368164 + }, + { + "auxiliary_loss_clip": 0.01137169, + "auxiliary_loss_mlp": 0.01050512, + "balance_loss_clip": 1.05170584, + "balance_loss_mlp": 1.03235674, + "epoch": 0.378415752292199, + "flos": 19974844200960.0, + "grad_norm": 1.9933662711123004, + "language_loss": 0.71967614, + "learning_rate": 2.8560947535022173e-06, + "loss": 0.74155295, + "num_input_tokens_seen": 135127000, + "router_z_loss_clip": 0.85546875, + "router_z_loss_mlp": 0.18164062, + "step": 6294, + "time_per_iteration": 2.506152391433716 + }, + { + "auxiliary_loss_clip": 0.01143993, + "auxiliary_loss_mlp": 0.01041018, + "balance_loss_clip": 1.05635393, + "balance_loss_mlp": 1.02454364, + "epoch": 0.378475875544867, + "flos": 14647388676480.0, + "grad_norm": 2.0611415925242396, + "language_loss": 0.82747048, + "learning_rate": 2.855742758826011e-06, + "loss": 0.84932059, + "num_input_tokens_seen": 135145285, + "router_z_loss_clip": 0.87695312, + "router_z_loss_mlp": 0.16473389, + "step": 6295, + "time_per_iteration": 2.504690170288086 + }, + { + "auxiliary_loss_clip": 0.01143703, + "auxiliary_loss_mlp": 0.0103281, + "balance_loss_clip": 1.05575001, + "balance_loss_mlp": 1.01696754, + "epoch": 0.37853599879753497, + "flos": 26650996617600.0, + "grad_norm": 1.7100711932591939, + "language_loss": 0.72175169, + "learning_rate": 2.8553907316999547e-06, + "loss": 0.7435168, + "num_input_tokens_seen": 135165240, + "router_z_loss_clip": 0.88037109, + "router_z_loss_mlp": 0.15844727, + "step": 6296, + "time_per_iteration": 4.0355448722839355 + }, + { + "auxiliary_loss_clip": 0.01148965, + "auxiliary_loss_mlp": 0.01048098, + "balance_loss_clip": 1.06390882, + "balance_loss_mlp": 1.0323565, + "epoch": 0.37859612205020293, + "flos": 17311960408320.0, + "grad_norm": 2.1300690678026477, + "language_loss": 0.77167827, + "learning_rate": 2.855038672137396e-06, + "loss": 0.79364884, + "num_input_tokens_seen": 135184045, + "router_z_loss_clip": 0.85009766, + "router_z_loss_mlp": 0.15740967, + "step": 6297, + "time_per_iteration": 2.56719970703125 + }, + { + "auxiliary_loss_clip": 0.01148257, + "auxiliary_loss_mlp": 0.01034037, + "balance_loss_clip": 1.06150413, + "balance_loss_mlp": 1.01879621, + "epoch": 0.3786562453028709, + "flos": 18220392299520.0, + "grad_norm": 1.857976618671027, + "language_loss": 0.79157758, + "learning_rate": 2.854686580151684e-06, + "loss": 0.81340051, + "num_input_tokens_seen": 135202365, + "router_z_loss_clip": 0.86767578, + "router_z_loss_mlp": 0.15240479, + "step": 6298, + "time_per_iteration": 2.5252394676208496 + }, + { + "auxiliary_loss_clip": 0.01138879, + "auxiliary_loss_mlp": 0.01038678, + "balance_loss_clip": 1.05744815, + "balance_loss_mlp": 1.02378845, + "epoch": 0.37871636855553886, + "flos": 21214875473280.0, + "grad_norm": 1.9336494333130756, + "language_loss": 0.84230554, + "learning_rate": 2.8543344557561722e-06, + "loss": 0.86408114, + "num_input_tokens_seen": 135220955, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.14880371, + "step": 6299, + "time_per_iteration": 2.691718101501465 + }, + { + "auxiliary_loss_clip": 0.01145268, + "auxiliary_loss_mlp": 0.01033383, + "balance_loss_clip": 1.06125522, + "balance_loss_mlp": 1.01850557, + "epoch": 0.3787764918082068, + "flos": 20952727038720.0, + "grad_norm": 2.2343843348127077, + "language_loss": 0.76274455, + "learning_rate": 2.8539822989642116e-06, + "loss": 0.78453106, + "num_input_tokens_seen": 135239715, + "router_z_loss_clip": 0.83935547, + "router_z_loss_mlp": 0.14874268, + "step": 6300, + "time_per_iteration": 2.53463077545166 + }, + { + "auxiliary_loss_clip": 0.01142205, + "auxiliary_loss_mlp": 0.0103776, + "balance_loss_clip": 1.05517435, + "balance_loss_mlp": 1.02055812, + "epoch": 0.3788366150608748, + "flos": 17308009912320.0, + "grad_norm": 1.9241277514323802, + "language_loss": 0.83100104, + "learning_rate": 2.8536301097891577e-06, + "loss": 0.85280067, + "num_input_tokens_seen": 135257035, + "router_z_loss_clip": 0.87011719, + "router_z_loss_mlp": 0.171875, + "step": 6301, + "time_per_iteration": 2.466538190841675 + }, + { + "auxiliary_loss_clip": 0.01143355, + "auxiliary_loss_mlp": 0.01040555, + "balance_loss_clip": 1.0595715, + "balance_loss_mlp": 1.02526045, + "epoch": 0.37889673831354276, + "flos": 24311092942080.0, + "grad_norm": 1.7780559487142564, + "language_loss": 0.6812129, + "learning_rate": 2.8532778882443636e-06, + "loss": 0.70305198, + "num_input_tokens_seen": 135275720, + "router_z_loss_clip": 0.83740234, + "router_z_loss_mlp": 0.15313721, + "step": 6302, + "time_per_iteration": 2.5086557865142822 + }, + { + "auxiliary_loss_clip": 0.0114227, + "auxiliary_loss_mlp": 0.01041145, + "balance_loss_clip": 1.05969286, + "balance_loss_mlp": 1.02667344, + "epoch": 0.3789568615662107, + "flos": 26683603188480.0, + "grad_norm": 2.514984585982699, + "language_loss": 0.68353444, + "learning_rate": 2.8529256343431867e-06, + "loss": 0.70536858, + "num_input_tokens_seen": 135294140, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.14477539, + "step": 6303, + "time_per_iteration": 3.971381902694702 + }, + { + "auxiliary_loss_clip": 0.01135021, + "auxiliary_loss_mlp": 0.01036876, + "balance_loss_clip": 1.05251706, + "balance_loss_mlp": 1.02220702, + "epoch": 0.3790169848188787, + "flos": 23585194990080.0, + "grad_norm": 2.5793898250112743, + "language_loss": 0.77460009, + "learning_rate": 2.8525733480989846e-06, + "loss": 0.79631913, + "num_input_tokens_seen": 135314845, + "router_z_loss_clip": 0.82568359, + "router_z_loss_mlp": 0.14678955, + "step": 6304, + "time_per_iteration": 2.555933952331543 + }, + { + "auxiliary_loss_clip": 0.01157552, + "auxiliary_loss_mlp": 0.01040002, + "balance_loss_clip": 1.06797445, + "balance_loss_mlp": 1.02290702, + "epoch": 0.37907710807154665, + "flos": 18437436230400.0, + "grad_norm": 2.7417543328923957, + "language_loss": 0.80206972, + "learning_rate": 2.8522210295251146e-06, + "loss": 0.8240453, + "num_input_tokens_seen": 135333055, + "router_z_loss_clip": 0.89648438, + "router_z_loss_mlp": 0.17102051, + "step": 6305, + "time_per_iteration": 2.4867308139801025 + }, + { + "auxiliary_loss_clip": 0.01100096, + "auxiliary_loss_mlp": 0.01013118, + "balance_loss_clip": 1.07004547, + "balance_loss_mlp": 1.0094198, + "epoch": 0.3791372313242146, + "flos": 50107165954560.0, + "grad_norm": 0.9707159366775411, + "language_loss": 0.64521194, + "learning_rate": 2.8518686786349387e-06, + "loss": 0.66634405, + "num_input_tokens_seen": 135387865, + "router_z_loss_clip": 0.29980469, + "router_z_loss_mlp": 0.03695679, + "step": 6306, + "time_per_iteration": 3.044008493423462 + }, + { + "auxiliary_loss_clip": 0.01148546, + "auxiliary_loss_mlp": 0.01058562, + "balance_loss_clip": 1.06233513, + "balance_loss_mlp": 1.04096651, + "epoch": 0.3791973545768826, + "flos": 24316551809280.0, + "grad_norm": 1.5745186503336575, + "language_loss": 0.73826897, + "learning_rate": 2.851516295441817e-06, + "loss": 0.76034009, + "num_input_tokens_seen": 135409095, + "router_z_loss_clip": 0.86279297, + "router_z_loss_mlp": 0.17590332, + "step": 6307, + "time_per_iteration": 2.4953176975250244 + }, + { + "auxiliary_loss_clip": 0.01150666, + "auxiliary_loss_mlp": 0.01063724, + "balance_loss_clip": 1.05839419, + "balance_loss_mlp": 1.04580712, + "epoch": 0.3792574778295506, + "flos": 21579907438080.0, + "grad_norm": 1.5412863717445004, + "language_loss": 0.7842471, + "learning_rate": 2.851163879959112e-06, + "loss": 0.806391, + "num_input_tokens_seen": 135429585, + "router_z_loss_clip": 0.92382812, + "router_z_loss_mlp": 0.17919922, + "step": 6308, + "time_per_iteration": 2.481940507888794 + }, + { + "auxiliary_loss_clip": 0.01138345, + "auxiliary_loss_mlp": 0.01042828, + "balance_loss_clip": 1.05567849, + "balance_loss_mlp": 1.02737856, + "epoch": 0.37931760108221857, + "flos": 22272731942400.0, + "grad_norm": 2.6132612965095685, + "language_loss": 0.7357446, + "learning_rate": 2.8508114322001876e-06, + "loss": 0.75755632, + "num_input_tokens_seen": 135446320, + "router_z_loss_clip": 0.82861328, + "router_z_loss_mlp": 0.15447998, + "step": 6309, + "time_per_iteration": 2.462531805038452 + }, + { + "auxiliary_loss_clip": 0.01142744, + "auxiliary_loss_mlp": 0.0103857, + "balance_loss_clip": 1.0609746, + "balance_loss_mlp": 1.02303743, + "epoch": 0.37937772433488653, + "flos": 19682998197120.0, + "grad_norm": 1.8824525906944896, + "language_loss": 0.78620386, + "learning_rate": 2.8504589521784083e-06, + "loss": 0.80801702, + "num_input_tokens_seen": 135465720, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.15533447, + "step": 6310, + "time_per_iteration": 2.4761674404144287 + }, + { + "auxiliary_loss_clip": 0.01139615, + "auxiliary_loss_mlp": 0.01039517, + "balance_loss_clip": 1.05681205, + "balance_loss_mlp": 1.0245924, + "epoch": 0.3794378475875545, + "flos": 19099378016640.0, + "grad_norm": 2.510478040370415, + "language_loss": 0.76520765, + "learning_rate": 2.8501064399071403e-06, + "loss": 0.78699899, + "num_input_tokens_seen": 135485155, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.14916992, + "step": 6311, + "time_per_iteration": 2.440936326980591 + }, + { + "auxiliary_loss_clip": 0.01142958, + "auxiliary_loss_mlp": 0.01061105, + "balance_loss_clip": 1.05486202, + "balance_loss_mlp": 1.04441547, + "epoch": 0.37949797084022246, + "flos": 20339660684160.0, + "grad_norm": 1.4502261777296659, + "language_loss": 0.70632797, + "learning_rate": 2.8497538953997504e-06, + "loss": 0.72836864, + "num_input_tokens_seen": 135502675, + "router_z_loss_clip": 0.88037109, + "router_z_loss_mlp": 0.16674805, + "step": 6312, + "time_per_iteration": 2.734950304031372 + }, + { + "auxiliary_loss_clip": 0.01080926, + "auxiliary_loss_mlp": 0.01008839, + "balance_loss_clip": 1.05114722, + "balance_loss_mlp": 1.00718915, + "epoch": 0.37955809409289043, + "flos": 63972203477760.0, + "grad_norm": 0.7945684679617964, + "language_loss": 0.5603621, + "learning_rate": 2.849401318669608e-06, + "loss": 0.58125973, + "num_input_tokens_seen": 135562005, + "router_z_loss_clip": 0.29736328, + "router_z_loss_mlp": 0.01649475, + "step": 6313, + "time_per_iteration": 3.061319589614868 + }, + { + "auxiliary_loss_clip": 0.01131628, + "auxiliary_loss_mlp": 0.0104384, + "balance_loss_clip": 1.04963875, + "balance_loss_mlp": 1.02821136, + "epoch": 0.3796182173455584, + "flos": 31540665179520.0, + "grad_norm": 1.719689627089976, + "language_loss": 0.71578979, + "learning_rate": 2.849048709730083e-06, + "loss": 0.73754442, + "num_input_tokens_seen": 135582600, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.15637207, + "step": 6314, + "time_per_iteration": 2.6189522743225098 + }, + { + "auxiliary_loss_clip": 0.01150601, + "auxiliary_loss_mlp": 0.01043186, + "balance_loss_clip": 1.06304312, + "balance_loss_mlp": 1.02691364, + "epoch": 0.37967834059822636, + "flos": 12130804978560.0, + "grad_norm": 1.831094527222594, + "language_loss": 0.7355057, + "learning_rate": 2.848696068594545e-06, + "loss": 0.75744355, + "num_input_tokens_seen": 135600280, + "router_z_loss_clip": 0.87548828, + "router_z_loss_mlp": 0.1628418, + "step": 6315, + "time_per_iteration": 2.445720672607422 + }, + { + "auxiliary_loss_clip": 0.0113728, + "auxiliary_loss_mlp": 0.01038783, + "balance_loss_clip": 1.0542568, + "balance_loss_mlp": 1.02285695, + "epoch": 0.3797384638508943, + "flos": 39348578298240.0, + "grad_norm": 1.9096354005332743, + "language_loss": 0.7095024, + "learning_rate": 2.8483433952763677e-06, + "loss": 0.73126304, + "num_input_tokens_seen": 135621560, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.15930176, + "step": 6316, + "time_per_iteration": 2.772674798965454 + }, + { + "auxiliary_loss_clip": 0.0114756, + "auxiliary_loss_mlp": 0.01037877, + "balance_loss_clip": 1.06426573, + "balance_loss_mlp": 1.02358377, + "epoch": 0.3797985871035623, + "flos": 34054016653440.0, + "grad_norm": 1.7637022997492928, + "language_loss": 0.65436661, + "learning_rate": 2.847990689788923e-06, + "loss": 0.67622101, + "num_input_tokens_seen": 135641745, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 0.14300537, + "step": 6317, + "time_per_iteration": 2.6221230030059814 + }, + { + "auxiliary_loss_clip": 0.01132949, + "auxiliary_loss_mlp": 0.01041053, + "balance_loss_clip": 1.05120027, + "balance_loss_mlp": 1.02478051, + "epoch": 0.37985871035623026, + "flos": 23222174186880.0, + "grad_norm": 2.3901061414423292, + "language_loss": 0.85746467, + "learning_rate": 2.8476379521455877e-06, + "loss": 0.87920469, + "num_input_tokens_seen": 135660650, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.16259766, + "step": 6318, + "time_per_iteration": 2.501190423965454 + }, + { + "auxiliary_loss_clip": 0.01143224, + "auxiliary_loss_mlp": 0.01043065, + "balance_loss_clip": 1.05718088, + "balance_loss_mlp": 1.02679276, + "epoch": 0.3799188336088982, + "flos": 18114958903680.0, + "grad_norm": 2.2813949065440866, + "language_loss": 0.76127231, + "learning_rate": 2.8472851823597354e-06, + "loss": 0.78313518, + "num_input_tokens_seen": 135679980, + "router_z_loss_clip": 0.86083984, + "router_z_loss_mlp": 0.16259766, + "step": 6319, + "time_per_iteration": 2.4630532264709473 + }, + { + "auxiliary_loss_clip": 0.01143212, + "auxiliary_loss_mlp": 0.01039129, + "balance_loss_clip": 1.058465, + "balance_loss_mlp": 1.02428699, + "epoch": 0.3799789568615662, + "flos": 21871897096320.0, + "grad_norm": 1.615718455744812, + "language_loss": 0.63709903, + "learning_rate": 2.846932380444744e-06, + "loss": 0.65892243, + "num_input_tokens_seen": 135699400, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.14831543, + "step": 6320, + "time_per_iteration": 2.510477066040039 + }, + { + "auxiliary_loss_clip": 0.01143452, + "auxiliary_loss_mlp": 0.01039973, + "balance_loss_clip": 1.06052792, + "balance_loss_mlp": 1.02486897, + "epoch": 0.3800390801142342, + "flos": 32962943082240.0, + "grad_norm": 1.921088380470083, + "language_loss": 0.71665341, + "learning_rate": 2.846579546413992e-06, + "loss": 0.73848766, + "num_input_tokens_seen": 135723455, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.15100098, + "step": 6321, + "time_per_iteration": 4.109529972076416 + }, + { + "auxiliary_loss_clip": 0.01148992, + "auxiliary_loss_mlp": 0.01038195, + "balance_loss_clip": 1.06299186, + "balance_loss_mlp": 1.02321017, + "epoch": 0.38009920336690217, + "flos": 26907075653760.0, + "grad_norm": 1.7827601858232829, + "language_loss": 0.75114971, + "learning_rate": 2.846226680280859e-06, + "loss": 0.77302158, + "num_input_tokens_seen": 135744335, + "router_z_loss_clip": 0.85986328, + "router_z_loss_mlp": 0.14978027, + "step": 6322, + "time_per_iteration": 2.5154457092285156 + }, + { + "auxiliary_loss_clip": 0.01136351, + "auxiliary_loss_mlp": 0.01036396, + "balance_loss_clip": 1.05563807, + "balance_loss_mlp": 1.02130389, + "epoch": 0.38015932661957014, + "flos": 22488913946880.0, + "grad_norm": 2.1916663402306393, + "language_loss": 0.85079426, + "learning_rate": 2.845873782058725e-06, + "loss": 0.87252176, + "num_input_tokens_seen": 135761440, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.15112305, + "step": 6323, + "time_per_iteration": 2.475759744644165 + }, + { + "auxiliary_loss_clip": 0.01141595, + "auxiliary_loss_mlp": 0.01035692, + "balance_loss_clip": 1.05577278, + "balance_loss_mlp": 1.01972985, + "epoch": 0.3802194498722381, + "flos": 21980993679360.0, + "grad_norm": 2.2314026061422743, + "language_loss": 0.73055577, + "learning_rate": 2.845520851760973e-06, + "loss": 0.75232869, + "num_input_tokens_seen": 135779955, + "router_z_loss_clip": 0.85888672, + "router_z_loss_mlp": 0.15942383, + "step": 6324, + "time_per_iteration": 2.454249382019043 + }, + { + "auxiliary_loss_clip": 0.01152597, + "auxiliary_loss_mlp": 0.01034934, + "balance_loss_clip": 1.06744301, + "balance_loss_mlp": 1.0194428, + "epoch": 0.38027957312490607, + "flos": 21324869896320.0, + "grad_norm": 1.8985236849492015, + "language_loss": 0.84513307, + "learning_rate": 2.8451678894009847e-06, + "loss": 0.86700833, + "num_input_tokens_seen": 135799840, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.1550293, + "step": 6325, + "time_per_iteration": 2.724163293838501 + }, + { + "auxiliary_loss_clip": 0.01140546, + "auxiliary_loss_mlp": 0.01036169, + "balance_loss_clip": 1.05798471, + "balance_loss_mlp": 1.02212584, + "epoch": 0.38033969637757403, + "flos": 16691244456960.0, + "grad_norm": 1.6886475903178122, + "language_loss": 0.79792559, + "learning_rate": 2.8448148949921465e-06, + "loss": 0.81969267, + "num_input_tokens_seen": 135817880, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.14044189, + "step": 6326, + "time_per_iteration": 2.479449510574341 + }, + { + "auxiliary_loss_clip": 0.01134088, + "auxiliary_loss_mlp": 0.01047072, + "balance_loss_clip": 1.05130196, + "balance_loss_mlp": 1.03043067, + "epoch": 0.380399819630242, + "flos": 36210847685760.0, + "grad_norm": 1.7237025607293954, + "language_loss": 0.7305634, + "learning_rate": 2.844461868547842e-06, + "loss": 0.75237495, + "num_input_tokens_seen": 135838940, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.16625977, + "step": 6327, + "time_per_iteration": 2.65179181098938 + }, + { + "auxiliary_loss_clip": 0.01151557, + "auxiliary_loss_mlp": 0.0103735, + "balance_loss_clip": 1.06941152, + "balance_loss_mlp": 1.02223396, + "epoch": 0.38045994288290996, + "flos": 21288851533440.0, + "grad_norm": 1.5883450271078374, + "language_loss": 0.83220446, + "learning_rate": 2.844108810081459e-06, + "loss": 0.85409349, + "num_input_tokens_seen": 135858325, + "router_z_loss_clip": 0.82080078, + "router_z_loss_mlp": 0.15118408, + "step": 6328, + "time_per_iteration": 2.484848976135254 + }, + { + "auxiliary_loss_clip": 0.01150458, + "auxiliary_loss_mlp": 0.010344, + "balance_loss_clip": 1.06760669, + "balance_loss_mlp": 1.01974916, + "epoch": 0.38052006613557793, + "flos": 20922885815040.0, + "grad_norm": 1.3751688201468923, + "language_loss": 0.61506736, + "learning_rate": 2.843755719606385e-06, + "loss": 0.63691598, + "num_input_tokens_seen": 135878430, + "router_z_loss_clip": 0.82910156, + "router_z_loss_mlp": 0.14648438, + "step": 6329, + "time_per_iteration": 3.9826903343200684 + }, + { + "auxiliary_loss_clip": 0.01138408, + "auxiliary_loss_mlp": 0.01033634, + "balance_loss_clip": 1.05613101, + "balance_loss_mlp": 1.0188638, + "epoch": 0.3805801893882459, + "flos": 20990720649600.0, + "grad_norm": 2.6280321666219306, + "language_loss": 0.56007969, + "learning_rate": 2.8434025971360104e-06, + "loss": 0.5818001, + "num_input_tokens_seen": 135894755, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.14752197, + "step": 6330, + "time_per_iteration": 2.442869186401367 + }, + { + "auxiliary_loss_clip": 0.01134798, + "auxiliary_loss_mlp": 0.01035585, + "balance_loss_clip": 1.05713928, + "balance_loss_mlp": 1.02234674, + "epoch": 0.38064031264091386, + "flos": 25558594243200.0, + "grad_norm": 1.4291900683861285, + "language_loss": 0.66124105, + "learning_rate": 2.8430494426837243e-06, + "loss": 0.68294489, + "num_input_tokens_seen": 135918275, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.13226318, + "step": 6331, + "time_per_iteration": 2.5627200603485107 + }, + { + "auxiliary_loss_clip": 0.0114767, + "auxiliary_loss_mlp": 0.01043288, + "balance_loss_clip": 1.06559467, + "balance_loss_mlp": 1.02838087, + "epoch": 0.3807004358935818, + "flos": 15085857997440.0, + "grad_norm": 3.598560163101526, + "language_loss": 0.7563324, + "learning_rate": 2.842696256262919e-06, + "loss": 0.77824205, + "num_input_tokens_seen": 135937430, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.14904785, + "step": 6332, + "time_per_iteration": 2.4736101627349854 + }, + { + "auxiliary_loss_clip": 0.01139628, + "auxiliary_loss_mlp": 0.01041383, + "balance_loss_clip": 1.05508065, + "balance_loss_mlp": 1.02583766, + "epoch": 0.3807605591462498, + "flos": 16399398453120.0, + "grad_norm": 2.073914612392286, + "language_loss": 0.8174386, + "learning_rate": 2.842343037886987e-06, + "loss": 0.83924872, + "num_input_tokens_seen": 135954210, + "router_z_loss_clip": 0.84521484, + "router_z_loss_mlp": 0.15551758, + "step": 6333, + "time_per_iteration": 2.475518226623535 + }, + { + "auxiliary_loss_clip": 0.01139443, + "auxiliary_loss_mlp": 0.01034699, + "balance_loss_clip": 1.05815947, + "balance_loss_mlp": 1.02007246, + "epoch": 0.3808206823989178, + "flos": 29057083102080.0, + "grad_norm": 1.516885076195319, + "language_loss": 0.862463, + "learning_rate": 2.8419897875693226e-06, + "loss": 0.88420439, + "num_input_tokens_seen": 135974425, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.1463623, + "step": 6334, + "time_per_iteration": 2.5232443809509277 + }, + { + "auxiliary_loss_clip": 0.01148004, + "auxiliary_loss_mlp": 0.01040184, + "balance_loss_clip": 1.06358504, + "balance_loss_mlp": 1.02555656, + "epoch": 0.3808808056515858, + "flos": 15705855676800.0, + "grad_norm": 1.9904002889547463, + "language_loss": 0.79639131, + "learning_rate": 2.841636505323321e-06, + "loss": 0.81827319, + "num_input_tokens_seen": 135991985, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.1461792, + "step": 6335, + "time_per_iteration": 2.4470694065093994 + }, + { + "auxiliary_loss_clip": 0.01137737, + "auxiliary_loss_mlp": 0.01034541, + "balance_loss_clip": 1.05460715, + "balance_loss_mlp": 1.01972318, + "epoch": 0.38094092890425374, + "flos": 20704584908160.0, + "grad_norm": 2.3873944642973823, + "language_loss": 0.7325418, + "learning_rate": 2.8412831911623795e-06, + "loss": 0.75426459, + "num_input_tokens_seen": 136010015, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.14813232, + "step": 6336, + "time_per_iteration": 2.470667839050293 + }, + { + "auxiliary_loss_clip": 0.01139177, + "auxiliary_loss_mlp": 0.01034027, + "balance_loss_clip": 1.0577476, + "balance_loss_mlp": 1.01963234, + "epoch": 0.3810010521569217, + "flos": 20667956014080.0, + "grad_norm": 1.8058474877080553, + "language_loss": 0.69271934, + "learning_rate": 2.840929845099894e-06, + "loss": 0.71445137, + "num_input_tokens_seen": 136028440, + "router_z_loss_clip": 0.81347656, + "router_z_loss_mlp": 0.14385986, + "step": 6337, + "time_per_iteration": 2.5265917778015137 + }, + { + "auxiliary_loss_clip": 0.01138164, + "auxiliary_loss_mlp": 0.01037776, + "balance_loss_clip": 1.05533481, + "balance_loss_mlp": 1.02145016, + "epoch": 0.38106117540958967, + "flos": 31827626933760.0, + "grad_norm": 1.9299625290118212, + "language_loss": 0.63827771, + "learning_rate": 2.8405764671492652e-06, + "loss": 0.66003704, + "num_input_tokens_seen": 136048360, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.16308594, + "step": 6338, + "time_per_iteration": 2.9729092121124268 + }, + { + "auxiliary_loss_clip": 0.01132587, + "auxiliary_loss_mlp": 0.01045692, + "balance_loss_clip": 1.05027986, + "balance_loss_mlp": 1.02808523, + "epoch": 0.38112129866225763, + "flos": 16902757693440.0, + "grad_norm": 1.8194208130065366, + "language_loss": 0.69519651, + "learning_rate": 2.8402230573238923e-06, + "loss": 0.71697927, + "num_input_tokens_seen": 136065500, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.17614746, + "step": 6339, + "time_per_iteration": 2.753328561782837 + }, + { + "auxiliary_loss_clip": 0.01140716, + "auxiliary_loss_mlp": 0.01043154, + "balance_loss_clip": 1.05616629, + "balance_loss_mlp": 1.02883101, + "epoch": 0.3811814219149256, + "flos": 20887226588160.0, + "grad_norm": 2.12754413213376, + "language_loss": 0.68797803, + "learning_rate": 2.839869615637177e-06, + "loss": 0.70981669, + "num_input_tokens_seen": 136084060, + "router_z_loss_clip": 0.84570312, + "router_z_loss_mlp": 0.14343262, + "step": 6340, + "time_per_iteration": 4.04421067237854 + }, + { + "auxiliary_loss_clip": 0.01136514, + "auxiliary_loss_mlp": 0.01042024, + "balance_loss_clip": 1.05244863, + "balance_loss_mlp": 1.02525151, + "epoch": 0.38124154516759357, + "flos": 16690813493760.0, + "grad_norm": 2.1341921537517976, + "language_loss": 0.89801294, + "learning_rate": 2.839516142102522e-06, + "loss": 0.91979825, + "num_input_tokens_seen": 136102310, + "router_z_loss_clip": 0.84082031, + "router_z_loss_mlp": 0.16772461, + "step": 6341, + "time_per_iteration": 2.495173215866089 + }, + { + "auxiliary_loss_clip": 0.01140277, + "auxiliary_loss_mlp": 0.01038863, + "balance_loss_clip": 1.05467117, + "balance_loss_mlp": 1.02313375, + "epoch": 0.38130166842026153, + "flos": 19681956702720.0, + "grad_norm": 1.9249294170699665, + "language_loss": 0.75434148, + "learning_rate": 2.83916263673333e-06, + "loss": 0.77613288, + "num_input_tokens_seen": 136120725, + "router_z_loss_clip": 0.85644531, + "router_z_loss_mlp": 0.15716553, + "step": 6342, + "time_per_iteration": 2.5151638984680176 + }, + { + "auxiliary_loss_clip": 0.01138052, + "auxiliary_loss_mlp": 0.01036475, + "balance_loss_clip": 1.05374742, + "balance_loss_mlp": 1.02094769, + "epoch": 0.3813617916729295, + "flos": 22198432659840.0, + "grad_norm": 1.6542243482968215, + "language_loss": 0.83445519, + "learning_rate": 2.838809099543007e-06, + "loss": 0.85620046, + "num_input_tokens_seen": 136139105, + "router_z_loss_clip": 0.84228516, + "router_z_loss_mlp": 0.1552124, + "step": 6343, + "time_per_iteration": 2.478179693222046 + }, + { + "auxiliary_loss_clip": 0.0113944, + "auxiliary_loss_mlp": 0.01039337, + "balance_loss_clip": 1.05646801, + "balance_loss_mlp": 1.02549112, + "epoch": 0.38142191492559746, + "flos": 19096899978240.0, + "grad_norm": 2.083996681112279, + "language_loss": 0.77075946, + "learning_rate": 2.838455530544959e-06, + "loss": 0.79254723, + "num_input_tokens_seen": 136158265, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.13848877, + "step": 6344, + "time_per_iteration": 2.496283769607544 + }, + { + "auxiliary_loss_clip": 0.01144742, + "auxiliary_loss_mlp": 0.01040605, + "balance_loss_clip": 1.05912495, + "balance_loss_mlp": 1.02485716, + "epoch": 0.3814820381782654, + "flos": 24097748112000.0, + "grad_norm": 2.0934164247621863, + "language_loss": 0.73473716, + "learning_rate": 2.838101929752593e-06, + "loss": 0.75659055, + "num_input_tokens_seen": 136176100, + "router_z_loss_clip": 0.85595703, + "router_z_loss_mlp": 0.15759277, + "step": 6345, + "time_per_iteration": 2.4919445514678955 + }, + { + "auxiliary_loss_clip": 0.011349, + "auxiliary_loss_mlp": 0.01033858, + "balance_loss_clip": 1.05301011, + "balance_loss_mlp": 1.02006543, + "epoch": 0.3815421614309334, + "flos": 15778502933760.0, + "grad_norm": 1.6962289292434245, + "language_loss": 0.70465547, + "learning_rate": 2.8377482971793187e-06, + "loss": 0.72634298, + "num_input_tokens_seen": 136195125, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.13793945, + "step": 6346, + "time_per_iteration": 3.903780460357666 + }, + { + "auxiliary_loss_clip": 0.01139232, + "auxiliary_loss_mlp": 0.01031686, + "balance_loss_clip": 1.05332077, + "balance_loss_mlp": 1.01741099, + "epoch": 0.38160228468360136, + "flos": 19899754819200.0, + "grad_norm": 1.5712344078164113, + "language_loss": 0.75416178, + "learning_rate": 2.8373946328385437e-06, + "loss": 0.77587098, + "num_input_tokens_seen": 136213885, + "router_z_loss_clip": 0.85986328, + "router_z_loss_mlp": 0.1427002, + "step": 6347, + "time_per_iteration": 2.461219549179077 + }, + { + "auxiliary_loss_clip": 0.01151204, + "auxiliary_loss_mlp": 0.01035958, + "balance_loss_clip": 1.06720185, + "balance_loss_mlp": 1.02223706, + "epoch": 0.3816624079362694, + "flos": 19281050029440.0, + "grad_norm": 1.502847198130642, + "language_loss": 0.74592352, + "learning_rate": 2.8370409367436813e-06, + "loss": 0.76779515, + "num_input_tokens_seen": 136232700, + "router_z_loss_clip": 0.84033203, + "router_z_loss_mlp": 0.137146, + "step": 6348, + "time_per_iteration": 2.450856924057007 + }, + { + "auxiliary_loss_clip": 0.01127061, + "auxiliary_loss_mlp": 0.01036724, + "balance_loss_clip": 1.04549944, + "balance_loss_mlp": 1.02238929, + "epoch": 0.38172253118893734, + "flos": 21177564220800.0, + "grad_norm": 1.842364033082583, + "language_loss": 0.87256134, + "learning_rate": 2.836687208908142e-06, + "loss": 0.89419913, + "num_input_tokens_seen": 136248975, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.14331055, + "step": 6349, + "time_per_iteration": 2.4901204109191895 + }, + { + "auxiliary_loss_clip": 0.01128313, + "auxiliary_loss_mlp": 0.01043374, + "balance_loss_clip": 1.04599917, + "balance_loss_mlp": 1.02705467, + "epoch": 0.3817826544416053, + "flos": 17529219820800.0, + "grad_norm": 1.6940079298806232, + "language_loss": 0.76900333, + "learning_rate": 2.836333449345341e-06, + "loss": 0.79072011, + "num_input_tokens_seen": 136266710, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.16320801, + "step": 6350, + "time_per_iteration": 2.6389923095703125 + }, + { + "auxiliary_loss_clip": 0.01132804, + "auxiliary_loss_mlp": 0.01030113, + "balance_loss_clip": 1.05115747, + "balance_loss_mlp": 1.01452053, + "epoch": 0.38184277769427327, + "flos": 16326535714560.0, + "grad_norm": 2.2021976117301074, + "language_loss": 0.75881964, + "learning_rate": 2.8359796580686907e-06, + "loss": 0.78044879, + "num_input_tokens_seen": 136284445, + "router_z_loss_clip": 0.81738281, + "router_z_loss_mlp": 0.15576172, + "step": 6351, + "time_per_iteration": 2.5017786026000977 + }, + { + "auxiliary_loss_clip": 0.01134392, + "auxiliary_loss_mlp": 0.01034564, + "balance_loss_clip": 1.05092049, + "balance_loss_mlp": 1.01906693, + "epoch": 0.38190290094694124, + "flos": 30443450382720.0, + "grad_norm": 1.6130733292577697, + "language_loss": 0.73900193, + "learning_rate": 2.8356258350916085e-06, + "loss": 0.76069152, + "num_input_tokens_seen": 136305730, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.1550293, + "step": 6352, + "time_per_iteration": 2.5379745960235596 + }, + { + "auxiliary_loss_clip": 0.01129149, + "auxiliary_loss_mlp": 0.01033481, + "balance_loss_clip": 1.04746127, + "balance_loss_mlp": 1.01980805, + "epoch": 0.3819630241996092, + "flos": 14209924936320.0, + "grad_norm": 1.770063257001085, + "language_loss": 0.64278769, + "learning_rate": 2.8352719804275104e-06, + "loss": 0.66441405, + "num_input_tokens_seen": 136323850, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.13677979, + "step": 6353, + "time_per_iteration": 2.427550792694092 + }, + { + "auxiliary_loss_clip": 0.01129656, + "auxiliary_loss_mlp": 0.01033647, + "balance_loss_clip": 1.04893374, + "balance_loss_mlp": 1.01992583, + "epoch": 0.38202314745227717, + "flos": 25009699536000.0, + "grad_norm": 1.5324459882625512, + "language_loss": 0.83370745, + "learning_rate": 2.834918094089816e-06, + "loss": 0.85534054, + "num_input_tokens_seen": 136344880, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.13720703, + "step": 6354, + "time_per_iteration": 2.5073976516723633 + }, + { + "auxiliary_loss_clip": 0.01134526, + "auxiliary_loss_mlp": 0.01036185, + "balance_loss_clip": 1.05258584, + "balance_loss_mlp": 1.02237451, + "epoch": 0.38208327070494513, + "flos": 20814507504000.0, + "grad_norm": 1.7508619492313624, + "language_loss": 0.80459189, + "learning_rate": 2.834564176091943e-06, + "loss": 0.82629901, + "num_input_tokens_seen": 136366060, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.13818359, + "step": 6355, + "time_per_iteration": 2.6275134086608887 + }, + { + "auxiliary_loss_clip": 0.01134519, + "auxiliary_loss_mlp": 0.01041771, + "balance_loss_clip": 1.05170953, + "balance_loss_mlp": 1.02728081, + "epoch": 0.3821433939576131, + "flos": 22637727993600.0, + "grad_norm": 1.880837560242578, + "language_loss": 0.75173271, + "learning_rate": 2.8342102264473125e-06, + "loss": 0.77349555, + "num_input_tokens_seen": 136385625, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.1449585, + "step": 6356, + "time_per_iteration": 2.4871222972869873 + }, + { + "auxiliary_loss_clip": 0.01142696, + "auxiliary_loss_mlp": 0.01034177, + "balance_loss_clip": 1.05990851, + "balance_loss_mlp": 1.01985979, + "epoch": 0.38220351721028106, + "flos": 26869872142080.0, + "grad_norm": 2.260635523578571, + "language_loss": 0.81513959, + "learning_rate": 2.833856245169348e-06, + "loss": 0.83690828, + "num_input_tokens_seen": 136405750, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.14324951, + "step": 6357, + "time_per_iteration": 2.5424141883850098 + }, + { + "auxiliary_loss_clip": 0.01143231, + "auxiliary_loss_mlp": 0.0104576, + "balance_loss_clip": 1.05972362, + "balance_loss_mlp": 1.03027487, + "epoch": 0.38226364046294903, + "flos": 23367468700800.0, + "grad_norm": 1.6140808502115072, + "language_loss": 0.77392566, + "learning_rate": 2.8335022322714695e-06, + "loss": 0.79581553, + "num_input_tokens_seen": 136426085, + "router_z_loss_clip": 0.83496094, + "router_z_loss_mlp": 0.15472412, + "step": 6358, + "time_per_iteration": 2.49725079536438 + }, + { + "auxiliary_loss_clip": 0.01145716, + "auxiliary_loss_mlp": 0.01037269, + "balance_loss_clip": 1.06089115, + "balance_loss_mlp": 1.02275586, + "epoch": 0.382323763715617, + "flos": 19646225648640.0, + "grad_norm": 2.8092536473178487, + "language_loss": 0.78889447, + "learning_rate": 2.8331481877671036e-06, + "loss": 0.81072438, + "num_input_tokens_seen": 136442670, + "router_z_loss_clip": 0.84716797, + "router_z_loss_mlp": 0.14501953, + "step": 6359, + "time_per_iteration": 2.4798789024353027 + }, + { + "auxiliary_loss_clip": 0.0113035, + "auxiliary_loss_mlp": 0.01038094, + "balance_loss_clip": 1.05014706, + "balance_loss_mlp": 1.02227449, + "epoch": 0.38238388696828496, + "flos": 54124741232640.0, + "grad_norm": 1.7671585351199672, + "language_loss": 0.69728994, + "learning_rate": 2.8327941116696754e-06, + "loss": 0.71897441, + "num_input_tokens_seen": 136465730, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.15808105, + "step": 6360, + "time_per_iteration": 2.7327399253845215 + }, + { + "auxiliary_loss_clip": 0.01128668, + "auxiliary_loss_mlp": 0.01030179, + "balance_loss_clip": 1.04817533, + "balance_loss_mlp": 1.01502728, + "epoch": 0.382444010220953, + "flos": 24936190352640.0, + "grad_norm": 1.4077753516882778, + "language_loss": 0.79029912, + "learning_rate": 2.83244000399261e-06, + "loss": 0.81188762, + "num_input_tokens_seen": 136487215, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.15142822, + "step": 6361, + "time_per_iteration": 2.5653791427612305 + }, + { + "auxiliary_loss_clip": 0.01128457, + "auxiliary_loss_mlp": 0.01041082, + "balance_loss_clip": 1.04930866, + "balance_loss_mlp": 1.0258528, + "epoch": 0.38250413347362094, + "flos": 42337351209600.0, + "grad_norm": 1.4094126953760142, + "language_loss": 0.65474105, + "learning_rate": 2.832085864749337e-06, + "loss": 0.67643642, + "num_input_tokens_seen": 136510365, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.15222168, + "step": 6362, + "time_per_iteration": 2.7289772033691406 + }, + { + "auxiliary_loss_clip": 0.01137767, + "auxiliary_loss_mlp": 0.01033998, + "balance_loss_clip": 1.05648756, + "balance_loss_mlp": 1.01928806, + "epoch": 0.3825642567262889, + "flos": 16289224462080.0, + "grad_norm": 2.043999421700678, + "language_loss": 0.8210215, + "learning_rate": 2.8317316939532848e-06, + "loss": 0.84273916, + "num_input_tokens_seen": 136527100, + "router_z_loss_clip": 0.81347656, + "router_z_loss_mlp": 0.14691162, + "step": 6363, + "time_per_iteration": 2.4585859775543213 + }, + { + "auxiliary_loss_clip": 0.01145678, + "auxiliary_loss_mlp": 0.01042254, + "balance_loss_clip": 1.0645777, + "balance_loss_mlp": 1.02796078, + "epoch": 0.3826243799789569, + "flos": 45654778586880.0, + "grad_norm": 1.7296145041576791, + "language_loss": 0.58626342, + "learning_rate": 2.8313774916178825e-06, + "loss": 0.60814273, + "num_input_tokens_seen": 136550870, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.14300537, + "step": 6364, + "time_per_iteration": 2.882523536682129 + }, + { + "auxiliary_loss_clip": 0.01130408, + "auxiliary_loss_mlp": 0.01039642, + "balance_loss_clip": 1.04781973, + "balance_loss_mlp": 1.02482426, + "epoch": 0.38268450323162484, + "flos": 25301581453440.0, + "grad_norm": 1.6980154374426484, + "language_loss": 0.68606949, + "learning_rate": 2.8310232577565635e-06, + "loss": 0.70776999, + "num_input_tokens_seen": 136569895, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.14819336, + "step": 6365, + "time_per_iteration": 3.992349624633789 + }, + { + "auxiliary_loss_clip": 0.01133279, + "auxiliary_loss_mlp": 0.01035689, + "balance_loss_clip": 1.04954934, + "balance_loss_mlp": 1.02078211, + "epoch": 0.3827446264842928, + "flos": 21836022387840.0, + "grad_norm": 1.9075936109605278, + "language_loss": 0.73364729, + "learning_rate": 2.830668992382758e-06, + "loss": 0.755337, + "num_input_tokens_seen": 136588585, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.14892578, + "step": 6366, + "time_per_iteration": 2.5096073150634766 + }, + { + "auxiliary_loss_clip": 0.01138384, + "auxiliary_loss_mlp": 0.01033711, + "balance_loss_clip": 1.05678701, + "balance_loss_mlp": 1.01932287, + "epoch": 0.38280474973696077, + "flos": 25734591907200.0, + "grad_norm": 2.4837513709722514, + "language_loss": 0.6855008, + "learning_rate": 2.830314695509902e-06, + "loss": 0.70722175, + "num_input_tokens_seen": 136606640, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.14398193, + "step": 6367, + "time_per_iteration": 2.503728151321411 + }, + { + "auxiliary_loss_clip": 0.01131019, + "auxiliary_loss_mlp": 0.01037299, + "balance_loss_clip": 1.05111289, + "balance_loss_mlp": 1.02273142, + "epoch": 0.38286487298962874, + "flos": 24895934184960.0, + "grad_norm": 4.2157083634424914, + "language_loss": 0.64351046, + "learning_rate": 2.82996036715143e-06, + "loss": 0.66519356, + "num_input_tokens_seen": 136624940, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.14575195, + "step": 6368, + "time_per_iteration": 2.528355121612549 + }, + { + "auxiliary_loss_clip": 0.01147554, + "auxiliary_loss_mlp": 0.01034484, + "balance_loss_clip": 1.06440949, + "balance_loss_mlp": 1.01986921, + "epoch": 0.3829249962422967, + "flos": 28543703967360.0, + "grad_norm": 1.3604047399799413, + "language_loss": 0.68213016, + "learning_rate": 2.8296060073207763e-06, + "loss": 0.70395058, + "num_input_tokens_seen": 136645540, + "router_z_loss_clip": 0.83056641, + "router_z_loss_mlp": 0.14599609, + "step": 6369, + "time_per_iteration": 2.5086052417755127 + }, + { + "auxiliary_loss_clip": 0.01132981, + "auxiliary_loss_mlp": 0.01041618, + "balance_loss_clip": 1.05208421, + "balance_loss_mlp": 1.02680647, + "epoch": 0.38298511949496467, + "flos": 21471205904640.0, + "grad_norm": 2.226623454829492, + "language_loss": 0.79112828, + "learning_rate": 2.8292516160313804e-06, + "loss": 0.81287426, + "num_input_tokens_seen": 136664530, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.14801025, + "step": 6370, + "time_per_iteration": 2.478693962097168 + }, + { + "auxiliary_loss_clip": 0.01134619, + "auxiliary_loss_mlp": 0.0103356, + "balance_loss_clip": 1.05359077, + "balance_loss_mlp": 1.01934409, + "epoch": 0.38304524274763263, + "flos": 31679998035840.0, + "grad_norm": 8.410569553837762, + "language_loss": 0.65023249, + "learning_rate": 2.8288971932966805e-06, + "loss": 0.67191428, + "num_input_tokens_seen": 136682315, + "router_z_loss_clip": 0.80957031, + "router_z_loss_mlp": 0.14221191, + "step": 6371, + "time_per_iteration": 2.5313210487365723 + }, + { + "auxiliary_loss_clip": 0.01141777, + "auxiliary_loss_mlp": 0.01041133, + "balance_loss_clip": 1.05627406, + "balance_loss_mlp": 1.02466989, + "epoch": 0.3831053660003006, + "flos": 25076816098560.0, + "grad_norm": 1.7455299494260514, + "language_loss": 0.72898984, + "learning_rate": 2.8285427391301155e-06, + "loss": 0.75081897, + "num_input_tokens_seen": 136701185, + "router_z_loss_clip": 0.85498047, + "router_z_loss_mlp": 0.16467285, + "step": 6372, + "time_per_iteration": 2.5029962062835693 + }, + { + "auxiliary_loss_clip": 0.01135866, + "auxiliary_loss_mlp": 0.01037171, + "balance_loss_clip": 1.05181909, + "balance_loss_mlp": 1.02193642, + "epoch": 0.38316548925296856, + "flos": 23259018562560.0, + "grad_norm": 1.7261408758035766, + "language_loss": 0.85051918, + "learning_rate": 2.8281882535451266e-06, + "loss": 0.8722496, + "num_input_tokens_seen": 136721265, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.15246582, + "step": 6373, + "time_per_iteration": 3.9291274547576904 + }, + { + "auxiliary_loss_clip": 0.01136863, + "auxiliary_loss_mlp": 0.01042779, + "balance_loss_clip": 1.05135703, + "balance_loss_mlp": 1.02732992, + "epoch": 0.3832256125056366, + "flos": 34423465991040.0, + "grad_norm": 2.176695879644552, + "language_loss": 0.75255585, + "learning_rate": 2.8278337365551567e-06, + "loss": 0.77435231, + "num_input_tokens_seen": 136741885, + "router_z_loss_clip": 0.85449219, + "router_z_loss_mlp": 0.15454102, + "step": 6374, + "time_per_iteration": 2.5958919525146484 + }, + { + "auxiliary_loss_clip": 0.01138563, + "auxiliary_loss_mlp": 0.01046033, + "balance_loss_clip": 1.05426359, + "balance_loss_mlp": 1.02897406, + "epoch": 0.38328573575830455, + "flos": 21762764599680.0, + "grad_norm": 3.5184107702075114, + "language_loss": 0.76017195, + "learning_rate": 2.8274791881736485e-06, + "loss": 0.78201789, + "num_input_tokens_seen": 136760905, + "router_z_loss_clip": 0.84228516, + "router_z_loss_mlp": 0.1706543, + "step": 6375, + "time_per_iteration": 2.4514994621276855 + }, + { + "auxiliary_loss_clip": 0.01145755, + "auxiliary_loss_mlp": 0.01036276, + "balance_loss_clip": 1.06152022, + "balance_loss_mlp": 1.02119589, + "epoch": 0.3833458590109725, + "flos": 17380010724480.0, + "grad_norm": 2.0362016855659917, + "language_loss": 0.72338033, + "learning_rate": 2.8271246084140457e-06, + "loss": 0.74520063, + "num_input_tokens_seen": 136777240, + "router_z_loss_clip": 0.84228516, + "router_z_loss_mlp": 0.15063477, + "step": 6376, + "time_per_iteration": 2.500941276550293 + }, + { + "auxiliary_loss_clip": 0.01132798, + "auxiliary_loss_mlp": 0.01039358, + "balance_loss_clip": 1.0514133, + "balance_loss_mlp": 1.02413535, + "epoch": 0.3834059822636405, + "flos": 29424557191680.0, + "grad_norm": 1.8033524056707362, + "language_loss": 0.68160224, + "learning_rate": 2.826769997289796e-06, + "loss": 0.70332384, + "num_input_tokens_seen": 136801040, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.15222168, + "step": 6377, + "time_per_iteration": 2.683122158050537 + }, + { + "auxiliary_loss_clip": 0.01139183, + "auxiliary_loss_mlp": 0.01041196, + "balance_loss_clip": 1.05606496, + "balance_loss_mlp": 1.02527595, + "epoch": 0.38346610551630844, + "flos": 21470739027840.0, + "grad_norm": 1.8078718452000517, + "language_loss": 0.73317492, + "learning_rate": 2.826415354814344e-06, + "loss": 0.75497878, + "num_input_tokens_seen": 136819495, + "router_z_loss_clip": 0.83105469, + "router_z_loss_mlp": 0.15930176, + "step": 6378, + "time_per_iteration": 2.477236032485962 + }, + { + "auxiliary_loss_clip": 0.01142682, + "auxiliary_loss_mlp": 0.01041495, + "balance_loss_clip": 1.05600035, + "balance_loss_mlp": 1.02685583, + "epoch": 0.3835262287689764, + "flos": 27561224188800.0, + "grad_norm": 1.8054171143310895, + "language_loss": 0.69818664, + "learning_rate": 2.8260606810011396e-06, + "loss": 0.7200284, + "num_input_tokens_seen": 136838840, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.14624023, + "step": 6379, + "time_per_iteration": 2.48736310005188 + }, + { + "auxiliary_loss_clip": 0.01134097, + "auxiliary_loss_mlp": 0.01044959, + "balance_loss_clip": 1.0534333, + "balance_loss_mlp": 1.02806711, + "epoch": 0.3835863520216444, + "flos": 15523716787200.0, + "grad_norm": 2.272140651292167, + "language_loss": 0.83590496, + "learning_rate": 2.8257059758636315e-06, + "loss": 0.85769552, + "num_input_tokens_seen": 136854425, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.16882324, + "step": 6380, + "time_per_iteration": 2.448939561843872 + }, + { + "auxiliary_loss_clip": 0.01136052, + "auxiliary_loss_mlp": 0.01037282, + "balance_loss_clip": 1.05350256, + "balance_loss_mlp": 1.0227561, + "epoch": 0.38364647527431234, + "flos": 21904934630400.0, + "grad_norm": 1.9365424243439266, + "language_loss": 0.81053066, + "learning_rate": 2.8253512394152697e-06, + "loss": 0.83226401, + "num_input_tokens_seen": 136874355, + "router_z_loss_clip": 0.82568359, + "router_z_loss_mlp": 0.14508057, + "step": 6381, + "time_per_iteration": 2.464369058609009 + }, + { + "auxiliary_loss_clip": 0.01071364, + "auxiliary_loss_mlp": 0.01014066, + "balance_loss_clip": 1.04279804, + "balance_loss_mlp": 1.01254046, + "epoch": 0.3837065985269803, + "flos": 65534927558400.0, + "grad_norm": 0.7896153647535025, + "language_loss": 0.60499269, + "learning_rate": 2.8249964716695068e-06, + "loss": 0.62584698, + "num_input_tokens_seen": 136937475, + "router_z_loss_clip": 0.28564453, + "router_z_loss_mlp": 0.01527405, + "step": 6382, + "time_per_iteration": 3.072538137435913 + }, + { + "auxiliary_loss_clip": 0.01141223, + "auxiliary_loss_mlp": 0.01034979, + "balance_loss_clip": 1.0563798, + "balance_loss_mlp": 1.01975608, + "epoch": 0.38376672177964827, + "flos": 28256598558720.0, + "grad_norm": 3.2657981482469496, + "language_loss": 0.66428936, + "learning_rate": 2.824641672639794e-06, + "loss": 0.68605137, + "num_input_tokens_seen": 136955805, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.15222168, + "step": 6383, + "time_per_iteration": 2.5610580444335938 + }, + { + "auxiliary_loss_clip": 0.01136001, + "auxiliary_loss_mlp": 0.01040357, + "balance_loss_clip": 1.05413091, + "balance_loss_mlp": 1.02562833, + "epoch": 0.38382684503231623, + "flos": 20631363033600.0, + "grad_norm": 1.661712000934794, + "language_loss": 0.74614352, + "learning_rate": 2.824286842339587e-06, + "loss": 0.76790714, + "num_input_tokens_seen": 136975240, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.14727783, + "step": 6384, + "time_per_iteration": 3.913834810256958 + }, + { + "auxiliary_loss_clip": 0.0114241, + "auxiliary_loss_mlp": 0.01040537, + "balance_loss_clip": 1.06150472, + "balance_loss_mlp": 1.02566624, + "epoch": 0.3838869682849842, + "flos": 19605825826560.0, + "grad_norm": 1.3649310108346573, + "language_loss": 0.7630468, + "learning_rate": 2.823931980782341e-06, + "loss": 0.78487623, + "num_input_tokens_seen": 136994985, + "router_z_loss_clip": 0.80957031, + "router_z_loss_mlp": 0.14874268, + "step": 6385, + "time_per_iteration": 2.507277727127075 + }, + { + "auxiliary_loss_clip": 0.01076223, + "auxiliary_loss_mlp": 0.01004849, + "balance_loss_clip": 1.04596233, + "balance_loss_mlp": 1.00324869, + "epoch": 0.38394709153765216, + "flos": 56556110891520.0, + "grad_norm": 0.9206815378781525, + "language_loss": 0.6699127, + "learning_rate": 2.82357708798151e-06, + "loss": 0.69072336, + "num_input_tokens_seen": 137046290, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.01600647, + "step": 6386, + "time_per_iteration": 2.9852335453033447 + }, + { + "auxiliary_loss_clip": 0.01130861, + "auxiliary_loss_mlp": 0.01041286, + "balance_loss_clip": 1.05090928, + "balance_loss_mlp": 1.02716589, + "epoch": 0.3840072147903202, + "flos": 15888748752000.0, + "grad_norm": 1.7462084996316494, + "language_loss": 0.72508812, + "learning_rate": 2.8232221639505547e-06, + "loss": 0.7468096, + "num_input_tokens_seen": 137064725, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.14117432, + "step": 6387, + "time_per_iteration": 2.5059382915496826 + }, + { + "auxiliary_loss_clip": 0.01152338, + "auxiliary_loss_mlp": 0.01041431, + "balance_loss_clip": 1.07136726, + "balance_loss_mlp": 1.02699471, + "epoch": 0.38406733804298815, + "flos": 28218030330240.0, + "grad_norm": 1.7177649438949167, + "language_loss": 0.81563932, + "learning_rate": 2.822867208702932e-06, + "loss": 0.83757699, + "num_input_tokens_seen": 137086030, + "router_z_loss_clip": 0.80957031, + "router_z_loss_mlp": 0.14453125, + "step": 6388, + "time_per_iteration": 2.558943510055542 + }, + { + "auxiliary_loss_clip": 0.01134575, + "auxiliary_loss_mlp": 0.01043082, + "balance_loss_clip": 1.05491233, + "balance_loss_mlp": 1.0271678, + "epoch": 0.3841274612956561, + "flos": 18223588609920.0, + "grad_norm": 1.6389623068426005, + "language_loss": 0.761186, + "learning_rate": 2.8225122222521026e-06, + "loss": 0.78296256, + "num_input_tokens_seen": 137105400, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.15917969, + "step": 6389, + "time_per_iteration": 4.027446508407593 + }, + { + "auxiliary_loss_clip": 0.01149402, + "auxiliary_loss_mlp": 0.01049135, + "balance_loss_clip": 1.06266093, + "balance_loss_mlp": 1.03221953, + "epoch": 0.3841875845483241, + "flos": 19792884879360.0, + "grad_norm": 1.5828940267672515, + "language_loss": 0.76550436, + "learning_rate": 2.8221572046115273e-06, + "loss": 0.78748977, + "num_input_tokens_seen": 137124985, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.16918945, + "step": 6390, + "time_per_iteration": 2.479649782180786 + }, + { + "auxiliary_loss_clip": 0.01133656, + "auxiliary_loss_mlp": 0.01051017, + "balance_loss_clip": 1.04750168, + "balance_loss_mlp": 1.03453028, + "epoch": 0.38424770780099204, + "flos": 29898829393920.0, + "grad_norm": 1.8529504367596044, + "language_loss": 0.70266694, + "learning_rate": 2.821802155794668e-06, + "loss": 0.72451365, + "num_input_tokens_seen": 137146745, + "router_z_loss_clip": 0.86083984, + "router_z_loss_mlp": 0.16491699, + "step": 6391, + "time_per_iteration": 2.5740537643432617 + }, + { + "auxiliary_loss_clip": 0.01135908, + "auxiliary_loss_mlp": 0.01038894, + "balance_loss_clip": 1.05183983, + "balance_loss_mlp": 1.02326536, + "epoch": 0.38430783105366, + "flos": 20813717404800.0, + "grad_norm": 1.6401532582553975, + "language_loss": 0.84082174, + "learning_rate": 2.8214470758149884e-06, + "loss": 0.86256975, + "num_input_tokens_seen": 137163195, + "router_z_loss_clip": 0.84082031, + "router_z_loss_mlp": 0.15625, + "step": 6392, + "time_per_iteration": 2.5268282890319824 + }, + { + "auxiliary_loss_clip": 0.01132529, + "auxiliary_loss_mlp": 0.01036619, + "balance_loss_clip": 1.04948115, + "balance_loss_mlp": 1.02197373, + "epoch": 0.384367954306328, + "flos": 10998577399680.0, + "grad_norm": 2.023269971084382, + "language_loss": 0.61014599, + "learning_rate": 2.8210919646859536e-06, + "loss": 0.63183749, + "num_input_tokens_seen": 137179330, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.14654541, + "step": 6393, + "time_per_iteration": 2.4380199909210205 + }, + { + "auxiliary_loss_clip": 0.01139032, + "auxiliary_loss_mlp": 0.01039396, + "balance_loss_clip": 1.05183935, + "balance_loss_mlp": 1.02301669, + "epoch": 0.38442807755899594, + "flos": 25338030779520.0, + "grad_norm": 1.6639330471250218, + "language_loss": 0.71313202, + "learning_rate": 2.820736822421029e-06, + "loss": 0.73491627, + "num_input_tokens_seen": 137198655, + "router_z_loss_clip": 0.87207031, + "router_z_loss_mlp": 0.16381836, + "step": 6394, + "time_per_iteration": 2.502869129180908 + }, + { + "auxiliary_loss_clip": 0.01141405, + "auxiliary_loss_mlp": 0.01038246, + "balance_loss_clip": 1.05325818, + "balance_loss_mlp": 1.02204514, + "epoch": 0.3844882008116639, + "flos": 21069760527360.0, + "grad_norm": 2.1278018119448485, + "language_loss": 0.80898517, + "learning_rate": 2.8203816490336822e-06, + "loss": 0.8307817, + "num_input_tokens_seen": 137217120, + "router_z_loss_clip": 0.88085938, + "router_z_loss_mlp": 0.1619873, + "step": 6395, + "time_per_iteration": 2.4837865829467773 + }, + { + "auxiliary_loss_clip": 0.0114449, + "auxiliary_loss_mlp": 0.01047154, + "balance_loss_clip": 1.05869532, + "balance_loss_mlp": 1.0319072, + "epoch": 0.38454832406433187, + "flos": 17963235855360.0, + "grad_norm": 2.1004112804145008, + "language_loss": 0.70999527, + "learning_rate": 2.8200264445373813e-06, + "loss": 0.73191172, + "num_input_tokens_seen": 137234410, + "router_z_loss_clip": 0.85791016, + "router_z_loss_mlp": 0.15258789, + "step": 6396, + "time_per_iteration": 2.4862921237945557 + }, + { + "auxiliary_loss_clip": 0.01071321, + "auxiliary_loss_mlp": 0.01004881, + "balance_loss_clip": 1.04158497, + "balance_loss_mlp": 1.00312269, + "epoch": 0.38460844731699984, + "flos": 67924999555200.0, + "grad_norm": 0.8893265739265787, + "language_loss": 0.59716344, + "learning_rate": 2.8196712089455954e-06, + "loss": 0.61792541, + "num_input_tokens_seen": 137294940, + "router_z_loss_clip": 0.29785156, + "router_z_loss_mlp": 0.01757812, + "step": 6397, + "time_per_iteration": 3.1486425399780273 + }, + { + "auxiliary_loss_clip": 0.0113207, + "auxiliary_loss_mlp": 0.01035344, + "balance_loss_clip": 1.04919302, + "balance_loss_mlp": 1.01900089, + "epoch": 0.3846685705696678, + "flos": 25849075530240.0, + "grad_norm": 1.7000525753189897, + "language_loss": 0.85296822, + "learning_rate": 2.819315942271794e-06, + "loss": 0.87464237, + "num_input_tokens_seen": 137315035, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.16333008, + "step": 6398, + "time_per_iteration": 2.4991273880004883 + }, + { + "auxiliary_loss_clip": 0.0114384, + "auxiliary_loss_mlp": 0.01032374, + "balance_loss_clip": 1.06105828, + "balance_loss_mlp": 1.01808083, + "epoch": 0.38472869382233577, + "flos": 16290194129280.0, + "grad_norm": 2.8083504037127898, + "language_loss": 0.79910964, + "learning_rate": 2.8189606445294515e-06, + "loss": 0.82087183, + "num_input_tokens_seen": 137333155, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.14294434, + "step": 6399, + "time_per_iteration": 2.4322128295898438 + }, + { + "auxiliary_loss_clip": 0.01138397, + "auxiliary_loss_mlp": 0.01046575, + "balance_loss_clip": 1.05098796, + "balance_loss_mlp": 1.02912295, + "epoch": 0.38478881707500373, + "flos": 19353122668800.0, + "grad_norm": 1.85364947561612, + "language_loss": 0.67520678, + "learning_rate": 2.818605315732038e-06, + "loss": 0.69705653, + "num_input_tokens_seen": 137351515, + "router_z_loss_clip": 0.87451172, + "router_z_loss_mlp": 0.17443848, + "step": 6400, + "time_per_iteration": 2.4589123725891113 + }, + { + "auxiliary_loss_clip": 0.0114229, + "auxiliary_loss_mlp": 0.01046048, + "balance_loss_clip": 1.05610633, + "balance_loss_mlp": 1.03068209, + "epoch": 0.38484894032767175, + "flos": 24860849575680.0, + "grad_norm": 1.9162709759245886, + "language_loss": 0.73123425, + "learning_rate": 2.81824995589303e-06, + "loss": 0.75311768, + "num_input_tokens_seen": 137371255, + "router_z_loss_clip": 0.86181641, + "router_z_loss_mlp": 0.15356445, + "step": 6401, + "time_per_iteration": 2.4925293922424316 + }, + { + "auxiliary_loss_clip": 0.01147589, + "auxiliary_loss_mlp": 0.01042573, + "balance_loss_clip": 1.06286633, + "balance_loss_mlp": 1.02701592, + "epoch": 0.3849090635803397, + "flos": 14501806853760.0, + "grad_norm": 2.4083814392405847, + "language_loss": 0.72081965, + "learning_rate": 2.8178945650259012e-06, + "loss": 0.74272126, + "num_input_tokens_seen": 137388980, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.15551758, + "step": 6402, + "time_per_iteration": 2.578325033187866 + }, + { + "auxiliary_loss_clip": 0.01132885, + "auxiliary_loss_mlp": 0.01033798, + "balance_loss_clip": 1.05238271, + "balance_loss_mlp": 1.01937914, + "epoch": 0.3849691868330077, + "flos": 18515865576960.0, + "grad_norm": 1.8668874872843084, + "language_loss": 0.82739055, + "learning_rate": 2.817539143144128e-06, + "loss": 0.84905732, + "num_input_tokens_seen": 137406885, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.144104, + "step": 6403, + "time_per_iteration": 2.6221320629119873 + }, + { + "auxiliary_loss_clip": 0.01132742, + "auxiliary_loss_mlp": 0.01041826, + "balance_loss_clip": 1.05047321, + "balance_loss_mlp": 1.02628136, + "epoch": 0.38502931008567565, + "flos": 21616392677760.0, + "grad_norm": 1.9721200522419857, + "language_loss": 0.83001888, + "learning_rate": 2.817183690261189e-06, + "loss": 0.85176456, + "num_input_tokens_seen": 137425535, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.15545654, + "step": 6404, + "time_per_iteration": 2.5313189029693604 + }, + { + "auxiliary_loss_clip": 0.01139205, + "auxiliary_loss_mlp": 0.01036109, + "balance_loss_clip": 1.05126643, + "balance_loss_mlp": 1.021631, + "epoch": 0.3850894333383436, + "flos": 25415346804480.0, + "grad_norm": 1.8329975169522685, + "language_loss": 0.69636017, + "learning_rate": 2.816828206390563e-06, + "loss": 0.7181133, + "num_input_tokens_seen": 137447700, + "router_z_loss_clip": 0.87890625, + "router_z_loss_mlp": 0.14483643, + "step": 6405, + "time_per_iteration": 2.549530506134033 + }, + { + "auxiliary_loss_clip": 0.01134672, + "auxiliary_loss_mlp": 0.0103624, + "balance_loss_clip": 1.05450892, + "balance_loss_mlp": 1.02330589, + "epoch": 0.3851495565910116, + "flos": 20227870581120.0, + "grad_norm": 1.897643031500619, + "language_loss": 0.7938503, + "learning_rate": 2.816472691545729e-06, + "loss": 0.81555939, + "num_input_tokens_seen": 137462245, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.1295166, + "step": 6406, + "time_per_iteration": 2.4666080474853516 + }, + { + "auxiliary_loss_clip": 0.01141176, + "auxiliary_loss_mlp": 0.01033696, + "balance_loss_clip": 1.0561955, + "balance_loss_mlp": 1.01840103, + "epoch": 0.38520967984367954, + "flos": 16508459122560.0, + "grad_norm": 2.2039444731348703, + "language_loss": 0.84585011, + "learning_rate": 2.8161171457401694e-06, + "loss": 0.86759877, + "num_input_tokens_seen": 137476455, + "router_z_loss_clip": 0.85009766, + "router_z_loss_mlp": 0.1529541, + "step": 6407, + "time_per_iteration": 2.4472947120666504 + }, + { + "auxiliary_loss_clip": 0.01071272, + "auxiliary_loss_mlp": 0.01015074, + "balance_loss_clip": 1.04101777, + "balance_loss_mlp": 1.01330042, + "epoch": 0.3852698030963475, + "flos": 61313772971520.0, + "grad_norm": 0.842339984928688, + "language_loss": 0.64936924, + "learning_rate": 2.815761568987365e-06, + "loss": 0.67023265, + "num_input_tokens_seen": 137539845, + "router_z_loss_clip": 0.30224609, + "router_z_loss_mlp": 0.01773071, + "step": 6408, + "time_per_iteration": 4.715095520019531 + }, + { + "auxiliary_loss_clip": 0.01137287, + "auxiliary_loss_mlp": 0.01042067, + "balance_loss_clip": 1.05310833, + "balance_loss_mlp": 1.02627182, + "epoch": 0.3853299263490155, + "flos": 22893016930560.0, + "grad_norm": 1.3857255558537984, + "language_loss": 0.73819482, + "learning_rate": 2.8154059613008e-06, + "loss": 0.75998837, + "num_input_tokens_seen": 137559880, + "router_z_loss_clip": 0.84228516, + "router_z_loss_mlp": 0.15795898, + "step": 6409, + "time_per_iteration": 2.4912352561950684 + }, + { + "auxiliary_loss_clip": 0.01149601, + "auxiliary_loss_mlp": 0.01046697, + "balance_loss_clip": 1.05800748, + "balance_loss_mlp": 1.02990055, + "epoch": 0.38539004960168344, + "flos": 20047491457920.0, + "grad_norm": 2.019072434948333, + "language_loss": 0.7033152, + "learning_rate": 2.81505032269396e-06, + "loss": 0.72527826, + "num_input_tokens_seen": 137578225, + "router_z_loss_clip": 0.91699219, + "router_z_loss_mlp": 0.16784668, + "step": 6410, + "time_per_iteration": 2.4459192752838135 + }, + { + "auxiliary_loss_clip": 0.01081885, + "auxiliary_loss_mlp": 0.01005362, + "balance_loss_clip": 1.04971254, + "balance_loss_mlp": 1.0036006, + "epoch": 0.3854501728543514, + "flos": 68730691570560.0, + "grad_norm": 1.7519739693682082, + "language_loss": 0.60317516, + "learning_rate": 2.81469465318033e-06, + "loss": 0.62404764, + "num_input_tokens_seen": 137645770, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 0.01760864, + "step": 6411, + "time_per_iteration": 3.174168825149536 + }, + { + "auxiliary_loss_clip": 0.0113879, + "auxiliary_loss_mlp": 0.0102711, + "balance_loss_clip": 1.05674148, + "balance_loss_mlp": 1.01375854, + "epoch": 0.38551029610701937, + "flos": 20485027025280.0, + "grad_norm": 1.8065751031394992, + "language_loss": 0.77578461, + "learning_rate": 2.814338952773397e-06, + "loss": 0.79744363, + "num_input_tokens_seen": 137664090, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.13348389, + "step": 6412, + "time_per_iteration": 2.461933135986328 + }, + { + "auxiliary_loss_clip": 0.01150318, + "auxiliary_loss_mlp": 0.01040954, + "balance_loss_clip": 1.0635469, + "balance_loss_mlp": 1.02404976, + "epoch": 0.38557041935968733, + "flos": 23471788775040.0, + "grad_norm": 1.7372776700572696, + "language_loss": 0.7790345, + "learning_rate": 2.8139832214866493e-06, + "loss": 0.80094725, + "num_input_tokens_seen": 137683190, + "router_z_loss_clip": 0.86816406, + "router_z_loss_mlp": 0.16906738, + "step": 6413, + "time_per_iteration": 2.4799957275390625 + }, + { + "auxiliary_loss_clip": 0.01102722, + "auxiliary_loss_mlp": 0.010103, + "balance_loss_clip": 1.07212925, + "balance_loss_mlp": 1.00659597, + "epoch": 0.38563054261235535, + "flos": 63966636869760.0, + "grad_norm": 0.8327992437104501, + "language_loss": 0.61228997, + "learning_rate": 2.813627459333576e-06, + "loss": 0.63342017, + "num_input_tokens_seen": 137737315, + "router_z_loss_clip": 0.30517578, + "router_z_loss_mlp": 0.03704834, + "step": 6414, + "time_per_iteration": 2.9290060997009277 + }, + { + "auxiliary_loss_clip": 0.01145184, + "auxiliary_loss_mlp": 0.0104331, + "balance_loss_clip": 1.05846286, + "balance_loss_mlp": 1.02832568, + "epoch": 0.3856906658650233, + "flos": 23987789602560.0, + "grad_norm": 2.23371796229838, + "language_loss": 0.77665067, + "learning_rate": 2.8132716663276685e-06, + "loss": 0.79853559, + "num_input_tokens_seen": 137753535, + "router_z_loss_clip": 0.86621094, + "router_z_loss_mlp": 0.14978027, + "step": 6415, + "time_per_iteration": 2.6922624111175537 + }, + { + "auxiliary_loss_clip": 0.01134736, + "auxiliary_loss_mlp": 0.01030889, + "balance_loss_clip": 1.05502796, + "balance_loss_mlp": 1.01768112, + "epoch": 0.3857507891176913, + "flos": 25007436979200.0, + "grad_norm": 2.2660538526440153, + "language_loss": 0.80155867, + "learning_rate": 2.8129158424824173e-06, + "loss": 0.82321489, + "num_input_tokens_seen": 137773405, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.13220215, + "step": 6416, + "time_per_iteration": 3.888167142868042 + }, + { + "auxiliary_loss_clip": 0.01135859, + "auxiliary_loss_mlp": 0.01053825, + "balance_loss_clip": 1.05035472, + "balance_loss_mlp": 1.03940058, + "epoch": 0.38581091237035925, + "flos": 21536778182400.0, + "grad_norm": 1.793606396364806, + "language_loss": 0.79218924, + "learning_rate": 2.8125599878113155e-06, + "loss": 0.81408608, + "num_input_tokens_seen": 137790810, + "router_z_loss_clip": 0.85498047, + "router_z_loss_mlp": 0.14416504, + "step": 6417, + "time_per_iteration": 2.470308303833008 + }, + { + "auxiliary_loss_clip": 0.01137112, + "auxiliary_loss_mlp": 0.01034748, + "balance_loss_clip": 1.05517554, + "balance_loss_mlp": 1.02114058, + "epoch": 0.3858710356230272, + "flos": 17383889393280.0, + "grad_norm": 2.0477807161340205, + "language_loss": 0.79804039, + "learning_rate": 2.8122041023278583e-06, + "loss": 0.81975901, + "num_input_tokens_seen": 137810265, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.13604736, + "step": 6418, + "time_per_iteration": 2.4928689002990723 + }, + { + "auxiliary_loss_clip": 0.01126911, + "auxiliary_loss_mlp": 0.01038076, + "balance_loss_clip": 1.04666436, + "balance_loss_mlp": 1.02247119, + "epoch": 0.3859311588756952, + "flos": 20339588856960.0, + "grad_norm": 1.8089885157779841, + "language_loss": 0.79919899, + "learning_rate": 2.8118481860455407e-06, + "loss": 0.82084882, + "num_input_tokens_seen": 137828580, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.15606689, + "step": 6419, + "time_per_iteration": 2.49387526512146 + }, + { + "auxiliary_loss_clip": 0.01130569, + "auxiliary_loss_mlp": 0.01037225, + "balance_loss_clip": 1.04850543, + "balance_loss_mlp": 1.02181149, + "epoch": 0.38599128212836314, + "flos": 26321157002880.0, + "grad_norm": 2.0607784238357185, + "language_loss": 0.67419863, + "learning_rate": 2.8114922389778573e-06, + "loss": 0.6958766, + "num_input_tokens_seen": 137846145, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.15405273, + "step": 6420, + "time_per_iteration": 2.468040704727173 + }, + { + "auxiliary_loss_clip": 0.01130673, + "auxiliary_loss_mlp": 0.01039749, + "balance_loss_clip": 1.05214834, + "balance_loss_mlp": 1.02586079, + "epoch": 0.3860514053810311, + "flos": 13553837066880.0, + "grad_norm": 2.0770590272378886, + "language_loss": 0.8111673, + "learning_rate": 2.8111362611383076e-06, + "loss": 0.8328715, + "num_input_tokens_seen": 137863705, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.13897705, + "step": 6421, + "time_per_iteration": 2.4068524837493896 + }, + { + "auxiliary_loss_clip": 0.01135229, + "auxiliary_loss_mlp": 0.01040647, + "balance_loss_clip": 1.05032825, + "balance_loss_mlp": 1.02495921, + "epoch": 0.3861115286336991, + "flos": 20954271323520.0, + "grad_norm": 2.028724507437199, + "language_loss": 0.71988755, + "learning_rate": 2.8107802525403886e-06, + "loss": 0.74164641, + "num_input_tokens_seen": 137880285, + "router_z_loss_clip": 0.84960938, + "router_z_loss_mlp": 0.15673828, + "step": 6422, + "time_per_iteration": 2.4608559608459473 + }, + { + "auxiliary_loss_clip": 0.01136088, + "auxiliary_loss_mlp": 0.01038699, + "balance_loss_clip": 1.05395484, + "balance_loss_mlp": 1.02452517, + "epoch": 0.38617165188636704, + "flos": 16362697731840.0, + "grad_norm": 1.740203332122486, + "language_loss": 0.66677928, + "learning_rate": 2.8104242131976025e-06, + "loss": 0.68852723, + "num_input_tokens_seen": 137898335, + "router_z_loss_clip": 0.82275391, + "router_z_loss_mlp": 0.1418457, + "step": 6423, + "time_per_iteration": 2.453191041946411 + }, + { + "auxiliary_loss_clip": 0.01145587, + "auxiliary_loss_mlp": 0.01039585, + "balance_loss_clip": 1.06034088, + "balance_loss_mlp": 1.02573895, + "epoch": 0.386231775139035, + "flos": 34787276893440.0, + "grad_norm": 1.8936505549415164, + "language_loss": 0.69152963, + "learning_rate": 2.810068143123449e-06, + "loss": 0.71338129, + "num_input_tokens_seen": 137918605, + "router_z_loss_clip": 0.85205078, + "router_z_loss_mlp": 0.13842773, + "step": 6424, + "time_per_iteration": 2.571678400039673 + }, + { + "auxiliary_loss_clip": 0.01133283, + "auxiliary_loss_mlp": 0.01048761, + "balance_loss_clip": 1.0514437, + "balance_loss_mlp": 1.03269112, + "epoch": 0.38629189839170297, + "flos": 21726171619200.0, + "grad_norm": 1.5454042020691436, + "language_loss": 0.7281931, + "learning_rate": 2.809712042331429e-06, + "loss": 0.75001353, + "num_input_tokens_seen": 137938245, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.16064453, + "step": 6425, + "time_per_iteration": 2.464550256729126 + }, + { + "auxiliary_loss_clip": 0.01135703, + "auxiliary_loss_mlp": 0.01041489, + "balance_loss_clip": 1.05012417, + "balance_loss_mlp": 1.02692747, + "epoch": 0.38635202164437094, + "flos": 27923634460800.0, + "grad_norm": 2.081916700059365, + "language_loss": 0.80225575, + "learning_rate": 2.8093559108350484e-06, + "loss": 0.8240276, + "num_input_tokens_seen": 137956770, + "router_z_loss_clip": 0.85546875, + "router_z_loss_mlp": 0.14562988, + "step": 6426, + "time_per_iteration": 2.4901363849639893 + }, + { + "auxiliary_loss_clip": 0.01142923, + "auxiliary_loss_mlp": 0.0103788, + "balance_loss_clip": 1.05865049, + "balance_loss_mlp": 1.022645, + "epoch": 0.38641214489703896, + "flos": 23586631534080.0, + "grad_norm": 2.6106975253617883, + "language_loss": 0.74870276, + "learning_rate": 2.80899974864781e-06, + "loss": 0.77051079, + "num_input_tokens_seen": 137977040, + "router_z_loss_clip": 0.84423828, + "router_z_loss_mlp": 0.15228271, + "step": 6427, + "time_per_iteration": 2.494964599609375 + }, + { + "auxiliary_loss_clip": 0.01139016, + "auxiliary_loss_mlp": 0.0104073, + "balance_loss_clip": 1.05663717, + "balance_loss_mlp": 1.02652597, + "epoch": 0.3864722681497069, + "flos": 12641239198080.0, + "grad_norm": 3.571159998687931, + "language_loss": 0.69813859, + "learning_rate": 2.8086435557832203e-06, + "loss": 0.71993613, + "num_input_tokens_seen": 137993545, + "router_z_loss_clip": 0.82421875, + "router_z_loss_mlp": 0.14202881, + "step": 6428, + "time_per_iteration": 4.054299354553223 + }, + { + "auxiliary_loss_clip": 0.01146152, + "auxiliary_loss_mlp": 0.01042119, + "balance_loss_clip": 1.06209862, + "balance_loss_mlp": 1.02722931, + "epoch": 0.3865323914023749, + "flos": 17598922162560.0, + "grad_norm": 2.159448717557901, + "language_loss": 0.8448261, + "learning_rate": 2.8082873322547863e-06, + "loss": 0.86670887, + "num_input_tokens_seen": 138010140, + "router_z_loss_clip": 0.84082031, + "router_z_loss_mlp": 0.14880371, + "step": 6429, + "time_per_iteration": 2.4917380809783936 + }, + { + "auxiliary_loss_clip": 0.01143357, + "auxiliary_loss_mlp": 0.01038578, + "balance_loss_clip": 1.0581845, + "balance_loss_mlp": 1.02417159, + "epoch": 0.38659251465504285, + "flos": 18478949374080.0, + "grad_norm": 1.7454327115866288, + "language_loss": 0.81385028, + "learning_rate": 2.807931078076015e-06, + "loss": 0.83566964, + "num_input_tokens_seen": 138028880, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.144104, + "step": 6430, + "time_per_iteration": 2.488452434539795 + }, + { + "auxiliary_loss_clip": 0.01079922, + "auxiliary_loss_mlp": 0.01011236, + "balance_loss_clip": 1.04823017, + "balance_loss_mlp": 1.00959706, + "epoch": 0.3866526379077108, + "flos": 64165726978560.0, + "grad_norm": 0.7133252794811975, + "language_loss": 0.58812374, + "learning_rate": 2.807574793260416e-06, + "loss": 0.60903531, + "num_input_tokens_seen": 138098090, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.01638794, + "step": 6431, + "time_per_iteration": 3.2196924686431885 + }, + { + "auxiliary_loss_clip": 0.01142249, + "auxiliary_loss_mlp": 0.01041892, + "balance_loss_clip": 1.05646586, + "balance_loss_mlp": 1.02629948, + "epoch": 0.3867127611603788, + "flos": 14388292897920.0, + "grad_norm": 1.9587630454421416, + "language_loss": 0.789693, + "learning_rate": 2.8072184778215004e-06, + "loss": 0.8115344, + "num_input_tokens_seen": 138114735, + "router_z_loss_clip": 0.85839844, + "router_z_loss_mlp": 0.15588379, + "step": 6432, + "time_per_iteration": 2.4894773960113525 + }, + { + "auxiliary_loss_clip": 0.01144755, + "auxiliary_loss_mlp": 0.01041549, + "balance_loss_clip": 1.05589199, + "balance_loss_mlp": 1.02571762, + "epoch": 0.38677288441304675, + "flos": 20010754823040.0, + "grad_norm": 4.127685761879871, + "language_loss": 0.80388296, + "learning_rate": 2.806862131772779e-06, + "loss": 0.82574594, + "num_input_tokens_seen": 138130480, + "router_z_loss_clip": 0.88769531, + "router_z_loss_mlp": 0.15820312, + "step": 6433, + "time_per_iteration": 2.534745454788208 + }, + { + "auxiliary_loss_clip": 0.01142546, + "auxiliary_loss_mlp": 0.01039452, + "balance_loss_clip": 1.05721092, + "balance_loss_mlp": 1.02396047, + "epoch": 0.3868330076657147, + "flos": 22236893147520.0, + "grad_norm": 2.209591815164893, + "language_loss": 0.7078681, + "learning_rate": 2.806505755127765e-06, + "loss": 0.72968805, + "num_input_tokens_seen": 138150640, + "router_z_loss_clip": 0.85302734, + "router_z_loss_mlp": 0.15509033, + "step": 6434, + "time_per_iteration": 4.0019800662994385 + }, + { + "auxiliary_loss_clip": 0.0114283, + "auxiliary_loss_mlp": 0.01040279, + "balance_loss_clip": 1.05569911, + "balance_loss_mlp": 1.02525246, + "epoch": 0.3868931309183827, + "flos": 16727442387840.0, + "grad_norm": 1.7427073830864834, + "language_loss": 0.77439928, + "learning_rate": 2.806149347899972e-06, + "loss": 0.79623032, + "num_input_tokens_seen": 138169700, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.15039062, + "step": 6435, + "time_per_iteration": 2.4322526454925537 + }, + { + "auxiliary_loss_clip": 0.01140557, + "auxiliary_loss_mlp": 0.01035521, + "balance_loss_clip": 1.05749381, + "balance_loss_mlp": 1.02109659, + "epoch": 0.38695325417105064, + "flos": 22674716023680.0, + "grad_norm": 1.814224582414787, + "language_loss": 0.79704762, + "learning_rate": 2.805792910102915e-06, + "loss": 0.81880844, + "num_input_tokens_seen": 138185835, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 0.14416504, + "step": 6436, + "time_per_iteration": 2.504984140396118 + }, + { + "auxiliary_loss_clip": 0.01139079, + "auxiliary_loss_mlp": 0.01033225, + "balance_loss_clip": 1.05792499, + "balance_loss_mlp": 1.01847911, + "epoch": 0.3870133774237186, + "flos": 23112036109440.0, + "grad_norm": 1.8398868895490814, + "language_loss": 0.77044827, + "learning_rate": 2.8054364417501093e-06, + "loss": 0.79217136, + "num_input_tokens_seen": 138204080, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.1473999, + "step": 6437, + "time_per_iteration": 2.45933198928833 + }, + { + "auxiliary_loss_clip": 0.01137946, + "auxiliary_loss_mlp": 0.01038504, + "balance_loss_clip": 1.05750322, + "balance_loss_mlp": 1.02503359, + "epoch": 0.3870735006763866, + "flos": 17675699483520.0, + "grad_norm": 2.0654611287068434, + "language_loss": 0.82016927, + "learning_rate": 2.805079942855074e-06, + "loss": 0.84193373, + "num_input_tokens_seen": 138220710, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.13470459, + "step": 6438, + "time_per_iteration": 2.455994129180908 + }, + { + "auxiliary_loss_clip": 0.01139907, + "auxiliary_loss_mlp": 0.01038382, + "balance_loss_clip": 1.05528975, + "balance_loss_mlp": 1.02276003, + "epoch": 0.38713362392905454, + "flos": 23295791111040.0, + "grad_norm": 1.426973153620346, + "language_loss": 0.75464773, + "learning_rate": 2.804723413431326e-06, + "loss": 0.77643061, + "num_input_tokens_seen": 138241720, + "router_z_loss_clip": 0.84667969, + "router_z_loss_mlp": 0.15618896, + "step": 6439, + "time_per_iteration": 2.439293384552002 + }, + { + "auxiliary_loss_clip": 0.01146089, + "auxiliary_loss_mlp": 0.01034205, + "balance_loss_clip": 1.0656743, + "balance_loss_mlp": 1.02026916, + "epoch": 0.38719374718172256, + "flos": 21031192298880.0, + "grad_norm": 1.4415354825958404, + "language_loss": 0.73879838, + "learning_rate": 2.8043668534923855e-06, + "loss": 0.76060128, + "num_input_tokens_seen": 138261885, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.1394043, + "step": 6440, + "time_per_iteration": 2.4968676567077637 + }, + { + "auxiliary_loss_clip": 0.01153697, + "auxiliary_loss_mlp": 0.01040418, + "balance_loss_clip": 1.06620169, + "balance_loss_mlp": 1.02486074, + "epoch": 0.3872538704343905, + "flos": 19609776322560.0, + "grad_norm": 1.9264612480004848, + "language_loss": 0.8224138, + "learning_rate": 2.804010263051774e-06, + "loss": 0.84435493, + "num_input_tokens_seen": 138280255, + "router_z_loss_clip": 0.87304688, + "router_z_loss_mlp": 0.15545654, + "step": 6441, + "time_per_iteration": 2.732832670211792 + }, + { + "auxiliary_loss_clip": 0.01135877, + "auxiliary_loss_mlp": 0.01043561, + "balance_loss_clip": 1.05138588, + "balance_loss_mlp": 1.02964926, + "epoch": 0.3873139936870585, + "flos": 17530045833600.0, + "grad_norm": 2.2276739978148195, + "language_loss": 0.81226516, + "learning_rate": 2.8036536421230118e-06, + "loss": 0.83405954, + "num_input_tokens_seen": 138296675, + "router_z_loss_clip": 0.84570312, + "router_z_loss_mlp": 0.13897705, + "step": 6442, + "time_per_iteration": 2.4534788131713867 + }, + { + "auxiliary_loss_clip": 0.0114518, + "auxiliary_loss_mlp": 0.01035422, + "balance_loss_clip": 1.06018662, + "balance_loss_mlp": 1.0204134, + "epoch": 0.38737411693972645, + "flos": 17786555832960.0, + "grad_norm": 2.1811396696694167, + "language_loss": 0.83773953, + "learning_rate": 2.803296990719624e-06, + "loss": 0.85954559, + "num_input_tokens_seen": 138314985, + "router_z_loss_clip": 0.85009766, + "router_z_loss_mlp": 0.15020752, + "step": 6443, + "time_per_iteration": 2.4673099517822266 + }, + { + "auxiliary_loss_clip": 0.01068666, + "auxiliary_loss_mlp": 0.01009157, + "balance_loss_clip": 1.04011345, + "balance_loss_mlp": 1.00750339, + "epoch": 0.3874342401923944, + "flos": 58304637048960.0, + "grad_norm": 0.756963856891182, + "language_loss": 0.50245392, + "learning_rate": 2.8029403088551327e-06, + "loss": 0.5232321, + "num_input_tokens_seen": 138373275, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.01652527, + "step": 6444, + "time_per_iteration": 3.0882980823516846 + }, + { + "auxiliary_loss_clip": 0.01138153, + "auxiliary_loss_mlp": 0.01032212, + "balance_loss_clip": 1.05730391, + "balance_loss_mlp": 1.01852703, + "epoch": 0.3874943634450624, + "flos": 17711933328000.0, + "grad_norm": 1.3962472319545702, + "language_loss": 0.79040527, + "learning_rate": 2.802583596543065e-06, + "loss": 0.81210893, + "num_input_tokens_seen": 138391145, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.13684082, + "step": 6445, + "time_per_iteration": 2.4264140129089355 + }, + { + "auxiliary_loss_clip": 0.01133676, + "auxiliary_loss_mlp": 0.01033171, + "balance_loss_clip": 1.05167377, + "balance_loss_mlp": 1.01897359, + "epoch": 0.38755448669773035, + "flos": 19244852098560.0, + "grad_norm": 1.7672882850274712, + "language_loss": 0.81302059, + "learning_rate": 2.8022268537969474e-06, + "loss": 0.83468908, + "num_input_tokens_seen": 138409875, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.14202881, + "step": 6446, + "time_per_iteration": 2.4731574058532715 + }, + { + "auxiliary_loss_clip": 0.01147852, + "auxiliary_loss_mlp": 0.01038289, + "balance_loss_clip": 1.06578946, + "balance_loss_mlp": 1.02376294, + "epoch": 0.3876146099503983, + "flos": 20594267262720.0, + "grad_norm": 1.6996538536628758, + "language_loss": 0.77109849, + "learning_rate": 2.801870080630306e-06, + "loss": 0.79295987, + "num_input_tokens_seen": 138428965, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.14508057, + "step": 6447, + "time_per_iteration": 2.4689853191375732 + }, + { + "auxiliary_loss_clip": 0.01137522, + "auxiliary_loss_mlp": 0.01037236, + "balance_loss_clip": 1.05530119, + "balance_loss_mlp": 1.02322245, + "epoch": 0.3876747332030663, + "flos": 19281121856640.0, + "grad_norm": 1.7417417072182242, + "language_loss": 0.76263011, + "learning_rate": 2.801513277056671e-06, + "loss": 0.78437763, + "num_input_tokens_seen": 138448090, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.14007568, + "step": 6448, + "time_per_iteration": 2.5134379863739014 + }, + { + "auxiliary_loss_clip": 0.01139523, + "auxiliary_loss_mlp": 0.01037266, + "balance_loss_clip": 1.05570757, + "balance_loss_mlp": 1.02281165, + "epoch": 0.38773485645573424, + "flos": 18945895201920.0, + "grad_norm": 1.556115341697855, + "language_loss": 0.76302361, + "learning_rate": 2.8011564430895725e-06, + "loss": 0.78479153, + "num_input_tokens_seen": 138466105, + "router_z_loss_clip": 0.83740234, + "router_z_loss_mlp": 0.14447021, + "step": 6449, + "time_per_iteration": 2.530426502227783 + }, + { + "auxiliary_loss_clip": 0.01140408, + "auxiliary_loss_mlp": 0.01044872, + "balance_loss_clip": 1.05545783, + "balance_loss_mlp": 1.02918386, + "epoch": 0.3877949797084022, + "flos": 23071348978560.0, + "grad_norm": 1.5986462803265353, + "language_loss": 0.78767765, + "learning_rate": 2.800799578742542e-06, + "loss": 0.80953044, + "num_input_tokens_seen": 138485160, + "router_z_loss_clip": 0.84912109, + "router_z_loss_mlp": 0.15698242, + "step": 6450, + "time_per_iteration": 2.527825117111206 + }, + { + "auxiliary_loss_clip": 0.01138412, + "auxiliary_loss_mlp": 0.01038049, + "balance_loss_clip": 1.05150366, + "balance_loss_mlp": 1.02295756, + "epoch": 0.3878551029610702, + "flos": 29095543589760.0, + "grad_norm": 2.2610477501244692, + "language_loss": 0.77664554, + "learning_rate": 2.8004426840291106e-06, + "loss": 0.79841018, + "num_input_tokens_seen": 138504135, + "router_z_loss_clip": 0.86865234, + "router_z_loss_mlp": 0.15087891, + "step": 6451, + "time_per_iteration": 2.5545620918273926 + }, + { + "auxiliary_loss_clip": 0.0113679, + "auxiliary_loss_mlp": 0.01032924, + "balance_loss_clip": 1.05626285, + "balance_loss_mlp": 1.01905978, + "epoch": 0.38791522621373814, + "flos": 20996394998400.0, + "grad_norm": 1.7570915047389937, + "language_loss": 0.76598918, + "learning_rate": 2.800085758962812e-06, + "loss": 0.78768629, + "num_input_tokens_seen": 138523955, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.13891602, + "step": 6452, + "time_per_iteration": 3.9857425689697266 + }, + { + "auxiliary_loss_clip": 0.01133163, + "auxiliary_loss_mlp": 0.01045616, + "balance_loss_clip": 1.05191422, + "balance_loss_mlp": 1.03188252, + "epoch": 0.3879753494664061, + "flos": 15486836497920.0, + "grad_norm": 1.6750191165398782, + "language_loss": 0.79667932, + "learning_rate": 2.799728803557182e-06, + "loss": 0.81846714, + "num_input_tokens_seen": 138541655, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.13745117, + "step": 6453, + "time_per_iteration": 2.4928624629974365 + }, + { + "auxiliary_loss_clip": 0.01140262, + "auxiliary_loss_mlp": 0.0104314, + "balance_loss_clip": 1.05262935, + "balance_loss_mlp": 1.02736819, + "epoch": 0.3880354727190741, + "flos": 22053964158720.0, + "grad_norm": 1.8998010838882733, + "language_loss": 0.71471834, + "learning_rate": 2.7993718178257555e-06, + "loss": 0.73655236, + "num_input_tokens_seen": 138560860, + "router_z_loss_clip": 0.87695312, + "router_z_loss_mlp": 0.15771484, + "step": 6454, + "time_per_iteration": 2.6983413696289062 + }, + { + "auxiliary_loss_clip": 0.01135364, + "auxiliary_loss_mlp": 0.01051286, + "balance_loss_clip": 1.04979527, + "balance_loss_mlp": 1.03278506, + "epoch": 0.3880955959717421, + "flos": 20340307128960.0, + "grad_norm": 1.7590433013673064, + "language_loss": 0.77886063, + "learning_rate": 2.7990148017820694e-06, + "loss": 0.80072713, + "num_input_tokens_seen": 138580200, + "router_z_loss_clip": 0.85546875, + "router_z_loss_mlp": 0.18493652, + "step": 6455, + "time_per_iteration": 2.5121898651123047 + }, + { + "auxiliary_loss_clip": 0.01135914, + "auxiliary_loss_mlp": 0.01046343, + "balance_loss_clip": 1.05266201, + "balance_loss_mlp": 1.03132868, + "epoch": 0.38815571922441006, + "flos": 23075407215360.0, + "grad_norm": 1.477822293245643, + "language_loss": 0.75918818, + "learning_rate": 2.798657755439662e-06, + "loss": 0.78101075, + "num_input_tokens_seen": 138598315, + "router_z_loss_clip": 0.83251953, + "router_z_loss_mlp": 0.15008545, + "step": 6456, + "time_per_iteration": 2.4516282081604004 + }, + { + "auxiliary_loss_clip": 0.01137156, + "auxiliary_loss_mlp": 0.01037413, + "balance_loss_clip": 1.05286837, + "balance_loss_mlp": 1.02260721, + "epoch": 0.388215842477078, + "flos": 20776944856320.0, + "grad_norm": 2.0992902654812404, + "language_loss": 0.59770918, + "learning_rate": 2.7983006788120726e-06, + "loss": 0.61945486, + "num_input_tokens_seen": 138615695, + "router_z_loss_clip": 0.84326172, + "router_z_loss_mlp": 0.14807129, + "step": 6457, + "time_per_iteration": 2.4682250022888184 + }, + { + "auxiliary_loss_clip": 0.01144631, + "auxiliary_loss_mlp": 0.01038033, + "balance_loss_clip": 1.05618596, + "balance_loss_mlp": 1.02167773, + "epoch": 0.388275965729746, + "flos": 20448182649600.0, + "grad_norm": 2.4556270338832045, + "language_loss": 0.80046535, + "learning_rate": 2.797943571912841e-06, + "loss": 0.82229197, + "num_input_tokens_seen": 138633180, + "router_z_loss_clip": 0.88525391, + "router_z_loss_mlp": 0.16357422, + "step": 6458, + "time_per_iteration": 2.411590814590454 + }, + { + "auxiliary_loss_clip": 0.01143511, + "auxiliary_loss_mlp": 0.01043202, + "balance_loss_clip": 1.05427694, + "balance_loss_mlp": 1.02849758, + "epoch": 0.38833608898241395, + "flos": 27892392606720.0, + "grad_norm": 1.781334357581545, + "language_loss": 0.81913161, + "learning_rate": 2.797586434755509e-06, + "loss": 0.84099877, + "num_input_tokens_seen": 138654785, + "router_z_loss_clip": 0.89208984, + "router_z_loss_mlp": 0.14691162, + "step": 6459, + "time_per_iteration": 2.5209157466888428 + }, + { + "auxiliary_loss_clip": 0.01138615, + "auxiliary_loss_mlp": 0.01037142, + "balance_loss_clip": 1.05807543, + "balance_loss_mlp": 1.02302766, + "epoch": 0.3883962122350819, + "flos": 18076390675200.0, + "grad_norm": 1.6626895002737707, + "language_loss": 0.61911929, + "learning_rate": 2.7972292673536202e-06, + "loss": 0.64087683, + "num_input_tokens_seen": 138673330, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.14111328, + "step": 6460, + "time_per_iteration": 3.862637758255005 + }, + { + "auxiliary_loss_clip": 0.01145972, + "auxiliary_loss_mlp": 0.01034288, + "balance_loss_clip": 1.0625757, + "balance_loss_mlp": 1.02088308, + "epoch": 0.3884563354877499, + "flos": 23622254847360.0, + "grad_norm": 1.6709173414033065, + "language_loss": 0.86187077, + "learning_rate": 2.796872069720717e-06, + "loss": 0.88367343, + "num_input_tokens_seen": 138694185, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.13391113, + "step": 6461, + "time_per_iteration": 2.5001587867736816 + }, + { + "auxiliary_loss_clip": 0.01153325, + "auxiliary_loss_mlp": 0.01034766, + "balance_loss_clip": 1.06460953, + "balance_loss_mlp": 1.0203594, + "epoch": 0.38851645874041785, + "flos": 27453528236160.0, + "grad_norm": 2.2454566870941814, + "language_loss": 0.71102393, + "learning_rate": 2.7965148418703456e-06, + "loss": 0.73290485, + "num_input_tokens_seen": 138714625, + "router_z_loss_clip": 0.88769531, + "router_z_loss_mlp": 0.14404297, + "step": 6462, + "time_per_iteration": 2.4898722171783447 + }, + { + "auxiliary_loss_clip": 0.01141246, + "auxiliary_loss_mlp": 0.01034622, + "balance_loss_clip": 1.0563736, + "balance_loss_mlp": 1.02006686, + "epoch": 0.3885765819930858, + "flos": 25228072270080.0, + "grad_norm": 2.0760087657233606, + "language_loss": 0.75776452, + "learning_rate": 2.796157583816052e-06, + "loss": 0.77952325, + "num_input_tokens_seen": 138733585, + "router_z_loss_clip": 0.84912109, + "router_z_loss_mlp": 0.14550781, + "step": 6463, + "time_per_iteration": 2.481349229812622 + }, + { + "auxiliary_loss_clip": 0.01146516, + "auxiliary_loss_mlp": 0.01040065, + "balance_loss_clip": 1.06107736, + "balance_loss_mlp": 1.02432358, + "epoch": 0.3886367052457538, + "flos": 16946605221120.0, + "grad_norm": 2.0011909027845247, + "language_loss": 0.70665133, + "learning_rate": 2.795800295571382e-06, + "loss": 0.72851717, + "num_input_tokens_seen": 138752335, + "router_z_loss_clip": 0.85449219, + "router_z_loss_mlp": 0.15753174, + "step": 6464, + "time_per_iteration": 2.435499906539917 + }, + { + "auxiliary_loss_clip": 0.01138983, + "auxiliary_loss_mlp": 0.01031594, + "balance_loss_clip": 1.0579474, + "balance_loss_mlp": 1.01662707, + "epoch": 0.38869682849842174, + "flos": 27154140376320.0, + "grad_norm": 2.9088489579126637, + "language_loss": 0.69424748, + "learning_rate": 2.7954429771498858e-06, + "loss": 0.71595323, + "num_input_tokens_seen": 138768450, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.14978027, + "step": 6465, + "time_per_iteration": 2.493744373321533 + }, + { + "auxiliary_loss_clip": 0.01147502, + "auxiliary_loss_mlp": 0.01037711, + "balance_loss_clip": 1.06386864, + "balance_loss_mlp": 1.02176106, + "epoch": 0.3887569517510897, + "flos": 21063619301760.0, + "grad_norm": 2.1270098617995523, + "language_loss": 0.78035903, + "learning_rate": 2.7950856285651117e-06, + "loss": 0.80221117, + "num_input_tokens_seen": 138786775, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.1595459, + "step": 6466, + "time_per_iteration": 2.4792063236236572 + }, + { + "auxiliary_loss_clip": 0.0115058, + "auxiliary_loss_mlp": 0.01040242, + "balance_loss_clip": 1.0646925, + "balance_loss_mlp": 1.02549613, + "epoch": 0.38881707500375773, + "flos": 29497384016640.0, + "grad_norm": 1.4888194605716079, + "language_loss": 0.69388658, + "learning_rate": 2.794728249830611e-06, + "loss": 0.7157948, + "num_input_tokens_seen": 138810100, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.14752197, + "step": 6467, + "time_per_iteration": 2.736842155456543 + }, + { + "auxiliary_loss_clip": 0.01139881, + "auxiliary_loss_mlp": 0.01040008, + "balance_loss_clip": 1.05334759, + "balance_loss_mlp": 1.02414083, + "epoch": 0.3888771982564257, + "flos": 17488281294720.0, + "grad_norm": 2.329531061301831, + "language_loss": 0.83154815, + "learning_rate": 2.794370840959936e-06, + "loss": 0.85334706, + "num_input_tokens_seen": 138825140, + "router_z_loss_clip": 0.86523438, + "router_z_loss_mlp": 0.15869141, + "step": 6468, + "time_per_iteration": 2.435183048248291 + }, + { + "auxiliary_loss_clip": 0.01141069, + "auxiliary_loss_mlp": 0.01037473, + "balance_loss_clip": 1.05709672, + "balance_loss_mlp": 1.02377033, + "epoch": 0.38893732150909366, + "flos": 21942425450880.0, + "grad_norm": 1.9187114085251327, + "language_loss": 0.84361178, + "learning_rate": 2.7940134019666383e-06, + "loss": 0.86539721, + "num_input_tokens_seen": 138844115, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.13708496, + "step": 6469, + "time_per_iteration": 2.4667136669158936 + }, + { + "auxiliary_loss_clip": 0.01141911, + "auxiliary_loss_mlp": 0.01036956, + "balance_loss_clip": 1.05992377, + "balance_loss_mlp": 1.02210236, + "epoch": 0.3889974447617616, + "flos": 24276367468800.0, + "grad_norm": 1.905682604566291, + "language_loss": 0.74718511, + "learning_rate": 2.793655932864273e-06, + "loss": 0.76897383, + "num_input_tokens_seen": 138860860, + "router_z_loss_clip": 0.81982422, + "router_z_loss_mlp": 0.14855957, + "step": 6470, + "time_per_iteration": 2.505600929260254 + }, + { + "auxiliary_loss_clip": 0.01142149, + "auxiliary_loss_mlp": 0.01039356, + "balance_loss_clip": 1.05609202, + "balance_loss_mlp": 1.02319074, + "epoch": 0.3890575680144296, + "flos": 25667116208640.0, + "grad_norm": 1.537547423131066, + "language_loss": 0.74838388, + "learning_rate": 2.7932984336663953e-06, + "loss": 0.77019894, + "num_input_tokens_seen": 138881910, + "router_z_loss_clip": 0.86035156, + "router_z_loss_mlp": 0.16162109, + "step": 6471, + "time_per_iteration": 2.5324883460998535 + }, + { + "auxiliary_loss_clip": 0.01141897, + "auxiliary_loss_mlp": 0.01041206, + "balance_loss_clip": 1.05944133, + "balance_loss_mlp": 1.02603102, + "epoch": 0.38911769126709755, + "flos": 22855274714880.0, + "grad_norm": 2.4513520990123694, + "language_loss": 0.67732722, + "learning_rate": 2.792940904386562e-06, + "loss": 0.69915831, + "num_input_tokens_seen": 138900975, + "router_z_loss_clip": 0.82470703, + "router_z_loss_mlp": 0.1517334, + "step": 6472, + "time_per_iteration": 3.97680401802063 + }, + { + "auxiliary_loss_clip": 0.01137047, + "auxiliary_loss_mlp": 0.01040745, + "balance_loss_clip": 1.05479515, + "balance_loss_mlp": 1.02675557, + "epoch": 0.3891778145197655, + "flos": 25447522412160.0, + "grad_norm": 1.7556924762822204, + "language_loss": 0.76441205, + "learning_rate": 2.7925833450383293e-06, + "loss": 0.78619003, + "num_input_tokens_seen": 138920795, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.13989258, + "step": 6473, + "time_per_iteration": 2.5608327388763428 + }, + { + "auxiliary_loss_clip": 0.01146268, + "auxiliary_loss_mlp": 0.01046817, + "balance_loss_clip": 1.05969012, + "balance_loss_mlp": 1.03112948, + "epoch": 0.3892379377724335, + "flos": 14027965614720.0, + "grad_norm": 1.920358980801505, + "language_loss": 0.70666826, + "learning_rate": 2.792225755635257e-06, + "loss": 0.72859913, + "num_input_tokens_seen": 138938770, + "router_z_loss_clip": 0.86572266, + "router_z_loss_mlp": 0.15686035, + "step": 6474, + "time_per_iteration": 2.5376698970794678 + }, + { + "auxiliary_loss_clip": 0.01141809, + "auxiliary_loss_mlp": 0.01038326, + "balance_loss_clip": 1.0587765, + "balance_loss_mlp": 1.0244025, + "epoch": 0.38929806102510145, + "flos": 20157449967360.0, + "grad_norm": 2.2263555271632067, + "language_loss": 0.68962801, + "learning_rate": 2.7918681361909046e-06, + "loss": 0.71142936, + "num_input_tokens_seen": 138958880, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.13909912, + "step": 6475, + "time_per_iteration": 2.494208335876465 + }, + { + "auxiliary_loss_clip": 0.01142214, + "auxiliary_loss_mlp": 0.01048988, + "balance_loss_clip": 1.05415249, + "balance_loss_mlp": 1.03175056, + "epoch": 0.3893581842777694, + "flos": 22163958581760.0, + "grad_norm": 1.9201767156884322, + "language_loss": 0.7591151, + "learning_rate": 2.7915104867188332e-06, + "loss": 0.78102708, + "num_input_tokens_seen": 138977240, + "router_z_loss_clip": 0.88085938, + "router_z_loss_mlp": 0.17224121, + "step": 6476, + "time_per_iteration": 2.535958766937256 + }, + { + "auxiliary_loss_clip": 0.01078433, + "auxiliary_loss_mlp": 0.01003591, + "balance_loss_clip": 1.04904246, + "balance_loss_mlp": 1.00170422, + "epoch": 0.3894183075304374, + "flos": 67301877392640.0, + "grad_norm": 0.7866450360121584, + "language_loss": 0.58266485, + "learning_rate": 2.7911528072326055e-06, + "loss": 0.60348511, + "num_input_tokens_seen": 139039035, + "router_z_loss_clip": 0.29394531, + "router_z_loss_mlp": 0.01882935, + "step": 6477, + "time_per_iteration": 3.053223133087158 + }, + { + "auxiliary_loss_clip": 0.01136951, + "auxiliary_loss_mlp": 0.01040972, + "balance_loss_clip": 1.05353868, + "balance_loss_mlp": 1.02476573, + "epoch": 0.38947843078310534, + "flos": 18547502480640.0, + "grad_norm": 1.8715768955892347, + "language_loss": 0.78051496, + "learning_rate": 2.7907950977457832e-06, + "loss": 0.80229419, + "num_input_tokens_seen": 139055560, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.16204834, + "step": 6478, + "time_per_iteration": 2.5272257328033447 + }, + { + "auxiliary_loss_clip": 0.01133498, + "auxiliary_loss_mlp": 0.01041247, + "balance_loss_clip": 1.05210733, + "balance_loss_mlp": 1.02696609, + "epoch": 0.3895385540357733, + "flos": 14605875532800.0, + "grad_norm": 2.2798692672064025, + "language_loss": 0.82423836, + "learning_rate": 2.7904373582719317e-06, + "loss": 0.84598577, + "num_input_tokens_seen": 139071865, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.14300537, + "step": 6479, + "time_per_iteration": 3.9883363246917725 + }, + { + "auxiliary_loss_clip": 0.01128921, + "auxiliary_loss_mlp": 0.01038449, + "balance_loss_clip": 1.04645824, + "balance_loss_mlp": 1.02335691, + "epoch": 0.38959867728844133, + "flos": 19975203336960.0, + "grad_norm": 1.748724493478799, + "language_loss": 0.80348527, + "learning_rate": 2.790079588824617e-06, + "loss": 0.82515901, + "num_input_tokens_seen": 139089640, + "router_z_loss_clip": 0.82421875, + "router_z_loss_mlp": 0.15087891, + "step": 6480, + "time_per_iteration": 2.7065634727478027 + }, + { + "auxiliary_loss_clip": 0.01128913, + "auxiliary_loss_mlp": 0.01039093, + "balance_loss_clip": 1.04816282, + "balance_loss_mlp": 1.02469254, + "epoch": 0.3896588005411093, + "flos": 22672130244480.0, + "grad_norm": 1.657275054200409, + "language_loss": 0.83086967, + "learning_rate": 2.7897217894174038e-06, + "loss": 0.85254967, + "num_input_tokens_seen": 139109365, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.14404297, + "step": 6481, + "time_per_iteration": 2.5414247512817383 + }, + { + "auxiliary_loss_clip": 0.01127113, + "auxiliary_loss_mlp": 0.01040547, + "balance_loss_clip": 1.0486511, + "balance_loss_mlp": 1.02540755, + "epoch": 0.38971892379377726, + "flos": 20996035862400.0, + "grad_norm": 1.6166311630861772, + "language_loss": 0.75698006, + "learning_rate": 2.789363960063863e-06, + "loss": 0.77865672, + "num_input_tokens_seen": 139128260, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.15136719, + "step": 6482, + "time_per_iteration": 2.5122182369232178 + }, + { + "auxiliary_loss_clip": 0.01139309, + "auxiliary_loss_mlp": 0.01048528, + "balance_loss_clip": 1.05298245, + "balance_loss_mlp": 1.03340685, + "epoch": 0.3897790470464452, + "flos": 22528487756160.0, + "grad_norm": 2.3454463609615814, + "language_loss": 0.78862345, + "learning_rate": 2.78900610077756e-06, + "loss": 0.81050187, + "num_input_tokens_seen": 139147315, + "router_z_loss_clip": 0.86328125, + "router_z_loss_mlp": 0.15124512, + "step": 6483, + "time_per_iteration": 2.4957408905029297 + }, + { + "auxiliary_loss_clip": 0.01137719, + "auxiliary_loss_mlp": 0.01036097, + "balance_loss_clip": 1.05145788, + "balance_loss_mlp": 1.02013457, + "epoch": 0.3898391702991132, + "flos": 26209905603840.0, + "grad_norm": 1.5651572973139538, + "language_loss": 0.80395091, + "learning_rate": 2.788648211572067e-06, + "loss": 0.82568902, + "num_input_tokens_seen": 139167270, + "router_z_loss_clip": 0.86230469, + "router_z_loss_mlp": 0.15966797, + "step": 6484, + "time_per_iteration": 2.524275779724121 + }, + { + "auxiliary_loss_clip": 0.01146838, + "auxiliary_loss_mlp": 0.01050402, + "balance_loss_clip": 1.06209946, + "balance_loss_mlp": 1.03409433, + "epoch": 0.38989929355178116, + "flos": 21065558636160.0, + "grad_norm": 1.6623023003050763, + "language_loss": 0.77627879, + "learning_rate": 2.7882902924609557e-06, + "loss": 0.79825115, + "num_input_tokens_seen": 139185970, + "router_z_loss_clip": 0.84716797, + "router_z_loss_mlp": 0.16296387, + "step": 6485, + "time_per_iteration": 2.4638679027557373 + }, + { + "auxiliary_loss_clip": 0.01146436, + "auxiliary_loss_mlp": 0.01040391, + "balance_loss_clip": 1.06167459, + "balance_loss_mlp": 1.0248704, + "epoch": 0.3899594168044491, + "flos": 25484115392640.0, + "grad_norm": 2.657886409316022, + "language_loss": 0.85532439, + "learning_rate": 2.7879323434577965e-06, + "loss": 0.87719262, + "num_input_tokens_seen": 139203730, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.1552124, + "step": 6486, + "time_per_iteration": 2.465294599533081 + }, + { + "auxiliary_loss_clip": 0.01142728, + "auxiliary_loss_mlp": 0.01036356, + "balance_loss_clip": 1.05565619, + "balance_loss_mlp": 1.02186656, + "epoch": 0.3900195400571171, + "flos": 31139363456640.0, + "grad_norm": 1.760101131064427, + "language_loss": 0.8529551, + "learning_rate": 2.7875743645761645e-06, + "loss": 0.87474597, + "num_input_tokens_seen": 139222560, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.14483643, + "step": 6487, + "time_per_iteration": 2.620579719543457 + }, + { + "auxiliary_loss_clip": 0.01141633, + "auxiliary_loss_mlp": 0.01036192, + "balance_loss_clip": 1.05968857, + "balance_loss_mlp": 1.0210886, + "epoch": 0.39007966330978505, + "flos": 20229917656320.0, + "grad_norm": 1.5629846185448455, + "language_loss": 0.73100597, + "learning_rate": 2.787216355829633e-06, + "loss": 0.75278419, + "num_input_tokens_seen": 139242165, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.15100098, + "step": 6488, + "time_per_iteration": 2.461069345474243 + }, + { + "auxiliary_loss_clip": 0.01149454, + "auxiliary_loss_mlp": 0.01040423, + "balance_loss_clip": 1.06292748, + "balance_loss_mlp": 1.02519393, + "epoch": 0.390139786562453, + "flos": 22528739151360.0, + "grad_norm": 2.411396313071018, + "language_loss": 0.68986762, + "learning_rate": 2.786858317231779e-06, + "loss": 0.71176648, + "num_input_tokens_seen": 139262525, + "router_z_loss_clip": 0.86523438, + "router_z_loss_mlp": 0.15234375, + "step": 6489, + "time_per_iteration": 2.507610321044922 + }, + { + "auxiliary_loss_clip": 0.01133489, + "auxiliary_loss_mlp": 0.01040678, + "balance_loss_clip": 1.05192971, + "balance_loss_mlp": 1.02651548, + "epoch": 0.390199909815121, + "flos": 26432911192320.0, + "grad_norm": 1.6284409327199187, + "language_loss": 0.80908751, + "learning_rate": 2.7865002487961788e-06, + "loss": 0.83082926, + "num_input_tokens_seen": 139282835, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.14160156, + "step": 6490, + "time_per_iteration": 2.5041873455047607 + }, + { + "auxiliary_loss_clip": 0.01152768, + "auxiliary_loss_mlp": 0.01040209, + "balance_loss_clip": 1.0645256, + "balance_loss_mlp": 1.02463996, + "epoch": 0.39026003306778895, + "flos": 17274577328640.0, + "grad_norm": 2.9206653875346107, + "language_loss": 0.89275539, + "learning_rate": 2.7861421505364104e-06, + "loss": 0.91468519, + "num_input_tokens_seen": 139299490, + "router_z_loss_clip": 0.88183594, + "router_z_loss_mlp": 0.15563965, + "step": 6491, + "time_per_iteration": 2.460820436477661 + }, + { + "auxiliary_loss_clip": 0.01129747, + "auxiliary_loss_mlp": 0.01047528, + "balance_loss_clip": 1.04681063, + "balance_loss_mlp": 1.03131604, + "epoch": 0.3903201563204569, + "flos": 24532841554560.0, + "grad_norm": 2.101818619430747, + "language_loss": 0.78904569, + "learning_rate": 2.7857840224660523e-06, + "loss": 0.81081849, + "num_input_tokens_seen": 139317865, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.16223145, + "step": 6492, + "time_per_iteration": 2.521601438522339 + }, + { + "auxiliary_loss_clip": 0.01139018, + "auxiliary_loss_mlp": 0.01042354, + "balance_loss_clip": 1.0540098, + "balance_loss_mlp": 1.02655911, + "epoch": 0.39038027957312493, + "flos": 23767944410880.0, + "grad_norm": 1.6468486221737144, + "language_loss": 0.74431831, + "learning_rate": 2.7854258645986857e-06, + "loss": 0.76613206, + "num_input_tokens_seen": 139339840, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.15808105, + "step": 6493, + "time_per_iteration": 2.717024564743042 + }, + { + "auxiliary_loss_clip": 0.01149061, + "auxiliary_loss_mlp": 0.01039698, + "balance_loss_clip": 1.057531, + "balance_loss_mlp": 1.02441502, + "epoch": 0.3904404028257929, + "flos": 14100612871680.0, + "grad_norm": 2.2109090204789545, + "language_loss": 0.76155978, + "learning_rate": 2.7850676769478916e-06, + "loss": 0.78344738, + "num_input_tokens_seen": 139357555, + "router_z_loss_clip": 0.91503906, + "router_z_loss_mlp": 0.1529541, + "step": 6494, + "time_per_iteration": 2.5546109676361084 + }, + { + "auxiliary_loss_clip": 0.01145802, + "auxiliary_loss_mlp": 0.01047522, + "balance_loss_clip": 1.0518899, + "balance_loss_mlp": 1.03084433, + "epoch": 0.39050052607846086, + "flos": 16910048154240.0, + "grad_norm": 2.083240163658723, + "language_loss": 0.74519819, + "learning_rate": 2.7847094595272525e-06, + "loss": 0.76713145, + "num_input_tokens_seen": 139374455, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.16662598, + "step": 6495, + "time_per_iteration": 2.4258713722229004 + }, + { + "auxiliary_loss_clip": 0.01142359, + "auxiliary_loss_mlp": 0.0104303, + "balance_loss_clip": 1.05809784, + "balance_loss_mlp": 1.02692509, + "epoch": 0.39056064933112883, + "flos": 25915761129600.0, + "grad_norm": 5.212833999929651, + "language_loss": 0.68274593, + "learning_rate": 2.784351212350352e-06, + "loss": 0.7045998, + "num_input_tokens_seen": 139394770, + "router_z_loss_clip": 0.84277344, + "router_z_loss_mlp": 0.16107178, + "step": 6496, + "time_per_iteration": 4.007100820541382 + }, + { + "auxiliary_loss_clip": 0.01067072, + "auxiliary_loss_mlp": 0.01016553, + "balance_loss_clip": 1.03695726, + "balance_loss_mlp": 1.01475549, + "epoch": 0.3906207725837968, + "flos": 60028421713920.0, + "grad_norm": 0.6586996136162587, + "language_loss": 0.53978026, + "learning_rate": 2.783992935430775e-06, + "loss": 0.56061655, + "num_input_tokens_seen": 139454760, + "router_z_loss_clip": 0.30126953, + "router_z_loss_mlp": 0.01794434, + "step": 6497, + "time_per_iteration": 3.205495834350586 + }, + { + "auxiliary_loss_clip": 0.01137928, + "auxiliary_loss_mlp": 0.0103488, + "balance_loss_clip": 1.05282891, + "balance_loss_mlp": 1.0199132, + "epoch": 0.39068089583646476, + "flos": 21068683119360.0, + "grad_norm": 2.593991556122208, + "language_loss": 0.69098437, + "learning_rate": 2.7836346287821068e-06, + "loss": 0.71271241, + "num_input_tokens_seen": 139472645, + "router_z_loss_clip": 0.85058594, + "router_z_loss_mlp": 0.14953613, + "step": 6498, + "time_per_iteration": 2.5041956901550293 + }, + { + "auxiliary_loss_clip": 0.01073504, + "auxiliary_loss_mlp": 0.0101416, + "balance_loss_clip": 1.0441128, + "balance_loss_mlp": 1.01235378, + "epoch": 0.3907410190891327, + "flos": 70445677403520.0, + "grad_norm": 0.7332810025239959, + "language_loss": 0.51795316, + "learning_rate": 2.783276292417936e-06, + "loss": 0.5388298, + "num_input_tokens_seen": 139536730, + "router_z_loss_clip": 0.29443359, + "router_z_loss_mlp": 0.01806641, + "step": 6499, + "time_per_iteration": 3.135016679763794 + }, + { + "auxiliary_loss_clip": 0.01136304, + "auxiliary_loss_mlp": 0.01056346, + "balance_loss_clip": 1.04962921, + "balance_loss_mlp": 1.03716564, + "epoch": 0.3908011423418007, + "flos": 27962454084480.0, + "grad_norm": 1.6102849533958628, + "language_loss": 0.74511051, + "learning_rate": 2.7829179263518487e-06, + "loss": 0.76703691, + "num_input_tokens_seen": 139557540, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.19189453, + "step": 6500, + "time_per_iteration": 2.564802646636963 + }, + { + "auxiliary_loss_clip": 0.01132374, + "auxiliary_loss_mlp": 0.0104228, + "balance_loss_clip": 1.04752409, + "balance_loss_mlp": 1.02629447, + "epoch": 0.39086126559446865, + "flos": 24462097718400.0, + "grad_norm": 1.984688232670152, + "language_loss": 0.69578576, + "learning_rate": 2.7825595305974354e-06, + "loss": 0.71753234, + "num_input_tokens_seen": 139576875, + "router_z_loss_clip": 0.84814453, + "router_z_loss_mlp": 0.15979004, + "step": 6501, + "time_per_iteration": 2.579709768295288 + }, + { + "auxiliary_loss_clip": 0.01134057, + "auxiliary_loss_mlp": 0.01036509, + "balance_loss_clip": 1.05070984, + "balance_loss_mlp": 1.02202463, + "epoch": 0.3909213888471366, + "flos": 16941541403520.0, + "grad_norm": 2.40659838209748, + "language_loss": 0.7886337, + "learning_rate": 2.782201105168287e-06, + "loss": 0.81033933, + "num_input_tokens_seen": 139594295, + "router_z_loss_clip": 0.83251953, + "router_z_loss_mlp": 0.14459229, + "step": 6502, + "time_per_iteration": 2.4900221824645996 + }, + { + "auxiliary_loss_clip": 0.01133756, + "auxiliary_loss_mlp": 0.01039705, + "balance_loss_clip": 1.05269337, + "balance_loss_mlp": 1.02549529, + "epoch": 0.3909815120998046, + "flos": 29278400751360.0, + "grad_norm": 2.038558217196806, + "language_loss": 0.79737616, + "learning_rate": 2.7818426500779932e-06, + "loss": 0.81911075, + "num_input_tokens_seen": 139614080, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.14233398, + "step": 6503, + "time_per_iteration": 2.5256295204162598 + }, + { + "auxiliary_loss_clip": 0.01135588, + "auxiliary_loss_mlp": 0.01029863, + "balance_loss_clip": 1.0532304, + "balance_loss_mlp": 1.01633894, + "epoch": 0.39104163535247255, + "flos": 18951246328320.0, + "grad_norm": 1.9922500095215292, + "language_loss": 0.71509004, + "learning_rate": 2.7814841653401485e-06, + "loss": 0.73674458, + "num_input_tokens_seen": 139632755, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.13525391, + "step": 6504, + "time_per_iteration": 3.837146043777466 + }, + { + "auxiliary_loss_clip": 0.01131995, + "auxiliary_loss_mlp": 0.01030472, + "balance_loss_clip": 1.05036259, + "balance_loss_mlp": 1.01553535, + "epoch": 0.3911017586051405, + "flos": 26323347732480.0, + "grad_norm": 3.4228412450274197, + "language_loss": 0.83608437, + "learning_rate": 2.7811256509683454e-06, + "loss": 0.85770899, + "num_input_tokens_seen": 139654205, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.14953613, + "step": 6505, + "time_per_iteration": 2.5965845584869385 + }, + { + "auxiliary_loss_clip": 0.0113283, + "auxiliary_loss_mlp": 0.01042107, + "balance_loss_clip": 1.05061054, + "balance_loss_mlp": 1.02603745, + "epoch": 0.3911618818578085, + "flos": 21835770992640.0, + "grad_norm": 1.9300006919642716, + "language_loss": 0.71058166, + "learning_rate": 2.7807671069761797e-06, + "loss": 0.73233098, + "num_input_tokens_seen": 139673595, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.16064453, + "step": 6506, + "time_per_iteration": 2.5769450664520264 + }, + { + "auxiliary_loss_clip": 0.01126423, + "auxiliary_loss_mlp": 0.01038775, + "balance_loss_clip": 1.04746866, + "balance_loss_mlp": 1.02462506, + "epoch": 0.3912220051104765, + "flos": 16359680989440.0, + "grad_norm": 1.7574861282348213, + "language_loss": 0.75372779, + "learning_rate": 2.7804085333772477e-06, + "loss": 0.77537978, + "num_input_tokens_seen": 139690565, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.14135742, + "step": 6507, + "time_per_iteration": 2.4427988529205322 + }, + { + "auxiliary_loss_clip": 0.01063619, + "auxiliary_loss_mlp": 0.01008294, + "balance_loss_clip": 1.03229141, + "balance_loss_mlp": 1.00641644, + "epoch": 0.39128212836314447, + "flos": 71050986420480.0, + "grad_norm": 0.7612242463066593, + "language_loss": 0.56548077, + "learning_rate": 2.7800499301851446e-06, + "loss": 0.58619988, + "num_input_tokens_seen": 139749420, + "router_z_loss_clip": 0.31298828, + "router_z_loss_mlp": 0.01873779, + "step": 6508, + "time_per_iteration": 3.2144267559051514 + }, + { + "auxiliary_loss_clip": 0.01129242, + "auxiliary_loss_mlp": 0.01042283, + "balance_loss_clip": 1.04690409, + "balance_loss_mlp": 1.02615368, + "epoch": 0.39134225161581243, + "flos": 20331975173760.0, + "grad_norm": 1.9866620130908947, + "language_loss": 0.76257873, + "learning_rate": 2.779691297413471e-06, + "loss": 0.78429401, + "num_input_tokens_seen": 139766265, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.16137695, + "step": 6509, + "time_per_iteration": 2.4542315006256104 + }, + { + "auxiliary_loss_clip": 0.01138762, + "auxiliary_loss_mlp": 0.01048217, + "balance_loss_clip": 1.05195451, + "balance_loss_mlp": 1.03057361, + "epoch": 0.3914023748684804, + "flos": 17018390551680.0, + "grad_norm": 3.0002614974564272, + "language_loss": 0.82965577, + "learning_rate": 2.779332635075825e-06, + "loss": 0.85152555, + "num_input_tokens_seen": 139782400, + "router_z_loss_clip": 0.86816406, + "router_z_loss_mlp": 0.17626953, + "step": 6510, + "time_per_iteration": 2.484116792678833 + }, + { + "auxiliary_loss_clip": 0.01129698, + "auxiliary_loss_mlp": 0.01039534, + "balance_loss_clip": 1.04634547, + "balance_loss_mlp": 1.02438283, + "epoch": 0.39146249812114836, + "flos": 18405224709120.0, + "grad_norm": 1.7104474425869538, + "language_loss": 0.76668155, + "learning_rate": 2.7789739431858073e-06, + "loss": 0.78837383, + "num_input_tokens_seen": 139801435, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.1517334, + "step": 6511, + "time_per_iteration": 2.4838006496429443 + }, + { + "auxiliary_loss_clip": 0.01053816, + "auxiliary_loss_mlp": 0.01003925, + "balance_loss_clip": 1.02352571, + "balance_loss_mlp": 1.00218415, + "epoch": 0.3915226213738163, + "flos": 67637355442560.0, + "grad_norm": 0.7270775669959795, + "language_loss": 0.57799911, + "learning_rate": 2.7786152217570196e-06, + "loss": 0.59857655, + "num_input_tokens_seen": 139869700, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.01739502, + "step": 6512, + "time_per_iteration": 3.1378636360168457 + }, + { + "auxiliary_loss_clip": 0.01135972, + "auxiliary_loss_mlp": 0.01038114, + "balance_loss_clip": 1.05248463, + "balance_loss_mlp": 1.02010202, + "epoch": 0.3915827446264843, + "flos": 26359330181760.0, + "grad_norm": 1.631512364686999, + "language_loss": 0.69618475, + "learning_rate": 2.7782564708030647e-06, + "loss": 0.71792555, + "num_input_tokens_seen": 139890140, + "router_z_loss_clip": 0.83544922, + "router_z_loss_mlp": 0.17993164, + "step": 6513, + "time_per_iteration": 2.506843090057373 + }, + { + "auxiliary_loss_clip": 0.01135846, + "auxiliary_loss_mlp": 0.01039956, + "balance_loss_clip": 1.04874527, + "balance_loss_mlp": 1.02491784, + "epoch": 0.39164286787915226, + "flos": 21943897908480.0, + "grad_norm": 2.53691801966333, + "language_loss": 0.75894648, + "learning_rate": 2.7778976903375464e-06, + "loss": 0.7807045, + "num_input_tokens_seen": 139908020, + "router_z_loss_clip": 0.87158203, + "router_z_loss_mlp": 0.15057373, + "step": 6514, + "time_per_iteration": 2.479567289352417 + }, + { + "auxiliary_loss_clip": 0.01127988, + "auxiliary_loss_mlp": 0.01041916, + "balance_loss_clip": 1.04519773, + "balance_loss_mlp": 1.02637088, + "epoch": 0.3917029911318202, + "flos": 16399829416320.0, + "grad_norm": 1.6794485110087018, + "language_loss": 0.77214682, + "learning_rate": 2.7775388803740693e-06, + "loss": 0.79384589, + "num_input_tokens_seen": 139926180, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.15539551, + "step": 6515, + "time_per_iteration": 3.903174877166748 + }, + { + "auxiliary_loss_clip": 0.0112761, + "auxiliary_loss_mlp": 0.0103993, + "balance_loss_clip": 1.04670167, + "balance_loss_mlp": 1.02682316, + "epoch": 0.3917631143844882, + "flos": 26211701283840.0, + "grad_norm": 1.5623198546796342, + "language_loss": 0.80013591, + "learning_rate": 2.7771800409262406e-06, + "loss": 0.82181132, + "num_input_tokens_seen": 139947420, + "router_z_loss_clip": 0.80957031, + "router_z_loss_mlp": 0.13098145, + "step": 6516, + "time_per_iteration": 2.5657970905303955 + }, + { + "auxiliary_loss_clip": 0.01138624, + "auxiliary_loss_mlp": 0.01041598, + "balance_loss_clip": 1.05540848, + "balance_loss_mlp": 1.0268873, + "epoch": 0.39182323763715615, + "flos": 18548364407040.0, + "grad_norm": 1.9774481891764437, + "language_loss": 0.70433795, + "learning_rate": 2.7768211720076665e-06, + "loss": 0.72614014, + "num_input_tokens_seen": 139965800, + "router_z_loss_clip": 0.83251953, + "router_z_loss_mlp": 0.14709473, + "step": 6517, + "time_per_iteration": 2.4501166343688965 + }, + { + "auxiliary_loss_clip": 0.01142967, + "auxiliary_loss_mlp": 0.01041768, + "balance_loss_clip": 1.05902553, + "balance_loss_mlp": 1.02715302, + "epoch": 0.3918833608898241, + "flos": 34313543395200.0, + "grad_norm": 1.664108095672269, + "language_loss": 0.71772957, + "learning_rate": 2.776462273631956e-06, + "loss": 0.73957694, + "num_input_tokens_seen": 139988140, + "router_z_loss_clip": 0.83837891, + "router_z_loss_mlp": 0.14624023, + "step": 6518, + "time_per_iteration": 2.8190133571624756 + }, + { + "auxiliary_loss_clip": 0.01133138, + "auxiliary_loss_mlp": 0.01047897, + "balance_loss_clip": 1.04989684, + "balance_loss_mlp": 1.03164828, + "epoch": 0.3919434841424921, + "flos": 36939582812160.0, + "grad_norm": 1.7627453364402552, + "language_loss": 0.61524987, + "learning_rate": 2.7761033458127177e-06, + "loss": 0.63706028, + "num_input_tokens_seen": 140010060, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 0.16259766, + "step": 6519, + "time_per_iteration": 2.6160404682159424 + }, + { + "auxiliary_loss_clip": 0.01137401, + "auxiliary_loss_mlp": 0.01045821, + "balance_loss_clip": 1.05030727, + "balance_loss_mlp": 1.02995384, + "epoch": 0.3920036073951601, + "flos": 23508956373120.0, + "grad_norm": 1.7972786047398712, + "language_loss": 0.66952229, + "learning_rate": 2.775744388563563e-06, + "loss": 0.69135451, + "num_input_tokens_seen": 140029400, + "router_z_loss_clip": 0.87011719, + "router_z_loss_mlp": 0.15856934, + "step": 6520, + "time_per_iteration": 2.51396107673645 + }, + { + "auxiliary_loss_clip": 0.01132126, + "auxiliary_loss_mlp": 0.01041529, + "balance_loss_clip": 1.04889655, + "balance_loss_mlp": 1.02724814, + "epoch": 0.39206373064782807, + "flos": 18406086635520.0, + "grad_norm": 1.8039546689742079, + "language_loss": 0.78895438, + "learning_rate": 2.775385401898104e-06, + "loss": 0.81069094, + "num_input_tokens_seen": 140048940, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.1427002, + "step": 6521, + "time_per_iteration": 4.023303747177124 + }, + { + "auxiliary_loss_clip": 0.01145104, + "auxiliary_loss_mlp": 0.01040221, + "balance_loss_clip": 1.05624306, + "balance_loss_mlp": 1.02305448, + "epoch": 0.39212385390049603, + "flos": 12313051608960.0, + "grad_norm": 2.185948849646246, + "language_loss": 0.70169413, + "learning_rate": 2.775026385829952e-06, + "loss": 0.72354734, + "num_input_tokens_seen": 140066380, + "router_z_loss_clip": 0.88867188, + "router_z_loss_mlp": 0.17163086, + "step": 6522, + "time_per_iteration": 2.4417500495910645 + }, + { + "auxiliary_loss_clip": 0.01138028, + "auxiliary_loss_mlp": 0.01035642, + "balance_loss_clip": 1.053424, + "balance_loss_mlp": 1.02071738, + "epoch": 0.392183977153164, + "flos": 19719160214400.0, + "grad_norm": 1.9371348349689097, + "language_loss": 0.77024162, + "learning_rate": 2.774667340372722e-06, + "loss": 0.79197824, + "num_input_tokens_seen": 140085275, + "router_z_loss_clip": 0.84570312, + "router_z_loss_mlp": 0.14935303, + "step": 6523, + "time_per_iteration": 2.4858856201171875 + }, + { + "auxiliary_loss_clip": 0.01139488, + "auxiliary_loss_mlp": 0.01050442, + "balance_loss_clip": 1.05182588, + "balance_loss_mlp": 1.03539205, + "epoch": 0.39224410040583196, + "flos": 33144902403840.0, + "grad_norm": 2.4346532346907113, + "language_loss": 0.61594254, + "learning_rate": 2.7743082655400293e-06, + "loss": 0.63784182, + "num_input_tokens_seen": 140105105, + "router_z_loss_clip": 0.87597656, + "router_z_loss_mlp": 0.15057373, + "step": 6524, + "time_per_iteration": 2.554354429244995 + }, + { + "auxiliary_loss_clip": 0.011332, + "auxiliary_loss_mlp": 0.01042732, + "balance_loss_clip": 1.04950798, + "balance_loss_mlp": 1.02724624, + "epoch": 0.39230422365849993, + "flos": 27782434097280.0, + "grad_norm": 1.5743256855962486, + "language_loss": 0.74341118, + "learning_rate": 2.773949161345489e-06, + "loss": 0.76517057, + "num_input_tokens_seen": 140125645, + "router_z_loss_clip": 0.83789062, + "router_z_loss_mlp": 0.15484619, + "step": 6525, + "time_per_iteration": 2.5275776386260986 + }, + { + "auxiliary_loss_clip": 0.01134684, + "auxiliary_loss_mlp": 0.01043399, + "balance_loss_clip": 1.05028534, + "balance_loss_mlp": 1.02923679, + "epoch": 0.3923643469111679, + "flos": 17931634865280.0, + "grad_norm": 2.0200753573360566, + "language_loss": 0.81581765, + "learning_rate": 2.773590027802719e-06, + "loss": 0.83759844, + "num_input_tokens_seen": 140141925, + "router_z_loss_clip": 0.84423828, + "router_z_loss_mlp": 0.14172363, + "step": 6526, + "time_per_iteration": 2.4554977416992188 + }, + { + "auxiliary_loss_clip": 0.01129285, + "auxiliary_loss_mlp": 0.01054802, + "balance_loss_clip": 1.04610169, + "balance_loss_mlp": 1.0373733, + "epoch": 0.39242447016383586, + "flos": 24059539019520.0, + "grad_norm": 1.757499273766055, + "language_loss": 0.70296288, + "learning_rate": 2.7732308649253383e-06, + "loss": 0.72480381, + "num_input_tokens_seen": 140160965, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 0.17419434, + "step": 6527, + "time_per_iteration": 2.4749109745025635 + }, + { + "auxiliary_loss_clip": 0.01136024, + "auxiliary_loss_mlp": 0.01043986, + "balance_loss_clip": 1.0533123, + "balance_loss_mlp": 1.02939498, + "epoch": 0.3924845934165038, + "flos": 10664069016960.0, + "grad_norm": 2.675539262386873, + "language_loss": 0.82615143, + "learning_rate": 2.772871672726965e-06, + "loss": 0.84795147, + "num_input_tokens_seen": 140177780, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.14587402, + "step": 6528, + "time_per_iteration": 2.443309783935547 + }, + { + "auxiliary_loss_clip": 0.01128628, + "auxiliary_loss_mlp": 0.01060419, + "balance_loss_clip": 1.04741716, + "balance_loss_mlp": 1.04324067, + "epoch": 0.3925447166691718, + "flos": 31245910174080.0, + "grad_norm": 1.6076126768343717, + "language_loss": 0.68529546, + "learning_rate": 2.7725124512212205e-06, + "loss": 0.70718592, + "num_input_tokens_seen": 140201660, + "router_z_loss_clip": 0.81201172, + "router_z_loss_mlp": 0.17175293, + "step": 6529, + "time_per_iteration": 2.552579641342163 + }, + { + "auxiliary_loss_clip": 0.01136148, + "auxiliary_loss_mlp": 0.01043219, + "balance_loss_clip": 1.05254626, + "balance_loss_mlp": 1.02800775, + "epoch": 0.39260483992183975, + "flos": 29415040087680.0, + "grad_norm": 2.464614738719163, + "language_loss": 0.801202, + "learning_rate": 2.7721532004217267e-06, + "loss": 0.82299566, + "num_input_tokens_seen": 140218585, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.15209961, + "step": 6530, + "time_per_iteration": 2.4897894859313965 + }, + { + "auxiliary_loss_clip": 0.01138232, + "auxiliary_loss_mlp": 0.01036757, + "balance_loss_clip": 1.0549531, + "balance_loss_mlp": 1.02252328, + "epoch": 0.3926649631745077, + "flos": 22857788666880.0, + "grad_norm": 2.7267540787413203, + "language_loss": 0.75409693, + "learning_rate": 2.7717939203421063e-06, + "loss": 0.77584684, + "num_input_tokens_seen": 140239905, + "router_z_loss_clip": 0.83349609, + "router_z_loss_mlp": 0.14227295, + "step": 6531, + "time_per_iteration": 2.612795829772949 + }, + { + "auxiliary_loss_clip": 0.01064817, + "auxiliary_loss_mlp": 0.01029256, + "balance_loss_clip": 1.0345515, + "balance_loss_mlp": 1.02742577, + "epoch": 0.3927250864271757, + "flos": 63893881872000.0, + "grad_norm": 0.8194407342176047, + "language_loss": 0.60343313, + "learning_rate": 2.7714346109959822e-06, + "loss": 0.62437379, + "num_input_tokens_seen": 140293820, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 0.01824951, + "step": 6532, + "time_per_iteration": 3.014380931854248 + }, + { + "auxiliary_loss_clip": 0.01094516, + "auxiliary_loss_mlp": 0.01026388, + "balance_loss_clip": 1.06496525, + "balance_loss_mlp": 1.02329135, + "epoch": 0.3927852096798437, + "flos": 68909741890560.0, + "grad_norm": 0.7824330220401846, + "language_loss": 0.55543494, + "learning_rate": 2.771075272396981e-06, + "loss": 0.57664394, + "num_input_tokens_seen": 140360420, + "router_z_loss_clip": 0.29589844, + "router_z_loss_mlp": 0.03097534, + "step": 6533, + "time_per_iteration": 3.2229182720184326 + }, + { + "auxiliary_loss_clip": 0.01137397, + "auxiliary_loss_mlp": 0.01039409, + "balance_loss_clip": 1.05152583, + "balance_loss_mlp": 1.02483523, + "epoch": 0.39284533293251167, + "flos": 29715972232320.0, + "grad_norm": 1.7981844835961094, + "language_loss": 0.75966638, + "learning_rate": 2.7707159045587284e-06, + "loss": 0.78143448, + "num_input_tokens_seen": 140381950, + "router_z_loss_clip": 0.85888672, + "router_z_loss_mlp": 0.14581299, + "step": 6534, + "time_per_iteration": 2.5236024856567383 + }, + { + "auxiliary_loss_clip": 0.01145207, + "auxiliary_loss_mlp": 0.01037367, + "balance_loss_clip": 1.05856681, + "balance_loss_mlp": 1.02209616, + "epoch": 0.39290545618517964, + "flos": 18552027594240.0, + "grad_norm": 2.037865278215905, + "language_loss": 0.77916634, + "learning_rate": 2.770356507494851e-06, + "loss": 0.80099213, + "num_input_tokens_seen": 140399410, + "router_z_loss_clip": 0.86523438, + "router_z_loss_mlp": 0.15258789, + "step": 6535, + "time_per_iteration": 2.4536638259887695 + }, + { + "auxiliary_loss_clip": 0.01127386, + "auxiliary_loss_mlp": 0.01038302, + "balance_loss_clip": 1.04678786, + "balance_loss_mlp": 1.02272201, + "epoch": 0.3929655794378476, + "flos": 26249479413120.0, + "grad_norm": 2.0270635919511513, + "language_loss": 0.68766731, + "learning_rate": 2.769997081218978e-06, + "loss": 0.70932418, + "num_input_tokens_seen": 140419055, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.15576172, + "step": 6536, + "time_per_iteration": 2.50024151802063 + }, + { + "auxiliary_loss_clip": 0.01151805, + "auxiliary_loss_mlp": 0.01034908, + "balance_loss_clip": 1.06999207, + "balance_loss_mlp": 1.02163982, + "epoch": 0.39302570269051557, + "flos": 29277933874560.0, + "grad_norm": 5.265062432801448, + "language_loss": 0.69292915, + "learning_rate": 2.769637625744738e-06, + "loss": 0.71479619, + "num_input_tokens_seen": 140438800, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.13275146, + "step": 6537, + "time_per_iteration": 2.5553174018859863 + }, + { + "auxiliary_loss_clip": 0.01134459, + "auxiliary_loss_mlp": 0.01033292, + "balance_loss_clip": 1.05044544, + "balance_loss_mlp": 1.01865268, + "epoch": 0.39308582594318353, + "flos": 17347440067200.0, + "grad_norm": 1.6372211659493954, + "language_loss": 0.78899181, + "learning_rate": 2.769278141085763e-06, + "loss": 0.81066924, + "num_input_tokens_seen": 140456880, + "router_z_loss_clip": 0.83935547, + "router_z_loss_mlp": 0.1463623, + "step": 6538, + "time_per_iteration": 3.904764413833618 + }, + { + "auxiliary_loss_clip": 0.01059581, + "auxiliary_loss_mlp": 0.01005762, + "balance_loss_clip": 1.03006721, + "balance_loss_mlp": 1.00352073, + "epoch": 0.3931459491958515, + "flos": 61007094650880.0, + "grad_norm": 0.8084519807848841, + "language_loss": 0.61947727, + "learning_rate": 2.768918627255683e-06, + "loss": 0.6401307, + "num_input_tokens_seen": 140507510, + "router_z_loss_clip": 0.29541016, + "router_z_loss_mlp": 0.0223999, + "step": 6539, + "time_per_iteration": 2.8663716316223145 + }, + { + "auxiliary_loss_clip": 0.01149109, + "auxiliary_loss_mlp": 0.01035289, + "balance_loss_clip": 1.0618422, + "balance_loss_mlp": 1.01994693, + "epoch": 0.39320607244851946, + "flos": 39016009249920.0, + "grad_norm": 2.004338250109584, + "language_loss": 0.68062544, + "learning_rate": 2.7685590842681315e-06, + "loss": 0.70246941, + "num_input_tokens_seen": 140528740, + "router_z_loss_clip": 0.87207031, + "router_z_loss_mlp": 0.15356445, + "step": 6540, + "time_per_iteration": 2.6829004287719727 + }, + { + "auxiliary_loss_clip": 0.01134633, + "auxiliary_loss_mlp": 0.01034493, + "balance_loss_clip": 1.05248249, + "balance_loss_mlp": 1.02084339, + "epoch": 0.3932661957011874, + "flos": 24679752180480.0, + "grad_norm": 2.253700664483161, + "language_loss": 0.72699612, + "learning_rate": 2.7681995121367433e-06, + "loss": 0.74868739, + "num_input_tokens_seen": 140547560, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.13659668, + "step": 6541, + "time_per_iteration": 2.525171995162964 + }, + { + "auxiliary_loss_clip": 0.01079301, + "auxiliary_loss_mlp": 0.01007963, + "balance_loss_clip": 1.04961681, + "balance_loss_mlp": 1.00604331, + "epoch": 0.3933263189538554, + "flos": 70096552185600.0, + "grad_norm": 0.8275673631808673, + "language_loss": 0.60380101, + "learning_rate": 2.7678399108751516e-06, + "loss": 0.62467366, + "num_input_tokens_seen": 140601175, + "router_z_loss_clip": 0.29638672, + "router_z_loss_mlp": 0.01919556, + "step": 6542, + "time_per_iteration": 2.9570529460906982 + }, + { + "auxiliary_loss_clip": 0.01136107, + "auxiliary_loss_mlp": 0.01039851, + "balance_loss_clip": 1.05140984, + "balance_loss_mlp": 1.02520013, + "epoch": 0.39338644220652336, + "flos": 22929071207040.0, + "grad_norm": 1.5283394545288784, + "language_loss": 0.8260026, + "learning_rate": 2.7674802804969947e-06, + "loss": 0.84776211, + "num_input_tokens_seen": 140622200, + "router_z_loss_clip": 0.84667969, + "router_z_loss_mlp": 0.14642334, + "step": 6543, + "time_per_iteration": 2.535452365875244 + }, + { + "auxiliary_loss_clip": 0.01132425, + "auxiliary_loss_mlp": 0.01035924, + "balance_loss_clip": 1.04931283, + "balance_loss_mlp": 1.0214045, + "epoch": 0.3934465654591913, + "flos": 30848163897600.0, + "grad_norm": 1.5702243718134645, + "language_loss": 0.69334096, + "learning_rate": 2.767120621015908e-06, + "loss": 0.71502447, + "num_input_tokens_seen": 140643125, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 0.1451416, + "step": 6544, + "time_per_iteration": 2.7635769844055176 + }, + { + "auxiliary_loss_clip": 0.01140008, + "auxiliary_loss_mlp": 0.01045012, + "balance_loss_clip": 1.05604422, + "balance_loss_mlp": 1.0293479, + "epoch": 0.3935066887118593, + "flos": 29236528471680.0, + "grad_norm": 2.0921464979924838, + "language_loss": 0.76264608, + "learning_rate": 2.76676093244553e-06, + "loss": 0.78449631, + "num_input_tokens_seen": 140662500, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.15649414, + "step": 6545, + "time_per_iteration": 2.5166895389556885 + }, + { + "auxiliary_loss_clip": 0.01142835, + "auxiliary_loss_mlp": 0.01034965, + "balance_loss_clip": 1.0615952, + "balance_loss_mlp": 1.02230537, + "epoch": 0.3935668119645273, + "flos": 19135288638720.0, + "grad_norm": 1.4060882615043497, + "language_loss": 0.74424475, + "learning_rate": 2.7664012147995015e-06, + "loss": 0.7660228, + "num_input_tokens_seen": 140681960, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.12658691, + "step": 6546, + "time_per_iteration": 3.877398729324341 + }, + { + "auxiliary_loss_clip": 0.01138222, + "auxiliary_loss_mlp": 0.01037418, + "balance_loss_clip": 1.05101228, + "balance_loss_mlp": 1.02236819, + "epoch": 0.3936269352171953, + "flos": 18516116972160.0, + "grad_norm": 2.055710604439377, + "language_loss": 0.81685168, + "learning_rate": 2.7660414680914617e-06, + "loss": 0.83860809, + "num_input_tokens_seen": 140699170, + "router_z_loss_clip": 0.87255859, + "router_z_loss_mlp": 0.15045166, + "step": 6547, + "time_per_iteration": 2.4737067222595215 + }, + { + "auxiliary_loss_clip": 0.01148452, + "auxiliary_loss_mlp": 0.01034562, + "balance_loss_clip": 1.06226659, + "balance_loss_mlp": 1.0203644, + "epoch": 0.39368705846986324, + "flos": 15632813370240.0, + "grad_norm": 1.7774223402841907, + "language_loss": 0.83880585, + "learning_rate": 2.7656816923350525e-06, + "loss": 0.860636, + "num_input_tokens_seen": 140714920, + "router_z_loss_clip": 0.86230469, + "router_z_loss_mlp": 0.14196777, + "step": 6548, + "time_per_iteration": 2.468719244003296 + }, + { + "auxiliary_loss_clip": 0.01135373, + "auxiliary_loss_mlp": 0.010356, + "balance_loss_clip": 1.05592561, + "balance_loss_mlp": 1.02229667, + "epoch": 0.3937471817225312, + "flos": 21325839563520.0, + "grad_norm": 1.5766374976294808, + "language_loss": 0.72797352, + "learning_rate": 2.7653218875439174e-06, + "loss": 0.74968326, + "num_input_tokens_seen": 140734595, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.13317871, + "step": 6549, + "time_per_iteration": 2.5672972202301025 + }, + { + "auxiliary_loss_clip": 0.01144707, + "auxiliary_loss_mlp": 0.01038436, + "balance_loss_clip": 1.06098366, + "balance_loss_mlp": 1.02410722, + "epoch": 0.39380730497519917, + "flos": 20776693461120.0, + "grad_norm": 1.523001381235544, + "language_loss": 0.77938533, + "learning_rate": 2.764962053731699e-06, + "loss": 0.80121678, + "num_input_tokens_seen": 140754050, + "router_z_loss_clip": 0.83691406, + "router_z_loss_mlp": 0.14324951, + "step": 6550, + "time_per_iteration": 2.4740548133850098 + }, + { + "auxiliary_loss_clip": 0.01149318, + "auxiliary_loss_mlp": 0.01035077, + "balance_loss_clip": 1.06856656, + "balance_loss_mlp": 1.02107, + "epoch": 0.39386742822786713, + "flos": 21609784575360.0, + "grad_norm": 1.6473276723229706, + "language_loss": 0.81278884, + "learning_rate": 2.7646021909120434e-06, + "loss": 0.83463275, + "num_input_tokens_seen": 140771440, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.14013672, + "step": 6551, + "time_per_iteration": 2.4828639030456543 + }, + { + "auxiliary_loss_clip": 0.01135065, + "auxiliary_loss_mlp": 0.0104106, + "balance_loss_clip": 1.05051064, + "balance_loss_mlp": 1.02629566, + "epoch": 0.3939275514805351, + "flos": 12414642249600.0, + "grad_norm": 2.762019264990669, + "language_loss": 0.79766095, + "learning_rate": 2.764242299098596e-06, + "loss": 0.81942219, + "num_input_tokens_seen": 140786715, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.14752197, + "step": 6552, + "time_per_iteration": 2.399383783340454 + }, + { + "auxiliary_loss_clip": 0.0114163, + "auxiliary_loss_mlp": 0.01044377, + "balance_loss_clip": 1.05807638, + "balance_loss_mlp": 1.03027415, + "epoch": 0.39398767473320306, + "flos": 18552027594240.0, + "grad_norm": 1.815565844671794, + "language_loss": 0.71395099, + "learning_rate": 2.763882378305003e-06, + "loss": 0.735811, + "num_input_tokens_seen": 140804950, + "router_z_loss_clip": 0.83544922, + "router_z_loss_mlp": 0.14093018, + "step": 6553, + "time_per_iteration": 2.460181713104248 + }, + { + "auxiliary_loss_clip": 0.0114115, + "auxiliary_loss_mlp": 0.01042646, + "balance_loss_clip": 1.05610502, + "balance_loss_mlp": 1.02773869, + "epoch": 0.39404779798587103, + "flos": 29308888419840.0, + "grad_norm": 1.677748767223163, + "language_loss": 0.64199364, + "learning_rate": 2.7635224285449144e-06, + "loss": 0.66383159, + "num_input_tokens_seen": 140822800, + "router_z_loss_clip": 0.85058594, + "router_z_loss_mlp": 0.14898682, + "step": 6554, + "time_per_iteration": 2.5073156356811523 + }, + { + "auxiliary_loss_clip": 0.0113632, + "auxiliary_loss_mlp": 0.01044508, + "balance_loss_clip": 1.05470669, + "balance_loss_mlp": 1.03081071, + "epoch": 0.394107921238539, + "flos": 34897055834880.0, + "grad_norm": 1.9539289348397253, + "language_loss": 0.79725313, + "learning_rate": 2.7631624498319796e-06, + "loss": 0.8190614, + "num_input_tokens_seen": 140842940, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.13690186, + "step": 6555, + "time_per_iteration": 2.6210403442382812 + }, + { + "auxiliary_loss_clip": 0.01145463, + "auxiliary_loss_mlp": 0.01045408, + "balance_loss_clip": 1.06068075, + "balance_loss_mlp": 1.03039932, + "epoch": 0.39416804449120696, + "flos": 25081413039360.0, + "grad_norm": 1.8244332027811936, + "language_loss": 0.71831524, + "learning_rate": 2.7628024421798473e-06, + "loss": 0.74022388, + "num_input_tokens_seen": 140863060, + "router_z_loss_clip": 0.84912109, + "router_z_loss_mlp": 0.15002441, + "step": 6556, + "time_per_iteration": 2.5174336433410645 + }, + { + "auxiliary_loss_clip": 0.01136939, + "auxiliary_loss_mlp": 0.01037528, + "balance_loss_clip": 1.05394435, + "balance_loss_mlp": 1.02298439, + "epoch": 0.3942281677438749, + "flos": 32306639731200.0, + "grad_norm": 1.8639588706751644, + "language_loss": 0.83430427, + "learning_rate": 2.7624424056021705e-06, + "loss": 0.85604894, + "num_input_tokens_seen": 140883795, + "router_z_loss_clip": 0.82861328, + "router_z_loss_mlp": 0.14538574, + "step": 6557, + "time_per_iteration": 2.81787109375 + }, + { + "auxiliary_loss_clip": 0.01131249, + "auxiliary_loss_mlp": 0.01044773, + "balance_loss_clip": 1.04765511, + "balance_loss_mlp": 1.02825117, + "epoch": 0.3942882909965429, + "flos": 24936621315840.0, + "grad_norm": 2.9889199197929983, + "language_loss": 0.80029947, + "learning_rate": 2.7620823401126004e-06, + "loss": 0.82205975, + "num_input_tokens_seen": 140903055, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.1651001, + "step": 6558, + "time_per_iteration": 4.028227806091309 + }, + { + "auxiliary_loss_clip": 0.01136608, + "auxiliary_loss_mlp": 0.01041803, + "balance_loss_clip": 1.05481029, + "balance_loss_mlp": 1.02867186, + "epoch": 0.39434841424921085, + "flos": 11874797769600.0, + "grad_norm": 1.7497859546785357, + "language_loss": 0.71047837, + "learning_rate": 2.761722245724792e-06, + "loss": 0.73226249, + "num_input_tokens_seen": 140920685, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.13140869, + "step": 6559, + "time_per_iteration": 2.4591708183288574 + }, + { + "auxiliary_loss_clip": 0.01142063, + "auxiliary_loss_mlp": 0.01036846, + "balance_loss_clip": 1.05470562, + "balance_loss_mlp": 1.02116418, + "epoch": 0.3944085375018789, + "flos": 16361620323840.0, + "grad_norm": 1.896449949158081, + "language_loss": 0.80289769, + "learning_rate": 2.7613621224524003e-06, + "loss": 0.82468683, + "num_input_tokens_seen": 140937320, + "router_z_loss_clip": 0.87353516, + "router_z_loss_mlp": 0.15686035, + "step": 6560, + "time_per_iteration": 2.4800143241882324 + }, + { + "auxiliary_loss_clip": 0.0113742, + "auxiliary_loss_mlp": 0.01042449, + "balance_loss_clip": 1.05396533, + "balance_loss_mlp": 1.02668929, + "epoch": 0.39446866075454684, + "flos": 10633365866880.0, + "grad_norm": 2.2029667842335647, + "language_loss": 0.83231199, + "learning_rate": 2.7610019703090803e-06, + "loss": 0.8541106, + "num_input_tokens_seen": 140954855, + "router_z_loss_clip": 0.83447266, + "router_z_loss_mlp": 0.15771484, + "step": 6561, + "time_per_iteration": 2.498391628265381 + }, + { + "auxiliary_loss_clip": 0.01128451, + "auxiliary_loss_mlp": 0.0104748, + "balance_loss_clip": 1.04567182, + "balance_loss_mlp": 1.03336549, + "epoch": 0.3945287840072148, + "flos": 18187498419840.0, + "grad_norm": 2.0146718104288364, + "language_loss": 0.80057681, + "learning_rate": 2.7606417893084887e-06, + "loss": 0.82233614, + "num_input_tokens_seen": 140973250, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.14123535, + "step": 6562, + "time_per_iteration": 2.497736930847168 + }, + { + "auxiliary_loss_clip": 0.0113988, + "auxiliary_loss_mlp": 0.01037817, + "balance_loss_clip": 1.05697405, + "balance_loss_mlp": 1.02283216, + "epoch": 0.39458890725988277, + "flos": 23039891642880.0, + "grad_norm": 1.5924892605340404, + "language_loss": 0.8154366, + "learning_rate": 2.7602815794642853e-06, + "loss": 0.83721364, + "num_input_tokens_seen": 140993050, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.14990234, + "step": 6563, + "time_per_iteration": 2.4979934692382812 + }, + { + "auxiliary_loss_clip": 0.01144682, + "auxiliary_loss_mlp": 0.01037185, + "balance_loss_clip": 1.06341338, + "balance_loss_mlp": 1.02207494, + "epoch": 0.39464903051255074, + "flos": 17159052211200.0, + "grad_norm": 2.427840851250924, + "language_loss": 0.70271611, + "learning_rate": 2.759921340790127e-06, + "loss": 0.72453475, + "num_input_tokens_seen": 141010815, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.15106201, + "step": 6564, + "time_per_iteration": 2.4695122241973877 + }, + { + "auxiliary_loss_clip": 0.0113143, + "auxiliary_loss_mlp": 0.01040679, + "balance_loss_clip": 1.04753911, + "balance_loss_mlp": 1.02567685, + "epoch": 0.3947091537652187, + "flos": 15889000147200.0, + "grad_norm": 2.212232324902777, + "language_loss": 0.83289498, + "learning_rate": 2.759561073299676e-06, + "loss": 0.85461611, + "num_input_tokens_seen": 141028720, + "router_z_loss_clip": 0.83935547, + "router_z_loss_mlp": 0.14984131, + "step": 6565, + "time_per_iteration": 2.4566166400909424 + }, + { + "auxiliary_loss_clip": 0.01133027, + "auxiliary_loss_mlp": 0.01037863, + "balance_loss_clip": 1.05012584, + "balance_loss_mlp": 1.02328372, + "epoch": 0.39476927701788667, + "flos": 18545491319040.0, + "grad_norm": 2.0977773294193094, + "language_loss": 0.83701575, + "learning_rate": 2.7592007770065937e-06, + "loss": 0.85872465, + "num_input_tokens_seen": 141046025, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.14575195, + "step": 6566, + "time_per_iteration": 3.88002872467041 + }, + { + "auxiliary_loss_clip": 0.01142985, + "auxiliary_loss_mlp": 0.01036931, + "balance_loss_clip": 1.05502653, + "balance_loss_mlp": 1.02201819, + "epoch": 0.39482940027055463, + "flos": 22275712771200.0, + "grad_norm": 2.1896816531068035, + "language_loss": 0.78053153, + "learning_rate": 2.7588404519245403e-06, + "loss": 0.80233067, + "num_input_tokens_seen": 141066865, + "router_z_loss_clip": 0.88037109, + "router_z_loss_mlp": 0.14910889, + "step": 6567, + "time_per_iteration": 2.5329160690307617 + }, + { + "auxiliary_loss_clip": 0.01127583, + "auxiliary_loss_mlp": 0.01037721, + "balance_loss_clip": 1.05138791, + "balance_loss_mlp": 1.02474475, + "epoch": 0.3948895235232226, + "flos": 14757634494720.0, + "grad_norm": 1.8746179498984339, + "language_loss": 0.80739695, + "learning_rate": 2.758480098067182e-06, + "loss": 0.82905, + "num_input_tokens_seen": 141084210, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12969971, + "step": 6568, + "time_per_iteration": 2.44989275932312 + }, + { + "auxiliary_loss_clip": 0.01123999, + "auxiliary_loss_mlp": 0.0103458, + "balance_loss_clip": 1.04395866, + "balance_loss_mlp": 1.02047718, + "epoch": 0.39494964677589056, + "flos": 22565763095040.0, + "grad_norm": 2.0248547596600783, + "language_loss": 0.85189551, + "learning_rate": 2.7581197154481816e-06, + "loss": 0.87348127, + "num_input_tokens_seen": 141103895, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.14111328, + "step": 6569, + "time_per_iteration": 2.4781315326690674 + }, + { + "auxiliary_loss_clip": 0.01125285, + "auxiliary_loss_mlp": 0.01039868, + "balance_loss_clip": 1.04759037, + "balance_loss_mlp": 1.02456141, + "epoch": 0.3950097700285585, + "flos": 22963186149120.0, + "grad_norm": 1.7955002019856674, + "language_loss": 0.74777782, + "learning_rate": 2.7577593040812066e-06, + "loss": 0.76942933, + "num_input_tokens_seen": 141124000, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.15307617, + "step": 6570, + "time_per_iteration": 2.7385926246643066 + }, + { + "auxiliary_loss_clip": 0.01131847, + "auxiliary_loss_mlp": 0.01039254, + "balance_loss_clip": 1.04785395, + "balance_loss_mlp": 1.02414966, + "epoch": 0.3950698932812265, + "flos": 20595236929920.0, + "grad_norm": 2.188823999445538, + "language_loss": 0.79699028, + "learning_rate": 2.757398863979922e-06, + "loss": 0.81870139, + "num_input_tokens_seen": 141142535, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.15081787, + "step": 6571, + "time_per_iteration": 2.4268712997436523 + }, + { + "auxiliary_loss_clip": 0.01127597, + "auxiliary_loss_mlp": 0.01039157, + "balance_loss_clip": 1.04753423, + "balance_loss_mlp": 1.02547741, + "epoch": 0.39513001653389446, + "flos": 20375786787840.0, + "grad_norm": 1.9278838972160908, + "language_loss": 0.77794999, + "learning_rate": 2.757038395157997e-06, + "loss": 0.79961753, + "num_input_tokens_seen": 141161575, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.13659668, + "step": 6572, + "time_per_iteration": 2.469367265701294 + }, + { + "auxiliary_loss_clip": 0.01128015, + "auxiliary_loss_mlp": 0.01036396, + "balance_loss_clip": 1.04493666, + "balance_loss_mlp": 1.02173948, + "epoch": 0.3951901397865625, + "flos": 26463650256000.0, + "grad_norm": 1.745048575713096, + "language_loss": 0.75370753, + "learning_rate": 2.7566778976291002e-06, + "loss": 0.77535164, + "num_input_tokens_seen": 141181150, + "router_z_loss_clip": 0.83105469, + "router_z_loss_mlp": 0.14642334, + "step": 6573, + "time_per_iteration": 2.5003652572631836 + }, + { + "auxiliary_loss_clip": 0.01133944, + "auxiliary_loss_mlp": 0.01033544, + "balance_loss_clip": 1.0544095, + "balance_loss_mlp": 1.02074718, + "epoch": 0.39525026303923044, + "flos": 43838345767680.0, + "grad_norm": 1.445445559238223, + "language_loss": 0.67746252, + "learning_rate": 2.7563173714069017e-06, + "loss": 0.69913745, + "num_input_tokens_seen": 141206310, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.12780762, + "step": 6574, + "time_per_iteration": 2.701359510421753 + }, + { + "auxiliary_loss_clip": 0.01137701, + "auxiliary_loss_mlp": 0.01034744, + "balance_loss_clip": 1.05400193, + "balance_loss_mlp": 1.01957488, + "epoch": 0.3953103862918984, + "flos": 18040803275520.0, + "grad_norm": 2.288897539740429, + "language_loss": 0.71831751, + "learning_rate": 2.755956816505072e-06, + "loss": 0.74004191, + "num_input_tokens_seen": 141223925, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.15179443, + "step": 6575, + "time_per_iteration": 2.4596383571624756 + }, + { + "auxiliary_loss_clip": 0.01136188, + "auxiliary_loss_mlp": 0.01039478, + "balance_loss_clip": 1.0532912, + "balance_loss_mlp": 1.02537584, + "epoch": 0.3953705095445664, + "flos": 16976015481600.0, + "grad_norm": 2.0299896909851585, + "language_loss": 0.73425364, + "learning_rate": 2.7555962329372845e-06, + "loss": 0.75601029, + "num_input_tokens_seen": 141239010, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.14105225, + "step": 6576, + "time_per_iteration": 2.474677085876465 + }, + { + "auxiliary_loss_clip": 0.01137967, + "auxiliary_loss_mlp": 0.01036796, + "balance_loss_clip": 1.05464029, + "balance_loss_mlp": 1.02369452, + "epoch": 0.39543063279723434, + "flos": 17411144837760.0, + "grad_norm": 2.393259088777883, + "language_loss": 0.83989865, + "learning_rate": 2.7552356207172124e-06, + "loss": 0.86164629, + "num_input_tokens_seen": 141252255, + "router_z_loss_clip": 0.83300781, + "router_z_loss_mlp": 0.13116455, + "step": 6577, + "time_per_iteration": 2.3699588775634766 + }, + { + "auxiliary_loss_clip": 0.01140168, + "auxiliary_loss_mlp": 0.0103495, + "balance_loss_clip": 1.05949211, + "balance_loss_mlp": 1.02131224, + "epoch": 0.3954907560499023, + "flos": 22784207656320.0, + "grad_norm": 14.113207397506134, + "language_loss": 0.89489257, + "learning_rate": 2.75487497985853e-06, + "loss": 0.91664374, + "num_input_tokens_seen": 141269325, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.13647461, + "step": 6578, + "time_per_iteration": 2.5615687370300293 + }, + { + "auxiliary_loss_clip": 0.01140909, + "auxiliary_loss_mlp": 0.01035302, + "balance_loss_clip": 1.05448151, + "balance_loss_mlp": 1.01913691, + "epoch": 0.39555087930257027, + "flos": 21944400698880.0, + "grad_norm": 1.9024058135997752, + "language_loss": 0.78439075, + "learning_rate": 2.7545143103749117e-06, + "loss": 0.80615288, + "num_input_tokens_seen": 141288505, + "router_z_loss_clip": 0.86474609, + "router_z_loss_mlp": 0.16162109, + "step": 6579, + "time_per_iteration": 2.4782235622406006 + }, + { + "auxiliary_loss_clip": 0.01142266, + "auxiliary_loss_mlp": 0.0103317, + "balance_loss_clip": 1.05868459, + "balance_loss_mlp": 1.01780438, + "epoch": 0.39561100255523823, + "flos": 20404622430720.0, + "grad_norm": 3.8699250175807656, + "language_loss": 0.69020462, + "learning_rate": 2.754153612280037e-06, + "loss": 0.711959, + "num_input_tokens_seen": 141303680, + "router_z_loss_clip": 0.83544922, + "router_z_loss_mlp": 0.15380859, + "step": 6580, + "time_per_iteration": 2.4691522121429443 + }, + { + "auxiliary_loss_clip": 0.01138773, + "auxiliary_loss_mlp": 0.0102993, + "balance_loss_clip": 1.05872357, + "balance_loss_mlp": 1.01544607, + "epoch": 0.3956711258079062, + "flos": 27964572986880.0, + "grad_norm": 1.8646427478021135, + "language_loss": 0.5903393, + "learning_rate": 2.7537928855875797e-06, + "loss": 0.61202633, + "num_input_tokens_seen": 141324090, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.14483643, + "step": 6581, + "time_per_iteration": 2.515813112258911 + }, + { + "auxiliary_loss_clip": 0.01140578, + "auxiliary_loss_mlp": 0.0104037, + "balance_loss_clip": 1.05705667, + "balance_loss_mlp": 1.02529001, + "epoch": 0.39573124906057416, + "flos": 14428297670400.0, + "grad_norm": 2.045032560783352, + "language_loss": 0.69949311, + "learning_rate": 2.7534321303112224e-06, + "loss": 0.72130257, + "num_input_tokens_seen": 141342235, + "router_z_loss_clip": 0.83447266, + "router_z_loss_mlp": 0.15112305, + "step": 6582, + "time_per_iteration": 3.9297022819519043 + }, + { + "auxiliary_loss_clip": 0.01129979, + "auxiliary_loss_mlp": 0.01037977, + "balance_loss_clip": 1.04811263, + "balance_loss_mlp": 1.02208614, + "epoch": 0.39579137231324213, + "flos": 18733699607040.0, + "grad_norm": 3.957731893179502, + "language_loss": 0.76483989, + "learning_rate": 2.753071346464642e-06, + "loss": 0.78651947, + "num_input_tokens_seen": 141361195, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.15887451, + "step": 6583, + "time_per_iteration": 2.7579007148742676 + }, + { + "auxiliary_loss_clip": 0.01135048, + "auxiliary_loss_mlp": 0.01038461, + "balance_loss_clip": 1.05068302, + "balance_loss_mlp": 1.02409649, + "epoch": 0.3958514955659101, + "flos": 17676417755520.0, + "grad_norm": 1.5528740349947143, + "language_loss": 0.65722364, + "learning_rate": 2.7527105340615207e-06, + "loss": 0.67895877, + "num_input_tokens_seen": 141378275, + "router_z_loss_clip": 0.84423828, + "router_z_loss_mlp": 0.14355469, + "step": 6584, + "time_per_iteration": 2.477433204650879 + }, + { + "auxiliary_loss_clip": 0.01139264, + "auxiliary_loss_mlp": 0.01047298, + "balance_loss_clip": 1.05185199, + "balance_loss_mlp": 1.03126431, + "epoch": 0.39591161881857806, + "flos": 29309103901440.0, + "grad_norm": 2.3175841195662445, + "language_loss": 0.72925764, + "learning_rate": 2.7523496931155413e-06, + "loss": 0.75112331, + "num_input_tokens_seen": 141396960, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.16064453, + "step": 6585, + "time_per_iteration": 2.540546417236328 + }, + { + "auxiliary_loss_clip": 0.01134145, + "auxiliary_loss_mlp": 0.01031697, + "balance_loss_clip": 1.05214238, + "balance_loss_mlp": 1.01745164, + "epoch": 0.3959717420712461, + "flos": 25771831332480.0, + "grad_norm": 1.7790627112463488, + "language_loss": 0.73174894, + "learning_rate": 2.7519888236403856e-06, + "loss": 0.75340736, + "num_input_tokens_seen": 141417320, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.14263916, + "step": 6586, + "time_per_iteration": 2.5012755393981934 + }, + { + "auxiliary_loss_clip": 0.01132503, + "auxiliary_loss_mlp": 0.01032357, + "balance_loss_clip": 1.05168402, + "balance_loss_mlp": 1.01787281, + "epoch": 0.39603186532391405, + "flos": 20923783655040.0, + "grad_norm": 1.6015727280664234, + "language_loss": 0.71415782, + "learning_rate": 2.7516279256497382e-06, + "loss": 0.73580647, + "num_input_tokens_seen": 141435985, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.14477539, + "step": 6587, + "time_per_iteration": 2.4832041263580322 + }, + { + "auxiliary_loss_clip": 0.01061701, + "auxiliary_loss_mlp": 0.010117, + "balance_loss_clip": 1.03252077, + "balance_loss_mlp": 1.00980139, + "epoch": 0.396091988576582, + "flos": 54880986176640.0, + "grad_norm": 0.9055276197846177, + "language_loss": 0.61180186, + "learning_rate": 2.751266999157285e-06, + "loss": 0.63253587, + "num_input_tokens_seen": 141486075, + "router_z_loss_clip": 0.29199219, + "router_z_loss_mlp": 0.01901245, + "step": 6588, + "time_per_iteration": 2.8959710597991943 + }, + { + "auxiliary_loss_clip": 0.01138631, + "auxiliary_loss_mlp": 0.01039121, + "balance_loss_clip": 1.05365264, + "balance_loss_mlp": 1.0243274, + "epoch": 0.39615211182925, + "flos": 20702896968960.0, + "grad_norm": 2.0321707809100364, + "language_loss": 0.81385177, + "learning_rate": 2.7509060441767115e-06, + "loss": 0.83562934, + "num_input_tokens_seen": 141505280, + "router_z_loss_clip": 0.84912109, + "router_z_loss_mlp": 0.14794922, + "step": 6589, + "time_per_iteration": 4.047722339630127 + }, + { + "auxiliary_loss_clip": 0.01132303, + "auxiliary_loss_mlp": 0.01034993, + "balance_loss_clip": 1.04942989, + "balance_loss_mlp": 1.01967502, + "epoch": 0.39621223508191794, + "flos": 20994312009600.0, + "grad_norm": 1.9467812519608907, + "language_loss": 0.701051, + "learning_rate": 2.7505450607217057e-06, + "loss": 0.72272402, + "num_input_tokens_seen": 141523930, + "router_z_loss_clip": 0.82910156, + "router_z_loss_mlp": 0.15332031, + "step": 6590, + "time_per_iteration": 2.571596384048462 + }, + { + "auxiliary_loss_clip": 0.01139749, + "auxiliary_loss_mlp": 0.01038137, + "balance_loss_clip": 1.05845571, + "balance_loss_mlp": 1.02331352, + "epoch": 0.3962723583345859, + "flos": 23368833417600.0, + "grad_norm": 1.7607392785322054, + "language_loss": 0.76064759, + "learning_rate": 2.750184048805956e-06, + "loss": 0.78242648, + "num_input_tokens_seen": 141541320, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.14801025, + "step": 6591, + "time_per_iteration": 2.4718434810638428 + }, + { + "auxiliary_loss_clip": 0.01144062, + "auxiliary_loss_mlp": 0.01041984, + "balance_loss_clip": 1.05888534, + "balance_loss_mlp": 1.02707124, + "epoch": 0.39633248158725387, + "flos": 25115599808640.0, + "grad_norm": 1.655205841009179, + "language_loss": 0.78025299, + "learning_rate": 2.749823008443152e-06, + "loss": 0.80211347, + "num_input_tokens_seen": 141561880, + "router_z_loss_clip": 0.85107422, + "router_z_loss_mlp": 0.14916992, + "step": 6592, + "time_per_iteration": 2.5118906497955322 + }, + { + "auxiliary_loss_clip": 0.01140246, + "auxiliary_loss_mlp": 0.01031605, + "balance_loss_clip": 1.0601778, + "balance_loss_mlp": 1.01758015, + "epoch": 0.39639260483992184, + "flos": 39787622236800.0, + "grad_norm": 1.9694339630161042, + "language_loss": 0.69605005, + "learning_rate": 2.7494619396469843e-06, + "loss": 0.71776855, + "num_input_tokens_seen": 141586460, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.14025879, + "step": 6593, + "time_per_iteration": 2.6412179470062256 + }, + { + "auxiliary_loss_clip": 0.01135693, + "auxiliary_loss_mlp": 0.0103878, + "balance_loss_clip": 1.05188036, + "balance_loss_mlp": 1.02279449, + "epoch": 0.3964527280925898, + "flos": 17347045017600.0, + "grad_norm": 1.6199500730794225, + "language_loss": 0.78323066, + "learning_rate": 2.7491008424311452e-06, + "loss": 0.80497533, + "num_input_tokens_seen": 141605955, + "router_z_loss_clip": 0.83935547, + "router_z_loss_mlp": 0.15991211, + "step": 6594, + "time_per_iteration": 2.52453875541687 + }, + { + "auxiliary_loss_clip": 0.01070103, + "auxiliary_loss_mlp": 0.01007132, + "balance_loss_clip": 1.04149747, + "balance_loss_mlp": 1.00519753, + "epoch": 0.39651285134525777, + "flos": 71717848369920.0, + "grad_norm": 0.9390736695295764, + "language_loss": 0.62998724, + "learning_rate": 2.7487397168093265e-06, + "loss": 0.65075958, + "num_input_tokens_seen": 141673140, + "router_z_loss_clip": 0.28662109, + "router_z_loss_mlp": 0.01934814, + "step": 6595, + "time_per_iteration": 3.1627120971679688 + }, + { + "auxiliary_loss_clip": 0.01137372, + "auxiliary_loss_mlp": 0.01044276, + "balance_loss_clip": 1.05100727, + "balance_loss_mlp": 1.02865934, + "epoch": 0.39657297459792573, + "flos": 25775710001280.0, + "grad_norm": 2.051009245140989, + "language_loss": 0.63176298, + "learning_rate": 2.748378562795223e-06, + "loss": 0.65357947, + "num_input_tokens_seen": 141692955, + "router_z_loss_clip": 0.86425781, + "router_z_loss_mlp": 0.15637207, + "step": 6596, + "time_per_iteration": 2.739753007888794 + }, + { + "auxiliary_loss_clip": 0.01134401, + "auxiliary_loss_mlp": 0.01039941, + "balance_loss_clip": 1.05366039, + "balance_loss_mlp": 1.02475369, + "epoch": 0.3966330978505937, + "flos": 20266115587200.0, + "grad_norm": 2.5169836878453267, + "language_loss": 0.79205483, + "learning_rate": 2.7480173804025293e-06, + "loss": 0.81379825, + "num_input_tokens_seen": 141710680, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.15185547, + "step": 6597, + "time_per_iteration": 2.4470293521881104 + }, + { + "auxiliary_loss_clip": 0.0113855, + "auxiliary_loss_mlp": 0.01040462, + "balance_loss_clip": 1.05211222, + "balance_loss_mlp": 1.02476192, + "epoch": 0.39669322110326166, + "flos": 20631183465600.0, + "grad_norm": 2.1642044171721957, + "language_loss": 0.67486346, + "learning_rate": 2.747656169644941e-06, + "loss": 0.69665349, + "num_input_tokens_seen": 141729860, + "router_z_loss_clip": 0.86523438, + "router_z_loss_mlp": 0.15679932, + "step": 6598, + "time_per_iteration": 2.5546114444732666 + }, + { + "auxiliary_loss_clip": 0.01136467, + "auxiliary_loss_mlp": 0.01040921, + "balance_loss_clip": 1.05207658, + "balance_loss_mlp": 1.02652645, + "epoch": 0.3967533443559297, + "flos": 21726063878400.0, + "grad_norm": 1.7140664473337412, + "language_loss": 0.78697646, + "learning_rate": 2.747294930536157e-06, + "loss": 0.80875039, + "num_input_tokens_seen": 141749060, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.14379883, + "step": 6599, + "time_per_iteration": 2.5582237243652344 + }, + { + "auxiliary_loss_clip": 0.01136285, + "auxiliary_loss_mlp": 0.01038708, + "balance_loss_clip": 1.05337977, + "balance_loss_mlp": 1.02262712, + "epoch": 0.39681346760859765, + "flos": 25484151306240.0, + "grad_norm": 1.6677900065619387, + "language_loss": 0.7258265, + "learning_rate": 2.7469336630898737e-06, + "loss": 0.74757648, + "num_input_tokens_seen": 141769860, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.16088867, + "step": 6600, + "time_per_iteration": 2.485591411590576 + }, + { + "auxiliary_loss_clip": 0.01136649, + "auxiliary_loss_mlp": 0.01046729, + "balance_loss_clip": 1.0534687, + "balance_loss_mlp": 1.02956259, + "epoch": 0.3968735908612656, + "flos": 20959586536320.0, + "grad_norm": 2.232184306618715, + "language_loss": 0.857503, + "learning_rate": 2.746572367319791e-06, + "loss": 0.87933671, + "num_input_tokens_seen": 141788465, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.17175293, + "step": 6601, + "time_per_iteration": 3.9101600646972656 + }, + { + "auxiliary_loss_clip": 0.01154978, + "auxiliary_loss_mlp": 0.01040887, + "balance_loss_clip": 1.06236649, + "balance_loss_mlp": 1.0231967, + "epoch": 0.3969337141139336, + "flos": 10707090531840.0, + "grad_norm": 2.2155911352084274, + "language_loss": 0.69980425, + "learning_rate": 2.7462110432396095e-06, + "loss": 0.7217629, + "num_input_tokens_seen": 141804955, + "router_z_loss_clip": 0.92626953, + "router_z_loss_mlp": 0.17675781, + "step": 6602, + "time_per_iteration": 2.4652087688446045 + }, + { + "auxiliary_loss_clip": 0.01140801, + "auxiliary_loss_mlp": 0.01044412, + "balance_loss_clip": 1.0574708, + "balance_loss_mlp": 1.02939129, + "epoch": 0.39699383736660154, + "flos": 17593714690560.0, + "grad_norm": 2.3377529474563548, + "language_loss": 0.82868278, + "learning_rate": 2.7458496908630305e-06, + "loss": 0.85053492, + "num_input_tokens_seen": 141820025, + "router_z_loss_clip": 0.83349609, + "router_z_loss_mlp": 0.15026855, + "step": 6603, + "time_per_iteration": 2.4619104862213135 + }, + { + "auxiliary_loss_clip": 0.01135544, + "auxiliary_loss_mlp": 0.01034422, + "balance_loss_clip": 1.05438399, + "balance_loss_mlp": 1.02048063, + "epoch": 0.3970539606192695, + "flos": 17785945301760.0, + "grad_norm": 1.521209538537669, + "language_loss": 0.73055816, + "learning_rate": 2.7454883102037563e-06, + "loss": 0.75225782, + "num_input_tokens_seen": 141838735, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.13946533, + "step": 6604, + "time_per_iteration": 2.4889400005340576 + }, + { + "auxiliary_loss_clip": 0.01133614, + "auxiliary_loss_mlp": 0.01039791, + "balance_loss_clip": 1.05465651, + "balance_loss_mlp": 1.02435327, + "epoch": 0.3971140838719375, + "flos": 24789495208320.0, + "grad_norm": 1.5289091573813745, + "language_loss": 0.82412869, + "learning_rate": 2.745126901275491e-06, + "loss": 0.84586275, + "num_input_tokens_seen": 141858090, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.15441895, + "step": 6605, + "time_per_iteration": 2.527327299118042 + }, + { + "auxiliary_loss_clip": 0.0114722, + "auxiliary_loss_mlp": 0.01034645, + "balance_loss_clip": 1.06602097, + "balance_loss_mlp": 1.0214721, + "epoch": 0.39717420712460544, + "flos": 24243581329920.0, + "grad_norm": 1.755540788407462, + "language_loss": 0.74122655, + "learning_rate": 2.7447654640919383e-06, + "loss": 0.76304519, + "num_input_tokens_seen": 141877540, + "router_z_loss_clip": 0.81201172, + "router_z_loss_mlp": 0.1317749, + "step": 6606, + "time_per_iteration": 2.488882064819336 + }, + { + "auxiliary_loss_clip": 0.01134197, + "auxiliary_loss_mlp": 0.01037896, + "balance_loss_clip": 1.05116105, + "balance_loss_mlp": 1.02347136, + "epoch": 0.3972343303772734, + "flos": 25884698843520.0, + "grad_norm": 1.7641216979516041, + "language_loss": 0.74048096, + "learning_rate": 2.744403998666805e-06, + "loss": 0.76220185, + "num_input_tokens_seen": 141897315, + "router_z_loss_clip": 0.83056641, + "router_z_loss_mlp": 0.14428711, + "step": 6607, + "time_per_iteration": 2.5539793968200684 + }, + { + "auxiliary_loss_clip": 0.01147298, + "auxiliary_loss_mlp": 0.01037703, + "balance_loss_clip": 1.06395626, + "balance_loss_mlp": 1.02302265, + "epoch": 0.39729445362994137, + "flos": 45623716300800.0, + "grad_norm": 2.005372914553805, + "language_loss": 0.68189967, + "learning_rate": 2.744042505013797e-06, + "loss": 0.70374972, + "num_input_tokens_seen": 141919580, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.14678955, + "step": 6608, + "time_per_iteration": 4.248083591461182 + }, + { + "auxiliary_loss_clip": 0.01135782, + "auxiliary_loss_mlp": 0.01045814, + "balance_loss_clip": 1.05112004, + "balance_loss_mlp": 1.02952957, + "epoch": 0.39735457688260933, + "flos": 20193971120640.0, + "grad_norm": 1.9490345398609865, + "language_loss": 0.74316096, + "learning_rate": 2.7436809831466233e-06, + "loss": 0.76497686, + "num_input_tokens_seen": 141937045, + "router_z_loss_clip": 0.84570312, + "router_z_loss_mlp": 0.1628418, + "step": 6609, + "time_per_iteration": 2.5594635009765625 + }, + { + "auxiliary_loss_clip": 0.01139235, + "auxiliary_loss_mlp": 0.01034605, + "balance_loss_clip": 1.05559886, + "balance_loss_mlp": 1.01981139, + "epoch": 0.3974147001352773, + "flos": 23331163029120.0, + "grad_norm": 1.8650760439471845, + "language_loss": 0.71564567, + "learning_rate": 2.7433194330789927e-06, + "loss": 0.73738408, + "num_input_tokens_seen": 141956695, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.14776611, + "step": 6610, + "time_per_iteration": 2.4712038040161133 + }, + { + "auxiliary_loss_clip": 0.01130736, + "auxiliary_loss_mlp": 0.01043364, + "balance_loss_clip": 1.04903507, + "balance_loss_mlp": 1.02712798, + "epoch": 0.39747482338794526, + "flos": 21688644885120.0, + "grad_norm": 1.7411119635093857, + "language_loss": 0.79016566, + "learning_rate": 2.7429578548246133e-06, + "loss": 0.8119067, + "num_input_tokens_seen": 141975935, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.16247559, + "step": 6611, + "time_per_iteration": 2.4635846614837646 + }, + { + "auxiliary_loss_clip": 0.01137856, + "auxiliary_loss_mlp": 0.01036851, + "balance_loss_clip": 1.05486155, + "balance_loss_mlp": 1.02206945, + "epoch": 0.3975349466406133, + "flos": 30988717816320.0, + "grad_norm": 1.8337486710492843, + "language_loss": 0.79227561, + "learning_rate": 2.7425962483971985e-06, + "loss": 0.81402272, + "num_input_tokens_seen": 141995750, + "router_z_loss_clip": 0.82910156, + "router_z_loss_mlp": 0.14782715, + "step": 6612, + "time_per_iteration": 2.5841705799102783 + }, + { + "auxiliary_loss_clip": 0.01075519, + "auxiliary_loss_mlp": 0.01007026, + "balance_loss_clip": 1.04642391, + "balance_loss_mlp": 1.00513017, + "epoch": 0.39759506989328125, + "flos": 63683948833920.0, + "grad_norm": 0.8437441227394091, + "language_loss": 0.64941537, + "learning_rate": 2.742234613810459e-06, + "loss": 0.67024076, + "num_input_tokens_seen": 142057655, + "router_z_loss_clip": 0.29052734, + "router_z_loss_mlp": 0.01895142, + "step": 6613, + "time_per_iteration": 3.001143217086792 + }, + { + "auxiliary_loss_clip": 0.01147262, + "auxiliary_loss_mlp": 0.01033137, + "balance_loss_clip": 1.06371379, + "balance_loss_mlp": 1.01749682, + "epoch": 0.3976551931459492, + "flos": 23695835857920.0, + "grad_norm": 2.627729136402832, + "language_loss": 0.72089058, + "learning_rate": 2.741872951078109e-06, + "loss": 0.74269456, + "num_input_tokens_seen": 142076020, + "router_z_loss_clip": 0.83544922, + "router_z_loss_mlp": 0.15649414, + "step": 6614, + "time_per_iteration": 2.483943462371826 + }, + { + "auxiliary_loss_clip": 0.01134974, + "auxiliary_loss_mlp": 0.01034377, + "balance_loss_clip": 1.05159163, + "balance_loss_mlp": 1.01967835, + "epoch": 0.3977153163986172, + "flos": 15669657745920.0, + "grad_norm": 1.835931489171768, + "language_loss": 0.81811738, + "learning_rate": 2.741511260213862e-06, + "loss": 0.83981091, + "num_input_tokens_seen": 142093790, + "router_z_loss_clip": 0.83349609, + "router_z_loss_mlp": 0.14703369, + "step": 6615, + "time_per_iteration": 2.438354730606079 + }, + { + "auxiliary_loss_clip": 0.01135177, + "auxiliary_loss_mlp": 0.01036636, + "balance_loss_clip": 1.05233431, + "balance_loss_mlp": 1.02199757, + "epoch": 0.39777543965128515, + "flos": 14064702249600.0, + "grad_norm": 1.9718342871210672, + "language_loss": 0.6742292, + "learning_rate": 2.741149541231434e-06, + "loss": 0.69594735, + "num_input_tokens_seen": 142110545, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.14630127, + "step": 6616, + "time_per_iteration": 2.441570997238159 + }, + { + "auxiliary_loss_clip": 0.01141149, + "auxiliary_loss_mlp": 0.0104577, + "balance_loss_clip": 1.05414796, + "balance_loss_mlp": 1.02997494, + "epoch": 0.3978355629039531, + "flos": 23367468700800.0, + "grad_norm": 2.844487235670068, + "language_loss": 0.83802986, + "learning_rate": 2.740787794144541e-06, + "loss": 0.8598991, + "num_input_tokens_seen": 142128695, + "router_z_loss_clip": 0.86962891, + "router_z_loss_mlp": 0.15795898, + "step": 6617, + "time_per_iteration": 2.4476161003112793 + }, + { + "auxiliary_loss_clip": 0.01134622, + "auxiliary_loss_mlp": 0.01039715, + "balance_loss_clip": 1.05517495, + "balance_loss_mlp": 1.02647138, + "epoch": 0.3978956861566211, + "flos": 19062785036160.0, + "grad_norm": 1.6717420051581628, + "language_loss": 0.72723055, + "learning_rate": 2.7404260189669e-06, + "loss": 0.74897391, + "num_input_tokens_seen": 142148375, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.13232422, + "step": 6618, + "time_per_iteration": 2.457279682159424 + }, + { + "auxiliary_loss_clip": 0.01143231, + "auxiliary_loss_mlp": 0.0104155, + "balance_loss_clip": 1.06049085, + "balance_loss_mlp": 1.02586782, + "epoch": 0.39795580940928904, + "flos": 30227699341440.0, + "grad_norm": 2.157368334836629, + "language_loss": 0.65585399, + "learning_rate": 2.740064215712231e-06, + "loss": 0.67770177, + "num_input_tokens_seen": 142169735, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.15673828, + "step": 6619, + "time_per_iteration": 2.5961809158325195 + }, + { + "auxiliary_loss_clip": 0.01080807, + "auxiliary_loss_mlp": 0.01006597, + "balance_loss_clip": 1.05028999, + "balance_loss_mlp": 1.00473785, + "epoch": 0.398015932661957, + "flos": 69847224906240.0, + "grad_norm": 0.7820039329105525, + "language_loss": 0.58221948, + "learning_rate": 2.7397023843942527e-06, + "loss": 0.60309356, + "num_input_tokens_seen": 142229520, + "router_z_loss_clip": 0.30517578, + "router_z_loss_mlp": 0.01861572, + "step": 6620, + "time_per_iteration": 3.0367677211761475 + }, + { + "auxiliary_loss_clip": 0.01137987, + "auxiliary_loss_mlp": 0.01041266, + "balance_loss_clip": 1.05737185, + "balance_loss_mlp": 1.0281713, + "epoch": 0.39807605591462497, + "flos": 20157773189760.0, + "grad_norm": 1.5410310580644755, + "language_loss": 0.79112655, + "learning_rate": 2.739340525026686e-06, + "loss": 0.81291908, + "num_input_tokens_seen": 142247660, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.1307373, + "step": 6621, + "time_per_iteration": 2.4937825202941895 + }, + { + "auxiliary_loss_clip": 0.01140013, + "auxiliary_loss_mlp": 0.01037268, + "balance_loss_clip": 1.05882311, + "balance_loss_mlp": 1.02383327, + "epoch": 0.39813617916729294, + "flos": 21141761339520.0, + "grad_norm": 1.9897422977709354, + "language_loss": 0.77732509, + "learning_rate": 2.738978637623252e-06, + "loss": 0.7990979, + "num_input_tokens_seen": 142266990, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.13446045, + "step": 6622, + "time_per_iteration": 2.709019899368286 + }, + { + "auxiliary_loss_clip": 0.01135777, + "auxiliary_loss_mlp": 0.01038741, + "balance_loss_clip": 1.05323815, + "balance_loss_mlp": 1.02407193, + "epoch": 0.3981963024199609, + "flos": 18988485753600.0, + "grad_norm": 1.462644902907815, + "language_loss": 0.74598503, + "learning_rate": 2.738616722197674e-06, + "loss": 0.76773024, + "num_input_tokens_seen": 142287170, + "router_z_loss_clip": 0.82568359, + "router_z_loss_mlp": 0.14666748, + "step": 6623, + "time_per_iteration": 2.4776663780212402 + }, + { + "auxiliary_loss_clip": 0.01135043, + "auxiliary_loss_mlp": 0.01047568, + "balance_loss_clip": 1.0539223, + "balance_loss_mlp": 1.03326285, + "epoch": 0.39825642567262887, + "flos": 16575108808320.0, + "grad_norm": 1.9378143616157508, + "language_loss": 0.79140323, + "learning_rate": 2.7382547787636766e-06, + "loss": 0.81322938, + "num_input_tokens_seen": 142305405, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.14306641, + "step": 6624, + "time_per_iteration": 2.463460683822632 + }, + { + "auxiliary_loss_clip": 0.011377, + "auxiliary_loss_mlp": 0.01040311, + "balance_loss_clip": 1.05265617, + "balance_loss_mlp": 1.0243969, + "epoch": 0.39831654892529683, + "flos": 22199833290240.0, + "grad_norm": 32.95009893384554, + "language_loss": 0.84212708, + "learning_rate": 2.7378928073349832e-06, + "loss": 0.86390722, + "num_input_tokens_seen": 142322710, + "router_z_loss_clip": 0.85058594, + "router_z_loss_mlp": 0.15899658, + "step": 6625, + "time_per_iteration": 3.833293914794922 + }, + { + "auxiliary_loss_clip": 0.01132959, + "auxiliary_loss_mlp": 0.01039923, + "balance_loss_clip": 1.05077708, + "balance_loss_mlp": 1.02504563, + "epoch": 0.39837667217796485, + "flos": 10487963612160.0, + "grad_norm": 2.069920078790775, + "language_loss": 0.86907798, + "learning_rate": 2.737530807925321e-06, + "loss": 0.89080691, + "num_input_tokens_seen": 142338535, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.14886475, + "step": 6626, + "time_per_iteration": 2.4443979263305664 + }, + { + "auxiliary_loss_clip": 0.01138132, + "auxiliary_loss_mlp": 0.01047109, + "balance_loss_clip": 1.05703163, + "balance_loss_mlp": 1.03057432, + "epoch": 0.3984367954306328, + "flos": 17965282930560.0, + "grad_norm": 2.579510503396921, + "language_loss": 0.83874792, + "learning_rate": 2.737168780548417e-06, + "loss": 0.86060035, + "num_input_tokens_seen": 142354570, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.16540527, + "step": 6627, + "time_per_iteration": 2.4442386627197266 + }, + { + "auxiliary_loss_clip": 0.0112967, + "auxiliary_loss_mlp": 0.01036582, + "balance_loss_clip": 1.05112684, + "balance_loss_mlp": 1.02366602, + "epoch": 0.3984969186833008, + "flos": 22711057608960.0, + "grad_norm": 1.5038801764679146, + "language_loss": 0.82988918, + "learning_rate": 2.736806725217998e-06, + "loss": 0.85155171, + "num_input_tokens_seen": 142374395, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.12915039, + "step": 6628, + "time_per_iteration": 2.4815621376037598 + }, + { + "auxiliary_loss_clip": 0.01132085, + "auxiliary_loss_mlp": 0.01049227, + "balance_loss_clip": 1.05040658, + "balance_loss_mlp": 1.03495741, + "epoch": 0.39855704193596875, + "flos": 23405785534080.0, + "grad_norm": 1.7068386358856174, + "language_loss": 0.71107066, + "learning_rate": 2.7364446419477945e-06, + "loss": 0.73288381, + "num_input_tokens_seen": 142396040, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.14263916, + "step": 6629, + "time_per_iteration": 2.532839775085449 + }, + { + "auxiliary_loss_clip": 0.0113043, + "auxiliary_loss_mlp": 0.01037956, + "balance_loss_clip": 1.05185938, + "balance_loss_mlp": 1.02397859, + "epoch": 0.3986171651886367, + "flos": 21251935330560.0, + "grad_norm": 1.793835920495504, + "language_loss": 0.81067234, + "learning_rate": 2.7360825307515366e-06, + "loss": 0.83235615, + "num_input_tokens_seen": 142415495, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.13983154, + "step": 6630, + "time_per_iteration": 2.661642551422119 + }, + { + "auxiliary_loss_clip": 0.01134322, + "auxiliary_loss_mlp": 0.01036057, + "balance_loss_clip": 1.05338717, + "balance_loss_mlp": 1.02040434, + "epoch": 0.3986772884413047, + "flos": 12458705258880.0, + "grad_norm": 1.9513646426631874, + "language_loss": 0.75208288, + "learning_rate": 2.7357203916429555e-06, + "loss": 0.77378666, + "num_input_tokens_seen": 142431865, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.15649414, + "step": 6631, + "time_per_iteration": 2.4716508388519287 + }, + { + "auxiliary_loss_clip": 0.01139763, + "auxiliary_loss_mlp": 0.01038828, + "balance_loss_clip": 1.05703306, + "balance_loss_mlp": 1.02405179, + "epoch": 0.39873741169397264, + "flos": 19646117907840.0, + "grad_norm": 2.053757925566536, + "language_loss": 0.71825862, + "learning_rate": 2.735358224635783e-06, + "loss": 0.74004459, + "num_input_tokens_seen": 142450595, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.14782715, + "step": 6632, + "time_per_iteration": 2.5501251220703125 + }, + { + "auxiliary_loss_clip": 0.01133185, + "auxiliary_loss_mlp": 0.0103407, + "balance_loss_clip": 1.0526762, + "balance_loss_mlp": 1.02114797, + "epoch": 0.3987975349466406, + "flos": 21684766216320.0, + "grad_norm": 1.766793410211278, + "language_loss": 0.75404513, + "learning_rate": 2.7349960297437533e-06, + "loss": 0.77571768, + "num_input_tokens_seen": 142466650, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.12915039, + "step": 6633, + "time_per_iteration": 3.845587730407715 + }, + { + "auxiliary_loss_clip": 0.01138142, + "auxiliary_loss_mlp": 0.01029943, + "balance_loss_clip": 1.05706573, + "balance_loss_mlp": 1.01664579, + "epoch": 0.3988576581993086, + "flos": 23914064937600.0, + "grad_norm": 1.836680436225814, + "language_loss": 0.8120355, + "learning_rate": 2.7346338069806e-06, + "loss": 0.83371639, + "num_input_tokens_seen": 142486165, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.13305664, + "step": 6634, + "time_per_iteration": 2.5469181537628174 + }, + { + "auxiliary_loss_clip": 0.01133337, + "auxiliary_loss_mlp": 0.01034624, + "balance_loss_clip": 1.05204773, + "balance_loss_mlp": 1.01993728, + "epoch": 0.39891778145197654, + "flos": 18149899858560.0, + "grad_norm": 1.9639004334346857, + "language_loss": 0.7459271, + "learning_rate": 2.7342715563600597e-06, + "loss": 0.76760668, + "num_input_tokens_seen": 142505035, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.14685059, + "step": 6635, + "time_per_iteration": 2.728128671646118 + }, + { + "auxiliary_loss_clip": 0.01149321, + "auxiliary_loss_mlp": 0.01041968, + "balance_loss_clip": 1.06221437, + "balance_loss_mlp": 1.02585649, + "epoch": 0.3989779047046445, + "flos": 22595281096320.0, + "grad_norm": 2.2664064914471354, + "language_loss": 0.66304588, + "learning_rate": 2.733909277895868e-06, + "loss": 0.6849587, + "num_input_tokens_seen": 142521870, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.16113281, + "step": 6636, + "time_per_iteration": 2.4695496559143066 + }, + { + "auxiliary_loss_clip": 0.01128473, + "auxiliary_loss_mlp": 0.01039169, + "balance_loss_clip": 1.04882002, + "balance_loss_mlp": 1.02390981, + "epoch": 0.39903802795731247, + "flos": 18077216688000.0, + "grad_norm": 1.7411580365042127, + "language_loss": 0.8181932, + "learning_rate": 2.733546971601763e-06, + "loss": 0.83986968, + "num_input_tokens_seen": 142540455, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.15258789, + "step": 6637, + "time_per_iteration": 2.487595558166504 + }, + { + "auxiliary_loss_clip": 0.01069395, + "auxiliary_loss_mlp": 0.01005021, + "balance_loss_clip": 1.04047871, + "balance_loss_mlp": 1.00334668, + "epoch": 0.39909815120998043, + "flos": 70441367771520.0, + "grad_norm": 0.7448568726453602, + "language_loss": 0.53199351, + "learning_rate": 2.733184637491484e-06, + "loss": 0.55273765, + "num_input_tokens_seen": 142599665, + "router_z_loss_clip": 0.28955078, + "router_z_loss_mlp": 0.01673889, + "step": 6638, + "time_per_iteration": 3.133427858352661 + }, + { + "auxiliary_loss_clip": 0.01130031, + "auxiliary_loss_mlp": 0.01034116, + "balance_loss_clip": 1.04928553, + "balance_loss_mlp": 1.01999605, + "epoch": 0.39915827446264845, + "flos": 18549262247040.0, + "grad_norm": 1.57520639684153, + "language_loss": 0.75717646, + "learning_rate": 2.732822275578769e-06, + "loss": 0.77881789, + "num_input_tokens_seen": 142618845, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.14135742, + "step": 6639, + "time_per_iteration": 2.4691858291625977 + }, + { + "auxiliary_loss_clip": 0.01129729, + "auxiliary_loss_mlp": 0.01031277, + "balance_loss_clip": 1.05220556, + "balance_loss_mlp": 1.01812196, + "epoch": 0.3992183977153164, + "flos": 29897249195520.0, + "grad_norm": 1.562660373014358, + "language_loss": 0.76354992, + "learning_rate": 2.7324598858773603e-06, + "loss": 0.78515995, + "num_input_tokens_seen": 142640885, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.13153076, + "step": 6640, + "time_per_iteration": 2.51564621925354 + }, + { + "auxiliary_loss_clip": 0.01132092, + "auxiliary_loss_mlp": 0.01035606, + "balance_loss_clip": 1.05057263, + "balance_loss_mlp": 1.021891, + "epoch": 0.3992785209679844, + "flos": 22565080736640.0, + "grad_norm": 2.1401315588356185, + "language_loss": 0.81711018, + "learning_rate": 2.7320974684009996e-06, + "loss": 0.8387872, + "num_input_tokens_seen": 142659340, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.13726807, + "step": 6641, + "time_per_iteration": 2.445952892303467 + }, + { + "auxiliary_loss_clip": 0.01141171, + "auxiliary_loss_mlp": 0.01033032, + "balance_loss_clip": 1.05770922, + "balance_loss_mlp": 1.0184288, + "epoch": 0.39933864422065235, + "flos": 19682674974720.0, + "grad_norm": 1.9471068384054684, + "language_loss": 0.76926017, + "learning_rate": 2.7317350231634288e-06, + "loss": 0.79100215, + "num_input_tokens_seen": 142677085, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.1461792, + "step": 6642, + "time_per_iteration": 2.4509103298187256 + }, + { + "auxiliary_loss_clip": 0.01130953, + "auxiliary_loss_mlp": 0.01034981, + "balance_loss_clip": 1.04925609, + "balance_loss_mlp": 1.0205152, + "epoch": 0.3993987674733203, + "flos": 23038491012480.0, + "grad_norm": 1.9757668410745022, + "language_loss": 0.72273225, + "learning_rate": 2.731372550178393e-06, + "loss": 0.74439162, + "num_input_tokens_seen": 142694595, + "router_z_loss_clip": 0.81738281, + "router_z_loss_mlp": 0.14465332, + "step": 6643, + "time_per_iteration": 2.4432334899902344 + }, + { + "auxiliary_loss_clip": 0.01140838, + "auxiliary_loss_mlp": 0.01035691, + "balance_loss_clip": 1.05848479, + "balance_loss_mlp": 1.02108777, + "epoch": 0.3994588907259883, + "flos": 19390828970880.0, + "grad_norm": 1.6160009648903815, + "language_loss": 0.66799164, + "learning_rate": 2.7310100494596375e-06, + "loss": 0.68975687, + "num_input_tokens_seen": 142714175, + "router_z_loss_clip": 0.82275391, + "router_z_loss_mlp": 0.14611816, + "step": 6644, + "time_per_iteration": 2.4657320976257324 + }, + { + "auxiliary_loss_clip": 0.01131294, + "auxiliary_loss_mlp": 0.01047498, + "balance_loss_clip": 1.05065286, + "balance_loss_mlp": 1.03167844, + "epoch": 0.39951901397865625, + "flos": 13734395758080.0, + "grad_norm": 1.8644603198882774, + "language_loss": 0.77985424, + "learning_rate": 2.730647521020907e-06, + "loss": 0.80164218, + "num_input_tokens_seen": 142730955, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.15820312, + "step": 6645, + "time_per_iteration": 3.825723171234131 + }, + { + "auxiliary_loss_clip": 0.01128078, + "auxiliary_loss_mlp": 0.01038219, + "balance_loss_clip": 1.04561687, + "balance_loss_mlp": 1.02344918, + "epoch": 0.3995791372313242, + "flos": 23586451966080.0, + "grad_norm": 1.6113815737322628, + "language_loss": 0.69993007, + "learning_rate": 2.73028496487595e-06, + "loss": 0.72159302, + "num_input_tokens_seen": 142751200, + "router_z_loss_clip": 0.82470703, + "router_z_loss_mlp": 0.14770508, + "step": 6646, + "time_per_iteration": 2.5364418029785156 + }, + { + "auxiliary_loss_clip": 0.01124629, + "auxiliary_loss_mlp": 0.01038594, + "balance_loss_clip": 1.042647, + "balance_loss_mlp": 1.02428889, + "epoch": 0.3996392604839922, + "flos": 21355896268800.0, + "grad_norm": 2.0184736939095718, + "language_loss": 0.72001553, + "learning_rate": 2.729922381038513e-06, + "loss": 0.74164784, + "num_input_tokens_seen": 142770170, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.14306641, + "step": 6647, + "time_per_iteration": 2.5853946208953857 + }, + { + "auxiliary_loss_clip": 0.01126003, + "auxiliary_loss_mlp": 0.01044306, + "balance_loss_clip": 1.04726124, + "balance_loss_mlp": 1.02967882, + "epoch": 0.39969938373666014, + "flos": 26032255914240.0, + "grad_norm": 3.1989245648203175, + "language_loss": 0.74347872, + "learning_rate": 2.7295597695223463e-06, + "loss": 0.76518184, + "num_input_tokens_seen": 142792680, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.14630127, + "step": 6648, + "time_per_iteration": 2.7341721057891846 + }, + { + "auxiliary_loss_clip": 0.01140912, + "auxiliary_loss_mlp": 0.01032303, + "balance_loss_clip": 1.05774236, + "balance_loss_mlp": 1.01747966, + "epoch": 0.3997595069893281, + "flos": 20116367786880.0, + "grad_norm": 2.87556304451312, + "language_loss": 0.65552807, + "learning_rate": 2.7291971303412006e-06, + "loss": 0.67726016, + "num_input_tokens_seen": 142810510, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.14813232, + "step": 6649, + "time_per_iteration": 2.4945180416107178 + }, + { + "auxiliary_loss_clip": 0.0114618, + "auxiliary_loss_mlp": 0.0103746, + "balance_loss_clip": 1.06257463, + "balance_loss_mlp": 1.02356637, + "epoch": 0.39981963024199607, + "flos": 27783403764480.0, + "grad_norm": 1.9678033320157557, + "language_loss": 0.75615805, + "learning_rate": 2.728834463508826e-06, + "loss": 0.77799439, + "num_input_tokens_seen": 142832455, + "router_z_loss_clip": 0.83691406, + "router_z_loss_mlp": 0.13879395, + "step": 6650, + "time_per_iteration": 2.5718371868133545 + }, + { + "auxiliary_loss_clip": 0.0112558, + "auxiliary_loss_mlp": 0.01039238, + "balance_loss_clip": 1.04556704, + "balance_loss_mlp": 1.02498615, + "epoch": 0.39987975349466404, + "flos": 21944436612480.0, + "grad_norm": 2.0277513178635624, + "language_loss": 0.71695495, + "learning_rate": 2.728471769038975e-06, + "loss": 0.73860317, + "num_input_tokens_seen": 142852590, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.14257812, + "step": 6651, + "time_per_iteration": 2.5515308380126953 + }, + { + "auxiliary_loss_clip": 0.01136216, + "auxiliary_loss_mlp": 0.01034532, + "balance_loss_clip": 1.05356157, + "balance_loss_mlp": 1.02023935, + "epoch": 0.39993987674733206, + "flos": 20704405340160.0, + "grad_norm": 1.6795256572861694, + "language_loss": 0.73411316, + "learning_rate": 2.728109046945403e-06, + "loss": 0.75582063, + "num_input_tokens_seen": 142870595, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.14294434, + "step": 6652, + "time_per_iteration": 3.8860440254211426 + }, + { + "auxiliary_loss_clip": 0.01055641, + "auxiliary_loss_mlp": 0.0100633, + "balance_loss_clip": 1.02733433, + "balance_loss_mlp": 1.00479412, + "epoch": 0.4, + "flos": 61525429862400.0, + "grad_norm": 0.8614185081395439, + "language_loss": 0.60652828, + "learning_rate": 2.727746297241862e-06, + "loss": 0.62714797, + "num_input_tokens_seen": 142925805, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.0153656, + "step": 6653, + "time_per_iteration": 3.0251004695892334 + }, + { + "auxiliary_loss_clip": 0.01123392, + "auxiliary_loss_mlp": 0.01040157, + "balance_loss_clip": 1.0466547, + "balance_loss_mlp": 1.02662683, + "epoch": 0.400060123252668, + "flos": 14502309644160.0, + "grad_norm": 1.961577621992696, + "language_loss": 0.66958642, + "learning_rate": 2.7273835199421085e-06, + "loss": 0.69122195, + "num_input_tokens_seen": 142943145, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.13543701, + "step": 6654, + "time_per_iteration": 2.468437910079956 + }, + { + "auxiliary_loss_clip": 0.0113113, + "auxiliary_loss_mlp": 0.01038616, + "balance_loss_clip": 1.05203152, + "balance_loss_mlp": 1.02667093, + "epoch": 0.40012024650533595, + "flos": 19093308618240.0, + "grad_norm": 1.8481227559945494, + "language_loss": 0.90275896, + "learning_rate": 2.7270207150599e-06, + "loss": 0.92445636, + "num_input_tokens_seen": 142956925, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.11938477, + "step": 6655, + "time_per_iteration": 2.4445981979370117 + }, + { + "auxiliary_loss_clip": 0.01135783, + "auxiliary_loss_mlp": 0.01031423, + "balance_loss_clip": 1.05721343, + "balance_loss_mlp": 1.01972866, + "epoch": 0.4001803697580039, + "flos": 29351012094720.0, + "grad_norm": 1.8452497142919733, + "language_loss": 0.73557651, + "learning_rate": 2.7266578826089917e-06, + "loss": 0.75724858, + "num_input_tokens_seen": 142978040, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.11694336, + "step": 6656, + "time_per_iteration": 2.5397117137908936 + }, + { + "auxiliary_loss_clip": 0.01129415, + "auxiliary_loss_mlp": 0.01042937, + "balance_loss_clip": 1.04908633, + "balance_loss_mlp": 1.02923441, + "epoch": 0.4002404930106719, + "flos": 20920048640640.0, + "grad_norm": 1.839115751661133, + "language_loss": 0.73557281, + "learning_rate": 2.726295022603144e-06, + "loss": 0.75729626, + "num_input_tokens_seen": 142998390, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.137146, + "step": 6657, + "time_per_iteration": 2.7205357551574707 + }, + { + "auxiliary_loss_clip": 0.01127327, + "auxiliary_loss_mlp": 0.01041803, + "balance_loss_clip": 1.0461607, + "balance_loss_mlp": 1.02633548, + "epoch": 0.40030061626333985, + "flos": 28405735827840.0, + "grad_norm": 1.4446685080899988, + "language_loss": 0.79778445, + "learning_rate": 2.725932135056117e-06, + "loss": 0.81947571, + "num_input_tokens_seen": 143021505, + "router_z_loss_clip": 0.81201172, + "router_z_loss_mlp": 0.15460205, + "step": 6658, + "time_per_iteration": 2.5939877033233643 + }, + { + "auxiliary_loss_clip": 0.01130654, + "auxiliary_loss_mlp": 0.01039888, + "balance_loss_clip": 1.04951501, + "balance_loss_mlp": 1.02593446, + "epoch": 0.4003607395160078, + "flos": 25921615046400.0, + "grad_norm": 1.9286693444911767, + "language_loss": 0.77408171, + "learning_rate": 2.72556921998167e-06, + "loss": 0.7957871, + "num_input_tokens_seen": 143041375, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.13946533, + "step": 6659, + "time_per_iteration": 2.5255086421966553 + }, + { + "auxiliary_loss_clip": 0.01131263, + "auxiliary_loss_mlp": 0.01030683, + "balance_loss_clip": 1.05672657, + "balance_loss_mlp": 1.01865494, + "epoch": 0.4004208627686758, + "flos": 20768648814720.0, + "grad_norm": 1.676717267482202, + "language_loss": 0.72631949, + "learning_rate": 2.7252062773935662e-06, + "loss": 0.74793899, + "num_input_tokens_seen": 143058725, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.12030029, + "step": 6660, + "time_per_iteration": 2.5479586124420166 + }, + { + "auxiliary_loss_clip": 0.01130423, + "auxiliary_loss_mlp": 0.01037855, + "balance_loss_clip": 1.04912305, + "balance_loss_mlp": 1.02475393, + "epoch": 0.40048098602134374, + "flos": 24681224638080.0, + "grad_norm": 1.8373348862380048, + "language_loss": 0.70942181, + "learning_rate": 2.7248433073055674e-06, + "loss": 0.73110461, + "num_input_tokens_seen": 143076995, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.13104248, + "step": 6661, + "time_per_iteration": 2.540132999420166 + }, + { + "auxiliary_loss_clip": 0.01132826, + "auxiliary_loss_mlp": 0.01053715, + "balance_loss_clip": 1.04990315, + "balance_loss_mlp": 1.03691876, + "epoch": 0.4005411092740117, + "flos": 23185688947200.0, + "grad_norm": 2.727329428969771, + "language_loss": 0.7564317, + "learning_rate": 2.724480309731437e-06, + "loss": 0.77829713, + "num_input_tokens_seen": 143096780, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.16809082, + "step": 6662, + "time_per_iteration": 2.4527459144592285 + }, + { + "auxiliary_loss_clip": 0.0113969, + "auxiliary_loss_mlp": 0.0103391, + "balance_loss_clip": 1.05673444, + "balance_loss_mlp": 1.01993895, + "epoch": 0.4006012325266797, + "flos": 17522324409600.0, + "grad_norm": 1.9079680414023064, + "language_loss": 0.66092384, + "learning_rate": 2.7241172846849417e-06, + "loss": 0.68265986, + "num_input_tokens_seen": 143112590, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.13970947, + "step": 6663, + "time_per_iteration": 2.4071226119995117 + }, + { + "auxiliary_loss_clip": 0.01134737, + "auxiliary_loss_mlp": 0.01035415, + "balance_loss_clip": 1.05361688, + "balance_loss_mlp": 1.02191424, + "epoch": 0.40066135577934764, + "flos": 19857200181120.0, + "grad_norm": 2.0680322010326027, + "language_loss": 0.86188555, + "learning_rate": 2.7237542321798455e-06, + "loss": 0.88358706, + "num_input_tokens_seen": 143130220, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.13494873, + "step": 6664, + "time_per_iteration": 2.4562973976135254 + }, + { + "auxiliary_loss_clip": 0.01126258, + "auxiliary_loss_mlp": 0.01037055, + "balance_loss_clip": 1.04641604, + "balance_loss_mlp": 1.0231849, + "epoch": 0.40072147903201566, + "flos": 18150007599360.0, + "grad_norm": 2.218594521437142, + "language_loss": 0.85035473, + "learning_rate": 2.723391152229917e-06, + "loss": 0.87198794, + "num_input_tokens_seen": 143147160, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.13873291, + "step": 6665, + "time_per_iteration": 2.4936435222625732 + }, + { + "auxiliary_loss_clip": 0.01133583, + "auxiliary_loss_mlp": 0.01045992, + "balance_loss_clip": 1.05021572, + "balance_loss_mlp": 1.03018475, + "epoch": 0.4007816022846836, + "flos": 18661267831680.0, + "grad_norm": 2.944777471588603, + "language_loss": 0.78241003, + "learning_rate": 2.7230280448489236e-06, + "loss": 0.80420578, + "num_input_tokens_seen": 143164605, + "router_z_loss_clip": 0.83447266, + "router_z_loss_mlp": 0.15820312, + "step": 6666, + "time_per_iteration": 2.4472906589508057 + }, + { + "auxiliary_loss_clip": 0.01129873, + "auxiliary_loss_mlp": 0.01038985, + "balance_loss_clip": 1.04894948, + "balance_loss_mlp": 1.02407241, + "epoch": 0.4008417255373516, + "flos": 25703170485120.0, + "grad_norm": 2.039289204780522, + "language_loss": 0.7379626, + "learning_rate": 2.7226649100506333e-06, + "loss": 0.75965112, + "num_input_tokens_seen": 143183965, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.14916992, + "step": 6667, + "time_per_iteration": 2.5049593448638916 + }, + { + "auxiliary_loss_clip": 0.01141006, + "auxiliary_loss_mlp": 0.01057785, + "balance_loss_clip": 1.05427921, + "balance_loss_mlp": 1.04095244, + "epoch": 0.40090184879001955, + "flos": 22858614679680.0, + "grad_norm": 1.4921611363762892, + "language_loss": 0.75874925, + "learning_rate": 2.7223017478488183e-06, + "loss": 0.78073716, + "num_input_tokens_seen": 143204965, + "router_z_loss_clip": 0.86816406, + "router_z_loss_mlp": 0.16821289, + "step": 6668, + "time_per_iteration": 3.8851335048675537 + }, + { + "auxiliary_loss_clip": 0.01138455, + "auxiliary_loss_mlp": 0.01050285, + "balance_loss_clip": 1.05448925, + "balance_loss_mlp": 1.03451991, + "epoch": 0.4009619720426875, + "flos": 29059848449280.0, + "grad_norm": 1.9481089171743662, + "language_loss": 0.82400197, + "learning_rate": 2.721938558257248e-06, + "loss": 0.84588939, + "num_input_tokens_seen": 143225015, + "router_z_loss_clip": 0.83886719, + "router_z_loss_mlp": 0.15765381, + "step": 6669, + "time_per_iteration": 2.520827054977417 + }, + { + "auxiliary_loss_clip": 0.0107261, + "auxiliary_loss_mlp": 0.01013328, + "balance_loss_clip": 1.04251611, + "balance_loss_mlp": 1.01150739, + "epoch": 0.4010220952953555, + "flos": 66059763131520.0, + "grad_norm": 0.697793498242915, + "language_loss": 0.53375524, + "learning_rate": 2.721575341289695e-06, + "loss": 0.5546146, + "num_input_tokens_seen": 143294925, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.01818848, + "step": 6670, + "time_per_iteration": 3.2383663654327393 + }, + { + "auxiliary_loss_clip": 0.01133438, + "auxiliary_loss_mlp": 0.01039138, + "balance_loss_clip": 1.0515182, + "balance_loss_mlp": 1.02457047, + "epoch": 0.40108221854802345, + "flos": 29642822184960.0, + "grad_norm": 1.6894464762005712, + "language_loss": 0.88365018, + "learning_rate": 2.7212120969599333e-06, + "loss": 0.9053759, + "num_input_tokens_seen": 143314170, + "router_z_loss_clip": 0.81982422, + "router_z_loss_mlp": 0.14587402, + "step": 6671, + "time_per_iteration": 2.5764243602752686 + }, + { + "auxiliary_loss_clip": 0.01135647, + "auxiliary_loss_mlp": 0.01035558, + "balance_loss_clip": 1.05448389, + "balance_loss_mlp": 1.02062106, + "epoch": 0.4011423418006914, + "flos": 19929560129280.0, + "grad_norm": 1.885346506073835, + "language_loss": 0.79123634, + "learning_rate": 2.720848825281736e-06, + "loss": 0.81294841, + "num_input_tokens_seen": 143330050, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.14910889, + "step": 6672, + "time_per_iteration": 2.5556113719940186 + }, + { + "auxiliary_loss_clip": 0.01137027, + "auxiliary_loss_mlp": 0.0102795, + "balance_loss_clip": 1.05862975, + "balance_loss_mlp": 1.0142293, + "epoch": 0.4012024650533594, + "flos": 20084299920000.0, + "grad_norm": 2.064574694372564, + "language_loss": 0.63013458, + "learning_rate": 2.72048552626888e-06, + "loss": 0.6517843, + "num_input_tokens_seen": 143348650, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.13708496, + "step": 6673, + "time_per_iteration": 2.505500078201294 + }, + { + "auxiliary_loss_clip": 0.01143445, + "auxiliary_loss_mlp": 0.01038207, + "balance_loss_clip": 1.0604192, + "balance_loss_mlp": 1.02437854, + "epoch": 0.40126258830602735, + "flos": 21695719864320.0, + "grad_norm": 1.4862643692637063, + "language_loss": 0.80390763, + "learning_rate": 2.7201221999351402e-06, + "loss": 0.82572418, + "num_input_tokens_seen": 143370275, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.13824463, + "step": 6674, + "time_per_iteration": 2.5426151752471924 + }, + { + "auxiliary_loss_clip": 0.01141344, + "auxiliary_loss_mlp": 0.0103582, + "balance_loss_clip": 1.05611706, + "balance_loss_mlp": 1.02149129, + "epoch": 0.4013227115586953, + "flos": 12020379592320.0, + "grad_norm": 2.405266469508899, + "language_loss": 0.83177602, + "learning_rate": 2.719758846294294e-06, + "loss": 0.85354763, + "num_input_tokens_seen": 143385390, + "router_z_loss_clip": 0.85253906, + "router_z_loss_mlp": 0.14331055, + "step": 6675, + "time_per_iteration": 2.4884424209594727 + }, + { + "auxiliary_loss_clip": 0.0113981, + "auxiliary_loss_mlp": 0.01032832, + "balance_loss_clip": 1.0591917, + "balance_loss_mlp": 1.01811576, + "epoch": 0.4013828348113633, + "flos": 25447522412160.0, + "grad_norm": 1.75407277295299, + "language_loss": 0.93573743, + "learning_rate": 2.71939546536012e-06, + "loss": 0.9574638, + "num_input_tokens_seen": 143404215, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.14703369, + "step": 6676, + "time_per_iteration": 3.9421558380126953 + }, + { + "auxiliary_loss_clip": 0.01148812, + "auxiliary_loss_mlp": 0.01043567, + "balance_loss_clip": 1.06140351, + "balance_loss_mlp": 1.02667475, + "epoch": 0.40144295806403124, + "flos": 18582946225920.0, + "grad_norm": 1.9831325300422675, + "language_loss": 0.7976613, + "learning_rate": 2.719032057146399e-06, + "loss": 0.81958508, + "num_input_tokens_seen": 143422245, + "router_z_loss_clip": 0.87304688, + "router_z_loss_mlp": 0.16894531, + "step": 6677, + "time_per_iteration": 2.4359257221221924 + }, + { + "auxiliary_loss_clip": 0.01133098, + "auxiliary_loss_mlp": 0.01034516, + "balance_loss_clip": 1.0523057, + "balance_loss_mlp": 1.02007997, + "epoch": 0.4015030813166992, + "flos": 22930220442240.0, + "grad_norm": 1.8518317265489062, + "language_loss": 0.83789873, + "learning_rate": 2.71866862166691e-06, + "loss": 0.85957485, + "num_input_tokens_seen": 143443130, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.14453125, + "step": 6678, + "time_per_iteration": 2.5044546127319336 + }, + { + "auxiliary_loss_clip": 0.01122441, + "auxiliary_loss_mlp": 0.01035557, + "balance_loss_clip": 1.04419041, + "balance_loss_mlp": 1.02165163, + "epoch": 0.4015632045693672, + "flos": 20595057361920.0, + "grad_norm": 2.0879803186051253, + "language_loss": 0.6359694, + "learning_rate": 2.718305158935434e-06, + "loss": 0.65754938, + "num_input_tokens_seen": 143461385, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.13916016, + "step": 6679, + "time_per_iteration": 2.448247194290161 + }, + { + "auxiliary_loss_clip": 0.0112625, + "auxiliary_loss_mlp": 0.01031697, + "balance_loss_clip": 1.04715967, + "balance_loss_mlp": 1.01860762, + "epoch": 0.4016233278220352, + "flos": 23438930808960.0, + "grad_norm": 1.425378119010419, + "language_loss": 0.78732514, + "learning_rate": 2.7179416689657554e-06, + "loss": 0.80890465, + "num_input_tokens_seen": 143481750, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.13085938, + "step": 6680, + "time_per_iteration": 2.503380298614502 + }, + { + "auxiliary_loss_clip": 0.01134674, + "auxiliary_loss_mlp": 0.01046031, + "balance_loss_clip": 1.04962564, + "balance_loss_mlp": 1.03051615, + "epoch": 0.40168345107470316, + "flos": 21431057477760.0, + "grad_norm": 1.6635219091285685, + "language_loss": 0.75744164, + "learning_rate": 2.7175781517716556e-06, + "loss": 0.77924865, + "num_input_tokens_seen": 143501540, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.15509033, + "step": 6681, + "time_per_iteration": 2.4584479331970215 + }, + { + "auxiliary_loss_clip": 0.01129644, + "auxiliary_loss_mlp": 0.0103559, + "balance_loss_clip": 1.04785144, + "balance_loss_mlp": 1.02148771, + "epoch": 0.4017435743273711, + "flos": 22857214049280.0, + "grad_norm": 1.9315029222910216, + "language_loss": 0.6401571, + "learning_rate": 2.7172146073669213e-06, + "loss": 0.66180944, + "num_input_tokens_seen": 143520530, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.14099121, + "step": 6682, + "time_per_iteration": 2.580488681793213 + }, + { + "auxiliary_loss_clip": 0.01130337, + "auxiliary_loss_mlp": 0.01035513, + "balance_loss_clip": 1.04740167, + "balance_loss_mlp": 1.02160764, + "epoch": 0.4018036975800391, + "flos": 28622312881920.0, + "grad_norm": 1.7163338060295792, + "language_loss": 0.73181129, + "learning_rate": 2.716851035765337e-06, + "loss": 0.75346982, + "num_input_tokens_seen": 143540210, + "router_z_loss_clip": 0.82910156, + "router_z_loss_mlp": 0.13916016, + "step": 6683, + "time_per_iteration": 2.532611608505249 + }, + { + "auxiliary_loss_clip": 0.01132843, + "auxiliary_loss_mlp": 0.01043922, + "balance_loss_clip": 1.05037522, + "balance_loss_mlp": 1.02908063, + "epoch": 0.40186382083270705, + "flos": 26651212099200.0, + "grad_norm": 1.6095045802198555, + "language_loss": 0.73100638, + "learning_rate": 2.7164874369806896e-06, + "loss": 0.75277406, + "num_input_tokens_seen": 143560940, + "router_z_loss_clip": 0.82421875, + "router_z_loss_mlp": 0.14831543, + "step": 6684, + "time_per_iteration": 2.5297443866729736 + }, + { + "auxiliary_loss_clip": 0.01061694, + "auxiliary_loss_mlp": 0.01003723, + "balance_loss_clip": 1.03345764, + "balance_loss_mlp": 1.00206196, + "epoch": 0.401923944085375, + "flos": 59259969123840.0, + "grad_norm": 0.8068257255747797, + "language_loss": 0.60415435, + "learning_rate": 2.716123811026767e-06, + "loss": 0.62480855, + "num_input_tokens_seen": 143624015, + "router_z_loss_clip": 0.28222656, + "router_z_loss_mlp": 0.01661682, + "step": 6685, + "time_per_iteration": 3.2073187828063965 + }, + { + "auxiliary_loss_clip": 0.0113776, + "auxiliary_loss_mlp": 0.01037206, + "balance_loss_clip": 1.05237412, + "balance_loss_mlp": 1.02287066, + "epoch": 0.401984067338043, + "flos": 16982803152000.0, + "grad_norm": 1.7094596226792356, + "language_loss": 0.70120001, + "learning_rate": 2.715760157917357e-06, + "loss": 0.72294962, + "num_input_tokens_seen": 143642750, + "router_z_loss_clip": 0.85449219, + "router_z_loss_mlp": 0.14318848, + "step": 6686, + "time_per_iteration": 2.4287049770355225 + }, + { + "auxiliary_loss_clip": 0.01141191, + "auxiliary_loss_mlp": 0.01033441, + "balance_loss_clip": 1.06151748, + "balance_loss_mlp": 1.02032244, + "epoch": 0.40204419059071095, + "flos": 24972496024320.0, + "grad_norm": 1.6142336067428356, + "language_loss": 0.74781173, + "learning_rate": 2.7153964776662504e-06, + "loss": 0.76955801, + "num_input_tokens_seen": 143664515, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.13122559, + "step": 6687, + "time_per_iteration": 2.504845380783081 + }, + { + "auxiliary_loss_clip": 0.01130443, + "auxiliary_loss_mlp": 0.01040894, + "balance_loss_clip": 1.05018878, + "balance_loss_mlp": 1.02503943, + "epoch": 0.4021043138433789, + "flos": 23477463123840.0, + "grad_norm": 1.7009067861546892, + "language_loss": 0.70545089, + "learning_rate": 2.7150327702872385e-06, + "loss": 0.72716427, + "num_input_tokens_seen": 143683135, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.1585083, + "step": 6688, + "time_per_iteration": 3.9048850536346436 + }, + { + "auxiliary_loss_clip": 0.01142326, + "auxiliary_loss_mlp": 0.01039996, + "balance_loss_clip": 1.05589926, + "balance_loss_mlp": 1.02412891, + "epoch": 0.4021644370960469, + "flos": 25995806588160.0, + "grad_norm": 1.6485825938772125, + "language_loss": 0.64461482, + "learning_rate": 2.7146690357941112e-06, + "loss": 0.66643804, + "num_input_tokens_seen": 143703985, + "router_z_loss_clip": 0.86523438, + "router_z_loss_mlp": 0.15869141, + "step": 6689, + "time_per_iteration": 2.5031819343566895 + }, + { + "auxiliary_loss_clip": 0.01131787, + "auxiliary_loss_mlp": 0.01040424, + "balance_loss_clip": 1.04922628, + "balance_loss_mlp": 1.02501607, + "epoch": 0.40222456034871484, + "flos": 13587987922560.0, + "grad_norm": 2.098918789803782, + "language_loss": 0.73286766, + "learning_rate": 2.7143052742006632e-06, + "loss": 0.7545898, + "num_input_tokens_seen": 143719245, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.15411377, + "step": 6690, + "time_per_iteration": 2.421495199203491 + }, + { + "auxiliary_loss_clip": 0.01132031, + "auxiliary_loss_mlp": 0.01041276, + "balance_loss_clip": 1.05031681, + "balance_loss_mlp": 1.02573109, + "epoch": 0.4022846836013828, + "flos": 24278019494400.0, + "grad_norm": 1.529983833627093, + "language_loss": 0.74875504, + "learning_rate": 2.7139414855206872e-06, + "loss": 0.77048814, + "num_input_tokens_seen": 143739575, + "router_z_loss_clip": 0.81738281, + "router_z_loss_mlp": 0.15539551, + "step": 6691, + "time_per_iteration": 2.469294309616089 + }, + { + "auxiliary_loss_clip": 0.01134329, + "auxiliary_loss_mlp": 0.01045845, + "balance_loss_clip": 1.05159879, + "balance_loss_mlp": 1.02979934, + "epoch": 0.40234480685405083, + "flos": 20151596050560.0, + "grad_norm": 1.9978171184139697, + "language_loss": 0.72536695, + "learning_rate": 2.7135776697679785e-06, + "loss": 0.74716866, + "num_input_tokens_seen": 143758515, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.16040039, + "step": 6692, + "time_per_iteration": 2.432380437850952 + }, + { + "auxiliary_loss_clip": 0.01134241, + "auxiliary_loss_mlp": 0.01035526, + "balance_loss_clip": 1.05205035, + "balance_loss_mlp": 1.02142906, + "epoch": 0.4024049301067188, + "flos": 22930220442240.0, + "grad_norm": 1.8917867121936702, + "language_loss": 0.84224224, + "learning_rate": 2.7132138269563333e-06, + "loss": 0.86393988, + "num_input_tokens_seen": 143776770, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.14099121, + "step": 6693, + "time_per_iteration": 2.468935012817383 + }, + { + "auxiliary_loss_clip": 0.01139931, + "auxiliary_loss_mlp": 0.01043689, + "balance_loss_clip": 1.05696058, + "balance_loss_mlp": 1.02913928, + "epoch": 0.40246505335938676, + "flos": 36028421487360.0, + "grad_norm": 1.6642329908313607, + "language_loss": 0.70543766, + "learning_rate": 2.7128499570995483e-06, + "loss": 0.72727388, + "num_input_tokens_seen": 143798450, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.14556885, + "step": 6694, + "time_per_iteration": 4.255146026611328 + }, + { + "auxiliary_loss_clip": 0.01133278, + "auxiliary_loss_mlp": 0.01041537, + "balance_loss_clip": 1.04998064, + "balance_loss_mlp": 1.02633762, + "epoch": 0.4025251766120547, + "flos": 20594303176320.0, + "grad_norm": 2.2563389009896047, + "language_loss": 0.67715335, + "learning_rate": 2.7124860602114212e-06, + "loss": 0.69890147, + "num_input_tokens_seen": 143816995, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.15185547, + "step": 6695, + "time_per_iteration": 2.581319808959961 + }, + { + "auxiliary_loss_clip": 0.01135287, + "auxiliary_loss_mlp": 0.0103425, + "balance_loss_clip": 1.05533004, + "balance_loss_mlp": 1.02080333, + "epoch": 0.4025852998647227, + "flos": 64523932381440.0, + "grad_norm": 2.329245958132158, + "language_loss": 0.79136288, + "learning_rate": 2.7121221363057515e-06, + "loss": 0.81305826, + "num_input_tokens_seen": 143842090, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.13439941, + "step": 6696, + "time_per_iteration": 2.8997907638549805 + }, + { + "auxiliary_loss_clip": 0.01138047, + "auxiliary_loss_mlp": 0.01040048, + "balance_loss_clip": 1.05413878, + "balance_loss_mlp": 1.02468157, + "epoch": 0.40264542311739066, + "flos": 20886292834560.0, + "grad_norm": 1.9528958988173766, + "language_loss": 0.7128951, + "learning_rate": 2.7117581853963393e-06, + "loss": 0.734676, + "num_input_tokens_seen": 143860800, + "router_z_loss_clip": 0.83740234, + "router_z_loss_mlp": 0.15368652, + "step": 6697, + "time_per_iteration": 2.474663734436035 + }, + { + "auxiliary_loss_clip": 0.01132165, + "auxiliary_loss_mlp": 0.01043739, + "balance_loss_clip": 1.05107045, + "balance_loss_mlp": 1.02965486, + "epoch": 0.4027055463700586, + "flos": 26250197685120.0, + "grad_norm": 2.0074292153532007, + "language_loss": 0.61424792, + "learning_rate": 2.711394207496984e-06, + "loss": 0.63600701, + "num_input_tokens_seen": 143878950, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.14074707, + "step": 6698, + "time_per_iteration": 2.521577835083008 + }, + { + "auxiliary_loss_clip": 0.01137495, + "auxiliary_loss_mlp": 0.01035325, + "balance_loss_clip": 1.05647957, + "balance_loss_mlp": 1.02107394, + "epoch": 0.4027656696227266, + "flos": 20631398947200.0, + "grad_norm": 1.8737184471002302, + "language_loss": 0.771577, + "learning_rate": 2.711030202621491e-06, + "loss": 0.79330522, + "num_input_tokens_seen": 143898385, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.14251709, + "step": 6699, + "time_per_iteration": 2.4638094902038574 + }, + { + "auxiliary_loss_clip": 0.01143015, + "auxiliary_loss_mlp": 0.01034782, + "balance_loss_clip": 1.06041682, + "balance_loss_mlp": 1.02084017, + "epoch": 0.40282579287539455, + "flos": 22346277039360.0, + "grad_norm": 1.6883777359797723, + "language_loss": 0.80761933, + "learning_rate": 2.7106661707836605e-06, + "loss": 0.82939732, + "num_input_tokens_seen": 143918795, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.13946533, + "step": 6700, + "time_per_iteration": 2.4778575897216797 + }, + { + "auxiliary_loss_clip": 0.01152745, + "auxiliary_loss_mlp": 0.01040195, + "balance_loss_clip": 1.06288588, + "balance_loss_mlp": 1.02424479, + "epoch": 0.4028859161280625, + "flos": 29274988959360.0, + "grad_norm": 1.765513373158888, + "language_loss": 0.74820161, + "learning_rate": 2.7103021119972977e-06, + "loss": 0.77013105, + "num_input_tokens_seen": 143938245, + "router_z_loss_clip": 0.89892578, + "router_z_loss_mlp": 0.15966797, + "step": 6701, + "time_per_iteration": 2.533508539199829 + }, + { + "auxiliary_loss_clip": 0.01143435, + "auxiliary_loss_mlp": 0.0103207, + "balance_loss_clip": 1.06138468, + "balance_loss_mlp": 1.01868892, + "epoch": 0.4029460393807305, + "flos": 28622312881920.0, + "grad_norm": 1.5639907890016693, + "language_loss": 0.66306084, + "learning_rate": 2.709938026276208e-06, + "loss": 0.68481588, + "num_input_tokens_seen": 143960995, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.1338501, + "step": 6702, + "time_per_iteration": 2.561354160308838 + }, + { + "auxiliary_loss_clip": 0.01136944, + "auxiliary_loss_mlp": 0.01038327, + "balance_loss_clip": 1.05494928, + "balance_loss_mlp": 1.023229, + "epoch": 0.40300616263339845, + "flos": 22601925112320.0, + "grad_norm": 1.6488373620857957, + "language_loss": 0.6605444, + "learning_rate": 2.7095739136341964e-06, + "loss": 0.68229711, + "num_input_tokens_seen": 143979910, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.15093994, + "step": 6703, + "time_per_iteration": 2.4661264419555664 + }, + { + "auxiliary_loss_clip": 0.01142389, + "auxiliary_loss_mlp": 0.01035051, + "balance_loss_clip": 1.05912268, + "balance_loss_mlp": 1.02034092, + "epoch": 0.4030662858860664, + "flos": 25520313323520.0, + "grad_norm": 1.7798637002506885, + "language_loss": 0.82009757, + "learning_rate": 2.709209774085071e-06, + "loss": 0.84187198, + "num_input_tokens_seen": 144000095, + "router_z_loss_clip": 0.83300781, + "router_z_loss_mlp": 0.1472168, + "step": 6704, + "time_per_iteration": 2.5335941314697266 + }, + { + "auxiliary_loss_clip": 0.01138816, + "auxiliary_loss_mlp": 0.01036252, + "balance_loss_clip": 1.05494666, + "balance_loss_mlp": 1.02169061, + "epoch": 0.40312640913873443, + "flos": 23586703361280.0, + "grad_norm": 1.9048959337151894, + "language_loss": 0.73515409, + "learning_rate": 2.7088456076426407e-06, + "loss": 0.75690484, + "num_input_tokens_seen": 144019695, + "router_z_loss_clip": 0.83935547, + "router_z_loss_mlp": 0.14562988, + "step": 6705, + "time_per_iteration": 2.527488946914673 + }, + { + "auxiliary_loss_clip": 0.01139859, + "auxiliary_loss_mlp": 0.01037596, + "balance_loss_clip": 1.05922055, + "balance_loss_mlp": 1.02336204, + "epoch": 0.4031865323914024, + "flos": 20011042131840.0, + "grad_norm": 1.7946802239002035, + "language_loss": 0.66147757, + "learning_rate": 2.708481414320713e-06, + "loss": 0.6832521, + "num_input_tokens_seen": 144038525, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.14215088, + "step": 6706, + "time_per_iteration": 2.582460403442383 + }, + { + "auxiliary_loss_clip": 0.0113188, + "auxiliary_loss_mlp": 0.01038781, + "balance_loss_clip": 1.0490284, + "balance_loss_mlp": 1.0245595, + "epoch": 0.40324665564407036, + "flos": 21871430219520.0, + "grad_norm": 1.4063387059071237, + "language_loss": 0.71543109, + "learning_rate": 2.7081171941330992e-06, + "loss": 0.73713779, + "num_input_tokens_seen": 144059485, + "router_z_loss_clip": 0.82861328, + "router_z_loss_mlp": 0.14227295, + "step": 6707, + "time_per_iteration": 2.6801559925079346 + }, + { + "auxiliary_loss_clip": 0.0112842, + "auxiliary_loss_mlp": 0.01032579, + "balance_loss_clip": 1.05292094, + "balance_loss_mlp": 1.01796389, + "epoch": 0.4033067788967383, + "flos": 23878728933120.0, + "grad_norm": 1.6811456056661698, + "language_loss": 0.79881251, + "learning_rate": 2.707752947093611e-06, + "loss": 0.82042253, + "num_input_tokens_seen": 144080265, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.14624023, + "step": 6708, + "time_per_iteration": 2.4878487586975098 + }, + { + "auxiliary_loss_clip": 0.01139649, + "auxiliary_loss_mlp": 0.0104176, + "balance_loss_clip": 1.05374992, + "balance_loss_mlp": 1.02670431, + "epoch": 0.4033669021494063, + "flos": 17419907756160.0, + "grad_norm": 2.158602051923886, + "language_loss": 0.82882094, + "learning_rate": 2.70738867321606e-06, + "loss": 0.85063505, + "num_input_tokens_seen": 144098040, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.15039062, + "step": 6709, + "time_per_iteration": 2.4080100059509277 + }, + { + "auxiliary_loss_clip": 0.01142525, + "auxiliary_loss_mlp": 0.01038376, + "balance_loss_clip": 1.05911088, + "balance_loss_mlp": 1.02364206, + "epoch": 0.40342702540207426, + "flos": 29600554855680.0, + "grad_norm": 1.4418654830859563, + "language_loss": 0.71394074, + "learning_rate": 2.70702437251426e-06, + "loss": 0.73574972, + "num_input_tokens_seen": 144118265, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.1472168, + "step": 6710, + "time_per_iteration": 2.52354097366333 + }, + { + "auxiliary_loss_clip": 0.0113389, + "auxiliary_loss_mlp": 0.01037994, + "balance_loss_clip": 1.05057502, + "balance_loss_mlp": 1.02343881, + "epoch": 0.4034871486547422, + "flos": 11284605400320.0, + "grad_norm": 2.100436671431982, + "language_loss": 0.85042566, + "learning_rate": 2.7066600450020236e-06, + "loss": 0.87214452, + "num_input_tokens_seen": 144133865, + "router_z_loss_clip": 0.83300781, + "router_z_loss_mlp": 0.14575195, + "step": 6711, + "time_per_iteration": 3.896784782409668 + }, + { + "auxiliary_loss_clip": 0.01142353, + "auxiliary_loss_mlp": 0.01039603, + "balance_loss_clip": 1.05652404, + "balance_loss_mlp": 1.02464223, + "epoch": 0.4035472719074102, + "flos": 15552839738880.0, + "grad_norm": 2.307934127418406, + "language_loss": 0.768022, + "learning_rate": 2.706295690693168e-06, + "loss": 0.78984153, + "num_input_tokens_seen": 144150125, + "router_z_loss_clip": 0.85888672, + "router_z_loss_mlp": 0.14953613, + "step": 6712, + "time_per_iteration": 2.4133059978485107 + }, + { + "auxiliary_loss_clip": 0.01140823, + "auxiliary_loss_mlp": 0.01037717, + "balance_loss_clip": 1.05804634, + "balance_loss_mlp": 1.0238111, + "epoch": 0.40360739516007815, + "flos": 24674365140480.0, + "grad_norm": 2.0386829887097404, + "language_loss": 0.79010713, + "learning_rate": 2.7059313096015096e-06, + "loss": 0.81189251, + "num_input_tokens_seen": 144169295, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.13891602, + "step": 6713, + "time_per_iteration": 2.483628511428833 + }, + { + "auxiliary_loss_clip": 0.01140437, + "auxiliary_loss_mlp": 0.0103449, + "balance_loss_clip": 1.05711818, + "balance_loss_mlp": 1.02060246, + "epoch": 0.4036675184127461, + "flos": 17304095329920.0, + "grad_norm": 1.8432269694140244, + "language_loss": 0.88645571, + "learning_rate": 2.705566901740865e-06, + "loss": 0.90820503, + "num_input_tokens_seen": 144185790, + "router_z_loss_clip": 0.83349609, + "router_z_loss_mlp": 0.13879395, + "step": 6714, + "time_per_iteration": 2.428406238555908 + }, + { + "auxiliary_loss_clip": 0.01134973, + "auxiliary_loss_mlp": 0.01039086, + "balance_loss_clip": 1.05470395, + "balance_loss_mlp": 1.02465606, + "epoch": 0.4037276416654141, + "flos": 19864023765120.0, + "grad_norm": 1.7334488960560759, + "language_loss": 0.69300413, + "learning_rate": 2.7052024671250527e-06, + "loss": 0.71474469, + "num_input_tokens_seen": 144205190, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.14416504, + "step": 6715, + "time_per_iteration": 2.462315082550049 + }, + { + "auxiliary_loss_clip": 0.01141283, + "auxiliary_loss_mlp": 0.01036143, + "balance_loss_clip": 1.05684936, + "balance_loss_mlp": 1.0216527, + "epoch": 0.40378776491808205, + "flos": 18296271780480.0, + "grad_norm": 2.0246670212277116, + "language_loss": 0.77482653, + "learning_rate": 2.704838005767892e-06, + "loss": 0.79660082, + "num_input_tokens_seen": 144222705, + "router_z_loss_clip": 0.84375, + "router_z_loss_mlp": 0.14489746, + "step": 6716, + "time_per_iteration": 2.4445016384124756 + }, + { + "auxiliary_loss_clip": 0.01128557, + "auxiliary_loss_mlp": 0.01036144, + "balance_loss_clip": 1.05068302, + "balance_loss_mlp": 1.02241135, + "epoch": 0.40384788817075, + "flos": 15049372757760.0, + "grad_norm": 1.8005817199213006, + "language_loss": 0.76759124, + "learning_rate": 2.7044735176832037e-06, + "loss": 0.78923821, + "num_input_tokens_seen": 144239545, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.13726807, + "step": 6717, + "time_per_iteration": 2.466076374053955 + }, + { + "auxiliary_loss_clip": 0.01068067, + "auxiliary_loss_mlp": 0.01007262, + "balance_loss_clip": 1.03949606, + "balance_loss_mlp": 1.00553083, + "epoch": 0.40390801142341803, + "flos": 61929927895680.0, + "grad_norm": 0.9322340485938042, + "language_loss": 0.60735828, + "learning_rate": 2.7041090028848084e-06, + "loss": 0.6281116, + "num_input_tokens_seen": 144288145, + "router_z_loss_clip": 0.28564453, + "router_z_loss_mlp": 0.01733398, + "step": 6718, + "time_per_iteration": 2.9763245582580566 + }, + { + "auxiliary_loss_clip": 0.01136628, + "auxiliary_loss_mlp": 0.01040244, + "balance_loss_clip": 1.05215394, + "balance_loss_mlp": 1.02480626, + "epoch": 0.403968134676086, + "flos": 22738779930240.0, + "grad_norm": 1.8996042585148987, + "language_loss": 0.75091022, + "learning_rate": 2.7037444613865306e-06, + "loss": 0.77267897, + "num_input_tokens_seen": 144302315, + "router_z_loss_clip": 0.84521484, + "router_z_loss_mlp": 0.15441895, + "step": 6719, + "time_per_iteration": 4.126542329788208 + }, + { + "auxiliary_loss_clip": 0.0114246, + "auxiliary_loss_mlp": 0.01043617, + "balance_loss_clip": 1.06007099, + "balance_loss_mlp": 1.02800035, + "epoch": 0.40402825792875396, + "flos": 19784409269760.0, + "grad_norm": 2.2533179063123, + "language_loss": 0.81464887, + "learning_rate": 2.7033798932021906e-06, + "loss": 0.8365097, + "num_input_tokens_seen": 144318990, + "router_z_loss_clip": 0.82470703, + "router_z_loss_mlp": 0.15625, + "step": 6720, + "time_per_iteration": 2.473414897918701 + }, + { + "auxiliary_loss_clip": 0.01134709, + "auxiliary_loss_mlp": 0.01030662, + "balance_loss_clip": 1.05235839, + "balance_loss_mlp": 1.01603472, + "epoch": 0.40408838118142193, + "flos": 19609273532160.0, + "grad_norm": 2.0600430771188805, + "language_loss": 0.76759815, + "learning_rate": 2.7030152983456153e-06, + "loss": 0.78925192, + "num_input_tokens_seen": 144335765, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.14648438, + "step": 6721, + "time_per_iteration": 2.5145697593688965 + }, + { + "auxiliary_loss_clip": 0.01127724, + "auxiliary_loss_mlp": 0.01028789, + "balance_loss_clip": 1.0509212, + "balance_loss_mlp": 1.01646328, + "epoch": 0.4041485044340899, + "flos": 24426043441920.0, + "grad_norm": 1.862933747478116, + "language_loss": 0.72747397, + "learning_rate": 2.7026506768306304e-06, + "loss": 0.74903911, + "num_input_tokens_seen": 144355825, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.12341309, + "step": 6722, + "time_per_iteration": 2.531092405319214 + }, + { + "auxiliary_loss_clip": 0.0113442, + "auxiliary_loss_mlp": 0.01036468, + "balance_loss_clip": 1.05350757, + "balance_loss_mlp": 1.02290201, + "epoch": 0.40420862768675786, + "flos": 16760192613120.0, + "grad_norm": 1.82237293544754, + "language_loss": 0.65385622, + "learning_rate": 2.7022860286710602e-06, + "loss": 0.67556512, + "num_input_tokens_seen": 144374320, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.13549805, + "step": 6723, + "time_per_iteration": 2.5442821979522705 + }, + { + "auxiliary_loss_clip": 0.01156888, + "auxiliary_loss_mlp": 0.01047633, + "balance_loss_clip": 1.06852317, + "balance_loss_mlp": 1.03244591, + "epoch": 0.4042687509394258, + "flos": 22491571553280.0, + "grad_norm": 1.4497665553797134, + "language_loss": 0.73598754, + "learning_rate": 2.701921353880734e-06, + "loss": 0.7580328, + "num_input_tokens_seen": 144394325, + "router_z_loss_clip": 0.88330078, + "router_z_loss_mlp": 0.15197754, + "step": 6724, + "time_per_iteration": 2.542149543762207 + }, + { + "auxiliary_loss_clip": 0.01133993, + "auxiliary_loss_mlp": 0.01037952, + "balance_loss_clip": 1.05673468, + "balance_loss_mlp": 1.02459502, + "epoch": 0.4043288741920938, + "flos": 30336149479680.0, + "grad_norm": 1.7553210817409206, + "language_loss": 0.75072885, + "learning_rate": 2.7015566524734787e-06, + "loss": 0.7724483, + "num_input_tokens_seen": 144412765, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.13378906, + "step": 6725, + "time_per_iteration": 2.531127452850342 + }, + { + "auxiliary_loss_clip": 0.01133924, + "auxiliary_loss_mlp": 0.01032227, + "balance_loss_clip": 1.05481195, + "balance_loss_mlp": 1.01750493, + "epoch": 0.40438899744476176, + "flos": 46348321363200.0, + "grad_norm": 1.6048856967963665, + "language_loss": 0.77009237, + "learning_rate": 2.701191924463126e-06, + "loss": 0.79175389, + "num_input_tokens_seen": 144435400, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.14733887, + "step": 6726, + "time_per_iteration": 2.687284231185913 + }, + { + "auxiliary_loss_clip": 0.01136074, + "auxiliary_loss_mlp": 0.01035351, + "balance_loss_clip": 1.05569696, + "balance_loss_mlp": 1.02093852, + "epoch": 0.4044491206974297, + "flos": 13333524998400.0, + "grad_norm": 2.0277668083610396, + "language_loss": 0.8215344, + "learning_rate": 2.7008271698635054e-06, + "loss": 0.84324861, + "num_input_tokens_seen": 144452925, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.14422607, + "step": 6727, + "time_per_iteration": 2.4601645469665527 + }, + { + "auxiliary_loss_clip": 0.011334, + "auxiliary_loss_mlp": 0.01039321, + "balance_loss_clip": 1.05282569, + "balance_loss_mlp": 1.02393687, + "epoch": 0.4045092439500977, + "flos": 12093745121280.0, + "grad_norm": 2.0288912414637608, + "language_loss": 0.85641742, + "learning_rate": 2.700462388688447e-06, + "loss": 0.87814468, + "num_input_tokens_seen": 144470195, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.15386963, + "step": 6728, + "time_per_iteration": 2.4965338706970215 + }, + { + "auxiliary_loss_clip": 0.01135094, + "auxiliary_loss_mlp": 0.01039618, + "balance_loss_clip": 1.05333471, + "balance_loss_mlp": 1.02576017, + "epoch": 0.40456936720276565, + "flos": 21179683123200.0, + "grad_norm": 2.6471131339486695, + "language_loss": 0.82156181, + "learning_rate": 2.700097580951786e-06, + "loss": 0.84330893, + "num_input_tokens_seen": 144490320, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.13848877, + "step": 6729, + "time_per_iteration": 2.46260666847229 + }, + { + "auxiliary_loss_clip": 0.01138045, + "auxiliary_loss_mlp": 0.01041833, + "balance_loss_clip": 1.05728889, + "balance_loss_mlp": 1.02779007, + "epoch": 0.4046294904554336, + "flos": 23915286000000.0, + "grad_norm": 2.176949305618199, + "language_loss": 0.7351675, + "learning_rate": 2.6997327466673533e-06, + "loss": 0.75696635, + "num_input_tokens_seen": 144508990, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.14044189, + "step": 6730, + "time_per_iteration": 2.5459046363830566 + }, + { + "auxiliary_loss_clip": 0.01123087, + "auxiliary_loss_mlp": 0.0103753, + "balance_loss_clip": 1.04458797, + "balance_loss_mlp": 1.02346373, + "epoch": 0.4046896137081016, + "flos": 38071235773440.0, + "grad_norm": 2.0181738859024456, + "language_loss": 0.67972207, + "learning_rate": 2.699367885848985e-06, + "loss": 0.70132822, + "num_input_tokens_seen": 144529550, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.14068604, + "step": 6731, + "time_per_iteration": 2.6447391510009766 + }, + { + "auxiliary_loss_clip": 0.01127119, + "auxiliary_loss_mlp": 0.01044437, + "balance_loss_clip": 1.04671764, + "balance_loss_mlp": 1.02899981, + "epoch": 0.4047497369607696, + "flos": 23617262856960.0, + "grad_norm": 1.5019147210383408, + "language_loss": 0.73660731, + "learning_rate": 2.699002998510517e-06, + "loss": 0.75832283, + "num_input_tokens_seen": 144549310, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.15429688, + "step": 6732, + "time_per_iteration": 3.9661455154418945 + }, + { + "auxiliary_loss_clip": 0.01131511, + "auxiliary_loss_mlp": 0.01031401, + "balance_loss_clip": 1.05111027, + "balance_loss_mlp": 1.01874733, + "epoch": 0.40480986021343757, + "flos": 12823593569280.0, + "grad_norm": 1.635212371445189, + "language_loss": 0.77195096, + "learning_rate": 2.6986380846657852e-06, + "loss": 0.79358, + "num_input_tokens_seen": 144567430, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.12652588, + "step": 6733, + "time_per_iteration": 2.4621758460998535 + }, + { + "auxiliary_loss_clip": 0.01138191, + "auxiliary_loss_mlp": 0.0103995, + "balance_loss_clip": 1.05441606, + "balance_loss_mlp": 1.02473283, + "epoch": 0.40486998346610553, + "flos": 23768770423680.0, + "grad_norm": 2.2819721027949003, + "language_loss": 0.76778734, + "learning_rate": 2.698273144328627e-06, + "loss": 0.78956866, + "num_input_tokens_seen": 144585975, + "router_z_loss_clip": 0.83789062, + "router_z_loss_mlp": 0.15216064, + "step": 6734, + "time_per_iteration": 2.545896053314209 + }, + { + "auxiliary_loss_clip": 0.01133517, + "auxiliary_loss_mlp": 0.01037264, + "balance_loss_clip": 1.04896283, + "balance_loss_mlp": 1.02339959, + "epoch": 0.4049301067187735, + "flos": 22856818999680.0, + "grad_norm": 2.447952193610307, + "language_loss": 0.6431036, + "learning_rate": 2.6979081775128805e-06, + "loss": 0.66481143, + "num_input_tokens_seen": 144605225, + "router_z_loss_clip": 0.84521484, + "router_z_loss_mlp": 0.1385498, + "step": 6735, + "time_per_iteration": 2.467139482498169 + }, + { + "auxiliary_loss_clip": 0.01128083, + "auxiliary_loss_mlp": 0.01038559, + "balance_loss_clip": 1.0479207, + "balance_loss_mlp": 1.02555907, + "epoch": 0.40499022997144146, + "flos": 22783992174720.0, + "grad_norm": 1.7242707558213115, + "language_loss": 0.83342528, + "learning_rate": 2.697543184232387e-06, + "loss": 0.85509169, + "num_input_tokens_seen": 144624145, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.13012695, + "step": 6736, + "time_per_iteration": 2.4824676513671875 + }, + { + "auxiliary_loss_clip": 0.01143646, + "auxiliary_loss_mlp": 0.01037931, + "balance_loss_clip": 1.05998802, + "balance_loss_mlp": 1.02351284, + "epoch": 0.4050503532241094, + "flos": 23039352938880.0, + "grad_norm": 1.8752537527497577, + "language_loss": 0.75257862, + "learning_rate": 2.6971781645009863e-06, + "loss": 0.77439439, + "num_input_tokens_seen": 144644470, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.14428711, + "step": 6737, + "time_per_iteration": 2.4639017581939697 + }, + { + "auxiliary_loss_clip": 0.01128969, + "auxiliary_loss_mlp": 0.01041039, + "balance_loss_clip": 1.05030453, + "balance_loss_mlp": 1.02733588, + "epoch": 0.4051104764767774, + "flos": 16647756065280.0, + "grad_norm": 2.2213211353157463, + "language_loss": 0.72096127, + "learning_rate": 2.696813118332519e-06, + "loss": 0.74266136, + "num_input_tokens_seen": 144661055, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.13708496, + "step": 6738, + "time_per_iteration": 3.9113171100616455 + }, + { + "auxiliary_loss_clip": 0.01135241, + "auxiliary_loss_mlp": 0.01034246, + "balance_loss_clip": 1.05533886, + "balance_loss_mlp": 1.02147269, + "epoch": 0.40517059972944536, + "flos": 16358962717440.0, + "grad_norm": 2.0420220388592107, + "language_loss": 0.74794304, + "learning_rate": 2.696448045740828e-06, + "loss": 0.76963794, + "num_input_tokens_seen": 144677935, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.12780762, + "step": 6739, + "time_per_iteration": 2.409928321838379 + }, + { + "auxiliary_loss_clip": 0.01125599, + "auxiliary_loss_mlp": 0.01035909, + "balance_loss_clip": 1.04490566, + "balance_loss_mlp": 1.02195585, + "epoch": 0.4052307229821133, + "flos": 28803374363520.0, + "grad_norm": 1.7988218297073886, + "language_loss": 0.74324787, + "learning_rate": 2.6960829467397576e-06, + "loss": 0.76486295, + "num_input_tokens_seen": 144697725, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.1394043, + "step": 6740, + "time_per_iteration": 2.549846887588501 + }, + { + "auxiliary_loss_clip": 0.01135391, + "auxiliary_loss_mlp": 0.01032996, + "balance_loss_clip": 1.05682302, + "balance_loss_mlp": 1.01940656, + "epoch": 0.4052908462347813, + "flos": 21397876289280.0, + "grad_norm": 2.8205190964067306, + "language_loss": 0.77207959, + "learning_rate": 2.695717821343153e-06, + "loss": 0.79376346, + "num_input_tokens_seen": 144718805, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.13586426, + "step": 6741, + "time_per_iteration": 2.534264326095581 + }, + { + "auxiliary_loss_clip": 0.01126669, + "auxiliary_loss_mlp": 0.0104092, + "balance_loss_clip": 1.04498219, + "balance_loss_mlp": 1.02386093, + "epoch": 0.40535096948744925, + "flos": 22419067950720.0, + "grad_norm": 1.7759757142390267, + "language_loss": 0.71107185, + "learning_rate": 2.6953526695648577e-06, + "loss": 0.73274773, + "num_input_tokens_seen": 144737105, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.17053223, + "step": 6742, + "time_per_iteration": 2.467989683151245 + }, + { + "auxiliary_loss_clip": 0.01136583, + "auxiliary_loss_mlp": 0.01030871, + "balance_loss_clip": 1.05064356, + "balance_loss_mlp": 1.01559448, + "epoch": 0.4054110927401172, + "flos": 17010776868480.0, + "grad_norm": 2.2003965069048133, + "language_loss": 0.72088242, + "learning_rate": 2.6949874914187202e-06, + "loss": 0.74255693, + "num_input_tokens_seen": 144751350, + "router_z_loss_clip": 0.85839844, + "router_z_loss_mlp": 0.152771, + "step": 6743, + "time_per_iteration": 2.4380249977111816 + }, + { + "auxiliary_loss_clip": 0.01140614, + "auxiliary_loss_mlp": 0.01039837, + "balance_loss_clip": 1.05466056, + "balance_loss_mlp": 1.02451873, + "epoch": 0.4054712159927852, + "flos": 21614848392960.0, + "grad_norm": 2.415737013287919, + "language_loss": 0.71116185, + "learning_rate": 2.694622286918588e-06, + "loss": 0.73296636, + "num_input_tokens_seen": 144770030, + "router_z_loss_clip": 0.85888672, + "router_z_loss_mlp": 0.15319824, + "step": 6744, + "time_per_iteration": 2.5021402835845947 + }, + { + "auxiliary_loss_clip": 0.01136548, + "auxiliary_loss_mlp": 0.01037898, + "balance_loss_clip": 1.05618083, + "balance_loss_mlp": 1.02429605, + "epoch": 0.4055313392454532, + "flos": 25812554376960.0, + "grad_norm": 1.5882385874515654, + "language_loss": 0.79833072, + "learning_rate": 2.6942570560783076e-06, + "loss": 0.82007515, + "num_input_tokens_seen": 144790965, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.13604736, + "step": 6745, + "time_per_iteration": 2.5073812007904053 + }, + { + "auxiliary_loss_clip": 0.01129473, + "auxiliary_loss_mlp": 0.01033127, + "balance_loss_clip": 1.0497458, + "balance_loss_mlp": 1.01799905, + "epoch": 0.40559146249812117, + "flos": 14137098111360.0, + "grad_norm": 2.308956433586893, + "language_loss": 0.66732067, + "learning_rate": 2.693891798911731e-06, + "loss": 0.68894672, + "num_input_tokens_seen": 144807755, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.15118408, + "step": 6746, + "time_per_iteration": 2.636620044708252 + }, + { + "auxiliary_loss_clip": 0.01126585, + "auxiliary_loss_mlp": 0.01032192, + "balance_loss_clip": 1.04720318, + "balance_loss_mlp": 1.01899529, + "epoch": 0.40565158575078913, + "flos": 41355481962240.0, + "grad_norm": 1.6355431866509118, + "language_loss": 0.57231307, + "learning_rate": 2.6935265154327075e-06, + "loss": 0.59390086, + "num_input_tokens_seen": 144832405, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.13189697, + "step": 6747, + "time_per_iteration": 2.6495423316955566 + }, + { + "auxiliary_loss_clip": 0.01131535, + "auxiliary_loss_mlp": 0.0104132, + "balance_loss_clip": 1.04909396, + "balance_loss_mlp": 1.0276525, + "epoch": 0.4057117090034571, + "flos": 28544529980160.0, + "grad_norm": 1.8502629819558274, + "language_loss": 0.84690034, + "learning_rate": 2.693161205655089e-06, + "loss": 0.86862886, + "num_input_tokens_seen": 144853890, + "router_z_loss_clip": 0.82421875, + "router_z_loss_mlp": 0.13677979, + "step": 6748, + "time_per_iteration": 2.5426905155181885 + }, + { + "auxiliary_loss_clip": 0.01139682, + "auxiliary_loss_mlp": 0.01036312, + "balance_loss_clip": 1.05488884, + "balance_loss_mlp": 1.02160144, + "epoch": 0.40577183225612506, + "flos": 18004066640640.0, + "grad_norm": 1.8054339893565565, + "language_loss": 0.81497812, + "learning_rate": 2.6927958695927287e-06, + "loss": 0.83673805, + "num_input_tokens_seen": 144871395, + "router_z_loss_clip": 0.84814453, + "router_z_loss_mlp": 0.14709473, + "step": 6749, + "time_per_iteration": 2.487605333328247 + }, + { + "auxiliary_loss_clip": 0.01127405, + "auxiliary_loss_mlp": 0.01040707, + "balance_loss_clip": 1.04670894, + "balance_loss_mlp": 1.02656257, + "epoch": 0.40583195550879303, + "flos": 19536734016000.0, + "grad_norm": 1.7188843842501147, + "language_loss": 0.75656182, + "learning_rate": 2.6924305072594784e-06, + "loss": 0.77824295, + "num_input_tokens_seen": 144890975, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.14129639, + "step": 6750, + "time_per_iteration": 2.5528528690338135 + }, + { + "auxiliary_loss_clip": 0.0113514, + "auxiliary_loss_mlp": 0.0103849, + "balance_loss_clip": 1.04913473, + "balance_loss_mlp": 1.02263498, + "epoch": 0.405892078761461, + "flos": 22309468577280.0, + "grad_norm": 2.1792752892640945, + "language_loss": 0.73250449, + "learning_rate": 2.692065118669195e-06, + "loss": 0.75424081, + "num_input_tokens_seen": 144908170, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.15856934, + "step": 6751, + "time_per_iteration": 2.492774486541748 + }, + { + "auxiliary_loss_clip": 0.01132516, + "auxiliary_loss_mlp": 0.01034382, + "balance_loss_clip": 1.04951215, + "balance_loss_mlp": 1.01903987, + "epoch": 0.40595220201412896, + "flos": 25484402701440.0, + "grad_norm": 1.8894857425504807, + "language_loss": 0.66921008, + "learning_rate": 2.6916997038357326e-06, + "loss": 0.69087899, + "num_input_tokens_seen": 144928020, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.15356445, + "step": 6752, + "time_per_iteration": 2.524472236633301 + }, + { + "auxiliary_loss_clip": 0.011574, + "auxiliary_loss_mlp": 0.01045173, + "balance_loss_clip": 1.06756592, + "balance_loss_mlp": 1.02912772, + "epoch": 0.4060123252667969, + "flos": 49856004103680.0, + "grad_norm": 1.751546796321638, + "language_loss": 0.7079283, + "learning_rate": 2.691334262772948e-06, + "loss": 0.72995412, + "num_input_tokens_seen": 144951240, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.16052246, + "step": 6753, + "time_per_iteration": 2.7902703285217285 + }, + { + "auxiliary_loss_clip": 0.01139157, + "auxiliary_loss_mlp": 0.01037314, + "balance_loss_clip": 1.05399299, + "balance_loss_mlp": 1.02219796, + "epoch": 0.4060724485194649, + "flos": 21135476459520.0, + "grad_norm": 2.071313513332389, + "language_loss": 0.71880758, + "learning_rate": 2.690968795494699e-06, + "loss": 0.74057233, + "num_input_tokens_seen": 144969100, + "router_z_loss_clip": 0.85058594, + "router_z_loss_mlp": 0.15124512, + "step": 6754, + "time_per_iteration": 3.885324716567993 + }, + { + "auxiliary_loss_clip": 0.01140979, + "auxiliary_loss_mlp": 0.01043169, + "balance_loss_clip": 1.05707264, + "balance_loss_mlp": 1.02778554, + "epoch": 0.40613257177213286, + "flos": 21758059918080.0, + "grad_norm": 1.7035816554478749, + "language_loss": 0.83025312, + "learning_rate": 2.690603302014844e-06, + "loss": 0.85209459, + "num_input_tokens_seen": 144987065, + "router_z_loss_clip": 0.83886719, + "router_z_loss_mlp": 0.15393066, + "step": 6755, + "time_per_iteration": 2.482590436935425 + }, + { + "auxiliary_loss_clip": 0.011411, + "auxiliary_loss_mlp": 0.01041078, + "balance_loss_clip": 1.05370891, + "balance_loss_mlp": 1.02508068, + "epoch": 0.4061926950248008, + "flos": 25555074710400.0, + "grad_norm": 1.6456978597946226, + "language_loss": 0.70845735, + "learning_rate": 2.6902377823472426e-06, + "loss": 0.73027915, + "num_input_tokens_seen": 145007310, + "router_z_loss_clip": 0.87451172, + "router_z_loss_mlp": 0.15991211, + "step": 6756, + "time_per_iteration": 2.641768455505371 + }, + { + "auxiliary_loss_clip": 0.01144056, + "auxiliary_loss_mlp": 0.01045565, + "balance_loss_clip": 1.05541265, + "balance_loss_mlp": 1.03009176, + "epoch": 0.4062528182774688, + "flos": 23695799944320.0, + "grad_norm": 1.7289776477169825, + "language_loss": 0.78921044, + "learning_rate": 2.689872236505755e-06, + "loss": 0.81110668, + "num_input_tokens_seen": 145026210, + "router_z_loss_clip": 0.88525391, + "router_z_loss_mlp": 0.15472412, + "step": 6757, + "time_per_iteration": 2.480048894882202 + }, + { + "auxiliary_loss_clip": 0.01132427, + "auxiliary_loss_mlp": 0.01031121, + "balance_loss_clip": 1.05028069, + "balance_loss_mlp": 1.01652408, + "epoch": 0.4063129415301368, + "flos": 21726027964800.0, + "grad_norm": 2.8176206277009053, + "language_loss": 0.78864086, + "learning_rate": 2.6895066645042437e-06, + "loss": 0.81027633, + "num_input_tokens_seen": 145045475, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.14593506, + "step": 6758, + "time_per_iteration": 2.4608967304229736 + }, + { + "auxiliary_loss_clip": 0.01131959, + "auxiliary_loss_mlp": 0.01037505, + "balance_loss_clip": 1.05085516, + "balance_loss_mlp": 1.02250302, + "epoch": 0.40637306478280477, + "flos": 12787575206400.0, + "grad_norm": 2.001270765056049, + "language_loss": 0.88639635, + "learning_rate": 2.6891410663565703e-06, + "loss": 0.90809107, + "num_input_tokens_seen": 145062260, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.14996338, + "step": 6759, + "time_per_iteration": 2.59152889251709 + }, + { + "auxiliary_loss_clip": 0.01132752, + "auxiliary_loss_mlp": 0.01037432, + "balance_loss_clip": 1.04883623, + "balance_loss_mlp": 1.02222049, + "epoch": 0.40643318803547274, + "flos": 24024490323840.0, + "grad_norm": 2.032569228314983, + "language_loss": 0.64256048, + "learning_rate": 2.688775442076598e-06, + "loss": 0.66426229, + "num_input_tokens_seen": 145082470, + "router_z_loss_clip": 0.83886719, + "router_z_loss_mlp": 0.15216064, + "step": 6760, + "time_per_iteration": 2.4930150508880615 + }, + { + "auxiliary_loss_clip": 0.01144584, + "auxiliary_loss_mlp": 0.01036867, + "balance_loss_clip": 1.05985606, + "balance_loss_mlp": 1.02107823, + "epoch": 0.4064933112881407, + "flos": 25592421876480.0, + "grad_norm": 1.8759960407750642, + "language_loss": 0.75396776, + "learning_rate": 2.688409791678193e-06, + "loss": 0.77578229, + "num_input_tokens_seen": 145105685, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.15802002, + "step": 6761, + "time_per_iteration": 2.5295724868774414 + }, + { + "auxiliary_loss_clip": 0.01124211, + "auxiliary_loss_mlp": 0.01035244, + "balance_loss_clip": 1.04732144, + "balance_loss_mlp": 1.02092123, + "epoch": 0.40655343454080867, + "flos": 22054323294720.0, + "grad_norm": 1.4094262676051834, + "language_loss": 0.69855928, + "learning_rate": 2.6880441151752185e-06, + "loss": 0.72015381, + "num_input_tokens_seen": 145125590, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.14312744, + "step": 6762, + "time_per_iteration": 2.517197847366333 + }, + { + "auxiliary_loss_clip": 0.01139355, + "auxiliary_loss_mlp": 0.01038092, + "balance_loss_clip": 1.05605412, + "balance_loss_mlp": 1.02453828, + "epoch": 0.40661355779347663, + "flos": 26468893641600.0, + "grad_norm": 1.3434817083227637, + "language_loss": 0.72959, + "learning_rate": 2.6876784125815433e-06, + "loss": 0.75136447, + "num_input_tokens_seen": 145146810, + "router_z_loss_clip": 0.83349609, + "router_z_loss_mlp": 0.13549805, + "step": 6763, + "time_per_iteration": 3.9648685455322266 + }, + { + "auxiliary_loss_clip": 0.01143402, + "auxiliary_loss_mlp": 0.01034269, + "balance_loss_clip": 1.0570612, + "balance_loss_mlp": 1.01870012, + "epoch": 0.4066736810461446, + "flos": 13261129136640.0, + "grad_norm": 1.5902359400457646, + "language_loss": 0.69105452, + "learning_rate": 2.687312683911033e-06, + "loss": 0.7128312, + "num_input_tokens_seen": 145163130, + "router_z_loss_clip": 0.86376953, + "router_z_loss_mlp": 0.15576172, + "step": 6764, + "time_per_iteration": 2.5649454593658447 + }, + { + "auxiliary_loss_clip": 0.01150327, + "auxiliary_loss_mlp": 0.01040042, + "balance_loss_clip": 1.06445348, + "balance_loss_mlp": 1.02375793, + "epoch": 0.40673380429881256, + "flos": 28803625758720.0, + "grad_norm": 2.120192484545306, + "language_loss": 0.91437626, + "learning_rate": 2.686946929177557e-06, + "loss": 0.93627989, + "num_input_tokens_seen": 145181420, + "router_z_loss_clip": 0.85888672, + "router_z_loss_mlp": 0.16271973, + "step": 6765, + "time_per_iteration": 2.531184434890747 + }, + { + "auxiliary_loss_clip": 0.01140366, + "auxiliary_loss_mlp": 0.01042169, + "balance_loss_clip": 1.05403423, + "balance_loss_mlp": 1.02685022, + "epoch": 0.4067939275514805, + "flos": 12495334152960.0, + "grad_norm": 2.522185357995992, + "language_loss": 0.79064441, + "learning_rate": 2.6865811483949855e-06, + "loss": 0.81246972, + "num_input_tokens_seen": 145198545, + "router_z_loss_clip": 0.86279297, + "router_z_loss_mlp": 0.15332031, + "step": 6766, + "time_per_iteration": 2.483959197998047 + }, + { + "auxiliary_loss_clip": 0.01132848, + "auxiliary_loss_mlp": 0.01039506, + "balance_loss_clip": 1.04716301, + "balance_loss_mlp": 1.02393687, + "epoch": 0.4068540508041485, + "flos": 18770508069120.0, + "grad_norm": 1.768901590129862, + "language_loss": 0.76754117, + "learning_rate": 2.6862153415771867e-06, + "loss": 0.78926468, + "num_input_tokens_seen": 145215835, + "router_z_loss_clip": 0.85644531, + "router_z_loss_mlp": 0.15563965, + "step": 6767, + "time_per_iteration": 2.444301128387451 + }, + { + "auxiliary_loss_clip": 0.0114223, + "auxiliary_loss_mlp": 0.01039132, + "balance_loss_clip": 1.05497849, + "balance_loss_mlp": 1.02434397, + "epoch": 0.40691417405681646, + "flos": 28512821249280.0, + "grad_norm": 1.8103354939003233, + "language_loss": 0.77160525, + "learning_rate": 2.685849508738034e-06, + "loss": 0.79341888, + "num_input_tokens_seen": 145236555, + "router_z_loss_clip": 0.87304688, + "router_z_loss_mlp": 0.14782715, + "step": 6768, + "time_per_iteration": 2.5458545684814453 + }, + { + "auxiliary_loss_clip": 0.01139956, + "auxiliary_loss_mlp": 0.01035237, + "balance_loss_clip": 1.05522799, + "balance_loss_mlp": 1.02158809, + "epoch": 0.4069742973094844, + "flos": 20814040627200.0, + "grad_norm": 2.3095631092331748, + "language_loss": 0.87071502, + "learning_rate": 2.6854836498913995e-06, + "loss": 0.89246702, + "num_input_tokens_seen": 145254595, + "router_z_loss_clip": 0.84667969, + "router_z_loss_mlp": 0.13659668, + "step": 6769, + "time_per_iteration": 2.4591920375823975 + }, + { + "auxiliary_loss_clip": 0.01136179, + "auxiliary_loss_mlp": 0.0103567, + "balance_loss_clip": 1.05559742, + "balance_loss_mlp": 1.02262306, + "epoch": 0.4070344205621524, + "flos": 21470272151040.0, + "grad_norm": 2.3423939646252396, + "language_loss": 0.809304, + "learning_rate": 2.685117765051156e-06, + "loss": 0.8310225, + "num_input_tokens_seen": 145274005, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.13031006, + "step": 6770, + "time_per_iteration": 2.474491834640503 + }, + { + "auxiliary_loss_clip": 0.01138617, + "auxiliary_loss_mlp": 0.01031752, + "balance_loss_clip": 1.05204463, + "balance_loss_mlp": 1.0167799, + "epoch": 0.4070945438148204, + "flos": 26830046937600.0, + "grad_norm": 2.1660828990106755, + "language_loss": 0.80402237, + "learning_rate": 2.6847518542311783e-06, + "loss": 0.82572609, + "num_input_tokens_seen": 145294850, + "router_z_loss_clip": 0.86523438, + "router_z_loss_mlp": 0.14984131, + "step": 6771, + "time_per_iteration": 2.589385747909546 + }, + { + "auxiliary_loss_clip": 0.01127225, + "auxiliary_loss_mlp": 0.01036997, + "balance_loss_clip": 1.04650259, + "balance_loss_mlp": 1.02254939, + "epoch": 0.4071546670674884, + "flos": 26354158623360.0, + "grad_norm": 1.421105517788011, + "language_loss": 0.75804651, + "learning_rate": 2.6843859174453417e-06, + "loss": 0.77968872, + "num_input_tokens_seen": 145317050, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.14447021, + "step": 6772, + "time_per_iteration": 2.7151737213134766 + }, + { + "auxiliary_loss_clip": 0.01134726, + "auxiliary_loss_mlp": 0.0104034, + "balance_loss_clip": 1.04971242, + "balance_loss_mlp": 1.02516437, + "epoch": 0.40721479032015634, + "flos": 17895401020800.0, + "grad_norm": 1.637049352896389, + "language_loss": 0.81518936, + "learning_rate": 2.6840199547075218e-06, + "loss": 0.83694005, + "num_input_tokens_seen": 145334480, + "router_z_loss_clip": 0.84960938, + "router_z_loss_mlp": 0.15185547, + "step": 6773, + "time_per_iteration": 2.4048657417297363 + }, + { + "auxiliary_loss_clip": 0.01090267, + "auxiliary_loss_mlp": 0.01016149, + "balance_loss_clip": 1.06207347, + "balance_loss_mlp": 1.01318371, + "epoch": 0.4072749135728243, + "flos": 49854570537600.0, + "grad_norm": 0.8204518806916204, + "language_loss": 0.64372492, + "learning_rate": 2.683653966031597e-06, + "loss": 0.66478908, + "num_input_tokens_seen": 145388695, + "router_z_loss_clip": 0.28173828, + "router_z_loss_mlp": 0.02966309, + "step": 6774, + "time_per_iteration": 3.0396995544433594 + }, + { + "auxiliary_loss_clip": 0.01138407, + "auxiliary_loss_mlp": 0.01033236, + "balance_loss_clip": 1.05466056, + "balance_loss_mlp": 1.0187521, + "epoch": 0.40733503682549227, + "flos": 27563630400000.0, + "grad_norm": 1.7803118155917168, + "language_loss": 0.72383875, + "learning_rate": 2.683287951431446e-06, + "loss": 0.74555522, + "num_input_tokens_seen": 145408240, + "router_z_loss_clip": 0.83837891, + "router_z_loss_mlp": 0.14471436, + "step": 6775, + "time_per_iteration": 2.512660264968872 + }, + { + "auxiliary_loss_clip": 0.0114058, + "auxiliary_loss_mlp": 0.01046526, + "balance_loss_clip": 1.05369854, + "balance_loss_mlp": 1.03135657, + "epoch": 0.40739516007816023, + "flos": 22126970551680.0, + "grad_norm": 1.4662362580572628, + "language_loss": 0.78122789, + "learning_rate": 2.6829219109209474e-06, + "loss": 0.80309892, + "num_input_tokens_seen": 145428395, + "router_z_loss_clip": 0.87011719, + "router_z_loss_mlp": 0.15167236, + "step": 6776, + "time_per_iteration": 3.940000534057617 + }, + { + "auxiliary_loss_clip": 0.01143856, + "auxiliary_loss_mlp": 0.01038412, + "balance_loss_clip": 1.05664277, + "balance_loss_mlp": 1.02367187, + "epoch": 0.4074552833308282, + "flos": 23842243693440.0, + "grad_norm": 3.6679181188202583, + "language_loss": 0.79535365, + "learning_rate": 2.682555844513981e-06, + "loss": 0.81717628, + "num_input_tokens_seen": 145448290, + "router_z_loss_clip": 0.87158203, + "router_z_loss_mlp": 0.1473999, + "step": 6777, + "time_per_iteration": 2.554314374923706 + }, + { + "auxiliary_loss_clip": 0.01059697, + "auxiliary_loss_mlp": 0.01008077, + "balance_loss_clip": 1.03103721, + "balance_loss_mlp": 1.00644135, + "epoch": 0.40751540658349616, + "flos": 58000008781440.0, + "grad_norm": 0.6927428702520887, + "language_loss": 0.53206092, + "learning_rate": 2.6821897522244286e-06, + "loss": 0.55273867, + "num_input_tokens_seen": 145509785, + "router_z_loss_clip": 0.28613281, + "router_z_loss_mlp": 0.01635742, + "step": 6778, + "time_per_iteration": 3.146587371826172 + }, + { + "auxiliary_loss_clip": 0.01143977, + "auxiliary_loss_mlp": 0.01042967, + "balance_loss_clip": 1.06091881, + "balance_loss_mlp": 1.02823293, + "epoch": 0.40757552983616413, + "flos": 21214659991680.0, + "grad_norm": 2.0878558027781238, + "language_loss": 0.82540917, + "learning_rate": 2.6818236340661718e-06, + "loss": 0.84727859, + "num_input_tokens_seen": 145528620, + "router_z_loss_clip": 0.83105469, + "router_z_loss_mlp": 0.14733887, + "step": 6779, + "time_per_iteration": 2.492180347442627 + }, + { + "auxiliary_loss_clip": 0.01137657, + "auxiliary_loss_mlp": 0.01037345, + "balance_loss_clip": 1.05608463, + "balance_loss_mlp": 1.02290237, + "epoch": 0.4076356530888321, + "flos": 26833530556800.0, + "grad_norm": 1.6567712617421237, + "language_loss": 0.76026529, + "learning_rate": 2.6814574900530957e-06, + "loss": 0.78201532, + "num_input_tokens_seen": 145547775, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.14453125, + "step": 6780, + "time_per_iteration": 2.515826463699341 + }, + { + "auxiliary_loss_clip": 0.01139152, + "auxiliary_loss_mlp": 0.01035104, + "balance_loss_clip": 1.06015015, + "balance_loss_mlp": 1.02199125, + "epoch": 0.40769577634150006, + "flos": 12203021272320.0, + "grad_norm": 1.958117085427879, + "language_loss": 0.65996325, + "learning_rate": 2.6810913201990827e-06, + "loss": 0.68170583, + "num_input_tokens_seen": 145564465, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.13116455, + "step": 6781, + "time_per_iteration": 3.9666292667388916 + }, + { + "auxiliary_loss_clip": 0.01129539, + "auxiliary_loss_mlp": 0.01040217, + "balance_loss_clip": 1.04779959, + "balance_loss_mlp": 1.02530968, + "epoch": 0.407755899594168, + "flos": 33655264796160.0, + "grad_norm": 1.7472600653690142, + "language_loss": 0.71270525, + "learning_rate": 2.6807251245180183e-06, + "loss": 0.73440278, + "num_input_tokens_seen": 145585965, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.14910889, + "step": 6782, + "time_per_iteration": 2.658339262008667 + }, + { + "auxiliary_loss_clip": 0.01137352, + "auxiliary_loss_mlp": 0.01033845, + "balance_loss_clip": 1.05305135, + "balance_loss_mlp": 1.01972508, + "epoch": 0.407816022846836, + "flos": 20157342226560.0, + "grad_norm": 2.37366712000986, + "language_loss": 0.81931126, + "learning_rate": 2.6803589030237897e-06, + "loss": 0.84102321, + "num_input_tokens_seen": 145605000, + "router_z_loss_clip": 0.84228516, + "router_z_loss_mlp": 0.14129639, + "step": 6783, + "time_per_iteration": 2.504481077194214 + }, + { + "auxiliary_loss_clip": 0.01126863, + "auxiliary_loss_mlp": 0.01037673, + "balance_loss_clip": 1.04519272, + "balance_loss_mlp": 1.02281916, + "epoch": 0.40787614609950396, + "flos": 21178821196800.0, + "grad_norm": 1.5401138777676282, + "language_loss": 0.80613399, + "learning_rate": 2.679992655730283e-06, + "loss": 0.82777929, + "num_input_tokens_seen": 145623740, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.14868164, + "step": 6784, + "time_per_iteration": 2.7184481620788574 + }, + { + "auxiliary_loss_clip": 0.01129256, + "auxiliary_loss_mlp": 0.01036261, + "balance_loss_clip": 1.04503739, + "balance_loss_mlp": 1.02108598, + "epoch": 0.407936269352172, + "flos": 20520650338560.0, + "grad_norm": 2.1652668128363057, + "language_loss": 0.66055703, + "learning_rate": 2.679626382651386e-06, + "loss": 0.68221211, + "num_input_tokens_seen": 145643515, + "router_z_loss_clip": 0.84277344, + "router_z_loss_mlp": 0.15161133, + "step": 6785, + "time_per_iteration": 2.5191874504089355 + }, + { + "auxiliary_loss_clip": 0.01130104, + "auxiliary_loss_mlp": 0.01032186, + "balance_loss_clip": 1.05046666, + "balance_loss_mlp": 1.01869106, + "epoch": 0.40799639260483994, + "flos": 20118809911680.0, + "grad_norm": 2.027699548365896, + "language_loss": 0.80102706, + "learning_rate": 2.679260083800989e-06, + "loss": 0.82264996, + "num_input_tokens_seen": 145660890, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.13494873, + "step": 6786, + "time_per_iteration": 2.491703748703003 + }, + { + "auxiliary_loss_clip": 0.01131714, + "auxiliary_loss_mlp": 0.01038759, + "balance_loss_clip": 1.04963613, + "balance_loss_mlp": 1.02524638, + "epoch": 0.4080565158575079, + "flos": 20997328752000.0, + "grad_norm": 1.612143497976644, + "language_loss": 0.81825036, + "learning_rate": 2.678893759192982e-06, + "loss": 0.83995509, + "num_input_tokens_seen": 145680070, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.13513184, + "step": 6787, + "time_per_iteration": 2.4892609119415283 + }, + { + "auxiliary_loss_clip": 0.01134758, + "auxiliary_loss_mlp": 0.01036387, + "balance_loss_clip": 1.05596209, + "balance_loss_mlp": 1.02260065, + "epoch": 0.40811663911017587, + "flos": 19317714837120.0, + "grad_norm": 1.7105004300586717, + "language_loss": 0.68223745, + "learning_rate": 2.678527408841255e-06, + "loss": 0.70394892, + "num_input_tokens_seen": 145698010, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.13800049, + "step": 6788, + "time_per_iteration": 2.4709324836730957 + }, + { + "auxiliary_loss_clip": 0.01134332, + "auxiliary_loss_mlp": 0.0104622, + "balance_loss_clip": 1.05286527, + "balance_loss_mlp": 1.03037095, + "epoch": 0.40817676236284384, + "flos": 40625382119040.0, + "grad_norm": 1.7762420084238562, + "language_loss": 0.65899849, + "learning_rate": 2.678161032759701e-06, + "loss": 0.68080395, + "num_input_tokens_seen": 145722215, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.1585083, + "step": 6789, + "time_per_iteration": 2.6083226203918457 + }, + { + "auxiliary_loss_clip": 0.01139117, + "auxiliary_loss_mlp": 0.01040193, + "balance_loss_clip": 1.05695486, + "balance_loss_mlp": 1.0248394, + "epoch": 0.4082368856155118, + "flos": 20522086882560.0, + "grad_norm": 1.761639242421355, + "language_loss": 0.60825831, + "learning_rate": 2.6777946309622123e-06, + "loss": 0.63005137, + "num_input_tokens_seen": 145741090, + "router_z_loss_clip": 0.82275391, + "router_z_loss_mlp": 0.15344238, + "step": 6790, + "time_per_iteration": 2.480802297592163 + }, + { + "auxiliary_loss_clip": 0.01145172, + "auxiliary_loss_mlp": 0.01037649, + "balance_loss_clip": 1.06443572, + "balance_loss_mlp": 1.02350509, + "epoch": 0.40829700886817977, + "flos": 11427745098240.0, + "grad_norm": 4.1688861481881165, + "language_loss": 0.68811285, + "learning_rate": 2.677428203462683e-06, + "loss": 0.70994109, + "num_input_tokens_seen": 145754985, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.14135742, + "step": 6791, + "time_per_iteration": 2.4274113178253174 + }, + { + "auxiliary_loss_clip": 0.01070958, + "auxiliary_loss_mlp": 0.01036181, + "balance_loss_clip": 1.04315269, + "balance_loss_mlp": 1.03439939, + "epoch": 0.40835713212084773, + "flos": 67330677121920.0, + "grad_norm": 1.233125964047885, + "language_loss": 0.59645545, + "learning_rate": 2.6770617502750093e-06, + "loss": 0.61752689, + "num_input_tokens_seen": 145815260, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.01782227, + "step": 6792, + "time_per_iteration": 3.077457904815674 + }, + { + "auxiliary_loss_clip": 0.01138692, + "auxiliary_loss_mlp": 0.01047816, + "balance_loss_clip": 1.05388951, + "balance_loss_mlp": 1.03299844, + "epoch": 0.4084172553735157, + "flos": 21762010414080.0, + "grad_norm": 1.6782121947326305, + "language_loss": 0.80405122, + "learning_rate": 2.6766952714130857e-06, + "loss": 0.82591629, + "num_input_tokens_seen": 145832665, + "router_z_loss_clip": 0.84912109, + "router_z_loss_mlp": 0.14819336, + "step": 6793, + "time_per_iteration": 2.5355236530303955 + }, + { + "auxiliary_loss_clip": 0.01138094, + "auxiliary_loss_mlp": 0.01036197, + "balance_loss_clip": 1.05658126, + "balance_loss_mlp": 1.02171326, + "epoch": 0.40847737862618366, + "flos": 27417258478080.0, + "grad_norm": 1.8456585269578079, + "language_loss": 0.84913874, + "learning_rate": 2.6763287668908094e-06, + "loss": 0.87088156, + "num_input_tokens_seen": 145850240, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.1449585, + "step": 6794, + "time_per_iteration": 2.586660861968994 + }, + { + "auxiliary_loss_clip": 0.01140314, + "auxiliary_loss_mlp": 0.0104094, + "balance_loss_clip": 1.05574048, + "balance_loss_mlp": 1.02646852, + "epoch": 0.4085375018788516, + "flos": 18587255857920.0, + "grad_norm": 1.596820477751116, + "language_loss": 0.80144119, + "learning_rate": 2.6759622367220788e-06, + "loss": 0.82325369, + "num_input_tokens_seen": 145869545, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.14477539, + "step": 6795, + "time_per_iteration": 2.470515012741089 + }, + { + "auxiliary_loss_clip": 0.01146, + "auxiliary_loss_mlp": 0.01037106, + "balance_loss_clip": 1.05912137, + "balance_loss_mlp": 1.02201414, + "epoch": 0.4085976251315196, + "flos": 15411783029760.0, + "grad_norm": 2.2760778521827647, + "language_loss": 0.69521415, + "learning_rate": 2.675595680920792e-06, + "loss": 0.71704519, + "num_input_tokens_seen": 145884025, + "router_z_loss_clip": 0.86816406, + "router_z_loss_mlp": 0.15087891, + "step": 6796, + "time_per_iteration": 2.44722318649292 + }, + { + "auxiliary_loss_clip": 0.01137114, + "auxiliary_loss_mlp": 0.01039394, + "balance_loss_clip": 1.05573559, + "balance_loss_mlp": 1.02555943, + "epoch": 0.40865774838418756, + "flos": 21252222639360.0, + "grad_norm": 1.753151942256858, + "language_loss": 0.78163362, + "learning_rate": 2.6752290995008498e-06, + "loss": 0.80339867, + "num_input_tokens_seen": 145903210, + "router_z_loss_clip": 0.81347656, + "router_z_loss_mlp": 0.13848877, + "step": 6797, + "time_per_iteration": 2.4410929679870605 + }, + { + "auxiliary_loss_clip": 0.01130028, + "auxiliary_loss_mlp": 0.01039717, + "balance_loss_clip": 1.04808629, + "balance_loss_mlp": 1.02578151, + "epoch": 0.4087178716368556, + "flos": 13772245714560.0, + "grad_norm": 1.840044436151952, + "language_loss": 0.85644257, + "learning_rate": 2.6748624924761523e-06, + "loss": 0.87814003, + "num_input_tokens_seen": 145920985, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.13946533, + "step": 6798, + "time_per_iteration": 4.009955883026123 + }, + { + "auxiliary_loss_clip": 0.01127367, + "auxiliary_loss_mlp": 0.01032387, + "balance_loss_clip": 1.0490973, + "balance_loss_mlp": 1.02009034, + "epoch": 0.40877799488952354, + "flos": 23621752056960.0, + "grad_norm": 1.5903655785850883, + "language_loss": 0.84525049, + "learning_rate": 2.674495859860601e-06, + "loss": 0.86684805, + "num_input_tokens_seen": 145940350, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.12298584, + "step": 6799, + "time_per_iteration": 2.5036280155181885 + }, + { + "auxiliary_loss_clip": 0.01135614, + "auxiliary_loss_mlp": 0.01039778, + "balance_loss_clip": 1.05338216, + "balance_loss_mlp": 1.02471018, + "epoch": 0.4088381181421915, + "flos": 20918791664640.0, + "grad_norm": 2.196742256761587, + "language_loss": 0.83496296, + "learning_rate": 2.6741292016681e-06, + "loss": 0.85671687, + "num_input_tokens_seen": 145957460, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.15075684, + "step": 6800, + "time_per_iteration": 2.4228672981262207 + }, + { + "auxiliary_loss_clip": 0.01130197, + "auxiliary_loss_mlp": 0.01032839, + "balance_loss_clip": 1.04793692, + "balance_loss_mlp": 1.01862884, + "epoch": 0.4088982413948595, + "flos": 13297578462720.0, + "grad_norm": 2.0064884521560264, + "language_loss": 0.7425155, + "learning_rate": 2.6737625179125514e-06, + "loss": 0.76414585, + "num_input_tokens_seen": 145975285, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.14221191, + "step": 6801, + "time_per_iteration": 2.436467409133911 + }, + { + "auxiliary_loss_clip": 0.0113795, + "auxiliary_loss_mlp": 0.01036434, + "balance_loss_clip": 1.05486476, + "balance_loss_mlp": 1.02144349, + "epoch": 0.40895836464752744, + "flos": 15267673664640.0, + "grad_norm": 2.043482404223279, + "language_loss": 0.80066299, + "learning_rate": 2.673395808607861e-06, + "loss": 0.82240677, + "num_input_tokens_seen": 145989150, + "router_z_loss_clip": 0.83056641, + "router_z_loss_mlp": 0.14996338, + "step": 6802, + "time_per_iteration": 2.4260947704315186 + }, + { + "auxiliary_loss_clip": 0.01136038, + "auxiliary_loss_mlp": 0.01043642, + "balance_loss_clip": 1.05012894, + "balance_loss_mlp": 1.02751243, + "epoch": 0.4090184879001954, + "flos": 14501411804160.0, + "grad_norm": 3.3094464778994, + "language_loss": 0.757496, + "learning_rate": 2.673029073767934e-06, + "loss": 0.7792927, + "num_input_tokens_seen": 146006980, + "router_z_loss_clip": 0.85839844, + "router_z_loss_mlp": 0.16119385, + "step": 6803, + "time_per_iteration": 2.426455497741699 + }, + { + "auxiliary_loss_clip": 0.0112932, + "auxiliary_loss_mlp": 0.01037981, + "balance_loss_clip": 1.04710126, + "balance_loss_mlp": 1.0239861, + "epoch": 0.40907861115286337, + "flos": 13881593692800.0, + "grad_norm": 2.025428528010497, + "language_loss": 0.78820419, + "learning_rate": 2.6726623134066764e-06, + "loss": 0.80987716, + "num_input_tokens_seen": 146025125, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.13989258, + "step": 6804, + "time_per_iteration": 2.48183536529541 + }, + { + "auxiliary_loss_clip": 0.01139593, + "auxiliary_loss_mlp": 0.01043174, + "balance_loss_clip": 1.0515666, + "balance_loss_mlp": 1.02873206, + "epoch": 0.40913873440553133, + "flos": 28037615293440.0, + "grad_norm": 1.781962872061086, + "language_loss": 0.75448537, + "learning_rate": 2.672295527537998e-06, + "loss": 0.77631307, + "num_input_tokens_seen": 146044990, + "router_z_loss_clip": 0.88037109, + "router_z_loss_mlp": 0.14428711, + "step": 6805, + "time_per_iteration": 4.099343538284302 + }, + { + "auxiliary_loss_clip": 0.01129969, + "auxiliary_loss_mlp": 0.01039602, + "balance_loss_clip": 1.04878676, + "balance_loss_mlp": 1.02616763, + "epoch": 0.4091988576581993, + "flos": 21618188357760.0, + "grad_norm": 1.7612302909572286, + "language_loss": 0.7956596, + "learning_rate": 2.671928716175804e-06, + "loss": 0.81735528, + "num_input_tokens_seen": 146066045, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.13427734, + "step": 6806, + "time_per_iteration": 2.4991180896759033 + }, + { + "auxiliary_loss_clip": 0.01132529, + "auxiliary_loss_mlp": 0.01030429, + "balance_loss_clip": 1.0509069, + "balance_loss_mlp": 1.01618385, + "epoch": 0.40925898091086726, + "flos": 25224085860480.0, + "grad_norm": 1.9439183105141091, + "language_loss": 0.71729279, + "learning_rate": 2.671561879334007e-06, + "loss": 0.73892236, + "num_input_tokens_seen": 146086280, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.14233398, + "step": 6807, + "time_per_iteration": 2.517763376235962 + }, + { + "auxiliary_loss_clip": 0.01052698, + "auxiliary_loss_mlp": 0.01014962, + "balance_loss_clip": 1.02443051, + "balance_loss_mlp": 1.01287854, + "epoch": 0.40931910416353523, + "flos": 68930568800640.0, + "grad_norm": 0.808610545940359, + "language_loss": 0.58742535, + "learning_rate": 2.6711950170265155e-06, + "loss": 0.60810196, + "num_input_tokens_seen": 146148840, + "router_z_loss_clip": 0.28271484, + "router_z_loss_mlp": 0.02084351, + "step": 6808, + "time_per_iteration": 3.220245361328125 + }, + { + "auxiliary_loss_clip": 0.01138136, + "auxiliary_loss_mlp": 0.01036986, + "balance_loss_clip": 1.05690813, + "balance_loss_mlp": 1.02370572, + "epoch": 0.4093792274162032, + "flos": 20189553747840.0, + "grad_norm": 2.327451930000548, + "language_loss": 0.54832751, + "learning_rate": 2.670828129267242e-06, + "loss": 0.57007873, + "num_input_tokens_seen": 146166195, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.1328125, + "step": 6809, + "time_per_iteration": 2.599424362182617 + }, + { + "auxiliary_loss_clip": 0.01138444, + "auxiliary_loss_mlp": 0.01026269, + "balance_loss_clip": 1.05730867, + "balance_loss_mlp": 1.0131557, + "epoch": 0.40943935066887116, + "flos": 25228754628480.0, + "grad_norm": 1.6864042791633598, + "language_loss": 0.83187199, + "learning_rate": 2.6704612160700983e-06, + "loss": 0.85351908, + "num_input_tokens_seen": 146185045, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.13104248, + "step": 6810, + "time_per_iteration": 2.68379807472229 + }, + { + "auxiliary_loss_clip": 0.01149808, + "auxiliary_loss_mlp": 0.01040517, + "balance_loss_clip": 1.06428027, + "balance_loss_mlp": 1.02473402, + "epoch": 0.4094994739215392, + "flos": 23255319461760.0, + "grad_norm": 2.580478200269362, + "language_loss": 0.77349597, + "learning_rate": 2.670094277448999e-06, + "loss": 0.79539919, + "num_input_tokens_seen": 146204655, + "router_z_loss_clip": 0.85693359, + "router_z_loss_mlp": 0.15795898, + "step": 6811, + "time_per_iteration": 2.493551015853882 + }, + { + "auxiliary_loss_clip": 0.0114184, + "auxiliary_loss_mlp": 0.01031945, + "balance_loss_clip": 1.0593853, + "balance_loss_mlp": 1.01659107, + "epoch": 0.40955959717420715, + "flos": 17382165540480.0, + "grad_norm": 1.8133130595335694, + "language_loss": 0.70359284, + "learning_rate": 2.669727313417857e-06, + "loss": 0.72533065, + "num_input_tokens_seen": 146222000, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.15356445, + "step": 6812, + "time_per_iteration": 2.4465863704681396 + }, + { + "auxiliary_loss_clip": 0.01129556, + "auxiliary_loss_mlp": 0.01036003, + "balance_loss_clip": 1.04801393, + "balance_loss_mlp": 1.02130449, + "epoch": 0.4096197204268751, + "flos": 25082418620160.0, + "grad_norm": 1.54328493272091, + "language_loss": 0.662624, + "learning_rate": 2.6693603239905872e-06, + "loss": 0.68427956, + "num_input_tokens_seen": 146242630, + "router_z_loss_clip": 0.81494141, + "router_z_loss_mlp": 0.14703369, + "step": 6813, + "time_per_iteration": 2.5045158863067627 + }, + { + "auxiliary_loss_clip": 0.01129221, + "auxiliary_loss_mlp": 0.01034018, + "balance_loss_clip": 1.04810345, + "balance_loss_mlp": 1.01923573, + "epoch": 0.4096798436795431, + "flos": 30586769648640.0, + "grad_norm": 1.7991551640876615, + "language_loss": 0.74390221, + "learning_rate": 2.6689933091811087e-06, + "loss": 0.76553452, + "num_input_tokens_seen": 146263070, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.14758301, + "step": 6814, + "time_per_iteration": 2.520200490951538 + }, + { + "auxiliary_loss_clip": 0.01142425, + "auxiliary_loss_mlp": 0.01033377, + "balance_loss_clip": 1.05856693, + "balance_loss_mlp": 1.0190239, + "epoch": 0.40973996693221104, + "flos": 24133622820480.0, + "grad_norm": 1.8208399858682731, + "language_loss": 0.66580153, + "learning_rate": 2.6686262690033357e-06, + "loss": 0.68755949, + "num_input_tokens_seen": 146282890, + "router_z_loss_clip": 0.83837891, + "router_z_loss_mlp": 0.14349365, + "step": 6815, + "time_per_iteration": 2.4986090660095215 + }, + { + "auxiliary_loss_clip": 0.01127821, + "auxiliary_loss_mlp": 0.01036881, + "balance_loss_clip": 1.05105293, + "balance_loss_mlp": 1.02342188, + "epoch": 0.409800090184879, + "flos": 23988974751360.0, + "grad_norm": 1.7366829585006214, + "language_loss": 0.76733965, + "learning_rate": 2.668259203471188e-06, + "loss": 0.78898668, + "num_input_tokens_seen": 146301755, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.13470459, + "step": 6816, + "time_per_iteration": 2.522209882736206 + }, + { + "auxiliary_loss_clip": 0.0112584, + "auxiliary_loss_mlp": 0.01036039, + "balance_loss_clip": 1.04634535, + "balance_loss_mlp": 1.02187705, + "epoch": 0.40986021343754697, + "flos": 16143678552960.0, + "grad_norm": 2.1474747542829564, + "language_loss": 0.81722176, + "learning_rate": 2.6678921125985843e-06, + "loss": 0.83884054, + "num_input_tokens_seen": 146316835, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.1416626, + "step": 6817, + "time_per_iteration": 2.4945621490478516 + }, + { + "auxiliary_loss_clip": 0.01146147, + "auxiliary_loss_mlp": 0.01038818, + "balance_loss_clip": 1.06124687, + "balance_loss_mlp": 1.02279651, + "epoch": 0.40992033669021494, + "flos": 24790824011520.0, + "grad_norm": 1.7631641002486445, + "language_loss": 0.80286849, + "learning_rate": 2.667524996399444e-06, + "loss": 0.82471812, + "num_input_tokens_seen": 146336650, + "router_z_loss_clip": 0.84960938, + "router_z_loss_mlp": 0.16015625, + "step": 6818, + "time_per_iteration": 2.4875845909118652 + }, + { + "auxiliary_loss_clip": 0.01130335, + "auxiliary_loss_mlp": 0.01036626, + "balance_loss_clip": 1.05091429, + "balance_loss_mlp": 1.02320349, + "epoch": 0.4099804599428829, + "flos": 29641888431360.0, + "grad_norm": 2.067371943134562, + "language_loss": 0.66484332, + "learning_rate": 2.66715785488769e-06, + "loss": 0.68651295, + "num_input_tokens_seen": 146357640, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.13421631, + "step": 6819, + "time_per_iteration": 3.9076387882232666 + }, + { + "auxiliary_loss_clip": 0.01142085, + "auxiliary_loss_mlp": 0.01045132, + "balance_loss_clip": 1.05249, + "balance_loss_mlp": 1.02977777, + "epoch": 0.41004058319555087, + "flos": 24826590979200.0, + "grad_norm": 1.5960638419883033, + "language_loss": 0.85200298, + "learning_rate": 2.6667906880772428e-06, + "loss": 0.8738752, + "num_input_tokens_seen": 146379325, + "router_z_loss_clip": 0.89501953, + "router_z_loss_mlp": 0.15356445, + "step": 6820, + "time_per_iteration": 2.4683635234832764 + }, + { + "auxiliary_loss_clip": 0.01129955, + "auxiliary_loss_mlp": 0.01041117, + "balance_loss_clip": 1.04912627, + "balance_loss_mlp": 1.02704978, + "epoch": 0.41010070644821883, + "flos": 25737464995200.0, + "grad_norm": 1.621417818605906, + "language_loss": 0.70994371, + "learning_rate": 2.6664234959820256e-06, + "loss": 0.73165441, + "num_input_tokens_seen": 146398635, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.14068604, + "step": 6821, + "time_per_iteration": 2.4965336322784424 + }, + { + "auxiliary_loss_clip": 0.01133053, + "auxiliary_loss_mlp": 0.01033551, + "balance_loss_clip": 1.05165887, + "balance_loss_mlp": 1.02013445, + "epoch": 0.4101608297008868, + "flos": 22346061557760.0, + "grad_norm": 1.7881017513765496, + "language_loss": 0.74730432, + "learning_rate": 2.6660562786159634e-06, + "loss": 0.76897037, + "num_input_tokens_seen": 146417585, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.13427734, + "step": 6822, + "time_per_iteration": 2.4690723419189453 + }, + { + "auxiliary_loss_clip": 0.01143215, + "auxiliary_loss_mlp": 0.01037966, + "balance_loss_clip": 1.06033802, + "balance_loss_mlp": 1.02431703, + "epoch": 0.41022095295355476, + "flos": 21945083057280.0, + "grad_norm": 2.3528737186704634, + "language_loss": 0.7582252, + "learning_rate": 2.6656890359929796e-06, + "loss": 0.78003705, + "num_input_tokens_seen": 146437035, + "router_z_loss_clip": 0.82861328, + "router_z_loss_mlp": 0.13647461, + "step": 6823, + "time_per_iteration": 2.452310562133789 + }, + { + "auxiliary_loss_clip": 0.01135954, + "auxiliary_loss_mlp": 0.01040191, + "balance_loss_clip": 1.0502919, + "balance_loss_mlp": 1.02396655, + "epoch": 0.4102810762062228, + "flos": 27450511493760.0, + "grad_norm": 1.890313497276543, + "language_loss": 0.73293716, + "learning_rate": 2.665321768127001e-06, + "loss": 0.75469863, + "num_input_tokens_seen": 146457370, + "router_z_loss_clip": 0.85644531, + "router_z_loss_mlp": 0.16223145, + "step": 6824, + "time_per_iteration": 3.9659550189971924 + }, + { + "auxiliary_loss_clip": 0.01136788, + "auxiliary_loss_mlp": 0.01035652, + "balance_loss_clip": 1.05195212, + "balance_loss_mlp": 1.0205127, + "epoch": 0.41034119945889075, + "flos": 24499265316480.0, + "grad_norm": 6.555838383420737, + "language_loss": 0.71721762, + "learning_rate": 2.6649544750319548e-06, + "loss": 0.73894209, + "num_input_tokens_seen": 146478105, + "router_z_loss_clip": 0.84814453, + "router_z_loss_mlp": 0.15124512, + "step": 6825, + "time_per_iteration": 2.489919662475586 + }, + { + "auxiliary_loss_clip": 0.01141465, + "auxiliary_loss_mlp": 0.01037999, + "balance_loss_clip": 1.06009293, + "balance_loss_mlp": 1.02520156, + "epoch": 0.4104013227115587, + "flos": 24352641999360.0, + "grad_norm": 1.9034746936460596, + "language_loss": 0.84730715, + "learning_rate": 2.664587156721768e-06, + "loss": 0.86910176, + "num_input_tokens_seen": 146497835, + "router_z_loss_clip": 0.81347656, + "router_z_loss_mlp": 0.12805176, + "step": 6826, + "time_per_iteration": 2.470247745513916 + }, + { + "auxiliary_loss_clip": 0.01131695, + "auxiliary_loss_mlp": 0.01034683, + "balance_loss_clip": 1.05394983, + "balance_loss_mlp": 1.02066398, + "epoch": 0.4104614459642267, + "flos": 23729340268800.0, + "grad_norm": 1.7796299937845568, + "language_loss": 0.66259325, + "learning_rate": 2.6642198132103696e-06, + "loss": 0.68425709, + "num_input_tokens_seen": 146517735, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.14001465, + "step": 6827, + "time_per_iteration": 2.500930070877075 + }, + { + "auxiliary_loss_clip": 0.01137467, + "auxiliary_loss_mlp": 0.01040554, + "balance_loss_clip": 1.05384278, + "balance_loss_mlp": 1.02682161, + "epoch": 0.41052156921689464, + "flos": 22127976132480.0, + "grad_norm": 1.3920067198578316, + "language_loss": 0.72210437, + "learning_rate": 2.663852444511689e-06, + "loss": 0.74388456, + "num_input_tokens_seen": 146537640, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.1373291, + "step": 6828, + "time_per_iteration": 2.477875232696533 + }, + { + "auxiliary_loss_clip": 0.01141579, + "auxiliary_loss_mlp": 0.01043258, + "balance_loss_clip": 1.05430555, + "balance_loss_mlp": 1.02766585, + "epoch": 0.4105816924695626, + "flos": 20084371747200.0, + "grad_norm": 1.8863150275774334, + "language_loss": 0.83683836, + "learning_rate": 2.6634850506396574e-06, + "loss": 0.85868669, + "num_input_tokens_seen": 146554695, + "router_z_loss_clip": 0.87304688, + "router_z_loss_mlp": 0.15588379, + "step": 6829, + "time_per_iteration": 2.51806640625 + }, + { + "auxiliary_loss_clip": 0.01133417, + "auxiliary_loss_mlp": 0.01035593, + "balance_loss_clip": 1.05247927, + "balance_loss_mlp": 1.02212834, + "epoch": 0.4106418157222306, + "flos": 18076785724800.0, + "grad_norm": 1.6331392531000166, + "language_loss": 0.89987403, + "learning_rate": 2.663117631608206e-06, + "loss": 0.92156416, + "num_input_tokens_seen": 146573740, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.13476562, + "step": 6830, + "time_per_iteration": 2.534571886062622 + }, + { + "auxiliary_loss_clip": 0.0113815, + "auxiliary_loss_mlp": 0.01034312, + "balance_loss_clip": 1.05584562, + "balance_loss_mlp": 1.01998901, + "epoch": 0.41070193897489854, + "flos": 21647850013440.0, + "grad_norm": 2.2787070713426507, + "language_loss": 0.66036993, + "learning_rate": 2.662750187431268e-06, + "loss": 0.68209457, + "num_input_tokens_seen": 146592885, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.14337158, + "step": 6831, + "time_per_iteration": 2.552924394607544 + }, + { + "auxiliary_loss_clip": 0.0114788, + "auxiliary_loss_mlp": 0.01036324, + "balance_loss_clip": 1.06654692, + "balance_loss_mlp": 1.02259076, + "epoch": 0.4107620622275665, + "flos": 26648195356800.0, + "grad_norm": 2.5638347405934865, + "language_loss": 0.6920315, + "learning_rate": 2.662382718122776e-06, + "loss": 0.71387351, + "num_input_tokens_seen": 146611995, + "router_z_loss_clip": 0.81347656, + "router_z_loss_mlp": 0.1373291, + "step": 6832, + "time_per_iteration": 2.508448600769043 + }, + { + "auxiliary_loss_clip": 0.01132762, + "auxiliary_loss_mlp": 0.01037036, + "balance_loss_clip": 1.05286789, + "balance_loss_mlp": 1.02393484, + "epoch": 0.41082218548023447, + "flos": 18734310138240.0, + "grad_norm": 2.1478934375863634, + "language_loss": 0.73677266, + "learning_rate": 2.662015223696666e-06, + "loss": 0.75847054, + "num_input_tokens_seen": 146628045, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.13098145, + "step": 6833, + "time_per_iteration": 2.4823110103607178 + }, + { + "auxiliary_loss_clip": 0.01139366, + "auxiliary_loss_mlp": 0.01050139, + "balance_loss_clip": 1.05236077, + "balance_loss_mlp": 1.03380752, + "epoch": 0.41088230873290243, + "flos": 22893771116160.0, + "grad_norm": 1.664901562238646, + "language_loss": 0.72998536, + "learning_rate": 2.6616477041668713e-06, + "loss": 0.75188041, + "num_input_tokens_seen": 146648355, + "router_z_loss_clip": 0.87011719, + "router_z_loss_mlp": 0.16333008, + "step": 6834, + "time_per_iteration": 2.475064277648926 + }, + { + "auxiliary_loss_clip": 0.01140844, + "auxiliary_loss_mlp": 0.01042524, + "balance_loss_clip": 1.05203891, + "balance_loss_mlp": 1.02749121, + "epoch": 0.4109424319855704, + "flos": 24276978000000.0, + "grad_norm": 2.4289321234577423, + "language_loss": 0.71452802, + "learning_rate": 2.661280159547329e-06, + "loss": 0.73636174, + "num_input_tokens_seen": 146668370, + "router_z_loss_clip": 0.88818359, + "router_z_loss_mlp": 0.15026855, + "step": 6835, + "time_per_iteration": 2.498544692993164 + }, + { + "auxiliary_loss_clip": 0.01133894, + "auxiliary_loss_mlp": 0.01036949, + "balance_loss_clip": 1.05154061, + "balance_loss_mlp": 1.0218811, + "epoch": 0.41100255523823837, + "flos": 12969139478400.0, + "grad_norm": 1.8394359756263632, + "language_loss": 0.86911094, + "learning_rate": 2.660912589851978e-06, + "loss": 0.89081943, + "num_input_tokens_seen": 146686665, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.15081787, + "step": 6836, + "time_per_iteration": 2.4159610271453857 + }, + { + "auxiliary_loss_clip": 0.01131762, + "auxiliary_loss_mlp": 0.01035513, + "balance_loss_clip": 1.05100036, + "balance_loss_mlp": 1.02124929, + "epoch": 0.4110626784909064, + "flos": 23145648261120.0, + "grad_norm": 1.8170507278326644, + "language_loss": 0.68661463, + "learning_rate": 2.6605449950947547e-06, + "loss": 0.70828742, + "num_input_tokens_seen": 146706570, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.14257812, + "step": 6837, + "time_per_iteration": 2.6088764667510986 + }, + { + "auxiliary_loss_clip": 0.01130359, + "auxiliary_loss_mlp": 0.01038625, + "balance_loss_clip": 1.04696536, + "balance_loss_mlp": 1.02368808, + "epoch": 0.41112280174357435, + "flos": 22747399194240.0, + "grad_norm": 1.9189048091280154, + "language_loss": 0.75187087, + "learning_rate": 2.660177375289599e-06, + "loss": 0.7735607, + "num_input_tokens_seen": 146723425, + "router_z_loss_clip": 0.83349609, + "router_z_loss_mlp": 0.14941406, + "step": 6838, + "time_per_iteration": 2.4953675270080566 + }, + { + "auxiliary_loss_clip": 0.01132636, + "auxiliary_loss_mlp": 0.0103442, + "balance_loss_clip": 1.0516417, + "balance_loss_mlp": 1.01967323, + "epoch": 0.4111829249962423, + "flos": 21102403011840.0, + "grad_norm": 1.8711959143105252, + "language_loss": 0.82249606, + "learning_rate": 2.659809730450451e-06, + "loss": 0.84416664, + "num_input_tokens_seen": 146741640, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.1473999, + "step": 6839, + "time_per_iteration": 2.48036789894104 + }, + { + "auxiliary_loss_clip": 0.01121768, + "auxiliary_loss_mlp": 0.01027159, + "balance_loss_clip": 1.04282999, + "balance_loss_mlp": 1.01346231, + "epoch": 0.4112430482489103, + "flos": 21505787723520.0, + "grad_norm": 1.787904489762493, + "language_loss": 0.80480558, + "learning_rate": 2.6594420605912523e-06, + "loss": 0.82629478, + "num_input_tokens_seen": 146759195, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.13696289, + "step": 6840, + "time_per_iteration": 3.88259220123291 + }, + { + "auxiliary_loss_clip": 0.01124788, + "auxiliary_loss_mlp": 0.01036687, + "balance_loss_clip": 1.04715919, + "balance_loss_mlp": 1.02200079, + "epoch": 0.41130317150157825, + "flos": 19570022945280.0, + "grad_norm": 1.8627961345658879, + "language_loss": 0.67627358, + "learning_rate": 2.6590743657259442e-06, + "loss": 0.69788831, + "num_input_tokens_seen": 146774990, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.14685059, + "step": 6841, + "time_per_iteration": 2.4495038986206055 + }, + { + "auxiliary_loss_clip": 0.01057528, + "auxiliary_loss_mlp": 0.01006432, + "balance_loss_clip": 1.02957165, + "balance_loss_mlp": 1.00476313, + "epoch": 0.4113632947542462, + "flos": 62383157706240.0, + "grad_norm": 0.7618490074190475, + "language_loss": 0.59610951, + "learning_rate": 2.65870664586847e-06, + "loss": 0.61674911, + "num_input_tokens_seen": 146839610, + "router_z_loss_clip": 0.27978516, + "router_z_loss_mlp": 0.0166626, + "step": 6842, + "time_per_iteration": 3.173429489135742 + }, + { + "auxiliary_loss_clip": 0.01132651, + "auxiliary_loss_mlp": 0.01035602, + "balance_loss_clip": 1.05288613, + "balance_loss_mlp": 1.02220893, + "epoch": 0.4114234180069142, + "flos": 13918617636480.0, + "grad_norm": 1.9348826929515375, + "language_loss": 0.70243251, + "learning_rate": 2.6583389010327742e-06, + "loss": 0.72411507, + "num_input_tokens_seen": 146857360, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.13409424, + "step": 6843, + "time_per_iteration": 2.5184855461120605 + }, + { + "auxiliary_loss_clip": 0.01062752, + "auxiliary_loss_mlp": 0.01013356, + "balance_loss_clip": 1.03331017, + "balance_loss_mlp": 1.01178861, + "epoch": 0.41148354125958214, + "flos": 64928505219840.0, + "grad_norm": 0.7315773300344287, + "language_loss": 0.53613311, + "learning_rate": 2.6579711312328013e-06, + "loss": 0.55689424, + "num_input_tokens_seen": 146917055, + "router_z_loss_clip": 0.29443359, + "router_z_loss_mlp": 0.01568604, + "step": 6844, + "time_per_iteration": 3.099353313446045 + }, + { + "auxiliary_loss_clip": 0.01133428, + "auxiliary_loss_mlp": 0.01036198, + "balance_loss_clip": 1.05412865, + "balance_loss_mlp": 1.02287054, + "epoch": 0.4115436645122501, + "flos": 18728779443840.0, + "grad_norm": 1.8566045659109782, + "language_loss": 0.66612035, + "learning_rate": 2.6576033364824967e-06, + "loss": 0.68781662, + "num_input_tokens_seen": 146935215, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.13317871, + "step": 6845, + "time_per_iteration": 2.4399027824401855 + }, + { + "auxiliary_loss_clip": 0.01137591, + "auxiliary_loss_mlp": 0.01030921, + "balance_loss_clip": 1.06012225, + "balance_loss_mlp": 1.01752162, + "epoch": 0.41160378776491807, + "flos": 16252918790400.0, + "grad_norm": 1.9756792812223372, + "language_loss": 0.69942546, + "learning_rate": 2.657235516795808e-06, + "loss": 0.72111058, + "num_input_tokens_seen": 146951970, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.1338501, + "step": 6846, + "time_per_iteration": 2.430983543395996 + }, + { + "auxiliary_loss_clip": 0.01127995, + "auxiliary_loss_mlp": 0.01037223, + "balance_loss_clip": 1.04743648, + "balance_loss_mlp": 1.0225246, + "epoch": 0.41166391101758604, + "flos": 27970031854080.0, + "grad_norm": 1.3677128019890528, + "language_loss": 0.65533519, + "learning_rate": 2.6568676721866826e-06, + "loss": 0.67698741, + "num_input_tokens_seen": 146975615, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.14691162, + "step": 6847, + "time_per_iteration": 2.516974449157715 + }, + { + "auxiliary_loss_clip": 0.01133056, + "auxiliary_loss_mlp": 0.01034894, + "balance_loss_clip": 1.05412436, + "balance_loss_mlp": 1.02092898, + "epoch": 0.411724034270254, + "flos": 34131296764800.0, + "grad_norm": 1.4468640404734407, + "language_loss": 0.70633876, + "learning_rate": 2.656499802669069e-06, + "loss": 0.72801828, + "num_input_tokens_seen": 146998855, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.13964844, + "step": 6848, + "time_per_iteration": 2.5625712871551514 + }, + { + "auxiliary_loss_clip": 0.01074696, + "auxiliary_loss_mlp": 0.01009976, + "balance_loss_clip": 1.04437232, + "balance_loss_mlp": 1.00842977, + "epoch": 0.41178415752292197, + "flos": 67923670752000.0, + "grad_norm": 0.8932479868741793, + "language_loss": 0.56266832, + "learning_rate": 2.6561319082569174e-06, + "loss": 0.58351499, + "num_input_tokens_seen": 147062710, + "router_z_loss_clip": 0.30371094, + "router_z_loss_mlp": 0.01544189, + "step": 6849, + "time_per_iteration": 4.595993757247925 + }, + { + "auxiliary_loss_clip": 0.01144368, + "auxiliary_loss_mlp": 0.01038102, + "balance_loss_clip": 1.06343424, + "balance_loss_mlp": 1.02371383, + "epoch": 0.41184428077558993, + "flos": 34313938444800.0, + "grad_norm": 1.9395850405170187, + "language_loss": 0.75951111, + "learning_rate": 2.6557639889641783e-06, + "loss": 0.78133583, + "num_input_tokens_seen": 147086075, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.14379883, + "step": 6850, + "time_per_iteration": 2.596789598464966 + }, + { + "auxiliary_loss_clip": 0.01142908, + "auxiliary_loss_mlp": 0.01034278, + "balance_loss_clip": 1.06116652, + "balance_loss_mlp": 1.02045608, + "epoch": 0.41190440402825795, + "flos": 35444118948480.0, + "grad_norm": 1.5347877183559204, + "language_loss": 0.68084681, + "learning_rate": 2.6553960448048025e-06, + "loss": 0.70261872, + "num_input_tokens_seen": 147107590, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.13830566, + "step": 6851, + "time_per_iteration": 2.6233222484588623 + }, + { + "auxiliary_loss_clip": 0.01156339, + "auxiliary_loss_mlp": 0.01043649, + "balance_loss_clip": 1.06850171, + "balance_loss_mlp": 1.02704358, + "epoch": 0.4119645272809259, + "flos": 20849879422080.0, + "grad_norm": 2.167250173648081, + "language_loss": 0.79481727, + "learning_rate": 2.655028075792743e-06, + "loss": 0.8168171, + "num_input_tokens_seen": 147123715, + "router_z_loss_clip": 0.87890625, + "router_z_loss_mlp": 0.1661377, + "step": 6852, + "time_per_iteration": 2.4852347373962402 + }, + { + "auxiliary_loss_clip": 0.01138811, + "auxiliary_loss_mlp": 0.01034714, + "balance_loss_clip": 1.0553143, + "balance_loss_mlp": 1.01907432, + "epoch": 0.4120246505335939, + "flos": 27562050201600.0, + "grad_norm": 2.061355846240672, + "language_loss": 0.77391315, + "learning_rate": 2.6546600819419537e-06, + "loss": 0.7956484, + "num_input_tokens_seen": 147144290, + "router_z_loss_clip": 0.83496094, + "router_z_loss_mlp": 0.15661621, + "step": 6853, + "time_per_iteration": 2.681065797805786 + }, + { + "auxiliary_loss_clip": 0.01143172, + "auxiliary_loss_mlp": 0.01039432, + "balance_loss_clip": 1.05828679, + "balance_loss_mlp": 1.0239234, + "epoch": 0.41208477378626185, + "flos": 37815444046080.0, + "grad_norm": 1.6485973163827656, + "language_loss": 0.65428102, + "learning_rate": 2.6542920632663883e-06, + "loss": 0.67610711, + "num_input_tokens_seen": 147166340, + "router_z_loss_clip": 0.84912109, + "router_z_loss_mlp": 0.1550293, + "step": 6854, + "time_per_iteration": 2.6248457431793213 + }, + { + "auxiliary_loss_clip": 0.01143561, + "auxiliary_loss_mlp": 0.01032212, + "balance_loss_clip": 1.06293678, + "balance_loss_mlp": 1.018718, + "epoch": 0.4121448970389298, + "flos": 23440762402560.0, + "grad_norm": 1.7947953242775643, + "language_loss": 0.83425456, + "learning_rate": 2.6539240197800023e-06, + "loss": 0.85601234, + "num_input_tokens_seen": 147184025, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.13482666, + "step": 6855, + "time_per_iteration": 2.538944721221924 + }, + { + "auxiliary_loss_clip": 0.01136302, + "auxiliary_loss_mlp": 0.01039374, + "balance_loss_clip": 1.05851531, + "balance_loss_mlp": 1.02586198, + "epoch": 0.4122050202915978, + "flos": 21325300859520.0, + "grad_norm": 1.8918162822825364, + "language_loss": 0.79007876, + "learning_rate": 2.6535559514967517e-06, + "loss": 0.81183553, + "num_input_tokens_seen": 147202730, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.1350708, + "step": 6856, + "time_per_iteration": 2.480046510696411 + }, + { + "auxiliary_loss_clip": 0.01146494, + "auxiliary_loss_mlp": 0.01035855, + "balance_loss_clip": 1.06470203, + "balance_loss_mlp": 1.02146602, + "epoch": 0.41226514354426574, + "flos": 17306286059520.0, + "grad_norm": 2.5716204139188936, + "language_loss": 0.79976833, + "learning_rate": 2.6531878584305935e-06, + "loss": 0.82159185, + "num_input_tokens_seen": 147215315, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.14404297, + "step": 6857, + "time_per_iteration": 2.45525860786438 + }, + { + "auxiliary_loss_clip": 0.01135095, + "auxiliary_loss_mlp": 0.01038182, + "balance_loss_clip": 1.05288875, + "balance_loss_mlp": 1.02314973, + "epoch": 0.4123252667969337, + "flos": 17638855107840.0, + "grad_norm": 1.9879314977972362, + "language_loss": 0.71327239, + "learning_rate": 2.6528197405954873e-06, + "loss": 0.73500514, + "num_input_tokens_seen": 147233330, + "router_z_loss_clip": 0.82275391, + "router_z_loss_mlp": 0.15026855, + "step": 6858, + "time_per_iteration": 2.499699115753174 + }, + { + "auxiliary_loss_clip": 0.0112389, + "auxiliary_loss_mlp": 0.01036684, + "balance_loss_clip": 1.04564846, + "balance_loss_mlp": 1.02177739, + "epoch": 0.4123853900496017, + "flos": 46424811375360.0, + "grad_norm": 1.4122470695613327, + "language_loss": 0.59377313, + "learning_rate": 2.652451598005391e-06, + "loss": 0.61537892, + "num_input_tokens_seen": 147257780, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.14916992, + "step": 6859, + "time_per_iteration": 2.743387222290039 + }, + { + "auxiliary_loss_clip": 0.01131581, + "auxiliary_loss_mlp": 0.01036298, + "balance_loss_clip": 1.04915261, + "balance_loss_mlp": 1.02220798, + "epoch": 0.41244551330226964, + "flos": 17675160779520.0, + "grad_norm": 2.031191927338092, + "language_loss": 0.73574734, + "learning_rate": 2.652083430674264e-06, + "loss": 0.75742614, + "num_input_tokens_seen": 147276055, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.14086914, + "step": 6860, + "time_per_iteration": 2.4485316276550293 + }, + { + "auxiliary_loss_clip": 0.01129775, + "auxiliary_loss_mlp": 0.01031283, + "balance_loss_clip": 1.0489676, + "balance_loss_mlp": 1.01782441, + "epoch": 0.4125056365549376, + "flos": 18693730748160.0, + "grad_norm": 1.601111560374126, + "language_loss": 0.74765158, + "learning_rate": 2.651715238616068e-06, + "loss": 0.76926219, + "num_input_tokens_seen": 147293200, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.13464355, + "step": 6861, + "time_per_iteration": 2.4512410163879395 + }, + { + "auxiliary_loss_clip": 0.01142545, + "auxiliary_loss_mlp": 0.01033775, + "balance_loss_clip": 1.06033647, + "balance_loss_mlp": 1.02023852, + "epoch": 0.41256575980760557, + "flos": 17895293280000.0, + "grad_norm": 2.0910525605716948, + "language_loss": 0.79491121, + "learning_rate": 2.651347021844765e-06, + "loss": 0.81667435, + "num_input_tokens_seen": 147310640, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.13543701, + "step": 6862, + "time_per_iteration": 2.4393537044525146 + }, + { + "auxiliary_loss_clip": 0.01140741, + "auxiliary_loss_mlp": 0.01032657, + "balance_loss_clip": 1.05983233, + "balance_loss_mlp": 1.01867366, + "epoch": 0.41262588306027354, + "flos": 21981316901760.0, + "grad_norm": 1.7118388616614864, + "language_loss": 0.75999081, + "learning_rate": 2.650978780374318e-06, + "loss": 0.78172475, + "num_input_tokens_seen": 147329435, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.14001465, + "step": 6863, + "time_per_iteration": 3.856152057647705 + }, + { + "auxiliary_loss_clip": 0.01051751, + "auxiliary_loss_mlp": 0.01002712, + "balance_loss_clip": 1.02430892, + "balance_loss_mlp": 1.00076044, + "epoch": 0.41268600631294156, + "flos": 53350006740480.0, + "grad_norm": 0.7039850949763182, + "language_loss": 0.52747279, + "learning_rate": 2.650610514218691e-06, + "loss": 0.54801738, + "num_input_tokens_seen": 147385805, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.01953125, + "step": 6864, + "time_per_iteration": 3.0391886234283447 + }, + { + "auxiliary_loss_clip": 0.01128717, + "auxiliary_loss_mlp": 0.01034297, + "balance_loss_clip": 1.04560518, + "balance_loss_mlp": 1.01944339, + "epoch": 0.4127461295656095, + "flos": 24385356311040.0, + "grad_norm": 1.5991363364642441, + "language_loss": 0.72702885, + "learning_rate": 2.6502422233918468e-06, + "loss": 0.7486589, + "num_input_tokens_seen": 147405160, + "router_z_loss_clip": 0.83105469, + "router_z_loss_mlp": 0.14849854, + "step": 6865, + "time_per_iteration": 2.5477612018585205 + }, + { + "auxiliary_loss_clip": 0.01067033, + "auxiliary_loss_mlp": 0.01006263, + "balance_loss_clip": 1.03902555, + "balance_loss_mlp": 1.00454497, + "epoch": 0.4128062528182775, + "flos": 71705242696320.0, + "grad_norm": 0.9250298666838213, + "language_loss": 0.66629064, + "learning_rate": 2.649873907907753e-06, + "loss": 0.68702364, + "num_input_tokens_seen": 147460245, + "router_z_loss_clip": 0.28076172, + "router_z_loss_mlp": 0.01716614, + "step": 6866, + "time_per_iteration": 2.9410037994384766 + }, + { + "auxiliary_loss_clip": 0.01135005, + "auxiliary_loss_mlp": 0.01036661, + "balance_loss_clip": 1.05438185, + "balance_loss_mlp": 1.02091312, + "epoch": 0.41286637607094545, + "flos": 17849111368320.0, + "grad_norm": 2.345055350347275, + "language_loss": 0.81158924, + "learning_rate": 2.649505567780375e-06, + "loss": 0.8333059, + "num_input_tokens_seen": 147476200, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.1574707, + "step": 6867, + "time_per_iteration": 3.8780405521392822 + }, + { + "auxiliary_loss_clip": 0.01141811, + "auxiliary_loss_mlp": 0.01033904, + "balance_loss_clip": 1.05792165, + "balance_loss_mlp": 1.01950312, + "epoch": 0.4129264993236134, + "flos": 25549544016000.0, + "grad_norm": 2.5370743043269015, + "language_loss": 0.77673614, + "learning_rate": 2.6491372030236815e-06, + "loss": 0.79849327, + "num_input_tokens_seen": 147494315, + "router_z_loss_clip": 0.83886719, + "router_z_loss_mlp": 0.1439209, + "step": 6868, + "time_per_iteration": 2.4752254486083984 + }, + { + "auxiliary_loss_clip": 0.01058909, + "auxiliary_loss_mlp": 0.01005369, + "balance_loss_clip": 1.03085852, + "balance_loss_mlp": 1.00371814, + "epoch": 0.4129866225762814, + "flos": 65414446364160.0, + "grad_norm": 0.8335122244843226, + "language_loss": 0.57769823, + "learning_rate": 2.64876881365164e-06, + "loss": 0.59834105, + "num_input_tokens_seen": 147543665, + "router_z_loss_clip": 0.28027344, + "router_z_loss_mlp": 0.01654053, + "step": 6869, + "time_per_iteration": 2.8331120014190674 + }, + { + "auxiliary_loss_clip": 0.01134493, + "auxiliary_loss_mlp": 0.01032747, + "balance_loss_clip": 1.05623353, + "balance_loss_mlp": 1.0187099, + "epoch": 0.41304674582894935, + "flos": 28876991287680.0, + "grad_norm": 1.8703551088373145, + "language_loss": 0.75340801, + "learning_rate": 2.64840039967822e-06, + "loss": 0.77508038, + "num_input_tokens_seen": 147564870, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.14031982, + "step": 6870, + "time_per_iteration": 2.5263140201568604 + }, + { + "auxiliary_loss_clip": 0.01138546, + "auxiliary_loss_mlp": 0.01037525, + "balance_loss_clip": 1.05517602, + "balance_loss_mlp": 1.0233686, + "epoch": 0.4131068690816173, + "flos": 22891975436160.0, + "grad_norm": 1.6259179815997236, + "language_loss": 0.83733642, + "learning_rate": 2.6480319611173912e-06, + "loss": 0.85909712, + "num_input_tokens_seen": 147584840, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.14172363, + "step": 6871, + "time_per_iteration": 2.4952147006988525 + }, + { + "auxiliary_loss_clip": 0.01131674, + "auxiliary_loss_mlp": 0.01039197, + "balance_loss_clip": 1.05108452, + "balance_loss_mlp": 1.02495754, + "epoch": 0.4131669923342853, + "flos": 26065185707520.0, + "grad_norm": 2.264505463789194, + "language_loss": 0.68782532, + "learning_rate": 2.6476634979831263e-06, + "loss": 0.70953405, + "num_input_tokens_seen": 147604635, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.14233398, + "step": 6872, + "time_per_iteration": 2.50759220123291 + }, + { + "auxiliary_loss_clip": 0.0113719, + "auxiliary_loss_mlp": 0.01037524, + "balance_loss_clip": 1.05336142, + "balance_loss_mlp": 1.02287352, + "epoch": 0.41322711558695324, + "flos": 19244564789760.0, + "grad_norm": 1.8498210464254505, + "language_loss": 0.7584666, + "learning_rate": 2.6472950102893964e-06, + "loss": 0.78021371, + "num_input_tokens_seen": 147620700, + "router_z_loss_clip": 0.83740234, + "router_z_loss_mlp": 0.14654541, + "step": 6873, + "time_per_iteration": 2.434392213821411 + }, + { + "auxiliary_loss_clip": 0.01148546, + "auxiliary_loss_mlp": 0.01033763, + "balance_loss_clip": 1.06131554, + "balance_loss_mlp": 1.01957083, + "epoch": 0.4132872388396212, + "flos": 22674464628480.0, + "grad_norm": 1.7651410547866362, + "language_loss": 0.83786339, + "learning_rate": 2.6469264980501746e-06, + "loss": 0.85968649, + "num_input_tokens_seen": 147639490, + "router_z_loss_clip": 0.87158203, + "router_z_loss_mlp": 0.1418457, + "step": 6874, + "time_per_iteration": 2.5107853412628174 + }, + { + "auxiliary_loss_clip": 0.01137919, + "auxiliary_loss_mlp": 0.0105018, + "balance_loss_clip": 1.05140555, + "balance_loss_mlp": 1.03455162, + "epoch": 0.4133473620922892, + "flos": 20150195420160.0, + "grad_norm": 1.8456637034871464, + "language_loss": 0.71839571, + "learning_rate": 2.646557961279436e-06, + "loss": 0.74027669, + "num_input_tokens_seen": 147657205, + "router_z_loss_clip": 0.86621094, + "router_z_loss_mlp": 0.15618896, + "step": 6875, + "time_per_iteration": 2.546736001968384 + }, + { + "auxiliary_loss_clip": 0.01123243, + "auxiliary_loss_mlp": 0.01032254, + "balance_loss_clip": 1.04844189, + "balance_loss_mlp": 1.01957047, + "epoch": 0.41340748534495714, + "flos": 24242755317120.0, + "grad_norm": 1.5190633466445886, + "language_loss": 0.82853425, + "learning_rate": 2.646189399991154e-06, + "loss": 0.85008931, + "num_input_tokens_seen": 147677005, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.12689209, + "step": 6876, + "time_per_iteration": 2.585897445678711 + }, + { + "auxiliary_loss_clip": 0.01133345, + "auxiliary_loss_mlp": 0.01046539, + "balance_loss_clip": 1.0490725, + "balance_loss_mlp": 1.02825224, + "epoch": 0.41346760859762516, + "flos": 14392171566720.0, + "grad_norm": 2.176750010188637, + "language_loss": 0.6494121, + "learning_rate": 2.6458208141993048e-06, + "loss": 0.67121094, + "num_input_tokens_seen": 147693435, + "router_z_loss_clip": 0.84228516, + "router_z_loss_mlp": 0.18286133, + "step": 6877, + "time_per_iteration": 2.436814546585083 + }, + { + "auxiliary_loss_clip": 0.01135896, + "auxiliary_loss_mlp": 0.0103687, + "balance_loss_clip": 1.05558968, + "balance_loss_mlp": 1.02220154, + "epoch": 0.4135277318502931, + "flos": 22492002516480.0, + "grad_norm": 1.7681183241817116, + "language_loss": 0.76267266, + "learning_rate": 2.6454522039178668e-06, + "loss": 0.78440028, + "num_input_tokens_seen": 147714000, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.14654541, + "step": 6878, + "time_per_iteration": 2.5367705821990967 + }, + { + "auxiliary_loss_clip": 0.01135373, + "auxiliary_loss_mlp": 0.01038395, + "balance_loss_clip": 1.05443001, + "balance_loss_mlp": 1.02477503, + "epoch": 0.4135878551029611, + "flos": 22418744728320.0, + "grad_norm": 1.8912354356489098, + "language_loss": 0.80010891, + "learning_rate": 2.6450835691608154e-06, + "loss": 0.8218466, + "num_input_tokens_seen": 147731010, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.13616943, + "step": 6879, + "time_per_iteration": 2.4635255336761475 + }, + { + "auxiliary_loss_clip": 0.01142502, + "auxiliary_loss_mlp": 0.01036255, + "balance_loss_clip": 1.05879045, + "balance_loss_mlp": 1.02205682, + "epoch": 0.41364797835562905, + "flos": 27053232094080.0, + "grad_norm": 1.6665905699707877, + "language_loss": 0.84992218, + "learning_rate": 2.6447149099421315e-06, + "loss": 0.87170982, + "num_input_tokens_seen": 147750880, + "router_z_loss_clip": 0.83789062, + "router_z_loss_mlp": 0.1418457, + "step": 6880, + "time_per_iteration": 2.5175280570983887 + }, + { + "auxiliary_loss_clip": 0.01137065, + "auxiliary_loss_mlp": 0.01030849, + "balance_loss_clip": 1.05464768, + "balance_loss_mlp": 1.01727712, + "epoch": 0.413708101608297, + "flos": 22967603521920.0, + "grad_norm": 1.795035236250054, + "language_loss": 0.70442533, + "learning_rate": 2.6443462262757927e-06, + "loss": 0.7261045, + "num_input_tokens_seen": 147771360, + "router_z_loss_clip": 0.82568359, + "router_z_loss_mlp": 0.13580322, + "step": 6881, + "time_per_iteration": 2.4533400535583496 + }, + { + "auxiliary_loss_clip": 0.0114224, + "auxiliary_loss_mlp": 0.01045255, + "balance_loss_clip": 1.06122696, + "balance_loss_mlp": 1.03189778, + "epoch": 0.413768224860965, + "flos": 13333991875200.0, + "grad_norm": 1.9643737699702395, + "language_loss": 0.80837244, + "learning_rate": 2.6439775181757805e-06, + "loss": 0.83024734, + "num_input_tokens_seen": 147787440, + "router_z_loss_clip": 0.80957031, + "router_z_loss_mlp": 0.13372803, + "step": 6882, + "time_per_iteration": 2.441330671310425 + }, + { + "auxiliary_loss_clip": 0.01141225, + "auxiliary_loss_mlp": 0.01039523, + "balance_loss_clip": 1.05719519, + "balance_loss_mlp": 1.02340031, + "epoch": 0.41382834811363295, + "flos": 20813968800000.0, + "grad_norm": 2.1442607632609403, + "language_loss": 0.70017552, + "learning_rate": 2.643608785656077e-06, + "loss": 0.72198308, + "num_input_tokens_seen": 147805720, + "router_z_loss_clip": 0.84082031, + "router_z_loss_mlp": 0.16125488, + "step": 6883, + "time_per_iteration": 3.9024837017059326 + }, + { + "auxiliary_loss_clip": 0.01135535, + "auxiliary_loss_mlp": 0.01031685, + "balance_loss_clip": 1.05318415, + "balance_loss_mlp": 1.01826203, + "epoch": 0.4138884713663009, + "flos": 20667130001280.0, + "grad_norm": 1.8361016843537434, + "language_loss": 0.76323497, + "learning_rate": 2.643240028730663e-06, + "loss": 0.78490722, + "num_input_tokens_seen": 147824605, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.13427734, + "step": 6884, + "time_per_iteration": 2.4688680171966553 + }, + { + "auxiliary_loss_clip": 0.01136309, + "auxiliary_loss_mlp": 0.01037135, + "balance_loss_clip": 1.05346036, + "balance_loss_mlp": 1.02303874, + "epoch": 0.4139485946189689, + "flos": 29056616225280.0, + "grad_norm": 1.3963263869495368, + "language_loss": 0.75631046, + "learning_rate": 2.642871247413523e-06, + "loss": 0.77804494, + "num_input_tokens_seen": 147845445, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.14105225, + "step": 6885, + "time_per_iteration": 2.5395426750183105 + }, + { + "auxiliary_loss_clip": 0.01140008, + "auxiliary_loss_mlp": 0.01036646, + "balance_loss_clip": 1.05771875, + "balance_loss_mlp": 1.02263308, + "epoch": 0.41400871787163684, + "flos": 24425720219520.0, + "grad_norm": 1.801991825880803, + "language_loss": 0.69445634, + "learning_rate": 2.6425024417186414e-06, + "loss": 0.71622282, + "num_input_tokens_seen": 147865580, + "router_z_loss_clip": 0.82275391, + "router_z_loss_mlp": 0.14007568, + "step": 6886, + "time_per_iteration": 2.4521167278289795 + }, + { + "auxiliary_loss_clip": 0.01138441, + "auxiliary_loss_mlp": 0.01037857, + "balance_loss_clip": 1.05605435, + "balance_loss_mlp": 1.02345634, + "epoch": 0.4140688411243048, + "flos": 19464050845440.0, + "grad_norm": 2.0478586916366814, + "language_loss": 0.75204754, + "learning_rate": 2.642133611660002e-06, + "loss": 0.77381051, + "num_input_tokens_seen": 147885230, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.144104, + "step": 6887, + "time_per_iteration": 2.4874649047851562 + }, + { + "auxiliary_loss_clip": 0.01133374, + "auxiliary_loss_mlp": 0.01032145, + "balance_loss_clip": 1.05279195, + "balance_loss_mlp": 1.01825118, + "epoch": 0.4141289643769728, + "flos": 19313656600320.0, + "grad_norm": 2.041947013987108, + "language_loss": 0.70256686, + "learning_rate": 2.641764757251592e-06, + "loss": 0.72422206, + "num_input_tokens_seen": 147903035, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.13891602, + "step": 6888, + "time_per_iteration": 2.5140299797058105 + }, + { + "auxiliary_loss_clip": 0.01131572, + "auxiliary_loss_mlp": 0.01035515, + "balance_loss_clip": 1.05283546, + "balance_loss_mlp": 1.02125156, + "epoch": 0.41418908762964074, + "flos": 16726903683840.0, + "grad_norm": 3.7771245377411655, + "language_loss": 0.76245427, + "learning_rate": 2.6413958785073976e-06, + "loss": 0.78412509, + "num_input_tokens_seen": 147918745, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.14263916, + "step": 6889, + "time_per_iteration": 2.497562885284424 + }, + { + "auxiliary_loss_clip": 0.01135887, + "auxiliary_loss_mlp": 0.01035282, + "balance_loss_clip": 1.0548327, + "balance_loss_mlp": 1.02115571, + "epoch": 0.41424921088230876, + "flos": 25296840858240.0, + "grad_norm": 1.5007875823426948, + "language_loss": 0.80057204, + "learning_rate": 2.6410269754414074e-06, + "loss": 0.82228374, + "num_input_tokens_seen": 147938265, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.14129639, + "step": 6890, + "time_per_iteration": 2.5575404167175293 + }, + { + "auxiliary_loss_clip": 0.01134362, + "auxiliary_loss_mlp": 0.01037676, + "balance_loss_clip": 1.05391788, + "balance_loss_mlp": 1.0230546, + "epoch": 0.4143093341349767, + "flos": 20960520289920.0, + "grad_norm": 1.8107584254545466, + "language_loss": 0.74148679, + "learning_rate": 2.6406580480676113e-06, + "loss": 0.7632072, + "num_input_tokens_seen": 147957320, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.14630127, + "step": 6891, + "time_per_iteration": 2.5908703804016113 + }, + { + "auxiliary_loss_clip": 0.01141313, + "auxiliary_loss_mlp": 0.01040317, + "balance_loss_clip": 1.05800664, + "balance_loss_mlp": 1.02478445, + "epoch": 0.4143694573876447, + "flos": 22017694400640.0, + "grad_norm": 1.6697940844150037, + "language_loss": 0.84458792, + "learning_rate": 2.6402890963999963e-06, + "loss": 0.86640429, + "num_input_tokens_seen": 147977045, + "router_z_loss_clip": 0.83349609, + "router_z_loss_mlp": 0.15533447, + "step": 6892, + "time_per_iteration": 2.45389461517334 + }, + { + "auxiliary_loss_clip": 0.01139428, + "auxiliary_loss_mlp": 0.01028042, + "balance_loss_clip": 1.05916691, + "balance_loss_mlp": 1.01482201, + "epoch": 0.41442958064031266, + "flos": 35697396723840.0, + "grad_norm": 1.9299363717780889, + "language_loss": 0.70415694, + "learning_rate": 2.6399201204525554e-06, + "loss": 0.72583163, + "num_input_tokens_seen": 147996905, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.13214111, + "step": 6893, + "time_per_iteration": 4.030860900878906 + }, + { + "auxiliary_loss_clip": 0.01138726, + "auxiliary_loss_mlp": 0.01032506, + "balance_loss_clip": 1.05953503, + "balance_loss_mlp": 1.01842773, + "epoch": 0.4144897038929806, + "flos": 28293766156800.0, + "grad_norm": 1.630420322371555, + "language_loss": 0.7255584, + "learning_rate": 2.639551120239279e-06, + "loss": 0.7472707, + "num_input_tokens_seen": 148017875, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.14093018, + "step": 6894, + "time_per_iteration": 2.5542244911193848 + }, + { + "auxiliary_loss_clip": 0.0113986, + "auxiliary_loss_mlp": 0.0103636, + "balance_loss_clip": 1.05808711, + "balance_loss_mlp": 1.0219779, + "epoch": 0.4145498271456486, + "flos": 11648093080320.0, + "grad_norm": 3.8559944944110325, + "language_loss": 0.62598693, + "learning_rate": 2.63918209577416e-06, + "loss": 0.64774907, + "num_input_tokens_seen": 148032300, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.14379883, + "step": 6895, + "time_per_iteration": 2.4268155097961426 + }, + { + "auxiliary_loss_clip": 0.0113415, + "auxiliary_loss_mlp": 0.01032065, + "balance_loss_clip": 1.05452275, + "balance_loss_mlp": 1.01854634, + "epoch": 0.41460995039831655, + "flos": 27235622378880.0, + "grad_norm": 1.4029926894893858, + "language_loss": 0.70575708, + "learning_rate": 2.638813047071192e-06, + "loss": 0.72741926, + "num_input_tokens_seen": 148053260, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.13525391, + "step": 6896, + "time_per_iteration": 2.5277106761932373 + }, + { + "auxiliary_loss_clip": 0.01134192, + "auxiliary_loss_mlp": 0.01040305, + "balance_loss_clip": 1.05238664, + "balance_loss_mlp": 1.02497458, + "epoch": 0.4146700736509845, + "flos": 25922369232000.0, + "grad_norm": 2.676414662701245, + "language_loss": 0.7292124, + "learning_rate": 2.6384439741443696e-06, + "loss": 0.75095737, + "num_input_tokens_seen": 148072965, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.15344238, + "step": 6897, + "time_per_iteration": 2.5277349948883057 + }, + { + "auxiliary_loss_clip": 0.01137965, + "auxiliary_loss_mlp": 0.01043784, + "balance_loss_clip": 1.05917406, + "balance_loss_mlp": 1.02940083, + "epoch": 0.4147301969036525, + "flos": 26833243248000.0, + "grad_norm": 1.6577097209166274, + "language_loss": 0.84407681, + "learning_rate": 2.6380748770076873e-06, + "loss": 0.86589432, + "num_input_tokens_seen": 148093240, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.1439209, + "step": 6898, + "time_per_iteration": 2.5406527519226074 + }, + { + "auxiliary_loss_clip": 0.01133867, + "auxiliary_loss_mlp": 0.01040943, + "balance_loss_clip": 1.04939604, + "balance_loss_mlp": 1.02604151, + "epoch": 0.41479032015632045, + "flos": 20298291194880.0, + "grad_norm": 2.352188321961987, + "language_loss": 0.74938095, + "learning_rate": 2.6377057556751416e-06, + "loss": 0.77112901, + "num_input_tokens_seen": 148110925, + "router_z_loss_clip": 0.84375, + "router_z_loss_mlp": 0.14904785, + "step": 6899, + "time_per_iteration": 2.4486260414123535 + }, + { + "auxiliary_loss_clip": 0.01133798, + "auxiliary_loss_mlp": 0.01035022, + "balance_loss_clip": 1.0510087, + "balance_loss_mlp": 1.01993561, + "epoch": 0.4148504434089884, + "flos": 25264988472960.0, + "grad_norm": 1.723456802479543, + "language_loss": 0.75791442, + "learning_rate": 2.6373366101607306e-06, + "loss": 0.77960265, + "num_input_tokens_seen": 148130670, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.15081787, + "step": 6900, + "time_per_iteration": 2.502013921737671 + }, + { + "auxiliary_loss_clip": 0.01142646, + "auxiliary_loss_mlp": 0.01039402, + "balance_loss_clip": 1.06125331, + "balance_loss_mlp": 1.02408338, + "epoch": 0.4149105666616564, + "flos": 12822300679680.0, + "grad_norm": 2.329702253163153, + "language_loss": 0.79927772, + "learning_rate": 2.6369674404784503e-06, + "loss": 0.82109821, + "num_input_tokens_seen": 148148350, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.15301514, + "step": 6901, + "time_per_iteration": 2.4589295387268066 + }, + { + "auxiliary_loss_clip": 0.01126559, + "auxiliary_loss_mlp": 0.01032891, + "balance_loss_clip": 1.0469563, + "balance_loss_mlp": 1.01900947, + "epoch": 0.41497068991432434, + "flos": 16763891713920.0, + "grad_norm": 1.7291931158299556, + "language_loss": 0.69853008, + "learning_rate": 2.6365982466423014e-06, + "loss": 0.72012454, + "num_input_tokens_seen": 148167550, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.13879395, + "step": 6902, + "time_per_iteration": 2.6904642581939697 + }, + { + "auxiliary_loss_clip": 0.01126765, + "auxiliary_loss_mlp": 0.0103525, + "balance_loss_clip": 1.04862356, + "balance_loss_mlp": 1.02158272, + "epoch": 0.4150308131669923, + "flos": 18000906243840.0, + "grad_norm": 1.7693112527759578, + "language_loss": 0.83354026, + "learning_rate": 2.6362290286662834e-06, + "loss": 0.85516036, + "num_input_tokens_seen": 148184740, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.13677979, + "step": 6903, + "time_per_iteration": 2.522371530532837 + }, + { + "auxiliary_loss_clip": 0.01135501, + "auxiliary_loss_mlp": 0.01038925, + "balance_loss_clip": 1.05080807, + "balance_loss_mlp": 1.02206862, + "epoch": 0.41509093641966033, + "flos": 30044770352640.0, + "grad_norm": 1.9715249757586304, + "language_loss": 0.67745388, + "learning_rate": 2.6358597865643968e-06, + "loss": 0.69919813, + "num_input_tokens_seen": 148204605, + "router_z_loss_clip": 0.84667969, + "router_z_loss_mlp": 0.16845703, + "step": 6904, + "time_per_iteration": 2.569974660873413 + }, + { + "auxiliary_loss_clip": 0.01139436, + "auxiliary_loss_mlp": 0.01036274, + "balance_loss_clip": 1.0564934, + "balance_loss_mlp": 1.02144456, + "epoch": 0.4151510596723283, + "flos": 24279994742400.0, + "grad_norm": 1.9249572703059832, + "language_loss": 0.77941662, + "learning_rate": 2.635490520350643e-06, + "loss": 0.80117369, + "num_input_tokens_seen": 148224675, + "router_z_loss_clip": 0.83056641, + "router_z_loss_mlp": 0.1484375, + "step": 6905, + "time_per_iteration": 2.4759678840637207 + }, + { + "auxiliary_loss_clip": 0.01130868, + "auxiliary_loss_mlp": 0.01033031, + "balance_loss_clip": 1.04926777, + "balance_loss_mlp": 1.0180583, + "epoch": 0.41521118292499626, + "flos": 23476206147840.0, + "grad_norm": 1.9721500180001812, + "language_loss": 0.68700242, + "learning_rate": 2.635121230039025e-06, + "loss": 0.70864141, + "num_input_tokens_seen": 148243375, + "router_z_loss_clip": 0.81738281, + "router_z_loss_mlp": 0.14971924, + "step": 6906, + "time_per_iteration": 2.476567268371582 + }, + { + "auxiliary_loss_clip": 0.01125161, + "auxiliary_loss_mlp": 0.0103315, + "balance_loss_clip": 1.04713476, + "balance_loss_mlp": 1.01950645, + "epoch": 0.4152713061776642, + "flos": 22125498094080.0, + "grad_norm": 2.0676185029740695, + "language_loss": 0.67035264, + "learning_rate": 2.6347519156435467e-06, + "loss": 0.69193578, + "num_input_tokens_seen": 148261140, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.13653564, + "step": 6907, + "time_per_iteration": 3.85471773147583 + }, + { + "auxiliary_loss_clip": 0.01132582, + "auxiliary_loss_mlp": 0.01036164, + "balance_loss_clip": 1.0517844, + "balance_loss_mlp": 1.02266979, + "epoch": 0.4153314294303322, + "flos": 21251396626560.0, + "grad_norm": 7.099127242610828, + "language_loss": 0.77096367, + "learning_rate": 2.6343825771782123e-06, + "loss": 0.79265112, + "num_input_tokens_seen": 148279655, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.13494873, + "step": 6908, + "time_per_iteration": 2.5261285305023193 + }, + { + "auxiliary_loss_clip": 0.01069661, + "auxiliary_loss_mlp": 0.01022424, + "balance_loss_clip": 1.0406816, + "balance_loss_mlp": 1.02070928, + "epoch": 0.41539155268300015, + "flos": 57920681594880.0, + "grad_norm": 0.7723768873506642, + "language_loss": 0.64811355, + "learning_rate": 2.634013214657026e-06, + "loss": 0.66903448, + "num_input_tokens_seen": 148339005, + "router_z_loss_clip": 0.28955078, + "router_z_loss_mlp": 0.01715088, + "step": 6909, + "time_per_iteration": 3.006920099258423 + }, + { + "auxiliary_loss_clip": 0.01134212, + "auxiliary_loss_mlp": 0.01033446, + "balance_loss_clip": 1.05413222, + "balance_loss_mlp": 1.0199753, + "epoch": 0.4154516759356681, + "flos": 21903677654400.0, + "grad_norm": 1.5482572725407813, + "language_loss": 0.8710891, + "learning_rate": 2.633643828093996e-06, + "loss": 0.89276576, + "num_input_tokens_seen": 148358715, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.13470459, + "step": 6910, + "time_per_iteration": 3.889023542404175 + }, + { + "auxiliary_loss_clip": 0.01056307, + "auxiliary_loss_mlp": 0.01011655, + "balance_loss_clip": 1.02817249, + "balance_loss_mlp": 1.00996566, + "epoch": 0.4155117991883361, + "flos": 67833677226240.0, + "grad_norm": 0.8042411600000402, + "language_loss": 0.62200141, + "learning_rate": 2.633274417503128e-06, + "loss": 0.64268106, + "num_input_tokens_seen": 148417280, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 0.01693726, + "step": 6911, + "time_per_iteration": 3.0966062545776367 + }, + { + "auxiliary_loss_clip": 0.01141617, + "auxiliary_loss_mlp": 0.01038961, + "balance_loss_clip": 1.0552516, + "balance_loss_mlp": 1.02407169, + "epoch": 0.41557192244100405, + "flos": 14282679934080.0, + "grad_norm": 2.304384628615876, + "language_loss": 0.87815881, + "learning_rate": 2.6329049828984312e-06, + "loss": 0.89996457, + "num_input_tokens_seen": 148432610, + "router_z_loss_clip": 0.86328125, + "router_z_loss_mlp": 0.14886475, + "step": 6912, + "time_per_iteration": 2.42100191116333 + }, + { + "auxiliary_loss_clip": 0.01138218, + "auxiliary_loss_mlp": 0.01032234, + "balance_loss_clip": 1.05671239, + "balance_loss_mlp": 1.01856041, + "epoch": 0.415632045693672, + "flos": 24461954064000.0, + "grad_norm": 2.6431529150007815, + "language_loss": 0.63045192, + "learning_rate": 2.632535524293914e-06, + "loss": 0.65215647, + "num_input_tokens_seen": 148451510, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.13671875, + "step": 6913, + "time_per_iteration": 2.476404905319214 + }, + { + "auxiliary_loss_clip": 0.01127059, + "auxiliary_loss_mlp": 0.01037953, + "balance_loss_clip": 1.04792976, + "balance_loss_mlp": 1.02272964, + "epoch": 0.41569216894634, + "flos": 20115290378880.0, + "grad_norm": 1.7711331353267898, + "language_loss": 0.75240922, + "learning_rate": 2.632166041703586e-06, + "loss": 0.77405936, + "num_input_tokens_seen": 148469945, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.15234375, + "step": 6914, + "time_per_iteration": 2.5317134857177734 + }, + { + "auxiliary_loss_clip": 0.01136754, + "auxiliary_loss_mlp": 0.01046668, + "balance_loss_clip": 1.05351746, + "balance_loss_mlp": 1.03065848, + "epoch": 0.41575229219900794, + "flos": 23798827128960.0, + "grad_norm": 2.04827057552691, + "language_loss": 0.87684703, + "learning_rate": 2.631796535141458e-06, + "loss": 0.89868128, + "num_input_tokens_seen": 148486655, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.16009521, + "step": 6915, + "time_per_iteration": 2.4649465084075928 + }, + { + "auxiliary_loss_clip": 0.01140763, + "auxiliary_loss_mlp": 0.01036165, + "balance_loss_clip": 1.05789137, + "balance_loss_mlp": 1.02212834, + "epoch": 0.4158124154516759, + "flos": 23108229267840.0, + "grad_norm": 1.8988052655480976, + "language_loss": 0.70933777, + "learning_rate": 2.6314270046215426e-06, + "loss": 0.73110706, + "num_input_tokens_seen": 148505035, + "router_z_loss_clip": 0.82861328, + "router_z_loss_mlp": 0.14031982, + "step": 6916, + "time_per_iteration": 2.4552478790283203 + }, + { + "auxiliary_loss_clip": 0.01146787, + "auxiliary_loss_mlp": 0.01031586, + "balance_loss_clip": 1.06089854, + "balance_loss_mlp": 1.01650608, + "epoch": 0.41587253870434393, + "flos": 24242970798720.0, + "grad_norm": 1.528357132544792, + "language_loss": 0.7148419, + "learning_rate": 2.631057450157852e-06, + "loss": 0.73662567, + "num_input_tokens_seen": 148525575, + "router_z_loss_clip": 0.85888672, + "router_z_loss_mlp": 0.15100098, + "step": 6917, + "time_per_iteration": 2.486645221710205 + }, + { + "auxiliary_loss_clip": 0.0113907, + "auxiliary_loss_mlp": 0.01029746, + "balance_loss_clip": 1.05756342, + "balance_loss_mlp": 1.0157094, + "epoch": 0.4159326619570119, + "flos": 23881602021120.0, + "grad_norm": 1.4691489632811126, + "language_loss": 0.80866915, + "learning_rate": 2.6306878717643988e-06, + "loss": 0.83035731, + "num_input_tokens_seen": 148547270, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.14050293, + "step": 6918, + "time_per_iteration": 2.4965438842773438 + }, + { + "auxiliary_loss_clip": 0.01148631, + "auxiliary_loss_mlp": 0.01034761, + "balance_loss_clip": 1.06215823, + "balance_loss_mlp": 1.01964498, + "epoch": 0.41599278520967986, + "flos": 40626531354240.0, + "grad_norm": 1.4636080509673355, + "language_loss": 0.70148224, + "learning_rate": 2.6303182694551995e-06, + "loss": 0.72331613, + "num_input_tokens_seen": 148572100, + "router_z_loss_clip": 0.86572266, + "router_z_loss_mlp": 0.15124512, + "step": 6919, + "time_per_iteration": 2.6681809425354004 + }, + { + "auxiliary_loss_clip": 0.01144387, + "auxiliary_loss_mlp": 0.01034799, + "balance_loss_clip": 1.06070971, + "balance_loss_mlp": 1.01955247, + "epoch": 0.4160529084623478, + "flos": 18222942165120.0, + "grad_norm": 2.011493138347623, + "language_loss": 0.81801742, + "learning_rate": 2.6299486432442677e-06, + "loss": 0.8398093, + "num_input_tokens_seen": 148591245, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.15234375, + "step": 6920, + "time_per_iteration": 2.4750661849975586 + }, + { + "auxiliary_loss_clip": 0.01147943, + "auxiliary_loss_mlp": 0.01034636, + "balance_loss_clip": 1.06331122, + "balance_loss_mlp": 1.0194428, + "epoch": 0.4161130317150158, + "flos": 13661963982720.0, + "grad_norm": 2.7037088475566695, + "language_loss": 0.65086901, + "learning_rate": 2.6295789931456195e-06, + "loss": 0.6726948, + "num_input_tokens_seen": 148607980, + "router_z_loss_clip": 0.84716797, + "router_z_loss_mlp": 0.1519165, + "step": 6921, + "time_per_iteration": 2.462193727493286 + }, + { + "auxiliary_loss_clip": 0.01143673, + "auxiliary_loss_mlp": 0.01034821, + "balance_loss_clip": 1.06147194, + "balance_loss_mlp": 1.02031302, + "epoch": 0.41617315496768376, + "flos": 16178511767040.0, + "grad_norm": 1.9992308673366808, + "language_loss": 0.80639273, + "learning_rate": 2.629209319173274e-06, + "loss": 0.82817769, + "num_input_tokens_seen": 148624490, + "router_z_loss_clip": 0.82226562, + "router_z_loss_mlp": 0.14501953, + "step": 6922, + "time_per_iteration": 2.489786386489868 + }, + { + "auxiliary_loss_clip": 0.01134537, + "auxiliary_loss_mlp": 0.01036275, + "balance_loss_clip": 1.05207336, + "balance_loss_mlp": 1.0207541, + "epoch": 0.4162332782203517, + "flos": 26213317395840.0, + "grad_norm": 1.6783804544804857, + "language_loss": 0.67855901, + "learning_rate": 2.628839621341247e-06, + "loss": 0.70026708, + "num_input_tokens_seen": 148646490, + "router_z_loss_clip": 0.82568359, + "router_z_loss_mlp": 0.15527344, + "step": 6923, + "time_per_iteration": 2.5322985649108887 + }, + { + "auxiliary_loss_clip": 0.01136081, + "auxiliary_loss_mlp": 0.01045539, + "balance_loss_clip": 1.05383658, + "balance_loss_mlp": 1.03002369, + "epoch": 0.4162934014730197, + "flos": 28183987215360.0, + "grad_norm": 2.0286467845321274, + "language_loss": 0.76234484, + "learning_rate": 2.6284698996635593e-06, + "loss": 0.78416097, + "num_input_tokens_seen": 148668580, + "router_z_loss_clip": 0.82226562, + "router_z_loss_mlp": 0.1552124, + "step": 6924, + "time_per_iteration": 2.5725739002227783 + }, + { + "auxiliary_loss_clip": 0.01127626, + "auxiliary_loss_mlp": 0.01037128, + "balance_loss_clip": 1.04559612, + "balance_loss_mlp": 1.02285814, + "epoch": 0.41635352472568765, + "flos": 19865316654720.0, + "grad_norm": 1.7837741869814383, + "language_loss": 0.72967315, + "learning_rate": 2.62810015415423e-06, + "loss": 0.75132066, + "num_input_tokens_seen": 148688410, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.14276123, + "step": 6925, + "time_per_iteration": 2.557586908340454 + }, + { + "auxiliary_loss_clip": 0.01131645, + "auxiliary_loss_mlp": 0.01033049, + "balance_loss_clip": 1.05050123, + "balance_loss_mlp": 1.01887536, + "epoch": 0.4164136479783556, + "flos": 14935356011520.0, + "grad_norm": 2.25702126277374, + "language_loss": 0.84031367, + "learning_rate": 2.6277303848272792e-06, + "loss": 0.86196065, + "num_input_tokens_seen": 148704855, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.14178467, + "step": 6926, + "time_per_iteration": 2.451552391052246 + }, + { + "auxiliary_loss_clip": 0.01131318, + "auxiliary_loss_mlp": 0.01035557, + "balance_loss_clip": 1.05285656, + "balance_loss_mlp": 1.02284956, + "epoch": 0.4164737712310236, + "flos": 21757593041280.0, + "grad_norm": 1.696293636491344, + "language_loss": 0.86496866, + "learning_rate": 2.6273605916967302e-06, + "loss": 0.88663739, + "num_input_tokens_seen": 148723065, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.12713623, + "step": 6927, + "time_per_iteration": 3.992443799972534 + }, + { + "auxiliary_loss_clip": 0.01127357, + "auxiliary_loss_mlp": 0.01046913, + "balance_loss_clip": 1.04695046, + "balance_loss_mlp": 1.03003263, + "epoch": 0.41653389448369155, + "flos": 20740136394240.0, + "grad_norm": 2.1000608790911177, + "language_loss": 0.7255941, + "learning_rate": 2.626990774776604e-06, + "loss": 0.74733675, + "num_input_tokens_seen": 148741780, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.16882324, + "step": 6928, + "time_per_iteration": 2.484719753265381 + }, + { + "auxiliary_loss_clip": 0.01129477, + "auxiliary_loss_mlp": 0.01038901, + "balance_loss_clip": 1.04847693, + "balance_loss_mlp": 1.02445889, + "epoch": 0.4165940177363595, + "flos": 24972891073920.0, + "grad_norm": 2.0882134591121333, + "language_loss": 0.77753735, + "learning_rate": 2.6266209340809254e-06, + "loss": 0.79922104, + "num_input_tokens_seen": 148759795, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.14459229, + "step": 6929, + "time_per_iteration": 2.5106723308563232 + }, + { + "auxiliary_loss_clip": 0.01135579, + "auxiliary_loss_mlp": 0.010416, + "balance_loss_clip": 1.0524857, + "balance_loss_mlp": 1.02698517, + "epoch": 0.41665414098902753, + "flos": 20521727746560.0, + "grad_norm": 2.567234117892116, + "language_loss": 0.71126068, + "learning_rate": 2.6262510696237182e-06, + "loss": 0.73303246, + "num_input_tokens_seen": 148778680, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.14624023, + "step": 6930, + "time_per_iteration": 2.4488391876220703 + }, + { + "auxiliary_loss_clip": 0.01135033, + "auxiliary_loss_mlp": 0.01040178, + "balance_loss_clip": 1.0525502, + "balance_loss_mlp": 1.02579522, + "epoch": 0.4167142642416955, + "flos": 19682926369920.0, + "grad_norm": 1.7044155143815154, + "language_loss": 0.80847901, + "learning_rate": 2.625881181419007e-06, + "loss": 0.83023107, + "num_input_tokens_seen": 148796470, + "router_z_loss_clip": 0.82568359, + "router_z_loss_mlp": 0.1439209, + "step": 6931, + "time_per_iteration": 2.4230897426605225 + }, + { + "auxiliary_loss_clip": 0.01136201, + "auxiliary_loss_mlp": 0.01045065, + "balance_loss_clip": 1.05096865, + "balance_loss_mlp": 1.02942443, + "epoch": 0.41677438749436346, + "flos": 23763742519680.0, + "grad_norm": 2.16426238026528, + "language_loss": 0.79298347, + "learning_rate": 2.6255112694808193e-06, + "loss": 0.81479615, + "num_input_tokens_seen": 148815300, + "router_z_loss_clip": 0.85302734, + "router_z_loss_mlp": 0.15649414, + "step": 6932, + "time_per_iteration": 2.496091365814209 + }, + { + "auxiliary_loss_clip": 0.0113498, + "auxiliary_loss_mlp": 0.01036587, + "balance_loss_clip": 1.05096507, + "balance_loss_mlp": 1.02157259, + "epoch": 0.41683451074703143, + "flos": 30410053712640.0, + "grad_norm": 2.1367251321994982, + "language_loss": 0.81758666, + "learning_rate": 2.6251413338231813e-06, + "loss": 0.8393023, + "num_input_tokens_seen": 148834315, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.15002441, + "step": 6933, + "time_per_iteration": 2.5168490409851074 + }, + { + "auxiliary_loss_clip": 0.0113594, + "auxiliary_loss_mlp": 0.01034317, + "balance_loss_clip": 1.05193305, + "balance_loss_mlp": 1.0182476, + "epoch": 0.4168946339996994, + "flos": 21506757390720.0, + "grad_norm": 2.563686606782526, + "language_loss": 0.77008784, + "learning_rate": 2.624771374460121e-06, + "loss": 0.79179037, + "num_input_tokens_seen": 148852420, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.16064453, + "step": 6934, + "time_per_iteration": 2.453519344329834 + }, + { + "auxiliary_loss_clip": 0.01143703, + "auxiliary_loss_mlp": 0.01035072, + "balance_loss_clip": 1.05858374, + "balance_loss_mlp": 1.02092242, + "epoch": 0.41695475725236736, + "flos": 17638675539840.0, + "grad_norm": 2.0332925422423873, + "language_loss": 0.67465973, + "learning_rate": 2.624401391405668e-06, + "loss": 0.69644743, + "num_input_tokens_seen": 148869305, + "router_z_loss_clip": 0.85009766, + "router_z_loss_mlp": 0.14160156, + "step": 6935, + "time_per_iteration": 2.408168315887451 + }, + { + "auxiliary_loss_clip": 0.01136427, + "auxiliary_loss_mlp": 0.01042249, + "balance_loss_clip": 1.05590439, + "balance_loss_mlp": 1.02769315, + "epoch": 0.4170148805050353, + "flos": 15668903560320.0, + "grad_norm": 2.1503716841793636, + "language_loss": 0.73138338, + "learning_rate": 2.6240313846738513e-06, + "loss": 0.75317013, + "num_input_tokens_seen": 148886395, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.14538574, + "step": 6936, + "time_per_iteration": 2.479581832885742 + }, + { + "auxiliary_loss_clip": 0.01132532, + "auxiliary_loss_mlp": 0.01035239, + "balance_loss_clip": 1.05355167, + "balance_loss_mlp": 1.02153039, + "epoch": 0.4170750037577033, + "flos": 15159151699200.0, + "grad_norm": 1.9832875091613886, + "language_loss": 0.74671638, + "learning_rate": 2.6236613542787024e-06, + "loss": 0.76839411, + "num_input_tokens_seen": 148905235, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.13720703, + "step": 6937, + "time_per_iteration": 3.98172664642334 + }, + { + "auxiliary_loss_clip": 0.01135819, + "auxiliary_loss_mlp": 0.01038012, + "balance_loss_clip": 1.056077, + "balance_loss_mlp": 1.02442789, + "epoch": 0.41713512701037125, + "flos": 28768289754240.0, + "grad_norm": 1.7972774699312828, + "language_loss": 0.84297878, + "learning_rate": 2.6232913002342518e-06, + "loss": 0.86471713, + "num_input_tokens_seen": 148928130, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.13604736, + "step": 6938, + "time_per_iteration": 2.5457420349121094 + }, + { + "auxiliary_loss_clip": 0.01135942, + "auxiliary_loss_mlp": 0.0103858, + "balance_loss_clip": 1.05368328, + "balance_loss_mlp": 1.02373886, + "epoch": 0.4171952502630392, + "flos": 28256993608320.0, + "grad_norm": 2.0275834036624376, + "language_loss": 0.74400485, + "learning_rate": 2.6229212225545334e-06, + "loss": 0.76575011, + "num_input_tokens_seen": 148948790, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.14837646, + "step": 6939, + "time_per_iteration": 2.5393478870391846 + }, + { + "auxiliary_loss_clip": 0.01132789, + "auxiliary_loss_mlp": 0.01037948, + "balance_loss_clip": 1.05057108, + "balance_loss_mlp": 1.02317786, + "epoch": 0.4172553735157072, + "flos": 24571697091840.0, + "grad_norm": 1.7496531900805996, + "language_loss": 0.75147521, + "learning_rate": 2.622551121253579e-06, + "loss": 0.77318257, + "num_input_tokens_seen": 148967690, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.14782715, + "step": 6940, + "time_per_iteration": 2.5153720378875732 + }, + { + "auxiliary_loss_clip": 0.01137384, + "auxiliary_loss_mlp": 0.01038485, + "balance_loss_clip": 1.05633867, + "balance_loss_mlp": 1.02461493, + "epoch": 0.41731549676837515, + "flos": 27045797978880.0, + "grad_norm": 1.884609203936739, + "language_loss": 0.71216285, + "learning_rate": 2.622180996345424e-06, + "loss": 0.73392153, + "num_input_tokens_seen": 148987150, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.13873291, + "step": 6941, + "time_per_iteration": 2.6713714599609375 + }, + { + "auxiliary_loss_clip": 0.01131801, + "auxiliary_loss_mlp": 0.01041307, + "balance_loss_clip": 1.04952312, + "balance_loss_mlp": 1.02703226, + "epoch": 0.4173756200210431, + "flos": 28394063907840.0, + "grad_norm": 2.1948338242048173, + "language_loss": 0.7367233, + "learning_rate": 2.621810847844104e-06, + "loss": 0.75845444, + "num_input_tokens_seen": 149004895, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.14276123, + "step": 6942, + "time_per_iteration": 2.575347900390625 + }, + { + "auxiliary_loss_clip": 0.01136861, + "auxiliary_loss_mlp": 0.01041338, + "balance_loss_clip": 1.05241001, + "balance_loss_mlp": 1.02590013, + "epoch": 0.41743574327371114, + "flos": 22521556431360.0, + "grad_norm": 2.3289400268640468, + "language_loss": 0.72754526, + "learning_rate": 2.6214406757636534e-06, + "loss": 0.74932718, + "num_input_tokens_seen": 149020970, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.15441895, + "step": 6943, + "time_per_iteration": 2.494861364364624 + }, + { + "auxiliary_loss_clip": 0.01141368, + "auxiliary_loss_mlp": 0.01030086, + "balance_loss_clip": 1.05796528, + "balance_loss_mlp": 1.01467192, + "epoch": 0.4174958665263791, + "flos": 30113431200000.0, + "grad_norm": 3.300151521822208, + "language_loss": 0.63787323, + "learning_rate": 2.621070480118111e-06, + "loss": 0.65958774, + "num_input_tokens_seen": 149041795, + "router_z_loss_clip": 0.83447266, + "router_z_loss_mlp": 0.1541748, + "step": 6944, + "time_per_iteration": 2.560426950454712 + }, + { + "auxiliary_loss_clip": 0.01140898, + "auxiliary_loss_mlp": 0.01035919, + "balance_loss_clip": 1.05969644, + "balance_loss_mlp": 1.02204275, + "epoch": 0.41755598977904707, + "flos": 25263444188160.0, + "grad_norm": 1.4569038188413872, + "language_loss": 0.70150733, + "learning_rate": 2.620700260921513e-06, + "loss": 0.7232756, + "num_input_tokens_seen": 149063700, + "router_z_loss_clip": 0.81201172, + "router_z_loss_mlp": 0.13879395, + "step": 6945, + "time_per_iteration": 2.5282270908355713 + }, + { + "auxiliary_loss_clip": 0.01131623, + "auxiliary_loss_mlp": 0.0103821, + "balance_loss_clip": 1.05088568, + "balance_loss_mlp": 1.02249789, + "epoch": 0.41761611303171503, + "flos": 19828580019840.0, + "grad_norm": 1.9951661004190981, + "language_loss": 0.80505669, + "learning_rate": 2.620330018187899e-06, + "loss": 0.82675505, + "num_input_tokens_seen": 149082410, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.15716553, + "step": 6946, + "time_per_iteration": 2.4758036136627197 + }, + { + "auxiliary_loss_clip": 0.01134997, + "auxiliary_loss_mlp": 0.01034312, + "balance_loss_clip": 1.05490386, + "balance_loss_mlp": 1.01987588, + "epoch": 0.417676236284383, + "flos": 15523249910400.0, + "grad_norm": 2.667344840679058, + "language_loss": 0.7734617, + "learning_rate": 2.6199597519313086e-06, + "loss": 0.79515481, + "num_input_tokens_seen": 149098745, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.14428711, + "step": 6947, + "time_per_iteration": 2.487246036529541 + }, + { + "auxiliary_loss_clip": 0.01131249, + "auxiliary_loss_mlp": 0.01031914, + "balance_loss_clip": 1.04886413, + "balance_loss_mlp": 1.01682186, + "epoch": 0.41773635953705096, + "flos": 32524473761280.0, + "grad_norm": 2.066848042806309, + "language_loss": 0.71832943, + "learning_rate": 2.6195894621657825e-06, + "loss": 0.73996103, + "num_input_tokens_seen": 149122255, + "router_z_loss_clip": 0.82275391, + "router_z_loss_mlp": 0.15093994, + "step": 6948, + "time_per_iteration": 2.6169700622558594 + }, + { + "auxiliary_loss_clip": 0.01133635, + "auxiliary_loss_mlp": 0.01031438, + "balance_loss_clip": 1.053087, + "balance_loss_mlp": 1.01711488, + "epoch": 0.4177964827897189, + "flos": 23440941970560.0, + "grad_norm": 1.5614317319821882, + "language_loss": 0.76927447, + "learning_rate": 2.619219148905362e-06, + "loss": 0.79092526, + "num_input_tokens_seen": 149142845, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.14312744, + "step": 6949, + "time_per_iteration": 2.5178980827331543 + }, + { + "auxiliary_loss_clip": 0.01142992, + "auxiliary_loss_mlp": 0.01039423, + "balance_loss_clip": 1.05789042, + "balance_loss_mlp": 1.02430725, + "epoch": 0.4178566060423869, + "flos": 22748907565440.0, + "grad_norm": 1.6040543755658603, + "language_loss": 0.81955075, + "learning_rate": 2.6188488121640888e-06, + "loss": 0.84137487, + "num_input_tokens_seen": 149163375, + "router_z_loss_clip": 0.85205078, + "router_z_loss_mlp": 0.15118408, + "step": 6950, + "time_per_iteration": 3.8759872913360596 + }, + { + "auxiliary_loss_clip": 0.01133602, + "auxiliary_loss_mlp": 0.01035035, + "balance_loss_clip": 1.05288827, + "balance_loss_mlp": 1.02149308, + "epoch": 0.41791672929505486, + "flos": 26032794618240.0, + "grad_norm": 1.3819149484555464, + "language_loss": 0.76311791, + "learning_rate": 2.618478451956007e-06, + "loss": 0.78480428, + "num_input_tokens_seen": 149185610, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.13537598, + "step": 6951, + "time_per_iteration": 2.575437307357788 + }, + { + "auxiliary_loss_clip": 0.01145368, + "auxiliary_loss_mlp": 0.01033, + "balance_loss_clip": 1.05660796, + "balance_loss_mlp": 1.01796722, + "epoch": 0.4179768525477228, + "flos": 19568694142080.0, + "grad_norm": 2.4108344553051944, + "language_loss": 0.73309803, + "learning_rate": 2.61810806829516e-06, + "loss": 0.75488174, + "num_input_tokens_seen": 149203990, + "router_z_loss_clip": 0.88867188, + "router_z_loss_mlp": 0.15032959, + "step": 6952, + "time_per_iteration": 2.5589234828948975 + }, + { + "auxiliary_loss_clip": 0.01139586, + "auxiliary_loss_mlp": 0.01033345, + "balance_loss_clip": 1.05736423, + "balance_loss_mlp": 1.02019, + "epoch": 0.4180369758003908, + "flos": 17783826399360.0, + "grad_norm": 2.1199830418392995, + "language_loss": 0.71959841, + "learning_rate": 2.617737661195593e-06, + "loss": 0.74132776, + "num_input_tokens_seen": 149221385, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.13146973, + "step": 6953, + "time_per_iteration": 2.487415075302124 + }, + { + "auxiliary_loss_clip": 0.01140432, + "auxiliary_loss_mlp": 0.01038008, + "balance_loss_clip": 1.06008017, + "balance_loss_mlp": 1.02262354, + "epoch": 0.41809709905305875, + "flos": 20960663944320.0, + "grad_norm": 1.850900879242373, + "language_loss": 0.76159358, + "learning_rate": 2.617367230671353e-06, + "loss": 0.78337795, + "num_input_tokens_seen": 149241175, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.15386963, + "step": 6954, + "time_per_iteration": 3.9627647399902344 + }, + { + "auxiliary_loss_clip": 0.01134716, + "auxiliary_loss_mlp": 0.01044138, + "balance_loss_clip": 1.05077004, + "balance_loss_mlp": 1.02738941, + "epoch": 0.4181572223057267, + "flos": 22017622573440.0, + "grad_norm": 5.344017946101352, + "language_loss": 0.84648961, + "learning_rate": 2.616996776736485e-06, + "loss": 0.86827815, + "num_input_tokens_seen": 149259115, + "router_z_loss_clip": 0.83886719, + "router_z_loss_mlp": 0.1673584, + "step": 6955, + "time_per_iteration": 2.467714309692383 + }, + { + "auxiliary_loss_clip": 0.01137095, + "auxiliary_loss_mlp": 0.01040249, + "balance_loss_clip": 1.05690742, + "balance_loss_mlp": 1.02593219, + "epoch": 0.4182173455583947, + "flos": 26245528917120.0, + "grad_norm": 1.5194748210009004, + "language_loss": 0.83286345, + "learning_rate": 2.616626299405037e-06, + "loss": 0.85463691, + "num_input_tokens_seen": 149278705, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.14312744, + "step": 6956, + "time_per_iteration": 2.4835309982299805 + }, + { + "auxiliary_loss_clip": 0.01139252, + "auxiliary_loss_mlp": 0.01039944, + "balance_loss_clip": 1.05570531, + "balance_loss_mlp": 1.02424419, + "epoch": 0.4182774688110627, + "flos": 14791605782400.0, + "grad_norm": 2.146513179194364, + "language_loss": 0.72054851, + "learning_rate": 2.616255798691059e-06, + "loss": 0.74234045, + "num_input_tokens_seen": 149294040, + "router_z_loss_clip": 0.83544922, + "router_z_loss_mlp": 0.15698242, + "step": 6957, + "time_per_iteration": 2.435068368911743 + }, + { + "auxiliary_loss_clip": 0.01132404, + "auxiliary_loss_mlp": 0.01043687, + "balance_loss_clip": 1.05073762, + "balance_loss_mlp": 1.02941167, + "epoch": 0.41833759206373067, + "flos": 20412020632320.0, + "grad_norm": 2.0113103423668344, + "language_loss": 0.75329369, + "learning_rate": 2.6158852746085982e-06, + "loss": 0.77505463, + "num_input_tokens_seen": 149310385, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.14282227, + "step": 6958, + "time_per_iteration": 2.4969658851623535 + }, + { + "auxiliary_loss_clip": 0.01137263, + "auxiliary_loss_mlp": 0.0103806, + "balance_loss_clip": 1.05454111, + "balance_loss_mlp": 1.02246749, + "epoch": 0.41839771531639863, + "flos": 23656333875840.0, + "grad_norm": 2.001480130562596, + "language_loss": 0.77284324, + "learning_rate": 2.6155147271717066e-06, + "loss": 0.79459643, + "num_input_tokens_seen": 149328235, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.15594482, + "step": 6959, + "time_per_iteration": 2.46566104888916 + }, + { + "auxiliary_loss_clip": 0.0114765, + "auxiliary_loss_mlp": 0.01039174, + "balance_loss_clip": 1.0648154, + "balance_loss_mlp": 1.02319956, + "epoch": 0.4184578385690666, + "flos": 19754137082880.0, + "grad_norm": 2.5971201357777587, + "language_loss": 0.77534926, + "learning_rate": 2.6151441563944347e-06, + "loss": 0.79721749, + "num_input_tokens_seen": 149347465, + "router_z_loss_clip": 0.82861328, + "router_z_loss_mlp": 0.159729, + "step": 6960, + "time_per_iteration": 2.467252254486084 + }, + { + "auxiliary_loss_clip": 0.01132663, + "auxiliary_loss_mlp": 0.01038303, + "balance_loss_clip": 1.05453181, + "balance_loss_mlp": 1.02178109, + "epoch": 0.41851796182173456, + "flos": 20193396503040.0, + "grad_norm": 1.845304620287022, + "language_loss": 0.76174086, + "learning_rate": 2.614773562290835e-06, + "loss": 0.78345054, + "num_input_tokens_seen": 149366685, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.1652832, + "step": 6961, + "time_per_iteration": 2.4445700645446777 + }, + { + "auxiliary_loss_clip": 0.01083203, + "auxiliary_loss_mlp": 0.01007275, + "balance_loss_clip": 1.05471396, + "balance_loss_mlp": 1.00526667, + "epoch": 0.41857808507440253, + "flos": 59018794231680.0, + "grad_norm": 0.7866408007428634, + "language_loss": 0.54730201, + "learning_rate": 2.61440294487496e-06, + "loss": 0.56820679, + "num_input_tokens_seen": 149422925, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.02008057, + "step": 6962, + "time_per_iteration": 3.017765522003174 + }, + { + "auxiliary_loss_clip": 0.0114174, + "auxiliary_loss_mlp": 0.01041847, + "balance_loss_clip": 1.05623913, + "balance_loss_mlp": 1.02651691, + "epoch": 0.4186382083270705, + "flos": 18478805719680.0, + "grad_norm": 1.8959744745582612, + "language_loss": 0.85304362, + "learning_rate": 2.614032304160864e-06, + "loss": 0.87487948, + "num_input_tokens_seen": 149440820, + "router_z_loss_clip": 0.85498047, + "router_z_loss_mlp": 0.15344238, + "step": 6963, + "time_per_iteration": 2.425997734069824 + }, + { + "auxiliary_loss_clip": 0.01143727, + "auxiliary_loss_mlp": 0.01042312, + "balance_loss_clip": 1.06142747, + "balance_loss_mlp": 1.02667141, + "epoch": 0.41869833157973846, + "flos": 21578758202880.0, + "grad_norm": 1.47671951960244, + "language_loss": 0.70013714, + "learning_rate": 2.6136616401626014e-06, + "loss": 0.7219975, + "num_input_tokens_seen": 149461060, + "router_z_loss_clip": 0.82275391, + "router_z_loss_mlp": 0.15661621, + "step": 6964, + "time_per_iteration": 2.5219576358795166 + }, + { + "auxiliary_loss_clip": 0.01139867, + "auxiliary_loss_mlp": 0.01035768, + "balance_loss_clip": 1.05951226, + "balance_loss_mlp": 1.02192163, + "epoch": 0.4187584548324064, + "flos": 35517412650240.0, + "grad_norm": 1.7367896159493668, + "language_loss": 0.7111634, + "learning_rate": 2.6132909528942273e-06, + "loss": 0.73291981, + "num_input_tokens_seen": 149483115, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.13842773, + "step": 6965, + "time_per_iteration": 2.589871406555176 + }, + { + "auxiliary_loss_clip": 0.01145606, + "auxiliary_loss_mlp": 0.01038283, + "balance_loss_clip": 1.06396937, + "balance_loss_mlp": 1.02477062, + "epoch": 0.4188185780850744, + "flos": 18655880791680.0, + "grad_norm": 1.7192579994599295, + "language_loss": 0.71896404, + "learning_rate": 2.6129202423697997e-06, + "loss": 0.74080294, + "num_input_tokens_seen": 149501495, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.13513184, + "step": 6966, + "time_per_iteration": 2.500422954559326 + }, + { + "auxiliary_loss_clip": 0.011478, + "auxiliary_loss_mlp": 0.01036444, + "balance_loss_clip": 1.05912995, + "balance_loss_mlp": 1.02104831, + "epoch": 0.41887870133774235, + "flos": 40333428374400.0, + "grad_norm": 2.158097320126939, + "language_loss": 0.70930767, + "learning_rate": 2.612549508603375e-06, + "loss": 0.73115009, + "num_input_tokens_seen": 149523170, + "router_z_loss_clip": 0.88769531, + "router_z_loss_mlp": 0.15405273, + "step": 6967, + "time_per_iteration": 2.734548568725586 + }, + { + "auxiliary_loss_clip": 0.0107989, + "auxiliary_loss_mlp": 0.01002734, + "balance_loss_clip": 1.05102789, + "balance_loss_mlp": 1.00069547, + "epoch": 0.4189388245904103, + "flos": 61371336516480.0, + "grad_norm": 0.673772886679582, + "language_loss": 0.46221235, + "learning_rate": 2.612178751609011e-06, + "loss": 0.4830386, + "num_input_tokens_seen": 149583955, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.02035522, + "step": 6968, + "time_per_iteration": 3.1058645248413086 + }, + { + "auxiliary_loss_clip": 0.01139445, + "auxiliary_loss_mlp": 0.0103743, + "balance_loss_clip": 1.05407476, + "balance_loss_mlp": 1.02128911, + "epoch": 0.4189989478430783, + "flos": 28215624119040.0, + "grad_norm": 1.7220329201702034, + "language_loss": 0.75197214, + "learning_rate": 2.6118079714007685e-06, + "loss": 0.77374089, + "num_input_tokens_seen": 149604440, + "router_z_loss_clip": 0.85400391, + "router_z_loss_mlp": 0.16137695, + "step": 6969, + "time_per_iteration": 2.5614187717437744 + }, + { + "auxiliary_loss_clip": 0.01133553, + "auxiliary_loss_mlp": 0.01032204, + "balance_loss_clip": 1.0519197, + "balance_loss_mlp": 1.01857841, + "epoch": 0.4190590710957463, + "flos": 24565879088640.0, + "grad_norm": 1.770894765371981, + "language_loss": 0.80847716, + "learning_rate": 2.611437167992705e-06, + "loss": 0.83013469, + "num_input_tokens_seen": 149623745, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.13616943, + "step": 6970, + "time_per_iteration": 2.6363298892974854 + }, + { + "auxiliary_loss_clip": 0.01137358, + "auxiliary_loss_mlp": 0.01033658, + "balance_loss_clip": 1.05599821, + "balance_loss_mlp": 1.01907885, + "epoch": 0.41911919434841427, + "flos": 21726027964800.0, + "grad_norm": 1.9008686786111233, + "language_loss": 0.8267498, + "learning_rate": 2.6110663413988835e-06, + "loss": 0.84845996, + "num_input_tokens_seen": 149643025, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.14550781, + "step": 6971, + "time_per_iteration": 3.905534267425537 + }, + { + "auxiliary_loss_clip": 0.01140209, + "auxiliary_loss_mlp": 0.01038787, + "balance_loss_clip": 1.05684996, + "balance_loss_mlp": 1.02291453, + "epoch": 0.41917931760108224, + "flos": 17601543855360.0, + "grad_norm": 1.7303397987290663, + "language_loss": 0.7491926, + "learning_rate": 2.6106954916333648e-06, + "loss": 0.7709825, + "num_input_tokens_seen": 149660695, + "router_z_loss_clip": 0.83349609, + "router_z_loss_mlp": 0.1585083, + "step": 6972, + "time_per_iteration": 2.421743154525757 + }, + { + "auxiliary_loss_clip": 0.01137752, + "auxiliary_loss_mlp": 0.01041595, + "balance_loss_clip": 1.05583167, + "balance_loss_mlp": 1.02686107, + "epoch": 0.4192394408537502, + "flos": 37816701022080.0, + "grad_norm": 1.4520449784085103, + "language_loss": 0.72907102, + "learning_rate": 2.610324618710212e-06, + "loss": 0.75086451, + "num_input_tokens_seen": 149682040, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.1472168, + "step": 6973, + "time_per_iteration": 2.632549285888672 + }, + { + "auxiliary_loss_clip": 0.01145356, + "auxiliary_loss_mlp": 0.01041694, + "balance_loss_clip": 1.05805814, + "balance_loss_mlp": 1.02524316, + "epoch": 0.41929956410641817, + "flos": 23107726477440.0, + "grad_norm": 2.259713880859707, + "language_loss": 0.75038362, + "learning_rate": 2.609953722643489e-06, + "loss": 0.77225411, + "num_input_tokens_seen": 149700855, + "router_z_loss_clip": 0.87353516, + "router_z_loss_mlp": 0.16442871, + "step": 6974, + "time_per_iteration": 2.465794324874878 + }, + { + "auxiliary_loss_clip": 0.01142955, + "auxiliary_loss_mlp": 0.01033201, + "balance_loss_clip": 1.0592953, + "balance_loss_mlp": 1.01887774, + "epoch": 0.41935968735908613, + "flos": 22524537260160.0, + "grad_norm": 1.756472403945894, + "language_loss": 0.72516489, + "learning_rate": 2.609582803447259e-06, + "loss": 0.74692643, + "num_input_tokens_seen": 149717360, + "router_z_loss_clip": 0.83691406, + "router_z_loss_mlp": 0.14331055, + "step": 6975, + "time_per_iteration": 2.4734151363372803 + }, + { + "auxiliary_loss_clip": 0.011426, + "auxiliary_loss_mlp": 0.01038565, + "balance_loss_clip": 1.05920088, + "balance_loss_mlp": 1.02378321, + "epoch": 0.4194198106117541, + "flos": 26870446759680.0, + "grad_norm": 1.7560337631776404, + "language_loss": 0.80950791, + "learning_rate": 2.6092118611355885e-06, + "loss": 0.83131957, + "num_input_tokens_seen": 149738975, + "router_z_loss_clip": 0.83447266, + "router_z_loss_mlp": 0.14807129, + "step": 6976, + "time_per_iteration": 2.487454652786255 + }, + { + "auxiliary_loss_clip": 0.01136502, + "auxiliary_loss_mlp": 0.01030034, + "balance_loss_clip": 1.05482471, + "balance_loss_mlp": 1.01614022, + "epoch": 0.41947993386442206, + "flos": 19902412425600.0, + "grad_norm": 2.7349981278648126, + "language_loss": 0.67924368, + "learning_rate": 2.6088408957225425e-06, + "loss": 0.70090908, + "num_input_tokens_seen": 149757055, + "router_z_loss_clip": 0.81738281, + "router_z_loss_mlp": 0.13879395, + "step": 6977, + "time_per_iteration": 2.4604763984680176 + }, + { + "auxiliary_loss_clip": 0.01139012, + "auxiliary_loss_mlp": 0.01041201, + "balance_loss_clip": 1.05534387, + "balance_loss_mlp": 1.02590609, + "epoch": 0.41954005711709, + "flos": 17383889393280.0, + "grad_norm": 2.626599340917122, + "language_loss": 0.81220448, + "learning_rate": 2.6084699072221898e-06, + "loss": 0.83400667, + "num_input_tokens_seen": 149772885, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.1529541, + "step": 6978, + "time_per_iteration": 2.4293622970581055 + }, + { + "auxiliary_loss_clip": 0.01132266, + "auxiliary_loss_mlp": 0.01040146, + "balance_loss_clip": 1.04836059, + "balance_loss_mlp": 1.02480435, + "epoch": 0.419600180369758, + "flos": 25003306915200.0, + "grad_norm": 1.682873615902972, + "language_loss": 0.82942754, + "learning_rate": 2.6080988956485964e-06, + "loss": 0.85115165, + "num_input_tokens_seen": 149791515, + "router_z_loss_clip": 0.83789062, + "router_z_loss_mlp": 0.15332031, + "step": 6979, + "time_per_iteration": 2.540107488632202 + }, + { + "auxiliary_loss_clip": 0.01135908, + "auxiliary_loss_mlp": 0.01037485, + "balance_loss_clip": 1.05366325, + "balance_loss_mlp": 1.02275658, + "epoch": 0.41966030362242596, + "flos": 17383781652480.0, + "grad_norm": 1.8856213897722929, + "language_loss": 0.83299458, + "learning_rate": 2.6077278610158325e-06, + "loss": 0.85472846, + "num_input_tokens_seen": 149807250, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.14727783, + "step": 6980, + "time_per_iteration": 3.9930951595306396 + }, + { + "auxiliary_loss_clip": 0.01146732, + "auxiliary_loss_mlp": 0.01043686, + "balance_loss_clip": 1.05888963, + "balance_loss_mlp": 1.02854609, + "epoch": 0.4197204268750939, + "flos": 22156165330560.0, + "grad_norm": 2.6645898719413936, + "language_loss": 0.79545033, + "learning_rate": 2.6073568033379665e-06, + "loss": 0.81735456, + "num_input_tokens_seen": 149821640, + "router_z_loss_clip": 0.87792969, + "router_z_loss_mlp": 0.15136719, + "step": 6981, + "time_per_iteration": 2.4327282905578613 + }, + { + "auxiliary_loss_clip": 0.01135042, + "auxiliary_loss_mlp": 0.01041972, + "balance_loss_clip": 1.05319154, + "balance_loss_mlp": 1.02690446, + "epoch": 0.4197805501277619, + "flos": 22084128604800.0, + "grad_norm": 2.1683541730494973, + "language_loss": 0.84451973, + "learning_rate": 2.6069857226290696e-06, + "loss": 0.86628985, + "num_input_tokens_seen": 149840545, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.15075684, + "step": 6982, + "time_per_iteration": 2.497098684310913 + }, + { + "auxiliary_loss_clip": 0.01141252, + "auxiliary_loss_mlp": 0.01039764, + "balance_loss_clip": 1.05370986, + "balance_loss_mlp": 1.02396321, + "epoch": 0.4198406733804299, + "flos": 26432192920320.0, + "grad_norm": 2.210529889811785, + "language_loss": 0.57078326, + "learning_rate": 2.606614618903214e-06, + "loss": 0.59259343, + "num_input_tokens_seen": 149860375, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.15826416, + "step": 6983, + "time_per_iteration": 2.5201849937438965 + }, + { + "auxiliary_loss_clip": 0.01140387, + "auxiliary_loss_mlp": 0.0104007, + "balance_loss_clip": 1.05776417, + "balance_loss_mlp": 1.02659297, + "epoch": 0.4199007966330979, + "flos": 12531029293440.0, + "grad_norm": 1.9018998304016332, + "language_loss": 0.82123208, + "learning_rate": 2.606243492174471e-06, + "loss": 0.84303665, + "num_input_tokens_seen": 149877850, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.13482666, + "step": 6984, + "time_per_iteration": 2.45709490776062 + }, + { + "auxiliary_loss_clip": 0.01138209, + "auxiliary_loss_mlp": 0.01035916, + "balance_loss_clip": 1.05833125, + "balance_loss_mlp": 1.02213502, + "epoch": 0.41996091988576584, + "flos": 21762944167680.0, + "grad_norm": 1.8997218752345064, + "language_loss": 0.7931788, + "learning_rate": 2.605872342456914e-06, + "loss": 0.81492001, + "num_input_tokens_seen": 149896110, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.13775635, + "step": 6985, + "time_per_iteration": 2.4864909648895264 + }, + { + "auxiliary_loss_clip": 0.01155027, + "auxiliary_loss_mlp": 0.01040859, + "balance_loss_clip": 1.06417716, + "balance_loss_mlp": 1.02561235, + "epoch": 0.4200210431384338, + "flos": 26541935948160.0, + "grad_norm": 1.6262196564176103, + "language_loss": 0.78099775, + "learning_rate": 2.6055011697646173e-06, + "loss": 0.80295658, + "num_input_tokens_seen": 149916495, + "router_z_loss_clip": 0.90869141, + "router_z_loss_mlp": 0.15246582, + "step": 6986, + "time_per_iteration": 2.5213980674743652 + }, + { + "auxiliary_loss_clip": 0.01134725, + "auxiliary_loss_mlp": 0.01033702, + "balance_loss_clip": 1.05671382, + "balance_loss_mlp": 1.02100658, + "epoch": 0.42008116639110177, + "flos": 26795824254720.0, + "grad_norm": 1.6794242516945237, + "language_loss": 0.72297353, + "learning_rate": 2.605129974111655e-06, + "loss": 0.74465781, + "num_input_tokens_seen": 149936445, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.12689209, + "step": 6987, + "time_per_iteration": 2.4995274543762207 + }, + { + "auxiliary_loss_clip": 0.01136331, + "auxiliary_loss_mlp": 0.01042492, + "balance_loss_clip": 1.05425119, + "balance_loss_mlp": 1.02700639, + "epoch": 0.42014128964376973, + "flos": 32087333243520.0, + "grad_norm": 1.5040947075286635, + "language_loss": 0.75515735, + "learning_rate": 2.604758755512104e-06, + "loss": 0.77694559, + "num_input_tokens_seen": 149959430, + "router_z_loss_clip": 0.82080078, + "router_z_loss_mlp": 0.15478516, + "step": 6988, + "time_per_iteration": 2.5536282062530518 + }, + { + "auxiliary_loss_clip": 0.01142758, + "auxiliary_loss_mlp": 0.01035586, + "balance_loss_clip": 1.05904686, + "balance_loss_mlp": 1.02064943, + "epoch": 0.4202014128964377, + "flos": 26467133875200.0, + "grad_norm": 1.662263557916009, + "language_loss": 0.74284822, + "learning_rate": 2.60438751398004e-06, + "loss": 0.76463169, + "num_input_tokens_seen": 149980365, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.14923096, + "step": 6989, + "time_per_iteration": 2.5073211193084717 + }, + { + "auxiliary_loss_clip": 0.01136317, + "auxiliary_loss_mlp": 0.01040069, + "balance_loss_clip": 1.05315995, + "balance_loss_mlp": 1.02399969, + "epoch": 0.42026153614910566, + "flos": 13401216178560.0, + "grad_norm": 2.4272865895687628, + "language_loss": 0.71096796, + "learning_rate": 2.6040162495295404e-06, + "loss": 0.73273182, + "num_input_tokens_seen": 149997375, + "router_z_loss_clip": 0.83105469, + "router_z_loss_mlp": 0.1607666, + "step": 6990, + "time_per_iteration": 2.5414111614227295 + }, + { + "auxiliary_loss_clip": 0.01064785, + "auxiliary_loss_mlp": 0.01007255, + "balance_loss_clip": 1.03561711, + "balance_loss_mlp": 1.00540698, + "epoch": 0.42032165940177363, + "flos": 60250457635200.0, + "grad_norm": 0.8193258073058802, + "language_loss": 0.6047492, + "learning_rate": 2.603644962174685e-06, + "loss": 0.62546957, + "num_input_tokens_seen": 150051230, + "router_z_loss_clip": 0.29150391, + "router_z_loss_mlp": 0.01849365, + "step": 6991, + "time_per_iteration": 2.967381477355957 + }, + { + "auxiliary_loss_clip": 0.01144877, + "auxiliary_loss_mlp": 0.01043738, + "balance_loss_clip": 1.05877197, + "balance_loss_mlp": 1.02843142, + "epoch": 0.4203817826544416, + "flos": 24535211852160.0, + "grad_norm": 1.7949890744783747, + "language_loss": 0.83129019, + "learning_rate": 2.6032736519295517e-06, + "loss": 0.85317636, + "num_input_tokens_seen": 150071135, + "router_z_loss_clip": 0.86035156, + "router_z_loss_mlp": 0.15307617, + "step": 6992, + "time_per_iteration": 2.523789405822754 + }, + { + "auxiliary_loss_clip": 0.01084602, + "auxiliary_loss_mlp": 0.01009372, + "balance_loss_clip": 1.05271649, + "balance_loss_mlp": 1.00767648, + "epoch": 0.42044190590710956, + "flos": 58820781530880.0, + "grad_norm": 0.8076797798036379, + "language_loss": 0.65480781, + "learning_rate": 2.6029023188082217e-06, + "loss": 0.67574751, + "num_input_tokens_seen": 150125220, + "router_z_loss_clip": 0.31933594, + "router_z_loss_mlp": 0.01693726, + "step": 6993, + "time_per_iteration": 3.190091848373413 + }, + { + "auxiliary_loss_clip": 0.01148359, + "auxiliary_loss_mlp": 0.01037974, + "balance_loss_clip": 1.06029439, + "balance_loss_mlp": 1.02121294, + "epoch": 0.4205020291597775, + "flos": 16436063260800.0, + "grad_norm": 2.8947100012502123, + "language_loss": 0.83241343, + "learning_rate": 2.6025309628247746e-06, + "loss": 0.85427672, + "num_input_tokens_seen": 150142300, + "router_z_loss_clip": 0.88037109, + "router_z_loss_mlp": 0.16760254, + "step": 6994, + "time_per_iteration": 3.806910276412964 + }, + { + "auxiliary_loss_clip": 0.01132634, + "auxiliary_loss_mlp": 0.0103497, + "balance_loss_clip": 1.05382919, + "balance_loss_mlp": 1.02123737, + "epoch": 0.4205621524124455, + "flos": 18405655672320.0, + "grad_norm": 2.1147608442875354, + "language_loss": 0.78443897, + "learning_rate": 2.6021595839932934e-06, + "loss": 0.80611497, + "num_input_tokens_seen": 150161345, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.1373291, + "step": 6995, + "time_per_iteration": 2.4242019653320312 + }, + { + "auxiliary_loss_clip": 0.01133805, + "auxiliary_loss_mlp": 0.0103141, + "balance_loss_clip": 1.05403733, + "balance_loss_mlp": 1.0180285, + "epoch": 0.4206222756651135, + "flos": 25520097841920.0, + "grad_norm": 1.7513642595645664, + "language_loss": 0.80197835, + "learning_rate": 2.60178818232786e-06, + "loss": 0.82363045, + "num_input_tokens_seen": 150182420, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.13366699, + "step": 6996, + "time_per_iteration": 2.500081777572632 + }, + { + "auxiliary_loss_clip": 0.01130104, + "auxiliary_loss_mlp": 0.01032086, + "balance_loss_clip": 1.04884946, + "balance_loss_mlp": 1.01819849, + "epoch": 0.4206823989177815, + "flos": 15304338472320.0, + "grad_norm": 4.7240812175126585, + "language_loss": 0.7513355, + "learning_rate": 2.601416757842559e-06, + "loss": 0.77295744, + "num_input_tokens_seen": 150200175, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.13903809, + "step": 6997, + "time_per_iteration": 2.442742109298706 + }, + { + "auxiliary_loss_clip": 0.01136389, + "auxiliary_loss_mlp": 0.01038512, + "balance_loss_clip": 1.05399251, + "balance_loss_mlp": 1.02435589, + "epoch": 0.42074252217044944, + "flos": 15554096714880.0, + "grad_norm": 2.1191016966432534, + "language_loss": 0.75369197, + "learning_rate": 2.6010453105514743e-06, + "loss": 0.77544099, + "num_input_tokens_seen": 150217100, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.14147949, + "step": 6998, + "time_per_iteration": 3.869091749191284 + }, + { + "auxiliary_loss_clip": 0.01137303, + "auxiliary_loss_mlp": 0.01046039, + "balance_loss_clip": 1.05125868, + "balance_loss_mlp": 1.03061295, + "epoch": 0.4208026454231174, + "flos": 26145877610880.0, + "grad_norm": 1.7534712800553027, + "language_loss": 0.76147795, + "learning_rate": 2.60067384046869e-06, + "loss": 0.78331137, + "num_input_tokens_seen": 150239830, + "router_z_loss_clip": 0.86035156, + "router_z_loss_mlp": 0.15429688, + "step": 6999, + "time_per_iteration": 2.5502278804779053 + }, + { + "auxiliary_loss_clip": 0.01139986, + "auxiliary_loss_mlp": 0.01037837, + "balance_loss_clip": 1.05738783, + "balance_loss_mlp": 1.02269721, + "epoch": 0.42086276867578537, + "flos": 23550110380800.0, + "grad_norm": 2.4902429514840616, + "language_loss": 0.64145446, + "learning_rate": 2.600302347608295e-06, + "loss": 0.66323274, + "num_input_tokens_seen": 150260690, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.15136719, + "step": 7000, + "time_per_iteration": 2.6480026245117188 + }, + { + "auxiliary_loss_clip": 0.01142012, + "auxiliary_loss_mlp": 0.0103903, + "balance_loss_clip": 1.05898166, + "balance_loss_mlp": 1.02402115, + "epoch": 0.42092289192845334, + "flos": 18113414618880.0, + "grad_norm": 1.568108771901181, + "language_loss": 0.76308632, + "learning_rate": 2.5999308319843743e-06, + "loss": 0.78489679, + "num_input_tokens_seen": 150279885, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.15002441, + "step": 7001, + "time_per_iteration": 2.4929020404815674 + }, + { + "auxiliary_loss_clip": 0.01131216, + "auxiliary_loss_mlp": 0.01036515, + "balance_loss_clip": 1.05020022, + "balance_loss_mlp": 1.02206683, + "epoch": 0.4209830151811213, + "flos": 20006588845440.0, + "grad_norm": 1.452116036480679, + "language_loss": 0.86451805, + "learning_rate": 2.5995592936110154e-06, + "loss": 0.8861953, + "num_input_tokens_seen": 150297390, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.14453125, + "step": 7002, + "time_per_iteration": 2.4528164863586426 + }, + { + "auxiliary_loss_clip": 0.0113866, + "auxiliary_loss_mlp": 0.0103561, + "balance_loss_clip": 1.0590986, + "balance_loss_mlp": 1.02203226, + "epoch": 0.42104313843378927, + "flos": 21978946604160.0, + "grad_norm": 2.361985890243976, + "language_loss": 0.67413676, + "learning_rate": 2.5991877325023096e-06, + "loss": 0.69587946, + "num_input_tokens_seen": 150317390, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.13574219, + "step": 7003, + "time_per_iteration": 2.498795509338379 + }, + { + "auxiliary_loss_clip": 0.01137861, + "auxiliary_loss_mlp": 0.01037996, + "balance_loss_clip": 1.05290437, + "balance_loss_mlp": 1.02214146, + "epoch": 0.42110326168645723, + "flos": 25443966965760.0, + "grad_norm": 1.9895238355670077, + "language_loss": 0.78127199, + "learning_rate": 2.598816148672344e-06, + "loss": 0.80303055, + "num_input_tokens_seen": 150337455, + "router_z_loss_clip": 0.84960938, + "router_z_loss_mlp": 0.15844727, + "step": 7004, + "time_per_iteration": 2.4971442222595215 + }, + { + "auxiliary_loss_clip": 0.01136709, + "auxiliary_loss_mlp": 0.01052282, + "balance_loss_clip": 1.05759907, + "balance_loss_mlp": 1.03404307, + "epoch": 0.4211633849391252, + "flos": 17822574195840.0, + "grad_norm": 1.5910601384408776, + "language_loss": 0.68408346, + "learning_rate": 2.59844454213521e-06, + "loss": 0.70597339, + "num_input_tokens_seen": 150355385, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.18237305, + "step": 7005, + "time_per_iteration": 2.5082716941833496 + }, + { + "auxiliary_loss_clip": 0.01133883, + "auxiliary_loss_mlp": 0.01040286, + "balance_loss_clip": 1.05082357, + "balance_loss_mlp": 1.02415681, + "epoch": 0.42122350819179316, + "flos": 16282436791680.0, + "grad_norm": 2.538012068532979, + "language_loss": 0.72765577, + "learning_rate": 2.5980729129049994e-06, + "loss": 0.74939746, + "num_input_tokens_seen": 150371750, + "router_z_loss_clip": 0.83056641, + "router_z_loss_mlp": 0.16137695, + "step": 7006, + "time_per_iteration": 2.5183498859405518 + }, + { + "auxiliary_loss_clip": 0.01142581, + "auxiliary_loss_mlp": 0.01033287, + "balance_loss_clip": 1.05958891, + "balance_loss_mlp": 1.01830256, + "epoch": 0.4212836314444611, + "flos": 19645866512640.0, + "grad_norm": 1.6805517773588652, + "language_loss": 0.70978469, + "learning_rate": 2.5977012609958033e-06, + "loss": 0.7315433, + "num_input_tokens_seen": 150389955, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.14990234, + "step": 7007, + "time_per_iteration": 2.4861581325531006 + }, + { + "auxiliary_loss_clip": 0.01129367, + "auxiliary_loss_mlp": 0.01037018, + "balance_loss_clip": 1.04777372, + "balance_loss_mlp": 1.0211637, + "epoch": 0.4213437546971291, + "flos": 18369026778240.0, + "grad_norm": 1.7224975943563634, + "language_loss": 0.82286429, + "learning_rate": 2.5973295864217166e-06, + "loss": 0.84452814, + "num_input_tokens_seen": 150405780, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.15856934, + "step": 7008, + "time_per_iteration": 2.4384565353393555 + }, + { + "auxiliary_loss_clip": 0.01134703, + "auxiliary_loss_mlp": 0.01040795, + "balance_loss_clip": 1.0524168, + "balance_loss_mlp": 1.02517843, + "epoch": 0.42140387794979706, + "flos": 27704507541120.0, + "grad_norm": 1.704523938031148, + "language_loss": 0.71786118, + "learning_rate": 2.596957889196831e-06, + "loss": 0.7396161, + "num_input_tokens_seen": 150425615, + "router_z_loss_clip": 0.82275391, + "router_z_loss_mlp": 0.15618896, + "step": 7009, + "time_per_iteration": 2.5842769145965576 + }, + { + "auxiliary_loss_clip": 0.01137691, + "auxiliary_loss_mlp": 0.01047247, + "balance_loss_clip": 1.05253375, + "balance_loss_mlp": 1.03091574, + "epoch": 0.4214640012024651, + "flos": 28147071012480.0, + "grad_norm": 3.6022844779122334, + "language_loss": 0.66019958, + "learning_rate": 2.596586169335243e-06, + "loss": 0.68204892, + "num_input_tokens_seen": 150445765, + "router_z_loss_clip": 0.85205078, + "router_z_loss_mlp": 0.16333008, + "step": 7010, + "time_per_iteration": 2.5408966541290283 + }, + { + "auxiliary_loss_clip": 0.01137841, + "auxiliary_loss_mlp": 0.01035001, + "balance_loss_clip": 1.05591214, + "balance_loss_mlp": 1.02021897, + "epoch": 0.42152412445513304, + "flos": 22997265177600.0, + "grad_norm": 1.5352682793883026, + "language_loss": 0.72460586, + "learning_rate": 2.5962144268510477e-06, + "loss": 0.74633431, + "num_input_tokens_seen": 150464405, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.14770508, + "step": 7011, + "time_per_iteration": 2.510507345199585 + }, + { + "auxiliary_loss_clip": 0.01071717, + "auxiliary_loss_mlp": 0.01006741, + "balance_loss_clip": 1.04159117, + "balance_loss_mlp": 1.00485158, + "epoch": 0.421584247707801, + "flos": 63749592938880.0, + "grad_norm": 0.7894544873126879, + "language_loss": 0.54317367, + "learning_rate": 2.5958426617583417e-06, + "loss": 0.56395823, + "num_input_tokens_seen": 150520430, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.01889038, + "step": 7012, + "time_per_iteration": 2.958885431289673 + }, + { + "auxiliary_loss_clip": 0.01139651, + "auxiliary_loss_mlp": 0.01034336, + "balance_loss_clip": 1.05470717, + "balance_loss_mlp": 1.01871932, + "epoch": 0.421644370960469, + "flos": 24314612474880.0, + "grad_norm": 1.4035967885235636, + "language_loss": 0.78696495, + "learning_rate": 2.5954708740712215e-06, + "loss": 0.80870485, + "num_input_tokens_seen": 150542610, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.15612793, + "step": 7013, + "time_per_iteration": 2.5112645626068115 + }, + { + "auxiliary_loss_clip": 0.01147879, + "auxiliary_loss_mlp": 0.01035785, + "balance_loss_clip": 1.06057596, + "balance_loss_mlp": 1.02059793, + "epoch": 0.42170449421313694, + "flos": 23440690575360.0, + "grad_norm": 1.7225124852677802, + "language_loss": 0.81460452, + "learning_rate": 2.595099063803787e-06, + "loss": 0.83644116, + "num_input_tokens_seen": 150560970, + "router_z_loss_clip": 0.87304688, + "router_z_loss_mlp": 0.15197754, + "step": 7014, + "time_per_iteration": 4.005672216415405 + }, + { + "auxiliary_loss_clip": 0.0113618, + "auxiliary_loss_mlp": 0.01035346, + "balance_loss_clip": 1.05403209, + "balance_loss_mlp": 1.02095747, + "epoch": 0.4217646174658049, + "flos": 23695476721920.0, + "grad_norm": 1.5336927539697036, + "language_loss": 0.77872288, + "learning_rate": 2.5947272309701354e-06, + "loss": 0.80043817, + "num_input_tokens_seen": 150582615, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.1439209, + "step": 7015, + "time_per_iteration": 2.49145770072937 + }, + { + "auxiliary_loss_clip": 0.01143868, + "auxiliary_loss_mlp": 0.01036162, + "balance_loss_clip": 1.06007051, + "balance_loss_mlp": 1.02093887, + "epoch": 0.42182474071847287, + "flos": 24971562270720.0, + "grad_norm": 1.3328077428103597, + "language_loss": 0.82185531, + "learning_rate": 2.594355375584368e-06, + "loss": 0.84365559, + "num_input_tokens_seen": 150603640, + "router_z_loss_clip": 0.83691406, + "router_z_loss_mlp": 0.15222168, + "step": 7016, + "time_per_iteration": 2.5950746536254883 + }, + { + "auxiliary_loss_clip": 0.01136499, + "auxiliary_loss_mlp": 0.01035541, + "balance_loss_clip": 1.05394101, + "balance_loss_mlp": 1.02104533, + "epoch": 0.42188486397114083, + "flos": 22856639431680.0, + "grad_norm": 3.5139638263101634, + "language_loss": 0.67705101, + "learning_rate": 2.593983497660586e-06, + "loss": 0.69877148, + "num_input_tokens_seen": 150622490, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.14489746, + "step": 7017, + "time_per_iteration": 2.4599452018737793 + }, + { + "auxiliary_loss_clip": 0.01068483, + "auxiliary_loss_mlp": 0.01005831, + "balance_loss_clip": 1.0399754, + "balance_loss_mlp": 1.00412619, + "epoch": 0.4219449872238088, + "flos": 66975700965120.0, + "grad_norm": 0.7047082457103051, + "language_loss": 0.59413928, + "learning_rate": 2.5936115972128895e-06, + "loss": 0.61488241, + "num_input_tokens_seen": 150689545, + "router_z_loss_clip": 0.28466797, + "router_z_loss_mlp": 0.01702881, + "step": 7018, + "time_per_iteration": 3.239612579345703 + }, + { + "auxiliary_loss_clip": 0.01138861, + "auxiliary_loss_mlp": 0.01034142, + "balance_loss_clip": 1.05524445, + "balance_loss_mlp": 1.0197475, + "epoch": 0.42200511047647676, + "flos": 13115367745920.0, + "grad_norm": 1.7687307865031727, + "language_loss": 0.75070703, + "learning_rate": 2.593239674255382e-06, + "loss": 0.7724371, + "num_input_tokens_seen": 150707610, + "router_z_loss_clip": 0.83691406, + "router_z_loss_mlp": 0.1439209, + "step": 7019, + "time_per_iteration": 2.4803128242492676 + }, + { + "auxiliary_loss_clip": 0.01141806, + "auxiliary_loss_mlp": 0.01035568, + "balance_loss_clip": 1.05784965, + "balance_loss_mlp": 1.02010608, + "epoch": 0.42206523372914473, + "flos": 13991193066240.0, + "grad_norm": 1.8291501670808787, + "language_loss": 0.69079268, + "learning_rate": 2.592867728802166e-06, + "loss": 0.71256638, + "num_input_tokens_seen": 150724530, + "router_z_loss_clip": 0.84033203, + "router_z_loss_mlp": 0.15466309, + "step": 7020, + "time_per_iteration": 2.4443209171295166 + }, + { + "auxiliary_loss_clip": 0.01143608, + "auxiliary_loss_mlp": 0.01032471, + "balance_loss_clip": 1.06395674, + "balance_loss_mlp": 1.01904261, + "epoch": 0.4221253569818127, + "flos": 21942317710080.0, + "grad_norm": 1.8390645751156574, + "language_loss": 0.8097316, + "learning_rate": 2.592495760867347e-06, + "loss": 0.83149242, + "num_input_tokens_seen": 150742870, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.13433838, + "step": 7021, + "time_per_iteration": 2.450099468231201 + }, + { + "auxiliary_loss_clip": 0.01137644, + "auxiliary_loss_mlp": 0.01035178, + "balance_loss_clip": 1.05520427, + "balance_loss_mlp": 1.02056289, + "epoch": 0.42218548023448066, + "flos": 32192587071360.0, + "grad_norm": 1.5182752083275228, + "language_loss": 0.69895816, + "learning_rate": 2.5921237704650293e-06, + "loss": 0.72068644, + "num_input_tokens_seen": 150765500, + "router_z_loss_clip": 0.82470703, + "router_z_loss_mlp": 0.14624023, + "step": 7022, + "time_per_iteration": 2.5784389972686768 + }, + { + "auxiliary_loss_clip": 0.01134026, + "auxiliary_loss_mlp": 0.01028028, + "balance_loss_clip": 1.05760109, + "balance_loss_mlp": 1.01568985, + "epoch": 0.4222456034871487, + "flos": 30118961894400.0, + "grad_norm": 1.781836669209464, + "language_loss": 0.67487544, + "learning_rate": 2.5917517576093188e-06, + "loss": 0.69649601, + "num_input_tokens_seen": 150784945, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.12335205, + "step": 7023, + "time_per_iteration": 2.5187346935272217 + }, + { + "auxiliary_loss_clip": 0.01138212, + "auxiliary_loss_mlp": 0.01040923, + "balance_loss_clip": 1.05950403, + "balance_loss_mlp": 1.02623081, + "epoch": 0.42230572673981664, + "flos": 22127904305280.0, + "grad_norm": 1.6963394214202419, + "language_loss": 0.69296741, + "learning_rate": 2.591379722314322e-06, + "loss": 0.71475875, + "num_input_tokens_seen": 150803120, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.14691162, + "step": 7024, + "time_per_iteration": 3.862839937210083 + }, + { + "auxiliary_loss_clip": 0.01143925, + "auxiliary_loss_mlp": 0.01038774, + "balance_loss_clip": 1.05903471, + "balance_loss_mlp": 1.02444482, + "epoch": 0.4223658499924846, + "flos": 22055077480320.0, + "grad_norm": 1.617390164453968, + "language_loss": 0.76996207, + "learning_rate": 2.591007664594147e-06, + "loss": 0.791789, + "num_input_tokens_seen": 150823135, + "router_z_loss_clip": 0.85009766, + "router_z_loss_mlp": 0.14331055, + "step": 7025, + "time_per_iteration": 2.4947054386138916 + }, + { + "auxiliary_loss_clip": 0.01140987, + "auxiliary_loss_mlp": 0.01034427, + "balance_loss_clip": 1.06278992, + "balance_loss_mlp": 1.02105749, + "epoch": 0.4224259732451526, + "flos": 20410727742720.0, + "grad_norm": 1.8829991701442894, + "language_loss": 0.79629266, + "learning_rate": 2.5906355844629024e-06, + "loss": 0.81804681, + "num_input_tokens_seen": 150842070, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.13378906, + "step": 7026, + "time_per_iteration": 2.4621355533599854 + }, + { + "auxiliary_loss_clip": 0.01098722, + "auxiliary_loss_mlp": 0.01006581, + "balance_loss_clip": 1.07029653, + "balance_loss_mlp": 1.00444162, + "epoch": 0.42248609649782054, + "flos": 62846655828480.0, + "grad_norm": 0.7179194320547916, + "language_loss": 0.61902332, + "learning_rate": 2.5902634819346966e-06, + "loss": 0.64007634, + "num_input_tokens_seen": 150907450, + "router_z_loss_clip": 0.28466797, + "router_z_loss_mlp": 0.02142334, + "step": 7027, + "time_per_iteration": 3.1666972637176514 + }, + { + "auxiliary_loss_clip": 0.01129455, + "auxiliary_loss_mlp": 0.01037312, + "balance_loss_clip": 1.05012643, + "balance_loss_mlp": 1.02304864, + "epoch": 0.4225462197504885, + "flos": 26249946289920.0, + "grad_norm": 2.3976970585917563, + "language_loss": 0.71471918, + "learning_rate": 2.5898913570236414e-06, + "loss": 0.7363869, + "num_input_tokens_seen": 150928040, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.1427002, + "step": 7028, + "time_per_iteration": 2.490044116973877 + }, + { + "auxiliary_loss_clip": 0.01135316, + "auxiliary_loss_mlp": 0.01037799, + "balance_loss_clip": 1.05361629, + "balance_loss_mlp": 1.02381611, + "epoch": 0.42260634300315647, + "flos": 20521943228160.0, + "grad_norm": 2.034599900340921, + "language_loss": 0.82566178, + "learning_rate": 2.589519209743846e-06, + "loss": 0.84739292, + "num_input_tokens_seen": 150945760, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.13989258, + "step": 7029, + "time_per_iteration": 2.494302749633789 + }, + { + "auxiliary_loss_clip": 0.01137346, + "auxiliary_loss_mlp": 0.01041847, + "balance_loss_clip": 1.0544796, + "balance_loss_mlp": 1.02676666, + "epoch": 0.42266646625582444, + "flos": 24316731377280.0, + "grad_norm": 2.0607174846118084, + "language_loss": 0.75369507, + "learning_rate": 2.589147040109424e-06, + "loss": 0.77548701, + "num_input_tokens_seen": 150965665, + "router_z_loss_clip": 0.82910156, + "router_z_loss_mlp": 0.15093994, + "step": 7030, + "time_per_iteration": 2.4933292865753174 + }, + { + "auxiliary_loss_clip": 0.01137059, + "auxiliary_loss_mlp": 0.01039854, + "balance_loss_clip": 1.05617976, + "balance_loss_mlp": 1.02398694, + "epoch": 0.4227265895084924, + "flos": 24204151175040.0, + "grad_norm": 2.0062217283249195, + "language_loss": 0.86559325, + "learning_rate": 2.588774848134486e-06, + "loss": 0.88736236, + "num_input_tokens_seen": 150982260, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.15869141, + "step": 7031, + "time_per_iteration": 2.6530792713165283 + }, + { + "auxiliary_loss_clip": 0.01147635, + "auxiliary_loss_mlp": 0.0104226, + "balance_loss_clip": 1.06267762, + "balance_loss_mlp": 1.02658391, + "epoch": 0.42278671276116037, + "flos": 16909760845440.0, + "grad_norm": 1.8985349225389851, + "language_loss": 0.73363996, + "learning_rate": 2.5884026338331473e-06, + "loss": 0.75553894, + "num_input_tokens_seen": 150999990, + "router_z_loss_clip": 0.84960938, + "router_z_loss_mlp": 0.15698242, + "step": 7032, + "time_per_iteration": 2.5809476375579834 + }, + { + "auxiliary_loss_clip": 0.01138829, + "auxiliary_loss_mlp": 0.01043498, + "balance_loss_clip": 1.05701494, + "balance_loss_mlp": 1.02916312, + "epoch": 0.42284683601382833, + "flos": 25411073086080.0, + "grad_norm": 1.7851752423695368, + "language_loss": 0.7009486, + "learning_rate": 2.5880303972195222e-06, + "loss": 0.72277188, + "num_input_tokens_seen": 151021105, + "router_z_loss_clip": 0.81738281, + "router_z_loss_mlp": 0.14337158, + "step": 7033, + "time_per_iteration": 2.5835299491882324 + }, + { + "auxiliary_loss_clip": 0.01142488, + "auxiliary_loss_mlp": 0.01037618, + "balance_loss_clip": 1.05762529, + "balance_loss_mlp": 1.02275276, + "epoch": 0.4229069592664963, + "flos": 23040322606080.0, + "grad_norm": 2.390546757009574, + "language_loss": 0.90369093, + "learning_rate": 2.5876581383077256e-06, + "loss": 0.92549199, + "num_input_tokens_seen": 151040665, + "router_z_loss_clip": 0.84716797, + "router_z_loss_mlp": 0.14868164, + "step": 7034, + "time_per_iteration": 2.47159481048584 + }, + { + "auxiliary_loss_clip": 0.01135643, + "auxiliary_loss_mlp": 0.01036193, + "balance_loss_clip": 1.05477715, + "balance_loss_mlp": 1.02242398, + "epoch": 0.42296708251916426, + "flos": 26067448264320.0, + "grad_norm": 1.572761661636572, + "language_loss": 0.76966918, + "learning_rate": 2.5872858571118723e-06, + "loss": 0.7913875, + "num_input_tokens_seen": 151061240, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.13775635, + "step": 7035, + "time_per_iteration": 2.536898136138916 + }, + { + "auxiliary_loss_clip": 0.01134751, + "auxiliary_loss_mlp": 0.01050273, + "balance_loss_clip": 1.05285954, + "balance_loss_mlp": 1.0341202, + "epoch": 0.4230272057718323, + "flos": 19458376496640.0, + "grad_norm": 1.9920619043761691, + "language_loss": 0.82543528, + "learning_rate": 2.5869135536460817e-06, + "loss": 0.84728551, + "num_input_tokens_seen": 151076870, + "router_z_loss_clip": 0.81738281, + "router_z_loss_mlp": 0.16149902, + "step": 7036, + "time_per_iteration": 2.4453587532043457 + }, + { + "auxiliary_loss_clip": 0.0113566, + "auxiliary_loss_mlp": 0.01035859, + "balance_loss_clip": 1.05714488, + "balance_loss_mlp": 1.02101791, + "epoch": 0.42308732902450025, + "flos": 22383300983040.0, + "grad_norm": 1.640173530882597, + "language_loss": 0.70390159, + "learning_rate": 2.58654122792447e-06, + "loss": 0.72561681, + "num_input_tokens_seen": 151095110, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.1484375, + "step": 7037, + "time_per_iteration": 2.5370917320251465 + }, + { + "auxiliary_loss_clip": 0.0113483, + "auxiliary_loss_mlp": 0.01034683, + "balance_loss_clip": 1.05406594, + "balance_loss_mlp": 1.01957929, + "epoch": 0.4231474522771682, + "flos": 20995425331200.0, + "grad_norm": 1.642600575569063, + "language_loss": 0.78129137, + "learning_rate": 2.586168879961155e-06, + "loss": 0.80298644, + "num_input_tokens_seen": 151114355, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.15100098, + "step": 7038, + "time_per_iteration": 2.434993267059326 + }, + { + "auxiliary_loss_clip": 0.01141723, + "auxiliary_loss_mlp": 0.01039768, + "balance_loss_clip": 1.05616176, + "balance_loss_mlp": 1.02407968, + "epoch": 0.4232075755298362, + "flos": 14975863574400.0, + "grad_norm": 2.6974301962225145, + "language_loss": 0.66982591, + "learning_rate": 2.585796509770259e-06, + "loss": 0.69164085, + "num_input_tokens_seen": 151131505, + "router_z_loss_clip": 0.85693359, + "router_z_loss_mlp": 0.15673828, + "step": 7039, + "time_per_iteration": 3.894371271133423 + }, + { + "auxiliary_loss_clip": 0.01132882, + "auxiliary_loss_mlp": 0.01034753, + "balance_loss_clip": 1.0478555, + "balance_loss_mlp": 1.01977479, + "epoch": 0.42326769878250414, + "flos": 24532661986560.0, + "grad_norm": 1.7061299640680962, + "language_loss": 0.75726783, + "learning_rate": 2.5854241173658996e-06, + "loss": 0.77894419, + "num_input_tokens_seen": 151151555, + "router_z_loss_clip": 0.85058594, + "router_z_loss_mlp": 0.14984131, + "step": 7040, + "time_per_iteration": 2.5239250659942627 + }, + { + "auxiliary_loss_clip": 0.01130859, + "auxiliary_loss_mlp": 0.0102914, + "balance_loss_clip": 1.04979432, + "balance_loss_mlp": 1.0150795, + "epoch": 0.4233278220351721, + "flos": 26870303105280.0, + "grad_norm": 1.749423341862829, + "language_loss": 0.65120316, + "learning_rate": 2.5850517027621996e-06, + "loss": 0.6728031, + "num_input_tokens_seen": 151172385, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.14068604, + "step": 7041, + "time_per_iteration": 2.515859603881836 + }, + { + "auxiliary_loss_clip": 0.011375, + "auxiliary_loss_mlp": 0.01033505, + "balance_loss_clip": 1.05459392, + "balance_loss_mlp": 1.01753092, + "epoch": 0.4233879452878401, + "flos": 42814927463040.0, + "grad_norm": 1.96560074523605, + "language_loss": 0.74487108, + "learning_rate": 2.5846792659732803e-06, + "loss": 0.76658112, + "num_input_tokens_seen": 151194930, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.15966797, + "step": 7042, + "time_per_iteration": 4.074299573898315 + }, + { + "auxiliary_loss_clip": 0.01139871, + "auxiliary_loss_mlp": 0.01034973, + "balance_loss_clip": 1.05628467, + "balance_loss_mlp": 1.02075183, + "epoch": 0.42344806854050804, + "flos": 25229006023680.0, + "grad_norm": 1.3649064883201998, + "language_loss": 0.82436025, + "learning_rate": 2.5843068070132643e-06, + "loss": 0.84610873, + "num_input_tokens_seen": 151217905, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.14221191, + "step": 7043, + "time_per_iteration": 2.4931766986846924 + }, + { + "auxiliary_loss_clip": 0.01141416, + "auxiliary_loss_mlp": 0.01038417, + "balance_loss_clip": 1.06078529, + "balance_loss_mlp": 1.02309823, + "epoch": 0.423508191793176, + "flos": 22778820616320.0, + "grad_norm": 2.214442314540012, + "language_loss": 0.64891863, + "learning_rate": 2.5839343258962763e-06, + "loss": 0.670717, + "num_input_tokens_seen": 151234580, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.15307617, + "step": 7044, + "time_per_iteration": 2.5860190391540527 + }, + { + "auxiliary_loss_clip": 0.01149831, + "auxiliary_loss_mlp": 0.01046042, + "balance_loss_clip": 1.06439006, + "balance_loss_mlp": 1.02991271, + "epoch": 0.42356831504584397, + "flos": 34637493179520.0, + "grad_norm": 1.715850925657069, + "language_loss": 0.7512185, + "learning_rate": 2.5835618226364393e-06, + "loss": 0.77317727, + "num_input_tokens_seen": 151254765, + "router_z_loss_clip": 0.85449219, + "router_z_loss_mlp": 0.16125488, + "step": 7045, + "time_per_iteration": 2.590158462524414 + }, + { + "auxiliary_loss_clip": 0.01144161, + "auxiliary_loss_mlp": 0.01037866, + "balance_loss_clip": 1.0635972, + "balance_loss_mlp": 1.02311945, + "epoch": 0.42362843829851193, + "flos": 17596767346560.0, + "grad_norm": 2.5421184095894303, + "language_loss": 0.80764902, + "learning_rate": 2.5831892972478797e-06, + "loss": 0.82946932, + "num_input_tokens_seen": 151269045, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.14746094, + "step": 7046, + "time_per_iteration": 2.5036814212799072 + }, + { + "auxiliary_loss_clip": 0.0113863, + "auxiliary_loss_mlp": 0.01037153, + "balance_loss_clip": 1.05496407, + "balance_loss_mlp": 1.02276456, + "epoch": 0.4236885615511799, + "flos": 22565691267840.0, + "grad_norm": 1.7116976184737145, + "language_loss": 0.76875764, + "learning_rate": 2.5828167497447242e-06, + "loss": 0.79051548, + "num_input_tokens_seen": 151287530, + "router_z_loss_clip": 0.83691406, + "router_z_loss_mlp": 0.1439209, + "step": 7047, + "time_per_iteration": 2.503643751144409 + }, + { + "auxiliary_loss_clip": 0.01141426, + "auxiliary_loss_mlp": 0.01032559, + "balance_loss_clip": 1.06203675, + "balance_loss_mlp": 1.01818812, + "epoch": 0.42374868480384786, + "flos": 26469216864000.0, + "grad_norm": 1.6919250196418547, + "language_loss": 0.68273294, + "learning_rate": 2.582444180141098e-06, + "loss": 0.70447278, + "num_input_tokens_seen": 151308905, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.14361572, + "step": 7048, + "time_per_iteration": 2.5225446224212646 + }, + { + "auxiliary_loss_clip": 0.01138683, + "auxiliary_loss_mlp": 0.01042033, + "balance_loss_clip": 1.05587482, + "balance_loss_mlp": 1.02650023, + "epoch": 0.4238088080565159, + "flos": 20370220179840.0, + "grad_norm": 1.8319703322538274, + "language_loss": 0.78290236, + "learning_rate": 2.5820715884511307e-06, + "loss": 0.80470943, + "num_input_tokens_seen": 151326525, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.15551758, + "step": 7049, + "time_per_iteration": 2.5342984199523926 + }, + { + "auxiliary_loss_clip": 0.01147938, + "auxiliary_loss_mlp": 0.0104109, + "balance_loss_clip": 1.06423259, + "balance_loss_mlp": 1.02636743, + "epoch": 0.42386893130918385, + "flos": 21172105353600.0, + "grad_norm": 2.330212613598431, + "language_loss": 0.83138126, + "learning_rate": 2.5816989746889504e-06, + "loss": 0.8532716, + "num_input_tokens_seen": 151344675, + "router_z_loss_clip": 0.83886719, + "router_z_loss_mlp": 0.1472168, + "step": 7050, + "time_per_iteration": 2.543429374694824 + }, + { + "auxiliary_loss_clip": 0.01137862, + "auxiliary_loss_mlp": 0.01034234, + "balance_loss_clip": 1.05522454, + "balance_loss_mlp": 1.01955295, + "epoch": 0.4239290545618518, + "flos": 17675627656320.0, + "grad_norm": 7.466043799061127, + "language_loss": 0.73466921, + "learning_rate": 2.581326338868687e-06, + "loss": 0.75639009, + "num_input_tokens_seen": 151360730, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.14678955, + "step": 7051, + "time_per_iteration": 2.4627621173858643 + }, + { + "auxiliary_loss_clip": 0.0112989, + "auxiliary_loss_mlp": 0.01034353, + "balance_loss_clip": 1.04917884, + "balance_loss_mlp": 1.0202024, + "epoch": 0.4239891778145198, + "flos": 24314504734080.0, + "grad_norm": 1.6120317603015262, + "language_loss": 0.8625437, + "learning_rate": 2.5809536810044706e-06, + "loss": 0.88418609, + "num_input_tokens_seen": 151380445, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.14135742, + "step": 7052, + "time_per_iteration": 2.5371413230895996 + }, + { + "auxiliary_loss_clip": 0.01129737, + "auxiliary_loss_mlp": 0.01044839, + "balance_loss_clip": 1.0473429, + "balance_loss_mlp": 1.02957463, + "epoch": 0.42404930106718774, + "flos": 20558428467840.0, + "grad_norm": 1.4205936862089048, + "language_loss": 0.72560501, + "learning_rate": 2.5805810011104323e-06, + "loss": 0.74735081, + "num_input_tokens_seen": 151399325, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.15240479, + "step": 7053, + "time_per_iteration": 2.4843950271606445 + }, + { + "auxiliary_loss_clip": 0.01130798, + "auxiliary_loss_mlp": 0.01033629, + "balance_loss_clip": 1.0510118, + "balance_loss_mlp": 1.01908588, + "epoch": 0.4241094243198557, + "flos": 22308067946880.0, + "grad_norm": 1.71323348837709, + "language_loss": 0.82223058, + "learning_rate": 2.580208299200704e-06, + "loss": 0.84387481, + "num_input_tokens_seen": 151417240, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.14526367, + "step": 7054, + "time_per_iteration": 2.5563549995422363 + }, + { + "auxiliary_loss_clip": 0.01063798, + "auxiliary_loss_mlp": 0.0101335, + "balance_loss_clip": 1.03408563, + "balance_loss_mlp": 1.01168561, + "epoch": 0.4241695475725237, + "flos": 70612445272320.0, + "grad_norm": 0.7910581009474078, + "language_loss": 0.60350275, + "learning_rate": 2.5798355752894183e-06, + "loss": 0.62427425, + "num_input_tokens_seen": 151476015, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.01664734, + "step": 7055, + "time_per_iteration": 3.0484416484832764 + }, + { + "auxiliary_loss_clip": 0.01138956, + "auxiliary_loss_mlp": 0.01052974, + "balance_loss_clip": 1.05398083, + "balance_loss_mlp": 1.03645205, + "epoch": 0.42422967082519164, + "flos": 14027462824320.0, + "grad_norm": 2.8590731646587177, + "language_loss": 0.77133447, + "learning_rate": 2.5794628293907107e-06, + "loss": 0.79325378, + "num_input_tokens_seen": 151492035, + "router_z_loss_clip": 0.84912109, + "router_z_loss_mlp": 0.16516113, + "step": 7056, + "time_per_iteration": 2.452206611633301 + }, + { + "auxiliary_loss_clip": 0.01149788, + "auxiliary_loss_mlp": 0.01039883, + "balance_loss_clip": 1.06376898, + "balance_loss_mlp": 1.02303863, + "epoch": 0.4242897940778596, + "flos": 22345522853760.0, + "grad_norm": 6.899230859441854, + "language_loss": 0.84502995, + "learning_rate": 2.579090061518714e-06, + "loss": 0.86692667, + "num_input_tokens_seen": 151508970, + "router_z_loss_clip": 0.86035156, + "router_z_loss_mlp": 0.16845703, + "step": 7057, + "time_per_iteration": 2.5126044750213623 + }, + { + "auxiliary_loss_clip": 0.01149171, + "auxiliary_loss_mlp": 0.01042179, + "balance_loss_clip": 1.0608288, + "balance_loss_mlp": 1.0261569, + "epoch": 0.42434991733052757, + "flos": 22595855713920.0, + "grad_norm": 2.550758887170174, + "language_loss": 0.82702827, + "learning_rate": 2.5787172716875642e-06, + "loss": 0.84894168, + "num_input_tokens_seen": 151525295, + "router_z_loss_clip": 0.88378906, + "router_z_loss_mlp": 0.16015625, + "step": 7058, + "time_per_iteration": 3.977874279022217 + }, + { + "auxiliary_loss_clip": 0.01142943, + "auxiliary_loss_mlp": 0.01028962, + "balance_loss_clip": 1.06281221, + "balance_loss_mlp": 1.01495481, + "epoch": 0.42441004058319554, + "flos": 20011437181440.0, + "grad_norm": 1.6406845214706207, + "language_loss": 0.80603838, + "learning_rate": 2.5783444599113973e-06, + "loss": 0.82775736, + "num_input_tokens_seen": 151544435, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.14013672, + "step": 7059, + "time_per_iteration": 2.5444319248199463 + }, + { + "auxiliary_loss_clip": 0.01144315, + "auxiliary_loss_mlp": 0.01039311, + "balance_loss_clip": 1.05740738, + "balance_loss_mlp": 1.0229193, + "epoch": 0.4244701638358635, + "flos": 11144985235200.0, + "grad_norm": 1.9827040037618335, + "language_loss": 0.70602214, + "learning_rate": 2.57797162620435e-06, + "loss": 0.72785842, + "num_input_tokens_seen": 151559520, + "router_z_loss_clip": 0.86914062, + "router_z_loss_mlp": 0.16381836, + "step": 7060, + "time_per_iteration": 2.4445183277130127 + }, + { + "auxiliary_loss_clip": 0.01149034, + "auxiliary_loss_mlp": 0.01034957, + "balance_loss_clip": 1.06443512, + "balance_loss_mlp": 1.02041316, + "epoch": 0.42453028708853147, + "flos": 23987753688960.0, + "grad_norm": 1.6445441586611207, + "language_loss": 0.75974393, + "learning_rate": 2.577598770580562e-06, + "loss": 0.78158385, + "num_input_tokens_seen": 151579790, + "router_z_loss_clip": 0.84570312, + "router_z_loss_mlp": 0.14550781, + "step": 7061, + "time_per_iteration": 2.5475058555603027 + }, + { + "auxiliary_loss_clip": 0.01145019, + "auxiliary_loss_mlp": 0.01041054, + "balance_loss_clip": 1.06015134, + "balance_loss_mlp": 1.02491903, + "epoch": 0.42459041034119943, + "flos": 18406338030720.0, + "grad_norm": 2.1676181082568844, + "language_loss": 0.72706312, + "learning_rate": 2.5772258930541693e-06, + "loss": 0.7489239, + "num_input_tokens_seen": 151598285, + "router_z_loss_clip": 0.84765625, + "router_z_loss_mlp": 0.16143799, + "step": 7062, + "time_per_iteration": 2.460846185684204 + }, + { + "auxiliary_loss_clip": 0.01136058, + "auxiliary_loss_mlp": 0.01043562, + "balance_loss_clip": 1.05349827, + "balance_loss_mlp": 1.02878642, + "epoch": 0.42465053359386745, + "flos": 20958006337920.0, + "grad_norm": 2.2165569188941947, + "language_loss": 0.66664219, + "learning_rate": 2.5768529936393137e-06, + "loss": 0.68843842, + "num_input_tokens_seen": 151615430, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.14788818, + "step": 7063, + "time_per_iteration": 2.4692344665527344 + }, + { + "auxiliary_loss_clip": 0.01135075, + "auxiliary_loss_mlp": 0.01033898, + "balance_loss_clip": 1.05601525, + "balance_loss_mlp": 1.02019536, + "epoch": 0.4247106568465354, + "flos": 33106190520960.0, + "grad_norm": 1.5652859395522483, + "language_loss": 0.78801441, + "learning_rate": 2.5764800723501354e-06, + "loss": 0.80970412, + "num_input_tokens_seen": 151637030, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.13708496, + "step": 7064, + "time_per_iteration": 2.5922937393188477 + }, + { + "auxiliary_loss_clip": 0.01146572, + "auxiliary_loss_mlp": 0.01043805, + "balance_loss_clip": 1.05682623, + "balance_loss_mlp": 1.02768826, + "epoch": 0.4247707800992034, + "flos": 20046916840320.0, + "grad_norm": 2.0517861228896974, + "language_loss": 0.75220597, + "learning_rate": 2.5761071292007736e-06, + "loss": 0.77410972, + "num_input_tokens_seen": 151655745, + "router_z_loss_clip": 0.89697266, + "router_z_loss_mlp": 0.16113281, + "step": 7065, + "time_per_iteration": 2.473820686340332 + }, + { + "auxiliary_loss_clip": 0.01145658, + "auxiliary_loss_mlp": 0.01038993, + "balance_loss_clip": 1.06072855, + "balance_loss_mlp": 1.02350807, + "epoch": 0.42483090335187135, + "flos": 22385132576640.0, + "grad_norm": 1.2952847886182417, + "language_loss": 0.726336, + "learning_rate": 2.5757341642053725e-06, + "loss": 0.74818254, + "num_input_tokens_seen": 151678040, + "router_z_loss_clip": 0.85009766, + "router_z_loss_mlp": 0.15496826, + "step": 7066, + "time_per_iteration": 2.475551128387451 + }, + { + "auxiliary_loss_clip": 0.01134421, + "auxiliary_loss_mlp": 0.01043617, + "balance_loss_clip": 1.0483768, + "balance_loss_mlp": 1.02711272, + "epoch": 0.4248910266045393, + "flos": 21356830022400.0, + "grad_norm": 2.1187248050816865, + "language_loss": 0.79949468, + "learning_rate": 2.5753611773780745e-06, + "loss": 0.82127512, + "num_input_tokens_seen": 151696410, + "router_z_loss_clip": 0.86035156, + "router_z_loss_mlp": 0.16503906, + "step": 7067, + "time_per_iteration": 2.4769434928894043 + }, + { + "auxiliary_loss_clip": 0.01075595, + "auxiliary_loss_mlp": 0.01019833, + "balance_loss_clip": 1.04593492, + "balance_loss_mlp": 1.01807213, + "epoch": 0.4249511498572073, + "flos": 64008114099840.0, + "grad_norm": 0.978180151302128, + "language_loss": 0.63435125, + "learning_rate": 2.574988168733022e-06, + "loss": 0.65530556, + "num_input_tokens_seen": 151756365, + "router_z_loss_clip": 0.29736328, + "router_z_loss_mlp": 0.01760864, + "step": 7068, + "time_per_iteration": 4.461661338806152 + }, + { + "auxiliary_loss_clip": 0.01141042, + "auxiliary_loss_mlp": 0.01034994, + "balance_loss_clip": 1.05740738, + "balance_loss_mlp": 1.0192523, + "epoch": 0.42501127310987524, + "flos": 19607046888960.0, + "grad_norm": 1.6562837447025456, + "language_loss": 0.72740436, + "learning_rate": 2.574615138284361e-06, + "loss": 0.7491647, + "num_input_tokens_seen": 151775165, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.15740967, + "step": 7069, + "time_per_iteration": 2.440783977508545 + }, + { + "auxiliary_loss_clip": 0.01148638, + "auxiliary_loss_mlp": 0.01036643, + "balance_loss_clip": 1.06498241, + "balance_loss_mlp": 1.02004957, + "epoch": 0.4250713963625432, + "flos": 19462326992640.0, + "grad_norm": 2.191209105262844, + "language_loss": 0.79233968, + "learning_rate": 2.5742420860462364e-06, + "loss": 0.81419253, + "num_input_tokens_seen": 151792620, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.16577148, + "step": 7070, + "time_per_iteration": 2.547268867492676 + }, + { + "auxiliary_loss_clip": 0.0113177, + "auxiliary_loss_mlp": 0.01039958, + "balance_loss_clip": 1.04779053, + "balance_loss_mlp": 1.0229826, + "epoch": 0.4251315196152112, + "flos": 25337707557120.0, + "grad_norm": 2.16298024132372, + "language_loss": 0.70478725, + "learning_rate": 2.573869012032795e-06, + "loss": 0.72650456, + "num_input_tokens_seen": 151812850, + "router_z_loss_clip": 0.83886719, + "router_z_loss_mlp": 0.1697998, + "step": 7071, + "time_per_iteration": 2.486485242843628 + }, + { + "auxiliary_loss_clip": 0.01139564, + "auxiliary_loss_mlp": 0.01032054, + "balance_loss_clip": 1.05592871, + "balance_loss_mlp": 1.01759434, + "epoch": 0.42519164286787914, + "flos": 26359186527360.0, + "grad_norm": 3.015775729672512, + "language_loss": 0.71454406, + "learning_rate": 2.5734959162581824e-06, + "loss": 0.73626024, + "num_input_tokens_seen": 151831785, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.14453125, + "step": 7072, + "time_per_iteration": 2.5640103816986084 + }, + { + "auxiliary_loss_clip": 0.01138624, + "auxiliary_loss_mlp": 0.01040676, + "balance_loss_clip": 1.05464089, + "balance_loss_mlp": 1.02597797, + "epoch": 0.4252517661205471, + "flos": 26031070765440.0, + "grad_norm": 1.6603125241682903, + "language_loss": 0.81542647, + "learning_rate": 2.5731227987365475e-06, + "loss": 0.8372196, + "num_input_tokens_seen": 151853885, + "router_z_loss_clip": 0.83886719, + "router_z_loss_mlp": 0.14703369, + "step": 7073, + "time_per_iteration": 2.5601723194122314 + }, + { + "auxiliary_loss_clip": 0.01133122, + "auxiliary_loss_mlp": 0.01033494, + "balance_loss_clip": 1.05227709, + "balance_loss_mlp": 1.0194447, + "epoch": 0.42531188937321507, + "flos": 12713635059840.0, + "grad_norm": 2.4097063933765637, + "language_loss": 0.91357273, + "learning_rate": 2.5727496594820386e-06, + "loss": 0.9352389, + "num_input_tokens_seen": 151871780, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.14056396, + "step": 7074, + "time_per_iteration": 2.5105209350585938 + }, + { + "auxiliary_loss_clip": 0.01138766, + "auxiliary_loss_mlp": 0.01042597, + "balance_loss_clip": 1.05267453, + "balance_loss_mlp": 1.0256933, + "epoch": 0.42537201262588303, + "flos": 22091670460800.0, + "grad_norm": 1.573759184242696, + "language_loss": 0.64282781, + "learning_rate": 2.572376498508805e-06, + "loss": 0.66464144, + "num_input_tokens_seen": 151891600, + "router_z_loss_clip": 0.86035156, + "router_z_loss_mlp": 0.16894531, + "step": 7075, + "time_per_iteration": 2.469547986984253 + }, + { + "auxiliary_loss_clip": 0.01134737, + "auxiliary_loss_mlp": 0.01035119, + "balance_loss_clip": 1.05551958, + "balance_loss_mlp": 1.02113628, + "epoch": 0.42543213587855105, + "flos": 23003119094400.0, + "grad_norm": 1.9481481357562958, + "language_loss": 0.73597717, + "learning_rate": 2.5720033158309973e-06, + "loss": 0.75767577, + "num_input_tokens_seen": 151911330, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.13983154, + "step": 7076, + "time_per_iteration": 2.576343297958374 + }, + { + "auxiliary_loss_clip": 0.01144418, + "auxiliary_loss_mlp": 0.0104181, + "balance_loss_clip": 1.05845833, + "balance_loss_mlp": 1.02653909, + "epoch": 0.425492259131219, + "flos": 25082454533760.0, + "grad_norm": 1.7320862358740838, + "language_loss": 0.7877686, + "learning_rate": 2.571630111462766e-06, + "loss": 0.80963093, + "num_input_tokens_seen": 151930355, + "router_z_loss_clip": 0.85888672, + "router_z_loss_mlp": 0.15264893, + "step": 7077, + "time_per_iteration": 2.574442148208618 + }, + { + "auxiliary_loss_clip": 0.0112291, + "auxiliary_loss_mlp": 0.01030838, + "balance_loss_clip": 1.04586363, + "balance_loss_mlp": 1.01713538, + "epoch": 0.425552382383887, + "flos": 22816850140800.0, + "grad_norm": 1.6594461961243268, + "language_loss": 0.72805911, + "learning_rate": 2.571256885418265e-06, + "loss": 0.7495966, + "num_input_tokens_seen": 151949695, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.13708496, + "step": 7078, + "time_per_iteration": 2.5895307064056396 + }, + { + "auxiliary_loss_clip": 0.01132892, + "auxiliary_loss_mlp": 0.01040641, + "balance_loss_clip": 1.0538727, + "balance_loss_mlp": 1.02673578, + "epoch": 0.42561250563655495, + "flos": 13553585671680.0, + "grad_norm": 2.098085202734037, + "language_loss": 0.7991159, + "learning_rate": 2.5708836377116445e-06, + "loss": 0.82085127, + "num_input_tokens_seen": 151967640, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.13903809, + "step": 7079, + "time_per_iteration": 2.4726877212524414 + }, + { + "auxiliary_loss_clip": 0.01137246, + "auxiliary_loss_mlp": 0.01035879, + "balance_loss_clip": 1.05534458, + "balance_loss_mlp": 1.02214634, + "epoch": 0.4256726288892229, + "flos": 46978303023360.0, + "grad_norm": 1.3823474989100093, + "language_loss": 0.71855319, + "learning_rate": 2.5705103683570592e-06, + "loss": 0.74028444, + "num_input_tokens_seen": 151994020, + "router_z_loss_clip": 0.81982422, + "router_z_loss_mlp": 0.13739014, + "step": 7080, + "time_per_iteration": 2.738055944442749 + }, + { + "auxiliary_loss_clip": 0.01124978, + "auxiliary_loss_mlp": 0.01036194, + "balance_loss_clip": 1.0443387, + "balance_loss_mlp": 1.02212143, + "epoch": 0.4257327521418909, + "flos": 23586451966080.0, + "grad_norm": 14.10419639705933, + "language_loss": 0.80902368, + "learning_rate": 2.5701370773686646e-06, + "loss": 0.83063543, + "num_input_tokens_seen": 152013415, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.140625, + "step": 7081, + "time_per_iteration": 3.909024477005005 + }, + { + "auxiliary_loss_clip": 0.01124211, + "auxiliary_loss_mlp": 0.01036446, + "balance_loss_clip": 1.047786, + "balance_loss_mlp": 1.02099657, + "epoch": 0.42579287539455885, + "flos": 18989994124800.0, + "grad_norm": 1.5610204425027694, + "language_loss": 0.81448102, + "learning_rate": 2.5697637647606138e-06, + "loss": 0.83608758, + "num_input_tokens_seen": 152030860, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.15454102, + "step": 7082, + "time_per_iteration": 2.46596622467041 + }, + { + "auxiliary_loss_clip": 0.01128816, + "auxiliary_loss_mlp": 0.0103567, + "balance_loss_clip": 1.04892015, + "balance_loss_mlp": 1.02131093, + "epoch": 0.4258529986472268, + "flos": 25191910252800.0, + "grad_norm": 1.8864401665101662, + "language_loss": 0.70279551, + "learning_rate": 2.569390430547065e-06, + "loss": 0.72444046, + "num_input_tokens_seen": 152050395, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.14367676, + "step": 7083, + "time_per_iteration": 2.598536491394043 + }, + { + "auxiliary_loss_clip": 0.01062686, + "auxiliary_loss_mlp": 0.01013268, + "balance_loss_clip": 1.03302157, + "balance_loss_mlp": 1.0115366, + "epoch": 0.4259131218998948, + "flos": 69968280718080.0, + "grad_norm": 0.8781218524507035, + "language_loss": 0.67089885, + "learning_rate": 2.569017074742173e-06, + "loss": 0.69165838, + "num_input_tokens_seen": 152113555, + "router_z_loss_clip": 0.29638672, + "router_z_loss_mlp": 0.01733398, + "step": 7084, + "time_per_iteration": 3.145709276199341 + }, + { + "auxiliary_loss_clip": 0.01131768, + "auxiliary_loss_mlp": 0.01038451, + "balance_loss_clip": 1.05188644, + "balance_loss_mlp": 1.02263808, + "epoch": 0.42597324515256274, + "flos": 18004964480640.0, + "grad_norm": 2.0886979322091075, + "language_loss": 0.78885221, + "learning_rate": 2.5686436973600964e-06, + "loss": 0.81055444, + "num_input_tokens_seen": 152131575, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.15814209, + "step": 7085, + "time_per_iteration": 2.5051939487457275 + }, + { + "auxiliary_loss_clip": 0.01138671, + "auxiliary_loss_mlp": 0.01043794, + "balance_loss_clip": 1.05160189, + "balance_loss_mlp": 1.02723622, + "epoch": 0.4260333684052307, + "flos": 15158792563200.0, + "grad_norm": 1.9311081095164846, + "language_loss": 0.7602855, + "learning_rate": 2.568270298414995e-06, + "loss": 0.7821101, + "num_input_tokens_seen": 152149435, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.16534424, + "step": 7086, + "time_per_iteration": 3.856523275375366 + }, + { + "auxiliary_loss_clip": 0.01127694, + "auxiliary_loss_mlp": 0.01039091, + "balance_loss_clip": 1.04656637, + "balance_loss_mlp": 1.02385592, + "epoch": 0.42609349165789867, + "flos": 14939342421120.0, + "grad_norm": 2.2802793561696237, + "language_loss": 0.80384189, + "learning_rate": 2.5678968779210255e-06, + "loss": 0.82550973, + "num_input_tokens_seen": 152166860, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.15234375, + "step": 7087, + "time_per_iteration": 2.6863274574279785 + }, + { + "auxiliary_loss_clip": 0.01135249, + "auxiliary_loss_mlp": 0.01035282, + "balance_loss_clip": 1.05141294, + "balance_loss_mlp": 1.01940334, + "epoch": 0.42615361491056664, + "flos": 23731961961600.0, + "grad_norm": 1.946882188035185, + "language_loss": 0.66093409, + "learning_rate": 2.5675234358923505e-06, + "loss": 0.68263936, + "num_input_tokens_seen": 152187475, + "router_z_loss_clip": 0.83789062, + "router_z_loss_mlp": 0.15881348, + "step": 7088, + "time_per_iteration": 2.5484507083892822 + }, + { + "auxiliary_loss_clip": 0.01136569, + "auxiliary_loss_mlp": 0.01042876, + "balance_loss_clip": 1.05308533, + "balance_loss_mlp": 1.02685404, + "epoch": 0.42621373816323466, + "flos": 24936441747840.0, + "grad_norm": 2.1031390159409513, + "language_loss": 0.68656123, + "learning_rate": 2.56714997234313e-06, + "loss": 0.70835567, + "num_input_tokens_seen": 152207235, + "router_z_loss_clip": 0.83544922, + "router_z_loss_mlp": 0.16015625, + "step": 7089, + "time_per_iteration": 2.5170140266418457 + }, + { + "auxiliary_loss_clip": 0.01135049, + "auxiliary_loss_mlp": 0.01042626, + "balance_loss_clip": 1.05131257, + "balance_loss_mlp": 1.02642584, + "epoch": 0.4262738614159026, + "flos": 13552975140480.0, + "grad_norm": 2.2547904126439717, + "language_loss": 0.73201871, + "learning_rate": 2.566776487287525e-06, + "loss": 0.7537955, + "num_input_tokens_seen": 152224240, + "router_z_loss_clip": 0.83740234, + "router_z_loss_mlp": 0.1619873, + "step": 7090, + "time_per_iteration": 2.468060255050659 + }, + { + "auxiliary_loss_clip": 0.01135312, + "auxiliary_loss_mlp": 0.01050205, + "balance_loss_clip": 1.04942441, + "balance_loss_mlp": 1.03430867, + "epoch": 0.4263339846685706, + "flos": 29748794284800.0, + "grad_norm": 1.8960329363096582, + "language_loss": 0.75242192, + "learning_rate": 2.5664029807396994e-06, + "loss": 0.77427709, + "num_input_tokens_seen": 152242595, + "router_z_loss_clip": 0.85839844, + "router_z_loss_mlp": 0.15887451, + "step": 7091, + "time_per_iteration": 2.498476505279541 + }, + { + "auxiliary_loss_clip": 0.01136967, + "auxiliary_loss_mlp": 0.01034059, + "balance_loss_clip": 1.05743241, + "balance_loss_mlp": 1.02179873, + "epoch": 0.42639410792123855, + "flos": 16834204586880.0, + "grad_norm": 2.9826896759777135, + "language_loss": 0.82820618, + "learning_rate": 2.5660294527138156e-06, + "loss": 0.84991634, + "num_input_tokens_seen": 152260840, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.12243652, + "step": 7092, + "time_per_iteration": 2.4887735843658447 + }, + { + "auxiliary_loss_clip": 0.01139254, + "auxiliary_loss_mlp": 0.01034225, + "balance_loss_clip": 1.0535233, + "balance_loss_mlp": 1.01965785, + "epoch": 0.4264542311739065, + "flos": 28763118195840.0, + "grad_norm": 1.7253460493624815, + "language_loss": 0.73767042, + "learning_rate": 2.565655903224038e-06, + "loss": 0.75940526, + "num_input_tokens_seen": 152280580, + "router_z_loss_clip": 0.85742188, + "router_z_loss_mlp": 0.14587402, + "step": 7093, + "time_per_iteration": 2.508667230606079 + }, + { + "auxiliary_loss_clip": 0.0113714, + "auxiliary_loss_mlp": 0.01040521, + "balance_loss_clip": 1.05648565, + "balance_loss_mlp": 1.02613235, + "epoch": 0.4265143544265745, + "flos": 24713615727360.0, + "grad_norm": 3.7175743013822644, + "language_loss": 0.70219672, + "learning_rate": 2.565282332284532e-06, + "loss": 0.72397333, + "num_input_tokens_seen": 152298455, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.1439209, + "step": 7094, + "time_per_iteration": 2.518872022628784 + }, + { + "auxiliary_loss_clip": 0.01134316, + "auxiliary_loss_mlp": 0.01045398, + "balance_loss_clip": 1.05145431, + "balance_loss_mlp": 1.02812409, + "epoch": 0.42657447767924245, + "flos": 21865971352320.0, + "grad_norm": 1.5619408059069817, + "language_loss": 0.81783354, + "learning_rate": 2.564908739909464e-06, + "loss": 0.8396306, + "num_input_tokens_seen": 152316995, + "router_z_loss_clip": 0.82910156, + "router_z_loss_mlp": 0.17260742, + "step": 7095, + "time_per_iteration": 2.4571354389190674 + }, + { + "auxiliary_loss_clip": 0.01136624, + "auxiliary_loss_mlp": 0.01048305, + "balance_loss_clip": 1.05159402, + "balance_loss_mlp": 1.0332135, + "epoch": 0.4266346009319104, + "flos": 21470236237440.0, + "grad_norm": 4.945283358281924, + "language_loss": 0.80677986, + "learning_rate": 2.5645351261129996e-06, + "loss": 0.82862914, + "num_input_tokens_seen": 152334800, + "router_z_loss_clip": 0.85009766, + "router_z_loss_mlp": 0.15093994, + "step": 7096, + "time_per_iteration": 2.6775505542755127 + }, + { + "auxiliary_loss_clip": 0.01146372, + "auxiliary_loss_mlp": 0.01038891, + "balance_loss_clip": 1.05937684, + "balance_loss_mlp": 1.02438319, + "epoch": 0.4266947241845784, + "flos": 25519379569920.0, + "grad_norm": 2.397647904050872, + "language_loss": 0.65585059, + "learning_rate": 2.5641614909093066e-06, + "loss": 0.6777032, + "num_input_tokens_seen": 152355175, + "router_z_loss_clip": 0.87011719, + "router_z_loss_mlp": 0.1449585, + "step": 7097, + "time_per_iteration": 2.492129325866699 + }, + { + "auxiliary_loss_clip": 0.01134973, + "auxiliary_loss_mlp": 0.01033955, + "balance_loss_clip": 1.05562747, + "balance_loss_mlp": 1.02012646, + "epoch": 0.42675484743724634, + "flos": 26541217676160.0, + "grad_norm": 1.6761506247118054, + "language_loss": 0.74206185, + "learning_rate": 2.5637878343125535e-06, + "loss": 0.76375109, + "num_input_tokens_seen": 152377245, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.13830566, + "step": 7098, + "time_per_iteration": 2.651604175567627 + }, + { + "auxiliary_loss_clip": 0.01138988, + "auxiliary_loss_mlp": 0.01029951, + "balance_loss_clip": 1.05803287, + "balance_loss_mlp": 1.01639664, + "epoch": 0.4268149706899143, + "flos": 23112718467840.0, + "grad_norm": 1.697026009419006, + "language_loss": 0.75400853, + "learning_rate": 2.5634141563369086e-06, + "loss": 0.77569795, + "num_input_tokens_seen": 152396985, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.13568115, + "step": 7099, + "time_per_iteration": 2.5068931579589844 + }, + { + "auxiliary_loss_clip": 0.01139728, + "auxiliary_loss_mlp": 0.01041221, + "balance_loss_clip": 1.05618465, + "balance_loss_mlp": 1.02640891, + "epoch": 0.4268750939425823, + "flos": 22706532495360.0, + "grad_norm": 2.209633973633855, + "language_loss": 0.83370727, + "learning_rate": 2.5630404569965432e-06, + "loss": 0.85551679, + "num_input_tokens_seen": 152415590, + "router_z_loss_clip": 0.83544922, + "router_z_loss_mlp": 0.14794922, + "step": 7100, + "time_per_iteration": 2.537167549133301 + }, + { + "auxiliary_loss_clip": 0.01136682, + "auxiliary_loss_mlp": 0.01034388, + "balance_loss_clip": 1.05401862, + "balance_loss_mlp": 1.02088773, + "epoch": 0.42693521719525024, + "flos": 25374875155200.0, + "grad_norm": 1.3803293903906364, + "language_loss": 0.82265091, + "learning_rate": 2.562666736305627e-06, + "loss": 0.84436166, + "num_input_tokens_seen": 152436735, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.13500977, + "step": 7101, + "time_per_iteration": 3.913269519805908 + }, + { + "auxiliary_loss_clip": 0.0113596, + "auxiliary_loss_mlp": 0.01038676, + "balance_loss_clip": 1.05010533, + "balance_loss_mlp": 1.02228451, + "epoch": 0.42699534044791826, + "flos": 18150689957760.0, + "grad_norm": 3.3969863105426406, + "language_loss": 0.73218769, + "learning_rate": 2.5622929942783314e-06, + "loss": 0.75393403, + "num_input_tokens_seen": 152455685, + "router_z_loss_clip": 0.85839844, + "router_z_loss_mlp": 0.16394043, + "step": 7102, + "time_per_iteration": 2.442413568496704 + }, + { + "auxiliary_loss_clip": 0.01127912, + "auxiliary_loss_mlp": 0.01038928, + "balance_loss_clip": 1.04811001, + "balance_loss_mlp": 1.02402651, + "epoch": 0.4270554637005862, + "flos": 13698413308800.0, + "grad_norm": 1.7384388556128207, + "language_loss": 0.82802171, + "learning_rate": 2.5619192309288297e-06, + "loss": 0.84969008, + "num_input_tokens_seen": 152473500, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.14904785, + "step": 7103, + "time_per_iteration": 2.4457204341888428 + }, + { + "auxiliary_loss_clip": 0.01141033, + "auxiliary_loss_mlp": 0.01033633, + "balance_loss_clip": 1.05392206, + "balance_loss_mlp": 1.01896977, + "epoch": 0.4271155869532542, + "flos": 17493596507520.0, + "grad_norm": 2.122723702895682, + "language_loss": 0.73721325, + "learning_rate": 2.561545446271294e-06, + "loss": 0.75895989, + "num_input_tokens_seen": 152491320, + "router_z_loss_clip": 0.87158203, + "router_z_loss_mlp": 0.14672852, + "step": 7104, + "time_per_iteration": 2.4416041374206543 + }, + { + "auxiliary_loss_clip": 0.01138693, + "auxiliary_loss_mlp": 0.01033365, + "balance_loss_clip": 1.05674112, + "balance_loss_mlp": 1.01814246, + "epoch": 0.42717571020592215, + "flos": 32452293381120.0, + "grad_norm": 1.850276233346572, + "language_loss": 0.74951673, + "learning_rate": 2.5611716403198987e-06, + "loss": 0.77123737, + "num_input_tokens_seen": 152511970, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.15228271, + "step": 7105, + "time_per_iteration": 2.551807165145874 + }, + { + "auxiliary_loss_clip": 0.01141378, + "auxiliary_loss_mlp": 0.01047273, + "balance_loss_clip": 1.05510318, + "balance_loss_mlp": 1.03321767, + "epoch": 0.4272358334585901, + "flos": 16253062444800.0, + "grad_norm": 2.5677196915414733, + "language_loss": 0.76854622, + "learning_rate": 2.560797813088819e-06, + "loss": 0.79043269, + "num_input_tokens_seen": 152530515, + "router_z_loss_clip": 0.86230469, + "router_z_loss_mlp": 0.140625, + "step": 7106, + "time_per_iteration": 2.430635690689087 + }, + { + "auxiliary_loss_clip": 0.01132011, + "auxiliary_loss_mlp": 0.01043575, + "balance_loss_clip": 1.04810286, + "balance_loss_mlp": 1.02961516, + "epoch": 0.4272959567112581, + "flos": 24200092938240.0, + "grad_norm": 1.7738703515770387, + "language_loss": 0.80211866, + "learning_rate": 2.560423964592229e-06, + "loss": 0.82387447, + "num_input_tokens_seen": 152549295, + "router_z_loss_clip": 0.83789062, + "router_z_loss_mlp": 0.13970947, + "step": 7107, + "time_per_iteration": 2.452803373336792 + }, + { + "auxiliary_loss_clip": 0.01136157, + "auxiliary_loss_mlp": 0.01046776, + "balance_loss_clip": 1.05391288, + "balance_loss_mlp": 1.03189921, + "epoch": 0.42735607996392605, + "flos": 27963495578880.0, + "grad_norm": 1.4495058639610878, + "language_loss": 0.68076909, + "learning_rate": 2.5600500948443075e-06, + "loss": 0.70259845, + "num_input_tokens_seen": 152570725, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.14892578, + "step": 7108, + "time_per_iteration": 2.562007427215576 + }, + { + "auxiliary_loss_clip": 0.01137109, + "auxiliary_loss_mlp": 0.01036831, + "balance_loss_clip": 1.05277514, + "balance_loss_mlp": 1.02318144, + "epoch": 0.427416203216594, + "flos": 20295597674880.0, + "grad_norm": 1.7084971458287217, + "language_loss": 0.71362096, + "learning_rate": 2.5596762038592294e-06, + "loss": 0.73536038, + "num_input_tokens_seen": 152588950, + "router_z_loss_clip": 0.84326172, + "router_z_loss_mlp": 0.13653564, + "step": 7109, + "time_per_iteration": 2.433898687362671 + }, + { + "auxiliary_loss_clip": 0.0113802, + "auxiliary_loss_mlp": 0.01039696, + "balance_loss_clip": 1.0546422, + "balance_loss_mlp": 1.02311444, + "epoch": 0.427476326469262, + "flos": 26943955943040.0, + "grad_norm": 2.487697434408835, + "language_loss": 0.64932263, + "learning_rate": 2.559302291651174e-06, + "loss": 0.67109978, + "num_input_tokens_seen": 152608965, + "router_z_loss_clip": 0.83251953, + "router_z_loss_mlp": 0.16577148, + "step": 7110, + "time_per_iteration": 2.5965137481689453 + }, + { + "auxiliary_loss_clip": 0.01136236, + "auxiliary_loss_mlp": 0.01035308, + "balance_loss_clip": 1.05448866, + "balance_loss_mlp": 1.02010846, + "epoch": 0.42753644972192995, + "flos": 25702847262720.0, + "grad_norm": 1.6320613305263192, + "language_loss": 0.76459295, + "learning_rate": 2.5589283582343197e-06, + "loss": 0.78630841, + "num_input_tokens_seen": 152630220, + "router_z_loss_clip": 0.81738281, + "router_z_loss_mlp": 0.1519165, + "step": 7111, + "time_per_iteration": 2.487492322921753 + }, + { + "auxiliary_loss_clip": 0.01134979, + "auxiliary_loss_mlp": 0.01034212, + "balance_loss_clip": 1.05225015, + "balance_loss_mlp": 1.02056909, + "epoch": 0.4275965729745979, + "flos": 18767419499520.0, + "grad_norm": 1.7337271526231404, + "language_loss": 0.72960824, + "learning_rate": 2.558554403622845e-06, + "loss": 0.75130022, + "num_input_tokens_seen": 152648835, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.13653564, + "step": 7112, + "time_per_iteration": 3.8338751792907715 + }, + { + "auxiliary_loss_clip": 0.01137539, + "auxiliary_loss_mlp": 0.01040341, + "balance_loss_clip": 1.05546999, + "balance_loss_mlp": 1.026793, + "epoch": 0.4276566962272659, + "flos": 23764424878080.0, + "grad_norm": 1.51671205558839, + "language_loss": 0.71484649, + "learning_rate": 2.5581804278309323e-06, + "loss": 0.73662531, + "num_input_tokens_seen": 152668375, + "router_z_loss_clip": 0.82080078, + "router_z_loss_mlp": 0.13537598, + "step": 7113, + "time_per_iteration": 2.5381338596343994 + }, + { + "auxiliary_loss_clip": 0.01142688, + "auxiliary_loss_mlp": 0.01040712, + "balance_loss_clip": 1.05782127, + "balance_loss_mlp": 1.0263474, + "epoch": 0.42771681947993384, + "flos": 22492505306880.0, + "grad_norm": 1.838924314977121, + "language_loss": 0.61894977, + "learning_rate": 2.5578064308727617e-06, + "loss": 0.64078379, + "num_input_tokens_seen": 152689725, + "router_z_loss_clip": 0.84912109, + "router_z_loss_mlp": 0.14355469, + "step": 7114, + "time_per_iteration": 2.4764463901519775 + }, + { + "auxiliary_loss_clip": 0.01145018, + "auxiliary_loss_mlp": 0.01041765, + "balance_loss_clip": 1.05766463, + "balance_loss_mlp": 1.025195, + "epoch": 0.42777694273260186, + "flos": 25044712318080.0, + "grad_norm": 1.857013199942006, + "language_loss": 0.64811796, + "learning_rate": 2.5574324127625153e-06, + "loss": 0.66998577, + "num_input_tokens_seen": 152709375, + "router_z_loss_clip": 0.87304688, + "router_z_loss_mlp": 0.16552734, + "step": 7115, + "time_per_iteration": 2.5066988468170166 + }, + { + "auxiliary_loss_clip": 0.01150789, + "auxiliary_loss_mlp": 0.01035123, + "balance_loss_clip": 1.06634557, + "balance_loss_mlp": 1.0218848, + "epoch": 0.4278370659852698, + "flos": 18661519226880.0, + "grad_norm": 1.7212040936893371, + "language_loss": 0.73508537, + "learning_rate": 2.5570583735143753e-06, + "loss": 0.75694448, + "num_input_tokens_seen": 152727510, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.13244629, + "step": 7116, + "time_per_iteration": 2.464704751968384 + }, + { + "auxiliary_loss_clip": 0.01133259, + "auxiliary_loss_mlp": 0.01039394, + "balance_loss_clip": 1.05430984, + "balance_loss_mlp": 1.02638841, + "epoch": 0.4278971892379378, + "flos": 27308269635840.0, + "grad_norm": 1.6646639586995373, + "language_loss": 0.69196987, + "learning_rate": 2.5566843131425275e-06, + "loss": 0.71369648, + "num_input_tokens_seen": 152746670, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.13000488, + "step": 7117, + "time_per_iteration": 2.5395238399505615 + }, + { + "auxiliary_loss_clip": 0.01135897, + "auxiliary_loss_mlp": 0.01045333, + "balance_loss_clip": 1.05229974, + "balance_loss_mlp": 1.03073013, + "epoch": 0.42795731249060576, + "flos": 12888698970240.0, + "grad_norm": 2.7258538668780723, + "language_loss": 0.70260221, + "learning_rate": 2.5563102316611536e-06, + "loss": 0.72441447, + "num_input_tokens_seen": 152760545, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.14599609, + "step": 7118, + "time_per_iteration": 2.399266004562378 + }, + { + "auxiliary_loss_clip": 0.0113315, + "auxiliary_loss_mlp": 0.01049161, + "balance_loss_clip": 1.05122447, + "balance_loss_mlp": 1.03362811, + "epoch": 0.4280174357432737, + "flos": 33401448316800.0, + "grad_norm": 1.6755713043724385, + "language_loss": 0.74751949, + "learning_rate": 2.55593612908444e-06, + "loss": 0.7693426, + "num_input_tokens_seen": 152780970, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.15527344, + "step": 7119, + "time_per_iteration": 2.564098596572876 + }, + { + "auxiliary_loss_clip": 0.01135289, + "auxiliary_loss_mlp": 0.01034506, + "balance_loss_clip": 1.05364788, + "balance_loss_mlp": 1.02069569, + "epoch": 0.4280775589959417, + "flos": 18259104182400.0, + "grad_norm": 2.1332153275886943, + "language_loss": 0.74877167, + "learning_rate": 2.555562005426573e-06, + "loss": 0.77046967, + "num_input_tokens_seen": 152798475, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.13812256, + "step": 7120, + "time_per_iteration": 2.4537172317504883 + }, + { + "auxiliary_loss_clip": 0.01141761, + "auxiliary_loss_mlp": 0.01036903, + "balance_loss_clip": 1.06060457, + "balance_loss_mlp": 1.02364731, + "epoch": 0.42813768224860965, + "flos": 21471277731840.0, + "grad_norm": 1.7851960092835137, + "language_loss": 0.77025592, + "learning_rate": 2.5551878607017385e-06, + "loss": 0.79204261, + "num_input_tokens_seen": 152817555, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.13269043, + "step": 7121, + "time_per_iteration": 2.4954168796539307 + }, + { + "auxiliary_loss_clip": 0.01131318, + "auxiliary_loss_mlp": 0.01031516, + "balance_loss_clip": 1.0505197, + "balance_loss_mlp": 1.01879632, + "epoch": 0.4281978055012776, + "flos": 15669262696320.0, + "grad_norm": 2.0377117661473436, + "language_loss": 0.85759455, + "learning_rate": 2.554813694924126e-06, + "loss": 0.87922293, + "num_input_tokens_seen": 152836295, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.1272583, + "step": 7122, + "time_per_iteration": 2.4428563117980957 + }, + { + "auxiliary_loss_clip": 0.01138494, + "auxiliary_loss_mlp": 0.01034428, + "balance_loss_clip": 1.05548286, + "balance_loss_mlp": 1.02065945, + "epoch": 0.4282579287539456, + "flos": 17712005155200.0, + "grad_norm": 2.614361838879552, + "language_loss": 0.81582785, + "learning_rate": 2.554439508107921e-06, + "loss": 0.83755714, + "num_input_tokens_seen": 152854950, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.13757324, + "step": 7123, + "time_per_iteration": 2.635070562362671 + }, + { + "auxiliary_loss_clip": 0.01137621, + "auxiliary_loss_mlp": 0.01032354, + "balance_loss_clip": 1.05730247, + "balance_loss_mlp": 1.01885366, + "epoch": 0.42831805200661355, + "flos": 19281157770240.0, + "grad_norm": 1.5629408557971918, + "language_loss": 0.80880165, + "learning_rate": 2.5540653002673153e-06, + "loss": 0.83050144, + "num_input_tokens_seen": 152873995, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.13513184, + "step": 7124, + "time_per_iteration": 2.4436585903167725 + }, + { + "auxiliary_loss_clip": 0.01136191, + "auxiliary_loss_mlp": 0.01040147, + "balance_loss_clip": 1.05473542, + "balance_loss_mlp": 1.02552056, + "epoch": 0.4283781752592815, + "flos": 19792633484160.0, + "grad_norm": 1.6993398684800656, + "language_loss": 0.80240929, + "learning_rate": 2.553691071416498e-06, + "loss": 0.82417262, + "num_input_tokens_seen": 152892925, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.14642334, + "step": 7125, + "time_per_iteration": 3.8941917419433594 + }, + { + "auxiliary_loss_clip": 0.01140255, + "auxiliary_loss_mlp": 0.01032838, + "balance_loss_clip": 1.06041038, + "balance_loss_mlp": 1.02010036, + "epoch": 0.4284382985119495, + "flos": 16508064072960.0, + "grad_norm": 1.9723371474338303, + "language_loss": 0.75338, + "learning_rate": 2.553316821569659e-06, + "loss": 0.77511096, + "num_input_tokens_seen": 152910935, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.12738037, + "step": 7126, + "time_per_iteration": 2.4759128093719482 + }, + { + "auxiliary_loss_clip": 0.01132668, + "auxiliary_loss_mlp": 0.01034489, + "balance_loss_clip": 1.05275619, + "balance_loss_mlp": 1.02059484, + "epoch": 0.42849842176461744, + "flos": 23330767979520.0, + "grad_norm": 1.653140016943561, + "language_loss": 0.81410617, + "learning_rate": 2.5529425507409913e-06, + "loss": 0.83577776, + "num_input_tokens_seen": 152931030, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.13897705, + "step": 7127, + "time_per_iteration": 2.5144739151000977 + }, + { + "auxiliary_loss_clip": 0.01141676, + "auxiliary_loss_mlp": 0.01038787, + "balance_loss_clip": 1.06095099, + "balance_loss_mlp": 1.02501225, + "epoch": 0.4285585450172854, + "flos": 17274433674240.0, + "grad_norm": 2.2893689915002584, + "language_loss": 0.76424313, + "learning_rate": 2.5525682589446867e-06, + "loss": 0.7860477, + "num_input_tokens_seen": 152948085, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.13763428, + "step": 7128, + "time_per_iteration": 2.4604804515838623 + }, + { + "auxiliary_loss_clip": 0.01133725, + "auxiliary_loss_mlp": 0.01038106, + "balance_loss_clip": 1.05226922, + "balance_loss_mlp": 1.02409852, + "epoch": 0.42861866826995343, + "flos": 24279599692800.0, + "grad_norm": 2.457684406960515, + "language_loss": 0.73664135, + "learning_rate": 2.552193946194937e-06, + "loss": 0.75835967, + "num_input_tokens_seen": 152966265, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.14013672, + "step": 7129, + "time_per_iteration": 2.502371072769165 + }, + { + "auxiliary_loss_clip": 0.01142313, + "auxiliary_loss_mlp": 0.01033856, + "balance_loss_clip": 1.0621717, + "balance_loss_mlp": 1.02012885, + "epoch": 0.4286787915226214, + "flos": 24353108876160.0, + "grad_norm": 1.9415711461453153, + "language_loss": 0.77474821, + "learning_rate": 2.5518196125059394e-06, + "loss": 0.7965098, + "num_input_tokens_seen": 152986775, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.1373291, + "step": 7130, + "time_per_iteration": 3.8965725898742676 + }, + { + "auxiliary_loss_clip": 0.01146943, + "auxiliary_loss_mlp": 0.01035917, + "balance_loss_clip": 1.06400061, + "balance_loss_mlp": 1.02161145, + "epoch": 0.42873891477528936, + "flos": 15449992122240.0, + "grad_norm": 2.0299992401735034, + "language_loss": 0.73383415, + "learning_rate": 2.551445257891886e-06, + "loss": 0.7556628, + "num_input_tokens_seen": 153003595, + "router_z_loss_clip": 0.82861328, + "router_z_loss_mlp": 0.14312744, + "step": 7131, + "time_per_iteration": 2.480825901031494 + }, + { + "auxiliary_loss_clip": 0.01138797, + "auxiliary_loss_mlp": 0.01039074, + "balance_loss_clip": 1.05639815, + "balance_loss_mlp": 1.02470374, + "epoch": 0.4287990380279573, + "flos": 17639573379840.0, + "grad_norm": 2.313024553557765, + "language_loss": 0.7706098, + "learning_rate": 2.551070882366973e-06, + "loss": 0.7923885, + "num_input_tokens_seen": 153021960, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.14373779, + "step": 7132, + "time_per_iteration": 2.419172525405884 + }, + { + "auxiliary_loss_clip": 0.01133177, + "auxiliary_loss_mlp": 0.01038453, + "balance_loss_clip": 1.05225158, + "balance_loss_mlp": 1.02460718, + "epoch": 0.4288591612806253, + "flos": 27162328677120.0, + "grad_norm": 1.5380221470874624, + "language_loss": 0.78681791, + "learning_rate": 2.550696485945397e-06, + "loss": 0.8085342, + "num_input_tokens_seen": 153042110, + "router_z_loss_clip": 0.80957031, + "router_z_loss_mlp": 0.13842773, + "step": 7133, + "time_per_iteration": 2.4658095836639404 + }, + { + "auxiliary_loss_clip": 0.01140829, + "auxiliary_loss_mlp": 0.01032428, + "balance_loss_clip": 1.05897498, + "balance_loss_mlp": 1.01899874, + "epoch": 0.42891928453329325, + "flos": 17163182275200.0, + "grad_norm": 1.8484623243733846, + "language_loss": 0.74660659, + "learning_rate": 2.550322068641355e-06, + "loss": 0.76833916, + "num_input_tokens_seen": 153058925, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.13427734, + "step": 7134, + "time_per_iteration": 2.4084599018096924 + }, + { + "auxiliary_loss_clip": 0.01136008, + "auxiliary_loss_mlp": 0.01027588, + "balance_loss_clip": 1.05599499, + "balance_loss_mlp": 1.0151782, + "epoch": 0.4289794077859612, + "flos": 18187031543040.0, + "grad_norm": 1.8580230845239882, + "language_loss": 0.84194672, + "learning_rate": 2.5499476304690455e-06, + "loss": 0.86358273, + "num_input_tokens_seen": 153078070, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.12402344, + "step": 7135, + "time_per_iteration": 2.417694330215454 + }, + { + "auxiliary_loss_clip": 0.01142422, + "auxiliary_loss_mlp": 0.01032272, + "balance_loss_clip": 1.06455564, + "balance_loss_mlp": 1.01875949, + "epoch": 0.4290395310386292, + "flos": 28256885867520.0, + "grad_norm": 1.9333671768396667, + "language_loss": 0.75065327, + "learning_rate": 2.549573171442666e-06, + "loss": 0.7724002, + "num_input_tokens_seen": 153096680, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.13519287, + "step": 7136, + "time_per_iteration": 2.6433169841766357 + }, + { + "auxiliary_loss_clip": 0.01124464, + "auxiliary_loss_mlp": 0.01034631, + "balance_loss_clip": 1.04336965, + "balance_loss_mlp": 1.02060032, + "epoch": 0.42909965429129715, + "flos": 16216074414720.0, + "grad_norm": 2.3571275179785713, + "language_loss": 0.79303962, + "learning_rate": 2.5491986915764175e-06, + "loss": 0.81463057, + "num_input_tokens_seen": 153113305, + "router_z_loss_clip": 0.80957031, + "router_z_loss_mlp": 0.14007568, + "step": 7137, + "time_per_iteration": 2.529742479324341 + }, + { + "auxiliary_loss_clip": 0.01141376, + "auxiliary_loss_mlp": 0.01037589, + "balance_loss_clip": 1.05711246, + "balance_loss_mlp": 1.02176952, + "epoch": 0.4291597775439651, + "flos": 23112862122240.0, + "grad_norm": 3.1591759558773362, + "language_loss": 0.77024949, + "learning_rate": 2.548824190884499e-06, + "loss": 0.79203916, + "num_input_tokens_seen": 153132735, + "router_z_loss_clip": 0.84277344, + "router_z_loss_mlp": 0.15814209, + "step": 7138, + "time_per_iteration": 2.4891393184661865 + }, + { + "auxiliary_loss_clip": 0.01076548, + "auxiliary_loss_mlp": 0.01011021, + "balance_loss_clip": 1.04644823, + "balance_loss_mlp": 1.00924528, + "epoch": 0.4292199007966331, + "flos": 67546212681600.0, + "grad_norm": 0.7723823747780798, + "language_loss": 0.56218195, + "learning_rate": 2.548449669381113e-06, + "loss": 0.58305764, + "num_input_tokens_seen": 153187925, + "router_z_loss_clip": 0.30175781, + "router_z_loss_mlp": 0.01776123, + "step": 7139, + "time_per_iteration": 2.9491403102874756 + }, + { + "auxiliary_loss_clip": 0.01124205, + "auxiliary_loss_mlp": 0.01035655, + "balance_loss_clip": 1.04736185, + "balance_loss_mlp": 1.02396083, + "epoch": 0.42928002404930105, + "flos": 22999850956800.0, + "grad_norm": 1.6613074890747448, + "language_loss": 0.8097595, + "learning_rate": 2.5480751270804595e-06, + "loss": 0.83135808, + "num_input_tokens_seen": 153206990, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.11706543, + "step": 7140, + "time_per_iteration": 2.501323699951172 + }, + { + "auxiliary_loss_clip": 0.01126142, + "auxiliary_loss_mlp": 0.01032306, + "balance_loss_clip": 1.04616666, + "balance_loss_mlp": 1.01863837, + "epoch": 0.429340147301969, + "flos": 11544922241280.0, + "grad_norm": 1.789851070546887, + "language_loss": 0.8199836, + "learning_rate": 2.5477005639967424e-06, + "loss": 0.84156811, + "num_input_tokens_seen": 153222345, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.13671875, + "step": 7141, + "time_per_iteration": 2.4474127292633057 + }, + { + "auxiliary_loss_clip": 0.01136801, + "auxiliary_loss_mlp": 0.01039119, + "balance_loss_clip": 1.05451608, + "balance_loss_mlp": 1.02453995, + "epoch": 0.42940027055463703, + "flos": 25264988472960.0, + "grad_norm": 1.7943841611109839, + "language_loss": 0.8650701, + "learning_rate": 2.547325980144166e-06, + "loss": 0.88682938, + "num_input_tokens_seen": 153240570, + "router_z_loss_clip": 0.82226562, + "router_z_loss_mlp": 0.14562988, + "step": 7142, + "time_per_iteration": 2.5248756408691406 + }, + { + "auxiliary_loss_clip": 0.01135205, + "auxiliary_loss_mlp": 0.01030344, + "balance_loss_clip": 1.0565505, + "balance_loss_mlp": 1.01726627, + "epoch": 0.429460393807305, + "flos": 23805004268160.0, + "grad_norm": 1.8169166589737138, + "language_loss": 0.77792907, + "learning_rate": 2.5469513755369323e-06, + "loss": 0.79958451, + "num_input_tokens_seen": 153259575, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.13079834, + "step": 7143, + "time_per_iteration": 2.5092732906341553 + }, + { + "auxiliary_loss_clip": 0.01135988, + "auxiliary_loss_mlp": 0.01035701, + "balance_loss_clip": 1.05749273, + "balance_loss_mlp": 1.02283216, + "epoch": 0.42952051705997296, + "flos": 13918294414080.0, + "grad_norm": 2.2006105132533125, + "language_loss": 0.76791668, + "learning_rate": 2.5465767501892484e-06, + "loss": 0.78963363, + "num_input_tokens_seen": 153276650, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.12872314, + "step": 7144, + "time_per_iteration": 2.512146472930908 + }, + { + "auxiliary_loss_clip": 0.01138123, + "auxiliary_loss_mlp": 0.01034924, + "balance_loss_clip": 1.05466294, + "balance_loss_mlp": 1.02085698, + "epoch": 0.4295806403126409, + "flos": 26760380509440.0, + "grad_norm": 1.7591652870261765, + "language_loss": 0.73329687, + "learning_rate": 2.54620210411532e-06, + "loss": 0.75502735, + "num_input_tokens_seen": 153298025, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.14074707, + "step": 7145, + "time_per_iteration": 4.037315607070923 + }, + { + "auxiliary_loss_clip": 0.01136816, + "auxiliary_loss_mlp": 0.01034282, + "balance_loss_clip": 1.05409408, + "balance_loss_mlp": 1.02003098, + "epoch": 0.4296407635653089, + "flos": 20952619297920.0, + "grad_norm": 1.9400809428485901, + "language_loss": 0.79400718, + "learning_rate": 2.545827437329352e-06, + "loss": 0.81571823, + "num_input_tokens_seen": 153315775, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.14251709, + "step": 7146, + "time_per_iteration": 2.4389488697052 + }, + { + "auxiliary_loss_clip": 0.01127411, + "auxiliary_loss_mlp": 0.01032081, + "balance_loss_clip": 1.04944277, + "balance_loss_mlp": 1.01955831, + "epoch": 0.42970088681797686, + "flos": 15852335339520.0, + "grad_norm": 2.1975527954055574, + "language_loss": 0.82453877, + "learning_rate": 2.5454527498455532e-06, + "loss": 0.84613365, + "num_input_tokens_seen": 153332765, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.12512207, + "step": 7147, + "time_per_iteration": 2.500213384628296 + }, + { + "auxiliary_loss_clip": 0.01131961, + "auxiliary_loss_mlp": 0.01036829, + "balance_loss_clip": 1.05044389, + "balance_loss_mlp": 1.02188027, + "epoch": 0.4297610100706448, + "flos": 22382618624640.0, + "grad_norm": 1.8481053926945223, + "language_loss": 0.87135744, + "learning_rate": 2.545078041678131e-06, + "loss": 0.89304537, + "num_input_tokens_seen": 153350760, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.1494751, + "step": 7148, + "time_per_iteration": 2.561002492904663 + }, + { + "auxiliary_loss_clip": 0.0113072, + "auxiliary_loss_mlp": 0.01036423, + "balance_loss_clip": 1.05066872, + "balance_loss_mlp": 1.02342296, + "epoch": 0.4298211333233128, + "flos": 27925681536000.0, + "grad_norm": 1.883089897753788, + "language_loss": 0.78029931, + "learning_rate": 2.5447033128412957e-06, + "loss": 0.80197072, + "num_input_tokens_seen": 153370765, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.13000488, + "step": 7149, + "time_per_iteration": 2.6905410289764404 + }, + { + "auxiliary_loss_clip": 0.0112233, + "auxiliary_loss_mlp": 0.01036321, + "balance_loss_clip": 1.0456202, + "balance_loss_mlp": 1.02287984, + "epoch": 0.42988125657598075, + "flos": 24425612478720.0, + "grad_norm": 1.8447252340758007, + "language_loss": 0.79980731, + "learning_rate": 2.544328563349256e-06, + "loss": 0.82139379, + "num_input_tokens_seen": 153390725, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.13439941, + "step": 7150, + "time_per_iteration": 2.523754358291626 + }, + { + "auxiliary_loss_clip": 0.01140957, + "auxiliary_loss_mlp": 0.01043578, + "balance_loss_clip": 1.05675936, + "balance_loss_mlp": 1.02818775, + "epoch": 0.4299413798286487, + "flos": 15850180523520.0, + "grad_norm": 1.7516684317165838, + "language_loss": 0.74954611, + "learning_rate": 2.5439537932162222e-06, + "loss": 0.77139151, + "num_input_tokens_seen": 153408010, + "router_z_loss_clip": 0.84228516, + "router_z_loss_mlp": 0.1539917, + "step": 7151, + "time_per_iteration": 2.4885194301605225 + }, + { + "auxiliary_loss_clip": 0.011335, + "auxiliary_loss_mlp": 0.01032413, + "balance_loss_clip": 1.05204868, + "balance_loss_mlp": 1.01833475, + "epoch": 0.4300015030813167, + "flos": 22309504490880.0, + "grad_norm": 2.4653628420618356, + "language_loss": 0.70033062, + "learning_rate": 2.543579002456406e-06, + "loss": 0.72198975, + "num_input_tokens_seen": 153426865, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.14080811, + "step": 7152, + "time_per_iteration": 2.467653274536133 + }, + { + "auxiliary_loss_clip": 0.01130393, + "auxiliary_loss_mlp": 0.0103672, + "balance_loss_clip": 1.0495441, + "balance_loss_mlp": 1.02349937, + "epoch": 0.43006162633398465, + "flos": 34897666366080.0, + "grad_norm": 3.00543561743066, + "language_loss": 0.71485555, + "learning_rate": 2.54320419108402e-06, + "loss": 0.73652661, + "num_input_tokens_seen": 153449410, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.13238525, + "step": 7153, + "time_per_iteration": 2.578613758087158 + }, + { + "auxiliary_loss_clip": 0.01134038, + "auxiliary_loss_mlp": 0.01032164, + "balance_loss_clip": 1.05303836, + "balance_loss_mlp": 1.0186162, + "epoch": 0.4301217495866526, + "flos": 15961575576960.0, + "grad_norm": 2.0187772009212464, + "language_loss": 0.78108686, + "learning_rate": 2.542829359113276e-06, + "loss": 0.80274886, + "num_input_tokens_seen": 153467910, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.13543701, + "step": 7154, + "time_per_iteration": 2.42101788520813 + }, + { + "auxiliary_loss_clip": 0.0113325, + "auxiliary_loss_mlp": 0.0104025, + "balance_loss_clip": 1.05354297, + "balance_loss_mlp": 1.02562332, + "epoch": 0.43018187283932063, + "flos": 18770364414720.0, + "grad_norm": 1.5265821029988669, + "language_loss": 0.79128182, + "learning_rate": 2.542454506558389e-06, + "loss": 0.81301677, + "num_input_tokens_seen": 153487100, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.14624023, + "step": 7155, + "time_per_iteration": 2.5154364109039307 + }, + { + "auxiliary_loss_clip": 0.01131473, + "auxiliary_loss_mlp": 0.01030572, + "balance_loss_clip": 1.05408812, + "balance_loss_mlp": 1.01806128, + "epoch": 0.4302419960919886, + "flos": 20151703791360.0, + "grad_norm": 1.8276075493648487, + "language_loss": 0.88861668, + "learning_rate": 2.5420796334335723e-06, + "loss": 0.91023707, + "num_input_tokens_seen": 153505565, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.12506104, + "step": 7156, + "time_per_iteration": 3.8600361347198486 + }, + { + "auxiliary_loss_clip": 0.01141322, + "auxiliary_loss_mlp": 0.01034951, + "balance_loss_clip": 1.05988312, + "balance_loss_mlp": 1.0204556, + "epoch": 0.43030211934465656, + "flos": 26432731624320.0, + "grad_norm": 2.210522788909314, + "language_loss": 0.83398473, + "learning_rate": 2.541704739753042e-06, + "loss": 0.85574746, + "num_input_tokens_seen": 153526130, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.14489746, + "step": 7157, + "time_per_iteration": 2.496185541152954 + }, + { + "auxiliary_loss_clip": 0.01138498, + "auxiliary_loss_mlp": 0.01036097, + "balance_loss_clip": 1.05521107, + "balance_loss_mlp": 1.02164268, + "epoch": 0.43036224259732453, + "flos": 24389234979840.0, + "grad_norm": 1.9249567738267284, + "language_loss": 0.71870828, + "learning_rate": 2.5413298255310132e-06, + "loss": 0.74045426, + "num_input_tokens_seen": 153546370, + "router_z_loss_clip": 0.83251953, + "router_z_loss_mlp": 0.14434814, + "step": 7158, + "time_per_iteration": 2.5075440406799316 + }, + { + "auxiliary_loss_clip": 0.01126095, + "auxiliary_loss_mlp": 0.01039389, + "balance_loss_clip": 1.04538369, + "balance_loss_mlp": 1.02416003, + "epoch": 0.4304223658499925, + "flos": 17201714590080.0, + "grad_norm": 1.8469358545106984, + "language_loss": 0.82782269, + "learning_rate": 2.5409548907817034e-06, + "loss": 0.84947753, + "num_input_tokens_seen": 153562800, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.15228271, + "step": 7159, + "time_per_iteration": 2.429542064666748 + }, + { + "auxiliary_loss_clip": 0.01140571, + "auxiliary_loss_mlp": 0.01032558, + "balance_loss_clip": 1.0594635, + "balance_loss_mlp": 1.01942134, + "epoch": 0.43048248910266046, + "flos": 14903000835840.0, + "grad_norm": 2.575467372789934, + "language_loss": 0.82627434, + "learning_rate": 2.54057993551933e-06, + "loss": 0.84800565, + "num_input_tokens_seen": 153578395, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.13128662, + "step": 7160, + "time_per_iteration": 2.4280471801757812 + }, + { + "auxiliary_loss_clip": 0.01140678, + "auxiliary_loss_mlp": 0.01042306, + "balance_loss_clip": 1.05710793, + "balance_loss_mlp": 1.02629662, + "epoch": 0.4305426123553284, + "flos": 21579835610880.0, + "grad_norm": 2.1891001234664573, + "language_loss": 0.78235823, + "learning_rate": 2.5402049597581116e-06, + "loss": 0.80418807, + "num_input_tokens_seen": 153596880, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.16003418, + "step": 7161, + "time_per_iteration": 2.469961166381836 + }, + { + "auxiliary_loss_clip": 0.01137105, + "auxiliary_loss_mlp": 0.01038908, + "balance_loss_clip": 1.05456364, + "balance_loss_mlp": 1.02558625, + "epoch": 0.4306027356079964, + "flos": 22601278667520.0, + "grad_norm": 2.1110921750883533, + "language_loss": 0.72946703, + "learning_rate": 2.5398299635122662e-06, + "loss": 0.75122714, + "num_input_tokens_seen": 153616570, + "router_z_loss_clip": 0.82568359, + "router_z_loss_mlp": 0.13317871, + "step": 7162, + "time_per_iteration": 2.579040765762329 + }, + { + "auxiliary_loss_clip": 0.01064234, + "auxiliary_loss_mlp": 0.01007344, + "balance_loss_clip": 1.0355624, + "balance_loss_mlp": 1.00581932, + "epoch": 0.43066285886066435, + "flos": 70672091806080.0, + "grad_norm": 0.8153562342661163, + "language_loss": 0.59025264, + "learning_rate": 2.5394549467960147e-06, + "loss": 0.61096835, + "num_input_tokens_seen": 153671450, + "router_z_loss_clip": 0.28662109, + "router_z_loss_mlp": 0.01524353, + "step": 7163, + "time_per_iteration": 2.993042469024658 + }, + { + "auxiliary_loss_clip": 0.01123109, + "auxiliary_loss_mlp": 0.01037451, + "balance_loss_clip": 1.04493308, + "balance_loss_mlp": 1.02371836, + "epoch": 0.4307229821133323, + "flos": 26720591218560.0, + "grad_norm": 1.681293056516737, + "language_loss": 0.79325449, + "learning_rate": 2.5390799096235783e-06, + "loss": 0.81486011, + "num_input_tokens_seen": 153691405, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.13739014, + "step": 7164, + "time_per_iteration": 2.5171544551849365 + }, + { + "auxiliary_loss_clip": 0.0114076, + "auxiliary_loss_mlp": 0.01040137, + "balance_loss_clip": 1.0563333, + "balance_loss_mlp": 1.0259515, + "epoch": 0.4307831053660003, + "flos": 26177119464960.0, + "grad_norm": 1.7654485340859665, + "language_loss": 0.68021262, + "learning_rate": 2.538704852009177e-06, + "loss": 0.7020216, + "num_input_tokens_seen": 153711555, + "router_z_loss_clip": 0.84375, + "router_z_loss_mlp": 0.14178467, + "step": 7165, + "time_per_iteration": 2.5031514167785645 + }, + { + "auxiliary_loss_clip": 0.01133744, + "auxiliary_loss_mlp": 0.01046221, + "balance_loss_clip": 1.05405641, + "balance_loss_mlp": 1.03333402, + "epoch": 0.43084322861866825, + "flos": 18910343715840.0, + "grad_norm": 1.9428466902407588, + "language_loss": 0.75295848, + "learning_rate": 2.538329773967034e-06, + "loss": 0.7747581, + "num_input_tokens_seen": 153730095, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.12890625, + "step": 7166, + "time_per_iteration": 2.494654655456543 + }, + { + "auxiliary_loss_clip": 0.01129163, + "auxiliary_loss_mlp": 0.01040608, + "balance_loss_clip": 1.05078566, + "balance_loss_mlp": 1.02725017, + "epoch": 0.4309033518713362, + "flos": 26432911192320.0, + "grad_norm": 1.7735403446433748, + "language_loss": 0.71938527, + "learning_rate": 2.537954675511372e-06, + "loss": 0.74108297, + "num_input_tokens_seen": 153749320, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.13354492, + "step": 7167, + "time_per_iteration": 2.5073132514953613 + }, + { + "auxiliary_loss_clip": 0.01125892, + "auxiliary_loss_mlp": 0.01031881, + "balance_loss_clip": 1.04941714, + "balance_loss_mlp": 1.01947713, + "epoch": 0.43096347512400424, + "flos": 21213295274880.0, + "grad_norm": 1.5220782662880512, + "language_loss": 0.78343147, + "learning_rate": 2.537579556656414e-06, + "loss": 0.80500925, + "num_input_tokens_seen": 153767825, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.12408447, + "step": 7168, + "time_per_iteration": 3.9553046226501465 + }, + { + "auxiliary_loss_clip": 0.01137597, + "auxiliary_loss_mlp": 0.01041698, + "balance_loss_clip": 1.05547476, + "balance_loss_mlp": 1.02791703, + "epoch": 0.4310235983766722, + "flos": 16540131939840.0, + "grad_norm": 2.1161956545781533, + "language_loss": 0.82694995, + "learning_rate": 2.537204417416387e-06, + "loss": 0.84874296, + "num_input_tokens_seen": 153785350, + "router_z_loss_clip": 0.82080078, + "router_z_loss_mlp": 0.13781738, + "step": 7169, + "time_per_iteration": 2.4658308029174805 + }, + { + "auxiliary_loss_clip": 0.01063283, + "auxiliary_loss_mlp": 0.01003515, + "balance_loss_clip": 1.03441691, + "balance_loss_mlp": 1.00187635, + "epoch": 0.43108372162934017, + "flos": 64775704763520.0, + "grad_norm": 0.7038442424773441, + "language_loss": 0.6075381, + "learning_rate": 2.5368292578055132e-06, + "loss": 0.62820613, + "num_input_tokens_seen": 153856400, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.01637268, + "step": 7170, + "time_per_iteration": 3.240762710571289 + }, + { + "auxiliary_loss_clip": 0.01127684, + "auxiliary_loss_mlp": 0.01031294, + "balance_loss_clip": 1.04853249, + "balance_loss_mlp": 1.01794815, + "epoch": 0.43114384488200813, + "flos": 13444094039040.0, + "grad_norm": 1.8699258052645462, + "language_loss": 0.75838953, + "learning_rate": 2.536454077838021e-06, + "loss": 0.77997923, + "num_input_tokens_seen": 153875230, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.13342285, + "step": 7171, + "time_per_iteration": 2.4891655445098877 + }, + { + "auxiliary_loss_clip": 0.01129813, + "auxiliary_loss_mlp": 0.01032276, + "balance_loss_clip": 1.05081189, + "balance_loss_mlp": 1.0197829, + "epoch": 0.4312039681346761, + "flos": 26286682924800.0, + "grad_norm": 1.774354851469812, + "language_loss": 0.77505964, + "learning_rate": 2.5360788775281357e-06, + "loss": 0.79668057, + "num_input_tokens_seen": 153894740, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.12481689, + "step": 7172, + "time_per_iteration": 2.561622142791748 + }, + { + "auxiliary_loss_clip": 0.01137586, + "auxiliary_loss_mlp": 0.01036261, + "balance_loss_clip": 1.05400157, + "balance_loss_mlp": 1.0219444, + "epoch": 0.43126409138734406, + "flos": 20376684627840.0, + "grad_norm": 1.5768352784170643, + "language_loss": 0.76410586, + "learning_rate": 2.535703656890086e-06, + "loss": 0.78584433, + "num_input_tokens_seen": 153913230, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.14324951, + "step": 7173, + "time_per_iteration": 3.9172980785369873 + }, + { + "auxiliary_loss_clip": 0.01135012, + "auxiliary_loss_mlp": 0.01031879, + "balance_loss_clip": 1.05611479, + "balance_loss_mlp": 1.01861131, + "epoch": 0.431324214640012, + "flos": 22123091882880.0, + "grad_norm": 3.05707211802179, + "language_loss": 0.77286226, + "learning_rate": 2.5353284159381e-06, + "loss": 0.79453111, + "num_input_tokens_seen": 153933250, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.13269043, + "step": 7174, + "time_per_iteration": 2.464024782180786 + }, + { + "auxiliary_loss_clip": 0.0113804, + "auxiliary_loss_mlp": 0.01032924, + "balance_loss_clip": 1.05560637, + "balance_loss_mlp": 1.01743245, + "epoch": 0.43138433789268, + "flos": 15231008856960.0, + "grad_norm": 2.4720677324352014, + "language_loss": 0.82936609, + "learning_rate": 2.534953154686407e-06, + "loss": 0.85107571, + "num_input_tokens_seen": 153951325, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.15484619, + "step": 7175, + "time_per_iteration": 2.649142026901245 + }, + { + "auxiliary_loss_clip": 0.01136451, + "auxiliary_loss_mlp": 0.01039923, + "balance_loss_clip": 1.05111539, + "balance_loss_mlp": 1.02450871, + "epoch": 0.43144446114534796, + "flos": 18150294908160.0, + "grad_norm": 2.3669452044339545, + "language_loss": 0.75161231, + "learning_rate": 2.5345778731492366e-06, + "loss": 0.77337605, + "num_input_tokens_seen": 153966975, + "router_z_loss_clip": 0.85400391, + "router_z_loss_mlp": 0.1541748, + "step": 7176, + "time_per_iteration": 2.436105728149414 + }, + { + "auxiliary_loss_clip": 0.01131756, + "auxiliary_loss_mlp": 0.0103433, + "balance_loss_clip": 1.0484376, + "balance_loss_mlp": 1.01873732, + "epoch": 0.4315045843980159, + "flos": 22929861306240.0, + "grad_norm": 1.5782032733697053, + "language_loss": 0.73746824, + "learning_rate": 2.534202571340819e-06, + "loss": 0.75912911, + "num_input_tokens_seen": 153986695, + "router_z_loss_clip": 0.83300781, + "router_z_loss_mlp": 0.15588379, + "step": 7177, + "time_per_iteration": 2.5413434505462646 + }, + { + "auxiliary_loss_clip": 0.01137371, + "auxiliary_loss_mlp": 0.01038708, + "balance_loss_clip": 1.04930067, + "balance_loss_mlp": 1.02232885, + "epoch": 0.4315647076506839, + "flos": 22126862810880.0, + "grad_norm": 1.8622428000244866, + "language_loss": 0.81899482, + "learning_rate": 2.533827249275387e-06, + "loss": 0.84075564, + "num_input_tokens_seen": 154004710, + "router_z_loss_clip": 0.88085938, + "router_z_loss_mlp": 0.16381836, + "step": 7178, + "time_per_iteration": 2.4264559745788574 + }, + { + "auxiliary_loss_clip": 0.0112668, + "auxiliary_loss_mlp": 0.01032712, + "balance_loss_clip": 1.05153561, + "balance_loss_mlp": 1.01943183, + "epoch": 0.43162483090335185, + "flos": 26871129118080.0, + "grad_norm": 1.4957953735123248, + "language_loss": 0.84192383, + "learning_rate": 2.5334519069671725e-06, + "loss": 0.86351776, + "num_input_tokens_seen": 154024320, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.13269043, + "step": 7179, + "time_per_iteration": 2.553605794906616 + }, + { + "auxiliary_loss_clip": 0.01132296, + "auxiliary_loss_mlp": 0.01037933, + "balance_loss_clip": 1.0491184, + "balance_loss_mlp": 1.02313292, + "epoch": 0.4316849541560198, + "flos": 13913122855680.0, + "grad_norm": 2.243799088202332, + "language_loss": 0.75701451, + "learning_rate": 2.5330765444304075e-06, + "loss": 0.7787168, + "num_input_tokens_seen": 154041755, + "router_z_loss_clip": 0.83056641, + "router_z_loss_mlp": 0.14794922, + "step": 7180, + "time_per_iteration": 2.4186294078826904 + }, + { + "auxiliary_loss_clip": 0.01135729, + "auxiliary_loss_mlp": 0.01036588, + "balance_loss_clip": 1.05078197, + "balance_loss_mlp": 1.02191293, + "epoch": 0.4317450774086878, + "flos": 16435165420800.0, + "grad_norm": 1.652983954543229, + "language_loss": 0.81394696, + "learning_rate": 2.5327011616793274e-06, + "loss": 0.83567011, + "num_input_tokens_seen": 154056775, + "router_z_loss_clip": 0.84960938, + "router_z_loss_mlp": 0.14685059, + "step": 7181, + "time_per_iteration": 2.4843361377716064 + }, + { + "auxiliary_loss_clip": 0.01129809, + "auxiliary_loss_mlp": 0.01038629, + "balance_loss_clip": 1.04762292, + "balance_loss_mlp": 1.02229786, + "epoch": 0.4318052006613558, + "flos": 20554980762240.0, + "grad_norm": 1.6759351247587582, + "language_loss": 0.88822442, + "learning_rate": 2.532325758728165e-06, + "loss": 0.90990877, + "num_input_tokens_seen": 154075015, + "router_z_loss_clip": 0.82226562, + "router_z_loss_mlp": 0.16320801, + "step": 7182, + "time_per_iteration": 2.481843948364258 + }, + { + "auxiliary_loss_clip": 0.01136886, + "auxiliary_loss_mlp": 0.01034747, + "balance_loss_clip": 1.05750954, + "balance_loss_mlp": 1.02180648, + "epoch": 0.43186532391402377, + "flos": 22820046451200.0, + "grad_norm": 1.5883340219638564, + "language_loss": 0.75760424, + "learning_rate": 2.5319503355911566e-06, + "loss": 0.77932048, + "num_input_tokens_seen": 154095170, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.12939453, + "step": 7183, + "time_per_iteration": 2.488706350326538 + }, + { + "auxiliary_loss_clip": 0.01132582, + "auxiliary_loss_mlp": 0.0103845, + "balance_loss_clip": 1.0493288, + "balance_loss_mlp": 1.02415729, + "epoch": 0.43192544716669173, + "flos": 25556583081600.0, + "grad_norm": 1.572588197744734, + "language_loss": 0.7773158, + "learning_rate": 2.5315748922825393e-06, + "loss": 0.79902613, + "num_input_tokens_seen": 154116895, + "router_z_loss_clip": 0.83349609, + "router_z_loss_mlp": 0.1428833, + "step": 7184, + "time_per_iteration": 2.4979546070098877 + }, + { + "auxiliary_loss_clip": 0.01119733, + "auxiliary_loss_mlp": 0.01034153, + "balance_loss_clip": 1.0453018, + "balance_loss_mlp": 1.01990175, + "epoch": 0.4319855704193597, + "flos": 30954674701440.0, + "grad_norm": 1.6105234652278049, + "language_loss": 0.73286247, + "learning_rate": 2.5311994288165474e-06, + "loss": 0.75440133, + "num_input_tokens_seen": 154138395, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.14239502, + "step": 7185, + "time_per_iteration": 2.6056768894195557 + }, + { + "auxiliary_loss_clip": 0.01132125, + "auxiliary_loss_mlp": 0.01041361, + "balance_loss_clip": 1.04682279, + "balance_loss_mlp": 1.02620959, + "epoch": 0.43204569367202766, + "flos": 24238732993920.0, + "grad_norm": 2.7026594937269395, + "language_loss": 0.75702977, + "learning_rate": 2.530823945207421e-06, + "loss": 0.77876467, + "num_input_tokens_seen": 154156775, + "router_z_loss_clip": 0.85400391, + "router_z_loss_mlp": 0.15136719, + "step": 7186, + "time_per_iteration": 2.585395574569702 + }, + { + "auxiliary_loss_clip": 0.01123444, + "auxiliary_loss_mlp": 0.01034941, + "balance_loss_clip": 1.04409242, + "balance_loss_mlp": 1.02105296, + "epoch": 0.43210581692469563, + "flos": 18406948561920.0, + "grad_norm": 2.7436668000036066, + "language_loss": 0.76249003, + "learning_rate": 2.5304484414693962e-06, + "loss": 0.78407389, + "num_input_tokens_seen": 154177500, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.13885498, + "step": 7187, + "time_per_iteration": 2.5851457118988037 + }, + { + "auxiliary_loss_clip": 0.01054124, + "auxiliary_loss_mlp": 0.01002772, + "balance_loss_clip": 1.02601266, + "balance_loss_mlp": 1.00109124, + "epoch": 0.4321659401773636, + "flos": 49832378910720.0, + "grad_norm": 0.8534849237470771, + "language_loss": 0.68120992, + "learning_rate": 2.530072917616714e-06, + "loss": 0.70177889, + "num_input_tokens_seen": 154237110, + "router_z_loss_clip": 0.28173828, + "router_z_loss_mlp": 0.0168457, + "step": 7188, + "time_per_iteration": 4.639712572097778 + }, + { + "auxiliary_loss_clip": 0.01127169, + "auxiliary_loss_mlp": 0.01032833, + "balance_loss_clip": 1.05021286, + "balance_loss_mlp": 1.01961267, + "epoch": 0.43222606343003156, + "flos": 17128564542720.0, + "grad_norm": 2.006700007077784, + "language_loss": 0.78410071, + "learning_rate": 2.529697373663614e-06, + "loss": 0.80570078, + "num_input_tokens_seen": 154253910, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.13214111, + "step": 7189, + "time_per_iteration": 2.4570960998535156 + }, + { + "auxiliary_loss_clip": 0.01131877, + "auxiliary_loss_mlp": 0.01041315, + "balance_loss_clip": 1.04685438, + "balance_loss_mlp": 1.02629519, + "epoch": 0.4322861866826995, + "flos": 22749949059840.0, + "grad_norm": 1.9565771190951047, + "language_loss": 0.71488202, + "learning_rate": 2.5293218096243364e-06, + "loss": 0.73661399, + "num_input_tokens_seen": 154274770, + "router_z_loss_clip": 0.85107422, + "router_z_loss_mlp": 0.15014648, + "step": 7190, + "time_per_iteration": 2.5187013149261475 + }, + { + "auxiliary_loss_clip": 0.01124745, + "auxiliary_loss_mlp": 0.01038892, + "balance_loss_clip": 1.044595, + "balance_loss_mlp": 1.02395487, + "epoch": 0.4323463099353675, + "flos": 27891925729920.0, + "grad_norm": 1.3913942660240055, + "language_loss": 0.7986331, + "learning_rate": 2.5289462255131223e-06, + "loss": 0.82026947, + "num_input_tokens_seen": 154295035, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.14935303, + "step": 7191, + "time_per_iteration": 2.5358738899230957 + }, + { + "auxiliary_loss_clip": 0.01126171, + "auxiliary_loss_mlp": 0.01033035, + "balance_loss_clip": 1.04706609, + "balance_loss_mlp": 1.01951122, + "epoch": 0.43240643318803546, + "flos": 21614740652160.0, + "grad_norm": 1.6404469134944386, + "language_loss": 0.74719268, + "learning_rate": 2.5285706213442146e-06, + "loss": 0.76878476, + "num_input_tokens_seen": 154314905, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.13525391, + "step": 7192, + "time_per_iteration": 2.515685558319092 + }, + { + "auxiliary_loss_clip": 0.01127578, + "auxiliary_loss_mlp": 0.01036747, + "balance_loss_clip": 1.04830098, + "balance_loss_mlp": 1.02230525, + "epoch": 0.4324665564407034, + "flos": 17558378686080.0, + "grad_norm": 1.979878842827398, + "language_loss": 0.79081982, + "learning_rate": 2.5281949971318557e-06, + "loss": 0.81246305, + "num_input_tokens_seen": 154331740, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.14447021, + "step": 7193, + "time_per_iteration": 2.4623031616210938 + }, + { + "auxiliary_loss_clip": 0.01134205, + "auxiliary_loss_mlp": 0.01042784, + "balance_loss_clip": 1.0530498, + "balance_loss_mlp": 1.02820504, + "epoch": 0.4325266796933714, + "flos": 18402423448320.0, + "grad_norm": 2.103496796271648, + "language_loss": 0.75346011, + "learning_rate": 2.5278193528902897e-06, + "loss": 0.77522999, + "num_input_tokens_seen": 154348740, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.14599609, + "step": 7194, + "time_per_iteration": 2.496387243270874 + }, + { + "auxiliary_loss_clip": 0.01130275, + "auxiliary_loss_mlp": 0.01039168, + "balance_loss_clip": 1.04825044, + "balance_loss_mlp": 1.02407658, + "epoch": 0.4325868029460394, + "flos": 22564793427840.0, + "grad_norm": 2.670421995178205, + "language_loss": 0.59828389, + "learning_rate": 2.5274436886337613e-06, + "loss": 0.61997825, + "num_input_tokens_seen": 154368835, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.15100098, + "step": 7195, + "time_per_iteration": 2.4826083183288574 + }, + { + "auxiliary_loss_clip": 0.01134586, + "auxiliary_loss_mlp": 0.01034878, + "balance_loss_clip": 1.05051351, + "balance_loss_mlp": 1.01916623, + "epoch": 0.43264692619870737, + "flos": 14605516396800.0, + "grad_norm": 2.0591765040208774, + "language_loss": 0.6533857, + "learning_rate": 2.527068004376515e-06, + "loss": 0.6750803, + "num_input_tokens_seen": 154384620, + "router_z_loss_clip": 0.84130859, + "router_z_loss_mlp": 0.15710449, + "step": 7196, + "time_per_iteration": 2.493813991546631 + }, + { + "auxiliary_loss_clip": 0.01134019, + "auxiliary_loss_mlp": 0.01039758, + "balance_loss_clip": 1.04901683, + "balance_loss_mlp": 1.02441537, + "epoch": 0.43270704945137534, + "flos": 21501657659520.0, + "grad_norm": 1.907952192037493, + "language_loss": 0.7294758, + "learning_rate": 2.526692300132797e-06, + "loss": 0.75121355, + "num_input_tokens_seen": 154402865, + "router_z_loss_clip": 0.85009766, + "router_z_loss_mlp": 0.15356445, + "step": 7197, + "time_per_iteration": 2.597531795501709 + }, + { + "auxiliary_loss_clip": 0.01124322, + "auxiliary_loss_mlp": 0.01035131, + "balance_loss_clip": 1.0472132, + "balance_loss_mlp": 1.02121925, + "epoch": 0.4327671727040433, + "flos": 25155891889920.0, + "grad_norm": 1.6678543086507, + "language_loss": 0.72619152, + "learning_rate": 2.5263165759168547e-06, + "loss": 0.74778605, + "num_input_tokens_seen": 154423625, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.13916016, + "step": 7198, + "time_per_iteration": 2.5152711868286133 + }, + { + "auxiliary_loss_clip": 0.01131291, + "auxiliary_loss_mlp": 0.01033267, + "balance_loss_clip": 1.05271959, + "balance_loss_mlp": 1.01951671, + "epoch": 0.43282729595671127, + "flos": 25447163276160.0, + "grad_norm": 2.033664907164059, + "language_loss": 0.81365681, + "learning_rate": 2.525940831742934e-06, + "loss": 0.83530235, + "num_input_tokens_seen": 154444775, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.13745117, + "step": 7199, + "time_per_iteration": 3.930394411087036 + }, + { + "auxiliary_loss_clip": 0.01142416, + "auxiliary_loss_mlp": 0.01032791, + "balance_loss_clip": 1.0614475, + "balance_loss_mlp": 1.01940405, + "epoch": 0.43288741920937923, + "flos": 24126116878080.0, + "grad_norm": 3.142684388989199, + "language_loss": 0.68181676, + "learning_rate": 2.525565067625286e-06, + "loss": 0.70356876, + "num_input_tokens_seen": 154460815, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.13391113, + "step": 7200, + "time_per_iteration": 2.509293556213379 + }, + { + "auxiliary_loss_clip": 0.01131548, + "auxiliary_loss_mlp": 0.0103647, + "balance_loss_clip": 1.0489192, + "balance_loss_mlp": 1.02115738, + "epoch": 0.4329475424620472, + "flos": 19204955066880.0, + "grad_norm": 1.9522570701844446, + "language_loss": 0.87448454, + "learning_rate": 2.525189283578157e-06, + "loss": 0.89616477, + "num_input_tokens_seen": 154479145, + "router_z_loss_clip": 0.82568359, + "router_z_loss_mlp": 0.15301514, + "step": 7201, + "time_per_iteration": 2.5747296810150146 + }, + { + "auxiliary_loss_clip": 0.01151672, + "auxiliary_loss_mlp": 0.01041389, + "balance_loss_clip": 1.06276286, + "balance_loss_mlp": 1.02491474, + "epoch": 0.43300766571471516, + "flos": 22638374438400.0, + "grad_norm": 6.028354744511458, + "language_loss": 0.64223313, + "learning_rate": 2.5248134796157974e-06, + "loss": 0.66416371, + "num_input_tokens_seen": 154498905, + "router_z_loss_clip": 0.88916016, + "router_z_loss_mlp": 0.16491699, + "step": 7202, + "time_per_iteration": 2.4660086631774902 + }, + { + "auxiliary_loss_clip": 0.01135357, + "auxiliary_loss_mlp": 0.01028709, + "balance_loss_clip": 1.05546904, + "balance_loss_mlp": 1.01619768, + "epoch": 0.4330677889673831, + "flos": 22121080721280.0, + "grad_norm": 1.775448254352334, + "language_loss": 0.81989622, + "learning_rate": 2.5244376557524586e-06, + "loss": 0.84153688, + "num_input_tokens_seen": 154517270, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.12506104, + "step": 7203, + "time_per_iteration": 2.4815988540649414 + }, + { + "auxiliary_loss_clip": 0.01139381, + "auxiliary_loss_mlp": 0.01046123, + "balance_loss_clip": 1.05480075, + "balance_loss_mlp": 1.03135276, + "epoch": 0.4331279122200511, + "flos": 23221527742080.0, + "grad_norm": 1.838371342680506, + "language_loss": 0.81432098, + "learning_rate": 2.5240618120023912e-06, + "loss": 0.83617604, + "num_input_tokens_seen": 154535945, + "router_z_loss_clip": 0.84570312, + "router_z_loss_mlp": 0.14770508, + "step": 7204, + "time_per_iteration": 2.4604458808898926 + }, + { + "auxiliary_loss_clip": 0.01141577, + "auxiliary_loss_mlp": 0.01034151, + "balance_loss_clip": 1.05928648, + "balance_loss_mlp": 1.02038264, + "epoch": 0.43318803547271906, + "flos": 18259750627200.0, + "grad_norm": 1.859374719255719, + "language_loss": 0.73470193, + "learning_rate": 2.5236859483798468e-06, + "loss": 0.75645924, + "num_input_tokens_seen": 154554935, + "router_z_loss_clip": 0.82275391, + "router_z_loss_mlp": 0.13775635, + "step": 7205, + "time_per_iteration": 2.46177077293396 + }, + { + "auxiliary_loss_clip": 0.01140511, + "auxiliary_loss_mlp": 0.01037961, + "balance_loss_clip": 1.0597918, + "balance_loss_mlp": 1.0242579, + "epoch": 0.433248158725387, + "flos": 27418407713280.0, + "grad_norm": 1.836847659241597, + "language_loss": 0.75280333, + "learning_rate": 2.5233100648990803e-06, + "loss": 0.77458811, + "num_input_tokens_seen": 154576065, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.13708496, + "step": 7206, + "time_per_iteration": 2.506309747695923 + }, + { + "auxiliary_loss_clip": 0.01145621, + "auxiliary_loss_mlp": 0.01035885, + "balance_loss_clip": 1.06320119, + "balance_loss_mlp": 1.0215199, + "epoch": 0.433308281978055, + "flos": 23218008209280.0, + "grad_norm": 1.7658930072464738, + "language_loss": 0.78903902, + "learning_rate": 2.522934161574342e-06, + "loss": 0.81085408, + "num_input_tokens_seen": 154595110, + "router_z_loss_clip": 0.82421875, + "router_z_loss_mlp": 0.14355469, + "step": 7207, + "time_per_iteration": 2.487273693084717 + }, + { + "auxiliary_loss_clip": 0.01136891, + "auxiliary_loss_mlp": 0.01040986, + "balance_loss_clip": 1.05423689, + "balance_loss_mlp": 1.0260731, + "epoch": 0.433368405230723, + "flos": 15852407166720.0, + "grad_norm": 5.0001711424246835, + "language_loss": 0.80867422, + "learning_rate": 2.5225582384198888e-06, + "loss": 0.83045292, + "num_input_tokens_seen": 154612255, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.14916992, + "step": 7208, + "time_per_iteration": 2.4238216876983643 + }, + { + "auxiliary_loss_clip": 0.01140712, + "auxiliary_loss_mlp": 0.010327, + "balance_loss_clip": 1.05843616, + "balance_loss_mlp": 1.01848459, + "epoch": 0.433428528483391, + "flos": 19026084314880.0, + "grad_norm": 1.8924386848299284, + "language_loss": 0.70131332, + "learning_rate": 2.5221822954499744e-06, + "loss": 0.72304744, + "num_input_tokens_seen": 154630440, + "router_z_loss_clip": 0.82275391, + "router_z_loss_mlp": 0.14221191, + "step": 7209, + "time_per_iteration": 2.5240230560302734 + }, + { + "auxiliary_loss_clip": 0.01138771, + "auxiliary_loss_mlp": 0.01034411, + "balance_loss_clip": 1.05801022, + "balance_loss_mlp": 1.02009964, + "epoch": 0.43348865173605894, + "flos": 24718248581760.0, + "grad_norm": 1.4148116366190508, + "language_loss": 0.81254828, + "learning_rate": 2.5218063326788557e-06, + "loss": 0.83428013, + "num_input_tokens_seen": 154652515, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.14324951, + "step": 7210, + "time_per_iteration": 2.5421926975250244 + }, + { + "auxiliary_loss_clip": 0.0112737, + "auxiliary_loss_mlp": 0.01043118, + "balance_loss_clip": 1.04738355, + "balance_loss_mlp": 1.02795482, + "epoch": 0.4335487749887269, + "flos": 22090664880000.0, + "grad_norm": 1.7992786861202668, + "language_loss": 0.82103467, + "learning_rate": 2.5214303501207885e-06, + "loss": 0.84273958, + "num_input_tokens_seen": 154670965, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.15148926, + "step": 7211, + "time_per_iteration": 3.9389164447784424 + }, + { + "auxiliary_loss_clip": 0.01136371, + "auxiliary_loss_mlp": 0.01034116, + "balance_loss_clip": 1.05655527, + "balance_loss_mlp": 1.02180171, + "epoch": 0.43360889824139487, + "flos": 22382941847040.0, + "grad_norm": 1.7736541253262528, + "language_loss": 0.75220513, + "learning_rate": 2.521054347790029e-06, + "loss": 0.77391005, + "num_input_tokens_seen": 154689980, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.12304688, + "step": 7212, + "time_per_iteration": 2.51189923286438 + }, + { + "auxiliary_loss_clip": 0.01133585, + "auxiliary_loss_mlp": 0.01039201, + "balance_loss_clip": 1.05065465, + "balance_loss_mlp": 1.02552783, + "epoch": 0.43366902149406283, + "flos": 17528286067200.0, + "grad_norm": 1.8035720757730715, + "language_loss": 0.76508248, + "learning_rate": 2.5206783257008375e-06, + "loss": 0.78681034, + "num_input_tokens_seen": 154706570, + "router_z_loss_clip": 0.82861328, + "router_z_loss_mlp": 0.13684082, + "step": 7213, + "time_per_iteration": 2.5590436458587646 + }, + { + "auxiliary_loss_clip": 0.01134386, + "auxiliary_loss_mlp": 0.01037022, + "balance_loss_clip": 1.05138922, + "balance_loss_mlp": 1.02327156, + "epoch": 0.4337291447467308, + "flos": 19022672522880.0, + "grad_norm": 1.4775273802858087, + "language_loss": 0.65339732, + "learning_rate": 2.520302283867471e-06, + "loss": 0.67511147, + "num_input_tokens_seen": 154725210, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.13739014, + "step": 7214, + "time_per_iteration": 2.597411632537842 + }, + { + "auxiliary_loss_clip": 0.01127666, + "auxiliary_loss_mlp": 0.01036648, + "balance_loss_clip": 1.0515765, + "balance_loss_mlp": 1.02430987, + "epoch": 0.43378926799939876, + "flos": 27234042180480.0, + "grad_norm": 1.7359202852818583, + "language_loss": 0.71438253, + "learning_rate": 2.519926222304191e-06, + "loss": 0.73602569, + "num_input_tokens_seen": 154745945, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.12347412, + "step": 7215, + "time_per_iteration": 2.547848701477051 + }, + { + "auxiliary_loss_clip": 0.01129207, + "auxiliary_loss_mlp": 0.01038753, + "balance_loss_clip": 1.05193233, + "balance_loss_mlp": 1.02453136, + "epoch": 0.43384939125206673, + "flos": 15961108700160.0, + "grad_norm": 1.6904212962880698, + "language_loss": 0.75366634, + "learning_rate": 2.519550141025255e-06, + "loss": 0.77534592, + "num_input_tokens_seen": 154763580, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.14208984, + "step": 7216, + "time_per_iteration": 2.414092779159546 + }, + { + "auxiliary_loss_clip": 0.01140712, + "auxiliary_loss_mlp": 0.01045739, + "balance_loss_clip": 1.05448461, + "balance_loss_mlp": 1.02985477, + "epoch": 0.4339095145047347, + "flos": 21793216354560.0, + "grad_norm": 2.398359248874733, + "language_loss": 0.75505364, + "learning_rate": 2.519174040044927e-06, + "loss": 0.77691805, + "num_input_tokens_seen": 154776825, + "router_z_loss_clip": 0.86132812, + "router_z_loss_mlp": 0.15893555, + "step": 7217, + "time_per_iteration": 3.8938229084014893 + }, + { + "auxiliary_loss_clip": 0.01133807, + "auxiliary_loss_mlp": 0.0104368, + "balance_loss_clip": 1.05214429, + "balance_loss_mlp": 1.02800417, + "epoch": 0.43396963775740266, + "flos": 14209853109120.0, + "grad_norm": 1.959588091052653, + "language_loss": 0.74593675, + "learning_rate": 2.5187979193774664e-06, + "loss": 0.76771158, + "num_input_tokens_seen": 154794025, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.15673828, + "step": 7218, + "time_per_iteration": 2.460965156555176 + }, + { + "auxiliary_loss_clip": 0.01129954, + "auxiliary_loss_mlp": 0.01033451, + "balance_loss_clip": 1.04886007, + "balance_loss_mlp": 1.01953936, + "epoch": 0.4340297610100706, + "flos": 19719052473600.0, + "grad_norm": 2.182883720891776, + "language_loss": 0.6857695, + "learning_rate": 2.5184217790371367e-06, + "loss": 0.70740354, + "num_input_tokens_seen": 154813105, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.13928223, + "step": 7219, + "time_per_iteration": 2.491332530975342 + }, + { + "auxiliary_loss_clip": 0.01132593, + "auxiliary_loss_mlp": 0.01036597, + "balance_loss_clip": 1.05356574, + "balance_loss_mlp": 1.023067, + "epoch": 0.4340898842627386, + "flos": 18953508885120.0, + "grad_norm": 1.5656217557311183, + "language_loss": 0.77540821, + "learning_rate": 2.518045619038202e-06, + "loss": 0.79710013, + "num_input_tokens_seen": 154833525, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.13525391, + "step": 7220, + "time_per_iteration": 2.5469000339508057 + }, + { + "auxiliary_loss_clip": 0.0113057, + "auxiliary_loss_mlp": 0.0103295, + "balance_loss_clip": 1.05038357, + "balance_loss_mlp": 1.01953316, + "epoch": 0.4341500075154066, + "flos": 22018304931840.0, + "grad_norm": 2.0794565983349917, + "language_loss": 0.69417268, + "learning_rate": 2.5176694393949243e-06, + "loss": 0.71580786, + "num_input_tokens_seen": 154853090, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.13421631, + "step": 7221, + "time_per_iteration": 2.4838922023773193 + }, + { + "auxiliary_loss_clip": 0.01131795, + "auxiliary_loss_mlp": 0.01036978, + "balance_loss_clip": 1.0499264, + "balance_loss_mlp": 1.0238173, + "epoch": 0.4342101307680746, + "flos": 23582465556480.0, + "grad_norm": 1.7498563010764245, + "language_loss": 0.65272045, + "learning_rate": 2.51729324012157e-06, + "loss": 0.67440814, + "num_input_tokens_seen": 154872055, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.13165283, + "step": 7222, + "time_per_iteration": 2.511866569519043 + }, + { + "auxiliary_loss_clip": 0.01135761, + "auxiliary_loss_mlp": 0.01034632, + "balance_loss_clip": 1.05537438, + "balance_loss_mlp": 1.02026117, + "epoch": 0.43427025402074254, + "flos": 17967976450560.0, + "grad_norm": 2.9251780523143163, + "language_loss": 0.73407352, + "learning_rate": 2.5169170212324053e-06, + "loss": 0.75577748, + "num_input_tokens_seen": 154886645, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.14361572, + "step": 7223, + "time_per_iteration": 2.4145545959472656 + }, + { + "auxiliary_loss_clip": 0.01139338, + "auxiliary_loss_mlp": 0.01031185, + "balance_loss_clip": 1.05521703, + "balance_loss_mlp": 1.01683176, + "epoch": 0.4343303772734105, + "flos": 26286395616000.0, + "grad_norm": 1.7174471629342998, + "language_loss": 0.93809927, + "learning_rate": 2.516540782741694e-06, + "loss": 0.95980442, + "num_input_tokens_seen": 154906775, + "router_z_loss_clip": 0.84179688, + "router_z_loss_mlp": 0.14355469, + "step": 7224, + "time_per_iteration": 2.5275020599365234 + }, + { + "auxiliary_loss_clip": 0.01134928, + "auxiliary_loss_mlp": 0.01038356, + "balance_loss_clip": 1.05577302, + "balance_loss_mlp": 1.02492166, + "epoch": 0.43439050052607847, + "flos": 26833961520000.0, + "grad_norm": 1.548308057975317, + "language_loss": 0.61151391, + "learning_rate": 2.5161645246637056e-06, + "loss": 0.63324678, + "num_input_tokens_seen": 154926990, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.13433838, + "step": 7225, + "time_per_iteration": 2.474555492401123 + }, + { + "auxiliary_loss_clip": 0.01129832, + "auxiliary_loss_mlp": 0.01037063, + "balance_loss_clip": 1.05037832, + "balance_loss_mlp": 1.02293038, + "epoch": 0.43445062377874644, + "flos": 21397660807680.0, + "grad_norm": 1.7233278681672082, + "language_loss": 0.77686119, + "learning_rate": 2.5157882470127054e-06, + "loss": 0.7985301, + "num_input_tokens_seen": 154946210, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.14123535, + "step": 7226, + "time_per_iteration": 2.4793100357055664 + }, + { + "auxiliary_loss_clip": 0.01129817, + "auxiliary_loss_mlp": 0.01031618, + "balance_loss_clip": 1.05162358, + "balance_loss_mlp": 1.01812911, + "epoch": 0.4345107470314144, + "flos": 19901945548800.0, + "grad_norm": 1.62111162614801, + "language_loss": 0.84917152, + "learning_rate": 2.515411949802964e-06, + "loss": 0.87078589, + "num_input_tokens_seen": 154964995, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.13494873, + "step": 7227, + "time_per_iteration": 2.4235148429870605 + }, + { + "auxiliary_loss_clip": 0.01128296, + "auxiliary_loss_mlp": 0.0103561, + "balance_loss_clip": 1.04951096, + "balance_loss_mlp": 1.02144194, + "epoch": 0.43457087028408237, + "flos": 26432623883520.0, + "grad_norm": 1.932990626616445, + "language_loss": 0.76673305, + "learning_rate": 2.5150356330487498e-06, + "loss": 0.78837204, + "num_input_tokens_seen": 154984775, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.1418457, + "step": 7228, + "time_per_iteration": 2.5927159786224365 + }, + { + "auxiliary_loss_clip": 0.01132292, + "auxiliary_loss_mlp": 0.01042635, + "balance_loss_clip": 1.05137968, + "balance_loss_mlp": 1.02667284, + "epoch": 0.43463099353675033, + "flos": 31868816855040.0, + "grad_norm": 2.7068091779569565, + "language_loss": 0.8069042, + "learning_rate": 2.5146592967643324e-06, + "loss": 0.82865345, + "num_input_tokens_seen": 155008125, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.15966797, + "step": 7229, + "time_per_iteration": 2.52463698387146 + }, + { + "auxiliary_loss_clip": 0.01133228, + "auxiliary_loss_mlp": 0.01042638, + "balance_loss_clip": 1.05128837, + "balance_loss_mlp": 1.02673531, + "epoch": 0.4346911167894183, + "flos": 24571266128640.0, + "grad_norm": 1.8062780381687031, + "language_loss": 0.8184346, + "learning_rate": 2.5142829409639834e-06, + "loss": 0.84019321, + "num_input_tokens_seen": 155027885, + "router_z_loss_clip": 0.81982422, + "router_z_loss_mlp": 0.15893555, + "step": 7230, + "time_per_iteration": 2.4671781063079834 + }, + { + "auxiliary_loss_clip": 0.01148992, + "auxiliary_loss_mlp": 0.01041641, + "balance_loss_clip": 1.06228793, + "balance_loss_mlp": 1.02696073, + "epoch": 0.43475124004208626, + "flos": 17090678672640.0, + "grad_norm": 2.118563957585949, + "language_loss": 0.77102041, + "learning_rate": 2.513906565661973e-06, + "loss": 0.79292679, + "num_input_tokens_seen": 155043375, + "router_z_loss_clip": 0.86669922, + "router_z_loss_mlp": 0.14691162, + "step": 7231, + "time_per_iteration": 3.950066328048706 + }, + { + "auxiliary_loss_clip": 0.01136086, + "auxiliary_loss_mlp": 0.0103114, + "balance_loss_clip": 1.05528998, + "balance_loss_mlp": 1.01865864, + "epoch": 0.4348113632947542, + "flos": 26104615862400.0, + "grad_norm": 1.520992615706741, + "language_loss": 0.68817848, + "learning_rate": 2.513530170872575e-06, + "loss": 0.70985073, + "num_input_tokens_seen": 155062930, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.12475586, + "step": 7232, + "time_per_iteration": 2.6842238903045654 + }, + { + "auxiliary_loss_clip": 0.01135016, + "auxiliary_loss_mlp": 0.0103567, + "balance_loss_clip": 1.05047596, + "balance_loss_mlp": 1.0207268, + "epoch": 0.4348714865474222, + "flos": 34200496316160.0, + "grad_norm": 1.6919263883176736, + "language_loss": 0.72274792, + "learning_rate": 2.5131537566100605e-06, + "loss": 0.7444548, + "num_input_tokens_seen": 155084980, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.1496582, + "step": 7233, + "time_per_iteration": 2.5884201526641846 + }, + { + "auxiliary_loss_clip": 0.01146832, + "auxiliary_loss_mlp": 0.01039012, + "balance_loss_clip": 1.06267881, + "balance_loss_mlp": 1.02359223, + "epoch": 0.43493160980009016, + "flos": 31537468869120.0, + "grad_norm": 2.0806504391927243, + "language_loss": 0.74492359, + "learning_rate": 2.5127773228887053e-06, + "loss": 0.76678205, + "num_input_tokens_seen": 155107260, + "router_z_loss_clip": 0.84179688, + "router_z_loss_mlp": 0.15423584, + "step": 7234, + "time_per_iteration": 2.5629236698150635 + }, + { + "auxiliary_loss_clip": 0.01134112, + "auxiliary_loss_mlp": 0.01043758, + "balance_loss_clip": 1.0491426, + "balance_loss_mlp": 1.02736652, + "epoch": 0.4349917330527582, + "flos": 24061334699520.0, + "grad_norm": 2.3132745209578163, + "language_loss": 0.59147418, + "learning_rate": 2.512400869722782e-06, + "loss": 0.61325288, + "num_input_tokens_seen": 155126720, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.16387939, + "step": 7235, + "time_per_iteration": 2.6963510513305664 + }, + { + "auxiliary_loss_clip": 0.01139628, + "auxiliary_loss_mlp": 0.01036387, + "balance_loss_clip": 1.05361438, + "balance_loss_mlp": 1.02229679, + "epoch": 0.43505185630542614, + "flos": 30519329863680.0, + "grad_norm": 1.3829987921805107, + "language_loss": 0.77492106, + "learning_rate": 2.512024397126566e-06, + "loss": 0.79668117, + "num_input_tokens_seen": 155148640, + "router_z_loss_clip": 0.86035156, + "router_z_loss_mlp": 0.14080811, + "step": 7236, + "time_per_iteration": 2.5858957767486572 + }, + { + "auxiliary_loss_clip": 0.01136141, + "auxiliary_loss_mlp": 0.010317, + "balance_loss_clip": 1.05582261, + "balance_loss_mlp": 1.01762092, + "epoch": 0.4351119795580941, + "flos": 15735158196480.0, + "grad_norm": 1.7369684277200437, + "language_loss": 0.81123656, + "learning_rate": 2.5116479051143345e-06, + "loss": 0.83291495, + "num_input_tokens_seen": 155165870, + "router_z_loss_clip": 0.80273438, + "router_z_loss_mlp": 0.14080811, + "step": 7237, + "time_per_iteration": 2.486565589904785 + }, + { + "auxiliary_loss_clip": 0.01138213, + "auxiliary_loss_mlp": 0.01039183, + "balance_loss_clip": 1.05574632, + "balance_loss_mlp": 1.02542055, + "epoch": 0.4351721028107621, + "flos": 18731760272640.0, + "grad_norm": 1.4893806484357288, + "language_loss": 0.62849236, + "learning_rate": 2.5112713937003623e-06, + "loss": 0.65026635, + "num_input_tokens_seen": 155185315, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.13781738, + "step": 7238, + "time_per_iteration": 2.4599907398223877 + }, + { + "auxiliary_loss_clip": 0.01129829, + "auxiliary_loss_mlp": 0.01041558, + "balance_loss_clip": 1.05049813, + "balance_loss_mlp": 1.02719355, + "epoch": 0.43523222606343004, + "flos": 25226887121280.0, + "grad_norm": 1.630372604850552, + "language_loss": 0.85714364, + "learning_rate": 2.510894862898928e-06, + "loss": 0.87885755, + "num_input_tokens_seen": 155205790, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.14373779, + "step": 7239, + "time_per_iteration": 2.525721788406372 + }, + { + "auxiliary_loss_clip": 0.01130425, + "auxiliary_loss_mlp": 0.01038183, + "balance_loss_clip": 1.04955781, + "balance_loss_mlp": 1.02367485, + "epoch": 0.435292349316098, + "flos": 22709190101760.0, + "grad_norm": 1.5579113959905446, + "language_loss": 0.72528213, + "learning_rate": 2.510518312724309e-06, + "loss": 0.74696821, + "num_input_tokens_seen": 155226475, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.1449585, + "step": 7240, + "time_per_iteration": 2.4786171913146973 + }, + { + "auxiliary_loss_clip": 0.01144326, + "auxiliary_loss_mlp": 0.01032665, + "balance_loss_clip": 1.06121027, + "balance_loss_mlp": 1.01831222, + "epoch": 0.43535247256876597, + "flos": 25775889569280.0, + "grad_norm": 1.8236406606199806, + "language_loss": 0.81883824, + "learning_rate": 2.5101417431907842e-06, + "loss": 0.84060812, + "num_input_tokens_seen": 155247110, + "router_z_loss_clip": 0.83251953, + "router_z_loss_mlp": 0.14331055, + "step": 7241, + "time_per_iteration": 2.590494394302368 + }, + { + "auxiliary_loss_clip": 0.01145281, + "auxiliary_loss_mlp": 0.01043664, + "balance_loss_clip": 1.05643868, + "balance_loss_mlp": 1.02742827, + "epoch": 0.43541259582143393, + "flos": 17528142412800.0, + "grad_norm": 3.0274536394392983, + "language_loss": 0.79352343, + "learning_rate": 2.5097651543126345e-06, + "loss": 0.81541294, + "num_input_tokens_seen": 155261335, + "router_z_loss_clip": 0.88916016, + "router_z_loss_mlp": 0.16235352, + "step": 7242, + "time_per_iteration": 2.4368600845336914 + }, + { + "auxiliary_loss_clip": 0.0113413, + "auxiliary_loss_mlp": 0.01037112, + "balance_loss_clip": 1.04991746, + "balance_loss_mlp": 1.02207947, + "epoch": 0.4354727190741019, + "flos": 15195205975680.0, + "grad_norm": 3.4187901374867318, + "language_loss": 0.68938494, + "learning_rate": 2.509388546104138e-06, + "loss": 0.71109736, + "num_input_tokens_seen": 155278510, + "router_z_loss_clip": 0.84228516, + "router_z_loss_mlp": 0.15032959, + "step": 7243, + "time_per_iteration": 3.853442430496216 + }, + { + "auxiliary_loss_clip": 0.01128098, + "auxiliary_loss_mlp": 0.01033439, + "balance_loss_clip": 1.04923129, + "balance_loss_mlp": 1.0202899, + "epoch": 0.43553284232676986, + "flos": 16649264436480.0, + "grad_norm": 1.6220011114088286, + "language_loss": 0.81329304, + "learning_rate": 2.5090119185795766e-06, + "loss": 0.83490849, + "num_input_tokens_seen": 155296450, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.13140869, + "step": 7244, + "time_per_iteration": 2.4041237831115723 + }, + { + "auxiliary_loss_clip": 0.01134852, + "auxiliary_loss_mlp": 0.01035644, + "balance_loss_clip": 1.05125999, + "balance_loss_mlp": 1.02164316, + "epoch": 0.43559296557943783, + "flos": 23400865370880.0, + "grad_norm": 1.893574746031771, + "language_loss": 0.73808163, + "learning_rate": 2.508635271753234e-06, + "loss": 0.75978661, + "num_input_tokens_seen": 155316080, + "router_z_loss_clip": 0.83740234, + "router_z_loss_mlp": 0.13995361, + "step": 7245, + "time_per_iteration": 2.46966290473938 + }, + { + "auxiliary_loss_clip": 0.01140302, + "auxiliary_loss_mlp": 0.01039746, + "balance_loss_clip": 1.05670857, + "balance_loss_mlp": 1.02637088, + "epoch": 0.4356530888321058, + "flos": 22419067950720.0, + "grad_norm": 1.6795454694416012, + "language_loss": 0.76937687, + "learning_rate": 2.508258605639389e-06, + "loss": 0.79117739, + "num_input_tokens_seen": 155336765, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.1338501, + "step": 7246, + "time_per_iteration": 2.5659587383270264 + }, + { + "auxiliary_loss_clip": 0.01133257, + "auxiliary_loss_mlp": 0.01040078, + "balance_loss_clip": 1.04841268, + "balance_loss_mlp": 1.02564752, + "epoch": 0.43571321208477376, + "flos": 21616141282560.0, + "grad_norm": 2.083801432026804, + "language_loss": 0.8589375, + "learning_rate": 2.5078819202523275e-06, + "loss": 0.88067079, + "num_input_tokens_seen": 155356440, + "router_z_loss_clip": 0.84765625, + "router_z_loss_mlp": 0.14434814, + "step": 7247, + "time_per_iteration": 2.529623508453369 + }, + { + "auxiliary_loss_clip": 0.01132106, + "auxiliary_loss_mlp": 0.01036675, + "balance_loss_clip": 1.0498966, + "balance_loss_mlp": 1.02335286, + "epoch": 0.4357733353374418, + "flos": 23987358639360.0, + "grad_norm": 1.570652499291902, + "language_loss": 0.72435194, + "learning_rate": 2.507505215606333e-06, + "loss": 0.74603975, + "num_input_tokens_seen": 155377070, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.13317871, + "step": 7248, + "time_per_iteration": 2.4889578819274902 + }, + { + "auxiliary_loss_clip": 0.01139729, + "auxiliary_loss_mlp": 0.01033525, + "balance_loss_clip": 1.05665052, + "balance_loss_mlp": 1.01954782, + "epoch": 0.43583345859010975, + "flos": 25264737077760.0, + "grad_norm": 1.6024446226236426, + "language_loss": 0.87363541, + "learning_rate": 2.5071284917156893e-06, + "loss": 0.89536792, + "num_input_tokens_seen": 155398415, + "router_z_loss_clip": 0.83105469, + "router_z_loss_mlp": 0.13970947, + "step": 7249, + "time_per_iteration": 2.559006452560425 + }, + { + "auxiliary_loss_clip": 0.01137152, + "auxiliary_loss_mlp": 0.01039083, + "balance_loss_clip": 1.05299568, + "balance_loss_mlp": 1.02587438, + "epoch": 0.4358935818427777, + "flos": 23696302734720.0, + "grad_norm": 1.7655448001480156, + "language_loss": 0.82234669, + "learning_rate": 2.506751748594683e-06, + "loss": 0.844109, + "num_input_tokens_seen": 155415625, + "router_z_loss_clip": 0.84082031, + "router_z_loss_mlp": 0.13214111, + "step": 7250, + "time_per_iteration": 2.5893990993499756 + }, + { + "auxiliary_loss_clip": 0.01135658, + "auxiliary_loss_mlp": 0.01039798, + "balance_loss_clip": 1.05320227, + "balance_loss_mlp": 1.02588594, + "epoch": 0.4359537050954457, + "flos": 29532827761920.0, + "grad_norm": 1.7335038197149497, + "language_loss": 0.84827286, + "learning_rate": 2.5063749862575988e-06, + "loss": 0.87002742, + "num_input_tokens_seen": 155435505, + "router_z_loss_clip": 0.82421875, + "router_z_loss_mlp": 0.13903809, + "step": 7251, + "time_per_iteration": 2.4952664375305176 + }, + { + "auxiliary_loss_clip": 0.01128252, + "auxiliary_loss_mlp": 0.01038271, + "balance_loss_clip": 1.04887068, + "balance_loss_mlp": 1.0237627, + "epoch": 0.43601382834811364, + "flos": 22711273090560.0, + "grad_norm": 1.5529027240919828, + "language_loss": 0.69534624, + "learning_rate": 2.5059982047187245e-06, + "loss": 0.71701145, + "num_input_tokens_seen": 155455425, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.14508057, + "step": 7252, + "time_per_iteration": 2.4429080486297607 + }, + { + "auxiliary_loss_clip": 0.01133566, + "auxiliary_loss_mlp": 0.0103699, + "balance_loss_clip": 1.05174041, + "balance_loss_mlp": 1.02212453, + "epoch": 0.4360739516007816, + "flos": 19098731571840.0, + "grad_norm": 1.6187339746429665, + "language_loss": 0.83715093, + "learning_rate": 2.505621403992348e-06, + "loss": 0.85885656, + "num_input_tokens_seen": 155474250, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.14874268, + "step": 7253, + "time_per_iteration": 2.452742099761963 + }, + { + "auxiliary_loss_clip": 0.01133238, + "auxiliary_loss_mlp": 0.01039051, + "balance_loss_clip": 1.0505513, + "balance_loss_mlp": 1.02434015, + "epoch": 0.43613407485344957, + "flos": 23404420817280.0, + "grad_norm": 1.8527980775085395, + "language_loss": 0.70396572, + "learning_rate": 2.505244584092757e-06, + "loss": 0.72568858, + "num_input_tokens_seen": 155494685, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.14709473, + "step": 7254, + "time_per_iteration": 2.5333573818206787 + }, + { + "auxiliary_loss_clip": 0.01138941, + "auxiliary_loss_mlp": 0.01033557, + "balance_loss_clip": 1.05668974, + "balance_loss_mlp": 1.01994348, + "epoch": 0.43619419810611754, + "flos": 22637799820800.0, + "grad_norm": 2.0027262702822277, + "language_loss": 0.81046706, + "learning_rate": 2.5048677450342406e-06, + "loss": 0.832192, + "num_input_tokens_seen": 155513040, + "router_z_loss_clip": 0.82226562, + "router_z_loss_mlp": 0.13616943, + "step": 7255, + "time_per_iteration": 3.8510901927948 + }, + { + "auxiliary_loss_clip": 0.01148759, + "auxiliary_loss_mlp": 0.01037807, + "balance_loss_clip": 1.06314445, + "balance_loss_mlp": 1.02370405, + "epoch": 0.4362543213587855, + "flos": 20047958334720.0, + "grad_norm": 1.8573670461827598, + "language_loss": 0.77690053, + "learning_rate": 2.504490886831089e-06, + "loss": 0.79876614, + "num_input_tokens_seen": 155530100, + "router_z_loss_clip": 0.85644531, + "router_z_loss_mlp": 0.14086914, + "step": 7256, + "time_per_iteration": 2.5080862045288086 + }, + { + "auxiliary_loss_clip": 0.01132448, + "auxiliary_loss_mlp": 0.01033988, + "balance_loss_clip": 1.053666, + "balance_loss_mlp": 1.02035618, + "epoch": 0.43631444461145347, + "flos": 21361319222400.0, + "grad_norm": 1.454475783998701, + "language_loss": 0.7650851, + "learning_rate": 2.5041140094975922e-06, + "loss": 0.78674948, + "num_input_tokens_seen": 155549375, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.13635254, + "step": 7257, + "time_per_iteration": 2.4670088291168213 + }, + { + "auxiliary_loss_clip": 0.01144589, + "auxiliary_loss_mlp": 0.01037095, + "balance_loss_clip": 1.06282485, + "balance_loss_mlp": 1.02180004, + "epoch": 0.43637456786412143, + "flos": 22418529246720.0, + "grad_norm": 1.7694713041369066, + "language_loss": 0.73129117, + "learning_rate": 2.5037371130480417e-06, + "loss": 0.75310802, + "num_input_tokens_seen": 155569395, + "router_z_loss_clip": 0.81738281, + "router_z_loss_mlp": 0.1529541, + "step": 7258, + "time_per_iteration": 2.550177812576294 + }, + { + "auxiliary_loss_clip": 0.0113387, + "auxiliary_loss_mlp": 0.01033186, + "balance_loss_clip": 1.05216122, + "balance_loss_mlp": 1.01920271, + "epoch": 0.4364346911167894, + "flos": 28548839612160.0, + "grad_norm": 1.84309813561196, + "language_loss": 0.76819158, + "learning_rate": 2.5033601974967297e-06, + "loss": 0.78986216, + "num_input_tokens_seen": 155589090, + "router_z_loss_clip": 0.81738281, + "router_z_loss_mlp": 0.13977051, + "step": 7259, + "time_per_iteration": 2.5385589599609375 + }, + { + "auxiliary_loss_clip": 0.01058488, + "auxiliary_loss_mlp": 0.01016103, + "balance_loss_clip": 1.03005636, + "balance_loss_mlp": 1.0145812, + "epoch": 0.43649481436945736, + "flos": 62659345380480.0, + "grad_norm": 0.7708841507269638, + "language_loss": 0.57082117, + "learning_rate": 2.5029832628579483e-06, + "loss": 0.5915671, + "num_input_tokens_seen": 155648660, + "router_z_loss_clip": 0.28466797, + "router_z_loss_mlp": 0.01522827, + "step": 7260, + "time_per_iteration": 4.388674974441528 + }, + { + "auxiliary_loss_clip": 0.01135708, + "auxiliary_loss_mlp": 0.01040561, + "balance_loss_clip": 1.05360985, + "balance_loss_mlp": 1.02560568, + "epoch": 0.4365549376221254, + "flos": 30592120775040.0, + "grad_norm": 1.8363614112585058, + "language_loss": 0.71463239, + "learning_rate": 2.5026063091459907e-06, + "loss": 0.73639512, + "num_input_tokens_seen": 155669945, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.14959717, + "step": 7261, + "time_per_iteration": 2.5909321308135986 + }, + { + "auxiliary_loss_clip": 0.01130314, + "auxiliary_loss_mlp": 0.0104406, + "balance_loss_clip": 1.04751301, + "balance_loss_mlp": 1.02938545, + "epoch": 0.43661506087479335, + "flos": 17165875795200.0, + "grad_norm": 1.8946023261159814, + "language_loss": 0.69971699, + "learning_rate": 2.5022293363751522e-06, + "loss": 0.7214607, + "num_input_tokens_seen": 155688555, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.14685059, + "step": 7262, + "time_per_iteration": 2.486037015914917 + }, + { + "auxiliary_loss_clip": 0.01125072, + "auxiliary_loss_mlp": 0.01029905, + "balance_loss_clip": 1.04821694, + "balance_loss_mlp": 1.01731682, + "epoch": 0.4366751841274613, + "flos": 22047499710720.0, + "grad_norm": 1.685818691306354, + "language_loss": 0.79917222, + "learning_rate": 2.501852344559726e-06, + "loss": 0.82072198, + "num_input_tokens_seen": 155705370, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.12597656, + "step": 7263, + "time_per_iteration": 2.5013647079467773 + }, + { + "auxiliary_loss_clip": 0.01133285, + "auxiliary_loss_mlp": 0.01045289, + "balance_loss_clip": 1.05251908, + "balance_loss_mlp": 1.03107953, + "epoch": 0.4367353073801293, + "flos": 15997306631040.0, + "grad_norm": 1.696734629631529, + "language_loss": 0.7539475, + "learning_rate": 2.50147533371401e-06, + "loss": 0.77573323, + "num_input_tokens_seen": 155721890, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.14208984, + "step": 7264, + "time_per_iteration": 2.4511630535125732 + }, + { + "auxiliary_loss_clip": 0.01136809, + "auxiliary_loss_mlp": 0.01036034, + "balance_loss_clip": 1.05675077, + "balance_loss_mlp": 1.02174103, + "epoch": 0.43679543063279724, + "flos": 38217535868160.0, + "grad_norm": 2.240418398587724, + "language_loss": 0.6150437, + "learning_rate": 2.501098303852298e-06, + "loss": 0.63677216, + "num_input_tokens_seen": 155743970, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.14300537, + "step": 7265, + "time_per_iteration": 2.62343692779541 + }, + { + "auxiliary_loss_clip": 0.01124829, + "auxiliary_loss_mlp": 0.01031199, + "balance_loss_clip": 1.04644656, + "balance_loss_mlp": 1.01757967, + "epoch": 0.4368555538854652, + "flos": 15193230727680.0, + "grad_norm": 8.224369020426552, + "language_loss": 0.72324073, + "learning_rate": 2.5007212549888884e-06, + "loss": 0.74480104, + "num_input_tokens_seen": 155761830, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.13616943, + "step": 7266, + "time_per_iteration": 2.447506904602051 + }, + { + "auxiliary_loss_clip": 0.0113554, + "auxiliary_loss_mlp": 0.01038382, + "balance_loss_clip": 1.05164099, + "balance_loss_mlp": 1.02335608, + "epoch": 0.4369156771381332, + "flos": 23069086421760.0, + "grad_norm": 1.991719582915369, + "language_loss": 0.82403314, + "learning_rate": 2.5003441871380794e-06, + "loss": 0.84577227, + "num_input_tokens_seen": 155779610, + "router_z_loss_clip": 0.83837891, + "router_z_loss_mlp": 0.15032959, + "step": 7267, + "time_per_iteration": 2.547741651535034 + }, + { + "auxiliary_loss_clip": 0.01125972, + "auxiliary_loss_mlp": 0.01037716, + "balance_loss_clip": 1.04670334, + "balance_loss_mlp": 1.02434635, + "epoch": 0.43697580039080114, + "flos": 23441085624960.0, + "grad_norm": 1.983590447921161, + "language_loss": 0.75197566, + "learning_rate": 2.4999671003141674e-06, + "loss": 0.7736125, + "num_input_tokens_seen": 155798765, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.1340332, + "step": 7268, + "time_per_iteration": 2.496655225753784 + }, + { + "auxiliary_loss_clip": 0.01134382, + "auxiliary_loss_mlp": 0.01045477, + "balance_loss_clip": 1.04928422, + "balance_loss_mlp": 1.02714288, + "epoch": 0.4370359236434691, + "flos": 18514680428160.0, + "grad_norm": 2.2374292875638946, + "language_loss": 0.80361503, + "learning_rate": 2.499589994531454e-06, + "loss": 0.82541364, + "num_input_tokens_seen": 155817750, + "router_z_loss_clip": 0.85107422, + "router_z_loss_mlp": 0.18334961, + "step": 7269, + "time_per_iteration": 2.4424195289611816 + }, + { + "auxiliary_loss_clip": 0.01139411, + "auxiliary_loss_mlp": 0.01042613, + "balance_loss_clip": 1.0561955, + "balance_loss_mlp": 1.02891541, + "epoch": 0.43709604689613707, + "flos": 23222497409280.0, + "grad_norm": 2.0526857341515767, + "language_loss": 0.74981809, + "learning_rate": 2.499212869804237e-06, + "loss": 0.77163827, + "num_input_tokens_seen": 155836490, + "router_z_loss_clip": 0.83251953, + "router_z_loss_mlp": 0.13684082, + "step": 7270, + "time_per_iteration": 2.490072727203369 + }, + { + "auxiliary_loss_clip": 0.01139048, + "auxiliary_loss_mlp": 0.01036742, + "balance_loss_clip": 1.05547237, + "balance_loss_mlp": 1.02262735, + "epoch": 0.43715617014880503, + "flos": 23803711378560.0, + "grad_norm": 1.7647188501766709, + "language_loss": 0.7948584, + "learning_rate": 2.4988357261468182e-06, + "loss": 0.8166163, + "num_input_tokens_seen": 155856225, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.14123535, + "step": 7271, + "time_per_iteration": 2.494168281555176 + }, + { + "auxiliary_loss_clip": 0.01095773, + "auxiliary_loss_mlp": 0.0101392, + "balance_loss_clip": 1.06641471, + "balance_loss_mlp": 1.01130319, + "epoch": 0.437216293401473, + "flos": 61941204766080.0, + "grad_norm": 0.7053822362623589, + "language_loss": 0.54838538, + "learning_rate": 2.4984585635734993e-06, + "loss": 0.56948227, + "num_input_tokens_seen": 155916770, + "router_z_loss_clip": 0.29394531, + "router_z_loss_mlp": 0.02615356, + "step": 7272, + "time_per_iteration": 3.1270105838775635 + }, + { + "auxiliary_loss_clip": 0.01146639, + "auxiliary_loss_mlp": 0.01047163, + "balance_loss_clip": 1.05768156, + "balance_loss_mlp": 1.03208947, + "epoch": 0.43727641665414096, + "flos": 21982250655360.0, + "grad_norm": 1.602625802442049, + "language_loss": 0.6999293, + "learning_rate": 2.498081382098581e-06, + "loss": 0.72186732, + "num_input_tokens_seen": 155936490, + "router_z_loss_clip": 0.88867188, + "router_z_loss_mlp": 0.15081787, + "step": 7273, + "time_per_iteration": 2.483262062072754 + }, + { + "auxiliary_loss_clip": 0.01143416, + "auxiliary_loss_mlp": 0.01040281, + "balance_loss_clip": 1.05805182, + "balance_loss_mlp": 1.02558839, + "epoch": 0.437336539906809, + "flos": 39530860842240.0, + "grad_norm": 1.8910031701383008, + "language_loss": 0.75536573, + "learning_rate": 2.497704181736367e-06, + "loss": 0.77720273, + "num_input_tokens_seen": 155957595, + "router_z_loss_clip": 0.85400391, + "router_z_loss_mlp": 0.14691162, + "step": 7274, + "time_per_iteration": 2.6005442142486572 + }, + { + "auxiliary_loss_clip": 0.01138379, + "auxiliary_loss_mlp": 0.0102959, + "balance_loss_clip": 1.05630982, + "balance_loss_mlp": 1.01746035, + "epoch": 0.43739666315947695, + "flos": 17457147181440.0, + "grad_norm": 1.6478224930048346, + "language_loss": 0.80223113, + "learning_rate": 2.49732696250116e-06, + "loss": 0.82391083, + "num_input_tokens_seen": 155975710, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.12127686, + "step": 7275, + "time_per_iteration": 3.9316275119781494 + }, + { + "auxiliary_loss_clip": 0.01142963, + "auxiliary_loss_mlp": 0.01037074, + "balance_loss_clip": 1.05887616, + "balance_loss_mlp": 1.02340066, + "epoch": 0.4374567864121449, + "flos": 16358747235840.0, + "grad_norm": 2.0678858817406436, + "language_loss": 0.80576348, + "learning_rate": 2.496949724407266e-06, + "loss": 0.82756388, + "num_input_tokens_seen": 155993090, + "router_z_loss_clip": 0.84130859, + "router_z_loss_mlp": 0.13671875, + "step": 7276, + "time_per_iteration": 2.468252182006836 + }, + { + "auxiliary_loss_clip": 0.01142022, + "auxiliary_loss_mlp": 0.01029308, + "balance_loss_clip": 1.05512166, + "balance_loss_mlp": 1.01456785, + "epoch": 0.4375169096648129, + "flos": 30587523834240.0, + "grad_norm": 1.9754715723525185, + "language_loss": 0.73362631, + "learning_rate": 2.496572467468988e-06, + "loss": 0.75533962, + "num_input_tokens_seen": 156013685, + "router_z_loss_clip": 0.87060547, + "router_z_loss_mlp": 0.1473999, + "step": 7277, + "time_per_iteration": 2.5355844497680664 + }, + { + "auxiliary_loss_clip": 0.01144453, + "auxiliary_loss_mlp": 0.01035386, + "balance_loss_clip": 1.06324029, + "balance_loss_mlp": 1.02078271, + "epoch": 0.43757703291748085, + "flos": 30555599621760.0, + "grad_norm": 2.1215899618983243, + "language_loss": 0.72784263, + "learning_rate": 2.4961951917006317e-06, + "loss": 0.74964106, + "num_input_tokens_seen": 156034300, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.1461792, + "step": 7278, + "time_per_iteration": 2.5254263877868652 + }, + { + "auxiliary_loss_clip": 0.01140929, + "auxiliary_loss_mlp": 0.01034264, + "balance_loss_clip": 1.06204998, + "balance_loss_mlp": 1.02164531, + "epoch": 0.4376371561701488, + "flos": 21397373498880.0, + "grad_norm": 1.5741701619693362, + "language_loss": 0.66151094, + "learning_rate": 2.4958178971165046e-06, + "loss": 0.68326283, + "num_input_tokens_seen": 156053805, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.12628174, + "step": 7279, + "time_per_iteration": 2.49233341217041 + }, + { + "auxiliary_loss_clip": 0.01138421, + "auxiliary_loss_mlp": 0.01032837, + "balance_loss_clip": 1.05518293, + "balance_loss_mlp": 1.01899636, + "epoch": 0.4376972794228168, + "flos": 23404384903680.0, + "grad_norm": 1.8665919294289626, + "language_loss": 0.81754637, + "learning_rate": 2.4954405837309126e-06, + "loss": 0.83925885, + "num_input_tokens_seen": 156073295, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 0.13848877, + "step": 7280, + "time_per_iteration": 2.618887424468994 + }, + { + "auxiliary_loss_clip": 0.01129531, + "auxiliary_loss_mlp": 0.01030405, + "balance_loss_clip": 1.05232191, + "balance_loss_mlp": 1.0174942, + "epoch": 0.43775740267548474, + "flos": 22892945103360.0, + "grad_norm": 1.5692294110080869, + "language_loss": 0.76560885, + "learning_rate": 2.4950632515581653e-06, + "loss": 0.7872082, + "num_input_tokens_seen": 156094540, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.12921143, + "step": 7281, + "time_per_iteration": 2.5429599285125732 + }, + { + "auxiliary_loss_clip": 0.01129317, + "auxiliary_loss_mlp": 0.01035724, + "balance_loss_clip": 1.04911792, + "balance_loss_mlp": 1.0226469, + "epoch": 0.4378175259281527, + "flos": 23294390480640.0, + "grad_norm": 1.8072370429215379, + "language_loss": 0.75656539, + "learning_rate": 2.494685900612569e-06, + "loss": 0.77821577, + "num_input_tokens_seen": 156114070, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.13092041, + "step": 7282, + "time_per_iteration": 2.4820363521575928 + }, + { + "auxiliary_loss_clip": 0.01144263, + "auxiliary_loss_mlp": 0.01034429, + "balance_loss_clip": 1.062886, + "balance_loss_mlp": 1.02040982, + "epoch": 0.43787764918082067, + "flos": 23876897339520.0, + "grad_norm": 1.894807905801312, + "language_loss": 0.8484481, + "learning_rate": 2.4943085309084333e-06, + "loss": 0.87023497, + "num_input_tokens_seen": 156132130, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.14025879, + "step": 7283, + "time_per_iteration": 2.4996204376220703 + }, + { + "auxiliary_loss_clip": 0.0113604, + "auxiliary_loss_mlp": 0.01031758, + "balance_loss_clip": 1.05198812, + "balance_loss_mlp": 1.01736951, + "epoch": 0.43793777243348864, + "flos": 23988148738560.0, + "grad_norm": 1.7584612921481344, + "language_loss": 0.80112165, + "learning_rate": 2.49393114246007e-06, + "loss": 0.82279962, + "num_input_tokens_seen": 156150820, + "router_z_loss_clip": 0.84130859, + "router_z_loss_mlp": 0.14398193, + "step": 7284, + "time_per_iteration": 2.5047361850738525 + }, + { + "auxiliary_loss_clip": 0.01137702, + "auxiliary_loss_mlp": 0.01036302, + "balance_loss_clip": 1.05830693, + "balance_loss_mlp": 1.02354622, + "epoch": 0.4379978956861566, + "flos": 18624064320000.0, + "grad_norm": 1.4898600347626063, + "language_loss": 0.80211723, + "learning_rate": 2.493553735281787e-06, + "loss": 0.82385731, + "num_input_tokens_seen": 156170125, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.12750244, + "step": 7285, + "time_per_iteration": 2.5125911235809326 + }, + { + "auxiliary_loss_clip": 0.01136436, + "auxiliary_loss_mlp": 0.01033388, + "balance_loss_clip": 1.05488825, + "balance_loss_mlp": 1.01945281, + "epoch": 0.43805801893882457, + "flos": 21981388728960.0, + "grad_norm": 1.99215894186845, + "language_loss": 0.75004411, + "learning_rate": 2.493176309387897e-06, + "loss": 0.77174234, + "num_input_tokens_seen": 156187320, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.13934326, + "step": 7286, + "time_per_iteration": 2.4481699466705322 + }, + { + "auxiliary_loss_clip": 0.01124046, + "auxiliary_loss_mlp": 0.010292, + "balance_loss_clip": 1.04331231, + "balance_loss_mlp": 1.01550913, + "epoch": 0.43811814219149253, + "flos": 26393337383040.0, + "grad_norm": 1.8178739968796764, + "language_loss": 0.73657656, + "learning_rate": 2.492798864792712e-06, + "loss": 0.75810909, + "num_input_tokens_seen": 156207455, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.13684082, + "step": 7287, + "time_per_iteration": 3.969360589981079 + }, + { + "auxiliary_loss_clip": 0.01134392, + "auxiliary_loss_mlp": 0.01038704, + "balance_loss_clip": 1.05166292, + "balance_loss_mlp": 1.0241009, + "epoch": 0.43817826544416055, + "flos": 17493309198720.0, + "grad_norm": 1.7660660730768611, + "language_loss": 0.8233366, + "learning_rate": 2.492421401510545e-06, + "loss": 0.84506762, + "num_input_tokens_seen": 156226560, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.14611816, + "step": 7288, + "time_per_iteration": 2.498898983001709 + }, + { + "auxiliary_loss_clip": 0.0113495, + "auxiliary_loss_mlp": 0.01034306, + "balance_loss_clip": 1.05151629, + "balance_loss_mlp": 1.02063251, + "epoch": 0.4382383886968285, + "flos": 21581020759680.0, + "grad_norm": 1.483948355227729, + "language_loss": 0.84221828, + "learning_rate": 2.4920439195557093e-06, + "loss": 0.86391091, + "num_input_tokens_seen": 156246740, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.13677979, + "step": 7289, + "time_per_iteration": 2.457627296447754 + }, + { + "auxiliary_loss_clip": 0.01143495, + "auxiliary_loss_mlp": 0.01034344, + "balance_loss_clip": 1.05433154, + "balance_loss_mlp": 1.02035499, + "epoch": 0.4382985119494965, + "flos": 27923742201600.0, + "grad_norm": 1.5912341905217895, + "language_loss": 0.78097904, + "learning_rate": 2.4916664189425183e-06, + "loss": 0.8027575, + "num_input_tokens_seen": 156266440, + "router_z_loss_clip": 0.89160156, + "router_z_loss_mlp": 0.14001465, + "step": 7290, + "time_per_iteration": 2.513643741607666 + }, + { + "auxiliary_loss_clip": 0.01145836, + "auxiliary_loss_mlp": 0.01037558, + "balance_loss_clip": 1.06425023, + "balance_loss_mlp": 1.02433753, + "epoch": 0.43835863520216445, + "flos": 24936836797440.0, + "grad_norm": 1.6272117973742326, + "language_loss": 0.77803075, + "learning_rate": 2.491288899685288e-06, + "loss": 0.79986477, + "num_input_tokens_seen": 156286900, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.13220215, + "step": 7291, + "time_per_iteration": 2.4735891819000244 + }, + { + "auxiliary_loss_clip": 0.01129787, + "auxiliary_loss_mlp": 0.01031849, + "balance_loss_clip": 1.04975653, + "balance_loss_mlp": 1.01843214, + "epoch": 0.4384187584548324, + "flos": 33510293504640.0, + "grad_norm": 1.6303815955330403, + "language_loss": 0.64850247, + "learning_rate": 2.4909113617983325e-06, + "loss": 0.67011881, + "num_input_tokens_seen": 156307690, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.13421631, + "step": 7292, + "time_per_iteration": 2.583550214767456 + }, + { + "auxiliary_loss_clip": 0.01135775, + "auxiliary_loss_mlp": 0.01029373, + "balance_loss_clip": 1.0541482, + "balance_loss_mlp": 1.01700497, + "epoch": 0.4384788817075004, + "flos": 23951052967680.0, + "grad_norm": 1.6668382504658341, + "language_loss": 0.7424438, + "learning_rate": 2.49053380529597e-06, + "loss": 0.76409531, + "num_input_tokens_seen": 156326620, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.1237793, + "step": 7293, + "time_per_iteration": 2.6495115756988525 + }, + { + "auxiliary_loss_clip": 0.0113888, + "auxiliary_loss_mlp": 0.01038331, + "balance_loss_clip": 1.05439043, + "balance_loss_mlp": 1.02357841, + "epoch": 0.43853900496016834, + "flos": 19098516090240.0, + "grad_norm": 1.8780649162615075, + "language_loss": 0.78627032, + "learning_rate": 2.490156230192516e-06, + "loss": 0.80804235, + "num_input_tokens_seen": 156345495, + "router_z_loss_clip": 0.84423828, + "router_z_loss_mlp": 0.14758301, + "step": 7294, + "time_per_iteration": 2.454606294631958 + }, + { + "auxiliary_loss_clip": 0.01131387, + "auxiliary_loss_mlp": 0.01034706, + "balance_loss_clip": 1.05010962, + "balance_loss_mlp": 1.02181315, + "epoch": 0.4385991282128363, + "flos": 13225362168960.0, + "grad_norm": 1.8290752515254085, + "language_loss": 0.7325269, + "learning_rate": 2.4897786365022883e-06, + "loss": 0.75418782, + "num_input_tokens_seen": 156363155, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.12896729, + "step": 7295, + "time_per_iteration": 2.432025909423828 + }, + { + "auxiliary_loss_clip": 0.01134137, + "auxiliary_loss_mlp": 0.0103654, + "balance_loss_clip": 1.05234981, + "balance_loss_mlp": 1.02233601, + "epoch": 0.4386592514655043, + "flos": 14319883445760.0, + "grad_norm": 2.361899579621084, + "language_loss": 0.75402564, + "learning_rate": 2.4894010242396063e-06, + "loss": 0.7757324, + "num_input_tokens_seen": 156380940, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.14202881, + "step": 7296, + "time_per_iteration": 2.467679738998413 + }, + { + "auxiliary_loss_clip": 0.01130302, + "auxiliary_loss_mlp": 0.01029359, + "balance_loss_clip": 1.05098319, + "balance_loss_mlp": 1.01634753, + "epoch": 0.43871937471817224, + "flos": 22784423137920.0, + "grad_norm": 2.2182384835253113, + "language_loss": 0.69536066, + "learning_rate": 2.4890233934187873e-06, + "loss": 0.71695727, + "num_input_tokens_seen": 156400415, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.13018799, + "step": 7297, + "time_per_iteration": 2.4503631591796875 + }, + { + "auxiliary_loss_clip": 0.01137247, + "auxiliary_loss_mlp": 0.01029714, + "balance_loss_clip": 1.05672145, + "balance_loss_mlp": 1.01716113, + "epoch": 0.4387794979708402, + "flos": 28072304853120.0, + "grad_norm": 1.3832750089065613, + "language_loss": 0.70708477, + "learning_rate": 2.4886457440541535e-06, + "loss": 0.7287544, + "num_input_tokens_seen": 156421120, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.12573242, + "step": 7298, + "time_per_iteration": 2.5536630153656006 + }, + { + "auxiliary_loss_clip": 0.01124001, + "auxiliary_loss_mlp": 0.01032887, + "balance_loss_clip": 1.04634225, + "balance_loss_mlp": 1.01854575, + "epoch": 0.43883962122350817, + "flos": 26249551240320.0, + "grad_norm": 1.4673976242272486, + "language_loss": 0.72799814, + "learning_rate": 2.4882680761600238e-06, + "loss": 0.74956703, + "num_input_tokens_seen": 156441535, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.14337158, + "step": 7299, + "time_per_iteration": 3.9711618423461914 + }, + { + "auxiliary_loss_clip": 0.01132143, + "auxiliary_loss_mlp": 0.01039511, + "balance_loss_clip": 1.05095577, + "balance_loss_mlp": 1.02515841, + "epoch": 0.43889974447617613, + "flos": 25883765089920.0, + "grad_norm": 1.7043219904787055, + "language_loss": 0.77125823, + "learning_rate": 2.487890389750719e-06, + "loss": 0.79297483, + "num_input_tokens_seen": 156462015, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.14355469, + "step": 7300, + "time_per_iteration": 2.4742414951324463 + }, + { + "auxiliary_loss_clip": 0.01132463, + "auxiliary_loss_mlp": 0.01034584, + "balance_loss_clip": 1.05273128, + "balance_loss_mlp": 1.02139318, + "epoch": 0.43895986772884416, + "flos": 25046615738880.0, + "grad_norm": 1.838334210538858, + "language_loss": 0.70946491, + "learning_rate": 2.4875126848405626e-06, + "loss": 0.73113537, + "num_input_tokens_seen": 156482165, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.13189697, + "step": 7301, + "time_per_iteration": 2.5476667881011963 + }, + { + "auxiliary_loss_clip": 0.01139852, + "auxiliary_loss_mlp": 0.01035982, + "balance_loss_clip": 1.05642033, + "balance_loss_mlp": 1.02168274, + "epoch": 0.4390199909815121, + "flos": 25994585525760.0, + "grad_norm": 1.9545693714878971, + "language_loss": 0.70682251, + "learning_rate": 2.4871349614438757e-06, + "loss": 0.72858083, + "num_input_tokens_seen": 156503170, + "router_z_loss_clip": 0.83496094, + "router_z_loss_mlp": 0.14294434, + "step": 7302, + "time_per_iteration": 2.5089473724365234 + }, + { + "auxiliary_loss_clip": 0.01134488, + "auxiliary_loss_mlp": 0.01037324, + "balance_loss_clip": 1.05425155, + "balance_loss_mlp": 1.02484846, + "epoch": 0.4390801142341801, + "flos": 29022249888000.0, + "grad_norm": 2.4156487357513123, + "language_loss": 0.8207289, + "learning_rate": 2.486757219574983e-06, + "loss": 0.84244698, + "num_input_tokens_seen": 156523005, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.12481689, + "step": 7303, + "time_per_iteration": 2.595808744430542 + }, + { + "auxiliary_loss_clip": 0.01139094, + "auxiliary_loss_mlp": 0.01042423, + "balance_loss_clip": 1.05471718, + "balance_loss_mlp": 1.02700353, + "epoch": 0.43914023748684805, + "flos": 33438544087680.0, + "grad_norm": 2.4227589261446405, + "language_loss": 0.68467426, + "learning_rate": 2.4863794592482067e-06, + "loss": 0.70648944, + "num_input_tokens_seen": 156544440, + "router_z_loss_clip": 0.84326172, + "router_z_loss_mlp": 0.15435791, + "step": 7304, + "time_per_iteration": 3.9317147731781006 + }, + { + "auxiliary_loss_clip": 0.01140765, + "auxiliary_loss_mlp": 0.01030652, + "balance_loss_clip": 1.0614953, + "balance_loss_mlp": 1.01861215, + "epoch": 0.439200360739516, + "flos": 34531844302080.0, + "grad_norm": 1.453210417682829, + "language_loss": 0.77991694, + "learning_rate": 2.486001680477873e-06, + "loss": 0.80163109, + "num_input_tokens_seen": 156565410, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.12030029, + "step": 7305, + "time_per_iteration": 2.5620388984680176 + }, + { + "auxiliary_loss_clip": 0.01132479, + "auxiliary_loss_mlp": 0.01035586, + "balance_loss_clip": 1.05301976, + "balance_loss_mlp": 1.02231169, + "epoch": 0.439260483992184, + "flos": 21907843632000.0, + "grad_norm": 1.9023374940762254, + "language_loss": 0.6894595, + "learning_rate": 2.485623883278308e-06, + "loss": 0.71114016, + "num_input_tokens_seen": 156584210, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.13275146, + "step": 7306, + "time_per_iteration": 2.525843858718872 + }, + { + "auxiliary_loss_clip": 0.0114142, + "auxiliary_loss_mlp": 0.01033312, + "balance_loss_clip": 1.05963421, + "balance_loss_mlp": 1.01983523, + "epoch": 0.43932060724485195, + "flos": 20996430912000.0, + "grad_norm": 1.646028683535015, + "language_loss": 0.62607419, + "learning_rate": 2.4852460676638344e-06, + "loss": 0.64782155, + "num_input_tokens_seen": 156602730, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.13482666, + "step": 7307, + "time_per_iteration": 2.4736669063568115 + }, + { + "auxiliary_loss_clip": 0.0113336, + "auxiliary_loss_mlp": 0.01039539, + "balance_loss_clip": 1.0491569, + "balance_loss_mlp": 1.02506638, + "epoch": 0.4393807304975199, + "flos": 17747053850880.0, + "grad_norm": 2.0608372910835366, + "language_loss": 0.72267449, + "learning_rate": 2.4848682336487828e-06, + "loss": 0.74440348, + "num_input_tokens_seen": 156619405, + "router_z_loss_clip": 0.84179688, + "router_z_loss_mlp": 0.14477539, + "step": 7308, + "time_per_iteration": 2.4037344455718994 + }, + { + "auxiliary_loss_clip": 0.01131029, + "auxiliary_loss_mlp": 0.01042171, + "balance_loss_clip": 1.0470655, + "balance_loss_mlp": 1.02611351, + "epoch": 0.4394408537501879, + "flos": 22528523669760.0, + "grad_norm": 2.6344211610520474, + "language_loss": 0.77336419, + "learning_rate": 2.4844903812474787e-06, + "loss": 0.79509616, + "num_input_tokens_seen": 156638165, + "router_z_loss_clip": 0.83935547, + "router_z_loss_mlp": 0.16064453, + "step": 7309, + "time_per_iteration": 2.4730379581451416 + }, + { + "auxiliary_loss_clip": 0.01129837, + "auxiliary_loss_mlp": 0.010336, + "balance_loss_clip": 1.05246735, + "balance_loss_mlp": 1.02112496, + "epoch": 0.43950097700285584, + "flos": 23440654661760.0, + "grad_norm": 1.8164209110181213, + "language_loss": 0.71067262, + "learning_rate": 2.484112510474251e-06, + "loss": 0.73230702, + "num_input_tokens_seen": 156658845, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.12469482, + "step": 7310, + "time_per_iteration": 2.4814343452453613 + }, + { + "auxiliary_loss_clip": 0.01141321, + "auxiliary_loss_mlp": 0.01036967, + "balance_loss_clip": 1.05813658, + "balance_loss_mlp": 1.02324021, + "epoch": 0.4395611002555238, + "flos": 23180696956800.0, + "grad_norm": 2.239581758020995, + "language_loss": 0.75794584, + "learning_rate": 2.483734621343429e-06, + "loss": 0.77972877, + "num_input_tokens_seen": 156677275, + "router_z_loss_clip": 0.83300781, + "router_z_loss_mlp": 0.13720703, + "step": 7311, + "time_per_iteration": 2.4447054862976074 + }, + { + "auxiliary_loss_clip": 0.0113556, + "auxiliary_loss_mlp": 0.01038162, + "balance_loss_clip": 1.05275917, + "balance_loss_mlp": 1.0246017, + "epoch": 0.43962122350819177, + "flos": 22127365601280.0, + "grad_norm": 1.9796776066566555, + "language_loss": 0.81280386, + "learning_rate": 2.483356713869341e-06, + "loss": 0.83454108, + "num_input_tokens_seen": 156695815, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.13568115, + "step": 7312, + "time_per_iteration": 2.423795700073242 + }, + { + "auxiliary_loss_clip": 0.01136747, + "auxiliary_loss_mlp": 0.01034896, + "balance_loss_clip": 1.05198359, + "balance_loss_mlp": 1.02160966, + "epoch": 0.43968134676085974, + "flos": 17420554200960.0, + "grad_norm": 2.0860573057485636, + "language_loss": 0.84674352, + "learning_rate": 2.482978788066318e-06, + "loss": 0.86845994, + "num_input_tokens_seen": 156714385, + "router_z_loss_clip": 0.84619141, + "router_z_loss_mlp": 0.13275146, + "step": 7313, + "time_per_iteration": 2.4534595012664795 + }, + { + "auxiliary_loss_clip": 0.01137552, + "auxiliary_loss_mlp": 0.01034587, + "balance_loss_clip": 1.05510259, + "balance_loss_mlp": 1.02115238, + "epoch": 0.43974147001352776, + "flos": 18952646958720.0, + "grad_norm": 1.8530168187241702, + "language_loss": 0.67788172, + "learning_rate": 2.4826008439486904e-06, + "loss": 0.69960308, + "num_input_tokens_seen": 156732615, + "router_z_loss_clip": 0.82421875, + "router_z_loss_mlp": 0.13433838, + "step": 7314, + "time_per_iteration": 2.4224979877471924 + }, + { + "auxiliary_loss_clip": 0.01138333, + "auxiliary_loss_mlp": 0.01031649, + "balance_loss_clip": 1.05475509, + "balance_loss_mlp": 1.01798141, + "epoch": 0.4398015932661957, + "flos": 18953508885120.0, + "grad_norm": 1.7464840495784737, + "language_loss": 0.76396006, + "learning_rate": 2.4822228815307915e-06, + "loss": 0.78565991, + "num_input_tokens_seen": 156750920, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.13677979, + "step": 7315, + "time_per_iteration": 2.446892261505127 + }, + { + "auxiliary_loss_clip": 0.01144292, + "auxiliary_loss_mlp": 0.01030853, + "balance_loss_clip": 1.06385064, + "balance_loss_mlp": 1.01822245, + "epoch": 0.4398617165188637, + "flos": 24199913370240.0, + "grad_norm": 2.7975622325945113, + "language_loss": 0.7460078, + "learning_rate": 2.4818449008269523e-06, + "loss": 0.7677592, + "num_input_tokens_seen": 156768520, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.12646484, + "step": 7316, + "time_per_iteration": 2.511324882507324 + }, + { + "auxiliary_loss_clip": 0.01138972, + "auxiliary_loss_mlp": 0.01035519, + "balance_loss_clip": 1.05949688, + "balance_loss_mlp": 1.0233295, + "epoch": 0.43992183977153165, + "flos": 22236677665920.0, + "grad_norm": 2.4105048087537755, + "language_loss": 0.65993941, + "learning_rate": 2.481466901851506e-06, + "loss": 0.68168432, + "num_input_tokens_seen": 156788700, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.12182617, + "step": 7317, + "time_per_iteration": 2.4861793518066406 + }, + { + "auxiliary_loss_clip": 0.01137214, + "auxiliary_loss_mlp": 0.01033203, + "balance_loss_clip": 1.05594349, + "balance_loss_mlp": 1.01972055, + "epoch": 0.4399819630241996, + "flos": 18697465762560.0, + "grad_norm": 1.8199063635961938, + "language_loss": 0.79809612, + "learning_rate": 2.4810888846187865e-06, + "loss": 0.81980026, + "num_input_tokens_seen": 156806470, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.13476562, + "step": 7318, + "time_per_iteration": 2.4295735359191895 + }, + { + "auxiliary_loss_clip": 0.01141143, + "auxiliary_loss_mlp": 0.01036243, + "balance_loss_clip": 1.05758691, + "balance_loss_mlp": 1.02295685, + "epoch": 0.4400420862768676, + "flos": 23879375377920.0, + "grad_norm": 2.554853553099767, + "language_loss": 0.7980234, + "learning_rate": 2.4807108491431283e-06, + "loss": 0.81979728, + "num_input_tokens_seen": 156825895, + "router_z_loss_clip": 0.83496094, + "router_z_loss_mlp": 0.13287354, + "step": 7319, + "time_per_iteration": 2.534968137741089 + }, + { + "auxiliary_loss_clip": 0.01144214, + "auxiliary_loss_mlp": 0.01038429, + "balance_loss_clip": 1.05898166, + "balance_loss_mlp": 1.02442217, + "epoch": 0.44010220952953555, + "flos": 28037615293440.0, + "grad_norm": 1.8660034380700827, + "language_loss": 0.79848742, + "learning_rate": 2.4803327954388667e-06, + "loss": 0.82031393, + "num_input_tokens_seen": 156845990, + "router_z_loss_clip": 0.85205078, + "router_z_loss_mlp": 0.13989258, + "step": 7320, + "time_per_iteration": 4.103570222854614 + }, + { + "auxiliary_loss_clip": 0.01135694, + "auxiliary_loss_mlp": 0.01039905, + "balance_loss_clip": 1.05632293, + "balance_loss_mlp": 1.02677965, + "epoch": 0.4401623327822035, + "flos": 23768985905280.0, + "grad_norm": 1.997208923184229, + "language_loss": 0.69681704, + "learning_rate": 2.4799547235203376e-06, + "loss": 0.71857309, + "num_input_tokens_seen": 156866685, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.13116455, + "step": 7321, + "time_per_iteration": 2.4478721618652344 + }, + { + "auxiliary_loss_clip": 0.01102254, + "auxiliary_loss_mlp": 0.01009359, + "balance_loss_clip": 1.07333851, + "balance_loss_mlp": 1.00693703, + "epoch": 0.4402224560348715, + "flos": 70774583264640.0, + "grad_norm": 0.8768998208062518, + "language_loss": 0.56891721, + "learning_rate": 2.4795766334018763e-06, + "loss": 0.59003329, + "num_input_tokens_seen": 156923450, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 0.02418518, + "step": 7322, + "time_per_iteration": 3.1608150005340576 + }, + { + "auxiliary_loss_clip": 0.01146042, + "auxiliary_loss_mlp": 0.01027398, + "balance_loss_clip": 1.06635892, + "balance_loss_mlp": 1.01599586, + "epoch": 0.44028257928753944, + "flos": 22891795868160.0, + "grad_norm": 1.4706273832076875, + "language_loss": 0.76298648, + "learning_rate": 2.479198525097822e-06, + "loss": 0.7847209, + "num_input_tokens_seen": 156944795, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.11407471, + "step": 7323, + "time_per_iteration": 2.4933550357818604 + }, + { + "auxiliary_loss_clip": 0.01144706, + "auxiliary_loss_mlp": 0.01035294, + "balance_loss_clip": 1.06285632, + "balance_loss_mlp": 1.0220561, + "epoch": 0.4403427025402074, + "flos": 17895760156800.0, + "grad_norm": 1.6460117379862858, + "language_loss": 0.8087368, + "learning_rate": 2.478820398622511e-06, + "loss": 0.83053678, + "num_input_tokens_seen": 156962755, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.13244629, + "step": 7324, + "time_per_iteration": 2.4934945106506348 + }, + { + "auxiliary_loss_clip": 0.01064949, + "auxiliary_loss_mlp": 0.01009134, + "balance_loss_clip": 1.03694415, + "balance_loss_mlp": 1.00774062, + "epoch": 0.4404028257928754, + "flos": 69562525708800.0, + "grad_norm": 0.6756960935688637, + "language_loss": 0.54536819, + "learning_rate": 2.478442253990283e-06, + "loss": 0.566109, + "num_input_tokens_seen": 157028095, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.01393127, + "step": 7325, + "time_per_iteration": 3.0887463092803955 + }, + { + "auxiliary_loss_clip": 0.01145024, + "auxiliary_loss_mlp": 0.01026333, + "balance_loss_clip": 1.06813145, + "balance_loss_mlp": 1.0146203, + "epoch": 0.44046294904554334, + "flos": 20923675914240.0, + "grad_norm": 1.5835389477473847, + "language_loss": 0.69944364, + "learning_rate": 2.4780640912154766e-06, + "loss": 0.72115719, + "num_input_tokens_seen": 157048365, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.11712646, + "step": 7326, + "time_per_iteration": 2.491668224334717 + }, + { + "auxiliary_loss_clip": 0.01136878, + "auxiliary_loss_mlp": 0.01029442, + "balance_loss_clip": 1.05836058, + "balance_loss_mlp": 1.01646614, + "epoch": 0.44052307229821136, + "flos": 23623475909760.0, + "grad_norm": 3.8632260466503894, + "language_loss": 0.76558918, + "learning_rate": 2.477685910312432e-06, + "loss": 0.78725237, + "num_input_tokens_seen": 157069130, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.12988281, + "step": 7327, + "time_per_iteration": 2.5308878421783447 + }, + { + "auxiliary_loss_clip": 0.01135106, + "auxiliary_loss_mlp": 0.0103066, + "balance_loss_clip": 1.05758905, + "balance_loss_mlp": 1.0181725, + "epoch": 0.4405831955508793, + "flos": 17597665186560.0, + "grad_norm": 1.9035984609455767, + "language_loss": 0.84186661, + "learning_rate": 2.4773077112954897e-06, + "loss": 0.86352426, + "num_input_tokens_seen": 157084940, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12506104, + "step": 7328, + "time_per_iteration": 2.4708240032196045 + }, + { + "auxiliary_loss_clip": 0.0113418, + "auxiliary_loss_mlp": 0.01028351, + "balance_loss_clip": 1.05586326, + "balance_loss_mlp": 1.01565504, + "epoch": 0.4406433188035473, + "flos": 21463376739840.0, + "grad_norm": 2.037066602823262, + "language_loss": 0.779037, + "learning_rate": 2.4769294941789908e-06, + "loss": 0.80066234, + "num_input_tokens_seen": 157102770, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.12695312, + "step": 7329, + "time_per_iteration": 2.4987006187438965 + }, + { + "auxiliary_loss_clip": 0.01136451, + "auxiliary_loss_mlp": 0.01033186, + "balance_loss_clip": 1.05261302, + "balance_loss_mlp": 1.01983428, + "epoch": 0.44070344205621526, + "flos": 22673566788480.0, + "grad_norm": 1.6008856674983427, + "language_loss": 0.73709583, + "learning_rate": 2.476551258977278e-06, + "loss": 0.75879216, + "num_input_tokens_seen": 157122035, + "router_z_loss_clip": 0.83837891, + "router_z_loss_mlp": 0.13372803, + "step": 7330, + "time_per_iteration": 2.589447259902954 + }, + { + "auxiliary_loss_clip": 0.01147718, + "auxiliary_loss_mlp": 0.01031918, + "balance_loss_clip": 1.06678391, + "balance_loss_mlp": 1.01977682, + "epoch": 0.4407635653088832, + "flos": 23441193365760.0, + "grad_norm": 1.6846168568986515, + "language_loss": 0.74806249, + "learning_rate": 2.4761730057046936e-06, + "loss": 0.76985884, + "num_input_tokens_seen": 157142800, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.12145996, + "step": 7331, + "time_per_iteration": 3.9719979763031006 + }, + { + "auxiliary_loss_clip": 0.01131537, + "auxiliary_loss_mlp": 0.01028517, + "balance_loss_clip": 1.05417967, + "balance_loss_mlp": 1.01614928, + "epoch": 0.4408236885615512, + "flos": 24021294013440.0, + "grad_norm": 1.3888340086365074, + "language_loss": 0.76265419, + "learning_rate": 2.475794734375581e-06, + "loss": 0.78425479, + "num_input_tokens_seen": 157163295, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.12359619, + "step": 7332, + "time_per_iteration": 2.6102333068847656 + }, + { + "auxiliary_loss_clip": 0.01131519, + "auxiliary_loss_mlp": 0.01037953, + "balance_loss_clip": 1.05312765, + "balance_loss_mlp": 1.02609754, + "epoch": 0.44088381181421915, + "flos": 12676826597760.0, + "grad_norm": 2.0288304224347486, + "language_loss": 0.73929965, + "learning_rate": 2.475416445004285e-06, + "loss": 0.76099443, + "num_input_tokens_seen": 157180890, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.11853027, + "step": 7333, + "time_per_iteration": 2.4595391750335693 + }, + { + "auxiliary_loss_clip": 0.01133609, + "auxiliary_loss_mlp": 0.01036064, + "balance_loss_clip": 1.05592942, + "balance_loss_mlp": 1.02343965, + "epoch": 0.4409439350668871, + "flos": 24569865498240.0, + "grad_norm": 2.254844299910427, + "language_loss": 0.79701233, + "learning_rate": 2.4750381376051493e-06, + "loss": 0.81870914, + "num_input_tokens_seen": 157200580, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.1262207, + "step": 7334, + "time_per_iteration": 2.534562587738037 + }, + { + "auxiliary_loss_clip": 0.01141092, + "auxiliary_loss_mlp": 0.01038246, + "balance_loss_clip": 1.05386364, + "balance_loss_mlp": 1.02237928, + "epoch": 0.4410040583195551, + "flos": 22668574798080.0, + "grad_norm": 1.9814390374489352, + "language_loss": 0.75348735, + "learning_rate": 2.47465981219252e-06, + "loss": 0.77528071, + "num_input_tokens_seen": 157218345, + "router_z_loss_clip": 0.87304688, + "router_z_loss_mlp": 0.15869141, + "step": 7335, + "time_per_iteration": 2.474794864654541 + }, + { + "auxiliary_loss_clip": 0.0113296, + "auxiliary_loss_mlp": 0.01034777, + "balance_loss_clip": 1.05279064, + "balance_loss_mlp": 1.02168155, + "epoch": 0.44106418157222305, + "flos": 10852528700160.0, + "grad_norm": 2.167633579589827, + "language_loss": 0.72229767, + "learning_rate": 2.4742814687807423e-06, + "loss": 0.74397504, + "num_input_tokens_seen": 157234395, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.13104248, + "step": 7336, + "time_per_iteration": 2.4217004776000977 + }, + { + "auxiliary_loss_clip": 0.01132539, + "auxiliary_loss_mlp": 0.01042695, + "balance_loss_clip": 1.04780865, + "balance_loss_mlp": 1.02898633, + "epoch": 0.441124304824891, + "flos": 21726710323200.0, + "grad_norm": 2.4477327335399663, + "language_loss": 0.634238, + "learning_rate": 2.473903107384165e-06, + "loss": 0.65599036, + "num_input_tokens_seen": 157254805, + "router_z_loss_clip": 0.84765625, + "router_z_loss_mlp": 0.13702393, + "step": 7337, + "time_per_iteration": 2.4961843490600586 + }, + { + "auxiliary_loss_clip": 0.01083072, + "auxiliary_loss_mlp": 0.01007781, + "balance_loss_clip": 1.05325007, + "balance_loss_mlp": 1.00627458, + "epoch": 0.441184428077559, + "flos": 63220486625280.0, + "grad_norm": 0.7323276601017157, + "language_loss": 0.52638698, + "learning_rate": 2.473524728017134e-06, + "loss": 0.54729545, + "num_input_tokens_seen": 157317870, + "router_z_loss_clip": 0.29833984, + "router_z_loss_mlp": 0.01507568, + "step": 7338, + "time_per_iteration": 3.1545145511627197 + }, + { + "auxiliary_loss_clip": 0.01135284, + "auxiliary_loss_mlp": 0.01045668, + "balance_loss_clip": 1.04918528, + "balance_loss_mlp": 1.0309155, + "epoch": 0.44124455133022694, + "flos": 21177959270400.0, + "grad_norm": 1.8895170143121844, + "language_loss": 0.70218086, + "learning_rate": 2.473146330693997e-06, + "loss": 0.72399038, + "num_input_tokens_seen": 157336505, + "router_z_loss_clip": 0.86035156, + "router_z_loss_mlp": 0.14746094, + "step": 7339, + "time_per_iteration": 2.5180513858795166 + }, + { + "auxiliary_loss_clip": 0.01128288, + "auxiliary_loss_mlp": 0.01036197, + "balance_loss_clip": 1.05203116, + "balance_loss_mlp": 1.02419865, + "epoch": 0.4413046745828949, + "flos": 17457865453440.0, + "grad_norm": 1.427202912468365, + "language_loss": 0.69729185, + "learning_rate": 2.472767915429105e-06, + "loss": 0.71893668, + "num_input_tokens_seen": 157354995, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.11999512, + "step": 7340, + "time_per_iteration": 2.4118802547454834 + }, + { + "auxiliary_loss_clip": 0.01067516, + "auxiliary_loss_mlp": 0.01006877, + "balance_loss_clip": 1.03775835, + "balance_loss_mlp": 1.00559747, + "epoch": 0.4413647978355629, + "flos": 61586153804160.0, + "grad_norm": 0.9026747265623769, + "language_loss": 0.64065516, + "learning_rate": 2.4723894822368054e-06, + "loss": 0.66139913, + "num_input_tokens_seen": 157404260, + "router_z_loss_clip": 0.29785156, + "router_z_loss_mlp": 0.01280212, + "step": 7341, + "time_per_iteration": 2.8811557292938232 + }, + { + "auxiliary_loss_clip": 0.01136766, + "auxiliary_loss_mlp": 0.01032991, + "balance_loss_clip": 1.05823886, + "balance_loss_mlp": 1.01982439, + "epoch": 0.4414249210882309, + "flos": 27527001505920.0, + "grad_norm": 2.7805832213504442, + "language_loss": 0.74248636, + "learning_rate": 2.47201103113145e-06, + "loss": 0.76418388, + "num_input_tokens_seen": 157423045, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.13171387, + "step": 7342, + "time_per_iteration": 3.9505674839019775 + }, + { + "auxiliary_loss_clip": 0.01127113, + "auxiliary_loss_mlp": 0.01033991, + "balance_loss_clip": 1.04708076, + "balance_loss_mlp": 1.02027011, + "epoch": 0.44148504434089886, + "flos": 23513984277120.0, + "grad_norm": 2.1947451282790547, + "language_loss": 0.79546356, + "learning_rate": 2.4716325621273886e-06, + "loss": 0.81707466, + "num_input_tokens_seen": 157441815, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.13739014, + "step": 7343, + "time_per_iteration": 2.4957995414733887 + }, + { + "auxiliary_loss_clip": 0.01131934, + "auxiliary_loss_mlp": 0.01031979, + "balance_loss_clip": 1.05124354, + "balance_loss_mlp": 1.01905632, + "epoch": 0.4415451675935668, + "flos": 21580589796480.0, + "grad_norm": 1.608700846204843, + "language_loss": 0.76990962, + "learning_rate": 2.4712540752389725e-06, + "loss": 0.79154873, + "num_input_tokens_seen": 157460470, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.1293335, + "step": 7344, + "time_per_iteration": 2.520545721054077 + }, + { + "auxiliary_loss_clip": 0.01072431, + "auxiliary_loss_mlp": 0.01005892, + "balance_loss_clip": 1.0424602, + "balance_loss_mlp": 1.00455129, + "epoch": 0.4416052908462348, + "flos": 59006368126080.0, + "grad_norm": 0.79442095749765, + "language_loss": 0.6380868, + "learning_rate": 2.470875570480556e-06, + "loss": 0.6588701, + "num_input_tokens_seen": 157512655, + "router_z_loss_clip": 0.30029297, + "router_z_loss_mlp": 0.01341248, + "step": 7345, + "time_per_iteration": 2.854095220565796 + }, + { + "auxiliary_loss_clip": 0.01149263, + "auxiliary_loss_mlp": 0.01033408, + "balance_loss_clip": 1.06747222, + "balance_loss_mlp": 1.02005053, + "epoch": 0.44166541409890275, + "flos": 26357642242560.0, + "grad_norm": 2.0911244736888457, + "language_loss": 0.86061156, + "learning_rate": 2.470497047866489e-06, + "loss": 0.88243818, + "num_input_tokens_seen": 157533700, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.13354492, + "step": 7346, + "time_per_iteration": 2.5902774333953857 + }, + { + "auxiliary_loss_clip": 0.01136168, + "auxiliary_loss_mlp": 0.01036779, + "balance_loss_clip": 1.05626214, + "balance_loss_mlp": 1.02276039, + "epoch": 0.4417255373515707, + "flos": 20192678231040.0, + "grad_norm": 6.395050517596193, + "language_loss": 0.80337811, + "learning_rate": 2.470118507411128e-06, + "loss": 0.82510763, + "num_input_tokens_seen": 157551105, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.14013672, + "step": 7347, + "time_per_iteration": 2.509056568145752 + }, + { + "auxiliary_loss_clip": 0.01132854, + "auxiliary_loss_mlp": 0.01029786, + "balance_loss_clip": 1.05383837, + "balance_loss_mlp": 1.01638126, + "epoch": 0.4417856606042387, + "flos": 17887895078400.0, + "grad_norm": 1.8123912762728525, + "language_loss": 0.83310902, + "learning_rate": 2.4697399491288263e-06, + "loss": 0.85473549, + "num_input_tokens_seen": 157568285, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.13391113, + "step": 7348, + "time_per_iteration": 2.4861111640930176 + }, + { + "auxiliary_loss_clip": 0.01132522, + "auxiliary_loss_mlp": 0.01036297, + "balance_loss_clip": 1.04916644, + "balance_loss_mlp": 1.021837, + "epoch": 0.44184578385690665, + "flos": 27964034282880.0, + "grad_norm": 1.7162835846716347, + "language_loss": 0.70435011, + "learning_rate": 2.469361373033938e-06, + "loss": 0.72603834, + "num_input_tokens_seen": 157590405, + "router_z_loss_clip": 0.83300781, + "router_z_loss_mlp": 0.14477539, + "step": 7349, + "time_per_iteration": 3.9772799015045166 + }, + { + "auxiliary_loss_clip": 0.0113807, + "auxiliary_loss_mlp": 0.01034054, + "balance_loss_clip": 1.0553174, + "balance_loss_mlp": 1.02014828, + "epoch": 0.4419059071095746, + "flos": 23367899664000.0, + "grad_norm": 1.6661367059401821, + "language_loss": 0.74108553, + "learning_rate": 2.468982779140819e-06, + "loss": 0.76280677, + "num_input_tokens_seen": 157607420, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.13909912, + "step": 7350, + "time_per_iteration": 2.486344575881958 + }, + { + "auxiliary_loss_clip": 0.01129197, + "auxiliary_loss_mlp": 0.01032164, + "balance_loss_clip": 1.04873025, + "balance_loss_mlp": 1.01917005, + "epoch": 0.4419660303622426, + "flos": 15012169246080.0, + "grad_norm": 2.016871246892374, + "language_loss": 0.80318236, + "learning_rate": 2.468604167463827e-06, + "loss": 0.82479596, + "num_input_tokens_seen": 157624990, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.13000488, + "step": 7351, + "time_per_iteration": 2.4587390422821045 + }, + { + "auxiliary_loss_clip": 0.01134835, + "auxiliary_loss_mlp": 0.01034244, + "balance_loss_clip": 1.05799937, + "balance_loss_mlp": 1.02263868, + "epoch": 0.44202615361491054, + "flos": 25371750672000.0, + "grad_norm": 2.3218247987488523, + "language_loss": 0.73262954, + "learning_rate": 2.4682255380173176e-06, + "loss": 0.75432038, + "num_input_tokens_seen": 157645300, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.11608887, + "step": 7352, + "time_per_iteration": 2.5036730766296387 + }, + { + "auxiliary_loss_clip": 0.01130761, + "auxiliary_loss_mlp": 0.01027871, + "balance_loss_clip": 1.0515784, + "balance_loss_mlp": 1.01497841, + "epoch": 0.4420862768675785, + "flos": 24681116897280.0, + "grad_norm": 2.0282176912182472, + "language_loss": 0.87198901, + "learning_rate": 2.467846890815649e-06, + "loss": 0.89357531, + "num_input_tokens_seen": 157664060, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.12902832, + "step": 7353, + "time_per_iteration": 2.5049290657043457 + }, + { + "auxiliary_loss_clip": 0.01142682, + "auxiliary_loss_mlp": 0.01037803, + "balance_loss_clip": 1.05936718, + "balance_loss_mlp": 1.02507734, + "epoch": 0.44214640012024653, + "flos": 19528437974400.0, + "grad_norm": 2.0609296052659745, + "language_loss": 0.75901479, + "learning_rate": 2.4674682258731795e-06, + "loss": 0.78081965, + "num_input_tokens_seen": 157680905, + "router_z_loss_clip": 0.83300781, + "router_z_loss_mlp": 0.12744141, + "step": 7354, + "time_per_iteration": 2.437124490737915 + }, + { + "auxiliary_loss_clip": 0.01128142, + "auxiliary_loss_mlp": 0.01034566, + "balance_loss_clip": 1.05088115, + "balance_loss_mlp": 1.0224719, + "epoch": 0.4422065233729145, + "flos": 47557434003840.0, + "grad_norm": 1.8303497853892798, + "language_loss": 0.64961571, + "learning_rate": 2.467089543204268e-06, + "loss": 0.67124283, + "num_input_tokens_seen": 157701980, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.12109375, + "step": 7355, + "time_per_iteration": 2.729340076446533 + }, + { + "auxiliary_loss_clip": 0.01137327, + "auxiliary_loss_mlp": 0.0103682, + "balance_loss_clip": 1.05347061, + "balance_loss_mlp": 1.02309918, + "epoch": 0.44226664662558246, + "flos": 19281050029440.0, + "grad_norm": 1.9103417218923349, + "language_loss": 0.78231674, + "learning_rate": 2.466710842823274e-06, + "loss": 0.80405819, + "num_input_tokens_seen": 157720555, + "router_z_loss_clip": 0.83886719, + "router_z_loss_mlp": 0.137146, + "step": 7356, + "time_per_iteration": 2.4793617725372314 + }, + { + "auxiliary_loss_clip": 0.01138387, + "auxiliary_loss_mlp": 0.01039608, + "balance_loss_clip": 1.05498922, + "balance_loss_mlp": 1.02637613, + "epoch": 0.4423267698782504, + "flos": 17821820010240.0, + "grad_norm": 1.6087825005363339, + "language_loss": 0.77142012, + "learning_rate": 2.4663321247445577e-06, + "loss": 0.79320002, + "num_input_tokens_seen": 157739160, + "router_z_loss_clip": 0.83447266, + "router_z_loss_mlp": 0.13226318, + "step": 7357, + "time_per_iteration": 2.4551260471343994 + }, + { + "auxiliary_loss_clip": 0.01138547, + "auxiliary_loss_mlp": 0.01036818, + "balance_loss_clip": 1.05890656, + "balance_loss_mlp": 1.02316213, + "epoch": 0.4423868931309184, + "flos": 29204424691200.0, + "grad_norm": 1.5395612797454694, + "language_loss": 0.73171848, + "learning_rate": 2.465953388982481e-06, + "loss": 0.75347221, + "num_input_tokens_seen": 157760020, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.13641357, + "step": 7358, + "time_per_iteration": 2.60562801361084 + }, + { + "auxiliary_loss_clip": 0.01133786, + "auxiliary_loss_mlp": 0.01033222, + "balance_loss_clip": 1.05270255, + "balance_loss_mlp": 1.02068698, + "epoch": 0.44244701638358636, + "flos": 29713135057920.0, + "grad_norm": 1.7090897312159148, + "language_loss": 0.75704068, + "learning_rate": 2.465574635551405e-06, + "loss": 0.77871072, + "num_input_tokens_seen": 157780435, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.12530518, + "step": 7359, + "time_per_iteration": 2.5445139408111572 + }, + { + "auxiliary_loss_clip": 0.0113721, + "auxiliary_loss_mlp": 0.0103316, + "balance_loss_clip": 1.05615377, + "balance_loss_mlp": 1.01991618, + "epoch": 0.4425071396362543, + "flos": 22930040874240.0, + "grad_norm": 1.6773215366499858, + "language_loss": 0.69815171, + "learning_rate": 2.4651958644656923e-06, + "loss": 0.71985531, + "num_input_tokens_seen": 157799420, + "router_z_loss_clip": 0.80957031, + "router_z_loss_mlp": 0.13238525, + "step": 7360, + "time_per_iteration": 2.447699785232544 + }, + { + "auxiliary_loss_clip": 0.01137189, + "auxiliary_loss_mlp": 0.01033725, + "balance_loss_clip": 1.0539515, + "balance_loss_mlp": 1.02054644, + "epoch": 0.4425672628889223, + "flos": 19792346175360.0, + "grad_norm": 2.4649759863533633, + "language_loss": 0.69344217, + "learning_rate": 2.4648170757397053e-06, + "loss": 0.71515131, + "num_input_tokens_seen": 157817025, + "router_z_loss_clip": 0.83300781, + "router_z_loss_mlp": 0.13171387, + "step": 7361, + "time_per_iteration": 2.487030267715454 + }, + { + "auxiliary_loss_clip": 0.0113383, + "auxiliary_loss_mlp": 0.01037717, + "balance_loss_clip": 1.05269837, + "balance_loss_mlp": 1.02366805, + "epoch": 0.44262738614159025, + "flos": 13662215377920.0, + "grad_norm": 2.03299623075719, + "language_loss": 0.82677364, + "learning_rate": 2.464438269387809e-06, + "loss": 0.84848905, + "num_input_tokens_seen": 157834345, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.14050293, + "step": 7362, + "time_per_iteration": 2.4325809478759766 + }, + { + "auxiliary_loss_clip": 0.01138118, + "auxiliary_loss_mlp": 0.01038476, + "balance_loss_clip": 1.0542357, + "balance_loss_mlp": 1.0245409, + "epoch": 0.4426875093942582, + "flos": 14210212245120.0, + "grad_norm": 1.776326550259957, + "language_loss": 0.7451427, + "learning_rate": 2.464059445424366e-06, + "loss": 0.76690871, + "num_input_tokens_seen": 157852290, + "router_z_loss_clip": 0.83886719, + "router_z_loss_mlp": 0.1394043, + "step": 7363, + "time_per_iteration": 3.87446665763855 + }, + { + "auxiliary_loss_clip": 0.0105501, + "auxiliary_loss_mlp": 0.01011358, + "balance_loss_clip": 1.02666426, + "balance_loss_mlp": 1.00947487, + "epoch": 0.4427476326469262, + "flos": 70117525728000.0, + "grad_norm": 0.6813331280013021, + "language_loss": 0.55698991, + "learning_rate": 2.463680603863743e-06, + "loss": 0.57765359, + "num_input_tokens_seen": 157923060, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.01882935, + "step": 7364, + "time_per_iteration": 3.164555788040161 + }, + { + "auxiliary_loss_clip": 0.01127749, + "auxiliary_loss_mlp": 0.01033636, + "balance_loss_clip": 1.04868484, + "balance_loss_mlp": 1.02107716, + "epoch": 0.44280775589959415, + "flos": 25445080287360.0, + "grad_norm": 1.6071900894705613, + "language_loss": 0.74310857, + "learning_rate": 2.463301744720305e-06, + "loss": 0.76472235, + "num_input_tokens_seen": 157944110, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.12548828, + "step": 7365, + "time_per_iteration": 2.4703876972198486 + }, + { + "auxiliary_loss_clip": 0.01128241, + "auxiliary_loss_mlp": 0.01041036, + "balance_loss_clip": 1.04717767, + "balance_loss_mlp": 1.02652872, + "epoch": 0.4428678791522621, + "flos": 22857214049280.0, + "grad_norm": 1.5484644847831703, + "language_loss": 0.74322701, + "learning_rate": 2.4629228680084184e-06, + "loss": 0.76491982, + "num_input_tokens_seen": 157964295, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.14508057, + "step": 7366, + "time_per_iteration": 2.4540510177612305 + }, + { + "auxiliary_loss_clip": 0.01136766, + "auxiliary_loss_mlp": 0.01037109, + "balance_loss_clip": 1.05530787, + "balance_loss_mlp": 1.02272677, + "epoch": 0.44292800240493013, + "flos": 25812446636160.0, + "grad_norm": 2.9151733503204302, + "language_loss": 0.73457664, + "learning_rate": 2.46254397374245e-06, + "loss": 0.75631535, + "num_input_tokens_seen": 157983970, + "router_z_loss_clip": 0.81494141, + "router_z_loss_mlp": 0.1439209, + "step": 7367, + "time_per_iteration": 2.6444220542907715 + }, + { + "auxiliary_loss_clip": 0.01132168, + "auxiliary_loss_mlp": 0.01034546, + "balance_loss_clip": 1.05052996, + "balance_loss_mlp": 1.02138567, + "epoch": 0.4429881256575981, + "flos": 32416885549440.0, + "grad_norm": 1.3519484266972925, + "language_loss": 0.7381438, + "learning_rate": 2.4621650619367677e-06, + "loss": 0.75981092, + "num_input_tokens_seen": 158006515, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.13165283, + "step": 7368, + "time_per_iteration": 2.650954484939575 + }, + { + "auxiliary_loss_clip": 0.01131362, + "auxiliary_loss_mlp": 0.01036284, + "balance_loss_clip": 1.05053282, + "balance_loss_mlp": 1.02326643, + "epoch": 0.44304824891026606, + "flos": 22163707186560.0, + "grad_norm": 1.7744231098490115, + "language_loss": 0.79958445, + "learning_rate": 2.4617861326057403e-06, + "loss": 0.82126093, + "num_input_tokens_seen": 158025565, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.13018799, + "step": 7369, + "time_per_iteration": 2.5071842670440674 + }, + { + "auxiliary_loss_clip": 0.0113462, + "auxiliary_loss_mlp": 0.01040253, + "balance_loss_clip": 1.05204737, + "balance_loss_mlp": 1.02710402, + "epoch": 0.443108372162934, + "flos": 25338569483520.0, + "grad_norm": 1.902290985136419, + "language_loss": 0.71983922, + "learning_rate": 2.461407185763737e-06, + "loss": 0.74158794, + "num_input_tokens_seen": 158045620, + "router_z_loss_clip": 0.82421875, + "router_z_loss_mlp": 0.13146973, + "step": 7370, + "time_per_iteration": 2.5708868503570557 + }, + { + "auxiliary_loss_clip": 0.01140811, + "auxiliary_loss_mlp": 0.0103263, + "balance_loss_clip": 1.05858374, + "balance_loss_mlp": 1.01962435, + "epoch": 0.443168495415602, + "flos": 23330947547520.0, + "grad_norm": 1.7651216048347782, + "language_loss": 0.70425975, + "learning_rate": 2.461028221425126e-06, + "loss": 0.72599423, + "num_input_tokens_seen": 158063505, + "router_z_loss_clip": 0.82275391, + "router_z_loss_mlp": 0.13024902, + "step": 7371, + "time_per_iteration": 2.563276529312134 + }, + { + "auxiliary_loss_clip": 0.01134774, + "auxiliary_loss_mlp": 0.01026067, + "balance_loss_clip": 1.05616403, + "balance_loss_mlp": 1.0142951, + "epoch": 0.44322861866826996, + "flos": 21871502046720.0, + "grad_norm": 2.170460327542224, + "language_loss": 0.68308431, + "learning_rate": 2.4606492396042786e-06, + "loss": 0.70469272, + "num_input_tokens_seen": 158080335, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.11773682, + "step": 7372, + "time_per_iteration": 2.499865770339966 + }, + { + "auxiliary_loss_clip": 0.01129854, + "auxiliary_loss_mlp": 0.01033193, + "balance_loss_clip": 1.04968071, + "balance_loss_mlp": 1.01904941, + "epoch": 0.4432887419209379, + "flos": 20084407660800.0, + "grad_norm": 3.0142159990349318, + "language_loss": 0.83571535, + "learning_rate": 2.4602702403155664e-06, + "loss": 0.85734582, + "num_input_tokens_seen": 158098955, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.14135742, + "step": 7373, + "time_per_iteration": 2.482652187347412 + }, + { + "auxiliary_loss_clip": 0.01080244, + "auxiliary_loss_mlp": 0.01008147, + "balance_loss_clip": 1.05127954, + "balance_loss_mlp": 1.00652599, + "epoch": 0.4433488651736059, + "flos": 70035540935040.0, + "grad_norm": 0.7553778058110396, + "language_loss": 0.55281585, + "learning_rate": 2.4598912235733604e-06, + "loss": 0.57369977, + "num_input_tokens_seen": 158164110, + "router_z_loss_clip": 0.28955078, + "router_z_loss_mlp": 0.01620483, + "step": 7374, + "time_per_iteration": 3.133538246154785 + }, + { + "auxiliary_loss_clip": 0.01132542, + "auxiliary_loss_mlp": 0.01038404, + "balance_loss_clip": 1.05377102, + "balance_loss_mlp": 1.02441466, + "epoch": 0.44340898842627385, + "flos": 16282472705280.0, + "grad_norm": 2.774969067481993, + "language_loss": 0.83093858, + "learning_rate": 2.4595121893920327e-06, + "loss": 0.85264802, + "num_input_tokens_seen": 158179850, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.13995361, + "step": 7375, + "time_per_iteration": 3.846578359603882 + }, + { + "auxiliary_loss_clip": 0.01139303, + "auxiliary_loss_mlp": 0.01031724, + "balance_loss_clip": 1.0577786, + "balance_loss_mlp": 1.01803279, + "epoch": 0.4434691116789418, + "flos": 16611989097600.0, + "grad_norm": 2.1127587463361706, + "language_loss": 0.84186077, + "learning_rate": 2.4591331377859578e-06, + "loss": 0.86357105, + "num_input_tokens_seen": 158196590, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.13690186, + "step": 7376, + "time_per_iteration": 2.4567465782165527 + }, + { + "auxiliary_loss_clip": 0.01132542, + "auxiliary_loss_mlp": 0.01033477, + "balance_loss_clip": 1.05332041, + "balance_loss_mlp": 1.02029884, + "epoch": 0.4435292349316098, + "flos": 19063251912960.0, + "grad_norm": 1.7939494398801272, + "language_loss": 0.77009261, + "learning_rate": 2.4587540687695077e-06, + "loss": 0.79175282, + "num_input_tokens_seen": 158216355, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.13171387, + "step": 7377, + "time_per_iteration": 2.44522762298584 + }, + { + "auxiliary_loss_clip": 0.01132985, + "auxiliary_loss_mlp": 0.01043549, + "balance_loss_clip": 1.05244184, + "balance_loss_mlp": 1.02955973, + "epoch": 0.44358935818427775, + "flos": 21251324799360.0, + "grad_norm": 2.573632716743667, + "language_loss": 0.76231861, + "learning_rate": 2.458374982357057e-06, + "loss": 0.78408396, + "num_input_tokens_seen": 158235825, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.14001465, + "step": 7378, + "time_per_iteration": 2.4639270305633545 + }, + { + "auxiliary_loss_clip": 0.01142879, + "auxiliary_loss_mlp": 0.01035539, + "balance_loss_clip": 1.06070697, + "balance_loss_mlp": 1.02275932, + "epoch": 0.4436494814369457, + "flos": 12495298239360.0, + "grad_norm": 2.0262886676501415, + "language_loss": 0.6888752, + "learning_rate": 2.457995878562982e-06, + "loss": 0.71065938, + "num_input_tokens_seen": 158254230, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.12792969, + "step": 7379, + "time_per_iteration": 2.4858484268188477 + }, + { + "auxiliary_loss_clip": 0.01128684, + "auxiliary_loss_mlp": 0.01033002, + "balance_loss_clip": 1.0495218, + "balance_loss_mlp": 1.01965022, + "epoch": 0.44370960468961373, + "flos": 23659853408640.0, + "grad_norm": 1.7781300328030034, + "language_loss": 0.73384142, + "learning_rate": 2.457616757401656e-06, + "loss": 0.75545824, + "num_input_tokens_seen": 158273400, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.13336182, + "step": 7380, + "time_per_iteration": 2.473743200302124 + }, + { + "auxiliary_loss_clip": 0.01132059, + "auxiliary_loss_mlp": 0.01029065, + "balance_loss_clip": 1.05225539, + "balance_loss_mlp": 1.01556444, + "epoch": 0.4437697279422817, + "flos": 32416849635840.0, + "grad_norm": 1.6636290557680526, + "language_loss": 0.64772391, + "learning_rate": 2.457237618887458e-06, + "loss": 0.66933513, + "num_input_tokens_seen": 158296840, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.1350708, + "step": 7381, + "time_per_iteration": 2.52935528755188 + }, + { + "auxiliary_loss_clip": 0.01128928, + "auxiliary_loss_mlp": 0.01034403, + "balance_loss_clip": 1.0483253, + "balance_loss_mlp": 1.0206883, + "epoch": 0.44382985119494966, + "flos": 18112875914880.0, + "grad_norm": 2.073463461449115, + "language_loss": 0.80085313, + "learning_rate": 2.456858463034763e-06, + "loss": 0.82248646, + "num_input_tokens_seen": 158314935, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.13708496, + "step": 7382, + "time_per_iteration": 2.452949285507202 + }, + { + "auxiliary_loss_clip": 0.01129246, + "auxiliary_loss_mlp": 0.01041666, + "balance_loss_clip": 1.04906797, + "balance_loss_mlp": 1.02866638, + "epoch": 0.44388997444761763, + "flos": 30774151923840.0, + "grad_norm": 4.88651997777665, + "language_loss": 0.65795046, + "learning_rate": 2.456479289857949e-06, + "loss": 0.67965961, + "num_input_tokens_seen": 158334620, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.13000488, + "step": 7383, + "time_per_iteration": 2.6022024154663086 + }, + { + "auxiliary_loss_clip": 0.01137682, + "auxiliary_loss_mlp": 0.0103564, + "balance_loss_clip": 1.0551126, + "balance_loss_mlp": 1.02105474, + "epoch": 0.4439500977002856, + "flos": 20339157893760.0, + "grad_norm": 3.0670390660861613, + "language_loss": 0.7567327, + "learning_rate": 2.4561000993713953e-06, + "loss": 0.77846599, + "num_input_tokens_seen": 158350550, + "router_z_loss_clip": 0.82470703, + "router_z_loss_mlp": 0.14575195, + "step": 7384, + "time_per_iteration": 2.537893295288086 + }, + { + "auxiliary_loss_clip": 0.0113584, + "auxiliary_loss_mlp": 0.01035501, + "balance_loss_clip": 1.05282879, + "balance_loss_mlp": 1.02176833, + "epoch": 0.44401022095295356, + "flos": 20371225760640.0, + "grad_norm": 1.5934120866819221, + "language_loss": 0.81191659, + "learning_rate": 2.4557208915894796e-06, + "loss": 0.83362997, + "num_input_tokens_seen": 158369555, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.13739014, + "step": 7385, + "time_per_iteration": 3.8626511096954346 + }, + { + "auxiliary_loss_clip": 0.01131825, + "auxiliary_loss_mlp": 0.0104227, + "balance_loss_clip": 1.04886818, + "balance_loss_mlp": 1.0268445, + "epoch": 0.4440703442056215, + "flos": 20230635928320.0, + "grad_norm": 1.9302870975718023, + "language_loss": 0.82117879, + "learning_rate": 2.455341666526582e-06, + "loss": 0.84291977, + "num_input_tokens_seen": 158388045, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.1541748, + "step": 7386, + "time_per_iteration": 2.4308581352233887 + }, + { + "auxiliary_loss_clip": 0.01138989, + "auxiliary_loss_mlp": 0.01041585, + "balance_loss_clip": 1.05097079, + "balance_loss_mlp": 1.02562332, + "epoch": 0.4441304674582895, + "flos": 39494698824960.0, + "grad_norm": 1.874226083862065, + "language_loss": 0.7021836, + "learning_rate": 2.4549624241970832e-06, + "loss": 0.72398931, + "num_input_tokens_seen": 158410115, + "router_z_loss_clip": 0.88183594, + "router_z_loss_mlp": 0.1595459, + "step": 7387, + "time_per_iteration": 2.659616708755493 + }, + { + "auxiliary_loss_clip": 0.0113513, + "auxiliary_loss_mlp": 0.01041389, + "balance_loss_clip": 1.05372548, + "balance_loss_mlp": 1.0269767, + "epoch": 0.44419059071095746, + "flos": 14829671220480.0, + "grad_norm": 7.403450406245205, + "language_loss": 0.71747267, + "learning_rate": 2.4545831646153628e-06, + "loss": 0.7392379, + "num_input_tokens_seen": 158427765, + "router_z_loss_clip": 0.81494141, + "router_z_loss_mlp": 0.144104, + "step": 7388, + "time_per_iteration": 2.4200093746185303 + }, + { + "auxiliary_loss_clip": 0.01135863, + "auxiliary_loss_mlp": 0.01031986, + "balance_loss_clip": 1.05137694, + "balance_loss_mlp": 1.01745439, + "epoch": 0.4442507139636254, + "flos": 22637835734400.0, + "grad_norm": 1.673565225856514, + "language_loss": 0.69590503, + "learning_rate": 2.4542038877958044e-06, + "loss": 0.71758354, + "num_input_tokens_seen": 158446375, + "router_z_loss_clip": 0.84423828, + "router_z_loss_mlp": 0.14526367, + "step": 7389, + "time_per_iteration": 2.46273136138916 + }, + { + "auxiliary_loss_clip": 0.01132323, + "auxiliary_loss_mlp": 0.01033425, + "balance_loss_clip": 1.05148625, + "balance_loss_mlp": 1.01982927, + "epoch": 0.4443108372162934, + "flos": 38290721829120.0, + "grad_norm": 1.806312473933998, + "language_loss": 0.7505244, + "learning_rate": 2.453824593752788e-06, + "loss": 0.77218187, + "num_input_tokens_seen": 158467260, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.13592529, + "step": 7390, + "time_per_iteration": 2.604700803756714 + }, + { + "auxiliary_loss_clip": 0.01141012, + "auxiliary_loss_mlp": 0.01029852, + "balance_loss_clip": 1.0620302, + "balance_loss_mlp": 1.01658988, + "epoch": 0.44437096046896135, + "flos": 17748993185280.0, + "grad_norm": 2.1660942404928907, + "language_loss": 0.81638193, + "learning_rate": 2.4534452825006988e-06, + "loss": 0.83809054, + "num_input_tokens_seen": 158486720, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.13287354, + "step": 7391, + "time_per_iteration": 2.506931781768799 + }, + { + "auxiliary_loss_clip": 0.01133685, + "auxiliary_loss_mlp": 0.01038638, + "balance_loss_clip": 1.05329216, + "balance_loss_mlp": 1.02435684, + "epoch": 0.4444310837216293, + "flos": 13732348682880.0, + "grad_norm": 1.7355097970751923, + "language_loss": 0.73724103, + "learning_rate": 2.4530659540539185e-06, + "loss": 0.7589643, + "num_input_tokens_seen": 158502530, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.14282227, + "step": 7392, + "time_per_iteration": 2.4530773162841797 + }, + { + "auxiliary_loss_clip": 0.01130144, + "auxiliary_loss_mlp": 0.01034792, + "balance_loss_clip": 1.05038178, + "balance_loss_mlp": 1.02199519, + "epoch": 0.44449120697429734, + "flos": 25010238240000.0, + "grad_norm": 2.7457961143723955, + "language_loss": 0.79611737, + "learning_rate": 2.4526866084268313e-06, + "loss": 0.81776673, + "num_input_tokens_seen": 158522715, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.12799072, + "step": 7393, + "time_per_iteration": 3.9085395336151123 + }, + { + "auxiliary_loss_clip": 0.01140868, + "auxiliary_loss_mlp": 0.0103614, + "balance_loss_clip": 1.0574255, + "balance_loss_mlp": 1.02163219, + "epoch": 0.4445513302269653, + "flos": 32671707609600.0, + "grad_norm": 2.435967452038812, + "language_loss": 0.80942035, + "learning_rate": 2.4523072456338226e-06, + "loss": 0.83119047, + "num_input_tokens_seen": 158543615, + "router_z_loss_clip": 0.83349609, + "router_z_loss_mlp": 0.14526367, + "step": 7394, + "time_per_iteration": 2.5636179447174072 + }, + { + "auxiliary_loss_clip": 0.01132005, + "auxiliary_loss_mlp": 0.01040384, + "balance_loss_clip": 1.05139315, + "balance_loss_mlp": 1.02713346, + "epoch": 0.44461145347963327, + "flos": 11655814504320.0, + "grad_norm": 2.744453227331378, + "language_loss": 0.79948843, + "learning_rate": 2.4519278656892785e-06, + "loss": 0.82121229, + "num_input_tokens_seen": 158560330, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.13226318, + "step": 7395, + "time_per_iteration": 2.4610769748687744 + }, + { + "auxiliary_loss_clip": 0.01138971, + "auxiliary_loss_mlp": 0.01038678, + "balance_loss_clip": 1.05603719, + "balance_loss_mlp": 1.02501023, + "epoch": 0.44467157673230123, + "flos": 20886759711360.0, + "grad_norm": 2.0066236580696564, + "language_loss": 0.68747777, + "learning_rate": 2.451548468607584e-06, + "loss": 0.70925421, + "num_input_tokens_seen": 158579735, + "router_z_loss_clip": 0.83056641, + "router_z_loss_mlp": 0.13671875, + "step": 7396, + "time_per_iteration": 2.5030252933502197 + }, + { + "auxiliary_loss_clip": 0.01137407, + "auxiliary_loss_mlp": 0.01033776, + "balance_loss_clip": 1.05538678, + "balance_loss_mlp": 1.01994205, + "epoch": 0.4447316999849692, + "flos": 18546137763840.0, + "grad_norm": 1.7872986691097597, + "language_loss": 0.81019348, + "learning_rate": 2.451169054403126e-06, + "loss": 0.83190525, + "num_input_tokens_seen": 158597075, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.13842773, + "step": 7397, + "time_per_iteration": 2.6027047634124756 + }, + { + "auxiliary_loss_clip": 0.01143396, + "auxiliary_loss_mlp": 0.01035943, + "balance_loss_clip": 1.06330562, + "balance_loss_mlp": 1.02235866, + "epoch": 0.44479182323763716, + "flos": 23769057732480.0, + "grad_norm": 1.5426371315395466, + "language_loss": 0.67624104, + "learning_rate": 2.450789623090293e-06, + "loss": 0.69803447, + "num_input_tokens_seen": 158616650, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.13592529, + "step": 7398, + "time_per_iteration": 2.532482862472534 + }, + { + "auxiliary_loss_clip": 0.01133315, + "auxiliary_loss_mlp": 0.01034991, + "balance_loss_clip": 1.05403686, + "balance_loss_mlp": 1.02200317, + "epoch": 0.44485194649030513, + "flos": 16543831040640.0, + "grad_norm": 1.784387410199509, + "language_loss": 0.69694501, + "learning_rate": 2.450410174683472e-06, + "loss": 0.71862811, + "num_input_tokens_seen": 158634515, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.12988281, + "step": 7399, + "time_per_iteration": 2.4069857597351074 + }, + { + "auxiliary_loss_clip": 0.01143012, + "auxiliary_loss_mlp": 0.01035576, + "balance_loss_clip": 1.06342757, + "balance_loss_mlp": 1.02221274, + "epoch": 0.4449120697429731, + "flos": 22600955445120.0, + "grad_norm": 1.8631878837640916, + "language_loss": 0.72373974, + "learning_rate": 2.4500307091970514e-06, + "loss": 0.7455256, + "num_input_tokens_seen": 158653760, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.13366699, + "step": 7400, + "time_per_iteration": 2.4574930667877197 + }, + { + "auxiliary_loss_clip": 0.01133552, + "auxiliary_loss_mlp": 0.01033427, + "balance_loss_clip": 1.0523603, + "balance_loss_mlp": 1.01939654, + "epoch": 0.44497219299564106, + "flos": 20004864992640.0, + "grad_norm": 1.8765962467343988, + "language_loss": 0.85142195, + "learning_rate": 2.449651226645422e-06, + "loss": 0.87309182, + "num_input_tokens_seen": 158672190, + "router_z_loss_clip": 0.81201172, + "router_z_loss_mlp": 0.14050293, + "step": 7401, + "time_per_iteration": 2.4790189266204834 + }, + { + "auxiliary_loss_clip": 0.01132602, + "auxiliary_loss_mlp": 0.01041895, + "balance_loss_clip": 1.05445874, + "balance_loss_mlp": 1.02698183, + "epoch": 0.445032316248309, + "flos": 25594253470080.0, + "grad_norm": 2.207256611365788, + "language_loss": 0.83607328, + "learning_rate": 2.449271727042973e-06, + "loss": 0.85781825, + "num_input_tokens_seen": 158694115, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.14916992, + "step": 7402, + "time_per_iteration": 2.529423952102661 + }, + { + "auxiliary_loss_clip": 0.01145365, + "auxiliary_loss_mlp": 0.01036091, + "balance_loss_clip": 1.06319237, + "balance_loss_mlp": 1.02245378, + "epoch": 0.445092439500977, + "flos": 21250426959360.0, + "grad_norm": 1.9363382602794323, + "language_loss": 0.76615191, + "learning_rate": 2.4488922104040947e-06, + "loss": 0.78796643, + "num_input_tokens_seen": 158711000, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.13641357, + "step": 7403, + "time_per_iteration": 2.4660420417785645 + }, + { + "auxiliary_loss_clip": 0.01063779, + "auxiliary_loss_mlp": 0.01006398, + "balance_loss_clip": 1.03553009, + "balance_loss_mlp": 1.00447011, + "epoch": 0.44515256275364495, + "flos": 57764900309760.0, + "grad_norm": 0.7439083554464205, + "language_loss": 0.59963781, + "learning_rate": 2.4485126767431793e-06, + "loss": 0.62033951, + "num_input_tokens_seen": 158769675, + "router_z_loss_clip": 0.28271484, + "router_z_loss_mlp": 0.01927185, + "step": 7404, + "time_per_iteration": 3.036511182785034 + }, + { + "auxiliary_loss_clip": 0.01141006, + "auxiliary_loss_mlp": 0.01058539, + "balance_loss_clip": 1.05483258, + "balance_loss_mlp": 1.04105067, + "epoch": 0.4452126860063129, + "flos": 15596004908160.0, + "grad_norm": 1.7574059014017118, + "language_loss": 0.82378495, + "learning_rate": 2.4481331260746177e-06, + "loss": 0.84578037, + "num_input_tokens_seen": 158788215, + "router_z_loss_clip": 0.86230469, + "router_z_loss_mlp": 0.17474365, + "step": 7405, + "time_per_iteration": 2.436415433883667 + }, + { + "auxiliary_loss_clip": 0.0113552, + "auxiliary_loss_mlp": 0.01031434, + "balance_loss_clip": 1.05504966, + "balance_loss_mlp": 1.01776099, + "epoch": 0.4452728092589809, + "flos": 21617398258560.0, + "grad_norm": 2.110787858667843, + "language_loss": 0.7507695, + "learning_rate": 2.4477535584128036e-06, + "loss": 0.77243906, + "num_input_tokens_seen": 158809090, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.13684082, + "step": 7406, + "time_per_iteration": 2.5071403980255127 + }, + { + "auxiliary_loss_clip": 0.011299, + "auxiliary_loss_mlp": 0.01029666, + "balance_loss_clip": 1.05282068, + "balance_loss_mlp": 1.01675582, + "epoch": 0.4453329325116489, + "flos": 29497491757440.0, + "grad_norm": 1.5383472219523764, + "language_loss": 0.65290582, + "learning_rate": 2.447373973772129e-06, + "loss": 0.67450148, + "num_input_tokens_seen": 158828320, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.12896729, + "step": 7407, + "time_per_iteration": 3.9250681400299072 + }, + { + "auxiliary_loss_clip": 0.01140078, + "auxiliary_loss_mlp": 0.01035958, + "balance_loss_clip": 1.05732751, + "balance_loss_mlp": 1.0221591, + "epoch": 0.44539305576431687, + "flos": 21361139654400.0, + "grad_norm": 1.5893773669604385, + "language_loss": 0.67982495, + "learning_rate": 2.4469943721669887e-06, + "loss": 0.70158529, + "num_input_tokens_seen": 158847040, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.13787842, + "step": 7408, + "time_per_iteration": 2.4869208335876465 + }, + { + "auxiliary_loss_clip": 0.01138079, + "auxiliary_loss_mlp": 0.01039207, + "balance_loss_clip": 1.05657661, + "balance_loss_mlp": 1.02344704, + "epoch": 0.44545317901698483, + "flos": 41427626428800.0, + "grad_norm": 2.158302061802956, + "language_loss": 0.72032952, + "learning_rate": 2.4466147536117776e-06, + "loss": 0.74210238, + "num_input_tokens_seen": 158870490, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.1574707, + "step": 7409, + "time_per_iteration": 2.6803345680236816 + }, + { + "auxiliary_loss_clip": 0.01140191, + "auxiliary_loss_mlp": 0.0103722, + "balance_loss_clip": 1.05832875, + "balance_loss_mlp": 1.02217603, + "epoch": 0.4455133022696528, + "flos": 22055005653120.0, + "grad_norm": 1.8336501564215075, + "language_loss": 0.65305173, + "learning_rate": 2.4462351181208895e-06, + "loss": 0.67482585, + "num_input_tokens_seen": 158889920, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.15045166, + "step": 7410, + "time_per_iteration": 2.58776593208313 + }, + { + "auxiliary_loss_clip": 0.01147248, + "auxiliary_loss_mlp": 0.01038301, + "balance_loss_clip": 1.05757999, + "balance_loss_mlp": 1.02314901, + "epoch": 0.44557342552232077, + "flos": 23476960333440.0, + "grad_norm": 2.2837527125517587, + "language_loss": 0.73440397, + "learning_rate": 2.4458554657087217e-06, + "loss": 0.75625944, + "num_input_tokens_seen": 158909580, + "router_z_loss_clip": 0.89746094, + "router_z_loss_mlp": 0.15148926, + "step": 7411, + "time_per_iteration": 2.4819228649139404 + }, + { + "auxiliary_loss_clip": 0.01138895, + "auxiliary_loss_mlp": 0.01033882, + "balance_loss_clip": 1.05643415, + "balance_loss_mlp": 1.02035189, + "epoch": 0.44563354877498873, + "flos": 19134678107520.0, + "grad_norm": 1.9465964911261848, + "language_loss": 0.78903592, + "learning_rate": 2.4454757963896695e-06, + "loss": 0.81076366, + "num_input_tokens_seen": 158924600, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.13531494, + "step": 7412, + "time_per_iteration": 2.4374399185180664 + }, + { + "auxiliary_loss_clip": 0.01134879, + "auxiliary_loss_mlp": 0.01037668, + "balance_loss_clip": 1.05108118, + "balance_loss_mlp": 1.02425122, + "epoch": 0.4456936720276567, + "flos": 13621420506240.0, + "grad_norm": 2.992547473298896, + "language_loss": 0.80075866, + "learning_rate": 2.4450961101781304e-06, + "loss": 0.82248414, + "num_input_tokens_seen": 158939345, + "router_z_loss_clip": 0.83837891, + "router_z_loss_mlp": 0.13397217, + "step": 7413, + "time_per_iteration": 2.410862684249878 + }, + { + "auxiliary_loss_clip": 0.01133807, + "auxiliary_loss_mlp": 0.01032817, + "balance_loss_clip": 1.05226386, + "balance_loss_mlp": 1.01936412, + "epoch": 0.44575379528032466, + "flos": 14713715139840.0, + "grad_norm": 2.0702651665568563, + "language_loss": 0.7649399, + "learning_rate": 2.4447164070885026e-06, + "loss": 0.78660607, + "num_input_tokens_seen": 158955855, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.13458252, + "step": 7414, + "time_per_iteration": 2.4460835456848145 + }, + { + "auxiliary_loss_clip": 0.01136817, + "auxiliary_loss_mlp": 0.0103408, + "balance_loss_clip": 1.05745018, + "balance_loss_mlp": 1.02061534, + "epoch": 0.4458139185329926, + "flos": 24170682677760.0, + "grad_norm": 1.6004815561088808, + "language_loss": 0.83241451, + "learning_rate": 2.4443366871351837e-06, + "loss": 0.85412347, + "num_input_tokens_seen": 158976315, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.13470459, + "step": 7415, + "time_per_iteration": 2.48704195022583 + }, + { + "auxiliary_loss_clip": 0.01132584, + "auxiliary_loss_mlp": 0.01051849, + "balance_loss_clip": 1.0493027, + "balance_loss_mlp": 1.03577995, + "epoch": 0.4458740417856606, + "flos": 21762225895680.0, + "grad_norm": 1.6673848996383323, + "language_loss": 0.84301239, + "learning_rate": 2.4439569503325732e-06, + "loss": 0.86485672, + "num_input_tokens_seen": 158996725, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.16064453, + "step": 7416, + "time_per_iteration": 2.5017948150634766 + }, + { + "auxiliary_loss_clip": 0.01133933, + "auxiliary_loss_mlp": 0.01034, + "balance_loss_clip": 1.05097294, + "balance_loss_mlp": 1.01970053, + "epoch": 0.44593416503832856, + "flos": 21068790860160.0, + "grad_norm": 1.4838337798258245, + "language_loss": 0.8104018, + "learning_rate": 2.4435771966950706e-06, + "loss": 0.8320812, + "num_input_tokens_seen": 159017255, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.14300537, + "step": 7417, + "time_per_iteration": 2.5930519104003906 + }, + { + "auxiliary_loss_clip": 0.01143528, + "auxiliary_loss_mlp": 0.01040568, + "balance_loss_clip": 1.05954313, + "balance_loss_mlp": 1.02673984, + "epoch": 0.4459942882909965, + "flos": 22600488568320.0, + "grad_norm": 2.1474057145548775, + "language_loss": 0.80965191, + "learning_rate": 2.443197426237077e-06, + "loss": 0.8314929, + "num_input_tokens_seen": 159035010, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.1383667, + "step": 7418, + "time_per_iteration": 2.4676623344421387 + }, + { + "auxiliary_loss_clip": 0.01137534, + "auxiliary_loss_mlp": 0.01030678, + "balance_loss_clip": 1.05345201, + "balance_loss_mlp": 1.01646876, + "epoch": 0.4460544115436645, + "flos": 26505486622080.0, + "grad_norm": 1.595016216353901, + "language_loss": 0.77363276, + "learning_rate": 2.442817638972991e-06, + "loss": 0.79531491, + "num_input_tokens_seen": 159055345, + "router_z_loss_clip": 0.84033203, + "router_z_loss_mlp": 0.14196777, + "step": 7419, + "time_per_iteration": 4.048392057418823 + }, + { + "auxiliary_loss_clip": 0.0113253, + "auxiliary_loss_mlp": 0.01036215, + "balance_loss_clip": 1.05118704, + "balance_loss_mlp": 1.02298903, + "epoch": 0.4461145347963325, + "flos": 17604021893760.0, + "grad_norm": 1.5790996355130276, + "language_loss": 0.72024059, + "learning_rate": 2.4424378349172176e-06, + "loss": 0.74192804, + "num_input_tokens_seen": 159074225, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.13226318, + "step": 7420, + "time_per_iteration": 2.516514301300049 + }, + { + "auxiliary_loss_clip": 0.01131226, + "auxiliary_loss_mlp": 0.01031911, + "balance_loss_clip": 1.05296373, + "balance_loss_mlp": 1.01721287, + "epoch": 0.44617465804900047, + "flos": 27268193036160.0, + "grad_norm": 1.582713059795025, + "language_loss": 0.75003415, + "learning_rate": 2.442058014084156e-06, + "loss": 0.77166545, + "num_input_tokens_seen": 159095415, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.14685059, + "step": 7421, + "time_per_iteration": 2.538041830062866 + }, + { + "auxiliary_loss_clip": 0.01138035, + "auxiliary_loss_mlp": 0.01032917, + "balance_loss_clip": 1.05946732, + "balance_loss_mlp": 1.01976216, + "epoch": 0.44623478130166844, + "flos": 17786412178560.0, + "grad_norm": 2.1336938878761376, + "language_loss": 0.76536143, + "learning_rate": 2.44167817648821e-06, + "loss": 0.78707087, + "num_input_tokens_seen": 159114615, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.13165283, + "step": 7422, + "time_per_iteration": 2.457906723022461 + }, + { + "auxiliary_loss_clip": 0.01138651, + "auxiliary_loss_mlp": 0.0103441, + "balance_loss_clip": 1.05681157, + "balance_loss_mlp": 1.02095151, + "epoch": 0.4462949045543364, + "flos": 23003011353600.0, + "grad_norm": 4.275951019564261, + "language_loss": 0.65577209, + "learning_rate": 2.441298322143784e-06, + "loss": 0.67750275, + "num_input_tokens_seen": 159134370, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.13446045, + "step": 7423, + "time_per_iteration": 2.630040407180786 + }, + { + "auxiliary_loss_clip": 0.01132956, + "auxiliary_loss_mlp": 0.01028921, + "balance_loss_clip": 1.05518007, + "balance_loss_mlp": 1.01663673, + "epoch": 0.44635502780700437, + "flos": 17820096157440.0, + "grad_norm": 1.4328383838568493, + "language_loss": 0.79360873, + "learning_rate": 2.4409184510652807e-06, + "loss": 0.81522751, + "num_input_tokens_seen": 159152540, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.1227417, + "step": 7424, + "time_per_iteration": 2.4714643955230713 + }, + { + "auxiliary_loss_clip": 0.01130816, + "auxiliary_loss_mlp": 0.01032136, + "balance_loss_clip": 1.05337334, + "balance_loss_mlp": 1.01939225, + "epoch": 0.44641515105967233, + "flos": 26688020561280.0, + "grad_norm": 1.428393695950734, + "language_loss": 0.80272472, + "learning_rate": 2.4405385632671063e-06, + "loss": 0.82435429, + "num_input_tokens_seen": 159173425, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12750244, + "step": 7425, + "time_per_iteration": 2.562851667404175 + }, + { + "auxiliary_loss_clip": 0.01129216, + "auxiliary_loss_mlp": 0.010318, + "balance_loss_clip": 1.05025768, + "balance_loss_mlp": 1.01904464, + "epoch": 0.4464752743123403, + "flos": 18913324544640.0, + "grad_norm": 1.5294914197517289, + "language_loss": 0.77118266, + "learning_rate": 2.4401586587636655e-06, + "loss": 0.7927928, + "num_input_tokens_seen": 159191210, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.12756348, + "step": 7426, + "time_per_iteration": 2.4412126541137695 + }, + { + "auxiliary_loss_clip": 0.01126019, + "auxiliary_loss_mlp": 0.0103233, + "balance_loss_clip": 1.04468465, + "balance_loss_mlp": 1.01926506, + "epoch": 0.44653539756500826, + "flos": 29570318582400.0, + "grad_norm": 1.6438715876031262, + "language_loss": 0.650352, + "learning_rate": 2.4397787375693634e-06, + "loss": 0.6719355, + "num_input_tokens_seen": 159211755, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.13079834, + "step": 7427, + "time_per_iteration": 2.565791606903076 + }, + { + "auxiliary_loss_clip": 0.01145259, + "auxiliary_loss_mlp": 0.01031293, + "balance_loss_clip": 1.06476641, + "balance_loss_mlp": 1.0182035, + "epoch": 0.44659552081767623, + "flos": 21468979261440.0, + "grad_norm": 1.7536776539583259, + "language_loss": 0.7493183, + "learning_rate": 2.439398799698608e-06, + "loss": 0.77108383, + "num_input_tokens_seen": 159230315, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.13079834, + "step": 7428, + "time_per_iteration": 2.4366538524627686 + }, + { + "auxiliary_loss_clip": 0.01120212, + "auxiliary_loss_mlp": 0.01037144, + "balance_loss_clip": 1.04256487, + "balance_loss_mlp": 1.02342916, + "epoch": 0.4466556440703442, + "flos": 17931886260480.0, + "grad_norm": 3.290954544773601, + "language_loss": 0.78011137, + "learning_rate": 2.439018845165806e-06, + "loss": 0.80168492, + "num_input_tokens_seen": 159249810, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.137146, + "step": 7429, + "time_per_iteration": 3.869614362716675 + }, + { + "auxiliary_loss_clip": 0.01129096, + "auxiliary_loss_mlp": 0.01031847, + "balance_loss_clip": 1.04860616, + "balance_loss_mlp": 1.01819205, + "epoch": 0.44671576732301216, + "flos": 21107430915840.0, + "grad_norm": 1.5308468177827068, + "language_loss": 0.91088504, + "learning_rate": 2.438638873985366e-06, + "loss": 0.93249452, + "num_input_tokens_seen": 159271715, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.13647461, + "step": 7430, + "time_per_iteration": 2.5031938552856445 + }, + { + "auxiliary_loss_clip": 0.01131417, + "auxiliary_loss_mlp": 0.01038243, + "balance_loss_clip": 1.04689932, + "balance_loss_mlp": 1.02304339, + "epoch": 0.4467758905756801, + "flos": 23508920459520.0, + "grad_norm": 2.300826222877538, + "language_loss": 0.79680169, + "learning_rate": 2.4382588861716954e-06, + "loss": 0.81849825, + "num_input_tokens_seen": 159290690, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.15203857, + "step": 7431, + "time_per_iteration": 2.472538948059082 + }, + { + "auxiliary_loss_clip": 0.01138497, + "auxiliary_loss_mlp": 0.01032212, + "balance_loss_clip": 1.05368471, + "balance_loss_mlp": 1.0176326, + "epoch": 0.4468360138283481, + "flos": 18734022829440.0, + "grad_norm": 5.601962865998284, + "language_loss": 0.80229175, + "learning_rate": 2.437878881739204e-06, + "loss": 0.82399881, + "num_input_tokens_seen": 159309400, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.14593506, + "step": 7432, + "time_per_iteration": 2.4825501441955566 + }, + { + "auxiliary_loss_clip": 0.0114161, + "auxiliary_loss_mlp": 0.01038083, + "balance_loss_clip": 1.0580318, + "balance_loss_mlp": 1.02442789, + "epoch": 0.4468961370810161, + "flos": 23477139901440.0, + "grad_norm": 2.046656845128546, + "language_loss": 0.76919878, + "learning_rate": 2.437498860702301e-06, + "loss": 0.79099572, + "num_input_tokens_seen": 159327425, + "router_z_loss_clip": 0.83544922, + "router_z_loss_mlp": 0.13647461, + "step": 7433, + "time_per_iteration": 2.466183662414551 + }, + { + "auxiliary_loss_clip": 0.01137494, + "auxiliary_loss_mlp": 0.01031545, + "balance_loss_clip": 1.05980992, + "balance_loss_mlp": 1.02010703, + "epoch": 0.4469562603336841, + "flos": 30075042539520.0, + "grad_norm": 1.8292967119764172, + "language_loss": 0.77776659, + "learning_rate": 2.437118823075398e-06, + "loss": 0.79945701, + "num_input_tokens_seen": 159345805, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.11444092, + "step": 7434, + "time_per_iteration": 2.5307841300964355 + }, + { + "auxiliary_loss_clip": 0.01135831, + "auxiliary_loss_mlp": 0.0103279, + "balance_loss_clip": 1.05464423, + "balance_loss_mlp": 1.01953363, + "epoch": 0.44701638358635204, + "flos": 22456415116800.0, + "grad_norm": 1.709465998097488, + "language_loss": 0.64480823, + "learning_rate": 2.436738768872905e-06, + "loss": 0.66649449, + "num_input_tokens_seen": 159364595, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.13256836, + "step": 7435, + "time_per_iteration": 2.4370474815368652 + }, + { + "auxiliary_loss_clip": 0.0114418, + "auxiliary_loss_mlp": 0.01031095, + "balance_loss_clip": 1.06211555, + "balance_loss_mlp": 1.01701021, + "epoch": 0.44707650683902, + "flos": 24057851080320.0, + "grad_norm": 1.7799154957814756, + "language_loss": 0.8337059, + "learning_rate": 2.4363586981092346e-06, + "loss": 0.85545862, + "num_input_tokens_seen": 159385265, + "router_z_loss_clip": 0.82080078, + "router_z_loss_mlp": 0.14086914, + "step": 7436, + "time_per_iteration": 2.487037181854248 + }, + { + "auxiliary_loss_clip": 0.01139702, + "auxiliary_loss_mlp": 0.0103764, + "balance_loss_clip": 1.05688405, + "balance_loss_mlp": 1.02321517, + "epoch": 0.44713663009168797, + "flos": 23766938830080.0, + "grad_norm": 1.6781372550471183, + "language_loss": 0.79436219, + "learning_rate": 2.435978610798798e-06, + "loss": 0.81613559, + "num_input_tokens_seen": 159405080, + "router_z_loss_clip": 0.82861328, + "router_z_loss_mlp": 0.14440918, + "step": 7437, + "time_per_iteration": 4.001074314117432 + }, + { + "auxiliary_loss_clip": 0.01129764, + "auxiliary_loss_mlp": 0.01040301, + "balance_loss_clip": 1.04744375, + "balance_loss_mlp": 1.02665186, + "epoch": 0.44719675334435594, + "flos": 24499265316480.0, + "grad_norm": 1.661686510101544, + "language_loss": 0.72035062, + "learning_rate": 2.435598506956009e-06, + "loss": 0.74205124, + "num_input_tokens_seen": 159424595, + "router_z_loss_clip": 0.82226562, + "router_z_loss_mlp": 0.13653564, + "step": 7438, + "time_per_iteration": 2.5609662532806396 + }, + { + "auxiliary_loss_clip": 0.01133599, + "auxiliary_loss_mlp": 0.01037735, + "balance_loss_clip": 1.05066061, + "balance_loss_mlp": 1.02289987, + "epoch": 0.4472568765970239, + "flos": 29781759991680.0, + "grad_norm": 1.7899902877555756, + "language_loss": 0.67404425, + "learning_rate": 2.4352183865952808e-06, + "loss": 0.69575757, + "num_input_tokens_seen": 159443865, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.14849854, + "step": 7439, + "time_per_iteration": 2.5210578441619873 + }, + { + "auxiliary_loss_clip": 0.01140926, + "auxiliary_loss_mlp": 0.01037269, + "balance_loss_clip": 1.05770016, + "balance_loss_mlp": 1.02237988, + "epoch": 0.44731699984969187, + "flos": 24643123286400.0, + "grad_norm": 2.1147169798586365, + "language_loss": 0.74034548, + "learning_rate": 2.4348382497310285e-06, + "loss": 0.7621274, + "num_input_tokens_seen": 159464525, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.14892578, + "step": 7440, + "time_per_iteration": 2.507716655731201 + }, + { + "auxiliary_loss_clip": 0.01134142, + "auxiliary_loss_mlp": 0.01038407, + "balance_loss_clip": 1.05161786, + "balance_loss_mlp": 1.02512681, + "epoch": 0.44737712310235983, + "flos": 29455691304960.0, + "grad_norm": 2.1913432338469123, + "language_loss": 0.74127662, + "learning_rate": 2.4344580963776655e-06, + "loss": 0.7630021, + "num_input_tokens_seen": 159486385, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.13293457, + "step": 7441, + "time_per_iteration": 2.504026174545288 + }, + { + "auxiliary_loss_clip": 0.01142778, + "auxiliary_loss_mlp": 0.01043988, + "balance_loss_clip": 1.05518639, + "balance_loss_mlp": 1.02868116, + "epoch": 0.4474372463550278, + "flos": 24896832024960.0, + "grad_norm": 1.9205849619075717, + "language_loss": 0.7515403, + "learning_rate": 2.4340779265496082e-06, + "loss": 0.773408, + "num_input_tokens_seen": 159503880, + "router_z_loss_clip": 0.87548828, + "router_z_loss_mlp": 0.15307617, + "step": 7442, + "time_per_iteration": 2.493563413619995 + }, + { + "auxiliary_loss_clip": 0.01141176, + "auxiliary_loss_mlp": 0.01034434, + "balance_loss_clip": 1.05673623, + "balance_loss_mlp": 1.0200808, + "epoch": 0.44749736960769576, + "flos": 33181603125120.0, + "grad_norm": 1.893999518992829, + "language_loss": 0.7452082, + "learning_rate": 2.433697740261273e-06, + "loss": 0.76696432, + "num_input_tokens_seen": 159522980, + "router_z_loss_clip": 0.84521484, + "router_z_loss_mlp": 0.14337158, + "step": 7443, + "time_per_iteration": 2.543513298034668 + }, + { + "auxiliary_loss_clip": 0.01130723, + "auxiliary_loss_mlp": 0.01031127, + "balance_loss_clip": 1.0499804, + "balance_loss_mlp": 1.01667881, + "epoch": 0.4475574928603637, + "flos": 21071807602560.0, + "grad_norm": 2.0078348210330503, + "language_loss": 0.77531731, + "learning_rate": 2.4333175375270748e-06, + "loss": 0.7969358, + "num_input_tokens_seen": 159543340, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.14440918, + "step": 7444, + "time_per_iteration": 2.4931881427764893 + }, + { + "auxiliary_loss_clip": 0.01135267, + "auxiliary_loss_mlp": 0.01033282, + "balance_loss_clip": 1.05243611, + "balance_loss_mlp": 1.01948392, + "epoch": 0.4476176161130317, + "flos": 21862523646720.0, + "grad_norm": 2.1698215203803697, + "language_loss": 0.85372424, + "learning_rate": 2.4329373183614333e-06, + "loss": 0.87540972, + "num_input_tokens_seen": 159558210, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.13806152, + "step": 7445, + "time_per_iteration": 2.5013864040374756 + }, + { + "auxiliary_loss_clip": 0.01145717, + "auxiliary_loss_mlp": 0.01035036, + "balance_loss_clip": 1.05951571, + "balance_loss_mlp": 1.02020621, + "epoch": 0.4476777393656997, + "flos": 22528667324160.0, + "grad_norm": 2.094250100751249, + "language_loss": 0.6376195, + "learning_rate": 2.432557082778765e-06, + "loss": 0.65942705, + "num_input_tokens_seen": 159577920, + "router_z_loss_clip": 0.86279297, + "router_z_loss_mlp": 0.14819336, + "step": 7446, + "time_per_iteration": 2.4850668907165527 + }, + { + "auxiliary_loss_clip": 0.01079097, + "auxiliary_loss_mlp": 0.01004958, + "balance_loss_clip": 1.04729474, + "balance_loss_mlp": 1.00333405, + "epoch": 0.4477378626183677, + "flos": 49017133877760.0, + "grad_norm": 0.7337865662366658, + "language_loss": 0.50184643, + "learning_rate": 2.4321768307934884e-06, + "loss": 0.52268696, + "num_input_tokens_seen": 159632295, + "router_z_loss_clip": 0.31884766, + "router_z_loss_mlp": 0.01626587, + "step": 7447, + "time_per_iteration": 2.9230804443359375 + }, + { + "auxiliary_loss_clip": 0.0107176, + "auxiliary_loss_mlp": 0.01003999, + "balance_loss_clip": 1.04322815, + "balance_loss_mlp": 1.00233424, + "epoch": 0.44779798587103564, + "flos": 56542179392640.0, + "grad_norm": 0.82674697001496, + "language_loss": 0.59374571, + "learning_rate": 2.4317965624200235e-06, + "loss": 0.61450326, + "num_input_tokens_seen": 159698435, + "router_z_loss_clip": 0.28466797, + "router_z_loss_mlp": 0.01664734, + "step": 7448, + "time_per_iteration": 3.165517568588257 + }, + { + "auxiliary_loss_clip": 0.01138619, + "auxiliary_loss_mlp": 0.01031021, + "balance_loss_clip": 1.0579145, + "balance_loss_mlp": 1.01830709, + "epoch": 0.4478581091237036, + "flos": 46498536040320.0, + "grad_norm": 1.5155091000006584, + "language_loss": 0.58864701, + "learning_rate": 2.431416277672789e-06, + "loss": 0.6103434, + "num_input_tokens_seen": 159722150, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.1270752, + "step": 7449, + "time_per_iteration": 2.835146903991699 + }, + { + "auxiliary_loss_clip": 0.01142461, + "auxiliary_loss_mlp": 0.01030649, + "balance_loss_clip": 1.06159997, + "balance_loss_mlp": 1.01821542, + "epoch": 0.4479182323763716, + "flos": 20814363849600.0, + "grad_norm": 2.256784301228133, + "language_loss": 0.8007021, + "learning_rate": 2.4310359765662065e-06, + "loss": 0.82243323, + "num_input_tokens_seen": 159740550, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.12432861, + "step": 7450, + "time_per_iteration": 2.4674465656280518 + }, + { + "auxiliary_loss_clip": 0.01145825, + "auxiliary_loss_mlp": 0.01039442, + "balance_loss_clip": 1.06307888, + "balance_loss_mlp": 1.02594733, + "epoch": 0.44797835562903954, + "flos": 14245979212800.0, + "grad_norm": 2.0402350695932836, + "language_loss": 0.79789871, + "learning_rate": 2.430655659114697e-06, + "loss": 0.81975138, + "num_input_tokens_seen": 159758245, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.13494873, + "step": 7451, + "time_per_iteration": 3.8601536750793457 + }, + { + "auxiliary_loss_clip": 0.01084928, + "auxiliary_loss_mlp": 0.01008166, + "balance_loss_clip": 1.05582857, + "balance_loss_mlp": 1.00650871, + "epoch": 0.4480384788817075, + "flos": 63534560169600.0, + "grad_norm": 0.8541956277713012, + "language_loss": 0.62824214, + "learning_rate": 2.430275325332681e-06, + "loss": 0.64917302, + "num_input_tokens_seen": 159826790, + "router_z_loss_clip": 0.29150391, + "router_z_loss_mlp": 0.01657104, + "step": 7452, + "time_per_iteration": 3.1784205436706543 + }, + { + "auxiliary_loss_clip": 0.01143036, + "auxiliary_loss_mlp": 0.01035148, + "balance_loss_clip": 1.06041372, + "balance_loss_mlp": 1.02015114, + "epoch": 0.44809860213437547, + "flos": 21652626522240.0, + "grad_norm": 2.398065633811614, + "language_loss": 0.62654388, + "learning_rate": 2.429894975234582e-06, + "loss": 0.64832574, + "num_input_tokens_seen": 159845805, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.14978027, + "step": 7453, + "time_per_iteration": 2.490684986114502 + }, + { + "auxiliary_loss_clip": 0.01060689, + "auxiliary_loss_mlp": 0.01003631, + "balance_loss_clip": 1.03244698, + "balance_loss_mlp": 1.00204754, + "epoch": 0.44815872538704343, + "flos": 69190634246400.0, + "grad_norm": 0.7630011957146732, + "language_loss": 0.56979513, + "learning_rate": 2.4295146088348224e-06, + "loss": 0.59043831, + "num_input_tokens_seen": 159898860, + "router_z_loss_clip": 0.28173828, + "router_z_loss_mlp": 0.01585388, + "step": 7454, + "time_per_iteration": 2.9875357151031494 + }, + { + "auxiliary_loss_clip": 0.01134603, + "auxiliary_loss_mlp": 0.01035256, + "balance_loss_clip": 1.05281579, + "balance_loss_mlp": 1.02212536, + "epoch": 0.4482188486397114, + "flos": 12598289510400.0, + "grad_norm": 2.1935018708393184, + "language_loss": 0.75063968, + "learning_rate": 2.4291342261478255e-06, + "loss": 0.77233827, + "num_input_tokens_seen": 159911555, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.13110352, + "step": 7455, + "time_per_iteration": 2.3968026638031006 + }, + { + "auxiliary_loss_clip": 0.0113455, + "auxiliary_loss_mlp": 0.0103316, + "balance_loss_clip": 1.05478716, + "balance_loss_mlp": 1.01983786, + "epoch": 0.44827897189237936, + "flos": 34058182631040.0, + "grad_norm": 1.9975761596691552, + "language_loss": 0.76148903, + "learning_rate": 2.428753827188016e-06, + "loss": 0.78316605, + "num_input_tokens_seen": 159931470, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.13317871, + "step": 7456, + "time_per_iteration": 2.5590150356292725 + }, + { + "auxiliary_loss_clip": 0.01134238, + "auxiliary_loss_mlp": 0.01036806, + "balance_loss_clip": 1.0546267, + "balance_loss_mlp": 1.02362156, + "epoch": 0.44833909514504733, + "flos": 25147416280320.0, + "grad_norm": 1.9997402564970683, + "language_loss": 0.76257217, + "learning_rate": 2.428373411969818e-06, + "loss": 0.78428257, + "num_input_tokens_seen": 159946115, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.13195801, + "step": 7457, + "time_per_iteration": 2.441896677017212 + }, + { + "auxiliary_loss_clip": 0.01134895, + "auxiliary_loss_mlp": 0.01044839, + "balance_loss_clip": 1.04910922, + "balance_loss_mlp": 1.02815521, + "epoch": 0.4483992183977153, + "flos": 16179984224640.0, + "grad_norm": 2.438981815896281, + "language_loss": 0.68006432, + "learning_rate": 2.4279929805076576e-06, + "loss": 0.70186168, + "num_input_tokens_seen": 159963915, + "router_z_loss_clip": 0.85693359, + "router_z_loss_mlp": 0.16687012, + "step": 7458, + "time_per_iteration": 2.488834857940674 + }, + { + "auxiliary_loss_clip": 0.01140444, + "auxiliary_loss_mlp": 0.01033303, + "balance_loss_clip": 1.05275738, + "balance_loss_mlp": 1.01815128, + "epoch": 0.44845934165038326, + "flos": 17746048270080.0, + "grad_norm": 2.635075809398323, + "language_loss": 0.71933281, + "learning_rate": 2.427612532815961e-06, + "loss": 0.74107027, + "num_input_tokens_seen": 159982140, + "router_z_loss_clip": 0.87548828, + "router_z_loss_mlp": 0.15148926, + "step": 7459, + "time_per_iteration": 2.462843894958496 + }, + { + "auxiliary_loss_clip": 0.01133259, + "auxiliary_loss_mlp": 0.01033713, + "balance_loss_clip": 1.05132735, + "balance_loss_mlp": 1.02020049, + "epoch": 0.4485194649030513, + "flos": 21835914647040.0, + "grad_norm": 1.6629973575363008, + "language_loss": 0.69857472, + "learning_rate": 2.427232068909154e-06, + "loss": 0.72024441, + "num_input_tokens_seen": 160002280, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.13513184, + "step": 7460, + "time_per_iteration": 2.50756573677063 + }, + { + "auxiliary_loss_clip": 0.01134309, + "auxiliary_loss_mlp": 0.01037545, + "balance_loss_clip": 1.05305266, + "balance_loss_mlp": 1.02387726, + "epoch": 0.44857958815571924, + "flos": 20084515401600.0, + "grad_norm": 2.2351262644391587, + "language_loss": 0.77112138, + "learning_rate": 2.4268515888016635e-06, + "loss": 0.79284, + "num_input_tokens_seen": 160020260, + "router_z_loss_clip": 0.81347656, + "router_z_loss_mlp": 0.13653564, + "step": 7461, + "time_per_iteration": 2.4359779357910156 + }, + { + "auxiliary_loss_clip": 0.01132144, + "auxiliary_loss_mlp": 0.01035102, + "balance_loss_clip": 1.04932749, + "balance_loss_mlp": 1.02155995, + "epoch": 0.4486397114083872, + "flos": 27053519402880.0, + "grad_norm": 1.7574327352766521, + "language_loss": 0.67559034, + "learning_rate": 2.4264710925079184e-06, + "loss": 0.69726276, + "num_input_tokens_seen": 160040240, + "router_z_loss_clip": 0.82861328, + "router_z_loss_mlp": 0.13549805, + "step": 7462, + "time_per_iteration": 3.9794390201568604 + }, + { + "auxiliary_loss_clip": 0.01072217, + "auxiliary_loss_mlp": 0.01005851, + "balance_loss_clip": 1.04362905, + "balance_loss_mlp": 1.00407791, + "epoch": 0.4486998346610552, + "flos": 67321195931520.0, + "grad_norm": 0.745166832947554, + "language_loss": 0.5443089, + "learning_rate": 2.4260905800423462e-06, + "loss": 0.56508958, + "num_input_tokens_seen": 160093865, + "router_z_loss_clip": 0.28662109, + "router_z_loss_mlp": 0.01773071, + "step": 7463, + "time_per_iteration": 3.100937604904175 + }, + { + "auxiliary_loss_clip": 0.01148456, + "auxiliary_loss_mlp": 0.01031787, + "balance_loss_clip": 1.06564832, + "balance_loss_mlp": 1.01813126, + "epoch": 0.44875995791372314, + "flos": 27636816360960.0, + "grad_norm": 1.8188611807830701, + "language_loss": 0.75718558, + "learning_rate": 2.4257100514193775e-06, + "loss": 0.778988, + "num_input_tokens_seen": 160113590, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.13653564, + "step": 7464, + "time_per_iteration": 2.519113779067993 + }, + { + "auxiliary_loss_clip": 0.01138036, + "auxiliary_loss_mlp": 0.01035208, + "balance_loss_clip": 1.05566776, + "balance_loss_mlp": 1.02217793, + "epoch": 0.4488200811663911, + "flos": 13005947940480.0, + "grad_norm": 1.7871224109477843, + "language_loss": 0.73623836, + "learning_rate": 2.425329506653441e-06, + "loss": 0.75797081, + "num_input_tokens_seen": 160131795, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.13031006, + "step": 7465, + "time_per_iteration": 2.5891501903533936 + }, + { + "auxiliary_loss_clip": 0.01144069, + "auxiliary_loss_mlp": 0.01042712, + "balance_loss_clip": 1.05694723, + "balance_loss_mlp": 1.02689886, + "epoch": 0.44888020441905907, + "flos": 27489977562240.0, + "grad_norm": 2.0200061932353663, + "language_loss": 0.79951584, + "learning_rate": 2.424948945758966e-06, + "loss": 0.82138366, + "num_input_tokens_seen": 160150635, + "router_z_loss_clip": 0.87158203, + "router_z_loss_mlp": 0.15814209, + "step": 7466, + "time_per_iteration": 2.5823984146118164 + }, + { + "auxiliary_loss_clip": 0.01140602, + "auxiliary_loss_mlp": 0.01033076, + "balance_loss_clip": 1.05899763, + "balance_loss_mlp": 1.01917601, + "epoch": 0.44894032767172704, + "flos": 18259678800000.0, + "grad_norm": 2.9948866325605503, + "language_loss": 0.80300045, + "learning_rate": 2.4245683687503844e-06, + "loss": 0.82473719, + "num_input_tokens_seen": 160168615, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.13903809, + "step": 7467, + "time_per_iteration": 2.46724796295166 + }, + { + "auxiliary_loss_clip": 0.01132494, + "auxiliary_loss_mlp": 0.01034485, + "balance_loss_clip": 1.05499351, + "balance_loss_mlp": 1.02129996, + "epoch": 0.449000450924395, + "flos": 21579835610880.0, + "grad_norm": 1.7511973907692509, + "language_loss": 0.7509402, + "learning_rate": 2.424187775642129e-06, + "loss": 0.77261001, + "num_input_tokens_seen": 160187295, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.13189697, + "step": 7468, + "time_per_iteration": 2.463470935821533 + }, + { + "auxiliary_loss_clip": 0.01134603, + "auxiliary_loss_mlp": 0.01030828, + "balance_loss_clip": 1.05152607, + "balance_loss_mlp": 1.0184598, + "epoch": 0.44906057417706297, + "flos": 17967904623360.0, + "grad_norm": 1.7265052918394383, + "language_loss": 0.70956606, + "learning_rate": 2.4238071664486297e-06, + "loss": 0.73122036, + "num_input_tokens_seen": 160205115, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.12371826, + "step": 7469, + "time_per_iteration": 2.451815605163574 + }, + { + "auxiliary_loss_clip": 0.01147864, + "auxiliary_loss_mlp": 0.01041136, + "balance_loss_clip": 1.0642606, + "balance_loss_mlp": 1.02646685, + "epoch": 0.44912069742973093, + "flos": 20047347803520.0, + "grad_norm": 1.7358237766682054, + "language_loss": 0.71953136, + "learning_rate": 2.4234265411843203e-06, + "loss": 0.74142134, + "num_input_tokens_seen": 160222580, + "router_z_loss_clip": 0.83544922, + "router_z_loss_mlp": 0.14666748, + "step": 7470, + "time_per_iteration": 2.484732151031494 + }, + { + "auxiliary_loss_clip": 0.01130216, + "auxiliary_loss_mlp": 0.01036475, + "balance_loss_clip": 1.0478133, + "balance_loss_mlp": 1.02143073, + "epoch": 0.4491808206823989, + "flos": 21033526682880.0, + "grad_norm": 2.295872038992119, + "language_loss": 0.77203894, + "learning_rate": 2.423045899863634e-06, + "loss": 0.79370582, + "num_input_tokens_seen": 160241520, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.1505127, + "step": 7471, + "time_per_iteration": 2.48037052154541 + }, + { + "auxiliary_loss_clip": 0.01131501, + "auxiliary_loss_mlp": 0.01033979, + "balance_loss_clip": 1.04869807, + "balance_loss_mlp": 1.02074099, + "epoch": 0.44924094393506686, + "flos": 22967136645120.0, + "grad_norm": 1.742844048565365, + "language_loss": 0.70023233, + "learning_rate": 2.4226652425010048e-06, + "loss": 0.72188711, + "num_input_tokens_seen": 160261815, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.13238525, + "step": 7472, + "time_per_iteration": 2.5126166343688965 + }, + { + "auxiliary_loss_clip": 0.01055902, + "auxiliary_loss_mlp": 0.01004412, + "balance_loss_clip": 1.02765393, + "balance_loss_mlp": 1.00285149, + "epoch": 0.4493010671877349, + "flos": 59233467864960.0, + "grad_norm": 0.745548240080344, + "language_loss": 0.61639798, + "learning_rate": 2.4222845691108676e-06, + "loss": 0.63700116, + "num_input_tokens_seen": 160317070, + "router_z_loss_clip": 0.28222656, + "router_z_loss_mlp": 0.01559448, + "step": 7473, + "time_per_iteration": 3.0329489707946777 + }, + { + "auxiliary_loss_clip": 0.01135661, + "auxiliary_loss_mlp": 0.01039929, + "balance_loss_clip": 1.05069637, + "balance_loss_mlp": 1.02564788, + "epoch": 0.44936119044040285, + "flos": 18004892653440.0, + "grad_norm": 2.2755592805739773, + "language_loss": 0.77787429, + "learning_rate": 2.421903879707657e-06, + "loss": 0.79963022, + "num_input_tokens_seen": 160334980, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.14282227, + "step": 7474, + "time_per_iteration": 3.804497480392456 + }, + { + "auxiliary_loss_clip": 0.01131117, + "auxiliary_loss_mlp": 0.01038642, + "balance_loss_clip": 1.05061316, + "balance_loss_mlp": 1.02501619, + "epoch": 0.4494213136930708, + "flos": 21251827589760.0, + "grad_norm": 1.7296319270631322, + "language_loss": 0.71749872, + "learning_rate": 2.4215231743058086e-06, + "loss": 0.7391963, + "num_input_tokens_seen": 160354500, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.13647461, + "step": 7475, + "time_per_iteration": 2.55016827583313 + }, + { + "auxiliary_loss_clip": 0.0114054, + "auxiliary_loss_mlp": 0.01033485, + "balance_loss_clip": 1.05455577, + "balance_loss_mlp": 1.01996064, + "epoch": 0.4494814369457388, + "flos": 27418695022080.0, + "grad_norm": 1.8073819548301813, + "language_loss": 0.77006072, + "learning_rate": 2.4211424529197594e-06, + "loss": 0.79180098, + "num_input_tokens_seen": 160373650, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.13525391, + "step": 7476, + "time_per_iteration": 2.5568342208862305 + }, + { + "auxiliary_loss_clip": 0.01141805, + "auxiliary_loss_mlp": 0.01042158, + "balance_loss_clip": 1.05697381, + "balance_loss_mlp": 1.02719676, + "epoch": 0.44954156019840674, + "flos": 22854053652480.0, + "grad_norm": 2.0623849610940566, + "language_loss": 0.71763563, + "learning_rate": 2.4207617155639464e-06, + "loss": 0.73947525, + "num_input_tokens_seen": 160393430, + "router_z_loss_clip": 0.84814453, + "router_z_loss_mlp": 0.14935303, + "step": 7477, + "time_per_iteration": 2.4683144092559814 + }, + { + "auxiliary_loss_clip": 0.01140962, + "auxiliary_loss_mlp": 0.01037304, + "balance_loss_clip": 1.05436146, + "balance_loss_mlp": 1.02264094, + "epoch": 0.4496016834510747, + "flos": 17201570935680.0, + "grad_norm": 2.1280401736169585, + "language_loss": 0.67556912, + "learning_rate": 2.4203809622528062e-06, + "loss": 0.69735181, + "num_input_tokens_seen": 160410545, + "router_z_loss_clip": 0.86572266, + "router_z_loss_mlp": 0.14660645, + "step": 7478, + "time_per_iteration": 2.4206607341766357 + }, + { + "auxiliary_loss_clip": 0.01132102, + "auxiliary_loss_mlp": 0.01036767, + "balance_loss_clip": 1.05309057, + "balance_loss_mlp": 1.0233798, + "epoch": 0.4496618067037427, + "flos": 18916628595840.0, + "grad_norm": 1.8291057619848983, + "language_loss": 0.89748132, + "learning_rate": 2.420000193000779e-06, + "loss": 0.91917002, + "num_input_tokens_seen": 160428105, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.13397217, + "step": 7479, + "time_per_iteration": 2.4672696590423584 + }, + { + "auxiliary_loss_clip": 0.01142311, + "auxiliary_loss_mlp": 0.0104224, + "balance_loss_clip": 1.05455089, + "balance_loss_mlp": 1.02746987, + "epoch": 0.44972192995641064, + "flos": 21031659175680.0, + "grad_norm": 3.2763190448874275, + "language_loss": 0.75620878, + "learning_rate": 2.419619407822302e-06, + "loss": 0.77805436, + "num_input_tokens_seen": 160448815, + "router_z_loss_clip": 0.87695312, + "router_z_loss_mlp": 0.14770508, + "step": 7480, + "time_per_iteration": 3.941033124923706 + }, + { + "auxiliary_loss_clip": 0.0114877, + "auxiliary_loss_mlp": 0.01037643, + "balance_loss_clip": 1.06497812, + "balance_loss_mlp": 1.02255058, + "epoch": 0.4497820532090786, + "flos": 20777088510720.0, + "grad_norm": 2.0522811001008585, + "language_loss": 0.79833513, + "learning_rate": 2.419238606731815e-06, + "loss": 0.82019931, + "num_input_tokens_seen": 160465940, + "router_z_loss_clip": 0.83740234, + "router_z_loss_mlp": 0.15100098, + "step": 7481, + "time_per_iteration": 2.44429087638855 + }, + { + "auxiliary_loss_clip": 0.01128338, + "auxiliary_loss_mlp": 0.01034871, + "balance_loss_clip": 1.0496192, + "balance_loss_mlp": 1.02031565, + "epoch": 0.44984217646174657, + "flos": 33802606385280.0, + "grad_norm": 1.6104965508584734, + "language_loss": 0.68584776, + "learning_rate": 2.418857789743758e-06, + "loss": 0.70747989, + "num_input_tokens_seen": 160486710, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.14550781, + "step": 7482, + "time_per_iteration": 2.5658819675445557 + }, + { + "auxiliary_loss_clip": 0.01142562, + "auxiliary_loss_mlp": 0.01043668, + "balance_loss_clip": 1.05982089, + "balance_loss_mlp": 1.02948248, + "epoch": 0.44990229971441453, + "flos": 15518365660800.0, + "grad_norm": 2.0349519180649374, + "language_loss": 0.85267627, + "learning_rate": 2.418476956872571e-06, + "loss": 0.8745386, + "num_input_tokens_seen": 160503405, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.14196777, + "step": 7483, + "time_per_iteration": 2.422701835632324 + }, + { + "auxiliary_loss_clip": 0.01140471, + "auxiliary_loss_mlp": 0.01043135, + "balance_loss_clip": 1.05639219, + "balance_loss_mlp": 1.02926481, + "epoch": 0.4499624229670825, + "flos": 29861913191040.0, + "grad_norm": 1.8726740061869303, + "language_loss": 0.80792487, + "learning_rate": 2.4180961081326967e-06, + "loss": 0.82976091, + "num_input_tokens_seen": 160525080, + "router_z_loss_clip": 0.84033203, + "router_z_loss_mlp": 0.13891602, + "step": 7484, + "time_per_iteration": 2.5794944763183594 + }, + { + "auxiliary_loss_clip": 0.01151142, + "auxiliary_loss_mlp": 0.01032033, + "balance_loss_clip": 1.06183004, + "balance_loss_mlp": 1.01698875, + "epoch": 0.45002254621975046, + "flos": 18513674847360.0, + "grad_norm": 2.679981861538242, + "language_loss": 0.74452353, + "learning_rate": 2.4177152435385754e-06, + "loss": 0.76635528, + "num_input_tokens_seen": 160540895, + "router_z_loss_clip": 0.89208984, + "router_z_loss_mlp": 0.15039062, + "step": 7485, + "time_per_iteration": 2.4429712295532227 + }, + { + "auxiliary_loss_clip": 0.01096396, + "auxiliary_loss_mlp": 0.01008682, + "balance_loss_clip": 1.06850672, + "balance_loss_mlp": 1.00581539, + "epoch": 0.4500826694724185, + "flos": 70420394229120.0, + "grad_norm": 0.7845419761521673, + "language_loss": 0.58611786, + "learning_rate": 2.4173343631046504e-06, + "loss": 0.60716867, + "num_input_tokens_seen": 160598270, + "router_z_loss_clip": 0.27978516, + "router_z_loss_mlp": 0.02864075, + "step": 7486, + "time_per_iteration": 3.1411614418029785 + }, + { + "auxiliary_loss_clip": 0.01133039, + "auxiliary_loss_mlp": 0.01033808, + "balance_loss_clip": 1.05190527, + "balance_loss_mlp": 1.01940739, + "epoch": 0.45014279272508645, + "flos": 15778897983360.0, + "grad_norm": 2.370120959608543, + "language_loss": 0.8276003, + "learning_rate": 2.4169534668453654e-06, + "loss": 0.84926879, + "num_input_tokens_seen": 160614720, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.14398193, + "step": 7487, + "time_per_iteration": 2.4607651233673096 + }, + { + "auxiliary_loss_clip": 0.01134495, + "auxiliary_loss_mlp": 0.01034488, + "balance_loss_clip": 1.0537293, + "balance_loss_mlp": 1.02006352, + "epoch": 0.4502029159777544, + "flos": 21799573061760.0, + "grad_norm": 1.5445328019779005, + "language_loss": 0.76877338, + "learning_rate": 2.4165725547751622e-06, + "loss": 0.79046321, + "num_input_tokens_seen": 160635170, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.14440918, + "step": 7488, + "time_per_iteration": 2.5467875003814697 + }, + { + "auxiliary_loss_clip": 0.01145277, + "auxiliary_loss_mlp": 0.01044597, + "balance_loss_clip": 1.05733538, + "balance_loss_mlp": 1.02914739, + "epoch": 0.4502630392304224, + "flos": 28767966531840.0, + "grad_norm": 2.133357328371335, + "language_loss": 0.72464269, + "learning_rate": 2.4161916269084858e-06, + "loss": 0.74654144, + "num_input_tokens_seen": 160654490, + "router_z_loss_clip": 0.87939453, + "router_z_loss_mlp": 0.15441895, + "step": 7489, + "time_per_iteration": 2.484375238418579 + }, + { + "auxiliary_loss_clip": 0.01143646, + "auxiliary_loss_mlp": 0.01037531, + "balance_loss_clip": 1.05843353, + "balance_loss_mlp": 1.02154469, + "epoch": 0.45032316248309034, + "flos": 15844182952320.0, + "grad_norm": 2.2666977333125073, + "language_loss": 0.69659579, + "learning_rate": 2.4158106832597817e-06, + "loss": 0.71840763, + "num_input_tokens_seen": 160669400, + "router_z_loss_clip": 0.85205078, + "router_z_loss_mlp": 0.15991211, + "step": 7490, + "time_per_iteration": 2.509777545928955 + }, + { + "auxiliary_loss_clip": 0.01072097, + "auxiliary_loss_mlp": 0.01014111, + "balance_loss_clip": 1.04285634, + "balance_loss_mlp": 1.01244855, + "epoch": 0.4503832857357583, + "flos": 57853600945920.0, + "grad_norm": 0.7332477023156487, + "language_loss": 0.56626666, + "learning_rate": 2.415429723843495e-06, + "loss": 0.58712876, + "num_input_tokens_seen": 160733820, + "router_z_loss_clip": 0.29199219, + "router_z_loss_mlp": 0.01661682, + "step": 7491, + "time_per_iteration": 3.041160821914673 + }, + { + "auxiliary_loss_clip": 0.01129721, + "auxiliary_loss_mlp": 0.0103584, + "balance_loss_clip": 1.04835224, + "balance_loss_mlp": 1.02233958, + "epoch": 0.4504434089884263, + "flos": 23878082488320.0, + "grad_norm": 1.7316319935924294, + "language_loss": 0.79246497, + "learning_rate": 2.4150487486740713e-06, + "loss": 0.81412053, + "num_input_tokens_seen": 160753175, + "router_z_loss_clip": 0.81347656, + "router_z_loss_mlp": 0.1350708, + "step": 7492, + "time_per_iteration": 2.4735310077667236 + }, + { + "auxiliary_loss_clip": 0.01148029, + "auxiliary_loss_mlp": 0.01041046, + "balance_loss_clip": 1.06039786, + "balance_loss_mlp": 1.02573991, + "epoch": 0.45050353224109424, + "flos": 17785083375360.0, + "grad_norm": 2.3994802216223623, + "language_loss": 0.92724097, + "learning_rate": 2.4146677577659573e-06, + "loss": 0.94913173, + "num_input_tokens_seen": 160768310, + "router_z_loss_clip": 0.87646484, + "router_z_loss_mlp": 0.15301514, + "step": 7493, + "time_per_iteration": 2.4171085357666016 + }, + { + "auxiliary_loss_clip": 0.01050908, + "auxiliary_loss_mlp": 0.01006499, + "balance_loss_clip": 1.0230006, + "balance_loss_mlp": 1.00489736, + "epoch": 0.4505636554937622, + "flos": 65063420703360.0, + "grad_norm": 0.7948042534300745, + "language_loss": 0.62894416, + "learning_rate": 2.4142867511336e-06, + "loss": 0.64951825, + "num_input_tokens_seen": 160827370, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.01603699, + "step": 7494, + "time_per_iteration": 3.1201655864715576 + }, + { + "auxiliary_loss_clip": 0.01131498, + "auxiliary_loss_mlp": 0.01037378, + "balance_loss_clip": 1.05097795, + "balance_loss_mlp": 1.02240491, + "epoch": 0.45062377874643017, + "flos": 22200084685440.0, + "grad_norm": 1.4520665157875992, + "language_loss": 0.81794786, + "learning_rate": 2.4139057287914484e-06, + "loss": 0.83963668, + "num_input_tokens_seen": 160849140, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.14978027, + "step": 7495, + "time_per_iteration": 3.8735413551330566 + }, + { + "auxiliary_loss_clip": 0.01143685, + "auxiliary_loss_mlp": 0.01033581, + "balance_loss_clip": 1.05801201, + "balance_loss_mlp": 1.0184648, + "epoch": 0.45068390199909814, + "flos": 37670293186560.0, + "grad_norm": 1.6654805145865177, + "language_loss": 0.85735703, + "learning_rate": 2.41352469075395e-06, + "loss": 0.87912965, + "num_input_tokens_seen": 160871280, + "router_z_loss_clip": 0.85644531, + "router_z_loss_mlp": 0.15112305, + "step": 7496, + "time_per_iteration": 2.580470323562622 + }, + { + "auxiliary_loss_clip": 0.01134447, + "auxiliary_loss_mlp": 0.01032342, + "balance_loss_clip": 1.05219579, + "balance_loss_mlp": 1.01730943, + "epoch": 0.4507440252517661, + "flos": 22302501338880.0, + "grad_norm": 2.0888354184093325, + "language_loss": 0.76670367, + "learning_rate": 2.4131436370355534e-06, + "loss": 0.78837156, + "num_input_tokens_seen": 160888625, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.15026855, + "step": 7497, + "time_per_iteration": 2.477834701538086 + }, + { + "auxiliary_loss_clip": 0.01128684, + "auxiliary_loss_mlp": 0.01039365, + "balance_loss_clip": 1.04416776, + "balance_loss_mlp": 1.02444005, + "epoch": 0.45080414850443407, + "flos": 13188374138880.0, + "grad_norm": 1.9626952429313136, + "language_loss": 0.74718785, + "learning_rate": 2.4127625676507088e-06, + "loss": 0.76886833, + "num_input_tokens_seen": 160907040, + "router_z_loss_clip": 0.84619141, + "router_z_loss_mlp": 0.14941406, + "step": 7498, + "time_per_iteration": 2.4301059246063232 + }, + { + "auxiliary_loss_clip": 0.01138227, + "auxiliary_loss_mlp": 0.01035856, + "balance_loss_clip": 1.05389977, + "balance_loss_mlp": 1.02081752, + "epoch": 0.4508642717571021, + "flos": 21944939402880.0, + "grad_norm": 3.0241256949363087, + "language_loss": 0.70285428, + "learning_rate": 2.4123814826138663e-06, + "loss": 0.72459507, + "num_input_tokens_seen": 160927115, + "router_z_loss_clip": 0.84375, + "router_z_loss_mlp": 0.15032959, + "step": 7499, + "time_per_iteration": 2.6046104431152344 + }, + { + "auxiliary_loss_clip": 0.01152738, + "auxiliary_loss_mlp": 0.01035965, + "balance_loss_clip": 1.06628621, + "balance_loss_mlp": 1.02072406, + "epoch": 0.45092439500977005, + "flos": 23367468700800.0, + "grad_norm": 2.0427164093137713, + "language_loss": 0.77281415, + "learning_rate": 2.412000381939477e-06, + "loss": 0.79470116, + "num_input_tokens_seen": 160944405, + "router_z_loss_clip": 0.86328125, + "router_z_loss_mlp": 0.15228271, + "step": 7500, + "time_per_iteration": 2.450674057006836 + }, + { + "auxiliary_loss_clip": 0.01138036, + "auxiliary_loss_mlp": 0.01035789, + "balance_loss_clip": 1.0546937, + "balance_loss_mlp": 1.02121532, + "epoch": 0.450984518262438, + "flos": 20772958446720.0, + "grad_norm": 1.8785860920496182, + "language_loss": 0.62224138, + "learning_rate": 2.411619265641992e-06, + "loss": 0.64397967, + "num_input_tokens_seen": 160961345, + "router_z_loss_clip": 0.83349609, + "router_z_loss_mlp": 0.14587402, + "step": 7501, + "time_per_iteration": 2.491683006286621 + }, + { + "auxiliary_loss_clip": 0.01137905, + "auxiliary_loss_mlp": 0.01042196, + "balance_loss_clip": 1.04968655, + "balance_loss_mlp": 1.02557826, + "epoch": 0.451044641515106, + "flos": 17707372300800.0, + "grad_norm": 1.9505171938049302, + "language_loss": 0.84499419, + "learning_rate": 2.411238133735863e-06, + "loss": 0.86679518, + "num_input_tokens_seen": 160977330, + "router_z_loss_clip": 0.88232422, + "router_z_loss_mlp": 0.16625977, + "step": 7502, + "time_per_iteration": 2.3835999965667725 + }, + { + "auxiliary_loss_clip": 0.01133281, + "auxiliary_loss_mlp": 0.01036507, + "balance_loss_clip": 1.05112147, + "balance_loss_mlp": 1.02230906, + "epoch": 0.45110476476777395, + "flos": 20594698225920.0, + "grad_norm": 1.3783725798417636, + "language_loss": 0.79613996, + "learning_rate": 2.4108569862355418e-06, + "loss": 0.81783777, + "num_input_tokens_seen": 160997280, + "router_z_loss_clip": 0.82226562, + "router_z_loss_mlp": 0.14178467, + "step": 7503, + "time_per_iteration": 2.5008654594421387 + }, + { + "auxiliary_loss_clip": 0.01136235, + "auxiliary_loss_mlp": 0.01041641, + "balance_loss_clip": 1.05338788, + "balance_loss_mlp": 1.02684712, + "epoch": 0.4511648880204419, + "flos": 16034043265920.0, + "grad_norm": 1.98433361464308, + "language_loss": 0.807648, + "learning_rate": 2.410475823155484e-06, + "loss": 0.82942683, + "num_input_tokens_seen": 161014235, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.14782715, + "step": 7504, + "time_per_iteration": 2.4169795513153076 + }, + { + "auxiliary_loss_clip": 0.01138971, + "auxiliary_loss_mlp": 0.01035803, + "balance_loss_clip": 1.05469608, + "balance_loss_mlp": 1.02215338, + "epoch": 0.4512250112731099, + "flos": 23978811202560.0, + "grad_norm": 2.0294591866372858, + "language_loss": 0.6362378, + "learning_rate": 2.4100946445101405e-06, + "loss": 0.65798557, + "num_input_tokens_seen": 161032360, + "router_z_loss_clip": 0.84326172, + "router_z_loss_mlp": 0.13653564, + "step": 7505, + "time_per_iteration": 2.537018060684204 + }, + { + "auxiliary_loss_clip": 0.01072484, + "auxiliary_loss_mlp": 0.01007784, + "balance_loss_clip": 1.04387975, + "balance_loss_mlp": 1.00622094, + "epoch": 0.45128513452577784, + "flos": 71462308037760.0, + "grad_norm": 0.8288080805925285, + "language_loss": 0.5885793, + "learning_rate": 2.409713450313968e-06, + "loss": 0.60938197, + "num_input_tokens_seen": 161091360, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.01564026, + "step": 7506, + "time_per_iteration": 4.493480682373047 + }, + { + "auxiliary_loss_clip": 0.01132465, + "auxiliary_loss_mlp": 0.01038715, + "balance_loss_clip": 1.05309343, + "balance_loss_mlp": 1.02426064, + "epoch": 0.4513452577784458, + "flos": 22090844448000.0, + "grad_norm": 2.353946474427309, + "language_loss": 0.79054713, + "learning_rate": 2.40933224058142e-06, + "loss": 0.81225896, + "num_input_tokens_seen": 161110825, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.14459229, + "step": 7507, + "time_per_iteration": 2.4268760681152344 + }, + { + "auxiliary_loss_clip": 0.01137671, + "auxiliary_loss_mlp": 0.0103755, + "balance_loss_clip": 1.05104756, + "balance_loss_mlp": 1.02189732, + "epoch": 0.4514053810311138, + "flos": 24276403382400.0, + "grad_norm": 1.5697227297510257, + "language_loss": 0.73919892, + "learning_rate": 2.4089510153269526e-06, + "loss": 0.76095116, + "num_input_tokens_seen": 161130685, + "router_z_loss_clip": 0.86621094, + "router_z_loss_mlp": 0.15637207, + "step": 7508, + "time_per_iteration": 2.5407450199127197 + }, + { + "auxiliary_loss_clip": 0.01128446, + "auxiliary_loss_mlp": 0.01040747, + "balance_loss_clip": 1.04795575, + "balance_loss_mlp": 1.02689505, + "epoch": 0.45146550428378174, + "flos": 17886781756800.0, + "grad_norm": 1.916659809456556, + "language_loss": 0.79491156, + "learning_rate": 2.4085697745650217e-06, + "loss": 0.81660348, + "num_input_tokens_seen": 161147555, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.13861084, + "step": 7509, + "time_per_iteration": 2.423778533935547 + }, + { + "auxiliary_loss_clip": 0.01127584, + "auxiliary_loss_mlp": 0.01036104, + "balance_loss_clip": 1.04763317, + "balance_loss_mlp": 1.02191246, + "epoch": 0.4515256275364497, + "flos": 24243437675520.0, + "grad_norm": 1.8146630316626713, + "language_loss": 0.73396122, + "learning_rate": 2.4081885183100837e-06, + "loss": 0.75559813, + "num_input_tokens_seen": 161166255, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.14202881, + "step": 7510, + "time_per_iteration": 2.515056610107422 + }, + { + "auxiliary_loss_clip": 0.01126953, + "auxiliary_loss_mlp": 0.0103674, + "balance_loss_clip": 1.04419589, + "balance_loss_mlp": 1.02009845, + "epoch": 0.45158575078911767, + "flos": 20631039811200.0, + "grad_norm": 1.7587254316525687, + "language_loss": 0.76688814, + "learning_rate": 2.4078072465765964e-06, + "loss": 0.7885251, + "num_input_tokens_seen": 161184720, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.16638184, + "step": 7511, + "time_per_iteration": 2.526700735092163 + }, + { + "auxiliary_loss_clip": 0.0113036, + "auxiliary_loss_mlp": 0.01040304, + "balance_loss_clip": 1.04686141, + "balance_loss_mlp": 1.02317381, + "epoch": 0.45164587404178563, + "flos": 23327751237120.0, + "grad_norm": 1.655997215175135, + "language_loss": 0.78803623, + "learning_rate": 2.4074259593790174e-06, + "loss": 0.80974281, + "num_input_tokens_seen": 161204360, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.17114258, + "step": 7512, + "time_per_iteration": 2.5087668895721436 + }, + { + "auxiliary_loss_clip": 0.01141719, + "auxiliary_loss_mlp": 0.01041294, + "balance_loss_clip": 1.05140352, + "balance_loss_mlp": 1.02517653, + "epoch": 0.45170599729445365, + "flos": 23805973935360.0, + "grad_norm": 1.9425633771343604, + "language_loss": 0.87269378, + "learning_rate": 2.4070446567318053e-06, + "loss": 0.89452386, + "num_input_tokens_seen": 161223575, + "router_z_loss_clip": 0.90332031, + "router_z_loss_mlp": 0.16113281, + "step": 7513, + "time_per_iteration": 2.4377424716949463 + }, + { + "auxiliary_loss_clip": 0.01130704, + "auxiliary_loss_mlp": 0.0103028, + "balance_loss_clip": 1.05422711, + "balance_loss_mlp": 1.01745844, + "epoch": 0.4517661205471216, + "flos": 23512942782720.0, + "grad_norm": 1.6141555113735786, + "language_loss": 0.67503715, + "learning_rate": 2.406663338649419e-06, + "loss": 0.69664693, + "num_input_tokens_seen": 161243805, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.12811279, + "step": 7514, + "time_per_iteration": 2.5845165252685547 + }, + { + "auxiliary_loss_clip": 0.01137177, + "auxiliary_loss_mlp": 0.01041733, + "balance_loss_clip": 1.05554581, + "balance_loss_mlp": 1.02342248, + "epoch": 0.4518262437997896, + "flos": 23513948363520.0, + "grad_norm": 1.7750748685073008, + "language_loss": 0.69427615, + "learning_rate": 2.406282005146318e-06, + "loss": 0.71606529, + "num_input_tokens_seen": 161261450, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.18310547, + "step": 7515, + "time_per_iteration": 2.4649312496185303 + }, + { + "auxiliary_loss_clip": 0.01142141, + "auxiliary_loss_mlp": 0.01048808, + "balance_loss_clip": 1.05546832, + "balance_loss_mlp": 1.03192759, + "epoch": 0.45188636705245755, + "flos": 14568061489920.0, + "grad_norm": 2.2133327301505643, + "language_loss": 0.81626546, + "learning_rate": 2.405900656236963e-06, + "loss": 0.838175, + "num_input_tokens_seen": 161276965, + "router_z_loss_clip": 0.86621094, + "router_z_loss_mlp": 0.16870117, + "step": 7516, + "time_per_iteration": 2.4620423316955566 + }, + { + "auxiliary_loss_clip": 0.0113268, + "auxiliary_loss_mlp": 0.01040157, + "balance_loss_clip": 1.05277467, + "balance_loss_mlp": 1.02419519, + "epoch": 0.4519464903051255, + "flos": 19901550499200.0, + "grad_norm": 1.7133259987695433, + "language_loss": 0.65822577, + "learning_rate": 2.4055192919358137e-06, + "loss": 0.67995417, + "num_input_tokens_seen": 161295375, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.15979004, + "step": 7517, + "time_per_iteration": 2.445054531097412 + }, + { + "auxiliary_loss_clip": 0.0112966, + "auxiliary_loss_mlp": 0.0102779, + "balance_loss_clip": 1.05157733, + "balance_loss_mlp": 1.01519608, + "epoch": 0.4520066135577935, + "flos": 18844376388480.0, + "grad_norm": 1.876698004560217, + "language_loss": 0.63197732, + "learning_rate": 2.405137912257333e-06, + "loss": 0.65355188, + "num_input_tokens_seen": 161313010, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.12597656, + "step": 7518, + "time_per_iteration": 3.825162410736084 + }, + { + "auxiliary_loss_clip": 0.01132422, + "auxiliary_loss_mlp": 0.01040948, + "balance_loss_clip": 1.05126417, + "balance_loss_mlp": 1.02663052, + "epoch": 0.45206673681046144, + "flos": 48214419713280.0, + "grad_norm": 1.45469052760654, + "language_loss": 0.59422696, + "learning_rate": 2.404756517215982e-06, + "loss": 0.6159606, + "num_input_tokens_seen": 161336690, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.14318848, + "step": 7519, + "time_per_iteration": 2.668414354324341 + }, + { + "auxiliary_loss_clip": 0.01135481, + "auxiliary_loss_mlp": 0.01049187, + "balance_loss_clip": 1.05341744, + "balance_loss_mlp": 1.0331471, + "epoch": 0.4521268600631294, + "flos": 23842171866240.0, + "grad_norm": 1.3958738439712453, + "language_loss": 0.72837347, + "learning_rate": 2.404375106826223e-06, + "loss": 0.75022018, + "num_input_tokens_seen": 161357845, + "router_z_loss_clip": 0.81982422, + "router_z_loss_mlp": 0.16033936, + "step": 7520, + "time_per_iteration": 2.478522539138794 + }, + { + "auxiliary_loss_clip": 0.01138627, + "auxiliary_loss_mlp": 0.01038125, + "balance_loss_clip": 1.05647576, + "balance_loss_mlp": 1.02457678, + "epoch": 0.4521869833157974, + "flos": 18843622202880.0, + "grad_norm": 1.861931158926447, + "language_loss": 0.7562778, + "learning_rate": 2.4039936811025194e-06, + "loss": 0.7780453, + "num_input_tokens_seen": 161375160, + "router_z_loss_clip": 0.82080078, + "router_z_loss_mlp": 0.13525391, + "step": 7521, + "time_per_iteration": 2.3936643600463867 + }, + { + "auxiliary_loss_clip": 0.01139095, + "auxiliary_loss_mlp": 0.01041281, + "balance_loss_clip": 1.05237889, + "balance_loss_mlp": 1.0263145, + "epoch": 0.45224710656846534, + "flos": 19788072456960.0, + "grad_norm": 1.847686452134886, + "language_loss": 0.67762291, + "learning_rate": 2.4036122400593343e-06, + "loss": 0.69942665, + "num_input_tokens_seen": 161393690, + "router_z_loss_clip": 0.86767578, + "router_z_loss_mlp": 0.14941406, + "step": 7522, + "time_per_iteration": 2.474932909011841 + }, + { + "auxiliary_loss_clip": 0.01134877, + "auxiliary_loss_mlp": 0.01039007, + "balance_loss_clip": 1.05541778, + "balance_loss_mlp": 1.0247916, + "epoch": 0.4523072298211333, + "flos": 28256131681920.0, + "grad_norm": 1.544605329524058, + "language_loss": 0.60649943, + "learning_rate": 2.403230783711134e-06, + "loss": 0.62823826, + "num_input_tokens_seen": 161415015, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.14196777, + "step": 7523, + "time_per_iteration": 4.019302129745483 + }, + { + "auxiliary_loss_clip": 0.01137993, + "auxiliary_loss_mlp": 0.01040371, + "balance_loss_clip": 1.052917, + "balance_loss_mlp": 1.02521944, + "epoch": 0.45236735307380127, + "flos": 11181039511680.0, + "grad_norm": 1.9188811051818817, + "language_loss": 0.78418255, + "learning_rate": 2.4028493120723813e-06, + "loss": 0.80596614, + "num_input_tokens_seen": 161432940, + "router_z_loss_clip": 0.85058594, + "router_z_loss_mlp": 0.15136719, + "step": 7524, + "time_per_iteration": 2.3975322246551514 + }, + { + "auxiliary_loss_clip": 0.01145087, + "auxiliary_loss_mlp": 0.01037336, + "balance_loss_clip": 1.06364655, + "balance_loss_mlp": 1.02376366, + "epoch": 0.45242747632646924, + "flos": 22601386408320.0, + "grad_norm": 1.7090318294566653, + "language_loss": 0.63769507, + "learning_rate": 2.4024678251575417e-06, + "loss": 0.65951926, + "num_input_tokens_seen": 161452215, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.13580322, + "step": 7525, + "time_per_iteration": 2.5412070751190186 + }, + { + "auxiliary_loss_clip": 0.01126228, + "auxiliary_loss_mlp": 0.01038637, + "balance_loss_clip": 1.04679763, + "balance_loss_mlp": 1.02495193, + "epoch": 0.45248759957913726, + "flos": 18256267008000.0, + "grad_norm": 1.4938863633361068, + "language_loss": 0.79118854, + "learning_rate": 2.402086322981083e-06, + "loss": 0.81283718, + "num_input_tokens_seen": 161469520, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.13702393, + "step": 7526, + "time_per_iteration": 2.4188733100891113 + }, + { + "auxiliary_loss_clip": 0.01130401, + "auxiliary_loss_mlp": 0.01032828, + "balance_loss_clip": 1.05053306, + "balance_loss_mlp": 1.01901734, + "epoch": 0.4525477228318052, + "flos": 22450094323200.0, + "grad_norm": 1.7189657074789912, + "language_loss": 0.80699199, + "learning_rate": 2.40170480555747e-06, + "loss": 0.82862425, + "num_input_tokens_seen": 161487335, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.13806152, + "step": 7527, + "time_per_iteration": 2.456199884414673 + }, + { + "auxiliary_loss_clip": 0.01135266, + "auxiliary_loss_mlp": 0.01031236, + "balance_loss_clip": 1.05566239, + "balance_loss_mlp": 1.0171876, + "epoch": 0.4526078460844732, + "flos": 29644869260160.0, + "grad_norm": 1.6503968587434434, + "language_loss": 0.65178609, + "learning_rate": 2.4013232729011706e-06, + "loss": 0.67345119, + "num_input_tokens_seen": 161510095, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.14038086, + "step": 7528, + "time_per_iteration": 2.51324200630188 + }, + { + "auxiliary_loss_clip": 0.01130302, + "auxiliary_loss_mlp": 0.01037868, + "balance_loss_clip": 1.04988551, + "balance_loss_mlp": 1.02435565, + "epoch": 0.45266796933714115, + "flos": 23039747988480.0, + "grad_norm": 5.08075282213447, + "language_loss": 0.75623596, + "learning_rate": 2.4009417250266525e-06, + "loss": 0.77791768, + "num_input_tokens_seen": 161528725, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.13525391, + "step": 7529, + "time_per_iteration": 2.528881788253784 + }, + { + "auxiliary_loss_clip": 0.0113952, + "auxiliary_loss_mlp": 0.01034884, + "balance_loss_clip": 1.05974984, + "balance_loss_mlp": 1.02106714, + "epoch": 0.4527280925898091, + "flos": 14428405411200.0, + "grad_norm": 2.046149536387738, + "language_loss": 0.72748595, + "learning_rate": 2.400560161948384e-06, + "loss": 0.74922997, + "num_input_tokens_seen": 161547195, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.13818359, + "step": 7530, + "time_per_iteration": 2.4385976791381836 + }, + { + "auxiliary_loss_clip": 0.01131677, + "auxiliary_loss_mlp": 0.01030051, + "balance_loss_clip": 1.05157721, + "balance_loss_mlp": 1.0170927, + "epoch": 0.4527882158424771, + "flos": 22925515760640.0, + "grad_norm": 1.6751327101426714, + "language_loss": 0.76260298, + "learning_rate": 2.400178583680834e-06, + "loss": 0.78422028, + "num_input_tokens_seen": 161565565, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.12976074, + "step": 7531, + "time_per_iteration": 2.5285377502441406 + }, + { + "auxiliary_loss_clip": 0.01123042, + "auxiliary_loss_mlp": 0.01038045, + "balance_loss_clip": 1.04666221, + "balance_loss_mlp": 1.02395391, + "epoch": 0.45284833909514505, + "flos": 25555326105600.0, + "grad_norm": 1.5709236737386743, + "language_loss": 0.6692515, + "learning_rate": 2.3997969902384717e-06, + "loss": 0.6908623, + "num_input_tokens_seen": 161586630, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.14099121, + "step": 7532, + "time_per_iteration": 2.5186495780944824 + }, + { + "auxiliary_loss_clip": 0.01127379, + "auxiliary_loss_mlp": 0.01040745, + "balance_loss_clip": 1.04918289, + "balance_loss_mlp": 1.02751923, + "epoch": 0.452908462347813, + "flos": 18150007599360.0, + "grad_norm": 2.150227845719371, + "language_loss": 0.79168963, + "learning_rate": 2.399415381635768e-06, + "loss": 0.81337082, + "num_input_tokens_seen": 161603815, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.13238525, + "step": 7533, + "time_per_iteration": 2.5090479850769043 + }, + { + "auxiliary_loss_clip": 0.01141679, + "auxiliary_loss_mlp": 0.01034013, + "balance_loss_clip": 1.05326295, + "balance_loss_mlp": 1.01859903, + "epoch": 0.452968585600481, + "flos": 19062749122560.0, + "grad_norm": 2.419129405355741, + "language_loss": 0.82869732, + "learning_rate": 2.3990337578871927e-06, + "loss": 0.85045421, + "num_input_tokens_seen": 161622900, + "router_z_loss_clip": 0.88378906, + "router_z_loss_mlp": 0.15405273, + "step": 7534, + "time_per_iteration": 2.477800130844116 + }, + { + "auxiliary_loss_clip": 0.0113931, + "auxiliary_loss_mlp": 0.0103243, + "balance_loss_clip": 1.05689442, + "balance_loss_mlp": 1.01742184, + "epoch": 0.45302870885314894, + "flos": 22051737515520.0, + "grad_norm": 1.6964178038173603, + "language_loss": 0.76625311, + "learning_rate": 2.3986521190072176e-06, + "loss": 0.78797054, + "num_input_tokens_seen": 161641700, + "router_z_loss_clip": 0.82275391, + "router_z_loss_mlp": 0.15014648, + "step": 7535, + "time_per_iteration": 2.4972217082977295 + }, + { + "auxiliary_loss_clip": 0.01126623, + "auxiliary_loss_mlp": 0.0103545, + "balance_loss_clip": 1.04787493, + "balance_loss_mlp": 1.02243233, + "epoch": 0.4530888321058169, + "flos": 20376217751040.0, + "grad_norm": 1.524123884943088, + "language_loss": 0.80500531, + "learning_rate": 2.3982704650103138e-06, + "loss": 0.82662606, + "num_input_tokens_seen": 161661955, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.13018799, + "step": 7536, + "time_per_iteration": 2.468170404434204 + }, + { + "auxiliary_loss_clip": 0.011421, + "auxiliary_loss_mlp": 0.01037197, + "balance_loss_clip": 1.05890918, + "balance_loss_mlp": 1.02318954, + "epoch": 0.4531489553584849, + "flos": 14830425406080.0, + "grad_norm": 1.729215208971788, + "language_loss": 0.75720942, + "learning_rate": 2.3978887959109544e-06, + "loss": 0.77900243, + "num_input_tokens_seen": 161679245, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.14007568, + "step": 7537, + "time_per_iteration": 2.540703058242798 + }, + { + "auxiliary_loss_clip": 0.01139627, + "auxiliary_loss_mlp": 0.01032943, + "balance_loss_clip": 1.05663514, + "balance_loss_mlp": 1.01946092, + "epoch": 0.45320907861115284, + "flos": 21944975316480.0, + "grad_norm": 2.4743186158114088, + "language_loss": 0.75751698, + "learning_rate": 2.3975071117236118e-06, + "loss": 0.77924263, + "num_input_tokens_seen": 161698795, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.13470459, + "step": 7538, + "time_per_iteration": 2.456725597381592 + }, + { + "auxiliary_loss_clip": 0.01054364, + "auxiliary_loss_mlp": 0.01003566, + "balance_loss_clip": 1.02587378, + "balance_loss_mlp": 1.00200903, + "epoch": 0.45326920186382086, + "flos": 66251455038720.0, + "grad_norm": 0.7846494735307891, + "language_loss": 0.62401509, + "learning_rate": 2.3971254124627593e-06, + "loss": 0.64459437, + "num_input_tokens_seen": 161761980, + "router_z_loss_clip": 0.28564453, + "router_z_loss_mlp": 0.01557922, + "step": 7539, + "time_per_iteration": 4.6945765018463135 + }, + { + "auxiliary_loss_clip": 0.01128037, + "auxiliary_loss_mlp": 0.01039832, + "balance_loss_clip": 1.0480094, + "balance_loss_mlp": 1.02617085, + "epoch": 0.4533293251164888, + "flos": 14684233052160.0, + "grad_norm": 1.715418175743887, + "language_loss": 0.65804648, + "learning_rate": 2.396743698142872e-06, + "loss": 0.67972517, + "num_input_tokens_seen": 161779455, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.13665771, + "step": 7540, + "time_per_iteration": 2.503894567489624 + }, + { + "auxiliary_loss_clip": 0.01141186, + "auxiliary_loss_mlp": 0.01040852, + "balance_loss_clip": 1.05655873, + "balance_loss_mlp": 1.02612376, + "epoch": 0.4533894483691568, + "flos": 22601206840320.0, + "grad_norm": 1.8652429601638814, + "language_loss": 0.84863281, + "learning_rate": 2.396361968778424e-06, + "loss": 0.87045324, + "num_input_tokens_seen": 161798980, + "router_z_loss_clip": 0.84667969, + "router_z_loss_mlp": 0.14727783, + "step": 7541, + "time_per_iteration": 2.489431619644165 + }, + { + "auxiliary_loss_clip": 0.01133766, + "auxiliary_loss_mlp": 0.01036944, + "balance_loss_clip": 1.05086684, + "balance_loss_mlp": 1.02330649, + "epoch": 0.45344957162182475, + "flos": 34751617666560.0, + "grad_norm": 1.7659769839354593, + "language_loss": 0.76966929, + "learning_rate": 2.395980224383889e-06, + "loss": 0.79137641, + "num_input_tokens_seen": 161819745, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.1362915, + "step": 7542, + "time_per_iteration": 2.5686702728271484 + }, + { + "auxiliary_loss_clip": 0.01137276, + "auxiliary_loss_mlp": 0.01032284, + "balance_loss_clip": 1.05589128, + "balance_loss_mlp": 1.01816928, + "epoch": 0.4535096948744927, + "flos": 23550218121600.0, + "grad_norm": 1.6935572867474964, + "language_loss": 0.806036, + "learning_rate": 2.395598464973746e-06, + "loss": 0.82773161, + "num_input_tokens_seen": 161838575, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.14123535, + "step": 7543, + "time_per_iteration": 2.4420595169067383 + }, + { + "auxiliary_loss_clip": 0.01138379, + "auxiliary_loss_mlp": 0.01041226, + "balance_loss_clip": 1.0557214, + "balance_loss_mlp": 1.0264914, + "epoch": 0.4535698181271607, + "flos": 25557552748800.0, + "grad_norm": 1.8096885120078743, + "language_loss": 0.76253247, + "learning_rate": 2.395216690562469e-06, + "loss": 0.78432846, + "num_input_tokens_seen": 161858590, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.14709473, + "step": 7544, + "time_per_iteration": 2.506483554840088 + }, + { + "auxiliary_loss_clip": 0.01136307, + "auxiliary_loss_mlp": 0.01038671, + "balance_loss_clip": 1.05461812, + "balance_loss_mlp": 1.02490866, + "epoch": 0.45362994137982865, + "flos": 24864117713280.0, + "grad_norm": 3.378753511645543, + "language_loss": 0.75254256, + "learning_rate": 2.3948349011645355e-06, + "loss": 0.77429235, + "num_input_tokens_seen": 161878390, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.13757324, + "step": 7545, + "time_per_iteration": 2.490025281906128 + }, + { + "auxiliary_loss_clip": 0.01130677, + "auxiliary_loss_mlp": 0.01033207, + "balance_loss_clip": 1.05145717, + "balance_loss_mlp": 1.01994479, + "epoch": 0.4536900646324966, + "flos": 30806794408320.0, + "grad_norm": 1.7856687772111834, + "language_loss": 0.71939421, + "learning_rate": 2.394453096794423e-06, + "loss": 0.74103308, + "num_input_tokens_seen": 161898610, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.13250732, + "step": 7546, + "time_per_iteration": 2.577258348464966 + }, + { + "auxiliary_loss_clip": 0.01130877, + "auxiliary_loss_mlp": 0.01035622, + "balance_loss_clip": 1.04760969, + "balance_loss_mlp": 1.01868224, + "epoch": 0.4537501878851646, + "flos": 23404313076480.0, + "grad_norm": 1.531118966660709, + "language_loss": 0.76017588, + "learning_rate": 2.394071277466609e-06, + "loss": 0.78184086, + "num_input_tokens_seen": 161918210, + "router_z_loss_clip": 0.83349609, + "router_z_loss_mlp": 0.16943359, + "step": 7547, + "time_per_iteration": 2.453596353530884 + }, + { + "auxiliary_loss_clip": 0.01132384, + "auxiliary_loss_mlp": 0.01035476, + "balance_loss_clip": 1.05039263, + "balance_loss_mlp": 1.01932335, + "epoch": 0.45381031113783254, + "flos": 18149289327360.0, + "grad_norm": 2.429881266236959, + "language_loss": 0.69653273, + "learning_rate": 2.393689443195573e-06, + "loss": 0.71821129, + "num_input_tokens_seen": 161936950, + "router_z_loss_clip": 0.81982422, + "router_z_loss_mlp": 0.16149902, + "step": 7548, + "time_per_iteration": 2.462742328643799 + }, + { + "auxiliary_loss_clip": 0.01132902, + "auxiliary_loss_mlp": 0.01041192, + "balance_loss_clip": 1.04985666, + "balance_loss_mlp": 1.02731037, + "epoch": 0.4538704343905005, + "flos": 25336666062720.0, + "grad_norm": 2.0511285350411494, + "language_loss": 0.73005569, + "learning_rate": 2.393307593995794e-06, + "loss": 0.75179672, + "num_input_tokens_seen": 161955550, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.13879395, + "step": 7549, + "time_per_iteration": 2.5392496585845947 + }, + { + "auxiliary_loss_clip": 0.01134542, + "auxiliary_loss_mlp": 0.01029209, + "balance_loss_clip": 1.05404568, + "balance_loss_mlp": 1.01637626, + "epoch": 0.4539305576431685, + "flos": 28731445378560.0, + "grad_norm": 1.5462174191796398, + "language_loss": 0.64783704, + "learning_rate": 2.392925729881751e-06, + "loss": 0.6694746, + "num_input_tokens_seen": 161976760, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.1282959, + "step": 7550, + "time_per_iteration": 4.013259410858154 + }, + { + "auxiliary_loss_clip": 0.01133621, + "auxiliary_loss_mlp": 0.01033906, + "balance_loss_clip": 1.05415177, + "balance_loss_mlp": 1.02064991, + "epoch": 0.45399068089583644, + "flos": 22492397566080.0, + "grad_norm": 1.6076527053400576, + "language_loss": 0.68494278, + "learning_rate": 2.3925438508679263e-06, + "loss": 0.70661807, + "num_input_tokens_seen": 161996120, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.13262939, + "step": 7551, + "time_per_iteration": 2.484144449234009 + }, + { + "auxiliary_loss_clip": 0.01135753, + "auxiliary_loss_mlp": 0.01036009, + "balance_loss_clip": 1.05348063, + "balance_loss_mlp": 1.02175796, + "epoch": 0.45405080414850446, + "flos": 12893403651840.0, + "grad_norm": 1.963240916202337, + "language_loss": 0.7935468, + "learning_rate": 2.392161956968798e-06, + "loss": 0.81526434, + "num_input_tokens_seen": 162011125, + "router_z_loss_clip": 0.82275391, + "router_z_loss_mlp": 0.14245605, + "step": 7552, + "time_per_iteration": 2.4757282733917236 + }, + { + "auxiliary_loss_clip": 0.0106597, + "auxiliary_loss_mlp": 0.01006163, + "balance_loss_clip": 1.03811431, + "balance_loss_mlp": 1.00441909, + "epoch": 0.4541109274011724, + "flos": 59766919724160.0, + "grad_norm": 0.8177922411319964, + "language_loss": 0.57756954, + "learning_rate": 2.39178004819885e-06, + "loss": 0.59829086, + "num_input_tokens_seen": 162068705, + "router_z_loss_clip": 0.27880859, + "router_z_loss_mlp": 0.01739502, + "step": 7553, + "time_per_iteration": 3.058001756668091 + }, + { + "auxiliary_loss_clip": 0.01141811, + "auxiliary_loss_mlp": 0.01033493, + "balance_loss_clip": 1.06218696, + "balance_loss_mlp": 1.02061284, + "epoch": 0.4541710506538404, + "flos": 28511743841280.0, + "grad_norm": 1.369231232707817, + "language_loss": 0.76872563, + "learning_rate": 2.3913981245725626e-06, + "loss": 0.79047859, + "num_input_tokens_seen": 162089655, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.12890625, + "step": 7554, + "time_per_iteration": 2.517427921295166 + }, + { + "auxiliary_loss_clip": 0.01145756, + "auxiliary_loss_mlp": 0.01035514, + "balance_loss_clip": 1.05859303, + "balance_loss_mlp": 1.02036262, + "epoch": 0.45423117390650836, + "flos": 17675591742720.0, + "grad_norm": 4.534736936302102, + "language_loss": 0.76921034, + "learning_rate": 2.3910161861044194e-06, + "loss": 0.79102308, + "num_input_tokens_seen": 162108465, + "router_z_loss_clip": 0.87207031, + "router_z_loss_mlp": 0.15161133, + "step": 7555, + "time_per_iteration": 2.428743839263916 + }, + { + "auxiliary_loss_clip": 0.01133933, + "auxiliary_loss_mlp": 0.01033793, + "balance_loss_clip": 1.0544765, + "balance_loss_mlp": 1.02059639, + "epoch": 0.4542912971591763, + "flos": 28072556248320.0, + "grad_norm": 1.321434363138273, + "language_loss": 0.72709864, + "learning_rate": 2.390634232808903e-06, + "loss": 0.7487759, + "num_input_tokens_seen": 162129910, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.13195801, + "step": 7556, + "time_per_iteration": 2.5067660808563232 + }, + { + "auxiliary_loss_clip": 0.01145567, + "auxiliary_loss_mlp": 0.0103462, + "balance_loss_clip": 1.05985188, + "balance_loss_mlp": 1.02063692, + "epoch": 0.4543514204118443, + "flos": 22671771108480.0, + "grad_norm": 2.442954961316228, + "language_loss": 0.63138425, + "learning_rate": 2.3902522647004982e-06, + "loss": 0.6531862, + "num_input_tokens_seen": 162148840, + "router_z_loss_clip": 0.85693359, + "router_z_loss_mlp": 0.13989258, + "step": 7557, + "time_per_iteration": 2.4979875087738037 + }, + { + "auxiliary_loss_clip": 0.0106826, + "auxiliary_loss_mlp": 0.01015018, + "balance_loss_clip": 1.04058444, + "balance_loss_mlp": 1.01350558, + "epoch": 0.45441154366451225, + "flos": 58216549921920.0, + "grad_norm": 0.6818569102666596, + "language_loss": 0.57581782, + "learning_rate": 2.3898702817936875e-06, + "loss": 0.5966506, + "num_input_tokens_seen": 162208500, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.0151062, + "step": 7558, + "time_per_iteration": 2.98980712890625 + }, + { + "auxiliary_loss_clip": 0.01134713, + "auxiliary_loss_mlp": 0.01032892, + "balance_loss_clip": 1.05202985, + "balance_loss_mlp": 1.01813388, + "epoch": 0.4544716669171802, + "flos": 16764286763520.0, + "grad_norm": 3.1019527452418476, + "language_loss": 0.56873518, + "learning_rate": 2.3894882841029573e-06, + "loss": 0.59041125, + "num_input_tokens_seen": 162224650, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.14758301, + "step": 7559, + "time_per_iteration": 2.4487693309783936 + }, + { + "auxiliary_loss_clip": 0.01128365, + "auxiliary_loss_mlp": 0.0103887, + "balance_loss_clip": 1.04878616, + "balance_loss_mlp": 1.02471948, + "epoch": 0.4545317901698482, + "flos": 15925233991680.0, + "grad_norm": 1.731229335105409, + "language_loss": 0.7199254, + "learning_rate": 2.389106271642792e-06, + "loss": 0.74159771, + "num_input_tokens_seen": 162242930, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.14160156, + "step": 7560, + "time_per_iteration": 2.4278953075408936 + }, + { + "auxiliary_loss_clip": 0.01140788, + "auxiliary_loss_mlp": 0.0103354, + "balance_loss_clip": 1.05688512, + "balance_loss_mlp": 1.01940179, + "epoch": 0.45459191342251615, + "flos": 17639752947840.0, + "grad_norm": 1.9352124571002833, + "language_loss": 0.6852594, + "learning_rate": 2.3887242444276775e-06, + "loss": 0.7070027, + "num_input_tokens_seen": 162261455, + "router_z_loss_clip": 0.84033203, + "router_z_loss_mlp": 0.14141846, + "step": 7561, + "time_per_iteration": 3.857266664505005 + }, + { + "auxiliary_loss_clip": 0.01133758, + "auxiliary_loss_mlp": 0.01032224, + "balance_loss_clip": 1.05574262, + "balance_loss_mlp": 1.01996911, + "epoch": 0.4546520366751841, + "flos": 16176608346240.0, + "grad_norm": 1.8856499519185053, + "language_loss": 0.85129535, + "learning_rate": 2.3883422024721015e-06, + "loss": 0.8729552, + "num_input_tokens_seen": 162279725, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.1227417, + "step": 7562, + "time_per_iteration": 2.4697673320770264 + }, + { + "auxiliary_loss_clip": 0.01129285, + "auxiliary_loss_mlp": 0.01036927, + "balance_loss_clip": 1.04949105, + "balance_loss_mlp": 1.02369475, + "epoch": 0.4547121599278521, + "flos": 19751443562880.0, + "grad_norm": 1.8414640863243503, + "language_loss": 0.89202291, + "learning_rate": 2.38796014579055e-06, + "loss": 0.91368502, + "num_input_tokens_seen": 162297865, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.13244629, + "step": 7563, + "time_per_iteration": 2.5690224170684814 + }, + { + "auxiliary_loss_clip": 0.01130746, + "auxiliary_loss_mlp": 0.01037969, + "balance_loss_clip": 1.04915476, + "balance_loss_mlp": 1.02349067, + "epoch": 0.45477228318052004, + "flos": 19937461121280.0, + "grad_norm": 1.8742861222233518, + "language_loss": 0.71391308, + "learning_rate": 2.3875780743975097e-06, + "loss": 0.73560023, + "num_input_tokens_seen": 162316010, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.14465332, + "step": 7564, + "time_per_iteration": 2.485630512237549 + }, + { + "auxiliary_loss_clip": 0.01142756, + "auxiliary_loss_mlp": 0.01033494, + "balance_loss_clip": 1.05875182, + "balance_loss_mlp": 1.02016604, + "epoch": 0.454832406433188, + "flos": 21288312829440.0, + "grad_norm": 2.2004137965773745, + "language_loss": 0.68677688, + "learning_rate": 2.3871959883074713e-06, + "loss": 0.70853937, + "num_input_tokens_seen": 162336115, + "router_z_loss_clip": 0.84033203, + "router_z_loss_mlp": 0.13317871, + "step": 7565, + "time_per_iteration": 2.4927785396575928 + }, + { + "auxiliary_loss_clip": 0.01122997, + "auxiliary_loss_mlp": 0.01036097, + "balance_loss_clip": 1.04556072, + "balance_loss_mlp": 1.0228169, + "epoch": 0.45489252968585603, + "flos": 24498726612480.0, + "grad_norm": 2.351400597794851, + "language_loss": 0.80483198, + "learning_rate": 2.386813887534922e-06, + "loss": 0.82642293, + "num_input_tokens_seen": 162355705, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.13275146, + "step": 7566, + "time_per_iteration": 3.8863165378570557 + }, + { + "auxiliary_loss_clip": 0.01132538, + "auxiliary_loss_mlp": 0.01035712, + "balance_loss_clip": 1.0507617, + "balance_loss_mlp": 1.02052474, + "epoch": 0.454952652938524, + "flos": 17092474352640.0, + "grad_norm": 1.6516452112930693, + "language_loss": 0.73592377, + "learning_rate": 2.3864317720943508e-06, + "loss": 0.75760627, + "num_input_tokens_seen": 162374055, + "router_z_loss_clip": 0.81738281, + "router_z_loss_mlp": 0.15209961, + "step": 7567, + "time_per_iteration": 2.4041755199432373 + }, + { + "auxiliary_loss_clip": 0.01143626, + "auxiliary_loss_mlp": 0.01035811, + "balance_loss_clip": 1.06125784, + "balance_loss_mlp": 1.02253139, + "epoch": 0.45501277619119196, + "flos": 27630387826560.0, + "grad_norm": 1.465296323552939, + "language_loss": 0.81395388, + "learning_rate": 2.386049642000249e-06, + "loss": 0.8357482, + "num_input_tokens_seen": 162393560, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.13293457, + "step": 7568, + "time_per_iteration": 2.557708978652954 + }, + { + "auxiliary_loss_clip": 0.01144086, + "auxiliary_loss_mlp": 0.01055544, + "balance_loss_clip": 1.05761325, + "balance_loss_mlp": 1.03756714, + "epoch": 0.4550728994438599, + "flos": 19974664632960.0, + "grad_norm": 2.113338130992246, + "language_loss": 0.79405046, + "learning_rate": 2.3856674972671055e-06, + "loss": 0.81604677, + "num_input_tokens_seen": 162413170, + "router_z_loss_clip": 0.86572266, + "router_z_loss_mlp": 0.17993164, + "step": 7569, + "time_per_iteration": 2.4246296882629395 + }, + { + "auxiliary_loss_clip": 0.0114251, + "auxiliary_loss_mlp": 0.01035545, + "balance_loss_clip": 1.05662572, + "balance_loss_mlp": 1.01979804, + "epoch": 0.4551330226965279, + "flos": 26066873646720.0, + "grad_norm": 1.683246865140871, + "language_loss": 0.75060725, + "learning_rate": 2.385285337909412e-06, + "loss": 0.7723878, + "num_input_tokens_seen": 162434080, + "router_z_loss_clip": 0.85742188, + "router_z_loss_mlp": 0.1574707, + "step": 7570, + "time_per_iteration": 2.512942314147949 + }, + { + "auxiliary_loss_clip": 0.01133656, + "auxiliary_loss_mlp": 0.01043813, + "balance_loss_clip": 1.05173373, + "balance_loss_mlp": 1.02898359, + "epoch": 0.45519314594919585, + "flos": 32781091501440.0, + "grad_norm": 2.6602065878785592, + "language_loss": 0.74489939, + "learning_rate": 2.3849031639416596e-06, + "loss": 0.76667404, + "num_input_tokens_seen": 162455445, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.14837646, + "step": 7571, + "time_per_iteration": 2.523723840713501 + }, + { + "auxiliary_loss_clip": 0.01129174, + "auxiliary_loss_mlp": 0.01033223, + "balance_loss_clip": 1.05284524, + "balance_loss_mlp": 1.02059841, + "epoch": 0.4552532692018638, + "flos": 19172671718400.0, + "grad_norm": 1.5044301192626497, + "language_loss": 0.81229806, + "learning_rate": 2.3845209753783414e-06, + "loss": 0.83392203, + "num_input_tokens_seen": 162474940, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.1262207, + "step": 7572, + "time_per_iteration": 2.4539480209350586 + }, + { + "auxiliary_loss_clip": 0.01135608, + "auxiliary_loss_mlp": 0.01039145, + "balance_loss_clip": 1.05168486, + "balance_loss_mlp": 1.02441621, + "epoch": 0.4553133924545318, + "flos": 26027156183040.0, + "grad_norm": 1.946884339338173, + "language_loss": 0.7284283, + "learning_rate": 2.3841387722339486e-06, + "loss": 0.75017583, + "num_input_tokens_seen": 162493340, + "router_z_loss_clip": 0.83886719, + "router_z_loss_mlp": 0.14727783, + "step": 7573, + "time_per_iteration": 2.565052032470703 + }, + { + "auxiliary_loss_clip": 0.01136591, + "auxiliary_loss_mlp": 0.0103724, + "balance_loss_clip": 1.05151343, + "balance_loss_mlp": 1.02216029, + "epoch": 0.45537351570719975, + "flos": 30661535808000.0, + "grad_norm": 2.102299754813842, + "language_loss": 0.74534386, + "learning_rate": 2.3837565545229748e-06, + "loss": 0.76708221, + "num_input_tokens_seen": 162514360, + "router_z_loss_clip": 0.85107422, + "router_z_loss_mlp": 0.15075684, + "step": 7574, + "time_per_iteration": 2.548319101333618 + }, + { + "auxiliary_loss_clip": 0.01144391, + "auxiliary_loss_mlp": 0.01035239, + "balance_loss_clip": 1.05835235, + "balance_loss_mlp": 1.02123225, + "epoch": 0.4554336389598677, + "flos": 24353396184960.0, + "grad_norm": 1.6875295517898905, + "language_loss": 0.71303624, + "learning_rate": 2.383374322259915e-06, + "loss": 0.73483258, + "num_input_tokens_seen": 162535240, + "router_z_loss_clip": 0.86132812, + "router_z_loss_mlp": 0.14013672, + "step": 7575, + "time_per_iteration": 2.51658034324646 + }, + { + "auxiliary_loss_clip": 0.01121696, + "auxiliary_loss_mlp": 0.01037021, + "balance_loss_clip": 1.04186487, + "balance_loss_mlp": 1.02275801, + "epoch": 0.4554937622125357, + "flos": 20557925677440.0, + "grad_norm": 1.8011618381005163, + "language_loss": 0.73053133, + "learning_rate": 2.3829920754592617e-06, + "loss": 0.75211847, + "num_input_tokens_seen": 162553880, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.1427002, + "step": 7576, + "time_per_iteration": 2.4955484867095947 + }, + { + "auxiliary_loss_clip": 0.01127133, + "auxiliary_loss_mlp": 0.01038685, + "balance_loss_clip": 1.04846525, + "balance_loss_mlp": 1.02461863, + "epoch": 0.45555388546520365, + "flos": 22820764723200.0, + "grad_norm": 1.6544724718468236, + "language_loss": 0.66071343, + "learning_rate": 2.382609814135511e-06, + "loss": 0.68237162, + "num_input_tokens_seen": 162574485, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.14068604, + "step": 7577, + "time_per_iteration": 2.496340036392212 + }, + { + "auxiliary_loss_clip": 0.01138717, + "auxiliary_loss_mlp": 0.01044608, + "balance_loss_clip": 1.05700493, + "balance_loss_mlp": 1.02891457, + "epoch": 0.4556140087178716, + "flos": 21725992051200.0, + "grad_norm": 1.8489414400176842, + "language_loss": 0.74284351, + "learning_rate": 2.382227538303157e-06, + "loss": 0.76467681, + "num_input_tokens_seen": 162595130, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.15692139, + "step": 7578, + "time_per_iteration": 2.543971300125122 + }, + { + "auxiliary_loss_clip": 0.01123997, + "auxiliary_loss_mlp": 0.01036488, + "balance_loss_clip": 1.04586589, + "balance_loss_mlp": 1.02220035, + "epoch": 0.45567413197053963, + "flos": 25994513698560.0, + "grad_norm": 1.8095666148200389, + "language_loss": 0.70452338, + "learning_rate": 2.381845247976697e-06, + "loss": 0.72612822, + "num_input_tokens_seen": 162615720, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.14294434, + "step": 7579, + "time_per_iteration": 2.5559754371643066 + }, + { + "auxiliary_loss_clip": 0.01135, + "auxiliary_loss_mlp": 0.01034967, + "balance_loss_clip": 1.05701637, + "balance_loss_mlp": 1.021806, + "epoch": 0.4557342552232076, + "flos": 21537604195200.0, + "grad_norm": 1.6395485628426143, + "language_loss": 0.78509372, + "learning_rate": 2.381462943170627e-06, + "loss": 0.80679339, + "num_input_tokens_seen": 162635825, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.13165283, + "step": 7580, + "time_per_iteration": 2.5987656116485596 + }, + { + "auxiliary_loss_clip": 0.01128398, + "auxiliary_loss_mlp": 0.01034888, + "balance_loss_clip": 1.04950142, + "balance_loss_mlp": 1.02052307, + "epoch": 0.45579437847587556, + "flos": 40001972647680.0, + "grad_norm": 1.682945880953654, + "language_loss": 0.68777066, + "learning_rate": 2.381080623899444e-06, + "loss": 0.70940351, + "num_input_tokens_seen": 162659130, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.1439209, + "step": 7581, + "time_per_iteration": 2.613966226577759 + }, + { + "auxiliary_loss_clip": 0.01130717, + "auxiliary_loss_mlp": 0.01035693, + "balance_loss_clip": 1.05297613, + "balance_loss_mlp": 1.02072048, + "epoch": 0.4558545017285435, + "flos": 31138501530240.0, + "grad_norm": 1.7293092709530296, + "language_loss": 0.73458457, + "learning_rate": 2.3806982901776455e-06, + "loss": 0.75624865, + "num_input_tokens_seen": 162681665, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.1496582, + "step": 7582, + "time_per_iteration": 2.591416835784912 + }, + { + "auxiliary_loss_clip": 0.0112931, + "auxiliary_loss_mlp": 0.01045468, + "balance_loss_clip": 1.04787016, + "balance_loss_mlp": 1.02998888, + "epoch": 0.4559146249812115, + "flos": 21725776569600.0, + "grad_norm": 1.676530392237054, + "language_loss": 0.72210205, + "learning_rate": 2.380315942019729e-06, + "loss": 0.74384981, + "num_input_tokens_seen": 162702040, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.15484619, + "step": 7583, + "time_per_iteration": 3.9675705432891846 + }, + { + "auxiliary_loss_clip": 0.0113042, + "auxiliary_loss_mlp": 0.01035911, + "balance_loss_clip": 1.04664254, + "balance_loss_mlp": 1.02140284, + "epoch": 0.45597474823387946, + "flos": 23805973935360.0, + "grad_norm": 2.659545591171386, + "language_loss": 0.72838151, + "learning_rate": 2.379933579440195e-06, + "loss": 0.75004482, + "num_input_tokens_seen": 162722375, + "router_z_loss_clip": 0.83740234, + "router_z_loss_mlp": 0.14501953, + "step": 7584, + "time_per_iteration": 2.5165231227874756 + }, + { + "auxiliary_loss_clip": 0.01130604, + "auxiliary_loss_mlp": 0.01036359, + "balance_loss_clip": 1.04959619, + "balance_loss_mlp": 1.02178526, + "epoch": 0.4560348714865474, + "flos": 31905661230720.0, + "grad_norm": 1.6159609934225327, + "language_loss": 0.68194282, + "learning_rate": 2.379551202453541e-06, + "loss": 0.70361251, + "num_input_tokens_seen": 162746095, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.14556885, + "step": 7585, + "time_per_iteration": 2.6301209926605225 + }, + { + "auxiliary_loss_clip": 0.01131384, + "auxiliary_loss_mlp": 0.010321, + "balance_loss_clip": 1.05092168, + "balance_loss_mlp": 1.01834941, + "epoch": 0.4560949947392154, + "flos": 22048828513920.0, + "grad_norm": 1.6278550465778283, + "language_loss": 0.76551342, + "learning_rate": 2.379168811074267e-06, + "loss": 0.78714824, + "num_input_tokens_seen": 162766330, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.1373291, + "step": 7586, + "time_per_iteration": 2.455207347869873 + }, + { + "auxiliary_loss_clip": 0.01145705, + "auxiliary_loss_mlp": 0.01026035, + "balance_loss_clip": 1.0646553, + "balance_loss_mlp": 1.01342869, + "epoch": 0.45615511799188335, + "flos": 24571804832640.0, + "grad_norm": 2.223131351085814, + "language_loss": 0.7869435, + "learning_rate": 2.3787864053168747e-06, + "loss": 0.80866092, + "num_input_tokens_seen": 162784755, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.1260376, + "step": 7587, + "time_per_iteration": 2.4834306240081787 + }, + { + "auxiliary_loss_clip": 0.01133982, + "auxiliary_loss_mlp": 0.01046702, + "balance_loss_clip": 1.04871941, + "balance_loss_mlp": 1.03174162, + "epoch": 0.4562152412445513, + "flos": 18330709944960.0, + "grad_norm": 2.1801631167215167, + "language_loss": 0.69295865, + "learning_rate": 2.378403985195863e-06, + "loss": 0.71476549, + "num_input_tokens_seen": 162803850, + "router_z_loss_clip": 0.85253906, + "router_z_loss_mlp": 0.14953613, + "step": 7588, + "time_per_iteration": 2.407653331756592 + }, + { + "auxiliary_loss_clip": 0.01135032, + "auxiliary_loss_mlp": 0.01036711, + "balance_loss_clip": 1.0561527, + "balance_loss_mlp": 1.02360415, + "epoch": 0.4562753644972193, + "flos": 13516525814400.0, + "grad_norm": 1.610850865365428, + "language_loss": 0.79319823, + "learning_rate": 2.378021550725735e-06, + "loss": 0.81491566, + "num_input_tokens_seen": 162820775, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.13110352, + "step": 7589, + "time_per_iteration": 2.4562528133392334 + }, + { + "auxiliary_loss_clip": 0.01126422, + "auxiliary_loss_mlp": 0.01034763, + "balance_loss_clip": 1.04731107, + "balance_loss_mlp": 1.02038014, + "epoch": 0.45633548774988725, + "flos": 29639697701760.0, + "grad_norm": 3.8875841077219317, + "language_loss": 0.62579858, + "learning_rate": 2.377639101920992e-06, + "loss": 0.64741045, + "num_input_tokens_seen": 162839695, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.14379883, + "step": 7590, + "time_per_iteration": 2.501558780670166 + }, + { + "auxiliary_loss_clip": 0.01126831, + "auxiliary_loss_mlp": 0.01040212, + "balance_loss_clip": 1.04782629, + "balance_loss_mlp": 1.02734375, + "epoch": 0.4563956110025552, + "flos": 22233409528320.0, + "grad_norm": 1.8885233301990736, + "language_loss": 0.730492, + "learning_rate": 2.377256638796135e-06, + "loss": 0.75216246, + "num_input_tokens_seen": 162856095, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.12854004, + "step": 7591, + "time_per_iteration": 2.4462356567382812 + }, + { + "auxiliary_loss_clip": 0.01135853, + "auxiliary_loss_mlp": 0.01037992, + "balance_loss_clip": 1.05363739, + "balance_loss_mlp": 1.02346075, + "epoch": 0.45645573425522323, + "flos": 17092043389440.0, + "grad_norm": 2.1402711712092466, + "language_loss": 0.76784593, + "learning_rate": 2.3768741613656695e-06, + "loss": 0.7895844, + "num_input_tokens_seen": 162874070, + "router_z_loss_clip": 0.82275391, + "router_z_loss_mlp": 0.14526367, + "step": 7592, + "time_per_iteration": 2.4538142681121826 + }, + { + "auxiliary_loss_clip": 0.01130225, + "auxiliary_loss_mlp": 0.01031909, + "balance_loss_clip": 1.0489341, + "balance_loss_mlp": 1.01840234, + "epoch": 0.4565158575078912, + "flos": 20332334309760.0, + "grad_norm": 2.3259146499127947, + "language_loss": 0.69549644, + "learning_rate": 2.376491669644098e-06, + "loss": 0.71711779, + "num_input_tokens_seen": 162891000, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.13513184, + "step": 7593, + "time_per_iteration": 3.946261167526245 + }, + { + "auxiliary_loss_clip": 0.01123622, + "auxiliary_loss_mlp": 0.01029494, + "balance_loss_clip": 1.04784405, + "balance_loss_mlp": 1.017102, + "epoch": 0.45657598076055916, + "flos": 23983013093760.0, + "grad_norm": 1.9447952039613328, + "language_loss": 0.83924645, + "learning_rate": 2.3761091636459248e-06, + "loss": 0.86077762, + "num_input_tokens_seen": 162910120, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.1237793, + "step": 7594, + "time_per_iteration": 2.4964215755462646 + }, + { + "auxiliary_loss_clip": 0.01057839, + "auxiliary_loss_mlp": 0.01006236, + "balance_loss_clip": 1.03072345, + "balance_loss_mlp": 1.00479341, + "epoch": 0.45663610401322713, + "flos": 69364297526400.0, + "grad_norm": 0.7944400347636348, + "language_loss": 0.52753782, + "learning_rate": 2.375726643385654e-06, + "loss": 0.54817855, + "num_input_tokens_seen": 162963720, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.01441956, + "step": 7595, + "time_per_iteration": 3.083261251449585 + }, + { + "auxiliary_loss_clip": 0.01131864, + "auxiliary_loss_mlp": 0.01046245, + "balance_loss_clip": 1.04852033, + "balance_loss_mlp": 1.02947271, + "epoch": 0.4566962272658951, + "flos": 15149095891200.0, + "grad_norm": 2.166470080359409, + "language_loss": 0.87002945, + "learning_rate": 2.3753441088777915e-06, + "loss": 0.89181054, + "num_input_tokens_seen": 162975760, + "router_z_loss_clip": 0.83349609, + "router_z_loss_mlp": 0.16772461, + "step": 7596, + "time_per_iteration": 2.3937411308288574 + }, + { + "auxiliary_loss_clip": 0.01139408, + "auxiliary_loss_mlp": 0.01042626, + "balance_loss_clip": 1.05893731, + "balance_loss_mlp": 1.02931046, + "epoch": 0.45675635051856306, + "flos": 18697465762560.0, + "grad_norm": 1.583780723182871, + "language_loss": 0.77235472, + "learning_rate": 2.374961560136843e-06, + "loss": 0.79417503, + "num_input_tokens_seen": 162994865, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.13323975, + "step": 7597, + "time_per_iteration": 2.483954429626465 + }, + { + "auxiliary_loss_clip": 0.01137401, + "auxiliary_loss_mlp": 0.01033542, + "balance_loss_clip": 1.0556227, + "balance_loss_mlp": 1.01902223, + "epoch": 0.456816473771231, + "flos": 19098300608640.0, + "grad_norm": 3.4353030949958656, + "language_loss": 0.78393793, + "learning_rate": 2.374578997177314e-06, + "loss": 0.80564737, + "num_input_tokens_seen": 163014730, + "router_z_loss_clip": 0.81738281, + "router_z_loss_mlp": 0.1451416, + "step": 7598, + "time_per_iteration": 2.5070950984954834 + }, + { + "auxiliary_loss_clip": 0.01137267, + "auxiliary_loss_mlp": 0.01035964, + "balance_loss_clip": 1.05682874, + "balance_loss_mlp": 1.02317262, + "epoch": 0.456876597023899, + "flos": 28950069507840.0, + "grad_norm": 4.482666034630311, + "language_loss": 0.71311593, + "learning_rate": 2.374196420013712e-06, + "loss": 0.73484826, + "num_input_tokens_seen": 163033405, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.12786865, + "step": 7599, + "time_per_iteration": 2.5293188095092773 + }, + { + "auxiliary_loss_clip": 0.01127465, + "auxiliary_loss_mlp": 0.01037999, + "balance_loss_clip": 1.04606211, + "balance_loss_mlp": 1.02411675, + "epoch": 0.45693672027656695, + "flos": 23289470317440.0, + "grad_norm": 1.7951722235880996, + "language_loss": 0.69710588, + "learning_rate": 2.373813828660544e-06, + "loss": 0.71876055, + "num_input_tokens_seen": 163051400, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.13879395, + "step": 7600, + "time_per_iteration": 2.465911388397217 + }, + { + "auxiliary_loss_clip": 0.01132587, + "auxiliary_loss_mlp": 0.01036629, + "balance_loss_clip": 1.05355966, + "balance_loss_mlp": 1.02393353, + "epoch": 0.4569968435292349, + "flos": 20558212986240.0, + "grad_norm": 1.8001102575665708, + "language_loss": 0.78917241, + "learning_rate": 2.373431223132319e-06, + "loss": 0.81086457, + "num_input_tokens_seen": 163069250, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.12695312, + "step": 7601, + "time_per_iteration": 2.469521999359131 + }, + { + "auxiliary_loss_clip": 0.01140419, + "auxiliary_loss_mlp": 0.01037486, + "balance_loss_clip": 1.05844307, + "balance_loss_mlp": 1.02434886, + "epoch": 0.4570569667819029, + "flos": 41282619223680.0, + "grad_norm": 2.680945621888624, + "language_loss": 0.71817434, + "learning_rate": 2.3730486034435448e-06, + "loss": 0.7399534, + "num_input_tokens_seen": 163091755, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.13128662, + "step": 7602, + "time_per_iteration": 2.6329846382141113 + }, + { + "auxiliary_loss_clip": 0.01130368, + "auxiliary_loss_mlp": 0.01035239, + "balance_loss_clip": 1.05002403, + "balance_loss_mlp": 1.02009344, + "epoch": 0.45711709003457085, + "flos": 26031573555840.0, + "grad_norm": 2.7988683886199035, + "language_loss": 0.73276424, + "learning_rate": 2.372665969608729e-06, + "loss": 0.75442028, + "num_input_tokens_seen": 163111600, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.15118408, + "step": 7603, + "time_per_iteration": 2.563314437866211 + }, + { + "auxiliary_loss_clip": 0.01132535, + "auxiliary_loss_mlp": 0.0104056, + "balance_loss_clip": 1.05370831, + "balance_loss_mlp": 1.0260464, + "epoch": 0.4571772132872388, + "flos": 22158068751360.0, + "grad_norm": 1.7774203349253046, + "language_loss": 0.83343118, + "learning_rate": 2.372283321642383e-06, + "loss": 0.85516214, + "num_input_tokens_seen": 163127350, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.14520264, + "step": 7604, + "time_per_iteration": 3.98478364944458 + }, + { + "auxiliary_loss_clip": 0.01138286, + "auxiliary_loss_mlp": 0.01048288, + "balance_loss_clip": 1.05421162, + "balance_loss_mlp": 1.0325166, + "epoch": 0.45723733653990684, + "flos": 23878872587520.0, + "grad_norm": 1.7027626360453936, + "language_loss": 0.8620311, + "learning_rate": 2.371900659559016e-06, + "loss": 0.88389683, + "num_input_tokens_seen": 163145855, + "router_z_loss_clip": 0.84179688, + "router_z_loss_mlp": 0.15771484, + "step": 7605, + "time_per_iteration": 2.5242385864257812 + }, + { + "auxiliary_loss_clip": 0.01125949, + "auxiliary_loss_mlp": 0.01037321, + "balance_loss_clip": 1.04472804, + "balance_loss_mlp": 1.02283669, + "epoch": 0.4572974597925748, + "flos": 16871803148160.0, + "grad_norm": 1.7127892560437068, + "language_loss": 0.73484164, + "learning_rate": 2.371517983373138e-06, + "loss": 0.75647432, + "num_input_tokens_seen": 163163830, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.14477539, + "step": 7606, + "time_per_iteration": 2.5347955226898193 + }, + { + "auxiliary_loss_clip": 0.01133935, + "auxiliary_loss_mlp": 0.01041691, + "balance_loss_clip": 1.05282521, + "balance_loss_mlp": 1.02514482, + "epoch": 0.45735758304524277, + "flos": 13771491528960.0, + "grad_norm": 2.971381651179521, + "language_loss": 0.80554223, + "learning_rate": 2.371135293099262e-06, + "loss": 0.82729852, + "num_input_tokens_seen": 163180700, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.16540527, + "step": 7607, + "time_per_iteration": 2.496002435684204 + }, + { + "auxiliary_loss_clip": 0.01137596, + "auxiliary_loss_mlp": 0.01041168, + "balance_loss_clip": 1.05750728, + "balance_loss_mlp": 1.02528954, + "epoch": 0.45741770629791073, + "flos": 21100750986240.0, + "grad_norm": 5.407330098352683, + "language_loss": 0.80892789, + "learning_rate": 2.3707525887518982e-06, + "loss": 0.83071554, + "num_input_tokens_seen": 163199450, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.15893555, + "step": 7608, + "time_per_iteration": 2.5402016639709473 + }, + { + "auxiliary_loss_clip": 0.01134142, + "auxiliary_loss_mlp": 0.01039266, + "balance_loss_clip": 1.05387342, + "balance_loss_mlp": 1.02510428, + "epoch": 0.4574778295505787, + "flos": 23112898035840.0, + "grad_norm": 1.8815606627433583, + "language_loss": 0.68397379, + "learning_rate": 2.370369870345559e-06, + "loss": 0.70570785, + "num_input_tokens_seen": 163217875, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.14172363, + "step": 7609, + "time_per_iteration": 2.487692356109619 + }, + { + "auxiliary_loss_clip": 0.01129891, + "auxiliary_loss_mlp": 0.01049527, + "balance_loss_clip": 1.04857588, + "balance_loss_mlp": 1.03470922, + "epoch": 0.45753795280324666, + "flos": 24352929308160.0, + "grad_norm": 1.8420995206591548, + "language_loss": 0.80840695, + "learning_rate": 2.369987137894757e-06, + "loss": 0.83020115, + "num_input_tokens_seen": 163237430, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.14825439, + "step": 7610, + "time_per_iteration": 4.168868780136108 + }, + { + "auxiliary_loss_clip": 0.01133693, + "auxiliary_loss_mlp": 0.01036723, + "balance_loss_clip": 1.05158055, + "balance_loss_mlp": 1.02240038, + "epoch": 0.4575980760559146, + "flos": 16653789550080.0, + "grad_norm": 2.2224618726113685, + "language_loss": 0.82052588, + "learning_rate": 2.3696043914140057e-06, + "loss": 0.84223008, + "num_input_tokens_seen": 163253905, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.14312744, + "step": 7611, + "time_per_iteration": 2.488067626953125 + }, + { + "auxiliary_loss_clip": 0.01139117, + "auxiliary_loss_mlp": 0.01033043, + "balance_loss_clip": 1.05802774, + "balance_loss_mlp": 1.0183084, + "epoch": 0.4576581993085826, + "flos": 35911423912320.0, + "grad_norm": 1.9656645514736366, + "language_loss": 0.73817086, + "learning_rate": 2.369221630917819e-06, + "loss": 0.7598924, + "num_input_tokens_seen": 163274285, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.14733887, + "step": 7612, + "time_per_iteration": 2.6983115673065186 + }, + { + "auxiliary_loss_clip": 0.0113277, + "auxiliary_loss_mlp": 0.01037376, + "balance_loss_clip": 1.05286658, + "balance_loss_mlp": 1.02301121, + "epoch": 0.45771832256125056, + "flos": 20080421251200.0, + "grad_norm": 1.6000900979435486, + "language_loss": 0.85210121, + "learning_rate": 2.368838856420711e-06, + "loss": 0.87380278, + "num_input_tokens_seen": 163293150, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.14379883, + "step": 7613, + "time_per_iteration": 2.592169761657715 + }, + { + "auxiliary_loss_clip": 0.01131839, + "auxiliary_loss_mlp": 0.01033646, + "balance_loss_clip": 1.04984438, + "balance_loss_mlp": 1.01993084, + "epoch": 0.4577784458139185, + "flos": 10744329957120.0, + "grad_norm": 2.079700648699431, + "language_loss": 0.75485283, + "learning_rate": 2.3684560679371965e-06, + "loss": 0.77650768, + "num_input_tokens_seen": 163310065, + "router_z_loss_clip": 0.81982422, + "router_z_loss_mlp": 0.13726807, + "step": 7614, + "time_per_iteration": 2.447380542755127 + }, + { + "auxiliary_loss_clip": 0.01131123, + "auxiliary_loss_mlp": 0.01033316, + "balance_loss_clip": 1.05051064, + "balance_loss_mlp": 1.01958346, + "epoch": 0.4578385690665865, + "flos": 21907269014400.0, + "grad_norm": 1.552262571833737, + "language_loss": 0.74688876, + "learning_rate": 2.368073265481791e-06, + "loss": 0.76853311, + "num_input_tokens_seen": 163329415, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.1373291, + "step": 7615, + "time_per_iteration": 2.5122618675231934 + }, + { + "auxiliary_loss_clip": 0.01058055, + "auxiliary_loss_mlp": 0.01012032, + "balance_loss_clip": 1.03133798, + "balance_loss_mlp": 1.0103035, + "epoch": 0.45789869231925445, + "flos": 64758286667520.0, + "grad_norm": 0.7726359224792398, + "language_loss": 0.57652897, + "learning_rate": 2.3676904490690105e-06, + "loss": 0.59722984, + "num_input_tokens_seen": 163385875, + "router_z_loss_clip": 0.26708984, + "router_z_loss_mlp": 0.01727295, + "step": 7616, + "time_per_iteration": 3.019523859024048 + }, + { + "auxiliary_loss_clip": 0.01133137, + "auxiliary_loss_mlp": 0.01040099, + "balance_loss_clip": 1.05277824, + "balance_loss_mlp": 1.0259254, + "epoch": 0.4579588155719224, + "flos": 16144001775360.0, + "grad_norm": 1.9679949077708838, + "language_loss": 0.71116853, + "learning_rate": 2.3673076187133704e-06, + "loss": 0.73290086, + "num_input_tokens_seen": 163405170, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.14160156, + "step": 7617, + "time_per_iteration": 2.4761404991149902 + }, + { + "auxiliary_loss_clip": 0.01141523, + "auxiliary_loss_mlp": 0.01035203, + "balance_loss_clip": 1.05911922, + "balance_loss_mlp": 1.02091527, + "epoch": 0.45801893882459044, + "flos": 21395541905280.0, + "grad_norm": 1.9072160582178714, + "language_loss": 0.76263362, + "learning_rate": 2.36692477442939e-06, + "loss": 0.78440094, + "num_input_tokens_seen": 163423155, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.1428833, + "step": 7618, + "time_per_iteration": 2.4406423568725586 + }, + { + "auxiliary_loss_clip": 0.01139397, + "auxiliary_loss_mlp": 0.01042933, + "balance_loss_clip": 1.05373955, + "balance_loss_mlp": 1.02930737, + "epoch": 0.4580790620772584, + "flos": 19536554448000.0, + "grad_norm": 7.821553842761325, + "language_loss": 0.7677108, + "learning_rate": 2.366541916231585e-06, + "loss": 0.78953415, + "num_input_tokens_seen": 163442450, + "router_z_loss_clip": 0.85693359, + "router_z_loss_mlp": 0.13616943, + "step": 7619, + "time_per_iteration": 2.490830898284912 + }, + { + "auxiliary_loss_clip": 0.01131415, + "auxiliary_loss_mlp": 0.01039055, + "balance_loss_clip": 1.0539428, + "balance_loss_mlp": 1.02646613, + "epoch": 0.45813918532992637, + "flos": 16581070465920.0, + "grad_norm": 2.477671270175933, + "language_loss": 0.71673769, + "learning_rate": 2.366159044134473e-06, + "loss": 0.73844242, + "num_input_tokens_seen": 163459810, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12579346, + "step": 7620, + "time_per_iteration": 2.471621036529541 + }, + { + "auxiliary_loss_clip": 0.0113116, + "auxiliary_loss_mlp": 0.01033241, + "balance_loss_clip": 1.05356932, + "balance_loss_mlp": 1.02022338, + "epoch": 0.45819930858259433, + "flos": 42230301701760.0, + "grad_norm": 1.579309001930781, + "language_loss": 0.78209502, + "learning_rate": 2.3657761581525748e-06, + "loss": 0.80373907, + "num_input_tokens_seen": 163482970, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.13018799, + "step": 7621, + "time_per_iteration": 2.715318441390991 + }, + { + "auxiliary_loss_clip": 0.01060221, + "auxiliary_loss_mlp": 0.01010828, + "balance_loss_clip": 1.03301549, + "balance_loss_mlp": 1.00944376, + "epoch": 0.4582594318352623, + "flos": 63714795638400.0, + "grad_norm": 0.7808697377559364, + "language_loss": 0.64969504, + "learning_rate": 2.3653932583004063e-06, + "loss": 0.67040551, + "num_input_tokens_seen": 163545330, + "router_z_loss_clip": 0.27246094, + "router_z_loss_mlp": 0.01383972, + "step": 7622, + "time_per_iteration": 3.0689520835876465 + }, + { + "auxiliary_loss_clip": 0.01136042, + "auxiliary_loss_mlp": 0.01029023, + "balance_loss_clip": 1.05410337, + "balance_loss_mlp": 1.01497436, + "epoch": 0.45831955508793026, + "flos": 26869979882880.0, + "grad_norm": 1.8503505472574264, + "language_loss": 0.79791892, + "learning_rate": 2.3650103445924903e-06, + "loss": 0.81956959, + "num_input_tokens_seen": 163564620, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.14044189, + "step": 7623, + "time_per_iteration": 2.6240811347961426 + }, + { + "auxiliary_loss_clip": 0.01137532, + "auxiliary_loss_mlp": 0.01041203, + "balance_loss_clip": 1.05605006, + "balance_loss_mlp": 1.02760684, + "epoch": 0.45837967834059823, + "flos": 18733951002240.0, + "grad_norm": 2.06876921114356, + "language_loss": 0.70654428, + "learning_rate": 2.3646274170433452e-06, + "loss": 0.72833163, + "num_input_tokens_seen": 163581010, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.1361084, + "step": 7624, + "time_per_iteration": 2.4503870010375977 + }, + { + "auxiliary_loss_clip": 0.01133442, + "auxiliary_loss_mlp": 0.01033411, + "balance_loss_clip": 1.05285192, + "balance_loss_mlp": 1.01959503, + "epoch": 0.4584398015932662, + "flos": 21178102924800.0, + "grad_norm": 2.246886080602895, + "language_loss": 0.73131996, + "learning_rate": 2.364244475667491e-06, + "loss": 0.75298858, + "num_input_tokens_seen": 163599955, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.13830566, + "step": 7625, + "time_per_iteration": 2.494807481765747 + }, + { + "auxiliary_loss_clip": 0.01138633, + "auxiliary_loss_mlp": 0.01035517, + "balance_loss_clip": 1.05517554, + "balance_loss_mlp": 1.02182615, + "epoch": 0.45849992484593416, + "flos": 19790047704960.0, + "grad_norm": 2.093605011227263, + "language_loss": 0.77921844, + "learning_rate": 2.363861520479451e-06, + "loss": 0.80096, + "num_input_tokens_seen": 163618545, + "router_z_loss_clip": 0.83496094, + "router_z_loss_mlp": 0.13684082, + "step": 7626, + "time_per_iteration": 2.5362424850463867 + }, + { + "auxiliary_loss_clip": 0.01135518, + "auxiliary_loss_mlp": 0.01037662, + "balance_loss_clip": 1.05001426, + "balance_loss_mlp": 1.02308822, + "epoch": 0.4585600480986021, + "flos": 18223265387520.0, + "grad_norm": 1.6644101712138288, + "language_loss": 0.84727359, + "learning_rate": 2.3634785514937445e-06, + "loss": 0.86900538, + "num_input_tokens_seen": 163636055, + "router_z_loss_clip": 0.85449219, + "router_z_loss_mlp": 0.14575195, + "step": 7627, + "time_per_iteration": 3.957685708999634 + }, + { + "auxiliary_loss_clip": 0.01135198, + "auxiliary_loss_mlp": 0.010371, + "balance_loss_clip": 1.05135357, + "balance_loss_mlp": 1.02337945, + "epoch": 0.4586201713512701, + "flos": 29022213974400.0, + "grad_norm": 1.8438471236055296, + "language_loss": 0.69553262, + "learning_rate": 2.3630955687248953e-06, + "loss": 0.71725559, + "num_input_tokens_seen": 163657485, + "router_z_loss_clip": 0.83789062, + "router_z_loss_mlp": 0.13726807, + "step": 7628, + "time_per_iteration": 2.5937788486480713 + }, + { + "auxiliary_loss_clip": 0.01131644, + "auxiliary_loss_mlp": 0.0103265, + "balance_loss_clip": 1.05038249, + "balance_loss_mlp": 1.01894712, + "epoch": 0.45868029460393805, + "flos": 23404600385280.0, + "grad_norm": 1.540906698029129, + "language_loss": 0.78499854, + "learning_rate": 2.3627125721874265e-06, + "loss": 0.80664146, + "num_input_tokens_seen": 163676030, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.13708496, + "step": 7629, + "time_per_iteration": 2.4573869705200195 + }, + { + "auxiliary_loss_clip": 0.01141457, + "auxiliary_loss_mlp": 0.01041907, + "balance_loss_clip": 1.05229545, + "balance_loss_mlp": 1.0264039, + "epoch": 0.458740417856606, + "flos": 18221972497920.0, + "grad_norm": 2.1374003500278436, + "language_loss": 0.79089642, + "learning_rate": 2.3623295618958595e-06, + "loss": 0.81273007, + "num_input_tokens_seen": 163694490, + "router_z_loss_clip": 0.89257812, + "router_z_loss_mlp": 0.15490723, + "step": 7630, + "time_per_iteration": 2.4597713947296143 + }, + { + "auxiliary_loss_clip": 0.01142287, + "auxiliary_loss_mlp": 0.01034397, + "balance_loss_clip": 1.05888057, + "balance_loss_mlp": 1.02040792, + "epoch": 0.458800541109274, + "flos": 34568760504960.0, + "grad_norm": 2.009179843880045, + "language_loss": 0.72115111, + "learning_rate": 2.3619465378647198e-06, + "loss": 0.74291801, + "num_input_tokens_seen": 163717035, + "router_z_loss_clip": 0.83447266, + "router_z_loss_mlp": 0.13995361, + "step": 7631, + "time_per_iteration": 2.6177337169647217 + }, + { + "auxiliary_loss_clip": 0.0113233, + "auxiliary_loss_mlp": 0.01039498, + "balance_loss_clip": 1.05214798, + "balance_loss_mlp": 1.02493644, + "epoch": 0.458860664361942, + "flos": 17712112896000.0, + "grad_norm": 2.5260935055588, + "language_loss": 0.71923637, + "learning_rate": 2.361563500108531e-06, + "loss": 0.74095458, + "num_input_tokens_seen": 163734525, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.14556885, + "step": 7632, + "time_per_iteration": 2.4732937812805176 + }, + { + "auxiliary_loss_clip": 0.0113275, + "auxiliary_loss_mlp": 0.01036358, + "balance_loss_clip": 1.0494194, + "balance_loss_mlp": 1.02115917, + "epoch": 0.45892078761460997, + "flos": 18441889516800.0, + "grad_norm": 2.4513774637888206, + "language_loss": 0.68892801, + "learning_rate": 2.3611804486418178e-06, + "loss": 0.71061903, + "num_input_tokens_seen": 163752860, + "router_z_loss_clip": 0.83349609, + "router_z_loss_mlp": 0.15209961, + "step": 7633, + "time_per_iteration": 2.4663007259368896 + }, + { + "auxiliary_loss_clip": 0.01136788, + "auxiliary_loss_mlp": 0.0103522, + "balance_loss_clip": 1.05156136, + "balance_loss_mlp": 1.0209384, + "epoch": 0.45898091086727794, + "flos": 22672956257280.0, + "grad_norm": 1.5218162062765812, + "language_loss": 0.80617148, + "learning_rate": 2.3607973834791062e-06, + "loss": 0.82789153, + "num_input_tokens_seen": 163772495, + "router_z_loss_clip": 0.85058594, + "router_z_loss_mlp": 0.14282227, + "step": 7634, + "time_per_iteration": 2.5360488891601562 + }, + { + "auxiliary_loss_clip": 0.01137872, + "auxiliary_loss_mlp": 0.01041517, + "balance_loss_clip": 1.04855704, + "balance_loss_mlp": 1.02526879, + "epoch": 0.4590410341199459, + "flos": 21652949744640.0, + "grad_norm": 1.7293796476218914, + "language_loss": 0.81362718, + "learning_rate": 2.3604143046349216e-06, + "loss": 0.83542109, + "num_input_tokens_seen": 163791475, + "router_z_loss_clip": 0.89306641, + "router_z_loss_mlp": 0.16235352, + "step": 7635, + "time_per_iteration": 2.4440982341766357 + }, + { + "auxiliary_loss_clip": 0.01138144, + "auxiliary_loss_mlp": 0.01037681, + "balance_loss_clip": 1.05763876, + "balance_loss_mlp": 1.02379358, + "epoch": 0.45910115737261387, + "flos": 36535372087680.0, + "grad_norm": 1.5300419770104574, + "language_loss": 0.64639902, + "learning_rate": 2.3600312121237905e-06, + "loss": 0.66815728, + "num_input_tokens_seen": 163812995, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.13903809, + "step": 7636, + "time_per_iteration": 2.747983694076538 + }, + { + "auxiliary_loss_clip": 0.01129104, + "auxiliary_loss_mlp": 0.01033475, + "balance_loss_clip": 1.05285239, + "balance_loss_mlp": 1.0198437, + "epoch": 0.45916128062528183, + "flos": 24419866302720.0, + "grad_norm": 1.4686142099120352, + "language_loss": 0.8039366, + "learning_rate": 2.3596481059602395e-06, + "loss": 0.82556242, + "num_input_tokens_seen": 163833945, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.13616943, + "step": 7637, + "time_per_iteration": 3.996957778930664 + }, + { + "auxiliary_loss_clip": 0.01137935, + "auxiliary_loss_mlp": 0.01033868, + "balance_loss_clip": 1.0561105, + "balance_loss_mlp": 1.0183115, + "epoch": 0.4592214038779498, + "flos": 23221958705280.0, + "grad_norm": 1.4338624451461845, + "language_loss": 0.7516706, + "learning_rate": 2.3592649861587965e-06, + "loss": 0.77338868, + "num_input_tokens_seen": 163853885, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.15563965, + "step": 7638, + "time_per_iteration": 2.4823620319366455 + }, + { + "auxiliary_loss_clip": 0.01132563, + "auxiliary_loss_mlp": 0.01037393, + "balance_loss_clip": 1.05536771, + "balance_loss_mlp": 1.0231595, + "epoch": 0.45928152713061776, + "flos": 19172133014400.0, + "grad_norm": 2.0728592852907144, + "language_loss": 0.73852789, + "learning_rate": 2.358881852733989e-06, + "loss": 0.76022744, + "num_input_tokens_seen": 163871855, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.14227295, + "step": 7639, + "time_per_iteration": 2.4682557582855225 + }, + { + "auxiliary_loss_clip": 0.01132165, + "auxiliary_loss_mlp": 0.01040031, + "balance_loss_clip": 1.0509547, + "balance_loss_mlp": 1.02564812, + "epoch": 0.4593416503832857, + "flos": 22414686491520.0, + "grad_norm": 1.9563769797109347, + "language_loss": 0.67926109, + "learning_rate": 2.358498705700346e-06, + "loss": 0.70098299, + "num_input_tokens_seen": 163891450, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.1439209, + "step": 7640, + "time_per_iteration": 2.4587323665618896 + }, + { + "auxiliary_loss_clip": 0.01138279, + "auxiliary_loss_mlp": 0.01035941, + "balance_loss_clip": 1.05384398, + "balance_loss_mlp": 1.02124226, + "epoch": 0.4594017736359537, + "flos": 18880215183360.0, + "grad_norm": 1.9248059440189238, + "language_loss": 0.75613737, + "learning_rate": 2.3581155450723958e-06, + "loss": 0.7778796, + "num_input_tokens_seen": 163909345, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.14685059, + "step": 7641, + "time_per_iteration": 2.4621143341064453 + }, + { + "auxiliary_loss_clip": 0.01139704, + "auxiliary_loss_mlp": 0.01033487, + "balance_loss_clip": 1.0567956, + "balance_loss_mlp": 1.01887774, + "epoch": 0.45946189688862166, + "flos": 20518567349760.0, + "grad_norm": 1.7456942972776288, + "language_loss": 0.74536878, + "learning_rate": 2.357732370864668e-06, + "loss": 0.76710069, + "num_input_tokens_seen": 163926940, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.14593506, + "step": 7642, + "time_per_iteration": 2.4422223567962646 + }, + { + "auxiliary_loss_clip": 0.01077411, + "auxiliary_loss_mlp": 0.01008447, + "balance_loss_clip": 1.048262, + "balance_loss_mlp": 1.00706232, + "epoch": 0.4595220201412896, + "flos": 61405990162560.0, + "grad_norm": 0.846916321790412, + "language_loss": 0.58166271, + "learning_rate": 2.357349183091694e-06, + "loss": 0.6025213, + "num_input_tokens_seen": 163977785, + "router_z_loss_clip": 0.29150391, + "router_z_loss_mlp": 0.01383972, + "step": 7643, + "time_per_iteration": 2.8590712547302246 + }, + { + "auxiliary_loss_clip": 0.01140026, + "auxiliary_loss_mlp": 0.01034347, + "balance_loss_clip": 1.05335712, + "balance_loss_mlp": 1.02019143, + "epoch": 0.4595821433939576, + "flos": 23330947547520.0, + "grad_norm": 1.5053991037160936, + "language_loss": 0.93138552, + "learning_rate": 2.3569659817680016e-06, + "loss": 0.95312929, + "num_input_tokens_seen": 163996630, + "router_z_loss_clip": 0.86669922, + "router_z_loss_mlp": 0.14160156, + "step": 7644, + "time_per_iteration": 2.4401462078094482 + }, + { + "auxiliary_loss_clip": 0.01136546, + "auxiliary_loss_mlp": 0.01042115, + "balance_loss_clip": 1.05432081, + "balance_loss_mlp": 1.02794743, + "epoch": 0.4596422666466256, + "flos": 14282356711680.0, + "grad_norm": 3.5979380438069617, + "language_loss": 0.82374012, + "learning_rate": 2.3565827669081243e-06, + "loss": 0.84552681, + "num_input_tokens_seen": 164013190, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.14178467, + "step": 7645, + "time_per_iteration": 2.453457832336426 + }, + { + "auxiliary_loss_clip": 0.01103297, + "auxiliary_loss_mlp": 0.01010224, + "balance_loss_clip": 1.07513535, + "balance_loss_mlp": 1.00741696, + "epoch": 0.4597023898992936, + "flos": 65727337737600.0, + "grad_norm": 0.7619590880238438, + "language_loss": 0.5983929, + "learning_rate": 2.356199538526593e-06, + "loss": 0.61952817, + "num_input_tokens_seen": 164074030, + "router_z_loss_clip": 0.28173828, + "router_z_loss_mlp": 0.02810669, + "step": 7646, + "time_per_iteration": 3.007782220840454 + }, + { + "auxiliary_loss_clip": 0.01132848, + "auxiliary_loss_mlp": 0.01032141, + "balance_loss_clip": 1.05201638, + "balance_loss_mlp": 1.01800919, + "epoch": 0.45976251315196154, + "flos": 26907075653760.0, + "grad_norm": 1.721588543109116, + "language_loss": 0.72423708, + "learning_rate": 2.355816296637939e-06, + "loss": 0.74588698, + "num_input_tokens_seen": 164095515, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.14129639, + "step": 7647, + "time_per_iteration": 2.534972667694092 + }, + { + "auxiliary_loss_clip": 0.01133303, + "auxiliary_loss_mlp": 0.01036985, + "balance_loss_clip": 1.05032825, + "balance_loss_mlp": 1.0220958, + "epoch": 0.4598226364046295, + "flos": 26618066824320.0, + "grad_norm": 3.6045287200821097, + "language_loss": 0.66936481, + "learning_rate": 2.3554330412566957e-06, + "loss": 0.6910677, + "num_input_tokens_seen": 164117270, + "router_z_loss_clip": 0.83056641, + "router_z_loss_mlp": 0.14886475, + "step": 7648, + "time_per_iteration": 4.0442187786102295 + }, + { + "auxiliary_loss_clip": 0.01124361, + "auxiliary_loss_mlp": 0.01038886, + "balance_loss_clip": 1.04496813, + "balance_loss_mlp": 1.02521276, + "epoch": 0.45988275965729747, + "flos": 24387762522240.0, + "grad_norm": 1.591356991886547, + "language_loss": 0.78896594, + "learning_rate": 2.3550497723973953e-06, + "loss": 0.81059837, + "num_input_tokens_seen": 164137850, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.13665771, + "step": 7649, + "time_per_iteration": 2.5518805980682373 + }, + { + "auxiliary_loss_clip": 0.01121248, + "auxiliary_loss_mlp": 0.01040333, + "balance_loss_clip": 1.04244554, + "balance_loss_mlp": 1.02607584, + "epoch": 0.45994288290996543, + "flos": 24535822383360.0, + "grad_norm": 1.657285730619669, + "language_loss": 0.69186103, + "learning_rate": 2.3546664900745726e-06, + "loss": 0.7134769, + "num_input_tokens_seen": 164157960, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.14245605, + "step": 7650, + "time_per_iteration": 2.563870429992676 + }, + { + "auxiliary_loss_clip": 0.01141304, + "auxiliary_loss_mlp": 0.0104534, + "balance_loss_clip": 1.05335963, + "balance_loss_mlp": 1.02843571, + "epoch": 0.4600030061626334, + "flos": 14830245838080.0, + "grad_norm": 2.557262941997462, + "language_loss": 0.84240735, + "learning_rate": 2.354283194302761e-06, + "loss": 0.86427373, + "num_input_tokens_seen": 164174590, + "router_z_loss_clip": 0.87988281, + "router_z_loss_mlp": 0.16906738, + "step": 7651, + "time_per_iteration": 2.445340871810913 + }, + { + "auxiliary_loss_clip": 0.01134613, + "auxiliary_loss_mlp": 0.01038045, + "balance_loss_clip": 1.05504394, + "balance_loss_mlp": 1.02313817, + "epoch": 0.46006312941530136, + "flos": 18113845582080.0, + "grad_norm": 2.5778733436143, + "language_loss": 0.75139362, + "learning_rate": 2.3538998850964948e-06, + "loss": 0.77312016, + "num_input_tokens_seen": 164192935, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.14898682, + "step": 7652, + "time_per_iteration": 2.473635673522949 + }, + { + "auxiliary_loss_clip": 0.01128739, + "auxiliary_loss_mlp": 0.01032749, + "balance_loss_clip": 1.04692841, + "balance_loss_mlp": 1.01846731, + "epoch": 0.46012325266796933, + "flos": 21976468565760.0, + "grad_norm": 1.7304663358623793, + "language_loss": 0.76034343, + "learning_rate": 2.3535165624703097e-06, + "loss": 0.78195834, + "num_input_tokens_seen": 164213160, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.14282227, + "step": 7653, + "time_per_iteration": 2.5144457817077637 + }, + { + "auxiliary_loss_clip": 0.01143228, + "auxiliary_loss_mlp": 0.0104314, + "balance_loss_clip": 1.05533886, + "balance_loss_mlp": 1.02616501, + "epoch": 0.4601833759206373, + "flos": 15268068714240.0, + "grad_norm": 2.1061932586661882, + "language_loss": 0.66362822, + "learning_rate": 2.353133226438741e-06, + "loss": 0.68549192, + "num_input_tokens_seen": 164229330, + "router_z_loss_clip": 0.87841797, + "router_z_loss_mlp": 0.16967773, + "step": 7654, + "time_per_iteration": 3.903803586959839 + }, + { + "auxiliary_loss_clip": 0.0113182, + "auxiliary_loss_mlp": 0.01037175, + "balance_loss_clip": 1.04852772, + "balance_loss_mlp": 1.02252972, + "epoch": 0.46024349917330526, + "flos": 27088999061760.0, + "grad_norm": 1.7029443427407083, + "language_loss": 0.79634154, + "learning_rate": 2.3527498770163248e-06, + "loss": 0.81803143, + "num_input_tokens_seen": 164248240, + "router_z_loss_clip": 0.83349609, + "router_z_loss_mlp": 0.14648438, + "step": 7655, + "time_per_iteration": 2.4717087745666504 + }, + { + "auxiliary_loss_clip": 0.01136095, + "auxiliary_loss_mlp": 0.0103802, + "balance_loss_clip": 1.05494881, + "balance_loss_mlp": 1.02341652, + "epoch": 0.4603036224259732, + "flos": 24462923731200.0, + "grad_norm": 1.63326353431136, + "language_loss": 0.67818975, + "learning_rate": 2.3523665142175985e-06, + "loss": 0.69993091, + "num_input_tokens_seen": 164268020, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.14611816, + "step": 7656, + "time_per_iteration": 2.522716999053955 + }, + { + "auxiliary_loss_clip": 0.01134206, + "auxiliary_loss_mlp": 0.01035843, + "balance_loss_clip": 1.05138159, + "balance_loss_mlp": 1.02168655, + "epoch": 0.4603637456786412, + "flos": 28109292883200.0, + "grad_norm": 1.9735453840143067, + "language_loss": 0.80931127, + "learning_rate": 2.351983138057098e-06, + "loss": 0.83101177, + "num_input_tokens_seen": 164287305, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.14154053, + "step": 7657, + "time_per_iteration": 2.4836981296539307 + }, + { + "auxiliary_loss_clip": 0.01135753, + "auxiliary_loss_mlp": 0.01031923, + "balance_loss_clip": 1.0532105, + "balance_loss_mlp": 1.0168196, + "epoch": 0.4604238689313092, + "flos": 24348942898560.0, + "grad_norm": 2.065257662870589, + "language_loss": 0.70748842, + "learning_rate": 2.3515997485493623e-06, + "loss": 0.72916514, + "num_input_tokens_seen": 164306835, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.15112305, + "step": 7658, + "time_per_iteration": 2.5384466648101807 + }, + { + "auxiliary_loss_clip": 0.01065083, + "auxiliary_loss_mlp": 0.01007157, + "balance_loss_clip": 1.03730893, + "balance_loss_mlp": 1.00572038, + "epoch": 0.4604839921839772, + "flos": 53606229431040.0, + "grad_norm": 0.9554300509449088, + "language_loss": 0.62072861, + "learning_rate": 2.351216345708928e-06, + "loss": 0.641451, + "num_input_tokens_seen": 164367095, + "router_z_loss_clip": 0.27783203, + "router_z_loss_mlp": 0.01437378, + "step": 7659, + "time_per_iteration": 3.1513149738311768 + }, + { + "auxiliary_loss_clip": 0.0112947, + "auxiliary_loss_mlp": 0.01034717, + "balance_loss_clip": 1.05058694, + "balance_loss_mlp": 1.02014327, + "epoch": 0.46054411543664514, + "flos": 31248424126080.0, + "grad_norm": 1.6715874208695432, + "language_loss": 0.67859447, + "learning_rate": 2.350832929550336e-06, + "loss": 0.70023632, + "num_input_tokens_seen": 164388895, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.14575195, + "step": 7660, + "time_per_iteration": 2.592841148376465 + }, + { + "auxiliary_loss_clip": 0.01126898, + "auxiliary_loss_mlp": 0.01035633, + "balance_loss_clip": 1.0456593, + "balance_loss_mlp": 1.02118492, + "epoch": 0.4606042386893131, + "flos": 24092863862400.0, + "grad_norm": 4.063826394247324, + "language_loss": 0.77165025, + "learning_rate": 2.3504495000881227e-06, + "loss": 0.79327554, + "num_input_tokens_seen": 164409080, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.14434814, + "step": 7661, + "time_per_iteration": 2.5683956146240234 + }, + { + "auxiliary_loss_clip": 0.01132145, + "auxiliary_loss_mlp": 0.01040859, + "balance_loss_clip": 1.05330086, + "balance_loss_mlp": 1.02589822, + "epoch": 0.46066436194198107, + "flos": 26578457101440.0, + "grad_norm": 1.734821060327814, + "language_loss": 0.74866956, + "learning_rate": 2.3500660573368305e-06, + "loss": 0.77039957, + "num_input_tokens_seen": 164427585, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.14971924, + "step": 7662, + "time_per_iteration": 2.547745943069458 + }, + { + "auxiliary_loss_clip": 0.01138819, + "auxiliary_loss_mlp": 0.01039301, + "balance_loss_clip": 1.05210483, + "balance_loss_mlp": 1.02289796, + "epoch": 0.46072448519464904, + "flos": 17775602184960.0, + "grad_norm": 3.4917818469931374, + "language_loss": 0.78997433, + "learning_rate": 2.349682601310998e-06, + "loss": 0.81175554, + "num_input_tokens_seen": 164438455, + "router_z_loss_clip": 0.86669922, + "router_z_loss_mlp": 0.16387939, + "step": 7663, + "time_per_iteration": 2.4864695072174072 + }, + { + "auxiliary_loss_clip": 0.01125386, + "auxiliary_loss_mlp": 0.01037571, + "balance_loss_clip": 1.04686809, + "balance_loss_mlp": 1.02367711, + "epoch": 0.460784608447317, + "flos": 15086109392640.0, + "grad_norm": 2.433221718646782, + "language_loss": 0.73476303, + "learning_rate": 2.3492991320251653e-06, + "loss": 0.7563926, + "num_input_tokens_seen": 164456830, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.13873291, + "step": 7664, + "time_per_iteration": 2.458785057067871 + }, + { + "auxiliary_loss_clip": 0.01142064, + "auxiliary_loss_mlp": 0.01037086, + "balance_loss_clip": 1.05633342, + "balance_loss_mlp": 1.02307272, + "epoch": 0.46084473169998497, + "flos": 18588261438720.0, + "grad_norm": 2.243533782687606, + "language_loss": 0.72461987, + "learning_rate": 2.3489156494938753e-06, + "loss": 0.74641138, + "num_input_tokens_seen": 164475375, + "router_z_loss_clip": 0.85742188, + "router_z_loss_mlp": 0.14013672, + "step": 7665, + "time_per_iteration": 2.416287660598755 + }, + { + "auxiliary_loss_clip": 0.01133647, + "auxiliary_loss_mlp": 0.01033626, + "balance_loss_clip": 1.05226827, + "balance_loss_mlp": 1.01956511, + "epoch": 0.46090485495265293, + "flos": 19494789909120.0, + "grad_norm": 2.007798136500192, + "language_loss": 0.77955317, + "learning_rate": 2.348532153731669e-06, + "loss": 0.80122584, + "num_input_tokens_seen": 164492040, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.140625, + "step": 7666, + "time_per_iteration": 2.4590401649475098 + }, + { + "auxiliary_loss_clip": 0.01129523, + "auxiliary_loss_mlp": 0.01036972, + "balance_loss_clip": 1.04793882, + "balance_loss_mlp": 1.02123618, + "epoch": 0.4609649782053209, + "flos": 33364927163520.0, + "grad_norm": 1.4437559915672333, + "language_loss": 0.7409786, + "learning_rate": 2.348148644753088e-06, + "loss": 0.76264358, + "num_input_tokens_seen": 164513665, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.15734863, + "step": 7667, + "time_per_iteration": 2.56083083152771 + }, + { + "auxiliary_loss_clip": 0.01140818, + "auxiliary_loss_mlp": 0.01035912, + "balance_loss_clip": 1.05799687, + "balance_loss_mlp": 1.0221318, + "epoch": 0.46102510145798886, + "flos": 23769165473280.0, + "grad_norm": 1.407468500332977, + "language_loss": 0.76120639, + "learning_rate": 2.347765122572676e-06, + "loss": 0.78297371, + "num_input_tokens_seen": 164533890, + "router_z_loss_clip": 0.82861328, + "router_z_loss_mlp": 0.13781738, + "step": 7668, + "time_per_iteration": 2.480276584625244 + }, + { + "auxiliary_loss_clip": 0.01139713, + "auxiliary_loss_mlp": 0.01032013, + "balance_loss_clip": 1.06197858, + "balance_loss_mlp": 1.01955557, + "epoch": 0.4610852247106568, + "flos": 23294821443840.0, + "grad_norm": 2.073685128356064, + "language_loss": 0.77909607, + "learning_rate": 2.347381587204975e-06, + "loss": 0.80081332, + "num_input_tokens_seen": 164553815, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.12457275, + "step": 7669, + "time_per_iteration": 2.4592339992523193 + }, + { + "auxiliary_loss_clip": 0.01126558, + "auxiliary_loss_mlp": 0.01028871, + "balance_loss_clip": 1.04586446, + "balance_loss_mlp": 1.01522112, + "epoch": 0.4611453479633248, + "flos": 25447450584960.0, + "grad_norm": 1.988290984543223, + "language_loss": 0.82261938, + "learning_rate": 2.34699803866453e-06, + "loss": 0.84417361, + "num_input_tokens_seen": 164573125, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.13659668, + "step": 7670, + "time_per_iteration": 2.53410005569458 + }, + { + "auxiliary_loss_clip": 0.01124737, + "auxiliary_loss_mlp": 0.01037784, + "balance_loss_clip": 1.04565465, + "balance_loss_mlp": 1.02302635, + "epoch": 0.4612054712159928, + "flos": 21139606523520.0, + "grad_norm": 1.6652307024586244, + "language_loss": 0.63282394, + "learning_rate": 2.3466144769658845e-06, + "loss": 0.65444922, + "num_input_tokens_seen": 164592575, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.14758301, + "step": 7671, + "time_per_iteration": 3.918933391571045 + }, + { + "auxiliary_loss_clip": 0.01062308, + "auxiliary_loss_mlp": 0.01004874, + "balance_loss_clip": 1.03440452, + "balance_loss_mlp": 1.00302362, + "epoch": 0.4612655944686608, + "flos": 69959266404480.0, + "grad_norm": 0.697713645393259, + "language_loss": 0.55873775, + "learning_rate": 2.346230902123583e-06, + "loss": 0.5794096, + "num_input_tokens_seen": 164659795, + "router_z_loss_clip": 0.27832031, + "router_z_loss_mlp": 0.01849365, + "step": 7672, + "time_per_iteration": 3.1983718872070312 + }, + { + "auxiliary_loss_clip": 0.01136608, + "auxiliary_loss_mlp": 0.0104298, + "balance_loss_clip": 1.0528934, + "balance_loss_mlp": 1.02853203, + "epoch": 0.46132571772132874, + "flos": 16837149502080.0, + "grad_norm": 1.7739571330241999, + "language_loss": 0.70751047, + "learning_rate": 2.3458473141521715e-06, + "loss": 0.72930634, + "num_input_tokens_seen": 164678735, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.14453125, + "step": 7673, + "time_per_iteration": 2.5944957733154297 + }, + { + "auxiliary_loss_clip": 0.01132354, + "auxiliary_loss_mlp": 0.01038792, + "balance_loss_clip": 1.05169988, + "balance_loss_mlp": 1.02386665, + "epoch": 0.4613858409739967, + "flos": 35808935431680.0, + "grad_norm": 1.8571356706914772, + "language_loss": 0.70560247, + "learning_rate": 2.345463713066195e-06, + "loss": 0.72731388, + "num_input_tokens_seen": 164700885, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.14929199, + "step": 7674, + "time_per_iteration": 2.6142685413360596 + }, + { + "auxiliary_loss_clip": 0.01124923, + "auxiliary_loss_mlp": 0.01039829, + "balance_loss_clip": 1.04329324, + "balance_loss_mlp": 1.02578616, + "epoch": 0.4614459642266647, + "flos": 35266756567680.0, + "grad_norm": 1.510039525075644, + "language_loss": 0.65572679, + "learning_rate": 2.3450800988801996e-06, + "loss": 0.6773743, + "num_input_tokens_seen": 164726960, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.14056396, + "step": 7675, + "time_per_iteration": 2.6087100505828857 + }, + { + "auxiliary_loss_clip": 0.01053557, + "auxiliary_loss_mlp": 0.01001049, + "balance_loss_clip": 1.02596176, + "balance_loss_mlp": 0.99960053, + "epoch": 0.46150608747933264, + "flos": 66704610044160.0, + "grad_norm": 0.7448588129613802, + "language_loss": 0.58621526, + "learning_rate": 2.3446964716087327e-06, + "loss": 0.60676134, + "num_input_tokens_seen": 164788525, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.01446533, + "step": 7676, + "time_per_iteration": 3.208688497543335 + }, + { + "auxiliary_loss_clip": 0.01070251, + "auxiliary_loss_mlp": 0.01004821, + "balance_loss_clip": 1.04140365, + "balance_loss_mlp": 1.00316095, + "epoch": 0.4615662107320006, + "flos": 55830177025920.0, + "grad_norm": 0.7918345108640781, + "language_loss": 0.62681228, + "learning_rate": 2.344312831266341e-06, + "loss": 0.64756298, + "num_input_tokens_seen": 164843525, + "router_z_loss_clip": 0.28857422, + "router_z_loss_mlp": 0.01663208, + "step": 7677, + "time_per_iteration": 2.9522299766540527 + }, + { + "auxiliary_loss_clip": 0.01128504, + "auxiliary_loss_mlp": 0.01035257, + "balance_loss_clip": 1.04899335, + "balance_loss_mlp": 1.02022481, + "epoch": 0.46162633398466857, + "flos": 15483245137920.0, + "grad_norm": 2.3499397638833806, + "language_loss": 0.76076233, + "learning_rate": 2.3439291778675718e-06, + "loss": 0.78240001, + "num_input_tokens_seen": 164859895, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.15026855, + "step": 7678, + "time_per_iteration": 2.466942548751831 + }, + { + "auxiliary_loss_clip": 0.01131065, + "auxiliary_loss_mlp": 0.01038774, + "balance_loss_clip": 1.05063486, + "balance_loss_mlp": 1.02399802, + "epoch": 0.46168645723733653, + "flos": 20011437181440.0, + "grad_norm": 2.6873825406923895, + "language_loss": 0.66765243, + "learning_rate": 2.343545511426974e-06, + "loss": 0.68935078, + "num_input_tokens_seen": 164878030, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.14764404, + "step": 7679, + "time_per_iteration": 2.4299590587615967 + }, + { + "auxiliary_loss_clip": 0.01122058, + "auxiliary_loss_mlp": 0.01041079, + "balance_loss_clip": 1.04273093, + "balance_loss_mlp": 1.02745318, + "epoch": 0.4617465804900045, + "flos": 20298542590080.0, + "grad_norm": 2.023307812019226, + "language_loss": 0.70215273, + "learning_rate": 2.3431618319590963e-06, + "loss": 0.72378409, + "num_input_tokens_seen": 164895710, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.13623047, + "step": 7680, + "time_per_iteration": 3.8671135902404785 + }, + { + "auxiliary_loss_clip": 0.01133254, + "auxiliary_loss_mlp": 0.01048602, + "balance_loss_clip": 1.0494628, + "balance_loss_mlp": 1.03352797, + "epoch": 0.46180670374267246, + "flos": 22346312952960.0, + "grad_norm": 1.8586032725022203, + "language_loss": 0.63727248, + "learning_rate": 2.342778139478487e-06, + "loss": 0.65909106, + "num_input_tokens_seen": 164913365, + "router_z_loss_clip": 0.83691406, + "router_z_loss_mlp": 0.15075684, + "step": 7681, + "time_per_iteration": 2.4557201862335205 + }, + { + "auxiliary_loss_clip": 0.01150264, + "auxiliary_loss_mlp": 0.0103514, + "balance_loss_clip": 1.06817079, + "balance_loss_mlp": 1.02219403, + "epoch": 0.46186682699534043, + "flos": 19895696582400.0, + "grad_norm": 2.0527685266122004, + "language_loss": 0.66965413, + "learning_rate": 2.342394433999697e-06, + "loss": 0.69150817, + "num_input_tokens_seen": 164931620, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.12939453, + "step": 7682, + "time_per_iteration": 2.4843716621398926 + }, + { + "auxiliary_loss_clip": 0.01134203, + "auxiliary_loss_mlp": 0.01034167, + "balance_loss_clip": 1.053388, + "balance_loss_mlp": 1.0204159, + "epoch": 0.4619269502480084, + "flos": 31503569408640.0, + "grad_norm": 2.0896022465668027, + "language_loss": 0.74332327, + "learning_rate": 2.342010715537275e-06, + "loss": 0.76500696, + "num_input_tokens_seen": 164950905, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.13757324, + "step": 7683, + "time_per_iteration": 2.5474331378936768 + }, + { + "auxiliary_loss_clip": 0.01136606, + "auxiliary_loss_mlp": 0.01038808, + "balance_loss_clip": 1.05680895, + "balance_loss_mlp": 1.0253377, + "epoch": 0.46198707350067636, + "flos": 25009484054400.0, + "grad_norm": 1.8946313365447898, + "language_loss": 0.76531148, + "learning_rate": 2.3416269841057726e-06, + "loss": 0.78706574, + "num_input_tokens_seen": 164970950, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.13470459, + "step": 7684, + "time_per_iteration": 2.4792675971984863 + }, + { + "auxiliary_loss_clip": 0.01135746, + "auxiliary_loss_mlp": 0.01040928, + "balance_loss_clip": 1.05082846, + "balance_loss_mlp": 1.02639592, + "epoch": 0.4620471967533444, + "flos": 18292357198080.0, + "grad_norm": 2.1203352097638195, + "language_loss": 0.79843825, + "learning_rate": 2.3412432397197412e-06, + "loss": 0.82020503, + "num_input_tokens_seen": 164989855, + "router_z_loss_clip": 0.85009766, + "router_z_loss_mlp": 0.14526367, + "step": 7685, + "time_per_iteration": 2.5078811645507812 + }, + { + "auxiliary_loss_clip": 0.0113317, + "auxiliary_loss_mlp": 0.0104566, + "balance_loss_clip": 1.05254626, + "balance_loss_mlp": 1.03126526, + "epoch": 0.46210732000601235, + "flos": 33985104410880.0, + "grad_norm": 1.9774280333947074, + "language_loss": 0.66782725, + "learning_rate": 2.340859482393731e-06, + "loss": 0.68961549, + "num_input_tokens_seen": 165012290, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.14398193, + "step": 7686, + "time_per_iteration": 2.5729973316192627 + }, + { + "auxiliary_loss_clip": 0.01136241, + "auxiliary_loss_mlp": 0.0103291, + "balance_loss_clip": 1.0522269, + "balance_loss_mlp": 1.01832533, + "epoch": 0.4621674432586803, + "flos": 25009412227200.0, + "grad_norm": 2.2782006822063674, + "language_loss": 0.73730344, + "learning_rate": 2.340475712142296e-06, + "loss": 0.75899494, + "num_input_tokens_seen": 165030810, + "router_z_loss_clip": 0.83886719, + "router_z_loss_mlp": 0.14593506, + "step": 7687, + "time_per_iteration": 2.567814350128174 + }, + { + "auxiliary_loss_clip": 0.01124378, + "auxiliary_loss_mlp": 0.0103512, + "balance_loss_clip": 1.04453206, + "balance_loss_mlp": 1.02032614, + "epoch": 0.4622275665113483, + "flos": 22014031213440.0, + "grad_norm": 2.2360740149361997, + "language_loss": 0.74799085, + "learning_rate": 2.3400919289799873e-06, + "loss": 0.76958585, + "num_input_tokens_seen": 165050205, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.14782715, + "step": 7688, + "time_per_iteration": 2.5478951930999756 + }, + { + "auxiliary_loss_clip": 0.0112377, + "auxiliary_loss_mlp": 0.01037561, + "balance_loss_clip": 1.04339123, + "balance_loss_mlp": 1.02274871, + "epoch": 0.46228768976401624, + "flos": 24058820747520.0, + "grad_norm": 1.9758233953049484, + "language_loss": 0.79094779, + "learning_rate": 2.3397081329213585e-06, + "loss": 0.81256109, + "num_input_tokens_seen": 165069370, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.14801025, + "step": 7689, + "time_per_iteration": 2.4946203231811523 + }, + { + "auxiliary_loss_clip": 0.01134538, + "auxiliary_loss_mlp": 0.01041836, + "balance_loss_clip": 1.05005789, + "balance_loss_mlp": 1.02605319, + "epoch": 0.4623478130166842, + "flos": 26651391667200.0, + "grad_norm": 5.35242860252011, + "language_loss": 0.57230937, + "learning_rate": 2.339324323980964e-06, + "loss": 0.59407318, + "num_input_tokens_seen": 165089610, + "router_z_loss_clip": 0.84423828, + "router_z_loss_mlp": 0.15783691, + "step": 7690, + "time_per_iteration": 2.484595775604248 + }, + { + "auxiliary_loss_clip": 0.01136445, + "auxiliary_loss_mlp": 0.0103963, + "balance_loss_clip": 1.05441475, + "balance_loss_mlp": 1.02496767, + "epoch": 0.46240793626935217, + "flos": 20558428467840.0, + "grad_norm": 2.386087379859787, + "language_loss": 0.82873225, + "learning_rate": 2.3389405021733562e-06, + "loss": 0.85049301, + "num_input_tokens_seen": 165109050, + "router_z_loss_clip": 0.82080078, + "router_z_loss_mlp": 0.14654541, + "step": 7691, + "time_per_iteration": 3.9564926624298096 + }, + { + "auxiliary_loss_clip": 0.01135331, + "auxiliary_loss_mlp": 0.01028989, + "balance_loss_clip": 1.05407095, + "balance_loss_mlp": 1.01581001, + "epoch": 0.46246805952202014, + "flos": 22456055980800.0, + "grad_norm": 1.4923171450687964, + "language_loss": 0.75135267, + "learning_rate": 2.338556667513091e-06, + "loss": 0.77299595, + "num_input_tokens_seen": 165130130, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.13171387, + "step": 7692, + "time_per_iteration": 2.481430768966675 + }, + { + "auxiliary_loss_clip": 0.01136218, + "auxiliary_loss_mlp": 0.01036384, + "balance_loss_clip": 1.05408764, + "balance_loss_mlp": 1.02177525, + "epoch": 0.4625281827746881, + "flos": 35041308854400.0, + "grad_norm": 1.6271601773034852, + "language_loss": 0.74143398, + "learning_rate": 2.338172820014723e-06, + "loss": 0.76315999, + "num_input_tokens_seen": 165152685, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.14593506, + "step": 7693, + "time_per_iteration": 2.6013097763061523 + }, + { + "auxiliary_loss_clip": 0.01135097, + "auxiliary_loss_mlp": 0.01041607, + "balance_loss_clip": 1.05379081, + "balance_loss_mlp": 1.02721894, + "epoch": 0.46258830602735607, + "flos": 21068647205760.0, + "grad_norm": 1.5362537381299795, + "language_loss": 0.85553581, + "learning_rate": 2.337788959692808e-06, + "loss": 0.87730283, + "num_input_tokens_seen": 165173315, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.14385986, + "step": 7694, + "time_per_iteration": 2.49019193649292 + }, + { + "auxiliary_loss_clip": 0.01128643, + "auxiliary_loss_mlp": 0.01048774, + "balance_loss_clip": 1.04787076, + "balance_loss_mlp": 1.03268051, + "epoch": 0.46264842928002403, + "flos": 26177227205760.0, + "grad_norm": 3.474459498873572, + "language_loss": 0.79076201, + "learning_rate": 2.337405086561902e-06, + "loss": 0.81253612, + "num_input_tokens_seen": 165192395, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.16101074, + "step": 7695, + "time_per_iteration": 2.5256805419921875 + }, + { + "auxiliary_loss_clip": 0.01135998, + "auxiliary_loss_mlp": 0.01034982, + "balance_loss_clip": 1.05720401, + "balance_loss_mlp": 1.02173805, + "epoch": 0.462708552532692, + "flos": 16764214936320.0, + "grad_norm": 1.6293539561065709, + "language_loss": 0.72265983, + "learning_rate": 2.3370212006365606e-06, + "loss": 0.74436963, + "num_input_tokens_seen": 165211355, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.13238525, + "step": 7696, + "time_per_iteration": 2.456000328063965 + }, + { + "auxiliary_loss_clip": 0.01131103, + "auxiliary_loss_mlp": 0.01052937, + "balance_loss_clip": 1.04985738, + "balance_loss_mlp": 1.03572309, + "epoch": 0.46276867578535996, + "flos": 15560453422080.0, + "grad_norm": 1.7863335859273344, + "language_loss": 0.69767392, + "learning_rate": 2.3366373019313423e-06, + "loss": 0.71951431, + "num_input_tokens_seen": 165229380, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.17211914, + "step": 7697, + "time_per_iteration": 3.862807512283325 + }, + { + "auxiliary_loss_clip": 0.01134235, + "auxiliary_loss_mlp": 0.0103102, + "balance_loss_clip": 1.05264938, + "balance_loss_mlp": 1.01706696, + "epoch": 0.462828799038028, + "flos": 22415404763520.0, + "grad_norm": 1.871471784896055, + "language_loss": 0.84606451, + "learning_rate": 2.3362533904608025e-06, + "loss": 0.86771703, + "num_input_tokens_seen": 165247200, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.13952637, + "step": 7698, + "time_per_iteration": 2.5485732555389404 + }, + { + "auxiliary_loss_clip": 0.01126708, + "auxiliary_loss_mlp": 0.01036161, + "balance_loss_clip": 1.04706717, + "balance_loss_mlp": 1.02258945, + "epoch": 0.46288892229069595, + "flos": 21069580959360.0, + "grad_norm": 1.7462312591666913, + "language_loss": 0.71519947, + "learning_rate": 2.335869466239502e-06, + "loss": 0.73682821, + "num_input_tokens_seen": 165265825, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.13580322, + "step": 7699, + "time_per_iteration": 2.4413094520568848 + }, + { + "auxiliary_loss_clip": 0.01131918, + "auxiliary_loss_mlp": 0.01039168, + "balance_loss_clip": 1.04631805, + "balance_loss_mlp": 1.02393329, + "epoch": 0.4629490455433639, + "flos": 23185688947200.0, + "grad_norm": 1.9273535100523087, + "language_loss": 0.7166037, + "learning_rate": 2.335485529281996e-06, + "loss": 0.73831457, + "num_input_tokens_seen": 165284380, + "router_z_loss_clip": 0.85595703, + "router_z_loss_mlp": 0.15240479, + "step": 7700, + "time_per_iteration": 2.5050806999206543 + }, + { + "auxiliary_loss_clip": 0.01131627, + "auxiliary_loss_mlp": 0.0103369, + "balance_loss_clip": 1.0530709, + "balance_loss_mlp": 1.02027297, + "epoch": 0.4630091687960319, + "flos": 18835541642880.0, + "grad_norm": 3.6792446546603244, + "language_loss": 0.72503126, + "learning_rate": 2.3351015796028467e-06, + "loss": 0.74668443, + "num_input_tokens_seen": 165300320, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.13415527, + "step": 7701, + "time_per_iteration": 2.4577114582061768 + }, + { + "auxiliary_loss_clip": 0.01135487, + "auxiliary_loss_mlp": 0.01037145, + "balance_loss_clip": 1.05082047, + "balance_loss_mlp": 1.02230954, + "epoch": 0.46306929204869984, + "flos": 38907020407680.0, + "grad_norm": 2.0178960068735674, + "language_loss": 0.64797384, + "learning_rate": 2.3347176172166114e-06, + "loss": 0.66970015, + "num_input_tokens_seen": 165318130, + "router_z_loss_clip": 0.84667969, + "router_z_loss_mlp": 0.14837646, + "step": 7702, + "time_per_iteration": 2.6096768379211426 + }, + { + "auxiliary_loss_clip": 0.01128066, + "auxiliary_loss_mlp": 0.01030139, + "balance_loss_clip": 1.05050492, + "balance_loss_mlp": 1.01666856, + "epoch": 0.4631294153013678, + "flos": 19644178573440.0, + "grad_norm": 1.860089322617547, + "language_loss": 0.73451322, + "learning_rate": 2.33433364213785e-06, + "loss": 0.75609529, + "num_input_tokens_seen": 165336225, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.13464355, + "step": 7703, + "time_per_iteration": 2.4356374740600586 + }, + { + "auxiliary_loss_clip": 0.01146614, + "auxiliary_loss_mlp": 0.01036911, + "balance_loss_clip": 1.05953503, + "balance_loss_mlp": 1.02177107, + "epoch": 0.4631895385540358, + "flos": 24608254158720.0, + "grad_norm": 3.0578919073226682, + "language_loss": 0.69286287, + "learning_rate": 2.3339496543811243e-06, + "loss": 0.71469808, + "num_input_tokens_seen": 165355005, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.15142822, + "step": 7704, + "time_per_iteration": 2.511383533477783 + }, + { + "auxiliary_loss_clip": 0.01146282, + "auxiliary_loss_mlp": 0.01031343, + "balance_loss_clip": 1.06294298, + "balance_loss_mlp": 1.01694274, + "epoch": 0.46324966180670374, + "flos": 26320115508480.0, + "grad_norm": 3.902482680193903, + "language_loss": 0.816329, + "learning_rate": 2.3335656539609934e-06, + "loss": 0.83810526, + "num_input_tokens_seen": 165374910, + "router_z_loss_clip": 0.83251953, + "router_z_loss_mlp": 0.14416504, + "step": 7705, + "time_per_iteration": 2.4948408603668213 + }, + { + "auxiliary_loss_clip": 0.01133014, + "auxiliary_loss_mlp": 0.01035009, + "balance_loss_clip": 1.04803574, + "balance_loss_mlp": 1.02101946, + "epoch": 0.4633097850593717, + "flos": 19240506552960.0, + "grad_norm": 1.7916663131091473, + "language_loss": 0.77771521, + "learning_rate": 2.3331816408920196e-06, + "loss": 0.79939544, + "num_input_tokens_seen": 165392590, + "router_z_loss_clip": 0.84912109, + "router_z_loss_mlp": 0.13983154, + "step": 7706, + "time_per_iteration": 2.5562431812286377 + }, + { + "auxiliary_loss_clip": 0.01136097, + "auxiliary_loss_mlp": 0.01032216, + "balance_loss_clip": 1.05830717, + "balance_loss_mlp": 1.0182333, + "epoch": 0.46336990831203967, + "flos": 22783166161920.0, + "grad_norm": 7.921019798993272, + "language_loss": 0.70041543, + "learning_rate": 2.3327976151887654e-06, + "loss": 0.72209859, + "num_input_tokens_seen": 165411195, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.13989258, + "step": 7707, + "time_per_iteration": 2.4534752368927 + }, + { + "auxiliary_loss_clip": 0.01140308, + "auxiliary_loss_mlp": 0.01037277, + "balance_loss_clip": 1.05441475, + "balance_loss_mlp": 1.02168417, + "epoch": 0.46343003156470763, + "flos": 38210604543360.0, + "grad_norm": 1.9379473635249613, + "language_loss": 0.6151154, + "learning_rate": 2.332413576865791e-06, + "loss": 0.63689125, + "num_input_tokens_seen": 165430150, + "router_z_loss_clip": 0.85742188, + "router_z_loss_mlp": 0.15600586, + "step": 7708, + "time_per_iteration": 2.5971267223358154 + }, + { + "auxiliary_loss_clip": 0.01136091, + "auxiliary_loss_mlp": 0.01034101, + "balance_loss_clip": 1.0539403, + "balance_loss_mlp": 1.01949143, + "epoch": 0.4634901548173756, + "flos": 31938555110400.0, + "grad_norm": 3.295031648965959, + "language_loss": 0.77314699, + "learning_rate": 2.3320295259376614e-06, + "loss": 0.79484892, + "num_input_tokens_seen": 165450595, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.14605713, + "step": 7709, + "time_per_iteration": 2.5188753604888916 + }, + { + "auxiliary_loss_clip": 0.01135475, + "auxiliary_loss_mlp": 0.01040991, + "balance_loss_clip": 1.05230212, + "balance_loss_mlp": 1.0263164, + "epoch": 0.46355027807004356, + "flos": 20082540153600.0, + "grad_norm": 1.6000203105062754, + "language_loss": 0.76693273, + "learning_rate": 2.3316454624189385e-06, + "loss": 0.7886973, + "num_input_tokens_seen": 165469515, + "router_z_loss_clip": 0.83251953, + "router_z_loss_mlp": 0.14666748, + "step": 7710, + "time_per_iteration": 2.5541892051696777 + }, + { + "auxiliary_loss_clip": 0.01140232, + "auxiliary_loss_mlp": 0.01032824, + "balance_loss_clip": 1.05692911, + "balance_loss_mlp": 1.01700485, + "epoch": 0.4636104013227116, + "flos": 24061370613120.0, + "grad_norm": 2.190467336476177, + "language_loss": 0.72975409, + "learning_rate": 2.3312613863241865e-06, + "loss": 0.75148463, + "num_input_tokens_seen": 165488125, + "router_z_loss_clip": 0.83300781, + "router_z_loss_mlp": 0.15820312, + "step": 7711, + "time_per_iteration": 2.529827356338501 + }, + { + "auxiliary_loss_clip": 0.01131879, + "auxiliary_loss_mlp": 0.01043369, + "balance_loss_clip": 1.05205798, + "balance_loss_mlp": 1.02840793, + "epoch": 0.46367052457537955, + "flos": 23914639555200.0, + "grad_norm": 1.261846647208044, + "language_loss": 0.71350253, + "learning_rate": 2.33087729766797e-06, + "loss": 0.73525506, + "num_input_tokens_seen": 165509225, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.14953613, + "step": 7712, + "time_per_iteration": 2.664687156677246 + }, + { + "auxiliary_loss_clip": 0.01145259, + "auxiliary_loss_mlp": 0.01040659, + "balance_loss_clip": 1.05737376, + "balance_loss_mlp": 1.02456522, + "epoch": 0.4637306478280475, + "flos": 26396533693440.0, + "grad_norm": 2.0902572375849338, + "language_loss": 0.7290715, + "learning_rate": 2.3304931964648524e-06, + "loss": 0.75093067, + "num_input_tokens_seen": 165529945, + "router_z_loss_clip": 0.87841797, + "router_z_loss_mlp": 0.16101074, + "step": 7713, + "time_per_iteration": 2.545555353164673 + }, + { + "auxiliary_loss_clip": 0.0114468, + "auxiliary_loss_mlp": 0.01039372, + "balance_loss_clip": 1.05897975, + "balance_loss_mlp": 1.02399457, + "epoch": 0.4637907710807155, + "flos": 21980706370560.0, + "grad_norm": 1.5518117832564482, + "language_loss": 0.58569026, + "learning_rate": 2.3301090827294e-06, + "loss": 0.60753071, + "num_input_tokens_seen": 165550690, + "router_z_loss_clip": 0.85791016, + "router_z_loss_mlp": 0.15380859, + "step": 7714, + "time_per_iteration": 2.545088052749634 + }, + { + "auxiliary_loss_clip": 0.01132458, + "auxiliary_loss_mlp": 0.010354, + "balance_loss_clip": 1.05281758, + "balance_loss_mlp": 1.0213809, + "epoch": 0.46385089433338345, + "flos": 12422291846400.0, + "grad_norm": 2.5381712733379063, + "language_loss": 0.70726615, + "learning_rate": 2.3297249564761784e-06, + "loss": 0.72894472, + "num_input_tokens_seen": 165567775, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.14013672, + "step": 7715, + "time_per_iteration": 3.84389591217041 + }, + { + "auxiliary_loss_clip": 0.01148185, + "auxiliary_loss_mlp": 0.01040731, + "balance_loss_clip": 1.06224442, + "balance_loss_mlp": 1.02637219, + "epoch": 0.4639110175860514, + "flos": 23915752876800.0, + "grad_norm": 1.782173120236541, + "language_loss": 0.67872268, + "learning_rate": 2.3293408177197527e-06, + "loss": 0.70061183, + "num_input_tokens_seen": 165587010, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.14361572, + "step": 7716, + "time_per_iteration": 2.5122628211975098 + }, + { + "auxiliary_loss_clip": 0.01136233, + "auxiliary_loss_mlp": 0.01030589, + "balance_loss_clip": 1.05353332, + "balance_loss_mlp": 1.01531816, + "epoch": 0.4639711408387194, + "flos": 25300396304640.0, + "grad_norm": 1.522708250005898, + "language_loss": 0.80633032, + "learning_rate": 2.328956666474691e-06, + "loss": 0.82799858, + "num_input_tokens_seen": 165607850, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.15258789, + "step": 7717, + "time_per_iteration": 2.5380077362060547 + }, + { + "auxiliary_loss_clip": 0.01130042, + "auxiliary_loss_mlp": 0.01033606, + "balance_loss_clip": 1.04752922, + "balance_loss_mlp": 1.01919365, + "epoch": 0.46403126409138734, + "flos": 21211822817280.0, + "grad_norm": 1.804165914620722, + "language_loss": 0.73280859, + "learning_rate": 2.3285725027555593e-06, + "loss": 0.75444508, + "num_input_tokens_seen": 165627175, + "router_z_loss_clip": 0.82568359, + "router_z_loss_mlp": 0.144104, + "step": 7718, + "time_per_iteration": 2.522814989089966 + }, + { + "auxiliary_loss_clip": 0.01121086, + "auxiliary_loss_mlp": 0.0103774, + "balance_loss_clip": 1.0417738, + "balance_loss_mlp": 1.02271378, + "epoch": 0.4640913873440553, + "flos": 35845564325760.0, + "grad_norm": 1.8127187146707953, + "language_loss": 0.70494705, + "learning_rate": 2.3281883265769254e-06, + "loss": 0.72653532, + "num_input_tokens_seen": 165648340, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.15026855, + "step": 7719, + "time_per_iteration": 2.62945556640625 + }, + { + "auxiliary_loss_clip": 0.01135089, + "auxiliary_loss_mlp": 0.01039005, + "balance_loss_clip": 1.0501703, + "balance_loss_mlp": 1.02427101, + "epoch": 0.46415151059672327, + "flos": 19166207270400.0, + "grad_norm": 1.7234316128384939, + "language_loss": 0.86851519, + "learning_rate": 2.327804137953357e-06, + "loss": 0.89025617, + "num_input_tokens_seen": 165667195, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.1472168, + "step": 7720, + "time_per_iteration": 2.564434766769409 + }, + { + "auxiliary_loss_clip": 0.01062406, + "auxiliary_loss_mlp": 0.01009048, + "balance_loss_clip": 1.03471088, + "balance_loss_mlp": 1.00712848, + "epoch": 0.46421163384939124, + "flos": 58912750304640.0, + "grad_norm": 0.7131613402948433, + "language_loss": 0.55081666, + "learning_rate": 2.3274199368994226e-06, + "loss": 0.57153124, + "num_input_tokens_seen": 165726760, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.01922607, + "step": 7721, + "time_per_iteration": 3.1245322227478027 + }, + { + "auxiliary_loss_clip": 0.01131005, + "auxiliary_loss_mlp": 0.01038481, + "balance_loss_clip": 1.0518384, + "balance_loss_mlp": 1.02399755, + "epoch": 0.4642717571020592, + "flos": 20157342226560.0, + "grad_norm": 2.0069433421695444, + "language_loss": 0.80795813, + "learning_rate": 2.3270357234296918e-06, + "loss": 0.82965297, + "num_input_tokens_seen": 165745005, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.14489746, + "step": 7722, + "time_per_iteration": 2.5573394298553467 + }, + { + "auxiliary_loss_clip": 0.01134423, + "auxiliary_loss_mlp": 0.0105021, + "balance_loss_clip": 1.04847622, + "balance_loss_mlp": 1.03310347, + "epoch": 0.46433188035472717, + "flos": 25046184775680.0, + "grad_norm": 1.8913107829956062, + "language_loss": 0.78008169, + "learning_rate": 2.3266514975587332e-06, + "loss": 0.80192804, + "num_input_tokens_seen": 165765750, + "router_z_loss_clip": 0.85839844, + "router_z_loss_mlp": 0.17102051, + "step": 7723, + "time_per_iteration": 2.5205509662628174 + }, + { + "auxiliary_loss_clip": 0.0113158, + "auxiliary_loss_mlp": 0.01031288, + "balance_loss_clip": 1.05169392, + "balance_loss_mlp": 1.0170784, + "epoch": 0.4643920036073952, + "flos": 28075644817920.0, + "grad_norm": 1.787482706022066, + "language_loss": 0.68334728, + "learning_rate": 2.326267259301118e-06, + "loss": 0.70497602, + "num_input_tokens_seen": 165787515, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.14221191, + "step": 7724, + "time_per_iteration": 4.087443590164185 + }, + { + "auxiliary_loss_clip": 0.0112967, + "auxiliary_loss_mlp": 0.01038656, + "balance_loss_clip": 1.04928625, + "balance_loss_mlp": 1.02208591, + "epoch": 0.46445212686006315, + "flos": 18369350000640.0, + "grad_norm": 3.334710347399319, + "language_loss": 0.67697996, + "learning_rate": 2.325883008671415e-06, + "loss": 0.69866323, + "num_input_tokens_seen": 165806675, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.16552734, + "step": 7725, + "time_per_iteration": 2.51751446723938 + }, + { + "auxiliary_loss_clip": 0.0112574, + "auxiliary_loss_mlp": 0.01037985, + "balance_loss_clip": 1.04737842, + "balance_loss_mlp": 1.02521729, + "epoch": 0.4645122501127311, + "flos": 31721618920320.0, + "grad_norm": 1.9800034175764616, + "language_loss": 0.64621878, + "learning_rate": 2.3254987456841955e-06, + "loss": 0.6678561, + "num_input_tokens_seen": 165829835, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.12768555, + "step": 7726, + "time_per_iteration": 2.560894250869751 + }, + { + "auxiliary_loss_clip": 0.01134312, + "auxiliary_loss_mlp": 0.01042953, + "balance_loss_clip": 1.05104256, + "balance_loss_mlp": 1.02855241, + "epoch": 0.4645723733653991, + "flos": 23768806337280.0, + "grad_norm": 1.766705596180863, + "language_loss": 0.74863601, + "learning_rate": 2.3251144703540307e-06, + "loss": 0.77040869, + "num_input_tokens_seen": 165849380, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 0.14385986, + "step": 7727, + "time_per_iteration": 2.529104232788086 + }, + { + "auxiliary_loss_clip": 0.01131006, + "auxiliary_loss_mlp": 0.01038923, + "balance_loss_clip": 1.04972732, + "balance_loss_mlp": 1.02439117, + "epoch": 0.46463249661806705, + "flos": 33145512935040.0, + "grad_norm": 1.9852587974619047, + "language_loss": 0.78780562, + "learning_rate": 2.3247301826954936e-06, + "loss": 0.80950499, + "num_input_tokens_seen": 165868620, + "router_z_loss_clip": 0.81347656, + "router_z_loss_mlp": 0.14550781, + "step": 7728, + "time_per_iteration": 2.5491695404052734 + }, + { + "auxiliary_loss_clip": 0.01129922, + "auxiliary_loss_mlp": 0.01042115, + "balance_loss_clip": 1.04762936, + "balance_loss_mlp": 1.02666545, + "epoch": 0.464692619870735, + "flos": 18296020385280.0, + "grad_norm": 1.9672490594891414, + "language_loss": 0.76458639, + "learning_rate": 2.324345882723155e-06, + "loss": 0.78630674, + "num_input_tokens_seen": 165885915, + "router_z_loss_clip": 0.82226562, + "router_z_loss_mlp": 0.15454102, + "step": 7729, + "time_per_iteration": 2.464386463165283 + }, + { + "auxiliary_loss_clip": 0.01137249, + "auxiliary_loss_mlp": 0.01037803, + "balance_loss_clip": 1.05538917, + "balance_loss_mlp": 1.02399802, + "epoch": 0.464752743123403, + "flos": 22638051216000.0, + "grad_norm": 1.6689927343986861, + "language_loss": 0.80080038, + "learning_rate": 2.323961570451588e-06, + "loss": 0.82255089, + "num_input_tokens_seen": 165905465, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.13806152, + "step": 7730, + "time_per_iteration": 2.456181287765503 + }, + { + "auxiliary_loss_clip": 0.01137089, + "auxiliary_loss_mlp": 0.01037061, + "balance_loss_clip": 1.05721974, + "balance_loss_mlp": 1.02300632, + "epoch": 0.46481286637607094, + "flos": 20412128373120.0, + "grad_norm": 1.627293011077751, + "language_loss": 0.77197081, + "learning_rate": 2.3235772458953655e-06, + "loss": 0.79371232, + "num_input_tokens_seen": 165924640, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.140625, + "step": 7731, + "time_per_iteration": 2.4524660110473633 + }, + { + "auxiliary_loss_clip": 0.0112856, + "auxiliary_loss_mlp": 0.01033022, + "balance_loss_clip": 1.04887009, + "balance_loss_mlp": 1.01967084, + "epoch": 0.4648729896287389, + "flos": 34275406129920.0, + "grad_norm": 1.8425486321286761, + "language_loss": 0.65955991, + "learning_rate": 2.323192909069061e-06, + "loss": 0.68117571, + "num_input_tokens_seen": 165945765, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.13372803, + "step": 7732, + "time_per_iteration": 2.556006908416748 + }, + { + "auxiliary_loss_clip": 0.01124276, + "auxiliary_loss_mlp": 0.01037473, + "balance_loss_clip": 1.04076362, + "balance_loss_mlp": 1.0215466, + "epoch": 0.4649331128814069, + "flos": 21321781326720.0, + "grad_norm": 2.2059610659983866, + "language_loss": 0.7290166, + "learning_rate": 2.32280855998725e-06, + "loss": 0.75063407, + "num_input_tokens_seen": 165964025, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.15930176, + "step": 7733, + "time_per_iteration": 2.538039207458496 + }, + { + "auxiliary_loss_clip": 0.01058467, + "auxiliary_loss_mlp": 0.01001934, + "balance_loss_clip": 1.03066015, + "balance_loss_mlp": 1.00011575, + "epoch": 0.46499323613407484, + "flos": 58308515717760.0, + "grad_norm": 1.1885153822142038, + "language_loss": 0.51857924, + "learning_rate": 2.3224241986645057e-06, + "loss": 0.53918326, + "num_input_tokens_seen": 166021950, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.01818848, + "step": 7734, + "time_per_iteration": 4.548669338226318 + }, + { + "auxiliary_loss_clip": 0.01135771, + "auxiliary_loss_mlp": 0.01033234, + "balance_loss_clip": 1.05676031, + "balance_loss_mlp": 1.018929, + "epoch": 0.4650533593867428, + "flos": 10889660384640.0, + "grad_norm": 2.1219816033092718, + "language_loss": 0.75843143, + "learning_rate": 2.3220398251154035e-06, + "loss": 0.78012151, + "num_input_tokens_seen": 166039675, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.14306641, + "step": 7735, + "time_per_iteration": 2.415391206741333 + }, + { + "auxiliary_loss_clip": 0.01131269, + "auxiliary_loss_mlp": 0.01043588, + "balance_loss_clip": 1.05080938, + "balance_loss_mlp": 1.02859724, + "epoch": 0.46511348263941077, + "flos": 19974592805760.0, + "grad_norm": 1.859103793301225, + "language_loss": 0.69619989, + "learning_rate": 2.321655439354519e-06, + "loss": 0.71794844, + "num_input_tokens_seen": 166057745, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.15008545, + "step": 7736, + "time_per_iteration": 2.6068472862243652 + }, + { + "auxiliary_loss_clip": 0.01129859, + "auxiliary_loss_mlp": 0.01036433, + "balance_loss_clip": 1.05055594, + "balance_loss_mlp": 1.02245593, + "epoch": 0.46517360589207873, + "flos": 19678401256320.0, + "grad_norm": 1.8015109139549237, + "language_loss": 0.7182821, + "learning_rate": 2.321271041396427e-06, + "loss": 0.73994505, + "num_input_tokens_seen": 166076440, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.13983154, + "step": 7737, + "time_per_iteration": 2.4492926597595215 + }, + { + "auxiliary_loss_clip": 0.01144706, + "auxiliary_loss_mlp": 0.01038236, + "balance_loss_clip": 1.06367183, + "balance_loss_mlp": 1.02328134, + "epoch": 0.46523372914474675, + "flos": 16872665074560.0, + "grad_norm": 1.8897165788023005, + "language_loss": 0.84104919, + "learning_rate": 2.3208866312557065e-06, + "loss": 0.86287862, + "num_input_tokens_seen": 166092520, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.14959717, + "step": 7738, + "time_per_iteration": 2.4359583854675293 + }, + { + "auxiliary_loss_clip": 0.01071672, + "auxiliary_loss_mlp": 0.01012637, + "balance_loss_clip": 1.04398465, + "balance_loss_mlp": 1.01095736, + "epoch": 0.4652938523974147, + "flos": 53439138339840.0, + "grad_norm": 0.8013663494806864, + "language_loss": 0.57812071, + "learning_rate": 2.320502208946932e-06, + "loss": 0.59896374, + "num_input_tokens_seen": 166156285, + "router_z_loss_clip": 0.27636719, + "router_z_loss_mlp": 0.01679993, + "step": 7739, + "time_per_iteration": 3.1549787521362305 + }, + { + "auxiliary_loss_clip": 0.01126146, + "auxiliary_loss_mlp": 0.01048355, + "balance_loss_clip": 1.04574645, + "balance_loss_mlp": 1.0329473, + "epoch": 0.4653539756500827, + "flos": 15231296165760.0, + "grad_norm": 4.586793569951319, + "language_loss": 0.85098618, + "learning_rate": 2.3201177744846815e-06, + "loss": 0.87273121, + "num_input_tokens_seen": 166173455, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.15423584, + "step": 7740, + "time_per_iteration": 2.4844348430633545 + }, + { + "auxiliary_loss_clip": 0.01133041, + "auxiliary_loss_mlp": 0.01040956, + "balance_loss_clip": 1.0528388, + "balance_loss_mlp": 1.02621615, + "epoch": 0.46541409890275065, + "flos": 23732249270400.0, + "grad_norm": 1.6970976315234287, + "language_loss": 0.75890642, + "learning_rate": 2.3197333278835327e-06, + "loss": 0.78064638, + "num_input_tokens_seen": 166194370, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.14727783, + "step": 7741, + "time_per_iteration": 3.966724157333374 + }, + { + "auxiliary_loss_clip": 0.01134071, + "auxiliary_loss_mlp": 0.01039174, + "balance_loss_clip": 1.05085492, + "balance_loss_mlp": 1.0258286, + "epoch": 0.4654742221554186, + "flos": 20847329556480.0, + "grad_norm": 1.9195142397980351, + "language_loss": 0.81255054, + "learning_rate": 2.319348869158064e-06, + "loss": 0.83428299, + "num_input_tokens_seen": 166213195, + "router_z_loss_clip": 0.83251953, + "router_z_loss_mlp": 0.13336182, + "step": 7742, + "time_per_iteration": 2.498206615447998 + }, + { + "auxiliary_loss_clip": 0.01126369, + "auxiliary_loss_mlp": 0.01038199, + "balance_loss_clip": 1.04537797, + "balance_loss_mlp": 1.02382278, + "epoch": 0.4655343454080866, + "flos": 20704836303360.0, + "grad_norm": 1.7134590529981522, + "language_loss": 0.72371101, + "learning_rate": 2.3189643983228555e-06, + "loss": 0.74535668, + "num_input_tokens_seen": 166231350, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.1439209, + "step": 7743, + "time_per_iteration": 2.51139760017395 + }, + { + "auxiliary_loss_clip": 0.01129783, + "auxiliary_loss_mlp": 0.01036069, + "balance_loss_clip": 1.04888809, + "balance_loss_mlp": 1.02175164, + "epoch": 0.46559446866075455, + "flos": 18989850470400.0, + "grad_norm": 1.776743588883762, + "language_loss": 0.7095136, + "learning_rate": 2.318579915392483e-06, + "loss": 0.73117214, + "num_input_tokens_seen": 166250530, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.14306641, + "step": 7744, + "time_per_iteration": 2.420713424682617 + }, + { + "auxiliary_loss_clip": 0.01131337, + "auxiliary_loss_mlp": 0.01031965, + "balance_loss_clip": 1.05149198, + "balance_loss_mlp": 1.01878619, + "epoch": 0.4656545919134225, + "flos": 34496364643200.0, + "grad_norm": 1.682148007682863, + "language_loss": 0.84991211, + "learning_rate": 2.31819542038153e-06, + "loss": 0.87154508, + "num_input_tokens_seen": 166272545, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.1315918, + "step": 7745, + "time_per_iteration": 2.591684341430664 + }, + { + "auxiliary_loss_clip": 0.01129299, + "auxiliary_loss_mlp": 0.01038013, + "balance_loss_clip": 1.05038738, + "balance_loss_mlp": 1.02341592, + "epoch": 0.4657147151660905, + "flos": 24310554238080.0, + "grad_norm": 1.396114754546265, + "language_loss": 0.72885656, + "learning_rate": 2.317810913304574e-06, + "loss": 0.75052965, + "num_input_tokens_seen": 166292135, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.14593506, + "step": 7746, + "time_per_iteration": 2.527043104171753 + }, + { + "auxiliary_loss_clip": 0.01126908, + "auxiliary_loss_mlp": 0.01036502, + "balance_loss_clip": 1.04888535, + "balance_loss_mlp": 1.02299523, + "epoch": 0.46577483841875844, + "flos": 58795139220480.0, + "grad_norm": 1.6044882266905678, + "language_loss": 0.69544411, + "learning_rate": 2.3174263941761963e-06, + "loss": 0.71707821, + "num_input_tokens_seen": 166316710, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.13519287, + "step": 7747, + "time_per_iteration": 2.7696447372436523 + }, + { + "auxiliary_loss_clip": 0.01130537, + "auxiliary_loss_mlp": 0.01035657, + "balance_loss_clip": 1.05255795, + "balance_loss_mlp": 1.02262163, + "epoch": 0.4658349616714264, + "flos": 31321969223040.0, + "grad_norm": 2.3129042672455955, + "language_loss": 0.67374539, + "learning_rate": 2.317041863010978e-06, + "loss": 0.69540727, + "num_input_tokens_seen": 166338535, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.13037109, + "step": 7748, + "time_per_iteration": 2.5408992767333984 + }, + { + "auxiliary_loss_clip": 0.01134919, + "auxiliary_loss_mlp": 0.0103797, + "balance_loss_clip": 1.05133915, + "balance_loss_mlp": 1.02241278, + "epoch": 0.46589508492409437, + "flos": 14860338456960.0, + "grad_norm": 2.0999243615373553, + "language_loss": 0.63911891, + "learning_rate": 2.3166573198235007e-06, + "loss": 0.66084784, + "num_input_tokens_seen": 166355540, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.15563965, + "step": 7749, + "time_per_iteration": 2.5040690898895264 + }, + { + "auxiliary_loss_clip": 0.01131407, + "auxiliary_loss_mlp": 0.010373, + "balance_loss_clip": 1.050493, + "balance_loss_mlp": 1.0222261, + "epoch": 0.46595520817676234, + "flos": 12895989431040.0, + "grad_norm": 2.1080392297231527, + "language_loss": 0.74891078, + "learning_rate": 2.3162727646283456e-06, + "loss": 0.77059782, + "num_input_tokens_seen": 166372635, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.15081787, + "step": 7750, + "time_per_iteration": 2.427607297897339 + }, + { + "auxiliary_loss_clip": 0.01129731, + "auxiliary_loss_mlp": 0.01032527, + "balance_loss_clip": 1.04576886, + "balance_loss_mlp": 1.01768506, + "epoch": 0.46601533142943036, + "flos": 32854169721600.0, + "grad_norm": 1.8862847997172485, + "language_loss": 0.74862736, + "learning_rate": 2.3158881974400963e-06, + "loss": 0.77024996, + "num_input_tokens_seen": 166393175, + "router_z_loss_clip": 0.83886719, + "router_z_loss_mlp": 0.14831543, + "step": 7751, + "time_per_iteration": 2.584169864654541 + }, + { + "auxiliary_loss_clip": 0.01140367, + "auxiliary_loss_mlp": 0.01036736, + "balance_loss_clip": 1.05716586, + "balance_loss_mlp": 1.02201915, + "epoch": 0.4660754546820983, + "flos": 19967517826560.0, + "grad_norm": 2.3011459954123046, + "language_loss": 0.73247296, + "learning_rate": 2.3155036182733345e-06, + "loss": 0.75424397, + "num_input_tokens_seen": 166408630, + "router_z_loss_clip": 0.83349609, + "router_z_loss_mlp": 0.14715576, + "step": 7752, + "time_per_iteration": 2.4447436332702637 + }, + { + "auxiliary_loss_clip": 0.01133731, + "auxiliary_loss_mlp": 0.01038857, + "balance_loss_clip": 1.04985619, + "balance_loss_mlp": 1.02417004, + "epoch": 0.4661355779347663, + "flos": 26688164215680.0, + "grad_norm": 2.330536042757398, + "language_loss": 0.69003403, + "learning_rate": 2.315119027142644e-06, + "loss": 0.71175992, + "num_input_tokens_seen": 166428170, + "router_z_loss_clip": 0.83837891, + "router_z_loss_mlp": 0.14685059, + "step": 7753, + "time_per_iteration": 2.521062135696411 + }, + { + "auxiliary_loss_clip": 0.01122433, + "auxiliary_loss_mlp": 0.01031315, + "balance_loss_clip": 1.04474688, + "balance_loss_mlp": 1.01757014, + "epoch": 0.46619570118743425, + "flos": 20959442881920.0, + "grad_norm": 1.8289616552627725, + "language_loss": 0.73526156, + "learning_rate": 2.3147344240626076e-06, + "loss": 0.75679898, + "num_input_tokens_seen": 166446705, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.13757324, + "step": 7754, + "time_per_iteration": 2.465294122695923 + }, + { + "auxiliary_loss_clip": 0.01148072, + "auxiliary_loss_mlp": 0.01035718, + "balance_loss_clip": 1.06134188, + "balance_loss_mlp": 1.0209595, + "epoch": 0.4662558244401022, + "flos": 24426079355520.0, + "grad_norm": 1.6075687541199795, + "language_loss": 0.78896743, + "learning_rate": 2.3143498090478114e-06, + "loss": 0.81080538, + "num_input_tokens_seen": 166466750, + "router_z_loss_clip": 0.86669922, + "router_z_loss_mlp": 0.14752197, + "step": 7755, + "time_per_iteration": 2.508666515350342 + }, + { + "auxiliary_loss_clip": 0.01118621, + "auxiliary_loss_mlp": 0.01031521, + "balance_loss_clip": 1.04196751, + "balance_loss_mlp": 1.01796055, + "epoch": 0.4663159476927702, + "flos": 20595452411520.0, + "grad_norm": 1.720088975086829, + "language_loss": 0.72020185, + "learning_rate": 2.3139651821128382e-06, + "loss": 0.74170327, + "num_input_tokens_seen": 166485400, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.13555908, + "step": 7756, + "time_per_iteration": 2.477834463119507 + }, + { + "auxiliary_loss_clip": 0.01123434, + "auxiliary_loss_mlp": 0.01034944, + "balance_loss_clip": 1.04558945, + "balance_loss_mlp": 1.02138424, + "epoch": 0.46637607094543815, + "flos": 25661872823040.0, + "grad_norm": 1.7781105215481123, + "language_loss": 0.78303844, + "learning_rate": 2.313580543272274e-06, + "loss": 0.80462223, + "num_input_tokens_seen": 166505730, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.13562012, + "step": 7757, + "time_per_iteration": 2.5215046405792236 + }, + { + "auxiliary_loss_clip": 0.01128348, + "auxiliary_loss_mlp": 0.01031213, + "balance_loss_clip": 1.04931569, + "balance_loss_mlp": 1.01770711, + "epoch": 0.4664361941981061, + "flos": 24273853516800.0, + "grad_norm": 1.8568230411965458, + "language_loss": 0.66418052, + "learning_rate": 2.313195892540705e-06, + "loss": 0.68577611, + "num_input_tokens_seen": 166523770, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.13500977, + "step": 7758, + "time_per_iteration": 4.0059709548950195 + }, + { + "auxiliary_loss_clip": 0.01128011, + "auxiliary_loss_mlp": 0.01041393, + "balance_loss_clip": 1.04881692, + "balance_loss_mlp": 1.02651536, + "epoch": 0.4664963174507741, + "flos": 18405871153920.0, + "grad_norm": 1.6108539853688413, + "language_loss": 0.74760342, + "learning_rate": 2.3128112299327147e-06, + "loss": 0.76929748, + "num_input_tokens_seen": 166542935, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.14880371, + "step": 7759, + "time_per_iteration": 2.4194698333740234 + }, + { + "auxiliary_loss_clip": 0.01130663, + "auxiliary_loss_mlp": 0.01034722, + "balance_loss_clip": 1.05190825, + "balance_loss_mlp": 1.02120972, + "epoch": 0.46655644070344204, + "flos": 22455122227200.0, + "grad_norm": 1.5315456099246176, + "language_loss": 0.77757406, + "learning_rate": 2.312426555462893e-06, + "loss": 0.79922789, + "num_input_tokens_seen": 166563935, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.13500977, + "step": 7760, + "time_per_iteration": 2.602825164794922 + }, + { + "auxiliary_loss_clip": 0.01126822, + "auxiliary_loss_mlp": 0.01030875, + "balance_loss_clip": 1.04816937, + "balance_loss_mlp": 1.01698732, + "epoch": 0.46661656395611, + "flos": 13808407731840.0, + "grad_norm": 1.7803741935364028, + "language_loss": 0.74256146, + "learning_rate": 2.3120418691458237e-06, + "loss": 0.76413852, + "num_input_tokens_seen": 166582175, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.13873291, + "step": 7761, + "time_per_iteration": 2.496659517288208 + }, + { + "auxiliary_loss_clip": 0.01135381, + "auxiliary_loss_mlp": 0.01037987, + "balance_loss_clip": 1.05216336, + "balance_loss_mlp": 1.02237058, + "epoch": 0.466676687208778, + "flos": 21652159645440.0, + "grad_norm": 1.8728999993171902, + "language_loss": 0.7870692, + "learning_rate": 2.3116571709960956e-06, + "loss": 0.80880284, + "num_input_tokens_seen": 166601870, + "router_z_loss_clip": 0.83251953, + "router_z_loss_mlp": 0.15612793, + "step": 7762, + "time_per_iteration": 2.488171339035034 + }, + { + "auxiliary_loss_clip": 0.01078785, + "auxiliary_loss_mlp": 0.01002182, + "balance_loss_clip": 1.05154109, + "balance_loss_mlp": 1.00082278, + "epoch": 0.46673681046144594, + "flos": 68534259068160.0, + "grad_norm": 0.7955884652373066, + "language_loss": 0.59781122, + "learning_rate": 2.311272461028297e-06, + "loss": 0.61862093, + "num_input_tokens_seen": 166668960, + "router_z_loss_clip": 0.27246094, + "router_z_loss_mlp": 0.01359558, + "step": 7763, + "time_per_iteration": 3.2159597873687744 + }, + { + "auxiliary_loss_clip": 0.01134262, + "auxiliary_loss_mlp": 0.01037965, + "balance_loss_clip": 1.05168498, + "balance_loss_mlp": 1.02181196, + "epoch": 0.46679693371411396, + "flos": 15814449469440.0, + "grad_norm": 2.414942791700403, + "language_loss": 0.79157543, + "learning_rate": 2.3108877392570146e-06, + "loss": 0.81329775, + "num_input_tokens_seen": 166686110, + "router_z_loss_clip": 0.82568359, + "router_z_loss_mlp": 0.16162109, + "step": 7764, + "time_per_iteration": 2.5265052318573 + }, + { + "auxiliary_loss_clip": 0.01127697, + "auxiliary_loss_mlp": 0.0103797, + "balance_loss_clip": 1.0490427, + "balance_loss_mlp": 1.02438569, + "epoch": 0.4668570569667819, + "flos": 18514572687360.0, + "grad_norm": 1.7801755278544946, + "language_loss": 0.71713233, + "learning_rate": 2.310503005696839e-06, + "loss": 0.73878902, + "num_input_tokens_seen": 166703930, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.13580322, + "step": 7765, + "time_per_iteration": 2.4495747089385986 + }, + { + "auxiliary_loss_clip": 0.01131552, + "auxiliary_loss_mlp": 0.01039866, + "balance_loss_clip": 1.04880047, + "balance_loss_mlp": 1.02535796, + "epoch": 0.4669171802194499, + "flos": 19206643006080.0, + "grad_norm": 2.3613900625436903, + "language_loss": 0.77644849, + "learning_rate": 2.3101182603623576e-06, + "loss": 0.7981627, + "num_input_tokens_seen": 166719940, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.1451416, + "step": 7766, + "time_per_iteration": 2.4817750453948975 + }, + { + "auxiliary_loss_clip": 0.01121875, + "auxiliary_loss_mlp": 0.01035732, + "balance_loss_clip": 1.04215431, + "balance_loss_mlp": 1.02235055, + "epoch": 0.46697730347211786, + "flos": 12276135406080.0, + "grad_norm": 2.3503608437780503, + "language_loss": 0.65356624, + "learning_rate": 2.3097335032681607e-06, + "loss": 0.67514241, + "num_input_tokens_seen": 166738285, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.13378906, + "step": 7767, + "time_per_iteration": 2.4422526359558105 + }, + { + "auxiliary_loss_clip": 0.01131643, + "auxiliary_loss_mlp": 0.01034785, + "balance_loss_clip": 1.05113006, + "balance_loss_mlp": 1.02103472, + "epoch": 0.4670374267247858, + "flos": 23586739274880.0, + "grad_norm": 1.8767934617878634, + "language_loss": 0.74304503, + "learning_rate": 2.3093487344288393e-06, + "loss": 0.76470929, + "num_input_tokens_seen": 166758170, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.1373291, + "step": 7768, + "time_per_iteration": 3.8848092555999756 + }, + { + "auxiliary_loss_clip": 0.01135045, + "auxiliary_loss_mlp": 0.01030398, + "balance_loss_clip": 1.05419207, + "balance_loss_mlp": 1.01639056, + "epoch": 0.4670975499774538, + "flos": 15991093578240.0, + "grad_norm": 1.667694321624612, + "language_loss": 0.7084952, + "learning_rate": 2.308963953858982e-06, + "loss": 0.73014963, + "num_input_tokens_seen": 166775750, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.14013672, + "step": 7769, + "time_per_iteration": 2.475881338119507 + }, + { + "auxiliary_loss_clip": 0.01133557, + "auxiliary_loss_mlp": 0.01034579, + "balance_loss_clip": 1.05224442, + "balance_loss_mlp": 1.02080464, + "epoch": 0.46715767323012175, + "flos": 15377596260480.0, + "grad_norm": 1.9372257988523385, + "language_loss": 0.81706011, + "learning_rate": 2.3085791615731803e-06, + "loss": 0.83874142, + "num_input_tokens_seen": 166791720, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.13769531, + "step": 7770, + "time_per_iteration": 2.4759421348571777 + }, + { + "auxiliary_loss_clip": 0.01068914, + "auxiliary_loss_mlp": 0.01010059, + "balance_loss_clip": 1.04212618, + "balance_loss_mlp": 1.00810444, + "epoch": 0.4672177964827897, + "flos": 60252217401600.0, + "grad_norm": 0.7973407556041981, + "language_loss": 0.55677122, + "learning_rate": 2.3081943575860265e-06, + "loss": 0.5775609, + "num_input_tokens_seen": 166856360, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.01956177, + "step": 7771, + "time_per_iteration": 3.1028497219085693 + }, + { + "auxiliary_loss_clip": 0.01123606, + "auxiliary_loss_mlp": 0.01035048, + "balance_loss_clip": 1.04536891, + "balance_loss_mlp": 1.0205636, + "epoch": 0.4672779197354577, + "flos": 27636134002560.0, + "grad_norm": 2.035588155330591, + "language_loss": 0.66185248, + "learning_rate": 2.3078095419121117e-06, + "loss": 0.68343902, + "num_input_tokens_seen": 166875925, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.14471436, + "step": 7772, + "time_per_iteration": 2.475952386856079 + }, + { + "auxiliary_loss_clip": 0.01125806, + "auxiliary_loss_mlp": 0.01035459, + "balance_loss_clip": 1.04664791, + "balance_loss_mlp": 1.02035546, + "epoch": 0.46733804298812565, + "flos": 31394257344000.0, + "grad_norm": 2.278245502744129, + "language_loss": 0.63957149, + "learning_rate": 2.3074247145660283e-06, + "loss": 0.66118419, + "num_input_tokens_seen": 166896520, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.15112305, + "step": 7773, + "time_per_iteration": 2.5854406356811523 + }, + { + "auxiliary_loss_clip": 0.01127103, + "auxiliary_loss_mlp": 0.0104234, + "balance_loss_clip": 1.04671872, + "balance_loss_mlp": 1.02734327, + "epoch": 0.4673981662407936, + "flos": 19500607912320.0, + "grad_norm": 2.2493455590065228, + "language_loss": 0.80552757, + "learning_rate": 2.3070398755623685e-06, + "loss": 0.82722199, + "num_input_tokens_seen": 166915370, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.14990234, + "step": 7774, + "time_per_iteration": 2.490461826324463 + }, + { + "auxiliary_loss_clip": 0.0112936, + "auxiliary_loss_mlp": 0.0102994, + "balance_loss_clip": 1.04916871, + "balance_loss_mlp": 1.0159266, + "epoch": 0.4674582894934616, + "flos": 20521835487360.0, + "grad_norm": 1.584511030819097, + "language_loss": 0.77584434, + "learning_rate": 2.306655024915726e-06, + "loss": 0.79743731, + "num_input_tokens_seen": 166934875, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.14007568, + "step": 7775, + "time_per_iteration": 2.4899821281433105 + }, + { + "auxiliary_loss_clip": 0.01122564, + "auxiliary_loss_mlp": 0.01035479, + "balance_loss_clip": 1.04461551, + "balance_loss_mlp": 1.02070856, + "epoch": 0.46751841274612954, + "flos": 22090952188800.0, + "grad_norm": 1.972258806878342, + "language_loss": 0.70099217, + "learning_rate": 2.306270162640694e-06, + "loss": 0.72257257, + "num_input_tokens_seen": 166954285, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.14758301, + "step": 7776, + "time_per_iteration": 2.4484925270080566 + }, + { + "auxiliary_loss_clip": 0.01135081, + "auxiliary_loss_mlp": 0.01037074, + "balance_loss_clip": 1.05332625, + "balance_loss_mlp": 1.02353227, + "epoch": 0.46757853599879756, + "flos": 26980082046720.0, + "grad_norm": 1.3667699921281855, + "language_loss": 0.7359767, + "learning_rate": 2.3058852887518678e-06, + "loss": 0.7576983, + "num_input_tokens_seen": 166975975, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.13549805, + "step": 7777, + "time_per_iteration": 2.525423288345337 + }, + { + "auxiliary_loss_clip": 0.01132318, + "auxiliary_loss_mlp": 0.01033234, + "balance_loss_clip": 1.05348015, + "balance_loss_mlp": 1.01989412, + "epoch": 0.4676386592514655, + "flos": 24134053783680.0, + "grad_norm": 2.1120367799732342, + "language_loss": 0.69373626, + "learning_rate": 2.3055004032638394e-06, + "loss": 0.71539176, + "num_input_tokens_seen": 166996140, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.13330078, + "step": 7778, + "time_per_iteration": 3.875150680541992 + }, + { + "auxiliary_loss_clip": 0.01135371, + "auxiliary_loss_mlp": 0.0103628, + "balance_loss_clip": 1.05542397, + "balance_loss_mlp": 1.02182579, + "epoch": 0.4676987825041335, + "flos": 25483720343040.0, + "grad_norm": 1.6123239137747365, + "language_loss": 0.73745656, + "learning_rate": 2.305115506191206e-06, + "loss": 0.7591731, + "num_input_tokens_seen": 167016105, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.14447021, + "step": 7779, + "time_per_iteration": 2.5232274532318115 + }, + { + "auxiliary_loss_clip": 0.01123142, + "auxiliary_loss_mlp": 0.01037174, + "balance_loss_clip": 1.04642844, + "balance_loss_mlp": 1.02232099, + "epoch": 0.46775890575680146, + "flos": 21945298538880.0, + "grad_norm": 1.5172704638328973, + "language_loss": 0.72464681, + "learning_rate": 2.304730597548562e-06, + "loss": 0.74624997, + "num_input_tokens_seen": 167036185, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.14862061, + "step": 7780, + "time_per_iteration": 2.501342296600342 + }, + { + "auxiliary_loss_clip": 0.0113807, + "auxiliary_loss_mlp": 0.01039291, + "balance_loss_clip": 1.05406201, + "balance_loss_mlp": 1.02474785, + "epoch": 0.4678190290094694, + "flos": 25228395492480.0, + "grad_norm": 2.09496253203441, + "language_loss": 0.74185181, + "learning_rate": 2.3043456773505023e-06, + "loss": 0.76362544, + "num_input_tokens_seen": 167054515, + "router_z_loss_clip": 0.84082031, + "router_z_loss_mlp": 0.14532471, + "step": 7781, + "time_per_iteration": 2.535989284515381 + }, + { + "auxiliary_loss_clip": 0.01130206, + "auxiliary_loss_mlp": 0.01036858, + "balance_loss_clip": 1.04753149, + "balance_loss_mlp": 1.0225054, + "epoch": 0.4678791522621374, + "flos": 32268358811520.0, + "grad_norm": 1.937542283055191, + "language_loss": 0.63150167, + "learning_rate": 2.3039607456116252e-06, + "loss": 0.65317225, + "num_input_tokens_seen": 167077245, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.14349365, + "step": 7782, + "time_per_iteration": 2.5708730220794678 + }, + { + "auxiliary_loss_clip": 0.01137639, + "auxiliary_loss_mlp": 0.01042339, + "balance_loss_clip": 1.05381131, + "balance_loss_mlp": 1.0284512, + "epoch": 0.46793927551480535, + "flos": 27046480337280.0, + "grad_norm": 1.8041241388283367, + "language_loss": 0.63550204, + "learning_rate": 2.3035758023465254e-06, + "loss": 0.6573019, + "num_input_tokens_seen": 167097235, + "router_z_loss_clip": 0.83789062, + "router_z_loss_mlp": 0.13885498, + "step": 7783, + "time_per_iteration": 2.582348346710205 + }, + { + "auxiliary_loss_clip": 0.01134443, + "auxiliary_loss_mlp": 0.01040168, + "balance_loss_clip": 1.05123365, + "balance_loss_mlp": 1.02492106, + "epoch": 0.4679993987674733, + "flos": 17457398576640.0, + "grad_norm": 2.5266900145660607, + "language_loss": 0.67957318, + "learning_rate": 2.303190847569801e-06, + "loss": 0.70131928, + "num_input_tokens_seen": 167113155, + "router_z_loss_clip": 0.83300781, + "router_z_loss_mlp": 0.15246582, + "step": 7784, + "time_per_iteration": 2.453446626663208 + }, + { + "auxiliary_loss_clip": 0.01129289, + "auxiliary_loss_mlp": 0.01031615, + "balance_loss_clip": 1.05136704, + "balance_loss_mlp": 1.01921177, + "epoch": 0.4680595220201413, + "flos": 17165121609600.0, + "grad_norm": 1.8938512152405964, + "language_loss": 0.84395123, + "learning_rate": 2.3028058812960497e-06, + "loss": 0.86556029, + "num_input_tokens_seen": 167131765, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.12402344, + "step": 7785, + "time_per_iteration": 3.870110511779785 + }, + { + "auxiliary_loss_clip": 0.011296, + "auxiliary_loss_mlp": 0.01044317, + "balance_loss_clip": 1.04856765, + "balance_loss_mlp": 1.02849841, + "epoch": 0.46811964527280925, + "flos": 11327591001600.0, + "grad_norm": 1.7710906358530178, + "language_loss": 0.77459031, + "learning_rate": 2.3024209035398678e-06, + "loss": 0.7963295, + "num_input_tokens_seen": 167149030, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.15808105, + "step": 7786, + "time_per_iteration": 2.5175893306732178 + }, + { + "auxiliary_loss_clip": 0.01129, + "auxiliary_loss_mlp": 0.01029028, + "balance_loss_clip": 1.05051398, + "balance_loss_mlp": 1.0162847, + "epoch": 0.4681797685254772, + "flos": 24278809593600.0, + "grad_norm": 1.9539206859843479, + "language_loss": 0.73849964, + "learning_rate": 2.302035914315856e-06, + "loss": 0.76007986, + "num_input_tokens_seen": 167167375, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.12762451, + "step": 7787, + "time_per_iteration": 2.463020086288452 + }, + { + "auxiliary_loss_clip": 0.01137278, + "auxiliary_loss_mlp": 0.01035452, + "balance_loss_clip": 1.05797625, + "balance_loss_mlp": 1.0217011, + "epoch": 0.4682398917781452, + "flos": 31650372293760.0, + "grad_norm": 1.6094110257157732, + "language_loss": 0.65587687, + "learning_rate": 2.3016509136386116e-06, + "loss": 0.67760414, + "num_input_tokens_seen": 167188065, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.13751221, + "step": 7788, + "time_per_iteration": 2.579493761062622 + }, + { + "auxiliary_loss_clip": 0.01127264, + "auxiliary_loss_mlp": 0.01030201, + "balance_loss_clip": 1.04993927, + "balance_loss_mlp": 1.01835132, + "epoch": 0.46830001503081314, + "flos": 28110765340800.0, + "grad_norm": 1.5930511868331774, + "language_loss": 0.63329601, + "learning_rate": 2.3012659015227343e-06, + "loss": 0.65487063, + "num_input_tokens_seen": 167209675, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.11853027, + "step": 7789, + "time_per_iteration": 2.4889330863952637 + }, + { + "auxiliary_loss_clip": 0.01075796, + "auxiliary_loss_mlp": 0.01010079, + "balance_loss_clip": 1.04878998, + "balance_loss_mlp": 1.00847268, + "epoch": 0.4683601382834811, + "flos": 57881718316800.0, + "grad_norm": 0.6957131803571732, + "language_loss": 0.61876798, + "learning_rate": 2.300880877982825e-06, + "loss": 0.63962674, + "num_input_tokens_seen": 167273940, + "router_z_loss_clip": 0.27001953, + "router_z_loss_mlp": 0.01605225, + "step": 7790, + "time_per_iteration": 3.157376527786255 + }, + { + "auxiliary_loss_clip": 0.01127144, + "auxiliary_loss_mlp": 0.01031686, + "balance_loss_clip": 1.05154335, + "balance_loss_mlp": 1.01918709, + "epoch": 0.46842026153614913, + "flos": 21871933009920.0, + "grad_norm": 1.6956803238437108, + "language_loss": 0.79578412, + "learning_rate": 2.3004958430334808e-06, + "loss": 0.81737244, + "num_input_tokens_seen": 167292730, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.125, + "step": 7791, + "time_per_iteration": 2.454620838165283 + }, + { + "auxiliary_loss_clip": 0.01140261, + "auxiliary_loss_mlp": 0.01035378, + "balance_loss_clip": 1.05951166, + "balance_loss_mlp": 1.02234232, + "epoch": 0.4684803847888171, + "flos": 24900818434560.0, + "grad_norm": 1.6174398310914906, + "language_loss": 0.75393963, + "learning_rate": 2.3001107966893052e-06, + "loss": 0.77569598, + "num_input_tokens_seen": 167313460, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.1305542, + "step": 7792, + "time_per_iteration": 2.5184097290039062 + }, + { + "auxiliary_loss_clip": 0.01124374, + "auxiliary_loss_mlp": 0.01029633, + "balance_loss_clip": 1.04803348, + "balance_loss_mlp": 1.0175631, + "epoch": 0.46854050804148506, + "flos": 26251670142720.0, + "grad_norm": 1.5033764917176273, + "language_loss": 0.67957765, + "learning_rate": 2.299725738964898e-06, + "loss": 0.70111775, + "num_input_tokens_seen": 167335385, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.12072754, + "step": 7793, + "time_per_iteration": 2.531309127807617 + }, + { + "auxiliary_loss_clip": 0.01132918, + "auxiliary_loss_mlp": 0.01032214, + "balance_loss_clip": 1.05636573, + "balance_loss_mlp": 1.01977503, + "epoch": 0.468600631294153, + "flos": 21579799697280.0, + "grad_norm": 1.5603178805640712, + "language_loss": 0.74039447, + "learning_rate": 2.2993406698748607e-06, + "loss": 0.7620458, + "num_input_tokens_seen": 167353625, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12451172, + "step": 7794, + "time_per_iteration": 2.515052080154419 + }, + { + "auxiliary_loss_clip": 0.01126684, + "auxiliary_loss_mlp": 0.01042012, + "balance_loss_clip": 1.0479852, + "balance_loss_mlp": 1.02823734, + "epoch": 0.468660754546821, + "flos": 25885632597120.0, + "grad_norm": 1.5789643084236173, + "language_loss": 0.63606334, + "learning_rate": 2.2989555894337953e-06, + "loss": 0.65775031, + "num_input_tokens_seen": 167374565, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.13763428, + "step": 7795, + "time_per_iteration": 2.5699167251586914 + }, + { + "auxiliary_loss_clip": 0.01122207, + "auxiliary_loss_mlp": 0.0103567, + "balance_loss_clip": 1.04667342, + "balance_loss_mlp": 1.02084076, + "epoch": 0.46872087779948896, + "flos": 35475001666560.0, + "grad_norm": 1.755775876807329, + "language_loss": 0.68258721, + "learning_rate": 2.298570497656304e-06, + "loss": 0.70416605, + "num_input_tokens_seen": 167395010, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.14819336, + "step": 7796, + "time_per_iteration": 2.614427328109741 + }, + { + "auxiliary_loss_clip": 0.01129738, + "auxiliary_loss_mlp": 0.01033458, + "balance_loss_clip": 1.05047774, + "balance_loss_mlp": 1.02069652, + "epoch": 0.4687810010521569, + "flos": 26396425952640.0, + "grad_norm": 1.8286002412916442, + "language_loss": 0.7021361, + "learning_rate": 2.2981853945569894e-06, + "loss": 0.72376806, + "num_input_tokens_seen": 167415285, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.12744141, + "step": 7797, + "time_per_iteration": 2.555511474609375 + }, + { + "auxiliary_loss_clip": 0.01138547, + "auxiliary_loss_mlp": 0.01037666, + "balance_loss_clip": 1.05802059, + "balance_loss_mlp": 1.02352214, + "epoch": 0.4688411243048249, + "flos": 19972761212160.0, + "grad_norm": 2.0308701555456663, + "language_loss": 0.67256719, + "learning_rate": 2.297800280150454e-06, + "loss": 0.69432938, + "num_input_tokens_seen": 167432405, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.14141846, + "step": 7798, + "time_per_iteration": 2.4404070377349854 + }, + { + "auxiliary_loss_clip": 0.01067173, + "auxiliary_loss_mlp": 0.0100683, + "balance_loss_clip": 1.0395174, + "balance_loss_mlp": 1.00518095, + "epoch": 0.46890124755749285, + "flos": 63977015900160.0, + "grad_norm": 0.9345422738747255, + "language_loss": 0.64553773, + "learning_rate": 2.2974151544513033e-06, + "loss": 0.66627777, + "num_input_tokens_seen": 167499365, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.01652527, + "step": 7799, + "time_per_iteration": 3.3258402347564697 + }, + { + "auxiliary_loss_clip": 0.01130097, + "auxiliary_loss_mlp": 0.01029412, + "balance_loss_clip": 1.05055237, + "balance_loss_mlp": 1.01709175, + "epoch": 0.4689613708101608, + "flos": 23768985905280.0, + "grad_norm": 1.469725209788213, + "language_loss": 0.72096586, + "learning_rate": 2.2970300174741395e-06, + "loss": 0.74256098, + "num_input_tokens_seen": 167520390, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.12329102, + "step": 7800, + "time_per_iteration": 2.5090112686157227 + }, + { + "auxiliary_loss_clip": 0.01129079, + "auxiliary_loss_mlp": 0.01033388, + "balance_loss_clip": 1.05316913, + "balance_loss_mlp": 1.02206945, + "epoch": 0.4690214940628288, + "flos": 24788705109120.0, + "grad_norm": 1.7891360118547766, + "language_loss": 0.72819662, + "learning_rate": 2.296644869233568e-06, + "loss": 0.74982131, + "num_input_tokens_seen": 167539865, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.11322021, + "step": 7801, + "time_per_iteration": 3.9237186908721924 + }, + { + "auxiliary_loss_clip": 0.01137222, + "auxiliary_loss_mlp": 0.01037349, + "balance_loss_clip": 1.05377781, + "balance_loss_mlp": 1.02319932, + "epoch": 0.46908161731549675, + "flos": 18077324428800.0, + "grad_norm": 2.009446562524778, + "language_loss": 0.62884641, + "learning_rate": 2.2962597097441936e-06, + "loss": 0.65059209, + "num_input_tokens_seen": 167558190, + "router_z_loss_clip": 0.83300781, + "router_z_loss_mlp": 0.14160156, + "step": 7802, + "time_per_iteration": 2.4321959018707275 + }, + { + "auxiliary_loss_clip": 0.01133814, + "auxiliary_loss_mlp": 0.01037526, + "balance_loss_clip": 1.05397511, + "balance_loss_mlp": 1.02510417, + "epoch": 0.4691417405681647, + "flos": 25703350053120.0, + "grad_norm": 1.9489582744200784, + "language_loss": 0.73667109, + "learning_rate": 2.2958745390206206e-06, + "loss": 0.75838453, + "num_input_tokens_seen": 167577685, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.12420654, + "step": 7803, + "time_per_iteration": 2.524913787841797 + }, + { + "auxiliary_loss_clip": 0.01132068, + "auxiliary_loss_mlp": 0.01038176, + "balance_loss_clip": 1.05351388, + "balance_loss_mlp": 1.02562928, + "epoch": 0.46920186382083273, + "flos": 17457039440640.0, + "grad_norm": 1.7172509231585964, + "language_loss": 0.77815974, + "learning_rate": 2.2954893570774558e-06, + "loss": 0.79986215, + "num_input_tokens_seen": 167596390, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.12548828, + "step": 7804, + "time_per_iteration": 2.4280316829681396 + }, + { + "auxiliary_loss_clip": 0.01139289, + "auxiliary_loss_mlp": 0.01029865, + "balance_loss_clip": 1.06103885, + "balance_loss_mlp": 1.01744902, + "epoch": 0.4692619870735007, + "flos": 20339445202560.0, + "grad_norm": 1.8362169698948054, + "language_loss": 0.76930964, + "learning_rate": 2.295104163929305e-06, + "loss": 0.7910012, + "num_input_tokens_seen": 167614980, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.12414551, + "step": 7805, + "time_per_iteration": 2.5403668880462646 + }, + { + "auxiliary_loss_clip": 0.01152197, + "auxiliary_loss_mlp": 0.01041784, + "balance_loss_clip": 1.06663752, + "balance_loss_mlp": 1.0275979, + "epoch": 0.46932211032616866, + "flos": 29496558003840.0, + "grad_norm": 1.6529382142267741, + "language_loss": 0.8276608, + "learning_rate": 2.2947189595907742e-06, + "loss": 0.84960055, + "num_input_tokens_seen": 167635895, + "router_z_loss_clip": 0.85546875, + "router_z_loss_mlp": 0.1418457, + "step": 7806, + "time_per_iteration": 2.5303022861480713 + }, + { + "auxiliary_loss_clip": 0.01133184, + "auxiliary_loss_mlp": 0.01038007, + "balance_loss_clip": 1.05373609, + "balance_loss_mlp": 1.02445865, + "epoch": 0.4693822335788366, + "flos": 36211242735360.0, + "grad_norm": 2.7981596760620966, + "language_loss": 0.77254057, + "learning_rate": 2.294333744076472e-06, + "loss": 0.79425251, + "num_input_tokens_seen": 167657440, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.13525391, + "step": 7807, + "time_per_iteration": 2.6424076557159424 + }, + { + "auxiliary_loss_clip": 0.01142299, + "auxiliary_loss_mlp": 0.01035149, + "balance_loss_clip": 1.06420004, + "balance_loss_mlp": 1.02139223, + "epoch": 0.4694423568315046, + "flos": 20338978325760.0, + "grad_norm": 1.9063116019826933, + "language_loss": 0.52156448, + "learning_rate": 2.2939485174010035e-06, + "loss": 0.54333895, + "num_input_tokens_seen": 167675025, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.13757324, + "step": 7808, + "time_per_iteration": 2.448960304260254 + }, + { + "auxiliary_loss_clip": 0.01081796, + "auxiliary_loss_mlp": 0.01010867, + "balance_loss_clip": 1.05456793, + "balance_loss_mlp": 1.00904202, + "epoch": 0.46950248008417256, + "flos": 64326353621760.0, + "grad_norm": 0.7874787341615349, + "language_loss": 0.57673019, + "learning_rate": 2.293563279578978e-06, + "loss": 0.59765685, + "num_input_tokens_seen": 167729635, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.01826477, + "step": 7809, + "time_per_iteration": 3.0445003509521484 + }, + { + "auxiliary_loss_clip": 0.01142267, + "auxiliary_loss_mlp": 0.01038086, + "balance_loss_clip": 1.06047368, + "balance_loss_mlp": 1.02503252, + "epoch": 0.4695626033368405, + "flos": 19200106730880.0, + "grad_norm": 1.8926269532902724, + "language_loss": 0.7123127, + "learning_rate": 2.2931780306250045e-06, + "loss": 0.73411626, + "num_input_tokens_seen": 167745135, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.1305542, + "step": 7810, + "time_per_iteration": 2.4909300804138184 + }, + { + "auxiliary_loss_clip": 0.0113971, + "auxiliary_loss_mlp": 0.01036721, + "balance_loss_clip": 1.05890799, + "balance_loss_mlp": 1.02305365, + "epoch": 0.4696227265895085, + "flos": 23002436736000.0, + "grad_norm": 1.9810204637468363, + "language_loss": 0.81006742, + "learning_rate": 2.29279277055369e-06, + "loss": 0.83183175, + "num_input_tokens_seen": 167763875, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.13665771, + "step": 7811, + "time_per_iteration": 2.4741597175598145 + }, + { + "auxiliary_loss_clip": 0.01141719, + "auxiliary_loss_mlp": 0.01036392, + "balance_loss_clip": 1.06090379, + "balance_loss_mlp": 1.02226615, + "epoch": 0.46968284984217645, + "flos": 21870855601920.0, + "grad_norm": 1.6127211293360217, + "language_loss": 0.80641532, + "learning_rate": 2.292407499379644e-06, + "loss": 0.82819641, + "num_input_tokens_seen": 167784895, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.14123535, + "step": 7812, + "time_per_iteration": 3.88179087638855 + }, + { + "auxiliary_loss_clip": 0.01131411, + "auxiliary_loss_mlp": 0.01031037, + "balance_loss_clip": 1.05386996, + "balance_loss_mlp": 1.01863313, + "epoch": 0.4697429730948444, + "flos": 19974987855360.0, + "grad_norm": 1.919034131392965, + "language_loss": 0.74115503, + "learning_rate": 2.292022217117477e-06, + "loss": 0.76277947, + "num_input_tokens_seen": 167803185, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12408447, + "step": 7813, + "time_per_iteration": 2.4487993717193604 + }, + { + "auxiliary_loss_clip": 0.01135047, + "auxiliary_loss_mlp": 0.01031148, + "balance_loss_clip": 1.05714583, + "balance_loss_mlp": 1.01789796, + "epoch": 0.4698030963475124, + "flos": 15156206784000.0, + "grad_norm": 2.479718702022034, + "language_loss": 0.84808874, + "learning_rate": 2.291636923781798e-06, + "loss": 0.86975074, + "num_input_tokens_seen": 167816550, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.13256836, + "step": 7814, + "time_per_iteration": 2.486905336380005 + }, + { + "auxiliary_loss_clip": 0.01124607, + "auxiliary_loss_mlp": 0.0103859, + "balance_loss_clip": 1.04887795, + "balance_loss_mlp": 1.02556062, + "epoch": 0.46986321960018035, + "flos": 15151178880000.0, + "grad_norm": 2.106544944113027, + "language_loss": 0.81749117, + "learning_rate": 2.291251619387217e-06, + "loss": 0.83912319, + "num_input_tokens_seen": 167831845, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.13031006, + "step": 7815, + "time_per_iteration": 2.4109528064727783 + }, + { + "auxiliary_loss_clip": 0.01134753, + "auxiliary_loss_mlp": 0.01035482, + "balance_loss_clip": 1.0565809, + "balance_loss_mlp": 1.02160597, + "epoch": 0.4699233428528483, + "flos": 23108911626240.0, + "grad_norm": 3.5848393855893605, + "language_loss": 0.77366281, + "learning_rate": 2.2908663039483468e-06, + "loss": 0.7953651, + "num_input_tokens_seen": 167850360, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.13879395, + "step": 7816, + "time_per_iteration": 2.5121688842773438 + }, + { + "auxiliary_loss_clip": 0.01064527, + "auxiliary_loss_mlp": 0.0100301, + "balance_loss_clip": 1.03765655, + "balance_loss_mlp": 1.00153184, + "epoch": 0.46998346610551633, + "flos": 68105558246400.0, + "grad_norm": 0.8581742217239718, + "language_loss": 0.59062302, + "learning_rate": 2.290480977479796e-06, + "loss": 0.61129838, + "num_input_tokens_seen": 167908660, + "router_z_loss_clip": 0.26855469, + "router_z_loss_mlp": 0.01478577, + "step": 7817, + "time_per_iteration": 3.083564043045044 + }, + { + "auxiliary_loss_clip": 0.01124998, + "auxiliary_loss_mlp": 0.01035009, + "balance_loss_clip": 1.04898918, + "balance_loss_mlp": 1.02196169, + "epoch": 0.4700435893581843, + "flos": 24129456842880.0, + "grad_norm": 1.8993224087595781, + "language_loss": 0.79406106, + "learning_rate": 2.2900956399961775e-06, + "loss": 0.81566113, + "num_input_tokens_seen": 167927905, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.13031006, + "step": 7818, + "time_per_iteration": 2.4757862091064453 + }, + { + "auxiliary_loss_clip": 0.01135075, + "auxiliary_loss_mlp": 0.01034191, + "balance_loss_clip": 1.05611157, + "balance_loss_mlp": 1.02127457, + "epoch": 0.47010371261085226, + "flos": 20150518642560.0, + "grad_norm": 1.812412184212999, + "language_loss": 0.84271103, + "learning_rate": 2.289710291512104e-06, + "loss": 0.86440367, + "num_input_tokens_seen": 167945995, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.12921143, + "step": 7819, + "time_per_iteration": 2.4853084087371826 + }, + { + "auxiliary_loss_clip": 0.0113057, + "auxiliary_loss_mlp": 0.0103861, + "balance_loss_clip": 1.05004334, + "balance_loss_mlp": 1.02423942, + "epoch": 0.47016383586352023, + "flos": 15122199582720.0, + "grad_norm": 3.313078477725511, + "language_loss": 0.76636642, + "learning_rate": 2.289324932042186e-06, + "loss": 0.78805828, + "num_input_tokens_seen": 167963380, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.14373779, + "step": 7820, + "time_per_iteration": 2.548807144165039 + }, + { + "auxiliary_loss_clip": 0.01122665, + "auxiliary_loss_mlp": 0.01038478, + "balance_loss_clip": 1.04815769, + "balance_loss_mlp": 1.02472723, + "epoch": 0.4702239591161882, + "flos": 13552975140480.0, + "grad_norm": 2.198851465431297, + "language_loss": 0.74728566, + "learning_rate": 2.288939561601039e-06, + "loss": 0.76889706, + "num_input_tokens_seen": 167981740, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.13763428, + "step": 7821, + "time_per_iteration": 3.9779000282287598 + }, + { + "auxiliary_loss_clip": 0.01132252, + "auxiliary_loss_mlp": 0.0104123, + "balance_loss_clip": 1.05515671, + "balance_loss_mlp": 1.02880788, + "epoch": 0.47028408236885616, + "flos": 24276511123200.0, + "grad_norm": 2.0591002238522234, + "language_loss": 0.89083302, + "learning_rate": 2.2885541802032746e-06, + "loss": 0.91256785, + "num_input_tokens_seen": 167999380, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.12414551, + "step": 7822, + "time_per_iteration": 2.516549825668335 + }, + { + "auxiliary_loss_clip": 0.0112547, + "auxiliary_loss_mlp": 0.01033537, + "balance_loss_clip": 1.04856586, + "balance_loss_mlp": 1.02106786, + "epoch": 0.4703442056215241, + "flos": 22856926740480.0, + "grad_norm": 1.7416091824541946, + "language_loss": 0.80021262, + "learning_rate": 2.2881687878635055e-06, + "loss": 0.82180274, + "num_input_tokens_seen": 168018395, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.12457275, + "step": 7823, + "time_per_iteration": 2.5230391025543213 + }, + { + "auxiliary_loss_clip": 0.01077682, + "auxiliary_loss_mlp": 0.01004212, + "balance_loss_clip": 1.04921794, + "balance_loss_mlp": 1.00251877, + "epoch": 0.4704043288741921, + "flos": 69240227950080.0, + "grad_norm": 0.9640813627212136, + "language_loss": 0.56688797, + "learning_rate": 2.2877833845963487e-06, + "loss": 0.58770686, + "num_input_tokens_seen": 168084080, + "router_z_loss_clip": 0.28466797, + "router_z_loss_mlp": 0.016922, + "step": 7824, + "time_per_iteration": 3.1594526767730713 + }, + { + "auxiliary_loss_clip": 0.01129183, + "auxiliary_loss_mlp": 0.01039731, + "balance_loss_clip": 1.04969049, + "balance_loss_mlp": 1.02528262, + "epoch": 0.47046445212686006, + "flos": 18041090584320.0, + "grad_norm": 1.6994710300992326, + "language_loss": 0.81596339, + "learning_rate": 2.2873979704164157e-06, + "loss": 0.83765256, + "num_input_tokens_seen": 168101555, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.14453125, + "step": 7825, + "time_per_iteration": 2.4947502613067627 + }, + { + "auxiliary_loss_clip": 0.01134891, + "auxiliary_loss_mlp": 0.01033562, + "balance_loss_clip": 1.05570149, + "balance_loss_mlp": 1.01986504, + "epoch": 0.470524575379528, + "flos": 23951448017280.0, + "grad_norm": 1.666680147948987, + "language_loss": 0.66571808, + "learning_rate": 2.287012545338324e-06, + "loss": 0.68740261, + "num_input_tokens_seen": 168121530, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.13702393, + "step": 7826, + "time_per_iteration": 2.4851202964782715 + }, + { + "auxiliary_loss_clip": 0.01131418, + "auxiliary_loss_mlp": 0.01040836, + "balance_loss_clip": 1.05246878, + "balance_loss_mlp": 1.02703738, + "epoch": 0.470584698632196, + "flos": 18113558273280.0, + "grad_norm": 2.121198512934346, + "language_loss": 0.83932221, + "learning_rate": 2.2866271093766877e-06, + "loss": 0.86104476, + "num_input_tokens_seen": 168140335, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.13812256, + "step": 7827, + "time_per_iteration": 2.4893958568573 + }, + { + "auxiliary_loss_clip": 0.0106283, + "auxiliary_loss_mlp": 0.01002627, + "balance_loss_clip": 1.03625059, + "balance_loss_mlp": 1.00099826, + "epoch": 0.47064482188486395, + "flos": 57251916224640.0, + "grad_norm": 0.8053227116509419, + "language_loss": 0.55675995, + "learning_rate": 2.286241662546122e-06, + "loss": 0.57741451, + "num_input_tokens_seen": 168200535, + "router_z_loss_clip": 0.26660156, + "router_z_loss_mlp": 0.01628113, + "step": 7828, + "time_per_iteration": 3.0997071266174316 + }, + { + "auxiliary_loss_clip": 0.01129505, + "auxiliary_loss_mlp": 0.0103248, + "balance_loss_clip": 1.05187964, + "balance_loss_mlp": 1.01983798, + "epoch": 0.4707049451375319, + "flos": 17895077798400.0, + "grad_norm": 1.7230860080475188, + "language_loss": 0.80716437, + "learning_rate": 2.285856204861245e-06, + "loss": 0.82878423, + "num_input_tokens_seen": 168219610, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.12646484, + "step": 7829, + "time_per_iteration": 3.8688273429870605 + }, + { + "auxiliary_loss_clip": 0.01133178, + "auxiliary_loss_mlp": 0.01034239, + "balance_loss_clip": 1.05525982, + "balance_loss_mlp": 1.02168047, + "epoch": 0.47076506839019994, + "flos": 25232669210880.0, + "grad_norm": 1.3391698045617784, + "language_loss": 0.76109087, + "learning_rate": 2.2854707363366703e-06, + "loss": 0.78276503, + "num_input_tokens_seen": 168242505, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.12561035, + "step": 7830, + "time_per_iteration": 2.6060092449188232 + }, + { + "auxiliary_loss_clip": 0.01128716, + "auxiliary_loss_mlp": 0.0103159, + "balance_loss_clip": 1.05287743, + "balance_loss_mlp": 1.01808357, + "epoch": 0.4708251916428679, + "flos": 13479681438720.0, + "grad_norm": 2.0941306596500735, + "language_loss": 0.78943086, + "learning_rate": 2.2850852569870177e-06, + "loss": 0.81103396, + "num_input_tokens_seen": 168260220, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.1348877, + "step": 7831, + "time_per_iteration": 2.4529852867126465 + }, + { + "auxiliary_loss_clip": 0.01132332, + "auxiliary_loss_mlp": 0.010356, + "balance_loss_clip": 1.05058086, + "balance_loss_mlp": 1.02147365, + "epoch": 0.47088531489553587, + "flos": 30147833450880.0, + "grad_norm": 1.879183561367225, + "language_loss": 0.75752956, + "learning_rate": 2.2846997668269033e-06, + "loss": 0.7792089, + "num_input_tokens_seen": 168277360, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.14135742, + "step": 7832, + "time_per_iteration": 2.611654281616211 + }, + { + "auxiliary_loss_clip": 0.01124783, + "auxiliary_loss_mlp": 0.01026276, + "balance_loss_clip": 1.04848456, + "balance_loss_mlp": 1.01414013, + "epoch": 0.47094543814820383, + "flos": 21798280172160.0, + "grad_norm": 1.72738132553965, + "language_loss": 0.74907464, + "learning_rate": 2.2843142658709454e-06, + "loss": 0.7705853, + "num_input_tokens_seen": 168296605, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.12139893, + "step": 7833, + "time_per_iteration": 2.4940826892852783 + }, + { + "auxiliary_loss_clip": 0.01125188, + "auxiliary_loss_mlp": 0.01035144, + "balance_loss_clip": 1.04759479, + "balance_loss_mlp": 1.02176917, + "epoch": 0.4710055614008718, + "flos": 23003011353600.0, + "grad_norm": 1.5943629759036242, + "language_loss": 0.7597931, + "learning_rate": 2.283928754133762e-06, + "loss": 0.78139639, + "num_input_tokens_seen": 168316205, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.13372803, + "step": 7834, + "time_per_iteration": 2.507007360458374 + }, + { + "auxiliary_loss_clip": 0.01126807, + "auxiliary_loss_mlp": 0.01034498, + "balance_loss_clip": 1.04999864, + "balance_loss_mlp": 1.02194512, + "epoch": 0.47106568465353976, + "flos": 42741346452480.0, + "grad_norm": 1.3376351780100906, + "language_loss": 0.66119576, + "learning_rate": 2.283543231629972e-06, + "loss": 0.68280882, + "num_input_tokens_seen": 168338935, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.12561035, + "step": 7835, + "time_per_iteration": 2.6205296516418457 + }, + { + "auxiliary_loss_clip": 0.01071264, + "auxiliary_loss_mlp": 0.01020013, + "balance_loss_clip": 1.04434967, + "balance_loss_mlp": 1.01857042, + "epoch": 0.4711258079062077, + "flos": 68554008570240.0, + "grad_norm": 0.8745835541444164, + "language_loss": 0.62162197, + "learning_rate": 2.283157698374194e-06, + "loss": 0.64253473, + "num_input_tokens_seen": 168392800, + "router_z_loss_clip": 0.26904297, + "router_z_loss_mlp": 0.01441956, + "step": 7836, + "time_per_iteration": 3.082822561264038 + }, + { + "auxiliary_loss_clip": 0.0113131, + "auxiliary_loss_mlp": 0.01035758, + "balance_loss_clip": 1.04899728, + "balance_loss_mlp": 1.02208495, + "epoch": 0.4711859311588757, + "flos": 25446588658560.0, + "grad_norm": 1.7572625286614136, + "language_loss": 0.6969015, + "learning_rate": 2.2827721543810475e-06, + "loss": 0.71857214, + "num_input_tokens_seen": 168412940, + "router_z_loss_clip": 0.82275391, + "router_z_loss_mlp": 0.13659668, + "step": 7837, + "time_per_iteration": 2.479844570159912 + }, + { + "auxiliary_loss_clip": 0.0112999, + "auxiliary_loss_mlp": 0.01036182, + "balance_loss_clip": 1.05173278, + "balance_loss_mlp": 1.02162075, + "epoch": 0.47124605441154366, + "flos": 21981891519360.0, + "grad_norm": 1.8807015457846354, + "language_loss": 0.66697824, + "learning_rate": 2.282386599665153e-06, + "loss": 0.68864, + "num_input_tokens_seen": 168431995, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.14562988, + "step": 7838, + "time_per_iteration": 2.500142812728882 + }, + { + "auxiliary_loss_clip": 0.01121164, + "auxiliary_loss_mlp": 0.01029616, + "balance_loss_clip": 1.04137933, + "balance_loss_mlp": 1.01631272, + "epoch": 0.4713061776642116, + "flos": 25412689198080.0, + "grad_norm": 2.0438171461888337, + "language_loss": 0.77652639, + "learning_rate": 2.2820010342411304e-06, + "loss": 0.79803419, + "num_input_tokens_seen": 168454585, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.1328125, + "step": 7839, + "time_per_iteration": 2.5221283435821533 + }, + { + "auxiliary_loss_clip": 0.01122889, + "auxiliary_loss_mlp": 0.01033707, + "balance_loss_clip": 1.04791784, + "balance_loss_mlp": 1.02027774, + "epoch": 0.4713663009168796, + "flos": 26542259170560.0, + "grad_norm": 1.8098729868728665, + "language_loss": 0.72990996, + "learning_rate": 2.2816154581235993e-06, + "loss": 0.75147593, + "num_input_tokens_seen": 168471265, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.13415527, + "step": 7840, + "time_per_iteration": 2.5188353061676025 + }, + { + "auxiliary_loss_clip": 0.01125477, + "auxiliary_loss_mlp": 0.01029228, + "balance_loss_clip": 1.04803026, + "balance_loss_mlp": 1.01630592, + "epoch": 0.47142642416954755, + "flos": 23623583650560.0, + "grad_norm": 3.557613011070671, + "language_loss": 0.75343293, + "learning_rate": 2.2812298713271833e-06, + "loss": 0.77498001, + "num_input_tokens_seen": 168491360, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12915039, + "step": 7841, + "time_per_iteration": 2.4672367572784424 + }, + { + "auxiliary_loss_clip": 0.01120441, + "auxiliary_loss_mlp": 0.01032127, + "balance_loss_clip": 1.04210734, + "balance_loss_mlp": 1.01887691, + "epoch": 0.4714865474222155, + "flos": 22310150935680.0, + "grad_norm": 1.8624299795782184, + "language_loss": 0.70768511, + "learning_rate": 2.280844273866501e-06, + "loss": 0.72921079, + "num_input_tokens_seen": 168511335, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.13256836, + "step": 7842, + "time_per_iteration": 2.503603935241699 + }, + { + "auxiliary_loss_clip": 0.0113164, + "auxiliary_loss_mlp": 0.01033377, + "balance_loss_clip": 1.05460668, + "balance_loss_mlp": 1.02043688, + "epoch": 0.4715466706748835, + "flos": 17822430541440.0, + "grad_norm": 2.238485139971394, + "language_loss": 0.78707129, + "learning_rate": 2.280458665756177e-06, + "loss": 0.80872154, + "num_input_tokens_seen": 168529920, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.1295166, + "step": 7843, + "time_per_iteration": 2.5580639839172363 + }, + { + "auxiliary_loss_clip": 0.01130452, + "auxiliary_loss_mlp": 0.01029434, + "balance_loss_clip": 1.05336571, + "balance_loss_mlp": 1.01608276, + "epoch": 0.4716067939275515, + "flos": 23659530186240.0, + "grad_norm": 1.7315494142146983, + "language_loss": 0.74464577, + "learning_rate": 2.280073047010832e-06, + "loss": 0.76624465, + "num_input_tokens_seen": 168550595, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.13360596, + "step": 7844, + "time_per_iteration": 2.5152461528778076 + }, + { + "auxiliary_loss_clip": 0.01128141, + "auxiliary_loss_mlp": 0.0104124, + "balance_loss_clip": 1.04970932, + "balance_loss_mlp": 1.02775192, + "epoch": 0.47166691718021947, + "flos": 17930162407680.0, + "grad_norm": 1.4687498972918873, + "language_loss": 0.78271627, + "learning_rate": 2.279687417645088e-06, + "loss": 0.8044101, + "num_input_tokens_seen": 168569765, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.13500977, + "step": 7845, + "time_per_iteration": 3.9459068775177 + }, + { + "auxiliary_loss_clip": 0.0112571, + "auxiliary_loss_mlp": 0.01033227, + "balance_loss_clip": 1.0493803, + "balance_loss_mlp": 1.02072823, + "epoch": 0.47172704043288743, + "flos": 26614583205120.0, + "grad_norm": 1.4523849064448044, + "language_loss": 0.73076618, + "learning_rate": 2.2793017776735703e-06, + "loss": 0.75235558, + "num_input_tokens_seen": 168591525, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.125, + "step": 7846, + "time_per_iteration": 2.5992209911346436 + }, + { + "auxiliary_loss_clip": 0.01128301, + "auxiliary_loss_mlp": 0.01034425, + "balance_loss_clip": 1.05211449, + "balance_loss_mlp": 1.02175331, + "epoch": 0.4717871636855554, + "flos": 27922700707200.0, + "grad_norm": 1.373990987150029, + "language_loss": 0.74265593, + "learning_rate": 2.2789161271109e-06, + "loss": 0.76428318, + "num_input_tokens_seen": 168611235, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12677002, + "step": 7847, + "time_per_iteration": 2.591167449951172 + }, + { + "auxiliary_loss_clip": 0.01127154, + "auxiliary_loss_mlp": 0.01033286, + "balance_loss_clip": 1.05052924, + "balance_loss_mlp": 1.02073348, + "epoch": 0.47184728693822336, + "flos": 14502237816960.0, + "grad_norm": 1.797041235769581, + "language_loss": 0.80394948, + "learning_rate": 2.278530465971703e-06, + "loss": 0.82555389, + "num_input_tokens_seen": 168628710, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.12567139, + "step": 7848, + "time_per_iteration": 2.4663352966308594 + }, + { + "auxiliary_loss_clip": 0.0113466, + "auxiliary_loss_mlp": 0.01033227, + "balance_loss_clip": 1.05603373, + "balance_loss_mlp": 1.02044213, + "epoch": 0.47190741019089133, + "flos": 17856545483520.0, + "grad_norm": 1.9082705815759728, + "language_loss": 0.70348608, + "learning_rate": 2.2781447942706032e-06, + "loss": 0.72516495, + "num_input_tokens_seen": 168645645, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.12792969, + "step": 7849, + "time_per_iteration": 2.5000193119049072 + }, + { + "auxiliary_loss_clip": 0.01152591, + "auxiliary_loss_mlp": 0.01043932, + "balance_loss_clip": 1.06667578, + "balance_loss_mlp": 1.0290848, + "epoch": 0.4719675334435593, + "flos": 17895472848000.0, + "grad_norm": 2.450758982992539, + "language_loss": 0.69472772, + "learning_rate": 2.277759112022224e-06, + "loss": 0.71669292, + "num_input_tokens_seen": 168664165, + "router_z_loss_clip": 0.85888672, + "router_z_loss_mlp": 0.14862061, + "step": 7850, + "time_per_iteration": 2.450965166091919 + }, + { + "auxiliary_loss_clip": 0.01130095, + "auxiliary_loss_mlp": 0.01028655, + "balance_loss_clip": 1.05079627, + "balance_loss_mlp": 1.01520205, + "epoch": 0.47202765669622726, + "flos": 20704369426560.0, + "grad_norm": 2.073961200946647, + "language_loss": 0.746589, + "learning_rate": 2.2773734192411916e-06, + "loss": 0.7681765, + "num_input_tokens_seen": 168681940, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.13464355, + "step": 7851, + "time_per_iteration": 2.4783337116241455 + }, + { + "auxiliary_loss_clip": 0.01127464, + "auxiliary_loss_mlp": 0.01037835, + "balance_loss_clip": 1.04815698, + "balance_loss_mlp": 1.02365458, + "epoch": 0.4720877799488952, + "flos": 16360255607040.0, + "grad_norm": 2.4344440046238973, + "language_loss": 0.7636404, + "learning_rate": 2.276987715942132e-06, + "loss": 0.78529334, + "num_input_tokens_seen": 168698830, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.1418457, + "step": 7852, + "time_per_iteration": 2.4587607383728027 + }, + { + "auxiliary_loss_clip": 0.01126629, + "auxiliary_loss_mlp": 0.01037119, + "balance_loss_clip": 1.04970145, + "balance_loss_mlp": 1.02161622, + "epoch": 0.4721479032015632, + "flos": 20668171495680.0, + "grad_norm": 1.5426225634616362, + "language_loss": 0.68999058, + "learning_rate": 2.2766020021396696e-06, + "loss": 0.71162808, + "num_input_tokens_seen": 168718305, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.15496826, + "step": 7853, + "time_per_iteration": 2.5915088653564453 + }, + { + "auxiliary_loss_clip": 0.01061127, + "auxiliary_loss_mlp": 0.01008051, + "balance_loss_clip": 1.03490138, + "balance_loss_mlp": 1.00615859, + "epoch": 0.47220802645423116, + "flos": 67750438435200.0, + "grad_norm": 0.7414114948598739, + "language_loss": 0.50210482, + "learning_rate": 2.276216277848432e-06, + "loss": 0.52279657, + "num_input_tokens_seen": 168782365, + "router_z_loss_clip": 0.26220703, + "router_z_loss_mlp": 0.0189209, + "step": 7854, + "time_per_iteration": 3.1951744556427 + }, + { + "auxiliary_loss_clip": 0.01130484, + "auxiliary_loss_mlp": 0.01034254, + "balance_loss_clip": 1.0512991, + "balance_loss_mlp": 1.01954365, + "epoch": 0.4722681497068991, + "flos": 20921449271040.0, + "grad_norm": 2.0933427863166836, + "language_loss": 0.64509392, + "learning_rate": 2.2758305430830455e-06, + "loss": 0.66674131, + "num_input_tokens_seen": 168800485, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.1472168, + "step": 7855, + "time_per_iteration": 2.530653238296509 + }, + { + "auxiliary_loss_clip": 0.01128896, + "auxiliary_loss_mlp": 0.01038847, + "balance_loss_clip": 1.04982114, + "balance_loss_mlp": 1.02457809, + "epoch": 0.4723282729595671, + "flos": 28293083798400.0, + "grad_norm": 1.951974299685577, + "language_loss": 0.75649107, + "learning_rate": 2.2754447978581376e-06, + "loss": 0.7781685, + "num_input_tokens_seen": 168818965, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.1427002, + "step": 7856, + "time_per_iteration": 3.9502811431884766 + }, + { + "auxiliary_loss_clip": 0.01134539, + "auxiliary_loss_mlp": 0.01033504, + "balance_loss_clip": 1.05810356, + "balance_loss_mlp": 1.02116632, + "epoch": 0.4723883962122351, + "flos": 27125053338240.0, + "grad_norm": 1.6383427597048938, + "language_loss": 0.74751568, + "learning_rate": 2.2750590421883347e-06, + "loss": 0.76919609, + "num_input_tokens_seen": 168840355, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.12341309, + "step": 7857, + "time_per_iteration": 2.637410879135132 + }, + { + "auxiliary_loss_clip": 0.01130236, + "auxiliary_loss_mlp": 0.01045289, + "balance_loss_clip": 1.05180633, + "balance_loss_mlp": 1.03261054, + "epoch": 0.47244851946490307, + "flos": 31537253387520.0, + "grad_norm": 1.499395846637822, + "language_loss": 0.65086907, + "learning_rate": 2.2746732760882655e-06, + "loss": 0.67262435, + "num_input_tokens_seen": 168861765, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.12683105, + "step": 7858, + "time_per_iteration": 2.622072219848633 + }, + { + "auxiliary_loss_clip": 0.01126988, + "auxiliary_loss_mlp": 0.01031986, + "balance_loss_clip": 1.04999661, + "balance_loss_mlp": 1.01859903, + "epoch": 0.47250864271757104, + "flos": 20886544229760.0, + "grad_norm": 1.7025401453721394, + "language_loss": 0.70430392, + "learning_rate": 2.2742874995725575e-06, + "loss": 0.72589368, + "num_input_tokens_seen": 168881310, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.13397217, + "step": 7859, + "time_per_iteration": 2.4836323261260986 + }, + { + "auxiliary_loss_clip": 0.01132494, + "auxiliary_loss_mlp": 0.01037369, + "balance_loss_clip": 1.05059409, + "balance_loss_mlp": 1.0233326, + "epoch": 0.472568765970239, + "flos": 20522086882560.0, + "grad_norm": 1.8848308539257743, + "language_loss": 0.62386489, + "learning_rate": 2.2739017126558413e-06, + "loss": 0.64556354, + "num_input_tokens_seen": 168899470, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.14038086, + "step": 7860, + "time_per_iteration": 2.5210041999816895 + }, + { + "auxiliary_loss_clip": 0.01143483, + "auxiliary_loss_mlp": 0.01044098, + "balance_loss_clip": 1.06163859, + "balance_loss_mlp": 1.03040075, + "epoch": 0.47262888922290697, + "flos": 35805200417280.0, + "grad_norm": 2.3453410371627323, + "language_loss": 0.72033137, + "learning_rate": 2.2735159153527445e-06, + "loss": 0.74220717, + "num_input_tokens_seen": 168921495, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.13696289, + "step": 7861, + "time_per_iteration": 2.5846378803253174 + }, + { + "auxiliary_loss_clip": 0.01131017, + "auxiliary_loss_mlp": 0.01032439, + "balance_loss_clip": 1.05232227, + "balance_loss_mlp": 1.01924825, + "epoch": 0.47268901247557493, + "flos": 20667740532480.0, + "grad_norm": 1.8428783655170626, + "language_loss": 0.84939432, + "learning_rate": 2.273130107677896e-06, + "loss": 0.8710289, + "num_input_tokens_seen": 168940515, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.13195801, + "step": 7862, + "time_per_iteration": 2.482400894165039 + }, + { + "auxiliary_loss_clip": 0.01134382, + "auxiliary_loss_mlp": 0.01033058, + "balance_loss_clip": 1.05341721, + "balance_loss_mlp": 1.01964116, + "epoch": 0.4727491357282429, + "flos": 19573291082880.0, + "grad_norm": 2.111403388570806, + "language_loss": 0.84690994, + "learning_rate": 2.272744289645927e-06, + "loss": 0.86858433, + "num_input_tokens_seen": 168958340, + "router_z_loss_clip": 0.80957031, + "router_z_loss_mlp": 0.13433838, + "step": 7863, + "time_per_iteration": 2.452670097351074 + }, + { + "auxiliary_loss_clip": 0.01130411, + "auxiliary_loss_mlp": 0.01034742, + "balance_loss_clip": 1.05231309, + "balance_loss_mlp": 1.02161741, + "epoch": 0.47280925898091086, + "flos": 18217231902720.0, + "grad_norm": 1.9014890089721868, + "language_loss": 0.66047978, + "learning_rate": 2.272358461271467e-06, + "loss": 0.68213129, + "num_input_tokens_seen": 168974850, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.13122559, + "step": 7864, + "time_per_iteration": 3.8284687995910645 + }, + { + "auxiliary_loss_clip": 0.01131987, + "auxiliary_loss_mlp": 0.01030741, + "balance_loss_clip": 1.05157781, + "balance_loss_mlp": 1.01675165, + "epoch": 0.4728693822335788, + "flos": 17821820010240.0, + "grad_norm": 2.007609761385213, + "language_loss": 0.65596992, + "learning_rate": 2.271972622569147e-06, + "loss": 0.67759717, + "num_input_tokens_seen": 168992860, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.13983154, + "step": 7865, + "time_per_iteration": 2.491698741912842 + }, + { + "auxiliary_loss_clip": 0.01132047, + "auxiliary_loss_mlp": 0.01035258, + "balance_loss_clip": 1.05340123, + "balance_loss_mlp": 1.02144766, + "epoch": 0.4729295054862468, + "flos": 20595057361920.0, + "grad_norm": 1.6424406308984816, + "language_loss": 0.7420764, + "learning_rate": 2.2715867735535976e-06, + "loss": 0.76374948, + "num_input_tokens_seen": 169010325, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.13806152, + "step": 7866, + "time_per_iteration": 2.449172258377075 + }, + { + "auxiliary_loss_clip": 0.01133013, + "auxiliary_loss_mlp": 0.01029312, + "balance_loss_clip": 1.05246973, + "balance_loss_mlp": 1.01626432, + "epoch": 0.47298962873891476, + "flos": 23368079232000.0, + "grad_norm": 4.830303995820544, + "language_loss": 0.83263099, + "learning_rate": 2.271200914239451e-06, + "loss": 0.85425425, + "num_input_tokens_seen": 169029840, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.13037109, + "step": 7867, + "time_per_iteration": 2.557605266571045 + }, + { + "auxiliary_loss_clip": 0.01123788, + "auxiliary_loss_mlp": 0.01034444, + "balance_loss_clip": 1.04675436, + "balance_loss_mlp": 1.02151608, + "epoch": 0.4730497519915827, + "flos": 22052240305920.0, + "grad_norm": 2.0884893536433053, + "language_loss": 0.79215419, + "learning_rate": 2.2708150446413385e-06, + "loss": 0.81373656, + "num_input_tokens_seen": 169049975, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.1293335, + "step": 7868, + "time_per_iteration": 2.4727563858032227 + }, + { + "auxiliary_loss_clip": 0.01136992, + "auxiliary_loss_mlp": 0.01037659, + "balance_loss_clip": 1.0527482, + "balance_loss_mlp": 1.02333617, + "epoch": 0.4731098752442507, + "flos": 21069724613760.0, + "grad_norm": 1.7829029226427315, + "language_loss": 0.75086021, + "learning_rate": 2.2704291647738915e-06, + "loss": 0.77260673, + "num_input_tokens_seen": 169069540, + "router_z_loss_clip": 0.84326172, + "router_z_loss_mlp": 0.14318848, + "step": 7869, + "time_per_iteration": 2.480703353881836 + }, + { + "auxiliary_loss_clip": 0.01135371, + "auxiliary_loss_mlp": 0.01040745, + "balance_loss_clip": 1.05534244, + "balance_loss_mlp": 1.02582049, + "epoch": 0.4731699984969187, + "flos": 22528775064960.0, + "grad_norm": 1.6608607898466725, + "language_loss": 0.73587257, + "learning_rate": 2.2700432746517443e-06, + "loss": 0.75763375, + "num_input_tokens_seen": 169089940, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.14916992, + "step": 7870, + "time_per_iteration": 2.4886248111724854 + }, + { + "auxiliary_loss_clip": 0.01132603, + "auxiliary_loss_mlp": 0.01036094, + "balance_loss_clip": 1.05053663, + "balance_loss_mlp": 1.02138352, + "epoch": 0.4732301217495867, + "flos": 24898124914560.0, + "grad_norm": 2.036290924392077, + "language_loss": 0.81205696, + "learning_rate": 2.2696573742895292e-06, + "loss": 0.83374393, + "num_input_tokens_seen": 169109650, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.14709473, + "step": 7871, + "time_per_iteration": 2.581696033477783 + }, + { + "auxiliary_loss_clip": 0.01126901, + "auxiliary_loss_mlp": 0.010304, + "balance_loss_clip": 1.04844403, + "balance_loss_mlp": 1.01691723, + "epoch": 0.47329024500225464, + "flos": 22784423137920.0, + "grad_norm": 1.7973822331822136, + "language_loss": 0.7572611, + "learning_rate": 2.269271463701879e-06, + "loss": 0.7788341, + "num_input_tokens_seen": 169128990, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.13482666, + "step": 7872, + "time_per_iteration": 2.502791166305542 + }, + { + "auxiliary_loss_clip": 0.01123924, + "auxiliary_loss_mlp": 0.01034081, + "balance_loss_clip": 1.04518151, + "balance_loss_mlp": 1.01941824, + "epoch": 0.4733503682549226, + "flos": 38695902220800.0, + "grad_norm": 1.780505496713303, + "language_loss": 0.67668855, + "learning_rate": 2.268885542903428e-06, + "loss": 0.69826865, + "num_input_tokens_seen": 169154645, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.14660645, + "step": 7873, + "time_per_iteration": 4.05984354019165 + }, + { + "auxiliary_loss_clip": 0.01126712, + "auxiliary_loss_mlp": 0.01030515, + "balance_loss_clip": 1.0484637, + "balance_loss_mlp": 1.01713395, + "epoch": 0.47341049150759057, + "flos": 22966849336320.0, + "grad_norm": 1.5710232197261662, + "language_loss": 0.73080599, + "learning_rate": 2.26849961190881e-06, + "loss": 0.75237823, + "num_input_tokens_seen": 169174995, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.13378906, + "step": 7874, + "time_per_iteration": 2.462400197982788 + }, + { + "auxiliary_loss_clip": 0.01128093, + "auxiliary_loss_mlp": 0.01043107, + "balance_loss_clip": 1.04807401, + "balance_loss_mlp": 1.02769923, + "epoch": 0.47347061476025853, + "flos": 14538471661440.0, + "grad_norm": 2.170046007261939, + "language_loss": 0.65280032, + "learning_rate": 2.26811367073266e-06, + "loss": 0.67451233, + "num_input_tokens_seen": 169191815, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.1539917, + "step": 7875, + "time_per_iteration": 2.5314884185791016 + }, + { + "auxiliary_loss_clip": 0.01134708, + "auxiliary_loss_mlp": 0.01038397, + "balance_loss_clip": 1.05387247, + "balance_loss_mlp": 1.02379394, + "epoch": 0.4735307380129265, + "flos": 30263250827520.0, + "grad_norm": 2.082377784139141, + "language_loss": 0.80758733, + "learning_rate": 2.2677277193896125e-06, + "loss": 0.8293184, + "num_input_tokens_seen": 169210430, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.14605713, + "step": 7876, + "time_per_iteration": 2.5007002353668213 + }, + { + "auxiliary_loss_clip": 0.01136017, + "auxiliary_loss_mlp": 0.01035509, + "balance_loss_clip": 1.05582964, + "balance_loss_mlp": 1.02240157, + "epoch": 0.47359086126559446, + "flos": 19391044452480.0, + "grad_norm": 1.8351770566377097, + "language_loss": 0.78900218, + "learning_rate": 2.267341757894304e-06, + "loss": 0.81071746, + "num_input_tokens_seen": 169229295, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.13122559, + "step": 7877, + "time_per_iteration": 2.624290943145752 + }, + { + "auxiliary_loss_clip": 0.01131231, + "auxiliary_loss_mlp": 0.0103214, + "balance_loss_clip": 1.05350661, + "balance_loss_mlp": 1.0190984, + "epoch": 0.47365098451826243, + "flos": 21939408708480.0, + "grad_norm": 2.0431875111205082, + "language_loss": 0.71303368, + "learning_rate": 2.2669557862613685e-06, + "loss": 0.73466736, + "num_input_tokens_seen": 169247855, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.13043213, + "step": 7878, + "time_per_iteration": 2.4469876289367676 + }, + { + "auxiliary_loss_clip": 0.01124875, + "auxiliary_loss_mlp": 0.01036063, + "balance_loss_clip": 1.0484767, + "balance_loss_mlp": 1.02284908, + "epoch": 0.4737111077709304, + "flos": 25845053207040.0, + "grad_norm": 1.535161787538887, + "language_loss": 0.75192827, + "learning_rate": 2.2665698045054425e-06, + "loss": 0.77353764, + "num_input_tokens_seen": 169268860, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.13226318, + "step": 7879, + "time_per_iteration": 2.515838384628296 + }, + { + "auxiliary_loss_clip": 0.0108126, + "auxiliary_loss_mlp": 0.01004298, + "balance_loss_clip": 1.05432427, + "balance_loss_mlp": 1.00203657, + "epoch": 0.47377123102359836, + "flos": 67760886314880.0, + "grad_norm": 0.7321012248747809, + "language_loss": 0.61303389, + "learning_rate": 2.266183812641164e-06, + "loss": 0.6338895, + "num_input_tokens_seen": 169331855, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.02261353, + "step": 7880, + "time_per_iteration": 3.098357915878296 + }, + { + "auxiliary_loss_clip": 0.01127967, + "auxiliary_loss_mlp": 0.01038509, + "balance_loss_clip": 1.04896164, + "balance_loss_mlp": 1.02388203, + "epoch": 0.4738313542762663, + "flos": 24315977191680.0, + "grad_norm": 1.5019977539957512, + "language_loss": 0.67910826, + "learning_rate": 2.2657978106831675e-06, + "loss": 0.700773, + "num_input_tokens_seen": 169352175, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.14624023, + "step": 7881, + "time_per_iteration": 2.485511302947998 + }, + { + "auxiliary_loss_clip": 0.01141373, + "auxiliary_loss_mlp": 0.01031915, + "balance_loss_clip": 1.05939698, + "balance_loss_mlp": 1.01911831, + "epoch": 0.4738914775289343, + "flos": 20705339093760.0, + "grad_norm": 1.8277902024896058, + "language_loss": 0.77250218, + "learning_rate": 2.265411798646092e-06, + "loss": 0.79423505, + "num_input_tokens_seen": 169371215, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.12805176, + "step": 7882, + "time_per_iteration": 2.4368789196014404 + }, + { + "auxiliary_loss_clip": 0.01133415, + "auxiliary_loss_mlp": 0.0103396, + "balance_loss_clip": 1.05437851, + "balance_loss_mlp": 1.01971412, + "epoch": 0.4739516007816023, + "flos": 25446337263360.0, + "grad_norm": 1.4232505909061555, + "language_loss": 0.76186818, + "learning_rate": 2.2650257765445747e-06, + "loss": 0.78354192, + "num_input_tokens_seen": 169391745, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.14245605, + "step": 7883, + "time_per_iteration": 2.5198206901550293 + }, + { + "auxiliary_loss_clip": 0.01128263, + "auxiliary_loss_mlp": 0.01035215, + "balance_loss_clip": 1.04822826, + "balance_loss_mlp": 1.02209616, + "epoch": 0.4740117240342703, + "flos": 19974341410560.0, + "grad_norm": 1.7582859572209306, + "language_loss": 0.71994746, + "learning_rate": 2.2646397443932525e-06, + "loss": 0.74158221, + "num_input_tokens_seen": 169409845, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.13122559, + "step": 7884, + "time_per_iteration": 2.4362690448760986 + }, + { + "auxiliary_loss_clip": 0.01144616, + "auxiliary_loss_mlp": 0.01039523, + "balance_loss_clip": 1.05580533, + "balance_loss_mlp": 1.02469301, + "epoch": 0.47407184728693824, + "flos": 15661146222720.0, + "grad_norm": 1.8291838505103597, + "language_loss": 0.82168704, + "learning_rate": 2.2642537022067655e-06, + "loss": 0.84352851, + "num_input_tokens_seen": 169426085, + "router_z_loss_clip": 0.88818359, + "router_z_loss_mlp": 0.14801025, + "step": 7885, + "time_per_iteration": 2.5014805793762207 + }, + { + "auxiliary_loss_clip": 0.01133985, + "auxiliary_loss_mlp": 0.01034878, + "balance_loss_clip": 1.05535614, + "balance_loss_mlp": 1.02121067, + "epoch": 0.4741319705396062, + "flos": 18588800142720.0, + "grad_norm": 1.7819276725876938, + "language_loss": 0.73843682, + "learning_rate": 2.263867649999751e-06, + "loss": 0.7601254, + "num_input_tokens_seen": 169444705, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.13671875, + "step": 7886, + "time_per_iteration": 2.4370992183685303 + }, + { + "auxiliary_loss_clip": 0.01140989, + "auxiliary_loss_mlp": 0.0103633, + "balance_loss_clip": 1.05732203, + "balance_loss_mlp": 1.02121449, + "epoch": 0.47419209379227417, + "flos": 13261093223040.0, + "grad_norm": 2.178312596654974, + "language_loss": 0.73884469, + "learning_rate": 2.263481587786849e-06, + "loss": 0.76061785, + "num_input_tokens_seen": 169460850, + "router_z_loss_clip": 0.83740234, + "router_z_loss_mlp": 0.15118408, + "step": 7887, + "time_per_iteration": 2.540794610977173 + }, + { + "auxiliary_loss_clip": 0.01127559, + "auxiliary_loss_mlp": 0.01029876, + "balance_loss_clip": 1.05031657, + "balance_loss_mlp": 1.01691175, + "epoch": 0.47425221704494214, + "flos": 20044043752320.0, + "grad_norm": 2.5838632903931376, + "language_loss": 0.76991105, + "learning_rate": 2.2630955155826993e-06, + "loss": 0.79148531, + "num_input_tokens_seen": 169478890, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.12957764, + "step": 7888, + "time_per_iteration": 3.9842803478240967 + }, + { + "auxiliary_loss_clip": 0.01127199, + "auxiliary_loss_mlp": 0.0103389, + "balance_loss_clip": 1.0482825, + "balance_loss_mlp": 1.01991916, + "epoch": 0.4743123402976101, + "flos": 27271892136960.0, + "grad_norm": 1.7526025065731121, + "language_loss": 0.72715616, + "learning_rate": 2.2627094334019406e-06, + "loss": 0.74876708, + "num_input_tokens_seen": 169499690, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.1395874, + "step": 7889, + "time_per_iteration": 2.5463292598724365 + }, + { + "auxiliary_loss_clip": 0.01054312, + "auxiliary_loss_mlp": 0.01007498, + "balance_loss_clip": 1.02786493, + "balance_loss_mlp": 1.00618351, + "epoch": 0.47437246355027807, + "flos": 55393970261760.0, + "grad_norm": 0.7136327791731364, + "language_loss": 0.56042635, + "learning_rate": 2.262323341259214e-06, + "loss": 0.58104444, + "num_input_tokens_seen": 169560475, + "router_z_loss_clip": 0.26464844, + "router_z_loss_mlp": 0.01315308, + "step": 7890, + "time_per_iteration": 3.2475645542144775 + }, + { + "auxiliary_loss_clip": 0.01122508, + "auxiliary_loss_mlp": 0.01038575, + "balance_loss_clip": 1.04377103, + "balance_loss_mlp": 1.02367949, + "epoch": 0.47443258680294603, + "flos": 23878477537920.0, + "grad_norm": 2.099449929481268, + "language_loss": 0.65704596, + "learning_rate": 2.2619372391691605e-06, + "loss": 0.67865676, + "num_input_tokens_seen": 169580110, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.14880371, + "step": 7891, + "time_per_iteration": 2.4893338680267334 + }, + { + "auxiliary_loss_clip": 0.01129845, + "auxiliary_loss_mlp": 0.01032131, + "balance_loss_clip": 1.04723263, + "balance_loss_mlp": 1.01714039, + "epoch": 0.474492710055614, + "flos": 21977761455360.0, + "grad_norm": 2.234365644321304, + "language_loss": 0.71014106, + "learning_rate": 2.26155112714642e-06, + "loss": 0.73176086, + "num_input_tokens_seen": 169597510, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.14990234, + "step": 7892, + "time_per_iteration": 2.535841941833496 + }, + { + "auxiliary_loss_clip": 0.01088328, + "auxiliary_loss_mlp": 0.0100808, + "balance_loss_clip": 1.06155586, + "balance_loss_mlp": 1.00619102, + "epoch": 0.47455283330828196, + "flos": 62557180122240.0, + "grad_norm": 0.8039809049823653, + "language_loss": 0.58568883, + "learning_rate": 2.2611650052056355e-06, + "loss": 0.60665298, + "num_input_tokens_seen": 169660010, + "router_z_loss_clip": 0.26806641, + "router_z_loss_mlp": 0.0189209, + "step": 7893, + "time_per_iteration": 3.2224440574645996 + }, + { + "auxiliary_loss_clip": 0.01134464, + "auxiliary_loss_mlp": 0.01037745, + "balance_loss_clip": 1.05451822, + "balance_loss_mlp": 1.0240953, + "epoch": 0.47461295656094993, + "flos": 12093637380480.0, + "grad_norm": 1.7824555670714857, + "language_loss": 0.77289766, + "learning_rate": 2.2607788733614463e-06, + "loss": 0.7946198, + "num_input_tokens_seen": 169678485, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.13659668, + "step": 7894, + "time_per_iteration": 2.461548089981079 + }, + { + "auxiliary_loss_clip": 0.01124407, + "auxiliary_loss_mlp": 0.0103677, + "balance_loss_clip": 1.04634821, + "balance_loss_mlp": 1.02329326, + "epoch": 0.4746730798136179, + "flos": 20884568981760.0, + "grad_norm": 2.7709374364561623, + "language_loss": 0.74517989, + "learning_rate": 2.260392731628497e-06, + "loss": 0.76679164, + "num_input_tokens_seen": 169697335, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.13470459, + "step": 7895, + "time_per_iteration": 2.461045265197754 + }, + { + "auxiliary_loss_clip": 0.01136425, + "auxiliary_loss_mlp": 0.01029839, + "balance_loss_clip": 1.05860412, + "balance_loss_mlp": 1.01607013, + "epoch": 0.4747332030662859, + "flos": 19974808287360.0, + "grad_norm": 2.3955874236749306, + "language_loss": 0.82415068, + "learning_rate": 2.260006580021429e-06, + "loss": 0.84581321, + "num_input_tokens_seen": 169715395, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.13769531, + "step": 7896, + "time_per_iteration": 2.4599084854125977 + }, + { + "auxiliary_loss_clip": 0.01124883, + "auxiliary_loss_mlp": 0.01036808, + "balance_loss_clip": 1.04615045, + "balance_loss_mlp": 1.02127504, + "epoch": 0.4747933263189539, + "flos": 16034186920320.0, + "grad_norm": 1.8467646369499253, + "language_loss": 0.75592101, + "learning_rate": 2.259620418554886e-06, + "loss": 0.77753794, + "num_input_tokens_seen": 169733755, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.15527344, + "step": 7897, + "time_per_iteration": 2.4869072437286377 + }, + { + "auxiliary_loss_clip": 0.01133659, + "auxiliary_loss_mlp": 0.0103642, + "balance_loss_clip": 1.04904687, + "balance_loss_mlp": 1.02196598, + "epoch": 0.47485344957162184, + "flos": 13955102876160.0, + "grad_norm": 2.57693856684939, + "language_loss": 0.63876903, + "learning_rate": 2.25923424724351e-06, + "loss": 0.66046977, + "num_input_tokens_seen": 169751390, + "router_z_loss_clip": 0.84667969, + "router_z_loss_mlp": 0.14459229, + "step": 7898, + "time_per_iteration": 2.4927382469177246 + }, + { + "auxiliary_loss_clip": 0.01132355, + "auxiliary_loss_mlp": 0.01041305, + "balance_loss_clip": 1.05336106, + "balance_loss_mlp": 1.02658308, + "epoch": 0.4749135728242898, + "flos": 20449080489600.0, + "grad_norm": 2.5047417028374355, + "language_loss": 0.70246136, + "learning_rate": 2.258848066101946e-06, + "loss": 0.72419798, + "num_input_tokens_seen": 169769500, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.14703369, + "step": 7899, + "time_per_iteration": 2.4773197174072266 + }, + { + "auxiliary_loss_clip": 0.01134763, + "auxiliary_loss_mlp": 0.0104242, + "balance_loss_clip": 1.05220437, + "balance_loss_mlp": 1.02780533, + "epoch": 0.4749736960769578, + "flos": 28949961767040.0, + "grad_norm": 1.863312963923759, + "language_loss": 0.68613642, + "learning_rate": 2.258461875144837e-06, + "loss": 0.70790827, + "num_input_tokens_seen": 169789215, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.14624023, + "step": 7900, + "time_per_iteration": 4.005927801132202 + }, + { + "auxiliary_loss_clip": 0.01131058, + "auxiliary_loss_mlp": 0.01034874, + "balance_loss_clip": 1.0517627, + "balance_loss_mlp": 1.02111101, + "epoch": 0.47503381932962574, + "flos": 31938770592000.0, + "grad_norm": 2.5267814757759934, + "language_loss": 0.70849466, + "learning_rate": 2.2580756743868273e-06, + "loss": 0.73015392, + "num_input_tokens_seen": 169808825, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.13763428, + "step": 7901, + "time_per_iteration": 2.593020439147949 + }, + { + "auxiliary_loss_clip": 0.01125973, + "auxiliary_loss_mlp": 0.01045741, + "balance_loss_clip": 1.04612541, + "balance_loss_mlp": 1.03125691, + "epoch": 0.4750939425822937, + "flos": 22127257860480.0, + "grad_norm": 1.6049384898179049, + "language_loss": 0.73377556, + "learning_rate": 2.2576894638425636e-06, + "loss": 0.75549275, + "num_input_tokens_seen": 169827590, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.14483643, + "step": 7902, + "time_per_iteration": 2.461702346801758 + }, + { + "auxiliary_loss_clip": 0.0113251, + "auxiliary_loss_mlp": 0.01032285, + "balance_loss_clip": 1.05431294, + "balance_loss_mlp": 1.01959538, + "epoch": 0.47515406583496167, + "flos": 20850094903680.0, + "grad_norm": 2.0837448456409113, + "language_loss": 0.69178241, + "learning_rate": 2.257303243526688e-06, + "loss": 0.7134304, + "num_input_tokens_seen": 169844925, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.12689209, + "step": 7903, + "time_per_iteration": 2.4622795581817627 + }, + { + "auxiliary_loss_clip": 0.01125888, + "auxiliary_loss_mlp": 0.01028557, + "balance_loss_clip": 1.05041683, + "balance_loss_mlp": 1.01596904, + "epoch": 0.47521418908762963, + "flos": 17524802448000.0, + "grad_norm": 1.575604261200684, + "language_loss": 0.72062755, + "learning_rate": 2.256917013453848e-06, + "loss": 0.742172, + "num_input_tokens_seen": 169862705, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12585449, + "step": 7904, + "time_per_iteration": 2.4889755249023438 + }, + { + "auxiliary_loss_clip": 0.01120122, + "auxiliary_loss_mlp": 0.01040413, + "balance_loss_clip": 1.04320288, + "balance_loss_mlp": 1.02560759, + "epoch": 0.4752743123402976, + "flos": 20559434048640.0, + "grad_norm": 1.4973565396222783, + "language_loss": 0.86121953, + "learning_rate": 2.25653077363869e-06, + "loss": 0.8828249, + "num_input_tokens_seen": 169880155, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.14794922, + "step": 7905, + "time_per_iteration": 2.488605499267578 + }, + { + "auxiliary_loss_clip": 0.01121375, + "auxiliary_loss_mlp": 0.01032953, + "balance_loss_clip": 1.04617989, + "balance_loss_mlp": 1.0203526, + "epoch": 0.47533443559296557, + "flos": 26360623071360.0, + "grad_norm": 1.671273330831661, + "language_loss": 0.82308704, + "learning_rate": 2.2561445240958583e-06, + "loss": 0.84463036, + "num_input_tokens_seen": 169901525, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.12609863, + "step": 7906, + "time_per_iteration": 2.6682169437408447 + }, + { + "auxiliary_loss_clip": 0.01069924, + "auxiliary_loss_mlp": 0.01003175, + "balance_loss_clip": 1.04375041, + "balance_loss_mlp": 1.00157642, + "epoch": 0.47539455884563353, + "flos": 65949660967680.0, + "grad_norm": 0.6696158733693324, + "language_loss": 0.58952034, + "learning_rate": 2.255758264840002e-06, + "loss": 0.61025137, + "num_input_tokens_seen": 169970345, + "router_z_loss_clip": 0.26123047, + "router_z_loss_mlp": 0.01600647, + "step": 7907, + "time_per_iteration": 3.2078332901000977 + }, + { + "auxiliary_loss_clip": 0.01128276, + "auxiliary_loss_mlp": 0.01034546, + "balance_loss_clip": 1.04943883, + "balance_loss_mlp": 1.02111149, + "epoch": 0.4754546820983015, + "flos": 17238128002560.0, + "grad_norm": 1.934653670328148, + "language_loss": 0.81110525, + "learning_rate": 2.255371995885765e-06, + "loss": 0.83273345, + "num_input_tokens_seen": 169986440, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.13427734, + "step": 7908, + "time_per_iteration": 2.4395132064819336 + }, + { + "auxiliary_loss_clip": 0.01131618, + "auxiliary_loss_mlp": 0.01038107, + "balance_loss_clip": 1.05228806, + "balance_loss_mlp": 1.02407634, + "epoch": 0.47551480535096946, + "flos": 19825886499840.0, + "grad_norm": 1.7108426298651884, + "language_loss": 0.73698795, + "learning_rate": 2.254985717247797e-06, + "loss": 0.75868517, + "num_input_tokens_seen": 170005705, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.14025879, + "step": 7909, + "time_per_iteration": 3.855677366256714 + }, + { + "auxiliary_loss_clip": 0.01131039, + "auxiliary_loss_mlp": 0.0103334, + "balance_loss_clip": 1.05307496, + "balance_loss_mlp": 1.01975071, + "epoch": 0.4755749286036375, + "flos": 22163958581760.0, + "grad_norm": 1.7011315303143197, + "language_loss": 0.75661999, + "learning_rate": 2.2545994289407457e-06, + "loss": 0.77826381, + "num_input_tokens_seen": 170023415, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.13586426, + "step": 7910, + "time_per_iteration": 2.5644567012786865 + }, + { + "auxiliary_loss_clip": 0.01122586, + "auxiliary_loss_mlp": 0.01029779, + "balance_loss_clip": 1.04581618, + "balance_loss_mlp": 1.01627851, + "epoch": 0.47563505185630545, + "flos": 21648280976640.0, + "grad_norm": 1.9181765968846594, + "language_loss": 0.7939918, + "learning_rate": 2.2542131309792577e-06, + "loss": 0.8155154, + "num_input_tokens_seen": 170042395, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.1350708, + "step": 7911, + "time_per_iteration": 2.517233371734619 + }, + { + "auxiliary_loss_clip": 0.01129754, + "auxiliary_loss_mlp": 0.01042259, + "balance_loss_clip": 1.04824793, + "balance_loss_mlp": 1.02653491, + "epoch": 0.4756951751089734, + "flos": 20628777254400.0, + "grad_norm": 1.8522305195730226, + "language_loss": 0.76315141, + "learning_rate": 2.253826823377983e-06, + "loss": 0.78487158, + "num_input_tokens_seen": 170061610, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.15722656, + "step": 7912, + "time_per_iteration": 2.5391757488250732 + }, + { + "auxiliary_loss_clip": 0.01124567, + "auxiliary_loss_mlp": 0.01047508, + "balance_loss_clip": 1.04513681, + "balance_loss_mlp": 1.03293455, + "epoch": 0.4757552983616414, + "flos": 25848788221440.0, + "grad_norm": 1.5005934319519887, + "language_loss": 0.74255908, + "learning_rate": 2.253440506151569e-06, + "loss": 0.76427978, + "num_input_tokens_seen": 170083505, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.14581299, + "step": 7913, + "time_per_iteration": 2.5263819694519043 + }, + { + "auxiliary_loss_clip": 0.01129624, + "auxiliary_loss_mlp": 0.01033771, + "balance_loss_clip": 1.05156183, + "balance_loss_mlp": 1.01978195, + "epoch": 0.47581542161430934, + "flos": 18223013992320.0, + "grad_norm": 1.8646163034097278, + "language_loss": 0.72180492, + "learning_rate": 2.253054179314666e-06, + "loss": 0.74343884, + "num_input_tokens_seen": 170100690, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.13983154, + "step": 7914, + "time_per_iteration": 2.51202392578125 + }, + { + "auxiliary_loss_clip": 0.01134178, + "auxiliary_loss_mlp": 0.01031329, + "balance_loss_clip": 1.05434358, + "balance_loss_mlp": 1.01833498, + "epoch": 0.4758755448669773, + "flos": 21579763783680.0, + "grad_norm": 2.3028509538238584, + "language_loss": 0.64936262, + "learning_rate": 2.2526678428819227e-06, + "loss": 0.67101771, + "num_input_tokens_seen": 170119240, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.13000488, + "step": 7915, + "time_per_iteration": 2.4607303142547607 + }, + { + "auxiliary_loss_clip": 0.01141157, + "auxiliary_loss_mlp": 0.0103438, + "balance_loss_clip": 1.06223369, + "balance_loss_mlp": 1.021047, + "epoch": 0.47593566811964527, + "flos": 15231152511360.0, + "grad_norm": 1.7801780578849538, + "language_loss": 0.76964712, + "learning_rate": 2.2522814968679896e-06, + "loss": 0.79140246, + "num_input_tokens_seen": 170136450, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.13330078, + "step": 7916, + "time_per_iteration": 3.9957382678985596 + }, + { + "auxiliary_loss_clip": 0.01124855, + "auxiliary_loss_mlp": 0.01035296, + "balance_loss_clip": 1.04676664, + "balance_loss_mlp": 1.02247488, + "epoch": 0.47599579137231324, + "flos": 21543242630400.0, + "grad_norm": 1.8167740636609493, + "language_loss": 0.64550722, + "learning_rate": 2.2518951412875173e-06, + "loss": 0.66710871, + "num_input_tokens_seen": 170155295, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.1282959, + "step": 7917, + "time_per_iteration": 2.5027506351470947 + }, + { + "auxiliary_loss_clip": 0.01051503, + "auxiliary_loss_mlp": 0.01002632, + "balance_loss_clip": 1.02525353, + "balance_loss_mlp": 1.00124788, + "epoch": 0.4760559146249812, + "flos": 64554602595840.0, + "grad_norm": 0.8432321133529554, + "language_loss": 0.65640604, + "learning_rate": 2.2515087761551557e-06, + "loss": 0.67694736, + "num_input_tokens_seen": 170222325, + "router_z_loss_clip": 0.26318359, + "router_z_loss_mlp": 0.01383972, + "step": 7918, + "time_per_iteration": 3.121676206588745 + }, + { + "auxiliary_loss_clip": 0.01136011, + "auxiliary_loss_mlp": 0.01032528, + "balance_loss_clip": 1.0561043, + "balance_loss_mlp": 1.01958752, + "epoch": 0.47611603787764917, + "flos": 22233876405120.0, + "grad_norm": 1.614103043836464, + "language_loss": 0.69389522, + "learning_rate": 2.2511224014855563e-06, + "loss": 0.71558058, + "num_input_tokens_seen": 170241625, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.1295166, + "step": 7919, + "time_per_iteration": 2.6102681159973145 + }, + { + "auxiliary_loss_clip": 0.01128169, + "auxiliary_loss_mlp": 0.01034798, + "balance_loss_clip": 1.04541945, + "balance_loss_mlp": 1.02146411, + "epoch": 0.47617616113031713, + "flos": 22780005765120.0, + "grad_norm": 2.412131415289197, + "language_loss": 0.74791199, + "learning_rate": 2.2507360172933694e-06, + "loss": 0.76954168, + "num_input_tokens_seen": 170262470, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.13342285, + "step": 7920, + "time_per_iteration": 2.524813175201416 + }, + { + "auxiliary_loss_clip": 0.0113609, + "auxiliary_loss_mlp": 0.01036069, + "balance_loss_clip": 1.05338383, + "balance_loss_mlp": 1.0214479, + "epoch": 0.4762362843829851, + "flos": 24133802388480.0, + "grad_norm": 1.4949143743549314, + "language_loss": 0.77114576, + "learning_rate": 2.2503496235932487e-06, + "loss": 0.7928673, + "num_input_tokens_seen": 170283460, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.1461792, + "step": 7921, + "time_per_iteration": 2.5120224952697754 + }, + { + "auxiliary_loss_clip": 0.01135503, + "auxiliary_loss_mlp": 0.01038814, + "balance_loss_clip": 1.05486751, + "balance_loss_mlp": 1.02396035, + "epoch": 0.47629640763565306, + "flos": 22452069571200.0, + "grad_norm": 3.110525120059704, + "language_loss": 0.78086126, + "learning_rate": 2.249963220399845e-06, + "loss": 0.80260444, + "num_input_tokens_seen": 170304225, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.14855957, + "step": 7922, + "time_per_iteration": 2.5205278396606445 + }, + { + "auxiliary_loss_clip": 0.01131206, + "auxiliary_loss_mlp": 0.01038908, + "balance_loss_clip": 1.04852939, + "balance_loss_mlp": 1.02366734, + "epoch": 0.4763565308883211, + "flos": 11181398647680.0, + "grad_norm": 1.6429747538656516, + "language_loss": 0.72527695, + "learning_rate": 2.2495768077278104e-06, + "loss": 0.74697804, + "num_input_tokens_seen": 170322110, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.15240479, + "step": 7923, + "time_per_iteration": 2.500793695449829 + }, + { + "auxiliary_loss_clip": 0.01133469, + "auxiliary_loss_mlp": 0.01040513, + "balance_loss_clip": 1.0510211, + "balance_loss_mlp": 1.02682209, + "epoch": 0.47641665414098905, + "flos": 22382151747840.0, + "grad_norm": 1.8852842272289836, + "language_loss": 0.81793463, + "learning_rate": 2.2491903855917992e-06, + "loss": 0.83967441, + "num_input_tokens_seen": 170340700, + "router_z_loss_clip": 0.82421875, + "router_z_loss_mlp": 0.13708496, + "step": 7924, + "time_per_iteration": 2.457843542098999 + }, + { + "auxiliary_loss_clip": 0.01137408, + "auxiliary_loss_mlp": 0.01037347, + "balance_loss_clip": 1.05229115, + "balance_loss_mlp": 1.02237439, + "epoch": 0.476476777393657, + "flos": 25046148862080.0, + "grad_norm": 1.871986830337001, + "language_loss": 0.80659163, + "learning_rate": 2.2488039540064626e-06, + "loss": 0.82833916, + "num_input_tokens_seen": 170359780, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.14978027, + "step": 7925, + "time_per_iteration": 2.4997286796569824 + }, + { + "auxiliary_loss_clip": 0.01124688, + "auxiliary_loss_mlp": 0.0104075, + "balance_loss_clip": 1.04502356, + "balance_loss_mlp": 1.02729762, + "epoch": 0.476536900646325, + "flos": 27269916888960.0, + "grad_norm": 3.9628762330298244, + "language_loss": 0.72025216, + "learning_rate": 2.2484175129864558e-06, + "loss": 0.74190658, + "num_input_tokens_seen": 170381260, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.13452148, + "step": 7926, + "time_per_iteration": 2.5153934955596924 + }, + { + "auxiliary_loss_clip": 0.01131809, + "auxiliary_loss_mlp": 0.01037167, + "balance_loss_clip": 1.04830551, + "balance_loss_mlp": 1.02280211, + "epoch": 0.47659702389899294, + "flos": 25301401885440.0, + "grad_norm": 2.390141364631942, + "language_loss": 0.68218911, + "learning_rate": 2.248031062546432e-06, + "loss": 0.70387888, + "num_input_tokens_seen": 170400595, + "router_z_loss_clip": 0.83447266, + "router_z_loss_mlp": 0.14343262, + "step": 7927, + "time_per_iteration": 2.545200824737549 + }, + { + "auxiliary_loss_clip": 0.01125276, + "auxiliary_loss_mlp": 0.01031033, + "balance_loss_clip": 1.04700243, + "balance_loss_mlp": 1.01805723, + "epoch": 0.4766571471516609, + "flos": 25992861672960.0, + "grad_norm": 1.5158306525042966, + "language_loss": 0.6848048, + "learning_rate": 2.247644602701045e-06, + "loss": 0.70636785, + "num_input_tokens_seen": 170421110, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.12988281, + "step": 7928, + "time_per_iteration": 2.5876731872558594 + }, + { + "auxiliary_loss_clip": 0.01130979, + "auxiliary_loss_mlp": 0.01034976, + "balance_loss_clip": 1.0512805, + "balance_loss_mlp": 1.02089715, + "epoch": 0.4767172704043289, + "flos": 16032211672320.0, + "grad_norm": 3.1246299023690773, + "language_loss": 0.79028344, + "learning_rate": 2.2472581334649496e-06, + "loss": 0.81194293, + "num_input_tokens_seen": 170436700, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.14080811, + "step": 7929, + "time_per_iteration": 2.4586734771728516 + }, + { + "auxiliary_loss_clip": 0.01133461, + "auxiliary_loss_mlp": 0.01037026, + "balance_loss_clip": 1.05286598, + "balance_loss_mlp": 1.02446687, + "epoch": 0.47677739365699684, + "flos": 39235351651200.0, + "grad_norm": 2.089635870564486, + "language_loss": 0.66959506, + "learning_rate": 2.2468716548528016e-06, + "loss": 0.69129992, + "num_input_tokens_seen": 170459555, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.12548828, + "step": 7930, + "time_per_iteration": 2.586045265197754 + }, + { + "auxiliary_loss_clip": 0.0111965, + "auxiliary_loss_mlp": 0.01034534, + "balance_loss_clip": 1.04171324, + "balance_loss_mlp": 1.02167153, + "epoch": 0.4768375169096648, + "flos": 24717781704960.0, + "grad_norm": 1.6432085808170174, + "language_loss": 0.80212545, + "learning_rate": 2.2464851668792555e-06, + "loss": 0.82366729, + "num_input_tokens_seen": 170479175, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.12860107, + "step": 7931, + "time_per_iteration": 3.923327684402466 + }, + { + "auxiliary_loss_clip": 0.01135596, + "auxiliary_loss_mlp": 0.0103526, + "balance_loss_clip": 1.05246067, + "balance_loss_mlp": 1.02172387, + "epoch": 0.47689764016233277, + "flos": 22528667324160.0, + "grad_norm": 1.8247579230222584, + "language_loss": 0.75968552, + "learning_rate": 2.2460986695589678e-06, + "loss": 0.78139412, + "num_input_tokens_seen": 170498450, + "router_z_loss_clip": 0.83300781, + "router_z_loss_mlp": 0.13537598, + "step": 7932, + "time_per_iteration": 2.4914190769195557 + }, + { + "auxiliary_loss_clip": 0.0113684, + "auxiliary_loss_mlp": 0.01038409, + "balance_loss_clip": 1.05651748, + "balance_loss_mlp": 1.02430081, + "epoch": 0.47695776341500074, + "flos": 15120619384320.0, + "grad_norm": 3.5191655114373463, + "language_loss": 0.79336703, + "learning_rate": 2.245712162906593e-06, + "loss": 0.8151195, + "num_input_tokens_seen": 170516255, + "router_z_loss_clip": 0.80273438, + "router_z_loss_mlp": 0.14105225, + "step": 7933, + "time_per_iteration": 2.465718984603882 + }, + { + "auxiliary_loss_clip": 0.01132185, + "auxiliary_loss_mlp": 0.01041744, + "balance_loss_clip": 1.04622722, + "balance_loss_mlp": 1.02611542, + "epoch": 0.4770178866676687, + "flos": 14678917839360.0, + "grad_norm": 2.0035581809318845, + "language_loss": 0.73859125, + "learning_rate": 2.2453256469367888e-06, + "loss": 0.76033056, + "num_input_tokens_seen": 170532705, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.15625, + "step": 7934, + "time_per_iteration": 2.555202007293701 + }, + { + "auxiliary_loss_clip": 0.01123156, + "auxiliary_loss_mlp": 0.01035936, + "balance_loss_clip": 1.04193783, + "balance_loss_mlp": 1.02183366, + "epoch": 0.47707800992033667, + "flos": 22565583527040.0, + "grad_norm": 1.8828180461404223, + "language_loss": 0.79923511, + "learning_rate": 2.244939121664211e-06, + "loss": 0.82082605, + "num_input_tokens_seen": 170551925, + "router_z_loss_clip": 0.81347656, + "router_z_loss_mlp": 0.14111328, + "step": 7935, + "time_per_iteration": 2.512427806854248 + }, + { + "auxiliary_loss_clip": 0.01131848, + "auxiliary_loss_mlp": 0.01049104, + "balance_loss_clip": 1.04672134, + "balance_loss_mlp": 1.03220034, + "epoch": 0.4771381331730047, + "flos": 30918225375360.0, + "grad_norm": 1.8218652002295106, + "language_loss": 0.71156961, + "learning_rate": 2.2445525871035177e-06, + "loss": 0.73337913, + "num_input_tokens_seen": 170572320, + "router_z_loss_clip": 0.85107422, + "router_z_loss_mlp": 0.16906738, + "step": 7936, + "time_per_iteration": 2.5729682445526123 + }, + { + "auxiliary_loss_clip": 0.01131001, + "auxiliary_loss_mlp": 0.01040877, + "balance_loss_clip": 1.04902864, + "balance_loss_mlp": 1.0245986, + "epoch": 0.47719825642567265, + "flos": 25738901539200.0, + "grad_norm": 1.8551546736874949, + "language_loss": 0.67482018, + "learning_rate": 2.2441660432693656e-06, + "loss": 0.69653898, + "num_input_tokens_seen": 170589470, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.16290283, + "step": 7937, + "time_per_iteration": 2.5153040885925293 + }, + { + "auxiliary_loss_clip": 0.01059143, + "auxiliary_loss_mlp": 0.01008098, + "balance_loss_clip": 1.03206575, + "balance_loss_mlp": 1.00636172, + "epoch": 0.4772583796783406, + "flos": 66355128668160.0, + "grad_norm": 0.7182593560814489, + "language_loss": 0.56369722, + "learning_rate": 2.2437794901764128e-06, + "loss": 0.5843696, + "num_input_tokens_seen": 170662265, + "router_z_loss_clip": 0.27099609, + "router_z_loss_mlp": 0.01737976, + "step": 7938, + "time_per_iteration": 3.246284246444702 + }, + { + "auxiliary_loss_clip": 0.01132026, + "auxiliary_loss_mlp": 0.0103686, + "balance_loss_clip": 1.04988194, + "balance_loss_mlp": 1.02245963, + "epoch": 0.4773185029310086, + "flos": 22051091070720.0, + "grad_norm": 1.6686872123749952, + "language_loss": 0.88894194, + "learning_rate": 2.243392927839317e-06, + "loss": 0.91063082, + "num_input_tokens_seen": 170679680, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.14379883, + "step": 7939, + "time_per_iteration": 2.4698140621185303 + }, + { + "auxiliary_loss_clip": 0.0113312, + "auxiliary_loss_mlp": 0.0103228, + "balance_loss_clip": 1.05130804, + "balance_loss_mlp": 1.01928043, + "epoch": 0.47737862618367655, + "flos": 16727801523840.0, + "grad_norm": 2.0995109687223894, + "language_loss": 0.77357292, + "learning_rate": 2.2430063562727367e-06, + "loss": 0.79522699, + "num_input_tokens_seen": 170697340, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.13012695, + "step": 7940, + "time_per_iteration": 2.5284488201141357 + }, + { + "auxiliary_loss_clip": 0.01131115, + "auxiliary_loss_mlp": 0.01038607, + "balance_loss_clip": 1.05501902, + "balance_loss_mlp": 1.02623343, + "epoch": 0.4774387494363445, + "flos": 19609453100160.0, + "grad_norm": 1.597878634986959, + "language_loss": 0.84838456, + "learning_rate": 2.2426197754913322e-06, + "loss": 0.87008172, + "num_input_tokens_seen": 170714905, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.12371826, + "step": 7941, + "time_per_iteration": 2.4432930946350098 + }, + { + "auxiliary_loss_clip": 0.01131129, + "auxiliary_loss_mlp": 0.01036741, + "balance_loss_clip": 1.04957485, + "balance_loss_mlp": 1.02246606, + "epoch": 0.4774988726890125, + "flos": 16653969118080.0, + "grad_norm": 3.406219407778097, + "language_loss": 0.76003683, + "learning_rate": 2.24223318550976e-06, + "loss": 0.78171551, + "num_input_tokens_seen": 170731810, + "router_z_loss_clip": 0.81494141, + "router_z_loss_mlp": 0.14282227, + "step": 7942, + "time_per_iteration": 2.514659881591797 + }, + { + "auxiliary_loss_clip": 0.01129644, + "auxiliary_loss_mlp": 0.01042894, + "balance_loss_clip": 1.04660821, + "balance_loss_mlp": 1.02827311, + "epoch": 0.47755899594168044, + "flos": 20485565729280.0, + "grad_norm": 1.817639318336995, + "language_loss": 0.64493811, + "learning_rate": 2.241846586342682e-06, + "loss": 0.66666347, + "num_input_tokens_seen": 170750270, + "router_z_loss_clip": 0.83056641, + "router_z_loss_mlp": 0.14630127, + "step": 7943, + "time_per_iteration": 2.4202771186828613 + }, + { + "auxiliary_loss_clip": 0.01140082, + "auxiliary_loss_mlp": 0.01036364, + "balance_loss_clip": 1.05699968, + "balance_loss_mlp": 1.02095056, + "epoch": 0.4776191191943484, + "flos": 21652806090240.0, + "grad_norm": 2.0094852140106068, + "language_loss": 0.73194164, + "learning_rate": 2.2414599780047577e-06, + "loss": 0.7537061, + "num_input_tokens_seen": 170769015, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.15429688, + "step": 7944, + "time_per_iteration": 3.8833200931549072 + }, + { + "auxiliary_loss_clip": 0.01142016, + "auxiliary_loss_mlp": 0.01035351, + "balance_loss_clip": 1.05771637, + "balance_loss_mlp": 1.01990724, + "epoch": 0.4776792424470164, + "flos": 18770220760320.0, + "grad_norm": 3.483616854074694, + "language_loss": 0.68023801, + "learning_rate": 2.2410733605106456e-06, + "loss": 0.7020117, + "num_input_tokens_seen": 170785725, + "router_z_loss_clip": 0.84375, + "router_z_loss_mlp": 0.15435791, + "step": 7945, + "time_per_iteration": 2.4459969997406006 + }, + { + "auxiliary_loss_clip": 0.0112889, + "auxiliary_loss_mlp": 0.01034129, + "balance_loss_clip": 1.04874325, + "balance_loss_mlp": 1.02083158, + "epoch": 0.47773936569968434, + "flos": 29715828577920.0, + "grad_norm": 1.9801060296192217, + "language_loss": 0.75353259, + "learning_rate": 2.240686733875009e-06, + "loss": 0.77516276, + "num_input_tokens_seen": 170804600, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.13287354, + "step": 7946, + "time_per_iteration": 2.5344231128692627 + }, + { + "auxiliary_loss_clip": 0.01137974, + "auxiliary_loss_mlp": 0.01043494, + "balance_loss_clip": 1.05276394, + "balance_loss_mlp": 1.02790189, + "epoch": 0.4777994889523523, + "flos": 24791542283520.0, + "grad_norm": 1.9594419219047767, + "language_loss": 0.79174447, + "learning_rate": 2.240300098112506e-06, + "loss": 0.81355917, + "num_input_tokens_seen": 170824230, + "router_z_loss_clip": 0.85205078, + "router_z_loss_mlp": 0.15600586, + "step": 7947, + "time_per_iteration": 2.4671924114227295 + }, + { + "auxiliary_loss_clip": 0.01129793, + "auxiliary_loss_mlp": 0.01032494, + "balance_loss_clip": 1.05263925, + "balance_loss_mlp": 1.01895165, + "epoch": 0.47785961220502027, + "flos": 17858161595520.0, + "grad_norm": 1.82030167325094, + "language_loss": 0.73732847, + "learning_rate": 2.2399134532377998e-06, + "loss": 0.75895137, + "num_input_tokens_seen": 170843365, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.13549805, + "step": 7948, + "time_per_iteration": 2.4304964542388916 + }, + { + "auxiliary_loss_clip": 0.011329, + "auxiliary_loss_mlp": 0.01037227, + "balance_loss_clip": 1.04992902, + "balance_loss_mlp": 1.02230227, + "epoch": 0.4779197354576883, + "flos": 20266546550400.0, + "grad_norm": 1.6319502746316605, + "language_loss": 0.78176111, + "learning_rate": 2.2395267992655514e-06, + "loss": 0.80346239, + "num_input_tokens_seen": 170863515, + "router_z_loss_clip": 0.83056641, + "router_z_loss_mlp": 0.14923096, + "step": 7949, + "time_per_iteration": 2.4290783405303955 + }, + { + "auxiliary_loss_clip": 0.01132141, + "auxiliary_loss_mlp": 0.01034102, + "balance_loss_clip": 1.052315, + "balance_loss_mlp": 1.0209713, + "epoch": 0.47797985871035625, + "flos": 17056599644160.0, + "grad_norm": 2.0844164183218052, + "language_loss": 0.73770142, + "learning_rate": 2.2391401362104227e-06, + "loss": 0.75936383, + "num_input_tokens_seen": 170881245, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.13122559, + "step": 7950, + "time_per_iteration": 2.476992130279541 + }, + { + "auxiliary_loss_clip": 0.01133253, + "auxiliary_loss_mlp": 0.01037396, + "balance_loss_clip": 1.05420136, + "balance_loss_mlp": 1.0226022, + "epoch": 0.4780399819630242, + "flos": 31358418549120.0, + "grad_norm": 3.4735738920086767, + "language_loss": 0.74598283, + "learning_rate": 2.2387534640870756e-06, + "loss": 0.76768935, + "num_input_tokens_seen": 170901285, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.14807129, + "step": 7951, + "time_per_iteration": 2.5469186305999756 + }, + { + "auxiliary_loss_clip": 0.01127094, + "auxiliary_loss_mlp": 0.01032047, + "balance_loss_clip": 1.04552126, + "balance_loss_mlp": 1.01719391, + "epoch": 0.4781001052156922, + "flos": 24899597372160.0, + "grad_norm": 2.1581033953536513, + "language_loss": 0.79863894, + "learning_rate": 2.238366782910174e-06, + "loss": 0.82023036, + "num_input_tokens_seen": 170919740, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.14868164, + "step": 7952, + "time_per_iteration": 4.082602500915527 + }, + { + "auxiliary_loss_clip": 0.01136785, + "auxiliary_loss_mlp": 0.01039988, + "balance_loss_clip": 1.05545068, + "balance_loss_mlp": 1.02556348, + "epoch": 0.47816022846836015, + "flos": 18697717157760.0, + "grad_norm": 1.733041745274042, + "language_loss": 0.78484583, + "learning_rate": 2.23798009269438e-06, + "loss": 0.80661356, + "num_input_tokens_seen": 170938510, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.14416504, + "step": 7953, + "time_per_iteration": 2.508289098739624 + }, + { + "auxiliary_loss_clip": 0.01146236, + "auxiliary_loss_mlp": 0.01031535, + "balance_loss_clip": 1.05957317, + "balance_loss_mlp": 1.01752794, + "epoch": 0.4782203517210281, + "flos": 11977573559040.0, + "grad_norm": 2.7911457726917646, + "language_loss": 0.84041589, + "learning_rate": 2.2375933934543566e-06, + "loss": 0.86219358, + "num_input_tokens_seen": 170951170, + "router_z_loss_clip": 0.86621094, + "router_z_loss_mlp": 0.14001465, + "step": 7954, + "time_per_iteration": 2.4906604290008545 + }, + { + "auxiliary_loss_clip": 0.01130159, + "auxiliary_loss_mlp": 0.01036672, + "balance_loss_clip": 1.05043757, + "balance_loss_mlp": 1.02329063, + "epoch": 0.4782804749736961, + "flos": 20813501923200.0, + "grad_norm": 1.7266064626216335, + "language_loss": 0.70612645, + "learning_rate": 2.237206685204768e-06, + "loss": 0.72779477, + "num_input_tokens_seen": 170970990, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.1338501, + "step": 7955, + "time_per_iteration": 2.539799213409424 + }, + { + "auxiliary_loss_clip": 0.01126775, + "auxiliary_loss_mlp": 0.01037139, + "balance_loss_clip": 1.04746008, + "balance_loss_mlp": 1.0233047, + "epoch": 0.47834059822636404, + "flos": 23840304359040.0, + "grad_norm": 1.5182767667169597, + "language_loss": 0.82093847, + "learning_rate": 2.2368199679602787e-06, + "loss": 0.84257758, + "num_input_tokens_seen": 170991215, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.1383667, + "step": 7956, + "time_per_iteration": 2.5396370887756348 + }, + { + "auxiliary_loss_clip": 0.0113535, + "auxiliary_loss_mlp": 0.01029845, + "balance_loss_clip": 1.0565598, + "balance_loss_mlp": 1.01514006, + "epoch": 0.478400721479032, + "flos": 22633777497600.0, + "grad_norm": 1.9208722048358233, + "language_loss": 0.85075051, + "learning_rate": 2.2364332417355516e-06, + "loss": 0.87240243, + "num_input_tokens_seen": 171007325, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.14703369, + "step": 7957, + "time_per_iteration": 2.5450258255004883 + }, + { + "auxiliary_loss_clip": 0.01133599, + "auxiliary_loss_mlp": 0.01035369, + "balance_loss_clip": 1.05401146, + "balance_loss_mlp": 1.02164757, + "epoch": 0.4784608447317, + "flos": 19354954262400.0, + "grad_norm": 1.745179980738819, + "language_loss": 0.79895175, + "learning_rate": 2.2360465065452527e-06, + "loss": 0.82064146, + "num_input_tokens_seen": 171025650, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.13726807, + "step": 7958, + "time_per_iteration": 2.4485528469085693 + }, + { + "auxiliary_loss_clip": 0.01134051, + "auxiliary_loss_mlp": 0.01041226, + "balance_loss_clip": 1.05278611, + "balance_loss_mlp": 1.02661133, + "epoch": 0.47852096798436794, + "flos": 24021114445440.0, + "grad_norm": 1.8817019684433436, + "language_loss": 0.82952535, + "learning_rate": 2.235659762404047e-06, + "loss": 0.85127813, + "num_input_tokens_seen": 171045045, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.14624023, + "step": 7959, + "time_per_iteration": 2.518388271331787 + }, + { + "auxiliary_loss_clip": 0.01129711, + "auxiliary_loss_mlp": 0.01029869, + "balance_loss_clip": 1.05202961, + "balance_loss_mlp": 1.01748359, + "epoch": 0.4785810912370359, + "flos": 25666433850240.0, + "grad_norm": 2.1576996615101423, + "language_loss": 0.73124886, + "learning_rate": 2.235273009326599e-06, + "loss": 0.75284457, + "num_input_tokens_seen": 171062910, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.12390137, + "step": 7960, + "time_per_iteration": 3.9408059120178223 + }, + { + "auxiliary_loss_clip": 0.01126735, + "auxiliary_loss_mlp": 0.0103888, + "balance_loss_clip": 1.04819226, + "balance_loss_mlp": 1.02591002, + "epoch": 0.47864121448970387, + "flos": 21432134885760.0, + "grad_norm": 1.7482436739679619, + "language_loss": 0.76712316, + "learning_rate": 2.2348862473275745e-06, + "loss": 0.78877926, + "num_input_tokens_seen": 171080875, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.12988281, + "step": 7961, + "time_per_iteration": 2.4506592750549316 + }, + { + "auxiliary_loss_clip": 0.01124191, + "auxiliary_loss_mlp": 0.01033386, + "balance_loss_clip": 1.04490995, + "balance_loss_mlp": 1.01954544, + "epoch": 0.47870133774237184, + "flos": 16143894034560.0, + "grad_norm": 1.6272742453443734, + "language_loss": 0.77935505, + "learning_rate": 2.2344994764216405e-06, + "loss": 0.80093086, + "num_input_tokens_seen": 171099190, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.13848877, + "step": 7962, + "time_per_iteration": 2.4408373832702637 + }, + { + "auxiliary_loss_clip": 0.01129735, + "auxiliary_loss_mlp": 0.01037545, + "balance_loss_clip": 1.04862618, + "balance_loss_mlp": 1.02358568, + "epoch": 0.47876146099503986, + "flos": 26906788344960.0, + "grad_norm": 2.0055800390836676, + "language_loss": 0.64913845, + "learning_rate": 2.2341126966234635e-06, + "loss": 0.6708113, + "num_input_tokens_seen": 171119060, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.13970947, + "step": 7963, + "time_per_iteration": 2.5060198307037354 + }, + { + "auxiliary_loss_clip": 0.01132293, + "auxiliary_loss_mlp": 0.01035314, + "balance_loss_clip": 1.05220747, + "balance_loss_mlp": 1.02160466, + "epoch": 0.4788215842477078, + "flos": 45332085778560.0, + "grad_norm": 1.7526879883378872, + "language_loss": 0.77631336, + "learning_rate": 2.2337259079477083e-06, + "loss": 0.79798937, + "num_input_tokens_seen": 171141900, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.13702393, + "step": 7964, + "time_per_iteration": 2.6862595081329346 + }, + { + "auxiliary_loss_clip": 0.01129081, + "auxiliary_loss_mlp": 0.01031852, + "balance_loss_clip": 1.04616928, + "balance_loss_mlp": 1.01646805, + "epoch": 0.4788817075003758, + "flos": 22237180456320.0, + "grad_norm": 1.8814261847818654, + "language_loss": 0.76604515, + "learning_rate": 2.233339110409044e-06, + "loss": 0.78765452, + "num_input_tokens_seen": 171161045, + "router_z_loss_clip": 0.82910156, + "router_z_loss_mlp": 0.15393066, + "step": 7965, + "time_per_iteration": 2.512113571166992 + }, + { + "auxiliary_loss_clip": 0.01126836, + "auxiliary_loss_mlp": 0.01036926, + "balance_loss_clip": 1.04809976, + "balance_loss_mlp": 1.02384233, + "epoch": 0.47894183075304375, + "flos": 16471183783680.0, + "grad_norm": 1.6290966886934262, + "language_loss": 0.74490213, + "learning_rate": 2.232952304022137e-06, + "loss": 0.76653981, + "num_input_tokens_seen": 171179675, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.1307373, + "step": 7966, + "time_per_iteration": 2.475510597229004 + }, + { + "auxiliary_loss_clip": 0.01131757, + "auxiliary_loss_mlp": 0.01035009, + "balance_loss_clip": 1.05251122, + "balance_loss_mlp": 1.02081072, + "epoch": 0.4790019540057117, + "flos": 24282688262400.0, + "grad_norm": 1.6427708650568942, + "language_loss": 0.73213232, + "learning_rate": 2.232565488801655e-06, + "loss": 0.75380003, + "num_input_tokens_seen": 171201175, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.1418457, + "step": 7967, + "time_per_iteration": 2.53403377532959 + }, + { + "auxiliary_loss_clip": 0.01116227, + "auxiliary_loss_mlp": 0.01033651, + "balance_loss_clip": 1.04121065, + "balance_loss_mlp": 1.01974535, + "epoch": 0.4790620772583797, + "flos": 25666469763840.0, + "grad_norm": 1.774174154355931, + "language_loss": 0.78901958, + "learning_rate": 2.232178664762267e-06, + "loss": 0.81051832, + "num_input_tokens_seen": 171221750, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.13922119, + "step": 7968, + "time_per_iteration": 2.624613046646118 + }, + { + "auxiliary_loss_clip": 0.01054963, + "auxiliary_loss_mlp": 0.01004724, + "balance_loss_clip": 1.02881086, + "balance_loss_mlp": 1.00316226, + "epoch": 0.47912220051104765, + "flos": 69428077102080.0, + "grad_norm": 0.7630580230194313, + "language_loss": 0.62219101, + "learning_rate": 2.2317918319186408e-06, + "loss": 0.64278781, + "num_input_tokens_seen": 171292235, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.01559448, + "step": 7969, + "time_per_iteration": 3.2509894371032715 + }, + { + "auxiliary_loss_clip": 0.01119639, + "auxiliary_loss_mlp": 0.01033326, + "balance_loss_clip": 1.04453659, + "balance_loss_mlp": 1.0201776, + "epoch": 0.4791823237637156, + "flos": 24168922911360.0, + "grad_norm": 1.3967798884449607, + "language_loss": 0.76998109, + "learning_rate": 2.2314049902854446e-06, + "loss": 0.7915107, + "num_input_tokens_seen": 171312215, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.13134766, + "step": 7970, + "time_per_iteration": 2.516937255859375 + }, + { + "auxiliary_loss_clip": 0.01125679, + "auxiliary_loss_mlp": 0.01043761, + "balance_loss_clip": 1.04391718, + "balance_loss_mlp": 1.02803731, + "epoch": 0.4792424470163836, + "flos": 24751465683840.0, + "grad_norm": 1.6043083829360514, + "language_loss": 0.70582914, + "learning_rate": 2.231018139877349e-06, + "loss": 0.72752357, + "num_input_tokens_seen": 171332975, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.15734863, + "step": 7971, + "time_per_iteration": 2.4723381996154785 + }, + { + "auxiliary_loss_clip": 0.01127004, + "auxiliary_loss_mlp": 0.01031842, + "balance_loss_clip": 1.04961157, + "balance_loss_mlp": 1.0177927, + "epoch": 0.47930257026905154, + "flos": 23257905240960.0, + "grad_norm": 1.2804711045867219, + "language_loss": 0.80020261, + "learning_rate": 2.230631280709021e-06, + "loss": 0.82179105, + "num_input_tokens_seen": 171353880, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.14050293, + "step": 7972, + "time_per_iteration": 2.5255801677703857 + }, + { + "auxiliary_loss_clip": 0.01133139, + "auxiliary_loss_mlp": 0.01031816, + "balance_loss_clip": 1.05180311, + "balance_loss_mlp": 1.01783919, + "epoch": 0.4793626935217195, + "flos": 14064091718400.0, + "grad_norm": 1.9418459323582078, + "language_loss": 0.69667864, + "learning_rate": 2.2302444127951327e-06, + "loss": 0.71832824, + "num_input_tokens_seen": 171370930, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.13977051, + "step": 7973, + "time_per_iteration": 2.4293532371520996 + }, + { + "auxiliary_loss_clip": 0.01128127, + "auxiliary_loss_mlp": 0.01040623, + "balance_loss_clip": 1.04907346, + "balance_loss_mlp": 1.02717602, + "epoch": 0.4794228167743875, + "flos": 21798854789760.0, + "grad_norm": 1.8880866155077147, + "language_loss": 0.78993821, + "learning_rate": 2.2298575361503523e-06, + "loss": 0.81162572, + "num_input_tokens_seen": 171387575, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.13458252, + "step": 7974, + "time_per_iteration": 3.866412878036499 + }, + { + "auxiliary_loss_clip": 0.01063522, + "auxiliary_loss_mlp": 0.01005078, + "balance_loss_clip": 1.03600335, + "balance_loss_mlp": 1.00358641, + "epoch": 0.47948294002705544, + "flos": 66968805553920.0, + "grad_norm": 0.7565378344965137, + "language_loss": 0.54004043, + "learning_rate": 2.2294706507893517e-06, + "loss": 0.5607264, + "num_input_tokens_seen": 171449980, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.01489258, + "step": 7975, + "time_per_iteration": 3.107203722000122 + }, + { + "auxiliary_loss_clip": 0.01139825, + "auxiliary_loss_mlp": 0.0103735, + "balance_loss_clip": 1.05702174, + "balance_loss_mlp": 1.02235949, + "epoch": 0.47954306327972346, + "flos": 12422471414400.0, + "grad_norm": 1.9152866672392537, + "language_loss": 0.89753592, + "learning_rate": 2.2290837567268008e-06, + "loss": 0.91930765, + "num_input_tokens_seen": 171465290, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.14990234, + "step": 7976, + "time_per_iteration": 2.407280206680298 + }, + { + "auxiliary_loss_clip": 0.01132463, + "auxiliary_loss_mlp": 0.01043042, + "balance_loss_clip": 1.0493803, + "balance_loss_mlp": 1.0275805, + "epoch": 0.4796031865323914, + "flos": 18361951799040.0, + "grad_norm": 2.2665177341871803, + "language_loss": 0.73805141, + "learning_rate": 2.2286968539773713e-06, + "loss": 0.75980651, + "num_input_tokens_seen": 171481130, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.15466309, + "step": 7977, + "time_per_iteration": 2.5144412517547607 + }, + { + "auxiliary_loss_clip": 0.01128491, + "auxiliary_loss_mlp": 0.01039987, + "balance_loss_clip": 1.05274796, + "balance_loss_mlp": 1.02701724, + "epoch": 0.4796633097850594, + "flos": 21835088634240.0, + "grad_norm": 1.7605151108702353, + "language_loss": 0.78458607, + "learning_rate": 2.228309942555734e-06, + "loss": 0.80627084, + "num_input_tokens_seen": 171501140, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12976074, + "step": 7978, + "time_per_iteration": 2.460786819458008 + }, + { + "auxiliary_loss_clip": 0.01128155, + "auxiliary_loss_mlp": 0.01038173, + "balance_loss_clip": 1.04998887, + "balance_loss_mlp": 1.02398074, + "epoch": 0.47972343303772735, + "flos": 23437350610560.0, + "grad_norm": 1.6987670353798638, + "language_loss": 0.89784861, + "learning_rate": 2.22792302247656e-06, + "loss": 0.91951197, + "num_input_tokens_seen": 171519835, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.14202881, + "step": 7979, + "time_per_iteration": 2.5351662635803223 + }, + { + "auxiliary_loss_clip": 0.01130009, + "auxiliary_loss_mlp": 0.01041142, + "balance_loss_clip": 1.04705167, + "balance_loss_mlp": 1.0261569, + "epoch": 0.4797835562903953, + "flos": 24899776940160.0, + "grad_norm": 1.6128499180567748, + "language_loss": 0.76819801, + "learning_rate": 2.227536093754523e-06, + "loss": 0.78990948, + "num_input_tokens_seen": 171540980, + "router_z_loss_clip": 0.83056641, + "router_z_loss_mlp": 0.14984131, + "step": 7980, + "time_per_iteration": 2.522244930267334 + }, + { + "auxiliary_loss_clip": 0.01128657, + "auxiliary_loss_mlp": 0.0104015, + "balance_loss_clip": 1.04715073, + "balance_loss_mlp": 1.02456927, + "epoch": 0.4798436795430633, + "flos": 35042996793600.0, + "grad_norm": 1.898107497225736, + "language_loss": 0.71953011, + "learning_rate": 2.227149156404295e-06, + "loss": 0.74121815, + "num_input_tokens_seen": 171563600, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.15582275, + "step": 7981, + "time_per_iteration": 2.6673085689544678 + }, + { + "auxiliary_loss_clip": 0.01128648, + "auxiliary_loss_mlp": 0.01042082, + "balance_loss_clip": 1.05019355, + "balance_loss_mlp": 1.02780044, + "epoch": 0.47990380279573125, + "flos": 20590209025920.0, + "grad_norm": 1.7693849332882807, + "language_loss": 0.70386851, + "learning_rate": 2.2267622104405473e-06, + "loss": 0.7255758, + "num_input_tokens_seen": 171580700, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.1427002, + "step": 7982, + "time_per_iteration": 2.4585113525390625 + }, + { + "auxiliary_loss_clip": 0.01128888, + "auxiliary_loss_mlp": 0.01033769, + "balance_loss_clip": 1.05604815, + "balance_loss_mlp": 1.0220145, + "epoch": 0.4799639260483992, + "flos": 26359402008960.0, + "grad_norm": 1.6245627302597754, + "language_loss": 0.70918447, + "learning_rate": 2.2263752558779544e-06, + "loss": 0.73081106, + "num_input_tokens_seen": 171602035, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11761475, + "step": 7983, + "time_per_iteration": 2.5591964721679688 + }, + { + "auxiliary_loss_clip": 0.01050672, + "auxiliary_loss_mlp": 0.01003838, + "balance_loss_clip": 1.02398264, + "balance_loss_mlp": 1.0023061, + "epoch": 0.4800240493010672, + "flos": 70979021521920.0, + "grad_norm": 0.8105087194538301, + "language_loss": 0.59434742, + "learning_rate": 2.2259882927311883e-06, + "loss": 0.61489248, + "num_input_tokens_seen": 171659215, + "router_z_loss_clip": 0.26660156, + "router_z_loss_mlp": 0.01531982, + "step": 7984, + "time_per_iteration": 3.0296740531921387 + }, + { + "auxiliary_loss_clip": 0.01125912, + "auxiliary_loss_mlp": 0.01039661, + "balance_loss_clip": 1.0455122, + "balance_loss_mlp": 1.02588034, + "epoch": 0.48008417255373514, + "flos": 17086656349440.0, + "grad_norm": 1.7975879500152858, + "language_loss": 0.66609031, + "learning_rate": 2.2256013210149247e-06, + "loss": 0.68774605, + "num_input_tokens_seen": 171675710, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.13793945, + "step": 7985, + "time_per_iteration": 2.4558866024017334 + }, + { + "auxiliary_loss_clip": 0.01130518, + "auxiliary_loss_mlp": 0.01047898, + "balance_loss_clip": 1.0468564, + "balance_loss_mlp": 1.03282404, + "epoch": 0.4801442958064031, + "flos": 15413435055360.0, + "grad_norm": 1.8250409501856555, + "language_loss": 0.70144153, + "learning_rate": 2.225214340743835e-06, + "loss": 0.72322571, + "num_input_tokens_seen": 171692510, + "router_z_loss_clip": 0.83691406, + "router_z_loss_mlp": 0.15075684, + "step": 7986, + "time_per_iteration": 2.4486424922943115 + }, + { + "auxiliary_loss_clip": 0.01135782, + "auxiliary_loss_mlp": 0.0103739, + "balance_loss_clip": 1.0525254, + "balance_loss_mlp": 1.02280521, + "epoch": 0.4802044190590711, + "flos": 11473747441920.0, + "grad_norm": 1.941985129827676, + "language_loss": 0.79290116, + "learning_rate": 2.2248273519325956e-06, + "loss": 0.81463289, + "num_input_tokens_seen": 171710235, + "router_z_loss_clip": 0.83251953, + "router_z_loss_mlp": 0.14569092, + "step": 7987, + "time_per_iteration": 2.4553110599517822 + }, + { + "auxiliary_loss_clip": 0.01137365, + "auxiliary_loss_mlp": 0.01036458, + "balance_loss_clip": 1.05548608, + "balance_loss_mlp": 1.02282023, + "epoch": 0.48026454231173904, + "flos": 20951003185920.0, + "grad_norm": 2.047407410439345, + "language_loss": 0.74785781, + "learning_rate": 2.2244403545958812e-06, + "loss": 0.7695961, + "num_input_tokens_seen": 171726715, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.13641357, + "step": 7988, + "time_per_iteration": 2.44661545753479 + }, + { + "auxiliary_loss_clip": 0.011331, + "auxiliary_loss_mlp": 0.01036168, + "balance_loss_clip": 1.05176497, + "balance_loss_mlp": 1.02297103, + "epoch": 0.48032466556440706, + "flos": 20448110822400.0, + "grad_norm": 2.046835233224202, + "language_loss": 0.78884709, + "learning_rate": 2.224053348748365e-06, + "loss": 0.81053984, + "num_input_tokens_seen": 171743605, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.13201904, + "step": 7989, + "time_per_iteration": 3.8714075088500977 + }, + { + "auxiliary_loss_clip": 0.01130953, + "auxiliary_loss_mlp": 0.01035445, + "balance_loss_clip": 1.04834449, + "balance_loss_mlp": 1.02129495, + "epoch": 0.480384788817075, + "flos": 37120823861760.0, + "grad_norm": 1.8786289837392898, + "language_loss": 0.73475409, + "learning_rate": 2.223666334404724e-06, + "loss": 0.75641805, + "num_input_tokens_seen": 171765445, + "router_z_loss_clip": 0.82568359, + "router_z_loss_mlp": 0.14141846, + "step": 7990, + "time_per_iteration": 2.5806448459625244 + }, + { + "auxiliary_loss_clip": 0.01058052, + "auxiliary_loss_mlp": 0.0100331, + "balance_loss_clip": 1.03100193, + "balance_loss_mlp": 1.00187969, + "epoch": 0.480444912069743, + "flos": 69552577641600.0, + "grad_norm": 0.7753235222117656, + "language_loss": 0.59042281, + "learning_rate": 2.223279311579633e-06, + "loss": 0.61103642, + "num_input_tokens_seen": 171830115, + "router_z_loss_clip": 0.27099609, + "router_z_loss_mlp": 0.01429749, + "step": 7991, + "time_per_iteration": 3.1857802867889404 + }, + { + "auxiliary_loss_clip": 0.0112506, + "auxiliary_loss_mlp": 0.01049433, + "balance_loss_clip": 1.04462719, + "balance_loss_mlp": 1.0319922, + "epoch": 0.48050503532241096, + "flos": 29822231640960.0, + "grad_norm": 1.8841808507063316, + "language_loss": 0.67472565, + "learning_rate": 2.222892280287768e-06, + "loss": 0.69647062, + "num_input_tokens_seen": 171849135, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.17449951, + "step": 7992, + "time_per_iteration": 2.538027286529541 + }, + { + "auxiliary_loss_clip": 0.01136839, + "auxiliary_loss_mlp": 0.01035435, + "balance_loss_clip": 1.05385339, + "balance_loss_mlp": 1.02180338, + "epoch": 0.4805651585750789, + "flos": 23948539015680.0, + "grad_norm": 2.834687160757162, + "language_loss": 0.76075578, + "learning_rate": 2.2225052405438056e-06, + "loss": 0.78247851, + "num_input_tokens_seen": 171868880, + "router_z_loss_clip": 0.82910156, + "router_z_loss_mlp": 0.1362915, + "step": 7993, + "time_per_iteration": 2.4628522396087646 + }, + { + "auxiliary_loss_clip": 0.01124299, + "auxiliary_loss_mlp": 0.01037895, + "balance_loss_clip": 1.04747248, + "balance_loss_mlp": 1.024472, + "epoch": 0.4806252818277469, + "flos": 25665428269440.0, + "grad_norm": 1.6632265289213954, + "language_loss": 0.78136277, + "learning_rate": 2.222118192362422e-06, + "loss": 0.80298477, + "num_input_tokens_seen": 171889455, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.13427734, + "step": 7994, + "time_per_iteration": 2.578458070755005 + }, + { + "auxiliary_loss_clip": 0.01136044, + "auxiliary_loss_mlp": 0.01033693, + "balance_loss_clip": 1.05684996, + "balance_loss_mlp": 1.02024579, + "epoch": 0.48068540508041485, + "flos": 13151996640000.0, + "grad_norm": 12.692962365947196, + "language_loss": 0.79538459, + "learning_rate": 2.2217311357582946e-06, + "loss": 0.81708193, + "num_input_tokens_seen": 171906070, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.13458252, + "step": 7995, + "time_per_iteration": 2.4548532962799072 + }, + { + "auxiliary_loss_clip": 0.01133608, + "auxiliary_loss_mlp": 0.01035384, + "balance_loss_clip": 1.05235028, + "balance_loss_mlp": 1.02128756, + "epoch": 0.4807455283330828, + "flos": 21176738208000.0, + "grad_norm": 1.5050283835758287, + "language_loss": 0.83111286, + "learning_rate": 2.2213440707461e-06, + "loss": 0.85280275, + "num_input_tokens_seen": 171926515, + "router_z_loss_clip": 0.81201172, + "router_z_loss_mlp": 0.14105225, + "step": 7996, + "time_per_iteration": 3.888826608657837 + }, + { + "auxiliary_loss_clip": 0.011301, + "auxiliary_loss_mlp": 0.01034642, + "balance_loss_clip": 1.05247164, + "balance_loss_mlp": 1.02148163, + "epoch": 0.4808056515857508, + "flos": 12275991751680.0, + "grad_norm": 1.647225765331143, + "language_loss": 0.80736774, + "learning_rate": 2.220956997340516e-06, + "loss": 0.8290152, + "num_input_tokens_seen": 171943845, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.13171387, + "step": 7997, + "time_per_iteration": 2.4797897338867188 + }, + { + "auxiliary_loss_clip": 0.01133625, + "auxiliary_loss_mlp": 0.01034399, + "balance_loss_clip": 1.05285227, + "balance_loss_mlp": 1.02083921, + "epoch": 0.48086577483841875, + "flos": 24826052275200.0, + "grad_norm": 5.516565911485071, + "language_loss": 0.72600353, + "learning_rate": 2.220569915556221e-06, + "loss": 0.74768376, + "num_input_tokens_seen": 171964970, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.13555908, + "step": 7998, + "time_per_iteration": 2.511082887649536 + }, + { + "auxiliary_loss_clip": 0.01136096, + "auxiliary_loss_mlp": 0.0103716, + "balance_loss_clip": 1.0563513, + "balance_loss_mlp": 1.02286077, + "epoch": 0.4809258980910867, + "flos": 24465365856000.0, + "grad_norm": 1.7972368131893277, + "language_loss": 0.70955694, + "learning_rate": 2.220182825407892e-06, + "loss": 0.73128945, + "num_input_tokens_seen": 171986340, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.14300537, + "step": 7999, + "time_per_iteration": 2.557335615158081 + }, + { + "auxiliary_loss_clip": 0.01129837, + "auxiliary_loss_mlp": 0.01043812, + "balance_loss_clip": 1.04861426, + "balance_loss_mlp": 1.02992439, + "epoch": 0.4809860213437547, + "flos": 21215952881280.0, + "grad_norm": 2.13065923165156, + "language_loss": 0.71363449, + "learning_rate": 2.2197957269102083e-06, + "loss": 0.73537099, + "num_input_tokens_seen": 172007300, + "router_z_loss_clip": 0.81201172, + "router_z_loss_mlp": 0.13891602, + "step": 8000, + "time_per_iteration": 2.488750457763672 + }, + { + "auxiliary_loss_clip": 0.01129251, + "auxiliary_loss_mlp": 0.01038425, + "balance_loss_clip": 1.04880786, + "balance_loss_mlp": 1.02456069, + "epoch": 0.48104614459642264, + "flos": 37632084094080.0, + "grad_norm": 1.341407168813757, + "language_loss": 0.74640012, + "learning_rate": 2.2194086200778485e-06, + "loss": 0.7680769, + "num_input_tokens_seen": 172029585, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.13879395, + "step": 8001, + "time_per_iteration": 2.6914706230163574 + }, + { + "auxiliary_loss_clip": 0.01131887, + "auxiliary_loss_mlp": 0.01038534, + "balance_loss_clip": 1.04997897, + "balance_loss_mlp": 1.02466953, + "epoch": 0.48110626784909066, + "flos": 18406122549120.0, + "grad_norm": 1.697905436727379, + "language_loss": 0.81667775, + "learning_rate": 2.219021504925493e-06, + "loss": 0.83838195, + "num_input_tokens_seen": 172047495, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.13861084, + "step": 8002, + "time_per_iteration": 2.4559779167175293 + }, + { + "auxiliary_loss_clip": 0.01136487, + "auxiliary_loss_mlp": 0.01034288, + "balance_loss_clip": 1.0554285, + "balance_loss_mlp": 1.01914835, + "epoch": 0.48116639110175863, + "flos": 28439814856320.0, + "grad_norm": 1.949690092964033, + "language_loss": 0.71383804, + "learning_rate": 2.218634381467819e-06, + "loss": 0.73554575, + "num_input_tokens_seen": 172067625, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.15155029, + "step": 8003, + "time_per_iteration": 4.039965629577637 + }, + { + "auxiliary_loss_clip": 0.01121156, + "auxiliary_loss_mlp": 0.01038915, + "balance_loss_clip": 1.04459059, + "balance_loss_mlp": 1.02617764, + "epoch": 0.4812265143544266, + "flos": 21725237865600.0, + "grad_norm": 1.6301118640821919, + "language_loss": 0.81899881, + "learning_rate": 2.218247249719507e-06, + "loss": 0.84059948, + "num_input_tokens_seen": 172087885, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.12744141, + "step": 8004, + "time_per_iteration": 2.53253436088562 + }, + { + "auxiliary_loss_clip": 0.01141556, + "auxiliary_loss_mlp": 0.01041569, + "balance_loss_clip": 1.05475903, + "balance_loss_mlp": 1.02521396, + "epoch": 0.48128663760709456, + "flos": 13224679810560.0, + "grad_norm": 2.295747605026378, + "language_loss": 0.77792084, + "learning_rate": 2.217860109695239e-06, + "loss": 0.79975212, + "num_input_tokens_seen": 172105815, + "router_z_loss_clip": 0.86816406, + "router_z_loss_mlp": 0.16363525, + "step": 8005, + "time_per_iteration": 2.558297872543335 + }, + { + "auxiliary_loss_clip": 0.01122593, + "auxiliary_loss_mlp": 0.01037558, + "balance_loss_clip": 1.04299819, + "balance_loss_mlp": 1.02254343, + "epoch": 0.4813467608597625, + "flos": 24243437675520.0, + "grad_norm": 1.6758654331160838, + "language_loss": 0.70726377, + "learning_rate": 2.217472961409692e-06, + "loss": 0.72886527, + "num_input_tokens_seen": 172126125, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.15008545, + "step": 8006, + "time_per_iteration": 2.455254316329956 + }, + { + "auxiliary_loss_clip": 0.01129756, + "auxiliary_loss_mlp": 0.01035106, + "balance_loss_clip": 1.04754877, + "balance_loss_mlp": 1.02109849, + "epoch": 0.4814068841124305, + "flos": 27480424544640.0, + "grad_norm": 1.869491013631632, + "language_loss": 0.70398575, + "learning_rate": 2.2170858048775495e-06, + "loss": 0.72563434, + "num_input_tokens_seen": 172141945, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.13995361, + "step": 8007, + "time_per_iteration": 2.5225367546081543 + }, + { + "auxiliary_loss_clip": 0.01132564, + "auxiliary_loss_mlp": 0.0103511, + "balance_loss_clip": 1.05165935, + "balance_loss_mlp": 1.02142429, + "epoch": 0.48146700736509845, + "flos": 19572896033280.0, + "grad_norm": 2.5427645594408474, + "language_loss": 0.7190237, + "learning_rate": 2.2166986401134914e-06, + "loss": 0.74070048, + "num_input_tokens_seen": 172161095, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.13690186, + "step": 8008, + "time_per_iteration": 2.4944417476654053 + }, + { + "auxiliary_loss_clip": 0.01135212, + "auxiliary_loss_mlp": 0.01045811, + "balance_loss_clip": 1.0501585, + "balance_loss_mlp": 1.03071928, + "epoch": 0.4815271306177664, + "flos": 20627771673600.0, + "grad_norm": 1.7417113246341378, + "language_loss": 0.60944444, + "learning_rate": 2.216311467132199e-06, + "loss": 0.63125467, + "num_input_tokens_seen": 172178750, + "router_z_loss_clip": 0.85009766, + "router_z_loss_mlp": 0.15100098, + "step": 8009, + "time_per_iteration": 2.460157871246338 + }, + { + "auxiliary_loss_clip": 0.01063982, + "auxiliary_loss_mlp": 0.0100838, + "balance_loss_clip": 1.03744531, + "balance_loss_mlp": 1.00706303, + "epoch": 0.4815872538704344, + "flos": 67691076232320.0, + "grad_norm": 0.8609558565173976, + "language_loss": 0.61296737, + "learning_rate": 2.2159242859483547e-06, + "loss": 0.63369101, + "num_input_tokens_seen": 172240235, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.01315308, + "step": 8010, + "time_per_iteration": 3.101680278778076 + }, + { + "auxiliary_loss_clip": 0.01130123, + "auxiliary_loss_mlp": 0.0104583, + "balance_loss_clip": 1.04877388, + "balance_loss_mlp": 1.03137636, + "epoch": 0.48164737712310235, + "flos": 22820764723200.0, + "grad_norm": 1.7413054769849539, + "language_loss": 0.73405349, + "learning_rate": 2.215537096576639e-06, + "loss": 0.755813, + "num_input_tokens_seen": 172259875, + "router_z_loss_clip": 0.81347656, + "router_z_loss_mlp": 0.14453125, + "step": 8011, + "time_per_iteration": 2.468329906463623 + }, + { + "auxiliary_loss_clip": 0.01124131, + "auxiliary_loss_mlp": 0.01035652, + "balance_loss_clip": 1.04801226, + "balance_loss_mlp": 1.02300382, + "epoch": 0.4817075003757703, + "flos": 23733865382400.0, + "grad_norm": 1.9411923727597364, + "language_loss": 0.79169613, + "learning_rate": 2.2151498990317354e-06, + "loss": 0.81329393, + "num_input_tokens_seen": 172280150, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.12652588, + "step": 8012, + "time_per_iteration": 2.483614921569824 + }, + { + "auxiliary_loss_clip": 0.01134938, + "auxiliary_loss_mlp": 0.01049811, + "balance_loss_clip": 1.05422163, + "balance_loss_mlp": 1.03548253, + "epoch": 0.4817676236284383, + "flos": 28182909807360.0, + "grad_norm": 1.8294464039325127, + "language_loss": 0.73473936, + "learning_rate": 2.214762693328326e-06, + "loss": 0.75658685, + "num_input_tokens_seen": 172300810, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.14337158, + "step": 8013, + "time_per_iteration": 2.5219526290893555 + }, + { + "auxiliary_loss_clip": 0.01122093, + "auxiliary_loss_mlp": 0.01034967, + "balance_loss_clip": 1.04351652, + "balance_loss_mlp": 1.02185392, + "epoch": 0.48182774688110624, + "flos": 17091756080640.0, + "grad_norm": 1.9977235831162925, + "language_loss": 0.9114846, + "learning_rate": 2.214375479481094e-06, + "loss": 0.93305516, + "num_input_tokens_seen": 172317930, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.13128662, + "step": 8014, + "time_per_iteration": 2.5439164638519287 + }, + { + "auxiliary_loss_clip": 0.01135337, + "auxiliary_loss_mlp": 0.0103666, + "balance_loss_clip": 1.05163646, + "balance_loss_mlp": 1.02243257, + "epoch": 0.4818878701337742, + "flos": 12567873669120.0, + "grad_norm": 2.2747252650143044, + "language_loss": 0.73849905, + "learning_rate": 2.213988257504722e-06, + "loss": 0.76021904, + "num_input_tokens_seen": 172336340, + "router_z_loss_clip": 0.83740234, + "router_z_loss_mlp": 0.14221191, + "step": 8015, + "time_per_iteration": 2.4622950553894043 + }, + { + "auxiliary_loss_clip": 0.01133747, + "auxiliary_loss_mlp": 0.01043713, + "balance_loss_clip": 1.04848051, + "balance_loss_mlp": 1.02709496, + "epoch": 0.48194799338644223, + "flos": 24608505553920.0, + "grad_norm": 2.0921854749182085, + "language_loss": 0.80428052, + "learning_rate": 2.213601027413894e-06, + "loss": 0.82605511, + "num_input_tokens_seen": 172354315, + "router_z_loss_clip": 0.85253906, + "router_z_loss_mlp": 0.1661377, + "step": 8016, + "time_per_iteration": 2.5113742351531982 + }, + { + "auxiliary_loss_clip": 0.01123194, + "auxiliary_loss_mlp": 0.01036738, + "balance_loss_clip": 1.04848981, + "balance_loss_mlp": 1.02290964, + "epoch": 0.4820081166391102, + "flos": 21105204272640.0, + "grad_norm": 1.8793323482635886, + "language_loss": 0.77815974, + "learning_rate": 2.2132137892232933e-06, + "loss": 0.79975909, + "num_input_tokens_seen": 172372695, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.1383667, + "step": 8017, + "time_per_iteration": 2.5427186489105225 + }, + { + "auxiliary_loss_clip": 0.01124433, + "auxiliary_loss_mlp": 0.01039304, + "balance_loss_clip": 1.04736423, + "balance_loss_mlp": 1.02337217, + "epoch": 0.48206823989177816, + "flos": 25264593423360.0, + "grad_norm": 6.337171098824663, + "language_loss": 0.79947734, + "learning_rate": 2.2128265429476043e-06, + "loss": 0.82111478, + "num_input_tokens_seen": 172390905, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.15930176, + "step": 8018, + "time_per_iteration": 3.9524731636047363 + }, + { + "auxiliary_loss_clip": 0.0113405, + "auxiliary_loss_mlp": 0.01032264, + "balance_loss_clip": 1.0546515, + "balance_loss_mlp": 1.01894212, + "epoch": 0.4821283631444461, + "flos": 24645062620800.0, + "grad_norm": 1.7723518109028864, + "language_loss": 0.75800622, + "learning_rate": 2.2124392886015124e-06, + "loss": 0.7796694, + "num_input_tokens_seen": 172412295, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.13342285, + "step": 8019, + "time_per_iteration": 2.5128672122955322 + }, + { + "auxiliary_loss_clip": 0.01120174, + "auxiliary_loss_mlp": 0.01039111, + "balance_loss_clip": 1.04041195, + "balance_loss_mlp": 1.02308941, + "epoch": 0.4821884863971141, + "flos": 23952094462080.0, + "grad_norm": 1.7050580730842262, + "language_loss": 0.78996235, + "learning_rate": 2.212052026199701e-06, + "loss": 0.81155521, + "num_input_tokens_seen": 172432625, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.16015625, + "step": 8020, + "time_per_iteration": 2.585745334625244 + }, + { + "auxiliary_loss_clip": 0.01123631, + "auxiliary_loss_mlp": 0.01040579, + "balance_loss_clip": 1.04642081, + "balance_loss_mlp": 1.02552247, + "epoch": 0.48224860964978206, + "flos": 17160668323200.0, + "grad_norm": 2.282451854246159, + "language_loss": 0.69895208, + "learning_rate": 2.211664755756855e-06, + "loss": 0.72059417, + "num_input_tokens_seen": 172450010, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.15057373, + "step": 8021, + "time_per_iteration": 2.4447827339172363 + }, + { + "auxiliary_loss_clip": 0.0114085, + "auxiliary_loss_mlp": 0.01036221, + "balance_loss_clip": 1.05506229, + "balance_loss_mlp": 1.0209856, + "epoch": 0.48230873290245, + "flos": 23075838178560.0, + "grad_norm": 1.9399438774378377, + "language_loss": 0.63026983, + "learning_rate": 2.2112774772876603e-06, + "loss": 0.65204054, + "num_input_tokens_seen": 172469080, + "router_z_loss_clip": 0.85644531, + "router_z_loss_mlp": 0.15234375, + "step": 8022, + "time_per_iteration": 2.5081682205200195 + }, + { + "auxiliary_loss_clip": 0.01134161, + "auxiliary_loss_mlp": 0.01031503, + "balance_loss_clip": 1.05450523, + "balance_loss_mlp": 1.01837158, + "epoch": 0.482368856155118, + "flos": 19353517718400.0, + "grad_norm": 2.427304752089608, + "language_loss": 0.66215026, + "learning_rate": 2.2108901908068028e-06, + "loss": 0.6838069, + "num_input_tokens_seen": 172484850, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.13134766, + "step": 8023, + "time_per_iteration": 2.4427034854888916 + }, + { + "auxiliary_loss_clip": 0.01133122, + "auxiliary_loss_mlp": 0.01033024, + "balance_loss_clip": 1.05305767, + "balance_loss_mlp": 1.01972675, + "epoch": 0.48242897940778595, + "flos": 20078984707200.0, + "grad_norm": 1.8153766350406324, + "language_loss": 0.76197582, + "learning_rate": 2.2105028963289683e-06, + "loss": 0.78363729, + "num_input_tokens_seen": 172503525, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.13299561, + "step": 8024, + "time_per_iteration": 2.517660140991211 + }, + { + "auxiliary_loss_clip": 0.0113559, + "auxiliary_loss_mlp": 0.01041194, + "balance_loss_clip": 1.05050623, + "balance_loss_mlp": 1.02613211, + "epoch": 0.4824891026604539, + "flos": 23403989854080.0, + "grad_norm": 1.6316424663693976, + "language_loss": 0.75335526, + "learning_rate": 2.2101155938688423e-06, + "loss": 0.77512312, + "num_input_tokens_seen": 172524360, + "router_z_loss_clip": 0.85009766, + "router_z_loss_mlp": 0.15063477, + "step": 8025, + "time_per_iteration": 2.4832470417022705 + }, + { + "auxiliary_loss_clip": 0.01131343, + "auxiliary_loss_mlp": 0.01033924, + "balance_loss_clip": 1.05162501, + "balance_loss_mlp": 1.0197022, + "epoch": 0.4825492259131219, + "flos": 20368675895040.0, + "grad_norm": 1.8817137471332002, + "language_loss": 0.70953494, + "learning_rate": 2.209728283441112e-06, + "loss": 0.73118758, + "num_input_tokens_seen": 172541480, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.14239502, + "step": 8026, + "time_per_iteration": 2.5299689769744873 + }, + { + "auxiliary_loss_clip": 0.01132655, + "auxiliary_loss_mlp": 0.01039298, + "balance_loss_clip": 1.04882979, + "balance_loss_mlp": 1.02403951, + "epoch": 0.48260934916578985, + "flos": 14319021519360.0, + "grad_norm": 2.395630072645912, + "language_loss": 0.7472235, + "learning_rate": 2.209340965060465e-06, + "loss": 0.76894301, + "num_input_tokens_seen": 172559005, + "router_z_loss_clip": 0.83837891, + "router_z_loss_mlp": 0.15264893, + "step": 8027, + "time_per_iteration": 2.4573094844818115 + }, + { + "auxiliary_loss_clip": 0.01137595, + "auxiliary_loss_mlp": 0.01040233, + "balance_loss_clip": 1.05578518, + "balance_loss_mlp": 1.0266248, + "epoch": 0.4826694724184578, + "flos": 22121152548480.0, + "grad_norm": 8.182697260097392, + "language_loss": 0.67310119, + "learning_rate": 2.2089536387415868e-06, + "loss": 0.69487953, + "num_input_tokens_seen": 172578435, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.13598633, + "step": 8028, + "time_per_iteration": 2.5539841651916504 + }, + { + "auxiliary_loss_clip": 0.01135047, + "auxiliary_loss_mlp": 0.01036729, + "balance_loss_clip": 1.05373406, + "balance_loss_mlp": 1.0226506, + "epoch": 0.48272959567112583, + "flos": 16181169373440.0, + "grad_norm": 1.5133454931806443, + "language_loss": 0.73116827, + "learning_rate": 2.2085663044991655e-06, + "loss": 0.75288606, + "num_input_tokens_seen": 172596095, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.14074707, + "step": 8029, + "time_per_iteration": 2.439739942550659 + }, + { + "auxiliary_loss_clip": 0.01137235, + "auxiliary_loss_mlp": 0.01031276, + "balance_loss_clip": 1.05732131, + "balance_loss_mlp": 1.01696551, + "epoch": 0.4827897189237938, + "flos": 23180445561600.0, + "grad_norm": 2.674483131275038, + "language_loss": 0.84983069, + "learning_rate": 2.2081789623478896e-06, + "loss": 0.87151587, + "num_input_tokens_seen": 172615255, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.14300537, + "step": 8030, + "time_per_iteration": 2.625584363937378 + }, + { + "auxiliary_loss_clip": 0.01126944, + "auxiliary_loss_mlp": 0.01031333, + "balance_loss_clip": 1.04845428, + "balance_loss_mlp": 1.01870835, + "epoch": 0.48284984217646176, + "flos": 21652626522240.0, + "grad_norm": 5.84486987477153, + "language_loss": 0.7393961, + "learning_rate": 2.2077916123024466e-06, + "loss": 0.76097882, + "num_input_tokens_seen": 172633185, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.1262207, + "step": 8031, + "time_per_iteration": 2.4314591884613037 + }, + { + "auxiliary_loss_clip": 0.01134108, + "auxiliary_loss_mlp": 0.01045586, + "balance_loss_clip": 1.05057287, + "balance_loss_mlp": 1.03038096, + "epoch": 0.48290996542912973, + "flos": 31467443304960.0, + "grad_norm": 4.485407579017944, + "language_loss": 0.72013104, + "learning_rate": 2.2074042543775245e-06, + "loss": 0.74192798, + "num_input_tokens_seen": 172654280, + "router_z_loss_clip": 0.83544922, + "router_z_loss_mlp": 0.15203857, + "step": 8032, + "time_per_iteration": 2.601259231567383 + }, + { + "auxiliary_loss_clip": 0.01134572, + "auxiliary_loss_mlp": 0.0103958, + "balance_loss_clip": 1.05187941, + "balance_loss_mlp": 1.02558446, + "epoch": 0.4829700886817977, + "flos": 24461954064000.0, + "grad_norm": 1.4853591367044943, + "language_loss": 0.74104035, + "learning_rate": 2.2070168885878126e-06, + "loss": 0.7627818, + "num_input_tokens_seen": 172675545, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.13995361, + "step": 8033, + "time_per_iteration": 3.907986640930176 + }, + { + "auxiliary_loss_clip": 0.0113493, + "auxiliary_loss_mlp": 0.01035194, + "balance_loss_clip": 1.05194259, + "balance_loss_mlp": 1.02136588, + "epoch": 0.48303021193446566, + "flos": 25702164904320.0, + "grad_norm": 1.6666529277968156, + "language_loss": 0.83536768, + "learning_rate": 2.2066295149479996e-06, + "loss": 0.85706902, + "num_input_tokens_seen": 172696455, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.13830566, + "step": 8034, + "time_per_iteration": 2.5239343643188477 + }, + { + "auxiliary_loss_clip": 0.01126703, + "auxiliary_loss_mlp": 0.01030083, + "balance_loss_clip": 1.04867923, + "balance_loss_mlp": 1.01706553, + "epoch": 0.4830903351871336, + "flos": 20085233673600.0, + "grad_norm": 1.6925641300859955, + "language_loss": 0.79287678, + "learning_rate": 2.2062421334727744e-06, + "loss": 0.81444466, + "num_input_tokens_seen": 172716720, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.13024902, + "step": 8035, + "time_per_iteration": 2.4797184467315674 + }, + { + "auxiliary_loss_clip": 0.0113415, + "auxiliary_loss_mlp": 0.01041155, + "balance_loss_clip": 1.05341625, + "balance_loss_mlp": 1.02618861, + "epoch": 0.4831504584398016, + "flos": 39452216014080.0, + "grad_norm": 1.817368901916393, + "language_loss": 0.69590747, + "learning_rate": 2.2058547441768267e-06, + "loss": 0.71766055, + "num_input_tokens_seen": 172737435, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.1496582, + "step": 8036, + "time_per_iteration": 2.6069283485412598 + }, + { + "auxiliary_loss_clip": 0.01131603, + "auxiliary_loss_mlp": 0.01035543, + "balance_loss_clip": 1.05051124, + "balance_loss_mlp": 1.02172661, + "epoch": 0.48321058169246955, + "flos": 20006588845440.0, + "grad_norm": 1.854361076647604, + "language_loss": 0.72607815, + "learning_rate": 2.205467347074847e-06, + "loss": 0.74774963, + "num_input_tokens_seen": 172755700, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.13812256, + "step": 8037, + "time_per_iteration": 2.5092763900756836 + }, + { + "auxiliary_loss_clip": 0.01143124, + "auxiliary_loss_mlp": 0.01036293, + "balance_loss_clip": 1.05766273, + "balance_loss_mlp": 1.02152276, + "epoch": 0.4832707049451375, + "flos": 20741465197440.0, + "grad_norm": 2.3809965298808518, + "language_loss": 0.69095749, + "learning_rate": 2.205079942181525e-06, + "loss": 0.71275163, + "num_input_tokens_seen": 172775185, + "router_z_loss_clip": 0.85400391, + "router_z_loss_mlp": 0.14770508, + "step": 8038, + "time_per_iteration": 2.452836036682129 + }, + { + "auxiliary_loss_clip": 0.01129507, + "auxiliary_loss_mlp": 0.01036632, + "balance_loss_clip": 1.05024195, + "balance_loss_mlp": 1.02278566, + "epoch": 0.4833308281978055, + "flos": 33145584762240.0, + "grad_norm": 1.945930720016963, + "language_loss": 0.79130328, + "learning_rate": 2.20469252951155e-06, + "loss": 0.81296462, + "num_input_tokens_seen": 172796990, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.13842773, + "step": 8039, + "time_per_iteration": 3.967386245727539 + }, + { + "auxiliary_loss_clip": 0.01131153, + "auxiliary_loss_mlp": 0.01031138, + "balance_loss_clip": 1.05054975, + "balance_loss_mlp": 1.01757216, + "epoch": 0.48339095145047345, + "flos": 19099234362240.0, + "grad_norm": 1.554629379920287, + "language_loss": 0.7744174, + "learning_rate": 2.2043051090796143e-06, + "loss": 0.7960403, + "num_input_tokens_seen": 172814915, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.13562012, + "step": 8040, + "time_per_iteration": 2.477023124694824 + }, + { + "auxiliary_loss_clip": 0.01130957, + "auxiliary_loss_mlp": 0.01036234, + "balance_loss_clip": 1.04994416, + "balance_loss_mlp": 1.02158368, + "epoch": 0.4834510747031414, + "flos": 34459448440320.0, + "grad_norm": 1.5836767052308172, + "language_loss": 0.75711226, + "learning_rate": 2.203917680900409e-06, + "loss": 0.77878416, + "num_input_tokens_seen": 172837060, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.14630127, + "step": 8041, + "time_per_iteration": 2.5755057334899902 + }, + { + "auxiliary_loss_clip": 0.01146611, + "auxiliary_loss_mlp": 0.01035561, + "balance_loss_clip": 1.06476736, + "balance_loss_mlp": 1.02182817, + "epoch": 0.48351119795580944, + "flos": 27380845065600.0, + "grad_norm": 1.7906635414956016, + "language_loss": 0.66561174, + "learning_rate": 2.203530244988624e-06, + "loss": 0.68743348, + "num_input_tokens_seen": 172856545, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.13726807, + "step": 8042, + "time_per_iteration": 2.5395028591156006 + }, + { + "auxiliary_loss_clip": 0.01058136, + "auxiliary_loss_mlp": 0.01007514, + "balance_loss_clip": 1.03130007, + "balance_loss_mlp": 1.00595272, + "epoch": 0.4835713212084774, + "flos": 67143941291520.0, + "grad_norm": 0.7111183357777436, + "language_loss": 0.58537281, + "learning_rate": 2.2031428013589517e-06, + "loss": 0.60602933, + "num_input_tokens_seen": 172923055, + "router_z_loss_clip": 0.26855469, + "router_z_loss_mlp": 0.01560974, + "step": 8043, + "time_per_iteration": 3.191051483154297 + }, + { + "auxiliary_loss_clip": 0.01133907, + "auxiliary_loss_mlp": 0.01033042, + "balance_loss_clip": 1.05205512, + "balance_loss_mlp": 1.01911807, + "epoch": 0.48363144446114537, + "flos": 17967473660160.0, + "grad_norm": 1.9348309058217485, + "language_loss": 0.7197538, + "learning_rate": 2.2027553500260847e-06, + "loss": 0.74142331, + "num_input_tokens_seen": 172940700, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.13909912, + "step": 8044, + "time_per_iteration": 2.4555392265319824 + }, + { + "auxiliary_loss_clip": 0.0113019, + "auxiliary_loss_mlp": 0.01030469, + "balance_loss_clip": 1.05252457, + "balance_loss_mlp": 1.0163312, + "epoch": 0.48369156771381333, + "flos": 20593513077120.0, + "grad_norm": 1.4467739050662511, + "language_loss": 0.75734764, + "learning_rate": 2.202367891004714e-06, + "loss": 0.77895427, + "num_input_tokens_seen": 172961125, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.14135742, + "step": 8045, + "time_per_iteration": 2.494626760482788 + }, + { + "auxiliary_loss_clip": 0.01127084, + "auxiliary_loss_mlp": 0.01036383, + "balance_loss_clip": 1.04623699, + "balance_loss_mlp": 1.02296615, + "epoch": 0.4837516909664813, + "flos": 22675075159680.0, + "grad_norm": 1.5855966079878299, + "language_loss": 0.69233787, + "learning_rate": 2.201980424309533e-06, + "loss": 0.71397257, + "num_input_tokens_seen": 172980405, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.13421631, + "step": 8046, + "time_per_iteration": 2.5128839015960693 + }, + { + "auxiliary_loss_clip": 0.01132445, + "auxiliary_loss_mlp": 0.01038451, + "balance_loss_clip": 1.04969549, + "balance_loss_mlp": 1.02449179, + "epoch": 0.48381181421914926, + "flos": 25518625384320.0, + "grad_norm": 1.9223739595355005, + "language_loss": 0.82404327, + "learning_rate": 2.2015929499552337e-06, + "loss": 0.84575224, + "num_input_tokens_seen": 172999105, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.13970947, + "step": 8047, + "time_per_iteration": 3.8800699710845947 + }, + { + "auxiliary_loss_clip": 0.01126293, + "auxiliary_loss_mlp": 0.01038263, + "balance_loss_clip": 1.04689384, + "balance_loss_mlp": 1.02392805, + "epoch": 0.4838719374718172, + "flos": 24207491139840.0, + "grad_norm": 1.8841411657559537, + "language_loss": 0.80423415, + "learning_rate": 2.2012054679565092e-06, + "loss": 0.82587969, + "num_input_tokens_seen": 173019935, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.14343262, + "step": 8048, + "time_per_iteration": 2.5632948875427246 + }, + { + "auxiliary_loss_clip": 0.01131267, + "auxiliary_loss_mlp": 0.010393, + "balance_loss_clip": 1.05000174, + "balance_loss_mlp": 1.02566218, + "epoch": 0.4839320607244852, + "flos": 26724577628160.0, + "grad_norm": 1.5279898845002864, + "language_loss": 0.81310326, + "learning_rate": 2.200817978328054e-06, + "loss": 0.83480895, + "num_input_tokens_seen": 173039700, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.13635254, + "step": 8049, + "time_per_iteration": 2.4925458431243896 + }, + { + "auxiliary_loss_clip": 0.01133779, + "auxiliary_loss_mlp": 0.0103086, + "balance_loss_clip": 1.05707479, + "balance_loss_mlp": 1.01848578, + "epoch": 0.48399218397715316, + "flos": 20448900921600.0, + "grad_norm": 1.6651573911681987, + "language_loss": 0.7273801, + "learning_rate": 2.2004304810845602e-06, + "loss": 0.74902642, + "num_input_tokens_seen": 173059170, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.1237793, + "step": 8050, + "time_per_iteration": 2.5521678924560547 + }, + { + "auxiliary_loss_clip": 0.01049447, + "auxiliary_loss_mlp": 0.0100181, + "balance_loss_clip": 1.02340341, + "balance_loss_mlp": 1.00043046, + "epoch": 0.4840523072298211, + "flos": 67180570185600.0, + "grad_norm": 0.7003260502438041, + "language_loss": 0.56383836, + "learning_rate": 2.200042976240723e-06, + "loss": 0.58435094, + "num_input_tokens_seen": 173119000, + "router_z_loss_clip": 0.26074219, + "router_z_loss_mlp": 0.01379395, + "step": 8051, + "time_per_iteration": 3.1906068325042725 + }, + { + "auxiliary_loss_clip": 0.01129073, + "auxiliary_loss_mlp": 0.01035651, + "balance_loss_clip": 1.048895, + "balance_loss_mlp": 1.02157211, + "epoch": 0.4841124304824891, + "flos": 22411490181120.0, + "grad_norm": 2.4519367852129625, + "language_loss": 0.74956596, + "learning_rate": 2.199655463811236e-06, + "loss": 0.77121329, + "num_input_tokens_seen": 173137570, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.14074707, + "step": 8052, + "time_per_iteration": 2.6318278312683105 + }, + { + "auxiliary_loss_clip": 0.01129256, + "auxiliary_loss_mlp": 0.01034989, + "balance_loss_clip": 1.04877758, + "balance_loss_mlp": 1.02185237, + "epoch": 0.48417255373515705, + "flos": 13843959217920.0, + "grad_norm": 2.2073188987218195, + "language_loss": 0.66272235, + "learning_rate": 2.1992679438107936e-06, + "loss": 0.68436486, + "num_input_tokens_seen": 173154355, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.13146973, + "step": 8053, + "time_per_iteration": 2.5461995601654053 + }, + { + "auxiliary_loss_clip": 0.0112233, + "auxiliary_loss_mlp": 0.01030757, + "balance_loss_clip": 1.04336619, + "balance_loss_mlp": 1.0176971, + "epoch": 0.484232676987825, + "flos": 31649689935360.0, + "grad_norm": 1.9660227345895906, + "language_loss": 0.6913532, + "learning_rate": 2.198880416254091e-06, + "loss": 0.71288401, + "num_input_tokens_seen": 173174845, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.1307373, + "step": 8054, + "time_per_iteration": 2.537081718444824 + }, + { + "auxiliary_loss_clip": 0.01123922, + "auxiliary_loss_mlp": 0.01033524, + "balance_loss_clip": 1.04318261, + "balance_loss_mlp": 1.02005374, + "epoch": 0.48429280024049304, + "flos": 24095377814400.0, + "grad_norm": 1.5790253344806569, + "language_loss": 0.69551909, + "learning_rate": 2.1984928811558233e-06, + "loss": 0.71709353, + "num_input_tokens_seen": 173195025, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.13458252, + "step": 8055, + "time_per_iteration": 2.4889321327209473 + }, + { + "auxiliary_loss_clip": 0.01129526, + "auxiliary_loss_mlp": 0.01035698, + "balance_loss_clip": 1.04978979, + "balance_loss_mlp": 1.02186954, + "epoch": 0.484352923493161, + "flos": 17530081747200.0, + "grad_norm": 2.025968035528365, + "language_loss": 0.63653457, + "learning_rate": 2.198105338530685e-06, + "loss": 0.65818685, + "num_input_tokens_seen": 173213065, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.13842773, + "step": 8056, + "time_per_iteration": 2.427804946899414 + }, + { + "auxiliary_loss_clip": 0.01131624, + "auxiliary_loss_mlp": 0.01035171, + "balance_loss_clip": 1.05239367, + "balance_loss_mlp": 1.02125895, + "epoch": 0.48441304674582897, + "flos": 29166862043520.0, + "grad_norm": 1.895399350571303, + "language_loss": 0.67153418, + "learning_rate": 2.1977177883933726e-06, + "loss": 0.69320214, + "num_input_tokens_seen": 173234545, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.13916016, + "step": 8057, + "time_per_iteration": 2.5503623485565186 + }, + { + "auxiliary_loss_clip": 0.01128563, + "auxiliary_loss_mlp": 0.01041244, + "balance_loss_clip": 1.05010486, + "balance_loss_mlp": 1.02606893, + "epoch": 0.48447316999849693, + "flos": 15886701676800.0, + "grad_norm": 1.9175394672974717, + "language_loss": 0.81531668, + "learning_rate": 2.1973302307585827e-06, + "loss": 0.83701468, + "num_input_tokens_seen": 173252175, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.15185547, + "step": 8058, + "time_per_iteration": 2.419605016708374 + }, + { + "auxiliary_loss_clip": 0.01132267, + "auxiliary_loss_mlp": 0.01042666, + "balance_loss_clip": 1.05200696, + "balance_loss_mlp": 1.02740717, + "epoch": 0.4845332932511649, + "flos": 24381405815040.0, + "grad_norm": 1.805675733605371, + "language_loss": 0.80044746, + "learning_rate": 2.1969426656410097e-06, + "loss": 0.82219678, + "num_input_tokens_seen": 173268790, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.15264893, + "step": 8059, + "time_per_iteration": 2.4900994300842285 + }, + { + "auxiliary_loss_clip": 0.01134567, + "auxiliary_loss_mlp": 0.01046096, + "balance_loss_clip": 1.05131209, + "balance_loss_mlp": 1.02889431, + "epoch": 0.48459341650383286, + "flos": 37116478316160.0, + "grad_norm": 2.0962008698852292, + "language_loss": 0.66804123, + "learning_rate": 2.196555093055352e-06, + "loss": 0.68984783, + "num_input_tokens_seen": 173288030, + "router_z_loss_clip": 0.83300781, + "router_z_loss_mlp": 0.17211914, + "step": 8060, + "time_per_iteration": 2.6113338470458984 + }, + { + "auxiliary_loss_clip": 0.01132876, + "auxiliary_loss_mlp": 0.01049033, + "balance_loss_clip": 1.04962993, + "balance_loss_mlp": 1.03373826, + "epoch": 0.48465353975650083, + "flos": 22966777509120.0, + "grad_norm": 3.022017881216579, + "language_loss": 0.67198062, + "learning_rate": 2.1961675130163046e-06, + "loss": 0.69379973, + "num_input_tokens_seen": 173305965, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 0.15283203, + "step": 8061, + "time_per_iteration": 2.529332160949707 + }, + { + "auxiliary_loss_clip": 0.01133362, + "auxiliary_loss_mlp": 0.01048986, + "balance_loss_clip": 1.04972601, + "balance_loss_mlp": 1.03277338, + "epoch": 0.4847136630091688, + "flos": 17707695523200.0, + "grad_norm": 1.881084634107338, + "language_loss": 0.82497406, + "learning_rate": 2.1957799255385653e-06, + "loss": 0.84679759, + "num_input_tokens_seen": 173321985, + "router_z_loss_clip": 0.83691406, + "router_z_loss_mlp": 0.16223145, + "step": 8062, + "time_per_iteration": 2.4512648582458496 + }, + { + "auxiliary_loss_clip": 0.01127845, + "auxiliary_loss_mlp": 0.01034884, + "balance_loss_clip": 1.04787326, + "balance_loss_mlp": 1.02170575, + "epoch": 0.48477378626183676, + "flos": 22018269018240.0, + "grad_norm": 1.5857796961420436, + "language_loss": 0.74531901, + "learning_rate": 2.1953923306368325e-06, + "loss": 0.76694632, + "num_input_tokens_seen": 173341315, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.13183594, + "step": 8063, + "time_per_iteration": 3.9684507846832275 + }, + { + "auxiliary_loss_clip": 0.01131982, + "auxiliary_loss_mlp": 0.01039473, + "balance_loss_clip": 1.04845154, + "balance_loss_mlp": 1.02512074, + "epoch": 0.4848339095145047, + "flos": 27962956874880.0, + "grad_norm": 1.7050065505237146, + "language_loss": 0.79012835, + "learning_rate": 2.1950047283258023e-06, + "loss": 0.81184292, + "num_input_tokens_seen": 173361055, + "router_z_loss_clip": 0.83544922, + "router_z_loss_mlp": 0.14349365, + "step": 8064, + "time_per_iteration": 2.4936389923095703 + }, + { + "auxiliary_loss_clip": 0.01136125, + "auxiliary_loss_mlp": 0.01033822, + "balance_loss_clip": 1.05886126, + "balance_loss_mlp": 1.02174604, + "epoch": 0.4848940327671727, + "flos": 21688752625920.0, + "grad_norm": 1.8133038135005362, + "language_loss": 0.79313862, + "learning_rate": 2.194617118620173e-06, + "loss": 0.81483811, + "num_input_tokens_seen": 173379255, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.12078857, + "step": 8065, + "time_per_iteration": 2.4748125076293945 + }, + { + "auxiliary_loss_clip": 0.0112948, + "auxiliary_loss_mlp": 0.01034532, + "balance_loss_clip": 1.05438066, + "balance_loss_mlp": 1.02217007, + "epoch": 0.48495415601984065, + "flos": 20631578515200.0, + "grad_norm": 1.7770442001511297, + "language_loss": 0.76364684, + "learning_rate": 2.194229501534644e-06, + "loss": 0.78528702, + "num_input_tokens_seen": 173398370, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.12353516, + "step": 8066, + "time_per_iteration": 2.508496046066284 + }, + { + "auxiliary_loss_clip": 0.01127711, + "auxiliary_loss_mlp": 0.01030458, + "balance_loss_clip": 1.05112791, + "balance_loss_mlp": 1.01731515, + "epoch": 0.4850142792725086, + "flos": 25628152930560.0, + "grad_norm": 1.5058098367033015, + "language_loss": 0.72276354, + "learning_rate": 2.193841877083912e-06, + "loss": 0.74434519, + "num_input_tokens_seen": 173419595, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.13153076, + "step": 8067, + "time_per_iteration": 2.558365821838379 + }, + { + "auxiliary_loss_clip": 0.0112986, + "auxiliary_loss_mlp": 0.01035454, + "balance_loss_clip": 1.04984725, + "balance_loss_mlp": 1.02163219, + "epoch": 0.4850744025251766, + "flos": 13771958405760.0, + "grad_norm": 1.9812294971615434, + "language_loss": 0.79630077, + "learning_rate": 2.1934542452826767e-06, + "loss": 0.81795394, + "num_input_tokens_seen": 173435390, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.13824463, + "step": 8068, + "time_per_iteration": 2.4023513793945312 + }, + { + "auxiliary_loss_clip": 0.01121469, + "auxiliary_loss_mlp": 0.01034697, + "balance_loss_clip": 1.04319263, + "balance_loss_mlp": 1.02198923, + "epoch": 0.4851345257778446, + "flos": 20261339078400.0, + "grad_norm": 1.511503009292539, + "language_loss": 0.84419405, + "learning_rate": 2.193066606145638e-06, + "loss": 0.8657558, + "num_input_tokens_seen": 173454095, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.12695312, + "step": 8069, + "time_per_iteration": 2.4719061851501465 + }, + { + "auxiliary_loss_clip": 0.01126392, + "auxiliary_loss_mlp": 0.01026767, + "balance_loss_clip": 1.04988122, + "balance_loss_mlp": 1.01435125, + "epoch": 0.48519464903051257, + "flos": 27089681420160.0, + "grad_norm": 1.8266000616851013, + "language_loss": 0.78227502, + "learning_rate": 2.192678959687493e-06, + "loss": 0.80380654, + "num_input_tokens_seen": 173475300, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.12414551, + "step": 8070, + "time_per_iteration": 2.494622230529785 + }, + { + "auxiliary_loss_clip": 0.01139335, + "auxiliary_loss_mlp": 0.01031139, + "balance_loss_clip": 1.06003547, + "balance_loss_mlp": 1.01785874, + "epoch": 0.48525477228318054, + "flos": 17127235739520.0, + "grad_norm": 2.0214908044072746, + "language_loss": 0.7768963, + "learning_rate": 2.192291305922943e-06, + "loss": 0.79860103, + "num_input_tokens_seen": 173492005, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.13275146, + "step": 8071, + "time_per_iteration": 2.480524778366089 + }, + { + "auxiliary_loss_clip": 0.01123839, + "auxiliary_loss_mlp": 0.01029228, + "balance_loss_clip": 1.04557693, + "balance_loss_mlp": 1.0149765, + "epoch": 0.4853148955358485, + "flos": 28180324028160.0, + "grad_norm": 1.935494052155602, + "language_loss": 0.72743583, + "learning_rate": 2.1919036448666873e-06, + "loss": 0.74896657, + "num_input_tokens_seen": 173511995, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.14263916, + "step": 8072, + "time_per_iteration": 2.5071871280670166 + }, + { + "auxiliary_loss_clip": 0.01131892, + "auxiliary_loss_mlp": 0.01037536, + "balance_loss_clip": 1.05137599, + "balance_loss_mlp": 1.02352321, + "epoch": 0.48537501878851647, + "flos": 17493309198720.0, + "grad_norm": 2.2620475869929346, + "language_loss": 0.8779645, + "learning_rate": 2.1915159765334262e-06, + "loss": 0.8996588, + "num_input_tokens_seen": 173530215, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.14007568, + "step": 8073, + "time_per_iteration": 2.4858713150024414 + }, + { + "auxiliary_loss_clip": 0.01126418, + "auxiliary_loss_mlp": 0.01031816, + "balance_loss_clip": 1.05227828, + "balance_loss_mlp": 1.01858377, + "epoch": 0.48543514204118443, + "flos": 28584857975040.0, + "grad_norm": 1.9446267182405206, + "language_loss": 0.60926312, + "learning_rate": 2.19112830093786e-06, + "loss": 0.63084543, + "num_input_tokens_seen": 173550920, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.13238525, + "step": 8074, + "time_per_iteration": 2.506887435913086 + }, + { + "auxiliary_loss_clip": 0.01134789, + "auxiliary_loss_mlp": 0.01036981, + "balance_loss_clip": 1.05468655, + "balance_loss_mlp": 1.02265775, + "epoch": 0.4854952652938524, + "flos": 20959981585920.0, + "grad_norm": 1.5999570505438057, + "language_loss": 0.73119521, + "learning_rate": 2.19074061809469e-06, + "loss": 0.75291288, + "num_input_tokens_seen": 173569065, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.14324951, + "step": 8075, + "time_per_iteration": 2.5800695419311523 + }, + { + "auxiliary_loss_clip": 0.01132066, + "auxiliary_loss_mlp": 0.01035709, + "balance_loss_clip": 1.058429, + "balance_loss_mlp": 1.02323353, + "epoch": 0.48555538854652036, + "flos": 66529543155840.0, + "grad_norm": 1.8963985753580985, + "language_loss": 0.81994998, + "learning_rate": 2.1903529280186163e-06, + "loss": 0.84162784, + "num_input_tokens_seen": 173596085, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.12475586, + "step": 8076, + "time_per_iteration": 2.8837270736694336 + }, + { + "auxiliary_loss_clip": 0.01121887, + "auxiliary_loss_mlp": 0.01036731, + "balance_loss_clip": 1.04408836, + "balance_loss_mlp": 1.02181792, + "epoch": 0.4856155117991883, + "flos": 15924982596480.0, + "grad_norm": 1.9007695744527857, + "language_loss": 0.86377239, + "learning_rate": 2.1899652307243407e-06, + "loss": 0.88535851, + "num_input_tokens_seen": 173613900, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.14910889, + "step": 8077, + "time_per_iteration": 3.8952436447143555 + }, + { + "auxiliary_loss_clip": 0.01049382, + "auxiliary_loss_mlp": 0.01015411, + "balance_loss_clip": 1.022403, + "balance_loss_mlp": 1.01386619, + "epoch": 0.4856756350518563, + "flos": 71047395060480.0, + "grad_norm": 0.9022471551823212, + "language_loss": 0.5852192, + "learning_rate": 2.189577526226564e-06, + "loss": 0.60586715, + "num_input_tokens_seen": 173671305, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.01548767, + "step": 8078, + "time_per_iteration": 3.0678117275238037 + }, + { + "auxiliary_loss_clip": 0.01133029, + "auxiliary_loss_mlp": 0.01035843, + "balance_loss_clip": 1.0530858, + "balance_loss_mlp": 1.02271795, + "epoch": 0.48573575830452426, + "flos": 29825679346560.0, + "grad_norm": 2.532529807048561, + "language_loss": 0.72012162, + "learning_rate": 2.1891898145399884e-06, + "loss": 0.74181032, + "num_input_tokens_seen": 173692070, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.13128662, + "step": 8079, + "time_per_iteration": 2.5112929344177246 + }, + { + "auxiliary_loss_clip": 0.01127762, + "auxiliary_loss_mlp": 0.01033027, + "balance_loss_clip": 1.04768133, + "balance_loss_mlp": 1.01974654, + "epoch": 0.4857958815571922, + "flos": 17639501552640.0, + "grad_norm": 3.174189961606875, + "language_loss": 0.79510081, + "learning_rate": 2.1888020956793172e-06, + "loss": 0.81670868, + "num_input_tokens_seen": 173709785, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.1328125, + "step": 8080, + "time_per_iteration": 2.4478647708892822 + }, + { + "auxiliary_loss_clip": 0.01129459, + "auxiliary_loss_mlp": 0.01032855, + "balance_loss_clip": 1.04898071, + "balance_loss_mlp": 1.01890707, + "epoch": 0.4858560048098602, + "flos": 21105491581440.0, + "grad_norm": 1.7691595576521133, + "language_loss": 0.844347, + "learning_rate": 2.188414369659251e-06, + "loss": 0.86597013, + "num_input_tokens_seen": 173728770, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.13977051, + "step": 8081, + "time_per_iteration": 2.424752712249756 + }, + { + "auxiliary_loss_clip": 0.01119971, + "auxiliary_loss_mlp": 0.0103526, + "balance_loss_clip": 1.04272425, + "balance_loss_mlp": 1.02041876, + "epoch": 0.4859161280625282, + "flos": 22090844448000.0, + "grad_norm": 1.481146464707177, + "language_loss": 0.83232963, + "learning_rate": 2.1880266364944924e-06, + "loss": 0.8538819, + "num_input_tokens_seen": 173747355, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.14855957, + "step": 8082, + "time_per_iteration": 2.5338644981384277 + }, + { + "auxiliary_loss_clip": 0.01138369, + "auxiliary_loss_mlp": 0.01030854, + "balance_loss_clip": 1.06063592, + "balance_loss_mlp": 1.01827753, + "epoch": 0.4859762513151962, + "flos": 17493452853120.0, + "grad_norm": 2.103177448924114, + "language_loss": 0.87453228, + "learning_rate": 2.187638896199746e-06, + "loss": 0.8962245, + "num_input_tokens_seen": 173764825, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.12579346, + "step": 8083, + "time_per_iteration": 3.880918264389038 + }, + { + "auxiliary_loss_clip": 0.01125261, + "auxiliary_loss_mlp": 0.01041981, + "balance_loss_clip": 1.04921091, + "balance_loss_mlp": 1.02922559, + "epoch": 0.48603637456786414, + "flos": 18004246208640.0, + "grad_norm": 1.5922702338015693, + "language_loss": 0.81206274, + "learning_rate": 2.1872511487897126e-06, + "loss": 0.83373517, + "num_input_tokens_seen": 173783215, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.12750244, + "step": 8084, + "time_per_iteration": 2.4748990535736084 + }, + { + "auxiliary_loss_clip": 0.0112514, + "auxiliary_loss_mlp": 0.01033138, + "balance_loss_clip": 1.04796672, + "balance_loss_mlp": 1.01941156, + "epoch": 0.4860964978205321, + "flos": 22492038430080.0, + "grad_norm": 3.1419502466835105, + "language_loss": 0.68529946, + "learning_rate": 2.186863394279098e-06, + "loss": 0.7068823, + "num_input_tokens_seen": 173801905, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.13720703, + "step": 8085, + "time_per_iteration": 2.5212655067443848 + }, + { + "auxiliary_loss_clip": 0.01127743, + "auxiliary_loss_mlp": 0.01038628, + "balance_loss_clip": 1.04926276, + "balance_loss_mlp": 1.02549076, + "epoch": 0.48615662107320007, + "flos": 23372532518400.0, + "grad_norm": 1.4858273369585673, + "language_loss": 0.77587813, + "learning_rate": 2.1864756326826046e-06, + "loss": 0.79754186, + "num_input_tokens_seen": 173824690, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.13140869, + "step": 8086, + "time_per_iteration": 2.5362730026245117 + }, + { + "auxiliary_loss_clip": 0.01124282, + "auxiliary_loss_mlp": 0.01030543, + "balance_loss_clip": 1.04779983, + "balance_loss_mlp": 1.01723909, + "epoch": 0.48621674432586803, + "flos": 34418833136640.0, + "grad_norm": 2.6180783475389418, + "language_loss": 0.70096648, + "learning_rate": 2.1860878640149355e-06, + "loss": 0.72251475, + "num_input_tokens_seen": 173844450, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.13311768, + "step": 8087, + "time_per_iteration": 2.6514854431152344 + }, + { + "auxiliary_loss_clip": 0.01137024, + "auxiliary_loss_mlp": 0.01041782, + "balance_loss_clip": 1.05202174, + "balance_loss_mlp": 1.02691662, + "epoch": 0.486276867578536, + "flos": 33107555237760.0, + "grad_norm": 1.6760649178984175, + "language_loss": 0.7231614, + "learning_rate": 2.1857000882907974e-06, + "loss": 0.74494946, + "num_input_tokens_seen": 173864975, + "router_z_loss_clip": 0.85058594, + "router_z_loss_mlp": 0.14868164, + "step": 8088, + "time_per_iteration": 2.5588948726654053 + }, + { + "auxiliary_loss_clip": 0.0113006, + "auxiliary_loss_mlp": 0.0104499, + "balance_loss_clip": 1.05309916, + "balance_loss_mlp": 1.03036344, + "epoch": 0.48633699083120396, + "flos": 21470703114240.0, + "grad_norm": 1.5627336255702746, + "language_loss": 0.75477624, + "learning_rate": 2.185312305524892e-06, + "loss": 0.77652669, + "num_input_tokens_seen": 173883805, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.14611816, + "step": 8089, + "time_per_iteration": 2.6150970458984375 + }, + { + "auxiliary_loss_clip": 0.01129275, + "auxiliary_loss_mlp": 0.01028429, + "balance_loss_clip": 1.05043495, + "balance_loss_mlp": 1.01489902, + "epoch": 0.48639711408387193, + "flos": 20084335833600.0, + "grad_norm": 1.7241734867462173, + "language_loss": 0.84329498, + "learning_rate": 2.184924515731926e-06, + "loss": 0.86487198, + "num_input_tokens_seen": 173903520, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.13537598, + "step": 8090, + "time_per_iteration": 3.9059090614318848 + }, + { + "auxiliary_loss_clip": 0.01126687, + "auxiliary_loss_mlp": 0.01040376, + "balance_loss_clip": 1.0517354, + "balance_loss_mlp": 1.02518904, + "epoch": 0.4864572373365399, + "flos": 20778884190720.0, + "grad_norm": 1.748705752501399, + "language_loss": 0.75869066, + "learning_rate": 2.1845367189266045e-06, + "loss": 0.78036129, + "num_input_tokens_seen": 173924255, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.15185547, + "step": 8091, + "time_per_iteration": 2.4913930892944336 + }, + { + "auxiliary_loss_clip": 0.01130781, + "auxiliary_loss_mlp": 0.01028715, + "balance_loss_clip": 1.05094993, + "balance_loss_mlp": 1.01532757, + "epoch": 0.48651736058920786, + "flos": 26025360503040.0, + "grad_norm": 1.9971609306384537, + "language_loss": 0.80268228, + "learning_rate": 2.184148915123631e-06, + "loss": 0.82427722, + "num_input_tokens_seen": 173943285, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.13391113, + "step": 8092, + "time_per_iteration": 2.494016408920288 + }, + { + "auxiliary_loss_clip": 0.01134594, + "auxiliary_loss_mlp": 0.01032534, + "balance_loss_clip": 1.05435991, + "balance_loss_mlp": 1.01837182, + "epoch": 0.4865774838418758, + "flos": 20485601642880.0, + "grad_norm": 1.5369272342119478, + "language_loss": 0.71772224, + "learning_rate": 2.1837611043377126e-06, + "loss": 0.73939347, + "num_input_tokens_seen": 173962205, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.14147949, + "step": 8093, + "time_per_iteration": 2.4787397384643555 + }, + { + "auxiliary_loss_clip": 0.01127067, + "auxiliary_loss_mlp": 0.01033357, + "balance_loss_clip": 1.04896259, + "balance_loss_mlp": 1.02049387, + "epoch": 0.4866376070945438, + "flos": 23547704169600.0, + "grad_norm": 1.683756751564755, + "language_loss": 0.68365479, + "learning_rate": 2.1833732865835545e-06, + "loss": 0.70525908, + "num_input_tokens_seen": 173980945, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.12872314, + "step": 8094, + "time_per_iteration": 2.552281141281128 + }, + { + "auxiliary_loss_clip": 0.01131654, + "auxiliary_loss_mlp": 0.01037699, + "balance_loss_clip": 1.05101967, + "balance_loss_mlp": 1.0237689, + "epoch": 0.4866977303472118, + "flos": 16690598012160.0, + "grad_norm": 3.565460052088183, + "language_loss": 0.66755432, + "learning_rate": 2.1829854618758636e-06, + "loss": 0.68924785, + "num_input_tokens_seen": 173998860, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.13928223, + "step": 8095, + "time_per_iteration": 2.456960916519165 + }, + { + "auxiliary_loss_clip": 0.01127985, + "auxiliary_loss_mlp": 0.01034891, + "balance_loss_clip": 1.04828417, + "balance_loss_mlp": 1.02041936, + "epoch": 0.4867578535998798, + "flos": 17896011552000.0, + "grad_norm": 3.8098773130942685, + "language_loss": 0.78877544, + "learning_rate": 2.182597630229345e-06, + "loss": 0.81040424, + "num_input_tokens_seen": 174016665, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.1449585, + "step": 8096, + "time_per_iteration": 2.438993215560913 + }, + { + "auxiliary_loss_clip": 0.01122484, + "auxiliary_loss_mlp": 0.01036079, + "balance_loss_clip": 1.04651642, + "balance_loss_mlp": 1.02275729, + "epoch": 0.48681797685254774, + "flos": 22637799820800.0, + "grad_norm": 2.015431099107421, + "language_loss": 0.67680991, + "learning_rate": 2.1822097916587067e-06, + "loss": 0.69839555, + "num_input_tokens_seen": 174034800, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.13317871, + "step": 8097, + "time_per_iteration": 2.4728636741638184 + }, + { + "auxiliary_loss_clip": 0.01123392, + "auxiliary_loss_mlp": 0.01037486, + "balance_loss_clip": 1.04767132, + "balance_loss_mlp": 1.02433705, + "epoch": 0.4868781001052157, + "flos": 20886077352960.0, + "grad_norm": 1.445895758836789, + "language_loss": 0.71203315, + "learning_rate": 2.1818219461786543e-06, + "loss": 0.73364198, + "num_input_tokens_seen": 174054445, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.13165283, + "step": 8098, + "time_per_iteration": 2.443678379058838 + }, + { + "auxiliary_loss_clip": 0.01131827, + "auxiliary_loss_mlp": 0.01034985, + "balance_loss_clip": 1.04816008, + "balance_loss_mlp": 1.02060843, + "epoch": 0.48693822335788367, + "flos": 41974940937600.0, + "grad_norm": 4.2251423710202145, + "language_loss": 0.66076058, + "learning_rate": 2.1814340938038956e-06, + "loss": 0.68242866, + "num_input_tokens_seen": 174077890, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.14361572, + "step": 8099, + "time_per_iteration": 2.636664867401123 + }, + { + "auxiliary_loss_clip": 0.01125198, + "auxiliary_loss_mlp": 0.01035497, + "balance_loss_clip": 1.04679537, + "balance_loss_mlp": 1.02257514, + "epoch": 0.48699834661055164, + "flos": 24243294021120.0, + "grad_norm": 1.64819794392328, + "language_loss": 0.66799736, + "learning_rate": 2.181046234549138e-06, + "loss": 0.68960428, + "num_input_tokens_seen": 174097460, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.1293335, + "step": 8100, + "time_per_iteration": 2.4791252613067627 + }, + { + "auxiliary_loss_clip": 0.01124082, + "auxiliary_loss_mlp": 0.01031251, + "balance_loss_clip": 1.05005968, + "balance_loss_mlp": 1.01908517, + "epoch": 0.4870584698632196, + "flos": 25923877603200.0, + "grad_norm": 1.3560642309205748, + "language_loss": 0.76709032, + "learning_rate": 2.180658368429088e-06, + "loss": 0.78864366, + "num_input_tokens_seen": 174120775, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.12176514, + "step": 8101, + "time_per_iteration": 2.651085138320923 + }, + { + "auxiliary_loss_clip": 0.01064637, + "auxiliary_loss_mlp": 0.01006962, + "balance_loss_clip": 1.03763843, + "balance_loss_mlp": 1.00533438, + "epoch": 0.48711859311588757, + "flos": 70211933648640.0, + "grad_norm": 0.6786661710538153, + "language_loss": 0.52329731, + "learning_rate": 2.1802704954584565e-06, + "loss": 0.54401332, + "num_input_tokens_seen": 174189135, + "router_z_loss_clip": 0.27001953, + "router_z_loss_mlp": 0.01629639, + "step": 8102, + "time_per_iteration": 3.318486213684082 + }, + { + "auxiliary_loss_clip": 0.01124671, + "auxiliary_loss_mlp": 0.01032512, + "balance_loss_clip": 1.04691875, + "balance_loss_mlp": 1.0195663, + "epoch": 0.48717871636855553, + "flos": 12342964659840.0, + "grad_norm": 1.9323304057415553, + "language_loss": 0.73627162, + "learning_rate": 2.1798826156519484e-06, + "loss": 0.75784349, + "num_input_tokens_seen": 174203250, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.1295166, + "step": 8103, + "time_per_iteration": 2.4455509185791016 + }, + { + "auxiliary_loss_clip": 0.01141047, + "auxiliary_loss_mlp": 0.01043039, + "balance_loss_clip": 1.06016171, + "balance_loss_mlp": 1.02940726, + "epoch": 0.4872388396212235, + "flos": 23477139901440.0, + "grad_norm": 1.504549619258253, + "language_loss": 0.62846637, + "learning_rate": 2.1794947290242737e-06, + "loss": 0.6503073, + "num_input_tokens_seen": 174224145, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.1362915, + "step": 8104, + "time_per_iteration": 2.499178886413574 + }, + { + "auxiliary_loss_clip": 0.01122629, + "auxiliary_loss_mlp": 0.01033299, + "balance_loss_clip": 1.04593444, + "balance_loss_mlp": 1.01907754, + "epoch": 0.48729896287389146, + "flos": 31427582186880.0, + "grad_norm": 1.570161720460008, + "language_loss": 0.69530833, + "learning_rate": 2.1791068355901413e-06, + "loss": 0.71686757, + "num_input_tokens_seen": 174244435, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.14208984, + "step": 8105, + "time_per_iteration": 2.5403199195861816 + }, + { + "auxiliary_loss_clip": 0.01123543, + "auxiliary_loss_mlp": 0.01031545, + "balance_loss_clip": 1.04699183, + "balance_loss_mlp": 1.01877713, + "epoch": 0.4873590861265594, + "flos": 19057936700160.0, + "grad_norm": 1.7639706473103993, + "language_loss": 0.73692101, + "learning_rate": 2.178718935364259e-06, + "loss": 0.75847197, + "num_input_tokens_seen": 174262710, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12762451, + "step": 8106, + "time_per_iteration": 3.94307804107666 + }, + { + "auxiliary_loss_clip": 0.01126307, + "auxiliary_loss_mlp": 0.01035006, + "balance_loss_clip": 1.04695928, + "balance_loss_mlp": 1.02119565, + "epoch": 0.4874192093792274, + "flos": 24348296453760.0, + "grad_norm": 2.074170976690786, + "language_loss": 0.76678121, + "learning_rate": 2.1783310283613373e-06, + "loss": 0.78839439, + "num_input_tokens_seen": 174281545, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.13818359, + "step": 8107, + "time_per_iteration": 2.533111572265625 + }, + { + "auxiliary_loss_clip": 0.01115559, + "auxiliary_loss_mlp": 0.01027484, + "balance_loss_clip": 1.04257369, + "balance_loss_mlp": 1.01561093, + "epoch": 0.4874793326318954, + "flos": 23112610727040.0, + "grad_norm": 2.327866086945829, + "language_loss": 0.75233424, + "learning_rate": 2.1779431145960853e-06, + "loss": 0.77376473, + "num_input_tokens_seen": 174300290, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11871338, + "step": 8108, + "time_per_iteration": 2.517803192138672 + }, + { + "auxiliary_loss_clip": 0.0112041, + "auxiliary_loss_mlp": 0.01029213, + "balance_loss_clip": 1.04492223, + "balance_loss_mlp": 1.01753044, + "epoch": 0.4875394558845634, + "flos": 19026156142080.0, + "grad_norm": 1.6763918390865473, + "language_loss": 0.73606312, + "learning_rate": 2.177555194083212e-06, + "loss": 0.7575593, + "num_input_tokens_seen": 174318490, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.11676025, + "step": 8109, + "time_per_iteration": 2.455329179763794 + }, + { + "auxiliary_loss_clip": 0.01122683, + "auxiliary_loss_mlp": 0.01038722, + "balance_loss_clip": 1.0472393, + "balance_loss_mlp": 1.02432179, + "epoch": 0.48759957913723134, + "flos": 21433607343360.0, + "grad_norm": 1.7423771538278845, + "language_loss": 0.78655905, + "learning_rate": 2.177167266837428e-06, + "loss": 0.80817306, + "num_input_tokens_seen": 174335505, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.144104, + "step": 8110, + "time_per_iteration": 2.466472864151001 + }, + { + "auxiliary_loss_clip": 0.01126525, + "auxiliary_loss_mlp": 0.01043089, + "balance_loss_clip": 1.04783344, + "balance_loss_mlp": 1.02948177, + "epoch": 0.4876597023898993, + "flos": 17748669962880.0, + "grad_norm": 1.827106459347184, + "language_loss": 0.71909922, + "learning_rate": 2.176779332873444e-06, + "loss": 0.74079537, + "num_input_tokens_seen": 174353990, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.13604736, + "step": 8111, + "time_per_iteration": 2.4266514778137207 + }, + { + "auxiliary_loss_clip": 0.01131648, + "auxiliary_loss_mlp": 0.01034362, + "balance_loss_clip": 1.05441403, + "balance_loss_mlp": 1.02148199, + "epoch": 0.4877198256425673, + "flos": 17019647527680.0, + "grad_norm": 1.5738996105556173, + "language_loss": 0.75992703, + "learning_rate": 2.17639139220597e-06, + "loss": 0.78158712, + "num_input_tokens_seen": 174373425, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.12872314, + "step": 8112, + "time_per_iteration": 2.550755500793457 + }, + { + "auxiliary_loss_clip": 0.01132463, + "auxiliary_loss_mlp": 0.01033671, + "balance_loss_clip": 1.04927731, + "balance_loss_mlp": 1.01969361, + "epoch": 0.48777994889523524, + "flos": 22384091082240.0, + "grad_norm": 1.9911602270122943, + "language_loss": 0.74972701, + "learning_rate": 2.1760034448497166e-06, + "loss": 0.77138835, + "num_input_tokens_seen": 174393070, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.13977051, + "step": 8113, + "time_per_iteration": 2.4598028659820557 + }, + { + "auxiliary_loss_clip": 0.01049665, + "auxiliary_loss_mlp": 0.01002966, + "balance_loss_clip": 1.02294278, + "balance_loss_mlp": 1.00148797, + "epoch": 0.4878400721479032, + "flos": 61241772159360.0, + "grad_norm": 0.790749610377744, + "language_loss": 0.48835799, + "learning_rate": 2.1756154908193943e-06, + "loss": 0.50888425, + "num_input_tokens_seen": 174446880, + "router_z_loss_clip": 0.26708984, + "router_z_loss_mlp": 0.01477051, + "step": 8114, + "time_per_iteration": 2.9556593894958496 + }, + { + "auxiliary_loss_clip": 0.0112974, + "auxiliary_loss_mlp": 0.01038849, + "balance_loss_clip": 1.05159187, + "balance_loss_mlp": 1.02486014, + "epoch": 0.48790019540057117, + "flos": 24536612482560.0, + "grad_norm": 1.505433308747589, + "language_loss": 0.76507783, + "learning_rate": 2.1752275301297155e-06, + "loss": 0.78676379, + "num_input_tokens_seen": 174468485, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.13995361, + "step": 8115, + "time_per_iteration": 2.5983331203460693 + }, + { + "auxiliary_loss_clip": 0.0112991, + "auxiliary_loss_mlp": 0.0103677, + "balance_loss_clip": 1.04961717, + "balance_loss_mlp": 1.02303112, + "epoch": 0.48796031865323913, + "flos": 21833939399040.0, + "grad_norm": 1.9876393645649348, + "language_loss": 0.72429132, + "learning_rate": 2.1748395627953915e-06, + "loss": 0.74595815, + "num_input_tokens_seen": 174486360, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.13745117, + "step": 8116, + "time_per_iteration": 2.488877773284912 + }, + { + "auxiliary_loss_clip": 0.01122637, + "auxiliary_loss_mlp": 0.01034755, + "balance_loss_clip": 1.04642403, + "balance_loss_mlp": 1.02236903, + "epoch": 0.4880204419059071, + "flos": 18588907883520.0, + "grad_norm": 1.5988858339017562, + "language_loss": 0.62982279, + "learning_rate": 2.1744515888311335e-06, + "loss": 0.65139669, + "num_input_tokens_seen": 174505075, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.1239624, + "step": 8117, + "time_per_iteration": 2.455610990524292 + }, + { + "auxiliary_loss_clip": 0.01125397, + "auxiliary_loss_mlp": 0.01034257, + "balance_loss_clip": 1.04866695, + "balance_loss_mlp": 1.02158475, + "epoch": 0.48808056515857506, + "flos": 19172168928000.0, + "grad_norm": 1.7683317250748543, + "language_loss": 0.79373604, + "learning_rate": 2.1740636082516533e-06, + "loss": 0.81533253, + "num_input_tokens_seen": 174523385, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.12683105, + "step": 8118, + "time_per_iteration": 2.5400917530059814 + }, + { + "auxiliary_loss_clip": 0.01141293, + "auxiliary_loss_mlp": 0.01035654, + "balance_loss_clip": 1.06300914, + "balance_loss_mlp": 1.02224302, + "epoch": 0.48814068841124303, + "flos": 20120497850880.0, + "grad_norm": 1.792083530690382, + "language_loss": 0.63142991, + "learning_rate": 2.1736756210716645e-06, + "loss": 0.65319937, + "num_input_tokens_seen": 174542200, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.13409424, + "step": 8119, + "time_per_iteration": 2.498626232147217 + }, + { + "auxiliary_loss_clip": 0.01124581, + "auxiliary_loss_mlp": 0.01039538, + "balance_loss_clip": 1.04738224, + "balance_loss_mlp": 1.0255307, + "epoch": 0.488200811663911, + "flos": 22965592360320.0, + "grad_norm": 1.7924088941538445, + "language_loss": 0.72658849, + "learning_rate": 2.173287627305878e-06, + "loss": 0.74822968, + "num_input_tokens_seen": 174563620, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.14001465, + "step": 8120, + "time_per_iteration": 2.499061346054077 + }, + { + "auxiliary_loss_clip": 0.01126134, + "auxiliary_loss_mlp": 0.01032105, + "balance_loss_clip": 1.04646349, + "balance_loss_mlp": 1.01831841, + "epoch": 0.48826093491657896, + "flos": 33910697387520.0, + "grad_norm": 3.27678919947572, + "language_loss": 0.63509226, + "learning_rate": 2.1728996269690075e-06, + "loss": 0.65667462, + "num_input_tokens_seen": 174586465, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.13781738, + "step": 8121, + "time_per_iteration": 4.031455993652344 + }, + { + "auxiliary_loss_clip": 0.01138228, + "auxiliary_loss_mlp": 0.01040089, + "balance_loss_clip": 1.05670905, + "balance_loss_mlp": 1.02631426, + "epoch": 0.488321058169247, + "flos": 23070307484160.0, + "grad_norm": 2.6709795608838895, + "language_loss": 0.82664704, + "learning_rate": 2.1725116200757664e-06, + "loss": 0.84843022, + "num_input_tokens_seen": 174604035, + "router_z_loss_clip": 0.81494141, + "router_z_loss_mlp": 0.13781738, + "step": 8122, + "time_per_iteration": 2.4580740928649902 + }, + { + "auxiliary_loss_clip": 0.01142722, + "auxiliary_loss_mlp": 0.01037521, + "balance_loss_clip": 1.06220555, + "balance_loss_mlp": 1.0239495, + "epoch": 0.48838118142191494, + "flos": 19317714837120.0, + "grad_norm": 1.7296971807435104, + "language_loss": 0.85330468, + "learning_rate": 2.172123606640866e-06, + "loss": 0.87510717, + "num_input_tokens_seen": 174621715, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.13586426, + "step": 8123, + "time_per_iteration": 2.517256498336792 + }, + { + "auxiliary_loss_clip": 0.01131439, + "auxiliary_loss_mlp": 0.01037882, + "balance_loss_clip": 1.05196238, + "balance_loss_mlp": 1.0251503, + "epoch": 0.4884413046745829, + "flos": 25410678036480.0, + "grad_norm": 1.6028530933290341, + "language_loss": 0.85503036, + "learning_rate": 2.1717355866790227e-06, + "loss": 0.87672353, + "num_input_tokens_seen": 174643835, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.12731934, + "step": 8124, + "time_per_iteration": 2.5678000450134277 + }, + { + "auxiliary_loss_clip": 0.01129873, + "auxiliary_loss_mlp": 0.01033936, + "balance_loss_clip": 1.04944563, + "balance_loss_mlp": 1.02076292, + "epoch": 0.4885014279272509, + "flos": 20991546662400.0, + "grad_norm": 2.175296465110717, + "language_loss": 0.7941674, + "learning_rate": 2.171347560204948e-06, + "loss": 0.81580544, + "num_input_tokens_seen": 174660955, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.13183594, + "step": 8125, + "time_per_iteration": 3.9072141647338867 + }, + { + "auxiliary_loss_clip": 0.01127758, + "auxiliary_loss_mlp": 0.01038775, + "balance_loss_clip": 1.0490067, + "balance_loss_mlp": 1.02599001, + "epoch": 0.48856155117991884, + "flos": 13771599269760.0, + "grad_norm": 1.9853189689436657, + "language_loss": 0.72896183, + "learning_rate": 2.170959527233356e-06, + "loss": 0.75062716, + "num_input_tokens_seen": 174678270, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.12786865, + "step": 8126, + "time_per_iteration": 2.449333667755127 + }, + { + "auxiliary_loss_clip": 0.01147762, + "auxiliary_loss_mlp": 0.01036572, + "balance_loss_clip": 1.06564689, + "balance_loss_mlp": 1.02333367, + "epoch": 0.4886216744325868, + "flos": 32087764206720.0, + "grad_norm": 1.6476757232709631, + "language_loss": 0.68605161, + "learning_rate": 2.1705714877789633e-06, + "loss": 0.70789492, + "num_input_tokens_seen": 174698360, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.13250732, + "step": 8127, + "time_per_iteration": 2.526394844055176 + }, + { + "auxiliary_loss_clip": 0.01129467, + "auxiliary_loss_mlp": 0.01036763, + "balance_loss_clip": 1.0494535, + "balance_loss_mlp": 1.02297616, + "epoch": 0.48868179768525477, + "flos": 19610063631360.0, + "grad_norm": 1.5630979473359201, + "language_loss": 0.76584816, + "learning_rate": 2.170183441856481e-06, + "loss": 0.78751051, + "num_input_tokens_seen": 174716755, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.13775635, + "step": 8128, + "time_per_iteration": 2.5108137130737305 + }, + { + "auxiliary_loss_clip": 0.01129521, + "auxiliary_loss_mlp": 0.01037364, + "balance_loss_clip": 1.04905808, + "balance_loss_mlp": 1.02460313, + "epoch": 0.48874192093792274, + "flos": 21286912199040.0, + "grad_norm": 1.6401733620221344, + "language_loss": 0.75860846, + "learning_rate": 2.1697953894806265e-06, + "loss": 0.78027737, + "num_input_tokens_seen": 174735560, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.12774658, + "step": 8129, + "time_per_iteration": 2.5122087001800537 + }, + { + "auxiliary_loss_clip": 0.01119263, + "auxiliary_loss_mlp": 0.0103592, + "balance_loss_clip": 1.04288721, + "balance_loss_mlp": 1.02249074, + "epoch": 0.4888020441905907, + "flos": 14173439696640.0, + "grad_norm": 2.1454923945913107, + "language_loss": 0.6425457, + "learning_rate": 2.169407330666114e-06, + "loss": 0.66409755, + "num_input_tokens_seen": 174752730, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.13421631, + "step": 8130, + "time_per_iteration": 2.497894763946533 + }, + { + "auxiliary_loss_clip": 0.01121778, + "auxiliary_loss_mlp": 0.01038322, + "balance_loss_clip": 1.04603541, + "balance_loss_mlp": 1.02581048, + "epoch": 0.48886216744325867, + "flos": 24097891766400.0, + "grad_norm": 1.9950954807755763, + "language_loss": 0.72470146, + "learning_rate": 2.169019265427658e-06, + "loss": 0.74630243, + "num_input_tokens_seen": 174772520, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.12506104, + "step": 8131, + "time_per_iteration": 2.4905896186828613 + }, + { + "auxiliary_loss_clip": 0.011327, + "auxiliary_loss_mlp": 0.01040388, + "balance_loss_clip": 1.05161858, + "balance_loss_mlp": 1.02563024, + "epoch": 0.48892229069592663, + "flos": 38431419402240.0, + "grad_norm": 1.442269237380284, + "language_loss": 0.69682693, + "learning_rate": 2.1686311937799745e-06, + "loss": 0.71855783, + "num_input_tokens_seen": 174796540, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.14770508, + "step": 8132, + "time_per_iteration": 2.6661171913146973 + }, + { + "auxiliary_loss_clip": 0.01123292, + "auxiliary_loss_mlp": 0.01041524, + "balance_loss_clip": 1.046368, + "balance_loss_mlp": 1.02595544, + "epoch": 0.4889824139485946, + "flos": 23843321101440.0, + "grad_norm": 1.846602699625865, + "language_loss": 0.70069957, + "learning_rate": 2.1682431157377797e-06, + "loss": 0.72234774, + "num_input_tokens_seen": 174817840, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.15563965, + "step": 8133, + "time_per_iteration": 2.480725049972534 + }, + { + "auxiliary_loss_clip": 0.01132361, + "auxiliary_loss_mlp": 0.01035152, + "balance_loss_clip": 1.05634522, + "balance_loss_mlp": 1.02240825, + "epoch": 0.48904253720126256, + "flos": 24425827960320.0, + "grad_norm": 1.6092776919930345, + "language_loss": 0.70980132, + "learning_rate": 2.1678550313157883e-06, + "loss": 0.73147643, + "num_input_tokens_seen": 174837885, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.1272583, + "step": 8134, + "time_per_iteration": 3.915937662124634 + }, + { + "auxiliary_loss_clip": 0.01132011, + "auxiliary_loss_mlp": 0.01048075, + "balance_loss_clip": 1.05039668, + "balance_loss_mlp": 1.03186262, + "epoch": 0.4891026604539306, + "flos": 24170682677760.0, + "grad_norm": 2.024489913831466, + "language_loss": 0.80638677, + "learning_rate": 2.167466940528718e-06, + "loss": 0.82818764, + "num_input_tokens_seen": 174855240, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.16210938, + "step": 8135, + "time_per_iteration": 2.4668357372283936 + }, + { + "auxiliary_loss_clip": 0.01129989, + "auxiliary_loss_mlp": 0.01036896, + "balance_loss_clip": 1.05320203, + "balance_loss_mlp": 1.02467692, + "epoch": 0.48916278370659855, + "flos": 21470954509440.0, + "grad_norm": 1.5796658211193444, + "language_loss": 0.74621493, + "learning_rate": 2.1670788433912843e-06, + "loss": 0.76788378, + "num_input_tokens_seen": 174875145, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.12231445, + "step": 8136, + "time_per_iteration": 2.555302858352661 + }, + { + "auxiliary_loss_clip": 0.01129126, + "auxiliary_loss_mlp": 0.01035975, + "balance_loss_clip": 1.05319166, + "balance_loss_mlp": 1.02357149, + "epoch": 0.4892229069592665, + "flos": 22309755886080.0, + "grad_norm": 1.470515147812886, + "language_loss": 0.73401439, + "learning_rate": 2.166690739918204e-06, + "loss": 0.75566548, + "num_input_tokens_seen": 174894770, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.12420654, + "step": 8137, + "time_per_iteration": 2.4625132083892822 + }, + { + "auxiliary_loss_clip": 0.01127362, + "auxiliary_loss_mlp": 0.01046456, + "balance_loss_clip": 1.04663825, + "balance_loss_mlp": 1.03007627, + "epoch": 0.4892830302119345, + "flos": 12786856934400.0, + "grad_norm": 1.970968109386261, + "language_loss": 0.75014889, + "learning_rate": 2.1663026301241944e-06, + "loss": 0.77188706, + "num_input_tokens_seen": 174912780, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.16387939, + "step": 8138, + "time_per_iteration": 2.47702693939209 + }, + { + "auxiliary_loss_clip": 0.01129922, + "auxiliary_loss_mlp": 0.01036654, + "balance_loss_clip": 1.05100465, + "balance_loss_mlp": 1.0235765, + "epoch": 0.48934315346460244, + "flos": 20813896972800.0, + "grad_norm": 1.5290656525652935, + "language_loss": 0.7475844, + "learning_rate": 2.165914514023972e-06, + "loss": 0.76925015, + "num_input_tokens_seen": 174931250, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.13098145, + "step": 8139, + "time_per_iteration": 2.4490156173706055 + }, + { + "auxiliary_loss_clip": 0.01125041, + "auxiliary_loss_mlp": 0.01034685, + "balance_loss_clip": 1.04653478, + "balance_loss_mlp": 1.02193522, + "epoch": 0.4894032767172704, + "flos": 19755537713280.0, + "grad_norm": 1.6617482743506653, + "language_loss": 0.62021756, + "learning_rate": 2.165526391632255e-06, + "loss": 0.64181483, + "num_input_tokens_seen": 174951105, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.12756348, + "step": 8140, + "time_per_iteration": 2.509550094604492 + }, + { + "auxiliary_loss_clip": 0.01133813, + "auxiliary_loss_mlp": 0.01041237, + "balance_loss_clip": 1.0531615, + "balance_loss_mlp": 1.02692032, + "epoch": 0.4894633999699384, + "flos": 17818982835840.0, + "grad_norm": 2.3102780941449854, + "language_loss": 0.82229674, + "learning_rate": 2.1651382629637608e-06, + "loss": 0.84404725, + "num_input_tokens_seen": 174969120, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.14312744, + "step": 8141, + "time_per_iteration": 2.450064182281494 + }, + { + "auxiliary_loss_clip": 0.01132323, + "auxiliary_loss_mlp": 0.010327, + "balance_loss_clip": 1.0524658, + "balance_loss_mlp": 1.01865697, + "epoch": 0.48952352322260634, + "flos": 25523222325120.0, + "grad_norm": 1.6541759168917651, + "language_loss": 0.72482097, + "learning_rate": 2.1647501280332066e-06, + "loss": 0.74647117, + "num_input_tokens_seen": 174991295, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.14050293, + "step": 8142, + "time_per_iteration": 2.571972370147705 + }, + { + "auxiliary_loss_clip": 0.01128253, + "auxiliary_loss_mlp": 0.01034583, + "balance_loss_clip": 1.05184078, + "balance_loss_mlp": 1.0220716, + "epoch": 0.4895836464752743, + "flos": 29055502903680.0, + "grad_norm": 1.9353642067463548, + "language_loss": 0.67026353, + "learning_rate": 2.1643619868553105e-06, + "loss": 0.69189191, + "num_input_tokens_seen": 175012830, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.12506104, + "step": 8143, + "time_per_iteration": 2.548729658126831 + }, + { + "auxiliary_loss_clip": 0.01130079, + "auxiliary_loss_mlp": 0.01027528, + "balance_loss_clip": 1.05308807, + "balance_loss_mlp": 1.01492751, + "epoch": 0.48964376972794227, + "flos": 33546958312320.0, + "grad_norm": 1.4640199733339254, + "language_loss": 0.75145489, + "learning_rate": 2.163973839444793e-06, + "loss": 0.77303094, + "num_input_tokens_seen": 175035695, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.12609863, + "step": 8144, + "time_per_iteration": 2.562932252883911 + }, + { + "auxiliary_loss_clip": 0.01127377, + "auxiliary_loss_mlp": 0.01029759, + "balance_loss_clip": 1.04914296, + "balance_loss_mlp": 1.01647282, + "epoch": 0.48970389298061023, + "flos": 22054035985920.0, + "grad_norm": 1.7871569476108213, + "language_loss": 0.7570374, + "learning_rate": 2.1635856858163695e-06, + "loss": 0.77860868, + "num_input_tokens_seen": 175056425, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.1328125, + "step": 8145, + "time_per_iteration": 2.4305756092071533 + }, + { + "auxiliary_loss_clip": 0.01131264, + "auxiliary_loss_mlp": 0.01035698, + "balance_loss_clip": 1.05174196, + "balance_loss_mlp": 1.02211976, + "epoch": 0.4897640162332782, + "flos": 20084299920000.0, + "grad_norm": 1.780220758063929, + "language_loss": 0.80103707, + "learning_rate": 2.163197525984761e-06, + "loss": 0.82270664, + "num_input_tokens_seen": 175074800, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.13574219, + "step": 8146, + "time_per_iteration": 2.4870383739471436 + }, + { + "auxiliary_loss_clip": 0.01119994, + "auxiliary_loss_mlp": 0.01029933, + "balance_loss_clip": 1.04539597, + "balance_loss_mlp": 1.01737452, + "epoch": 0.48982413948594616, + "flos": 23806225330560.0, + "grad_norm": 1.772330534828802, + "language_loss": 0.74092221, + "learning_rate": 2.162809359964687e-06, + "loss": 0.76242149, + "num_input_tokens_seen": 175094500, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.12554932, + "step": 8147, + "time_per_iteration": 2.452967882156372 + }, + { + "auxiliary_loss_clip": 0.01130005, + "auxiliary_loss_mlp": 0.0103611, + "balance_loss_clip": 1.05040741, + "balance_loss_mlp": 1.02269852, + "epoch": 0.4898842627386142, + "flos": 17639645207040.0, + "grad_norm": 2.190424693985677, + "language_loss": 0.82830894, + "learning_rate": 2.162421187770864e-06, + "loss": 0.8499701, + "num_input_tokens_seen": 175112920, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.1340332, + "step": 8148, + "time_per_iteration": 2.4398133754730225 + }, + { + "auxiliary_loss_clip": 0.01123956, + "auxiliary_loss_mlp": 0.01028713, + "balance_loss_clip": 1.05123186, + "balance_loss_mlp": 1.01760316, + "epoch": 0.48994438599128215, + "flos": 16617914841600.0, + "grad_norm": 8.597508688249688, + "language_loss": 0.73886764, + "learning_rate": 2.162033009418015e-06, + "loss": 0.76039428, + "num_input_tokens_seen": 175129910, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11114502, + "step": 8149, + "time_per_iteration": 2.4636378288269043 + }, + { + "auxiliary_loss_clip": 0.01135385, + "auxiliary_loss_mlp": 0.01033185, + "balance_loss_clip": 1.05345666, + "balance_loss_mlp": 1.01894593, + "epoch": 0.4900045092439501, + "flos": 26614834600320.0, + "grad_norm": 1.680200355577321, + "language_loss": 0.75964344, + "learning_rate": 2.1616448249208567e-06, + "loss": 0.78132915, + "num_input_tokens_seen": 175148705, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.14245605, + "step": 8150, + "time_per_iteration": 3.957857370376587 + }, + { + "auxiliary_loss_clip": 0.01130314, + "auxiliary_loss_mlp": 0.01037051, + "balance_loss_clip": 1.05224061, + "balance_loss_mlp": 1.02349055, + "epoch": 0.4900646324966181, + "flos": 19902125116800.0, + "grad_norm": 1.8038115242998436, + "language_loss": 0.72668117, + "learning_rate": 2.1612566342941106e-06, + "loss": 0.74835479, + "num_input_tokens_seen": 175167425, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.13555908, + "step": 8151, + "time_per_iteration": 2.452016592025757 + }, + { + "auxiliary_loss_clip": 0.01052096, + "auxiliary_loss_mlp": 0.01004615, + "balance_loss_clip": 1.02541912, + "balance_loss_mlp": 1.00295472, + "epoch": 0.49012475574928605, + "flos": 59189620337280.0, + "grad_norm": 0.8243696174724455, + "language_loss": 0.54293185, + "learning_rate": 2.1608684375524977e-06, + "loss": 0.56349897, + "num_input_tokens_seen": 175227985, + "router_z_loss_clip": 0.26708984, + "router_z_loss_mlp": 0.01660156, + "step": 8152, + "time_per_iteration": 3.063856840133667 + }, + { + "auxiliary_loss_clip": 0.01127353, + "auxiliary_loss_mlp": 0.01033428, + "balance_loss_clip": 1.04707742, + "balance_loss_mlp": 1.02017164, + "epoch": 0.490184879001954, + "flos": 45259797657600.0, + "grad_norm": 1.8221384209922398, + "language_loss": 0.60804212, + "learning_rate": 2.1604802347107364e-06, + "loss": 0.62964988, + "num_input_tokens_seen": 175251895, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.13269043, + "step": 8153, + "time_per_iteration": 2.713130235671997 + }, + { + "auxiliary_loss_clip": 0.0111889, + "auxiliary_loss_mlp": 0.01037734, + "balance_loss_clip": 1.04191911, + "balance_loss_mlp": 1.02454925, + "epoch": 0.490245002254622, + "flos": 28002135634560.0, + "grad_norm": 1.616106910442504, + "language_loss": 0.76816219, + "learning_rate": 2.160092025783549e-06, + "loss": 0.78972846, + "num_input_tokens_seen": 175272770, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.13183594, + "step": 8154, + "time_per_iteration": 2.489051342010498 + }, + { + "auxiliary_loss_clip": 0.01057328, + "auxiliary_loss_mlp": 0.01005651, + "balance_loss_clip": 1.03010774, + "balance_loss_mlp": 1.00427318, + "epoch": 0.49030512550728994, + "flos": 58951318533120.0, + "grad_norm": 1.0107382758270504, + "language_loss": 0.6703341, + "learning_rate": 2.1597038107856564e-06, + "loss": 0.69096386, + "num_input_tokens_seen": 175336320, + "router_z_loss_clip": 0.27246094, + "router_z_loss_mlp": 0.01377869, + "step": 8155, + "time_per_iteration": 3.1419107913970947 + }, + { + "auxiliary_loss_clip": 0.01135719, + "auxiliary_loss_mlp": 0.0103165, + "balance_loss_clip": 1.05829799, + "balance_loss_mlp": 1.01962757, + "epoch": 0.4903652487599579, + "flos": 19791843384960.0, + "grad_norm": 1.8391611973330522, + "language_loss": 0.76571673, + "learning_rate": 2.1593155897317784e-06, + "loss": 0.78739041, + "num_input_tokens_seen": 175353540, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.12005615, + "step": 8156, + "time_per_iteration": 2.46895170211792 + }, + { + "auxiliary_loss_clip": 0.01122429, + "auxiliary_loss_mlp": 0.01028894, + "balance_loss_clip": 1.0457325, + "balance_loss_mlp": 1.01656151, + "epoch": 0.49042537201262587, + "flos": 21762082241280.0, + "grad_norm": 2.059036465732909, + "language_loss": 0.83482927, + "learning_rate": 2.1589273626366377e-06, + "loss": 0.85634255, + "num_input_tokens_seen": 175370445, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.12322998, + "step": 8157, + "time_per_iteration": 2.486255407333374 + }, + { + "auxiliary_loss_clip": 0.0113487, + "auxiliary_loss_mlp": 0.01032183, + "balance_loss_clip": 1.05702806, + "balance_loss_mlp": 1.01926684, + "epoch": 0.49048549526529384, + "flos": 18953042008320.0, + "grad_norm": 1.8043851916898799, + "language_loss": 0.79569948, + "learning_rate": 2.158539129514956e-06, + "loss": 0.81737006, + "num_input_tokens_seen": 175389020, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.1293335, + "step": 8158, + "time_per_iteration": 2.4657227993011475 + }, + { + "auxiliary_loss_clip": 0.01134707, + "auxiliary_loss_mlp": 0.0103188, + "balance_loss_clip": 1.05701208, + "balance_loss_mlp": 1.01916671, + "epoch": 0.4905456185179618, + "flos": 26906393295360.0, + "grad_norm": 1.8612546486090553, + "language_loss": 0.6929785, + "learning_rate": 2.158150890381454e-06, + "loss": 0.71464443, + "num_input_tokens_seen": 175409545, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.12713623, + "step": 8159, + "time_per_iteration": 2.5046958923339844 + }, + { + "auxiliary_loss_clip": 0.01130919, + "auxiliary_loss_mlp": 0.01034797, + "balance_loss_clip": 1.05180144, + "balance_loss_mlp": 1.02204776, + "epoch": 0.49060574177062977, + "flos": 20412343854720.0, + "grad_norm": 2.216295542438994, + "language_loss": 0.73194945, + "learning_rate": 2.157762645250854e-06, + "loss": 0.75360668, + "num_input_tokens_seen": 175429335, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.12744141, + "step": 8160, + "time_per_iteration": 2.4504377841949463 + }, + { + "auxiliary_loss_clip": 0.01137628, + "auxiliary_loss_mlp": 0.0103723, + "balance_loss_clip": 1.05569792, + "balance_loss_mlp": 1.02340782, + "epoch": 0.4906658650232978, + "flos": 17493704248320.0, + "grad_norm": 2.1984007719801806, + "language_loss": 0.71636426, + "learning_rate": 2.1573743941378796e-06, + "loss": 0.73811281, + "num_input_tokens_seen": 175446955, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.13806152, + "step": 8161, + "time_per_iteration": 2.5486838817596436 + }, + { + "auxiliary_loss_clip": 0.01129595, + "auxiliary_loss_mlp": 0.01036383, + "balance_loss_clip": 1.05290556, + "balance_loss_mlp": 1.02391338, + "epoch": 0.49072598827596575, + "flos": 26614439550720.0, + "grad_norm": 1.7598590187945937, + "language_loss": 0.68612802, + "learning_rate": 2.1569861370572517e-06, + "loss": 0.70778775, + "num_input_tokens_seen": 175468195, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.12469482, + "step": 8162, + "time_per_iteration": 2.5215375423431396 + }, + { + "auxiliary_loss_clip": 0.01130349, + "auxiliary_loss_mlp": 0.01034363, + "balance_loss_clip": 1.0483458, + "balance_loss_mlp": 1.02005148, + "epoch": 0.4907861115286337, + "flos": 20412595249920.0, + "grad_norm": 1.7862993699796204, + "language_loss": 0.63652223, + "learning_rate": 2.1565978740236944e-06, + "loss": 0.65816939, + "num_input_tokens_seen": 175487455, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.14331055, + "step": 8163, + "time_per_iteration": 2.485272169113159 + }, + { + "auxiliary_loss_clip": 0.01126715, + "auxiliary_loss_mlp": 0.01036911, + "balance_loss_clip": 1.0497117, + "balance_loss_mlp": 1.02317762, + "epoch": 0.4908462347813017, + "flos": 14064271286400.0, + "grad_norm": 2.868478734188607, + "language_loss": 0.77084708, + "learning_rate": 2.1562096050519293e-06, + "loss": 0.79248333, + "num_input_tokens_seen": 175504450, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.13726807, + "step": 8164, + "time_per_iteration": 2.5365238189697266 + }, + { + "auxiliary_loss_clip": 0.01129354, + "auxiliary_loss_mlp": 0.01026163, + "balance_loss_clip": 1.05089819, + "balance_loss_mlp": 1.01253772, + "epoch": 0.49090635803396965, + "flos": 18735100237440.0, + "grad_norm": 1.7262418913982227, + "language_loss": 0.76982492, + "learning_rate": 2.1558213301566806e-06, + "loss": 0.79138011, + "num_input_tokens_seen": 175523600, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.13616943, + "step": 8165, + "time_per_iteration": 3.9391937255859375 + }, + { + "auxiliary_loss_clip": 0.01133273, + "auxiliary_loss_mlp": 0.01030622, + "balance_loss_clip": 1.05523825, + "balance_loss_mlp": 1.01746166, + "epoch": 0.4909664812866376, + "flos": 20558500295040.0, + "grad_norm": 1.7572661765419315, + "language_loss": 0.77909958, + "learning_rate": 2.1554330493526716e-06, + "loss": 0.80073857, + "num_input_tokens_seen": 175542720, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.13146973, + "step": 8166, + "time_per_iteration": 2.4808309078216553 + }, + { + "auxiliary_loss_clip": 0.01064933, + "auxiliary_loss_mlp": 0.01012315, + "balance_loss_clip": 1.03838253, + "balance_loss_mlp": 1.010746, + "epoch": 0.4910266045393056, + "flos": 54684017948160.0, + "grad_norm": 0.7992579845927509, + "language_loss": 0.54187328, + "learning_rate": 2.1550447626546253e-06, + "loss": 0.56264573, + "num_input_tokens_seen": 175598640, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.01570129, + "step": 8167, + "time_per_iteration": 3.0568108558654785 + }, + { + "auxiliary_loss_clip": 0.01126, + "auxiliary_loss_mlp": 0.01030669, + "balance_loss_clip": 1.04944396, + "balance_loss_mlp": 1.01768744, + "epoch": 0.49108672779197354, + "flos": 16246454342400.0, + "grad_norm": 2.7462408158006024, + "language_loss": 0.85755396, + "learning_rate": 2.1546564700772665e-06, + "loss": 0.87912071, + "num_input_tokens_seen": 175615675, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12969971, + "step": 8168, + "time_per_iteration": 2.5141353607177734 + }, + { + "auxiliary_loss_clip": 0.01125789, + "auxiliary_loss_mlp": 0.01032747, + "balance_loss_clip": 1.05203199, + "balance_loss_mlp": 1.02004492, + "epoch": 0.4911468510446415, + "flos": 19825419623040.0, + "grad_norm": 2.002369385996043, + "language_loss": 0.7339859, + "learning_rate": 2.1542681716353193e-06, + "loss": 0.75557125, + "num_input_tokens_seen": 175632255, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.12713623, + "step": 8169, + "time_per_iteration": 2.4926488399505615 + }, + { + "auxiliary_loss_clip": 0.01128962, + "auxiliary_loss_mlp": 0.01028054, + "balance_loss_clip": 1.04981232, + "balance_loss_mlp": 1.01594841, + "epoch": 0.4912069742973095, + "flos": 21212684743680.0, + "grad_norm": 1.4912456432550096, + "language_loss": 0.77982199, + "learning_rate": 2.1538798673435068e-06, + "loss": 0.8013922, + "num_input_tokens_seen": 175651625, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.12097168, + "step": 8170, + "time_per_iteration": 3.8861565589904785 + }, + { + "auxiliary_loss_clip": 0.01130905, + "auxiliary_loss_mlp": 0.01033992, + "balance_loss_clip": 1.05108547, + "balance_loss_mlp": 1.02152252, + "epoch": 0.49126709754997744, + "flos": 19537129065600.0, + "grad_norm": 2.522470116650462, + "language_loss": 0.76149899, + "learning_rate": 2.1534915572165545e-06, + "loss": 0.78314793, + "num_input_tokens_seen": 175669265, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.12475586, + "step": 8171, + "time_per_iteration": 2.4676172733306885 + }, + { + "auxiliary_loss_clip": 0.01130846, + "auxiliary_loss_mlp": 0.01036142, + "balance_loss_clip": 1.0494616, + "balance_loss_mlp": 1.02348161, + "epoch": 0.4913272208026454, + "flos": 12239686080000.0, + "grad_norm": 3.26992523302029, + "language_loss": 0.81286633, + "learning_rate": 2.1531032412691875e-06, + "loss": 0.83453619, + "num_input_tokens_seen": 175686065, + "router_z_loss_clip": 0.81347656, + "router_z_loss_mlp": 0.12658691, + "step": 8172, + "time_per_iteration": 2.4204020500183105 + }, + { + "auxiliary_loss_clip": 0.0105886, + "auxiliary_loss_mlp": 0.0100688, + "balance_loss_clip": 1.03106892, + "balance_loss_mlp": 1.00536561, + "epoch": 0.49138734405531337, + "flos": 65465871661440.0, + "grad_norm": 0.6868124100179758, + "language_loss": 0.53302079, + "learning_rate": 2.1527149195161295e-06, + "loss": 0.55367815, + "num_input_tokens_seen": 175748595, + "router_z_loss_clip": 0.27783203, + "router_z_loss_mlp": 0.01516724, + "step": 8173, + "time_per_iteration": 3.146533489227295 + }, + { + "auxiliary_loss_clip": 0.0113587, + "auxiliary_loss_mlp": 0.01035218, + "balance_loss_clip": 1.05517113, + "balance_loss_mlp": 1.02141333, + "epoch": 0.4914474673079814, + "flos": 18439052342400.0, + "grad_norm": 1.86200900844008, + "language_loss": 0.62960219, + "learning_rate": 2.152326591972107e-06, + "loss": 0.65131307, + "num_input_tokens_seen": 175766770, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.13800049, + "step": 8174, + "time_per_iteration": 2.4575576782226562 + }, + { + "auxiliary_loss_clip": 0.01133804, + "auxiliary_loss_mlp": 0.01034484, + "balance_loss_clip": 1.05442333, + "balance_loss_mlp": 1.02103138, + "epoch": 0.49150759056064935, + "flos": 21685053525120.0, + "grad_norm": 5.259525628088965, + "language_loss": 0.69417512, + "learning_rate": 2.1519382586518445e-06, + "loss": 0.71585798, + "num_input_tokens_seen": 175783605, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.13439941, + "step": 8175, + "time_per_iteration": 2.547852039337158 + }, + { + "auxiliary_loss_clip": 0.01128703, + "auxiliary_loss_mlp": 0.01028806, + "balance_loss_clip": 1.05221713, + "balance_loss_mlp": 1.01635468, + "epoch": 0.4915677138133173, + "flos": 22382439056640.0, + "grad_norm": 1.718472573502271, + "language_loss": 0.739784, + "learning_rate": 2.151549919570068e-06, + "loss": 0.7613591, + "num_input_tokens_seen": 175801390, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.12469482, + "step": 8176, + "time_per_iteration": 2.4373278617858887 + }, + { + "auxiliary_loss_clip": 0.01128315, + "auxiliary_loss_mlp": 0.0103714, + "balance_loss_clip": 1.05020118, + "balance_loss_mlp": 1.02360964, + "epoch": 0.4916278370659853, + "flos": 18402890325120.0, + "grad_norm": 1.5778735841224132, + "language_loss": 0.70442343, + "learning_rate": 2.1511615747415036e-06, + "loss": 0.72607791, + "num_input_tokens_seen": 175819830, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.13525391, + "step": 8177, + "time_per_iteration": 2.511352062225342 + }, + { + "auxiliary_loss_clip": 0.01066138, + "auxiliary_loss_mlp": 0.01004278, + "balance_loss_clip": 1.03869271, + "balance_loss_mlp": 1.00235891, + "epoch": 0.49168796031865325, + "flos": 66609124715520.0, + "grad_norm": 0.6905679559480102, + "language_loss": 0.46202439, + "learning_rate": 2.150773224180877e-06, + "loss": 0.48272857, + "num_input_tokens_seen": 175881765, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.01919556, + "step": 8178, + "time_per_iteration": 4.474030017852783 + }, + { + "auxiliary_loss_clip": 0.01135238, + "auxiliary_loss_mlp": 0.01035231, + "balance_loss_clip": 1.0546217, + "balance_loss_mlp": 1.02126551, + "epoch": 0.4917480835713212, + "flos": 20959335141120.0, + "grad_norm": 2.774389426561919, + "language_loss": 0.65619612, + "learning_rate": 2.1503848679029147e-06, + "loss": 0.67790079, + "num_input_tokens_seen": 175901795, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.13977051, + "step": 8179, + "time_per_iteration": 2.473780632019043 + }, + { + "auxiliary_loss_clip": 0.01126698, + "auxiliary_loss_mlp": 0.01034061, + "balance_loss_clip": 1.04547524, + "balance_loss_mlp": 1.02032793, + "epoch": 0.4918082068239892, + "flos": 15772900412160.0, + "grad_norm": 1.8477890507146528, + "language_loss": 0.70213407, + "learning_rate": 2.149996505922343e-06, + "loss": 0.72374171, + "num_input_tokens_seen": 175917770, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.13745117, + "step": 8180, + "time_per_iteration": 2.5182816982269287 + }, + { + "auxiliary_loss_clip": 0.01130715, + "auxiliary_loss_mlp": 0.01033985, + "balance_loss_clip": 1.05400372, + "balance_loss_mlp": 1.01997805, + "epoch": 0.49186833007665715, + "flos": 24604806453120.0, + "grad_norm": 1.8818718188637713, + "language_loss": 0.84454131, + "learning_rate": 2.1496081382538895e-06, + "loss": 0.86618835, + "num_input_tokens_seen": 175937000, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.14001465, + "step": 8181, + "time_per_iteration": 2.487870693206787 + }, + { + "auxiliary_loss_clip": 0.01123048, + "auxiliary_loss_mlp": 0.0103142, + "balance_loss_clip": 1.04932833, + "balance_loss_mlp": 1.0189631, + "epoch": 0.4919284533293251, + "flos": 22090557139200.0, + "grad_norm": 1.884117795779161, + "language_loss": 0.72819626, + "learning_rate": 2.1492197649122793e-06, + "loss": 0.7497409, + "num_input_tokens_seen": 175955170, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.12445068, + "step": 8182, + "time_per_iteration": 2.499331474304199 + }, + { + "auxiliary_loss_clip": 0.0112732, + "auxiliary_loss_mlp": 0.01035257, + "balance_loss_clip": 1.04998994, + "balance_loss_mlp": 1.02198291, + "epoch": 0.4919885765819931, + "flos": 23368043318400.0, + "grad_norm": 1.9191001006198312, + "language_loss": 0.72719264, + "learning_rate": 2.1488313859122412e-06, + "loss": 0.7488184, + "num_input_tokens_seen": 175973725, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.13275146, + "step": 8183, + "time_per_iteration": 2.4839024543762207 + }, + { + "auxiliary_loss_clip": 0.01138625, + "auxiliary_loss_mlp": 0.01031625, + "balance_loss_clip": 1.05612755, + "balance_loss_mlp": 1.01685452, + "epoch": 0.49204869983466104, + "flos": 21360493209600.0, + "grad_norm": 3.8112554342276392, + "language_loss": 0.77097142, + "learning_rate": 2.1484430012685015e-06, + "loss": 0.79267395, + "num_input_tokens_seen": 175993885, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.14758301, + "step": 8184, + "time_per_iteration": 2.531459331512451 + }, + { + "auxiliary_loss_clip": 0.01123799, + "auxiliary_loss_mlp": 0.01034456, + "balance_loss_clip": 1.04759622, + "balance_loss_mlp": 1.02177227, + "epoch": 0.492108823087329, + "flos": 21142695093120.0, + "grad_norm": 1.6243255715284555, + "language_loss": 0.70543385, + "learning_rate": 2.148054610995789e-06, + "loss": 0.72701645, + "num_input_tokens_seen": 176014210, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.12677002, + "step": 8185, + "time_per_iteration": 2.497318744659424 + }, + { + "auxiliary_loss_clip": 0.01129087, + "auxiliary_loss_mlp": 0.01034956, + "balance_loss_clip": 1.0485357, + "balance_loss_mlp": 1.02050233, + "epoch": 0.49216894633999697, + "flos": 25116605389440.0, + "grad_norm": 1.9594374527041454, + "language_loss": 0.75125563, + "learning_rate": 2.147666215108831e-06, + "loss": 0.77289611, + "num_input_tokens_seen": 176033890, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.14447021, + "step": 8186, + "time_per_iteration": 2.598970413208008 + }, + { + "auxiliary_loss_clip": 0.01134687, + "auxiliary_loss_mlp": 0.01033886, + "balance_loss_clip": 1.05249012, + "balance_loss_mlp": 1.02025449, + "epoch": 0.49222906959266494, + "flos": 22637943475200.0, + "grad_norm": 2.1881568149697923, + "language_loss": 0.67848206, + "learning_rate": 2.1472778136223545e-06, + "loss": 0.70016783, + "num_input_tokens_seen": 176052720, + "router_z_loss_clip": 0.82275391, + "router_z_loss_mlp": 0.13635254, + "step": 8187, + "time_per_iteration": 2.459994316101074 + }, + { + "auxiliary_loss_clip": 0.01136342, + "auxiliary_loss_mlp": 0.01032975, + "balance_loss_clip": 1.05755663, + "balance_loss_mlp": 1.01955819, + "epoch": 0.49228919284533296, + "flos": 20410548174720.0, + "grad_norm": 1.4572153996710089, + "language_loss": 0.66675991, + "learning_rate": 2.1468894065510894e-06, + "loss": 0.68845308, + "num_input_tokens_seen": 176072545, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.13433838, + "step": 8188, + "time_per_iteration": 2.4984445571899414 + }, + { + "auxiliary_loss_clip": 0.01128798, + "auxiliary_loss_mlp": 0.01033502, + "balance_loss_clip": 1.04857111, + "balance_loss_mlp": 1.02088356, + "epoch": 0.4923493160980009, + "flos": 27122359818240.0, + "grad_norm": 1.6437805575158668, + "language_loss": 0.74536031, + "learning_rate": 2.1465009939097623e-06, + "loss": 0.76698333, + "num_input_tokens_seen": 176091490, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.1260376, + "step": 8189, + "time_per_iteration": 2.4965577125549316 + }, + { + "auxiliary_loss_clip": 0.01126882, + "auxiliary_loss_mlp": 0.01031588, + "balance_loss_clip": 1.05041265, + "balance_loss_mlp": 1.01825476, + "epoch": 0.4924094393506689, + "flos": 35736683224320.0, + "grad_norm": 1.828717380984766, + "language_loss": 0.64131558, + "learning_rate": 2.146112575713104e-06, + "loss": 0.66290021, + "num_input_tokens_seen": 176113200, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.13330078, + "step": 8190, + "time_per_iteration": 2.6231212615966797 + }, + { + "auxiliary_loss_clip": 0.01122926, + "auxiliary_loss_mlp": 0.01029416, + "balance_loss_clip": 1.04680824, + "balance_loss_mlp": 1.01636815, + "epoch": 0.49246956260333685, + "flos": 20412487509120.0, + "grad_norm": 3.631729065323594, + "language_loss": 0.71638083, + "learning_rate": 2.1457241519758413e-06, + "loss": 0.73790431, + "num_input_tokens_seen": 176132485, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.13049316, + "step": 8191, + "time_per_iteration": 2.435882329940796 + }, + { + "auxiliary_loss_clip": 0.01133832, + "auxiliary_loss_mlp": 0.01032072, + "balance_loss_clip": 1.05441761, + "balance_loss_mlp": 1.01865482, + "epoch": 0.4925296858560048, + "flos": 38976938231040.0, + "grad_norm": 1.5425424437621111, + "language_loss": 0.71930206, + "learning_rate": 2.1453357227127043e-06, + "loss": 0.74096107, + "num_input_tokens_seen": 176155755, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.13415527, + "step": 8192, + "time_per_iteration": 2.6584742069244385 + }, + { + "auxiliary_loss_clip": 0.01042647, + "auxiliary_loss_mlp": 0.01001031, + "balance_loss_clip": 1.01623416, + "balance_loss_mlp": 0.99958277, + "epoch": 0.4925898091086728, + "flos": 64278917712000.0, + "grad_norm": 0.7164878651105862, + "language_loss": 0.52144372, + "learning_rate": 2.1449472879384224e-06, + "loss": 0.54188049, + "num_input_tokens_seen": 176216295, + "router_z_loss_clip": 0.26416016, + "router_z_loss_mlp": 0.01448059, + "step": 8193, + "time_per_iteration": 3.1066391468048096 + }, + { + "auxiliary_loss_clip": 0.01137051, + "auxiliary_loss_mlp": 0.0104364, + "balance_loss_clip": 1.05718124, + "balance_loss_mlp": 1.02974057, + "epoch": 0.49264993236134075, + "flos": 23036372110080.0, + "grad_norm": 1.4503382229362067, + "language_loss": 0.77239484, + "learning_rate": 2.1445588476677246e-06, + "loss": 0.79420173, + "num_input_tokens_seen": 176235925, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.13903809, + "step": 8194, + "time_per_iteration": 3.9076428413391113 + }, + { + "auxiliary_loss_clip": 0.01124671, + "auxiliary_loss_mlp": 0.01030743, + "balance_loss_clip": 1.0474776, + "balance_loss_mlp": 1.01764202, + "epoch": 0.4927100556140087, + "flos": 24718212668160.0, + "grad_norm": 2.1143033151881885, + "language_loss": 0.70347911, + "learning_rate": 2.144170401915341e-06, + "loss": 0.72503328, + "num_input_tokens_seen": 176253865, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.13116455, + "step": 8195, + "time_per_iteration": 2.496776580810547 + }, + { + "auxiliary_loss_clip": 0.0113199, + "auxiliary_loss_mlp": 0.01029167, + "balance_loss_clip": 1.05373359, + "balance_loss_mlp": 1.01703167, + "epoch": 0.4927701788666767, + "flos": 23505544581120.0, + "grad_norm": 2.1663018478284912, + "language_loss": 0.80931175, + "learning_rate": 2.143781950696001e-06, + "loss": 0.83092332, + "num_input_tokens_seen": 176271525, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.12133789, + "step": 8196, + "time_per_iteration": 2.4844372272491455 + }, + { + "auxiliary_loss_clip": 0.01135713, + "auxiliary_loss_mlp": 0.01037324, + "balance_loss_clip": 1.05309272, + "balance_loss_mlp": 1.02331686, + "epoch": 0.49283030211934464, + "flos": 22928891639040.0, + "grad_norm": 1.7995454086652363, + "language_loss": 0.70667017, + "learning_rate": 2.1433934940244356e-06, + "loss": 0.72840053, + "num_input_tokens_seen": 176290810, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.14019775, + "step": 8197, + "time_per_iteration": 2.5612480640411377 + }, + { + "auxiliary_loss_clip": 0.01131992, + "auxiliary_loss_mlp": 0.01037053, + "balance_loss_clip": 1.05276906, + "balance_loss_mlp": 1.02445292, + "epoch": 0.4928904253720126, + "flos": 16873024210560.0, + "grad_norm": 2.028272705386647, + "language_loss": 0.84440219, + "learning_rate": 2.143005031915374e-06, + "loss": 0.86609262, + "num_input_tokens_seen": 176309165, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.12609863, + "step": 8198, + "time_per_iteration": 2.4095804691314697 + }, + { + "auxiliary_loss_clip": 0.01136571, + "auxiliary_loss_mlp": 0.01042034, + "balance_loss_clip": 1.0525521, + "balance_loss_mlp": 1.0279789, + "epoch": 0.4929505486246806, + "flos": 14866551509760.0, + "grad_norm": 1.7768081431355813, + "language_loss": 0.76093054, + "learning_rate": 2.1426165643835467e-06, + "loss": 0.78271651, + "num_input_tokens_seen": 176324960, + "router_z_loss_clip": 0.84033203, + "router_z_loss_mlp": 0.14056396, + "step": 8199, + "time_per_iteration": 2.4623525142669678 + }, + { + "auxiliary_loss_clip": 0.011337, + "auxiliary_loss_mlp": 0.01036918, + "balance_loss_clip": 1.05164921, + "balance_loss_mlp": 1.02218962, + "epoch": 0.49301067187734854, + "flos": 23842351434240.0, + "grad_norm": 2.1076449729736413, + "language_loss": 0.60210842, + "learning_rate": 2.1422280914436864e-06, + "loss": 0.62381458, + "num_input_tokens_seen": 176346195, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.14727783, + "step": 8200, + "time_per_iteration": 2.495596170425415 + }, + { + "auxiliary_loss_clip": 0.01125877, + "auxiliary_loss_mlp": 0.01036662, + "balance_loss_clip": 1.05188775, + "balance_loss_mlp": 1.02395451, + "epoch": 0.49307079513001656, + "flos": 22491284244480.0, + "grad_norm": 1.4239387721089685, + "language_loss": 0.79316324, + "learning_rate": 2.1418396131105213e-06, + "loss": 0.81478858, + "num_input_tokens_seen": 176366735, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.12701416, + "step": 8201, + "time_per_iteration": 2.50663685798645 + }, + { + "auxiliary_loss_clip": 0.01137483, + "auxiliary_loss_mlp": 0.01034787, + "balance_loss_clip": 1.05353105, + "balance_loss_mlp": 1.02023709, + "epoch": 0.4931309183826845, + "flos": 15924587546880.0, + "grad_norm": 5.042593731326544, + "language_loss": 0.67819077, + "learning_rate": 2.141451129398785e-06, + "loss": 0.69991344, + "num_input_tokens_seen": 176384475, + "router_z_loss_clip": 0.83886719, + "router_z_loss_mlp": 0.14544678, + "step": 8202, + "time_per_iteration": 2.476223945617676 + }, + { + "auxiliary_loss_clip": 0.011259, + "auxiliary_loss_mlp": 0.0102938, + "balance_loss_clip": 1.04778576, + "balance_loss_mlp": 1.01673234, + "epoch": 0.4931910416353525, + "flos": 27309059735040.0, + "grad_norm": 1.8643230543824663, + "language_loss": 0.75452399, + "learning_rate": 2.1410626403232076e-06, + "loss": 0.77607685, + "num_input_tokens_seen": 176402645, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.12664795, + "step": 8203, + "time_per_iteration": 2.529062509536743 + }, + { + "auxiliary_loss_clip": 0.01137971, + "auxiliary_loss_mlp": 0.01034441, + "balance_loss_clip": 1.06004691, + "balance_loss_mlp": 1.0206008, + "epoch": 0.49325116488802045, + "flos": 20806139635200.0, + "grad_norm": 2.1378444203859024, + "language_loss": 0.80287802, + "learning_rate": 2.1406741458985197e-06, + "loss": 0.82460219, + "num_input_tokens_seen": 176416715, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.1383667, + "step": 8204, + "time_per_iteration": 2.4323642253875732 + }, + { + "auxiliary_loss_clip": 0.01119132, + "auxiliary_loss_mlp": 0.01037408, + "balance_loss_clip": 1.04376423, + "balance_loss_mlp": 1.02374685, + "epoch": 0.4933112881406884, + "flos": 19865963099520.0, + "grad_norm": 3.8589366519147195, + "language_loss": 0.66007316, + "learning_rate": 2.140285646139455e-06, + "loss": 0.6816386, + "num_input_tokens_seen": 176435755, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.13659668, + "step": 8205, + "time_per_iteration": 2.5126254558563232 + }, + { + "auxiliary_loss_clip": 0.01130302, + "auxiliary_loss_mlp": 0.01039383, + "balance_loss_clip": 1.04753125, + "balance_loss_mlp": 1.02265811, + "epoch": 0.4933714113933564, + "flos": 21827977741440.0, + "grad_norm": 1.9333543112982223, + "language_loss": 0.66310847, + "learning_rate": 2.139897141060744e-06, + "loss": 0.68480539, + "num_input_tokens_seen": 176453915, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.16723633, + "step": 8206, + "time_per_iteration": 2.4581942558288574 + }, + { + "auxiliary_loss_clip": 0.01141221, + "auxiliary_loss_mlp": 0.01037713, + "balance_loss_clip": 1.05750513, + "balance_loss_mlp": 1.02477837, + "epoch": 0.49343153464602435, + "flos": 27890130049920.0, + "grad_norm": 1.5918684602856685, + "language_loss": 0.76665127, + "learning_rate": 2.1395086306771196e-06, + "loss": 0.7884407, + "num_input_tokens_seen": 176475175, + "router_z_loss_clip": 0.83740234, + "router_z_loss_mlp": 0.12945557, + "step": 8207, + "time_per_iteration": 2.549015998840332 + }, + { + "auxiliary_loss_clip": 0.01132822, + "auxiliary_loss_mlp": 0.01036665, + "balance_loss_clip": 1.05259669, + "balance_loss_mlp": 1.02260983, + "epoch": 0.4934916578986923, + "flos": 24681080983680.0, + "grad_norm": 2.6147456159819176, + "language_loss": 0.60049766, + "learning_rate": 2.1391201150033147e-06, + "loss": 0.6221925, + "num_input_tokens_seen": 176494250, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.14050293, + "step": 8208, + "time_per_iteration": 3.8914244174957275 + }, + { + "auxiliary_loss_clip": 0.01129059, + "auxiliary_loss_mlp": 0.01034595, + "balance_loss_clip": 1.04992712, + "balance_loss_mlp": 1.02117813, + "epoch": 0.4935517811513603, + "flos": 23405139089280.0, + "grad_norm": 2.0914560090374885, + "language_loss": 0.78597546, + "learning_rate": 2.1387315940540598e-06, + "loss": 0.807612, + "num_input_tokens_seen": 176513325, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.13421631, + "step": 8209, + "time_per_iteration": 2.5354397296905518 + }, + { + "auxiliary_loss_clip": 0.01130069, + "auxiliary_loss_mlp": 0.01035107, + "balance_loss_clip": 1.05117095, + "balance_loss_mlp": 1.021034, + "epoch": 0.49361190440402825, + "flos": 21944508439680.0, + "grad_norm": 4.090964910082745, + "language_loss": 0.78833538, + "learning_rate": 2.138343067844089e-06, + "loss": 0.80998707, + "num_input_tokens_seen": 176532915, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.14068604, + "step": 8210, + "time_per_iteration": 2.519756555557251 + }, + { + "auxiliary_loss_clip": 0.01144051, + "auxiliary_loss_mlp": 0.01040533, + "balance_loss_clip": 1.06119132, + "balance_loss_mlp": 1.02613878, + "epoch": 0.4936720276566962, + "flos": 25115671635840.0, + "grad_norm": 1.8046240803742624, + "language_loss": 0.81194901, + "learning_rate": 2.1379545363881363e-06, + "loss": 0.83379483, + "num_input_tokens_seen": 176552775, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.14385986, + "step": 8211, + "time_per_iteration": 2.537929058074951 + }, + { + "auxiliary_loss_clip": 0.01138836, + "auxiliary_loss_mlp": 0.01038176, + "balance_loss_clip": 1.05773926, + "balance_loss_mlp": 1.02437139, + "epoch": 0.4937321509093642, + "flos": 26358935132160.0, + "grad_norm": 2.4226721113346326, + "language_loss": 0.91478801, + "learning_rate": 2.137565999700933e-06, + "loss": 0.93655813, + "num_input_tokens_seen": 176572185, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.13812256, + "step": 8212, + "time_per_iteration": 2.4985315799713135 + }, + { + "auxiliary_loss_clip": 0.01126473, + "auxiliary_loss_mlp": 0.01036533, + "balance_loss_clip": 1.04666591, + "balance_loss_mlp": 1.02356291, + "epoch": 0.49379227416203214, + "flos": 22961390469120.0, + "grad_norm": 2.7405403116295513, + "language_loss": 0.65082955, + "learning_rate": 2.1371774577972138e-06, + "loss": 0.6724596, + "num_input_tokens_seen": 176591490, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.12982178, + "step": 8213, + "time_per_iteration": 4.1962151527404785 + }, + { + "auxiliary_loss_clip": 0.0112316, + "auxiliary_loss_mlp": 0.01032477, + "balance_loss_clip": 1.04559612, + "balance_loss_mlp": 1.01831532, + "epoch": 0.49385239741470016, + "flos": 32489101843200.0, + "grad_norm": 1.8149361573522922, + "language_loss": 0.75663614, + "learning_rate": 2.136788910691711e-06, + "loss": 0.77819252, + "num_input_tokens_seen": 176612715, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.14160156, + "step": 8214, + "time_per_iteration": 2.5944595336914062 + }, + { + "auxiliary_loss_clip": 0.01129428, + "auxiliary_loss_mlp": 0.01040692, + "balance_loss_clip": 1.04963231, + "balance_loss_mlp": 1.02488446, + "epoch": 0.4939125206673681, + "flos": 22492864442880.0, + "grad_norm": 2.0091692836406634, + "language_loss": 0.8438915, + "learning_rate": 2.1364003583991594e-06, + "loss": 0.86559272, + "num_input_tokens_seen": 176631950, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.15802002, + "step": 8215, + "time_per_iteration": 2.444017171859741 + }, + { + "auxiliary_loss_clip": 0.01139993, + "auxiliary_loss_mlp": 0.01028748, + "balance_loss_clip": 1.06417799, + "balance_loss_mlp": 1.01691055, + "epoch": 0.4939726439200361, + "flos": 31176351486720.0, + "grad_norm": 1.587840176148066, + "language_loss": 0.83471417, + "learning_rate": 2.136011800934292e-06, + "loss": 0.85640162, + "num_input_tokens_seen": 176653060, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.11828613, + "step": 8216, + "time_per_iteration": 2.5953140258789062 + }, + { + "auxiliary_loss_clip": 0.01131256, + "auxiliary_loss_mlp": 0.01040388, + "balance_loss_clip": 1.05164874, + "balance_loss_mlp": 1.0271914, + "epoch": 0.49403276717270406, + "flos": 22674213233280.0, + "grad_norm": 1.4672963243855355, + "language_loss": 0.7483055, + "learning_rate": 2.1356232383118442e-06, + "loss": 0.77002192, + "num_input_tokens_seen": 176673895, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.13201904, + "step": 8217, + "time_per_iteration": 2.445467948913574 + }, + { + "auxiliary_loss_clip": 0.01129971, + "auxiliary_loss_mlp": 0.0103656, + "balance_loss_clip": 1.0542655, + "balance_loss_mlp": 1.02312493, + "epoch": 0.494092890425372, + "flos": 20741070147840.0, + "grad_norm": 1.822452719697743, + "language_loss": 0.78590888, + "learning_rate": 2.1352346705465494e-06, + "loss": 0.80757415, + "num_input_tokens_seen": 176692550, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.13421631, + "step": 8218, + "time_per_iteration": 2.524888515472412 + }, + { + "auxiliary_loss_clip": 0.01134127, + "auxiliary_loss_mlp": 0.01036498, + "balance_loss_clip": 1.05927157, + "balance_loss_mlp": 1.02373064, + "epoch": 0.49415301367804, + "flos": 18369026778240.0, + "grad_norm": 2.2243686128259785, + "language_loss": 0.76897478, + "learning_rate": 2.134846097653142e-06, + "loss": 0.790681, + "num_input_tokens_seen": 176709335, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.12762451, + "step": 8219, + "time_per_iteration": 2.386704444885254 + }, + { + "auxiliary_loss_clip": 0.01131995, + "auxiliary_loss_mlp": 0.01039302, + "balance_loss_clip": 1.05026472, + "balance_loss_mlp": 1.02487195, + "epoch": 0.49421313693070795, + "flos": 17530620451200.0, + "grad_norm": 1.6311898318219766, + "language_loss": 0.62379694, + "learning_rate": 2.134457519646357e-06, + "loss": 0.64550996, + "num_input_tokens_seen": 176727715, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.14440918, + "step": 8220, + "time_per_iteration": 2.5650858879089355 + }, + { + "auxiliary_loss_clip": 0.01132166, + "auxiliary_loss_mlp": 0.01031029, + "balance_loss_clip": 1.05249679, + "balance_loss_mlp": 1.0174396, + "epoch": 0.4942732601833759, + "flos": 20812173120000.0, + "grad_norm": 1.6953036229910037, + "language_loss": 0.72359973, + "learning_rate": 2.1340689365409296e-06, + "loss": 0.74523169, + "num_input_tokens_seen": 176747530, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.13586426, + "step": 8221, + "time_per_iteration": 2.477416515350342 + }, + { + "auxiliary_loss_clip": 0.01121358, + "auxiliary_loss_mlp": 0.01037742, + "balance_loss_clip": 1.04810178, + "balance_loss_mlp": 1.02573156, + "epoch": 0.4943333834360439, + "flos": 15048941794560.0, + "grad_norm": 1.6879867497000882, + "language_loss": 0.79272234, + "learning_rate": 2.133680348351595e-06, + "loss": 0.81431329, + "num_input_tokens_seen": 176765260, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.12011719, + "step": 8222, + "time_per_iteration": 4.081455230712891 + }, + { + "auxiliary_loss_clip": 0.01126571, + "auxiliary_loss_mlp": 0.0103855, + "balance_loss_clip": 1.04868221, + "balance_loss_mlp": 1.02465653, + "epoch": 0.49439350668871185, + "flos": 16070420764800.0, + "grad_norm": 3.1745173517276677, + "language_loss": 0.71904725, + "learning_rate": 2.133291755093088e-06, + "loss": 0.74069846, + "num_input_tokens_seen": 176781770, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.13897705, + "step": 8223, + "time_per_iteration": 2.5313589572906494 + }, + { + "auxiliary_loss_clip": 0.01140881, + "auxiliary_loss_mlp": 0.01039027, + "balance_loss_clip": 1.05906463, + "balance_loss_mlp": 1.02487075, + "epoch": 0.4944536299413798, + "flos": 20880079781760.0, + "grad_norm": 1.604400106674424, + "language_loss": 0.75349903, + "learning_rate": 2.132903156780144e-06, + "loss": 0.77529812, + "num_input_tokens_seen": 176800655, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.14147949, + "step": 8224, + "time_per_iteration": 2.45591402053833 + }, + { + "auxiliary_loss_clip": 0.01142904, + "auxiliary_loss_mlp": 0.01032711, + "balance_loss_clip": 1.06361616, + "balance_loss_mlp": 1.01946664, + "epoch": 0.4945137531940478, + "flos": 26608908856320.0, + "grad_norm": 1.8397106432448682, + "language_loss": 0.6360575, + "learning_rate": 2.1325145534274997e-06, + "loss": 0.65781355, + "num_input_tokens_seen": 176820610, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.13250732, + "step": 8225, + "time_per_iteration": 2.520857095718384 + }, + { + "auxiliary_loss_clip": 0.0113467, + "auxiliary_loss_mlp": 0.01038528, + "balance_loss_clip": 1.05584669, + "balance_loss_mlp": 1.02496815, + "epoch": 0.49457387644671574, + "flos": 23988148738560.0, + "grad_norm": 5.762505424741279, + "language_loss": 0.76189184, + "learning_rate": 2.1321259450498893e-06, + "loss": 0.78362381, + "num_input_tokens_seen": 176840520, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.13568115, + "step": 8226, + "time_per_iteration": 2.4673404693603516 + }, + { + "auxiliary_loss_clip": 0.01131795, + "auxiliary_loss_mlp": 0.01042777, + "balance_loss_clip": 1.0489428, + "balance_loss_mlp": 1.0275178, + "epoch": 0.49463399969938376, + "flos": 26976598427520.0, + "grad_norm": 1.7323346769779007, + "language_loss": 0.70907557, + "learning_rate": 2.131737331662051e-06, + "loss": 0.73082125, + "num_input_tokens_seen": 176860265, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.15258789, + "step": 8227, + "time_per_iteration": 2.5485432147979736 + }, + { + "auxiliary_loss_clip": 0.01146195, + "auxiliary_loss_mlp": 0.01031322, + "balance_loss_clip": 1.06230974, + "balance_loss_mlp": 1.01814938, + "epoch": 0.49469412295205173, + "flos": 29681534067840.0, + "grad_norm": 1.5443543910149578, + "language_loss": 0.71472144, + "learning_rate": 2.131348713278718e-06, + "loss": 0.73649657, + "num_input_tokens_seen": 176882910, + "router_z_loss_clip": 0.83886719, + "router_z_loss_mlp": 0.1317749, + "step": 8228, + "time_per_iteration": 2.5362393856048584 + }, + { + "auxiliary_loss_clip": 0.01122516, + "auxiliary_loss_mlp": 0.0103005, + "balance_loss_clip": 1.04666162, + "balance_loss_mlp": 1.01672292, + "epoch": 0.4947542462047197, + "flos": 24131791226880.0, + "grad_norm": 1.6714149973964312, + "language_loss": 0.83621514, + "learning_rate": 2.1309600899146304e-06, + "loss": 0.85774076, + "num_input_tokens_seen": 176903030, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.13330078, + "step": 8229, + "time_per_iteration": 2.5333564281463623 + }, + { + "auxiliary_loss_clip": 0.01128677, + "auxiliary_loss_mlp": 0.01037887, + "balance_loss_clip": 1.04761124, + "balance_loss_mlp": 1.02281296, + "epoch": 0.49481436945738766, + "flos": 20045049333120.0, + "grad_norm": 2.051439951379451, + "language_loss": 0.74823225, + "learning_rate": 2.1305714615845227e-06, + "loss": 0.76989794, + "num_input_tokens_seen": 176919025, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.15075684, + "step": 8230, + "time_per_iteration": 2.438953161239624 + }, + { + "auxiliary_loss_clip": 0.01131607, + "auxiliary_loss_mlp": 0.01033888, + "balance_loss_clip": 1.05003238, + "balance_loss_mlp": 1.01976776, + "epoch": 0.4948744927100556, + "flos": 15669550005120.0, + "grad_norm": 1.9793335598549655, + "language_loss": 0.79969227, + "learning_rate": 2.1301828283031314e-06, + "loss": 0.82134724, + "num_input_tokens_seen": 176937945, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.14117432, + "step": 8231, + "time_per_iteration": 2.512531280517578 + }, + { + "auxiliary_loss_clip": 0.01072646, + "auxiliary_loss_mlp": 0.01006468, + "balance_loss_clip": 1.04657936, + "balance_loss_mlp": 1.00468564, + "epoch": 0.4949346159627236, + "flos": 68872071502080.0, + "grad_norm": 0.7503915744053317, + "language_loss": 0.6019429, + "learning_rate": 2.1297941900851944e-06, + "loss": 0.62273407, + "num_input_tokens_seen": 177004575, + "router_z_loss_clip": 0.26074219, + "router_z_loss_mlp": 0.01779175, + "step": 8232, + "time_per_iteration": 3.2324180603027344 + }, + { + "auxiliary_loss_clip": 0.01137781, + "auxiliary_loss_mlp": 0.01038371, + "balance_loss_clip": 1.05553079, + "balance_loss_mlp": 1.02419758, + "epoch": 0.49499473921539155, + "flos": 24790285307520.0, + "grad_norm": 1.7159909807571585, + "language_loss": 0.69512117, + "learning_rate": 2.1294055469454496e-06, + "loss": 0.71688277, + "num_input_tokens_seen": 177024155, + "router_z_loss_clip": 0.82275391, + "router_z_loss_mlp": 0.14172363, + "step": 8233, + "time_per_iteration": 2.540661573410034 + }, + { + "auxiliary_loss_clip": 0.01124045, + "auxiliary_loss_mlp": 0.01037871, + "balance_loss_clip": 1.04658306, + "balance_loss_mlp": 1.02298164, + "epoch": 0.4950548624680595, + "flos": 32707905540480.0, + "grad_norm": 1.9053020116613493, + "language_loss": 0.66592461, + "learning_rate": 2.129016898898633e-06, + "loss": 0.68754375, + "num_input_tokens_seen": 177046185, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.14862061, + "step": 8234, + "time_per_iteration": 2.5798068046569824 + }, + { + "auxiliary_loss_clip": 0.01054802, + "auxiliary_loss_mlp": 0.01004549, + "balance_loss_clip": 1.02817321, + "balance_loss_mlp": 1.00263596, + "epoch": 0.4951149857207275, + "flos": 50082173066880.0, + "grad_norm": 0.8008549844117498, + "language_loss": 0.58055019, + "learning_rate": 2.128628245959482e-06, + "loss": 0.60114372, + "num_input_tokens_seen": 177099025, + "router_z_loss_clip": 0.26660156, + "router_z_loss_mlp": 0.01916504, + "step": 8235, + "time_per_iteration": 3.0331878662109375 + }, + { + "auxiliary_loss_clip": 0.01132376, + "auxiliary_loss_mlp": 0.01036745, + "balance_loss_clip": 1.05146575, + "balance_loss_mlp": 1.02251697, + "epoch": 0.49517510897339545, + "flos": 22236785406720.0, + "grad_norm": 1.8094867386829474, + "language_loss": 0.77181029, + "learning_rate": 2.1282395881427355e-06, + "loss": 0.7935015, + "num_input_tokens_seen": 177118365, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.14233398, + "step": 8236, + "time_per_iteration": 2.44457745552063 + }, + { + "auxiliary_loss_clip": 0.01139814, + "auxiliary_loss_mlp": 0.01034971, + "balance_loss_clip": 1.06099534, + "balance_loss_mlp": 1.0216136, + "epoch": 0.4952352322260634, + "flos": 25374120969600.0, + "grad_norm": 1.696090297636482, + "language_loss": 0.72938889, + "learning_rate": 2.1278509254631315e-06, + "loss": 0.75113678, + "num_input_tokens_seen": 177136415, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.13354492, + "step": 8237, + "time_per_iteration": 4.002668857574463 + }, + { + "auxiliary_loss_clip": 0.01137151, + "auxiliary_loss_mlp": 0.01033526, + "balance_loss_clip": 1.05918705, + "balance_loss_mlp": 1.02044928, + "epoch": 0.4952953554787314, + "flos": 24608721035520.0, + "grad_norm": 1.7598780106488285, + "language_loss": 0.75856161, + "learning_rate": 2.127462257935406e-06, + "loss": 0.78026843, + "num_input_tokens_seen": 177155690, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.13085938, + "step": 8238, + "time_per_iteration": 2.497864246368408 + }, + { + "auxiliary_loss_clip": 0.01134865, + "auxiliary_loss_mlp": 0.0103919, + "balance_loss_clip": 1.05255485, + "balance_loss_mlp": 1.02451491, + "epoch": 0.49535547873139935, + "flos": 17311278049920.0, + "grad_norm": 2.208058390306511, + "language_loss": 0.73912323, + "learning_rate": 2.1270735855743008e-06, + "loss": 0.76086372, + "num_input_tokens_seen": 177173350, + "router_z_loss_clip": 0.82275391, + "router_z_loss_mlp": 0.14672852, + "step": 8239, + "time_per_iteration": 2.4291248321533203 + }, + { + "auxiliary_loss_clip": 0.01138757, + "auxiliary_loss_mlp": 0.01034436, + "balance_loss_clip": 1.05599451, + "balance_loss_mlp": 1.01847422, + "epoch": 0.4954156019840673, + "flos": 20740315962240.0, + "grad_norm": 2.148264529028651, + "language_loss": 0.78181028, + "learning_rate": 2.126684908394552e-06, + "loss": 0.8035422, + "num_input_tokens_seen": 177191115, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.15966797, + "step": 8240, + "time_per_iteration": 2.493220806121826 + }, + { + "auxiliary_loss_clip": 0.01134061, + "auxiliary_loss_mlp": 0.01037358, + "balance_loss_clip": 1.05613828, + "balance_loss_mlp": 1.02466857, + "epoch": 0.49547572523673533, + "flos": 12820684567680.0, + "grad_norm": 1.967498806348461, + "language_loss": 0.85881162, + "learning_rate": 2.126296226410898e-06, + "loss": 0.88052583, + "num_input_tokens_seen": 177206155, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.12683105, + "step": 8241, + "time_per_iteration": 2.385441303253174 + }, + { + "auxiliary_loss_clip": 0.0113833, + "auxiliary_loss_mlp": 0.01033033, + "balance_loss_clip": 1.06159043, + "balance_loss_mlp": 1.02053428, + "epoch": 0.4955358484894033, + "flos": 15597046402560.0, + "grad_norm": 1.8229471985872048, + "language_loss": 0.77260751, + "learning_rate": 2.1259075396380794e-06, + "loss": 0.79432112, + "num_input_tokens_seen": 177224815, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.12506104, + "step": 8242, + "time_per_iteration": 2.4622981548309326 + }, + { + "auxiliary_loss_clip": 0.01124995, + "auxiliary_loss_mlp": 0.01033112, + "balance_loss_clip": 1.0464313, + "balance_loss_mlp": 1.01934981, + "epoch": 0.49559597174207126, + "flos": 26464368528000.0, + "grad_norm": 1.632164991866018, + "language_loss": 0.67439735, + "learning_rate": 2.125518848090833e-06, + "loss": 0.69597846, + "num_input_tokens_seen": 177244490, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.13769531, + "step": 8243, + "time_per_iteration": 2.498863458633423 + }, + { + "auxiliary_loss_clip": 0.01128068, + "auxiliary_loss_mlp": 0.01034195, + "balance_loss_clip": 1.04986966, + "balance_loss_mlp": 1.02129674, + "epoch": 0.4956560949947392, + "flos": 23148234040320.0, + "grad_norm": 1.5709417666274312, + "language_loss": 0.67915696, + "learning_rate": 2.125130151783901e-06, + "loss": 0.70077956, + "num_input_tokens_seen": 177264340, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.12902832, + "step": 8244, + "time_per_iteration": 2.5625085830688477 + }, + { + "auxiliary_loss_clip": 0.01135055, + "auxiliary_loss_mlp": 0.0103779, + "balance_loss_clip": 1.05437779, + "balance_loss_mlp": 1.023139, + "epoch": 0.4957162182474072, + "flos": 20773461237120.0, + "grad_norm": 1.775142313711743, + "language_loss": 0.74968088, + "learning_rate": 2.12474145073202e-06, + "loss": 0.77140933, + "num_input_tokens_seen": 177283055, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.14666748, + "step": 8245, + "time_per_iteration": 2.4319818019866943 + }, + { + "auxiliary_loss_clip": 0.01132253, + "auxiliary_loss_mlp": 0.01037716, + "balance_loss_clip": 1.0549022, + "balance_loss_mlp": 1.02354205, + "epoch": 0.49577634150007516, + "flos": 18734202397440.0, + "grad_norm": 1.7335256103704415, + "language_loss": 0.8163895, + "learning_rate": 2.1243527449499306e-06, + "loss": 0.83808923, + "num_input_tokens_seen": 177301140, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.14172363, + "step": 8246, + "time_per_iteration": 2.520200252532959 + }, + { + "auxiliary_loss_clip": 0.01133529, + "auxiliary_loss_mlp": 0.01038062, + "balance_loss_clip": 1.05267572, + "balance_loss_mlp": 1.02352405, + "epoch": 0.4958364647527431, + "flos": 25554176870400.0, + "grad_norm": 2.0882555452762963, + "language_loss": 0.83558834, + "learning_rate": 2.1239640344523733e-06, + "loss": 0.85730433, + "num_input_tokens_seen": 177323095, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.14538574, + "step": 8247, + "time_per_iteration": 2.496458053588867 + }, + { + "auxiliary_loss_clip": 0.01132675, + "auxiliary_loss_mlp": 0.01031736, + "balance_loss_clip": 1.05158055, + "balance_loss_mlp": 1.01846194, + "epoch": 0.4958965880054111, + "flos": 24425325169920.0, + "grad_norm": 2.1466119559824213, + "language_loss": 0.83470285, + "learning_rate": 2.123575319254087e-06, + "loss": 0.85634696, + "num_input_tokens_seen": 177339845, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.13269043, + "step": 8248, + "time_per_iteration": 2.5533576011657715 + }, + { + "auxiliary_loss_clip": 0.01140623, + "auxiliary_loss_mlp": 0.01032327, + "balance_loss_clip": 1.059147, + "balance_loss_mlp": 1.01846874, + "epoch": 0.49595671125807905, + "flos": 25083460114560.0, + "grad_norm": 2.0365807616150455, + "language_loss": 0.73618817, + "learning_rate": 2.123186599369812e-06, + "loss": 0.75791776, + "num_input_tokens_seen": 177359980, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.1385498, + "step": 8249, + "time_per_iteration": 2.515927791595459 + }, + { + "auxiliary_loss_clip": 0.01134197, + "auxiliary_loss_mlp": 0.01043488, + "balance_loss_clip": 1.0533185, + "balance_loss_mlp": 1.02937365, + "epoch": 0.496016834510747, + "flos": 16435883692800.0, + "grad_norm": 2.0956447562636837, + "language_loss": 0.7578631, + "learning_rate": 2.122797874814289e-06, + "loss": 0.77963996, + "num_input_tokens_seen": 177378580, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.14111328, + "step": 8250, + "time_per_iteration": 2.4863696098327637 + }, + { + "auxiliary_loss_clip": 0.01130765, + "auxiliary_loss_mlp": 0.01041128, + "balance_loss_clip": 1.05106127, + "balance_loss_mlp": 1.02691877, + "epoch": 0.496076957763415, + "flos": 23437925228160.0, + "grad_norm": 1.7389165140159455, + "language_loss": 0.69917142, + "learning_rate": 2.1224091456022585e-06, + "loss": 0.72089028, + "num_input_tokens_seen": 177398790, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.14190674, + "step": 8251, + "time_per_iteration": 3.8863019943237305 + }, + { + "auxiliary_loss_clip": 0.01128022, + "auxiliary_loss_mlp": 0.01031675, + "balance_loss_clip": 1.05028129, + "balance_loss_mlp": 1.01859224, + "epoch": 0.49613708101608295, + "flos": 16909509450240.0, + "grad_norm": 1.7826920768209404, + "language_loss": 0.79831815, + "learning_rate": 2.122020411748461e-06, + "loss": 0.81991518, + "num_input_tokens_seen": 177416515, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.13085938, + "step": 8252, + "time_per_iteration": 2.476614475250244 + }, + { + "auxiliary_loss_clip": 0.01140789, + "auxiliary_loss_mlp": 0.01032555, + "balance_loss_clip": 1.05924869, + "balance_loss_mlp": 1.01723027, + "epoch": 0.4961972042687509, + "flos": 16618094409600.0, + "grad_norm": 1.8103101286376093, + "language_loss": 0.81567556, + "learning_rate": 2.1216316732676363e-06, + "loss": 0.83740902, + "num_input_tokens_seen": 177434425, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.15313721, + "step": 8253, + "time_per_iteration": 2.418337821960449 + }, + { + "auxiliary_loss_clip": 0.01129362, + "auxiliary_loss_mlp": 0.01029845, + "balance_loss_clip": 1.05223382, + "balance_loss_mlp": 1.01720321, + "epoch": 0.49625732752141893, + "flos": 28956749437440.0, + "grad_norm": 1.4254283917993795, + "language_loss": 0.67340302, + "learning_rate": 2.1212429301745275e-06, + "loss": 0.69499516, + "num_input_tokens_seen": 177459675, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.12658691, + "step": 8254, + "time_per_iteration": 2.559133529663086 + }, + { + "auxiliary_loss_clip": 0.01129114, + "auxiliary_loss_mlp": 0.01041701, + "balance_loss_clip": 1.04680467, + "balance_loss_mlp": 1.02747929, + "epoch": 0.4963174507740869, + "flos": 23112359331840.0, + "grad_norm": 1.8546781075676029, + "language_loss": 0.74350959, + "learning_rate": 2.1208541824838743e-06, + "loss": 0.76521778, + "num_input_tokens_seen": 177478895, + "router_z_loss_clip": 0.82275391, + "router_z_loss_mlp": 0.14233398, + "step": 8255, + "time_per_iteration": 2.4982235431671143 + }, + { + "auxiliary_loss_clip": 0.01132633, + "auxiliary_loss_mlp": 0.01038535, + "balance_loss_clip": 1.053864, + "balance_loss_mlp": 1.0229665, + "epoch": 0.49637757402675486, + "flos": 13917863450880.0, + "grad_norm": 1.7222962235861008, + "language_loss": 0.81005824, + "learning_rate": 2.1204654302104183e-06, + "loss": 0.83176994, + "num_input_tokens_seen": 177494920, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.15551758, + "step": 8256, + "time_per_iteration": 2.444768190383911 + }, + { + "auxiliary_loss_clip": 0.01124384, + "auxiliary_loss_mlp": 0.01035643, + "balance_loss_clip": 1.04765058, + "balance_loss_mlp": 1.02095616, + "epoch": 0.49643769727942283, + "flos": 22309001700480.0, + "grad_norm": 1.6664851838367265, + "language_loss": 0.81128371, + "learning_rate": 2.120076673368901e-06, + "loss": 0.83288395, + "num_input_tokens_seen": 177515455, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.14709473, + "step": 8257, + "time_per_iteration": 3.8815386295318604 + }, + { + "auxiliary_loss_clip": 0.0113724, + "auxiliary_loss_mlp": 0.01040176, + "balance_loss_clip": 1.05349302, + "balance_loss_mlp": 1.02429736, + "epoch": 0.4964978205320908, + "flos": 19500248776320.0, + "grad_norm": 1.8033430508393549, + "language_loss": 0.66414881, + "learning_rate": 2.1196879119740647e-06, + "loss": 0.68592298, + "num_input_tokens_seen": 177534040, + "router_z_loss_clip": 0.83789062, + "router_z_loss_mlp": 0.15881348, + "step": 8258, + "time_per_iteration": 2.4301676750183105 + }, + { + "auxiliary_loss_clip": 0.01121016, + "auxiliary_loss_mlp": 0.01030923, + "balance_loss_clip": 1.04468012, + "balance_loss_mlp": 1.01756573, + "epoch": 0.49655794378475876, + "flos": 23436524597760.0, + "grad_norm": 1.4012570701628926, + "language_loss": 0.7757991, + "learning_rate": 2.1192991460406502e-06, + "loss": 0.79731846, + "num_input_tokens_seen": 177554510, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.13354492, + "step": 8259, + "time_per_iteration": 2.5028905868530273 + }, + { + "auxiliary_loss_clip": 0.01126309, + "auxiliary_loss_mlp": 0.01037892, + "balance_loss_clip": 1.04939294, + "balance_loss_mlp": 1.02318752, + "epoch": 0.4966180670374267, + "flos": 26831124345600.0, + "grad_norm": 1.844051658953479, + "language_loss": 0.7843082, + "learning_rate": 2.1189103755834e-06, + "loss": 0.80595016, + "num_input_tokens_seen": 177575780, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.14691162, + "step": 8260, + "time_per_iteration": 2.4677035808563232 + }, + { + "auxiliary_loss_clip": 0.01132619, + "auxiliary_loss_mlp": 0.01038127, + "balance_loss_clip": 1.05055285, + "balance_loss_mlp": 1.02409637, + "epoch": 0.4966781902900947, + "flos": 22009326531840.0, + "grad_norm": 6.137549606465708, + "language_loss": 0.7675305, + "learning_rate": 2.1185216006170573e-06, + "loss": 0.78923798, + "num_input_tokens_seen": 177588965, + "router_z_loss_clip": 0.82080078, + "router_z_loss_mlp": 0.14038086, + "step": 8261, + "time_per_iteration": 2.431763172149658 + }, + { + "auxiliary_loss_clip": 0.01139784, + "auxiliary_loss_mlp": 0.0103227, + "balance_loss_clip": 1.05978179, + "balance_loss_mlp": 1.0192287, + "epoch": 0.49673831354276266, + "flos": 26213353309440.0, + "grad_norm": 1.9117535926627736, + "language_loss": 0.89507341, + "learning_rate": 2.1181328211563627e-06, + "loss": 0.91679394, + "num_input_tokens_seen": 177608425, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.13037109, + "step": 8262, + "time_per_iteration": 2.561831474304199 + }, + { + "auxiliary_loss_clip": 0.01130993, + "auxiliary_loss_mlp": 0.01031603, + "balance_loss_clip": 1.05287075, + "balance_loss_mlp": 1.01859164, + "epoch": 0.4967984367954306, + "flos": 23182277155200.0, + "grad_norm": 1.425458082063261, + "language_loss": 0.73926961, + "learning_rate": 2.11774403721606e-06, + "loss": 0.76089561, + "num_input_tokens_seen": 177628240, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.13012695, + "step": 8263, + "time_per_iteration": 2.519061803817749 + }, + { + "auxiliary_loss_clip": 0.01144334, + "auxiliary_loss_mlp": 0.01034432, + "balance_loss_clip": 1.06235409, + "balance_loss_mlp": 1.01974511, + "epoch": 0.4968585600480986, + "flos": 19281445079040.0, + "grad_norm": 2.27204157717091, + "language_loss": 0.69266599, + "learning_rate": 2.1173552488108923e-06, + "loss": 0.71445364, + "num_input_tokens_seen": 177645920, + "router_z_loss_clip": 0.81982422, + "router_z_loss_mlp": 0.14691162, + "step": 8264, + "time_per_iteration": 2.4472827911376953 + }, + { + "auxiliary_loss_clip": 0.01131121, + "auxiliary_loss_mlp": 0.01029937, + "balance_loss_clip": 1.0496521, + "balance_loss_mlp": 1.01667488, + "epoch": 0.49691868330076655, + "flos": 22528703237760.0, + "grad_norm": 1.7013259975977761, + "language_loss": 0.64882994, + "learning_rate": 2.1169664559556007e-06, + "loss": 0.6704405, + "num_input_tokens_seen": 177667185, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.13262939, + "step": 8265, + "time_per_iteration": 3.861186981201172 + }, + { + "auxiliary_loss_clip": 0.01068555, + "auxiliary_loss_mlp": 0.0100497, + "balance_loss_clip": 1.04081655, + "balance_loss_mlp": 1.00344539, + "epoch": 0.4969788065534345, + "flos": 66577128675840.0, + "grad_norm": 0.9571767266222905, + "language_loss": 0.53507483, + "learning_rate": 2.1165776586649304e-06, + "loss": 0.55581009, + "num_input_tokens_seen": 177733020, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.01527405, + "step": 8266, + "time_per_iteration": 3.1380691528320312 + }, + { + "auxiliary_loss_clip": 0.01134011, + "auxiliary_loss_mlp": 0.01030784, + "balance_loss_clip": 1.05784702, + "balance_loss_mlp": 1.01794553, + "epoch": 0.49703892980610254, + "flos": 24059503105920.0, + "grad_norm": 1.6699314253140114, + "language_loss": 0.7951839, + "learning_rate": 2.1161888569536223e-06, + "loss": 0.81683189, + "num_input_tokens_seen": 177753370, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.12835693, + "step": 8267, + "time_per_iteration": 2.512930393218994 + }, + { + "auxiliary_loss_clip": 0.0113262, + "auxiliary_loss_mlp": 0.01034889, + "balance_loss_clip": 1.05227196, + "balance_loss_mlp": 1.02066767, + "epoch": 0.4970990530587705, + "flos": 29126174912640.0, + "grad_norm": 2.9194576387568536, + "language_loss": 0.75250894, + "learning_rate": 2.1158000508364223e-06, + "loss": 0.77418411, + "num_input_tokens_seen": 177771530, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.14233398, + "step": 8268, + "time_per_iteration": 2.5380587577819824 + }, + { + "auxiliary_loss_clip": 0.01132148, + "auxiliary_loss_mlp": 0.01042757, + "balance_loss_clip": 1.05116618, + "balance_loss_mlp": 1.02863085, + "epoch": 0.49715917631143847, + "flos": 46026167258880.0, + "grad_norm": 1.5570470142131188, + "language_loss": 0.68304849, + "learning_rate": 2.115411240328073e-06, + "loss": 0.70479751, + "num_input_tokens_seen": 177796355, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.14123535, + "step": 8269, + "time_per_iteration": 2.6657352447509766 + }, + { + "auxiliary_loss_clip": 0.01137756, + "auxiliary_loss_mlp": 0.01035486, + "balance_loss_clip": 1.06087875, + "balance_loss_mlp": 1.02228343, + "epoch": 0.49721929956410643, + "flos": 20191277600640.0, + "grad_norm": 1.5414016308345635, + "language_loss": 0.85498041, + "learning_rate": 2.1150224254433167e-06, + "loss": 0.87671286, + "num_input_tokens_seen": 177814300, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.13214111, + "step": 8270, + "time_per_iteration": 2.5091936588287354 + }, + { + "auxiliary_loss_clip": 0.01135374, + "auxiliary_loss_mlp": 0.0102815, + "balance_loss_clip": 1.05574417, + "balance_loss_mlp": 1.01664662, + "epoch": 0.4972794228167744, + "flos": 21653560275840.0, + "grad_norm": 1.6297124984355942, + "language_loss": 0.70724535, + "learning_rate": 2.114633606196899e-06, + "loss": 0.72888064, + "num_input_tokens_seen": 177833615, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.11505127, + "step": 8271, + "time_per_iteration": 2.60363507270813 + }, + { + "auxiliary_loss_clip": 0.01128583, + "auxiliary_loss_mlp": 0.01032899, + "balance_loss_clip": 1.05086899, + "balance_loss_mlp": 1.01954794, + "epoch": 0.49733954606944236, + "flos": 24279743347200.0, + "grad_norm": 1.8434780194531792, + "language_loss": 0.78179598, + "learning_rate": 2.1142447826035635e-06, + "loss": 0.80341083, + "num_input_tokens_seen": 177855315, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.13342285, + "step": 8272, + "time_per_iteration": 2.5964958667755127 + }, + { + "auxiliary_loss_clip": 0.01137483, + "auxiliary_loss_mlp": 0.01035553, + "balance_loss_clip": 1.05778193, + "balance_loss_mlp": 1.02252924, + "epoch": 0.4973996693221103, + "flos": 37852575730560.0, + "grad_norm": 2.262694762158386, + "language_loss": 0.66581118, + "learning_rate": 2.1138559546780544e-06, + "loss": 0.6875416, + "num_input_tokens_seen": 177875590, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.13024902, + "step": 8273, + "time_per_iteration": 2.5939064025878906 + }, + { + "auxiliary_loss_clip": 0.01136717, + "auxiliary_loss_mlp": 0.01034493, + "balance_loss_clip": 1.05859613, + "balance_loss_mlp": 1.02148759, + "epoch": 0.4974597925747783, + "flos": 21361426963200.0, + "grad_norm": 1.8509398888677782, + "language_loss": 0.77938282, + "learning_rate": 2.1134671224351163e-06, + "loss": 0.80109495, + "num_input_tokens_seen": 177894175, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.13006592, + "step": 8274, + "time_per_iteration": 2.4680557250976562 + }, + { + "auxiliary_loss_clip": 0.01136183, + "auxiliary_loss_mlp": 0.01033895, + "balance_loss_clip": 1.05574739, + "balance_loss_mlp": 1.02038825, + "epoch": 0.49751991582744626, + "flos": 30738133560960.0, + "grad_norm": 1.871865810723383, + "language_loss": 0.75654566, + "learning_rate": 2.113078285889493e-06, + "loss": 0.7782464, + "num_input_tokens_seen": 177913920, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.1350708, + "step": 8275, + "time_per_iteration": 2.5045816898345947 + }, + { + "auxiliary_loss_clip": 0.01134228, + "auxiliary_loss_mlp": 0.01036747, + "balance_loss_clip": 1.05111909, + "balance_loss_mlp": 1.02185822, + "epoch": 0.4975800390801142, + "flos": 14100541044480.0, + "grad_norm": 2.4460206085516063, + "language_loss": 0.83502835, + "learning_rate": 2.1126894450559303e-06, + "loss": 0.85673809, + "num_input_tokens_seen": 177930425, + "router_z_loss_clip": 0.83056641, + "router_z_loss_mlp": 0.14892578, + "step": 8276, + "time_per_iteration": 2.4930639266967773 + }, + { + "auxiliary_loss_clip": 0.01126447, + "auxiliary_loss_mlp": 0.01028591, + "balance_loss_clip": 1.05163586, + "balance_loss_mlp": 1.01636004, + "epoch": 0.4976401623327822, + "flos": 24207275658240.0, + "grad_norm": 1.3899285866675735, + "language_loss": 0.70232028, + "learning_rate": 2.112300599949172e-06, + "loss": 0.72387069, + "num_input_tokens_seen": 177949885, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12237549, + "step": 8277, + "time_per_iteration": 2.4794845581054688 + }, + { + "auxiliary_loss_clip": 0.01122869, + "auxiliary_loss_mlp": 0.01037952, + "balance_loss_clip": 1.04680777, + "balance_loss_mlp": 1.02442193, + "epoch": 0.49770028558545015, + "flos": 21136769349120.0, + "grad_norm": 2.0392977620810213, + "language_loss": 0.8226108, + "learning_rate": 2.111911750583964e-06, + "loss": 0.84421897, + "num_input_tokens_seen": 177965720, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.13531494, + "step": 8278, + "time_per_iteration": 2.473756790161133 + }, + { + "auxiliary_loss_clip": 0.01132648, + "auxiliary_loss_mlp": 0.0103141, + "balance_loss_clip": 1.053581, + "balance_loss_mlp": 1.01871455, + "epoch": 0.4977604088381181, + "flos": 16763927627520.0, + "grad_norm": 2.3180641143607437, + "language_loss": 0.67194712, + "learning_rate": 2.111522896975052e-06, + "loss": 0.69358772, + "num_input_tokens_seen": 177983190, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.12695312, + "step": 8279, + "time_per_iteration": 2.4280059337615967 + }, + { + "auxiliary_loss_clip": 0.01119437, + "auxiliary_loss_mlp": 0.01040114, + "balance_loss_clip": 1.04103541, + "balance_loss_mlp": 1.02589262, + "epoch": 0.49782053209078614, + "flos": 15703521292800.0, + "grad_norm": 46.34483794343953, + "language_loss": 0.7091229, + "learning_rate": 2.1111340391371794e-06, + "loss": 0.73071837, + "num_input_tokens_seen": 178000155, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.14233398, + "step": 8280, + "time_per_iteration": 3.857363700866699 + }, + { + "auxiliary_loss_clip": 0.01124472, + "auxiliary_loss_mlp": 0.01035786, + "balance_loss_clip": 1.04603815, + "balance_loss_mlp": 1.02218986, + "epoch": 0.4978806553434541, + "flos": 24753692327040.0, + "grad_norm": 1.8937582442801346, + "language_loss": 0.64660609, + "learning_rate": 2.1107451770850936e-06, + "loss": 0.66820866, + "num_input_tokens_seen": 178021060, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.13592529, + "step": 8281, + "time_per_iteration": 2.444106101989746 + }, + { + "auxiliary_loss_clip": 0.01134162, + "auxiliary_loss_mlp": 0.0104326, + "balance_loss_clip": 1.05543685, + "balance_loss_mlp": 1.02872884, + "epoch": 0.49794077859612207, + "flos": 13115726881920.0, + "grad_norm": 2.1920646753843482, + "language_loss": 0.73208612, + "learning_rate": 2.1103563108335387e-06, + "loss": 0.75386035, + "num_input_tokens_seen": 178038180, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.14526367, + "step": 8282, + "time_per_iteration": 2.4330484867095947 + }, + { + "auxiliary_loss_clip": 0.01126361, + "auxiliary_loss_mlp": 0.01034097, + "balance_loss_clip": 1.05254984, + "balance_loss_mlp": 1.02258122, + "epoch": 0.49800090184879003, + "flos": 27525133998720.0, + "grad_norm": 1.6749760948309522, + "language_loss": 0.73604441, + "learning_rate": 2.109967440397263e-06, + "loss": 0.75764894, + "num_input_tokens_seen": 178057565, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11517334, + "step": 8283, + "time_per_iteration": 2.5374019145965576 + }, + { + "auxiliary_loss_clip": 0.011236, + "auxiliary_loss_mlp": 0.01044849, + "balance_loss_clip": 1.04708636, + "balance_loss_mlp": 1.03030598, + "epoch": 0.498061025101458, + "flos": 19792489829760.0, + "grad_norm": 1.6552928258854096, + "language_loss": 0.78685415, + "learning_rate": 2.1095785657910095e-06, + "loss": 0.80853868, + "num_input_tokens_seen": 178076965, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.14544678, + "step": 8284, + "time_per_iteration": 2.462373971939087 + }, + { + "auxiliary_loss_clip": 0.01135323, + "auxiliary_loss_mlp": 0.01035382, + "balance_loss_clip": 1.05406868, + "balance_loss_mlp": 1.02127945, + "epoch": 0.49812114835412596, + "flos": 29893909230720.0, + "grad_norm": 1.6828525173609206, + "language_loss": 0.73919678, + "learning_rate": 2.109189687029526e-06, + "loss": 0.76090384, + "num_input_tokens_seen": 178095105, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.14123535, + "step": 8285, + "time_per_iteration": 2.4956507682800293 + }, + { + "auxiliary_loss_clip": 0.01131002, + "auxiliary_loss_mlp": 0.0103611, + "balance_loss_clip": 1.05275953, + "balance_loss_mlp": 1.02259135, + "epoch": 0.49818127160679393, + "flos": 23147048891520.0, + "grad_norm": 1.9840614666040173, + "language_loss": 0.74124581, + "learning_rate": 2.1088008041275598e-06, + "loss": 0.76291692, + "num_input_tokens_seen": 178114505, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.13549805, + "step": 8286, + "time_per_iteration": 2.4759480953216553 + }, + { + "auxiliary_loss_clip": 0.01135727, + "auxiliary_loss_mlp": 0.01044178, + "balance_loss_clip": 1.05703878, + "balance_loss_mlp": 1.03142262, + "epoch": 0.4982413948594619, + "flos": 21652806090240.0, + "grad_norm": 1.826287339690092, + "language_loss": 0.85718405, + "learning_rate": 2.1084119170998545e-06, + "loss": 0.87898314, + "num_input_tokens_seen": 178131595, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.12750244, + "step": 8287, + "time_per_iteration": 2.443922758102417 + }, + { + "auxiliary_loss_clip": 0.01129645, + "auxiliary_loss_mlp": 0.01030362, + "balance_loss_clip": 1.05286646, + "balance_loss_mlp": 1.01693332, + "epoch": 0.49830151811212986, + "flos": 32486982940800.0, + "grad_norm": 1.80103796120448, + "language_loss": 0.72640443, + "learning_rate": 2.108023025961159e-06, + "loss": 0.74800456, + "num_input_tokens_seen": 178152055, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.13439941, + "step": 8288, + "time_per_iteration": 2.5243334770202637 + }, + { + "auxiliary_loss_clip": 0.01137902, + "auxiliary_loss_mlp": 0.01034132, + "balance_loss_clip": 1.05598903, + "balance_loss_mlp": 1.01932645, + "epoch": 0.4983616413647978, + "flos": 18142358002560.0, + "grad_norm": 3.2459234924949865, + "language_loss": 0.8078109, + "learning_rate": 2.10763413072622e-06, + "loss": 0.82953125, + "num_input_tokens_seen": 178168150, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.14794922, + "step": 8289, + "time_per_iteration": 2.413724422454834 + }, + { + "auxiliary_loss_clip": 0.01129374, + "auxiliary_loss_mlp": 0.01036182, + "balance_loss_clip": 1.05323291, + "balance_loss_mlp": 1.02324176, + "epoch": 0.4984217646174658, + "flos": 19718836992000.0, + "grad_norm": 2.5860103985690563, + "language_loss": 0.73607653, + "learning_rate": 2.107245231409784e-06, + "loss": 0.75773215, + "num_input_tokens_seen": 178186150, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.1295166, + "step": 8290, + "time_per_iteration": 2.5215084552764893 + }, + { + "auxiliary_loss_clip": 0.01125245, + "auxiliary_loss_mlp": 0.01035327, + "balance_loss_clip": 1.04769135, + "balance_loss_mlp": 1.02028275, + "epoch": 0.49848188787013376, + "flos": 24936549488640.0, + "grad_norm": 1.7112901416084274, + "language_loss": 0.84376216, + "learning_rate": 2.106856328026598e-06, + "loss": 0.86536795, + "num_input_tokens_seen": 178207665, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.1505127, + "step": 8291, + "time_per_iteration": 2.4782371520996094 + }, + { + "auxiliary_loss_clip": 0.01139994, + "auxiliary_loss_mlp": 0.01035852, + "balance_loss_clip": 1.05889559, + "balance_loss_mlp": 1.02193475, + "epoch": 0.4985420111228017, + "flos": 22382439056640.0, + "grad_norm": 5.6341149563475605, + "language_loss": 0.66816705, + "learning_rate": 2.106467420591409e-06, + "loss": 0.68992549, + "num_input_tokens_seen": 178226325, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.13922119, + "step": 8292, + "time_per_iteration": 2.4732744693756104 + }, + { + "auxiliary_loss_clip": 0.01118751, + "auxiliary_loss_mlp": 0.0103202, + "balance_loss_clip": 1.04275513, + "balance_loss_mlp": 1.01972985, + "epoch": 0.4986021343754697, + "flos": 16216469464320.0, + "grad_norm": 1.6301481691576807, + "language_loss": 0.67205894, + "learning_rate": 2.106078509118965e-06, + "loss": 0.69356656, + "num_input_tokens_seen": 178244960, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.1229248, + "step": 8293, + "time_per_iteration": 2.4410316944122314 + }, + { + "auxiliary_loss_clip": 0.01132238, + "auxiliary_loss_mlp": 0.01026439, + "balance_loss_clip": 1.05241883, + "balance_loss_mlp": 1.01330245, + "epoch": 0.4986622576281377, + "flos": 23403594804480.0, + "grad_norm": 2.8823473800038246, + "language_loss": 0.82049805, + "learning_rate": 2.1056895936240133e-06, + "loss": 0.84208483, + "num_input_tokens_seen": 178265400, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.13134766, + "step": 8294, + "time_per_iteration": 3.9159741401672363 + }, + { + "auxiliary_loss_clip": 0.01119936, + "auxiliary_loss_mlp": 0.01030635, + "balance_loss_clip": 1.04259181, + "balance_loss_mlp": 1.01716471, + "epoch": 0.49872238088080567, + "flos": 19974556892160.0, + "grad_norm": 1.7896693671550081, + "language_loss": 0.73228288, + "learning_rate": 2.1053006741213016e-06, + "loss": 0.75378853, + "num_input_tokens_seen": 178284535, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.13470459, + "step": 8295, + "time_per_iteration": 2.526810884475708 + }, + { + "auxiliary_loss_clip": 0.01125602, + "auxiliary_loss_mlp": 0.01034303, + "balance_loss_clip": 1.04874396, + "balance_loss_mlp": 1.02179801, + "epoch": 0.49878250413347364, + "flos": 22893016930560.0, + "grad_norm": 2.2554634092384873, + "language_loss": 0.67575914, + "learning_rate": 2.1049117506255775e-06, + "loss": 0.69735813, + "num_input_tokens_seen": 178302425, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.12506104, + "step": 8296, + "time_per_iteration": 2.4958291053771973 + }, + { + "auxiliary_loss_clip": 0.01125525, + "auxiliary_loss_mlp": 0.01040731, + "balance_loss_clip": 1.04596198, + "balance_loss_mlp": 1.02686703, + "epoch": 0.4988426273861416, + "flos": 32598449821440.0, + "grad_norm": 1.7339796014542903, + "language_loss": 0.64645869, + "learning_rate": 2.1045228231515895e-06, + "loss": 0.66812128, + "num_input_tokens_seen": 178323065, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.13842773, + "step": 8297, + "time_per_iteration": 2.5567455291748047 + }, + { + "auxiliary_loss_clip": 0.01120799, + "auxiliary_loss_mlp": 0.01036422, + "balance_loss_clip": 1.04467273, + "balance_loss_mlp": 1.02461982, + "epoch": 0.49890275063880957, + "flos": 20923604087040.0, + "grad_norm": 1.7997121437791077, + "language_loss": 0.69639486, + "learning_rate": 2.1041338917140857e-06, + "loss": 0.71796709, + "num_input_tokens_seen": 178343985, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.11798096, + "step": 8298, + "time_per_iteration": 2.4542157649993896 + }, + { + "auxiliary_loss_clip": 0.01120616, + "auxiliary_loss_mlp": 0.01032365, + "balance_loss_clip": 1.0465064, + "balance_loss_mlp": 1.01984239, + "epoch": 0.49896287389147753, + "flos": 18624459369600.0, + "grad_norm": 1.6922062047851574, + "language_loss": 0.84706533, + "learning_rate": 2.103744956327814e-06, + "loss": 0.86859512, + "num_input_tokens_seen": 178362345, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.12518311, + "step": 8299, + "time_per_iteration": 2.4843928813934326 + }, + { + "auxiliary_loss_clip": 0.01124513, + "auxiliary_loss_mlp": 0.01037385, + "balance_loss_clip": 1.04373276, + "balance_loss_mlp": 1.02335441, + "epoch": 0.4990229971441455, + "flos": 24826555065600.0, + "grad_norm": 2.016953848091201, + "language_loss": 0.69253337, + "learning_rate": 2.1033560170075234e-06, + "loss": 0.71415234, + "num_input_tokens_seen": 178383190, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.14013672, + "step": 8300, + "time_per_iteration": 2.4834091663360596 + }, + { + "auxiliary_loss_clip": 0.01053681, + "auxiliary_loss_mlp": 0.01008478, + "balance_loss_clip": 1.02772033, + "balance_loss_mlp": 1.00708795, + "epoch": 0.49908312039681346, + "flos": 71384525136000.0, + "grad_norm": 0.7534540080638396, + "language_loss": 0.51055533, + "learning_rate": 2.1029670737679623e-06, + "loss": 0.53117692, + "num_input_tokens_seen": 178444250, + "router_z_loss_clip": 0.26025391, + "router_z_loss_mlp": 0.01391602, + "step": 8301, + "time_per_iteration": 4.559696912765503 + }, + { + "auxiliary_loss_clip": 0.01124858, + "auxiliary_loss_mlp": 0.01037473, + "balance_loss_clip": 1.05040026, + "balance_loss_mlp": 1.02411544, + "epoch": 0.4991432436494814, + "flos": 19828651847040.0, + "grad_norm": 1.6399858491864192, + "language_loss": 0.84686947, + "learning_rate": 2.102578126623879e-06, + "loss": 0.86849272, + "num_input_tokens_seen": 178463250, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.13348389, + "step": 8302, + "time_per_iteration": 2.4585483074188232 + }, + { + "auxiliary_loss_clip": 0.01121684, + "auxiliary_loss_mlp": 0.01028083, + "balance_loss_clip": 1.04744172, + "balance_loss_mlp": 1.01618624, + "epoch": 0.4992033669021494, + "flos": 15121912273920.0, + "grad_norm": 1.8633451903089004, + "language_loss": 0.69196773, + "learning_rate": 2.102189175590024e-06, + "loss": 0.71346533, + "num_input_tokens_seen": 178481340, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11907959, + "step": 8303, + "time_per_iteration": 2.4527547359466553 + }, + { + "auxiliary_loss_clip": 0.01125807, + "auxiliary_loss_mlp": 0.01041672, + "balance_loss_clip": 1.04690206, + "balance_loss_mlp": 1.02668774, + "epoch": 0.49926349015481736, + "flos": 31207952476800.0, + "grad_norm": 1.7509481260873672, + "language_loss": 0.73105705, + "learning_rate": 2.101800220681144e-06, + "loss": 0.7527318, + "num_input_tokens_seen": 178501545, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.14978027, + "step": 8304, + "time_per_iteration": 2.526726007461548 + }, + { + "auxiliary_loss_clip": 0.01130404, + "auxiliary_loss_mlp": 0.01032848, + "balance_loss_clip": 1.05335402, + "balance_loss_mlp": 1.02052236, + "epoch": 0.4993236134074853, + "flos": 24900207903360.0, + "grad_norm": 2.0017577676185017, + "language_loss": 0.80827087, + "learning_rate": 2.10141126191199e-06, + "loss": 0.82990336, + "num_input_tokens_seen": 178519700, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12335205, + "step": 8305, + "time_per_iteration": 2.4511191844940186 + }, + { + "auxiliary_loss_clip": 0.01094916, + "auxiliary_loss_mlp": 0.01012692, + "balance_loss_clip": 1.06864154, + "balance_loss_mlp": 1.01051617, + "epoch": 0.4993837366601533, + "flos": 70420573797120.0, + "grad_norm": 0.7101946592841168, + "language_loss": 0.56894827, + "learning_rate": 2.1010222992973107e-06, + "loss": 0.59002435, + "num_input_tokens_seen": 178576740, + "router_z_loss_clip": 0.26318359, + "router_z_loss_mlp": 0.02175903, + "step": 8306, + "time_per_iteration": 3.1742632389068604 + }, + { + "auxiliary_loss_clip": 0.01129278, + "auxiliary_loss_mlp": 0.01034932, + "balance_loss_clip": 1.05163336, + "balance_loss_mlp": 1.02026963, + "epoch": 0.4994438599128213, + "flos": 15961216440960.0, + "grad_norm": 1.8810468467548358, + "language_loss": 0.83114791, + "learning_rate": 2.1006333328518556e-06, + "loss": 0.85279, + "num_input_tokens_seen": 178594745, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.14660645, + "step": 8307, + "time_per_iteration": 2.4023149013519287 + }, + { + "auxiliary_loss_clip": 0.01125607, + "auxiliary_loss_mlp": 0.01034226, + "balance_loss_clip": 1.04730797, + "balance_loss_mlp": 1.02044535, + "epoch": 0.4995039831654893, + "flos": 27928303228800.0, + "grad_norm": 1.7159488072405495, + "language_loss": 0.60921085, + "learning_rate": 2.1002443625903748e-06, + "loss": 0.63080913, + "num_input_tokens_seen": 178614110, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.13775635, + "step": 8308, + "time_per_iteration": 2.517829656600952 + }, + { + "auxiliary_loss_clip": 0.01121273, + "auxiliary_loss_mlp": 0.0102976, + "balance_loss_clip": 1.04687738, + "balance_loss_mlp": 1.01738024, + "epoch": 0.49956410641815724, + "flos": 24204797619840.0, + "grad_norm": 2.158230301201558, + "language_loss": 0.74659407, + "learning_rate": 2.0998553885276168e-06, + "loss": 0.76810437, + "num_input_tokens_seen": 178634170, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.12371826, + "step": 8309, + "time_per_iteration": 3.9213340282440186 + }, + { + "auxiliary_loss_clip": 0.01122005, + "auxiliary_loss_mlp": 0.01031895, + "balance_loss_clip": 1.04521132, + "balance_loss_mlp": 1.01922941, + "epoch": 0.4996242296708252, + "flos": 16180127879040.0, + "grad_norm": 2.47843124832174, + "language_loss": 0.80153608, + "learning_rate": 2.0994664106783335e-06, + "loss": 0.82307506, + "num_input_tokens_seen": 178651775, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.12670898, + "step": 8310, + "time_per_iteration": 2.5073142051696777 + }, + { + "auxiliary_loss_clip": 0.01132687, + "auxiliary_loss_mlp": 0.01039496, + "balance_loss_clip": 1.05315948, + "balance_loss_mlp": 1.02553058, + "epoch": 0.49968435292349317, + "flos": 16873527000960.0, + "grad_norm": 1.865716735430571, + "language_loss": 0.7095682, + "learning_rate": 2.0990774290572735e-06, + "loss": 0.7312901, + "num_input_tokens_seen": 178669720, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.1395874, + "step": 8311, + "time_per_iteration": 2.508803129196167 + }, + { + "auxiliary_loss_clip": 0.01130768, + "auxiliary_loss_mlp": 0.01032769, + "balance_loss_clip": 1.05432451, + "balance_loss_mlp": 1.02026963, + "epoch": 0.49974447617616113, + "flos": 14939521989120.0, + "grad_norm": 1.8971373166768257, + "language_loss": 0.77511114, + "learning_rate": 2.098688443679187e-06, + "loss": 0.79674649, + "num_input_tokens_seen": 178686765, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.12493896, + "step": 8312, + "time_per_iteration": 2.46073842048645 + }, + { + "auxiliary_loss_clip": 0.01124966, + "auxiliary_loss_mlp": 0.01031252, + "balance_loss_clip": 1.04749012, + "balance_loss_mlp": 1.01800203, + "epoch": 0.4998045994288291, + "flos": 26651535321600.0, + "grad_norm": 1.6583057016698395, + "language_loss": 0.84393758, + "learning_rate": 2.0982994545588256e-06, + "loss": 0.86549973, + "num_input_tokens_seen": 178705845, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.13250732, + "step": 8313, + "time_per_iteration": 2.486816883087158 + }, + { + "auxiliary_loss_clip": 0.01137868, + "auxiliary_loss_mlp": 0.01034805, + "balance_loss_clip": 1.05513406, + "balance_loss_mlp": 1.02083921, + "epoch": 0.49986472268149706, + "flos": 20953768533120.0, + "grad_norm": 1.7343622770581002, + "language_loss": 0.80473578, + "learning_rate": 2.097910461710939e-06, + "loss": 0.82646251, + "num_input_tokens_seen": 178723410, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.13983154, + "step": 8314, + "time_per_iteration": 2.443408966064453 + }, + { + "auxiliary_loss_clip": 0.01130216, + "auxiliary_loss_mlp": 0.01042005, + "balance_loss_clip": 1.04924631, + "balance_loss_mlp": 1.02734232, + "epoch": 0.49992484593416503, + "flos": 22783884433920.0, + "grad_norm": 1.7487692015897474, + "language_loss": 0.79365766, + "learning_rate": 2.0975214651502773e-06, + "loss": 0.81537986, + "num_input_tokens_seen": 178743560, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.14660645, + "step": 8315, + "time_per_iteration": 2.4441139698028564 + }, + { + "auxiliary_loss_clip": 0.01132911, + "auxiliary_loss_mlp": 0.01036911, + "balance_loss_clip": 1.05414319, + "balance_loss_mlp": 1.02437651, + "epoch": 0.499984969186833, + "flos": 46786970252160.0, + "grad_norm": 1.8214036049640954, + "language_loss": 0.7423991, + "learning_rate": 2.0971324648915926e-06, + "loss": 0.76409733, + "num_input_tokens_seen": 178767225, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.12548828, + "step": 8316, + "time_per_iteration": 2.6746578216552734 + }, + { + "auxiliary_loss_clip": 0.01124685, + "auxiliary_loss_mlp": 0.01034678, + "balance_loss_clip": 1.05081439, + "balance_loss_mlp": 1.02256012, + "epoch": 0.500045092439501, + "flos": 25556978131200.0, + "grad_norm": 1.4933683736518673, + "language_loss": 0.81095457, + "learning_rate": 2.0967434609496343e-06, + "loss": 0.83254826, + "num_input_tokens_seen": 178786810, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.12121582, + "step": 8317, + "time_per_iteration": 2.4847893714904785 + }, + { + "auxiliary_loss_clip": 0.01127305, + "auxiliary_loss_mlp": 0.01033243, + "balance_loss_clip": 1.04876721, + "balance_loss_mlp": 1.01896191, + "epoch": 0.5001052156921689, + "flos": 20704764476160.0, + "grad_norm": 1.698083745183831, + "language_loss": 0.83596265, + "learning_rate": 2.0963544533391548e-06, + "loss": 0.85756814, + "num_input_tokens_seen": 178805660, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.14282227, + "step": 8318, + "time_per_iteration": 2.446558952331543 + }, + { + "auxiliary_loss_clip": 0.01131173, + "auxiliary_loss_mlp": 0.01030555, + "balance_loss_clip": 1.05237353, + "balance_loss_mlp": 1.01775777, + "epoch": 0.500165338944837, + "flos": 21251109317760.0, + "grad_norm": 1.8445550387892151, + "language_loss": 0.81795198, + "learning_rate": 2.0959654420749045e-06, + "loss": 0.83956933, + "num_input_tokens_seen": 178824780, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.12792969, + "step": 8319, + "time_per_iteration": 2.421041250228882 + }, + { + "auxiliary_loss_clip": 0.01123914, + "auxiliary_loss_mlp": 0.01026546, + "balance_loss_clip": 1.04585171, + "balance_loss_mlp": 1.01438046, + "epoch": 0.5002254621975049, + "flos": 27854398995840.0, + "grad_norm": 1.5941273996141698, + "language_loss": 0.71810305, + "learning_rate": 2.095576427171635e-06, + "loss": 0.73960769, + "num_input_tokens_seen": 178845640, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.1217041, + "step": 8320, + "time_per_iteration": 2.5542781352996826 + }, + { + "auxiliary_loss_clip": 0.01129087, + "auxiliary_loss_mlp": 0.01046613, + "balance_loss_clip": 1.04481268, + "balance_loss_mlp": 1.0310322, + "epoch": 0.5002855854501729, + "flos": 15551941898880.0, + "grad_norm": 3.7826357270297613, + "language_loss": 0.76448274, + "learning_rate": 2.0951874086440978e-06, + "loss": 0.7862398, + "num_input_tokens_seen": 178862290, + "router_z_loss_clip": 0.84228516, + "router_z_loss_mlp": 0.15563965, + "step": 8321, + "time_per_iteration": 2.5915489196777344 + }, + { + "auxiliary_loss_clip": 0.01130902, + "auxiliary_loss_mlp": 0.01039276, + "balance_loss_clip": 1.05256391, + "balance_loss_mlp": 1.02579355, + "epoch": 0.5003457087028408, + "flos": 16107408794880.0, + "grad_norm": 1.6509552001789274, + "language_loss": 0.82974541, + "learning_rate": 2.0947983865070455e-06, + "loss": 0.85144722, + "num_input_tokens_seen": 178879805, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.13482666, + "step": 8322, + "time_per_iteration": 2.460846424102783 + }, + { + "auxiliary_loss_clip": 0.01128262, + "auxiliary_loss_mlp": 0.01041354, + "balance_loss_clip": 1.04828382, + "balance_loss_mlp": 1.02547538, + "epoch": 0.5004058319555088, + "flos": 22710518904960.0, + "grad_norm": 2.6576261671200703, + "language_loss": 0.73698997, + "learning_rate": 2.094409360775228e-06, + "loss": 0.75868607, + "num_input_tokens_seen": 178896985, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.15881348, + "step": 8323, + "time_per_iteration": 3.8797552585601807 + }, + { + "auxiliary_loss_clip": 0.01135203, + "auxiliary_loss_mlp": 0.01037013, + "balance_loss_clip": 1.05542219, + "balance_loss_mlp": 1.02365553, + "epoch": 0.5004659552081767, + "flos": 30117956313600.0, + "grad_norm": 1.4959751024420547, + "language_loss": 0.69635558, + "learning_rate": 2.0940203314633977e-06, + "loss": 0.71807772, + "num_input_tokens_seen": 178920605, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.13372803, + "step": 8324, + "time_per_iteration": 2.5094518661499023 + }, + { + "auxiliary_loss_clip": 0.0112462, + "auxiliary_loss_mlp": 0.01034861, + "balance_loss_clip": 1.04637051, + "balance_loss_mlp": 1.02155137, + "epoch": 0.5005260784608447, + "flos": 18624710764800.0, + "grad_norm": 2.549204540781245, + "language_loss": 0.72186327, + "learning_rate": 2.0936312985863077e-06, + "loss": 0.74345815, + "num_input_tokens_seen": 178937760, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.13305664, + "step": 8325, + "time_per_iteration": 2.480844736099243 + }, + { + "auxiliary_loss_clip": 0.01137419, + "auxiliary_loss_mlp": 0.01040648, + "balance_loss_clip": 1.05572104, + "balance_loss_mlp": 1.02558041, + "epoch": 0.5005862017135126, + "flos": 24859987649280.0, + "grad_norm": 1.561135450715482, + "language_loss": 0.73126638, + "learning_rate": 2.093242262158709e-06, + "loss": 0.75304705, + "num_input_tokens_seen": 178957985, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.1505127, + "step": 8326, + "time_per_iteration": 2.4483299255371094 + }, + { + "auxiliary_loss_clip": 0.01132945, + "auxiliary_loss_mlp": 0.0103143, + "balance_loss_clip": 1.05481267, + "balance_loss_mlp": 1.01881146, + "epoch": 0.5006463249661807, + "flos": 18734381965440.0, + "grad_norm": 1.4754918046837682, + "language_loss": 0.77845043, + "learning_rate": 2.0928532221953544e-06, + "loss": 0.80009425, + "num_input_tokens_seen": 178977070, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.1262207, + "step": 8327, + "time_per_iteration": 2.456334352493286 + }, + { + "auxiliary_loss_clip": 0.01130198, + "auxiliary_loss_mlp": 0.01039148, + "balance_loss_clip": 1.05111492, + "balance_loss_mlp": 1.02602339, + "epoch": 0.5007064482188487, + "flos": 13042145871360.0, + "grad_norm": 2.093087174361592, + "language_loss": 0.87617874, + "learning_rate": 2.092464178710997e-06, + "loss": 0.89787221, + "num_input_tokens_seen": 178994175, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.13116455, + "step": 8328, + "time_per_iteration": 2.397469997406006 + }, + { + "auxiliary_loss_clip": 0.01138378, + "auxiliary_loss_mlp": 0.01034884, + "balance_loss_clip": 1.05690598, + "balance_loss_mlp": 1.02122235, + "epoch": 0.5007665714715166, + "flos": 21288671965440.0, + "grad_norm": 2.6529765440369832, + "language_loss": 0.74350095, + "learning_rate": 2.092075131720388e-06, + "loss": 0.76523352, + "num_input_tokens_seen": 179013710, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.13665771, + "step": 8329, + "time_per_iteration": 2.454352378845215 + }, + { + "auxiliary_loss_clip": 0.01139555, + "auxiliary_loss_mlp": 0.01036568, + "balance_loss_clip": 1.06231534, + "balance_loss_mlp": 1.02338314, + "epoch": 0.5008266947241846, + "flos": 29754576374400.0, + "grad_norm": 1.77507291051021, + "language_loss": 0.79510653, + "learning_rate": 2.091686081238281e-06, + "loss": 0.81686771, + "num_input_tokens_seen": 179035255, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.13189697, + "step": 8330, + "time_per_iteration": 2.4920504093170166 + }, + { + "auxiliary_loss_clip": 0.0107001, + "auxiliary_loss_mlp": 0.01003577, + "balance_loss_clip": 1.04382706, + "balance_loss_mlp": 1.00220788, + "epoch": 0.5008868179768525, + "flos": 63557829204480.0, + "grad_norm": 0.7299388668223736, + "language_loss": 0.56092888, + "learning_rate": 2.0912970272794282e-06, + "loss": 0.5816648, + "num_input_tokens_seen": 179090915, + "router_z_loss_clip": 0.26147461, + "router_z_loss_mlp": 0.01370239, + "step": 8331, + "time_per_iteration": 2.8929288387298584 + }, + { + "auxiliary_loss_clip": 0.0112685, + "auxiliary_loss_mlp": 0.01034399, + "balance_loss_clip": 1.04839242, + "balance_loss_mlp": 1.02126265, + "epoch": 0.5009469412295205, + "flos": 27375637593600.0, + "grad_norm": 1.8097467913166156, + "language_loss": 0.64729631, + "learning_rate": 2.0909079698585833e-06, + "loss": 0.66890883, + "num_input_tokens_seen": 179109160, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.13153076, + "step": 8332, + "time_per_iteration": 2.4922454357147217 + }, + { + "auxiliary_loss_clip": 0.01125445, + "auxiliary_loss_mlp": 0.01032925, + "balance_loss_clip": 1.04942667, + "balance_loss_mlp": 1.02070594, + "epoch": 0.5010070644821885, + "flos": 27378833904000.0, + "grad_norm": 1.4818992643539404, + "language_loss": 0.74646837, + "learning_rate": 2.0905189089904993e-06, + "loss": 0.76805204, + "num_input_tokens_seen": 179130610, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.12213135, + "step": 8333, + "time_per_iteration": 2.5744001865386963 + }, + { + "auxiliary_loss_clip": 0.01131147, + "auxiliary_loss_mlp": 0.01033198, + "balance_loss_clip": 1.05172753, + "balance_loss_mlp": 1.02018034, + "epoch": 0.5010671877348565, + "flos": 20662748542080.0, + "grad_norm": 2.9361021345453926, + "language_loss": 0.80592823, + "learning_rate": 2.090129844689929e-06, + "loss": 0.82757163, + "num_input_tokens_seen": 179147860, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.13012695, + "step": 8334, + "time_per_iteration": 2.4174606800079346 + }, + { + "auxiliary_loss_clip": 0.01046102, + "auxiliary_loss_mlp": 0.01005641, + "balance_loss_clip": 1.02020717, + "balance_loss_mlp": 1.00412965, + "epoch": 0.5011273109875244, + "flos": 59128645000320.0, + "grad_norm": 0.9002576693121936, + "language_loss": 0.6267609, + "learning_rate": 2.089740776971626e-06, + "loss": 0.64727831, + "num_input_tokens_seen": 179210490, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.0151062, + "step": 8335, + "time_per_iteration": 3.112091302871704 + }, + { + "auxiliary_loss_clip": 0.01122898, + "auxiliary_loss_mlp": 0.01026792, + "balance_loss_clip": 1.04689741, + "balance_loss_mlp": 1.01398301, + "epoch": 0.5011874342401924, + "flos": 25336342840320.0, + "grad_norm": 1.6554688923056071, + "language_loss": 0.79824674, + "learning_rate": 2.0893517058503435e-06, + "loss": 0.81974363, + "num_input_tokens_seen": 179231360, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12805176, + "step": 8336, + "time_per_iteration": 2.4716548919677734 + }, + { + "auxiliary_loss_clip": 0.01129147, + "auxiliary_loss_mlp": 0.01035467, + "balance_loss_clip": 1.05027628, + "balance_loss_mlp": 1.02056634, + "epoch": 0.5012475574928603, + "flos": 20229953569920.0, + "grad_norm": 1.827621721261409, + "language_loss": 0.79991078, + "learning_rate": 2.088962631340836e-06, + "loss": 0.82155693, + "num_input_tokens_seen": 179250625, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.14904785, + "step": 8337, + "time_per_iteration": 2.496281147003174 + }, + { + "auxiliary_loss_clip": 0.01132043, + "auxiliary_loss_mlp": 0.01035716, + "balance_loss_clip": 1.04740548, + "balance_loss_mlp": 1.02186978, + "epoch": 0.5013076807455283, + "flos": 22710123855360.0, + "grad_norm": 1.9173455483938813, + "language_loss": 0.79280233, + "learning_rate": 2.0885735534578555e-06, + "loss": 0.81447989, + "num_input_tokens_seen": 179267360, + "router_z_loss_clip": 0.84667969, + "router_z_loss_mlp": 0.13842773, + "step": 8338, + "time_per_iteration": 3.889582872390747 + }, + { + "auxiliary_loss_clip": 0.01134922, + "auxiliary_loss_mlp": 0.0102867, + "balance_loss_clip": 1.05412197, + "balance_loss_mlp": 1.01512802, + "epoch": 0.5013678039981962, + "flos": 24245161528320.0, + "grad_norm": 1.8199208654725783, + "language_loss": 0.85117257, + "learning_rate": 2.0881844722161583e-06, + "loss": 0.87280846, + "num_input_tokens_seen": 179289810, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.13555908, + "step": 8339, + "time_per_iteration": 2.50301194190979 + }, + { + "auxiliary_loss_clip": 0.01132861, + "auxiliary_loss_mlp": 0.01039951, + "balance_loss_clip": 1.0537138, + "balance_loss_mlp": 1.02652776, + "epoch": 0.5014279272508643, + "flos": 26176688501760.0, + "grad_norm": 1.6507264503102044, + "language_loss": 0.7062003, + "learning_rate": 2.0877953876304962e-06, + "loss": 0.7279284, + "num_input_tokens_seen": 179310620, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.13433838, + "step": 8340, + "time_per_iteration": 2.528055191040039 + }, + { + "auxiliary_loss_clip": 0.01127972, + "auxiliary_loss_mlp": 0.01035447, + "balance_loss_clip": 1.04540181, + "balance_loss_mlp": 1.02104616, + "epoch": 0.5014880505035323, + "flos": 21430446946560.0, + "grad_norm": 1.980404379827992, + "language_loss": 0.78480017, + "learning_rate": 2.0874062997156245e-06, + "loss": 0.80643433, + "num_input_tokens_seen": 179329005, + "router_z_loss_clip": 0.82470703, + "router_z_loss_mlp": 0.14398193, + "step": 8341, + "time_per_iteration": 2.460995674133301 + }, + { + "auxiliary_loss_clip": 0.01132927, + "auxiliary_loss_mlp": 0.01040451, + "balance_loss_clip": 1.04853761, + "balance_loss_mlp": 1.02551401, + "epoch": 0.5015481737562002, + "flos": 15770745596160.0, + "grad_norm": 2.509874636428902, + "language_loss": 0.89294088, + "learning_rate": 2.0870172084862975e-06, + "loss": 0.91467464, + "num_input_tokens_seen": 179343785, + "router_z_loss_clip": 0.84326172, + "router_z_loss_mlp": 0.14941406, + "step": 8342, + "time_per_iteration": 2.4682185649871826 + }, + { + "auxiliary_loss_clip": 0.01124887, + "auxiliary_loss_mlp": 0.01041012, + "balance_loss_clip": 1.04739833, + "balance_loss_mlp": 1.02753568, + "epoch": 0.5016082970088682, + "flos": 26830801123200.0, + "grad_norm": 1.7593706167566117, + "language_loss": 0.76550412, + "learning_rate": 2.0866281139572682e-06, + "loss": 0.78716314, + "num_input_tokens_seen": 179364070, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.13470459, + "step": 8343, + "time_per_iteration": 2.488966941833496 + }, + { + "auxiliary_loss_clip": 0.01128942, + "auxiliary_loss_mlp": 0.01030172, + "balance_loss_clip": 1.05292678, + "balance_loss_mlp": 1.01806676, + "epoch": 0.5016684202615361, + "flos": 21470595373440.0, + "grad_norm": 6.188111022218718, + "language_loss": 0.67308885, + "learning_rate": 2.086239016143293e-06, + "loss": 0.69467998, + "num_input_tokens_seen": 179384225, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.12097168, + "step": 8344, + "time_per_iteration": 3.9217281341552734 + }, + { + "auxiliary_loss_clip": 0.01122379, + "auxiliary_loss_mlp": 0.01037426, + "balance_loss_clip": 1.04335606, + "balance_loss_mlp": 1.0240804, + "epoch": 0.5017285435142042, + "flos": 26246821806720.0, + "grad_norm": 1.872566275405554, + "language_loss": 0.75858337, + "learning_rate": 2.0858499150591258e-06, + "loss": 0.78018147, + "num_input_tokens_seen": 179402595, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.13354492, + "step": 8345, + "time_per_iteration": 2.4984853267669678 + }, + { + "auxiliary_loss_clip": 0.01125378, + "auxiliary_loss_mlp": 0.01036789, + "balance_loss_clip": 1.04542089, + "balance_loss_mlp": 1.0218941, + "epoch": 0.5017886667668721, + "flos": 20777555387520.0, + "grad_norm": 2.1108304445855604, + "language_loss": 0.79015666, + "learning_rate": 2.0854608107195203e-06, + "loss": 0.81177831, + "num_input_tokens_seen": 179419635, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.14898682, + "step": 8346, + "time_per_iteration": 2.4380950927734375 + }, + { + "auxiliary_loss_clip": 0.01127328, + "auxiliary_loss_mlp": 0.01034448, + "balance_loss_clip": 1.04708827, + "balance_loss_mlp": 1.02098322, + "epoch": 0.5018487900195401, + "flos": 20156408472960.0, + "grad_norm": 1.4787562647844852, + "language_loss": 0.69157332, + "learning_rate": 2.0850717031392333e-06, + "loss": 0.71319103, + "num_input_tokens_seen": 179438770, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.13464355, + "step": 8347, + "time_per_iteration": 2.40789532661438 + }, + { + "auxiliary_loss_clip": 0.01137361, + "auxiliary_loss_mlp": 0.01032077, + "balance_loss_clip": 1.05325007, + "balance_loss_mlp": 1.01836228, + "epoch": 0.501908913272208, + "flos": 18150689957760.0, + "grad_norm": 2.588661131321841, + "language_loss": 0.71372777, + "learning_rate": 2.0846825923330174e-06, + "loss": 0.73542213, + "num_input_tokens_seen": 179457475, + "router_z_loss_clip": 0.84179688, + "router_z_loss_mlp": 0.13708496, + "step": 8348, + "time_per_iteration": 2.5200042724609375 + }, + { + "auxiliary_loss_clip": 0.01122617, + "auxiliary_loss_mlp": 0.01030268, + "balance_loss_clip": 1.04592907, + "balance_loss_mlp": 1.01771522, + "epoch": 0.501969036524876, + "flos": 23112287504640.0, + "grad_norm": 1.4420669471627987, + "language_loss": 0.74165779, + "learning_rate": 2.0842934783156303e-06, + "loss": 0.76318669, + "num_input_tokens_seen": 179478140, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.12554932, + "step": 8349, + "time_per_iteration": 2.4425058364868164 + }, + { + "auxiliary_loss_clip": 0.01132195, + "auxiliary_loss_mlp": 0.010356, + "balance_loss_clip": 1.0501194, + "balance_loss_mlp": 1.02116358, + "epoch": 0.5020291597775439, + "flos": 11363214314880.0, + "grad_norm": 2.0375588754868525, + "language_loss": 0.63885659, + "learning_rate": 2.0839043611018266e-06, + "loss": 0.6605345, + "num_input_tokens_seen": 179494325, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.14440918, + "step": 8350, + "time_per_iteration": 2.414036273956299 + }, + { + "auxiliary_loss_clip": 0.01059362, + "auxiliary_loss_mlp": 0.01011333, + "balance_loss_clip": 1.03338432, + "balance_loss_mlp": 1.00980425, + "epoch": 0.5020892830302119, + "flos": 64011094928640.0, + "grad_norm": 1.3911382749587482, + "language_loss": 0.59801894, + "learning_rate": 2.0835152407063597e-06, + "loss": 0.6187259, + "num_input_tokens_seen": 179553545, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.01527405, + "step": 8351, + "time_per_iteration": 3.216226100921631 + }, + { + "auxiliary_loss_clip": 0.01133016, + "auxiliary_loss_mlp": 0.01042141, + "balance_loss_clip": 1.05060458, + "balance_loss_mlp": 1.02818775, + "epoch": 0.5021494062828799, + "flos": 23732859801600.0, + "grad_norm": 3.3326120815256175, + "language_loss": 0.75137269, + "learning_rate": 2.0831261171439873e-06, + "loss": 0.77312422, + "num_input_tokens_seen": 179573645, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.1395874, + "step": 8352, + "time_per_iteration": 2.492128849029541 + }, + { + "auxiliary_loss_clip": 0.0112993, + "auxiliary_loss_mlp": 0.01035007, + "balance_loss_clip": 1.05086327, + "balance_loss_mlp": 1.02088118, + "epoch": 0.5022095295355479, + "flos": 21576747041280.0, + "grad_norm": 1.6915571489317704, + "language_loss": 0.71905601, + "learning_rate": 2.082736990429464e-06, + "loss": 0.74070537, + "num_input_tokens_seen": 179591435, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.14117432, + "step": 8353, + "time_per_iteration": 3.827885150909424 + }, + { + "auxiliary_loss_clip": 0.01131862, + "auxiliary_loss_mlp": 0.01032804, + "balance_loss_clip": 1.05076599, + "balance_loss_mlp": 1.01808786, + "epoch": 0.5022696527882159, + "flos": 21397229844480.0, + "grad_norm": 1.7237356092648621, + "language_loss": 0.74053204, + "learning_rate": 2.0823478605775455e-06, + "loss": 0.76217872, + "num_input_tokens_seen": 179609955, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.14715576, + "step": 8354, + "time_per_iteration": 2.4352493286132812 + }, + { + "auxiliary_loss_clip": 0.0113525, + "auxiliary_loss_mlp": 0.01040808, + "balance_loss_clip": 1.05545461, + "balance_loss_mlp": 1.02684832, + "epoch": 0.5023297760408838, + "flos": 27160712565120.0, + "grad_norm": 1.4884618516278445, + "language_loss": 0.72507244, + "learning_rate": 2.0819587276029884e-06, + "loss": 0.74683303, + "num_input_tokens_seen": 179630875, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.13952637, + "step": 8355, + "time_per_iteration": 2.5327439308166504 + }, + { + "auxiliary_loss_clip": 0.01131427, + "auxiliary_loss_mlp": 0.01040369, + "balance_loss_clip": 1.04908466, + "balance_loss_mlp": 1.02331042, + "epoch": 0.5023898992935518, + "flos": 26213820186240.0, + "grad_norm": 1.8550322867909477, + "language_loss": 0.81267297, + "learning_rate": 2.081569591520548e-06, + "loss": 0.83439088, + "num_input_tokens_seen": 179649835, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.17053223, + "step": 8356, + "time_per_iteration": 2.4962055683135986 + }, + { + "auxiliary_loss_clip": 0.01135333, + "auxiliary_loss_mlp": 0.01035221, + "balance_loss_clip": 1.04912853, + "balance_loss_mlp": 1.01935434, + "epoch": 0.5024500225462197, + "flos": 13440323111040.0, + "grad_norm": 2.081623452773896, + "language_loss": 0.76165682, + "learning_rate": 2.0811804523449803e-06, + "loss": 0.78336239, + "num_input_tokens_seen": 179667605, + "router_z_loss_clip": 0.86181641, + "router_z_loss_mlp": 0.15875244, + "step": 8357, + "time_per_iteration": 2.560828924179077 + }, + { + "auxiliary_loss_clip": 0.01131464, + "auxiliary_loss_mlp": 0.01035865, + "balance_loss_clip": 1.05065978, + "balance_loss_mlp": 1.0216198, + "epoch": 0.5025101457988878, + "flos": 21579584215680.0, + "grad_norm": 1.597547594070686, + "language_loss": 0.76484752, + "learning_rate": 2.0807913100910417e-06, + "loss": 0.78652084, + "num_input_tokens_seen": 179686910, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.14239502, + "step": 8358, + "time_per_iteration": 2.467078924179077 + }, + { + "auxiliary_loss_clip": 0.01139209, + "auxiliary_loss_mlp": 0.010367, + "balance_loss_clip": 1.05878663, + "balance_loss_mlp": 1.02233553, + "epoch": 0.5025702690515557, + "flos": 24645134448000.0, + "grad_norm": 2.183822052178724, + "language_loss": 0.72329688, + "learning_rate": 2.0804021647734887e-06, + "loss": 0.74505597, + "num_input_tokens_seen": 179706395, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.14361572, + "step": 8359, + "time_per_iteration": 2.5722055435180664 + }, + { + "auxiliary_loss_clip": 0.01128933, + "auxiliary_loss_mlp": 0.0104495, + "balance_loss_clip": 1.04827142, + "balance_loss_mlp": 1.0306325, + "epoch": 0.5026303923042237, + "flos": 22090162089600.0, + "grad_norm": 1.5827557938000723, + "language_loss": 0.76875025, + "learning_rate": 2.080013016407077e-06, + "loss": 0.79048908, + "num_input_tokens_seen": 179725735, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.14306641, + "step": 8360, + "time_per_iteration": 2.4544012546539307 + }, + { + "auxiliary_loss_clip": 0.01129156, + "auxiliary_loss_mlp": 0.01037824, + "balance_loss_clip": 1.04925168, + "balance_loss_mlp": 1.02385879, + "epoch": 0.5026905155568916, + "flos": 23697200574720.0, + "grad_norm": 1.5536128986350428, + "language_loss": 0.76403916, + "learning_rate": 2.0796238650065645e-06, + "loss": 0.78570896, + "num_input_tokens_seen": 179746150, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.1395874, + "step": 8361, + "time_per_iteration": 2.5689470767974854 + }, + { + "auxiliary_loss_clip": 0.01138094, + "auxiliary_loss_mlp": 0.01036118, + "balance_loss_clip": 1.05451596, + "balance_loss_mlp": 1.02122903, + "epoch": 0.5027506388095596, + "flos": 25812410722560.0, + "grad_norm": 1.7254879453512182, + "language_loss": 0.85127795, + "learning_rate": 2.0792347105867065e-06, + "loss": 0.87302005, + "num_input_tokens_seen": 179767550, + "router_z_loss_clip": 0.83544922, + "router_z_loss_mlp": 0.14892578, + "step": 8362, + "time_per_iteration": 2.4697060585021973 + }, + { + "auxiliary_loss_clip": 0.01127723, + "auxiliary_loss_mlp": 0.01030796, + "balance_loss_clip": 1.0476346, + "balance_loss_mlp": 1.01730764, + "epoch": 0.5028107620622275, + "flos": 27526606456320.0, + "grad_norm": 1.7872732034799963, + "language_loss": 0.78080255, + "learning_rate": 2.0788455531622605e-06, + "loss": 0.80238777, + "num_input_tokens_seen": 179790075, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.1348877, + "step": 8363, + "time_per_iteration": 2.580056667327881 + }, + { + "auxiliary_loss_clip": 0.01120371, + "auxiliary_loss_mlp": 0.01032447, + "balance_loss_clip": 1.04382133, + "balance_loss_mlp": 1.01841009, + "epoch": 0.5028708853148955, + "flos": 24534278098560.0, + "grad_norm": 2.655288040986019, + "language_loss": 0.7564947, + "learning_rate": 2.0784563927479838e-06, + "loss": 0.77802289, + "num_input_tokens_seen": 179806515, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.14031982, + "step": 8364, + "time_per_iteration": 2.4724786281585693 + }, + { + "auxiliary_loss_clip": 0.01123344, + "auxiliary_loss_mlp": 0.01029798, + "balance_loss_clip": 1.04649258, + "balance_loss_mlp": 1.01688719, + "epoch": 0.5029310085675635, + "flos": 20813609664000.0, + "grad_norm": 1.6019454000855322, + "language_loss": 0.69723451, + "learning_rate": 2.0780672293586317e-06, + "loss": 0.71876591, + "num_input_tokens_seen": 179826450, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.12921143, + "step": 8365, + "time_per_iteration": 2.482938528060913 + }, + { + "auxiliary_loss_clip": 0.01126325, + "auxiliary_loss_mlp": 0.01039189, + "balance_loss_clip": 1.04367304, + "balance_loss_mlp": 1.0228337, + "epoch": 0.5029911318202315, + "flos": 22342470197760.0, + "grad_norm": 1.5660694339128332, + "language_loss": 0.73484796, + "learning_rate": 2.0776780630089635e-06, + "loss": 0.75650305, + "num_input_tokens_seen": 179846770, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.16369629, + "step": 8366, + "time_per_iteration": 2.4862964153289795 + }, + { + "auxiliary_loss_clip": 0.01139826, + "auxiliary_loss_mlp": 0.01031846, + "balance_loss_clip": 1.06010723, + "balance_loss_mlp": 1.01853585, + "epoch": 0.5030512550728995, + "flos": 24352713826560.0, + "grad_norm": 1.4276739512839998, + "language_loss": 0.78409791, + "learning_rate": 2.077288893713735e-06, + "loss": 0.80581462, + "num_input_tokens_seen": 179866585, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.13311768, + "step": 8367, + "time_per_iteration": 3.975619077682495 + }, + { + "auxiliary_loss_clip": 0.01127458, + "auxiliary_loss_mlp": 0.01030317, + "balance_loss_clip": 1.04960477, + "balance_loss_mlp": 1.01721621, + "epoch": 0.5031113783255674, + "flos": 18259930195200.0, + "grad_norm": 1.612485766790339, + "language_loss": 0.69843227, + "learning_rate": 2.0768997214877035e-06, + "loss": 0.72000998, + "num_input_tokens_seen": 179885575, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.13104248, + "step": 8368, + "time_per_iteration": 2.4256691932678223 + }, + { + "auxiliary_loss_clip": 0.01067887, + "auxiliary_loss_mlp": 0.01002349, + "balance_loss_clip": 1.03883564, + "balance_loss_mlp": 1.00094104, + "epoch": 0.5031715015782354, + "flos": 57253173200640.0, + "grad_norm": 0.854438849436177, + "language_loss": 0.63334298, + "learning_rate": 2.0765105463456274e-06, + "loss": 0.65404534, + "num_input_tokens_seen": 179939650, + "router_z_loss_clip": 0.29052734, + "router_z_loss_mlp": 0.0140686, + "step": 8369, + "time_per_iteration": 3.0520336627960205 + }, + { + "auxiliary_loss_clip": 0.01128467, + "auxiliary_loss_mlp": 0.0103597, + "balance_loss_clip": 1.04914725, + "balance_loss_mlp": 1.0228864, + "epoch": 0.5032316248309033, + "flos": 27527360641920.0, + "grad_norm": 2.067523119708632, + "language_loss": 0.60643709, + "learning_rate": 2.076121368302263e-06, + "loss": 0.6280815, + "num_input_tokens_seen": 179961765, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.13098145, + "step": 8370, + "time_per_iteration": 2.553826332092285 + }, + { + "auxiliary_loss_clip": 0.01134868, + "auxiliary_loss_mlp": 0.01036853, + "balance_loss_clip": 1.0536238, + "balance_loss_mlp": 1.02168405, + "epoch": 0.5032917480835714, + "flos": 34495825939200.0, + "grad_norm": 1.6135639815114575, + "language_loss": 0.68276215, + "learning_rate": 2.0757321873723695e-06, + "loss": 0.70447934, + "num_input_tokens_seen": 179983015, + "router_z_loss_clip": 0.81201172, + "router_z_loss_mlp": 0.1517334, + "step": 8371, + "time_per_iteration": 2.589358329772949 + }, + { + "auxiliary_loss_clip": 0.01131673, + "auxiliary_loss_mlp": 0.01032312, + "balance_loss_clip": 1.05026174, + "balance_loss_mlp": 1.0174228, + "epoch": 0.5033518713362393, + "flos": 33656773167360.0, + "grad_norm": 2.420704219755574, + "language_loss": 0.68017191, + "learning_rate": 2.0753430035707042e-06, + "loss": 0.70181179, + "num_input_tokens_seen": 180003210, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.14874268, + "step": 8372, + "time_per_iteration": 2.6122522354125977 + }, + { + "auxiliary_loss_clip": 0.01138756, + "auxiliary_loss_mlp": 0.01038416, + "balance_loss_clip": 1.05470502, + "balance_loss_mlp": 1.0226804, + "epoch": 0.5034119945889073, + "flos": 28185495586560.0, + "grad_norm": 2.2537808747045744, + "language_loss": 0.67180151, + "learning_rate": 2.0749538169120235e-06, + "loss": 0.69357324, + "num_input_tokens_seen": 180025530, + "router_z_loss_clip": 0.83935547, + "router_z_loss_mlp": 0.1574707, + "step": 8373, + "time_per_iteration": 2.529744863510132 + }, + { + "auxiliary_loss_clip": 0.01133372, + "auxiliary_loss_mlp": 0.01032156, + "balance_loss_clip": 1.05398726, + "balance_loss_mlp": 1.01825619, + "epoch": 0.5034721178415752, + "flos": 21358697529600.0, + "grad_norm": 1.7569421112352541, + "language_loss": 0.74788433, + "learning_rate": 2.0745646274110872e-06, + "loss": 0.76953959, + "num_input_tokens_seen": 180043180, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.13903809, + "step": 8374, + "time_per_iteration": 2.545454740524292 + }, + { + "auxiliary_loss_clip": 0.01135196, + "auxiliary_loss_mlp": 0.01042465, + "balance_loss_clip": 1.05432093, + "balance_loss_mlp": 1.02773643, + "epoch": 0.5035322410942432, + "flos": 22674823764480.0, + "grad_norm": 1.585298981642501, + "language_loss": 0.68072188, + "learning_rate": 2.0741754350826525e-06, + "loss": 0.70249844, + "num_input_tokens_seen": 180062905, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.1473999, + "step": 8375, + "time_per_iteration": 2.4542205333709717 + }, + { + "auxiliary_loss_clip": 0.011371, + "auxiliary_loss_mlp": 0.01036692, + "balance_loss_clip": 1.0522387, + "balance_loss_mlp": 1.02069449, + "epoch": 0.5035923643469111, + "flos": 19828723674240.0, + "grad_norm": 1.8138510020092034, + "language_loss": 0.78638053, + "learning_rate": 2.0737862399414777e-06, + "loss": 0.80811846, + "num_input_tokens_seen": 180082000, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.15985107, + "step": 8376, + "time_per_iteration": 2.4729084968566895 + }, + { + "auxiliary_loss_clip": 0.01140948, + "auxiliary_loss_mlp": 0.01037349, + "balance_loss_clip": 1.05590105, + "balance_loss_mlp": 1.02257907, + "epoch": 0.5036524875995791, + "flos": 30514625182080.0, + "grad_norm": 1.942601032806692, + "language_loss": 0.59598082, + "learning_rate": 2.0733970420023213e-06, + "loss": 0.61776376, + "num_input_tokens_seen": 180101340, + "router_z_loss_clip": 0.85107422, + "router_z_loss_mlp": 0.14776611, + "step": 8377, + "time_per_iteration": 2.5130183696746826 + }, + { + "auxiliary_loss_clip": 0.01133342, + "auxiliary_loss_mlp": 0.01038929, + "balance_loss_clip": 1.05333614, + "balance_loss_mlp": 1.02480888, + "epoch": 0.5037126108522471, + "flos": 14720574637440.0, + "grad_norm": 2.203756182053658, + "language_loss": 0.75655955, + "learning_rate": 2.0730078412799425e-06, + "loss": 0.77828228, + "num_input_tokens_seen": 180119160, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.14135742, + "step": 8378, + "time_per_iteration": 2.448324203491211 + }, + { + "auxiliary_loss_clip": 0.01121847, + "auxiliary_loss_mlp": 0.01034388, + "balance_loss_clip": 1.0435096, + "balance_loss_mlp": 1.02075601, + "epoch": 0.5037727341049151, + "flos": 25297702784640.0, + "grad_norm": 2.1959898455410882, + "language_loss": 0.7497887, + "learning_rate": 2.0726186377890985e-06, + "loss": 0.7713511, + "num_input_tokens_seen": 180138730, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.13635254, + "step": 8379, + "time_per_iteration": 2.5154097080230713 + }, + { + "auxiliary_loss_clip": 0.0112626, + "auxiliary_loss_mlp": 0.01034569, + "balance_loss_clip": 1.04845524, + "balance_loss_mlp": 1.02097273, + "epoch": 0.5038328573575831, + "flos": 28541764632960.0, + "grad_norm": 3.3237693087758804, + "language_loss": 0.66710567, + "learning_rate": 2.072229431544548e-06, + "loss": 0.68871403, + "num_input_tokens_seen": 180158810, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.13604736, + "step": 8380, + "time_per_iteration": 2.580469846725464 + }, + { + "auxiliary_loss_clip": 0.01132308, + "auxiliary_loss_mlp": 0.01036758, + "balance_loss_clip": 1.05315828, + "balance_loss_mlp": 1.02359104, + "epoch": 0.503892980610251, + "flos": 31649869503360.0, + "grad_norm": 2.7367596662485036, + "language_loss": 0.63677061, + "learning_rate": 2.071840222561051e-06, + "loss": 0.65846127, + "num_input_tokens_seen": 180179700, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.1317749, + "step": 8381, + "time_per_iteration": 3.965348482131958 + }, + { + "auxiliary_loss_clip": 0.01123795, + "auxiliary_loss_mlp": 0.01039794, + "balance_loss_clip": 1.04634368, + "balance_loss_mlp": 1.02623367, + "epoch": 0.503953103862919, + "flos": 27089358197760.0, + "grad_norm": 1.5707019159159998, + "language_loss": 0.67452234, + "learning_rate": 2.071451010853365e-06, + "loss": 0.69615823, + "num_input_tokens_seen": 180199890, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.13568115, + "step": 8382, + "time_per_iteration": 2.5947203636169434 + }, + { + "auxiliary_loss_clip": 0.0113626, + "auxiliary_loss_mlp": 0.01041358, + "balance_loss_clip": 1.0504396, + "balance_loss_mlp": 1.0250262, + "epoch": 0.5040132271155869, + "flos": 15632957024640.0, + "grad_norm": 1.711657361739396, + "language_loss": 0.623586, + "learning_rate": 2.0710617964362506e-06, + "loss": 0.64536214, + "num_input_tokens_seen": 180217840, + "router_z_loss_clip": 0.85888672, + "router_z_loss_mlp": 0.16333008, + "step": 8383, + "time_per_iteration": 2.4685914516448975 + }, + { + "auxiliary_loss_clip": 0.01131084, + "auxiliary_loss_mlp": 0.01035362, + "balance_loss_clip": 1.05393434, + "balance_loss_mlp": 1.02094316, + "epoch": 0.504073350368255, + "flos": 13590106824960.0, + "grad_norm": 1.8267148114462137, + "language_loss": 0.66970539, + "learning_rate": 2.070672579324465e-06, + "loss": 0.69136989, + "num_input_tokens_seen": 180236465, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.14422607, + "step": 8384, + "time_per_iteration": 2.46024751663208 + }, + { + "auxiliary_loss_clip": 0.01132588, + "auxiliary_loss_mlp": 0.01038876, + "balance_loss_clip": 1.05154884, + "balance_loss_mlp": 1.0260309, + "epoch": 0.5041334736209229, + "flos": 29058160510080.0, + "grad_norm": 1.8624775856851337, + "language_loss": 0.7134726, + "learning_rate": 2.0702833595327674e-06, + "loss": 0.73518723, + "num_input_tokens_seen": 180258025, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.12841797, + "step": 8385, + "time_per_iteration": 2.568227767944336 + }, + { + "auxiliary_loss_clip": 0.01132704, + "auxiliary_loss_mlp": 0.01027112, + "balance_loss_clip": 1.05573916, + "balance_loss_mlp": 1.01403451, + "epoch": 0.5041935968735909, + "flos": 24608361899520.0, + "grad_norm": 1.7823097394701026, + "language_loss": 0.83166265, + "learning_rate": 2.069894137075919e-06, + "loss": 0.85326076, + "num_input_tokens_seen": 180277825, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.13067627, + "step": 8386, + "time_per_iteration": 2.5552170276641846 + }, + { + "auxiliary_loss_clip": 0.01141696, + "auxiliary_loss_mlp": 0.01038078, + "balance_loss_clip": 1.0604167, + "balance_loss_mlp": 1.02384424, + "epoch": 0.5042537201262588, + "flos": 26286934320000.0, + "grad_norm": 1.8085683691272092, + "language_loss": 0.66995275, + "learning_rate": 2.0695049119686766e-06, + "loss": 0.69175047, + "num_input_tokens_seen": 180300465, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.14227295, + "step": 8387, + "time_per_iteration": 4.001327276229858 + }, + { + "auxiliary_loss_clip": 0.01141728, + "auxiliary_loss_mlp": 0.01037626, + "balance_loss_clip": 1.05980635, + "balance_loss_mlp": 1.02450681, + "epoch": 0.5043138433789268, + "flos": 22017371178240.0, + "grad_norm": 1.436253964445396, + "language_loss": 0.80159402, + "learning_rate": 2.0691156842258016e-06, + "loss": 0.8233875, + "num_input_tokens_seen": 180321050, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.13128662, + "step": 8388, + "time_per_iteration": 2.493648052215576 + }, + { + "auxiliary_loss_clip": 0.01139637, + "auxiliary_loss_mlp": 0.0103784, + "balance_loss_clip": 1.05978811, + "balance_loss_mlp": 1.02410722, + "epoch": 0.5043739666315947, + "flos": 28767104605440.0, + "grad_norm": 4.109398565081672, + "language_loss": 0.70178825, + "learning_rate": 2.0687264538620537e-06, + "loss": 0.72356296, + "num_input_tokens_seen": 180338870, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.137146, + "step": 8389, + "time_per_iteration": 2.5248515605926514 + }, + { + "auxiliary_loss_clip": 0.01123901, + "auxiliary_loss_mlp": 0.0103998, + "balance_loss_clip": 1.04414606, + "balance_loss_mlp": 1.02658105, + "epoch": 0.5044340898842627, + "flos": 27599253713280.0, + "grad_norm": 2.3243444049675577, + "language_loss": 0.69449067, + "learning_rate": 2.068337220892191e-06, + "loss": 0.71612948, + "num_input_tokens_seen": 180361285, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.1340332, + "step": 8390, + "time_per_iteration": 2.501178503036499 + }, + { + "auxiliary_loss_clip": 0.01060383, + "auxiliary_loss_mlp": 0.01004033, + "balance_loss_clip": 1.03414035, + "balance_loss_mlp": 1.00271392, + "epoch": 0.5044942131369307, + "flos": 67458050749440.0, + "grad_norm": 0.8086165696659382, + "language_loss": 0.52898502, + "learning_rate": 2.067947985330974e-06, + "loss": 0.54962921, + "num_input_tokens_seen": 180415170, + "router_z_loss_clip": 0.26220703, + "router_z_loss_mlp": 0.01318359, + "step": 8391, + "time_per_iteration": 2.887362241744995 + }, + { + "auxiliary_loss_clip": 0.01054682, + "auxiliary_loss_mlp": 0.01002714, + "balance_loss_clip": 1.02777755, + "balance_loss_mlp": 1.00144458, + "epoch": 0.5045543363895987, + "flos": 58630849390080.0, + "grad_norm": 0.8588905891459276, + "language_loss": 0.60725176, + "learning_rate": 2.0675587471931628e-06, + "loss": 0.62782562, + "num_input_tokens_seen": 180468060, + "router_z_loss_clip": 0.26904297, + "router_z_loss_mlp": 0.01269531, + "step": 8392, + "time_per_iteration": 2.9224255084991455 + }, + { + "auxiliary_loss_clip": 0.01120446, + "auxiliary_loss_mlp": 0.0103213, + "balance_loss_clip": 1.04484415, + "balance_loss_mlp": 1.01912999, + "epoch": 0.5046144596422667, + "flos": 22526620248960.0, + "grad_norm": 1.734108930074326, + "language_loss": 0.84418499, + "learning_rate": 2.067169506493517e-06, + "loss": 0.86571074, + "num_input_tokens_seen": 180486610, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.13012695, + "step": 8393, + "time_per_iteration": 2.4760663509368896 + }, + { + "auxiliary_loss_clip": 0.01133427, + "auxiliary_loss_mlp": 0.01034642, + "balance_loss_clip": 1.05321908, + "balance_loss_mlp": 1.02124882, + "epoch": 0.5046745828949346, + "flos": 27454246508160.0, + "grad_norm": 2.002537284991737, + "language_loss": 0.51276422, + "learning_rate": 2.0667802632467974e-06, + "loss": 0.53444493, + "num_input_tokens_seen": 180508135, + "router_z_loss_clip": 0.80273438, + "router_z_loss_mlp": 0.13397217, + "step": 8394, + "time_per_iteration": 2.5633599758148193 + }, + { + "auxiliary_loss_clip": 0.01129987, + "auxiliary_loss_mlp": 0.01033508, + "balance_loss_clip": 1.04954171, + "balance_loss_mlp": 1.01923859, + "epoch": 0.5047347061476026, + "flos": 17274541415040.0, + "grad_norm": 1.9118551992254003, + "language_loss": 0.75281334, + "learning_rate": 2.0663910174677627e-06, + "loss": 0.77444828, + "num_input_tokens_seen": 180527000, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.14282227, + "step": 8395, + "time_per_iteration": 2.477447032928467 + }, + { + "auxiliary_loss_clip": 0.01132314, + "auxiliary_loss_mlp": 0.01031046, + "balance_loss_clip": 1.05222011, + "balance_loss_mlp": 1.01716375, + "epoch": 0.5047948294002705, + "flos": 16649515831680.0, + "grad_norm": 1.8326928443817825, + "language_loss": 0.67976558, + "learning_rate": 2.0660017691711737e-06, + "loss": 0.70139921, + "num_input_tokens_seen": 180544715, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.13885498, + "step": 8396, + "time_per_iteration": 3.7908408641815186 + }, + { + "auxiliary_loss_clip": 0.01134762, + "auxiliary_loss_mlp": 0.0102917, + "balance_loss_clip": 1.05581141, + "balance_loss_mlp": 1.01645064, + "epoch": 0.5048549526529386, + "flos": 26865706164480.0, + "grad_norm": 1.7826493867945987, + "language_loss": 0.78600574, + "learning_rate": 2.065612518371792e-06, + "loss": 0.80764502, + "num_input_tokens_seen": 180565365, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.12731934, + "step": 8397, + "time_per_iteration": 2.5355732440948486 + }, + { + "auxiliary_loss_clip": 0.01128067, + "auxiliary_loss_mlp": 0.01029376, + "balance_loss_clip": 1.05049264, + "balance_loss_mlp": 1.01637042, + "epoch": 0.5049150759056065, + "flos": 21833939399040.0, + "grad_norm": 1.5863782870132466, + "language_loss": 0.66420484, + "learning_rate": 2.065223265084376e-06, + "loss": 0.68577927, + "num_input_tokens_seen": 180586670, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.13012695, + "step": 8398, + "time_per_iteration": 2.64228892326355 + }, + { + "auxiliary_loss_clip": 0.01132899, + "auxiliary_loss_mlp": 0.01031421, + "balance_loss_clip": 1.0532279, + "balance_loss_mlp": 1.01754546, + "epoch": 0.5049751991582745, + "flos": 21685807710720.0, + "grad_norm": 1.686838415090643, + "language_loss": 0.71657985, + "learning_rate": 2.064834009323688e-06, + "loss": 0.73822308, + "num_input_tokens_seen": 180605085, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.13891602, + "step": 8399, + "time_per_iteration": 2.4951975345611572 + }, + { + "auxiliary_loss_clip": 0.01139396, + "auxiliary_loss_mlp": 0.01045592, + "balance_loss_clip": 1.0575639, + "balance_loss_mlp": 1.03145409, + "epoch": 0.5050353224109424, + "flos": 21359379888000.0, + "grad_norm": 1.838266858861589, + "language_loss": 0.8187052, + "learning_rate": 2.0644447511044878e-06, + "loss": 0.84055507, + "num_input_tokens_seen": 180624370, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.14123535, + "step": 8400, + "time_per_iteration": 2.4901859760284424 + }, + { + "auxiliary_loss_clip": 0.01126155, + "auxiliary_loss_mlp": 0.01038593, + "balance_loss_clip": 1.04572153, + "balance_loss_mlp": 1.02450812, + "epoch": 0.5050954456636104, + "flos": 22820082364800.0, + "grad_norm": 1.8416174024632233, + "language_loss": 0.78946054, + "learning_rate": 2.0640554904415362e-06, + "loss": 0.81110811, + "num_input_tokens_seen": 180642450, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.14080811, + "step": 8401, + "time_per_iteration": 2.5016539096832275 + }, + { + "auxiliary_loss_clip": 0.01126786, + "auxiliary_loss_mlp": 0.0104078, + "balance_loss_clip": 1.04520261, + "balance_loss_mlp": 1.02533662, + "epoch": 0.5051555689162783, + "flos": 30448226891520.0, + "grad_norm": 1.7616373425521759, + "language_loss": 0.6987102, + "learning_rate": 2.063666227349593e-06, + "loss": 0.72038591, + "num_input_tokens_seen": 180665250, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.15435791, + "step": 8402, + "time_per_iteration": 2.5774521827697754 + }, + { + "auxiliary_loss_clip": 0.01124326, + "auxiliary_loss_mlp": 0.01035279, + "balance_loss_clip": 1.04345775, + "balance_loss_mlp": 1.02150464, + "epoch": 0.5052156921689464, + "flos": 21287953693440.0, + "grad_norm": 1.608957684796787, + "language_loss": 0.69772398, + "learning_rate": 2.063276961843422e-06, + "loss": 0.71932, + "num_input_tokens_seen": 180687425, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.13763428, + "step": 8403, + "time_per_iteration": 2.5156476497650146 + }, + { + "auxiliary_loss_clip": 0.01130649, + "auxiliary_loss_mlp": 0.01042387, + "balance_loss_clip": 1.05356658, + "balance_loss_mlp": 1.02903008, + "epoch": 0.5052758154216143, + "flos": 25081305298560.0, + "grad_norm": 1.4597519785617228, + "language_loss": 0.85826969, + "learning_rate": 2.062887693937781e-06, + "loss": 0.88000011, + "num_input_tokens_seen": 180708725, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.13354492, + "step": 8404, + "time_per_iteration": 2.5321896076202393 + }, + { + "auxiliary_loss_clip": 0.01126716, + "auxiliary_loss_mlp": 0.01046066, + "balance_loss_clip": 1.04641759, + "balance_loss_mlp": 1.02963924, + "epoch": 0.5053359386742823, + "flos": 20885502735360.0, + "grad_norm": 3.0306890162172775, + "language_loss": 0.75524294, + "learning_rate": 2.0624984236474322e-06, + "loss": 0.77697074, + "num_input_tokens_seen": 180727990, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.16430664, + "step": 8405, + "time_per_iteration": 2.5742435455322266 + }, + { + "auxiliary_loss_clip": 0.01134343, + "auxiliary_loss_mlp": 0.01029745, + "balance_loss_clip": 1.05269718, + "balance_loss_mlp": 1.01528478, + "epoch": 0.5053960619269503, + "flos": 37743335493120.0, + "grad_norm": 1.5773871032680615, + "language_loss": 0.73116589, + "learning_rate": 2.0621091509871378e-06, + "loss": 0.75280678, + "num_input_tokens_seen": 180749765, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.14465332, + "step": 8406, + "time_per_iteration": 2.6372475624084473 + }, + { + "auxiliary_loss_clip": 0.01123392, + "auxiliary_loss_mlp": 0.01032271, + "balance_loss_clip": 1.04852843, + "balance_loss_mlp": 1.01939034, + "epoch": 0.5054561851796182, + "flos": 23513840622720.0, + "grad_norm": 1.8553861681585686, + "language_loss": 0.76782334, + "learning_rate": 2.0617198759716568e-06, + "loss": 0.78938007, + "num_input_tokens_seen": 180769580, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.12860107, + "step": 8407, + "time_per_iteration": 2.476722002029419 + }, + { + "auxiliary_loss_clip": 0.01127292, + "auxiliary_loss_mlp": 0.01032127, + "balance_loss_clip": 1.04758394, + "balance_loss_mlp": 1.01943719, + "epoch": 0.5055163084322862, + "flos": 30410233280640.0, + "grad_norm": 1.7081055948243398, + "language_loss": 0.62749451, + "learning_rate": 2.0613305986157535e-06, + "loss": 0.64908868, + "num_input_tokens_seen": 180790295, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.1270752, + "step": 8408, + "time_per_iteration": 2.5527849197387695 + }, + { + "auxiliary_loss_clip": 0.01132657, + "auxiliary_loss_mlp": 0.01032048, + "balance_loss_clip": 1.05487168, + "balance_loss_mlp": 1.01818371, + "epoch": 0.5055764316849541, + "flos": 20259651139200.0, + "grad_norm": 1.7167095220909416, + "language_loss": 0.63652861, + "learning_rate": 2.0609413189341865e-06, + "loss": 0.65817565, + "num_input_tokens_seen": 180807875, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.13861084, + "step": 8409, + "time_per_iteration": 2.471212863922119 + }, + { + "auxiliary_loss_clip": 0.0112852, + "auxiliary_loss_mlp": 0.01025152, + "balance_loss_clip": 1.05181003, + "balance_loss_mlp": 1.01324332, + "epoch": 0.5056365549376222, + "flos": 26070895969920.0, + "grad_norm": 1.2980778213606143, + "language_loss": 0.70779943, + "learning_rate": 2.0605520369417193e-06, + "loss": 0.72933626, + "num_input_tokens_seen": 180831300, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.11907959, + "step": 8410, + "time_per_iteration": 2.5906786918640137 + }, + { + "auxiliary_loss_clip": 0.01126597, + "auxiliary_loss_mlp": 0.01037237, + "balance_loss_clip": 1.04604101, + "balance_loss_mlp": 1.02373064, + "epoch": 0.5056966781902901, + "flos": 19279074781440.0, + "grad_norm": 1.4632580599376988, + "language_loss": 0.79026806, + "learning_rate": 2.060162752653113e-06, + "loss": 0.8119064, + "num_input_tokens_seen": 180849055, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.13494873, + "step": 8411, + "time_per_iteration": 3.9504847526550293 + }, + { + "auxiliary_loss_clip": 0.0112878, + "auxiliary_loss_mlp": 0.01036922, + "balance_loss_clip": 1.04773581, + "balance_loss_mlp": 1.02171659, + "epoch": 0.5057568014429581, + "flos": 21323325611520.0, + "grad_norm": 1.7367426044838117, + "language_loss": 0.82148814, + "learning_rate": 2.0597734660831285e-06, + "loss": 0.84314519, + "num_input_tokens_seen": 180867395, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.15209961, + "step": 8412, + "time_per_iteration": 2.4446818828582764 + }, + { + "auxiliary_loss_clip": 0.01141769, + "auxiliary_loss_mlp": 0.01038288, + "balance_loss_clip": 1.06282425, + "balance_loss_mlp": 1.02480507, + "epoch": 0.505816924695626, + "flos": 17493596507520.0, + "grad_norm": 1.9214833022202913, + "language_loss": 0.80392885, + "learning_rate": 2.0593841772465283e-06, + "loss": 0.82572943, + "num_input_tokens_seen": 180886670, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.13482666, + "step": 8413, + "time_per_iteration": 2.555544137954712 + }, + { + "auxiliary_loss_clip": 0.01134794, + "auxiliary_loss_mlp": 0.01038924, + "balance_loss_clip": 1.05263996, + "balance_loss_mlp": 1.02433872, + "epoch": 0.505877047948294, + "flos": 21142084561920.0, + "grad_norm": 1.9416647414981314, + "language_loss": 0.80290008, + "learning_rate": 2.0589948861580737e-06, + "loss": 0.82463723, + "num_input_tokens_seen": 180904645, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.14581299, + "step": 8414, + "time_per_iteration": 2.4136524200439453 + }, + { + "auxiliary_loss_clip": 0.01126968, + "auxiliary_loss_mlp": 0.01029073, + "balance_loss_clip": 1.04695725, + "balance_loss_mlp": 1.01571, + "epoch": 0.5059371712009619, + "flos": 36350036887680.0, + "grad_norm": 2.049148391860121, + "language_loss": 0.61857432, + "learning_rate": 2.058605592832528e-06, + "loss": 0.64013475, + "num_input_tokens_seen": 180922340, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.13372803, + "step": 8415, + "time_per_iteration": 2.572084903717041 + }, + { + "auxiliary_loss_clip": 0.01127648, + "auxiliary_loss_mlp": 0.01031325, + "balance_loss_clip": 1.04919672, + "balance_loss_mlp": 1.01784873, + "epoch": 0.50599729445363, + "flos": 22673387220480.0, + "grad_norm": 1.6601308518051023, + "language_loss": 0.81923044, + "learning_rate": 2.0582162972846515e-06, + "loss": 0.84082019, + "num_input_tokens_seen": 180941350, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.13494873, + "step": 8416, + "time_per_iteration": 2.43000864982605 + }, + { + "auxiliary_loss_clip": 0.01127308, + "auxiliary_loss_mlp": 0.01047336, + "balance_loss_clip": 1.0490644, + "balance_loss_mlp": 1.03409815, + "epoch": 0.5060574177062979, + "flos": 22747866071040.0, + "grad_norm": 2.1899320940928617, + "language_loss": 0.79170543, + "learning_rate": 2.0578269995292078e-06, + "loss": 0.81345189, + "num_input_tokens_seen": 180960720, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.13232422, + "step": 8417, + "time_per_iteration": 2.489380359649658 + }, + { + "auxiliary_loss_clip": 0.01131576, + "auxiliary_loss_mlp": 0.01038614, + "balance_loss_clip": 1.05353272, + "balance_loss_mlp": 1.02601409, + "epoch": 0.5061175409589659, + "flos": 21653201139840.0, + "grad_norm": 1.6975081675747732, + "language_loss": 0.62599027, + "learning_rate": 2.0574376995809588e-06, + "loss": 0.6476922, + "num_input_tokens_seen": 180979725, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.1260376, + "step": 8418, + "time_per_iteration": 2.4304800033569336 + }, + { + "auxiliary_loss_clip": 0.01137133, + "auxiliary_loss_mlp": 0.01032, + "balance_loss_clip": 1.05683231, + "balance_loss_mlp": 1.0188396, + "epoch": 0.5061776642116339, + "flos": 21616249023360.0, + "grad_norm": 2.7502137911585116, + "language_loss": 0.77496624, + "learning_rate": 2.0570483974546653e-06, + "loss": 0.79665756, + "num_input_tokens_seen": 180998980, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.13165283, + "step": 8419, + "time_per_iteration": 2.587404489517212 + }, + { + "auxiliary_loss_clip": 0.01123488, + "auxiliary_loss_mlp": 0.01033782, + "balance_loss_clip": 1.04484963, + "balance_loss_mlp": 1.01995349, + "epoch": 0.5062377874643018, + "flos": 24426294837120.0, + "grad_norm": 2.1719379911737517, + "language_loss": 0.76788545, + "learning_rate": 2.0566590931650917e-06, + "loss": 0.78945816, + "num_input_tokens_seen": 181019165, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.13830566, + "step": 8420, + "time_per_iteration": 2.5106332302093506 + }, + { + "auxiliary_loss_clip": 0.0113455, + "auxiliary_loss_mlp": 0.01036132, + "balance_loss_clip": 1.0515902, + "balance_loss_mlp": 1.02216029, + "epoch": 0.5062979107169698, + "flos": 22524429519360.0, + "grad_norm": 1.705559479832767, + "language_loss": 0.77518213, + "learning_rate": 2.056269786726999e-06, + "loss": 0.79688889, + "num_input_tokens_seen": 181037110, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.13952637, + "step": 8421, + "time_per_iteration": 2.509265899658203 + }, + { + "auxiliary_loss_clip": 0.01124825, + "auxiliary_loss_mlp": 0.0102856, + "balance_loss_clip": 1.04720712, + "balance_loss_mlp": 1.0160377, + "epoch": 0.5063580339696377, + "flos": 24571984400640.0, + "grad_norm": 1.848854656575186, + "language_loss": 0.66773808, + "learning_rate": 2.0558804781551512e-06, + "loss": 0.68927193, + "num_input_tokens_seen": 181057775, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.12518311, + "step": 8422, + "time_per_iteration": 2.4656999111175537 + }, + { + "auxiliary_loss_clip": 0.01131039, + "auxiliary_loss_mlp": 0.01032393, + "balance_loss_clip": 1.05437112, + "balance_loss_mlp": 1.01930988, + "epoch": 0.5064181572223058, + "flos": 22596143022720.0, + "grad_norm": 1.7635867239533047, + "language_loss": 0.81817651, + "learning_rate": 2.05549116746431e-06, + "loss": 0.83981085, + "num_input_tokens_seen": 181078260, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.13098145, + "step": 8423, + "time_per_iteration": 2.5632736682891846 + }, + { + "auxiliary_loss_clip": 0.01130442, + "auxiliary_loss_mlp": 0.01033655, + "balance_loss_clip": 1.05020261, + "balance_loss_mlp": 1.01985073, + "epoch": 0.5064782804749737, + "flos": 25994944661760.0, + "grad_norm": 1.8327921401304559, + "language_loss": 0.74625343, + "learning_rate": 2.055101854669237e-06, + "loss": 0.76789439, + "num_input_tokens_seen": 181098755, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.13806152, + "step": 8424, + "time_per_iteration": 2.4640696048736572 + }, + { + "auxiliary_loss_clip": 0.01129632, + "auxiliary_loss_mlp": 0.0103613, + "balance_loss_clip": 1.05340779, + "balance_loss_mlp": 1.02247453, + "epoch": 0.5065384037276417, + "flos": 28553041503360.0, + "grad_norm": 1.4602068917785735, + "language_loss": 0.71337384, + "learning_rate": 2.0547125397846975e-06, + "loss": 0.73503149, + "num_input_tokens_seen": 181121570, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.13647461, + "step": 8425, + "time_per_iteration": 4.009516000747681 + }, + { + "auxiliary_loss_clip": 0.01125251, + "auxiliary_loss_mlp": 0.01037701, + "balance_loss_clip": 1.04811931, + "balance_loss_mlp": 1.02497506, + "epoch": 0.5065985269803096, + "flos": 22966023323520.0, + "grad_norm": 1.6750364743811703, + "language_loss": 0.78712785, + "learning_rate": 2.0543232228254524e-06, + "loss": 0.80875736, + "num_input_tokens_seen": 181140240, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.1272583, + "step": 8426, + "time_per_iteration": 2.4924535751342773 + }, + { + "auxiliary_loss_clip": 0.01133373, + "auxiliary_loss_mlp": 0.01037029, + "balance_loss_clip": 1.05470729, + "balance_loss_mlp": 1.02401114, + "epoch": 0.5066586502329776, + "flos": 21608563512960.0, + "grad_norm": 10.377773020838843, + "language_loss": 0.78085577, + "learning_rate": 2.053933903806265e-06, + "loss": 0.80255979, + "num_input_tokens_seen": 181158630, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.13031006, + "step": 8427, + "time_per_iteration": 2.5130038261413574 + }, + { + "auxiliary_loss_clip": 0.01120228, + "auxiliary_loss_mlp": 0.01028858, + "balance_loss_clip": 1.04642153, + "balance_loss_mlp": 1.01603699, + "epoch": 0.5067187734856455, + "flos": 20339912079360.0, + "grad_norm": 1.6876373525079087, + "language_loss": 0.71805799, + "learning_rate": 2.0535445827418997e-06, + "loss": 0.73954892, + "num_input_tokens_seen": 181176405, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.12817383, + "step": 8428, + "time_per_iteration": 2.474255084991455 + }, + { + "auxiliary_loss_clip": 0.01129588, + "auxiliary_loss_mlp": 0.01031543, + "balance_loss_clip": 1.05197346, + "balance_loss_mlp": 1.0192585, + "epoch": 0.5067788967383136, + "flos": 28841080665600.0, + "grad_norm": 1.619013922058978, + "language_loss": 0.82820988, + "learning_rate": 2.0531552596471168e-06, + "loss": 0.84982109, + "num_input_tokens_seen": 181197595, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.12286377, + "step": 8429, + "time_per_iteration": 2.564723014831543 + }, + { + "auxiliary_loss_clip": 0.01141074, + "auxiliary_loss_mlp": 0.01036157, + "balance_loss_clip": 1.05891347, + "balance_loss_mlp": 1.02154195, + "epoch": 0.5068390199909815, + "flos": 32450174478720.0, + "grad_norm": 1.832684898072695, + "language_loss": 0.73481476, + "learning_rate": 2.052765934536682e-06, + "loss": 0.75658703, + "num_input_tokens_seen": 181218560, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.14624023, + "step": 8430, + "time_per_iteration": 2.5660018920898438 + }, + { + "auxiliary_loss_clip": 0.01126001, + "auxiliary_loss_mlp": 0.01033359, + "balance_loss_clip": 1.04835677, + "balance_loss_mlp": 1.02062726, + "epoch": 0.5068991432436495, + "flos": 23146582014720.0, + "grad_norm": 1.5790146949691959, + "language_loss": 0.76734829, + "learning_rate": 2.0523766074253575e-06, + "loss": 0.78894186, + "num_input_tokens_seen": 181237095, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.1272583, + "step": 8431, + "time_per_iteration": 2.479607105255127 + }, + { + "auxiliary_loss_clip": 0.01124803, + "auxiliary_loss_mlp": 0.010321, + "balance_loss_clip": 1.04925418, + "balance_loss_mlp": 1.01905823, + "epoch": 0.5069592664963174, + "flos": 19936096404480.0, + "grad_norm": 1.4384513472803675, + "language_loss": 0.72420144, + "learning_rate": 2.0519872783279074e-06, + "loss": 0.74577045, + "num_input_tokens_seen": 181255940, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.13024902, + "step": 8432, + "time_per_iteration": 3.9211032390594482 + }, + { + "auxiliary_loss_clip": 0.01068795, + "auxiliary_loss_mlp": 0.01020005, + "balance_loss_clip": 1.04160166, + "balance_loss_mlp": 1.01846278, + "epoch": 0.5070193897489854, + "flos": 65793771941760.0, + "grad_norm": 0.7591906606893692, + "language_loss": 0.63592672, + "learning_rate": 2.0515979472590945e-06, + "loss": 0.65681469, + "num_input_tokens_seen": 181316945, + "router_z_loss_clip": 0.27197266, + "router_z_loss_mlp": 0.01542664, + "step": 8433, + "time_per_iteration": 3.1001229286193848 + }, + { + "auxiliary_loss_clip": 0.0113555, + "auxiliary_loss_mlp": 0.01039995, + "balance_loss_clip": 1.05685544, + "balance_loss_mlp": 1.02686954, + "epoch": 0.5070795130016534, + "flos": 17275331514240.0, + "grad_norm": 3.165227794769459, + "language_loss": 0.77561408, + "learning_rate": 2.051208614233681e-06, + "loss": 0.79736948, + "num_input_tokens_seen": 181335555, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.13140869, + "step": 8434, + "time_per_iteration": 2.4767260551452637 + }, + { + "auxiliary_loss_clip": 0.0112714, + "auxiliary_loss_mlp": 0.01035218, + "balance_loss_clip": 1.04872727, + "balance_loss_mlp": 1.02256966, + "epoch": 0.5071396362543213, + "flos": 21069940095360.0, + "grad_norm": 1.7425107907022865, + "language_loss": 0.70937657, + "learning_rate": 2.0508192792664326e-06, + "loss": 0.73100019, + "num_input_tokens_seen": 181354580, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.12652588, + "step": 8435, + "time_per_iteration": 2.4722607135772705 + }, + { + "auxiliary_loss_clip": 0.01126489, + "auxiliary_loss_mlp": 0.01042444, + "balance_loss_clip": 1.04710889, + "balance_loss_mlp": 1.02768588, + "epoch": 0.5071997595069894, + "flos": 23144822248320.0, + "grad_norm": 1.8258875767285927, + "language_loss": 0.72391641, + "learning_rate": 2.050429942372112e-06, + "loss": 0.74560571, + "num_input_tokens_seen": 181374320, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.14758301, + "step": 8436, + "time_per_iteration": 2.484741687774658 + }, + { + "auxiliary_loss_clip": 0.01122305, + "auxiliary_loss_mlp": 0.01036972, + "balance_loss_clip": 1.04491043, + "balance_loss_mlp": 1.02273822, + "epoch": 0.5072598827596573, + "flos": 22747183712640.0, + "grad_norm": 1.474098686243099, + "language_loss": 0.83825362, + "learning_rate": 2.050040603565483e-06, + "loss": 0.85984641, + "num_input_tokens_seen": 181392190, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.14245605, + "step": 8437, + "time_per_iteration": 2.4439258575439453 + }, + { + "auxiliary_loss_clip": 0.01129881, + "auxiliary_loss_mlp": 0.01031126, + "balance_loss_clip": 1.05165553, + "balance_loss_mlp": 1.01800728, + "epoch": 0.5073200060123253, + "flos": 22566301799040.0, + "grad_norm": 1.4038470358770538, + "language_loss": 0.80717635, + "learning_rate": 2.049651262861309e-06, + "loss": 0.82878649, + "num_input_tokens_seen": 181413890, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.13110352, + "step": 8438, + "time_per_iteration": 2.5020220279693604 + }, + { + "auxiliary_loss_clip": 0.01139473, + "auxiliary_loss_mlp": 0.01036379, + "balance_loss_clip": 1.06019235, + "balance_loss_mlp": 1.02157891, + "epoch": 0.5073801292649932, + "flos": 25806341324160.0, + "grad_norm": 1.948639425955923, + "language_loss": 0.79702377, + "learning_rate": 2.0492619202743543e-06, + "loss": 0.81878227, + "num_input_tokens_seen": 181433240, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.14819336, + "step": 8439, + "time_per_iteration": 2.474821090698242 + }, + { + "auxiliary_loss_clip": 0.01131344, + "auxiliary_loss_mlp": 0.0103216, + "balance_loss_clip": 1.05527544, + "balance_loss_mlp": 1.02031064, + "epoch": 0.5074402525176612, + "flos": 25373941401600.0, + "grad_norm": 1.5658679034675325, + "language_loss": 0.71255392, + "learning_rate": 2.048872575819383e-06, + "loss": 0.73418897, + "num_input_tokens_seen": 181453535, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.11853027, + "step": 8440, + "time_per_iteration": 3.8762319087982178 + }, + { + "auxiliary_loss_clip": 0.0112861, + "auxiliary_loss_mlp": 0.01033126, + "balance_loss_clip": 1.05109143, + "balance_loss_mlp": 1.02024531, + "epoch": 0.5075003757703291, + "flos": 26064431521920.0, + "grad_norm": 1.680027994319538, + "language_loss": 0.71238405, + "learning_rate": 2.048483229511158e-06, + "loss": 0.7340014, + "num_input_tokens_seen": 181474195, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.12902832, + "step": 8441, + "time_per_iteration": 2.4966700077056885 + }, + { + "auxiliary_loss_clip": 0.01130766, + "auxiliary_loss_mlp": 0.01041065, + "balance_loss_clip": 1.04870152, + "balance_loss_mlp": 1.02671266, + "epoch": 0.5075604990229972, + "flos": 21835447770240.0, + "grad_norm": 1.6619946149529936, + "language_loss": 0.64013004, + "learning_rate": 2.0480938813644445e-06, + "loss": 0.66184843, + "num_input_tokens_seen": 181494000, + "router_z_loss_clip": 0.81982422, + "router_z_loss_mlp": 0.14355469, + "step": 8442, + "time_per_iteration": 2.4144680500030518 + }, + { + "auxiliary_loss_clip": 0.01124338, + "auxiliary_loss_mlp": 0.01029632, + "balance_loss_clip": 1.049577, + "balance_loss_mlp": 1.01749671, + "epoch": 0.5076206222756651, + "flos": 31978703537280.0, + "grad_norm": 1.4882345992932455, + "language_loss": 0.71613497, + "learning_rate": 2.047704531394006e-06, + "loss": 0.73767471, + "num_input_tokens_seen": 181515955, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.12145996, + "step": 8443, + "time_per_iteration": 2.6089978218078613 + }, + { + "auxiliary_loss_clip": 0.0113509, + "auxiliary_loss_mlp": 0.01036224, + "balance_loss_clip": 1.05421519, + "balance_loss_mlp": 1.02182364, + "epoch": 0.5076807455283331, + "flos": 36904031326080.0, + "grad_norm": 1.2899378857129873, + "language_loss": 0.62106615, + "learning_rate": 2.047315179614607e-06, + "loss": 0.64277929, + "num_input_tokens_seen": 181540225, + "router_z_loss_clip": 0.80957031, + "router_z_loss_mlp": 0.14398193, + "step": 8444, + "time_per_iteration": 2.5747485160827637 + }, + { + "auxiliary_loss_clip": 0.01127411, + "auxiliary_loss_mlp": 0.01028338, + "balance_loss_clip": 1.04967511, + "balance_loss_mlp": 1.01537418, + "epoch": 0.507740868781001, + "flos": 29862415981440.0, + "grad_norm": 1.6204212199206507, + "language_loss": 0.63770807, + "learning_rate": 2.046925826041012e-06, + "loss": 0.65926564, + "num_input_tokens_seen": 181560125, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.12957764, + "step": 8445, + "time_per_iteration": 2.5352773666381836 + }, + { + "auxiliary_loss_clip": 0.01067351, + "auxiliary_loss_mlp": 0.01004067, + "balance_loss_clip": 1.04109347, + "balance_loss_mlp": 1.00279784, + "epoch": 0.507800992033669, + "flos": 61918974247680.0, + "grad_norm": 0.8289176159179502, + "language_loss": 0.61938566, + "learning_rate": 2.0465364706879845e-06, + "loss": 0.64009988, + "num_input_tokens_seen": 181618830, + "router_z_loss_clip": 0.26318359, + "router_z_loss_mlp": 0.01268005, + "step": 8446, + "time_per_iteration": 3.0882208347320557 + }, + { + "auxiliary_loss_clip": 0.01135579, + "auxiliary_loss_mlp": 0.01031347, + "balance_loss_clip": 1.0587244, + "balance_loss_mlp": 1.01857352, + "epoch": 0.507861115286337, + "flos": 20700490757760.0, + "grad_norm": 1.4909110535506132, + "language_loss": 0.80414581, + "learning_rate": 2.04614711357029e-06, + "loss": 0.82581514, + "num_input_tokens_seen": 181637120, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.12768555, + "step": 8447, + "time_per_iteration": 2.4750757217407227 + }, + { + "auxiliary_loss_clip": 0.01136386, + "auxiliary_loss_mlp": 0.010341, + "balance_loss_clip": 1.0594542, + "balance_loss_mlp": 1.02096272, + "epoch": 0.507921238539005, + "flos": 30847050576000.0, + "grad_norm": 1.3655040781983512, + "language_loss": 0.70320064, + "learning_rate": 2.0457577547026916e-06, + "loss": 0.72490549, + "num_input_tokens_seen": 181659965, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.13140869, + "step": 8448, + "time_per_iteration": 2.536637783050537 + }, + { + "auxiliary_loss_clip": 0.01123888, + "auxiliary_loss_mlp": 0.01039767, + "balance_loss_clip": 1.04726601, + "balance_loss_mlp": 1.02593851, + "epoch": 0.507981361791673, + "flos": 35700197984640.0, + "grad_norm": 1.371493580028031, + "language_loss": 0.72092521, + "learning_rate": 2.045368394099955e-06, + "loss": 0.74256182, + "num_input_tokens_seen": 181685290, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.13830566, + "step": 8449, + "time_per_iteration": 2.6004855632781982 + }, + { + "auxiliary_loss_clip": 0.01123154, + "auxiliary_loss_mlp": 0.01031089, + "balance_loss_clip": 1.04503679, + "balance_loss_mlp": 1.01813662, + "epoch": 0.5080414850443409, + "flos": 27161466750720.0, + "grad_norm": 1.4321162905878817, + "language_loss": 0.72906005, + "learning_rate": 2.044979031776844e-06, + "loss": 0.75060248, + "num_input_tokens_seen": 181706080, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.1295166, + "step": 8450, + "time_per_iteration": 2.4809508323669434 + }, + { + "auxiliary_loss_clip": 0.01131063, + "auxiliary_loss_mlp": 0.01031237, + "balance_loss_clip": 1.05203688, + "balance_loss_mlp": 1.01810026, + "epoch": 0.5081016082970089, + "flos": 27085192220160.0, + "grad_norm": 1.6550722940175753, + "language_loss": 0.76956153, + "learning_rate": 2.0445896677481234e-06, + "loss": 0.79118454, + "num_input_tokens_seen": 181724805, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.13134766, + "step": 8451, + "time_per_iteration": 2.555453062057495 + }, + { + "auxiliary_loss_clip": 0.01127365, + "auxiliary_loss_mlp": 0.01039527, + "balance_loss_clip": 1.04967463, + "balance_loss_mlp": 1.02702141, + "epoch": 0.5081617315496768, + "flos": 22856531690880.0, + "grad_norm": 1.681811348703354, + "language_loss": 0.84885299, + "learning_rate": 2.044200302028559e-06, + "loss": 0.87052196, + "num_input_tokens_seen": 181743725, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.12512207, + "step": 8452, + "time_per_iteration": 2.5155606269836426 + }, + { + "auxiliary_loss_clip": 0.01133117, + "auxiliary_loss_mlp": 0.01037293, + "balance_loss_clip": 1.05063379, + "balance_loss_mlp": 1.02141428, + "epoch": 0.5082218548023448, + "flos": 16281898087680.0, + "grad_norm": 3.1182782701150304, + "language_loss": 0.77849591, + "learning_rate": 2.0438109346329143e-06, + "loss": 0.80019999, + "num_input_tokens_seen": 181757720, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.15881348, + "step": 8453, + "time_per_iteration": 2.470731496810913 + }, + { + "auxiliary_loss_clip": 0.0113647, + "auxiliary_loss_mlp": 0.01035572, + "balance_loss_clip": 1.06084871, + "balance_loss_mlp": 1.02342498, + "epoch": 0.5082819780550127, + "flos": 24460768915200.0, + "grad_norm": 1.988098238353446, + "language_loss": 0.76534164, + "learning_rate": 2.0434215655759544e-06, + "loss": 0.78706205, + "num_input_tokens_seen": 181778545, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12158203, + "step": 8454, + "time_per_iteration": 3.987852096557617 + }, + { + "auxiliary_loss_clip": 0.01129486, + "auxiliary_loss_mlp": 0.01034799, + "balance_loss_clip": 1.05176997, + "balance_loss_mlp": 1.02162075, + "epoch": 0.5083421013076808, + "flos": 23403271582080.0, + "grad_norm": 1.5567690137258248, + "language_loss": 0.89347154, + "learning_rate": 2.0430321948724446e-06, + "loss": 0.9151144, + "num_input_tokens_seen": 181799495, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.13171387, + "step": 8455, + "time_per_iteration": 2.4724626541137695 + }, + { + "auxiliary_loss_clip": 0.01131177, + "auxiliary_loss_mlp": 0.01035406, + "balance_loss_clip": 1.04909074, + "balance_loss_mlp": 1.0206418, + "epoch": 0.5084022245603487, + "flos": 23872695448320.0, + "grad_norm": 1.7626400229567538, + "language_loss": 0.62602913, + "learning_rate": 2.042642822537149e-06, + "loss": 0.64769495, + "num_input_tokens_seen": 181818400, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.14758301, + "step": 8456, + "time_per_iteration": 2.4922499656677246 + }, + { + "auxiliary_loss_clip": 0.01070348, + "auxiliary_loss_mlp": 0.01006853, + "balance_loss_clip": 1.04277849, + "balance_loss_mlp": 1.00554156, + "epoch": 0.5084623478130167, + "flos": 62873336655360.0, + "grad_norm": 0.8156361674290918, + "language_loss": 0.62425214, + "learning_rate": 2.0422534485848343e-06, + "loss": 0.64502412, + "num_input_tokens_seen": 181875975, + "router_z_loss_clip": 0.27636719, + "router_z_loss_mlp": 0.01312256, + "step": 8457, + "time_per_iteration": 3.0001115798950195 + }, + { + "auxiliary_loss_clip": 0.01127824, + "auxiliary_loss_mlp": 0.0103352, + "balance_loss_clip": 1.0491612, + "balance_loss_mlp": 1.01902378, + "epoch": 0.5085224710656846, + "flos": 22346133384960.0, + "grad_norm": 1.858864074448887, + "language_loss": 0.67555106, + "learning_rate": 2.0418640730302644e-06, + "loss": 0.69716454, + "num_input_tokens_seen": 181896450, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.14477539, + "step": 8458, + "time_per_iteration": 2.685957193374634 + }, + { + "auxiliary_loss_clip": 0.01124706, + "auxiliary_loss_mlp": 0.01033468, + "balance_loss_clip": 1.04525352, + "balance_loss_mlp": 1.01957977, + "epoch": 0.5085825943183526, + "flos": 26066263115520.0, + "grad_norm": 1.8040490744326212, + "language_loss": 0.77542108, + "learning_rate": 2.0414746958882043e-06, + "loss": 0.79700279, + "num_input_tokens_seen": 181916770, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.13885498, + "step": 8459, + "time_per_iteration": 2.482226610183716 + }, + { + "auxiliary_loss_clip": 0.01142772, + "auxiliary_loss_mlp": 0.01035082, + "balance_loss_clip": 1.05982029, + "balance_loss_mlp": 1.02142632, + "epoch": 0.5086427175710206, + "flos": 17420733768960.0, + "grad_norm": 1.99326120841027, + "language_loss": 0.80887544, + "learning_rate": 2.0410853171734196e-06, + "loss": 0.83065403, + "num_input_tokens_seen": 181932710, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.13641357, + "step": 8460, + "time_per_iteration": 2.46509051322937 + }, + { + "auxiliary_loss_clip": 0.01134577, + "auxiliary_loss_mlp": 0.01037084, + "balance_loss_clip": 1.05351841, + "balance_loss_mlp": 1.02428746, + "epoch": 0.5087028408236886, + "flos": 20631758083200.0, + "grad_norm": 2.2389289044890965, + "language_loss": 0.6881299, + "learning_rate": 2.0406959369006754e-06, + "loss": 0.70984656, + "num_input_tokens_seen": 181950665, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.12799072, + "step": 8461, + "time_per_iteration": 2.4210493564605713 + }, + { + "auxiliary_loss_clip": 0.01136242, + "auxiliary_loss_mlp": 0.01033407, + "balance_loss_clip": 1.05882049, + "balance_loss_mlp": 1.02046132, + "epoch": 0.5087629640763566, + "flos": 25593822506880.0, + "grad_norm": 1.7682266848653327, + "language_loss": 0.75896657, + "learning_rate": 2.0403065550847375e-06, + "loss": 0.78066301, + "num_input_tokens_seen": 181971270, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.12927246, + "step": 8462, + "time_per_iteration": 2.515864610671997 + }, + { + "auxiliary_loss_clip": 0.01135057, + "auxiliary_loss_mlp": 0.01033957, + "balance_loss_clip": 1.05717778, + "balance_loss_mlp": 1.02113628, + "epoch": 0.5088230873290245, + "flos": 13261631927040.0, + "grad_norm": 2.410937975838604, + "language_loss": 0.80987018, + "learning_rate": 2.0399171717403706e-06, + "loss": 0.83156025, + "num_input_tokens_seen": 181988410, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.12811279, + "step": 8463, + "time_per_iteration": 2.446152687072754 + }, + { + "auxiliary_loss_clip": 0.01121295, + "auxiliary_loss_mlp": 0.0103818, + "balance_loss_clip": 1.04488182, + "balance_loss_mlp": 1.02580023, + "epoch": 0.5088832105816925, + "flos": 20043469134720.0, + "grad_norm": 1.7331538433197362, + "language_loss": 0.76579875, + "learning_rate": 2.039527786882341e-06, + "loss": 0.78739345, + "num_input_tokens_seen": 182006530, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.12390137, + "step": 8464, + "time_per_iteration": 2.5073020458221436 + }, + { + "auxiliary_loss_clip": 0.01066371, + "auxiliary_loss_mlp": 0.01008333, + "balance_loss_clip": 1.04061913, + "balance_loss_mlp": 1.007056, + "epoch": 0.5089433338343604, + "flos": 67422179018880.0, + "grad_norm": 0.6876918629162594, + "language_loss": 0.59367061, + "learning_rate": 2.0391384005254133e-06, + "loss": 0.61441767, + "num_input_tokens_seen": 182074240, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.01278687, + "step": 8465, + "time_per_iteration": 3.199375867843628 + }, + { + "auxiliary_loss_clip": 0.01130626, + "auxiliary_loss_mlp": 0.01034298, + "balance_loss_clip": 1.05176032, + "balance_loss_mlp": 1.02072036, + "epoch": 0.5090034570870284, + "flos": 22710339336960.0, + "grad_norm": 1.9622587180719782, + "language_loss": 0.8003273, + "learning_rate": 2.038749012684354e-06, + "loss": 0.82197654, + "num_input_tokens_seen": 182093360, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.13562012, + "step": 8466, + "time_per_iteration": 2.5369722843170166 + }, + { + "auxiliary_loss_clip": 0.01127456, + "auxiliary_loss_mlp": 0.01030622, + "balance_loss_clip": 1.0506953, + "balance_loss_mlp": 1.01734257, + "epoch": 0.5090635803396963, + "flos": 20445812352000.0, + "grad_norm": 2.970715034084036, + "language_loss": 0.78606725, + "learning_rate": 2.0383596233739286e-06, + "loss": 0.80764806, + "num_input_tokens_seen": 182110170, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.13262939, + "step": 8467, + "time_per_iteration": 2.4748129844665527 + }, + { + "auxiliary_loss_clip": 0.01128144, + "auxiliary_loss_mlp": 0.01033459, + "balance_loss_clip": 1.05214822, + "balance_loss_mlp": 1.02159202, + "epoch": 0.5091237035923644, + "flos": 23768878164480.0, + "grad_norm": 2.008585655893127, + "language_loss": 0.744295, + "learning_rate": 2.0379702326089013e-06, + "loss": 0.76591098, + "num_input_tokens_seen": 182129570, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.11871338, + "step": 8468, + "time_per_iteration": 2.5886619091033936 + }, + { + "auxiliary_loss_clip": 0.01126609, + "auxiliary_loss_mlp": 0.01032306, + "balance_loss_clip": 1.04836273, + "balance_loss_mlp": 1.01959813, + "epoch": 0.5091838268450323, + "flos": 18327908684160.0, + "grad_norm": 2.459886913749417, + "language_loss": 0.77866054, + "learning_rate": 2.03758084040404e-06, + "loss": 0.8002497, + "num_input_tokens_seen": 182147565, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.1270752, + "step": 8469, + "time_per_iteration": 2.446408987045288 + }, + { + "auxiliary_loss_clip": 0.01129922, + "auxiliary_loss_mlp": 0.01036837, + "balance_loss_clip": 1.05186689, + "balance_loss_mlp": 1.02255023, + "epoch": 0.5092439500977003, + "flos": 29057621806080.0, + "grad_norm": 2.0588275758540204, + "language_loss": 0.697124, + "learning_rate": 2.037191446774109e-06, + "loss": 0.7187916, + "num_input_tokens_seen": 182169695, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.1428833, + "step": 8470, + "time_per_iteration": 3.964463233947754 + }, + { + "auxiliary_loss_clip": 0.01135888, + "auxiliary_loss_mlp": 0.01035178, + "balance_loss_clip": 1.05110669, + "balance_loss_mlp": 1.02109909, + "epoch": 0.5093040733503682, + "flos": 13553908894080.0, + "grad_norm": 2.6768789984047836, + "language_loss": 0.73386633, + "learning_rate": 2.0368020517338745e-06, + "loss": 0.75557697, + "num_input_tokens_seen": 182186385, + "router_z_loss_clip": 0.84716797, + "router_z_loss_mlp": 0.14068604, + "step": 8471, + "time_per_iteration": 2.444011688232422 + }, + { + "auxiliary_loss_clip": 0.01060393, + "auxiliary_loss_mlp": 0.01008214, + "balance_loss_clip": 1.0337851, + "balance_loss_mlp": 1.00694871, + "epoch": 0.5093641966030362, + "flos": 68906617407360.0, + "grad_norm": 0.7488030057518305, + "language_loss": 0.58111215, + "learning_rate": 2.036412655298103e-06, + "loss": 0.60179824, + "num_input_tokens_seen": 182247095, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.01264954, + "step": 8472, + "time_per_iteration": 3.063809871673584 + }, + { + "auxiliary_loss_clip": 0.01133836, + "auxiliary_loss_mlp": 0.01039309, + "balance_loss_clip": 1.05442321, + "balance_loss_mlp": 1.0269649, + "epoch": 0.5094243198557042, + "flos": 21580948932480.0, + "grad_norm": 2.3247973381128606, + "language_loss": 0.69296205, + "learning_rate": 2.03602325748156e-06, + "loss": 0.71469355, + "num_input_tokens_seen": 182266380, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.12347412, + "step": 8473, + "time_per_iteration": 2.4972341060638428 + }, + { + "auxiliary_loss_clip": 0.01128079, + "auxiliary_loss_mlp": 0.01039279, + "balance_loss_clip": 1.05148494, + "balance_loss_mlp": 1.02622604, + "epoch": 0.5094844431083722, + "flos": 28840721529600.0, + "grad_norm": 1.959835686992611, + "language_loss": 0.85026956, + "learning_rate": 2.0356338582990105e-06, + "loss": 0.87194312, + "num_input_tokens_seen": 182284685, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.1305542, + "step": 8474, + "time_per_iteration": 2.5507102012634277 + }, + { + "auxiliary_loss_clip": 0.01130436, + "auxiliary_loss_mlp": 0.01035493, + "balance_loss_clip": 1.0510658, + "balance_loss_mlp": 1.02207017, + "epoch": 0.5095445663610402, + "flos": 14976114969600.0, + "grad_norm": 2.0902265512832376, + "language_loss": 0.65207279, + "learning_rate": 2.035244457765222e-06, + "loss": 0.67373204, + "num_input_tokens_seen": 182301810, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.13415527, + "step": 8475, + "time_per_iteration": 3.8553214073181152 + }, + { + "auxiliary_loss_clip": 0.01134475, + "auxiliary_loss_mlp": 0.01040469, + "balance_loss_clip": 1.05193424, + "balance_loss_mlp": 1.02616966, + "epoch": 0.5096046896137081, + "flos": 20777088510720.0, + "grad_norm": 2.4370278052641683, + "language_loss": 0.82135588, + "learning_rate": 2.0348550558949605e-06, + "loss": 0.84310532, + "num_input_tokens_seen": 182320285, + "router_z_loss_clip": 0.82568359, + "router_z_loss_mlp": 0.14306641, + "step": 8476, + "time_per_iteration": 2.5025413036346436 + }, + { + "auxiliary_loss_clip": 0.01137932, + "auxiliary_loss_mlp": 0.01042456, + "balance_loss_clip": 1.05673838, + "balance_loss_mlp": 1.02598083, + "epoch": 0.5096648128663761, + "flos": 23185078416000.0, + "grad_norm": 5.799100520455098, + "language_loss": 0.80626947, + "learning_rate": 2.0344656527029917e-06, + "loss": 0.82807326, + "num_input_tokens_seen": 182339465, + "router_z_loss_clip": 0.81347656, + "router_z_loss_mlp": 0.16479492, + "step": 8477, + "time_per_iteration": 2.463128089904785 + }, + { + "auxiliary_loss_clip": 0.01127418, + "auxiliary_loss_mlp": 0.01029863, + "balance_loss_clip": 1.04758716, + "balance_loss_mlp": 1.01484251, + "epoch": 0.509724936119044, + "flos": 22309432663680.0, + "grad_norm": 2.27414905895629, + "language_loss": 0.61373121, + "learning_rate": 2.034076248204082e-06, + "loss": 0.63530397, + "num_input_tokens_seen": 182358375, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.15008545, + "step": 8478, + "time_per_iteration": 2.529557228088379 + }, + { + "auxiliary_loss_clip": 0.01121537, + "auxiliary_loss_mlp": 0.01038253, + "balance_loss_clip": 1.04464877, + "balance_loss_mlp": 1.0257833, + "epoch": 0.509785059371712, + "flos": 26287077974400.0, + "grad_norm": 1.7633444626880668, + "language_loss": 0.65839958, + "learning_rate": 2.0336868424129968e-06, + "loss": 0.67999756, + "num_input_tokens_seen": 182377935, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.12475586, + "step": 8479, + "time_per_iteration": 2.586143970489502 + }, + { + "auxiliary_loss_clip": 0.01122556, + "auxiliary_loss_mlp": 0.01034472, + "balance_loss_clip": 1.04551935, + "balance_loss_mlp": 1.02115667, + "epoch": 0.50984518262438, + "flos": 22964586779520.0, + "grad_norm": 1.4930962789010185, + "language_loss": 0.69254506, + "learning_rate": 2.0332974353445037e-06, + "loss": 0.71411532, + "num_input_tokens_seen": 182396440, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.13311768, + "step": 8480, + "time_per_iteration": 2.5391838550567627 + }, + { + "auxiliary_loss_clip": 0.011349, + "auxiliary_loss_mlp": 0.01030067, + "balance_loss_clip": 1.0530355, + "balance_loss_mlp": 1.01679945, + "epoch": 0.509905305877048, + "flos": 26213389223040.0, + "grad_norm": 1.8134921959656511, + "language_loss": 0.79402077, + "learning_rate": 2.0329080270133688e-06, + "loss": 0.81567049, + "num_input_tokens_seen": 182415890, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.13256836, + "step": 8481, + "time_per_iteration": 2.5153067111968994 + }, + { + "auxiliary_loss_clip": 0.01123122, + "auxiliary_loss_mlp": 0.01037542, + "balance_loss_clip": 1.04706466, + "balance_loss_mlp": 1.02376175, + "epoch": 0.5099654291297159, + "flos": 20340055733760.0, + "grad_norm": 1.7861429930404928, + "language_loss": 0.83397561, + "learning_rate": 2.0325186174343578e-06, + "loss": 0.85558224, + "num_input_tokens_seen": 182434235, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.13763428, + "step": 8482, + "time_per_iteration": 2.478649139404297 + }, + { + "auxiliary_loss_clip": 0.01135596, + "auxiliary_loss_mlp": 0.0103699, + "balance_loss_clip": 1.05322397, + "balance_loss_mlp": 1.02354908, + "epoch": 0.5100255523823839, + "flos": 29054820545280.0, + "grad_norm": 1.6519303546107904, + "language_loss": 0.85454035, + "learning_rate": 2.032129206622238e-06, + "loss": 0.87626612, + "num_input_tokens_seen": 182454360, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.13433838, + "step": 8483, + "time_per_iteration": 2.5193896293640137 + }, + { + "auxiliary_loss_clip": 0.01120967, + "auxiliary_loss_mlp": 0.0103164, + "balance_loss_clip": 1.04345834, + "balance_loss_mlp": 1.01871181, + "epoch": 0.5100856756350518, + "flos": 22455912326400.0, + "grad_norm": 1.9856086800187764, + "language_loss": 0.82971227, + "learning_rate": 2.031739794591775e-06, + "loss": 0.85123831, + "num_input_tokens_seen": 182471940, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.12921143, + "step": 8484, + "time_per_iteration": 3.8686273097991943 + }, + { + "auxiliary_loss_clip": 0.01132283, + "auxiliary_loss_mlp": 0.01032346, + "balance_loss_clip": 1.05174613, + "balance_loss_mlp": 1.01863062, + "epoch": 0.5101457988877198, + "flos": 19171055606400.0, + "grad_norm": 1.8523004255137085, + "language_loss": 0.8145898, + "learning_rate": 2.031350381357736e-06, + "loss": 0.83623606, + "num_input_tokens_seen": 182490685, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.137146, + "step": 8485, + "time_per_iteration": 2.5088531970977783 + }, + { + "auxiliary_loss_clip": 0.01121391, + "auxiliary_loss_mlp": 0.01034598, + "balance_loss_clip": 1.04493213, + "balance_loss_mlp": 1.02141976, + "epoch": 0.5102059221403878, + "flos": 14866371941760.0, + "grad_norm": 1.8683628019151932, + "language_loss": 0.74082088, + "learning_rate": 2.0309609669348874e-06, + "loss": 0.76238084, + "num_input_tokens_seen": 182508325, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.13171387, + "step": 8486, + "time_per_iteration": 2.4143006801605225 + }, + { + "auxiliary_loss_clip": 0.01140818, + "auxiliary_loss_mlp": 0.01037375, + "balance_loss_clip": 1.05390394, + "balance_loss_mlp": 1.02321267, + "epoch": 0.5102660453930558, + "flos": 22961103160320.0, + "grad_norm": 1.6018163443056634, + "language_loss": 0.70042253, + "learning_rate": 2.0305715513379953e-06, + "loss": 0.72220445, + "num_input_tokens_seen": 182527020, + "router_z_loss_clip": 0.87011719, + "router_z_loss_mlp": 0.14190674, + "step": 8487, + "time_per_iteration": 2.432629108428955 + }, + { + "auxiliary_loss_clip": 0.01127556, + "auxiliary_loss_mlp": 0.01036423, + "balance_loss_clip": 1.05109906, + "balance_loss_mlp": 1.02239203, + "epoch": 0.5103261686457238, + "flos": 23149311448320.0, + "grad_norm": 3.6205047281392813, + "language_loss": 0.72721565, + "learning_rate": 2.030182134581827e-06, + "loss": 0.74885547, + "num_input_tokens_seen": 182543505, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.14038086, + "step": 8488, + "time_per_iteration": 2.480034589767456 + }, + { + "auxiliary_loss_clip": 0.01135385, + "auxiliary_loss_mlp": 0.01034101, + "balance_loss_clip": 1.05533957, + "balance_loss_mlp": 1.020702, + "epoch": 0.5103862918983917, + "flos": 14319237000960.0, + "grad_norm": 1.7981847212599287, + "language_loss": 0.69579923, + "learning_rate": 2.0297927166811503e-06, + "loss": 0.71749413, + "num_input_tokens_seen": 182562250, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.13409424, + "step": 8489, + "time_per_iteration": 2.4412527084350586 + }, + { + "auxiliary_loss_clip": 0.0112558, + "auxiliary_loss_mlp": 0.01037172, + "balance_loss_clip": 1.04693627, + "balance_loss_mlp": 1.02439308, + "epoch": 0.5104464151510597, + "flos": 25848536826240.0, + "grad_norm": 1.8488335956775457, + "language_loss": 0.72982979, + "learning_rate": 2.0294032976507297e-06, + "loss": 0.75145721, + "num_input_tokens_seen": 182581910, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.12780762, + "step": 8490, + "time_per_iteration": 2.4708375930786133 + }, + { + "auxiliary_loss_clip": 0.01127751, + "auxiliary_loss_mlp": 0.01032187, + "balance_loss_clip": 1.05155754, + "balance_loss_mlp": 1.01947916, + "epoch": 0.5105065384037276, + "flos": 21652913831040.0, + "grad_norm": 1.6161674957104137, + "language_loss": 0.80556452, + "learning_rate": 2.0290138775053337e-06, + "loss": 0.82716388, + "num_input_tokens_seen": 182601350, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.12695312, + "step": 8491, + "time_per_iteration": 2.4602692127227783 + }, + { + "auxiliary_loss_clip": 0.01127806, + "auxiliary_loss_mlp": 0.01031338, + "balance_loss_clip": 1.05297256, + "balance_loss_mlp": 1.01855326, + "epoch": 0.5105666616563956, + "flos": 22491571553280.0, + "grad_norm": 2.0813264759884813, + "language_loss": 0.78679276, + "learning_rate": 2.028624456259728e-06, + "loss": 0.80838424, + "num_input_tokens_seen": 182619660, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.12799072, + "step": 8492, + "time_per_iteration": 2.577706813812256 + }, + { + "auxiliary_loss_clip": 0.01145862, + "auxiliary_loss_mlp": 0.01046529, + "balance_loss_clip": 1.06174684, + "balance_loss_mlp": 1.03242636, + "epoch": 0.5106267849090635, + "flos": 22455768672000.0, + "grad_norm": 2.154671274710178, + "language_loss": 0.77822363, + "learning_rate": 2.0282350339286804e-06, + "loss": 0.80014759, + "num_input_tokens_seen": 182639815, + "router_z_loss_clip": 0.84130859, + "router_z_loss_mlp": 0.14111328, + "step": 8493, + "time_per_iteration": 2.5127408504486084 + }, + { + "auxiliary_loss_clip": 0.01128774, + "auxiliary_loss_mlp": 0.01033125, + "balance_loss_clip": 1.05132318, + "balance_loss_mlp": 1.01911843, + "epoch": 0.5106869081617316, + "flos": 23547093638400.0, + "grad_norm": 1.72200302210732, + "language_loss": 0.83548087, + "learning_rate": 2.0278456105269574e-06, + "loss": 0.85709989, + "num_input_tokens_seen": 182659655, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.14007568, + "step": 8494, + "time_per_iteration": 2.5174264907836914 + }, + { + "auxiliary_loss_clip": 0.01129099, + "auxiliary_loss_mlp": 0.01040935, + "balance_loss_clip": 1.04898238, + "balance_loss_mlp": 1.02754796, + "epoch": 0.5107470314143995, + "flos": 26792987080320.0, + "grad_norm": 2.071244423861919, + "language_loss": 0.79518676, + "learning_rate": 2.027456186069326e-06, + "loss": 0.81688714, + "num_input_tokens_seen": 182677075, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.13372803, + "step": 8495, + "time_per_iteration": 2.4831416606903076 + }, + { + "auxiliary_loss_clip": 0.01129491, + "auxiliary_loss_mlp": 0.0103992, + "balance_loss_clip": 1.04896247, + "balance_loss_mlp": 1.02553749, + "epoch": 0.5108071546670675, + "flos": 25739691638400.0, + "grad_norm": 1.753879481138518, + "language_loss": 0.78367418, + "learning_rate": 2.0270667605705535e-06, + "loss": 0.8053683, + "num_input_tokens_seen": 182699625, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.14373779, + "step": 8496, + "time_per_iteration": 2.511690139770508 + }, + { + "auxiliary_loss_clip": 0.01133441, + "auxiliary_loss_mlp": 0.0103073, + "balance_loss_clip": 1.05525553, + "balance_loss_mlp": 1.01770627, + "epoch": 0.5108672779197354, + "flos": 18697537589760.0, + "grad_norm": 1.9365194567547843, + "language_loss": 0.7876333, + "learning_rate": 2.0266773340454066e-06, + "loss": 0.80927503, + "num_input_tokens_seen": 182717020, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.13031006, + "step": 8497, + "time_per_iteration": 4.0031538009643555 + }, + { + "auxiliary_loss_clip": 0.01130919, + "auxiliary_loss_mlp": 0.01036579, + "balance_loss_clip": 1.05118871, + "balance_loss_mlp": 1.02307832, + "epoch": 0.5109274011724034, + "flos": 26688164215680.0, + "grad_norm": 1.8872038962219395, + "language_loss": 0.81872225, + "learning_rate": 2.0262879065086525e-06, + "loss": 0.84039724, + "num_input_tokens_seen": 182736955, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.13519287, + "step": 8498, + "time_per_iteration": 2.4870564937591553 + }, + { + "auxiliary_loss_clip": 0.01129581, + "auxiliary_loss_mlp": 0.01032555, + "balance_loss_clip": 1.05426049, + "balance_loss_mlp": 1.0186795, + "epoch": 0.5109875244250714, + "flos": 22784028088320.0, + "grad_norm": 2.0299621107964265, + "language_loss": 0.70543039, + "learning_rate": 2.0258984779750584e-06, + "loss": 0.72705173, + "num_input_tokens_seen": 182757620, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.13867188, + "step": 8499, + "time_per_iteration": 2.530245542526245 + }, + { + "auxiliary_loss_clip": 0.01134441, + "auxiliary_loss_mlp": 0.0103267, + "balance_loss_clip": 1.05553985, + "balance_loss_mlp": 1.01915717, + "epoch": 0.5110476476777394, + "flos": 35588515622400.0, + "grad_norm": 1.519135670075484, + "language_loss": 0.72570503, + "learning_rate": 2.0255090484593914e-06, + "loss": 0.74737614, + "num_input_tokens_seen": 182780195, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.13519287, + "step": 8500, + "time_per_iteration": 2.5444774627685547 + }, + { + "auxiliary_loss_clip": 0.01136245, + "auxiliary_loss_mlp": 0.01039301, + "balance_loss_clip": 1.05079329, + "balance_loss_mlp": 1.02352953, + "epoch": 0.5111077709304074, + "flos": 19280798634240.0, + "grad_norm": 2.824282957896109, + "language_loss": 0.62682581, + "learning_rate": 2.0251196179764183e-06, + "loss": 0.64858127, + "num_input_tokens_seen": 182795765, + "router_z_loss_clip": 0.85400391, + "router_z_loss_mlp": 0.15759277, + "step": 8501, + "time_per_iteration": 2.4934728145599365 + }, + { + "auxiliary_loss_clip": 0.01127857, + "auxiliary_loss_mlp": 0.01034343, + "balance_loss_clip": 1.04685545, + "balance_loss_mlp": 1.02076483, + "epoch": 0.5111678941830753, + "flos": 20668207409280.0, + "grad_norm": 1.8030501486810393, + "language_loss": 0.87734044, + "learning_rate": 2.024730186540907e-06, + "loss": 0.89896244, + "num_input_tokens_seen": 182813120, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.13555908, + "step": 8502, + "time_per_iteration": 2.423832893371582 + }, + { + "auxiliary_loss_clip": 0.01130863, + "auxiliary_loss_mlp": 0.01033128, + "balance_loss_clip": 1.05088866, + "balance_loss_mlp": 1.02031898, + "epoch": 0.5112280174357433, + "flos": 26287903987200.0, + "grad_norm": 1.5397066776801427, + "language_loss": 0.82524508, + "learning_rate": 2.0243407541676253e-06, + "loss": 0.84688497, + "num_input_tokens_seen": 182835745, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.12811279, + "step": 8503, + "time_per_iteration": 2.5054008960723877 + }, + { + "auxiliary_loss_clip": 0.01062974, + "auxiliary_loss_mlp": 0.01006019, + "balance_loss_clip": 1.03669643, + "balance_loss_mlp": 1.00482595, + "epoch": 0.5112881406884112, + "flos": 59474247707520.0, + "grad_norm": 0.8604253850180583, + "language_loss": 0.63807273, + "learning_rate": 2.023951320871339e-06, + "loss": 0.65876269, + "num_input_tokens_seen": 182892540, + "router_z_loss_clip": 0.26269531, + "router_z_loss_mlp": 0.01193237, + "step": 8504, + "time_per_iteration": 3.0914525985717773 + }, + { + "auxiliary_loss_clip": 0.01137619, + "auxiliary_loss_mlp": 0.01033487, + "balance_loss_clip": 1.05863488, + "balance_loss_mlp": 1.01975393, + "epoch": 0.5113482639410792, + "flos": 26468857728000.0, + "grad_norm": 2.1976840391275867, + "language_loss": 0.83682525, + "learning_rate": 2.023561886666816e-06, + "loss": 0.8585363, + "num_input_tokens_seen": 182911515, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.13745117, + "step": 8505, + "time_per_iteration": 2.5272016525268555 + }, + { + "auxiliary_loss_clip": 0.01132128, + "auxiliary_loss_mlp": 0.01027613, + "balance_loss_clip": 1.05491614, + "balance_loss_mlp": 1.01466644, + "epoch": 0.5114083871937471, + "flos": 29895848565120.0, + "grad_norm": 2.00864327984945, + "language_loss": 0.75555336, + "learning_rate": 2.0231724515688246e-06, + "loss": 0.77715081, + "num_input_tokens_seen": 182930860, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.12939453, + "step": 8506, + "time_per_iteration": 2.537689685821533 + }, + { + "auxiliary_loss_clip": 0.01129372, + "auxiliary_loss_mlp": 0.01040571, + "balance_loss_clip": 1.04866064, + "balance_loss_mlp": 1.02540779, + "epoch": 0.5114685104464152, + "flos": 24314576561280.0, + "grad_norm": 1.6849968391608994, + "language_loss": 0.57759911, + "learning_rate": 2.022783015592131e-06, + "loss": 0.59929854, + "num_input_tokens_seen": 182949960, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.15148926, + "step": 8507, + "time_per_iteration": 2.5244154930114746 + }, + { + "auxiliary_loss_clip": 0.01135988, + "auxiliary_loss_mlp": 0.01042514, + "balance_loss_clip": 1.05621147, + "balance_loss_mlp": 1.02842927, + "epoch": 0.5115286336990831, + "flos": 17019288391680.0, + "grad_norm": 2.510572245694763, + "language_loss": 0.85726106, + "learning_rate": 2.022393578751503e-06, + "loss": 0.87904608, + "num_input_tokens_seen": 182968085, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.14093018, + "step": 8508, + "time_per_iteration": 2.475569248199463 + }, + { + "auxiliary_loss_clip": 0.0113578, + "auxiliary_loss_mlp": 0.01038813, + "balance_loss_clip": 1.05641127, + "balance_loss_mlp": 1.02434134, + "epoch": 0.5115887569517511, + "flos": 23659386531840.0, + "grad_norm": 1.5751498041124328, + "language_loss": 0.71825296, + "learning_rate": 2.022004141061709e-06, + "loss": 0.73999894, + "num_input_tokens_seen": 182987275, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.14477539, + "step": 8509, + "time_per_iteration": 2.5296151638031006 + }, + { + "auxiliary_loss_clip": 0.01128485, + "auxiliary_loss_mlp": 0.01031225, + "balance_loss_clip": 1.05201066, + "balance_loss_mlp": 1.01873755, + "epoch": 0.511648880204419, + "flos": 16107193313280.0, + "grad_norm": 2.0597215952257963, + "language_loss": 0.75676733, + "learning_rate": 2.0216147025375153e-06, + "loss": 0.77836442, + "num_input_tokens_seen": 183004700, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.12487793, + "step": 8510, + "time_per_iteration": 2.4347126483917236 + }, + { + "auxiliary_loss_clip": 0.01129099, + "auxiliary_loss_mlp": 0.01035653, + "balance_loss_clip": 1.05209386, + "balance_loss_mlp": 1.02279055, + "epoch": 0.511709003457087, + "flos": 32634970974720.0, + "grad_norm": 1.8835387082060582, + "language_loss": 0.7089377, + "learning_rate": 2.0212252631936907e-06, + "loss": 0.73058522, + "num_input_tokens_seen": 183025830, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.12866211, + "step": 8511, + "time_per_iteration": 2.590715169906616 + }, + { + "auxiliary_loss_clip": 0.01120873, + "auxiliary_loss_mlp": 0.01030574, + "balance_loss_clip": 1.0456835, + "balance_loss_mlp": 1.01738906, + "epoch": 0.511769126709755, + "flos": 21762082241280.0, + "grad_norm": 1.9666480208394734, + "language_loss": 0.66723549, + "learning_rate": 2.020835823045001e-06, + "loss": 0.68874991, + "num_input_tokens_seen": 183045140, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.13165283, + "step": 8512, + "time_per_iteration": 2.489379405975342 + }, + { + "auxiliary_loss_clip": 0.01129315, + "auxiliary_loss_mlp": 0.01041929, + "balance_loss_clip": 1.04783463, + "balance_loss_mlp": 1.02685523, + "epoch": 0.511829249962423, + "flos": 23915357827200.0, + "grad_norm": 2.097772992345104, + "language_loss": 0.67195719, + "learning_rate": 2.0204463821062146e-06, + "loss": 0.69366968, + "num_input_tokens_seen": 183063935, + "router_z_loss_clip": 0.81494141, + "router_z_loss_mlp": 0.15081787, + "step": 8513, + "time_per_iteration": 3.917757987976074 + }, + { + "auxiliary_loss_clip": 0.01134787, + "auxiliary_loss_mlp": 0.01034774, + "balance_loss_clip": 1.05799866, + "balance_loss_mlp": 1.02029037, + "epoch": 0.511889373215091, + "flos": 23727005884800.0, + "grad_norm": 2.4089158630184513, + "language_loss": 0.68819582, + "learning_rate": 2.0200569403921e-06, + "loss": 0.70989144, + "num_input_tokens_seen": 183084135, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.14477539, + "step": 8514, + "time_per_iteration": 2.5364062786102295 + }, + { + "auxiliary_loss_clip": 0.01122268, + "auxiliary_loss_mlp": 0.01030603, + "balance_loss_clip": 1.04670954, + "balance_loss_mlp": 1.01784754, + "epoch": 0.5119494964677589, + "flos": 28111519526400.0, + "grad_norm": 1.4866946491003863, + "language_loss": 0.66092026, + "learning_rate": 2.019667497917424e-06, + "loss": 0.68244892, + "num_input_tokens_seen": 183104570, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12762451, + "step": 8515, + "time_per_iteration": 2.5220143795013428 + }, + { + "auxiliary_loss_clip": 0.01122689, + "auxiliary_loss_mlp": 0.01031299, + "balance_loss_clip": 1.04694414, + "balance_loss_mlp": 1.01882422, + "epoch": 0.5120096197204269, + "flos": 24973214296320.0, + "grad_norm": 1.9208942386958863, + "language_loss": 0.75465435, + "learning_rate": 2.019278054696955e-06, + "loss": 0.77619421, + "num_input_tokens_seen": 183123850, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.12481689, + "step": 8516, + "time_per_iteration": 2.5317959785461426 + }, + { + "auxiliary_loss_clip": 0.01127714, + "auxiliary_loss_mlp": 0.01042822, + "balance_loss_clip": 1.04895091, + "balance_loss_mlp": 1.02904141, + "epoch": 0.5120697429730948, + "flos": 17968012364160.0, + "grad_norm": 1.941288988253804, + "language_loss": 0.77803415, + "learning_rate": 2.0188886107454595e-06, + "loss": 0.79973948, + "num_input_tokens_seen": 183141725, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.13775635, + "step": 8517, + "time_per_iteration": 2.599714517593384 + }, + { + "auxiliary_loss_clip": 0.01125274, + "auxiliary_loss_mlp": 0.01036932, + "balance_loss_clip": 1.04555345, + "balance_loss_mlp": 1.02163792, + "epoch": 0.5121298662257628, + "flos": 23292343405440.0, + "grad_norm": 1.7367083568474706, + "language_loss": 0.74175382, + "learning_rate": 2.0184991660777063e-06, + "loss": 0.76337588, + "num_input_tokens_seen": 183161300, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.1529541, + "step": 8518, + "time_per_iteration": 2.590855598449707 + }, + { + "auxiliary_loss_clip": 0.01125619, + "auxiliary_loss_mlp": 0.01036401, + "balance_loss_clip": 1.04709053, + "balance_loss_mlp": 1.02285314, + "epoch": 0.5121899894784308, + "flos": 17311062568320.0, + "grad_norm": 2.1192825089736047, + "language_loss": 0.77880049, + "learning_rate": 2.0181097207084625e-06, + "loss": 0.80042076, + "num_input_tokens_seen": 183180495, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.13543701, + "step": 8519, + "time_per_iteration": 3.9413554668426514 + }, + { + "auxiliary_loss_clip": 0.01129375, + "auxiliary_loss_mlp": 0.01036652, + "balance_loss_clip": 1.05182981, + "balance_loss_mlp": 1.02289486, + "epoch": 0.5122501127310988, + "flos": 24930085040640.0, + "grad_norm": 1.6158011657178912, + "language_loss": 0.79587519, + "learning_rate": 2.017720274652497e-06, + "loss": 0.8175354, + "num_input_tokens_seen": 183200330, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.13739014, + "step": 8520, + "time_per_iteration": 2.6026110649108887 + }, + { + "auxiliary_loss_clip": 0.01124609, + "auxiliary_loss_mlp": 0.01042627, + "balance_loss_clip": 1.04347897, + "balance_loss_mlp": 1.02723694, + "epoch": 0.5123102359837667, + "flos": 18442859184000.0, + "grad_norm": 1.691392332744568, + "language_loss": 0.81322205, + "learning_rate": 2.0173308279245765e-06, + "loss": 0.83489442, + "num_input_tokens_seen": 183218230, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.15393066, + "step": 8521, + "time_per_iteration": 2.477602958679199 + }, + { + "auxiliary_loss_clip": 0.01130822, + "auxiliary_loss_mlp": 0.01030015, + "balance_loss_clip": 1.05247104, + "balance_loss_mlp": 1.01640105, + "epoch": 0.5123703592364347, + "flos": 26684860164480.0, + "grad_norm": 1.7843625718243077, + "language_loss": 0.68531936, + "learning_rate": 2.0169413805394692e-06, + "loss": 0.70692766, + "num_input_tokens_seen": 183236735, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.13623047, + "step": 8522, + "time_per_iteration": 2.4930002689361572 + }, + { + "auxiliary_loss_clip": 0.0112987, + "auxiliary_loss_mlp": 0.01039548, + "balance_loss_clip": 1.04713869, + "balance_loss_mlp": 1.022632, + "epoch": 0.5124304824891026, + "flos": 28803948981120.0, + "grad_norm": 2.4488387952895785, + "language_loss": 0.61303777, + "learning_rate": 2.0165519325119433e-06, + "loss": 0.63473195, + "num_input_tokens_seen": 183257550, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.16918945, + "step": 8523, + "time_per_iteration": 2.5180423259735107 + }, + { + "auxiliary_loss_clip": 0.0112231, + "auxiliary_loss_mlp": 0.01032555, + "balance_loss_clip": 1.04493833, + "balance_loss_mlp": 1.02002645, + "epoch": 0.5124906057417706, + "flos": 21761830846080.0, + "grad_norm": 2.0013410600896786, + "language_loss": 0.78295147, + "learning_rate": 2.0161624838567656e-06, + "loss": 0.80450022, + "num_input_tokens_seen": 183275515, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.12524414, + "step": 8524, + "time_per_iteration": 2.519462823867798 + }, + { + "auxiliary_loss_clip": 0.01126175, + "auxiliary_loss_mlp": 0.01033952, + "balance_loss_clip": 1.04958248, + "balance_loss_mlp": 1.02089238, + "epoch": 0.5125507289944387, + "flos": 18880538405760.0, + "grad_norm": 1.842333407313703, + "language_loss": 0.74471456, + "learning_rate": 2.015773034588706e-06, + "loss": 0.76631588, + "num_input_tokens_seen": 183293880, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.13061523, + "step": 8525, + "time_per_iteration": 2.4885244369506836 + }, + { + "auxiliary_loss_clip": 0.01125883, + "auxiliary_loss_mlp": 0.01034436, + "balance_loss_clip": 1.04564452, + "balance_loss_mlp": 1.02043486, + "epoch": 0.5126108522471066, + "flos": 35627838036480.0, + "grad_norm": 1.6012271181549658, + "language_loss": 0.74385625, + "learning_rate": 2.015383584722531e-06, + "loss": 0.76545942, + "num_input_tokens_seen": 183315860, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.13989258, + "step": 8526, + "time_per_iteration": 2.6092443466186523 + }, + { + "auxiliary_loss_clip": 0.01125402, + "auxiliary_loss_mlp": 0.01038902, + "balance_loss_clip": 1.04659343, + "balance_loss_mlp": 1.02593851, + "epoch": 0.5126709754997746, + "flos": 20190918464640.0, + "grad_norm": 1.6562094038538167, + "language_loss": 0.6520704, + "learning_rate": 2.0149941342730088e-06, + "loss": 0.67371339, + "num_input_tokens_seen": 183335480, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.12969971, + "step": 8527, + "time_per_iteration": 3.915297269821167 + }, + { + "auxiliary_loss_clip": 0.01124392, + "auxiliary_loss_mlp": 0.01037602, + "balance_loss_clip": 1.04936934, + "balance_loss_mlp": 1.02532947, + "epoch": 0.5127310987524425, + "flos": 18588548747520.0, + "grad_norm": 1.4672285537440164, + "language_loss": 0.74268258, + "learning_rate": 2.014604683254908e-06, + "loss": 0.76430249, + "num_input_tokens_seen": 183354395, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.1227417, + "step": 8528, + "time_per_iteration": 2.43992280960083 + }, + { + "auxiliary_loss_clip": 0.01121835, + "auxiliary_loss_mlp": 0.01035755, + "balance_loss_clip": 1.04411113, + "balance_loss_mlp": 1.02226067, + "epoch": 0.5127912220051105, + "flos": 22454691264000.0, + "grad_norm": 1.9804367964647909, + "language_loss": 0.82932365, + "learning_rate": 2.014215231682995e-06, + "loss": 0.85089958, + "num_input_tokens_seen": 183372980, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.13476562, + "step": 8529, + "time_per_iteration": 2.5462024211883545 + }, + { + "auxiliary_loss_clip": 0.01121074, + "auxiliary_loss_mlp": 0.01037677, + "balance_loss_clip": 1.04476142, + "balance_loss_mlp": 1.02399182, + "epoch": 0.5128513452577784, + "flos": 19093703667840.0, + "grad_norm": 1.774441472955276, + "language_loss": 0.73757505, + "learning_rate": 2.01382577957204e-06, + "loss": 0.7591626, + "num_input_tokens_seen": 183390160, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.13696289, + "step": 8530, + "time_per_iteration": 2.42093825340271 + }, + { + "auxiliary_loss_clip": 0.01052751, + "auxiliary_loss_mlp": 0.01004576, + "balance_loss_clip": 1.02684319, + "balance_loss_mlp": 1.00333333, + "epoch": 0.5129114685104464, + "flos": 67892285243520.0, + "grad_norm": 0.7491315891244494, + "language_loss": 0.60754502, + "learning_rate": 2.0134363269368095e-06, + "loss": 0.62811828, + "num_input_tokens_seen": 183455280, + "router_z_loss_clip": 0.25927734, + "router_z_loss_mlp": 0.01242065, + "step": 8531, + "time_per_iteration": 3.232640027999878 + }, + { + "auxiliary_loss_clip": 0.01123285, + "auxiliary_loss_mlp": 0.01032263, + "balance_loss_clip": 1.04474688, + "balance_loss_mlp": 1.01878619, + "epoch": 0.5129715917631144, + "flos": 20449152316800.0, + "grad_norm": 1.6227545034713835, + "language_loss": 0.77057743, + "learning_rate": 2.0130468737920725e-06, + "loss": 0.79213291, + "num_input_tokens_seen": 183473955, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.13470459, + "step": 8532, + "time_per_iteration": 2.447547197341919 + }, + { + "auxiliary_loss_clip": 0.01124788, + "auxiliary_loss_mlp": 0.01029599, + "balance_loss_clip": 1.04778409, + "balance_loss_mlp": 1.01581836, + "epoch": 0.5130317150157824, + "flos": 35116146840960.0, + "grad_norm": 2.2019371582091916, + "language_loss": 0.67147803, + "learning_rate": 2.012657420152597e-06, + "loss": 0.69302189, + "num_input_tokens_seen": 183497195, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.13793945, + "step": 8533, + "time_per_iteration": 2.5765419006347656 + }, + { + "auxiliary_loss_clip": 0.01125508, + "auxiliary_loss_mlp": 0.01038804, + "balance_loss_clip": 1.04485083, + "balance_loss_mlp": 1.02389073, + "epoch": 0.5130918382684503, + "flos": 19791627903360.0, + "grad_norm": 2.005769859655105, + "language_loss": 0.82108659, + "learning_rate": 2.01226796603315e-06, + "loss": 0.84272969, + "num_input_tokens_seen": 183513675, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.14892578, + "step": 8534, + "time_per_iteration": 2.485553503036499 + }, + { + "auxiliary_loss_clip": 0.01135806, + "auxiliary_loss_mlp": 0.01037038, + "balance_loss_clip": 1.05603814, + "balance_loss_mlp": 1.02224374, + "epoch": 0.5131519615211183, + "flos": 26323096337280.0, + "grad_norm": 1.5199971746364294, + "language_loss": 0.63719994, + "learning_rate": 2.0118785114485017e-06, + "loss": 0.65892839, + "num_input_tokens_seen": 183535165, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.14788818, + "step": 8535, + "time_per_iteration": 2.5091819763183594 + }, + { + "auxiliary_loss_clip": 0.0113756, + "auxiliary_loss_mlp": 0.01032157, + "balance_loss_clip": 1.05552721, + "balance_loss_mlp": 1.01829934, + "epoch": 0.5132120847737862, + "flos": 19171917532800.0, + "grad_norm": 1.6554125799772395, + "language_loss": 0.70051855, + "learning_rate": 2.011489056413418e-06, + "loss": 0.72221571, + "num_input_tokens_seen": 183553780, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.1385498, + "step": 8536, + "time_per_iteration": 2.4245336055755615 + }, + { + "auxiliary_loss_clip": 0.01126319, + "auxiliary_loss_mlp": 0.01040804, + "balance_loss_clip": 1.04623365, + "balance_loss_mlp": 1.02485371, + "epoch": 0.5132722080264542, + "flos": 20230420446720.0, + "grad_norm": 2.8964217895974986, + "language_loss": 0.71312326, + "learning_rate": 2.011099600942669e-06, + "loss": 0.7347945, + "num_input_tokens_seen": 183572285, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.1595459, + "step": 8537, + "time_per_iteration": 2.469921350479126 + }, + { + "auxiliary_loss_clip": 0.01124452, + "auxiliary_loss_mlp": 0.01035224, + "balance_loss_clip": 1.04513288, + "balance_loss_mlp": 1.0206989, + "epoch": 0.5133323312791223, + "flos": 16469459930880.0, + "grad_norm": 1.8063103740236555, + "language_loss": 0.80117285, + "learning_rate": 2.0107101450510214e-06, + "loss": 0.82276958, + "num_input_tokens_seen": 183589330, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.14520264, + "step": 8538, + "time_per_iteration": 2.40889573097229 + }, + { + "auxiliary_loss_clip": 0.01127854, + "auxiliary_loss_mlp": 0.01027701, + "balance_loss_clip": 1.05026829, + "balance_loss_mlp": 1.01467156, + "epoch": 0.5133924545317902, + "flos": 26068094709120.0, + "grad_norm": 2.9753893885317577, + "language_loss": 0.78412759, + "learning_rate": 2.0103206887532437e-06, + "loss": 0.80568314, + "num_input_tokens_seen": 183609205, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.13012695, + "step": 8539, + "time_per_iteration": 2.5164220333099365 + }, + { + "auxiliary_loss_clip": 0.01125588, + "auxiliary_loss_mlp": 0.01031634, + "balance_loss_clip": 1.04660535, + "balance_loss_mlp": 1.01806843, + "epoch": 0.5134525777844582, + "flos": 29131023248640.0, + "grad_norm": 1.7017601394427175, + "language_loss": 0.76008272, + "learning_rate": 2.009931232064105e-06, + "loss": 0.78165495, + "num_input_tokens_seen": 183629985, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.13574219, + "step": 8540, + "time_per_iteration": 3.959045648574829 + }, + { + "auxiliary_loss_clip": 0.01128292, + "auxiliary_loss_mlp": 0.01033194, + "balance_loss_clip": 1.04681993, + "balance_loss_mlp": 1.01857889, + "epoch": 0.5135127010371261, + "flos": 17454776883840.0, + "grad_norm": 1.805410973616482, + "language_loss": 0.74597168, + "learning_rate": 2.0095417749983724e-06, + "loss": 0.76758653, + "num_input_tokens_seen": 183648220, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.14624023, + "step": 8541, + "time_per_iteration": 2.442187786102295 + }, + { + "auxiliary_loss_clip": 0.01128587, + "auxiliary_loss_mlp": 0.01035862, + "balance_loss_clip": 1.05100799, + "balance_loss_mlp": 1.02229047, + "epoch": 0.5135728242897941, + "flos": 21944975316480.0, + "grad_norm": 3.9061113942042556, + "language_loss": 0.70740235, + "learning_rate": 2.0091523175708162e-06, + "loss": 0.72904682, + "num_input_tokens_seen": 183668230, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.13568115, + "step": 8542, + "time_per_iteration": 2.48675537109375 + }, + { + "auxiliary_loss_clip": 0.01127887, + "auxiliary_loss_mlp": 0.01034005, + "balance_loss_clip": 1.04883504, + "balance_loss_mlp": 1.02057576, + "epoch": 0.513632947542462, + "flos": 22674859678080.0, + "grad_norm": 1.8884119752494453, + "language_loss": 0.79301393, + "learning_rate": 2.0087628597962023e-06, + "loss": 0.81463283, + "num_input_tokens_seen": 183687800, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.13439941, + "step": 8543, + "time_per_iteration": 2.5078377723693848 + }, + { + "auxiliary_loss_clip": 0.01130168, + "auxiliary_loss_mlp": 0.01038336, + "balance_loss_clip": 1.04971194, + "balance_loss_mlp": 1.02426898, + "epoch": 0.51369307079513, + "flos": 29457163762560.0, + "grad_norm": 1.8319958883120473, + "language_loss": 0.67762363, + "learning_rate": 2.008373401689299e-06, + "loss": 0.69930863, + "num_input_tokens_seen": 183709025, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.14068604, + "step": 8544, + "time_per_iteration": 2.532386541366577 + }, + { + "auxiliary_loss_clip": 0.0113735, + "auxiliary_loss_mlp": 0.01041754, + "balance_loss_clip": 1.05641699, + "balance_loss_mlp": 1.02837276, + "epoch": 0.513753194047798, + "flos": 18989347680000.0, + "grad_norm": 2.6002217366741807, + "language_loss": 0.72653294, + "learning_rate": 2.0079839432648765e-06, + "loss": 0.74832404, + "num_input_tokens_seen": 183725740, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.1338501, + "step": 8545, + "time_per_iteration": 2.431448459625244 + }, + { + "auxiliary_loss_clip": 0.01128456, + "auxiliary_loss_mlp": 0.01042156, + "balance_loss_clip": 1.04749489, + "balance_loss_mlp": 1.02537131, + "epoch": 0.513813317300466, + "flos": 17821855923840.0, + "grad_norm": 2.2982988620194673, + "language_loss": 0.82378232, + "learning_rate": 2.0075944845377016e-06, + "loss": 0.84548849, + "num_input_tokens_seen": 183743995, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.16772461, + "step": 8546, + "time_per_iteration": 2.467059850692749 + }, + { + "auxiliary_loss_clip": 0.01130219, + "auxiliary_loss_mlp": 0.0103648, + "balance_loss_clip": 1.04968989, + "balance_loss_mlp": 1.02282476, + "epoch": 0.5138734405531339, + "flos": 24061191045120.0, + "grad_norm": 1.7696564143344082, + "language_loss": 0.73110491, + "learning_rate": 2.007205025522544e-06, + "loss": 0.75277185, + "num_input_tokens_seen": 183764150, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.13665771, + "step": 8547, + "time_per_iteration": 2.4950778484344482 + }, + { + "auxiliary_loss_clip": 0.01131148, + "auxiliary_loss_mlp": 0.01042673, + "balance_loss_clip": 1.05136776, + "balance_loss_mlp": 1.02856445, + "epoch": 0.5139335638058019, + "flos": 26097253574400.0, + "grad_norm": 1.8435013206223783, + "language_loss": 0.73785836, + "learning_rate": 2.0068155662341702e-06, + "loss": 0.75959659, + "num_input_tokens_seen": 183783280, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.14111328, + "step": 8548, + "time_per_iteration": 2.5986168384552 + }, + { + "auxiliary_loss_clip": 0.01123576, + "auxiliary_loss_mlp": 0.01034639, + "balance_loss_clip": 1.04505014, + "balance_loss_mlp": 1.02075744, + "epoch": 0.5139936870584698, + "flos": 18917095472640.0, + "grad_norm": 3.2459080631344817, + "language_loss": 0.82278121, + "learning_rate": 2.0064261066873495e-06, + "loss": 0.84436333, + "num_input_tokens_seen": 183800725, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.13873291, + "step": 8549, + "time_per_iteration": 2.412020444869995 + }, + { + "auxiliary_loss_clip": 0.01129853, + "auxiliary_loss_mlp": 0.01032913, + "balance_loss_clip": 1.05122948, + "balance_loss_mlp": 1.01971078, + "epoch": 0.5140538103111378, + "flos": 16144001775360.0, + "grad_norm": 2.4560571283862838, + "language_loss": 0.72187996, + "learning_rate": 2.0060366468968504e-06, + "loss": 0.74350762, + "num_input_tokens_seen": 183818735, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.13208008, + "step": 8550, + "time_per_iteration": 2.445919990539551 + }, + { + "auxiliary_loss_clip": 0.01133185, + "auxiliary_loss_mlp": 0.0103377, + "balance_loss_clip": 1.05083311, + "balance_loss_mlp": 1.01968527, + "epoch": 0.5141139335638057, + "flos": 22420145358720.0, + "grad_norm": 1.990722363769327, + "language_loss": 0.75313139, + "learning_rate": 2.0056471868774408e-06, + "loss": 0.77480102, + "num_input_tokens_seen": 183840015, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.14093018, + "step": 8551, + "time_per_iteration": 2.456069231033325 + }, + { + "auxiliary_loss_clip": 0.01121608, + "auxiliary_loss_mlp": 0.01035065, + "balance_loss_clip": 1.04636645, + "balance_loss_mlp": 1.02124894, + "epoch": 0.5141740568164738, + "flos": 27089645506560.0, + "grad_norm": 2.8003539482889366, + "language_loss": 0.69048738, + "learning_rate": 2.0052577266438897e-06, + "loss": 0.71205407, + "num_input_tokens_seen": 183860145, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.13818359, + "step": 8552, + "time_per_iteration": 2.538802146911621 + }, + { + "auxiliary_loss_clip": 0.01128014, + "auxiliary_loss_mlp": 0.01035803, + "balance_loss_clip": 1.04699755, + "balance_loss_mlp": 1.02165854, + "epoch": 0.5142341800691418, + "flos": 24973250209920.0, + "grad_norm": 1.914812064794582, + "language_loss": 0.74901187, + "learning_rate": 2.004868266210965e-06, + "loss": 0.77065003, + "num_input_tokens_seen": 183880540, + "router_z_loss_clip": 0.80957031, + "router_z_loss_mlp": 0.14129639, + "step": 8553, + "time_per_iteration": 2.4652554988861084 + }, + { + "auxiliary_loss_clip": 0.01135096, + "auxiliary_loss_mlp": 0.01038515, + "balance_loss_clip": 1.054811, + "balance_loss_mlp": 1.02530038, + "epoch": 0.5142943033218097, + "flos": 20704513080960.0, + "grad_norm": 1.6348803109915044, + "language_loss": 0.67792928, + "learning_rate": 2.004478805593435e-06, + "loss": 0.69966543, + "num_input_tokens_seen": 183900895, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.13208008, + "step": 8554, + "time_per_iteration": 2.493535280227661 + }, + { + "auxiliary_loss_clip": 0.01131735, + "auxiliary_loss_mlp": 0.01039811, + "balance_loss_clip": 1.04973221, + "balance_loss_mlp": 1.02395594, + "epoch": 0.5143544265744777, + "flos": 22925479847040.0, + "grad_norm": 1.8527549908352416, + "language_loss": 0.73615211, + "learning_rate": 2.004089344806068e-06, + "loss": 0.75786757, + "num_input_tokens_seen": 183920335, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.15844727, + "step": 8555, + "time_per_iteration": 2.4571406841278076 + }, + { + "auxiliary_loss_clip": 0.01131151, + "auxiliary_loss_mlp": 0.01036076, + "balance_loss_clip": 1.05089831, + "balance_loss_mlp": 1.02264071, + "epoch": 0.5144145498271456, + "flos": 15921391236480.0, + "grad_norm": 2.3727487969086405, + "language_loss": 0.74322855, + "learning_rate": 2.003699883863633e-06, + "loss": 0.7649008, + "num_input_tokens_seen": 183936220, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.13439941, + "step": 8556, + "time_per_iteration": 3.847752332687378 + }, + { + "auxiliary_loss_clip": 0.01125366, + "auxiliary_loss_mlp": 0.01044151, + "balance_loss_clip": 1.04463148, + "balance_loss_mlp": 1.03035808, + "epoch": 0.5144746730798136, + "flos": 19681238430720.0, + "grad_norm": 1.7237443081447268, + "language_loss": 0.86759394, + "learning_rate": 2.003310422780898e-06, + "loss": 0.88928908, + "num_input_tokens_seen": 183953250, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.13793945, + "step": 8557, + "time_per_iteration": 2.470604181289673 + }, + { + "auxiliary_loss_clip": 0.01125703, + "auxiliary_loss_mlp": 0.01033368, + "balance_loss_clip": 1.05070376, + "balance_loss_mlp": 1.02107179, + "epoch": 0.5145347963324816, + "flos": 23914711382400.0, + "grad_norm": 1.60329393886765, + "language_loss": 0.89028388, + "learning_rate": 2.0029209615726307e-06, + "loss": 0.91187465, + "num_input_tokens_seen": 183973865, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12310791, + "step": 8558, + "time_per_iteration": 2.602293014526367 + }, + { + "auxiliary_loss_clip": 0.01123798, + "auxiliary_loss_mlp": 0.01034725, + "balance_loss_clip": 1.04845369, + "balance_loss_mlp": 1.02198744, + "epoch": 0.5145949195851496, + "flos": 18260002022400.0, + "grad_norm": 4.672503343514982, + "language_loss": 0.65182817, + "learning_rate": 2.002531500253602e-06, + "loss": 0.6734134, + "num_input_tokens_seen": 183992555, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.12750244, + "step": 8559, + "time_per_iteration": 2.457180976867676 + }, + { + "auxiliary_loss_clip": 0.01129994, + "auxiliary_loss_mlp": 0.01038111, + "balance_loss_clip": 1.05186331, + "balance_loss_mlp": 1.02477193, + "epoch": 0.5146550428378175, + "flos": 26213425136640.0, + "grad_norm": 1.846399548011037, + "language_loss": 0.63310033, + "learning_rate": 2.002142038838577e-06, + "loss": 0.65478134, + "num_input_tokens_seen": 184010825, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.13348389, + "step": 8560, + "time_per_iteration": 2.5955564975738525 + }, + { + "auxiliary_loss_clip": 0.01122046, + "auxiliary_loss_mlp": 0.01036021, + "balance_loss_clip": 1.04474401, + "balance_loss_mlp": 1.02275848, + "epoch": 0.5147151660904855, + "flos": 22674177319680.0, + "grad_norm": 1.525943649936212, + "language_loss": 0.69830137, + "learning_rate": 2.0017525773423265e-06, + "loss": 0.71988201, + "num_input_tokens_seen": 184030155, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.13256836, + "step": 8561, + "time_per_iteration": 2.499993324279785 + }, + { + "auxiliary_loss_clip": 0.01127613, + "auxiliary_loss_mlp": 0.01033457, + "balance_loss_clip": 1.04702437, + "balance_loss_mlp": 1.02037358, + "epoch": 0.5147752893431534, + "flos": 24972388283520.0, + "grad_norm": 3.3342067076502633, + "language_loss": 0.66751027, + "learning_rate": 2.0013631157796177e-06, + "loss": 0.68912101, + "num_input_tokens_seen": 184051440, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.13098145, + "step": 8562, + "time_per_iteration": 3.9855518341064453 + }, + { + "auxiliary_loss_clip": 0.01128124, + "auxiliary_loss_mlp": 0.01031667, + "balance_loss_clip": 1.04955268, + "balance_loss_mlp": 1.0182023, + "epoch": 0.5148354125958214, + "flos": 22744669760640.0, + "grad_norm": 1.742685106801357, + "language_loss": 0.77782619, + "learning_rate": 2.0009736541652188e-06, + "loss": 0.79942405, + "num_input_tokens_seen": 184070205, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.13464355, + "step": 8563, + "time_per_iteration": 2.50178599357605 + }, + { + "auxiliary_loss_clip": 0.01129689, + "auxiliary_loss_mlp": 0.0103484, + "balance_loss_clip": 1.04518151, + "balance_loss_mlp": 1.01948607, + "epoch": 0.5148955358484893, + "flos": 23068763199360.0, + "grad_norm": 2.0056820287420294, + "language_loss": 0.83065766, + "learning_rate": 2.0005841925139e-06, + "loss": 0.85230291, + "num_input_tokens_seen": 184087345, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.15350342, + "step": 8564, + "time_per_iteration": 2.4194586277008057 + }, + { + "auxiliary_loss_clip": 0.01119873, + "auxiliary_loss_mlp": 0.01033965, + "balance_loss_clip": 1.03963304, + "balance_loss_mlp": 1.01970804, + "epoch": 0.5149556591011574, + "flos": 20340127560960.0, + "grad_norm": 1.868470328843343, + "language_loss": 0.73054123, + "learning_rate": 2.0001947308404283e-06, + "loss": 0.75207955, + "num_input_tokens_seen": 184107110, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.14251709, + "step": 8565, + "time_per_iteration": 2.5215201377868652 + }, + { + "auxiliary_loss_clip": 0.01123249, + "auxiliary_loss_mlp": 0.01032685, + "balance_loss_clip": 1.04190803, + "balance_loss_mlp": 1.01731849, + "epoch": 0.5150157823538254, + "flos": 22638230784000.0, + "grad_norm": 2.1239288774597043, + "language_loss": 0.68099457, + "learning_rate": 1.9998052691595715e-06, + "loss": 0.70255387, + "num_input_tokens_seen": 184127105, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.15368652, + "step": 8566, + "time_per_iteration": 2.6466100215911865 + }, + { + "auxiliary_loss_clip": 0.01124587, + "auxiliary_loss_mlp": 0.0103506, + "balance_loss_clip": 1.04259276, + "balance_loss_mlp": 1.02092135, + "epoch": 0.5150759056064933, + "flos": 26067627832320.0, + "grad_norm": 1.6802679264795346, + "language_loss": 0.78383571, + "learning_rate": 1.9994158074861005e-06, + "loss": 0.80543214, + "num_input_tokens_seen": 184148060, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.14147949, + "step": 8567, + "time_per_iteration": 2.5480940341949463 + }, + { + "auxiliary_loss_clip": 0.01130664, + "auxiliary_loss_mlp": 0.01033234, + "balance_loss_clip": 1.0479269, + "balance_loss_mlp": 1.01832068, + "epoch": 0.5151360288591613, + "flos": 25952641418880.0, + "grad_norm": 1.9882804328908557, + "language_loss": 0.78863949, + "learning_rate": 1.9990263458347806e-06, + "loss": 0.81027842, + "num_input_tokens_seen": 184166175, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.14916992, + "step": 8568, + "time_per_iteration": 2.4669227600097656 + }, + { + "auxiliary_loss_clip": 0.01122585, + "auxiliary_loss_mlp": 0.01038735, + "balance_loss_clip": 1.04426754, + "balance_loss_mlp": 1.02349401, + "epoch": 0.5151961521118292, + "flos": 18507246312960.0, + "grad_norm": 2.3974218815526798, + "language_loss": 0.9149217, + "learning_rate": 1.9986368842203825e-06, + "loss": 0.93653488, + "num_input_tokens_seen": 184182600, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.15252686, + "step": 8569, + "time_per_iteration": 2.4752793312072754 + }, + { + "auxiliary_loss_clip": 0.01135817, + "auxiliary_loss_mlp": 0.01034219, + "balance_loss_clip": 1.05422664, + "balance_loss_mlp": 1.02015185, + "epoch": 0.5152562753644973, + "flos": 22233696837120.0, + "grad_norm": 1.5839532096307096, + "language_loss": 0.76230943, + "learning_rate": 1.998247422657674e-06, + "loss": 0.78400981, + "num_input_tokens_seen": 184202020, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.140625, + "step": 8570, + "time_per_iteration": 2.440377950668335 + }, + { + "auxiliary_loss_clip": 0.01128877, + "auxiliary_loss_mlp": 0.01037572, + "balance_loss_clip": 1.0495286, + "balance_loss_mlp": 1.02262902, + "epoch": 0.5153163986171652, + "flos": 38436555047040.0, + "grad_norm": 2.5369892706491926, + "language_loss": 0.73883337, + "learning_rate": 1.9978579611614227e-06, + "loss": 0.76049787, + "num_input_tokens_seen": 184224850, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.1494751, + "step": 8571, + "time_per_iteration": 4.032979249954224 + }, + { + "auxiliary_loss_clip": 0.01065586, + "auxiliary_loss_mlp": 0.01015273, + "balance_loss_clip": 1.03965592, + "balance_loss_mlp": 1.01383531, + "epoch": 0.5153765218698332, + "flos": 66384503015040.0, + "grad_norm": 0.7804225765221999, + "language_loss": 0.52894723, + "learning_rate": 1.9974684997463984e-06, + "loss": 0.54975581, + "num_input_tokens_seen": 184288520, + "router_z_loss_clip": 0.25927734, + "router_z_loss_mlp": 0.01438904, + "step": 8572, + "time_per_iteration": 3.166236400604248 + }, + { + "auxiliary_loss_clip": 0.0112227, + "auxiliary_loss_mlp": 0.01035422, + "balance_loss_clip": 1.04714847, + "balance_loss_mlp": 1.02282715, + "epoch": 0.5154366451225011, + "flos": 24024669891840.0, + "grad_norm": 1.5482142489563502, + "language_loss": 0.76506853, + "learning_rate": 1.9970790384273687e-06, + "loss": 0.78664553, + "num_input_tokens_seen": 184308565, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.1260376, + "step": 8573, + "time_per_iteration": 2.4827635288238525 + }, + { + "auxiliary_loss_clip": 0.01131618, + "auxiliary_loss_mlp": 0.01032151, + "balance_loss_clip": 1.05367529, + "balance_loss_mlp": 1.01796496, + "epoch": 0.5154967683751691, + "flos": 23468843859840.0, + "grad_norm": 1.8771707640454998, + "language_loss": 0.76973319, + "learning_rate": 1.996689577219102e-06, + "loss": 0.79137093, + "num_input_tokens_seen": 184326795, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.14196777, + "step": 8574, + "time_per_iteration": 2.5880892276763916 + }, + { + "auxiliary_loss_clip": 0.01122313, + "auxiliary_loss_mlp": 0.01032283, + "balance_loss_clip": 1.04573011, + "balance_loss_mlp": 1.0200516, + "epoch": 0.515556891627837, + "flos": 23805650712960.0, + "grad_norm": 1.7159549001334582, + "language_loss": 0.85275918, + "learning_rate": 1.996300116136367e-06, + "loss": 0.87430513, + "num_input_tokens_seen": 184345990, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.12231445, + "step": 8575, + "time_per_iteration": 2.4730124473571777 + }, + { + "auxiliary_loss_clip": 0.01134825, + "auxiliary_loss_mlp": 0.01032186, + "balance_loss_clip": 1.05623841, + "balance_loss_mlp": 1.01842976, + "epoch": 0.515617014880505, + "flos": 19828544106240.0, + "grad_norm": 1.5022505241971729, + "language_loss": 0.76844549, + "learning_rate": 1.995910655193932e-06, + "loss": 0.79011559, + "num_input_tokens_seen": 184366300, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.13751221, + "step": 8576, + "time_per_iteration": 2.4925684928894043 + }, + { + "auxiliary_loss_clip": 0.01123783, + "auxiliary_loss_mlp": 0.01030653, + "balance_loss_clip": 1.0427413, + "balance_loss_mlp": 1.01567388, + "epoch": 0.515677138133173, + "flos": 14245907385600.0, + "grad_norm": 2.6093992975210756, + "language_loss": 0.75700474, + "learning_rate": 1.9955211944065654e-06, + "loss": 0.77854908, + "num_input_tokens_seen": 184383030, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.14978027, + "step": 8577, + "time_per_iteration": 2.422661304473877 + }, + { + "auxiliary_loss_clip": 0.011348, + "auxiliary_loss_mlp": 0.01038082, + "balance_loss_clip": 1.05265725, + "balance_loss_mlp": 1.02307343, + "epoch": 0.515737261385841, + "flos": 28289707920000.0, + "grad_norm": 1.7888904267922516, + "language_loss": 0.81229442, + "learning_rate": 1.9951317337890353e-06, + "loss": 0.83402318, + "num_input_tokens_seen": 184403410, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.15014648, + "step": 8578, + "time_per_iteration": 2.6081156730651855 + }, + { + "auxiliary_loss_clip": 0.01124221, + "auxiliary_loss_mlp": 0.01035607, + "balance_loss_clip": 1.04740334, + "balance_loss_mlp": 1.02174306, + "epoch": 0.515797384638509, + "flos": 27891925729920.0, + "grad_norm": 2.022862606683184, + "language_loss": 0.76310241, + "learning_rate": 1.9947422733561105e-06, + "loss": 0.78470075, + "num_input_tokens_seen": 184423830, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.13867188, + "step": 8579, + "time_per_iteration": 2.484452486038208 + }, + { + "auxiliary_loss_clip": 0.01129199, + "auxiliary_loss_mlp": 0.01028922, + "balance_loss_clip": 1.05005074, + "balance_loss_mlp": 1.01607764, + "epoch": 0.5158575078911769, + "flos": 23040071210880.0, + "grad_norm": 1.6384490774656053, + "language_loss": 0.79423118, + "learning_rate": 1.994352813122559e-06, + "loss": 0.81581235, + "num_input_tokens_seen": 184445050, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.1282959, + "step": 8580, + "time_per_iteration": 2.5529932975769043 + }, + { + "auxiliary_loss_clip": 0.01132386, + "auxiliary_loss_mlp": 0.01041854, + "balance_loss_clip": 1.04973435, + "balance_loss_mlp": 1.02769196, + "epoch": 0.5159176311438449, + "flos": 12641346938880.0, + "grad_norm": 2.0585257410717333, + "language_loss": 0.72680897, + "learning_rate": 1.99396335310315e-06, + "loss": 0.74855137, + "num_input_tokens_seen": 184460775, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.1416626, + "step": 8581, + "time_per_iteration": 2.3846852779388428 + }, + { + "auxiliary_loss_clip": 0.01126493, + "auxiliary_loss_mlp": 0.01037339, + "balance_loss_clip": 1.04814935, + "balance_loss_mlp": 1.02415419, + "epoch": 0.5159777543965128, + "flos": 15558154951680.0, + "grad_norm": 2.09091202509766, + "language_loss": 0.74800217, + "learning_rate": 1.9935738933126508e-06, + "loss": 0.76964045, + "num_input_tokens_seen": 184477365, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.13195801, + "step": 8582, + "time_per_iteration": 2.441796064376831 + }, + { + "auxiliary_loss_clip": 0.01125145, + "auxiliary_loss_mlp": 0.01033375, + "balance_loss_clip": 1.04792297, + "balance_loss_mlp": 1.02123928, + "epoch": 0.5160378776491809, + "flos": 23221671396480.0, + "grad_norm": 1.9511077173092446, + "language_loss": 0.66457772, + "learning_rate": 1.99318443376583e-06, + "loss": 0.68616295, + "num_input_tokens_seen": 184497045, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.12145996, + "step": 8583, + "time_per_iteration": 2.436837911605835 + }, + { + "auxiliary_loss_clip": 0.0113567, + "auxiliary_loss_mlp": 0.01032482, + "balance_loss_clip": 1.05593264, + "balance_loss_mlp": 1.01857042, + "epoch": 0.5160980009018488, + "flos": 21944616180480.0, + "grad_norm": 2.024176062509751, + "language_loss": 0.76135039, + "learning_rate": 1.9927949744774568e-06, + "loss": 0.78303194, + "num_input_tokens_seen": 184517675, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.13903809, + "step": 8584, + "time_per_iteration": 3.8911380767822266 + }, + { + "auxiliary_loss_clip": 0.01133889, + "auxiliary_loss_mlp": 0.01036574, + "balance_loss_clip": 1.05356336, + "balance_loss_mlp": 1.02344322, + "epoch": 0.5161581241545168, + "flos": 22784064001920.0, + "grad_norm": 1.9941846311803855, + "language_loss": 0.7898072, + "learning_rate": 1.9924055154622983e-06, + "loss": 0.81151187, + "num_input_tokens_seen": 184537745, + "router_z_loss_clip": 0.80273438, + "router_z_loss_mlp": 0.13128662, + "step": 8585, + "time_per_iteration": 2.4958293437957764 + }, + { + "auxiliary_loss_clip": 0.0112286, + "auxiliary_loss_mlp": 0.01036692, + "balance_loss_clip": 1.0486567, + "balance_loss_mlp": 1.02350163, + "epoch": 0.5162182474071847, + "flos": 19675384513920.0, + "grad_norm": 12.648829819966833, + "language_loss": 0.81428361, + "learning_rate": 1.9920160567351238e-06, + "loss": 0.83587909, + "num_input_tokens_seen": 184553630, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.13183594, + "step": 8586, + "time_per_iteration": 2.4228124618530273 + }, + { + "auxiliary_loss_clip": 0.0112457, + "auxiliary_loss_mlp": 0.01031842, + "balance_loss_clip": 1.0465157, + "balance_loss_mlp": 1.01869929, + "epoch": 0.5162783706598527, + "flos": 20046198568320.0, + "grad_norm": 3.508021039264736, + "language_loss": 0.71933293, + "learning_rate": 1.991626598310701e-06, + "loss": 0.74089706, + "num_input_tokens_seen": 184573530, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.13128662, + "step": 8587, + "time_per_iteration": 2.442714214324951 + }, + { + "auxiliary_loss_clip": 0.01065026, + "auxiliary_loss_mlp": 0.01005946, + "balance_loss_clip": 1.03832173, + "balance_loss_mlp": 1.00441575, + "epoch": 0.5163384939125206, + "flos": 69959553713280.0, + "grad_norm": 0.7306987431747654, + "language_loss": 0.57825851, + "learning_rate": 1.9912371402037984e-06, + "loss": 0.59896827, + "num_input_tokens_seen": 184637875, + "router_z_loss_clip": 0.26708984, + "router_z_loss_mlp": 0.01531982, + "step": 8588, + "time_per_iteration": 3.1066997051239014 + }, + { + "auxiliary_loss_clip": 0.01131433, + "auxiliary_loss_mlp": 0.01036924, + "balance_loss_clip": 1.05333424, + "balance_loss_mlp": 1.02273846, + "epoch": 0.5163986171651886, + "flos": 17417034668160.0, + "grad_norm": 1.9322284736634785, + "language_loss": 0.75560284, + "learning_rate": 1.990847682429185e-06, + "loss": 0.77728641, + "num_input_tokens_seen": 184656125, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.14190674, + "step": 8589, + "time_per_iteration": 2.6002197265625 + }, + { + "auxiliary_loss_clip": 0.01127686, + "auxiliary_loss_mlp": 0.01041166, + "balance_loss_clip": 1.04693019, + "balance_loss_mlp": 1.02827334, + "epoch": 0.5164587404178566, + "flos": 21322679166720.0, + "grad_norm": 1.506635095069331, + "language_loss": 0.67576915, + "learning_rate": 1.990458225001627e-06, + "loss": 0.69745767, + "num_input_tokens_seen": 184675920, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.12884521, + "step": 8590, + "time_per_iteration": 2.476130962371826 + }, + { + "auxiliary_loss_clip": 0.01059457, + "auxiliary_loss_mlp": 0.01014183, + "balance_loss_clip": 1.0323019, + "balance_loss_mlp": 1.01249337, + "epoch": 0.5165188636705246, + "flos": 68057149691520.0, + "grad_norm": 0.8723427331319094, + "language_loss": 0.55897713, + "learning_rate": 1.990068767935895e-06, + "loss": 0.57971346, + "num_input_tokens_seen": 184730520, + "router_z_loss_clip": 0.27197266, + "router_z_loss_mlp": 0.01690674, + "step": 8591, + "time_per_iteration": 3.069755792617798 + }, + { + "auxiliary_loss_clip": 0.0111162, + "auxiliary_loss_mlp": 0.01027347, + "balance_loss_clip": 1.0406673, + "balance_loss_mlp": 1.01556361, + "epoch": 0.5165789869231926, + "flos": 19385657412480.0, + "grad_norm": 1.4658995690369643, + "language_loss": 0.81656599, + "learning_rate": 1.9896793112467566e-06, + "loss": 0.83795559, + "num_input_tokens_seen": 184748340, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.11785889, + "step": 8592, + "time_per_iteration": 2.4954607486724854 + }, + { + "auxiliary_loss_clip": 0.01123603, + "auxiliary_loss_mlp": 0.01029989, + "balance_loss_clip": 1.0490638, + "balance_loss_mlp": 1.01667345, + "epoch": 0.5166391101758605, + "flos": 20960197067520.0, + "grad_norm": 2.128105528477407, + "language_loss": 0.83454955, + "learning_rate": 1.989289854948979e-06, + "loss": 0.85608548, + "num_input_tokens_seen": 184766615, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.13317871, + "step": 8593, + "time_per_iteration": 2.5129287242889404 + }, + { + "auxiliary_loss_clip": 0.01122593, + "auxiliary_loss_mlp": 0.01036383, + "balance_loss_clip": 1.04633629, + "balance_loss_mlp": 1.02294815, + "epoch": 0.5166992334285285, + "flos": 29462407148160.0, + "grad_norm": 1.6486073611174328, + "language_loss": 0.69232953, + "learning_rate": 1.9889003990573314e-06, + "loss": 0.71391928, + "num_input_tokens_seen": 184788075, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.13452148, + "step": 8594, + "time_per_iteration": 2.542762279510498 + }, + { + "auxiliary_loss_clip": 0.01129189, + "auxiliary_loss_mlp": 0.01027542, + "balance_loss_clip": 1.05216348, + "balance_loss_mlp": 1.01500678, + "epoch": 0.5167593566811964, + "flos": 20304360593280.0, + "grad_norm": 1.565571872260983, + "language_loss": 0.772466, + "learning_rate": 1.988510943586582e-06, + "loss": 0.79403329, + "num_input_tokens_seen": 184808710, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.12548828, + "step": 8595, + "time_per_iteration": 2.5033063888549805 + }, + { + "auxiliary_loss_clip": 0.01128912, + "auxiliary_loss_mlp": 0.01037402, + "balance_loss_clip": 1.04892564, + "balance_loss_mlp": 1.02422392, + "epoch": 0.5168194799338645, + "flos": 14611370313600.0, + "grad_norm": 1.6968477265441861, + "language_loss": 0.65464008, + "learning_rate": 1.9881214885514986e-06, + "loss": 0.67630321, + "num_input_tokens_seen": 184826475, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.1317749, + "step": 8596, + "time_per_iteration": 2.4195523262023926 + }, + { + "auxiliary_loss_clip": 0.01122461, + "auxiliary_loss_mlp": 0.01035261, + "balance_loss_clip": 1.04717183, + "balance_loss_mlp": 1.02109289, + "epoch": 0.5168796031865324, + "flos": 25007257411200.0, + "grad_norm": 1.6822553502879332, + "language_loss": 0.75229317, + "learning_rate": 1.9877320339668492e-06, + "loss": 0.77387035, + "num_input_tokens_seen": 184845245, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.14196777, + "step": 8597, + "time_per_iteration": 2.4916982650756836 + }, + { + "auxiliary_loss_clip": 0.0112181, + "auxiliary_loss_mlp": 0.01029715, + "balance_loss_clip": 1.04531646, + "balance_loss_mlp": 1.01703095, + "epoch": 0.5169397264392004, + "flos": 26939969533440.0, + "grad_norm": 1.4978099417525186, + "language_loss": 0.80998832, + "learning_rate": 1.987342579847403e-06, + "loss": 0.83150363, + "num_input_tokens_seen": 184866605, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12677002, + "step": 8598, + "time_per_iteration": 2.472297191619873 + }, + { + "auxiliary_loss_clip": 0.01124651, + "auxiliary_loss_mlp": 0.01046567, + "balance_loss_clip": 1.04746497, + "balance_loss_mlp": 1.0314275, + "epoch": 0.5169998496918683, + "flos": 25407804948480.0, + "grad_norm": 1.5210699980748505, + "language_loss": 0.75403643, + "learning_rate": 1.9869531262079273e-06, + "loss": 0.77574861, + "num_input_tokens_seen": 184886945, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.15148926, + "step": 8599, + "time_per_iteration": 3.881957769393921 + }, + { + "auxiliary_loss_clip": 0.01122066, + "auxiliary_loss_mlp": 0.01031003, + "balance_loss_clip": 1.04820931, + "balance_loss_mlp": 1.01880205, + "epoch": 0.5170599729445363, + "flos": 24680793674880.0, + "grad_norm": 2.2184353773677232, + "language_loss": 0.71901798, + "learning_rate": 1.9865636730631904e-06, + "loss": 0.74054867, + "num_input_tokens_seen": 184905590, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.12207031, + "step": 8600, + "time_per_iteration": 2.4809768199920654 + }, + { + "auxiliary_loss_clip": 0.011242, + "auxiliary_loss_mlp": 0.01030156, + "balance_loss_clip": 1.04958892, + "balance_loss_mlp": 1.01686382, + "epoch": 0.5171200961972042, + "flos": 20994455664000.0, + "grad_norm": 1.416694088372588, + "language_loss": 0.7446565, + "learning_rate": 1.9861742204279602e-06, + "loss": 0.76620007, + "num_input_tokens_seen": 184925555, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.13299561, + "step": 8601, + "time_per_iteration": 2.56071138381958 + }, + { + "auxiliary_loss_clip": 0.01122391, + "auxiliary_loss_mlp": 0.01034877, + "balance_loss_clip": 1.04637527, + "balance_loss_mlp": 1.02153718, + "epoch": 0.5171802194498722, + "flos": 22745639427840.0, + "grad_norm": 2.292625823257813, + "language_loss": 0.83645862, + "learning_rate": 1.9857847683170045e-06, + "loss": 0.85803127, + "num_input_tokens_seen": 184944490, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.13354492, + "step": 8602, + "time_per_iteration": 2.5035011768341064 + }, + { + "auxiliary_loss_clip": 0.01123516, + "auxiliary_loss_mlp": 0.01032115, + "balance_loss_clip": 1.04757047, + "balance_loss_mlp": 1.0187453, + "epoch": 0.5172403427025402, + "flos": 28176732668160.0, + "grad_norm": 1.8387152302763934, + "language_loss": 0.74987495, + "learning_rate": 1.9853953167450926e-06, + "loss": 0.77143121, + "num_input_tokens_seen": 184963190, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.13391113, + "step": 8603, + "time_per_iteration": 2.508793354034424 + }, + { + "auxiliary_loss_clip": 0.01126099, + "auxiliary_loss_mlp": 0.01040027, + "balance_loss_clip": 1.04855299, + "balance_loss_mlp": 1.02627015, + "epoch": 0.5173004659552082, + "flos": 20337829090560.0, + "grad_norm": 2.0178786313640478, + "language_loss": 0.72171229, + "learning_rate": 1.9850058657269915e-06, + "loss": 0.74337351, + "num_input_tokens_seen": 184981220, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.13763428, + "step": 8604, + "time_per_iteration": 2.4320003986358643 + }, + { + "auxiliary_loss_clip": 0.01127047, + "auxiliary_loss_mlp": 0.01042315, + "balance_loss_clip": 1.04425526, + "balance_loss_mlp": 1.02622151, + "epoch": 0.5173605892078762, + "flos": 19063323740160.0, + "grad_norm": 2.477000194950796, + "language_loss": 0.85360599, + "learning_rate": 1.984616415277469e-06, + "loss": 0.87529957, + "num_input_tokens_seen": 184998810, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.16113281, + "step": 8605, + "time_per_iteration": 2.4453136920928955 + }, + { + "auxiliary_loss_clip": 0.01123121, + "auxiliary_loss_mlp": 0.01026795, + "balance_loss_clip": 1.04705358, + "balance_loss_mlp": 1.01423657, + "epoch": 0.5174207124605441, + "flos": 27995168396160.0, + "grad_norm": 1.4364191931973254, + "language_loss": 0.64866483, + "learning_rate": 1.984226965411294e-06, + "loss": 0.67016399, + "num_input_tokens_seen": 185021185, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.12573242, + "step": 8606, + "time_per_iteration": 3.9988393783569336 + }, + { + "auxiliary_loss_clip": 0.01119069, + "auxiliary_loss_mlp": 0.0103097, + "balance_loss_clip": 1.04320705, + "balance_loss_mlp": 1.01811886, + "epoch": 0.5174808357132121, + "flos": 19496657416320.0, + "grad_norm": 1.4653144606125577, + "language_loss": 0.77878869, + "learning_rate": 1.983837516143234e-06, + "loss": 0.80028903, + "num_input_tokens_seen": 185038465, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.12854004, + "step": 8607, + "time_per_iteration": 2.4429569244384766 + }, + { + "auxiliary_loss_clip": 0.01132395, + "auxiliary_loss_mlp": 0.01040743, + "balance_loss_clip": 1.04986, + "balance_loss_mlp": 1.02685547, + "epoch": 0.51754095896588, + "flos": 22784171742720.0, + "grad_norm": 1.7321383477848615, + "language_loss": 0.72036934, + "learning_rate": 1.983448067488057e-06, + "loss": 0.74210078, + "num_input_tokens_seen": 185057340, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.13885498, + "step": 8608, + "time_per_iteration": 2.513123035430908 + }, + { + "auxiliary_loss_clip": 0.01136645, + "auxiliary_loss_mlp": 0.01033846, + "balance_loss_clip": 1.05289698, + "balance_loss_mlp": 1.01994038, + "epoch": 0.5176010822185481, + "flos": 22669257156480.0, + "grad_norm": 7.132869134381313, + "language_loss": 0.86553383, + "learning_rate": 1.983058619460531e-06, + "loss": 0.88723874, + "num_input_tokens_seen": 185074935, + "router_z_loss_clip": 0.83691406, + "router_z_loss_mlp": 0.13897705, + "step": 8609, + "time_per_iteration": 2.459850311279297 + }, + { + "auxiliary_loss_clip": 0.01124736, + "auxiliary_loss_mlp": 0.01030426, + "balance_loss_clip": 1.04951453, + "balance_loss_mlp": 1.01870739, + "epoch": 0.517661205471216, + "flos": 23951196622080.0, + "grad_norm": 1.679949840801013, + "language_loss": 0.74074394, + "learning_rate": 1.9826691720754237e-06, + "loss": 0.7622956, + "num_input_tokens_seen": 185095050, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11706543, + "step": 8610, + "time_per_iteration": 2.5283374786376953 + }, + { + "auxiliary_loss_clip": 0.01136683, + "auxiliary_loss_mlp": 0.01031827, + "balance_loss_clip": 1.05352771, + "balance_loss_mlp": 1.01743853, + "epoch": 0.517721328723884, + "flos": 15596076735360.0, + "grad_norm": 1.899951062547151, + "language_loss": 0.67408991, + "learning_rate": 1.9822797253475034e-06, + "loss": 0.69577503, + "num_input_tokens_seen": 185112275, + "router_z_loss_clip": 0.83105469, + "router_z_loss_mlp": 0.14367676, + "step": 8611, + "time_per_iteration": 2.4184188842773438 + }, + { + "auxiliary_loss_clip": 0.01123765, + "auxiliary_loss_mlp": 0.0103741, + "balance_loss_clip": 1.04763663, + "balance_loss_mlp": 1.02422547, + "epoch": 0.5177814519765519, + "flos": 20960197067520.0, + "grad_norm": 2.440417283807909, + "language_loss": 0.77720958, + "learning_rate": 1.9818902792915373e-06, + "loss": 0.79882133, + "num_input_tokens_seen": 185132165, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.1317749, + "step": 8612, + "time_per_iteration": 2.4928669929504395 + }, + { + "auxiliary_loss_clip": 0.01130604, + "auxiliary_loss_mlp": 0.01040358, + "balance_loss_clip": 1.05160666, + "balance_loss_mlp": 1.02711391, + "epoch": 0.5178415752292199, + "flos": 17967832796160.0, + "grad_norm": 2.2504547642124533, + "language_loss": 0.817577, + "learning_rate": 1.981500833922294e-06, + "loss": 0.83928668, + "num_input_tokens_seen": 185151025, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.13238525, + "step": 8613, + "time_per_iteration": 2.493802547454834 + }, + { + "auxiliary_loss_clip": 0.01137418, + "auxiliary_loss_mlp": 0.01036848, + "balance_loss_clip": 1.05918598, + "balance_loss_mlp": 1.02301371, + "epoch": 0.5179016984818878, + "flos": 17821496787840.0, + "grad_norm": 2.5533393928160946, + "language_loss": 0.66194034, + "learning_rate": 1.981111389254541e-06, + "loss": 0.68368304, + "num_input_tokens_seen": 185168455, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.13842773, + "step": 8614, + "time_per_iteration": 3.822944164276123 + }, + { + "auxiliary_loss_clip": 0.01134337, + "auxiliary_loss_mlp": 0.01032135, + "balance_loss_clip": 1.05606115, + "balance_loss_mlp": 1.01823545, + "epoch": 0.5179618217345558, + "flos": 17820455293440.0, + "grad_norm": 1.8903080248140247, + "language_loss": 0.86691535, + "learning_rate": 1.9807219453030453e-06, + "loss": 0.88858008, + "num_input_tokens_seen": 185184415, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.13897705, + "step": 8615, + "time_per_iteration": 2.545210123062134 + }, + { + "auxiliary_loss_clip": 0.01128038, + "auxiliary_loss_mlp": 0.01037127, + "balance_loss_clip": 1.05233896, + "balance_loss_mlp": 1.02502155, + "epoch": 0.5180219449872238, + "flos": 22522131048960.0, + "grad_norm": 1.5701001562450305, + "language_loss": 0.80940628, + "learning_rate": 1.9803325020825763e-06, + "loss": 0.83105803, + "num_input_tokens_seen": 185202910, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.12103271, + "step": 8616, + "time_per_iteration": 2.6139893531799316 + }, + { + "auxiliary_loss_clip": 0.01135811, + "auxiliary_loss_mlp": 0.01042947, + "balance_loss_clip": 1.05576658, + "balance_loss_mlp": 1.02948821, + "epoch": 0.5180820682398918, + "flos": 23915465568000.0, + "grad_norm": 1.8398386195245302, + "language_loss": 0.74993539, + "learning_rate": 1.9799430596079e-06, + "loss": 0.77172291, + "num_input_tokens_seen": 185223085, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.13464355, + "step": 8617, + "time_per_iteration": 2.562505006790161 + }, + { + "auxiliary_loss_clip": 0.01127819, + "auxiliary_loss_mlp": 0.01034811, + "balance_loss_clip": 1.04981804, + "balance_loss_mlp": 1.0204407, + "epoch": 0.5181421914925598, + "flos": 16979930064000.0, + "grad_norm": 1.830864491089955, + "language_loss": 0.69774258, + "learning_rate": 1.979553617893785e-06, + "loss": 0.71936888, + "num_input_tokens_seen": 185241295, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.14367676, + "step": 8618, + "time_per_iteration": 2.483177661895752 + }, + { + "auxiliary_loss_clip": 0.01057265, + "auxiliary_loss_mlp": 0.01006291, + "balance_loss_clip": 1.03127205, + "balance_loss_mlp": 1.00511122, + "epoch": 0.5182023147452277, + "flos": 66059870872320.0, + "grad_norm": 0.9568890274670689, + "language_loss": 0.67259926, + "learning_rate": 1.979164176954999e-06, + "loss": 0.6932348, + "num_input_tokens_seen": 185298295, + "router_z_loss_clip": 0.26123047, + "router_z_loss_mlp": 0.01179504, + "step": 8619, + "time_per_iteration": 3.0616676807403564 + }, + { + "auxiliary_loss_clip": 0.01120268, + "auxiliary_loss_mlp": 0.01039782, + "balance_loss_clip": 1.04572117, + "balance_loss_mlp": 1.02581096, + "epoch": 0.5182624379978957, + "flos": 18187749815040.0, + "grad_norm": 2.3411517581618093, + "language_loss": 0.80046272, + "learning_rate": 1.97877473680631e-06, + "loss": 0.82206321, + "num_input_tokens_seen": 185317000, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.13970947, + "step": 8620, + "time_per_iteration": 2.4298713207244873 + }, + { + "auxiliary_loss_clip": 0.01120942, + "auxiliary_loss_mlp": 0.0103745, + "balance_loss_clip": 1.04693508, + "balance_loss_mlp": 1.02502203, + "epoch": 0.5183225612505636, + "flos": 14026708638720.0, + "grad_norm": 2.220206676078245, + "language_loss": 0.81972241, + "learning_rate": 1.9783852974624846e-06, + "loss": 0.84130633, + "num_input_tokens_seen": 185331185, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.12432861, + "step": 8621, + "time_per_iteration": 2.4317424297332764 + }, + { + "auxiliary_loss_clip": 0.01132759, + "auxiliary_loss_mlp": 0.01039781, + "balance_loss_clip": 1.05612826, + "balance_loss_mlp": 1.02719808, + "epoch": 0.5183826845032317, + "flos": 23659781581440.0, + "grad_norm": 4.485390739552127, + "language_loss": 0.6515739, + "learning_rate": 1.9779958589382905e-06, + "loss": 0.67329931, + "num_input_tokens_seen": 185348955, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.12585449, + "step": 8622, + "time_per_iteration": 2.4768569469451904 + }, + { + "auxiliary_loss_clip": 0.01128061, + "auxiliary_loss_mlp": 0.01041783, + "balance_loss_clip": 1.04864192, + "balance_loss_mlp": 1.02813947, + "epoch": 0.5184428077558996, + "flos": 15888605097600.0, + "grad_norm": 1.8386242896280325, + "language_loss": 0.6086179, + "learning_rate": 1.977606421248497e-06, + "loss": 0.63031632, + "num_input_tokens_seen": 185367330, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.13647461, + "step": 8623, + "time_per_iteration": 2.481463670730591 + }, + { + "auxiliary_loss_clip": 0.01124786, + "auxiliary_loss_mlp": 0.01041222, + "balance_loss_clip": 1.04526877, + "balance_loss_mlp": 1.02788866, + "epoch": 0.5185029310085676, + "flos": 21030833162880.0, + "grad_norm": 1.7611759520388013, + "language_loss": 0.76664442, + "learning_rate": 1.9772169844078685e-06, + "loss": 0.78830451, + "num_input_tokens_seen": 185385060, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.13342285, + "step": 8624, + "time_per_iteration": 2.525062322616577 + }, + { + "auxiliary_loss_clip": 0.01125689, + "auxiliary_loss_mlp": 0.0103741, + "balance_loss_clip": 1.04912829, + "balance_loss_mlp": 1.02526903, + "epoch": 0.5185630542612355, + "flos": 26542690133760.0, + "grad_norm": 2.2309734241123826, + "language_loss": 0.71522236, + "learning_rate": 1.9768275484311756e-06, + "loss": 0.73685336, + "num_input_tokens_seen": 185403745, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.121521, + "step": 8625, + "time_per_iteration": 2.558987855911255 + }, + { + "auxiliary_loss_clip": 0.01124551, + "auxiliary_loss_mlp": 0.01040241, + "balance_loss_clip": 1.04696155, + "balance_loss_mlp": 1.02762866, + "epoch": 0.5186231775139035, + "flos": 20668422890880.0, + "grad_norm": 1.7644703886721402, + "language_loss": 0.67621332, + "learning_rate": 1.976438113333184e-06, + "loss": 0.69786125, + "num_input_tokens_seen": 185422620, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.1260376, + "step": 8626, + "time_per_iteration": 2.4730136394500732 + }, + { + "auxiliary_loss_clip": 0.01118451, + "auxiliary_loss_mlp": 0.01031455, + "balance_loss_clip": 1.0428586, + "balance_loss_mlp": 1.01858592, + "epoch": 0.5186833007665714, + "flos": 20885502735360.0, + "grad_norm": 2.119605556333518, + "language_loss": 0.70506072, + "learning_rate": 1.9760486791286612e-06, + "loss": 0.72655976, + "num_input_tokens_seen": 185439380, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12866211, + "step": 8627, + "time_per_iteration": 3.8884572982788086 + }, + { + "auxiliary_loss_clip": 0.01121737, + "auxiliary_loss_mlp": 0.01044815, + "balance_loss_clip": 1.04245925, + "balance_loss_mlp": 1.03175545, + "epoch": 0.5187434240192395, + "flos": 20886903365760.0, + "grad_norm": 2.3140503827652807, + "language_loss": 0.73254192, + "learning_rate": 1.9756592458323753e-06, + "loss": 0.75420749, + "num_input_tokens_seen": 185458830, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.1307373, + "step": 8628, + "time_per_iteration": 2.4691240787506104 + }, + { + "auxiliary_loss_clip": 0.01130415, + "auxiliary_loss_mlp": 0.01038613, + "balance_loss_clip": 1.05432904, + "balance_loss_mlp": 1.02578592, + "epoch": 0.5188035472719074, + "flos": 19859929614720.0, + "grad_norm": 1.8910048345649073, + "language_loss": 0.7785033, + "learning_rate": 1.9752698134590927e-06, + "loss": 0.80019355, + "num_input_tokens_seen": 185477270, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.1282959, + "step": 8629, + "time_per_iteration": 2.549546241760254 + }, + { + "auxiliary_loss_clip": 0.01124958, + "auxiliary_loss_mlp": 0.01036015, + "balance_loss_clip": 1.0469594, + "balance_loss_mlp": 1.02203763, + "epoch": 0.5188636705245754, + "flos": 21138313633920.0, + "grad_norm": 2.015699830848579, + "language_loss": 0.74533629, + "learning_rate": 1.9748803820235815e-06, + "loss": 0.76694596, + "num_input_tokens_seen": 185495795, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.13983154, + "step": 8630, + "time_per_iteration": 2.4832417964935303 + }, + { + "auxiliary_loss_clip": 0.01121688, + "auxiliary_loss_mlp": 0.010402, + "balance_loss_clip": 1.04489315, + "balance_loss_mlp": 1.02627039, + "epoch": 0.5189237937772434, + "flos": 22419786222720.0, + "grad_norm": 2.481485901568265, + "language_loss": 0.80235875, + "learning_rate": 1.9744909515406093e-06, + "loss": 0.82397765, + "num_input_tokens_seen": 185514885, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.13922119, + "step": 8631, + "time_per_iteration": 2.5005338191986084 + }, + { + "auxiliary_loss_clip": 0.01142013, + "auxiliary_loss_mlp": 0.01034132, + "balance_loss_clip": 1.06082523, + "balance_loss_mlp": 1.01980948, + "epoch": 0.5189839170299113, + "flos": 25446696399360.0, + "grad_norm": 1.6288093732204, + "language_loss": 0.74820209, + "learning_rate": 1.974101522024942e-06, + "loss": 0.7699635, + "num_input_tokens_seen": 185537155, + "router_z_loss_clip": 0.81201172, + "router_z_loss_mlp": 0.14337158, + "step": 8632, + "time_per_iteration": 2.5248749256134033 + }, + { + "auxiliary_loss_clip": 0.01118691, + "auxiliary_loss_mlp": 0.01036759, + "balance_loss_clip": 1.0453229, + "balance_loss_mlp": 1.0241046, + "epoch": 0.5190440402825793, + "flos": 18587722734720.0, + "grad_norm": 1.9756620942255099, + "language_loss": 0.78824133, + "learning_rate": 1.9737120934913477e-06, + "loss": 0.8097958, + "num_input_tokens_seen": 185555520, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.12664795, + "step": 8633, + "time_per_iteration": 2.4745519161224365 + }, + { + "auxiliary_loss_clip": 0.01124379, + "auxiliary_loss_mlp": 0.01038054, + "balance_loss_clip": 1.04715669, + "balance_loss_mlp": 1.02504802, + "epoch": 0.5191041635352472, + "flos": 21908633731200.0, + "grad_norm": 2.0478384217649954, + "language_loss": 0.80356872, + "learning_rate": 1.9733226659545936e-06, + "loss": 0.82519305, + "num_input_tokens_seen": 185573855, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.13006592, + "step": 8634, + "time_per_iteration": 2.4525928497314453 + }, + { + "auxiliary_loss_clip": 0.01123871, + "auxiliary_loss_mlp": 0.01044983, + "balance_loss_clip": 1.04636955, + "balance_loss_mlp": 1.0318464, + "epoch": 0.5191642867879153, + "flos": 27527971173120.0, + "grad_norm": 1.7763961817501686, + "language_loss": 0.69228494, + "learning_rate": 1.9729332394294467e-06, + "loss": 0.71397352, + "num_input_tokens_seen": 185595145, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.13122559, + "step": 8635, + "time_per_iteration": 2.521798849105835 + }, + { + "auxiliary_loss_clip": 0.01125846, + "auxiliary_loss_mlp": 0.01035491, + "balance_loss_clip": 1.04666007, + "balance_loss_mlp": 1.02265203, + "epoch": 0.5192244100405832, + "flos": 15705999331200.0, + "grad_norm": 1.767688486123808, + "language_loss": 0.77460372, + "learning_rate": 1.9725438139306742e-06, + "loss": 0.79621708, + "num_input_tokens_seen": 185613320, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.12860107, + "step": 8636, + "time_per_iteration": 2.4638960361480713 + }, + { + "auxiliary_loss_clip": 0.01121339, + "auxiliary_loss_mlp": 0.01036543, + "balance_loss_clip": 1.04416108, + "balance_loss_mlp": 1.02280998, + "epoch": 0.5192845332932512, + "flos": 12057080313600.0, + "grad_norm": 2.203911654696178, + "language_loss": 0.7199378, + "learning_rate": 1.9721543894730425e-06, + "loss": 0.74151659, + "num_input_tokens_seen": 185630730, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.13739014, + "step": 8637, + "time_per_iteration": 2.4927492141723633 + }, + { + "auxiliary_loss_clip": 0.01123353, + "auxiliary_loss_mlp": 0.01035878, + "balance_loss_clip": 1.0463618, + "balance_loss_mlp": 1.02218103, + "epoch": 0.5193446565459191, + "flos": 18953185662720.0, + "grad_norm": 2.2886425615541515, + "language_loss": 0.75860065, + "learning_rate": 1.9717649660713194e-06, + "loss": 0.78019297, + "num_input_tokens_seen": 185648515, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.13690186, + "step": 8638, + "time_per_iteration": 2.4230260848999023 + }, + { + "auxiliary_loss_clip": 0.0112623, + "auxiliary_loss_mlp": 0.01031432, + "balance_loss_clip": 1.05054331, + "balance_loss_mlp": 1.01828969, + "epoch": 0.5194047797985871, + "flos": 20374960775040.0, + "grad_norm": 1.8319094806745024, + "language_loss": 0.74950063, + "learning_rate": 1.971375543740272e-06, + "loss": 0.77107716, + "num_input_tokens_seen": 185665220, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.13140869, + "step": 8639, + "time_per_iteration": 2.468007802963257 + }, + { + "auxiliary_loss_clip": 0.01132604, + "auxiliary_loss_mlp": 0.01034334, + "balance_loss_clip": 1.05559099, + "balance_loss_mlp": 1.02052367, + "epoch": 0.519464903051255, + "flos": 24353001135360.0, + "grad_norm": 1.9371746160191863, + "language_loss": 0.77503347, + "learning_rate": 1.9709861224946665e-06, + "loss": 0.79670286, + "num_input_tokens_seen": 185683750, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.13812256, + "step": 8640, + "time_per_iteration": 2.490781784057617 + }, + { + "auxiliary_loss_clip": 0.01119821, + "auxiliary_loss_mlp": 0.01039871, + "balance_loss_clip": 1.04564929, + "balance_loss_mlp": 1.02590561, + "epoch": 0.519525026303923, + "flos": 14061829161600.0, + "grad_norm": 1.938415069208449, + "language_loss": 0.66264105, + "learning_rate": 1.97059670234927e-06, + "loss": 0.68423802, + "num_input_tokens_seen": 185700625, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.13964844, + "step": 8641, + "time_per_iteration": 2.5151727199554443 + }, + { + "auxiliary_loss_clip": 0.01125277, + "auxiliary_loss_mlp": 0.01037359, + "balance_loss_clip": 1.04850721, + "balance_loss_mlp": 1.02423453, + "epoch": 0.519585149556591, + "flos": 28835873193600.0, + "grad_norm": 2.2323835492644184, + "language_loss": 0.76336825, + "learning_rate": 1.97020728331885e-06, + "loss": 0.7849946, + "num_input_tokens_seen": 185721155, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.13140869, + "step": 8642, + "time_per_iteration": 2.5536956787109375 + }, + { + "auxiliary_loss_clip": 0.01122974, + "auxiliary_loss_mlp": 0.01040165, + "balance_loss_clip": 1.04619968, + "balance_loss_mlp": 1.02634823, + "epoch": 0.519645272809259, + "flos": 25373007648000.0, + "grad_norm": 1.5199344314091112, + "language_loss": 0.83243191, + "learning_rate": 1.9698178654181726e-06, + "loss": 0.85406321, + "num_input_tokens_seen": 185740990, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.13830566, + "step": 8643, + "time_per_iteration": 3.932206630706787 + }, + { + "auxiliary_loss_clip": 0.01128617, + "auxiliary_loss_mlp": 0.0104403, + "balance_loss_clip": 1.04979205, + "balance_loss_mlp": 1.0302074, + "epoch": 0.519705396061927, + "flos": 25372863993600.0, + "grad_norm": 1.553478729890264, + "language_loss": 0.70169139, + "learning_rate": 1.969428448662004e-06, + "loss": 0.72341788, + "num_input_tokens_seen": 185762235, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.13806152, + "step": 8644, + "time_per_iteration": 2.4766366481781006 + }, + { + "auxiliary_loss_clip": 0.01129197, + "auxiliary_loss_mlp": 0.01033181, + "balance_loss_clip": 1.04972732, + "balance_loss_mlp": 1.01980567, + "epoch": 0.5197655193145949, + "flos": 28476228268800.0, + "grad_norm": 1.6855341255941099, + "language_loss": 0.79962468, + "learning_rate": 1.9690390330651133e-06, + "loss": 0.82124847, + "num_input_tokens_seen": 185783415, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.13391113, + "step": 8645, + "time_per_iteration": 2.488609790802002 + }, + { + "auxiliary_loss_clip": 0.01122474, + "auxiliary_loss_mlp": 0.01028738, + "balance_loss_clip": 1.04489827, + "balance_loss_mlp": 1.0154165, + "epoch": 0.5198256425672629, + "flos": 20009138711040.0, + "grad_norm": 1.9643956863111662, + "language_loss": 0.77962434, + "learning_rate": 1.968649618642264e-06, + "loss": 0.80113643, + "num_input_tokens_seen": 185801345, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.13330078, + "step": 8646, + "time_per_iteration": 2.4982471466064453 + }, + { + "auxiliary_loss_clip": 0.01129858, + "auxiliary_loss_mlp": 0.01035537, + "balance_loss_clip": 1.05176628, + "balance_loss_mlp": 1.02235889, + "epoch": 0.5198857658199308, + "flos": 19828867328640.0, + "grad_norm": 1.987204565354905, + "language_loss": 0.66461027, + "learning_rate": 1.9682602054082252e-06, + "loss": 0.68626416, + "num_input_tokens_seen": 185820815, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.13183594, + "step": 8647, + "time_per_iteration": 2.430227279663086 + }, + { + "auxiliary_loss_clip": 0.01130358, + "auxiliary_loss_mlp": 0.01035385, + "balance_loss_clip": 1.04923582, + "balance_loss_mlp": 1.01962543, + "epoch": 0.5199458890725989, + "flos": 24461918150400.0, + "grad_norm": 1.7382930408376585, + "language_loss": 0.71583879, + "learning_rate": 1.967870793377763e-06, + "loss": 0.7374962, + "num_input_tokens_seen": 185841450, + "router_z_loss_clip": 0.81201172, + "router_z_loss_mlp": 0.1574707, + "step": 8648, + "time_per_iteration": 2.5360734462738037 + }, + { + "auxiliary_loss_clip": 0.01138523, + "auxiliary_loss_mlp": 0.01032761, + "balance_loss_clip": 1.05815947, + "balance_loss_mlp": 1.01674533, + "epoch": 0.5200060123252668, + "flos": 23404779953280.0, + "grad_norm": 4.090388274062969, + "language_loss": 0.64216077, + "learning_rate": 1.967481382565642e-06, + "loss": 0.66387355, + "num_input_tokens_seen": 185859935, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.16009521, + "step": 8649, + "time_per_iteration": 2.461254358291626 + }, + { + "auxiliary_loss_clip": 0.01137237, + "auxiliary_loss_mlp": 0.01040393, + "balance_loss_clip": 1.05394089, + "balance_loss_mlp": 1.02458608, + "epoch": 0.5200661355779348, + "flos": 17201355454080.0, + "grad_norm": 1.9767764336878062, + "language_loss": 0.70597088, + "learning_rate": 1.9670919729866315e-06, + "loss": 0.7277472, + "num_input_tokens_seen": 185876795, + "router_z_loss_clip": 0.83300781, + "router_z_loss_mlp": 0.15808105, + "step": 8650, + "time_per_iteration": 3.9305551052093506 + }, + { + "auxiliary_loss_clip": 0.0113259, + "auxiliary_loss_mlp": 0.01030466, + "balance_loss_clip": 1.05532849, + "balance_loss_mlp": 1.0168345, + "epoch": 0.5201262588306027, + "flos": 18515075477760.0, + "grad_norm": 3.234528404367746, + "language_loss": 0.7769562, + "learning_rate": 1.966702564655496e-06, + "loss": 0.79858685, + "num_input_tokens_seen": 185895570, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.13635254, + "step": 8651, + "time_per_iteration": 2.4046895503997803 + }, + { + "auxiliary_loss_clip": 0.01130579, + "auxiliary_loss_mlp": 0.01037309, + "balance_loss_clip": 1.05112362, + "balance_loss_mlp": 1.02249146, + "epoch": 0.5201863820832707, + "flos": 18619395552000.0, + "grad_norm": 1.6259719702946185, + "language_loss": 0.78310108, + "learning_rate": 1.966313157587003e-06, + "loss": 0.80478001, + "num_input_tokens_seen": 185913700, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.14831543, + "step": 8652, + "time_per_iteration": 2.475612163543701 + }, + { + "auxiliary_loss_clip": 0.01133593, + "auxiliary_loss_mlp": 0.01038499, + "balance_loss_clip": 1.05212319, + "balance_loss_mlp": 1.02078474, + "epoch": 0.5202465053359386, + "flos": 22857142222080.0, + "grad_norm": 1.8046187772795426, + "language_loss": 0.70189357, + "learning_rate": 1.9659237517959187e-06, + "loss": 0.72361451, + "num_input_tokens_seen": 185932460, + "router_z_loss_clip": 0.81494141, + "router_z_loss_mlp": 0.17712402, + "step": 8653, + "time_per_iteration": 2.4500136375427246 + }, + { + "auxiliary_loss_clip": 0.01126405, + "auxiliary_loss_mlp": 0.01044495, + "balance_loss_clip": 1.04529369, + "balance_loss_mlp": 1.02972484, + "epoch": 0.5203066285886067, + "flos": 21981532383360.0, + "grad_norm": 2.0799077839890203, + "language_loss": 0.78411114, + "learning_rate": 1.965534347297008e-06, + "loss": 0.80582011, + "num_input_tokens_seen": 185952030, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.14758301, + "step": 8654, + "time_per_iteration": 2.5055158138275146 + }, + { + "auxiliary_loss_clip": 0.01130981, + "auxiliary_loss_mlp": 0.01041902, + "balance_loss_clip": 1.04790699, + "balance_loss_mlp": 1.02658987, + "epoch": 0.5203667518412746, + "flos": 20233329448320.0, + "grad_norm": 1.8360711949210122, + "language_loss": 0.84375298, + "learning_rate": 1.9651449441050393e-06, + "loss": 0.86548185, + "num_input_tokens_seen": 185973130, + "router_z_loss_clip": 0.83056641, + "router_z_loss_mlp": 0.15325928, + "step": 8655, + "time_per_iteration": 2.4741899967193604 + }, + { + "auxiliary_loss_clip": 0.01123979, + "auxiliary_loss_mlp": 0.0103452, + "balance_loss_clip": 1.04784465, + "balance_loss_mlp": 1.02053618, + "epoch": 0.5204268750939426, + "flos": 15705460627200.0, + "grad_norm": 2.2825982006205163, + "language_loss": 0.66069341, + "learning_rate": 1.9647555422347777e-06, + "loss": 0.68227839, + "num_input_tokens_seen": 185990200, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.13995361, + "step": 8656, + "time_per_iteration": 2.545018196105957 + }, + { + "auxiliary_loss_clip": 0.01131782, + "auxiliary_loss_mlp": 0.0104361, + "balance_loss_clip": 1.05043256, + "balance_loss_mlp": 1.02947164, + "epoch": 0.5204869983466105, + "flos": 27449469999360.0, + "grad_norm": 1.7381440276487647, + "language_loss": 0.73369074, + "learning_rate": 1.9643661417009893e-06, + "loss": 0.75544465, + "num_input_tokens_seen": 186009880, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.14141846, + "step": 8657, + "time_per_iteration": 3.9385147094726562 + }, + { + "auxiliary_loss_clip": 0.01125494, + "auxiliary_loss_mlp": 0.01049768, + "balance_loss_clip": 1.04685652, + "balance_loss_mlp": 1.03166032, + "epoch": 0.5205471215992785, + "flos": 20595452411520.0, + "grad_norm": 2.5805096621469805, + "language_loss": 0.71813905, + "learning_rate": 1.9639767425184408e-06, + "loss": 0.73989165, + "num_input_tokens_seen": 186026680, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.18103027, + "step": 8658, + "time_per_iteration": 2.422905445098877 + }, + { + "auxiliary_loss_clip": 0.01129241, + "auxiliary_loss_mlp": 0.01035885, + "balance_loss_clip": 1.04781961, + "balance_loss_mlp": 1.0215503, + "epoch": 0.5206072448519465, + "flos": 22127904305280.0, + "grad_norm": 1.7259011853984765, + "language_loss": 0.83399189, + "learning_rate": 1.963587344701897e-06, + "loss": 0.85564315, + "num_input_tokens_seen": 186046920, + "router_z_loss_clip": 0.81347656, + "router_z_loss_mlp": 0.14331055, + "step": 8659, + "time_per_iteration": 2.5226333141326904 + }, + { + "auxiliary_loss_clip": 0.01139596, + "auxiliary_loss_mlp": 0.01044215, + "balance_loss_clip": 1.05227256, + "balance_loss_mlp": 1.02805007, + "epoch": 0.5206673681046144, + "flos": 18330422636160.0, + "grad_norm": 1.8732008331692351, + "language_loss": 0.75713444, + "learning_rate": 1.9631979482661253e-06, + "loss": 0.77897251, + "num_input_tokens_seen": 186062090, + "router_z_loss_clip": 0.87402344, + "router_z_loss_mlp": 0.16162109, + "step": 8660, + "time_per_iteration": 2.3899595737457275 + }, + { + "auxiliary_loss_clip": 0.01128392, + "auxiliary_loss_mlp": 0.01029421, + "balance_loss_clip": 1.05168641, + "balance_loss_mlp": 1.01642728, + "epoch": 0.5207274913572825, + "flos": 20230240878720.0, + "grad_norm": 2.0598133389392066, + "language_loss": 0.77515137, + "learning_rate": 1.9628085532258906e-06, + "loss": 0.79672945, + "num_input_tokens_seen": 186081135, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.12988281, + "step": 8661, + "time_per_iteration": 2.4612035751342773 + }, + { + "auxiliary_loss_clip": 0.0115128, + "auxiliary_loss_mlp": 0.01032029, + "balance_loss_clip": 1.06522083, + "balance_loss_mlp": 1.01835608, + "epoch": 0.5207876146099504, + "flos": 22127042378880.0, + "grad_norm": 1.6893019945821104, + "language_loss": 0.69946456, + "learning_rate": 1.9624191595959603e-06, + "loss": 0.72129768, + "num_input_tokens_seen": 186099700, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.13665771, + "step": 8662, + "time_per_iteration": 2.4740381240844727 + }, + { + "auxiliary_loss_clip": 0.01132877, + "auxiliary_loss_mlp": 0.01035502, + "balance_loss_clip": 1.05649161, + "balance_loss_mlp": 1.02130449, + "epoch": 0.5208477378626184, + "flos": 23878908501120.0, + "grad_norm": 1.6000978516280737, + "language_loss": 0.69308639, + "learning_rate": 1.962029767391098e-06, + "loss": 0.71477008, + "num_input_tokens_seen": 186119740, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.1418457, + "step": 8663, + "time_per_iteration": 2.514061689376831 + }, + { + "auxiliary_loss_clip": 0.01143995, + "auxiliary_loss_mlp": 0.01034604, + "balance_loss_clip": 1.06003332, + "balance_loss_mlp": 1.02017343, + "epoch": 0.5209078611152863, + "flos": 20961525870720.0, + "grad_norm": 1.5895667035960406, + "language_loss": 0.76949018, + "learning_rate": 1.961640376626072e-06, + "loss": 0.79127622, + "num_input_tokens_seen": 186140645, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.14422607, + "step": 8664, + "time_per_iteration": 2.588514566421509 + }, + { + "auxiliary_loss_clip": 0.01129971, + "auxiliary_loss_mlp": 0.01040262, + "balance_loss_clip": 1.0500474, + "balance_loss_mlp": 1.02646995, + "epoch": 0.5209679843679543, + "flos": 20667740532480.0, + "grad_norm": 2.308765491844958, + "language_loss": 0.76586175, + "learning_rate": 1.961250987315646e-06, + "loss": 0.78756416, + "num_input_tokens_seen": 186160130, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.13812256, + "step": 8665, + "time_per_iteration": 2.5970826148986816 + }, + { + "auxiliary_loss_clip": 0.01139958, + "auxiliary_loss_mlp": 0.01035815, + "balance_loss_clip": 1.06009924, + "balance_loss_mlp": 1.02285731, + "epoch": 0.5210281076206222, + "flos": 20227295963520.0, + "grad_norm": 1.6656634679504212, + "language_loss": 0.71978897, + "learning_rate": 1.960861599474586e-06, + "loss": 0.74154669, + "num_input_tokens_seen": 186179485, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.12963867, + "step": 8666, + "time_per_iteration": 2.481834650039673 + }, + { + "auxiliary_loss_clip": 0.01141013, + "auxiliary_loss_mlp": 0.01042287, + "balance_loss_clip": 1.05346119, + "balance_loss_mlp": 1.02526391, + "epoch": 0.5210882308732903, + "flos": 16069989801600.0, + "grad_norm": 1.9825029949167083, + "language_loss": 0.68798441, + "learning_rate": 1.9604722131176592e-06, + "loss": 0.70981741, + "num_input_tokens_seen": 186197140, + "router_z_loss_clip": 0.87451172, + "router_z_loss_mlp": 0.17028809, + "step": 8667, + "time_per_iteration": 2.542625904083252 + }, + { + "auxiliary_loss_clip": 0.01130453, + "auxiliary_loss_mlp": 0.010316, + "balance_loss_clip": 1.0547812, + "balance_loss_mlp": 1.01783156, + "epoch": 0.5211483541259582, + "flos": 24825298089600.0, + "grad_norm": 2.0653868105391804, + "language_loss": 0.812837, + "learning_rate": 1.960082828259629e-06, + "loss": 0.83445752, + "num_input_tokens_seen": 186216800, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.13769531, + "step": 8668, + "time_per_iteration": 2.461416721343994 + }, + { + "auxiliary_loss_clip": 0.0114344, + "auxiliary_loss_mlp": 0.01031619, + "balance_loss_clip": 1.0615871, + "balance_loss_mlp": 1.01798129, + "epoch": 0.5212084773786262, + "flos": 20370651143040.0, + "grad_norm": 2.0391962596960234, + "language_loss": 0.63999712, + "learning_rate": 1.9596934449152623e-06, + "loss": 0.66174775, + "num_input_tokens_seen": 186235320, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.13623047, + "step": 8669, + "time_per_iteration": 2.508157730102539 + }, + { + "auxiliary_loss_clip": 0.01138024, + "auxiliary_loss_mlp": 0.01036405, + "balance_loss_clip": 1.0593245, + "balance_loss_mlp": 1.02264857, + "epoch": 0.5212686006312941, + "flos": 23145468693120.0, + "grad_norm": 1.4591353699709002, + "language_loss": 0.66681468, + "learning_rate": 1.959304063099325e-06, + "loss": 0.68855906, + "num_input_tokens_seen": 186254460, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.13757324, + "step": 8670, + "time_per_iteration": 3.915998697280884 + }, + { + "auxiliary_loss_clip": 0.01121338, + "auxiliary_loss_mlp": 0.01037083, + "balance_loss_clip": 1.04537129, + "balance_loss_mlp": 1.02395785, + "epoch": 0.5213287238839621, + "flos": 27774030314880.0, + "grad_norm": 2.5139343392456097, + "language_loss": 0.76349163, + "learning_rate": 1.9589146828265806e-06, + "loss": 0.78507584, + "num_input_tokens_seen": 186269465, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.13128662, + "step": 8671, + "time_per_iteration": 2.499135971069336 + }, + { + "auxiliary_loss_clip": 0.01144323, + "auxiliary_loss_mlp": 0.01039959, + "balance_loss_clip": 1.06386161, + "balance_loss_mlp": 1.02505839, + "epoch": 0.5213888471366301, + "flos": 19937676602880.0, + "grad_norm": 1.8909193969421543, + "language_loss": 0.77943718, + "learning_rate": 1.958525304111796e-06, + "loss": 0.80128002, + "num_input_tokens_seen": 186288660, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.14904785, + "step": 8672, + "time_per_iteration": 2.4212238788604736 + }, + { + "auxiliary_loss_clip": 0.01122366, + "auxiliary_loss_mlp": 0.01033372, + "balance_loss_clip": 1.04529417, + "balance_loss_mlp": 1.02071762, + "epoch": 0.521448970389298, + "flos": 16982731324800.0, + "grad_norm": 1.8875768964296422, + "language_loss": 0.72181803, + "learning_rate": 1.958135926969736e-06, + "loss": 0.74337542, + "num_input_tokens_seen": 186305760, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12670898, + "step": 8673, + "time_per_iteration": 2.4664926528930664 + }, + { + "auxiliary_loss_clip": 0.01132915, + "auxiliary_loss_mlp": 0.01033595, + "balance_loss_clip": 1.05418563, + "balance_loss_mlp": 1.01961207, + "epoch": 0.5215090936419661, + "flos": 18989706816000.0, + "grad_norm": 1.6005193638779422, + "language_loss": 0.74431169, + "learning_rate": 1.957746551415166e-06, + "loss": 0.76597679, + "num_input_tokens_seen": 186324135, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.13977051, + "step": 8674, + "time_per_iteration": 2.414137601852417 + }, + { + "auxiliary_loss_clip": 0.01132091, + "auxiliary_loss_mlp": 0.01035867, + "balance_loss_clip": 1.0496459, + "balance_loss_mlp": 1.02016735, + "epoch": 0.521569216894634, + "flos": 16143427157760.0, + "grad_norm": 2.0131893377484045, + "language_loss": 0.8588711, + "learning_rate": 1.9573571774628506e-06, + "loss": 0.88055068, + "num_input_tokens_seen": 186340205, + "router_z_loss_clip": 0.82421875, + "router_z_loss_mlp": 0.15698242, + "step": 8675, + "time_per_iteration": 2.4383368492126465 + }, + { + "auxiliary_loss_clip": 0.01062485, + "auxiliary_loss_mlp": 0.01003664, + "balance_loss_clip": 1.03593659, + "balance_loss_mlp": 1.00218773, + "epoch": 0.521629340147302, + "flos": 57579493282560.0, + "grad_norm": 0.884001999365444, + "language_loss": 0.63168395, + "learning_rate": 1.9569678051275556e-06, + "loss": 0.65234536, + "num_input_tokens_seen": 186396940, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.01478577, + "step": 8676, + "time_per_iteration": 3.0329720973968506 + }, + { + "auxiliary_loss_clip": 0.01124591, + "auxiliary_loss_mlp": 0.01034889, + "balance_loss_clip": 1.04778326, + "balance_loss_mlp": 1.02135313, + "epoch": 0.5216894633999699, + "flos": 26796901662720.0, + "grad_norm": 1.843089929642233, + "language_loss": 0.69015759, + "learning_rate": 1.956578434424046e-06, + "loss": 0.71175236, + "num_input_tokens_seen": 186418680, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.13519287, + "step": 8677, + "time_per_iteration": 2.528770685195923 + }, + { + "auxiliary_loss_clip": 0.01127591, + "auxiliary_loss_mlp": 0.0103379, + "balance_loss_clip": 1.05007577, + "balance_loss_mlp": 1.02007508, + "epoch": 0.5217495866526379, + "flos": 26358719650560.0, + "grad_norm": 1.9487161080883861, + "language_loss": 0.65253687, + "learning_rate": 1.956189065367086e-06, + "loss": 0.67415071, + "num_input_tokens_seen": 186438265, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.13702393, + "step": 8678, + "time_per_iteration": 2.512948751449585 + }, + { + "auxiliary_loss_clip": 0.01134404, + "auxiliary_loss_mlp": 0.01033699, + "balance_loss_clip": 1.05308032, + "balance_loss_mlp": 1.01871419, + "epoch": 0.5218097099053058, + "flos": 23584009841280.0, + "grad_norm": 2.5205336694679286, + "language_loss": 0.69024682, + "learning_rate": 1.9557996979714414e-06, + "loss": 0.71192783, + "num_input_tokens_seen": 186456870, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.14984131, + "step": 8679, + "time_per_iteration": 2.4664018154144287 + }, + { + "auxiliary_loss_clip": 0.01125259, + "auxiliary_loss_mlp": 0.01037788, + "balance_loss_clip": 1.04788041, + "balance_loss_mlp": 1.0239718, + "epoch": 0.5218698331579739, + "flos": 18077396256000.0, + "grad_norm": 1.6036784891039018, + "language_loss": 0.66434544, + "learning_rate": 1.9554103322518764e-06, + "loss": 0.68597591, + "num_input_tokens_seen": 186476425, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.13806152, + "step": 8680, + "time_per_iteration": 2.4893720149993896 + }, + { + "auxiliary_loss_clip": 0.01132813, + "auxiliary_loss_mlp": 0.0104024, + "balance_loss_clip": 1.05065203, + "balance_loss_mlp": 1.02544653, + "epoch": 0.5219299564106418, + "flos": 19281121856640.0, + "grad_norm": 1.8909201323148426, + "language_loss": 0.82905596, + "learning_rate": 1.955020968223156e-06, + "loss": 0.85078651, + "num_input_tokens_seen": 186492555, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.14801025, + "step": 8681, + "time_per_iteration": 2.4384334087371826 + }, + { + "auxiliary_loss_clip": 0.01127318, + "auxiliary_loss_mlp": 0.01033005, + "balance_loss_clip": 1.04837847, + "balance_loss_mlp": 1.01919448, + "epoch": 0.5219900796633098, + "flos": 26651355753600.0, + "grad_norm": 1.8368847669568347, + "language_loss": 0.77597523, + "learning_rate": 1.9546316059000454e-06, + "loss": 0.79757845, + "num_input_tokens_seen": 186513190, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.13812256, + "step": 8682, + "time_per_iteration": 2.5057668685913086 + }, + { + "auxiliary_loss_clip": 0.01129113, + "auxiliary_loss_mlp": 0.01037669, + "balance_loss_clip": 1.05155611, + "balance_loss_mlp": 1.02502656, + "epoch": 0.5220502029159777, + "flos": 34312717382400.0, + "grad_norm": 2.0508674949269863, + "language_loss": 0.6932655, + "learning_rate": 1.9542422452973082e-06, + "loss": 0.71493328, + "num_input_tokens_seen": 186534830, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.12652588, + "step": 8683, + "time_per_iteration": 2.5927748680114746 + }, + { + "auxiliary_loss_clip": 0.0113943, + "auxiliary_loss_mlp": 0.0103949, + "balance_loss_clip": 1.05878055, + "balance_loss_mlp": 1.02546465, + "epoch": 0.5221103261686457, + "flos": 22156488552960.0, + "grad_norm": 2.6807158934978137, + "language_loss": 0.76169407, + "learning_rate": 1.9538528864297104e-06, + "loss": 0.78348327, + "num_input_tokens_seen": 186554390, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.14025879, + "step": 8684, + "time_per_iteration": 2.543848991394043 + }, + { + "auxiliary_loss_clip": 0.01128695, + "auxiliary_loss_mlp": 0.01032463, + "balance_loss_clip": 1.05301559, + "balance_loss_mlp": 1.01951647, + "epoch": 0.5221704494213137, + "flos": 19208402772480.0, + "grad_norm": 1.6664821454591692, + "language_loss": 0.75827265, + "learning_rate": 1.9534635293120153e-06, + "loss": 0.77988416, + "num_input_tokens_seen": 186572360, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.1295166, + "step": 8685, + "time_per_iteration": 2.449443817138672 + }, + { + "auxiliary_loss_clip": 0.01135147, + "auxiliary_loss_mlp": 0.01041023, + "balance_loss_clip": 1.05615151, + "balance_loss_mlp": 1.02771282, + "epoch": 0.5222305726739817, + "flos": 19354056422400.0, + "grad_norm": 2.0784349939349265, + "language_loss": 0.80974191, + "learning_rate": 1.9530741739589876e-06, + "loss": 0.83150363, + "num_input_tokens_seen": 186590655, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.13293457, + "step": 8686, + "time_per_iteration": 2.529604196548462 + }, + { + "auxiliary_loss_clip": 0.01130739, + "auxiliary_loss_mlp": 0.0103103, + "balance_loss_clip": 1.05615115, + "balance_loss_mlp": 1.01845908, + "epoch": 0.5222906959266497, + "flos": 27814789272960.0, + "grad_norm": 1.7347303665539655, + "language_loss": 0.70232052, + "learning_rate": 1.9526848203853927e-06, + "loss": 0.72393823, + "num_input_tokens_seen": 186610345, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.12573242, + "step": 8687, + "time_per_iteration": 2.4815804958343506 + }, + { + "auxiliary_loss_clip": 0.01120514, + "auxiliary_loss_mlp": 0.01034613, + "balance_loss_clip": 1.0459919, + "balance_loss_mlp": 1.02229905, + "epoch": 0.5223508191793176, + "flos": 12712988615040.0, + "grad_norm": 1.9394516971485178, + "language_loss": 0.82997966, + "learning_rate": 1.9522954686059936e-06, + "loss": 0.85153091, + "num_input_tokens_seen": 186624360, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.12329102, + "step": 8688, + "time_per_iteration": 3.8708441257476807 + }, + { + "auxiliary_loss_clip": 0.01129804, + "auxiliary_loss_mlp": 0.01035133, + "balance_loss_clip": 1.05360436, + "balance_loss_mlp": 1.02156067, + "epoch": 0.5224109424319856, + "flos": 15632238752640.0, + "grad_norm": 2.3802092275095204, + "language_loss": 0.73529994, + "learning_rate": 1.9519061186355558e-06, + "loss": 0.75694931, + "num_input_tokens_seen": 186638680, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.13568115, + "step": 8689, + "time_per_iteration": 2.4016611576080322 + }, + { + "auxiliary_loss_clip": 0.01122974, + "auxiliary_loss_mlp": 0.01033187, + "balance_loss_clip": 1.04870546, + "balance_loss_mlp": 1.02028215, + "epoch": 0.5224710656846535, + "flos": 15742233175680.0, + "grad_norm": 1.8368238675224438, + "language_loss": 0.82780373, + "learning_rate": 1.9515167704888417e-06, + "loss": 0.84936529, + "num_input_tokens_seen": 186655840, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.12896729, + "step": 8690, + "time_per_iteration": 2.474254608154297 + }, + { + "auxiliary_loss_clip": 0.01118245, + "auxiliary_loss_mlp": 0.0103576, + "balance_loss_clip": 1.04169345, + "balance_loss_mlp": 1.02121043, + "epoch": 0.5225311889373215, + "flos": 26030998938240.0, + "grad_norm": 2.1727288816981947, + "language_loss": 0.78653723, + "learning_rate": 1.9511274241806173e-06, + "loss": 0.80807722, + "num_input_tokens_seen": 186674150, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.14538574, + "step": 8691, + "time_per_iteration": 2.493389368057251 + }, + { + "auxiliary_loss_clip": 0.01130607, + "auxiliary_loss_mlp": 0.01048229, + "balance_loss_clip": 1.0480206, + "balance_loss_mlp": 1.03074086, + "epoch": 0.5225913121899894, + "flos": 18369278173440.0, + "grad_norm": 1.973433641414484, + "language_loss": 0.76502627, + "learning_rate": 1.950738079725646e-06, + "loss": 0.78681463, + "num_input_tokens_seen": 186690675, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.17492676, + "step": 8692, + "time_per_iteration": 2.448776960372925 + }, + { + "auxiliary_loss_clip": 0.01127774, + "auxiliary_loss_mlp": 0.01032892, + "balance_loss_clip": 1.05271423, + "balance_loss_mlp": 1.02106667, + "epoch": 0.5226514354426575, + "flos": 29273516501760.0, + "grad_norm": 1.8987520737455128, + "language_loss": 0.72994137, + "learning_rate": 1.950348737138691e-06, + "loss": 0.75154799, + "num_input_tokens_seen": 186710380, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.1184082, + "step": 8693, + "time_per_iteration": 2.494375705718994 + }, + { + "auxiliary_loss_clip": 0.01132231, + "auxiliary_loss_mlp": 0.01035558, + "balance_loss_clip": 1.05198264, + "balance_loss_mlp": 1.02107418, + "epoch": 0.5227115586953254, + "flos": 22853299466880.0, + "grad_norm": 2.5833592076510645, + "language_loss": 0.8190856, + "learning_rate": 1.949959396434517e-06, + "loss": 0.84076345, + "num_input_tokens_seen": 186729135, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.14477539, + "step": 8694, + "time_per_iteration": 3.8894364833831787 + }, + { + "auxiliary_loss_clip": 0.01068198, + "auxiliary_loss_mlp": 0.01006716, + "balance_loss_clip": 1.04134464, + "balance_loss_mlp": 1.00474763, + "epoch": 0.5227716819479934, + "flos": 57474419022720.0, + "grad_norm": 0.767713339638516, + "language_loss": 0.55712992, + "learning_rate": 1.949570057627888e-06, + "loss": 0.57787907, + "num_input_tokens_seen": 186791115, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.01966858, + "step": 8695, + "time_per_iteration": 3.1005966663360596 + }, + { + "auxiliary_loss_clip": 0.01120434, + "auxiliary_loss_mlp": 0.01038673, + "balance_loss_clip": 1.04416943, + "balance_loss_mlp": 1.02574492, + "epoch": 0.5228318052006613, + "flos": 13808264077440.0, + "grad_norm": 1.7718634543318612, + "language_loss": 0.73491609, + "learning_rate": 1.9491807207335672e-06, + "loss": 0.75650716, + "num_input_tokens_seen": 186808660, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12927246, + "step": 8696, + "time_per_iteration": 2.4535584449768066 + }, + { + "auxiliary_loss_clip": 0.01122816, + "auxiliary_loss_mlp": 0.01036343, + "balance_loss_clip": 1.04549956, + "balance_loss_mlp": 1.02253294, + "epoch": 0.5228919284533293, + "flos": 15596184476160.0, + "grad_norm": 1.5449212017153005, + "language_loss": 0.7171281, + "learning_rate": 1.948791385766319e-06, + "loss": 0.7387197, + "num_input_tokens_seen": 186825900, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.13824463, + "step": 8697, + "time_per_iteration": 2.4859848022460938 + }, + { + "auxiliary_loss_clip": 0.01116422, + "auxiliary_loss_mlp": 0.0103161, + "balance_loss_clip": 1.04328549, + "balance_loss_mlp": 1.01918197, + "epoch": 0.5229520517059973, + "flos": 22491499726080.0, + "grad_norm": 2.7602887006097587, + "language_loss": 0.80691254, + "learning_rate": 1.948402052740906e-06, + "loss": 0.82839286, + "num_input_tokens_seen": 186843735, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.12432861, + "step": 8698, + "time_per_iteration": 2.4825682640075684 + }, + { + "auxiliary_loss_clip": 0.01130748, + "auxiliary_loss_mlp": 0.01036599, + "balance_loss_clip": 1.05413687, + "balance_loss_mlp": 1.02365327, + "epoch": 0.5230121749586653, + "flos": 22090880361600.0, + "grad_norm": 1.9463270263307049, + "language_loss": 0.74342036, + "learning_rate": 1.948012721672093e-06, + "loss": 0.7650938, + "num_input_tokens_seen": 186862440, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12945557, + "step": 8699, + "time_per_iteration": 2.5020978450775146 + }, + { + "auxiliary_loss_clip": 0.01131442, + "auxiliary_loss_mlp": 0.01040144, + "balance_loss_clip": 1.05098474, + "balance_loss_mlp": 1.02555287, + "epoch": 0.5230722982113333, + "flos": 22127150119680.0, + "grad_norm": 1.5934977432488422, + "language_loss": 0.73338377, + "learning_rate": 1.947623392574642e-06, + "loss": 0.75509959, + "num_input_tokens_seen": 186880940, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.14581299, + "step": 8700, + "time_per_iteration": 2.4508414268493652 + }, + { + "auxiliary_loss_clip": 0.01132262, + "auxiliary_loss_mlp": 0.01035474, + "balance_loss_clip": 1.05320597, + "balance_loss_mlp": 1.02121615, + "epoch": 0.5231324214640012, + "flos": 25009268572800.0, + "grad_norm": 1.799354538382259, + "language_loss": 0.66746223, + "learning_rate": 1.947234065463318e-06, + "loss": 0.68913954, + "num_input_tokens_seen": 186900785, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.14251709, + "step": 8701, + "time_per_iteration": 3.9206576347351074 + }, + { + "auxiliary_loss_clip": 0.01127985, + "auxiliary_loss_mlp": 0.01035338, + "balance_loss_clip": 1.05021846, + "balance_loss_mlp": 1.02196836, + "epoch": 0.5231925447166692, + "flos": 25740517651200.0, + "grad_norm": 1.9970629018133497, + "language_loss": 0.67326021, + "learning_rate": 1.9468447403528826e-06, + "loss": 0.69489348, + "num_input_tokens_seen": 186920895, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.13372803, + "step": 8702, + "time_per_iteration": 2.4849140644073486 + }, + { + "auxiliary_loss_clip": 0.01131201, + "auxiliary_loss_mlp": 0.01039676, + "balance_loss_clip": 1.05222929, + "balance_loss_mlp": 1.02543652, + "epoch": 0.5232526679693371, + "flos": 21433930565760.0, + "grad_norm": 2.231158947646243, + "language_loss": 0.7650063, + "learning_rate": 1.946455417258101e-06, + "loss": 0.78671503, + "num_input_tokens_seen": 186940605, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.14251709, + "step": 8703, + "time_per_iteration": 2.461923837661743 + }, + { + "auxiliary_loss_clip": 0.01132929, + "auxiliary_loss_mlp": 0.01043186, + "balance_loss_clip": 1.05032802, + "balance_loss_mlp": 1.02792144, + "epoch": 0.5233127912220051, + "flos": 35298393471360.0, + "grad_norm": 1.9393871334069472, + "language_loss": 0.77412868, + "learning_rate": 1.9460660961937348e-06, + "loss": 0.79588979, + "num_input_tokens_seen": 186960820, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.15264893, + "step": 8704, + "time_per_iteration": 2.559025764465332 + }, + { + "auxiliary_loss_clip": 0.01122705, + "auxiliary_loss_mlp": 0.01040086, + "balance_loss_clip": 1.04765868, + "balance_loss_mlp": 1.02704477, + "epoch": 0.523372914474673, + "flos": 17051320344960.0, + "grad_norm": 1.7861331711026065, + "language_loss": 0.78377247, + "learning_rate": 1.9456767771745474e-06, + "loss": 0.80540037, + "num_input_tokens_seen": 186976240, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.13049316, + "step": 8705, + "time_per_iteration": 2.531569719314575 + }, + { + "auxiliary_loss_clip": 0.01125198, + "auxiliary_loss_mlp": 0.0103687, + "balance_loss_clip": 1.04551411, + "balance_loss_mlp": 1.02251136, + "epoch": 0.5234330377273411, + "flos": 18406302117120.0, + "grad_norm": 2.023177226465285, + "language_loss": 0.69941884, + "learning_rate": 1.9452874602153027e-06, + "loss": 0.72103953, + "num_input_tokens_seen": 186992855, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.14349365, + "step": 8706, + "time_per_iteration": 2.4618945121765137 + }, + { + "auxiliary_loss_clip": 0.01062338, + "auxiliary_loss_mlp": 0.0100652, + "balance_loss_clip": 1.03557777, + "balance_loss_mlp": 1.00468719, + "epoch": 0.523493160980009, + "flos": 65850296970240.0, + "grad_norm": 0.6771964834147379, + "language_loss": 0.52499759, + "learning_rate": 1.9448981453307623e-06, + "loss": 0.54568619, + "num_input_tokens_seen": 187051205, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.01831055, + "step": 8707, + "time_per_iteration": 3.1331520080566406 + }, + { + "auxiliary_loss_clip": 0.0112896, + "auxiliary_loss_mlp": 0.01038878, + "balance_loss_clip": 1.04909694, + "balance_loss_mlp": 1.02592611, + "epoch": 0.523553284232677, + "flos": 21872076664320.0, + "grad_norm": 1.6417522635345907, + "language_loss": 0.74569011, + "learning_rate": 1.9445088325356904e-06, + "loss": 0.76736856, + "num_input_tokens_seen": 187070540, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.12969971, + "step": 8708, + "time_per_iteration": 2.4887161254882812 + }, + { + "auxiliary_loss_clip": 0.01133436, + "auxiliary_loss_mlp": 0.01026132, + "balance_loss_clip": 1.05637383, + "balance_loss_mlp": 1.01320386, + "epoch": 0.5236134074853449, + "flos": 20848191482880.0, + "grad_norm": 1.5250325423865314, + "language_loss": 0.77365947, + "learning_rate": 1.944119521844849e-06, + "loss": 0.79525512, + "num_input_tokens_seen": 187089975, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.1293335, + "step": 8709, + "time_per_iteration": 2.4621782302856445 + }, + { + "auxiliary_loss_clip": 0.01135234, + "auxiliary_loss_mlp": 0.01040895, + "balance_loss_clip": 1.04939938, + "balance_loss_mlp": 1.02464712, + "epoch": 0.5236735307380129, + "flos": 25520421064320.0, + "grad_norm": 2.169493658135587, + "language_loss": 0.83372623, + "learning_rate": 1.9437302132730003e-06, + "loss": 0.85548753, + "num_input_tokens_seen": 187108775, + "router_z_loss_clip": 0.85791016, + "router_z_loss_mlp": 0.16229248, + "step": 8710, + "time_per_iteration": 2.5057554244995117 + }, + { + "auxiliary_loss_clip": 0.01132927, + "auxiliary_loss_mlp": 0.01030471, + "balance_loss_clip": 1.05562651, + "balance_loss_mlp": 1.01775146, + "epoch": 0.523733653990681, + "flos": 23583112001280.0, + "grad_norm": 1.8835344236572535, + "language_loss": 0.69536901, + "learning_rate": 1.943340906834908e-06, + "loss": 0.71700299, + "num_input_tokens_seen": 187128830, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12719727, + "step": 8711, + "time_per_iteration": 2.477289915084839 + }, + { + "auxiliary_loss_clip": 0.0112945, + "auxiliary_loss_mlp": 0.01036211, + "balance_loss_clip": 1.05149269, + "balance_loss_mlp": 1.02264547, + "epoch": 0.5237937772433489, + "flos": 21106245767040.0, + "grad_norm": 1.6563838020707364, + "language_loss": 0.83145183, + "learning_rate": 1.9429516025453345e-06, + "loss": 0.85310847, + "num_input_tokens_seen": 187149570, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.13580322, + "step": 8712, + "time_per_iteration": 2.480976104736328 + }, + { + "auxiliary_loss_clip": 0.01130197, + "auxiliary_loss_mlp": 0.01037748, + "balance_loss_clip": 1.05062258, + "balance_loss_mlp": 1.02326405, + "epoch": 0.5238539004960169, + "flos": 19172887200000.0, + "grad_norm": 1.8083020767827094, + "language_loss": 0.69839531, + "learning_rate": 1.9425623004190415e-06, + "loss": 0.72007471, + "num_input_tokens_seen": 187170575, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.14477539, + "step": 8713, + "time_per_iteration": 2.472468852996826 + }, + { + "auxiliary_loss_clip": 0.01140266, + "auxiliary_loss_mlp": 0.01032217, + "balance_loss_clip": 1.05750847, + "balance_loss_mlp": 1.01732802, + "epoch": 0.5239140237486848, + "flos": 17888218300800.0, + "grad_norm": 2.655413463282962, + "language_loss": 0.76482558, + "learning_rate": 1.9421730004707925e-06, + "loss": 0.7865504, + "num_input_tokens_seen": 187187190, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.14892578, + "step": 8714, + "time_per_iteration": 3.9535555839538574 + }, + { + "auxiliary_loss_clip": 0.01133278, + "auxiliary_loss_mlp": 0.01032352, + "balance_loss_clip": 1.05421472, + "balance_loss_mlp": 1.01783204, + "epoch": 0.5239741470013528, + "flos": 17930413802880.0, + "grad_norm": 2.0230747279830408, + "language_loss": 0.75622702, + "learning_rate": 1.9417837027153483e-06, + "loss": 0.77788335, + "num_input_tokens_seen": 187204350, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.14520264, + "step": 8715, + "time_per_iteration": 2.4526526927948 + }, + { + "auxiliary_loss_clip": 0.01128683, + "auxiliary_loss_mlp": 0.01031662, + "balance_loss_clip": 1.05271935, + "balance_loss_mlp": 1.0182445, + "epoch": 0.5240342702540207, + "flos": 30993386584320.0, + "grad_norm": 1.8561421933633862, + "language_loss": 0.71529281, + "learning_rate": 1.9413944071674723e-06, + "loss": 0.73689628, + "num_input_tokens_seen": 187225605, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.13409424, + "step": 8716, + "time_per_iteration": 2.5638270378112793 + }, + { + "auxiliary_loss_clip": 0.0113687, + "auxiliary_loss_mlp": 0.0103829, + "balance_loss_clip": 1.05875421, + "balance_loss_mlp": 1.02556443, + "epoch": 0.5240943935066887, + "flos": 25005066681600.0, + "grad_norm": 1.9795865384178206, + "language_loss": 0.86962545, + "learning_rate": 1.941005113841926e-06, + "loss": 0.89137703, + "num_input_tokens_seen": 187241335, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.12713623, + "step": 8717, + "time_per_iteration": 2.471836566925049 + }, + { + "auxiliary_loss_clip": 0.01120799, + "auxiliary_loss_mlp": 0.01032202, + "balance_loss_clip": 1.04338801, + "balance_loss_mlp": 1.01858807, + "epoch": 0.5241545167593566, + "flos": 23659099223040.0, + "grad_norm": 1.7853529532613428, + "language_loss": 0.61044037, + "learning_rate": 1.9406158227534723e-06, + "loss": 0.63197041, + "num_input_tokens_seen": 187259925, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.1361084, + "step": 8718, + "time_per_iteration": 2.5116662979125977 + }, + { + "auxiliary_loss_clip": 0.01137372, + "auxiliary_loss_mlp": 0.0103472, + "balance_loss_clip": 1.05582881, + "balance_loss_mlp": 1.02092195, + "epoch": 0.5242146400120247, + "flos": 23400398494080.0, + "grad_norm": 1.750293925408334, + "language_loss": 0.72038794, + "learning_rate": 1.940226533916872e-06, + "loss": 0.74210882, + "num_input_tokens_seen": 187279035, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.13812256, + "step": 8719, + "time_per_iteration": 2.4608676433563232 + }, + { + "auxiliary_loss_clip": 0.01127478, + "auxiliary_loss_mlp": 0.01028919, + "balance_loss_clip": 1.05194366, + "balance_loss_mlp": 1.01651502, + "epoch": 0.5242747632646926, + "flos": 17749065012480.0, + "grad_norm": 2.2650236182082577, + "language_loss": 0.73341054, + "learning_rate": 1.9398372473468877e-06, + "loss": 0.75497448, + "num_input_tokens_seen": 187297555, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12390137, + "step": 8720, + "time_per_iteration": 2.5096163749694824 + }, + { + "auxiliary_loss_clip": 0.01136141, + "auxiliary_loss_mlp": 0.01035322, + "balance_loss_clip": 1.05654979, + "balance_loss_mlp": 1.02136803, + "epoch": 0.5243348865173606, + "flos": 32597731549440.0, + "grad_norm": 1.912616152800641, + "language_loss": 0.70319313, + "learning_rate": 1.939447963058281e-06, + "loss": 0.72490776, + "num_input_tokens_seen": 187320265, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.13964844, + "step": 8721, + "time_per_iteration": 2.537397623062134 + }, + { + "auxiliary_loss_clip": 0.01131401, + "auxiliary_loss_mlp": 0.01037378, + "balance_loss_clip": 1.05296922, + "balance_loss_mlp": 1.02371621, + "epoch": 0.5243950097700285, + "flos": 25484115392640.0, + "grad_norm": 2.316957484162457, + "language_loss": 0.86584496, + "learning_rate": 1.939058681065813e-06, + "loss": 0.88753277, + "num_input_tokens_seen": 187338045, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.13653564, + "step": 8722, + "time_per_iteration": 2.5347774028778076 + }, + { + "auxiliary_loss_clip": 0.01125682, + "auxiliary_loss_mlp": 0.01031875, + "balance_loss_clip": 1.04744112, + "balance_loss_mlp": 1.01720643, + "epoch": 0.5244551330226965, + "flos": 15268391936640.0, + "grad_norm": 1.69283919185857, + "language_loss": 0.79938817, + "learning_rate": 1.938669401384247e-06, + "loss": 0.82096374, + "num_input_tokens_seen": 187356040, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.14666748, + "step": 8723, + "time_per_iteration": 2.447652578353882 + }, + { + "auxiliary_loss_clip": 0.01134943, + "auxiliary_loss_mlp": 0.01044155, + "balance_loss_clip": 1.05489397, + "balance_loss_mlp": 1.03039253, + "epoch": 0.5245152562753645, + "flos": 22237108629120.0, + "grad_norm": 2.7333338188255465, + "language_loss": 0.7475512, + "learning_rate": 1.9382801240283426e-06, + "loss": 0.76934218, + "num_input_tokens_seen": 187374185, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.13757324, + "step": 8724, + "time_per_iteration": 2.51829195022583 + }, + { + "auxiliary_loss_clip": 0.01135896, + "auxiliary_loss_mlp": 0.01035921, + "balance_loss_clip": 1.05161667, + "balance_loss_mlp": 1.02003002, + "epoch": 0.5245753795280325, + "flos": 29426460612480.0, + "grad_norm": 1.9773409207093093, + "language_loss": 0.70196831, + "learning_rate": 1.9378908490128625e-06, + "loss": 0.72368646, + "num_input_tokens_seen": 187396640, + "router_z_loss_clip": 0.84326172, + "router_z_loss_mlp": 0.15881348, + "step": 8725, + "time_per_iteration": 2.5253074169158936 + }, + { + "auxiliary_loss_clip": 0.0105561, + "auxiliary_loss_mlp": 0.01001992, + "balance_loss_clip": 1.02896011, + "balance_loss_mlp": 1.00064015, + "epoch": 0.5246355027807005, + "flos": 58834392785280.0, + "grad_norm": 0.7520139048349507, + "language_loss": 0.55600584, + "learning_rate": 1.937501576352568e-06, + "loss": 0.5765819, + "num_input_tokens_seen": 187455945, + "router_z_loss_clip": 0.26611328, + "router_z_loss_mlp": 0.01350403, + "step": 8726, + "time_per_iteration": 3.070807695388794 + }, + { + "auxiliary_loss_clip": 0.01059951, + "auxiliary_loss_mlp": 0.01000719, + "balance_loss_clip": 1.03213477, + "balance_loss_mlp": 0.99920636, + "epoch": 0.5246956260333684, + "flos": 64526592965760.0, + "grad_norm": 0.7928707351059134, + "language_loss": 0.58347005, + "learning_rate": 1.937112306062219e-06, + "loss": 0.60407674, + "num_input_tokens_seen": 187519975, + "router_z_loss_clip": 0.27832031, + "router_z_loss_mlp": 0.01513672, + "step": 8727, + "time_per_iteration": 3.0731778144836426 + }, + { + "auxiliary_loss_clip": 0.01137335, + "auxiliary_loss_mlp": 0.01042169, + "balance_loss_clip": 1.05622172, + "balance_loss_mlp": 1.02797723, + "epoch": 0.5247557492860364, + "flos": 24533631653760.0, + "grad_norm": 1.3896509732612152, + "language_loss": 0.70765555, + "learning_rate": 1.9367230381565786e-06, + "loss": 0.72945058, + "num_input_tokens_seen": 187541775, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.14190674, + "step": 8728, + "time_per_iteration": 2.492889404296875 + }, + { + "auxiliary_loss_clip": 0.01128831, + "auxiliary_loss_mlp": 0.01029553, + "balance_loss_clip": 1.0505681, + "balance_loss_mlp": 1.01738143, + "epoch": 0.5248158725387043, + "flos": 18806131382400.0, + "grad_norm": 1.4507742001157309, + "language_loss": 0.69905448, + "learning_rate": 1.9363337726504062e-06, + "loss": 0.72063833, + "num_input_tokens_seen": 187560425, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.12176514, + "step": 8729, + "time_per_iteration": 2.49149227142334 + }, + { + "auxiliary_loss_clip": 0.0113242, + "auxiliary_loss_mlp": 0.01037413, + "balance_loss_clip": 1.05115414, + "balance_loss_mlp": 1.02375162, + "epoch": 0.5248759957913723, + "flos": 20955851521920.0, + "grad_norm": 1.743742705126844, + "language_loss": 0.83806396, + "learning_rate": 1.935944509558464e-06, + "loss": 0.85976219, + "num_input_tokens_seen": 187579930, + "router_z_loss_clip": 0.81201172, + "router_z_loss_mlp": 0.13665771, + "step": 8730, + "time_per_iteration": 2.4524829387664795 + }, + { + "auxiliary_loss_clip": 0.01122128, + "auxiliary_loss_mlp": 0.01037969, + "balance_loss_clip": 1.04418445, + "balance_loss_mlp": 1.0242238, + "epoch": 0.5249361190440403, + "flos": 18660980522880.0, + "grad_norm": 1.9672358675064738, + "language_loss": 0.79090983, + "learning_rate": 1.9355552488955125e-06, + "loss": 0.81251085, + "num_input_tokens_seen": 187595365, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.13745117, + "step": 8731, + "time_per_iteration": 3.864095687866211 + }, + { + "auxiliary_loss_clip": 0.01120311, + "auxiliary_loss_mlp": 0.01040286, + "balance_loss_clip": 1.04540873, + "balance_loss_mlp": 1.02632666, + "epoch": 0.5249962422967083, + "flos": 24863327614080.0, + "grad_norm": 2.7081852576811007, + "language_loss": 0.83444291, + "learning_rate": 1.935165990676312e-06, + "loss": 0.85604882, + "num_input_tokens_seen": 187614715, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.13946533, + "step": 8732, + "time_per_iteration": 2.49802827835083 + }, + { + "auxiliary_loss_clip": 0.01126411, + "auxiliary_loss_mlp": 0.0103682, + "balance_loss_clip": 1.04691279, + "balance_loss_mlp": 1.02394509, + "epoch": 0.5250563655493762, + "flos": 15262681674240.0, + "grad_norm": 1.6258177743187379, + "language_loss": 0.77509952, + "learning_rate": 1.9347767349156237e-06, + "loss": 0.79673171, + "num_input_tokens_seen": 187630745, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.12878418, + "step": 8733, + "time_per_iteration": 2.471060037612915 + }, + { + "auxiliary_loss_clip": 0.01132451, + "auxiliary_loss_mlp": 0.01043949, + "balance_loss_clip": 1.05021191, + "balance_loss_mlp": 1.02904785, + "epoch": 0.5251164888020442, + "flos": 18625177641600.0, + "grad_norm": 1.8377708833267972, + "language_loss": 0.81679845, + "learning_rate": 1.934387481628208e-06, + "loss": 0.83856243, + "num_input_tokens_seen": 187648200, + "router_z_loss_clip": 0.82226562, + "router_z_loss_mlp": 0.14904785, + "step": 8734, + "time_per_iteration": 2.4130382537841797 + }, + { + "auxiliary_loss_clip": 0.01132871, + "auxiliary_loss_mlp": 0.01035384, + "balance_loss_clip": 1.05516124, + "balance_loss_mlp": 1.02151966, + "epoch": 0.5251766120547121, + "flos": 29710764760320.0, + "grad_norm": 1.360986505236718, + "language_loss": 0.76979613, + "learning_rate": 1.933998230828826e-06, + "loss": 0.79147863, + "num_input_tokens_seen": 187669205, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.1385498, + "step": 8735, + "time_per_iteration": 2.5544705390930176 + }, + { + "auxiliary_loss_clip": 0.01134278, + "auxiliary_loss_mlp": 0.01038625, + "balance_loss_clip": 1.0538733, + "balance_loss_mlp": 1.02527976, + "epoch": 0.5252367353073801, + "flos": 23440295525760.0, + "grad_norm": 1.6459528637655862, + "language_loss": 0.80342078, + "learning_rate": 1.9336089825322376e-06, + "loss": 0.82514977, + "num_input_tokens_seen": 187690890, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.13354492, + "step": 8736, + "time_per_iteration": 2.485470771789551 + }, + { + "auxiliary_loss_clip": 0.01128268, + "auxiliary_loss_mlp": 0.0104897, + "balance_loss_clip": 1.0486269, + "balance_loss_mlp": 1.03248358, + "epoch": 0.5252968585600482, + "flos": 30810708990720.0, + "grad_norm": 2.1163268581189802, + "language_loss": 0.70718646, + "learning_rate": 1.9332197367532033e-06, + "loss": 0.72895885, + "num_input_tokens_seen": 187713045, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.16491699, + "step": 8737, + "time_per_iteration": 2.573324203491211 + }, + { + "auxiliary_loss_clip": 0.01136122, + "auxiliary_loss_mlp": 0.01041397, + "balance_loss_clip": 1.05226469, + "balance_loss_mlp": 1.02718151, + "epoch": 0.5253569818127161, + "flos": 20628274464000.0, + "grad_norm": 1.4924016828076585, + "language_loss": 0.77626699, + "learning_rate": 1.9328304935064833e-06, + "loss": 0.79804218, + "num_input_tokens_seen": 187733640, + "router_z_loss_clip": 0.83886719, + "router_z_loss_mlp": 0.14208984, + "step": 8738, + "time_per_iteration": 3.882927656173706 + }, + { + "auxiliary_loss_clip": 0.01059991, + "auxiliary_loss_mlp": 0.0100647, + "balance_loss_clip": 1.031986, + "balance_loss_mlp": 1.00495315, + "epoch": 0.5254171050653841, + "flos": 63428695810560.0, + "grad_norm": 0.7500687102199596, + "language_loss": 0.54482216, + "learning_rate": 1.932441252806837e-06, + "loss": 0.56548679, + "num_input_tokens_seen": 187792930, + "router_z_loss_clip": 0.28027344, + "router_z_loss_mlp": 0.01516724, + "step": 8739, + "time_per_iteration": 3.046821355819702 + }, + { + "auxiliary_loss_clip": 0.01129475, + "auxiliary_loss_mlp": 0.01039235, + "balance_loss_clip": 1.05204225, + "balance_loss_mlp": 1.02640796, + "epoch": 0.525477228318052, + "flos": 34670782108800.0, + "grad_norm": 1.8897809522703304, + "language_loss": 0.84676576, + "learning_rate": 1.9320520146690263e-06, + "loss": 0.86845291, + "num_input_tokens_seen": 187812495, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12835693, + "step": 8740, + "time_per_iteration": 2.6612472534179688 + }, + { + "auxiliary_loss_clip": 0.01131549, + "auxiliary_loss_mlp": 0.0104294, + "balance_loss_clip": 1.05304646, + "balance_loss_mlp": 1.02878952, + "epoch": 0.52553735157072, + "flos": 17930844766080.0, + "grad_norm": 2.12450885109769, + "language_loss": 0.69096565, + "learning_rate": 1.9316627791078093e-06, + "loss": 0.71271056, + "num_input_tokens_seen": 187829685, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.14160156, + "step": 8741, + "time_per_iteration": 2.415109395980835 + }, + { + "auxiliary_loss_clip": 0.01135409, + "auxiliary_loss_mlp": 0.01042183, + "balance_loss_clip": 1.05181074, + "balance_loss_mlp": 1.02732897, + "epoch": 0.5255974748233879, + "flos": 9940864584960.0, + "grad_norm": 2.1308165390680376, + "language_loss": 0.66131651, + "learning_rate": 1.931273546137947e-06, + "loss": 0.68309242, + "num_input_tokens_seen": 187846495, + "router_z_loss_clip": 0.83544922, + "router_z_loss_mlp": 0.14868164, + "step": 8742, + "time_per_iteration": 2.473504066467285 + }, + { + "auxiliary_loss_clip": 0.01132773, + "auxiliary_loss_mlp": 0.01045475, + "balance_loss_clip": 1.04935682, + "balance_loss_mlp": 1.02994215, + "epoch": 0.5256575980760559, + "flos": 16868427269760.0, + "grad_norm": 2.9228677153003026, + "language_loss": 0.62751985, + "learning_rate": 1.9308843157741983e-06, + "loss": 0.64930224, + "num_input_tokens_seen": 187862010, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.15539551, + "step": 8743, + "time_per_iteration": 2.390471935272217 + }, + { + "auxiliary_loss_clip": 0.01057974, + "auxiliary_loss_mlp": 0.01014722, + "balance_loss_clip": 1.03002036, + "balance_loss_mlp": 1.01327682, + "epoch": 0.5257177213287239, + "flos": 62386210362240.0, + "grad_norm": 0.7704553885639783, + "language_loss": 0.54206383, + "learning_rate": 1.930495088031323e-06, + "loss": 0.56279075, + "num_input_tokens_seen": 187922730, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.01446533, + "step": 8744, + "time_per_iteration": 3.1839959621429443 + }, + { + "auxiliary_loss_clip": 0.01137052, + "auxiliary_loss_mlp": 0.01040077, + "balance_loss_clip": 1.05312204, + "balance_loss_mlp": 1.02437699, + "epoch": 0.5257778445813919, + "flos": 20776908942720.0, + "grad_norm": 2.0022022194023967, + "language_loss": 0.75743532, + "learning_rate": 1.9301058629240814e-06, + "loss": 0.77920657, + "num_input_tokens_seen": 187940160, + "router_z_loss_clip": 0.83837891, + "router_z_loss_mlp": 0.15704346, + "step": 8745, + "time_per_iteration": 3.955764055252075 + }, + { + "auxiliary_loss_clip": 0.01137173, + "auxiliary_loss_mlp": 0.01042985, + "balance_loss_clip": 1.05431974, + "balance_loss_mlp": 1.02950823, + "epoch": 0.5258379678340598, + "flos": 17018606033280.0, + "grad_norm": 1.8579297734966786, + "language_loss": 0.81390631, + "learning_rate": 1.9297166404672324e-06, + "loss": 0.8357079, + "num_input_tokens_seen": 187958625, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.13482666, + "step": 8746, + "time_per_iteration": 2.4450063705444336 + }, + { + "auxiliary_loss_clip": 0.01127744, + "auxiliary_loss_mlp": 0.0103194, + "balance_loss_clip": 1.04993343, + "balance_loss_mlp": 1.0184691, + "epoch": 0.5258980910867278, + "flos": 21068754946560.0, + "grad_norm": 2.084918430137692, + "language_loss": 0.75458622, + "learning_rate": 1.9293274206755353e-06, + "loss": 0.77618301, + "num_input_tokens_seen": 187977575, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.13482666, + "step": 8747, + "time_per_iteration": 2.5826926231384277 + }, + { + "auxiliary_loss_clip": 0.01122794, + "auxiliary_loss_mlp": 0.01035843, + "balance_loss_clip": 1.04874468, + "balance_loss_mlp": 1.02239585, + "epoch": 0.5259582143393957, + "flos": 18004461690240.0, + "grad_norm": 2.7997357112770107, + "language_loss": 0.83361077, + "learning_rate": 1.9289382035637505e-06, + "loss": 0.85519707, + "num_input_tokens_seen": 187996650, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.13446045, + "step": 8748, + "time_per_iteration": 2.457090139389038 + }, + { + "auxiliary_loss_clip": 0.01129973, + "auxiliary_loss_mlp": 0.01036805, + "balance_loss_clip": 1.05047131, + "balance_loss_mlp": 1.02220798, + "epoch": 0.5260183375920637, + "flos": 22783848520320.0, + "grad_norm": 2.360783880595978, + "language_loss": 0.80832636, + "learning_rate": 1.9285489891466345e-06, + "loss": 0.82999408, + "num_input_tokens_seen": 188013510, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.14593506, + "step": 8749, + "time_per_iteration": 2.52801513671875 + }, + { + "auxiliary_loss_clip": 0.01137277, + "auxiliary_loss_mlp": 0.01039545, + "balance_loss_clip": 1.05849266, + "balance_loss_mlp": 1.02521634, + "epoch": 0.5260784608447318, + "flos": 27052406081280.0, + "grad_norm": 1.72121074276497, + "language_loss": 0.72581601, + "learning_rate": 1.9281597774389487e-06, + "loss": 0.74758422, + "num_input_tokens_seen": 188032085, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.14343262, + "step": 8750, + "time_per_iteration": 2.529989242553711 + }, + { + "auxiliary_loss_clip": 0.01130616, + "auxiliary_loss_mlp": 0.01031953, + "balance_loss_clip": 1.0533911, + "balance_loss_mlp": 1.01875055, + "epoch": 0.5261385840973997, + "flos": 20662820369280.0, + "grad_norm": 1.3794302183934941, + "language_loss": 0.76318181, + "learning_rate": 1.9277705684554517e-06, + "loss": 0.7848075, + "num_input_tokens_seen": 188050590, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.13201904, + "step": 8751, + "time_per_iteration": 2.5387144088745117 + }, + { + "auxiliary_loss_clip": 0.0113226, + "auxiliary_loss_mlp": 0.01037026, + "balance_loss_clip": 1.05620742, + "balance_loss_mlp": 1.02359128, + "epoch": 0.5261987073500677, + "flos": 23622649896960.0, + "grad_norm": 1.594873695633624, + "language_loss": 0.76242268, + "learning_rate": 1.927381362210902e-06, + "loss": 0.78411561, + "num_input_tokens_seen": 188071620, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.13446045, + "step": 8752, + "time_per_iteration": 2.5052638053894043 + }, + { + "auxiliary_loss_clip": 0.01136356, + "auxiliary_loss_mlp": 0.01035109, + "balance_loss_clip": 1.05480826, + "balance_loss_mlp": 1.02094066, + "epoch": 0.5262588306027356, + "flos": 27636241743360.0, + "grad_norm": 1.8005112803608514, + "language_loss": 0.68076336, + "learning_rate": 1.926992158720058e-06, + "loss": 0.70247799, + "num_input_tokens_seen": 188091740, + "router_z_loss_clip": 0.81494141, + "router_z_loss_mlp": 0.14178467, + "step": 8753, + "time_per_iteration": 2.5093088150024414 + }, + { + "auxiliary_loss_clip": 0.01130126, + "auxiliary_loss_mlp": 0.01036852, + "balance_loss_clip": 1.05337119, + "balance_loss_mlp": 1.02316105, + "epoch": 0.5263189538554036, + "flos": 21759711943680.0, + "grad_norm": 1.4633107374940375, + "language_loss": 0.83841348, + "learning_rate": 1.9266029579976785e-06, + "loss": 0.86008322, + "num_input_tokens_seen": 188111165, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.13690186, + "step": 8754, + "time_per_iteration": 2.526374101638794 + }, + { + "auxiliary_loss_clip": 0.01134797, + "auxiliary_loss_mlp": 0.01036215, + "balance_loss_clip": 1.05211091, + "balance_loss_mlp": 1.02214885, + "epoch": 0.5263790771080715, + "flos": 14276359140480.0, + "grad_norm": 2.272382482999663, + "language_loss": 0.87451756, + "learning_rate": 1.926213760058522e-06, + "loss": 0.89622772, + "num_input_tokens_seen": 188127825, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.14050293, + "step": 8755, + "time_per_iteration": 2.438798427581787 + }, + { + "auxiliary_loss_clip": 0.0106291, + "auxiliary_loss_mlp": 0.01001879, + "balance_loss_clip": 1.03689051, + "balance_loss_mlp": 1.0003916, + "epoch": 0.5264392003607395, + "flos": 65806413528960.0, + "grad_norm": 0.7195565552019926, + "language_loss": 0.589077, + "learning_rate": 1.9258245649173477e-06, + "loss": 0.60972488, + "num_input_tokens_seen": 188194050, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.01486206, + "step": 8756, + "time_per_iteration": 3.1345338821411133 + }, + { + "auxiliary_loss_clip": 0.01128805, + "auxiliary_loss_mlp": 0.01034677, + "balance_loss_clip": 1.04842758, + "balance_loss_mlp": 1.02077723, + "epoch": 0.5264993236134075, + "flos": 21032413361280.0, + "grad_norm": 5.557797416760589, + "language_loss": 0.70907927, + "learning_rate": 1.925435372588913e-06, + "loss": 0.73071408, + "num_input_tokens_seen": 188212565, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.13885498, + "step": 8757, + "time_per_iteration": 2.463212013244629 + }, + { + "auxiliary_loss_clip": 0.01126697, + "auxiliary_loss_mlp": 0.01041583, + "balance_loss_clip": 1.0471276, + "balance_loss_mlp": 1.02783275, + "epoch": 0.5265594468660755, + "flos": 16618202150400.0, + "grad_norm": 2.6546100738553493, + "language_loss": 0.88034427, + "learning_rate": 1.9250461830879768e-06, + "loss": 0.90202701, + "num_input_tokens_seen": 188229505, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.13751221, + "step": 8758, + "time_per_iteration": 3.8156871795654297 + }, + { + "auxiliary_loss_clip": 0.01132698, + "auxiliary_loss_mlp": 0.01036599, + "balance_loss_clip": 1.05065966, + "balance_loss_mlp": 1.02159095, + "epoch": 0.5266195701187434, + "flos": 24134125610880.0, + "grad_norm": 1.383361602463069, + "language_loss": 0.76022494, + "learning_rate": 1.9246569964292965e-06, + "loss": 0.78191793, + "num_input_tokens_seen": 188250395, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.15014648, + "step": 8759, + "time_per_iteration": 2.464445114135742 + }, + { + "auxiliary_loss_clip": 0.01116705, + "auxiliary_loss_mlp": 0.01034277, + "balance_loss_clip": 1.04135203, + "balance_loss_mlp": 1.01969731, + "epoch": 0.5266796933714114, + "flos": 15844111125120.0, + "grad_norm": 1.8923346006967445, + "language_loss": 0.72143418, + "learning_rate": 1.9242678126276307e-06, + "loss": 0.742944, + "num_input_tokens_seen": 188266785, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.14587402, + "step": 8760, + "time_per_iteration": 2.47628116607666 + }, + { + "auxiliary_loss_clip": 0.0113114, + "auxiliary_loss_mlp": 0.01040323, + "balance_loss_clip": 1.04692888, + "balance_loss_mlp": 1.02529109, + "epoch": 0.5267398166240793, + "flos": 20951434149120.0, + "grad_norm": 3.105726304754192, + "language_loss": 0.75414479, + "learning_rate": 1.923878631697736e-06, + "loss": 0.77585948, + "num_input_tokens_seen": 188282525, + "router_z_loss_clip": 0.84179688, + "router_z_loss_mlp": 0.15026855, + "step": 8761, + "time_per_iteration": 2.4239652156829834 + }, + { + "auxiliary_loss_clip": 0.01133762, + "auxiliary_loss_mlp": 0.01030767, + "balance_loss_clip": 1.05296612, + "balance_loss_mlp": 1.01765347, + "epoch": 0.5267999398767473, + "flos": 20996394998400.0, + "grad_norm": 1.7122671135962004, + "language_loss": 0.70631945, + "learning_rate": 1.923489453654373e-06, + "loss": 0.72796476, + "num_input_tokens_seen": 188301395, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.13128662, + "step": 8762, + "time_per_iteration": 2.443192481994629 + }, + { + "auxiliary_loss_clip": 0.01059098, + "auxiliary_loss_mlp": 0.0100098, + "balance_loss_clip": 1.03197134, + "balance_loss_mlp": 0.99955004, + "epoch": 0.5268600631294152, + "flos": 66849401767680.0, + "grad_norm": 0.93878900668996, + "language_loss": 0.65487587, + "learning_rate": 1.9231002785122963e-06, + "loss": 0.67547661, + "num_input_tokens_seen": 188357665, + "router_z_loss_clip": 0.27099609, + "router_z_loss_mlp": 0.01429749, + "step": 8763, + "time_per_iteration": 3.0850765705108643 + }, + { + "auxiliary_loss_clip": 0.01133958, + "auxiliary_loss_mlp": 0.01033155, + "balance_loss_clip": 1.052351, + "balance_loss_mlp": 1.01941609, + "epoch": 0.5269201863820833, + "flos": 17165552572800.0, + "grad_norm": 10.4411516008078, + "language_loss": 0.7095089, + "learning_rate": 1.922711106286265e-06, + "loss": 0.73118007, + "num_input_tokens_seen": 188376935, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.13745117, + "step": 8764, + "time_per_iteration": 2.600644588470459 + }, + { + "auxiliary_loss_clip": 0.01134094, + "auxiliary_loss_mlp": 0.0103469, + "balance_loss_clip": 1.053069, + "balance_loss_mlp": 1.02008653, + "epoch": 0.5269803096347513, + "flos": 20522589672960.0, + "grad_norm": 1.7143378921201877, + "language_loss": 0.73861384, + "learning_rate": 1.9223219369910368e-06, + "loss": 0.76030171, + "num_input_tokens_seen": 188394995, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.14624023, + "step": 8765, + "time_per_iteration": 2.496600866317749 + }, + { + "auxiliary_loss_clip": 0.01134227, + "auxiliary_loss_mlp": 0.01037285, + "balance_loss_clip": 1.05108953, + "balance_loss_mlp": 1.02242565, + "epoch": 0.5270404328874192, + "flos": 27230989524480.0, + "grad_norm": 1.5696616802687513, + "language_loss": 0.85461473, + "learning_rate": 1.9219327706413677e-06, + "loss": 0.8763299, + "num_input_tokens_seen": 188415475, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.14849854, + "step": 8766, + "time_per_iteration": 2.5375213623046875 + }, + { + "auxiliary_loss_clip": 0.01128979, + "auxiliary_loss_mlp": 0.01033992, + "balance_loss_clip": 1.0490483, + "balance_loss_mlp": 1.01948452, + "epoch": 0.5271005561400872, + "flos": 23110491824640.0, + "grad_norm": 1.8317438147554839, + "language_loss": 0.79010814, + "learning_rate": 1.921543607252017e-06, + "loss": 0.8117379, + "num_input_tokens_seen": 188435665, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.1451416, + "step": 8767, + "time_per_iteration": 2.472487688064575 + }, + { + "auxiliary_loss_clip": 0.01130537, + "auxiliary_loss_mlp": 0.01036339, + "balance_loss_clip": 1.05054212, + "balance_loss_mlp": 1.0211693, + "epoch": 0.5271606793927551, + "flos": 22564793427840.0, + "grad_norm": 1.9012914907130063, + "language_loss": 0.73310006, + "learning_rate": 1.9211544468377394e-06, + "loss": 0.75476879, + "num_input_tokens_seen": 188455405, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.15167236, + "step": 8768, + "time_per_iteration": 2.4892578125 + }, + { + "auxiliary_loss_clip": 0.01129794, + "auxiliary_loss_mlp": 0.01038074, + "balance_loss_clip": 1.0523721, + "balance_loss_mlp": 1.02555084, + "epoch": 0.5272208026454231, + "flos": 18764259102720.0, + "grad_norm": 1.8625628226253594, + "language_loss": 0.73817009, + "learning_rate": 1.9207652894132933e-06, + "loss": 0.75984877, + "num_input_tokens_seen": 188472940, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12524414, + "step": 8769, + "time_per_iteration": 2.425536870956421 + }, + { + "auxiliary_loss_clip": 0.01127458, + "auxiliary_loss_mlp": 0.01036768, + "balance_loss_clip": 1.05050135, + "balance_loss_mlp": 1.02314198, + "epoch": 0.5272809258980911, + "flos": 20412164286720.0, + "grad_norm": 1.8901079352687986, + "language_loss": 0.7412529, + "learning_rate": 1.920376134993436e-06, + "loss": 0.76289517, + "num_input_tokens_seen": 188493035, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.13623047, + "step": 8770, + "time_per_iteration": 2.584245204925537 + }, + { + "auxiliary_loss_clip": 0.01131767, + "auxiliary_loss_mlp": 0.0103767, + "balance_loss_clip": 1.05235815, + "balance_loss_mlp": 1.02353215, + "epoch": 0.5273410491507591, + "flos": 28256742213120.0, + "grad_norm": 3.0273808946291307, + "language_loss": 0.68397307, + "learning_rate": 1.9199869835929224e-06, + "loss": 0.7056675, + "num_input_tokens_seen": 188513860, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.14129639, + "step": 8771, + "time_per_iteration": 2.5150272846221924 + }, + { + "auxiliary_loss_clip": 0.01127826, + "auxiliary_loss_mlp": 0.01039359, + "balance_loss_clip": 1.04809046, + "balance_loss_mlp": 1.02565563, + "epoch": 0.527401172403427, + "flos": 22455158140800.0, + "grad_norm": 2.818295912770074, + "language_loss": 0.76859212, + "learning_rate": 1.9195978352265115e-06, + "loss": 0.79026401, + "num_input_tokens_seen": 188533345, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.13696289, + "step": 8772, + "time_per_iteration": 2.4753434658050537 + }, + { + "auxiliary_loss_clip": 0.01133503, + "auxiliary_loss_mlp": 0.01050539, + "balance_loss_clip": 1.05003095, + "balance_loss_mlp": 1.03477919, + "epoch": 0.527461295656095, + "flos": 21031084558080.0, + "grad_norm": 1.7754199656922462, + "language_loss": 0.66110158, + "learning_rate": 1.9192086899089585e-06, + "loss": 0.68294197, + "num_input_tokens_seen": 188551550, + "router_z_loss_clip": 0.83447266, + "router_z_loss_mlp": 0.15765381, + "step": 8773, + "time_per_iteration": 2.4399404525756836 + }, + { + "auxiliary_loss_clip": 0.01132417, + "auxiliary_loss_mlp": 0.01037216, + "balance_loss_clip": 1.04885888, + "balance_loss_mlp": 1.02414513, + "epoch": 0.5275214189087629, + "flos": 26322018929280.0, + "grad_norm": 2.105135360881128, + "language_loss": 0.86183399, + "learning_rate": 1.91881954765502e-06, + "loss": 0.88353026, + "num_input_tokens_seen": 188571615, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.1307373, + "step": 8774, + "time_per_iteration": 2.5180811882019043 + }, + { + "auxiliary_loss_clip": 0.01138015, + "auxiliary_loss_mlp": 0.01033237, + "balance_loss_clip": 1.05657339, + "balance_loss_mlp": 1.02026677, + "epoch": 0.5275815421614309, + "flos": 20047024581120.0, + "grad_norm": 1.6358873043969424, + "language_loss": 0.79965234, + "learning_rate": 1.9184304084794523e-06, + "loss": 0.82136488, + "num_input_tokens_seen": 188591965, + "router_z_loss_clip": 0.81494141, + "router_z_loss_mlp": 0.12963867, + "step": 8775, + "time_per_iteration": 2.4784631729125977 + }, + { + "auxiliary_loss_clip": 0.01133308, + "auxiliary_loss_mlp": 0.01037317, + "balance_loss_clip": 1.05635643, + "balance_loss_mlp": 1.02354217, + "epoch": 0.5276416654140988, + "flos": 21432206712960.0, + "grad_norm": 1.9604611442341702, + "language_loss": 0.83655035, + "learning_rate": 1.918041272397012e-06, + "loss": 0.85825664, + "num_input_tokens_seen": 188610675, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.13781738, + "step": 8776, + "time_per_iteration": 3.9567463397979736 + }, + { + "auxiliary_loss_clip": 0.01135607, + "auxiliary_loss_mlp": 0.01034621, + "balance_loss_clip": 1.05561602, + "balance_loss_mlp": 1.02067327, + "epoch": 0.5277017886667669, + "flos": 17165085696000.0, + "grad_norm": 1.6120587138823397, + "language_loss": 0.67535198, + "learning_rate": 1.9176521394224547e-06, + "loss": 0.69705427, + "num_input_tokens_seen": 188628235, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.1395874, + "step": 8777, + "time_per_iteration": 2.4200456142425537 + }, + { + "auxiliary_loss_clip": 0.01136041, + "auxiliary_loss_mlp": 0.01039413, + "balance_loss_clip": 1.06029153, + "balance_loss_mlp": 1.02620506, + "epoch": 0.5277619119194349, + "flos": 20448146736000.0, + "grad_norm": 1.4546495952683847, + "language_loss": 0.82401228, + "learning_rate": 1.9172630095705358e-06, + "loss": 0.84576678, + "num_input_tokens_seen": 188648925, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.13208008, + "step": 8778, + "time_per_iteration": 2.5157501697540283 + }, + { + "auxiliary_loss_clip": 0.01129454, + "auxiliary_loss_mlp": 0.0103844, + "balance_loss_clip": 1.05109227, + "balance_loss_mlp": 1.0239861, + "epoch": 0.5278220351721028, + "flos": 24061083304320.0, + "grad_norm": 4.570049539148433, + "language_loss": 0.79743075, + "learning_rate": 1.916873882856013e-06, + "loss": 0.81910968, + "num_input_tokens_seen": 188668125, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.14465332, + "step": 8779, + "time_per_iteration": 2.456125020980835 + }, + { + "auxiliary_loss_clip": 0.01125113, + "auxiliary_loss_mlp": 0.01034096, + "balance_loss_clip": 1.05022907, + "balance_loss_mlp": 1.02138853, + "epoch": 0.5278821584247708, + "flos": 24642907804800.0, + "grad_norm": 2.0909848035835017, + "language_loss": 0.76977384, + "learning_rate": 1.9164847592936406e-06, + "loss": 0.79136586, + "num_input_tokens_seen": 188684410, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.12719727, + "step": 8780, + "time_per_iteration": 2.470456600189209 + }, + { + "auxiliary_loss_clip": 0.01135258, + "auxiliary_loss_mlp": 0.01035977, + "balance_loss_clip": 1.05363083, + "balance_loss_mlp": 1.02192807, + "epoch": 0.5279422816774387, + "flos": 35408244240000.0, + "grad_norm": 2.039705956075331, + "language_loss": 0.69380349, + "learning_rate": 1.916095638898174e-06, + "loss": 0.71551585, + "num_input_tokens_seen": 188706130, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.14044189, + "step": 8781, + "time_per_iteration": 2.5935795307159424 + }, + { + "auxiliary_loss_clip": 0.01118497, + "auxiliary_loss_mlp": 0.01035427, + "balance_loss_clip": 1.04392695, + "balance_loss_mlp": 1.02298152, + "epoch": 0.5280024049301068, + "flos": 22967028904320.0, + "grad_norm": 1.7159515495578055, + "language_loss": 0.72532398, + "learning_rate": 1.9157065216843696e-06, + "loss": 0.74686325, + "num_input_tokens_seen": 188725030, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.12438965, + "step": 8782, + "time_per_iteration": 3.8918092250823975 + }, + { + "auxiliary_loss_clip": 0.01126022, + "auxiliary_loss_mlp": 0.01030607, + "balance_loss_clip": 1.04830384, + "balance_loss_mlp": 1.01782167, + "epoch": 0.5280625281827747, + "flos": 21507619317120.0, + "grad_norm": 1.7378986476613538, + "language_loss": 0.68550968, + "learning_rate": 1.915317407666982e-06, + "loss": 0.70707601, + "num_input_tokens_seen": 188744325, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.12792969, + "step": 8783, + "time_per_iteration": 2.4950153827667236 + }, + { + "auxiliary_loss_clip": 0.01140372, + "auxiliary_loss_mlp": 0.01038781, + "balance_loss_clip": 1.05697119, + "balance_loss_mlp": 1.02323616, + "epoch": 0.5281226514354427, + "flos": 31208167958400.0, + "grad_norm": 2.0633375823288858, + "language_loss": 0.69527197, + "learning_rate": 1.9149282968607674e-06, + "loss": 0.71706349, + "num_input_tokens_seen": 188765100, + "router_z_loss_clip": 0.83447266, + "router_z_loss_mlp": 0.15527344, + "step": 8784, + "time_per_iteration": 2.5079076290130615 + }, + { + "auxiliary_loss_clip": 0.01139645, + "auxiliary_loss_mlp": 0.0103323, + "balance_loss_clip": 1.05555522, + "balance_loss_mlp": 1.01850748, + "epoch": 0.5281827746881106, + "flos": 25077821679360.0, + "grad_norm": 1.9354168872574395, + "language_loss": 0.74781811, + "learning_rate": 1.91453918928048e-06, + "loss": 0.76954687, + "num_input_tokens_seen": 188783995, + "router_z_loss_clip": 0.84130859, + "router_z_loss_mlp": 0.1472168, + "step": 8785, + "time_per_iteration": 2.493744134902954 + }, + { + "auxiliary_loss_clip": 0.01133503, + "auxiliary_loss_mlp": 0.01032578, + "balance_loss_clip": 1.05468535, + "balance_loss_mlp": 1.01753974, + "epoch": 0.5282428979407786, + "flos": 20631255292800.0, + "grad_norm": 1.8941160860443258, + "language_loss": 0.8311314, + "learning_rate": 1.9141500849408745e-06, + "loss": 0.85279226, + "num_input_tokens_seen": 188803120, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.15014648, + "step": 8786, + "time_per_iteration": 2.472601890563965 + }, + { + "auxiliary_loss_clip": 0.01118933, + "auxiliary_loss_mlp": 0.01027904, + "balance_loss_clip": 1.04481304, + "balance_loss_mlp": 1.01547074, + "epoch": 0.5283030211934465, + "flos": 22419391173120.0, + "grad_norm": 2.1608714655719607, + "language_loss": 0.82389867, + "learning_rate": 1.9137609838567076e-06, + "loss": 0.84536707, + "num_input_tokens_seen": 188820960, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.12438965, + "step": 8787, + "time_per_iteration": 2.5191004276275635 + }, + { + "auxiliary_loss_clip": 0.01126763, + "auxiliary_loss_mlp": 0.01035935, + "balance_loss_clip": 1.04977393, + "balance_loss_mlp": 1.02282786, + "epoch": 0.5283631444461145, + "flos": 23615467176960.0, + "grad_norm": 1.7653464504029686, + "language_loss": 0.83464372, + "learning_rate": 1.9133718860427316e-06, + "loss": 0.85627073, + "num_input_tokens_seen": 188837165, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.13110352, + "step": 8788, + "time_per_iteration": 2.458838701248169 + }, + { + "auxiliary_loss_clip": 0.01135413, + "auxiliary_loss_mlp": 0.01039462, + "balance_loss_clip": 1.05919111, + "balance_loss_mlp": 1.02495456, + "epoch": 0.5284232676987825, + "flos": 32671994918400.0, + "grad_norm": 1.9122070206730044, + "language_loss": 0.74852312, + "learning_rate": 1.9129827915137027e-06, + "loss": 0.7702719, + "num_input_tokens_seen": 188858555, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.14489746, + "step": 8789, + "time_per_iteration": 3.9420738220214844 + }, + { + "auxiliary_loss_clip": 0.01133412, + "auxiliary_loss_mlp": 0.01038578, + "balance_loss_clip": 1.05182672, + "balance_loss_mlp": 1.02415419, + "epoch": 0.5284833909514505, + "flos": 26760919213440.0, + "grad_norm": 1.5402991701385944, + "language_loss": 0.6981895, + "learning_rate": 1.9125937002843754e-06, + "loss": 0.71990931, + "num_input_tokens_seen": 188879050, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.14416504, + "step": 8790, + "time_per_iteration": 2.4798948764801025 + }, + { + "auxiliary_loss_clip": 0.01122319, + "auxiliary_loss_mlp": 0.01027784, + "balance_loss_clip": 1.04718089, + "balance_loss_mlp": 1.0148257, + "epoch": 0.5285435142041185, + "flos": 22090700793600.0, + "grad_norm": 1.6172950346646549, + "language_loss": 0.79003894, + "learning_rate": 1.9122046123695036e-06, + "loss": 0.81153989, + "num_input_tokens_seen": 188898885, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.12957764, + "step": 8791, + "time_per_iteration": 2.5318355560302734 + }, + { + "auxiliary_loss_clip": 0.01134258, + "auxiliary_loss_mlp": 0.01027616, + "balance_loss_clip": 1.05664217, + "balance_loss_mlp": 1.01442575, + "epoch": 0.5286036374567864, + "flos": 20375463565440.0, + "grad_norm": 1.96836491289923, + "language_loss": 0.66193521, + "learning_rate": 1.9118155277838423e-06, + "loss": 0.68355393, + "num_input_tokens_seen": 188917225, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.13189697, + "step": 8792, + "time_per_iteration": 2.503917932510376 + }, + { + "auxiliary_loss_clip": 0.01124243, + "auxiliary_loss_mlp": 0.01037412, + "balance_loss_clip": 1.04756713, + "balance_loss_mlp": 1.02451372, + "epoch": 0.5286637607094544, + "flos": 24352175122560.0, + "grad_norm": 1.909722345816674, + "language_loss": 0.79610693, + "learning_rate": 1.9114264465421443e-06, + "loss": 0.81772351, + "num_input_tokens_seen": 188936120, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.12890625, + "step": 8793, + "time_per_iteration": 2.5036823749542236 + }, + { + "auxiliary_loss_clip": 0.01125274, + "auxiliary_loss_mlp": 0.01038687, + "balance_loss_clip": 1.04866958, + "balance_loss_mlp": 1.02448273, + "epoch": 0.5287238839621223, + "flos": 17271165536640.0, + "grad_norm": 2.0949578633196215, + "language_loss": 0.84993523, + "learning_rate": 1.9110373686591645e-06, + "loss": 0.87157488, + "num_input_tokens_seen": 188953405, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.14215088, + "step": 8794, + "time_per_iteration": 2.4381909370422363 + }, + { + "auxiliary_loss_clip": 0.01132718, + "auxiliary_loss_mlp": 0.01040614, + "balance_loss_clip": 1.04916549, + "balance_loss_mlp": 1.02576613, + "epoch": 0.5287840072147904, + "flos": 17566890209280.0, + "grad_norm": 3.2612534658413845, + "language_loss": 0.67801332, + "learning_rate": 1.9106482941496564e-06, + "loss": 0.69974667, + "num_input_tokens_seen": 188971150, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.14837646, + "step": 8795, + "time_per_iteration": 2.486876964569092 + }, + { + "auxiliary_loss_clip": 0.01120867, + "auxiliary_loss_mlp": 0.01033, + "balance_loss_clip": 1.04295754, + "balance_loss_mlp": 1.01940393, + "epoch": 0.5288441304674583, + "flos": 18552099421440.0, + "grad_norm": 1.7755866474357276, + "language_loss": 0.80839127, + "learning_rate": 1.910259223028374e-06, + "loss": 0.82993001, + "num_input_tokens_seen": 188989550, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.13598633, + "step": 8796, + "time_per_iteration": 2.531669855117798 + }, + { + "auxiliary_loss_clip": 0.01130805, + "auxiliary_loss_mlp": 0.01035989, + "balance_loss_clip": 1.05138898, + "balance_loss_mlp": 1.02162433, + "epoch": 0.5289042537201263, + "flos": 20814507504000.0, + "grad_norm": 1.622523756830034, + "language_loss": 0.69288176, + "learning_rate": 1.909870155310071e-06, + "loss": 0.71454972, + "num_input_tokens_seen": 189008795, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.14355469, + "step": 8797, + "time_per_iteration": 2.5571131706237793 + }, + { + "auxiliary_loss_clip": 0.0112441, + "auxiliary_loss_mlp": 0.01036291, + "balance_loss_clip": 1.05072141, + "balance_loss_mlp": 1.02380943, + "epoch": 0.5289643769727942, + "flos": 15735265937280.0, + "grad_norm": 1.5277106840126018, + "language_loss": 0.82613492, + "learning_rate": 1.9094810910095005e-06, + "loss": 0.84774196, + "num_input_tokens_seen": 189025540, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.12481689, + "step": 8798, + "time_per_iteration": 2.4725823402404785 + }, + { + "auxiliary_loss_clip": 0.01129689, + "auxiliary_loss_mlp": 0.01040763, + "balance_loss_clip": 1.04953432, + "balance_loss_mlp": 1.02641034, + "epoch": 0.5290245002254622, + "flos": 19537308633600.0, + "grad_norm": 1.9970303205903033, + "language_loss": 0.70642495, + "learning_rate": 1.9090920301414166e-06, + "loss": 0.72812945, + "num_input_tokens_seen": 189044885, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.14361572, + "step": 8799, + "time_per_iteration": 2.475736141204834 + }, + { + "auxiliary_loss_clip": 0.01124632, + "auxiliary_loss_mlp": 0.01030892, + "balance_loss_clip": 1.05212736, + "balance_loss_mlp": 1.01827419, + "epoch": 0.5290846234781301, + "flos": 15815131827840.0, + "grad_norm": 1.8658145602006915, + "language_loss": 0.69530988, + "learning_rate": 1.9087029727205716e-06, + "loss": 0.71686512, + "num_input_tokens_seen": 189061280, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.12634277, + "step": 8800, + "time_per_iteration": 2.411139965057373 + }, + { + "auxiliary_loss_clip": 0.01059495, + "auxiliary_loss_mlp": 0.01009269, + "balance_loss_clip": 1.03290045, + "balance_loss_mlp": 1.00784922, + "epoch": 0.5291447467307981, + "flos": 70057624821120.0, + "grad_norm": 1.025689197301816, + "language_loss": 0.56951785, + "learning_rate": 1.9083139187617193e-06, + "loss": 0.59020549, + "num_input_tokens_seen": 189114775, + "router_z_loss_clip": 0.26611328, + "router_z_loss_mlp": 0.01422119, + "step": 8801, + "time_per_iteration": 2.9677181243896484 + }, + { + "auxiliary_loss_clip": 0.01128081, + "auxiliary_loss_mlp": 0.0103692, + "balance_loss_clip": 1.04844475, + "balance_loss_mlp": 1.02348483, + "epoch": 0.529204869983466, + "flos": 28364186770560.0, + "grad_norm": 1.5776529532660293, + "language_loss": 0.6380412, + "learning_rate": 1.9079248682796123e-06, + "loss": 0.65969121, + "num_input_tokens_seen": 189134700, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.13427734, + "step": 8802, + "time_per_iteration": 3.938690423965454 + }, + { + "auxiliary_loss_clip": 0.01130992, + "auxiliary_loss_mlp": 0.01032279, + "balance_loss_clip": 1.05392432, + "balance_loss_mlp": 1.018206, + "epoch": 0.5292649932361341, + "flos": 33758830684800.0, + "grad_norm": 1.6191665067174044, + "language_loss": 0.69141656, + "learning_rate": 1.907535821289003e-06, + "loss": 0.71304929, + "num_input_tokens_seen": 189155365, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.14080811, + "step": 8803, + "time_per_iteration": 2.59857439994812 + }, + { + "auxiliary_loss_clip": 0.01118521, + "auxiliary_loss_mlp": 0.01042945, + "balance_loss_clip": 1.04427898, + "balance_loss_mlp": 1.02779341, + "epoch": 0.5293251164888021, + "flos": 20447679859200.0, + "grad_norm": 1.797201852761157, + "language_loss": 0.76159245, + "learning_rate": 1.9071467778046458e-06, + "loss": 0.78320712, + "num_input_tokens_seen": 189173885, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.15161133, + "step": 8804, + "time_per_iteration": 2.497507095336914 + }, + { + "auxiliary_loss_clip": 0.01070253, + "auxiliary_loss_mlp": 0.01007844, + "balance_loss_clip": 1.04382753, + "balance_loss_mlp": 1.00632119, + "epoch": 0.52938523974147, + "flos": 66545312204160.0, + "grad_norm": 0.7540828922422722, + "language_loss": 0.52982283, + "learning_rate": 1.906757737841291e-06, + "loss": 0.55060381, + "num_input_tokens_seen": 189236515, + "router_z_loss_clip": 0.26416016, + "router_z_loss_mlp": 0.01522827, + "step": 8805, + "time_per_iteration": 3.182954788208008 + }, + { + "auxiliary_loss_clip": 0.01050396, + "auxiliary_loss_mlp": 0.01005896, + "balance_loss_clip": 1.02446592, + "balance_loss_mlp": 1.00451207, + "epoch": 0.529445362994138, + "flos": 67151734542720.0, + "grad_norm": 0.7369632486604503, + "language_loss": 0.63794887, + "learning_rate": 1.906368701413693e-06, + "loss": 0.65851188, + "num_input_tokens_seen": 189300500, + "router_z_loss_clip": 0.25878906, + "router_z_loss_mlp": 0.01383972, + "step": 8806, + "time_per_iteration": 3.0943403244018555 + }, + { + "auxiliary_loss_clip": 0.01132732, + "auxiliary_loss_mlp": 0.01031827, + "balance_loss_clip": 1.0509932, + "balance_loss_mlp": 1.01850545, + "epoch": 0.5295054862468059, + "flos": 17749316407680.0, + "grad_norm": 1.551141227468258, + "language_loss": 0.71827936, + "learning_rate": 1.9059796685366026e-06, + "loss": 0.73992497, + "num_input_tokens_seen": 189319745, + "router_z_loss_clip": 0.81738281, + "router_z_loss_mlp": 0.13317871, + "step": 8807, + "time_per_iteration": 2.461716413497925 + }, + { + "auxiliary_loss_clip": 0.01127015, + "auxiliary_loss_mlp": 0.010321, + "balance_loss_clip": 1.05195713, + "balance_loss_mlp": 1.01897502, + "epoch": 0.529565609499474, + "flos": 11397401084160.0, + "grad_norm": 2.0235371973711684, + "language_loss": 0.69648314, + "learning_rate": 1.9055906392247723e-06, + "loss": 0.71807426, + "num_input_tokens_seen": 189334550, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.13116455, + "step": 8808, + "time_per_iteration": 2.3994336128234863 + }, + { + "auxiliary_loss_clip": 0.01129509, + "auxiliary_loss_mlp": 0.01032968, + "balance_loss_clip": 1.05166781, + "balance_loss_mlp": 1.02070165, + "epoch": 0.5296257327521419, + "flos": 17196363463680.0, + "grad_norm": 2.4235153666803932, + "language_loss": 0.86625814, + "learning_rate": 1.9052016134929554e-06, + "loss": 0.88788295, + "num_input_tokens_seen": 189351735, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.12280273, + "step": 8809, + "time_per_iteration": 2.5126612186431885 + }, + { + "auxiliary_loss_clip": 0.01130653, + "auxiliary_loss_mlp": 0.01036927, + "balance_loss_clip": 1.04762065, + "balance_loss_mlp": 1.02228868, + "epoch": 0.5296858560048099, + "flos": 39964086777600.0, + "grad_norm": 2.0198492074809735, + "language_loss": 0.64079523, + "learning_rate": 1.9048125913559016e-06, + "loss": 0.66247106, + "num_input_tokens_seen": 189373105, + "router_z_loss_clip": 0.83056641, + "router_z_loss_mlp": 0.14648438, + "step": 8810, + "time_per_iteration": 2.6273140907287598 + }, + { + "auxiliary_loss_clip": 0.01135606, + "auxiliary_loss_mlp": 0.01039019, + "balance_loss_clip": 1.0565083, + "balance_loss_mlp": 1.02562022, + "epoch": 0.5297459792574778, + "flos": 20961418129920.0, + "grad_norm": 1.9134149143534973, + "language_loss": 0.68240798, + "learning_rate": 1.9044235728283646e-06, + "loss": 0.70415425, + "num_input_tokens_seen": 189394615, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.13391113, + "step": 8811, + "time_per_iteration": 2.5856690406799316 + }, + { + "auxiliary_loss_clip": 0.01068583, + "auxiliary_loss_mlp": 0.01005218, + "balance_loss_clip": 1.04244232, + "balance_loss_mlp": 1.00366688, + "epoch": 0.5298061025101458, + "flos": 66523620389760.0, + "grad_norm": 0.6655959203698071, + "language_loss": 0.53384322, + "learning_rate": 1.9040345579250953e-06, + "loss": 0.55458117, + "num_input_tokens_seen": 189459750, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.01551819, + "step": 8812, + "time_per_iteration": 3.231731653213501 + }, + { + "auxiliary_loss_clip": 0.01058659, + "auxiliary_loss_mlp": 0.01005396, + "balance_loss_clip": 1.03202105, + "balance_loss_mlp": 1.00372255, + "epoch": 0.5298662257628137, + "flos": 67662994775040.0, + "grad_norm": 0.7331049282021463, + "language_loss": 0.56311238, + "learning_rate": 1.9036455466608453e-06, + "loss": 0.58375287, + "num_input_tokens_seen": 189527540, + "router_z_loss_clip": 0.26708984, + "router_z_loss_mlp": 0.01675415, + "step": 8813, + "time_per_iteration": 3.2131454944610596 + }, + { + "auxiliary_loss_clip": 0.01124964, + "auxiliary_loss_mlp": 0.0103552, + "balance_loss_clip": 1.05099964, + "balance_loss_mlp": 1.02167988, + "epoch": 0.5299263490154817, + "flos": 19646405216640.0, + "grad_norm": 1.646223588667187, + "language_loss": 0.81492937, + "learning_rate": 1.9032565390503657e-06, + "loss": 0.8365342, + "num_input_tokens_seen": 189546900, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.13842773, + "step": 8814, + "time_per_iteration": 2.473237991333008 + }, + { + "auxiliary_loss_clip": 0.01131588, + "auxiliary_loss_mlp": 0.01038617, + "balance_loss_clip": 1.05128109, + "balance_loss_mlp": 1.02344179, + "epoch": 0.5299864722681497, + "flos": 22055005653120.0, + "grad_norm": 1.6373384528446469, + "language_loss": 0.85054362, + "learning_rate": 1.9028675351084076e-06, + "loss": 0.87224567, + "num_input_tokens_seen": 189566490, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.1517334, + "step": 8815, + "time_per_iteration": 2.506251335144043 + }, + { + "auxiliary_loss_clip": 0.0113271, + "auxiliary_loss_mlp": 0.01029668, + "balance_loss_clip": 1.05704451, + "balance_loss_mlp": 1.0168705, + "epoch": 0.5300465955208177, + "flos": 21763698353280.0, + "grad_norm": 2.579577698516839, + "language_loss": 0.6585415, + "learning_rate": 1.9024785348497225e-06, + "loss": 0.68016529, + "num_input_tokens_seen": 189585580, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.12805176, + "step": 8816, + "time_per_iteration": 2.485286235809326 + }, + { + "auxiliary_loss_clip": 0.01135043, + "auxiliary_loss_mlp": 0.01040564, + "balance_loss_clip": 1.05472183, + "balance_loss_mlp": 1.02712953, + "epoch": 0.5301067187734857, + "flos": 42996491735040.0, + "grad_norm": 1.5574628844481544, + "language_loss": 0.72288722, + "learning_rate": 1.9020895382890611e-06, + "loss": 0.74464333, + "num_input_tokens_seen": 189608485, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.13439941, + "step": 8817, + "time_per_iteration": 2.654721736907959 + }, + { + "auxiliary_loss_clip": 0.01130844, + "auxiliary_loss_mlp": 0.01033938, + "balance_loss_clip": 1.05132818, + "balance_loss_mlp": 1.02005601, + "epoch": 0.5301668420261536, + "flos": 20554298403840.0, + "grad_norm": 1.6058938871390822, + "language_loss": 0.65321457, + "learning_rate": 1.9017005454411743e-06, + "loss": 0.67486238, + "num_input_tokens_seen": 189627815, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.13885498, + "step": 8818, + "time_per_iteration": 2.4973251819610596 + }, + { + "auxiliary_loss_clip": 0.01141974, + "auxiliary_loss_mlp": 0.01032405, + "balance_loss_clip": 1.06211233, + "balance_loss_mlp": 1.0183146, + "epoch": 0.5302269652788216, + "flos": 17486665182720.0, + "grad_norm": 2.281890071262525, + "language_loss": 0.74746525, + "learning_rate": 1.9013115563208126e-06, + "loss": 0.76920903, + "num_input_tokens_seen": 189644850, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.14086914, + "step": 8819, + "time_per_iteration": 3.917039632797241 + }, + { + "auxiliary_loss_clip": 0.01131963, + "auxiliary_loss_mlp": 0.0103554, + "balance_loss_clip": 1.05074453, + "balance_loss_mlp": 1.02173591, + "epoch": 0.5302870885314895, + "flos": 14574202715520.0, + "grad_norm": 2.323428532413325, + "language_loss": 0.81919545, + "learning_rate": 1.9009225709427267e-06, + "loss": 0.8408705, + "num_input_tokens_seen": 189660945, + "router_z_loss_clip": 0.81201172, + "router_z_loss_mlp": 0.13793945, + "step": 8820, + "time_per_iteration": 2.4865055084228516 + }, + { + "auxiliary_loss_clip": 0.01135, + "auxiliary_loss_mlp": 0.01037592, + "balance_loss_clip": 1.05236161, + "balance_loss_mlp": 1.0245564, + "epoch": 0.5303472117841576, + "flos": 23438032968960.0, + "grad_norm": 1.9732277829195295, + "language_loss": 0.72535932, + "learning_rate": 1.9005335893216667e-06, + "loss": 0.74708521, + "num_input_tokens_seen": 189680425, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.1305542, + "step": 8821, + "time_per_iteration": 2.448761463165283 + }, + { + "auxiliary_loss_clip": 0.01124906, + "auxiliary_loss_mlp": 0.0103045, + "balance_loss_clip": 1.05021501, + "balance_loss_mlp": 1.01811159, + "epoch": 0.5304073350368255, + "flos": 22709010533760.0, + "grad_norm": 1.6022297088109096, + "language_loss": 0.74138832, + "learning_rate": 1.9001446114723824e-06, + "loss": 0.76294184, + "num_input_tokens_seen": 189700375, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.12359619, + "step": 8822, + "time_per_iteration": 2.507375717163086 + }, + { + "auxiliary_loss_clip": 0.01136666, + "auxiliary_loss_mlp": 0.01037895, + "balance_loss_clip": 1.05328238, + "balance_loss_mlp": 1.02361369, + "epoch": 0.5304674582894935, + "flos": 27928554624000.0, + "grad_norm": 1.7536952092284936, + "language_loss": 0.67467594, + "learning_rate": 1.8997556374096257e-06, + "loss": 0.69642156, + "num_input_tokens_seen": 189721225, + "router_z_loss_clip": 0.83447266, + "router_z_loss_mlp": 0.14276123, + "step": 8823, + "time_per_iteration": 2.4957590103149414 + }, + { + "auxiliary_loss_clip": 0.01140985, + "auxiliary_loss_mlp": 0.01041817, + "balance_loss_clip": 1.0591352, + "balance_loss_mlp": 1.0273931, + "epoch": 0.5305275815421614, + "flos": 21250642440960.0, + "grad_norm": 1.8109220283305265, + "language_loss": 0.69831741, + "learning_rate": 1.8993666671481444e-06, + "loss": 0.72014546, + "num_input_tokens_seen": 189740170, + "router_z_loss_clip": 0.81738281, + "router_z_loss_mlp": 0.14422607, + "step": 8824, + "time_per_iteration": 2.5006439685821533 + }, + { + "auxiliary_loss_clip": 0.01126165, + "auxiliary_loss_mlp": 0.01032744, + "balance_loss_clip": 1.05169129, + "balance_loss_mlp": 1.02047122, + "epoch": 0.5305877047948294, + "flos": 17603088140160.0, + "grad_norm": 3.489458077504464, + "language_loss": 0.76013994, + "learning_rate": 1.898977700702689e-06, + "loss": 0.7817291, + "num_input_tokens_seen": 189757890, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.12268066, + "step": 8825, + "time_per_iteration": 2.441401720046997 + }, + { + "auxiliary_loss_clip": 0.0113871, + "auxiliary_loss_mlp": 0.01036658, + "balance_loss_clip": 1.05785131, + "balance_loss_mlp": 1.02339602, + "epoch": 0.5306478280474973, + "flos": 15195493284480.0, + "grad_norm": 2.2341186226940186, + "language_loss": 0.85606647, + "learning_rate": 1.8985887380880103e-06, + "loss": 0.87782019, + "num_input_tokens_seen": 189775390, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.1328125, + "step": 8826, + "time_per_iteration": 3.908151626586914 + }, + { + "auxiliary_loss_clip": 0.01129483, + "auxiliary_loss_mlp": 0.01032626, + "balance_loss_clip": 1.05209684, + "balance_loss_mlp": 1.01927471, + "epoch": 0.5307079513001653, + "flos": 15341218761600.0, + "grad_norm": 3.4972964303094978, + "language_loss": 0.64448696, + "learning_rate": 1.8981997793188558e-06, + "loss": 0.66610807, + "num_input_tokens_seen": 189793975, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.13342285, + "step": 8827, + "time_per_iteration": 2.5956714153289795 + }, + { + "auxiliary_loss_clip": 0.01137486, + "auxiliary_loss_mlp": 0.01039636, + "balance_loss_clip": 1.05694425, + "balance_loss_mlp": 1.02607632, + "epoch": 0.5307680745528333, + "flos": 43544452688640.0, + "grad_norm": 1.6325352043412433, + "language_loss": 0.59767962, + "learning_rate": 1.8978108244099762e-06, + "loss": 0.61945081, + "num_input_tokens_seen": 189817870, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.13543701, + "step": 8828, + "time_per_iteration": 2.663092851638794 + }, + { + "auxiliary_loss_clip": 0.01140922, + "auxiliary_loss_mlp": 0.01032722, + "balance_loss_clip": 1.05975413, + "balance_loss_mlp": 1.01895905, + "epoch": 0.5308281978055013, + "flos": 20048928001920.0, + "grad_norm": 1.6555576687776181, + "language_loss": 0.81194437, + "learning_rate": 1.8974218733761208e-06, + "loss": 0.83368081, + "num_input_tokens_seen": 189837905, + "router_z_loss_clip": 0.81201172, + "router_z_loss_mlp": 0.13757324, + "step": 8829, + "time_per_iteration": 2.541560649871826 + }, + { + "auxiliary_loss_clip": 0.01134554, + "auxiliary_loss_mlp": 0.01031903, + "balance_loss_clip": 1.05929053, + "balance_loss_mlp": 1.01884985, + "epoch": 0.5308883210581693, + "flos": 20703938463360.0, + "grad_norm": 1.460294829598216, + "language_loss": 0.78268552, + "learning_rate": 1.8970329262320375e-06, + "loss": 0.80435002, + "num_input_tokens_seen": 189856970, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.13061523, + "step": 8830, + "time_per_iteration": 2.4761364459991455 + }, + { + "auxiliary_loss_clip": 0.01129944, + "auxiliary_loss_mlp": 0.01030485, + "balance_loss_clip": 1.05373502, + "balance_loss_mlp": 1.01730633, + "epoch": 0.5309484443108372, + "flos": 14355506759040.0, + "grad_norm": 2.1764943871008144, + "language_loss": 0.80991066, + "learning_rate": 1.8966439829924768e-06, + "loss": 0.83151495, + "num_input_tokens_seen": 189872830, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.13165283, + "step": 8831, + "time_per_iteration": 2.5400190353393555 + }, + { + "auxiliary_loss_clip": 0.01133684, + "auxiliary_loss_mlp": 0.01034492, + "balance_loss_clip": 1.05759656, + "balance_loss_mlp": 1.02147388, + "epoch": 0.5310085675635052, + "flos": 20010503427840.0, + "grad_norm": 1.6580905859562167, + "language_loss": 0.73567343, + "learning_rate": 1.896255043672186e-06, + "loss": 0.75735521, + "num_input_tokens_seen": 189891635, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.13018799, + "step": 8832, + "time_per_iteration": 4.075336933135986 + }, + { + "auxiliary_loss_clip": 0.01138755, + "auxiliary_loss_mlp": 0.01046219, + "balance_loss_clip": 1.05524254, + "balance_loss_mlp": 1.03109121, + "epoch": 0.5310686908161731, + "flos": 22127293774080.0, + "grad_norm": 1.9651758180921974, + "language_loss": 0.75809026, + "learning_rate": 1.8958661082859143e-06, + "loss": 0.77994001, + "num_input_tokens_seen": 189909050, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.15112305, + "step": 8833, + "time_per_iteration": 2.4966678619384766 + }, + { + "auxiliary_loss_clip": 0.01127871, + "auxiliary_loss_mlp": 0.01033644, + "balance_loss_clip": 1.04884195, + "balance_loss_mlp": 1.0198096, + "epoch": 0.5311288140688412, + "flos": 24717889445760.0, + "grad_norm": 2.8614877444957565, + "language_loss": 0.73399609, + "learning_rate": 1.8954771768484103e-06, + "loss": 0.7556113, + "num_input_tokens_seen": 189927405, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.13824463, + "step": 8834, + "time_per_iteration": 2.508802652359009 + }, + { + "auxiliary_loss_clip": 0.01141781, + "auxiliary_loss_mlp": 0.01036407, + "balance_loss_clip": 1.05626774, + "balance_loss_mlp": 1.02125525, + "epoch": 0.5311889373215091, + "flos": 24097712198400.0, + "grad_norm": 1.8548257691333625, + "language_loss": 0.77761078, + "learning_rate": 1.8950882493744226e-06, + "loss": 0.79939258, + "num_input_tokens_seen": 189947740, + "router_z_loss_clip": 0.85449219, + "router_z_loss_mlp": 0.15155029, + "step": 8835, + "time_per_iteration": 2.565246820449829 + }, + { + "auxiliary_loss_clip": 0.01139213, + "auxiliary_loss_mlp": 0.01035646, + "balance_loss_clip": 1.05960512, + "balance_loss_mlp": 1.02119803, + "epoch": 0.5312490605741771, + "flos": 22017012042240.0, + "grad_norm": 1.6035797105510357, + "language_loss": 0.72458494, + "learning_rate": 1.8946993258786985e-06, + "loss": 0.7463336, + "num_input_tokens_seen": 189966495, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.14465332, + "step": 8836, + "time_per_iteration": 2.589233636856079 + }, + { + "auxiliary_loss_clip": 0.01132605, + "auxiliary_loss_mlp": 0.01037475, + "balance_loss_clip": 1.05433083, + "balance_loss_mlp": 1.0234679, + "epoch": 0.531309183826845, + "flos": 19390541662080.0, + "grad_norm": 1.999711725818245, + "language_loss": 0.80386221, + "learning_rate": 1.894310406375987e-06, + "loss": 0.82556301, + "num_input_tokens_seen": 189985325, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.14007568, + "step": 8837, + "time_per_iteration": 2.48982572555542 + }, + { + "auxiliary_loss_clip": 0.0113011, + "auxiliary_loss_mlp": 0.01029769, + "balance_loss_clip": 1.05365872, + "balance_loss_mlp": 1.01598263, + "epoch": 0.531369307079513, + "flos": 20190056538240.0, + "grad_norm": 1.7366679791323025, + "language_loss": 0.85635793, + "learning_rate": 1.893921490881035e-06, + "loss": 0.87795675, + "num_input_tokens_seen": 190003290, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.13769531, + "step": 8838, + "time_per_iteration": 2.4910638332366943 + }, + { + "auxiliary_loss_clip": 0.01124244, + "auxiliary_loss_mlp": 0.01041213, + "balance_loss_clip": 1.0469898, + "balance_loss_mlp": 1.02656853, + "epoch": 0.5314294303321809, + "flos": 18880143356160.0, + "grad_norm": 1.7892638386443058, + "language_loss": 0.72788328, + "learning_rate": 1.8935325794085906e-06, + "loss": 0.74953783, + "num_input_tokens_seen": 190023260, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.14630127, + "step": 8839, + "time_per_iteration": 2.528660535812378 + }, + { + "auxiliary_loss_clip": 0.01133502, + "auxiliary_loss_mlp": 0.01040795, + "balance_loss_clip": 1.05150747, + "balance_loss_mlp": 1.02724123, + "epoch": 0.531489553584849, + "flos": 23040035297280.0, + "grad_norm": 1.8421590363659723, + "language_loss": 0.76578557, + "learning_rate": 1.8931436719734023e-06, + "loss": 0.78752851, + "num_input_tokens_seen": 190042035, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.13562012, + "step": 8840, + "time_per_iteration": 2.4836928844451904 + }, + { + "auxiliary_loss_clip": 0.01132759, + "auxiliary_loss_mlp": 0.01036646, + "balance_loss_clip": 1.05116177, + "balance_loss_mlp": 1.02235866, + "epoch": 0.5315496768375169, + "flos": 19790478668160.0, + "grad_norm": 2.0656531007906276, + "language_loss": 0.77611154, + "learning_rate": 1.892754768590216e-06, + "loss": 0.79780561, + "num_input_tokens_seen": 190057545, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.1428833, + "step": 8841, + "time_per_iteration": 2.4836347103118896 + }, + { + "auxiliary_loss_clip": 0.01073595, + "auxiliary_loss_mlp": 0.01021384, + "balance_loss_clip": 1.04745495, + "balance_loss_mlp": 1.01986837, + "epoch": 0.5316098000901849, + "flos": 71023228185600.0, + "grad_norm": 0.6942840503276178, + "language_loss": 0.56791455, + "learning_rate": 1.8923658692737793e-06, + "loss": 0.58886433, + "num_input_tokens_seen": 190123800, + "router_z_loss_clip": 0.26074219, + "router_z_loss_mlp": 0.0151825, + "step": 8842, + "time_per_iteration": 3.275336742401123 + }, + { + "auxiliary_loss_clip": 0.0113623, + "auxiliary_loss_mlp": 0.01039792, + "balance_loss_clip": 1.05579293, + "balance_loss_mlp": 1.0251056, + "epoch": 0.5316699233428529, + "flos": 16435560470400.0, + "grad_norm": 1.7961134389443778, + "language_loss": 0.73652256, + "learning_rate": 1.8919769740388407e-06, + "loss": 0.75828278, + "num_input_tokens_seen": 190141625, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.14697266, + "step": 8843, + "time_per_iteration": 2.471240997314453 + }, + { + "auxiliary_loss_clip": 0.01073572, + "auxiliary_loss_mlp": 0.01020983, + "balance_loss_clip": 1.04695165, + "balance_loss_mlp": 1.01921844, + "epoch": 0.5317300465955208, + "flos": 67420814302080.0, + "grad_norm": 0.9022206609166477, + "language_loss": 0.6094625, + "learning_rate": 1.891588082900145e-06, + "loss": 0.63040805, + "num_input_tokens_seen": 190198110, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.01766968, + "step": 8844, + "time_per_iteration": 3.157799243927002 + }, + { + "auxiliary_loss_clip": 0.01053798, + "auxiliary_loss_mlp": 0.01001064, + "balance_loss_clip": 1.02710283, + "balance_loss_mlp": 0.99963057, + "epoch": 0.5317901698481888, + "flos": 59508075340800.0, + "grad_norm": 0.8648077186361834, + "language_loss": 0.62167543, + "learning_rate": 1.8911991958724411e-06, + "loss": 0.64222407, + "num_input_tokens_seen": 190259950, + "router_z_loss_clip": 0.26708984, + "router_z_loss_mlp": 0.014328, + "step": 8845, + "time_per_iteration": 4.497785568237305 + }, + { + "auxiliary_loss_clip": 0.01139286, + "auxiliary_loss_mlp": 0.01037593, + "balance_loss_clip": 1.06055486, + "balance_loss_mlp": 1.02291834, + "epoch": 0.5318502931008567, + "flos": 19129219240320.0, + "grad_norm": 2.605759055505333, + "language_loss": 0.75825506, + "learning_rate": 1.890810312970474e-06, + "loss": 0.78002387, + "num_input_tokens_seen": 190278265, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.14678955, + "step": 8846, + "time_per_iteration": 2.491154909133911 + }, + { + "auxiliary_loss_clip": 0.0113513, + "auxiliary_loss_mlp": 0.01039151, + "balance_loss_clip": 1.05549884, + "balance_loss_mlp": 1.02517951, + "epoch": 0.5319104163535248, + "flos": 24681045070080.0, + "grad_norm": 1.5697320711447362, + "language_loss": 0.75618482, + "learning_rate": 1.8904214342089903e-06, + "loss": 0.77792764, + "num_input_tokens_seen": 190298400, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.13964844, + "step": 8847, + "time_per_iteration": 2.536327362060547 + }, + { + "auxiliary_loss_clip": 0.01125535, + "auxiliary_loss_mlp": 0.0103521, + "balance_loss_clip": 1.04854, + "balance_loss_mlp": 1.02245498, + "epoch": 0.5319705396061927, + "flos": 19385513758080.0, + "grad_norm": 1.6593412485901244, + "language_loss": 0.87746203, + "learning_rate": 1.8900325596027378e-06, + "loss": 0.89906943, + "num_input_tokens_seen": 190316235, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.12768555, + "step": 8848, + "time_per_iteration": 2.5075244903564453 + }, + { + "auxiliary_loss_clip": 0.01131567, + "auxiliary_loss_mlp": 0.01038524, + "balance_loss_clip": 1.05410075, + "balance_loss_mlp": 1.02346802, + "epoch": 0.5320306628588607, + "flos": 18259319664000.0, + "grad_norm": 1.893794490600643, + "language_loss": 0.74428403, + "learning_rate": 1.8896436891664609e-06, + "loss": 0.76598489, + "num_input_tokens_seen": 190335060, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.1506958, + "step": 8849, + "time_per_iteration": 2.5550789833068848 + }, + { + "auxiliary_loss_clip": 0.0112976, + "auxiliary_loss_mlp": 0.01031167, + "balance_loss_clip": 1.04808235, + "balance_loss_mlp": 1.01688612, + "epoch": 0.5320907861115286, + "flos": 23732321097600.0, + "grad_norm": 1.6912188582082548, + "language_loss": 0.79577595, + "learning_rate": 1.8892548229149066e-06, + "loss": 0.8173852, + "num_input_tokens_seen": 190353265, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.14276123, + "step": 8850, + "time_per_iteration": 2.474332332611084 + }, + { + "auxiliary_loss_clip": 0.01130169, + "auxiliary_loss_mlp": 0.01036338, + "balance_loss_clip": 1.05043912, + "balance_loss_mlp": 1.02221775, + "epoch": 0.5321509093641966, + "flos": 34495251321600.0, + "grad_norm": 1.4182088459412168, + "language_loss": 0.55152655, + "learning_rate": 1.888865960862821e-06, + "loss": 0.57319164, + "num_input_tokens_seen": 190376575, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.14117432, + "step": 8851, + "time_per_iteration": 2.678581953048706 + }, + { + "auxiliary_loss_clip": 0.01132556, + "auxiliary_loss_mlp": 0.01031019, + "balance_loss_clip": 1.05184388, + "balance_loss_mlp": 1.01808524, + "epoch": 0.5322110326168645, + "flos": 20010934391040.0, + "grad_norm": 1.6248324287508817, + "language_loss": 0.68522811, + "learning_rate": 1.8884771030249484e-06, + "loss": 0.70686388, + "num_input_tokens_seen": 190395185, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.12939453, + "step": 8852, + "time_per_iteration": 2.4476680755615234 + }, + { + "auxiliary_loss_clip": 0.01089902, + "auxiliary_loss_mlp": 0.01024019, + "balance_loss_clip": 1.06287646, + "balance_loss_mlp": 1.02181506, + "epoch": 0.5322711558695326, + "flos": 64631164435200.0, + "grad_norm": 0.8100636049630373, + "language_loss": 0.62795651, + "learning_rate": 1.8880882494160357e-06, + "loss": 0.64909577, + "num_input_tokens_seen": 190452595, + "router_z_loss_clip": 0.27050781, + "router_z_loss_mlp": 0.02204895, + "step": 8853, + "time_per_iteration": 3.0662553310394287 + }, + { + "auxiliary_loss_clip": 0.01129248, + "auxiliary_loss_mlp": 0.01031451, + "balance_loss_clip": 1.04866552, + "balance_loss_mlp": 1.01836133, + "epoch": 0.5323312791222005, + "flos": 14939342421120.0, + "grad_norm": 3.3866789753679076, + "language_loss": 0.79379106, + "learning_rate": 1.8876994000508278e-06, + "loss": 0.81539798, + "num_input_tokens_seen": 190469140, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.13098145, + "step": 8854, + "time_per_iteration": 2.4712677001953125 + }, + { + "auxiliary_loss_clip": 0.01132487, + "auxiliary_loss_mlp": 0.01032487, + "balance_loss_clip": 1.05745769, + "balance_loss_mlp": 1.02076316, + "epoch": 0.5323914023748685, + "flos": 23440834229760.0, + "grad_norm": 1.8433982377841958, + "language_loss": 0.73279536, + "learning_rate": 1.8873105549440698e-06, + "loss": 0.75444508, + "num_input_tokens_seen": 190489015, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11730957, + "step": 8855, + "time_per_iteration": 2.548391103744507 + }, + { + "auxiliary_loss_clip": 0.01127371, + "auxiliary_loss_mlp": 0.01035203, + "balance_loss_clip": 1.04741263, + "balance_loss_mlp": 1.02251959, + "epoch": 0.5324515256275365, + "flos": 26286180134400.0, + "grad_norm": 2.0544295532512327, + "language_loss": 0.65232599, + "learning_rate": 1.886921714110507e-06, + "loss": 0.67395175, + "num_input_tokens_seen": 190508065, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.12670898, + "step": 8856, + "time_per_iteration": 2.6283750534057617 + }, + { + "auxiliary_loss_clip": 0.01138499, + "auxiliary_loss_mlp": 0.01041335, + "balance_loss_clip": 1.05753767, + "balance_loss_mlp": 1.02482486, + "epoch": 0.5325116488802044, + "flos": 26870913636480.0, + "grad_norm": 2.305670298494048, + "language_loss": 0.77912843, + "learning_rate": 1.8865328775648842e-06, + "loss": 0.8009268, + "num_input_tokens_seen": 190527045, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.16516113, + "step": 8857, + "time_per_iteration": 2.6285741329193115 + }, + { + "auxiliary_loss_clip": 0.01129411, + "auxiliary_loss_mlp": 0.01038418, + "balance_loss_clip": 1.04970169, + "balance_loss_mlp": 1.02476823, + "epoch": 0.5325717721328724, + "flos": 25884734757120.0, + "grad_norm": 10.474971111467937, + "language_loss": 0.70653152, + "learning_rate": 1.8861440453219456e-06, + "loss": 0.72820985, + "num_input_tokens_seen": 190544075, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.13665771, + "step": 8858, + "time_per_iteration": 2.6058907508850098 + }, + { + "auxiliary_loss_clip": 0.01132995, + "auxiliary_loss_mlp": 0.01047488, + "balance_loss_clip": 1.05157077, + "balance_loss_mlp": 1.03097749, + "epoch": 0.5326318953855403, + "flos": 21799321666560.0, + "grad_norm": 1.7775749215765255, + "language_loss": 0.69287616, + "learning_rate": 1.8857552173964367e-06, + "loss": 0.71468097, + "num_input_tokens_seen": 190566030, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.1651001, + "step": 8859, + "time_per_iteration": 2.557194471359253 + }, + { + "auxiliary_loss_clip": 0.01126467, + "auxiliary_loss_mlp": 0.01027948, + "balance_loss_clip": 1.05266607, + "balance_loss_mlp": 1.01568174, + "epoch": 0.5326920186382084, + "flos": 20922921728640.0, + "grad_norm": 1.6565016647329425, + "language_loss": 0.69712633, + "learning_rate": 1.8853663938031013e-06, + "loss": 0.71867049, + "num_input_tokens_seen": 190585605, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.12268066, + "step": 8860, + "time_per_iteration": 2.6200759410858154 + }, + { + "auxiliary_loss_clip": 0.01128166, + "auxiliary_loss_mlp": 0.01032802, + "balance_loss_clip": 1.0524404, + "balance_loss_mlp": 1.02024376, + "epoch": 0.5327521418908763, + "flos": 21433427775360.0, + "grad_norm": 2.5228528085470314, + "language_loss": 0.78112888, + "learning_rate": 1.884977574556683e-06, + "loss": 0.80273861, + "num_input_tokens_seen": 190604625, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.12548828, + "step": 8861, + "time_per_iteration": 2.571147918701172 + }, + { + "auxiliary_loss_clip": 0.01133776, + "auxiliary_loss_mlp": 0.01040894, + "balance_loss_clip": 1.05458951, + "balance_loss_mlp": 1.02700615, + "epoch": 0.5328122651435443, + "flos": 21760250647680.0, + "grad_norm": 1.8713955533651532, + "language_loss": 0.85699242, + "learning_rate": 1.8845887596719279e-06, + "loss": 0.87873912, + "num_input_tokens_seen": 190625060, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.13885498, + "step": 8862, + "time_per_iteration": 2.519282341003418 + }, + { + "auxiliary_loss_clip": 0.01125478, + "auxiliary_loss_mlp": 0.01038808, + "balance_loss_clip": 1.04504681, + "balance_loss_mlp": 1.02353096, + "epoch": 0.5328723883962122, + "flos": 18296487262080.0, + "grad_norm": 1.7970217800168835, + "language_loss": 0.61748809, + "learning_rate": 1.8841999491635778e-06, + "loss": 0.63913095, + "num_input_tokens_seen": 190643150, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.152771, + "step": 8863, + "time_per_iteration": 3.892935037612915 + }, + { + "auxiliary_loss_clip": 0.01131847, + "auxiliary_loss_mlp": 0.01029702, + "balance_loss_clip": 1.05489326, + "balance_loss_mlp": 1.01738739, + "epoch": 0.5329325116488802, + "flos": 25374911068800.0, + "grad_norm": 1.788188640047526, + "language_loss": 0.73850042, + "learning_rate": 1.883811143046377e-06, + "loss": 0.76011586, + "num_input_tokens_seen": 190662725, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.12304688, + "step": 8864, + "time_per_iteration": 2.5733723640441895 + }, + { + "auxiliary_loss_clip": 0.01127549, + "auxiliary_loss_mlp": 0.010384, + "balance_loss_clip": 1.04956961, + "balance_loss_mlp": 1.0254178, + "epoch": 0.5329926349015481, + "flos": 25592098654080.0, + "grad_norm": 1.6489768413234351, + "language_loss": 0.64295125, + "learning_rate": 1.8834223413350702e-06, + "loss": 0.66461068, + "num_input_tokens_seen": 190683680, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.12976074, + "step": 8865, + "time_per_iteration": 2.5550215244293213 + }, + { + "auxiliary_loss_clip": 0.01136667, + "auxiliary_loss_mlp": 0.010342, + "balance_loss_clip": 1.05571914, + "balance_loss_mlp": 1.02053857, + "epoch": 0.5330527581542162, + "flos": 22889605138560.0, + "grad_norm": 1.7436972009253857, + "language_loss": 0.78750193, + "learning_rate": 1.8830335440443989e-06, + "loss": 0.80921066, + "num_input_tokens_seen": 190703350, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.13659668, + "step": 8866, + "time_per_iteration": 2.499314069747925 + }, + { + "auxiliary_loss_clip": 0.01127413, + "auxiliary_loss_mlp": 0.01033284, + "balance_loss_clip": 1.04969239, + "balance_loss_mlp": 1.02004552, + "epoch": 0.5331128814068841, + "flos": 16026752805120.0, + "grad_norm": 4.757411978819348, + "language_loss": 0.73578829, + "learning_rate": 1.882644751189108e-06, + "loss": 0.75739527, + "num_input_tokens_seen": 190721170, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.13244629, + "step": 8867, + "time_per_iteration": 2.4232370853424072 + }, + { + "auxiliary_loss_clip": 0.01135232, + "auxiliary_loss_mlp": 0.01042395, + "balance_loss_clip": 1.05530643, + "balance_loss_mlp": 1.02856052, + "epoch": 0.5331730046595521, + "flos": 39344699629440.0, + "grad_norm": 1.9267742276168287, + "language_loss": 0.71873415, + "learning_rate": 1.88225596278394e-06, + "loss": 0.74051034, + "num_input_tokens_seen": 190743795, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.1383667, + "step": 8868, + "time_per_iteration": 2.640756607055664 + }, + { + "auxiliary_loss_clip": 0.01129058, + "auxiliary_loss_mlp": 0.01036142, + "balance_loss_clip": 1.0509398, + "balance_loss_mlp": 1.02303529, + "epoch": 0.5332331279122201, + "flos": 24024382583040.0, + "grad_norm": 1.7178187559461977, + "language_loss": 0.78709567, + "learning_rate": 1.881867178843637e-06, + "loss": 0.80874765, + "num_input_tokens_seen": 190761560, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.13110352, + "step": 8869, + "time_per_iteration": 4.120243787765503 + }, + { + "auxiliary_loss_clip": 0.01135085, + "auxiliary_loss_mlp": 0.01034838, + "balance_loss_clip": 1.05252039, + "balance_loss_mlp": 1.0209682, + "epoch": 0.533293251164888, + "flos": 17129318728320.0, + "grad_norm": 1.7817268140803644, + "language_loss": 0.75236452, + "learning_rate": 1.8814783993829434e-06, + "loss": 0.77406371, + "num_input_tokens_seen": 190778875, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.13867188, + "step": 8870, + "time_per_iteration": 2.4562220573425293 + }, + { + "auxiliary_loss_clip": 0.01135792, + "auxiliary_loss_mlp": 0.010395, + "balance_loss_clip": 1.05451632, + "balance_loss_mlp": 1.0249927, + "epoch": 0.533353374417556, + "flos": 22126360020480.0, + "grad_norm": 1.7232961115924241, + "language_loss": 0.75801325, + "learning_rate": 1.8810896244165997e-06, + "loss": 0.7797662, + "num_input_tokens_seen": 190799830, + "router_z_loss_clip": 0.81347656, + "router_z_loss_mlp": 0.1449585, + "step": 8871, + "time_per_iteration": 2.5997867584228516 + }, + { + "auxiliary_loss_clip": 0.01132744, + "auxiliary_loss_mlp": 0.01032369, + "balance_loss_clip": 1.05308342, + "balance_loss_mlp": 1.0184809, + "epoch": 0.533413497670224, + "flos": 15011091838080.0, + "grad_norm": 1.934121015521056, + "language_loss": 0.72023666, + "learning_rate": 1.8807008539593498e-06, + "loss": 0.74188781, + "num_input_tokens_seen": 190817155, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.13891602, + "step": 8872, + "time_per_iteration": 2.4689836502075195 + }, + { + "auxiliary_loss_clip": 0.01137978, + "auxiliary_loss_mlp": 0.01041521, + "balance_loss_clip": 1.05905378, + "balance_loss_mlp": 1.02828264, + "epoch": 0.533473620922892, + "flos": 19609955890560.0, + "grad_norm": 1.9122241492881948, + "language_loss": 0.6520974, + "learning_rate": 1.880312088025936e-06, + "loss": 0.67389238, + "num_input_tokens_seen": 190835240, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.13238525, + "step": 8873, + "time_per_iteration": 2.5704445838928223 + }, + { + "auxiliary_loss_clip": 0.01131948, + "auxiliary_loss_mlp": 0.01040123, + "balance_loss_clip": 1.0513401, + "balance_loss_mlp": 1.02704597, + "epoch": 0.5335337441755599, + "flos": 14282644020480.0, + "grad_norm": 2.23549498720441, + "language_loss": 0.80541688, + "learning_rate": 1.879923326631099e-06, + "loss": 0.82713753, + "num_input_tokens_seen": 190851620, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.13085938, + "step": 8874, + "time_per_iteration": 2.4486191272735596 + }, + { + "auxiliary_loss_clip": 0.01131385, + "auxiliary_loss_mlp": 0.01032717, + "balance_loss_clip": 1.05361438, + "balance_loss_mlp": 1.01939511, + "epoch": 0.5335938674282279, + "flos": 20814830726400.0, + "grad_norm": 1.6065380386900407, + "language_loss": 0.69776607, + "learning_rate": 1.879534569789582e-06, + "loss": 0.71940708, + "num_input_tokens_seen": 190870545, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.13323975, + "step": 8875, + "time_per_iteration": 3.9542829990386963 + }, + { + "auxiliary_loss_clip": 0.01058343, + "auxiliary_loss_mlp": 0.01007408, + "balance_loss_clip": 1.03208494, + "balance_loss_mlp": 1.00561714, + "epoch": 0.5336539906808958, + "flos": 71396448451200.0, + "grad_norm": 0.7193203840556508, + "language_loss": 0.59677911, + "learning_rate": 1.879145817516126e-06, + "loss": 0.61743659, + "num_input_tokens_seen": 190931995, + "router_z_loss_clip": 0.26220703, + "router_z_loss_mlp": 0.0178833, + "step": 8876, + "time_per_iteration": 3.217029094696045 + }, + { + "auxiliary_loss_clip": 0.01126004, + "auxiliary_loss_mlp": 0.01039574, + "balance_loss_clip": 1.04955089, + "balance_loss_mlp": 1.02615058, + "epoch": 0.5337141139335638, + "flos": 20152996680960.0, + "grad_norm": 1.7993267439684417, + "language_loss": 0.74726397, + "learning_rate": 1.8787570698254727e-06, + "loss": 0.76891971, + "num_input_tokens_seen": 190949890, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.13433838, + "step": 8877, + "time_per_iteration": 2.509413719177246 + }, + { + "auxiliary_loss_clip": 0.01086066, + "auxiliary_loss_mlp": 0.01007591, + "balance_loss_clip": 1.05909562, + "balance_loss_mlp": 1.00531375, + "epoch": 0.5337742371862317, + "flos": 67728387484800.0, + "grad_norm": 0.749565704607705, + "language_loss": 0.57138109, + "learning_rate": 1.8783683267323629e-06, + "loss": 0.59231758, + "num_input_tokens_seen": 191008480, + "router_z_loss_clip": 0.27001953, + "router_z_loss_mlp": 0.02276611, + "step": 8878, + "time_per_iteration": 3.068920850753784 + }, + { + "auxiliary_loss_clip": 0.01133069, + "auxiliary_loss_mlp": 0.01037534, + "balance_loss_clip": 1.05130804, + "balance_loss_mlp": 1.02341938, + "epoch": 0.5338343604388998, + "flos": 25008909436800.0, + "grad_norm": 1.550588773274557, + "language_loss": 0.72245681, + "learning_rate": 1.8779795882515395e-06, + "loss": 0.74416286, + "num_input_tokens_seen": 191028995, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.14123535, + "step": 8879, + "time_per_iteration": 2.511261224746704 + }, + { + "auxiliary_loss_clip": 0.01139217, + "auxiliary_loss_mlp": 0.01032157, + "balance_loss_clip": 1.05674922, + "balance_loss_mlp": 1.01785231, + "epoch": 0.5338944836915677, + "flos": 17601256546560.0, + "grad_norm": 2.1413562730399653, + "language_loss": 0.83878505, + "learning_rate": 1.8775908543977416e-06, + "loss": 0.86049879, + "num_input_tokens_seen": 191045285, + "router_z_loss_clip": 0.82470703, + "router_z_loss_mlp": 0.14312744, + "step": 8880, + "time_per_iteration": 2.4953930377960205 + }, + { + "auxiliary_loss_clip": 0.01128139, + "auxiliary_loss_mlp": 0.01029653, + "balance_loss_clip": 1.05233979, + "balance_loss_mlp": 1.01773238, + "epoch": 0.5339546069442357, + "flos": 21724124544000.0, + "grad_norm": 1.7795771804563136, + "language_loss": 0.7936312, + "learning_rate": 1.8772021251857107e-06, + "loss": 0.81520915, + "num_input_tokens_seen": 191066105, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.11920166, + "step": 8881, + "time_per_iteration": 2.506154775619507 + }, + { + "auxiliary_loss_clip": 0.0106663, + "auxiliary_loss_mlp": 0.01002398, + "balance_loss_clip": 1.03914237, + "balance_loss_mlp": 1.00109756, + "epoch": 0.5340147301969036, + "flos": 69723583315200.0, + "grad_norm": 0.7898235452282847, + "language_loss": 0.59245765, + "learning_rate": 1.8768134006301882e-06, + "loss": 0.61314791, + "num_input_tokens_seen": 191126315, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.01301575, + "step": 8882, + "time_per_iteration": 3.0430092811584473 + }, + { + "auxiliary_loss_clip": 0.01060401, + "auxiliary_loss_mlp": 0.01003781, + "balance_loss_clip": 1.03419471, + "balance_loss_mlp": 1.00241435, + "epoch": 0.5340748534495716, + "flos": 63880701580800.0, + "grad_norm": 0.8698890610828549, + "language_loss": 0.63576627, + "learning_rate": 1.876424680745913e-06, + "loss": 0.65640813, + "num_input_tokens_seen": 191174240, + "router_z_loss_clip": 0.26220703, + "router_z_loss_mlp": 0.01367188, + "step": 8883, + "time_per_iteration": 2.8924455642700195 + }, + { + "auxiliary_loss_clip": 0.01135341, + "auxiliary_loss_mlp": 0.01031986, + "balance_loss_clip": 1.0550952, + "balance_loss_mlp": 1.0178237, + "epoch": 0.5341349767022396, + "flos": 28694313694080.0, + "grad_norm": 2.938279983190323, + "language_loss": 0.81702381, + "learning_rate": 1.8760359655476272e-06, + "loss": 0.83869708, + "num_input_tokens_seen": 191193335, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.14178467, + "step": 8884, + "time_per_iteration": 2.627678632736206 + }, + { + "auxiliary_loss_clip": 0.01128582, + "auxiliary_loss_mlp": 0.01030832, + "balance_loss_clip": 1.05509353, + "balance_loss_mlp": 1.01832747, + "epoch": 0.5341950999549075, + "flos": 16289691338880.0, + "grad_norm": 1.8207700262186137, + "language_loss": 0.72649479, + "learning_rate": 1.8756472550500695e-06, + "loss": 0.74808896, + "num_input_tokens_seen": 191210900, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.125, + "step": 8885, + "time_per_iteration": 2.510688304901123 + }, + { + "auxiliary_loss_clip": 0.01129542, + "auxiliary_loss_mlp": 0.01030251, + "balance_loss_clip": 1.04830503, + "balance_loss_mlp": 1.01636863, + "epoch": 0.5342552232075756, + "flos": 14355650413440.0, + "grad_norm": 3.8735065903828487, + "language_loss": 0.78983641, + "learning_rate": 1.87525854926798e-06, + "loss": 0.81143433, + "num_input_tokens_seen": 191226730, + "router_z_loss_clip": 0.81201172, + "router_z_loss_mlp": 0.13879395, + "step": 8886, + "time_per_iteration": 2.5575525760650635 + }, + { + "auxiliary_loss_clip": 0.0112645, + "auxiliary_loss_mlp": 0.0103331, + "balance_loss_clip": 1.04655743, + "balance_loss_mlp": 1.01859975, + "epoch": 0.5343153464602435, + "flos": 30297976300800.0, + "grad_norm": 1.5991548596240237, + "language_loss": 0.74897826, + "learning_rate": 1.8748698482160996e-06, + "loss": 0.77057588, + "num_input_tokens_seen": 191250435, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.14709473, + "step": 8887, + "time_per_iteration": 2.6212871074676514 + }, + { + "auxiliary_loss_clip": 0.01124838, + "auxiliary_loss_mlp": 0.01031919, + "balance_loss_clip": 1.04741716, + "balance_loss_mlp": 1.01877618, + "epoch": 0.5343754697129115, + "flos": 15596292216960.0, + "grad_norm": 2.4802293282636727, + "language_loss": 0.68953753, + "learning_rate": 1.8744811519091663e-06, + "loss": 0.71110511, + "num_input_tokens_seen": 191268315, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.13140869, + "step": 8888, + "time_per_iteration": 3.8898441791534424 + }, + { + "auxiliary_loss_clip": 0.01138562, + "auxiliary_loss_mlp": 0.01038631, + "balance_loss_clip": 1.05443752, + "balance_loss_mlp": 1.02512455, + "epoch": 0.5344355929655794, + "flos": 16909617191040.0, + "grad_norm": 2.4364176192173503, + "language_loss": 0.7772578, + "learning_rate": 1.8740924603619208e-06, + "loss": 0.79902977, + "num_input_tokens_seen": 191287000, + "router_z_loss_clip": 0.84179688, + "router_z_loss_mlp": 0.1350708, + "step": 8889, + "time_per_iteration": 2.465851306915283 + }, + { + "auxiliary_loss_clip": 0.01125369, + "auxiliary_loss_mlp": 0.01043421, + "balance_loss_clip": 1.04786086, + "balance_loss_mlp": 1.02760208, + "epoch": 0.5344957162182474, + "flos": 16798186224000.0, + "grad_norm": 1.9216468279548158, + "language_loss": 0.69228315, + "learning_rate": 1.873703773589102e-06, + "loss": 0.71397108, + "num_input_tokens_seen": 191304565, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.15820312, + "step": 8890, + "time_per_iteration": 2.422889471054077 + }, + { + "auxiliary_loss_clip": 0.01137677, + "auxiliary_loss_mlp": 0.01040843, + "balance_loss_clip": 1.0549345, + "balance_loss_mlp": 1.02565575, + "epoch": 0.5345558394709153, + "flos": 12705590413440.0, + "grad_norm": 3.1694035021656233, + "language_loss": 0.76921749, + "learning_rate": 1.8733150916054483e-06, + "loss": 0.79100275, + "num_input_tokens_seen": 191318300, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.15185547, + "step": 8891, + "time_per_iteration": 2.42293643951416 + }, + { + "auxiliary_loss_clip": 0.01125744, + "auxiliary_loss_mlp": 0.01047021, + "balance_loss_clip": 1.04800248, + "balance_loss_mlp": 1.03169072, + "epoch": 0.5346159627235834, + "flos": 22455050400000.0, + "grad_norm": 1.5936680415489424, + "language_loss": 0.74205196, + "learning_rate": 1.872926414425699e-06, + "loss": 0.76377958, + "num_input_tokens_seen": 191337925, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.15338135, + "step": 8892, + "time_per_iteration": 2.4538800716400146 + }, + { + "auxiliary_loss_clip": 0.01133212, + "auxiliary_loss_mlp": 0.01045219, + "balance_loss_clip": 1.05138969, + "balance_loss_mlp": 1.03170681, + "epoch": 0.5346760859762513, + "flos": 22415763899520.0, + "grad_norm": 1.6250748596378053, + "language_loss": 0.88172626, + "learning_rate": 1.8725377420645932e-06, + "loss": 0.90351057, + "num_input_tokens_seen": 191357120, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.13513184, + "step": 8893, + "time_per_iteration": 2.5145609378814697 + }, + { + "auxiliary_loss_clip": 0.01146101, + "auxiliary_loss_mlp": 0.01032113, + "balance_loss_clip": 1.06626117, + "balance_loss_mlp": 1.02028108, + "epoch": 0.5347362092289193, + "flos": 22816131868800.0, + "grad_norm": 1.7804875822796165, + "language_loss": 0.72778845, + "learning_rate": 1.872149074536869e-06, + "loss": 0.74957061, + "num_input_tokens_seen": 191375395, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.11828613, + "step": 8894, + "time_per_iteration": 2.510721206665039 + }, + { + "auxiliary_loss_clip": 0.01132858, + "auxiliary_loss_mlp": 0.01031504, + "balance_loss_clip": 1.05613911, + "balance_loss_mlp": 1.01808071, + "epoch": 0.5347963324815872, + "flos": 23219480666880.0, + "grad_norm": 1.5938520700870593, + "language_loss": 0.75135612, + "learning_rate": 1.8717604118572648e-06, + "loss": 0.7729997, + "num_input_tokens_seen": 191395595, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.13421631, + "step": 8895, + "time_per_iteration": 2.5101661682128906 + }, + { + "auxiliary_loss_clip": 0.01126384, + "auxiliary_loss_mlp": 0.01030876, + "balance_loss_clip": 1.04976726, + "balance_loss_mlp": 1.01807296, + "epoch": 0.5348564557342552, + "flos": 22601350494720.0, + "grad_norm": 1.7061168594975376, + "language_loss": 0.77029276, + "learning_rate": 1.8713717540405178e-06, + "loss": 0.79186535, + "num_input_tokens_seen": 191413730, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.12799072, + "step": 8896, + "time_per_iteration": 2.4733245372772217 + }, + { + "auxiliary_loss_clip": 0.01137946, + "auxiliary_loss_mlp": 0.01029433, + "balance_loss_clip": 1.06007171, + "balance_loss_mlp": 1.01602793, + "epoch": 0.5349165789869232, + "flos": 18002378701440.0, + "grad_norm": 1.6595791033875362, + "language_loss": 0.78275406, + "learning_rate": 1.8709831011013676e-06, + "loss": 0.80442786, + "num_input_tokens_seen": 191432400, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.13421631, + "step": 8897, + "time_per_iteration": 2.483280897140503 + }, + { + "auxiliary_loss_clip": 0.01132172, + "auxiliary_loss_mlp": 0.01029519, + "balance_loss_clip": 1.05286789, + "balance_loss_mlp": 1.01651883, + "epoch": 0.5349767022395912, + "flos": 17159770483200.0, + "grad_norm": 1.8401099435515802, + "language_loss": 0.75566936, + "learning_rate": 1.8705944530545509e-06, + "loss": 0.77728623, + "num_input_tokens_seen": 191448855, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.12988281, + "step": 8898, + "time_per_iteration": 2.438218832015991 + }, + { + "auxiliary_loss_clip": 0.01065562, + "auxiliary_loss_mlp": 0.01002144, + "balance_loss_clip": 1.03884053, + "balance_loss_mlp": 1.0008148, + "epoch": 0.5350368254922592, + "flos": 70992058158720.0, + "grad_norm": 0.8558224734863268, + "language_loss": 0.57927322, + "learning_rate": 1.8702058099148052e-06, + "loss": 0.59995025, + "num_input_tokens_seen": 191519690, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.01329041, + "step": 8899, + "time_per_iteration": 3.2806286811828613 + }, + { + "auxiliary_loss_clip": 0.01122176, + "auxiliary_loss_mlp": 0.01031537, + "balance_loss_clip": 1.0463388, + "balance_loss_mlp": 1.0187341, + "epoch": 0.5350969487449271, + "flos": 27417833095680.0, + "grad_norm": 1.727549614589338, + "language_loss": 0.70189071, + "learning_rate": 1.869817171696868e-06, + "loss": 0.72342789, + "num_input_tokens_seen": 191539380, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12786865, + "step": 8900, + "time_per_iteration": 2.559471845626831 + }, + { + "auxiliary_loss_clip": 0.01126741, + "auxiliary_loss_mlp": 0.01034615, + "balance_loss_clip": 1.04715431, + "balance_loss_mlp": 1.0215857, + "epoch": 0.5351570719975951, + "flos": 19316134638720.0, + "grad_norm": 2.0919178686119237, + "language_loss": 0.71407342, + "learning_rate": 1.8694285384154777e-06, + "loss": 0.7356869, + "num_input_tokens_seen": 191557400, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.13031006, + "step": 8901, + "time_per_iteration": 2.5668041706085205 + }, + { + "auxiliary_loss_clip": 0.011228, + "auxiliary_loss_mlp": 0.01029806, + "balance_loss_clip": 1.04448688, + "balance_loss_mlp": 1.01646042, + "epoch": 0.535217195250263, + "flos": 19828580019840.0, + "grad_norm": 2.1581833423049663, + "language_loss": 0.77079242, + "learning_rate": 1.8690399100853699e-06, + "loss": 0.79231846, + "num_input_tokens_seen": 191575860, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.13336182, + "step": 8902, + "time_per_iteration": 2.4993896484375 + }, + { + "auxiliary_loss_clip": 0.0112856, + "auxiliary_loss_mlp": 0.01034219, + "balance_loss_clip": 1.05314136, + "balance_loss_mlp": 1.02209544, + "epoch": 0.535277318502931, + "flos": 22127868391680.0, + "grad_norm": 1.461061673662561, + "language_loss": 0.70436245, + "learning_rate": 1.868651286721281e-06, + "loss": 0.72599024, + "num_input_tokens_seen": 191595775, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.12121582, + "step": 8903, + "time_per_iteration": 2.5172858238220215 + }, + { + "auxiliary_loss_clip": 0.01125064, + "auxiliary_loss_mlp": 0.01038634, + "balance_loss_clip": 1.04512906, + "balance_loss_mlp": 1.02506256, + "epoch": 0.5353374417555989, + "flos": 25045897466880.0, + "grad_norm": 1.6384730471717985, + "language_loss": 0.72548175, + "learning_rate": 1.86826266833795e-06, + "loss": 0.74711871, + "num_input_tokens_seen": 191617785, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.13568115, + "step": 8904, + "time_per_iteration": 2.556622266769409 + }, + { + "auxiliary_loss_clip": 0.01130051, + "auxiliary_loss_mlp": 0.01039162, + "balance_loss_clip": 1.0508492, + "balance_loss_mlp": 1.02562618, + "epoch": 0.535397565008267, + "flos": 19388710068480.0, + "grad_norm": 1.9093691000958009, + "language_loss": 0.73575628, + "learning_rate": 1.8678740549501103e-06, + "loss": 0.75744843, + "num_input_tokens_seen": 191636900, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.13525391, + "step": 8905, + "time_per_iteration": 2.519415855407715 + }, + { + "auxiliary_loss_clip": 0.01118749, + "auxiliary_loss_mlp": 0.01037094, + "balance_loss_clip": 1.0450561, + "balance_loss_mlp": 1.02539992, + "epoch": 0.5354576882609349, + "flos": 21471205904640.0, + "grad_norm": 1.3921673324951207, + "language_loss": 0.83588612, + "learning_rate": 1.8674854465725005e-06, + "loss": 0.85744452, + "num_input_tokens_seen": 191656720, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11694336, + "step": 8906, + "time_per_iteration": 4.0354766845703125 + }, + { + "auxiliary_loss_clip": 0.01135493, + "auxiliary_loss_mlp": 0.0103693, + "balance_loss_clip": 1.05543637, + "balance_loss_mlp": 1.02335238, + "epoch": 0.5355178115136029, + "flos": 20777519473920.0, + "grad_norm": 2.061001188321185, + "language_loss": 0.74018264, + "learning_rate": 1.8670968432198563e-06, + "loss": 0.76190698, + "num_input_tokens_seen": 191674445, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.13580322, + "step": 8907, + "time_per_iteration": 2.462003707885742 + }, + { + "auxiliary_loss_clip": 0.01123959, + "auxiliary_loss_mlp": 0.01032848, + "balance_loss_clip": 1.04647291, + "balance_loss_mlp": 1.01954412, + "epoch": 0.5355779347662708, + "flos": 23514020190720.0, + "grad_norm": 1.6444101773077464, + "language_loss": 0.76756126, + "learning_rate": 1.866708244906912e-06, + "loss": 0.78912932, + "num_input_tokens_seen": 191695000, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.13311768, + "step": 8908, + "time_per_iteration": 2.572481632232666 + }, + { + "auxiliary_loss_clip": 0.01126323, + "auxiliary_loss_mlp": 0.01035997, + "balance_loss_clip": 1.04633844, + "balance_loss_mlp": 1.02125072, + "epoch": 0.5356380580189388, + "flos": 20303211358080.0, + "grad_norm": 1.6959532678030373, + "language_loss": 0.74121815, + "learning_rate": 1.8663196516484055e-06, + "loss": 0.76284134, + "num_input_tokens_seen": 191713295, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.14746094, + "step": 8909, + "time_per_iteration": 2.5211598873138428 + }, + { + "auxiliary_loss_clip": 0.01124873, + "auxiliary_loss_mlp": 0.01044241, + "balance_loss_clip": 1.0474534, + "balance_loss_mlp": 1.03153872, + "epoch": 0.5356981812716068, + "flos": 21361642444800.0, + "grad_norm": 3.667143960160348, + "language_loss": 0.84036523, + "learning_rate": 1.8659310634590702e-06, + "loss": 0.86205637, + "num_input_tokens_seen": 191732725, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.12689209, + "step": 8910, + "time_per_iteration": 2.6815569400787354 + }, + { + "auxiliary_loss_clip": 0.01132786, + "auxiliary_loss_mlp": 0.01032873, + "balance_loss_clip": 1.04979765, + "balance_loss_mlp": 1.01879478, + "epoch": 0.5357583045242748, + "flos": 23111246010240.0, + "grad_norm": 1.5782454505024035, + "language_loss": 0.81953549, + "learning_rate": 1.8655424803536427e-06, + "loss": 0.84119201, + "num_input_tokens_seen": 191753765, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.14074707, + "step": 8911, + "time_per_iteration": 2.5108771324157715 + }, + { + "auxiliary_loss_clip": 0.01132036, + "auxiliary_loss_mlp": 0.01032742, + "balance_loss_clip": 1.05437529, + "balance_loss_mlp": 1.02086902, + "epoch": 0.5358184277769428, + "flos": 21141761339520.0, + "grad_norm": 1.7664129730623193, + "language_loss": 0.69390321, + "learning_rate": 1.8651539023468585e-06, + "loss": 0.71555102, + "num_input_tokens_seen": 191773560, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.11871338, + "step": 8912, + "time_per_iteration": 2.47776198387146 + }, + { + "auxiliary_loss_clip": 0.01134077, + "auxiliary_loss_mlp": 0.01041831, + "balance_loss_clip": 1.05666792, + "balance_loss_mlp": 1.02855062, + "epoch": 0.5358785510296107, + "flos": 16282400878080.0, + "grad_norm": 1.962005789770473, + "language_loss": 0.71512687, + "learning_rate": 1.8647653294534509e-06, + "loss": 0.73688591, + "num_input_tokens_seen": 191791255, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.1328125, + "step": 8913, + "time_per_iteration": 3.906928539276123 + }, + { + "auxiliary_loss_clip": 0.01150777, + "auxiliary_loss_mlp": 0.01038753, + "balance_loss_clip": 1.06673503, + "balance_loss_mlp": 1.02572298, + "epoch": 0.5359386742822787, + "flos": 16976877408000.0, + "grad_norm": 1.8102423907945098, + "language_loss": 0.72187263, + "learning_rate": 1.864376761688156e-06, + "loss": 0.74376792, + "num_input_tokens_seen": 191809325, + "router_z_loss_clip": 0.84082031, + "router_z_loss_mlp": 0.13031006, + "step": 8914, + "time_per_iteration": 2.495490312576294 + }, + { + "auxiliary_loss_clip": 0.01135938, + "auxiliary_loss_mlp": 0.01040185, + "balance_loss_clip": 1.05309868, + "balance_loss_mlp": 1.02536106, + "epoch": 0.5359987975349466, + "flos": 20812927305600.0, + "grad_norm": 1.848670580658745, + "language_loss": 0.7066586, + "learning_rate": 1.8639881990657079e-06, + "loss": 0.72841978, + "num_input_tokens_seen": 191829795, + "router_z_loss_clip": 0.82861328, + "router_z_loss_mlp": 0.14801025, + "step": 8915, + "time_per_iteration": 2.5224883556365967 + }, + { + "auxiliary_loss_clip": 0.01124727, + "auxiliary_loss_mlp": 0.01043656, + "balance_loss_clip": 1.0463903, + "balance_loss_mlp": 1.02985108, + "epoch": 0.5360589207876146, + "flos": 22199941031040.0, + "grad_norm": 1.7805102508343331, + "language_loss": 0.75316399, + "learning_rate": 1.8635996416008408e-06, + "loss": 0.77484781, + "num_input_tokens_seen": 191850840, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.13806152, + "step": 8916, + "time_per_iteration": 2.5160698890686035 + }, + { + "auxiliary_loss_clip": 0.01130929, + "auxiliary_loss_mlp": 0.01029485, + "balance_loss_clip": 1.05053508, + "balance_loss_mlp": 1.01677167, + "epoch": 0.5361190440402825, + "flos": 31394365084800.0, + "grad_norm": 1.887698308592283, + "language_loss": 0.72536254, + "learning_rate": 1.863211089308289e-06, + "loss": 0.74696672, + "num_input_tokens_seen": 191869520, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.12713623, + "step": 8917, + "time_per_iteration": 2.509660482406616 + }, + { + "auxiliary_loss_clip": 0.01126623, + "auxiliary_loss_mlp": 0.01044028, + "balance_loss_clip": 1.04908133, + "balance_loss_mlp": 1.02875698, + "epoch": 0.5361791672929506, + "flos": 16069882060800.0, + "grad_norm": 2.1758680154413157, + "language_loss": 0.71405071, + "learning_rate": 1.8628225422027865e-06, + "loss": 0.73575717, + "num_input_tokens_seen": 191887240, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.15252686, + "step": 8918, + "time_per_iteration": 2.438551664352417 + }, + { + "auxiliary_loss_clip": 0.01133302, + "auxiliary_loss_mlp": 0.01035264, + "balance_loss_clip": 1.05435872, + "balance_loss_mlp": 1.02203155, + "epoch": 0.5362392905456185, + "flos": 20740926493440.0, + "grad_norm": 2.125095449432708, + "language_loss": 0.75140887, + "learning_rate": 1.862434000299067e-06, + "loss": 0.77309453, + "num_input_tokens_seen": 191905690, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.13226318, + "step": 8919, + "time_per_iteration": 3.9573287963867188 + }, + { + "auxiliary_loss_clip": 0.01124794, + "auxiliary_loss_mlp": 0.01038411, + "balance_loss_clip": 1.04417229, + "balance_loss_mlp": 1.02535772, + "epoch": 0.5362994137982865, + "flos": 17340077779200.0, + "grad_norm": 2.043078414769861, + "language_loss": 0.71339417, + "learning_rate": 1.862045463611864e-06, + "loss": 0.73502624, + "num_input_tokens_seen": 191920725, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.13049316, + "step": 8920, + "time_per_iteration": 2.474898099899292 + }, + { + "auxiliary_loss_clip": 0.01128452, + "auxiliary_loss_mlp": 0.01035228, + "balance_loss_clip": 1.05104136, + "balance_loss_mlp": 1.02175164, + "epoch": 0.5363595370509544, + "flos": 42813957795840.0, + "grad_norm": 1.3723675155471526, + "language_loss": 0.68663895, + "learning_rate": 1.8616569321559105e-06, + "loss": 0.70827568, + "num_input_tokens_seen": 191944645, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.13476562, + "step": 8921, + "time_per_iteration": 2.758110284805298 + }, + { + "auxiliary_loss_clip": 0.01137356, + "auxiliary_loss_mlp": 0.01036394, + "balance_loss_clip": 1.05861688, + "balance_loss_mlp": 1.02358544, + "epoch": 0.5364196603036224, + "flos": 19171953446400.0, + "grad_norm": 1.7715128116561134, + "language_loss": 0.82001269, + "learning_rate": 1.86126840594594e-06, + "loss": 0.84175014, + "num_input_tokens_seen": 191962265, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.12805176, + "step": 8922, + "time_per_iteration": 2.468275308609009 + }, + { + "auxiliary_loss_clip": 0.01135964, + "auxiliary_loss_mlp": 0.01038857, + "balance_loss_clip": 1.05474675, + "balance_loss_mlp": 1.02427161, + "epoch": 0.5364797835562904, + "flos": 17931060247680.0, + "grad_norm": 2.0071837970808097, + "language_loss": 0.76676762, + "learning_rate": 1.860879884996686e-06, + "loss": 0.78851581, + "num_input_tokens_seen": 191978850, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.14587402, + "step": 8923, + "time_per_iteration": 2.4896600246429443 + }, + { + "auxiliary_loss_clip": 0.011402, + "auxiliary_loss_mlp": 0.01036716, + "balance_loss_clip": 1.05835509, + "balance_loss_mlp": 1.02336454, + "epoch": 0.5365399068089584, + "flos": 30228058477440.0, + "grad_norm": 1.337584561589823, + "language_loss": 0.70020068, + "learning_rate": 1.8604913693228804e-06, + "loss": 0.72196984, + "num_input_tokens_seen": 192002000, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.13354492, + "step": 8924, + "time_per_iteration": 2.547088146209717 + }, + { + "auxiliary_loss_clip": 0.01143241, + "auxiliary_loss_mlp": 0.01045743, + "balance_loss_clip": 1.05600619, + "balance_loss_mlp": 1.03054333, + "epoch": 0.5366000300616264, + "flos": 24891696380160.0, + "grad_norm": 2.04430320877744, + "language_loss": 0.87141734, + "learning_rate": 1.8601028589392558e-06, + "loss": 0.89330715, + "num_input_tokens_seen": 192019100, + "router_z_loss_clip": 0.87255859, + "router_z_loss_mlp": 0.15185547, + "step": 8925, + "time_per_iteration": 2.5517661571502686 + }, + { + "auxiliary_loss_clip": 0.01130094, + "auxiliary_loss_mlp": 0.01030697, + "balance_loss_clip": 1.04892313, + "balance_loss_mlp": 1.01754785, + "epoch": 0.5366601533142943, + "flos": 29826649013760.0, + "grad_norm": 1.4613130519579909, + "language_loss": 0.782601, + "learning_rate": 1.8597143538605455e-06, + "loss": 0.80420887, + "num_input_tokens_seen": 192041660, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.13165283, + "step": 8926, + "time_per_iteration": 2.517960786819458 + }, + { + "auxiliary_loss_clip": 0.01128683, + "auxiliary_loss_mlp": 0.01031178, + "balance_loss_clip": 1.0532186, + "balance_loss_mlp": 1.01938856, + "epoch": 0.5367202765669623, + "flos": 27199352620800.0, + "grad_norm": 1.3613772014322956, + "language_loss": 0.66940182, + "learning_rate": 1.85932585410148e-06, + "loss": 0.69100046, + "num_input_tokens_seen": 192063540, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.11773682, + "step": 8927, + "time_per_iteration": 2.5605270862579346 + }, + { + "auxiliary_loss_clip": 0.01131689, + "auxiliary_loss_mlp": 0.01027964, + "balance_loss_clip": 1.05122876, + "balance_loss_mlp": 1.01480365, + "epoch": 0.5367803998196302, + "flos": 20229953569920.0, + "grad_norm": 1.7272216973103878, + "language_loss": 0.73131979, + "learning_rate": 1.8589373596767929e-06, + "loss": 0.75291634, + "num_input_tokens_seen": 192081760, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.13153076, + "step": 8928, + "time_per_iteration": 2.4577744007110596 + }, + { + "auxiliary_loss_clip": 0.01127639, + "auxiliary_loss_mlp": 0.01030485, + "balance_loss_clip": 1.04831052, + "balance_loss_mlp": 1.0179503, + "epoch": 0.5368405230722982, + "flos": 32154629374080.0, + "grad_norm": 1.603770825486304, + "language_loss": 0.62969851, + "learning_rate": 1.8585488706012154e-06, + "loss": 0.65127969, + "num_input_tokens_seen": 192101620, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.12542725, + "step": 8929, + "time_per_iteration": 2.6306073665618896 + }, + { + "auxiliary_loss_clip": 0.01124975, + "auxiliary_loss_mlp": 0.01043595, + "balance_loss_clip": 1.04545987, + "balance_loss_mlp": 1.03042841, + "epoch": 0.5369006463249661, + "flos": 26247935128320.0, + "grad_norm": 2.0009326780094914, + "language_loss": 0.66405499, + "learning_rate": 1.8581603868894781e-06, + "loss": 0.68574071, + "num_input_tokens_seen": 192121805, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.1315918, + "step": 8930, + "time_per_iteration": 2.5354177951812744 + }, + { + "auxiliary_loss_clip": 0.01122269, + "auxiliary_loss_mlp": 0.01034836, + "balance_loss_clip": 1.04450881, + "balance_loss_mlp": 1.02078116, + "epoch": 0.5369607695776342, + "flos": 26211306234240.0, + "grad_norm": 1.4781288327606177, + "language_loss": 0.67171931, + "learning_rate": 1.8577719085563136e-06, + "loss": 0.69329035, + "num_input_tokens_seen": 192141765, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.14056396, + "step": 8931, + "time_per_iteration": 4.055795669555664 + }, + { + "auxiliary_loss_clip": 0.0112459, + "auxiliary_loss_mlp": 0.01041147, + "balance_loss_clip": 1.04784846, + "balance_loss_mlp": 1.02533412, + "epoch": 0.5370208928303021, + "flos": 25009017177600.0, + "grad_norm": 1.9632805386190546, + "language_loss": 0.75754756, + "learning_rate": 1.8573834356164525e-06, + "loss": 0.77920496, + "num_input_tokens_seen": 192161560, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.15814209, + "step": 8932, + "time_per_iteration": 2.536329746246338 + }, + { + "auxiliary_loss_clip": 0.01125194, + "auxiliary_loss_mlp": 0.01041126, + "balance_loss_clip": 1.04865766, + "balance_loss_mlp": 1.02738667, + "epoch": 0.5370810160829701, + "flos": 31792147274880.0, + "grad_norm": 2.8189445152671744, + "language_loss": 0.66134179, + "learning_rate": 1.8569949680846261e-06, + "loss": 0.68300498, + "num_input_tokens_seen": 192180190, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.13757324, + "step": 8933, + "time_per_iteration": 2.5664305686950684 + }, + { + "auxiliary_loss_clip": 0.01131319, + "auxiliary_loss_mlp": 0.01034304, + "balance_loss_clip": 1.05629015, + "balance_loss_mlp": 1.02158999, + "epoch": 0.537141139335638, + "flos": 23842602829440.0, + "grad_norm": 1.7539902909480605, + "language_loss": 0.82721794, + "learning_rate": 1.856606505975565e-06, + "loss": 0.84887421, + "num_input_tokens_seen": 192198855, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.12731934, + "step": 8934, + "time_per_iteration": 2.4863433837890625 + }, + { + "auxiliary_loss_clip": 0.01126753, + "auxiliary_loss_mlp": 0.01034294, + "balance_loss_clip": 1.05080807, + "balance_loss_mlp": 1.02045381, + "epoch": 0.537201262588306, + "flos": 18508826511360.0, + "grad_norm": 1.9688606035340677, + "language_loss": 0.7949053, + "learning_rate": 1.856218049303999e-06, + "loss": 0.8165158, + "num_input_tokens_seen": 192216555, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.1383667, + "step": 8935, + "time_per_iteration": 2.4160001277923584 + }, + { + "auxiliary_loss_clip": 0.01125471, + "auxiliary_loss_mlp": 0.01041755, + "balance_loss_clip": 1.04669452, + "balance_loss_mlp": 1.02798688, + "epoch": 0.537261385840974, + "flos": 25662950231040.0, + "grad_norm": 1.9642142464647312, + "language_loss": 0.8364203, + "learning_rate": 1.855829598084659e-06, + "loss": 0.85809255, + "num_input_tokens_seen": 192236910, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.13775635, + "step": 8936, + "time_per_iteration": 2.55633807182312 + }, + { + "auxiliary_loss_clip": 0.01131765, + "auxiliary_loss_mlp": 0.01038723, + "balance_loss_clip": 1.05363142, + "balance_loss_mlp": 1.02610469, + "epoch": 0.537321509093642, + "flos": 40735017406080.0, + "grad_norm": 1.662992279596106, + "language_loss": 0.72460437, + "learning_rate": 1.8554411523322754e-06, + "loss": 0.74630922, + "num_input_tokens_seen": 192260790, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.1262207, + "step": 8937, + "time_per_iteration": 2.654017210006714 + }, + { + "auxiliary_loss_clip": 0.01134614, + "auxiliary_loss_mlp": 0.01033357, + "balance_loss_clip": 1.05377197, + "balance_loss_mlp": 1.02007163, + "epoch": 0.53738163234631, + "flos": 17238487138560.0, + "grad_norm": 2.181877751621643, + "language_loss": 0.81250966, + "learning_rate": 1.8550527120615778e-06, + "loss": 0.83418936, + "num_input_tokens_seen": 192277230, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.13275146, + "step": 8938, + "time_per_iteration": 2.505960464477539 + }, + { + "auxiliary_loss_clip": 0.01129196, + "auxiliary_loss_mlp": 0.01041199, + "balance_loss_clip": 1.04632521, + "balance_loss_mlp": 1.02843177, + "epoch": 0.5374417555989779, + "flos": 12821977457280.0, + "grad_norm": 2.8833886081139997, + "language_loss": 0.80722028, + "learning_rate": 1.8546642772872957e-06, + "loss": 0.82892424, + "num_input_tokens_seen": 192292840, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.12762451, + "step": 8939, + "time_per_iteration": 2.4267661571502686 + }, + { + "auxiliary_loss_clip": 0.0106132, + "auxiliary_loss_mlp": 0.01006454, + "balance_loss_clip": 1.03459799, + "balance_loss_mlp": 1.00481057, + "epoch": 0.5375018788516459, + "flos": 67256018703360.0, + "grad_norm": 0.7042396701324917, + "language_loss": 0.52485156, + "learning_rate": 1.8542758480241589e-06, + "loss": 0.54552925, + "num_input_tokens_seen": 192358240, + "router_z_loss_clip": 0.26708984, + "router_z_loss_mlp": 0.01643372, + "step": 8940, + "time_per_iteration": 3.1008501052856445 + }, + { + "auxiliary_loss_clip": 0.01132013, + "auxiliary_loss_mlp": 0.01032943, + "balance_loss_clip": 1.05466115, + "balance_loss_mlp": 1.02011561, + "epoch": 0.5375620021043138, + "flos": 18114168804480.0, + "grad_norm": 2.596836679651401, + "language_loss": 0.71020734, + "learning_rate": 1.8538874242868965e-06, + "loss": 0.73185694, + "num_input_tokens_seen": 192377370, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.12835693, + "step": 8941, + "time_per_iteration": 2.423746347427368 + }, + { + "auxiliary_loss_clip": 0.01120343, + "auxiliary_loss_mlp": 0.0103029, + "balance_loss_clip": 1.04577005, + "balance_loss_mlp": 1.01775479, + "epoch": 0.5376221253569818, + "flos": 23149383275520.0, + "grad_norm": 1.6064201448451225, + "language_loss": 0.79825681, + "learning_rate": 1.853499006090237e-06, + "loss": 0.81976312, + "num_input_tokens_seen": 192396450, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.12548828, + "step": 8942, + "time_per_iteration": 2.5014848709106445 + }, + { + "auxiliary_loss_clip": 0.01138628, + "auxiliary_loss_mlp": 0.01037868, + "balance_loss_clip": 1.05458474, + "balance_loss_mlp": 1.02334833, + "epoch": 0.5376822486096497, + "flos": 29972302663680.0, + "grad_norm": 1.5465322129766899, + "language_loss": 0.7057308, + "learning_rate": 1.853110593448911e-06, + "loss": 0.72749579, + "num_input_tokens_seen": 192417390, + "router_z_loss_clip": 0.84082031, + "router_z_loss_mlp": 0.1451416, + "step": 8943, + "time_per_iteration": 2.532240390777588 + }, + { + "auxiliary_loss_clip": 0.01070058, + "auxiliary_loss_mlp": 0.01001726, + "balance_loss_clip": 1.04210865, + "balance_loss_mlp": 1.00018501, + "epoch": 0.5377423718623178, + "flos": 54168950874240.0, + "grad_norm": 0.8358431941105969, + "language_loss": 0.59655923, + "learning_rate": 1.852722186377645e-06, + "loss": 0.61727709, + "num_input_tokens_seen": 192478060, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.01542664, + "step": 8944, + "time_per_iteration": 3.084460735321045 + }, + { + "auxiliary_loss_clip": 0.0113311, + "auxiliary_loss_mlp": 0.01042026, + "balance_loss_clip": 1.05111301, + "balance_loss_mlp": 1.02619553, + "epoch": 0.5378024951149857, + "flos": 23257079228160.0, + "grad_norm": 2.2961645813043194, + "language_loss": 0.77257389, + "learning_rate": 1.852333784891169e-06, + "loss": 0.79432523, + "num_input_tokens_seen": 192495985, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.1583252, + "step": 8945, + "time_per_iteration": 2.471461057662964 + }, + { + "auxiliary_loss_clip": 0.01122999, + "auxiliary_loss_mlp": 0.01036943, + "balance_loss_clip": 1.0435698, + "balance_loss_mlp": 1.02381265, + "epoch": 0.5378626183676537, + "flos": 24024095274240.0, + "grad_norm": 1.9325767411269246, + "language_loss": 0.68495935, + "learning_rate": 1.8519453890042112e-06, + "loss": 0.70655876, + "num_input_tokens_seen": 192515445, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.13134766, + "step": 8946, + "time_per_iteration": 2.4894471168518066 + }, + { + "auxiliary_loss_clip": 0.01122897, + "auxiliary_loss_mlp": 0.0104623, + "balance_loss_clip": 1.04658866, + "balance_loss_mlp": 1.0324018, + "epoch": 0.5379227416203216, + "flos": 27161789973120.0, + "grad_norm": 1.54933337051791, + "language_loss": 0.76986665, + "learning_rate": 1.851556998731498e-06, + "loss": 0.79155791, + "num_input_tokens_seen": 192536530, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.13818359, + "step": 8947, + "time_per_iteration": 2.5077288150787354 + }, + { + "auxiliary_loss_clip": 0.01124725, + "auxiliary_loss_mlp": 0.01035388, + "balance_loss_clip": 1.04672599, + "balance_loss_mlp": 1.02246606, + "epoch": 0.5379828648729896, + "flos": 24681619687680.0, + "grad_norm": 5.0990395901754395, + "language_loss": 0.60347831, + "learning_rate": 1.8511686140877592e-06, + "loss": 0.62507939, + "num_input_tokens_seen": 192556075, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.1293335, + "step": 8948, + "time_per_iteration": 2.515610694885254 + }, + { + "auxiliary_loss_clip": 0.01128365, + "auxiliary_loss_mlp": 0.01037018, + "balance_loss_clip": 1.0495708, + "balance_loss_mlp": 1.02425063, + "epoch": 0.5380429881256577, + "flos": 22523280284160.0, + "grad_norm": 1.6779576542161287, + "language_loss": 0.7953859, + "learning_rate": 1.8507802350877205e-06, + "loss": 0.81703973, + "num_input_tokens_seen": 192575535, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.12774658, + "step": 8949, + "time_per_iteration": 2.4781899452209473 + }, + { + "auxiliary_loss_clip": 0.011298, + "auxiliary_loss_mlp": 0.01038717, + "balance_loss_clip": 1.05305731, + "balance_loss_mlp": 1.02576518, + "epoch": 0.5381031113783256, + "flos": 26979543342720.0, + "grad_norm": 2.401598482028478, + "language_loss": 0.77760816, + "learning_rate": 1.850391861746111e-06, + "loss": 0.79929328, + "num_input_tokens_seen": 192594490, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.1295166, + "step": 8950, + "time_per_iteration": 3.961296796798706 + }, + { + "auxiliary_loss_clip": 0.01124747, + "auxiliary_loss_mlp": 0.01029999, + "balance_loss_clip": 1.04944956, + "balance_loss_mlp": 1.01788759, + "epoch": 0.5381632346309936, + "flos": 24754087376640.0, + "grad_norm": 1.6870774913323892, + "language_loss": 0.72511768, + "learning_rate": 1.8500034940776573e-06, + "loss": 0.74666518, + "num_input_tokens_seen": 192615650, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.12121582, + "step": 8951, + "time_per_iteration": 2.557504177093506 + }, + { + "auxiliary_loss_clip": 0.01137697, + "auxiliary_loss_mlp": 0.01030353, + "balance_loss_clip": 1.05630088, + "balance_loss_mlp": 1.01662016, + "epoch": 0.5382233578836615, + "flos": 15560058372480.0, + "grad_norm": 1.7243302975879558, + "language_loss": 0.7492702, + "learning_rate": 1.849615132097085e-06, + "loss": 0.77095073, + "num_input_tokens_seen": 192633840, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.13720703, + "step": 8952, + "time_per_iteration": 2.432953357696533 + }, + { + "auxiliary_loss_clip": 0.011372, + "auxiliary_loss_mlp": 0.01032431, + "balance_loss_clip": 1.0587821, + "balance_loss_mlp": 1.01896644, + "epoch": 0.5382834811363295, + "flos": 25084501608960.0, + "grad_norm": 1.3809737443459802, + "language_loss": 0.79426938, + "learning_rate": 1.8492267758191228e-06, + "loss": 0.81596571, + "num_input_tokens_seen": 192655890, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.13464355, + "step": 8953, + "time_per_iteration": 2.5549087524414062 + }, + { + "auxiliary_loss_clip": 0.01125286, + "auxiliary_loss_mlp": 0.01031219, + "balance_loss_clip": 1.05074525, + "balance_loss_mlp": 1.01803482, + "epoch": 0.5383436043889974, + "flos": 13297901685120.0, + "grad_norm": 2.2308013584505924, + "language_loss": 0.80469179, + "learning_rate": 1.8488384252584964e-06, + "loss": 0.82625687, + "num_input_tokens_seen": 192673025, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.13183594, + "step": 8954, + "time_per_iteration": 2.4205398559570312 + }, + { + "auxiliary_loss_clip": 0.0112768, + "auxiliary_loss_mlp": 0.01032954, + "balance_loss_clip": 1.04999185, + "balance_loss_mlp": 1.01941717, + "epoch": 0.5384037276416654, + "flos": 23039388852480.0, + "grad_norm": 2.2119805155009926, + "language_loss": 0.76973593, + "learning_rate": 1.8484500804299318e-06, + "loss": 0.79134226, + "num_input_tokens_seen": 192692190, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.13531494, + "step": 8955, + "time_per_iteration": 2.4751100540161133 + }, + { + "auxiliary_loss_clip": 0.01131384, + "auxiliary_loss_mlp": 0.01030782, + "balance_loss_clip": 1.05328178, + "balance_loss_mlp": 1.01763356, + "epoch": 0.5384638508943334, + "flos": 20631147552000.0, + "grad_norm": 1.5143135920412387, + "language_loss": 0.78474188, + "learning_rate": 1.8480617413481557e-06, + "loss": 0.80636352, + "num_input_tokens_seen": 192710380, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.13134766, + "step": 8956, + "time_per_iteration": 2.460799217224121 + }, + { + "auxiliary_loss_clip": 0.01101215, + "auxiliary_loss_mlp": 0.01007373, + "balance_loss_clip": 1.07294297, + "balance_loss_mlp": 1.00515604, + "epoch": 0.5385239741470014, + "flos": 66737683491840.0, + "grad_norm": 0.8495635178907172, + "language_loss": 0.63409346, + "learning_rate": 1.8476734080278932e-06, + "loss": 0.65517932, + "num_input_tokens_seen": 192768995, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 0.02215576, + "step": 8957, + "time_per_iteration": 4.515086650848389 + }, + { + "auxiliary_loss_clip": 0.01074148, + "auxiliary_loss_mlp": 0.01011742, + "balance_loss_clip": 1.04759336, + "balance_loss_mlp": 1.01034856, + "epoch": 0.5385840973996693, + "flos": 64716058229760.0, + "grad_norm": 0.7049087688644073, + "language_loss": 0.51630151, + "learning_rate": 1.8472850804838705e-06, + "loss": 0.5371604, + "num_input_tokens_seen": 192825585, + "router_z_loss_clip": 0.26513672, + "router_z_loss_mlp": 0.01393127, + "step": 8958, + "time_per_iteration": 3.143749475479126 + }, + { + "auxiliary_loss_clip": 0.01137513, + "auxiliary_loss_mlp": 0.01029919, + "balance_loss_clip": 1.05764365, + "balance_loss_mlp": 1.01550698, + "epoch": 0.5386442206523373, + "flos": 26141783460480.0, + "grad_norm": 2.783470932294419, + "language_loss": 0.77165157, + "learning_rate": 1.8468967587308128e-06, + "loss": 0.7933259, + "num_input_tokens_seen": 192847335, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.14422607, + "step": 8959, + "time_per_iteration": 2.4822471141815186 + }, + { + "auxiliary_loss_clip": 0.01131267, + "auxiliary_loss_mlp": 0.0103356, + "balance_loss_clip": 1.0525806, + "balance_loss_mlp": 1.02037525, + "epoch": 0.5387043439050052, + "flos": 18251849635200.0, + "grad_norm": 2.3938444085600734, + "language_loss": 0.83948594, + "learning_rate": 1.8465084427834455e-06, + "loss": 0.86113423, + "num_input_tokens_seen": 192862205, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.13183594, + "step": 8960, + "time_per_iteration": 2.4563615322113037 + }, + { + "auxiliary_loss_clip": 0.01122475, + "auxiliary_loss_mlp": 0.01033096, + "balance_loss_clip": 1.04516315, + "balance_loss_mlp": 1.01977491, + "epoch": 0.5387644671576732, + "flos": 29788296266880.0, + "grad_norm": 1.5127543800347574, + "language_loss": 0.78834838, + "learning_rate": 1.8461201326564933e-06, + "loss": 0.8099041, + "num_input_tokens_seen": 192883695, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.13317871, + "step": 8961, + "time_per_iteration": 2.531503677368164 + }, + { + "auxiliary_loss_clip": 0.01143281, + "auxiliary_loss_mlp": 0.01033137, + "balance_loss_clip": 1.0623945, + "balance_loss_mlp": 1.02014899, + "epoch": 0.5388245904103413, + "flos": 22374466237440.0, + "grad_norm": 1.595597054185071, + "language_loss": 0.84187198, + "learning_rate": 1.845731828364681e-06, + "loss": 0.86363614, + "num_input_tokens_seen": 192900190, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.12994385, + "step": 8962, + "time_per_iteration": 2.4532992839813232 + }, + { + "auxiliary_loss_clip": 0.01090313, + "auxiliary_loss_mlp": 0.01005085, + "balance_loss_clip": 1.06390941, + "balance_loss_mlp": 1.00259638, + "epoch": 0.5388847136630092, + "flos": 69807794751360.0, + "grad_norm": 0.7322432603337261, + "language_loss": 0.54200107, + "learning_rate": 1.8453435299227333e-06, + "loss": 0.56295502, + "num_input_tokens_seen": 192958675, + "router_z_loss_clip": 0.26464844, + "router_z_loss_mlp": 0.02490234, + "step": 8963, + "time_per_iteration": 4.424288272857666 + }, + { + "auxiliary_loss_clip": 0.01065322, + "auxiliary_loss_mlp": 0.01003016, + "balance_loss_clip": 1.03852665, + "balance_loss_mlp": 1.0015558, + "epoch": 0.5389448369156772, + "flos": 69822303845760.0, + "grad_norm": 0.7993336448341457, + "language_loss": 0.63401502, + "learning_rate": 1.8449552373453744e-06, + "loss": 0.65469837, + "num_input_tokens_seen": 193033135, + "router_z_loss_clip": 0.26806641, + "router_z_loss_mlp": 0.01461792, + "step": 8964, + "time_per_iteration": 3.1445114612579346 + }, + { + "auxiliary_loss_clip": 0.01127724, + "auxiliary_loss_mlp": 0.01031298, + "balance_loss_clip": 1.04770207, + "balance_loss_mlp": 1.01782155, + "epoch": 0.5390049601683451, + "flos": 31722444933120.0, + "grad_norm": 1.4674713106868926, + "language_loss": 0.6988306, + "learning_rate": 1.8445669506473287e-06, + "loss": 0.72042084, + "num_input_tokens_seen": 193055570, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.13470459, + "step": 8965, + "time_per_iteration": 2.546156883239746 + }, + { + "auxiliary_loss_clip": 0.01123249, + "auxiliary_loss_mlp": 0.01035613, + "balance_loss_clip": 1.04419315, + "balance_loss_mlp": 1.02084899, + "epoch": 0.5390650834210131, + "flos": 18113486446080.0, + "grad_norm": 1.9202784416611156, + "language_loss": 0.81938219, + "learning_rate": 1.8441786698433192e-06, + "loss": 0.84097081, + "num_input_tokens_seen": 193073120, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.14788818, + "step": 8966, + "time_per_iteration": 2.4469735622406006 + }, + { + "auxiliary_loss_clip": 0.01131846, + "auxiliary_loss_mlp": 0.01032531, + "balance_loss_clip": 1.05440021, + "balance_loss_mlp": 1.01884031, + "epoch": 0.539125206673681, + "flos": 17416711445760.0, + "grad_norm": 2.066343668228918, + "language_loss": 0.72373587, + "learning_rate": 1.8437903949480706e-06, + "loss": 0.74537969, + "num_input_tokens_seen": 193090105, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.13696289, + "step": 8967, + "time_per_iteration": 2.4951171875 + }, + { + "auxiliary_loss_clip": 0.01121207, + "auxiliary_loss_mlp": 0.01032318, + "balance_loss_clip": 1.04433346, + "balance_loss_mlp": 1.01992059, + "epoch": 0.539185329926349, + "flos": 22198935450240.0, + "grad_norm": 1.620659224938114, + "language_loss": 0.82131886, + "learning_rate": 1.8434021259763065e-06, + "loss": 0.84285408, + "num_input_tokens_seen": 193109325, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.12390137, + "step": 8968, + "time_per_iteration": 2.4724721908569336 + }, + { + "auxiliary_loss_clip": 0.01126385, + "auxiliary_loss_mlp": 0.01034598, + "balance_loss_clip": 1.04675865, + "balance_loss_mlp": 1.02047801, + "epoch": 0.539245453179017, + "flos": 21434397442560.0, + "grad_norm": 1.5165546724389698, + "language_loss": 0.73792505, + "learning_rate": 1.8430138629427484e-06, + "loss": 0.75953484, + "num_input_tokens_seen": 193130595, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.14117432, + "step": 8969, + "time_per_iteration": 2.485659599304199 + }, + { + "auxiliary_loss_clip": 0.01132062, + "auxiliary_loss_mlp": 0.01030464, + "balance_loss_clip": 1.0497936, + "balance_loss_mlp": 1.01641476, + "epoch": 0.539305576431685, + "flos": 20735000749440.0, + "grad_norm": 1.6958857029238124, + "language_loss": 0.82595992, + "learning_rate": 1.8426256058621205e-06, + "loss": 0.84758514, + "num_input_tokens_seen": 193148930, + "router_z_loss_clip": 0.82275391, + "router_z_loss_mlp": 0.14044189, + "step": 8970, + "time_per_iteration": 2.4285130500793457 + }, + { + "auxiliary_loss_clip": 0.01131294, + "auxiliary_loss_mlp": 0.010289, + "balance_loss_clip": 1.05464423, + "balance_loss_mlp": 1.01582825, + "epoch": 0.5393656996843529, + "flos": 30920452018560.0, + "grad_norm": 1.3979170554065015, + "language_loss": 0.75434685, + "learning_rate": 1.842237354749146e-06, + "loss": 0.77594876, + "num_input_tokens_seen": 193170140, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.1307373, + "step": 8971, + "time_per_iteration": 2.5379018783569336 + }, + { + "auxiliary_loss_clip": 0.01061875, + "auxiliary_loss_mlp": 0.01004405, + "balance_loss_clip": 1.03543591, + "balance_loss_mlp": 1.00294936, + "epoch": 0.5394258229370209, + "flos": 50317781351040.0, + "grad_norm": 0.8748655989493986, + "language_loss": 0.60283893, + "learning_rate": 1.8418491096185465e-06, + "loss": 0.62350172, + "num_input_tokens_seen": 193227235, + "router_z_loss_clip": 0.26464844, + "router_z_loss_mlp": 0.01457214, + "step": 8972, + "time_per_iteration": 3.079364538192749 + }, + { + "auxiliary_loss_clip": 0.01128471, + "auxiliary_loss_mlp": 0.01043203, + "balance_loss_clip": 1.05082965, + "balance_loss_mlp": 1.02897501, + "epoch": 0.5394859461896888, + "flos": 25411935012480.0, + "grad_norm": 1.4486457625122753, + "language_loss": 0.78513014, + "learning_rate": 1.841460870485045e-06, + "loss": 0.80684686, + "num_input_tokens_seen": 193248435, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.14221191, + "step": 8973, + "time_per_iteration": 2.533473014831543 + }, + { + "auxiliary_loss_clip": 0.01137717, + "auxiliary_loss_mlp": 0.01038618, + "balance_loss_clip": 1.05353141, + "balance_loss_mlp": 1.02315021, + "epoch": 0.5395460694423568, + "flos": 25478476957440.0, + "grad_norm": 1.746379692410048, + "language_loss": 0.73749346, + "learning_rate": 1.8410726373633623e-06, + "loss": 0.75925684, + "num_input_tokens_seen": 193267490, + "router_z_loss_clip": 0.84082031, + "router_z_loss_mlp": 0.15460205, + "step": 8974, + "time_per_iteration": 2.51780104637146 + }, + { + "auxiliary_loss_clip": 0.01055541, + "auxiliary_loss_mlp": 0.01007111, + "balance_loss_clip": 1.02949119, + "balance_loss_mlp": 1.00578904, + "epoch": 0.5396061926950249, + "flos": 53249493507840.0, + "grad_norm": 0.7237943521391842, + "language_loss": 0.51075101, + "learning_rate": 1.8406844102682215e-06, + "loss": 0.53137755, + "num_input_tokens_seen": 193326050, + "router_z_loss_clip": 0.26074219, + "router_z_loss_mlp": 0.01321411, + "step": 8975, + "time_per_iteration": 4.524048805236816 + }, + { + "auxiliary_loss_clip": 0.01132432, + "auxiliary_loss_mlp": 0.01040563, + "balance_loss_clip": 1.05371165, + "balance_loss_mlp": 1.02654934, + "epoch": 0.5396663159476928, + "flos": 26725080418560.0, + "grad_norm": 1.4160929065834447, + "language_loss": 0.7214554, + "learning_rate": 1.840296189214344e-06, + "loss": 0.74318534, + "num_input_tokens_seen": 193348785, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.14013672, + "step": 8976, + "time_per_iteration": 2.552535057067871 + }, + { + "auxiliary_loss_clip": 0.01132311, + "auxiliary_loss_mlp": 0.01034094, + "balance_loss_clip": 1.05361772, + "balance_loss_mlp": 1.02121973, + "epoch": 0.5397264392003608, + "flos": 23253380127360.0, + "grad_norm": 2.0905594028585748, + "language_loss": 0.69826442, + "learning_rate": 1.8399079742164509e-06, + "loss": 0.71992844, + "num_input_tokens_seen": 193367080, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.12872314, + "step": 8977, + "time_per_iteration": 2.4523983001708984 + }, + { + "auxiliary_loss_clip": 0.01128427, + "auxiliary_loss_mlp": 0.01038193, + "balance_loss_clip": 1.04888368, + "balance_loss_mlp": 1.02352452, + "epoch": 0.5397865624530287, + "flos": 18294188791680.0, + "grad_norm": 1.59389796978399, + "language_loss": 0.72784394, + "learning_rate": 1.8395197652892636e-06, + "loss": 0.74951017, + "num_input_tokens_seen": 193383715, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.14654541, + "step": 8978, + "time_per_iteration": 2.4731690883636475 + }, + { + "auxiliary_loss_clip": 0.01137635, + "auxiliary_loss_mlp": 0.01037677, + "balance_loss_clip": 1.05460358, + "balance_loss_mlp": 1.02197111, + "epoch": 0.5398466857056967, + "flos": 15297514888320.0, + "grad_norm": 2.0611671463631156, + "language_loss": 0.74092615, + "learning_rate": 1.8391315624475028e-06, + "loss": 0.76267928, + "num_input_tokens_seen": 193400560, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.15716553, + "step": 8979, + "time_per_iteration": 2.390132188796997 + }, + { + "auxiliary_loss_clip": 0.01143137, + "auxiliary_loss_mlp": 0.01054623, + "balance_loss_clip": 1.05818748, + "balance_loss_mlp": 1.04000235, + "epoch": 0.5399068089583646, + "flos": 17821748183040.0, + "grad_norm": 2.3426586349141525, + "language_loss": 0.76843786, + "learning_rate": 1.8387433657058892e-06, + "loss": 0.79041553, + "num_input_tokens_seen": 193418680, + "router_z_loss_clip": 0.84960938, + "router_z_loss_mlp": 0.14624023, + "step": 8980, + "time_per_iteration": 2.4578588008880615 + }, + { + "auxiliary_loss_clip": 0.01128215, + "auxiliary_loss_mlp": 0.01031953, + "balance_loss_clip": 1.04900408, + "balance_loss_mlp": 1.01931703, + "epoch": 0.5399669322110326, + "flos": 27381635164800.0, + "grad_norm": 2.2193994317575654, + "language_loss": 0.82189822, + "learning_rate": 1.8383551750791431e-06, + "loss": 0.8434999, + "num_input_tokens_seen": 193439310, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.12646484, + "step": 8981, + "time_per_iteration": 2.505723237991333 + }, + { + "auxiliary_loss_clip": 0.01130891, + "auxiliary_loss_mlp": 0.01028823, + "balance_loss_clip": 1.0516727, + "balance_loss_mlp": 1.01568651, + "epoch": 0.5400270554637006, + "flos": 20449116403200.0, + "grad_norm": 1.8751532267539666, + "language_loss": 0.67509544, + "learning_rate": 1.8379669905819857e-06, + "loss": 0.69669259, + "num_input_tokens_seen": 193458115, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.13146973, + "step": 8982, + "time_per_iteration": 2.5245234966278076 + }, + { + "auxiliary_loss_clip": 0.01138102, + "auxiliary_loss_mlp": 0.01036028, + "balance_loss_clip": 1.05967784, + "balance_loss_mlp": 1.02351677, + "epoch": 0.5400871787163686, + "flos": 21689578638720.0, + "grad_norm": 1.5435888189040026, + "language_loss": 0.82737207, + "learning_rate": 1.8375788122291358e-06, + "loss": 0.84911335, + "num_input_tokens_seen": 193477365, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.12512207, + "step": 8983, + "time_per_iteration": 2.4631083011627197 + }, + { + "auxiliary_loss_clip": 0.01125811, + "auxiliary_loss_mlp": 0.01037983, + "balance_loss_clip": 1.04881704, + "balance_loss_mlp": 1.02451849, + "epoch": 0.5401473019690365, + "flos": 19204739585280.0, + "grad_norm": 1.777001999465456, + "language_loss": 0.70625818, + "learning_rate": 1.8371906400353138e-06, + "loss": 0.72789615, + "num_input_tokens_seen": 193495595, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.13464355, + "step": 8984, + "time_per_iteration": 2.4700191020965576 + }, + { + "auxiliary_loss_clip": 0.01139772, + "auxiliary_loss_mlp": 0.01036993, + "balance_loss_clip": 1.05900443, + "balance_loss_mlp": 1.02266431, + "epoch": 0.5402074252217045, + "flos": 20627376624000.0, + "grad_norm": 1.773737483978491, + "language_loss": 0.80145448, + "learning_rate": 1.8368024740152386e-06, + "loss": 0.8232221, + "num_input_tokens_seen": 193514035, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.14331055, + "step": 8985, + "time_per_iteration": 2.43496036529541 + }, + { + "auxiliary_loss_clip": 0.01123151, + "auxiliary_loss_mlp": 0.01031121, + "balance_loss_clip": 1.04961228, + "balance_loss_mlp": 1.0180676, + "epoch": 0.5402675484743724, + "flos": 24973465691520.0, + "grad_norm": 1.6537501433842536, + "language_loss": 0.78706098, + "learning_rate": 1.83641431418363e-06, + "loss": 0.80860376, + "num_input_tokens_seen": 193535445, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.13061523, + "step": 8986, + "time_per_iteration": 2.5067739486694336 + }, + { + "auxiliary_loss_clip": 0.01128464, + "auxiliary_loss_mlp": 0.01031902, + "balance_loss_clip": 1.05076623, + "balance_loss_mlp": 1.01924777, + "epoch": 0.5403276717270404, + "flos": 19459022941440.0, + "grad_norm": 1.5971929622404364, + "language_loss": 0.76610053, + "learning_rate": 1.8360261605552075e-06, + "loss": 0.78770423, + "num_input_tokens_seen": 193554780, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.12664795, + "step": 8987, + "time_per_iteration": 2.4411370754241943 + }, + { + "auxiliary_loss_clip": 0.0113078, + "auxiliary_loss_mlp": 0.01032016, + "balance_loss_clip": 1.05352902, + "balance_loss_mlp": 1.0187602, + "epoch": 0.5403877949797083, + "flos": 18442140912000.0, + "grad_norm": 1.734199932739806, + "language_loss": 0.7119357, + "learning_rate": 1.8356380131446887e-06, + "loss": 0.73356366, + "num_input_tokens_seen": 193573580, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.13256836, + "step": 8988, + "time_per_iteration": 2.438845157623291 + }, + { + "auxiliary_loss_clip": 0.01137193, + "auxiliary_loss_mlp": 0.01041467, + "balance_loss_clip": 1.05397499, + "balance_loss_mlp": 1.02746606, + "epoch": 0.5404479182323764, + "flos": 28292868316800.0, + "grad_norm": 2.368423074253296, + "language_loss": 0.67296028, + "learning_rate": 1.8352498719667934e-06, + "loss": 0.69474691, + "num_input_tokens_seen": 193590490, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 0.14007568, + "step": 8989, + "time_per_iteration": 2.5068345069885254 + }, + { + "auxiliary_loss_clip": 0.01128024, + "auxiliary_loss_mlp": 0.0103771, + "balance_loss_clip": 1.04956985, + "balance_loss_mlp": 1.02381063, + "epoch": 0.5405080414850444, + "flos": 23367325046400.0, + "grad_norm": 1.6655710504961263, + "language_loss": 0.77655679, + "learning_rate": 1.8348617370362399e-06, + "loss": 0.79821414, + "num_input_tokens_seen": 193609900, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.13897705, + "step": 8990, + "time_per_iteration": 2.5612621307373047 + }, + { + "auxiliary_loss_clip": 0.01132434, + "auxiliary_loss_mlp": 0.0103558, + "balance_loss_clip": 1.05097091, + "balance_loss_mlp": 1.02306318, + "epoch": 0.5405681647377123, + "flos": 21106425335040.0, + "grad_norm": 1.5525523347761112, + "language_loss": 0.69081604, + "learning_rate": 1.834473608367745e-06, + "loss": 0.71249616, + "num_input_tokens_seen": 193629775, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.12524414, + "step": 8991, + "time_per_iteration": 2.4667930603027344 + }, + { + "auxiliary_loss_clip": 0.01123222, + "auxiliary_loss_mlp": 0.01032645, + "balance_loss_clip": 1.04554665, + "balance_loss_mlp": 1.01897168, + "epoch": 0.5406282879903803, + "flos": 20449188230400.0, + "grad_norm": 2.0616533087054347, + "language_loss": 0.76213956, + "learning_rate": 1.8340854859760277e-06, + "loss": 0.78369832, + "num_input_tokens_seen": 193648070, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.13665771, + "step": 8992, + "time_per_iteration": 2.471205472946167 + }, + { + "auxiliary_loss_clip": 0.01124996, + "auxiliary_loss_mlp": 0.01035498, + "balance_loss_clip": 1.04514718, + "balance_loss_mlp": 1.02162766, + "epoch": 0.5406884112430482, + "flos": 14209493973120.0, + "grad_norm": 2.7158920522920864, + "language_loss": 0.75733864, + "learning_rate": 1.8336973698758056e-06, + "loss": 0.77894354, + "num_input_tokens_seen": 193665060, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.13879395, + "step": 8993, + "time_per_iteration": 3.8251590728759766 + }, + { + "auxiliary_loss_clip": 0.01131961, + "auxiliary_loss_mlp": 0.01031063, + "balance_loss_clip": 1.05608606, + "balance_loss_mlp": 1.0184927, + "epoch": 0.5407485344957162, + "flos": 23875568536320.0, + "grad_norm": 1.6038982238969568, + "language_loss": 0.70424592, + "learning_rate": 1.8333092600817959e-06, + "loss": 0.72587621, + "num_input_tokens_seen": 193683620, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.12554932, + "step": 8994, + "time_per_iteration": 2.4754602909088135 + }, + { + "auxiliary_loss_clip": 0.01127782, + "auxiliary_loss_mlp": 0.01035874, + "balance_loss_clip": 1.04936218, + "balance_loss_mlp": 1.02100837, + "epoch": 0.5408086577483842, + "flos": 23148485435520.0, + "grad_norm": 2.260265509253964, + "language_loss": 0.75519264, + "learning_rate": 1.8329211566087157e-06, + "loss": 0.77682912, + "num_input_tokens_seen": 193702990, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.14855957, + "step": 8995, + "time_per_iteration": 2.4563980102539062 + }, + { + "auxiliary_loss_clip": 0.01124009, + "auxiliary_loss_mlp": 0.01039565, + "balance_loss_clip": 1.04731083, + "balance_loss_mlp": 1.02668405, + "epoch": 0.5408687810010522, + "flos": 18771046773120.0, + "grad_norm": 1.7746734892521412, + "language_loss": 0.73512411, + "learning_rate": 1.832533059471282e-06, + "loss": 0.75675988, + "num_input_tokens_seen": 193721785, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.12884521, + "step": 8996, + "time_per_iteration": 2.454767942428589 + }, + { + "auxiliary_loss_clip": 0.01125279, + "auxiliary_loss_mlp": 0.01037837, + "balance_loss_clip": 1.04910111, + "balance_loss_mlp": 1.02545702, + "epoch": 0.5409289042537201, + "flos": 13881557779200.0, + "grad_norm": 2.566827767800965, + "language_loss": 0.7364763, + "learning_rate": 1.8321449686842115e-06, + "loss": 0.75810748, + "num_input_tokens_seen": 193740315, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.12384033, + "step": 8997, + "time_per_iteration": 2.5066888332366943 + }, + { + "auxiliary_loss_clip": 0.01132061, + "auxiliary_loss_mlp": 0.01032183, + "balance_loss_clip": 1.05393147, + "balance_loss_mlp": 1.01880741, + "epoch": 0.5409890275063881, + "flos": 14465357527680.0, + "grad_norm": 2.0468573348556705, + "language_loss": 0.71924031, + "learning_rate": 1.8317568842622207e-06, + "loss": 0.74088281, + "num_input_tokens_seen": 193757580, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.13372803, + "step": 8998, + "time_per_iteration": 2.4493279457092285 + }, + { + "auxiliary_loss_clip": 0.01130766, + "auxiliary_loss_mlp": 0.01036216, + "balance_loss_clip": 1.05398011, + "balance_loss_mlp": 1.0236032, + "epoch": 0.541049150759056, + "flos": 48977449349760.0, + "grad_norm": 1.4275374225009132, + "language_loss": 0.70366001, + "learning_rate": 1.8313688062200256e-06, + "loss": 0.72532982, + "num_input_tokens_seen": 193780965, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.1260376, + "step": 8999, + "time_per_iteration": 2.6920006275177 + }, + { + "auxiliary_loss_clip": 0.01119073, + "auxiliary_loss_mlp": 0.01034086, + "balance_loss_clip": 1.04480386, + "balance_loss_mlp": 1.02128267, + "epoch": 0.541109274011724, + "flos": 18147601388160.0, + "grad_norm": 2.182283773579217, + "language_loss": 0.80780232, + "learning_rate": 1.8309807345723422e-06, + "loss": 0.82933396, + "num_input_tokens_seen": 193797855, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.12799072, + "step": 9000, + "time_per_iteration": 3.857663869857788 + }, + { + "auxiliary_loss_clip": 0.01130169, + "auxiliary_loss_mlp": 0.01032255, + "balance_loss_clip": 1.05236018, + "balance_loss_mlp": 1.0189929, + "epoch": 0.541169397264392, + "flos": 20522553759360.0, + "grad_norm": 1.5247801790199018, + "language_loss": 0.72748196, + "learning_rate": 1.8305926693338863e-06, + "loss": 0.74910617, + "num_input_tokens_seen": 193817375, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.1328125, + "step": 9001, + "time_per_iteration": 2.4597105979919434 + }, + { + "auxiliary_loss_clip": 0.01129748, + "auxiliary_loss_mlp": 0.01043459, + "balance_loss_clip": 1.04644573, + "balance_loss_mlp": 1.0277946, + "epoch": 0.54122952051706, + "flos": 20044043752320.0, + "grad_norm": 2.309985060423673, + "language_loss": 0.84825581, + "learning_rate": 1.8302046105193734e-06, + "loss": 0.86998785, + "num_input_tokens_seen": 193832205, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 0.15673828, + "step": 9002, + "time_per_iteration": 2.4297873973846436 + }, + { + "auxiliary_loss_clip": 0.01125891, + "auxiliary_loss_mlp": 0.0103839, + "balance_loss_clip": 1.04895556, + "balance_loss_mlp": 1.02609324, + "epoch": 0.541289643769728, + "flos": 19062246332160.0, + "grad_norm": 2.005322239199558, + "language_loss": 0.77924198, + "learning_rate": 1.8298165581435183e-06, + "loss": 0.80088484, + "num_input_tokens_seen": 193849830, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.12298584, + "step": 9003, + "time_per_iteration": 2.4531185626983643 + }, + { + "auxiliary_loss_clip": 0.01126496, + "auxiliary_loss_mlp": 0.01034231, + "balance_loss_clip": 1.04812968, + "balance_loss_mlp": 1.0205512, + "epoch": 0.5413497670223959, + "flos": 22382295402240.0, + "grad_norm": 1.9006453915540868, + "language_loss": 0.69066739, + "learning_rate": 1.8294285122210372e-06, + "loss": 0.71227467, + "num_input_tokens_seen": 193869945, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.13677979, + "step": 9004, + "time_per_iteration": 2.4516961574554443 + }, + { + "auxiliary_loss_clip": 0.01060639, + "auxiliary_loss_mlp": 0.01010247, + "balance_loss_clip": 1.03414893, + "balance_loss_mlp": 1.00873339, + "epoch": 0.5414098902750639, + "flos": 70031734093440.0, + "grad_norm": 0.9702284893501322, + "language_loss": 0.59032935, + "learning_rate": 1.8290404727666434e-06, + "loss": 0.61103821, + "num_input_tokens_seen": 193930860, + "router_z_loss_clip": 0.26464844, + "router_z_loss_mlp": 0.01515198, + "step": 9005, + "time_per_iteration": 3.167172431945801 + }, + { + "auxiliary_loss_clip": 0.01132547, + "auxiliary_loss_mlp": 0.01033663, + "balance_loss_clip": 1.05356634, + "balance_loss_mlp": 1.02156925, + "epoch": 0.5414700135277318, + "flos": 21798962530560.0, + "grad_norm": 2.022882748336172, + "language_loss": 0.77911437, + "learning_rate": 1.8286524397950517e-06, + "loss": 0.80077648, + "num_input_tokens_seen": 193949075, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.12084961, + "step": 9006, + "time_per_iteration": 3.9475767612457275 + }, + { + "auxiliary_loss_clip": 0.01125257, + "auxiliary_loss_mlp": 0.01038726, + "balance_loss_clip": 1.04865241, + "balance_loss_mlp": 1.02741897, + "epoch": 0.5415301367803999, + "flos": 16907929251840.0, + "grad_norm": 1.6150139750886996, + "language_loss": 0.83455712, + "learning_rate": 1.8282644133209777e-06, + "loss": 0.856197, + "num_input_tokens_seen": 193967630, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.11303711, + "step": 9007, + "time_per_iteration": 2.5097968578338623 + }, + { + "auxiliary_loss_clip": 0.01130461, + "auxiliary_loss_mlp": 0.01033948, + "balance_loss_clip": 1.05154634, + "balance_loss_mlp": 1.02079344, + "epoch": 0.5415902600330678, + "flos": 25704176065920.0, + "grad_norm": 2.632403353760036, + "language_loss": 0.67306799, + "learning_rate": 1.8278763933591334e-06, + "loss": 0.6947121, + "num_input_tokens_seen": 193988730, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.1315918, + "step": 9008, + "time_per_iteration": 2.575423002243042 + }, + { + "auxiliary_loss_clip": 0.01140165, + "auxiliary_loss_mlp": 0.01034581, + "balance_loss_clip": 1.05704999, + "balance_loss_mlp": 1.02008498, + "epoch": 0.5416503832857358, + "flos": 19208151377280.0, + "grad_norm": 2.7018692554474533, + "language_loss": 0.73877859, + "learning_rate": 1.827488379924234e-06, + "loss": 0.76052606, + "num_input_tokens_seen": 194005160, + "router_z_loss_clip": 0.83251953, + "router_z_loss_mlp": 0.14501953, + "step": 9009, + "time_per_iteration": 2.449136734008789 + }, + { + "auxiliary_loss_clip": 0.01127345, + "auxiliary_loss_mlp": 0.01038263, + "balance_loss_clip": 1.04884708, + "balance_loss_mlp": 1.0241245, + "epoch": 0.5417105065384037, + "flos": 12713706887040.0, + "grad_norm": 2.105794920715, + "language_loss": 0.87796831, + "learning_rate": 1.8271003730309923e-06, + "loss": 0.89962441, + "num_input_tokens_seen": 194021700, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.14135742, + "step": 9010, + "time_per_iteration": 2.4775807857513428 + }, + { + "auxiliary_loss_clip": 0.01131783, + "auxiliary_loss_mlp": 0.01032454, + "balance_loss_clip": 1.05437601, + "balance_loss_mlp": 1.01973414, + "epoch": 0.5417706297910717, + "flos": 30335933998080.0, + "grad_norm": 1.9674630566330518, + "language_loss": 0.65266073, + "learning_rate": 1.826712372694122e-06, + "loss": 0.67430311, + "num_input_tokens_seen": 194042620, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.12713623, + "step": 9011, + "time_per_iteration": 2.5605809688568115 + }, + { + "auxiliary_loss_clip": 0.01124943, + "auxiliary_loss_mlp": 0.01035572, + "balance_loss_clip": 1.04749417, + "balance_loss_mlp": 1.02179158, + "epoch": 0.5418307530437396, + "flos": 29020992912000.0, + "grad_norm": 2.561108411781966, + "language_loss": 0.79048491, + "learning_rate": 1.8263243789283362e-06, + "loss": 0.8120901, + "num_input_tokens_seen": 194061800, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.13800049, + "step": 9012, + "time_per_iteration": 2.553060293197632 + }, + { + "auxiliary_loss_clip": 0.01123433, + "auxiliary_loss_mlp": 0.0103516, + "balance_loss_clip": 1.04722238, + "balance_loss_mlp": 1.02067018, + "epoch": 0.5418908762964076, + "flos": 16873455173760.0, + "grad_norm": 2.1330923765095187, + "language_loss": 0.7462244, + "learning_rate": 1.8259363917483466e-06, + "loss": 0.76781034, + "num_input_tokens_seen": 194079890, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.14477539, + "step": 9013, + "time_per_iteration": 2.4796030521392822 + }, + { + "auxiliary_loss_clip": 0.01133199, + "auxiliary_loss_mlp": 0.01036615, + "balance_loss_clip": 1.04963088, + "balance_loss_mlp": 1.02238119, + "epoch": 0.5419509995490756, + "flos": 18949702043520.0, + "grad_norm": 2.8530366937338862, + "language_loss": 0.72465098, + "learning_rate": 1.8255484111688667e-06, + "loss": 0.74634916, + "num_input_tokens_seen": 194097625, + "router_z_loss_clip": 0.83544922, + "router_z_loss_mlp": 0.14233398, + "step": 9014, + "time_per_iteration": 2.4698822498321533 + }, + { + "auxiliary_loss_clip": 0.01129942, + "auxiliary_loss_mlp": 0.01033461, + "balance_loss_clip": 1.0507257, + "balance_loss_mlp": 1.02000833, + "epoch": 0.5420111228017436, + "flos": 18077719478400.0, + "grad_norm": 1.6053751080353489, + "language_loss": 0.80556262, + "learning_rate": 1.8251604372046085e-06, + "loss": 0.82719672, + "num_input_tokens_seen": 194116055, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.13464355, + "step": 9015, + "time_per_iteration": 2.4405345916748047 + }, + { + "auxiliary_loss_clip": 0.0113549, + "auxiliary_loss_mlp": 0.01038299, + "balance_loss_clip": 1.05379605, + "balance_loss_mlp": 1.02479887, + "epoch": 0.5420712460544116, + "flos": 19061779455360.0, + "grad_norm": 2.128203680400395, + "language_loss": 0.81001663, + "learning_rate": 1.8247724698702843e-06, + "loss": 0.83175457, + "num_input_tokens_seen": 194130365, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.1350708, + "step": 9016, + "time_per_iteration": 2.4793221950531006 + }, + { + "auxiliary_loss_clip": 0.01127972, + "auxiliary_loss_mlp": 0.01030968, + "balance_loss_clip": 1.05103683, + "balance_loss_mlp": 1.0180397, + "epoch": 0.5421313693070795, + "flos": 18187103370240.0, + "grad_norm": 1.7839163967428417, + "language_loss": 0.81586599, + "learning_rate": 1.8243845091806053e-06, + "loss": 0.83745539, + "num_input_tokens_seen": 194148975, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.1293335, + "step": 9017, + "time_per_iteration": 2.428283929824829 + }, + { + "auxiliary_loss_clip": 0.01125974, + "auxiliary_loss_mlp": 0.01033318, + "balance_loss_clip": 1.05064178, + "balance_loss_mlp": 1.02030647, + "epoch": 0.5421914925597475, + "flos": 13005947940480.0, + "grad_norm": 2.0832352931940177, + "language_loss": 0.77625865, + "learning_rate": 1.8239965551502837e-06, + "loss": 0.79785156, + "num_input_tokens_seen": 194167185, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.13012695, + "step": 9018, + "time_per_iteration": 3.90946364402771 + }, + { + "auxiliary_loss_clip": 0.01128688, + "auxiliary_loss_mlp": 0.01039307, + "balance_loss_clip": 1.04843664, + "balance_loss_mlp": 1.02566385, + "epoch": 0.5422516158124154, + "flos": 46758457831680.0, + "grad_norm": 1.467440331575889, + "language_loss": 0.66604805, + "learning_rate": 1.8236086077940303e-06, + "loss": 0.68772805, + "num_input_tokens_seen": 194192840, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.1362915, + "step": 9019, + "time_per_iteration": 2.6653382778167725 + }, + { + "auxiliary_loss_clip": 0.01124246, + "auxiliary_loss_mlp": 0.01032485, + "balance_loss_clip": 1.04931641, + "balance_loss_mlp": 1.01986122, + "epoch": 0.5423117390650835, + "flos": 31758642864000.0, + "grad_norm": 1.8989776594688321, + "language_loss": 0.69730437, + "learning_rate": 1.8232206671265555e-06, + "loss": 0.71887159, + "num_input_tokens_seen": 194213150, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12615967, + "step": 9020, + "time_per_iteration": 2.512448787689209 + }, + { + "auxiliary_loss_clip": 0.01123149, + "auxiliary_loss_mlp": 0.01034821, + "balance_loss_clip": 1.04953361, + "balance_loss_mlp": 1.02287579, + "epoch": 0.5423718623177514, + "flos": 27201974313600.0, + "grad_norm": 1.480142796065923, + "language_loss": 0.80570155, + "learning_rate": 1.8228327331625717e-06, + "loss": 0.82728124, + "num_input_tokens_seen": 194234665, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.1194458, + "step": 9021, + "time_per_iteration": 2.4960412979125977 + }, + { + "auxiliary_loss_clip": 0.01125614, + "auxiliary_loss_mlp": 0.01036984, + "balance_loss_clip": 1.04913282, + "balance_loss_mlp": 1.02397847, + "epoch": 0.5424319855704194, + "flos": 23546447193600.0, + "grad_norm": 4.439512831299215, + "language_loss": 0.78550875, + "learning_rate": 1.822444805916788e-06, + "loss": 0.80713463, + "num_input_tokens_seen": 194253790, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.13012695, + "step": 9022, + "time_per_iteration": 2.468628168106079 + }, + { + "auxiliary_loss_clip": 0.01118229, + "auxiliary_loss_mlp": 0.01033617, + "balance_loss_clip": 1.04186618, + "balance_loss_mlp": 1.02124286, + "epoch": 0.5424921088230873, + "flos": 26615624699520.0, + "grad_norm": 1.7275387808550955, + "language_loss": 0.82000566, + "learning_rate": 1.822056885403915e-06, + "loss": 0.84152412, + "num_input_tokens_seen": 194274950, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.12371826, + "step": 9023, + "time_per_iteration": 2.5152809619903564 + }, + { + "auxiliary_loss_clip": 0.01123363, + "auxiliary_loss_mlp": 0.01038229, + "balance_loss_clip": 1.04611325, + "balance_loss_mlp": 1.0244844, + "epoch": 0.5425522320757553, + "flos": 23586811102080.0, + "grad_norm": 1.5039848748393019, + "language_loss": 0.70922029, + "learning_rate": 1.8216689716386627e-06, + "loss": 0.73083621, + "num_input_tokens_seen": 194296155, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.13739014, + "step": 9024, + "time_per_iteration": 2.463864326477051 + }, + { + "auxiliary_loss_clip": 0.01134036, + "auxiliary_loss_mlp": 0.01028835, + "balance_loss_clip": 1.05425286, + "balance_loss_mlp": 1.01664627, + "epoch": 0.5426123553284232, + "flos": 30592264429440.0, + "grad_norm": 1.6317972556616525, + "language_loss": 0.65123957, + "learning_rate": 1.8212810646357405e-06, + "loss": 0.67286831, + "num_input_tokens_seen": 194318025, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.12188721, + "step": 9025, + "time_per_iteration": 2.5220448970794678 + }, + { + "auxiliary_loss_clip": 0.01130314, + "auxiliary_loss_mlp": 0.01036628, + "balance_loss_clip": 1.05198669, + "balance_loss_mlp": 1.02284718, + "epoch": 0.5426724785810912, + "flos": 12495118671360.0, + "grad_norm": 1.7790355903349053, + "language_loss": 0.74253631, + "learning_rate": 1.8208931644098591e-06, + "loss": 0.76420569, + "num_input_tokens_seen": 194336150, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.13781738, + "step": 9026, + "time_per_iteration": 2.4330501556396484 + }, + { + "auxiliary_loss_clip": 0.01121723, + "auxiliary_loss_mlp": 0.01040166, + "balance_loss_clip": 1.04367495, + "balance_loss_mlp": 1.02418017, + "epoch": 0.5427326018337592, + "flos": 26064611089920.0, + "grad_norm": 2.805552005456609, + "language_loss": 0.7836982, + "learning_rate": 1.8205052709757265e-06, + "loss": 0.80531704, + "num_input_tokens_seen": 194355980, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.15991211, + "step": 9027, + "time_per_iteration": 2.5036652088165283 + }, + { + "auxiliary_loss_clip": 0.01053582, + "auxiliary_loss_mlp": 0.01010205, + "balance_loss_clip": 1.02791119, + "balance_loss_mlp": 1.00872254, + "epoch": 0.5427927250864272, + "flos": 65984745576960.0, + "grad_norm": 0.7475986476930014, + "language_loss": 0.56559515, + "learning_rate": 1.8201173843480515e-06, + "loss": 0.58623308, + "num_input_tokens_seen": 194422660, + "router_z_loss_clip": 0.25732422, + "router_z_loss_mlp": 0.01481628, + "step": 9028, + "time_per_iteration": 3.1002562046051025 + }, + { + "auxiliary_loss_clip": 0.01128418, + "auxiliary_loss_mlp": 0.01036751, + "balance_loss_clip": 1.04722166, + "balance_loss_mlp": 1.02144432, + "epoch": 0.5428528483390952, + "flos": 19975382904960.0, + "grad_norm": 1.9490535280685108, + "language_loss": 0.77828312, + "learning_rate": 1.8197295045415442e-06, + "loss": 0.79993486, + "num_input_tokens_seen": 194438545, + "router_z_loss_clip": 0.81201172, + "router_z_loss_mlp": 0.15307617, + "step": 9029, + "time_per_iteration": 2.4606940746307373 + }, + { + "auxiliary_loss_clip": 0.01131857, + "auxiliary_loss_mlp": 0.01027764, + "balance_loss_clip": 1.05498374, + "balance_loss_mlp": 1.01483607, + "epoch": 0.5429129715917631, + "flos": 21832323287040.0, + "grad_norm": 2.2671622113118817, + "language_loss": 0.83201265, + "learning_rate": 1.8193416315709112e-06, + "loss": 0.85360885, + "num_input_tokens_seen": 194458060, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.12921143, + "step": 9030, + "time_per_iteration": 2.51949143409729 + }, + { + "auxiliary_loss_clip": 0.01126631, + "auxiliary_loss_mlp": 0.01032596, + "balance_loss_clip": 1.04866481, + "balance_loss_mlp": 1.02049685, + "epoch": 0.5429730948444311, + "flos": 27782685492480.0, + "grad_norm": 1.8420176539755957, + "language_loss": 0.74635243, + "learning_rate": 1.8189537654508623e-06, + "loss": 0.76794469, + "num_input_tokens_seen": 194477405, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.12097168, + "step": 9031, + "time_per_iteration": 2.5144309997558594 + }, + { + "auxiliary_loss_clip": 0.01124228, + "auxiliary_loss_mlp": 0.01031391, + "balance_loss_clip": 1.04876328, + "balance_loss_mlp": 1.01994729, + "epoch": 0.543033218097099, + "flos": 26760452336640.0, + "grad_norm": 3.053318344134766, + "language_loss": 0.8523131, + "learning_rate": 1.8185659061961045e-06, + "loss": 0.8738693, + "num_input_tokens_seen": 194497085, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.11456299, + "step": 9032, + "time_per_iteration": 2.496299982070923 + }, + { + "auxiliary_loss_clip": 0.01135076, + "auxiliary_loss_mlp": 0.01034194, + "balance_loss_clip": 1.05419612, + "balance_loss_mlp": 1.02098548, + "epoch": 0.5430933413497671, + "flos": 22675254727680.0, + "grad_norm": 1.7251183076519976, + "language_loss": 0.73810959, + "learning_rate": 1.8181780538213457e-06, + "loss": 0.75980222, + "num_input_tokens_seen": 194516785, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.13208008, + "step": 9033, + "time_per_iteration": 2.4737913608551025 + }, + { + "auxiliary_loss_clip": 0.01125105, + "auxiliary_loss_mlp": 0.01033682, + "balance_loss_clip": 1.04786015, + "balance_loss_mlp": 1.02022362, + "epoch": 0.543153464602435, + "flos": 24607499973120.0, + "grad_norm": 2.8149308441803624, + "language_loss": 0.75412244, + "learning_rate": 1.8177902083412935e-06, + "loss": 0.77571034, + "num_input_tokens_seen": 194536475, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.13458252, + "step": 9034, + "time_per_iteration": 2.476789951324463 + }, + { + "auxiliary_loss_clip": 0.01130757, + "auxiliary_loss_mlp": 0.01030909, + "balance_loss_clip": 1.05393624, + "balance_loss_mlp": 1.01882148, + "epoch": 0.543213587855103, + "flos": 19025725178880.0, + "grad_norm": 1.9131772766300723, + "language_loss": 0.84563285, + "learning_rate": 1.817402369770655e-06, + "loss": 0.86724961, + "num_input_tokens_seen": 194554495, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.12078857, + "step": 9035, + "time_per_iteration": 2.448960304260254 + }, + { + "auxiliary_loss_clip": 0.01065355, + "auxiliary_loss_mlp": 0.01001104, + "balance_loss_clip": 1.03776193, + "balance_loss_mlp": 0.99977517, + "epoch": 0.5432737111077709, + "flos": 65686435125120.0, + "grad_norm": 0.7408002440135874, + "language_loss": 0.55874288, + "learning_rate": 1.8170145381241364e-06, + "loss": 0.57940751, + "num_input_tokens_seen": 194617620, + "router_z_loss_clip": 0.27587891, + "router_z_loss_mlp": 0.01330566, + "step": 9036, + "time_per_iteration": 3.0587618350982666 + }, + { + "auxiliary_loss_clip": 0.01127788, + "auxiliary_loss_mlp": 0.01035151, + "balance_loss_clip": 1.04998827, + "balance_loss_mlp": 1.02232432, + "epoch": 0.5433338343604389, + "flos": 22091670460800.0, + "grad_norm": 1.5681077339765601, + "language_loss": 0.75134659, + "learning_rate": 1.8166267134164451e-06, + "loss": 0.77297592, + "num_input_tokens_seen": 194637690, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.12817383, + "step": 9037, + "time_per_iteration": 3.909559726715088 + }, + { + "auxiliary_loss_clip": 0.01132283, + "auxiliary_loss_mlp": 0.01033521, + "balance_loss_clip": 1.05267227, + "balance_loss_mlp": 1.02010345, + "epoch": 0.5433939576131068, + "flos": 34672649616000.0, + "grad_norm": 1.703945691739668, + "language_loss": 0.66791016, + "learning_rate": 1.8162388956622875e-06, + "loss": 0.68956816, + "num_input_tokens_seen": 194659520, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.13415527, + "step": 9038, + "time_per_iteration": 2.5577144622802734 + }, + { + "auxiliary_loss_clip": 0.011276, + "auxiliary_loss_mlp": 0.01030728, + "balance_loss_clip": 1.05251479, + "balance_loss_mlp": 1.01872993, + "epoch": 0.5434540808657748, + "flos": 20303355012480.0, + "grad_norm": 1.7587306202493662, + "language_loss": 0.77861238, + "learning_rate": 1.8158510848763692e-06, + "loss": 0.80019557, + "num_input_tokens_seen": 194677645, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11999512, + "step": 9039, + "time_per_iteration": 2.4746320247650146 + }, + { + "auxiliary_loss_clip": 0.01133714, + "auxiliary_loss_mlp": 0.0103217, + "balance_loss_clip": 1.05495167, + "balance_loss_mlp": 1.01995158, + "epoch": 0.5435142041184428, + "flos": 23112790295040.0, + "grad_norm": 1.739152896847, + "language_loss": 0.76552236, + "learning_rate": 1.8154632810733962e-06, + "loss": 0.7871812, + "num_input_tokens_seen": 194697400, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.12219238, + "step": 9040, + "time_per_iteration": 2.470280647277832 + }, + { + "auxiliary_loss_clip": 0.01054724, + "auxiliary_loss_mlp": 0.01007046, + "balance_loss_clip": 1.02828312, + "balance_loss_mlp": 1.00559139, + "epoch": 0.5435743273711108, + "flos": 64012746954240.0, + "grad_norm": 0.6592174694845376, + "language_loss": 0.52361989, + "learning_rate": 1.815075484268074e-06, + "loss": 0.54423761, + "num_input_tokens_seen": 194761205, + "router_z_loss_clip": 0.26513672, + "router_z_loss_mlp": 0.01454163, + "step": 9041, + "time_per_iteration": 3.0474159717559814 + }, + { + "auxiliary_loss_clip": 0.01131782, + "auxiliary_loss_mlp": 0.01036217, + "balance_loss_clip": 1.05545545, + "balance_loss_mlp": 1.02357531, + "epoch": 0.5436344506237788, + "flos": 25118903859840.0, + "grad_norm": 1.825076239881633, + "language_loss": 0.76481044, + "learning_rate": 1.8146876944751078e-06, + "loss": 0.78649044, + "num_input_tokens_seen": 194782445, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.12640381, + "step": 9042, + "time_per_iteration": 2.47151517868042 + }, + { + "auxiliary_loss_clip": 0.0112185, + "auxiliary_loss_mlp": 0.01030172, + "balance_loss_clip": 1.04741037, + "balance_loss_mlp": 1.01834035, + "epoch": 0.5436945738764467, + "flos": 19572967860480.0, + "grad_norm": 1.62186046581238, + "language_loss": 0.67474449, + "learning_rate": 1.8142999117092033e-06, + "loss": 0.69626474, + "num_input_tokens_seen": 194800325, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.11834717, + "step": 9043, + "time_per_iteration": 2.4741454124450684 + }, + { + "auxiliary_loss_clip": 0.01116123, + "auxiliary_loss_mlp": 0.01039955, + "balance_loss_clip": 1.04241037, + "balance_loss_mlp": 1.02693748, + "epoch": 0.5437546971291147, + "flos": 21142515525120.0, + "grad_norm": 2.3328976798805163, + "language_loss": 0.84490556, + "learning_rate": 1.8139121359850644e-06, + "loss": 0.86646634, + "num_input_tokens_seen": 194818675, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.13024902, + "step": 9044, + "time_per_iteration": 3.8848843574523926 + }, + { + "auxiliary_loss_clip": 0.01129365, + "auxiliary_loss_mlp": 0.01034295, + "balance_loss_clip": 1.04936504, + "balance_loss_mlp": 1.01970339, + "epoch": 0.5438148203817826, + "flos": 25118688378240.0, + "grad_norm": 1.5311328115925285, + "language_loss": 0.62373978, + "learning_rate": 1.8135243673173956e-06, + "loss": 0.64537632, + "num_input_tokens_seen": 194836595, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.14599609, + "step": 9045, + "time_per_iteration": 2.5052640438079834 + }, + { + "auxiliary_loss_clip": 0.01127126, + "auxiliary_loss_mlp": 0.0103271, + "balance_loss_clip": 1.04904747, + "balance_loss_mlp": 1.01956081, + "epoch": 0.5438749436344507, + "flos": 23002939526400.0, + "grad_norm": 1.7719065906172746, + "language_loss": 0.6983242, + "learning_rate": 1.8131366057209023e-06, + "loss": 0.71992254, + "num_input_tokens_seen": 194857520, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.13146973, + "step": 9046, + "time_per_iteration": 2.5377964973449707 + }, + { + "auxiliary_loss_clip": 0.01129993, + "auxiliary_loss_mlp": 0.01031047, + "balance_loss_clip": 1.05426419, + "balance_loss_mlp": 1.01873231, + "epoch": 0.5439350668871186, + "flos": 15487016065920.0, + "grad_norm": 1.5763450356836026, + "language_loss": 0.77109706, + "learning_rate": 1.8127488512102868e-06, + "loss": 0.79270744, + "num_input_tokens_seen": 194876020, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12316895, + "step": 9047, + "time_per_iteration": 2.429295539855957 + }, + { + "auxiliary_loss_clip": 0.01136824, + "auxiliary_loss_mlp": 0.01038305, + "balance_loss_clip": 1.05607688, + "balance_loss_mlp": 1.02575207, + "epoch": 0.5439951901397866, + "flos": 17238415311360.0, + "grad_norm": 1.8366468049466056, + "language_loss": 0.72584808, + "learning_rate": 1.8123611038002547e-06, + "loss": 0.74759942, + "num_input_tokens_seen": 194894650, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.12548828, + "step": 9048, + "time_per_iteration": 2.4868521690368652 + }, + { + "auxiliary_loss_clip": 0.01129915, + "auxiliary_loss_mlp": 0.01033706, + "balance_loss_clip": 1.05368876, + "balance_loss_mlp": 1.01994944, + "epoch": 0.5440553133924545, + "flos": 18661016436480.0, + "grad_norm": 2.2023048888359473, + "language_loss": 0.93312651, + "learning_rate": 1.8119733635055076e-06, + "loss": 0.9547627, + "num_input_tokens_seen": 194911935, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.13757324, + "step": 9049, + "time_per_iteration": 2.414551019668579 + }, + { + "auxiliary_loss_clip": 0.01123776, + "auxiliary_loss_mlp": 0.01030346, + "balance_loss_clip": 1.04946268, + "balance_loss_mlp": 1.01907516, + "epoch": 0.5441154366451225, + "flos": 27122934435840.0, + "grad_norm": 1.7769153881293647, + "language_loss": 0.73955548, + "learning_rate": 1.8115856303407492e-06, + "loss": 0.76109672, + "num_input_tokens_seen": 194931620, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11273193, + "step": 9050, + "time_per_iteration": 3.892847776412964 + }, + { + "auxiliary_loss_clip": 0.01135286, + "auxiliary_loss_mlp": 0.01029453, + "balance_loss_clip": 1.05624604, + "balance_loss_mlp": 1.01634026, + "epoch": 0.5441755598977904, + "flos": 25993867253760.0, + "grad_norm": 4.565360581193227, + "language_loss": 0.67142385, + "learning_rate": 1.8111979043206832e-06, + "loss": 0.69307125, + "num_input_tokens_seen": 194952560, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.13110352, + "step": 9051, + "time_per_iteration": 2.51424503326416 + }, + { + "auxiliary_loss_clip": 0.01126161, + "auxiliary_loss_mlp": 0.01032784, + "balance_loss_clip": 1.04992104, + "balance_loss_mlp": 1.02100599, + "epoch": 0.5442356831504584, + "flos": 32380041173760.0, + "grad_norm": 1.764172543262919, + "language_loss": 0.67752409, + "learning_rate": 1.810810185460011e-06, + "loss": 0.69911349, + "num_input_tokens_seen": 194973915, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.11791992, + "step": 9052, + "time_per_iteration": 2.522662401199341 + }, + { + "auxiliary_loss_clip": 0.01131688, + "auxiliary_loss_mlp": 0.01030976, + "balance_loss_clip": 1.05384588, + "balance_loss_mlp": 1.01852477, + "epoch": 0.5442958064031264, + "flos": 24164290056960.0, + "grad_norm": 1.8172220968657469, + "language_loss": 0.92859888, + "learning_rate": 1.810422473773436e-06, + "loss": 0.95022559, + "num_input_tokens_seen": 194990170, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.12463379, + "step": 9053, + "time_per_iteration": 2.506417989730835 + }, + { + "auxiliary_loss_clip": 0.01129495, + "auxiliary_loss_mlp": 0.01038611, + "balance_loss_clip": 1.05167401, + "balance_loss_mlp": 1.02556407, + "epoch": 0.5443559296557944, + "flos": 18764690065920.0, + "grad_norm": 1.8665647196720019, + "language_loss": 0.83235431, + "learning_rate": 1.8100347692756595e-06, + "loss": 0.85403538, + "num_input_tokens_seen": 195006395, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.13037109, + "step": 9054, + "time_per_iteration": 2.41945743560791 + }, + { + "auxiliary_loss_clip": 0.01128012, + "auxiliary_loss_mlp": 0.01034656, + "balance_loss_clip": 1.04981971, + "balance_loss_mlp": 1.02176297, + "epoch": 0.5444160529084624, + "flos": 22632556435200.0, + "grad_norm": 2.790753179087751, + "language_loss": 0.68445027, + "learning_rate": 1.8096470719813836e-06, + "loss": 0.70607698, + "num_input_tokens_seen": 195025080, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.12890625, + "step": 9055, + "time_per_iteration": 2.474318742752075 + }, + { + "auxiliary_loss_clip": 0.01058211, + "auxiliary_loss_mlp": 0.01012941, + "balance_loss_clip": 1.03134632, + "balance_loss_mlp": 1.01161647, + "epoch": 0.5444761761611303, + "flos": 69671909600640.0, + "grad_norm": 0.7423721621652428, + "language_loss": 0.57666779, + "learning_rate": 1.80925938190531e-06, + "loss": 0.59737933, + "num_input_tokens_seen": 195085725, + "router_z_loss_clip": 0.26904297, + "router_z_loss_mlp": 0.01324463, + "step": 9056, + "time_per_iteration": 3.1539323329925537 + }, + { + "auxiliary_loss_clip": 0.01127928, + "auxiliary_loss_mlp": 0.01036572, + "balance_loss_clip": 1.04676175, + "balance_loss_mlp": 1.02317238, + "epoch": 0.5445362994137983, + "flos": 14278442129280.0, + "grad_norm": 1.7509181709705057, + "language_loss": 0.69641769, + "learning_rate": 1.8088716990621395e-06, + "loss": 0.71806264, + "num_input_tokens_seen": 195102585, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.13391113, + "step": 9057, + "time_per_iteration": 2.524587631225586 + }, + { + "auxiliary_loss_clip": 0.01120533, + "auxiliary_loss_mlp": 0.01036572, + "balance_loss_clip": 1.04428363, + "balance_loss_mlp": 1.02334571, + "epoch": 0.5445964226664662, + "flos": 28986195611520.0, + "grad_norm": 1.9073926948835813, + "language_loss": 0.75253969, + "learning_rate": 1.8084840234665738e-06, + "loss": 0.77411073, + "num_input_tokens_seen": 195120055, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.13232422, + "step": 9058, + "time_per_iteration": 2.519216299057007 + }, + { + "auxiliary_loss_clip": 0.01059023, + "auxiliary_loss_mlp": 0.01004242, + "balance_loss_clip": 1.03237939, + "balance_loss_mlp": 1.00282478, + "epoch": 0.5446565459191343, + "flos": 68620230270720.0, + "grad_norm": 0.788838744809159, + "language_loss": 0.62715125, + "learning_rate": 1.808096355133312e-06, + "loss": 0.64778394, + "num_input_tokens_seen": 195181045, + "router_z_loss_clip": 0.26708984, + "router_z_loss_mlp": 0.01416016, + "step": 9059, + "time_per_iteration": 3.185119390487671 + }, + { + "auxiliary_loss_clip": 0.01119647, + "auxiliary_loss_mlp": 0.01033308, + "balance_loss_clip": 1.04436255, + "balance_loss_mlp": 1.02077937, + "epoch": 0.5447166691718022, + "flos": 16216469464320.0, + "grad_norm": 1.7916783156901013, + "language_loss": 0.79658651, + "learning_rate": 1.8077086940770572e-06, + "loss": 0.81811607, + "num_input_tokens_seen": 195198840, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.12536621, + "step": 9060, + "time_per_iteration": 2.4675991535186768 + }, + { + "auxiliary_loss_clip": 0.01125529, + "auxiliary_loss_mlp": 0.01034198, + "balance_loss_clip": 1.04770434, + "balance_loss_mlp": 1.02141869, + "epoch": 0.5447767924244702, + "flos": 25849039616640.0, + "grad_norm": 1.6000273178387305, + "language_loss": 0.7952413, + "learning_rate": 1.8073210403125072e-06, + "loss": 0.81683862, + "num_input_tokens_seen": 195218720, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.12780762, + "step": 9061, + "time_per_iteration": 4.049983024597168 + }, + { + "auxiliary_loss_clip": 0.01117172, + "auxiliary_loss_mlp": 0.01030227, + "balance_loss_clip": 1.04312027, + "balance_loss_mlp": 1.0186578, + "epoch": 0.5448369156771381, + "flos": 19677718897920.0, + "grad_norm": 1.7820522406629966, + "language_loss": 0.87074846, + "learning_rate": 1.8069333938543627e-06, + "loss": 0.89222246, + "num_input_tokens_seen": 195235770, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11560059, + "step": 9062, + "time_per_iteration": 2.5087239742279053 + }, + { + "auxiliary_loss_clip": 0.01132892, + "auxiliary_loss_mlp": 0.01033936, + "balance_loss_clip": 1.05176091, + "balance_loss_mlp": 1.02065563, + "epoch": 0.5448970389298061, + "flos": 19281804215040.0, + "grad_norm": 1.9241444759365756, + "language_loss": 0.82332623, + "learning_rate": 1.8065457547173233e-06, + "loss": 0.84499449, + "num_input_tokens_seen": 195254870, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.13275146, + "step": 9063, + "time_per_iteration": 2.41658353805542 + }, + { + "auxiliary_loss_clip": 0.01120233, + "auxiliary_loss_mlp": 0.01038956, + "balance_loss_clip": 1.04340839, + "balance_loss_mlp": 1.02375674, + "epoch": 0.544957162182474, + "flos": 20991690316800.0, + "grad_norm": 1.8164246173479417, + "language_loss": 0.63787746, + "learning_rate": 1.8061581229160878e-06, + "loss": 0.65946931, + "num_input_tokens_seen": 195273390, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.15209961, + "step": 9064, + "time_per_iteration": 2.473203182220459 + }, + { + "auxiliary_loss_clip": 0.01132069, + "auxiliary_loss_mlp": 0.01036113, + "balance_loss_clip": 1.05123544, + "balance_loss_mlp": 1.02257061, + "epoch": 0.545017285435142, + "flos": 25374587846400.0, + "grad_norm": 1.9704135084030223, + "language_loss": 0.80086565, + "learning_rate": 1.8057704984653566e-06, + "loss": 0.82254744, + "num_input_tokens_seen": 195295635, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.13555908, + "step": 9065, + "time_per_iteration": 2.4875738620758057 + }, + { + "auxiliary_loss_clip": 0.01118351, + "auxiliary_loss_mlp": 0.0103961, + "balance_loss_clip": 1.04294264, + "balance_loss_mlp": 1.02715242, + "epoch": 0.54507740868781, + "flos": 19134749934720.0, + "grad_norm": 2.2039806697255124, + "language_loss": 0.78527033, + "learning_rate": 1.805382881379827e-06, + "loss": 0.8068499, + "num_input_tokens_seen": 195312545, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12451172, + "step": 9066, + "time_per_iteration": 2.444578170776367 + }, + { + "auxiliary_loss_clip": 0.01126783, + "auxiliary_loss_mlp": 0.01037207, + "balance_loss_clip": 1.04448605, + "balance_loss_mlp": 1.02321744, + "epoch": 0.545137531940478, + "flos": 26249802635520.0, + "grad_norm": 1.7317561305104032, + "language_loss": 0.75767505, + "learning_rate": 1.8049952716741975e-06, + "loss": 0.77931488, + "num_input_tokens_seen": 195332955, + "router_z_loss_clip": 0.82226562, + "router_z_loss_mlp": 0.13995361, + "step": 9067, + "time_per_iteration": 2.4733753204345703 + }, + { + "auxiliary_loss_clip": 0.01133118, + "auxiliary_loss_mlp": 0.01042785, + "balance_loss_clip": 1.04909921, + "balance_loss_mlp": 1.02631044, + "epoch": 0.545197655193146, + "flos": 37555629995520.0, + "grad_norm": 1.8764218762391545, + "language_loss": 0.63631833, + "learning_rate": 1.8046076693631682e-06, + "loss": 0.65807736, + "num_input_tokens_seen": 195355930, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.16479492, + "step": 9068, + "time_per_iteration": 2.6182355880737305 + }, + { + "auxiliary_loss_clip": 0.01124976, + "auxiliary_loss_mlp": 0.01059113, + "balance_loss_clip": 1.0477109, + "balance_loss_mlp": 1.04420638, + "epoch": 0.5452577784458139, + "flos": 26031250333440.0, + "grad_norm": 1.4815915738726786, + "language_loss": 0.72495174, + "learning_rate": 1.8042200744614343e-06, + "loss": 0.74679267, + "num_input_tokens_seen": 195376445, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.14898682, + "step": 9069, + "time_per_iteration": 2.480088949203491 + }, + { + "auxiliary_loss_clip": 0.01129435, + "auxiliary_loss_mlp": 0.01032913, + "balance_loss_clip": 1.05349982, + "balance_loss_mlp": 1.02094412, + "epoch": 0.5453179016984819, + "flos": 17639034675840.0, + "grad_norm": 1.7003066765036732, + "language_loss": 0.73531783, + "learning_rate": 1.8038324869836957e-06, + "loss": 0.75694126, + "num_input_tokens_seen": 195393725, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.11968994, + "step": 9070, + "time_per_iteration": 2.464243173599243 + }, + { + "auxiliary_loss_clip": 0.0112758, + "auxiliary_loss_mlp": 0.01033763, + "balance_loss_clip": 1.05198634, + "balance_loss_mlp": 1.02145505, + "epoch": 0.5453780249511498, + "flos": 23216679406080.0, + "grad_norm": 2.1194837926295276, + "language_loss": 0.60977417, + "learning_rate": 1.8034449069446489e-06, + "loss": 0.63138759, + "num_input_tokens_seen": 195411380, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12310791, + "step": 9071, + "time_per_iteration": 2.551813840866089 + }, + { + "auxiliary_loss_clip": 0.01062522, + "auxiliary_loss_mlp": 0.01011255, + "balance_loss_clip": 1.03601313, + "balance_loss_mlp": 1.01004529, + "epoch": 0.5454381482038179, + "flos": 68696504801280.0, + "grad_norm": 0.6996328520099651, + "language_loss": 0.57055843, + "learning_rate": 1.80305733435899e-06, + "loss": 0.59129626, + "num_input_tokens_seen": 195482015, + "router_z_loss_clip": 0.26513672, + "router_z_loss_mlp": 0.01211548, + "step": 9072, + "time_per_iteration": 3.1980230808258057 + }, + { + "auxiliary_loss_clip": 0.01121174, + "auxiliary_loss_mlp": 0.01028818, + "balance_loss_clip": 1.04841471, + "balance_loss_mlp": 1.01689112, + "epoch": 0.5454982714564858, + "flos": 13260626346240.0, + "grad_norm": 1.808094147971677, + "language_loss": 0.70243835, + "learning_rate": 1.8026697692414174e-06, + "loss": 0.72393829, + "num_input_tokens_seen": 195500440, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.1192627, + "step": 9073, + "time_per_iteration": 2.4332754611968994 + }, + { + "auxiliary_loss_clip": 0.01126378, + "auxiliary_loss_mlp": 0.01033777, + "balance_loss_clip": 1.05260682, + "balance_loss_mlp": 1.02234507, + "epoch": 0.5455583947091538, + "flos": 21835878733440.0, + "grad_norm": 2.238737286452175, + "language_loss": 0.70974642, + "learning_rate": 1.802282211606627e-06, + "loss": 0.73134798, + "num_input_tokens_seen": 195520860, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11419678, + "step": 9074, + "time_per_iteration": 2.501300573348999 + }, + { + "auxiliary_loss_clip": 0.01123396, + "auxiliary_loss_mlp": 0.01035742, + "balance_loss_clip": 1.04647493, + "balance_loss_mlp": 1.02339196, + "epoch": 0.5456185179618217, + "flos": 17817438551040.0, + "grad_norm": 1.7799203520590943, + "language_loss": 0.68710792, + "learning_rate": 1.8018946614693148e-06, + "loss": 0.70869929, + "num_input_tokens_seen": 195538615, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.12347412, + "step": 9075, + "time_per_iteration": 2.444241523742676 + }, + { + "auxiliary_loss_clip": 0.01126442, + "auxiliary_loss_mlp": 0.01034789, + "balance_loss_clip": 1.05005121, + "balance_loss_mlp": 1.0234766, + "epoch": 0.5456786412144897, + "flos": 21069401391360.0, + "grad_norm": 1.6246597098882232, + "language_loss": 0.81208509, + "learning_rate": 1.8015071188441768e-06, + "loss": 0.83369744, + "num_input_tokens_seen": 195557460, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.11309814, + "step": 9076, + "time_per_iteration": 2.495781898498535 + }, + { + "auxiliary_loss_clip": 0.0112813, + "auxiliary_loss_mlp": 0.01033341, + "balance_loss_clip": 1.05270219, + "balance_loss_mlp": 1.02150917, + "epoch": 0.5457387644671576, + "flos": 23294965098240.0, + "grad_norm": 1.6848193032338197, + "language_loss": 0.80582726, + "learning_rate": 1.8011195837459089e-06, + "loss": 0.82744199, + "num_input_tokens_seen": 195577985, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.11834717, + "step": 9077, + "time_per_iteration": 2.4724395275115967 + }, + { + "auxiliary_loss_clip": 0.01130604, + "auxiliary_loss_mlp": 0.01029928, + "balance_loss_clip": 1.05220008, + "balance_loss_mlp": 1.01809084, + "epoch": 0.5457988877198257, + "flos": 21617039122560.0, + "grad_norm": 1.770427365683446, + "language_loss": 0.67626673, + "learning_rate": 1.8007320561892064e-06, + "loss": 0.69787204, + "num_input_tokens_seen": 195597620, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.11846924, + "step": 9078, + "time_per_iteration": 2.531381130218506 + }, + { + "auxiliary_loss_clip": 0.01129632, + "auxiliary_loss_mlp": 0.01034296, + "balance_loss_clip": 1.0515728, + "balance_loss_mlp": 1.021523, + "epoch": 0.5458590109724936, + "flos": 23762485543680.0, + "grad_norm": 1.941878292960551, + "language_loss": 0.80668509, + "learning_rate": 1.800344536188764e-06, + "loss": 0.82832438, + "num_input_tokens_seen": 195615910, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.12768555, + "step": 9079, + "time_per_iteration": 2.4710898399353027 + }, + { + "auxiliary_loss_clip": 0.01135614, + "auxiliary_loss_mlp": 0.01035143, + "balance_loss_clip": 1.05444598, + "balance_loss_mlp": 1.02148795, + "epoch": 0.5459191342251616, + "flos": 24424283675520.0, + "grad_norm": 1.7512370136774735, + "language_loss": 0.75801754, + "learning_rate": 1.799957023759277e-06, + "loss": 0.77972513, + "num_input_tokens_seen": 195635620, + "router_z_loss_clip": 0.81201172, + "router_z_loss_mlp": 0.13653564, + "step": 9080, + "time_per_iteration": 2.504974603652954 + }, + { + "auxiliary_loss_clip": 0.01131625, + "auxiliary_loss_mlp": 0.01039574, + "balance_loss_clip": 1.05029917, + "balance_loss_mlp": 1.02618635, + "epoch": 0.5459792574778296, + "flos": 23623009032960.0, + "grad_norm": 4.108257548027377, + "language_loss": 0.83263838, + "learning_rate": 1.7995695189154392e-06, + "loss": 0.85435033, + "num_input_tokens_seen": 195652495, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.1338501, + "step": 9081, + "time_per_iteration": 2.6224799156188965 + }, + { + "auxiliary_loss_clip": 0.01137957, + "auxiliary_loss_mlp": 0.01031457, + "balance_loss_clip": 1.05465913, + "balance_loss_mlp": 1.01800406, + "epoch": 0.5460393807304975, + "flos": 19135540033920.0, + "grad_norm": 1.5395650207675131, + "language_loss": 0.69970208, + "learning_rate": 1.7991820216719461e-06, + "loss": 0.72139621, + "num_input_tokens_seen": 195671965, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.13458252, + "step": 9082, + "time_per_iteration": 4.081507444381714 + }, + { + "auxiliary_loss_clip": 0.01122512, + "auxiliary_loss_mlp": 0.01024735, + "balance_loss_clip": 1.04867911, + "balance_loss_mlp": 1.01283145, + "epoch": 0.5460995039831655, + "flos": 35918534805120.0, + "grad_norm": 1.6640222126312327, + "language_loss": 0.66325212, + "learning_rate": 1.7987945320434906e-06, + "loss": 0.68472451, + "num_input_tokens_seen": 195694725, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11895752, + "step": 9083, + "time_per_iteration": 2.6674857139587402 + }, + { + "auxiliary_loss_clip": 0.01130059, + "auxiliary_loss_mlp": 0.01037058, + "balance_loss_clip": 1.05136943, + "balance_loss_mlp": 1.02432644, + "epoch": 0.5461596272358334, + "flos": 26759231274240.0, + "grad_norm": 1.733656962734459, + "language_loss": 0.7926054, + "learning_rate": 1.798407050044766e-06, + "loss": 0.81427658, + "num_input_tokens_seen": 195714090, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.1272583, + "step": 9084, + "time_per_iteration": 2.5294110774993896 + }, + { + "auxiliary_loss_clip": 0.01132672, + "auxiliary_loss_mlp": 0.01033459, + "balance_loss_clip": 1.0546751, + "balance_loss_mlp": 1.02054286, + "epoch": 0.5462197504885015, + "flos": 20886580143360.0, + "grad_norm": 1.7818342600884851, + "language_loss": 0.75047147, + "learning_rate": 1.7980195756904675e-06, + "loss": 0.77213275, + "num_input_tokens_seen": 195733585, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.12908936, + "step": 9085, + "time_per_iteration": 2.5287702083587646 + }, + { + "auxiliary_loss_clip": 0.01131306, + "auxiliary_loss_mlp": 0.01031735, + "balance_loss_clip": 1.05335331, + "balance_loss_mlp": 1.01900911, + "epoch": 0.5462798737411694, + "flos": 25804976607360.0, + "grad_norm": 1.7325528597288875, + "language_loss": 0.74812603, + "learning_rate": 1.7976321089952857e-06, + "loss": 0.76975644, + "num_input_tokens_seen": 195752820, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.12713623, + "step": 9086, + "time_per_iteration": 2.524374008178711 + }, + { + "auxiliary_loss_clip": 0.01127578, + "auxiliary_loss_mlp": 0.0103236, + "balance_loss_clip": 1.05004525, + "balance_loss_mlp": 1.01982522, + "epoch": 0.5463399969938374, + "flos": 25775027642880.0, + "grad_norm": 1.644852773335504, + "language_loss": 0.76668435, + "learning_rate": 1.7972446499739155e-06, + "loss": 0.78828377, + "num_input_tokens_seen": 195773740, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.12536621, + "step": 9087, + "time_per_iteration": 2.522038459777832 + }, + { + "auxiliary_loss_clip": 0.01138185, + "auxiliary_loss_mlp": 0.01040009, + "balance_loss_clip": 1.05583072, + "balance_loss_mlp": 1.02558494, + "epoch": 0.5464001202465053, + "flos": 18843298980480.0, + "grad_norm": 1.7257123614489218, + "language_loss": 0.77665854, + "learning_rate": 1.7968571986410484e-06, + "loss": 0.79844046, + "num_input_tokens_seen": 195792125, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.144104, + "step": 9088, + "time_per_iteration": 3.890815019607544 + }, + { + "auxiliary_loss_clip": 0.01064262, + "auxiliary_loss_mlp": 0.01002604, + "balance_loss_clip": 1.03769195, + "balance_loss_mlp": 1.0013032, + "epoch": 0.5464602434991733, + "flos": 69049541623680.0, + "grad_norm": 0.7342189617652606, + "language_loss": 0.57719326, + "learning_rate": 1.7964697550113758e-06, + "loss": 0.59786189, + "num_input_tokens_seen": 195854935, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.01301575, + "step": 9089, + "time_per_iteration": 3.109407424926758 + }, + { + "auxiliary_loss_clip": 0.01125072, + "auxiliary_loss_mlp": 0.01035376, + "balance_loss_clip": 1.04766929, + "balance_loss_mlp": 1.02248323, + "epoch": 0.5465203667518412, + "flos": 27560039040000.0, + "grad_norm": 1.746255945267879, + "language_loss": 0.76893753, + "learning_rate": 1.7960823190995918e-06, + "loss": 0.79054201, + "num_input_tokens_seen": 195874715, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.12890625, + "step": 9090, + "time_per_iteration": 2.5181260108947754 + }, + { + "auxiliary_loss_clip": 0.01132051, + "auxiliary_loss_mlp": 0.01035736, + "balance_loss_clip": 1.05073845, + "balance_loss_mlp": 1.0213356, + "epoch": 0.5465804900045093, + "flos": 21210206705280.0, + "grad_norm": 2.056251508240483, + "language_loss": 0.73615754, + "learning_rate": 1.7956948909203855e-06, + "loss": 0.75783539, + "num_input_tokens_seen": 195892610, + "router_z_loss_clip": 0.81347656, + "router_z_loss_mlp": 0.14404297, + "step": 9091, + "time_per_iteration": 2.492387056350708 + }, + { + "auxiliary_loss_clip": 0.01130499, + "auxiliary_loss_mlp": 0.01034252, + "balance_loss_clip": 1.05290473, + "balance_loss_mlp": 1.02071548, + "epoch": 0.5466406132571772, + "flos": 22488949860480.0, + "grad_norm": 1.705021670352777, + "language_loss": 0.7787779, + "learning_rate": 1.7953074704884498e-06, + "loss": 0.80042541, + "num_input_tokens_seen": 195911085, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.13525391, + "step": 9092, + "time_per_iteration": 2.447070837020874 + }, + { + "auxiliary_loss_clip": 0.0113373, + "auxiliary_loss_mlp": 0.01034233, + "balance_loss_clip": 1.05134237, + "balance_loss_mlp": 1.0204643, + "epoch": 0.5467007365098452, + "flos": 17675843137920.0, + "grad_norm": 2.227805187345564, + "language_loss": 0.74956906, + "learning_rate": 1.794920057818476e-06, + "loss": 0.7712487, + "num_input_tokens_seen": 195929845, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.13781738, + "step": 9093, + "time_per_iteration": 2.548945188522339 + }, + { + "auxiliary_loss_clip": 0.01129808, + "auxiliary_loss_mlp": 0.01037394, + "balance_loss_clip": 1.04995918, + "balance_loss_mlp": 1.02297592, + "epoch": 0.5467608597625132, + "flos": 15698852524800.0, + "grad_norm": 2.0741856279202584, + "language_loss": 0.68852937, + "learning_rate": 1.7945326529251533e-06, + "loss": 0.71020138, + "num_input_tokens_seen": 195946350, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.14422607, + "step": 9094, + "time_per_iteration": 3.9352915287017822 + }, + { + "auxiliary_loss_clip": 0.01138301, + "auxiliary_loss_mlp": 0.01034053, + "balance_loss_clip": 1.05982637, + "balance_loss_mlp": 1.02229929, + "epoch": 0.5468209830151811, + "flos": 24312816794880.0, + "grad_norm": 3.6730154282954928, + "language_loss": 0.67675018, + "learning_rate": 1.7941452558231731e-06, + "loss": 0.69847369, + "num_input_tokens_seen": 195959840, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.11761475, + "step": 9095, + "time_per_iteration": 2.5003130435943604 + }, + { + "auxiliary_loss_clip": 0.01134083, + "auxiliary_loss_mlp": 0.01032945, + "balance_loss_clip": 1.05552101, + "balance_loss_mlp": 1.02089334, + "epoch": 0.5468811062678491, + "flos": 29166323339520.0, + "grad_norm": 1.5022637429282284, + "language_loss": 0.66772079, + "learning_rate": 1.7937578665272256e-06, + "loss": 0.68939102, + "num_input_tokens_seen": 195981125, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.12054443, + "step": 9096, + "time_per_iteration": 2.6001904010772705 + }, + { + "auxiliary_loss_clip": 0.01087967, + "auxiliary_loss_mlp": 0.01005166, + "balance_loss_clip": 1.06172943, + "balance_loss_mlp": 1.00300813, + "epoch": 0.546941229520517, + "flos": 67867037982720.0, + "grad_norm": 0.7333057276608305, + "language_loss": 0.57499552, + "learning_rate": 1.7933704850520007e-06, + "loss": 0.59592688, + "num_input_tokens_seen": 196038880, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.02160645, + "step": 9097, + "time_per_iteration": 3.1893534660339355 + }, + { + "auxiliary_loss_clip": 0.01064422, + "auxiliary_loss_mlp": 0.01008467, + "balance_loss_clip": 1.0371604, + "balance_loss_mlp": 1.00715005, + "epoch": 0.5470013527731851, + "flos": 58270306625280.0, + "grad_norm": 0.9102962252471374, + "language_loss": 0.64736736, + "learning_rate": 1.7929831114121868e-06, + "loss": 0.66809618, + "num_input_tokens_seen": 196099215, + "router_z_loss_clip": 0.27246094, + "router_z_loss_mlp": 0.01316833, + "step": 9098, + "time_per_iteration": 3.033716917037964 + }, + { + "auxiliary_loss_clip": 0.01126075, + "auxiliary_loss_mlp": 0.01036411, + "balance_loss_clip": 1.04861403, + "balance_loss_mlp": 1.02332759, + "epoch": 0.547061476025853, + "flos": 22965915582720.0, + "grad_norm": 4.631579882898769, + "language_loss": 0.7344082, + "learning_rate": 1.7925957456224753e-06, + "loss": 0.75603306, + "num_input_tokens_seen": 196120370, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.13085938, + "step": 9099, + "time_per_iteration": 2.505312204360962 + }, + { + "auxiliary_loss_clip": 0.0112858, + "auxiliary_loss_mlp": 0.01032588, + "balance_loss_clip": 1.05137527, + "balance_loss_mlp": 1.0201546, + "epoch": 0.547121599278521, + "flos": 29968244426880.0, + "grad_norm": 1.8989967917173858, + "language_loss": 0.72701395, + "learning_rate": 1.7922083876975537e-06, + "loss": 0.7486257, + "num_input_tokens_seen": 196139075, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.12432861, + "step": 9100, + "time_per_iteration": 2.5141971111297607 + }, + { + "auxiliary_loss_clip": 0.01126808, + "auxiliary_loss_mlp": 0.01032422, + "balance_loss_clip": 1.05038106, + "balance_loss_mlp": 1.0191716, + "epoch": 0.5471817225311889, + "flos": 36535443914880.0, + "grad_norm": 1.8051842019271218, + "language_loss": 0.68309408, + "learning_rate": 1.7918210376521102e-06, + "loss": 0.7046864, + "num_input_tokens_seen": 196159990, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.13244629, + "step": 9101, + "time_per_iteration": 2.6404266357421875 + }, + { + "auxiliary_loss_clip": 0.01128693, + "auxiliary_loss_mlp": 0.01030678, + "balance_loss_clip": 1.05199265, + "balance_loss_mlp": 1.01782703, + "epoch": 0.5472418457838569, + "flos": 25775243124480.0, + "grad_norm": 2.0765915624802487, + "language_loss": 0.78285563, + "learning_rate": 1.7914336955008343e-06, + "loss": 0.80444932, + "num_input_tokens_seen": 196180570, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.128479, + "step": 9102, + "time_per_iteration": 2.499000310897827 + }, + { + "auxiliary_loss_clip": 0.0113407, + "auxiliary_loss_mlp": 0.0103635, + "balance_loss_clip": 1.05733395, + "balance_loss_mlp": 1.02333808, + "epoch": 0.5473019690365248, + "flos": 27887687925120.0, + "grad_norm": 1.5395125337106, + "language_loss": 0.72197574, + "learning_rate": 1.791046361258413e-06, + "loss": 0.74367994, + "num_input_tokens_seen": 196200300, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.13018799, + "step": 9103, + "time_per_iteration": 2.500826835632324 + }, + { + "auxiliary_loss_clip": 0.0112349, + "auxiliary_loss_mlp": 0.01029151, + "balance_loss_clip": 1.04661369, + "balance_loss_mlp": 1.01625276, + "epoch": 0.5473620922891929, + "flos": 57631490219520.0, + "grad_norm": 1.6253840264448778, + "language_loss": 0.65540999, + "learning_rate": 1.7906590349395356e-06, + "loss": 0.67693639, + "num_input_tokens_seen": 196228525, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.12902832, + "step": 9104, + "time_per_iteration": 4.3611085414886475 + }, + { + "auxiliary_loss_clip": 0.01132529, + "auxiliary_loss_mlp": 0.01032409, + "balance_loss_clip": 1.05325139, + "balance_loss_mlp": 1.0184257, + "epoch": 0.5474222155418608, + "flos": 19354056422400.0, + "grad_norm": 1.7906977977425524, + "language_loss": 0.81557262, + "learning_rate": 1.790271716558888e-06, + "loss": 0.83722198, + "num_input_tokens_seen": 196247690, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.13995361, + "step": 9105, + "time_per_iteration": 2.46317458152771 + }, + { + "auxiliary_loss_clip": 0.01127041, + "auxiliary_loss_mlp": 0.01028318, + "balance_loss_clip": 1.05073214, + "balance_loss_mlp": 1.01605165, + "epoch": 0.5474823387945288, + "flos": 25120448144640.0, + "grad_norm": 1.747017493063883, + "language_loss": 0.80414426, + "learning_rate": 1.7898844061311575e-06, + "loss": 0.82569778, + "num_input_tokens_seen": 196268555, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.12268066, + "step": 9106, + "time_per_iteration": 2.5204708576202393 + }, + { + "auxiliary_loss_clip": 0.01123452, + "auxiliary_loss_mlp": 0.01039521, + "balance_loss_clip": 1.04638302, + "balance_loss_mlp": 1.02525795, + "epoch": 0.5475424620471967, + "flos": 18004174381440.0, + "grad_norm": 1.7872080378654693, + "language_loss": 0.69548059, + "learning_rate": 1.7894971036710322e-06, + "loss": 0.71711028, + "num_input_tokens_seen": 196285585, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.14276123, + "step": 9107, + "time_per_iteration": 2.438828468322754 + }, + { + "auxiliary_loss_clip": 0.01133429, + "auxiliary_loss_mlp": 0.01039986, + "balance_loss_clip": 1.05048156, + "balance_loss_mlp": 1.02618778, + "epoch": 0.5476025852998647, + "flos": 22309324922880.0, + "grad_norm": 1.9455511213574908, + "language_loss": 0.63326895, + "learning_rate": 1.789109809193197e-06, + "loss": 0.65500307, + "num_input_tokens_seen": 196305085, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.13800049, + "step": 9108, + "time_per_iteration": 2.4783408641815186 + }, + { + "auxiliary_loss_clip": 0.01129841, + "auxiliary_loss_mlp": 0.01028447, + "balance_loss_clip": 1.05202651, + "balance_loss_mlp": 1.01641297, + "epoch": 0.5476627085525327, + "flos": 20120497850880.0, + "grad_norm": 1.6736056130828036, + "language_loss": 0.74894691, + "learning_rate": 1.7887225227123396e-06, + "loss": 0.77052981, + "num_input_tokens_seen": 196323945, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.12023926, + "step": 9109, + "time_per_iteration": 2.505662679672241 + }, + { + "auxiliary_loss_clip": 0.01124997, + "auxiliary_loss_mlp": 0.01035815, + "balance_loss_clip": 1.04788017, + "balance_loss_mlp": 1.02130163, + "epoch": 0.5477228318052006, + "flos": 17712579772800.0, + "grad_norm": 1.8190625321440554, + "language_loss": 0.7771402, + "learning_rate": 1.7883352442431457e-06, + "loss": 0.79874837, + "num_input_tokens_seen": 196342200, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.14520264, + "step": 9110, + "time_per_iteration": 2.434654951095581 + }, + { + "auxiliary_loss_clip": 0.01130742, + "auxiliary_loss_mlp": 0.010306, + "balance_loss_clip": 1.05402541, + "balance_loss_mlp": 1.01853037, + "epoch": 0.5477829550578687, + "flos": 25848895962240.0, + "grad_norm": 1.4755635322052048, + "language_loss": 0.7110396, + "learning_rate": 1.7879479738002993e-06, + "loss": 0.73265302, + "num_input_tokens_seen": 196362940, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.12060547, + "step": 9111, + "time_per_iteration": 2.5087380409240723 + }, + { + "auxiliary_loss_clip": 0.0112753, + "auxiliary_loss_mlp": 0.01034104, + "balance_loss_clip": 1.05011964, + "balance_loss_mlp": 1.02164102, + "epoch": 0.5478430783105366, + "flos": 23039676161280.0, + "grad_norm": 1.8400629253601466, + "language_loss": 0.71274704, + "learning_rate": 1.7875607113984876e-06, + "loss": 0.73436332, + "num_input_tokens_seen": 196383070, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.12451172, + "step": 9112, + "time_per_iteration": 2.4407947063446045 + }, + { + "auxiliary_loss_clip": 0.01128868, + "auxiliary_loss_mlp": 0.01031089, + "balance_loss_clip": 1.0512898, + "balance_loss_mlp": 1.01858354, + "epoch": 0.5479032015632046, + "flos": 16071210864000.0, + "grad_norm": 3.0783031877717137, + "language_loss": 0.87816757, + "learning_rate": 1.7871734570523953e-06, + "loss": 0.8997671, + "num_input_tokens_seen": 196398485, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.12518311, + "step": 9113, + "time_per_iteration": 2.444410800933838 + }, + { + "auxiliary_loss_clip": 0.0112955, + "auxiliary_loss_mlp": 0.01035927, + "balance_loss_clip": 1.04978943, + "balance_loss_mlp": 1.02178311, + "epoch": 0.5479633248158725, + "flos": 24278701852800.0, + "grad_norm": 1.545040200675051, + "language_loss": 0.73075223, + "learning_rate": 1.7867862107767067e-06, + "loss": 0.75240695, + "num_input_tokens_seen": 196417725, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.14135742, + "step": 9114, + "time_per_iteration": 2.4684336185455322 + }, + { + "auxiliary_loss_clip": 0.01124534, + "auxiliary_loss_mlp": 0.01029928, + "balance_loss_clip": 1.0487752, + "balance_loss_mlp": 1.01812637, + "epoch": 0.5480234480685405, + "flos": 26358216860160.0, + "grad_norm": 2.756262192553591, + "language_loss": 0.72126424, + "learning_rate": 1.7863989725861066e-06, + "loss": 0.74280888, + "num_input_tokens_seen": 196437840, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.11798096, + "step": 9115, + "time_per_iteration": 2.5399086475372314 + }, + { + "auxiliary_loss_clip": 0.01120923, + "auxiliary_loss_mlp": 0.01035499, + "balance_loss_clip": 1.04241335, + "balance_loss_mlp": 1.022017, + "epoch": 0.5480835713212084, + "flos": 22055077480320.0, + "grad_norm": 2.1626070380148494, + "language_loss": 0.71622401, + "learning_rate": 1.7860117424952781e-06, + "loss": 0.7377882, + "num_input_tokens_seen": 196457300, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.13476562, + "step": 9116, + "time_per_iteration": 2.5272319316864014 + }, + { + "auxiliary_loss_clip": 0.01127907, + "auxiliary_loss_mlp": 0.01034974, + "balance_loss_clip": 1.0516243, + "balance_loss_mlp": 1.0222007, + "epoch": 0.5481436945738765, + "flos": 25301042749440.0, + "grad_norm": 1.8699378771475919, + "language_loss": 0.76448697, + "learning_rate": 1.7856245205189063e-06, + "loss": 0.78611583, + "num_input_tokens_seen": 196476720, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.12780762, + "step": 9117, + "time_per_iteration": 2.5184290409088135 + }, + { + "auxiliary_loss_clip": 0.01112771, + "auxiliary_loss_mlp": 0.01027146, + "balance_loss_clip": 1.03983879, + "balance_loss_mlp": 1.0153209, + "epoch": 0.5482038178265444, + "flos": 33580857772800.0, + "grad_norm": 1.676840056554097, + "language_loss": 0.6295867, + "learning_rate": 1.785237306671674e-06, + "loss": 0.65098584, + "num_input_tokens_seen": 196496765, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.1182251, + "step": 9118, + "time_per_iteration": 2.587329149246216 + }, + { + "auxiliary_loss_clip": 0.01127362, + "auxiliary_loss_mlp": 0.01029055, + "balance_loss_clip": 1.04887402, + "balance_loss_mlp": 1.0151906, + "epoch": 0.5482639410792124, + "flos": 19026192055680.0, + "grad_norm": 1.6574187457528948, + "language_loss": 0.79469311, + "learning_rate": 1.7848501009682646e-06, + "loss": 0.81625724, + "num_input_tokens_seen": 196516220, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.13861084, + "step": 9119, + "time_per_iteration": 2.4825186729431152 + }, + { + "auxiliary_loss_clip": 0.01125699, + "auxiliary_loss_mlp": 0.01032683, + "balance_loss_clip": 1.05166304, + "balance_loss_mlp": 1.0214119, + "epoch": 0.5483240643318803, + "flos": 25410318900480.0, + "grad_norm": 1.7271071604525108, + "language_loss": 0.82418031, + "learning_rate": 1.7844629034233604e-06, + "loss": 0.8457641, + "num_input_tokens_seen": 196533860, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11273193, + "step": 9120, + "time_per_iteration": 2.476161479949951 + }, + { + "auxiliary_loss_clip": 0.01127767, + "auxiliary_loss_mlp": 0.01037949, + "balance_loss_clip": 1.04985237, + "balance_loss_mlp": 1.02322114, + "epoch": 0.5483841875845483, + "flos": 21466896272640.0, + "grad_norm": 1.6908509091852144, + "language_loss": 0.80314106, + "learning_rate": 1.7840757140516455e-06, + "loss": 0.82479823, + "num_input_tokens_seen": 196551305, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.1473999, + "step": 9121, + "time_per_iteration": 2.475236654281616 + }, + { + "auxiliary_loss_clip": 0.0112889, + "auxiliary_loss_mlp": 0.01033487, + "balance_loss_clip": 1.04850793, + "balance_loss_mlp": 1.01965296, + "epoch": 0.5484443108372163, + "flos": 24747263792640.0, + "grad_norm": 2.755247741333691, + "language_loss": 0.61107349, + "learning_rate": 1.7836885328678008e-06, + "loss": 0.63269728, + "num_input_tokens_seen": 196569420, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.1383667, + "step": 9122, + "time_per_iteration": 2.507086753845215 + }, + { + "auxiliary_loss_clip": 0.01123116, + "auxiliary_loss_mlp": 0.01033254, + "balance_loss_clip": 1.04789972, + "balance_loss_mlp": 1.0224123, + "epoch": 0.5485044340898843, + "flos": 25375377945600.0, + "grad_norm": 1.7710860362010177, + "language_loss": 0.7189827, + "learning_rate": 1.7833013598865084e-06, + "loss": 0.74054646, + "num_input_tokens_seen": 196590610, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.10852051, + "step": 9123, + "time_per_iteration": 2.531926393508911 + }, + { + "auxiliary_loss_clip": 0.0112182, + "auxiliary_loss_mlp": 0.01029546, + "balance_loss_clip": 1.04550207, + "balance_loss_mlp": 1.01774454, + "epoch": 0.5485645573425523, + "flos": 12641167370880.0, + "grad_norm": 1.8536833847813021, + "language_loss": 0.83628392, + "learning_rate": 1.7829141951224505e-06, + "loss": 0.85779756, + "num_input_tokens_seen": 196606495, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.11798096, + "step": 9124, + "time_per_iteration": 2.4380295276641846 + }, + { + "auxiliary_loss_clip": 0.01126646, + "auxiliary_loss_mlp": 0.01030313, + "balance_loss_clip": 1.05101454, + "balance_loss_mlp": 1.01768255, + "epoch": 0.5486246805952202, + "flos": 28329425383680.0, + "grad_norm": 1.6936072650982823, + "language_loss": 0.80282724, + "learning_rate": 1.7825270385903075e-06, + "loss": 0.82439679, + "num_input_tokens_seen": 196626365, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.1262207, + "step": 9125, + "time_per_iteration": 3.931346893310547 + }, + { + "auxiliary_loss_clip": 0.01127324, + "auxiliary_loss_mlp": 0.01028096, + "balance_loss_clip": 1.04962373, + "balance_loss_mlp": 1.01535225, + "epoch": 0.5486848038478882, + "flos": 16800017817600.0, + "grad_norm": 2.024034157775054, + "language_loss": 0.74472302, + "learning_rate": 1.7821398903047617e-06, + "loss": 0.76627719, + "num_input_tokens_seen": 196644465, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.12750244, + "step": 9126, + "time_per_iteration": 2.4181129932403564 + }, + { + "auxiliary_loss_clip": 0.01120531, + "auxiliary_loss_mlp": 0.01039649, + "balance_loss_clip": 1.04158831, + "balance_loss_mlp": 1.02487922, + "epoch": 0.5487449271005561, + "flos": 17236224581760.0, + "grad_norm": 2.9914294625374924, + "language_loss": 0.67357051, + "learning_rate": 1.7817527502804928e-06, + "loss": 0.69517231, + "num_input_tokens_seen": 196659160, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.14776611, + "step": 9127, + "time_per_iteration": 2.3896660804748535 + }, + { + "auxiliary_loss_clip": 0.0112705, + "auxiliary_loss_mlp": 0.01039497, + "balance_loss_clip": 1.0479579, + "balance_loss_mlp": 1.02541828, + "epoch": 0.5488050503532241, + "flos": 17340867878400.0, + "grad_norm": 1.6553497811374234, + "language_loss": 0.82958215, + "learning_rate": 1.781365618532181e-06, + "loss": 0.85124761, + "num_input_tokens_seen": 196677410, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.14074707, + "step": 9128, + "time_per_iteration": 2.489649534225464 + }, + { + "auxiliary_loss_clip": 0.01129358, + "auxiliary_loss_mlp": 0.01053299, + "balance_loss_clip": 1.04727042, + "balance_loss_mlp": 1.03873205, + "epoch": 0.548865173605892, + "flos": 17239169496960.0, + "grad_norm": 1.8326425493667962, + "language_loss": 0.7390027, + "learning_rate": 1.7809784950745078e-06, + "loss": 0.76082927, + "num_input_tokens_seen": 196696765, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.14569092, + "step": 9129, + "time_per_iteration": 2.4051296710968018 + }, + { + "auxiliary_loss_clip": 0.01130562, + "auxiliary_loss_mlp": 0.01034802, + "balance_loss_clip": 1.04820704, + "balance_loss_mlp": 1.02056801, + "epoch": 0.5489252968585601, + "flos": 17456716218240.0, + "grad_norm": 3.4780377155216744, + "language_loss": 0.63255906, + "learning_rate": 1.7805913799221511e-06, + "loss": 0.65421271, + "num_input_tokens_seen": 196714895, + "router_z_loss_clip": 0.82275391, + "router_z_loss_mlp": 0.14227295, + "step": 9130, + "time_per_iteration": 2.4428927898406982 + }, + { + "auxiliary_loss_clip": 0.01123847, + "auxiliary_loss_mlp": 0.01035526, + "balance_loss_clip": 1.04469049, + "balance_loss_mlp": 1.02265131, + "epoch": 0.548985420111228, + "flos": 26323383646080.0, + "grad_norm": 1.9574055870696005, + "language_loss": 0.63050467, + "learning_rate": 1.7802042730897915e-06, + "loss": 0.65209842, + "num_input_tokens_seen": 196735510, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.12896729, + "step": 9131, + "time_per_iteration": 2.461915969848633 + }, + { + "auxiliary_loss_clip": 0.01127834, + "auxiliary_loss_mlp": 0.01029478, + "balance_loss_clip": 1.04975605, + "balance_loss_mlp": 1.01609647, + "epoch": 0.549045543363896, + "flos": 18693730748160.0, + "grad_norm": 1.716144238805276, + "language_loss": 0.74719167, + "learning_rate": 1.7798171745921084e-06, + "loss": 0.76876473, + "num_input_tokens_seen": 196752855, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.1338501, + "step": 9132, + "time_per_iteration": 3.8235628604888916 + }, + { + "auxiliary_loss_clip": 0.0112666, + "auxiliary_loss_mlp": 0.01030194, + "balance_loss_clip": 1.04850435, + "balance_loss_mlp": 1.01779056, + "epoch": 0.5491056666165639, + "flos": 24717386655360.0, + "grad_norm": 1.5690054119002494, + "language_loss": 0.81339884, + "learning_rate": 1.7794300844437795e-06, + "loss": 0.83496737, + "num_input_tokens_seen": 196772230, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.12420654, + "step": 9133, + "time_per_iteration": 2.449073076248169 + }, + { + "auxiliary_loss_clip": 0.01125514, + "auxiliary_loss_mlp": 0.01031047, + "balance_loss_clip": 1.04883552, + "balance_loss_mlp": 1.01832712, + "epoch": 0.5491657898692319, + "flos": 21576926609280.0, + "grad_norm": 3.0116237207034287, + "language_loss": 0.70139718, + "learning_rate": 1.7790430026594841e-06, + "loss": 0.72296274, + "num_input_tokens_seen": 196790405, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.1270752, + "step": 9134, + "time_per_iteration": 2.442192554473877 + }, + { + "auxiliary_loss_clip": 0.01134056, + "auxiliary_loss_mlp": 0.01035169, + "balance_loss_clip": 1.0546689, + "balance_loss_mlp": 1.02280092, + "epoch": 0.5492259131219, + "flos": 50476432746240.0, + "grad_norm": 2.803597689079373, + "language_loss": 0.61175931, + "learning_rate": 1.7786559292539004e-06, + "loss": 0.63345158, + "num_input_tokens_seen": 196813785, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.12359619, + "step": 9135, + "time_per_iteration": 2.69352388381958 + }, + { + "auxiliary_loss_clip": 0.01131729, + "auxiliary_loss_mlp": 0.01034346, + "balance_loss_clip": 1.05187464, + "balance_loss_mlp": 1.02056563, + "epoch": 0.5492860363745679, + "flos": 25119262995840.0, + "grad_norm": 2.3710632376487943, + "language_loss": 0.72651553, + "learning_rate": 1.7782688642417058e-06, + "loss": 0.74817634, + "num_input_tokens_seen": 196834390, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.13775635, + "step": 9136, + "time_per_iteration": 2.508124589920044 + }, + { + "auxiliary_loss_clip": 0.01124255, + "auxiliary_loss_mlp": 0.01036293, + "balance_loss_clip": 1.04151154, + "balance_loss_mlp": 1.02179074, + "epoch": 0.5493461596272359, + "flos": 22633777497600.0, + "grad_norm": 4.9106311491243355, + "language_loss": 0.6811235, + "learning_rate": 1.7778818076375781e-06, + "loss": 0.70272899, + "num_input_tokens_seen": 196853290, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.14508057, + "step": 9137, + "time_per_iteration": 2.5159597396850586 + }, + { + "auxiliary_loss_clip": 0.01064972, + "auxiliary_loss_mlp": 0.01003525, + "balance_loss_clip": 1.03904295, + "balance_loss_mlp": 1.00223327, + "epoch": 0.5494062828799038, + "flos": 66151800754560.0, + "grad_norm": 0.7405679538643022, + "language_loss": 0.65232712, + "learning_rate": 1.7774947594561947e-06, + "loss": 0.67301214, + "num_input_tokens_seen": 196913120, + "router_z_loss_clip": 0.25927734, + "router_z_loss_mlp": 0.01290894, + "step": 9138, + "time_per_iteration": 3.1343159675598145 + }, + { + "auxiliary_loss_clip": 0.01131251, + "auxiliary_loss_mlp": 0.01031927, + "balance_loss_clip": 1.05191731, + "balance_loss_mlp": 1.01904047, + "epoch": 0.5494664061325718, + "flos": 21105958458240.0, + "grad_norm": 3.3024861049173917, + "language_loss": 0.75215173, + "learning_rate": 1.7771077197122321e-06, + "loss": 0.77378345, + "num_input_tokens_seen": 196931530, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.12890625, + "step": 9139, + "time_per_iteration": 3.9166605472564697 + }, + { + "auxiliary_loss_clip": 0.01130238, + "auxiliary_loss_mlp": 0.01029616, + "balance_loss_clip": 1.05317354, + "balance_loss_mlp": 1.01737928, + "epoch": 0.5495265293852397, + "flos": 14392566616320.0, + "grad_norm": 1.9930702421583104, + "language_loss": 0.71165812, + "learning_rate": 1.7767206884203672e-06, + "loss": 0.7332567, + "num_input_tokens_seen": 196949430, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.12231445, + "step": 9140, + "time_per_iteration": 2.432396411895752 + }, + { + "auxiliary_loss_clip": 0.01120156, + "auxiliary_loss_mlp": 0.01044475, + "balance_loss_clip": 1.04265428, + "balance_loss_mlp": 1.02910876, + "epoch": 0.5495866526379077, + "flos": 25549148966400.0, + "grad_norm": 1.6411993319000373, + "language_loss": 0.76509178, + "learning_rate": 1.7763336655952762e-06, + "loss": 0.7867381, + "num_input_tokens_seen": 196968265, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.15356445, + "step": 9141, + "time_per_iteration": 2.49538254737854 + }, + { + "auxiliary_loss_clip": 0.01120873, + "auxiliary_loss_mlp": 0.01033484, + "balance_loss_clip": 1.04575956, + "balance_loss_mlp": 1.02118218, + "epoch": 0.5496467758905756, + "flos": 21317256213120.0, + "grad_norm": 1.7913069735748668, + "language_loss": 0.74997246, + "learning_rate": 1.7759466512516346e-06, + "loss": 0.77151597, + "num_input_tokens_seen": 196984930, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.12304688, + "step": 9142, + "time_per_iteration": 2.451714038848877 + }, + { + "auxiliary_loss_clip": 0.01138598, + "auxiliary_loss_mlp": 0.01043517, + "balance_loss_clip": 1.05433989, + "balance_loss_mlp": 1.02926528, + "epoch": 0.5497068991432437, + "flos": 22233086305920.0, + "grad_norm": 2.145925886366669, + "language_loss": 0.76742089, + "learning_rate": 1.7755596454041192e-06, + "loss": 0.78924197, + "num_input_tokens_seen": 197002320, + "router_z_loss_clip": 0.84277344, + "router_z_loss_mlp": 0.14251709, + "step": 9143, + "time_per_iteration": 2.4931435585021973 + }, + { + "auxiliary_loss_clip": 0.01122269, + "auxiliary_loss_mlp": 0.01033009, + "balance_loss_clip": 1.046821, + "balance_loss_mlp": 1.02077234, + "epoch": 0.5497670223959116, + "flos": 18479093028480.0, + "grad_norm": 3.2352769800731345, + "language_loss": 0.79875791, + "learning_rate": 1.7751726480674044e-06, + "loss": 0.82031059, + "num_input_tokens_seen": 197020825, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12243652, + "step": 9144, + "time_per_iteration": 2.4613089561462402 + }, + { + "auxiliary_loss_clip": 0.01131597, + "auxiliary_loss_mlp": 0.01031703, + "balance_loss_clip": 1.05279756, + "balance_loss_mlp": 1.01908493, + "epoch": 0.5498271456485796, + "flos": 29205107049600.0, + "grad_norm": 1.7293003971224017, + "language_loss": 0.71195048, + "learning_rate": 1.7747856592561645e-06, + "loss": 0.73358345, + "num_input_tokens_seen": 197040450, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.1262207, + "step": 9145, + "time_per_iteration": 2.623661994934082 + }, + { + "auxiliary_loss_clip": 0.01130938, + "auxiliary_loss_mlp": 0.01033531, + "balance_loss_clip": 1.05114794, + "balance_loss_mlp": 1.02121687, + "epoch": 0.5498872689012475, + "flos": 34824372664320.0, + "grad_norm": 1.577396272905536, + "language_loss": 0.7053051, + "learning_rate": 1.774398678985076e-06, + "loss": 0.72694981, + "num_input_tokens_seen": 197063930, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.12322998, + "step": 9146, + "time_per_iteration": 2.6703052520751953 + }, + { + "auxiliary_loss_clip": 0.01120647, + "auxiliary_loss_mlp": 0.01030335, + "balance_loss_clip": 1.04699731, + "balance_loss_mlp": 1.01914728, + "epoch": 0.5499473921539155, + "flos": 25921938268800.0, + "grad_norm": 1.8008217457440987, + "language_loss": 0.63837028, + "learning_rate": 1.7740117072688113e-06, + "loss": 0.65988004, + "num_input_tokens_seen": 197082660, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11193848, + "step": 9147, + "time_per_iteration": 2.507230758666992 + }, + { + "auxiliary_loss_clip": 0.0113701, + "auxiliary_loss_mlp": 0.01031244, + "balance_loss_clip": 1.05888343, + "balance_loss_mlp": 1.018435, + "epoch": 0.5500075154065835, + "flos": 22273701609600.0, + "grad_norm": 2.0315668545972376, + "language_loss": 0.81119996, + "learning_rate": 1.7736247441220458e-06, + "loss": 0.83288252, + "num_input_tokens_seen": 197100675, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.12811279, + "step": 9148, + "time_per_iteration": 3.9868617057800293 + }, + { + "auxiliary_loss_clip": 0.01129871, + "auxiliary_loss_mlp": 0.01034356, + "balance_loss_clip": 1.05031657, + "balance_loss_mlp": 1.02145791, + "epoch": 0.5500676386592515, + "flos": 28037507552640.0, + "grad_norm": 1.7691795769824274, + "language_loss": 0.78919768, + "learning_rate": 1.773237789559453e-06, + "loss": 0.81083995, + "num_input_tokens_seen": 197121320, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.12902832, + "step": 9149, + "time_per_iteration": 2.5073585510253906 + }, + { + "auxiliary_loss_clip": 0.01129975, + "auxiliary_loss_mlp": 0.01025919, + "balance_loss_clip": 1.05328441, + "balance_loss_mlp": 1.01349187, + "epoch": 0.5501277619119195, + "flos": 23914819123200.0, + "grad_norm": 1.9229782104430961, + "language_loss": 0.72278762, + "learning_rate": 1.7728508435957052e-06, + "loss": 0.74434662, + "num_input_tokens_seen": 197138965, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.12438965, + "step": 9150, + "time_per_iteration": 2.5497474670410156 + }, + { + "auxiliary_loss_clip": 0.01131119, + "auxiliary_loss_mlp": 0.01027111, + "balance_loss_clip": 1.05191612, + "balance_loss_mlp": 1.01377761, + "epoch": 0.5501878851645874, + "flos": 20923783655040.0, + "grad_norm": 1.7760696204518227, + "language_loss": 0.74935687, + "learning_rate": 1.772463906245477e-06, + "loss": 0.77093917, + "num_input_tokens_seen": 197156460, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.13330078, + "step": 9151, + "time_per_iteration": 2.438321590423584 + }, + { + "auxiliary_loss_clip": 0.01122553, + "auxiliary_loss_mlp": 0.01032439, + "balance_loss_clip": 1.04680419, + "balance_loss_mlp": 1.02047622, + "epoch": 0.5502480084172554, + "flos": 20665298407680.0, + "grad_norm": 1.849657320860589, + "language_loss": 0.76231712, + "learning_rate": 1.7720769775234394e-06, + "loss": 0.78386706, + "num_input_tokens_seen": 197175140, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.11950684, + "step": 9152, + "time_per_iteration": 2.469390869140625 + }, + { + "auxiliary_loss_clip": 0.01119745, + "auxiliary_loss_mlp": 0.01030847, + "balance_loss_clip": 1.04509163, + "balance_loss_mlp": 1.01867557, + "epoch": 0.5503081316699233, + "flos": 26432552056320.0, + "grad_norm": 1.787383597740431, + "language_loss": 0.82469034, + "learning_rate": 1.7716900574442662e-06, + "loss": 0.84619623, + "num_input_tokens_seen": 197194345, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.1217041, + "step": 9153, + "time_per_iteration": 2.5705156326293945 + }, + { + "auxiliary_loss_clip": 0.01126254, + "auxiliary_loss_mlp": 0.01033937, + "balance_loss_clip": 1.05129862, + "balance_loss_mlp": 1.02099109, + "epoch": 0.5503682549225913, + "flos": 30629144718720.0, + "grad_norm": 2.458240807787056, + "language_loss": 0.74063128, + "learning_rate": 1.7713031460226294e-06, + "loss": 0.76223326, + "num_input_tokens_seen": 197215535, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.1293335, + "step": 9154, + "time_per_iteration": 2.5735716819763184 + }, + { + "auxiliary_loss_clip": 0.01131121, + "auxiliary_loss_mlp": 0.01033433, + "balance_loss_clip": 1.05094576, + "balance_loss_mlp": 1.01981902, + "epoch": 0.5504283781752592, + "flos": 22565439872640.0, + "grad_norm": 2.508482578641778, + "language_loss": 0.7287122, + "learning_rate": 1.770916243273199e-06, + "loss": 0.75035775, + "num_input_tokens_seen": 197234945, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.1361084, + "step": 9155, + "time_per_iteration": 2.4473118782043457 + }, + { + "auxiliary_loss_clip": 0.01061703, + "auxiliary_loss_mlp": 0.01006742, + "balance_loss_clip": 1.03512001, + "balance_loss_mlp": 1.00547731, + "epoch": 0.5504885014279273, + "flos": 67901009270400.0, + "grad_norm": 0.7420214988232344, + "language_loss": 0.55357921, + "learning_rate": 1.7705293492106483e-06, + "loss": 0.57426363, + "num_input_tokens_seen": 197302285, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.01264954, + "step": 9156, + "time_per_iteration": 3.252563953399658 + }, + { + "auxiliary_loss_clip": 0.01123981, + "auxiliary_loss_mlp": 0.01033372, + "balance_loss_clip": 1.04809952, + "balance_loss_mlp": 1.02118337, + "epoch": 0.5505486246805952, + "flos": 22450058409600.0, + "grad_norm": 1.7246650467126992, + "language_loss": 0.83141446, + "learning_rate": 1.7701424638496475e-06, + "loss": 0.852988, + "num_input_tokens_seen": 197321575, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.12188721, + "step": 9157, + "time_per_iteration": 2.4491374492645264 + }, + { + "auxiliary_loss_clip": 0.01131498, + "auxiliary_loss_mlp": 0.01036611, + "balance_loss_clip": 1.04882801, + "balance_loss_mlp": 1.02205515, + "epoch": 0.5506087479332632, + "flos": 26906896085760.0, + "grad_norm": 2.1946773893170968, + "language_loss": 0.75665653, + "learning_rate": 1.7697555872048677e-06, + "loss": 0.77833766, + "num_input_tokens_seen": 197340255, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.14562988, + "step": 9158, + "time_per_iteration": 2.537430763244629 + }, + { + "auxiliary_loss_clip": 0.01124173, + "auxiliary_loss_mlp": 0.01032384, + "balance_loss_clip": 1.05052412, + "balance_loss_mlp": 1.02015924, + "epoch": 0.5506688711859311, + "flos": 22930256355840.0, + "grad_norm": 1.6975541000645318, + "language_loss": 0.69434643, + "learning_rate": 1.769368719290979e-06, + "loss": 0.71591204, + "num_input_tokens_seen": 197360360, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.12231445, + "step": 9159, + "time_per_iteration": 2.4778566360473633 + }, + { + "auxiliary_loss_clip": 0.01128546, + "auxiliary_loss_mlp": 0.01032912, + "balance_loss_clip": 1.05092478, + "balance_loss_mlp": 1.0197041, + "epoch": 0.5507289944385991, + "flos": 29606408772480.0, + "grad_norm": 2.2153662567773584, + "language_loss": 0.68290728, + "learning_rate": 1.7689818601226516e-06, + "loss": 0.70452189, + "num_input_tokens_seen": 197381905, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.13201904, + "step": 9160, + "time_per_iteration": 2.576282262802124 + }, + { + "auxiliary_loss_clip": 0.01115942, + "auxiliary_loss_mlp": 0.01034602, + "balance_loss_clip": 1.04171181, + "balance_loss_mlp": 1.02238882, + "epoch": 0.5507891176912671, + "flos": 15334431091200.0, + "grad_norm": 1.7869690396308058, + "language_loss": 0.71222699, + "learning_rate": 1.7685950097145552e-06, + "loss": 0.7337324, + "num_input_tokens_seen": 197398555, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.12213135, + "step": 9161, + "time_per_iteration": 2.4441404342651367 + }, + { + "auxiliary_loss_clip": 0.01127255, + "auxiliary_loss_mlp": 0.01038585, + "balance_loss_clip": 1.05125177, + "balance_loss_mlp": 1.0263896, + "epoch": 0.5508492409439351, + "flos": 26578313447040.0, + "grad_norm": 1.6162559790010758, + "language_loss": 0.69519603, + "learning_rate": 1.768208168081359e-06, + "loss": 0.71685439, + "num_input_tokens_seen": 197419630, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.12194824, + "step": 9162, + "time_per_iteration": 2.6139824390411377 + }, + { + "auxiliary_loss_clip": 0.01118489, + "auxiliary_loss_mlp": 0.01038253, + "balance_loss_clip": 1.04343677, + "balance_loss_mlp": 1.02542627, + "epoch": 0.5509093641966031, + "flos": 25443428261760.0, + "grad_norm": 1.6738346058078382, + "language_loss": 0.85706931, + "learning_rate": 1.767821335237733e-06, + "loss": 0.87863678, + "num_input_tokens_seen": 197438480, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.1282959, + "step": 9163, + "time_per_iteration": 2.500411033630371 + }, + { + "auxiliary_loss_clip": 0.01126061, + "auxiliary_loss_mlp": 0.01032106, + "balance_loss_clip": 1.05063045, + "balance_loss_mlp": 1.01994646, + "epoch": 0.550969487449271, + "flos": 18698543170560.0, + "grad_norm": 2.3772767327033235, + "language_loss": 0.80461311, + "learning_rate": 1.7674345111983441e-06, + "loss": 0.82619476, + "num_input_tokens_seen": 197456755, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12145996, + "step": 9164, + "time_per_iteration": 2.45497989654541 + }, + { + "auxiliary_loss_clip": 0.01124029, + "auxiliary_loss_mlp": 0.01029462, + "balance_loss_clip": 1.04610312, + "balance_loss_mlp": 1.01631296, + "epoch": 0.551029610701939, + "flos": 22708723224960.0, + "grad_norm": 1.7763546127084529, + "language_loss": 0.73612618, + "learning_rate": 1.767047695977863e-06, + "loss": 0.75766104, + "num_input_tokens_seen": 197475530, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.13140869, + "step": 9165, + "time_per_iteration": 2.446068525314331 + }, + { + "auxiliary_loss_clip": 0.01115958, + "auxiliary_loss_mlp": 0.01029153, + "balance_loss_clip": 1.0403477, + "balance_loss_mlp": 1.0169642, + "epoch": 0.5510897339546069, + "flos": 12420496166400.0, + "grad_norm": 2.079461566543487, + "language_loss": 0.7941916, + "learning_rate": 1.7666608895909563e-06, + "loss": 0.81564277, + "num_input_tokens_seen": 197490835, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12182617, + "step": 9166, + "time_per_iteration": 2.4334442615509033 + }, + { + "auxiliary_loss_clip": 0.0112773, + "auxiliary_loss_mlp": 0.01037806, + "balance_loss_clip": 1.04835176, + "balance_loss_mlp": 1.02339911, + "epoch": 0.5511498572072749, + "flos": 18770579896320.0, + "grad_norm": 2.219425139441517, + "language_loss": 0.7667824, + "learning_rate": 1.7662740920522913e-06, + "loss": 0.78843772, + "num_input_tokens_seen": 197508770, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.14385986, + "step": 9167, + "time_per_iteration": 2.4665310382843018 + }, + { + "auxiliary_loss_clip": 0.01126567, + "auxiliary_loss_mlp": 0.01031516, + "balance_loss_clip": 1.05048203, + "balance_loss_mlp": 1.01820016, + "epoch": 0.5512099804599428, + "flos": 19573326996480.0, + "grad_norm": 1.9408724470441443, + "language_loss": 0.80230087, + "learning_rate": 1.7658873033765374e-06, + "loss": 0.82388169, + "num_input_tokens_seen": 197527340, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.13311768, + "step": 9168, + "time_per_iteration": 2.4602627754211426 + }, + { + "auxiliary_loss_clip": 0.01128168, + "auxiliary_loss_mlp": 0.01037917, + "balance_loss_clip": 1.04930782, + "balance_loss_mlp": 1.0235641, + "epoch": 0.5512701037126109, + "flos": 26245600744320.0, + "grad_norm": 2.2338883700412255, + "language_loss": 0.68788272, + "learning_rate": 1.7655005235783591e-06, + "loss": 0.70954359, + "num_input_tokens_seen": 197547280, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.14355469, + "step": 9169, + "time_per_iteration": 3.9859402179718018 + }, + { + "auxiliary_loss_clip": 0.01119666, + "auxiliary_loss_mlp": 0.01026676, + "balance_loss_clip": 1.04538202, + "balance_loss_mlp": 1.01398611, + "epoch": 0.5513302269652788, + "flos": 21945406279680.0, + "grad_norm": 1.9797219038967473, + "language_loss": 0.85492659, + "learning_rate": 1.7651137526724251e-06, + "loss": 0.87639004, + "num_input_tokens_seen": 197565045, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12670898, + "step": 9170, + "time_per_iteration": 2.437499523162842 + }, + { + "auxiliary_loss_clip": 0.01058463, + "auxiliary_loss_mlp": 0.0100381, + "balance_loss_clip": 1.03128815, + "balance_loss_mlp": 1.00246429, + "epoch": 0.5513903502179468, + "flos": 68235948616320.0, + "grad_norm": 0.7805990759156753, + "language_loss": 0.59844583, + "learning_rate": 1.7647269906734017e-06, + "loss": 0.61906856, + "num_input_tokens_seen": 197625005, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 0.01345825, + "step": 9171, + "time_per_iteration": 3.107865810394287 + }, + { + "auxiliary_loss_clip": 0.01125938, + "auxiliary_loss_mlp": 0.01036398, + "balance_loss_clip": 1.04958594, + "balance_loss_mlp": 1.02361882, + "epoch": 0.5514504734706147, + "flos": 18734238311040.0, + "grad_norm": 1.450645797360506, + "language_loss": 0.70380282, + "learning_rate": 1.7643402375959533e-06, + "loss": 0.7254262, + "num_input_tokens_seen": 197645050, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.12786865, + "step": 9172, + "time_per_iteration": 2.4405455589294434 + }, + { + "auxiliary_loss_clip": 0.01118716, + "auxiliary_loss_mlp": 0.01035092, + "balance_loss_clip": 1.04268491, + "balance_loss_mlp": 1.02239013, + "epoch": 0.5515105967232827, + "flos": 22270972176000.0, + "grad_norm": 1.730628083481248, + "language_loss": 0.75928891, + "learning_rate": 1.7639534934547474e-06, + "loss": 0.78082705, + "num_input_tokens_seen": 197663910, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12701416, + "step": 9173, + "time_per_iteration": 2.4651741981506348 + }, + { + "auxiliary_loss_clip": 0.01134227, + "auxiliary_loss_mlp": 0.01032257, + "balance_loss_clip": 1.05710125, + "balance_loss_mlp": 1.01938188, + "epoch": 0.5515707199759508, + "flos": 22557682535040.0, + "grad_norm": 1.6310007017110741, + "language_loss": 0.7510671, + "learning_rate": 1.7635667582644484e-06, + "loss": 0.7727319, + "num_input_tokens_seen": 197681580, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.12878418, + "step": 9174, + "time_per_iteration": 2.4754719734191895 + }, + { + "auxiliary_loss_clip": 0.01130454, + "auxiliary_loss_mlp": 0.01032019, + "balance_loss_clip": 1.05280113, + "balance_loss_mlp": 1.0192399, + "epoch": 0.5516308432286187, + "flos": 28291072636800.0, + "grad_norm": 1.9503045289715557, + "language_loss": 0.73371446, + "learning_rate": 1.7631800320397217e-06, + "loss": 0.75533915, + "num_input_tokens_seen": 197702095, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.12774658, + "step": 9175, + "time_per_iteration": 3.90635347366333 + }, + { + "auxiliary_loss_clip": 0.01128711, + "auxiliary_loss_mlp": 0.01036549, + "balance_loss_clip": 1.04997993, + "balance_loss_mlp": 1.02368021, + "epoch": 0.5516909664812867, + "flos": 18764474584320.0, + "grad_norm": 1.8802135275587957, + "language_loss": 0.69269079, + "learning_rate": 1.7627933147952318e-06, + "loss": 0.71434337, + "num_input_tokens_seen": 197720720, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.12866211, + "step": 9176, + "time_per_iteration": 2.450625419616699 + }, + { + "auxiliary_loss_clip": 0.0112734, + "auxiliary_loss_mlp": 0.01033213, + "balance_loss_clip": 1.05064988, + "balance_loss_mlp": 1.02069592, + "epoch": 0.5517510897339546, + "flos": 27740346336000.0, + "grad_norm": 1.5760802252104702, + "language_loss": 0.71301782, + "learning_rate": 1.7624066065456435e-06, + "loss": 0.73462331, + "num_input_tokens_seen": 197741820, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.12524414, + "step": 9177, + "time_per_iteration": 2.5136916637420654 + }, + { + "auxiliary_loss_clip": 0.01129349, + "auxiliary_loss_mlp": 0.01030006, + "balance_loss_clip": 1.05167675, + "balance_loss_mlp": 1.01745284, + "epoch": 0.5518112129866226, + "flos": 18404470523520.0, + "grad_norm": 1.656679263559422, + "language_loss": 0.80119497, + "learning_rate": 1.7620199073056204e-06, + "loss": 0.82278848, + "num_input_tokens_seen": 197759160, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.12561035, + "step": 9178, + "time_per_iteration": 2.4343326091766357 + }, + { + "auxiliary_loss_clip": 0.01123337, + "auxiliary_loss_mlp": 0.01043913, + "balance_loss_clip": 1.04305518, + "balance_loss_mlp": 1.02936387, + "epoch": 0.5518713362392905, + "flos": 25082670015360.0, + "grad_norm": 1.5744022809500768, + "language_loss": 0.74913132, + "learning_rate": 1.761633217089826e-06, + "loss": 0.77080381, + "num_input_tokens_seen": 197779760, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.14538574, + "step": 9179, + "time_per_iteration": 2.506239652633667 + }, + { + "auxiliary_loss_clip": 0.01119166, + "auxiliary_loss_mlp": 0.01040973, + "balance_loss_clip": 1.04303956, + "balance_loss_mlp": 1.0277288, + "epoch": 0.5519314594919585, + "flos": 36538999361280.0, + "grad_norm": 1.688900262658396, + "language_loss": 0.70212775, + "learning_rate": 1.761246535912924e-06, + "loss": 0.72372913, + "num_input_tokens_seen": 197801545, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.13256836, + "step": 9180, + "time_per_iteration": 2.597104072570801 + }, + { + "auxiliary_loss_clip": 0.01129079, + "auxiliary_loss_mlp": 0.01034474, + "balance_loss_clip": 1.04980564, + "balance_loss_mlp": 1.02118182, + "epoch": 0.5519915827446265, + "flos": 20448613612800.0, + "grad_norm": 2.0298389029274126, + "language_loss": 0.67092931, + "learning_rate": 1.7608598637895776e-06, + "loss": 0.69256485, + "num_input_tokens_seen": 197820760, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.13293457, + "step": 9181, + "time_per_iteration": 2.4599690437316895 + }, + { + "auxiliary_loss_clip": 0.01128182, + "auxiliary_loss_mlp": 0.01033952, + "balance_loss_clip": 1.04858136, + "balance_loss_mlp": 1.0204457, + "epoch": 0.5520517059972945, + "flos": 23768052151680.0, + "grad_norm": 1.9191089613085837, + "language_loss": 0.79481965, + "learning_rate": 1.7604732007344486e-06, + "loss": 0.816441, + "num_input_tokens_seen": 197840195, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.13525391, + "step": 9182, + "time_per_iteration": 3.911684513092041 + }, + { + "auxiliary_loss_clip": 0.01122092, + "auxiliary_loss_mlp": 0.01038298, + "balance_loss_clip": 1.04480886, + "balance_loss_mlp": 1.02309859, + "epoch": 0.5521118292499624, + "flos": 22196457411840.0, + "grad_norm": 2.385505544227983, + "language_loss": 0.82747465, + "learning_rate": 1.7600865467622003e-06, + "loss": 0.84907854, + "num_input_tokens_seen": 197859475, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.15203857, + "step": 9183, + "time_per_iteration": 2.4386813640594482 + }, + { + "auxiliary_loss_clip": 0.01127133, + "auxiliary_loss_mlp": 0.01032412, + "balance_loss_clip": 1.04770458, + "balance_loss_mlp": 1.01932919, + "epoch": 0.5521719525026304, + "flos": 23583291569280.0, + "grad_norm": 1.306981937176197, + "language_loss": 0.67423582, + "learning_rate": 1.7596999018874936e-06, + "loss": 0.6958313, + "num_input_tokens_seen": 197879395, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.13079834, + "step": 9184, + "time_per_iteration": 2.57188081741333 + }, + { + "auxiliary_loss_clip": 0.01123044, + "auxiliary_loss_mlp": 0.01026945, + "balance_loss_clip": 1.04551303, + "balance_loss_mlp": 1.01355147, + "epoch": 0.5522320757552983, + "flos": 26137617482880.0, + "grad_norm": 1.6298515096770552, + "language_loss": 0.76095939, + "learning_rate": 1.7593132661249917e-06, + "loss": 0.78245926, + "num_input_tokens_seen": 197900815, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.13397217, + "step": 9185, + "time_per_iteration": 2.521394968032837 + }, + { + "auxiliary_loss_clip": 0.01131416, + "auxiliary_loss_mlp": 0.0103884, + "balance_loss_clip": 1.05012679, + "balance_loss_mlp": 1.02516675, + "epoch": 0.5522921990079663, + "flos": 24676160820480.0, + "grad_norm": 1.7241433416931509, + "language_loss": 0.74575543, + "learning_rate": 1.7589266394893536e-06, + "loss": 0.76745802, + "num_input_tokens_seen": 197918985, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.13671875, + "step": 9186, + "time_per_iteration": 2.5231497287750244 + }, + { + "auxiliary_loss_clip": 0.01133416, + "auxiliary_loss_mlp": 0.01035416, + "balance_loss_clip": 1.05335069, + "balance_loss_mlp": 1.02292907, + "epoch": 0.5523523222606344, + "flos": 22748153379840.0, + "grad_norm": 2.042716218252293, + "language_loss": 0.66678733, + "learning_rate": 1.7585400219952421e-06, + "loss": 0.68847567, + "num_input_tokens_seen": 197937725, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.12487793, + "step": 9187, + "time_per_iteration": 2.4628119468688965 + }, + { + "auxiliary_loss_clip": 0.01130014, + "auxiliary_loss_mlp": 0.01035455, + "balance_loss_clip": 1.05001962, + "balance_loss_mlp": 1.02194834, + "epoch": 0.5524124455133023, + "flos": 19755825022080.0, + "grad_norm": 1.5814931099354332, + "language_loss": 0.77795154, + "learning_rate": 1.758153413657318e-06, + "loss": 0.7996062, + "num_input_tokens_seen": 197955635, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.13500977, + "step": 9188, + "time_per_iteration": 2.4739387035369873 + }, + { + "auxiliary_loss_clip": 0.01129653, + "auxiliary_loss_mlp": 0.01032962, + "balance_loss_clip": 1.05222845, + "balance_loss_mlp": 1.01930618, + "epoch": 0.5524725687659703, + "flos": 23294821443840.0, + "grad_norm": 1.921121539199938, + "language_loss": 0.80902243, + "learning_rate": 1.7577668144902394e-06, + "loss": 0.83064866, + "num_input_tokens_seen": 197974490, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.13653564, + "step": 9189, + "time_per_iteration": 2.508949041366577 + }, + { + "auxiliary_loss_clip": 0.0112505, + "auxiliary_loss_mlp": 0.01028944, + "balance_loss_clip": 1.04733562, + "balance_loss_mlp": 1.01487136, + "epoch": 0.5525326920186382, + "flos": 24862178378880.0, + "grad_norm": 1.5046575629946226, + "language_loss": 0.76772511, + "learning_rate": 1.7573802245086684e-06, + "loss": 0.78926504, + "num_input_tokens_seen": 197995735, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.14068604, + "step": 9190, + "time_per_iteration": 2.574918031692505 + }, + { + "auxiliary_loss_clip": 0.0113786, + "auxiliary_loss_mlp": 0.01037451, + "balance_loss_clip": 1.05649924, + "balance_loss_mlp": 1.0224545, + "epoch": 0.5525928152713062, + "flos": 13735580906880.0, + "grad_norm": 2.3064272509971326, + "language_loss": 0.78704453, + "learning_rate": 1.7569936437272627e-06, + "loss": 0.80879766, + "num_input_tokens_seen": 198009685, + "router_z_loss_clip": 0.81494141, + "router_z_loss_mlp": 0.14990234, + "step": 9191, + "time_per_iteration": 3.9286410808563232 + }, + { + "auxiliary_loss_clip": 0.01133416, + "auxiliary_loss_mlp": 0.01034753, + "balance_loss_clip": 1.05439389, + "balance_loss_mlp": 1.02159834, + "epoch": 0.5526529385239741, + "flos": 13071592045440.0, + "grad_norm": 2.7527622406264256, + "language_loss": 0.68708396, + "learning_rate": 1.7566070721606829e-06, + "loss": 0.70876569, + "num_input_tokens_seen": 198026845, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.13146973, + "step": 9192, + "time_per_iteration": 2.4410109519958496 + }, + { + "auxiliary_loss_clip": 0.01143034, + "auxiliary_loss_mlp": 0.01031698, + "balance_loss_clip": 1.06515455, + "balance_loss_mlp": 1.02019429, + "epoch": 0.5527130617766421, + "flos": 23148377694720.0, + "grad_norm": 1.4837507592783798, + "language_loss": 0.77328503, + "learning_rate": 1.756220509823588e-06, + "loss": 0.79503232, + "num_input_tokens_seen": 198045275, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.11505127, + "step": 9193, + "time_per_iteration": 2.4910054206848145 + }, + { + "auxiliary_loss_clip": 0.01123688, + "auxiliary_loss_mlp": 0.01031513, + "balance_loss_clip": 1.04781187, + "balance_loss_mlp": 1.01936603, + "epoch": 0.55277318502931, + "flos": 21285547482240.0, + "grad_norm": 1.4791296144139063, + "language_loss": 0.78489512, + "learning_rate": 1.7558339567306344e-06, + "loss": 0.80644715, + "num_input_tokens_seen": 198065760, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.121521, + "step": 9194, + "time_per_iteration": 2.446348190307617 + }, + { + "auxiliary_loss_clip": 0.01134695, + "auxiliary_loss_mlp": 0.01033668, + "balance_loss_clip": 1.052809, + "balance_loss_mlp": 1.01989341, + "epoch": 0.5528333082819781, + "flos": 38324549462400.0, + "grad_norm": 2.0186927713315583, + "language_loss": 0.69965625, + "learning_rate": 1.7554474128964825e-06, + "loss": 0.72133988, + "num_input_tokens_seen": 198087595, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.13775635, + "step": 9195, + "time_per_iteration": 2.6135311126708984 + }, + { + "auxiliary_loss_clip": 0.01143144, + "auxiliary_loss_mlp": 0.01033908, + "balance_loss_clip": 1.05688822, + "balance_loss_mlp": 1.01936412, + "epoch": 0.552893431534646, + "flos": 13553621585280.0, + "grad_norm": 1.9812360209059554, + "language_loss": 0.74163914, + "learning_rate": 1.7550608783357887e-06, + "loss": 0.76340967, + "num_input_tokens_seen": 198104620, + "router_z_loss_clip": 0.86279297, + "router_z_loss_mlp": 0.14520264, + "step": 9196, + "time_per_iteration": 2.4159581661224365 + }, + { + "auxiliary_loss_clip": 0.01128669, + "auxiliary_loss_mlp": 0.01032418, + "balance_loss_clip": 1.05211687, + "balance_loss_mlp": 1.01999068, + "epoch": 0.552953554787314, + "flos": 21939408708480.0, + "grad_norm": 1.5073998355566454, + "language_loss": 0.76918972, + "learning_rate": 1.7546743530632115e-06, + "loss": 0.79080063, + "num_input_tokens_seen": 198123565, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.12432861, + "step": 9197, + "time_per_iteration": 2.4967596530914307 + }, + { + "auxiliary_loss_clip": 0.0112747, + "auxiliary_loss_mlp": 0.01029804, + "balance_loss_clip": 1.05041289, + "balance_loss_mlp": 1.01797795, + "epoch": 0.5530136780399819, + "flos": 43658002558080.0, + "grad_norm": 1.566132900515399, + "language_loss": 0.76060206, + "learning_rate": 1.754287837093407e-06, + "loss": 0.78217483, + "num_input_tokens_seen": 198148270, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.1182251, + "step": 9198, + "time_per_iteration": 2.6523587703704834 + }, + { + "auxiliary_loss_clip": 0.01123329, + "auxiliary_loss_mlp": 0.01028604, + "balance_loss_clip": 1.0473274, + "balance_loss_mlp": 1.01629591, + "epoch": 0.5530738012926499, + "flos": 25045502417280.0, + "grad_norm": 1.5556821812023027, + "language_loss": 0.7907263, + "learning_rate": 1.7539013304410327e-06, + "loss": 0.81224561, + "num_input_tokens_seen": 198168810, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.12310791, + "step": 9199, + "time_per_iteration": 2.5055129528045654 + }, + { + "auxiliary_loss_clip": 0.01126996, + "auxiliary_loss_mlp": 0.01033369, + "balance_loss_clip": 1.04910207, + "balance_loss_mlp": 1.02088761, + "epoch": 0.553133924545318, + "flos": 16472081623680.0, + "grad_norm": 1.9817202917724615, + "language_loss": 0.63973725, + "learning_rate": 1.7535148331207443e-06, + "loss": 0.66134089, + "num_input_tokens_seen": 198186200, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.12481689, + "step": 9200, + "time_per_iteration": 2.4317216873168945 + }, + { + "auxiliary_loss_clip": 0.01147882, + "auxiliary_loss_mlp": 0.01029659, + "balance_loss_clip": 1.06422412, + "balance_loss_mlp": 1.01547933, + "epoch": 0.5531940477979859, + "flos": 24606207083520.0, + "grad_norm": 1.5280531235986616, + "language_loss": 0.65966243, + "learning_rate": 1.7531283451471978e-06, + "loss": 0.68143785, + "num_input_tokens_seen": 198207050, + "router_z_loss_clip": 0.83691406, + "router_z_loss_mlp": 0.1416626, + "step": 9201, + "time_per_iteration": 2.5312986373901367 + }, + { + "auxiliary_loss_clip": 0.0112935, + "auxiliary_loss_mlp": 0.01033146, + "balance_loss_clip": 1.05238509, + "balance_loss_mlp": 1.01966906, + "epoch": 0.5532541710506539, + "flos": 22159577122560.0, + "grad_norm": 2.238175327405593, + "language_loss": 0.60744596, + "learning_rate": 1.7527418665350502e-06, + "loss": 0.62907088, + "num_input_tokens_seen": 198224565, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.13482666, + "step": 9202, + "time_per_iteration": 2.453237771987915 + }, + { + "auxiliary_loss_clip": 0.01123132, + "auxiliary_loss_mlp": 0.01029032, + "balance_loss_clip": 1.0486834, + "balance_loss_mlp": 1.01685452, + "epoch": 0.5533142943033218, + "flos": 21397265758080.0, + "grad_norm": 3.0498416078461332, + "language_loss": 0.64120555, + "learning_rate": 1.7523553972989548e-06, + "loss": 0.66272724, + "num_input_tokens_seen": 198244790, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.12176514, + "step": 9203, + "time_per_iteration": 2.7199296951293945 + }, + { + "auxiliary_loss_clip": 0.01125332, + "auxiliary_loss_mlp": 0.01030417, + "balance_loss_clip": 1.04907012, + "balance_loss_mlp": 1.01810873, + "epoch": 0.5533744175559898, + "flos": 23550541344000.0, + "grad_norm": 1.5081801264978394, + "language_loss": 0.63667685, + "learning_rate": 1.7519689374535683e-06, + "loss": 0.65823436, + "num_input_tokens_seen": 198264375, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12310791, + "step": 9204, + "time_per_iteration": 2.616964340209961 + }, + { + "auxiliary_loss_clip": 0.01121219, + "auxiliary_loss_mlp": 0.01025047, + "balance_loss_clip": 1.04572415, + "balance_loss_mlp": 1.01353717, + "epoch": 0.5534345408086577, + "flos": 24061514267520.0, + "grad_norm": 2.0655971064979872, + "language_loss": 0.77647209, + "learning_rate": 1.7515824870135445e-06, + "loss": 0.79793477, + "num_input_tokens_seen": 198283895, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.11505127, + "step": 9205, + "time_per_iteration": 2.593284845352173 + }, + { + "auxiliary_loss_clip": 0.0112311, + "auxiliary_loss_mlp": 0.010379, + "balance_loss_clip": 1.04833412, + "balance_loss_mlp": 1.02489424, + "epoch": 0.5534946640613257, + "flos": 33771831408000.0, + "grad_norm": 1.4200065012747232, + "language_loss": 0.72656965, + "learning_rate": 1.751196045993537e-06, + "loss": 0.74817967, + "num_input_tokens_seen": 198310035, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.13000488, + "step": 9206, + "time_per_iteration": 2.5908472537994385 + }, + { + "auxiliary_loss_clip": 0.01124437, + "auxiliary_loss_mlp": 0.01035809, + "balance_loss_clip": 1.04563653, + "balance_loss_mlp": 1.02270222, + "epoch": 0.5535547873139937, + "flos": 15159223526400.0, + "grad_norm": 1.9549488816206892, + "language_loss": 0.75226712, + "learning_rate": 1.7508096144082012e-06, + "loss": 0.77386957, + "num_input_tokens_seen": 198327810, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.13110352, + "step": 9207, + "time_per_iteration": 2.5289578437805176 + }, + { + "auxiliary_loss_clip": 0.01127645, + "auxiliary_loss_mlp": 0.0103917, + "balance_loss_clip": 1.04664266, + "balance_loss_mlp": 1.02501392, + "epoch": 0.5536149105666617, + "flos": 16980863817600.0, + "grad_norm": 2.2556707820125146, + "language_loss": 0.61538106, + "learning_rate": 1.750423192272189e-06, + "loss": 0.6370492, + "num_input_tokens_seen": 198343150, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.14160156, + "step": 9208, + "time_per_iteration": 2.4138998985290527 + }, + { + "auxiliary_loss_clip": 0.01125601, + "auxiliary_loss_mlp": 0.01030538, + "balance_loss_clip": 1.04735899, + "balance_loss_mlp": 1.01814651, + "epoch": 0.5536750338193296, + "flos": 18149935772160.0, + "grad_norm": 2.033523298757623, + "language_loss": 0.64480114, + "learning_rate": 1.7500367796001547e-06, + "loss": 0.66636252, + "num_input_tokens_seen": 198360925, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.12390137, + "step": 9209, + "time_per_iteration": 2.510662794113159 + }, + { + "auxiliary_loss_clip": 0.01126075, + "auxiliary_loss_mlp": 0.01041432, + "balance_loss_clip": 1.04796553, + "balance_loss_mlp": 1.02779484, + "epoch": 0.5537351570719976, + "flos": 22747794243840.0, + "grad_norm": 1.9593937217220243, + "language_loss": 0.82651222, + "learning_rate": 1.7496503764067513e-06, + "loss": 0.84818733, + "num_input_tokens_seen": 198379265, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.13659668, + "step": 9210, + "time_per_iteration": 2.448079824447632 + }, + { + "auxiliary_loss_clip": 0.0112174, + "auxiliary_loss_mlp": 0.01025438, + "balance_loss_clip": 1.04725027, + "balance_loss_mlp": 1.01369572, + "epoch": 0.5537952803246655, + "flos": 26356026130560.0, + "grad_norm": 4.828685404119653, + "language_loss": 0.72919703, + "learning_rate": 1.74926398270663e-06, + "loss": 0.75066882, + "num_input_tokens_seen": 198399490, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.11749268, + "step": 9211, + "time_per_iteration": 2.532048463821411 + }, + { + "auxiliary_loss_clip": 0.01125819, + "auxiliary_loss_mlp": 0.01037524, + "balance_loss_clip": 1.04691613, + "balance_loss_mlp": 1.02326047, + "epoch": 0.5538554035773335, + "flos": 18037427397120.0, + "grad_norm": 1.8344306875315786, + "language_loss": 0.66586584, + "learning_rate": 1.7488775985144437e-06, + "loss": 0.68749928, + "num_input_tokens_seen": 198419110, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.14257812, + "step": 9212, + "time_per_iteration": 3.8553950786590576 + }, + { + "auxiliary_loss_clip": 0.01125021, + "auxiliary_loss_mlp": 0.01031942, + "balance_loss_clip": 1.04471254, + "balance_loss_mlp": 1.01753569, + "epoch": 0.5539155268300014, + "flos": 31686247002240.0, + "grad_norm": 1.40725946417483, + "language_loss": 0.51821691, + "learning_rate": 1.7484912238448443e-06, + "loss": 0.53978658, + "num_input_tokens_seen": 198441360, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.14404297, + "step": 9213, + "time_per_iteration": 2.5821003913879395 + }, + { + "auxiliary_loss_clip": 0.01133106, + "auxiliary_loss_mlp": 0.01033195, + "balance_loss_clip": 1.0536778, + "balance_loss_mlp": 1.0198853, + "epoch": 0.5539756500826695, + "flos": 15193769431680.0, + "grad_norm": 1.918192976645356, + "language_loss": 0.85909027, + "learning_rate": 1.7481048587124827e-06, + "loss": 0.88075328, + "num_input_tokens_seen": 198459835, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.13293457, + "step": 9214, + "time_per_iteration": 2.4263477325439453 + }, + { + "auxiliary_loss_clip": 0.01123864, + "auxiliary_loss_mlp": 0.01029864, + "balance_loss_clip": 1.04712307, + "balance_loss_mlp": 1.01784742, + "epoch": 0.5540357733353375, + "flos": 26353117128960.0, + "grad_norm": 1.818837061408716, + "language_loss": 0.7019009, + "learning_rate": 1.7477185031320108e-06, + "loss": 0.72343814, + "num_input_tokens_seen": 198478955, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.12017822, + "step": 9215, + "time_per_iteration": 2.491828680038452 + }, + { + "auxiliary_loss_clip": 0.01124923, + "auxiliary_loss_mlp": 0.01028744, + "balance_loss_clip": 1.0475297, + "balance_loss_mlp": 1.01577425, + "epoch": 0.5540958965880054, + "flos": 21323684747520.0, + "grad_norm": 1.575673578253578, + "language_loss": 0.72955108, + "learning_rate": 1.7473321571180773e-06, + "loss": 0.75108773, + "num_input_tokens_seen": 198499030, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12982178, + "step": 9216, + "time_per_iteration": 2.5427074432373047 + }, + { + "auxiliary_loss_clip": 0.01123763, + "auxiliary_loss_mlp": 0.01028382, + "balance_loss_clip": 1.0493691, + "balance_loss_mlp": 1.01618052, + "epoch": 0.5541560198406734, + "flos": 25666828899840.0, + "grad_norm": 1.8870341401534745, + "language_loss": 0.71755081, + "learning_rate": 1.7469458206853345e-06, + "loss": 0.7390722, + "num_input_tokens_seen": 198520265, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.12213135, + "step": 9217, + "time_per_iteration": 2.528597831726074 + }, + { + "auxiliary_loss_clip": 0.01114863, + "auxiliary_loss_mlp": 0.01031235, + "balance_loss_clip": 1.04044294, + "balance_loss_mlp": 1.01827741, + "epoch": 0.5542161430933413, + "flos": 21939624190080.0, + "grad_norm": 1.7915894336533777, + "language_loss": 0.78284156, + "learning_rate": 1.7465594938484315e-06, + "loss": 0.80430251, + "num_input_tokens_seen": 198539645, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.1295166, + "step": 9218, + "time_per_iteration": 2.490434408187866 + }, + { + "auxiliary_loss_clip": 0.01125113, + "auxiliary_loss_mlp": 0.01040154, + "balance_loss_clip": 1.04423511, + "balance_loss_mlp": 1.02605772, + "epoch": 0.5542762663460093, + "flos": 19571459489280.0, + "grad_norm": 1.6455182587951538, + "language_loss": 0.72041541, + "learning_rate": 1.7461731766220176e-06, + "loss": 0.74206811, + "num_input_tokens_seen": 198558710, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.14099121, + "step": 9219, + "time_per_iteration": 3.9677350521087646 + }, + { + "auxiliary_loss_clip": 0.01128399, + "auxiliary_loss_mlp": 0.01034909, + "balance_loss_clip": 1.04969609, + "balance_loss_mlp": 1.02216518, + "epoch": 0.5543363895986773, + "flos": 19499063627520.0, + "grad_norm": 2.3065330339340697, + "language_loss": 0.71552658, + "learning_rate": 1.7457868690207426e-06, + "loss": 0.73715973, + "num_input_tokens_seen": 198577050, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.12744141, + "step": 9220, + "time_per_iteration": 2.4561362266540527 + }, + { + "auxiliary_loss_clip": 0.0112014, + "auxiliary_loss_mlp": 0.01025237, + "balance_loss_clip": 1.04475808, + "balance_loss_mlp": 1.01366162, + "epoch": 0.5543965128513453, + "flos": 22635609091200.0, + "grad_norm": 1.5900087767936688, + "language_loss": 0.79275304, + "learning_rate": 1.7454005710592547e-06, + "loss": 0.81420684, + "num_input_tokens_seen": 198595290, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.11578369, + "step": 9221, + "time_per_iteration": 2.411647319793701 + }, + { + "auxiliary_loss_clip": 0.01125679, + "auxiliary_loss_mlp": 0.01032121, + "balance_loss_clip": 1.05080831, + "balance_loss_mlp": 1.0194788, + "epoch": 0.5544566361040132, + "flos": 25989952671360.0, + "grad_norm": 1.7791814080253523, + "language_loss": 0.83893871, + "learning_rate": 1.7450142827522027e-06, + "loss": 0.86051673, + "num_input_tokens_seen": 198614110, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.12640381, + "step": 9222, + "time_per_iteration": 2.487880229949951 + }, + { + "auxiliary_loss_clip": 0.01130117, + "auxiliary_loss_mlp": 0.01042099, + "balance_loss_clip": 1.0492183, + "balance_loss_mlp": 1.02612507, + "epoch": 0.5545167593566812, + "flos": 28257568225920.0, + "grad_norm": 2.9769342377733596, + "language_loss": 0.75632715, + "learning_rate": 1.7446280041142344e-06, + "loss": 0.77804935, + "num_input_tokens_seen": 198633880, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.15979004, + "step": 9223, + "time_per_iteration": 2.4859793186187744 + }, + { + "auxiliary_loss_clip": 0.01129036, + "auxiliary_loss_mlp": 0.01030469, + "balance_loss_clip": 1.05166233, + "balance_loss_mlp": 1.01729608, + "epoch": 0.5545768826093491, + "flos": 28476551491200.0, + "grad_norm": 1.6954495065133144, + "language_loss": 0.8180154, + "learning_rate": 1.7442417351599986e-06, + "loss": 0.83961046, + "num_input_tokens_seen": 198653505, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.13183594, + "step": 9224, + "time_per_iteration": 2.5282747745513916 + }, + { + "auxiliary_loss_clip": 0.01134852, + "auxiliary_loss_mlp": 0.0103786, + "balance_loss_clip": 1.05544865, + "balance_loss_mlp": 1.02500892, + "epoch": 0.5546370058620171, + "flos": 18478051534080.0, + "grad_norm": 1.9078075209550458, + "language_loss": 0.57212126, + "learning_rate": 1.743855475904141e-06, + "loss": 0.59384841, + "num_input_tokens_seen": 198671890, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.12866211, + "step": 9225, + "time_per_iteration": 2.409296751022339 + }, + { + "auxiliary_loss_clip": 0.01130422, + "auxiliary_loss_mlp": 0.01030749, + "balance_loss_clip": 1.05203986, + "balance_loss_mlp": 1.01810074, + "epoch": 0.554697129114685, + "flos": 22930507751040.0, + "grad_norm": 3.9983050226117887, + "language_loss": 0.67761022, + "learning_rate": 1.7434692263613098e-06, + "loss": 0.69922197, + "num_input_tokens_seen": 198691995, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.12646484, + "step": 9226, + "time_per_iteration": 3.8657820224761963 + }, + { + "auxiliary_loss_clip": 0.01122742, + "auxiliary_loss_mlp": 0.01038242, + "balance_loss_clip": 1.04460382, + "balance_loss_mlp": 1.0251466, + "epoch": 0.5547572523673531, + "flos": 21797166850560.0, + "grad_norm": 1.5764706394147712, + "language_loss": 0.74424124, + "learning_rate": 1.7430829865461518e-06, + "loss": 0.76585108, + "num_input_tokens_seen": 198712440, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.13098145, + "step": 9227, + "time_per_iteration": 2.4841363430023193 + }, + { + "auxiliary_loss_clip": 0.01133039, + "auxiliary_loss_mlp": 0.01031281, + "balance_loss_clip": 1.05314541, + "balance_loss_mlp": 1.01793516, + "epoch": 0.5548173756200211, + "flos": 22342829333760.0, + "grad_norm": 1.6119155869178532, + "language_loss": 0.73403794, + "learning_rate": 1.7426967564733118e-06, + "loss": 0.75568116, + "num_input_tokens_seen": 198731515, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.13336182, + "step": 9228, + "time_per_iteration": 2.567955493927002 + }, + { + "auxiliary_loss_clip": 0.0112602, + "auxiliary_loss_mlp": 0.01027235, + "balance_loss_clip": 1.04906464, + "balance_loss_mlp": 1.01499796, + "epoch": 0.554877498872689, + "flos": 17858736213120.0, + "grad_norm": 1.7742060290315267, + "language_loss": 0.76124907, + "learning_rate": 1.7423105361574373e-06, + "loss": 0.7827816, + "num_input_tokens_seen": 198749750, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.12249756, + "step": 9229, + "time_per_iteration": 2.4770712852478027 + }, + { + "auxiliary_loss_clip": 0.01129022, + "auxiliary_loss_mlp": 0.01040103, + "balance_loss_clip": 1.05208707, + "balance_loss_mlp": 1.02663279, + "epoch": 0.554937622125357, + "flos": 17238343484160.0, + "grad_norm": 1.4116573115960307, + "language_loss": 0.69079554, + "learning_rate": 1.741924325613172e-06, + "loss": 0.71248674, + "num_input_tokens_seen": 198768320, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.13458252, + "step": 9230, + "time_per_iteration": 2.585153818130493 + }, + { + "auxiliary_loss_clip": 0.01123139, + "auxiliary_loss_mlp": 0.01037881, + "balance_loss_clip": 1.04478359, + "balance_loss_mlp": 1.02312899, + "epoch": 0.5549977453780249, + "flos": 25368087484800.0, + "grad_norm": 2.2923345882194495, + "language_loss": 0.68419337, + "learning_rate": 1.741538124855163e-06, + "loss": 0.70580357, + "num_input_tokens_seen": 198787230, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.1473999, + "step": 9231, + "time_per_iteration": 2.5176751613616943 + }, + { + "auxiliary_loss_clip": 0.01131807, + "auxiliary_loss_mlp": 0.01034658, + "balance_loss_clip": 1.05176115, + "balance_loss_mlp": 1.0201267, + "epoch": 0.555057868630693, + "flos": 25079114568960.0, + "grad_norm": 1.6583044755081013, + "language_loss": 0.78211844, + "learning_rate": 1.7411519338980548e-06, + "loss": 0.80378306, + "num_input_tokens_seen": 198806720, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.14556885, + "step": 9232, + "time_per_iteration": 2.542041540145874 + }, + { + "auxiliary_loss_clip": 0.01129855, + "auxiliary_loss_mlp": 0.01037251, + "balance_loss_clip": 1.05145574, + "balance_loss_mlp": 1.02547932, + "epoch": 0.5551179918833609, + "flos": 26104220812800.0, + "grad_norm": 1.7573802673821548, + "language_loss": 0.82443571, + "learning_rate": 1.7407657527564898e-06, + "loss": 0.84610677, + "num_input_tokens_seen": 198826235, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.11773682, + "step": 9233, + "time_per_iteration": 2.517423391342163 + }, + { + "auxiliary_loss_clip": 0.01126954, + "auxiliary_loss_mlp": 0.01035961, + "balance_loss_clip": 1.04842401, + "balance_loss_mlp": 1.02348542, + "epoch": 0.5551781151360289, + "flos": 19384759572480.0, + "grad_norm": 2.208045030999276, + "language_loss": 0.7495048, + "learning_rate": 1.7403795814451142e-06, + "loss": 0.77113396, + "num_input_tokens_seen": 198842655, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.12481689, + "step": 9234, + "time_per_iteration": 3.861814260482788 + }, + { + "auxiliary_loss_clip": 0.01125724, + "auxiliary_loss_mlp": 0.01026257, + "balance_loss_clip": 1.05127895, + "balance_loss_mlp": 1.01397848, + "epoch": 0.5552382383886968, + "flos": 21725956137600.0, + "grad_norm": 2.2564846018744444, + "language_loss": 0.64562392, + "learning_rate": 1.7399934199785706e-06, + "loss": 0.6671437, + "num_input_tokens_seen": 198861210, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.1227417, + "step": 9235, + "time_per_iteration": 2.552339792251587 + }, + { + "auxiliary_loss_clip": 0.01120652, + "auxiliary_loss_mlp": 0.01029395, + "balance_loss_clip": 1.04428673, + "balance_loss_mlp": 1.01678228, + "epoch": 0.5552983616413648, + "flos": 14356189117440.0, + "grad_norm": 1.6702914426177775, + "language_loss": 0.68867522, + "learning_rate": 1.7396072683715029e-06, + "loss": 0.71017569, + "num_input_tokens_seen": 198880045, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.12615967, + "step": 9236, + "time_per_iteration": 2.4338572025299072 + }, + { + "auxiliary_loss_clip": 0.01114995, + "auxiliary_loss_mlp": 0.01027675, + "balance_loss_clip": 1.04136574, + "balance_loss_mlp": 1.01539636, + "epoch": 0.5553584848940327, + "flos": 25478548784640.0, + "grad_norm": 1.9058367632837694, + "language_loss": 0.86445272, + "learning_rate": 1.7392211266385536e-06, + "loss": 0.8858794, + "num_input_tokens_seen": 198900210, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.12280273, + "step": 9237, + "time_per_iteration": 2.513824462890625 + }, + { + "auxiliary_loss_clip": 0.01126845, + "auxiliary_loss_mlp": 0.01035588, + "balance_loss_clip": 1.05087805, + "balance_loss_mlp": 1.02251685, + "epoch": 0.5554186081467007, + "flos": 22163850840960.0, + "grad_norm": 1.835624012669535, + "language_loss": 0.73626316, + "learning_rate": 1.7388349947943652e-06, + "loss": 0.75788748, + "num_input_tokens_seen": 198919055, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.13067627, + "step": 9238, + "time_per_iteration": 2.465890884399414 + }, + { + "auxiliary_loss_clip": 0.01128122, + "auxiliary_loss_mlp": 0.01030505, + "balance_loss_clip": 1.05002725, + "balance_loss_mlp": 1.01803041, + "epoch": 0.5554787313993687, + "flos": 49746656125440.0, + "grad_norm": 1.674611965294263, + "language_loss": 0.78447175, + "learning_rate": 1.73844887285358e-06, + "loss": 0.80605811, + "num_input_tokens_seen": 198943505, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.12469482, + "step": 9239, + "time_per_iteration": 2.7218973636627197 + }, + { + "auxiliary_loss_clip": 0.01121911, + "auxiliary_loss_mlp": 0.01029245, + "balance_loss_clip": 1.04481971, + "balance_loss_mlp": 1.01658535, + "epoch": 0.5555388546520367, + "flos": 22127365601280.0, + "grad_norm": 1.5573009101783208, + "language_loss": 0.7993955, + "learning_rate": 1.7380627608308393e-06, + "loss": 0.82090706, + "num_input_tokens_seen": 198963590, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12652588, + "step": 9240, + "time_per_iteration": 2.4576644897460938 + }, + { + "auxiliary_loss_clip": 0.01123135, + "auxiliary_loss_mlp": 0.01029443, + "balance_loss_clip": 1.04811287, + "balance_loss_mlp": 1.01748586, + "epoch": 0.5555989779047047, + "flos": 24682122478080.0, + "grad_norm": 2.7861638958138566, + "language_loss": 0.65359467, + "learning_rate": 1.737676658740786e-06, + "loss": 0.67512041, + "num_input_tokens_seen": 198982680, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11938477, + "step": 9241, + "time_per_iteration": 2.488471746444702 + }, + { + "auxiliary_loss_clip": 0.01119391, + "auxiliary_loss_mlp": 0.01032577, + "balance_loss_clip": 1.0429554, + "balance_loss_mlp": 1.01972568, + "epoch": 0.5556591011573726, + "flos": 16106510954880.0, + "grad_norm": 1.8747543748749258, + "language_loss": 0.72813785, + "learning_rate": 1.7372905665980594e-06, + "loss": 0.74965751, + "num_input_tokens_seen": 199000185, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12854004, + "step": 9242, + "time_per_iteration": 2.4105052947998047 + }, + { + "auxiliary_loss_clip": 0.01122236, + "auxiliary_loss_mlp": 0.01034519, + "balance_loss_clip": 1.04382432, + "balance_loss_mlp": 1.02079177, + "epoch": 0.5557192244100406, + "flos": 12933695733120.0, + "grad_norm": 1.887155067675976, + "language_loss": 0.63673699, + "learning_rate": 1.7369044844173012e-06, + "loss": 0.65830457, + "num_input_tokens_seen": 199018380, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.1373291, + "step": 9243, + "time_per_iteration": 2.526930093765259 + }, + { + "auxiliary_loss_clip": 0.01123551, + "auxiliary_loss_mlp": 0.01031676, + "balance_loss_clip": 1.04864907, + "balance_loss_mlp": 1.01955819, + "epoch": 0.5557793476627085, + "flos": 23111712887040.0, + "grad_norm": 1.9211662007553245, + "language_loss": 0.75288033, + "learning_rate": 1.7365184122131509e-06, + "loss": 0.77443254, + "num_input_tokens_seen": 199037115, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.12109375, + "step": 9244, + "time_per_iteration": 2.459956169128418 + }, + { + "auxiliary_loss_clip": 0.01121449, + "auxiliary_loss_mlp": 0.01028192, + "balance_loss_clip": 1.04946208, + "balance_loss_mlp": 1.01652122, + "epoch": 0.5558394709153766, + "flos": 21428040735360.0, + "grad_norm": 2.095103872900313, + "language_loss": 0.75445342, + "learning_rate": 1.7361323500002486e-06, + "loss": 0.77594984, + "num_input_tokens_seen": 199053375, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.11688232, + "step": 9245, + "time_per_iteration": 2.509451150894165 + }, + { + "auxiliary_loss_clip": 0.01124502, + "auxiliary_loss_mlp": 0.01031051, + "balance_loss_clip": 1.04626381, + "balance_loss_mlp": 1.0177412, + "epoch": 0.5558995941680445, + "flos": 25078324469760.0, + "grad_norm": 2.594119341616725, + "language_loss": 0.79924256, + "learning_rate": 1.7357462977932348e-06, + "loss": 0.82079816, + "num_input_tokens_seen": 199070930, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.13293457, + "step": 9246, + "time_per_iteration": 2.4492268562316895 + }, + { + "auxiliary_loss_clip": 0.01120057, + "auxiliary_loss_mlp": 0.01033317, + "balance_loss_clip": 1.04471183, + "balance_loss_mlp": 1.0207051, + "epoch": 0.5559597174207125, + "flos": 20011149872640.0, + "grad_norm": 1.9026236275867305, + "language_loss": 0.7396338, + "learning_rate": 1.7353602556067471e-06, + "loss": 0.76116753, + "num_input_tokens_seen": 199088675, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.12609863, + "step": 9247, + "time_per_iteration": 2.4774527549743652 + }, + { + "auxiliary_loss_clip": 0.01120774, + "auxiliary_loss_mlp": 0.01036763, + "balance_loss_clip": 1.04327655, + "balance_loss_mlp": 1.02383518, + "epoch": 0.5560198406733804, + "flos": 16835677044480.0, + "grad_norm": 2.8659724522430623, + "language_loss": 0.7554493, + "learning_rate": 1.7349742234554254e-06, + "loss": 0.77702463, + "num_input_tokens_seen": 199103075, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.12921143, + "step": 9248, + "time_per_iteration": 2.402519941329956 + }, + { + "auxiliary_loss_clip": 0.01066405, + "auxiliary_loss_mlp": 0.01005273, + "balance_loss_clip": 1.04050851, + "balance_loss_mlp": 1.00397706, + "epoch": 0.5560799639260484, + "flos": 70697051758080.0, + "grad_norm": 0.8427485494182908, + "language_loss": 0.59429741, + "learning_rate": 1.7345882013539081e-06, + "loss": 0.6150142, + "num_input_tokens_seen": 199160325, + "router_z_loss_clip": 0.25927734, + "router_z_loss_mlp": 0.01295471, + "step": 9249, + "time_per_iteration": 3.290865659713745 + }, + { + "auxiliary_loss_clip": 0.01118005, + "auxiliary_loss_mlp": 0.01047734, + "balance_loss_clip": 1.04097331, + "balance_loss_mlp": 1.03156924, + "epoch": 0.5561400871787163, + "flos": 23148593176320.0, + "grad_norm": 1.8452908316892276, + "language_loss": 0.7975781, + "learning_rate": 1.734202189316832e-06, + "loss": 0.81923556, + "num_input_tokens_seen": 199179760, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.16162109, + "step": 9250, + "time_per_iteration": 2.4599955081939697 + }, + { + "auxiliary_loss_clip": 0.01126773, + "auxiliary_loss_mlp": 0.01033225, + "balance_loss_clip": 1.04777908, + "balance_loss_mlp": 1.02016592, + "epoch": 0.5562002104313843, + "flos": 17566423332480.0, + "grad_norm": 3.1038728640971067, + "language_loss": 0.69423664, + "learning_rate": 1.733816187358836e-06, + "loss": 0.71583664, + "num_input_tokens_seen": 199196695, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.13049316, + "step": 9251, + "time_per_iteration": 2.4753453731536865 + }, + { + "auxiliary_loss_clip": 0.01114603, + "auxiliary_loss_mlp": 0.0104164, + "balance_loss_clip": 1.04028082, + "balance_loss_mlp": 1.0279249, + "epoch": 0.5562603336840523, + "flos": 25045430590080.0, + "grad_norm": 1.6217707725046435, + "language_loss": 0.75503361, + "learning_rate": 1.7334301954945569e-06, + "loss": 0.77659601, + "num_input_tokens_seen": 199217845, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.137146, + "step": 9252, + "time_per_iteration": 2.5475456714630127 + }, + { + "auxiliary_loss_clip": 0.011293, + "auxiliary_loss_mlp": 0.01034421, + "balance_loss_clip": 1.04944932, + "balance_loss_mlp": 1.02211261, + "epoch": 0.5563204569367203, + "flos": 29059022436480.0, + "grad_norm": 2.444045696287641, + "language_loss": 0.73188436, + "learning_rate": 1.7330442137386313e-06, + "loss": 0.75352156, + "num_input_tokens_seen": 199239250, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.12310791, + "step": 9253, + "time_per_iteration": 2.652355194091797 + }, + { + "auxiliary_loss_clip": 0.01126141, + "auxiliary_loss_mlp": 0.01027548, + "balance_loss_clip": 1.0500226, + "balance_loss_mlp": 1.0155735, + "epoch": 0.5563805801893883, + "flos": 22090449398400.0, + "grad_norm": 2.0399145230740103, + "language_loss": 0.83024383, + "learning_rate": 1.7326582421056965e-06, + "loss": 0.85178071, + "num_input_tokens_seen": 199258320, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.11975098, + "step": 9254, + "time_per_iteration": 2.512260913848877 + }, + { + "auxiliary_loss_clip": 0.01062565, + "auxiliary_loss_mlp": 0.01005495, + "balance_loss_clip": 1.03671634, + "balance_loss_mlp": 1.00376749, + "epoch": 0.5564407034420562, + "flos": 58636128689280.0, + "grad_norm": 0.870094377293597, + "language_loss": 0.64910316, + "learning_rate": 1.732272280610387e-06, + "loss": 0.66978383, + "num_input_tokens_seen": 199314840, + "router_z_loss_clip": 0.25878906, + "router_z_loss_mlp": 0.01725769, + "step": 9255, + "time_per_iteration": 2.936825752258301 + }, + { + "auxiliary_loss_clip": 0.01123774, + "auxiliary_loss_mlp": 0.01037593, + "balance_loss_clip": 1.04992092, + "balance_loss_mlp": 1.02498627, + "epoch": 0.5565008266947242, + "flos": 23112323418240.0, + "grad_norm": 1.8203674600129813, + "language_loss": 0.69705844, + "learning_rate": 1.7318863292673399e-06, + "loss": 0.7186721, + "num_input_tokens_seen": 199335405, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.1260376, + "step": 9256, + "time_per_iteration": 2.482776641845703 + }, + { + "auxiliary_loss_clip": 0.01112561, + "auxiliary_loss_mlp": 0.01037802, + "balance_loss_clip": 1.04004061, + "balance_loss_mlp": 1.02430725, + "epoch": 0.5565609499473921, + "flos": 21578399066880.0, + "grad_norm": 1.5782318146908234, + "language_loss": 0.75938123, + "learning_rate": 1.73150038809119e-06, + "loss": 0.78088486, + "num_input_tokens_seen": 199354345, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.13500977, + "step": 9257, + "time_per_iteration": 3.890258312225342 + }, + { + "auxiliary_loss_clip": 0.0112369, + "auxiliary_loss_mlp": 0.0104695, + "balance_loss_clip": 1.04413033, + "balance_loss_mlp": 1.03305042, + "epoch": 0.5566210732000602, + "flos": 18369637309440.0, + "grad_norm": 3.038024705418455, + "language_loss": 0.60764241, + "learning_rate": 1.7311144570965724e-06, + "loss": 0.62934887, + "num_input_tokens_seen": 199372250, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.13903809, + "step": 9258, + "time_per_iteration": 2.511389970779419 + }, + { + "auxiliary_loss_clip": 0.01128913, + "auxiliary_loss_mlp": 0.01031893, + "balance_loss_clip": 1.05151272, + "balance_loss_mlp": 1.01839304, + "epoch": 0.5566811964527281, + "flos": 25703350053120.0, + "grad_norm": 1.6101226747021793, + "language_loss": 0.7916984, + "learning_rate": 1.7307285362981215e-06, + "loss": 0.81330651, + "num_input_tokens_seen": 199392815, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.13494873, + "step": 9259, + "time_per_iteration": 2.480093002319336 + }, + { + "auxiliary_loss_clip": 0.01127058, + "auxiliary_loss_mlp": 0.01033839, + "balance_loss_clip": 1.05042601, + "balance_loss_mlp": 1.02079201, + "epoch": 0.5567413197053961, + "flos": 26943991856640.0, + "grad_norm": 1.918780979690959, + "language_loss": 0.8194291, + "learning_rate": 1.7303426257104712e-06, + "loss": 0.84103805, + "num_input_tokens_seen": 199412375, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.13043213, + "step": 9260, + "time_per_iteration": 2.5056023597717285 + }, + { + "auxiliary_loss_clip": 0.01128214, + "auxiliary_loss_mlp": 0.01035641, + "balance_loss_clip": 1.05260551, + "balance_loss_mlp": 1.02254581, + "epoch": 0.556801442958064, + "flos": 20850597694080.0, + "grad_norm": 1.4892286326853947, + "language_loss": 0.69024092, + "learning_rate": 1.729956725348256e-06, + "loss": 0.71187943, + "num_input_tokens_seen": 199431490, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.13079834, + "step": 9261, + "time_per_iteration": 2.4376378059387207 + }, + { + "auxiliary_loss_clip": 0.01059598, + "auxiliary_loss_mlp": 0.01002073, + "balance_loss_clip": 1.03328383, + "balance_loss_mlp": 1.00067925, + "epoch": 0.556861566210732, + "flos": 70498213044480.0, + "grad_norm": 0.7344965541327577, + "language_loss": 0.61075544, + "learning_rate": 1.729570835226108e-06, + "loss": 0.63137221, + "num_input_tokens_seen": 199495855, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.01394653, + "step": 9262, + "time_per_iteration": 3.0855190753936768 + }, + { + "auxiliary_loss_clip": 0.01131888, + "auxiliary_loss_mlp": 0.01036193, + "balance_loss_clip": 1.05234921, + "balance_loss_mlp": 1.02371204, + "epoch": 0.5569216894633999, + "flos": 25337276593920.0, + "grad_norm": 1.5799511186201358, + "language_loss": 0.64738119, + "learning_rate": 1.7291849553586622e-06, + "loss": 0.66906202, + "num_input_tokens_seen": 199515870, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.12475586, + "step": 9263, + "time_per_iteration": 3.943775177001953 + }, + { + "auxiliary_loss_clip": 0.01124981, + "auxiliary_loss_mlp": 0.01029748, + "balance_loss_clip": 1.04942119, + "balance_loss_mlp": 1.01732063, + "epoch": 0.556981812716068, + "flos": 22638733574400.0, + "grad_norm": 1.7484009408311711, + "language_loss": 0.73129177, + "learning_rate": 1.7287990857605497e-06, + "loss": 0.75283903, + "num_input_tokens_seen": 199535745, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12432861, + "step": 9264, + "time_per_iteration": 2.440204381942749 + }, + { + "auxiliary_loss_clip": 0.01126468, + "auxiliary_loss_mlp": 0.01035979, + "balance_loss_clip": 1.04776371, + "balance_loss_mlp": 1.02287221, + "epoch": 0.5570419359687359, + "flos": 11035852738560.0, + "grad_norm": 2.1328476164042827, + "language_loss": 0.76749057, + "learning_rate": 1.7284132264464022e-06, + "loss": 0.78911507, + "num_input_tokens_seen": 199554035, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.13116455, + "step": 9265, + "time_per_iteration": 2.5008344650268555 + }, + { + "auxiliary_loss_clip": 0.01127753, + "auxiliary_loss_mlp": 0.01032186, + "balance_loss_clip": 1.05310249, + "balance_loss_mlp": 1.02092052, + "epoch": 0.5571020592214039, + "flos": 22823135020800.0, + "grad_norm": 1.447828335323215, + "language_loss": 0.71131688, + "learning_rate": 1.7280273774308536e-06, + "loss": 0.73291624, + "num_input_tokens_seen": 199576120, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.1126709, + "step": 9266, + "time_per_iteration": 2.4947707653045654 + }, + { + "auxiliary_loss_clip": 0.01129747, + "auxiliary_loss_mlp": 0.0103966, + "balance_loss_clip": 1.05139208, + "balance_loss_mlp": 1.02677965, + "epoch": 0.5571621824740719, + "flos": 22927778317440.0, + "grad_norm": 1.7512754264169204, + "language_loss": 0.67807639, + "learning_rate": 1.727641538728533e-06, + "loss": 0.69977045, + "num_input_tokens_seen": 199593780, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.12884521, + "step": 9267, + "time_per_iteration": 2.4515531063079834 + }, + { + "auxiliary_loss_clip": 0.01122664, + "auxiliary_loss_mlp": 0.01055183, + "balance_loss_clip": 1.0480845, + "balance_loss_mlp": 1.04091954, + "epoch": 0.5572223057267398, + "flos": 22966705681920.0, + "grad_norm": 2.1606592883866975, + "language_loss": 0.74814498, + "learning_rate": 1.7272557103540736e-06, + "loss": 0.76992345, + "num_input_tokens_seen": 199613220, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.14263916, + "step": 9268, + "time_per_iteration": 2.435706615447998 + }, + { + "auxiliary_loss_clip": 0.01122223, + "auxiliary_loss_mlp": 0.01030585, + "balance_loss_clip": 1.04679024, + "balance_loss_mlp": 1.01878285, + "epoch": 0.5572824289794078, + "flos": 20960053413120.0, + "grad_norm": 2.2662755771707412, + "language_loss": 0.75250256, + "learning_rate": 1.726869892322104e-06, + "loss": 0.77403063, + "num_input_tokens_seen": 199632085, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.11798096, + "step": 9269, + "time_per_iteration": 3.8766233921051025 + }, + { + "auxiliary_loss_clip": 0.01128616, + "auxiliary_loss_mlp": 0.01042027, + "balance_loss_clip": 1.04669571, + "balance_loss_mlp": 1.02905703, + "epoch": 0.5573425522320757, + "flos": 25042413847680.0, + "grad_norm": 1.5612510086642626, + "language_loss": 0.82643819, + "learning_rate": 1.726484084647256e-06, + "loss": 0.84814465, + "num_input_tokens_seen": 199649295, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.12982178, + "step": 9270, + "time_per_iteration": 2.529895305633545 + }, + { + "auxiliary_loss_clip": 0.01126544, + "auxiliary_loss_mlp": 0.01035885, + "balance_loss_clip": 1.04609144, + "balance_loss_mlp": 1.02276635, + "epoch": 0.5574026754847438, + "flos": 23659637927040.0, + "grad_norm": 2.148135999342312, + "language_loss": 0.8002739, + "learning_rate": 1.7260982873441591e-06, + "loss": 0.82189822, + "num_input_tokens_seen": 199668870, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.13110352, + "step": 9271, + "time_per_iteration": 2.4842565059661865 + }, + { + "auxiliary_loss_clip": 0.01132366, + "auxiliary_loss_mlp": 0.01038841, + "balance_loss_clip": 1.05157459, + "balance_loss_mlp": 1.02578759, + "epoch": 0.5574627987374117, + "flos": 24782240661120.0, + "grad_norm": 1.7638275745838197, + "language_loss": 0.90376139, + "learning_rate": 1.725712500427442e-06, + "loss": 0.92547345, + "num_input_tokens_seen": 199684870, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.13049316, + "step": 9272, + "time_per_iteration": 2.4586293697357178 + }, + { + "auxiliary_loss_clip": 0.01127213, + "auxiliary_loss_mlp": 0.01031016, + "balance_loss_clip": 1.05190432, + "balance_loss_mlp": 1.01849341, + "epoch": 0.5575229219900797, + "flos": 21834944979840.0, + "grad_norm": 1.8490902955223334, + "language_loss": 0.84277046, + "learning_rate": 1.7253267239117347e-06, + "loss": 0.8643527, + "num_input_tokens_seen": 199701975, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.12512207, + "step": 9273, + "time_per_iteration": 2.5070748329162598 + }, + { + "auxiliary_loss_clip": 0.0112976, + "auxiliary_loss_mlp": 0.01043302, + "balance_loss_clip": 1.04990506, + "balance_loss_mlp": 1.02899718, + "epoch": 0.5575830452427476, + "flos": 27815148408960.0, + "grad_norm": 3.733567550447826, + "language_loss": 0.73777455, + "learning_rate": 1.7249409578116655e-06, + "loss": 0.75950515, + "num_input_tokens_seen": 199721865, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.14306641, + "step": 9274, + "time_per_iteration": 2.506540060043335 + }, + { + "auxiliary_loss_clip": 0.01133574, + "auxiliary_loss_mlp": 0.010362, + "balance_loss_clip": 1.05152678, + "balance_loss_mlp": 1.02165055, + "epoch": 0.5576431684954156, + "flos": 17812805696640.0, + "grad_norm": 2.5884192764888256, + "language_loss": 0.78127122, + "learning_rate": 1.7245552021418629e-06, + "loss": 0.80296898, + "num_input_tokens_seen": 199736455, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.14556885, + "step": 9275, + "time_per_iteration": 2.44181489944458 + }, + { + "auxiliary_loss_clip": 0.01134397, + "auxiliary_loss_mlp": 0.0103286, + "balance_loss_clip": 1.05572116, + "balance_loss_mlp": 1.0202353, + "epoch": 0.5577032917480835, + "flos": 15486872411520.0, + "grad_norm": 2.658055286581019, + "language_loss": 0.74815536, + "learning_rate": 1.7241694569169546e-06, + "loss": 0.76982796, + "num_input_tokens_seen": 199753125, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.1262207, + "step": 9276, + "time_per_iteration": 2.439741849899292 + }, + { + "auxiliary_loss_clip": 0.01128543, + "auxiliary_loss_mlp": 0.01034796, + "balance_loss_clip": 1.05308449, + "balance_loss_mlp": 1.02279758, + "epoch": 0.5577634150007516, + "flos": 21579763783680.0, + "grad_norm": 1.6651780855605, + "language_loss": 0.75198996, + "learning_rate": 1.7237837221515678e-06, + "loss": 0.77362335, + "num_input_tokens_seen": 199771365, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.12005615, + "step": 9277, + "time_per_iteration": 2.4820775985717773 + }, + { + "auxiliary_loss_clip": 0.01120336, + "auxiliary_loss_mlp": 0.01034844, + "balance_loss_clip": 1.04702115, + "balance_loss_mlp": 1.02350163, + "epoch": 0.5578235382534195, + "flos": 21139750177920.0, + "grad_norm": 1.58464092023857, + "language_loss": 0.71820879, + "learning_rate": 1.7233979978603304e-06, + "loss": 0.73976064, + "num_input_tokens_seen": 199790035, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11340332, + "step": 9278, + "time_per_iteration": 3.9553487300872803 + }, + { + "auxiliary_loss_clip": 0.01133007, + "auxiliary_loss_mlp": 0.01033436, + "balance_loss_clip": 1.05330205, + "balance_loss_mlp": 1.01987612, + "epoch": 0.5578836615060875, + "flos": 26505199313280.0, + "grad_norm": 1.6149417135051825, + "language_loss": 0.75826287, + "learning_rate": 1.723012284057868e-06, + "loss": 0.77992731, + "num_input_tokens_seen": 199811125, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.13549805, + "step": 9279, + "time_per_iteration": 2.510164499282837 + }, + { + "auxiliary_loss_clip": 0.01137766, + "auxiliary_loss_mlp": 0.01034802, + "balance_loss_clip": 1.0573169, + "balance_loss_mlp": 1.02234459, + "epoch": 0.5579437847587555, + "flos": 20153786780160.0, + "grad_norm": 1.574263035150027, + "language_loss": 0.67432624, + "learning_rate": 1.7226265807588082e-06, + "loss": 0.69605196, + "num_input_tokens_seen": 199829915, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.12457275, + "step": 9280, + "time_per_iteration": 2.482893466949463 + }, + { + "auxiliary_loss_clip": 0.01130076, + "auxiliary_loss_mlp": 0.01035726, + "balance_loss_clip": 1.05117774, + "balance_loss_mlp": 1.02344704, + "epoch": 0.5580039080114234, + "flos": 26102281478400.0, + "grad_norm": 2.3544524978555756, + "language_loss": 0.73268044, + "learning_rate": 1.7222408879777763e-06, + "loss": 0.75433844, + "num_input_tokens_seen": 199850670, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.12280273, + "step": 9281, + "time_per_iteration": 2.471073627471924 + }, + { + "auxiliary_loss_clip": 0.01123576, + "auxiliary_loss_mlp": 0.01038528, + "balance_loss_clip": 1.04861379, + "balance_loss_mlp": 1.02599287, + "epoch": 0.5580640312640914, + "flos": 13771671096960.0, + "grad_norm": 3.1872224210022395, + "language_loss": 0.75638473, + "learning_rate": 1.7218552057293974e-06, + "loss": 0.77800572, + "num_input_tokens_seen": 199867645, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.12542725, + "step": 9282, + "time_per_iteration": 2.437725782394409 + }, + { + "auxiliary_loss_clip": 0.01123048, + "auxiliary_loss_mlp": 0.01030963, + "balance_loss_clip": 1.04888535, + "balance_loss_mlp": 1.01818371, + "epoch": 0.5581241545167593, + "flos": 17675986792320.0, + "grad_norm": 2.347423772084779, + "language_loss": 0.66472322, + "learning_rate": 1.721469534028297e-06, + "loss": 0.68626332, + "num_input_tokens_seen": 199886320, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.12780762, + "step": 9283, + "time_per_iteration": 2.4257373809814453 + }, + { + "auxiliary_loss_clip": 0.01126806, + "auxiliary_loss_mlp": 0.0102752, + "balance_loss_clip": 1.05169868, + "balance_loss_mlp": 1.01607549, + "epoch": 0.5581842777694274, + "flos": 19569161018880.0, + "grad_norm": 1.7245540016554468, + "language_loss": 0.82763684, + "learning_rate": 1.7210838728890994e-06, + "loss": 0.8491801, + "num_input_tokens_seen": 199904895, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11444092, + "step": 9284, + "time_per_iteration": 2.507323741912842 + }, + { + "auxiliary_loss_clip": 0.01123904, + "auxiliary_loss_mlp": 0.0103171, + "balance_loss_clip": 1.04722905, + "balance_loss_mlp": 1.0190444, + "epoch": 0.5582444010220953, + "flos": 20595165102720.0, + "grad_norm": 3.3723475566006775, + "language_loss": 0.85362101, + "learning_rate": 1.7206982223264304e-06, + "loss": 0.87517715, + "num_input_tokens_seen": 199921090, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.12670898, + "step": 9285, + "time_per_iteration": 2.4369308948516846 + }, + { + "auxiliary_loss_clip": 0.01132588, + "auxiliary_loss_mlp": 0.0103715, + "balance_loss_clip": 1.05557108, + "balance_loss_mlp": 1.0248003, + "epoch": 0.5583045242747633, + "flos": 19135504120320.0, + "grad_norm": 22.447165935752516, + "language_loss": 0.74135458, + "learning_rate": 1.720312582354912e-06, + "loss": 0.76305199, + "num_input_tokens_seen": 199939925, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.12347412, + "step": 9286, + "time_per_iteration": 2.43563175201416 + }, + { + "auxiliary_loss_clip": 0.01124455, + "auxiliary_loss_mlp": 0.01030961, + "balance_loss_clip": 1.04880977, + "balance_loss_mlp": 1.01879549, + "epoch": 0.5583646475274312, + "flos": 27454569730560.0, + "grad_norm": 1.7946356047633782, + "language_loss": 0.73814505, + "learning_rate": 1.7199269529891684e-06, + "loss": 0.75969923, + "num_input_tokens_seen": 199960015, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12176514, + "step": 9287, + "time_per_iteration": 2.480949640274048 + }, + { + "auxiliary_loss_clip": 0.01132915, + "auxiliary_loss_mlp": 0.01032954, + "balance_loss_clip": 1.05404019, + "balance_loss_mlp": 1.01945376, + "epoch": 0.5584247707800992, + "flos": 23653784010240.0, + "grad_norm": 1.6481124384878656, + "language_loss": 0.74989921, + "learning_rate": 1.7195413342438233e-06, + "loss": 0.77155787, + "num_input_tokens_seen": 199980505, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.1348877, + "step": 9288, + "time_per_iteration": 2.505990505218506 + }, + { + "auxiliary_loss_clip": 0.01130063, + "auxiliary_loss_mlp": 0.01036619, + "balance_loss_clip": 1.05326152, + "balance_loss_mlp": 1.02326715, + "epoch": 0.5584848940327671, + "flos": 13698880185600.0, + "grad_norm": 2.8300520324898972, + "language_loss": 0.78080899, + "learning_rate": 1.7191557261334984e-06, + "loss": 0.80247581, + "num_input_tokens_seen": 199999020, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.13360596, + "step": 9289, + "time_per_iteration": 2.411039113998413 + }, + { + "auxiliary_loss_clip": 0.01141238, + "auxiliary_loss_mlp": 0.01036678, + "balance_loss_clip": 1.05838645, + "balance_loss_mlp": 1.02336276, + "epoch": 0.5585450172854352, + "flos": 27016208150400.0, + "grad_norm": 2.0155637215083932, + "language_loss": 0.61185384, + "learning_rate": 1.718770128672817e-06, + "loss": 0.63363302, + "num_input_tokens_seen": 200019020, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.13311768, + "step": 9290, + "time_per_iteration": 2.5373785495758057 + }, + { + "auxiliary_loss_clip": 0.01123668, + "auxiliary_loss_mlp": 0.01033047, + "balance_loss_clip": 1.0458256, + "balance_loss_mlp": 1.02023244, + "epoch": 0.5586051405381031, + "flos": 23185653033600.0, + "grad_norm": 4.1306917953857765, + "language_loss": 0.67885923, + "learning_rate": 1.7183845418764e-06, + "loss": 0.70042634, + "num_input_tokens_seen": 200038110, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.12823486, + "step": 9291, + "time_per_iteration": 2.4496243000030518 + }, + { + "auxiliary_loss_clip": 0.01126809, + "auxiliary_loss_mlp": 0.01038056, + "balance_loss_clip": 1.04711628, + "balance_loss_mlp": 1.02468657, + "epoch": 0.5586652637907711, + "flos": 20775544225920.0, + "grad_norm": 1.7825648766437325, + "language_loss": 0.84373319, + "learning_rate": 1.7179989657588698e-06, + "loss": 0.86538184, + "num_input_tokens_seen": 200056210, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.13366699, + "step": 9292, + "time_per_iteration": 2.4798104763031006 + }, + { + "auxiliary_loss_clip": 0.01121558, + "auxiliary_loss_mlp": 0.01044024, + "balance_loss_clip": 1.04583406, + "balance_loss_mlp": 1.02988601, + "epoch": 0.5587253870434391, + "flos": 28219897837440.0, + "grad_norm": 2.0139533712822257, + "language_loss": 0.73382455, + "learning_rate": 1.7176134003348476e-06, + "loss": 0.75548035, + "num_input_tokens_seen": 200075620, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.14129639, + "step": 9293, + "time_per_iteration": 2.477910041809082 + }, + { + "auxiliary_loss_clip": 0.01123532, + "auxiliary_loss_mlp": 0.01035077, + "balance_loss_clip": 1.04837072, + "balance_loss_mlp": 1.02298379, + "epoch": 0.558785510296107, + "flos": 26615732440320.0, + "grad_norm": 2.0579661016044875, + "language_loss": 0.73014581, + "learning_rate": 1.7172278456189523e-06, + "loss": 0.75173193, + "num_input_tokens_seen": 200095945, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.12097168, + "step": 9294, + "time_per_iteration": 2.516622304916382 + }, + { + "auxiliary_loss_clip": 0.01122038, + "auxiliary_loss_mlp": 0.01032029, + "balance_loss_clip": 1.04511619, + "balance_loss_mlp": 1.01943493, + "epoch": 0.558845633548775, + "flos": 20156767608960.0, + "grad_norm": 2.4385753885981494, + "language_loss": 0.68213326, + "learning_rate": 1.716842301625806e-06, + "loss": 0.70367396, + "num_input_tokens_seen": 200114185, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.12609863, + "step": 9295, + "time_per_iteration": 2.4465830326080322 + }, + { + "auxiliary_loss_clip": 0.01129665, + "auxiliary_loss_mlp": 0.01037971, + "balance_loss_clip": 1.05337024, + "balance_loss_mlp": 1.02516174, + "epoch": 0.5589057568014429, + "flos": 24350774492160.0, + "grad_norm": 1.521109174203552, + "language_loss": 0.80341935, + "learning_rate": 1.7164567683700281e-06, + "loss": 0.82509565, + "num_input_tokens_seen": 200135030, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.12805176, + "step": 9296, + "time_per_iteration": 2.540578842163086 + }, + { + "auxiliary_loss_clip": 0.01123822, + "auxiliary_loss_mlp": 0.01033352, + "balance_loss_clip": 1.04869604, + "balance_loss_mlp": 1.02095377, + "epoch": 0.558965880054111, + "flos": 21105168359040.0, + "grad_norm": 1.7042585842592115, + "language_loss": 0.65506732, + "learning_rate": 1.7160712458662379e-06, + "loss": 0.67663908, + "num_input_tokens_seen": 200154290, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.12402344, + "step": 9297, + "time_per_iteration": 2.5363996028900146 + }, + { + "auxiliary_loss_clip": 0.01127629, + "auxiliary_loss_mlp": 0.01036102, + "balance_loss_clip": 1.04935229, + "balance_loss_mlp": 1.02248836, + "epoch": 0.5590260033067789, + "flos": 18436071513600.0, + "grad_norm": 1.6032085515675298, + "language_loss": 0.75360513, + "learning_rate": 1.7156857341290544e-06, + "loss": 0.77524245, + "num_input_tokens_seen": 200171555, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.13616943, + "step": 9298, + "time_per_iteration": 2.5449442863464355 + }, + { + "auxiliary_loss_clip": 0.01058718, + "auxiliary_loss_mlp": 0.01001123, + "balance_loss_clip": 1.03308845, + "balance_loss_mlp": 0.99980831, + "epoch": 0.5590861265594469, + "flos": 70577432490240.0, + "grad_norm": 0.681773353601689, + "language_loss": 0.52421987, + "learning_rate": 1.7153002331730967e-06, + "loss": 0.54481828, + "num_input_tokens_seen": 200237010, + "router_z_loss_clip": 0.25634766, + "router_z_loss_mlp": 0.01315308, + "step": 9299, + "time_per_iteration": 3.1598174571990967 + }, + { + "auxiliary_loss_clip": 0.01133751, + "auxiliary_loss_mlp": 0.01030746, + "balance_loss_clip": 1.05582821, + "balance_loss_mlp": 1.01872993, + "epoch": 0.5591462498121148, + "flos": 30664408896000.0, + "grad_norm": 2.4239892428159164, + "language_loss": 0.69068754, + "learning_rate": 1.7149147430129824e-06, + "loss": 0.71233255, + "num_input_tokens_seen": 200260820, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.11999512, + "step": 9300, + "time_per_iteration": 2.596947193145752 + }, + { + "auxiliary_loss_clip": 0.01128234, + "auxiliary_loss_mlp": 0.01044941, + "balance_loss_clip": 1.04730821, + "balance_loss_mlp": 1.03132105, + "epoch": 0.5592063730647828, + "flos": 18150438562560.0, + "grad_norm": 2.864312805737601, + "language_loss": 0.82113063, + "learning_rate": 1.7145292636633293e-06, + "loss": 0.84286237, + "num_input_tokens_seen": 200278035, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.13616943, + "step": 9301, + "time_per_iteration": 3.9399361610412598 + }, + { + "auxiliary_loss_clip": 0.01124084, + "auxiliary_loss_mlp": 0.01030564, + "balance_loss_clip": 1.04514861, + "balance_loss_mlp": 1.01746273, + "epoch": 0.5592664963174507, + "flos": 24060400945920.0, + "grad_norm": 2.859197302315568, + "language_loss": 0.68156421, + "learning_rate": 1.714143795138756e-06, + "loss": 0.70311064, + "num_input_tokens_seen": 200297255, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.13092041, + "step": 9302, + "time_per_iteration": 2.6324503421783447 + }, + { + "auxiliary_loss_clip": 0.01133966, + "auxiliary_loss_mlp": 0.01028672, + "balance_loss_clip": 1.0549705, + "balance_loss_mlp": 1.01605988, + "epoch": 0.5593266195701188, + "flos": 19827897661440.0, + "grad_norm": 1.8745775129378004, + "language_loss": 0.71270514, + "learning_rate": 1.713758337453878e-06, + "loss": 0.73433149, + "num_input_tokens_seen": 200317505, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.12615967, + "step": 9303, + "time_per_iteration": 2.505119800567627 + }, + { + "auxiliary_loss_clip": 0.01124003, + "auxiliary_loss_mlp": 0.01029499, + "balance_loss_clip": 1.05210626, + "balance_loss_mlp": 1.01865649, + "epoch": 0.5593867428227867, + "flos": 25300755440640.0, + "grad_norm": 1.6023106340597246, + "language_loss": 0.72630596, + "learning_rate": 1.7133728906233124e-06, + "loss": 0.747841, + "num_input_tokens_seen": 200338350, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.10839844, + "step": 9304, + "time_per_iteration": 2.4729628562927246 + }, + { + "auxiliary_loss_clip": 0.01121354, + "auxiliary_loss_mlp": 0.01031279, + "balance_loss_clip": 1.04544795, + "balance_loss_mlp": 1.01855946, + "epoch": 0.5594468660754547, + "flos": 12933013374720.0, + "grad_norm": 1.8490581699821702, + "language_loss": 0.77635872, + "learning_rate": 1.7129874546616763e-06, + "loss": 0.79788506, + "num_input_tokens_seen": 200353965, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.1270752, + "step": 9305, + "time_per_iteration": 2.4392409324645996 + }, + { + "auxiliary_loss_clip": 0.01120114, + "auxiliary_loss_mlp": 0.01027099, + "balance_loss_clip": 1.04699838, + "balance_loss_mlp": 1.01574469, + "epoch": 0.5595069893281227, + "flos": 19062713208960.0, + "grad_norm": 1.8148167343805481, + "language_loss": 0.69525337, + "learning_rate": 1.7126020295835836e-06, + "loss": 0.71672547, + "num_input_tokens_seen": 200373595, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11358643, + "step": 9306, + "time_per_iteration": 2.4216151237487793 + }, + { + "auxiliary_loss_clip": 0.01055544, + "auxiliary_loss_mlp": 0.01009795, + "balance_loss_clip": 1.02910447, + "balance_loss_mlp": 1.00805187, + "epoch": 0.5595671125807906, + "flos": 70273375862400.0, + "grad_norm": 0.9215116093323351, + "language_loss": 0.60369062, + "learning_rate": 1.7122166154036518e-06, + "loss": 0.62434399, + "num_input_tokens_seen": 200429155, + "router_z_loss_clip": 0.26464844, + "router_z_loss_mlp": 0.01741028, + "step": 9307, + "time_per_iteration": 4.650007486343384 + }, + { + "auxiliary_loss_clip": 0.01127374, + "auxiliary_loss_mlp": 0.01043875, + "balance_loss_clip": 1.04780591, + "balance_loss_mlp": 1.03118491, + "epoch": 0.5596272358334586, + "flos": 20665513889280.0, + "grad_norm": 1.5828074937076866, + "language_loss": 0.73952192, + "learning_rate": 1.7118312121364943e-06, + "loss": 0.7612344, + "num_input_tokens_seen": 200448290, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.12677002, + "step": 9308, + "time_per_iteration": 2.497211217880249 + }, + { + "auxiliary_loss_clip": 0.01127855, + "auxiliary_loss_mlp": 0.01035348, + "balance_loss_clip": 1.0491693, + "balance_loss_mlp": 1.02193117, + "epoch": 0.5596873590861265, + "flos": 25041013217280.0, + "grad_norm": 1.9591798111610628, + "language_loss": 0.70067453, + "learning_rate": 1.7114458197967257e-06, + "loss": 0.72230661, + "num_input_tokens_seen": 200466555, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.13415527, + "step": 9309, + "time_per_iteration": 2.5203652381896973 + }, + { + "auxiliary_loss_clip": 0.01127599, + "auxiliary_loss_mlp": 0.01034022, + "balance_loss_clip": 1.05012119, + "balance_loss_mlp": 1.02054513, + "epoch": 0.5597474823387946, + "flos": 25958387594880.0, + "grad_norm": 1.9519465487415, + "language_loss": 0.74948931, + "learning_rate": 1.7110604383989613e-06, + "loss": 0.77110553, + "num_input_tokens_seen": 200485980, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.13482666, + "step": 9310, + "time_per_iteration": 2.5714168548583984 + }, + { + "auxiliary_loss_clip": 0.01123277, + "auxiliary_loss_mlp": 0.01034205, + "balance_loss_clip": 1.04593194, + "balance_loss_mlp": 1.02070999, + "epoch": 0.5598076055914625, + "flos": 26177442687360.0, + "grad_norm": 2.435474599744453, + "language_loss": 0.70037699, + "learning_rate": 1.7106750679578133e-06, + "loss": 0.72195172, + "num_input_tokens_seen": 200504555, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.13482666, + "step": 9311, + "time_per_iteration": 2.523674726486206 + }, + { + "auxiliary_loss_clip": 0.01119426, + "auxiliary_loss_mlp": 0.01033712, + "balance_loss_clip": 1.04412067, + "balance_loss_mlp": 1.02162457, + "epoch": 0.5598677288441305, + "flos": 11655778590720.0, + "grad_norm": 1.9944654591619801, + "language_loss": 0.72600126, + "learning_rate": 1.7102897084878962e-06, + "loss": 0.74753267, + "num_input_tokens_seen": 200522700, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.12103271, + "step": 9312, + "time_per_iteration": 2.4790775775909424 + }, + { + "auxiliary_loss_clip": 0.01127845, + "auxiliary_loss_mlp": 0.010334, + "balance_loss_clip": 1.05257249, + "balance_loss_mlp": 1.02061462, + "epoch": 0.5599278520967984, + "flos": 22966597941120.0, + "grad_norm": 1.8418801163967784, + "language_loss": 0.89157009, + "learning_rate": 1.709904360003822e-06, + "loss": 0.9131825, + "num_input_tokens_seen": 200541910, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.12805176, + "step": 9313, + "time_per_iteration": 3.9659574031829834 + }, + { + "auxiliary_loss_clip": 0.01125501, + "auxiliary_loss_mlp": 0.01039689, + "balance_loss_clip": 1.05098987, + "balance_loss_mlp": 1.02723217, + "epoch": 0.5599879753494664, + "flos": 21215557831680.0, + "grad_norm": 1.9467832601288126, + "language_loss": 0.77544433, + "learning_rate": 1.709519022520204e-06, + "loss": 0.79709625, + "num_input_tokens_seen": 200562600, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.12469482, + "step": 9314, + "time_per_iteration": 2.619760751724243 + }, + { + "auxiliary_loss_clip": 0.01123924, + "auxiliary_loss_mlp": 0.01028457, + "balance_loss_clip": 1.04931045, + "balance_loss_mlp": 1.01600552, + "epoch": 0.5600480986021343, + "flos": 31903219105920.0, + "grad_norm": 5.899681313718758, + "language_loss": 0.70543015, + "learning_rate": 1.7091336960516537e-06, + "loss": 0.72695398, + "num_input_tokens_seen": 200584795, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.12445068, + "step": 9315, + "time_per_iteration": 2.5479977130889893 + }, + { + "auxiliary_loss_clip": 0.01117474, + "auxiliary_loss_mlp": 0.01035578, + "balance_loss_clip": 1.03975666, + "balance_loss_mlp": 1.02261388, + "epoch": 0.5601082218548024, + "flos": 28476048700800.0, + "grad_norm": 1.6554250567455728, + "language_loss": 0.66868639, + "learning_rate": 1.7087483806127824e-06, + "loss": 0.6902169, + "num_input_tokens_seen": 200606945, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.12963867, + "step": 9316, + "time_per_iteration": 2.530797004699707 + }, + { + "auxiliary_loss_clip": 0.01118563, + "auxiliary_loss_mlp": 0.01032127, + "balance_loss_clip": 1.0438832, + "balance_loss_mlp": 1.01884127, + "epoch": 0.5601683451074703, + "flos": 24097173494400.0, + "grad_norm": 2.5094962247158294, + "language_loss": 0.86645639, + "learning_rate": 1.7083630762182022e-06, + "loss": 0.88796329, + "num_input_tokens_seen": 200626340, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.13287354, + "step": 9317, + "time_per_iteration": 2.4741101264953613 + }, + { + "auxiliary_loss_clip": 0.01124506, + "auxiliary_loss_mlp": 0.01037371, + "balance_loss_clip": 1.04639137, + "balance_loss_mlp": 1.02333975, + "epoch": 0.5602284683601383, + "flos": 26356205698560.0, + "grad_norm": 1.7568532358156148, + "language_loss": 0.77038276, + "learning_rate": 1.7079777828825233e-06, + "loss": 0.79200155, + "num_input_tokens_seen": 200644520, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.14025879, + "step": 9318, + "time_per_iteration": 2.508260488510132 + }, + { + "auxiliary_loss_clip": 0.01118023, + "auxiliary_loss_mlp": 0.01039335, + "balance_loss_clip": 1.04234135, + "balance_loss_mlp": 1.02649593, + "epoch": 0.5602885916128063, + "flos": 24496392228480.0, + "grad_norm": 1.8011713192823422, + "language_loss": 0.76373589, + "learning_rate": 1.7075925006203558e-06, + "loss": 0.78530949, + "num_input_tokens_seen": 200664845, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.1282959, + "step": 9319, + "time_per_iteration": 2.537323474884033 + }, + { + "auxiliary_loss_clip": 0.01118303, + "auxiliary_loss_mlp": 0.01037269, + "balance_loss_clip": 1.0438385, + "balance_loss_mlp": 1.02422702, + "epoch": 0.5603487148654742, + "flos": 27345006270720.0, + "grad_norm": 1.5035023481894307, + "language_loss": 0.85290772, + "learning_rate": 1.7072072294463101e-06, + "loss": 0.87446344, + "num_input_tokens_seen": 200686535, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.13049316, + "step": 9320, + "time_per_iteration": 2.546452283859253 + }, + { + "auxiliary_loss_clip": 0.01053081, + "auxiliary_loss_mlp": 0.01007806, + "balance_loss_clip": 1.02620804, + "balance_loss_mlp": 1.0059495, + "epoch": 0.5604088381181422, + "flos": 54087756180480.0, + "grad_norm": 0.7741885828918346, + "language_loss": 0.52536798, + "learning_rate": 1.706821969374996e-06, + "loss": 0.54597688, + "num_input_tokens_seen": 200736965, + "router_z_loss_clip": 0.26904297, + "router_z_loss_mlp": 0.01855469, + "step": 9321, + "time_per_iteration": 2.883287191390991 + }, + { + "auxiliary_loss_clip": 0.0112206, + "auxiliary_loss_mlp": 0.01029955, + "balance_loss_clip": 1.04795146, + "balance_loss_mlp": 1.01815295, + "epoch": 0.5604689613708101, + "flos": 22236390357120.0, + "grad_norm": 1.3266241338318052, + "language_loss": 0.7469101, + "learning_rate": 1.7064367204210216e-06, + "loss": 0.76843023, + "num_input_tokens_seen": 200757420, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11816406, + "step": 9322, + "time_per_iteration": 3.893934726715088 + }, + { + "auxiliary_loss_clip": 0.01120784, + "auxiliary_loss_mlp": 0.01036325, + "balance_loss_clip": 1.04413152, + "balance_loss_mlp": 1.0228008, + "epoch": 0.5605290846234782, + "flos": 35297782940160.0, + "grad_norm": 1.7772617336043708, + "language_loss": 0.73898798, + "learning_rate": 1.7060514825989963e-06, + "loss": 0.76055908, + "num_input_tokens_seen": 200779520, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.13525391, + "step": 9323, + "time_per_iteration": 2.651148796081543 + }, + { + "auxiliary_loss_clip": 0.0113032, + "auxiliary_loss_mlp": 0.01029713, + "balance_loss_clip": 1.05123353, + "balance_loss_mlp": 1.01683223, + "epoch": 0.5605892078761461, + "flos": 20263314326400.0, + "grad_norm": 1.8188404435023597, + "language_loss": 0.61859334, + "learning_rate": 1.7056662559235286e-06, + "loss": 0.6401937, + "num_input_tokens_seen": 200799485, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.12890625, + "step": 9324, + "time_per_iteration": 2.541910171508789 + }, + { + "auxiliary_loss_clip": 0.01125965, + "auxiliary_loss_mlp": 0.0103163, + "balance_loss_clip": 1.04760456, + "balance_loss_mlp": 1.01843309, + "epoch": 0.5606493311288141, + "flos": 17308333134720.0, + "grad_norm": 1.8125261888280326, + "language_loss": 0.8789351, + "learning_rate": 1.705281040409226e-06, + "loss": 0.90051109, + "num_input_tokens_seen": 200817540, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.13208008, + "step": 9325, + "time_per_iteration": 2.4374420642852783 + }, + { + "auxiliary_loss_clip": 0.01131584, + "auxiliary_loss_mlp": 0.01029889, + "balance_loss_clip": 1.05309558, + "balance_loss_mlp": 1.01654994, + "epoch": 0.560709454381482, + "flos": 21652985658240.0, + "grad_norm": 2.1385386562469795, + "language_loss": 0.74163014, + "learning_rate": 1.7048958360706952e-06, + "loss": 0.76324487, + "num_input_tokens_seen": 200838380, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.13330078, + "step": 9326, + "time_per_iteration": 2.4428250789642334 + }, + { + "auxiliary_loss_clip": 0.01134345, + "auxiliary_loss_mlp": 0.01029573, + "balance_loss_clip": 1.05527961, + "balance_loss_mlp": 1.01582205, + "epoch": 0.56076957763415, + "flos": 20303355012480.0, + "grad_norm": 1.9313546359003078, + "language_loss": 0.78157371, + "learning_rate": 1.7045106429225447e-06, + "loss": 0.80321288, + "num_input_tokens_seen": 200855640, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.13745117, + "step": 9327, + "time_per_iteration": 2.4490575790405273 + }, + { + "auxiliary_loss_clip": 0.01129876, + "auxiliary_loss_mlp": 0.01029186, + "balance_loss_clip": 1.05432415, + "balance_loss_mlp": 1.01608515, + "epoch": 0.5608297008868179, + "flos": 25045897466880.0, + "grad_norm": 1.8364139721858834, + "language_loss": 0.78458863, + "learning_rate": 1.7041254609793795e-06, + "loss": 0.80617923, + "num_input_tokens_seen": 200876585, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.13110352, + "step": 9328, + "time_per_iteration": 2.5067811012268066 + }, + { + "auxiliary_loss_clip": 0.01121107, + "auxiliary_loss_mlp": 0.01029488, + "balance_loss_clip": 1.04477108, + "balance_loss_mlp": 1.01712012, + "epoch": 0.560889824139486, + "flos": 19866825025920.0, + "grad_norm": 1.4456792062890582, + "language_loss": 0.73437196, + "learning_rate": 1.7037402902558066e-06, + "loss": 0.75587797, + "num_input_tokens_seen": 200898175, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.12371826, + "step": 9329, + "time_per_iteration": 2.557168483734131 + }, + { + "auxiliary_loss_clip": 0.01127483, + "auxiliary_loss_mlp": 0.01033317, + "balance_loss_clip": 1.04697967, + "balance_loss_mlp": 1.01993537, + "epoch": 0.5609499473921539, + "flos": 22929394429440.0, + "grad_norm": 1.6249540663426747, + "language_loss": 0.83433425, + "learning_rate": 1.7033551307664324e-06, + "loss": 0.85594225, + "num_input_tokens_seen": 200917515, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.13391113, + "step": 9330, + "time_per_iteration": 2.4456138610839844 + }, + { + "auxiliary_loss_clip": 0.01052166, + "auxiliary_loss_mlp": 0.01001649, + "balance_loss_clip": 1.02567458, + "balance_loss_mlp": 1.00048697, + "epoch": 0.5610100706448219, + "flos": 53035825455360.0, + "grad_norm": 0.713210416211112, + "language_loss": 0.57841831, + "learning_rate": 1.7029699825258603e-06, + "loss": 0.59895647, + "num_input_tokens_seen": 200978615, + "router_z_loss_clip": 0.26513672, + "router_z_loss_mlp": 0.01161194, + "step": 9331, + "time_per_iteration": 3.143541097640991 + }, + { + "auxiliary_loss_clip": 0.01118121, + "auxiliary_loss_mlp": 0.01038317, + "balance_loss_clip": 1.04161906, + "balance_loss_mlp": 1.02453601, + "epoch": 0.5610701938974898, + "flos": 21834944979840.0, + "grad_norm": 2.0092330095010467, + "language_loss": 0.82155704, + "learning_rate": 1.7025848455486971e-06, + "loss": 0.84312153, + "num_input_tokens_seen": 200997745, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.13775635, + "step": 9332, + "time_per_iteration": 2.445796251296997 + }, + { + "auxiliary_loss_clip": 0.01129963, + "auxiliary_loss_mlp": 0.01034306, + "balance_loss_clip": 1.04980874, + "balance_loss_mlp": 1.02028131, + "epoch": 0.5611303171501578, + "flos": 17457183095040.0, + "grad_norm": 1.8910720461697417, + "language_loss": 0.81664681, + "learning_rate": 1.7021997198495454e-06, + "loss": 0.8382895, + "num_input_tokens_seen": 201016370, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.14025879, + "step": 9333, + "time_per_iteration": 2.4348349571228027 + }, + { + "auxiliary_loss_clip": 0.0112931, + "auxiliary_loss_mlp": 0.01029188, + "balance_loss_clip": 1.05250168, + "balance_loss_mlp": 1.01698732, + "epoch": 0.5611904404028258, + "flos": 22637799820800.0, + "grad_norm": 1.8895224723264161, + "language_loss": 0.72938776, + "learning_rate": 1.7018146054430108e-06, + "loss": 0.75097275, + "num_input_tokens_seen": 201034310, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.12213135, + "step": 9334, + "time_per_iteration": 2.448566436767578 + }, + { + "auxiliary_loss_clip": 0.01124644, + "auxiliary_loss_mlp": 0.0103503, + "balance_loss_clip": 1.04712832, + "balance_loss_mlp": 1.02185142, + "epoch": 0.5612505636554938, + "flos": 14316327999360.0, + "grad_norm": 1.7254093176559344, + "language_loss": 0.70792973, + "learning_rate": 1.7014295023436961e-06, + "loss": 0.72952646, + "num_input_tokens_seen": 201052030, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.1317749, + "step": 9335, + "time_per_iteration": 2.4438087940216064 + }, + { + "auxiliary_loss_clip": 0.01124992, + "auxiliary_loss_mlp": 0.01032024, + "balance_loss_clip": 1.04512751, + "balance_loss_mlp": 1.01846957, + "epoch": 0.5613106869081618, + "flos": 16508279554560.0, + "grad_norm": 1.635617408053412, + "language_loss": 0.77038223, + "learning_rate": 1.701044410566205e-06, + "loss": 0.79195237, + "num_input_tokens_seen": 201068445, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.13562012, + "step": 9336, + "time_per_iteration": 2.426215410232544 + }, + { + "auxiliary_loss_clip": 0.01116933, + "auxiliary_loss_mlp": 0.01031989, + "balance_loss_clip": 1.04264951, + "balance_loss_mlp": 1.0197643, + "epoch": 0.5613708101608297, + "flos": 24058569352320.0, + "grad_norm": 2.2911989442175194, + "language_loss": 0.64298683, + "learning_rate": 1.7006593301251393e-06, + "loss": 0.66447604, + "num_input_tokens_seen": 201082140, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12213135, + "step": 9337, + "time_per_iteration": 2.5219991207122803 + }, + { + "auxiliary_loss_clip": 0.01062198, + "auxiliary_loss_mlp": 0.01005276, + "balance_loss_clip": 1.03572798, + "balance_loss_mlp": 1.00397193, + "epoch": 0.5614309334134977, + "flos": 64905735997440.0, + "grad_norm": 0.9670004542318602, + "language_loss": 0.6253038, + "learning_rate": 1.700274261035102e-06, + "loss": 0.64597857, + "num_input_tokens_seen": 201137245, + "router_z_loss_clip": 0.26464844, + "router_z_loss_mlp": 0.01304626, + "step": 9338, + "time_per_iteration": 3.0839476585388184 + }, + { + "auxiliary_loss_clip": 0.0112895, + "auxiliary_loss_mlp": 0.01031498, + "balance_loss_clip": 1.05051184, + "balance_loss_mlp": 1.01915419, + "epoch": 0.5614910566661656, + "flos": 32919849740160.0, + "grad_norm": 1.8889522476782286, + "language_loss": 0.65818357, + "learning_rate": 1.6998892033106946e-06, + "loss": 0.67978799, + "num_input_tokens_seen": 201157270, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.12335205, + "step": 9339, + "time_per_iteration": 2.575899600982666 + }, + { + "auxiliary_loss_clip": 0.01121659, + "auxiliary_loss_mlp": 0.01033648, + "balance_loss_clip": 1.04684389, + "balance_loss_mlp": 1.01943219, + "epoch": 0.5615511799188336, + "flos": 18588871969920.0, + "grad_norm": 1.776612713878075, + "language_loss": 0.70115614, + "learning_rate": 1.6995041569665184e-06, + "loss": 0.72270918, + "num_input_tokens_seen": 201174530, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.14233398, + "step": 9340, + "time_per_iteration": 2.4385712146759033 + }, + { + "auxiliary_loss_clip": 0.01119359, + "auxiliary_loss_mlp": 0.01030175, + "balance_loss_clip": 1.04609656, + "balance_loss_mlp": 1.0182724, + "epoch": 0.5616113031715015, + "flos": 22820010537600.0, + "grad_norm": 1.5431429749055912, + "language_loss": 0.77050054, + "learning_rate": 1.6991191220171756e-06, + "loss": 0.79199588, + "num_input_tokens_seen": 201194905, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11895752, + "step": 9341, + "time_per_iteration": 2.4810876846313477 + }, + { + "auxiliary_loss_clip": 0.01116568, + "auxiliary_loss_mlp": 0.01032731, + "balance_loss_clip": 1.04060817, + "balance_loss_mlp": 1.01967764, + "epoch": 0.5616714264241696, + "flos": 22345702421760.0, + "grad_norm": 1.6182963407181696, + "language_loss": 0.7963748, + "learning_rate": 1.6987340984772653e-06, + "loss": 0.81786782, + "num_input_tokens_seen": 201213715, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.13049316, + "step": 9342, + "time_per_iteration": 2.5557355880737305 + }, + { + "auxiliary_loss_clip": 0.01123453, + "auxiliary_loss_mlp": 0.01033367, + "balance_loss_clip": 1.04420877, + "balance_loss_mlp": 1.02008128, + "epoch": 0.5617315496768375, + "flos": 18807783408000.0, + "grad_norm": 1.9806267552317167, + "language_loss": 0.76524127, + "learning_rate": 1.6983490863613882e-06, + "loss": 0.7868095, + "num_input_tokens_seen": 201231415, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.13287354, + "step": 9343, + "time_per_iteration": 2.492772340774536 + }, + { + "auxiliary_loss_clip": 0.01126346, + "auxiliary_loss_mlp": 0.01040103, + "balance_loss_clip": 1.04977679, + "balance_loss_mlp": 1.0257796, + "epoch": 0.5617916729295055, + "flos": 18369314087040.0, + "grad_norm": 2.2572996811258834, + "language_loss": 0.68833268, + "learning_rate": 1.6979640856841442e-06, + "loss": 0.70999718, + "num_input_tokens_seen": 201249625, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.14337158, + "step": 9344, + "time_per_iteration": 3.904275417327881 + }, + { + "auxiliary_loss_clip": 0.01123738, + "auxiliary_loss_mlp": 0.01039439, + "balance_loss_clip": 1.04487586, + "balance_loss_mlp": 1.02547359, + "epoch": 0.5618517961821734, + "flos": 28179964892160.0, + "grad_norm": 2.157325347161358, + "language_loss": 0.66902566, + "learning_rate": 1.6975790964601318e-06, + "loss": 0.69065738, + "num_input_tokens_seen": 201271205, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.1395874, + "step": 9345, + "time_per_iteration": 2.506963014602661 + }, + { + "auxiliary_loss_clip": 0.01124036, + "auxiliary_loss_mlp": 0.01030597, + "balance_loss_clip": 1.04612064, + "balance_loss_mlp": 1.01795447, + "epoch": 0.5619119194348414, + "flos": 15486872411520.0, + "grad_norm": 2.0142651429483047, + "language_loss": 0.87828779, + "learning_rate": 1.6971941187039512e-06, + "loss": 0.89983404, + "num_input_tokens_seen": 201287700, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.12646484, + "step": 9346, + "time_per_iteration": 2.480253219604492 + }, + { + "auxiliary_loss_clip": 0.01133286, + "auxiliary_loss_mlp": 0.01038157, + "balance_loss_clip": 1.05396569, + "balance_loss_mlp": 1.0245316, + "epoch": 0.5619720426875094, + "flos": 29128652951040.0, + "grad_norm": 2.063788354375038, + "language_loss": 0.59085673, + "learning_rate": 1.6968091524301993e-06, + "loss": 0.61257124, + "num_input_tokens_seen": 201307530, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.13641357, + "step": 9347, + "time_per_iteration": 2.5768723487854004 + }, + { + "auxiliary_loss_clip": 0.01129019, + "auxiliary_loss_mlp": 0.01032662, + "balance_loss_clip": 1.05166435, + "balance_loss_mlp": 1.01854157, + "epoch": 0.5620321659401774, + "flos": 18003743418240.0, + "grad_norm": 2.462700626721686, + "language_loss": 0.694152, + "learning_rate": 1.6964241976534745e-06, + "loss": 0.71576881, + "num_input_tokens_seen": 201326210, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.14105225, + "step": 9348, + "time_per_iteration": 2.4485929012298584 + }, + { + "auxiliary_loss_clip": 0.01124558, + "auxiliary_loss_mlp": 0.01025435, + "balance_loss_clip": 1.04320407, + "balance_loss_mlp": 1.01154137, + "epoch": 0.5620922891928454, + "flos": 20594518657920.0, + "grad_norm": 2.0825421492575322, + "language_loss": 0.79339898, + "learning_rate": 1.6960392543883754e-06, + "loss": 0.81489885, + "num_input_tokens_seen": 201346120, + "router_z_loss_clip": 0.81347656, + "router_z_loss_mlp": 0.13903809, + "step": 9349, + "time_per_iteration": 2.414562940597534 + }, + { + "auxiliary_loss_clip": 0.01124502, + "auxiliary_loss_mlp": 0.01030353, + "balance_loss_clip": 1.04863644, + "balance_loss_mlp": 1.01705492, + "epoch": 0.5621524124455133, + "flos": 26287006147200.0, + "grad_norm": 2.1847739520368528, + "language_loss": 0.67668998, + "learning_rate": 1.6956543226494975e-06, + "loss": 0.69823849, + "num_input_tokens_seen": 201365700, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.13305664, + "step": 9350, + "time_per_iteration": 4.004393100738525 + }, + { + "auxiliary_loss_clip": 0.011259, + "auxiliary_loss_mlp": 0.01036101, + "balance_loss_clip": 1.04683506, + "balance_loss_mlp": 1.02297044, + "epoch": 0.5622125356981813, + "flos": 12750299867520.0, + "grad_norm": 1.8497618912478775, + "language_loss": 0.78617287, + "learning_rate": 1.6952694024514381e-06, + "loss": 0.8077929, + "num_input_tokens_seen": 201382795, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.13140869, + "step": 9351, + "time_per_iteration": 2.56365704536438 + }, + { + "auxiliary_loss_clip": 0.0112617, + "auxiliary_loss_mlp": 0.01035429, + "balance_loss_clip": 1.04732776, + "balance_loss_mlp": 1.02220845, + "epoch": 0.5622726589508492, + "flos": 23805327490560.0, + "grad_norm": 1.6429573612273516, + "language_loss": 0.5878613, + "learning_rate": 1.6948844938087945e-06, + "loss": 0.60947728, + "num_input_tokens_seen": 201402780, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.13220215, + "step": 9352, + "time_per_iteration": 2.4859797954559326 + }, + { + "auxiliary_loss_clip": 0.01123557, + "auxiliary_loss_mlp": 0.01032253, + "balance_loss_clip": 1.04974782, + "balance_loss_mlp": 1.02004015, + "epoch": 0.5623327822035172, + "flos": 24718212668160.0, + "grad_norm": 1.4272193981995842, + "language_loss": 0.71989244, + "learning_rate": 1.6944995967361604e-06, + "loss": 0.74145055, + "num_input_tokens_seen": 201424140, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.12207031, + "step": 9353, + "time_per_iteration": 2.4873244762420654 + }, + { + "auxiliary_loss_clip": 0.01127379, + "auxiliary_loss_mlp": 0.01031325, + "balance_loss_clip": 1.04849386, + "balance_loss_mlp": 1.01861763, + "epoch": 0.5623929054561851, + "flos": 14019274523520.0, + "grad_norm": 3.1014759436536634, + "language_loss": 0.76054382, + "learning_rate": 1.6941147112481327e-06, + "loss": 0.7821309, + "num_input_tokens_seen": 201439645, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.12701416, + "step": 9354, + "time_per_iteration": 2.3969757556915283 + }, + { + "auxiliary_loss_clip": 0.01126288, + "auxiliary_loss_mlp": 0.01031567, + "balance_loss_clip": 1.04705596, + "balance_loss_mlp": 1.0187875, + "epoch": 0.5624530287088532, + "flos": 20704405340160.0, + "grad_norm": 1.7596795840369797, + "language_loss": 0.72883278, + "learning_rate": 1.6937298373593056e-06, + "loss": 0.75041127, + "num_input_tokens_seen": 201459970, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.12774658, + "step": 9355, + "time_per_iteration": 2.4878225326538086 + }, + { + "auxiliary_loss_clip": 0.0112383, + "auxiliary_loss_mlp": 0.01029726, + "balance_loss_clip": 1.04748857, + "balance_loss_mlp": 1.01723897, + "epoch": 0.5625131519615211, + "flos": 21470918595840.0, + "grad_norm": 1.4589866897031585, + "language_loss": 0.73200345, + "learning_rate": 1.693344975084274e-06, + "loss": 0.75353897, + "num_input_tokens_seen": 201480055, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.125, + "step": 9356, + "time_per_iteration": 2.460885763168335 + }, + { + "auxiliary_loss_clip": 0.01120441, + "auxiliary_loss_mlp": 0.01028316, + "balance_loss_clip": 1.04681885, + "balance_loss_mlp": 1.01611495, + "epoch": 0.5625732752141891, + "flos": 18698004466560.0, + "grad_norm": 2.7021815238473033, + "language_loss": 0.83306736, + "learning_rate": 1.6929601244376318e-06, + "loss": 0.85455495, + "num_input_tokens_seen": 201497645, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.12194824, + "step": 9357, + "time_per_iteration": 3.967578411102295 + }, + { + "auxiliary_loss_clip": 0.01114678, + "auxiliary_loss_mlp": 0.01030472, + "balance_loss_clip": 1.03942752, + "balance_loss_mlp": 1.01746607, + "epoch": 0.562633398466857, + "flos": 16216900427520.0, + "grad_norm": 2.5537268962950046, + "language_loss": 0.72239268, + "learning_rate": 1.6925752854339722e-06, + "loss": 0.74384421, + "num_input_tokens_seen": 201515455, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.13024902, + "step": 9358, + "time_per_iteration": 2.523599147796631 + }, + { + "auxiliary_loss_clip": 0.01115793, + "auxiliary_loss_mlp": 0.01041299, + "balance_loss_clip": 1.04072428, + "balance_loss_mlp": 1.02842999, + "epoch": 0.562693521719525, + "flos": 22491930689280.0, + "grad_norm": 1.6316009956244917, + "language_loss": 0.78025579, + "learning_rate": 1.6921904580878885e-06, + "loss": 0.80182672, + "num_input_tokens_seen": 201534500, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12878418, + "step": 9359, + "time_per_iteration": 2.5129592418670654 + }, + { + "auxiliary_loss_clip": 0.01124485, + "auxiliary_loss_mlp": 0.01031951, + "balance_loss_clip": 1.04812431, + "balance_loss_mlp": 1.01994097, + "epoch": 0.562753644972193, + "flos": 25331171281920.0, + "grad_norm": 1.6736490750750337, + "language_loss": 0.70270491, + "learning_rate": 1.6918056424139736e-06, + "loss": 0.72426927, + "num_input_tokens_seen": 201553280, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.12017822, + "step": 9360, + "time_per_iteration": 2.490211009979248 + }, + { + "auxiliary_loss_clip": 0.01059142, + "auxiliary_loss_mlp": 0.01004705, + "balance_loss_clip": 1.03274262, + "balance_loss_mlp": 1.00320745, + "epoch": 0.562813768224861, + "flos": 67392622126080.0, + "grad_norm": 0.7760538985300698, + "language_loss": 0.5562588, + "learning_rate": 1.6914208384268197e-06, + "loss": 0.57689726, + "num_input_tokens_seen": 201610030, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.01498413, + "step": 9361, + "time_per_iteration": 3.0071942806243896 + }, + { + "auxiliary_loss_clip": 0.01112499, + "auxiliary_loss_mlp": 0.01034323, + "balance_loss_clip": 1.0401268, + "balance_loss_mlp": 1.02144814, + "epoch": 0.562873891477529, + "flos": 23331163029120.0, + "grad_norm": 1.507209812860859, + "language_loss": 0.82055348, + "learning_rate": 1.691036046141018e-06, + "loss": 0.8420217, + "num_input_tokens_seen": 201628370, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.12872314, + "step": 9362, + "time_per_iteration": 2.4686529636383057 + }, + { + "auxiliary_loss_clip": 0.01116851, + "auxiliary_loss_mlp": 0.01035176, + "balance_loss_clip": 1.04212332, + "balance_loss_mlp": 1.02274275, + "epoch": 0.5629340147301969, + "flos": 38472824805120.0, + "grad_norm": 1.6374181227089826, + "language_loss": 0.74437141, + "learning_rate": 1.6906512655711614e-06, + "loss": 0.76589173, + "num_input_tokens_seen": 201649790, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.12445068, + "step": 9363, + "time_per_iteration": 2.7293457984924316 + }, + { + "auxiliary_loss_clip": 0.01126813, + "auxiliary_loss_mlp": 0.01031801, + "balance_loss_clip": 1.04806256, + "balance_loss_mlp": 1.01871765, + "epoch": 0.5629941379828649, + "flos": 29242023252480.0, + "grad_norm": 1.6919370084531524, + "language_loss": 0.82883167, + "learning_rate": 1.690266496731839e-06, + "loss": 0.85041785, + "num_input_tokens_seen": 201669175, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.13085938, + "step": 9364, + "time_per_iteration": 2.5777788162231445 + }, + { + "auxiliary_loss_clip": 0.0112383, + "auxiliary_loss_mlp": 0.01031703, + "balance_loss_clip": 1.04991949, + "balance_loss_mlp": 1.02022898, + "epoch": 0.5630542612355328, + "flos": 19420885676160.0, + "grad_norm": 2.400176723682962, + "language_loss": 0.6519357, + "learning_rate": 1.689881739637642e-06, + "loss": 0.673491, + "num_input_tokens_seen": 201687000, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11486816, + "step": 9365, + "time_per_iteration": 2.48539662361145 + }, + { + "auxiliary_loss_clip": 0.01130577, + "auxiliary_loss_mlp": 0.01033337, + "balance_loss_clip": 1.04973125, + "balance_loss_mlp": 1.02021766, + "epoch": 0.5631143844882008, + "flos": 22266303408000.0, + "grad_norm": 3.1051799769187034, + "language_loss": 0.81834865, + "learning_rate": 1.6894969943031611e-06, + "loss": 0.83998781, + "num_input_tokens_seen": 201703335, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.13104248, + "step": 9366, + "time_per_iteration": 3.896505117416382 + }, + { + "auxiliary_loss_clip": 0.01119679, + "auxiliary_loss_mlp": 0.01028499, + "balance_loss_clip": 1.04671812, + "balance_loss_mlp": 1.01607752, + "epoch": 0.5631745077408687, + "flos": 22965305051520.0, + "grad_norm": 1.4715731768004425, + "language_loss": 0.7344988, + "learning_rate": 1.6891122607429845e-06, + "loss": 0.75598061, + "num_input_tokens_seen": 201723495, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.12432861, + "step": 9367, + "time_per_iteration": 2.447171926498413 + }, + { + "auxiliary_loss_clip": 0.01050275, + "auxiliary_loss_mlp": 0.01011202, + "balance_loss_clip": 1.02417314, + "balance_loss_mlp": 1.00951827, + "epoch": 0.5632346309935368, + "flos": 65080515576960.0, + "grad_norm": 0.6234249570694983, + "language_loss": 0.53490341, + "learning_rate": 1.6887275389717028e-06, + "loss": 0.55551815, + "num_input_tokens_seen": 201792615, + "router_z_loss_clip": 0.26074219, + "router_z_loss_mlp": 0.01687622, + "step": 9368, + "time_per_iteration": 3.286803960800171 + }, + { + "auxiliary_loss_clip": 0.01122838, + "auxiliary_loss_mlp": 0.01033692, + "balance_loss_clip": 1.04686189, + "balance_loss_mlp": 1.02131832, + "epoch": 0.5632947542462047, + "flos": 23002903612800.0, + "grad_norm": 1.6603689157092647, + "language_loss": 0.69336456, + "learning_rate": 1.6883428290039046e-06, + "loss": 0.71492982, + "num_input_tokens_seen": 201812520, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.1237793, + "step": 9369, + "time_per_iteration": 2.4947869777679443 + }, + { + "auxiliary_loss_clip": 0.01116288, + "auxiliary_loss_mlp": 0.01033586, + "balance_loss_clip": 1.04017758, + "balance_loss_mlp": 1.02129555, + "epoch": 0.5633548774988727, + "flos": 30482593228800.0, + "grad_norm": 1.9802647142087402, + "language_loss": 0.75847238, + "learning_rate": 1.6879581308541763e-06, + "loss": 0.77997112, + "num_input_tokens_seen": 201834185, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.12286377, + "step": 9370, + "time_per_iteration": 2.578400135040283 + }, + { + "auxiliary_loss_clip": 0.0112351, + "auxiliary_loss_mlp": 0.0103299, + "balance_loss_clip": 1.04536796, + "balance_loss_mlp": 1.01878035, + "epoch": 0.5634150007515406, + "flos": 18515039564160.0, + "grad_norm": 2.19047613623197, + "language_loss": 0.75885618, + "learning_rate": 1.687573444537108e-06, + "loss": 0.78042114, + "num_input_tokens_seen": 201851305, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.14221191, + "step": 9371, + "time_per_iteration": 2.4207711219787598 + }, + { + "auxiliary_loss_clip": 0.01113698, + "auxiliary_loss_mlp": 0.01032563, + "balance_loss_clip": 1.03976178, + "balance_loss_mlp": 1.02082062, + "epoch": 0.5634751240042086, + "flos": 19244672530560.0, + "grad_norm": 1.7755629062407832, + "language_loss": 0.76122367, + "learning_rate": 1.687188770067285e-06, + "loss": 0.78268629, + "num_input_tokens_seen": 201870350, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11737061, + "step": 9372, + "time_per_iteration": 2.473872661590576 + }, + { + "auxiliary_loss_clip": 0.01113132, + "auxiliary_loss_mlp": 0.01032271, + "balance_loss_clip": 1.04080856, + "balance_loss_mlp": 1.02018332, + "epoch": 0.5635352472568766, + "flos": 12020630987520.0, + "grad_norm": 2.33957000222441, + "language_loss": 0.71606362, + "learning_rate": 1.6868041074592956e-06, + "loss": 0.73751771, + "num_input_tokens_seen": 201886800, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.12091064, + "step": 9373, + "time_per_iteration": 2.4677236080169678 + }, + { + "auxiliary_loss_clip": 0.01121519, + "auxiliary_loss_mlp": 0.01032793, + "balance_loss_clip": 1.04591298, + "balance_loss_mlp": 1.01932824, + "epoch": 0.5635953705095446, + "flos": 21871645701120.0, + "grad_norm": 2.085362074652458, + "language_loss": 0.82749045, + "learning_rate": 1.6864194567277264e-06, + "loss": 0.84903353, + "num_input_tokens_seen": 201904730, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.13470459, + "step": 9374, + "time_per_iteration": 2.4555561542510986 + }, + { + "auxiliary_loss_clip": 0.01122102, + "auxiliary_loss_mlp": 0.0102942, + "balance_loss_clip": 1.04581869, + "balance_loss_mlp": 1.01724291, + "epoch": 0.5636554937622126, + "flos": 27126166659840.0, + "grad_norm": 1.5659819969826536, + "language_loss": 0.66402137, + "learning_rate": 1.6860348178871618e-06, + "loss": 0.68553662, + "num_input_tokens_seen": 201924850, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.12164307, + "step": 9375, + "time_per_iteration": 2.5060882568359375 + }, + { + "auxiliary_loss_clip": 0.01126037, + "auxiliary_loss_mlp": 0.01037829, + "balance_loss_clip": 1.04896557, + "balance_loss_mlp": 1.0258069, + "epoch": 0.5637156170148805, + "flos": 12926405272320.0, + "grad_norm": 2.2135258491185406, + "language_loss": 0.81287259, + "learning_rate": 1.6856501909521889e-06, + "loss": 0.83451128, + "num_input_tokens_seen": 201939500, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.12042236, + "step": 9376, + "time_per_iteration": 2.4744954109191895 + }, + { + "auxiliary_loss_clip": 0.0112256, + "auxiliary_loss_mlp": 0.01032995, + "balance_loss_clip": 1.04468679, + "balance_loss_mlp": 1.02013791, + "epoch": 0.5637757402675485, + "flos": 45551033130240.0, + "grad_norm": 1.4061468569462072, + "language_loss": 0.69378269, + "learning_rate": 1.6852655759373925e-06, + "loss": 0.71533823, + "num_input_tokens_seen": 201963000, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.12841797, + "step": 9377, + "time_per_iteration": 2.651381015777588 + }, + { + "auxiliary_loss_clip": 0.0112478, + "auxiliary_loss_mlp": 0.01035505, + "balance_loss_clip": 1.05156946, + "balance_loss_mlp": 1.02277946, + "epoch": 0.5638358635202164, + "flos": 20886041439360.0, + "grad_norm": 1.553622357613345, + "language_loss": 0.74779296, + "learning_rate": 1.6848809728573565e-06, + "loss": 0.76939577, + "num_input_tokens_seen": 201983145, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.12731934, + "step": 9378, + "time_per_iteration": 2.4717679023742676 + }, + { + "auxiliary_loss_clip": 0.01122726, + "auxiliary_loss_mlp": 0.01038445, + "balance_loss_clip": 1.04166806, + "balance_loss_mlp": 1.02301931, + "epoch": 0.5638959867728844, + "flos": 18806562345600.0, + "grad_norm": 2.673252525546608, + "language_loss": 0.81702745, + "learning_rate": 1.6844963817266656e-06, + "loss": 0.83863914, + "num_input_tokens_seen": 202000335, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.1541748, + "step": 9379, + "time_per_iteration": 2.410052537918091 + }, + { + "auxiliary_loss_clip": 0.0112622, + "auxiliary_loss_mlp": 0.01033884, + "balance_loss_clip": 1.04764676, + "balance_loss_mlp": 1.02158189, + "epoch": 0.5639561100255523, + "flos": 27490336698240.0, + "grad_norm": 2.118979051295456, + "language_loss": 0.71444786, + "learning_rate": 1.6841118025599042e-06, + "loss": 0.736049, + "num_input_tokens_seen": 202018275, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.12322998, + "step": 9380, + "time_per_iteration": 2.4786436557769775 + }, + { + "auxiliary_loss_clip": 0.01136001, + "auxiliary_loss_mlp": 0.01033508, + "balance_loss_clip": 1.05618048, + "balance_loss_mlp": 1.02019835, + "epoch": 0.5640162332782204, + "flos": 18076570243200.0, + "grad_norm": 1.9599433136217546, + "language_loss": 0.7428996, + "learning_rate": 1.6837272353716542e-06, + "loss": 0.76459467, + "num_input_tokens_seen": 202034330, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.13299561, + "step": 9381, + "time_per_iteration": 2.424362897872925 + }, + { + "auxiliary_loss_clip": 0.0112486, + "auxiliary_loss_mlp": 0.01032338, + "balance_loss_clip": 1.04913449, + "balance_loss_mlp": 1.02039361, + "epoch": 0.5640763565308883, + "flos": 20884856290560.0, + "grad_norm": 2.2589800452159388, + "language_loss": 0.72257549, + "learning_rate": 1.683342680176499e-06, + "loss": 0.74414742, + "num_input_tokens_seen": 202053100, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.11938477, + "step": 9382, + "time_per_iteration": 2.526113748550415 + }, + { + "auxiliary_loss_clip": 0.01069606, + "auxiliary_loss_mlp": 0.01013551, + "balance_loss_clip": 1.04279232, + "balance_loss_mlp": 1.01206982, + "epoch": 0.5641364797835563, + "flos": 64447912224000.0, + "grad_norm": 0.7218013355337484, + "language_loss": 0.54349887, + "learning_rate": 1.682958136989022e-06, + "loss": 0.56433046, + "num_input_tokens_seen": 202120125, + "router_z_loss_clip": 0.26855469, + "router_z_loss_mlp": 0.01481628, + "step": 9383, + "time_per_iteration": 3.2180166244506836 + }, + { + "auxiliary_loss_clip": 0.01124176, + "auxiliary_loss_mlp": 0.01030368, + "balance_loss_clip": 1.04585457, + "balance_loss_mlp": 1.01737976, + "epoch": 0.5641966030362242, + "flos": 18660944609280.0, + "grad_norm": 1.6628075631698336, + "language_loss": 0.70738918, + "learning_rate": 1.6825736058238033e-06, + "loss": 0.72893465, + "num_input_tokens_seen": 202138030, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.12976074, + "step": 9384, + "time_per_iteration": 2.447760581970215 + }, + { + "auxiliary_loss_clip": 0.01125599, + "auxiliary_loss_mlp": 0.01031727, + "balance_loss_clip": 1.04816997, + "balance_loss_mlp": 1.01827455, + "epoch": 0.5642567262888922, + "flos": 22492325738880.0, + "grad_norm": 1.8950669838809995, + "language_loss": 0.76142383, + "learning_rate": 1.6821890866954263e-06, + "loss": 0.78299701, + "num_input_tokens_seen": 202155580, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.13452148, + "step": 9385, + "time_per_iteration": 2.5502359867095947 + }, + { + "auxiliary_loss_clip": 0.01125888, + "auxiliary_loss_mlp": 0.01032943, + "balance_loss_clip": 1.0518105, + "balance_loss_mlp": 1.02049708, + "epoch": 0.5643168495415603, + "flos": 13003972692480.0, + "grad_norm": 1.927850842424709, + "language_loss": 0.82708317, + "learning_rate": 1.6818045796184703e-06, + "loss": 0.84867156, + "num_input_tokens_seen": 202170365, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.12445068, + "step": 9386, + "time_per_iteration": 2.480421304702759 + }, + { + "auxiliary_loss_clip": 0.01134434, + "auxiliary_loss_mlp": 0.01038891, + "balance_loss_clip": 1.05252361, + "balance_loss_mlp": 1.02508056, + "epoch": 0.5643769727942282, + "flos": 18588297352320.0, + "grad_norm": 1.8285829657594865, + "language_loss": 0.69839454, + "learning_rate": 1.681420084607516e-06, + "loss": 0.72012782, + "num_input_tokens_seen": 202189095, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.13812256, + "step": 9387, + "time_per_iteration": 2.525970458984375 + }, + { + "auxiliary_loss_clip": 0.01119996, + "auxiliary_loss_mlp": 0.01033207, + "balance_loss_clip": 1.04213071, + "balance_loss_mlp": 1.02050555, + "epoch": 0.5644370960468962, + "flos": 33806269572480.0, + "grad_norm": 1.5482741895735244, + "language_loss": 0.75017071, + "learning_rate": 1.6810356016771452e-06, + "loss": 0.77170277, + "num_input_tokens_seen": 202213500, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.12695312, + "step": 9388, + "time_per_iteration": 4.061593532562256 + }, + { + "auxiliary_loss_clip": 0.01118143, + "auxiliary_loss_mlp": 0.01031668, + "balance_loss_clip": 1.04431581, + "balance_loss_mlp": 1.01998568, + "epoch": 0.5644972192995641, + "flos": 21214911386880.0, + "grad_norm": 1.5588613792325652, + "language_loss": 0.82387865, + "learning_rate": 1.6806511308419353e-06, + "loss": 0.84537673, + "num_input_tokens_seen": 202231920, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11688232, + "step": 9389, + "time_per_iteration": 2.4700398445129395 + }, + { + "auxiliary_loss_clip": 0.01126252, + "auxiliary_loss_mlp": 0.01035452, + "balance_loss_clip": 1.04702687, + "balance_loss_mlp": 1.02147472, + "epoch": 0.5645573425522321, + "flos": 18587722734720.0, + "grad_norm": 2.2240311339367826, + "language_loss": 0.64290988, + "learning_rate": 1.680266672116467e-06, + "loss": 0.66452694, + "num_input_tokens_seen": 202247600, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.13977051, + "step": 9390, + "time_per_iteration": 2.432295799255371 + }, + { + "auxiliary_loss_clip": 0.01130309, + "auxiliary_loss_mlp": 0.01030916, + "balance_loss_clip": 1.05233383, + "balance_loss_mlp": 1.01966882, + "epoch": 0.5646174658049, + "flos": 18113809668480.0, + "grad_norm": 1.8292742342711332, + "language_loss": 0.92173648, + "learning_rate": 1.6798822255153192e-06, + "loss": 0.94334877, + "num_input_tokens_seen": 202265350, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.11248779, + "step": 9391, + "time_per_iteration": 2.475503921508789 + }, + { + "auxiliary_loss_clip": 0.01138646, + "auxiliary_loss_mlp": 0.01032484, + "balance_loss_clip": 1.05591989, + "balance_loss_mlp": 1.01892996, + "epoch": 0.564677589057568, + "flos": 28329964087680.0, + "grad_norm": 2.7175191275332846, + "language_loss": 0.60557824, + "learning_rate": 1.6794977910530684e-06, + "loss": 0.62728953, + "num_input_tokens_seen": 202284285, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.13555908, + "step": 9392, + "time_per_iteration": 2.5097897052764893 + }, + { + "auxiliary_loss_clip": 0.01127825, + "auxiliary_loss_mlp": 0.01030028, + "balance_loss_clip": 1.05012059, + "balance_loss_mlp": 1.01623535, + "epoch": 0.564737712310236, + "flos": 22163743100160.0, + "grad_norm": 2.500505024567279, + "language_loss": 0.81533337, + "learning_rate": 1.6791133687442937e-06, + "loss": 0.83691186, + "num_input_tokens_seen": 202303450, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.13793945, + "step": 9393, + "time_per_iteration": 2.5009865760803223 + }, + { + "auxiliary_loss_clip": 0.01120864, + "auxiliary_loss_mlp": 0.01030319, + "balance_loss_clip": 1.04396045, + "balance_loss_mlp": 1.01793897, + "epoch": 0.564797835562904, + "flos": 20959011918720.0, + "grad_norm": 1.7675183510430534, + "language_loss": 0.87254405, + "learning_rate": 1.6787289586035725e-06, + "loss": 0.89405584, + "num_input_tokens_seen": 202322315, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.1237793, + "step": 9394, + "time_per_iteration": 3.8591713905334473 + }, + { + "auxiliary_loss_clip": 0.01129145, + "auxiliary_loss_mlp": 0.01031289, + "balance_loss_clip": 1.05341399, + "balance_loss_mlp": 1.01896906, + "epoch": 0.5648579588155719, + "flos": 17420302805760.0, + "grad_norm": 1.799696097262039, + "language_loss": 0.84481943, + "learning_rate": 1.6783445606454814e-06, + "loss": 0.86642379, + "num_input_tokens_seen": 202339905, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12316895, + "step": 9395, + "time_per_iteration": 2.436497449874878 + }, + { + "auxiliary_loss_clip": 0.01052559, + "auxiliary_loss_mlp": 0.01005077, + "balance_loss_clip": 1.02603292, + "balance_loss_mlp": 1.00361192, + "epoch": 0.5649180820682399, + "flos": 69929568835200.0, + "grad_norm": 0.795884619997131, + "language_loss": 0.58326471, + "learning_rate": 1.677960174884597e-06, + "loss": 0.60384107, + "num_input_tokens_seen": 202397320, + "router_z_loss_clip": 0.26513672, + "router_z_loss_mlp": 0.01464844, + "step": 9396, + "time_per_iteration": 3.0800282955169678 + }, + { + "auxiliary_loss_clip": 0.01127105, + "auxiliary_loss_mlp": 0.01031112, + "balance_loss_clip": 1.04772556, + "balance_loss_mlp": 1.01827931, + "epoch": 0.5649782053209078, + "flos": 24973070641920.0, + "grad_norm": 1.842652479485552, + "language_loss": 0.70121014, + "learning_rate": 1.6775758013354943e-06, + "loss": 0.72279227, + "num_input_tokens_seen": 202416865, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.1282959, + "step": 9397, + "time_per_iteration": 2.4617831707000732 + }, + { + "auxiliary_loss_clip": 0.01125545, + "auxiliary_loss_mlp": 0.01033501, + "balance_loss_clip": 1.04627144, + "balance_loss_mlp": 1.02110934, + "epoch": 0.5650383285735758, + "flos": 21726602582400.0, + "grad_norm": 1.7867863455590516, + "language_loss": 0.67356813, + "learning_rate": 1.67719144001275e-06, + "loss": 0.6951586, + "num_input_tokens_seen": 202436210, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.1239624, + "step": 9398, + "time_per_iteration": 2.4776668548583984 + }, + { + "auxiliary_loss_clip": 0.01062298, + "auxiliary_loss_mlp": 0.01010056, + "balance_loss_clip": 1.03617048, + "balance_loss_mlp": 1.00841689, + "epoch": 0.5650984518262439, + "flos": 65904484636800.0, + "grad_norm": 0.7602517957853642, + "language_loss": 0.58158088, + "learning_rate": 1.6768070909309386e-06, + "loss": 0.60230446, + "num_input_tokens_seen": 202492925, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.0164032, + "step": 9399, + "time_per_iteration": 3.0517327785491943 + }, + { + "auxiliary_loss_clip": 0.01127829, + "auxiliary_loss_mlp": 0.01031043, + "balance_loss_clip": 1.04881275, + "balance_loss_mlp": 1.01706553, + "epoch": 0.5651585750789118, + "flos": 21032592929280.0, + "grad_norm": 1.8299149564093842, + "language_loss": 0.73303324, + "learning_rate": 1.6764227541046347e-06, + "loss": 0.75462198, + "num_input_tokens_seen": 202511905, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.13970947, + "step": 9400, + "time_per_iteration": 2.6900887489318848 + }, + { + "auxiliary_loss_clip": 0.01121459, + "auxiliary_loss_mlp": 0.01039405, + "balance_loss_clip": 1.04319298, + "balance_loss_mlp": 1.02382457, + "epoch": 0.5652186983315798, + "flos": 18551919853440.0, + "grad_norm": 2.7791439212322073, + "language_loss": 0.60859489, + "learning_rate": 1.676038429548412e-06, + "loss": 0.63020349, + "num_input_tokens_seen": 202529815, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.15588379, + "step": 9401, + "time_per_iteration": 3.8909995555877686 + }, + { + "auxiliary_loss_clip": 0.0111605, + "auxiliary_loss_mlp": 0.01028204, + "balance_loss_clip": 1.04089069, + "balance_loss_mlp": 1.01628852, + "epoch": 0.5652788215842477, + "flos": 18478662065280.0, + "grad_norm": 2.5898737395962934, + "language_loss": 0.81166661, + "learning_rate": 1.6756541172768453e-06, + "loss": 0.83310908, + "num_input_tokens_seen": 202547710, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.1192627, + "step": 9402, + "time_per_iteration": 2.4544715881347656 + }, + { + "auxiliary_loss_clip": 0.01126542, + "auxiliary_loss_mlp": 0.01041591, + "balance_loss_clip": 1.05083346, + "balance_loss_mlp": 1.02701163, + "epoch": 0.5653389448369157, + "flos": 30044052080640.0, + "grad_norm": 2.251178833251342, + "language_loss": 0.78125376, + "learning_rate": 1.6752698173045068e-06, + "loss": 0.80293512, + "num_input_tokens_seen": 202568835, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.14581299, + "step": 9403, + "time_per_iteration": 2.615781307220459 + }, + { + "auxiliary_loss_clip": 0.01129093, + "auxiliary_loss_mlp": 0.01031312, + "balance_loss_clip": 1.05018198, + "balance_loss_mlp": 1.01837802, + "epoch": 0.5653990680895836, + "flos": 16727550128640.0, + "grad_norm": 1.7412412638318617, + "language_loss": 0.68687308, + "learning_rate": 1.6748855296459685e-06, + "loss": 0.70847714, + "num_input_tokens_seen": 202587385, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.1293335, + "step": 9404, + "time_per_iteration": 2.5061049461364746 + }, + { + "auxiliary_loss_clip": 0.011205, + "auxiliary_loss_mlp": 0.01030344, + "balance_loss_clip": 1.04497504, + "balance_loss_mlp": 1.01826191, + "epoch": 0.5654591913422516, + "flos": 14538256179840.0, + "grad_norm": 4.1609024871289035, + "language_loss": 0.67067552, + "learning_rate": 1.6745012543158045e-06, + "loss": 0.69218397, + "num_input_tokens_seen": 202604815, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12091064, + "step": 9405, + "time_per_iteration": 2.536783218383789 + }, + { + "auxiliary_loss_clip": 0.01117799, + "auxiliary_loss_mlp": 0.01029006, + "balance_loss_clip": 1.0461278, + "balance_loss_mlp": 1.01763391, + "epoch": 0.5655193145949196, + "flos": 26209905603840.0, + "grad_norm": 1.7443164762780345, + "language_loss": 0.74445128, + "learning_rate": 1.6741169913285852e-06, + "loss": 0.76591933, + "num_input_tokens_seen": 202623775, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11376953, + "step": 9406, + "time_per_iteration": 2.4978253841400146 + }, + { + "auxiliary_loss_clip": 0.01128579, + "auxiliary_loss_mlp": 0.01036023, + "balance_loss_clip": 1.04982185, + "balance_loss_mlp": 1.02216518, + "epoch": 0.5655794378475876, + "flos": 25046579825280.0, + "grad_norm": 1.8129167012030543, + "language_loss": 0.80290091, + "learning_rate": 1.673732740698882e-06, + "loss": 0.82454699, + "num_input_tokens_seen": 202643375, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.13861084, + "step": 9407, + "time_per_iteration": 2.542346239089966 + }, + { + "auxiliary_loss_clip": 0.01116877, + "auxiliary_loss_mlp": 0.01031782, + "balance_loss_clip": 1.04415143, + "balance_loss_mlp": 1.01991463, + "epoch": 0.5656395611002555, + "flos": 31032852652800.0, + "grad_norm": 1.3354483646571949, + "language_loss": 0.71018124, + "learning_rate": 1.6733485024412666e-06, + "loss": 0.73166788, + "num_input_tokens_seen": 202668400, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11871338, + "step": 9408, + "time_per_iteration": 4.069985628128052 + }, + { + "auxiliary_loss_clip": 0.01123304, + "auxiliary_loss_mlp": 0.0103822, + "balance_loss_clip": 1.04762852, + "balance_loss_mlp": 1.02411795, + "epoch": 0.5656996843529235, + "flos": 20229522606720.0, + "grad_norm": 1.812382222477071, + "language_loss": 0.81108171, + "learning_rate": 1.672964276570308e-06, + "loss": 0.83269691, + "num_input_tokens_seen": 202685125, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.14111328, + "step": 9409, + "time_per_iteration": 2.480656862258911 + }, + { + "auxiliary_loss_clip": 0.01125473, + "auxiliary_loss_mlp": 0.01029963, + "balance_loss_clip": 1.04699147, + "balance_loss_mlp": 1.01724935, + "epoch": 0.5657598076055914, + "flos": 20996251344000.0, + "grad_norm": 1.63609819744688, + "language_loss": 0.78584093, + "learning_rate": 1.6725800631005776e-06, + "loss": 0.80739534, + "num_input_tokens_seen": 202703830, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.12701416, + "step": 9410, + "time_per_iteration": 2.4695794582366943 + }, + { + "auxiliary_loss_clip": 0.01128472, + "auxiliary_loss_mlp": 0.01035935, + "balance_loss_clip": 1.04996014, + "balance_loss_mlp": 1.0235908, + "epoch": 0.5658199308582594, + "flos": 11545999649280.0, + "grad_norm": 2.2971228550060037, + "language_loss": 0.83379376, + "learning_rate": 1.6721958620466432e-06, + "loss": 0.85543787, + "num_input_tokens_seen": 202719835, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.12353516, + "step": 9411, + "time_per_iteration": 2.479914903640747 + }, + { + "auxiliary_loss_clip": 0.01128371, + "auxiliary_loss_mlp": 0.01029796, + "balance_loss_clip": 1.04806411, + "balance_loss_mlp": 1.01616466, + "epoch": 0.5658800541109275, + "flos": 14172146807040.0, + "grad_norm": 2.6014202364528347, + "language_loss": 0.6693809, + "learning_rate": 1.6718116734230749e-06, + "loss": 0.69096255, + "num_input_tokens_seen": 202736795, + "router_z_loss_clip": 0.80273438, + "router_z_loss_mlp": 0.13647461, + "step": 9412, + "time_per_iteration": 2.556119918823242 + }, + { + "auxiliary_loss_clip": 0.01120059, + "auxiliary_loss_mlp": 0.01033737, + "balance_loss_clip": 1.04513979, + "balance_loss_mlp": 1.02219152, + "epoch": 0.5659401773635954, + "flos": 27305073325440.0, + "grad_norm": 1.5375181981718662, + "language_loss": 0.5847671, + "learning_rate": 1.6714274972444413e-06, + "loss": 0.606305, + "num_input_tokens_seen": 202756900, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.11547852, + "step": 9413, + "time_per_iteration": 2.51627779006958 + }, + { + "auxiliary_loss_clip": 0.01122288, + "auxiliary_loss_mlp": 0.01028136, + "balance_loss_clip": 1.04843283, + "balance_loss_mlp": 1.01653731, + "epoch": 0.5660003006162634, + "flos": 16728196573440.0, + "grad_norm": 1.5187975294929208, + "language_loss": 0.69163692, + "learning_rate": 1.6710433335253092e-06, + "loss": 0.7131412, + "num_input_tokens_seen": 202775145, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.1159668, + "step": 9414, + "time_per_iteration": 2.50169038772583 + }, + { + "auxiliary_loss_clip": 0.01125232, + "auxiliary_loss_mlp": 0.01030044, + "balance_loss_clip": 1.05120528, + "balance_loss_mlp": 1.01858759, + "epoch": 0.5660604238689313, + "flos": 21653452535040.0, + "grad_norm": 1.7900868428852807, + "language_loss": 0.78402472, + "learning_rate": 1.670659182280247e-06, + "loss": 0.80557746, + "num_input_tokens_seen": 202794505, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11437988, + "step": 9415, + "time_per_iteration": 2.4847171306610107 + }, + { + "auxiliary_loss_clip": 0.01092898, + "auxiliary_loss_mlp": 0.01010129, + "balance_loss_clip": 1.06636262, + "balance_loss_mlp": 1.0078851, + "epoch": 0.5661205471215993, + "flos": 68824022083200.0, + "grad_norm": 0.6818543719279797, + "language_loss": 0.49183595, + "learning_rate": 1.670275043523822e-06, + "loss": 0.5128662, + "num_input_tokens_seen": 202858580, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.02246094, + "step": 9416, + "time_per_iteration": 3.285374879837036 + }, + { + "auxiliary_loss_clip": 0.01132178, + "auxiliary_loss_mlp": 0.01037194, + "balance_loss_clip": 1.05443811, + "balance_loss_mlp": 1.02389002, + "epoch": 0.5661806703742672, + "flos": 28621774177920.0, + "grad_norm": 2.2576528384219414, + "language_loss": 0.63057822, + "learning_rate": 1.6698909172706e-06, + "loss": 0.65227187, + "num_input_tokens_seen": 202878565, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.13311768, + "step": 9417, + "time_per_iteration": 2.5235891342163086 + }, + { + "auxiliary_loss_clip": 0.01117994, + "auxiliary_loss_mlp": 0.01034275, + "balance_loss_clip": 1.0411272, + "balance_loss_mlp": 1.02202618, + "epoch": 0.5662407936269352, + "flos": 21397948116480.0, + "grad_norm": 1.7662449806299623, + "language_loss": 0.69150543, + "learning_rate": 1.6695068035351479e-06, + "loss": 0.71302819, + "num_input_tokens_seen": 202897350, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.12243652, + "step": 9418, + "time_per_iteration": 2.5313427448272705 + }, + { + "auxiliary_loss_clip": 0.01126426, + "auxiliary_loss_mlp": 0.0103037, + "balance_loss_clip": 1.04856277, + "balance_loss_mlp": 1.01663661, + "epoch": 0.5663009168796032, + "flos": 25660005315840.0, + "grad_norm": 2.0955965717296547, + "language_loss": 0.64525962, + "learning_rate": 1.6691227023320304e-06, + "loss": 0.66682756, + "num_input_tokens_seen": 202916745, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.13726807, + "step": 9419, + "time_per_iteration": 2.479048728942871 + }, + { + "auxiliary_loss_clip": 0.01056093, + "auxiliary_loss_mlp": 0.01006689, + "balance_loss_clip": 1.02956533, + "balance_loss_mlp": 1.00524497, + "epoch": 0.5663610401322712, + "flos": 67930458422400.0, + "grad_norm": 1.1883178763896673, + "language_loss": 0.59706414, + "learning_rate": 1.6687386136758135e-06, + "loss": 0.61769193, + "num_input_tokens_seen": 202982375, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.01445007, + "step": 9420, + "time_per_iteration": 3.1571543216705322 + }, + { + "auxiliary_loss_clip": 0.01118507, + "auxiliary_loss_mlp": 0.01038666, + "balance_loss_clip": 1.04282451, + "balance_loss_mlp": 1.02470708, + "epoch": 0.5664211633849391, + "flos": 24609367480320.0, + "grad_norm": 1.740326728684117, + "language_loss": 0.73874748, + "learning_rate": 1.6683545375810618e-06, + "loss": 0.76031923, + "num_input_tokens_seen": 203002430, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.13964844, + "step": 9421, + "time_per_iteration": 2.502418041229248 + }, + { + "auxiliary_loss_clip": 0.0112923, + "auxiliary_loss_mlp": 0.0103285, + "balance_loss_clip": 1.05082214, + "balance_loss_mlp": 1.02003551, + "epoch": 0.5664812866376071, + "flos": 11648811352320.0, + "grad_norm": 2.161874054338642, + "language_loss": 0.73117363, + "learning_rate": 1.6679704740623389e-06, + "loss": 0.75279438, + "num_input_tokens_seen": 203019425, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.12817383, + "step": 9422, + "time_per_iteration": 2.5094196796417236 + }, + { + "auxiliary_loss_clip": 0.01124454, + "auxiliary_loss_mlp": 0.01033875, + "balance_loss_clip": 1.04948401, + "balance_loss_mlp": 1.02235305, + "epoch": 0.566541409890275, + "flos": 24643985212800.0, + "grad_norm": 1.8901016213920063, + "language_loss": 0.8150869, + "learning_rate": 1.6675864231342085e-06, + "loss": 0.83667022, + "num_input_tokens_seen": 203039035, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11523438, + "step": 9423, + "time_per_iteration": 2.4729573726654053 + }, + { + "auxiliary_loss_clip": 0.01127157, + "auxiliary_loss_mlp": 0.01036795, + "balance_loss_clip": 1.05234385, + "balance_loss_mlp": 1.02440917, + "epoch": 0.566601533142943, + "flos": 22270577126400.0, + "grad_norm": 4.388243129151705, + "language_loss": 0.8075726, + "learning_rate": 1.6672023848112353e-06, + "loss": 0.82921213, + "num_input_tokens_seen": 203059320, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.12390137, + "step": 9424, + "time_per_iteration": 2.525062322616577 + }, + { + "auxiliary_loss_clip": 0.01125756, + "auxiliary_loss_mlp": 0.0103552, + "balance_loss_clip": 1.04738712, + "balance_loss_mlp": 1.02227032, + "epoch": 0.5666616563956111, + "flos": 29971656218880.0, + "grad_norm": 2.104758680546302, + "language_loss": 0.7930336, + "learning_rate": 1.6668183591079805e-06, + "loss": 0.81464636, + "num_input_tokens_seen": 203078490, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.13256836, + "step": 9425, + "time_per_iteration": 2.521284818649292 + }, + { + "auxiliary_loss_clip": 0.01121186, + "auxiliary_loss_mlp": 0.01034475, + "balance_loss_clip": 1.0451014, + "balance_loss_mlp": 1.02168977, + "epoch": 0.566721779648279, + "flos": 17781456101760.0, + "grad_norm": 1.9558893256416126, + "language_loss": 0.59144992, + "learning_rate": 1.6664343460390064e-06, + "loss": 0.61300653, + "num_input_tokens_seen": 203096065, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.12792969, + "step": 9426, + "time_per_iteration": 2.4962353706359863 + }, + { + "auxiliary_loss_clip": 0.01136257, + "auxiliary_loss_mlp": 0.01037992, + "balance_loss_clip": 1.05110741, + "balance_loss_mlp": 1.0248847, + "epoch": 0.566781902900947, + "flos": 21033490769280.0, + "grad_norm": 1.7383204135717591, + "language_loss": 0.81675035, + "learning_rate": 1.6660503456188764e-06, + "loss": 0.83849287, + "num_input_tokens_seen": 203115270, + "router_z_loss_clip": 0.85058594, + "router_z_loss_mlp": 0.13098145, + "step": 9427, + "time_per_iteration": 2.4502007961273193 + }, + { + "auxiliary_loss_clip": 0.01121783, + "auxiliary_loss_mlp": 0.01031252, + "balance_loss_clip": 1.04853702, + "balance_loss_mlp": 1.01855588, + "epoch": 0.5668420261536149, + "flos": 23148593176320.0, + "grad_norm": 2.535867679624623, + "language_loss": 0.86056787, + "learning_rate": 1.6656663578621498e-06, + "loss": 0.8820982, + "num_input_tokens_seen": 203134290, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.12695312, + "step": 9428, + "time_per_iteration": 2.4843897819519043 + }, + { + "auxiliary_loss_clip": 0.01133257, + "auxiliary_loss_mlp": 0.01030144, + "balance_loss_clip": 1.05414712, + "balance_loss_mlp": 1.01768064, + "epoch": 0.5669021494062829, + "flos": 22601601889920.0, + "grad_norm": 2.324901321688561, + "language_loss": 0.73699927, + "learning_rate": 1.6652823827833886e-06, + "loss": 0.75863326, + "num_input_tokens_seen": 203152935, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.12463379, + "step": 9429, + "time_per_iteration": 2.4655086994171143 + }, + { + "auxiliary_loss_clip": 0.01121337, + "auxiliary_loss_mlp": 0.01036794, + "balance_loss_clip": 1.04221988, + "balance_loss_mlp": 1.0231142, + "epoch": 0.5669622726589508, + "flos": 17381231786880.0, + "grad_norm": 1.827044290448607, + "language_loss": 0.75251889, + "learning_rate": 1.6648984203971538e-06, + "loss": 0.77410024, + "num_input_tokens_seen": 203170110, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.13677979, + "step": 9430, + "time_per_iteration": 2.512176990509033 + }, + { + "auxiliary_loss_clip": 0.01121188, + "auxiliary_loss_mlp": 0.01031588, + "balance_loss_clip": 1.0433104, + "balance_loss_mlp": 1.01885056, + "epoch": 0.5670223959116188, + "flos": 18763253521920.0, + "grad_norm": 1.9899681192904555, + "language_loss": 0.72944295, + "learning_rate": 1.6645144707180032e-06, + "loss": 0.75097072, + "num_input_tokens_seen": 203188825, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.12738037, + "step": 9431, + "time_per_iteration": 3.9524457454681396 + }, + { + "auxiliary_loss_clip": 0.0111522, + "auxiliary_loss_mlp": 0.01032204, + "balance_loss_clip": 1.04445934, + "balance_loss_mlp": 1.02088475, + "epoch": 0.5670825191642868, + "flos": 13553334276480.0, + "grad_norm": 1.6899014876544614, + "language_loss": 0.73924798, + "learning_rate": 1.6641305337604984e-06, + "loss": 0.76072216, + "num_input_tokens_seen": 203206860, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11303711, + "step": 9432, + "time_per_iteration": 2.4425368309020996 + }, + { + "auxiliary_loss_clip": 0.01123508, + "auxiliary_loss_mlp": 0.01033122, + "balance_loss_clip": 1.04528439, + "balance_loss_mlp": 1.02082562, + "epoch": 0.5671426424169548, + "flos": 22054035985920.0, + "grad_norm": 1.6677743181959437, + "language_loss": 0.77989948, + "learning_rate": 1.663746609539197e-06, + "loss": 0.80146575, + "num_input_tokens_seen": 203225625, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.1229248, + "step": 9433, + "time_per_iteration": 2.491861581802368 + }, + { + "auxiliary_loss_clip": 0.01131887, + "auxiliary_loss_mlp": 0.01036112, + "balance_loss_clip": 1.05081797, + "balance_loss_mlp": 1.0216881, + "epoch": 0.5672027656696227, + "flos": 21323972056320.0, + "grad_norm": 1.9611006462785268, + "language_loss": 0.6362347, + "learning_rate": 1.6633626980686582e-06, + "loss": 0.65791476, + "num_input_tokens_seen": 203242920, + "router_z_loss_clip": 0.81201172, + "router_z_loss_mlp": 0.14440918, + "step": 9434, + "time_per_iteration": 2.4659602642059326 + }, + { + "auxiliary_loss_clip": 0.01126048, + "auxiliary_loss_mlp": 0.0103011, + "balance_loss_clip": 1.05223179, + "balance_loss_mlp": 1.01804006, + "epoch": 0.5672628889222907, + "flos": 23514056104320.0, + "grad_norm": 1.6501942312957323, + "language_loss": 0.66815591, + "learning_rate": 1.6629787993634399e-06, + "loss": 0.68971753, + "num_input_tokens_seen": 203261995, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.12078857, + "step": 9435, + "time_per_iteration": 2.5111215114593506 + }, + { + "auxiliary_loss_clip": 0.01117628, + "auxiliary_loss_mlp": 0.01030118, + "balance_loss_clip": 1.04263151, + "balance_loss_mlp": 1.01755357, + "epoch": 0.5673230121749586, + "flos": 27121928855040.0, + "grad_norm": 2.2727707614030725, + "language_loss": 0.71800375, + "learning_rate": 1.6625949134380984e-06, + "loss": 0.73948121, + "num_input_tokens_seen": 203280670, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.12567139, + "step": 9436, + "time_per_iteration": 2.4775562286376953 + }, + { + "auxiliary_loss_clip": 0.01124552, + "auxiliary_loss_mlp": 0.01038513, + "balance_loss_clip": 1.04633141, + "balance_loss_mlp": 1.02388024, + "epoch": 0.5673831354276266, + "flos": 31141985149440.0, + "grad_norm": 1.5427919519717193, + "language_loss": 0.741763, + "learning_rate": 1.6622110403071921e-06, + "loss": 0.76339364, + "num_input_tokens_seen": 203304800, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.14630127, + "step": 9437, + "time_per_iteration": 2.5803205966949463 + }, + { + "auxiliary_loss_clip": 0.01137438, + "auxiliary_loss_mlp": 0.01039202, + "balance_loss_clip": 1.05442309, + "balance_loss_mlp": 1.02496243, + "epoch": 0.5674432586802945, + "flos": 27673193859840.0, + "grad_norm": 1.7783132626258762, + "language_loss": 0.61053324, + "learning_rate": 1.661827179985277e-06, + "loss": 0.63229966, + "num_input_tokens_seen": 203324060, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.14233398, + "step": 9438, + "time_per_iteration": 4.065711975097656 + }, + { + "auxiliary_loss_clip": 0.01127158, + "auxiliary_loss_mlp": 0.01028837, + "balance_loss_clip": 1.04734349, + "balance_loss_mlp": 1.01637995, + "epoch": 0.5675033819329626, + "flos": 26615157822720.0, + "grad_norm": 1.670881006486617, + "language_loss": 0.75279021, + "learning_rate": 1.661443332486909e-06, + "loss": 0.77435017, + "num_input_tokens_seen": 203344360, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.12463379, + "step": 9439, + "time_per_iteration": 2.5578126907348633 + }, + { + "auxiliary_loss_clip": 0.01127372, + "auxiliary_loss_mlp": 0.01032475, + "balance_loss_clip": 1.05166268, + "balance_loss_mlp": 1.01901031, + "epoch": 0.5675635051856306, + "flos": 19098372435840.0, + "grad_norm": 1.8403696990993381, + "language_loss": 0.83623153, + "learning_rate": 1.6610594978266438e-06, + "loss": 0.85783005, + "num_input_tokens_seen": 203362115, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.13458252, + "step": 9440, + "time_per_iteration": 2.43896484375 + }, + { + "auxiliary_loss_clip": 0.01137371, + "auxiliary_loss_mlp": 0.01035751, + "balance_loss_clip": 1.05322742, + "balance_loss_mlp": 1.02152979, + "epoch": 0.5676236284382985, + "flos": 17566315591680.0, + "grad_norm": 2.034535849088457, + "language_loss": 0.75454986, + "learning_rate": 1.6606756760190365e-06, + "loss": 0.77628112, + "num_input_tokens_seen": 203380550, + "router_z_loss_clip": 0.84179688, + "router_z_loss_mlp": 0.14221191, + "step": 9441, + "time_per_iteration": 2.5108530521392822 + }, + { + "auxiliary_loss_clip": 0.01137513, + "auxiliary_loss_mlp": 0.01035948, + "balance_loss_clip": 1.05762196, + "balance_loss_mlp": 1.02263856, + "epoch": 0.5676837516909665, + "flos": 15954069634560.0, + "grad_norm": 1.9080027679824056, + "language_loss": 0.8340354, + "learning_rate": 1.6602918670786413e-06, + "loss": 0.85577005, + "num_input_tokens_seen": 203396590, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.13293457, + "step": 9442, + "time_per_iteration": 2.391279935836792 + }, + { + "auxiliary_loss_clip": 0.01126323, + "auxiliary_loss_mlp": 0.01029445, + "balance_loss_clip": 1.05489111, + "balance_loss_mlp": 1.01811397, + "epoch": 0.5677438749436344, + "flos": 18295912644480.0, + "grad_norm": 1.9395563700290481, + "language_loss": 0.74564987, + "learning_rate": 1.6599080710200126e-06, + "loss": 0.76720756, + "num_input_tokens_seen": 203414280, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.11328125, + "step": 9443, + "time_per_iteration": 3.9329354763031006 + }, + { + "auxiliary_loss_clip": 0.01124102, + "auxiliary_loss_mlp": 0.010332, + "balance_loss_clip": 1.04798245, + "balance_loss_mlp": 1.02086234, + "epoch": 0.5678039981963025, + "flos": 17931311642880.0, + "grad_norm": 2.7170049109992602, + "language_loss": 0.77311093, + "learning_rate": 1.6595242878577046e-06, + "loss": 0.79468393, + "num_input_tokens_seen": 203433280, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12341309, + "step": 9444, + "time_per_iteration": 2.4488227367401123 + }, + { + "auxiliary_loss_clip": 0.01131566, + "auxiliary_loss_mlp": 0.01035671, + "balance_loss_clip": 1.05138659, + "balance_loss_mlp": 1.02253973, + "epoch": 0.5678641214489704, + "flos": 19316350120320.0, + "grad_norm": 1.662131683515023, + "language_loss": 0.80796504, + "learning_rate": 1.6591405176062687e-06, + "loss": 0.82963741, + "num_input_tokens_seen": 203449935, + "router_z_loss_clip": 0.80273438, + "router_z_loss_mlp": 0.13116455, + "step": 9445, + "time_per_iteration": 2.423755407333374 + }, + { + "auxiliary_loss_clip": 0.01121848, + "auxiliary_loss_mlp": 0.01029245, + "balance_loss_clip": 1.04569221, + "balance_loss_mlp": 1.01674604, + "epoch": 0.5679242447016384, + "flos": 27751084502400.0, + "grad_norm": 1.225484541736018, + "language_loss": 0.70719242, + "learning_rate": 1.658756760280259e-06, + "loss": 0.72870338, + "num_input_tokens_seen": 203473025, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12512207, + "step": 9446, + "time_per_iteration": 2.585200548171997 + }, + { + "auxiliary_loss_clip": 0.01132612, + "auxiliary_loss_mlp": 0.01030246, + "balance_loss_clip": 1.05188155, + "balance_loss_mlp": 1.0177412, + "epoch": 0.5679843679543063, + "flos": 23769093646080.0, + "grad_norm": 2.312462556519545, + "language_loss": 0.73473835, + "learning_rate": 1.6583730158942276e-06, + "loss": 0.75636685, + "num_input_tokens_seen": 203492895, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.12512207, + "step": 9447, + "time_per_iteration": 2.4742305278778076 + }, + { + "auxiliary_loss_clip": 0.01135011, + "auxiliary_loss_mlp": 0.01033696, + "balance_loss_clip": 1.05558085, + "balance_loss_mlp": 1.020666, + "epoch": 0.5680444912069743, + "flos": 25591883172480.0, + "grad_norm": 2.242575184238567, + "language_loss": 0.75089568, + "learning_rate": 1.657989284462725e-06, + "loss": 0.77258271, + "num_input_tokens_seen": 203513710, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.13031006, + "step": 9448, + "time_per_iteration": 2.50823712348938 + }, + { + "auxiliary_loss_clip": 0.01132527, + "auxiliary_loss_mlp": 0.01038562, + "balance_loss_clip": 1.05311513, + "balance_loss_mlp": 1.02589631, + "epoch": 0.5681046144596422, + "flos": 23695799944320.0, + "grad_norm": 3.242079680183094, + "language_loss": 0.7675426, + "learning_rate": 1.6576055660003038e-06, + "loss": 0.78925347, + "num_input_tokens_seen": 203531630, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.12677002, + "step": 9449, + "time_per_iteration": 2.615133762359619 + }, + { + "auxiliary_loss_clip": 0.01123636, + "auxiliary_loss_mlp": 0.01035504, + "balance_loss_clip": 1.04500926, + "balance_loss_mlp": 1.02225971, + "epoch": 0.5681647377123102, + "flos": 28000770917760.0, + "grad_norm": 1.6511635327439191, + "language_loss": 0.74744201, + "learning_rate": 1.6572218605215128e-06, + "loss": 0.76903337, + "num_input_tokens_seen": 203551885, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.13244629, + "step": 9450, + "time_per_iteration": 2.639753580093384 + }, + { + "auxiliary_loss_clip": 0.01138539, + "auxiliary_loss_mlp": 0.01036707, + "balance_loss_clip": 1.0569452, + "balance_loss_mlp": 1.02413094, + "epoch": 0.5682248609649782, + "flos": 22747758330240.0, + "grad_norm": 1.7450183221599314, + "language_loss": 0.66606522, + "learning_rate": 1.6568381680409038e-06, + "loss": 0.68781769, + "num_input_tokens_seen": 203572250, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.12579346, + "step": 9451, + "time_per_iteration": 2.4510421752929688 + }, + { + "auxiliary_loss_clip": 0.01132405, + "auxiliary_loss_mlp": 0.01034953, + "balance_loss_clip": 1.04845989, + "balance_loss_mlp": 1.02026677, + "epoch": 0.5682849842176462, + "flos": 21288600138240.0, + "grad_norm": 1.8136725740684865, + "language_loss": 0.72043633, + "learning_rate": 1.656454488573026e-06, + "loss": 0.74210984, + "num_input_tokens_seen": 203590605, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.14685059, + "step": 9452, + "time_per_iteration": 3.971433639526367 + }, + { + "auxiliary_loss_clip": 0.01120947, + "auxiliary_loss_mlp": 0.01036249, + "balance_loss_clip": 1.04601669, + "balance_loss_mlp": 1.02302241, + "epoch": 0.5683451074703142, + "flos": 21141689512320.0, + "grad_norm": 1.5802240024112046, + "language_loss": 0.7034505, + "learning_rate": 1.656070822132428e-06, + "loss": 0.72502244, + "num_input_tokens_seen": 203610080, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.13201904, + "step": 9453, + "time_per_iteration": 2.492628812789917 + }, + { + "auxiliary_loss_clip": 0.01127241, + "auxiliary_loss_mlp": 0.01043921, + "balance_loss_clip": 1.04878926, + "balance_loss_mlp": 1.03032529, + "epoch": 0.5684052307229821, + "flos": 22344481359360.0, + "grad_norm": 2.2925478351914124, + "language_loss": 0.69764906, + "learning_rate": 1.6556871687336592e-06, + "loss": 0.71936065, + "num_input_tokens_seen": 203630060, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.13598633, + "step": 9454, + "time_per_iteration": 2.4842441082000732 + }, + { + "auxiliary_loss_clip": 0.01122562, + "auxiliary_loss_mlp": 0.01034234, + "balance_loss_clip": 1.04688871, + "balance_loss_mlp": 1.02147245, + "epoch": 0.5684653539756501, + "flos": 21798639308160.0, + "grad_norm": 2.948507168375977, + "language_loss": 0.60049462, + "learning_rate": 1.6553035283912671e-06, + "loss": 0.62206256, + "num_input_tokens_seen": 203649065, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12756348, + "step": 9455, + "time_per_iteration": 2.4612553119659424 + }, + { + "auxiliary_loss_clip": 0.01133706, + "auxiliary_loss_mlp": 0.01031938, + "balance_loss_clip": 1.05207062, + "balance_loss_mlp": 1.01934338, + "epoch": 0.568525477228318, + "flos": 22999635475200.0, + "grad_norm": 1.8247839176557465, + "language_loss": 0.73205256, + "learning_rate": 1.6549199011198e-06, + "loss": 0.75370902, + "num_input_tokens_seen": 203667545, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.12597656, + "step": 9456, + "time_per_iteration": 2.5020320415496826 + }, + { + "auxiliary_loss_clip": 0.01131725, + "auxiliary_loss_mlp": 0.01032376, + "balance_loss_clip": 1.0539881, + "balance_loss_mlp": 1.0209254, + "epoch": 0.568585600480986, + "flos": 21392489249280.0, + "grad_norm": 1.6552731512143188, + "language_loss": 0.77044261, + "learning_rate": 1.6545362869338048e-06, + "loss": 0.79208362, + "num_input_tokens_seen": 203686025, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.11450195, + "step": 9457, + "time_per_iteration": 2.4766077995300293 + }, + { + "auxiliary_loss_clip": 0.01129161, + "auxiliary_loss_mlp": 0.0103591, + "balance_loss_clip": 1.0470593, + "balance_loss_mlp": 1.02168834, + "epoch": 0.568645723733654, + "flos": 30007351359360.0, + "grad_norm": 1.7246652472228199, + "language_loss": 0.66332501, + "learning_rate": 1.6541526858478285e-06, + "loss": 0.68497574, + "num_input_tokens_seen": 203705540, + "router_z_loss_clip": 0.82080078, + "router_z_loss_mlp": 0.14221191, + "step": 9458, + "time_per_iteration": 2.500950813293457 + }, + { + "auxiliary_loss_clip": 0.01132856, + "auxiliary_loss_mlp": 0.01028644, + "balance_loss_clip": 1.05151939, + "balance_loss_mlp": 1.0150423, + "epoch": 0.568705846986322, + "flos": 20412667077120.0, + "grad_norm": 2.1101259375643475, + "language_loss": 0.68218708, + "learning_rate": 1.6537690978764167e-06, + "loss": 0.70380211, + "num_input_tokens_seen": 203723670, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.13604736, + "step": 9459, + "time_per_iteration": 2.429779052734375 + }, + { + "auxiliary_loss_clip": 0.01131906, + "auxiliary_loss_mlp": 0.01031379, + "balance_loss_clip": 1.05287242, + "balance_loss_mlp": 1.01862395, + "epoch": 0.5687659702389899, + "flos": 17456752131840.0, + "grad_norm": 3.242758514594942, + "language_loss": 0.77104664, + "learning_rate": 1.6533855230341155e-06, + "loss": 0.79267955, + "num_input_tokens_seen": 203739705, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.12762451, + "step": 9460, + "time_per_iteration": 2.4207651615142822 + }, + { + "auxiliary_loss_clip": 0.01124135, + "auxiliary_loss_mlp": 0.01039276, + "balance_loss_clip": 1.0460093, + "balance_loss_mlp": 1.02595425, + "epoch": 0.5688260934916579, + "flos": 25406081095680.0, + "grad_norm": 1.7087240528073664, + "language_loss": 0.72222805, + "learning_rate": 1.65300196133547e-06, + "loss": 0.74386215, + "num_input_tokens_seen": 203759000, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.13330078, + "step": 9461, + "time_per_iteration": 2.543415069580078 + }, + { + "auxiliary_loss_clip": 0.01124812, + "auxiliary_loss_mlp": 0.01030783, + "balance_loss_clip": 1.04791021, + "balance_loss_mlp": 1.01757455, + "epoch": 0.5688862167443258, + "flos": 21608024808960.0, + "grad_norm": 1.9366382259005346, + "language_loss": 0.73283827, + "learning_rate": 1.6526184127950249e-06, + "loss": 0.75439423, + "num_input_tokens_seen": 203774295, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.13201904, + "step": 9462, + "time_per_iteration": 2.513410806655884 + }, + { + "auxiliary_loss_clip": 0.0113039, + "auxiliary_loss_mlp": 0.01027564, + "balance_loss_clip": 1.05301297, + "balance_loss_mlp": 1.01565504, + "epoch": 0.5689463399969938, + "flos": 22418996123520.0, + "grad_norm": 1.812153859191364, + "language_loss": 0.72905338, + "learning_rate": 1.6522348774273246e-06, + "loss": 0.75063294, + "num_input_tokens_seen": 203792710, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.11901855, + "step": 9463, + "time_per_iteration": 2.436211585998535 + }, + { + "auxiliary_loss_clip": 0.01128718, + "auxiliary_loss_mlp": 0.01031851, + "balance_loss_clip": 1.0492816, + "balance_loss_mlp": 1.01893437, + "epoch": 0.5690064632496618, + "flos": 18296810484480.0, + "grad_norm": 2.6767851828992812, + "language_loss": 0.74253231, + "learning_rate": 1.6518513552469123e-06, + "loss": 0.76413798, + "num_input_tokens_seen": 203811645, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.12915039, + "step": 9464, + "time_per_iteration": 2.44221568107605 + }, + { + "auxiliary_loss_clip": 0.01129883, + "auxiliary_loss_mlp": 0.01035848, + "balance_loss_clip": 1.0510093, + "balance_loss_mlp": 1.02305126, + "epoch": 0.5690665865023298, + "flos": 21579260993280.0, + "grad_norm": 1.6500221247384854, + "language_loss": 0.84200907, + "learning_rate": 1.6514678462683312e-06, + "loss": 0.8636663, + "num_input_tokens_seen": 203830040, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.12799072, + "step": 9465, + "time_per_iteration": 2.449526309967041 + }, + { + "auxiliary_loss_clip": 0.01115228, + "auxiliary_loss_mlp": 0.0103046, + "balance_loss_clip": 1.04111278, + "balance_loss_mlp": 1.01825261, + "epoch": 0.5691267097549978, + "flos": 24421446501120.0, + "grad_norm": 1.5844998809478121, + "language_loss": 0.72070718, + "learning_rate": 1.651084350506125e-06, + "loss": 0.74216402, + "num_input_tokens_seen": 203851245, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.12188721, + "step": 9466, + "time_per_iteration": 2.5155210494995117 + }, + { + "auxiliary_loss_clip": 0.01059274, + "auxiliary_loss_mlp": 0.01002625, + "balance_loss_clip": 1.0320816, + "balance_loss_mlp": 1.00124037, + "epoch": 0.5691868330076657, + "flos": 61657906199040.0, + "grad_norm": 0.7096830006409874, + "language_loss": 0.55357426, + "learning_rate": 1.6507008679748343e-06, + "loss": 0.57419324, + "num_input_tokens_seen": 203916400, + "router_z_loss_clip": 0.27246094, + "router_z_loss_mlp": 0.01383972, + "step": 9467, + "time_per_iteration": 3.140618324279785 + }, + { + "auxiliary_loss_clip": 0.01127299, + "auxiliary_loss_mlp": 0.01036792, + "balance_loss_clip": 1.04604721, + "balance_loss_mlp": 1.02113986, + "epoch": 0.5692469562603337, + "flos": 21325193118720.0, + "grad_norm": 1.949913205500618, + "language_loss": 0.63639957, + "learning_rate": 1.6503173986890023e-06, + "loss": 0.65804046, + "num_input_tokens_seen": 203935870, + "router_z_loss_clip": 0.81201172, + "router_z_loss_mlp": 0.15625, + "step": 9468, + "time_per_iteration": 2.469447612762451 + }, + { + "auxiliary_loss_clip": 0.01128348, + "auxiliary_loss_mlp": 0.01032378, + "balance_loss_clip": 1.05124331, + "balance_loss_mlp": 1.01959264, + "epoch": 0.5693070795130016, + "flos": 23367899664000.0, + "grad_norm": 3.7870938266933694, + "language_loss": 0.79282534, + "learning_rate": 1.64993394266317e-06, + "loss": 0.81443256, + "num_input_tokens_seen": 203954950, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.12780762, + "step": 9469, + "time_per_iteration": 2.544903516769409 + }, + { + "auxiliary_loss_clip": 0.01132031, + "auxiliary_loss_mlp": 0.01043212, + "balance_loss_clip": 1.04990721, + "balance_loss_mlp": 1.02987278, + "epoch": 0.5693672027656697, + "flos": 18697250280960.0, + "grad_norm": 2.3560962952986366, + "language_loss": 0.69313371, + "learning_rate": 1.6495504999118769e-06, + "loss": 0.71488619, + "num_input_tokens_seen": 203972715, + "router_z_loss_clip": 0.81982422, + "router_z_loss_mlp": 0.13342285, + "step": 9470, + "time_per_iteration": 2.4756011962890625 + }, + { + "auxiliary_loss_clip": 0.01126398, + "auxiliary_loss_mlp": 0.01035765, + "balance_loss_clip": 1.04949832, + "balance_loss_mlp": 1.02215695, + "epoch": 0.5694273260183376, + "flos": 20449188230400.0, + "grad_norm": 1.6159585454270249, + "language_loss": 0.74967277, + "learning_rate": 1.6491670704496644e-06, + "loss": 0.77129441, + "num_input_tokens_seen": 203990775, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.13616943, + "step": 9471, + "time_per_iteration": 2.529419422149658 + }, + { + "auxiliary_loss_clip": 0.01126837, + "auxiliary_loss_mlp": 0.01038474, + "balance_loss_clip": 1.0496397, + "balance_loss_mlp": 1.02412677, + "epoch": 0.5694874492710056, + "flos": 17603195880960.0, + "grad_norm": 1.9786851823432716, + "language_loss": 0.57666671, + "learning_rate": 1.6487836542910716e-06, + "loss": 0.59831983, + "num_input_tokens_seen": 204008845, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.14349365, + "step": 9472, + "time_per_iteration": 2.4307775497436523 + }, + { + "auxiliary_loss_clip": 0.01122684, + "auxiliary_loss_mlp": 0.01032006, + "balance_loss_clip": 1.04691648, + "balance_loss_mlp": 1.01910186, + "epoch": 0.5695475725236735, + "flos": 13370836250880.0, + "grad_norm": 1.8835078821572262, + "language_loss": 0.73790926, + "learning_rate": 1.648400251450638e-06, + "loss": 0.75945616, + "num_input_tokens_seen": 204023755, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.12902832, + "step": 9473, + "time_per_iteration": 2.506723642349243 + }, + { + "auxiliary_loss_clip": 0.01063472, + "auxiliary_loss_mlp": 0.0100444, + "balance_loss_clip": 1.03569305, + "balance_loss_mlp": 1.00304997, + "epoch": 0.5696076957763415, + "flos": 68174398661760.0, + "grad_norm": 0.6628782613768984, + "language_loss": 0.57614052, + "learning_rate": 1.6480168619429023e-06, + "loss": 0.59681964, + "num_input_tokens_seen": 204091255, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.01390076, + "step": 9474, + "time_per_iteration": 4.513007402420044 + }, + { + "auxiliary_loss_clip": 0.01124638, + "auxiliary_loss_mlp": 0.0103659, + "balance_loss_clip": 1.04974604, + "balance_loss_mlp": 1.0232687, + "epoch": 0.5696678190290094, + "flos": 33838301525760.0, + "grad_norm": 2.8317928854043575, + "language_loss": 0.53923458, + "learning_rate": 1.6476334857824017e-06, + "loss": 0.56084687, + "num_input_tokens_seen": 204113285, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.13317871, + "step": 9475, + "time_per_iteration": 2.5777535438537598 + }, + { + "auxiliary_loss_clip": 0.01132753, + "auxiliary_loss_mlp": 0.01038661, + "balance_loss_clip": 1.05330038, + "balance_loss_mlp": 1.025208, + "epoch": 0.5697279422816774, + "flos": 26356600748160.0, + "grad_norm": 1.4621835662730913, + "language_loss": 0.79373133, + "learning_rate": 1.647250122983675e-06, + "loss": 0.81544548, + "num_input_tokens_seen": 204133045, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.13458252, + "step": 9476, + "time_per_iteration": 2.526350498199463 + }, + { + "auxiliary_loss_clip": 0.01128985, + "auxiliary_loss_mlp": 0.01042133, + "balance_loss_clip": 1.04817867, + "balance_loss_mlp": 1.0291332, + "epoch": 0.5697880655343454, + "flos": 22930507751040.0, + "grad_norm": 1.9944604151314773, + "language_loss": 0.66759384, + "learning_rate": 1.6468667735612592e-06, + "loss": 0.68930501, + "num_input_tokens_seen": 204152590, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.13000488, + "step": 9477, + "time_per_iteration": 2.47216796875 + }, + { + "auxiliary_loss_clip": 0.01129079, + "auxiliary_loss_mlp": 0.01031814, + "balance_loss_clip": 1.04903042, + "balance_loss_mlp": 1.01827776, + "epoch": 0.5698481887870134, + "flos": 26761314263040.0, + "grad_norm": 1.6949142157461279, + "language_loss": 0.71116829, + "learning_rate": 1.6464834375296906e-06, + "loss": 0.73277724, + "num_input_tokens_seen": 204171815, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.13531494, + "step": 9478, + "time_per_iteration": 2.5055782794952393 + }, + { + "auxiliary_loss_clip": 0.01117424, + "auxiliary_loss_mlp": 0.01030041, + "balance_loss_clip": 1.04487038, + "balance_loss_mlp": 1.01868582, + "epoch": 0.5699083120396814, + "flos": 15742269089280.0, + "grad_norm": 2.1261348992007965, + "language_loss": 0.69478106, + "learning_rate": 1.6461001149035055e-06, + "loss": 0.71625578, + "num_input_tokens_seen": 204188535, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.1137085, + "step": 9479, + "time_per_iteration": 2.5080957412719727 + }, + { + "auxiliary_loss_clip": 0.01121965, + "auxiliary_loss_mlp": 0.01034957, + "balance_loss_clip": 1.04867077, + "balance_loss_mlp": 1.0232985, + "epoch": 0.5699684352923493, + "flos": 19537272720000.0, + "grad_norm": 1.4272497347375237, + "language_loss": 0.71339071, + "learning_rate": 1.6457168056972392e-06, + "loss": 0.73495996, + "num_input_tokens_seen": 204208365, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11657715, + "step": 9480, + "time_per_iteration": 2.4753336906433105 + }, + { + "auxiliary_loss_clip": 0.01122249, + "auxiliary_loss_mlp": 0.01035083, + "balance_loss_clip": 1.04457831, + "balance_loss_mlp": 1.02130842, + "epoch": 0.5700285585450173, + "flos": 16253349753600.0, + "grad_norm": 2.0054293806687338, + "language_loss": 0.71850771, + "learning_rate": 1.6453335099254276e-06, + "loss": 0.74008107, + "num_input_tokens_seen": 204226560, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.13781738, + "step": 9481, + "time_per_iteration": 3.8761794567108154 + }, + { + "auxiliary_loss_clip": 0.0112286, + "auxiliary_loss_mlp": 0.0103474, + "balance_loss_clip": 1.04639482, + "balance_loss_mlp": 1.02212787, + "epoch": 0.5700886817976852, + "flos": 19864993432320.0, + "grad_norm": 1.6425381517703117, + "language_loss": 0.78345525, + "learning_rate": 1.6449502276026041e-06, + "loss": 0.8050313, + "num_input_tokens_seen": 204245410, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.12615967, + "step": 9482, + "time_per_iteration": 2.501793146133423 + }, + { + "auxiliary_loss_clip": 0.01125919, + "auxiliary_loss_mlp": 0.01031487, + "balance_loss_clip": 1.04844916, + "balance_loss_mlp": 1.01889849, + "epoch": 0.5701488050503533, + "flos": 23841704989440.0, + "grad_norm": 1.5663553351156025, + "language_loss": 0.77655935, + "learning_rate": 1.6445669587433043e-06, + "loss": 0.79813337, + "num_input_tokens_seen": 204264840, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.12597656, + "step": 9483, + "time_per_iteration": 2.467855930328369 + }, + { + "auxiliary_loss_clip": 0.01122372, + "auxiliary_loss_mlp": 0.01042347, + "balance_loss_clip": 1.04522014, + "balance_loss_mlp": 1.02990711, + "epoch": 0.5702089283030212, + "flos": 23659673840640.0, + "grad_norm": 1.6837134909202058, + "language_loss": 0.81916869, + "learning_rate": 1.6441837033620612e-06, + "loss": 0.84081584, + "num_input_tokens_seen": 204284335, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.12432861, + "step": 9484, + "time_per_iteration": 2.4967520236968994 + }, + { + "auxiliary_loss_clip": 0.01129166, + "auxiliary_loss_mlp": 0.01038683, + "balance_loss_clip": 1.04918146, + "balance_loss_mlp": 1.02474785, + "epoch": 0.5702690515556892, + "flos": 27891171544320.0, + "grad_norm": 2.1684672711193667, + "language_loss": 0.60879171, + "learning_rate": 1.6438004614734073e-06, + "loss": 0.63047016, + "num_input_tokens_seen": 204302590, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.13934326, + "step": 9485, + "time_per_iteration": 2.5206520557403564 + }, + { + "auxiliary_loss_clip": 0.01131875, + "auxiliary_loss_mlp": 0.0103113, + "balance_loss_clip": 1.05120659, + "balance_loss_mlp": 1.01807714, + "epoch": 0.5703291748083571, + "flos": 24023951619840.0, + "grad_norm": 1.6735523495787505, + "language_loss": 0.6531691, + "learning_rate": 1.6434172330918757e-06, + "loss": 0.6747992, + "num_input_tokens_seen": 204323055, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.13049316, + "step": 9486, + "time_per_iteration": 2.4734714031219482 + }, + { + "auxiliary_loss_clip": 0.01083817, + "auxiliary_loss_mlp": 0.01004927, + "balance_loss_clip": 1.05819035, + "balance_loss_mlp": 1.00270116, + "epoch": 0.5703892980610251, + "flos": 57023382919680.0, + "grad_norm": 0.6680591420253164, + "language_loss": 0.47958025, + "learning_rate": 1.6430340182319978e-06, + "loss": 0.50046766, + "num_input_tokens_seen": 204386160, + "router_z_loss_clip": 0.25634766, + "router_z_loss_mlp": 0.02227783, + "step": 9487, + "time_per_iteration": 4.667771577835083 + }, + { + "auxiliary_loss_clip": 0.01129779, + "auxiliary_loss_mlp": 0.01034613, + "balance_loss_clip": 1.05185354, + "balance_loss_mlp": 1.02141619, + "epoch": 0.570449421313693, + "flos": 24351025887360.0, + "grad_norm": 1.635319648639558, + "language_loss": 0.85970461, + "learning_rate": 1.6426508169083067e-06, + "loss": 0.88134849, + "num_input_tokens_seen": 204406315, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.13183594, + "step": 9488, + "time_per_iteration": 2.454709529876709 + }, + { + "auxiliary_loss_clip": 0.01135369, + "auxiliary_loss_mlp": 0.01033388, + "balance_loss_clip": 1.05245852, + "balance_loss_mlp": 1.01983368, + "epoch": 0.570509544566361, + "flos": 24828566227200.0, + "grad_norm": 1.579619971203564, + "language_loss": 0.79025745, + "learning_rate": 1.6422676291353314e-06, + "loss": 0.81194508, + "num_input_tokens_seen": 204427645, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.13543701, + "step": 9489, + "time_per_iteration": 2.5076699256896973 + }, + { + "auxiliary_loss_clip": 0.01124127, + "auxiliary_loss_mlp": 0.01033421, + "balance_loss_clip": 1.04740727, + "balance_loss_mlp": 1.02102327, + "epoch": 0.570569667819029, + "flos": 21397301671680.0, + "grad_norm": 1.7436761546650672, + "language_loss": 0.69763404, + "learning_rate": 1.641884454927604e-06, + "loss": 0.71920955, + "num_input_tokens_seen": 204445910, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.1239624, + "step": 9490, + "time_per_iteration": 2.437112808227539 + }, + { + "auxiliary_loss_clip": 0.01131686, + "auxiliary_loss_mlp": 0.01033197, + "balance_loss_clip": 1.05278063, + "balance_loss_mlp": 1.02036369, + "epoch": 0.570629791071697, + "flos": 23216751233280.0, + "grad_norm": 1.718171374980811, + "language_loss": 0.76554799, + "learning_rate": 1.6415012942996548e-06, + "loss": 0.78719682, + "num_input_tokens_seen": 204464680, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.128479, + "step": 9491, + "time_per_iteration": 2.511455774307251 + }, + { + "auxiliary_loss_clip": 0.01056929, + "auxiliary_loss_mlp": 0.01006349, + "balance_loss_clip": 1.02919328, + "balance_loss_mlp": 1.00452161, + "epoch": 0.570689914324365, + "flos": 65284666525440.0, + "grad_norm": 0.7923704920657448, + "language_loss": 0.57431746, + "learning_rate": 1.641118147266011e-06, + "loss": 0.5949502, + "num_input_tokens_seen": 204525580, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.01828003, + "step": 9492, + "time_per_iteration": 3.0509157180786133 + }, + { + "auxiliary_loss_clip": 0.01124292, + "auxiliary_loss_mlp": 0.01037862, + "balance_loss_clip": 1.04742885, + "balance_loss_mlp": 1.02336597, + "epoch": 0.5707500375770329, + "flos": 21141904993920.0, + "grad_norm": 1.9865882752567185, + "language_loss": 0.72078735, + "learning_rate": 1.6407350138412035e-06, + "loss": 0.74240887, + "num_input_tokens_seen": 204541320, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.14477539, + "step": 9493, + "time_per_iteration": 2.4881796836853027 + }, + { + "auxiliary_loss_clip": 0.01131608, + "auxiliary_loss_mlp": 0.01030672, + "balance_loss_clip": 1.05137765, + "balance_loss_mlp": 1.01725507, + "epoch": 0.5708101608297009, + "flos": 20812747737600.0, + "grad_norm": 1.8808756967652245, + "language_loss": 0.78253686, + "learning_rate": 1.6403518940397606e-06, + "loss": 0.80415964, + "num_input_tokens_seen": 204560275, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.13421631, + "step": 9494, + "time_per_iteration": 2.471018075942993 + }, + { + "auxiliary_loss_clip": 0.01129434, + "auxiliary_loss_mlp": 0.01033735, + "balance_loss_clip": 1.0481832, + "balance_loss_mlp": 1.02005577, + "epoch": 0.5708702840823688, + "flos": 25812338895360.0, + "grad_norm": 2.2890090387764492, + "language_loss": 0.80961746, + "learning_rate": 1.6399687878762096e-06, + "loss": 0.83124912, + "num_input_tokens_seen": 204579430, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.13659668, + "step": 9495, + "time_per_iteration": 3.997600793838501 + }, + { + "auxiliary_loss_clip": 0.0113263, + "auxiliary_loss_mlp": 0.01048325, + "balance_loss_clip": 1.05004287, + "balance_loss_mlp": 1.03195167, + "epoch": 0.5709304073350369, + "flos": 23651916503040.0, + "grad_norm": 2.7782703330577485, + "language_loss": 0.66789001, + "learning_rate": 1.6395856953650784e-06, + "loss": 0.68969953, + "num_input_tokens_seen": 204597710, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.16363525, + "step": 9496, + "time_per_iteration": 2.5067036151885986 + }, + { + "auxiliary_loss_clip": 0.0113271, + "auxiliary_loss_mlp": 0.01046395, + "balance_loss_clip": 1.04722917, + "balance_loss_mlp": 1.03113055, + "epoch": 0.5709905305877048, + "flos": 16107552449280.0, + "grad_norm": 2.569982848731982, + "language_loss": 0.69746834, + "learning_rate": 1.6392026165208938e-06, + "loss": 0.71925938, + "num_input_tokens_seen": 204616140, + "router_z_loss_clip": 0.85498047, + "router_z_loss_mlp": 0.15258789, + "step": 9497, + "time_per_iteration": 2.553104877471924 + }, + { + "auxiliary_loss_clip": 0.01123015, + "auxiliary_loss_mlp": 0.0103134, + "balance_loss_clip": 1.0440948, + "balance_loss_mlp": 1.01782227, + "epoch": 0.5710506538403728, + "flos": 24750819239040.0, + "grad_norm": 1.8957978807554419, + "language_loss": 0.81324136, + "learning_rate": 1.638819551358182e-06, + "loss": 0.83478492, + "num_input_tokens_seen": 204636470, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.13531494, + "step": 9498, + "time_per_iteration": 2.667893171310425 + }, + { + "auxiliary_loss_clip": 0.01125154, + "auxiliary_loss_mlp": 0.01047709, + "balance_loss_clip": 1.04531205, + "balance_loss_mlp": 1.03028083, + "epoch": 0.5711107770930407, + "flos": 21982250655360.0, + "grad_norm": 2.0581631039565744, + "language_loss": 0.66577232, + "learning_rate": 1.638436499891469e-06, + "loss": 0.68750095, + "num_input_tokens_seen": 204656640, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.17419434, + "step": 9499, + "time_per_iteration": 2.4755234718322754 + }, + { + "auxiliary_loss_clip": 0.01132515, + "auxiliary_loss_mlp": 0.01037529, + "balance_loss_clip": 1.0537281, + "balance_loss_mlp": 1.02448761, + "epoch": 0.5711709003457087, + "flos": 19574009354880.0, + "grad_norm": 1.8223799709958997, + "language_loss": 0.71745414, + "learning_rate": 1.6380534621352805e-06, + "loss": 0.73915458, + "num_input_tokens_seen": 204675475, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.13043213, + "step": 9500, + "time_per_iteration": 2.520432710647583 + }, + { + "auxiliary_loss_clip": 0.01131385, + "auxiliary_loss_mlp": 0.01034838, + "balance_loss_clip": 1.04826188, + "balance_loss_mlp": 1.02089012, + "epoch": 0.5712310235983766, + "flos": 24242683489920.0, + "grad_norm": 2.0473374021907063, + "language_loss": 0.76423359, + "learning_rate": 1.6376704381041407e-06, + "loss": 0.78589582, + "num_input_tokens_seen": 204695385, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.13952637, + "step": 9501, + "time_per_iteration": 2.52559494972229 + }, + { + "auxiliary_loss_clip": 0.0112764, + "auxiliary_loss_mlp": 0.01031839, + "balance_loss_clip": 1.04640174, + "balance_loss_mlp": 1.01878572, + "epoch": 0.5712911468510447, + "flos": 20996143603200.0, + "grad_norm": 1.5667680289901038, + "language_loss": 0.75114477, + "learning_rate": 1.6372874278125742e-06, + "loss": 0.77273953, + "num_input_tokens_seen": 204714730, + "router_z_loss_clip": 0.81201172, + "router_z_loss_mlp": 0.13067627, + "step": 9502, + "time_per_iteration": 2.4693939685821533 + }, + { + "auxiliary_loss_clip": 0.01125867, + "auxiliary_loss_mlp": 0.01027839, + "balance_loss_clip": 1.04821157, + "balance_loss_mlp": 1.015203, + "epoch": 0.5713512701037126, + "flos": 18916987731840.0, + "grad_norm": 2.3198066948258114, + "language_loss": 0.82607001, + "learning_rate": 1.636904431275105e-06, + "loss": 0.84760702, + "num_input_tokens_seen": 204735025, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.12634277, + "step": 9503, + "time_per_iteration": 2.47383451461792 + }, + { + "auxiliary_loss_clip": 0.01122773, + "auxiliary_loss_mlp": 0.01036124, + "balance_loss_clip": 1.04473042, + "balance_loss_mlp": 1.0231421, + "epoch": 0.5714113933563806, + "flos": 17413443308160.0, + "grad_norm": 1.9505868872414367, + "language_loss": 0.86194217, + "learning_rate": 1.6365214485062553e-06, + "loss": 0.88353109, + "num_input_tokens_seen": 204751365, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.12976074, + "step": 9504, + "time_per_iteration": 2.442952871322632 + }, + { + "auxiliary_loss_clip": 0.01118993, + "auxiliary_loss_mlp": 0.01034564, + "balance_loss_clip": 1.0433898, + "balance_loss_mlp": 1.01963305, + "epoch": 0.5714715166090486, + "flos": 20193360589440.0, + "grad_norm": 1.9301864029841576, + "language_loss": 0.75376868, + "learning_rate": 1.6361384795205496e-06, + "loss": 0.77530426, + "num_input_tokens_seen": 204768980, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.14941406, + "step": 9505, + "time_per_iteration": 2.4301090240478516 + }, + { + "auxiliary_loss_clip": 0.01124741, + "auxiliary_loss_mlp": 0.0103115, + "balance_loss_clip": 1.04774642, + "balance_loss_mlp": 1.01924682, + "epoch": 0.5715316398617165, + "flos": 18551668458240.0, + "grad_norm": 1.5644259951527646, + "language_loss": 0.81659752, + "learning_rate": 1.635755524332509e-06, + "loss": 0.83815646, + "num_input_tokens_seen": 204788110, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.11895752, + "step": 9506, + "time_per_iteration": 2.4466402530670166 + }, + { + "auxiliary_loss_clip": 0.01117343, + "auxiliary_loss_mlp": 0.01025906, + "balance_loss_clip": 1.04151726, + "balance_loss_mlp": 1.01355004, + "epoch": 0.5715917631143845, + "flos": 18478195188480.0, + "grad_norm": 1.6288214396265814, + "language_loss": 0.77423054, + "learning_rate": 1.6353725829566552e-06, + "loss": 0.79566312, + "num_input_tokens_seen": 204807240, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.12359619, + "step": 9507, + "time_per_iteration": 2.437872886657715 + }, + { + "auxiliary_loss_clip": 0.01126971, + "auxiliary_loss_mlp": 0.01039291, + "balance_loss_clip": 1.04731584, + "balance_loss_mlp": 1.02471137, + "epoch": 0.5716518863670524, + "flos": 24020037037440.0, + "grad_norm": 1.5883374544615414, + "language_loss": 0.68508565, + "learning_rate": 1.63498965540751e-06, + "loss": 0.70674825, + "num_input_tokens_seen": 204826415, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.14575195, + "step": 9508, + "time_per_iteration": 2.50894832611084 + }, + { + "auxiliary_loss_clip": 0.01125985, + "auxiliary_loss_mlp": 0.01031919, + "balance_loss_clip": 1.04542923, + "balance_loss_mlp": 1.01808488, + "epoch": 0.5717120096197205, + "flos": 17819485626240.0, + "grad_norm": 2.322869249389916, + "language_loss": 0.79888761, + "learning_rate": 1.634606741699593e-06, + "loss": 0.82046658, + "num_input_tokens_seen": 204844305, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.13842773, + "step": 9509, + "time_per_iteration": 2.415663242340088 + }, + { + "auxiliary_loss_clip": 0.01126889, + "auxiliary_loss_mlp": 0.0103121, + "balance_loss_clip": 1.05019164, + "balance_loss_mlp": 1.01804924, + "epoch": 0.5717721328723884, + "flos": 21866043179520.0, + "grad_norm": 1.9236076107931375, + "language_loss": 0.71869457, + "learning_rate": 1.6342238418474255e-06, + "loss": 0.74027562, + "num_input_tokens_seen": 204861765, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.13171387, + "step": 9510, + "time_per_iteration": 2.480544090270996 + }, + { + "auxiliary_loss_clip": 0.01118458, + "auxiliary_loss_mlp": 0.01029697, + "balance_loss_clip": 1.04137683, + "balance_loss_mlp": 1.01670349, + "epoch": 0.5718322561250564, + "flos": 28437624126720.0, + "grad_norm": 1.4390394967838358, + "language_loss": 0.69395602, + "learning_rate": 1.6338409558655264e-06, + "loss": 0.71543753, + "num_input_tokens_seen": 204882505, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.12994385, + "step": 9511, + "time_per_iteration": 2.55804181098938 + }, + { + "auxiliary_loss_clip": 0.01122922, + "auxiliary_loss_mlp": 0.01039253, + "balance_loss_clip": 1.04498994, + "balance_loss_mlp": 1.02631879, + "epoch": 0.5718923793777243, + "flos": 13551825905280.0, + "grad_norm": 1.8809821324685976, + "language_loss": 0.61507177, + "learning_rate": 1.6334580837684152e-06, + "loss": 0.63669348, + "num_input_tokens_seen": 204899830, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.12939453, + "step": 9512, + "time_per_iteration": 2.4894959926605225 + }, + { + "auxiliary_loss_clip": 0.0111922, + "auxiliary_loss_mlp": 0.01027876, + "balance_loss_clip": 1.04196894, + "balance_loss_mlp": 1.0152998, + "epoch": 0.5719525026303923, + "flos": 17822035491840.0, + "grad_norm": 2.5588151664337593, + "language_loss": 0.76032937, + "learning_rate": 1.6330752255706104e-06, + "loss": 0.78180033, + "num_input_tokens_seen": 204918100, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.12561035, + "step": 9513, + "time_per_iteration": 2.4584224224090576 + }, + { + "auxiliary_loss_clip": 0.01053603, + "auxiliary_loss_mlp": 0.01002731, + "balance_loss_clip": 1.02586675, + "balance_loss_mlp": 1.00096416, + "epoch": 0.5720126258830602, + "flos": 61298042814720.0, + "grad_norm": 0.8920465171864879, + "language_loss": 0.66826367, + "learning_rate": 1.6326923812866288e-06, + "loss": 0.68882704, + "num_input_tokens_seen": 204972925, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.01766968, + "step": 9514, + "time_per_iteration": 3.08951997756958 + }, + { + "auxiliary_loss_clip": 0.01129205, + "auxiliary_loss_mlp": 0.0103676, + "balance_loss_clip": 1.04765475, + "balance_loss_mlp": 1.02362323, + "epoch": 0.5720727491357283, + "flos": 23988040997760.0, + "grad_norm": 2.3895331208566977, + "language_loss": 0.81175399, + "learning_rate": 1.63230955093099e-06, + "loss": 0.8334136, + "num_input_tokens_seen": 204990910, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.13153076, + "step": 9515, + "time_per_iteration": 2.4851012229919434 + }, + { + "auxiliary_loss_clip": 0.01118811, + "auxiliary_loss_mlp": 0.01029556, + "balance_loss_clip": 1.0436945, + "balance_loss_mlp": 1.01672363, + "epoch": 0.5721328723883962, + "flos": 23405426398080.0, + "grad_norm": 1.5931397610176476, + "language_loss": 0.864169, + "learning_rate": 1.6319267345182092e-06, + "loss": 0.88565266, + "num_input_tokens_seen": 205010500, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.12817383, + "step": 9516, + "time_per_iteration": 2.5073153972625732 + }, + { + "auxiliary_loss_clip": 0.01119212, + "auxiliary_loss_mlp": 0.01030628, + "balance_loss_clip": 1.04219115, + "balance_loss_mlp": 1.01743126, + "epoch": 0.5721929956410642, + "flos": 18804910320000.0, + "grad_norm": 1.6416159951603262, + "language_loss": 0.87307453, + "learning_rate": 1.6315439320628038e-06, + "loss": 0.89457291, + "num_input_tokens_seen": 205028560, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.13195801, + "step": 9517, + "time_per_iteration": 3.9269065856933594 + }, + { + "auxiliary_loss_clip": 0.01119955, + "auxiliary_loss_mlp": 0.01031557, + "balance_loss_clip": 1.04273415, + "balance_loss_mlp": 1.01800919, + "epoch": 0.5722531188937322, + "flos": 27196659100800.0, + "grad_norm": 1.6811425776188464, + "language_loss": 0.85439646, + "learning_rate": 1.6311611435792893e-06, + "loss": 0.87591159, + "num_input_tokens_seen": 205048650, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.13549805, + "step": 9518, + "time_per_iteration": 2.5320329666137695 + }, + { + "auxiliary_loss_clip": 0.01124779, + "auxiliary_loss_mlp": 0.01030738, + "balance_loss_clip": 1.05047083, + "balance_loss_mlp": 1.01878726, + "epoch": 0.5723132421464001, + "flos": 15195672852480.0, + "grad_norm": 2.522750169858726, + "language_loss": 0.78840089, + "learning_rate": 1.6307783690821812e-06, + "loss": 0.80995607, + "num_input_tokens_seen": 205066480, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11956787, + "step": 9519, + "time_per_iteration": 2.4976284503936768 + }, + { + "auxiliary_loss_clip": 0.01124628, + "auxiliary_loss_mlp": 0.01032728, + "balance_loss_clip": 1.0471822, + "balance_loss_mlp": 1.02021742, + "epoch": 0.5723733653990681, + "flos": 27599433281280.0, + "grad_norm": 1.5253772758132924, + "language_loss": 0.82753235, + "learning_rate": 1.6303956085859944e-06, + "loss": 0.84910595, + "num_input_tokens_seen": 205087475, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12512207, + "step": 9520, + "time_per_iteration": 2.5682740211486816 + }, + { + "auxiliary_loss_clip": 0.01122657, + "auxiliary_loss_mlp": 0.01040331, + "balance_loss_clip": 1.0453099, + "balance_loss_mlp": 1.0269376, + "epoch": 0.572433488651736, + "flos": 18222870337920.0, + "grad_norm": 2.184233995845609, + "language_loss": 0.72087443, + "learning_rate": 1.630012862105243e-06, + "loss": 0.74250424, + "num_input_tokens_seen": 205106495, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.13391113, + "step": 9521, + "time_per_iteration": 2.4939565658569336 + }, + { + "auxiliary_loss_clip": 0.01125399, + "auxiliary_loss_mlp": 0.01033981, + "balance_loss_clip": 1.04764223, + "balance_loss_mlp": 1.02054632, + "epoch": 0.5724936119044041, + "flos": 31249106484480.0, + "grad_norm": 1.5944874227909058, + "language_loss": 0.78128731, + "learning_rate": 1.6296301296544415e-06, + "loss": 0.80288112, + "num_input_tokens_seen": 205128285, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.13433838, + "step": 9522, + "time_per_iteration": 2.5835390090942383 + }, + { + "auxiliary_loss_clip": 0.01119369, + "auxiliary_loss_mlp": 0.01027997, + "balance_loss_clip": 1.04628456, + "balance_loss_mlp": 1.0167613, + "epoch": 0.572553735157072, + "flos": 19202189719680.0, + "grad_norm": 1.6747354900174598, + "language_loss": 0.72011161, + "learning_rate": 1.629247411248102e-06, + "loss": 0.74158531, + "num_input_tokens_seen": 205146595, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11242676, + "step": 9523, + "time_per_iteration": 2.454183578491211 + }, + { + "auxiliary_loss_clip": 0.01113119, + "auxiliary_loss_mlp": 0.01028793, + "balance_loss_clip": 1.03886366, + "balance_loss_mlp": 1.01584053, + "epoch": 0.57261385840974, + "flos": 21214911386880.0, + "grad_norm": 1.7386433294536876, + "language_loss": 0.70150602, + "learning_rate": 1.628864706900738e-06, + "loss": 0.72292519, + "num_input_tokens_seen": 205164295, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.12945557, + "step": 9524, + "time_per_iteration": 2.511066436767578 + }, + { + "auxiliary_loss_clip": 0.01130478, + "auxiliary_loss_mlp": 0.01028854, + "balance_loss_clip": 1.05314255, + "balance_loss_mlp": 1.0168674, + "epoch": 0.5726739816624079, + "flos": 33984529793280.0, + "grad_norm": 1.342054368078536, + "language_loss": 0.65429771, + "learning_rate": 1.6284820166268615e-06, + "loss": 0.67589104, + "num_input_tokens_seen": 205185380, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.11981201, + "step": 9525, + "time_per_iteration": 4.047931432723999 + }, + { + "auxiliary_loss_clip": 0.01120759, + "auxiliary_loss_mlp": 0.01029906, + "balance_loss_clip": 1.04614747, + "balance_loss_mlp": 1.0181756, + "epoch": 0.5727341049150759, + "flos": 24275972419200.0, + "grad_norm": 2.2279264641416257, + "language_loss": 0.72697818, + "learning_rate": 1.628099340440984e-06, + "loss": 0.74848485, + "num_input_tokens_seen": 205204895, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11730957, + "step": 9526, + "time_per_iteration": 2.4750208854675293 + }, + { + "auxiliary_loss_clip": 0.01119348, + "auxiliary_loss_mlp": 0.01040012, + "balance_loss_clip": 1.04435253, + "balance_loss_mlp": 1.02643394, + "epoch": 0.5727942281677438, + "flos": 28400564269440.0, + "grad_norm": 1.655995143582986, + "language_loss": 0.80465829, + "learning_rate": 1.6277166783576176e-06, + "loss": 0.82625186, + "num_input_tokens_seen": 205223440, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.13580322, + "step": 9527, + "time_per_iteration": 2.524458169937134 + }, + { + "auxiliary_loss_clip": 0.01122449, + "auxiliary_loss_mlp": 0.01035054, + "balance_loss_clip": 1.04699707, + "balance_loss_mlp": 1.02224493, + "epoch": 0.5728543514204119, + "flos": 19536769929600.0, + "grad_norm": 1.7022047168465453, + "language_loss": 0.7203517, + "learning_rate": 1.6273340303912713e-06, + "loss": 0.74192679, + "num_input_tokens_seen": 205242800, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.12817383, + "step": 9528, + "time_per_iteration": 2.4101617336273193 + }, + { + "auxiliary_loss_clip": 0.01120969, + "auxiliary_loss_mlp": 0.01034959, + "balance_loss_clip": 1.04528654, + "balance_loss_mlp": 1.02269244, + "epoch": 0.5729144746730798, + "flos": 21506757390720.0, + "grad_norm": 1.828454865627144, + "language_loss": 0.86051267, + "learning_rate": 1.6269513965564557e-06, + "loss": 0.88207197, + "num_input_tokens_seen": 205259465, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12261963, + "step": 9529, + "time_per_iteration": 2.457782506942749 + }, + { + "auxiliary_loss_clip": 0.01072545, + "auxiliary_loss_mlp": 0.0101098, + "balance_loss_clip": 1.04362702, + "balance_loss_mlp": 1.00945151, + "epoch": 0.5729745979257478, + "flos": 58681628242560.0, + "grad_norm": 0.7607699888657553, + "language_loss": 0.56082511, + "learning_rate": 1.6265687768676813e-06, + "loss": 0.58166039, + "num_input_tokens_seen": 205314100, + "router_z_loss_clip": 0.28955078, + "router_z_loss_mlp": 0.01527405, + "step": 9530, + "time_per_iteration": 2.9356014728546143 + }, + { + "auxiliary_loss_clip": 0.01126587, + "auxiliary_loss_mlp": 0.01032372, + "balance_loss_clip": 1.0480597, + "balance_loss_mlp": 1.01899099, + "epoch": 0.5730347211784158, + "flos": 18552099421440.0, + "grad_norm": 1.642916190732835, + "language_loss": 0.66635931, + "learning_rate": 1.6261861713394553e-06, + "loss": 0.68794894, + "num_input_tokens_seen": 205333420, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.13372803, + "step": 9531, + "time_per_iteration": 4.017182111740112 + }, + { + "auxiliary_loss_clip": 0.01121544, + "auxiliary_loss_mlp": 0.01040941, + "balance_loss_clip": 1.04370475, + "balance_loss_mlp": 1.02652216, + "epoch": 0.5730948444310837, + "flos": 38031482396160.0, + "grad_norm": 2.192802116523884, + "language_loss": 0.75774384, + "learning_rate": 1.6258035799862876e-06, + "loss": 0.7793687, + "num_input_tokens_seen": 205350995, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.14422607, + "step": 9532, + "time_per_iteration": 2.6442067623138428 + }, + { + "auxiliary_loss_clip": 0.01125712, + "auxiliary_loss_mlp": 0.0104223, + "balance_loss_clip": 1.04541945, + "balance_loss_mlp": 1.02890861, + "epoch": 0.5731549676837517, + "flos": 25227066689280.0, + "grad_norm": 1.257215081244595, + "language_loss": 0.78975964, + "learning_rate": 1.625421002822686e-06, + "loss": 0.81143904, + "num_input_tokens_seen": 205372675, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.13305664, + "step": 9533, + "time_per_iteration": 2.480520725250244 + }, + { + "auxiliary_loss_clip": 0.01118501, + "auxiliary_loss_mlp": 0.01027429, + "balance_loss_clip": 1.04405653, + "balance_loss_mlp": 1.01548457, + "epoch": 0.5732150909364196, + "flos": 23368222886400.0, + "grad_norm": 1.811070570303669, + "language_loss": 0.85817224, + "learning_rate": 1.6250384398631574e-06, + "loss": 0.87963152, + "num_input_tokens_seen": 205392590, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.1194458, + "step": 9534, + "time_per_iteration": 2.4974300861358643 + }, + { + "auxiliary_loss_clip": 0.01130085, + "auxiliary_loss_mlp": 0.01036934, + "balance_loss_clip": 1.05352068, + "balance_loss_mlp": 1.0232842, + "epoch": 0.5732752141890877, + "flos": 23079357711360.0, + "grad_norm": 1.8926538110175624, + "language_loss": 0.7513113, + "learning_rate": 1.6246558911222085e-06, + "loss": 0.77298141, + "num_input_tokens_seen": 205414885, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.13641357, + "step": 9535, + "time_per_iteration": 2.498764753341675 + }, + { + "auxiliary_loss_clip": 0.01142139, + "auxiliary_loss_mlp": 0.01038691, + "balance_loss_clip": 1.05675924, + "balance_loss_mlp": 1.02491081, + "epoch": 0.5733353374417556, + "flos": 24352282863360.0, + "grad_norm": 1.5817729756807324, + "language_loss": 0.71104926, + "learning_rate": 1.624273356614346e-06, + "loss": 0.73285758, + "num_input_tokens_seen": 205434440, + "router_z_loss_clip": 0.85302734, + "router_z_loss_mlp": 0.13787842, + "step": 9536, + "time_per_iteration": 2.5108604431152344 + }, + { + "auxiliary_loss_clip": 0.01120546, + "auxiliary_loss_mlp": 0.01029811, + "balance_loss_clip": 1.04535699, + "balance_loss_mlp": 1.01583934, + "epoch": 0.5733954606944236, + "flos": 27198849830400.0, + "grad_norm": 1.8270644484092475, + "language_loss": 0.70059967, + "learning_rate": 1.6238908363540755e-06, + "loss": 0.7221033, + "num_input_tokens_seen": 205454225, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.13952637, + "step": 9537, + "time_per_iteration": 2.567920207977295 + }, + { + "auxiliary_loss_clip": 0.01124473, + "auxiliary_loss_mlp": 0.01031844, + "balance_loss_clip": 1.04892921, + "balance_loss_mlp": 1.01880229, + "epoch": 0.5734555839470915, + "flos": 28765129357440.0, + "grad_norm": 1.8084940091930193, + "language_loss": 0.63039273, + "learning_rate": 1.623508330355902e-06, + "loss": 0.6519559, + "num_input_tokens_seen": 205474750, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.13031006, + "step": 9538, + "time_per_iteration": 2.5524439811706543 + }, + { + "auxiliary_loss_clip": 0.01125292, + "auxiliary_loss_mlp": 0.01035893, + "balance_loss_clip": 1.0479188, + "balance_loss_mlp": 1.02208817, + "epoch": 0.5735157071997595, + "flos": 22966813422720.0, + "grad_norm": 1.5964903273351159, + "language_loss": 0.82681823, + "learning_rate": 1.6231258386343306e-06, + "loss": 0.8484301, + "num_input_tokens_seen": 205495495, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.13830566, + "step": 9539, + "time_per_iteration": 3.9105870723724365 + }, + { + "auxiliary_loss_clip": 0.01125155, + "auxiliary_loss_mlp": 0.01033966, + "balance_loss_clip": 1.04511106, + "balance_loss_mlp": 1.02049565, + "epoch": 0.5735758304524274, + "flos": 18989455420800.0, + "grad_norm": 2.1841363705940906, + "language_loss": 0.7301625, + "learning_rate": 1.6227433612038647e-06, + "loss": 0.75175369, + "num_input_tokens_seen": 205510070, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.13482666, + "step": 9540, + "time_per_iteration": 2.4739227294921875 + }, + { + "auxiliary_loss_clip": 0.01123565, + "auxiliary_loss_mlp": 0.01027242, + "balance_loss_clip": 1.04603994, + "balance_loss_mlp": 1.0153811, + "epoch": 0.5736359537050955, + "flos": 28397942576640.0, + "grad_norm": 2.248105626367658, + "language_loss": 0.80234915, + "learning_rate": 1.6223608980790089e-06, + "loss": 0.82385725, + "num_input_tokens_seen": 205530190, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.11865234, + "step": 9541, + "time_per_iteration": 2.502157688140869 + }, + { + "auxiliary_loss_clip": 0.01127481, + "auxiliary_loss_mlp": 0.01030703, + "balance_loss_clip": 1.04804325, + "balance_loss_mlp": 1.0180192, + "epoch": 0.5736960769577634, + "flos": 15627210848640.0, + "grad_norm": 2.556276375666524, + "language_loss": 0.65002477, + "learning_rate": 1.6219784492742654e-06, + "loss": 0.67160666, + "num_input_tokens_seen": 205547380, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.12683105, + "step": 9542, + "time_per_iteration": 2.5583317279815674 + }, + { + "auxiliary_loss_clip": 0.01121088, + "auxiliary_loss_mlp": 0.01032065, + "balance_loss_clip": 1.04442167, + "balance_loss_mlp": 1.01948214, + "epoch": 0.5737562002104314, + "flos": 18003994813440.0, + "grad_norm": 2.050631494596343, + "language_loss": 0.83168352, + "learning_rate": 1.6215960148041365e-06, + "loss": 0.8532151, + "num_input_tokens_seen": 205566540, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.12579346, + "step": 9543, + "time_per_iteration": 2.4682679176330566 + }, + { + "auxiliary_loss_clip": 0.01130009, + "auxiliary_loss_mlp": 0.01030354, + "balance_loss_clip": 1.04992366, + "balance_loss_mlp": 1.01685929, + "epoch": 0.5738163234630994, + "flos": 20698192287360.0, + "grad_norm": 2.077460438290567, + "language_loss": 0.74087405, + "learning_rate": 1.6212135946831257e-06, + "loss": 0.76247776, + "num_input_tokens_seen": 205584200, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.13500977, + "step": 9544, + "time_per_iteration": 2.535921096801758 + }, + { + "auxiliary_loss_clip": 0.01127078, + "auxiliary_loss_mlp": 0.01028228, + "balance_loss_clip": 1.04887068, + "balance_loss_mlp": 1.01531804, + "epoch": 0.5738764467157673, + "flos": 23149311448320.0, + "grad_norm": 2.015395944005153, + "language_loss": 0.76093638, + "learning_rate": 1.620831188925733e-06, + "loss": 0.78248942, + "num_input_tokens_seen": 205604675, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.12927246, + "step": 9545, + "time_per_iteration": 2.4871721267700195 + }, + { + "auxiliary_loss_clip": 0.01119123, + "auxiliary_loss_mlp": 0.01033371, + "balance_loss_clip": 1.04175544, + "balance_loss_mlp": 1.02000761, + "epoch": 0.5739365699684353, + "flos": 29492930730240.0, + "grad_norm": 1.9774825456426208, + "language_loss": 0.57001549, + "learning_rate": 1.620448797546459e-06, + "loss": 0.59154046, + "num_input_tokens_seen": 205624680, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.13372803, + "step": 9546, + "time_per_iteration": 2.572134256362915 + }, + { + "auxiliary_loss_clip": 0.01126863, + "auxiliary_loss_mlp": 0.01033835, + "balance_loss_clip": 1.04550314, + "balance_loss_mlp": 1.02084684, + "epoch": 0.5739966932211032, + "flos": 14027247342720.0, + "grad_norm": 2.418040096137857, + "language_loss": 0.76090103, + "learning_rate": 1.6200664205598055e-06, + "loss": 0.78250802, + "num_input_tokens_seen": 205641950, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.12988281, + "step": 9547, + "time_per_iteration": 2.6541380882263184 + }, + { + "auxiliary_loss_clip": 0.01126834, + "auxiliary_loss_mlp": 0.01034868, + "balance_loss_clip": 1.04711461, + "balance_loss_mlp": 1.02081943, + "epoch": 0.5740568164737713, + "flos": 19062030850560.0, + "grad_norm": 1.8893007262488932, + "language_loss": 0.74302757, + "learning_rate": 1.6196840579802704e-06, + "loss": 0.76464462, + "num_input_tokens_seen": 205660130, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.14056396, + "step": 9548, + "time_per_iteration": 2.4652199745178223 + }, + { + "auxiliary_loss_clip": 0.01121913, + "auxiliary_loss_mlp": 0.01033356, + "balance_loss_clip": 1.04384589, + "balance_loss_mlp": 1.02034402, + "epoch": 0.5741169397264392, + "flos": 22127832478080.0, + "grad_norm": 3.5612848519768705, + "language_loss": 0.69966602, + "learning_rate": 1.619301709822355e-06, + "loss": 0.72121871, + "num_input_tokens_seen": 205678895, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.13018799, + "step": 9549, + "time_per_iteration": 2.51668381690979 + }, + { + "auxiliary_loss_clip": 0.01125712, + "auxiliary_loss_mlp": 0.01031962, + "balance_loss_clip": 1.05039263, + "balance_loss_mlp": 1.01979041, + "epoch": 0.5741770629791072, + "flos": 24936836797440.0, + "grad_norm": 1.466989162718987, + "language_loss": 0.79546487, + "learning_rate": 1.6189193761005564e-06, + "loss": 0.81704164, + "num_input_tokens_seen": 205698450, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.12176514, + "step": 9550, + "time_per_iteration": 2.462775945663452 + }, + { + "auxiliary_loss_clip": 0.0112369, + "auxiliary_loss_mlp": 0.01032215, + "balance_loss_clip": 1.04644275, + "balance_loss_mlp": 1.01849437, + "epoch": 0.5742371862317751, + "flos": 18801462614400.0, + "grad_norm": 2.0011109259244333, + "language_loss": 0.67634177, + "learning_rate": 1.6185370568293727e-06, + "loss": 0.69790077, + "num_input_tokens_seen": 205714870, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.13720703, + "step": 9551, + "time_per_iteration": 2.465975284576416 + }, + { + "auxiliary_loss_clip": 0.01131205, + "auxiliary_loss_mlp": 0.01035416, + "balance_loss_clip": 1.05182385, + "balance_loss_mlp": 1.02197552, + "epoch": 0.5742973094844431, + "flos": 24460661174400.0, + "grad_norm": 2.50686384722412, + "language_loss": 0.7209565, + "learning_rate": 1.6181547520233031e-06, + "loss": 0.74262273, + "num_input_tokens_seen": 205736045, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.13446045, + "step": 9552, + "time_per_iteration": 2.4849092960357666 + }, + { + "auxiliary_loss_clip": 0.01126784, + "auxiliary_loss_mlp": 0.01034468, + "balance_loss_clip": 1.0502702, + "balance_loss_mlp": 1.02180219, + "epoch": 0.574357432737111, + "flos": 21652770176640.0, + "grad_norm": 1.7772389554825834, + "language_loss": 0.79973423, + "learning_rate": 1.617772461696843e-06, + "loss": 0.82134676, + "num_input_tokens_seen": 205754445, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.12683105, + "step": 9553, + "time_per_iteration": 2.445591449737549 + }, + { + "auxiliary_loss_clip": 0.01128332, + "auxiliary_loss_mlp": 0.01034088, + "balance_loss_clip": 1.04654944, + "balance_loss_mlp": 1.02093959, + "epoch": 0.5744175559897791, + "flos": 16544728880640.0, + "grad_norm": 2.6025150449891754, + "language_loss": 0.83786905, + "learning_rate": 1.6173901858644895e-06, + "loss": 0.8594932, + "num_input_tokens_seen": 205770595, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.1315918, + "step": 9554, + "time_per_iteration": 2.540036916732788 + }, + { + "auxiliary_loss_clip": 0.01126494, + "auxiliary_loss_mlp": 0.0103923, + "balance_loss_clip": 1.04570746, + "balance_loss_mlp": 1.02558029, + "epoch": 0.574477679242447, + "flos": 24207598880640.0, + "grad_norm": 3.197682069225451, + "language_loss": 0.7065438, + "learning_rate": 1.6170079245407385e-06, + "loss": 0.72820103, + "num_input_tokens_seen": 205791935, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.13641357, + "step": 9555, + "time_per_iteration": 2.5013463497161865 + }, + { + "auxiliary_loss_clip": 0.01120638, + "auxiliary_loss_mlp": 0.01026795, + "balance_loss_clip": 1.04442489, + "balance_loss_mlp": 1.01389658, + "epoch": 0.574537802495115, + "flos": 14903000835840.0, + "grad_norm": 2.0115119429010773, + "language_loss": 0.72556567, + "learning_rate": 1.6166256777400853e-06, + "loss": 0.74704003, + "num_input_tokens_seen": 205807260, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12902832, + "step": 9556, + "time_per_iteration": 2.411289691925049 + }, + { + "auxiliary_loss_clip": 0.01123765, + "auxiliary_loss_mlp": 0.01030602, + "balance_loss_clip": 1.04735053, + "balance_loss_mlp": 1.01831162, + "epoch": 0.5745979257477829, + "flos": 24934969290240.0, + "grad_norm": 1.6397894282355512, + "language_loss": 0.74443722, + "learning_rate": 1.6162434454770248e-06, + "loss": 0.76598096, + "num_input_tokens_seen": 205826885, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.1229248, + "step": 9557, + "time_per_iteration": 2.494767427444458 + }, + { + "auxiliary_loss_clip": 0.01130091, + "auxiliary_loss_mlp": 0.01034993, + "balance_loss_clip": 1.05329013, + "balance_loss_mlp": 1.02216613, + "epoch": 0.5746580490004509, + "flos": 17235757704960.0, + "grad_norm": 1.6870282508985823, + "language_loss": 0.67757064, + "learning_rate": 1.6158612277660514e-06, + "loss": 0.69922149, + "num_input_tokens_seen": 205844630, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.1282959, + "step": 9558, + "time_per_iteration": 2.4543278217315674 + }, + { + "auxiliary_loss_clip": 0.01130846, + "auxiliary_loss_mlp": 0.01043433, + "balance_loss_clip": 1.04663253, + "balance_loss_mlp": 1.02685082, + "epoch": 0.5747181722531189, + "flos": 13187871348480.0, + "grad_norm": 7.576318209376366, + "language_loss": 0.71218592, + "learning_rate": 1.615479024621659e-06, + "loss": 0.73392868, + "num_input_tokens_seen": 205860960, + "router_z_loss_clip": 0.84179688, + "router_z_loss_mlp": 0.16577148, + "step": 9559, + "time_per_iteration": 2.427574872970581 + }, + { + "auxiliary_loss_clip": 0.01123739, + "auxiliary_loss_mlp": 0.01033536, + "balance_loss_clip": 1.04811692, + "balance_loss_mlp": 1.0226047, + "epoch": 0.5747782955057869, + "flos": 22963006581120.0, + "grad_norm": 1.5792423870920163, + "language_loss": 0.79093248, + "learning_rate": 1.6150968360583398e-06, + "loss": 0.81250519, + "num_input_tokens_seen": 205880675, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.10931396, + "step": 9560, + "time_per_iteration": 2.5655839443206787 + }, + { + "auxiliary_loss_clip": 0.01122464, + "auxiliary_loss_mlp": 0.01032438, + "balance_loss_clip": 1.04352736, + "balance_loss_mlp": 1.01945007, + "epoch": 0.5748384187584549, + "flos": 23403235668480.0, + "grad_norm": 1.6495685974521175, + "language_loss": 0.64242291, + "learning_rate": 1.614714662090588e-06, + "loss": 0.6639719, + "num_input_tokens_seen": 205900050, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.12988281, + "step": 9561, + "time_per_iteration": 3.9875335693359375 + }, + { + "auxiliary_loss_clip": 0.01135526, + "auxiliary_loss_mlp": 0.01040339, + "balance_loss_clip": 1.05387592, + "balance_loss_mlp": 1.02605188, + "epoch": 0.5748985420111228, + "flos": 17785514338560.0, + "grad_norm": 2.189192912081808, + "language_loss": 0.71450669, + "learning_rate": 1.6143325027328945e-06, + "loss": 0.7362653, + "num_input_tokens_seen": 205918855, + "router_z_loss_clip": 0.81494141, + "router_z_loss_mlp": 0.14282227, + "step": 9562, + "time_per_iteration": 2.4701685905456543 + }, + { + "auxiliary_loss_clip": 0.01126391, + "auxiliary_loss_mlp": 0.01040555, + "balance_loss_clip": 1.04812348, + "balance_loss_mlp": 1.02828288, + "epoch": 0.5749586652637908, + "flos": 19866250408320.0, + "grad_norm": 1.7044171204573273, + "language_loss": 0.84215558, + "learning_rate": 1.613950357999751e-06, + "loss": 0.86382508, + "num_input_tokens_seen": 205936970, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.1227417, + "step": 9563, + "time_per_iteration": 2.563295602798462 + }, + { + "auxiliary_loss_clip": 0.01129586, + "auxiliary_loss_mlp": 0.01039721, + "balance_loss_clip": 1.04663944, + "balance_loss_mlp": 1.02635181, + "epoch": 0.5750187885164587, + "flos": 21287235421440.0, + "grad_norm": 2.7367549417522334, + "language_loss": 0.57218575, + "learning_rate": 1.6135682279056488e-06, + "loss": 0.59387887, + "num_input_tokens_seen": 205954630, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.13391113, + "step": 9564, + "time_per_iteration": 2.4466638565063477 + }, + { + "auxiliary_loss_clip": 0.01124209, + "auxiliary_loss_mlp": 0.01027368, + "balance_loss_clip": 1.04959059, + "balance_loss_mlp": 1.01546538, + "epoch": 0.5750789117691267, + "flos": 18804658924800.0, + "grad_norm": 1.667519027479137, + "language_loss": 0.75984561, + "learning_rate": 1.613186112465078e-06, + "loss": 0.7813614, + "num_input_tokens_seen": 205971510, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.11895752, + "step": 9565, + "time_per_iteration": 2.522583484649658 + }, + { + "auxiliary_loss_clip": 0.01060879, + "auxiliary_loss_mlp": 0.01004453, + "balance_loss_clip": 1.03384781, + "balance_loss_mlp": 1.00281215, + "epoch": 0.5751390350217946, + "flos": 70663224124800.0, + "grad_norm": 0.7419709666835139, + "language_loss": 0.60773587, + "learning_rate": 1.6128040116925287e-06, + "loss": 0.62838924, + "num_input_tokens_seen": 206035125, + "router_z_loss_clip": 0.27001953, + "router_z_loss_mlp": 0.0164032, + "step": 9566, + "time_per_iteration": 3.1683506965637207 + }, + { + "auxiliary_loss_clip": 0.01125784, + "auxiliary_loss_mlp": 0.01033771, + "balance_loss_clip": 1.04959226, + "balance_loss_mlp": 1.02105749, + "epoch": 0.5751991582744627, + "flos": 14246338348800.0, + "grad_norm": 1.8569437735324001, + "language_loss": 0.75566769, + "learning_rate": 1.6124219256024901e-06, + "loss": 0.77726322, + "num_input_tokens_seen": 206052075, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.12713623, + "step": 9567, + "time_per_iteration": 2.534740686416626 + }, + { + "auxiliary_loss_clip": 0.01124392, + "auxiliary_loss_mlp": 0.01029001, + "balance_loss_clip": 1.04831004, + "balance_loss_mlp": 1.01660299, + "epoch": 0.5752592815271306, + "flos": 18328160079360.0, + "grad_norm": 1.6052929187810614, + "language_loss": 0.74864244, + "learning_rate": 1.6120398542094504e-06, + "loss": 0.77017641, + "num_input_tokens_seen": 206069970, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.1239624, + "step": 9568, + "time_per_iteration": 3.9402763843536377 + }, + { + "auxiliary_loss_clip": 0.0112942, + "auxiliary_loss_mlp": 0.01031457, + "balance_loss_clip": 1.04966092, + "balance_loss_mlp": 1.01896977, + "epoch": 0.5753194047797986, + "flos": 20922742160640.0, + "grad_norm": 1.6328890441858361, + "language_loss": 0.70944703, + "learning_rate": 1.6116577975278994e-06, + "loss": 0.73105586, + "num_input_tokens_seen": 206088950, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.12487793, + "step": 9569, + "time_per_iteration": 2.4456417560577393 + }, + { + "auxiliary_loss_clip": 0.01131941, + "auxiliary_loss_mlp": 0.01039846, + "balance_loss_clip": 1.05408156, + "balance_loss_mlp": 1.02623248, + "epoch": 0.5753795280324665, + "flos": 19281804215040.0, + "grad_norm": 2.111789696341004, + "language_loss": 0.55276906, + "learning_rate": 1.6112757555723223e-06, + "loss": 0.57448691, + "num_input_tokens_seen": 206107780, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.1361084, + "step": 9570, + "time_per_iteration": 2.501189947128296 + }, + { + "auxiliary_loss_clip": 0.01122327, + "auxiliary_loss_mlp": 0.01032926, + "balance_loss_clip": 1.04696131, + "balance_loss_mlp": 1.02094531, + "epoch": 0.5754396512851345, + "flos": 21652877917440.0, + "grad_norm": 1.6037327370474765, + "language_loss": 0.64727134, + "learning_rate": 1.6108937283572082e-06, + "loss": 0.66882384, + "num_input_tokens_seen": 206127445, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11987305, + "step": 9571, + "time_per_iteration": 2.4384829998016357 + }, + { + "auxiliary_loss_clip": 0.01125574, + "auxiliary_loss_mlp": 0.01027043, + "balance_loss_clip": 1.04921913, + "balance_loss_mlp": 1.01439452, + "epoch": 0.5754997745378025, + "flos": 51021700179840.0, + "grad_norm": 1.5559566068396227, + "language_loss": 0.67106175, + "learning_rate": 1.6105117158970434e-06, + "loss": 0.69258797, + "num_input_tokens_seen": 206152005, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.12640381, + "step": 9572, + "time_per_iteration": 2.797344446182251 + }, + { + "auxiliary_loss_clip": 0.01131292, + "auxiliary_loss_mlp": 0.01032688, + "balance_loss_clip": 1.05457342, + "balance_loss_mlp": 1.02025485, + "epoch": 0.5755598977904705, + "flos": 22856890826880.0, + "grad_norm": 1.925526095034712, + "language_loss": 0.72148955, + "learning_rate": 1.6101297182063123e-06, + "loss": 0.74312937, + "num_input_tokens_seen": 206169875, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.12432861, + "step": 9573, + "time_per_iteration": 2.436403751373291 + }, + { + "auxiliary_loss_clip": 0.01118471, + "auxiliary_loss_mlp": 0.01033843, + "balance_loss_clip": 1.04767561, + "balance_loss_mlp": 1.02233982, + "epoch": 0.5756200210431385, + "flos": 38472824805120.0, + "grad_norm": 2.203899668485085, + "language_loss": 0.76458561, + "learning_rate": 1.6097477352995022e-06, + "loss": 0.78610873, + "num_input_tokens_seen": 206192635, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.1149292, + "step": 9574, + "time_per_iteration": 2.633592128753662 + }, + { + "auxiliary_loss_clip": 0.01124589, + "auxiliary_loss_mlp": 0.01037245, + "balance_loss_clip": 1.04401374, + "balance_loss_mlp": 1.02202225, + "epoch": 0.5756801442958064, + "flos": 23910006700800.0, + "grad_norm": 2.3100818713595466, + "language_loss": 0.66528922, + "learning_rate": 1.6093657671910968e-06, + "loss": 0.68690753, + "num_input_tokens_seen": 206211485, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.15222168, + "step": 9575, + "time_per_iteration": 3.892916440963745 + }, + { + "auxiliary_loss_clip": 0.01113725, + "auxiliary_loss_mlp": 0.01032501, + "balance_loss_clip": 1.04105854, + "balance_loss_mlp": 1.02007961, + "epoch": 0.5757402675484744, + "flos": 21105276099840.0, + "grad_norm": 1.5198755960993748, + "language_loss": 0.79975814, + "learning_rate": 1.6089838138955804e-06, + "loss": 0.8212204, + "num_input_tokens_seen": 206231740, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.12414551, + "step": 9576, + "time_per_iteration": 2.5370354652404785 + }, + { + "auxiliary_loss_clip": 0.01118857, + "auxiliary_loss_mlp": 0.01031027, + "balance_loss_clip": 1.04391587, + "balance_loss_mlp": 1.01907635, + "epoch": 0.5758003908011423, + "flos": 20559110826240.0, + "grad_norm": 1.6896369848228292, + "language_loss": 0.69687366, + "learning_rate": 1.6086018754274372e-06, + "loss": 0.71837252, + "num_input_tokens_seen": 206250975, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.1194458, + "step": 9577, + "time_per_iteration": 2.513319253921509 + }, + { + "auxiliary_loss_clip": 0.01120997, + "auxiliary_loss_mlp": 0.01037004, + "balance_loss_clip": 1.04335165, + "balance_loss_mlp": 1.02473176, + "epoch": 0.5758605140538103, + "flos": 16473015377280.0, + "grad_norm": 1.8314658164832913, + "language_loss": 0.66719717, + "learning_rate": 1.6082199518011504e-06, + "loss": 0.68877715, + "num_input_tokens_seen": 206268800, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.1227417, + "step": 9578, + "time_per_iteration": 2.487029790878296 + }, + { + "auxiliary_loss_clip": 0.01119301, + "auxiliary_loss_mlp": 0.01029179, + "balance_loss_clip": 1.04471302, + "balance_loss_mlp": 1.01750791, + "epoch": 0.5759206373064782, + "flos": 21287558643840.0, + "grad_norm": 1.704403303271895, + "language_loss": 0.72699916, + "learning_rate": 1.6078380430312016e-06, + "loss": 0.74848402, + "num_input_tokens_seen": 206287190, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.11663818, + "step": 9579, + "time_per_iteration": 2.496060848236084 + }, + { + "auxiliary_loss_clip": 0.01122996, + "auxiliary_loss_mlp": 0.01034999, + "balance_loss_clip": 1.04385793, + "balance_loss_mlp": 1.0214572, + "epoch": 0.5759807605591463, + "flos": 26067879227520.0, + "grad_norm": 2.7261002562006866, + "language_loss": 0.65030992, + "learning_rate": 1.6074561491320742e-06, + "loss": 0.6718899, + "num_input_tokens_seen": 206307020, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.13562012, + "step": 9580, + "time_per_iteration": 2.5083370208740234 + }, + { + "auxiliary_loss_clip": 0.01116467, + "auxiliary_loss_mlp": 0.0103637, + "balance_loss_clip": 1.04046583, + "balance_loss_mlp": 1.0235728, + "epoch": 0.5760408838118142, + "flos": 18873068376960.0, + "grad_norm": 1.7618548436709789, + "language_loss": 0.85827279, + "learning_rate": 1.6070742701182486e-06, + "loss": 0.87980115, + "num_input_tokens_seen": 206324095, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.12792969, + "step": 9581, + "time_per_iteration": 2.527251720428467 + }, + { + "auxiliary_loss_clip": 0.01131104, + "auxiliary_loss_mlp": 0.01036959, + "balance_loss_clip": 1.04855204, + "balance_loss_mlp": 1.02326155, + "epoch": 0.5761010070644822, + "flos": 15378134964480.0, + "grad_norm": 2.248736652329738, + "language_loss": 0.67956382, + "learning_rate": 1.6066924060042057e-06, + "loss": 0.70124447, + "num_input_tokens_seen": 206343210, + "router_z_loss_clip": 0.82568359, + "router_z_loss_mlp": 0.13702393, + "step": 9582, + "time_per_iteration": 3.903993606567383 + }, + { + "auxiliary_loss_clip": 0.01062682, + "auxiliary_loss_mlp": 0.01001864, + "balance_loss_clip": 1.0347836, + "balance_loss_mlp": 1.00027442, + "epoch": 0.5761611303171501, + "flos": 71471932882560.0, + "grad_norm": 0.6415226022808372, + "language_loss": 0.57161307, + "learning_rate": 1.6063105568044271e-06, + "loss": 0.59225845, + "num_input_tokens_seen": 206415935, + "router_z_loss_clip": 0.27832031, + "router_z_loss_mlp": 0.01591492, + "step": 9583, + "time_per_iteration": 3.236809015274048 + }, + { + "auxiliary_loss_clip": 0.01128042, + "auxiliary_loss_mlp": 0.01029798, + "balance_loss_clip": 1.04915476, + "balance_loss_mlp": 1.01720977, + "epoch": 0.5762212535698181, + "flos": 16246167033600.0, + "grad_norm": 2.224510061309499, + "language_loss": 0.82320154, + "learning_rate": 1.6059287225333912e-06, + "loss": 0.84477991, + "num_input_tokens_seen": 206431900, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.1260376, + "step": 9584, + "time_per_iteration": 2.4161741733551025 + }, + { + "auxiliary_loss_clip": 0.01087359, + "auxiliary_loss_mlp": 0.01010323, + "balance_loss_clip": 1.06100059, + "balance_loss_mlp": 1.00822449, + "epoch": 0.5762813768224861, + "flos": 70185504216960.0, + "grad_norm": 0.621397482413313, + "language_loss": 0.49478325, + "learning_rate": 1.6055469032055773e-06, + "loss": 0.51576006, + "num_input_tokens_seen": 206501200, + "router_z_loss_clip": 0.26416016, + "router_z_loss_mlp": 0.02099609, + "step": 9585, + "time_per_iteration": 3.1627345085144043 + }, + { + "auxiliary_loss_clip": 0.01122751, + "auxiliary_loss_mlp": 0.01029023, + "balance_loss_clip": 1.04719543, + "balance_loss_mlp": 1.01695323, + "epoch": 0.5763415000751541, + "flos": 20518028645760.0, + "grad_norm": 1.6179796320674698, + "language_loss": 0.84567499, + "learning_rate": 1.605165098835465e-06, + "loss": 0.86719275, + "num_input_tokens_seen": 206520575, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12072754, + "step": 9586, + "time_per_iteration": 2.499234676361084 + }, + { + "auxiliary_loss_clip": 0.01121207, + "auxiliary_loss_mlp": 0.01033577, + "balance_loss_clip": 1.04555738, + "balance_loss_mlp": 1.02035713, + "epoch": 0.5764016233278221, + "flos": 15815526877440.0, + "grad_norm": 2.0013444564072906, + "language_loss": 0.80180192, + "learning_rate": 1.6047833094375308e-06, + "loss": 0.82334971, + "num_input_tokens_seen": 206538060, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.13220215, + "step": 9587, + "time_per_iteration": 2.446946620941162 + }, + { + "auxiliary_loss_clip": 0.01120818, + "auxiliary_loss_mlp": 0.01031146, + "balance_loss_clip": 1.04504585, + "balance_loss_mlp": 1.01805103, + "epoch": 0.57646174658049, + "flos": 20772312001920.0, + "grad_norm": 1.6751512025001354, + "language_loss": 0.6624577, + "learning_rate": 1.6044015350262542e-06, + "loss": 0.68397737, + "num_input_tokens_seen": 206557320, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.13092041, + "step": 9588, + "time_per_iteration": 2.5523838996887207 + }, + { + "auxiliary_loss_clip": 0.01128817, + "auxiliary_loss_mlp": 0.01035676, + "balance_loss_clip": 1.05073845, + "balance_loss_mlp": 1.02227068, + "epoch": 0.576521869833158, + "flos": 23549930812800.0, + "grad_norm": 1.8408283955835942, + "language_loss": 0.78571033, + "learning_rate": 1.6040197756161104e-06, + "loss": 0.80735523, + "num_input_tokens_seen": 206575780, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.13409424, + "step": 9589, + "time_per_iteration": 2.4857492446899414 + }, + { + "auxiliary_loss_clip": 0.01120625, + "auxiliary_loss_mlp": 0.01026836, + "balance_loss_clip": 1.04609776, + "balance_loss_mlp": 1.01484334, + "epoch": 0.5765819930858259, + "flos": 20266582464000.0, + "grad_norm": 2.3916173294760616, + "language_loss": 0.79227173, + "learning_rate": 1.6036380312215762e-06, + "loss": 0.81374633, + "num_input_tokens_seen": 206594100, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11993408, + "step": 9590, + "time_per_iteration": 2.5321357250213623 + }, + { + "auxiliary_loss_clip": 0.01120584, + "auxiliary_loss_mlp": 0.01029421, + "balance_loss_clip": 1.04526007, + "balance_loss_mlp": 1.01827538, + "epoch": 0.5766421163384939, + "flos": 23148772744320.0, + "grad_norm": 1.8412184357570978, + "language_loss": 0.63470078, + "learning_rate": 1.6032563018571283e-06, + "loss": 0.65620089, + "num_input_tokens_seen": 206613325, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.11157227, + "step": 9591, + "time_per_iteration": 2.4907875061035156 + }, + { + "auxiliary_loss_clip": 0.0112572, + "auxiliary_loss_mlp": 0.01036781, + "balance_loss_clip": 1.04772472, + "balance_loss_mlp": 1.02376938, + "epoch": 0.5767022395911618, + "flos": 25848895962240.0, + "grad_norm": 1.6551898933955658, + "language_loss": 0.77953076, + "learning_rate": 1.6028745875372406e-06, + "loss": 0.80115575, + "num_input_tokens_seen": 206634265, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.13006592, + "step": 9592, + "time_per_iteration": 2.5327136516571045 + }, + { + "auxiliary_loss_clip": 0.01060178, + "auxiliary_loss_mlp": 0.01006012, + "balance_loss_clip": 1.03223705, + "balance_loss_mlp": 1.00435805, + "epoch": 0.5767623628438299, + "flos": 68293299657600.0, + "grad_norm": 0.7414369587208811, + "language_loss": 0.59619099, + "learning_rate": 1.6024928882763885e-06, + "loss": 0.61685288, + "num_input_tokens_seen": 206696990, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 0.01652527, + "step": 9593, + "time_per_iteration": 3.20910382270813 + }, + { + "auxiliary_loss_clip": 0.01133346, + "auxiliary_loss_mlp": 0.01037972, + "balance_loss_clip": 1.0531925, + "balance_loss_mlp": 1.02447772, + "epoch": 0.5768224860964978, + "flos": 30188448754560.0, + "grad_norm": 2.4113576447966314, + "language_loss": 0.71082103, + "learning_rate": 1.6021112040890463e-06, + "loss": 0.73253417, + "num_input_tokens_seen": 206717815, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.13482666, + "step": 9594, + "time_per_iteration": 2.5947623252868652 + }, + { + "auxiliary_loss_clip": 0.01128354, + "auxiliary_loss_mlp": 0.01033268, + "balance_loss_clip": 1.04995871, + "balance_loss_mlp": 1.02152038, + "epoch": 0.5768826093491658, + "flos": 17895041884800.0, + "grad_norm": 1.7648087497842337, + "language_loss": 0.71052778, + "learning_rate": 1.6017295349896863e-06, + "loss": 0.732144, + "num_input_tokens_seen": 206735985, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.11755371, + "step": 9595, + "time_per_iteration": 2.435338020324707 + }, + { + "auxiliary_loss_clip": 0.01120765, + "auxiliary_loss_mlp": 0.01028717, + "balance_loss_clip": 1.0455246, + "balance_loss_mlp": 1.01588988, + "epoch": 0.5769427326018337, + "flos": 17457183095040.0, + "grad_norm": 2.096505801536373, + "language_loss": 0.69516522, + "learning_rate": 1.6013478809927828e-06, + "loss": 0.71666002, + "num_input_tokens_seen": 206753370, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.1282959, + "step": 9596, + "time_per_iteration": 2.5468108654022217 + }, + { + "auxiliary_loss_clip": 0.01128453, + "auxiliary_loss_mlp": 0.0103609, + "balance_loss_clip": 1.04653668, + "balance_loss_mlp": 1.02195764, + "epoch": 0.5770028558545017, + "flos": 39421728345600.0, + "grad_norm": 2.5399908799666355, + "language_loss": 0.67391729, + "learning_rate": 1.6009662421128074e-06, + "loss": 0.69556272, + "num_input_tokens_seen": 206777645, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.14123535, + "step": 9597, + "time_per_iteration": 2.5882370471954346 + }, + { + "auxiliary_loss_clip": 0.0112165, + "auxiliary_loss_mlp": 0.01036287, + "balance_loss_clip": 1.04490185, + "balance_loss_mlp": 1.02342987, + "epoch": 0.5770629791071697, + "flos": 21536383132800.0, + "grad_norm": 1.813002936608854, + "language_loss": 0.819489, + "learning_rate": 1.6005846183642323e-06, + "loss": 0.84106839, + "num_input_tokens_seen": 206794865, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.12866211, + "step": 9598, + "time_per_iteration": 2.469316244125366 + }, + { + "auxiliary_loss_clip": 0.01131467, + "auxiliary_loss_mlp": 0.01033719, + "balance_loss_clip": 1.05371428, + "balance_loss_mlp": 1.02008152, + "epoch": 0.5771231023598377, + "flos": 20886795624960.0, + "grad_norm": 1.469893201288035, + "language_loss": 0.72921979, + "learning_rate": 1.6002030097615277e-06, + "loss": 0.75087166, + "num_input_tokens_seen": 206814095, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.13641357, + "step": 9599, + "time_per_iteration": 2.4709644317626953 + }, + { + "auxiliary_loss_clip": 0.01121211, + "auxiliary_loss_mlp": 0.0102952, + "balance_loss_clip": 1.0481025, + "balance_loss_mlp": 1.0180943, + "epoch": 0.5771832256125057, + "flos": 18077216688000.0, + "grad_norm": 2.072486168419378, + "language_loss": 0.78231078, + "learning_rate": 1.5998214163191663e-06, + "loss": 0.80381811, + "num_input_tokens_seen": 206832245, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11431885, + "step": 9600, + "time_per_iteration": 2.5154857635498047 + }, + { + "auxiliary_loss_clip": 0.01119604, + "auxiliary_loss_mlp": 0.01041103, + "balance_loss_clip": 1.04313159, + "balance_loss_mlp": 1.02784693, + "epoch": 0.5772433488651736, + "flos": 26359078786560.0, + "grad_norm": 1.7820414415232355, + "language_loss": 0.72128308, + "learning_rate": 1.5994398380516163e-06, + "loss": 0.74289012, + "num_input_tokens_seen": 206851535, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.13262939, + "step": 9601, + "time_per_iteration": 2.517834424972534 + }, + { + "auxiliary_loss_clip": 0.01121065, + "auxiliary_loss_mlp": 0.01034184, + "balance_loss_clip": 1.04581618, + "balance_loss_mlp": 1.02201891, + "epoch": 0.5773034721178416, + "flos": 19680987035520.0, + "grad_norm": 1.8197192451503537, + "language_loss": 0.68670917, + "learning_rate": 1.599058274973348e-06, + "loss": 0.70826173, + "num_input_tokens_seen": 206870595, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.12176514, + "step": 9602, + "time_per_iteration": 2.5143282413482666 + }, + { + "auxiliary_loss_clip": 0.01114943, + "auxiliary_loss_mlp": 0.01036868, + "balance_loss_clip": 1.04257369, + "balance_loss_mlp": 1.02421963, + "epoch": 0.5773635953705095, + "flos": 25082885496960.0, + "grad_norm": 1.5536193077452947, + "language_loss": 0.73643327, + "learning_rate": 1.5986767270988297e-06, + "loss": 0.75795138, + "num_input_tokens_seen": 206892320, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.12658691, + "step": 9603, + "time_per_iteration": 2.495978832244873 + }, + { + "auxiliary_loss_clip": 0.01130253, + "auxiliary_loss_mlp": 0.0103063, + "balance_loss_clip": 1.0561769, + "balance_loss_mlp": 1.01825619, + "epoch": 0.5774237186231775, + "flos": 21032987978880.0, + "grad_norm": 1.6249350290714175, + "language_loss": 0.76499009, + "learning_rate": 1.5982951944425298e-06, + "loss": 0.78659892, + "num_input_tokens_seen": 206912485, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.12384033, + "step": 9604, + "time_per_iteration": 3.881814479827881 + }, + { + "auxiliary_loss_clip": 0.01116946, + "auxiliary_loss_mlp": 0.01034153, + "balance_loss_clip": 1.04005992, + "balance_loss_mlp": 1.0202179, + "epoch": 0.5774838418758454, + "flos": 15231727128960.0, + "grad_norm": 2.0883512781220563, + "language_loss": 0.83767462, + "learning_rate": 1.5979136770189174e-06, + "loss": 0.85918558, + "num_input_tokens_seen": 206929100, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.13946533, + "step": 9605, + "time_per_iteration": 2.541367769241333 + }, + { + "auxiliary_loss_clip": 0.01127161, + "auxiliary_loss_mlp": 0.0103123, + "balance_loss_clip": 1.04584217, + "balance_loss_mlp": 1.016114, + "epoch": 0.5775439651285135, + "flos": 23582609210880.0, + "grad_norm": 2.1189955731571417, + "language_loss": 0.77957332, + "learning_rate": 1.5975321748424581e-06, + "loss": 0.8011573, + "num_input_tokens_seen": 206947020, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.15112305, + "step": 9606, + "time_per_iteration": 2.4767704010009766 + }, + { + "auxiliary_loss_clip": 0.01119393, + "auxiliary_loss_mlp": 0.01036577, + "balance_loss_clip": 1.04414344, + "balance_loss_mlp": 1.02487111, + "epoch": 0.5776040883811814, + "flos": 18040515966720.0, + "grad_norm": 1.6776040693237815, + "language_loss": 0.73874569, + "learning_rate": 1.597150687927619e-06, + "loss": 0.76030535, + "num_input_tokens_seen": 206964065, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.11706543, + "step": 9607, + "time_per_iteration": 2.4765617847442627 + }, + { + "auxiliary_loss_clip": 0.01126723, + "auxiliary_loss_mlp": 0.0103595, + "balance_loss_clip": 1.04972517, + "balance_loss_mlp": 1.02306962, + "epoch": 0.5776642116338494, + "flos": 18624638937600.0, + "grad_norm": 1.8077288389881045, + "language_loss": 0.69387162, + "learning_rate": 1.5967692162888664e-06, + "loss": 0.71549833, + "num_input_tokens_seen": 206981940, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.12884521, + "step": 9608, + "time_per_iteration": 2.5127642154693604 + }, + { + "auxiliary_loss_clip": 0.01134139, + "auxiliary_loss_mlp": 0.01035744, + "balance_loss_clip": 1.05659795, + "balance_loss_mlp": 1.02261305, + "epoch": 0.5777243348865173, + "flos": 28402539517440.0, + "grad_norm": 1.8097480487017241, + "language_loss": 0.76352215, + "learning_rate": 1.596387759940665e-06, + "loss": 0.78522098, + "num_input_tokens_seen": 207002365, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.13140869, + "step": 9609, + "time_per_iteration": 2.580217123031616 + }, + { + "auxiliary_loss_clip": 0.01124393, + "auxiliary_loss_mlp": 0.01030253, + "balance_loss_clip": 1.0469228, + "balance_loss_mlp": 1.01782584, + "epoch": 0.5777844581391853, + "flos": 24024705805440.0, + "grad_norm": 1.6243344276781757, + "language_loss": 0.77230203, + "learning_rate": 1.5960063188974808e-06, + "loss": 0.79384851, + "num_input_tokens_seen": 207021195, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.12432861, + "step": 9610, + "time_per_iteration": 2.508500814437866 + }, + { + "auxiliary_loss_clip": 0.01119406, + "auxiliary_loss_mlp": 0.01032063, + "balance_loss_clip": 1.04315352, + "balance_loss_mlp": 1.01850319, + "epoch": 0.5778445813918534, + "flos": 17777361951360.0, + "grad_norm": 2.4801212605066927, + "language_loss": 0.69259822, + "learning_rate": 1.5956248931737777e-06, + "loss": 0.71411288, + "num_input_tokens_seen": 207037465, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.13555908, + "step": 9611, + "time_per_iteration": 2.529691457748413 + }, + { + "auxiliary_loss_clip": 0.01122566, + "auxiliary_loss_mlp": 0.01032545, + "balance_loss_clip": 1.04420352, + "balance_loss_mlp": 1.01961684, + "epoch": 0.5779047046445213, + "flos": 22233194046720.0, + "grad_norm": 1.7356840640063464, + "language_loss": 0.82707298, + "learning_rate": 1.5952434827840185e-06, + "loss": 0.84862411, + "num_input_tokens_seen": 207054230, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.1293335, + "step": 9612, + "time_per_iteration": 3.841846466064453 + }, + { + "auxiliary_loss_clip": 0.01125587, + "auxiliary_loss_mlp": 0.01031296, + "balance_loss_clip": 1.04921651, + "balance_loss_mlp": 1.01862979, + "epoch": 0.5779648278971893, + "flos": 21434361528960.0, + "grad_norm": 1.5785093111419692, + "language_loss": 0.79923207, + "learning_rate": 1.594862087742667e-06, + "loss": 0.8208009, + "num_input_tokens_seen": 207073150, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.12658691, + "step": 9613, + "time_per_iteration": 2.4600300788879395 + }, + { + "auxiliary_loss_clip": 0.01124051, + "auxiliary_loss_mlp": 0.01034822, + "balance_loss_clip": 1.04878664, + "balance_loss_mlp": 1.02287078, + "epoch": 0.5780249511498572, + "flos": 19026120228480.0, + "grad_norm": 1.9307717425388145, + "language_loss": 0.77411258, + "learning_rate": 1.5944807080641863e-06, + "loss": 0.79570127, + "num_input_tokens_seen": 207090375, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.11950684, + "step": 9614, + "time_per_iteration": 2.4321746826171875 + }, + { + "auxiliary_loss_clip": 0.01118579, + "auxiliary_loss_mlp": 0.01031403, + "balance_loss_clip": 1.04140377, + "balance_loss_mlp": 1.01919055, + "epoch": 0.5780850744025252, + "flos": 12124663752960.0, + "grad_norm": 2.3268732198661333, + "language_loss": 0.81060958, + "learning_rate": 1.5940993437630375e-06, + "loss": 0.83210933, + "num_input_tokens_seen": 207106030, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12225342, + "step": 9615, + "time_per_iteration": 2.475541114807129 + }, + { + "auxiliary_loss_clip": 0.01117146, + "auxiliary_loss_mlp": 0.01033104, + "balance_loss_clip": 1.04062033, + "balance_loss_mlp": 1.02044415, + "epoch": 0.5781451976551931, + "flos": 25044425009280.0, + "grad_norm": 1.7321241416308302, + "language_loss": 0.67226696, + "learning_rate": 1.5937179948536825e-06, + "loss": 0.69376945, + "num_input_tokens_seen": 207125435, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.12658691, + "step": 9616, + "time_per_iteration": 2.506876230239868 + }, + { + "auxiliary_loss_clip": 0.0112743, + "auxiliary_loss_mlp": 0.01031347, + "balance_loss_clip": 1.05187249, + "balance_loss_mlp": 1.01838255, + "epoch": 0.5782053209078611, + "flos": 19245606284160.0, + "grad_norm": 1.7519079463873184, + "language_loss": 0.77826762, + "learning_rate": 1.5933366613505812e-06, + "loss": 0.79985535, + "num_input_tokens_seen": 207145095, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12976074, + "step": 9617, + "time_per_iteration": 2.43389630317688 + }, + { + "auxiliary_loss_clip": 0.01131829, + "auxiliary_loss_mlp": 0.01030429, + "balance_loss_clip": 1.05376756, + "balance_loss_mlp": 1.01706004, + "epoch": 0.578265444160529, + "flos": 25993831340160.0, + "grad_norm": 1.443938058427296, + "language_loss": 0.74987316, + "learning_rate": 1.5929553432681947e-06, + "loss": 0.7714957, + "num_input_tokens_seen": 207166045, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.13366699, + "step": 9618, + "time_per_iteration": 2.5211234092712402 + }, + { + "auxiliary_loss_clip": 0.01117846, + "auxiliary_loss_mlp": 0.01031162, + "balance_loss_clip": 1.04434085, + "balance_loss_mlp": 1.01927686, + "epoch": 0.5783255674131971, + "flos": 21798603394560.0, + "grad_norm": 3.701459265023351, + "language_loss": 0.80765355, + "learning_rate": 1.5925740406209826e-06, + "loss": 0.82914364, + "num_input_tokens_seen": 207185290, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11871338, + "step": 9619, + "time_per_iteration": 3.8638622760772705 + }, + { + "auxiliary_loss_clip": 0.01120899, + "auxiliary_loss_mlp": 0.01031305, + "balance_loss_clip": 1.04558063, + "balance_loss_mlp": 1.01958644, + "epoch": 0.578385690665865, + "flos": 24789746603520.0, + "grad_norm": 1.6572406224054461, + "language_loss": 0.72270584, + "learning_rate": 1.5921927534234039e-06, + "loss": 0.74422789, + "num_input_tokens_seen": 207205505, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.11724854, + "step": 9620, + "time_per_iteration": 2.504183769226074 + }, + { + "auxiliary_loss_clip": 0.01118662, + "auxiliary_loss_mlp": 0.01029726, + "balance_loss_clip": 1.04189563, + "balance_loss_mlp": 1.01645195, + "epoch": 0.578445813918533, + "flos": 21212864311680.0, + "grad_norm": 1.6246893906629907, + "language_loss": 0.76954496, + "learning_rate": 1.591811481689916e-06, + "loss": 0.79102886, + "num_input_tokens_seen": 207225315, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.1328125, + "step": 9621, + "time_per_iteration": 2.4804930686950684 + }, + { + "auxiliary_loss_clip": 0.01121958, + "auxiliary_loss_mlp": 0.01032475, + "balance_loss_clip": 1.04541707, + "balance_loss_mlp": 1.01895666, + "epoch": 0.5785059371712009, + "flos": 25046795306880.0, + "grad_norm": 1.4646168804992736, + "language_loss": 0.70334238, + "learning_rate": 1.5914302254349787e-06, + "loss": 0.72488678, + "num_input_tokens_seen": 207247690, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.13513184, + "step": 9622, + "time_per_iteration": 2.597935676574707 + }, + { + "auxiliary_loss_clip": 0.01069128, + "auxiliary_loss_mlp": 0.01006788, + "balance_loss_clip": 1.04176676, + "balance_loss_mlp": 1.0051446, + "epoch": 0.5785660604238689, + "flos": 70843172284800.0, + "grad_norm": 0.7778825149680442, + "language_loss": 0.55976629, + "learning_rate": 1.5910489846730476e-06, + "loss": 0.58052552, + "num_input_tokens_seen": 207301735, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.01644897, + "step": 9623, + "time_per_iteration": 3.1196975708007812 + }, + { + "auxiliary_loss_clip": 0.0112263, + "auxiliary_loss_mlp": 0.01036801, + "balance_loss_clip": 1.04374707, + "balance_loss_mlp": 1.02302623, + "epoch": 0.578626183676537, + "flos": 31649977244160.0, + "grad_norm": 2.0779092601767157, + "language_loss": 0.71555305, + "learning_rate": 1.5906677594185799e-06, + "loss": 0.73714733, + "num_input_tokens_seen": 207321240, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.13763428, + "step": 9624, + "time_per_iteration": 2.638850450515747 + }, + { + "auxiliary_loss_clip": 0.0111831, + "auxiliary_loss_mlp": 0.01042906, + "balance_loss_clip": 1.04321694, + "balance_loss_mlp": 1.02846956, + "epoch": 0.5786863069292049, + "flos": 21865181253120.0, + "grad_norm": 2.438121596547636, + "language_loss": 0.81991184, + "learning_rate": 1.5902865496860322e-06, + "loss": 0.841524, + "num_input_tokens_seen": 207339540, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.14434814, + "step": 9625, + "time_per_iteration": 3.9296507835388184 + }, + { + "auxiliary_loss_clip": 0.01122587, + "auxiliary_loss_mlp": 0.01033984, + "balance_loss_clip": 1.04670596, + "balance_loss_mlp": 1.01987004, + "epoch": 0.5787464301818729, + "flos": 23364954748800.0, + "grad_norm": 1.6927677761173259, + "language_loss": 0.70203948, + "learning_rate": 1.5899053554898591e-06, + "loss": 0.72360516, + "num_input_tokens_seen": 207360470, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.14105225, + "step": 9626, + "time_per_iteration": 2.4952118396759033 + }, + { + "auxiliary_loss_clip": 0.01121785, + "auxiliary_loss_mlp": 0.01039363, + "balance_loss_clip": 1.04483199, + "balance_loss_mlp": 1.02683973, + "epoch": 0.5788065534345408, + "flos": 30004011394560.0, + "grad_norm": 1.571892912814127, + "language_loss": 0.71728539, + "learning_rate": 1.5895241768445166e-06, + "loss": 0.73889685, + "num_input_tokens_seen": 207383080, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.12524414, + "step": 9627, + "time_per_iteration": 2.549995183944702 + }, + { + "auxiliary_loss_clip": 0.01121796, + "auxiliary_loss_mlp": 0.01029865, + "balance_loss_clip": 1.04712665, + "balance_loss_mlp": 1.01743758, + "epoch": 0.5788666766872088, + "flos": 24527849564160.0, + "grad_norm": 1.6538479494448022, + "language_loss": 0.83684897, + "learning_rate": 1.589143013764458e-06, + "loss": 0.85836554, + "num_input_tokens_seen": 207401000, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.12426758, + "step": 9628, + "time_per_iteration": 2.484215021133423 + }, + { + "auxiliary_loss_clip": 0.01121685, + "auxiliary_loss_mlp": 0.01030849, + "balance_loss_clip": 1.04377222, + "balance_loss_mlp": 1.01785505, + "epoch": 0.5789267999398767, + "flos": 23732823888000.0, + "grad_norm": 1.575749360131119, + "language_loss": 0.72208118, + "learning_rate": 1.5887618662641376e-06, + "loss": 0.74360657, + "num_input_tokens_seen": 207419230, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.12988281, + "step": 9629, + "time_per_iteration": 2.4943459033966064 + }, + { + "auxiliary_loss_clip": 0.01129711, + "auxiliary_loss_mlp": 0.01039462, + "balance_loss_clip": 1.0536468, + "balance_loss_mlp": 1.02555585, + "epoch": 0.5789869231925447, + "flos": 21135045496320.0, + "grad_norm": 2.121284161756466, + "language_loss": 0.74630654, + "learning_rate": 1.5883807343580087e-06, + "loss": 0.76799834, + "num_input_tokens_seen": 207437615, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.13928223, + "step": 9630, + "time_per_iteration": 2.4878528118133545 + }, + { + "auxiliary_loss_clip": 0.01118453, + "auxiliary_loss_mlp": 0.01029388, + "balance_loss_clip": 1.04426336, + "balance_loss_mlp": 1.01705599, + "epoch": 0.5790470464452127, + "flos": 21209632087680.0, + "grad_norm": 1.5669028228829591, + "language_loss": 0.78842461, + "learning_rate": 1.587999618060523e-06, + "loss": 0.80990309, + "num_input_tokens_seen": 207457270, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.12335205, + "step": 9631, + "time_per_iteration": 2.5042660236358643 + }, + { + "auxiliary_loss_clip": 0.01122028, + "auxiliary_loss_mlp": 0.01029886, + "balance_loss_clip": 1.044842, + "balance_loss_mlp": 1.01730943, + "epoch": 0.5791071696978807, + "flos": 23404384903680.0, + "grad_norm": 1.5913421920801516, + "language_loss": 0.74845982, + "learning_rate": 1.5876185173861333e-06, + "loss": 0.769979, + "num_input_tokens_seen": 207477890, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.12585449, + "step": 9632, + "time_per_iteration": 2.5072553157806396 + }, + { + "auxiliary_loss_clip": 0.01122286, + "auxiliary_loss_mlp": 0.01029357, + "balance_loss_clip": 1.04538536, + "balance_loss_mlp": 1.01610112, + "epoch": 0.5791672929505486, + "flos": 24206521472640.0, + "grad_norm": 1.842806177042475, + "language_loss": 0.79416168, + "learning_rate": 1.5872374323492915e-06, + "loss": 0.81567812, + "num_input_tokens_seen": 207497670, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.13250732, + "step": 9633, + "time_per_iteration": 2.5288546085357666 + }, + { + "auxiliary_loss_clip": 0.01134102, + "auxiliary_loss_mlp": 0.01035326, + "balance_loss_clip": 1.05122328, + "balance_loss_mlp": 1.02137208, + "epoch": 0.5792274162032166, + "flos": 24348871071360.0, + "grad_norm": 1.6972554994256566, + "language_loss": 0.77889711, + "learning_rate": 1.5868563629644464e-06, + "loss": 0.80059135, + "num_input_tokens_seen": 207516105, + "router_z_loss_clip": 0.82910156, + "router_z_loss_mlp": 0.13952637, + "step": 9634, + "time_per_iteration": 2.4831924438476562 + }, + { + "auxiliary_loss_clip": 0.01119351, + "auxiliary_loss_mlp": 0.01039701, + "balance_loss_clip": 1.0414784, + "balance_loss_mlp": 1.0247817, + "epoch": 0.5792875394558845, + "flos": 20449403712000.0, + "grad_norm": 2.725049595517748, + "language_loss": 0.62954634, + "learning_rate": 1.5864753092460502e-06, + "loss": 0.65113688, + "num_input_tokens_seen": 207533685, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.14923096, + "step": 9635, + "time_per_iteration": 2.4983811378479004 + }, + { + "auxiliary_loss_clip": 0.01118962, + "auxiliary_loss_mlp": 0.0103602, + "balance_loss_clip": 1.04352462, + "balance_loss_mlp": 1.02222741, + "epoch": 0.5793476627085525, + "flos": 24060329118720.0, + "grad_norm": 1.4484606414726569, + "language_loss": 0.77173138, + "learning_rate": 1.5860942712085516e-06, + "loss": 0.79328126, + "num_input_tokens_seen": 207552840, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.13793945, + "step": 9636, + "time_per_iteration": 2.459996223449707 + }, + { + "auxiliary_loss_clip": 0.01112611, + "auxiliary_loss_mlp": 0.01032394, + "balance_loss_clip": 1.03985465, + "balance_loss_mlp": 1.01934671, + "epoch": 0.5794077859612206, + "flos": 22054287381120.0, + "grad_norm": 2.3580085564069715, + "language_loss": 0.68953663, + "learning_rate": 1.5857132488663998e-06, + "loss": 0.71098673, + "num_input_tokens_seen": 207572095, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.13049316, + "step": 9637, + "time_per_iteration": 2.468273878097534 + }, + { + "auxiliary_loss_clip": 0.01124323, + "auxiliary_loss_mlp": 0.01034031, + "balance_loss_clip": 1.04564238, + "balance_loss_mlp": 1.02075052, + "epoch": 0.5794679092138885, + "flos": 11434855991040.0, + "grad_norm": 2.5770848126552397, + "language_loss": 0.72294044, + "learning_rate": 1.585332242234043e-06, + "loss": 0.744524, + "num_input_tokens_seen": 207587495, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.13275146, + "step": 9638, + "time_per_iteration": 2.423008680343628 + }, + { + "auxiliary_loss_clip": 0.01130798, + "auxiliary_loss_mlp": 0.01034486, + "balance_loss_clip": 1.05368137, + "balance_loss_mlp": 1.02098584, + "epoch": 0.5795280324665565, + "flos": 18880215183360.0, + "grad_norm": 2.083685751631243, + "language_loss": 0.72584611, + "learning_rate": 1.5849512513259291e-06, + "loss": 0.74749899, + "num_input_tokens_seen": 207606795, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.13482666, + "step": 9639, + "time_per_iteration": 2.4735865592956543 + }, + { + "auxiliary_loss_clip": 0.01128821, + "auxiliary_loss_mlp": 0.01035751, + "balance_loss_clip": 1.04940629, + "balance_loss_mlp": 1.02241135, + "epoch": 0.5795881557192244, + "flos": 13005947940480.0, + "grad_norm": 2.1796162012386935, + "language_loss": 0.69761628, + "learning_rate": 1.5845702761565054e-06, + "loss": 0.719262, + "num_input_tokens_seen": 207623620, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.13342285, + "step": 9640, + "time_per_iteration": 2.4321625232696533 + }, + { + "auxiliary_loss_clip": 0.01135654, + "auxiliary_loss_mlp": 0.0103727, + "balance_loss_clip": 1.05476379, + "balance_loss_mlp": 1.02315521, + "epoch": 0.5796482789718924, + "flos": 19932397303680.0, + "grad_norm": 2.4005519693790474, + "language_loss": 0.78069013, + "learning_rate": 1.5841893167402183e-06, + "loss": 0.80241942, + "num_input_tokens_seen": 207639380, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.14099121, + "step": 9641, + "time_per_iteration": 2.47526216506958 + }, + { + "auxiliary_loss_clip": 0.01121927, + "auxiliary_loss_mlp": 0.01031971, + "balance_loss_clip": 1.04522729, + "balance_loss_mlp": 1.01953161, + "epoch": 0.5797084022245603, + "flos": 21650794928640.0, + "grad_norm": 1.9836163740561334, + "language_loss": 0.73729402, + "learning_rate": 1.5838083730915143e-06, + "loss": 0.75883305, + "num_input_tokens_seen": 207657915, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.12432861, + "step": 9642, + "time_per_iteration": 2.4833567142486572 + }, + { + "auxiliary_loss_clip": 0.01123719, + "auxiliary_loss_mlp": 0.01033846, + "balance_loss_clip": 1.04710174, + "balance_loss_mlp": 1.02068532, + "epoch": 0.5797685254772283, + "flos": 26031573555840.0, + "grad_norm": 1.5234941874324806, + "language_loss": 0.7324397, + "learning_rate": 1.5834274452248378e-06, + "loss": 0.75401539, + "num_input_tokens_seen": 207678620, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.13153076, + "step": 9643, + "time_per_iteration": 2.5342280864715576 + }, + { + "auxiliary_loss_clip": 0.01129351, + "auxiliary_loss_mlp": 0.01037006, + "balance_loss_clip": 1.04782248, + "balance_loss_mlp": 1.02357078, + "epoch": 0.5798286487298963, + "flos": 22705167778560.0, + "grad_norm": 1.8120678574600624, + "language_loss": 0.67131889, + "learning_rate": 1.5830465331546352e-06, + "loss": 0.69298244, + "num_input_tokens_seen": 207696980, + "router_z_loss_clip": 0.81494141, + "router_z_loss_mlp": 0.13439941, + "step": 9644, + "time_per_iteration": 2.4529221057891846 + }, + { + "auxiliary_loss_clip": 0.01129893, + "auxiliary_loss_mlp": 0.01033895, + "balance_loss_clip": 1.0496521, + "balance_loss_mlp": 1.02042401, + "epoch": 0.5798887719825643, + "flos": 23148988225920.0, + "grad_norm": 2.4240263818853136, + "language_loss": 0.85814226, + "learning_rate": 1.5826656368953496e-06, + "loss": 0.87978017, + "num_input_tokens_seen": 207714065, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.13470459, + "step": 9645, + "time_per_iteration": 2.5352845191955566 + }, + { + "auxiliary_loss_clip": 0.01130658, + "auxiliary_loss_mlp": 0.01032872, + "balance_loss_clip": 1.05072904, + "balance_loss_mlp": 1.01980114, + "epoch": 0.5799488952352322, + "flos": 24426043441920.0, + "grad_norm": 1.7813031426628825, + "language_loss": 0.75278592, + "learning_rate": 1.5822847564614244e-06, + "loss": 0.77442122, + "num_input_tokens_seen": 207734720, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.13079834, + "step": 9646, + "time_per_iteration": 2.685397148132324 + }, + { + "auxiliary_loss_clip": 0.01132771, + "auxiliary_loss_mlp": 0.01035033, + "balance_loss_clip": 1.05154753, + "balance_loss_mlp": 1.02110291, + "epoch": 0.5800090184879002, + "flos": 38395903829760.0, + "grad_norm": 1.7066771714879927, + "language_loss": 0.59512681, + "learning_rate": 1.5819038918673038e-06, + "loss": 0.61680484, + "num_input_tokens_seen": 207755435, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.13922119, + "step": 9647, + "time_per_iteration": 2.694101572036743 + }, + { + "auxiliary_loss_clip": 0.0113167, + "auxiliary_loss_mlp": 0.01039126, + "balance_loss_clip": 1.05270863, + "balance_loss_mlp": 1.02564991, + "epoch": 0.5800691417405681, + "flos": 19784840232960.0, + "grad_norm": 1.6305163040687718, + "language_loss": 0.84077835, + "learning_rate": 1.5815230431274288e-06, + "loss": 0.86248624, + "num_input_tokens_seen": 207773570, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.1348877, + "step": 9648, + "time_per_iteration": 3.883089780807495 + }, + { + "auxiliary_loss_clip": 0.01053071, + "auxiliary_loss_mlp": 0.01006853, + "balance_loss_clip": 1.02560854, + "balance_loss_mlp": 1.00501764, + "epoch": 0.5801292649932361, + "flos": 70314565783680.0, + "grad_norm": 0.8433439637068939, + "language_loss": 0.6303283, + "learning_rate": 1.581142210256242e-06, + "loss": 0.65092748, + "num_input_tokens_seen": 207830095, + "router_z_loss_clip": 0.27441406, + "router_z_loss_mlp": 0.01837158, + "step": 9649, + "time_per_iteration": 3.114362955093384 + }, + { + "auxiliary_loss_clip": 0.01121676, + "auxiliary_loss_mlp": 0.01031765, + "balance_loss_clip": 1.04556274, + "balance_loss_mlp": 1.0191586, + "epoch": 0.5801893882459042, + "flos": 18734812928640.0, + "grad_norm": 1.6859446138501821, + "language_loss": 0.82194752, + "learning_rate": 1.5807613932681857e-06, + "loss": 0.8434819, + "num_input_tokens_seen": 207848555, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12597656, + "step": 9650, + "time_per_iteration": 2.4543867111206055 + }, + { + "auxiliary_loss_clip": 0.01128924, + "auxiliary_loss_mlp": 0.01034492, + "balance_loss_clip": 1.04857588, + "balance_loss_mlp": 1.02127743, + "epoch": 0.5802495114985721, + "flos": 15596507698560.0, + "grad_norm": 2.6691882503299595, + "language_loss": 0.77519733, + "learning_rate": 1.580380592177698e-06, + "loss": 0.79683149, + "num_input_tokens_seen": 207867060, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.13214111, + "step": 9651, + "time_per_iteration": 2.4565157890319824 + }, + { + "auxiliary_loss_clip": 0.01127261, + "auxiliary_loss_mlp": 0.01040567, + "balance_loss_clip": 1.04973853, + "balance_loss_mlp": 1.02691746, + "epoch": 0.5803096347512401, + "flos": 18255405081600.0, + "grad_norm": 1.8120277086152536, + "language_loss": 0.73973578, + "learning_rate": 1.5799998069992213e-06, + "loss": 0.76141405, + "num_input_tokens_seen": 207884520, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.13647461, + "step": 9652, + "time_per_iteration": 2.4381303787231445 + }, + { + "auxiliary_loss_clip": 0.01128853, + "auxiliary_loss_mlp": 0.01031108, + "balance_loss_clip": 1.05030179, + "balance_loss_mlp": 1.01732707, + "epoch": 0.580369758003908, + "flos": 22893160584960.0, + "grad_norm": 2.159474333243404, + "language_loss": 0.76858521, + "learning_rate": 1.579619037747193e-06, + "loss": 0.7901848, + "num_input_tokens_seen": 207905370, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.13793945, + "step": 9653, + "time_per_iteration": 2.531759023666382 + }, + { + "auxiliary_loss_clip": 0.01127591, + "auxiliary_loss_mlp": 0.0103338, + "balance_loss_clip": 1.04963923, + "balance_loss_mlp": 1.01938474, + "epoch": 0.580429881256576, + "flos": 18697681244160.0, + "grad_norm": 1.9109150613726618, + "language_loss": 0.74391973, + "learning_rate": 1.5792382844360534e-06, + "loss": 0.76552945, + "num_input_tokens_seen": 207923790, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.14007568, + "step": 9654, + "time_per_iteration": 2.4328792095184326 + }, + { + "auxiliary_loss_clip": 0.01127086, + "auxiliary_loss_mlp": 0.01036988, + "balance_loss_clip": 1.05412149, + "balance_loss_mlp": 1.02486444, + "epoch": 0.5804900045092439, + "flos": 24681978823680.0, + "grad_norm": 2.9497732332119524, + "language_loss": 0.70640391, + "learning_rate": 1.5788575470802408e-06, + "loss": 0.72804469, + "num_input_tokens_seen": 207942335, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.12133789, + "step": 9655, + "time_per_iteration": 2.4982569217681885 + }, + { + "auxiliary_loss_clip": 0.01126612, + "auxiliary_loss_mlp": 0.0103377, + "balance_loss_clip": 1.04532433, + "balance_loss_mlp": 1.0206095, + "epoch": 0.580550127761912, + "flos": 23112790295040.0, + "grad_norm": 1.8035424238522277, + "language_loss": 0.69581699, + "learning_rate": 1.5784768256941915e-06, + "loss": 0.71742082, + "num_input_tokens_seen": 207961975, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.13171387, + "step": 9656, + "time_per_iteration": 3.9448416233062744 + }, + { + "auxiliary_loss_clip": 0.01125108, + "auxiliary_loss_mlp": 0.0103255, + "balance_loss_clip": 1.05184603, + "balance_loss_mlp": 1.02066481, + "epoch": 0.5806102510145799, + "flos": 18475681236480.0, + "grad_norm": 2.799476005480815, + "language_loss": 0.71888745, + "learning_rate": 1.5780961202923433e-06, + "loss": 0.74046397, + "num_input_tokens_seen": 207979520, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11883545, + "step": 9657, + "time_per_iteration": 2.576895236968994 + }, + { + "auxiliary_loss_clip": 0.01140625, + "auxiliary_loss_mlp": 0.01037604, + "balance_loss_clip": 1.05875397, + "balance_loss_mlp": 1.02331114, + "epoch": 0.5806703742672479, + "flos": 23915645136000.0, + "grad_norm": 2.6662852808337156, + "language_loss": 0.71474636, + "learning_rate": 1.5777154308891328e-06, + "loss": 0.73652864, + "num_input_tokens_seen": 207998375, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.1428833, + "step": 9658, + "time_per_iteration": 2.549525260925293 + }, + { + "auxiliary_loss_clip": 0.01074847, + "auxiliary_loss_mlp": 0.01009142, + "balance_loss_clip": 1.04925513, + "balance_loss_mlp": 1.00721645, + "epoch": 0.5807304975199158, + "flos": 66311999412480.0, + "grad_norm": 0.6505259500155668, + "language_loss": 0.53572065, + "learning_rate": 1.5773347574989953e-06, + "loss": 0.55656052, + "num_input_tokens_seen": 208060605, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.01925659, + "step": 9659, + "time_per_iteration": 3.086421012878418 + }, + { + "auxiliary_loss_clip": 0.01126834, + "auxiliary_loss_mlp": 0.01042499, + "balance_loss_clip": 1.04903793, + "balance_loss_mlp": 1.02960086, + "epoch": 0.5807906207725838, + "flos": 31722444933120.0, + "grad_norm": 1.9052964829281427, + "language_loss": 0.61985272, + "learning_rate": 1.576954100136366e-06, + "loss": 0.64154601, + "num_input_tokens_seen": 208080320, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.12902832, + "step": 9660, + "time_per_iteration": 2.5304884910583496 + }, + { + "auxiliary_loss_clip": 0.01124358, + "auxiliary_loss_mlp": 0.01036253, + "balance_loss_clip": 1.04510987, + "balance_loss_mlp": 1.02276421, + "epoch": 0.5808507440252517, + "flos": 23801161512960.0, + "grad_norm": 1.5389848132869166, + "language_loss": 0.656739, + "learning_rate": 1.5765734588156797e-06, + "loss": 0.67834514, + "num_input_tokens_seen": 208099305, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.13500977, + "step": 9661, + "time_per_iteration": 2.461526870727539 + }, + { + "auxiliary_loss_clip": 0.01125386, + "auxiliary_loss_mlp": 0.01032006, + "balance_loss_clip": 1.04973125, + "balance_loss_mlp": 1.0204072, + "epoch": 0.5809108672779197, + "flos": 13698449222400.0, + "grad_norm": 1.4979209576959818, + "language_loss": 0.74580407, + "learning_rate": 1.5761928335513704e-06, + "loss": 0.76737797, + "num_input_tokens_seen": 208116960, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.1159668, + "step": 9662, + "time_per_iteration": 2.450573444366455 + }, + { + "auxiliary_loss_clip": 0.01059149, + "auxiliary_loss_mlp": 0.01005523, + "balance_loss_clip": 1.03118205, + "balance_loss_mlp": 1.00401473, + "epoch": 0.5809709905305876, + "flos": 69134866381440.0, + "grad_norm": 0.8844735506185188, + "language_loss": 0.58385015, + "learning_rate": 1.5758122243578709e-06, + "loss": 0.6044969, + "num_input_tokens_seen": 208182190, + "router_z_loss_clip": 0.28027344, + "router_z_loss_mlp": 0.0151062, + "step": 9663, + "time_per_iteration": 4.659381866455078 + }, + { + "auxiliary_loss_clip": 0.01125383, + "auxiliary_loss_mlp": 0.01035325, + "balance_loss_clip": 1.0498023, + "balance_loss_mlp": 1.02218258, + "epoch": 0.5810311137832557, + "flos": 19827538525440.0, + "grad_norm": 2.4555013705423927, + "language_loss": 0.82477546, + "learning_rate": 1.5754316312496152e-06, + "loss": 0.8463825, + "num_input_tokens_seen": 208197015, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.13146973, + "step": 9664, + "time_per_iteration": 2.4480578899383545 + }, + { + "auxiliary_loss_clip": 0.01120703, + "auxiliary_loss_mlp": 0.01033632, + "balance_loss_clip": 1.04248929, + "balance_loss_mlp": 1.01997018, + "epoch": 0.5810912370359237, + "flos": 29238503719680.0, + "grad_norm": 1.7872849778730073, + "language_loss": 0.81623662, + "learning_rate": 1.5750510542410337e-06, + "loss": 0.83777994, + "num_input_tokens_seen": 208215795, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.13665771, + "step": 9665, + "time_per_iteration": 2.562596559524536 + }, + { + "auxiliary_loss_clip": 0.01139096, + "auxiliary_loss_mlp": 0.01037736, + "balance_loss_clip": 1.05796576, + "balance_loss_mlp": 1.02304971, + "epoch": 0.5811513602885916, + "flos": 22785572373120.0, + "grad_norm": 1.6341542042556585, + "language_loss": 0.8124187, + "learning_rate": 1.5746704933465599e-06, + "loss": 0.83418703, + "num_input_tokens_seen": 208234655, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.14678955, + "step": 9666, + "time_per_iteration": 2.5056610107421875 + }, + { + "auxiliary_loss_clip": 0.01117851, + "auxiliary_loss_mlp": 0.01035011, + "balance_loss_clip": 1.0436058, + "balance_loss_mlp": 1.02349567, + "epoch": 0.5812114835412596, + "flos": 18734346051840.0, + "grad_norm": 2.2902006654796137, + "language_loss": 0.80437309, + "learning_rate": 1.5742899485806227e-06, + "loss": 0.82590169, + "num_input_tokens_seen": 208251300, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11517334, + "step": 9667, + "time_per_iteration": 2.4861514568328857 + }, + { + "auxiliary_loss_clip": 0.01130968, + "auxiliary_loss_mlp": 0.01038685, + "balance_loss_clip": 1.04749107, + "balance_loss_mlp": 1.02409434, + "epoch": 0.5812716067939275, + "flos": 26431295080320.0, + "grad_norm": 1.4297358804851428, + "language_loss": 0.78614604, + "learning_rate": 1.573909419957653e-06, + "loss": 0.80784255, + "num_input_tokens_seen": 208272685, + "router_z_loss_clip": 0.83496094, + "router_z_loss_mlp": 0.14611816, + "step": 9668, + "time_per_iteration": 2.598633289337158 + }, + { + "auxiliary_loss_clip": 0.01121591, + "auxiliary_loss_mlp": 0.01032491, + "balance_loss_clip": 1.04543018, + "balance_loss_mlp": 1.02082598, + "epoch": 0.5813317300465956, + "flos": 43397865285120.0, + "grad_norm": 1.813291506727583, + "language_loss": 0.6450848, + "learning_rate": 1.5735289074920819e-06, + "loss": 0.66662556, + "num_input_tokens_seen": 208294315, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.11676025, + "step": 9669, + "time_per_iteration": 4.1478493213653564 + }, + { + "auxiliary_loss_clip": 0.01119083, + "auxiliary_loss_mlp": 0.01040726, + "balance_loss_clip": 1.04356706, + "balance_loss_mlp": 1.02667165, + "epoch": 0.5813918532992635, + "flos": 24785472885120.0, + "grad_norm": 2.0303503030119603, + "language_loss": 0.73257685, + "learning_rate": 1.5731484111983363e-06, + "loss": 0.75417495, + "num_input_tokens_seen": 208315610, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.14050293, + "step": 9670, + "time_per_iteration": 2.548137664794922 + }, + { + "auxiliary_loss_clip": 0.01117307, + "auxiliary_loss_mlp": 0.01037902, + "balance_loss_clip": 1.04051971, + "balance_loss_mlp": 1.02521849, + "epoch": 0.5814519765519315, + "flos": 22857357703680.0, + "grad_norm": 2.071093245509157, + "language_loss": 0.79171228, + "learning_rate": 1.5727679310908464e-06, + "loss": 0.81326437, + "num_input_tokens_seen": 208334725, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.12683105, + "step": 9671, + "time_per_iteration": 2.5133161544799805 + }, + { + "auxiliary_loss_clip": 0.01128754, + "auxiliary_loss_mlp": 0.01037194, + "balance_loss_clip": 1.04789817, + "balance_loss_mlp": 1.0228349, + "epoch": 0.5815120998045994, + "flos": 24060831909120.0, + "grad_norm": 1.8717631316250842, + "language_loss": 0.6082921, + "learning_rate": 1.5723874671840399e-06, + "loss": 0.6299516, + "num_input_tokens_seen": 208353825, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.14361572, + "step": 9672, + "time_per_iteration": 2.516207456588745 + }, + { + "auxiliary_loss_clip": 0.0111983, + "auxiliary_loss_mlp": 0.01031105, + "balance_loss_clip": 1.04500532, + "balance_loss_mlp": 1.01888573, + "epoch": 0.5815722230572674, + "flos": 24279491952000.0, + "grad_norm": 2.454113660947219, + "language_loss": 0.81453264, + "learning_rate": 1.572007019492342e-06, + "loss": 0.83604199, + "num_input_tokens_seen": 208374160, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.12219238, + "step": 9673, + "time_per_iteration": 2.485460042953491 + }, + { + "auxiliary_loss_clip": 0.01125265, + "auxiliary_loss_mlp": 0.01038459, + "balance_loss_clip": 1.04566729, + "balance_loss_mlp": 1.02442813, + "epoch": 0.5816323463099353, + "flos": 22200371994240.0, + "grad_norm": 1.7883820631644367, + "language_loss": 0.88247877, + "learning_rate": 1.5716265880301817e-06, + "loss": 0.90411603, + "num_input_tokens_seen": 208392105, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.14025879, + "step": 9674, + "time_per_iteration": 2.471027135848999 + }, + { + "auxiliary_loss_clip": 0.01124148, + "auxiliary_loss_mlp": 0.01034578, + "balance_loss_clip": 1.04773831, + "balance_loss_mlp": 1.02260947, + "epoch": 0.5816924695626033, + "flos": 24134448833280.0, + "grad_norm": 1.9646258360104873, + "language_loss": 0.78997731, + "learning_rate": 1.571246172811984e-06, + "loss": 0.81156456, + "num_input_tokens_seen": 208411755, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.11968994, + "step": 9675, + "time_per_iteration": 2.4794740676879883 + }, + { + "auxiliary_loss_clip": 0.01116052, + "auxiliary_loss_mlp": 0.01033761, + "balance_loss_clip": 1.04130054, + "balance_loss_mlp": 1.02031422, + "epoch": 0.5817525928152713, + "flos": 21324223451520.0, + "grad_norm": 2.166011749499901, + "language_loss": 0.70383954, + "learning_rate": 1.5708657738521748e-06, + "loss": 0.72533762, + "num_input_tokens_seen": 208429995, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.13452148, + "step": 9676, + "time_per_iteration": 2.4942307472229004 + }, + { + "auxiliary_loss_clip": 0.01116076, + "auxiliary_loss_mlp": 0.0103393, + "balance_loss_clip": 1.04043734, + "balance_loss_mlp": 1.0205549, + "epoch": 0.5818127160679393, + "flos": 26934510666240.0, + "grad_norm": 2.3173284511575574, + "language_loss": 0.63810885, + "learning_rate": 1.5704853911651779e-06, + "loss": 0.65960896, + "num_input_tokens_seen": 208443655, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.13366699, + "step": 9677, + "time_per_iteration": 2.5038998126983643 + }, + { + "auxiliary_loss_clip": 0.01051921, + "auxiliary_loss_mlp": 0.01009829, + "balance_loss_clip": 1.02565479, + "balance_loss_mlp": 1.0084002, + "epoch": 0.5818728393206073, + "flos": 63918626342400.0, + "grad_norm": 0.8062233670764815, + "language_loss": 0.54175216, + "learning_rate": 1.5701050247654182e-06, + "loss": 0.56236964, + "num_input_tokens_seen": 208498405, + "router_z_loss_clip": 0.26220703, + "router_z_loss_mlp": 0.01431274, + "step": 9678, + "time_per_iteration": 3.1734914779663086 + }, + { + "auxiliary_loss_clip": 0.01063692, + "auxiliary_loss_mlp": 0.01003935, + "balance_loss_clip": 1.03802001, + "balance_loss_mlp": 1.00203395, + "epoch": 0.5819329625732752, + "flos": 64954108638720.0, + "grad_norm": 0.7273702746559824, + "language_loss": 0.56164163, + "learning_rate": 1.569724674667319e-06, + "loss": 0.58231789, + "num_input_tokens_seen": 208559075, + "router_z_loss_clip": 0.25634766, + "router_z_loss_mlp": 0.01899719, + "step": 9679, + "time_per_iteration": 2.9923245906829834 + }, + { + "auxiliary_loss_clip": 0.01125389, + "auxiliary_loss_mlp": 0.01027503, + "balance_loss_clip": 1.04815388, + "balance_loss_mlp": 1.01597559, + "epoch": 0.5819930858259432, + "flos": 21215270522880.0, + "grad_norm": 1.6026608803738327, + "language_loss": 0.65756679, + "learning_rate": 1.5693443408853032e-06, + "loss": 0.67909575, + "num_input_tokens_seen": 208577770, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.11529541, + "step": 9680, + "time_per_iteration": 2.464019298553467 + }, + { + "auxiliary_loss_clip": 0.01121823, + "auxiliary_loss_mlp": 0.01027057, + "balance_loss_clip": 1.04441524, + "balance_loss_mlp": 1.01495767, + "epoch": 0.5820532090786111, + "flos": 19458520151040.0, + "grad_norm": 1.8296011152502578, + "language_loss": 0.83424902, + "learning_rate": 1.5689640234337933e-06, + "loss": 0.85573781, + "num_input_tokens_seen": 208595110, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.12097168, + "step": 9681, + "time_per_iteration": 2.4248101711273193 + }, + { + "auxiliary_loss_clip": 0.01119618, + "auxiliary_loss_mlp": 0.01031206, + "balance_loss_clip": 1.04284203, + "balance_loss_mlp": 1.01812255, + "epoch": 0.5821133323312792, + "flos": 17712615686400.0, + "grad_norm": 1.633129152175467, + "language_loss": 0.75798571, + "learning_rate": 1.5685837223272109e-06, + "loss": 0.77949393, + "num_input_tokens_seen": 208612080, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.13098145, + "step": 9682, + "time_per_iteration": 2.466475009918213 + }, + { + "auxiliary_loss_clip": 0.01127177, + "auxiliary_loss_mlp": 0.01028964, + "balance_loss_clip": 1.04857898, + "balance_loss_mlp": 1.01640558, + "epoch": 0.5821734555839471, + "flos": 24571804832640.0, + "grad_norm": 2.4644014947399304, + "language_loss": 0.75280082, + "learning_rate": 1.568203437579977e-06, + "loss": 0.77436215, + "num_input_tokens_seen": 208630235, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.12548828, + "step": 9683, + "time_per_iteration": 2.4751973152160645 + }, + { + "auxiliary_loss_clip": 0.01128409, + "auxiliary_loss_mlp": 0.01030774, + "balance_loss_clip": 1.04812932, + "balance_loss_mlp": 1.01772094, + "epoch": 0.5822335788366151, + "flos": 22382259488640.0, + "grad_norm": 2.123845154298117, + "language_loss": 0.73658276, + "learning_rate": 1.5678231692065116e-06, + "loss": 0.75817466, + "num_input_tokens_seen": 208647925, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.1305542, + "step": 9684, + "time_per_iteration": 2.483035087585449 + }, + { + "auxiliary_loss_clip": 0.01118297, + "auxiliary_loss_mlp": 0.01038542, + "balance_loss_clip": 1.04084373, + "balance_loss_mlp": 1.02539945, + "epoch": 0.582293702089283, + "flos": 26722494639360.0, + "grad_norm": 2.1165213168554757, + "language_loss": 0.78279495, + "learning_rate": 1.5674429172212348e-06, + "loss": 0.80436337, + "num_input_tokens_seen": 208666180, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.13134766, + "step": 9685, + "time_per_iteration": 2.5090749263763428 + }, + { + "auxiliary_loss_clip": 0.01123018, + "auxiliary_loss_mlp": 0.01035271, + "balance_loss_clip": 1.0473845, + "balance_loss_mlp": 1.02270043, + "epoch": 0.582353825341951, + "flos": 17348661129600.0, + "grad_norm": 1.692563883067565, + "language_loss": 0.75086582, + "learning_rate": 1.5670626816385667e-06, + "loss": 0.77244866, + "num_input_tokens_seen": 208684240, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12567139, + "step": 9686, + "time_per_iteration": 2.455142021179199 + }, + { + "auxiliary_loss_clip": 0.01062375, + "auxiliary_loss_mlp": 0.010051, + "balance_loss_clip": 1.03514886, + "balance_loss_mlp": 1.00368416, + "epoch": 0.5824139485946189, + "flos": 55473261534720.0, + "grad_norm": 0.811728366431584, + "language_loss": 0.57320452, + "learning_rate": 1.5666824624729244e-06, + "loss": 0.59387928, + "num_input_tokens_seen": 208736090, + "router_z_loss_clip": 0.27246094, + "router_z_loss_mlp": 0.01417542, + "step": 9687, + "time_per_iteration": 2.898810625076294 + }, + { + "auxiliary_loss_clip": 0.01118879, + "auxiliary_loss_mlp": 0.01028834, + "balance_loss_clip": 1.04197955, + "balance_loss_mlp": 1.01523232, + "epoch": 0.582474071847287, + "flos": 20303031790080.0, + "grad_norm": 2.4100615110720134, + "language_loss": 0.69631481, + "learning_rate": 1.566302259738727e-06, + "loss": 0.71779197, + "num_input_tokens_seen": 208754600, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.13598633, + "step": 9688, + "time_per_iteration": 2.500392198562622 + }, + { + "auxiliary_loss_clip": 0.0111993, + "auxiliary_loss_mlp": 0.01031746, + "balance_loss_clip": 1.04465449, + "balance_loss_mlp": 1.01981306, + "epoch": 0.5825341950999549, + "flos": 23878010661120.0, + "grad_norm": 4.708137217385512, + "language_loss": 0.6569652, + "learning_rate": 1.5659220734503918e-06, + "loss": 0.67848194, + "num_input_tokens_seen": 208773140, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.11932373, + "step": 9689, + "time_per_iteration": 2.487229108810425 + }, + { + "auxiliary_loss_clip": 0.0111759, + "auxiliary_loss_mlp": 0.01034847, + "balance_loss_clip": 1.04372084, + "balance_loss_mlp": 1.01989841, + "epoch": 0.5825943183526229, + "flos": 23113041690240.0, + "grad_norm": 1.9501000615943131, + "language_loss": 0.73352146, + "learning_rate": 1.5655419036223341e-06, + "loss": 0.75504589, + "num_input_tokens_seen": 208793410, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.14959717, + "step": 9690, + "time_per_iteration": 2.474400281906128 + }, + { + "auxiliary_loss_clip": 0.01126887, + "auxiliary_loss_mlp": 0.01033951, + "balance_loss_clip": 1.04818308, + "balance_loss_mlp": 1.01985455, + "epoch": 0.5826544416052909, + "flos": 22857429530880.0, + "grad_norm": 1.5898755127047308, + "language_loss": 0.75743139, + "learning_rate": 1.5651617502689717e-06, + "loss": 0.77903974, + "num_input_tokens_seen": 208811920, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.14093018, + "step": 9691, + "time_per_iteration": 3.8903629779815674 + }, + { + "auxiliary_loss_clip": 0.01122107, + "auxiliary_loss_mlp": 0.01031663, + "balance_loss_clip": 1.04567242, + "balance_loss_mlp": 1.01878273, + "epoch": 0.5827145648579588, + "flos": 31501845555840.0, + "grad_norm": 1.74123177628788, + "language_loss": 0.80686438, + "learning_rate": 1.5647816134047184e-06, + "loss": 0.8284021, + "num_input_tokens_seen": 208834720, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.12878418, + "step": 9692, + "time_per_iteration": 2.5341694355010986 + }, + { + "auxiliary_loss_clip": 0.01054731, + "auxiliary_loss_mlp": 0.00999624, + "balance_loss_clip": 1.02853918, + "balance_loss_mlp": 0.99817061, + "epoch": 0.5827746881106268, + "flos": 69811817074560.0, + "grad_norm": 0.757155273233321, + "language_loss": 0.56938207, + "learning_rate": 1.5644014930439907e-06, + "loss": 0.58992565, + "num_input_tokens_seen": 208898415, + "router_z_loss_clip": 0.26220703, + "router_z_loss_mlp": 0.01452637, + "step": 9693, + "time_per_iteration": 3.058920383453369 + }, + { + "auxiliary_loss_clip": 0.01115644, + "auxiliary_loss_mlp": 0.01038415, + "balance_loss_clip": 1.0402317, + "balance_loss_mlp": 1.02573085, + "epoch": 0.5828348113632947, + "flos": 23112395245440.0, + "grad_norm": 1.6844259651658369, + "language_loss": 0.7979399, + "learning_rate": 1.5640213892012025e-06, + "loss": 0.81948048, + "num_input_tokens_seen": 208919045, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.12677002, + "step": 9694, + "time_per_iteration": 2.5719351768493652 + }, + { + "auxiliary_loss_clip": 0.01111968, + "auxiliary_loss_mlp": 0.01032112, + "balance_loss_clip": 1.04112816, + "balance_loss_mlp": 1.02082253, + "epoch": 0.5828949346159628, + "flos": 21873082245120.0, + "grad_norm": 1.337773995978951, + "language_loss": 0.76235259, + "learning_rate": 1.5636413018907656e-06, + "loss": 0.78379339, + "num_input_tokens_seen": 208939375, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11291504, + "step": 9695, + "time_per_iteration": 2.6346490383148193 + }, + { + "auxiliary_loss_clip": 0.01054518, + "auxiliary_loss_mlp": 0.01001033, + "balance_loss_clip": 1.02708709, + "balance_loss_mlp": 0.99967527, + "epoch": 0.5829550578686307, + "flos": 65962553950080.0, + "grad_norm": 0.7646340305068824, + "language_loss": 0.54954839, + "learning_rate": 1.563261231127095e-06, + "loss": 0.57010388, + "num_input_tokens_seen": 209004760, + "router_z_loss_clip": 0.27392578, + "router_z_loss_mlp": 0.01356506, + "step": 9696, + "time_per_iteration": 3.2118470668792725 + }, + { + "auxiliary_loss_clip": 0.0111704, + "auxiliary_loss_mlp": 0.01031661, + "balance_loss_clip": 1.04261994, + "balance_loss_mlp": 1.01906085, + "epoch": 0.5830151811212987, + "flos": 16289799079680.0, + "grad_norm": 1.9795800570676327, + "language_loss": 0.76300502, + "learning_rate": 1.5628811769246021e-06, + "loss": 0.78449202, + "num_input_tokens_seen": 209022930, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12609863, + "step": 9697, + "time_per_iteration": 2.4666950702667236 + }, + { + "auxiliary_loss_clip": 0.01125292, + "auxiliary_loss_mlp": 0.01034534, + "balance_loss_clip": 1.0474689, + "balance_loss_mlp": 1.02165914, + "epoch": 0.5830753043739666, + "flos": 24168851084160.0, + "grad_norm": 1.5908876434389951, + "language_loss": 0.77633035, + "learning_rate": 1.5625011392976991e-06, + "loss": 0.79792863, + "num_input_tokens_seen": 209043740, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.12860107, + "step": 9698, + "time_per_iteration": 2.530775547027588 + }, + { + "auxiliary_loss_clip": 0.01118953, + "auxiliary_loss_mlp": 0.01034542, + "balance_loss_clip": 1.04238212, + "balance_loss_mlp": 1.02122593, + "epoch": 0.5831354276266346, + "flos": 27059050097280.0, + "grad_norm": 1.6764840756364134, + "language_loss": 0.8384043, + "learning_rate": 1.5621211182607966e-06, + "loss": 0.85993922, + "num_input_tokens_seen": 209068885, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.13330078, + "step": 9699, + "time_per_iteration": 4.03092098236084 + }, + { + "auxiliary_loss_clip": 0.01116429, + "auxiliary_loss_mlp": 0.01029734, + "balance_loss_clip": 1.04027987, + "balance_loss_mlp": 1.01721728, + "epoch": 0.5831955508793025, + "flos": 23623475909760.0, + "grad_norm": 2.571193415796491, + "language_loss": 0.66332388, + "learning_rate": 1.561741113828305e-06, + "loss": 0.68478549, + "num_input_tokens_seen": 209087340, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.12518311, + "step": 9700, + "time_per_iteration": 2.543407678604126 + }, + { + "auxiliary_loss_clip": 0.01120293, + "auxiliary_loss_mlp": 0.01034573, + "balance_loss_clip": 1.04381669, + "balance_loss_mlp": 1.0215255, + "epoch": 0.5832556741319705, + "flos": 24973250209920.0, + "grad_norm": 1.773665142734831, + "language_loss": 0.71498293, + "learning_rate": 1.5613611260146344e-06, + "loss": 0.73653162, + "num_input_tokens_seen": 209108840, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.1305542, + "step": 9701, + "time_per_iteration": 2.537517786026001 + }, + { + "auxiliary_loss_clip": 0.01119191, + "auxiliary_loss_mlp": 0.01038152, + "balance_loss_clip": 1.04247904, + "balance_loss_mlp": 1.02561736, + "epoch": 0.5833157973846385, + "flos": 23221563655680.0, + "grad_norm": 1.8995303254390485, + "language_loss": 0.85462904, + "learning_rate": 1.5609811548341936e-06, + "loss": 0.87620246, + "num_input_tokens_seen": 209127985, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.12530518, + "step": 9702, + "time_per_iteration": 2.5440428256988525 + }, + { + "auxiliary_loss_clip": 0.01119088, + "auxiliary_loss_mlp": 0.01033187, + "balance_loss_clip": 1.04616427, + "balance_loss_mlp": 1.02137887, + "epoch": 0.5833759206373065, + "flos": 21977941023360.0, + "grad_norm": 1.551038398832604, + "language_loss": 0.78019929, + "learning_rate": 1.560601200301392e-06, + "loss": 0.80172205, + "num_input_tokens_seen": 209146885, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11816406, + "step": 9703, + "time_per_iteration": 2.429821729660034 + }, + { + "auxiliary_loss_clip": 0.01131783, + "auxiliary_loss_mlp": 0.0103191, + "balance_loss_clip": 1.05357265, + "balance_loss_mlp": 1.01848745, + "epoch": 0.5834360438899745, + "flos": 21762405463680.0, + "grad_norm": 1.685963579763396, + "language_loss": 0.71165121, + "learning_rate": 1.5602212624306366e-06, + "loss": 0.73328817, + "num_input_tokens_seen": 209166130, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.13421631, + "step": 9704, + "time_per_iteration": 2.4754467010498047 + }, + { + "auxiliary_loss_clip": 0.0111807, + "auxiliary_loss_mlp": 0.01045643, + "balance_loss_clip": 1.04269314, + "balance_loss_mlp": 1.03185606, + "epoch": 0.5834961671426424, + "flos": 15992566035840.0, + "grad_norm": 1.753553030008467, + "language_loss": 0.81644773, + "learning_rate": 1.559841341236335e-06, + "loss": 0.83808482, + "num_input_tokens_seen": 209183350, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.13787842, + "step": 9705, + "time_per_iteration": 2.4329166412353516 + }, + { + "auxiliary_loss_clip": 0.01117231, + "auxiliary_loss_mlp": 0.01033103, + "balance_loss_clip": 1.04291606, + "balance_loss_mlp": 1.01978707, + "epoch": 0.5835562903953104, + "flos": 22818322598400.0, + "grad_norm": 1.849889773346526, + "language_loss": 0.8075695, + "learning_rate": 1.5594614367328937e-06, + "loss": 0.82907283, + "num_input_tokens_seen": 209203945, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.13311768, + "step": 9706, + "time_per_iteration": 3.8858683109283447 + }, + { + "auxiliary_loss_clip": 0.01120533, + "auxiliary_loss_mlp": 0.0103453, + "balance_loss_clip": 1.04449368, + "balance_loss_mlp": 1.02105975, + "epoch": 0.5836164136479783, + "flos": 48468056624640.0, + "grad_norm": 1.8917024724117386, + "language_loss": 0.74966294, + "learning_rate": 1.5590815489347187e-06, + "loss": 0.77121353, + "num_input_tokens_seen": 209227080, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.13458252, + "step": 9707, + "time_per_iteration": 2.7163453102111816 + }, + { + "auxiliary_loss_clip": 0.01121259, + "auxiliary_loss_mlp": 0.0103082, + "balance_loss_clip": 1.0475316, + "balance_loss_mlp": 1.01917887, + "epoch": 0.5836765369006464, + "flos": 26905998245760.0, + "grad_norm": 1.8524837971330745, + "language_loss": 0.81838548, + "learning_rate": 1.5587016778562163e-06, + "loss": 0.83990628, + "num_input_tokens_seen": 209248170, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11639404, + "step": 9708, + "time_per_iteration": 2.4890060424804688 + }, + { + "auxiliary_loss_clip": 0.01127712, + "auxiliary_loss_mlp": 0.01029713, + "balance_loss_clip": 1.05207539, + "balance_loss_mlp": 1.01701081, + "epoch": 0.5837366601533143, + "flos": 20084048524800.0, + "grad_norm": 1.5198567540023569, + "language_loss": 0.78321624, + "learning_rate": 1.5583218235117896e-06, + "loss": 0.8047905, + "num_input_tokens_seen": 209267730, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12695312, + "step": 9709, + "time_per_iteration": 2.4718117713928223 + }, + { + "auxiliary_loss_clip": 0.01065286, + "auxiliary_loss_mlp": 0.01015816, + "balance_loss_clip": 1.03856754, + "balance_loss_mlp": 1.01444411, + "epoch": 0.5837967834059823, + "flos": 65363885971200.0, + "grad_norm": 0.7662456038832433, + "language_loss": 0.56540382, + "learning_rate": 1.557941985915844e-06, + "loss": 0.58621484, + "num_input_tokens_seen": 209332510, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.01373291, + "step": 9710, + "time_per_iteration": 3.0938968658447266 + }, + { + "auxiliary_loss_clip": 0.01121294, + "auxiliary_loss_mlp": 0.01035526, + "balance_loss_clip": 1.04656887, + "balance_loss_mlp": 1.02343774, + "epoch": 0.5838569066586502, + "flos": 25338641310720.0, + "grad_norm": 1.4647813043549534, + "language_loss": 0.65780711, + "learning_rate": 1.5575621650827833e-06, + "loss": 0.67937535, + "num_input_tokens_seen": 209353355, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.12097168, + "step": 9711, + "time_per_iteration": 3.904360294342041 + }, + { + "auxiliary_loss_clip": 0.01123794, + "auxiliary_loss_mlp": 0.01034389, + "balance_loss_clip": 1.04280233, + "balance_loss_mlp": 1.02017343, + "epoch": 0.5839170299113182, + "flos": 22229243550720.0, + "grad_norm": 1.9012262093647405, + "language_loss": 0.78635973, + "learning_rate": 1.5571823610270085e-06, + "loss": 0.80794156, + "num_input_tokens_seen": 209370960, + "router_z_loss_clip": 0.80957031, + "router_z_loss_mlp": 0.14202881, + "step": 9712, + "time_per_iteration": 2.501713752746582 + }, + { + "auxiliary_loss_clip": 0.01127105, + "auxiliary_loss_mlp": 0.01028008, + "balance_loss_clip": 1.05137801, + "balance_loss_mlp": 1.01501465, + "epoch": 0.5839771531639861, + "flos": 22200012858240.0, + "grad_norm": 1.6300946069152762, + "language_loss": 0.73276174, + "learning_rate": 1.5568025737629234e-06, + "loss": 0.75431287, + "num_input_tokens_seen": 209390955, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.12976074, + "step": 9713, + "time_per_iteration": 2.5692882537841797 + }, + { + "auxiliary_loss_clip": 0.01126893, + "auxiliary_loss_mlp": 0.01036435, + "balance_loss_clip": 1.04535043, + "balance_loss_mlp": 1.02156997, + "epoch": 0.5840372764166541, + "flos": 22419355259520.0, + "grad_norm": 1.7985200462793463, + "language_loss": 0.69353354, + "learning_rate": 1.5564228033049292e-06, + "loss": 0.71516687, + "num_input_tokens_seen": 209410260, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.14868164, + "step": 9714, + "time_per_iteration": 2.4922983646392822 + }, + { + "auxiliary_loss_clip": 0.01129323, + "auxiliary_loss_mlp": 0.01040598, + "balance_loss_clip": 1.04752517, + "balance_loss_mlp": 1.02695477, + "epoch": 0.5840973996693221, + "flos": 19828256797440.0, + "grad_norm": 1.7675221886691168, + "language_loss": 0.80614281, + "learning_rate": 1.5560430496674268e-06, + "loss": 0.82784206, + "num_input_tokens_seen": 209429920, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.1362915, + "step": 9715, + "time_per_iteration": 2.4610989093780518 + }, + { + "auxiliary_loss_clip": 0.01119838, + "auxiliary_loss_mlp": 0.01031065, + "balance_loss_clip": 1.04421353, + "balance_loss_mlp": 1.01830935, + "epoch": 0.5841575229219901, + "flos": 21142982401920.0, + "grad_norm": 2.7739991796153762, + "language_loss": 0.73170328, + "learning_rate": 1.5556633128648167e-06, + "loss": 0.75321233, + "num_input_tokens_seen": 209449470, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12762451, + "step": 9716, + "time_per_iteration": 2.5309743881225586 + }, + { + "auxiliary_loss_clip": 0.01117655, + "auxiliary_loss_mlp": 0.01031279, + "balance_loss_clip": 1.04443765, + "balance_loss_mlp": 1.01963198, + "epoch": 0.5842176461746581, + "flos": 24640322025600.0, + "grad_norm": 1.84491294727122, + "language_loss": 0.75191164, + "learning_rate": 1.5552835929114976e-06, + "loss": 0.77340096, + "num_input_tokens_seen": 209467695, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11639404, + "step": 9717, + "time_per_iteration": 2.4774279594421387 + }, + { + "auxiliary_loss_clip": 0.01123417, + "auxiliary_loss_mlp": 0.01034562, + "balance_loss_clip": 1.04870987, + "balance_loss_mlp": 1.02175927, + "epoch": 0.584277769427326, + "flos": 19131158574720.0, + "grad_norm": 6.020971616854301, + "language_loss": 0.79730284, + "learning_rate": 1.5549038898218697e-06, + "loss": 0.8188827, + "num_input_tokens_seen": 209484250, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.12805176, + "step": 9718, + "time_per_iteration": 2.470301866531372 + }, + { + "auxiliary_loss_clip": 0.01114199, + "auxiliary_loss_mlp": 0.01029371, + "balance_loss_clip": 1.03982544, + "balance_loss_mlp": 1.01607311, + "epoch": 0.584337892679994, + "flos": 22675111073280.0, + "grad_norm": 1.7534573625297567, + "language_loss": 0.67375737, + "learning_rate": 1.5545242036103306e-06, + "loss": 0.69519305, + "num_input_tokens_seen": 209502830, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.13299561, + "step": 9719, + "time_per_iteration": 2.48513126373291 + }, + { + "auxiliary_loss_clip": 0.01121953, + "auxiliary_loss_mlp": 0.01029675, + "balance_loss_clip": 1.04433537, + "balance_loss_mlp": 1.01736045, + "epoch": 0.5843980159326619, + "flos": 31284083352960.0, + "grad_norm": 2.590286389351638, + "language_loss": 0.75829947, + "learning_rate": 1.5541445342912786e-06, + "loss": 0.77981573, + "num_input_tokens_seen": 209525995, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.12322998, + "step": 9720, + "time_per_iteration": 2.5852386951446533 + }, + { + "auxiliary_loss_clip": 0.01113234, + "auxiliary_loss_mlp": 0.0103597, + "balance_loss_clip": 1.03710163, + "balance_loss_mlp": 1.02348244, + "epoch": 0.58445813918533, + "flos": 22748117466240.0, + "grad_norm": 1.6901595333764534, + "language_loss": 0.82996106, + "learning_rate": 1.5537648818791105e-06, + "loss": 0.85145313, + "num_input_tokens_seen": 209545895, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.125, + "step": 9721, + "time_per_iteration": 2.493800640106201 + }, + { + "auxiliary_loss_clip": 0.01066342, + "auxiliary_loss_mlp": 0.01006727, + "balance_loss_clip": 1.04075038, + "balance_loss_mlp": 1.00487185, + "epoch": 0.5845182624379979, + "flos": 60686556658560.0, + "grad_norm": 0.9342321280788166, + "language_loss": 0.71392184, + "learning_rate": 1.5533852463882226e-06, + "loss": 0.73465252, + "num_input_tokens_seen": 209602315, + "router_z_loss_clip": 0.25537109, + "router_z_loss_mlp": 0.01855469, + "step": 9722, + "time_per_iteration": 3.186224937438965 + }, + { + "auxiliary_loss_clip": 0.01114411, + "auxiliary_loss_mlp": 0.01033686, + "balance_loss_clip": 1.04114795, + "balance_loss_mlp": 1.02120447, + "epoch": 0.5845783856906659, + "flos": 16362446336640.0, + "grad_norm": 2.147249054465471, + "language_loss": 0.89086938, + "learning_rate": 1.5530056278330113e-06, + "loss": 0.91235042, + "num_input_tokens_seen": 209617615, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.12481689, + "step": 9723, + "time_per_iteration": 2.4255003929138184 + }, + { + "auxiliary_loss_clip": 0.0111325, + "auxiliary_loss_mlp": 0.01030176, + "balance_loss_clip": 1.03986573, + "balance_loss_mlp": 1.01796317, + "epoch": 0.5846385089433338, + "flos": 20083402080000.0, + "grad_norm": 1.409802170024147, + "language_loss": 0.68298608, + "learning_rate": 1.5526260262278709e-06, + "loss": 0.70442033, + "num_input_tokens_seen": 209637005, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.12219238, + "step": 9724, + "time_per_iteration": 2.525447130203247 + }, + { + "auxiliary_loss_clip": 0.01128998, + "auxiliary_loss_mlp": 0.01036301, + "balance_loss_clip": 1.0512377, + "balance_loss_mlp": 1.02316403, + "epoch": 0.5846986321960018, + "flos": 17311062568320.0, + "grad_norm": 1.9245391691780236, + "language_loss": 0.86250907, + "learning_rate": 1.552246441587197e-06, + "loss": 0.88416213, + "num_input_tokens_seen": 209653170, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.13134766, + "step": 9725, + "time_per_iteration": 2.4369657039642334 + }, + { + "auxiliary_loss_clip": 0.01118658, + "auxiliary_loss_mlp": 0.01036969, + "balance_loss_clip": 1.04151416, + "balance_loss_mlp": 1.02435052, + "epoch": 0.5847587554486697, + "flos": 17197907748480.0, + "grad_norm": 2.9275050610634183, + "language_loss": 0.82542276, + "learning_rate": 1.5518668739253821e-06, + "loss": 0.84697908, + "num_input_tokens_seen": 209671275, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.12609863, + "step": 9726, + "time_per_iteration": 2.507153034210205 + }, + { + "auxiliary_loss_clip": 0.01119548, + "auxiliary_loss_mlp": 0.0103793, + "balance_loss_clip": 1.04307842, + "balance_loss_mlp": 1.02367806, + "epoch": 0.5848188787013378, + "flos": 24529106540160.0, + "grad_norm": 2.703305158151573, + "language_loss": 0.66583002, + "learning_rate": 1.5514873232568206e-06, + "loss": 0.68740481, + "num_input_tokens_seen": 209690380, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.14257812, + "step": 9727, + "time_per_iteration": 2.4629385471343994 + }, + { + "auxiliary_loss_clip": 0.01122252, + "auxiliary_loss_mlp": 0.01041393, + "balance_loss_clip": 1.04482245, + "balance_loss_mlp": 1.02794576, + "epoch": 0.5848790019540057, + "flos": 20628382204800.0, + "grad_norm": 2.2244804439955304, + "language_loss": 0.81792182, + "learning_rate": 1.5511077895959055e-06, + "loss": 0.83955824, + "num_input_tokens_seen": 209708845, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.13452148, + "step": 9728, + "time_per_iteration": 2.465203046798706 + }, + { + "auxiliary_loss_clip": 0.01121968, + "auxiliary_loss_mlp": 0.01032282, + "balance_loss_clip": 1.04734969, + "balance_loss_mlp": 1.02094531, + "epoch": 0.5849391252066737, + "flos": 22418852469120.0, + "grad_norm": 1.8067980255430638, + "language_loss": 0.77869141, + "learning_rate": 1.550728272957027e-06, + "loss": 0.80023396, + "num_input_tokens_seen": 209729000, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11346436, + "step": 9729, + "time_per_iteration": 2.4307453632354736 + }, + { + "auxiliary_loss_clip": 0.01122751, + "auxiliary_loss_mlp": 0.01031367, + "balance_loss_clip": 1.04479742, + "balance_loss_mlp": 1.01747906, + "epoch": 0.5849992484593417, + "flos": 25410929431680.0, + "grad_norm": 1.9218589430918416, + "language_loss": 0.70257986, + "learning_rate": 1.5503487733545782e-06, + "loss": 0.72412109, + "num_input_tokens_seen": 209747435, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.13885498, + "step": 9730, + "time_per_iteration": 2.5386104583740234 + }, + { + "auxiliary_loss_clip": 0.01124877, + "auxiliary_loss_mlp": 0.01033799, + "balance_loss_clip": 1.04761577, + "balance_loss_mlp": 1.02089453, + "epoch": 0.5850593717120096, + "flos": 21065163586560.0, + "grad_norm": 1.817687797753251, + "language_loss": 0.79176956, + "learning_rate": 1.5499692908029482e-06, + "loss": 0.81335634, + "num_input_tokens_seen": 209764910, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.12908936, + "step": 9731, + "time_per_iteration": 2.445300817489624 + }, + { + "auxiliary_loss_clip": 0.0112407, + "auxiliary_loss_mlp": 0.01033219, + "balance_loss_clip": 1.04737234, + "balance_loss_mlp": 1.02017701, + "epoch": 0.5851194949646776, + "flos": 25301545539840.0, + "grad_norm": 2.517363012005306, + "language_loss": 0.69974357, + "learning_rate": 1.549589825316528e-06, + "loss": 0.72131646, + "num_input_tokens_seen": 209786115, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.13049316, + "step": 9732, + "time_per_iteration": 2.5216450691223145 + }, + { + "auxiliary_loss_clip": 0.01122243, + "auxiliary_loss_mlp": 0.01035284, + "balance_loss_clip": 1.0442996, + "balance_loss_mlp": 1.01951277, + "epoch": 0.5851796182173455, + "flos": 23587242065280.0, + "grad_norm": 2.0776318312295095, + "language_loss": 0.526631, + "learning_rate": 1.5492103769097075e-06, + "loss": 0.54820627, + "num_input_tokens_seen": 209806095, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.15777588, + "step": 9733, + "time_per_iteration": 2.483859062194824 + }, + { + "auxiliary_loss_clip": 0.01127996, + "auxiliary_loss_mlp": 0.01036038, + "balance_loss_clip": 1.05140734, + "balance_loss_mlp": 1.02126825, + "epoch": 0.5852397414700136, + "flos": 24822712310400.0, + "grad_norm": 2.1198301423476145, + "language_loss": 0.87379146, + "learning_rate": 1.5488309455968739e-06, + "loss": 0.89543188, + "num_input_tokens_seen": 209823650, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.14794922, + "step": 9734, + "time_per_iteration": 3.8953120708465576 + }, + { + "auxiliary_loss_clip": 0.01123786, + "auxiliary_loss_mlp": 0.01037091, + "balance_loss_clip": 1.05061388, + "balance_loss_mlp": 1.02540278, + "epoch": 0.5852998647226815, + "flos": 19937784343680.0, + "grad_norm": 5.9662856713879995, + "language_loss": 0.72297108, + "learning_rate": 1.5484515313924163e-06, + "loss": 0.74457985, + "num_input_tokens_seen": 209843220, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11694336, + "step": 9735, + "time_per_iteration": 2.5113604068756104 + }, + { + "auxiliary_loss_clip": 0.01130786, + "auxiliary_loss_mlp": 0.01036522, + "balance_loss_clip": 1.05082417, + "balance_loss_mlp": 1.02237809, + "epoch": 0.5853599879753495, + "flos": 16720367408640.0, + "grad_norm": 2.8561001365185947, + "language_loss": 0.74279302, + "learning_rate": 1.5480721343107217e-06, + "loss": 0.76446611, + "num_input_tokens_seen": 209854880, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.14135742, + "step": 9736, + "time_per_iteration": 2.4914441108703613 + }, + { + "auxiliary_loss_clip": 0.01122656, + "auxiliary_loss_mlp": 0.01028179, + "balance_loss_clip": 1.04757476, + "balance_loss_mlp": 1.01587677, + "epoch": 0.5854201112280174, + "flos": 44456583680640.0, + "grad_norm": 1.8220931640289082, + "language_loss": 0.7066015, + "learning_rate": 1.5476927543661772e-06, + "loss": 0.72810984, + "num_input_tokens_seen": 209877870, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.12316895, + "step": 9737, + "time_per_iteration": 2.688478708267212 + }, + { + "auxiliary_loss_clip": 0.01124372, + "auxiliary_loss_mlp": 0.01032308, + "balance_loss_clip": 1.04996622, + "balance_loss_mlp": 1.02053571, + "epoch": 0.5854802344806854, + "flos": 20339193807360.0, + "grad_norm": 1.7842355586915994, + "language_loss": 0.82973206, + "learning_rate": 1.547313391573169e-06, + "loss": 0.85129887, + "num_input_tokens_seen": 209896690, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11767578, + "step": 9738, + "time_per_iteration": 2.479539155960083 + }, + { + "auxiliary_loss_clip": 0.01130429, + "auxiliary_loss_mlp": 0.01035648, + "balance_loss_clip": 1.05207801, + "balance_loss_mlp": 1.02201676, + "epoch": 0.5855403577333533, + "flos": 20921054221440.0, + "grad_norm": 1.7318877662147147, + "language_loss": 0.68799967, + "learning_rate": 1.546934045946082e-06, + "loss": 0.70966047, + "num_input_tokens_seen": 209914640, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.13616943, + "step": 9739, + "time_per_iteration": 2.4316580295562744 + }, + { + "auxiliary_loss_clip": 0.0112102, + "auxiliary_loss_mlp": 0.01025798, + "balance_loss_clip": 1.0440824, + "balance_loss_mlp": 1.01261353, + "epoch": 0.5856004809860214, + "flos": 20448649526400.0, + "grad_norm": 2.6786443160557694, + "language_loss": 0.58585858, + "learning_rate": 1.5465547174993017e-06, + "loss": 0.60732681, + "num_input_tokens_seen": 209933375, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.13195801, + "step": 9740, + "time_per_iteration": 2.513929843902588 + }, + { + "auxiliary_loss_clip": 0.01115541, + "auxiliary_loss_mlp": 0.01027893, + "balance_loss_clip": 1.03999186, + "balance_loss_mlp": 1.01537621, + "epoch": 0.5856606042386893, + "flos": 19640766781440.0, + "grad_norm": 2.0915059453666647, + "language_loss": 0.75270456, + "learning_rate": 1.5461754062472113e-06, + "loss": 0.77413893, + "num_input_tokens_seen": 209952055, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12512207, + "step": 9741, + "time_per_iteration": 2.4405853748321533 + }, + { + "auxiliary_loss_clip": 0.01136053, + "auxiliary_loss_mlp": 0.01030248, + "balance_loss_clip": 1.05714512, + "balance_loss_mlp": 1.01762366, + "epoch": 0.5857207274913573, + "flos": 21686166846720.0, + "grad_norm": 1.532263068544888, + "language_loss": 0.75683272, + "learning_rate": 1.5457961122041959e-06, + "loss": 0.77849579, + "num_input_tokens_seen": 209971190, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.12628174, + "step": 9742, + "time_per_iteration": 3.8575751781463623 + }, + { + "auxiliary_loss_clip": 0.01118808, + "auxiliary_loss_mlp": 0.01030892, + "balance_loss_clip": 1.04415452, + "balance_loss_mlp": 1.01874447, + "epoch": 0.5857808507440253, + "flos": 23182708118400.0, + "grad_norm": 1.578209295070444, + "language_loss": 0.75114202, + "learning_rate": 1.5454168353846369e-06, + "loss": 0.77263904, + "num_input_tokens_seen": 209990695, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.12145996, + "step": 9743, + "time_per_iteration": 2.587789297103882 + }, + { + "auxiliary_loss_clip": 0.01117639, + "auxiliary_loss_mlp": 0.01028661, + "balance_loss_clip": 1.04484892, + "balance_loss_mlp": 1.01694894, + "epoch": 0.5858409739966932, + "flos": 27235299156480.0, + "grad_norm": 1.6978249319992196, + "language_loss": 0.81139374, + "learning_rate": 1.5450375758029172e-06, + "loss": 0.83285677, + "num_input_tokens_seen": 210010210, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11706543, + "step": 9744, + "time_per_iteration": 2.4735336303710938 + }, + { + "auxiliary_loss_clip": 0.01122419, + "auxiliary_loss_mlp": 0.01031411, + "balance_loss_clip": 1.04371297, + "balance_loss_mlp": 1.01881683, + "epoch": 0.5859010972493612, + "flos": 27855512317440.0, + "grad_norm": 1.6792291478375263, + "language_loss": 0.71703827, + "learning_rate": 1.5446583334734183e-06, + "loss": 0.73857659, + "num_input_tokens_seen": 210030030, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.12591553, + "step": 9745, + "time_per_iteration": 2.5669353008270264 + }, + { + "auxiliary_loss_clip": 0.01051752, + "auxiliary_loss_mlp": 0.01002421, + "balance_loss_clip": 1.02573299, + "balance_loss_mlp": 1.00104403, + "epoch": 0.5859612205020291, + "flos": 70007064428160.0, + "grad_norm": 0.7202201054084216, + "language_loss": 0.53310144, + "learning_rate": 1.5442791084105204e-06, + "loss": 0.55364317, + "num_input_tokens_seen": 210094840, + "router_z_loss_clip": 0.26025391, + "router_z_loss_mlp": 0.01376343, + "step": 9746, + "time_per_iteration": 3.1559536457061768 + }, + { + "auxiliary_loss_clip": 0.01131575, + "auxiliary_loss_mlp": 0.01033556, + "balance_loss_clip": 1.05077982, + "balance_loss_mlp": 1.02006721, + "epoch": 0.5860213437546972, + "flos": 24056019486720.0, + "grad_norm": 2.1567995141147978, + "language_loss": 0.73354834, + "learning_rate": 1.5438999006286054e-06, + "loss": 0.75519967, + "num_input_tokens_seen": 210114660, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.13500977, + "step": 9747, + "time_per_iteration": 2.5136754512786865 + }, + { + "auxiliary_loss_clip": 0.01119011, + "auxiliary_loss_mlp": 0.01032596, + "balance_loss_clip": 1.04265475, + "balance_loss_mlp": 1.01945329, + "epoch": 0.5860814670073651, + "flos": 18947583141120.0, + "grad_norm": 3.069273697040366, + "language_loss": 0.80909568, + "learning_rate": 1.543520710142051e-06, + "loss": 0.83061171, + "num_input_tokens_seen": 210132770, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.13134766, + "step": 9748, + "time_per_iteration": 2.5057015419006348 + }, + { + "auxiliary_loss_clip": 0.01123604, + "auxiliary_loss_mlp": 0.01029964, + "balance_loss_clip": 1.04731989, + "balance_loss_mlp": 1.01717901, + "epoch": 0.5861415902600331, + "flos": 22561848512640.0, + "grad_norm": 2.1553404348203005, + "language_loss": 0.71892655, + "learning_rate": 1.5431415369652375e-06, + "loss": 0.74046218, + "num_input_tokens_seen": 210151895, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.12799072, + "step": 9749, + "time_per_iteration": 3.9632933139801025 + }, + { + "auxiliary_loss_clip": 0.01123132, + "auxiliary_loss_mlp": 0.01031193, + "balance_loss_clip": 1.05043674, + "balance_loss_mlp": 1.01706696, + "epoch": 0.586201713512701, + "flos": 14392027912320.0, + "grad_norm": 2.4651501104961793, + "language_loss": 0.75027555, + "learning_rate": 1.5427623811125428e-06, + "loss": 0.77181876, + "num_input_tokens_seen": 210168040, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.14135742, + "step": 9750, + "time_per_iteration": 2.521857976913452 + }, + { + "auxiliary_loss_clip": 0.01118297, + "auxiliary_loss_mlp": 0.01035273, + "balance_loss_clip": 1.04371357, + "balance_loss_mlp": 1.02045524, + "epoch": 0.586261836765369, + "flos": 19498560837120.0, + "grad_norm": 1.6330973339619113, + "language_loss": 0.70975935, + "learning_rate": 1.542383242598344e-06, + "loss": 0.73129505, + "num_input_tokens_seen": 210187720, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.14819336, + "step": 9751, + "time_per_iteration": 2.4397199153900146 + }, + { + "auxiliary_loss_clip": 0.01123776, + "auxiliary_loss_mlp": 0.01043774, + "balance_loss_clip": 1.04479384, + "balance_loss_mlp": 1.02683449, + "epoch": 0.5863219600180369, + "flos": 20701819560960.0, + "grad_norm": 1.8689272141867386, + "language_loss": 0.74336922, + "learning_rate": 1.5420041214370184e-06, + "loss": 0.76504475, + "num_input_tokens_seen": 210206080, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.16931152, + "step": 9752, + "time_per_iteration": 2.4694840908050537 + }, + { + "auxiliary_loss_clip": 0.01125318, + "auxiliary_loss_mlp": 0.0103623, + "balance_loss_clip": 1.05113494, + "balance_loss_mlp": 1.02288437, + "epoch": 0.586382083270705, + "flos": 19792130693760.0, + "grad_norm": 6.880800819944892, + "language_loss": 0.77239352, + "learning_rate": 1.541625017642943e-06, + "loss": 0.79400897, + "num_input_tokens_seen": 210225660, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.13336182, + "step": 9753, + "time_per_iteration": 2.441624402999878 + }, + { + "auxiliary_loss_clip": 0.01116615, + "auxiliary_loss_mlp": 0.0102856, + "balance_loss_clip": 1.04385257, + "balance_loss_mlp": 1.01707387, + "epoch": 0.5864422065233729, + "flos": 16500558130560.0, + "grad_norm": 1.8904050321351915, + "language_loss": 0.7103318, + "learning_rate": 1.5412459312304927e-06, + "loss": 0.73178351, + "num_input_tokens_seen": 210242725, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11486816, + "step": 9754, + "time_per_iteration": 3.8792121410369873 + }, + { + "auxiliary_loss_clip": 0.01122161, + "auxiliary_loss_mlp": 0.01031997, + "balance_loss_clip": 1.04441404, + "balance_loss_mlp": 1.01835978, + "epoch": 0.5865023297760409, + "flos": 20413277608320.0, + "grad_norm": 2.825676952492924, + "language_loss": 0.71870708, + "learning_rate": 1.540866862214043e-06, + "loss": 0.74024874, + "num_input_tokens_seen": 210263225, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.13623047, + "step": 9755, + "time_per_iteration": 2.463338851928711 + }, + { + "auxiliary_loss_clip": 0.01052191, + "auxiliary_loss_mlp": 0.01008312, + "balance_loss_clip": 1.02564597, + "balance_loss_mlp": 1.00703406, + "epoch": 0.5865624530287089, + "flos": 63350769254400.0, + "grad_norm": 0.7345401558349844, + "language_loss": 0.56880569, + "learning_rate": 1.540487810607967e-06, + "loss": 0.58941072, + "num_input_tokens_seen": 210322310, + "router_z_loss_clip": 0.26513672, + "router_z_loss_mlp": 0.01277161, + "step": 9756, + "time_per_iteration": 3.066589832305908 + }, + { + "auxiliary_loss_clip": 0.0112346, + "auxiliary_loss_mlp": 0.01033363, + "balance_loss_clip": 1.05006456, + "balance_loss_mlp": 1.02179337, + "epoch": 0.5866225762813768, + "flos": 27016279977600.0, + "grad_norm": 1.765422464913055, + "language_loss": 0.76227027, + "learning_rate": 1.5401087764266396e-06, + "loss": 0.78383851, + "num_input_tokens_seen": 210340845, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11572266, + "step": 9757, + "time_per_iteration": 2.5250484943389893 + }, + { + "auxiliary_loss_clip": 0.01055281, + "auxiliary_loss_mlp": 0.01015868, + "balance_loss_clip": 1.02874947, + "balance_loss_mlp": 1.01445866, + "epoch": 0.5866826995340448, + "flos": 72987038507520.0, + "grad_norm": 0.8526105947329864, + "language_loss": 0.60411489, + "learning_rate": 1.5397297596844337e-06, + "loss": 0.62482637, + "num_input_tokens_seen": 210397815, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.01409912, + "step": 9758, + "time_per_iteration": 3.0808398723602295 + }, + { + "auxiliary_loss_clip": 0.01132732, + "auxiliary_loss_mlp": 0.01033838, + "balance_loss_clip": 1.04962206, + "balance_loss_mlp": 1.02047491, + "epoch": 0.5867428227867127, + "flos": 21285727050240.0, + "grad_norm": 2.23446789986047, + "language_loss": 0.72141826, + "learning_rate": 1.5393507603957212e-06, + "loss": 0.74308395, + "num_input_tokens_seen": 210413900, + "router_z_loss_clip": 0.83105469, + "router_z_loss_mlp": 0.13366699, + "step": 9759, + "time_per_iteration": 2.4851105213165283 + }, + { + "auxiliary_loss_clip": 0.01124899, + "auxiliary_loss_mlp": 0.01031489, + "balance_loss_clip": 1.04915476, + "balance_loss_mlp": 1.02003872, + "epoch": 0.5868029460393808, + "flos": 33468852188160.0, + "grad_norm": 1.5633223117747694, + "language_loss": 0.73256755, + "learning_rate": 1.5389717785748742e-06, + "loss": 0.75413144, + "num_input_tokens_seen": 210434110, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.11444092, + "step": 9760, + "time_per_iteration": 2.5451841354370117 + }, + { + "auxiliary_loss_clip": 0.01118158, + "auxiliary_loss_mlp": 0.01029222, + "balance_loss_clip": 1.04308474, + "balance_loss_mlp": 1.01601315, + "epoch": 0.5868630692920487, + "flos": 17889475276800.0, + "grad_norm": 2.0895638260500395, + "language_loss": 0.73152918, + "learning_rate": 1.5385928142362637e-06, + "loss": 0.75300294, + "num_input_tokens_seen": 210451685, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.13214111, + "step": 9761, + "time_per_iteration": 2.470534324645996 + }, + { + "auxiliary_loss_clip": 0.01121988, + "auxiliary_loss_mlp": 0.01031669, + "balance_loss_clip": 1.04369175, + "balance_loss_mlp": 1.01763844, + "epoch": 0.5869231925447167, + "flos": 21035035054080.0, + "grad_norm": 2.992385145689423, + "language_loss": 0.75150752, + "learning_rate": 1.5382138673942597e-06, + "loss": 0.77304411, + "num_input_tokens_seen": 210470825, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.14038086, + "step": 9762, + "time_per_iteration": 2.464542865753174 + }, + { + "auxiliary_loss_clip": 0.01120481, + "auxiliary_loss_mlp": 0.01033184, + "balance_loss_clip": 1.04485095, + "balance_loss_mlp": 1.01991057, + "epoch": 0.5869833157973846, + "flos": 74738219293440.0, + "grad_norm": 1.3736957746362606, + "language_loss": 0.72581124, + "learning_rate": 1.5378349380632317e-06, + "loss": 0.74734783, + "num_input_tokens_seen": 210500075, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.1328125, + "step": 9763, + "time_per_iteration": 2.8907058238983154 + }, + { + "auxiliary_loss_clip": 0.01124106, + "auxiliary_loss_mlp": 0.01030186, + "balance_loss_clip": 1.05006862, + "balance_loss_mlp": 1.01852131, + "epoch": 0.5870434390500526, + "flos": 17638998762240.0, + "grad_norm": 1.6172306973918638, + "language_loss": 0.7997281, + "learning_rate": 1.53745602625755e-06, + "loss": 0.821271, + "num_input_tokens_seen": 210518150, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11676025, + "step": 9764, + "time_per_iteration": 2.4112021923065186 + }, + { + "auxiliary_loss_clip": 0.01136923, + "auxiliary_loss_mlp": 0.01033289, + "balance_loss_clip": 1.05821824, + "balance_loss_mlp": 1.02079558, + "epoch": 0.5871035623027205, + "flos": 21506146859520.0, + "grad_norm": 1.6380365389458487, + "language_loss": 0.7910949, + "learning_rate": 1.5370771319915819e-06, + "loss": 0.81279701, + "num_input_tokens_seen": 210537760, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.125, + "step": 9765, + "time_per_iteration": 2.4899380207061768 + }, + { + "auxiliary_loss_clip": 0.01123582, + "auxiliary_loss_mlp": 0.010331, + "balance_loss_clip": 1.05042589, + "balance_loss_mlp": 1.01999927, + "epoch": 0.5871636855553886, + "flos": 13551861818880.0, + "grad_norm": 2.012446953088344, + "language_loss": 0.83608681, + "learning_rate": 1.5366982552796947e-06, + "loss": 0.85765362, + "num_input_tokens_seen": 210555515, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.13104248, + "step": 9766, + "time_per_iteration": 2.4194228649139404 + }, + { + "auxiliary_loss_clip": 0.0112911, + "auxiliary_loss_mlp": 0.01033136, + "balance_loss_clip": 1.04943657, + "balance_loss_mlp": 1.02042222, + "epoch": 0.5872238088080565, + "flos": 26212922346240.0, + "grad_norm": 1.6938746125673363, + "language_loss": 0.69760513, + "learning_rate": 1.536319396136257e-06, + "loss": 0.71922755, + "num_input_tokens_seen": 210575000, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.12713623, + "step": 9767, + "time_per_iteration": 2.535102605819702 + }, + { + "auxiliary_loss_clip": 0.01120272, + "auxiliary_loss_mlp": 0.01039126, + "balance_loss_clip": 1.041695, + "balance_loss_mlp": 1.02421868, + "epoch": 0.5872839320607245, + "flos": 30665198995200.0, + "grad_norm": 1.81865819977945, + "language_loss": 0.63128483, + "learning_rate": 1.5359405545756336e-06, + "loss": 0.65287876, + "num_input_tokens_seen": 210595185, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.14898682, + "step": 9768, + "time_per_iteration": 2.4968180656433105 + }, + { + "auxiliary_loss_clip": 0.01043319, + "auxiliary_loss_mlp": 0.01007923, + "balance_loss_clip": 1.01683974, + "balance_loss_mlp": 1.00655687, + "epoch": 0.5873440553133924, + "flos": 60303570871680.0, + "grad_norm": 0.7296535385089998, + "language_loss": 0.53865635, + "learning_rate": 1.5355617306121914e-06, + "loss": 0.55916882, + "num_input_tokens_seen": 210653210, + "router_z_loss_clip": 0.26464844, + "router_z_loss_mlp": 0.01365662, + "step": 9769, + "time_per_iteration": 3.09540057182312 + }, + { + "auxiliary_loss_clip": 0.01122195, + "auxiliary_loss_mlp": 0.01032786, + "balance_loss_clip": 1.04705536, + "balance_loss_mlp": 1.02003098, + "epoch": 0.5874041785660604, + "flos": 21539292134400.0, + "grad_norm": 1.4562507943842995, + "language_loss": 0.70671093, + "learning_rate": 1.5351829242602945e-06, + "loss": 0.72826076, + "num_input_tokens_seen": 210673750, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12756348, + "step": 9770, + "time_per_iteration": 2.466515064239502 + }, + { + "auxiliary_loss_clip": 0.01128219, + "auxiliary_loss_mlp": 0.01031557, + "balance_loss_clip": 1.05197859, + "balance_loss_mlp": 1.01871777, + "epoch": 0.5874643018187284, + "flos": 24388947671040.0, + "grad_norm": 1.8653273234645131, + "language_loss": 0.67631119, + "learning_rate": 1.5348041355343077e-06, + "loss": 0.697909, + "num_input_tokens_seen": 210692960, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.12841797, + "step": 9771, + "time_per_iteration": 2.5254604816436768 + }, + { + "auxiliary_loss_clip": 0.01123322, + "auxiliary_loss_mlp": 0.01033184, + "balance_loss_clip": 1.04490292, + "balance_loss_mlp": 1.01949239, + "epoch": 0.5875244250713964, + "flos": 28147717457280.0, + "grad_norm": 1.4329748448996587, + "language_loss": 0.66616154, + "learning_rate": 1.5344253644485954e-06, + "loss": 0.68772662, + "num_input_tokens_seen": 210714040, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.13690186, + "step": 9772, + "time_per_iteration": 2.5458719730377197 + }, + { + "auxiliary_loss_clip": 0.0112754, + "auxiliary_loss_mlp": 0.01041457, + "balance_loss_clip": 1.04956019, + "balance_loss_mlp": 1.0265739, + "epoch": 0.5875845483240644, + "flos": 25812410722560.0, + "grad_norm": 1.793438558943517, + "language_loss": 0.74576735, + "learning_rate": 1.534046611017519e-06, + "loss": 0.76745737, + "num_input_tokens_seen": 210733710, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.14892578, + "step": 9773, + "time_per_iteration": 2.5155014991760254 + }, + { + "auxiliary_loss_clip": 0.01117168, + "auxiliary_loss_mlp": 0.01035665, + "balance_loss_clip": 1.04153013, + "balance_loss_mlp": 1.02217627, + "epoch": 0.5876446715767323, + "flos": 26906572863360.0, + "grad_norm": 2.3149837098754147, + "language_loss": 0.53321266, + "learning_rate": 1.5336678752554421e-06, + "loss": 0.55474102, + "num_input_tokens_seen": 210753580, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.1348877, + "step": 9774, + "time_per_iteration": 2.515782594680786 + }, + { + "auxiliary_loss_clip": 0.01123167, + "auxiliary_loss_mlp": 0.01032492, + "balance_loss_clip": 1.04728842, + "balance_loss_mlp": 1.01898599, + "epoch": 0.5877047948294003, + "flos": 36684832579200.0, + "grad_norm": 2.2097980316734405, + "language_loss": 0.6516102, + "learning_rate": 1.5332891571767264e-06, + "loss": 0.67316675, + "num_input_tokens_seen": 210773495, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.13513184, + "step": 9775, + "time_per_iteration": 2.6082944869995117 + }, + { + "auxiliary_loss_clip": 0.01115353, + "auxiliary_loss_mlp": 0.01031751, + "balance_loss_clip": 1.04026985, + "balance_loss_mlp": 1.01837587, + "epoch": 0.5877649180820682, + "flos": 26724721282560.0, + "grad_norm": 1.7664802622740376, + "language_loss": 0.73508799, + "learning_rate": 1.5329104567957326e-06, + "loss": 0.75655901, + "num_input_tokens_seen": 210793645, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.13366699, + "step": 9776, + "time_per_iteration": 2.5509114265441895 + }, + { + "auxiliary_loss_clip": 0.01123285, + "auxiliary_loss_mlp": 0.01033428, + "balance_loss_clip": 1.04672289, + "balance_loss_mlp": 1.02092338, + "epoch": 0.5878250413347362, + "flos": 21032197879680.0, + "grad_norm": 1.7776019741164593, + "language_loss": 0.74521053, + "learning_rate": 1.532531774126821e-06, + "loss": 0.76677763, + "num_input_tokens_seen": 210813415, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.12506104, + "step": 9777, + "time_per_iteration": 2.533569812774658 + }, + { + "auxiliary_loss_clip": 0.01118926, + "auxiliary_loss_mlp": 0.01033894, + "balance_loss_clip": 1.04663146, + "balance_loss_mlp": 1.02190781, + "epoch": 0.5878851645874041, + "flos": 25484259047040.0, + "grad_norm": 1.5756340448463368, + "language_loss": 0.74473178, + "learning_rate": 1.5321531091843512e-06, + "loss": 0.76626003, + "num_input_tokens_seen": 210833850, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11993408, + "step": 9778, + "time_per_iteration": 4.103631496429443 + }, + { + "auxiliary_loss_clip": 0.01126261, + "auxiliary_loss_mlp": 0.01027531, + "balance_loss_clip": 1.04947186, + "balance_loss_mlp": 1.0150497, + "epoch": 0.5879452878400722, + "flos": 23769129559680.0, + "grad_norm": 1.833538145407385, + "language_loss": 0.69971943, + "learning_rate": 1.5317744619826824e-06, + "loss": 0.72125733, + "num_input_tokens_seen": 210853115, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.12487793, + "step": 9779, + "time_per_iteration": 2.4731619358062744 + }, + { + "auxiliary_loss_clip": 0.01129021, + "auxiliary_loss_mlp": 0.01033241, + "balance_loss_clip": 1.05110574, + "balance_loss_mlp": 1.01992512, + "epoch": 0.5880054110927401, + "flos": 17824513530240.0, + "grad_norm": 2.1341207582763793, + "language_loss": 0.6695658, + "learning_rate": 1.5313958325361727e-06, + "loss": 0.6911884, + "num_input_tokens_seen": 210872090, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.13317871, + "step": 9780, + "time_per_iteration": 2.5296692848205566 + }, + { + "auxiliary_loss_clip": 0.01127235, + "auxiliary_loss_mlp": 0.01034071, + "balance_loss_clip": 1.05130816, + "balance_loss_mlp": 1.02080262, + "epoch": 0.5880655343454081, + "flos": 19463404400640.0, + "grad_norm": 1.983280379464627, + "language_loss": 0.72460151, + "learning_rate": 1.5310172208591807e-06, + "loss": 0.74621451, + "num_input_tokens_seen": 210888490, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.13269043, + "step": 9781, + "time_per_iteration": 2.4629037380218506 + }, + { + "auxiliary_loss_clip": 0.01123152, + "auxiliary_loss_mlp": 0.01032435, + "balance_loss_clip": 1.04713297, + "balance_loss_mlp": 1.01985228, + "epoch": 0.588125657598076, + "flos": 21397588980480.0, + "grad_norm": 1.3243344375449662, + "language_loss": 0.70543754, + "learning_rate": 1.5306386269660622e-06, + "loss": 0.72699344, + "num_input_tokens_seen": 210908220, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.12567139, + "step": 9782, + "time_per_iteration": 2.5631656646728516 + }, + { + "auxiliary_loss_clip": 0.0112496, + "auxiliary_loss_mlp": 0.01029888, + "balance_loss_clip": 1.04538989, + "balance_loss_mlp": 1.01747799, + "epoch": 0.588185780850744, + "flos": 16034653797120.0, + "grad_norm": 2.617475644441782, + "language_loss": 0.70177627, + "learning_rate": 1.5302600508711741e-06, + "loss": 0.72332478, + "num_input_tokens_seen": 210923945, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.12414551, + "step": 9783, + "time_per_iteration": 2.396085500717163 + }, + { + "auxiliary_loss_clip": 0.01119644, + "auxiliary_loss_mlp": 0.01037515, + "balance_loss_clip": 1.041991, + "balance_loss_mlp": 1.02247679, + "epoch": 0.588245904103412, + "flos": 23728226947200.0, + "grad_norm": 1.9087659488797037, + "language_loss": 0.6906811, + "learning_rate": 1.5298814925888719e-06, + "loss": 0.71225262, + "num_input_tokens_seen": 210941955, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.15032959, + "step": 9784, + "time_per_iteration": 2.47577166557312 + }, + { + "auxiliary_loss_clip": 0.0112418, + "auxiliary_loss_mlp": 0.01031956, + "balance_loss_clip": 1.04673719, + "balance_loss_mlp": 1.01939082, + "epoch": 0.58830602735608, + "flos": 33802534558080.0, + "grad_norm": 2.053979096404244, + "language_loss": 0.69073641, + "learning_rate": 1.5295029521335102e-06, + "loss": 0.71229774, + "num_input_tokens_seen": 210963105, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.12554932, + "step": 9785, + "time_per_iteration": 2.556626319885254 + }, + { + "auxiliary_loss_clip": 0.01120705, + "auxiliary_loss_mlp": 0.01027181, + "balance_loss_clip": 1.04614186, + "balance_loss_mlp": 1.01562333, + "epoch": 0.588366150608748, + "flos": 17090714586240.0, + "grad_norm": 2.1777300623546973, + "language_loss": 0.77823341, + "learning_rate": 1.5291244295194448e-06, + "loss": 0.79971224, + "num_input_tokens_seen": 210978720, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11566162, + "step": 9786, + "time_per_iteration": 3.811506748199463 + }, + { + "auxiliary_loss_clip": 0.01117551, + "auxiliary_loss_mlp": 0.0103447, + "balance_loss_clip": 1.0407697, + "balance_loss_mlp": 1.02024889, + "epoch": 0.5884262738614159, + "flos": 22127186033280.0, + "grad_norm": 1.6196175232284615, + "language_loss": 0.79618335, + "learning_rate": 1.5287459247610276e-06, + "loss": 0.81770355, + "num_input_tokens_seen": 210998750, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.14221191, + "step": 9787, + "time_per_iteration": 2.450746774673462 + }, + { + "auxiliary_loss_clip": 0.01125856, + "auxiliary_loss_mlp": 0.01030566, + "balance_loss_clip": 1.05059969, + "balance_loss_mlp": 1.01868057, + "epoch": 0.5884863971140839, + "flos": 21031838743680.0, + "grad_norm": 1.592189887513454, + "language_loss": 0.66617918, + "learning_rate": 1.5283674378726116e-06, + "loss": 0.68774343, + "num_input_tokens_seen": 211017550, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11877441, + "step": 9788, + "time_per_iteration": 2.533174991607666 + }, + { + "auxiliary_loss_clip": 0.01123493, + "auxiliary_loss_mlp": 0.01036402, + "balance_loss_clip": 1.04836702, + "balance_loss_mlp": 1.02303863, + "epoch": 0.5885465203667518, + "flos": 23805112008960.0, + "grad_norm": 2.3087547801745454, + "language_loss": 0.80716074, + "learning_rate": 1.5279889688685506e-06, + "loss": 0.82875967, + "num_input_tokens_seen": 211034135, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.13366699, + "step": 9789, + "time_per_iteration": 2.4751808643341064 + }, + { + "auxiliary_loss_clip": 0.01115527, + "auxiliary_loss_mlp": 0.01030803, + "balance_loss_clip": 1.04102039, + "balance_loss_mlp": 1.01753521, + "epoch": 0.5886066436194198, + "flos": 18880574319360.0, + "grad_norm": 1.7431611327919208, + "language_loss": 0.70110232, + "learning_rate": 1.5276105177631944e-06, + "loss": 0.72256559, + "num_input_tokens_seen": 211053850, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.13262939, + "step": 9790, + "time_per_iteration": 2.494112014770508 + }, + { + "auxiliary_loss_clip": 0.01120142, + "auxiliary_loss_mlp": 0.01034972, + "balance_loss_clip": 1.04416609, + "balance_loss_mlp": 1.02064872, + "epoch": 0.5886667668720877, + "flos": 24790141653120.0, + "grad_norm": 1.7814869037821044, + "language_loss": 0.83499157, + "learning_rate": 1.527232084570895e-06, + "loss": 0.85654271, + "num_input_tokens_seen": 211072165, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.14312744, + "step": 9791, + "time_per_iteration": 2.472787380218506 + }, + { + "auxiliary_loss_clip": 0.01127474, + "auxiliary_loss_mlp": 0.0103744, + "balance_loss_clip": 1.05083716, + "balance_loss_mlp": 1.02339721, + "epoch": 0.5887268901247558, + "flos": 21614381516160.0, + "grad_norm": 1.6342256784302767, + "language_loss": 0.76364172, + "learning_rate": 1.5268536693060026e-06, + "loss": 0.78529084, + "num_input_tokens_seen": 211089630, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.14038086, + "step": 9792, + "time_per_iteration": 4.0162200927734375 + }, + { + "auxiliary_loss_clip": 0.01119647, + "auxiliary_loss_mlp": 0.01035044, + "balance_loss_clip": 1.04221869, + "balance_loss_mlp": 1.02118039, + "epoch": 0.5887870133774237, + "flos": 20481722974080.0, + "grad_norm": 5.0558168655175875, + "language_loss": 0.69414032, + "learning_rate": 1.5264752719828662e-06, + "loss": 0.71568727, + "num_input_tokens_seen": 211106120, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.1385498, + "step": 9793, + "time_per_iteration": 2.5307066440582275 + }, + { + "auxiliary_loss_clip": 0.01119255, + "auxiliary_loss_mlp": 0.01037454, + "balance_loss_clip": 1.04355454, + "balance_loss_mlp": 1.02292252, + "epoch": 0.5888471366300917, + "flos": 19206283870080.0, + "grad_norm": 2.1577947204891403, + "language_loss": 0.6038326, + "learning_rate": 1.5260968926158353e-06, + "loss": 0.62539971, + "num_input_tokens_seen": 211122450, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.14544678, + "step": 9794, + "time_per_iteration": 2.4148237705230713 + }, + { + "auxiliary_loss_clip": 0.01125955, + "auxiliary_loss_mlp": 0.01035471, + "balance_loss_clip": 1.04863214, + "balance_loss_mlp": 1.02235746, + "epoch": 0.5889072598827596, + "flos": 19972904866560.0, + "grad_norm": 1.5295612628914077, + "language_loss": 0.65163171, + "learning_rate": 1.525718531219257e-06, + "loss": 0.67324597, + "num_input_tokens_seen": 211141765, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.13116455, + "step": 9795, + "time_per_iteration": 2.5276906490325928 + }, + { + "auxiliary_loss_clip": 0.01123459, + "auxiliary_loss_mlp": 0.01030855, + "balance_loss_clip": 1.04788113, + "balance_loss_mlp": 1.01929116, + "epoch": 0.5889673831354276, + "flos": 20741249715840.0, + "grad_norm": 1.5631762468804995, + "language_loss": 0.74623287, + "learning_rate": 1.5253401878074801e-06, + "loss": 0.76777601, + "num_input_tokens_seen": 211160475, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.11560059, + "step": 9796, + "time_per_iteration": 2.458475112915039 + }, + { + "auxiliary_loss_clip": 0.01127453, + "auxiliary_loss_mlp": 0.01028764, + "balance_loss_clip": 1.0508194, + "balance_loss_mlp": 1.01640165, + "epoch": 0.5890275063880956, + "flos": 25300935008640.0, + "grad_norm": 1.4625827858230174, + "language_loss": 0.83248514, + "learning_rate": 1.5249618623948507e-06, + "loss": 0.8540473, + "num_input_tokens_seen": 211180480, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.12359619, + "step": 9797, + "time_per_iteration": 2.5361485481262207 + }, + { + "auxiliary_loss_clip": 0.01123192, + "auxiliary_loss_mlp": 0.01030685, + "balance_loss_clip": 1.04990804, + "balance_loss_mlp": 1.01842999, + "epoch": 0.5890876296407636, + "flos": 11765377964160.0, + "grad_norm": 1.5869433661050794, + "language_loss": 0.7887947, + "learning_rate": 1.5245835549957152e-06, + "loss": 0.81033349, + "num_input_tokens_seen": 211198000, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.12261963, + "step": 9798, + "time_per_iteration": 3.854172945022583 + }, + { + "auxiliary_loss_clip": 0.01129389, + "auxiliary_loss_mlp": 0.0103168, + "balance_loss_clip": 1.05545568, + "balance_loss_mlp": 1.02030742, + "epoch": 0.5891477528934316, + "flos": 13589460380160.0, + "grad_norm": 2.081869666585704, + "language_loss": 0.74227959, + "learning_rate": 1.5242052656244186e-06, + "loss": 0.76389027, + "num_input_tokens_seen": 211214765, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.11364746, + "step": 9799, + "time_per_iteration": 2.4407896995544434 + }, + { + "auxiliary_loss_clip": 0.01124925, + "auxiliary_loss_mlp": 0.01029141, + "balance_loss_clip": 1.04761672, + "balance_loss_mlp": 1.01571822, + "epoch": 0.5892078761460995, + "flos": 15049193189760.0, + "grad_norm": 2.733050892936197, + "language_loss": 0.76532042, + "learning_rate": 1.5238269942953064e-06, + "loss": 0.78686106, + "num_input_tokens_seen": 211232335, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.13415527, + "step": 9800, + "time_per_iteration": 2.4807069301605225 + }, + { + "auxiliary_loss_clip": 0.01120764, + "auxiliary_loss_mlp": 0.01037997, + "balance_loss_clip": 1.04404712, + "balance_loss_mlp": 1.02607012, + "epoch": 0.5892679993987675, + "flos": 15778215624960.0, + "grad_norm": 1.7272569475343353, + "language_loss": 0.78628862, + "learning_rate": 1.523448741022722e-06, + "loss": 0.80787623, + "num_input_tokens_seen": 211249985, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.11938477, + "step": 9801, + "time_per_iteration": 2.4765655994415283 + }, + { + "auxiliary_loss_clip": 0.01120031, + "auxiliary_loss_mlp": 0.01034323, + "balance_loss_clip": 1.04152572, + "balance_loss_mlp": 1.02016723, + "epoch": 0.5893281226514354, + "flos": 25265203954560.0, + "grad_norm": 1.8723070288887467, + "language_loss": 0.66449344, + "learning_rate": 1.5230705058210088e-06, + "loss": 0.68603694, + "num_input_tokens_seen": 211268425, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.14154053, + "step": 9802, + "time_per_iteration": 2.5382511615753174 + }, + { + "auxiliary_loss_clip": 0.01119517, + "auxiliary_loss_mlp": 0.01029955, + "balance_loss_clip": 1.0452131, + "balance_loss_mlp": 1.01759267, + "epoch": 0.5893882459041034, + "flos": 19458232842240.0, + "grad_norm": 1.6473792131303973, + "language_loss": 0.78319001, + "learning_rate": 1.5226922887045108e-06, + "loss": 0.80468476, + "num_input_tokens_seen": 211286680, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.12359619, + "step": 9803, + "time_per_iteration": 2.615596294403076 + }, + { + "auxiliary_loss_clip": 0.01121824, + "auxiliary_loss_mlp": 0.01038548, + "balance_loss_clip": 1.04444194, + "balance_loss_mlp": 1.02536941, + "epoch": 0.5894483691567713, + "flos": 20634056553600.0, + "grad_norm": 1.4977464010077424, + "language_loss": 0.7310791, + "learning_rate": 1.5223140896875686e-06, + "loss": 0.75268286, + "num_input_tokens_seen": 211307700, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.1317749, + "step": 9804, + "time_per_iteration": 2.5188205242156982 + }, + { + "auxiliary_loss_clip": 0.01126057, + "auxiliary_loss_mlp": 0.01034061, + "balance_loss_clip": 1.04973578, + "balance_loss_mlp": 1.02130008, + "epoch": 0.5895084924094394, + "flos": 17778223877760.0, + "grad_norm": 1.5818364318354057, + "language_loss": 0.74660027, + "learning_rate": 1.5219359087845234e-06, + "loss": 0.76820147, + "num_input_tokens_seen": 211324835, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12762451, + "step": 9805, + "time_per_iteration": 2.469740629196167 + }, + { + "auxiliary_loss_clip": 0.01129856, + "auxiliary_loss_mlp": 0.01042033, + "balance_loss_clip": 1.04581523, + "balance_loss_mlp": 1.02773428, + "epoch": 0.5895686156621073, + "flos": 20121072468480.0, + "grad_norm": 1.7972874786926742, + "language_loss": 0.78056228, + "learning_rate": 1.5215577460097174e-06, + "loss": 0.80228114, + "num_input_tokens_seen": 211344130, + "router_z_loss_clip": 0.84033203, + "router_z_loss_mlp": 0.14294434, + "step": 9806, + "time_per_iteration": 2.4666478633880615 + }, + { + "auxiliary_loss_clip": 0.01124791, + "auxiliary_loss_mlp": 0.01030677, + "balance_loss_clip": 1.04808068, + "balance_loss_mlp": 1.01755226, + "epoch": 0.5896287389147753, + "flos": 20850058990080.0, + "grad_norm": 2.042356402514908, + "language_loss": 0.76967961, + "learning_rate": 1.5211796013774887e-06, + "loss": 0.79123425, + "num_input_tokens_seen": 211362915, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.13122559, + "step": 9807, + "time_per_iteration": 2.43696928024292 + }, + { + "auxiliary_loss_clip": 0.0113129, + "auxiliary_loss_mlp": 0.01030326, + "balance_loss_clip": 1.04988742, + "balance_loss_mlp": 1.01717091, + "epoch": 0.5896888621674432, + "flos": 14537897043840.0, + "grad_norm": 3.9871677298475454, + "language_loss": 0.74636024, + "learning_rate": 1.5208014749021786e-06, + "loss": 0.7679764, + "num_input_tokens_seen": 211380700, + "router_z_loss_clip": 0.81494141, + "router_z_loss_mlp": 0.13165283, + "step": 9808, + "time_per_iteration": 2.4337966442108154 + }, + { + "auxiliary_loss_clip": 0.01125589, + "auxiliary_loss_mlp": 0.01027735, + "balance_loss_clip": 1.04780531, + "balance_loss_mlp": 1.01441348, + "epoch": 0.5897489854201112, + "flos": 20886759711360.0, + "grad_norm": 2.0693428275730454, + "language_loss": 0.72478902, + "learning_rate": 1.5204233665981236e-06, + "loss": 0.74632227, + "num_input_tokens_seen": 211400095, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.13323975, + "step": 9809, + "time_per_iteration": 2.478588819503784 + }, + { + "auxiliary_loss_clip": 0.0112917, + "auxiliary_loss_mlp": 0.01032074, + "balance_loss_clip": 1.04962778, + "balance_loss_mlp": 1.0188303, + "epoch": 0.5898091086727792, + "flos": 20011149872640.0, + "grad_norm": 1.9847692786810995, + "language_loss": 0.82263649, + "learning_rate": 1.5200452764796627e-06, + "loss": 0.84424895, + "num_input_tokens_seen": 211417810, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.13244629, + "step": 9810, + "time_per_iteration": 2.4712460041046143 + }, + { + "auxiliary_loss_clip": 0.01123851, + "auxiliary_loss_mlp": 0.01027055, + "balance_loss_clip": 1.04864144, + "balance_loss_mlp": 1.01509237, + "epoch": 0.5898692319254472, + "flos": 16253242012800.0, + "grad_norm": 1.6298523594084826, + "language_loss": 0.81097615, + "learning_rate": 1.5196672045611336e-06, + "loss": 0.83248526, + "num_input_tokens_seen": 211436020, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.11962891, + "step": 9811, + "time_per_iteration": 2.4478824138641357 + }, + { + "auxiliary_loss_clip": 0.01125005, + "auxiliary_loss_mlp": 0.01027567, + "balance_loss_clip": 1.04760456, + "balance_loss_mlp": 1.01388788, + "epoch": 0.5899293551781152, + "flos": 20448541785600.0, + "grad_norm": 2.161225015661129, + "language_loss": 0.77036262, + "learning_rate": 1.5192891508568715e-06, + "loss": 0.79188836, + "num_input_tokens_seen": 211454335, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.13690186, + "step": 9812, + "time_per_iteration": 2.462970018386841 + }, + { + "auxiliary_loss_clip": 0.01120197, + "auxiliary_loss_mlp": 0.01027722, + "balance_loss_clip": 1.04658532, + "balance_loss_mlp": 1.0164454, + "epoch": 0.5899894784307831, + "flos": 13881701433600.0, + "grad_norm": 1.8281697455098307, + "language_loss": 0.70775509, + "learning_rate": 1.5189111153812133e-06, + "loss": 0.72923428, + "num_input_tokens_seen": 211472775, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11279297, + "step": 9813, + "time_per_iteration": 2.4456796646118164 + }, + { + "auxiliary_loss_clip": 0.01120915, + "auxiliary_loss_mlp": 0.01032724, + "balance_loss_clip": 1.04547834, + "balance_loss_mlp": 1.02077925, + "epoch": 0.5900496016834511, + "flos": 20083797129600.0, + "grad_norm": 1.9589090290670066, + "language_loss": 0.72222304, + "learning_rate": 1.518533098148494e-06, + "loss": 0.74375951, + "num_input_tokens_seen": 211492195, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.1194458, + "step": 9814, + "time_per_iteration": 2.4569382667541504 + }, + { + "auxiliary_loss_clip": 0.01125587, + "auxiliary_loss_mlp": 0.01026977, + "balance_loss_clip": 1.05035031, + "balance_loss_mlp": 1.01451933, + "epoch": 0.590109724936119, + "flos": 20259148348800.0, + "grad_norm": 1.916182825264156, + "language_loss": 0.78625721, + "learning_rate": 1.5181550991730476e-06, + "loss": 0.80778283, + "num_input_tokens_seen": 211510220, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.12463379, + "step": 9815, + "time_per_iteration": 2.4744417667388916 + }, + { + "auxiliary_loss_clip": 0.01124774, + "auxiliary_loss_mlp": 0.01033696, + "balance_loss_clip": 1.04468286, + "balance_loss_mlp": 1.02029109, + "epoch": 0.590169848188787, + "flos": 24235069806720.0, + "grad_norm": 3.033842852143528, + "language_loss": 0.75937313, + "learning_rate": 1.5177771184692083e-06, + "loss": 0.78095782, + "num_input_tokens_seen": 211526260, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.13397217, + "step": 9816, + "time_per_iteration": 2.5025222301483154 + }, + { + "auxiliary_loss_clip": 0.01124595, + "auxiliary_loss_mlp": 0.01031004, + "balance_loss_clip": 1.04887199, + "balance_loss_mlp": 1.01855218, + "epoch": 0.590229971441455, + "flos": 17784724239360.0, + "grad_norm": 2.0393637520414107, + "language_loss": 0.8095758, + "learning_rate": 1.517399156051309e-06, + "loss": 0.83113188, + "num_input_tokens_seen": 211542890, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12445068, + "step": 9817, + "time_per_iteration": 2.4696896076202393 + }, + { + "auxiliary_loss_clip": 0.01127162, + "auxiliary_loss_mlp": 0.01032846, + "balance_loss_clip": 1.05141962, + "balance_loss_mlp": 1.02080619, + "epoch": 0.590290094694123, + "flos": 22236893147520.0, + "grad_norm": 1.6473787067022836, + "language_loss": 0.76161981, + "learning_rate": 1.517021211933682e-06, + "loss": 0.78321993, + "num_input_tokens_seen": 211562685, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.1204834, + "step": 9818, + "time_per_iteration": 2.499192476272583 + }, + { + "auxiliary_loss_clip": 0.011243, + "auxiliary_loss_mlp": 0.01031121, + "balance_loss_clip": 1.04850197, + "balance_loss_mlp": 1.01794887, + "epoch": 0.5903502179467909, + "flos": 19098623831040.0, + "grad_norm": 2.8048658747021036, + "language_loss": 0.66636211, + "learning_rate": 1.5166432861306592e-06, + "loss": 0.68791628, + "num_input_tokens_seen": 211579960, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.13171387, + "step": 9819, + "time_per_iteration": 2.4531333446502686 + }, + { + "auxiliary_loss_clip": 0.01129273, + "auxiliary_loss_mlp": 0.01034082, + "balance_loss_clip": 1.05250967, + "balance_loss_mlp": 1.02167821, + "epoch": 0.5904103411994589, + "flos": 24235500769920.0, + "grad_norm": 1.6303531324789367, + "language_loss": 0.78001601, + "learning_rate": 1.5162653786565714e-06, + "loss": 0.80164957, + "num_input_tokens_seen": 211599310, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.12402344, + "step": 9820, + "time_per_iteration": 2.463719367980957 + }, + { + "auxiliary_loss_clip": 0.01055944, + "auxiliary_loss_mlp": 0.01005233, + "balance_loss_clip": 1.02882051, + "balance_loss_mlp": 1.0039587, + "epoch": 0.5904704644521268, + "flos": 64876613045760.0, + "grad_norm": 0.9327616202043155, + "language_loss": 0.65080297, + "learning_rate": 1.5158874895257487e-06, + "loss": 0.67141473, + "num_input_tokens_seen": 211658790, + "router_z_loss_clip": 0.27099609, + "router_z_loss_mlp": 0.01274109, + "step": 9821, + "time_per_iteration": 4.456084251403809 + }, + { + "auxiliary_loss_clip": 0.01121521, + "auxiliary_loss_mlp": 0.01032153, + "balance_loss_clip": 1.04804873, + "balance_loss_mlp": 1.02021444, + "epoch": 0.5905305877047948, + "flos": 19609991804160.0, + "grad_norm": 2.0098955573358066, + "language_loss": 0.61630827, + "learning_rate": 1.515509618752521e-06, + "loss": 0.63784504, + "num_input_tokens_seen": 211677240, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11938477, + "step": 9822, + "time_per_iteration": 2.470621109008789 + }, + { + "auxiliary_loss_clip": 0.01123533, + "auxiliary_loss_mlp": 0.01033898, + "balance_loss_clip": 1.04723239, + "balance_loss_mlp": 1.02141142, + "epoch": 0.5905907109574628, + "flos": 18989634988800.0, + "grad_norm": 1.8573131494487125, + "language_loss": 0.82346451, + "learning_rate": 1.5151317663512173e-06, + "loss": 0.84503889, + "num_input_tokens_seen": 211695485, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.125, + "step": 9823, + "time_per_iteration": 2.44046950340271 + }, + { + "auxiliary_loss_clip": 0.01123982, + "auxiliary_loss_mlp": 0.0103112, + "balance_loss_clip": 1.05004752, + "balance_loss_mlp": 1.01803088, + "epoch": 0.5906508342101308, + "flos": 22200407907840.0, + "grad_norm": 2.1887887428927253, + "language_loss": 0.73195446, + "learning_rate": 1.514753932336165e-06, + "loss": 0.75350553, + "num_input_tokens_seen": 211713090, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.13079834, + "step": 9824, + "time_per_iteration": 2.465787649154663 + }, + { + "auxiliary_loss_clip": 0.01129863, + "auxiliary_loss_mlp": 0.01033276, + "balance_loss_clip": 1.04805481, + "balance_loss_mlp": 1.01926279, + "epoch": 0.5907109574627988, + "flos": 20886687884160.0, + "grad_norm": 2.103225804219206, + "language_loss": 0.82836795, + "learning_rate": 1.514376116721693e-06, + "loss": 0.84999937, + "num_input_tokens_seen": 211732510, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.14001465, + "step": 9825, + "time_per_iteration": 2.518066644668579 + }, + { + "auxiliary_loss_clip": 0.01121996, + "auxiliary_loss_mlp": 0.01030589, + "balance_loss_clip": 1.04908001, + "balance_loss_mlp": 1.0195142, + "epoch": 0.5907710807154667, + "flos": 21506649649920.0, + "grad_norm": 1.7048697170546665, + "language_loss": 0.76822495, + "learning_rate": 1.5139983195221272e-06, + "loss": 0.78975081, + "num_input_tokens_seen": 211748695, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11083984, + "step": 9826, + "time_per_iteration": 2.474781036376953 + }, + { + "auxiliary_loss_clip": 0.01114982, + "auxiliary_loss_mlp": 0.01028928, + "balance_loss_clip": 1.04106116, + "balance_loss_mlp": 1.0160712, + "epoch": 0.5908312039681347, + "flos": 22018376759040.0, + "grad_norm": 1.710712214301092, + "language_loss": 0.72199404, + "learning_rate": 1.513620540751793e-06, + "loss": 0.74343312, + "num_input_tokens_seen": 211768545, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.12866211, + "step": 9827, + "time_per_iteration": 2.4833221435546875 + }, + { + "auxiliary_loss_clip": 0.01126001, + "auxiliary_loss_mlp": 0.01026908, + "balance_loss_clip": 1.05001736, + "balance_loss_mlp": 1.01505876, + "epoch": 0.5908913272208026, + "flos": 18479523991680.0, + "grad_norm": 1.6848671673087536, + "language_loss": 0.79707378, + "learning_rate": 1.5132427804250178e-06, + "loss": 0.81860292, + "num_input_tokens_seen": 211786665, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.11859131, + "step": 9828, + "time_per_iteration": 2.476552963256836 + }, + { + "auxiliary_loss_clip": 0.01124238, + "auxiliary_loss_mlp": 0.01035996, + "balance_loss_clip": 1.045807, + "balance_loss_mlp": 1.02282321, + "epoch": 0.5909514504734706, + "flos": 12312189682560.0, + "grad_norm": 2.3759453724295367, + "language_loss": 0.8869257, + "learning_rate": 1.5128650385561241e-06, + "loss": 0.90852803, + "num_input_tokens_seen": 211801215, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.1317749, + "step": 9829, + "time_per_iteration": 2.537320375442505 + }, + { + "auxiliary_loss_clip": 0.01048178, + "auxiliary_loss_mlp": 0.01008414, + "balance_loss_clip": 1.0217762, + "balance_loss_mlp": 1.00711715, + "epoch": 0.5910115737261386, + "flos": 70213262451840.0, + "grad_norm": 0.7722841500290841, + "language_loss": 0.57880569, + "learning_rate": 1.5124873151594376e-06, + "loss": 0.59937155, + "num_input_tokens_seen": 211857005, + "router_z_loss_clip": 0.26416016, + "router_z_loss_mlp": 0.01295471, + "step": 9830, + "time_per_iteration": 4.411566257476807 + }, + { + "auxiliary_loss_clip": 0.01128141, + "auxiliary_loss_mlp": 0.01040868, + "balance_loss_clip": 1.04783654, + "balance_loss_mlp": 1.0257163, + "epoch": 0.5910716969788066, + "flos": 22017766227840.0, + "grad_norm": 3.984450180294788, + "language_loss": 0.76059163, + "learning_rate": 1.5121096102492812e-06, + "loss": 0.78228176, + "num_input_tokens_seen": 211876675, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.15148926, + "step": 9831, + "time_per_iteration": 2.6198716163635254 + }, + { + "auxiliary_loss_clip": 0.01116951, + "auxiliary_loss_mlp": 0.01031086, + "balance_loss_clip": 1.04583097, + "balance_loss_mlp": 1.0189271, + "epoch": 0.5911318202314745, + "flos": 21251648021760.0, + "grad_norm": 1.6758760048827352, + "language_loss": 0.77476752, + "learning_rate": 1.5117319238399767e-06, + "loss": 0.79624784, + "num_input_tokens_seen": 211895725, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.12158203, + "step": 9832, + "time_per_iteration": 2.6452248096466064 + }, + { + "auxiliary_loss_clip": 0.0112332, + "auxiliary_loss_mlp": 0.01026988, + "balance_loss_clip": 1.04893148, + "balance_loss_mlp": 1.01509094, + "epoch": 0.5911919434841425, + "flos": 17821604528640.0, + "grad_norm": 1.6909837956695526, + "language_loss": 0.83244622, + "learning_rate": 1.511354255945847e-06, + "loss": 0.85394925, + "num_input_tokens_seen": 211913860, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11907959, + "step": 9833, + "time_per_iteration": 2.598599672317505 + }, + { + "auxiliary_loss_clip": 0.01118336, + "auxiliary_loss_mlp": 0.01035005, + "balance_loss_clip": 1.04417992, + "balance_loss_mlp": 1.02254736, + "epoch": 0.5912520667368104, + "flos": 20374781207040.0, + "grad_norm": 1.4719309239751464, + "language_loss": 0.74306476, + "learning_rate": 1.5109766065812123e-06, + "loss": 0.76459813, + "num_input_tokens_seen": 211932880, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12463379, + "step": 9834, + "time_per_iteration": 2.6528563499450684 + }, + { + "auxiliary_loss_clip": 0.01119206, + "auxiliary_loss_mlp": 0.01031051, + "balance_loss_clip": 1.04487014, + "balance_loss_mlp": 1.019243, + "epoch": 0.5913121899894784, + "flos": 17930557457280.0, + "grad_norm": 2.231385991446024, + "language_loss": 0.78108001, + "learning_rate": 1.5105989757603942e-06, + "loss": 0.80258262, + "num_input_tokens_seen": 211948625, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11798096, + "step": 9835, + "time_per_iteration": 2.581256866455078 + }, + { + "auxiliary_loss_clip": 0.01120748, + "auxiliary_loss_mlp": 0.01032462, + "balance_loss_clip": 1.04455721, + "balance_loss_mlp": 1.020118, + "epoch": 0.5913723132421465, + "flos": 22126934638080.0, + "grad_norm": 3.6719883801290365, + "language_loss": 0.74068463, + "learning_rate": 1.5102213634977117e-06, + "loss": 0.76221669, + "num_input_tokens_seen": 211965355, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.12347412, + "step": 9836, + "time_per_iteration": 2.6173946857452393 + }, + { + "auxiliary_loss_clip": 0.01124625, + "auxiliary_loss_mlp": 0.01029099, + "balance_loss_clip": 1.04778934, + "balance_loss_mlp": 1.01668966, + "epoch": 0.5914324364948144, + "flos": 15697918771200.0, + "grad_norm": 2.0277302046932264, + "language_loss": 0.81851763, + "learning_rate": 1.5098437698074841e-06, + "loss": 0.84005487, + "num_input_tokens_seen": 211982245, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.12414551, + "step": 9837, + "time_per_iteration": 3.8588311672210693 + }, + { + "auxiliary_loss_clip": 0.01127794, + "auxiliary_loss_mlp": 0.01034967, + "balance_loss_clip": 1.04925942, + "balance_loss_mlp": 1.02098393, + "epoch": 0.5914925597474824, + "flos": 22747327367040.0, + "grad_norm": 1.639701779189396, + "language_loss": 0.79787004, + "learning_rate": 1.5094661947040304e-06, + "loss": 0.81949759, + "num_input_tokens_seen": 212000250, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.13977051, + "step": 9838, + "time_per_iteration": 2.4727842807769775 + }, + { + "auxiliary_loss_clip": 0.01124214, + "auxiliary_loss_mlp": 0.01033845, + "balance_loss_clip": 1.0484848, + "balance_loss_mlp": 1.0211966, + "epoch": 0.5915526830001503, + "flos": 18292788161280.0, + "grad_norm": 2.0127909564158357, + "language_loss": 0.69789135, + "learning_rate": 1.5090886382016673e-06, + "loss": 0.71947193, + "num_input_tokens_seen": 212017505, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12652588, + "step": 9839, + "time_per_iteration": 2.452669620513916 + }, + { + "auxiliary_loss_clip": 0.01124041, + "auxiliary_loss_mlp": 0.01039474, + "balance_loss_clip": 1.04583216, + "balance_loss_mlp": 1.0265398, + "epoch": 0.5916128062528183, + "flos": 17019072910080.0, + "grad_norm": 7.028590564789231, + "language_loss": 0.65649235, + "learning_rate": 1.5087111003147124e-06, + "loss": 0.67812753, + "num_input_tokens_seen": 212034595, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.1293335, + "step": 9840, + "time_per_iteration": 2.456721544265747 + }, + { + "auxiliary_loss_clip": 0.0112272, + "auxiliary_loss_mlp": 0.01035662, + "balance_loss_clip": 1.04412055, + "balance_loss_mlp": 1.02124953, + "epoch": 0.5916729295054862, + "flos": 24754231031040.0, + "grad_norm": 3.6818709542849484, + "language_loss": 0.81815743, + "learning_rate": 1.5083335810574813e-06, + "loss": 0.83974123, + "num_input_tokens_seen": 212055775, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.14398193, + "step": 9841, + "time_per_iteration": 2.6390528678894043 + }, + { + "auxiliary_loss_clip": 0.01118771, + "auxiliary_loss_mlp": 0.01027073, + "balance_loss_clip": 1.04562449, + "balance_loss_mlp": 1.01521134, + "epoch": 0.5917330527581542, + "flos": 15958199698560.0, + "grad_norm": 1.840182588605284, + "language_loss": 0.69395995, + "learning_rate": 1.507956080444291e-06, + "loss": 0.71541846, + "num_input_tokens_seen": 212074000, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11859131, + "step": 9842, + "time_per_iteration": 3.969723701477051 + }, + { + "auxiliary_loss_clip": 0.01121097, + "auxiliary_loss_mlp": 0.01030425, + "balance_loss_clip": 1.04427803, + "balance_loss_mlp": 1.01770544, + "epoch": 0.5917931760108222, + "flos": 23800730549760.0, + "grad_norm": 1.839180850061387, + "language_loss": 0.83034325, + "learning_rate": 1.5075785984894549e-06, + "loss": 0.85185844, + "num_input_tokens_seen": 212091415, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.1270752, + "step": 9843, + "time_per_iteration": 2.5443761348724365 + }, + { + "auxiliary_loss_clip": 0.01120136, + "auxiliary_loss_mlp": 0.01026605, + "balance_loss_clip": 1.04413533, + "balance_loss_mlp": 1.01348639, + "epoch": 0.5918532992634902, + "flos": 23249609199360.0, + "grad_norm": 2.6067502240442373, + "language_loss": 0.81902504, + "learning_rate": 1.5072011352072875e-06, + "loss": 0.84049249, + "num_input_tokens_seen": 212105255, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.13116455, + "step": 9844, + "time_per_iteration": 2.471306324005127 + }, + { + "auxiliary_loss_clip": 0.01134891, + "auxiliary_loss_mlp": 0.010292, + "balance_loss_clip": 1.05617976, + "balance_loss_mlp": 1.01624799, + "epoch": 0.5919134225161581, + "flos": 19499853726720.0, + "grad_norm": 2.009770599314686, + "language_loss": 0.74385071, + "learning_rate": 1.5068236906121032e-06, + "loss": 0.7654916, + "num_input_tokens_seen": 212122765, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.1295166, + "step": 9845, + "time_per_iteration": 2.457730770111084 + }, + { + "auxiliary_loss_clip": 0.01126306, + "auxiliary_loss_mlp": 0.01025727, + "balance_loss_clip": 1.04820347, + "balance_loss_mlp": 1.01230955, + "epoch": 0.5919735457688261, + "flos": 38800940567040.0, + "grad_norm": 1.959872500656595, + "language_loss": 0.63746107, + "learning_rate": 1.506446264718213e-06, + "loss": 0.65898144, + "num_input_tokens_seen": 212143960, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.13421631, + "step": 9846, + "time_per_iteration": 2.642493486404419 + }, + { + "auxiliary_loss_clip": 0.01114094, + "auxiliary_loss_mlp": 0.01025874, + "balance_loss_clip": 1.04378378, + "balance_loss_mlp": 1.01442993, + "epoch": 0.592033669021494, + "flos": 22163994495360.0, + "grad_norm": 1.8189988354631597, + "language_loss": 0.76729512, + "learning_rate": 1.506068857539931e-06, + "loss": 0.78869486, + "num_input_tokens_seen": 212162005, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.11437988, + "step": 9847, + "time_per_iteration": 2.480903148651123 + }, + { + "auxiliary_loss_clip": 0.01119878, + "auxiliary_loss_mlp": 0.01037223, + "balance_loss_clip": 1.04463243, + "balance_loss_mlp": 1.02324545, + "epoch": 0.592093792274162, + "flos": 22710985781760.0, + "grad_norm": 1.8806061038640682, + "language_loss": 0.62463325, + "learning_rate": 1.5056914690915667e-06, + "loss": 0.64620423, + "num_input_tokens_seen": 212181635, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.13995361, + "step": 9848, + "time_per_iteration": 2.487900733947754 + }, + { + "auxiliary_loss_clip": 0.01128377, + "auxiliary_loss_mlp": 0.01032872, + "balance_loss_clip": 1.05019379, + "balance_loss_mlp": 1.02034879, + "epoch": 0.59215391552683, + "flos": 22528954632960.0, + "grad_norm": 2.246921422470875, + "language_loss": 0.75843519, + "learning_rate": 1.5053140993874312e-06, + "loss": 0.78004766, + "num_input_tokens_seen": 212201615, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.12530518, + "step": 9849, + "time_per_iteration": 2.4771368503570557 + }, + { + "auxiliary_loss_clip": 0.01127074, + "auxiliary_loss_mlp": 0.0103636, + "balance_loss_clip": 1.05028737, + "balance_loss_mlp": 1.02360487, + "epoch": 0.592214038779498, + "flos": 24499013921280.0, + "grad_norm": 1.922269088293407, + "language_loss": 0.75103021, + "learning_rate": 1.5049367484418353e-06, + "loss": 0.77266455, + "num_input_tokens_seen": 212219355, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.12762451, + "step": 9850, + "time_per_iteration": 2.513317108154297 + }, + { + "auxiliary_loss_clip": 0.01123747, + "auxiliary_loss_mlp": 0.01034378, + "balance_loss_clip": 1.04711938, + "balance_loss_mlp": 1.0205493, + "epoch": 0.592274162032166, + "flos": 21831353619840.0, + "grad_norm": 4.461100650646086, + "language_loss": 0.75674391, + "learning_rate": 1.5045594162690868e-06, + "loss": 0.77832508, + "num_input_tokens_seen": 212236710, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.13824463, + "step": 9851, + "time_per_iteration": 2.4495902061462402 + }, + { + "auxiliary_loss_clip": 0.01123711, + "auxiliary_loss_mlp": 0.01029865, + "balance_loss_clip": 1.04760194, + "balance_loss_mlp": 1.01736641, + "epoch": 0.5923342852848339, + "flos": 24608146417920.0, + "grad_norm": 1.942009067476582, + "language_loss": 0.70198119, + "learning_rate": 1.5041821028834954e-06, + "loss": 0.72351694, + "num_input_tokens_seen": 212256195, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.12493896, + "step": 9852, + "time_per_iteration": 2.5052413940429688 + }, + { + "auxiliary_loss_clip": 0.01123816, + "auxiliary_loss_mlp": 0.01041442, + "balance_loss_clip": 1.04557598, + "balance_loss_mlp": 1.02798963, + "epoch": 0.5923944085375019, + "flos": 19938143479680.0, + "grad_norm": 2.170573258485386, + "language_loss": 0.80115813, + "learning_rate": 1.5038048082993685e-06, + "loss": 0.82281071, + "num_input_tokens_seen": 212274085, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.13458252, + "step": 9853, + "time_per_iteration": 2.4827566146850586 + }, + { + "auxiliary_loss_clip": 0.01114359, + "auxiliary_loss_mlp": 0.01029878, + "balance_loss_clip": 1.04207134, + "balance_loss_mlp": 1.01873779, + "epoch": 0.5924545317901698, + "flos": 28658510812800.0, + "grad_norm": 1.760713787121741, + "language_loss": 0.67396021, + "learning_rate": 1.5034275325310124e-06, + "loss": 0.69540262, + "num_input_tokens_seen": 212295530, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.1114502, + "step": 9854, + "time_per_iteration": 2.564026355743408 + }, + { + "auxiliary_loss_clip": 0.01113178, + "auxiliary_loss_mlp": 0.01024559, + "balance_loss_clip": 1.0401063, + "balance_loss_mlp": 1.01222682, + "epoch": 0.5925146550428378, + "flos": 19864885691520.0, + "grad_norm": 1.795251882458641, + "language_loss": 0.88400131, + "learning_rate": 1.5030502755927344e-06, + "loss": 0.9053787, + "num_input_tokens_seen": 212313770, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.12341309, + "step": 9855, + "time_per_iteration": 2.469878673553467 + }, + { + "auxiliary_loss_clip": 0.0111519, + "auxiliary_loss_mlp": 0.0102858, + "balance_loss_clip": 1.04355323, + "balance_loss_mlp": 1.01693296, + "epoch": 0.5925747782955058, + "flos": 15122989681920.0, + "grad_norm": 2.09555352762323, + "language_loss": 0.86814988, + "learning_rate": 1.5026730374988397e-06, + "loss": 0.88958758, + "num_input_tokens_seen": 212331525, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11639404, + "step": 9856, + "time_per_iteration": 2.517563819885254 + }, + { + "auxiliary_loss_clip": 0.01117916, + "auxiliary_loss_mlp": 0.01033923, + "balance_loss_clip": 1.04155457, + "balance_loss_mlp": 1.0216682, + "epoch": 0.5926349015481738, + "flos": 18405440190720.0, + "grad_norm": 1.8852525349932683, + "language_loss": 0.77605057, + "learning_rate": 1.5022958182636332e-06, + "loss": 0.79756898, + "num_input_tokens_seen": 212347295, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.12249756, + "step": 9857, + "time_per_iteration": 2.470034599304199 + }, + { + "auxiliary_loss_clip": 0.01120048, + "auxiliary_loss_mlp": 0.01035115, + "balance_loss_clip": 1.04593658, + "balance_loss_mlp": 1.02284884, + "epoch": 0.5926950248008417, + "flos": 23111138269440.0, + "grad_norm": 2.1016827536048837, + "language_loss": 0.64280391, + "learning_rate": 1.501918617901419e-06, + "loss": 0.66435552, + "num_input_tokens_seen": 212365750, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12268066, + "step": 9858, + "time_per_iteration": 2.522404670715332 + }, + { + "auxiliary_loss_clip": 0.01120578, + "auxiliary_loss_mlp": 0.01027805, + "balance_loss_clip": 1.04669046, + "balance_loss_mlp": 1.01550841, + "epoch": 0.5927551480535097, + "flos": 28033916192640.0, + "grad_norm": 4.595707774698071, + "language_loss": 0.77210081, + "learning_rate": 1.501541436426501e-06, + "loss": 0.79358459, + "num_input_tokens_seen": 212385300, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.12298584, + "step": 9859, + "time_per_iteration": 2.557234287261963 + }, + { + "auxiliary_loss_clip": 0.01123089, + "auxiliary_loss_mlp": 0.0103168, + "balance_loss_clip": 1.04533696, + "balance_loss_mlp": 1.01895428, + "epoch": 0.5928152713061776, + "flos": 21798675221760.0, + "grad_norm": 2.12184607505423, + "language_loss": 0.75210863, + "learning_rate": 1.5011642738531818e-06, + "loss": 0.77365637, + "num_input_tokens_seen": 212402140, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.12738037, + "step": 9860, + "time_per_iteration": 2.492539167404175 + }, + { + "auxiliary_loss_clip": 0.01122688, + "auxiliary_loss_mlp": 0.01036725, + "balance_loss_clip": 1.04718113, + "balance_loss_mlp": 1.02526343, + "epoch": 0.5928753945588456, + "flos": 24316839118080.0, + "grad_norm": 2.2258359937919345, + "language_loss": 0.76142144, + "learning_rate": 1.500787130195763e-06, + "loss": 0.78301555, + "num_input_tokens_seen": 212421790, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.11468506, + "step": 9861, + "time_per_iteration": 2.5096683502197266 + }, + { + "auxiliary_loss_clip": 0.01116326, + "auxiliary_loss_mlp": 0.01027155, + "balance_loss_clip": 1.04237747, + "balance_loss_mlp": 1.01600909, + "epoch": 0.5929355178115137, + "flos": 26464619923200.0, + "grad_norm": 1.503721930936593, + "language_loss": 0.70574838, + "learning_rate": 1.5004100054685465e-06, + "loss": 0.72718316, + "num_input_tokens_seen": 212442115, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.11151123, + "step": 9862, + "time_per_iteration": 2.498541831970215 + }, + { + "auxiliary_loss_clip": 0.01122394, + "auxiliary_loss_mlp": 0.01031211, + "balance_loss_clip": 1.04631376, + "balance_loss_mlp": 1.01901603, + "epoch": 0.5929956410641816, + "flos": 24965995662720.0, + "grad_norm": 1.7387819738086658, + "language_loss": 0.77740693, + "learning_rate": 1.500032899685832e-06, + "loss": 0.79894298, + "num_input_tokens_seen": 212459535, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.12194824, + "step": 9863, + "time_per_iteration": 2.5048635005950928 + }, + { + "auxiliary_loss_clip": 0.0112186, + "auxiliary_loss_mlp": 0.0103778, + "balance_loss_clip": 1.04696465, + "balance_loss_mlp": 1.02357602, + "epoch": 0.5930557643168496, + "flos": 26208325405440.0, + "grad_norm": 1.8152469150318804, + "language_loss": 0.70446748, + "learning_rate": 1.499655812861921e-06, + "loss": 0.72606385, + "num_input_tokens_seen": 212479385, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.14196777, + "step": 9864, + "time_per_iteration": 2.5469133853912354 + }, + { + "auxiliary_loss_clip": 0.01117602, + "auxiliary_loss_mlp": 0.01034753, + "balance_loss_clip": 1.04110432, + "balance_loss_mlp": 1.02115154, + "epoch": 0.5931158875695175, + "flos": 27854937699840.0, + "grad_norm": 1.5284302660296671, + "language_loss": 0.67459196, + "learning_rate": 1.4992787450111112e-06, + "loss": 0.69611549, + "num_input_tokens_seen": 212500060, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.13592529, + "step": 9865, + "time_per_iteration": 3.99873685836792 + }, + { + "auxiliary_loss_clip": 0.01122028, + "auxiliary_loss_mlp": 0.01033954, + "balance_loss_clip": 1.04503226, + "balance_loss_mlp": 1.02118039, + "epoch": 0.5931760108221855, + "flos": 15413650536960.0, + "grad_norm": 2.0087029624662294, + "language_loss": 0.78077382, + "learning_rate": 1.4989016961477015e-06, + "loss": 0.80233359, + "num_input_tokens_seen": 212518590, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.12780762, + "step": 9866, + "time_per_iteration": 2.449985980987549 + }, + { + "auxiliary_loss_clip": 0.01125831, + "auxiliary_loss_mlp": 0.01030517, + "balance_loss_clip": 1.04857635, + "balance_loss_mlp": 1.01852417, + "epoch": 0.5932361340748534, + "flos": 30188520581760.0, + "grad_norm": 2.507813076750523, + "language_loss": 0.723212, + "learning_rate": 1.4985246662859903e-06, + "loss": 0.74477541, + "num_input_tokens_seen": 212538190, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.11999512, + "step": 9867, + "time_per_iteration": 2.5636093616485596 + }, + { + "auxiliary_loss_clip": 0.01120334, + "auxiliary_loss_mlp": 0.01028745, + "balance_loss_clip": 1.04754114, + "balance_loss_mlp": 1.0155369, + "epoch": 0.5932962573275214, + "flos": 20157557708160.0, + "grad_norm": 1.5073308687375975, + "language_loss": 0.66559267, + "learning_rate": 1.4981476554402732e-06, + "loss": 0.68708348, + "num_input_tokens_seen": 212557820, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.13195801, + "step": 9868, + "time_per_iteration": 2.48244309425354 + }, + { + "auxiliary_loss_clip": 0.01117014, + "auxiliary_loss_mlp": 0.01032585, + "balance_loss_clip": 1.04211116, + "balance_loss_mlp": 1.0188942, + "epoch": 0.5933563805801894, + "flos": 25445906300160.0, + "grad_norm": 1.5334512348987959, + "language_loss": 0.7568258, + "learning_rate": 1.4977706636248478e-06, + "loss": 0.77832186, + "num_input_tokens_seen": 212577645, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.13696289, + "step": 9869, + "time_per_iteration": 2.5593161582946777 + }, + { + "auxiliary_loss_clip": 0.01129379, + "auxiliary_loss_mlp": 0.01036102, + "balance_loss_clip": 1.05233622, + "balance_loss_mlp": 1.0230186, + "epoch": 0.5934165038328574, + "flos": 59995740337920.0, + "grad_norm": 1.7088263992800163, + "language_loss": 0.74197853, + "learning_rate": 1.4973936908540091e-06, + "loss": 0.76363337, + "num_input_tokens_seen": 212603430, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.13079834, + "step": 9870, + "time_per_iteration": 2.8020284175872803 + }, + { + "auxiliary_loss_clip": 0.01124281, + "auxiliary_loss_mlp": 0.01028015, + "balance_loss_clip": 1.04766607, + "balance_loss_mlp": 1.01565254, + "epoch": 0.5934766270855253, + "flos": 24420548661120.0, + "grad_norm": 2.192567474398917, + "language_loss": 0.71577132, + "learning_rate": 1.4970167371420517e-06, + "loss": 0.73729432, + "num_input_tokens_seen": 212620730, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.1237793, + "step": 9871, + "time_per_iteration": 2.5127968788146973 + }, + { + "auxiliary_loss_clip": 0.0112482, + "auxiliary_loss_mlp": 0.01030795, + "balance_loss_clip": 1.04650164, + "balance_loss_mlp": 1.01782513, + "epoch": 0.5935367503381933, + "flos": 23513158264320.0, + "grad_norm": 2.193243731526478, + "language_loss": 0.74673957, + "learning_rate": 1.496639802503271e-06, + "loss": 0.76829576, + "num_input_tokens_seen": 212639745, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.12957764, + "step": 9872, + "time_per_iteration": 2.4943909645080566 + }, + { + "auxiliary_loss_clip": 0.01127963, + "auxiliary_loss_mlp": 0.01032793, + "balance_loss_clip": 1.05031788, + "balance_loss_mlp": 1.01957917, + "epoch": 0.5935968735908612, + "flos": 18948337326720.0, + "grad_norm": 2.3626564147493134, + "language_loss": 0.79216182, + "learning_rate": 1.4962628869519583e-06, + "loss": 0.8137694, + "num_input_tokens_seen": 212655915, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.13232422, + "step": 9873, + "time_per_iteration": 3.852592706680298 + }, + { + "auxiliary_loss_clip": 0.01124681, + "auxiliary_loss_mlp": 0.01035222, + "balance_loss_clip": 1.04826665, + "balance_loss_mlp": 1.02171564, + "epoch": 0.5936569968435292, + "flos": 25483433034240.0, + "grad_norm": 1.7527874650083362, + "language_loss": 0.84948558, + "learning_rate": 1.4958859905024078e-06, + "loss": 0.87108463, + "num_input_tokens_seen": 212676115, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.13500977, + "step": 9874, + "time_per_iteration": 2.5261435508728027 + }, + { + "auxiliary_loss_clip": 0.01053906, + "auxiliary_loss_mlp": 0.01004369, + "balance_loss_clip": 1.02648878, + "balance_loss_mlp": 1.00296688, + "epoch": 0.5937171200961973, + "flos": 66378361789440.0, + "grad_norm": 0.7594948398946907, + "language_loss": 0.60058534, + "learning_rate": 1.4955091131689115e-06, + "loss": 0.62116802, + "num_input_tokens_seen": 212737560, + "router_z_loss_clip": 0.27392578, + "router_z_loss_mlp": 0.01403809, + "step": 9875, + "time_per_iteration": 3.153069496154785 + }, + { + "auxiliary_loss_clip": 0.01127904, + "auxiliary_loss_mlp": 0.01032371, + "balance_loss_clip": 1.04932976, + "balance_loss_mlp": 1.01885867, + "epoch": 0.5937772433488652, + "flos": 14903467712640.0, + "grad_norm": 2.1539817231932408, + "language_loss": 0.77954602, + "learning_rate": 1.4951322549657594e-06, + "loss": 0.80114877, + "num_input_tokens_seen": 212755365, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.1350708, + "step": 9876, + "time_per_iteration": 2.450866460800171 + }, + { + "auxiliary_loss_clip": 0.01117188, + "auxiliary_loss_mlp": 0.01030838, + "balance_loss_clip": 1.04350305, + "balance_loss_mlp": 1.01778483, + "epoch": 0.5938373666015332, + "flos": 22561489376640.0, + "grad_norm": 1.4922259218537866, + "language_loss": 0.75680429, + "learning_rate": 1.494755415907243e-06, + "loss": 0.77828455, + "num_input_tokens_seen": 212773875, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.13067627, + "step": 9877, + "time_per_iteration": 2.4762983322143555 + }, + { + "auxiliary_loss_clip": 0.01116362, + "auxiliary_loss_mlp": 0.010309, + "balance_loss_clip": 1.04017007, + "balance_loss_mlp": 1.01771569, + "epoch": 0.5938974898542011, + "flos": 18440883936000.0, + "grad_norm": 2.242370433243268, + "language_loss": 0.80823684, + "learning_rate": 1.4943785960076522e-06, + "loss": 0.82970941, + "num_input_tokens_seen": 212790590, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.13189697, + "step": 9878, + "time_per_iteration": 2.579270601272583 + }, + { + "auxiliary_loss_clip": 0.01117481, + "auxiliary_loss_mlp": 0.01035986, + "balance_loss_clip": 1.042454, + "balance_loss_mlp": 1.02379048, + "epoch": 0.5939576131068691, + "flos": 45586728270720.0, + "grad_norm": 1.8133116747461737, + "language_loss": 0.71331799, + "learning_rate": 1.4940017952812754e-06, + "loss": 0.73485267, + "num_input_tokens_seen": 212812265, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12182617, + "step": 9879, + "time_per_iteration": 2.658696413040161 + }, + { + "auxiliary_loss_clip": 0.0112201, + "auxiliary_loss_mlp": 0.01037388, + "balance_loss_clip": 1.04677582, + "balance_loss_mlp": 1.02453184, + "epoch": 0.594017736359537, + "flos": 23587708942080.0, + "grad_norm": 1.793577978562507, + "language_loss": 0.57772779, + "learning_rate": 1.493625013742401e-06, + "loss": 0.59932172, + "num_input_tokens_seen": 212831915, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.12860107, + "step": 9880, + "time_per_iteration": 3.8987152576446533 + }, + { + "auxiliary_loss_clip": 0.01128652, + "auxiliary_loss_mlp": 0.01043144, + "balance_loss_clip": 1.04684854, + "balance_loss_mlp": 1.02927995, + "epoch": 0.594077859612205, + "flos": 29457235589760.0, + "grad_norm": 2.429171112162047, + "language_loss": 0.77387518, + "learning_rate": 1.4932482514053177e-06, + "loss": 0.79559314, + "num_input_tokens_seen": 212851350, + "router_z_loss_clip": 0.81738281, + "router_z_loss_mlp": 0.1385498, + "step": 9881, + "time_per_iteration": 2.5545685291290283 + }, + { + "auxiliary_loss_clip": 0.01128948, + "auxiliary_loss_mlp": 0.01027614, + "balance_loss_clip": 1.0513072, + "balance_loss_mlp": 1.01482236, + "epoch": 0.594137982864873, + "flos": 16800089644800.0, + "grad_norm": 4.720981691420864, + "language_loss": 0.82757205, + "learning_rate": 1.4928715082843112e-06, + "loss": 0.8491376, + "num_input_tokens_seen": 212867995, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.12805176, + "step": 9882, + "time_per_iteration": 2.4637482166290283 + }, + { + "auxiliary_loss_clip": 0.01123852, + "auxiliary_loss_mlp": 0.01035154, + "balance_loss_clip": 1.04975057, + "balance_loss_mlp": 1.02350092, + "epoch": 0.594198106117541, + "flos": 12750263953920.0, + "grad_norm": 2.4276200321085546, + "language_loss": 0.7925297, + "learning_rate": 1.492494784393667e-06, + "loss": 0.81411976, + "num_input_tokens_seen": 212885220, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11657715, + "step": 9883, + "time_per_iteration": 2.495879650115967 + }, + { + "auxiliary_loss_clip": 0.01122998, + "auxiliary_loss_mlp": 0.01033666, + "balance_loss_clip": 1.04535246, + "balance_loss_mlp": 1.01995707, + "epoch": 0.5942582293702089, + "flos": 20996538652800.0, + "grad_norm": 1.8237054924685265, + "language_loss": 0.74528468, + "learning_rate": 1.4921180797476725e-06, + "loss": 0.76685131, + "num_input_tokens_seen": 212903195, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.13726807, + "step": 9884, + "time_per_iteration": 2.512617349624634 + }, + { + "auxiliary_loss_clip": 0.01126894, + "auxiliary_loss_mlp": 0.01029222, + "balance_loss_clip": 1.05138254, + "balance_loss_mlp": 1.01702702, + "epoch": 0.5943183526228769, + "flos": 28291431772800.0, + "grad_norm": 2.6178980598938497, + "language_loss": 0.66539311, + "learning_rate": 1.4917413943606106e-06, + "loss": 0.68695426, + "num_input_tokens_seen": 212923340, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12194824, + "step": 9885, + "time_per_iteration": 4.003280162811279 + }, + { + "auxiliary_loss_clip": 0.01115699, + "auxiliary_loss_mlp": 0.01036093, + "balance_loss_clip": 1.04278243, + "balance_loss_mlp": 1.02319479, + "epoch": 0.5943784758755448, + "flos": 26614619118720.0, + "grad_norm": 3.3456756247309944, + "language_loss": 0.76903105, + "learning_rate": 1.4913647282467667e-06, + "loss": 0.79054898, + "num_input_tokens_seen": 212942755, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.12908936, + "step": 9886, + "time_per_iteration": 2.5588207244873047 + }, + { + "auxiliary_loss_clip": 0.01040507, + "auxiliary_loss_mlp": 0.01006607, + "balance_loss_clip": 1.01467407, + "balance_loss_mlp": 1.00499463, + "epoch": 0.5944385991282128, + "flos": 64190935347840.0, + "grad_norm": 0.8362178115921183, + "language_loss": 0.64528167, + "learning_rate": 1.490988081420423e-06, + "loss": 0.66575271, + "num_input_tokens_seen": 212999355, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.01611328, + "step": 9887, + "time_per_iteration": 2.969545841217041 + }, + { + "auxiliary_loss_clip": 0.01124274, + "auxiliary_loss_mlp": 0.01028069, + "balance_loss_clip": 1.04860067, + "balance_loss_mlp": 1.01551652, + "epoch": 0.5944987223808808, + "flos": 19571998193280.0, + "grad_norm": 2.3442021815204894, + "language_loss": 0.69212985, + "learning_rate": 1.4906114538958615e-06, + "loss": 0.71365333, + "num_input_tokens_seen": 213018570, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.12542725, + "step": 9888, + "time_per_iteration": 2.499249219894409 + }, + { + "auxiliary_loss_clip": 0.01124207, + "auxiliary_loss_mlp": 0.01031992, + "balance_loss_clip": 1.05013359, + "balance_loss_mlp": 1.018641, + "epoch": 0.5945588456335488, + "flos": 26177586341760.0, + "grad_norm": 1.7515655165241943, + "language_loss": 0.79389501, + "learning_rate": 1.490234845687366e-06, + "loss": 0.81545699, + "num_input_tokens_seen": 213037735, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.13348389, + "step": 9889, + "time_per_iteration": 2.587723731994629 + }, + { + "auxiliary_loss_clip": 0.01117867, + "auxiliary_loss_mlp": 0.01032921, + "balance_loss_clip": 1.04385364, + "balance_loss_mlp": 1.02130425, + "epoch": 0.5946189688862168, + "flos": 20446494710400.0, + "grad_norm": 1.543321753853075, + "language_loss": 0.71034145, + "learning_rate": 1.4898582568092154e-06, + "loss": 0.73184931, + "num_input_tokens_seen": 213057160, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11627197, + "step": 9890, + "time_per_iteration": 2.6174545288085938 + }, + { + "auxiliary_loss_clip": 0.01116426, + "auxiliary_loss_mlp": 0.01038063, + "balance_loss_clip": 1.04061925, + "balance_loss_mlp": 1.02475905, + "epoch": 0.5946790921388847, + "flos": 13437521850240.0, + "grad_norm": 1.9775468294891738, + "language_loss": 0.69539225, + "learning_rate": 1.489481687275691e-06, + "loss": 0.71693712, + "num_input_tokens_seen": 213073630, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.13317871, + "step": 9891, + "time_per_iteration": 2.4294240474700928 + }, + { + "auxiliary_loss_clip": 0.01113982, + "auxiliary_loss_mlp": 0.01033838, + "balance_loss_clip": 1.04047775, + "balance_loss_mlp": 1.02149332, + "epoch": 0.5947392153915527, + "flos": 20412272027520.0, + "grad_norm": 2.4085506312576603, + "language_loss": 0.53810799, + "learning_rate": 1.4891051371010726e-06, + "loss": 0.55958623, + "num_input_tokens_seen": 213092450, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.12341309, + "step": 9892, + "time_per_iteration": 2.510800361633301 + }, + { + "auxiliary_loss_clip": 0.01048913, + "auxiliary_loss_mlp": 0.01006832, + "balance_loss_clip": 1.02379012, + "balance_loss_mlp": 1.00531507, + "epoch": 0.5947993386442206, + "flos": 65619138994560.0, + "grad_norm": 0.6627655882623728, + "language_loss": 0.5449388, + "learning_rate": 1.4887286062996375e-06, + "loss": 0.56549633, + "num_input_tokens_seen": 213155465, + "router_z_loss_clip": 0.25146484, + "router_z_loss_mlp": 0.01515198, + "step": 9893, + "time_per_iteration": 3.161133289337158 + }, + { + "auxiliary_loss_clip": 0.01116069, + "auxiliary_loss_mlp": 0.01036446, + "balance_loss_clip": 1.04288065, + "balance_loss_mlp": 1.02459061, + "epoch": 0.5948594618968887, + "flos": 23183103168000.0, + "grad_norm": 1.6948349833616332, + "language_loss": 0.74544209, + "learning_rate": 1.4883520948856658e-06, + "loss": 0.7669673, + "num_input_tokens_seen": 213174875, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11853027, + "step": 9894, + "time_per_iteration": 2.4808077812194824 + }, + { + "auxiliary_loss_clip": 0.01118853, + "auxiliary_loss_mlp": 0.01032045, + "balance_loss_clip": 1.04304743, + "balance_loss_mlp": 1.01947439, + "epoch": 0.5949195851495566, + "flos": 13626771632640.0, + "grad_norm": 1.973696249770836, + "language_loss": 0.77685505, + "learning_rate": 1.487975602873434e-06, + "loss": 0.79836404, + "num_input_tokens_seen": 213192695, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.12573242, + "step": 9895, + "time_per_iteration": 2.432368278503418 + }, + { + "auxiliary_loss_clip": 0.01129296, + "auxiliary_loss_mlp": 0.0103575, + "balance_loss_clip": 1.04949665, + "balance_loss_mlp": 1.02255344, + "epoch": 0.5949797084022246, + "flos": 19751012599680.0, + "grad_norm": 1.5812529664990111, + "language_loss": 0.79017043, + "learning_rate": 1.4875991302772182e-06, + "loss": 0.81182086, + "num_input_tokens_seen": 213211195, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.13195801, + "step": 9896, + "time_per_iteration": 2.5204150676727295 + }, + { + "auxiliary_loss_clip": 0.01117842, + "auxiliary_loss_mlp": 0.01030222, + "balance_loss_clip": 1.04168141, + "balance_loss_mlp": 1.01753187, + "epoch": 0.5950398316548925, + "flos": 25773878407680.0, + "grad_norm": 1.5148740314436475, + "language_loss": 0.83415651, + "learning_rate": 1.4872226771112954e-06, + "loss": 0.85563719, + "num_input_tokens_seen": 213231975, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12689209, + "step": 9897, + "time_per_iteration": 2.535837411880493 + }, + { + "auxiliary_loss_clip": 0.01123458, + "auxiliary_loss_mlp": 0.0103198, + "balance_loss_clip": 1.04783905, + "balance_loss_mlp": 1.01947522, + "epoch": 0.5950999549075605, + "flos": 23039029716480.0, + "grad_norm": 1.960686783546354, + "language_loss": 0.71029323, + "learning_rate": 1.486846243389939e-06, + "loss": 0.73184764, + "num_input_tokens_seen": 213249760, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.12524414, + "step": 9898, + "time_per_iteration": 2.480782985687256 + }, + { + "auxiliary_loss_clip": 0.01124528, + "auxiliary_loss_mlp": 0.01042213, + "balance_loss_clip": 1.04508257, + "balance_loss_mlp": 1.02591753, + "epoch": 0.5951600781602284, + "flos": 32446367637120.0, + "grad_norm": 2.4304779044477254, + "language_loss": 0.64049894, + "learning_rate": 1.4864698291274251e-06, + "loss": 0.66216636, + "num_input_tokens_seen": 213269890, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.16296387, + "step": 9899, + "time_per_iteration": 2.5349700450897217 + }, + { + "auxiliary_loss_clip": 0.01119639, + "auxiliary_loss_mlp": 0.01028661, + "balance_loss_clip": 1.04594922, + "balance_loss_mlp": 1.01750898, + "epoch": 0.5952202014128964, + "flos": 23800874204160.0, + "grad_norm": 1.6608513374673677, + "language_loss": 0.72109133, + "learning_rate": 1.4860934343380267e-06, + "loss": 0.74257427, + "num_input_tokens_seen": 213289400, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11157227, + "step": 9900, + "time_per_iteration": 2.5876407623291016 + }, + { + "auxiliary_loss_clip": 0.01118516, + "auxiliary_loss_mlp": 0.01029515, + "balance_loss_clip": 1.04615581, + "balance_loss_mlp": 1.01695633, + "epoch": 0.5952803246655644, + "flos": 22492182084480.0, + "grad_norm": 1.7304513421312335, + "language_loss": 0.84212619, + "learning_rate": 1.4857170590360169e-06, + "loss": 0.86360645, + "num_input_tokens_seen": 213308040, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.12561035, + "step": 9901, + "time_per_iteration": 2.468116521835327 + }, + { + "auxiliary_loss_clip": 0.01040116, + "auxiliary_loss_mlp": 0.01002138, + "balance_loss_clip": 1.01422668, + "balance_loss_mlp": 1.00075221, + "epoch": 0.5953404479182324, + "flos": 51234688851840.0, + "grad_norm": 0.796628642852917, + "language_loss": 0.58249074, + "learning_rate": 1.4853407032356674e-06, + "loss": 0.60291326, + "num_input_tokens_seen": 213358585, + "router_z_loss_clip": 0.25878906, + "router_z_loss_mlp": 0.01387024, + "step": 9902, + "time_per_iteration": 3.031348705291748 + }, + { + "auxiliary_loss_clip": 0.01126873, + "auxiliary_loss_mlp": 0.01035097, + "balance_loss_clip": 1.05121434, + "balance_loss_mlp": 1.02219868, + "epoch": 0.5954005711709004, + "flos": 23112682554240.0, + "grad_norm": 1.7212075149692594, + "language_loss": 0.77243936, + "learning_rate": 1.4849643669512503e-06, + "loss": 0.79405904, + "num_input_tokens_seen": 213379585, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12890625, + "step": 9903, + "time_per_iteration": 2.5227065086364746 + }, + { + "auxiliary_loss_clip": 0.01115338, + "auxiliary_loss_mlp": 0.01038794, + "balance_loss_clip": 1.04031944, + "balance_loss_mlp": 1.02643824, + "epoch": 0.5954606944235683, + "flos": 35954732736000.0, + "grad_norm": 1.7748236898659804, + "language_loss": 0.7807802, + "learning_rate": 1.4845880501970362e-06, + "loss": 0.80232155, + "num_input_tokens_seen": 213401465, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12353516, + "step": 9904, + "time_per_iteration": 2.674957513809204 + }, + { + "auxiliary_loss_clip": 0.01125771, + "auxiliary_loss_mlp": 0.01041096, + "balance_loss_clip": 1.04603529, + "balance_loss_mlp": 1.02748787, + "epoch": 0.5955208176762363, + "flos": 30443665864320.0, + "grad_norm": 1.450104954016037, + "language_loss": 0.72835886, + "learning_rate": 1.4842117529872942e-06, + "loss": 0.75002754, + "num_input_tokens_seen": 213422720, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.13623047, + "step": 9905, + "time_per_iteration": 2.5744917392730713 + }, + { + "auxiliary_loss_clip": 0.01126486, + "auxiliary_loss_mlp": 0.01039002, + "balance_loss_clip": 1.04759192, + "balance_loss_mlp": 1.02603245, + "epoch": 0.5955809409289042, + "flos": 17640112083840.0, + "grad_norm": 1.9291716046866092, + "language_loss": 0.69924664, + "learning_rate": 1.483835475336295e-06, + "loss": 0.72090149, + "num_input_tokens_seen": 213439480, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.12963867, + "step": 9906, + "time_per_iteration": 2.472910165786743 + }, + { + "auxiliary_loss_clip": 0.01121219, + "auxiliary_loss_mlp": 0.01031264, + "balance_loss_clip": 1.04617524, + "balance_loss_mlp": 1.01907504, + "epoch": 0.5956410641815723, + "flos": 24279887001600.0, + "grad_norm": 1.8686938087641312, + "language_loss": 0.75314909, + "learning_rate": 1.4834592172583057e-06, + "loss": 0.77467394, + "num_input_tokens_seen": 213458895, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.12194824, + "step": 9907, + "time_per_iteration": 2.503499746322632 + }, + { + "auxiliary_loss_clip": 0.01117375, + "auxiliary_loss_mlp": 0.0103446, + "balance_loss_clip": 1.04308498, + "balance_loss_mlp": 1.02239633, + "epoch": 0.5957011874342402, + "flos": 35734277013120.0, + "grad_norm": 1.5354627570705364, + "language_loss": 0.67173254, + "learning_rate": 1.483082978767595e-06, + "loss": 0.69325089, + "num_input_tokens_seen": 213481730, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12072754, + "step": 9908, + "time_per_iteration": 4.004208564758301 + }, + { + "auxiliary_loss_clip": 0.01121097, + "auxiliary_loss_mlp": 0.01027877, + "balance_loss_clip": 1.04833138, + "balance_loss_mlp": 1.01606297, + "epoch": 0.5957613106869082, + "flos": 21245004005760.0, + "grad_norm": 1.9888801933130105, + "language_loss": 0.76171505, + "learning_rate": 1.4827067598784298e-06, + "loss": 0.78320479, + "num_input_tokens_seen": 213497225, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.1182251, + "step": 9909, + "time_per_iteration": 2.5736286640167236 + }, + { + "auxiliary_loss_clip": 0.01045422, + "auxiliary_loss_mlp": 0.01000961, + "balance_loss_clip": 1.01935077, + "balance_loss_mlp": 0.99952358, + "epoch": 0.5958214339395761, + "flos": 65940969876480.0, + "grad_norm": 0.926822303296698, + "language_loss": 0.73434216, + "learning_rate": 1.4823305606050753e-06, + "loss": 0.75480604, + "num_input_tokens_seen": 213556890, + "router_z_loss_clip": 0.26123047, + "router_z_loss_mlp": 0.01437378, + "step": 9910, + "time_per_iteration": 3.1571059226989746 + }, + { + "auxiliary_loss_clip": 0.01122108, + "auxiliary_loss_mlp": 0.01033564, + "balance_loss_clip": 1.04473245, + "balance_loss_mlp": 1.02022433, + "epoch": 0.5958815571922441, + "flos": 23218690567680.0, + "grad_norm": 1.8539415151443426, + "language_loss": 0.69404823, + "learning_rate": 1.481954380961799e-06, + "loss": 0.71560496, + "num_input_tokens_seen": 213575800, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.13330078, + "step": 9911, + "time_per_iteration": 2.6010913848876953 + }, + { + "auxiliary_loss_clip": 0.01127835, + "auxiliary_loss_mlp": 0.01037915, + "balance_loss_clip": 1.0489074, + "balance_loss_mlp": 1.02331221, + "epoch": 0.595941680444912, + "flos": 16538623568640.0, + "grad_norm": 2.069808096107547, + "language_loss": 0.65804762, + "learning_rate": 1.4815782209628631e-06, + "loss": 0.67970514, + "num_input_tokens_seen": 213592740, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.14593506, + "step": 9912, + "time_per_iteration": 2.4363386631011963 + }, + { + "auxiliary_loss_clip": 0.01119061, + "auxiliary_loss_mlp": 0.0104198, + "balance_loss_clip": 1.04418862, + "balance_loss_mlp": 1.02884269, + "epoch": 0.59600180369758, + "flos": 27818883423360.0, + "grad_norm": 1.928798034630625, + "language_loss": 0.73456001, + "learning_rate": 1.4812020806225337e-06, + "loss": 0.75617045, + "num_input_tokens_seen": 213611970, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.13146973, + "step": 9913, + "time_per_iteration": 2.572793483734131 + }, + { + "auxiliary_loss_clip": 0.01124104, + "auxiliary_loss_mlp": 0.01028185, + "balance_loss_clip": 1.04442477, + "balance_loss_mlp": 1.01587081, + "epoch": 0.596061926950248, + "flos": 29491566013440.0, + "grad_norm": 1.8735535898430464, + "language_loss": 0.79891646, + "learning_rate": 1.4808259599550738e-06, + "loss": 0.82043934, + "num_input_tokens_seen": 213632230, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.12322998, + "step": 9914, + "time_per_iteration": 2.51008939743042 + }, + { + "auxiliary_loss_clip": 0.01122513, + "auxiliary_loss_mlp": 0.01034897, + "balance_loss_clip": 1.04790723, + "balance_loss_mlp": 1.02250552, + "epoch": 0.596122050202916, + "flos": 16836790366080.0, + "grad_norm": 1.8029124194245139, + "language_loss": 0.67993534, + "learning_rate": 1.4804498589747448e-06, + "loss": 0.70150942, + "num_input_tokens_seen": 213649645, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.12384033, + "step": 9915, + "time_per_iteration": 2.52583909034729 + }, + { + "auxiliary_loss_clip": 0.01114604, + "auxiliary_loss_mlp": 0.0103186, + "balance_loss_clip": 1.03892469, + "balance_loss_mlp": 1.01935506, + "epoch": 0.596182173455584, + "flos": 20996646393600.0, + "grad_norm": 1.7920557829861565, + "language_loss": 0.7858035, + "learning_rate": 1.4800737776958095e-06, + "loss": 0.80726814, + "num_input_tokens_seen": 213668850, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12506104, + "step": 9916, + "time_per_iteration": 2.487076997756958 + }, + { + "auxiliary_loss_clip": 0.01123167, + "auxiliary_loss_mlp": 0.01032847, + "balance_loss_clip": 1.04555988, + "balance_loss_mlp": 1.01972246, + "epoch": 0.5962422967082519, + "flos": 16065680169600.0, + "grad_norm": 1.901628243176088, + "language_loss": 0.82801276, + "learning_rate": 1.4796977161325286e-06, + "loss": 0.84957302, + "num_input_tokens_seen": 213685695, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.13122559, + "step": 9917, + "time_per_iteration": 3.866901397705078 + }, + { + "auxiliary_loss_clip": 0.01126956, + "auxiliary_loss_mlp": 0.01036407, + "balance_loss_clip": 1.05064726, + "balance_loss_mlp": 1.02430117, + "epoch": 0.5963024199609199, + "flos": 12166966995840.0, + "grad_norm": 1.8231393737948263, + "language_loss": 0.77591401, + "learning_rate": 1.4793216742991625e-06, + "loss": 0.79754764, + "num_input_tokens_seen": 213703515, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.12103271, + "step": 9918, + "time_per_iteration": 2.4822800159454346 + }, + { + "auxiliary_loss_clip": 0.01128224, + "auxiliary_loss_mlp": 0.01037571, + "balance_loss_clip": 1.05079246, + "balance_loss_mlp": 1.02458334, + "epoch": 0.5963625432135878, + "flos": 28074280101120.0, + "grad_norm": 1.5074509112037644, + "language_loss": 0.7918756, + "learning_rate": 1.4789456522099707e-06, + "loss": 0.8135336, + "num_input_tokens_seen": 213724170, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.12988281, + "step": 9919, + "time_per_iteration": 2.5096542835235596 + }, + { + "auxiliary_loss_clip": 0.01125214, + "auxiliary_loss_mlp": 0.01036152, + "balance_loss_clip": 1.04861951, + "balance_loss_mlp": 1.02293205, + "epoch": 0.5964226664662559, + "flos": 19860324664320.0, + "grad_norm": 2.0046975757515986, + "language_loss": 0.77886188, + "learning_rate": 1.4785696498792122e-06, + "loss": 0.80047548, + "num_input_tokens_seen": 213740620, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.13226318, + "step": 9920, + "time_per_iteration": 2.5906147956848145 + }, + { + "auxiliary_loss_clip": 0.01128035, + "auxiliary_loss_mlp": 0.01031909, + "balance_loss_clip": 1.05090404, + "balance_loss_mlp": 1.01966047, + "epoch": 0.5964827897189238, + "flos": 12932618325120.0, + "grad_norm": 2.7686313884303217, + "language_loss": 0.82678157, + "learning_rate": 1.4781936673211446e-06, + "loss": 0.84838104, + "num_input_tokens_seen": 213755390, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.12249756, + "step": 9921, + "time_per_iteration": 2.53135347366333 + }, + { + "auxiliary_loss_clip": 0.0112784, + "auxiliary_loss_mlp": 0.01029894, + "balance_loss_clip": 1.05202413, + "balance_loss_mlp": 1.01697767, + "epoch": 0.5965429129715918, + "flos": 18150797698560.0, + "grad_norm": 1.9626055148175185, + "language_loss": 0.80635375, + "learning_rate": 1.4778177045500252e-06, + "loss": 0.82793105, + "num_input_tokens_seen": 213773225, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12921143, + "step": 9922, + "time_per_iteration": 2.5117428302764893 + }, + { + "auxiliary_loss_clip": 0.01124119, + "auxiliary_loss_mlp": 0.01029829, + "balance_loss_clip": 1.04859149, + "balance_loss_mlp": 1.01758659, + "epoch": 0.5966030362242597, + "flos": 21763231476480.0, + "grad_norm": 1.8486741629736845, + "language_loss": 0.76569909, + "learning_rate": 1.477441761580111e-06, + "loss": 0.78723854, + "num_input_tokens_seen": 213791860, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12231445, + "step": 9923, + "time_per_iteration": 4.045170068740845 + }, + { + "auxiliary_loss_clip": 0.01122549, + "auxiliary_loss_mlp": 0.01033998, + "balance_loss_clip": 1.04429078, + "balance_loss_mlp": 1.01965106, + "epoch": 0.5966631594769277, + "flos": 18807208790400.0, + "grad_norm": 1.817410314154555, + "language_loss": 0.75879467, + "learning_rate": 1.4770658384256573e-06, + "loss": 0.7803601, + "num_input_tokens_seen": 213809455, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.14343262, + "step": 9924, + "time_per_iteration": 2.4806272983551025 + }, + { + "auxiliary_loss_clip": 0.01122127, + "auxiliary_loss_mlp": 0.01028797, + "balance_loss_clip": 1.04881334, + "balance_loss_mlp": 1.01646471, + "epoch": 0.5967232827295956, + "flos": 14064163545600.0, + "grad_norm": 1.734861348987026, + "language_loss": 0.66243541, + "learning_rate": 1.4766899351009204e-06, + "loss": 0.68394464, + "num_input_tokens_seen": 213826615, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.12316895, + "step": 9925, + "time_per_iteration": 2.455192804336548 + }, + { + "auxiliary_loss_clip": 0.01119368, + "auxiliary_loss_mlp": 0.0103364, + "balance_loss_clip": 1.04718995, + "balance_loss_mlp": 1.02128434, + "epoch": 0.5967834059822636, + "flos": 17238235743360.0, + "grad_norm": 2.0436357420829325, + "language_loss": 0.71847481, + "learning_rate": 1.4763140516201528e-06, + "loss": 0.7400049, + "num_input_tokens_seen": 213844495, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.12347412, + "step": 9926, + "time_per_iteration": 2.4605607986450195 + }, + { + "auxiliary_loss_clip": 0.0113166, + "auxiliary_loss_mlp": 0.01032062, + "balance_loss_clip": 1.05359507, + "balance_loss_mlp": 1.01904488, + "epoch": 0.5968435292349316, + "flos": 42520244284800.0, + "grad_norm": 1.8804150752317774, + "language_loss": 0.70332146, + "learning_rate": 1.4759381879976088e-06, + "loss": 0.72495866, + "num_input_tokens_seen": 213869125, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.13012695, + "step": 9927, + "time_per_iteration": 2.6741809844970703 + }, + { + "auxiliary_loss_clip": 0.01131593, + "auxiliary_loss_mlp": 0.01030025, + "balance_loss_clip": 1.05061984, + "balance_loss_mlp": 1.01666188, + "epoch": 0.5969036524875996, + "flos": 37630898945280.0, + "grad_norm": 1.8957837167576532, + "language_loss": 0.64131963, + "learning_rate": 1.4755623442475415e-06, + "loss": 0.66293585, + "num_input_tokens_seen": 213891115, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.13372803, + "step": 9928, + "time_per_iteration": 2.6229965686798096 + }, + { + "auxiliary_loss_clip": 0.01121138, + "auxiliary_loss_mlp": 0.01034134, + "balance_loss_clip": 1.04668283, + "balance_loss_mlp": 1.02193308, + "epoch": 0.5969637757402676, + "flos": 23148377694720.0, + "grad_norm": 1.5353568494834071, + "language_loss": 0.69260639, + "learning_rate": 1.4751865203842022e-06, + "loss": 0.71415913, + "num_input_tokens_seen": 213911925, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.12188721, + "step": 9929, + "time_per_iteration": 3.9351181983947754 + }, + { + "auxiliary_loss_clip": 0.01121531, + "auxiliary_loss_mlp": 0.01034303, + "balance_loss_clip": 1.05009544, + "balance_loss_mlp": 1.02298999, + "epoch": 0.5970238989929355, + "flos": 24020934877440.0, + "grad_norm": 2.0790371597136024, + "language_loss": 0.76835382, + "learning_rate": 1.4748107164218431e-06, + "loss": 0.7899121, + "num_input_tokens_seen": 213930715, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11309814, + "step": 9930, + "time_per_iteration": 2.450035572052002 + }, + { + "auxiliary_loss_clip": 0.01132202, + "auxiliary_loss_mlp": 0.01033576, + "balance_loss_clip": 1.0538274, + "balance_loss_mlp": 1.01962245, + "epoch": 0.5970840222456035, + "flos": 19426883247360.0, + "grad_norm": 1.667963676999191, + "language_loss": 0.68776852, + "learning_rate": 1.4744349323747146e-06, + "loss": 0.70942628, + "num_input_tokens_seen": 213950015, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.13970947, + "step": 9931, + "time_per_iteration": 2.5549216270446777 + }, + { + "auxiliary_loss_clip": 0.01049078, + "auxiliary_loss_mlp": 0.01005609, + "balance_loss_clip": 1.02264297, + "balance_loss_mlp": 1.00411892, + "epoch": 0.5971441454982714, + "flos": 62976615235200.0, + "grad_norm": 0.8581263740069831, + "language_loss": 0.64261198, + "learning_rate": 1.474059168257065e-06, + "loss": 0.66315889, + "num_input_tokens_seen": 214003330, + "router_z_loss_clip": 0.26513672, + "router_z_loss_mlp": 0.01490784, + "step": 9932, + "time_per_iteration": 3.0191891193389893 + }, + { + "auxiliary_loss_clip": 0.01119767, + "auxiliary_loss_mlp": 0.01030326, + "balance_loss_clip": 1.04535341, + "balance_loss_mlp": 1.01706398, + "epoch": 0.5972042687509395, + "flos": 20266223328000.0, + "grad_norm": 2.112761631355607, + "language_loss": 0.74492359, + "learning_rate": 1.4736834240831454e-06, + "loss": 0.76642448, + "num_input_tokens_seen": 214021680, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.13250732, + "step": 9933, + "time_per_iteration": 2.5057153701782227 + }, + { + "auxiliary_loss_clip": 0.01055518, + "auxiliary_loss_mlp": 0.0100215, + "balance_loss_clip": 1.03060627, + "balance_loss_mlp": 1.00078094, + "epoch": 0.5972643920036074, + "flos": 71652383832960.0, + "grad_norm": 0.6596535351151525, + "language_loss": 0.52011597, + "learning_rate": 1.473307699867203e-06, + "loss": 0.54069269, + "num_input_tokens_seen": 214090265, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.01368713, + "step": 9934, + "time_per_iteration": 3.1775074005126953 + }, + { + "auxiliary_loss_clip": 0.01050565, + "auxiliary_loss_mlp": 0.01006704, + "balance_loss_clip": 1.02458286, + "balance_loss_mlp": 1.00525403, + "epoch": 0.5973245152562754, + "flos": 56892702263040.0, + "grad_norm": 0.8288220027518215, + "language_loss": 0.54204863, + "learning_rate": 1.4729319956234849e-06, + "loss": 0.5626213, + "num_input_tokens_seen": 214146375, + "router_z_loss_clip": 0.25927734, + "router_z_loss_mlp": 0.01448059, + "step": 9935, + "time_per_iteration": 3.047811508178711 + }, + { + "auxiliary_loss_clip": 0.01122465, + "auxiliary_loss_mlp": 0.010323, + "balance_loss_clip": 1.04767942, + "balance_loss_mlp": 1.01949668, + "epoch": 0.5973846385089433, + "flos": 24164361884160.0, + "grad_norm": 1.6704664481458624, + "language_loss": 0.65903729, + "learning_rate": 1.4725563113662394e-06, + "loss": 0.68058491, + "num_input_tokens_seen": 214165340, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.12811279, + "step": 9936, + "time_per_iteration": 2.5350677967071533 + }, + { + "auxiliary_loss_clip": 0.01127067, + "auxiliary_loss_mlp": 0.01036045, + "balance_loss_clip": 1.05052042, + "balance_loss_mlp": 1.02398145, + "epoch": 0.5974447617616113, + "flos": 17670599752320.0, + "grad_norm": 2.054417890971753, + "language_loss": 0.67865986, + "learning_rate": 1.4721806471097103e-06, + "loss": 0.70029104, + "num_input_tokens_seen": 214181360, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.1206665, + "step": 9937, + "time_per_iteration": 2.4393558502197266 + }, + { + "auxiliary_loss_clip": 0.0113581, + "auxiliary_loss_mlp": 0.01033802, + "balance_loss_clip": 1.05762863, + "balance_loss_mlp": 1.01999116, + "epoch": 0.5975048850142792, + "flos": 22892514140160.0, + "grad_norm": 10.67122628677303, + "language_loss": 0.77321345, + "learning_rate": 1.4718050028681442e-06, + "loss": 0.7949096, + "num_input_tokens_seen": 214198525, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.13830566, + "step": 9938, + "time_per_iteration": 2.631931781768799 + }, + { + "auxiliary_loss_clip": 0.0112945, + "auxiliary_loss_mlp": 0.01033021, + "balance_loss_clip": 1.04970598, + "balance_loss_mlp": 1.01997995, + "epoch": 0.5975650082669473, + "flos": 24353108876160.0, + "grad_norm": 1.817917963864522, + "language_loss": 0.76180547, + "learning_rate": 1.4714293786557855e-06, + "loss": 0.78343016, + "num_input_tokens_seen": 214218710, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.13031006, + "step": 9939, + "time_per_iteration": 2.5425758361816406 + }, + { + "auxiliary_loss_clip": 0.0113001, + "auxiliary_loss_mlp": 0.01030308, + "balance_loss_clip": 1.05016053, + "balance_loss_mlp": 1.01607454, + "epoch": 0.5976251315196152, + "flos": 20923352691840.0, + "grad_norm": 2.097205167688779, + "language_loss": 0.69205821, + "learning_rate": 1.471053774486878e-06, + "loss": 0.71366131, + "num_input_tokens_seen": 214237800, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.14239502, + "step": 9940, + "time_per_iteration": 2.527844190597534 + }, + { + "auxiliary_loss_clip": 0.01118965, + "auxiliary_loss_mlp": 0.01033201, + "balance_loss_clip": 1.04624641, + "balance_loss_mlp": 1.0215956, + "epoch": 0.5976852547722832, + "flos": 35844594658560.0, + "grad_norm": 1.3236936510690507, + "language_loss": 0.70088208, + "learning_rate": 1.470678190375664e-06, + "loss": 0.72240376, + "num_input_tokens_seen": 214260355, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11608887, + "step": 9941, + "time_per_iteration": 2.6084518432617188 + }, + { + "auxiliary_loss_clip": 0.01126123, + "auxiliary_loss_mlp": 0.01033607, + "balance_loss_clip": 1.05145907, + "balance_loss_mlp": 1.02061939, + "epoch": 0.5977453780249512, + "flos": 12855948744960.0, + "grad_norm": 1.9554267885665768, + "language_loss": 0.775545, + "learning_rate": 1.470302626336386e-06, + "loss": 0.79714227, + "num_input_tokens_seen": 214277120, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.12969971, + "step": 9942, + "time_per_iteration": 2.596303939819336 + }, + { + "auxiliary_loss_clip": 0.01127708, + "auxiliary_loss_mlp": 0.01037936, + "balance_loss_clip": 1.04809642, + "balance_loss_mlp": 1.02475154, + "epoch": 0.5978055012776191, + "flos": 20959155573120.0, + "grad_norm": 1.845436485575461, + "language_loss": 0.75412023, + "learning_rate": 1.4699270823832857e-06, + "loss": 0.77577662, + "num_input_tokens_seen": 214295300, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.13183594, + "step": 9943, + "time_per_iteration": 2.4966275691986084 + }, + { + "auxiliary_loss_clip": 0.01125663, + "auxiliary_loss_mlp": 0.01027778, + "balance_loss_clip": 1.05105972, + "balance_loss_mlp": 1.01610088, + "epoch": 0.5978656245302871, + "flos": 34058003063040.0, + "grad_norm": 1.9031796432037562, + "language_loss": 0.62231195, + "learning_rate": 1.4695515585306032e-06, + "loss": 0.64384639, + "num_input_tokens_seen": 214317050, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11688232, + "step": 9944, + "time_per_iteration": 2.6332085132598877 + }, + { + "auxiliary_loss_clip": 0.01118914, + "auxiliary_loss_mlp": 0.01034097, + "balance_loss_clip": 1.04345536, + "balance_loss_mlp": 1.0210551, + "epoch": 0.597925747782955, + "flos": 37373275624320.0, + "grad_norm": 1.6818360779323642, + "language_loss": 0.72492129, + "learning_rate": 1.4691760547925795e-06, + "loss": 0.74645138, + "num_input_tokens_seen": 214337470, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.13049316, + "step": 9945, + "time_per_iteration": 2.6996889114379883 + }, + { + "auxiliary_loss_clip": 0.01118133, + "auxiliary_loss_mlp": 0.01033991, + "balance_loss_clip": 1.04322076, + "balance_loss_mlp": 1.02122998, + "epoch": 0.5979858710356231, + "flos": 25374803328000.0, + "grad_norm": 1.9759063978834874, + "language_loss": 0.67025578, + "learning_rate": 1.4688005711834522e-06, + "loss": 0.69177699, + "num_input_tokens_seen": 214357975, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12756348, + "step": 9946, + "time_per_iteration": 2.618741035461426 + }, + { + "auxiliary_loss_clip": 0.01121261, + "auxiliary_loss_mlp": 0.01040924, + "balance_loss_clip": 1.04342473, + "balance_loss_mlp": 1.02527809, + "epoch": 0.598045994288291, + "flos": 13698413308800.0, + "grad_norm": 1.8943284630321133, + "language_loss": 0.88484621, + "learning_rate": 1.468425107717461e-06, + "loss": 0.90646809, + "num_input_tokens_seen": 214374125, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.15625, + "step": 9947, + "time_per_iteration": 2.487306594848633 + }, + { + "auxiliary_loss_clip": 0.01117874, + "auxiliary_loss_mlp": 0.01038495, + "balance_loss_clip": 1.04567754, + "balance_loss_mlp": 1.02525055, + "epoch": 0.598106117540959, + "flos": 21981352815360.0, + "grad_norm": 1.934340406386108, + "language_loss": 0.72065258, + "learning_rate": 1.4680496644088432e-06, + "loss": 0.74221623, + "num_input_tokens_seen": 214393395, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.13238525, + "step": 9948, + "time_per_iteration": 2.5433409214019775 + }, + { + "auxiliary_loss_clip": 0.01121042, + "auxiliary_loss_mlp": 0.01036359, + "balance_loss_clip": 1.04514766, + "balance_loss_mlp": 1.02247739, + "epoch": 0.5981662407936269, + "flos": 20559362221440.0, + "grad_norm": 1.7478323815137695, + "language_loss": 0.89267194, + "learning_rate": 1.4676742412718347e-06, + "loss": 0.91424596, + "num_input_tokens_seen": 214411550, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.13885498, + "step": 9949, + "time_per_iteration": 2.5258829593658447 + }, + { + "auxiliary_loss_clip": 0.01126859, + "auxiliary_loss_mlp": 0.01029926, + "balance_loss_clip": 1.04913139, + "balance_loss_mlp": 1.01779091, + "epoch": 0.5982263640462949, + "flos": 14063840323200.0, + "grad_norm": 1.9068389948798419, + "language_loss": 0.70590484, + "learning_rate": 1.467298838320673e-06, + "loss": 0.72747272, + "num_input_tokens_seen": 214429780, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.12127686, + "step": 9950, + "time_per_iteration": 2.5664820671081543 + }, + { + "auxiliary_loss_clip": 0.01124389, + "auxiliary_loss_mlp": 0.01033189, + "balance_loss_clip": 1.04905736, + "balance_loss_mlp": 1.02057111, + "epoch": 0.5982864872989628, + "flos": 17707228646400.0, + "grad_norm": 1.6948818191193813, + "language_loss": 0.78184795, + "learning_rate": 1.4669234555695921e-06, + "loss": 0.80342376, + "num_input_tokens_seen": 214447775, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.1262207, + "step": 9951, + "time_per_iteration": 2.5022056102752686 + }, + { + "auxiliary_loss_clip": 0.01125399, + "auxiliary_loss_mlp": 0.0103893, + "balance_loss_clip": 1.0468961, + "balance_loss_mlp": 1.02561426, + "epoch": 0.5983466105516309, + "flos": 16764789553920.0, + "grad_norm": 1.4128598271597503, + "language_loss": 0.73739773, + "learning_rate": 1.4665480930328275e-06, + "loss": 0.75904101, + "num_input_tokens_seen": 214467245, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.13317871, + "step": 9952, + "time_per_iteration": 3.942561149597168 + }, + { + "auxiliary_loss_clip": 0.01125899, + "auxiliary_loss_mlp": 0.01034007, + "balance_loss_clip": 1.04544425, + "balance_loss_mlp": 1.01985097, + "epoch": 0.5984067338042988, + "flos": 20042714949120.0, + "grad_norm": 2.7274412237617693, + "language_loss": 0.78809333, + "learning_rate": 1.466172750724613e-06, + "loss": 0.80969238, + "num_input_tokens_seen": 214484385, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.14172363, + "step": 9953, + "time_per_iteration": 2.5066921710968018 + }, + { + "auxiliary_loss_clip": 0.01126295, + "auxiliary_loss_mlp": 0.01032277, + "balance_loss_clip": 1.05075538, + "balance_loss_mlp": 1.01993918, + "epoch": 0.5984668570569668, + "flos": 26319900026880.0, + "grad_norm": 1.8260450207232521, + "language_loss": 0.69721889, + "learning_rate": 1.4657974286591807e-06, + "loss": 0.7188046, + "num_input_tokens_seen": 214503465, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.12347412, + "step": 9954, + "time_per_iteration": 2.544367551803589 + }, + { + "auxiliary_loss_clip": 0.01132839, + "auxiliary_loss_mlp": 0.01032607, + "balance_loss_clip": 1.05456805, + "balance_loss_mlp": 1.02005458, + "epoch": 0.5985269803096348, + "flos": 20593728558720.0, + "grad_norm": 2.317103290274124, + "language_loss": 0.73173666, + "learning_rate": 1.4654221268507637e-06, + "loss": 0.75339115, + "num_input_tokens_seen": 214520725, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.12561035, + "step": 9955, + "time_per_iteration": 2.5249340534210205 + }, + { + "auxiliary_loss_clip": 0.01126062, + "auxiliary_loss_mlp": 0.01032926, + "balance_loss_clip": 1.04901862, + "balance_loss_mlp": 1.02009892, + "epoch": 0.5985871035623027, + "flos": 26865382942080.0, + "grad_norm": 1.7514827204351708, + "language_loss": 0.68389821, + "learning_rate": 1.4650468453135934e-06, + "loss": 0.70548809, + "num_input_tokens_seen": 214540675, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.1282959, + "step": 9956, + "time_per_iteration": 2.5066771507263184 + }, + { + "auxiliary_loss_clip": 0.01123429, + "auxiliary_loss_mlp": 0.01029693, + "balance_loss_clip": 1.04655349, + "balance_loss_mlp": 1.01713991, + "epoch": 0.5986472268149707, + "flos": 19609704495360.0, + "grad_norm": 1.9583421596887356, + "language_loss": 0.73644209, + "learning_rate": 1.4646715840618999e-06, + "loss": 0.75797337, + "num_input_tokens_seen": 214559910, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.12548828, + "step": 9957, + "time_per_iteration": 2.4740545749664307 + }, + { + "auxiliary_loss_clip": 0.01123528, + "auxiliary_loss_mlp": 0.01030867, + "balance_loss_clip": 1.05073488, + "balance_loss_mlp": 1.01927447, + "epoch": 0.5987073500676386, + "flos": 21794616984960.0, + "grad_norm": 1.7903312932421276, + "language_loss": 0.84522116, + "learning_rate": 1.4642963431099138e-06, + "loss": 0.86676514, + "num_input_tokens_seen": 214575960, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.1159668, + "step": 9958, + "time_per_iteration": 2.4407904148101807 + }, + { + "auxiliary_loss_clip": 0.01119376, + "auxiliary_loss_mlp": 0.01031927, + "balance_loss_clip": 1.04191434, + "balance_loss_mlp": 1.01891518, + "epoch": 0.5987674733203067, + "flos": 24314361079680.0, + "grad_norm": 1.8181479631463418, + "language_loss": 0.66513962, + "learning_rate": 1.463921122471864e-06, + "loss": 0.68665266, + "num_input_tokens_seen": 214594230, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.13018799, + "step": 9959, + "time_per_iteration": 2.533515691757202 + }, + { + "auxiliary_loss_clip": 0.01123655, + "auxiliary_loss_mlp": 0.01036308, + "balance_loss_clip": 1.04689217, + "balance_loss_mlp": 1.02237868, + "epoch": 0.5988275965729746, + "flos": 21320201128320.0, + "grad_norm": 1.6405222328322888, + "language_loss": 0.8365671, + "learning_rate": 1.4635459221619796e-06, + "loss": 0.85816669, + "num_input_tokens_seen": 214613130, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.13922119, + "step": 9960, + "time_per_iteration": 2.4553959369659424 + }, + { + "auxiliary_loss_clip": 0.01121907, + "auxiliary_loss_mlp": 0.01028287, + "balance_loss_clip": 1.04716682, + "balance_loss_mlp": 1.01583576, + "epoch": 0.5988877198256426, + "flos": 25118041933440.0, + "grad_norm": 1.5138661807902947, + "language_loss": 0.7921524, + "learning_rate": 1.4631707421944868e-06, + "loss": 0.8136543, + "num_input_tokens_seen": 214634470, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.12445068, + "step": 9961, + "time_per_iteration": 3.9253289699554443 + }, + { + "auxiliary_loss_clip": 0.01119088, + "auxiliary_loss_mlp": 0.01038447, + "balance_loss_clip": 1.04364848, + "balance_loss_mlp": 1.02461886, + "epoch": 0.5989478430783105, + "flos": 26429104350720.0, + "grad_norm": 1.6083020526263245, + "language_loss": 0.67341435, + "learning_rate": 1.4627955825836136e-06, + "loss": 0.69498968, + "num_input_tokens_seen": 214654030, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.13842773, + "step": 9962, + "time_per_iteration": 2.4791109561920166 + }, + { + "auxiliary_loss_clip": 0.0112853, + "auxiliary_loss_mlp": 0.01030268, + "balance_loss_clip": 1.05344594, + "balance_loss_mlp": 1.01784647, + "epoch": 0.5990079663309785, + "flos": 25778439434880.0, + "grad_norm": 1.375735510806645, + "language_loss": 0.74250323, + "learning_rate": 1.4624204433435857e-06, + "loss": 0.76409125, + "num_input_tokens_seen": 214676985, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.12420654, + "step": 9963, + "time_per_iteration": 2.5149624347686768 + }, + { + "auxiliary_loss_clip": 0.01116615, + "auxiliary_loss_mlp": 0.0103234, + "balance_loss_clip": 1.04315138, + "balance_loss_mlp": 1.01891065, + "epoch": 0.5990680895836464, + "flos": 36831779118720.0, + "grad_norm": 1.9124538775971653, + "language_loss": 0.68086588, + "learning_rate": 1.4620453244886281e-06, + "loss": 0.70235538, + "num_input_tokens_seen": 214700105, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.13427734, + "step": 9964, + "time_per_iteration": 2.5906429290771484 + }, + { + "auxiliary_loss_clip": 0.01127243, + "auxiliary_loss_mlp": 0.01026544, + "balance_loss_clip": 1.05498552, + "balance_loss_mlp": 1.01405132, + "epoch": 0.5991282128363145, + "flos": 24133550993280.0, + "grad_norm": 1.9369925155428265, + "language_loss": 0.76936644, + "learning_rate": 1.4616702260329662e-06, + "loss": 0.79090428, + "num_input_tokens_seen": 214717885, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.12493896, + "step": 9965, + "time_per_iteration": 2.495344877243042 + }, + { + "auxiliary_loss_clip": 0.01120267, + "auxiliary_loss_mlp": 0.01030338, + "balance_loss_clip": 1.04488587, + "balance_loss_mlp": 1.01792228, + "epoch": 0.5991883360889824, + "flos": 10304064956160.0, + "grad_norm": 1.8101677109602377, + "language_loss": 0.77126908, + "learning_rate": 1.4612951479908229e-06, + "loss": 0.79277515, + "num_input_tokens_seen": 214733680, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12420654, + "step": 9966, + "time_per_iteration": 2.4594931602478027 + }, + { + "auxiliary_loss_clip": 0.0111717, + "auxiliary_loss_mlp": 0.01029908, + "balance_loss_clip": 1.04263246, + "balance_loss_mlp": 1.01789188, + "epoch": 0.5992484593416504, + "flos": 23951196622080.0, + "grad_norm": 1.3701740627874563, + "language_loss": 0.73529673, + "learning_rate": 1.460920090376422e-06, + "loss": 0.75676751, + "num_input_tokens_seen": 214753285, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.12017822, + "step": 9967, + "time_per_iteration": 3.945477247238159 + }, + { + "auxiliary_loss_clip": 0.01126378, + "auxiliary_loss_mlp": 0.01037375, + "balance_loss_clip": 1.04684901, + "balance_loss_mlp": 1.02400553, + "epoch": 0.5993085825943184, + "flos": 11944105061760.0, + "grad_norm": 2.1382490622675103, + "language_loss": 0.68840688, + "learning_rate": 1.4605450532039847e-06, + "loss": 0.71004438, + "num_input_tokens_seen": 214767810, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.13378906, + "step": 9968, + "time_per_iteration": 2.474482297897339 + }, + { + "auxiliary_loss_clip": 0.01130347, + "auxiliary_loss_mlp": 0.01032084, + "balance_loss_clip": 1.05201352, + "balance_loss_mlp": 1.01787472, + "epoch": 0.5993687058469863, + "flos": 19026838500480.0, + "grad_norm": 1.5413946685658269, + "language_loss": 0.79537416, + "learning_rate": 1.4601700364877334e-06, + "loss": 0.81699848, + "num_input_tokens_seen": 214786040, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.14221191, + "step": 9969, + "time_per_iteration": 2.457545280456543 + }, + { + "auxiliary_loss_clip": 0.01123039, + "auxiliary_loss_mlp": 0.01030974, + "balance_loss_clip": 1.0447731, + "balance_loss_mlp": 1.01786709, + "epoch": 0.5994288290996543, + "flos": 14282967242880.0, + "grad_norm": 1.6945109890942756, + "language_loss": 0.81535715, + "learning_rate": 1.4597950402418889e-06, + "loss": 0.83689725, + "num_input_tokens_seen": 214803110, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.13104248, + "step": 9970, + "time_per_iteration": 2.4785189628601074 + }, + { + "auxiliary_loss_clip": 0.01125202, + "auxiliary_loss_mlp": 0.01037661, + "balance_loss_clip": 1.04654527, + "balance_loss_mlp": 1.02327228, + "epoch": 0.5994889523523222, + "flos": 19206643006080.0, + "grad_norm": 3.7019585683976044, + "language_loss": 0.62429261, + "learning_rate": 1.4594200644806697e-06, + "loss": 0.64592123, + "num_input_tokens_seen": 214819945, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.14385986, + "step": 9971, + "time_per_iteration": 2.4551141262054443 + }, + { + "auxiliary_loss_clip": 0.01118298, + "auxiliary_loss_mlp": 0.01028594, + "balance_loss_clip": 1.04725838, + "balance_loss_mlp": 1.0168457, + "epoch": 0.5995490756049903, + "flos": 28037040675840.0, + "grad_norm": 1.729625110840714, + "language_loss": 0.78981602, + "learning_rate": 1.4590451092182962e-06, + "loss": 0.81128502, + "num_input_tokens_seen": 214838810, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11761475, + "step": 9972, + "time_per_iteration": 2.54136323928833 + }, + { + "auxiliary_loss_clip": 0.01130512, + "auxiliary_loss_mlp": 0.01039338, + "balance_loss_clip": 1.0503546, + "balance_loss_mlp": 1.02576613, + "epoch": 0.5996091988576582, + "flos": 29052953038080.0, + "grad_norm": 2.151202541007394, + "language_loss": 0.76099467, + "learning_rate": 1.4586701744689864e-06, + "loss": 0.78269315, + "num_input_tokens_seen": 214857040, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.13562012, + "step": 9973, + "time_per_iteration": 3.953111171722412 + }, + { + "auxiliary_loss_clip": 0.01124069, + "auxiliary_loss_mlp": 0.01031393, + "balance_loss_clip": 1.04816854, + "balance_loss_mlp": 1.01884627, + "epoch": 0.5996693221103262, + "flos": 20813968800000.0, + "grad_norm": 2.959217253430628, + "language_loss": 0.65348506, + "learning_rate": 1.4582952602469578e-06, + "loss": 0.67503965, + "num_input_tokens_seen": 214873375, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.12554932, + "step": 9974, + "time_per_iteration": 2.4923605918884277 + }, + { + "auxiliary_loss_clip": 0.01128608, + "auxiliary_loss_mlp": 0.0103405, + "balance_loss_clip": 1.05307651, + "balance_loss_mlp": 1.02123523, + "epoch": 0.5997294453629941, + "flos": 23768914078080.0, + "grad_norm": 1.383027288387878, + "language_loss": 0.74711704, + "learning_rate": 1.457920366566428e-06, + "loss": 0.76874363, + "num_input_tokens_seen": 214893900, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.1282959, + "step": 9975, + "time_per_iteration": 2.5130269527435303 + }, + { + "auxiliary_loss_clip": 0.01124005, + "auxiliary_loss_mlp": 0.01030296, + "balance_loss_clip": 1.04926395, + "balance_loss_mlp": 1.0173552, + "epoch": 0.5997895686156621, + "flos": 20960017499520.0, + "grad_norm": 1.917355228903076, + "language_loss": 0.77377498, + "learning_rate": 1.457545493441611e-06, + "loss": 0.79531801, + "num_input_tokens_seen": 214912110, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.12939453, + "step": 9976, + "time_per_iteration": 2.5472426414489746 + }, + { + "auxiliary_loss_clip": 0.01116802, + "auxiliary_loss_mlp": 0.01031662, + "balance_loss_clip": 1.04307413, + "balance_loss_mlp": 1.01886463, + "epoch": 0.59984969186833, + "flos": 28365443746560.0, + "grad_norm": 2.557917434601563, + "language_loss": 0.74743146, + "learning_rate": 1.4571706408867237e-06, + "loss": 0.76891613, + "num_input_tokens_seen": 214930140, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.12792969, + "step": 9977, + "time_per_iteration": 2.530393123626709 + }, + { + "auxiliary_loss_clip": 0.01123482, + "auxiliary_loss_mlp": 0.01031943, + "balance_loss_clip": 1.04910791, + "balance_loss_mlp": 1.01925945, + "epoch": 0.5999098151209981, + "flos": 22565906749440.0, + "grad_norm": 1.6446800455483326, + "language_loss": 0.69101918, + "learning_rate": 1.4567958089159802e-06, + "loss": 0.71257341, + "num_input_tokens_seen": 214949200, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.12677002, + "step": 9978, + "time_per_iteration": 2.4995157718658447 + }, + { + "auxiliary_loss_clip": 0.01128977, + "auxiliary_loss_mlp": 0.01041582, + "balance_loss_clip": 1.04804015, + "balance_loss_mlp": 1.02801037, + "epoch": 0.599969938373666, + "flos": 18768712389120.0, + "grad_norm": 2.1694485603490343, + "language_loss": 0.81538045, + "learning_rate": 1.456420997543594e-06, + "loss": 0.83708608, + "num_input_tokens_seen": 214965775, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.13555908, + "step": 9979, + "time_per_iteration": 2.428860664367676 + }, + { + "auxiliary_loss_clip": 0.01120463, + "auxiliary_loss_mlp": 0.01032226, + "balance_loss_clip": 1.04843688, + "balance_loss_mlp": 1.0200963, + "epoch": 0.600030061626334, + "flos": 11327231865600.0, + "grad_norm": 2.577888251006312, + "language_loss": 0.70064914, + "learning_rate": 1.4560462067837782e-06, + "loss": 0.72217602, + "num_input_tokens_seen": 214982480, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.12121582, + "step": 9980, + "time_per_iteration": 2.4768240451812744 + }, + { + "auxiliary_loss_clip": 0.01126531, + "auxiliary_loss_mlp": 0.01032169, + "balance_loss_clip": 1.0483923, + "balance_loss_mlp": 1.01854289, + "epoch": 0.600090184879002, + "flos": 16578664254720.0, + "grad_norm": 3.1996710343038273, + "language_loss": 0.68194962, + "learning_rate": 1.4556714366507445e-06, + "loss": 0.70353663, + "num_input_tokens_seen": 214998110, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.13635254, + "step": 9981, + "time_per_iteration": 2.426844596862793 + }, + { + "auxiliary_loss_clip": 0.01120367, + "auxiliary_loss_mlp": 0.01033799, + "balance_loss_clip": 1.04737687, + "balance_loss_mlp": 1.02244401, + "epoch": 0.6001503081316699, + "flos": 23618627573760.0, + "grad_norm": 2.189273909842715, + "language_loss": 0.78672194, + "learning_rate": 1.4552966871587048e-06, + "loss": 0.8082636, + "num_input_tokens_seen": 215017995, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11346436, + "step": 9982, + "time_per_iteration": 2.5044503211975098 + }, + { + "auxiliary_loss_clip": 0.01120194, + "auxiliary_loss_mlp": 0.01032705, + "balance_loss_clip": 1.04690707, + "balance_loss_mlp": 1.02017581, + "epoch": 0.6002104313843379, + "flos": 20667668705280.0, + "grad_norm": 1.6749260241433437, + "language_loss": 0.73284358, + "learning_rate": 1.4549219583218686e-06, + "loss": 0.7543726, + "num_input_tokens_seen": 215038285, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.12536621, + "step": 9983, + "time_per_iteration": 2.466710329055786 + }, + { + "auxiliary_loss_clip": 0.01123916, + "auxiliary_loss_mlp": 0.01032433, + "balance_loss_clip": 1.0475992, + "balance_loss_mlp": 1.0198859, + "epoch": 0.6002705546370058, + "flos": 22455229968000.0, + "grad_norm": 2.0391207112524588, + "language_loss": 0.77270168, + "learning_rate": 1.454547250154447e-06, + "loss": 0.79426515, + "num_input_tokens_seen": 215057825, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.12536621, + "step": 9984, + "time_per_iteration": 2.4792697429656982 + }, + { + "auxiliary_loss_clip": 0.01117909, + "auxiliary_loss_mlp": 0.01040898, + "balance_loss_clip": 1.0427779, + "balance_loss_mlp": 1.02714086, + "epoch": 0.6003306778896739, + "flos": 25191982080000.0, + "grad_norm": 1.5608859423056274, + "language_loss": 0.83205235, + "learning_rate": 1.4541725626706485e-06, + "loss": 0.85364044, + "num_input_tokens_seen": 215077790, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.13757324, + "step": 9985, + "time_per_iteration": 2.6710703372955322 + }, + { + "auxiliary_loss_clip": 0.01119187, + "auxiliary_loss_mlp": 0.01037368, + "balance_loss_clip": 1.04408193, + "balance_loss_mlp": 1.02564347, + "epoch": 0.6003908011423418, + "flos": 26687733252480.0, + "grad_norm": 2.075935637613103, + "language_loss": 0.71270907, + "learning_rate": 1.4537978958846809e-06, + "loss": 0.73427463, + "num_input_tokens_seen": 215097650, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.11737061, + "step": 9986, + "time_per_iteration": 2.6362287998199463 + }, + { + "auxiliary_loss_clip": 0.01126678, + "auxiliary_loss_mlp": 0.01032371, + "balance_loss_clip": 1.05202532, + "balance_loss_mlp": 1.01943099, + "epoch": 0.6004509243950098, + "flos": 22565080736640.0, + "grad_norm": 1.4386554471942548, + "language_loss": 0.71661365, + "learning_rate": 1.4534232498107514e-06, + "loss": 0.73820412, + "num_input_tokens_seen": 215118235, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.12939453, + "step": 9987, + "time_per_iteration": 2.561424732208252 + }, + { + "auxiliary_loss_clip": 0.01116239, + "auxiliary_loss_mlp": 0.01034005, + "balance_loss_clip": 1.04133594, + "balance_loss_mlp": 1.02144063, + "epoch": 0.6005110476476777, + "flos": 19719303868800.0, + "grad_norm": 1.8919560092276657, + "language_loss": 0.84799349, + "learning_rate": 1.4530486244630673e-06, + "loss": 0.86949599, + "num_input_tokens_seen": 215136755, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12554932, + "step": 9988, + "time_per_iteration": 2.5679171085357666 + }, + { + "auxiliary_loss_clip": 0.01127754, + "auxiliary_loss_mlp": 0.01034203, + "balance_loss_clip": 1.0500257, + "balance_loss_mlp": 1.02088153, + "epoch": 0.6005711709003457, + "flos": 17712543859200.0, + "grad_norm": 1.6157351171003875, + "language_loss": 0.65079099, + "learning_rate": 1.4526740198558346e-06, + "loss": 0.67241061, + "num_input_tokens_seen": 215155225, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.13330078, + "step": 9989, + "time_per_iteration": 2.4278082847595215 + }, + { + "auxiliary_loss_clip": 0.01123102, + "auxiliary_loss_mlp": 0.01033336, + "balance_loss_clip": 1.0487349, + "balance_loss_mlp": 1.02156985, + "epoch": 0.6006312941530136, + "flos": 18514464946560.0, + "grad_norm": 1.7164273521079654, + "language_loss": 0.80573088, + "learning_rate": 1.452299436003257e-06, + "loss": 0.8272953, + "num_input_tokens_seen": 215174815, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11767578, + "step": 9990, + "time_per_iteration": 2.5189576148986816 + }, + { + "auxiliary_loss_clip": 0.01120363, + "auxiliary_loss_mlp": 0.0103576, + "balance_loss_clip": 1.0433948, + "balance_loss_mlp": 1.0229454, + "epoch": 0.6006914174056817, + "flos": 21390837223680.0, + "grad_norm": 1.9056431113537955, + "language_loss": 0.82861829, + "learning_rate": 1.4519248729195403e-06, + "loss": 0.85017961, + "num_input_tokens_seen": 215192045, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.12817383, + "step": 9991, + "time_per_iteration": 2.536371946334839 + }, + { + "auxiliary_loss_clip": 0.01117377, + "auxiliary_loss_mlp": 0.01027647, + "balance_loss_clip": 1.04570723, + "balance_loss_mlp": 1.01605439, + "epoch": 0.6007515406583496, + "flos": 12750515349120.0, + "grad_norm": 1.9565408492985406, + "language_loss": 0.828457, + "learning_rate": 1.4515503306188878e-06, + "loss": 0.84990728, + "num_input_tokens_seen": 215209885, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.1159668, + "step": 9992, + "time_per_iteration": 2.564162015914917 + }, + { + "auxiliary_loss_clip": 0.0112028, + "auxiliary_loss_mlp": 0.01034364, + "balance_loss_clip": 1.04766762, + "balance_loss_mlp": 1.02147162, + "epoch": 0.6008116639110176, + "flos": 19206894401280.0, + "grad_norm": 4.126728722101374, + "language_loss": 0.66202462, + "learning_rate": 1.4511758091155008e-06, + "loss": 0.6835711, + "num_input_tokens_seen": 215228150, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.12896729, + "step": 9993, + "time_per_iteration": 2.4462969303131104 + }, + { + "auxiliary_loss_clip": 0.01116697, + "auxiliary_loss_mlp": 0.01031746, + "balance_loss_clip": 1.04319, + "balance_loss_mlp": 1.01918721, + "epoch": 0.6008717871636855, + "flos": 17055342668160.0, + "grad_norm": 2.8182059793319687, + "language_loss": 0.81002808, + "learning_rate": 1.4508013084235826e-06, + "loss": 0.83151251, + "num_input_tokens_seen": 215243755, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.12561035, + "step": 9994, + "time_per_iteration": 2.4692068099975586 + }, + { + "auxiliary_loss_clip": 0.01114797, + "auxiliary_loss_mlp": 0.01025469, + "balance_loss_clip": 1.04485369, + "balance_loss_mlp": 1.01456141, + "epoch": 0.6009319104163535, + "flos": 20298686244480.0, + "grad_norm": 1.8746236759212245, + "language_loss": 0.72873002, + "learning_rate": 1.4504268285573337e-06, + "loss": 0.75013268, + "num_input_tokens_seen": 215262130, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.10900879, + "step": 9995, + "time_per_iteration": 2.447906255722046 + }, + { + "auxiliary_loss_clip": 0.01118894, + "auxiliary_loss_mlp": 0.01029674, + "balance_loss_clip": 1.0436101, + "balance_loss_mlp": 1.01741326, + "epoch": 0.6009920336690215, + "flos": 21836776573440.0, + "grad_norm": 1.6731634192886875, + "language_loss": 0.8117581, + "learning_rate": 1.4500523695309546e-06, + "loss": 0.83324385, + "num_input_tokens_seen": 215281785, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.12268066, + "step": 9996, + "time_per_iteration": 3.866718292236328 + }, + { + "auxiliary_loss_clip": 0.01121772, + "auxiliary_loss_mlp": 0.0103115, + "balance_loss_clip": 1.04878879, + "balance_loss_mlp": 1.01830578, + "epoch": 0.6010521569216895, + "flos": 22596107109120.0, + "grad_norm": 1.703104080890023, + "language_loss": 0.78903735, + "learning_rate": 1.4496779313586447e-06, + "loss": 0.81056654, + "num_input_tokens_seen": 215297550, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.12854004, + "step": 9997, + "time_per_iteration": 2.6041641235351562 + }, + { + "auxiliary_loss_clip": 0.01117406, + "auxiliary_loss_mlp": 0.01029561, + "balance_loss_clip": 1.04131889, + "balance_loss_mlp": 1.01666236, + "epoch": 0.6011122801743575, + "flos": 19171702051200.0, + "grad_norm": 1.5968111853708777, + "language_loss": 0.73094487, + "learning_rate": 1.4493035140546028e-06, + "loss": 0.75241452, + "num_input_tokens_seen": 215316360, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12915039, + "step": 9998, + "time_per_iteration": 2.5107052326202393 + }, + { + "auxiliary_loss_clip": 0.01115821, + "auxiliary_loss_mlp": 0.01031048, + "balance_loss_clip": 1.04241419, + "balance_loss_mlp": 1.01874566, + "epoch": 0.6011724034270254, + "flos": 25010022758400.0, + "grad_norm": 1.5384746133641833, + "language_loss": 0.72466469, + "learning_rate": 1.448929117633027e-06, + "loss": 0.74613339, + "num_input_tokens_seen": 215336405, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.12304688, + "step": 9999, + "time_per_iteration": 2.659274101257324 + }, + { + "auxiliary_loss_clip": 0.01121727, + "auxiliary_loss_mlp": 0.01035478, + "balance_loss_clip": 1.04307795, + "balance_loss_mlp": 1.02278256, + "epoch": 0.6012325266796934, + "flos": 21797669640960.0, + "grad_norm": 1.6716137855197943, + "language_loss": 0.78279364, + "learning_rate": 1.4485547421081142e-06, + "loss": 0.80436563, + "num_input_tokens_seen": 215356590, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.12695312, + "step": 10000, + "time_per_iteration": 2.5309417247772217 + }, + { + "auxiliary_loss_clip": 0.01120306, + "auxiliary_loss_mlp": 0.01036734, + "balance_loss_clip": 1.04364812, + "balance_loss_mlp": 1.02232182, + "epoch": 0.6012926499323613, + "flos": 19573003774080.0, + "grad_norm": 2.1535996672798885, + "language_loss": 0.77768677, + "learning_rate": 1.4481803874940608e-06, + "loss": 0.79925716, + "num_input_tokens_seen": 215374295, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.14416504, + "step": 10001, + "time_per_iteration": 2.495095729827881 + }, + { + "auxiliary_loss_clip": 0.01128734, + "auxiliary_loss_mlp": 0.01031417, + "balance_loss_clip": 1.04708242, + "balance_loss_mlp": 1.01760101, + "epoch": 0.6013527731850293, + "flos": 34860786076800.0, + "grad_norm": 1.7390906804194755, + "language_loss": 0.58662349, + "learning_rate": 1.4478060538050624e-06, + "loss": 0.60822499, + "num_input_tokens_seen": 215394535, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.13812256, + "step": 10002, + "time_per_iteration": 2.5835843086242676 + }, + { + "auxiliary_loss_clip": 0.01125748, + "auxiliary_loss_mlp": 0.0103749, + "balance_loss_clip": 1.04938853, + "balance_loss_mlp": 1.02297044, + "epoch": 0.6014128964376972, + "flos": 23291948355840.0, + "grad_norm": 1.5417112391984413, + "language_loss": 0.77822149, + "learning_rate": 1.447431741055314e-06, + "loss": 0.79985392, + "num_input_tokens_seen": 215414355, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.14538574, + "step": 10003, + "time_per_iteration": 4.058086156845093 + }, + { + "auxiliary_loss_clip": 0.01132533, + "auxiliary_loss_mlp": 0.01030565, + "balance_loss_clip": 1.05450153, + "balance_loss_mlp": 1.01838827, + "epoch": 0.6014730196903653, + "flos": 24820916630400.0, + "grad_norm": 2.5275450649495945, + "language_loss": 0.77858478, + "learning_rate": 1.4470574492590091e-06, + "loss": 0.80021572, + "num_input_tokens_seen": 215428280, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.1217041, + "step": 10004, + "time_per_iteration": 2.499384880065918 + }, + { + "auxiliary_loss_clip": 0.01123875, + "auxiliary_loss_mlp": 0.01030053, + "balance_loss_clip": 1.04978597, + "balance_loss_mlp": 1.01765501, + "epoch": 0.6015331429430332, + "flos": 23112359331840.0, + "grad_norm": 1.5181661374432565, + "language_loss": 0.72475284, + "learning_rate": 1.4466831784303408e-06, + "loss": 0.74629211, + "num_input_tokens_seen": 215448970, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.12408447, + "step": 10005, + "time_per_iteration": 2.5087263584136963 + }, + { + "auxiliary_loss_clip": 0.01119811, + "auxiliary_loss_mlp": 0.01033494, + "balance_loss_clip": 1.04471087, + "balance_loss_mlp": 1.02043486, + "epoch": 0.6015932661957012, + "flos": 19201363706880.0, + "grad_norm": 2.2127214290890107, + "language_loss": 0.74867094, + "learning_rate": 1.4463089285835026e-06, + "loss": 0.77020395, + "num_input_tokens_seen": 215465260, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.13061523, + "step": 10006, + "time_per_iteration": 2.4212403297424316 + }, + { + "auxiliary_loss_clip": 0.01118299, + "auxiliary_loss_mlp": 0.01032061, + "balance_loss_clip": 1.04229426, + "balance_loss_mlp": 1.01934147, + "epoch": 0.6016533894483691, + "flos": 18113630100480.0, + "grad_norm": 2.409953533443592, + "language_loss": 0.74080098, + "learning_rate": 1.445934699732685e-06, + "loss": 0.7623046, + "num_input_tokens_seen": 215482725, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.12731934, + "step": 10007, + "time_per_iteration": 2.458665132522583 + }, + { + "auxiliary_loss_clip": 0.01124815, + "auxiliary_loss_mlp": 0.01023889, + "balance_loss_clip": 1.05116546, + "balance_loss_mlp": 1.01255774, + "epoch": 0.6017135127010371, + "flos": 16216900427520.0, + "grad_norm": 2.8793150395781932, + "language_loss": 0.70054138, + "learning_rate": 1.4455604918920785e-06, + "loss": 0.72202849, + "num_input_tokens_seen": 215500420, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11340332, + "step": 10008, + "time_per_iteration": 2.4990158081054688 + }, + { + "auxiliary_loss_clip": 0.01120352, + "auxiliary_loss_mlp": 0.01029982, + "balance_loss_clip": 1.04751539, + "balance_loss_mlp": 1.01791239, + "epoch": 0.6017736359537051, + "flos": 23444246021760.0, + "grad_norm": 1.6130301573753967, + "language_loss": 0.76653254, + "learning_rate": 1.4451863050758748e-06, + "loss": 0.78803587, + "num_input_tokens_seen": 215522260, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.12072754, + "step": 10009, + "time_per_iteration": 2.5324695110321045 + }, + { + "auxiliary_loss_clip": 0.01120349, + "auxiliary_loss_mlp": 0.01035345, + "balance_loss_clip": 1.04465008, + "balance_loss_mlp": 1.02278066, + "epoch": 0.601833759206373, + "flos": 23514056104320.0, + "grad_norm": 2.0737546730729908, + "language_loss": 0.7444815, + "learning_rate": 1.4448121392982608e-06, + "loss": 0.76603848, + "num_input_tokens_seen": 215541715, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12561035, + "step": 10010, + "time_per_iteration": 3.987586498260498 + }, + { + "auxiliary_loss_clip": 0.01049448, + "auxiliary_loss_mlp": 0.01004631, + "balance_loss_clip": 1.02334762, + "balance_loss_mlp": 1.00355947, + "epoch": 0.6018938824590411, + "flos": 63991668648960.0, + "grad_norm": 0.8066959884281505, + "language_loss": 0.55070251, + "learning_rate": 1.4444379945734268e-06, + "loss": 0.57124335, + "num_input_tokens_seen": 215603020, + "router_z_loss_clip": 0.26074219, + "router_z_loss_mlp": 0.01071167, + "step": 10011, + "time_per_iteration": 3.1394920349121094 + }, + { + "auxiliary_loss_clip": 0.01116131, + "auxiliary_loss_mlp": 0.01034138, + "balance_loss_clip": 1.04204798, + "balance_loss_mlp": 1.02237201, + "epoch": 0.601954005711709, + "flos": 34640007131520.0, + "grad_norm": 1.7815465655008969, + "language_loss": 0.62310052, + "learning_rate": 1.44406387091556e-06, + "loss": 0.64460325, + "num_input_tokens_seen": 215625115, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11767578, + "step": 10012, + "time_per_iteration": 2.617300510406494 + }, + { + "auxiliary_loss_clip": 0.01125619, + "auxiliary_loss_mlp": 0.0102457, + "balance_loss_clip": 1.05088139, + "balance_loss_mlp": 1.01334012, + "epoch": 0.602014128964377, + "flos": 19427062815360.0, + "grad_norm": 1.7594435074791765, + "language_loss": 0.74929643, + "learning_rate": 1.4436897683388462e-06, + "loss": 0.77079827, + "num_input_tokens_seen": 215643730, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.11224365, + "step": 10013, + "time_per_iteration": 2.56711745262146 + }, + { + "auxiliary_loss_clip": 0.01124746, + "auxiliary_loss_mlp": 0.0102589, + "balance_loss_clip": 1.05373859, + "balance_loss_mlp": 1.01476216, + "epoch": 0.6020742522170449, + "flos": 28329389470080.0, + "grad_norm": 1.7130537328599005, + "language_loss": 0.81274748, + "learning_rate": 1.4433156868574732e-06, + "loss": 0.83425385, + "num_input_tokens_seen": 215664425, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.11126709, + "step": 10014, + "time_per_iteration": 2.6141319274902344 + }, + { + "auxiliary_loss_clip": 0.0111337, + "auxiliary_loss_mlp": 0.01025397, + "balance_loss_clip": 1.04449022, + "balance_loss_mlp": 1.01366067, + "epoch": 0.6021343754697129, + "flos": 22747040058240.0, + "grad_norm": 1.4582951082535847, + "language_loss": 0.72492009, + "learning_rate": 1.442941626485624e-06, + "loss": 0.74630773, + "num_input_tokens_seen": 215684280, + "router_z_loss_clip": 0.68896484, + "router_z_loss_mlp": 0.11743164, + "step": 10015, + "time_per_iteration": 2.4845120906829834 + }, + { + "auxiliary_loss_clip": 0.01042465, + "auxiliary_loss_mlp": 0.01003757, + "balance_loss_clip": 1.0171237, + "balance_loss_mlp": 1.00258636, + "epoch": 0.6021944987223808, + "flos": 65752007402880.0, + "grad_norm": 0.8185568199132234, + "language_loss": 0.54792833, + "learning_rate": 1.4425675872374848e-06, + "loss": 0.56839061, + "num_input_tokens_seen": 215739780, + "router_z_loss_clip": 0.25439453, + "router_z_loss_mlp": 0.01171875, + "step": 10016, + "time_per_iteration": 4.405981779098511 + }, + { + "auxiliary_loss_clip": 0.01117375, + "auxiliary_loss_mlp": 0.01033494, + "balance_loss_clip": 1.0440222, + "balance_loss_mlp": 1.02068496, + "epoch": 0.6022546219750489, + "flos": 16105182151680.0, + "grad_norm": 1.648213111848504, + "language_loss": 0.83076704, + "learning_rate": 1.4421935691272381e-06, + "loss": 0.85227573, + "num_input_tokens_seen": 215757885, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.12817383, + "step": 10017, + "time_per_iteration": 2.4924139976501465 + }, + { + "auxiliary_loss_clip": 0.01115653, + "auxiliary_loss_mlp": 0.01029285, + "balance_loss_clip": 1.04467106, + "balance_loss_mlp": 1.01601672, + "epoch": 0.6023147452277168, + "flos": 25512555985920.0, + "grad_norm": 1.7245720114051752, + "language_loss": 0.83730185, + "learning_rate": 1.4418195721690677e-06, + "loss": 0.85875118, + "num_input_tokens_seen": 215776415, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.13269043, + "step": 10018, + "time_per_iteration": 2.514497756958008 + }, + { + "auxiliary_loss_clip": 0.01128573, + "auxiliary_loss_mlp": 0.01036888, + "balance_loss_clip": 1.05143595, + "balance_loss_mlp": 1.02384663, + "epoch": 0.6023748684803848, + "flos": 22636075968000.0, + "grad_norm": 1.6604776997481132, + "language_loss": 0.78414738, + "learning_rate": 1.4414455963771549e-06, + "loss": 0.80580199, + "num_input_tokens_seen": 215794865, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.13049316, + "step": 10019, + "time_per_iteration": 2.5057713985443115 + }, + { + "auxiliary_loss_clip": 0.01115977, + "auxiliary_loss_mlp": 0.0103418, + "balance_loss_clip": 1.04224992, + "balance_loss_mlp": 1.02155602, + "epoch": 0.6024349917330527, + "flos": 26210444307840.0, + "grad_norm": 1.452391435664892, + "language_loss": 0.73989367, + "learning_rate": 1.441071641765681e-06, + "loss": 0.76139522, + "num_input_tokens_seen": 215816840, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.1262207, + "step": 10020, + "time_per_iteration": 2.522817373275757 + }, + { + "auxiliary_loss_clip": 0.01120803, + "auxiliary_loss_mlp": 0.01033902, + "balance_loss_clip": 1.04551399, + "balance_loss_mlp": 1.02139091, + "epoch": 0.6024951149857207, + "flos": 21251755762560.0, + "grad_norm": 1.6346446342754006, + "language_loss": 0.64240491, + "learning_rate": 1.4406977083488264e-06, + "loss": 0.66395193, + "num_input_tokens_seen": 215836100, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.12506104, + "step": 10021, + "time_per_iteration": 2.5705926418304443 + }, + { + "auxiliary_loss_clip": 0.01116541, + "auxiliary_loss_mlp": 0.01033995, + "balance_loss_clip": 1.04251409, + "balance_loss_mlp": 1.02064919, + "epoch": 0.6025552382383887, + "flos": 26943453152640.0, + "grad_norm": 1.442641118388616, + "language_loss": 0.80331153, + "learning_rate": 1.4403237961407704e-06, + "loss": 0.82481688, + "num_input_tokens_seen": 215858480, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.13330078, + "step": 10022, + "time_per_iteration": 2.494722843170166 + }, + { + "auxiliary_loss_clip": 0.01125856, + "auxiliary_loss_mlp": 0.01027275, + "balance_loss_clip": 1.04905093, + "balance_loss_mlp": 1.01492524, + "epoch": 0.6026153614910567, + "flos": 31684379495040.0, + "grad_norm": 1.5109558300933974, + "language_loss": 0.66623306, + "learning_rate": 1.439949905155693e-06, + "loss": 0.68776441, + "num_input_tokens_seen": 215879950, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.12347412, + "step": 10023, + "time_per_iteration": 2.541466474533081 + }, + { + "auxiliary_loss_clip": 0.01125055, + "auxiliary_loss_mlp": 0.01030071, + "balance_loss_clip": 1.05062652, + "balance_loss_mlp": 1.01814985, + "epoch": 0.6026754847437247, + "flos": 29312731175040.0, + "grad_norm": 3.0282943657443173, + "language_loss": 0.74423784, + "learning_rate": 1.4395760354077707e-06, + "loss": 0.76578909, + "num_input_tokens_seen": 215899830, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11914062, + "step": 10024, + "time_per_iteration": 2.5392520427703857 + }, + { + "auxiliary_loss_clip": 0.01122122, + "auxiliary_loss_mlp": 0.01031328, + "balance_loss_clip": 1.04953849, + "balance_loss_mlp": 1.01848888, + "epoch": 0.6027356079963926, + "flos": 23586775188480.0, + "grad_norm": 1.8287549868220263, + "language_loss": 0.73069626, + "learning_rate": 1.4392021869111815e-06, + "loss": 0.75223076, + "num_input_tokens_seen": 215920440, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.12841797, + "step": 10025, + "time_per_iteration": 2.5087530612945557 + }, + { + "auxiliary_loss_clip": 0.0112562, + "auxiliary_loss_mlp": 0.01034452, + "balance_loss_clip": 1.04585862, + "balance_loss_mlp": 1.02129102, + "epoch": 0.6027957312490606, + "flos": 20813753318400.0, + "grad_norm": 2.55585285652945, + "language_loss": 0.6731627, + "learning_rate": 1.4388283596801016e-06, + "loss": 0.69476336, + "num_input_tokens_seen": 215940535, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.1315918, + "step": 10026, + "time_per_iteration": 2.488077163696289 + }, + { + "auxiliary_loss_clip": 0.01110308, + "auxiliary_loss_mlp": 0.01032416, + "balance_loss_clip": 1.0411576, + "balance_loss_mlp": 1.02084041, + "epoch": 0.6028558545017285, + "flos": 19935773182080.0, + "grad_norm": 1.780060876344827, + "language_loss": 0.79968095, + "learning_rate": 1.4384545537287061e-06, + "loss": 0.82110822, + "num_input_tokens_seen": 215958045, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 0.11572266, + "step": 10027, + "time_per_iteration": 2.532135009765625 + }, + { + "auxiliary_loss_clip": 0.01118765, + "auxiliary_loss_mlp": 0.01032578, + "balance_loss_clip": 1.04193139, + "balance_loss_mlp": 1.01972103, + "epoch": 0.6029159777543965, + "flos": 22820836550400.0, + "grad_norm": 3.963761403885118, + "language_loss": 0.70865822, + "learning_rate": 1.438080769071171e-06, + "loss": 0.73017168, + "num_input_tokens_seen": 215977330, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.12854004, + "step": 10028, + "time_per_iteration": 2.538365125656128 + }, + { + "auxiliary_loss_clip": 0.01121745, + "auxiliary_loss_mlp": 0.01040934, + "balance_loss_clip": 1.04265046, + "balance_loss_mlp": 1.02752924, + "epoch": 0.6029761010070644, + "flos": 23587242065280.0, + "grad_norm": 2.562123903819292, + "language_loss": 0.84842199, + "learning_rate": 1.437707005721669e-06, + "loss": 0.87004876, + "num_input_tokens_seen": 215997865, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.13415527, + "step": 10029, + "time_per_iteration": 2.47324275970459 + }, + { + "auxiliary_loss_clip": 0.01118287, + "auxiliary_loss_mlp": 0.01031186, + "balance_loss_clip": 1.04464447, + "balance_loss_mlp": 1.01946163, + "epoch": 0.6030362242597325, + "flos": 13662430859520.0, + "grad_norm": 1.765523102595263, + "language_loss": 0.79806954, + "learning_rate": 1.437333263694373e-06, + "loss": 0.81956422, + "num_input_tokens_seen": 216016230, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11724854, + "step": 10030, + "time_per_iteration": 2.4313735961914062 + }, + { + "auxiliary_loss_clip": 0.01112013, + "auxiliary_loss_mlp": 0.01030879, + "balance_loss_clip": 1.03952932, + "balance_loss_mlp": 1.01897573, + "epoch": 0.6030963475124004, + "flos": 24422883045120.0, + "grad_norm": 2.201831416314777, + "language_loss": 0.7124306, + "learning_rate": 1.4369595430034572e-06, + "loss": 0.73385954, + "num_input_tokens_seen": 216035785, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11901855, + "step": 10031, + "time_per_iteration": 2.5127336978912354 + }, + { + "auxiliary_loss_clip": 0.01116497, + "auxiliary_loss_mlp": 0.01031281, + "balance_loss_clip": 1.0391928, + "balance_loss_mlp": 1.0171783, + "epoch": 0.6031564707650684, + "flos": 29644043247360.0, + "grad_norm": 2.228902915262939, + "language_loss": 0.73459768, + "learning_rate": 1.4365858436630912e-06, + "loss": 0.75607544, + "num_input_tokens_seen": 216059555, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.14105225, + "step": 10032, + "time_per_iteration": 2.5519306659698486 + }, + { + "auxiliary_loss_clip": 0.01124401, + "auxiliary_loss_mlp": 0.01034391, + "balance_loss_clip": 1.04924583, + "balance_loss_mlp": 1.02170706, + "epoch": 0.6032165940177363, + "flos": 16618776768000.0, + "grad_norm": 1.725597271601091, + "language_loss": 0.68365204, + "learning_rate": 1.4362121656874465e-06, + "loss": 0.70523989, + "num_input_tokens_seen": 216077235, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.12683105, + "step": 10033, + "time_per_iteration": 2.4451801776885986 + }, + { + "auxiliary_loss_clip": 0.0112001, + "auxiliary_loss_mlp": 0.01031675, + "balance_loss_clip": 1.04714775, + "balance_loss_mlp": 1.01928973, + "epoch": 0.6032767172704043, + "flos": 17488173553920.0, + "grad_norm": 2.1220482508988487, + "language_loss": 0.7579993, + "learning_rate": 1.4358385090906934e-06, + "loss": 0.7795161, + "num_input_tokens_seen": 216094985, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.12384033, + "step": 10034, + "time_per_iteration": 2.5418176651000977 + }, + { + "auxiliary_loss_clip": 0.01124015, + "auxiliary_loss_mlp": 0.01032495, + "balance_loss_clip": 1.04807794, + "balance_loss_mlp": 1.01946521, + "epoch": 0.6033368405230723, + "flos": 26832955939200.0, + "grad_norm": 1.7724305259768909, + "language_loss": 0.74652159, + "learning_rate": 1.4354648738870004e-06, + "loss": 0.76808667, + "num_input_tokens_seen": 216115905, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.13037109, + "step": 10035, + "time_per_iteration": 2.520233154296875 + }, + { + "auxiliary_loss_clip": 0.01119289, + "auxiliary_loss_mlp": 0.01035612, + "balance_loss_clip": 1.04357898, + "balance_loss_mlp": 1.02222514, + "epoch": 0.6033969637757403, + "flos": 16909904499840.0, + "grad_norm": 1.6753344108215524, + "language_loss": 0.86500007, + "learning_rate": 1.435091260090536e-06, + "loss": 0.88654912, + "num_input_tokens_seen": 216132420, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.13391113, + "step": 10036, + "time_per_iteration": 2.423137903213501 + }, + { + "auxiliary_loss_clip": 0.01124577, + "auxiliary_loss_mlp": 0.0103299, + "balance_loss_clip": 1.04915142, + "balance_loss_mlp": 1.02038336, + "epoch": 0.6034570870284083, + "flos": 22930076787840.0, + "grad_norm": 2.51660917791063, + "language_loss": 0.7052424, + "learning_rate": 1.4347176677154676e-06, + "loss": 0.72681808, + "num_input_tokens_seen": 216149800, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.1260376, + "step": 10037, + "time_per_iteration": 2.54095458984375 + }, + { + "auxiliary_loss_clip": 0.01117309, + "auxiliary_loss_mlp": 0.01031836, + "balance_loss_clip": 1.0444591, + "balance_loss_mlp": 1.01944399, + "epoch": 0.6035172102810762, + "flos": 23366319465600.0, + "grad_norm": 1.7557932234774458, + "language_loss": 0.85431224, + "learning_rate": 1.4343440967759616e-06, + "loss": 0.87580371, + "num_input_tokens_seen": 216168200, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.1237793, + "step": 10038, + "time_per_iteration": 2.537919521331787 + }, + { + "auxiliary_loss_clip": 0.01121064, + "auxiliary_loss_mlp": 0.01036484, + "balance_loss_clip": 1.04345655, + "balance_loss_mlp": 1.022048, + "epoch": 0.6035773335337442, + "flos": 20887082933760.0, + "grad_norm": 1.9115888950844129, + "language_loss": 0.76342756, + "learning_rate": 1.4339705472861846e-06, + "loss": 0.78500301, + "num_input_tokens_seen": 216187105, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.14416504, + "step": 10039, + "time_per_iteration": 2.4831721782684326 + }, + { + "auxiliary_loss_clip": 0.0111915, + "auxiliary_loss_mlp": 0.01034106, + "balance_loss_clip": 1.04292417, + "balance_loss_mlp": 1.02208972, + "epoch": 0.6036374567864121, + "flos": 24936298093440.0, + "grad_norm": 2.1254771159304533, + "language_loss": 0.71009874, + "learning_rate": 1.433597019260301e-06, + "loss": 0.73163128, + "num_input_tokens_seen": 216205440, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.12023926, + "step": 10040, + "time_per_iteration": 3.989046812057495 + }, + { + "auxiliary_loss_clip": 0.0112313, + "auxiliary_loss_mlp": 0.01031866, + "balance_loss_clip": 1.04436183, + "balance_loss_mlp": 1.0173465, + "epoch": 0.6036975800390801, + "flos": 23148269953920.0, + "grad_norm": 1.9888361644611228, + "language_loss": 0.78117216, + "learning_rate": 1.433223512712475e-06, + "loss": 0.8027221, + "num_input_tokens_seen": 216223130, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.1451416, + "step": 10041, + "time_per_iteration": 2.4396133422851562 + }, + { + "auxiliary_loss_clip": 0.01120601, + "auxiliary_loss_mlp": 0.01029389, + "balance_loss_clip": 1.04735899, + "balance_loss_mlp": 1.01712823, + "epoch": 0.603757703291748, + "flos": 18660729127680.0, + "grad_norm": 1.869402668899126, + "language_loss": 0.75432265, + "learning_rate": 1.4328500276568704e-06, + "loss": 0.77582252, + "num_input_tokens_seen": 216240260, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.12255859, + "step": 10042, + "time_per_iteration": 2.452843189239502 + }, + { + "auxiliary_loss_clip": 0.01126926, + "auxiliary_loss_mlp": 0.01030579, + "balance_loss_clip": 1.05288422, + "balance_loss_mlp": 1.01850283, + "epoch": 0.6038178265444161, + "flos": 19682603147520.0, + "grad_norm": 1.7953525487450173, + "language_loss": 0.84258103, + "learning_rate": 1.4324765641076498e-06, + "loss": 0.86415607, + "num_input_tokens_seen": 216258510, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12084961, + "step": 10043, + "time_per_iteration": 2.4519081115722656 + }, + { + "auxiliary_loss_clip": 0.01125773, + "auxiliary_loss_mlp": 0.01037231, + "balance_loss_clip": 1.04790854, + "balance_loss_mlp": 1.02374291, + "epoch": 0.603877949797084, + "flos": 22638230784000.0, + "grad_norm": 1.799918374326086, + "language_loss": 0.69871831, + "learning_rate": 1.432103122078974e-06, + "loss": 0.72034836, + "num_input_tokens_seen": 216277550, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.1348877, + "step": 10044, + "time_per_iteration": 2.5344467163085938 + }, + { + "auxiliary_loss_clip": 0.0112831, + "auxiliary_loss_mlp": 0.01031353, + "balance_loss_clip": 1.05184507, + "balance_loss_mlp": 1.01795387, + "epoch": 0.603938073049752, + "flos": 25447881548160.0, + "grad_norm": 1.9079920845429255, + "language_loss": 0.78063291, + "learning_rate": 1.4317297015850057e-06, + "loss": 0.80222952, + "num_input_tokens_seen": 216296690, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.1340332, + "step": 10045, + "time_per_iteration": 2.5510036945343018 + }, + { + "auxiliary_loss_clip": 0.01125999, + "auxiliary_loss_mlp": 0.01036057, + "balance_loss_clip": 1.05174863, + "balance_loss_mlp": 1.02275348, + "epoch": 0.6039981963024199, + "flos": 22340135813760.0, + "grad_norm": 1.6426312144046475, + "language_loss": 0.77330256, + "learning_rate": 1.4313563026399036e-06, + "loss": 0.79492319, + "num_input_tokens_seen": 216316110, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.13317871, + "step": 10046, + "time_per_iteration": 2.504079580307007 + }, + { + "auxiliary_loss_clip": 0.01117927, + "auxiliary_loss_mlp": 0.01032755, + "balance_loss_clip": 1.04351079, + "balance_loss_mlp": 1.0214839, + "epoch": 0.6040583195550879, + "flos": 20703148364160.0, + "grad_norm": 1.7981270330120134, + "language_loss": 0.87007594, + "learning_rate": 1.430982925257827e-06, + "loss": 0.89158279, + "num_input_tokens_seen": 216333855, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.11260986, + "step": 10047, + "time_per_iteration": 3.9802193641662598 + }, + { + "auxiliary_loss_clip": 0.01119433, + "auxiliary_loss_mlp": 0.01025579, + "balance_loss_clip": 1.04813385, + "balance_loss_mlp": 1.01390815, + "epoch": 0.604118442807756, + "flos": 27163118776320.0, + "grad_norm": 1.6424375684278907, + "language_loss": 0.76087832, + "learning_rate": 1.4306095694529358e-06, + "loss": 0.78232843, + "num_input_tokens_seen": 216354890, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.11669922, + "step": 10048, + "time_per_iteration": 2.613337755203247 + }, + { + "auxiliary_loss_clip": 0.01126123, + "auxiliary_loss_mlp": 0.01038373, + "balance_loss_clip": 1.04357815, + "balance_loss_mlp": 1.02293563, + "epoch": 0.6041785660604239, + "flos": 30881524654080.0, + "grad_norm": 2.1033934835037167, + "language_loss": 0.66389638, + "learning_rate": 1.430236235239386e-06, + "loss": 0.68554127, + "num_input_tokens_seen": 216376055, + "router_z_loss_clip": 0.82568359, + "router_z_loss_mlp": 0.15429688, + "step": 10049, + "time_per_iteration": 2.643955945968628 + }, + { + "auxiliary_loss_clip": 0.01122141, + "auxiliary_loss_mlp": 0.01033567, + "balance_loss_clip": 1.04725242, + "balance_loss_mlp": 1.02148461, + "epoch": 0.6042386893130919, + "flos": 19938215306880.0, + "grad_norm": 1.5889945928729747, + "language_loss": 0.66689944, + "learning_rate": 1.429862922631336e-06, + "loss": 0.68845654, + "num_input_tokens_seen": 216396295, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.12084961, + "step": 10050, + "time_per_iteration": 2.479814052581787 + }, + { + "auxiliary_loss_clip": 0.01125636, + "auxiliary_loss_mlp": 0.01034339, + "balance_loss_clip": 1.04917288, + "balance_loss_mlp": 1.02161956, + "epoch": 0.6042988125657598, + "flos": 32415915882240.0, + "grad_norm": 1.934068710406157, + "language_loss": 0.69512177, + "learning_rate": 1.4294896316429408e-06, + "loss": 0.71672153, + "num_input_tokens_seen": 216416605, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12719727, + "step": 10051, + "time_per_iteration": 2.5806174278259277 + }, + { + "auxiliary_loss_clip": 0.01121775, + "auxiliary_loss_mlp": 0.01031781, + "balance_loss_clip": 1.04434919, + "balance_loss_mlp": 1.01887679, + "epoch": 0.6043589358184278, + "flos": 17420805596160.0, + "grad_norm": 1.9769356249422996, + "language_loss": 0.64626521, + "learning_rate": 1.4291163622883553e-06, + "loss": 0.66780078, + "num_input_tokens_seen": 216435130, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12902832, + "step": 10052, + "time_per_iteration": 2.46567964553833 + }, + { + "auxiliary_loss_clip": 0.0111825, + "auxiliary_loss_mlp": 0.01028297, + "balance_loss_clip": 1.04353881, + "balance_loss_mlp": 1.01535726, + "epoch": 0.6044190590710957, + "flos": 27672834723840.0, + "grad_norm": 1.724230700630863, + "language_loss": 0.68931729, + "learning_rate": 1.4287431145817358e-06, + "loss": 0.71078277, + "num_input_tokens_seen": 216455640, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.12939453, + "step": 10053, + "time_per_iteration": 3.9586880207061768 + }, + { + "auxiliary_loss_clip": 0.01062654, + "auxiliary_loss_mlp": 0.01000175, + "balance_loss_clip": 1.03768754, + "balance_loss_mlp": 0.99889505, + "epoch": 0.6044791823237637, + "flos": 65316267515520.0, + "grad_norm": 0.7303464073544682, + "language_loss": 0.60421199, + "learning_rate": 1.4283698885372336e-06, + "loss": 0.62484026, + "num_input_tokens_seen": 216518130, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.01278687, + "step": 10054, + "time_per_iteration": 3.1540324687957764 + }, + { + "auxiliary_loss_clip": 0.0111905, + "auxiliary_loss_mlp": 0.01032384, + "balance_loss_clip": 1.04491436, + "balance_loss_mlp": 1.01908076, + "epoch": 0.6045393055764317, + "flos": 24492369905280.0, + "grad_norm": 1.7118075455666462, + "language_loss": 0.85546201, + "learning_rate": 1.4279966841690027e-06, + "loss": 0.87697637, + "num_input_tokens_seen": 216536845, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.13305664, + "step": 10055, + "time_per_iteration": 2.501382350921631 + }, + { + "auxiliary_loss_clip": 0.01128353, + "auxiliary_loss_mlp": 0.01042457, + "balance_loss_clip": 1.05143046, + "balance_loss_mlp": 1.0285213, + "epoch": 0.6045994288290997, + "flos": 19054345340160.0, + "grad_norm": 2.339677997271109, + "language_loss": 0.73694861, + "learning_rate": 1.4276235014911952e-06, + "loss": 0.75865674, + "num_input_tokens_seen": 216551860, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.13928223, + "step": 10056, + "time_per_iteration": 2.5389814376831055 + }, + { + "auxiliary_loss_clip": 0.01124404, + "auxiliary_loss_mlp": 0.0103548, + "balance_loss_clip": 1.05139399, + "balance_loss_mlp": 1.02348804, + "epoch": 0.6046595520817676, + "flos": 26576697335040.0, + "grad_norm": 2.262255383821775, + "language_loss": 0.80295527, + "learning_rate": 1.4272503405179616e-06, + "loss": 0.8245542, + "num_input_tokens_seen": 216574775, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11987305, + "step": 10057, + "time_per_iteration": 2.5653207302093506 + }, + { + "auxiliary_loss_clip": 0.01117101, + "auxiliary_loss_mlp": 0.01035123, + "balance_loss_clip": 1.04335773, + "balance_loss_mlp": 1.02196181, + "epoch": 0.6047196753344356, + "flos": 13582277660160.0, + "grad_norm": 2.447804381845841, + "language_loss": 0.75328195, + "learning_rate": 1.4268772012634527e-06, + "loss": 0.77480412, + "num_input_tokens_seen": 216590100, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.13171387, + "step": 10058, + "time_per_iteration": 2.4948887825012207 + }, + { + "auxiliary_loss_clip": 0.0112757, + "auxiliary_loss_mlp": 0.01031827, + "balance_loss_clip": 1.05051184, + "balance_loss_mlp": 1.01909542, + "epoch": 0.6047797985871035, + "flos": 25520456977920.0, + "grad_norm": 2.5419833022272185, + "language_loss": 0.70910019, + "learning_rate": 1.4265040837418176e-06, + "loss": 0.73069412, + "num_input_tokens_seen": 216610145, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12738037, + "step": 10059, + "time_per_iteration": 2.583627462387085 + }, + { + "auxiliary_loss_clip": 0.01129708, + "auxiliary_loss_mlp": 0.01030014, + "balance_loss_clip": 1.05416322, + "balance_loss_mlp": 1.01739001, + "epoch": 0.6048399218397715, + "flos": 20520147548160.0, + "grad_norm": 1.450229309736966, + "language_loss": 0.75978458, + "learning_rate": 1.4261309879672054e-06, + "loss": 0.78138185, + "num_input_tokens_seen": 216630625, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12609863, + "step": 10060, + "time_per_iteration": 3.986227512359619 + }, + { + "auxiliary_loss_clip": 0.01124016, + "auxiliary_loss_mlp": 0.01028555, + "balance_loss_clip": 1.05039251, + "balance_loss_mlp": 1.01631212, + "epoch": 0.6049000450924396, + "flos": 20408788408320.0, + "grad_norm": 2.0378044586745827, + "language_loss": 0.73875749, + "learning_rate": 1.4257579139537628e-06, + "loss": 0.76028323, + "num_input_tokens_seen": 216649255, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.12243652, + "step": 10061, + "time_per_iteration": 2.518035888671875 + }, + { + "auxiliary_loss_clip": 0.01126325, + "auxiliary_loss_mlp": 0.01031835, + "balance_loss_clip": 1.05112529, + "balance_loss_mlp": 1.01949644, + "epoch": 0.6049601683451075, + "flos": 20741357456640.0, + "grad_norm": 1.7077893969171003, + "language_loss": 0.67493558, + "learning_rate": 1.425384861715639e-06, + "loss": 0.69651723, + "num_input_tokens_seen": 216668100, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.12347412, + "step": 10062, + "time_per_iteration": 2.5183303356170654 + }, + { + "auxiliary_loss_clip": 0.01119173, + "auxiliary_loss_mlp": 0.01035139, + "balance_loss_clip": 1.04538274, + "balance_loss_mlp": 1.02277136, + "epoch": 0.6050202915977755, + "flos": 20083114771200.0, + "grad_norm": 2.4665668472848252, + "language_loss": 0.719019, + "learning_rate": 1.425011831266978e-06, + "loss": 0.7405622, + "num_input_tokens_seen": 216686125, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.12365723, + "step": 10063, + "time_per_iteration": 2.508253812789917 + }, + { + "auxiliary_loss_clip": 0.01113877, + "auxiliary_loss_mlp": 0.01034074, + "balance_loss_clip": 1.04246068, + "balance_loss_mlp": 1.02195597, + "epoch": 0.6050804148504434, + "flos": 15960821391360.0, + "grad_norm": 1.6077163291654353, + "language_loss": 0.84662759, + "learning_rate": 1.424638822621926e-06, + "loss": 0.86810708, + "num_input_tokens_seen": 216704265, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.12103271, + "step": 10064, + "time_per_iteration": 2.4951295852661133 + }, + { + "auxiliary_loss_clip": 0.01115851, + "auxiliary_loss_mlp": 0.01031958, + "balance_loss_clip": 1.04276621, + "balance_loss_mlp": 1.0186131, + "epoch": 0.6051405381031114, + "flos": 17456644391040.0, + "grad_norm": 2.69026139212807, + "language_loss": 0.7970438, + "learning_rate": 1.4242658357946278e-06, + "loss": 0.81852186, + "num_input_tokens_seen": 216721765, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.13354492, + "step": 10065, + "time_per_iteration": 2.511671304702759 + }, + { + "auxiliary_loss_clip": 0.01118359, + "auxiliary_loss_mlp": 0.01034454, + "balance_loss_clip": 1.04181266, + "balance_loss_mlp": 1.02063787, + "epoch": 0.6052006613557793, + "flos": 11400130517760.0, + "grad_norm": 2.0127967900612305, + "language_loss": 0.78983474, + "learning_rate": 1.423892870799226e-06, + "loss": 0.81136286, + "num_input_tokens_seen": 216738295, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.13824463, + "step": 10066, + "time_per_iteration": 2.5273847579956055 + }, + { + "auxiliary_loss_clip": 0.01116454, + "auxiliary_loss_mlp": 0.0103144, + "balance_loss_clip": 1.04174876, + "balance_loss_mlp": 1.01813626, + "epoch": 0.6052607846084473, + "flos": 24750998807040.0, + "grad_norm": 1.6696856177927428, + "language_loss": 0.73018068, + "learning_rate": 1.4235199276498655e-06, + "loss": 0.75165963, + "num_input_tokens_seen": 216759875, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.1328125, + "step": 10067, + "time_per_iteration": 2.577260971069336 + }, + { + "auxiliary_loss_clip": 0.01122486, + "auxiliary_loss_mlp": 0.01032416, + "balance_loss_clip": 1.04848099, + "balance_loss_mlp": 1.01997006, + "epoch": 0.6053209078611153, + "flos": 20741141975040.0, + "grad_norm": 1.3972362194256984, + "language_loss": 0.68948865, + "learning_rate": 1.4231470063606863e-06, + "loss": 0.7110377, + "num_input_tokens_seen": 216780705, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.12451172, + "step": 10068, + "time_per_iteration": 2.5584919452667236 + }, + { + "auxiliary_loss_clip": 0.0111444, + "auxiliary_loss_mlp": 0.01036966, + "balance_loss_clip": 1.03806043, + "balance_loss_mlp": 1.02445531, + "epoch": 0.6053810311137833, + "flos": 18953149749120.0, + "grad_norm": 1.9331983150015257, + "language_loss": 0.87534547, + "learning_rate": 1.4227741069458303e-06, + "loss": 0.89685953, + "num_input_tokens_seen": 216797625, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.125, + "step": 10069, + "time_per_iteration": 2.4793193340301514 + }, + { + "auxiliary_loss_clip": 0.01122873, + "auxiliary_loss_mlp": 0.01028088, + "balance_loss_clip": 1.04865646, + "balance_loss_mlp": 1.01619732, + "epoch": 0.6054411543664512, + "flos": 23951124794880.0, + "grad_norm": 1.6334651511917027, + "language_loss": 0.83160722, + "learning_rate": 1.4224012294194387e-06, + "loss": 0.85311687, + "num_input_tokens_seen": 216817610, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11889648, + "step": 10070, + "time_per_iteration": 2.497847557067871 + }, + { + "auxiliary_loss_clip": 0.01126682, + "auxiliary_loss_mlp": 0.01033867, + "balance_loss_clip": 1.05058479, + "balance_loss_mlp": 1.02102232, + "epoch": 0.6055012776191192, + "flos": 20593979953920.0, + "grad_norm": 1.6816756039914909, + "language_loss": 0.86513036, + "learning_rate": 1.4220283737956496e-06, + "loss": 0.88673592, + "num_input_tokens_seen": 216836835, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.128479, + "step": 10071, + "time_per_iteration": 2.5267333984375 + }, + { + "auxiliary_loss_clip": 0.01127101, + "auxiliary_loss_mlp": 0.0103564, + "balance_loss_clip": 1.04879665, + "balance_loss_mlp": 1.02134657, + "epoch": 0.6055614008717871, + "flos": 30298191782400.0, + "grad_norm": 1.5737415058787723, + "language_loss": 0.76958352, + "learning_rate": 1.421655540088603e-06, + "loss": 0.79121095, + "num_input_tokens_seen": 216856760, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.14294434, + "step": 10072, + "time_per_iteration": 2.634519577026367 + }, + { + "auxiliary_loss_clip": 0.01123682, + "auxiliary_loss_mlp": 0.01027799, + "balance_loss_clip": 1.0473609, + "balance_loss_mlp": 1.01429248, + "epoch": 0.6056215241244551, + "flos": 27125017424640.0, + "grad_norm": 1.8014102622340487, + "language_loss": 0.74450219, + "learning_rate": 1.4212827283124367e-06, + "loss": 0.76601696, + "num_input_tokens_seen": 216878795, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.1350708, + "step": 10073, + "time_per_iteration": 2.4823055267333984 + }, + { + "auxiliary_loss_clip": 0.01046344, + "auxiliary_loss_mlp": 0.01005395, + "balance_loss_clip": 1.02100182, + "balance_loss_mlp": 1.0040915, + "epoch": 0.6056816473771232, + "flos": 56007323925120.0, + "grad_norm": 0.7590102763628817, + "language_loss": 0.55171716, + "learning_rate": 1.4209099384812863e-06, + "loss": 0.57223457, + "num_input_tokens_seen": 216937800, + "router_z_loss_clip": 0.25341797, + "router_z_loss_mlp": 0.01304626, + "step": 10074, + "time_per_iteration": 3.119222640991211 + }, + { + "auxiliary_loss_clip": 0.01115905, + "auxiliary_loss_mlp": 0.01032334, + "balance_loss_clip": 1.04270208, + "balance_loss_mlp": 1.01941752, + "epoch": 0.6057417706297911, + "flos": 23549499849600.0, + "grad_norm": 1.6950530994928468, + "language_loss": 0.815992, + "learning_rate": 1.4205371706092894e-06, + "loss": 0.83747435, + "num_input_tokens_seen": 216955280, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.12927246, + "step": 10075, + "time_per_iteration": 2.486302137374878 + }, + { + "auxiliary_loss_clip": 0.01127702, + "auxiliary_loss_mlp": 0.01025765, + "balance_loss_clip": 1.04947329, + "balance_loss_mlp": 1.01278877, + "epoch": 0.6058018938824591, + "flos": 27744296832000.0, + "grad_norm": 1.8592282386918297, + "language_loss": 0.77921611, + "learning_rate": 1.4201644247105813e-06, + "loss": 0.80075079, + "num_input_tokens_seen": 216976950, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.12976074, + "step": 10076, + "time_per_iteration": 2.5182132720947266 + }, + { + "auxiliary_loss_clip": 0.01129613, + "auxiliary_loss_mlp": 0.01035275, + "balance_loss_clip": 1.05227399, + "balance_loss_mlp": 1.0221982, + "epoch": 0.605862017135127, + "flos": 22783381643520.0, + "grad_norm": 2.0450782416837563, + "language_loss": 0.72352159, + "learning_rate": 1.4197917007992964e-06, + "loss": 0.74517047, + "num_input_tokens_seen": 216996945, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.13079834, + "step": 10077, + "time_per_iteration": 2.459906578063965 + }, + { + "auxiliary_loss_clip": 0.01128239, + "auxiliary_loss_mlp": 0.01033286, + "balance_loss_clip": 1.05268919, + "balance_loss_mlp": 1.02028584, + "epoch": 0.605922140387795, + "flos": 21215019127680.0, + "grad_norm": 2.388210091687412, + "language_loss": 0.55648333, + "learning_rate": 1.4194189988895682e-06, + "loss": 0.57809854, + "num_input_tokens_seen": 217016580, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12994385, + "step": 10078, + "time_per_iteration": 2.4607770442962646 + }, + { + "auxiliary_loss_clip": 0.01119242, + "auxiliary_loss_mlp": 0.01030214, + "balance_loss_clip": 1.04165292, + "balance_loss_mlp": 1.01736951, + "epoch": 0.6059822636404629, + "flos": 27268372604160.0, + "grad_norm": 1.533771599200633, + "language_loss": 0.70449668, + "learning_rate": 1.4190463189955297e-06, + "loss": 0.72599125, + "num_input_tokens_seen": 217037300, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.128479, + "step": 10079, + "time_per_iteration": 2.5160064697265625 + }, + { + "auxiliary_loss_clip": 0.0111919, + "auxiliary_loss_mlp": 0.01038801, + "balance_loss_clip": 1.04261672, + "balance_loss_mlp": 1.02601004, + "epoch": 0.606042386893131, + "flos": 20631327120000.0, + "grad_norm": 1.8422500023006827, + "language_loss": 0.62335408, + "learning_rate": 1.4186736611313131e-06, + "loss": 0.644934, + "num_input_tokens_seen": 217055805, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.12799072, + "step": 10080, + "time_per_iteration": 2.45102858543396 + }, + { + "auxiliary_loss_clip": 0.01122576, + "auxiliary_loss_mlp": 0.01028617, + "balance_loss_clip": 1.04659128, + "balance_loss_mlp": 1.01565897, + "epoch": 0.6061025101457989, + "flos": 23002293081600.0, + "grad_norm": 1.8844782071879618, + "language_loss": 0.70793587, + "learning_rate": 1.4183010253110492e-06, + "loss": 0.72944784, + "num_input_tokens_seen": 217074175, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.12969971, + "step": 10081, + "time_per_iteration": 2.4703691005706787 + }, + { + "auxiliary_loss_clip": 0.01115854, + "auxiliary_loss_mlp": 0.01030873, + "balance_loss_clip": 1.04188204, + "balance_loss_mlp": 1.01762867, + "epoch": 0.6061626333984669, + "flos": 29898937134720.0, + "grad_norm": 1.665192672766669, + "language_loss": 0.69477069, + "learning_rate": 1.4179284115488691e-06, + "loss": 0.7162379, + "num_input_tokens_seen": 217095695, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.13244629, + "step": 10082, + "time_per_iteration": 2.5559263229370117 + }, + { + "auxiliary_loss_clip": 0.01124041, + "auxiliary_loss_mlp": 0.0102848, + "balance_loss_clip": 1.04769564, + "balance_loss_mlp": 1.01586723, + "epoch": 0.6062227566511348, + "flos": 25009196745600.0, + "grad_norm": 1.434438906103613, + "language_loss": 0.65989757, + "learning_rate": 1.4175558198589015e-06, + "loss": 0.68142283, + "num_input_tokens_seen": 217116260, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.12628174, + "step": 10083, + "time_per_iteration": 2.4915647506713867 + }, + { + "auxiliary_loss_clip": 0.01117807, + "auxiliary_loss_mlp": 0.01028429, + "balance_loss_clip": 1.04320431, + "balance_loss_mlp": 1.01535797, + "epoch": 0.6062828799038028, + "flos": 19463943104640.0, + "grad_norm": 1.8625852696929097, + "language_loss": 0.74204731, + "learning_rate": 1.4171832502552764e-06, + "loss": 0.76350969, + "num_input_tokens_seen": 217134465, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.13079834, + "step": 10084, + "time_per_iteration": 3.939296007156372 + }, + { + "auxiliary_loss_clip": 0.01128557, + "auxiliary_loss_mlp": 0.01033334, + "balance_loss_clip": 1.05108333, + "balance_loss_mlp": 1.02052546, + "epoch": 0.6063430031564707, + "flos": 13589568120960.0, + "grad_norm": 7.418424580374662, + "language_loss": 0.72419649, + "learning_rate": 1.4168107027521204e-06, + "loss": 0.7458154, + "num_input_tokens_seen": 217149920, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12817383, + "step": 10085, + "time_per_iteration": 2.4451005458831787 + }, + { + "auxiliary_loss_clip": 0.01119784, + "auxiliary_loss_mlp": 0.01040475, + "balance_loss_clip": 1.04344177, + "balance_loss_mlp": 1.02783322, + "epoch": 0.6064031264091387, + "flos": 23255499029760.0, + "grad_norm": 2.576759879140213, + "language_loss": 0.75937587, + "learning_rate": 1.4164381773635605e-06, + "loss": 0.78097844, + "num_input_tokens_seen": 217168165, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.12646484, + "step": 10086, + "time_per_iteration": 2.473006010055542 + }, + { + "auxiliary_loss_clip": 0.01118921, + "auxiliary_loss_mlp": 0.01032934, + "balance_loss_clip": 1.04703772, + "balance_loss_mlp": 1.0205003, + "epoch": 0.6064632496618068, + "flos": 22458462192000.0, + "grad_norm": 1.3614999943951591, + "language_loss": 0.7294507, + "learning_rate": 1.4160656741037246e-06, + "loss": 0.75096929, + "num_input_tokens_seen": 217190070, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.12445068, + "step": 10087, + "time_per_iteration": 2.5126965045928955 + }, + { + "auxiliary_loss_clip": 0.01121334, + "auxiliary_loss_mlp": 0.01027093, + "balance_loss_clip": 1.04712939, + "balance_loss_mlp": 1.01586926, + "epoch": 0.6065233729144747, + "flos": 25118652464640.0, + "grad_norm": 1.8924946745507552, + "language_loss": 0.84018189, + "learning_rate": 1.4156931929867355e-06, + "loss": 0.8616662, + "num_input_tokens_seen": 217209370, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.11224365, + "step": 10088, + "time_per_iteration": 2.4695775508880615 + }, + { + "auxiliary_loss_clip": 0.01122569, + "auxiliary_loss_mlp": 0.01028823, + "balance_loss_clip": 1.04777479, + "balance_loss_mlp": 1.01527441, + "epoch": 0.6065834961671427, + "flos": 23477355383040.0, + "grad_norm": 2.0074970060121213, + "language_loss": 0.71320683, + "learning_rate": 1.4153207340267201e-06, + "loss": 0.73472077, + "num_input_tokens_seen": 217226990, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.13555908, + "step": 10089, + "time_per_iteration": 2.479984998703003 + }, + { + "auxiliary_loss_clip": 0.01124986, + "auxiliary_loss_mlp": 0.01035939, + "balance_loss_clip": 1.04697716, + "balance_loss_mlp": 1.02385688, + "epoch": 0.6066436194198106, + "flos": 17019396132480.0, + "grad_norm": 4.117576735003129, + "language_loss": 0.82895714, + "learning_rate": 1.4149482972378009e-06, + "loss": 0.85056639, + "num_input_tokens_seen": 217244585, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.12084961, + "step": 10090, + "time_per_iteration": 3.909014940261841 + }, + { + "auxiliary_loss_clip": 0.01134095, + "auxiliary_loss_mlp": 0.01037414, + "balance_loss_clip": 1.05276263, + "balance_loss_mlp": 1.02361512, + "epoch": 0.6067037426724786, + "flos": 18514752255360.0, + "grad_norm": 2.079902923550606, + "language_loss": 0.7557922, + "learning_rate": 1.4145758826341e-06, + "loss": 0.77750731, + "num_input_tokens_seen": 217263435, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.13793945, + "step": 10091, + "time_per_iteration": 2.4424338340759277 + }, + { + "auxiliary_loss_clip": 0.01117778, + "auxiliary_loss_mlp": 0.01030924, + "balance_loss_clip": 1.0461359, + "balance_loss_mlp": 1.01832354, + "epoch": 0.6067638659251465, + "flos": 22345989730560.0, + "grad_norm": 1.470926670762009, + "language_loss": 0.79260063, + "learning_rate": 1.4142034902297415e-06, + "loss": 0.81408763, + "num_input_tokens_seen": 217283725, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.1260376, + "step": 10092, + "time_per_iteration": 2.4623501300811768 + }, + { + "auxiliary_loss_clip": 0.0113804, + "auxiliary_loss_mlp": 0.0103596, + "balance_loss_clip": 1.05861545, + "balance_loss_mlp": 1.02249539, + "epoch": 0.6068239891778145, + "flos": 12451019748480.0, + "grad_norm": 1.7327128736760662, + "language_loss": 0.75927085, + "learning_rate": 1.4138311200388444e-06, + "loss": 0.78101087, + "num_input_tokens_seen": 217301120, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.13452148, + "step": 10093, + "time_per_iteration": 2.4251222610473633 + }, + { + "auxiliary_loss_clip": 0.01121109, + "auxiliary_loss_mlp": 0.01029697, + "balance_loss_clip": 1.04944468, + "balance_loss_mlp": 1.01796651, + "epoch": 0.6068841124304825, + "flos": 23185868515200.0, + "grad_norm": 1.8686102066435863, + "language_loss": 0.87390149, + "learning_rate": 1.4134587720755304e-06, + "loss": 0.89540958, + "num_input_tokens_seen": 217319585, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11730957, + "step": 10094, + "time_per_iteration": 2.4630043506622314 + }, + { + "auxiliary_loss_clip": 0.01123552, + "auxiliary_loss_mlp": 0.01028214, + "balance_loss_clip": 1.0498879, + "balance_loss_mlp": 1.01537561, + "epoch": 0.6069442356831505, + "flos": 18587902302720.0, + "grad_norm": 1.6854075752783242, + "language_loss": 0.72497725, + "learning_rate": 1.413086446353919e-06, + "loss": 0.74649501, + "num_input_tokens_seen": 217338880, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.12854004, + "step": 10095, + "time_per_iteration": 2.4548423290252686 + }, + { + "auxiliary_loss_clip": 0.01123014, + "auxiliary_loss_mlp": 0.01032389, + "balance_loss_clip": 1.04807544, + "balance_loss_mlp": 1.0205996, + "epoch": 0.6070043589358184, + "flos": 20960340721920.0, + "grad_norm": 1.598175247088034, + "language_loss": 0.76356483, + "learning_rate": 1.4127141428881273e-06, + "loss": 0.78511882, + "num_input_tokens_seen": 217357480, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11785889, + "step": 10096, + "time_per_iteration": 3.8518149852752686 + }, + { + "auxiliary_loss_clip": 0.01125096, + "auxiliary_loss_mlp": 0.01032971, + "balance_loss_clip": 1.05002785, + "balance_loss_mlp": 1.02074623, + "epoch": 0.6070644821884864, + "flos": 11692443398400.0, + "grad_norm": 2.167433500630119, + "language_loss": 0.79857624, + "learning_rate": 1.4123418616922749e-06, + "loss": 0.82015687, + "num_input_tokens_seen": 217374575, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.12219238, + "step": 10097, + "time_per_iteration": 2.4223554134368896 + }, + { + "auxiliary_loss_clip": 0.01123807, + "auxiliary_loss_mlp": 0.01030119, + "balance_loss_clip": 1.04884636, + "balance_loss_mlp": 1.01821566, + "epoch": 0.6071246054411543, + "flos": 19310568030720.0, + "grad_norm": 1.4516251379560243, + "language_loss": 0.67436892, + "learning_rate": 1.411969602780478e-06, + "loss": 0.69590819, + "num_input_tokens_seen": 217392950, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11907959, + "step": 10098, + "time_per_iteration": 2.652031183242798 + }, + { + "auxiliary_loss_clip": 0.01115515, + "auxiliary_loss_mlp": 0.01028203, + "balance_loss_clip": 1.0422765, + "balance_loss_mlp": 1.01577568, + "epoch": 0.6071847286938223, + "flos": 17749029098880.0, + "grad_norm": 1.9203776347297559, + "language_loss": 0.80331194, + "learning_rate": 1.4115973661668523e-06, + "loss": 0.82474911, + "num_input_tokens_seen": 217412145, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.12432861, + "step": 10099, + "time_per_iteration": 2.4755947589874268 + }, + { + "auxiliary_loss_clip": 0.01126561, + "auxiliary_loss_mlp": 0.01032666, + "balance_loss_clip": 1.04646492, + "balance_loss_mlp": 1.01886737, + "epoch": 0.6072448519464904, + "flos": 22637512512000.0, + "grad_norm": 2.276700077028389, + "language_loss": 0.70927256, + "learning_rate": 1.4112251518655133e-06, + "loss": 0.73086482, + "num_input_tokens_seen": 217432080, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.13793945, + "step": 10100, + "time_per_iteration": 2.4508681297302246 + }, + { + "auxiliary_loss_clip": 0.01126325, + "auxiliary_loss_mlp": 0.0103575, + "balance_loss_clip": 1.05022705, + "balance_loss_mlp": 1.02233255, + "epoch": 0.6073049751991583, + "flos": 19537308633600.0, + "grad_norm": 1.6249299850582963, + "language_loss": 0.70861459, + "learning_rate": 1.4108529598905764e-06, + "loss": 0.73023528, + "num_input_tokens_seen": 217450945, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.13409424, + "step": 10101, + "time_per_iteration": 2.4132964611053467 + }, + { + "auxiliary_loss_clip": 0.01114053, + "auxiliary_loss_mlp": 0.01031309, + "balance_loss_clip": 1.0411315, + "balance_loss_mlp": 1.01808238, + "epoch": 0.6073650984518263, + "flos": 28294233033600.0, + "grad_norm": 2.337417852094522, + "language_loss": 0.6936847, + "learning_rate": 1.410480790256154e-06, + "loss": 0.71513832, + "num_input_tokens_seen": 217473105, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.13232422, + "step": 10102, + "time_per_iteration": 2.560899257659912 + }, + { + "auxiliary_loss_clip": 0.01128186, + "auxiliary_loss_mlp": 0.01032055, + "balance_loss_clip": 1.05065978, + "balance_loss_mlp": 1.01951385, + "epoch": 0.6074252217044942, + "flos": 25664422688640.0, + "grad_norm": 1.9331846023175574, + "language_loss": 0.73899758, + "learning_rate": 1.4101086429763589e-06, + "loss": 0.76060003, + "num_input_tokens_seen": 217491780, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.12548828, + "step": 10103, + "time_per_iteration": 2.473069667816162 + }, + { + "auxiliary_loss_clip": 0.01126696, + "auxiliary_loss_mlp": 0.01034888, + "balance_loss_clip": 1.04924548, + "balance_loss_mlp": 1.02152514, + "epoch": 0.6074853449571622, + "flos": 22857106308480.0, + "grad_norm": 1.9138674411603875, + "language_loss": 0.76548576, + "learning_rate": 1.4097365180653032e-06, + "loss": 0.78710163, + "num_input_tokens_seen": 217510605, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.13366699, + "step": 10104, + "time_per_iteration": 3.9038891792297363 + }, + { + "auxiliary_loss_clip": 0.01044396, + "auxiliary_loss_mlp": 0.01009843, + "balance_loss_clip": 1.01934195, + "balance_loss_mlp": 1.0084064, + "epoch": 0.6075454682098301, + "flos": 67111406547840.0, + "grad_norm": 0.7124877161990825, + "language_loss": 0.560049, + "learning_rate": 1.4093644155370977e-06, + "loss": 0.58059132, + "num_input_tokens_seen": 217574815, + "router_z_loss_clip": 0.25048828, + "router_z_loss_mlp": 0.01435852, + "step": 10105, + "time_per_iteration": 3.1163158416748047 + }, + { + "auxiliary_loss_clip": 0.01073848, + "auxiliary_loss_mlp": 0.0100481, + "balance_loss_clip": 1.04849195, + "balance_loss_mlp": 1.00261462, + "epoch": 0.6076055914624982, + "flos": 70712024751360.0, + "grad_norm": 0.7595912090317076, + "language_loss": 0.56822586, + "learning_rate": 1.4089923354058533e-06, + "loss": 0.58901244, + "num_input_tokens_seen": 217632375, + "router_z_loss_clip": 0.25366211, + "router_z_loss_mlp": 0.02189636, + "step": 10106, + "time_per_iteration": 3.092386484146118 + }, + { + "auxiliary_loss_clip": 0.01121293, + "auxiliary_loss_mlp": 0.01033353, + "balance_loss_clip": 1.04674864, + "balance_loss_mlp": 1.02131915, + "epoch": 0.6076657147151661, + "flos": 28364545906560.0, + "grad_norm": 1.5037425151895953, + "language_loss": 0.68520916, + "learning_rate": 1.4086202776856784e-06, + "loss": 0.70675558, + "num_input_tokens_seen": 217653055, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.12036133, + "step": 10107, + "time_per_iteration": 2.5170845985412598 + }, + { + "auxiliary_loss_clip": 0.01126089, + "auxiliary_loss_mlp": 0.01027685, + "balance_loss_clip": 1.04916835, + "balance_loss_mlp": 1.01523948, + "epoch": 0.6077258379678341, + "flos": 15049767807360.0, + "grad_norm": 3.745119210602944, + "language_loss": 0.8109237, + "learning_rate": 1.4082482423906815e-06, + "loss": 0.83246148, + "num_input_tokens_seen": 217671520, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.12445068, + "step": 10108, + "time_per_iteration": 2.4364941120147705 + }, + { + "auxiliary_loss_clip": 0.01129023, + "auxiliary_loss_mlp": 0.0103319, + "balance_loss_clip": 1.04835105, + "balance_loss_mlp": 1.0199461, + "epoch": 0.607785961220502, + "flos": 36167251553280.0, + "grad_norm": 1.6556718582711347, + "language_loss": 0.71129256, + "learning_rate": 1.4078762295349714e-06, + "loss": 0.73291469, + "num_input_tokens_seen": 217691880, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.13244629, + "step": 10109, + "time_per_iteration": 2.5925498008728027 + }, + { + "auxiliary_loss_clip": 0.01125471, + "auxiliary_loss_mlp": 0.01031706, + "balance_loss_clip": 1.04993701, + "balance_loss_mlp": 1.02050078, + "epoch": 0.60784608447317, + "flos": 22524249951360.0, + "grad_norm": 1.7820513528402215, + "language_loss": 0.80286312, + "learning_rate": 1.407504239132653e-06, + "loss": 0.82443488, + "num_input_tokens_seen": 217710530, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.11199951, + "step": 10110, + "time_per_iteration": 2.4710795879364014 + }, + { + "auxiliary_loss_clip": 0.01123576, + "auxiliary_loss_mlp": 0.01026739, + "balance_loss_clip": 1.04748702, + "balance_loss_mlp": 1.0147047, + "epoch": 0.6079062077258379, + "flos": 23841166285440.0, + "grad_norm": 2.4844160607605343, + "language_loss": 0.7099539, + "learning_rate": 1.4071322711978338e-06, + "loss": 0.73145705, + "num_input_tokens_seen": 217728650, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.12023926, + "step": 10111, + "time_per_iteration": 2.5478708744049072 + }, + { + "auxiliary_loss_clip": 0.01123481, + "auxiliary_loss_mlp": 0.01031114, + "balance_loss_clip": 1.0474689, + "balance_loss_mlp": 1.01880515, + "epoch": 0.6079663309785059, + "flos": 23367037737600.0, + "grad_norm": 1.7578672014400154, + "language_loss": 0.65224385, + "learning_rate": 1.4067603257446186e-06, + "loss": 0.67378974, + "num_input_tokens_seen": 217747135, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.1229248, + "step": 10112, + "time_per_iteration": 2.475498914718628 + }, + { + "auxiliary_loss_clip": 0.01052413, + "auxiliary_loss_mlp": 0.01005568, + "balance_loss_clip": 1.02654862, + "balance_loss_mlp": 1.00414538, + "epoch": 0.6080264542311739, + "flos": 71382873110400.0, + "grad_norm": 0.634365687544819, + "language_loss": 0.49648991, + "learning_rate": 1.4063884027871105e-06, + "loss": 0.51706976, + "num_input_tokens_seen": 217811860, + "router_z_loss_clip": 0.25878906, + "router_z_loss_mlp": 0.01422119, + "step": 10113, + "time_per_iteration": 3.0965120792388916 + }, + { + "auxiliary_loss_clip": 0.01050644, + "auxiliary_loss_mlp": 0.01004429, + "balance_loss_clip": 1.02529681, + "balance_loss_mlp": 1.00309825, + "epoch": 0.6080865774838419, + "flos": 66529833442560.0, + "grad_norm": 0.8363759730758518, + "language_loss": 0.56978321, + "learning_rate": 1.4060165023394147e-06, + "loss": 0.590334, + "num_input_tokens_seen": 217866510, + "router_z_loss_clip": 0.25341797, + "router_z_loss_mlp": 0.01330566, + "step": 10114, + "time_per_iteration": 2.9739830493927 + }, + { + "auxiliary_loss_clip": 0.01123814, + "auxiliary_loss_mlp": 0.01028796, + "balance_loss_clip": 1.04545522, + "balance_loss_mlp": 1.0149498, + "epoch": 0.6081467007365099, + "flos": 19207935895680.0, + "grad_norm": 1.8261324974919946, + "language_loss": 0.70135224, + "learning_rate": 1.4056446244156317e-06, + "loss": 0.72287834, + "num_input_tokens_seen": 217885650, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.13848877, + "step": 10115, + "time_per_iteration": 2.46010160446167 + }, + { + "auxiliary_loss_clip": 0.01121891, + "auxiliary_loss_mlp": 0.01030792, + "balance_loss_clip": 1.04614067, + "balance_loss_mlp": 1.01847219, + "epoch": 0.6082068239891778, + "flos": 24167737762560.0, + "grad_norm": 2.8228923154049665, + "language_loss": 0.72314668, + "learning_rate": 1.4052727690298642e-06, + "loss": 0.74467349, + "num_input_tokens_seen": 217905300, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12316895, + "step": 10116, + "time_per_iteration": 2.448672294616699 + }, + { + "auxiliary_loss_clip": 0.01124762, + "auxiliary_loss_mlp": 0.01036611, + "balance_loss_clip": 1.04854882, + "balance_loss_mlp": 1.02367675, + "epoch": 0.6082669472418458, + "flos": 37413316310400.0, + "grad_norm": 1.8665927754181262, + "language_loss": 0.54206294, + "learning_rate": 1.4049009361962138e-06, + "loss": 0.56367671, + "num_input_tokens_seen": 217927845, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.12921143, + "step": 10117, + "time_per_iteration": 2.588627338409424 + }, + { + "auxiliary_loss_clip": 0.01123017, + "auxiliary_loss_mlp": 0.01027277, + "balance_loss_clip": 1.04880452, + "balance_loss_mlp": 1.01521301, + "epoch": 0.6083270704945137, + "flos": 15085534775040.0, + "grad_norm": 1.823279436166196, + "language_loss": 0.70382529, + "learning_rate": 1.4045291259287786e-06, + "loss": 0.72532821, + "num_input_tokens_seen": 217946145, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.12072754, + "step": 10118, + "time_per_iteration": 2.448626756668091 + }, + { + "auxiliary_loss_clip": 0.0112582, + "auxiliary_loss_mlp": 0.01028936, + "balance_loss_clip": 1.05267203, + "balance_loss_mlp": 1.01675844, + "epoch": 0.6083871937471818, + "flos": 20668458804480.0, + "grad_norm": 1.5023661441908918, + "language_loss": 0.74609172, + "learning_rate": 1.4041573382416588e-06, + "loss": 0.76763928, + "num_input_tokens_seen": 217965190, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.12176514, + "step": 10119, + "time_per_iteration": 2.4675018787384033 + }, + { + "auxiliary_loss_clip": 0.01119547, + "auxiliary_loss_mlp": 0.0103224, + "balance_loss_clip": 1.04527056, + "balance_loss_mlp": 1.02008057, + "epoch": 0.6084473169998497, + "flos": 21506901045120.0, + "grad_norm": 1.7175378507493775, + "language_loss": 0.67695808, + "learning_rate": 1.4037855731489525e-06, + "loss": 0.69847596, + "num_input_tokens_seen": 217983625, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.12145996, + "step": 10120, + "time_per_iteration": 2.4363462924957275 + }, + { + "auxiliary_loss_clip": 0.01133346, + "auxiliary_loss_mlp": 0.01036658, + "balance_loss_clip": 1.05342519, + "balance_loss_mlp": 1.02400374, + "epoch": 0.6085074402525177, + "flos": 26870051710080.0, + "grad_norm": 2.7042555221311617, + "language_loss": 0.74409211, + "learning_rate": 1.4034138306647571e-06, + "loss": 0.76579213, + "num_input_tokens_seen": 218006005, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.12652588, + "step": 10121, + "time_per_iteration": 2.500544548034668 + }, + { + "auxiliary_loss_clip": 0.01119218, + "auxiliary_loss_mlp": 0.01028398, + "balance_loss_clip": 1.04437089, + "balance_loss_mlp": 1.01681089, + "epoch": 0.6085675635051856, + "flos": 10889839952640.0, + "grad_norm": 2.3137694142141334, + "language_loss": 0.80457878, + "learning_rate": 1.4030421108031685e-06, + "loss": 0.82605499, + "num_input_tokens_seen": 218024195, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.11590576, + "step": 10122, + "time_per_iteration": 2.411872148513794 + }, + { + "auxiliary_loss_clip": 0.01121327, + "auxiliary_loss_mlp": 0.01031874, + "balance_loss_clip": 1.04720318, + "balance_loss_mlp": 1.01898146, + "epoch": 0.6086276867578536, + "flos": 34862186707200.0, + "grad_norm": 1.5350611675631671, + "language_loss": 0.55554795, + "learning_rate": 1.402670413578284e-06, + "loss": 0.57708001, + "num_input_tokens_seen": 218047190, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.12902832, + "step": 10123, + "time_per_iteration": 2.561018943786621 + }, + { + "auxiliary_loss_clip": 0.01123044, + "auxiliary_loss_mlp": 0.01035303, + "balance_loss_clip": 1.05079913, + "balance_loss_mlp": 1.02253509, + "epoch": 0.6086878100105215, + "flos": 20047706939520.0, + "grad_norm": 2.1953467070481207, + "language_loss": 0.74142623, + "learning_rate": 1.4022987390041965e-06, + "loss": 0.76300967, + "num_input_tokens_seen": 218065945, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.12780762, + "step": 10124, + "time_per_iteration": 2.4614479541778564 + }, + { + "auxiliary_loss_clip": 0.01123897, + "auxiliary_loss_mlp": 0.01030638, + "balance_loss_clip": 1.04825366, + "balance_loss_mlp": 1.01833546, + "epoch": 0.6087479332631895, + "flos": 18332469711360.0, + "grad_norm": 1.8324347222830353, + "language_loss": 0.65793324, + "learning_rate": 1.4019270870950006e-06, + "loss": 0.67947865, + "num_input_tokens_seen": 218085285, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.1229248, + "step": 10125, + "time_per_iteration": 2.474130153656006 + }, + { + "auxiliary_loss_clip": 0.01124895, + "auxiliary_loss_mlp": 0.01030603, + "balance_loss_clip": 1.05188549, + "balance_loss_mlp": 1.01840222, + "epoch": 0.6088080565158575, + "flos": 24493411399680.0, + "grad_norm": 2.462192661118005, + "language_loss": 0.76368481, + "learning_rate": 1.40155545786479e-06, + "loss": 0.78523982, + "num_input_tokens_seen": 218104735, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.12200928, + "step": 10126, + "time_per_iteration": 2.4835662841796875 + }, + { + "auxiliary_loss_clip": 0.0112705, + "auxiliary_loss_mlp": 0.01029799, + "balance_loss_clip": 1.0498445, + "balance_loss_mlp": 1.01675153, + "epoch": 0.6088681797685255, + "flos": 10269016260480.0, + "grad_norm": 2.8874964708247592, + "language_loss": 0.71116066, + "learning_rate": 1.4011838513276558e-06, + "loss": 0.73272914, + "num_input_tokens_seen": 218121855, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.13049316, + "step": 10127, + "time_per_iteration": 3.8836703300476074 + }, + { + "auxiliary_loss_clip": 0.01135702, + "auxiliary_loss_mlp": 0.01033416, + "balance_loss_clip": 1.05761147, + "balance_loss_mlp": 1.01924777, + "epoch": 0.6089283030211935, + "flos": 21973703218560.0, + "grad_norm": 2.5571495455187097, + "language_loss": 0.72920775, + "learning_rate": 1.400812267497691e-06, + "loss": 0.75089896, + "num_input_tokens_seen": 218137325, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.14160156, + "step": 10128, + "time_per_iteration": 2.4533543586730957 + }, + { + "auxiliary_loss_clip": 0.01121081, + "auxiliary_loss_mlp": 0.01033602, + "balance_loss_clip": 1.04709876, + "balance_loss_mlp": 1.02221799, + "epoch": 0.6089884262738614, + "flos": 17785191116160.0, + "grad_norm": 2.035738280991373, + "language_loss": 0.73370194, + "learning_rate": 1.4004407063889842e-06, + "loss": 0.75524879, + "num_input_tokens_seen": 218155530, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.1138916, + "step": 10129, + "time_per_iteration": 2.4517741203308105 + }, + { + "auxiliary_loss_clip": 0.01126272, + "auxiliary_loss_mlp": 0.01032658, + "balance_loss_clip": 1.05221426, + "balance_loss_mlp": 1.0207969, + "epoch": 0.6090485495265294, + "flos": 36910423946880.0, + "grad_norm": 1.5074656949893799, + "language_loss": 0.65866077, + "learning_rate": 1.400069168015626e-06, + "loss": 0.68025005, + "num_input_tokens_seen": 218182535, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11871338, + "step": 10130, + "time_per_iteration": 2.6735947132110596 + }, + { + "auxiliary_loss_clip": 0.01121548, + "auxiliary_loss_mlp": 0.01028186, + "balance_loss_clip": 1.04859054, + "balance_loss_mlp": 1.01711738, + "epoch": 0.6091086727791973, + "flos": 19899036547200.0, + "grad_norm": 1.84226219140167, + "language_loss": 0.7690388, + "learning_rate": 1.3996976523917054e-06, + "loss": 0.79053611, + "num_input_tokens_seen": 218201740, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11071777, + "step": 10131, + "time_per_iteration": 2.43943190574646 + }, + { + "auxiliary_loss_clip": 0.01121044, + "auxiliary_loss_mlp": 0.01030342, + "balance_loss_clip": 1.04737413, + "balance_loss_mlp": 1.01941013, + "epoch": 0.6091687960318654, + "flos": 22163635359360.0, + "grad_norm": 1.7210280215548464, + "language_loss": 0.77196658, + "learning_rate": 1.3993261595313093e-06, + "loss": 0.7934804, + "num_input_tokens_seen": 218219800, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.10919189, + "step": 10132, + "time_per_iteration": 2.462186336517334 + }, + { + "auxiliary_loss_clip": 0.01112666, + "auxiliary_loss_mlp": 0.01031306, + "balance_loss_clip": 1.04158902, + "balance_loss_mlp": 1.0199753, + "epoch": 0.6092289192845333, + "flos": 21465280160640.0, + "grad_norm": 1.7761176252579138, + "language_loss": 0.75575507, + "learning_rate": 1.3989546894485261e-06, + "loss": 0.7771948, + "num_input_tokens_seen": 218237585, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.11328125, + "step": 10133, + "time_per_iteration": 2.4860222339630127 + }, + { + "auxiliary_loss_clip": 0.01123096, + "auxiliary_loss_mlp": 0.0103496, + "balance_loss_clip": 1.04673409, + "balance_loss_mlp": 1.02167964, + "epoch": 0.6092890425372013, + "flos": 28694924225280.0, + "grad_norm": 1.6986894124323833, + "language_loss": 0.63903499, + "learning_rate": 1.3985832421574414e-06, + "loss": 0.66061556, + "num_input_tokens_seen": 218258700, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.13269043, + "step": 10134, + "time_per_iteration": 2.550708770751953 + }, + { + "auxiliary_loss_clip": 0.0111898, + "auxiliary_loss_mlp": 0.01029944, + "balance_loss_clip": 1.04573643, + "balance_loss_mlp": 1.01791573, + "epoch": 0.6093491657898692, + "flos": 20813178700800.0, + "grad_norm": 1.7843372775520585, + "language_loss": 0.78379309, + "learning_rate": 1.3982118176721397e-06, + "loss": 0.80528235, + "num_input_tokens_seen": 218275655, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.12042236, + "step": 10135, + "time_per_iteration": 3.8377208709716797 + }, + { + "auxiliary_loss_clip": 0.01121598, + "auxiliary_loss_mlp": 0.01026503, + "balance_loss_clip": 1.04633248, + "balance_loss_mlp": 1.01457596, + "epoch": 0.6094092890425372, + "flos": 25446983708160.0, + "grad_norm": 1.7168651695382027, + "language_loss": 0.71994102, + "learning_rate": 1.3978404160067069e-06, + "loss": 0.74142206, + "num_input_tokens_seen": 218295720, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.1192627, + "step": 10136, + "time_per_iteration": 2.4858949184417725 + }, + { + "auxiliary_loss_clip": 0.0111812, + "auxiliary_loss_mlp": 0.01034887, + "balance_loss_clip": 1.0438242, + "balance_loss_mlp": 1.02075458, + "epoch": 0.6094694122952051, + "flos": 35621265847680.0, + "grad_norm": 2.0766598797330706, + "language_loss": 0.74184924, + "learning_rate": 1.3974690371752253e-06, + "loss": 0.76337934, + "num_input_tokens_seen": 218316745, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.14123535, + "step": 10137, + "time_per_iteration": 2.5627899169921875 + }, + { + "auxiliary_loss_clip": 0.01116365, + "auxiliary_loss_mlp": 0.01044494, + "balance_loss_clip": 1.04040527, + "balance_loss_mlp": 1.03041577, + "epoch": 0.6095295355478731, + "flos": 24456962073600.0, + "grad_norm": 2.716828033172678, + "language_loss": 0.8020792, + "learning_rate": 1.3970976811917785e-06, + "loss": 0.82368779, + "num_input_tokens_seen": 218335385, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.14074707, + "step": 10138, + "time_per_iteration": 2.4827463626861572 + }, + { + "auxiliary_loss_clip": 0.01111294, + "auxiliary_loss_mlp": 0.01030825, + "balance_loss_clip": 1.04164565, + "balance_loss_mlp": 1.01952434, + "epoch": 0.6095896588005411, + "flos": 15633208419840.0, + "grad_norm": 1.5930183546441403, + "language_loss": 0.81125927, + "learning_rate": 1.3967263480704481e-06, + "loss": 0.83268046, + "num_input_tokens_seen": 218353320, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.11297607, + "step": 10139, + "time_per_iteration": 2.438420057296753 + }, + { + "auxiliary_loss_clip": 0.0112237, + "auxiliary_loss_mlp": 0.01031576, + "balance_loss_clip": 1.04571962, + "balance_loss_mlp": 1.01889849, + "epoch": 0.6096497820532091, + "flos": 15550577182080.0, + "grad_norm": 2.030425173548611, + "language_loss": 0.83027565, + "learning_rate": 1.396355037825315e-06, + "loss": 0.85181522, + "num_input_tokens_seen": 218365620, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.12670898, + "step": 10140, + "time_per_iteration": 3.7994823455810547 + }, + { + "auxiliary_loss_clip": 0.01128924, + "auxiliary_loss_mlp": 0.0103914, + "balance_loss_clip": 1.04869294, + "balance_loss_mlp": 1.02574682, + "epoch": 0.6097099053058771, + "flos": 24204474397440.0, + "grad_norm": 1.7565933087214909, + "language_loss": 0.75504744, + "learning_rate": 1.3959837504704592e-06, + "loss": 0.77672803, + "num_input_tokens_seen": 218383785, + "router_z_loss_clip": 0.80273438, + "router_z_loss_mlp": 0.1340332, + "step": 10141, + "time_per_iteration": 2.4718751907348633 + }, + { + "auxiliary_loss_clip": 0.01120989, + "auxiliary_loss_mlp": 0.0102908, + "balance_loss_clip": 1.04723668, + "balance_loss_mlp": 1.01701641, + "epoch": 0.609770028558545, + "flos": 19570238426880.0, + "grad_norm": 1.8734326004232085, + "language_loss": 0.76278335, + "learning_rate": 1.3956124860199603e-06, + "loss": 0.784284, + "num_input_tokens_seen": 218399055, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.1206665, + "step": 10142, + "time_per_iteration": 2.4376258850097656 + }, + { + "auxiliary_loss_clip": 0.01113936, + "auxiliary_loss_mlp": 0.01029216, + "balance_loss_clip": 1.04075551, + "balance_loss_mlp": 1.0156858, + "epoch": 0.609830151811213, + "flos": 23949185460480.0, + "grad_norm": 1.7449367359258932, + "language_loss": 0.76811808, + "learning_rate": 1.3952412444878964e-06, + "loss": 0.78954959, + "num_input_tokens_seen": 218419120, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.13513184, + "step": 10143, + "time_per_iteration": 2.51636004447937 + }, + { + "auxiliary_loss_clip": 0.01120576, + "auxiliary_loss_mlp": 0.01032474, + "balance_loss_clip": 1.04618144, + "balance_loss_mlp": 1.0188427, + "epoch": 0.6098902750638809, + "flos": 16179732829440.0, + "grad_norm": 1.8020391209448416, + "language_loss": 0.75168478, + "learning_rate": 1.3948700258883448e-06, + "loss": 0.77321529, + "num_input_tokens_seen": 218435290, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.13641357, + "step": 10144, + "time_per_iteration": 2.412631034851074 + }, + { + "auxiliary_loss_clip": 0.01121483, + "auxiliary_loss_mlp": 0.01030113, + "balance_loss_clip": 1.0464319, + "balance_loss_mlp": 1.01750088, + "epoch": 0.609950398316549, + "flos": 44526393763200.0, + "grad_norm": 1.7498164549392525, + "language_loss": 0.7287119, + "learning_rate": 1.394498830235383e-06, + "loss": 0.75022781, + "num_input_tokens_seen": 218457880, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.1262207, + "step": 10145, + "time_per_iteration": 2.6608493328094482 + }, + { + "auxiliary_loss_clip": 0.01121743, + "auxiliary_loss_mlp": 0.01031474, + "balance_loss_clip": 1.04715848, + "balance_loss_mlp": 1.01938653, + "epoch": 0.6100105215692169, + "flos": 23221743223680.0, + "grad_norm": 1.86289076257806, + "language_loss": 0.69238526, + "learning_rate": 1.3941276575430862e-06, + "loss": 0.71391737, + "num_input_tokens_seen": 218475930, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.12091064, + "step": 10146, + "time_per_iteration": 2.476546287536621 + }, + { + "auxiliary_loss_clip": 0.01124415, + "auxiliary_loss_mlp": 0.01026618, + "balance_loss_clip": 1.05297279, + "balance_loss_mlp": 1.01573396, + "epoch": 0.6100706448218849, + "flos": 15012564295680.0, + "grad_norm": 1.6842628878807322, + "language_loss": 0.76667607, + "learning_rate": 1.3937565078255289e-06, + "loss": 0.78818637, + "num_input_tokens_seen": 218493675, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.10882568, + "step": 10147, + "time_per_iteration": 3.945464611053467 + }, + { + "auxiliary_loss_clip": 0.01111645, + "auxiliary_loss_mlp": 0.01027279, + "balance_loss_clip": 1.03870797, + "balance_loss_mlp": 1.01488733, + "epoch": 0.6101307680745528, + "flos": 19639976682240.0, + "grad_norm": 1.910561019987053, + "language_loss": 0.78141534, + "learning_rate": 1.393385381096786e-06, + "loss": 0.80280459, + "num_input_tokens_seen": 218511780, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.12390137, + "step": 10148, + "time_per_iteration": 2.5601003170013428 + }, + { + "auxiliary_loss_clip": 0.01128928, + "auxiliary_loss_mlp": 0.01041042, + "balance_loss_clip": 1.05055141, + "balance_loss_mlp": 1.02544892, + "epoch": 0.6101908913272208, + "flos": 29935566028800.0, + "grad_norm": 2.0975491274512836, + "language_loss": 0.53921956, + "learning_rate": 1.39301427737093e-06, + "loss": 0.56091923, + "num_input_tokens_seen": 218531850, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.15576172, + "step": 10149, + "time_per_iteration": 2.5645954608917236 + }, + { + "auxiliary_loss_clip": 0.01111137, + "auxiliary_loss_mlp": 0.01036621, + "balance_loss_clip": 1.04142809, + "balance_loss_mlp": 1.02380586, + "epoch": 0.6102510145798887, + "flos": 21798639308160.0, + "grad_norm": 2.60979847619333, + "language_loss": 0.80620831, + "learning_rate": 1.3926431966620333e-06, + "loss": 0.82768589, + "num_input_tokens_seen": 218551245, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.1282959, + "step": 10150, + "time_per_iteration": 2.5301148891448975 + }, + { + "auxiliary_loss_clip": 0.01128358, + "auxiliary_loss_mlp": 0.01038363, + "balance_loss_clip": 1.05060482, + "balance_loss_mlp": 1.02468419, + "epoch": 0.6103111378325567, + "flos": 20706129192960.0, + "grad_norm": 1.5498374165091782, + "language_loss": 0.69051909, + "learning_rate": 1.3922721389841684e-06, + "loss": 0.71218628, + "num_input_tokens_seen": 218571365, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.13677979, + "step": 10151, + "time_per_iteration": 2.5104775428771973 + }, + { + "auxiliary_loss_clip": 0.01116945, + "auxiliary_loss_mlp": 0.01027052, + "balance_loss_clip": 1.04408038, + "balance_loss_mlp": 1.01558995, + "epoch": 0.6103712610852247, + "flos": 29381643417600.0, + "grad_norm": 2.02570394683823, + "language_loss": 0.7155453, + "learning_rate": 1.3919011043514036e-06, + "loss": 0.73698533, + "num_input_tokens_seen": 218588315, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11468506, + "step": 10152, + "time_per_iteration": 2.5305018424987793 + }, + { + "auxiliary_loss_clip": 0.01123818, + "auxiliary_loss_mlp": 0.01031891, + "balance_loss_clip": 1.04714036, + "balance_loss_mlp": 1.01930213, + "epoch": 0.6104313843378927, + "flos": 20813035046400.0, + "grad_norm": 2.0872574307422136, + "language_loss": 0.78527224, + "learning_rate": 1.391530092777811e-06, + "loss": 0.80682933, + "num_input_tokens_seen": 218605940, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.12591553, + "step": 10153, + "time_per_iteration": 2.4294323921203613 + }, + { + "auxiliary_loss_clip": 0.01118817, + "auxiliary_loss_mlp": 0.01031725, + "balance_loss_clip": 1.04269862, + "balance_loss_mlp": 1.01948822, + "epoch": 0.6104915075905607, + "flos": 26578457101440.0, + "grad_norm": 1.7527200973068298, + "language_loss": 0.7931568, + "learning_rate": 1.3911591042774573e-06, + "loss": 0.81466222, + "num_input_tokens_seen": 218626100, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.12237549, + "step": 10154, + "time_per_iteration": 2.5196306705474854 + }, + { + "auxiliary_loss_clip": 0.01122497, + "auxiliary_loss_mlp": 0.01029057, + "balance_loss_clip": 1.05044365, + "balance_loss_mlp": 1.01750588, + "epoch": 0.6105516308432286, + "flos": 23915788790400.0, + "grad_norm": 1.7526673928275263, + "language_loss": 0.70476633, + "learning_rate": 1.3907881388644116e-06, + "loss": 0.72628188, + "num_input_tokens_seen": 218645060, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11560059, + "step": 10155, + "time_per_iteration": 2.4965174198150635 + }, + { + "auxiliary_loss_clip": 0.01113132, + "auxiliary_loss_mlp": 0.0103086, + "balance_loss_clip": 1.04021776, + "balance_loss_mlp": 1.01752627, + "epoch": 0.6106117540958966, + "flos": 31577365900800.0, + "grad_norm": 1.6006156236240758, + "language_loss": 0.71591628, + "learning_rate": 1.3904171965527413e-06, + "loss": 0.73735619, + "num_input_tokens_seen": 218667690, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.13323975, + "step": 10156, + "time_per_iteration": 2.587231397628784 + }, + { + "auxiliary_loss_clip": 0.01111561, + "auxiliary_loss_mlp": 0.01036505, + "balance_loss_clip": 1.04141164, + "balance_loss_mlp": 1.02272463, + "epoch": 0.6106718773485645, + "flos": 19608160210560.0, + "grad_norm": 1.6164954790727881, + "language_loss": 0.67508864, + "learning_rate": 1.3900462773565114e-06, + "loss": 0.69656932, + "num_input_tokens_seen": 218687505, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.13781738, + "step": 10157, + "time_per_iteration": 2.466898202896118 + }, + { + "auxiliary_loss_clip": 0.0111412, + "auxiliary_loss_mlp": 0.01026038, + "balance_loss_clip": 1.03977156, + "balance_loss_mlp": 1.01375341, + "epoch": 0.6107320006012326, + "flos": 17123895774720.0, + "grad_norm": 2.0060916629092427, + "language_loss": 0.72336727, + "learning_rate": 1.3896753812897877e-06, + "loss": 0.74476886, + "num_input_tokens_seen": 218705315, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.1227417, + "step": 10158, + "time_per_iteration": 2.464508533477783 + }, + { + "auxiliary_loss_clip": 0.01119281, + "auxiliary_loss_mlp": 0.01036984, + "balance_loss_clip": 1.04266417, + "balance_loss_mlp": 1.02478933, + "epoch": 0.6107921238539005, + "flos": 30148228500480.0, + "grad_norm": 1.646357798016588, + "language_loss": 0.69298172, + "learning_rate": 1.389304508366635e-06, + "loss": 0.7145443, + "num_input_tokens_seen": 218725735, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.12200928, + "step": 10159, + "time_per_iteration": 2.54803204536438 + }, + { + "auxiliary_loss_clip": 0.01120658, + "auxiliary_loss_mlp": 0.01035852, + "balance_loss_clip": 1.04519701, + "balance_loss_mlp": 1.02222693, + "epoch": 0.6108522471065685, + "flos": 18440273404800.0, + "grad_norm": 2.204670385553104, + "language_loss": 0.79328245, + "learning_rate": 1.3889336586011167e-06, + "loss": 0.81484759, + "num_input_tokens_seen": 218743215, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.1362915, + "step": 10160, + "time_per_iteration": 2.4255435466766357 + }, + { + "auxiliary_loss_clip": 0.01053168, + "auxiliary_loss_mlp": 0.01001503, + "balance_loss_clip": 1.02750945, + "balance_loss_mlp": 1.00022769, + "epoch": 0.6109123703592364, + "flos": 64135454791680.0, + "grad_norm": 0.8254759842449423, + "language_loss": 0.61526513, + "learning_rate": 1.388562832007295e-06, + "loss": 0.63581181, + "num_input_tokens_seen": 218806440, + "router_z_loss_clip": 0.25683594, + "router_z_loss_mlp": 0.01275635, + "step": 10161, + "time_per_iteration": 3.2084054946899414 + }, + { + "auxiliary_loss_clip": 0.0112409, + "auxiliary_loss_mlp": 0.01034589, + "balance_loss_clip": 1.04735875, + "balance_loss_mlp": 1.02161908, + "epoch": 0.6109724936119044, + "flos": 20667848273280.0, + "grad_norm": 1.814692518845057, + "language_loss": 0.76214463, + "learning_rate": 1.3881920285992324e-06, + "loss": 0.78373146, + "num_input_tokens_seen": 218825720, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.12982178, + "step": 10162, + "time_per_iteration": 2.455108165740967 + }, + { + "auxiliary_loss_clip": 0.01119069, + "auxiliary_loss_mlp": 0.0103098, + "balance_loss_clip": 1.04588282, + "balance_loss_mlp": 1.01832569, + "epoch": 0.6110326168645723, + "flos": 31351882273920.0, + "grad_norm": 1.819694030712319, + "language_loss": 0.71933323, + "learning_rate": 1.3878212483909888e-06, + "loss": 0.74083376, + "num_input_tokens_seen": 218847735, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.12646484, + "step": 10163, + "time_per_iteration": 2.511780261993408 + }, + { + "auxiliary_loss_clip": 0.01116681, + "auxiliary_loss_mlp": 0.01029887, + "balance_loss_clip": 1.04569101, + "balance_loss_mlp": 1.01840711, + "epoch": 0.6110927401172404, + "flos": 25003378742400.0, + "grad_norm": 21.83988980959813, + "language_loss": 0.59843332, + "learning_rate": 1.387450491396625e-06, + "loss": 0.61989897, + "num_input_tokens_seen": 218866585, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11486816, + "step": 10164, + "time_per_iteration": 2.5091283321380615 + }, + { + "auxiliary_loss_clip": 0.01113364, + "auxiliary_loss_mlp": 0.0102991, + "balance_loss_clip": 1.03991628, + "balance_loss_mlp": 1.01817429, + "epoch": 0.6111528633699083, + "flos": 26248078782720.0, + "grad_norm": 1.634073484915351, + "language_loss": 0.76094437, + "learning_rate": 1.3870797576302003e-06, + "loss": 0.78237712, + "num_input_tokens_seen": 218885560, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11737061, + "step": 10165, + "time_per_iteration": 2.509732961654663 + }, + { + "auxiliary_loss_clip": 0.01114742, + "auxiliary_loss_mlp": 0.01028613, + "balance_loss_clip": 1.04443598, + "balance_loss_mlp": 1.01678193, + "epoch": 0.6112129866225763, + "flos": 22382474970240.0, + "grad_norm": 1.602844404719217, + "language_loss": 0.7959286, + "learning_rate": 1.3867090471057719e-06, + "loss": 0.81736213, + "num_input_tokens_seen": 218905055, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.1184082, + "step": 10166, + "time_per_iteration": 2.487733840942383 + }, + { + "auxiliary_loss_clip": 0.01122557, + "auxiliary_loss_mlp": 0.01036042, + "balance_loss_clip": 1.04547644, + "balance_loss_mlp": 1.02235651, + "epoch": 0.6112731098752443, + "flos": 25227892702080.0, + "grad_norm": 1.867889678741025, + "language_loss": 0.6784814, + "learning_rate": 1.3863383598373987e-06, + "loss": 0.7000674, + "num_input_tokens_seen": 218924030, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.13696289, + "step": 10167, + "time_per_iteration": 2.452655792236328 + }, + { + "auxiliary_loss_clip": 0.01119616, + "auxiliary_loss_mlp": 0.01040156, + "balance_loss_clip": 1.0454241, + "balance_loss_mlp": 1.02777624, + "epoch": 0.6113332331279122, + "flos": 22893160584960.0, + "grad_norm": 1.9258059174367668, + "language_loss": 0.78795326, + "learning_rate": 1.3859676958391364e-06, + "loss": 0.809551, + "num_input_tokens_seen": 218943750, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.12384033, + "step": 10168, + "time_per_iteration": 2.4614052772521973 + }, + { + "auxiliary_loss_clip": 0.01136776, + "auxiliary_loss_mlp": 0.01038082, + "balance_loss_clip": 1.05237103, + "balance_loss_mlp": 1.02372956, + "epoch": 0.6113933563805802, + "flos": 18620329305600.0, + "grad_norm": 2.2123388142606357, + "language_loss": 0.85581946, + "learning_rate": 1.3855970551250398e-06, + "loss": 0.87756801, + "num_input_tokens_seen": 218957585, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.14367676, + "step": 10169, + "time_per_iteration": 2.3999011516571045 + }, + { + "auxiliary_loss_clip": 0.01115108, + "auxiliary_loss_mlp": 0.01029052, + "balance_loss_clip": 1.04383516, + "balance_loss_mlp": 1.01822209, + "epoch": 0.6114534796332481, + "flos": 41866275317760.0, + "grad_norm": 1.6115559851723162, + "language_loss": 0.78807199, + "learning_rate": 1.3852264377091652e-06, + "loss": 0.80951357, + "num_input_tokens_seen": 218980025, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.10827637, + "step": 10170, + "time_per_iteration": 4.049155235290527 + }, + { + "auxiliary_loss_clip": 0.01121351, + "auxiliary_loss_mlp": 0.01039827, + "balance_loss_clip": 1.04327106, + "balance_loss_mlp": 1.02589202, + "epoch": 0.6115136028859162, + "flos": 21908454163200.0, + "grad_norm": 2.3120362528150555, + "language_loss": 0.68563509, + "learning_rate": 1.3848558436055651e-06, + "loss": 0.7072469, + "num_input_tokens_seen": 218998200, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.1394043, + "step": 10171, + "time_per_iteration": 2.4576501846313477 + }, + { + "auxiliary_loss_clip": 0.01123558, + "auxiliary_loss_mlp": 0.01033096, + "balance_loss_clip": 1.04821074, + "balance_loss_mlp": 1.01944649, + "epoch": 0.6115737261385841, + "flos": 28804846821120.0, + "grad_norm": 1.543040869269081, + "language_loss": 0.79004925, + "learning_rate": 1.3844852728282934e-06, + "loss": 0.81161582, + "num_input_tokens_seen": 219017910, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.13671875, + "step": 10172, + "time_per_iteration": 2.5490050315856934 + }, + { + "auxiliary_loss_clip": 0.01128718, + "auxiliary_loss_mlp": 0.01038843, + "balance_loss_clip": 1.0477941, + "balance_loss_mlp": 1.02410257, + "epoch": 0.6116338493912521, + "flos": 21251468453760.0, + "grad_norm": 1.7832707703834232, + "language_loss": 0.66719252, + "learning_rate": 1.3841147253914022e-06, + "loss": 0.68886817, + "num_input_tokens_seen": 219037730, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.14733887, + "step": 10173, + "time_per_iteration": 2.4398417472839355 + }, + { + "auxiliary_loss_clip": 0.0112346, + "auxiliary_loss_mlp": 0.01038445, + "balance_loss_clip": 1.0477376, + "balance_loss_mlp": 1.02461076, + "epoch": 0.61169397264392, + "flos": 17530189488000.0, + "grad_norm": 2.400021732420228, + "language_loss": 0.55782598, + "learning_rate": 1.3837442013089416e-06, + "loss": 0.57944506, + "num_input_tokens_seen": 219056755, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.13812256, + "step": 10174, + "time_per_iteration": 2.4525198936462402 + }, + { + "auxiliary_loss_clip": 0.01131446, + "auxiliary_loss_mlp": 0.01032114, + "balance_loss_clip": 1.05521035, + "balance_loss_mlp": 1.01997876, + "epoch": 0.611754095896588, + "flos": 23951555758080.0, + "grad_norm": 2.2225944843553536, + "language_loss": 0.66239536, + "learning_rate": 1.3833737005949628e-06, + "loss": 0.68403101, + "num_input_tokens_seen": 219076985, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.12139893, + "step": 10175, + "time_per_iteration": 2.4775664806365967 + }, + { + "auxiliary_loss_clip": 0.01113214, + "auxiliary_loss_mlp": 0.0103314, + "balance_loss_clip": 1.03908825, + "balance_loss_mlp": 1.02031898, + "epoch": 0.6118142191492559, + "flos": 25994872834560.0, + "grad_norm": 2.0061363410896083, + "language_loss": 0.82589126, + "learning_rate": 1.3830032232635154e-06, + "loss": 0.84735483, + "num_input_tokens_seen": 219096050, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.12817383, + "step": 10176, + "time_per_iteration": 2.539154291152954 + }, + { + "auxiliary_loss_clip": 0.01120691, + "auxiliary_loss_mlp": 0.01036845, + "balance_loss_clip": 1.04621077, + "balance_loss_mlp": 1.02348208, + "epoch": 0.611874342401924, + "flos": 24603190341120.0, + "grad_norm": 3.6290730580987063, + "language_loss": 0.7741549, + "learning_rate": 1.3826327693286474e-06, + "loss": 0.79573023, + "num_input_tokens_seen": 219112665, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.13372803, + "step": 10177, + "time_per_iteration": 2.482393503189087 + }, + { + "auxiliary_loss_clip": 0.01125564, + "auxiliary_loss_mlp": 0.01033625, + "balance_loss_clip": 1.04787612, + "balance_loss_mlp": 1.02093494, + "epoch": 0.6119344656545919, + "flos": 15887132640000.0, + "grad_norm": 2.3017787904076763, + "language_loss": 0.75670689, + "learning_rate": 1.3822623388044065e-06, + "loss": 0.77829874, + "num_input_tokens_seen": 219129120, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.12695312, + "step": 10178, + "time_per_iteration": 3.9771106243133545 + }, + { + "auxiliary_loss_clip": 0.01128828, + "auxiliary_loss_mlp": 0.01038185, + "balance_loss_clip": 1.0543046, + "balance_loss_mlp": 1.02522123, + "epoch": 0.6119945889072599, + "flos": 21652877917440.0, + "grad_norm": 1.956188642095806, + "language_loss": 0.67220342, + "learning_rate": 1.3818919317048402e-06, + "loss": 0.69387364, + "num_input_tokens_seen": 219148950, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.12957764, + "step": 10179, + "time_per_iteration": 2.4813480377197266 + }, + { + "auxiliary_loss_clip": 0.01126304, + "auxiliary_loss_mlp": 0.01033654, + "balance_loss_clip": 1.05043864, + "balance_loss_mlp": 1.02166164, + "epoch": 0.6120547121599279, + "flos": 13772533023360.0, + "grad_norm": 2.282956784720403, + "language_loss": 0.8364026, + "learning_rate": 1.3815215480439933e-06, + "loss": 0.85800219, + "num_input_tokens_seen": 219165585, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.11987305, + "step": 10180, + "time_per_iteration": 2.4178853034973145 + }, + { + "auxiliary_loss_clip": 0.01123283, + "auxiliary_loss_mlp": 0.0103131, + "balance_loss_clip": 1.05037379, + "balance_loss_mlp": 1.01792276, + "epoch": 0.6121148354125958, + "flos": 20079164275200.0, + "grad_norm": 1.6496322371456316, + "language_loss": 0.77528107, + "learning_rate": 1.3811511878359113e-06, + "loss": 0.79682696, + "num_input_tokens_seen": 219183280, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.13397217, + "step": 10181, + "time_per_iteration": 2.451483726501465 + }, + { + "auxiliary_loss_clip": 0.01125837, + "auxiliary_loss_mlp": 0.01043786, + "balance_loss_clip": 1.04611635, + "balance_loss_mlp": 1.03029203, + "epoch": 0.6121749586652638, + "flos": 13471313569920.0, + "grad_norm": 1.9781328464233352, + "language_loss": 0.80760956, + "learning_rate": 1.3807808510946384e-06, + "loss": 0.82930577, + "num_input_tokens_seen": 219197200, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.1348877, + "step": 10182, + "time_per_iteration": 2.3913190364837646 + }, + { + "auxiliary_loss_clip": 0.01121021, + "auxiliary_loss_mlp": 0.01027377, + "balance_loss_clip": 1.04913306, + "balance_loss_mlp": 1.0174644, + "epoch": 0.6122350819179317, + "flos": 20120533764480.0, + "grad_norm": 1.5428447257917526, + "language_loss": 0.82998538, + "learning_rate": 1.3804105378342177e-06, + "loss": 0.85146934, + "num_input_tokens_seen": 219216825, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.09906006, + "step": 10183, + "time_per_iteration": 3.913701057434082 + }, + { + "auxiliary_loss_clip": 0.01052584, + "auxiliary_loss_mlp": 0.01007202, + "balance_loss_clip": 1.02727771, + "balance_loss_mlp": 1.00584412, + "epoch": 0.6122952051705998, + "flos": 65429242767360.0, + "grad_norm": 0.7112211717015267, + "language_loss": 0.62798142, + "learning_rate": 1.3800402480686914e-06, + "loss": 0.64857936, + "num_input_tokens_seen": 219283795, + "router_z_loss_clip": 0.25292969, + "router_z_loss_mlp": 0.01356506, + "step": 10184, + "time_per_iteration": 3.1780924797058105 + }, + { + "auxiliary_loss_clip": 0.0112851, + "auxiliary_loss_mlp": 0.01032018, + "balance_loss_clip": 1.05526102, + "balance_loss_mlp": 1.02038324, + "epoch": 0.6123553284232677, + "flos": 20376253664640.0, + "grad_norm": 1.7731910764416094, + "language_loss": 0.8236829, + "learning_rate": 1.379669981812101e-06, + "loss": 0.84528816, + "num_input_tokens_seen": 219302385, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11645508, + "step": 10185, + "time_per_iteration": 2.4454281330108643 + }, + { + "auxiliary_loss_clip": 0.01128267, + "auxiliary_loss_mlp": 0.01029903, + "balance_loss_clip": 1.05043304, + "balance_loss_mlp": 1.01667738, + "epoch": 0.6124154516759357, + "flos": 23987645948160.0, + "grad_norm": 1.953254885029787, + "language_loss": 0.74775308, + "learning_rate": 1.3792997390784868e-06, + "loss": 0.76933479, + "num_input_tokens_seen": 219319765, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.13220215, + "step": 10186, + "time_per_iteration": 2.4602608680725098 + }, + { + "auxiliary_loss_clip": 0.01117802, + "auxiliary_loss_mlp": 0.01027429, + "balance_loss_clip": 1.0438081, + "balance_loss_mlp": 1.01596689, + "epoch": 0.6124755749286036, + "flos": 21468799693440.0, + "grad_norm": 1.628897731836686, + "language_loss": 0.78495908, + "learning_rate": 1.3789295198818895e-06, + "loss": 0.80641139, + "num_input_tokens_seen": 219337440, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.11456299, + "step": 10187, + "time_per_iteration": 2.468121290206909 + }, + { + "auxiliary_loss_clip": 0.01125243, + "auxiliary_loss_mlp": 0.01026626, + "balance_loss_clip": 1.05157375, + "balance_loss_mlp": 1.01390076, + "epoch": 0.6125356981812716, + "flos": 23879195809920.0, + "grad_norm": 1.6435084699901314, + "language_loss": 0.83285308, + "learning_rate": 1.3785593242363462e-06, + "loss": 0.85437173, + "num_input_tokens_seen": 219357525, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.1272583, + "step": 10188, + "time_per_iteration": 2.4555647373199463 + }, + { + "auxiliary_loss_clip": 0.01124644, + "auxiliary_loss_mlp": 0.0103236, + "balance_loss_clip": 1.0494287, + "balance_loss_mlp": 1.01949131, + "epoch": 0.6125958214339395, + "flos": 14425604150400.0, + "grad_norm": 1.9227351229335636, + "language_loss": 0.75586683, + "learning_rate": 1.378189152155896e-06, + "loss": 0.77743679, + "num_input_tokens_seen": 219374855, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.12860107, + "step": 10189, + "time_per_iteration": 2.4260072708129883 + }, + { + "auxiliary_loss_clip": 0.01121289, + "auxiliary_loss_mlp": 0.01027946, + "balance_loss_clip": 1.04589248, + "balance_loss_mlp": 1.01578033, + "epoch": 0.6126559446866076, + "flos": 23259090389760.0, + "grad_norm": 1.5229062214922975, + "language_loss": 0.74373233, + "learning_rate": 1.3778190036545758e-06, + "loss": 0.7652247, + "num_input_tokens_seen": 219394740, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.12164307, + "step": 10190, + "time_per_iteration": 2.4975733757019043 + }, + { + "auxiliary_loss_clip": 0.01118344, + "auxiliary_loss_mlp": 0.01031613, + "balance_loss_clip": 1.04389465, + "balance_loss_mlp": 1.01827347, + "epoch": 0.6127160679392755, + "flos": 26864808324480.0, + "grad_norm": 1.7479565626926983, + "language_loss": 0.68269265, + "learning_rate": 1.3774488787464207e-06, + "loss": 0.70419222, + "num_input_tokens_seen": 219413755, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.13336182, + "step": 10191, + "time_per_iteration": 4.028519630432129 + }, + { + "auxiliary_loss_clip": 0.01117981, + "auxiliary_loss_mlp": 0.01039778, + "balance_loss_clip": 1.04155028, + "balance_loss_mlp": 1.02583623, + "epoch": 0.6127761911919435, + "flos": 26396425952640.0, + "grad_norm": 2.0640660029168236, + "language_loss": 0.73819077, + "learning_rate": 1.377078777445467e-06, + "loss": 0.75976837, + "num_input_tokens_seen": 219433560, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.13928223, + "step": 10192, + "time_per_iteration": 2.581965684890747 + }, + { + "auxiliary_loss_clip": 0.01118632, + "auxiliary_loss_mlp": 0.01028192, + "balance_loss_clip": 1.04694271, + "balance_loss_mlp": 1.01631904, + "epoch": 0.6128363144446115, + "flos": 22634747164800.0, + "grad_norm": 1.9781054186919473, + "language_loss": 0.83528495, + "learning_rate": 1.3767086997657478e-06, + "loss": 0.85675323, + "num_input_tokens_seen": 219452640, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11883545, + "step": 10193, + "time_per_iteration": 2.589472532272339 + }, + { + "auxiliary_loss_clip": 0.01121614, + "auxiliary_loss_mlp": 0.0103139, + "balance_loss_clip": 1.04479051, + "balance_loss_mlp": 1.01888514, + "epoch": 0.6128964376972794, + "flos": 26759051706240.0, + "grad_norm": 2.4669824015233575, + "language_loss": 0.70454431, + "learning_rate": 1.3763386457212979e-06, + "loss": 0.72607434, + "num_input_tokens_seen": 219468585, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.12512207, + "step": 10194, + "time_per_iteration": 2.5767104625701904 + }, + { + "auxiliary_loss_clip": 0.01049851, + "auxiliary_loss_mlp": 0.01005022, + "balance_loss_clip": 1.02394223, + "balance_loss_mlp": 1.00366426, + "epoch": 0.6129565609499474, + "flos": 65567929178880.0, + "grad_norm": 0.8263076784617684, + "language_loss": 0.58709621, + "learning_rate": 1.375968615326149e-06, + "loss": 0.60764492, + "num_input_tokens_seen": 219523015, + "router_z_loss_clip": 0.25878906, + "router_z_loss_mlp": 0.01358032, + "step": 10195, + "time_per_iteration": 2.8543355464935303 + }, + { + "auxiliary_loss_clip": 0.01119071, + "auxiliary_loss_mlp": 0.01036835, + "balance_loss_clip": 1.04536915, + "balance_loss_mlp": 1.02250028, + "epoch": 0.6130166842026153, + "flos": 16362087200640.0, + "grad_norm": 2.1890595851695016, + "language_loss": 0.69484353, + "learning_rate": 1.3755986085943324e-06, + "loss": 0.71640265, + "num_input_tokens_seen": 219539980, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.14318848, + "step": 10196, + "time_per_iteration": 2.5041470527648926 + }, + { + "auxiliary_loss_clip": 0.01123548, + "auxiliary_loss_mlp": 0.01038708, + "balance_loss_clip": 1.04650021, + "balance_loss_mlp": 1.02573204, + "epoch": 0.6130768074552834, + "flos": 23652455207040.0, + "grad_norm": 1.6939376901742595, + "language_loss": 0.71310931, + "learning_rate": 1.3752286255398788e-06, + "loss": 0.73473191, + "num_input_tokens_seen": 219556980, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.12982178, + "step": 10197, + "time_per_iteration": 2.519794225692749 + }, + { + "auxiliary_loss_clip": 0.01121918, + "auxiliary_loss_mlp": 0.01034267, + "balance_loss_clip": 1.04459584, + "balance_loss_mlp": 1.02164912, + "epoch": 0.6131369307079513, + "flos": 20047455544320.0, + "grad_norm": 3.3561442301503375, + "language_loss": 0.79104018, + "learning_rate": 1.3748586661768191e-06, + "loss": 0.81260204, + "num_input_tokens_seen": 219576410, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.12628174, + "step": 10198, + "time_per_iteration": 2.4419736862182617 + }, + { + "auxiliary_loss_clip": 0.01122903, + "auxiliary_loss_mlp": 0.01034727, + "balance_loss_clip": 1.04500008, + "balance_loss_mlp": 1.02148914, + "epoch": 0.6131970539606193, + "flos": 22672166158080.0, + "grad_norm": 1.4663121149723375, + "language_loss": 0.74140954, + "learning_rate": 1.374488730519181e-06, + "loss": 0.76298583, + "num_input_tokens_seen": 219597180, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.13250732, + "step": 10199, + "time_per_iteration": 2.4652023315429688 + }, + { + "auxiliary_loss_clip": 0.01118561, + "auxiliary_loss_mlp": 0.01044752, + "balance_loss_clip": 1.04152346, + "balance_loss_mlp": 1.02907038, + "epoch": 0.6132571772132872, + "flos": 26870913636480.0, + "grad_norm": 2.215723211269666, + "language_loss": 0.62080276, + "learning_rate": 1.374118818580993e-06, + "loss": 0.64243585, + "num_input_tokens_seen": 219617630, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.15679932, + "step": 10200, + "time_per_iteration": 2.5152246952056885 + }, + { + "auxiliary_loss_clip": 0.01124039, + "auxiliary_loss_mlp": 0.01035458, + "balance_loss_clip": 1.0477854, + "balance_loss_mlp": 1.02215409, + "epoch": 0.6133173004659552, + "flos": 22892657794560.0, + "grad_norm": 1.9871750912803345, + "language_loss": 0.68719083, + "learning_rate": 1.3737489303762822e-06, + "loss": 0.70878577, + "num_input_tokens_seen": 219637025, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.13305664, + "step": 10201, + "time_per_iteration": 2.4473016262054443 + }, + { + "auxiliary_loss_clip": 0.01121143, + "auxiliary_loss_mlp": 0.01034706, + "balance_loss_clip": 1.04478133, + "balance_loss_mlp": 1.02167046, + "epoch": 0.6133774237186231, + "flos": 20485098852480.0, + "grad_norm": 1.926337659743319, + "language_loss": 0.83972597, + "learning_rate": 1.3733790659190746e-06, + "loss": 0.86128449, + "num_input_tokens_seen": 219656625, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.13024902, + "step": 10202, + "time_per_iteration": 2.452913761138916 + }, + { + "auxiliary_loss_clip": 0.01052311, + "auxiliary_loss_mlp": 0.01005493, + "balance_loss_clip": 1.02590513, + "balance_loss_mlp": 1.00404418, + "epoch": 0.6134375469712912, + "flos": 69413065217280.0, + "grad_norm": 0.8862172908949647, + "language_loss": 0.67133653, + "learning_rate": 1.3730092252233953e-06, + "loss": 0.69191456, + "num_input_tokens_seen": 219718090, + "router_z_loss_clip": 0.26464844, + "router_z_loss_mlp": 0.01448059, + "step": 10203, + "time_per_iteration": 3.0958950519561768 + }, + { + "auxiliary_loss_clip": 0.01131397, + "auxiliary_loss_mlp": 0.01030454, + "balance_loss_clip": 1.05479693, + "balance_loss_mlp": 1.01802647, + "epoch": 0.6134976702239591, + "flos": 41281541815680.0, + "grad_norm": 1.494685225617766, + "language_loss": 0.61204731, + "learning_rate": 1.37263940830327e-06, + "loss": 0.6336658, + "num_input_tokens_seen": 219740100, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.12438965, + "step": 10204, + "time_per_iteration": 2.7098095417022705 + }, + { + "auxiliary_loss_clip": 0.01127782, + "auxiliary_loss_mlp": 0.01026632, + "balance_loss_clip": 1.05257678, + "balance_loss_mlp": 1.01455617, + "epoch": 0.6135577934766271, + "flos": 22346600261760.0, + "grad_norm": 1.6987253374346678, + "language_loss": 0.72359061, + "learning_rate": 1.3722696151727204e-06, + "loss": 0.74513477, + "num_input_tokens_seen": 219761225, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.12084961, + "step": 10205, + "time_per_iteration": 2.551831007003784 + }, + { + "auxiliary_loss_clip": 0.01121084, + "auxiliary_loss_mlp": 0.01025372, + "balance_loss_clip": 1.04751265, + "balance_loss_mlp": 1.01286077, + "epoch": 0.6136179167292951, + "flos": 23728155120000.0, + "grad_norm": 2.955347530176756, + "language_loss": 0.76134181, + "learning_rate": 1.3718998458457701e-06, + "loss": 0.7828064, + "num_input_tokens_seen": 219780085, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.12506104, + "step": 10206, + "time_per_iteration": 2.489163637161255 + }, + { + "auxiliary_loss_clip": 0.01126453, + "auxiliary_loss_mlp": 0.01032792, + "balance_loss_clip": 1.05133855, + "balance_loss_mlp": 1.0198878, + "epoch": 0.613678039981963, + "flos": 26024678144640.0, + "grad_norm": 2.1271805876601473, + "language_loss": 0.7581144, + "learning_rate": 1.3715301003364407e-06, + "loss": 0.77970684, + "num_input_tokens_seen": 219797895, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.12896729, + "step": 10207, + "time_per_iteration": 2.4957923889160156 + }, + { + "auxiliary_loss_clip": 0.01122634, + "auxiliary_loss_mlp": 0.01034764, + "balance_loss_clip": 1.04589891, + "balance_loss_mlp": 1.02239585, + "epoch": 0.613738163234631, + "flos": 9859957200000.0, + "grad_norm": 2.322662981482217, + "language_loss": 0.8227489, + "learning_rate": 1.3711603786587525e-06, + "loss": 0.84432286, + "num_input_tokens_seen": 219811295, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.12365723, + "step": 10208, + "time_per_iteration": 2.368884801864624 + }, + { + "auxiliary_loss_clip": 0.01128032, + "auxiliary_loss_mlp": 0.01036607, + "balance_loss_clip": 1.05157256, + "balance_loss_mlp": 1.02283251, + "epoch": 0.613798286487299, + "flos": 33182070001920.0, + "grad_norm": 1.8964724004948867, + "language_loss": 0.72346687, + "learning_rate": 1.3707906808267265e-06, + "loss": 0.74511325, + "num_input_tokens_seen": 219832735, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.13781738, + "step": 10209, + "time_per_iteration": 2.538560390472412 + }, + { + "auxiliary_loss_clip": 0.01119304, + "auxiliary_loss_mlp": 0.0103643, + "balance_loss_clip": 1.04663849, + "balance_loss_mlp": 1.02403808, + "epoch": 0.613858409739967, + "flos": 25627901535360.0, + "grad_norm": 1.6050446073605709, + "language_loss": 0.74290228, + "learning_rate": 1.37042100685438e-06, + "loss": 0.76445961, + "num_input_tokens_seen": 219852755, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.12402344, + "step": 10210, + "time_per_iteration": 2.473808526992798 + }, + { + "auxiliary_loss_clip": 0.0104263, + "auxiliary_loss_mlp": 0.01005658, + "balance_loss_clip": 1.01769114, + "balance_loss_mlp": 1.00433433, + "epoch": 0.6139185329926349, + "flos": 67192313932800.0, + "grad_norm": 0.861494606569735, + "language_loss": 0.64918679, + "learning_rate": 1.3700513567557325e-06, + "loss": 0.66966963, + "num_input_tokens_seen": 219922785, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.01324463, + "step": 10211, + "time_per_iteration": 3.292681932449341 + }, + { + "auxiliary_loss_clip": 0.0111464, + "auxiliary_loss_mlp": 0.01037208, + "balance_loss_clip": 1.04063141, + "balance_loss_mlp": 1.02409542, + "epoch": 0.6139786562453029, + "flos": 21543637680000.0, + "grad_norm": 1.579927683395082, + "language_loss": 0.75718212, + "learning_rate": 1.369681730544801e-06, + "loss": 0.77870059, + "num_input_tokens_seen": 219942215, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.13134766, + "step": 10212, + "time_per_iteration": 2.4469666481018066 + }, + { + "auxiliary_loss_clip": 0.01119928, + "auxiliary_loss_mlp": 0.01041838, + "balance_loss_clip": 1.0450151, + "balance_loss_mlp": 1.02645981, + "epoch": 0.6140387794979708, + "flos": 26068489758720.0, + "grad_norm": 1.7329277612137002, + "language_loss": 0.73847902, + "learning_rate": 1.3693121282356009e-06, + "loss": 0.76009673, + "num_input_tokens_seen": 219963830, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.15380859, + "step": 10213, + "time_per_iteration": 3.8964405059814453 + }, + { + "auxiliary_loss_clip": 0.01132161, + "auxiliary_loss_mlp": 0.0103991, + "balance_loss_clip": 1.05154455, + "balance_loss_mlp": 1.0258491, + "epoch": 0.6140989027506388, + "flos": 23694614795520.0, + "grad_norm": 3.1942482806736416, + "language_loss": 0.73108244, + "learning_rate": 1.3689425498421483e-06, + "loss": 0.75280315, + "num_input_tokens_seen": 219983815, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.140625, + "step": 10214, + "time_per_iteration": 2.461454391479492 + }, + { + "auxiliary_loss_clip": 0.01122264, + "auxiliary_loss_mlp": 0.01030493, + "balance_loss_clip": 1.0466876, + "balance_loss_mlp": 1.01770186, + "epoch": 0.6141590260033067, + "flos": 22231721589120.0, + "grad_norm": 1.7255510422136258, + "language_loss": 0.74579322, + "learning_rate": 1.3685729953784572e-06, + "loss": 0.76732075, + "num_input_tokens_seen": 220003165, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12792969, + "step": 10215, + "time_per_iteration": 2.460315465927124 + }, + { + "auxiliary_loss_clip": 0.01119201, + "auxiliary_loss_mlp": 0.0103116, + "balance_loss_clip": 1.04532456, + "balance_loss_mlp": 1.01861334, + "epoch": 0.6142191492559748, + "flos": 23871653953920.0, + "grad_norm": 1.683856910180926, + "language_loss": 0.78294981, + "learning_rate": 1.368203464858542e-06, + "loss": 0.80445343, + "num_input_tokens_seen": 220021015, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.12536621, + "step": 10216, + "time_per_iteration": 2.4810996055603027 + }, + { + "auxiliary_loss_clip": 0.01124152, + "auxiliary_loss_mlp": 0.0103287, + "balance_loss_clip": 1.0500145, + "balance_loss_mlp": 1.01913691, + "epoch": 0.6142792725086427, + "flos": 15042513260160.0, + "grad_norm": 3.373291197135904, + "language_loss": 0.80392551, + "learning_rate": 1.3678339582964147e-06, + "loss": 0.82549572, + "num_input_tokens_seen": 220035780, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.13739014, + "step": 10217, + "time_per_iteration": 2.407064914703369 + }, + { + "auxiliary_loss_clip": 0.01120334, + "auxiliary_loss_mlp": 0.01028889, + "balance_loss_clip": 1.04503417, + "balance_loss_mlp": 1.01588297, + "epoch": 0.6143393957613107, + "flos": 23330947547520.0, + "grad_norm": 2.7033942288536683, + "language_loss": 0.78570366, + "learning_rate": 1.3674644757060865e-06, + "loss": 0.8071959, + "num_input_tokens_seen": 220054280, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.13000488, + "step": 10218, + "time_per_iteration": 2.5446085929870605 + }, + { + "auxiliary_loss_clip": 0.01128239, + "auxiliary_loss_mlp": 0.01031704, + "balance_loss_clip": 1.05263436, + "balance_loss_mlp": 1.01918077, + "epoch": 0.6143995190139786, + "flos": 20117086058880.0, + "grad_norm": 1.5542196099428065, + "language_loss": 0.82216078, + "learning_rate": 1.367095017101569e-06, + "loss": 0.84376019, + "num_input_tokens_seen": 220074120, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12518311, + "step": 10219, + "time_per_iteration": 2.465622901916504 + }, + { + "auxiliary_loss_clip": 0.01118083, + "auxiliary_loss_mlp": 0.01034107, + "balance_loss_clip": 1.04124022, + "balance_loss_mlp": 1.02091646, + "epoch": 0.6144596422666466, + "flos": 42303559489920.0, + "grad_norm": 1.9173588648733149, + "language_loss": 0.67078125, + "learning_rate": 1.3667255824968717e-06, + "loss": 0.69230318, + "num_input_tokens_seen": 220096320, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.13201904, + "step": 10220, + "time_per_iteration": 2.7011520862579346 + }, + { + "auxiliary_loss_clip": 0.01117671, + "auxiliary_loss_mlp": 0.0102742, + "balance_loss_clip": 1.04403973, + "balance_loss_mlp": 1.01515305, + "epoch": 0.6145197655193146, + "flos": 21573622558080.0, + "grad_norm": 1.8017309219269282, + "language_loss": 0.72336978, + "learning_rate": 1.3663561719060041e-06, + "loss": 0.74482071, + "num_input_tokens_seen": 220114850, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.12261963, + "step": 10221, + "time_per_iteration": 2.4654619693756104 + }, + { + "auxiliary_loss_clip": 0.01114924, + "auxiliary_loss_mlp": 0.01028927, + "balance_loss_clip": 1.04181671, + "balance_loss_mlp": 1.01730418, + "epoch": 0.6145798887719826, + "flos": 21471098163840.0, + "grad_norm": 1.7518341390363286, + "language_loss": 0.79869288, + "learning_rate": 1.3659867853429735e-06, + "loss": 0.82013136, + "num_input_tokens_seen": 220133395, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11627197, + "step": 10222, + "time_per_iteration": 3.960963010787964 + }, + { + "auxiliary_loss_clip": 0.0111928, + "auxiliary_loss_mlp": 0.01033512, + "balance_loss_clip": 1.04453015, + "balance_loss_mlp": 1.02152574, + "epoch": 0.6146400120246506, + "flos": 20777016683520.0, + "grad_norm": 2.653458828710918, + "language_loss": 0.76702642, + "learning_rate": 1.365617422821788e-06, + "loss": 0.78855443, + "num_input_tokens_seen": 220152790, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.11987305, + "step": 10223, + "time_per_iteration": 2.468193769454956 + }, + { + "auxiliary_loss_clip": 0.01121368, + "auxiliary_loss_mlp": 0.01032785, + "balance_loss_clip": 1.04856491, + "balance_loss_mlp": 1.02034521, + "epoch": 0.6147001352773185, + "flos": 13881306384000.0, + "grad_norm": 3.094637930759304, + "language_loss": 0.78482682, + "learning_rate": 1.3652480843564535e-06, + "loss": 0.80636835, + "num_input_tokens_seen": 220169535, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.12432861, + "step": 10224, + "time_per_iteration": 2.410883665084839 + }, + { + "auxiliary_loss_clip": 0.01115592, + "auxiliary_loss_mlp": 0.01030143, + "balance_loss_clip": 1.04189372, + "balance_loss_mlp": 1.01861525, + "epoch": 0.6147602585299865, + "flos": 56641791807360.0, + "grad_norm": 1.314945679948207, + "language_loss": 0.66442734, + "learning_rate": 1.3648787699609746e-06, + "loss": 0.68588471, + "num_input_tokens_seen": 220195305, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11529541, + "step": 10225, + "time_per_iteration": 2.7343907356262207 + }, + { + "auxiliary_loss_clip": 0.01124879, + "auxiliary_loss_mlp": 0.01029025, + "balance_loss_clip": 1.04843152, + "balance_loss_mlp": 1.01660335, + "epoch": 0.6148203817826544, + "flos": 32817217605120.0, + "grad_norm": 2.3708366820210554, + "language_loss": 0.62676346, + "learning_rate": 1.364509479649357e-06, + "loss": 0.64830244, + "num_input_tokens_seen": 220215040, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.12420654, + "step": 10226, + "time_per_iteration": 2.5269362926483154 + }, + { + "auxiliary_loss_clip": 0.0111777, + "auxiliary_loss_mlp": 0.01032455, + "balance_loss_clip": 1.04291451, + "balance_loss_mlp": 1.01930022, + "epoch": 0.6148805050353224, + "flos": 18332038748160.0, + "grad_norm": 2.556265561538948, + "language_loss": 0.75788677, + "learning_rate": 1.3641402134356037e-06, + "loss": 0.77938902, + "num_input_tokens_seen": 220234205, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.13171387, + "step": 10227, + "time_per_iteration": 3.855095863342285 + }, + { + "auxiliary_loss_clip": 0.01121449, + "auxiliary_loss_mlp": 0.01038464, + "balance_loss_clip": 1.04304206, + "balance_loss_mlp": 1.02398562, + "epoch": 0.6149406282879903, + "flos": 14063983977600.0, + "grad_norm": 1.9894860797659664, + "language_loss": 0.62384975, + "learning_rate": 1.3637709713337164e-06, + "loss": 0.64544886, + "num_input_tokens_seen": 220252730, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.14471436, + "step": 10228, + "time_per_iteration": 2.4048800468444824 + }, + { + "auxiliary_loss_clip": 0.01114239, + "auxiliary_loss_mlp": 0.01029756, + "balance_loss_clip": 1.04176295, + "balance_loss_mlp": 1.01762652, + "epoch": 0.6150007515406584, + "flos": 25190186400000.0, + "grad_norm": 1.3119346976221795, + "language_loss": 0.74400216, + "learning_rate": 1.3634017533576985e-06, + "loss": 0.76544213, + "num_input_tokens_seen": 220273345, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.12127686, + "step": 10229, + "time_per_iteration": 2.482130289077759 + }, + { + "auxiliary_loss_clip": 0.0112255, + "auxiliary_loss_mlp": 0.0103427, + "balance_loss_clip": 1.0487417, + "balance_loss_mlp": 1.02100182, + "epoch": 0.6150608747933263, + "flos": 21945262625280.0, + "grad_norm": 1.835368048900925, + "language_loss": 0.77981484, + "learning_rate": 1.3630325595215493e-06, + "loss": 0.80138302, + "num_input_tokens_seen": 220293845, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.13275146, + "step": 10230, + "time_per_iteration": 2.437248468399048 + }, + { + "auxiliary_loss_clip": 0.01118675, + "auxiliary_loss_mlp": 0.0103765, + "balance_loss_clip": 1.04276097, + "balance_loss_mlp": 1.02333879, + "epoch": 0.6151209980459943, + "flos": 30117453523200.0, + "grad_norm": 1.5837792021580388, + "language_loss": 0.73143023, + "learning_rate": 1.36266338983927e-06, + "loss": 0.75299346, + "num_input_tokens_seen": 220316070, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.14306641, + "step": 10231, + "time_per_iteration": 2.547441244125366 + }, + { + "auxiliary_loss_clip": 0.01120856, + "auxiliary_loss_mlp": 0.01032779, + "balance_loss_clip": 1.04642975, + "balance_loss_mlp": 1.02082825, + "epoch": 0.6151811212986622, + "flos": 30008356940160.0, + "grad_norm": 1.8045498269632902, + "language_loss": 0.69842207, + "learning_rate": 1.362294244324858e-06, + "loss": 0.71995836, + "num_input_tokens_seen": 220335695, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.11956787, + "step": 10232, + "time_per_iteration": 2.5120887756347656 + }, + { + "auxiliary_loss_clip": 0.01111659, + "auxiliary_loss_mlp": 0.01028597, + "balance_loss_clip": 1.04048419, + "balance_loss_mlp": 1.0170455, + "epoch": 0.6152412445513302, + "flos": 18872888808960.0, + "grad_norm": 1.9441016079480578, + "language_loss": 0.91616291, + "learning_rate": 1.3619251229923126e-06, + "loss": 0.93756545, + "num_input_tokens_seen": 220353720, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.11541748, + "step": 10233, + "time_per_iteration": 2.473956823348999 + }, + { + "auxiliary_loss_clip": 0.01113823, + "auxiliary_loss_mlp": 0.01030873, + "balance_loss_clip": 1.04081106, + "balance_loss_mlp": 1.02002478, + "epoch": 0.6153013678039982, + "flos": 25703601448320.0, + "grad_norm": 1.6823249661765, + "language_loss": 0.71839964, + "learning_rate": 1.3615560258556306e-06, + "loss": 0.73984659, + "num_input_tokens_seen": 220372515, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.10852051, + "step": 10234, + "time_per_iteration": 4.0129477977752686 + }, + { + "auxiliary_loss_clip": 0.01116405, + "auxiliary_loss_mlp": 0.01032463, + "balance_loss_clip": 1.0406208, + "balance_loss_mlp": 1.01945758, + "epoch": 0.6153614910566662, + "flos": 28510271383680.0, + "grad_norm": 1.7758084090148052, + "language_loss": 0.66962034, + "learning_rate": 1.3611869529288077e-06, + "loss": 0.691109, + "num_input_tokens_seen": 220393490, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.13000488, + "step": 10235, + "time_per_iteration": 2.525461196899414 + }, + { + "auxiliary_loss_clip": 0.01116102, + "auxiliary_loss_mlp": 0.01028639, + "balance_loss_clip": 1.03936362, + "balance_loss_mlp": 1.01571679, + "epoch": 0.6154216143093342, + "flos": 23549787158400.0, + "grad_norm": 2.8544041756837637, + "language_loss": 0.81060767, + "learning_rate": 1.3608179042258398e-06, + "loss": 0.83205503, + "num_input_tokens_seen": 220412855, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.1293335, + "step": 10236, + "time_per_iteration": 2.474562406539917 + }, + { + "auxiliary_loss_clip": 0.01123258, + "auxiliary_loss_mlp": 0.01033294, + "balance_loss_clip": 1.04419374, + "balance_loss_mlp": 1.02061057, + "epoch": 0.6154817375620021, + "flos": 22748081552640.0, + "grad_norm": 2.693254146585904, + "language_loss": 0.80270696, + "learning_rate": 1.360448879760721e-06, + "loss": 0.82427251, + "num_input_tokens_seen": 220433440, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.12683105, + "step": 10237, + "time_per_iteration": 2.4868178367614746 + }, + { + "auxiliary_loss_clip": 0.0112453, + "auxiliary_loss_mlp": 0.01038723, + "balance_loss_clip": 1.04932773, + "balance_loss_mlp": 1.02646267, + "epoch": 0.6155418608146701, + "flos": 27162975121920.0, + "grad_norm": 1.6755752116127722, + "language_loss": 0.75920582, + "learning_rate": 1.3600798795474449e-06, + "loss": 0.78083837, + "num_input_tokens_seen": 220453445, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12261963, + "step": 10238, + "time_per_iteration": 2.4919893741607666 + }, + { + "auxiliary_loss_clip": 0.01044099, + "auxiliary_loss_mlp": 0.01004737, + "balance_loss_clip": 1.01874375, + "balance_loss_mlp": 1.00335431, + "epoch": 0.615601984067338, + "flos": 68811165014400.0, + "grad_norm": 0.7894459063952037, + "language_loss": 0.57714641, + "learning_rate": 1.3597109036000036e-06, + "loss": 0.59763479, + "num_input_tokens_seen": 220509730, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.01385498, + "step": 10239, + "time_per_iteration": 3.1341867446899414 + }, + { + "auxiliary_loss_clip": 0.01116957, + "auxiliary_loss_mlp": 0.01041373, + "balance_loss_clip": 1.04115415, + "balance_loss_mlp": 1.02574468, + "epoch": 0.615662107320006, + "flos": 15517144598400.0, + "grad_norm": 1.7590650498044258, + "language_loss": 0.77504957, + "learning_rate": 1.3593419519323892e-06, + "loss": 0.79663289, + "num_input_tokens_seen": 220527295, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.15612793, + "step": 10240, + "time_per_iteration": 2.448920726776123 + }, + { + "auxiliary_loss_clip": 0.01125061, + "auxiliary_loss_mlp": 0.01035169, + "balance_loss_clip": 1.04691172, + "balance_loss_mlp": 1.02181745, + "epoch": 0.615722230572674, + "flos": 21063691128960.0, + "grad_norm": 3.1507371553263606, + "language_loss": 0.73220628, + "learning_rate": 1.3589730245585922e-06, + "loss": 0.75380856, + "num_input_tokens_seen": 220542730, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.13366699, + "step": 10241, + "time_per_iteration": 2.446105480194092 + }, + { + "auxiliary_loss_clip": 0.01124852, + "auxiliary_loss_mlp": 0.01027693, + "balance_loss_clip": 1.05272055, + "balance_loss_mlp": 1.01573634, + "epoch": 0.615782353825342, + "flos": 23256791919360.0, + "grad_norm": 1.6400168631787055, + "language_loss": 0.71945012, + "learning_rate": 1.3586041214926018e-06, + "loss": 0.74097556, + "num_input_tokens_seen": 220562995, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11968994, + "step": 10242, + "time_per_iteration": 2.465141534805298 + }, + { + "auxiliary_loss_clip": 0.01120153, + "auxiliary_loss_mlp": 0.01029926, + "balance_loss_clip": 1.04629421, + "balance_loss_mlp": 1.01776624, + "epoch": 0.6158424770780099, + "flos": 21103911383040.0, + "grad_norm": 2.1493672364482506, + "language_loss": 0.72236043, + "learning_rate": 1.3582352427484086e-06, + "loss": 0.7438612, + "num_input_tokens_seen": 220581775, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.12164307, + "step": 10243, + "time_per_iteration": 2.438739776611328 + }, + { + "auxiliary_loss_clip": 0.01051796, + "auxiliary_loss_mlp": 0.01005509, + "balance_loss_clip": 1.02577209, + "balance_loss_mlp": 1.0040102, + "epoch": 0.6159026003306779, + "flos": 70333276769280.0, + "grad_norm": 0.767081827659327, + "language_loss": 0.56851858, + "learning_rate": 1.3578663883399984e-06, + "loss": 0.58909166, + "num_input_tokens_seen": 220646395, + "router_z_loss_clip": 0.26074219, + "router_z_loss_mlp": 0.01499939, + "step": 10244, + "time_per_iteration": 3.1498863697052 + }, + { + "auxiliary_loss_clip": 0.01115711, + "auxiliary_loss_mlp": 0.0103267, + "balance_loss_clip": 1.04170728, + "balance_loss_mlp": 1.01893115, + "epoch": 0.6159627235833458, + "flos": 33874355802240.0, + "grad_norm": 1.700263735410284, + "language_loss": 0.63794589, + "learning_rate": 1.3574975582813593e-06, + "loss": 0.65942967, + "num_input_tokens_seen": 220668335, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.13726807, + "step": 10245, + "time_per_iteration": 2.5904345512390137 + }, + { + "auxiliary_loss_clip": 0.01113602, + "auxiliary_loss_mlp": 0.01032987, + "balance_loss_clip": 1.04126763, + "balance_loss_mlp": 1.02054787, + "epoch": 0.6160228468360138, + "flos": 26575440359040.0, + "grad_norm": 1.731572111753336, + "language_loss": 0.78965825, + "learning_rate": 1.3571287525864771e-06, + "loss": 0.81112415, + "num_input_tokens_seen": 220688915, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.12445068, + "step": 10246, + "time_per_iteration": 2.51450252532959 + }, + { + "auxiliary_loss_clip": 0.01128646, + "auxiliary_loss_mlp": 0.01048292, + "balance_loss_clip": 1.05000663, + "balance_loss_mlp": 1.03235984, + "epoch": 0.6160829700886818, + "flos": 17193274894080.0, + "grad_norm": 2.3628199685438114, + "language_loss": 0.87445939, + "learning_rate": 1.3567599712693368e-06, + "loss": 0.89622879, + "num_input_tokens_seen": 220703465, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.15930176, + "step": 10247, + "time_per_iteration": 2.478431463241577 + }, + { + "auxiliary_loss_clip": 0.01128917, + "auxiliary_loss_mlp": 0.01028981, + "balance_loss_clip": 1.05443788, + "balance_loss_mlp": 1.01691103, + "epoch": 0.6161430933413498, + "flos": 23623547736960.0, + "grad_norm": 1.707109854138684, + "language_loss": 0.80100048, + "learning_rate": 1.3563912143439235e-06, + "loss": 0.82257944, + "num_input_tokens_seen": 220722090, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.12072754, + "step": 10248, + "time_per_iteration": 2.446833372116089 + }, + { + "auxiliary_loss_clip": 0.01125499, + "auxiliary_loss_mlp": 0.01031101, + "balance_loss_clip": 1.05095804, + "balance_loss_mlp": 1.01931167, + "epoch": 0.6162032165940178, + "flos": 23002436736000.0, + "grad_norm": 2.7070997007186017, + "language_loss": 0.87346184, + "learning_rate": 1.3560224818242191e-06, + "loss": 0.89502788, + "num_input_tokens_seen": 220741075, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11791992, + "step": 10249, + "time_per_iteration": 2.4672086238861084 + }, + { + "auxiliary_loss_clip": 0.01120906, + "auxiliary_loss_mlp": 0.01029557, + "balance_loss_clip": 1.0454551, + "balance_loss_mlp": 1.01618767, + "epoch": 0.6162633398466857, + "flos": 39421979740800.0, + "grad_norm": 2.7280383043615632, + "language_loss": 0.69046485, + "learning_rate": 1.3556537737242072e-06, + "loss": 0.71196949, + "num_input_tokens_seen": 220763395, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.13366699, + "step": 10250, + "time_per_iteration": 2.5874862670898438 + }, + { + "auxiliary_loss_clip": 0.01119532, + "auxiliary_loss_mlp": 0.01028934, + "balance_loss_clip": 1.0489136, + "balance_loss_mlp": 1.01749575, + "epoch": 0.6163234630993537, + "flos": 19244672530560.0, + "grad_norm": 1.7338863641571798, + "language_loss": 0.74158812, + "learning_rate": 1.3552850900578692e-06, + "loss": 0.76307285, + "num_input_tokens_seen": 220780640, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.11431885, + "step": 10251, + "time_per_iteration": 2.4414539337158203 + }, + { + "auxiliary_loss_clip": 0.01120653, + "auxiliary_loss_mlp": 0.01027494, + "balance_loss_clip": 1.04458797, + "balance_loss_mlp": 1.01448274, + "epoch": 0.6163835863520216, + "flos": 15961791058560.0, + "grad_norm": 2.1226988232797797, + "language_loss": 0.68640584, + "learning_rate": 1.3549164308391844e-06, + "loss": 0.70788729, + "num_input_tokens_seen": 220797960, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.13018799, + "step": 10252, + "time_per_iteration": 2.4333879947662354 + }, + { + "auxiliary_loss_clip": 0.01052391, + "auxiliary_loss_mlp": 0.01004932, + "balance_loss_clip": 1.02675056, + "balance_loss_mlp": 1.00364506, + "epoch": 0.6164437096046896, + "flos": 68103834393600.0, + "grad_norm": 0.8825416722160049, + "language_loss": 0.57828605, + "learning_rate": 1.3545477960821333e-06, + "loss": 0.59885931, + "num_input_tokens_seen": 220856930, + "router_z_loss_clip": 0.25634766, + "router_z_loss_mlp": 0.01286316, + "step": 10253, + "time_per_iteration": 3.133012533187866 + }, + { + "auxiliary_loss_clip": 0.01125331, + "auxiliary_loss_mlp": 0.01029799, + "balance_loss_clip": 1.04868388, + "balance_loss_mlp": 1.01724029, + "epoch": 0.6165038328573575, + "flos": 21361211481600.0, + "grad_norm": 1.5608507460395682, + "language_loss": 0.79398972, + "learning_rate": 1.3541791858006946e-06, + "loss": 0.81554103, + "num_input_tokens_seen": 220877595, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.12561035, + "step": 10254, + "time_per_iteration": 2.487677574157715 + }, + { + "auxiliary_loss_clip": 0.01126855, + "auxiliary_loss_mlp": 0.0102781, + "balance_loss_clip": 1.0502274, + "balance_loss_mlp": 1.01520407, + "epoch": 0.6165639561100256, + "flos": 21101972048640.0, + "grad_norm": 3.5775658096168432, + "language_loss": 0.8047654, + "learning_rate": 1.353810600008846e-06, + "loss": 0.82631207, + "num_input_tokens_seen": 220896880, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.1262207, + "step": 10255, + "time_per_iteration": 2.4756553173065186 + }, + { + "auxiliary_loss_clip": 0.01125215, + "auxiliary_loss_mlp": 0.01030918, + "balance_loss_clip": 1.05016351, + "balance_loss_mlp": 1.01774538, + "epoch": 0.6166240793626935, + "flos": 25338533569920.0, + "grad_norm": 1.8671101947471793, + "language_loss": 0.65669918, + "learning_rate": 1.3534420387205646e-06, + "loss": 0.67826056, + "num_input_tokens_seen": 220916425, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.13171387, + "step": 10256, + "time_per_iteration": 2.5512688159942627 + }, + { + "auxiliary_loss_clip": 0.01118276, + "auxiliary_loss_mlp": 0.01028216, + "balance_loss_clip": 1.04538882, + "balance_loss_mlp": 1.01695657, + "epoch": 0.6166842026153615, + "flos": 19682639061120.0, + "grad_norm": 1.59764204682907, + "language_loss": 0.71788132, + "learning_rate": 1.353073501949825e-06, + "loss": 0.73934627, + "num_input_tokens_seen": 220935050, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11260986, + "step": 10257, + "time_per_iteration": 3.8694381713867188 + }, + { + "auxiliary_loss_clip": 0.01128727, + "auxiliary_loss_mlp": 0.01028665, + "balance_loss_clip": 1.05238605, + "balance_loss_mlp": 1.01581407, + "epoch": 0.6167443258680294, + "flos": 19318361281920.0, + "grad_norm": 1.6525133956835447, + "language_loss": 0.72205591, + "learning_rate": 1.3527049897106034e-06, + "loss": 0.74362987, + "num_input_tokens_seen": 220953085, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.128479, + "step": 10258, + "time_per_iteration": 2.4502406120300293 + }, + { + "auxiliary_loss_clip": 0.01123305, + "auxiliary_loss_mlp": 0.01030113, + "balance_loss_clip": 1.04787445, + "balance_loss_mlp": 1.01758361, + "epoch": 0.6168044491206974, + "flos": 25265239868160.0, + "grad_norm": 2.7090832127997486, + "language_loss": 0.6401825, + "learning_rate": 1.3523365020168735e-06, + "loss": 0.6617167, + "num_input_tokens_seen": 220969050, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12524414, + "step": 10259, + "time_per_iteration": 2.526571273803711 + }, + { + "auxiliary_loss_clip": 0.01118166, + "auxiliary_loss_mlp": 0.01031168, + "balance_loss_clip": 1.04621696, + "balance_loss_mlp": 1.01838827, + "epoch": 0.6168645723733654, + "flos": 13219903301760.0, + "grad_norm": 1.7833924848019016, + "language_loss": 0.71612668, + "learning_rate": 1.3519680388826084e-06, + "loss": 0.73762, + "num_input_tokens_seen": 220985825, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.12786865, + "step": 10260, + "time_per_iteration": 2.4389212131500244 + }, + { + "auxiliary_loss_clip": 0.01130055, + "auxiliary_loss_mlp": 0.01031405, + "balance_loss_clip": 1.05180168, + "balance_loss_mlp": 1.01792264, + "epoch": 0.6169246956260334, + "flos": 26652038112000.0, + "grad_norm": 1.7479421121339735, + "language_loss": 0.68488938, + "learning_rate": 1.3515996003217803e-06, + "loss": 0.70650399, + "num_input_tokens_seen": 221004465, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.13482666, + "step": 10261, + "time_per_iteration": 2.505418062210083 + }, + { + "auxiliary_loss_clip": 0.01120429, + "auxiliary_loss_mlp": 0.01035983, + "balance_loss_clip": 1.04604506, + "balance_loss_mlp": 1.02462864, + "epoch": 0.6169848188787014, + "flos": 23148413608320.0, + "grad_norm": 1.7453416499091077, + "language_loss": 0.71177852, + "learning_rate": 1.3512311863483602e-06, + "loss": 0.73334265, + "num_input_tokens_seen": 221023260, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11352539, + "step": 10262, + "time_per_iteration": 2.471597671508789 + }, + { + "auxiliary_loss_clip": 0.01125048, + "auxiliary_loss_mlp": 0.01029985, + "balance_loss_clip": 1.0515976, + "balance_loss_mlp": 1.016675, + "epoch": 0.6170449421313693, + "flos": 23331917214720.0, + "grad_norm": 1.919624578933197, + "language_loss": 0.70235395, + "learning_rate": 1.3508627969763188e-06, + "loss": 0.72390437, + "num_input_tokens_seen": 221043090, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.13311768, + "step": 10263, + "time_per_iteration": 2.4812817573547363 + }, + { + "auxiliary_loss_clip": 0.01131435, + "auxiliary_loss_mlp": 0.01024143, + "balance_loss_clip": 1.05372286, + "balance_loss_mlp": 1.01217473, + "epoch": 0.6171050653840373, + "flos": 15851617067520.0, + "grad_norm": 2.5681018204956514, + "language_loss": 0.76753449, + "learning_rate": 1.3504944322196244e-06, + "loss": 0.78909028, + "num_input_tokens_seen": 221061435, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.11975098, + "step": 10264, + "time_per_iteration": 2.4467785358428955 + }, + { + "auxiliary_loss_clip": 0.01122076, + "auxiliary_loss_mlp": 0.01032436, + "balance_loss_clip": 1.04841971, + "balance_loss_mlp": 1.01952553, + "epoch": 0.6171651886367052, + "flos": 20045516209920.0, + "grad_norm": 2.3982710580394295, + "language_loss": 0.85391557, + "learning_rate": 1.350126092092247e-06, + "loss": 0.87546068, + "num_input_tokens_seen": 221078705, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.12902832, + "step": 10265, + "time_per_iteration": 2.462019443511963 + }, + { + "auxiliary_loss_clip": 0.0111832, + "auxiliary_loss_mlp": 0.01032328, + "balance_loss_clip": 1.04570484, + "balance_loss_mlp": 1.02023447, + "epoch": 0.6172253118893732, + "flos": 26432695710720.0, + "grad_norm": 2.0400887235939655, + "language_loss": 0.64630187, + "learning_rate": 1.349757776608153e-06, + "loss": 0.66780829, + "num_input_tokens_seen": 221099245, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.12084961, + "step": 10266, + "time_per_iteration": 3.89285945892334 + }, + { + "auxiliary_loss_clip": 0.01115029, + "auxiliary_loss_mlp": 0.01034019, + "balance_loss_clip": 1.03993893, + "balance_loss_mlp": 1.02211022, + "epoch": 0.6172854351420412, + "flos": 22632879657600.0, + "grad_norm": 1.633584084681889, + "language_loss": 0.75984538, + "learning_rate": 1.3493894857813094e-06, + "loss": 0.78133589, + "num_input_tokens_seen": 221116930, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11901855, + "step": 10267, + "time_per_iteration": 2.4698517322540283 + }, + { + "auxiliary_loss_clip": 0.01123416, + "auxiliary_loss_mlp": 0.01028371, + "balance_loss_clip": 1.04726291, + "balance_loss_mlp": 1.01551449, + "epoch": 0.6173455583947092, + "flos": 21212936138880.0, + "grad_norm": 2.079417657058758, + "language_loss": 0.74999595, + "learning_rate": 1.3490212196256818e-06, + "loss": 0.77151382, + "num_input_tokens_seen": 221137660, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12860107, + "step": 10268, + "time_per_iteration": 2.4768779277801514 + }, + { + "auxiliary_loss_clip": 0.01117883, + "auxiliary_loss_mlp": 0.01030358, + "balance_loss_clip": 1.0424037, + "balance_loss_mlp": 1.01747167, + "epoch": 0.6174056816473771, + "flos": 19500284689920.0, + "grad_norm": 1.7231683480034647, + "language_loss": 0.75606805, + "learning_rate": 1.3486529781552342e-06, + "loss": 0.77755046, + "num_input_tokens_seen": 221156225, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12890625, + "step": 10269, + "time_per_iteration": 2.4561431407928467 + }, + { + "auxiliary_loss_clip": 0.01112729, + "auxiliary_loss_mlp": 0.01026057, + "balance_loss_clip": 1.04028225, + "balance_loss_mlp": 1.01414847, + "epoch": 0.6174658049000451, + "flos": 15997342544640.0, + "grad_norm": 2.0461016971352493, + "language_loss": 0.76696932, + "learning_rate": 1.3482847613839318e-06, + "loss": 0.78835714, + "num_input_tokens_seen": 221173820, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11907959, + "step": 10270, + "time_per_iteration": 2.4270668029785156 + }, + { + "auxiliary_loss_clip": 0.0112546, + "auxiliary_loss_mlp": 0.01027446, + "balance_loss_clip": 1.0501523, + "balance_loss_mlp": 1.01528692, + "epoch": 0.617525928152713, + "flos": 21903893136000.0, + "grad_norm": 1.8781045068428863, + "language_loss": 0.8252154, + "learning_rate": 1.347916569325736e-06, + "loss": 0.84674442, + "num_input_tokens_seen": 221191815, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.121521, + "step": 10271, + "time_per_iteration": 3.8577017784118652 + }, + { + "auxiliary_loss_clip": 0.01124883, + "auxiliary_loss_mlp": 0.01037272, + "balance_loss_clip": 1.04799998, + "balance_loss_mlp": 1.02324665, + "epoch": 0.617586051405381, + "flos": 21105958458240.0, + "grad_norm": 3.951074885400317, + "language_loss": 0.7699551, + "learning_rate": 1.3475484019946093e-06, + "loss": 0.79157662, + "num_input_tokens_seen": 221211205, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.14025879, + "step": 10272, + "time_per_iteration": 2.4934704303741455 + }, + { + "auxiliary_loss_clip": 0.01052125, + "auxiliary_loss_mlp": 0.01002683, + "balance_loss_clip": 1.0263828, + "balance_loss_mlp": 1.00086832, + "epoch": 0.617646174658049, + "flos": 58610776665600.0, + "grad_norm": 0.804885710582531, + "language_loss": 0.59041226, + "learning_rate": 1.347180259404513e-06, + "loss": 0.6109603, + "num_input_tokens_seen": 221268430, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.01812744, + "step": 10273, + "time_per_iteration": 2.9342846870422363 + }, + { + "auxiliary_loss_clip": 0.01124786, + "auxiliary_loss_mlp": 0.0103188, + "balance_loss_clip": 1.05040753, + "balance_loss_mlp": 1.01873147, + "epoch": 0.617706297910717, + "flos": 13878684691200.0, + "grad_norm": 2.3037679423974984, + "language_loss": 0.72810686, + "learning_rate": 1.3468121415694059e-06, + "loss": 0.74967349, + "num_input_tokens_seen": 221281930, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.13146973, + "step": 10274, + "time_per_iteration": 2.414210557937622 + }, + { + "auxiliary_loss_clip": 0.01129248, + "auxiliary_loss_mlp": 0.01030039, + "balance_loss_clip": 1.05282438, + "balance_loss_mlp": 1.01786184, + "epoch": 0.617766421163385, + "flos": 19208438686080.0, + "grad_norm": 2.097146907598745, + "language_loss": 0.77777708, + "learning_rate": 1.3464440485032484e-06, + "loss": 0.79937005, + "num_input_tokens_seen": 221301605, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.12158203, + "step": 10275, + "time_per_iteration": 2.4747490882873535 + }, + { + "auxiliary_loss_clip": 0.01121956, + "auxiliary_loss_mlp": 0.01029816, + "balance_loss_clip": 1.04581904, + "balance_loss_mlp": 1.01710773, + "epoch": 0.6178265444160529, + "flos": 22565978576640.0, + "grad_norm": 1.5633161004090101, + "language_loss": 0.79611415, + "learning_rate": 1.346075980219998e-06, + "loss": 0.81763184, + "num_input_tokens_seen": 221320105, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.1270752, + "step": 10276, + "time_per_iteration": 2.46305251121521 + }, + { + "auxiliary_loss_clip": 0.01125943, + "auxiliary_loss_mlp": 0.01037802, + "balance_loss_clip": 1.05054677, + "balance_loss_mlp": 1.02484393, + "epoch": 0.6178866676687209, + "flos": 11984289402240.0, + "grad_norm": 1.8305434224837376, + "language_loss": 0.81164587, + "learning_rate": 1.345707936733612e-06, + "loss": 0.83328331, + "num_input_tokens_seen": 221335915, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.12957764, + "step": 10277, + "time_per_iteration": 3.833570957183838 + }, + { + "auxiliary_loss_clip": 0.01128403, + "auxiliary_loss_mlp": 0.01030307, + "balance_loss_clip": 1.05124688, + "balance_loss_mlp": 1.01700354, + "epoch": 0.6179467909213888, + "flos": 20991510748800.0, + "grad_norm": 1.6725870849314497, + "language_loss": 0.81387436, + "learning_rate": 1.3453399180580466e-06, + "loss": 0.83546144, + "num_input_tokens_seen": 221353965, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.13311768, + "step": 10278, + "time_per_iteration": 2.4626166820526123 + }, + { + "auxiliary_loss_clip": 0.01121054, + "auxiliary_loss_mlp": 0.01027805, + "balance_loss_clip": 1.04711926, + "balance_loss_mlp": 1.01608694, + "epoch": 0.6180069141740568, + "flos": 25338102606720.0, + "grad_norm": 1.9918381228137696, + "language_loss": 0.73946953, + "learning_rate": 1.3449719242072567e-06, + "loss": 0.76095814, + "num_input_tokens_seen": 221374080, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.1171875, + "step": 10279, + "time_per_iteration": 2.5523681640625 + }, + { + "auxiliary_loss_clip": 0.01117895, + "auxiliary_loss_mlp": 0.01028647, + "balance_loss_clip": 1.04536438, + "balance_loss_mlp": 1.01677966, + "epoch": 0.6180670374267248, + "flos": 19645722858240.0, + "grad_norm": 1.6297426540810132, + "language_loss": 0.7090379, + "learning_rate": 1.3446039551951975e-06, + "loss": 0.73050332, + "num_input_tokens_seen": 221392910, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11877441, + "step": 10280, + "time_per_iteration": 2.4620814323425293 + }, + { + "auxiliary_loss_clip": 0.01121761, + "auxiliary_loss_mlp": 0.0103482, + "balance_loss_clip": 1.04556501, + "balance_loss_mlp": 1.02205253, + "epoch": 0.6181271606793928, + "flos": 19464876858240.0, + "grad_norm": 1.6161788938970778, + "language_loss": 0.72985494, + "learning_rate": 1.3442360110358215e-06, + "loss": 0.75142074, + "num_input_tokens_seen": 221410990, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12768555, + "step": 10281, + "time_per_iteration": 2.440372943878174 + }, + { + "auxiliary_loss_clip": 0.01110911, + "auxiliary_loss_mlp": 0.01032178, + "balance_loss_clip": 1.04176247, + "balance_loss_mlp": 1.01965547, + "epoch": 0.6181872839320607, + "flos": 25594289383680.0, + "grad_norm": 1.7473271555206507, + "language_loss": 0.76896596, + "learning_rate": 1.3438680917430827e-06, + "loss": 0.79039681, + "num_input_tokens_seen": 221431020, + "router_z_loss_clip": 0.69189453, + "router_z_loss_mlp": 0.12530518, + "step": 10282, + "time_per_iteration": 2.5262138843536377 + }, + { + "auxiliary_loss_clip": 0.01116804, + "auxiliary_loss_mlp": 0.01036374, + "balance_loss_clip": 1.04162335, + "balance_loss_mlp": 1.02047145, + "epoch": 0.6182474071847287, + "flos": 25551806572800.0, + "grad_norm": 1.6237245279964565, + "language_loss": 0.69156951, + "learning_rate": 1.343500197330931e-06, + "loss": 0.71310127, + "num_input_tokens_seen": 221453235, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.15905762, + "step": 10283, + "time_per_iteration": 2.499131441116333 + }, + { + "auxiliary_loss_clip": 0.01126329, + "auxiliary_loss_mlp": 0.01033837, + "balance_loss_clip": 1.0476892, + "balance_loss_mlp": 1.0194124, + "epoch": 0.6183075304373966, + "flos": 22123738327680.0, + "grad_norm": 1.7056921998954397, + "language_loss": 0.75057214, + "learning_rate": 1.3431323278133176e-06, + "loss": 0.77217376, + "num_input_tokens_seen": 221472560, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.14428711, + "step": 10284, + "time_per_iteration": 2.481740951538086 + }, + { + "auxiliary_loss_clip": 0.01114849, + "auxiliary_loss_mlp": 0.01040006, + "balance_loss_clip": 1.04532456, + "balance_loss_mlp": 1.02675569, + "epoch": 0.6183676536900646, + "flos": 22455589104000.0, + "grad_norm": 1.5622867559436018, + "language_loss": 0.75740486, + "learning_rate": 1.3427644832041922e-06, + "loss": 0.77895343, + "num_input_tokens_seen": 221492835, + "router_z_loss_clip": 0.69482422, + "router_z_loss_mlp": 0.13238525, + "step": 10285, + "time_per_iteration": 2.5408270359039307 + }, + { + "auxiliary_loss_clip": 0.01120632, + "auxiliary_loss_mlp": 0.01032384, + "balance_loss_clip": 1.04680681, + "balance_loss_mlp": 1.02031386, + "epoch": 0.6184277769427327, + "flos": 23364128736000.0, + "grad_norm": 1.582619574774301, + "language_loss": 0.72644657, + "learning_rate": 1.342396663517503e-06, + "loss": 0.74797678, + "num_input_tokens_seen": 221511870, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.12078857, + "step": 10286, + "time_per_iteration": 2.5050182342529297 + }, + { + "auxiliary_loss_clip": 0.01117036, + "auxiliary_loss_mlp": 0.01028061, + "balance_loss_clip": 1.04518211, + "balance_loss_mlp": 1.01595545, + "epoch": 0.6184879001954006, + "flos": 22711057608960.0, + "grad_norm": 1.8128040476881486, + "language_loss": 0.7605812, + "learning_rate": 1.342028868767199e-06, + "loss": 0.78203219, + "num_input_tokens_seen": 221529915, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.12121582, + "step": 10287, + "time_per_iteration": 2.4705417156219482 + }, + { + "auxiliary_loss_clip": 0.01123749, + "auxiliary_loss_mlp": 0.01030293, + "balance_loss_clip": 1.0504297, + "balance_loss_mlp": 1.01846695, + "epoch": 0.6185480234480686, + "flos": 23841920471040.0, + "grad_norm": 1.6563250705208112, + "language_loss": 0.7314992, + "learning_rate": 1.3416610989672262e-06, + "loss": 0.7530396, + "num_input_tokens_seen": 221549745, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11828613, + "step": 10288, + "time_per_iteration": 2.4796581268310547 + }, + { + "auxiliary_loss_clip": 0.01122062, + "auxiliary_loss_mlp": 0.0103439, + "balance_loss_clip": 1.04921269, + "balance_loss_mlp": 1.02330375, + "epoch": 0.6186081467007365, + "flos": 45477595774080.0, + "grad_norm": 1.4418399319003272, + "language_loss": 0.72792333, + "learning_rate": 1.3412933541315296e-06, + "loss": 0.74948788, + "num_input_tokens_seen": 221572455, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11090088, + "step": 10289, + "time_per_iteration": 2.637554168701172 + }, + { + "auxiliary_loss_clip": 0.01122211, + "auxiliary_loss_mlp": 0.01030226, + "balance_loss_clip": 1.04632759, + "balance_loss_mlp": 1.01789331, + "epoch": 0.6186682699534045, + "flos": 23550864566400.0, + "grad_norm": 1.5131608032082036, + "language_loss": 0.79469383, + "learning_rate": 1.340925634274056e-06, + "loss": 0.81621814, + "num_input_tokens_seen": 221591325, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.12322998, + "step": 10290, + "time_per_iteration": 2.4576406478881836 + }, + { + "auxiliary_loss_clip": 0.01127862, + "auxiliary_loss_mlp": 0.01032324, + "balance_loss_clip": 1.04985881, + "balance_loss_mlp": 1.0199616, + "epoch": 0.6187283932060724, + "flos": 25774201630080.0, + "grad_norm": 2.051546988243992, + "language_loss": 0.81613111, + "learning_rate": 1.3405579394087475e-06, + "loss": 0.83773303, + "num_input_tokens_seen": 221611640, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.12347412, + "step": 10291, + "time_per_iteration": 2.4984982013702393 + }, + { + "auxiliary_loss_clip": 0.01117375, + "auxiliary_loss_mlp": 0.01029781, + "balance_loss_clip": 1.04439151, + "balance_loss_mlp": 1.0181582, + "epoch": 0.6187885164587404, + "flos": 25265203954560.0, + "grad_norm": 1.7777905900667532, + "language_loss": 0.7794857, + "learning_rate": 1.3401902695495487e-06, + "loss": 0.80095726, + "num_input_tokens_seen": 221631225, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11633301, + "step": 10292, + "time_per_iteration": 2.473172664642334 + }, + { + "auxiliary_loss_clip": 0.0112685, + "auxiliary_loss_mlp": 0.01033865, + "balance_loss_clip": 1.04790604, + "balance_loss_mlp": 1.01973236, + "epoch": 0.6188486397114084, + "flos": 26250772302720.0, + "grad_norm": 1.831760205774595, + "language_loss": 0.73417377, + "learning_rate": 1.339822624710401e-06, + "loss": 0.75578094, + "num_input_tokens_seen": 221651035, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.14141846, + "step": 10293, + "time_per_iteration": 2.453765630722046 + }, + { + "auxiliary_loss_clip": 0.01124315, + "auxiliary_loss_mlp": 0.01034746, + "balance_loss_clip": 1.05077243, + "balance_loss_mlp": 1.02233076, + "epoch": 0.6189087629640764, + "flos": 20923388605440.0, + "grad_norm": 1.516983025660631, + "language_loss": 0.82793975, + "learning_rate": 1.3394550049052454e-06, + "loss": 0.8495304, + "num_input_tokens_seen": 221671300, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.12420654, + "step": 10294, + "time_per_iteration": 2.457435369491577 + }, + { + "auxiliary_loss_clip": 0.01128944, + "auxiliary_loss_mlp": 0.01030995, + "balance_loss_clip": 1.05115855, + "balance_loss_mlp": 1.01884115, + "epoch": 0.6189688862167443, + "flos": 14829814874880.0, + "grad_norm": 2.3282528013869683, + "language_loss": 0.70830262, + "learning_rate": 1.3390874101480225e-06, + "loss": 0.72990203, + "num_input_tokens_seen": 221687320, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.12145996, + "step": 10295, + "time_per_iteration": 2.413428544998169 + }, + { + "auxiliary_loss_clip": 0.0112205, + "auxiliary_loss_mlp": 0.01032585, + "balance_loss_clip": 1.05013752, + "balance_loss_mlp": 1.02064574, + "epoch": 0.6190290094694123, + "flos": 24285058560000.0, + "grad_norm": 1.5637212189834264, + "language_loss": 0.6993472, + "learning_rate": 1.3387198404526705e-06, + "loss": 0.7208935, + "num_input_tokens_seen": 221710175, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.1194458, + "step": 10296, + "time_per_iteration": 2.5232996940612793 + }, + { + "auxiliary_loss_clip": 0.01124274, + "auxiliary_loss_mlp": 0.01030493, + "balance_loss_clip": 1.04861283, + "balance_loss_mlp": 1.01738644, + "epoch": 0.6190891327220802, + "flos": 22529457423360.0, + "grad_norm": 30.462113226716053, + "language_loss": 0.71833253, + "learning_rate": 1.3383522958331287e-06, + "loss": 0.7398802, + "num_input_tokens_seen": 221728145, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.13092041, + "step": 10297, + "time_per_iteration": 2.539419174194336 + }, + { + "auxiliary_loss_clip": 0.01066216, + "auxiliary_loss_mlp": 0.01009227, + "balance_loss_clip": 1.03966665, + "balance_loss_mlp": 1.00777745, + "epoch": 0.6191492559747482, + "flos": 67729357152000.0, + "grad_norm": 0.8813958698496714, + "language_loss": 0.64078599, + "learning_rate": 1.3379847763033345e-06, + "loss": 0.66154039, + "num_input_tokens_seen": 221786100, + "router_z_loss_clip": 0.26611328, + "router_z_loss_mlp": 0.01451111, + "step": 10298, + "time_per_iteration": 2.9769344329833984 + }, + { + "auxiliary_loss_clip": 0.01122526, + "auxiliary_loss_mlp": 0.01033382, + "balance_loss_clip": 1.04727602, + "balance_loss_mlp": 1.02156854, + "epoch": 0.6192093792274163, + "flos": 22346672088960.0, + "grad_norm": 1.7328654900092912, + "language_loss": 0.74579477, + "learning_rate": 1.3376172818772236e-06, + "loss": 0.76735383, + "num_input_tokens_seen": 221806450, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.11810303, + "step": 10299, + "time_per_iteration": 2.466538667678833 + }, + { + "auxiliary_loss_clip": 0.01125948, + "auxiliary_loss_mlp": 0.01026434, + "balance_loss_clip": 1.04810798, + "balance_loss_mlp": 1.01413131, + "epoch": 0.6192695024800842, + "flos": 13553944807680.0, + "grad_norm": 1.5437299696414506, + "language_loss": 0.68658984, + "learning_rate": 1.337249812568732e-06, + "loss": 0.70811367, + "num_input_tokens_seen": 221823330, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.12316895, + "step": 10300, + "time_per_iteration": 3.895092487335205 + }, + { + "auxiliary_loss_clip": 0.01123798, + "auxiliary_loss_mlp": 0.01035617, + "balance_loss_clip": 1.05019188, + "balance_loss_mlp": 1.02340388, + "epoch": 0.6193296257327522, + "flos": 17415310815360.0, + "grad_norm": 1.7166055501717274, + "language_loss": 0.67074466, + "learning_rate": 1.3368823683917939e-06, + "loss": 0.69233876, + "num_input_tokens_seen": 221839360, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.12225342, + "step": 10301, + "time_per_iteration": 2.436401844024658 + }, + { + "auxiliary_loss_clip": 0.01119194, + "auxiliary_loss_mlp": 0.01029711, + "balance_loss_clip": 1.04442084, + "balance_loss_mlp": 1.01806426, + "epoch": 0.6193897489854201, + "flos": 31101118450560.0, + "grad_norm": 2.528998543986771, + "language_loss": 0.73110777, + "learning_rate": 1.3365149493603424e-06, + "loss": 0.7525968, + "num_input_tokens_seen": 221859465, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11651611, + "step": 10302, + "time_per_iteration": 2.527611494064331 + }, + { + "auxiliary_loss_clip": 0.01124782, + "auxiliary_loss_mlp": 0.01029263, + "balance_loss_clip": 1.05177164, + "balance_loss_mlp": 1.01643658, + "epoch": 0.6194498722380881, + "flos": 19134031662720.0, + "grad_norm": 1.7494221054717511, + "language_loss": 0.80546087, + "learning_rate": 1.3361475554883107e-06, + "loss": 0.82700133, + "num_input_tokens_seen": 221878555, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.1282959, + "step": 10303, + "time_per_iteration": 2.5396108627319336 + }, + { + "auxiliary_loss_clip": 0.01126752, + "auxiliary_loss_mlp": 0.01029881, + "balance_loss_clip": 1.04833114, + "balance_loss_mlp": 1.01599324, + "epoch": 0.619509995490756, + "flos": 21835088634240.0, + "grad_norm": 1.5386128206318714, + "language_loss": 0.76776564, + "learning_rate": 1.3357801867896307e-06, + "loss": 0.78933191, + "num_input_tokens_seen": 221898790, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.13891602, + "step": 10304, + "time_per_iteration": 2.4523513317108154 + }, + { + "auxiliary_loss_clip": 0.01133651, + "auxiliary_loss_mlp": 0.01035054, + "balance_loss_clip": 1.05143678, + "balance_loss_mlp": 1.02189338, + "epoch": 0.619570118743424, + "flos": 23806548552960.0, + "grad_norm": 1.920016800349637, + "language_loss": 0.77004385, + "learning_rate": 1.3354128432782324e-06, + "loss": 0.79173088, + "num_input_tokens_seen": 221918875, + "router_z_loss_clip": 0.82226562, + "router_z_loss_mlp": 0.13165283, + "step": 10305, + "time_per_iteration": 2.483944892883301 + }, + { + "auxiliary_loss_clip": 0.0113161, + "auxiliary_loss_mlp": 0.01029661, + "balance_loss_clip": 1.05266297, + "balance_loss_mlp": 1.01622653, + "epoch": 0.619630241996092, + "flos": 21101612912640.0, + "grad_norm": 1.717433709295946, + "language_loss": 0.78530502, + "learning_rate": 1.335045524968045e-06, + "loss": 0.80691779, + "num_input_tokens_seen": 221937895, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.13433838, + "step": 10306, + "time_per_iteration": 2.4421112537384033 + }, + { + "auxiliary_loss_clip": 0.0111994, + "auxiliary_loss_mlp": 0.01027715, + "balance_loss_clip": 1.04941773, + "balance_loss_mlp": 1.01701593, + "epoch": 0.61969036524876, + "flos": 27308269635840.0, + "grad_norm": 1.6025634532490274, + "language_loss": 0.80213964, + "learning_rate": 1.3346782318729988e-06, + "loss": 0.82361615, + "num_input_tokens_seen": 221955920, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.10693359, + "step": 10307, + "time_per_iteration": 2.5224130153656006 + }, + { + "auxiliary_loss_clip": 0.01067996, + "auxiliary_loss_mlp": 0.01008083, + "balance_loss_clip": 1.04181838, + "balance_loss_mlp": 1.00682878, + "epoch": 0.6197504885014279, + "flos": 51648955384320.0, + "grad_norm": 0.8819272686171578, + "language_loss": 0.59383976, + "learning_rate": 1.3343109640070203e-06, + "loss": 0.6146006, + "num_input_tokens_seen": 222011405, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.01255798, + "step": 10308, + "time_per_iteration": 3.1170566082000732 + }, + { + "auxiliary_loss_clip": 0.01115817, + "auxiliary_loss_mlp": 0.01028871, + "balance_loss_clip": 1.04359579, + "balance_loss_mlp": 1.01785028, + "epoch": 0.6198106117540959, + "flos": 30557107992960.0, + "grad_norm": 2.2874782728467267, + "language_loss": 0.68072677, + "learning_rate": 1.333943721384037e-06, + "loss": 0.70217359, + "num_input_tokens_seen": 222034545, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11016846, + "step": 10309, + "time_per_iteration": 3.979522228240967 + }, + { + "auxiliary_loss_clip": 0.01118443, + "auxiliary_loss_mlp": 0.01031998, + "balance_loss_clip": 1.0447855, + "balance_loss_mlp": 1.0199759, + "epoch": 0.6198707350067638, + "flos": 18909733184640.0, + "grad_norm": 3.3471721341103624, + "language_loss": 0.7190125, + "learning_rate": 1.3335765040179746e-06, + "loss": 0.74051696, + "num_input_tokens_seen": 222052690, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.12030029, + "step": 10310, + "time_per_iteration": 2.4592392444610596 + }, + { + "auxiliary_loss_clip": 0.01133137, + "auxiliary_loss_mlp": 0.01033903, + "balance_loss_clip": 1.05573952, + "balance_loss_mlp": 1.01987243, + "epoch": 0.6199308582594318, + "flos": 21433858738560.0, + "grad_norm": 2.798423023201385, + "language_loss": 0.78863358, + "learning_rate": 1.3332093119227573e-06, + "loss": 0.81030393, + "num_input_tokens_seen": 222069095, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.14038086, + "step": 10311, + "time_per_iteration": 2.455209732055664 + }, + { + "auxiliary_loss_clip": 0.01122381, + "auxiliary_loss_mlp": 0.01033689, + "balance_loss_clip": 1.04798639, + "balance_loss_mlp": 1.02135062, + "epoch": 0.6199909815120999, + "flos": 18407379525120.0, + "grad_norm": 1.9922992617933004, + "language_loss": 0.72987813, + "learning_rate": 1.3328421451123105e-06, + "loss": 0.75143886, + "num_input_tokens_seen": 222087360, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12341309, + "step": 10312, + "time_per_iteration": 2.44960880279541 + }, + { + "auxiliary_loss_clip": 0.01130283, + "auxiliary_loss_mlp": 0.01030863, + "balance_loss_clip": 1.05333602, + "balance_loss_mlp": 1.01833391, + "epoch": 0.6200511047647678, + "flos": 21466860359040.0, + "grad_norm": 2.163221579042828, + "language_loss": 0.72195399, + "learning_rate": 1.3324750036005557e-06, + "loss": 0.74356544, + "num_input_tokens_seen": 222106130, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.12530518, + "step": 10313, + "time_per_iteration": 2.4669058322906494 + }, + { + "auxiliary_loss_clip": 0.01124422, + "auxiliary_loss_mlp": 0.01029913, + "balance_loss_clip": 1.04893553, + "balance_loss_mlp": 1.01716352, + "epoch": 0.6201112280174358, + "flos": 18215903099520.0, + "grad_norm": 1.8494930692481286, + "language_loss": 0.78628385, + "learning_rate": 1.332107887401416e-06, + "loss": 0.80782723, + "num_input_tokens_seen": 222123125, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.12762451, + "step": 10314, + "time_per_iteration": 2.4527587890625 + }, + { + "auxiliary_loss_clip": 0.01122351, + "auxiliary_loss_mlp": 0.01035325, + "balance_loss_clip": 1.04497814, + "balance_loss_mlp": 1.02299857, + "epoch": 0.6201713512701037, + "flos": 20011185786240.0, + "grad_norm": 1.920379341737003, + "language_loss": 0.78073519, + "learning_rate": 1.331740796528812e-06, + "loss": 0.80231196, + "num_input_tokens_seen": 222140655, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.12329102, + "step": 10315, + "time_per_iteration": 3.8605618476867676 + }, + { + "auxiliary_loss_clip": 0.01125063, + "auxiliary_loss_mlp": 0.0103264, + "balance_loss_clip": 1.04835105, + "balance_loss_mlp": 1.02046275, + "epoch": 0.6202314745227717, + "flos": 22487692884480.0, + "grad_norm": 1.8558926160201683, + "language_loss": 0.76235563, + "learning_rate": 1.3313737309966641e-06, + "loss": 0.78393257, + "num_input_tokens_seen": 222160450, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.12176514, + "step": 10316, + "time_per_iteration": 2.465348482131958 + }, + { + "auxiliary_loss_clip": 0.01118194, + "auxiliary_loss_mlp": 0.01029744, + "balance_loss_clip": 1.04144621, + "balance_loss_mlp": 1.01674461, + "epoch": 0.6202915977754396, + "flos": 26828682220800.0, + "grad_norm": 2.741832766499886, + "language_loss": 0.77678555, + "learning_rate": 1.3310066908188915e-06, + "loss": 0.79826492, + "num_input_tokens_seen": 222179170, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.13000488, + "step": 10317, + "time_per_iteration": 2.5125324726104736 + }, + { + "auxiliary_loss_clip": 0.01053021, + "auxiliary_loss_mlp": 0.01005665, + "balance_loss_clip": 1.02722609, + "balance_loss_mlp": 1.00426626, + "epoch": 0.6203517210281076, + "flos": 62742694890240.0, + "grad_norm": 0.6945822758698111, + "language_loss": 0.59081161, + "learning_rate": 1.3306396760094122e-06, + "loss": 0.61139852, + "num_input_tokens_seen": 222242660, + "router_z_loss_clip": 0.25732422, + "router_z_loss_mlp": 0.01397705, + "step": 10318, + "time_per_iteration": 3.1418750286102295 + }, + { + "auxiliary_loss_clip": 0.01118993, + "auxiliary_loss_mlp": 0.01033157, + "balance_loss_clip": 1.04431522, + "balance_loss_mlp": 1.02053845, + "epoch": 0.6204118442807756, + "flos": 23404277162880.0, + "grad_norm": 1.6576463102505816, + "language_loss": 0.77894264, + "learning_rate": 1.330272686582143e-06, + "loss": 0.80046415, + "num_input_tokens_seen": 222262170, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.1262207, + "step": 10319, + "time_per_iteration": 2.4734675884246826 + }, + { + "auxiliary_loss_clip": 0.01124337, + "auxiliary_loss_mlp": 0.01033769, + "balance_loss_clip": 1.04911411, + "balance_loss_mlp": 1.0220623, + "epoch": 0.6204719675334436, + "flos": 20193647898240.0, + "grad_norm": 1.8506498802111713, + "language_loss": 0.66227579, + "learning_rate": 1.3299057225510013e-06, + "loss": 0.68385684, + "num_input_tokens_seen": 222280375, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.11712646, + "step": 10320, + "time_per_iteration": 2.456446647644043 + }, + { + "auxiliary_loss_clip": 0.01114257, + "auxiliary_loss_mlp": 0.01029113, + "balance_loss_clip": 1.04175091, + "balance_loss_mlp": 1.01703119, + "epoch": 0.6205320907861115, + "flos": 13188050916480.0, + "grad_norm": 1.7028612522203643, + "language_loss": 0.75971103, + "learning_rate": 1.3295387839299013e-06, + "loss": 0.78114474, + "num_input_tokens_seen": 222297325, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.12084961, + "step": 10321, + "time_per_iteration": 3.8145787715911865 + }, + { + "auxiliary_loss_clip": 0.0112251, + "auxiliary_loss_mlp": 0.01028886, + "balance_loss_clip": 1.04933536, + "balance_loss_mlp": 1.0170424, + "epoch": 0.6205922140387795, + "flos": 20668386977280.0, + "grad_norm": 1.7331547601216626, + "language_loss": 0.73115396, + "learning_rate": 1.329171870732758e-06, + "loss": 0.7526679, + "num_input_tokens_seen": 222317095, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11859131, + "step": 10322, + "time_per_iteration": 2.4542787075042725 + }, + { + "auxiliary_loss_clip": 0.01125454, + "auxiliary_loss_mlp": 0.01026235, + "balance_loss_clip": 1.05189085, + "balance_loss_mlp": 1.01436162, + "epoch": 0.6206523372914474, + "flos": 23877831093120.0, + "grad_norm": 1.9938346155789852, + "language_loss": 0.72580111, + "learning_rate": 1.3288049829734845e-06, + "loss": 0.74731791, + "num_input_tokens_seen": 222337055, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11871338, + "step": 10323, + "time_per_iteration": 2.4765658378601074 + }, + { + "auxiliary_loss_clip": 0.01134667, + "auxiliary_loss_mlp": 0.01032454, + "balance_loss_clip": 1.05193949, + "balance_loss_mlp": 1.01933479, + "epoch": 0.6207124605441154, + "flos": 13406603218560.0, + "grad_norm": 2.13007632022152, + "language_loss": 0.5861817, + "learning_rate": 1.3284381206659933e-06, + "loss": 0.60785294, + "num_input_tokens_seen": 222354515, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.13104248, + "step": 10324, + "time_per_iteration": 2.412517786026001 + }, + { + "auxiliary_loss_clip": 0.01129635, + "auxiliary_loss_mlp": 0.01031242, + "balance_loss_clip": 1.05251169, + "balance_loss_mlp": 1.01875472, + "epoch": 0.6207725837967835, + "flos": 18916341287040.0, + "grad_norm": 2.141801870434956, + "language_loss": 0.76858568, + "learning_rate": 1.3280712838241956e-06, + "loss": 0.79019445, + "num_input_tokens_seen": 222372755, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.12493896, + "step": 10325, + "time_per_iteration": 2.527254343032837 + }, + { + "auxiliary_loss_clip": 0.01122309, + "auxiliary_loss_mlp": 0.0102988, + "balance_loss_clip": 1.04491973, + "balance_loss_mlp": 1.01670766, + "epoch": 0.6208327070494514, + "flos": 23980211832960.0, + "grad_norm": 2.5377566492067456, + "language_loss": 0.72685111, + "learning_rate": 1.327704472462003e-06, + "loss": 0.74837309, + "num_input_tokens_seen": 222391380, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.13171387, + "step": 10326, + "time_per_iteration": 2.5263004302978516 + }, + { + "auxiliary_loss_clip": 0.01124341, + "auxiliary_loss_mlp": 0.01042068, + "balance_loss_clip": 1.04615462, + "balance_loss_mlp": 1.02880013, + "epoch": 0.6208928303021194, + "flos": 22820405587200.0, + "grad_norm": 2.059404462158734, + "language_loss": 0.73815727, + "learning_rate": 1.3273376865933234e-06, + "loss": 0.75982136, + "num_input_tokens_seen": 222411165, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.13262939, + "step": 10327, + "time_per_iteration": 2.477367877960205 + }, + { + "auxiliary_loss_clip": 0.01129589, + "auxiliary_loss_mlp": 0.01035837, + "balance_loss_clip": 1.05149531, + "balance_loss_mlp": 1.0219785, + "epoch": 0.6209529535547873, + "flos": 17564519911680.0, + "grad_norm": 7.259081014838247, + "language_loss": 0.7969951, + "learning_rate": 1.326970926232066e-06, + "loss": 0.81864935, + "num_input_tokens_seen": 222428110, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.1385498, + "step": 10328, + "time_per_iteration": 2.3972926139831543 + }, + { + "auxiliary_loss_clip": 0.01125451, + "auxiliary_loss_mlp": 0.01038076, + "balance_loss_clip": 1.04561591, + "balance_loss_mlp": 1.02479649, + "epoch": 0.6210130768074553, + "flos": 22011912311040.0, + "grad_norm": 1.8158268120877323, + "language_loss": 0.77862269, + "learning_rate": 1.3266041913921396e-06, + "loss": 0.80025792, + "num_input_tokens_seen": 222446385, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.1328125, + "step": 10329, + "time_per_iteration": 2.448425531387329 + }, + { + "auxiliary_loss_clip": 0.01068707, + "auxiliary_loss_mlp": 0.01005362, + "balance_loss_clip": 1.04344344, + "balance_loss_mlp": 1.00410891, + "epoch": 0.6210732000601232, + "flos": 63676873854720.0, + "grad_norm": 0.8337238563782067, + "language_loss": 0.62147987, + "learning_rate": 1.3262374820874484e-06, + "loss": 0.64222062, + "num_input_tokens_seen": 222502150, + "router_z_loss_clip": 0.25268555, + "router_z_loss_mlp": 0.01252747, + "step": 10330, + "time_per_iteration": 3.0275399684906006 + }, + { + "auxiliary_loss_clip": 0.01123542, + "auxiliary_loss_mlp": 0.01039511, + "balance_loss_clip": 1.04572678, + "balance_loss_mlp": 1.02571225, + "epoch": 0.6211333233127913, + "flos": 24243365848320.0, + "grad_norm": 1.8541806567256216, + "language_loss": 0.7762233, + "learning_rate": 1.3258707983319002e-06, + "loss": 0.79785383, + "num_input_tokens_seen": 222519880, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.13806152, + "step": 10331, + "time_per_iteration": 2.5211331844329834 + }, + { + "auxiliary_loss_clip": 0.01128749, + "auxiliary_loss_mlp": 0.01037633, + "balance_loss_clip": 1.05149055, + "balance_loss_mlp": 1.02453232, + "epoch": 0.6211934465654592, + "flos": 16943803960320.0, + "grad_norm": 1.82430794369522, + "language_loss": 0.6779784, + "learning_rate": 1.3255041401393992e-06, + "loss": 0.69964224, + "num_input_tokens_seen": 222538545, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.13104248, + "step": 10332, + "time_per_iteration": 2.4477264881134033 + }, + { + "auxiliary_loss_clip": 0.01118552, + "auxiliary_loss_mlp": 0.0103085, + "balance_loss_clip": 1.04418361, + "balance_loss_mlp": 1.01861894, + "epoch": 0.6212535698181272, + "flos": 15267386355840.0, + "grad_norm": 1.709243945708628, + "language_loss": 0.76570195, + "learning_rate": 1.3251375075238476e-06, + "loss": 0.78719592, + "num_input_tokens_seen": 222556935, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12231445, + "step": 10333, + "time_per_iteration": 2.484731435775757 + }, + { + "auxiliary_loss_clip": 0.01120471, + "auxiliary_loss_mlp": 0.01035671, + "balance_loss_clip": 1.04724145, + "balance_loss_mlp": 1.02385187, + "epoch": 0.6213136930707951, + "flos": 13443950384640.0, + "grad_norm": 2.467347443721007, + "language_loss": 0.6974892, + "learning_rate": 1.3247709004991507e-06, + "loss": 0.71905065, + "num_input_tokens_seen": 222574035, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11810303, + "step": 10334, + "time_per_iteration": 2.470057487487793 + }, + { + "auxiliary_loss_clip": 0.01120816, + "auxiliary_loss_mlp": 0.01035556, + "balance_loss_clip": 1.04796934, + "balance_loss_mlp": 1.02406454, + "epoch": 0.6213738163234631, + "flos": 18111223889280.0, + "grad_norm": 1.869209471446517, + "language_loss": 0.70009178, + "learning_rate": 1.3244043190792078e-06, + "loss": 0.72165549, + "num_input_tokens_seen": 222592290, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.1149292, + "step": 10335, + "time_per_iteration": 2.4363443851470947 + }, + { + "auxiliary_loss_clip": 0.01117332, + "auxiliary_loss_mlp": 0.01033045, + "balance_loss_clip": 1.044716, + "balance_loss_mlp": 1.02083755, + "epoch": 0.621433939576131, + "flos": 25337348421120.0, + "grad_norm": 1.6020383415381514, + "language_loss": 0.80195302, + "learning_rate": 1.3240377632779213e-06, + "loss": 0.82345676, + "num_input_tokens_seen": 222612805, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.12194824, + "step": 10336, + "time_per_iteration": 2.483583927154541 + }, + { + "auxiliary_loss_clip": 0.01117032, + "auxiliary_loss_mlp": 0.01035923, + "balance_loss_clip": 1.04400206, + "balance_loss_mlp": 1.0235846, + "epoch": 0.621494062828799, + "flos": 22565619440640.0, + "grad_norm": 4.392061214199205, + "language_loss": 0.73157209, + "learning_rate": 1.3236712331091907e-06, + "loss": 0.75310165, + "num_input_tokens_seen": 222632260, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.12329102, + "step": 10337, + "time_per_iteration": 2.4885127544403076 + }, + { + "auxiliary_loss_clip": 0.01119306, + "auxiliary_loss_mlp": 0.01041139, + "balance_loss_clip": 1.04258871, + "balance_loss_mlp": 1.02821684, + "epoch": 0.621554186081467, + "flos": 27417976750080.0, + "grad_norm": 2.7624425911606645, + "language_loss": 0.62960136, + "learning_rate": 1.3233047285869145e-06, + "loss": 0.65120584, + "num_input_tokens_seen": 222653570, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.1293335, + "step": 10338, + "time_per_iteration": 2.5749969482421875 + }, + { + "auxiliary_loss_clip": 0.01116837, + "auxiliary_loss_mlp": 0.01035759, + "balance_loss_clip": 1.04275143, + "balance_loss_mlp": 1.02317619, + "epoch": 0.621614309334135, + "flos": 22346815743360.0, + "grad_norm": 1.9055123905017615, + "language_loss": 0.71060598, + "learning_rate": 1.322938249724991e-06, + "loss": 0.7321319, + "num_input_tokens_seen": 222672480, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.12573242, + "step": 10339, + "time_per_iteration": 2.504537343978882 + }, + { + "auxiliary_loss_clip": 0.01118748, + "auxiliary_loss_mlp": 0.01033281, + "balance_loss_clip": 1.0453217, + "balance_loss_mlp": 1.02069867, + "epoch": 0.621674432586803, + "flos": 19281229597440.0, + "grad_norm": 1.8459499361983631, + "language_loss": 0.6926229, + "learning_rate": 1.3225717965373166e-06, + "loss": 0.71414316, + "num_input_tokens_seen": 222691200, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.12579346, + "step": 10340, + "time_per_iteration": 2.434267997741699 + }, + { + "auxiliary_loss_clip": 0.01117704, + "auxiliary_loss_mlp": 0.01029376, + "balance_loss_clip": 1.04567802, + "balance_loss_mlp": 1.0179317, + "epoch": 0.6217345558394709, + "flos": 21609533180160.0, + "grad_norm": 2.2704555988225237, + "language_loss": 0.69075644, + "learning_rate": 1.322205369037788e-06, + "loss": 0.71222723, + "num_input_tokens_seen": 222709975, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11444092, + "step": 10341, + "time_per_iteration": 2.4771339893341064 + }, + { + "auxiliary_loss_clip": 0.01123321, + "auxiliary_loss_mlp": 0.01030477, + "balance_loss_clip": 1.04793239, + "balance_loss_mlp": 1.01726294, + "epoch": 0.6217946790921389, + "flos": 18004102554240.0, + "grad_norm": 2.420645790433662, + "language_loss": 0.80849189, + "learning_rate": 1.321838967240299e-06, + "loss": 0.83002985, + "num_input_tokens_seen": 222729005, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.13214111, + "step": 10342, + "time_per_iteration": 2.4173665046691895 + }, + { + "auxiliary_loss_clip": 0.01058575, + "auxiliary_loss_mlp": 0.01007196, + "balance_loss_clip": 1.03052115, + "balance_loss_mlp": 1.00569427, + "epoch": 0.6218548023448068, + "flos": 61973631768960.0, + "grad_norm": 0.7745088762959447, + "language_loss": 0.57282627, + "learning_rate": 1.3214725911587452e-06, + "loss": 0.59348404, + "num_input_tokens_seen": 222786090, + "router_z_loss_clip": 0.28076172, + "router_z_loss_mlp": 0.01502991, + "step": 10343, + "time_per_iteration": 4.3957507610321045 + }, + { + "auxiliary_loss_clip": 0.01118621, + "auxiliary_loss_mlp": 0.01028017, + "balance_loss_clip": 1.04629529, + "balance_loss_mlp": 1.01742554, + "epoch": 0.6219149255974749, + "flos": 25739152934400.0, + "grad_norm": 1.815782634171821, + "language_loss": 0.73191124, + "learning_rate": 1.3211062408070184e-06, + "loss": 0.75337768, + "num_input_tokens_seen": 222806100, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.10595703, + "step": 10344, + "time_per_iteration": 2.4775779247283936 + }, + { + "auxiliary_loss_clip": 0.01126532, + "auxiliary_loss_mlp": 0.01035811, + "balance_loss_clip": 1.05067611, + "balance_loss_mlp": 1.02426517, + "epoch": 0.6219750488501428, + "flos": 25411073086080.0, + "grad_norm": 1.7498652654817841, + "language_loss": 0.60214996, + "learning_rate": 1.3207399161990105e-06, + "loss": 0.62377334, + "num_input_tokens_seen": 222826575, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.11553955, + "step": 10345, + "time_per_iteration": 2.4760985374450684 + }, + { + "auxiliary_loss_clip": 0.01117853, + "auxiliary_loss_mlp": 0.01040291, + "balance_loss_clip": 1.04239607, + "balance_loss_mlp": 1.0270586, + "epoch": 0.6220351721028108, + "flos": 20047383717120.0, + "grad_norm": 1.850491453528034, + "language_loss": 0.78423768, + "learning_rate": 1.320373617348614e-06, + "loss": 0.80581915, + "num_input_tokens_seen": 222845285, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.13226318, + "step": 10346, + "time_per_iteration": 2.638953924179077 + }, + { + "auxiliary_loss_clip": 0.01131538, + "auxiliary_loss_mlp": 0.01039493, + "balance_loss_clip": 1.04931366, + "balance_loss_mlp": 1.02559328, + "epoch": 0.6220952953554787, + "flos": 27488397363840.0, + "grad_norm": 1.5114571911035706, + "language_loss": 0.71146095, + "learning_rate": 1.3200073442697171e-06, + "loss": 0.73317122, + "num_input_tokens_seen": 222864575, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.13916016, + "step": 10347, + "time_per_iteration": 2.6111626625061035 + }, + { + "auxiliary_loss_clip": 0.01121348, + "auxiliary_loss_mlp": 0.01030052, + "balance_loss_clip": 1.04676867, + "balance_loss_mlp": 1.01760054, + "epoch": 0.6221554186081467, + "flos": 19207612673280.0, + "grad_norm": 1.8256478992244554, + "language_loss": 0.71917951, + "learning_rate": 1.3196410969762108e-06, + "loss": 0.74069351, + "num_input_tokens_seen": 222884420, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.12463379, + "step": 10348, + "time_per_iteration": 2.518702507019043 + }, + { + "auxiliary_loss_clip": 0.01053664, + "auxiliary_loss_mlp": 0.01006337, + "balance_loss_clip": 1.02705824, + "balance_loss_mlp": 1.00452209, + "epoch": 0.6222155418608146, + "flos": 62950939989120.0, + "grad_norm": 0.8288119052904223, + "language_loss": 0.54169363, + "learning_rate": 1.3192748754819815e-06, + "loss": 0.56229365, + "num_input_tokens_seen": 222944690, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.01815796, + "step": 10349, + "time_per_iteration": 3.0846452713012695 + }, + { + "auxiliary_loss_clip": 0.01119427, + "auxiliary_loss_mlp": 0.01030994, + "balance_loss_clip": 1.0444665, + "balance_loss_mlp": 1.01674855, + "epoch": 0.6222756651134826, + "flos": 22601099099520.0, + "grad_norm": 1.8802842576330892, + "language_loss": 0.69400644, + "learning_rate": 1.3189086798009173e-06, + "loss": 0.71551067, + "num_input_tokens_seen": 222962990, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.14233398, + "step": 10350, + "time_per_iteration": 2.4952890872955322 + }, + { + "auxiliary_loss_clip": 0.01124512, + "auxiliary_loss_mlp": 0.0104141, + "balance_loss_clip": 1.04721081, + "balance_loss_mlp": 1.02765369, + "epoch": 0.6223357883661506, + "flos": 21142228216320.0, + "grad_norm": 1.8132083703541653, + "language_loss": 0.57056701, + "learning_rate": 1.3185425099469046e-06, + "loss": 0.59222627, + "num_input_tokens_seen": 222980715, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.13763428, + "step": 10351, + "time_per_iteration": 2.438568115234375 + }, + { + "auxiliary_loss_clip": 0.01056718, + "auxiliary_loss_mlp": 0.0100874, + "balance_loss_clip": 1.03074574, + "balance_loss_mlp": 1.00722146, + "epoch": 0.6223959116188186, + "flos": 63765071700480.0, + "grad_norm": 0.802811322757505, + "language_loss": 0.61148286, + "learning_rate": 1.3181763659338276e-06, + "loss": 0.63213754, + "num_input_tokens_seen": 223040685, + "router_z_loss_clip": 0.26025391, + "router_z_loss_mlp": 0.0151825, + "step": 10352, + "time_per_iteration": 3.099952220916748 + }, + { + "auxiliary_loss_clip": 0.01125875, + "auxiliary_loss_mlp": 0.01033507, + "balance_loss_clip": 1.05233467, + "balance_loss_mlp": 1.02107322, + "epoch": 0.6224560348714866, + "flos": 22565727181440.0, + "grad_norm": 2.3036686873309526, + "language_loss": 0.82215554, + "learning_rate": 1.3178102477755714e-06, + "loss": 0.84374934, + "num_input_tokens_seen": 223059000, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.12426758, + "step": 10353, + "time_per_iteration": 4.011642217636108 + }, + { + "auxiliary_loss_clip": 0.01119312, + "auxiliary_loss_mlp": 0.01035579, + "balance_loss_clip": 1.04593658, + "balance_loss_mlp": 1.02336645, + "epoch": 0.6225161581241545, + "flos": 24097748112000.0, + "grad_norm": 2.11890441257021, + "language_loss": 0.75807047, + "learning_rate": 1.3174441554860195e-06, + "loss": 0.77961934, + "num_input_tokens_seen": 223079345, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.12213135, + "step": 10354, + "time_per_iteration": 2.4608848094940186 + }, + { + "auxiliary_loss_clip": 0.01120637, + "auxiliary_loss_mlp": 0.01029451, + "balance_loss_clip": 1.04739571, + "balance_loss_mlp": 1.01716638, + "epoch": 0.6225762813768225, + "flos": 20443513881600.0, + "grad_norm": 1.6829712471277598, + "language_loss": 0.78603494, + "learning_rate": 1.3170780890790528e-06, + "loss": 0.80753577, + "num_input_tokens_seen": 223097880, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.1227417, + "step": 10355, + "time_per_iteration": 2.4232470989227295 + }, + { + "auxiliary_loss_clip": 0.01123515, + "auxiliary_loss_mlp": 0.01036101, + "balance_loss_clip": 1.04992664, + "balance_loss_mlp": 1.02360737, + "epoch": 0.6226364046294904, + "flos": 27198131558400.0, + "grad_norm": 1.5326647030731584, + "language_loss": 0.78108251, + "learning_rate": 1.3167120485685538e-06, + "loss": 0.8026787, + "num_input_tokens_seen": 223118185, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.12506104, + "step": 10356, + "time_per_iteration": 2.4954872131347656 + }, + { + "auxiliary_loss_clip": 0.0113098, + "auxiliary_loss_mlp": 0.01036939, + "balance_loss_clip": 1.049932, + "balance_loss_mlp": 1.02303958, + "epoch": 0.6226965278821585, + "flos": 20445776438400.0, + "grad_norm": 1.9128697318091084, + "language_loss": 0.67875862, + "learning_rate": 1.3163460339684024e-06, + "loss": 0.70043778, + "num_input_tokens_seen": 223137600, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.13897705, + "step": 10357, + "time_per_iteration": 2.429988384246826 + }, + { + "auxiliary_loss_clip": 0.0112967, + "auxiliary_loss_mlp": 0.01031903, + "balance_loss_clip": 1.05213547, + "balance_loss_mlp": 1.01777673, + "epoch": 0.6227566511348264, + "flos": 22162737519360.0, + "grad_norm": 4.4797704315501115, + "language_loss": 0.76060861, + "learning_rate": 1.3159800452924778e-06, + "loss": 0.78222436, + "num_input_tokens_seen": 223154360, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.14123535, + "step": 10358, + "time_per_iteration": 3.927262783050537 + }, + { + "auxiliary_loss_clip": 0.01118427, + "auxiliary_loss_mlp": 0.01027266, + "balance_loss_clip": 1.04392958, + "balance_loss_mlp": 1.01526737, + "epoch": 0.6228167743874944, + "flos": 18040875102720.0, + "grad_norm": 2.0252443771970934, + "language_loss": 0.81986642, + "learning_rate": 1.3156140825546588e-06, + "loss": 0.84132326, + "num_input_tokens_seen": 223172255, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11987305, + "step": 10359, + "time_per_iteration": 2.425574779510498 + }, + { + "auxiliary_loss_clip": 0.01111772, + "auxiliary_loss_mlp": 0.01035967, + "balance_loss_clip": 1.03998494, + "balance_loss_mlp": 1.02333045, + "epoch": 0.6228768976401623, + "flos": 17742851959680.0, + "grad_norm": 2.4017437592126027, + "language_loss": 0.73235261, + "learning_rate": 1.315248145768822e-06, + "loss": 0.75382996, + "num_input_tokens_seen": 223186965, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.12646484, + "step": 10360, + "time_per_iteration": 2.4592456817626953 + }, + { + "auxiliary_loss_clip": 0.01125047, + "auxiliary_loss_mlp": 0.01036539, + "balance_loss_clip": 1.04905796, + "balance_loss_mlp": 1.02381301, + "epoch": 0.6229370208928303, + "flos": 17894934144000.0, + "grad_norm": 2.023313661397462, + "language_loss": 0.77686548, + "learning_rate": 1.3148822349488442e-06, + "loss": 0.79848135, + "num_input_tokens_seen": 223206045, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12719727, + "step": 10361, + "time_per_iteration": 2.4319891929626465 + }, + { + "auxiliary_loss_clip": 0.01125752, + "auxiliary_loss_mlp": 0.01029884, + "balance_loss_clip": 1.05145967, + "balance_loss_mlp": 1.0177722, + "epoch": 0.6229971441454982, + "flos": 17347763289600.0, + "grad_norm": 1.6911323369310274, + "language_loss": 0.67422557, + "learning_rate": 1.3145163501086005e-06, + "loss": 0.69578195, + "num_input_tokens_seen": 223224820, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.12097168, + "step": 10362, + "time_per_iteration": 2.4386117458343506 + }, + { + "auxiliary_loss_clip": 0.01130608, + "auxiliary_loss_mlp": 0.01039193, + "balance_loss_clip": 1.04968977, + "balance_loss_mlp": 1.0258894, + "epoch": 0.6230572673981662, + "flos": 29241376807680.0, + "grad_norm": 2.0697211110108213, + "language_loss": 0.67256874, + "learning_rate": 1.3141504912619658e-06, + "loss": 0.6942668, + "num_input_tokens_seen": 223243205, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.13311768, + "step": 10363, + "time_per_iteration": 2.4896082878112793 + }, + { + "auxiliary_loss_clip": 0.01122085, + "auxiliary_loss_mlp": 0.01030692, + "balance_loss_clip": 1.04567885, + "balance_loss_mlp": 1.0174185, + "epoch": 0.6231173906508342, + "flos": 16325961096960.0, + "grad_norm": 2.4825690797009865, + "language_loss": 0.86646307, + "learning_rate": 1.3137846584228127e-06, + "loss": 0.88799083, + "num_input_tokens_seen": 223261370, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.13287354, + "step": 10364, + "time_per_iteration": 2.4389708042144775 + }, + { + "auxiliary_loss_clip": 0.01052278, + "auxiliary_loss_mlp": 0.0100176, + "balance_loss_clip": 1.02649069, + "balance_loss_mlp": 0.99998862, + "epoch": 0.6231775139035022, + "flos": 68702032517760.0, + "grad_norm": 0.885052333870449, + "language_loss": 0.60766578, + "learning_rate": 1.313418851605015e-06, + "loss": 0.62820619, + "num_input_tokens_seen": 223315050, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.01771545, + "step": 10365, + "time_per_iteration": 4.478825807571411 + }, + { + "auxiliary_loss_clip": 0.01128009, + "auxiliary_loss_mlp": 0.01036745, + "balance_loss_clip": 1.04928851, + "balance_loss_mlp": 1.02245212, + "epoch": 0.6232376371561702, + "flos": 19821038163840.0, + "grad_norm": 1.778766424846442, + "language_loss": 0.75115776, + "learning_rate": 1.3130530708224427e-06, + "loss": 0.77280533, + "num_input_tokens_seen": 223332130, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.14294434, + "step": 10366, + "time_per_iteration": 2.472093343734741 + }, + { + "auxiliary_loss_clip": 0.01123229, + "auxiliary_loss_mlp": 0.01043231, + "balance_loss_clip": 1.0465281, + "balance_loss_mlp": 1.02896202, + "epoch": 0.6232977604088381, + "flos": 23258264376960.0, + "grad_norm": 11.328265232245812, + "language_loss": 0.76494962, + "learning_rate": 1.3126873160889665e-06, + "loss": 0.78661424, + "num_input_tokens_seen": 223351605, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.14263916, + "step": 10367, + "time_per_iteration": 2.4615390300750732 + }, + { + "auxiliary_loss_clip": 0.01113707, + "auxiliary_loss_mlp": 0.01034024, + "balance_loss_clip": 1.0423485, + "balance_loss_mlp": 1.02164984, + "epoch": 0.6233578836615061, + "flos": 21106425335040.0, + "grad_norm": 1.5357783988016485, + "language_loss": 0.7860949, + "learning_rate": 1.312321587418457e-06, + "loss": 0.80757219, + "num_input_tokens_seen": 223372090, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.12371826, + "step": 10368, + "time_per_iteration": 2.5225486755371094 + }, + { + "auxiliary_loss_clip": 0.01126869, + "auxiliary_loss_mlp": 0.01032501, + "balance_loss_clip": 1.04985332, + "balance_loss_mlp": 1.01975155, + "epoch": 0.623418006914174, + "flos": 23769416868480.0, + "grad_norm": 1.8811978403726644, + "language_loss": 0.68157101, + "learning_rate": 1.3119558848247811e-06, + "loss": 0.7031647, + "num_input_tokens_seen": 223390110, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.12756348, + "step": 10369, + "time_per_iteration": 2.5040712356567383 + }, + { + "auxiliary_loss_clip": 0.01126382, + "auxiliary_loss_mlp": 0.01038163, + "balance_loss_clip": 1.04914284, + "balance_loss_mlp": 1.02451372, + "epoch": 0.6234781301668421, + "flos": 17890480857600.0, + "grad_norm": 2.28597784026314, + "language_loss": 0.87945628, + "learning_rate": 1.3115902083218072e-06, + "loss": 0.90110171, + "num_input_tokens_seen": 223404205, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.13653564, + "step": 10370, + "time_per_iteration": 2.4183273315429688 + }, + { + "auxiliary_loss_clip": 0.01120732, + "auxiliary_loss_mlp": 0.01028207, + "balance_loss_clip": 1.04649734, + "balance_loss_mlp": 1.01595783, + "epoch": 0.62353825341951, + "flos": 26175503352960.0, + "grad_norm": 1.5320750869882644, + "language_loss": 0.66202515, + "learning_rate": 1.311224557923402e-06, + "loss": 0.6835146, + "num_input_tokens_seen": 223424855, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.12243652, + "step": 10371, + "time_per_iteration": 2.5255136489868164 + }, + { + "auxiliary_loss_clip": 0.01121265, + "auxiliary_loss_mlp": 0.01030972, + "balance_loss_clip": 1.04902768, + "balance_loss_mlp": 1.02029693, + "epoch": 0.623598376672178, + "flos": 31139902160640.0, + "grad_norm": 1.417618710281843, + "language_loss": 0.77598554, + "learning_rate": 1.3108589336434298e-06, + "loss": 0.79750788, + "num_input_tokens_seen": 223447225, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.10675049, + "step": 10372, + "time_per_iteration": 2.5868215560913086 + }, + { + "auxiliary_loss_clip": 0.01123798, + "auxiliary_loss_mlp": 0.01035602, + "balance_loss_clip": 1.0465492, + "balance_loss_mlp": 1.02247715, + "epoch": 0.6236584999248459, + "flos": 23730202195200.0, + "grad_norm": 1.7614518194442632, + "language_loss": 0.77326238, + "learning_rate": 1.3104933354957568e-06, + "loss": 0.79485643, + "num_input_tokens_seen": 223467520, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.13104248, + "step": 10373, + "time_per_iteration": 2.544471263885498 + }, + { + "auxiliary_loss_clip": 0.01121715, + "auxiliary_loss_mlp": 0.01026468, + "balance_loss_clip": 1.04954898, + "balance_loss_mlp": 1.01449966, + "epoch": 0.6237186231775139, + "flos": 21762764599680.0, + "grad_norm": 1.6694540943777374, + "language_loss": 0.69490302, + "learning_rate": 1.3101277634942448e-06, + "loss": 0.71638489, + "num_input_tokens_seen": 223488130, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11968994, + "step": 10374, + "time_per_iteration": 2.480578899383545 + }, + { + "auxiliary_loss_clip": 0.0112131, + "auxiliary_loss_mlp": 0.01033648, + "balance_loss_clip": 1.04710388, + "balance_loss_mlp": 1.02163219, + "epoch": 0.6237787464301818, + "flos": 14939486075520.0, + "grad_norm": 2.2455008994115038, + "language_loss": 0.77196443, + "learning_rate": 1.3097622176527577e-06, + "loss": 0.79351401, + "num_input_tokens_seen": 223505105, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.12017822, + "step": 10375, + "time_per_iteration": 2.427020788192749 + }, + { + "auxiliary_loss_clip": 0.01117423, + "auxiliary_loss_mlp": 0.01037283, + "balance_loss_clip": 1.04401588, + "balance_loss_mlp": 1.02499247, + "epoch": 0.6238388696828499, + "flos": 35590311302400.0, + "grad_norm": 1.7954185373336589, + "language_loss": 0.7024833, + "learning_rate": 1.3093966979851566e-06, + "loss": 0.72403038, + "num_input_tokens_seen": 223528065, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.12286377, + "step": 10376, + "time_per_iteration": 2.584221601486206 + }, + { + "auxiliary_loss_clip": 0.01117926, + "auxiliary_loss_mlp": 0.01035137, + "balance_loss_clip": 1.04242277, + "balance_loss_mlp": 1.02115345, + "epoch": 0.6238989929355178, + "flos": 23623511823360.0, + "grad_norm": 1.9515263158121918, + "language_loss": 0.76730585, + "learning_rate": 1.309031204505301e-06, + "loss": 0.78883648, + "num_input_tokens_seen": 223547305, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.13977051, + "step": 10377, + "time_per_iteration": 2.498490571975708 + }, + { + "auxiliary_loss_clip": 0.01117497, + "auxiliary_loss_mlp": 0.01032281, + "balance_loss_clip": 1.04405785, + "balance_loss_mlp": 1.02108097, + "epoch": 0.6239591161881858, + "flos": 22087468569600.0, + "grad_norm": 3.1688619243739446, + "language_loss": 0.6862132, + "learning_rate": 1.308665737227052e-06, + "loss": 0.70771098, + "num_input_tokens_seen": 223567205, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11206055, + "step": 10378, + "time_per_iteration": 2.5119872093200684 + }, + { + "auxiliary_loss_clip": 0.01117937, + "auxiliary_loss_mlp": 0.01038815, + "balance_loss_clip": 1.04414439, + "balance_loss_mlp": 1.02572644, + "epoch": 0.6240192394408538, + "flos": 24535930124160.0, + "grad_norm": 1.7891749789815594, + "language_loss": 0.76257473, + "learning_rate": 1.3083002961642675e-06, + "loss": 0.78414226, + "num_input_tokens_seen": 223586560, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.13098145, + "step": 10379, + "time_per_iteration": 2.4974377155303955 + }, + { + "auxiliary_loss_clip": 0.01124309, + "auxiliary_loss_mlp": 0.01031499, + "balance_loss_clip": 1.05045652, + "balance_loss_mlp": 1.01935768, + "epoch": 0.6240793626935217, + "flos": 27931930502400.0, + "grad_norm": 1.6982886879873107, + "language_loss": 0.79255491, + "learning_rate": 1.3079348813308051e-06, + "loss": 0.81411296, + "num_input_tokens_seen": 223610595, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.12145996, + "step": 10380, + "time_per_iteration": 2.558608055114746 + }, + { + "auxiliary_loss_clip": 0.01121827, + "auxiliary_loss_mlp": 0.01035947, + "balance_loss_clip": 1.05066085, + "balance_loss_mlp": 1.02488482, + "epoch": 0.6241394859461897, + "flos": 22892514140160.0, + "grad_norm": 1.4983352690626581, + "language_loss": 0.80108732, + "learning_rate": 1.3075694927405207e-06, + "loss": 0.82266504, + "num_input_tokens_seen": 223630230, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.11071777, + "step": 10381, + "time_per_iteration": 2.4539601802825928 + }, + { + "auxiliary_loss_clip": 0.01121337, + "auxiliary_loss_mlp": 0.01032837, + "balance_loss_clip": 1.04575336, + "balance_loss_mlp": 1.02051067, + "epoch": 0.6241996091988576, + "flos": 12750766744320.0, + "grad_norm": 2.1984132833297636, + "language_loss": 0.74576235, + "learning_rate": 1.3072041304072718e-06, + "loss": 0.76730412, + "num_input_tokens_seen": 223648360, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12335205, + "step": 10382, + "time_per_iteration": 2.428527593612671 + }, + { + "auxiliary_loss_clip": 0.01119477, + "auxiliary_loss_mlp": 0.01029071, + "balance_loss_clip": 1.04680562, + "balance_loss_mlp": 1.01748967, + "epoch": 0.6242597324515257, + "flos": 25851302173440.0, + "grad_norm": 1.4629015424351288, + "language_loss": 0.78492451, + "learning_rate": 1.306838794344911e-06, + "loss": 0.80641007, + "num_input_tokens_seen": 223671255, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11584473, + "step": 10383, + "time_per_iteration": 2.5153281688690186 + }, + { + "auxiliary_loss_clip": 0.0111293, + "auxiliary_loss_mlp": 0.01030077, + "balance_loss_clip": 1.04099762, + "balance_loss_mlp": 1.01850772, + "epoch": 0.6243198557041936, + "flos": 19937712516480.0, + "grad_norm": 1.9550683486456937, + "language_loss": 0.75277305, + "learning_rate": 1.3064734845672925e-06, + "loss": 0.77420306, + "num_input_tokens_seen": 223689860, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11572266, + "step": 10384, + "time_per_iteration": 2.488783836364746 + }, + { + "auxiliary_loss_clip": 0.01114893, + "auxiliary_loss_mlp": 0.01033122, + "balance_loss_clip": 1.03997672, + "balance_loss_mlp": 1.02039623, + "epoch": 0.6243799789568616, + "flos": 18406194376320.0, + "grad_norm": 2.0289621394961257, + "language_loss": 0.66758764, + "learning_rate": 1.3061082010882694e-06, + "loss": 0.68906772, + "num_input_tokens_seen": 223707835, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.1272583, + "step": 10385, + "time_per_iteration": 3.8654158115386963 + }, + { + "auxiliary_loss_clip": 0.01046158, + "auxiliary_loss_mlp": 0.01006252, + "balance_loss_clip": 1.02059841, + "balance_loss_mlp": 1.00493479, + "epoch": 0.6244401022095295, + "flos": 66027587523840.0, + "grad_norm": 0.7595826480497337, + "language_loss": 0.62029171, + "learning_rate": 1.305742943921692e-06, + "loss": 0.64081579, + "num_input_tokens_seen": 223771875, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.01318359, + "step": 10386, + "time_per_iteration": 3.107893943786621 + }, + { + "auxiliary_loss_clip": 0.0112245, + "auxiliary_loss_mlp": 0.01035356, + "balance_loss_clip": 1.04418325, + "balance_loss_mlp": 1.02203465, + "epoch": 0.6245002254621975, + "flos": 24571266128640.0, + "grad_norm": 2.396607209734467, + "language_loss": 0.71816057, + "learning_rate": 1.3053777130814128e-06, + "loss": 0.73973858, + "num_input_tokens_seen": 223788895, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.13323975, + "step": 10387, + "time_per_iteration": 2.486563205718994 + }, + { + "auxiliary_loss_clip": 0.01126599, + "auxiliary_loss_mlp": 0.01040197, + "balance_loss_clip": 1.04425073, + "balance_loss_mlp": 1.02574921, + "epoch": 0.6245603487148654, + "flos": 29168837291520.0, + "grad_norm": 2.535058667458913, + "language_loss": 0.65771568, + "learning_rate": 1.3050125085812798e-06, + "loss": 0.67938364, + "num_input_tokens_seen": 223810385, + "router_z_loss_clip": 0.82275391, + "router_z_loss_mlp": 0.14440918, + "step": 10388, + "time_per_iteration": 2.5263400077819824 + }, + { + "auxiliary_loss_clip": 0.01119919, + "auxiliary_loss_mlp": 0.01027447, + "balance_loss_clip": 1.04646933, + "balance_loss_mlp": 1.01622391, + "epoch": 0.6246204719675335, + "flos": 14790097411200.0, + "grad_norm": 1.5973917567694607, + "language_loss": 0.79213893, + "learning_rate": 1.3046473304351417e-06, + "loss": 0.81361258, + "num_input_tokens_seen": 223826040, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11242676, + "step": 10389, + "time_per_iteration": 2.4438371658325195 + }, + { + "auxiliary_loss_clip": 0.01124034, + "auxiliary_loss_mlp": 0.01036535, + "balance_loss_clip": 1.05088484, + "balance_loss_mlp": 1.02383971, + "epoch": 0.6246805952202014, + "flos": 12493538472960.0, + "grad_norm": 1.6566602967498325, + "language_loss": 0.6044721, + "learning_rate": 1.3042821786568475e-06, + "loss": 0.62607777, + "num_input_tokens_seen": 223842300, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.12689209, + "step": 10390, + "time_per_iteration": 2.4090452194213867 + }, + { + "auxiliary_loss_clip": 0.01126747, + "auxiliary_loss_mlp": 0.01029419, + "balance_loss_clip": 1.04895568, + "balance_loss_mlp": 1.01662803, + "epoch": 0.6247407184728694, + "flos": 12786677366400.0, + "grad_norm": 4.565564289527095, + "language_loss": 0.77198678, + "learning_rate": 1.3039170532602416e-06, + "loss": 0.79354846, + "num_input_tokens_seen": 223858320, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.12792969, + "step": 10391, + "time_per_iteration": 2.4734888076782227 + }, + { + "auxiliary_loss_clip": 0.0112756, + "auxiliary_loss_mlp": 0.01028936, + "balance_loss_clip": 1.05291104, + "balance_loss_mlp": 1.01610923, + "epoch": 0.6248008417255374, + "flos": 40629188960640.0, + "grad_norm": 3.680252761281686, + "language_loss": 0.65051442, + "learning_rate": 1.3035519542591718e-06, + "loss": 0.67207944, + "num_input_tokens_seen": 223883545, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.12835693, + "step": 10392, + "time_per_iteration": 2.664034128189087 + }, + { + "auxiliary_loss_clip": 0.01129815, + "auxiliary_loss_mlp": 0.0103604, + "balance_loss_clip": 1.05347061, + "balance_loss_mlp": 1.02316546, + "epoch": 0.6248609649782053, + "flos": 19902017376000.0, + "grad_norm": 1.7953820089594523, + "language_loss": 0.76700926, + "learning_rate": 1.3031868816674819e-06, + "loss": 0.7886678, + "num_input_tokens_seen": 223901445, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.12878418, + "step": 10393, + "time_per_iteration": 2.4899723529815674 + }, + { + "auxiliary_loss_clip": 0.01120482, + "auxiliary_loss_mlp": 0.01037612, + "balance_loss_clip": 1.04517531, + "balance_loss_mlp": 1.02408791, + "epoch": 0.6249210882308733, + "flos": 19682746801920.0, + "grad_norm": 1.9975567900159878, + "language_loss": 0.8279326, + "learning_rate": 1.3028218354990142e-06, + "loss": 0.84951353, + "num_input_tokens_seen": 223920170, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.13513184, + "step": 10394, + "time_per_iteration": 2.4945313930511475 + }, + { + "auxiliary_loss_clip": 0.01121235, + "auxiliary_loss_mlp": 0.01037609, + "balance_loss_clip": 1.04466057, + "balance_loss_mlp": 1.02399552, + "epoch": 0.6249812114835412, + "flos": 13990726189440.0, + "grad_norm": 1.7897579991903134, + "language_loss": 0.74776483, + "learning_rate": 1.3024568157676128e-06, + "loss": 0.76935327, + "num_input_tokens_seen": 223936495, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.13623047, + "step": 10395, + "time_per_iteration": 2.594813108444214 + }, + { + "auxiliary_loss_clip": 0.01120173, + "auxiliary_loss_mlp": 0.01031731, + "balance_loss_clip": 1.04424453, + "balance_loss_mlp": 1.01950586, + "epoch": 0.6250413347362093, + "flos": 14530031965440.0, + "grad_norm": 2.8630146893077844, + "language_loss": 0.72778946, + "learning_rate": 1.302091822487119e-06, + "loss": 0.74930853, + "num_input_tokens_seen": 223950070, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.12231445, + "step": 10396, + "time_per_iteration": 3.9239864349365234 + }, + { + "auxiliary_loss_clip": 0.01129193, + "auxiliary_loss_mlp": 0.01036426, + "balance_loss_clip": 1.05555058, + "balance_loss_mlp": 1.02377748, + "epoch": 0.6251014579888772, + "flos": 22963006581120.0, + "grad_norm": 1.6631001375815513, + "language_loss": 0.75781178, + "learning_rate": 1.3017268556713732e-06, + "loss": 0.779468, + "num_input_tokens_seen": 223970065, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.12652588, + "step": 10397, + "time_per_iteration": 2.552238941192627 + }, + { + "auxiliary_loss_clip": 0.01118582, + "auxiliary_loss_mlp": 0.0103397, + "balance_loss_clip": 1.04383886, + "balance_loss_mlp": 1.02082682, + "epoch": 0.6251615812415452, + "flos": 28111232217600.0, + "grad_norm": 2.4998278536669982, + "language_loss": 0.75104845, + "learning_rate": 1.3013619153342154e-06, + "loss": 0.77257395, + "num_input_tokens_seen": 223990315, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.13146973, + "step": 10398, + "time_per_iteration": 2.486077070236206 + }, + { + "auxiliary_loss_clip": 0.01118801, + "auxiliary_loss_mlp": 0.0103001, + "balance_loss_clip": 1.0425458, + "balance_loss_mlp": 1.01634276, + "epoch": 0.6252217044942131, + "flos": 26724469887360.0, + "grad_norm": 1.967032899987965, + "language_loss": 0.73864651, + "learning_rate": 1.300997001489483e-06, + "loss": 0.76013464, + "num_input_tokens_seen": 224009960, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.13671875, + "step": 10399, + "time_per_iteration": 2.4887821674346924 + }, + { + "auxiliary_loss_clip": 0.01114401, + "auxiliary_loss_mlp": 0.01033765, + "balance_loss_clip": 1.04002285, + "balance_loss_mlp": 1.02125359, + "epoch": 0.6252818277468811, + "flos": 20006768413440.0, + "grad_norm": 1.8714865002267587, + "language_loss": 0.74752265, + "learning_rate": 1.3006321141510147e-06, + "loss": 0.76900423, + "num_input_tokens_seen": 224028870, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12518311, + "step": 10400, + "time_per_iteration": 2.4718260765075684 + }, + { + "auxiliary_loss_clip": 0.01050838, + "auxiliary_loss_mlp": 0.01003777, + "balance_loss_clip": 1.02513111, + "balance_loss_mlp": 1.00232232, + "epoch": 0.625341950999549, + "flos": 59278285059840.0, + "grad_norm": 0.8196270185096822, + "language_loss": 0.56475824, + "learning_rate": 1.3002672533326465e-06, + "loss": 0.58530432, + "num_input_tokens_seen": 224094140, + "router_z_loss_clip": 0.25732422, + "router_z_loss_mlp": 0.01454163, + "step": 10401, + "time_per_iteration": 3.2063169479370117 + }, + { + "auxiliary_loss_clip": 0.01116411, + "auxiliary_loss_mlp": 0.01032466, + "balance_loss_clip": 1.04023051, + "balance_loss_mlp": 1.01992488, + "epoch": 0.625402074252217, + "flos": 20157090831360.0, + "grad_norm": 2.1131733931459467, + "language_loss": 0.83410227, + "learning_rate": 1.2999024190482146e-06, + "loss": 0.85559106, + "num_input_tokens_seen": 224113235, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.12530518, + "step": 10402, + "time_per_iteration": 3.8902170658111572 + }, + { + "auxiliary_loss_clip": 0.01110145, + "auxiliary_loss_mlp": 0.0103221, + "balance_loss_clip": 1.03707695, + "balance_loss_mlp": 1.01987791, + "epoch": 0.625462197504885, + "flos": 29132531619840.0, + "grad_norm": 1.7851769123941483, + "language_loss": 0.6895299, + "learning_rate": 1.2995376113115527e-06, + "loss": 0.71095347, + "num_input_tokens_seen": 224134530, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.12347412, + "step": 10403, + "time_per_iteration": 2.522714853286743 + }, + { + "auxiliary_loss_clip": 0.0112122, + "auxiliary_loss_mlp": 0.01032569, + "balance_loss_clip": 1.04326296, + "balance_loss_mlp": 1.01836538, + "epoch": 0.625522320757553, + "flos": 26104436294400.0, + "grad_norm": 1.6233643504712711, + "language_loss": 0.72104061, + "learning_rate": 1.2991728301364954e-06, + "loss": 0.74257851, + "num_input_tokens_seen": 224154170, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.14208984, + "step": 10404, + "time_per_iteration": 2.47805118560791 + }, + { + "auxiliary_loss_clip": 0.01125553, + "auxiliary_loss_mlp": 0.01036964, + "balance_loss_clip": 1.05034208, + "balance_loss_mlp": 1.02428639, + "epoch": 0.625582444010221, + "flos": 20630967984000.0, + "grad_norm": 1.813350140661806, + "language_loss": 0.69396806, + "learning_rate": 1.2988080755368742e-06, + "loss": 0.71559322, + "num_input_tokens_seen": 224172730, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.12683105, + "step": 10405, + "time_per_iteration": 2.424072265625 + }, + { + "auxiliary_loss_clip": 0.01124117, + "auxiliary_loss_mlp": 0.01034253, + "balance_loss_clip": 1.04998839, + "balance_loss_mlp": 1.02183735, + "epoch": 0.6256425672628889, + "flos": 20521512264960.0, + "grad_norm": 1.5640905733207624, + "language_loss": 0.79014564, + "learning_rate": 1.2984433475265207e-06, + "loss": 0.81172931, + "num_input_tokens_seen": 224192620, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12426758, + "step": 10406, + "time_per_iteration": 2.5150599479675293 + }, + { + "auxiliary_loss_clip": 0.01118246, + "auxiliary_loss_mlp": 0.01033524, + "balance_loss_clip": 1.04334593, + "balance_loss_mlp": 1.02172256, + "epoch": 0.6257026905155569, + "flos": 29529200488320.0, + "grad_norm": 1.9540631782420037, + "language_loss": 0.68843913, + "learning_rate": 1.2980786461192666e-06, + "loss": 0.70995688, + "num_input_tokens_seen": 224214660, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.11810303, + "step": 10407, + "time_per_iteration": 3.9429023265838623 + }, + { + "auxiliary_loss_clip": 0.01127347, + "auxiliary_loss_mlp": 0.01028002, + "balance_loss_clip": 1.0539546, + "balance_loss_mlp": 1.01632559, + "epoch": 0.6257628137682248, + "flos": 24024885373440.0, + "grad_norm": 1.7117181619606325, + "language_loss": 0.85579443, + "learning_rate": 1.2977139713289398e-06, + "loss": 0.87734795, + "num_input_tokens_seen": 224234170, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11682129, + "step": 10408, + "time_per_iteration": 2.4651377201080322 + }, + { + "auxiliary_loss_clip": 0.01118367, + "auxiliary_loss_mlp": 0.01031207, + "balance_loss_clip": 1.04432738, + "balance_loss_mlp": 1.01959622, + "epoch": 0.6258229370208929, + "flos": 20850956830080.0, + "grad_norm": 1.779404253005566, + "language_loss": 0.79665577, + "learning_rate": 1.2973493231693699e-06, + "loss": 0.81815141, + "num_input_tokens_seen": 224253115, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11608887, + "step": 10409, + "time_per_iteration": 2.4674699306488037 + }, + { + "auxiliary_loss_clip": 0.01120427, + "auxiliary_loss_mlp": 0.01032519, + "balance_loss_clip": 1.0475049, + "balance_loss_mlp": 1.02000785, + "epoch": 0.6258830602735608, + "flos": 22231542021120.0, + "grad_norm": 2.4111031684773074, + "language_loss": 0.69256872, + "learning_rate": 1.2969847016543845e-06, + "loss": 0.71409822, + "num_input_tokens_seen": 224271375, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.12524414, + "step": 10410, + "time_per_iteration": 2.4688620567321777 + }, + { + "auxiliary_loss_clip": 0.01118889, + "auxiliary_loss_mlp": 0.01026969, + "balance_loss_clip": 1.04624701, + "balance_loss_mlp": 1.01539969, + "epoch": 0.6259431835262288, + "flos": 25076887925760.0, + "grad_norm": 1.6794956953726559, + "language_loss": 0.67689049, + "learning_rate": 1.2966201067978086e-06, + "loss": 0.69834912, + "num_input_tokens_seen": 224290315, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11566162, + "step": 10411, + "time_per_iteration": 2.4941256046295166 + }, + { + "auxiliary_loss_clip": 0.01116113, + "auxiliary_loss_mlp": 0.01035292, + "balance_loss_clip": 1.04229558, + "balance_loss_mlp": 1.02332318, + "epoch": 0.6260033067788967, + "flos": 28252288926720.0, + "grad_norm": 1.6326171946442114, + "language_loss": 0.69576681, + "learning_rate": 1.2962555386134702e-06, + "loss": 0.71728092, + "num_input_tokens_seen": 224310545, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.11962891, + "step": 10412, + "time_per_iteration": 2.546619415283203 + }, + { + "auxiliary_loss_clip": 0.01116142, + "auxiliary_loss_mlp": 0.01036736, + "balance_loss_clip": 1.04242539, + "balance_loss_mlp": 1.02445161, + "epoch": 0.6260634300315647, + "flos": 23367432787200.0, + "grad_norm": 1.6196451704923311, + "language_loss": 0.69419944, + "learning_rate": 1.2958909971151908e-06, + "loss": 0.71572828, + "num_input_tokens_seen": 224331115, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.1229248, + "step": 10413, + "time_per_iteration": 2.4893946647644043 + }, + { + "auxiliary_loss_clip": 0.01130906, + "auxiliary_loss_mlp": 0.01035454, + "balance_loss_clip": 1.04789507, + "balance_loss_mlp": 1.02105403, + "epoch": 0.6261235532842326, + "flos": 18035308494720.0, + "grad_norm": 2.958021713660949, + "language_loss": 0.81258643, + "learning_rate": 1.295526482316796e-06, + "loss": 0.83425003, + "num_input_tokens_seen": 224347525, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.14398193, + "step": 10414, + "time_per_iteration": 2.4080569744110107 + }, + { + "auxiliary_loss_clip": 0.01122168, + "auxiliary_loss_mlp": 0.01034701, + "balance_loss_clip": 1.04800916, + "balance_loss_mlp": 1.02229702, + "epoch": 0.6261836765369007, + "flos": 22011265866240.0, + "grad_norm": 2.6681943428396884, + "language_loss": 0.74370313, + "learning_rate": 1.2951619942321083e-06, + "loss": 0.7652719, + "num_input_tokens_seen": 224367045, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12408447, + "step": 10415, + "time_per_iteration": 2.46370530128479 + }, + { + "auxiliary_loss_clip": 0.01115085, + "auxiliary_loss_mlp": 0.01030189, + "balance_loss_clip": 1.0419358, + "balance_loss_mlp": 1.01828575, + "epoch": 0.6262437997895686, + "flos": 24936010784640.0, + "grad_norm": 1.6701086057037635, + "language_loss": 0.74202251, + "learning_rate": 1.2947975328749472e-06, + "loss": 0.76347518, + "num_input_tokens_seen": 224388860, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11895752, + "step": 10416, + "time_per_iteration": 2.477071762084961 + }, + { + "auxiliary_loss_clip": 0.01116818, + "auxiliary_loss_mlp": 0.01031011, + "balance_loss_clip": 1.0446229, + "balance_loss_mlp": 1.01913202, + "epoch": 0.6263039230422366, + "flos": 31608428186880.0, + "grad_norm": 1.7662105331248228, + "language_loss": 0.84535617, + "learning_rate": 1.2944330982591352e-06, + "loss": 0.8668344, + "num_input_tokens_seen": 224409645, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11883545, + "step": 10417, + "time_per_iteration": 2.536721706390381 + }, + { + "auxiliary_loss_clip": 0.01122387, + "auxiliary_loss_mlp": 0.01034729, + "balance_loss_clip": 1.04560554, + "balance_loss_mlp": 1.02191377, + "epoch": 0.6263640462949046, + "flos": 17639465639040.0, + "grad_norm": 2.0548640407750187, + "language_loss": 0.56791639, + "learning_rate": 1.2940686903984904e-06, + "loss": 0.58948749, + "num_input_tokens_seen": 224428530, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.12835693, + "step": 10418, + "time_per_iteration": 2.4394009113311768 + }, + { + "auxiliary_loss_clip": 0.01130465, + "auxiliary_loss_mlp": 0.01040528, + "balance_loss_clip": 1.04812765, + "balance_loss_mlp": 1.0266223, + "epoch": 0.6264241695475725, + "flos": 19974951941760.0, + "grad_norm": 2.1725503007250557, + "language_loss": 0.84603047, + "learning_rate": 1.2937043093068316e-06, + "loss": 0.86774051, + "num_input_tokens_seen": 224447175, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.13891602, + "step": 10419, + "time_per_iteration": 2.536351203918457 + }, + { + "auxiliary_loss_clip": 0.01122499, + "auxiliary_loss_mlp": 0.0103169, + "balance_loss_clip": 1.04803705, + "balance_loss_mlp": 1.01946461, + "epoch": 0.6264842928002405, + "flos": 27344323912320.0, + "grad_norm": 2.8781771303357817, + "language_loss": 0.64735043, + "learning_rate": 1.2933399549979762e-06, + "loss": 0.66889238, + "num_input_tokens_seen": 224469445, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.12225342, + "step": 10420, + "time_per_iteration": 2.565509557723999 + }, + { + "auxiliary_loss_clip": 0.01122061, + "auxiliary_loss_mlp": 0.01030671, + "balance_loss_clip": 1.04337573, + "balance_loss_mlp": 1.01731968, + "epoch": 0.6265444160529084, + "flos": 22997265177600.0, + "grad_norm": 4.96070080237312, + "language_loss": 0.86271036, + "learning_rate": 1.292975627485741e-06, + "loss": 0.88423765, + "num_input_tokens_seen": 224486590, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.13366699, + "step": 10421, + "time_per_iteration": 2.452165126800537 + }, + { + "auxiliary_loss_clip": 0.01115423, + "auxiliary_loss_mlp": 0.01027138, + "balance_loss_clip": 1.04172552, + "balance_loss_mlp": 1.01529467, + "epoch": 0.6266045393055765, + "flos": 19938323047680.0, + "grad_norm": 3.014532320281962, + "language_loss": 0.79521805, + "learning_rate": 1.2926113267839403e-06, + "loss": 0.81664366, + "num_input_tokens_seen": 224502795, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11846924, + "step": 10422, + "time_per_iteration": 2.44134259223938 + }, + { + "auxiliary_loss_clip": 0.01123696, + "auxiliary_loss_mlp": 0.01026523, + "balance_loss_clip": 1.04853749, + "balance_loss_mlp": 1.01358342, + "epoch": 0.6266646625582444, + "flos": 24389091325440.0, + "grad_norm": 1.6518935084675688, + "language_loss": 0.74248576, + "learning_rate": 1.292247052906389e-06, + "loss": 0.7639879, + "num_input_tokens_seen": 224522300, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.1295166, + "step": 10423, + "time_per_iteration": 2.462514638900757 + }, + { + "auxiliary_loss_clip": 0.01113892, + "auxiliary_loss_mlp": 0.01025597, + "balance_loss_clip": 1.0410955, + "balance_loss_mlp": 1.01358116, + "epoch": 0.6267247858109124, + "flos": 14683802088960.0, + "grad_norm": 2.3463382671923245, + "language_loss": 0.77815938, + "learning_rate": 1.2918828058669004e-06, + "loss": 0.79955423, + "num_input_tokens_seen": 224538260, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.12017822, + "step": 10424, + "time_per_iteration": 2.4271254539489746 + }, + { + "auxiliary_loss_clip": 0.01121818, + "auxiliary_loss_mlp": 0.01030772, + "balance_loss_clip": 1.0463742, + "balance_loss_mlp": 1.01748598, + "epoch": 0.6267849090635803, + "flos": 24929977299840.0, + "grad_norm": 2.0386153150247526, + "language_loss": 0.69479084, + "learning_rate": 1.2915185856792868e-06, + "loss": 0.71631676, + "num_input_tokens_seen": 224559155, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.1328125, + "step": 10425, + "time_per_iteration": 2.466987371444702 + }, + { + "auxiliary_loss_clip": 0.01113513, + "auxiliary_loss_mlp": 0.01030434, + "balance_loss_clip": 1.04315341, + "balance_loss_mlp": 1.0192225, + "epoch": 0.6268450323162483, + "flos": 25337851211520.0, + "grad_norm": 1.59363652131465, + "language_loss": 0.74248672, + "learning_rate": 1.2911543923573598e-06, + "loss": 0.76392621, + "num_input_tokens_seen": 224578660, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.11212158, + "step": 10426, + "time_per_iteration": 2.4946117401123047 + }, + { + "auxiliary_loss_clip": 0.0112404, + "auxiliary_loss_mlp": 0.01032931, + "balance_loss_clip": 1.0496527, + "balance_loss_mlp": 1.01991379, + "epoch": 0.6269051555689162, + "flos": 26177299032960.0, + "grad_norm": 1.4456950431545377, + "language_loss": 0.80423987, + "learning_rate": 1.290790225914929e-06, + "loss": 0.82580954, + "num_input_tokens_seen": 224599080, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.13031006, + "step": 10427, + "time_per_iteration": 2.476140022277832 + }, + { + "auxiliary_loss_clip": 0.01121014, + "auxiliary_loss_mlp": 0.0103585, + "balance_loss_clip": 1.04495955, + "balance_loss_mlp": 1.02357793, + "epoch": 0.6269652788215843, + "flos": 18256877539200.0, + "grad_norm": 2.0549813403299413, + "language_loss": 0.68686533, + "learning_rate": 1.2904260863658034e-06, + "loss": 0.70843399, + "num_input_tokens_seen": 224614225, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.12255859, + "step": 10428, + "time_per_iteration": 3.805293560028076 + }, + { + "auxiliary_loss_clip": 0.01118465, + "auxiliary_loss_mlp": 0.01039415, + "balance_loss_clip": 1.04511631, + "balance_loss_mlp": 1.02612925, + "epoch": 0.6270254020742522, + "flos": 11765413877760.0, + "grad_norm": 1.7485898319884365, + "language_loss": 0.71550477, + "learning_rate": 1.2900619737237928e-06, + "loss": 0.73708355, + "num_input_tokens_seen": 224632365, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.13299561, + "step": 10429, + "time_per_iteration": 2.4300007820129395 + }, + { + "auxiliary_loss_clip": 0.01120071, + "auxiliary_loss_mlp": 0.010352, + "balance_loss_clip": 1.04473877, + "balance_loss_mlp": 1.02068055, + "epoch": 0.6270855253269202, + "flos": 23475631530240.0, + "grad_norm": 1.8805354580059483, + "language_loss": 0.79878557, + "learning_rate": 1.2896978880027023e-06, + "loss": 0.82033825, + "num_input_tokens_seen": 224651125, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.14526367, + "step": 10430, + "time_per_iteration": 2.50942063331604 + }, + { + "auxiliary_loss_clip": 0.01052491, + "auxiliary_loss_mlp": 0.0100308, + "balance_loss_clip": 1.02686739, + "balance_loss_mlp": 1.00184667, + "epoch": 0.6271456485795882, + "flos": 70064520232320.0, + "grad_norm": 0.7862730594809009, + "language_loss": 0.59157825, + "learning_rate": 1.2893338292163393e-06, + "loss": 0.61213386, + "num_input_tokens_seen": 224716115, + "router_z_loss_clip": 0.25683594, + "router_z_loss_mlp": 0.01234436, + "step": 10431, + "time_per_iteration": 3.1711676120758057 + }, + { + "auxiliary_loss_clip": 0.0108167, + "auxiliary_loss_mlp": 0.01006908, + "balance_loss_clip": 1.05664492, + "balance_loss_mlp": 1.00499225, + "epoch": 0.6272057718322561, + "flos": 65156718280320.0, + "grad_norm": 0.8729750178453481, + "language_loss": 0.6374985, + "learning_rate": 1.2889697973785095e-06, + "loss": 0.65838432, + "num_input_tokens_seen": 224782930, + "router_z_loss_clip": 0.25048828, + "router_z_loss_mlp": 0.01916504, + "step": 10432, + "time_per_iteration": 3.1913678646087646 + }, + { + "auxiliary_loss_clip": 0.01117253, + "auxiliary_loss_mlp": 0.01033761, + "balance_loss_clip": 1.04287815, + "balance_loss_mlp": 1.02214956, + "epoch": 0.6272658950849241, + "flos": 24389342720640.0, + "grad_norm": 1.9923535947730295, + "language_loss": 0.64921349, + "learning_rate": 1.2886057925030153e-06, + "loss": 0.67072368, + "num_input_tokens_seen": 224802010, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.11608887, + "step": 10433, + "time_per_iteration": 2.503617763519287 + }, + { + "auxiliary_loss_clip": 0.01125264, + "auxiliary_loss_mlp": 0.01033695, + "balance_loss_clip": 1.04628348, + "balance_loss_mlp": 1.02087998, + "epoch": 0.627326018337592, + "flos": 17966001202560.0, + "grad_norm": 2.1772607911950126, + "language_loss": 0.61828226, + "learning_rate": 1.2882418146036612e-06, + "loss": 0.63987184, + "num_input_tokens_seen": 224818875, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.12817383, + "step": 10434, + "time_per_iteration": 2.4678306579589844 + }, + { + "auxiliary_loss_clip": 0.01119005, + "auxiliary_loss_mlp": 0.01028518, + "balance_loss_clip": 1.04420781, + "balance_loss_mlp": 1.01655519, + "epoch": 0.6273861415902601, + "flos": 20230097224320.0, + "grad_norm": 1.9110369420363063, + "language_loss": 0.8463577, + "learning_rate": 1.2878778636942484e-06, + "loss": 0.8678329, + "num_input_tokens_seen": 224837790, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11962891, + "step": 10435, + "time_per_iteration": 2.490083694458008 + }, + { + "auxiliary_loss_clip": 0.01059094, + "auxiliary_loss_mlp": 0.01002592, + "balance_loss_clip": 1.03342009, + "balance_loss_mlp": 1.00141203, + "epoch": 0.627446264842928, + "flos": 64953210798720.0, + "grad_norm": 0.8427812989295633, + "language_loss": 0.61500442, + "learning_rate": 1.2875139397885786e-06, + "loss": 0.63562131, + "num_input_tokens_seen": 224899685, + "router_z_loss_clip": 0.25634766, + "router_z_loss_mlp": 0.0118103, + "step": 10436, + "time_per_iteration": 3.0724289417266846 + }, + { + "auxiliary_loss_clip": 0.01123547, + "auxiliary_loss_mlp": 0.01033796, + "balance_loss_clip": 1.04967189, + "balance_loss_mlp": 1.02101672, + "epoch": 0.627506388095596, + "flos": 23584261236480.0, + "grad_norm": 1.6120153985678327, + "language_loss": 0.7731868, + "learning_rate": 1.2871500429004523e-06, + "loss": 0.79476023, + "num_input_tokens_seen": 224918650, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.12774658, + "step": 10437, + "time_per_iteration": 2.4895451068878174 + }, + { + "auxiliary_loss_clip": 0.01055136, + "auxiliary_loss_mlp": 0.01004955, + "balance_loss_clip": 1.0296042, + "balance_loss_mlp": 1.00383425, + "epoch": 0.6275665113482639, + "flos": 67583631674880.0, + "grad_norm": 0.7210144280329296, + "language_loss": 0.54233694, + "learning_rate": 1.2867861730436667e-06, + "loss": 0.56293786, + "num_input_tokens_seen": 224981575, + "router_z_loss_clip": 0.25634766, + "router_z_loss_mlp": 0.01119995, + "step": 10438, + "time_per_iteration": 3.009016752243042 + }, + { + "auxiliary_loss_clip": 0.01116436, + "auxiliary_loss_mlp": 0.01040692, + "balance_loss_clip": 1.04290307, + "balance_loss_mlp": 1.02860999, + "epoch": 0.6276266346009319, + "flos": 27636924101760.0, + "grad_norm": 2.750782441458497, + "language_loss": 0.84140706, + "learning_rate": 1.2864223302320214e-06, + "loss": 0.86297834, + "num_input_tokens_seen": 225000820, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.12091064, + "step": 10439, + "time_per_iteration": 2.507948160171509 + }, + { + "auxiliary_loss_clip": 0.01115025, + "auxiliary_loss_mlp": 0.0104301, + "balance_loss_clip": 1.03929973, + "balance_loss_mlp": 1.03002834, + "epoch": 0.6276867578535998, + "flos": 22746142218240.0, + "grad_norm": 2.232524352494123, + "language_loss": 0.80458212, + "learning_rate": 1.2860585144793128e-06, + "loss": 0.82616252, + "num_input_tokens_seen": 225017585, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.12982178, + "step": 10440, + "time_per_iteration": 4.022459506988525 + }, + { + "auxiliary_loss_clip": 0.01111785, + "auxiliary_loss_mlp": 0.01029839, + "balance_loss_clip": 1.04023826, + "balance_loss_mlp": 1.01877069, + "epoch": 0.6277468811062679, + "flos": 24644200694400.0, + "grad_norm": 1.4993969935980886, + "language_loss": 0.74622858, + "learning_rate": 1.285694725799337e-06, + "loss": 0.76764482, + "num_input_tokens_seen": 225039085, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.1105957, + "step": 10441, + "time_per_iteration": 2.4881646633148193 + }, + { + "auxiliary_loss_clip": 0.01121539, + "auxiliary_loss_mlp": 0.0102949, + "balance_loss_clip": 1.04715884, + "balance_loss_mlp": 1.01713967, + "epoch": 0.6278070043589358, + "flos": 19678975873920.0, + "grad_norm": 1.7560786530508208, + "language_loss": 0.72432369, + "learning_rate": 1.2853309642058884e-06, + "loss": 0.74583399, + "num_input_tokens_seen": 225058105, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12347412, + "step": 10442, + "time_per_iteration": 2.4773876667022705 + }, + { + "auxiliary_loss_clip": 0.01121846, + "auxiliary_loss_mlp": 0.01032685, + "balance_loss_clip": 1.04785705, + "balance_loss_mlp": 1.01968503, + "epoch": 0.6278671276116038, + "flos": 22121834906880.0, + "grad_norm": 1.501246119712713, + "language_loss": 0.71523249, + "learning_rate": 1.284967229712762e-06, + "loss": 0.73677778, + "num_input_tokens_seen": 225077605, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.13000488, + "step": 10443, + "time_per_iteration": 2.5088863372802734 + }, + { + "auxiliary_loss_clip": 0.01114319, + "auxiliary_loss_mlp": 0.01031922, + "balance_loss_clip": 1.04031503, + "balance_loss_mlp": 1.01950049, + "epoch": 0.6279272508642717, + "flos": 23038562839680.0, + "grad_norm": 3.022143150737396, + "language_loss": 0.73591369, + "learning_rate": 1.2846035223337492e-06, + "loss": 0.75737607, + "num_input_tokens_seen": 225097775, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.12438965, + "step": 10444, + "time_per_iteration": 2.5314173698425293 + }, + { + "auxiliary_loss_clip": 0.01118607, + "auxiliary_loss_mlp": 0.01032634, + "balance_loss_clip": 1.04451275, + "balance_loss_mlp": 1.02037334, + "epoch": 0.6279873741169397, + "flos": 19824090819840.0, + "grad_norm": 1.8533985355143514, + "language_loss": 0.72154582, + "learning_rate": 1.2842398420826423e-06, + "loss": 0.7430582, + "num_input_tokens_seen": 225115585, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.12261963, + "step": 10445, + "time_per_iteration": 2.439094305038452 + }, + { + "auxiliary_loss_clip": 0.0111847, + "auxiliary_loss_mlp": 0.01030741, + "balance_loss_clip": 1.0449543, + "balance_loss_mlp": 1.01886833, + "epoch": 0.6280474973696077, + "flos": 23915393740800.0, + "grad_norm": 1.512510369728302, + "language_loss": 0.69107914, + "learning_rate": 1.2838761889732331e-06, + "loss": 0.71257126, + "num_input_tokens_seen": 225135575, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11871338, + "step": 10446, + "time_per_iteration": 3.8554718494415283 + }, + { + "auxiliary_loss_clip": 0.01124428, + "auxiliary_loss_mlp": 0.01034215, + "balance_loss_clip": 1.04422724, + "balance_loss_mlp": 1.02050567, + "epoch": 0.6281076206222757, + "flos": 17967976450560.0, + "grad_norm": 2.406602408091439, + "language_loss": 0.73320341, + "learning_rate": 1.2835125630193102e-06, + "loss": 0.75478989, + "num_input_tokens_seen": 225154230, + "router_z_loss_clip": 0.80273438, + "router_z_loss_mlp": 0.13720703, + "step": 10447, + "time_per_iteration": 2.4118542671203613 + }, + { + "auxiliary_loss_clip": 0.01052729, + "auxiliary_loss_mlp": 0.01009008, + "balance_loss_clip": 1.02629471, + "balance_loss_mlp": 1.00778592, + "epoch": 0.6281677438749437, + "flos": 66778370622720.0, + "grad_norm": 0.6783003751947972, + "language_loss": 0.52329081, + "learning_rate": 1.2831489642346626e-06, + "loss": 0.54390824, + "num_input_tokens_seen": 225213650, + "router_z_loss_clip": 0.26464844, + "router_z_loss_mlp": 0.01220703, + "step": 10448, + "time_per_iteration": 2.928380012512207 + }, + { + "auxiliary_loss_clip": 0.01126652, + "auxiliary_loss_mlp": 0.01046955, + "balance_loss_clip": 1.04897547, + "balance_loss_mlp": 1.03377032, + "epoch": 0.6282278671276116, + "flos": 11656173640320.0, + "grad_norm": 2.435659612503679, + "language_loss": 0.91027957, + "learning_rate": 1.282785392633079e-06, + "loss": 0.93201566, + "num_input_tokens_seen": 225230135, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.1317749, + "step": 10449, + "time_per_iteration": 2.429413080215454 + }, + { + "auxiliary_loss_clip": 0.01114069, + "auxiliary_loss_mlp": 0.01029456, + "balance_loss_clip": 1.04012299, + "balance_loss_mlp": 1.01824427, + "epoch": 0.6282879903802796, + "flos": 42741597847680.0, + "grad_norm": 2.570675574857162, + "language_loss": 0.60342157, + "learning_rate": 1.2824218482283438e-06, + "loss": 0.62485677, + "num_input_tokens_seen": 225253520, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.11206055, + "step": 10450, + "time_per_iteration": 2.6790614128112793 + }, + { + "auxiliary_loss_clip": 0.01117498, + "auxiliary_loss_mlp": 0.01033339, + "balance_loss_clip": 1.0443356, + "balance_loss_mlp": 1.02141821, + "epoch": 0.6283481136329475, + "flos": 20009210538240.0, + "grad_norm": 1.744498428891183, + "language_loss": 0.76903188, + "learning_rate": 1.2820583310342452e-06, + "loss": 0.79054022, + "num_input_tokens_seen": 225272460, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11920166, + "step": 10451, + "time_per_iteration": 3.9094581604003906 + }, + { + "auxiliary_loss_clip": 0.01113646, + "auxiliary_loss_mlp": 0.01032461, + "balance_loss_clip": 1.03855896, + "balance_loss_mlp": 1.0200336, + "epoch": 0.6284082368856155, + "flos": 21904431840000.0, + "grad_norm": 1.644605791494584, + "language_loss": 0.77091229, + "learning_rate": 1.281694841064566e-06, + "loss": 0.7923733, + "num_input_tokens_seen": 225291700, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.12420654, + "step": 10452, + "time_per_iteration": 2.5123138427734375 + }, + { + "auxiliary_loss_clip": 0.01114588, + "auxiliary_loss_mlp": 0.01032896, + "balance_loss_clip": 1.04218733, + "balance_loss_mlp": 1.02024817, + "epoch": 0.6284683601382834, + "flos": 25484187219840.0, + "grad_norm": 2.0043502928426067, + "language_loss": 0.72542441, + "learning_rate": 1.2813313783330904e-06, + "loss": 0.74689925, + "num_input_tokens_seen": 225311470, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.12646484, + "step": 10453, + "time_per_iteration": 2.497443914413452 + }, + { + "auxiliary_loss_clip": 0.01114828, + "auxiliary_loss_mlp": 0.0102772, + "balance_loss_clip": 1.0391407, + "balance_loss_mlp": 1.0147922, + "epoch": 0.6285284833909515, + "flos": 16538695395840.0, + "grad_norm": 1.7648047953918382, + "language_loss": 0.80535102, + "learning_rate": 1.2809679428536013e-06, + "loss": 0.8267765, + "num_input_tokens_seen": 225328385, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.1293335, + "step": 10454, + "time_per_iteration": 2.447169542312622 + }, + { + "auxiliary_loss_clip": 0.01114201, + "auxiliary_loss_mlp": 0.0103389, + "balance_loss_clip": 1.04180598, + "balance_loss_mlp": 1.02230847, + "epoch": 0.6285886066436194, + "flos": 22820692896000.0, + "grad_norm": 2.0308111598024476, + "language_loss": 0.82187718, + "learning_rate": 1.2806045346398792e-06, + "loss": 0.84335804, + "num_input_tokens_seen": 225348415, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11584473, + "step": 10455, + "time_per_iteration": 2.47011661529541 + }, + { + "auxiliary_loss_clip": 0.01123562, + "auxiliary_loss_mlp": 0.01028089, + "balance_loss_clip": 1.04772508, + "balance_loss_mlp": 1.01630557, + "epoch": 0.6286487298962874, + "flos": 24715734629760.0, + "grad_norm": 1.539406884224133, + "language_loss": 0.81794775, + "learning_rate": 1.280241153705706e-06, + "loss": 0.83946425, + "num_input_tokens_seen": 225367740, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.11798096, + "step": 10456, + "time_per_iteration": 2.481003761291504 + }, + { + "auxiliary_loss_clip": 0.01129541, + "auxiliary_loss_mlp": 0.01031275, + "balance_loss_clip": 1.05345249, + "balance_loss_mlp": 1.01839995, + "epoch": 0.6287088531489553, + "flos": 20740818752640.0, + "grad_norm": 1.5418345315919884, + "language_loss": 0.72103488, + "learning_rate": 1.27987780006486e-06, + "loss": 0.74264306, + "num_input_tokens_seen": 225388405, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.12878418, + "step": 10457, + "time_per_iteration": 2.4726486206054688 + }, + { + "auxiliary_loss_clip": 0.01122926, + "auxiliary_loss_mlp": 0.01031119, + "balance_loss_clip": 1.04369402, + "balance_loss_mlp": 1.0182023, + "epoch": 0.6287689764016233, + "flos": 23070630706560.0, + "grad_norm": 1.7732307052658418, + "language_loss": 0.8001501, + "learning_rate": 1.2795144737311202e-06, + "loss": 0.82169056, + "num_input_tokens_seen": 225408360, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.12908936, + "step": 10458, + "time_per_iteration": 2.4853267669677734 + }, + { + "auxiliary_loss_clip": 0.01120097, + "auxiliary_loss_mlp": 0.01028687, + "balance_loss_clip": 1.04370689, + "balance_loss_mlp": 1.01674247, + "epoch": 0.6288290996542913, + "flos": 32233669251840.0, + "grad_norm": 1.4609068254374233, + "language_loss": 0.61186123, + "learning_rate": 1.2791511747182635e-06, + "loss": 0.63334906, + "num_input_tokens_seen": 225431310, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.11938477, + "step": 10459, + "time_per_iteration": 2.601616144180298 + }, + { + "auxiliary_loss_clip": 0.01118446, + "auxiliary_loss_mlp": 0.01029431, + "balance_loss_clip": 1.0445745, + "balance_loss_mlp": 1.01805854, + "epoch": 0.6288892229069593, + "flos": 24641327606400.0, + "grad_norm": 1.6579329668573022, + "language_loss": 0.78891587, + "learning_rate": 1.2787879030400666e-06, + "loss": 0.81039464, + "num_input_tokens_seen": 225450385, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.11376953, + "step": 10460, + "time_per_iteration": 2.4904634952545166 + }, + { + "auxiliary_loss_clip": 0.01112373, + "auxiliary_loss_mlp": 0.01029965, + "balance_loss_clip": 1.0403626, + "balance_loss_mlp": 1.01677406, + "epoch": 0.6289493461596273, + "flos": 17858341163520.0, + "grad_norm": 1.7679560960220566, + "language_loss": 0.74240506, + "learning_rate": 1.2784246587103047e-06, + "loss": 0.76382846, + "num_input_tokens_seen": 225467325, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.13189697, + "step": 10461, + "time_per_iteration": 2.4517812728881836 + }, + { + "auxiliary_loss_clip": 0.01110851, + "auxiliary_loss_mlp": 0.01036397, + "balance_loss_clip": 1.03943086, + "balance_loss_mlp": 1.0233016, + "epoch": 0.6290094694122952, + "flos": 22345379199360.0, + "grad_norm": 2.0218220114698573, + "language_loss": 0.70710522, + "learning_rate": 1.2780614417427523e-06, + "loss": 0.72857773, + "num_input_tokens_seen": 225487370, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.13079834, + "step": 10462, + "time_per_iteration": 2.5408129692077637 + }, + { + "auxiliary_loss_clip": 0.011101, + "auxiliary_loss_mlp": 0.01029735, + "balance_loss_clip": 1.04197454, + "balance_loss_mlp": 1.01801682, + "epoch": 0.6290695926649632, + "flos": 28402431776640.0, + "grad_norm": 2.020560876638973, + "language_loss": 0.71991277, + "learning_rate": 1.2776982521511821e-06, + "loss": 0.74131119, + "num_input_tokens_seen": 225506915, + "router_z_loss_clip": 0.68017578, + "router_z_loss_mlp": 0.1171875, + "step": 10463, + "time_per_iteration": 2.5212624073028564 + }, + { + "auxiliary_loss_clip": 0.01116589, + "auxiliary_loss_mlp": 0.01038174, + "balance_loss_clip": 1.0455091, + "balance_loss_mlp": 1.02490568, + "epoch": 0.6291297159176311, + "flos": 21505464501120.0, + "grad_norm": 1.8132267649480043, + "language_loss": 0.72887719, + "learning_rate": 1.2773350899493665e-06, + "loss": 0.75042474, + "num_input_tokens_seen": 225525670, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.13275146, + "step": 10464, + "time_per_iteration": 2.489426374435425 + }, + { + "auxiliary_loss_clip": 0.01123018, + "auxiliary_loss_mlp": 0.01027522, + "balance_loss_clip": 1.05090463, + "balance_loss_mlp": 1.0159049, + "epoch": 0.6291898391702991, + "flos": 12203308581120.0, + "grad_norm": 1.6743767286415436, + "language_loss": 0.69239092, + "learning_rate": 1.2769719551510768e-06, + "loss": 0.71389627, + "num_input_tokens_seen": 225542235, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.1161499, + "step": 10465, + "time_per_iteration": 2.439614772796631 + }, + { + "auxiliary_loss_clip": 0.01069409, + "auxiliary_loss_mlp": 0.01003122, + "balance_loss_clip": 1.04348373, + "balance_loss_mlp": 1.00164723, + "epoch": 0.629249962422967, + "flos": 69299479434240.0, + "grad_norm": 0.682947146362286, + "language_loss": 0.59826666, + "learning_rate": 1.2766088477700832e-06, + "loss": 0.61899197, + "num_input_tokens_seen": 225607185, + "router_z_loss_clip": 0.26025391, + "router_z_loss_mlp": 0.01473999, + "step": 10466, + "time_per_iteration": 3.203695058822632 + }, + { + "auxiliary_loss_clip": 0.0111218, + "auxiliary_loss_mlp": 0.01032266, + "balance_loss_clip": 1.03833342, + "balance_loss_mlp": 1.02080989, + "epoch": 0.6293100856756351, + "flos": 40077888042240.0, + "grad_norm": 2.470948293798429, + "language_loss": 0.65027338, + "learning_rate": 1.276245767820154e-06, + "loss": 0.67171776, + "num_input_tokens_seen": 225628785, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11462402, + "step": 10467, + "time_per_iteration": 2.5681705474853516 + }, + { + "auxiliary_loss_clip": 0.01061131, + "auxiliary_loss_mlp": 0.01004924, + "balance_loss_clip": 1.03382516, + "balance_loss_mlp": 1.00359929, + "epoch": 0.629370208928303, + "flos": 67501108177920.0, + "grad_norm": 0.795154191018452, + "language_loss": 0.5684979, + "learning_rate": 1.2758827153150586e-06, + "loss": 0.58915848, + "num_input_tokens_seen": 225678980, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.01324463, + "step": 10468, + "time_per_iteration": 2.8382766246795654 + }, + { + "auxiliary_loss_clip": 0.01054048, + "auxiliary_loss_mlp": 0.01005734, + "balance_loss_clip": 1.02824903, + "balance_loss_mlp": 1.00444651, + "epoch": 0.629430332180971, + "flos": 60660450449280.0, + "grad_norm": 0.7492197642527704, + "language_loss": 0.57978982, + "learning_rate": 1.2755196902685626e-06, + "loss": 0.60038757, + "num_input_tokens_seen": 225740295, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.01286316, + "step": 10469, + "time_per_iteration": 3.009033203125 + }, + { + "auxiliary_loss_clip": 0.01053775, + "auxiliary_loss_mlp": 0.0100508, + "balance_loss_clip": 1.02779913, + "balance_loss_mlp": 1.00380754, + "epoch": 0.6294904554336389, + "flos": 66869764778880.0, + "grad_norm": 0.6823655449480774, + "language_loss": 0.52096748, + "learning_rate": 1.2751566926944329e-06, + "loss": 0.541556, + "num_input_tokens_seen": 225805615, + "router_z_loss_clip": 0.26025391, + "router_z_loss_mlp": 0.01272583, + "step": 10470, + "time_per_iteration": 3.1041345596313477 + }, + { + "auxiliary_loss_clip": 0.01117868, + "auxiliary_loss_mlp": 0.01031238, + "balance_loss_clip": 1.04406905, + "balance_loss_mlp": 1.01927543, + "epoch": 0.6295505786863069, + "flos": 42522794150400.0, + "grad_norm": 1.8285750446198281, + "language_loss": 0.7450397, + "learning_rate": 1.2747937226064342e-06, + "loss": 0.76653075, + "num_input_tokens_seen": 225826585, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11956787, + "step": 10471, + "time_per_iteration": 2.6562070846557617 + }, + { + "auxiliary_loss_clip": 0.01119635, + "auxiliary_loss_mlp": 0.01029902, + "balance_loss_clip": 1.04508996, + "balance_loss_mlp": 1.01843452, + "epoch": 0.629610701938975, + "flos": 17384140788480.0, + "grad_norm": 1.8314282070513426, + "language_loss": 0.62978351, + "learning_rate": 1.2744307800183297e-06, + "loss": 0.65127897, + "num_input_tokens_seen": 225844095, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11468506, + "step": 10472, + "time_per_iteration": 2.4710605144500732 + }, + { + "auxiliary_loss_clip": 0.01127071, + "auxiliary_loss_mlp": 0.01031902, + "balance_loss_clip": 1.04946363, + "balance_loss_mlp": 1.01952207, + "epoch": 0.6296708251916429, + "flos": 24242934885120.0, + "grad_norm": 3.1394879004699487, + "language_loss": 0.69715798, + "learning_rate": 1.2740678649438828e-06, + "loss": 0.71874768, + "num_input_tokens_seen": 225864310, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.12371826, + "step": 10473, + "time_per_iteration": 4.006153583526611 + }, + { + "auxiliary_loss_clip": 0.01116293, + "auxiliary_loss_mlp": 0.01027902, + "balance_loss_clip": 1.04264736, + "balance_loss_mlp": 1.01662457, + "epoch": 0.6297309484443109, + "flos": 19278536077440.0, + "grad_norm": 1.6317743874912582, + "language_loss": 0.74830508, + "learning_rate": 1.2737049773968554e-06, + "loss": 0.76974702, + "num_input_tokens_seen": 225883830, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.112854, + "step": 10474, + "time_per_iteration": 2.5087108612060547 + }, + { + "auxiliary_loss_clip": 0.01118627, + "auxiliary_loss_mlp": 0.01025655, + "balance_loss_clip": 1.04534554, + "balance_loss_mlp": 1.01402569, + "epoch": 0.6297910716969788, + "flos": 30662685043200.0, + "grad_norm": 1.859245622073919, + "language_loss": 0.66527957, + "learning_rate": 1.2733421173910081e-06, + "loss": 0.6867224, + "num_input_tokens_seen": 225905755, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11639404, + "step": 10475, + "time_per_iteration": 2.5618510246276855 + }, + { + "auxiliary_loss_clip": 0.01114265, + "auxiliary_loss_mlp": 0.0103053, + "balance_loss_clip": 1.04415786, + "balance_loss_mlp": 1.01958632, + "epoch": 0.6298511949496468, + "flos": 14423018371200.0, + "grad_norm": 2.089211374753933, + "language_loss": 0.90295815, + "learning_rate": 1.272979284940101e-06, + "loss": 0.92440605, + "num_input_tokens_seen": 225922155, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.10931396, + "step": 10476, + "time_per_iteration": 2.4561824798583984 + }, + { + "auxiliary_loss_clip": 0.01127168, + "auxiliary_loss_mlp": 0.01028545, + "balance_loss_clip": 1.05356669, + "balance_loss_mlp": 1.01743448, + "epoch": 0.6299113182023147, + "flos": 23514163845120.0, + "grad_norm": 1.7977948364012912, + "language_loss": 0.75668401, + "learning_rate": 1.2726164800578913e-06, + "loss": 0.7782411, + "num_input_tokens_seen": 225941060, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11114502, + "step": 10477, + "time_per_iteration": 2.505936861038208 + }, + { + "auxiliary_loss_clip": 0.01134207, + "auxiliary_loss_mlp": 0.01028414, + "balance_loss_clip": 1.05663848, + "balance_loss_mlp": 1.01629043, + "epoch": 0.6299714414549827, + "flos": 22674500542080.0, + "grad_norm": 1.7526186963781587, + "language_loss": 0.70912671, + "learning_rate": 1.272253702758138e-06, + "loss": 0.73075294, + "num_input_tokens_seen": 225960870, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.12121582, + "step": 10478, + "time_per_iteration": 2.4921445846557617 + }, + { + "auxiliary_loss_clip": 0.0112401, + "auxiliary_loss_mlp": 0.01028137, + "balance_loss_clip": 1.047562, + "balance_loss_mlp": 1.01600122, + "epoch": 0.6300315647076506, + "flos": 14501735026560.0, + "grad_norm": 2.1995279825590726, + "language_loss": 0.67324126, + "learning_rate": 1.2718909530545974e-06, + "loss": 0.69476271, + "num_input_tokens_seen": 225977895, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.12127686, + "step": 10479, + "time_per_iteration": 2.4616317749023438 + }, + { + "auxiliary_loss_clip": 0.01125554, + "auxiliary_loss_mlp": 0.01034999, + "balance_loss_clip": 1.05321801, + "balance_loss_mlp": 1.02277994, + "epoch": 0.6300916879603187, + "flos": 21871681614720.0, + "grad_norm": 1.6657700785953544, + "language_loss": 0.737167, + "learning_rate": 1.2715282309610245e-06, + "loss": 0.75877255, + "num_input_tokens_seen": 225997835, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.12207031, + "step": 10480, + "time_per_iteration": 2.5197978019714355 + }, + { + "auxiliary_loss_clip": 0.01122144, + "auxiliary_loss_mlp": 0.01032293, + "balance_loss_clip": 1.04599798, + "balance_loss_mlp": 1.01917374, + "epoch": 0.6301518112129866, + "flos": 21834047139840.0, + "grad_norm": 2.007886765974185, + "language_loss": 0.78905272, + "learning_rate": 1.2711655364911744e-06, + "loss": 0.81059718, + "num_input_tokens_seen": 226017620, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.13116455, + "step": 10481, + "time_per_iteration": 2.482895851135254 + }, + { + "auxiliary_loss_clip": 0.01061669, + "auxiliary_loss_mlp": 0.01008901, + "balance_loss_clip": 1.03640902, + "balance_loss_mlp": 1.0075314, + "epoch": 0.6302119344656546, + "flos": 44334237957120.0, + "grad_norm": 0.896254652925366, + "language_loss": 0.61806601, + "learning_rate": 1.2708028696588e-06, + "loss": 0.63877177, + "num_input_tokens_seen": 226068755, + "router_z_loss_clip": 0.25268555, + "router_z_loss_mlp": 0.01370239, + "step": 10482, + "time_per_iteration": 2.9093854427337646 + }, + { + "auxiliary_loss_clip": 0.01119434, + "auxiliary_loss_mlp": 0.01037692, + "balance_loss_clip": 1.04081476, + "balance_loss_mlp": 1.02413166, + "epoch": 0.6302720577183225, + "flos": 11217919800960.0, + "grad_norm": 2.506511407858946, + "language_loss": 0.83156365, + "learning_rate": 1.2704402304776541e-06, + "loss": 0.85313487, + "num_input_tokens_seen": 226084395, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.13549805, + "step": 10483, + "time_per_iteration": 3.8937740325927734 + }, + { + "auxiliary_loss_clip": 0.01105256, + "auxiliary_loss_mlp": 0.0103765, + "balance_loss_clip": 1.03686178, + "balance_loss_mlp": 1.02620625, + "epoch": 0.6303321809709905, + "flos": 27964932122880.0, + "grad_norm": 1.564309970201554, + "language_loss": 0.72937286, + "learning_rate": 1.270077618961487e-06, + "loss": 0.75080192, + "num_input_tokens_seen": 226105890, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 0.11444092, + "step": 10484, + "time_per_iteration": 2.6168527603149414 + }, + { + "auxiliary_loss_clip": 0.01113006, + "auxiliary_loss_mlp": 0.01028332, + "balance_loss_clip": 1.03816676, + "balance_loss_mlp": 1.01599419, + "epoch": 0.6303923042236586, + "flos": 28220759763840.0, + "grad_norm": 1.8262893046619866, + "language_loss": 0.74707586, + "learning_rate": 1.2697150351240506e-06, + "loss": 0.76848924, + "num_input_tokens_seen": 226126760, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.12329102, + "step": 10485, + "time_per_iteration": 2.571502447128296 + }, + { + "auxiliary_loss_clip": 0.01131761, + "auxiliary_loss_mlp": 0.01036526, + "balance_loss_clip": 1.05326056, + "balance_loss_mlp": 1.02430153, + "epoch": 0.6304524274763265, + "flos": 27631034271360.0, + "grad_norm": 1.9138790047562921, + "language_loss": 0.81252301, + "learning_rate": 1.269352478979093e-06, + "loss": 0.83420587, + "num_input_tokens_seen": 226147315, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.12237549, + "step": 10486, + "time_per_iteration": 2.724208354949951 + }, + { + "auxiliary_loss_clip": 0.01118408, + "auxiliary_loss_mlp": 0.01038778, + "balance_loss_clip": 1.04342246, + "balance_loss_mlp": 1.02689302, + "epoch": 0.6305125507289945, + "flos": 17311313963520.0, + "grad_norm": 1.8074747913320275, + "language_loss": 0.63306856, + "learning_rate": 1.2689899505403628e-06, + "loss": 0.65464044, + "num_input_tokens_seen": 226165935, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11889648, + "step": 10487, + "time_per_iteration": 2.4661078453063965 + }, + { + "auxiliary_loss_clip": 0.01115307, + "auxiliary_loss_mlp": 0.0103554, + "balance_loss_clip": 1.0428946, + "balance_loss_mlp": 1.02402472, + "epoch": 0.6305726739816624, + "flos": 25808280658560.0, + "grad_norm": 1.5571524071609886, + "language_loss": 0.66900623, + "learning_rate": 1.2686274498216065e-06, + "loss": 0.69051474, + "num_input_tokens_seen": 226186890, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.1151123, + "step": 10488, + "time_per_iteration": 2.516685962677002 + }, + { + "auxiliary_loss_clip": 0.0111672, + "auxiliary_loss_mlp": 0.01030836, + "balance_loss_clip": 1.04235482, + "balance_loss_mlp": 1.01904607, + "epoch": 0.6306327972343304, + "flos": 21797454159360.0, + "grad_norm": 1.8694465720356566, + "language_loss": 0.6757791, + "learning_rate": 1.2682649768365706e-06, + "loss": 0.69725466, + "num_input_tokens_seen": 226206710, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.11791992, + "step": 10489, + "time_per_iteration": 3.924858331680298 + }, + { + "auxiliary_loss_clip": 0.0112623, + "auxiliary_loss_mlp": 0.01037032, + "balance_loss_clip": 1.04695106, + "balance_loss_mlp": 1.02369845, + "epoch": 0.6306929204869983, + "flos": 20777375819520.0, + "grad_norm": 1.7297104542795239, + "language_loss": 0.69263613, + "learning_rate": 1.2679025315990007e-06, + "loss": 0.71426874, + "num_input_tokens_seen": 226225565, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.13342285, + "step": 10490, + "time_per_iteration": 2.4718446731567383 + }, + { + "auxiliary_loss_clip": 0.01114932, + "auxiliary_loss_mlp": 0.01034914, + "balance_loss_clip": 1.04001427, + "balance_loss_mlp": 1.02274299, + "epoch": 0.6307530437396663, + "flos": 23654214973440.0, + "grad_norm": 1.7492455879323885, + "language_loss": 0.77912259, + "learning_rate": 1.2675401141226393e-06, + "loss": 0.80062103, + "num_input_tokens_seen": 226243680, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.12176514, + "step": 10491, + "time_per_iteration": 2.4920263290405273 + }, + { + "auxiliary_loss_clip": 0.01120309, + "auxiliary_loss_mlp": 0.01038485, + "balance_loss_clip": 1.04621696, + "balance_loss_mlp": 1.0241797, + "epoch": 0.6308131669923343, + "flos": 24719002767360.0, + "grad_norm": 1.9665438464848717, + "language_loss": 0.55931145, + "learning_rate": 1.2671777244212308e-06, + "loss": 0.58089942, + "num_input_tokens_seen": 226264345, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.14312744, + "step": 10492, + "time_per_iteration": 2.5750410556793213 + }, + { + "auxiliary_loss_clip": 0.01126218, + "auxiliary_loss_mlp": 0.01037703, + "balance_loss_clip": 1.05073261, + "balance_loss_mlp": 1.02463746, + "epoch": 0.6308732902450023, + "flos": 22565403959040.0, + "grad_norm": 1.7400575708175607, + "language_loss": 0.63696635, + "learning_rate": 1.2668153625085168e-06, + "loss": 0.65860558, + "num_input_tokens_seen": 226283165, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.13061523, + "step": 10493, + "time_per_iteration": 2.4926342964172363 + }, + { + "auxiliary_loss_clip": 0.01118925, + "auxiliary_loss_mlp": 0.01030419, + "balance_loss_clip": 1.04440355, + "balance_loss_mlp": 1.01796818, + "epoch": 0.6309334134976702, + "flos": 24644200694400.0, + "grad_norm": 1.436039897350188, + "language_loss": 0.8298614, + "learning_rate": 1.2664530283982367e-06, + "loss": 0.85135484, + "num_input_tokens_seen": 226304080, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.12451172, + "step": 10494, + "time_per_iteration": 3.889967203140259 + }, + { + "auxiliary_loss_clip": 0.01124668, + "auxiliary_loss_mlp": 0.01030213, + "balance_loss_clip": 1.04857802, + "balance_loss_mlp": 1.0176363, + "epoch": 0.6309935367503382, + "flos": 41427949651200.0, + "grad_norm": 1.8132459683600748, + "language_loss": 0.79209745, + "learning_rate": 1.2660907221041317e-06, + "loss": 0.8136462, + "num_input_tokens_seen": 226325925, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.12567139, + "step": 10495, + "time_per_iteration": 2.6454572677612305 + }, + { + "auxiliary_loss_clip": 0.01115901, + "auxiliary_loss_mlp": 0.01035144, + "balance_loss_clip": 1.0406251, + "balance_loss_mlp": 1.02113688, + "epoch": 0.6310536600030061, + "flos": 15118931445120.0, + "grad_norm": 1.7527836853621634, + "language_loss": 0.70356536, + "learning_rate": 1.2657284436399403e-06, + "loss": 0.72507584, + "num_input_tokens_seen": 226344190, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.14001465, + "step": 10496, + "time_per_iteration": 2.4386377334594727 + }, + { + "auxiliary_loss_clip": 0.0112263, + "auxiliary_loss_mlp": 0.01031743, + "balance_loss_clip": 1.04662323, + "balance_loss_mlp": 1.01891589, + "epoch": 0.6311137832556741, + "flos": 15231619388160.0, + "grad_norm": 2.1226303066570282, + "language_loss": 0.80065048, + "learning_rate": 1.2653661930193997e-06, + "loss": 0.82219416, + "num_input_tokens_seen": 226361520, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.12835693, + "step": 10497, + "time_per_iteration": 2.4876081943511963 + }, + { + "auxiliary_loss_clip": 0.01113765, + "auxiliary_loss_mlp": 0.01030178, + "balance_loss_clip": 1.04216993, + "balance_loss_mlp": 1.01912737, + "epoch": 0.6311739065083422, + "flos": 22018664067840.0, + "grad_norm": 1.8143849190139496, + "language_loss": 0.74089241, + "learning_rate": 1.265003970256247e-06, + "loss": 0.76233184, + "num_input_tokens_seen": 226381920, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.1104126, + "step": 10498, + "time_per_iteration": 2.5472240447998047 + }, + { + "auxiliary_loss_clip": 0.01111272, + "auxiliary_loss_mlp": 0.01031691, + "balance_loss_clip": 1.0374701, + "balance_loss_mlp": 1.01940668, + "epoch": 0.6312340297610101, + "flos": 22710770300160.0, + "grad_norm": 2.5627333192245607, + "language_loss": 0.69880402, + "learning_rate": 1.264641775364217e-06, + "loss": 0.72023356, + "num_input_tokens_seen": 226400035, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.1229248, + "step": 10499, + "time_per_iteration": 2.5303280353546143 + }, + { + "auxiliary_loss_clip": 0.01116691, + "auxiliary_loss_mlp": 0.01034623, + "balance_loss_clip": 1.04582512, + "balance_loss_mlp": 1.02272034, + "epoch": 0.6312941530136781, + "flos": 24280102483200.0, + "grad_norm": 2.2321053583337305, + "language_loss": 0.69932771, + "learning_rate": 1.2642796083570448e-06, + "loss": 0.72084093, + "num_input_tokens_seen": 226418280, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11914062, + "step": 10500, + "time_per_iteration": 2.5049238204956055 + }, + { + "auxiliary_loss_clip": 0.01112519, + "auxiliary_loss_mlp": 0.01032337, + "balance_loss_clip": 1.03909898, + "balance_loss_mlp": 1.02068377, + "epoch": 0.631354276266346, + "flos": 21725956137600.0, + "grad_norm": 2.0501181331166665, + "language_loss": 0.74096, + "learning_rate": 1.2639174692484634e-06, + "loss": 0.76240861, + "num_input_tokens_seen": 226436650, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11651611, + "step": 10501, + "time_per_iteration": 2.499147415161133 + }, + { + "auxiliary_loss_clip": 0.01116541, + "auxiliary_loss_mlp": 0.01035394, + "balance_loss_clip": 1.04311085, + "balance_loss_mlp": 1.02262688, + "epoch": 0.631414399519014, + "flos": 24025100855040.0, + "grad_norm": 3.2067202619371153, + "language_loss": 0.75284231, + "learning_rate": 1.2635553580522053e-06, + "loss": 0.77436161, + "num_input_tokens_seen": 226456275, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.12768555, + "step": 10502, + "time_per_iteration": 2.4905974864959717 + }, + { + "auxiliary_loss_clip": 0.01117501, + "auxiliary_loss_mlp": 0.01045104, + "balance_loss_clip": 1.04130077, + "balance_loss_mlp": 1.03203249, + "epoch": 0.6314745227716819, + "flos": 24315797623680.0, + "grad_norm": 1.9982629709112856, + "language_loss": 0.85835999, + "learning_rate": 1.2631932747820022e-06, + "loss": 0.87998605, + "num_input_tokens_seen": 226473610, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.13067627, + "step": 10503, + "time_per_iteration": 2.562453031539917 + }, + { + "auxiliary_loss_clip": 0.01122057, + "auxiliary_loss_mlp": 0.01029745, + "balance_loss_clip": 1.04589367, + "balance_loss_mlp": 1.01762772, + "epoch": 0.6315346460243499, + "flos": 23366391292800.0, + "grad_norm": 3.3271019402793356, + "language_loss": 0.86852032, + "learning_rate": 1.2628312194515838e-06, + "loss": 0.89003831, + "num_input_tokens_seen": 226493665, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.12115479, + "step": 10504, + "time_per_iteration": 2.4942047595977783 + }, + { + "auxiliary_loss_clip": 0.01122215, + "auxiliary_loss_mlp": 0.01036024, + "balance_loss_clip": 1.04538834, + "balance_loss_mlp": 1.0227325, + "epoch": 0.6315947692770179, + "flos": 20260333497600.0, + "grad_norm": 1.6746387978553636, + "language_loss": 0.76278996, + "learning_rate": 1.2624691920746793e-06, + "loss": 0.78437245, + "num_input_tokens_seen": 226511625, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.13305664, + "step": 10505, + "time_per_iteration": 2.5202741622924805 + }, + { + "auxiliary_loss_clip": 0.01112752, + "auxiliary_loss_mlp": 0.01036131, + "balance_loss_clip": 1.0383445, + "balance_loss_mlp": 1.02320242, + "epoch": 0.6316548925296859, + "flos": 25265850399360.0, + "grad_norm": 1.9550932452342966, + "language_loss": 0.82029521, + "learning_rate": 1.2621071926650166e-06, + "loss": 0.841784, + "num_input_tokens_seen": 226530085, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.12921143, + "step": 10506, + "time_per_iteration": 2.570160388946533 + }, + { + "auxiliary_loss_clip": 0.01121103, + "auxiliary_loss_mlp": 0.01033836, + "balance_loss_clip": 1.04608905, + "balance_loss_mlp": 1.02120566, + "epoch": 0.6317150157823538, + "flos": 22930579578240.0, + "grad_norm": 1.8360332083158202, + "language_loss": 0.74714535, + "learning_rate": 1.2617452212363238e-06, + "loss": 0.76869476, + "num_input_tokens_seen": 226548115, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.1262207, + "step": 10507, + "time_per_iteration": 2.5060837268829346 + }, + { + "auxiliary_loss_clip": 0.01125149, + "auxiliary_loss_mlp": 0.01037647, + "balance_loss_clip": 1.04747856, + "balance_loss_mlp": 1.02461743, + "epoch": 0.6317751390350218, + "flos": 22527051212160.0, + "grad_norm": 1.6241633083305547, + "language_loss": 0.6796695, + "learning_rate": 1.2613832778023258e-06, + "loss": 0.7012974, + "num_input_tokens_seen": 226567955, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.13024902, + "step": 10508, + "time_per_iteration": 2.477893590927124 + }, + { + "auxiliary_loss_clip": 0.0111844, + "auxiliary_loss_mlp": 0.01039414, + "balance_loss_clip": 1.04275799, + "balance_loss_mlp": 1.02601504, + "epoch": 0.6318352622876897, + "flos": 23294749616640.0, + "grad_norm": 1.9058262331697307, + "language_loss": 0.71313739, + "learning_rate": 1.2610213623767478e-06, + "loss": 0.73471594, + "num_input_tokens_seen": 226588205, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.13409424, + "step": 10509, + "time_per_iteration": 2.4911675453186035 + }, + { + "auxiliary_loss_clip": 0.01121766, + "auxiliary_loss_mlp": 0.010307, + "balance_loss_clip": 1.04639721, + "balance_loss_mlp": 1.01888621, + "epoch": 0.6318953855403577, + "flos": 20704082117760.0, + "grad_norm": 1.5907921549580477, + "language_loss": 0.79300749, + "learning_rate": 1.2606594749733143e-06, + "loss": 0.81453216, + "num_input_tokens_seen": 226606965, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.11810303, + "step": 10510, + "time_per_iteration": 2.472412109375 + }, + { + "auxiliary_loss_clip": 0.01116556, + "auxiliary_loss_mlp": 0.01033398, + "balance_loss_clip": 1.04195511, + "balance_loss_mlp": 1.02119064, + "epoch": 0.6319555087930258, + "flos": 22820046451200.0, + "grad_norm": 2.147767989925831, + "language_loss": 0.70768869, + "learning_rate": 1.2602976156057469e-06, + "loss": 0.7291882, + "num_input_tokens_seen": 226627845, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.12200928, + "step": 10511, + "time_per_iteration": 2.473311424255371 + }, + { + "auxiliary_loss_clip": 0.01115346, + "auxiliary_loss_mlp": 0.01030483, + "balance_loss_clip": 1.04377806, + "balance_loss_mlp": 1.01905644, + "epoch": 0.6320156320456937, + "flos": 19970929618560.0, + "grad_norm": 1.6556091295040078, + "language_loss": 0.80154103, + "learning_rate": 1.2599357842877684e-06, + "loss": 0.82299936, + "num_input_tokens_seen": 226645855, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11437988, + "step": 10512, + "time_per_iteration": 2.467806577682495 + }, + { + "auxiliary_loss_clip": 0.011241, + "auxiliary_loss_mlp": 0.01035221, + "balance_loss_clip": 1.04989803, + "balance_loss_mlp": 1.0223403, + "epoch": 0.6320757552983617, + "flos": 27013406889600.0, + "grad_norm": 2.449908904836896, + "language_loss": 0.70960993, + "learning_rate": 1.2595739810330994e-06, + "loss": 0.73120308, + "num_input_tokens_seen": 226665375, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.12884521, + "step": 10513, + "time_per_iteration": 2.568326473236084 + }, + { + "auxiliary_loss_clip": 0.0111571, + "auxiliary_loss_mlp": 0.01030992, + "balance_loss_clip": 1.03953803, + "balance_loss_mlp": 1.01790857, + "epoch": 0.6321358785510296, + "flos": 23695943598720.0, + "grad_norm": 2.727611953583231, + "language_loss": 0.66175973, + "learning_rate": 1.259212205855459e-06, + "loss": 0.68322676, + "num_input_tokens_seen": 226685270, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.13079834, + "step": 10514, + "time_per_iteration": 2.4912054538726807 + }, + { + "auxiliary_loss_clip": 0.01120446, + "auxiliary_loss_mlp": 0.01031967, + "balance_loss_clip": 1.04695892, + "balance_loss_mlp": 1.02014768, + "epoch": 0.6321960018036976, + "flos": 25995231970560.0, + "grad_norm": 1.6710676388331174, + "language_loss": 0.74054253, + "learning_rate": 1.2588504587685663e-06, + "loss": 0.76206672, + "num_input_tokens_seen": 226705325, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11810303, + "step": 10515, + "time_per_iteration": 2.515519857406616 + }, + { + "auxiliary_loss_clip": 0.01117318, + "auxiliary_loss_mlp": 0.01026197, + "balance_loss_clip": 1.04706812, + "balance_loss_mlp": 1.01516986, + "epoch": 0.6322561250563655, + "flos": 22821016118400.0, + "grad_norm": 1.9147186418185407, + "language_loss": 0.89990282, + "learning_rate": 1.2584887397861379e-06, + "loss": 0.92133796, + "num_input_tokens_seen": 226723815, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.11022949, + "step": 10516, + "time_per_iteration": 3.8808517456054688 + }, + { + "auxiliary_loss_clip": 0.01133798, + "auxiliary_loss_mlp": 0.01030746, + "balance_loss_clip": 1.05314636, + "balance_loss_mlp": 1.01702523, + "epoch": 0.6323162483090335, + "flos": 18988413926400.0, + "grad_norm": 1.711820790770524, + "language_loss": 0.82139117, + "learning_rate": 1.2581270489218911e-06, + "loss": 0.84303665, + "num_input_tokens_seen": 226741550, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.13720703, + "step": 10517, + "time_per_iteration": 2.4354522228240967 + }, + { + "auxiliary_loss_clip": 0.01126289, + "auxiliary_loss_mlp": 0.01036213, + "balance_loss_clip": 1.05103207, + "balance_loss_mlp": 1.02387452, + "epoch": 0.6323763715617015, + "flos": 19865173000320.0, + "grad_norm": 2.135246103630368, + "language_loss": 0.77551585, + "learning_rate": 1.257765386189541e-06, + "loss": 0.7971409, + "num_input_tokens_seen": 226761115, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12329102, + "step": 10518, + "time_per_iteration": 2.4628682136535645 + }, + { + "auxiliary_loss_clip": 0.01122886, + "auxiliary_loss_mlp": 0.01029901, + "balance_loss_clip": 1.05098844, + "balance_loss_mlp": 1.01818228, + "epoch": 0.6324364948143695, + "flos": 22782699285120.0, + "grad_norm": 1.6750095717162938, + "language_loss": 0.85406721, + "learning_rate": 1.2574037516028018e-06, + "loss": 0.87559503, + "num_input_tokens_seen": 226782225, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.1171875, + "step": 10519, + "time_per_iteration": 2.499537944793701 + }, + { + "auxiliary_loss_clip": 0.01114472, + "auxiliary_loss_mlp": 0.01035337, + "balance_loss_clip": 1.04438376, + "balance_loss_mlp": 1.02462029, + "epoch": 0.6324966180670374, + "flos": 22235923480320.0, + "grad_norm": 2.3699654215081107, + "language_loss": 0.71758044, + "learning_rate": 1.2570421451753867e-06, + "loss": 0.73907852, + "num_input_tokens_seen": 226802375, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.10717773, + "step": 10520, + "time_per_iteration": 2.4911022186279297 + }, + { + "auxiliary_loss_clip": 0.0111663, + "auxiliary_loss_mlp": 0.01030719, + "balance_loss_clip": 1.04428005, + "balance_loss_mlp": 1.0193758, + "epoch": 0.6325567413197054, + "flos": 21689183589120.0, + "grad_norm": 1.9708137160801855, + "language_loss": 0.71682262, + "learning_rate": 1.2566805669210081e-06, + "loss": 0.73829615, + "num_input_tokens_seen": 226822165, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11340332, + "step": 10521, + "time_per_iteration": 2.5439677238464355 + }, + { + "auxiliary_loss_clip": 0.01122493, + "auxiliary_loss_mlp": 0.01036031, + "balance_loss_clip": 1.04832315, + "balance_loss_mlp": 1.02199411, + "epoch": 0.6326168645723733, + "flos": 19937137898880.0, + "grad_norm": 2.038437241504446, + "language_loss": 0.72210371, + "learning_rate": 1.256319016853377e-06, + "loss": 0.74368894, + "num_input_tokens_seen": 226841645, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.14044189, + "step": 10522, + "time_per_iteration": 2.4699995517730713 + }, + { + "auxiliary_loss_clip": 0.01122161, + "auxiliary_loss_mlp": 0.01033699, + "balance_loss_clip": 1.04592037, + "balance_loss_mlp": 1.02131915, + "epoch": 0.6326769878250413, + "flos": 20230348619520.0, + "grad_norm": 2.6593070691166356, + "language_loss": 0.81823003, + "learning_rate": 1.2559574949862023e-06, + "loss": 0.83978868, + "num_input_tokens_seen": 226860355, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.12371826, + "step": 10523, + "time_per_iteration": 2.447880744934082 + }, + { + "auxiliary_loss_clip": 0.0111935, + "auxiliary_loss_mlp": 0.0102347, + "balance_loss_clip": 1.04596353, + "balance_loss_mlp": 1.01167989, + "epoch": 0.6327371110777094, + "flos": 20775759707520.0, + "grad_norm": 2.6568075408893703, + "language_loss": 0.73749673, + "learning_rate": 1.255596001333195e-06, + "loss": 0.7589249, + "num_input_tokens_seen": 226878390, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11804199, + "step": 10524, + "time_per_iteration": 2.469883441925049 + }, + { + "auxiliary_loss_clip": 0.01125337, + "auxiliary_loss_mlp": 0.01035575, + "balance_loss_clip": 1.0462203, + "balance_loss_mlp": 1.0215621, + "epoch": 0.6327972343303773, + "flos": 30336544529280.0, + "grad_norm": 2.0410548758223697, + "language_loss": 0.84473264, + "learning_rate": 1.2552345359080615e-06, + "loss": 0.86634171, + "num_input_tokens_seen": 226898420, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.14019775, + "step": 10525, + "time_per_iteration": 2.537801742553711 + }, + { + "auxiliary_loss_clip": 0.01118438, + "auxiliary_loss_mlp": 0.01024521, + "balance_loss_clip": 1.04417109, + "balance_loss_mlp": 1.01258826, + "epoch": 0.6328573575830453, + "flos": 17092258871040.0, + "grad_norm": 2.063475519542628, + "language_loss": 0.67003798, + "learning_rate": 1.2548730987245093e-06, + "loss": 0.69146764, + "num_input_tokens_seen": 226916305, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.1192627, + "step": 10526, + "time_per_iteration": 2.587064266204834 + }, + { + "auxiliary_loss_clip": 0.01132678, + "auxiliary_loss_mlp": 0.01032486, + "balance_loss_clip": 1.05420196, + "balance_loss_mlp": 1.01869965, + "epoch": 0.6329174808357132, + "flos": 25047154442880.0, + "grad_norm": 1.5072462286826362, + "language_loss": 0.735241, + "learning_rate": 1.254511689796244e-06, + "loss": 0.75689268, + "num_input_tokens_seen": 226937705, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.13793945, + "step": 10527, + "time_per_iteration": 3.9281370639801025 + }, + { + "auxiliary_loss_clip": 0.01120443, + "auxiliary_loss_mlp": 0.01033348, + "balance_loss_clip": 1.04802454, + "balance_loss_mlp": 1.02189207, + "epoch": 0.6329776040883812, + "flos": 16836826279680.0, + "grad_norm": 2.8544481220402296, + "language_loss": 0.72010839, + "learning_rate": 1.2541503091369693e-06, + "loss": 0.74164635, + "num_input_tokens_seen": 226954880, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11456299, + "step": 10528, + "time_per_iteration": 2.4847867488861084 + }, + { + "auxiliary_loss_clip": 0.01117147, + "auxiliary_loss_mlp": 0.01025178, + "balance_loss_clip": 1.04351616, + "balance_loss_mlp": 1.01268458, + "epoch": 0.6330377273410491, + "flos": 13516705382400.0, + "grad_norm": 2.720865548886822, + "language_loss": 0.66645229, + "learning_rate": 1.2537889567603905e-06, + "loss": 0.68787551, + "num_input_tokens_seen": 226972595, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.12506104, + "step": 10529, + "time_per_iteration": 2.485731363296509 + }, + { + "auxiliary_loss_clip": 0.01116903, + "auxiliary_loss_mlp": 0.01032072, + "balance_loss_clip": 1.04159236, + "balance_loss_mlp": 1.01904273, + "epoch": 0.6330978505937171, + "flos": 21538825257600.0, + "grad_norm": 2.7149923409769587, + "language_loss": 0.75223005, + "learning_rate": 1.2534276326802092e-06, + "loss": 0.77371991, + "num_input_tokens_seen": 226991910, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.13037109, + "step": 10530, + "time_per_iteration": 2.461665630340576 + }, + { + "auxiliary_loss_clip": 0.01127323, + "auxiliary_loss_mlp": 0.01031261, + "balance_loss_clip": 1.05205655, + "balance_loss_mlp": 1.01900053, + "epoch": 0.6331579738463851, + "flos": 25009484054400.0, + "grad_norm": 1.82961835503903, + "language_loss": 0.73618108, + "learning_rate": 1.2530663369101259e-06, + "loss": 0.75776696, + "num_input_tokens_seen": 227010175, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.12268066, + "step": 10531, + "time_per_iteration": 2.50280499458313 + }, + { + "auxiliary_loss_clip": 0.01118932, + "auxiliary_loss_mlp": 0.01028926, + "balance_loss_clip": 1.04786301, + "balance_loss_mlp": 1.01699901, + "epoch": 0.6332180970990531, + "flos": 14976007228800.0, + "grad_norm": 2.1147573547451413, + "language_loss": 0.79577553, + "learning_rate": 1.2527050694638432e-06, + "loss": 0.81725413, + "num_input_tokens_seen": 227025540, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.11932373, + "step": 10532, + "time_per_iteration": 3.8695361614227295 + }, + { + "auxiliary_loss_clip": 0.01112699, + "auxiliary_loss_mlp": 0.01036051, + "balance_loss_clip": 1.04123342, + "balance_loss_mlp": 1.02367675, + "epoch": 0.633278220351721, + "flos": 22706963458560.0, + "grad_norm": 1.6839153595419123, + "language_loss": 0.74860418, + "learning_rate": 1.2523438303550582e-06, + "loss": 0.77009165, + "num_input_tokens_seen": 227045520, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.12384033, + "step": 10533, + "time_per_iteration": 2.4754581451416016 + }, + { + "auxiliary_loss_clip": 0.01122819, + "auxiliary_loss_mlp": 0.01039161, + "balance_loss_clip": 1.04308581, + "balance_loss_mlp": 1.02419996, + "epoch": 0.633338343604389, + "flos": 12602922364800.0, + "grad_norm": 3.771547947657202, + "language_loss": 0.77365291, + "learning_rate": 1.2519826195974706e-06, + "loss": 0.79527277, + "num_input_tokens_seen": 227059420, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.14959717, + "step": 10534, + "time_per_iteration": 2.399989366531372 + }, + { + "auxiliary_loss_clip": 0.01121683, + "auxiliary_loss_mlp": 0.01031546, + "balance_loss_clip": 1.04856646, + "balance_loss_mlp": 1.02009058, + "epoch": 0.6333984668570569, + "flos": 25960111447680.0, + "grad_norm": 1.7985693524644442, + "language_loss": 0.85782766, + "learning_rate": 1.251621437204777e-06, + "loss": 0.8793599, + "num_input_tokens_seen": 227081310, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11456299, + "step": 10535, + "time_per_iteration": 2.496748447418213 + }, + { + "auxiliary_loss_clip": 0.01125672, + "auxiliary_loss_mlp": 0.01028392, + "balance_loss_clip": 1.05071652, + "balance_loss_mlp": 1.01630437, + "epoch": 0.6334585901097249, + "flos": 23659242877440.0, + "grad_norm": 1.7698121977142827, + "language_loss": 0.76447988, + "learning_rate": 1.2512602831906733e-06, + "loss": 0.78602052, + "num_input_tokens_seen": 227100365, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.12091064, + "step": 10536, + "time_per_iteration": 2.4798381328582764 + }, + { + "auxiliary_loss_clip": 0.01126154, + "auxiliary_loss_mlp": 0.01028175, + "balance_loss_clip": 1.05428004, + "balance_loss_mlp": 1.01689172, + "epoch": 0.633518713362393, + "flos": 28760496503040.0, + "grad_norm": 1.7996455037364758, + "language_loss": 0.60138196, + "learning_rate": 1.250899157568855e-06, + "loss": 0.62292528, + "num_input_tokens_seen": 227119680, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11279297, + "step": 10537, + "time_per_iteration": 2.506206750869751 + }, + { + "auxiliary_loss_clip": 0.0104629, + "auxiliary_loss_mlp": 0.01006273, + "balance_loss_clip": 1.02136338, + "balance_loss_mlp": 1.00499713, + "epoch": 0.6335788366150609, + "flos": 70420322401920.0, + "grad_norm": 0.7963105368497588, + "language_loss": 0.52384484, + "learning_rate": 1.2505380603530155e-06, + "loss": 0.54437053, + "num_input_tokens_seen": 227184465, + "router_z_loss_clip": 0.24926758, + "router_z_loss_mlp": 0.01275635, + "step": 10538, + "time_per_iteration": 4.604723691940308 + }, + { + "auxiliary_loss_clip": 0.01123342, + "auxiliary_loss_mlp": 0.0102913, + "balance_loss_clip": 1.04602873, + "balance_loss_mlp": 1.01595116, + "epoch": 0.6336389598677289, + "flos": 23732069702400.0, + "grad_norm": 1.7653302193318223, + "language_loss": 0.83485103, + "learning_rate": 1.250176991556848e-06, + "loss": 0.85637575, + "num_input_tokens_seen": 227202185, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.13171387, + "step": 10539, + "time_per_iteration": 2.5151140689849854 + }, + { + "auxiliary_loss_clip": 0.01118625, + "auxiliary_loss_mlp": 0.01031634, + "balance_loss_clip": 1.04386425, + "balance_loss_mlp": 1.01802588, + "epoch": 0.6336990831203968, + "flos": 29276676898560.0, + "grad_norm": 1.8907653708181722, + "language_loss": 0.86794126, + "learning_rate": 1.2498159511940438e-06, + "loss": 0.88944387, + "num_input_tokens_seen": 227222020, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.13623047, + "step": 10540, + "time_per_iteration": 2.5060057640075684 + }, + { + "auxiliary_loss_clip": 0.01119419, + "auxiliary_loss_mlp": 0.01029904, + "balance_loss_clip": 1.04854059, + "balance_loss_mlp": 1.01942587, + "epoch": 0.6337592063730648, + "flos": 29096836479360.0, + "grad_norm": 1.6232368863168263, + "language_loss": 0.72557628, + "learning_rate": 1.2494549392782943e-06, + "loss": 0.74706954, + "num_input_tokens_seen": 227240885, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.10473633, + "step": 10541, + "time_per_iteration": 2.589621067047119 + }, + { + "auxiliary_loss_clip": 0.01128156, + "auxiliary_loss_mlp": 0.01029877, + "balance_loss_clip": 1.05031466, + "balance_loss_mlp": 1.01635242, + "epoch": 0.6338193296257327, + "flos": 34706477249280.0, + "grad_norm": 2.576211806710961, + "language_loss": 0.84685981, + "learning_rate": 1.2490939558232887e-06, + "loss": 0.86844003, + "num_input_tokens_seen": 227257880, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.13531494, + "step": 10542, + "time_per_iteration": 2.5831189155578613 + }, + { + "auxiliary_loss_clip": 0.01119242, + "auxiliary_loss_mlp": 0.01030997, + "balance_loss_clip": 1.04502892, + "balance_loss_mlp": 1.0173471, + "epoch": 0.6338794528784008, + "flos": 16687581269760.0, + "grad_norm": 1.7149470177707593, + "language_loss": 0.77805459, + "learning_rate": 1.2487330008427153e-06, + "loss": 0.79955697, + "num_input_tokens_seen": 227274840, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.13641357, + "step": 10543, + "time_per_iteration": 2.502105951309204 + }, + { + "auxiliary_loss_clip": 0.01120333, + "auxiliary_loss_mlp": 0.01031846, + "balance_loss_clip": 1.048581, + "balance_loss_mlp": 1.02001429, + "epoch": 0.6339395761310687, + "flos": 22346600261760.0, + "grad_norm": 1.7321201107583246, + "language_loss": 0.73699504, + "learning_rate": 1.2483720743502618e-06, + "loss": 0.75851685, + "num_input_tokens_seen": 227294835, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.1184082, + "step": 10544, + "time_per_iteration": 2.514258623123169 + }, + { + "auxiliary_loss_clip": 0.01124072, + "auxiliary_loss_mlp": 0.01034955, + "balance_loss_clip": 1.0463953, + "balance_loss_mlp": 1.0225395, + "epoch": 0.6339996993837367, + "flos": 18551812112640.0, + "grad_norm": 1.9377592238557715, + "language_loss": 0.68540215, + "learning_rate": 1.2480111763596144e-06, + "loss": 0.70699239, + "num_input_tokens_seen": 227314935, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.12420654, + "step": 10545, + "time_per_iteration": 2.4664156436920166 + }, + { + "auxiliary_loss_clip": 0.01111311, + "auxiliary_loss_mlp": 0.01037689, + "balance_loss_clip": 1.03898501, + "balance_loss_mlp": 1.02351475, + "epoch": 0.6340598226364046, + "flos": 12969498614400.0, + "grad_norm": 2.0896495289915498, + "language_loss": 0.71040964, + "learning_rate": 1.2476503068844592e-06, + "loss": 0.73189962, + "num_input_tokens_seen": 227332905, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.1416626, + "step": 10546, + "time_per_iteration": 2.4455151557922363 + }, + { + "auxiliary_loss_clip": 0.01124037, + "auxiliary_loss_mlp": 0.01028516, + "balance_loss_clip": 1.05086398, + "balance_loss_mlp": 1.01658297, + "epoch": 0.6341199458890726, + "flos": 26687984647680.0, + "grad_norm": 1.4630584056488136, + "language_loss": 0.7806561, + "learning_rate": 1.2472894659384792e-06, + "loss": 0.80218166, + "num_input_tokens_seen": 227354915, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11938477, + "step": 10547, + "time_per_iteration": 2.50494384765625 + }, + { + "auxiliary_loss_clip": 0.01125116, + "auxiliary_loss_mlp": 0.01034448, + "balance_loss_clip": 1.04851115, + "balance_loss_mlp": 1.02271795, + "epoch": 0.6341800691417405, + "flos": 18734274224640.0, + "grad_norm": 1.7801082731994844, + "language_loss": 0.63126051, + "learning_rate": 1.2469286535353578e-06, + "loss": 0.65285611, + "num_input_tokens_seen": 227372990, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.11730957, + "step": 10548, + "time_per_iteration": 2.503761053085327 + }, + { + "auxiliary_loss_clip": 0.01120635, + "auxiliary_loss_mlp": 0.01030357, + "balance_loss_clip": 1.04859996, + "balance_loss_mlp": 1.01896024, + "epoch": 0.6342401923944085, + "flos": 26249443499520.0, + "grad_norm": 1.7444274134458806, + "language_loss": 0.61967659, + "learning_rate": 1.2465678696887785e-06, + "loss": 0.64118648, + "num_input_tokens_seen": 227393270, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11401367, + "step": 10549, + "time_per_iteration": 2.5024805068969727 + }, + { + "auxiliary_loss_clip": 0.0111999, + "auxiliary_loss_mlp": 0.01030354, + "balance_loss_clip": 1.04732728, + "balance_loss_mlp": 1.01944613, + "epoch": 0.6343003156470765, + "flos": 24680937329280.0, + "grad_norm": 2.291203996884314, + "language_loss": 0.73756838, + "learning_rate": 1.2462071144124197e-06, + "loss": 0.75907183, + "num_input_tokens_seen": 227413630, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.10900879, + "step": 10550, + "time_per_iteration": 2.534341335296631 + }, + { + "auxiliary_loss_clip": 0.01062694, + "auxiliary_loss_mlp": 0.01003478, + "balance_loss_clip": 1.03789854, + "balance_loss_mlp": 1.00232577, + "epoch": 0.6343604388997445, + "flos": 69805352626560.0, + "grad_norm": 0.6924747432605641, + "language_loss": 0.57727599, + "learning_rate": 1.2458463877199638e-06, + "loss": 0.5979377, + "num_input_tokens_seen": 227476630, + "router_z_loss_clip": 0.24780273, + "router_z_loss_mlp": 0.01152039, + "step": 10551, + "time_per_iteration": 3.1171188354492188 + }, + { + "auxiliary_loss_clip": 0.01116055, + "auxiliary_loss_mlp": 0.01024176, + "balance_loss_clip": 1.0436368, + "balance_loss_mlp": 1.01305938, + "epoch": 0.6344205621524125, + "flos": 21982430223360.0, + "grad_norm": 2.0597416848374444, + "language_loss": 0.66862273, + "learning_rate": 1.2454856896250881e-06, + "loss": 0.69002503, + "num_input_tokens_seen": 227496060, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11114502, + "step": 10552, + "time_per_iteration": 2.496096134185791 + }, + { + "auxiliary_loss_clip": 0.0112063, + "auxiliary_loss_mlp": 0.010276, + "balance_loss_clip": 1.04399014, + "balance_loss_mlp": 1.0151484, + "epoch": 0.6344806854050804, + "flos": 20448865008000.0, + "grad_norm": 1.6486233171009108, + "language_loss": 0.82358682, + "learning_rate": 1.24512502014147e-06, + "loss": 0.84506917, + "num_input_tokens_seen": 227513440, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.12463379, + "step": 10553, + "time_per_iteration": 2.455610513687134 + }, + { + "auxiliary_loss_clip": 0.01120237, + "auxiliary_loss_mlp": 0.01031506, + "balance_loss_clip": 1.04584455, + "balance_loss_mlp": 1.01960325, + "epoch": 0.6345408086577484, + "flos": 40510611187200.0, + "grad_norm": 2.822681166440031, + "language_loss": 0.55162776, + "learning_rate": 1.2447643792827879e-06, + "loss": 0.57314521, + "num_input_tokens_seen": 227535395, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11907959, + "step": 10554, + "time_per_iteration": 2.5983505249023438 + }, + { + "auxiliary_loss_clip": 0.01126563, + "auxiliary_loss_mlp": 0.01032578, + "balance_loss_clip": 1.05070853, + "balance_loss_mlp": 1.0201323, + "epoch": 0.6346009319104163, + "flos": 21361319222400.0, + "grad_norm": 1.766316042705098, + "language_loss": 0.7096945, + "learning_rate": 1.2444037670627153e-06, + "loss": 0.73128593, + "num_input_tokens_seen": 227554545, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.12445068, + "step": 10555, + "time_per_iteration": 2.4875879287719727 + }, + { + "auxiliary_loss_clip": 0.01063684, + "auxiliary_loss_mlp": 0.01003191, + "balance_loss_clip": 1.03831959, + "balance_loss_mlp": 1.0017736, + "epoch": 0.6346610551630844, + "flos": 71365419100800.0, + "grad_norm": 0.7999930733844102, + "language_loss": 0.55358171, + "learning_rate": 1.2440431834949276e-06, + "loss": 0.57425046, + "num_input_tokens_seen": 227608575, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.01417542, + "step": 10556, + "time_per_iteration": 3.00337553024292 + }, + { + "auxiliary_loss_clip": 0.01118885, + "auxiliary_loss_mlp": 0.01033587, + "balance_loss_clip": 1.04274499, + "balance_loss_mlp": 1.02013993, + "epoch": 0.6347211784157523, + "flos": 25411504049280.0, + "grad_norm": 2.188677781343125, + "language_loss": 0.68262219, + "learning_rate": 1.2436826285930985e-06, + "loss": 0.70414692, + "num_input_tokens_seen": 227628175, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.13452148, + "step": 10557, + "time_per_iteration": 2.5211946964263916 + }, + { + "auxiliary_loss_clip": 0.01124308, + "auxiliary_loss_mlp": 0.01030743, + "balance_loss_clip": 1.0494622, + "balance_loss_mlp": 1.01893568, + "epoch": 0.6347813016684203, + "flos": 15742735966080.0, + "grad_norm": 1.6855115792661959, + "language_loss": 0.70049822, + "learning_rate": 1.2433221023709002e-06, + "loss": 0.7220487, + "num_input_tokens_seen": 227645330, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11804199, + "step": 10558, + "time_per_iteration": 2.4277069568634033 + }, + { + "auxiliary_loss_clip": 0.01117428, + "auxiliary_loss_mlp": 0.01032749, + "balance_loss_clip": 1.04362583, + "balance_loss_mlp": 1.02027977, + "epoch": 0.6348414249210882, + "flos": 21464777370240.0, + "grad_norm": 1.7435964732692182, + "language_loss": 0.78260213, + "learning_rate": 1.2429616048420031e-06, + "loss": 0.80410397, + "num_input_tokens_seen": 227665250, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.12481689, + "step": 10559, + "time_per_iteration": 3.8521523475646973 + }, + { + "auxiliary_loss_clip": 0.01124836, + "auxiliary_loss_mlp": 0.01033648, + "balance_loss_clip": 1.04822433, + "balance_loss_mlp": 1.021173, + "epoch": 0.6349015481737562, + "flos": 21653057485440.0, + "grad_norm": 2.090231289803619, + "language_loss": 0.68176615, + "learning_rate": 1.242601136020078e-06, + "loss": 0.70335102, + "num_input_tokens_seen": 227685070, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.12493896, + "step": 10560, + "time_per_iteration": 2.460761070251465 + }, + { + "auxiliary_loss_clip": 0.01115546, + "auxiliary_loss_mlp": 0.01033541, + "balance_loss_clip": 1.04264784, + "balance_loss_mlp": 1.02167344, + "epoch": 0.6349616714264241, + "flos": 22194984954240.0, + "grad_norm": 1.7388182741455263, + "language_loss": 0.76907754, + "learning_rate": 1.2422406959187939e-06, + "loss": 0.79056835, + "num_input_tokens_seen": 227704430, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11871338, + "step": 10561, + "time_per_iteration": 2.487413167953491 + }, + { + "auxiliary_loss_clip": 0.01113006, + "auxiliary_loss_mlp": 0.01027692, + "balance_loss_clip": 1.03943527, + "balance_loss_mlp": 1.01552129, + "epoch": 0.6350217946790921, + "flos": 25410354814080.0, + "grad_norm": 1.7946815898472992, + "language_loss": 0.7201224, + "learning_rate": 1.2418802845518178e-06, + "loss": 0.74152935, + "num_input_tokens_seen": 227724920, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.12164307, + "step": 10562, + "time_per_iteration": 2.545854091644287 + }, + { + "auxiliary_loss_clip": 0.01125206, + "auxiliary_loss_mlp": 0.01028817, + "balance_loss_clip": 1.04834533, + "balance_loss_mlp": 1.0157578, + "epoch": 0.63508191793176, + "flos": 19718944732800.0, + "grad_norm": 4.15547749691897, + "language_loss": 0.81422734, + "learning_rate": 1.2415199019328185e-06, + "loss": 0.83576751, + "num_input_tokens_seen": 227743400, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.13061523, + "step": 10563, + "time_per_iteration": 2.47373628616333 + }, + { + "auxiliary_loss_clip": 0.01118037, + "auxiliary_loss_mlp": 0.01033125, + "balance_loss_clip": 1.04309356, + "balance_loss_mlp": 1.02068579, + "epoch": 0.6351420411844281, + "flos": 18186923802240.0, + "grad_norm": 2.6341204195395163, + "language_loss": 0.81384265, + "learning_rate": 1.2411595480754597e-06, + "loss": 0.83535421, + "num_input_tokens_seen": 227759990, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.12438965, + "step": 10564, + "time_per_iteration": 2.477804660797119 + }, + { + "auxiliary_loss_clip": 0.01121439, + "auxiliary_loss_mlp": 0.01033119, + "balance_loss_clip": 1.04740524, + "balance_loss_mlp": 1.02151966, + "epoch": 0.6352021644370961, + "flos": 33726511422720.0, + "grad_norm": 1.7475756592513467, + "language_loss": 0.72310454, + "learning_rate": 1.240799222993407e-06, + "loss": 0.74465013, + "num_input_tokens_seen": 227780835, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.1161499, + "step": 10565, + "time_per_iteration": 2.5854315757751465 + }, + { + "auxiliary_loss_clip": 0.01117317, + "auxiliary_loss_mlp": 0.01031241, + "balance_loss_clip": 1.04181588, + "balance_loss_mlp": 1.01798487, + "epoch": 0.635262287689764, + "flos": 20374781207040.0, + "grad_norm": 2.0446265183143955, + "language_loss": 0.69593263, + "learning_rate": 1.240438926700324e-06, + "loss": 0.71741825, + "num_input_tokens_seen": 227798580, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.13269043, + "step": 10566, + "time_per_iteration": 2.4805305004119873 + }, + { + "auxiliary_loss_clip": 0.01120738, + "auxiliary_loss_mlp": 0.0103499, + "balance_loss_clip": 1.04970396, + "balance_loss_mlp": 1.02338505, + "epoch": 0.635322410942432, + "flos": 27525421307520.0, + "grad_norm": 1.5557172774720174, + "language_loss": 0.69555676, + "learning_rate": 1.2400786592098725e-06, + "loss": 0.71711397, + "num_input_tokens_seen": 227819210, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.11590576, + "step": 10567, + "time_per_iteration": 2.5615811347961426 + }, + { + "auxiliary_loss_clip": 0.01122561, + "auxiliary_loss_mlp": 0.01032019, + "balance_loss_clip": 1.05239701, + "balance_loss_mlp": 1.02017605, + "epoch": 0.6353825341950999, + "flos": 21543601766400.0, + "grad_norm": 1.775968257510377, + "language_loss": 0.84455061, + "learning_rate": 1.2397184205357154e-06, + "loss": 0.86609644, + "num_input_tokens_seen": 227838340, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.11853027, + "step": 10568, + "time_per_iteration": 2.503103733062744 + }, + { + "auxiliary_loss_clip": 0.01112926, + "auxiliary_loss_mlp": 0.01038383, + "balance_loss_clip": 1.03880084, + "balance_loss_mlp": 1.0254612, + "epoch": 0.635442657447768, + "flos": 31759756185600.0, + "grad_norm": 1.7585760271309447, + "language_loss": 0.84121829, + "learning_rate": 1.2393582106915113e-06, + "loss": 0.86273146, + "num_input_tokens_seen": 227859170, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.12927246, + "step": 10569, + "time_per_iteration": 2.5846734046936035 + }, + { + "auxiliary_loss_clip": 0.01122189, + "auxiliary_loss_mlp": 0.0102767, + "balance_loss_clip": 1.04843688, + "balance_loss_mlp": 1.01567733, + "epoch": 0.6355027807004359, + "flos": 19828831415040.0, + "grad_norm": 1.6871203510235602, + "language_loss": 0.69544089, + "learning_rate": 1.2389980296909198e-06, + "loss": 0.71693951, + "num_input_tokens_seen": 227878545, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11987305, + "step": 10570, + "time_per_iteration": 2.4441967010498047 + }, + { + "auxiliary_loss_clip": 0.01121894, + "auxiliary_loss_mlp": 0.0103323, + "balance_loss_clip": 1.04399621, + "balance_loss_mlp": 1.01986051, + "epoch": 0.6355629039531039, + "flos": 30372383324160.0, + "grad_norm": 1.8408843811259175, + "language_loss": 0.65947366, + "learning_rate": 1.2386378775476e-06, + "loss": 0.68102491, + "num_input_tokens_seen": 227898875, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.13348389, + "step": 10571, + "time_per_iteration": 3.9111924171447754 + }, + { + "auxiliary_loss_clip": 0.01119857, + "auxiliary_loss_mlp": 0.01030706, + "balance_loss_clip": 1.04310751, + "balance_loss_mlp": 1.01898789, + "epoch": 0.6356230272057718, + "flos": 17932065828480.0, + "grad_norm": 1.7290112026529716, + "language_loss": 0.71482158, + "learning_rate": 1.2382777542752074e-06, + "loss": 0.73632717, + "num_input_tokens_seen": 227917130, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.11730957, + "step": 10572, + "time_per_iteration": 2.477153778076172 + }, + { + "auxiliary_loss_clip": 0.01108306, + "auxiliary_loss_mlp": 0.01032178, + "balance_loss_clip": 1.03706884, + "balance_loss_mlp": 1.0211153, + "epoch": 0.6356831504584398, + "flos": 25375844822400.0, + "grad_norm": 1.4492853367279857, + "language_loss": 0.81024766, + "learning_rate": 1.2379176598873992e-06, + "loss": 0.83165252, + "num_input_tokens_seen": 227939550, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.11065674, + "step": 10573, + "time_per_iteration": 2.5640909671783447 + }, + { + "auxiliary_loss_clip": 0.01119571, + "auxiliary_loss_mlp": 0.01031864, + "balance_loss_clip": 1.04466808, + "balance_loss_mlp": 1.01917982, + "epoch": 0.6357432737111077, + "flos": 46500331720320.0, + "grad_norm": 1.608055174463633, + "language_loss": 0.69122565, + "learning_rate": 1.2375575943978303e-06, + "loss": 0.71273994, + "num_input_tokens_seen": 227962200, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.12683105, + "step": 10574, + "time_per_iteration": 2.7232632637023926 + }, + { + "auxiliary_loss_clip": 0.01116157, + "auxiliary_loss_mlp": 0.01034745, + "balance_loss_clip": 1.04352379, + "balance_loss_mlp": 1.02097654, + "epoch": 0.6358033969637757, + "flos": 17274361847040.0, + "grad_norm": 2.125395421400744, + "language_loss": 0.86676168, + "learning_rate": 1.2371975578201525e-06, + "loss": 0.88827074, + "num_input_tokens_seen": 227979270, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.13757324, + "step": 10575, + "time_per_iteration": 2.399561643600464 + }, + { + "auxiliary_loss_clip": 0.01114722, + "auxiliary_loss_mlp": 0.01030414, + "balance_loss_clip": 1.0422864, + "balance_loss_mlp": 1.01881528, + "epoch": 0.6358635202164437, + "flos": 27125520215040.0, + "grad_norm": 1.5581895409136075, + "language_loss": 0.72036111, + "learning_rate": 1.2368375501680204e-06, + "loss": 0.74181247, + "num_input_tokens_seen": 228000550, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11602783, + "step": 10576, + "time_per_iteration": 3.872065782546997 + }, + { + "auxiliary_loss_clip": 0.01122856, + "auxiliary_loss_mlp": 0.01035594, + "balance_loss_clip": 1.04530799, + "balance_loss_mlp": 1.02308881, + "epoch": 0.6359236434691117, + "flos": 27525205825920.0, + "grad_norm": 2.7731060394319935, + "language_loss": 0.69573641, + "learning_rate": 1.236477571455085e-06, + "loss": 0.71732092, + "num_input_tokens_seen": 228022005, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.12506104, + "step": 10577, + "time_per_iteration": 2.4792258739471436 + }, + { + "auxiliary_loss_clip": 0.01117726, + "auxiliary_loss_mlp": 0.01031737, + "balance_loss_clip": 1.04537344, + "balance_loss_mlp": 1.01997662, + "epoch": 0.6359837667217797, + "flos": 39348290989440.0, + "grad_norm": 1.6798643368867294, + "language_loss": 0.71958834, + "learning_rate": 1.2361176216949964e-06, + "loss": 0.74108291, + "num_input_tokens_seen": 228043770, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11761475, + "step": 10578, + "time_per_iteration": 2.633929967880249 + }, + { + "auxiliary_loss_clip": 0.01054025, + "auxiliary_loss_mlp": 0.01005254, + "balance_loss_clip": 1.02764654, + "balance_loss_mlp": 1.00379515, + "epoch": 0.6360438899744476, + "flos": 56413797206400.0, + "grad_norm": 0.7010906999501443, + "language_loss": 0.54490328, + "learning_rate": 1.2357577009014044e-06, + "loss": 0.56549609, + "num_input_tokens_seen": 228104985, + "router_z_loss_clip": 0.26416016, + "router_z_loss_mlp": 0.01460266, + "step": 10579, + "time_per_iteration": 3.158264636993408 + }, + { + "auxiliary_loss_clip": 0.01121131, + "auxiliary_loss_mlp": 0.01027449, + "balance_loss_clip": 1.04703271, + "balance_loss_mlp": 1.01589155, + "epoch": 0.6361040132271156, + "flos": 24973106555520.0, + "grad_norm": 1.7460700637454576, + "language_loss": 0.77457988, + "learning_rate": 1.2353978090879568e-06, + "loss": 0.79606569, + "num_input_tokens_seen": 228125620, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11572266, + "step": 10580, + "time_per_iteration": 2.4780449867248535 + }, + { + "auxiliary_loss_clip": 0.01112952, + "auxiliary_loss_mlp": 0.01027421, + "balance_loss_clip": 1.03998172, + "balance_loss_mlp": 1.01553619, + "epoch": 0.6361641364797835, + "flos": 23259198130560.0, + "grad_norm": 2.7724548522100267, + "language_loss": 0.66809332, + "learning_rate": 1.235037946268301e-06, + "loss": 0.68949711, + "num_input_tokens_seen": 228143495, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11889648, + "step": 10581, + "time_per_iteration": 3.8730216026306152 + }, + { + "auxiliary_loss_clip": 0.01116309, + "auxiliary_loss_mlp": 0.01033337, + "balance_loss_clip": 1.04168904, + "balance_loss_mlp": 1.0210768, + "epoch": 0.6362242597324516, + "flos": 25994513698560.0, + "grad_norm": 1.418964328805864, + "language_loss": 0.68022776, + "learning_rate": 1.2346781124560828e-06, + "loss": 0.70172417, + "num_input_tokens_seen": 228166500, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.12243652, + "step": 10582, + "time_per_iteration": 2.4771318435668945 + }, + { + "auxiliary_loss_clip": 0.0112003, + "auxiliary_loss_mlp": 0.01033442, + "balance_loss_clip": 1.04309118, + "balance_loss_mlp": 1.02151477, + "epoch": 0.6362843829851195, + "flos": 25703242312320.0, + "grad_norm": 1.7498358159313565, + "language_loss": 0.84458679, + "learning_rate": 1.2343183076649473e-06, + "loss": 0.86612153, + "num_input_tokens_seen": 228185325, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.11914062, + "step": 10583, + "time_per_iteration": 2.5010159015655518 + }, + { + "auxiliary_loss_clip": 0.01112362, + "auxiliary_loss_mlp": 0.01032067, + "balance_loss_clip": 1.04101253, + "balance_loss_mlp": 1.02002728, + "epoch": 0.6363445062377875, + "flos": 20522912895360.0, + "grad_norm": 1.5927155672412212, + "language_loss": 0.75147164, + "learning_rate": 1.233958531908538e-06, + "loss": 0.77291596, + "num_input_tokens_seen": 228204050, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.12042236, + "step": 10584, + "time_per_iteration": 2.4705710411071777 + }, + { + "auxiliary_loss_clip": 0.01124294, + "auxiliary_loss_mlp": 0.01038175, + "balance_loss_clip": 1.04655802, + "balance_loss_mlp": 1.0234164, + "epoch": 0.6364046294904554, + "flos": 19463799450240.0, + "grad_norm": 2.0528334505052594, + "language_loss": 0.73397148, + "learning_rate": 1.2335987852004985e-06, + "loss": 0.75559622, + "num_input_tokens_seen": 228222430, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.14770508, + "step": 10585, + "time_per_iteration": 2.452169179916382 + }, + { + "auxiliary_loss_clip": 0.01120127, + "auxiliary_loss_mlp": 0.0103054, + "balance_loss_clip": 1.04630303, + "balance_loss_mlp": 1.0193882, + "epoch": 0.6364647527431234, + "flos": 20995892208000.0, + "grad_norm": 1.9095095030389702, + "language_loss": 0.82738268, + "learning_rate": 1.2332390675544697e-06, + "loss": 0.84888935, + "num_input_tokens_seen": 228241925, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.1116333, + "step": 10586, + "time_per_iteration": 2.44156813621521 + }, + { + "auxiliary_loss_clip": 0.01111155, + "auxiliary_loss_mlp": 0.0102563, + "balance_loss_clip": 1.03886414, + "balance_loss_mlp": 1.01400161, + "epoch": 0.6365248759957913, + "flos": 25770789838080.0, + "grad_norm": 1.7511327869406355, + "language_loss": 0.72439158, + "learning_rate": 1.2328793789840918e-06, + "loss": 0.74575937, + "num_input_tokens_seen": 228262535, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11639404, + "step": 10587, + "time_per_iteration": 2.4963552951812744 + }, + { + "auxiliary_loss_clip": 0.01116258, + "auxiliary_loss_mlp": 0.01029095, + "balance_loss_clip": 1.04200232, + "balance_loss_mlp": 1.01694727, + "epoch": 0.6365849992484593, + "flos": 22455589104000.0, + "grad_norm": 2.001210849644646, + "language_loss": 0.77193034, + "learning_rate": 1.2325197195030058e-06, + "loss": 0.79338396, + "num_input_tokens_seen": 228281340, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.121521, + "step": 10588, + "time_per_iteration": 2.4629082679748535 + }, + { + "auxiliary_loss_clip": 0.01110373, + "auxiliary_loss_mlp": 0.01030694, + "balance_loss_clip": 1.04059219, + "balance_loss_mlp": 1.0177896, + "epoch": 0.6366451225011273, + "flos": 19025689265280.0, + "grad_norm": 1.5185736547823872, + "language_loss": 0.80229187, + "learning_rate": 1.2321600891248478e-06, + "loss": 0.82370251, + "num_input_tokens_seen": 228300865, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.12896729, + "step": 10589, + "time_per_iteration": 2.4685862064361572 + }, + { + "auxiliary_loss_clip": 0.01122857, + "auxiliary_loss_mlp": 0.01029505, + "balance_loss_clip": 1.05001855, + "balance_loss_mlp": 1.0172981, + "epoch": 0.6367052457537953, + "flos": 25228395492480.0, + "grad_norm": 2.4644120972733763, + "language_loss": 0.66879612, + "learning_rate": 1.231800487863257e-06, + "loss": 0.69031978, + "num_input_tokens_seen": 228320815, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.12200928, + "step": 10590, + "time_per_iteration": 2.482671022415161 + }, + { + "auxiliary_loss_clip": 0.01122373, + "auxiliary_loss_mlp": 0.01030297, + "balance_loss_clip": 1.04402471, + "balance_loss_mlp": 1.01786971, + "epoch": 0.6367653690064633, + "flos": 19208438686080.0, + "grad_norm": 1.6540760768954217, + "language_loss": 0.78949726, + "learning_rate": 1.2314409157318685e-06, + "loss": 0.81102395, + "num_input_tokens_seen": 228339065, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.12438965, + "step": 10591, + "time_per_iteration": 2.53838849067688 + }, + { + "auxiliary_loss_clip": 0.01110822, + "auxiliary_loss_mlp": 0.01027925, + "balance_loss_clip": 1.03867161, + "balance_loss_mlp": 1.01660049, + "epoch": 0.6368254922591312, + "flos": 23546806329600.0, + "grad_norm": 1.515685439890426, + "language_loss": 0.88852543, + "learning_rate": 1.231081372744317e-06, + "loss": 0.90991288, + "num_input_tokens_seen": 228359210, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11328125, + "step": 10592, + "time_per_iteration": 2.6430716514587402 + }, + { + "auxiliary_loss_clip": 0.01116847, + "auxiliary_loss_mlp": 0.01027459, + "balance_loss_clip": 1.04304028, + "balance_loss_mlp": 1.01647401, + "epoch": 0.6368856155117992, + "flos": 26467313443200.0, + "grad_norm": 1.4404006439767072, + "language_loss": 0.68148232, + "learning_rate": 1.2307218589142376e-06, + "loss": 0.70292538, + "num_input_tokens_seen": 228379630, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.10986328, + "step": 10593, + "time_per_iteration": 2.53653883934021 + }, + { + "auxiliary_loss_clip": 0.01113569, + "auxiliary_loss_mlp": 0.01027904, + "balance_loss_clip": 1.04146051, + "balance_loss_mlp": 1.01645374, + "epoch": 0.6369457387644671, + "flos": 33692432394240.0, + "grad_norm": 3.030311609731159, + "language_loss": 0.63688362, + "learning_rate": 1.2303623742552618e-06, + "loss": 0.65829831, + "num_input_tokens_seen": 228401410, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11456299, + "step": 10594, + "time_per_iteration": 2.580615758895874 + }, + { + "auxiliary_loss_clip": 0.01057268, + "auxiliary_loss_mlp": 0.01006838, + "balance_loss_clip": 1.03207207, + "balance_loss_mlp": 1.00559568, + "epoch": 0.6370058620171352, + "flos": 70908600908160.0, + "grad_norm": 0.7687809880612521, + "language_loss": 0.54653376, + "learning_rate": 1.230002918781022e-06, + "loss": 0.56717479, + "num_input_tokens_seen": 228470335, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.01243591, + "step": 10595, + "time_per_iteration": 3.241546630859375 + }, + { + "auxiliary_loss_clip": 0.0112603, + "auxiliary_loss_mlp": 0.01038856, + "balance_loss_clip": 1.04854417, + "balance_loss_mlp": 1.02577829, + "epoch": 0.6370659852698031, + "flos": 21141940907520.0, + "grad_norm": 2.1844831614283535, + "language_loss": 0.66862255, + "learning_rate": 1.2296434925051493e-06, + "loss": 0.69027138, + "num_input_tokens_seen": 228490765, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.13079834, + "step": 10596, + "time_per_iteration": 2.531519651412964 + }, + { + "auxiliary_loss_clip": 0.01122909, + "auxiliary_loss_mlp": 0.0102929, + "balance_loss_clip": 1.0490824, + "balance_loss_mlp": 1.01724958, + "epoch": 0.6371261085224711, + "flos": 20193288762240.0, + "grad_norm": 2.0891894551685297, + "language_loss": 0.78724498, + "learning_rate": 1.2292840954412718e-06, + "loss": 0.80876696, + "num_input_tokens_seen": 228509700, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.12036133, + "step": 10597, + "time_per_iteration": 2.5737764835357666 + }, + { + "auxiliary_loss_clip": 0.01126213, + "auxiliary_loss_mlp": 0.01030594, + "balance_loss_clip": 1.05057538, + "balance_loss_mlp": 1.01905453, + "epoch": 0.637186231775139, + "flos": 19683536901120.0, + "grad_norm": 3.672776770377418, + "language_loss": 0.74793601, + "learning_rate": 1.2289247276030189e-06, + "loss": 0.76950407, + "num_input_tokens_seen": 228529050, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.11541748, + "step": 10598, + "time_per_iteration": 2.4828343391418457 + }, + { + "auxiliary_loss_clip": 0.01121386, + "auxiliary_loss_mlp": 0.01028231, + "balance_loss_clip": 1.04643524, + "balance_loss_mlp": 1.01648319, + "epoch": 0.637246355027807, + "flos": 13071196995840.0, + "grad_norm": 1.9179364561007446, + "language_loss": 0.68615288, + "learning_rate": 1.2285653890040176e-06, + "loss": 0.70764899, + "num_input_tokens_seen": 228544665, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.11755371, + "step": 10599, + "time_per_iteration": 2.4445700645446777 + }, + { + "auxiliary_loss_clip": 0.01117442, + "auxiliary_loss_mlp": 0.01030207, + "balance_loss_clip": 1.04055536, + "balance_loss_mlp": 1.01701689, + "epoch": 0.6373064782804749, + "flos": 18222654856320.0, + "grad_norm": 2.032808647400525, + "language_loss": 0.8022995, + "learning_rate": 1.2282060796578942e-06, + "loss": 0.82377595, + "num_input_tokens_seen": 228562060, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.13183594, + "step": 10600, + "time_per_iteration": 2.4494869709014893 + }, + { + "auxiliary_loss_clip": 0.01112542, + "auxiliary_loss_mlp": 0.0103166, + "balance_loss_clip": 1.03964329, + "balance_loss_mlp": 1.02005529, + "epoch": 0.637366601533143, + "flos": 24498475217280.0, + "grad_norm": 1.4162450812481109, + "language_loss": 0.79926097, + "learning_rate": 1.2278467995782732e-06, + "loss": 0.82070291, + "num_input_tokens_seen": 228582550, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.1161499, + "step": 10601, + "time_per_iteration": 2.521913766860962 + }, + { + "auxiliary_loss_clip": 0.01124766, + "auxiliary_loss_mlp": 0.01027897, + "balance_loss_clip": 1.04902267, + "balance_loss_mlp": 1.01567817, + "epoch": 0.6374267247858109, + "flos": 26359042872960.0, + "grad_norm": 2.1772077969346255, + "language_loss": 0.67394543, + "learning_rate": 1.2274875487787797e-06, + "loss": 0.69547206, + "num_input_tokens_seen": 228604960, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.12225342, + "step": 10602, + "time_per_iteration": 3.8966033458709717 + }, + { + "auxiliary_loss_clip": 0.01114042, + "auxiliary_loss_mlp": 0.01029359, + "balance_loss_clip": 1.04027045, + "balance_loss_mlp": 1.0172894, + "epoch": 0.6374868480384789, + "flos": 20371728551040.0, + "grad_norm": 2.3358418773279546, + "language_loss": 0.79411197, + "learning_rate": 1.2271283272730354e-06, + "loss": 0.81554604, + "num_input_tokens_seen": 228622195, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.1206665, + "step": 10603, + "time_per_iteration": 2.4332685470581055 + }, + { + "auxiliary_loss_clip": 0.01125787, + "auxiliary_loss_mlp": 0.0102699, + "balance_loss_clip": 1.04992175, + "balance_loss_mlp": 1.01388884, + "epoch": 0.6375469712911469, + "flos": 20996251344000.0, + "grad_norm": 2.0606498604414845, + "language_loss": 0.77512139, + "learning_rate": 1.2267691350746621e-06, + "loss": 0.79664916, + "num_input_tokens_seen": 228639735, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.13104248, + "step": 10604, + "time_per_iteration": 2.4333744049072266 + }, + { + "auxiliary_loss_clip": 0.01128939, + "auxiliary_loss_mlp": 0.01028559, + "balance_loss_clip": 1.05152321, + "balance_loss_mlp": 1.01652479, + "epoch": 0.6376070945438148, + "flos": 19715748422400.0, + "grad_norm": 1.6430938627678597, + "language_loss": 0.77094901, + "learning_rate": 1.226409972197281e-06, + "loss": 0.79252398, + "num_input_tokens_seen": 228658195, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12036133, + "step": 10605, + "time_per_iteration": 2.4488086700439453 + }, + { + "auxiliary_loss_clip": 0.01118742, + "auxiliary_loss_mlp": 0.01025974, + "balance_loss_clip": 1.04449797, + "balance_loss_mlp": 1.01271236, + "epoch": 0.6376672177964828, + "flos": 21506757390720.0, + "grad_norm": 2.079465319229132, + "language_loss": 0.65595877, + "learning_rate": 1.2260508386545106e-06, + "loss": 0.67740595, + "num_input_tokens_seen": 228677415, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.13275146, + "step": 10606, + "time_per_iteration": 2.508517265319824 + }, + { + "auxiliary_loss_clip": 0.01122322, + "auxiliary_loss_mlp": 0.01029938, + "balance_loss_clip": 1.05000436, + "balance_loss_mlp": 1.01858354, + "epoch": 0.6377273410491507, + "flos": 18843873598080.0, + "grad_norm": 3.601380055838223, + "language_loss": 0.75392568, + "learning_rate": 1.225691734459971e-06, + "loss": 0.77544826, + "num_input_tokens_seen": 228696450, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11364746, + "step": 10607, + "time_per_iteration": 2.4453415870666504 + }, + { + "auxiliary_loss_clip": 0.01120765, + "auxiliary_loss_mlp": 0.01031544, + "balance_loss_clip": 1.04383612, + "balance_loss_mlp": 1.01925337, + "epoch": 0.6377874643018188, + "flos": 53062970181120.0, + "grad_norm": 2.0673306881893296, + "language_loss": 0.65927064, + "learning_rate": 1.225332659627278e-06, + "loss": 0.68079376, + "num_input_tokens_seen": 228721600, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.12298584, + "step": 10608, + "time_per_iteration": 2.7276201248168945 + }, + { + "auxiliary_loss_clip": 0.01053013, + "auxiliary_loss_mlp": 0.01003983, + "balance_loss_clip": 1.0272547, + "balance_loss_mlp": 1.00272644, + "epoch": 0.6378475875544867, + "flos": 65135026465920.0, + "grad_norm": 0.7293035498716222, + "language_loss": 0.51957941, + "learning_rate": 1.2249736141700475e-06, + "loss": 0.54014933, + "num_input_tokens_seen": 228784535, + "router_z_loss_clip": 0.25732422, + "router_z_loss_mlp": 0.01255798, + "step": 10609, + "time_per_iteration": 3.0479400157928467 + }, + { + "auxiliary_loss_clip": 0.01117618, + "auxiliary_loss_mlp": 0.01025234, + "balance_loss_clip": 1.04750228, + "balance_loss_mlp": 1.01476705, + "epoch": 0.6379077108071547, + "flos": 23002759958400.0, + "grad_norm": 1.605681788619768, + "language_loss": 0.75051141, + "learning_rate": 1.2246145981018965e-06, + "loss": 0.77193987, + "num_input_tokens_seen": 228804110, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.10467529, + "step": 10610, + "time_per_iteration": 2.463437557220459 + }, + { + "auxiliary_loss_clip": 0.01055522, + "auxiliary_loss_mlp": 0.01009381, + "balance_loss_clip": 1.02846241, + "balance_loss_mlp": 1.00804698, + "epoch": 0.6379678340598226, + "flos": 67601947610880.0, + "grad_norm": 0.8437370395229865, + "language_loss": 0.6313802, + "learning_rate": 1.2242556114364364e-06, + "loss": 0.65202928, + "num_input_tokens_seen": 228867705, + "router_z_loss_clip": 0.27099609, + "router_z_loss_mlp": 0.01333618, + "step": 10611, + "time_per_iteration": 3.1335909366607666 + }, + { + "auxiliary_loss_clip": 0.01123109, + "auxiliary_loss_mlp": 0.010293, + "balance_loss_clip": 1.04855061, + "balance_loss_mlp": 1.01746249, + "epoch": 0.6380279573124906, + "flos": 29680061610240.0, + "grad_norm": 2.1067523425573693, + "language_loss": 0.72065544, + "learning_rate": 1.223896654187282e-06, + "loss": 0.74217957, + "num_input_tokens_seen": 228889215, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11834717, + "step": 10612, + "time_per_iteration": 2.5227489471435547 + }, + { + "auxiliary_loss_clip": 0.01067827, + "auxiliary_loss_mlp": 0.01004373, + "balance_loss_clip": 1.04330254, + "balance_loss_mlp": 1.00324607, + "epoch": 0.6380880805651585, + "flos": 66484046580480.0, + "grad_norm": 0.7267492417186192, + "language_loss": 0.57912469, + "learning_rate": 1.2235377263680446e-06, + "loss": 0.59984672, + "num_input_tokens_seen": 228948465, + "router_z_loss_clip": 0.24536133, + "router_z_loss_mlp": 0.01126099, + "step": 10613, + "time_per_iteration": 2.9867019653320312 + }, + { + "auxiliary_loss_clip": 0.01131061, + "auxiliary_loss_mlp": 0.01026343, + "balance_loss_clip": 1.05550492, + "balance_loss_mlp": 1.01347482, + "epoch": 0.6381482038178266, + "flos": 23914998691200.0, + "grad_norm": 1.9330192869399379, + "language_loss": 0.74869555, + "learning_rate": 1.2231788279923334e-06, + "loss": 0.77026963, + "num_input_tokens_seen": 228967955, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12872314, + "step": 10614, + "time_per_iteration": 3.910470485687256 + }, + { + "auxiliary_loss_clip": 0.01124709, + "auxiliary_loss_mlp": 0.01030344, + "balance_loss_clip": 1.04936779, + "balance_loss_mlp": 1.01848269, + "epoch": 0.6382083270704945, + "flos": 24243042625920.0, + "grad_norm": 2.151234068515744, + "language_loss": 0.79470742, + "learning_rate": 1.2228199590737599e-06, + "loss": 0.81625795, + "num_input_tokens_seen": 228985495, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.11859131, + "step": 10615, + "time_per_iteration": 2.481318712234497 + }, + { + "auxiliary_loss_clip": 0.01052389, + "auxiliary_loss_mlp": 0.01002911, + "balance_loss_clip": 1.02663732, + "balance_loss_mlp": 1.00167418, + "epoch": 0.6382684503231625, + "flos": 70775552931840.0, + "grad_norm": 0.6598310665660757, + "language_loss": 0.55647111, + "learning_rate": 1.2224611196259305e-06, + "loss": 0.57702416, + "num_input_tokens_seen": 229052995, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.01235962, + "step": 10616, + "time_per_iteration": 3.149543523788452 + }, + { + "auxiliary_loss_clip": 0.01125902, + "auxiliary_loss_mlp": 0.01033876, + "balance_loss_clip": 1.04980886, + "balance_loss_mlp": 1.02160311, + "epoch": 0.6383285735758305, + "flos": 16544836621440.0, + "grad_norm": 1.8480682645767468, + "language_loss": 0.83966684, + "learning_rate": 1.2221023096624538e-06, + "loss": 0.86126459, + "num_input_tokens_seen": 229071030, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.1227417, + "step": 10617, + "time_per_iteration": 2.4392502307891846 + }, + { + "auxiliary_loss_clip": 0.0112134, + "auxiliary_loss_mlp": 0.01033967, + "balance_loss_clip": 1.04581332, + "balance_loss_mlp": 1.02096701, + "epoch": 0.6383886968284984, + "flos": 14427651225600.0, + "grad_norm": 2.4296437845100876, + "language_loss": 0.87072092, + "learning_rate": 1.221743529196936e-06, + "loss": 0.89227402, + "num_input_tokens_seen": 229088275, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12994385, + "step": 10618, + "time_per_iteration": 2.5068321228027344 + }, + { + "auxiliary_loss_clip": 0.0112493, + "auxiliary_loss_mlp": 0.01032659, + "balance_loss_clip": 1.05020499, + "balance_loss_mlp": 1.02148914, + "epoch": 0.6384488200811664, + "flos": 17929659617280.0, + "grad_norm": 1.626202723995617, + "language_loss": 0.7324965, + "learning_rate": 1.2213847782429806e-06, + "loss": 0.75407243, + "num_input_tokens_seen": 229105190, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.11169434, + "step": 10619, + "time_per_iteration": 2.4597842693328857 + }, + { + "auxiliary_loss_clip": 0.01125946, + "auxiliary_loss_mlp": 0.0103563, + "balance_loss_clip": 1.04559493, + "balance_loss_mlp": 1.02186143, + "epoch": 0.6385089433338343, + "flos": 18515578268160.0, + "grad_norm": 2.0841459250495062, + "language_loss": 0.76110542, + "learning_rate": 1.221026056814193e-06, + "loss": 0.78272116, + "num_input_tokens_seen": 229122290, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.13751221, + "step": 10620, + "time_per_iteration": 3.817079544067383 + }, + { + "auxiliary_loss_clip": 0.01127606, + "auxiliary_loss_mlp": 0.01027722, + "balance_loss_clip": 1.05275536, + "balance_loss_mlp": 1.01582468, + "epoch": 0.6385690665865024, + "flos": 24753620499840.0, + "grad_norm": 9.263551262408047, + "language_loss": 0.70510274, + "learning_rate": 1.2206673649241752e-06, + "loss": 0.72665602, + "num_input_tokens_seen": 229141620, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.11907959, + "step": 10621, + "time_per_iteration": 2.4864156246185303 + }, + { + "auxiliary_loss_clip": 0.01117747, + "auxiliary_loss_mlp": 0.01029703, + "balance_loss_clip": 1.04432499, + "balance_loss_mlp": 1.01856279, + "epoch": 0.6386291898391703, + "flos": 20120569678080.0, + "grad_norm": 1.5502205183204796, + "language_loss": 0.77926022, + "learning_rate": 1.220308702586529e-06, + "loss": 0.80073476, + "num_input_tokens_seen": 229161570, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.1114502, + "step": 10622, + "time_per_iteration": 2.4602811336517334 + }, + { + "auxiliary_loss_clip": 0.01119265, + "auxiliary_loss_mlp": 0.01028504, + "balance_loss_clip": 1.04667664, + "balance_loss_mlp": 1.01700044, + "epoch": 0.6386893130918383, + "flos": 16867278034560.0, + "grad_norm": 1.7166284708979649, + "language_loss": 0.74867809, + "learning_rate": 1.2199500698148546e-06, + "loss": 0.77015579, + "num_input_tokens_seen": 229178465, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.1151123, + "step": 10623, + "time_per_iteration": 2.457646369934082 + }, + { + "auxiliary_loss_clip": 0.01112397, + "auxiliary_loss_mlp": 0.01027576, + "balance_loss_clip": 1.04195464, + "balance_loss_mlp": 1.01689434, + "epoch": 0.6387494363445062, + "flos": 22966274718720.0, + "grad_norm": 1.6599604163478328, + "language_loss": 0.76646239, + "learning_rate": 1.2195914666227527e-06, + "loss": 0.78786206, + "num_input_tokens_seen": 229198975, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.10681152, + "step": 10624, + "time_per_iteration": 2.4872870445251465 + }, + { + "auxiliary_loss_clip": 0.01119361, + "auxiliary_loss_mlp": 0.01028802, + "balance_loss_clip": 1.04644966, + "balance_loss_mlp": 1.0170356, + "epoch": 0.6388095595971742, + "flos": 22857716839680.0, + "grad_norm": 1.8277992786481598, + "language_loss": 0.80709338, + "learning_rate": 1.21923289302382e-06, + "loss": 0.82857502, + "num_input_tokens_seen": 229218825, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11767578, + "step": 10625, + "time_per_iteration": 3.8841660022735596 + }, + { + "auxiliary_loss_clip": 0.01120884, + "auxiliary_loss_mlp": 0.01032202, + "balance_loss_clip": 1.04636657, + "balance_loss_mlp": 1.01970875, + "epoch": 0.6388696828498421, + "flos": 17311529445120.0, + "grad_norm": 1.7581533309877617, + "language_loss": 0.72713298, + "learning_rate": 1.218874349031654e-06, + "loss": 0.74866378, + "num_input_tokens_seen": 229236060, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.12487793, + "step": 10626, + "time_per_iteration": 2.4335951805114746 + }, + { + "auxiliary_loss_clip": 0.01121548, + "auxiliary_loss_mlp": 0.01030153, + "balance_loss_clip": 1.0481559, + "balance_loss_mlp": 1.01748085, + "epoch": 0.6389298061025102, + "flos": 17128636369920.0, + "grad_norm": 2.2842066626318545, + "language_loss": 0.72416508, + "learning_rate": 1.2185158346598517e-06, + "loss": 0.74568212, + "num_input_tokens_seen": 229255160, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.12670898, + "step": 10627, + "time_per_iteration": 2.4448912143707275 + }, + { + "auxiliary_loss_clip": 0.01123098, + "auxiliary_loss_mlp": 0.01037071, + "balance_loss_clip": 1.04298949, + "balance_loss_mlp": 1.02277732, + "epoch": 0.6389899293551781, + "flos": 27710971989120.0, + "grad_norm": 2.5525228844882455, + "language_loss": 0.66851115, + "learning_rate": 1.2181573499220064e-06, + "loss": 0.69011283, + "num_input_tokens_seen": 229278705, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.14300537, + "step": 10628, + "time_per_iteration": 2.5720279216766357 + }, + { + "auxiliary_loss_clip": 0.01122174, + "auxiliary_loss_mlp": 0.01023528, + "balance_loss_clip": 1.05042052, + "balance_loss_mlp": 1.01271534, + "epoch": 0.6390500526078461, + "flos": 21215701486080.0, + "grad_norm": 1.811626644793972, + "language_loss": 0.67822587, + "learning_rate": 1.2177988948317135e-06, + "loss": 0.69968289, + "num_input_tokens_seen": 229299990, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.10809326, + "step": 10629, + "time_per_iteration": 2.5546069145202637 + }, + { + "auxiliary_loss_clip": 0.01128537, + "auxiliary_loss_mlp": 0.01036731, + "balance_loss_clip": 1.04864597, + "balance_loss_mlp": 1.02299821, + "epoch": 0.6391101758605141, + "flos": 21581056673280.0, + "grad_norm": 1.6010312629500851, + "language_loss": 0.75632584, + "learning_rate": 1.2174404694025646e-06, + "loss": 0.77797854, + "num_input_tokens_seen": 229319230, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.13720703, + "step": 10630, + "time_per_iteration": 2.507040500640869 + }, + { + "auxiliary_loss_clip": 0.01117193, + "auxiliary_loss_mlp": 0.01034219, + "balance_loss_clip": 1.04217637, + "balance_loss_mlp": 1.02291799, + "epoch": 0.639170299113182, + "flos": 19900473091200.0, + "grad_norm": 1.8279888384960645, + "language_loss": 0.70592648, + "learning_rate": 1.2170820736481511e-06, + "loss": 0.7274406, + "num_input_tokens_seen": 229338600, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11309814, + "step": 10631, + "time_per_iteration": 2.5296454429626465 + }, + { + "auxiliary_loss_clip": 0.0105499, + "auxiliary_loss_mlp": 0.010059, + "balance_loss_clip": 1.02973497, + "balance_loss_mlp": 1.00459898, + "epoch": 0.63923042236585, + "flos": 69877604833920.0, + "grad_norm": 0.7651479827042502, + "language_loss": 0.62966728, + "learning_rate": 1.2167237075820646e-06, + "loss": 0.65027618, + "num_input_tokens_seen": 229402420, + "router_z_loss_clip": 0.25244141, + "router_z_loss_mlp": 0.01301575, + "step": 10632, + "time_per_iteration": 3.148926019668579 + }, + { + "auxiliary_loss_clip": 0.0112503, + "auxiliary_loss_mlp": 0.01035639, + "balance_loss_clip": 1.0532676, + "balance_loss_mlp": 1.02368283, + "epoch": 0.639290545618518, + "flos": 22674823764480.0, + "grad_norm": 2.1128310606344267, + "language_loss": 0.66719085, + "learning_rate": 1.216365371217893e-06, + "loss": 0.68879759, + "num_input_tokens_seen": 229419185, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11956787, + "step": 10633, + "time_per_iteration": 2.526094675064087 + }, + { + "auxiliary_loss_clip": 0.01124165, + "auxiliary_loss_mlp": 0.0102858, + "balance_loss_clip": 1.05006051, + "balance_loss_mlp": 1.01723111, + "epoch": 0.639350668871186, + "flos": 19829190551040.0, + "grad_norm": 2.263686979150765, + "language_loss": 0.81934363, + "learning_rate": 1.216007064569225e-06, + "loss": 0.84087104, + "num_input_tokens_seen": 229436735, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11334229, + "step": 10634, + "time_per_iteration": 2.4572947025299072 + }, + { + "auxiliary_loss_clip": 0.01117246, + "auxiliary_loss_mlp": 0.01035058, + "balance_loss_clip": 1.04491091, + "balance_loss_mlp": 1.02237463, + "epoch": 0.6394107921238539, + "flos": 20553328736640.0, + "grad_norm": 1.5952356451938152, + "language_loss": 0.75114083, + "learning_rate": 1.2156487876496483e-06, + "loss": 0.77266383, + "num_input_tokens_seen": 229455595, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.12683105, + "step": 10635, + "time_per_iteration": 2.5350680351257324 + }, + { + "auxiliary_loss_clip": 0.0112331, + "auxiliary_loss_mlp": 0.01027565, + "balance_loss_clip": 1.04833782, + "balance_loss_mlp": 1.01562583, + "epoch": 0.6394709153765219, + "flos": 25774991729280.0, + "grad_norm": 1.6501965500486642, + "language_loss": 0.71413934, + "learning_rate": 1.2152905404727475e-06, + "loss": 0.73564816, + "num_input_tokens_seen": 229476230, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.1194458, + "step": 10636, + "time_per_iteration": 2.532768726348877 + }, + { + "auxiliary_loss_clip": 0.01116051, + "auxiliary_loss_mlp": 0.01033388, + "balance_loss_clip": 1.04048467, + "balance_loss_mlp": 1.02084064, + "epoch": 0.6395310386291898, + "flos": 17530153574400.0, + "grad_norm": 1.8836300658854659, + "language_loss": 0.73950028, + "learning_rate": 1.2149323230521085e-06, + "loss": 0.76099467, + "num_input_tokens_seen": 229494300, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12536621, + "step": 10637, + "time_per_iteration": 2.5398190021514893 + }, + { + "auxiliary_loss_clip": 0.01114953, + "auxiliary_loss_mlp": 0.01032982, + "balance_loss_clip": 1.03936481, + "balance_loss_mlp": 1.02024484, + "epoch": 0.6395911618818578, + "flos": 18588225525120.0, + "grad_norm": 2.467752607479473, + "language_loss": 0.78274703, + "learning_rate": 1.2145741354013143e-06, + "loss": 0.80422628, + "num_input_tokens_seen": 229512985, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.12744141, + "step": 10638, + "time_per_iteration": 2.4498586654663086 + }, + { + "auxiliary_loss_clip": 0.01122585, + "auxiliary_loss_mlp": 0.01027658, + "balance_loss_clip": 1.04774332, + "balance_loss_mlp": 1.01533818, + "epoch": 0.6396512851345257, + "flos": 28366557068160.0, + "grad_norm": 3.8003803664919795, + "language_loss": 0.82097399, + "learning_rate": 1.2142159775339478e-06, + "loss": 0.84247643, + "num_input_tokens_seen": 229534270, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.12316895, + "step": 10639, + "time_per_iteration": 2.5335733890533447 + }, + { + "auxiliary_loss_clip": 0.01044317, + "auxiliary_loss_mlp": 0.01003505, + "balance_loss_clip": 1.0192647, + "balance_loss_mlp": 1.00226188, + "epoch": 0.6397114083871938, + "flos": 70724307202560.0, + "grad_norm": 0.8198966619027849, + "language_loss": 0.5906049, + "learning_rate": 1.21385784946359e-06, + "loss": 0.61108315, + "num_input_tokens_seen": 229596455, + "router_z_loss_clip": 0.25073242, + "router_z_loss_mlp": 0.01243591, + "step": 10640, + "time_per_iteration": 3.0506536960601807 + }, + { + "auxiliary_loss_clip": 0.01106562, + "auxiliary_loss_mlp": 0.01028123, + "balance_loss_clip": 1.03661609, + "balance_loss_mlp": 1.01613677, + "epoch": 0.6397715316398617, + "flos": 18142537570560.0, + "grad_norm": 1.7986103947521022, + "language_loss": 0.78251868, + "learning_rate": 1.2134997512038215e-06, + "loss": 0.80386555, + "num_input_tokens_seen": 229612860, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.11981201, + "step": 10641, + "time_per_iteration": 2.5200564861297607 + }, + { + "auxiliary_loss_clip": 0.01123963, + "auxiliary_loss_mlp": 0.01033851, + "balance_loss_clip": 1.04283071, + "balance_loss_mlp": 1.02136421, + "epoch": 0.6398316548925297, + "flos": 25739512070400.0, + "grad_norm": 1.842861829461779, + "language_loss": 0.63281918, + "learning_rate": 1.2131416827682209e-06, + "loss": 0.65439737, + "num_input_tokens_seen": 229633960, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.12493896, + "step": 10642, + "time_per_iteration": 2.486205816268921 + }, + { + "auxiliary_loss_clip": 0.01049578, + "auxiliary_loss_mlp": 0.01003658, + "balance_loss_clip": 1.02389944, + "balance_loss_mlp": 1.00239444, + "epoch": 0.6398917781451977, + "flos": 71214234756480.0, + "grad_norm": 0.9336580189276573, + "language_loss": 0.55958062, + "learning_rate": 1.2127836441703667e-06, + "loss": 0.58011293, + "num_input_tokens_seen": 229686730, + "router_z_loss_clip": 0.25732422, + "router_z_loss_mlp": 0.01263428, + "step": 10643, + "time_per_iteration": 3.0837831497192383 + }, + { + "auxiliary_loss_clip": 0.01123409, + "auxiliary_loss_mlp": 0.01027841, + "balance_loss_clip": 1.04569578, + "balance_loss_mlp": 1.01509202, + "epoch": 0.6399519013978656, + "flos": 20521835487360.0, + "grad_norm": 2.0246571225996366, + "language_loss": 0.76896656, + "learning_rate": 1.2124256354238358e-06, + "loss": 0.79047906, + "num_input_tokens_seen": 229704800, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.12750244, + "step": 10644, + "time_per_iteration": 3.912437915802002 + }, + { + "auxiliary_loss_clip": 0.011164, + "auxiliary_loss_mlp": 0.01035871, + "balance_loss_clip": 1.04335189, + "balance_loss_mlp": 1.02089846, + "epoch": 0.6400120246505336, + "flos": 24460840742400.0, + "grad_norm": 1.470114941229001, + "language_loss": 0.82888049, + "learning_rate": 1.212067656542203e-06, + "loss": 0.85040319, + "num_input_tokens_seen": 229725265, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.14971924, + "step": 10645, + "time_per_iteration": 2.4992892742156982 + }, + { + "auxiliary_loss_clip": 0.01124762, + "auxiliary_loss_mlp": 0.01033369, + "balance_loss_clip": 1.04637694, + "balance_loss_mlp": 1.01989245, + "epoch": 0.6400721479032015, + "flos": 28366090191360.0, + "grad_norm": 1.8023175632264858, + "language_loss": 0.7362532, + "learning_rate": 1.2117097075390447e-06, + "loss": 0.75783455, + "num_input_tokens_seen": 229744840, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.13476562, + "step": 10646, + "time_per_iteration": 2.536018133163452 + }, + { + "auxiliary_loss_clip": 0.01122204, + "auxiliary_loss_mlp": 0.01029496, + "balance_loss_clip": 1.04564035, + "balance_loss_mlp": 1.01684785, + "epoch": 0.6401322711558696, + "flos": 17816540711040.0, + "grad_norm": 2.2307616957888534, + "language_loss": 0.80093145, + "learning_rate": 1.2113517884279327e-06, + "loss": 0.82244843, + "num_input_tokens_seen": 229759095, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.12658691, + "step": 10647, + "time_per_iteration": 2.4552085399627686 + }, + { + "auxiliary_loss_clip": 0.01119197, + "auxiliary_loss_mlp": 0.01030896, + "balance_loss_clip": 1.04533172, + "balance_loss_mlp": 1.01840901, + "epoch": 0.6401923944085375, + "flos": 26030855283840.0, + "grad_norm": 1.86440383479713, + "language_loss": 0.75739324, + "learning_rate": 1.2109938992224399e-06, + "loss": 0.77889419, + "num_input_tokens_seen": 229777750, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.12487793, + "step": 10648, + "time_per_iteration": 2.5058677196502686 + }, + { + "auxiliary_loss_clip": 0.0112, + "auxiliary_loss_mlp": 0.01026701, + "balance_loss_clip": 1.04428816, + "balance_loss_mlp": 1.01500058, + "epoch": 0.6402525176612055, + "flos": 23586451966080.0, + "grad_norm": 2.0671241403182394, + "language_loss": 0.79047132, + "learning_rate": 1.210636039936138e-06, + "loss": 0.81193829, + "num_input_tokens_seen": 229796785, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.11700439, + "step": 10649, + "time_per_iteration": 2.484053134918213 + }, + { + "auxiliary_loss_clip": 0.011241, + "auxiliary_loss_mlp": 0.01034978, + "balance_loss_clip": 1.04955673, + "balance_loss_mlp": 1.02156711, + "epoch": 0.6403126409138734, + "flos": 18041413806720.0, + "grad_norm": 1.6499696259638448, + "language_loss": 0.75602067, + "learning_rate": 1.2102782105825956e-06, + "loss": 0.77761143, + "num_input_tokens_seen": 229815425, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.13421631, + "step": 10650, + "time_per_iteration": 2.4492547512054443 + }, + { + "auxiliary_loss_clip": 0.01127228, + "auxiliary_loss_mlp": 0.01033334, + "balance_loss_clip": 1.0520432, + "balance_loss_mlp": 1.0203402, + "epoch": 0.6403727641665414, + "flos": 21979485308160.0, + "grad_norm": 1.5808504861460762, + "language_loss": 0.70283818, + "learning_rate": 1.2099204111753833e-06, + "loss": 0.72444379, + "num_input_tokens_seen": 229834545, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.13000488, + "step": 10651, + "time_per_iteration": 2.4747488498687744 + }, + { + "auxiliary_loss_clip": 0.01119537, + "auxiliary_loss_mlp": 0.01035314, + "balance_loss_clip": 1.04601836, + "balance_loss_mlp": 1.02228463, + "epoch": 0.6404328874192093, + "flos": 24895539135360.0, + "grad_norm": 2.1891074399494506, + "language_loss": 0.63658094, + "learning_rate": 1.2095626417280684e-06, + "loss": 0.65812957, + "num_input_tokens_seen": 229849175, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.13024902, + "step": 10652, + "time_per_iteration": 2.4881789684295654 + }, + { + "auxiliary_loss_clip": 0.01124219, + "auxiliary_loss_mlp": 0.01026084, + "balance_loss_clip": 1.04964519, + "balance_loss_mlp": 1.01403213, + "epoch": 0.6404930106718774, + "flos": 17597198309760.0, + "grad_norm": 1.9431689572470645, + "language_loss": 0.79272491, + "learning_rate": 1.2092049022542168e-06, + "loss": 0.81422794, + "num_input_tokens_seen": 229865400, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.1206665, + "step": 10653, + "time_per_iteration": 2.424808979034424 + }, + { + "auxiliary_loss_clip": 0.01131719, + "auxiliary_loss_mlp": 0.01046197, + "balance_loss_clip": 1.04899001, + "balance_loss_mlp": 1.03175437, + "epoch": 0.6405531339245453, + "flos": 20157880930560.0, + "grad_norm": 3.5194079217357612, + "language_loss": 0.70645499, + "learning_rate": 1.2088471927673952e-06, + "loss": 0.72823417, + "num_input_tokens_seen": 229882945, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.14434814, + "step": 10654, + "time_per_iteration": 2.4587697982788086 + }, + { + "auxiliary_loss_clip": 0.01129875, + "auxiliary_loss_mlp": 0.01047901, + "balance_loss_clip": 1.046767, + "balance_loss_mlp": 1.03322065, + "epoch": 0.6406132571772133, + "flos": 21942281796480.0, + "grad_norm": 2.3716483486377706, + "language_loss": 0.72680008, + "learning_rate": 1.2084895132811666e-06, + "loss": 0.74857789, + "num_input_tokens_seen": 229901590, + "router_z_loss_clip": 0.83105469, + "router_z_loss_mlp": 0.14685059, + "step": 10655, + "time_per_iteration": 2.493201971054077 + }, + { + "auxiliary_loss_clip": 0.01124189, + "auxiliary_loss_mlp": 0.0103367, + "balance_loss_clip": 1.04717565, + "balance_loss_mlp": 1.02095079, + "epoch": 0.6406733804298813, + "flos": 28768002445440.0, + "grad_norm": 2.2100667220616823, + "language_loss": 0.82651985, + "learning_rate": 1.2081318638090952e-06, + "loss": 0.84809846, + "num_input_tokens_seen": 229922535, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.1272583, + "step": 10656, + "time_per_iteration": 2.517559289932251 + }, + { + "auxiliary_loss_clip": 0.01128358, + "auxiliary_loss_mlp": 0.01031629, + "balance_loss_clip": 1.05144882, + "balance_loss_mlp": 1.02023816, + "epoch": 0.6407335036825492, + "flos": 17457183095040.0, + "grad_norm": 2.245671043984589, + "language_loss": 0.72620201, + "learning_rate": 1.2077742443647433e-06, + "loss": 0.7478019, + "num_input_tokens_seen": 229939575, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.1138916, + "step": 10657, + "time_per_iteration": 2.466269016265869 + }, + { + "auxiliary_loss_clip": 0.01126099, + "auxiliary_loss_mlp": 0.01041224, + "balance_loss_clip": 1.05155778, + "balance_loss_mlp": 1.02870107, + "epoch": 0.6407936269352172, + "flos": 22125282612480.0, + "grad_norm": 2.1756853383935244, + "language_loss": 0.77320206, + "learning_rate": 1.2074166549616707e-06, + "loss": 0.79487526, + "num_input_tokens_seen": 229958840, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.12512207, + "step": 10658, + "time_per_iteration": 3.9518678188323975 + }, + { + "auxiliary_loss_clip": 0.01124125, + "auxiliary_loss_mlp": 0.01036837, + "balance_loss_clip": 1.04690897, + "balance_loss_mlp": 1.02398634, + "epoch": 0.6408537501878852, + "flos": 23110635479040.0, + "grad_norm": 2.226072170422382, + "language_loss": 0.76379764, + "learning_rate": 1.2070590956134386e-06, + "loss": 0.78540719, + "num_input_tokens_seen": 229979680, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.128479, + "step": 10659, + "time_per_iteration": 2.5260934829711914 + }, + { + "auxiliary_loss_clip": 0.01118587, + "auxiliary_loss_mlp": 0.01030901, + "balance_loss_clip": 1.04405379, + "balance_loss_mlp": 1.0184201, + "epoch": 0.6409138734405532, + "flos": 16472440759680.0, + "grad_norm": 1.9828541681641663, + "language_loss": 0.7825312, + "learning_rate": 1.2067015663336046e-06, + "loss": 0.80402613, + "num_input_tokens_seen": 229996830, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.12487793, + "step": 10660, + "time_per_iteration": 2.451019287109375 + }, + { + "auxiliary_loss_clip": 0.01135314, + "auxiliary_loss_mlp": 0.01038611, + "balance_loss_clip": 1.05040956, + "balance_loss_mlp": 1.02442551, + "epoch": 0.6409739966932211, + "flos": 22777922776320.0, + "grad_norm": 2.2613350586014116, + "language_loss": 0.68233013, + "learning_rate": 1.206344067135727e-06, + "loss": 0.70406938, + "num_input_tokens_seen": 230015115, + "router_z_loss_clip": 0.84912109, + "router_z_loss_mlp": 0.14190674, + "step": 10661, + "time_per_iteration": 2.482469081878662 + }, + { + "auxiliary_loss_clip": 0.01118416, + "auxiliary_loss_mlp": 0.01035815, + "balance_loss_clip": 1.04705644, + "balance_loss_mlp": 1.02451396, + "epoch": 0.6410341199458891, + "flos": 25152049134720.0, + "grad_norm": 1.8806096913870272, + "language_loss": 0.75812781, + "learning_rate": 1.205986598033362e-06, + "loss": 0.77967012, + "num_input_tokens_seen": 230035515, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.11303711, + "step": 10662, + "time_per_iteration": 2.4894936084747314 + }, + { + "auxiliary_loss_clip": 0.01125197, + "auxiliary_loss_mlp": 0.0102615, + "balance_loss_clip": 1.05061209, + "balance_loss_mlp": 1.01397264, + "epoch": 0.641094243198557, + "flos": 27046193028480.0, + "grad_norm": 2.6654710417274763, + "language_loss": 0.69621754, + "learning_rate": 1.2056291590400644e-06, + "loss": 0.717731, + "num_input_tokens_seen": 230054355, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.12182617, + "step": 10663, + "time_per_iteration": 3.89359974861145 + }, + { + "auxiliary_loss_clip": 0.01120884, + "auxiliary_loss_mlp": 0.01043991, + "balance_loss_clip": 1.04424167, + "balance_loss_mlp": 1.029567, + "epoch": 0.641154366451225, + "flos": 25374551932800.0, + "grad_norm": 1.967707731593236, + "language_loss": 0.68521559, + "learning_rate": 1.205271750169389e-06, + "loss": 0.70686436, + "num_input_tokens_seen": 230074605, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.14422607, + "step": 10664, + "time_per_iteration": 2.470210552215576 + }, + { + "auxiliary_loss_clip": 0.01112799, + "auxiliary_loss_mlp": 0.01034003, + "balance_loss_clip": 1.03969884, + "balance_loss_mlp": 1.02136111, + "epoch": 0.6412144897038929, + "flos": 25153342024320.0, + "grad_norm": 1.7687123593290208, + "language_loss": 0.66062129, + "learning_rate": 1.2049143714348881e-06, + "loss": 0.68208927, + "num_input_tokens_seen": 230093820, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.12646484, + "step": 10665, + "time_per_iteration": 2.488369941711426 + }, + { + "auxiliary_loss_clip": 0.0112171, + "auxiliary_loss_mlp": 0.01031111, + "balance_loss_clip": 1.04864562, + "balance_loss_mlp": 1.01889229, + "epoch": 0.641274612956561, + "flos": 23440762402560.0, + "grad_norm": 1.7776972882987045, + "language_loss": 0.64405143, + "learning_rate": 1.2045570228501145e-06, + "loss": 0.66557962, + "num_input_tokens_seen": 230114285, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.12231445, + "step": 10666, + "time_per_iteration": 2.4957900047302246 + }, + { + "auxiliary_loss_clip": 0.01120798, + "auxiliary_loss_mlp": 0.01035162, + "balance_loss_clip": 1.04500508, + "balance_loss_mlp": 1.02305579, + "epoch": 0.6413347362092289, + "flos": 19427493778560.0, + "grad_norm": 1.529007505989575, + "language_loss": 0.70865875, + "learning_rate": 1.2041997044286176e-06, + "loss": 0.73021841, + "num_input_tokens_seen": 230132760, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.12097168, + "step": 10667, + "time_per_iteration": 2.4353010654449463 + }, + { + "auxiliary_loss_clip": 0.01129863, + "auxiliary_loss_mlp": 0.01043896, + "balance_loss_clip": 1.04783142, + "balance_loss_mlp": 1.02928042, + "epoch": 0.6413948594618969, + "flos": 17196578945280.0, + "grad_norm": 2.283012598804286, + "language_loss": 0.77779078, + "learning_rate": 1.2038424161839484e-06, + "loss": 0.79952836, + "num_input_tokens_seen": 230149690, + "router_z_loss_clip": 0.81982422, + "router_z_loss_mlp": 0.14599609, + "step": 10668, + "time_per_iteration": 3.9084651470184326 + }, + { + "auxiliary_loss_clip": 0.01123866, + "auxiliary_loss_mlp": 0.01032904, + "balance_loss_clip": 1.04967046, + "balance_loss_mlp": 1.02031553, + "epoch": 0.6414549827145648, + "flos": 22269787027200.0, + "grad_norm": 1.605190719022394, + "language_loss": 0.67647707, + "learning_rate": 1.2034851581296544e-06, + "loss": 0.69804478, + "num_input_tokens_seen": 230166950, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.12597656, + "step": 10669, + "time_per_iteration": 2.568627119064331 + }, + { + "auxiliary_loss_clip": 0.01135148, + "auxiliary_loss_mlp": 0.01037979, + "balance_loss_clip": 1.05296052, + "balance_loss_mlp": 1.02449632, + "epoch": 0.6415151059672328, + "flos": 19640192163840.0, + "grad_norm": 1.7498649757872546, + "language_loss": 0.78405356, + "learning_rate": 1.2031279302792825e-06, + "loss": 0.80578482, + "num_input_tokens_seen": 230184785, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.13494873, + "step": 10670, + "time_per_iteration": 2.509671211242676 + }, + { + "auxiliary_loss_clip": 0.01127197, + "auxiliary_loss_mlp": 0.01030752, + "balance_loss_clip": 1.04835844, + "balance_loss_mlp": 1.01766312, + "epoch": 0.6415752292199008, + "flos": 14865833237760.0, + "grad_norm": 2.746842751491424, + "language_loss": 0.88663626, + "learning_rate": 1.20277073264638e-06, + "loss": 0.90821576, + "num_input_tokens_seen": 230201385, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.13092041, + "step": 10671, + "time_per_iteration": 2.569023609161377 + }, + { + "auxiliary_loss_clip": 0.0111297, + "auxiliary_loss_mlp": 0.0102888, + "balance_loss_clip": 1.04212844, + "balance_loss_mlp": 1.01751292, + "epoch": 0.6416353524725688, + "flos": 13735580906880.0, + "grad_norm": 1.5977405269224167, + "language_loss": 0.69193375, + "learning_rate": 1.2024135652444907e-06, + "loss": 0.7133522, + "num_input_tokens_seen": 230220380, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.1137085, + "step": 10672, + "time_per_iteration": 2.4955942630767822 + }, + { + "auxiliary_loss_clip": 0.01126741, + "auxiliary_loss_mlp": 0.01029729, + "balance_loss_clip": 1.04736793, + "balance_loss_mlp": 1.01539469, + "epoch": 0.6416954757252368, + "flos": 24534924543360.0, + "grad_norm": 1.998489833892765, + "language_loss": 0.74627608, + "learning_rate": 1.2020564280871593e-06, + "loss": 0.7678408, + "num_input_tokens_seen": 230239845, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.14324951, + "step": 10673, + "time_per_iteration": 2.521345376968384 + }, + { + "auxiliary_loss_clip": 0.01123463, + "auxiliary_loss_mlp": 0.01035611, + "balance_loss_clip": 1.0476414, + "balance_loss_mlp": 1.02202725, + "epoch": 0.6417555989779047, + "flos": 27710002321920.0, + "grad_norm": 2.372736107353752, + "language_loss": 0.69242883, + "learning_rate": 1.2016993211879283e-06, + "loss": 0.7140196, + "num_input_tokens_seen": 230262420, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.13568115, + "step": 10674, + "time_per_iteration": 2.5617425441741943 + }, + { + "auxiliary_loss_clip": 0.0112119, + "auxiliary_loss_mlp": 0.01028891, + "balance_loss_clip": 1.04208899, + "balance_loss_mlp": 1.01584959, + "epoch": 0.6418157222305727, + "flos": 20556632787840.0, + "grad_norm": 2.0393875505442978, + "language_loss": 0.66438258, + "learning_rate": 1.201342244560338e-06, + "loss": 0.68588334, + "num_input_tokens_seen": 230279950, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.13049316, + "step": 10675, + "time_per_iteration": 2.4778122901916504 + }, + { + "auxiliary_loss_clip": 0.01122722, + "auxiliary_loss_mlp": 0.01035688, + "balance_loss_clip": 1.04774761, + "balance_loss_mlp": 1.02368402, + "epoch": 0.6418758454832406, + "flos": 22601530062720.0, + "grad_norm": 2.190835249855628, + "language_loss": 0.66112971, + "learning_rate": 1.2009851982179307e-06, + "loss": 0.68271381, + "num_input_tokens_seen": 230299705, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.12017822, + "step": 10676, + "time_per_iteration": 2.514028310775757 + }, + { + "auxiliary_loss_clip": 0.01118121, + "auxiliary_loss_mlp": 0.01031539, + "balance_loss_clip": 1.0424509, + "balance_loss_mlp": 1.01744258, + "epoch": 0.6419359687359086, + "flos": 27375098889600.0, + "grad_norm": 1.9270805942000957, + "language_loss": 0.75585973, + "learning_rate": 1.2006281821742446e-06, + "loss": 0.77735639, + "num_input_tokens_seen": 230320030, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.14086914, + "step": 10677, + "time_per_iteration": 2.5602798461914062 + }, + { + "auxiliary_loss_clip": 0.01053884, + "auxiliary_loss_mlp": 0.01011772, + "balance_loss_clip": 1.02842605, + "balance_loss_mlp": 1.01009715, + "epoch": 0.6419960919885765, + "flos": 67251924552960.0, + "grad_norm": 0.7639826648554519, + "language_loss": 0.60715598, + "learning_rate": 1.200271196442818e-06, + "loss": 0.6278125, + "num_input_tokens_seen": 230381495, + "router_z_loss_clip": 0.25439453, + "router_z_loss_mlp": 0.01675415, + "step": 10678, + "time_per_iteration": 3.1674468517303467 + }, + { + "auxiliary_loss_clip": 0.01125803, + "auxiliary_loss_mlp": 0.01035845, + "balance_loss_clip": 1.05275202, + "balance_loss_mlp": 1.02248108, + "epoch": 0.6420562152412446, + "flos": 19901873721600.0, + "grad_norm": 2.051640230791283, + "language_loss": 0.67776883, + "learning_rate": 1.1999142410371875e-06, + "loss": 0.69938529, + "num_input_tokens_seen": 230401385, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.13360596, + "step": 10679, + "time_per_iteration": 2.5328571796417236 + }, + { + "auxiliary_loss_clip": 0.01126688, + "auxiliary_loss_mlp": 0.01037512, + "balance_loss_clip": 1.04688358, + "balance_loss_mlp": 1.02186012, + "epoch": 0.6421163384939125, + "flos": 24790177566720.0, + "grad_norm": 1.6245389729723279, + "language_loss": 0.73147535, + "learning_rate": 1.1995573159708897e-06, + "loss": 0.75311732, + "num_input_tokens_seen": 230421340, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.15649414, + "step": 10680, + "time_per_iteration": 2.489293098449707 + }, + { + "auxiliary_loss_clip": 0.01131688, + "auxiliary_loss_mlp": 0.01030352, + "balance_loss_clip": 1.05308473, + "balance_loss_mlp": 1.01853836, + "epoch": 0.6421764617465805, + "flos": 25592816926080.0, + "grad_norm": 2.0197381960761946, + "language_loss": 0.6729545, + "learning_rate": 1.1992004212574582e-06, + "loss": 0.69457489, + "num_input_tokens_seen": 230441270, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.11810303, + "step": 10681, + "time_per_iteration": 2.5767133235931396 + }, + { + "auxiliary_loss_clip": 0.01114327, + "auxiliary_loss_mlp": 0.01033042, + "balance_loss_clip": 1.04107106, + "balance_loss_mlp": 1.01929665, + "epoch": 0.6422365849992484, + "flos": 14134727813760.0, + "grad_norm": 2.014939302815809, + "language_loss": 0.74681878, + "learning_rate": 1.198843556910427e-06, + "loss": 0.76829243, + "num_input_tokens_seen": 230457455, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.13739014, + "step": 10682, + "time_per_iteration": 2.4338860511779785 + }, + { + "auxiliary_loss_clip": 0.01113362, + "auxiliary_loss_mlp": 0.01030021, + "balance_loss_clip": 1.04150772, + "balance_loss_mlp": 1.01866603, + "epoch": 0.6422967082519164, + "flos": 22383911514240.0, + "grad_norm": 1.5104863970714093, + "language_loss": 0.79603648, + "learning_rate": 1.1984867229433287e-06, + "loss": 0.81747031, + "num_input_tokens_seen": 230478955, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11358643, + "step": 10683, + "time_per_iteration": 2.4958243370056152 + }, + { + "auxiliary_loss_clip": 0.01127229, + "auxiliary_loss_mlp": 0.01049304, + "balance_loss_clip": 1.04606652, + "balance_loss_mlp": 1.03487921, + "epoch": 0.6423568315045844, + "flos": 14647927380480.0, + "grad_norm": 1.9052427204108437, + "language_loss": 0.67479044, + "learning_rate": 1.1981299193696941e-06, + "loss": 0.69655573, + "num_input_tokens_seen": 230496425, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.14422607, + "step": 10684, + "time_per_iteration": 2.4678308963775635 + }, + { + "auxiliary_loss_clip": 0.01119119, + "auxiliary_loss_mlp": 0.01028856, + "balance_loss_clip": 1.04337323, + "balance_loss_mlp": 1.01610637, + "epoch": 0.6424169547572524, + "flos": 26833925606400.0, + "grad_norm": 2.226986089561788, + "language_loss": 0.71154445, + "learning_rate": 1.1977731462030533e-06, + "loss": 0.73302424, + "num_input_tokens_seen": 230516245, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12744141, + "step": 10685, + "time_per_iteration": 2.517355442047119 + }, + { + "auxiliary_loss_clip": 0.01121757, + "auxiliary_loss_mlp": 0.01033607, + "balance_loss_clip": 1.0469197, + "balance_loss_mlp": 1.02187133, + "epoch": 0.6424770780099204, + "flos": 22707430335360.0, + "grad_norm": 1.528197712729126, + "language_loss": 0.75540125, + "learning_rate": 1.197416403456935e-06, + "loss": 0.77695489, + "num_input_tokens_seen": 230534745, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.11737061, + "step": 10686, + "time_per_iteration": 2.5105130672454834 + }, + { + "auxiliary_loss_clip": 0.01132822, + "auxiliary_loss_mlp": 0.01035328, + "balance_loss_clip": 1.0502758, + "balance_loss_mlp": 1.0216074, + "epoch": 0.6425372012625883, + "flos": 28469512425600.0, + "grad_norm": 7.28450591489321, + "language_loss": 0.69182837, + "learning_rate": 1.197059691144867e-06, + "loss": 0.71350986, + "num_input_tokens_seen": 230555895, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.13720703, + "step": 10687, + "time_per_iteration": 2.528419017791748 + }, + { + "auxiliary_loss_clip": 0.01121886, + "auxiliary_loss_mlp": 0.01033582, + "balance_loss_clip": 1.04629576, + "balance_loss_mlp": 1.02094579, + "epoch": 0.6425973245152563, + "flos": 29351694453120.0, + "grad_norm": 1.8851043125853482, + "language_loss": 0.66282427, + "learning_rate": 1.1967030092803767e-06, + "loss": 0.68437898, + "num_input_tokens_seen": 230577460, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.12652588, + "step": 10688, + "time_per_iteration": 4.006505250930786 + }, + { + "auxiliary_loss_clip": 0.01124098, + "auxiliary_loss_mlp": 0.010305, + "balance_loss_clip": 1.04770958, + "balance_loss_mlp": 1.0174942, + "epoch": 0.6426574477679242, + "flos": 16430388912000.0, + "grad_norm": 1.6734934614827073, + "language_loss": 0.73128968, + "learning_rate": 1.1963463578769876e-06, + "loss": 0.75283569, + "num_input_tokens_seen": 230595030, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.13006592, + "step": 10689, + "time_per_iteration": 2.4849491119384766 + }, + { + "auxiliary_loss_clip": 0.01119632, + "auxiliary_loss_mlp": 0.0103021, + "balance_loss_clip": 1.0477922, + "balance_loss_mlp": 1.01896834, + "epoch": 0.6427175710205922, + "flos": 21835914647040.0, + "grad_norm": 2.323552322645886, + "language_loss": 0.72209728, + "learning_rate": 1.195989736948226e-06, + "loss": 0.74359572, + "num_input_tokens_seen": 230615135, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11236572, + "step": 10690, + "time_per_iteration": 2.5868566036224365 + }, + { + "auxiliary_loss_clip": 0.01117527, + "auxiliary_loss_mlp": 0.01032516, + "balance_loss_clip": 1.04223609, + "balance_loss_mlp": 1.01948094, + "epoch": 0.6427776942732601, + "flos": 17786627660160.0, + "grad_norm": 2.068246644998994, + "language_loss": 0.7760843, + "learning_rate": 1.1956331465076143e-06, + "loss": 0.79758471, + "num_input_tokens_seen": 230631965, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.13031006, + "step": 10691, + "time_per_iteration": 2.6722893714904785 + }, + { + "auxiliary_loss_clip": 0.01132241, + "auxiliary_loss_mlp": 0.01033206, + "balance_loss_clip": 1.05596566, + "balance_loss_mlp": 1.02065945, + "epoch": 0.6428378175259282, + "flos": 15085893911040.0, + "grad_norm": 2.1466355894787816, + "language_loss": 0.74325687, + "learning_rate": 1.1952765865686738e-06, + "loss": 0.76491129, + "num_input_tokens_seen": 230649565, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.12548828, + "step": 10692, + "time_per_iteration": 2.5946600437164307 + }, + { + "auxiliary_loss_clip": 0.01124322, + "auxiliary_loss_mlp": 0.01036448, + "balance_loss_clip": 1.04576397, + "balance_loss_mlp": 1.02345443, + "epoch": 0.6428979407785961, + "flos": 23841776816640.0, + "grad_norm": 2.004874035622632, + "language_loss": 0.61502814, + "learning_rate": 1.1949200571449263e-06, + "loss": 0.63663578, + "num_input_tokens_seen": 230669265, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.13000488, + "step": 10693, + "time_per_iteration": 2.519313335418701 + }, + { + "auxiliary_loss_clip": 0.01131642, + "auxiliary_loss_mlp": 0.01037377, + "balance_loss_clip": 1.046808, + "balance_loss_mlp": 1.02375782, + "epoch": 0.6429580640312641, + "flos": 32926852892160.0, + "grad_norm": 1.7465652095229622, + "language_loss": 0.59837151, + "learning_rate": 1.1945635582498903e-06, + "loss": 0.62006164, + "num_input_tokens_seen": 230690575, + "router_z_loss_clip": 0.84912109, + "router_z_loss_mlp": 0.1362915, + "step": 10694, + "time_per_iteration": 2.546110153198242 + }, + { + "auxiliary_loss_clip": 0.01125643, + "auxiliary_loss_mlp": 0.01038276, + "balance_loss_clip": 1.04557014, + "balance_loss_mlp": 1.02575278, + "epoch": 0.643018187283932, + "flos": 21068359896960.0, + "grad_norm": 1.4703192487195493, + "language_loss": 0.80036312, + "learning_rate": 1.1942070898970853e-06, + "loss": 0.82200229, + "num_input_tokens_seen": 230709420, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.12524414, + "step": 10695, + "time_per_iteration": 2.4521372318267822 + }, + { + "auxiliary_loss_clip": 0.01119331, + "auxiliary_loss_mlp": 0.01038165, + "balance_loss_clip": 1.04272366, + "balance_loss_mlp": 1.02515292, + "epoch": 0.6430783105366, + "flos": 26724649455360.0, + "grad_norm": 1.8796651605834611, + "language_loss": 0.73632395, + "learning_rate": 1.1938506521000285e-06, + "loss": 0.75789893, + "num_input_tokens_seen": 230729350, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.13037109, + "step": 10696, + "time_per_iteration": 2.4863147735595703 + }, + { + "auxiliary_loss_clip": 0.01125647, + "auxiliary_loss_mlp": 0.01027264, + "balance_loss_clip": 1.05150092, + "balance_loss_mlp": 1.01530743, + "epoch": 0.643138433789268, + "flos": 23696841438720.0, + "grad_norm": 2.0276365326766217, + "language_loss": 0.7584759, + "learning_rate": 1.1934942448722347e-06, + "loss": 0.78000504, + "num_input_tokens_seen": 230749220, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11975098, + "step": 10697, + "time_per_iteration": 2.524282455444336 + }, + { + "auxiliary_loss_clip": 0.0112549, + "auxiliary_loss_mlp": 0.01028275, + "balance_loss_clip": 1.05003285, + "balance_loss_mlp": 1.01640821, + "epoch": 0.643198557041936, + "flos": 34202184255360.0, + "grad_norm": 2.58664788685104, + "language_loss": 0.66231072, + "learning_rate": 1.1931378682272208e-06, + "loss": 0.68384838, + "num_input_tokens_seen": 230770245, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.11859131, + "step": 10698, + "time_per_iteration": 2.681434392929077 + }, + { + "auxiliary_loss_clip": 0.01062998, + "auxiliary_loss_mlp": 0.01008983, + "balance_loss_clip": 1.03669357, + "balance_loss_mlp": 1.00768781, + "epoch": 0.643258680294604, + "flos": 67626473621760.0, + "grad_norm": 0.8326996444628378, + "language_loss": 0.63439727, + "learning_rate": 1.1927815221784996e-06, + "loss": 0.65511703, + "num_input_tokens_seen": 230837030, + "router_z_loss_clip": 0.26318359, + "router_z_loss_mlp": 0.01295471, + "step": 10699, + "time_per_iteration": 3.091724395751953 + }, + { + "auxiliary_loss_clip": 0.01133702, + "auxiliary_loss_mlp": 0.01029199, + "balance_loss_clip": 1.05933976, + "balance_loss_mlp": 1.01816618, + "epoch": 0.6433188035472719, + "flos": 25185984508800.0, + "grad_norm": 2.1450780178339808, + "language_loss": 0.69615698, + "learning_rate": 1.1924252067395838e-06, + "loss": 0.71778601, + "num_input_tokens_seen": 230856845, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11035156, + "step": 10700, + "time_per_iteration": 2.47717022895813 + }, + { + "auxiliary_loss_clip": 0.01126515, + "auxiliary_loss_mlp": 0.01024951, + "balance_loss_clip": 1.05166197, + "balance_loss_mlp": 1.01320279, + "epoch": 0.6433789267999399, + "flos": 24973573432320.0, + "grad_norm": 1.621458820062875, + "language_loss": 0.73369968, + "learning_rate": 1.1920689219239855e-06, + "loss": 0.75521433, + "num_input_tokens_seen": 230878785, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11755371, + "step": 10701, + "time_per_iteration": 3.931448459625244 + }, + { + "auxiliary_loss_clip": 0.01121327, + "auxiliary_loss_mlp": 0.01030857, + "balance_loss_clip": 1.04421043, + "balance_loss_mlp": 1.01734424, + "epoch": 0.6434390500526078, + "flos": 17566028282880.0, + "grad_norm": 2.285440744501267, + "language_loss": 0.82194245, + "learning_rate": 1.1917126677452144e-06, + "loss": 0.84346426, + "num_input_tokens_seen": 230895445, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.13519287, + "step": 10702, + "time_per_iteration": 2.4089195728302 + }, + { + "auxiliary_loss_clip": 0.01127183, + "auxiliary_loss_mlp": 0.01036409, + "balance_loss_clip": 1.05448711, + "balance_loss_mlp": 1.02425528, + "epoch": 0.6434991733052758, + "flos": 20843594542080.0, + "grad_norm": 2.585233149087785, + "language_loss": 0.74521738, + "learning_rate": 1.1913564442167798e-06, + "loss": 0.76685327, + "num_input_tokens_seen": 230911375, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.1217041, + "step": 10703, + "time_per_iteration": 2.437798500061035 + }, + { + "auxiliary_loss_clip": 0.01060594, + "auxiliary_loss_mlp": 0.01002979, + "balance_loss_clip": 1.0358032, + "balance_loss_mlp": 1.00167084, + "epoch": 0.6435592965579437, + "flos": 66094596345600.0, + "grad_norm": 0.6536505375201829, + "language_loss": 0.5458436, + "learning_rate": 1.1910002513521898e-06, + "loss": 0.56647933, + "num_input_tokens_seen": 230975990, + "router_z_loss_clip": 0.24755859, + "router_z_loss_mlp": 0.01307678, + "step": 10704, + "time_per_iteration": 3.075758695602417 + }, + { + "auxiliary_loss_clip": 0.01120173, + "auxiliary_loss_mlp": 0.0102412, + "balance_loss_clip": 1.04559529, + "balance_loss_mlp": 1.01361132, + "epoch": 0.6436194198106118, + "flos": 23768842250880.0, + "grad_norm": 1.8459911120569674, + "language_loss": 0.76959312, + "learning_rate": 1.1906440891649519e-06, + "loss": 0.79103613, + "num_input_tokens_seen": 230997110, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.1050415, + "step": 10705, + "time_per_iteration": 2.485852003097534 + }, + { + "auxiliary_loss_clip": 0.01120503, + "auxiliary_loss_mlp": 0.0103611, + "balance_loss_clip": 1.04679346, + "balance_loss_mlp": 1.02455246, + "epoch": 0.6436795430632797, + "flos": 20230312705920.0, + "grad_norm": 5.25654631707979, + "language_loss": 0.79266804, + "learning_rate": 1.1902879576685708e-06, + "loss": 0.81423414, + "num_input_tokens_seen": 231015590, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11560059, + "step": 10706, + "time_per_iteration": 2.4813015460968018 + }, + { + "auxiliary_loss_clip": 0.01122256, + "auxiliary_loss_mlp": 0.01029251, + "balance_loss_clip": 1.04560709, + "balance_loss_mlp": 1.01651323, + "epoch": 0.6437396663159477, + "flos": 20301846641280.0, + "grad_norm": 2.2289103949356437, + "language_loss": 0.8028878, + "learning_rate": 1.1899318568765518e-06, + "loss": 0.82440281, + "num_input_tokens_seen": 231033800, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.12731934, + "step": 10707, + "time_per_iteration": 3.9587833881378174 + }, + { + "auxiliary_loss_clip": 0.01122534, + "auxiliary_loss_mlp": 0.01030614, + "balance_loss_clip": 1.04605603, + "balance_loss_mlp": 1.01869345, + "epoch": 0.6437997895686156, + "flos": 23878585278720.0, + "grad_norm": 5.187264636524386, + "language_loss": 0.85935152, + "learning_rate": 1.1895757868023978e-06, + "loss": 0.88088298, + "num_input_tokens_seen": 231053160, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.11938477, + "step": 10708, + "time_per_iteration": 2.5438287258148193 + }, + { + "auxiliary_loss_clip": 0.01133702, + "auxiliary_loss_mlp": 0.01037356, + "balance_loss_clip": 1.05054677, + "balance_loss_mlp": 1.02358747, + "epoch": 0.6438599128212836, + "flos": 18989275852800.0, + "grad_norm": 2.1765043267262643, + "language_loss": 0.65737963, + "learning_rate": 1.1892197474596106e-06, + "loss": 0.67909026, + "num_input_tokens_seen": 231069470, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 0.13757324, + "step": 10709, + "time_per_iteration": 2.439117908477783 + }, + { + "auxiliary_loss_clip": 0.01119037, + "auxiliary_loss_mlp": 0.01031096, + "balance_loss_clip": 1.04633427, + "balance_loss_mlp": 1.0193181, + "epoch": 0.6439200360739517, + "flos": 24096347481600.0, + "grad_norm": 1.889234113444999, + "language_loss": 0.80602688, + "learning_rate": 1.1888637388616929e-06, + "loss": 0.82752824, + "num_input_tokens_seen": 231088205, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11798096, + "step": 10710, + "time_per_iteration": 2.500434160232544 + }, + { + "auxiliary_loss_clip": 0.01115656, + "auxiliary_loss_mlp": 0.01027183, + "balance_loss_clip": 1.04370964, + "balance_loss_mlp": 1.01573336, + "epoch": 0.6439801593266196, + "flos": 31902141697920.0, + "grad_norm": 2.04409963599734, + "language_loss": 0.66292125, + "learning_rate": 1.1885077610221425e-06, + "loss": 0.68434966, + "num_input_tokens_seen": 231107850, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.11437988, + "step": 10711, + "time_per_iteration": 2.5779590606689453 + }, + { + "auxiliary_loss_clip": 0.01125972, + "auxiliary_loss_mlp": 0.01031126, + "balance_loss_clip": 1.0515939, + "balance_loss_mlp": 1.0188477, + "epoch": 0.6440402825792876, + "flos": 27125879351040.0, + "grad_norm": 1.6546124036695524, + "language_loss": 0.78973734, + "learning_rate": 1.1881518139544597e-06, + "loss": 0.81130826, + "num_input_tokens_seen": 231127200, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.12280273, + "step": 10712, + "time_per_iteration": 2.5117063522338867 + }, + { + "auxiliary_loss_clip": 0.01122167, + "auxiliary_loss_mlp": 0.01034045, + "balance_loss_clip": 1.04563808, + "balance_loss_mlp": 1.0219214, + "epoch": 0.6441004058319555, + "flos": 20667704618880.0, + "grad_norm": 1.6335093123882605, + "language_loss": 0.82731771, + "learning_rate": 1.1877958976721417e-06, + "loss": 0.84887981, + "num_input_tokens_seen": 231146360, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.12121582, + "step": 10713, + "time_per_iteration": 3.869971752166748 + }, + { + "auxiliary_loss_clip": 0.01127881, + "auxiliary_loss_mlp": 0.0103502, + "balance_loss_clip": 1.05523884, + "balance_loss_mlp": 1.02355194, + "epoch": 0.6441605290846235, + "flos": 26026006947840.0, + "grad_norm": 1.5148744446406823, + "language_loss": 0.786578, + "learning_rate": 1.187440012188684e-06, + "loss": 0.80820704, + "num_input_tokens_seen": 231168350, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11456299, + "step": 10714, + "time_per_iteration": 2.4865026473999023 + }, + { + "auxiliary_loss_clip": 0.01116683, + "auxiliary_loss_mlp": 0.01031008, + "balance_loss_clip": 1.04430079, + "balance_loss_mlp": 1.0196476, + "epoch": 0.6442206523372914, + "flos": 24899489631360.0, + "grad_norm": 1.4754140340791133, + "language_loss": 0.81470752, + "learning_rate": 1.187084157517583e-06, + "loss": 0.83618444, + "num_input_tokens_seen": 231188385, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11358643, + "step": 10715, + "time_per_iteration": 2.590151071548462 + }, + { + "auxiliary_loss_clip": 0.01117214, + "auxiliary_loss_mlp": 0.0102989, + "balance_loss_clip": 1.04267454, + "balance_loss_mlp": 1.01795745, + "epoch": 0.6442807755899594, + "flos": 25156322853120.0, + "grad_norm": 2.3948410129411877, + "language_loss": 0.81366611, + "learning_rate": 1.186728333672332e-06, + "loss": 0.83513713, + "num_input_tokens_seen": 231209880, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.1192627, + "step": 10716, + "time_per_iteration": 2.5215635299682617 + }, + { + "auxiliary_loss_clip": 0.01122712, + "auxiliary_loss_mlp": 0.01034989, + "balance_loss_clip": 1.04685998, + "balance_loss_mlp": 1.02100563, + "epoch": 0.6443408988426274, + "flos": 27344503480320.0, + "grad_norm": 1.84056739443744, + "language_loss": 0.78015912, + "learning_rate": 1.186372540666424e-06, + "loss": 0.80173612, + "num_input_tokens_seen": 231230765, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.13977051, + "step": 10717, + "time_per_iteration": 2.556107997894287 + }, + { + "auxiliary_loss_clip": 0.0111419, + "auxiliary_loss_mlp": 0.01039741, + "balance_loss_clip": 1.04224443, + "balance_loss_mlp": 1.02689648, + "epoch": 0.6444010220952954, + "flos": 27928339142400.0, + "grad_norm": 1.6276140257009288, + "language_loss": 0.68258524, + "learning_rate": 1.1860167785133513e-06, + "loss": 0.70412457, + "num_input_tokens_seen": 231252350, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.12835693, + "step": 10718, + "time_per_iteration": 2.4989945888519287 + }, + { + "auxiliary_loss_clip": 0.01047058, + "auxiliary_loss_mlp": 0.0100823, + "balance_loss_clip": 1.02180982, + "balance_loss_mlp": 1.00696385, + "epoch": 0.6444611453479633, + "flos": 71215024855680.0, + "grad_norm": 0.7571202877566957, + "language_loss": 0.4963454, + "learning_rate": 1.185661047226603e-06, + "loss": 0.51689833, + "num_input_tokens_seen": 231313865, + "router_z_loss_clip": 0.25292969, + "router_z_loss_mlp": 0.01266479, + "step": 10719, + "time_per_iteration": 3.2490358352661133 + }, + { + "auxiliary_loss_clip": 0.01125161, + "auxiliary_loss_mlp": 0.01037647, + "balance_loss_clip": 1.04818583, + "balance_loss_mlp": 1.02465916, + "epoch": 0.6445212686006313, + "flos": 22705131864960.0, + "grad_norm": 1.727656220976854, + "language_loss": 0.78053761, + "learning_rate": 1.18530534681967e-06, + "loss": 0.80216563, + "num_input_tokens_seen": 231331710, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.12988281, + "step": 10720, + "time_per_iteration": 2.4677183628082275 + }, + { + "auxiliary_loss_clip": 0.01120947, + "auxiliary_loss_mlp": 0.01034839, + "balance_loss_clip": 1.0445919, + "balance_loss_mlp": 1.02177358, + "epoch": 0.6445813918532992, + "flos": 21178821196800.0, + "grad_norm": 3.3337483481612704, + "language_loss": 0.77145684, + "learning_rate": 1.18494967730604e-06, + "loss": 0.79301465, + "num_input_tokens_seen": 231350705, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.1307373, + "step": 10721, + "time_per_iteration": 2.502293586730957 + }, + { + "auxiliary_loss_clip": 0.01125473, + "auxiliary_loss_mlp": 0.01031604, + "balance_loss_clip": 1.05049002, + "balance_loss_mlp": 1.01899123, + "epoch": 0.6446415151059672, + "flos": 25191910252800.0, + "grad_norm": 4.425479257419662, + "language_loss": 0.73083234, + "learning_rate": 1.1845940386991995e-06, + "loss": 0.75240314, + "num_input_tokens_seen": 231369550, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.1260376, + "step": 10722, + "time_per_iteration": 2.521005153656006 + }, + { + "auxiliary_loss_clip": 0.01129626, + "auxiliary_loss_mlp": 0.01027426, + "balance_loss_clip": 1.05415726, + "balance_loss_mlp": 1.01625574, + "epoch": 0.6447016383586353, + "flos": 25302227898240.0, + "grad_norm": 1.6269238123966456, + "language_loss": 0.7812863, + "learning_rate": 1.184238431012635e-06, + "loss": 0.8028568, + "num_input_tokens_seen": 231389285, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11169434, + "step": 10723, + "time_per_iteration": 2.508544445037842 + }, + { + "auxiliary_loss_clip": 0.01125448, + "auxiliary_loss_mlp": 0.01036114, + "balance_loss_clip": 1.04769588, + "balance_loss_mlp": 1.02327502, + "epoch": 0.6447617616113032, + "flos": 27703142824320.0, + "grad_norm": 2.1789226456578836, + "language_loss": 0.58475107, + "learning_rate": 1.1838828542598312e-06, + "loss": 0.60636663, + "num_input_tokens_seen": 231408820, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.128479, + "step": 10724, + "time_per_iteration": 2.4881935119628906 + }, + { + "auxiliary_loss_clip": 0.01120588, + "auxiliary_loss_mlp": 0.01028354, + "balance_loss_clip": 1.0478245, + "balance_loss_mlp": 1.01733291, + "epoch": 0.6448218848639712, + "flos": 23039101543680.0, + "grad_norm": 26.249396333185857, + "language_loss": 0.83439368, + "learning_rate": 1.183527308454271e-06, + "loss": 0.85588312, + "num_input_tokens_seen": 231428100, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11029053, + "step": 10725, + "time_per_iteration": 2.545351505279541 + }, + { + "auxiliary_loss_clip": 0.01111863, + "auxiliary_loss_mlp": 0.01037064, + "balance_loss_clip": 1.03761292, + "balance_loss_mlp": 1.02390361, + "epoch": 0.6448820081166391, + "flos": 24496104919680.0, + "grad_norm": 3.1620330162511783, + "language_loss": 0.82164514, + "learning_rate": 1.1831717936094368e-06, + "loss": 0.8431344, + "num_input_tokens_seen": 231445810, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.13165283, + "step": 10726, + "time_per_iteration": 2.4865922927856445 + }, + { + "auxiliary_loss_clip": 0.01118425, + "auxiliary_loss_mlp": 0.01034292, + "balance_loss_clip": 1.04168653, + "balance_loss_mlp": 1.02199554, + "epoch": 0.6449421313693071, + "flos": 22419283432320.0, + "grad_norm": 1.9787777437071659, + "language_loss": 0.81533945, + "learning_rate": 1.1828163097388108e-06, + "loss": 0.83686662, + "num_input_tokens_seen": 231463570, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.1229248, + "step": 10727, + "time_per_iteration": 2.4848122596740723 + }, + { + "auxiliary_loss_clip": 0.0112736, + "auxiliary_loss_mlp": 0.01033922, + "balance_loss_clip": 1.04370761, + "balance_loss_mlp": 1.01977217, + "epoch": 0.645002254621975, + "flos": 20225715765120.0, + "grad_norm": 2.05660095245398, + "language_loss": 0.79445511, + "learning_rate": 1.1824608568558717e-06, + "loss": 0.81606793, + "num_input_tokens_seen": 231482155, + "router_z_loss_clip": 0.83691406, + "router_z_loss_mlp": 0.14154053, + "step": 10728, + "time_per_iteration": 2.421377658843994 + }, + { + "auxiliary_loss_clip": 0.01115215, + "auxiliary_loss_mlp": 0.01033809, + "balance_loss_clip": 1.03932214, + "balance_loss_mlp": 1.02011228, + "epoch": 0.645062377874643, + "flos": 27855440490240.0, + "grad_norm": 1.6182304673472294, + "language_loss": 0.74465972, + "learning_rate": 1.1821054349740988e-06, + "loss": 0.76614994, + "num_input_tokens_seen": 231502465, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.13702393, + "step": 10729, + "time_per_iteration": 2.5190277099609375 + }, + { + "auxiliary_loss_clip": 0.0111864, + "auxiliary_loss_mlp": 0.01033667, + "balance_loss_clip": 1.04311669, + "balance_loss_mlp": 1.0194931, + "epoch": 0.645122501127311, + "flos": 25301509626240.0, + "grad_norm": 1.6309805305282175, + "language_loss": 0.66718221, + "learning_rate": 1.1817500441069706e-06, + "loss": 0.68870521, + "num_input_tokens_seen": 231522740, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.1416626, + "step": 10730, + "time_per_iteration": 2.482421875 + }, + { + "auxiliary_loss_clip": 0.01116486, + "auxiliary_loss_mlp": 0.01035076, + "balance_loss_clip": 1.04155326, + "balance_loss_mlp": 1.02048445, + "epoch": 0.645182624379979, + "flos": 18807352444800.0, + "grad_norm": 2.0467680992173194, + "language_loss": 0.63544893, + "learning_rate": 1.1813946842679614e-06, + "loss": 0.65696454, + "num_input_tokens_seen": 231542050, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.14599609, + "step": 10731, + "time_per_iteration": 2.468046188354492 + }, + { + "auxiliary_loss_clip": 0.01113082, + "auxiliary_loss_mlp": 0.0103396, + "balance_loss_clip": 1.04007351, + "balance_loss_mlp": 1.02084053, + "epoch": 0.6452427476326469, + "flos": 18332182402560.0, + "grad_norm": 2.3987698409419593, + "language_loss": 0.68192792, + "learning_rate": 1.1810393554705492e-06, + "loss": 0.70339835, + "num_input_tokens_seen": 231560380, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.13110352, + "step": 10732, + "time_per_iteration": 3.8901171684265137 + }, + { + "auxiliary_loss_clip": 0.01113957, + "auxiliary_loss_mlp": 0.01038766, + "balance_loss_clip": 1.04103279, + "balance_loss_mlp": 1.02502728, + "epoch": 0.6453028708853149, + "flos": 22784746360320.0, + "grad_norm": 2.0318304877441453, + "language_loss": 0.75855184, + "learning_rate": 1.1806840577282055e-06, + "loss": 0.78007901, + "num_input_tokens_seen": 231580810, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.1373291, + "step": 10733, + "time_per_iteration": 2.450308084487915 + }, + { + "auxiliary_loss_clip": 0.01126623, + "auxiliary_loss_mlp": 0.01043329, + "balance_loss_clip": 1.04769504, + "balance_loss_mlp": 1.02920866, + "epoch": 0.6453629941379828, + "flos": 23945989150080.0, + "grad_norm": 2.508658091129268, + "language_loss": 0.66651964, + "learning_rate": 1.1803287910544048e-06, + "loss": 0.68821919, + "num_input_tokens_seen": 231600585, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.14123535, + "step": 10734, + "time_per_iteration": 2.507455348968506 + }, + { + "auxiliary_loss_clip": 0.01122828, + "auxiliary_loss_mlp": 0.01043338, + "balance_loss_clip": 1.0498538, + "balance_loss_mlp": 1.03095794, + "epoch": 0.6454231173906508, + "flos": 17676381841920.0, + "grad_norm": 1.8978276835752126, + "language_loss": 0.73575479, + "learning_rate": 1.1799735554626191e-06, + "loss": 0.75741643, + "num_input_tokens_seen": 231618765, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.1237793, + "step": 10735, + "time_per_iteration": 2.436880588531494 + }, + { + "auxiliary_loss_clip": 0.01123618, + "auxiliary_loss_mlp": 0.01032312, + "balance_loss_clip": 1.04972303, + "balance_loss_mlp": 1.01996815, + "epoch": 0.6454832406433189, + "flos": 23292774368640.0, + "grad_norm": 1.7332093398943333, + "language_loss": 0.74618244, + "learning_rate": 1.1796183509663176e-06, + "loss": 0.7677418, + "num_input_tokens_seen": 231638525, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.12353516, + "step": 10736, + "time_per_iteration": 2.4825756549835205 + }, + { + "auxiliary_loss_clip": 0.01126141, + "auxiliary_loss_mlp": 0.0103081, + "balance_loss_clip": 1.0494616, + "balance_loss_mlp": 1.01788807, + "epoch": 0.6455433638959868, + "flos": 20157198572160.0, + "grad_norm": 1.9583653491299546, + "language_loss": 0.70467246, + "learning_rate": 1.1792631775789708e-06, + "loss": 0.72624195, + "num_input_tokens_seen": 231656785, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.12927246, + "step": 10737, + "time_per_iteration": 2.484886884689331 + }, + { + "auxiliary_loss_clip": 0.01050578, + "auxiliary_loss_mlp": 0.01007398, + "balance_loss_clip": 1.02524388, + "balance_loss_mlp": 1.00625086, + "epoch": 0.6456034871486548, + "flos": 66532922012160.0, + "grad_norm": 0.7742783327514703, + "language_loss": 0.58445472, + "learning_rate": 1.1789080353140464e-06, + "loss": 0.60503447, + "num_input_tokens_seen": 231719075, + "router_z_loss_clip": 0.25317383, + "router_z_loss_mlp": 0.01147461, + "step": 10738, + "time_per_iteration": 3.192650318145752 + }, + { + "auxiliary_loss_clip": 0.01125064, + "auxiliary_loss_mlp": 0.01030828, + "balance_loss_clip": 1.05188215, + "balance_loss_mlp": 1.01858521, + "epoch": 0.6456636104013227, + "flos": 24206090509440.0, + "grad_norm": 2.15910506798244, + "language_loss": 0.74693358, + "learning_rate": 1.1785529241850118e-06, + "loss": 0.7684924, + "num_input_tokens_seen": 231737810, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.12237549, + "step": 10739, + "time_per_iteration": 2.584232807159424 + }, + { + "auxiliary_loss_clip": 0.01123752, + "auxiliary_loss_mlp": 0.01029873, + "balance_loss_clip": 1.04729247, + "balance_loss_mlp": 1.01704609, + "epoch": 0.6457237336539907, + "flos": 23624086440960.0, + "grad_norm": 2.3608705112397232, + "language_loss": 0.71557593, + "learning_rate": 1.1781978442053324e-06, + "loss": 0.73711216, + "num_input_tokens_seen": 231756140, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.1282959, + "step": 10740, + "time_per_iteration": 2.5907795429229736 + }, + { + "auxiliary_loss_clip": 0.01075139, + "auxiliary_loss_mlp": 0.01011133, + "balance_loss_clip": 1.04870462, + "balance_loss_mlp": 1.00943685, + "epoch": 0.6457838569066586, + "flos": 65846023251840.0, + "grad_norm": 0.6600357412807003, + "language_loss": 0.55230099, + "learning_rate": 1.1778427953884733e-06, + "loss": 0.57316369, + "num_input_tokens_seen": 231823665, + "router_z_loss_clip": 0.26489258, + "router_z_loss_mlp": 0.01695251, + "step": 10741, + "time_per_iteration": 3.1383283138275146 + }, + { + "auxiliary_loss_clip": 0.01122897, + "auxiliary_loss_mlp": 0.01039618, + "balance_loss_clip": 1.04562116, + "balance_loss_mlp": 1.02643335, + "epoch": 0.6458439801593266, + "flos": 22381972179840.0, + "grad_norm": 1.615817102838568, + "language_loss": 0.80739903, + "learning_rate": 1.1774877777478977e-06, + "loss": 0.8290242, + "num_input_tokens_seen": 231844500, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.13189697, + "step": 10742, + "time_per_iteration": 2.4649622440338135 + }, + { + "auxiliary_loss_clip": 0.01124236, + "auxiliary_loss_mlp": 0.01031054, + "balance_loss_clip": 1.05266023, + "balance_loss_mlp": 1.01925826, + "epoch": 0.6459041034119946, + "flos": 24789243813120.0, + "grad_norm": 2.809399361310461, + "language_loss": 0.81717503, + "learning_rate": 1.1771327912970678e-06, + "loss": 0.83872795, + "num_input_tokens_seen": 231864510, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11798096, + "step": 10743, + "time_per_iteration": 2.5160295963287354 + }, + { + "auxiliary_loss_clip": 0.01118396, + "auxiliary_loss_mlp": 0.0102871, + "balance_loss_clip": 1.04412532, + "balance_loss_mlp": 1.01638401, + "epoch": 0.6459642266646626, + "flos": 18325358818560.0, + "grad_norm": 1.9361484943631062, + "language_loss": 0.71740639, + "learning_rate": 1.1767778360494453e-06, + "loss": 0.73887748, + "num_input_tokens_seen": 231881555, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.12329102, + "step": 10744, + "time_per_iteration": 2.504286527633667 + }, + { + "auxiliary_loss_clip": 0.01119542, + "auxiliary_loss_mlp": 0.01025322, + "balance_loss_clip": 1.04394579, + "balance_loss_mlp": 1.01339483, + "epoch": 0.6460243499173305, + "flos": 43581368891520.0, + "grad_norm": 1.7626520369771086, + "language_loss": 0.66777271, + "learning_rate": 1.1764229120184896e-06, + "loss": 0.68922132, + "num_input_tokens_seen": 231905945, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.11914062, + "step": 10745, + "time_per_iteration": 2.6711325645446777 + }, + { + "auxiliary_loss_clip": 0.011196, + "auxiliary_loss_mlp": 0.01031286, + "balance_loss_clip": 1.04631174, + "balance_loss_mlp": 1.01901925, + "epoch": 0.6460844731699985, + "flos": 19244026085760.0, + "grad_norm": 2.1811996584918747, + "language_loss": 0.73336625, + "learning_rate": 1.1760680192176597e-06, + "loss": 0.75487506, + "num_input_tokens_seen": 231922535, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.12268066, + "step": 10746, + "time_per_iteration": 3.9220774173736572 + }, + { + "auxiliary_loss_clip": 0.01124992, + "auxiliary_loss_mlp": 0.0103146, + "balance_loss_clip": 1.04991305, + "balance_loss_mlp": 1.01978314, + "epoch": 0.6461445964226664, + "flos": 27453348668160.0, + "grad_norm": 1.4484822458184021, + "language_loss": 0.66672415, + "learning_rate": 1.175713157660413e-06, + "loss": 0.68828863, + "num_input_tokens_seen": 231944800, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.11669922, + "step": 10747, + "time_per_iteration": 2.495206356048584 + }, + { + "auxiliary_loss_clip": 0.0112405, + "auxiliary_loss_mlp": 0.01037899, + "balance_loss_clip": 1.04935503, + "balance_loss_mlp": 1.02589476, + "epoch": 0.6462047196753344, + "flos": 20295489934080.0, + "grad_norm": 1.6723549569616367, + "language_loss": 0.67186266, + "learning_rate": 1.1753583273602056e-06, + "loss": 0.69348216, + "num_input_tokens_seen": 231962970, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.12011719, + "step": 10748, + "time_per_iteration": 2.430058479309082 + }, + { + "auxiliary_loss_clip": 0.011223, + "auxiliary_loss_mlp": 0.01035078, + "balance_loss_clip": 1.0458647, + "balance_loss_mlp": 1.02171504, + "epoch": 0.6462648429280025, + "flos": 22018340845440.0, + "grad_norm": 1.808160828057081, + "language_loss": 0.76546955, + "learning_rate": 1.1750035283304937e-06, + "loss": 0.78704333, + "num_input_tokens_seen": 231981195, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.13360596, + "step": 10749, + "time_per_iteration": 2.445502758026123 + }, + { + "auxiliary_loss_clip": 0.01123712, + "auxiliary_loss_mlp": 0.01031372, + "balance_loss_clip": 1.0489018, + "balance_loss_mlp": 1.01969576, + "epoch": 0.6463249661806704, + "flos": 27781141207680.0, + "grad_norm": 1.5100988650891343, + "language_loss": 0.76917797, + "learning_rate": 1.17464876058473e-06, + "loss": 0.79072881, + "num_input_tokens_seen": 232001735, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.11676025, + "step": 10750, + "time_per_iteration": 4.153299570083618 + }, + { + "auxiliary_loss_clip": 0.0112389, + "auxiliary_loss_mlp": 0.01033552, + "balance_loss_clip": 1.04712737, + "balance_loss_mlp": 1.01965821, + "epoch": 0.6463850894333384, + "flos": 22050588280320.0, + "grad_norm": 3.2782240185483205, + "language_loss": 0.68283564, + "learning_rate": 1.1742940241363683e-06, + "loss": 0.70441008, + "num_input_tokens_seen": 232019830, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.13885498, + "step": 10751, + "time_per_iteration": 2.4623284339904785 + }, + { + "auxiliary_loss_clip": 0.01119931, + "auxiliary_loss_mlp": 0.01026928, + "balance_loss_clip": 1.045017, + "balance_loss_mlp": 1.01425028, + "epoch": 0.6464452126860063, + "flos": 21106245767040.0, + "grad_norm": 2.6801733520146263, + "language_loss": 0.71286535, + "learning_rate": 1.1739393189988604e-06, + "loss": 0.73433387, + "num_input_tokens_seen": 232039625, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.12683105, + "step": 10752, + "time_per_iteration": 2.4929847717285156 + }, + { + "auxiliary_loss_clip": 0.01121, + "auxiliary_loss_mlp": 0.01032517, + "balance_loss_clip": 1.04451919, + "balance_loss_mlp": 1.01862264, + "epoch": 0.6465053359386743, + "flos": 16028045694720.0, + "grad_norm": 2.3744401053893296, + "language_loss": 0.78351772, + "learning_rate": 1.1735846451856554e-06, + "loss": 0.80505294, + "num_input_tokens_seen": 232055855, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.13885498, + "step": 10753, + "time_per_iteration": 2.414215087890625 + }, + { + "auxiliary_loss_clip": 0.01125821, + "auxiliary_loss_mlp": 0.01037539, + "balance_loss_clip": 1.05094111, + "balance_loss_mlp": 1.02530849, + "epoch": 0.6465654591913422, + "flos": 23398674641280.0, + "grad_norm": 1.6701554016511144, + "language_loss": 0.85266256, + "learning_rate": 1.1732300027102041e-06, + "loss": 0.87429613, + "num_input_tokens_seen": 232073475, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.12249756, + "step": 10754, + "time_per_iteration": 2.480809450149536 + }, + { + "auxiliary_loss_clip": 0.01122211, + "auxiliary_loss_mlp": 0.01031561, + "balance_loss_clip": 1.04704285, + "balance_loss_mlp": 1.01933575, + "epoch": 0.6466255824440102, + "flos": 15377273038080.0, + "grad_norm": 2.368396379291397, + "language_loss": 0.59408122, + "learning_rate": 1.1728753915859541e-06, + "loss": 0.61561894, + "num_input_tokens_seen": 232091090, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12219238, + "step": 10755, + "time_per_iteration": 2.4584293365478516 + }, + { + "auxiliary_loss_clip": 0.01118795, + "auxiliary_loss_mlp": 0.01029049, + "balance_loss_clip": 1.04538476, + "balance_loss_mlp": 1.01694345, + "epoch": 0.6466857056966782, + "flos": 16252846963200.0, + "grad_norm": 5.955177423292711, + "language_loss": 0.68148541, + "learning_rate": 1.1725208118263518e-06, + "loss": 0.70296383, + "num_input_tokens_seen": 232107320, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.12109375, + "step": 10756, + "time_per_iteration": 2.443737506866455 + }, + { + "auxiliary_loss_clip": 0.0112644, + "auxiliary_loss_mlp": 0.01034056, + "balance_loss_clip": 1.04525816, + "balance_loss_mlp": 1.02124095, + "epoch": 0.6467458289493462, + "flos": 21178246579200.0, + "grad_norm": 2.600797592177053, + "language_loss": 0.74296486, + "learning_rate": 1.172166263444844e-06, + "loss": 0.76456982, + "num_input_tokens_seen": 232123930, + "router_z_loss_clip": 0.81201172, + "router_z_loss_mlp": 0.12811279, + "step": 10757, + "time_per_iteration": 3.9144997596740723 + }, + { + "auxiliary_loss_clip": 0.01122083, + "auxiliary_loss_mlp": 0.01029218, + "balance_loss_clip": 1.04933882, + "balance_loss_mlp": 1.01730299, + "epoch": 0.6468059522020141, + "flos": 17968299672960.0, + "grad_norm": 1.4567120214201263, + "language_loss": 0.74764568, + "learning_rate": 1.1718117464548734e-06, + "loss": 0.76915866, + "num_input_tokens_seen": 232142905, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.1192627, + "step": 10758, + "time_per_iteration": 2.481041669845581 + }, + { + "auxiliary_loss_clip": 0.01121377, + "auxiliary_loss_mlp": 0.01034108, + "balance_loss_clip": 1.0439198, + "balance_loss_mlp": 1.02024436, + "epoch": 0.6468660754546821, + "flos": 17890157635200.0, + "grad_norm": 1.710462258567646, + "language_loss": 0.67815715, + "learning_rate": 1.1714572608698845e-06, + "loss": 0.69971204, + "num_input_tokens_seen": 232162230, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.1385498, + "step": 10759, + "time_per_iteration": 2.454254150390625 + }, + { + "auxiliary_loss_clip": 0.01127969, + "auxiliary_loss_mlp": 0.01030469, + "balance_loss_clip": 1.05017495, + "balance_loss_mlp": 1.01736808, + "epoch": 0.64692619870735, + "flos": 22600991358720.0, + "grad_norm": 1.5930677091533398, + "language_loss": 0.7560119, + "learning_rate": 1.1711028067033197e-06, + "loss": 0.77759629, + "num_input_tokens_seen": 232182700, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.13098145, + "step": 10760, + "time_per_iteration": 2.4863295555114746 + }, + { + "auxiliary_loss_clip": 0.01116155, + "auxiliary_loss_mlp": 0.01028544, + "balance_loss_clip": 1.04225862, + "balance_loss_mlp": 1.0166533, + "epoch": 0.646986321960018, + "flos": 49600786993920.0, + "grad_norm": 1.7705909416877357, + "language_loss": 0.65531021, + "learning_rate": 1.1707483839686194e-06, + "loss": 0.67675722, + "num_input_tokens_seen": 232208235, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11901855, + "step": 10761, + "time_per_iteration": 2.7225708961486816 + }, + { + "auxiliary_loss_clip": 0.01121432, + "auxiliary_loss_mlp": 0.01030365, + "balance_loss_clip": 1.04588056, + "balance_loss_mlp": 1.01721048, + "epoch": 0.6470464452126861, + "flos": 21908454163200.0, + "grad_norm": 3.6320098849857625, + "language_loss": 0.69484586, + "learning_rate": 1.1703939926792235e-06, + "loss": 0.71636379, + "num_input_tokens_seen": 232228720, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.13140869, + "step": 10762, + "time_per_iteration": 2.4664156436920166 + }, + { + "auxiliary_loss_clip": 0.01120203, + "auxiliary_loss_mlp": 0.0103462, + "balance_loss_clip": 1.04270232, + "balance_loss_mlp": 1.02169776, + "epoch": 0.647106568465354, + "flos": 18106124158080.0, + "grad_norm": 2.643528893078893, + "language_loss": 0.82596672, + "learning_rate": 1.1700396328485705e-06, + "loss": 0.84751493, + "num_input_tokens_seen": 232244655, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.12915039, + "step": 10763, + "time_per_iteration": 2.4231183528900146 + }, + { + "auxiliary_loss_clip": 0.01055101, + "auxiliary_loss_mlp": 0.01003724, + "balance_loss_clip": 1.02997136, + "balance_loss_mlp": 1.00246513, + "epoch": 0.647166691718022, + "flos": 69480038125440.0, + "grad_norm": 0.7116883586819858, + "language_loss": 0.57751018, + "learning_rate": 1.1696853044900978e-06, + "loss": 0.5980984, + "num_input_tokens_seen": 232308685, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.0125885, + "step": 10764, + "time_per_iteration": 3.2429311275482178 + }, + { + "auxiliary_loss_clip": 0.01115805, + "auxiliary_loss_mlp": 0.01035923, + "balance_loss_clip": 1.04310799, + "balance_loss_mlp": 1.02323914, + "epoch": 0.6472268149706899, + "flos": 34095170661120.0, + "grad_norm": 2.5030701296762885, + "language_loss": 0.60805339, + "learning_rate": 1.1693310076172413e-06, + "loss": 0.62957072, + "num_input_tokens_seen": 232327520, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.12689209, + "step": 10765, + "time_per_iteration": 2.6574172973632812 + }, + { + "auxiliary_loss_clip": 0.01119568, + "auxiliary_loss_mlp": 0.01029546, + "balance_loss_clip": 1.04586601, + "balance_loss_mlp": 1.01733339, + "epoch": 0.6472869382233579, + "flos": 28111232217600.0, + "grad_norm": 2.1692220656036274, + "language_loss": 0.63134432, + "learning_rate": 1.168976742243437e-06, + "loss": 0.65283549, + "num_input_tokens_seen": 232349025, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.12219238, + "step": 10766, + "time_per_iteration": 2.531456470489502 + }, + { + "auxiliary_loss_clip": 0.01122964, + "auxiliary_loss_mlp": 0.01031137, + "balance_loss_clip": 1.04819393, + "balance_loss_mlp": 1.01878059, + "epoch": 0.6473470614760258, + "flos": 22492146170880.0, + "grad_norm": 1.7406830735469303, + "language_loss": 0.75605357, + "learning_rate": 1.1686225083821174e-06, + "loss": 0.77759451, + "num_input_tokens_seen": 232367835, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.12353516, + "step": 10767, + "time_per_iteration": 2.4701855182647705 + }, + { + "auxiliary_loss_clip": 0.0112177, + "auxiliary_loss_mlp": 0.01028683, + "balance_loss_clip": 1.04782128, + "balance_loss_mlp": 1.01655376, + "epoch": 0.6474071847286939, + "flos": 14538938538240.0, + "grad_norm": 1.758434885092229, + "language_loss": 0.7759198, + "learning_rate": 1.1682683060467153e-06, + "loss": 0.79742432, + "num_input_tokens_seen": 232385840, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.12127686, + "step": 10768, + "time_per_iteration": 2.430473804473877 + }, + { + "auxiliary_loss_clip": 0.01118463, + "auxiliary_loss_mlp": 0.01028713, + "balance_loss_clip": 1.04233575, + "balance_loss_mlp": 1.01601148, + "epoch": 0.6474673079813618, + "flos": 24098214988800.0, + "grad_norm": 1.8982932674228938, + "language_loss": 0.71718884, + "learning_rate": 1.167914135250663e-06, + "loss": 0.73866057, + "num_input_tokens_seen": 232406205, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12683105, + "step": 10769, + "time_per_iteration": 2.5365500450134277 + }, + { + "auxiliary_loss_clip": 0.01126039, + "auxiliary_loss_mlp": 0.01035345, + "balance_loss_clip": 1.05409265, + "balance_loss_mlp": 1.02297688, + "epoch": 0.6475274312340298, + "flos": 14976186796800.0, + "grad_norm": 1.9699201790764642, + "language_loss": 0.72523868, + "learning_rate": 1.1675599960073895e-06, + "loss": 0.74685252, + "num_input_tokens_seen": 232424995, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.1237793, + "step": 10770, + "time_per_iteration": 2.447315216064453 + }, + { + "auxiliary_loss_clip": 0.01125885, + "auxiliary_loss_mlp": 0.01033476, + "balance_loss_clip": 1.04610372, + "balance_loss_mlp": 1.01998782, + "epoch": 0.6475875544866977, + "flos": 25045322849280.0, + "grad_norm": 1.5991742998247847, + "language_loss": 0.73166215, + "learning_rate": 1.167205888330325e-06, + "loss": 0.75325578, + "num_input_tokens_seen": 232445870, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.13482666, + "step": 10771, + "time_per_iteration": 2.5644004344940186 + }, + { + "auxiliary_loss_clip": 0.01113389, + "auxiliary_loss_mlp": 0.01033449, + "balance_loss_clip": 1.04017258, + "balance_loss_mlp": 1.02028227, + "epoch": 0.6476476777393657, + "flos": 16472153450880.0, + "grad_norm": 2.1582608591167958, + "language_loss": 0.74059737, + "learning_rate": 1.1668518122328958e-06, + "loss": 0.76206577, + "num_input_tokens_seen": 232464285, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.1315918, + "step": 10772, + "time_per_iteration": 2.4718708992004395 + }, + { + "auxiliary_loss_clip": 0.01110477, + "auxiliary_loss_mlp": 0.0103408, + "balance_loss_clip": 1.03811944, + "balance_loss_mlp": 1.02273762, + "epoch": 0.6477078009920336, + "flos": 25812267068160.0, + "grad_norm": 1.4927278448450374, + "language_loss": 0.82765669, + "learning_rate": 1.1664977677285305e-06, + "loss": 0.84910226, + "num_input_tokens_seen": 232485815, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11334229, + "step": 10773, + "time_per_iteration": 2.5880813598632812 + }, + { + "auxiliary_loss_clip": 0.01115201, + "auxiliary_loss_mlp": 0.01029597, + "balance_loss_clip": 1.04349732, + "balance_loss_mlp": 1.01762211, + "epoch": 0.6477679242447016, + "flos": 17676130446720.0, + "grad_norm": 2.1411209180503565, + "language_loss": 0.78393269, + "learning_rate": 1.1661437548306524e-06, + "loss": 0.8053807, + "num_input_tokens_seen": 232504875, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11975098, + "step": 10774, + "time_per_iteration": 2.412487506866455 + }, + { + "auxiliary_loss_clip": 0.01124076, + "auxiliary_loss_mlp": 0.01040352, + "balance_loss_clip": 1.04622483, + "balance_loss_mlp": 1.02763247, + "epoch": 0.6478280474973696, + "flos": 21032305620480.0, + "grad_norm": 7.312658043175546, + "language_loss": 0.69273627, + "learning_rate": 1.1657897735526867e-06, + "loss": 0.7143805, + "num_input_tokens_seen": 232521945, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.1272583, + "step": 10775, + "time_per_iteration": 2.4709479808807373 + }, + { + "auxiliary_loss_clip": 0.01126701, + "auxiliary_loss_mlp": 0.0103265, + "balance_loss_clip": 1.04795086, + "balance_loss_mlp": 1.02005601, + "epoch": 0.6478881707500376, + "flos": 21616931381760.0, + "grad_norm": 2.2245339961543524, + "language_loss": 0.65522456, + "learning_rate": 1.1654358239080574e-06, + "loss": 0.67681813, + "num_input_tokens_seen": 232541500, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.12591553, + "step": 10776, + "time_per_iteration": 3.9217164516448975 + }, + { + "auxiliary_loss_clip": 0.0112672, + "auxiliary_loss_mlp": 0.01034045, + "balance_loss_clip": 1.0491358, + "balance_loss_mlp": 1.02093792, + "epoch": 0.6479482940027056, + "flos": 18442571875200.0, + "grad_norm": 10.26820591495322, + "language_loss": 0.79281533, + "learning_rate": 1.1650819059101839e-06, + "loss": 0.81442297, + "num_input_tokens_seen": 232559720, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.13098145, + "step": 10777, + "time_per_iteration": 2.457815408706665 + }, + { + "auxiliary_loss_clip": 0.01118881, + "auxiliary_loss_mlp": 0.01035859, + "balance_loss_clip": 1.04365206, + "balance_loss_mlp": 1.02262616, + "epoch": 0.6480084172553735, + "flos": 22164066322560.0, + "grad_norm": 1.990667744147001, + "language_loss": 0.73479223, + "learning_rate": 1.1647280195724896e-06, + "loss": 0.75633955, + "num_input_tokens_seen": 232579370, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.13238525, + "step": 10778, + "time_per_iteration": 2.5093557834625244 + }, + { + "auxiliary_loss_clip": 0.01112921, + "auxiliary_loss_mlp": 0.01040925, + "balance_loss_clip": 1.03936458, + "balance_loss_mlp": 1.02604771, + "epoch": 0.6480685405080415, + "flos": 24316228586880.0, + "grad_norm": 1.7210730587298104, + "language_loss": 0.78025603, + "learning_rate": 1.1643741649083923e-06, + "loss": 0.80179453, + "num_input_tokens_seen": 232600495, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.14880371, + "step": 10779, + "time_per_iteration": 2.568706750869751 + }, + { + "auxiliary_loss_clip": 0.01044711, + "auxiliary_loss_mlp": 0.01004901, + "balance_loss_clip": 1.01959705, + "balance_loss_mlp": 1.00343204, + "epoch": 0.6481286637607094, + "flos": 59891207760000.0, + "grad_norm": 0.7256365233325744, + "language_loss": 0.59408975, + "learning_rate": 1.1640203419313095e-06, + "loss": 0.61458588, + "num_input_tokens_seen": 232663165, + "router_z_loss_clip": 0.25097656, + "router_z_loss_mlp": 0.01469421, + "step": 10780, + "time_per_iteration": 3.075377941131592 + }, + { + "auxiliary_loss_clip": 0.01117812, + "auxiliary_loss_mlp": 0.01034927, + "balance_loss_clip": 1.04273057, + "balance_loss_mlp": 1.02151632, + "epoch": 0.6481887870133775, + "flos": 25484187219840.0, + "grad_norm": 2.0631717447038866, + "language_loss": 0.79461586, + "learning_rate": 1.1636665506546599e-06, + "loss": 0.81614333, + "num_input_tokens_seen": 232683385, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.13409424, + "step": 10781, + "time_per_iteration": 2.490415096282959 + }, + { + "auxiliary_loss_clip": 0.01125702, + "auxiliary_loss_mlp": 0.0103551, + "balance_loss_clip": 1.04869986, + "balance_loss_mlp": 1.02166975, + "epoch": 0.6482489102660454, + "flos": 19930206574080.0, + "grad_norm": 1.9884436193995634, + "language_loss": 0.78633213, + "learning_rate": 1.1633127910918578e-06, + "loss": 0.80794418, + "num_input_tokens_seen": 232699095, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.13842773, + "step": 10782, + "time_per_iteration": 2.4371259212493896 + }, + { + "auxiliary_loss_clip": 0.01124415, + "auxiliary_loss_mlp": 0.01034906, + "balance_loss_clip": 1.04833078, + "balance_loss_mlp": 1.02121484, + "epoch": 0.6483090335187134, + "flos": 26979471515520.0, + "grad_norm": 2.4419994332353663, + "language_loss": 0.64011192, + "learning_rate": 1.1629590632563187e-06, + "loss": 0.66170514, + "num_input_tokens_seen": 232717920, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.13702393, + "step": 10783, + "time_per_iteration": 2.497075080871582 + }, + { + "auxiliary_loss_clip": 0.01130652, + "auxiliary_loss_mlp": 0.01037068, + "balance_loss_clip": 1.05066872, + "balance_loss_mlp": 1.02269161, + "epoch": 0.6483691567713813, + "flos": 25077965333760.0, + "grad_norm": 2.0998418475405334, + "language_loss": 0.88780767, + "learning_rate": 1.1626053671614561e-06, + "loss": 0.90948486, + "num_input_tokens_seen": 232737605, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.14385986, + "step": 10784, + "time_per_iteration": 2.5100512504577637 + }, + { + "auxiliary_loss_clip": 0.01116407, + "auxiliary_loss_mlp": 0.01037152, + "balance_loss_clip": 1.04195619, + "balance_loss_mlp": 1.0219115, + "epoch": 0.6484292800240493, + "flos": 16105972250880.0, + "grad_norm": 4.3357980434907075, + "language_loss": 0.73283815, + "learning_rate": 1.1622517028206815e-06, + "loss": 0.75437379, + "num_input_tokens_seen": 232755110, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.15222168, + "step": 10785, + "time_per_iteration": 2.447265386581421 + }, + { + "auxiliary_loss_clip": 0.01112148, + "auxiliary_loss_mlp": 0.01034349, + "balance_loss_clip": 1.04010868, + "balance_loss_mlp": 1.02126026, + "epoch": 0.6484894032767172, + "flos": 28840398307200.0, + "grad_norm": 1.8493952162826304, + "language_loss": 0.69427645, + "learning_rate": 1.1618980702474071e-06, + "loss": 0.71574146, + "num_input_tokens_seen": 232779040, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.13092041, + "step": 10786, + "time_per_iteration": 2.532635450363159 + }, + { + "auxiliary_loss_clip": 0.01122392, + "auxiliary_loss_mlp": 0.01031422, + "balance_loss_clip": 1.04789329, + "balance_loss_mlp": 1.01792169, + "epoch": 0.6485495265293852, + "flos": 30227052896640.0, + "grad_norm": 2.8357425679902257, + "language_loss": 0.71020257, + "learning_rate": 1.161544469455041e-06, + "loss": 0.73174071, + "num_input_tokens_seen": 232800515, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.1350708, + "step": 10787, + "time_per_iteration": 2.5360987186431885 + }, + { + "auxiliary_loss_clip": 0.01133949, + "auxiliary_loss_mlp": 0.01031883, + "balance_loss_clip": 1.05405068, + "balance_loss_mlp": 1.01911604, + "epoch": 0.6486096497820532, + "flos": 20082181017600.0, + "grad_norm": 1.9204964862554283, + "language_loss": 0.84622526, + "learning_rate": 1.1611909004569934e-06, + "loss": 0.86788362, + "num_input_tokens_seen": 232818450, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.12768555, + "step": 10788, + "time_per_iteration": 2.4360220432281494 + }, + { + "auxiliary_loss_clip": 0.01122255, + "auxiliary_loss_mlp": 0.01031923, + "balance_loss_clip": 1.04630303, + "balance_loss_mlp": 1.01706409, + "epoch": 0.6486697730347212, + "flos": 17129067333120.0, + "grad_norm": 1.945065635273478, + "language_loss": 0.77743638, + "learning_rate": 1.1608373632666708e-06, + "loss": 0.79897815, + "num_input_tokens_seen": 232834785, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.14868164, + "step": 10789, + "time_per_iteration": 3.848615884780884 + }, + { + "auxiliary_loss_clip": 0.01113724, + "auxiliary_loss_mlp": 0.01031712, + "balance_loss_clip": 1.04146671, + "balance_loss_mlp": 1.01840854, + "epoch": 0.6487298962873892, + "flos": 38911940570880.0, + "grad_norm": 1.7062780858533233, + "language_loss": 0.76361775, + "learning_rate": 1.160483857897479e-06, + "loss": 0.78507209, + "num_input_tokens_seen": 232856050, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.13305664, + "step": 10790, + "time_per_iteration": 2.607520580291748 + }, + { + "auxiliary_loss_clip": 0.0112067, + "auxiliary_loss_mlp": 0.01036382, + "balance_loss_clip": 1.04696035, + "balance_loss_mlp": 1.02353716, + "epoch": 0.6487900195400571, + "flos": 11947840076160.0, + "grad_norm": 2.240351745599954, + "language_loss": 0.60059249, + "learning_rate": 1.160130384362823e-06, + "loss": 0.62216306, + "num_input_tokens_seen": 232873945, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.12841797, + "step": 10791, + "time_per_iteration": 2.4024171829223633 + }, + { + "auxiliary_loss_clip": 0.0112634, + "auxiliary_loss_mlp": 0.01033123, + "balance_loss_clip": 1.04786754, + "balance_loss_mlp": 1.01973009, + "epoch": 0.6488501427927251, + "flos": 22344445445760.0, + "grad_norm": 1.6877217322515463, + "language_loss": 0.8597821, + "learning_rate": 1.1597769426761082e-06, + "loss": 0.88137674, + "num_input_tokens_seen": 232892160, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.1338501, + "step": 10792, + "time_per_iteration": 2.499734401702881 + }, + { + "auxiliary_loss_clip": 0.01123519, + "auxiliary_loss_mlp": 0.01043332, + "balance_loss_clip": 1.04522347, + "balance_loss_mlp": 1.02827621, + "epoch": 0.648910266045393, + "flos": 22236282616320.0, + "grad_norm": 2.2248607353826557, + "language_loss": 0.77902389, + "learning_rate": 1.159423532850735e-06, + "loss": 0.80069244, + "num_input_tokens_seen": 232911725, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.15057373, + "step": 10793, + "time_per_iteration": 2.4757368564605713 + }, + { + "auxiliary_loss_clip": 0.01124724, + "auxiliary_loss_mlp": 0.01028403, + "balance_loss_clip": 1.0462563, + "balance_loss_mlp": 1.01572514, + "epoch": 0.6489703892980611, + "flos": 25301258231040.0, + "grad_norm": 1.8211275662611601, + "language_loss": 0.74701309, + "learning_rate": 1.1590701549001055e-06, + "loss": 0.76854438, + "num_input_tokens_seen": 232929085, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.12677002, + "step": 10794, + "time_per_iteration": 3.903015613555908 + }, + { + "auxiliary_loss_clip": 0.01129965, + "auxiliary_loss_mlp": 0.01032094, + "balance_loss_clip": 1.05329132, + "balance_loss_mlp": 1.01970172, + "epoch": 0.649030512550729, + "flos": 24571912573440.0, + "grad_norm": 1.7257239170647738, + "language_loss": 0.6981138, + "learning_rate": 1.158716808837621e-06, + "loss": 0.71973437, + "num_input_tokens_seen": 232949455, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.12384033, + "step": 10795, + "time_per_iteration": 2.4970667362213135 + }, + { + "auxiliary_loss_clip": 0.01128639, + "auxiliary_loss_mlp": 0.01033163, + "balance_loss_clip": 1.04806423, + "balance_loss_mlp": 1.0186969, + "epoch": 0.649090635803397, + "flos": 26244702904320.0, + "grad_norm": 2.05519758545726, + "language_loss": 0.53571928, + "learning_rate": 1.158363494676679e-06, + "loss": 0.55733734, + "num_input_tokens_seen": 232969445, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.14465332, + "step": 10796, + "time_per_iteration": 2.4871318340301514 + }, + { + "auxiliary_loss_clip": 0.01120158, + "auxiliary_loss_mlp": 0.01028348, + "balance_loss_clip": 1.04538536, + "balance_loss_mlp": 1.01630759, + "epoch": 0.6491507590560649, + "flos": 24937375501440.0, + "grad_norm": 1.6104161450806698, + "language_loss": 0.77660632, + "learning_rate": 1.1580102124306775e-06, + "loss": 0.79809135, + "num_input_tokens_seen": 232988900, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.12042236, + "step": 10797, + "time_per_iteration": 2.4749605655670166 + }, + { + "auxiliary_loss_clip": 0.01113658, + "auxiliary_loss_mlp": 0.01033388, + "balance_loss_clip": 1.04202223, + "balance_loss_mlp": 1.01997709, + "epoch": 0.6492108823087329, + "flos": 19499781899520.0, + "grad_norm": 2.0784508249049276, + "language_loss": 0.71148658, + "learning_rate": 1.1576569621130134e-06, + "loss": 0.73295707, + "num_input_tokens_seen": 233005060, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.13415527, + "step": 10798, + "time_per_iteration": 2.4716877937316895 + }, + { + "auxiliary_loss_clip": 0.01127294, + "auxiliary_loss_mlp": 0.01042317, + "balance_loss_clip": 1.04617906, + "balance_loss_mlp": 1.02921641, + "epoch": 0.6492710055614008, + "flos": 19719303868800.0, + "grad_norm": 1.9610959784809356, + "language_loss": 0.77161396, + "learning_rate": 1.1573037437370811e-06, + "loss": 0.79331005, + "num_input_tokens_seen": 233023375, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.13116455, + "step": 10799, + "time_per_iteration": 2.4930155277252197 + }, + { + "auxiliary_loss_clip": 0.01123188, + "auxiliary_loss_mlp": 0.01031514, + "balance_loss_clip": 1.04446864, + "balance_loss_mlp": 1.0185318, + "epoch": 0.6493311288140688, + "flos": 24317018686080.0, + "grad_norm": 2.3280689222753184, + "language_loss": 0.71613884, + "learning_rate": 1.1569505573162755e-06, + "loss": 0.73768592, + "num_input_tokens_seen": 233043130, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.12982178, + "step": 10800, + "time_per_iteration": 2.5060853958129883 + }, + { + "auxiliary_loss_clip": 0.01059377, + "auxiliary_loss_mlp": 0.010026, + "balance_loss_clip": 1.03361964, + "balance_loss_mlp": 1.00140917, + "epoch": 0.6493912520667368, + "flos": 70934635290240.0, + "grad_norm": 0.763824910728847, + "language_loss": 0.60209209, + "learning_rate": 1.1565974028639897e-06, + "loss": 0.62271184, + "num_input_tokens_seen": 233110560, + "router_z_loss_clip": 0.25732422, + "router_z_loss_mlp": 0.01190186, + "step": 10801, + "time_per_iteration": 4.589957237243652 + }, + { + "auxiliary_loss_clip": 0.01133873, + "auxiliary_loss_mlp": 0.01033675, + "balance_loss_clip": 1.05386293, + "balance_loss_mlp": 1.02034688, + "epoch": 0.6494513753194048, + "flos": 25337779384320.0, + "grad_norm": 2.00232507073823, + "language_loss": 0.78566563, + "learning_rate": 1.156244280393614e-06, + "loss": 0.80734116, + "num_input_tokens_seen": 233130080, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.13336182, + "step": 10802, + "time_per_iteration": 2.46250581741333 + }, + { + "auxiliary_loss_clip": 0.01120517, + "auxiliary_loss_mlp": 0.01041401, + "balance_loss_clip": 1.04280663, + "balance_loss_mlp": 1.02753651, + "epoch": 0.6495114985720728, + "flos": 24681978823680.0, + "grad_norm": 1.5964307572972676, + "language_loss": 0.7477982, + "learning_rate": 1.155891189918541e-06, + "loss": 0.76941735, + "num_input_tokens_seen": 233150235, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.13867188, + "step": 10803, + "time_per_iteration": 2.4709291458129883 + }, + { + "auxiliary_loss_clip": 0.01121011, + "auxiliary_loss_mlp": 0.01032706, + "balance_loss_clip": 1.04596901, + "balance_loss_mlp": 1.01995707, + "epoch": 0.6495716218247407, + "flos": 23651162317440.0, + "grad_norm": 3.359075737666724, + "language_loss": 0.69511175, + "learning_rate": 1.1555381314521578e-06, + "loss": 0.716649, + "num_input_tokens_seen": 233166710, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.12756348, + "step": 10804, + "time_per_iteration": 2.4370501041412354 + }, + { + "auxiliary_loss_clip": 0.01120586, + "auxiliary_loss_mlp": 0.01028998, + "balance_loss_clip": 1.04473817, + "balance_loss_mlp": 1.01569474, + "epoch": 0.6496317450774087, + "flos": 22346169298560.0, + "grad_norm": 3.0776800885733255, + "language_loss": 0.72539949, + "learning_rate": 1.1551851050078537e-06, + "loss": 0.74689531, + "num_input_tokens_seen": 233185445, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.13299561, + "step": 10805, + "time_per_iteration": 2.542412519454956 + }, + { + "auxiliary_loss_clip": 0.01122166, + "auxiliary_loss_mlp": 0.01030779, + "balance_loss_clip": 1.04565072, + "balance_loss_mlp": 1.0189774, + "epoch": 0.6496918683300766, + "flos": 30518647505280.0, + "grad_norm": 3.3290175145019676, + "language_loss": 0.65656799, + "learning_rate": 1.1548321105990155e-06, + "loss": 0.67809743, + "num_input_tokens_seen": 233205805, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.11810303, + "step": 10806, + "time_per_iteration": 2.503486394882202 + }, + { + "auxiliary_loss_clip": 0.01124068, + "auxiliary_loss_mlp": 0.01031065, + "balance_loss_clip": 1.04655552, + "balance_loss_mlp": 1.01775503, + "epoch": 0.6497519915827447, + "flos": 12458992567680.0, + "grad_norm": 2.16399115674675, + "language_loss": 0.78886247, + "learning_rate": 1.1544791482390275e-06, + "loss": 0.81041378, + "num_input_tokens_seen": 233224215, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.13305664, + "step": 10807, + "time_per_iteration": 2.4730961322784424 + }, + { + "auxiliary_loss_clip": 0.01051517, + "auxiliary_loss_mlp": 0.01002003, + "balance_loss_clip": 1.02615786, + "balance_loss_mlp": 1.00059366, + "epoch": 0.6498121148354126, + "flos": 69093748287360.0, + "grad_norm": 0.7869568084279461, + "language_loss": 0.5885402, + "learning_rate": 1.1541262179412745e-06, + "loss": 0.60907543, + "num_input_tokens_seen": 233294440, + "router_z_loss_clip": 0.25366211, + "router_z_loss_mlp": 0.01408386, + "step": 10808, + "time_per_iteration": 3.266738176345825 + }, + { + "auxiliary_loss_clip": 0.01116381, + "auxiliary_loss_mlp": 0.01027893, + "balance_loss_clip": 1.04424572, + "balance_loss_mlp": 1.01537013, + "epoch": 0.6498722380880806, + "flos": 36897135914880.0, + "grad_norm": 1.8818234989915574, + "language_loss": 0.63154376, + "learning_rate": 1.1537733197191415e-06, + "loss": 0.65298653, + "num_input_tokens_seen": 233316125, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.12530518, + "step": 10809, + "time_per_iteration": 2.5667145252227783 + }, + { + "auxiliary_loss_clip": 0.01121618, + "auxiliary_loss_mlp": 0.01037694, + "balance_loss_clip": 1.04803658, + "balance_loss_mlp": 1.02454543, + "epoch": 0.6499323613407485, + "flos": 29017760688000.0, + "grad_norm": 1.6429351257762626, + "language_loss": 0.81610858, + "learning_rate": 1.153420453586008e-06, + "loss": 0.83770174, + "num_input_tokens_seen": 233336140, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.1315918, + "step": 10810, + "time_per_iteration": 2.5240108966827393 + }, + { + "auxiliary_loss_clip": 0.01117385, + "auxiliary_loss_mlp": 0.01033556, + "balance_loss_clip": 1.0431757, + "balance_loss_mlp": 1.02193856, + "epoch": 0.6499924845934165, + "flos": 20119240874880.0, + "grad_norm": 1.6115753598358884, + "language_loss": 0.72037357, + "learning_rate": 1.1530676195552561e-06, + "loss": 0.74188298, + "num_input_tokens_seen": 233356095, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11621094, + "step": 10811, + "time_per_iteration": 2.445260524749756 + }, + { + "auxiliary_loss_clip": 0.01120527, + "auxiliary_loss_mlp": 0.01032304, + "balance_loss_clip": 1.04613113, + "balance_loss_mlp": 1.02018619, + "epoch": 0.6500526078460844, + "flos": 24421338760320.0, + "grad_norm": 1.4571134113132438, + "language_loss": 0.77865231, + "learning_rate": 1.1527148176402649e-06, + "loss": 0.80018061, + "num_input_tokens_seen": 233376830, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.12121582, + "step": 10812, + "time_per_iteration": 2.4874892234802246 + }, + { + "auxiliary_loss_clip": 0.0112025, + "auxiliary_loss_mlp": 0.0103537, + "balance_loss_clip": 1.04435086, + "balance_loss_mlp": 1.02203679, + "epoch": 0.6501127310987524, + "flos": 23331019374720.0, + "grad_norm": 1.7852762927786772, + "language_loss": 0.85220605, + "learning_rate": 1.152362047854413e-06, + "loss": 0.87376225, + "num_input_tokens_seen": 233395275, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.13330078, + "step": 10813, + "time_per_iteration": 2.434372901916504 + }, + { + "auxiliary_loss_clip": 0.01126068, + "auxiliary_loss_mlp": 0.01039154, + "balance_loss_clip": 1.04724383, + "balance_loss_mlp": 1.0268935, + "epoch": 0.6501728543514204, + "flos": 18697824898560.0, + "grad_norm": 1.9856223227925003, + "language_loss": 0.80146778, + "learning_rate": 1.1520093102110764e-06, + "loss": 0.82311994, + "num_input_tokens_seen": 233413345, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.12255859, + "step": 10814, + "time_per_iteration": 2.4257423877716064 + }, + { + "auxiliary_loss_clip": 0.0112335, + "auxiliary_loss_mlp": 0.01034108, + "balance_loss_clip": 1.04617119, + "balance_loss_mlp": 1.02127492, + "epoch": 0.6502329776040884, + "flos": 44199858199680.0, + "grad_norm": 1.7986136071266297, + "language_loss": 0.6542452, + "learning_rate": 1.1516566047236328e-06, + "loss": 0.67581975, + "num_input_tokens_seen": 233436105, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.12823486, + "step": 10815, + "time_per_iteration": 2.715609073638916 + }, + { + "auxiliary_loss_clip": 0.01126186, + "auxiliary_loss_mlp": 0.01032147, + "balance_loss_clip": 1.0460794, + "balance_loss_mlp": 1.0171144, + "epoch": 0.6502931008567564, + "flos": 14574741419520.0, + "grad_norm": 1.876140295392181, + "language_loss": 0.74826294, + "learning_rate": 1.1513039314054546e-06, + "loss": 0.76984626, + "num_input_tokens_seen": 233452320, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.15020752, + "step": 10816, + "time_per_iteration": 2.442628860473633 + }, + { + "auxiliary_loss_clip": 0.01127631, + "auxiliary_loss_mlp": 0.01029718, + "balance_loss_clip": 1.05245066, + "balance_loss_mlp": 1.0175705, + "epoch": 0.6503532241094243, + "flos": 21395003201280.0, + "grad_norm": 1.8518692822590714, + "language_loss": 0.7308656, + "learning_rate": 1.1509512902699174e-06, + "loss": 0.75243902, + "num_input_tokens_seen": 233469920, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12145996, + "step": 10817, + "time_per_iteration": 2.4998064041137695 + }, + { + "auxiliary_loss_clip": 0.01121902, + "auxiliary_loss_mlp": 0.01030826, + "balance_loss_clip": 1.04510427, + "balance_loss_mlp": 1.01793969, + "epoch": 0.6504133473620923, + "flos": 74740840986240.0, + "grad_norm": 1.3882095180401084, + "language_loss": 0.72297668, + "learning_rate": 1.1505986813303916e-06, + "loss": 0.74450397, + "num_input_tokens_seen": 233499780, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.12878418, + "step": 10818, + "time_per_iteration": 2.9187068939208984 + }, + { + "auxiliary_loss_clip": 0.01124866, + "auxiliary_loss_mlp": 0.01030098, + "balance_loss_clip": 1.04773891, + "balance_loss_mlp": 1.01763487, + "epoch": 0.6504734706147602, + "flos": 19713270384000.0, + "grad_norm": 2.164602210045584, + "language_loss": 0.65216154, + "learning_rate": 1.150246104600249e-06, + "loss": 0.67371118, + "num_input_tokens_seen": 233518235, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.12457275, + "step": 10819, + "time_per_iteration": 3.9570472240448 + }, + { + "auxiliary_loss_clip": 0.0112235, + "auxiliary_loss_mlp": 0.01032965, + "balance_loss_clip": 1.04591775, + "balance_loss_mlp": 1.02018023, + "epoch": 0.6505335938674283, + "flos": 25556870390400.0, + "grad_norm": 1.9664581109579073, + "language_loss": 0.83390343, + "learning_rate": 1.14989356009286e-06, + "loss": 0.85545659, + "num_input_tokens_seen": 233535215, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12780762, + "step": 10820, + "time_per_iteration": 2.485036849975586 + }, + { + "auxiliary_loss_clip": 0.01127698, + "auxiliary_loss_mlp": 0.01026971, + "balance_loss_clip": 1.04974008, + "balance_loss_mlp": 1.01379299, + "epoch": 0.6505937171200962, + "flos": 17821424960640.0, + "grad_norm": 2.829076928564814, + "language_loss": 0.78626668, + "learning_rate": 1.1495410478215914e-06, + "loss": 0.80781341, + "num_input_tokens_seen": 233552775, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.13171387, + "step": 10821, + "time_per_iteration": 2.4414451122283936 + }, + { + "auxiliary_loss_clip": 0.01125328, + "auxiliary_loss_mlp": 0.01027923, + "balance_loss_clip": 1.05279064, + "balance_loss_mlp": 1.01695538, + "epoch": 0.6506538403727642, + "flos": 20668135582080.0, + "grad_norm": 1.4739198309131367, + "language_loss": 0.80159795, + "learning_rate": 1.1491885677998126e-06, + "loss": 0.82313049, + "num_input_tokens_seen": 233572080, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.10974121, + "step": 10822, + "time_per_iteration": 2.6146373748779297 + }, + { + "auxiliary_loss_clip": 0.01122284, + "auxiliary_loss_mlp": 0.01028882, + "balance_loss_clip": 1.04630876, + "balance_loss_mlp": 1.01656759, + "epoch": 0.6507139636254321, + "flos": 11721422695680.0, + "grad_norm": 1.866100334029867, + "language_loss": 0.87431288, + "learning_rate": 1.1488361200408883e-06, + "loss": 0.89582449, + "num_input_tokens_seen": 233589155, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.12316895, + "step": 10823, + "time_per_iteration": 2.4270060062408447 + }, + { + "auxiliary_loss_clip": 0.01123066, + "auxiliary_loss_mlp": 0.01029563, + "balance_loss_clip": 1.04797816, + "balance_loss_mlp": 1.01721311, + "epoch": 0.6507740868781001, + "flos": 26761745226240.0, + "grad_norm": 2.2041874300592643, + "language_loss": 0.66907066, + "learning_rate": 1.148483704558183e-06, + "loss": 0.69059694, + "num_input_tokens_seen": 233608180, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.12353516, + "step": 10824, + "time_per_iteration": 2.5209264755249023 + }, + { + "auxiliary_loss_clip": 0.01129797, + "auxiliary_loss_mlp": 0.01030378, + "balance_loss_clip": 1.05252314, + "balance_loss_mlp": 1.01744974, + "epoch": 0.650834210130768, + "flos": 16471722487680.0, + "grad_norm": 2.379347636741772, + "language_loss": 0.87559772, + "learning_rate": 1.1481313213650607e-06, + "loss": 0.89719945, + "num_input_tokens_seen": 233625750, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.12927246, + "step": 10825, + "time_per_iteration": 2.4027583599090576 + }, + { + "auxiliary_loss_clip": 0.01122018, + "auxiliary_loss_mlp": 0.01028206, + "balance_loss_clip": 1.04446554, + "balance_loss_mlp": 1.01406789, + "epoch": 0.650894333383436, + "flos": 17128672283520.0, + "grad_norm": 2.6997599258544516, + "language_loss": 0.73205388, + "learning_rate": 1.147778970474885e-06, + "loss": 0.75355613, + "num_input_tokens_seen": 233644235, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.14141846, + "step": 10826, + "time_per_iteration": 2.4494566917419434 + }, + { + "auxiliary_loss_clip": 0.01125968, + "auxiliary_loss_mlp": 0.01028217, + "balance_loss_clip": 1.05000114, + "balance_loss_mlp": 1.01663613, + "epoch": 0.650954456636104, + "flos": 18734238311040.0, + "grad_norm": 2.8487342389496724, + "language_loss": 0.68788695, + "learning_rate": 1.1474266519010157e-06, + "loss": 0.70942885, + "num_input_tokens_seen": 233662845, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.11590576, + "step": 10827, + "time_per_iteration": 2.4529311656951904 + }, + { + "auxiliary_loss_clip": 0.01121742, + "auxiliary_loss_mlp": 0.01028643, + "balance_loss_clip": 1.04596627, + "balance_loss_mlp": 1.01721096, + "epoch": 0.651014579888772, + "flos": 24528244613760.0, + "grad_norm": 3.258180361572715, + "language_loss": 0.76617128, + "learning_rate": 1.1470743656568136e-06, + "loss": 0.78767514, + "num_input_tokens_seen": 233681990, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.11437988, + "step": 10828, + "time_per_iteration": 2.467860698699951 + }, + { + "auxiliary_loss_clip": 0.0111408, + "auxiliary_loss_mlp": 0.01030704, + "balance_loss_clip": 1.04040003, + "balance_loss_mlp": 1.01874089, + "epoch": 0.65107470314144, + "flos": 24061083304320.0, + "grad_norm": 1.8126454704677604, + "language_loss": 0.8926146, + "learning_rate": 1.1467221117556362e-06, + "loss": 0.91406238, + "num_input_tokens_seen": 233698930, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11956787, + "step": 10829, + "time_per_iteration": 2.5341975688934326 + }, + { + "auxiliary_loss_clip": 0.01045369, + "auxiliary_loss_mlp": 0.01004955, + "balance_loss_clip": 1.01985168, + "balance_loss_mlp": 1.00365567, + "epoch": 0.6511348263941079, + "flos": 72480734352000.0, + "grad_norm": 0.6387516390011992, + "language_loss": 0.55379903, + "learning_rate": 1.1463698902108428e-06, + "loss": 0.57430232, + "num_input_tokens_seen": 233769825, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.01300049, + "step": 10830, + "time_per_iteration": 3.3159823417663574 + }, + { + "auxiliary_loss_clip": 0.01114969, + "auxiliary_loss_mlp": 0.01030497, + "balance_loss_clip": 1.03804159, + "balance_loss_mlp": 1.01709843, + "epoch": 0.6511949496467759, + "flos": 23367684182400.0, + "grad_norm": 2.9636165913438255, + "language_loss": 0.75174367, + "learning_rate": 1.1460177010357878e-06, + "loss": 0.77319831, + "num_input_tokens_seen": 233787095, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.13409424, + "step": 10831, + "time_per_iteration": 2.601670742034912 + }, + { + "auxiliary_loss_clip": 0.01051765, + "auxiliary_loss_mlp": 0.01006022, + "balance_loss_clip": 1.02540398, + "balance_loss_mlp": 1.00456226, + "epoch": 0.6512550728994438, + "flos": 67333191073920.0, + "grad_norm": 0.829515289347243, + "language_loss": 0.50993705, + "learning_rate": 1.145665544243828e-06, + "loss": 0.53051496, + "num_input_tokens_seen": 233853050, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 0.01461792, + "step": 10832, + "time_per_iteration": 3.1851327419281006 + }, + { + "auxiliary_loss_clip": 0.01122023, + "auxiliary_loss_mlp": 0.01031889, + "balance_loss_clip": 1.04408181, + "balance_loss_mlp": 1.01873469, + "epoch": 0.6513151961521119, + "flos": 21141689512320.0, + "grad_norm": 3.2270949507860767, + "language_loss": 0.83223438, + "learning_rate": 1.145313419848316e-06, + "loss": 0.85377353, + "num_input_tokens_seen": 233871385, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.1317749, + "step": 10833, + "time_per_iteration": 2.5312466621398926 + }, + { + "auxiliary_loss_clip": 0.01118153, + "auxiliary_loss_mlp": 0.01041877, + "balance_loss_clip": 1.04254103, + "balance_loss_mlp": 1.02788138, + "epoch": 0.6513753194047798, + "flos": 15158828476800.0, + "grad_norm": 4.4007318999088305, + "language_loss": 0.83414435, + "learning_rate": 1.1449613278626049e-06, + "loss": 0.85574466, + "num_input_tokens_seen": 233888175, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.13989258, + "step": 10834, + "time_per_iteration": 3.808274984359741 + }, + { + "auxiliary_loss_clip": 0.01126742, + "auxiliary_loss_mlp": 0.01033195, + "balance_loss_clip": 1.0502255, + "balance_loss_mlp": 1.02064872, + "epoch": 0.6514354426574478, + "flos": 30226621933440.0, + "grad_norm": 1.5625604495403038, + "language_loss": 0.77317083, + "learning_rate": 1.1446092683000455e-06, + "loss": 0.79477024, + "num_input_tokens_seen": 233911470, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12561035, + "step": 10835, + "time_per_iteration": 2.559849262237549 + }, + { + "auxiliary_loss_clip": 0.01130925, + "auxiliary_loss_mlp": 0.01035978, + "balance_loss_clip": 1.05596209, + "balance_loss_mlp": 1.02358067, + "epoch": 0.6514955659101157, + "flos": 24205587719040.0, + "grad_norm": 1.415149568189423, + "language_loss": 0.77243477, + "learning_rate": 1.1442572411739882e-06, + "loss": 0.79410386, + "num_input_tokens_seen": 233932135, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.1239624, + "step": 10836, + "time_per_iteration": 2.5160200595855713 + }, + { + "auxiliary_loss_clip": 0.01120806, + "auxiliary_loss_mlp": 0.01030754, + "balance_loss_clip": 1.04561198, + "balance_loss_mlp": 1.01849353, + "epoch": 0.6515556891627837, + "flos": 12377761960320.0, + "grad_norm": 1.9297621659304065, + "language_loss": 0.82637584, + "learning_rate": 1.143905246497783e-06, + "loss": 0.84789145, + "num_input_tokens_seen": 233947880, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.12255859, + "step": 10837, + "time_per_iteration": 3.862724542617798 + }, + { + "auxiliary_loss_clip": 0.01120386, + "auxiliary_loss_mlp": 0.01030216, + "balance_loss_clip": 1.04747283, + "balance_loss_mlp": 1.01604795, + "epoch": 0.6516158124154516, + "flos": 49601217957120.0, + "grad_norm": 1.8481778145854066, + "language_loss": 0.58702326, + "learning_rate": 1.1435532842847758e-06, + "loss": 0.60852933, + "num_input_tokens_seen": 233971475, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.14172363, + "step": 10838, + "time_per_iteration": 2.714648485183716 + }, + { + "auxiliary_loss_clip": 0.01044077, + "auxiliary_loss_mlp": 0.01004206, + "balance_loss_clip": 1.01889551, + "balance_loss_mlp": 1.00291395, + "epoch": 0.6516759356681197, + "flos": 59702748076800.0, + "grad_norm": 0.7273537739676474, + "language_loss": 0.60783595, + "learning_rate": 1.1432013545483147e-06, + "loss": 0.62831879, + "num_input_tokens_seen": 234030690, + "router_z_loss_clip": 0.25097656, + "router_z_loss_mlp": 0.01292419, + "step": 10839, + "time_per_iteration": 3.157611846923828 + }, + { + "auxiliary_loss_clip": 0.01113578, + "auxiliary_loss_mlp": 0.01025056, + "balance_loss_clip": 1.04116702, + "balance_loss_mlp": 1.01363015, + "epoch": 0.6517360589207876, + "flos": 37450807130880.0, + "grad_norm": 1.676551871293447, + "language_loss": 0.67539209, + "learning_rate": 1.1428494573017439e-06, + "loss": 0.69677842, + "num_input_tokens_seen": 234052470, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11431885, + "step": 10840, + "time_per_iteration": 2.615889549255371 + }, + { + "auxiliary_loss_clip": 0.01111343, + "auxiliary_loss_mlp": 0.01031089, + "balance_loss_clip": 1.03750885, + "balance_loss_mlp": 1.01962733, + "epoch": 0.6517961821734556, + "flos": 25374911068800.0, + "grad_norm": 2.806898049264146, + "language_loss": 0.73479629, + "learning_rate": 1.1424975925584071e-06, + "loss": 0.75622058, + "num_input_tokens_seen": 234071495, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11474609, + "step": 10841, + "time_per_iteration": 2.567892551422119 + }, + { + "auxiliary_loss_clip": 0.01131883, + "auxiliary_loss_mlp": 0.01030418, + "balance_loss_clip": 1.05380893, + "balance_loss_mlp": 1.01834762, + "epoch": 0.6518563054261236, + "flos": 28766996864640.0, + "grad_norm": 1.6289049440155834, + "language_loss": 0.62490618, + "learning_rate": 1.142145760331648e-06, + "loss": 0.6465292, + "num_input_tokens_seen": 234092325, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.12072754, + "step": 10842, + "time_per_iteration": 2.587932586669922 + }, + { + "auxiliary_loss_clip": 0.01049611, + "auxiliary_loss_mlp": 0.01001595, + "balance_loss_clip": 1.02443194, + "balance_loss_mlp": 1.00014329, + "epoch": 0.6519164286787915, + "flos": 68924750797440.0, + "grad_norm": 0.8134327325088384, + "language_loss": 0.56123388, + "learning_rate": 1.141793960634807e-06, + "loss": 0.58174592, + "num_input_tokens_seen": 234148005, + "router_z_loss_clip": 0.25146484, + "router_z_loss_mlp": 0.01449585, + "step": 10843, + "time_per_iteration": 2.947821617126465 + }, + { + "auxiliary_loss_clip": 0.01119347, + "auxiliary_loss_mlp": 0.01036393, + "balance_loss_clip": 1.04024231, + "balance_loss_mlp": 1.02243948, + "epoch": 0.6519765519314595, + "flos": 20441933683200.0, + "grad_norm": 1.8480592770594284, + "language_loss": 0.8267833, + "learning_rate": 1.1414421934812253e-06, + "loss": 0.84834063, + "num_input_tokens_seen": 234164280, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.13952637, + "step": 10844, + "time_per_iteration": 2.5019869804382324 + }, + { + "auxiliary_loss_clip": 0.01120618, + "auxiliary_loss_mlp": 0.01030198, + "balance_loss_clip": 1.04534149, + "balance_loss_mlp": 1.01748991, + "epoch": 0.6520366751841274, + "flos": 28402970480640.0, + "grad_norm": 2.2014825941515417, + "language_loss": 0.59844398, + "learning_rate": 1.1410904588842421e-06, + "loss": 0.6199522, + "num_input_tokens_seen": 234185090, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.1270752, + "step": 10845, + "time_per_iteration": 3.942866563796997 + }, + { + "auxiliary_loss_clip": 0.01118086, + "auxiliary_loss_mlp": 0.01028185, + "balance_loss_clip": 1.04325223, + "balance_loss_mlp": 1.01541197, + "epoch": 0.6520967984367955, + "flos": 22273414300800.0, + "grad_norm": 1.6201933106199717, + "language_loss": 0.79176646, + "learning_rate": 1.140738756857194e-06, + "loss": 0.8132292, + "num_input_tokens_seen": 234204050, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.12780762, + "step": 10846, + "time_per_iteration": 2.4858577251434326 + }, + { + "auxiliary_loss_clip": 0.01051302, + "auxiliary_loss_mlp": 0.01003886, + "balance_loss_clip": 1.02589083, + "balance_loss_mlp": 1.00248098, + "epoch": 0.6521569216894634, + "flos": 68917140092160.0, + "grad_norm": 0.7073049856471323, + "language_loss": 0.60200286, + "learning_rate": 1.1403870874134192e-06, + "loss": 0.62255478, + "num_input_tokens_seen": 234269790, + "router_z_loss_clip": 0.25341797, + "router_z_loss_mlp": 0.01405334, + "step": 10847, + "time_per_iteration": 3.2190990447998047 + }, + { + "auxiliary_loss_clip": 0.01127188, + "auxiliary_loss_mlp": 0.01037436, + "balance_loss_clip": 1.0500927, + "balance_loss_mlp": 1.0244782, + "epoch": 0.6522170449421314, + "flos": 29130520458240.0, + "grad_norm": 1.586503810077542, + "language_loss": 0.80893052, + "learning_rate": 1.1400354505662514e-06, + "loss": 0.83057678, + "num_input_tokens_seen": 234290135, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.12957764, + "step": 10848, + "time_per_iteration": 2.541224956512451 + }, + { + "auxiliary_loss_clip": 0.01118369, + "auxiliary_loss_mlp": 0.01035739, + "balance_loss_clip": 1.04507828, + "balance_loss_mlp": 1.02316272, + "epoch": 0.6522771681947993, + "flos": 26651930371200.0, + "grad_norm": 2.242112049754089, + "language_loss": 0.74489987, + "learning_rate": 1.1396838463290263e-06, + "loss": 0.76644093, + "num_input_tokens_seen": 234309535, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.12561035, + "step": 10849, + "time_per_iteration": 2.524101972579956 + }, + { + "auxiliary_loss_clip": 0.01109798, + "auxiliary_loss_mlp": 0.01033883, + "balance_loss_clip": 1.03849626, + "balance_loss_mlp": 1.02152729, + "epoch": 0.6523372914474673, + "flos": 25739763465600.0, + "grad_norm": 1.4836369783479078, + "language_loss": 0.68099993, + "learning_rate": 1.1393322747150752e-06, + "loss": 0.7024368, + "num_input_tokens_seen": 234328755, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.12347412, + "step": 10850, + "time_per_iteration": 2.5542690753936768 + }, + { + "auxiliary_loss_clip": 0.01110742, + "auxiliary_loss_mlp": 0.01028679, + "balance_loss_clip": 1.03983212, + "balance_loss_mlp": 1.01679969, + "epoch": 0.6523974147001352, + "flos": 24827345164800.0, + "grad_norm": 1.7161819262964932, + "language_loss": 0.66405666, + "learning_rate": 1.1389807357377313e-06, + "loss": 0.68545091, + "num_input_tokens_seen": 234348655, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.11877441, + "step": 10851, + "time_per_iteration": 2.5298755168914795 + }, + { + "auxiliary_loss_clip": 0.01123509, + "auxiliary_loss_mlp": 0.01030464, + "balance_loss_clip": 1.04658127, + "balance_loss_mlp": 1.0172435, + "epoch": 0.6524575379528033, + "flos": 26317637470080.0, + "grad_norm": 2.4459607497866607, + "language_loss": 0.73684698, + "learning_rate": 1.1386292294103235e-06, + "loss": 0.75838673, + "num_input_tokens_seen": 234367445, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.13232422, + "step": 10852, + "time_per_iteration": 2.5309226512908936 + }, + { + "auxiliary_loss_clip": 0.01120946, + "auxiliary_loss_mlp": 0.01032244, + "balance_loss_clip": 1.04212737, + "balance_loss_mlp": 1.01818335, + "epoch": 0.6525176612054712, + "flos": 19494143464320.0, + "grad_norm": 1.8988703085137157, + "language_loss": 0.66916502, + "learning_rate": 1.1382777557461812e-06, + "loss": 0.6906969, + "num_input_tokens_seen": 234384825, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.14068604, + "step": 10853, + "time_per_iteration": 2.455629825592041 + }, + { + "auxiliary_loss_clip": 0.0104456, + "auxiliary_loss_mlp": 0.01001779, + "balance_loss_clip": 1.01911879, + "balance_loss_mlp": 1.00045002, + "epoch": 0.6525777844581392, + "flos": 71706894721920.0, + "grad_norm": 0.7191805904719663, + "language_loss": 0.62973684, + "learning_rate": 1.137926314758634e-06, + "loss": 0.65020025, + "num_input_tokens_seen": 234450630, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.01330566, + "step": 10854, + "time_per_iteration": 3.263906955718994 + }, + { + "auxiliary_loss_clip": 0.01121165, + "auxiliary_loss_mlp": 0.01040438, + "balance_loss_clip": 1.04299116, + "balance_loss_mlp": 1.0262996, + "epoch": 0.6526379077108072, + "flos": 26653115520000.0, + "grad_norm": 1.8122261560501476, + "language_loss": 0.77674007, + "learning_rate": 1.1375749064610072e-06, + "loss": 0.79835612, + "num_input_tokens_seen": 234473505, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.14129639, + "step": 10855, + "time_per_iteration": 2.511627674102783 + }, + { + "auxiliary_loss_clip": 0.01114402, + "auxiliary_loss_mlp": 0.01028359, + "balance_loss_clip": 1.04136395, + "balance_loss_mlp": 1.01566339, + "epoch": 0.6526980309634751, + "flos": 22820369673600.0, + "grad_norm": 2.602268625980567, + "language_loss": 0.79245973, + "learning_rate": 1.1372235308666256e-06, + "loss": 0.81388736, + "num_input_tokens_seen": 234492485, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.1270752, + "step": 10856, + "time_per_iteration": 2.515195608139038 + }, + { + "auxiliary_loss_clip": 0.01119453, + "auxiliary_loss_mlp": 0.01030578, + "balance_loss_clip": 1.0442307, + "balance_loss_mlp": 1.01701164, + "epoch": 0.6527581542161431, + "flos": 28365048696960.0, + "grad_norm": 1.875680963232038, + "language_loss": 0.73705113, + "learning_rate": 1.136872187988815e-06, + "loss": 0.75855148, + "num_input_tokens_seen": 234512645, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.13574219, + "step": 10857, + "time_per_iteration": 2.516319513320923 + }, + { + "auxiliary_loss_clip": 0.01119065, + "auxiliary_loss_mlp": 0.01029928, + "balance_loss_clip": 1.04361951, + "balance_loss_mlp": 1.01810265, + "epoch": 0.652818277468811, + "flos": 18369206346240.0, + "grad_norm": 2.0527878153694683, + "language_loss": 0.63122165, + "learning_rate": 1.1365208778408965e-06, + "loss": 0.65271151, + "num_input_tokens_seen": 234529310, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11810303, + "step": 10858, + "time_per_iteration": 2.464913845062256 + }, + { + "auxiliary_loss_clip": 0.01117856, + "auxiliary_loss_mlp": 0.01032576, + "balance_loss_clip": 1.04399633, + "balance_loss_mlp": 1.02065563, + "epoch": 0.6528784007214791, + "flos": 18036170421120.0, + "grad_norm": 1.6356116752180432, + "language_loss": 0.78449345, + "learning_rate": 1.1361696004361939e-06, + "loss": 0.80599779, + "num_input_tokens_seen": 234546685, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11920166, + "step": 10859, + "time_per_iteration": 2.5059945583343506 + }, + { + "auxiliary_loss_clip": 0.01120914, + "auxiliary_loss_mlp": 0.01028853, + "balance_loss_clip": 1.04369235, + "balance_loss_mlp": 1.01604974, + "epoch": 0.652938523974147, + "flos": 22382008093440.0, + "grad_norm": 1.6637881840852644, + "language_loss": 0.68156338, + "learning_rate": 1.1358183557880256e-06, + "loss": 0.7030611, + "num_input_tokens_seen": 234566255, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.12817383, + "step": 10860, + "time_per_iteration": 2.5144195556640625 + }, + { + "auxiliary_loss_clip": 0.01126397, + "auxiliary_loss_mlp": 0.01031251, + "balance_loss_clip": 1.04934216, + "balance_loss_mlp": 1.01874614, + "epoch": 0.652998647226815, + "flos": 16764035368320.0, + "grad_norm": 1.9297269073408958, + "language_loss": 0.66507745, + "learning_rate": 1.135467143909712e-06, + "loss": 0.68665397, + "num_input_tokens_seen": 234585405, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.12506104, + "step": 10861, + "time_per_iteration": 2.452453374862671 + }, + { + "auxiliary_loss_clip": 0.01122578, + "auxiliary_loss_mlp": 0.01038234, + "balance_loss_clip": 1.04631829, + "balance_loss_mlp": 1.02229571, + "epoch": 0.6530587704794829, + "flos": 35772522019200.0, + "grad_norm": 1.789567824611238, + "language_loss": 0.64915162, + "learning_rate": 1.135115964814572e-06, + "loss": 0.67075974, + "num_input_tokens_seen": 234608095, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.1595459, + "step": 10862, + "time_per_iteration": 3.9912304878234863 + }, + { + "auxiliary_loss_clip": 0.01129167, + "auxiliary_loss_mlp": 0.0104032, + "balance_loss_clip": 1.05053687, + "balance_loss_mlp": 1.0276835, + "epoch": 0.6531188937321509, + "flos": 19316134638720.0, + "grad_norm": 1.6741173041898834, + "language_loss": 0.77230287, + "learning_rate": 1.13476481851592e-06, + "loss": 0.7939977, + "num_input_tokens_seen": 234627335, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.12640381, + "step": 10863, + "time_per_iteration": 2.522914409637451 + }, + { + "auxiliary_loss_clip": 0.01125249, + "auxiliary_loss_mlp": 0.01031719, + "balance_loss_clip": 1.04760242, + "balance_loss_mlp": 1.01969719, + "epoch": 0.6531790169848188, + "flos": 22893771116160.0, + "grad_norm": 1.7074864272877128, + "language_loss": 0.74652874, + "learning_rate": 1.1344137050270739e-06, + "loss": 0.76809841, + "num_input_tokens_seen": 234646540, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.12036133, + "step": 10864, + "time_per_iteration": 2.522705554962158 + }, + { + "auxiliary_loss_clip": 0.01124124, + "auxiliary_loss_mlp": 0.01032423, + "balance_loss_clip": 1.05080295, + "balance_loss_mlp": 1.02095532, + "epoch": 0.6532391402374869, + "flos": 29563530912000.0, + "grad_norm": 1.9690858176179502, + "language_loss": 0.86106598, + "learning_rate": 1.1340626243613458e-06, + "loss": 0.88263148, + "num_input_tokens_seen": 234665470, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11462402, + "step": 10865, + "time_per_iteration": 2.581270694732666 + }, + { + "auxiliary_loss_clip": 0.0112499, + "auxiliary_loss_mlp": 0.01035309, + "balance_loss_clip": 1.04664612, + "balance_loss_mlp": 1.02275026, + "epoch": 0.6532992634901548, + "flos": 23105463920640.0, + "grad_norm": 1.585909075557989, + "language_loss": 0.81485671, + "learning_rate": 1.133711576532051e-06, + "loss": 0.83645976, + "num_input_tokens_seen": 234683955, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.12542725, + "step": 10866, + "time_per_iteration": 2.5341219902038574 + }, + { + "auxiliary_loss_clip": 0.01115774, + "auxiliary_loss_mlp": 0.01030616, + "balance_loss_clip": 1.04266, + "balance_loss_mlp": 1.01814699, + "epoch": 0.6533593867428228, + "flos": 26067340523520.0, + "grad_norm": 1.975951684001041, + "language_loss": 0.81984228, + "learning_rate": 1.1333605615524995e-06, + "loss": 0.84130615, + "num_input_tokens_seen": 234704595, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.12469482, + "step": 10867, + "time_per_iteration": 2.5572729110717773 + }, + { + "auxiliary_loss_clip": 0.01114361, + "auxiliary_loss_mlp": 0.01027222, + "balance_loss_clip": 1.03951716, + "balance_loss_mlp": 1.01540244, + "epoch": 0.6534195099954908, + "flos": 21212469262080.0, + "grad_norm": 2.8402692893569124, + "language_loss": 0.81018567, + "learning_rate": 1.1330095794360016e-06, + "loss": 0.8316015, + "num_input_tokens_seen": 234724090, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.1182251, + "step": 10868, + "time_per_iteration": 2.51448655128479 + }, + { + "auxiliary_loss_clip": 0.01127882, + "auxiliary_loss_mlp": 0.01033189, + "balance_loss_clip": 1.04852676, + "balance_loss_mlp": 1.01935518, + "epoch": 0.6534796332481587, + "flos": 19646584784640.0, + "grad_norm": 1.969888546251799, + "language_loss": 0.79710704, + "learning_rate": 1.1326586301958675e-06, + "loss": 0.81871772, + "num_input_tokens_seen": 234742560, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.13824463, + "step": 10869, + "time_per_iteration": 2.465038299560547 + }, + { + "auxiliary_loss_clip": 0.01130968, + "auxiliary_loss_mlp": 0.01033112, + "balance_loss_clip": 1.05356622, + "balance_loss_mlp": 1.0203923, + "epoch": 0.6535397565008267, + "flos": 24022479162240.0, + "grad_norm": 1.751092918884509, + "language_loss": 0.72186577, + "learning_rate": 1.1323077138454063e-06, + "loss": 0.74350655, + "num_input_tokens_seen": 234762315, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.12744141, + "step": 10870, + "time_per_iteration": 2.498037099838257 + }, + { + "auxiliary_loss_clip": 0.01126691, + "auxiliary_loss_mlp": 0.01034578, + "balance_loss_clip": 1.05233383, + "balance_loss_mlp": 1.02227592, + "epoch": 0.6535998797534947, + "flos": 24602759377920.0, + "grad_norm": 2.2878753812868764, + "language_loss": 0.74838293, + "learning_rate": 1.1319568303979221e-06, + "loss": 0.76999557, + "num_input_tokens_seen": 234781300, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.12310791, + "step": 10871, + "time_per_iteration": 2.5177087783813477 + }, + { + "auxiliary_loss_clip": 0.01118506, + "auxiliary_loss_mlp": 0.01032332, + "balance_loss_clip": 1.04822922, + "balance_loss_mlp": 1.0207386, + "epoch": 0.6536600030061627, + "flos": 23364164649600.0, + "grad_norm": 1.5778790990173148, + "language_loss": 0.55670017, + "learning_rate": 1.1316059798667227e-06, + "loss": 0.57820857, + "num_input_tokens_seen": 234801040, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.1159668, + "step": 10872, + "time_per_iteration": 2.5666184425354004 + }, + { + "auxiliary_loss_clip": 0.01113259, + "auxiliary_loss_mlp": 0.01034038, + "balance_loss_clip": 1.04153132, + "balance_loss_mlp": 1.02172923, + "epoch": 0.6537201262588306, + "flos": 23878477537920.0, + "grad_norm": 1.6661092542598241, + "language_loss": 0.74976879, + "learning_rate": 1.1312551622651112e-06, + "loss": 0.77124178, + "num_input_tokens_seen": 234821415, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.12316895, + "step": 10873, + "time_per_iteration": 2.5350961685180664 + }, + { + "auxiliary_loss_clip": 0.01114416, + "auxiliary_loss_mlp": 0.01029777, + "balance_loss_clip": 1.04036045, + "balance_loss_mlp": 1.01743853, + "epoch": 0.6537802495114986, + "flos": 24354760901760.0, + "grad_norm": 1.5394578241988215, + "language_loss": 0.75456834, + "learning_rate": 1.1309043776063917e-06, + "loss": 0.77601027, + "num_input_tokens_seen": 234843795, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12347412, + "step": 10874, + "time_per_iteration": 2.538257122039795 + }, + { + "auxiliary_loss_clip": 0.0111561, + "auxiliary_loss_mlp": 0.01033941, + "balance_loss_clip": 1.04071105, + "balance_loss_mlp": 1.02146578, + "epoch": 0.6538403727641665, + "flos": 27996892248960.0, + "grad_norm": 1.7185819349387368, + "language_loss": 0.81568623, + "learning_rate": 1.1305536259038642e-06, + "loss": 0.83718175, + "num_input_tokens_seen": 234862350, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.12475586, + "step": 10875, + "time_per_iteration": 2.5345911979675293 + }, + { + "auxiliary_loss_clip": 0.01120952, + "auxiliary_loss_mlp": 0.01034411, + "balance_loss_clip": 1.04605854, + "balance_loss_mlp": 1.02268672, + "epoch": 0.6539004960168345, + "flos": 27563594486400.0, + "grad_norm": 1.8672685890644967, + "language_loss": 0.69733918, + "learning_rate": 1.1302029071708314e-06, + "loss": 0.71889281, + "num_input_tokens_seen": 234881790, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11730957, + "step": 10876, + "time_per_iteration": 2.5588464736938477 + }, + { + "auxiliary_loss_clip": 0.01116434, + "auxiliary_loss_mlp": 0.01036337, + "balance_loss_clip": 1.04313493, + "balance_loss_mlp": 1.02390337, + "epoch": 0.6539606192695024, + "flos": 14530067879040.0, + "grad_norm": 2.113202671504352, + "language_loss": 0.79543245, + "learning_rate": 1.1298522214205908e-06, + "loss": 0.81696022, + "num_input_tokens_seen": 234897775, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.12426758, + "step": 10877, + "time_per_iteration": 3.8689181804656982 + }, + { + "auxiliary_loss_clip": 0.01119419, + "auxiliary_loss_mlp": 0.0102674, + "balance_loss_clip": 1.04429901, + "balance_loss_mlp": 1.01439047, + "epoch": 0.6540207425221705, + "flos": 21616356764160.0, + "grad_norm": 2.6160842097259103, + "language_loss": 0.79807246, + "learning_rate": 1.1295015686664408e-06, + "loss": 0.81953406, + "num_input_tokens_seen": 234918395, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.12359619, + "step": 10878, + "time_per_iteration": 2.5697388648986816 + }, + { + "auxiliary_loss_clip": 0.01112996, + "auxiliary_loss_mlp": 0.01033962, + "balance_loss_clip": 1.03946495, + "balance_loss_mlp": 1.01961493, + "epoch": 0.6540808657748384, + "flos": 17668983640320.0, + "grad_norm": 9.906542155077767, + "language_loss": 0.8450644, + "learning_rate": 1.1291509489216797e-06, + "loss": 0.86653399, + "num_input_tokens_seen": 234936260, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.14343262, + "step": 10879, + "time_per_iteration": 2.504722833633423 + }, + { + "auxiliary_loss_clip": 0.01116914, + "auxiliary_loss_mlp": 0.01032832, + "balance_loss_clip": 1.04007244, + "balance_loss_mlp": 1.01955843, + "epoch": 0.6541409890275064, + "flos": 14538292093440.0, + "grad_norm": 3.477288689391096, + "language_loss": 0.71541339, + "learning_rate": 1.128800362199601e-06, + "loss": 0.73691082, + "num_input_tokens_seen": 234952110, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.1328125, + "step": 10880, + "time_per_iteration": 3.836982488632202 + }, + { + "auxiliary_loss_clip": 0.01114602, + "auxiliary_loss_mlp": 0.01035854, + "balance_loss_clip": 1.04117644, + "balance_loss_mlp": 1.02234805, + "epoch": 0.6542011122801744, + "flos": 17165301177600.0, + "grad_norm": 3.470719293281799, + "language_loss": 0.84419096, + "learning_rate": 1.1284498085135005e-06, + "loss": 0.86569548, + "num_input_tokens_seen": 234970810, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.13494873, + "step": 10881, + "time_per_iteration": 2.4175496101379395 + }, + { + "auxiliary_loss_clip": 0.01121864, + "auxiliary_loss_mlp": 0.01038865, + "balance_loss_clip": 1.04251218, + "balance_loss_mlp": 1.0243454, + "epoch": 0.6542612355328423, + "flos": 18186600579840.0, + "grad_norm": 1.9640252527469018, + "language_loss": 0.78264523, + "learning_rate": 1.1280992878766699e-06, + "loss": 0.80425256, + "num_input_tokens_seen": 234989565, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.14520264, + "step": 10882, + "time_per_iteration": 2.440246820449829 + }, + { + "auxiliary_loss_clip": 0.01128671, + "auxiliary_loss_mlp": 0.0102972, + "balance_loss_clip": 1.05141842, + "balance_loss_mlp": 1.01677418, + "epoch": 0.6543213587855103, + "flos": 19792453916160.0, + "grad_norm": 1.794592846407162, + "language_loss": 0.82008135, + "learning_rate": 1.1277488003024024e-06, + "loss": 0.84166527, + "num_input_tokens_seen": 235007955, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.12939453, + "step": 10883, + "time_per_iteration": 2.4588985443115234 + }, + { + "auxiliary_loss_clip": 0.01125566, + "auxiliary_loss_mlp": 0.01039458, + "balance_loss_clip": 1.04976404, + "balance_loss_mlp": 1.02630901, + "epoch": 0.6543814820381783, + "flos": 21105096531840.0, + "grad_norm": 2.465454081940214, + "language_loss": 0.85131955, + "learning_rate": 1.127398345803988e-06, + "loss": 0.87296981, + "num_input_tokens_seen": 235024860, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.1315918, + "step": 10884, + "time_per_iteration": 2.5936193466186523 + }, + { + "auxiliary_loss_clip": 0.01129214, + "auxiliary_loss_mlp": 0.01037688, + "balance_loss_clip": 1.05344009, + "balance_loss_mlp": 1.02493918, + "epoch": 0.6544416052908463, + "flos": 20194042947840.0, + "grad_norm": 2.5481797002021005, + "language_loss": 0.79969108, + "learning_rate": 1.127047924394715e-06, + "loss": 0.82136011, + "num_input_tokens_seen": 235043815, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.12762451, + "step": 10885, + "time_per_iteration": 2.4970312118530273 + }, + { + "auxiliary_loss_clip": 0.01128303, + "auxiliary_loss_mlp": 0.01029183, + "balance_loss_clip": 1.05391371, + "balance_loss_mlp": 1.01651073, + "epoch": 0.6545017285435142, + "flos": 23368258800000.0, + "grad_norm": 1.897720449160408, + "language_loss": 0.72365642, + "learning_rate": 1.1266975360878722e-06, + "loss": 0.74523127, + "num_input_tokens_seen": 235062985, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.12670898, + "step": 10886, + "time_per_iteration": 2.4997103214263916 + }, + { + "auxiliary_loss_clip": 0.01118003, + "auxiliary_loss_mlp": 0.01030029, + "balance_loss_clip": 1.0441035, + "balance_loss_mlp": 1.01798916, + "epoch": 0.6545618517961822, + "flos": 19134714021120.0, + "grad_norm": 1.8374665291854482, + "language_loss": 0.78093493, + "learning_rate": 1.1263471808967468e-06, + "loss": 0.80241525, + "num_input_tokens_seen": 235081670, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.12042236, + "step": 10887, + "time_per_iteration": 2.522784948348999 + }, + { + "auxiliary_loss_clip": 0.01118526, + "auxiliary_loss_mlp": 0.01033509, + "balance_loss_clip": 1.04541528, + "balance_loss_mlp": 1.02136183, + "epoch": 0.6546219750488501, + "flos": 14938624149120.0, + "grad_norm": 2.473533386287735, + "language_loss": 0.79213661, + "learning_rate": 1.1259968588346234e-06, + "loss": 0.81365693, + "num_input_tokens_seen": 235098510, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.12139893, + "step": 10888, + "time_per_iteration": 3.8696529865264893 + }, + { + "auxiliary_loss_clip": 0.01113341, + "auxiliary_loss_mlp": 0.0103135, + "balance_loss_clip": 1.04040992, + "balance_loss_mlp": 1.01810622, + "epoch": 0.6546820983015181, + "flos": 36320518886400.0, + "grad_norm": 2.066355930427905, + "language_loss": 0.66566294, + "learning_rate": 1.1256465699147874e-06, + "loss": 0.68710989, + "num_input_tokens_seen": 235119990, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.13244629, + "step": 10889, + "time_per_iteration": 2.612091064453125 + }, + { + "auxiliary_loss_clip": 0.01120565, + "auxiliary_loss_mlp": 0.01034321, + "balance_loss_clip": 1.04391456, + "balance_loss_mlp": 1.02141011, + "epoch": 0.654742221554186, + "flos": 20411446014720.0, + "grad_norm": 1.4057099548575878, + "language_loss": 0.79692078, + "learning_rate": 1.1252963141505203e-06, + "loss": 0.81846964, + "num_input_tokens_seen": 235139255, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.12908936, + "step": 10890, + "time_per_iteration": 2.4719159603118896 + }, + { + "auxiliary_loss_clip": 0.0111715, + "auxiliary_loss_mlp": 0.01027249, + "balance_loss_clip": 1.04097354, + "balance_loss_mlp": 1.01454163, + "epoch": 0.6548023448068541, + "flos": 24863650836480.0, + "grad_norm": 2.1420642746279013, + "language_loss": 0.65708411, + "learning_rate": 1.1249460915551052e-06, + "loss": 0.67852813, + "num_input_tokens_seen": 235158455, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.12713623, + "step": 10891, + "time_per_iteration": 2.4850666522979736 + }, + { + "auxiliary_loss_clip": 0.01117471, + "auxiliary_loss_mlp": 0.0103788, + "balance_loss_clip": 1.04316127, + "balance_loss_mlp": 1.02570915, + "epoch": 0.654862468059522, + "flos": 21427573858560.0, + "grad_norm": 3.7354548650412642, + "language_loss": 0.79440856, + "learning_rate": 1.1245959021418214e-06, + "loss": 0.81596214, + "num_input_tokens_seen": 235177350, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12176514, + "step": 10892, + "time_per_iteration": 2.4915084838867188 + }, + { + "auxiliary_loss_clip": 0.01122009, + "auxiliary_loss_mlp": 0.01037293, + "balance_loss_clip": 1.04495502, + "balance_loss_mlp": 1.02282143, + "epoch": 0.65492259131219, + "flos": 26577846570240.0, + "grad_norm": 2.0742559611910485, + "language_loss": 0.78483611, + "learning_rate": 1.1242457459239497e-06, + "loss": 0.80642915, + "num_input_tokens_seen": 235196435, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.14465332, + "step": 10893, + "time_per_iteration": 2.482975482940674 + }, + { + "auxiliary_loss_clip": 0.01122618, + "auxiliary_loss_mlp": 0.01032112, + "balance_loss_clip": 1.04510367, + "balance_loss_mlp": 1.01885033, + "epoch": 0.6549827145648579, + "flos": 21501334437120.0, + "grad_norm": 1.5339881072662753, + "language_loss": 0.70411551, + "learning_rate": 1.123895622914766e-06, + "loss": 0.72566283, + "num_input_tokens_seen": 235215430, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.13275146, + "step": 10894, + "time_per_iteration": 2.442772388458252 + }, + { + "auxiliary_loss_clip": 0.01120463, + "auxiliary_loss_mlp": 0.01034481, + "balance_loss_clip": 1.0447886, + "balance_loss_mlp": 1.02190483, + "epoch": 0.6550428378175259, + "flos": 22594275515520.0, + "grad_norm": 3.4307461516879965, + "language_loss": 0.62666261, + "learning_rate": 1.123545533127549e-06, + "loss": 0.64821208, + "num_input_tokens_seen": 235232015, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.12573242, + "step": 10895, + "time_per_iteration": 2.4726510047912598 + }, + { + "auxiliary_loss_clip": 0.01115104, + "auxiliary_loss_mlp": 0.01036652, + "balance_loss_clip": 1.0408628, + "balance_loss_mlp": 1.02484417, + "epoch": 0.655102961070194, + "flos": 12823809050880.0, + "grad_norm": 1.8986167593448182, + "language_loss": 0.79408729, + "learning_rate": 1.1231954765755722e-06, + "loss": 0.81560487, + "num_input_tokens_seen": 235248115, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.1182251, + "step": 10896, + "time_per_iteration": 2.433298110961914 + }, + { + "auxiliary_loss_clip": 0.01116668, + "auxiliary_loss_mlp": 0.01031116, + "balance_loss_clip": 1.0446732, + "balance_loss_mlp": 1.01918316, + "epoch": 0.6551630843228619, + "flos": 24791075406720.0, + "grad_norm": 1.4138998554510191, + "language_loss": 0.7070573, + "learning_rate": 1.1228454532721111e-06, + "loss": 0.72853518, + "num_input_tokens_seen": 235270785, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.11938477, + "step": 10897, + "time_per_iteration": 2.5138895511627197 + }, + { + "auxiliary_loss_clip": 0.01118544, + "auxiliary_loss_mlp": 0.01028546, + "balance_loss_clip": 1.04241538, + "balance_loss_mlp": 1.01635122, + "epoch": 0.6552232075755299, + "flos": 16724461559040.0, + "grad_norm": 1.6573816138723658, + "language_loss": 0.7544927, + "learning_rate": 1.1224954632304391e-06, + "loss": 0.7759636, + "num_input_tokens_seen": 235287905, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12200928, + "step": 10898, + "time_per_iteration": 2.4543039798736572 + }, + { + "auxiliary_loss_clip": 0.01118496, + "auxiliary_loss_mlp": 0.01034788, + "balance_loss_clip": 1.04565096, + "balance_loss_mlp": 1.0227654, + "epoch": 0.6552833308281978, + "flos": 22016473338240.0, + "grad_norm": 2.75723530292199, + "language_loss": 0.73277283, + "learning_rate": 1.122145506463827e-06, + "loss": 0.75430566, + "num_input_tokens_seen": 235305525, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.12011719, + "step": 10899, + "time_per_iteration": 2.478262424468994 + }, + { + "auxiliary_loss_clip": 0.01116074, + "auxiliary_loss_mlp": 0.01025432, + "balance_loss_clip": 1.04142392, + "balance_loss_mlp": 1.01360011, + "epoch": 0.6553434540808658, + "flos": 24863399441280.0, + "grad_norm": 2.185503401990374, + "language_loss": 0.55804139, + "learning_rate": 1.1217955829855443e-06, + "loss": 0.57945645, + "num_input_tokens_seen": 235324415, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11828613, + "step": 10900, + "time_per_iteration": 2.4934308528900146 + }, + { + "auxiliary_loss_clip": 0.01121881, + "auxiliary_loss_mlp": 0.01034278, + "balance_loss_clip": 1.04828119, + "balance_loss_mlp": 1.02119517, + "epoch": 0.6554035773335337, + "flos": 23221060865280.0, + "grad_norm": 1.720743371397284, + "language_loss": 0.7695477, + "learning_rate": 1.1214456928088622e-06, + "loss": 0.79110926, + "num_input_tokens_seen": 235341595, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.13079834, + "step": 10901, + "time_per_iteration": 2.429457664489746 + }, + { + "auxiliary_loss_clip": 0.01115685, + "auxiliary_loss_mlp": 0.0102584, + "balance_loss_clip": 1.04270303, + "balance_loss_mlp": 1.01338267, + "epoch": 0.6554637005862017, + "flos": 22783597125120.0, + "grad_norm": 2.40379114379981, + "language_loss": 0.73412466, + "learning_rate": 1.1210958359470463e-06, + "loss": 0.75553989, + "num_input_tokens_seen": 235361700, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.12445068, + "step": 10902, + "time_per_iteration": 2.4481096267700195 + }, + { + "auxiliary_loss_clip": 0.01122757, + "auxiliary_loss_mlp": 0.01039209, + "balance_loss_clip": 1.0459497, + "balance_loss_mlp": 1.02504134, + "epoch": 0.6555238238388696, + "flos": 21507224267520.0, + "grad_norm": 1.8749361887219598, + "language_loss": 0.67822278, + "learning_rate": 1.1207460124133645e-06, + "loss": 0.69984245, + "num_input_tokens_seen": 235382065, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.14160156, + "step": 10903, + "time_per_iteration": 2.436708450317383 + }, + { + "auxiliary_loss_clip": 0.01134553, + "auxiliary_loss_mlp": 0.01034907, + "balance_loss_clip": 1.05263269, + "balance_loss_mlp": 1.02117431, + "epoch": 0.6555839470915377, + "flos": 30519473518080.0, + "grad_norm": 1.9823709712715687, + "language_loss": 0.66925782, + "learning_rate": 1.1203962222210832e-06, + "loss": 0.69095242, + "num_input_tokens_seen": 235402130, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.13720703, + "step": 10904, + "time_per_iteration": 2.5086872577667236 + }, + { + "auxiliary_loss_clip": 0.01127249, + "auxiliary_loss_mlp": 0.01035612, + "balance_loss_clip": 1.05139697, + "balance_loss_mlp": 1.02166474, + "epoch": 0.6556440703442056, + "flos": 24642943718400.0, + "grad_norm": 2.4773353226776362, + "language_loss": 0.90821123, + "learning_rate": 1.120046465383464e-06, + "loss": 0.92983985, + "num_input_tokens_seen": 235420435, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.13952637, + "step": 10905, + "time_per_iteration": 3.9610140323638916 + }, + { + "auxiliary_loss_clip": 0.01124442, + "auxiliary_loss_mlp": 0.01033998, + "balance_loss_clip": 1.05093217, + "balance_loss_mlp": 1.02179062, + "epoch": 0.6557041935968736, + "flos": 23732464752000.0, + "grad_norm": 1.7499998098590708, + "language_loss": 0.75687021, + "learning_rate": 1.1196967419137721e-06, + "loss": 0.77845466, + "num_input_tokens_seen": 235439960, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.12207031, + "step": 10906, + "time_per_iteration": 2.443594217300415 + }, + { + "auxiliary_loss_clip": 0.01123626, + "auxiliary_loss_mlp": 0.01036261, + "balance_loss_clip": 1.04704463, + "balance_loss_mlp": 1.02374411, + "epoch": 0.6557643168495415, + "flos": 11102753819520.0, + "grad_norm": 3.268443719708563, + "language_loss": 0.74876177, + "learning_rate": 1.119347051825267e-06, + "loss": 0.77036059, + "num_input_tokens_seen": 235457495, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.12524414, + "step": 10907, + "time_per_iteration": 2.4568662643432617 + }, + { + "auxiliary_loss_clip": 0.01119199, + "auxiliary_loss_mlp": 0.01028314, + "balance_loss_clip": 1.04410768, + "balance_loss_mlp": 1.01445019, + "epoch": 0.6558244401022095, + "flos": 30191034533760.0, + "grad_norm": 1.6437824764250153, + "language_loss": 0.72249842, + "learning_rate": 1.118997395131211e-06, + "loss": 0.74397349, + "num_input_tokens_seen": 235479525, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.13861084, + "step": 10908, + "time_per_iteration": 2.631542921066284 + }, + { + "auxiliary_loss_clip": 0.01114271, + "auxiliary_loss_mlp": 0.01035147, + "balance_loss_clip": 1.04018641, + "balance_loss_mlp": 1.02161694, + "epoch": 0.6558845633548775, + "flos": 17931060247680.0, + "grad_norm": 2.9130864788121795, + "language_loss": 0.81237173, + "learning_rate": 1.118647771844861e-06, + "loss": 0.83386594, + "num_input_tokens_seen": 235496305, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.13531494, + "step": 10909, + "time_per_iteration": 2.5334084033966064 + }, + { + "auxiliary_loss_clip": 0.01121882, + "auxiliary_loss_mlp": 0.01032079, + "balance_loss_clip": 1.04569006, + "balance_loss_mlp": 1.01871538, + "epoch": 0.6559446866075455, + "flos": 21904144531200.0, + "grad_norm": 3.3046915917563693, + "language_loss": 0.64445323, + "learning_rate": 1.1182981819794767e-06, + "loss": 0.66599286, + "num_input_tokens_seen": 235512545, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.13360596, + "step": 10910, + "time_per_iteration": 2.5949525833129883 + }, + { + "auxiliary_loss_clip": 0.01122197, + "auxiliary_loss_mlp": 0.01038126, + "balance_loss_clip": 1.04249692, + "balance_loss_mlp": 1.02273655, + "epoch": 0.6560048098602135, + "flos": 14127976056960.0, + "grad_norm": 2.9465941115317595, + "language_loss": 0.7540431, + "learning_rate": 1.117948625548313e-06, + "loss": 0.77564639, + "num_input_tokens_seen": 235526045, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.15393066, + "step": 10911, + "time_per_iteration": 2.570479154586792 + }, + { + "auxiliary_loss_clip": 0.01110857, + "auxiliary_loss_mlp": 0.0103298, + "balance_loss_clip": 1.03899837, + "balance_loss_mlp": 1.0213387, + "epoch": 0.6560649331128814, + "flos": 18807567926400.0, + "grad_norm": 2.0498656365831307, + "language_loss": 0.75403982, + "learning_rate": 1.1175991025646265e-06, + "loss": 0.77547818, + "num_input_tokens_seen": 235545285, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11651611, + "step": 10912, + "time_per_iteration": 2.5599091053009033 + }, + { + "auxiliary_loss_clip": 0.0112622, + "auxiliary_loss_mlp": 0.01035183, + "balance_loss_clip": 1.04559302, + "balance_loss_mlp": 1.02157521, + "epoch": 0.6561250563655494, + "flos": 17053618815360.0, + "grad_norm": 2.1650404969020665, + "language_loss": 0.77555645, + "learning_rate": 1.1172496130416697e-06, + "loss": 0.79717052, + "num_input_tokens_seen": 235563150, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.1361084, + "step": 10913, + "time_per_iteration": 2.5668342113494873 + }, + { + "auxiliary_loss_clip": 0.01114567, + "auxiliary_loss_mlp": 0.01029662, + "balance_loss_clip": 1.04343104, + "balance_loss_mlp": 1.01778305, + "epoch": 0.6561851796182173, + "flos": 22637656166400.0, + "grad_norm": 1.9049996621229595, + "language_loss": 0.71375757, + "learning_rate": 1.1169001569926961e-06, + "loss": 0.73519993, + "num_input_tokens_seen": 235582535, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.11889648, + "step": 10914, + "time_per_iteration": 2.43727970123291 + }, + { + "auxiliary_loss_clip": 0.01123361, + "auxiliary_loss_mlp": 0.0102999, + "balance_loss_clip": 1.0465107, + "balance_loss_mlp": 1.01712728, + "epoch": 0.6562453028708853, + "flos": 19239213663360.0, + "grad_norm": 2.3459863205690006, + "language_loss": 0.73865306, + "learning_rate": 1.116550734430958e-06, + "loss": 0.76018649, + "num_input_tokens_seen": 235601490, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.12872314, + "step": 10915, + "time_per_iteration": 2.457310199737549 + }, + { + "auxiliary_loss_clip": 0.01119213, + "auxiliary_loss_mlp": 0.01038427, + "balance_loss_clip": 1.04510021, + "balance_loss_mlp": 1.02346659, + "epoch": 0.6563054261235532, + "flos": 23801305167360.0, + "grad_norm": 1.5090789981930783, + "language_loss": 0.79687256, + "learning_rate": 1.1162013453697042e-06, + "loss": 0.81844896, + "num_input_tokens_seen": 235619165, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.14953613, + "step": 10916, + "time_per_iteration": 2.4667115211486816 + }, + { + "auxiliary_loss_clip": 0.01122181, + "auxiliary_loss_mlp": 0.01031053, + "balance_loss_clip": 1.04470706, + "balance_loss_mlp": 1.01940584, + "epoch": 0.6563655493762213, + "flos": 19240039676160.0, + "grad_norm": 2.2368957945556334, + "language_loss": 0.76208758, + "learning_rate": 1.1158519898221831e-06, + "loss": 0.78361988, + "num_input_tokens_seen": 235637115, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.11645508, + "step": 10917, + "time_per_iteration": 2.442978620529175 + }, + { + "auxiliary_loss_clip": 0.01126525, + "auxiliary_loss_mlp": 0.01027567, + "balance_loss_clip": 1.05100012, + "balance_loss_mlp": 1.01535416, + "epoch": 0.6564256726288892, + "flos": 25556439427200.0, + "grad_norm": 1.714390846923303, + "language_loss": 0.69615752, + "learning_rate": 1.1155026678016445e-06, + "loss": 0.71769845, + "num_input_tokens_seen": 235656330, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.12213135, + "step": 10918, + "time_per_iteration": 2.487517833709717 + }, + { + "auxiliary_loss_clip": 0.01127148, + "auxiliary_loss_mlp": 0.01029129, + "balance_loss_clip": 1.05447435, + "balance_loss_mlp": 1.01756549, + "epoch": 0.6564857958815572, + "flos": 22200623389440.0, + "grad_norm": 1.5155400840357285, + "language_loss": 0.76354039, + "learning_rate": 1.115153379321332e-06, + "loss": 0.78510308, + "num_input_tokens_seen": 235674510, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11560059, + "step": 10919, + "time_per_iteration": 2.4675869941711426 + }, + { + "auxiliary_loss_clip": 0.01061463, + "auxiliary_loss_mlp": 0.01006032, + "balance_loss_clip": 1.03621387, + "balance_loss_mlp": 1.00472689, + "epoch": 0.6565459191342251, + "flos": 58123144604160.0, + "grad_norm": 0.7192607654768113, + "language_loss": 0.53085279, + "learning_rate": 1.1148041243944931e-06, + "loss": 0.55152774, + "num_input_tokens_seen": 235735050, + "router_z_loss_clip": 0.25244141, + "router_z_loss_mlp": 0.01306152, + "step": 10920, + "time_per_iteration": 4.412121534347534 + }, + { + "auxiliary_loss_clip": 0.01128405, + "auxiliary_loss_mlp": 0.01034213, + "balance_loss_clip": 1.05156422, + "balance_loss_mlp": 1.0212667, + "epoch": 0.6566060423868931, + "flos": 30809631582720.0, + "grad_norm": 1.5075489164524707, + "language_loss": 0.6567784, + "learning_rate": 1.1144549030343697e-06, + "loss": 0.67840457, + "num_input_tokens_seen": 235757545, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.1293335, + "step": 10921, + "time_per_iteration": 2.5109975337982178 + }, + { + "auxiliary_loss_clip": 0.01120445, + "auxiliary_loss_mlp": 0.0103303, + "balance_loss_clip": 1.04712868, + "balance_loss_mlp": 1.01933241, + "epoch": 0.6566661656395612, + "flos": 23367432787200.0, + "grad_norm": 1.831101925125275, + "language_loss": 0.81252432, + "learning_rate": 1.114105715254205e-06, + "loss": 0.834059, + "num_input_tokens_seen": 235777265, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.13702393, + "step": 10922, + "time_per_iteration": 2.4666860103607178 + }, + { + "auxiliary_loss_clip": 0.0111884, + "auxiliary_loss_mlp": 0.01035313, + "balance_loss_clip": 1.04412019, + "balance_loss_mlp": 1.02273047, + "epoch": 0.6567262888922291, + "flos": 25735597488000.0, + "grad_norm": 2.0616071521715487, + "language_loss": 0.70792985, + "learning_rate": 1.1137565610672414e-06, + "loss": 0.72947139, + "num_input_tokens_seen": 235796565, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.12585449, + "step": 10923, + "time_per_iteration": 3.89762020111084 + }, + { + "auxiliary_loss_clip": 0.01123942, + "auxiliary_loss_mlp": 0.01031629, + "balance_loss_clip": 1.04725409, + "balance_loss_mlp": 1.0189451, + "epoch": 0.6567864121448971, + "flos": 17123716206720.0, + "grad_norm": 2.6760446688044723, + "language_loss": 0.80646086, + "learning_rate": 1.1134074404867169e-06, + "loss": 0.82801652, + "num_input_tokens_seen": 235814805, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.12683105, + "step": 10924, + "time_per_iteration": 2.4358274936676025 + }, + { + "auxiliary_loss_clip": 0.01121062, + "auxiliary_loss_mlp": 0.01027571, + "balance_loss_clip": 1.04662204, + "balance_loss_mlp": 1.01562619, + "epoch": 0.656846535397565, + "flos": 22419319345920.0, + "grad_norm": 1.6396758393774626, + "language_loss": 0.7264328, + "learning_rate": 1.1130583535258717e-06, + "loss": 0.74791908, + "num_input_tokens_seen": 235833405, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.1194458, + "step": 10925, + "time_per_iteration": 2.499361038208008 + }, + { + "auxiliary_loss_clip": 0.01118215, + "auxiliary_loss_mlp": 0.01028384, + "balance_loss_clip": 1.04285872, + "balance_loss_mlp": 1.01662385, + "epoch": 0.656906658650233, + "flos": 17704535126400.0, + "grad_norm": 2.3206609764289765, + "language_loss": 0.72669804, + "learning_rate": 1.112709300197942e-06, + "loss": 0.74816406, + "num_input_tokens_seen": 235848530, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.11743164, + "step": 10926, + "time_per_iteration": 2.458056926727295 + }, + { + "auxiliary_loss_clip": 0.0111973, + "auxiliary_loss_mlp": 0.01031606, + "balance_loss_clip": 1.04281473, + "balance_loss_mlp": 1.01873183, + "epoch": 0.6569667819029009, + "flos": 21175158009600.0, + "grad_norm": 1.949876407418962, + "language_loss": 0.72006249, + "learning_rate": 1.1123602805161656e-06, + "loss": 0.74157584, + "num_input_tokens_seen": 235867225, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.12890625, + "step": 10927, + "time_per_iteration": 2.461538314819336 + }, + { + "auxiliary_loss_clip": 0.01056059, + "auxiliary_loss_mlp": 0.01006641, + "balance_loss_clip": 1.02989721, + "balance_loss_mlp": 1.00503922, + "epoch": 0.6570269051555689, + "flos": 68761897511040.0, + "grad_norm": 0.7351101447609462, + "language_loss": 0.64488202, + "learning_rate": 1.112011294493775e-06, + "loss": 0.66550905, + "num_input_tokens_seen": 235932925, + "router_z_loss_clip": 0.26220703, + "router_z_loss_mlp": 0.01602173, + "step": 10928, + "time_per_iteration": 3.0539541244506836 + }, + { + "auxiliary_loss_clip": 0.01125069, + "auxiliary_loss_mlp": 0.01030253, + "balance_loss_clip": 1.05033064, + "balance_loss_mlp": 1.01686633, + "epoch": 0.6570870284082369, + "flos": 26319289495680.0, + "grad_norm": 2.1845609681447775, + "language_loss": 0.78063178, + "learning_rate": 1.1116623421440063e-06, + "loss": 0.80218506, + "num_input_tokens_seen": 235952680, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.1340332, + "step": 10929, + "time_per_iteration": 2.4661741256713867 + }, + { + "auxiliary_loss_clip": 0.01122588, + "auxiliary_loss_mlp": 0.01030334, + "balance_loss_clip": 1.04849577, + "balance_loss_mlp": 1.01797748, + "epoch": 0.6571471516609049, + "flos": 26174749167360.0, + "grad_norm": 1.6882877141794348, + "language_loss": 0.65620828, + "learning_rate": 1.1113134234800895e-06, + "loss": 0.67773747, + "num_input_tokens_seen": 235972075, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.12365723, + "step": 10930, + "time_per_iteration": 2.4576964378356934 + }, + { + "auxiliary_loss_clip": 0.0111621, + "auxiliary_loss_mlp": 0.01031317, + "balance_loss_clip": 1.0405643, + "balance_loss_mlp": 1.01794767, + "epoch": 0.6572072749135728, + "flos": 20376253664640.0, + "grad_norm": 2.064118980126673, + "language_loss": 0.70956552, + "learning_rate": 1.110964538515258e-06, + "loss": 0.73104078, + "num_input_tokens_seen": 235990340, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.13342285, + "step": 10931, + "time_per_iteration": 3.9194817543029785 + }, + { + "auxiliary_loss_clip": 0.01125244, + "auxiliary_loss_mlp": 0.01027783, + "balance_loss_clip": 1.04831505, + "balance_loss_mlp": 1.01555192, + "epoch": 0.6572673981662408, + "flos": 17128744110720.0, + "grad_norm": 2.0322050149259483, + "language_loss": 0.68389952, + "learning_rate": 1.1106156872627393e-06, + "loss": 0.70542979, + "num_input_tokens_seen": 236007470, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.12231445, + "step": 10932, + "time_per_iteration": 2.4130101203918457 + }, + { + "auxiliary_loss_clip": 0.01117747, + "auxiliary_loss_mlp": 0.01030347, + "balance_loss_clip": 1.04421043, + "balance_loss_mlp": 1.01825917, + "epoch": 0.6573275214189087, + "flos": 41275113281280.0, + "grad_norm": 5.016290282649676, + "language_loss": 0.80518878, + "learning_rate": 1.1102668697357626e-06, + "loss": 0.82666969, + "num_input_tokens_seen": 236029030, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.12097168, + "step": 10933, + "time_per_iteration": 2.6879751682281494 + }, + { + "auxiliary_loss_clip": 0.01122461, + "auxiliary_loss_mlp": 0.01031293, + "balance_loss_clip": 1.04737365, + "balance_loss_mlp": 1.01873446, + "epoch": 0.6573876446715767, + "flos": 22890143842560.0, + "grad_norm": 1.7813717548936012, + "language_loss": 0.73788011, + "learning_rate": 1.1099180859475571e-06, + "loss": 0.75941765, + "num_input_tokens_seen": 236047160, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12567139, + "step": 10934, + "time_per_iteration": 2.460659980773926 + }, + { + "auxiliary_loss_clip": 0.01121052, + "auxiliary_loss_mlp": 0.01035034, + "balance_loss_clip": 1.04841948, + "balance_loss_mlp": 1.02251124, + "epoch": 0.6574477679242448, + "flos": 44018150273280.0, + "grad_norm": 1.7416442401789982, + "language_loss": 0.76450247, + "learning_rate": 1.1095693359113454e-06, + "loss": 0.78606331, + "num_input_tokens_seen": 236069215, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.12524414, + "step": 10935, + "time_per_iteration": 2.685183048248291 + }, + { + "auxiliary_loss_clip": 0.01114456, + "auxiliary_loss_mlp": 0.01038252, + "balance_loss_clip": 1.04016495, + "balance_loss_mlp": 1.02430475, + "epoch": 0.6575078911769127, + "flos": 24571517523840.0, + "grad_norm": 1.6040986337662575, + "language_loss": 0.78761679, + "learning_rate": 1.1092206196403538e-06, + "loss": 0.8091439, + "num_input_tokens_seen": 236088335, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.13946533, + "step": 10936, + "time_per_iteration": 2.4739205837249756 + }, + { + "auxiliary_loss_clip": 0.01119334, + "auxiliary_loss_mlp": 0.01031604, + "balance_loss_clip": 1.04793119, + "balance_loss_mlp": 1.01995158, + "epoch": 0.6575680144295807, + "flos": 20924035050240.0, + "grad_norm": 2.0514426234928114, + "language_loss": 0.69329882, + "learning_rate": 1.1088719371478056e-06, + "loss": 0.71480817, + "num_input_tokens_seen": 236108540, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.11645508, + "step": 10937, + "time_per_iteration": 2.4992477893829346 + }, + { + "auxiliary_loss_clip": 0.01118901, + "auxiliary_loss_mlp": 0.01027987, + "balance_loss_clip": 1.04556131, + "balance_loss_mlp": 1.01536846, + "epoch": 0.6576281376822486, + "flos": 10925642833920.0, + "grad_norm": 2.396303390652529, + "language_loss": 0.68867528, + "learning_rate": 1.1085232884469236e-06, + "loss": 0.71014416, + "num_input_tokens_seen": 236124495, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.12634277, + "step": 10938, + "time_per_iteration": 2.406615734100342 + }, + { + "auxiliary_loss_clip": 0.01119166, + "auxiliary_loss_mlp": 0.01030975, + "balance_loss_clip": 1.04403508, + "balance_loss_mlp": 1.01790404, + "epoch": 0.6576882609349166, + "flos": 19281552819840.0, + "grad_norm": 5.3467477186301835, + "language_loss": 0.71648252, + "learning_rate": 1.108174673550927e-06, + "loss": 0.73798394, + "num_input_tokens_seen": 236142550, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.1307373, + "step": 10939, + "time_per_iteration": 2.482218027114868 + }, + { + "auxiliary_loss_clip": 0.01120093, + "auxiliary_loss_mlp": 0.0102856, + "balance_loss_clip": 1.04394889, + "balance_loss_mlp": 1.01528645, + "epoch": 0.6577483841875845, + "flos": 20220544206720.0, + "grad_norm": 2.4465830448526287, + "language_loss": 0.78047681, + "learning_rate": 1.107826092473037e-06, + "loss": 0.80196333, + "num_input_tokens_seen": 236156620, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.1328125, + "step": 10940, + "time_per_iteration": 2.446669816970825 + }, + { + "auxiliary_loss_clip": 0.01125106, + "auxiliary_loss_mlp": 0.01030074, + "balance_loss_clip": 1.04684091, + "balance_loss_mlp": 1.01759291, + "epoch": 0.6578085074402525, + "flos": 34751078962560.0, + "grad_norm": 2.4707391676386625, + "language_loss": 0.68729508, + "learning_rate": 1.107477545226471e-06, + "loss": 0.70884693, + "num_input_tokens_seen": 236177095, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.12481689, + "step": 10941, + "time_per_iteration": 2.632690191268921 + }, + { + "auxiliary_loss_clip": 0.01119726, + "auxiliary_loss_mlp": 0.01029158, + "balance_loss_clip": 1.04560733, + "balance_loss_mlp": 1.01641417, + "epoch": 0.6578686306929205, + "flos": 23470998675840.0, + "grad_norm": 1.8462608103135898, + "language_loss": 0.68280256, + "learning_rate": 1.1071290318244448e-06, + "loss": 0.70429146, + "num_input_tokens_seen": 236194695, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.12731934, + "step": 10942, + "time_per_iteration": 2.4640097618103027 + }, + { + "auxiliary_loss_clip": 0.0112499, + "auxiliary_loss_mlp": 0.01034044, + "balance_loss_clip": 1.04510188, + "balance_loss_mlp": 1.01992416, + "epoch": 0.6579287539455885, + "flos": 18077073033600.0, + "grad_norm": 2.1002848974589177, + "language_loss": 0.71611357, + "learning_rate": 1.1067805522801753e-06, + "loss": 0.73770398, + "num_input_tokens_seen": 236213885, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.14123535, + "step": 10943, + "time_per_iteration": 2.4406774044036865 + }, + { + "auxiliary_loss_clip": 0.01115328, + "auxiliary_loss_mlp": 0.01030571, + "balance_loss_clip": 1.04331875, + "balance_loss_mlp": 1.01821518, + "epoch": 0.6579888771982564, + "flos": 28661383900800.0, + "grad_norm": 2.1190117518388014, + "language_loss": 0.595734, + "learning_rate": 1.1064321066068778e-06, + "loss": 0.61719298, + "num_input_tokens_seen": 236237315, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.12347412, + "step": 10944, + "time_per_iteration": 2.521695852279663 + }, + { + "auxiliary_loss_clip": 0.01122524, + "auxiliary_loss_mlp": 0.01036296, + "balance_loss_clip": 1.04367352, + "balance_loss_mlp": 1.022825, + "epoch": 0.6580490004509244, + "flos": 25046543911680.0, + "grad_norm": 1.5512025253143684, + "language_loss": 0.7247858, + "learning_rate": 1.1060836948177646e-06, + "loss": 0.74637401, + "num_input_tokens_seen": 236256345, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.13470459, + "step": 10945, + "time_per_iteration": 2.5477123260498047 + }, + { + "auxiliary_loss_clip": 0.01116513, + "auxiliary_loss_mlp": 0.01028309, + "balance_loss_clip": 1.04263496, + "balance_loss_mlp": 1.01638269, + "epoch": 0.6581091237035923, + "flos": 43508793461760.0, + "grad_norm": 1.8428322600052613, + "language_loss": 0.70827335, + "learning_rate": 1.105735316926046e-06, + "loss": 0.72972161, + "num_input_tokens_seen": 236281890, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11932373, + "step": 10946, + "time_per_iteration": 2.6380858421325684 + }, + { + "auxiliary_loss_clip": 0.01116315, + "auxiliary_loss_mlp": 0.01033902, + "balance_loss_clip": 1.04226851, + "balance_loss_mlp": 1.02131307, + "epoch": 0.6581692469562603, + "flos": 22415404763520.0, + "grad_norm": 1.9320840623546411, + "language_loss": 0.82080585, + "learning_rate": 1.105386972944934e-06, + "loss": 0.84230804, + "num_input_tokens_seen": 236298370, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12591553, + "step": 10947, + "time_per_iteration": 2.4845542907714844 + }, + { + "auxiliary_loss_clip": 0.01118685, + "auxiliary_loss_mlp": 0.01028209, + "balance_loss_clip": 1.04273915, + "balance_loss_mlp": 1.01629353, + "epoch": 0.6582293702089284, + "flos": 24859772167680.0, + "grad_norm": 1.5258316121471014, + "language_loss": 0.77479613, + "learning_rate": 1.1050386628876385e-06, + "loss": 0.79626513, + "num_input_tokens_seen": 236317380, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.11907959, + "step": 10948, + "time_per_iteration": 3.873222589492798 + }, + { + "auxiliary_loss_clip": 0.0111874, + "auxiliary_loss_mlp": 0.01032906, + "balance_loss_clip": 1.04398024, + "balance_loss_mlp": 1.02077675, + "epoch": 0.6582894934615963, + "flos": 23039676161280.0, + "grad_norm": 1.765950511418098, + "language_loss": 0.79155123, + "learning_rate": 1.1046903867673655e-06, + "loss": 0.81306773, + "num_input_tokens_seen": 236336210, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.12127686, + "step": 10949, + "time_per_iteration": 2.44088077545166 + }, + { + "auxiliary_loss_clip": 0.01050939, + "auxiliary_loss_mlp": 0.0101168, + "balance_loss_clip": 1.02564394, + "balance_loss_mlp": 1.01011729, + "epoch": 0.6583496167142643, + "flos": 72551980978560.0, + "grad_norm": 0.7361266346220414, + "language_loss": 0.61833733, + "learning_rate": 1.104342144597323e-06, + "loss": 0.63896358, + "num_input_tokens_seen": 236403090, + "router_z_loss_clip": 0.25341797, + "router_z_loss_mlp": 0.01564026, + "step": 10950, + "time_per_iteration": 3.157297372817993 + }, + { + "auxiliary_loss_clip": 0.01119098, + "auxiliary_loss_mlp": 0.01035266, + "balance_loss_clip": 1.04819822, + "balance_loss_mlp": 1.02415538, + "epoch": 0.6584097399669322, + "flos": 13078846592640.0, + "grad_norm": 1.9709362035705533, + "language_loss": 0.67332768, + "learning_rate": 1.1039939363907178e-06, + "loss": 0.69487131, + "num_input_tokens_seen": 236420475, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.11114502, + "step": 10951, + "time_per_iteration": 2.4033517837524414 + }, + { + "auxiliary_loss_clip": 0.01119889, + "auxiliary_loss_mlp": 0.01034907, + "balance_loss_clip": 1.04639411, + "balance_loss_mlp": 1.02333724, + "epoch": 0.6584698632196002, + "flos": 28693164458880.0, + "grad_norm": 1.488054003862199, + "language_loss": 0.76570851, + "learning_rate": 1.1036457621607504e-06, + "loss": 0.78725648, + "num_input_tokens_seen": 236441915, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11572266, + "step": 10952, + "time_per_iteration": 2.5191893577575684 + }, + { + "auxiliary_loss_clip": 0.01123055, + "auxiliary_loss_mlp": 0.01032314, + "balance_loss_clip": 1.05019069, + "balance_loss_mlp": 1.02046454, + "epoch": 0.6585299864722681, + "flos": 14319272914560.0, + "grad_norm": 1.8088120872501405, + "language_loss": 0.7385717, + "learning_rate": 1.1032976219206257e-06, + "loss": 0.76012534, + "num_input_tokens_seen": 236460340, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11846924, + "step": 10953, + "time_per_iteration": 2.461977005004883 + }, + { + "auxiliary_loss_clip": 0.01121821, + "auxiliary_loss_mlp": 0.01036309, + "balance_loss_clip": 1.04651451, + "balance_loss_mlp": 1.02367282, + "epoch": 0.6585901097249361, + "flos": 26797907243520.0, + "grad_norm": 1.8106737256025704, + "language_loss": 0.78587437, + "learning_rate": 1.102949515683546e-06, + "loss": 0.80745572, + "num_input_tokens_seen": 236478280, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.12628174, + "step": 10954, + "time_per_iteration": 2.5464019775390625 + }, + { + "auxiliary_loss_clip": 0.01119225, + "auxiliary_loss_mlp": 0.01038059, + "balance_loss_clip": 1.04548359, + "balance_loss_mlp": 1.02520227, + "epoch": 0.658650232977604, + "flos": 18733124989440.0, + "grad_norm": 2.5664102396645982, + "language_loss": 0.69893891, + "learning_rate": 1.1026014434627096e-06, + "loss": 0.72051179, + "num_input_tokens_seen": 236493225, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.12866211, + "step": 10955, + "time_per_iteration": 2.476073741912842 + }, + { + "auxiliary_loss_clip": 0.01113552, + "auxiliary_loss_mlp": 0.01036121, + "balance_loss_clip": 1.04341722, + "balance_loss_mlp": 1.02498651, + "epoch": 0.6587103562302721, + "flos": 24753440931840.0, + "grad_norm": 4.006104491696312, + "language_loss": 0.80868471, + "learning_rate": 1.1022534052713172e-06, + "loss": 0.83018142, + "num_input_tokens_seen": 236514420, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.11132812, + "step": 10956, + "time_per_iteration": 2.5443947315216064 + }, + { + "auxiliary_loss_clip": 0.0111996, + "auxiliary_loss_mlp": 0.010367, + "balance_loss_clip": 1.04653382, + "balance_loss_mlp": 1.02386081, + "epoch": 0.65877047948294, + "flos": 22346133384960.0, + "grad_norm": 2.090517806092407, + "language_loss": 0.81787384, + "learning_rate": 1.1019054011225648e-06, + "loss": 0.83944046, + "num_input_tokens_seen": 236532785, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.12835693, + "step": 10957, + "time_per_iteration": 2.464566230773926 + }, + { + "auxiliary_loss_clip": 0.01116966, + "auxiliary_loss_mlp": 0.01029038, + "balance_loss_clip": 1.04498529, + "balance_loss_mlp": 1.01776648, + "epoch": 0.658830602735608, + "flos": 45180542298240.0, + "grad_norm": 1.7351332407382596, + "language_loss": 0.76248163, + "learning_rate": 1.1015574310296506e-06, + "loss": 0.78394169, + "num_input_tokens_seen": 236553330, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11273193, + "step": 10958, + "time_per_iteration": 2.699645757675171 + }, + { + "auxiliary_loss_clip": 0.01119559, + "auxiliary_loss_mlp": 0.01034742, + "balance_loss_clip": 1.04504395, + "balance_loss_mlp": 1.02301741, + "epoch": 0.6588907259882759, + "flos": 19901622326400.0, + "grad_norm": 1.6033917675024527, + "language_loss": 0.74983013, + "learning_rate": 1.1012094950057678e-06, + "loss": 0.77137309, + "num_input_tokens_seen": 236572960, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.11706543, + "step": 10959, + "time_per_iteration": 2.5696280002593994 + }, + { + "auxiliary_loss_clip": 0.01123102, + "auxiliary_loss_mlp": 0.01029576, + "balance_loss_clip": 1.04978561, + "balance_loss_mlp": 1.01787591, + "epoch": 0.6589508492409439, + "flos": 24133766474880.0, + "grad_norm": 1.6170692037273868, + "language_loss": 0.64660162, + "learning_rate": 1.1008615930641107e-06, + "loss": 0.66812837, + "num_input_tokens_seen": 236594090, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11700439, + "step": 10960, + "time_per_iteration": 2.479207992553711 + }, + { + "auxiliary_loss_clip": 0.01126386, + "auxiliary_loss_mlp": 0.01037044, + "balance_loss_clip": 1.04873121, + "balance_loss_mlp": 1.02374649, + "epoch": 0.659010972493612, + "flos": 18222906251520.0, + "grad_norm": 2.538806075374103, + "language_loss": 0.81807065, + "learning_rate": 1.1005137252178734e-06, + "loss": 0.83970499, + "num_input_tokens_seen": 236610190, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.13293457, + "step": 10961, + "time_per_iteration": 2.4141035079956055 + }, + { + "auxiliary_loss_clip": 0.01126497, + "auxiliary_loss_mlp": 0.01028987, + "balance_loss_clip": 1.05347574, + "balance_loss_mlp": 1.01716101, + "epoch": 0.6590710957462799, + "flos": 27600007898880.0, + "grad_norm": 2.390234039593504, + "language_loss": 0.73517501, + "learning_rate": 1.1001658914802453e-06, + "loss": 0.75672984, + "num_input_tokens_seen": 236631575, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11834717, + "step": 10962, + "time_per_iteration": 2.567984104156494 + }, + { + "auxiliary_loss_clip": 0.01118882, + "auxiliary_loss_mlp": 0.01030157, + "balance_loss_clip": 1.04371083, + "balance_loss_mlp": 1.01826024, + "epoch": 0.6591312189989479, + "flos": 20302959962880.0, + "grad_norm": 1.8512180982906137, + "language_loss": 0.79901367, + "learning_rate": 1.0998180918644165e-06, + "loss": 0.82050401, + "num_input_tokens_seen": 236649815, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.11895752, + "step": 10963, + "time_per_iteration": 3.8059933185577393 + }, + { + "auxiliary_loss_clip": 0.01118413, + "auxiliary_loss_mlp": 0.01031141, + "balance_loss_clip": 1.04586303, + "balance_loss_mlp": 1.01864171, + "epoch": 0.6591913422516158, + "flos": 12312943868160.0, + "grad_norm": 2.0938302886825495, + "language_loss": 0.78139806, + "learning_rate": 1.0994703263835754e-06, + "loss": 0.80289358, + "num_input_tokens_seen": 236668335, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.125, + "step": 10964, + "time_per_iteration": 2.4704484939575195 + }, + { + "auxiliary_loss_clip": 0.0112156, + "auxiliary_loss_mlp": 0.0103252, + "balance_loss_clip": 1.04510343, + "balance_loss_mlp": 1.02056921, + "epoch": 0.6592514655042838, + "flos": 25884591102720.0, + "grad_norm": 1.9997869132663832, + "language_loss": 0.73904204, + "learning_rate": 1.0991225950509106e-06, + "loss": 0.76058286, + "num_input_tokens_seen": 236688945, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.1194458, + "step": 10965, + "time_per_iteration": 2.4686150550842285 + }, + { + "auxiliary_loss_clip": 0.01121995, + "auxiliary_loss_mlp": 0.01032756, + "balance_loss_clip": 1.04450881, + "balance_loss_mlp": 1.01908326, + "epoch": 0.6593115887569517, + "flos": 14063624841600.0, + "grad_norm": 3.1241091223217334, + "language_loss": 0.73936945, + "learning_rate": 1.0987748978796067e-06, + "loss": 0.76091695, + "num_input_tokens_seen": 236707055, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.13671875, + "step": 10966, + "time_per_iteration": 2.4133427143096924 + }, + { + "auxiliary_loss_clip": 0.01120674, + "auxiliary_loss_mlp": 0.01028858, + "balance_loss_clip": 1.0470407, + "balance_loss_mlp": 1.01654339, + "epoch": 0.6593717120096197, + "flos": 24717925359360.0, + "grad_norm": 1.5377533928964513, + "language_loss": 0.77041394, + "learning_rate": 1.0984272348828487e-06, + "loss": 0.79190922, + "num_input_tokens_seen": 236725900, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.12316895, + "step": 10967, + "time_per_iteration": 3.9459280967712402 + }, + { + "auxiliary_loss_clip": 0.01062603, + "auxiliary_loss_mlp": 0.01004006, + "balance_loss_clip": 1.03758323, + "balance_loss_mlp": 1.00227606, + "epoch": 0.6594318352622877, + "flos": 55558083502080.0, + "grad_norm": 0.693839799346054, + "language_loss": 0.48450279, + "learning_rate": 1.0980796060738221e-06, + "loss": 0.50516891, + "num_input_tokens_seen": 236788415, + "router_z_loss_clip": 0.25048828, + "router_z_loss_mlp": 0.01728821, + "step": 10968, + "time_per_iteration": 3.065244197845459 + }, + { + "auxiliary_loss_clip": 0.01116453, + "auxiliary_loss_mlp": 0.01036862, + "balance_loss_clip": 1.04208195, + "balance_loss_mlp": 1.02346277, + "epoch": 0.6594919585149557, + "flos": 17456931699840.0, + "grad_norm": 1.95430279359581, + "language_loss": 0.79068732, + "learning_rate": 1.0977320114657058e-06, + "loss": 0.81222045, + "num_input_tokens_seen": 236805155, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.13415527, + "step": 10969, + "time_per_iteration": 2.393984794616699 + }, + { + "auxiliary_loss_clip": 0.0112295, + "auxiliary_loss_mlp": 0.01031147, + "balance_loss_clip": 1.04775214, + "balance_loss_mlp": 1.01932108, + "epoch": 0.6595520817676236, + "flos": 18223229473920.0, + "grad_norm": 1.9041391453158576, + "language_loss": 0.65379626, + "learning_rate": 1.0973844510716817e-06, + "loss": 0.67533731, + "num_input_tokens_seen": 236824360, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.11834717, + "step": 10970, + "time_per_iteration": 2.4215261936187744 + }, + { + "auxiliary_loss_clip": 0.01116291, + "auxiliary_loss_mlp": 0.0102477, + "balance_loss_clip": 1.04308307, + "balance_loss_mlp": 1.01254535, + "epoch": 0.6596122050202916, + "flos": 22199761463040.0, + "grad_norm": 1.7252190031068106, + "language_loss": 0.76327699, + "learning_rate": 1.0970369249049308e-06, + "loss": 0.78468758, + "num_input_tokens_seen": 236844640, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.12207031, + "step": 10971, + "time_per_iteration": 2.4582648277282715 + }, + { + "auxiliary_loss_clip": 0.0111779, + "auxiliary_loss_mlp": 0.01031102, + "balance_loss_clip": 1.04185402, + "balance_loss_mlp": 1.01915157, + "epoch": 0.6596723282729595, + "flos": 14173834746240.0, + "grad_norm": 3.4251134260000042, + "language_loss": 0.70114827, + "learning_rate": 1.096689432978629e-06, + "loss": 0.72263718, + "num_input_tokens_seen": 236861160, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.11962891, + "step": 10972, + "time_per_iteration": 2.418497085571289 + }, + { + "auxiliary_loss_clip": 0.0111682, + "auxiliary_loss_mlp": 0.01024704, + "balance_loss_clip": 1.04205716, + "balance_loss_mlp": 1.01208007, + "epoch": 0.6597324515256275, + "flos": 30553193410560.0, + "grad_norm": 2.198311787828064, + "language_loss": 0.55907238, + "learning_rate": 1.0963419753059556e-06, + "loss": 0.58048761, + "num_input_tokens_seen": 236880465, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.1262207, + "step": 10973, + "time_per_iteration": 2.5538246631622314 + }, + { + "auxiliary_loss_clip": 0.01124687, + "auxiliary_loss_mlp": 0.01034277, + "balance_loss_clip": 1.04649925, + "balance_loss_mlp": 1.02175379, + "epoch": 0.6597925747782956, + "flos": 17639860688640.0, + "grad_norm": 1.9995759477413695, + "language_loss": 0.79105914, + "learning_rate": 1.0959945519000839e-06, + "loss": 0.81264883, + "num_input_tokens_seen": 236897730, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.12518311, + "step": 10974, + "time_per_iteration": 3.864039897918701 + }, + { + "auxiliary_loss_clip": 0.01121632, + "auxiliary_loss_mlp": 0.01039027, + "balance_loss_clip": 1.04438519, + "balance_loss_mlp": 1.02679598, + "epoch": 0.6598526980309635, + "flos": 22819112697600.0, + "grad_norm": 2.5374011354116357, + "language_loss": 0.68606341, + "learning_rate": 1.0956471627741906e-06, + "loss": 0.70766997, + "num_input_tokens_seen": 236917300, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.12225342, + "step": 10975, + "time_per_iteration": 2.4459917545318604 + }, + { + "auxiliary_loss_clip": 0.01137503, + "auxiliary_loss_mlp": 0.01026899, + "balance_loss_clip": 1.0597527, + "balance_loss_mlp": 1.01510906, + "epoch": 0.6599128212836315, + "flos": 21068036674560.0, + "grad_norm": 1.9521415624071068, + "language_loss": 0.70848137, + "learning_rate": 1.0952998079414464e-06, + "loss": 0.73012537, + "num_input_tokens_seen": 236935590, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.11798096, + "step": 10976, + "time_per_iteration": 2.4894766807556152 + }, + { + "auxiliary_loss_clip": 0.01112949, + "auxiliary_loss_mlp": 0.01028808, + "balance_loss_clip": 1.04299653, + "balance_loss_mlp": 1.01672029, + "epoch": 0.6599729445362994, + "flos": 22163527618560.0, + "grad_norm": 1.610501525076049, + "language_loss": 0.67607445, + "learning_rate": 1.0949524874150243e-06, + "loss": 0.697492, + "num_input_tokens_seen": 236952830, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.12084961, + "step": 10977, + "time_per_iteration": 2.536452054977417 + }, + { + "auxiliary_loss_clip": 0.01115138, + "auxiliary_loss_mlp": 0.01031324, + "balance_loss_clip": 1.03860044, + "balance_loss_mlp": 1.01801407, + "epoch": 0.6600330677889674, + "flos": 18150079426560.0, + "grad_norm": 1.9874947291735323, + "language_loss": 0.81223404, + "learning_rate": 1.0946052012080952e-06, + "loss": 0.83369863, + "num_input_tokens_seen": 236971930, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.13293457, + "step": 10978, + "time_per_iteration": 2.49119234085083 + }, + { + "auxiliary_loss_clip": 0.01112878, + "auxiliary_loss_mlp": 0.01035154, + "balance_loss_clip": 1.03764319, + "balance_loss_mlp": 1.02247584, + "epoch": 0.6600931910416353, + "flos": 18150115340160.0, + "grad_norm": 2.301305402220437, + "language_loss": 0.6768316, + "learning_rate": 1.0942579493338278e-06, + "loss": 0.69831192, + "num_input_tokens_seen": 236989920, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.12677002, + "step": 10979, + "time_per_iteration": 2.5806102752685547 + }, + { + "auxiliary_loss_clip": 0.01120213, + "auxiliary_loss_mlp": 0.01031579, + "balance_loss_clip": 1.0432471, + "balance_loss_mlp": 1.01866269, + "epoch": 0.6601533142943034, + "flos": 17420733768960.0, + "grad_norm": 2.331526124104998, + "language_loss": 0.73633128, + "learning_rate": 1.0939107318053889e-06, + "loss": 0.75784922, + "num_input_tokens_seen": 237006570, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.12927246, + "step": 10980, + "time_per_iteration": 2.4786946773529053 + }, + { + "auxiliary_loss_clip": 0.01106019, + "auxiliary_loss_mlp": 0.01031499, + "balance_loss_clip": 1.03709662, + "balance_loss_mlp": 1.02068686, + "epoch": 0.6602134375469713, + "flos": 28219574615040.0, + "grad_norm": 1.6204824353994898, + "language_loss": 0.72920275, + "learning_rate": 1.0935635486359459e-06, + "loss": 0.75057793, + "num_input_tokens_seen": 237028415, + "router_z_loss_clip": 0.68945312, + "router_z_loss_mlp": 0.10809326, + "step": 10981, + "time_per_iteration": 2.5758237838745117 + }, + { + "auxiliary_loss_clip": 0.01119405, + "auxiliary_loss_mlp": 0.01039736, + "balance_loss_clip": 1.04226851, + "balance_loss_mlp": 1.02642608, + "epoch": 0.6602735607996393, + "flos": 29418056830080.0, + "grad_norm": 2.0119635128538085, + "language_loss": 0.68682438, + "learning_rate": 1.0932163998386647e-06, + "loss": 0.70841575, + "num_input_tokens_seen": 237046595, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.13317871, + "step": 10982, + "time_per_iteration": 2.49448561668396 + }, + { + "auxiliary_loss_clip": 0.01116952, + "auxiliary_loss_mlp": 0.01026515, + "balance_loss_clip": 1.04521394, + "balance_loss_mlp": 1.01467204, + "epoch": 0.6603336840523072, + "flos": 18588045957120.0, + "grad_norm": 1.5456574936350846, + "language_loss": 0.69770181, + "learning_rate": 1.0928692854267075e-06, + "loss": 0.71913642, + "num_input_tokens_seen": 237066150, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.1184082, + "step": 10983, + "time_per_iteration": 2.4203505516052246 + }, + { + "auxiliary_loss_clip": 0.01124942, + "auxiliary_loss_mlp": 0.01029686, + "balance_loss_clip": 1.04886293, + "balance_loss_mlp": 1.01722217, + "epoch": 0.6603938073049752, + "flos": 33254860913280.0, + "grad_norm": 1.7046443815827417, + "language_loss": 0.70472211, + "learning_rate": 1.092522205413239e-06, + "loss": 0.72626841, + "num_input_tokens_seen": 237087060, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12469482, + "step": 10984, + "time_per_iteration": 2.5680596828460693 + }, + { + "auxiliary_loss_clip": 0.01122008, + "auxiliary_loss_mlp": 0.01030735, + "balance_loss_clip": 1.04830086, + "balance_loss_mlp": 1.01895666, + "epoch": 0.6604539305576431, + "flos": 17384284442880.0, + "grad_norm": 1.6146535193014895, + "language_loss": 0.8407793, + "learning_rate": 1.0921751598114193e-06, + "loss": 0.86230671, + "num_input_tokens_seen": 237103825, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11767578, + "step": 10985, + "time_per_iteration": 2.414405107498169 + }, + { + "auxiliary_loss_clip": 0.01117528, + "auxiliary_loss_mlp": 0.01032186, + "balance_loss_clip": 1.04363513, + "balance_loss_mlp": 1.01994944, + "epoch": 0.6605140538103111, + "flos": 21251145231360.0, + "grad_norm": 2.0350511353702734, + "language_loss": 0.74369866, + "learning_rate": 1.0918281486344077e-06, + "loss": 0.76519579, + "num_input_tokens_seen": 237121740, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.12243652, + "step": 10986, + "time_per_iteration": 2.495434522628784 + }, + { + "auxiliary_loss_clip": 0.01124256, + "auxiliary_loss_mlp": 0.01031682, + "balance_loss_clip": 1.05345905, + "balance_loss_mlp": 1.01982045, + "epoch": 0.6605741770629792, + "flos": 13881701433600.0, + "grad_norm": 2.050145706501778, + "language_loss": 0.79025865, + "learning_rate": 1.0914811718953636e-06, + "loss": 0.811818, + "num_input_tokens_seen": 237139565, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11865234, + "step": 10987, + "time_per_iteration": 2.4233970642089844 + }, + { + "auxiliary_loss_clip": 0.01051789, + "auxiliary_loss_mlp": 0.01004298, + "balance_loss_clip": 1.02512062, + "balance_loss_mlp": 1.00266516, + "epoch": 0.6606343003156471, + "flos": 69316215171840.0, + "grad_norm": 0.8183324117458091, + "language_loss": 0.54106337, + "learning_rate": 1.0911342296074454e-06, + "loss": 0.56162429, + "num_input_tokens_seen": 237201055, + "router_z_loss_clip": 0.26708984, + "router_z_loss_mlp": 0.01635742, + "step": 10988, + "time_per_iteration": 3.147425413131714 + }, + { + "auxiliary_loss_clip": 0.01115255, + "auxiliary_loss_mlp": 0.01034369, + "balance_loss_clip": 1.04214787, + "balance_loss_mlp": 1.02359247, + "epoch": 0.6606944235683151, + "flos": 27272394927360.0, + "grad_norm": 1.5603301408107781, + "language_loss": 0.77394205, + "learning_rate": 1.0907873217838077e-06, + "loss": 0.79543829, + "num_input_tokens_seen": 237221805, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.10784912, + "step": 10989, + "time_per_iteration": 2.5070972442626953 + }, + { + "auxiliary_loss_clip": 0.01117359, + "auxiliary_loss_mlp": 0.01033226, + "balance_loss_clip": 1.04608071, + "balance_loss_mlp": 1.02213407, + "epoch": 0.660754546820983, + "flos": 13772820332160.0, + "grad_norm": 1.8882898410732791, + "language_loss": 0.7725296, + "learning_rate": 1.0904404484376064e-06, + "loss": 0.79403543, + "num_input_tokens_seen": 237238270, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.11090088, + "step": 10990, + "time_per_iteration": 2.4086220264434814 + }, + { + "auxiliary_loss_clip": 0.01123097, + "auxiliary_loss_mlp": 0.01030439, + "balance_loss_clip": 1.04642391, + "balance_loss_mlp": 1.01789224, + "epoch": 0.660814670073651, + "flos": 15705209232000.0, + "grad_norm": 3.7927692885402835, + "language_loss": 0.60998976, + "learning_rate": 1.0900936095819937e-06, + "loss": 0.6315251, + "num_input_tokens_seen": 237255400, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.12548828, + "step": 10991, + "time_per_iteration": 2.4059927463531494 + }, + { + "auxiliary_loss_clip": 0.01121561, + "auxiliary_loss_mlp": 0.01036108, + "balance_loss_clip": 1.04501832, + "balance_loss_mlp": 1.02372241, + "epoch": 0.6608747933263189, + "flos": 20850023076480.0, + "grad_norm": 2.554341205841975, + "language_loss": 0.6904639, + "learning_rate": 1.0897468052301234e-06, + "loss": 0.71204066, + "num_input_tokens_seen": 237273105, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12371826, + "step": 10992, + "time_per_iteration": 3.8108818531036377 + }, + { + "auxiliary_loss_clip": 0.01125519, + "auxiliary_loss_mlp": 0.01035167, + "balance_loss_clip": 1.04749537, + "balance_loss_mlp": 1.0223999, + "epoch": 0.660934916578987, + "flos": 20632117219200.0, + "grad_norm": 1.8929406947090461, + "language_loss": 0.88194716, + "learning_rate": 1.0894000353951444e-06, + "loss": 0.90355396, + "num_input_tokens_seen": 237292650, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.12774658, + "step": 10993, + "time_per_iteration": 2.5261714458465576 + }, + { + "auxiliary_loss_clip": 0.01123018, + "auxiliary_loss_mlp": 0.01034726, + "balance_loss_clip": 1.04482067, + "balance_loss_mlp": 1.01906145, + "epoch": 0.6609950398316549, + "flos": 25113588647040.0, + "grad_norm": 1.5618905502075449, + "language_loss": 0.66759253, + "learning_rate": 1.0890533000902078e-06, + "loss": 0.68916994, + "num_input_tokens_seen": 237312865, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.15667725, + "step": 10994, + "time_per_iteration": 2.502408742904663 + }, + { + "auxiliary_loss_clip": 0.01126641, + "auxiliary_loss_mlp": 0.01033802, + "balance_loss_clip": 1.05073225, + "balance_loss_mlp": 1.02094531, + "epoch": 0.6610551630843229, + "flos": 18661196004480.0, + "grad_norm": 1.7766912332894618, + "language_loss": 0.77044499, + "learning_rate": 1.0887065993284626e-06, + "loss": 0.79204941, + "num_input_tokens_seen": 237331210, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.128479, + "step": 10995, + "time_per_iteration": 2.4170501232147217 + }, + { + "auxiliary_loss_clip": 0.01123034, + "auxiliary_loss_mlp": 0.01033925, + "balance_loss_clip": 1.04397845, + "balance_loss_mlp": 1.02138424, + "epoch": 0.6611152863369908, + "flos": 23258192549760.0, + "grad_norm": 1.920041124793793, + "language_loss": 0.74285781, + "learning_rate": 1.088359933123053e-06, + "loss": 0.76442742, + "num_input_tokens_seen": 237349455, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.12536621, + "step": 10996, + "time_per_iteration": 2.451321840286255 + }, + { + "auxiliary_loss_clip": 0.01124792, + "auxiliary_loss_mlp": 0.01033248, + "balance_loss_clip": 1.05035472, + "balance_loss_mlp": 1.02123189, + "epoch": 0.6611754095896588, + "flos": 22159720776960.0, + "grad_norm": 1.985391671468361, + "language_loss": 0.69077706, + "learning_rate": 1.088013301487126e-06, + "loss": 0.71235746, + "num_input_tokens_seen": 237367100, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.12017822, + "step": 10997, + "time_per_iteration": 2.4193899631500244 + }, + { + "auxiliary_loss_clip": 0.01124084, + "auxiliary_loss_mlp": 0.01030304, + "balance_loss_clip": 1.04508018, + "balance_loss_mlp": 1.01813245, + "epoch": 0.6612355328423267, + "flos": 13991228979840.0, + "grad_norm": 2.0300254939734748, + "language_loss": 0.68599886, + "learning_rate": 1.0876667044338269e-06, + "loss": 0.70754272, + "num_input_tokens_seen": 237384840, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.1217041, + "step": 10998, + "time_per_iteration": 2.420520305633545 + }, + { + "auxiliary_loss_clip": 0.01054803, + "auxiliary_loss_mlp": 0.01004445, + "balance_loss_clip": 1.02732611, + "balance_loss_mlp": 1.00264835, + "epoch": 0.6612956560949947, + "flos": 61453716359040.0, + "grad_norm": 0.6542098073594683, + "language_loss": 0.51101458, + "learning_rate": 1.087320141976297e-06, + "loss": 0.53160703, + "num_input_tokens_seen": 237443355, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.01797485, + "step": 10999, + "time_per_iteration": 3.0379278659820557 + }, + { + "auxiliary_loss_clip": 0.01128397, + "auxiliary_loss_mlp": 0.01034436, + "balance_loss_clip": 1.05207038, + "balance_loss_mlp": 1.02266407, + "epoch": 0.6613557793476627, + "flos": 21616644072960.0, + "grad_norm": 2.489989974466796, + "language_loss": 0.70497823, + "learning_rate": 1.086973614127679e-06, + "loss": 0.72660655, + "num_input_tokens_seen": 237459205, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.11773682, + "step": 11000, + "time_per_iteration": 2.476977825164795 + }, + { + "auxiliary_loss_clip": 0.01109538, + "auxiliary_loss_mlp": 0.01034111, + "balance_loss_clip": 1.03884268, + "balance_loss_mlp": 1.02286327, + "epoch": 0.6614159026003307, + "flos": 34020117192960.0, + "grad_norm": 2.0919917416968077, + "language_loss": 0.65177941, + "learning_rate": 1.0866271209011133e-06, + "loss": 0.67321587, + "num_input_tokens_seen": 237483580, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.11248779, + "step": 11001, + "time_per_iteration": 2.5842058658599854 + }, + { + "auxiliary_loss_clip": 0.01119804, + "auxiliary_loss_mlp": 0.01031351, + "balance_loss_clip": 1.04472947, + "balance_loss_mlp": 1.01938796, + "epoch": 0.6614760258529987, + "flos": 24097281235200.0, + "grad_norm": 1.9393075145207814, + "language_loss": 0.72948658, + "learning_rate": 1.086280662309739e-06, + "loss": 0.75099814, + "num_input_tokens_seen": 237502860, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11962891, + "step": 11002, + "time_per_iteration": 2.4892773628234863 + }, + { + "auxiliary_loss_clip": 0.01115569, + "auxiliary_loss_mlp": 0.01031958, + "balance_loss_clip": 1.04218364, + "balance_loss_mlp": 1.01952505, + "epoch": 0.6615361491056666, + "flos": 14903790935040.0, + "grad_norm": 2.003182501976737, + "language_loss": 0.79049134, + "learning_rate": 1.0859342383666928e-06, + "loss": 0.8119666, + "num_input_tokens_seen": 237521030, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.12438965, + "step": 11003, + "time_per_iteration": 2.450299024581909 + }, + { + "auxiliary_loss_clip": 0.01125498, + "auxiliary_loss_mlp": 0.01034415, + "balance_loss_clip": 1.0511167, + "balance_loss_mlp": 1.02084923, + "epoch": 0.6615962723583346, + "flos": 15304877176320.0, + "grad_norm": 1.8690769980981876, + "language_loss": 0.69312561, + "learning_rate": 1.0855878490851119e-06, + "loss": 0.71472478, + "num_input_tokens_seen": 237539585, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.13568115, + "step": 11004, + "time_per_iteration": 2.4300618171691895 + }, + { + "auxiliary_loss_clip": 0.01126478, + "auxiliary_loss_mlp": 0.01031866, + "balance_loss_clip": 1.048877, + "balance_loss_mlp": 1.0178833, + "epoch": 0.6616563956110025, + "flos": 18732586285440.0, + "grad_norm": 2.8658118481071986, + "language_loss": 0.69589388, + "learning_rate": 1.085241494478132e-06, + "loss": 0.71747732, + "num_input_tokens_seen": 237557655, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.13989258, + "step": 11005, + "time_per_iteration": 2.47724986076355 + }, + { + "auxiliary_loss_clip": 0.0111903, + "auxiliary_loss_mlp": 0.01031389, + "balance_loss_clip": 1.04616189, + "balance_loss_mlp": 1.01946235, + "epoch": 0.6617165188636706, + "flos": 24495063425280.0, + "grad_norm": 1.6485342094786897, + "language_loss": 0.78301191, + "learning_rate": 1.0848951745588855e-06, + "loss": 0.80451608, + "num_input_tokens_seen": 237577000, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.1194458, + "step": 11006, + "time_per_iteration": 2.502912998199463 + }, + { + "auxiliary_loss_clip": 0.01110455, + "auxiliary_loss_mlp": 0.0103899, + "balance_loss_clip": 1.03821874, + "balance_loss_mlp": 1.02435732, + "epoch": 0.6617766421163385, + "flos": 22379673709440.0, + "grad_norm": 1.5277777077739536, + "language_loss": 0.76804805, + "learning_rate": 1.0845488893405068e-06, + "loss": 0.78954244, + "num_input_tokens_seen": 237597960, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.1463623, + "step": 11007, + "time_per_iteration": 3.923363208770752 + }, + { + "auxiliary_loss_clip": 0.01113867, + "auxiliary_loss_mlp": 0.01034267, + "balance_loss_clip": 1.04074466, + "balance_loss_mlp": 1.02176213, + "epoch": 0.6618367653690065, + "flos": 20850418126080.0, + "grad_norm": 1.942897775722228, + "language_loss": 0.78774434, + "learning_rate": 1.0842026388361248e-06, + "loss": 0.80922568, + "num_input_tokens_seen": 237616385, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.12512207, + "step": 11008, + "time_per_iteration": 2.5191760063171387 + }, + { + "auxiliary_loss_clip": 0.01123253, + "auxiliary_loss_mlp": 0.01034104, + "balance_loss_clip": 1.04666054, + "balance_loss_mlp": 1.02056742, + "epoch": 0.6618968886216744, + "flos": 17712328377600.0, + "grad_norm": 1.6937717036773137, + "language_loss": 0.81833375, + "learning_rate": 1.0838564230588715e-06, + "loss": 0.83990729, + "num_input_tokens_seen": 237634930, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.13525391, + "step": 11009, + "time_per_iteration": 2.538944959640503 + }, + { + "auxiliary_loss_clip": 0.01052015, + "auxiliary_loss_mlp": 0.01003572, + "balance_loss_clip": 1.02632618, + "balance_loss_mlp": 1.00175691, + "epoch": 0.6619570118743424, + "flos": 67035347498880.0, + "grad_norm": 0.9842699514270852, + "language_loss": 0.6742928, + "learning_rate": 1.0835102420218735e-06, + "loss": 0.69484866, + "num_input_tokens_seen": 237693175, + "router_z_loss_clip": 0.25732422, + "router_z_loss_mlp": 0.01815796, + "step": 11010, + "time_per_iteration": 4.468024492263794 + }, + { + "auxiliary_loss_clip": 0.01119926, + "auxiliary_loss_mlp": 0.01036118, + "balance_loss_clip": 1.04361701, + "balance_loss_mlp": 1.0224508, + "epoch": 0.6620171351270103, + "flos": 18660908695680.0, + "grad_norm": 1.6560253300912313, + "language_loss": 0.70906538, + "learning_rate": 1.0831640957382593e-06, + "loss": 0.73062581, + "num_input_tokens_seen": 237713160, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.13671875, + "step": 11011, + "time_per_iteration": 2.4539005756378174 + }, + { + "auxiliary_loss_clip": 0.01126394, + "auxiliary_loss_mlp": 0.01036031, + "balance_loss_clip": 1.05299139, + "balance_loss_mlp": 1.02400303, + "epoch": 0.6620772583796783, + "flos": 24170503109760.0, + "grad_norm": 1.4551128045710962, + "language_loss": 0.72326916, + "learning_rate": 1.0828179842211557e-06, + "loss": 0.74489343, + "num_input_tokens_seen": 237733600, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.12030029, + "step": 11012, + "time_per_iteration": 2.455094337463379 + }, + { + "auxiliary_loss_clip": 0.01112952, + "auxiliary_loss_mlp": 0.01032147, + "balance_loss_clip": 1.04479337, + "balance_loss_mlp": 1.02158499, + "epoch": 0.6621373816323463, + "flos": 23623547736960.0, + "grad_norm": 1.6268941359335984, + "language_loss": 0.79621601, + "learning_rate": 1.0824719074836845e-06, + "loss": 0.81766701, + "num_input_tokens_seen": 237752135, + "router_z_loss_clip": 0.68212891, + "router_z_loss_mlp": 0.10552979, + "step": 11013, + "time_per_iteration": 2.4496216773986816 + }, + { + "auxiliary_loss_clip": 0.01122661, + "auxiliary_loss_mlp": 0.01030987, + "balance_loss_clip": 1.05018139, + "balance_loss_mlp": 1.01883364, + "epoch": 0.6621975048850143, + "flos": 18442212739200.0, + "grad_norm": 2.0095255996695256, + "language_loss": 0.70461398, + "learning_rate": 1.082125865538971e-06, + "loss": 0.72615051, + "num_input_tokens_seen": 237770735, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.12164307, + "step": 11014, + "time_per_iteration": 2.4210546016693115 + }, + { + "auxiliary_loss_clip": 0.01118524, + "auxiliary_loss_mlp": 0.01027375, + "balance_loss_clip": 1.04673576, + "balance_loss_mlp": 1.0160563, + "epoch": 0.6622576281376823, + "flos": 14063876236800.0, + "grad_norm": 1.7082384309345824, + "language_loss": 0.77070308, + "learning_rate": 1.081779858400137e-06, + "loss": 0.79216206, + "num_input_tokens_seen": 237789005, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11328125, + "step": 11015, + "time_per_iteration": 2.416478157043457 + }, + { + "auxiliary_loss_clip": 0.01122099, + "auxiliary_loss_mlp": 0.01027404, + "balance_loss_clip": 1.04994226, + "balance_loss_mlp": 1.01475, + "epoch": 0.6623177513903502, + "flos": 17018965169280.0, + "grad_norm": 1.7169544305386204, + "language_loss": 0.82723975, + "learning_rate": 1.0814338860803021e-06, + "loss": 0.84873474, + "num_input_tokens_seen": 237807740, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.12664795, + "step": 11016, + "time_per_iteration": 2.451019048690796 + }, + { + "auxiliary_loss_clip": 0.01116301, + "auxiliary_loss_mlp": 0.01033294, + "balance_loss_clip": 1.0415436, + "balance_loss_mlp": 1.02109313, + "epoch": 0.6623778746430182, + "flos": 17271021882240.0, + "grad_norm": 3.00623874688504, + "language_loss": 0.70218319, + "learning_rate": 1.0810879485925864e-06, + "loss": 0.72367918, + "num_input_tokens_seen": 237826340, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.12194824, + "step": 11017, + "time_per_iteration": 2.403524875640869 + }, + { + "auxiliary_loss_clip": 0.01117571, + "auxiliary_loss_mlp": 0.01032704, + "balance_loss_clip": 1.04535961, + "balance_loss_mlp": 1.02033556, + "epoch": 0.6624379978956861, + "flos": 48792688767360.0, + "grad_norm": 1.81658086367919, + "language_loss": 0.77114582, + "learning_rate": 1.0807420459501084e-06, + "loss": 0.79264855, + "num_input_tokens_seen": 237848305, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.12359619, + "step": 11018, + "time_per_iteration": 4.098193407058716 + }, + { + "auxiliary_loss_clip": 0.01115847, + "auxiliary_loss_mlp": 0.0103674, + "balance_loss_clip": 1.04097486, + "balance_loss_mlp": 1.02404976, + "epoch": 0.6624981211483542, + "flos": 18952431477120.0, + "grad_norm": 2.3502882908048393, + "language_loss": 0.83217794, + "learning_rate": 1.0803961781659841e-06, + "loss": 0.85370386, + "num_input_tokens_seen": 237867020, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.12689209, + "step": 11019, + "time_per_iteration": 2.462214946746826 + }, + { + "auxiliary_loss_clip": 0.01110767, + "auxiliary_loss_mlp": 0.01027526, + "balance_loss_clip": 1.03951979, + "balance_loss_mlp": 1.01551533, + "epoch": 0.6625582444010221, + "flos": 23256576437760.0, + "grad_norm": 1.7128102976379511, + "language_loss": 0.72023857, + "learning_rate": 1.080050345253328e-06, + "loss": 0.74162143, + "num_input_tokens_seen": 237886710, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.12005615, + "step": 11020, + "time_per_iteration": 2.484422206878662 + }, + { + "auxiliary_loss_clip": 0.01115238, + "auxiliary_loss_mlp": 0.01027445, + "balance_loss_clip": 1.03911781, + "balance_loss_mlp": 1.01387918, + "epoch": 0.6626183676536901, + "flos": 21394823633280.0, + "grad_norm": 2.000094117860449, + "language_loss": 0.72540522, + "learning_rate": 1.0797045472252554e-06, + "loss": 0.74683213, + "num_input_tokens_seen": 237904795, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.13568115, + "step": 11021, + "time_per_iteration": 2.4978880882263184 + }, + { + "auxiliary_loss_clip": 0.0111801, + "auxiliary_loss_mlp": 0.01032269, + "balance_loss_clip": 1.0450139, + "balance_loss_mlp": 1.01982391, + "epoch": 0.662678490906358, + "flos": 14571293713920.0, + "grad_norm": 2.0791192453932457, + "language_loss": 0.8337096, + "learning_rate": 1.0793587840948793e-06, + "loss": 0.85521233, + "num_input_tokens_seen": 237921320, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.12463379, + "step": 11022, + "time_per_iteration": 2.4637086391448975 + }, + { + "auxiliary_loss_clip": 0.0112447, + "auxiliary_loss_mlp": 0.01032101, + "balance_loss_clip": 1.04321265, + "balance_loss_mlp": 1.01801682, + "epoch": 0.662738614159026, + "flos": 15992350554240.0, + "grad_norm": 3.0280379678982383, + "language_loss": 0.72831035, + "learning_rate": 1.0790130558753099e-06, + "loss": 0.74987608, + "num_input_tokens_seen": 237933525, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.14093018, + "step": 11023, + "time_per_iteration": 2.3812880516052246 + }, + { + "auxiliary_loss_clip": 0.01110573, + "auxiliary_loss_mlp": 0.01030063, + "balance_loss_clip": 1.03821659, + "balance_loss_mlp": 1.01680136, + "epoch": 0.6627987374116939, + "flos": 19536338966400.0, + "grad_norm": 1.6634990303283255, + "language_loss": 0.74956858, + "learning_rate": 1.0786673625796574e-06, + "loss": 0.77097499, + "num_input_tokens_seen": 237953395, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.13256836, + "step": 11024, + "time_per_iteration": 2.446817398071289 + }, + { + "auxiliary_loss_clip": 0.01123377, + "auxiliary_loss_mlp": 0.01027612, + "balance_loss_clip": 1.04744899, + "balance_loss_mlp": 1.01465392, + "epoch": 0.662858860664362, + "flos": 15702838934400.0, + "grad_norm": 2.290316955440132, + "language_loss": 0.69248611, + "learning_rate": 1.0783217042210306e-06, + "loss": 0.71399599, + "num_input_tokens_seen": 237971445, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12969971, + "step": 11025, + "time_per_iteration": 2.4092907905578613 + }, + { + "auxiliary_loss_clip": 0.01124668, + "auxiliary_loss_mlp": 0.01030215, + "balance_loss_clip": 1.04920197, + "balance_loss_mlp": 1.01806712, + "epoch": 0.6629189839170299, + "flos": 20154289570560.0, + "grad_norm": 1.51700831372696, + "language_loss": 0.78871405, + "learning_rate": 1.0779760808125379e-06, + "loss": 0.81026286, + "num_input_tokens_seen": 237989965, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.121521, + "step": 11026, + "time_per_iteration": 2.4600260257720947 + }, + { + "auxiliary_loss_clip": 0.01113953, + "auxiliary_loss_mlp": 0.01028616, + "balance_loss_clip": 1.04259694, + "balance_loss_mlp": 1.01731515, + "epoch": 0.6629791071696979, + "flos": 20915415786240.0, + "grad_norm": 1.60259431707426, + "language_loss": 0.76130223, + "learning_rate": 1.0776304923672842e-06, + "loss": 0.7827279, + "num_input_tokens_seen": 238006820, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.11303711, + "step": 11027, + "time_per_iteration": 2.430166721343994 + }, + { + "auxiliary_loss_clip": 0.01120182, + "auxiliary_loss_mlp": 0.01032102, + "balance_loss_clip": 1.04504895, + "balance_loss_mlp": 1.0195967, + "epoch": 0.6630392304223659, + "flos": 20846898593280.0, + "grad_norm": 2.870587186236546, + "language_loss": 0.70609701, + "learning_rate": 1.0772849388983742e-06, + "loss": 0.72761983, + "num_input_tokens_seen": 238022560, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.12506104, + "step": 11028, + "time_per_iteration": 2.452784299850464 + }, + { + "auxiliary_loss_clip": 0.01109562, + "auxiliary_loss_mlp": 0.01027989, + "balance_loss_clip": 1.03706491, + "balance_loss_mlp": 1.01705742, + "epoch": 0.6630993536750338, + "flos": 20995820380800.0, + "grad_norm": 2.584313263037007, + "language_loss": 0.79736459, + "learning_rate": 1.0769394204189138e-06, + "loss": 0.81874007, + "num_input_tokens_seen": 238041895, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.10931396, + "step": 11029, + "time_per_iteration": 2.4661619663238525 + }, + { + "auxiliary_loss_clip": 0.01121078, + "auxiliary_loss_mlp": 0.01031877, + "balance_loss_clip": 1.04461563, + "balance_loss_mlp": 1.01897836, + "epoch": 0.6631594769277018, + "flos": 18259032355200.0, + "grad_norm": 1.9553245578811254, + "language_loss": 0.75766605, + "learning_rate": 1.0765939369420012e-06, + "loss": 0.77919561, + "num_input_tokens_seen": 238060445, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.12902832, + "step": 11030, + "time_per_iteration": 2.4808688163757324 + }, + { + "auxiliary_loss_clip": 0.01119351, + "auxiliary_loss_mlp": 0.01032883, + "balance_loss_clip": 1.04195452, + "balance_loss_mlp": 1.01996076, + "epoch": 0.6632196001803697, + "flos": 17820491207040.0, + "grad_norm": 2.1796718641759876, + "language_loss": 0.75488621, + "learning_rate": 1.0762484884807391e-06, + "loss": 0.77640861, + "num_input_tokens_seen": 238077080, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12927246, + "step": 11031, + "time_per_iteration": 2.415971279144287 + }, + { + "auxiliary_loss_clip": 0.01120258, + "auxiliary_loss_mlp": 0.01032441, + "balance_loss_clip": 1.0429579, + "balance_loss_mlp": 1.01987648, + "epoch": 0.6632797234330378, + "flos": 12670182581760.0, + "grad_norm": 3.872092913987917, + "language_loss": 0.74968731, + "learning_rate": 1.075903075048228e-06, + "loss": 0.77121425, + "num_input_tokens_seen": 238091045, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.12561035, + "step": 11032, + "time_per_iteration": 2.445267915725708 + }, + { + "auxiliary_loss_clip": 0.01118578, + "auxiliary_loss_mlp": 0.01030853, + "balance_loss_clip": 1.04539979, + "balance_loss_mlp": 1.01928973, + "epoch": 0.6633398466857057, + "flos": 23584728113280.0, + "grad_norm": 1.8816299511359549, + "language_loss": 0.81123519, + "learning_rate": 1.0755576966575635e-06, + "loss": 0.83272952, + "num_input_tokens_seen": 238110220, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11566162, + "step": 11033, + "time_per_iteration": 2.4451065063476562 + }, + { + "auxiliary_loss_clip": 0.01113571, + "auxiliary_loss_mlp": 0.01032505, + "balance_loss_clip": 1.0388267, + "balance_loss_mlp": 1.01959467, + "epoch": 0.6633999699383737, + "flos": 20631686256000.0, + "grad_norm": 1.942557955082308, + "language_loss": 0.80799705, + "learning_rate": 1.0752123533218451e-06, + "loss": 0.82945782, + "num_input_tokens_seen": 238130400, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.12908936, + "step": 11034, + "time_per_iteration": 2.4954447746276855 + }, + { + "auxiliary_loss_clip": 0.01116563, + "auxiliary_loss_mlp": 0.01028272, + "balance_loss_clip": 1.04390001, + "balance_loss_mlp": 1.0170126, + "epoch": 0.6634600931910416, + "flos": 21797095023360.0, + "grad_norm": 1.6567988613776683, + "language_loss": 0.75779903, + "learning_rate": 1.074867045054166e-06, + "loss": 0.7792474, + "num_input_tokens_seen": 238148165, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11254883, + "step": 11035, + "time_per_iteration": 3.869877815246582 + }, + { + "auxiliary_loss_clip": 0.01122464, + "auxiliary_loss_mlp": 0.01025336, + "balance_loss_clip": 1.04485512, + "balance_loss_mlp": 1.0128727, + "epoch": 0.6635202164437096, + "flos": 18732873594240.0, + "grad_norm": 2.0608401974491133, + "language_loss": 0.82774472, + "learning_rate": 1.074521771867622e-06, + "loss": 0.84922272, + "num_input_tokens_seen": 238166360, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.12445068, + "step": 11036, + "time_per_iteration": 2.4153337478637695 + }, + { + "auxiliary_loss_clip": 0.0106222, + "auxiliary_loss_mlp": 0.01004042, + "balance_loss_clip": 1.03514671, + "balance_loss_mlp": 1.00219476, + "epoch": 0.6635803396963775, + "flos": 60222771227520.0, + "grad_norm": 0.7832375482634858, + "language_loss": 0.52281761, + "learning_rate": 1.0741765337753044e-06, + "loss": 0.54348022, + "num_input_tokens_seen": 238227630, + "router_z_loss_clip": 0.27099609, + "router_z_loss_mlp": 0.01846313, + "step": 11037, + "time_per_iteration": 3.067246437072754 + }, + { + "auxiliary_loss_clip": 0.01122209, + "auxiliary_loss_mlp": 0.01034721, + "balance_loss_clip": 1.04736602, + "balance_loss_mlp": 1.02230561, + "epoch": 0.6636404629490456, + "flos": 29167041611520.0, + "grad_norm": 1.6315994509739669, + "language_loss": 0.78933406, + "learning_rate": 1.0738313307903052e-06, + "loss": 0.81090331, + "num_input_tokens_seen": 238248435, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.12414551, + "step": 11038, + "time_per_iteration": 2.508265256881714 + }, + { + "auxiliary_loss_clip": 0.01123457, + "auxiliary_loss_mlp": 0.01035182, + "balance_loss_clip": 1.04868484, + "balance_loss_mlp": 1.02233136, + "epoch": 0.6637005862017135, + "flos": 38907702766080.0, + "grad_norm": 2.1815438996534042, + "language_loss": 0.64236677, + "learning_rate": 1.073486162925716e-06, + "loss": 0.66395313, + "num_input_tokens_seen": 238268755, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.1282959, + "step": 11039, + "time_per_iteration": 2.615319013595581 + }, + { + "auxiliary_loss_clip": 0.01121813, + "auxiliary_loss_mlp": 0.01025876, + "balance_loss_clip": 1.04613328, + "balance_loss_mlp": 1.01392567, + "epoch": 0.6637607094543815, + "flos": 22783345729920.0, + "grad_norm": 1.5974156186732402, + "language_loss": 0.63931596, + "learning_rate": 1.0731410301946237e-06, + "loss": 0.66079283, + "num_input_tokens_seen": 238290120, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.11956787, + "step": 11040, + "time_per_iteration": 2.4893670082092285 + }, + { + "auxiliary_loss_clip": 0.0110838, + "auxiliary_loss_mlp": 0.01031558, + "balance_loss_clip": 1.03675032, + "balance_loss_mlp": 1.01974487, + "epoch": 0.6638208327070495, + "flos": 18114096977280.0, + "grad_norm": 2.099576446173408, + "language_loss": 0.71806031, + "learning_rate": 1.0727959326101161e-06, + "loss": 0.73945969, + "num_input_tokens_seen": 238309290, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11828613, + "step": 11041, + "time_per_iteration": 2.4587745666503906 + }, + { + "auxiliary_loss_clip": 0.01112851, + "auxiliary_loss_mlp": 0.01036814, + "balance_loss_clip": 1.03992116, + "balance_loss_mlp": 1.02414799, + "epoch": 0.6638809559597174, + "flos": 29424880414080.0, + "grad_norm": 2.1409590934854688, + "language_loss": 0.61609721, + "learning_rate": 1.0724508701852806e-06, + "loss": 0.63759387, + "num_input_tokens_seen": 238327280, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.12658691, + "step": 11042, + "time_per_iteration": 2.5051989555358887 + }, + { + "auxiliary_loss_clip": 0.01114764, + "auxiliary_loss_mlp": 0.01029436, + "balance_loss_clip": 1.03809452, + "balance_loss_mlp": 1.01631689, + "epoch": 0.6639410792123854, + "flos": 28072699902720.0, + "grad_norm": 2.654095218958999, + "language_loss": 0.68498892, + "learning_rate": 1.0721058429331998e-06, + "loss": 0.70643091, + "num_input_tokens_seen": 238346330, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.13110352, + "step": 11043, + "time_per_iteration": 2.496673107147217 + }, + { + "auxiliary_loss_clip": 0.01111109, + "auxiliary_loss_mlp": 0.01033426, + "balance_loss_clip": 1.04239237, + "balance_loss_mlp": 1.02145195, + "epoch": 0.6640012024650533, + "flos": 25556367600000.0, + "grad_norm": 1.7934155261261846, + "language_loss": 0.84073853, + "learning_rate": 1.0717608508669587e-06, + "loss": 0.86218381, + "num_input_tokens_seen": 238364650, + "router_z_loss_clip": 0.68652344, + "router_z_loss_mlp": 0.11987305, + "step": 11044, + "time_per_iteration": 2.4883012771606445 + }, + { + "auxiliary_loss_clip": 0.01125444, + "auxiliary_loss_mlp": 0.01030399, + "balance_loss_clip": 1.04990053, + "balance_loss_mlp": 1.01780438, + "epoch": 0.6640613257177214, + "flos": 14866946559360.0, + "grad_norm": 2.123359062807428, + "language_loss": 0.6974349, + "learning_rate": 1.0714158939996392e-06, + "loss": 0.71899331, + "num_input_tokens_seen": 238381630, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12591553, + "step": 11045, + "time_per_iteration": 2.4318864345550537 + }, + { + "auxiliary_loss_clip": 0.01118777, + "auxiliary_loss_mlp": 0.0102644, + "balance_loss_clip": 1.0440855, + "balance_loss_mlp": 1.01444745, + "epoch": 0.6641214489703893, + "flos": 23221096778880.0, + "grad_norm": 1.436651899226787, + "language_loss": 0.64709061, + "learning_rate": 1.0710709723443235e-06, + "loss": 0.6685428, + "num_input_tokens_seen": 238402595, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11999512, + "step": 11046, + "time_per_iteration": 2.5069966316223145 + }, + { + "auxiliary_loss_clip": 0.0111483, + "auxiliary_loss_mlp": 0.01036472, + "balance_loss_clip": 1.0426532, + "balance_loss_mlp": 1.02210689, + "epoch": 0.6641815722230573, + "flos": 37742617221120.0, + "grad_norm": 1.8640796064950573, + "language_loss": 0.71315384, + "learning_rate": 1.070726085914088e-06, + "loss": 0.73466688, + "num_input_tokens_seen": 238426860, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.14361572, + "step": 11047, + "time_per_iteration": 2.6310322284698486 + }, + { + "auxiliary_loss_clip": 0.01120447, + "auxiliary_loss_mlp": 0.01033756, + "balance_loss_clip": 1.04622686, + "balance_loss_mlp": 1.0214777, + "epoch": 0.6642416954757252, + "flos": 17931132074880.0, + "grad_norm": 1.975464708222802, + "language_loss": 0.77078164, + "learning_rate": 1.0703812347220126e-06, + "loss": 0.79232365, + "num_input_tokens_seen": 238443990, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.1227417, + "step": 11048, + "time_per_iteration": 2.479484796524048 + }, + { + "auxiliary_loss_clip": 0.01058526, + "auxiliary_loss_mlp": 0.01003152, + "balance_loss_clip": 1.03183973, + "balance_loss_mlp": 1.00151134, + "epoch": 0.6643018187283932, + "flos": 51995384104320.0, + "grad_norm": 0.7520196307837378, + "language_loss": 0.55008036, + "learning_rate": 1.0700364187811745e-06, + "loss": 0.57069707, + "num_input_tokens_seen": 238503045, + "router_z_loss_clip": 0.26660156, + "router_z_loss_mlp": 0.0164032, + "step": 11049, + "time_per_iteration": 3.0757455825805664 + }, + { + "auxiliary_loss_clip": 0.01126111, + "auxiliary_loss_mlp": 0.01028484, + "balance_loss_clip": 1.05150199, + "balance_loss_mlp": 1.01685524, + "epoch": 0.6643619419810611, + "flos": 30226657847040.0, + "grad_norm": 1.7800163820989483, + "language_loss": 0.64803624, + "learning_rate": 1.069691638104648e-06, + "loss": 0.66958225, + "num_input_tokens_seen": 238527320, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11633301, + "step": 11050, + "time_per_iteration": 3.961261510848999 + }, + { + "auxiliary_loss_clip": 0.01122908, + "auxiliary_loss_mlp": 0.01023667, + "balance_loss_clip": 1.05093908, + "balance_loss_mlp": 1.01293778, + "epoch": 0.6644220652337292, + "flos": 22966131064320.0, + "grad_norm": 2.804807965293757, + "language_loss": 0.78932691, + "learning_rate": 1.0693468927055085e-06, + "loss": 0.81079268, + "num_input_tokens_seen": 238546030, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.10723877, + "step": 11051, + "time_per_iteration": 2.5308117866516113 + }, + { + "auxiliary_loss_clip": 0.01119575, + "auxiliary_loss_mlp": 0.01027856, + "balance_loss_clip": 1.04671407, + "balance_loss_mlp": 1.01617968, + "epoch": 0.6644821884863971, + "flos": 21142228216320.0, + "grad_norm": 2.239335612270901, + "language_loss": 0.85580432, + "learning_rate": 1.0690021825968276e-06, + "loss": 0.87727869, + "num_input_tokens_seen": 238564175, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11682129, + "step": 11052, + "time_per_iteration": 2.5191378593444824 + }, + { + "auxiliary_loss_clip": 0.01120956, + "auxiliary_loss_mlp": 0.0103582, + "balance_loss_clip": 1.04656816, + "balance_loss_mlp": 1.02308249, + "epoch": 0.6645423117390651, + "flos": 20192821885440.0, + "grad_norm": 2.3897405755391574, + "language_loss": 0.75156152, + "learning_rate": 1.0686575077916776e-06, + "loss": 0.77312928, + "num_input_tokens_seen": 238581010, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.12738037, + "step": 11053, + "time_per_iteration": 3.854961395263672 + }, + { + "auxiliary_loss_clip": 0.0112247, + "auxiliary_loss_mlp": 0.01026854, + "balance_loss_clip": 1.05168986, + "balance_loss_mlp": 1.01540375, + "epoch": 0.6646024349917331, + "flos": 24351959640960.0, + "grad_norm": 1.635139737707176, + "language_loss": 0.79778111, + "learning_rate": 1.0683128683031278e-06, + "loss": 0.81927431, + "num_input_tokens_seen": 238601365, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.11456299, + "step": 11054, + "time_per_iteration": 2.461519718170166 + }, + { + "auxiliary_loss_clip": 0.01117212, + "auxiliary_loss_mlp": 0.01030377, + "balance_loss_clip": 1.04608452, + "balance_loss_mlp": 1.01960087, + "epoch": 0.664662558244401, + "flos": 18806706000000.0, + "grad_norm": 1.5410295011786888, + "language_loss": 0.74453604, + "learning_rate": 1.0679682641442472e-06, + "loss": 0.76601189, + "num_input_tokens_seen": 238619850, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.10772705, + "step": 11055, + "time_per_iteration": 2.4355742931365967 + }, + { + "auxiliary_loss_clip": 0.01123289, + "auxiliary_loss_mlp": 0.01036655, + "balance_loss_clip": 1.04920387, + "balance_loss_mlp": 1.02376866, + "epoch": 0.664722681497069, + "flos": 18952790613120.0, + "grad_norm": 2.6947258339615026, + "language_loss": 0.72843802, + "learning_rate": 1.0676236953281042e-06, + "loss": 0.75003743, + "num_input_tokens_seen": 238637635, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.12890625, + "step": 11056, + "time_per_iteration": 2.4471535682678223 + }, + { + "auxiliary_loss_clip": 0.01116271, + "auxiliary_loss_mlp": 0.0102937, + "balance_loss_clip": 1.04530561, + "balance_loss_mlp": 1.01779509, + "epoch": 0.6647828047497369, + "flos": 19571279921280.0, + "grad_norm": 4.317233641449091, + "language_loss": 0.69686937, + "learning_rate": 1.0672791618677641e-06, + "loss": 0.71832579, + "num_input_tokens_seen": 238656200, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11578369, + "step": 11057, + "time_per_iteration": 2.43805193901062 + }, + { + "auxiliary_loss_clip": 0.01119614, + "auxiliary_loss_mlp": 0.01034057, + "balance_loss_clip": 1.04667604, + "balance_loss_mlp": 1.02159917, + "epoch": 0.664842928002405, + "flos": 23149455102720.0, + "grad_norm": 1.9443534752661218, + "language_loss": 0.80618745, + "learning_rate": 1.066934663776291e-06, + "loss": 0.82772416, + "num_input_tokens_seen": 238675005, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.12469482, + "step": 11058, + "time_per_iteration": 2.461503267288208 + }, + { + "auxiliary_loss_clip": 0.01074384, + "auxiliary_loss_mlp": 0.01007405, + "balance_loss_clip": 1.04935479, + "balance_loss_mlp": 1.00549734, + "epoch": 0.6649030512550729, + "flos": 65244913148160.0, + "grad_norm": 0.7941689637677124, + "language_loss": 0.62594157, + "learning_rate": 1.0665902010667496e-06, + "loss": 0.64675945, + "num_input_tokens_seen": 238731425, + "router_z_loss_clip": 0.25048828, + "router_z_loss_mlp": 0.01904297, + "step": 11059, + "time_per_iteration": 3.0032577514648438 + }, + { + "auxiliary_loss_clip": 0.0111642, + "auxiliary_loss_mlp": 0.0103308, + "balance_loss_clip": 1.04382479, + "balance_loss_mlp": 1.02210665, + "epoch": 0.6649631745077409, + "flos": 20194797133440.0, + "grad_norm": 1.3824187516806983, + "language_loss": 0.78622293, + "learning_rate": 1.0662457737522008e-06, + "loss": 0.80771792, + "num_input_tokens_seen": 238752020, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.10974121, + "step": 11060, + "time_per_iteration": 2.5607192516326904 + }, + { + "auxiliary_loss_clip": 0.01117541, + "auxiliary_loss_mlp": 0.01039912, + "balance_loss_clip": 1.04389882, + "balance_loss_mlp": 1.02582121, + "epoch": 0.6650232977604088, + "flos": 17238558965760.0, + "grad_norm": 1.9951952417829295, + "language_loss": 0.7888847, + "learning_rate": 1.0659013818457055e-06, + "loss": 0.81045926, + "num_input_tokens_seen": 238769665, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.14074707, + "step": 11061, + "time_per_iteration": 2.5077438354492188 + }, + { + "auxiliary_loss_clip": 0.0112062, + "auxiliary_loss_mlp": 0.01028662, + "balance_loss_clip": 1.04849339, + "balance_loss_mlp": 1.0177598, + "epoch": 0.6650834210130768, + "flos": 10006867825920.0, + "grad_norm": 2.2460105455685166, + "language_loss": 0.57165468, + "learning_rate": 1.0655570253603243e-06, + "loss": 0.59314752, + "num_input_tokens_seen": 238782180, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.10900879, + "step": 11062, + "time_per_iteration": 3.913095712661743 + }, + { + "auxiliary_loss_clip": 0.01122698, + "auxiliary_loss_mlp": 0.01030149, + "balance_loss_clip": 1.04635549, + "balance_loss_mlp": 1.01694703, + "epoch": 0.6651435442657447, + "flos": 10452088903680.0, + "grad_norm": 1.8333881525957716, + "language_loss": 0.75647998, + "learning_rate": 1.0652127043091144e-06, + "loss": 0.77800834, + "num_input_tokens_seen": 238800315, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.13201904, + "step": 11063, + "time_per_iteration": 2.410628080368042 + }, + { + "auxiliary_loss_clip": 0.01123634, + "auxiliary_loss_mlp": 0.01031432, + "balance_loss_clip": 1.05024076, + "balance_loss_mlp": 1.02024388, + "epoch": 0.6652036675184128, + "flos": 22344229964160.0, + "grad_norm": 1.3264962599102033, + "language_loss": 0.70535219, + "learning_rate": 1.0648684187051316e-06, + "loss": 0.72690284, + "num_input_tokens_seen": 238822250, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11187744, + "step": 11064, + "time_per_iteration": 2.4927377700805664 + }, + { + "auxiliary_loss_clip": 0.01063068, + "auxiliary_loss_mlp": 0.01006649, + "balance_loss_clip": 1.03706264, + "balance_loss_mlp": 1.00502026, + "epoch": 0.6652637907710807, + "flos": 52909633998720.0, + "grad_norm": 0.8539249264523221, + "language_loss": 0.62997729, + "learning_rate": 1.0645241685614322e-06, + "loss": 0.65067446, + "num_input_tokens_seen": 238877190, + "router_z_loss_clip": 0.26074219, + "router_z_loss_mlp": 0.01625061, + "step": 11065, + "time_per_iteration": 3.0059428215026855 + }, + { + "auxiliary_loss_clip": 0.01125727, + "auxiliary_loss_mlp": 0.01028875, + "balance_loss_clip": 1.050807, + "balance_loss_mlp": 1.01698422, + "epoch": 0.6653239140237487, + "flos": 23104637907840.0, + "grad_norm": 1.8498807223056215, + "language_loss": 0.62231076, + "learning_rate": 1.0641799538910708e-06, + "loss": 0.64385676, + "num_input_tokens_seen": 238896010, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.11883545, + "step": 11066, + "time_per_iteration": 2.477299213409424 + }, + { + "auxiliary_loss_clip": 0.01126273, + "auxiliary_loss_mlp": 0.01027631, + "balance_loss_clip": 1.05181348, + "balance_loss_mlp": 1.01478589, + "epoch": 0.6653840372764167, + "flos": 25959393175680.0, + "grad_norm": 1.491668052569566, + "language_loss": 0.70093298, + "learning_rate": 1.0638357747070985e-06, + "loss": 0.72247201, + "num_input_tokens_seen": 238918990, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.128479, + "step": 11067, + "time_per_iteration": 2.553569793701172 + }, + { + "auxiliary_loss_clip": 0.01054689, + "auxiliary_loss_mlp": 0.01004543, + "balance_loss_clip": 1.02773094, + "balance_loss_mlp": 1.0026896, + "epoch": 0.6654441605290846, + "flos": 66041985899520.0, + "grad_norm": 0.9132804773948598, + "language_loss": 0.72058272, + "learning_rate": 1.0634916310225684e-06, + "loss": 0.74117506, + "num_input_tokens_seen": 238975735, + "router_z_loss_clip": 0.27001953, + "router_z_loss_mlp": 0.01849365, + "step": 11068, + "time_per_iteration": 3.043422222137451 + }, + { + "auxiliary_loss_clip": 0.01056793, + "auxiliary_loss_mlp": 0.01002284, + "balance_loss_clip": 1.02996886, + "balance_loss_mlp": 1.00030768, + "epoch": 0.6655042837817526, + "flos": 65196112521600.0, + "grad_norm": 0.7069654643860376, + "language_loss": 0.57798678, + "learning_rate": 1.0631475228505285e-06, + "loss": 0.5985775, + "num_input_tokens_seen": 239042360, + "router_z_loss_clip": 0.26708984, + "router_z_loss_mlp": 0.01974487, + "step": 11069, + "time_per_iteration": 3.1846401691436768 + }, + { + "auxiliary_loss_clip": 0.0106202, + "auxiliary_loss_mlp": 0.01004776, + "balance_loss_clip": 1.03594959, + "balance_loss_mlp": 1.00304449, + "epoch": 0.6655644070344205, + "flos": 69008746752000.0, + "grad_norm": 0.7575879343169476, + "language_loss": 0.63572752, + "learning_rate": 1.062803450204029e-06, + "loss": 0.65639555, + "num_input_tokens_seen": 239109410, + "router_z_loss_clip": 0.26074219, + "router_z_loss_mlp": 0.01733398, + "step": 11070, + "time_per_iteration": 3.1441807746887207 + }, + { + "auxiliary_loss_clip": 0.01116478, + "auxiliary_loss_mlp": 0.01025202, + "balance_loss_clip": 1.04436934, + "balance_loss_mlp": 1.01376414, + "epoch": 0.6656245302870886, + "flos": 36315562809600.0, + "grad_norm": 1.7988344673930978, + "language_loss": 0.58836484, + "learning_rate": 1.062459413096116e-06, + "loss": 0.60978162, + "num_input_tokens_seen": 239135345, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11444092, + "step": 11071, + "time_per_iteration": 2.6245040893554688 + }, + { + "auxiliary_loss_clip": 0.01126331, + "auxiliary_loss_mlp": 0.01031919, + "balance_loss_clip": 1.05419409, + "balance_loss_mlp": 1.02061796, + "epoch": 0.6656846535397565, + "flos": 21794832466560.0, + "grad_norm": 1.8608176933228087, + "language_loss": 0.72571385, + "learning_rate": 1.0621154115398364e-06, + "loss": 0.74729633, + "num_input_tokens_seen": 239154340, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11303711, + "step": 11072, + "time_per_iteration": 2.4772236347198486 + }, + { + "auxiliary_loss_clip": 0.01123732, + "auxiliary_loss_mlp": 0.01033823, + "balance_loss_clip": 1.04792428, + "balance_loss_mlp": 1.02094841, + "epoch": 0.6657447767924245, + "flos": 37487615592960.0, + "grad_norm": 1.6506944604492662, + "language_loss": 0.70679784, + "learning_rate": 1.0617714455482353e-06, + "loss": 0.72837341, + "num_input_tokens_seen": 239177815, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12878418, + "step": 11073, + "time_per_iteration": 2.698094367980957 + }, + { + "auxiliary_loss_clip": 0.01123697, + "auxiliary_loss_mlp": 0.01034678, + "balance_loss_clip": 1.04711461, + "balance_loss_mlp": 1.02282894, + "epoch": 0.6658049000450924, + "flos": 16837688206080.0, + "grad_norm": 2.0442474396712105, + "language_loss": 0.55618382, + "learning_rate": 1.061427515134354e-06, + "loss": 0.57776761, + "num_input_tokens_seen": 239195735, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.11853027, + "step": 11074, + "time_per_iteration": 2.478400230407715 + }, + { + "auxiliary_loss_clip": 0.01119977, + "auxiliary_loss_mlp": 0.01027274, + "balance_loss_clip": 1.04821754, + "balance_loss_mlp": 1.01608634, + "epoch": 0.6658650232977604, + "flos": 33510975863040.0, + "grad_norm": 1.5500049737616046, + "language_loss": 0.72346032, + "learning_rate": 1.061083620311235e-06, + "loss": 0.74493283, + "num_input_tokens_seen": 239217535, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11181641, + "step": 11075, + "time_per_iteration": 2.560279607772827 + }, + { + "auxiliary_loss_clip": 0.01117701, + "auxiliary_loss_mlp": 0.01031484, + "balance_loss_clip": 1.04777646, + "balance_loss_mlp": 1.02056503, + "epoch": 0.6659251465504283, + "flos": 37706311549440.0, + "grad_norm": 1.4727498020024767, + "language_loss": 0.6607573, + "learning_rate": 1.0607397610919202e-06, + "loss": 0.68224919, + "num_input_tokens_seen": 239241975, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.10906982, + "step": 11076, + "time_per_iteration": 2.6037697792053223 + }, + { + "auxiliary_loss_clip": 0.01117465, + "auxiliary_loss_mlp": 0.01033889, + "balance_loss_clip": 1.04471397, + "balance_loss_mlp": 1.02175927, + "epoch": 0.6659852698030964, + "flos": 24893420232960.0, + "grad_norm": 1.7129660132655364, + "language_loss": 0.7530098, + "learning_rate": 1.0603959374894468e-06, + "loss": 0.77452326, + "num_input_tokens_seen": 239262025, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.12139893, + "step": 11077, + "time_per_iteration": 2.4823834896087646 + }, + { + "auxiliary_loss_clip": 0.01115718, + "auxiliary_loss_mlp": 0.01031278, + "balance_loss_clip": 1.04378915, + "balance_loss_mlp": 1.02007222, + "epoch": 0.6660453930557643, + "flos": 24352821567360.0, + "grad_norm": 1.9578909689616617, + "language_loss": 0.66433066, + "learning_rate": 1.0600521495168538e-06, + "loss": 0.68580055, + "num_input_tokens_seen": 239282775, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11212158, + "step": 11078, + "time_per_iteration": 2.500887393951416 + }, + { + "auxiliary_loss_clip": 0.01122784, + "auxiliary_loss_mlp": 0.0103426, + "balance_loss_clip": 1.04803813, + "balance_loss_mlp": 1.02152836, + "epoch": 0.6661055163084323, + "flos": 10597814380800.0, + "grad_norm": 3.200792712298242, + "language_loss": 0.69916111, + "learning_rate": 1.0597083971871783e-06, + "loss": 0.7207315, + "num_input_tokens_seen": 239299775, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.1272583, + "step": 11079, + "time_per_iteration": 3.9552247524261475 + }, + { + "auxiliary_loss_clip": 0.01119151, + "auxiliary_loss_mlp": 0.01023976, + "balance_loss_clip": 1.04756033, + "balance_loss_mlp": 1.01238275, + "epoch": 0.6661656395611003, + "flos": 24057491944320.0, + "grad_norm": 1.578096803417794, + "language_loss": 0.80249393, + "learning_rate": 1.0593646805134544e-06, + "loss": 0.8239252, + "num_input_tokens_seen": 239319660, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11590576, + "step": 11080, + "time_per_iteration": 2.526144504547119 + }, + { + "auxiliary_loss_clip": 0.01119291, + "auxiliary_loss_mlp": 0.01028903, + "balance_loss_clip": 1.0500226, + "balance_loss_mlp": 1.01838279, + "epoch": 0.6662257628137682, + "flos": 23036192542080.0, + "grad_norm": 2.0147093327136094, + "language_loss": 0.78059494, + "learning_rate": 1.0590209995087157e-06, + "loss": 0.80207688, + "num_input_tokens_seen": 239339215, + "router_z_loss_clip": 0.69384766, + "router_z_loss_mlp": 0.10516357, + "step": 11081, + "time_per_iteration": 2.515228509902954 + }, + { + "auxiliary_loss_clip": 0.01129802, + "auxiliary_loss_mlp": 0.01027867, + "balance_loss_clip": 1.05516732, + "balance_loss_mlp": 1.01544499, + "epoch": 0.6662858860664362, + "flos": 24754446512640.0, + "grad_norm": 1.8247061729768024, + "language_loss": 0.79797077, + "learning_rate": 1.0586773541859946e-06, + "loss": 0.81954741, + "num_input_tokens_seen": 239358545, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.12408447, + "step": 11082, + "time_per_iteration": 2.486316442489624 + }, + { + "auxiliary_loss_clip": 0.01123855, + "auxiliary_loss_mlp": 0.01033488, + "balance_loss_clip": 1.05068624, + "balance_loss_mlp": 1.02260458, + "epoch": 0.6663460093191041, + "flos": 20009066883840.0, + "grad_norm": 1.5082682909971032, + "language_loss": 0.83820385, + "learning_rate": 1.0583337445583234e-06, + "loss": 0.85977727, + "num_input_tokens_seen": 239376665, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.10882568, + "step": 11083, + "time_per_iteration": 2.5063533782958984 + }, + { + "auxiliary_loss_clip": 0.01125658, + "auxiliary_loss_mlp": 0.01035254, + "balance_loss_clip": 1.04987764, + "balance_loss_mlp": 1.02264214, + "epoch": 0.6664061325717722, + "flos": 17821389047040.0, + "grad_norm": 2.7016440999050304, + "language_loss": 0.85469651, + "learning_rate": 1.057990170638731e-06, + "loss": 0.8763057, + "num_input_tokens_seen": 239394345, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.1260376, + "step": 11084, + "time_per_iteration": 2.471095085144043 + }, + { + "auxiliary_loss_clip": 0.01121999, + "auxiliary_loss_mlp": 0.01031326, + "balance_loss_clip": 1.04710937, + "balance_loss_mlp": 1.01858258, + "epoch": 0.6664662558244401, + "flos": 18076893465600.0, + "grad_norm": 3.260404510960702, + "language_loss": 0.73509586, + "learning_rate": 1.0576466324402452e-06, + "loss": 0.75662911, + "num_input_tokens_seen": 239410605, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.12731934, + "step": 11085, + "time_per_iteration": 2.468018054962158 + }, + { + "auxiliary_loss_clip": 0.01119968, + "auxiliary_loss_mlp": 0.01029637, + "balance_loss_clip": 1.04790318, + "balance_loss_mlp": 1.01775134, + "epoch": 0.6665263790771081, + "flos": 21574197175680.0, + "grad_norm": 2.4627083780113224, + "language_loss": 0.80411232, + "learning_rate": 1.057303129975894e-06, + "loss": 0.82560831, + "num_input_tokens_seen": 239427155, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11877441, + "step": 11086, + "time_per_iteration": 2.477123737335205 + }, + { + "auxiliary_loss_clip": 0.01118556, + "auxiliary_loss_mlp": 0.01035956, + "balance_loss_clip": 1.04338586, + "balance_loss_mlp": 1.02343333, + "epoch": 0.666586502329776, + "flos": 24206629213440.0, + "grad_norm": 1.9134294215755752, + "language_loss": 0.74216509, + "learning_rate": 1.056959663258702e-06, + "loss": 0.76371026, + "num_input_tokens_seen": 239445510, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12530518, + "step": 11087, + "time_per_iteration": 2.4890894889831543 + }, + { + "auxiliary_loss_clip": 0.01123411, + "auxiliary_loss_mlp": 0.01026915, + "balance_loss_clip": 1.04863358, + "balance_loss_mlp": 1.01497638, + "epoch": 0.666646625582444, + "flos": 22200515648640.0, + "grad_norm": 1.6213949939298837, + "language_loss": 0.65107173, + "learning_rate": 1.0566162323016939e-06, + "loss": 0.672575, + "num_input_tokens_seen": 239464805, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.1192627, + "step": 11088, + "time_per_iteration": 2.524545431137085 + }, + { + "auxiliary_loss_clip": 0.01122527, + "auxiliary_loss_mlp": 0.01024869, + "balance_loss_clip": 1.04709542, + "balance_loss_mlp": 1.01287615, + "epoch": 0.6667067488351119, + "flos": 18259930195200.0, + "grad_norm": 2.438826109195905, + "language_loss": 0.64076543, + "learning_rate": 1.0562728371178928e-06, + "loss": 0.66223943, + "num_input_tokens_seen": 239483890, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.11999512, + "step": 11089, + "time_per_iteration": 2.4652726650238037 + }, + { + "auxiliary_loss_clip": 0.01119142, + "auxiliary_loss_mlp": 0.01026742, + "balance_loss_clip": 1.04490769, + "balance_loss_mlp": 1.0154531, + "epoch": 0.66676687208778, + "flos": 17236547804160.0, + "grad_norm": 2.4537867354672867, + "language_loss": 0.81227142, + "learning_rate": 1.0559294777203221e-06, + "loss": 0.83373022, + "num_input_tokens_seen": 239500080, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.112854, + "step": 11090, + "time_per_iteration": 2.4303982257843018 + }, + { + "auxiliary_loss_clip": 0.01125826, + "auxiliary_loss_mlp": 0.01032241, + "balance_loss_clip": 1.04888558, + "balance_loss_mlp": 1.01998591, + "epoch": 0.6668269953404479, + "flos": 19752197748480.0, + "grad_norm": 2.0387436763769733, + "language_loss": 0.77827585, + "learning_rate": 1.0555861541219984e-06, + "loss": 0.79985654, + "num_input_tokens_seen": 239517335, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.12255859, + "step": 11091, + "time_per_iteration": 2.4747812747955322 + }, + { + "auxiliary_loss_clip": 0.01119979, + "auxiliary_loss_mlp": 0.01030118, + "balance_loss_clip": 1.04638982, + "balance_loss_mlp": 1.01793504, + "epoch": 0.6668871185931159, + "flos": 20558428467840.0, + "grad_norm": 2.6694960085772057, + "language_loss": 0.79294872, + "learning_rate": 1.0552428663359425e-06, + "loss": 0.81444973, + "num_input_tokens_seen": 239536240, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.12176514, + "step": 11092, + "time_per_iteration": 2.514417886734009 + }, + { + "auxiliary_loss_clip": 0.01057877, + "auxiliary_loss_mlp": 0.01008446, + "balance_loss_clip": 1.03163886, + "balance_loss_mlp": 1.00659227, + "epoch": 0.6669472418457839, + "flos": 58088167735680.0, + "grad_norm": 0.7624823505439539, + "language_loss": 0.57655901, + "learning_rate": 1.0548996143751724e-06, + "loss": 0.59722221, + "num_input_tokens_seen": 239598000, + "router_z_loss_clip": 0.26220703, + "router_z_loss_mlp": 0.01852417, + "step": 11093, + "time_per_iteration": 3.131683349609375 + }, + { + "auxiliary_loss_clip": 0.01121745, + "auxiliary_loss_mlp": 0.01026922, + "balance_loss_clip": 1.04897642, + "balance_loss_mlp": 1.0155313, + "epoch": 0.6670073650984518, + "flos": 26065113880320.0, + "grad_norm": 2.725449118522058, + "language_loss": 0.76670927, + "learning_rate": 1.054556398252703e-06, + "loss": 0.78819591, + "num_input_tokens_seen": 239617650, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.1138916, + "step": 11094, + "time_per_iteration": 3.8567543029785156 + }, + { + "auxiliary_loss_clip": 0.01121407, + "auxiliary_loss_mlp": 0.0103447, + "balance_loss_clip": 1.04787552, + "balance_loss_mlp": 1.02142262, + "epoch": 0.6670674883511198, + "flos": 32416849635840.0, + "grad_norm": 1.923639006211382, + "language_loss": 0.7349031, + "learning_rate": 1.05421321798155e-06, + "loss": 0.75646186, + "num_input_tokens_seen": 239639825, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.13043213, + "step": 11095, + "time_per_iteration": 2.573577404022217 + }, + { + "auxiliary_loss_clip": 0.011208, + "auxiliary_loss_mlp": 0.01030612, + "balance_loss_clip": 1.04938579, + "balance_loss_mlp": 1.01892972, + "epoch": 0.6671276116037878, + "flos": 18037786533120.0, + "grad_norm": 2.151252442703262, + "language_loss": 0.73146594, + "learning_rate": 1.053870073574727e-06, + "loss": 0.75297999, + "num_input_tokens_seen": 239656300, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.11663818, + "step": 11096, + "time_per_iteration": 2.4788641929626465 + }, + { + "auxiliary_loss_clip": 0.01117721, + "auxiliary_loss_mlp": 0.01032765, + "balance_loss_clip": 1.04747403, + "balance_loss_mlp": 1.02126098, + "epoch": 0.6671877348564558, + "flos": 23767046570880.0, + "grad_norm": 2.4162249422982116, + "language_loss": 0.64344722, + "learning_rate": 1.0535269650452456e-06, + "loss": 0.6649521, + "num_input_tokens_seen": 239676655, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.11505127, + "step": 11097, + "time_per_iteration": 2.5434629917144775 + }, + { + "auxiliary_loss_clip": 0.01122225, + "auxiliary_loss_mlp": 0.01029645, + "balance_loss_clip": 1.04718351, + "balance_loss_mlp": 1.01790261, + "epoch": 0.6672478581091237, + "flos": 20918360701440.0, + "grad_norm": 1.9563161086396654, + "language_loss": 0.75437146, + "learning_rate": 1.0531838924061158e-06, + "loss": 0.77589011, + "num_input_tokens_seen": 239695430, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11743164, + "step": 11098, + "time_per_iteration": 3.914476156234741 + }, + { + "auxiliary_loss_clip": 0.01120179, + "auxiliary_loss_mlp": 0.01030315, + "balance_loss_clip": 1.04624724, + "balance_loss_mlp": 1.01931214, + "epoch": 0.6673079813617917, + "flos": 27855799626240.0, + "grad_norm": 1.6527254513769445, + "language_loss": 0.74303079, + "learning_rate": 1.0528408556703476e-06, + "loss": 0.76453573, + "num_input_tokens_seen": 239717070, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11010742, + "step": 11099, + "time_per_iteration": 2.5038182735443115 + }, + { + "auxiliary_loss_clip": 0.01113361, + "auxiliary_loss_mlp": 0.01030683, + "balance_loss_clip": 1.04335785, + "balance_loss_mlp": 1.01955485, + "epoch": 0.6673681046144596, + "flos": 21616859554560.0, + "grad_norm": 1.8763961569426166, + "language_loss": 0.78033853, + "learning_rate": 1.0524978548509502e-06, + "loss": 0.80177903, + "num_input_tokens_seen": 239737105, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.11132812, + "step": 11100, + "time_per_iteration": 2.472653388977051 + }, + { + "auxiliary_loss_clip": 0.01116083, + "auxiliary_loss_mlp": 0.0103171, + "balance_loss_clip": 1.04503369, + "balance_loss_mlp": 1.0203073, + "epoch": 0.6674282278671276, + "flos": 20889884194560.0, + "grad_norm": 1.7697440604943837, + "language_loss": 0.60082704, + "learning_rate": 1.0521548899609288e-06, + "loss": 0.62230498, + "num_input_tokens_seen": 239757835, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.11401367, + "step": 11101, + "time_per_iteration": 2.4779698848724365 + }, + { + "auxiliary_loss_clip": 0.01124308, + "auxiliary_loss_mlp": 0.01030975, + "balance_loss_clip": 1.04731154, + "balance_loss_mlp": 1.01787949, + "epoch": 0.6674883511197955, + "flos": 23624194181760.0, + "grad_norm": 2.287343564118854, + "language_loss": 0.71719146, + "learning_rate": 1.0518119610132884e-06, + "loss": 0.73874432, + "num_input_tokens_seen": 239775425, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.13092041, + "step": 11102, + "time_per_iteration": 2.469695806503296 + }, + { + "auxiliary_loss_clip": 0.01111679, + "auxiliary_loss_mlp": 0.01026664, + "balance_loss_clip": 1.03895509, + "balance_loss_mlp": 1.01502371, + "epoch": 0.6675484743724636, + "flos": 19609668581760.0, + "grad_norm": 1.8145178476580133, + "language_loss": 0.84275723, + "learning_rate": 1.051469068021034e-06, + "loss": 0.86414069, + "num_input_tokens_seen": 239794605, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11645508, + "step": 11103, + "time_per_iteration": 2.449069023132324 + }, + { + "auxiliary_loss_clip": 0.01115533, + "auxiliary_loss_mlp": 0.0103211, + "balance_loss_clip": 1.04055822, + "balance_loss_mlp": 1.01825833, + "epoch": 0.6676085976251315, + "flos": 14319452482560.0, + "grad_norm": 1.9895204618068578, + "language_loss": 0.78667796, + "learning_rate": 1.0511262109971668e-06, + "loss": 0.8081544, + "num_input_tokens_seen": 239812135, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.1385498, + "step": 11104, + "time_per_iteration": 2.4219322204589844 + }, + { + "auxiliary_loss_clip": 0.01125914, + "auxiliary_loss_mlp": 0.01026985, + "balance_loss_clip": 1.04920411, + "balance_loss_mlp": 1.01526642, + "epoch": 0.6676687208777995, + "flos": 38104596529920.0, + "grad_norm": 2.1117053030482023, + "language_loss": 0.58263671, + "learning_rate": 1.0507833899546889e-06, + "loss": 0.60416567, + "num_input_tokens_seen": 239835845, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.11712646, + "step": 11105, + "time_per_iteration": 2.5986688137054443 + }, + { + "auxiliary_loss_clip": 0.01127626, + "auxiliary_loss_mlp": 0.01036893, + "balance_loss_clip": 1.04802847, + "balance_loss_mlp": 1.02305281, + "epoch": 0.6677288441304675, + "flos": 23981576549760.0, + "grad_norm": 1.577444771400146, + "language_loss": 0.73184574, + "learning_rate": 1.0504406049066e-06, + "loss": 0.75349092, + "num_input_tokens_seen": 239853820, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.1383667, + "step": 11106, + "time_per_iteration": 3.8912250995635986 + }, + { + "auxiliary_loss_clip": 0.01117321, + "auxiliary_loss_mlp": 0.01045248, + "balance_loss_clip": 1.04230642, + "balance_loss_mlp": 1.03026319, + "epoch": 0.6677889673831354, + "flos": 24170682677760.0, + "grad_norm": 1.6156207001577052, + "language_loss": 0.77105868, + "learning_rate": 1.0500978558659e-06, + "loss": 0.79268432, + "num_input_tokens_seen": 239873365, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.14990234, + "step": 11107, + "time_per_iteration": 2.5807437896728516 + }, + { + "auxiliary_loss_clip": 0.01117387, + "auxiliary_loss_mlp": 0.01032349, + "balance_loss_clip": 1.04375541, + "balance_loss_mlp": 1.01995111, + "epoch": 0.6678490906358034, + "flos": 22309648145280.0, + "grad_norm": 2.437866211164505, + "language_loss": 0.89471114, + "learning_rate": 1.049755142845583e-06, + "loss": 0.91620851, + "num_input_tokens_seen": 239891215, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.1239624, + "step": 11108, + "time_per_iteration": 2.553948163986206 + }, + { + "auxiliary_loss_clip": 0.01115795, + "auxiliary_loss_mlp": 0.01022717, + "balance_loss_clip": 1.04403567, + "balance_loss_mlp": 1.01241732, + "epoch": 0.6679092138884714, + "flos": 36898752026880.0, + "grad_norm": 1.584538799653443, + "language_loss": 0.82541132, + "learning_rate": 1.049412465858646e-06, + "loss": 0.84679645, + "num_input_tokens_seen": 239913490, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.10296631, + "step": 11109, + "time_per_iteration": 2.6401307582855225 + }, + { + "auxiliary_loss_clip": 0.01118259, + "auxiliary_loss_mlp": 0.01030391, + "balance_loss_clip": 1.04469109, + "balance_loss_mlp": 1.0181303, + "epoch": 0.6679693371411394, + "flos": 18150294908160.0, + "grad_norm": 2.687153492372951, + "language_loss": 0.69147277, + "learning_rate": 1.0490698249180847e-06, + "loss": 0.71295929, + "num_input_tokens_seen": 239931565, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.1227417, + "step": 11110, + "time_per_iteration": 2.4633524417877197 + }, + { + "auxiliary_loss_clip": 0.01127077, + "auxiliary_loss_mlp": 0.0103251, + "balance_loss_clip": 1.04968452, + "balance_loss_mlp": 1.01955187, + "epoch": 0.6680294603938073, + "flos": 27198167472000.0, + "grad_norm": 1.516691974138931, + "language_loss": 0.73511899, + "learning_rate": 1.04872722003689e-06, + "loss": 0.75671482, + "num_input_tokens_seen": 239952395, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.12945557, + "step": 11111, + "time_per_iteration": 2.550358772277832 + }, + { + "auxiliary_loss_clip": 0.01120929, + "auxiliary_loss_mlp": 0.01028116, + "balance_loss_clip": 1.04740548, + "balance_loss_mlp": 1.01688635, + "epoch": 0.6680895836464753, + "flos": 21725309692800.0, + "grad_norm": 2.051274494145059, + "language_loss": 0.65347701, + "learning_rate": 1.0483846512280553e-06, + "loss": 0.67496741, + "num_input_tokens_seen": 239968910, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11242676, + "step": 11112, + "time_per_iteration": 2.503894567489624 + }, + { + "auxiliary_loss_clip": 0.01120607, + "auxiliary_loss_mlp": 0.01029137, + "balance_loss_clip": 1.04706645, + "balance_loss_mlp": 1.01715684, + "epoch": 0.6681497068991432, + "flos": 19646477043840.0, + "grad_norm": 2.03532219050861, + "language_loss": 0.63652718, + "learning_rate": 1.048042118504569e-06, + "loss": 0.65802461, + "num_input_tokens_seen": 239987680, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11987305, + "step": 11113, + "time_per_iteration": 2.508810520172119 + }, + { + "auxiliary_loss_clip": 0.0112031, + "auxiliary_loss_mlp": 0.01035555, + "balance_loss_clip": 1.04964662, + "balance_loss_mlp": 1.02402103, + "epoch": 0.6682098301518112, + "flos": 17419153570560.0, + "grad_norm": 1.9756398008086908, + "language_loss": 0.66013861, + "learning_rate": 1.047699621879422e-06, + "loss": 0.68169725, + "num_input_tokens_seen": 240005790, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.11523438, + "step": 11114, + "time_per_iteration": 2.5104761123657227 + }, + { + "auxiliary_loss_clip": 0.01114061, + "auxiliary_loss_mlp": 0.0103551, + "balance_loss_clip": 1.03972566, + "balance_loss_mlp": 1.02346396, + "epoch": 0.6682699534044791, + "flos": 22599016110720.0, + "grad_norm": 1.5322115829884475, + "language_loss": 0.78353161, + "learning_rate": 1.0473571613655998e-06, + "loss": 0.80502737, + "num_input_tokens_seen": 240025895, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.1204834, + "step": 11115, + "time_per_iteration": 2.5267956256866455 + }, + { + "auxiliary_loss_clip": 0.01117459, + "auxiliary_loss_mlp": 0.01027805, + "balance_loss_clip": 1.04263532, + "balance_loss_mlp": 1.01558638, + "epoch": 0.6683300766571472, + "flos": 24863686750080.0, + "grad_norm": 1.8758382492982153, + "language_loss": 0.80069882, + "learning_rate": 1.0470147369760896e-06, + "loss": 0.82215148, + "num_input_tokens_seen": 240044880, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12213135, + "step": 11116, + "time_per_iteration": 2.5173072814941406 + }, + { + "auxiliary_loss_clip": 0.01124302, + "auxiliary_loss_mlp": 0.01030651, + "balance_loss_clip": 1.04682517, + "balance_loss_mlp": 1.01712084, + "epoch": 0.6683901999098151, + "flos": 27126633536640.0, + "grad_norm": 2.373041657697375, + "language_loss": 0.79327756, + "learning_rate": 1.0466723487238768e-06, + "loss": 0.81482714, + "num_input_tokens_seen": 240065785, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.13525391, + "step": 11117, + "time_per_iteration": 2.507460594177246 + }, + { + "auxiliary_loss_clip": 0.01122644, + "auxiliary_loss_mlp": 0.01031887, + "balance_loss_clip": 1.04803991, + "balance_loss_mlp": 1.01811838, + "epoch": 0.6684503231624831, + "flos": 20739023072640.0, + "grad_norm": 2.1830666377752093, + "language_loss": 0.66004181, + "learning_rate": 1.0463299966219441e-06, + "loss": 0.6815871, + "num_input_tokens_seen": 240085130, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.13781738, + "step": 11118, + "time_per_iteration": 2.4866960048675537 + }, + { + "auxiliary_loss_clip": 0.01116845, + "auxiliary_loss_mlp": 0.01027528, + "balance_loss_clip": 1.04542017, + "balance_loss_mlp": 1.01647139, + "epoch": 0.668510446415151, + "flos": 21762189982080.0, + "grad_norm": 1.6281811478253647, + "language_loss": 0.69214213, + "learning_rate": 1.0459876806832727e-06, + "loss": 0.71358585, + "num_input_tokens_seen": 240105495, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.1105957, + "step": 11119, + "time_per_iteration": 2.4482431411743164 + }, + { + "auxiliary_loss_clip": 0.01116026, + "auxiliary_loss_mlp": 0.01030891, + "balance_loss_clip": 1.04200983, + "balance_loss_mlp": 1.01847565, + "epoch": 0.668570569667819, + "flos": 30191250015360.0, + "grad_norm": 1.6376669517044613, + "language_loss": 0.67691886, + "learning_rate": 1.0456454009208448e-06, + "loss": 0.6983881, + "num_input_tokens_seen": 240125455, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.12414551, + "step": 11120, + "time_per_iteration": 2.6509313583374023 + }, + { + "auxiliary_loss_clip": 0.01118088, + "auxiliary_loss_mlp": 0.01033766, + "balance_loss_clip": 1.04374576, + "balance_loss_mlp": 1.02097511, + "epoch": 0.668630692920487, + "flos": 24170646764160.0, + "grad_norm": 2.1210087625716856, + "language_loss": 0.71683109, + "learning_rate": 1.045303157347638e-06, + "loss": 0.73834956, + "num_input_tokens_seen": 240143870, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12768555, + "step": 11121, + "time_per_iteration": 2.4734458923339844 + }, + { + "auxiliary_loss_clip": 0.01117515, + "auxiliary_loss_mlp": 0.01033996, + "balance_loss_clip": 1.0421716, + "balance_loss_mlp": 1.02144933, + "epoch": 0.668690816173155, + "flos": 17457147181440.0, + "grad_norm": 3.0757842268561855, + "language_loss": 0.70329636, + "learning_rate": 1.0449609499766316e-06, + "loss": 0.72481143, + "num_input_tokens_seen": 240161020, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12530518, + "step": 11122, + "time_per_iteration": 2.4016594886779785 + }, + { + "auxiliary_loss_clip": 0.01112775, + "auxiliary_loss_mlp": 0.01037443, + "balance_loss_clip": 1.04053926, + "balance_loss_mlp": 1.02511692, + "epoch": 0.668750939425823, + "flos": 25005102595200.0, + "grad_norm": 1.8336942156541527, + "language_loss": 0.71836555, + "learning_rate": 1.0446187788208015e-06, + "loss": 0.73986769, + "num_input_tokens_seen": 240179820, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.12322998, + "step": 11123, + "time_per_iteration": 3.93544602394104 + }, + { + "auxiliary_loss_clip": 0.0112296, + "auxiliary_loss_mlp": 0.01036398, + "balance_loss_clip": 1.04670644, + "balance_loss_mlp": 1.02316558, + "epoch": 0.6688110626784909, + "flos": 24096778444800.0, + "grad_norm": 1.5513514513744497, + "language_loss": 0.79201603, + "learning_rate": 1.0442766438931244e-06, + "loss": 0.81360966, + "num_input_tokens_seen": 240200130, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.13244629, + "step": 11124, + "time_per_iteration": 2.468824863433838 + }, + { + "auxiliary_loss_clip": 0.01119571, + "auxiliary_loss_mlp": 0.01039908, + "balance_loss_clip": 1.04660201, + "balance_loss_mlp": 1.02631271, + "epoch": 0.6688711859311589, + "flos": 21759532375680.0, + "grad_norm": 1.9793855334672534, + "language_loss": 0.74471217, + "learning_rate": 1.0439345452065716e-06, + "loss": 0.766307, + "num_input_tokens_seen": 240217945, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.13604736, + "step": 11125, + "time_per_iteration": 2.4497604370117188 + }, + { + "auxiliary_loss_clip": 0.01125452, + "auxiliary_loss_mlp": 0.01035994, + "balance_loss_clip": 1.05037618, + "balance_loss_mlp": 1.02326822, + "epoch": 0.6689313091838268, + "flos": 22929645824640.0, + "grad_norm": 2.1587831944915203, + "language_loss": 0.66205472, + "learning_rate": 1.043592482774116e-06, + "loss": 0.68366915, + "num_input_tokens_seen": 240237220, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12719727, + "step": 11126, + "time_per_iteration": 2.476280450820923 + }, + { + "auxiliary_loss_clip": 0.01118063, + "auxiliary_loss_mlp": 0.01028607, + "balance_loss_clip": 1.04471302, + "balance_loss_mlp": 1.01694226, + "epoch": 0.6689914324364948, + "flos": 20886149180160.0, + "grad_norm": 12.410602274055986, + "language_loss": 0.70987391, + "learning_rate": 1.0432504566087305e-06, + "loss": 0.73134065, + "num_input_tokens_seen": 240256000, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11657715, + "step": 11127, + "time_per_iteration": 2.4743356704711914 + }, + { + "auxiliary_loss_clip": 0.01119921, + "auxiliary_loss_mlp": 0.01036098, + "balance_loss_clip": 1.04069281, + "balance_loss_mlp": 1.02090454, + "epoch": 0.6690515556891627, + "flos": 22748225207040.0, + "grad_norm": 2.0072031503479706, + "language_loss": 0.80258656, + "learning_rate": 1.0429084667233827e-06, + "loss": 0.82414675, + "num_input_tokens_seen": 240275845, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.15197754, + "step": 11128, + "time_per_iteration": 2.4661121368408203 + }, + { + "auxiliary_loss_clip": 0.01124788, + "auxiliary_loss_mlp": 0.01032843, + "balance_loss_clip": 1.04723465, + "balance_loss_mlp": 1.01958656, + "epoch": 0.6691116789418308, + "flos": 23331450337920.0, + "grad_norm": 1.929641946364735, + "language_loss": 0.81067139, + "learning_rate": 1.0425665131310427e-06, + "loss": 0.83224761, + "num_input_tokens_seen": 240294095, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.13244629, + "step": 11129, + "time_per_iteration": 2.4757986068725586 + }, + { + "auxiliary_loss_clip": 0.01113258, + "auxiliary_loss_mlp": 0.01037099, + "balance_loss_clip": 1.04202223, + "balance_loss_mlp": 1.02443898, + "epoch": 0.6691718021944987, + "flos": 32447014081920.0, + "grad_norm": 1.8033965913399532, + "language_loss": 0.70666945, + "learning_rate": 1.0422245958446762e-06, + "loss": 0.72817302, + "num_input_tokens_seen": 240313460, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.12664795, + "step": 11130, + "time_per_iteration": 2.5432682037353516 + }, + { + "auxiliary_loss_clip": 0.01116633, + "auxiliary_loss_mlp": 0.01032121, + "balance_loss_clip": 1.04576957, + "balance_loss_mlp": 1.0207839, + "epoch": 0.6692319254471667, + "flos": 23731602825600.0, + "grad_norm": 1.7980406865593954, + "language_loss": 0.70310819, + "learning_rate": 1.0418827148772486e-06, + "loss": 0.72459573, + "num_input_tokens_seen": 240333540, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.11346436, + "step": 11131, + "time_per_iteration": 2.5340464115142822 + }, + { + "auxiliary_loss_clip": 0.01121089, + "auxiliary_loss_mlp": 0.01031046, + "balance_loss_clip": 1.04534638, + "balance_loss_mlp": 1.01748037, + "epoch": 0.6692920486998346, + "flos": 14427902620800.0, + "grad_norm": 3.040667841444949, + "language_loss": 0.65521598, + "learning_rate": 1.0415408702417243e-06, + "loss": 0.67673725, + "num_input_tokens_seen": 240350085, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.13562012, + "step": 11132, + "time_per_iteration": 2.484495162963867 + }, + { + "auxiliary_loss_clip": 0.01123335, + "auxiliary_loss_mlp": 0.01036432, + "balance_loss_clip": 1.04392862, + "balance_loss_mlp": 1.0225625, + "epoch": 0.6693521719525026, + "flos": 21507475662720.0, + "grad_norm": 1.6412231986656673, + "language_loss": 0.74525928, + "learning_rate": 1.0411990619510661e-06, + "loss": 0.76685691, + "num_input_tokens_seen": 240370015, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.13879395, + "step": 11133, + "time_per_iteration": 2.4549779891967773 + }, + { + "auxiliary_loss_clip": 0.01132158, + "auxiliary_loss_mlp": 0.01033507, + "balance_loss_clip": 1.05394626, + "balance_loss_mlp": 1.01989913, + "epoch": 0.6694122952051706, + "flos": 25406943022080.0, + "grad_norm": 2.2637731183170913, + "language_loss": 0.66647053, + "learning_rate": 1.0408572900182363e-06, + "loss": 0.68812716, + "num_input_tokens_seen": 240390770, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.1361084, + "step": 11134, + "time_per_iteration": 2.528869867324829 + }, + { + "auxiliary_loss_clip": 0.01134163, + "auxiliary_loss_mlp": 0.01035695, + "balance_loss_clip": 1.05217957, + "balance_loss_mlp": 1.02172959, + "epoch": 0.6694724184578386, + "flos": 25661729168640.0, + "grad_norm": 1.8314208575752464, + "language_loss": 0.77279234, + "learning_rate": 1.0405155544561943e-06, + "loss": 0.79449087, + "num_input_tokens_seen": 240409590, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.13970947, + "step": 11135, + "time_per_iteration": 2.549863576889038 + }, + { + "auxiliary_loss_clip": 0.01118399, + "auxiliary_loss_mlp": 0.01030665, + "balance_loss_clip": 1.04606295, + "balance_loss_mlp": 1.01874447, + "epoch": 0.6695325417105066, + "flos": 17709311635200.0, + "grad_norm": 1.7703349560486852, + "language_loss": 0.7394222, + "learning_rate": 1.040173855277898e-06, + "loss": 0.7609129, + "num_input_tokens_seen": 240428180, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11932373, + "step": 11136, + "time_per_iteration": 2.4509565830230713 + }, + { + "auxiliary_loss_clip": 0.0112746, + "auxiliary_loss_mlp": 0.01032899, + "balance_loss_clip": 1.0495137, + "balance_loss_mlp": 1.01939213, + "epoch": 0.6695926649631745, + "flos": 24460050643200.0, + "grad_norm": 1.6522065155648478, + "language_loss": 0.6218667, + "learning_rate": 1.0398321924963061e-06, + "loss": 0.64347023, + "num_input_tokens_seen": 240447815, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.13494873, + "step": 11137, + "time_per_iteration": 2.532008409500122 + }, + { + "auxiliary_loss_clip": 0.0112143, + "auxiliary_loss_mlp": 0.01029851, + "balance_loss_clip": 1.04596043, + "balance_loss_mlp": 1.01708317, + "epoch": 0.6696527882158425, + "flos": 24280138396800.0, + "grad_norm": 1.6035580645323686, + "language_loss": 0.65500522, + "learning_rate": 1.0394905661243724e-06, + "loss": 0.67651796, + "num_input_tokens_seen": 240468635, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12756348, + "step": 11138, + "time_per_iteration": 3.898951768875122 + }, + { + "auxiliary_loss_clip": 0.01109338, + "auxiliary_loss_mlp": 0.01033849, + "balance_loss_clip": 1.03890836, + "balance_loss_mlp": 1.02087355, + "epoch": 0.6697129114685104, + "flos": 23002759958400.0, + "grad_norm": 1.7674548363888944, + "language_loss": 0.72760409, + "learning_rate": 1.039148976175053e-06, + "loss": 0.74903595, + "num_input_tokens_seen": 240488550, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.12982178, + "step": 11139, + "time_per_iteration": 2.473348617553711 + }, + { + "auxiliary_loss_clip": 0.01109308, + "auxiliary_loss_mlp": 0.01031604, + "balance_loss_clip": 1.03961515, + "balance_loss_mlp": 1.02080417, + "epoch": 0.6697730347211784, + "flos": 22638123043200.0, + "grad_norm": 2.9515404947108457, + "language_loss": 0.70909011, + "learning_rate": 1.0388074226613016e-06, + "loss": 0.73049927, + "num_input_tokens_seen": 240508330, + "router_z_loss_clip": 0.69775391, + "router_z_loss_mlp": 0.10803223, + "step": 11140, + "time_per_iteration": 2.51669979095459 + }, + { + "auxiliary_loss_clip": 0.01125893, + "auxiliary_loss_mlp": 0.0102651, + "balance_loss_clip": 1.04724765, + "balance_loss_mlp": 1.01346254, + "epoch": 0.6698331579738463, + "flos": 28877242682880.0, + "grad_norm": 1.7594382004137212, + "language_loss": 0.75578171, + "learning_rate": 1.0384659055960691e-06, + "loss": 0.77730578, + "num_input_tokens_seen": 240528470, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.13067627, + "step": 11141, + "time_per_iteration": 2.5176808834075928 + }, + { + "auxiliary_loss_clip": 0.01115076, + "auxiliary_loss_mlp": 0.01031663, + "balance_loss_clip": 1.04172969, + "balance_loss_mlp": 1.01901519, + "epoch": 0.6698932812265144, + "flos": 24207096090240.0, + "grad_norm": 1.7089097136351172, + "language_loss": 0.81978679, + "learning_rate": 1.0381244249923052e-06, + "loss": 0.84125417, + "num_input_tokens_seen": 240547815, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.12640381, + "step": 11142, + "time_per_iteration": 3.87064528465271 + }, + { + "auxiliary_loss_clip": 0.01114258, + "auxiliary_loss_mlp": 0.01030903, + "balance_loss_clip": 1.0424161, + "balance_loss_mlp": 1.01725912, + "epoch": 0.6699534044791823, + "flos": 22090269830400.0, + "grad_norm": 1.8189451992059535, + "language_loss": 0.70010018, + "learning_rate": 1.037782980862959e-06, + "loss": 0.72155178, + "num_input_tokens_seen": 240567765, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.13659668, + "step": 11143, + "time_per_iteration": 2.5171425342559814 + }, + { + "auxiliary_loss_clip": 0.01121807, + "auxiliary_loss_mlp": 0.01030558, + "balance_loss_clip": 1.05143476, + "balance_loss_mlp": 1.01929307, + "epoch": 0.6700135277318503, + "flos": 25192377129600.0, + "grad_norm": 1.4910755885586984, + "language_loss": 0.69982743, + "learning_rate": 1.0374415732209796e-06, + "loss": 0.72135109, + "num_input_tokens_seen": 240590750, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.11260986, + "step": 11144, + "time_per_iteration": 2.490687608718872 + }, + { + "auxiliary_loss_clip": 0.01116629, + "auxiliary_loss_mlp": 0.0103049, + "balance_loss_clip": 1.0447011, + "balance_loss_mlp": 1.01676929, + "epoch": 0.6700736509845182, + "flos": 23440187784960.0, + "grad_norm": 1.7356897476870612, + "language_loss": 0.74309915, + "learning_rate": 1.0371002020793114e-06, + "loss": 0.76457036, + "num_input_tokens_seen": 240608875, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.13720703, + "step": 11145, + "time_per_iteration": 2.4551568031311035 + }, + { + "auxiliary_loss_clip": 0.01116987, + "auxiliary_loss_mlp": 0.01030494, + "balance_loss_clip": 1.04119945, + "balance_loss_mlp": 1.0177927, + "epoch": 0.6701337742371862, + "flos": 24389953251840.0, + "grad_norm": 1.5183935510726332, + "language_loss": 0.70752072, + "learning_rate": 1.0367588674509008e-06, + "loss": 0.7289955, + "num_input_tokens_seen": 240628565, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12701416, + "step": 11146, + "time_per_iteration": 2.4850900173187256 + }, + { + "auxiliary_loss_clip": 0.01115433, + "auxiliary_loss_mlp": 0.01030085, + "balance_loss_clip": 1.04308486, + "balance_loss_mlp": 1.01758564, + "epoch": 0.6701938974898543, + "flos": 14793652857600.0, + "grad_norm": 2.6219596286528866, + "language_loss": 0.78344119, + "learning_rate": 1.0364175693486905e-06, + "loss": 0.80489635, + "num_input_tokens_seen": 240646325, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.125, + "step": 11147, + "time_per_iteration": 2.422419309616089 + }, + { + "auxiliary_loss_clip": 0.01121406, + "auxiliary_loss_mlp": 0.01038288, + "balance_loss_clip": 1.04745793, + "balance_loss_mlp": 1.02528834, + "epoch": 0.6702540207425222, + "flos": 20154002261760.0, + "grad_norm": 1.9972736162484668, + "language_loss": 0.7006048, + "learning_rate": 1.0360763077856218e-06, + "loss": 0.7222017, + "num_input_tokens_seen": 240666145, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.13006592, + "step": 11148, + "time_per_iteration": 2.4848411083221436 + }, + { + "auxiliary_loss_clip": 0.01121699, + "auxiliary_loss_mlp": 0.01031773, + "balance_loss_clip": 1.04466152, + "balance_loss_mlp": 1.01987565, + "epoch": 0.6703141439951902, + "flos": 21214157201280.0, + "grad_norm": 1.7326104314772168, + "language_loss": 0.69996119, + "learning_rate": 1.035735082774636e-06, + "loss": 0.72149599, + "num_input_tokens_seen": 240685570, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.11895752, + "step": 11149, + "time_per_iteration": 2.4682719707489014 + }, + { + "auxiliary_loss_clip": 0.01122696, + "auxiliary_loss_mlp": 0.01027947, + "balance_loss_clip": 1.04543352, + "balance_loss_mlp": 1.01627004, + "epoch": 0.6703742672478581, + "flos": 23112538899840.0, + "grad_norm": 1.9248648136675064, + "language_loss": 0.73564255, + "learning_rate": 1.0353938943286727e-06, + "loss": 0.75714898, + "num_input_tokens_seen": 240706945, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.11676025, + "step": 11150, + "time_per_iteration": 3.9699976444244385 + }, + { + "auxiliary_loss_clip": 0.01131567, + "auxiliary_loss_mlp": 0.01029857, + "balance_loss_clip": 1.05313456, + "balance_loss_mlp": 1.01701212, + "epoch": 0.6704343905005261, + "flos": 22528918719360.0, + "grad_norm": 4.014958310406244, + "language_loss": 0.78222406, + "learning_rate": 1.035052742460671e-06, + "loss": 0.80383825, + "num_input_tokens_seen": 240727990, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.12835693, + "step": 11151, + "time_per_iteration": 2.551133155822754 + }, + { + "auxiliary_loss_clip": 0.0108705, + "auxiliary_loss_mlp": 0.01005055, + "balance_loss_clip": 1.06165075, + "balance_loss_mlp": 1.00345194, + "epoch": 0.670494513753194, + "flos": 64793158773120.0, + "grad_norm": 0.7970868210261006, + "language_loss": 0.55499995, + "learning_rate": 1.0347116271835643e-06, + "loss": 0.57592094, + "num_input_tokens_seen": 240790380, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.01602173, + "step": 11152, + "time_per_iteration": 3.180732488632202 + }, + { + "auxiliary_loss_clip": 0.01125405, + "auxiliary_loss_mlp": 0.01031829, + "balance_loss_clip": 1.05064368, + "balance_loss_mlp": 1.01903224, + "epoch": 0.670554637005862, + "flos": 23511506238720.0, + "grad_norm": 1.7075877904618437, + "language_loss": 0.80981457, + "learning_rate": 1.0343705485102896e-06, + "loss": 0.83138692, + "num_input_tokens_seen": 240811545, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.12811279, + "step": 11153, + "time_per_iteration": 2.551485300064087 + }, + { + "auxiliary_loss_clip": 0.01118407, + "auxiliary_loss_mlp": 0.01027504, + "balance_loss_clip": 1.04305339, + "balance_loss_mlp": 1.01595294, + "epoch": 0.67061476025853, + "flos": 19463404400640.0, + "grad_norm": 1.7920260759575457, + "language_loss": 0.76050949, + "learning_rate": 1.0340295064537814e-06, + "loss": 0.78196865, + "num_input_tokens_seen": 240831380, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.11547852, + "step": 11154, + "time_per_iteration": 2.557724952697754 + }, + { + "auxiliary_loss_clip": 0.01135354, + "auxiliary_loss_mlp": 0.01038076, + "balance_loss_clip": 1.05506384, + "balance_loss_mlp": 1.02512944, + "epoch": 0.670674883511198, + "flos": 20519967980160.0, + "grad_norm": 1.5357708604795648, + "language_loss": 0.76279306, + "learning_rate": 1.0336885010269702e-06, + "loss": 0.7845273, + "num_input_tokens_seen": 240851855, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.12945557, + "step": 11155, + "time_per_iteration": 2.5206515789031982 + }, + { + "auxiliary_loss_clip": 0.01115988, + "auxiliary_loss_mlp": 0.01033609, + "balance_loss_clip": 1.0418849, + "balance_loss_mlp": 1.02133667, + "epoch": 0.6707350067638659, + "flos": 25483971738240.0, + "grad_norm": 2.211256159601482, + "language_loss": 0.81904173, + "learning_rate": 1.0333475322427878e-06, + "loss": 0.84053779, + "num_input_tokens_seen": 240869980, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.12268066, + "step": 11156, + "time_per_iteration": 2.5011777877807617 + }, + { + "auxiliary_loss_clip": 0.01122754, + "auxiliary_loss_mlp": 0.01028477, + "balance_loss_clip": 1.0471375, + "balance_loss_mlp": 1.01694965, + "epoch": 0.6707951300165339, + "flos": 22273450214400.0, + "grad_norm": 2.4175815232064064, + "language_loss": 0.74800563, + "learning_rate": 1.033006600114165e-06, + "loss": 0.76951796, + "num_input_tokens_seen": 240888680, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.11535645, + "step": 11157, + "time_per_iteration": 2.531909942626953 + }, + { + "auxiliary_loss_clip": 0.01117493, + "auxiliary_loss_mlp": 0.01032162, + "balance_loss_clip": 1.04147935, + "balance_loss_mlp": 1.01961505, + "epoch": 0.6708552532692018, + "flos": 23984593292160.0, + "grad_norm": 1.9495219959162102, + "language_loss": 0.74323332, + "learning_rate": 1.0326657046540282e-06, + "loss": 0.76472986, + "num_input_tokens_seen": 240909050, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.12548828, + "step": 11158, + "time_per_iteration": 2.63043212890625 + }, + { + "auxiliary_loss_clip": 0.01122057, + "auxiliary_loss_mlp": 0.01033701, + "balance_loss_clip": 1.04605412, + "balance_loss_mlp": 1.02082074, + "epoch": 0.6709153765218698, + "flos": 24937519155840.0, + "grad_norm": 1.6920159012441527, + "language_loss": 0.8171578, + "learning_rate": 1.0323248458753044e-06, + "loss": 0.83871537, + "num_input_tokens_seen": 240930035, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.12884521, + "step": 11159, + "time_per_iteration": 2.6712915897369385 + }, + { + "auxiliary_loss_clip": 0.01119166, + "auxiliary_loss_mlp": 0.01028445, + "balance_loss_clip": 1.043365, + "balance_loss_mlp": 1.01634562, + "epoch": 0.6709754997745379, + "flos": 17530225401600.0, + "grad_norm": 1.7086698687610296, + "language_loss": 0.7703765, + "learning_rate": 1.0319840237909193e-06, + "loss": 0.79185259, + "num_input_tokens_seen": 240948895, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.12115479, + "step": 11160, + "time_per_iteration": 2.4412951469421387 + }, + { + "auxiliary_loss_clip": 0.01121044, + "auxiliary_loss_mlp": 0.01025411, + "balance_loss_clip": 1.04702067, + "balance_loss_mlp": 1.01325202, + "epoch": 0.6710356230272058, + "flos": 22090880361600.0, + "grad_norm": 2.049122401100364, + "language_loss": 0.73373055, + "learning_rate": 1.0316432384137978e-06, + "loss": 0.75519502, + "num_input_tokens_seen": 240967770, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12158203, + "step": 11161, + "time_per_iteration": 2.5131828784942627 + }, + { + "auxiliary_loss_clip": 0.01124661, + "auxiliary_loss_mlp": 0.01040006, + "balance_loss_clip": 1.04578173, + "balance_loss_mlp": 1.02606463, + "epoch": 0.6710957462798738, + "flos": 24206449645440.0, + "grad_norm": 2.3994689585374793, + "language_loss": 0.68480051, + "learning_rate": 1.0313024897568618e-06, + "loss": 0.70644724, + "num_input_tokens_seen": 240988985, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.13928223, + "step": 11162, + "time_per_iteration": 2.523674249649048 + }, + { + "auxiliary_loss_clip": 0.01113835, + "auxiliary_loss_mlp": 0.01033369, + "balance_loss_clip": 1.04224205, + "balance_loss_mlp": 1.02194929, + "epoch": 0.6711558695325417, + "flos": 19093955063040.0, + "grad_norm": 1.7965280048468961, + "language_loss": 0.7028386, + "learning_rate": 1.030961777833032e-06, + "loss": 0.72431064, + "num_input_tokens_seen": 241005455, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11425781, + "step": 11163, + "time_per_iteration": 2.527665376663208 + }, + { + "auxiliary_loss_clip": 0.01120746, + "auxiliary_loss_mlp": 0.01029984, + "balance_loss_clip": 1.04805374, + "balance_loss_mlp": 1.0184505, + "epoch": 0.6712159927852097, + "flos": 25557875971200.0, + "grad_norm": 1.6182532978144635, + "language_loss": 0.75509059, + "learning_rate": 1.0306211026552291e-06, + "loss": 0.77659792, + "num_input_tokens_seen": 241026175, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11523438, + "step": 11164, + "time_per_iteration": 2.463381290435791 + }, + { + "auxiliary_loss_clip": 0.0111771, + "auxiliary_loss_mlp": 0.01027463, + "balance_loss_clip": 1.04385948, + "balance_loss_mlp": 1.0151366, + "epoch": 0.6712761160378776, + "flos": 22228812587520.0, + "grad_norm": 1.93082835236987, + "language_loss": 0.65437567, + "learning_rate": 1.0302804642363704e-06, + "loss": 0.67582738, + "num_input_tokens_seen": 241044040, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.12316895, + "step": 11165, + "time_per_iteration": 2.4700303077697754 + }, + { + "auxiliary_loss_clip": 0.01117351, + "auxiliary_loss_mlp": 0.01030358, + "balance_loss_clip": 1.04530597, + "balance_loss_mlp": 1.01725686, + "epoch": 0.6713362392905456, + "flos": 22455517276800.0, + "grad_norm": 2.1159383763426654, + "language_loss": 0.71848613, + "learning_rate": 1.0299398625893738e-06, + "loss": 0.73996317, + "num_input_tokens_seen": 241063615, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.13098145, + "step": 11166, + "time_per_iteration": 3.944641351699829 + }, + { + "auxiliary_loss_clip": 0.01117764, + "auxiliary_loss_mlp": 0.01027348, + "balance_loss_clip": 1.04692018, + "balance_loss_mlp": 1.01672029, + "epoch": 0.6713963625432136, + "flos": 25630200005760.0, + "grad_norm": 2.2173161155970096, + "language_loss": 0.77503568, + "learning_rate": 1.0295992977271546e-06, + "loss": 0.79648679, + "num_input_tokens_seen": 241082520, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.10620117, + "step": 11167, + "time_per_iteration": 2.5265305042266846 + }, + { + "auxiliary_loss_clip": 0.01119521, + "auxiliary_loss_mlp": 0.01033029, + "balance_loss_clip": 1.04638898, + "balance_loss_mlp": 1.02122688, + "epoch": 0.6714564857958816, + "flos": 35006475640320.0, + "grad_norm": 1.7278460680253278, + "language_loss": 0.68797261, + "learning_rate": 1.029258769662629e-06, + "loss": 0.70949817, + "num_input_tokens_seen": 241103505, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11804199, + "step": 11168, + "time_per_iteration": 2.5842862129211426 + }, + { + "auxiliary_loss_clip": 0.01119394, + "auxiliary_loss_mlp": 0.01036249, + "balance_loss_clip": 1.04251611, + "balance_loss_mlp": 1.0231241, + "epoch": 0.6715166090485495, + "flos": 26279931168000.0, + "grad_norm": 3.36366932284073, + "language_loss": 0.73306721, + "learning_rate": 1.0289182784087068e-06, + "loss": 0.75462359, + "num_input_tokens_seen": 241122885, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.13128662, + "step": 11169, + "time_per_iteration": 2.4834818840026855 + }, + { + "auxiliary_loss_clip": 0.0111614, + "auxiliary_loss_mlp": 0.01032099, + "balance_loss_clip": 1.04176903, + "balance_loss_mlp": 1.01845527, + "epoch": 0.6715767323012175, + "flos": 15924156583680.0, + "grad_norm": 2.8093764893901803, + "language_loss": 0.76411301, + "learning_rate": 1.0285778239783005e-06, + "loss": 0.78559542, + "num_input_tokens_seen": 241140865, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.13641357, + "step": 11170, + "time_per_iteration": 2.4478437900543213 + }, + { + "auxiliary_loss_clip": 0.0112048, + "auxiliary_loss_mlp": 0.01027173, + "balance_loss_clip": 1.04360437, + "balance_loss_mlp": 1.01425028, + "epoch": 0.6716368555538854, + "flos": 17491441691520.0, + "grad_norm": 2.7209667177479955, + "language_loss": 0.74736369, + "learning_rate": 1.0282374063843212e-06, + "loss": 0.76884025, + "num_input_tokens_seen": 241158225, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.12921143, + "step": 11171, + "time_per_iteration": 2.450561285018921 + }, + { + "auxiliary_loss_clip": 0.01121935, + "auxiliary_loss_mlp": 0.0103449, + "balance_loss_clip": 1.04750276, + "balance_loss_mlp": 1.02212203, + "epoch": 0.6716969788065534, + "flos": 16761521416320.0, + "grad_norm": 1.5028264815180585, + "language_loss": 0.86620098, + "learning_rate": 1.0278970256396762e-06, + "loss": 0.88776523, + "num_input_tokens_seen": 241175215, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.12365723, + "step": 11172, + "time_per_iteration": 2.419218063354492 + }, + { + "auxiliary_loss_clip": 0.01123928, + "auxiliary_loss_mlp": 0.0103027, + "balance_loss_clip": 1.04919517, + "balance_loss_mlp": 1.01805067, + "epoch": 0.6717571020592215, + "flos": 22709800632960.0, + "grad_norm": 1.6280771839607118, + "language_loss": 0.63630342, + "learning_rate": 1.0275566817572733e-06, + "loss": 0.65784538, + "num_input_tokens_seen": 241195250, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.12207031, + "step": 11173, + "time_per_iteration": 2.4826292991638184 + }, + { + "auxiliary_loss_clip": 0.01128253, + "auxiliary_loss_mlp": 0.01038922, + "balance_loss_clip": 1.04886138, + "balance_loss_mlp": 1.02505183, + "epoch": 0.6718172253118894, + "flos": 18734094656640.0, + "grad_norm": 2.4199698096371085, + "language_loss": 0.71596414, + "learning_rate": 1.02721637475002e-06, + "loss": 0.73763585, + "num_input_tokens_seen": 241210720, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.13873291, + "step": 11174, + "time_per_iteration": 2.4300734996795654 + }, + { + "auxiliary_loss_clip": 0.01118835, + "auxiliary_loss_mlp": 0.01027316, + "balance_loss_clip": 1.04640388, + "balance_loss_mlp": 1.01544261, + "epoch": 0.6718773485645574, + "flos": 15632526061440.0, + "grad_norm": 4.273073313933594, + "language_loss": 0.68796694, + "learning_rate": 1.0268761046308178e-06, + "loss": 0.70942843, + "num_input_tokens_seen": 241227395, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11871338, + "step": 11175, + "time_per_iteration": 2.4740030765533447 + }, + { + "auxiliary_loss_clip": 0.01116719, + "auxiliary_loss_mlp": 0.01032827, + "balance_loss_clip": 1.04595745, + "balance_loss_mlp": 1.02166879, + "epoch": 0.6719374718172253, + "flos": 19354774694400.0, + "grad_norm": 1.8566557870596145, + "language_loss": 0.73834509, + "learning_rate": 1.0265358714125714e-06, + "loss": 0.75984049, + "num_input_tokens_seen": 241246355, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.1116333, + "step": 11176, + "time_per_iteration": 2.4628851413726807 + }, + { + "auxiliary_loss_clip": 0.01116841, + "auxiliary_loss_mlp": 0.01029537, + "balance_loss_clip": 1.04224396, + "balance_loss_mlp": 1.01676416, + "epoch": 0.6719975950698933, + "flos": 21981316901760.0, + "grad_norm": 2.417951764341331, + "language_loss": 0.73099387, + "learning_rate": 1.026195675108182e-06, + "loss": 0.75245762, + "num_input_tokens_seen": 241264180, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.12786865, + "step": 11177, + "time_per_iteration": 2.4965054988861084 + }, + { + "auxiliary_loss_clip": 0.01111723, + "auxiliary_loss_mlp": 0.01031696, + "balance_loss_clip": 1.03813183, + "balance_loss_mlp": 1.01846349, + "epoch": 0.6720577183225612, + "flos": 25228072270080.0, + "grad_norm": 2.0832851607876117, + "language_loss": 0.76367354, + "learning_rate": 1.025855515730551e-06, + "loss": 0.78510773, + "num_input_tokens_seen": 241282245, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.13214111, + "step": 11178, + "time_per_iteration": 2.601011037826538 + }, + { + "auxiliary_loss_clip": 0.01119853, + "auxiliary_loss_mlp": 0.01030722, + "balance_loss_clip": 1.04573154, + "balance_loss_mlp": 1.01917624, + "epoch": 0.6721178415752292, + "flos": 16945886949120.0, + "grad_norm": 2.2377730316969666, + "language_loss": 0.70225334, + "learning_rate": 1.0255153932925766e-06, + "loss": 0.72375906, + "num_input_tokens_seen": 241300745, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11541748, + "step": 11179, + "time_per_iteration": 2.4546282291412354 + }, + { + "auxiliary_loss_clip": 0.01116227, + "auxiliary_loss_mlp": 0.01037461, + "balance_loss_clip": 1.04333282, + "balance_loss_mlp": 1.02421093, + "epoch": 0.6721779648278972, + "flos": 21541375123200.0, + "grad_norm": 1.5939180553533687, + "language_loss": 0.73710203, + "learning_rate": 1.0251753078071557e-06, + "loss": 0.75863892, + "num_input_tokens_seen": 241319320, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.13250732, + "step": 11180, + "time_per_iteration": 3.874760866165161 + }, + { + "auxiliary_loss_clip": 0.01122348, + "auxiliary_loss_mlp": 0.01026203, + "balance_loss_clip": 1.04998064, + "balance_loss_mlp": 1.01477671, + "epoch": 0.6722380880805652, + "flos": 22605444645120.0, + "grad_norm": 1.4465895219181366, + "language_loss": 0.75422227, + "learning_rate": 1.0248352592871848e-06, + "loss": 0.77570778, + "num_input_tokens_seen": 241342225, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11425781, + "step": 11181, + "time_per_iteration": 2.58217191696167 + }, + { + "auxiliary_loss_clip": 0.01123998, + "auxiliary_loss_mlp": 0.01026505, + "balance_loss_clip": 1.04764342, + "balance_loss_mlp": 1.01484656, + "epoch": 0.6722982113332331, + "flos": 15925269905280.0, + "grad_norm": 2.5831578465232483, + "language_loss": 0.74646819, + "learning_rate": 1.0244952477455585e-06, + "loss": 0.76797324, + "num_input_tokens_seen": 241358240, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.11651611, + "step": 11182, + "time_per_iteration": 2.495699882507324 + }, + { + "auxiliary_loss_clip": 0.01112988, + "auxiliary_loss_mlp": 0.01033464, + "balance_loss_clip": 1.04281831, + "balance_loss_mlp": 1.02081585, + "epoch": 0.6723583345859011, + "flos": 20596170683520.0, + "grad_norm": 2.0221186331109244, + "language_loss": 0.70159203, + "learning_rate": 1.0241552731951699e-06, + "loss": 0.72305655, + "num_input_tokens_seen": 241378420, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.12634277, + "step": 11183, + "time_per_iteration": 2.463500738143921 + }, + { + "auxiliary_loss_clip": 0.01114789, + "auxiliary_loss_mlp": 0.0103612, + "balance_loss_clip": 1.04051137, + "balance_loss_mlp": 1.02191639, + "epoch": 0.672418457838569, + "flos": 21725848396800.0, + "grad_norm": 1.6748958926911592, + "language_loss": 0.77717865, + "learning_rate": 1.0238153356489112e-06, + "loss": 0.79868776, + "num_input_tokens_seen": 241397185, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.14221191, + "step": 11184, + "time_per_iteration": 2.4991116523742676 + }, + { + "auxiliary_loss_clip": 0.01128138, + "auxiliary_loss_mlp": 0.01035649, + "balance_loss_clip": 1.04629409, + "balance_loss_mlp": 1.02093244, + "epoch": 0.672478581091237, + "flos": 21470379891840.0, + "grad_norm": 2.5616064654639166, + "language_loss": 0.66131759, + "learning_rate": 1.0234754351196743e-06, + "loss": 0.6829555, + "num_input_tokens_seen": 241415785, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.14703369, + "step": 11185, + "time_per_iteration": 2.472320318222046 + }, + { + "auxiliary_loss_clip": 0.01118987, + "auxiliary_loss_mlp": 0.01031441, + "balance_loss_clip": 1.04498243, + "balance_loss_mlp": 1.01902533, + "epoch": 0.6725387043439051, + "flos": 30846763267200.0, + "grad_norm": 2.5639934615891886, + "language_loss": 0.80274099, + "learning_rate": 1.023135571620345e-06, + "loss": 0.82424527, + "num_input_tokens_seen": 241437390, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.12420654, + "step": 11186, + "time_per_iteration": 3.942584276199341 + }, + { + "auxiliary_loss_clip": 0.01116812, + "auxiliary_loss_mlp": 0.01030916, + "balance_loss_clip": 1.04457998, + "balance_loss_mlp": 1.01960313, + "epoch": 0.672598827596573, + "flos": 24055947659520.0, + "grad_norm": 1.448433355842108, + "language_loss": 0.80578256, + "learning_rate": 1.022795745163813e-06, + "loss": 0.82725978, + "num_input_tokens_seen": 241458085, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11303711, + "step": 11187, + "time_per_iteration": 2.463048219680786 + }, + { + "auxiliary_loss_clip": 0.01129796, + "auxiliary_loss_mlp": 0.01031075, + "balance_loss_clip": 1.05149078, + "balance_loss_mlp": 1.01770532, + "epoch": 0.672658950849241, + "flos": 21871861182720.0, + "grad_norm": 1.9094621027851961, + "language_loss": 0.70566481, + "learning_rate": 1.022455955762965e-06, + "loss": 0.72727352, + "num_input_tokens_seen": 241476880, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.13372803, + "step": 11188, + "time_per_iteration": 2.440729856491089 + }, + { + "auxiliary_loss_clip": 0.01121722, + "auxiliary_loss_mlp": 0.01032307, + "balance_loss_clip": 1.0506351, + "balance_loss_mlp": 1.02043939, + "epoch": 0.6727190741019089, + "flos": 23222102359680.0, + "grad_norm": 1.6974948616145438, + "language_loss": 0.75816274, + "learning_rate": 1.0221162034306842e-06, + "loss": 0.77970302, + "num_input_tokens_seen": 241496535, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.11871338, + "step": 11189, + "time_per_iteration": 2.4858202934265137 + }, + { + "auxiliary_loss_clip": 0.01124575, + "auxiliary_loss_mlp": 0.01032771, + "balance_loss_clip": 1.04694617, + "balance_loss_mlp": 1.01825726, + "epoch": 0.6727791973545769, + "flos": 15778610674560.0, + "grad_norm": 2.1366773297292503, + "language_loss": 0.74723041, + "learning_rate": 1.0217764881798562e-06, + "loss": 0.76880389, + "num_input_tokens_seen": 241513465, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.14520264, + "step": 11190, + "time_per_iteration": 2.421160936355591 + }, + { + "auxiliary_loss_clip": 0.01115668, + "auxiliary_loss_mlp": 0.01039786, + "balance_loss_clip": 1.04117537, + "balance_loss_mlp": 1.02517676, + "epoch": 0.6728393206072448, + "flos": 21249852341760.0, + "grad_norm": 2.6975747265587424, + "language_loss": 0.77261925, + "learning_rate": 1.0214368100233612e-06, + "loss": 0.79417378, + "num_input_tokens_seen": 241534125, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.14611816, + "step": 11191, + "time_per_iteration": 2.4646682739257812 + }, + { + "auxiliary_loss_clip": 0.01116981, + "auxiliary_loss_mlp": 0.01028309, + "balance_loss_clip": 1.04439592, + "balance_loss_mlp": 1.01615, + "epoch": 0.6728994438599128, + "flos": 32123279779200.0, + "grad_norm": 2.5966709096452973, + "language_loss": 0.86126482, + "learning_rate": 1.0210971689740802e-06, + "loss": 0.88271773, + "num_input_tokens_seen": 241556340, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.1217041, + "step": 11192, + "time_per_iteration": 2.5228404998779297 + }, + { + "auxiliary_loss_clip": 0.01120633, + "auxiliary_loss_mlp": 0.01036242, + "balance_loss_clip": 1.04563129, + "balance_loss_mlp": 1.02300382, + "epoch": 0.6729595671125808, + "flos": 23112359331840.0, + "grad_norm": 1.9049372664488884, + "language_loss": 0.75874418, + "learning_rate": 1.0207575650448923e-06, + "loss": 0.7803129, + "num_input_tokens_seen": 241575185, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.13238525, + "step": 11193, + "time_per_iteration": 3.8250057697296143 + }, + { + "auxiliary_loss_clip": 0.01122336, + "auxiliary_loss_mlp": 0.01029362, + "balance_loss_clip": 1.04811645, + "balance_loss_mlp": 1.01694679, + "epoch": 0.6730196903652488, + "flos": 14611406227200.0, + "grad_norm": 1.8026392376886842, + "language_loss": 0.78448391, + "learning_rate": 1.0204179982486758e-06, + "loss": 0.80600083, + "num_input_tokens_seen": 241592970, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.12420654, + "step": 11194, + "time_per_iteration": 2.460145950317383 + }, + { + "auxiliary_loss_clip": 0.01119624, + "auxiliary_loss_mlp": 0.01025807, + "balance_loss_clip": 1.04460526, + "balance_loss_mlp": 1.01402926, + "epoch": 0.6730798136179167, + "flos": 21105922544640.0, + "grad_norm": 2.004680848734549, + "language_loss": 0.90123624, + "learning_rate": 1.0200784685983075e-06, + "loss": 0.92269051, + "num_input_tokens_seen": 241610245, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11779785, + "step": 11195, + "time_per_iteration": 2.4775354862213135 + }, + { + "auxiliary_loss_clip": 0.0112062, + "auxiliary_loss_mlp": 0.01030804, + "balance_loss_clip": 1.04781604, + "balance_loss_mlp": 1.01904964, + "epoch": 0.6731399368705847, + "flos": 28986267438720.0, + "grad_norm": 1.741673279889103, + "language_loss": 0.72325641, + "learning_rate": 1.019738976106662e-06, + "loss": 0.74477065, + "num_input_tokens_seen": 241630350, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11755371, + "step": 11196, + "time_per_iteration": 2.483473062515259 + }, + { + "auxiliary_loss_clip": 0.01079768, + "auxiliary_loss_mlp": 0.01005485, + "balance_loss_clip": 1.05474246, + "balance_loss_mlp": 1.00376666, + "epoch": 0.6732000601232526, + "flos": 64743708723840.0, + "grad_norm": 0.7730532729219116, + "language_loss": 0.56507486, + "learning_rate": 1.0193995207866123e-06, + "loss": 0.58592743, + "num_input_tokens_seen": 241692380, + "router_z_loss_clip": 0.25073242, + "router_z_loss_mlp": 0.0171814, + "step": 11197, + "time_per_iteration": 3.0269031524658203 + }, + { + "auxiliary_loss_clip": 0.01109549, + "auxiliary_loss_mlp": 0.01025025, + "balance_loss_clip": 1.04024982, + "balance_loss_mlp": 1.0138607, + "epoch": 0.6732601833759206, + "flos": 17201642762880.0, + "grad_norm": 2.5619963616998533, + "language_loss": 0.75707197, + "learning_rate": 1.0190601026510312e-06, + "loss": 0.77841771, + "num_input_tokens_seen": 241710430, + "router_z_loss_clip": 0.69287109, + "router_z_loss_mlp": 0.11169434, + "step": 11198, + "time_per_iteration": 2.4595470428466797 + }, + { + "auxiliary_loss_clip": 0.01110394, + "auxiliary_loss_mlp": 0.01033956, + "balance_loss_clip": 1.03610587, + "balance_loss_mlp": 1.02010965, + "epoch": 0.6733203066285887, + "flos": 18658861620480.0, + "grad_norm": 2.075207914669623, + "language_loss": 0.81725687, + "learning_rate": 1.0187207217127892e-06, + "loss": 0.83870035, + "num_input_tokens_seen": 241724775, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.13848877, + "step": 11199, + "time_per_iteration": 2.4356865882873535 + }, + { + "auxiliary_loss_clip": 0.01119611, + "auxiliary_loss_mlp": 0.0103068, + "balance_loss_clip": 1.04491687, + "balance_loss_mlp": 1.01786542, + "epoch": 0.6733804298812566, + "flos": 35809330481280.0, + "grad_norm": 1.7940587229788034, + "language_loss": 0.71468163, + "learning_rate": 1.0183813779847552e-06, + "loss": 0.73618448, + "num_input_tokens_seen": 241744440, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.12799072, + "step": 11200, + "time_per_iteration": 2.6031248569488525 + }, + { + "auxiliary_loss_clip": 0.01116344, + "auxiliary_loss_mlp": 0.01032826, + "balance_loss_clip": 1.04318881, + "balance_loss_mlp": 1.02066648, + "epoch": 0.6734405531339246, + "flos": 61638833099520.0, + "grad_norm": 1.7908682014012445, + "language_loss": 0.64719194, + "learning_rate": 1.0180420714797987e-06, + "loss": 0.66868365, + "num_input_tokens_seen": 241771705, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.12158203, + "step": 11201, + "time_per_iteration": 2.8387694358825684 + }, + { + "auxiliary_loss_clip": 0.01117821, + "auxiliary_loss_mlp": 0.01034366, + "balance_loss_clip": 1.04277921, + "balance_loss_mlp": 1.02000666, + "epoch": 0.6735006763865925, + "flos": 20522338277760.0, + "grad_norm": 1.8067021474137126, + "language_loss": 0.63395488, + "learning_rate": 1.0177028022107856e-06, + "loss": 0.65547675, + "num_input_tokens_seen": 241790830, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.14343262, + "step": 11202, + "time_per_iteration": 2.4603030681610107 + }, + { + "auxiliary_loss_clip": 0.01120906, + "auxiliary_loss_mlp": 0.01029998, + "balance_loss_clip": 1.04527175, + "balance_loss_mlp": 1.01769543, + "epoch": 0.6735607996392605, + "flos": 13918869031680.0, + "grad_norm": 1.9094779027597517, + "language_loss": 0.74806619, + "learning_rate": 1.0173635701905796e-06, + "loss": 0.76957524, + "num_input_tokens_seen": 241808165, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.12316895, + "step": 11203, + "time_per_iteration": 2.465318441390991 + }, + { + "auxiliary_loss_clip": 0.01124249, + "auxiliary_loss_mlp": 0.01038766, + "balance_loss_clip": 1.04397488, + "balance_loss_mlp": 1.02357864, + "epoch": 0.6736209228919284, + "flos": 18807244704000.0, + "grad_norm": 1.9329440800905215, + "language_loss": 0.68064189, + "learning_rate": 1.0170243754320456e-06, + "loss": 0.70227206, + "num_input_tokens_seen": 241826925, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.15197754, + "step": 11204, + "time_per_iteration": 2.486121654510498 + }, + { + "auxiliary_loss_clip": 0.01130716, + "auxiliary_loss_mlp": 0.01030288, + "balance_loss_clip": 1.05136979, + "balance_loss_mlp": 1.01690102, + "epoch": 0.6736810461445965, + "flos": 20373129181440.0, + "grad_norm": 1.8688632306674484, + "language_loss": 0.7421537, + "learning_rate": 1.0166852179480465e-06, + "loss": 0.76376373, + "num_input_tokens_seen": 241845525, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.13372803, + "step": 11205, + "time_per_iteration": 2.446664333343506 + }, + { + "auxiliary_loss_clip": 0.0111562, + "auxiliary_loss_mlp": 0.01032257, + "balance_loss_clip": 1.04354417, + "balance_loss_mlp": 1.02017474, + "epoch": 0.6737411693972644, + "flos": 30007530927360.0, + "grad_norm": 1.592368542259438, + "language_loss": 0.71391577, + "learning_rate": 1.0163460977514416e-06, + "loss": 0.73539454, + "num_input_tokens_seen": 241866815, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.12072754, + "step": 11206, + "time_per_iteration": 2.573132276535034 + }, + { + "auxiliary_loss_clip": 0.01128762, + "auxiliary_loss_mlp": 0.01031978, + "balance_loss_clip": 1.05032611, + "balance_loss_mlp": 1.0187397, + "epoch": 0.6738012926499324, + "flos": 25447342844160.0, + "grad_norm": 2.774518146926174, + "language_loss": 0.67642897, + "learning_rate": 1.016007014855092e-06, + "loss": 0.69803631, + "num_input_tokens_seen": 241887050, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.13244629, + "step": 11207, + "time_per_iteration": 2.5870845317840576 + }, + { + "auxiliary_loss_clip": 0.01125475, + "auxiliary_loss_mlp": 0.01031381, + "balance_loss_clip": 1.05120134, + "balance_loss_mlp": 1.01931739, + "epoch": 0.6738614159026003, + "flos": 20776873029120.0, + "grad_norm": 2.0907870900999117, + "language_loss": 0.73229134, + "learning_rate": 1.0156679692718553e-06, + "loss": 0.75385988, + "num_input_tokens_seen": 241904280, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12060547, + "step": 11208, + "time_per_iteration": 2.6344540119171143 + }, + { + "auxiliary_loss_clip": 0.01124281, + "auxiliary_loss_mlp": 0.0103383, + "balance_loss_clip": 1.04826784, + "balance_loss_mlp": 1.0198946, + "epoch": 0.6739215391552683, + "flos": 19566898462080.0, + "grad_norm": 2.13041273281752, + "language_loss": 0.75759178, + "learning_rate": 1.0153289610145867e-06, + "loss": 0.7791729, + "num_input_tokens_seen": 241919190, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.13946533, + "step": 11209, + "time_per_iteration": 2.4794681072235107 + }, + { + "auxiliary_loss_clip": 0.01112718, + "auxiliary_loss_mlp": 0.01028945, + "balance_loss_clip": 1.04350829, + "balance_loss_mlp": 1.01786482, + "epoch": 0.6739816624079362, + "flos": 24388193485440.0, + "grad_norm": 2.148024719788347, + "language_loss": 0.66614377, + "learning_rate": 1.0149899900961428e-06, + "loss": 0.68756044, + "num_input_tokens_seen": 241940525, + "router_z_loss_clip": 0.69238281, + "router_z_loss_mlp": 0.11083984, + "step": 11210, + "time_per_iteration": 3.8635969161987305 + }, + { + "auxiliary_loss_clip": 0.01110538, + "auxiliary_loss_mlp": 0.01027246, + "balance_loss_clip": 1.04129457, + "balance_loss_mlp": 1.01652265, + "epoch": 0.6740417856606042, + "flos": 22528164533760.0, + "grad_norm": 2.0645869294656207, + "language_loss": 0.80414194, + "learning_rate": 1.014651056529377e-06, + "loss": 0.82551986, + "num_input_tokens_seen": 241959290, + "router_z_loss_clip": 0.69189453, + "router_z_loss_mlp": 0.10717773, + "step": 11211, + "time_per_iteration": 2.4179036617279053 + }, + { + "auxiliary_loss_clip": 0.01112667, + "auxiliary_loss_mlp": 0.01027478, + "balance_loss_clip": 1.04338264, + "balance_loss_mlp": 1.01582539, + "epoch": 0.6741019089132723, + "flos": 25775458606080.0, + "grad_norm": 1.3979823883799913, + "language_loss": 0.76578832, + "learning_rate": 1.014312160327143e-06, + "loss": 0.78718984, + "num_input_tokens_seen": 241980715, + "router_z_loss_clip": 0.69287109, + "router_z_loss_mlp": 0.11651611, + "step": 11212, + "time_per_iteration": 2.5173792839050293 + }, + { + "auxiliary_loss_clip": 0.0112069, + "auxiliary_loss_mlp": 0.01027704, + "balance_loss_clip": 1.04413629, + "balance_loss_mlp": 1.01473403, + "epoch": 0.6741620321659402, + "flos": 21105671149440.0, + "grad_norm": 2.5754157311845485, + "language_loss": 0.77518153, + "learning_rate": 1.0139733015022905e-06, + "loss": 0.79666543, + "num_input_tokens_seen": 241999985, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.12976074, + "step": 11213, + "time_per_iteration": 2.4693331718444824 + }, + { + "auxiliary_loss_clip": 0.01121196, + "auxiliary_loss_mlp": 0.01030489, + "balance_loss_clip": 1.04320455, + "balance_loss_mlp": 1.01722693, + "epoch": 0.6742221554186082, + "flos": 20740423703040.0, + "grad_norm": 2.091624117008319, + "language_loss": 0.67627871, + "learning_rate": 1.0136344800676685e-06, + "loss": 0.69779557, + "num_input_tokens_seen": 242018990, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.13250732, + "step": 11214, + "time_per_iteration": 2.4239559173583984 + }, + { + "auxiliary_loss_clip": 0.0111865, + "auxiliary_loss_mlp": 0.0103436, + "balance_loss_clip": 1.04513168, + "balance_loss_mlp": 1.02240968, + "epoch": 0.6742822786712761, + "flos": 37774146384000.0, + "grad_norm": 1.6564499804903943, + "language_loss": 0.72694719, + "learning_rate": 1.0132956960361263e-06, + "loss": 0.74847728, + "num_input_tokens_seen": 242039340, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11956787, + "step": 11215, + "time_per_iteration": 2.5814006328582764 + }, + { + "auxiliary_loss_clip": 0.01119486, + "auxiliary_loss_mlp": 0.01028313, + "balance_loss_clip": 1.04566979, + "balance_loss_mlp": 1.01742291, + "epoch": 0.6743424019239441, + "flos": 37263891732480.0, + "grad_norm": 2.163434016633333, + "language_loss": 0.66912055, + "learning_rate": 1.0129569494205096e-06, + "loss": 0.69059861, + "num_input_tokens_seen": 242062215, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.10894775, + "step": 11216, + "time_per_iteration": 2.567336082458496 + }, + { + "auxiliary_loss_clip": 0.01049672, + "auxiliary_loss_mlp": 0.01003341, + "balance_loss_clip": 1.02514362, + "balance_loss_mlp": 1.00201154, + "epoch": 0.674402525176612, + "flos": 65997746300160.0, + "grad_norm": 0.6782116601626933, + "language_loss": 0.56273311, + "learning_rate": 1.0126182402336646e-06, + "loss": 0.58326328, + "num_input_tokens_seen": 242131130, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.01330566, + "step": 11217, + "time_per_iteration": 3.211648941040039 + }, + { + "auxiliary_loss_clip": 0.01111786, + "auxiliary_loss_mlp": 0.01028303, + "balance_loss_clip": 1.03953886, + "balance_loss_mlp": 1.015625, + "epoch": 0.67446264842928, + "flos": 26461208131200.0, + "grad_norm": 2.175841993016687, + "language_loss": 0.74364805, + "learning_rate": 1.0122795684884363e-06, + "loss": 0.76504898, + "num_input_tokens_seen": 242149720, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.12683105, + "step": 11218, + "time_per_iteration": 2.4553422927856445 + }, + { + "auxiliary_loss_clip": 0.01122119, + "auxiliary_loss_mlp": 0.0104313, + "balance_loss_clip": 1.04443777, + "balance_loss_mlp": 1.028157, + "epoch": 0.674522771681948, + "flos": 23732392924800.0, + "grad_norm": 1.6764927348863634, + "language_loss": 0.66165352, + "learning_rate": 1.0119409341976639e-06, + "loss": 0.68330598, + "num_input_tokens_seen": 242168875, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.14971924, + "step": 11219, + "time_per_iteration": 2.506953716278076 + }, + { + "auxiliary_loss_clip": 0.01121823, + "auxiliary_loss_mlp": 0.01033948, + "balance_loss_clip": 1.04661083, + "balance_loss_mlp": 1.02177072, + "epoch": 0.674582894934616, + "flos": 24754338771840.0, + "grad_norm": 1.9415675385555424, + "language_loss": 0.75246191, + "learning_rate": 1.0116023373741904e-06, + "loss": 0.77401966, + "num_input_tokens_seen": 242188465, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.12176514, + "step": 11220, + "time_per_iteration": 2.5126144886016846 + }, + { + "auxiliary_loss_clip": 0.01109473, + "auxiliary_loss_mlp": 0.01031074, + "balance_loss_clip": 1.03632855, + "balance_loss_mlp": 1.01794899, + "epoch": 0.6746430181872839, + "flos": 24826626892800.0, + "grad_norm": 1.6304323215874552, + "language_loss": 0.7032457, + "learning_rate": 1.0112637780308554e-06, + "loss": 0.72465116, + "num_input_tokens_seen": 242208675, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.13128662, + "step": 11221, + "time_per_iteration": 2.6355371475219727 + }, + { + "auxiliary_loss_clip": 0.01120575, + "auxiliary_loss_mlp": 0.01031772, + "balance_loss_clip": 1.04495859, + "balance_loss_mlp": 1.02002954, + "epoch": 0.6747031414399519, + "flos": 16873491087360.0, + "grad_norm": 2.301838898223665, + "language_loss": 0.58274233, + "learning_rate": 1.010925256180498e-06, + "loss": 0.60426581, + "num_input_tokens_seen": 242227440, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.11749268, + "step": 11222, + "time_per_iteration": 2.480332136154175 + }, + { + "auxiliary_loss_clip": 0.01114824, + "auxiliary_loss_mlp": 0.01033619, + "balance_loss_clip": 1.042099, + "balance_loss_mlp": 1.02085173, + "epoch": 0.6747632646926198, + "flos": 22784925928320.0, + "grad_norm": 4.606793003263405, + "language_loss": 0.76905656, + "learning_rate": 1.0105867718359528e-06, + "loss": 0.79054099, + "num_input_tokens_seen": 242245240, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.12768555, + "step": 11223, + "time_per_iteration": 2.5599873065948486 + }, + { + "auxiliary_loss_clip": 0.01123151, + "auxiliary_loss_mlp": 0.01030283, + "balance_loss_clip": 1.04911268, + "balance_loss_mlp": 1.018291, + "epoch": 0.6748233879452878, + "flos": 20046090827520.0, + "grad_norm": 2.853883979973225, + "language_loss": 0.7538588, + "learning_rate": 1.0102483250100574e-06, + "loss": 0.77539313, + "num_input_tokens_seen": 242263435, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11993408, + "step": 11224, + "time_per_iteration": 3.883176326751709 + }, + { + "auxiliary_loss_clip": 0.0110776, + "auxiliary_loss_mlp": 0.01025305, + "balance_loss_clip": 1.03844607, + "balance_loss_mlp": 1.01462388, + "epoch": 0.6748835111979558, + "flos": 23002831785600.0, + "grad_norm": 1.7150886067442894, + "language_loss": 0.63108134, + "learning_rate": 1.0099099157156445e-06, + "loss": 0.652412, + "num_input_tokens_seen": 242282765, + "router_z_loss_clip": 0.69238281, + "router_z_loss_mlp": 0.10687256, + "step": 11225, + "time_per_iteration": 2.5022130012512207 + }, + { + "auxiliary_loss_clip": 0.01107998, + "auxiliary_loss_mlp": 0.01040724, + "balance_loss_clip": 1.03946948, + "balance_loss_mlp": 1.02798069, + "epoch": 0.6749436344506238, + "flos": 12197311009920.0, + "grad_norm": 1.8722688367063027, + "language_loss": 0.64134347, + "learning_rate": 1.0095715439655462e-06, + "loss": 0.66283065, + "num_input_tokens_seen": 242298980, + "router_z_loss_clip": 0.68603516, + "router_z_loss_mlp": 0.12756348, + "step": 11226, + "time_per_iteration": 2.4306154251098633 + }, + { + "auxiliary_loss_clip": 0.01116089, + "auxiliary_loss_mlp": 0.01037901, + "balance_loss_clip": 1.04064775, + "balance_loss_mlp": 1.02466333, + "epoch": 0.6750037577032918, + "flos": 11873720361600.0, + "grad_norm": 2.2141888401099137, + "language_loss": 0.72251993, + "learning_rate": 1.0092332097725945e-06, + "loss": 0.74405986, + "num_input_tokens_seen": 242315420, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.13238525, + "step": 11227, + "time_per_iteration": 2.4950718879699707 + }, + { + "auxiliary_loss_clip": 0.01120521, + "auxiliary_loss_mlp": 0.01026764, + "balance_loss_clip": 1.04844189, + "balance_loss_mlp": 1.01445532, + "epoch": 0.6750638809559597, + "flos": 17019611614080.0, + "grad_norm": 2.957476027810783, + "language_loss": 0.72104549, + "learning_rate": 1.0088949131496183e-06, + "loss": 0.74251837, + "num_input_tokens_seen": 242332805, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.12316895, + "step": 11228, + "time_per_iteration": 2.4740865230560303 + }, + { + "auxiliary_loss_clip": 0.01057302, + "auxiliary_loss_mlp": 0.01001766, + "balance_loss_clip": 1.03218412, + "balance_loss_mlp": 1.00018609, + "epoch": 0.6751240042086277, + "flos": 70951011891840.0, + "grad_norm": 0.752899778584118, + "language_loss": 0.53225833, + "learning_rate": 1.0085566541094482e-06, + "loss": 0.55284894, + "num_input_tokens_seen": 242396160, + "router_z_loss_clip": 0.2512207, + "router_z_loss_mlp": 0.01582336, + "step": 11229, + "time_per_iteration": 4.579422473907471 + }, + { + "auxiliary_loss_clip": 0.01112328, + "auxiliary_loss_mlp": 0.01031661, + "balance_loss_clip": 1.03974175, + "balance_loss_mlp": 1.01845837, + "epoch": 0.6751841274612956, + "flos": 22675146986880.0, + "grad_norm": 2.048444056569325, + "language_loss": 0.80292833, + "learning_rate": 1.0082184326649072e-06, + "loss": 0.82436818, + "num_input_tokens_seen": 242414660, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.13201904, + "step": 11230, + "time_per_iteration": 2.49057674407959 + }, + { + "auxiliary_loss_clip": 0.01116282, + "auxiliary_loss_mlp": 0.01022934, + "balance_loss_clip": 1.04566002, + "balance_loss_mlp": 1.01237833, + "epoch": 0.6752442507139637, + "flos": 21288636051840.0, + "grad_norm": 1.6701343968881033, + "language_loss": 0.65678662, + "learning_rate": 1.0078802488288228e-06, + "loss": 0.67817879, + "num_input_tokens_seen": 242434225, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.10552979, + "step": 11231, + "time_per_iteration": 2.433072090148926 + }, + { + "auxiliary_loss_clip": 0.01123443, + "auxiliary_loss_mlp": 0.01040063, + "balance_loss_clip": 1.04478097, + "balance_loss_mlp": 1.02581763, + "epoch": 0.6753043739666316, + "flos": 28256921781120.0, + "grad_norm": 2.197432412363281, + "language_loss": 0.66675895, + "learning_rate": 1.0075421026140198e-06, + "loss": 0.68839401, + "num_input_tokens_seen": 242454355, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.14257812, + "step": 11232, + "time_per_iteration": 2.5187458992004395 + }, + { + "auxiliary_loss_clip": 0.01109904, + "auxiliary_loss_mlp": 0.01031932, + "balance_loss_clip": 1.03941226, + "balance_loss_mlp": 1.01915884, + "epoch": 0.6753644972192996, + "flos": 21360349555200.0, + "grad_norm": 1.768385318599703, + "language_loss": 0.72738129, + "learning_rate": 1.0072039940333188e-06, + "loss": 0.74879968, + "num_input_tokens_seen": 242474935, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.12768555, + "step": 11233, + "time_per_iteration": 2.4623208045959473 + }, + { + "auxiliary_loss_clip": 0.01120283, + "auxiliary_loss_mlp": 0.01031445, + "balance_loss_clip": 1.04657459, + "balance_loss_mlp": 1.01908338, + "epoch": 0.6754246204719675, + "flos": 26541971861760.0, + "grad_norm": 1.6354496449575346, + "language_loss": 0.76728266, + "learning_rate": 1.0068659230995418e-06, + "loss": 0.78879994, + "num_input_tokens_seen": 242495530, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.12353516, + "step": 11234, + "time_per_iteration": 2.510617733001709 + }, + { + "auxiliary_loss_clip": 0.01118363, + "auxiliary_loss_mlp": 0.01035013, + "balance_loss_clip": 1.04415512, + "balance_loss_mlp": 1.02180481, + "epoch": 0.6754847437246355, + "flos": 25556690822400.0, + "grad_norm": 1.5684541346046423, + "language_loss": 0.75170273, + "learning_rate": 1.0065278898255101e-06, + "loss": 0.77323651, + "num_input_tokens_seen": 242514550, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.13208008, + "step": 11235, + "time_per_iteration": 2.5272228717803955 + }, + { + "auxiliary_loss_clip": 0.01052929, + "auxiliary_loss_mlp": 0.01005168, + "balance_loss_clip": 1.02728426, + "balance_loss_mlp": 1.00365996, + "epoch": 0.6755448669773034, + "flos": 59513318726400.0, + "grad_norm": 0.7845979423666695, + "language_loss": 0.51354742, + "learning_rate": 1.0061898942240387e-06, + "loss": 0.53412843, + "num_input_tokens_seen": 242569200, + "router_z_loss_clip": 0.25683594, + "router_z_loss_mlp": 0.01507568, + "step": 11236, + "time_per_iteration": 3.046825647354126 + }, + { + "auxiliary_loss_clip": 0.01116033, + "auxiliary_loss_mlp": 0.01029729, + "balance_loss_clip": 1.04344356, + "balance_loss_mlp": 1.0159837, + "epoch": 0.6756049902299714, + "flos": 23294534135040.0, + "grad_norm": 1.9365426783997648, + "language_loss": 0.7537623, + "learning_rate": 1.0058519363079464e-06, + "loss": 0.77521992, + "num_input_tokens_seen": 242586950, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.13757324, + "step": 11237, + "time_per_iteration": 3.891382932662964 + }, + { + "auxiliary_loss_clip": 0.01113998, + "auxiliary_loss_mlp": 0.0103578, + "balance_loss_clip": 1.04232466, + "balance_loss_mlp": 1.02431238, + "epoch": 0.6756651134826394, + "flos": 31575426566400.0, + "grad_norm": 1.8735148793800283, + "language_loss": 0.7781198, + "learning_rate": 1.0055140160900482e-06, + "loss": 0.79961765, + "num_input_tokens_seen": 242607380, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11474609, + "step": 11238, + "time_per_iteration": 2.5638396739959717 + }, + { + "auxiliary_loss_clip": 0.01117615, + "auxiliary_loss_mlp": 0.01035652, + "balance_loss_clip": 1.04004574, + "balance_loss_mlp": 1.02231252, + "epoch": 0.6757252367353074, + "flos": 27272287186560.0, + "grad_norm": 1.633296038680004, + "language_loss": 0.66689265, + "learning_rate": 1.0051761335831587e-06, + "loss": 0.6884253, + "num_input_tokens_seen": 242628025, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.13336182, + "step": 11239, + "time_per_iteration": 2.5157673358917236 + }, + { + "auxiliary_loss_clip": 0.01115906, + "auxiliary_loss_mlp": 0.01024619, + "balance_loss_clip": 1.04372287, + "balance_loss_mlp": 1.01228046, + "epoch": 0.6757853599879754, + "flos": 16830900535680.0, + "grad_norm": 1.9651648046777956, + "language_loss": 0.82837558, + "learning_rate": 1.0048382888000898e-06, + "loss": 0.84978086, + "num_input_tokens_seen": 242643825, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.12329102, + "step": 11240, + "time_per_iteration": 2.4256930351257324 + }, + { + "auxiliary_loss_clip": 0.01120225, + "auxiliary_loss_mlp": 0.01033302, + "balance_loss_clip": 1.04320359, + "balance_loss_mlp": 1.01764965, + "epoch": 0.6758454832406433, + "flos": 23220055284480.0, + "grad_norm": 2.0264783248325804, + "language_loss": 0.7451694, + "learning_rate": 1.0045004817536525e-06, + "loss": 0.76670468, + "num_input_tokens_seen": 242661820, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.15637207, + "step": 11241, + "time_per_iteration": 2.4558427333831787 + }, + { + "auxiliary_loss_clip": 0.01109932, + "auxiliary_loss_mlp": 0.01033695, + "balance_loss_clip": 1.03753972, + "balance_loss_mlp": 1.02152944, + "epoch": 0.6759056064933113, + "flos": 16289547684480.0, + "grad_norm": 3.3850821660773875, + "language_loss": 0.80738449, + "learning_rate": 1.0041627124566572e-06, + "loss": 0.82882077, + "num_input_tokens_seen": 242679890, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.1217041, + "step": 11242, + "time_per_iteration": 2.4525482654571533 + }, + { + "auxiliary_loss_clip": 0.01110806, + "auxiliary_loss_mlp": 0.01031365, + "balance_loss_clip": 1.03775382, + "balance_loss_mlp": 1.0190928, + "epoch": 0.6759657297459792, + "flos": 25922297404800.0, + "grad_norm": 1.7761946645634659, + "language_loss": 0.72760367, + "learning_rate": 1.0038249809219109e-06, + "loss": 0.74902534, + "num_input_tokens_seen": 242699495, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.12286377, + "step": 11243, + "time_per_iteration": 2.488016128540039 + }, + { + "auxiliary_loss_clip": 0.01113293, + "auxiliary_loss_mlp": 0.01037597, + "balance_loss_clip": 1.04098463, + "balance_loss_mlp": 1.02589655, + "epoch": 0.6760258529986473, + "flos": 23000820624000.0, + "grad_norm": 1.6104615574564873, + "language_loss": 0.72392094, + "learning_rate": 1.003487287162221e-06, + "loss": 0.74542987, + "num_input_tokens_seen": 242719500, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11700439, + "step": 11244, + "time_per_iteration": 2.4842755794525146 + }, + { + "auxiliary_loss_clip": 0.0111533, + "auxiliary_loss_mlp": 0.01036747, + "balance_loss_clip": 1.04102731, + "balance_loss_mlp": 1.02298427, + "epoch": 0.6760859762513152, + "flos": 20959335141120.0, + "grad_norm": 2.0498217839018262, + "language_loss": 0.85544425, + "learning_rate": 1.003149631190393e-06, + "loss": 0.87696505, + "num_input_tokens_seen": 242738325, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.13781738, + "step": 11245, + "time_per_iteration": 2.437216281890869 + }, + { + "auxiliary_loss_clip": 0.0111399, + "auxiliary_loss_mlp": 0.01030168, + "balance_loss_clip": 1.03709328, + "balance_loss_mlp": 1.01690626, + "epoch": 0.6761460995039832, + "flos": 23622937205760.0, + "grad_norm": 1.8512127263974303, + "language_loss": 0.74007714, + "learning_rate": 1.0028120130192327e-06, + "loss": 0.76151872, + "num_input_tokens_seen": 242756620, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.13250732, + "step": 11246, + "time_per_iteration": 2.490081310272217 + }, + { + "auxiliary_loss_clip": 0.0111004, + "auxiliary_loss_mlp": 0.01028925, + "balance_loss_clip": 1.03754425, + "balance_loss_mlp": 1.016819, + "epoch": 0.6762062227566511, + "flos": 20770875457920.0, + "grad_norm": 1.776478581142646, + "language_loss": 0.8797186, + "learning_rate": 1.002474432661539e-06, + "loss": 0.90110826, + "num_input_tokens_seen": 242774505, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.12109375, + "step": 11247, + "time_per_iteration": 2.4647090435028076 + }, + { + "auxiliary_loss_clip": 0.01045876, + "auxiliary_loss_mlp": 0.01002816, + "balance_loss_clip": 1.02100515, + "balance_loss_mlp": 1.00152862, + "epoch": 0.6762663460093191, + "flos": 52818099166080.0, + "grad_norm": 0.8336356829976812, + "language_loss": 0.54017127, + "learning_rate": 1.002136890130115e-06, + "loss": 0.56065822, + "num_input_tokens_seen": 242828645, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.01287842, + "step": 11248, + "time_per_iteration": 3.134838819503784 + }, + { + "auxiliary_loss_clip": 0.01108519, + "auxiliary_loss_mlp": 0.0102831, + "balance_loss_clip": 1.03983748, + "balance_loss_mlp": 1.0170033, + "epoch": 0.676326469261987, + "flos": 23696302734720.0, + "grad_norm": 1.772255473670857, + "language_loss": 0.73601103, + "learning_rate": 1.001799385437761e-06, + "loss": 0.75737929, + "num_input_tokens_seen": 242850100, + "router_z_loss_clip": 0.68701172, + "router_z_loss_mlp": 0.11315918, + "step": 11249, + "time_per_iteration": 2.54050350189209 + }, + { + "auxiliary_loss_clip": 0.01123438, + "auxiliary_loss_mlp": 0.01034881, + "balance_loss_clip": 1.04669142, + "balance_loss_mlp": 1.02151716, + "epoch": 0.676386592514655, + "flos": 14063732582400.0, + "grad_norm": 2.3049222036292445, + "language_loss": 0.74231982, + "learning_rate": 1.0014619185972732e-06, + "loss": 0.76390302, + "num_input_tokens_seen": 242867775, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.13354492, + "step": 11250, + "time_per_iteration": 2.4503273963928223 + }, + { + "auxiliary_loss_clip": 0.01120069, + "auxiliary_loss_mlp": 0.01029878, + "balance_loss_clip": 1.04570901, + "balance_loss_mlp": 1.01776028, + "epoch": 0.676446715767323, + "flos": 20412236113920.0, + "grad_norm": 3.3555019602183442, + "language_loss": 0.75227112, + "learning_rate": 1.0011244896214497e-06, + "loss": 0.77377057, + "num_input_tokens_seen": 242886865, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.12109375, + "step": 11251, + "time_per_iteration": 2.459523916244507 + }, + { + "auxiliary_loss_clip": 0.01123145, + "auxiliary_loss_mlp": 0.01034593, + "balance_loss_clip": 1.05153155, + "balance_loss_mlp": 1.02165294, + "epoch": 0.676506839019991, + "flos": 21288241002240.0, + "grad_norm": 1.729412433980108, + "language_loss": 0.7023254, + "learning_rate": 1.0007870985230873e-06, + "loss": 0.72390282, + "num_input_tokens_seen": 242906705, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.12939453, + "step": 11252, + "time_per_iteration": 2.428889036178589 + }, + { + "auxiliary_loss_clip": 0.01116235, + "auxiliary_loss_mlp": 0.01030697, + "balance_loss_clip": 1.04392898, + "balance_loss_mlp": 1.01872873, + "epoch": 0.676566962272659, + "flos": 29932477459200.0, + "grad_norm": 2.3420371019292436, + "language_loss": 0.67031372, + "learning_rate": 1.0004497453149765e-06, + "loss": 0.69178307, + "num_input_tokens_seen": 242925215, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11962891, + "step": 11253, + "time_per_iteration": 3.892592191696167 + }, + { + "auxiliary_loss_clip": 0.01123367, + "auxiliary_loss_mlp": 0.01035165, + "balance_loss_clip": 1.04854071, + "balance_loss_mlp": 1.02164078, + "epoch": 0.6766270855253269, + "flos": 17931203902080.0, + "grad_norm": 1.5863081936458296, + "language_loss": 0.76791507, + "learning_rate": 1.0001124300099115e-06, + "loss": 0.78950036, + "num_input_tokens_seen": 242944750, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.13525391, + "step": 11254, + "time_per_iteration": 2.4211764335632324 + }, + { + "auxiliary_loss_clip": 0.01116131, + "auxiliary_loss_mlp": 0.01029885, + "balance_loss_clip": 1.04013717, + "balance_loss_mlp": 1.01705194, + "epoch": 0.6766872087779949, + "flos": 23104853389440.0, + "grad_norm": 2.1223840457621073, + "language_loss": 0.72271049, + "learning_rate": 9.997751526206835e-07, + "loss": 0.74417067, + "num_input_tokens_seen": 242963860, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.128479, + "step": 11255, + "time_per_iteration": 2.470702648162842 + }, + { + "auxiliary_loss_clip": 0.01119578, + "auxiliary_loss_mlp": 0.01039948, + "balance_loss_clip": 1.04609513, + "balance_loss_mlp": 1.02683485, + "epoch": 0.6767473320306628, + "flos": 26213137827840.0, + "grad_norm": 3.8029209956270247, + "language_loss": 0.75523496, + "learning_rate": 9.994379131600828e-07, + "loss": 0.7768302, + "num_input_tokens_seen": 242983050, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.13110352, + "step": 11256, + "time_per_iteration": 2.5326807498931885 + }, + { + "auxiliary_loss_clip": 0.0112211, + "auxiliary_loss_mlp": 0.01035435, + "balance_loss_clip": 1.04572225, + "balance_loss_mlp": 1.02237582, + "epoch": 0.6768074552833309, + "flos": 18368739469440.0, + "grad_norm": 2.2706662939873876, + "language_loss": 0.65970784, + "learning_rate": 9.991007116408965e-07, + "loss": 0.6812833, + "num_input_tokens_seen": 243001125, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.13067627, + "step": 11257, + "time_per_iteration": 2.491342306137085 + }, + { + "auxiliary_loss_clip": 0.01115861, + "auxiliary_loss_mlp": 0.01030583, + "balance_loss_clip": 1.044631, + "balance_loss_mlp": 1.01917481, + "epoch": 0.6768675785359988, + "flos": 23039927556480.0, + "grad_norm": 1.5372608216745332, + "language_loss": 0.75472558, + "learning_rate": 9.987635480759109e-07, + "loss": 0.77619004, + "num_input_tokens_seen": 243021865, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.11407471, + "step": 11258, + "time_per_iteration": 2.4693758487701416 + }, + { + "auxiliary_loss_clip": 0.01117115, + "auxiliary_loss_mlp": 0.01032158, + "balance_loss_clip": 1.04460597, + "balance_loss_mlp": 1.02055871, + "epoch": 0.6769277017886668, + "flos": 33036524092800.0, + "grad_norm": 1.754689779886603, + "language_loss": 0.66878861, + "learning_rate": 9.984264224779127e-07, + "loss": 0.69028139, + "num_input_tokens_seen": 243042970, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11608887, + "step": 11259, + "time_per_iteration": 2.542729139328003 + }, + { + "auxiliary_loss_clip": 0.01113701, + "auxiliary_loss_mlp": 0.0103073, + "balance_loss_clip": 1.04089808, + "balance_loss_mlp": 1.01850486, + "epoch": 0.6769878250413347, + "flos": 20848406964480.0, + "grad_norm": 2.2288969801412444, + "language_loss": 0.85899043, + "learning_rate": 9.980893348596839e-07, + "loss": 0.88043475, + "num_input_tokens_seen": 243058470, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.12231445, + "step": 11260, + "time_per_iteration": 2.4318389892578125 + }, + { + "auxiliary_loss_clip": 0.01116254, + "auxiliary_loss_mlp": 0.01035236, + "balance_loss_clip": 1.0393405, + "balance_loss_mlp": 1.02255249, + "epoch": 0.6770479482940027, + "flos": 15595968994560.0, + "grad_norm": 2.703594196576036, + "language_loss": 0.77727127, + "learning_rate": 9.977522852340081e-07, + "loss": 0.7987861, + "num_input_tokens_seen": 243076630, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.12701416, + "step": 11261, + "time_per_iteration": 2.447338581085205 + }, + { + "auxiliary_loss_clip": 0.01112533, + "auxiliary_loss_mlp": 0.01038192, + "balance_loss_clip": 1.03907275, + "balance_loss_mlp": 1.02541292, + "epoch": 0.6771080715466706, + "flos": 18621011664000.0, + "grad_norm": 1.9356226929073452, + "language_loss": 0.87903041, + "learning_rate": 9.97415273613666e-07, + "loss": 0.90053767, + "num_input_tokens_seen": 243092260, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.12774658, + "step": 11262, + "time_per_iteration": 2.429882287979126 + }, + { + "auxiliary_loss_clip": 0.01115974, + "auxiliary_loss_mlp": 0.01031403, + "balance_loss_clip": 1.04169512, + "balance_loss_mlp": 1.018695, + "epoch": 0.6771681947993387, + "flos": 12495441893760.0, + "grad_norm": 1.7892077620671474, + "language_loss": 0.74345851, + "learning_rate": 9.97078300011439e-07, + "loss": 0.76493222, + "num_input_tokens_seen": 243109405, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.12713623, + "step": 11263, + "time_per_iteration": 2.463107109069824 + }, + { + "auxiliary_loss_clip": 0.01119749, + "auxiliary_loss_mlp": 0.0103582, + "balance_loss_clip": 1.0433917, + "balance_loss_mlp": 1.02195048, + "epoch": 0.6772283180520066, + "flos": 22236964974720.0, + "grad_norm": 2.965989407679336, + "language_loss": 0.67793983, + "learning_rate": 9.967413644401016e-07, + "loss": 0.69949555, + "num_input_tokens_seen": 243128135, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.13848877, + "step": 11264, + "time_per_iteration": 2.4450247287750244 + }, + { + "auxiliary_loss_clip": 0.01119226, + "auxiliary_loss_mlp": 0.01035625, + "balance_loss_clip": 1.04598236, + "balance_loss_mlp": 1.02272058, + "epoch": 0.6772884413046746, + "flos": 16143139848960.0, + "grad_norm": 1.981999310647251, + "language_loss": 0.73334968, + "learning_rate": 9.964044669124324e-07, + "loss": 0.75489819, + "num_input_tokens_seen": 243146785, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.12902832, + "step": 11265, + "time_per_iteration": 2.430816173553467 + }, + { + "auxiliary_loss_clip": 0.01113265, + "auxiliary_loss_mlp": 0.01031178, + "balance_loss_clip": 1.04111862, + "balance_loss_mlp": 1.01933479, + "epoch": 0.6773485645573426, + "flos": 19135755515520.0, + "grad_norm": 1.7258257595800592, + "language_loss": 0.61808127, + "learning_rate": 9.96067607441207e-07, + "loss": 0.63952565, + "num_input_tokens_seen": 243165275, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11853027, + "step": 11266, + "time_per_iteration": 2.4221274852752686 + }, + { + "auxiliary_loss_clip": 0.01117777, + "auxiliary_loss_mlp": 0.01038102, + "balance_loss_clip": 1.04293478, + "balance_loss_mlp": 1.02343869, + "epoch": 0.6774086878100105, + "flos": 14136918543360.0, + "grad_norm": 1.9579988959103685, + "language_loss": 0.70864546, + "learning_rate": 9.957307860391976e-07, + "loss": 0.73020428, + "num_input_tokens_seen": 243182845, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.14666748, + "step": 11267, + "time_per_iteration": 3.856975555419922 + }, + { + "auxiliary_loss_clip": 0.01117013, + "auxiliary_loss_mlp": 0.01031189, + "balance_loss_clip": 1.04198599, + "balance_loss_mlp": 1.01827884, + "epoch": 0.6774688110626785, + "flos": 22197067943040.0, + "grad_norm": 1.9725723079243775, + "language_loss": 0.70708966, + "learning_rate": 9.953940027191785e-07, + "loss": 0.72857165, + "num_input_tokens_seen": 243201475, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12921143, + "step": 11268, + "time_per_iteration": 2.4762752056121826 + }, + { + "auxiliary_loss_clip": 0.01126848, + "auxiliary_loss_mlp": 0.0102638, + "balance_loss_clip": 1.04983819, + "balance_loss_mlp": 1.01331425, + "epoch": 0.6775289343153464, + "flos": 23039963470080.0, + "grad_norm": 1.8687281394141146, + "language_loss": 0.77270007, + "learning_rate": 9.950572574939194e-07, + "loss": 0.79423237, + "num_input_tokens_seen": 243221850, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.13067627, + "step": 11269, + "time_per_iteration": 2.4776904582977295 + }, + { + "auxiliary_loss_clip": 0.01120263, + "auxiliary_loss_mlp": 0.01035932, + "balance_loss_clip": 1.04449475, + "balance_loss_mlp": 1.02249074, + "epoch": 0.6775890575680145, + "flos": 18293506433280.0, + "grad_norm": 2.2802449597307723, + "language_loss": 0.74785495, + "learning_rate": 9.94720550376189e-07, + "loss": 0.76941687, + "num_input_tokens_seen": 243239855, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.13439941, + "step": 11270, + "time_per_iteration": 2.5088977813720703 + }, + { + "auxiliary_loss_clip": 0.01119203, + "auxiliary_loss_mlp": 0.01037779, + "balance_loss_clip": 1.04280186, + "balance_loss_mlp": 1.02425432, + "epoch": 0.6776491808206824, + "flos": 25336450581120.0, + "grad_norm": 2.592536824537659, + "language_loss": 0.72990894, + "learning_rate": 9.94383881378756e-07, + "loss": 0.75147873, + "num_input_tokens_seen": 243260085, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.13531494, + "step": 11271, + "time_per_iteration": 2.5109896659851074 + }, + { + "auxiliary_loss_clip": 0.01114381, + "auxiliary_loss_mlp": 0.01032688, + "balance_loss_clip": 1.04016268, + "balance_loss_mlp": 1.02021241, + "epoch": 0.6777093040733504, + "flos": 26028233591040.0, + "grad_norm": 1.947260233096853, + "language_loss": 0.67797244, + "learning_rate": 9.94047250514387e-07, + "loss": 0.6994431, + "num_input_tokens_seen": 243280065, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.12481689, + "step": 11272, + "time_per_iteration": 2.551255464553833 + }, + { + "auxiliary_loss_clip": 0.01121443, + "auxiliary_loss_mlp": 0.01040593, + "balance_loss_clip": 1.04263234, + "balance_loss_mlp": 1.02546573, + "epoch": 0.6777694273260183, + "flos": 18003599763840.0, + "grad_norm": 2.0370206482879563, + "language_loss": 0.73671407, + "learning_rate": 9.937106577958481e-07, + "loss": 0.7583344, + "num_input_tokens_seen": 243297775, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.15124512, + "step": 11273, + "time_per_iteration": 3.907059669494629 + }, + { + "auxiliary_loss_clip": 0.01118998, + "auxiliary_loss_mlp": 0.01039312, + "balance_loss_clip": 1.04688072, + "balance_loss_mlp": 1.0266999, + "epoch": 0.6778295505786863, + "flos": 23441085624960.0, + "grad_norm": 1.8782319017860445, + "language_loss": 0.70594269, + "learning_rate": 9.933741032359015e-07, + "loss": 0.72752577, + "num_input_tokens_seen": 243315760, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.12609863, + "step": 11274, + "time_per_iteration": 2.4614040851593018 + }, + { + "auxiliary_loss_clip": 0.01119911, + "auxiliary_loss_mlp": 0.01029618, + "balance_loss_clip": 1.0456903, + "balance_loss_mlp": 1.01700544, + "epoch": 0.6778896738313542, + "flos": 19098408349440.0, + "grad_norm": 2.428806811207653, + "language_loss": 0.65613711, + "learning_rate": 9.930375868473093e-07, + "loss": 0.67763239, + "num_input_tokens_seen": 243335715, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12597656, + "step": 11275, + "time_per_iteration": 2.4864273071289062 + }, + { + "auxiliary_loss_clip": 0.01115959, + "auxiliary_loss_mlp": 0.01029711, + "balance_loss_clip": 1.04375112, + "balance_loss_mlp": 1.01835608, + "epoch": 0.6779497970840223, + "flos": 26103933504000.0, + "grad_norm": 1.6164050624154307, + "language_loss": 0.7277205, + "learning_rate": 9.927011086428335e-07, + "loss": 0.74917722, + "num_input_tokens_seen": 243356935, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11358643, + "step": 11276, + "time_per_iteration": 2.505383253097534 + }, + { + "auxiliary_loss_clip": 0.01115382, + "auxiliary_loss_mlp": 0.01030877, + "balance_loss_clip": 1.04124379, + "balance_loss_mlp": 1.0178175, + "epoch": 0.6780099203366902, + "flos": 19719232041600.0, + "grad_norm": 1.6284700603556999, + "language_loss": 0.76845396, + "learning_rate": 9.923646686352317e-07, + "loss": 0.78991652, + "num_input_tokens_seen": 243375625, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.1305542, + "step": 11277, + "time_per_iteration": 2.439943790435791 + }, + { + "auxiliary_loss_clip": 0.0112626, + "auxiliary_loss_mlp": 0.01028792, + "balance_loss_clip": 1.04822624, + "balance_loss_mlp": 1.01607847, + "epoch": 0.6780700435893582, + "flos": 18214538382720.0, + "grad_norm": 4.075524737739049, + "language_loss": 0.83327603, + "learning_rate": 9.920282668372627e-07, + "loss": 0.85482657, + "num_input_tokens_seen": 243390195, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.12719727, + "step": 11278, + "time_per_iteration": 2.403552293777466 + }, + { + "auxiliary_loss_clip": 0.0112072, + "auxiliary_loss_mlp": 0.01031474, + "balance_loss_clip": 1.04937506, + "balance_loss_mlp": 1.0197196, + "epoch": 0.6781301668420262, + "flos": 25376239872000.0, + "grad_norm": 1.7953621646137514, + "language_loss": 0.70439893, + "learning_rate": 9.916919032616844e-07, + "loss": 0.72592086, + "num_input_tokens_seen": 243411690, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.11761475, + "step": 11279, + "time_per_iteration": 2.491370439529419 + }, + { + "auxiliary_loss_clip": 0.0112624, + "auxiliary_loss_mlp": 0.01035246, + "balance_loss_clip": 1.046574, + "balance_loss_mlp": 1.0209471, + "epoch": 0.6781902900946941, + "flos": 24020432087040.0, + "grad_norm": 2.033776338289645, + "language_loss": 0.7434817, + "learning_rate": 9.913555779212485e-07, + "loss": 0.76509655, + "num_input_tokens_seen": 243430280, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.14300537, + "step": 11280, + "time_per_iteration": 3.8782522678375244 + }, + { + "auxiliary_loss_clip": 0.01122034, + "auxiliary_loss_mlp": 0.01031308, + "balance_loss_clip": 1.04533672, + "balance_loss_mlp": 1.01817167, + "epoch": 0.6782504133473621, + "flos": 19646764352640.0, + "grad_norm": 1.767162984890653, + "language_loss": 0.70315653, + "learning_rate": 9.910192908287104e-07, + "loss": 0.72468996, + "num_input_tokens_seen": 243448690, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.13140869, + "step": 11281, + "time_per_iteration": 2.5376906394958496 + }, + { + "auxiliary_loss_clip": 0.0111706, + "auxiliary_loss_mlp": 0.01027076, + "balance_loss_clip": 1.04499328, + "balance_loss_mlp": 1.01526237, + "epoch": 0.67831053660003, + "flos": 24932742647040.0, + "grad_norm": 2.3400924454397387, + "language_loss": 0.6362282, + "learning_rate": 9.906830419968217e-07, + "loss": 0.65766954, + "num_input_tokens_seen": 243470695, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.1182251, + "step": 11282, + "time_per_iteration": 2.523181200027466 + }, + { + "auxiliary_loss_clip": 0.0113054, + "auxiliary_loss_mlp": 0.01036806, + "balance_loss_clip": 1.05202508, + "balance_loss_mlp": 1.02269745, + "epoch": 0.6783706598526981, + "flos": 31208383440000.0, + "grad_norm": 2.312198356925788, + "language_loss": 0.74456078, + "learning_rate": 9.90346831438334e-07, + "loss": 0.76623416, + "num_input_tokens_seen": 243493345, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.14099121, + "step": 11283, + "time_per_iteration": 2.526090383529663 + }, + { + "auxiliary_loss_clip": 0.01117437, + "auxiliary_loss_mlp": 0.01025829, + "balance_loss_clip": 1.04411614, + "balance_loss_mlp": 1.01391387, + "epoch": 0.678430783105366, + "flos": 35441317687680.0, + "grad_norm": 1.842605211516809, + "language_loss": 0.5706228, + "learning_rate": 9.900106591659948e-07, + "loss": 0.59205544, + "num_input_tokens_seen": 243515670, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.1192627, + "step": 11284, + "time_per_iteration": 2.5903615951538086 + }, + { + "auxiliary_loss_clip": 0.01113842, + "auxiliary_loss_mlp": 0.01029262, + "balance_loss_clip": 1.03964925, + "balance_loss_mlp": 1.01675701, + "epoch": 0.678490906358034, + "flos": 14428800460800.0, + "grad_norm": 4.4147772721532395, + "language_loss": 0.75176722, + "learning_rate": 9.896745251925535e-07, + "loss": 0.77319825, + "num_input_tokens_seen": 243533625, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.12512207, + "step": 11285, + "time_per_iteration": 2.4603939056396484 + }, + { + "auxiliary_loss_clip": 0.01114267, + "auxiliary_loss_mlp": 0.01029086, + "balance_loss_clip": 1.04218316, + "balance_loss_mlp": 1.01672411, + "epoch": 0.6785510296107019, + "flos": 24311236596480.0, + "grad_norm": 1.603501575588669, + "language_loss": 0.66724014, + "learning_rate": 9.893384295307557e-07, + "loss": 0.68867368, + "num_input_tokens_seen": 243553040, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.12353516, + "step": 11286, + "time_per_iteration": 2.4983580112457275 + }, + { + "auxiliary_loss_clip": 0.01111794, + "auxiliary_loss_mlp": 0.01029263, + "balance_loss_clip": 1.03728032, + "balance_loss_mlp": 1.01636481, + "epoch": 0.6786111528633699, + "flos": 26977244872320.0, + "grad_norm": 2.6791240648999843, + "language_loss": 0.52442241, + "learning_rate": 9.890023721933447e-07, + "loss": 0.54583293, + "num_input_tokens_seen": 243572590, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.12896729, + "step": 11287, + "time_per_iteration": 2.5717012882232666 + }, + { + "auxiliary_loss_clip": 0.01114413, + "auxiliary_loss_mlp": 0.01031114, + "balance_loss_clip": 1.04170859, + "balance_loss_mlp": 1.01887703, + "epoch": 0.6786712761160378, + "flos": 24317557390080.0, + "grad_norm": 1.856505361194839, + "language_loss": 0.77275473, + "learning_rate": 9.886663531930655e-07, + "loss": 0.79421008, + "num_input_tokens_seen": 243594140, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.12237549, + "step": 11288, + "time_per_iteration": 2.5196011066436768 + }, + { + "auxiliary_loss_clip": 0.01119047, + "auxiliary_loss_mlp": 0.01034836, + "balance_loss_clip": 1.04419494, + "balance_loss_mlp": 1.02268267, + "epoch": 0.6787313993687059, + "flos": 22930435923840.0, + "grad_norm": 2.0713510132375244, + "language_loss": 0.72608674, + "learning_rate": 9.883303725426593e-07, + "loss": 0.74762559, + "num_input_tokens_seen": 243615170, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.12158203, + "step": 11289, + "time_per_iteration": 2.519904136657715 + }, + { + "auxiliary_loss_clip": 0.01115696, + "auxiliary_loss_mlp": 0.01032047, + "balance_loss_clip": 1.04210448, + "balance_loss_mlp": 1.01948261, + "epoch": 0.6787915226213738, + "flos": 26868435598080.0, + "grad_norm": 1.551817860864115, + "language_loss": 0.79925036, + "learning_rate": 9.879944302548682e-07, + "loss": 0.82072783, + "num_input_tokens_seen": 243635675, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.12579346, + "step": 11290, + "time_per_iteration": 2.511605739593506 + }, + { + "auxiliary_loss_clip": 0.01109953, + "auxiliary_loss_mlp": 0.0102999, + "balance_loss_clip": 1.04011297, + "balance_loss_mlp": 1.0181284, + "epoch": 0.6788516458740418, + "flos": 20008851402240.0, + "grad_norm": 2.0042442574994346, + "language_loss": 0.74812192, + "learning_rate": 9.87658526342428e-07, + "loss": 0.76952136, + "num_input_tokens_seen": 243654950, + "router_z_loss_clip": 0.69775391, + "router_z_loss_mlp": 0.11871338, + "step": 11291, + "time_per_iteration": 2.502516746520996 + }, + { + "auxiliary_loss_clip": 0.01119026, + "auxiliary_loss_mlp": 0.01033176, + "balance_loss_clip": 1.04365802, + "balance_loss_mlp": 1.02023554, + "epoch": 0.6789117691267098, + "flos": 28727099832960.0, + "grad_norm": 2.0971352246226043, + "language_loss": 0.75537336, + "learning_rate": 9.873226608180785e-07, + "loss": 0.7768954, + "num_input_tokens_seen": 243674970, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.12945557, + "step": 11292, + "time_per_iteration": 2.515885353088379 + }, + { + "auxiliary_loss_clip": 0.01111411, + "auxiliary_loss_mlp": 0.01032048, + "balance_loss_clip": 1.03856981, + "balance_loss_mlp": 1.01939416, + "epoch": 0.6789718923793777, + "flos": 23403451150080.0, + "grad_norm": 2.18716183168301, + "language_loss": 0.8419801, + "learning_rate": 9.869868336945556e-07, + "loss": 0.86341465, + "num_input_tokens_seen": 243693440, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.12652588, + "step": 11293, + "time_per_iteration": 2.523451089859009 + }, + { + "auxiliary_loss_clip": 0.01121988, + "auxiliary_loss_mlp": 0.01050448, + "balance_loss_clip": 1.04393411, + "balance_loss_mlp": 1.03454554, + "epoch": 0.6790320156320457, + "flos": 20448865008000.0, + "grad_norm": 2.2012352669660067, + "language_loss": 0.7954812, + "learning_rate": 9.866510449845929e-07, + "loss": 0.81720561, + "num_input_tokens_seen": 243710055, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.15917969, + "step": 11294, + "time_per_iteration": 2.46125864982605 + }, + { + "auxiliary_loss_clip": 0.01117547, + "auxiliary_loss_mlp": 0.01034827, + "balance_loss_clip": 1.04263496, + "balance_loss_mlp": 1.02138066, + "epoch": 0.6790921388847136, + "flos": 24167199058560.0, + "grad_norm": 1.957418499014004, + "language_loss": 0.78914315, + "learning_rate": 9.86315294700924e-07, + "loss": 0.81066692, + "num_input_tokens_seen": 243728635, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.13452148, + "step": 11295, + "time_per_iteration": 2.5151171684265137 + }, + { + "auxiliary_loss_clip": 0.01115085, + "auxiliary_loss_mlp": 0.01031837, + "balance_loss_clip": 1.04353213, + "balance_loss_mlp": 1.02146542, + "epoch": 0.6791522621373817, + "flos": 21908095027200.0, + "grad_norm": 1.7903628255950854, + "language_loss": 0.71596003, + "learning_rate": 9.859795828562823e-07, + "loss": 0.73742926, + "num_input_tokens_seen": 243748330, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.1036377, + "step": 11296, + "time_per_iteration": 3.8905959129333496 + }, + { + "auxiliary_loss_clip": 0.01118599, + "auxiliary_loss_mlp": 0.01034078, + "balance_loss_clip": 1.04235613, + "balance_loss_mlp": 1.02221084, + "epoch": 0.6792123853900496, + "flos": 24826519152000.0, + "grad_norm": 1.6584341357247019, + "language_loss": 0.71019781, + "learning_rate": 9.856439094633949e-07, + "loss": 0.73172462, + "num_input_tokens_seen": 243769380, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.11865234, + "step": 11297, + "time_per_iteration": 2.4729249477386475 + }, + { + "auxiliary_loss_clip": 0.01126473, + "auxiliary_loss_mlp": 0.01031953, + "balance_loss_clip": 1.04685783, + "balance_loss_mlp": 1.01878619, + "epoch": 0.6792725086427176, + "flos": 17566279678080.0, + "grad_norm": 2.1055246620924533, + "language_loss": 0.66507572, + "learning_rate": 9.853082745349918e-07, + "loss": 0.68665993, + "num_input_tokens_seen": 243785510, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.13165283, + "step": 11298, + "time_per_iteration": 2.391615152359009 + }, + { + "auxiliary_loss_clip": 0.01126399, + "auxiliary_loss_mlp": 0.01031247, + "balance_loss_clip": 1.04985833, + "balance_loss_mlp": 1.01954031, + "epoch": 0.6793326318953855, + "flos": 26941837040640.0, + "grad_norm": 1.7607303550589861, + "language_loss": 0.71673781, + "learning_rate": 9.84972678083801e-07, + "loss": 0.73831427, + "num_input_tokens_seen": 243805545, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.11700439, + "step": 11299, + "time_per_iteration": 2.5099422931671143 + }, + { + "auxiliary_loss_clip": 0.01119797, + "auxiliary_loss_mlp": 0.0103333, + "balance_loss_clip": 1.04583812, + "balance_loss_mlp": 1.0207057, + "epoch": 0.6793927551480535, + "flos": 24318275662080.0, + "grad_norm": 1.2836701982450534, + "language_loss": 0.77134365, + "learning_rate": 9.846371201225488e-07, + "loss": 0.79287493, + "num_input_tokens_seen": 243825185, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.12634277, + "step": 11300, + "time_per_iteration": 2.489985227584839 + }, + { + "auxiliary_loss_clip": 0.01116711, + "auxiliary_loss_mlp": 0.01030425, + "balance_loss_clip": 1.04378378, + "balance_loss_mlp": 1.01764619, + "epoch": 0.6794528784007214, + "flos": 11436615757440.0, + "grad_norm": 2.1501546651392434, + "language_loss": 0.62930584, + "learning_rate": 9.843016006639577e-07, + "loss": 0.65077722, + "num_input_tokens_seen": 243841600, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.12774658, + "step": 11301, + "time_per_iteration": 2.4167022705078125 + }, + { + "auxiliary_loss_clip": 0.01120231, + "auxiliary_loss_mlp": 0.0102769, + "balance_loss_clip": 1.04665136, + "balance_loss_mlp": 1.01630521, + "epoch": 0.6795130016533895, + "flos": 25229688382080.0, + "grad_norm": 1.6551651912857068, + "language_loss": 0.82650268, + "learning_rate": 9.839661197207525e-07, + "loss": 0.84798193, + "num_input_tokens_seen": 243862250, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11383057, + "step": 11302, + "time_per_iteration": 2.4718501567840576 + }, + { + "auxiliary_loss_clip": 0.0112385, + "auxiliary_loss_mlp": 0.01033607, + "balance_loss_clip": 1.04946685, + "balance_loss_mlp": 1.02125061, + "epoch": 0.6795731249060574, + "flos": 18296415434880.0, + "grad_norm": 1.90716539337998, + "language_loss": 0.7011776, + "learning_rate": 9.83630677305654e-07, + "loss": 0.72275221, + "num_input_tokens_seen": 243880560, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.12359619, + "step": 11303, + "time_per_iteration": 2.498840808868408 + }, + { + "auxiliary_loss_clip": 0.01126083, + "auxiliary_loss_mlp": 0.01040734, + "balance_loss_clip": 1.04672301, + "balance_loss_mlp": 1.02664959, + "epoch": 0.6796332481587254, + "flos": 20300374183680.0, + "grad_norm": 1.960468630080718, + "language_loss": 0.70243812, + "learning_rate": 9.832952734313813e-07, + "loss": 0.72410625, + "num_input_tokens_seen": 243900635, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.140625, + "step": 11304, + "time_per_iteration": 2.454350471496582 + }, + { + "auxiliary_loss_clip": 0.01124204, + "auxiliary_loss_mlp": 0.01037763, + "balance_loss_clip": 1.05020952, + "balance_loss_mlp": 1.02287412, + "epoch": 0.6796933714113934, + "flos": 23586847015680.0, + "grad_norm": 2.5232238854500255, + "language_loss": 0.73108703, + "learning_rate": 9.829599081106536e-07, + "loss": 0.75270671, + "num_input_tokens_seen": 243920160, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.14886475, + "step": 11305, + "time_per_iteration": 2.508728504180908 + }, + { + "auxiliary_loss_clip": 0.01116991, + "auxiliary_loss_mlp": 0.01027332, + "balance_loss_clip": 1.04250324, + "balance_loss_mlp": 1.01493478, + "epoch": 0.6797534946640613, + "flos": 27119917693440.0, + "grad_norm": 1.963309183952465, + "language_loss": 0.66003788, + "learning_rate": 9.826245813561882e-07, + "loss": 0.68148118, + "num_input_tokens_seen": 243939015, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.12402344, + "step": 11306, + "time_per_iteration": 2.524543046951294 + }, + { + "auxiliary_loss_clip": 0.01120114, + "auxiliary_loss_mlp": 0.01028898, + "balance_loss_clip": 1.04739785, + "balance_loss_mlp": 1.01566005, + "epoch": 0.6798136179167293, + "flos": 22127437428480.0, + "grad_norm": 2.015227547686682, + "language_loss": 0.79967332, + "learning_rate": 9.822892931807021e-07, + "loss": 0.82116348, + "num_input_tokens_seen": 243958470, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.13232422, + "step": 11307, + "time_per_iteration": 2.5876688957214355 + }, + { + "auxiliary_loss_clip": 0.01120342, + "auxiliary_loss_mlp": 0.01033885, + "balance_loss_clip": 1.04720557, + "balance_loss_mlp": 1.02009869, + "epoch": 0.6798737411693972, + "flos": 17488640430720.0, + "grad_norm": 1.7338836930512391, + "language_loss": 0.89177221, + "learning_rate": 9.819540435969066e-07, + "loss": 0.91331446, + "num_input_tokens_seen": 243975450, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.13775635, + "step": 11308, + "time_per_iteration": 2.4780542850494385 + }, + { + "auxiliary_loss_clip": 0.01120633, + "auxiliary_loss_mlp": 0.0104185, + "balance_loss_clip": 1.04412794, + "balance_loss_mlp": 1.02876699, + "epoch": 0.6799338644220653, + "flos": 22892262744960.0, + "grad_norm": 2.0417115117410947, + "language_loss": 0.7147401, + "learning_rate": 9.816188326175154e-07, + "loss": 0.73636484, + "num_input_tokens_seen": 243994355, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.1307373, + "step": 11309, + "time_per_iteration": 2.474914073944092 + }, + { + "auxiliary_loss_clip": 0.01130925, + "auxiliary_loss_mlp": 0.01034615, + "balance_loss_clip": 1.05456042, + "balance_loss_mlp": 1.02215815, + "epoch": 0.6799939876747332, + "flos": 23180409648000.0, + "grad_norm": 1.7825241620343222, + "language_loss": 0.84437662, + "learning_rate": 9.812836602552411e-07, + "loss": 0.866032, + "num_input_tokens_seen": 244011620, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.12475586, + "step": 11310, + "time_per_iteration": 2.444732666015625 + }, + { + "auxiliary_loss_clip": 0.01121296, + "auxiliary_loss_mlp": 0.01027027, + "balance_loss_clip": 1.05043149, + "balance_loss_mlp": 1.01553559, + "epoch": 0.6800541109274012, + "flos": 19499925553920.0, + "grad_norm": 2.6179954279118887, + "language_loss": 0.82904434, + "learning_rate": 9.80948526522792e-07, + "loss": 0.85052752, + "num_input_tokens_seen": 244029925, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.11499023, + "step": 11311, + "time_per_iteration": 3.8597187995910645 + }, + { + "auxiliary_loss_clip": 0.0112161, + "auxiliary_loss_mlp": 0.01031228, + "balance_loss_clip": 1.04355979, + "balance_loss_mlp": 1.01725054, + "epoch": 0.6801142341800691, + "flos": 22277652105600.0, + "grad_norm": 1.6180536949747968, + "language_loss": 0.76401263, + "learning_rate": 9.806134314328767e-07, + "loss": 0.78554106, + "num_input_tokens_seen": 244051225, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.14001465, + "step": 11312, + "time_per_iteration": 2.5281822681427 + }, + { + "auxiliary_loss_clip": 0.01053843, + "auxiliary_loss_mlp": 0.01007479, + "balance_loss_clip": 1.02845478, + "balance_loss_mlp": 1.00615835, + "epoch": 0.6801743574327371, + "flos": 68714817759360.0, + "grad_norm": 0.661084133015879, + "language_loss": 0.57216626, + "learning_rate": 9.802783749982038e-07, + "loss": 0.59277946, + "num_input_tokens_seen": 244115930, + "router_z_loss_clip": 0.25341797, + "router_z_loss_mlp": 0.01321411, + "step": 11313, + "time_per_iteration": 3.1887567043304443 + }, + { + "auxiliary_loss_clip": 0.01125836, + "auxiliary_loss_mlp": 0.01028963, + "balance_loss_clip": 1.04746199, + "balance_loss_mlp": 1.01588631, + "epoch": 0.680234480685405, + "flos": 29460467813760.0, + "grad_norm": 1.813908224643217, + "language_loss": 0.68757528, + "learning_rate": 9.799433572314754e-07, + "loss": 0.70912325, + "num_input_tokens_seen": 244137320, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.13067627, + "step": 11314, + "time_per_iteration": 2.535062789916992 + }, + { + "auxiliary_loss_clip": 0.01117002, + "auxiliary_loss_mlp": 0.01028378, + "balance_loss_clip": 1.04448843, + "balance_loss_mlp": 1.01715493, + "epoch": 0.6802946039380731, + "flos": 15916866122880.0, + "grad_norm": 1.7991910968958602, + "language_loss": 0.81686121, + "learning_rate": 9.796083781453972e-07, + "loss": 0.83831501, + "num_input_tokens_seen": 244152755, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11224365, + "step": 11315, + "time_per_iteration": 2.412580966949463 + }, + { + "auxiliary_loss_clip": 0.01111989, + "auxiliary_loss_mlp": 0.01025034, + "balance_loss_clip": 1.03967059, + "balance_loss_mlp": 1.01294661, + "epoch": 0.680354727190741, + "flos": 22018664067840.0, + "grad_norm": 1.7369282875088896, + "language_loss": 0.70072973, + "learning_rate": 9.792734377526718e-07, + "loss": 0.72210002, + "num_input_tokens_seen": 244171480, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.12078857, + "step": 11316, + "time_per_iteration": 2.549471139907837 + }, + { + "auxiliary_loss_clip": 0.01111377, + "auxiliary_loss_mlp": 0.01032066, + "balance_loss_clip": 1.03958237, + "balance_loss_mlp": 1.01920962, + "epoch": 0.680414850443409, + "flos": 18441494467200.0, + "grad_norm": 3.314374980811673, + "language_loss": 0.66768277, + "learning_rate": 9.789385360660003e-07, + "loss": 0.68911719, + "num_input_tokens_seen": 244187920, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.12854004, + "step": 11317, + "time_per_iteration": 3.9493601322174072 + }, + { + "auxiliary_loss_clip": 0.01124676, + "auxiliary_loss_mlp": 0.0104798, + "balance_loss_clip": 1.04546356, + "balance_loss_mlp": 1.03499198, + "epoch": 0.680474973696077, + "flos": 26358611909760.0, + "grad_norm": 1.5724935640593887, + "language_loss": 0.74945313, + "learning_rate": 9.78603673098082e-07, + "loss": 0.77117968, + "num_input_tokens_seen": 244209565, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.12988281, + "step": 11318, + "time_per_iteration": 2.496072292327881 + }, + { + "auxiliary_loss_clip": 0.01112356, + "auxiliary_loss_mlp": 0.01028702, + "balance_loss_clip": 1.04278374, + "balance_loss_mlp": 1.0178895, + "epoch": 0.6805350969487449, + "flos": 18333116156160.0, + "grad_norm": 1.789762217523153, + "language_loss": 0.68195468, + "learning_rate": 9.782688488616143e-07, + "loss": 0.70336521, + "num_input_tokens_seen": 244228015, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.10809326, + "step": 11319, + "time_per_iteration": 2.4641647338867188 + }, + { + "auxiliary_loss_clip": 0.01120788, + "auxiliary_loss_mlp": 0.01038414, + "balance_loss_clip": 1.04449272, + "balance_loss_mlp": 1.02563512, + "epoch": 0.6805952202014129, + "flos": 19937497034880.0, + "grad_norm": 2.1154291226340813, + "language_loss": 0.76744533, + "learning_rate": 9.779340633692945e-07, + "loss": 0.78903735, + "num_input_tokens_seen": 244245615, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.12786865, + "step": 11320, + "time_per_iteration": 2.441850185394287 + }, + { + "auxiliary_loss_clip": 0.01118083, + "auxiliary_loss_mlp": 0.01031164, + "balance_loss_clip": 1.04521596, + "balance_loss_mlp": 1.01873004, + "epoch": 0.6806553434540809, + "flos": 25224301342080.0, + "grad_norm": 1.9255096205951547, + "language_loss": 0.7500034, + "learning_rate": 9.77599316633817e-07, + "loss": 0.77149588, + "num_input_tokens_seen": 244263625, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.12438965, + "step": 11321, + "time_per_iteration": 2.4728927612304688 + }, + { + "auxiliary_loss_clip": 0.01123205, + "auxiliary_loss_mlp": 0.01035585, + "balance_loss_clip": 1.04915512, + "balance_loss_mlp": 1.02330661, + "epoch": 0.6807154667067489, + "flos": 17785586165760.0, + "grad_norm": 1.6353712980543218, + "language_loss": 0.72459805, + "learning_rate": 9.772646086678758e-07, + "loss": 0.7461859, + "num_input_tokens_seen": 244282745, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12280273, + "step": 11322, + "time_per_iteration": 2.4396727085113525 + }, + { + "auxiliary_loss_clip": 0.01121633, + "auxiliary_loss_mlp": 0.01031478, + "balance_loss_clip": 1.04643226, + "balance_loss_mlp": 1.01839471, + "epoch": 0.6807755899594168, + "flos": 22199905117440.0, + "grad_norm": 1.668847591348103, + "language_loss": 0.78629023, + "learning_rate": 9.769299394841638e-07, + "loss": 0.80782139, + "num_input_tokens_seen": 244303770, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.13079834, + "step": 11323, + "time_per_iteration": 3.873762607574463 + }, + { + "auxiliary_loss_clip": 0.0105584, + "auxiliary_loss_mlp": 0.01001672, + "balance_loss_clip": 1.03093708, + "balance_loss_mlp": 1.00025988, + "epoch": 0.6808357132120848, + "flos": 68631073200000.0, + "grad_norm": 0.7424328459444216, + "language_loss": 0.57122594, + "learning_rate": 9.765953090953714e-07, + "loss": 0.59180105, + "num_input_tokens_seen": 244355910, + "router_z_loss_clip": 0.2487793, + "router_z_loss_mlp": 0.0141449, + "step": 11324, + "time_per_iteration": 2.8827779293060303 + }, + { + "auxiliary_loss_clip": 0.01120618, + "auxiliary_loss_mlp": 0.01040088, + "balance_loss_clip": 1.04509163, + "balance_loss_mlp": 1.02725482, + "epoch": 0.6808958364647527, + "flos": 23843357015040.0, + "grad_norm": 2.026530345946477, + "language_loss": 0.68324184, + "learning_rate": 9.76260717514186e-07, + "loss": 0.70484889, + "num_input_tokens_seen": 244376610, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.12835693, + "step": 11325, + "time_per_iteration": 2.4926438331604004 + }, + { + "auxiliary_loss_clip": 0.01115794, + "auxiliary_loss_mlp": 0.01031059, + "balance_loss_clip": 1.03884923, + "balance_loss_mlp": 1.01794016, + "epoch": 0.6809559597174207, + "flos": 17711717846400.0, + "grad_norm": 2.8970348075212207, + "language_loss": 0.6979934, + "learning_rate": 9.759261647532974e-07, + "loss": 0.71946192, + "num_input_tokens_seen": 244393000, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.13122559, + "step": 11326, + "time_per_iteration": 2.4525630474090576 + }, + { + "auxiliary_loss_clip": 0.01113559, + "auxiliary_loss_mlp": 0.01035196, + "balance_loss_clip": 1.03946638, + "balance_loss_mlp": 1.02177918, + "epoch": 0.6810160829700886, + "flos": 22491894775680.0, + "grad_norm": 1.9246465627057558, + "language_loss": 0.72839093, + "learning_rate": 9.75591650825392e-07, + "loss": 0.74987847, + "num_input_tokens_seen": 244409515, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.13421631, + "step": 11327, + "time_per_iteration": 2.457028865814209 + }, + { + "auxiliary_loss_clip": 0.01112426, + "auxiliary_loss_mlp": 0.0103245, + "balance_loss_clip": 1.04036236, + "balance_loss_mlp": 1.01807356, + "epoch": 0.6810762062227567, + "flos": 16832875783680.0, + "grad_norm": 2.26706136590421, + "language_loss": 0.77098334, + "learning_rate": 9.752571757431526e-07, + "loss": 0.79243207, + "num_input_tokens_seen": 244427165, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.14379883, + "step": 11328, + "time_per_iteration": 2.5156779289245605 + }, + { + "auxiliary_loss_clip": 0.01123863, + "auxiliary_loss_mlp": 0.01029455, + "balance_loss_clip": 1.04691374, + "balance_loss_mlp": 1.01663971, + "epoch": 0.6811363294754246, + "flos": 12714676554240.0, + "grad_norm": 1.9368543943417171, + "language_loss": 0.63905239, + "learning_rate": 9.74922739519265e-07, + "loss": 0.66058558, + "num_input_tokens_seen": 244445705, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.12817383, + "step": 11329, + "time_per_iteration": 2.4820027351379395 + }, + { + "auxiliary_loss_clip": 0.01121428, + "auxiliary_loss_mlp": 0.01030833, + "balance_loss_clip": 1.04580474, + "balance_loss_mlp": 1.01742184, + "epoch": 0.6811964527280926, + "flos": 17711969241600.0, + "grad_norm": 2.199635078974928, + "language_loss": 0.79333115, + "learning_rate": 9.745883421664096e-07, + "loss": 0.81485379, + "num_input_tokens_seen": 244460415, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.13421631, + "step": 11330, + "time_per_iteration": 2.4564943313598633 + }, + { + "auxiliary_loss_clip": 0.01119801, + "auxiliary_loss_mlp": 0.01030602, + "balance_loss_clip": 1.04516673, + "balance_loss_mlp": 1.01807308, + "epoch": 0.6812565759807605, + "flos": 24863471268480.0, + "grad_norm": 2.8034715383033406, + "language_loss": 0.63926774, + "learning_rate": 9.742539836972665e-07, + "loss": 0.66077173, + "num_input_tokens_seen": 244480555, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.12518311, + "step": 11331, + "time_per_iteration": 2.505434274673462 + }, + { + "auxiliary_loss_clip": 0.01123431, + "auxiliary_loss_mlp": 0.01033097, + "balance_loss_clip": 1.04980063, + "balance_loss_mlp": 1.02009761, + "epoch": 0.6813166992334285, + "flos": 17166019449600.0, + "grad_norm": 1.6576411742119428, + "language_loss": 0.72349298, + "learning_rate": 9.739196641245148e-07, + "loss": 0.74505818, + "num_input_tokens_seen": 244498540, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.12988281, + "step": 11332, + "time_per_iteration": 2.4885857105255127 + }, + { + "auxiliary_loss_clip": 0.01119496, + "auxiliary_loss_mlp": 0.01035973, + "balance_loss_clip": 1.04294944, + "balance_loss_mlp": 1.02299154, + "epoch": 0.6813768224860965, + "flos": 18843550375680.0, + "grad_norm": 2.358749590761365, + "language_loss": 0.74752796, + "learning_rate": 9.735853834608326e-07, + "loss": 0.76908267, + "num_input_tokens_seen": 244517015, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.12988281, + "step": 11333, + "time_per_iteration": 2.4893100261688232 + }, + { + "auxiliary_loss_clip": 0.01126427, + "auxiliary_loss_mlp": 0.01031842, + "balance_loss_clip": 1.04614508, + "balance_loss_mlp": 1.01852643, + "epoch": 0.6814369457387645, + "flos": 24532733813760.0, + "grad_norm": 1.6691615717176056, + "language_loss": 0.72247052, + "learning_rate": 9.732511417188963e-07, + "loss": 0.74405318, + "num_input_tokens_seen": 244537450, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.13323975, + "step": 11334, + "time_per_iteration": 2.5060911178588867 + }, + { + "auxiliary_loss_clip": 0.0111422, + "auxiliary_loss_mlp": 0.01033343, + "balance_loss_clip": 1.04136562, + "balance_loss_mlp": 1.02141047, + "epoch": 0.6814970689914325, + "flos": 18222978078720.0, + "grad_norm": 1.6769890734651904, + "language_loss": 0.85844016, + "learning_rate": 9.729169389113791e-07, + "loss": 0.87991571, + "num_input_tokens_seen": 244555640, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11932373, + "step": 11335, + "time_per_iteration": 2.509284019470215 + }, + { + "auxiliary_loss_clip": 0.0110942, + "auxiliary_loss_mlp": 0.0103072, + "balance_loss_clip": 1.03869462, + "balance_loss_mlp": 1.01936579, + "epoch": 0.6815571922441004, + "flos": 25228790542080.0, + "grad_norm": 1.9105989858168608, + "language_loss": 0.82138878, + "learning_rate": 9.725827750509542e-07, + "loss": 0.84279019, + "num_input_tokens_seen": 244574005, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.11352539, + "step": 11336, + "time_per_iteration": 2.501790761947632 + }, + { + "auxiliary_loss_clip": 0.01113572, + "auxiliary_loss_mlp": 0.01026671, + "balance_loss_clip": 1.04187155, + "balance_loss_mlp": 1.01517344, + "epoch": 0.6816173154967684, + "flos": 19456078026240.0, + "grad_norm": 1.7893755873558508, + "language_loss": 0.81657279, + "learning_rate": 9.72248650150294e-07, + "loss": 0.83797514, + "num_input_tokens_seen": 244591395, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11505127, + "step": 11337, + "time_per_iteration": 2.5136473178863525 + }, + { + "auxiliary_loss_clip": 0.01118774, + "auxiliary_loss_mlp": 0.01029912, + "balance_loss_clip": 1.0445435, + "balance_loss_mlp": 1.01805019, + "epoch": 0.6816774387494363, + "flos": 17931455297280.0, + "grad_norm": 1.9012984537357416, + "language_loss": 0.72540164, + "learning_rate": 9.719145642220673e-07, + "loss": 0.74688852, + "num_input_tokens_seen": 244610400, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.11865234, + "step": 11338, + "time_per_iteration": 2.444639205932617 + }, + { + "auxiliary_loss_clip": 0.01119996, + "auxiliary_loss_mlp": 0.01032403, + "balance_loss_clip": 1.04714429, + "balance_loss_mlp": 1.02079165, + "epoch": 0.6817375620021043, + "flos": 22233014478720.0, + "grad_norm": 1.4864165700416967, + "language_loss": 0.77388382, + "learning_rate": 9.715805172789435e-07, + "loss": 0.79540783, + "num_input_tokens_seen": 244630400, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11608887, + "step": 11339, + "time_per_iteration": 2.4880270957946777 + }, + { + "auxiliary_loss_clip": 0.01118199, + "auxiliary_loss_mlp": 0.01037501, + "balance_loss_clip": 1.04379606, + "balance_loss_mlp": 1.02361298, + "epoch": 0.6817976852547722, + "flos": 25374408278400.0, + "grad_norm": 1.8443863755105578, + "language_loss": 0.70481968, + "learning_rate": 9.712465093335901e-07, + "loss": 0.72637677, + "num_input_tokens_seen": 244649155, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.13891602, + "step": 11340, + "time_per_iteration": 3.919074296951294 + }, + { + "auxiliary_loss_clip": 0.01126937, + "auxiliary_loss_mlp": 0.01036346, + "balance_loss_clip": 1.04922712, + "balance_loss_mlp": 1.02400756, + "epoch": 0.6818578085074403, + "flos": 22265764704000.0, + "grad_norm": 2.522207077956939, + "language_loss": 0.8359195, + "learning_rate": 9.709125403986722e-07, + "loss": 0.85755235, + "num_input_tokens_seen": 244665470, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.12347412, + "step": 11341, + "time_per_iteration": 2.439743757247925 + }, + { + "auxiliary_loss_clip": 0.01119991, + "auxiliary_loss_mlp": 0.01035151, + "balance_loss_clip": 1.04471207, + "balance_loss_mlp": 1.02151978, + "epoch": 0.6819179317601082, + "flos": 19318145800320.0, + "grad_norm": 1.8370783652420077, + "language_loss": 0.68248689, + "learning_rate": 9.705786104868531e-07, + "loss": 0.70403832, + "num_input_tokens_seen": 244684390, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.13635254, + "step": 11342, + "time_per_iteration": 2.487858295440674 + }, + { + "auxiliary_loss_clip": 0.01118683, + "auxiliary_loss_mlp": 0.0102647, + "balance_loss_clip": 1.04448164, + "balance_loss_mlp": 1.0135653, + "epoch": 0.6819780550127762, + "flos": 21104126864640.0, + "grad_norm": 1.5597548935749896, + "language_loss": 0.75032252, + "learning_rate": 9.702447196107963e-07, + "loss": 0.77177405, + "num_input_tokens_seen": 244703370, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.12908936, + "step": 11343, + "time_per_iteration": 2.4616827964782715 + }, + { + "auxiliary_loss_clip": 0.01118783, + "auxiliary_loss_mlp": 0.0103105, + "balance_loss_clip": 1.04315972, + "balance_loss_mlp": 1.01821136, + "epoch": 0.6820381782654441, + "flos": 29716403195520.0, + "grad_norm": 2.0515245398823185, + "language_loss": 0.79722089, + "learning_rate": 9.699108677831639e-07, + "loss": 0.81871927, + "num_input_tokens_seen": 244723325, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12835693, + "step": 11344, + "time_per_iteration": 2.5510926246643066 + }, + { + "auxiliary_loss_clip": 0.01117324, + "auxiliary_loss_mlp": 0.01034041, + "balance_loss_clip": 1.04286385, + "balance_loss_mlp": 1.02113676, + "epoch": 0.6820983015181121, + "flos": 29242130993280.0, + "grad_norm": 2.0445406486466298, + "language_loss": 0.66210806, + "learning_rate": 9.695770550166136e-07, + "loss": 0.6836217, + "num_input_tokens_seen": 244745650, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.12908936, + "step": 11345, + "time_per_iteration": 2.5331947803497314 + }, + { + "auxiliary_loss_clip": 0.01123365, + "auxiliary_loss_mlp": 0.01031791, + "balance_loss_clip": 1.0460999, + "balance_loss_mlp": 1.01781404, + "epoch": 0.6821584247707801, + "flos": 18871775487360.0, + "grad_norm": 2.3113235335616733, + "language_loss": 0.6540404, + "learning_rate": 9.692432813238054e-07, + "loss": 0.67559195, + "num_input_tokens_seen": 244760270, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.13970947, + "step": 11346, + "time_per_iteration": 2.4452292919158936 + }, + { + "auxiliary_loss_clip": 0.01113889, + "auxiliary_loss_mlp": 0.01031919, + "balance_loss_clip": 1.03886449, + "balance_loss_mlp": 1.01748252, + "epoch": 0.6822185480234481, + "flos": 21324582587520.0, + "grad_norm": 1.5564477429459527, + "language_loss": 0.78561878, + "learning_rate": 9.689095467173952e-07, + "loss": 0.80707693, + "num_input_tokens_seen": 244779565, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.14434814, + "step": 11347, + "time_per_iteration": 2.462632894515991 + }, + { + "auxiliary_loss_clip": 0.01061371, + "auxiliary_loss_mlp": 0.01002799, + "balance_loss_clip": 1.03582978, + "balance_loss_mlp": 1.00133753, + "epoch": 0.6822786712761161, + "flos": 63488306430720.0, + "grad_norm": 0.7417813649608489, + "language_loss": 0.52533382, + "learning_rate": 9.685758512100378e-07, + "loss": 0.54597557, + "num_input_tokens_seen": 244838480, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.01461792, + "step": 11348, + "time_per_iteration": 3.0958876609802246 + }, + { + "auxiliary_loss_clip": 0.01115092, + "auxiliary_loss_mlp": 0.01035439, + "balance_loss_clip": 1.04360795, + "balance_loss_mlp": 1.02252245, + "epoch": 0.682338794528784, + "flos": 21068934514560.0, + "grad_norm": 1.74176260215747, + "language_loss": 0.79543281, + "learning_rate": 9.682421948143873e-07, + "loss": 0.81693804, + "num_input_tokens_seen": 244855265, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.12921143, + "step": 11349, + "time_per_iteration": 2.4865541458129883 + }, + { + "auxiliary_loss_clip": 0.01127663, + "auxiliary_loss_mlp": 0.01028241, + "balance_loss_clip": 1.04820931, + "balance_loss_mlp": 1.01307762, + "epoch": 0.682398917781452, + "flos": 36283243547520.0, + "grad_norm": 1.7635303426644593, + "language_loss": 0.74023283, + "learning_rate": 9.67908577543096e-07, + "loss": 0.76179183, + "num_input_tokens_seen": 244875555, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.15161133, + "step": 11350, + "time_per_iteration": 2.5841245651245117 + }, + { + "auxiliary_loss_clip": 0.01129013, + "auxiliary_loss_mlp": 0.01031583, + "balance_loss_clip": 1.05271554, + "balance_loss_mlp": 1.01875663, + "epoch": 0.6824590410341199, + "flos": 24859197550080.0, + "grad_norm": 1.5521909789160306, + "language_loss": 0.79741734, + "learning_rate": 9.675749994088161e-07, + "loss": 0.81902331, + "num_input_tokens_seen": 244895270, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.12823486, + "step": 11351, + "time_per_iteration": 2.4940550327301025 + }, + { + "auxiliary_loss_clip": 0.01133611, + "auxiliary_loss_mlp": 0.01032817, + "balance_loss_clip": 1.0557369, + "balance_loss_mlp": 1.02053213, + "epoch": 0.6825191642867879, + "flos": 22452392793600.0, + "grad_norm": 1.9951862715219344, + "language_loss": 0.73064214, + "learning_rate": 9.672414604241954e-07, + "loss": 0.75230634, + "num_input_tokens_seen": 244914535, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.12286377, + "step": 11352, + "time_per_iteration": 2.6234166622161865 + }, + { + "auxiliary_loss_clip": 0.01127845, + "auxiliary_loss_mlp": 0.01031825, + "balance_loss_clip": 1.05023837, + "balance_loss_mlp": 1.01799703, + "epoch": 0.6825792875394558, + "flos": 29424377623680.0, + "grad_norm": 1.6313198451096245, + "language_loss": 0.80034733, + "learning_rate": 9.669079606018814e-07, + "loss": 0.821944, + "num_input_tokens_seen": 244936095, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.13830566, + "step": 11353, + "time_per_iteration": 2.512080430984497 + }, + { + "auxiliary_loss_clip": 0.01120889, + "auxiliary_loss_mlp": 0.01029624, + "balance_loss_clip": 1.04804754, + "balance_loss_mlp": 1.01745248, + "epoch": 0.6826394107921239, + "flos": 18770974945920.0, + "grad_norm": 1.7230936845621332, + "language_loss": 0.78070784, + "learning_rate": 9.665744999545218e-07, + "loss": 0.80221295, + "num_input_tokens_seen": 244955290, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.12176514, + "step": 11354, + "time_per_iteration": 3.8921520709991455 + }, + { + "auxiliary_loss_clip": 0.01117293, + "auxiliary_loss_mlp": 0.01029075, + "balance_loss_clip": 1.04456282, + "balance_loss_mlp": 1.0169102, + "epoch": 0.6826995340447918, + "flos": 16617591619200.0, + "grad_norm": 2.192832599678506, + "language_loss": 0.61728525, + "learning_rate": 9.662410784947599e-07, + "loss": 0.638749, + "num_input_tokens_seen": 244972935, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.12164307, + "step": 11355, + "time_per_iteration": 2.5003817081451416 + }, + { + "auxiliary_loss_clip": 0.01117263, + "auxiliary_loss_mlp": 0.0102613, + "balance_loss_clip": 1.04204726, + "balance_loss_mlp": 1.01370835, + "epoch": 0.6827596572974598, + "flos": 20848299223680.0, + "grad_norm": 2.1606916897772726, + "language_loss": 0.82321966, + "learning_rate": 9.659076962352398e-07, + "loss": 0.84465361, + "num_input_tokens_seen": 244989440, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.12426758, + "step": 11356, + "time_per_iteration": 2.6587719917297363 + }, + { + "auxiliary_loss_clip": 0.01126398, + "auxiliary_loss_mlp": 0.01032248, + "balance_loss_clip": 1.04945755, + "balance_loss_mlp": 1.01861036, + "epoch": 0.6828197805501277, + "flos": 22748081552640.0, + "grad_norm": 1.744067732478395, + "language_loss": 0.7881518, + "learning_rate": 9.655743531886052e-07, + "loss": 0.80973828, + "num_input_tokens_seen": 245007830, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.1362915, + "step": 11357, + "time_per_iteration": 2.530097484588623 + }, + { + "auxiliary_loss_clip": 0.01060638, + "auxiliary_loss_mlp": 0.01005893, + "balance_loss_clip": 1.03482091, + "balance_loss_mlp": 1.00449097, + "epoch": 0.6828799038027957, + "flos": 71646565829760.0, + "grad_norm": 0.8295239610728367, + "language_loss": 0.59630507, + "learning_rate": 9.65241049367493e-07, + "loss": 0.61697042, + "num_input_tokens_seen": 245070720, + "router_z_loss_clip": 0.25878906, + "router_z_loss_mlp": 0.01402283, + "step": 11358, + "time_per_iteration": 3.1451754570007324 + }, + { + "auxiliary_loss_clip": 0.01135986, + "auxiliary_loss_mlp": 0.0103696, + "balance_loss_clip": 1.05225861, + "balance_loss_mlp": 1.02332258, + "epoch": 0.6829400270554637, + "flos": 19829154637440.0, + "grad_norm": 1.7472761135890436, + "language_loss": 0.78261709, + "learning_rate": 9.64907784784544e-07, + "loss": 0.80434656, + "num_input_tokens_seen": 245089070, + "router_z_loss_clip": 0.83740234, + "router_z_loss_mlp": 0.13641357, + "step": 11359, + "time_per_iteration": 2.4701731204986572 + }, + { + "auxiliary_loss_clip": 0.01126353, + "auxiliary_loss_mlp": 0.0103257, + "balance_loss_clip": 1.05126882, + "balance_loss_mlp": 1.0196836, + "epoch": 0.6830001503081317, + "flos": 21980634543360.0, + "grad_norm": 2.038605734621755, + "language_loss": 0.81418473, + "learning_rate": 9.645745594523958e-07, + "loss": 0.83577394, + "num_input_tokens_seen": 245106500, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.12884521, + "step": 11360, + "time_per_iteration": 2.460174560546875 + }, + { + "auxiliary_loss_clip": 0.01124833, + "auxiliary_loss_mlp": 0.0103159, + "balance_loss_clip": 1.05030918, + "balance_loss_mlp": 1.01879334, + "epoch": 0.6830602735607997, + "flos": 24316767290880.0, + "grad_norm": 1.7594179071243181, + "language_loss": 0.7573536, + "learning_rate": 9.642413733836844e-07, + "loss": 0.77891779, + "num_input_tokens_seen": 245125260, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.12805176, + "step": 11361, + "time_per_iteration": 3.8760523796081543 + }, + { + "auxiliary_loss_clip": 0.0105191, + "auxiliary_loss_mlp": 0.01002209, + "balance_loss_clip": 1.02685475, + "balance_loss_mlp": 1.00085926, + "epoch": 0.6831203968134676, + "flos": 57690062323200.0, + "grad_norm": 0.876946409559204, + "language_loss": 0.59641129, + "learning_rate": 9.639082265910437e-07, + "loss": 0.61695248, + "num_input_tokens_seen": 245188730, + "router_z_loss_clip": 0.25048828, + "router_z_loss_mlp": 0.01350403, + "step": 11362, + "time_per_iteration": 3.181030035018921 + }, + { + "auxiliary_loss_clip": 0.01120284, + "auxiliary_loss_mlp": 0.0103185, + "balance_loss_clip": 1.04272747, + "balance_loss_mlp": 1.01813447, + "epoch": 0.6831805200661356, + "flos": 14388436552320.0, + "grad_norm": 2.2067119267721327, + "language_loss": 0.75534129, + "learning_rate": 9.635751190871074e-07, + "loss": 0.77686262, + "num_input_tokens_seen": 245205065, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.13708496, + "step": 11363, + "time_per_iteration": 2.399165630340576 + }, + { + "auxiliary_loss_clip": 0.011218, + "auxiliary_loss_mlp": 0.01044737, + "balance_loss_clip": 1.04461145, + "balance_loss_mlp": 1.03074777, + "epoch": 0.6832406433188035, + "flos": 22820297846400.0, + "grad_norm": 6.668827063874067, + "language_loss": 0.89778781, + "learning_rate": 9.632420508845063e-07, + "loss": 0.91945314, + "num_input_tokens_seen": 245224265, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.14001465, + "step": 11364, + "time_per_iteration": 2.445533275604248 + }, + { + "auxiliary_loss_clip": 0.01114437, + "auxiliary_loss_mlp": 0.0103335, + "balance_loss_clip": 1.04246747, + "balance_loss_mlp": 1.02107716, + "epoch": 0.6833007665714715, + "flos": 17561718650880.0, + "grad_norm": 2.0220658912032636, + "language_loss": 0.88482815, + "learning_rate": 9.629090219958697e-07, + "loss": 0.90630603, + "num_input_tokens_seen": 245243360, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.12268066, + "step": 11365, + "time_per_iteration": 2.4243130683898926 + }, + { + "auxiliary_loss_clip": 0.01126705, + "auxiliary_loss_mlp": 0.01043484, + "balance_loss_clip": 1.04698753, + "balance_loss_mlp": 1.02805853, + "epoch": 0.6833608898241395, + "flos": 22445928345600.0, + "grad_norm": 2.3723803077520125, + "language_loss": 0.81526113, + "learning_rate": 9.625760324338272e-07, + "loss": 0.83696306, + "num_input_tokens_seen": 245256350, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.15429688, + "step": 11366, + "time_per_iteration": 3.9283149242401123 + }, + { + "auxiliary_loss_clip": 0.01116903, + "auxiliary_loss_mlp": 0.01029674, + "balance_loss_clip": 1.04143965, + "balance_loss_mlp": 1.01737785, + "epoch": 0.6834210130768075, + "flos": 24534637234560.0, + "grad_norm": 1.604704214894597, + "language_loss": 0.76975197, + "learning_rate": 9.622430822110062e-07, + "loss": 0.7912178, + "num_input_tokens_seen": 245277575, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.12286377, + "step": 11367, + "time_per_iteration": 2.4920010566711426 + }, + { + "auxiliary_loss_clip": 0.01119684, + "auxiliary_loss_mlp": 0.01037646, + "balance_loss_clip": 1.04409409, + "balance_loss_mlp": 1.02443755, + "epoch": 0.6834811363294754, + "flos": 20047132321920.0, + "grad_norm": 1.5078512261428996, + "language_loss": 0.69183534, + "learning_rate": 9.619101713400312e-07, + "loss": 0.71340865, + "num_input_tokens_seen": 245296615, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.13214111, + "step": 11368, + "time_per_iteration": 2.4454519748687744 + }, + { + "auxiliary_loss_clip": 0.01119175, + "auxiliary_loss_mlp": 0.01028864, + "balance_loss_clip": 1.04590869, + "balance_loss_mlp": 1.01704478, + "epoch": 0.6835412595821434, + "flos": 24790752184320.0, + "grad_norm": 1.949109303515955, + "language_loss": 0.73548496, + "learning_rate": 9.615772998335261e-07, + "loss": 0.7569654, + "num_input_tokens_seen": 245316275, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11816406, + "step": 11369, + "time_per_iteration": 2.4764490127563477 + }, + { + "auxiliary_loss_clip": 0.01120407, + "auxiliary_loss_mlp": 0.01029613, + "balance_loss_clip": 1.04555821, + "balance_loss_mlp": 1.01664877, + "epoch": 0.6836013828348113, + "flos": 19500356517120.0, + "grad_norm": 1.9971754521120746, + "language_loss": 0.7890985, + "learning_rate": 9.612444677041138e-07, + "loss": 0.81059861, + "num_input_tokens_seen": 245334595, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.12963867, + "step": 11370, + "time_per_iteration": 2.431100606918335 + }, + { + "auxiliary_loss_clip": 0.01059468, + "auxiliary_loss_mlp": 0.01010499, + "balance_loss_clip": 1.03342628, + "balance_loss_mlp": 1.00918484, + "epoch": 0.6836615060874793, + "flos": 58363999251840.0, + "grad_norm": 0.7469651300523517, + "language_loss": 0.59798294, + "learning_rate": 9.609116749644162e-07, + "loss": 0.61868262, + "num_input_tokens_seen": 245389750, + "router_z_loss_clip": 0.26074219, + "router_z_loss_mlp": 0.01312256, + "step": 11371, + "time_per_iteration": 2.974714517593384 + }, + { + "auxiliary_loss_clip": 0.01116357, + "auxiliary_loss_mlp": 0.01026053, + "balance_loss_clip": 1.04509664, + "balance_loss_mlp": 1.01439977, + "epoch": 0.6837216293401474, + "flos": 12166895168640.0, + "grad_norm": 1.5720664995917952, + "language_loss": 0.63801634, + "learning_rate": 9.605789216270511e-07, + "loss": 0.6594404, + "num_input_tokens_seen": 245407530, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.11657715, + "step": 11372, + "time_per_iteration": 2.4141829013824463 + }, + { + "auxiliary_loss_clip": 0.01123587, + "auxiliary_loss_mlp": 0.01026608, + "balance_loss_clip": 1.04988301, + "balance_loss_mlp": 1.01348352, + "epoch": 0.6837817525928153, + "flos": 22127581082880.0, + "grad_norm": 2.5554877650411023, + "language_loss": 0.71912575, + "learning_rate": 9.602462077046375e-07, + "loss": 0.74062765, + "num_input_tokens_seen": 245427000, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.13116455, + "step": 11373, + "time_per_iteration": 2.475782871246338 + }, + { + "auxiliary_loss_clip": 0.01057911, + "auxiliary_loss_mlp": 0.01003995, + "balance_loss_clip": 1.03234863, + "balance_loss_mlp": 1.00270343, + "epoch": 0.6838418758454833, + "flos": 65005928985600.0, + "grad_norm": 1.204141415300624, + "language_loss": 0.56696749, + "learning_rate": 9.599135332097935e-07, + "loss": 0.58758652, + "num_input_tokens_seen": 245491620, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.01292419, + "step": 11374, + "time_per_iteration": 3.2303049564361572 + }, + { + "auxiliary_loss_clip": 0.01116171, + "auxiliary_loss_mlp": 0.01027262, + "balance_loss_clip": 1.04165268, + "balance_loss_mlp": 1.01433349, + "epoch": 0.6839019990981512, + "flos": 21030833162880.0, + "grad_norm": 1.551162099158348, + "language_loss": 0.73810422, + "learning_rate": 9.595808981551312e-07, + "loss": 0.75953853, + "num_input_tokens_seen": 245511285, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.1293335, + "step": 11375, + "time_per_iteration": 2.472294569015503 + }, + { + "auxiliary_loss_clip": 0.01116139, + "auxiliary_loss_mlp": 0.01035882, + "balance_loss_clip": 1.04352474, + "balance_loss_mlp": 1.02218473, + "epoch": 0.6839621223508192, + "flos": 24935543907840.0, + "grad_norm": 1.7592357297228645, + "language_loss": 0.70797807, + "learning_rate": 9.592483025532651e-07, + "loss": 0.72949827, + "num_input_tokens_seen": 245532910, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.13696289, + "step": 11376, + "time_per_iteration": 2.4815540313720703 + }, + { + "auxiliary_loss_clip": 0.0111795, + "auxiliary_loss_mlp": 0.01037986, + "balance_loss_clip": 1.04059982, + "balance_loss_mlp": 1.02388906, + "epoch": 0.6840222456034871, + "flos": 26358827391360.0, + "grad_norm": 2.1622425791504307, + "language_loss": 0.74224478, + "learning_rate": 9.58915746416808e-07, + "loss": 0.76380414, + "num_input_tokens_seen": 245550540, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.14086914, + "step": 11377, + "time_per_iteration": 2.5081183910369873 + }, + { + "auxiliary_loss_clip": 0.01050627, + "auxiliary_loss_mlp": 0.01006089, + "balance_loss_clip": 1.02546358, + "balance_loss_mlp": 1.00448823, + "epoch": 0.6840823688561551, + "flos": 65988336936960.0, + "grad_norm": 0.7277933876590432, + "language_loss": 0.56830478, + "learning_rate": 9.585832297583707e-07, + "loss": 0.58887196, + "num_input_tokens_seen": 245619570, + "router_z_loss_clip": 0.25097656, + "router_z_loss_mlp": 0.01597595, + "step": 11378, + "time_per_iteration": 3.163193464279175 + }, + { + "auxiliary_loss_clip": 0.01122689, + "auxiliary_loss_mlp": 0.01033393, + "balance_loss_clip": 1.04761958, + "balance_loss_mlp": 1.02005923, + "epoch": 0.684142492108823, + "flos": 21397588980480.0, + "grad_norm": 1.7179111146129196, + "language_loss": 0.78439796, + "learning_rate": 9.58250752590561e-07, + "loss": 0.80595881, + "num_input_tokens_seen": 245637980, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.13323975, + "step": 11379, + "time_per_iteration": 2.5335850715637207 + }, + { + "auxiliary_loss_clip": 0.01112966, + "auxiliary_loss_mlp": 0.01025962, + "balance_loss_clip": 1.04313433, + "balance_loss_mlp": 1.01476216, + "epoch": 0.6842026153614911, + "flos": 18801426700800.0, + "grad_norm": 3.6389251570732295, + "language_loss": 0.69422394, + "learning_rate": 9.57918314925988e-07, + "loss": 0.71561325, + "num_input_tokens_seen": 245655690, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.11193848, + "step": 11380, + "time_per_iteration": 2.4451282024383545 + }, + { + "auxiliary_loss_clip": 0.01120224, + "auxiliary_loss_mlp": 0.01032032, + "balance_loss_clip": 1.0463382, + "balance_loss_mlp": 1.0192765, + "epoch": 0.684262738614159, + "flos": 19646405216640.0, + "grad_norm": 3.0340492590394925, + "language_loss": 0.78281116, + "learning_rate": 9.575859167772568e-07, + "loss": 0.80433369, + "num_input_tokens_seen": 245671525, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.12762451, + "step": 11381, + "time_per_iteration": 2.461242198944092 + }, + { + "auxiliary_loss_clip": 0.01062566, + "auxiliary_loss_mlp": 0.01004014, + "balance_loss_clip": 1.03487217, + "balance_loss_mlp": 1.00231934, + "epoch": 0.684322861866827, + "flos": 62354462739840.0, + "grad_norm": 0.8693116159953008, + "language_loss": 0.67198139, + "learning_rate": 9.572535581569713e-07, + "loss": 0.69264722, + "num_input_tokens_seen": 245724115, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.01695251, + "step": 11382, + "time_per_iteration": 2.929171085357666 + }, + { + "auxiliary_loss_clip": 0.0106148, + "auxiliary_loss_mlp": 0.01004129, + "balance_loss_clip": 1.03683662, + "balance_loss_mlp": 1.00271297, + "epoch": 0.6843829851194949, + "flos": 65805048812160.0, + "grad_norm": 0.8237842345512106, + "language_loss": 0.58138078, + "learning_rate": 9.569212390777356e-07, + "loss": 0.60203683, + "num_input_tokens_seen": 245789245, + "router_z_loss_clip": 0.24658203, + "router_z_loss_mlp": 0.01416016, + "step": 11383, + "time_per_iteration": 4.578449726104736 + }, + { + "auxiliary_loss_clip": 0.01117875, + "auxiliary_loss_mlp": 0.0102525, + "balance_loss_clip": 1.04395926, + "balance_loss_mlp": 1.01363945, + "epoch": 0.6844431083721629, + "flos": 27855153181440.0, + "grad_norm": 3.447150373671983, + "language_loss": 0.79810357, + "learning_rate": 9.565889595521517e-07, + "loss": 0.81953484, + "num_input_tokens_seen": 245812420, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11608887, + "step": 11384, + "time_per_iteration": 2.5508179664611816 + }, + { + "auxiliary_loss_clip": 0.01122023, + "auxiliary_loss_mlp": 0.01035014, + "balance_loss_clip": 1.04419076, + "balance_loss_mlp": 1.02221084, + "epoch": 0.684503231624831, + "flos": 18255010032000.0, + "grad_norm": 1.8095705645527849, + "language_loss": 0.77158332, + "learning_rate": 9.562567195928187e-07, + "loss": 0.79315364, + "num_input_tokens_seen": 245829135, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.12811279, + "step": 11385, + "time_per_iteration": 2.4700605869293213 + }, + { + "auxiliary_loss_clip": 0.01129323, + "auxiliary_loss_mlp": 0.01037011, + "balance_loss_clip": 1.04681969, + "balance_loss_mlp": 1.02267647, + "epoch": 0.6845633548774989, + "flos": 17639681120640.0, + "grad_norm": 2.1366180949947022, + "language_loss": 0.84661233, + "learning_rate": 9.55924519212335e-07, + "loss": 0.86827564, + "num_input_tokens_seen": 245847140, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.14337158, + "step": 11386, + "time_per_iteration": 2.4110777378082275 + }, + { + "auxiliary_loss_clip": 0.01120961, + "auxiliary_loss_mlp": 0.01037487, + "balance_loss_clip": 1.04616463, + "balance_loss_mlp": 1.02557182, + "epoch": 0.6846234781301669, + "flos": 20807576179200.0, + "grad_norm": 2.30820851184979, + "language_loss": 0.83234167, + "learning_rate": 9.555923584232984e-07, + "loss": 0.85392618, + "num_input_tokens_seen": 245862855, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.11907959, + "step": 11387, + "time_per_iteration": 2.4276773929595947 + }, + { + "auxiliary_loss_clip": 0.01122312, + "auxiliary_loss_mlp": 0.01028721, + "balance_loss_clip": 1.04937291, + "balance_loss_mlp": 1.01625752, + "epoch": 0.6846836013828348, + "flos": 36101176485120.0, + "grad_norm": 1.8576320078240645, + "language_loss": 0.72576141, + "learning_rate": 9.552602372383047e-07, + "loss": 0.74727178, + "num_input_tokens_seen": 245885415, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.12469482, + "step": 11388, + "time_per_iteration": 2.5729455947875977 + }, + { + "auxiliary_loss_clip": 0.01121898, + "auxiliary_loss_mlp": 0.01028449, + "balance_loss_clip": 1.04728842, + "balance_loss_mlp": 1.01685047, + "epoch": 0.6847437246355028, + "flos": 43142468607360.0, + "grad_norm": 1.8576659186208289, + "language_loss": 0.62697828, + "learning_rate": 9.549281556699469e-07, + "loss": 0.64848173, + "num_input_tokens_seen": 245906285, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11578369, + "step": 11389, + "time_per_iteration": 2.625556230545044 + }, + { + "auxiliary_loss_clip": 0.01055297, + "auxiliary_loss_mlp": 0.01003436, + "balance_loss_clip": 1.02804399, + "balance_loss_mlp": 1.00189543, + "epoch": 0.6848038478881707, + "flos": 71663729552640.0, + "grad_norm": 0.7461927879364421, + "language_loss": 0.56002736, + "learning_rate": 9.54596113730818e-07, + "loss": 0.58061475, + "num_input_tokens_seen": 245967620, + "router_z_loss_clip": 0.27246094, + "router_z_loss_mlp": 0.01541138, + "step": 11390, + "time_per_iteration": 3.1469407081604004 + }, + { + "auxiliary_loss_clip": 0.01120159, + "auxiliary_loss_mlp": 0.01030516, + "balance_loss_clip": 1.04651546, + "balance_loss_mlp": 1.01821983, + "epoch": 0.6848639711408387, + "flos": 19937820257280.0, + "grad_norm": 1.9278126872335772, + "language_loss": 0.87990689, + "learning_rate": 9.542641114335109e-07, + "loss": 0.90141368, + "num_input_tokens_seen": 245985075, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.12298584, + "step": 11391, + "time_per_iteration": 2.4449198246002197 + }, + { + "auxiliary_loss_clip": 0.01122893, + "auxiliary_loss_mlp": 0.01030364, + "balance_loss_clip": 1.04722369, + "balance_loss_mlp": 1.01799619, + "epoch": 0.6849240943935067, + "flos": 26867501844480.0, + "grad_norm": 2.126017781342551, + "language_loss": 0.79187715, + "learning_rate": 9.539321487906117e-07, + "loss": 0.81340969, + "num_input_tokens_seen": 246003560, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12371826, + "step": 11392, + "time_per_iteration": 2.5030155181884766 + }, + { + "auxiliary_loss_clip": 0.01116677, + "auxiliary_loss_mlp": 0.010305, + "balance_loss_clip": 1.04286659, + "balance_loss_mlp": 1.01799464, + "epoch": 0.6849842176461747, + "flos": 13735365425280.0, + "grad_norm": 3.4857143588982655, + "language_loss": 0.70756394, + "learning_rate": 9.536002258147104e-07, + "loss": 0.72903574, + "num_input_tokens_seen": 246019600, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.12524414, + "step": 11393, + "time_per_iteration": 2.4523940086364746 + }, + { + "auxiliary_loss_clip": 0.01123176, + "auxiliary_loss_mlp": 0.01031416, + "balance_loss_clip": 1.04662538, + "balance_loss_mlp": 1.01780188, + "epoch": 0.6850443408988426, + "flos": 24973070641920.0, + "grad_norm": 1.650171832944779, + "language_loss": 0.65137172, + "learning_rate": 9.532683425183936e-07, + "loss": 0.67291766, + "num_input_tokens_seen": 246038920, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.13616943, + "step": 11394, + "time_per_iteration": 2.5239713191986084 + }, + { + "auxiliary_loss_clip": 0.01126293, + "auxiliary_loss_mlp": 0.01035427, + "balance_loss_clip": 1.04794288, + "balance_loss_mlp": 1.02238512, + "epoch": 0.6851044641515106, + "flos": 27744225004800.0, + "grad_norm": 1.70359026606668, + "language_loss": 0.8084709, + "learning_rate": 9.529364989142468e-07, + "loss": 0.83008814, + "num_input_tokens_seen": 246060490, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.13031006, + "step": 11395, + "time_per_iteration": 2.5459699630737305 + }, + { + "auxiliary_loss_clip": 0.0112662, + "auxiliary_loss_mlp": 0.01034609, + "balance_loss_clip": 1.04950309, + "balance_loss_mlp": 1.02031589, + "epoch": 0.6851645874041785, + "flos": 24351061800960.0, + "grad_norm": 1.6451374024870005, + "language_loss": 0.73189509, + "learning_rate": 9.526046950148527e-07, + "loss": 0.75350738, + "num_input_tokens_seen": 246081465, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.1428833, + "step": 11396, + "time_per_iteration": 2.5320780277252197 + }, + { + "auxiliary_loss_clip": 0.01127005, + "auxiliary_loss_mlp": 0.01028199, + "balance_loss_clip": 1.05026555, + "balance_loss_mlp": 1.01453781, + "epoch": 0.6852247106568465, + "flos": 15077849264640.0, + "grad_norm": 2.223884112791336, + "language_loss": 0.79040694, + "learning_rate": 9.522729308327931e-07, + "loss": 0.81195897, + "num_input_tokens_seen": 246096110, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.13659668, + "step": 11397, + "time_per_iteration": 2.486215591430664 + }, + { + "auxiliary_loss_clip": 0.01123982, + "auxiliary_loss_mlp": 0.01029346, + "balance_loss_clip": 1.04779708, + "balance_loss_mlp": 1.01625729, + "epoch": 0.6852848339095146, + "flos": 18770005278720.0, + "grad_norm": 1.8317752066392243, + "language_loss": 0.71538138, + "learning_rate": 9.519412063806493e-07, + "loss": 0.73691463, + "num_input_tokens_seen": 246114785, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.1307373, + "step": 11398, + "time_per_iteration": 3.9085445404052734 + }, + { + "auxiliary_loss_clip": 0.01126803, + "auxiliary_loss_mlp": 0.01030125, + "balance_loss_clip": 1.05299973, + "balance_loss_mlp": 1.01834166, + "epoch": 0.6853449571621825, + "flos": 27854363082240.0, + "grad_norm": 1.7733124458027916, + "language_loss": 0.70918941, + "learning_rate": 9.516095216709996e-07, + "loss": 0.73075873, + "num_input_tokens_seen": 246136375, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11785889, + "step": 11399, + "time_per_iteration": 2.5029165744781494 + }, + { + "auxiliary_loss_clip": 0.01120782, + "auxiliary_loss_mlp": 0.01029986, + "balance_loss_clip": 1.04663873, + "balance_loss_mlp": 1.01782084, + "epoch": 0.6854050804148505, + "flos": 18150510389760.0, + "grad_norm": 2.3115605185884993, + "language_loss": 0.70294428, + "learning_rate": 9.512778767164217e-07, + "loss": 0.72445196, + "num_input_tokens_seen": 246155090, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12176514, + "step": 11400, + "time_per_iteration": 2.421994686126709 + }, + { + "auxiliary_loss_clip": 0.01126833, + "auxiliary_loss_mlp": 0.01036219, + "balance_loss_clip": 1.04581046, + "balance_loss_mlp": 1.02096009, + "epoch": 0.6854652036675184, + "flos": 16326212492160.0, + "grad_norm": 1.7759308626093389, + "language_loss": 0.78158152, + "learning_rate": 9.509462715294927e-07, + "loss": 0.80321205, + "num_input_tokens_seen": 246172645, + "router_z_loss_clip": 0.80957031, + "router_z_loss_mlp": 0.15258789, + "step": 11401, + "time_per_iteration": 2.4113080501556396 + }, + { + "auxiliary_loss_clip": 0.0112128, + "auxiliary_loss_mlp": 0.01035303, + "balance_loss_clip": 1.046453, + "balance_loss_mlp": 1.02322125, + "epoch": 0.6855253269201864, + "flos": 14940814878720.0, + "grad_norm": 2.178229054334368, + "language_loss": 0.75251001, + "learning_rate": 9.50614706122786e-07, + "loss": 0.77407587, + "num_input_tokens_seen": 246189055, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.12103271, + "step": 11402, + "time_per_iteration": 2.445033311843872 + }, + { + "auxiliary_loss_clip": 0.01122995, + "auxiliary_loss_mlp": 0.01033207, + "balance_loss_clip": 1.04678798, + "balance_loss_mlp": 1.01956332, + "epoch": 0.6855854501728543, + "flos": 23037736826880.0, + "grad_norm": 3.500080529506085, + "language_loss": 0.72812343, + "learning_rate": 9.502831805088742e-07, + "loss": 0.74968553, + "num_input_tokens_seen": 246207990, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.13653564, + "step": 11403, + "time_per_iteration": 2.484949827194214 + }, + { + "auxiliary_loss_clip": 0.01123011, + "auxiliary_loss_mlp": 0.0102787, + "balance_loss_clip": 1.05054379, + "balance_loss_mlp": 1.01568723, + "epoch": 0.6856455734255223, + "flos": 13253623194240.0, + "grad_norm": 2.035804972568989, + "language_loss": 0.81524301, + "learning_rate": 9.499516947003294e-07, + "loss": 0.83675182, + "num_input_tokens_seen": 246221595, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.12176514, + "step": 11404, + "time_per_iteration": 3.9384984970092773 + }, + { + "auxiliary_loss_clip": 0.0112438, + "auxiliary_loss_mlp": 0.01032169, + "balance_loss_clip": 1.05104935, + "balance_loss_mlp": 1.02002203, + "epoch": 0.6857056966781903, + "flos": 23333461499520.0, + "grad_norm": 1.3959399037058964, + "language_loss": 0.78035688, + "learning_rate": 9.496202487097222e-07, + "loss": 0.80192244, + "num_input_tokens_seen": 246242970, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.12158203, + "step": 11405, + "time_per_iteration": 2.5119593143463135 + }, + { + "auxiliary_loss_clip": 0.01044306, + "auxiliary_loss_mlp": 0.01004529, + "balance_loss_clip": 1.01928914, + "balance_loss_mlp": 1.003124, + "epoch": 0.6857658199308583, + "flos": 61852647784320.0, + "grad_norm": 1.0109448130373615, + "language_loss": 0.60910404, + "learning_rate": 9.492888425496199e-07, + "loss": 0.62959242, + "num_input_tokens_seen": 246300405, + "router_z_loss_clip": 0.25024414, + "router_z_loss_mlp": 0.0140686, + "step": 11406, + "time_per_iteration": 3.128206491470337 + }, + { + "auxiliary_loss_clip": 0.01111935, + "auxiliary_loss_mlp": 0.01033866, + "balance_loss_clip": 1.03836274, + "balance_loss_mlp": 1.01953673, + "epoch": 0.6858259431835262, + "flos": 16654543735680.0, + "grad_norm": 1.7510366981184908, + "language_loss": 0.77129054, + "learning_rate": 9.489574762325907e-07, + "loss": 0.79274857, + "num_input_tokens_seen": 246318780, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.14324951, + "step": 11407, + "time_per_iteration": 2.483386754989624 + }, + { + "auxiliary_loss_clip": 0.01125466, + "auxiliary_loss_mlp": 0.01039478, + "balance_loss_clip": 1.04744184, + "balance_loss_mlp": 1.02455258, + "epoch": 0.6858860664361942, + "flos": 21872974504320.0, + "grad_norm": 3.687731549954806, + "language_loss": 0.71002507, + "learning_rate": 9.486261497711991e-07, + "loss": 0.73167449, + "num_input_tokens_seen": 246339405, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.14929199, + "step": 11408, + "time_per_iteration": 2.6292428970336914 + }, + { + "auxiliary_loss_clip": 0.01123487, + "auxiliary_loss_mlp": 0.0103022, + "balance_loss_clip": 1.04458201, + "balance_loss_mlp": 1.01635623, + "epoch": 0.6859461896888621, + "flos": 15267637751040.0, + "grad_norm": 1.8936825635446424, + "language_loss": 0.7054534, + "learning_rate": 9.482948631780087e-07, + "loss": 0.72699052, + "num_input_tokens_seen": 246357055, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.13867188, + "step": 11409, + "time_per_iteration": 2.46305251121521 + }, + { + "auxiliary_loss_clip": 0.01118955, + "auxiliary_loss_mlp": 0.01026635, + "balance_loss_clip": 1.04740548, + "balance_loss_mlp": 1.01474988, + "epoch": 0.6860063129415301, + "flos": 18620293392000.0, + "grad_norm": 1.5760326241491749, + "language_loss": 0.78122503, + "learning_rate": 9.479636164655825e-07, + "loss": 0.80268091, + "num_input_tokens_seen": 246374050, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11889648, + "step": 11410, + "time_per_iteration": 2.438206911087036 + }, + { + "auxiliary_loss_clip": 0.01125562, + "auxiliary_loss_mlp": 0.01036687, + "balance_loss_clip": 1.04651308, + "balance_loss_mlp": 1.02321696, + "epoch": 0.6860664361941982, + "flos": 23951376190080.0, + "grad_norm": 1.9814723076014737, + "language_loss": 0.71584278, + "learning_rate": 9.476324096464821e-07, + "loss": 0.73746526, + "num_input_tokens_seen": 246392910, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.13476562, + "step": 11411, + "time_per_iteration": 3.8261349201202393 + }, + { + "auxiliary_loss_clip": 0.01118879, + "auxiliary_loss_mlp": 0.01029291, + "balance_loss_clip": 1.04429054, + "balance_loss_mlp": 1.01605833, + "epoch": 0.6861265594468661, + "flos": 20407782827520.0, + "grad_norm": 2.4201774211714695, + "language_loss": 0.70079571, + "learning_rate": 9.473012427332654e-07, + "loss": 0.7222774, + "num_input_tokens_seen": 246411540, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.13232422, + "step": 11412, + "time_per_iteration": 2.429727792739868 + }, + { + "auxiliary_loss_clip": 0.01120066, + "auxiliary_loss_mlp": 0.01028843, + "balance_loss_clip": 1.04591954, + "balance_loss_mlp": 1.01606345, + "epoch": 0.6861866826995341, + "flos": 11428571111040.0, + "grad_norm": 3.3278476952560885, + "language_loss": 0.72063363, + "learning_rate": 9.469701157384919e-07, + "loss": 0.74212265, + "num_input_tokens_seen": 246423295, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.12780762, + "step": 11413, + "time_per_iteration": 2.3518996238708496 + }, + { + "auxiliary_loss_clip": 0.01121998, + "auxiliary_loss_mlp": 0.01031792, + "balance_loss_clip": 1.04686236, + "balance_loss_mlp": 1.01987743, + "epoch": 0.686246805952202, + "flos": 15997593939840.0, + "grad_norm": 2.03487542329357, + "language_loss": 0.73922753, + "learning_rate": 9.466390286747164e-07, + "loss": 0.76076543, + "num_input_tokens_seen": 246441045, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.11907959, + "step": 11414, + "time_per_iteration": 2.468780040740967 + }, + { + "auxiliary_loss_clip": 0.01119829, + "auxiliary_loss_mlp": 0.0102982, + "balance_loss_clip": 1.04407835, + "balance_loss_mlp": 1.016469, + "epoch": 0.68630692920487, + "flos": 19826712512640.0, + "grad_norm": 2.0727842453748964, + "language_loss": 0.86959112, + "learning_rate": 9.46307981554495e-07, + "loss": 0.89108759, + "num_input_tokens_seen": 246456905, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.13330078, + "step": 11415, + "time_per_iteration": 2.434889554977417 + }, + { + "auxiliary_loss_clip": 0.01123965, + "auxiliary_loss_mlp": 0.01041108, + "balance_loss_clip": 1.04533386, + "balance_loss_mlp": 1.02658808, + "epoch": 0.6863670524575379, + "flos": 26286216048000.0, + "grad_norm": 1.7274407185586798, + "language_loss": 0.66879565, + "learning_rate": 9.459769743903801e-07, + "loss": 0.69044638, + "num_input_tokens_seen": 246477545, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.14508057, + "step": 11416, + "time_per_iteration": 2.522786855697632 + }, + { + "auxiliary_loss_clip": 0.01121849, + "auxiliary_loss_mlp": 0.01030804, + "balance_loss_clip": 1.0468955, + "balance_loss_mlp": 1.01902604, + "epoch": 0.686427175710206, + "flos": 19173138595200.0, + "grad_norm": 1.4676486155599446, + "language_loss": 0.76128983, + "learning_rate": 9.456460071949237e-07, + "loss": 0.78281641, + "num_input_tokens_seen": 246496705, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.11779785, + "step": 11417, + "time_per_iteration": 2.4331185817718506 + }, + { + "auxiliary_loss_clip": 0.01123449, + "auxiliary_loss_mlp": 0.01030086, + "balance_loss_clip": 1.05014253, + "balance_loss_mlp": 1.01822519, + "epoch": 0.6864872989628739, + "flos": 18916628595840.0, + "grad_norm": 2.1227932522799358, + "language_loss": 0.77522218, + "learning_rate": 9.45315079980678e-07, + "loss": 0.79675752, + "num_input_tokens_seen": 246514860, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11871338, + "step": 11418, + "time_per_iteration": 2.4483232498168945 + }, + { + "auxiliary_loss_clip": 0.01115812, + "auxiliary_loss_mlp": 0.01027981, + "balance_loss_clip": 1.04292452, + "balance_loss_mlp": 1.01611352, + "epoch": 0.6865474222155419, + "flos": 25956196865280.0, + "grad_norm": 1.682840428626417, + "language_loss": 0.76448131, + "learning_rate": 9.449841927601887e-07, + "loss": 0.78591925, + "num_input_tokens_seen": 246536145, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11865234, + "step": 11419, + "time_per_iteration": 2.49558424949646 + }, + { + "auxiliary_loss_clip": 0.01115815, + "auxiliary_loss_mlp": 0.01032754, + "balance_loss_clip": 1.04234767, + "balance_loss_mlp": 1.02095199, + "epoch": 0.6866075454682098, + "flos": 18478087447680.0, + "grad_norm": 1.901914191291699, + "language_loss": 0.71545452, + "learning_rate": 9.446533455460044e-07, + "loss": 0.73694021, + "num_input_tokens_seen": 246553265, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11804199, + "step": 11420, + "time_per_iteration": 2.501584053039551 + }, + { + "auxiliary_loss_clip": 0.01114823, + "auxiliary_loss_mlp": 0.01024449, + "balance_loss_clip": 1.04039359, + "balance_loss_mlp": 1.01310039, + "epoch": 0.6866676687208778, + "flos": 34239998298240.0, + "grad_norm": 1.4132407273579293, + "language_loss": 0.74630427, + "learning_rate": 9.443225383506712e-07, + "loss": 0.76769698, + "num_input_tokens_seen": 246575130, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.11358643, + "step": 11421, + "time_per_iteration": 2.550863027572632 + }, + { + "auxiliary_loss_clip": 0.01116757, + "auxiliary_loss_mlp": 0.01031591, + "balance_loss_clip": 1.04363108, + "balance_loss_mlp": 1.01967025, + "epoch": 0.6867277919735457, + "flos": 21721754246400.0, + "grad_norm": 1.8313147531694538, + "language_loss": 0.76931101, + "learning_rate": 9.439917711867338e-07, + "loss": 0.79079449, + "num_input_tokens_seen": 246593095, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.1192627, + "step": 11422, + "time_per_iteration": 2.517610549926758 + }, + { + "auxiliary_loss_clip": 0.01118012, + "auxiliary_loss_mlp": 0.01036164, + "balance_loss_clip": 1.04306376, + "balance_loss_mlp": 1.02295005, + "epoch": 0.6867879152262137, + "flos": 24097999507200.0, + "grad_norm": 1.7242132307288205, + "language_loss": 0.7722947, + "learning_rate": 9.436610440667334e-07, + "loss": 0.79383647, + "num_input_tokens_seen": 246612165, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.13214111, + "step": 11423, + "time_per_iteration": 2.5136404037475586 + }, + { + "auxiliary_loss_clip": 0.01120209, + "auxiliary_loss_mlp": 0.01029172, + "balance_loss_clip": 1.04564977, + "balance_loss_mlp": 1.01669037, + "epoch": 0.6868480384788818, + "flos": 21615818060160.0, + "grad_norm": 1.6087681483583212, + "language_loss": 0.7289148, + "learning_rate": 9.433303570032129e-07, + "loss": 0.75040859, + "num_input_tokens_seen": 246632065, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.12493896, + "step": 11424, + "time_per_iteration": 2.4622952938079834 + }, + { + "auxiliary_loss_clip": 0.0112278, + "auxiliary_loss_mlp": 0.01027967, + "balance_loss_clip": 1.04572082, + "balance_loss_mlp": 1.01550949, + "epoch": 0.6869081617315497, + "flos": 26286144220800.0, + "grad_norm": 1.7668098403454693, + "language_loss": 0.650352, + "learning_rate": 9.429997100087112e-07, + "loss": 0.67185944, + "num_input_tokens_seen": 246651245, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.12463379, + "step": 11425, + "time_per_iteration": 2.477053165435791 + }, + { + "auxiliary_loss_clip": 0.01114956, + "auxiliary_loss_mlp": 0.01028218, + "balance_loss_clip": 1.04262948, + "balance_loss_mlp": 1.01635695, + "epoch": 0.6869682849842177, + "flos": 21105096531840.0, + "grad_norm": 1.7303090733467639, + "language_loss": 0.72081649, + "learning_rate": 9.426691030957657e-07, + "loss": 0.7422483, + "num_input_tokens_seen": 246672225, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11865234, + "step": 11426, + "time_per_iteration": 2.4949777126312256 + }, + { + "auxiliary_loss_clip": 0.01117418, + "auxiliary_loss_mlp": 0.01026619, + "balance_loss_clip": 1.04475093, + "balance_loss_mlp": 1.01485896, + "epoch": 0.6870284082368856, + "flos": 17092653920640.0, + "grad_norm": 2.2662126810169387, + "language_loss": 0.85013723, + "learning_rate": 9.423385362769136e-07, + "loss": 0.87157762, + "num_input_tokens_seen": 246688385, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11767578, + "step": 11427, + "time_per_iteration": 2.4037256240844727 + }, + { + "auxiliary_loss_clip": 0.01112371, + "auxiliary_loss_mlp": 0.01033322, + "balance_loss_clip": 1.04119182, + "balance_loss_mlp": 1.02030468, + "epoch": 0.6870885314895536, + "flos": 27308090067840.0, + "grad_norm": 1.4391098038976053, + "language_loss": 0.75984681, + "learning_rate": 9.420080095646909e-07, + "loss": 0.78130376, + "num_input_tokens_seen": 246710730, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.13012695, + "step": 11428, + "time_per_iteration": 3.9798965454101562 + }, + { + "auxiliary_loss_clip": 0.01119858, + "auxiliary_loss_mlp": 0.01034352, + "balance_loss_clip": 1.04254174, + "balance_loss_mlp": 1.0209651, + "epoch": 0.6871486547422215, + "flos": 20814543417600.0, + "grad_norm": 1.8227218367587348, + "language_loss": 0.72978735, + "learning_rate": 9.4167752297163e-07, + "loss": 0.75132942, + "num_input_tokens_seen": 246730350, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.13378906, + "step": 11429, + "time_per_iteration": 2.4849579334259033 + }, + { + "auxiliary_loss_clip": 0.01121444, + "auxiliary_loss_mlp": 0.01028336, + "balance_loss_clip": 1.04635072, + "balance_loss_mlp": 1.01571739, + "epoch": 0.6872087779948896, + "flos": 30154118330880.0, + "grad_norm": 1.9459578057291915, + "language_loss": 0.83012199, + "learning_rate": 9.413470765102643e-07, + "loss": 0.85161978, + "num_input_tokens_seen": 246751700, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.12615967, + "step": 11430, + "time_per_iteration": 2.521958827972412 + }, + { + "auxiliary_loss_clip": 0.01117803, + "auxiliary_loss_mlp": 0.01031939, + "balance_loss_clip": 1.04348171, + "balance_loss_mlp": 1.01955342, + "epoch": 0.6872689012475575, + "flos": 20704584908160.0, + "grad_norm": 2.669290757678227, + "language_loss": 0.70502234, + "learning_rate": 9.410166701931225e-07, + "loss": 0.72651982, + "num_input_tokens_seen": 246769860, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12390137, + "step": 11431, + "time_per_iteration": 2.4698495864868164 + }, + { + "auxiliary_loss_clip": 0.01126033, + "auxiliary_loss_mlp": 0.01033489, + "balance_loss_clip": 1.04653907, + "balance_loss_mlp": 1.02083516, + "epoch": 0.6873290245002255, + "flos": 25520852027520.0, + "grad_norm": 2.0381479977254084, + "language_loss": 0.80186075, + "learning_rate": 9.406863040327355e-07, + "loss": 0.82345593, + "num_input_tokens_seen": 246789905, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.12640381, + "step": 11432, + "time_per_iteration": 2.4945900440216064 + }, + { + "auxiliary_loss_clip": 0.01112507, + "auxiliary_loss_mlp": 0.01027932, + "balance_loss_clip": 1.04290247, + "balance_loss_mlp": 1.01629126, + "epoch": 0.6873891477528934, + "flos": 25191479289600.0, + "grad_norm": 2.9643216322252406, + "language_loss": 0.68085825, + "learning_rate": 9.403559780416295e-07, + "loss": 0.70226264, + "num_input_tokens_seen": 246808815, + "router_z_loss_clip": 0.69580078, + "router_z_loss_mlp": 0.11639404, + "step": 11433, + "time_per_iteration": 2.4714505672454834 + }, + { + "auxiliary_loss_clip": 0.01122294, + "auxiliary_loss_mlp": 0.01039703, + "balance_loss_clip": 1.04854953, + "balance_loss_mlp": 1.02703106, + "epoch": 0.6874492710055614, + "flos": 35152380685440.0, + "grad_norm": 2.081246613982838, + "language_loss": 0.72568208, + "learning_rate": 9.400256922323309e-07, + "loss": 0.74730206, + "num_input_tokens_seen": 246829775, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.12670898, + "step": 11434, + "time_per_iteration": 2.5606601238250732 + }, + { + "auxiliary_loss_clip": 0.01124059, + "auxiliary_loss_mlp": 0.0102738, + "balance_loss_clip": 1.04883242, + "balance_loss_mlp": 1.01459491, + "epoch": 0.6875093942582293, + "flos": 17822215059840.0, + "grad_norm": 1.5532945361553818, + "language_loss": 0.80880666, + "learning_rate": 9.396954466173657e-07, + "loss": 0.83032107, + "num_input_tokens_seen": 246848045, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.12792969, + "step": 11435, + "time_per_iteration": 2.4508543014526367 + }, + { + "auxiliary_loss_clip": 0.01121503, + "auxiliary_loss_mlp": 0.01034252, + "balance_loss_clip": 1.04592621, + "balance_loss_mlp": 1.02126443, + "epoch": 0.6875695175108973, + "flos": 20704548994560.0, + "grad_norm": 2.4269831853029493, + "language_loss": 0.80730581, + "learning_rate": 9.393652412092538e-07, + "loss": 0.82886338, + "num_input_tokens_seen": 246866095, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.13000488, + "step": 11436, + "time_per_iteration": 2.4762227535247803 + }, + { + "auxiliary_loss_clip": 0.0111408, + "auxiliary_loss_mlp": 0.01027083, + "balance_loss_clip": 1.04531097, + "balance_loss_mlp": 1.01667035, + "epoch": 0.6876296407635654, + "flos": 25374013228800.0, + "grad_norm": 1.7388843771778917, + "language_loss": 0.81662625, + "learning_rate": 9.390350760205183e-07, + "loss": 0.83803785, + "num_input_tokens_seen": 246883975, + "router_z_loss_clip": 0.68701172, + "router_z_loss_mlp": 0.10412598, + "step": 11437, + "time_per_iteration": 2.5335347652435303 + }, + { + "auxiliary_loss_clip": 0.01121308, + "auxiliary_loss_mlp": 0.01033157, + "balance_loss_clip": 1.04214358, + "balance_loss_mlp": 1.02006745, + "epoch": 0.6876897640162333, + "flos": 23222317841280.0, + "grad_norm": 6.182645857751973, + "language_loss": 0.78481919, + "learning_rate": 9.387049510636793e-07, + "loss": 0.80636382, + "num_input_tokens_seen": 246901560, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.13092041, + "step": 11438, + "time_per_iteration": 2.4838814735412598 + }, + { + "auxiliary_loss_clip": 0.01104463, + "auxiliary_loss_mlp": 0.01027639, + "balance_loss_clip": 1.03615522, + "balance_loss_mlp": 1.01576018, + "epoch": 0.6877498872689013, + "flos": 27124335066240.0, + "grad_norm": 1.9108657319881166, + "language_loss": 0.72275931, + "learning_rate": 9.383748663512554e-07, + "loss": 0.74408036, + "num_input_tokens_seen": 246922655, + "router_z_loss_clip": 0.68310547, + "router_z_loss_mlp": 0.11877441, + "step": 11439, + "time_per_iteration": 2.6036133766174316 + }, + { + "auxiliary_loss_clip": 0.01110824, + "auxiliary_loss_mlp": 0.01025605, + "balance_loss_clip": 1.03909254, + "balance_loss_mlp": 1.01383948, + "epoch": 0.6878100105215692, + "flos": 11581658876160.0, + "grad_norm": 2.063537315086849, + "language_loss": 0.75347966, + "learning_rate": 9.380448218957623e-07, + "loss": 0.77484393, + "num_input_tokens_seen": 246940100, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11767578, + "step": 11440, + "time_per_iteration": 2.4267544746398926 + }, + { + "auxiliary_loss_clip": 0.01120643, + "auxiliary_loss_mlp": 0.01031883, + "balance_loss_clip": 1.04742813, + "balance_loss_mlp": 1.01975965, + "epoch": 0.6878701337742372, + "flos": 20303175444480.0, + "grad_norm": 1.642329383829276, + "language_loss": 0.7218585, + "learning_rate": 9.377148177097167e-07, + "loss": 0.74338371, + "num_input_tokens_seen": 246958545, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.12133789, + "step": 11441, + "time_per_iteration": 2.4690005779266357 + }, + { + "auxiliary_loss_clip": 0.01128497, + "auxiliary_loss_mlp": 0.01034962, + "balance_loss_clip": 1.04978037, + "balance_loss_mlp": 1.02198601, + "epoch": 0.6879302570269051, + "flos": 13840080549120.0, + "grad_norm": 1.7690799681623233, + "language_loss": 0.6685515, + "learning_rate": 9.373848538056317e-07, + "loss": 0.69018614, + "num_input_tokens_seen": 246974805, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.12976074, + "step": 11442, + "time_per_iteration": 3.892568588256836 + }, + { + "auxiliary_loss_clip": 0.01122454, + "auxiliary_loss_mlp": 0.0102891, + "balance_loss_clip": 1.04879427, + "balance_loss_mlp": 1.0173111, + "epoch": 0.6879903802795732, + "flos": 21324654414720.0, + "grad_norm": 2.781068927719903, + "language_loss": 0.69756079, + "learning_rate": 9.370549301960189e-07, + "loss": 0.71907449, + "num_input_tokens_seen": 246992505, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11602783, + "step": 11443, + "time_per_iteration": 2.503938913345337 + }, + { + "auxiliary_loss_clip": 0.01130597, + "auxiliary_loss_mlp": 0.01030301, + "balance_loss_clip": 1.0549531, + "balance_loss_mlp": 1.01818895, + "epoch": 0.6880505035322411, + "flos": 25152049134720.0, + "grad_norm": 5.55755620581922, + "language_loss": 0.76176333, + "learning_rate": 9.367250468933893e-07, + "loss": 0.78337234, + "num_input_tokens_seen": 247013370, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12103271, + "step": 11444, + "time_per_iteration": 2.5000288486480713 + }, + { + "auxiliary_loss_clip": 0.01113187, + "auxiliary_loss_mlp": 0.01028518, + "balance_loss_clip": 1.04180992, + "balance_loss_mlp": 1.01686525, + "epoch": 0.6881106267849091, + "flos": 23215530170880.0, + "grad_norm": 2.5211672295058953, + "language_loss": 0.76543188, + "learning_rate": 9.363952039102536e-07, + "loss": 0.7868489, + "num_input_tokens_seen": 247029855, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.11645508, + "step": 11445, + "time_per_iteration": 2.5072901248931885 + }, + { + "auxiliary_loss_clip": 0.01048728, + "auxiliary_loss_mlp": 0.01001967, + "balance_loss_clip": 1.02286768, + "balance_loss_mlp": 1.00045168, + "epoch": 0.688170750037577, + "flos": 48484397312640.0, + "grad_norm": 0.8224745399444557, + "language_loss": 0.58417517, + "learning_rate": 9.360654012591183e-07, + "loss": 0.60468215, + "num_input_tokens_seen": 247085030, + "router_z_loss_clip": 0.25830078, + "router_z_loss_mlp": 0.01515198, + "step": 11446, + "time_per_iteration": 3.1148505210876465 + }, + { + "auxiliary_loss_clip": 0.01115333, + "auxiliary_loss_mlp": 0.01028208, + "balance_loss_clip": 1.03914297, + "balance_loss_mlp": 1.01566708, + "epoch": 0.688230873290245, + "flos": 22783633038720.0, + "grad_norm": 1.8171994898077206, + "language_loss": 0.75983071, + "learning_rate": 9.357356389524886e-07, + "loss": 0.78126615, + "num_input_tokens_seen": 247104840, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12554932, + "step": 11447, + "time_per_iteration": 2.4870362281799316 + }, + { + "auxiliary_loss_clip": 0.01116428, + "auxiliary_loss_mlp": 0.01033619, + "balance_loss_clip": 1.04194117, + "balance_loss_mlp": 1.0199759, + "epoch": 0.6882909965429129, + "flos": 22455660931200.0, + "grad_norm": 2.6076443919556227, + "language_loss": 0.73237991, + "learning_rate": 9.354059170028705e-07, + "loss": 0.75388038, + "num_input_tokens_seen": 247121905, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.13635254, + "step": 11448, + "time_per_iteration": 3.897110939025879 + }, + { + "auxiliary_loss_clip": 0.01120258, + "auxiliary_loss_mlp": 0.01030883, + "balance_loss_clip": 1.04420233, + "balance_loss_mlp": 1.01812768, + "epoch": 0.688351119795581, + "flos": 26214143408640.0, + "grad_norm": 1.871428457053187, + "language_loss": 0.7472198, + "learning_rate": 9.350762354227673e-07, + "loss": 0.76873124, + "num_input_tokens_seen": 247142375, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.12774658, + "step": 11449, + "time_per_iteration": 2.484025716781616 + }, + { + "auxiliary_loss_clip": 0.01111954, + "auxiliary_loss_mlp": 0.0103353, + "balance_loss_clip": 1.03973281, + "balance_loss_mlp": 1.02210999, + "epoch": 0.6884112430482489, + "flos": 22565260304640.0, + "grad_norm": 2.0322629422519074, + "language_loss": 0.70305908, + "learning_rate": 9.34746594224679e-07, + "loss": 0.72451389, + "num_input_tokens_seen": 247161095, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11431885, + "step": 11450, + "time_per_iteration": 2.4966113567352295 + }, + { + "auxiliary_loss_clip": 0.01132587, + "auxiliary_loss_mlp": 0.01034842, + "balance_loss_clip": 1.05274367, + "balance_loss_mlp": 1.02164543, + "epoch": 0.6884713663009169, + "flos": 17341047446400.0, + "grad_norm": 2.3261209048951565, + "language_loss": 0.75727212, + "learning_rate": 9.344169934211068e-07, + "loss": 0.7789464, + "num_input_tokens_seen": 247178565, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.13201904, + "step": 11451, + "time_per_iteration": 2.428297281265259 + }, + { + "auxiliary_loss_clip": 0.01117347, + "auxiliary_loss_mlp": 0.01028283, + "balance_loss_clip": 1.042449, + "balance_loss_mlp": 1.01604044, + "epoch": 0.6885314895535849, + "flos": 26470832976000.0, + "grad_norm": 4.950711818441451, + "language_loss": 0.69331068, + "learning_rate": 9.340874330245505e-07, + "loss": 0.71476698, + "num_input_tokens_seen": 247202345, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.12243652, + "step": 11452, + "time_per_iteration": 2.5623984336853027 + }, + { + "auxiliary_loss_clip": 0.01123584, + "auxiliary_loss_mlp": 0.0103148, + "balance_loss_clip": 1.05080748, + "balance_loss_mlp": 1.01799786, + "epoch": 0.6885916128062528, + "flos": 20521548178560.0, + "grad_norm": 2.2337727172110533, + "language_loss": 0.72048891, + "learning_rate": 9.337579130475042e-07, + "loss": 0.74203956, + "num_input_tokens_seen": 247219240, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.1348877, + "step": 11453, + "time_per_iteration": 2.4271113872528076 + }, + { + "auxiliary_loss_clip": 0.0105001, + "auxiliary_loss_mlp": 0.01003782, + "balance_loss_clip": 1.02377772, + "balance_loss_mlp": 1.00213695, + "epoch": 0.6886517360589208, + "flos": 70715795679360.0, + "grad_norm": 0.9044079349787467, + "language_loss": 0.50656229, + "learning_rate": 9.334284335024644e-07, + "loss": 0.52710021, + "num_input_tokens_seen": 247272010, + "router_z_loss_clip": 0.26269531, + "router_z_loss_mlp": 0.01643372, + "step": 11454, + "time_per_iteration": 4.418352365493774 + }, + { + "auxiliary_loss_clip": 0.01116258, + "auxiliary_loss_mlp": 0.01026876, + "balance_loss_clip": 1.04414511, + "balance_loss_mlp": 1.01499701, + "epoch": 0.6887118593115887, + "flos": 17893533513600.0, + "grad_norm": 1.9467128933650615, + "language_loss": 0.75460827, + "learning_rate": 9.330989944019263e-07, + "loss": 0.7760396, + "num_input_tokens_seen": 247290630, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11883545, + "step": 11455, + "time_per_iteration": 2.440183162689209 + }, + { + "auxiliary_loss_clip": 0.01126117, + "auxiliary_loss_mlp": 0.01033325, + "balance_loss_clip": 1.04938424, + "balance_loss_mlp": 1.02004528, + "epoch": 0.6887719825642568, + "flos": 17453017117440.0, + "grad_norm": 2.268375581780972, + "language_loss": 0.72811127, + "learning_rate": 9.327695957583803e-07, + "loss": 0.74970567, + "num_input_tokens_seen": 247304800, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.13269043, + "step": 11456, + "time_per_iteration": 2.390665292739868 + }, + { + "auxiliary_loss_clip": 0.01118827, + "auxiliary_loss_mlp": 0.01028401, + "balance_loss_clip": 1.04796195, + "balance_loss_mlp": 1.01676607, + "epoch": 0.6888321058169247, + "flos": 23070199743360.0, + "grad_norm": 1.7633643529112137, + "language_loss": 0.80994701, + "learning_rate": 9.32440237584319e-07, + "loss": 0.83141923, + "num_input_tokens_seen": 247323450, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.11645508, + "step": 11457, + "time_per_iteration": 2.515721321105957 + }, + { + "auxiliary_loss_clip": 0.01132816, + "auxiliary_loss_mlp": 0.01031186, + "balance_loss_clip": 1.0554316, + "balance_loss_mlp": 1.01838851, + "epoch": 0.6888922290695927, + "flos": 23368833417600.0, + "grad_norm": 1.5785799659083102, + "language_loss": 0.76026934, + "learning_rate": 9.321109198922301e-07, + "loss": 0.78190941, + "num_input_tokens_seen": 247343845, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.12805176, + "step": 11458, + "time_per_iteration": 2.4660489559173584 + }, + { + "auxiliary_loss_clip": 0.01116165, + "auxiliary_loss_mlp": 0.01031706, + "balance_loss_clip": 1.04322112, + "balance_loss_mlp": 1.01968992, + "epoch": 0.6889523523222606, + "flos": 17631636474240.0, + "grad_norm": 3.398797924404805, + "language_loss": 0.67841256, + "learning_rate": 9.31781642694603e-07, + "loss": 0.69989121, + "num_input_tokens_seen": 247356650, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.12011719, + "step": 11459, + "time_per_iteration": 2.4416916370391846 + }, + { + "auxiliary_loss_clip": 0.0111525, + "auxiliary_loss_mlp": 0.01030453, + "balance_loss_clip": 1.0429287, + "balance_loss_mlp": 1.0188067, + "epoch": 0.6890124755749286, + "flos": 25228144097280.0, + "grad_norm": 1.4842803823835886, + "language_loss": 0.68360752, + "learning_rate": 9.314524060039221e-07, + "loss": 0.70506454, + "num_input_tokens_seen": 247377340, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11645508, + "step": 11460, + "time_per_iteration": 2.5036702156066895 + }, + { + "auxiliary_loss_clip": 0.01123879, + "auxiliary_loss_mlp": 0.01034762, + "balance_loss_clip": 1.04334569, + "balance_loss_mlp": 1.02040887, + "epoch": 0.6890725988275965, + "flos": 20230240878720.0, + "grad_norm": 1.8233953722619, + "language_loss": 0.7712425, + "learning_rate": 9.311232098326731e-07, + "loss": 0.79282892, + "num_input_tokens_seen": 247395805, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.14355469, + "step": 11461, + "time_per_iteration": 2.4848132133483887 + }, + { + "auxiliary_loss_clip": 0.01116323, + "auxiliary_loss_mlp": 0.01033863, + "balance_loss_clip": 1.04372954, + "balance_loss_mlp": 1.02141774, + "epoch": 0.6891327220802645, + "flos": 14535311264640.0, + "grad_norm": 1.6941241988386793, + "language_loss": 0.69453871, + "learning_rate": 9.307940541933401e-07, + "loss": 0.71604061, + "num_input_tokens_seen": 247413165, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.12457275, + "step": 11462, + "time_per_iteration": 2.458198070526123 + }, + { + "auxiliary_loss_clip": 0.01119048, + "auxiliary_loss_mlp": 0.01034096, + "balance_loss_clip": 1.04360414, + "balance_loss_mlp": 1.01959991, + "epoch": 0.6891928453329325, + "flos": 21139139646720.0, + "grad_norm": 1.5426603543541737, + "language_loss": 0.87362397, + "learning_rate": 9.304649390984034e-07, + "loss": 0.89515543, + "num_input_tokens_seen": 247433140, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.1449585, + "step": 11463, + "time_per_iteration": 2.5113139152526855 + }, + { + "auxiliary_loss_clip": 0.01105725, + "auxiliary_loss_mlp": 0.0103003, + "balance_loss_clip": 1.03713965, + "balance_loss_mlp": 1.0183177, + "epoch": 0.6892529685856005, + "flos": 17858520731520.0, + "grad_norm": 2.0434647982724132, + "language_loss": 0.68034112, + "learning_rate": 9.301358645603428e-07, + "loss": 0.70169866, + "num_input_tokens_seen": 247451265, + "router_z_loss_clip": 0.68505859, + "router_z_loss_mlp": 0.11706543, + "step": 11464, + "time_per_iteration": 2.442289113998413 + }, + { + "auxiliary_loss_clip": 0.01116596, + "auxiliary_loss_mlp": 0.01038374, + "balance_loss_clip": 1.04315209, + "balance_loss_mlp": 1.02579141, + "epoch": 0.6893130918382685, + "flos": 29934811843200.0, + "grad_norm": 1.869317544436754, + "language_loss": 0.65227604, + "learning_rate": 9.298068305916373e-07, + "loss": 0.67382574, + "num_input_tokens_seen": 247471645, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.12579346, + "step": 11465, + "time_per_iteration": 2.562030553817749 + }, + { + "auxiliary_loss_clip": 0.01119865, + "auxiliary_loss_mlp": 0.01036714, + "balance_loss_clip": 1.044209, + "balance_loss_mlp": 1.02477515, + "epoch": 0.6893732150909364, + "flos": 24388516707840.0, + "grad_norm": 1.8118510200328428, + "language_loss": 0.72748393, + "learning_rate": 9.294778372047649e-07, + "loss": 0.74904966, + "num_input_tokens_seen": 247491170, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.11932373, + "step": 11466, + "time_per_iteration": 2.5412485599517822 + }, + { + "auxiliary_loss_clip": 0.01116693, + "auxiliary_loss_mlp": 0.01031491, + "balance_loss_clip": 1.04397678, + "balance_loss_mlp": 1.01962328, + "epoch": 0.6894333383436044, + "flos": 16982874979200.0, + "grad_norm": 1.838877753671519, + "language_loss": 0.7211324, + "learning_rate": 9.291488844121995e-07, + "loss": 0.74261415, + "num_input_tokens_seen": 247509005, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11865234, + "step": 11467, + "time_per_iteration": 2.4589548110961914 + }, + { + "auxiliary_loss_clip": 0.01121436, + "auxiliary_loss_mlp": 0.01033306, + "balance_loss_clip": 1.04382086, + "balance_loss_mlp": 1.01959085, + "epoch": 0.6894934615962723, + "flos": 18985540838400.0, + "grad_norm": 2.1919790249316544, + "language_loss": 0.81100643, + "learning_rate": 9.288199722264156e-07, + "loss": 0.83255386, + "num_input_tokens_seen": 247527050, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.137146, + "step": 11468, + "time_per_iteration": 2.4379799365997314 + }, + { + "auxiliary_loss_clip": 0.01125306, + "auxiliary_loss_mlp": 0.01035428, + "balance_loss_clip": 1.04880857, + "balance_loss_mlp": 1.02199292, + "epoch": 0.6895535848489404, + "flos": 34531664734080.0, + "grad_norm": 1.580678080985871, + "language_loss": 0.66465616, + "learning_rate": 9.284911006598875e-07, + "loss": 0.68626356, + "num_input_tokens_seen": 247547765, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.13427734, + "step": 11469, + "time_per_iteration": 2.565786361694336 + }, + { + "auxiliary_loss_clip": 0.01062907, + "auxiliary_loss_mlp": 0.01007858, + "balance_loss_clip": 1.03811443, + "balance_loss_mlp": 1.00664771, + "epoch": 0.6896137081016083, + "flos": 50075852273280.0, + "grad_norm": 0.8092578029887921, + "language_loss": 0.55199611, + "learning_rate": 9.281622697250824e-07, + "loss": 0.57270378, + "num_input_tokens_seen": 247603515, + "router_z_loss_clip": 0.24780273, + "router_z_loss_mlp": 0.01210022, + "step": 11470, + "time_per_iteration": 2.9593045711517334 + }, + { + "auxiliary_loss_clip": 0.01115626, + "auxiliary_loss_mlp": 0.01030703, + "balance_loss_clip": 1.0446713, + "balance_loss_mlp": 1.02042103, + "epoch": 0.6896738313542763, + "flos": 19938215306880.0, + "grad_norm": 1.7719987677454023, + "language_loss": 0.78006238, + "learning_rate": 9.278334794344715e-07, + "loss": 0.80152571, + "num_input_tokens_seen": 247622110, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.10284424, + "step": 11471, + "time_per_iteration": 3.9612269401550293 + }, + { + "auxiliary_loss_clip": 0.01118294, + "auxiliary_loss_mlp": 0.01031183, + "balance_loss_clip": 1.04604888, + "balance_loss_mlp": 1.0192858, + "epoch": 0.6897339546069442, + "flos": 21725489260800.0, + "grad_norm": 2.540443480422365, + "language_loss": 0.78578842, + "learning_rate": 9.275047298005232e-07, + "loss": 0.80728316, + "num_input_tokens_seen": 247641905, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11895752, + "step": 11472, + "time_per_iteration": 2.443551778793335 + }, + { + "auxiliary_loss_clip": 0.01110167, + "auxiliary_loss_mlp": 0.01031226, + "balance_loss_clip": 1.03754711, + "balance_loss_mlp": 1.01941276, + "epoch": 0.6897940778596122, + "flos": 19826497031040.0, + "grad_norm": 1.5998227323163312, + "language_loss": 0.76322556, + "learning_rate": 9.271760208357024e-07, + "loss": 0.78463954, + "num_input_tokens_seen": 247660945, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11804199, + "step": 11473, + "time_per_iteration": 2.466871976852417 + }, + { + "auxiliary_loss_clip": 0.0112694, + "auxiliary_loss_mlp": 0.0103181, + "balance_loss_clip": 1.05001926, + "balance_loss_mlp": 1.01895308, + "epoch": 0.6898542011122801, + "flos": 17310056987520.0, + "grad_norm": 3.045138047420114, + "language_loss": 0.75683868, + "learning_rate": 9.268473525524751e-07, + "loss": 0.77842611, + "num_input_tokens_seen": 247678395, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.12872314, + "step": 11474, + "time_per_iteration": 2.4206573963165283 + }, + { + "auxiliary_loss_clip": 0.01112325, + "auxiliary_loss_mlp": 0.01030759, + "balance_loss_clip": 1.04058599, + "balance_loss_mlp": 1.01831985, + "epoch": 0.6899143243649482, + "flos": 24754051463040.0, + "grad_norm": 1.5537724348925226, + "language_loss": 0.74445653, + "learning_rate": 9.26518724963303e-07, + "loss": 0.76588738, + "num_input_tokens_seen": 247698380, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.12438965, + "step": 11475, + "time_per_iteration": 2.6063990592956543 + }, + { + "auxiliary_loss_clip": 0.01114075, + "auxiliary_loss_mlp": 0.01031071, + "balance_loss_clip": 1.04105544, + "balance_loss_mlp": 1.01858377, + "epoch": 0.6899744476176161, + "flos": 17234536642560.0, + "grad_norm": 2.4271606665712984, + "language_loss": 0.88514495, + "learning_rate": 9.261901380806491e-07, + "loss": 0.90659642, + "num_input_tokens_seen": 247716370, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.12487793, + "step": 11476, + "time_per_iteration": 2.575791597366333 + }, + { + "auxiliary_loss_clip": 0.01107931, + "auxiliary_loss_mlp": 0.01030798, + "balance_loss_clip": 1.03678441, + "balance_loss_mlp": 1.01949692, + "epoch": 0.6900345708702841, + "flos": 25410678036480.0, + "grad_norm": 1.4430552658866604, + "language_loss": 0.70209277, + "learning_rate": 9.258615919169724e-07, + "loss": 0.72348011, + "num_input_tokens_seen": 247737335, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.11297607, + "step": 11477, + "time_per_iteration": 2.5415236949920654 + }, + { + "auxiliary_loss_clip": 0.01119232, + "auxiliary_loss_mlp": 0.01037698, + "balance_loss_clip": 1.04290748, + "balance_loss_mlp": 1.02499008, + "epoch": 0.6900946941229521, + "flos": 23434190213760.0, + "grad_norm": 2.5973393870723975, + "language_loss": 0.67894959, + "learning_rate": 9.255330864847313e-07, + "loss": 0.70051891, + "num_input_tokens_seen": 247756680, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.1270752, + "step": 11478, + "time_per_iteration": 2.455836534500122 + }, + { + "auxiliary_loss_clip": 0.01122988, + "auxiliary_loss_mlp": 0.01033408, + "balance_loss_clip": 1.04610801, + "balance_loss_mlp": 1.02127266, + "epoch": 0.69015481737562, + "flos": 17820096157440.0, + "grad_norm": 1.863218733330848, + "language_loss": 0.76497936, + "learning_rate": 9.252046217963843e-07, + "loss": 0.78654337, + "num_input_tokens_seen": 247774265, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.12145996, + "step": 11479, + "time_per_iteration": 2.4157299995422363 + }, + { + "auxiliary_loss_clip": 0.0112344, + "auxiliary_loss_mlp": 0.01030247, + "balance_loss_clip": 1.04803491, + "balance_loss_mlp": 1.01770663, + "epoch": 0.690214940628288, + "flos": 17456500736640.0, + "grad_norm": 3.3445437526519926, + "language_loss": 0.78624237, + "learning_rate": 9.248761978643856e-07, + "loss": 0.80777919, + "num_input_tokens_seen": 247792395, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.12548828, + "step": 11480, + "time_per_iteration": 2.40474271774292 + }, + { + "auxiliary_loss_clip": 0.01122618, + "auxiliary_loss_mlp": 0.01029182, + "balance_loss_clip": 1.05033565, + "balance_loss_mlp": 1.01674271, + "epoch": 0.6902750638809559, + "flos": 29566691308800.0, + "grad_norm": 1.7822603797904104, + "language_loss": 0.75626349, + "learning_rate": 9.245478147011885e-07, + "loss": 0.77778149, + "num_input_tokens_seen": 247811985, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.12438965, + "step": 11481, + "time_per_iteration": 2.5061755180358887 + }, + { + "auxiliary_loss_clip": 0.01112263, + "auxiliary_loss_mlp": 0.01032012, + "balance_loss_clip": 1.03885937, + "balance_loss_mlp": 1.01985276, + "epoch": 0.690335187133624, + "flos": 25557121785600.0, + "grad_norm": 1.8367207299371073, + "language_loss": 0.69209546, + "learning_rate": 9.24219472319246e-07, + "loss": 0.71353817, + "num_input_tokens_seen": 247831880, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.12164307, + "step": 11482, + "time_per_iteration": 2.461664915084839 + }, + { + "auxiliary_loss_clip": 0.01118311, + "auxiliary_loss_mlp": 0.01024218, + "balance_loss_clip": 1.04516792, + "balance_loss_mlp": 1.01258957, + "epoch": 0.6903953103862919, + "flos": 22488447070080.0, + "grad_norm": 1.796762617957808, + "language_loss": 0.83065331, + "learning_rate": 9.238911707310096e-07, + "loss": 0.85207862, + "num_input_tokens_seen": 247851170, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11639404, + "step": 11483, + "time_per_iteration": 2.463928699493408 + }, + { + "auxiliary_loss_clip": 0.01118195, + "auxiliary_loss_mlp": 0.0103126, + "balance_loss_clip": 1.04295933, + "balance_loss_mlp": 1.01972651, + "epoch": 0.6904554336389599, + "flos": 26100521712000.0, + "grad_norm": 2.7032302253705263, + "language_loss": 0.6526053, + "learning_rate": 9.235629099489273e-07, + "loss": 0.67409992, + "num_input_tokens_seen": 247868950, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.11535645, + "step": 11484, + "time_per_iteration": 2.4911446571350098 + }, + { + "auxiliary_loss_clip": 0.01116629, + "auxiliary_loss_mlp": 0.01031178, + "balance_loss_clip": 1.04348922, + "balance_loss_mlp": 1.01932907, + "epoch": 0.6905155568916278, + "flos": 31171754545920.0, + "grad_norm": 1.5157131497074938, + "language_loss": 0.73609912, + "learning_rate": 9.232346899854479e-07, + "loss": 0.75757718, + "num_input_tokens_seen": 247889805, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11853027, + "step": 11485, + "time_per_iteration": 3.911162853240967 + }, + { + "auxiliary_loss_clip": 0.01124396, + "auxiliary_loss_mlp": 0.01033442, + "balance_loss_clip": 1.04826903, + "balance_loss_mlp": 1.02079964, + "epoch": 0.6905756801442958, + "flos": 17639681120640.0, + "grad_norm": 2.043314669944331, + "language_loss": 0.85258144, + "learning_rate": 9.22906510853017e-07, + "loss": 0.87415981, + "num_input_tokens_seen": 247908585, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.1262207, + "step": 11486, + "time_per_iteration": 2.462946891784668 + }, + { + "auxiliary_loss_clip": 0.01118706, + "auxiliary_loss_mlp": 0.01034884, + "balance_loss_clip": 1.04568076, + "balance_loss_mlp": 1.02230096, + "epoch": 0.6906358033969637, + "flos": 22343691260160.0, + "grad_norm": 1.550819876067065, + "language_loss": 0.72625935, + "learning_rate": 9.225783725640786e-07, + "loss": 0.74779522, + "num_input_tokens_seen": 247928480, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.12585449, + "step": 11487, + "time_per_iteration": 2.4682443141937256 + }, + { + "auxiliary_loss_clip": 0.01041643, + "auxiliary_loss_mlp": 0.01008601, + "balance_loss_clip": 1.01639545, + "balance_loss_mlp": 1.00703681, + "epoch": 0.6906959266496318, + "flos": 69747789081600.0, + "grad_norm": 0.8930717278350057, + "language_loss": 0.66607881, + "learning_rate": 9.222502751310759e-07, + "loss": 0.68658125, + "num_input_tokens_seen": 247988855, + "router_z_loss_clip": 0.25244141, + "router_z_loss_mlp": 0.01565552, + "step": 11488, + "time_per_iteration": 3.0991132259368896 + }, + { + "auxiliary_loss_clip": 0.01120228, + "auxiliary_loss_mlp": 0.01036907, + "balance_loss_clip": 1.04106307, + "balance_loss_mlp": 1.02245927, + "epoch": 0.6907560499022997, + "flos": 21434253788160.0, + "grad_norm": 1.6452269693857688, + "language_loss": 0.75111628, + "learning_rate": 9.219222185664519e-07, + "loss": 0.77268767, + "num_input_tokens_seen": 248007685, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.14440918, + "step": 11489, + "time_per_iteration": 2.542503595352173 + }, + { + "auxiliary_loss_clip": 0.01117903, + "auxiliary_loss_mlp": 0.0103622, + "balance_loss_clip": 1.04347456, + "balance_loss_mlp": 1.0228622, + "epoch": 0.6908161731549677, + "flos": 14392207480320.0, + "grad_norm": 2.590140341970828, + "language_loss": 0.62499654, + "learning_rate": 9.215942028826445e-07, + "loss": 0.64653778, + "num_input_tokens_seen": 248025145, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.13348389, + "step": 11490, + "time_per_iteration": 2.5712943077087402 + }, + { + "auxiliary_loss_clip": 0.01117342, + "auxiliary_loss_mlp": 0.01027348, + "balance_loss_clip": 1.04321456, + "balance_loss_mlp": 1.01515937, + "epoch": 0.6908762964076357, + "flos": 20010970304640.0, + "grad_norm": 2.2267685490428595, + "language_loss": 0.72964263, + "learning_rate": 9.212662280920937e-07, + "loss": 0.75108951, + "num_input_tokens_seen": 248043750, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12188721, + "step": 11491, + "time_per_iteration": 4.028621435165405 + }, + { + "auxiliary_loss_clip": 0.01108374, + "auxiliary_loss_mlp": 0.01040362, + "balance_loss_clip": 1.03661084, + "balance_loss_mlp": 1.02704656, + "epoch": 0.6909364196603036, + "flos": 28769079853440.0, + "grad_norm": 1.9960278508438338, + "language_loss": 0.70363772, + "learning_rate": 9.20938294207235e-07, + "loss": 0.72512507, + "num_input_tokens_seen": 248065765, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.13311768, + "step": 11492, + "time_per_iteration": 2.5216376781463623 + }, + { + "auxiliary_loss_clip": 0.01123097, + "auxiliary_loss_mlp": 0.01034501, + "balance_loss_clip": 1.04508388, + "balance_loss_mlp": 1.0214293, + "epoch": 0.6909965429129716, + "flos": 22528128620160.0, + "grad_norm": 1.7644585179225318, + "language_loss": 0.75137794, + "learning_rate": 9.206104012405049e-07, + "loss": 0.77295387, + "num_input_tokens_seen": 248083810, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.13079834, + "step": 11493, + "time_per_iteration": 2.460143804550171 + }, + { + "auxiliary_loss_clip": 0.01122899, + "auxiliary_loss_mlp": 0.0102851, + "balance_loss_clip": 1.04839206, + "balance_loss_mlp": 1.01637459, + "epoch": 0.6910566661656395, + "flos": 18405942981120.0, + "grad_norm": 1.6856727014386954, + "language_loss": 0.74439257, + "learning_rate": 9.20282549204336e-07, + "loss": 0.76590669, + "num_input_tokens_seen": 248103185, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.12139893, + "step": 11494, + "time_per_iteration": 2.4181339740753174 + }, + { + "auxiliary_loss_clip": 0.01112798, + "auxiliary_loss_mlp": 0.01029853, + "balance_loss_clip": 1.04012299, + "balance_loss_mlp": 1.01769936, + "epoch": 0.6911167894183076, + "flos": 30773972355840.0, + "grad_norm": 1.5932753697813888, + "language_loss": 0.68314183, + "learning_rate": 9.19954738111161e-07, + "loss": 0.70456827, + "num_input_tokens_seen": 248125665, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.121521, + "step": 11495, + "time_per_iteration": 2.561250925064087 + }, + { + "auxiliary_loss_clip": 0.01118848, + "auxiliary_loss_mlp": 0.01033025, + "balance_loss_clip": 1.0448482, + "balance_loss_mlp": 1.01885712, + "epoch": 0.6911769126709755, + "flos": 13735724561280.0, + "grad_norm": 2.718741045507133, + "language_loss": 0.74621701, + "learning_rate": 9.196269679734119e-07, + "loss": 0.76773572, + "num_input_tokens_seen": 248142545, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.1416626, + "step": 11496, + "time_per_iteration": 2.422290325164795 + }, + { + "auxiliary_loss_clip": 0.0111665, + "auxiliary_loss_mlp": 0.01025364, + "balance_loss_clip": 1.04371452, + "balance_loss_mlp": 1.01347268, + "epoch": 0.6912370359236435, + "flos": 17566854295680.0, + "grad_norm": 1.6530709369781231, + "language_loss": 0.80026674, + "learning_rate": 9.19299238803515e-07, + "loss": 0.82168692, + "num_input_tokens_seen": 248160225, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11871338, + "step": 11497, + "time_per_iteration": 3.895110607147217 + }, + { + "auxiliary_loss_clip": 0.01120308, + "auxiliary_loss_mlp": 0.01033124, + "balance_loss_clip": 1.04172373, + "balance_loss_mlp": 1.02000499, + "epoch": 0.6912971591763114, + "flos": 22090772620800.0, + "grad_norm": 1.5950526285403501, + "language_loss": 0.8098923, + "learning_rate": 9.189715506138993e-07, + "loss": 0.83142662, + "num_input_tokens_seen": 248180430, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.13122559, + "step": 11498, + "time_per_iteration": 2.4455695152282715 + }, + { + "auxiliary_loss_clip": 0.01119401, + "auxiliary_loss_mlp": 0.0103002, + "balance_loss_clip": 1.04709792, + "balance_loss_mlp": 1.01800978, + "epoch": 0.6913572824289794, + "flos": 29971476650880.0, + "grad_norm": 1.6680161063988481, + "language_loss": 0.86035407, + "learning_rate": 9.186439034169915e-07, + "loss": 0.88184834, + "num_input_tokens_seen": 248202365, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.12011719, + "step": 11499, + "time_per_iteration": 2.5778005123138428 + }, + { + "auxiliary_loss_clip": 0.01125057, + "auxiliary_loss_mlp": 0.01027335, + "balance_loss_clip": 1.05130064, + "balance_loss_mlp": 1.01482999, + "epoch": 0.6914174056816473, + "flos": 20448936835200.0, + "grad_norm": 1.746366072919618, + "language_loss": 0.75295544, + "learning_rate": 9.183162972252145e-07, + "loss": 0.77447939, + "num_input_tokens_seen": 248221750, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.12506104, + "step": 11500, + "time_per_iteration": 2.4221105575561523 + }, + { + "auxiliary_loss_clip": 0.01119152, + "auxiliary_loss_mlp": 0.01042477, + "balance_loss_clip": 1.04242659, + "balance_loss_mlp": 1.02880383, + "epoch": 0.6914775289343154, + "flos": 21282530739840.0, + "grad_norm": 2.557531202665571, + "language_loss": 0.77171874, + "learning_rate": 9.179887320509921e-07, + "loss": 0.79333508, + "num_input_tokens_seen": 248239535, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.13690186, + "step": 11501, + "time_per_iteration": 2.4804821014404297 + }, + { + "auxiliary_loss_clip": 0.01123141, + "auxiliary_loss_mlp": 0.01035076, + "balance_loss_clip": 1.04753542, + "balance_loss_mlp": 1.02279186, + "epoch": 0.6915376521869833, + "flos": 23878118401920.0, + "grad_norm": 1.897981212051178, + "language_loss": 0.73611861, + "learning_rate": 9.176612079067458e-07, + "loss": 0.7577008, + "num_input_tokens_seen": 248259055, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.1227417, + "step": 11502, + "time_per_iteration": 2.461193561553955 + }, + { + "auxiliary_loss_clip": 0.01120039, + "auxiliary_loss_mlp": 0.01034126, + "balance_loss_clip": 1.04289269, + "balance_loss_mlp": 1.02124596, + "epoch": 0.6915977754396513, + "flos": 11510268595200.0, + "grad_norm": 5.448659821332507, + "language_loss": 0.7316348, + "learning_rate": 9.173337248048953e-07, + "loss": 0.75317645, + "num_input_tokens_seen": 248276765, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.12872314, + "step": 11503, + "time_per_iteration": 2.6159868240356445 + }, + { + "auxiliary_loss_clip": 0.01121254, + "auxiliary_loss_mlp": 0.01030463, + "balance_loss_clip": 1.04859018, + "balance_loss_mlp": 1.0184114, + "epoch": 0.6916578986923193, + "flos": 22601278667520.0, + "grad_norm": 1.9567232184678078, + "language_loss": 0.77146178, + "learning_rate": 9.170062827578575e-07, + "loss": 0.792979, + "num_input_tokens_seen": 248295310, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.12060547, + "step": 11504, + "time_per_iteration": 2.501695156097412 + }, + { + "auxiliary_loss_clip": 0.01117863, + "auxiliary_loss_mlp": 0.01030442, + "balance_loss_clip": 1.04054308, + "balance_loss_mlp": 1.01796103, + "epoch": 0.6917180219449872, + "flos": 23477355383040.0, + "grad_norm": 1.8549502157233302, + "language_loss": 0.73519564, + "learning_rate": 9.166788817780499e-07, + "loss": 0.7566787, + "num_input_tokens_seen": 248315230, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.12481689, + "step": 11505, + "time_per_iteration": 2.499685049057007 + }, + { + "auxiliary_loss_clip": 0.01122642, + "auxiliary_loss_mlp": 0.01034834, + "balance_loss_clip": 1.04908633, + "balance_loss_mlp": 1.02213228, + "epoch": 0.6917781451976552, + "flos": 23732536579200.0, + "grad_norm": 9.38574270780744, + "language_loss": 0.87759614, + "learning_rate": 9.163515218778886e-07, + "loss": 0.89917094, + "num_input_tokens_seen": 248332980, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.12695312, + "step": 11506, + "time_per_iteration": 2.4628143310546875 + }, + { + "auxiliary_loss_clip": 0.01119069, + "auxiliary_loss_mlp": 0.0102698, + "balance_loss_clip": 1.04524565, + "balance_loss_mlp": 1.01548803, + "epoch": 0.6918382684503231, + "flos": 31466760946560.0, + "grad_norm": 1.9712607006675973, + "language_loss": 0.69949341, + "learning_rate": 9.160242030697856e-07, + "loss": 0.72095394, + "num_input_tokens_seen": 248352865, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11499023, + "step": 11507, + "time_per_iteration": 2.545138120651245 + }, + { + "auxiliary_loss_clip": 0.01123308, + "auxiliary_loss_mlp": 0.01031512, + "balance_loss_clip": 1.04554296, + "balance_loss_mlp": 1.01871467, + "epoch": 0.6918983917029912, + "flos": 21650471706240.0, + "grad_norm": 2.043167912982936, + "language_loss": 0.76763535, + "learning_rate": 9.156969253661538e-07, + "loss": 0.7891835, + "num_input_tokens_seen": 248371125, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.12799072, + "step": 11508, + "time_per_iteration": 2.453711748123169 + }, + { + "auxiliary_loss_clip": 0.01115158, + "auxiliary_loss_mlp": 0.0102895, + "balance_loss_clip": 1.04331374, + "balance_loss_mlp": 1.01757109, + "epoch": 0.6919585149556591, + "flos": 25550082720000.0, + "grad_norm": 1.5569443969198096, + "language_loss": 0.74777216, + "learning_rate": 9.153696887794027e-07, + "loss": 0.76921326, + "num_input_tokens_seen": 248390455, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.1138916, + "step": 11509, + "time_per_iteration": 2.5150675773620605 + }, + { + "auxiliary_loss_clip": 0.01122515, + "auxiliary_loss_mlp": 0.01032349, + "balance_loss_clip": 1.0477221, + "balance_loss_mlp": 1.02030861, + "epoch": 0.6920186382083271, + "flos": 23659781581440.0, + "grad_norm": 1.5360893643751024, + "language_loss": 0.640885, + "learning_rate": 9.150424933219425e-07, + "loss": 0.66243362, + "num_input_tokens_seen": 248411305, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.12036133, + "step": 11510, + "time_per_iteration": 2.4521758556365967 + }, + { + "auxiliary_loss_clip": 0.01124112, + "auxiliary_loss_mlp": 0.01031964, + "balance_loss_clip": 1.04726601, + "balance_loss_mlp": 1.01883948, + "epoch": 0.692078761460995, + "flos": 19061959023360.0, + "grad_norm": 1.6244701669762756, + "language_loss": 0.75411522, + "learning_rate": 9.147153390061788e-07, + "loss": 0.77567601, + "num_input_tokens_seen": 248430190, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.13128662, + "step": 11511, + "time_per_iteration": 2.5077922344207764 + }, + { + "auxiliary_loss_clip": 0.01108167, + "auxiliary_loss_mlp": 0.01031307, + "balance_loss_clip": 1.03735006, + "balance_loss_mlp": 1.01970172, + "epoch": 0.692138884713663, + "flos": 29023291382400.0, + "grad_norm": 1.720910750312883, + "language_loss": 0.62982094, + "learning_rate": 9.143882258445184e-07, + "loss": 0.65121573, + "num_input_tokens_seen": 248450830, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11602783, + "step": 11512, + "time_per_iteration": 2.5556581020355225 + }, + { + "auxiliary_loss_clip": 0.0111878, + "auxiliary_loss_mlp": 0.01033559, + "balance_loss_clip": 1.04277575, + "balance_loss_mlp": 1.02076209, + "epoch": 0.6921990079663309, + "flos": 14757849976320.0, + "grad_norm": 1.8033540113625062, + "language_loss": 0.83087218, + "learning_rate": 9.140611538493666e-07, + "loss": 0.85239553, + "num_input_tokens_seen": 248468585, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.12799072, + "step": 11513, + "time_per_iteration": 2.505488157272339 + }, + { + "auxiliary_loss_clip": 0.01116436, + "auxiliary_loss_mlp": 0.01028659, + "balance_loss_clip": 1.04340863, + "balance_loss_mlp": 1.01762009, + "epoch": 0.692259131218999, + "flos": 23841848643840.0, + "grad_norm": 1.4518078639377312, + "language_loss": 0.7858631, + "learning_rate": 9.137341230331233e-07, + "loss": 0.80731404, + "num_input_tokens_seen": 248490535, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11035156, + "step": 11514, + "time_per_iteration": 2.483858108520508 + }, + { + "auxiliary_loss_clip": 0.01118891, + "auxiliary_loss_mlp": 0.01029385, + "balance_loss_clip": 1.04443073, + "balance_loss_mlp": 1.01755953, + "epoch": 0.6923192544716669, + "flos": 19135073157120.0, + "grad_norm": 1.8529765401479077, + "language_loss": 0.74543977, + "learning_rate": 9.134071334081907e-07, + "loss": 0.76692253, + "num_input_tokens_seen": 248508575, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.1182251, + "step": 11515, + "time_per_iteration": 3.8371644020080566 + }, + { + "auxiliary_loss_clip": 0.0111278, + "auxiliary_loss_mlp": 0.01028496, + "balance_loss_clip": 1.04179883, + "balance_loss_mlp": 1.01728415, + "epoch": 0.6923793777243349, + "flos": 28074639237120.0, + "grad_norm": 3.082650241073466, + "language_loss": 0.53845727, + "learning_rate": 9.130801849869694e-07, + "loss": 0.55987006, + "num_input_tokens_seen": 248527025, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.11206055, + "step": 11516, + "time_per_iteration": 2.4971466064453125 + }, + { + "auxiliary_loss_clip": 0.0111011, + "auxiliary_loss_mlp": 0.01037228, + "balance_loss_clip": 1.0401752, + "balance_loss_mlp": 1.02334607, + "epoch": 0.6924395009770029, + "flos": 16581250033920.0, + "grad_norm": 1.8149431292663205, + "language_loss": 0.73031968, + "learning_rate": 9.127532777818557e-07, + "loss": 0.75179309, + "num_input_tokens_seen": 248544275, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.13879395, + "step": 11517, + "time_per_iteration": 2.4630849361419678 + }, + { + "auxiliary_loss_clip": 0.01122591, + "auxiliary_loss_mlp": 0.01035188, + "balance_loss_clip": 1.047086, + "balance_loss_mlp": 1.02243853, + "epoch": 0.6924996242296708, + "flos": 16655297921280.0, + "grad_norm": 1.9502127697262395, + "language_loss": 0.76147664, + "learning_rate": 9.124264118052465e-07, + "loss": 0.78305435, + "num_input_tokens_seen": 248561870, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.12750244, + "step": 11518, + "time_per_iteration": 2.438154935836792 + }, + { + "auxiliary_loss_clip": 0.01124875, + "auxiliary_loss_mlp": 0.01033904, + "balance_loss_clip": 1.04829741, + "balance_loss_mlp": 1.02045095, + "epoch": 0.6925597474823388, + "flos": 34754167532160.0, + "grad_norm": 4.462493305268747, + "language_loss": 0.64787859, + "learning_rate": 9.120995870695376e-07, + "loss": 0.66946638, + "num_input_tokens_seen": 248588190, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.13446045, + "step": 11519, + "time_per_iteration": 2.590733051300049 + }, + { + "auxiliary_loss_clip": 0.01124674, + "auxiliary_loss_mlp": 0.01031195, + "balance_loss_clip": 1.05094135, + "balance_loss_mlp": 1.01913655, + "epoch": 0.6926198707350067, + "flos": 21871717528320.0, + "grad_norm": 1.8875477832375291, + "language_loss": 0.62625432, + "learning_rate": 9.117728035871212e-07, + "loss": 0.64781308, + "num_input_tokens_seen": 248606460, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.1204834, + "step": 11520, + "time_per_iteration": 2.5045084953308105 + }, + { + "auxiliary_loss_clip": 0.01135328, + "auxiliary_loss_mlp": 0.01036435, + "balance_loss_clip": 1.05114686, + "balance_loss_mlp": 1.02276206, + "epoch": 0.6926799939876748, + "flos": 13006271162880.0, + "grad_norm": 1.804791052184017, + "language_loss": 0.77657384, + "learning_rate": 9.114460613703887e-07, + "loss": 0.79829144, + "num_input_tokens_seen": 248623715, + "router_z_loss_clip": 0.84179688, + "router_z_loss_mlp": 0.13665771, + "step": 11521, + "time_per_iteration": 2.4625096321105957 + }, + { + "auxiliary_loss_clip": 0.01118385, + "auxiliary_loss_mlp": 0.0103134, + "balance_loss_clip": 1.04090703, + "balance_loss_mlp": 1.01849556, + "epoch": 0.6927401172403427, + "flos": 16761234107520.0, + "grad_norm": 1.8889865932680365, + "language_loss": 0.81953931, + "learning_rate": 9.111193604317304e-07, + "loss": 0.84103656, + "num_input_tokens_seen": 248640575, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12841797, + "step": 11522, + "time_per_iteration": 2.4902796745300293 + }, + { + "auxiliary_loss_clip": 0.0111763, + "auxiliary_loss_mlp": 0.0103171, + "balance_loss_clip": 1.04391479, + "balance_loss_mlp": 1.01947355, + "epoch": 0.6928002404930107, + "flos": 25705648523520.0, + "grad_norm": 1.540710739350367, + "language_loss": 0.76803088, + "learning_rate": 9.107927007835361e-07, + "loss": 0.78952426, + "num_input_tokens_seen": 248663535, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.12249756, + "step": 11523, + "time_per_iteration": 2.52593731880188 + }, + { + "auxiliary_loss_clip": 0.0111355, + "auxiliary_loss_mlp": 0.01032606, + "balance_loss_clip": 1.04147363, + "balance_loss_mlp": 1.02155495, + "epoch": 0.6928603637456786, + "flos": 18588261438720.0, + "grad_norm": 1.9199781830371145, + "language_loss": 0.68726188, + "learning_rate": 9.104660824381915e-07, + "loss": 0.70872343, + "num_input_tokens_seen": 248681125, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11047363, + "step": 11524, + "time_per_iteration": 2.442005157470703 + }, + { + "auxiliary_loss_clip": 0.0111637, + "auxiliary_loss_mlp": 0.0103776, + "balance_loss_clip": 1.04081583, + "balance_loss_mlp": 1.02273965, + "epoch": 0.6929204869983466, + "flos": 22200874784640.0, + "grad_norm": 1.9313539844975904, + "language_loss": 0.64718843, + "learning_rate": 9.101395054080815e-07, + "loss": 0.66872978, + "num_input_tokens_seen": 248700555, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.15032959, + "step": 11525, + "time_per_iteration": 2.4585142135620117 + }, + { + "auxiliary_loss_clip": 0.01122863, + "auxiliary_loss_mlp": 0.01039795, + "balance_loss_clip": 1.0474062, + "balance_loss_mlp": 1.02686667, + "epoch": 0.6929806102510145, + "flos": 17894754576000.0, + "grad_norm": 2.3722178047168527, + "language_loss": 0.70846319, + "learning_rate": 9.098129697055907e-07, + "loss": 0.73008978, + "num_input_tokens_seen": 248716095, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.12921143, + "step": 11526, + "time_per_iteration": 2.4569671154022217 + }, + { + "auxiliary_loss_clip": 0.01117946, + "auxiliary_loss_mlp": 0.01028788, + "balance_loss_clip": 1.04655266, + "balance_loss_mlp": 1.0171833, + "epoch": 0.6930407335036826, + "flos": 19755178577280.0, + "grad_norm": 1.6143899333956395, + "language_loss": 0.76504195, + "learning_rate": 9.094864753431022e-07, + "loss": 0.78650928, + "num_input_tokens_seen": 248735330, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.1159668, + "step": 11527, + "time_per_iteration": 2.4554901123046875 + }, + { + "auxiliary_loss_clip": 0.01111989, + "auxiliary_loss_mlp": 0.01027097, + "balance_loss_clip": 1.04051208, + "balance_loss_mlp": 1.01626658, + "epoch": 0.6931008567563505, + "flos": 21544248211200.0, + "grad_norm": 3.2389132753942023, + "language_loss": 0.79482782, + "learning_rate": 9.091600223329952e-07, + "loss": 0.81621867, + "num_input_tokens_seen": 248754530, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.10821533, + "step": 11528, + "time_per_iteration": 2.4372355937957764 + }, + { + "auxiliary_loss_clip": 0.01115413, + "auxiliary_loss_mlp": 0.01030367, + "balance_loss_clip": 1.04438925, + "balance_loss_mlp": 1.01830304, + "epoch": 0.6931609800090185, + "flos": 26250018117120.0, + "grad_norm": 1.4612122252295658, + "language_loss": 0.76224649, + "learning_rate": 9.088336106876491e-07, + "loss": 0.78370428, + "num_input_tokens_seen": 248775825, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.12060547, + "step": 11529, + "time_per_iteration": 3.8766133785247803 + }, + { + "auxiliary_loss_clip": 0.01113211, + "auxiliary_loss_mlp": 0.01040026, + "balance_loss_clip": 1.0407449, + "balance_loss_mlp": 1.02749133, + "epoch": 0.6932211032616865, + "flos": 32343376366080.0, + "grad_norm": 1.9135717153136633, + "language_loss": 0.72275734, + "learning_rate": 9.085072404194436e-07, + "loss": 0.7442897, + "num_input_tokens_seen": 248796180, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.12530518, + "step": 11530, + "time_per_iteration": 2.585841655731201 + }, + { + "auxiliary_loss_clip": 0.01128561, + "auxiliary_loss_mlp": 0.01034694, + "balance_loss_clip": 1.04947793, + "balance_loss_mlp": 1.02065706, + "epoch": 0.6932812265143544, + "flos": 22049079909120.0, + "grad_norm": 1.8323123868740485, + "language_loss": 0.78509247, + "learning_rate": 9.081809115407513e-07, + "loss": 0.80672503, + "num_input_tokens_seen": 248814735, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.14025879, + "step": 11531, + "time_per_iteration": 2.4888908863067627 + }, + { + "auxiliary_loss_clip": 0.01122895, + "auxiliary_loss_mlp": 0.01036783, + "balance_loss_clip": 1.04778719, + "balance_loss_mlp": 1.02486229, + "epoch": 0.6933413497670224, + "flos": 26256626219520.0, + "grad_norm": 1.3777428760606365, + "language_loss": 0.69340259, + "learning_rate": 9.078546240639484e-07, + "loss": 0.71499938, + "num_input_tokens_seen": 248839140, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.11920166, + "step": 11532, + "time_per_iteration": 2.5513575077056885 + }, + { + "auxiliary_loss_clip": 0.01122531, + "auxiliary_loss_mlp": 0.01029007, + "balance_loss_clip": 1.04581535, + "balance_loss_mlp": 1.01634121, + "epoch": 0.6934014730196904, + "flos": 19573003774080.0, + "grad_norm": 2.459123435767563, + "language_loss": 0.67274481, + "learning_rate": 9.075283780014082e-07, + "loss": 0.69426024, + "num_input_tokens_seen": 248858300, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.12664795, + "step": 11533, + "time_per_iteration": 2.428162097930908 + }, + { + "auxiliary_loss_clip": 0.0112047, + "auxiliary_loss_mlp": 0.01035281, + "balance_loss_clip": 1.04536033, + "balance_loss_mlp": 1.02258503, + "epoch": 0.6934615962723584, + "flos": 22119249127680.0, + "grad_norm": 2.2622966030225395, + "language_loss": 0.58519822, + "learning_rate": 9.072021733655007e-07, + "loss": 0.60675573, + "num_input_tokens_seen": 248876310, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12701416, + "step": 11534, + "time_per_iteration": 2.4410552978515625 + }, + { + "auxiliary_loss_clip": 0.01115496, + "auxiliary_loss_mlp": 0.01029719, + "balance_loss_clip": 1.04286003, + "balance_loss_mlp": 1.01722574, + "epoch": 0.6935217195250263, + "flos": 21360816432000.0, + "grad_norm": 2.0624587754777792, + "language_loss": 0.70936096, + "learning_rate": 9.068760101685971e-07, + "loss": 0.73081309, + "num_input_tokens_seen": 248895650, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.12512207, + "step": 11535, + "time_per_iteration": 3.9887077808380127 + }, + { + "auxiliary_loss_clip": 0.01055992, + "auxiliary_loss_mlp": 0.01004997, + "balance_loss_clip": 1.02879655, + "balance_loss_mlp": 1.00331497, + "epoch": 0.6935818427776943, + "flos": 64063813115520.0, + "grad_norm": 0.7087463773791611, + "language_loss": 0.59061694, + "learning_rate": 9.065498884230638e-07, + "loss": 0.61122686, + "num_input_tokens_seen": 248963920, + "router_z_loss_clip": 0.27246094, + "router_z_loss_mlp": 0.01686096, + "step": 11536, + "time_per_iteration": 3.275496482849121 + }, + { + "auxiliary_loss_clip": 0.01122585, + "auxiliary_loss_mlp": 0.01031592, + "balance_loss_clip": 1.04623663, + "balance_loss_mlp": 1.01850307, + "epoch": 0.6936419660303622, + "flos": 20302564913280.0, + "grad_norm": 1.5426227631396392, + "language_loss": 0.72752023, + "learning_rate": 9.062238081412692e-07, + "loss": 0.74906206, + "num_input_tokens_seen": 248983380, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.13098145, + "step": 11537, + "time_per_iteration": 2.446868419647217 + }, + { + "auxiliary_loss_clip": 0.01053227, + "auxiliary_loss_mlp": 0.01001753, + "balance_loss_clip": 1.02787089, + "balance_loss_mlp": 1.00014055, + "epoch": 0.6937020892830302, + "flos": 67182581347200.0, + "grad_norm": 0.7482569003707275, + "language_loss": 0.55554891, + "learning_rate": 9.058977693355767e-07, + "loss": 0.57609868, + "num_input_tokens_seen": 249044680, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.0161438, + "step": 11538, + "time_per_iteration": 3.0597269535064697 + }, + { + "auxiliary_loss_clip": 0.01105557, + "auxiliary_loss_mlp": 0.01031316, + "balance_loss_clip": 1.03710532, + "balance_loss_mlp": 1.02022362, + "epoch": 0.6937622125356981, + "flos": 23878190229120.0, + "grad_norm": 1.752474274166022, + "language_loss": 0.77536875, + "learning_rate": 9.055717720183505e-07, + "loss": 0.79673749, + "num_input_tokens_seen": 249061060, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 0.11077881, + "step": 11539, + "time_per_iteration": 2.4762279987335205 + }, + { + "auxiliary_loss_clip": 0.0111798, + "auxiliary_loss_mlp": 0.01029305, + "balance_loss_clip": 1.04558527, + "balance_loss_mlp": 1.01801574, + "epoch": 0.6938223357883662, + "flos": 28730619365760.0, + "grad_norm": 2.2455217300772214, + "language_loss": 0.63980371, + "learning_rate": 9.05245816201953e-07, + "loss": 0.66127658, + "num_input_tokens_seen": 249081430, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11291504, + "step": 11540, + "time_per_iteration": 2.4873721599578857 + }, + { + "auxiliary_loss_clip": 0.01112182, + "auxiliary_loss_mlp": 0.01028675, + "balance_loss_clip": 1.04105616, + "balance_loss_mlp": 1.01652122, + "epoch": 0.6938824590410341, + "flos": 28655027193600.0, + "grad_norm": 1.6112941088611779, + "language_loss": 0.86876684, + "learning_rate": 9.049199018987437e-07, + "loss": 0.8901754, + "num_input_tokens_seen": 249103020, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.12158203, + "step": 11541, + "time_per_iteration": 3.994807481765747 + }, + { + "auxiliary_loss_clip": 0.01119702, + "auxiliary_loss_mlp": 0.01033919, + "balance_loss_clip": 1.04535556, + "balance_loss_mlp": 1.02205181, + "epoch": 0.6939425822937021, + "flos": 18983062800000.0, + "grad_norm": 2.5847314643062127, + "language_loss": 0.84377372, + "learning_rate": 9.04594029121081e-07, + "loss": 0.86530983, + "num_input_tokens_seen": 249120810, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11871338, + "step": 11542, + "time_per_iteration": 2.5725038051605225 + }, + { + "auxiliary_loss_clip": 0.01116259, + "auxiliary_loss_mlp": 0.01030025, + "balance_loss_clip": 1.0429225, + "balance_loss_mlp": 1.01729965, + "epoch": 0.6940027055463701, + "flos": 23075838178560.0, + "grad_norm": 2.1084220862880954, + "language_loss": 0.75249887, + "learning_rate": 9.04268197881323e-07, + "loss": 0.77396178, + "num_input_tokens_seen": 249138050, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.12731934, + "step": 11543, + "time_per_iteration": 2.463961601257324 + }, + { + "auxiliary_loss_clip": 0.01118012, + "auxiliary_loss_mlp": 0.01034138, + "balance_loss_clip": 1.04702973, + "balance_loss_mlp": 1.02162659, + "epoch": 0.694062828799038, + "flos": 18186564666240.0, + "grad_norm": 1.7372989961394554, + "language_loss": 0.76529247, + "learning_rate": 9.039424081918241e-07, + "loss": 0.78681397, + "num_input_tokens_seen": 249155570, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.12518311, + "step": 11544, + "time_per_iteration": 2.546316385269165 + }, + { + "auxiliary_loss_clip": 0.01122783, + "auxiliary_loss_mlp": 0.01036812, + "balance_loss_clip": 1.04366207, + "balance_loss_mlp": 1.02425885, + "epoch": 0.694122952051706, + "flos": 17821532701440.0, + "grad_norm": 1.7054372542882266, + "language_loss": 0.71076012, + "learning_rate": 9.036166600649388e-07, + "loss": 0.73235607, + "num_input_tokens_seen": 249172960, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.12561035, + "step": 11545, + "time_per_iteration": 2.378488302230835 + }, + { + "auxiliary_loss_clip": 0.01113031, + "auxiliary_loss_mlp": 0.01025322, + "balance_loss_clip": 1.04474986, + "balance_loss_mlp": 1.01482582, + "epoch": 0.694183075304374, + "flos": 21215306436480.0, + "grad_norm": 1.6961031051664348, + "language_loss": 0.79296136, + "learning_rate": 9.0329095351302e-07, + "loss": 0.81434494, + "num_input_tokens_seen": 249192450, + "router_z_loss_clip": 0.68261719, + "router_z_loss_mlp": 0.10491943, + "step": 11546, + "time_per_iteration": 2.4423668384552 + }, + { + "auxiliary_loss_clip": 0.0111727, + "auxiliary_loss_mlp": 0.01031964, + "balance_loss_clip": 1.04404724, + "balance_loss_mlp": 1.0204488, + "epoch": 0.694243198557042, + "flos": 24060508686720.0, + "grad_norm": 1.522757831242518, + "language_loss": 0.78393793, + "learning_rate": 9.029652885484194e-07, + "loss": 0.80543029, + "num_input_tokens_seen": 249214320, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11523438, + "step": 11547, + "time_per_iteration": 2.5266661643981934 + }, + { + "auxiliary_loss_clip": 0.01121105, + "auxiliary_loss_mlp": 0.01039245, + "balance_loss_clip": 1.04744923, + "balance_loss_mlp": 1.02614951, + "epoch": 0.6943033218097099, + "flos": 21141869080320.0, + "grad_norm": 2.0926358930271602, + "language_loss": 0.80895448, + "learning_rate": 9.026396651834834e-07, + "loss": 0.83055794, + "num_input_tokens_seen": 249230925, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.13079834, + "step": 11548, + "time_per_iteration": 2.468334913253784 + }, + { + "auxiliary_loss_clip": 0.01057897, + "auxiliary_loss_mlp": 0.01005638, + "balance_loss_clip": 1.03175044, + "balance_loss_mlp": 1.00420451, + "epoch": 0.6943634450623779, + "flos": 57812015975040.0, + "grad_norm": 0.6945607966319298, + "language_loss": 0.53742909, + "learning_rate": 9.023140834305613e-07, + "loss": 0.55806446, + "num_input_tokens_seen": 249293975, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.01434326, + "step": 11549, + "time_per_iteration": 3.071058750152588 + }, + { + "auxiliary_loss_clip": 0.01114695, + "auxiliary_loss_mlp": 0.0103196, + "balance_loss_clip": 1.04093766, + "balance_loss_mlp": 1.02014601, + "epoch": 0.6944235683150458, + "flos": 30590684231040.0, + "grad_norm": 1.3853149492092491, + "language_loss": 0.73575974, + "learning_rate": 9.01988543302e-07, + "loss": 0.75722629, + "num_input_tokens_seen": 249315285, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.1182251, + "step": 11550, + "time_per_iteration": 2.5127289295196533 + }, + { + "auxiliary_loss_clip": 0.01122393, + "auxiliary_loss_mlp": 0.01033302, + "balance_loss_clip": 1.04799533, + "balance_loss_mlp": 1.02135742, + "epoch": 0.6944836915677138, + "flos": 19719447523200.0, + "grad_norm": 1.7553790862834795, + "language_loss": 0.74163485, + "learning_rate": 9.016630448101425e-07, + "loss": 0.76319182, + "num_input_tokens_seen": 249333505, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.1194458, + "step": 11551, + "time_per_iteration": 2.463886260986328 + }, + { + "auxiliary_loss_clip": 0.01113551, + "auxiliary_loss_mlp": 0.01032199, + "balance_loss_clip": 1.04008245, + "balance_loss_mlp": 1.02072477, + "epoch": 0.6945438148203817, + "flos": 24863579009280.0, + "grad_norm": 1.6777442021303002, + "language_loss": 0.84403747, + "learning_rate": 9.01337587967333e-07, + "loss": 0.86549497, + "num_input_tokens_seen": 249354180, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11468506, + "step": 11552, + "time_per_iteration": 2.512983798980713 + }, + { + "auxiliary_loss_clip": 0.01115154, + "auxiliary_loss_mlp": 0.01033942, + "balance_loss_clip": 1.04251146, + "balance_loss_mlp": 1.02185404, + "epoch": 0.6946039380730498, + "flos": 33326646243840.0, + "grad_norm": 2.526192481990992, + "language_loss": 0.67237788, + "learning_rate": 9.010121727859117e-07, + "loss": 0.69386888, + "num_input_tokens_seen": 249377035, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.12097168, + "step": 11553, + "time_per_iteration": 2.6773812770843506 + }, + { + "auxiliary_loss_clip": 0.01124903, + "auxiliary_loss_mlp": 0.01032191, + "balance_loss_clip": 1.04882574, + "balance_loss_mlp": 1.01921523, + "epoch": 0.6946640613257177, + "flos": 20850956830080.0, + "grad_norm": 1.6798088280475845, + "language_loss": 0.79304177, + "learning_rate": 9.006867992782195e-07, + "loss": 0.81461269, + "num_input_tokens_seen": 249396155, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.12969971, + "step": 11554, + "time_per_iteration": 2.5277390480041504 + }, + { + "auxiliary_loss_clip": 0.01108954, + "auxiliary_loss_mlp": 0.01029565, + "balance_loss_clip": 1.03589714, + "balance_loss_mlp": 1.01743555, + "epoch": 0.6947241845783857, + "flos": 19354846521600.0, + "grad_norm": 2.1590497234617216, + "language_loss": 0.72871733, + "learning_rate": 9.003614674565934e-07, + "loss": 0.75010258, + "num_input_tokens_seen": 249414555, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.12133789, + "step": 11555, + "time_per_iteration": 2.4688122272491455 + }, + { + "auxiliary_loss_clip": 0.01115038, + "auxiliary_loss_mlp": 0.01027084, + "balance_loss_clip": 1.04339051, + "balance_loss_mlp": 1.01576495, + "epoch": 0.6947843078310536, + "flos": 27120240915840.0, + "grad_norm": 3.466181430248338, + "language_loss": 0.77977502, + "learning_rate": 9.000361773333705e-07, + "loss": 0.80119622, + "num_input_tokens_seen": 249433570, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11322021, + "step": 11556, + "time_per_iteration": 2.5051321983337402 + }, + { + "auxiliary_loss_clip": 0.01110015, + "auxiliary_loss_mlp": 0.01030443, + "balance_loss_clip": 1.03844261, + "balance_loss_mlp": 1.01910067, + "epoch": 0.6948444310837216, + "flos": 28585109370240.0, + "grad_norm": 2.122251806725016, + "language_loss": 0.60466003, + "learning_rate": 8.997109289208869e-07, + "loss": 0.62606466, + "num_input_tokens_seen": 249453735, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11340332, + "step": 11557, + "time_per_iteration": 4.003082752227783 + }, + { + "auxiliary_loss_clip": 0.011085, + "auxiliary_loss_mlp": 0.01032986, + "balance_loss_clip": 1.03843069, + "balance_loss_mlp": 1.02158403, + "epoch": 0.6949045543363896, + "flos": 15669262696320.0, + "grad_norm": 1.811514526299844, + "language_loss": 0.85522842, + "learning_rate": 8.993857222314752e-07, + "loss": 0.8766433, + "num_input_tokens_seen": 249470805, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.11407471, + "step": 11558, + "time_per_iteration": 2.5018951892852783 + }, + { + "auxiliary_loss_clip": 0.011225, + "auxiliary_loss_mlp": 0.01029361, + "balance_loss_clip": 1.04611993, + "balance_loss_mlp": 1.01628375, + "epoch": 0.6949646775890576, + "flos": 23259413612160.0, + "grad_norm": 1.7747298338890707, + "language_loss": 0.70524776, + "learning_rate": 8.990605572774664e-07, + "loss": 0.72676635, + "num_input_tokens_seen": 249491150, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.1307373, + "step": 11559, + "time_per_iteration": 2.534424066543579 + }, + { + "auxiliary_loss_clip": 0.01109218, + "auxiliary_loss_mlp": 0.01029105, + "balance_loss_clip": 1.03758729, + "balance_loss_mlp": 1.01689816, + "epoch": 0.6950248008417256, + "flos": 22382546797440.0, + "grad_norm": 2.4869482771568436, + "language_loss": 0.78776085, + "learning_rate": 8.987354340711921e-07, + "loss": 0.80914408, + "num_input_tokens_seen": 249511560, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.12213135, + "step": 11560, + "time_per_iteration": 2.4602997303009033 + }, + { + "auxiliary_loss_clip": 0.01121776, + "auxiliary_loss_mlp": 0.0102925, + "balance_loss_clip": 1.04992414, + "balance_loss_mlp": 1.01829457, + "epoch": 0.6950849240943935, + "flos": 23477355383040.0, + "grad_norm": 3.5402453568684566, + "language_loss": 0.76907891, + "learning_rate": 8.9841035262498e-07, + "loss": 0.79058921, + "num_input_tokens_seen": 249531910, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.10961914, + "step": 11561, + "time_per_iteration": 2.469365358352661 + }, + { + "auxiliary_loss_clip": 0.0111362, + "auxiliary_loss_mlp": 0.01030207, + "balance_loss_clip": 1.04271126, + "balance_loss_mlp": 1.01754117, + "epoch": 0.6951450473470615, + "flos": 17420554200960.0, + "grad_norm": 2.0487671777969787, + "language_loss": 0.79045618, + "learning_rate": 8.980853129511577e-07, + "loss": 0.81189448, + "num_input_tokens_seen": 249550300, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.12677002, + "step": 11562, + "time_per_iteration": 2.398928165435791 + }, + { + "auxiliary_loss_clip": 0.0111571, + "auxiliary_loss_mlp": 0.01039126, + "balance_loss_clip": 1.04196846, + "balance_loss_mlp": 1.0252496, + "epoch": 0.6952051705997294, + "flos": 20485745297280.0, + "grad_norm": 2.266221731382433, + "language_loss": 0.69779861, + "learning_rate": 8.977603150620515e-07, + "loss": 0.719347, + "num_input_tokens_seen": 249567740, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.13885498, + "step": 11563, + "time_per_iteration": 2.431628942489624 + }, + { + "auxiliary_loss_clip": 0.0112439, + "auxiliary_loss_mlp": 0.01026187, + "balance_loss_clip": 1.05376029, + "balance_loss_mlp": 1.01529086, + "epoch": 0.6952652938523974, + "flos": 13989541040640.0, + "grad_norm": 5.751312914284461, + "language_loss": 0.74023527, + "learning_rate": 8.974353589699846e-07, + "loss": 0.76174104, + "num_input_tokens_seen": 249582700, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.10894775, + "step": 11564, + "time_per_iteration": 2.4549500942230225 + }, + { + "auxiliary_loss_clip": 0.01133911, + "auxiliary_loss_mlp": 0.01038009, + "balance_loss_clip": 1.05176592, + "balance_loss_mlp": 1.0227381, + "epoch": 0.6953254171050653, + "flos": 30953956429440.0, + "grad_norm": 2.9938623387301457, + "language_loss": 0.71728849, + "learning_rate": 8.971104446872785e-07, + "loss": 0.73900765, + "num_input_tokens_seen": 249602920, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.15270996, + "step": 11565, + "time_per_iteration": 2.588656187057495 + }, + { + "auxiliary_loss_clip": 0.01053039, + "auxiliary_loss_mlp": 0.01006547, + "balance_loss_clip": 1.02710915, + "balance_loss_mlp": 1.00487256, + "epoch": 0.6953855403577334, + "flos": 61670257499520.0, + "grad_norm": 0.9091167657630965, + "language_loss": 0.58449596, + "learning_rate": 8.96785572226255e-07, + "loss": 0.60509181, + "num_input_tokens_seen": 249660400, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.01675415, + "step": 11566, + "time_per_iteration": 2.9375669956207275 + }, + { + "auxiliary_loss_clip": 0.01126859, + "auxiliary_loss_mlp": 0.01029037, + "balance_loss_clip": 1.04924166, + "balance_loss_mlp": 1.01651955, + "epoch": 0.6954456636104013, + "flos": 23039029716480.0, + "grad_norm": 1.8154365815228428, + "language_loss": 0.73906356, + "learning_rate": 8.964607415992338e-07, + "loss": 0.76062256, + "num_input_tokens_seen": 249679335, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.12512207, + "step": 11567, + "time_per_iteration": 2.4899797439575195 + }, + { + "auxiliary_loss_clip": 0.0111295, + "auxiliary_loss_mlp": 0.01031014, + "balance_loss_clip": 1.042624, + "balance_loss_mlp": 1.01927185, + "epoch": 0.6955057868630693, + "flos": 23918518224000.0, + "grad_norm": 1.3002651103541125, + "language_loss": 0.76712501, + "learning_rate": 8.961359528185313e-07, + "loss": 0.78856468, + "num_input_tokens_seen": 249701805, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.11749268, + "step": 11568, + "time_per_iteration": 2.5031111240386963 + }, + { + "auxiliary_loss_clip": 0.01115884, + "auxiliary_loss_mlp": 0.0103103, + "balance_loss_clip": 1.04406989, + "balance_loss_mlp": 1.01987815, + "epoch": 0.6955659101157372, + "flos": 22594634651520.0, + "grad_norm": 1.7426532155996723, + "language_loss": 0.72724617, + "learning_rate": 8.958112058964649e-07, + "loss": 0.74871528, + "num_input_tokens_seen": 249720550, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11157227, + "step": 11569, + "time_per_iteration": 2.4729602336883545 + }, + { + "auxiliary_loss_clip": 0.01131409, + "auxiliary_loss_mlp": 0.01032295, + "balance_loss_clip": 1.0529685, + "balance_loss_mlp": 1.0199573, + "epoch": 0.6956260333684052, + "flos": 24572523104640.0, + "grad_norm": 1.8011984133252954, + "language_loss": 0.77318853, + "learning_rate": 8.954865008453471e-07, + "loss": 0.79482555, + "num_input_tokens_seen": 249740325, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.12335205, + "step": 11570, + "time_per_iteration": 2.4586071968078613 + }, + { + "auxiliary_loss_clip": 0.01119384, + "auxiliary_loss_mlp": 0.01034094, + "balance_loss_clip": 1.0451529, + "balance_loss_mlp": 1.02200651, + "epoch": 0.6956861566210732, + "flos": 25846058787840.0, + "grad_norm": 2.0474048806745904, + "language_loss": 0.74574339, + "learning_rate": 8.95161837677493e-07, + "loss": 0.76727819, + "num_input_tokens_seen": 249760570, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.12084961, + "step": 11571, + "time_per_iteration": 2.5581812858581543 + }, + { + "auxiliary_loss_clip": 0.01118004, + "auxiliary_loss_mlp": 0.01028271, + "balance_loss_clip": 1.04970205, + "balance_loss_mlp": 1.01708937, + "epoch": 0.6957462798737412, + "flos": 15301393557120.0, + "grad_norm": 1.7154387076915516, + "language_loss": 0.74161553, + "learning_rate": 8.948372164052118e-07, + "loss": 0.76307833, + "num_input_tokens_seen": 249778290, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 0.11181641, + "step": 11572, + "time_per_iteration": 3.883840799331665 + }, + { + "auxiliary_loss_clip": 0.01110175, + "auxiliary_loss_mlp": 0.01024968, + "balance_loss_clip": 1.03800654, + "balance_loss_mlp": 1.01354814, + "epoch": 0.6958064031264092, + "flos": 36246830135040.0, + "grad_norm": 2.145738100225484, + "language_loss": 0.70287651, + "learning_rate": 8.94512637040814e-07, + "loss": 0.72422791, + "num_input_tokens_seen": 249800925, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11425781, + "step": 11573, + "time_per_iteration": 2.5998034477233887 + }, + { + "auxiliary_loss_clip": 0.01113314, + "auxiliary_loss_mlp": 0.01033453, + "balance_loss_clip": 1.03888869, + "balance_loss_mlp": 1.02086461, + "epoch": 0.6958665263790771, + "flos": 19208725994880.0, + "grad_norm": 1.7907056157125678, + "language_loss": 0.75099576, + "learning_rate": 8.941880995966095e-07, + "loss": 0.77246344, + "num_input_tokens_seen": 249820500, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.12597656, + "step": 11574, + "time_per_iteration": 2.4917550086975098 + }, + { + "auxiliary_loss_clip": 0.01115377, + "auxiliary_loss_mlp": 0.01030547, + "balance_loss_clip": 1.03994429, + "balance_loss_mlp": 1.01884687, + "epoch": 0.6959266496317451, + "flos": 21795838047360.0, + "grad_norm": 1.713117549608013, + "language_loss": 0.74705875, + "learning_rate": 8.938636040849014e-07, + "loss": 0.76851797, + "num_input_tokens_seen": 249839845, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.11700439, + "step": 11575, + "time_per_iteration": 2.4312775135040283 + }, + { + "auxiliary_loss_clip": 0.01115135, + "auxiliary_loss_mlp": 0.01031983, + "balance_loss_clip": 1.04126382, + "balance_loss_mlp": 1.01951408, + "epoch": 0.695986772884413, + "flos": 20558248899840.0, + "grad_norm": 2.000832398307576, + "language_loss": 0.79060221, + "learning_rate": 8.935391505179966e-07, + "loss": 0.81207341, + "num_input_tokens_seen": 249857400, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.12469482, + "step": 11576, + "time_per_iteration": 2.438114643096924 + }, + { + "auxiliary_loss_clip": 0.0112264, + "auxiliary_loss_mlp": 0.01029875, + "balance_loss_clip": 1.04726434, + "balance_loss_mlp": 1.01785874, + "epoch": 0.696046896137081, + "flos": 14936217937920.0, + "grad_norm": 2.9331321419618055, + "language_loss": 0.57521534, + "learning_rate": 8.932147389081985e-07, + "loss": 0.59674048, + "num_input_tokens_seen": 249871645, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.12023926, + "step": 11577, + "time_per_iteration": 2.408007860183716 + }, + { + "auxiliary_loss_clip": 0.01110464, + "auxiliary_loss_mlp": 0.01023226, + "balance_loss_clip": 1.04171145, + "balance_loss_mlp": 1.01334381, + "epoch": 0.696107019389749, + "flos": 30740216549760.0, + "grad_norm": 1.4500575176941095, + "language_loss": 0.7681219, + "learning_rate": 8.928903692678081e-07, + "loss": 0.78945875, + "num_input_tokens_seen": 249894215, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.09881592, + "step": 11578, + "time_per_iteration": 3.95572829246521 + }, + { + "auxiliary_loss_clip": 0.01113771, + "auxiliary_loss_mlp": 0.01031861, + "balance_loss_clip": 1.04193008, + "balance_loss_mlp": 1.01989818, + "epoch": 0.696167142642417, + "flos": 20776729374720.0, + "grad_norm": 2.0291720965007722, + "language_loss": 0.79646862, + "learning_rate": 8.925660416091254e-07, + "loss": 0.81792504, + "num_input_tokens_seen": 249912850, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11950684, + "step": 11579, + "time_per_iteration": 2.4266531467437744 + }, + { + "auxiliary_loss_clip": 0.01107806, + "auxiliary_loss_mlp": 0.01026888, + "balance_loss_clip": 1.03792191, + "balance_loss_mlp": 1.01550341, + "epoch": 0.6962272658950849, + "flos": 22565152563840.0, + "grad_norm": 2.3685972168248184, + "language_loss": 0.72572422, + "learning_rate": 8.922417559444502e-07, + "loss": 0.74707115, + "num_input_tokens_seen": 249932650, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.11383057, + "step": 11580, + "time_per_iteration": 2.493643045425415 + }, + { + "auxiliary_loss_clip": 0.01117975, + "auxiliary_loss_mlp": 0.01030092, + "balance_loss_clip": 1.04347777, + "balance_loss_mlp": 1.01779556, + "epoch": 0.6962873891477529, + "flos": 22200156512640.0, + "grad_norm": 3.445169101713193, + "language_loss": 0.65254682, + "learning_rate": 8.919175122860787e-07, + "loss": 0.67402744, + "num_input_tokens_seen": 249951205, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.12298584, + "step": 11581, + "time_per_iteration": 2.4509334564208984 + }, + { + "auxiliary_loss_clip": 0.01109131, + "auxiliary_loss_mlp": 0.01027835, + "balance_loss_clip": 1.03713441, + "balance_loss_mlp": 1.01658118, + "epoch": 0.6963475124004208, + "flos": 12489695717760.0, + "grad_norm": 2.717575864443496, + "language_loss": 0.76680267, + "learning_rate": 8.915933106463056e-07, + "loss": 0.78817236, + "num_input_tokens_seen": 249967045, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11248779, + "step": 11582, + "time_per_iteration": 2.5484554767608643 + }, + { + "auxiliary_loss_clip": 0.01112022, + "auxiliary_loss_mlp": 0.0102664, + "balance_loss_clip": 1.03981328, + "balance_loss_mlp": 1.01577377, + "epoch": 0.6964076356530888, + "flos": 17165085696000.0, + "grad_norm": 1.8098347566235837, + "language_loss": 0.70105076, + "learning_rate": 8.91269151037425e-07, + "loss": 0.72243738, + "num_input_tokens_seen": 249984565, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.10858154, + "step": 11583, + "time_per_iteration": 2.413325548171997 + }, + { + "auxiliary_loss_clip": 0.01122938, + "auxiliary_loss_mlp": 0.01031136, + "balance_loss_clip": 1.04769862, + "balance_loss_mlp": 1.0188576, + "epoch": 0.6964677589057569, + "flos": 19937317466880.0, + "grad_norm": 2.176340960239697, + "language_loss": 0.82199103, + "learning_rate": 8.909450334717301e-07, + "loss": 0.84353173, + "num_input_tokens_seen": 250004235, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.1227417, + "step": 11584, + "time_per_iteration": 2.5145773887634277 + }, + { + "auxiliary_loss_clip": 0.01115018, + "auxiliary_loss_mlp": 0.0103571, + "balance_loss_clip": 1.04185319, + "balance_loss_mlp": 1.02250147, + "epoch": 0.6965278821584248, + "flos": 22784064001920.0, + "grad_norm": 2.6533467281347147, + "language_loss": 0.79965174, + "learning_rate": 8.906209579615107e-07, + "loss": 0.82115906, + "num_input_tokens_seen": 250017645, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.13220215, + "step": 11585, + "time_per_iteration": 3.8832945823669434 + }, + { + "auxiliary_loss_clip": 0.01117015, + "auxiliary_loss_mlp": 0.01029361, + "balance_loss_clip": 1.0456804, + "balance_loss_mlp": 1.01830399, + "epoch": 0.6965880054110928, + "flos": 20047563285120.0, + "grad_norm": 1.6758175726195026, + "language_loss": 0.77847087, + "learning_rate": 8.90296924519055e-07, + "loss": 0.79993463, + "num_input_tokens_seen": 250037640, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.1105957, + "step": 11586, + "time_per_iteration": 2.4423744678497314 + }, + { + "auxiliary_loss_clip": 0.0111039, + "auxiliary_loss_mlp": 0.01028342, + "balance_loss_clip": 1.04330218, + "balance_loss_mlp": 1.01789331, + "epoch": 0.6966481286637607, + "flos": 21908238681600.0, + "grad_norm": 1.71717075559319, + "language_loss": 0.78570068, + "learning_rate": 8.899729331566519e-07, + "loss": 0.80708802, + "num_input_tokens_seen": 250056490, + "router_z_loss_clip": 0.67089844, + "router_z_loss_mlp": 0.10443115, + "step": 11587, + "time_per_iteration": 2.61259388923645 + }, + { + "auxiliary_loss_clip": 0.01115042, + "auxiliary_loss_mlp": 0.0102846, + "balance_loss_clip": 1.04659581, + "balance_loss_mlp": 1.01680183, + "epoch": 0.6967082519164287, + "flos": 15633172506240.0, + "grad_norm": 2.1452337737409275, + "language_loss": 0.72913188, + "learning_rate": 8.896489838865857e-07, + "loss": 0.75056696, + "num_input_tokens_seen": 250074285, + "router_z_loss_clip": 0.68457031, + "router_z_loss_mlp": 0.11663818, + "step": 11588, + "time_per_iteration": 2.445605754852295 + }, + { + "auxiliary_loss_clip": 0.01114829, + "auxiliary_loss_mlp": 0.01025534, + "balance_loss_clip": 1.04263222, + "balance_loss_mlp": 1.01484692, + "epoch": 0.6967683751690966, + "flos": 24024598064640.0, + "grad_norm": 1.7891772632427463, + "language_loss": 0.75414956, + "learning_rate": 8.893250767211413e-07, + "loss": 0.77555323, + "num_input_tokens_seen": 250093350, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.10687256, + "step": 11589, + "time_per_iteration": 2.4769792556762695 + }, + { + "auxiliary_loss_clip": 0.01123782, + "auxiliary_loss_mlp": 0.01031422, + "balance_loss_clip": 1.05035019, + "balance_loss_mlp": 1.02045465, + "epoch": 0.6968284984217646, + "flos": 31024700265600.0, + "grad_norm": 1.8894733835881803, + "language_loss": 0.63330865, + "learning_rate": 8.890012116726012e-07, + "loss": 0.65486068, + "num_input_tokens_seen": 250114170, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.10968018, + "step": 11590, + "time_per_iteration": 2.508732318878174 + }, + { + "auxiliary_loss_clip": 0.01066312, + "auxiliary_loss_mlp": 0.01003859, + "balance_loss_clip": 1.04172611, + "balance_loss_mlp": 1.00247777, + "epoch": 0.6968886216744326, + "flos": 67622990002560.0, + "grad_norm": 0.7548037938633468, + "language_loss": 0.61239779, + "learning_rate": 8.88677388753248e-07, + "loss": 0.63309956, + "num_input_tokens_seen": 250178250, + "router_z_loss_clip": 0.24584961, + "router_z_loss_mlp": 0.01382446, + "step": 11591, + "time_per_iteration": 3.1415979862213135 + }, + { + "auxiliary_loss_clip": 0.01116289, + "auxiliary_loss_mlp": 0.01037317, + "balance_loss_clip": 1.04228544, + "balance_loss_mlp": 1.02384019, + "epoch": 0.6969487449271006, + "flos": 24863686750080.0, + "grad_norm": 1.6741177455569378, + "language_loss": 0.69036722, + "learning_rate": 8.883536079753582e-07, + "loss": 0.71190333, + "num_input_tokens_seen": 250198420, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.13482666, + "step": 11592, + "time_per_iteration": 2.494662046432495 + }, + { + "auxiliary_loss_clip": 0.01115391, + "auxiliary_loss_mlp": 0.01025141, + "balance_loss_clip": 1.04386735, + "balance_loss_mlp": 1.01437688, + "epoch": 0.6970088681797685, + "flos": 28767858791040.0, + "grad_norm": 1.7583830321887144, + "language_loss": 0.62088323, + "learning_rate": 8.880298693512109e-07, + "loss": 0.64228863, + "num_input_tokens_seen": 250220650, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.10760498, + "step": 11593, + "time_per_iteration": 2.570432662963867 + }, + { + "auxiliary_loss_clip": 0.0110662, + "auxiliary_loss_mlp": 0.01026137, + "balance_loss_clip": 1.039217, + "balance_loss_mlp": 1.01424003, + "epoch": 0.6970689914324365, + "flos": 27308556944640.0, + "grad_norm": 1.341202923325052, + "language_loss": 0.54117996, + "learning_rate": 8.877061728930832e-07, + "loss": 0.56250751, + "num_input_tokens_seen": 250241750, + "router_z_loss_clip": 0.67382812, + "router_z_loss_mlp": 0.11895752, + "step": 11594, + "time_per_iteration": 2.534510850906372 + }, + { + "auxiliary_loss_clip": 0.01112509, + "auxiliary_loss_mlp": 0.01030674, + "balance_loss_clip": 1.04027271, + "balance_loss_mlp": 1.01821065, + "epoch": 0.6971291146851044, + "flos": 19136258305920.0, + "grad_norm": 2.3737257576933377, + "language_loss": 0.76805556, + "learning_rate": 8.87382518613248e-07, + "loss": 0.78948736, + "num_input_tokens_seen": 250259445, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.12463379, + "step": 11595, + "time_per_iteration": 2.4394946098327637 + }, + { + "auxiliary_loss_clip": 0.01118795, + "auxiliary_loss_mlp": 0.01030346, + "balance_loss_clip": 1.04385138, + "balance_loss_mlp": 1.01794839, + "epoch": 0.6971892379377724, + "flos": 14610508387200.0, + "grad_norm": 3.225121904602921, + "language_loss": 0.71775711, + "learning_rate": 8.870589065239793e-07, + "loss": 0.73924851, + "num_input_tokens_seen": 250275640, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.12408447, + "step": 11596, + "time_per_iteration": 2.416841506958008 + }, + { + "auxiliary_loss_clip": 0.01120208, + "auxiliary_loss_mlp": 0.01027706, + "balance_loss_clip": 1.04823482, + "balance_loss_mlp": 1.01596963, + "epoch": 0.6972493611904405, + "flos": 22307457415680.0, + "grad_norm": 1.7165731879318025, + "language_loss": 0.76203889, + "learning_rate": 8.867353366375492e-07, + "loss": 0.78351808, + "num_input_tokens_seen": 250296435, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11730957, + "step": 11597, + "time_per_iteration": 2.5223236083984375 + }, + { + "auxiliary_loss_clip": 0.01120168, + "auxiliary_loss_mlp": 0.01028575, + "balance_loss_clip": 1.04735053, + "balance_loss_mlp": 1.01747108, + "epoch": 0.6973094844431084, + "flos": 17420374632960.0, + "grad_norm": 1.8493416673341019, + "language_loss": 0.75004894, + "learning_rate": 8.864118089662267e-07, + "loss": 0.77153635, + "num_input_tokens_seen": 250314035, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11096191, + "step": 11598, + "time_per_iteration": 2.4542503356933594 + }, + { + "auxiliary_loss_clip": 0.0112063, + "auxiliary_loss_mlp": 0.01036656, + "balance_loss_clip": 1.04299772, + "balance_loss_mlp": 1.0232451, + "epoch": 0.6973696076957764, + "flos": 27235370983680.0, + "grad_norm": 1.7651735191764615, + "language_loss": 0.8921355, + "learning_rate": 8.860883235222791e-07, + "loss": 0.91370839, + "num_input_tokens_seen": 250332995, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.13433838, + "step": 11599, + "time_per_iteration": 2.540708541870117 + }, + { + "auxiliary_loss_clip": 0.01126878, + "auxiliary_loss_mlp": 0.01034771, + "balance_loss_clip": 1.04843569, + "balance_loss_mlp": 1.02167547, + "epoch": 0.6974297309484443, + "flos": 22018089450240.0, + "grad_norm": 2.0733735296308247, + "language_loss": 0.69984305, + "learning_rate": 8.85764880317974e-07, + "loss": 0.72145951, + "num_input_tokens_seen": 250352120, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.13098145, + "step": 11600, + "time_per_iteration": 2.482071876525879 + }, + { + "auxiliary_loss_clip": 0.01129316, + "auxiliary_loss_mlp": 0.01030379, + "balance_loss_clip": 1.05213857, + "balance_loss_mlp": 1.0186789, + "epoch": 0.6974898542011123, + "flos": 28366449327360.0, + "grad_norm": 1.8290635317099269, + "language_loss": 0.7677325, + "learning_rate": 8.854414793655771e-07, + "loss": 0.78932941, + "num_input_tokens_seen": 250371705, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.11700439, + "step": 11601, + "time_per_iteration": 3.913602590560913 + }, + { + "auxiliary_loss_clip": 0.01128292, + "auxiliary_loss_mlp": 0.01028551, + "balance_loss_clip": 1.0560658, + "balance_loss_mlp": 1.01776862, + "epoch": 0.6975499774537802, + "flos": 15232050351360.0, + "grad_norm": 1.9140862056633228, + "language_loss": 0.72362626, + "learning_rate": 8.851181206773508e-07, + "loss": 0.74519467, + "num_input_tokens_seen": 250390485, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.10797119, + "step": 11602, + "time_per_iteration": 2.629884719848633 + }, + { + "auxiliary_loss_clip": 0.01117511, + "auxiliary_loss_mlp": 0.01033876, + "balance_loss_clip": 1.04567432, + "balance_loss_mlp": 1.02354598, + "epoch": 0.6976101007064482, + "flos": 22157422306560.0, + "grad_norm": 2.3843955372290235, + "language_loss": 0.76462716, + "learning_rate": 8.847948042655567e-07, + "loss": 0.78614098, + "num_input_tokens_seen": 250407020, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.10333252, + "step": 11603, + "time_per_iteration": 2.5634191036224365 + }, + { + "auxiliary_loss_clip": 0.01116976, + "auxiliary_loss_mlp": 0.01030569, + "balance_loss_clip": 1.04530907, + "balance_loss_mlp": 1.01915431, + "epoch": 0.6976702239591162, + "flos": 22273522041600.0, + "grad_norm": 1.5424584261549643, + "language_loss": 0.62223047, + "learning_rate": 8.844715301424557e-07, + "loss": 0.6437059, + "num_input_tokens_seen": 250425880, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11407471, + "step": 11604, + "time_per_iteration": 2.462001085281372 + }, + { + "auxiliary_loss_clip": 0.01116005, + "auxiliary_loss_mlp": 0.01028391, + "balance_loss_clip": 1.04301381, + "balance_loss_mlp": 1.01547456, + "epoch": 0.6977303472117842, + "flos": 25848608653440.0, + "grad_norm": 2.2078667173913966, + "language_loss": 0.81510377, + "learning_rate": 8.841482983203057e-07, + "loss": 0.83654773, + "num_input_tokens_seen": 250442925, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.12902832, + "step": 11605, + "time_per_iteration": 2.4892160892486572 + }, + { + "auxiliary_loss_clip": 0.01119444, + "auxiliary_loss_mlp": 0.01030086, + "balance_loss_clip": 1.04753923, + "balance_loss_mlp": 1.01868975, + "epoch": 0.6977904704644521, + "flos": 20959586536320.0, + "grad_norm": 1.7393175784740886, + "language_loss": 0.70446604, + "learning_rate": 8.838251088113638e-07, + "loss": 0.72596133, + "num_input_tokens_seen": 250461220, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11395264, + "step": 11606, + "time_per_iteration": 2.4509944915771484 + }, + { + "auxiliary_loss_clip": 0.0111201, + "auxiliary_loss_mlp": 0.01032735, + "balance_loss_clip": 1.03861475, + "balance_loss_mlp": 1.02078414, + "epoch": 0.6978505937171201, + "flos": 22055041566720.0, + "grad_norm": 2.0157478133320548, + "language_loss": 0.82595825, + "learning_rate": 8.835019616278856e-07, + "loss": 0.84740567, + "num_input_tokens_seen": 250480975, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11962891, + "step": 11607, + "time_per_iteration": 2.4859347343444824 + }, + { + "auxiliary_loss_clip": 0.01121897, + "auxiliary_loss_mlp": 0.01033235, + "balance_loss_clip": 1.04329097, + "balance_loss_mlp": 1.0196631, + "epoch": 0.697910716969788, + "flos": 20043720529920.0, + "grad_norm": 4.023232864938195, + "language_loss": 0.79079592, + "learning_rate": 8.831788567821265e-07, + "loss": 0.81234723, + "num_input_tokens_seen": 250497980, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.13574219, + "step": 11608, + "time_per_iteration": 2.441758632659912 + }, + { + "auxiliary_loss_clip": 0.01124686, + "auxiliary_loss_mlp": 0.0102937, + "balance_loss_clip": 1.05028272, + "balance_loss_mlp": 1.01807475, + "epoch": 0.697970840222456, + "flos": 15888245961600.0, + "grad_norm": 2.08510658040254, + "language_loss": 0.9025172, + "learning_rate": 8.828557942863357e-07, + "loss": 0.92405778, + "num_input_tokens_seen": 250511910, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11297607, + "step": 11609, + "time_per_iteration": 2.448852300643921 + }, + { + "auxiliary_loss_clip": 0.01116756, + "auxiliary_loss_mlp": 0.01028042, + "balance_loss_clip": 1.0431782, + "balance_loss_mlp": 1.01631165, + "epoch": 0.698030963475124, + "flos": 21215629658880.0, + "grad_norm": 2.8973046156937654, + "language_loss": 0.63995796, + "learning_rate": 8.82532774152765e-07, + "loss": 0.66140592, + "num_input_tokens_seen": 250531090, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11724854, + "step": 11610, + "time_per_iteration": 2.4198684692382812 + }, + { + "auxiliary_loss_clip": 0.01109111, + "auxiliary_loss_mlp": 0.01030836, + "balance_loss_clip": 1.03839374, + "balance_loss_mlp": 1.01856387, + "epoch": 0.698091086727792, + "flos": 33759728524800.0, + "grad_norm": 2.0329241838593983, + "language_loss": 0.84818506, + "learning_rate": 8.822097963936643e-07, + "loss": 0.86958456, + "num_input_tokens_seen": 250551565, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.12286377, + "step": 11611, + "time_per_iteration": 2.5714173316955566 + }, + { + "auxiliary_loss_clip": 0.01119276, + "auxiliary_loss_mlp": 0.01035651, + "balance_loss_clip": 1.04284513, + "balance_loss_mlp": 1.02334237, + "epoch": 0.69815120998046, + "flos": 15887850912000.0, + "grad_norm": 2.0250586823076575, + "language_loss": 0.70770192, + "learning_rate": 8.818868610212793e-07, + "loss": 0.72925115, + "num_input_tokens_seen": 250569625, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.12298584, + "step": 11612, + "time_per_iteration": 2.427257537841797 + }, + { + "auxiliary_loss_clip": 0.01117756, + "auxiliary_loss_mlp": 0.01026634, + "balance_loss_clip": 1.04642153, + "balance_loss_mlp": 1.01514268, + "epoch": 0.6982113332331279, + "flos": 18947044437120.0, + "grad_norm": 1.537321949160791, + "language_loss": 0.80941868, + "learning_rate": 8.815639680478573e-07, + "loss": 0.83086264, + "num_input_tokens_seen": 250586960, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.11499023, + "step": 11613, + "time_per_iteration": 2.475480556488037 + }, + { + "auxiliary_loss_clip": 0.01123361, + "auxiliary_loss_mlp": 0.01031343, + "balance_loss_clip": 1.05217004, + "balance_loss_mlp": 1.02036965, + "epoch": 0.6982714564857959, + "flos": 24389594115840.0, + "grad_norm": 2.1120908484606975, + "language_loss": 0.75641674, + "learning_rate": 8.812411174856411e-07, + "loss": 0.77796376, + "num_input_tokens_seen": 250605080, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.10968018, + "step": 11614, + "time_per_iteration": 2.462827444076538 + }, + { + "auxiliary_loss_clip": 0.01116542, + "auxiliary_loss_mlp": 0.0103025, + "balance_loss_clip": 1.04429984, + "balance_loss_mlp": 1.01824546, + "epoch": 0.6983315797384638, + "flos": 20083725302400.0, + "grad_norm": 2.31039662353789, + "language_loss": 0.76927018, + "learning_rate": 8.809183093468746e-07, + "loss": 0.79073811, + "num_input_tokens_seen": 250623965, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11993408, + "step": 11615, + "time_per_iteration": 2.460800886154175 + }, + { + "auxiliary_loss_clip": 0.01121874, + "auxiliary_loss_mlp": 0.01030169, + "balance_loss_clip": 1.04956627, + "balance_loss_mlp": 1.01899922, + "epoch": 0.6983917029911318, + "flos": 13512431664000.0, + "grad_norm": 3.44486675969499, + "language_loss": 0.72978801, + "learning_rate": 8.80595543643797e-07, + "loss": 0.7513085, + "num_input_tokens_seen": 250640675, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11175537, + "step": 11616, + "time_per_iteration": 2.413757085800171 + }, + { + "auxiliary_loss_clip": 0.01115739, + "auxiliary_loss_mlp": 0.01033692, + "balance_loss_clip": 1.04557312, + "balance_loss_mlp": 1.02120531, + "epoch": 0.6984518262437998, + "flos": 22018412672640.0, + "grad_norm": 1.570922581662736, + "language_loss": 0.84188122, + "learning_rate": 8.802728203886487e-07, + "loss": 0.86337554, + "num_input_tokens_seen": 250660295, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.12481689, + "step": 11617, + "time_per_iteration": 3.965127468109131 + }, + { + "auxiliary_loss_clip": 0.01122934, + "auxiliary_loss_mlp": 0.01038729, + "balance_loss_clip": 1.04689157, + "balance_loss_mlp": 1.02617013, + "epoch": 0.6985119494964678, + "flos": 18770615809920.0, + "grad_norm": 2.6386805255861314, + "language_loss": 0.59630185, + "learning_rate": 8.799501395936682e-07, + "loss": 0.61791849, + "num_input_tokens_seen": 250678155, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.12554932, + "step": 11618, + "time_per_iteration": 2.4492311477661133 + }, + { + "auxiliary_loss_clip": 0.0111708, + "auxiliary_loss_mlp": 0.01035451, + "balance_loss_clip": 1.04423726, + "balance_loss_mlp": 1.02446616, + "epoch": 0.6985720727491357, + "flos": 22382834106240.0, + "grad_norm": 2.865636776835882, + "language_loss": 0.8302961, + "learning_rate": 8.796275012710903e-07, + "loss": 0.85182142, + "num_input_tokens_seen": 250697230, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.10980225, + "step": 11619, + "time_per_iteration": 2.4579880237579346 + }, + { + "auxiliary_loss_clip": 0.01107425, + "auxiliary_loss_mlp": 0.01028373, + "balance_loss_clip": 1.03933525, + "balance_loss_mlp": 1.01808548, + "epoch": 0.6986321960018037, + "flos": 39567884785920.0, + "grad_norm": 2.1663382040421317, + "language_loss": 0.67260474, + "learning_rate": 8.793049054331494e-07, + "loss": 0.69396275, + "num_input_tokens_seen": 250719865, + "router_z_loss_clip": 0.68066406, + "router_z_loss_mlp": 0.10284424, + "step": 11620, + "time_per_iteration": 2.6586673259735107 + }, + { + "auxiliary_loss_clip": 0.01117612, + "auxiliary_loss_mlp": 0.01030136, + "balance_loss_clip": 1.04378057, + "balance_loss_mlp": 1.01802397, + "epoch": 0.6986923192544716, + "flos": 17967725055360.0, + "grad_norm": 3.0057389713325118, + "language_loss": 0.73057461, + "learning_rate": 8.789823520920794e-07, + "loss": 0.75205207, + "num_input_tokens_seen": 250736565, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.12115479, + "step": 11621, + "time_per_iteration": 2.424372434616089 + }, + { + "auxiliary_loss_clip": 0.01115158, + "auxiliary_loss_mlp": 0.01037358, + "balance_loss_clip": 1.03905678, + "balance_loss_mlp": 1.02530038, + "epoch": 0.6987524425071396, + "flos": 25594325297280.0, + "grad_norm": 1.6687804436752527, + "language_loss": 0.6863451, + "learning_rate": 8.7865984126011e-07, + "loss": 0.70787024, + "num_input_tokens_seen": 250757235, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.1206665, + "step": 11622, + "time_per_iteration": 3.9083330631256104 + }, + { + "auxiliary_loss_clip": 0.01106113, + "auxiliary_loss_mlp": 0.01030008, + "balance_loss_clip": 1.0375371, + "balance_loss_mlp": 1.0182538, + "epoch": 0.6988125657598077, + "flos": 17530081747200.0, + "grad_norm": 1.703487398750593, + "language_loss": 0.62927347, + "learning_rate": 8.783373729494721e-07, + "loss": 0.65063465, + "num_input_tokens_seen": 250775585, + "router_z_loss_clip": 0.68603516, + "router_z_loss_mlp": 0.11743164, + "step": 11623, + "time_per_iteration": 2.4231159687042236 + }, + { + "auxiliary_loss_clip": 0.0111375, + "auxiliary_loss_mlp": 0.01027328, + "balance_loss_clip": 1.038661, + "balance_loss_mlp": 1.01529336, + "epoch": 0.6988726890124756, + "flos": 39165721136640.0, + "grad_norm": 1.97679795724098, + "language_loss": 0.6067977, + "learning_rate": 8.780149471723932e-07, + "loss": 0.62820852, + "num_input_tokens_seen": 250795725, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12036133, + "step": 11624, + "time_per_iteration": 2.6273396015167236 + }, + { + "auxiliary_loss_clip": 0.01112434, + "auxiliary_loss_mlp": 0.01034677, + "balance_loss_clip": 1.03734934, + "balance_loss_mlp": 1.02210021, + "epoch": 0.6989328122651436, + "flos": 20193468330240.0, + "grad_norm": 1.6353730822787673, + "language_loss": 0.78401661, + "learning_rate": 8.776925639411017e-07, + "loss": 0.80548775, + "num_input_tokens_seen": 250814555, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.12573242, + "step": 11625, + "time_per_iteration": 2.466822624206543 + }, + { + "auxiliary_loss_clip": 0.0111471, + "auxiliary_loss_mlp": 0.01031696, + "balance_loss_clip": 1.04353142, + "balance_loss_mlp": 1.02059782, + "epoch": 0.6989929355178115, + "flos": 21834873152640.0, + "grad_norm": 1.8454531238876588, + "language_loss": 0.66133857, + "learning_rate": 8.773702232678188e-07, + "loss": 0.68280268, + "num_input_tokens_seen": 250833105, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.11102295, + "step": 11626, + "time_per_iteration": 2.4831085205078125 + }, + { + "auxiliary_loss_clip": 0.01112449, + "auxiliary_loss_mlp": 0.01029477, + "balance_loss_clip": 1.04050708, + "balance_loss_mlp": 1.0174371, + "epoch": 0.6990530587704795, + "flos": 26322880855680.0, + "grad_norm": 2.130288904880971, + "language_loss": 0.70137191, + "learning_rate": 8.770479251647697e-07, + "loss": 0.72279114, + "num_input_tokens_seen": 250852570, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.12030029, + "step": 11627, + "time_per_iteration": 2.5023727416992188 + }, + { + "auxiliary_loss_clip": 0.011149, + "auxiliary_loss_mlp": 0.01023033, + "balance_loss_clip": 1.04597354, + "balance_loss_mlp": 1.01320457, + "epoch": 0.6991131820231474, + "flos": 19828975069440.0, + "grad_norm": 1.8403968207081955, + "language_loss": 0.6204108, + "learning_rate": 8.767256696441768e-07, + "loss": 0.64179015, + "num_input_tokens_seen": 250870500, + "router_z_loss_clip": 0.68896484, + "router_z_loss_mlp": 0.0982666, + "step": 11628, + "time_per_iteration": 2.4472155570983887 + }, + { + "auxiliary_loss_clip": 0.01117505, + "auxiliary_loss_mlp": 0.01032791, + "balance_loss_clip": 1.04364252, + "balance_loss_mlp": 1.02110839, + "epoch": 0.6991733052758154, + "flos": 33984817102080.0, + "grad_norm": 2.4798339156803757, + "language_loss": 0.68377769, + "learning_rate": 8.764034567182581e-07, + "loss": 0.70528066, + "num_input_tokens_seen": 250892745, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11688232, + "step": 11629, + "time_per_iteration": 3.9904603958129883 + }, + { + "auxiliary_loss_clip": 0.01115911, + "auxiliary_loss_mlp": 0.01029296, + "balance_loss_clip": 1.04317307, + "balance_loss_mlp": 1.0167731, + "epoch": 0.6992334285284834, + "flos": 15633136592640.0, + "grad_norm": 1.6886128578920492, + "language_loss": 0.72657633, + "learning_rate": 8.760812863992337e-07, + "loss": 0.74802834, + "num_input_tokens_seen": 250910225, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.12518311, + "step": 11630, + "time_per_iteration": 2.409186601638794 + }, + { + "auxiliary_loss_clip": 0.01118604, + "auxiliary_loss_mlp": 0.01033329, + "balance_loss_clip": 1.04721522, + "balance_loss_mlp": 1.0214318, + "epoch": 0.6992935517811514, + "flos": 21726279360000.0, + "grad_norm": 1.647103489986064, + "language_loss": 0.74132872, + "learning_rate": 8.757591586993196e-07, + "loss": 0.76284802, + "num_input_tokens_seen": 250929715, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11895752, + "step": 11631, + "time_per_iteration": 2.507761001586914 + }, + { + "auxiliary_loss_clip": 0.01125857, + "auxiliary_loss_mlp": 0.01028576, + "balance_loss_clip": 1.04976773, + "balance_loss_mlp": 1.01563025, + "epoch": 0.6993536750338193, + "flos": 20115254465280.0, + "grad_norm": 2.1228742418342117, + "language_loss": 0.89407659, + "learning_rate": 8.7543707363073e-07, + "loss": 0.91562092, + "num_input_tokens_seen": 250944230, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.1295166, + "step": 11632, + "time_per_iteration": 2.4277963638305664 + }, + { + "auxiliary_loss_clip": 0.0111769, + "auxiliary_loss_mlp": 0.01029817, + "balance_loss_clip": 1.04589796, + "balance_loss_mlp": 1.01872492, + "epoch": 0.6994137982864873, + "flos": 22010547594240.0, + "grad_norm": 2.0273494345493455, + "language_loss": 0.80129099, + "learning_rate": 8.751150312056792e-07, + "loss": 0.82276607, + "num_input_tokens_seen": 250961865, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11090088, + "step": 11633, + "time_per_iteration": 2.491116762161255 + }, + { + "auxiliary_loss_clip": 0.01120208, + "auxiliary_loss_mlp": 0.01033251, + "balance_loss_clip": 1.04385495, + "balance_loss_mlp": 1.02005482, + "epoch": 0.6994739215391552, + "flos": 25519020433920.0, + "grad_norm": 1.8707876969156647, + "language_loss": 0.67089105, + "learning_rate": 8.747930314363794e-07, + "loss": 0.69242567, + "num_input_tokens_seen": 250982025, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.13201904, + "step": 11634, + "time_per_iteration": 2.4471476078033447 + }, + { + "auxiliary_loss_clip": 0.01051655, + "auxiliary_loss_mlp": 0.01006831, + "balance_loss_clip": 1.02547157, + "balance_loss_mlp": 1.00521386, + "epoch": 0.6995340447918232, + "flos": 59128357691520.0, + "grad_norm": 0.7168036467428418, + "language_loss": 0.53136289, + "learning_rate": 8.744710743350412e-07, + "loss": 0.55194777, + "num_input_tokens_seen": 251046900, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.01617432, + "step": 11635, + "time_per_iteration": 3.2036328315734863 + }, + { + "auxiliary_loss_clip": 0.01117749, + "auxiliary_loss_mlp": 0.0102893, + "balance_loss_clip": 1.04697084, + "balance_loss_mlp": 1.0170213, + "epoch": 0.6995941680444913, + "flos": 17967832796160.0, + "grad_norm": 1.5758382677778304, + "language_loss": 0.81895095, + "learning_rate": 8.741491599138726e-07, + "loss": 0.84041774, + "num_input_tokens_seen": 251065050, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11920166, + "step": 11636, + "time_per_iteration": 2.4113755226135254 + }, + { + "auxiliary_loss_clip": 0.01115574, + "auxiliary_loss_mlp": 0.01027979, + "balance_loss_clip": 1.042171, + "balance_loss_mlp": 1.01576018, + "epoch": 0.6996542912971592, + "flos": 21980095839360.0, + "grad_norm": 1.9525169424224635, + "language_loss": 0.83045602, + "learning_rate": 8.738272881850801e-07, + "loss": 0.85189164, + "num_input_tokens_seen": 251083355, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.12219238, + "step": 11637, + "time_per_iteration": 2.4634511470794678 + }, + { + "auxiliary_loss_clip": 0.01113923, + "auxiliary_loss_mlp": 0.01038905, + "balance_loss_clip": 1.04191637, + "balance_loss_mlp": 1.02554774, + "epoch": 0.6997144145498272, + "flos": 11686158518400.0, + "grad_norm": 2.038975631371429, + "language_loss": 0.67955446, + "learning_rate": 8.735054591608704e-07, + "loss": 0.70108271, + "num_input_tokens_seen": 251096420, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.13342285, + "step": 11638, + "time_per_iteration": 2.438729763031006 + }, + { + "auxiliary_loss_clip": 0.01119585, + "auxiliary_loss_mlp": 0.01032282, + "balance_loss_clip": 1.04319382, + "balance_loss_mlp": 1.01953888, + "epoch": 0.6997745378024951, + "flos": 29607162958080.0, + "grad_norm": 4.13644013628432, + "language_loss": 0.77904725, + "learning_rate": 8.731836728534459e-07, + "loss": 0.80056596, + "num_input_tokens_seen": 251115410, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.12744141, + "step": 11639, + "time_per_iteration": 2.528813600540161 + }, + { + "auxiliary_loss_clip": 0.01118134, + "auxiliary_loss_mlp": 0.01035685, + "balance_loss_clip": 1.04519546, + "balance_loss_mlp": 1.02357316, + "epoch": 0.6998346610551631, + "flos": 20886616056960.0, + "grad_norm": 1.9777256569884964, + "language_loss": 0.82516438, + "learning_rate": 8.728619292750093e-07, + "loss": 0.84670258, + "num_input_tokens_seen": 251133530, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.12115479, + "step": 11640, + "time_per_iteration": 2.470250368118286 + }, + { + "auxiliary_loss_clip": 0.01116108, + "auxiliary_loss_mlp": 0.01026901, + "balance_loss_clip": 1.04517603, + "balance_loss_mlp": 1.01577914, + "epoch": 0.699894784307831, + "flos": 27163046949120.0, + "grad_norm": 1.7667050857881115, + "language_loss": 0.75164151, + "learning_rate": 8.725402284377619e-07, + "loss": 0.77307165, + "num_input_tokens_seen": 251153985, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.11132812, + "step": 11641, + "time_per_iteration": 2.5175392627716064 + }, + { + "auxiliary_loss_clip": 0.01118061, + "auxiliary_loss_mlp": 0.01024145, + "balance_loss_clip": 1.04482114, + "balance_loss_mlp": 1.01165795, + "epoch": 0.699954907560499, + "flos": 20923640000640.0, + "grad_norm": 1.9897339767721305, + "language_loss": 0.77544463, + "learning_rate": 8.722185703539022e-07, + "loss": 0.79686671, + "num_input_tokens_seen": 251173225, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.12493896, + "step": 11642, + "time_per_iteration": 2.4374043941497803 + }, + { + "auxiliary_loss_clip": 0.01119644, + "auxiliary_loss_mlp": 0.01033882, + "balance_loss_clip": 1.04331923, + "balance_loss_mlp": 1.0192312, + "epoch": 0.700015030813167, + "flos": 28657792540800.0, + "grad_norm": 2.446085473805769, + "language_loss": 0.74979651, + "learning_rate": 8.718969550356266e-07, + "loss": 0.77133173, + "num_input_tokens_seen": 251192485, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.14642334, + "step": 11643, + "time_per_iteration": 2.4931881427764893 + }, + { + "auxiliary_loss_clip": 0.01127149, + "auxiliary_loss_mlp": 0.01029031, + "balance_loss_clip": 1.05145788, + "balance_loss_mlp": 1.01680624, + "epoch": 0.700075154065835, + "flos": 29205286617600.0, + "grad_norm": 1.479616572429136, + "language_loss": 0.60426629, + "learning_rate": 8.715753824951315e-07, + "loss": 0.62582815, + "num_input_tokens_seen": 251214965, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.12225342, + "step": 11644, + "time_per_iteration": 3.988483190536499 + }, + { + "auxiliary_loss_clip": 0.01111849, + "auxiliary_loss_mlp": 0.01026187, + "balance_loss_clip": 1.04273748, + "balance_loss_mlp": 1.01498675, + "epoch": 0.7001352773185029, + "flos": 23112431159040.0, + "grad_norm": 1.7523988194945785, + "language_loss": 0.81625652, + "learning_rate": 8.712538527446119e-07, + "loss": 0.83763689, + "num_input_tokens_seen": 251234500, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 0.11199951, + "step": 11645, + "time_per_iteration": 2.4797582626342773 + }, + { + "auxiliary_loss_clip": 0.01116346, + "auxiliary_loss_mlp": 0.01027202, + "balance_loss_clip": 1.04489255, + "balance_loss_mlp": 1.01554942, + "epoch": 0.7001954005711709, + "flos": 21322858734720.0, + "grad_norm": 1.972982369007031, + "language_loss": 0.6862402, + "learning_rate": 8.709323657962584e-07, + "loss": 0.7076757, + "num_input_tokens_seen": 251254360, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.11657715, + "step": 11646, + "time_per_iteration": 2.4664204120635986 + }, + { + "auxiliary_loss_clip": 0.01117871, + "auxiliary_loss_mlp": 0.01029141, + "balance_loss_clip": 1.04718471, + "balance_loss_mlp": 1.01783979, + "epoch": 0.7002555238238388, + "flos": 24535822383360.0, + "grad_norm": 1.7606876035429284, + "language_loss": 0.70995915, + "learning_rate": 8.706109216622635e-07, + "loss": 0.73142934, + "num_input_tokens_seen": 251274790, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.11297607, + "step": 11647, + "time_per_iteration": 2.483842134475708 + }, + { + "auxiliary_loss_clip": 0.01115878, + "auxiliary_loss_mlp": 0.01032202, + "balance_loss_clip": 1.04374635, + "balance_loss_mlp": 1.01992905, + "epoch": 0.7003156470765068, + "flos": 39056552726400.0, + "grad_norm": 1.877813464696888, + "language_loss": 0.71567786, + "learning_rate": 8.702895203548155e-07, + "loss": 0.73715866, + "num_input_tokens_seen": 251296275, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.1227417, + "step": 11648, + "time_per_iteration": 2.6559901237487793 + }, + { + "auxiliary_loss_clip": 0.01114939, + "auxiliary_loss_mlp": 0.01025113, + "balance_loss_clip": 1.04338729, + "balance_loss_mlp": 1.01390171, + "epoch": 0.7003757703291749, + "flos": 28804092635520.0, + "grad_norm": 1.7193826881383227, + "language_loss": 0.77602303, + "learning_rate": 8.699681618861014e-07, + "loss": 0.7974236, + "num_input_tokens_seen": 251317375, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11212158, + "step": 11649, + "time_per_iteration": 2.517153739929199 + }, + { + "auxiliary_loss_clip": 0.01108923, + "auxiliary_loss_mlp": 0.01032167, + "balance_loss_clip": 1.03826571, + "balance_loss_mlp": 1.01944113, + "epoch": 0.7004358935818428, + "flos": 15953854152960.0, + "grad_norm": 1.6908967452580541, + "language_loss": 0.78611219, + "learning_rate": 8.69646846268308e-07, + "loss": 0.80752313, + "num_input_tokens_seen": 251333570, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.1272583, + "step": 11650, + "time_per_iteration": 2.4132676124572754 + }, + { + "auxiliary_loss_clip": 0.0111455, + "auxiliary_loss_mlp": 0.01025661, + "balance_loss_clip": 1.04314756, + "balance_loss_mlp": 1.0140326, + "epoch": 0.7004960168345108, + "flos": 20411984718720.0, + "grad_norm": 2.20717704836172, + "language_loss": 0.78470206, + "learning_rate": 8.693255735136194e-07, + "loss": 0.80610418, + "num_input_tokens_seen": 251351070, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.11627197, + "step": 11651, + "time_per_iteration": 2.42289662361145 + }, + { + "auxiliary_loss_clip": 0.01121025, + "auxiliary_loss_mlp": 0.0102788, + "balance_loss_clip": 1.04747963, + "balance_loss_mlp": 1.01683521, + "epoch": 0.7005561400871787, + "flos": 17347547808000.0, + "grad_norm": 1.5769608785283329, + "language_loss": 0.69651365, + "learning_rate": 8.690043436342198e-07, + "loss": 0.71800268, + "num_input_tokens_seen": 251370005, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11047363, + "step": 11652, + "time_per_iteration": 2.591060161590576 + }, + { + "auxiliary_loss_clip": 0.01108726, + "auxiliary_loss_mlp": 0.01031101, + "balance_loss_clip": 1.03778088, + "balance_loss_mlp": 1.01944852, + "epoch": 0.7006162633398467, + "flos": 25302120157440.0, + "grad_norm": 1.3918819787872858, + "language_loss": 0.74564946, + "learning_rate": 8.686831566422874e-07, + "loss": 0.76704776, + "num_input_tokens_seen": 251391210, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.11657715, + "step": 11653, + "time_per_iteration": 2.54885196685791 + }, + { + "auxiliary_loss_clip": 0.01112999, + "auxiliary_loss_mlp": 0.01032626, + "balance_loss_clip": 1.03917074, + "balance_loss_mlp": 1.01966143, + "epoch": 0.7006763865925146, + "flos": 20668997508480.0, + "grad_norm": 2.6340107314612307, + "language_loss": 0.71056688, + "learning_rate": 8.68362012550003e-07, + "loss": 0.73202312, + "num_input_tokens_seen": 251411505, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.1295166, + "step": 11654, + "time_per_iteration": 2.4591097831726074 + }, + { + "auxiliary_loss_clip": 0.01121259, + "auxiliary_loss_mlp": 0.01029563, + "balance_loss_clip": 1.04592693, + "balance_loss_mlp": 1.01612246, + "epoch": 0.7007365098451827, + "flos": 20046449963520.0, + "grad_norm": 2.569379769943058, + "language_loss": 0.73015106, + "learning_rate": 8.680409113695453e-07, + "loss": 0.75165933, + "num_input_tokens_seen": 251428975, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.13427734, + "step": 11655, + "time_per_iteration": 2.4266905784606934 + }, + { + "auxiliary_loss_clip": 0.01127926, + "auxiliary_loss_mlp": 0.01042836, + "balance_loss_clip": 1.0476141, + "balance_loss_mlp": 1.02770245, + "epoch": 0.7007966330978506, + "flos": 20777375819520.0, + "grad_norm": 1.8338765157606953, + "language_loss": 0.70013487, + "learning_rate": 8.677198531130889e-07, + "loss": 0.72184253, + "num_input_tokens_seen": 251446940, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.15130615, + "step": 11656, + "time_per_iteration": 2.471381187438965 + }, + { + "auxiliary_loss_clip": 0.01111418, + "auxiliary_loss_mlp": 0.01028725, + "balance_loss_clip": 1.03933716, + "balance_loss_mlp": 1.01796651, + "epoch": 0.7008567563505186, + "flos": 29638189330560.0, + "grad_norm": 1.641310238806159, + "language_loss": 0.77997196, + "learning_rate": 8.673988377928092e-07, + "loss": 0.80137336, + "num_input_tokens_seen": 251466205, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.10766602, + "step": 11657, + "time_per_iteration": 2.5124683380126953 + }, + { + "auxiliary_loss_clip": 0.01124293, + "auxiliary_loss_mlp": 0.01030689, + "balance_loss_clip": 1.04774261, + "balance_loss_mlp": 1.01758218, + "epoch": 0.7009168796031865, + "flos": 17092007475840.0, + "grad_norm": 2.3498345254325574, + "language_loss": 0.78201342, + "learning_rate": 8.670778654208797e-07, + "loss": 0.80356324, + "num_input_tokens_seen": 251484820, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.13116455, + "step": 11658, + "time_per_iteration": 2.4316842555999756 + }, + { + "auxiliary_loss_clip": 0.01117788, + "auxiliary_loss_mlp": 0.01031244, + "balance_loss_clip": 1.04394817, + "balance_loss_mlp": 1.01931095, + "epoch": 0.7009770028558545, + "flos": 20448972748800.0, + "grad_norm": 1.8606796362683151, + "language_loss": 0.82794183, + "learning_rate": 8.667569360094713e-07, + "loss": 0.84943211, + "num_input_tokens_seen": 251502670, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11938477, + "step": 11659, + "time_per_iteration": 3.8377768993377686 + }, + { + "auxiliary_loss_clip": 0.01121316, + "auxiliary_loss_mlp": 0.01026116, + "balance_loss_clip": 1.04814529, + "balance_loss_mlp": 1.01511312, + "epoch": 0.7010371261085224, + "flos": 19245139407360.0, + "grad_norm": 2.5402730301167122, + "language_loss": 0.69556504, + "learning_rate": 8.664360495707526e-07, + "loss": 0.71703935, + "num_input_tokens_seen": 251521630, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11004639, + "step": 11660, + "time_per_iteration": 2.436577081680298 + }, + { + "auxiliary_loss_clip": 0.01121881, + "auxiliary_loss_mlp": 0.010327, + "balance_loss_clip": 1.04689503, + "balance_loss_mlp": 1.02020645, + "epoch": 0.7010972493611904, + "flos": 22127581082880.0, + "grad_norm": 1.803400643112406, + "language_loss": 0.80928767, + "learning_rate": 8.661152061168924e-07, + "loss": 0.83083344, + "num_input_tokens_seen": 251540105, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.12493896, + "step": 11661, + "time_per_iteration": 2.4519026279449463 + }, + { + "auxiliary_loss_clip": 0.01130409, + "auxiliary_loss_mlp": 0.01028707, + "balance_loss_clip": 1.05552101, + "balance_loss_mlp": 1.01723313, + "epoch": 0.7011573726138585, + "flos": 31391132860800.0, + "grad_norm": 1.883014224970542, + "language_loss": 0.79185092, + "learning_rate": 8.657944056600579e-07, + "loss": 0.81344211, + "num_input_tokens_seen": 251560530, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11468506, + "step": 11662, + "time_per_iteration": 2.557206869125366 + }, + { + "auxiliary_loss_clip": 0.01120696, + "auxiliary_loss_mlp": 0.01025952, + "balance_loss_clip": 1.04674542, + "balance_loss_mlp": 1.01401341, + "epoch": 0.7012174958665264, + "flos": 18150582216960.0, + "grad_norm": 2.1986769879241024, + "language_loss": 0.83847964, + "learning_rate": 8.654736482124134e-07, + "loss": 0.85994613, + "num_input_tokens_seen": 251577930, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.11938477, + "step": 11663, + "time_per_iteration": 2.447709321975708 + }, + { + "auxiliary_loss_clip": 0.01055863, + "auxiliary_loss_mlp": 0.01006606, + "balance_loss_clip": 1.03065586, + "balance_loss_mlp": 1.00517821, + "epoch": 0.7012776191191944, + "flos": 60651256567680.0, + "grad_norm": 0.8208613933381735, + "language_loss": 0.53717005, + "learning_rate": 8.651529337861209e-07, + "loss": 0.55779469, + "num_input_tokens_seen": 251638820, + "router_z_loss_clip": 0.25219727, + "router_z_loss_mlp": 0.01428223, + "step": 11664, + "time_per_iteration": 3.12041974067688 + }, + { + "auxiliary_loss_clip": 0.01119328, + "auxiliary_loss_mlp": 0.01027324, + "balance_loss_clip": 1.04470444, + "balance_loss_mlp": 1.01513481, + "epoch": 0.7013377423718623, + "flos": 27198598435200.0, + "grad_norm": 1.7958878052545695, + "language_loss": 0.79140687, + "learning_rate": 8.64832262393344e-07, + "loss": 0.81287348, + "num_input_tokens_seen": 251658070, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.12200928, + "step": 11665, + "time_per_iteration": 2.5418999195098877 + }, + { + "auxiliary_loss_clip": 0.01122711, + "auxiliary_loss_mlp": 0.0102922, + "balance_loss_clip": 1.04752207, + "balance_loss_mlp": 1.0175674, + "epoch": 0.7013978656245303, + "flos": 16543543731840.0, + "grad_norm": 2.255676577530083, + "language_loss": 0.77201128, + "learning_rate": 8.645116340462404e-07, + "loss": 0.79353058, + "num_input_tokens_seen": 251671575, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.11651611, + "step": 11666, + "time_per_iteration": 3.8294336795806885 + }, + { + "auxiliary_loss_clip": 0.01118651, + "auxiliary_loss_mlp": 0.0102889, + "balance_loss_clip": 1.04663229, + "balance_loss_mlp": 1.01780391, + "epoch": 0.7014579888771982, + "flos": 23143780753920.0, + "grad_norm": 2.062701327439708, + "language_loss": 0.81516254, + "learning_rate": 8.641910487569695e-07, + "loss": 0.83663791, + "num_input_tokens_seen": 251689350, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11071777, + "step": 11667, + "time_per_iteration": 2.492677927017212 + }, + { + "auxiliary_loss_clip": 0.01114227, + "auxiliary_loss_mlp": 0.01035166, + "balance_loss_clip": 1.04349709, + "balance_loss_mlp": 1.02316117, + "epoch": 0.7015181121298663, + "flos": 25082095397760.0, + "grad_norm": 2.398277441425688, + "language_loss": 0.65556687, + "learning_rate": 8.638705065376879e-07, + "loss": 0.67706078, + "num_input_tokens_seen": 251704635, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.12017822, + "step": 11668, + "time_per_iteration": 2.458733320236206 + }, + { + "auxiliary_loss_clip": 0.01118708, + "auxiliary_loss_mlp": 0.01027494, + "balance_loss_clip": 1.04530406, + "balance_loss_mlp": 1.0155257, + "epoch": 0.7015782353825342, + "flos": 23327894891520.0, + "grad_norm": 1.977003017165586, + "language_loss": 0.76478219, + "learning_rate": 8.635500074005519e-07, + "loss": 0.78624421, + "num_input_tokens_seen": 251723035, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11968994, + "step": 11669, + "time_per_iteration": 2.490518093109131 + }, + { + "auxiliary_loss_clip": 0.01059833, + "auxiliary_loss_mlp": 0.01004694, + "balance_loss_clip": 1.0341537, + "balance_loss_mlp": 1.00323272, + "epoch": 0.7016383586352022, + "flos": 70397161107840.0, + "grad_norm": 0.6977320944749703, + "language_loss": 0.54498744, + "learning_rate": 8.632295513577122e-07, + "loss": 0.5656327, + "num_input_tokens_seen": 251791630, + "router_z_loss_clip": 0.25683594, + "router_z_loss_mlp": 0.01463318, + "step": 11670, + "time_per_iteration": 3.2241334915161133 + }, + { + "auxiliary_loss_clip": 0.01116576, + "auxiliary_loss_mlp": 0.01031276, + "balance_loss_clip": 1.04574645, + "balance_loss_mlp": 1.01958776, + "epoch": 0.7016984818878701, + "flos": 19792274348160.0, + "grad_norm": 1.7289641791128263, + "language_loss": 0.81863052, + "learning_rate": 8.629091384213218e-07, + "loss": 0.84010905, + "num_input_tokens_seen": 251809840, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.11682129, + "step": 11671, + "time_per_iteration": 2.4991109371185303 + }, + { + "auxiliary_loss_clip": 0.01111772, + "auxiliary_loss_mlp": 0.01032829, + "balance_loss_clip": 1.03961837, + "balance_loss_mlp": 1.02059245, + "epoch": 0.7017586051405381, + "flos": 12896923184640.0, + "grad_norm": 2.149134046856611, + "language_loss": 0.7537396, + "learning_rate": 8.625887686035313e-07, + "loss": 0.77518559, + "num_input_tokens_seen": 251827550, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.12249756, + "step": 11672, + "time_per_iteration": 3.8847949504852295 + }, + { + "auxiliary_loss_clip": 0.0111125, + "auxiliary_loss_mlp": 0.01028459, + "balance_loss_clip": 1.03949666, + "balance_loss_mlp": 1.01655555, + "epoch": 0.701818728393206, + "flos": 18332828847360.0, + "grad_norm": 1.6887633778555873, + "language_loss": 0.86973369, + "learning_rate": 8.622684419164883e-07, + "loss": 0.89113081, + "num_input_tokens_seen": 251844880, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11901855, + "step": 11673, + "time_per_iteration": 2.437786817550659 + }, + { + "auxiliary_loss_clip": 0.01115963, + "auxiliary_loss_mlp": 0.01026752, + "balance_loss_clip": 1.04677999, + "balance_loss_mlp": 1.0152303, + "epoch": 0.701878851645874, + "flos": 17384212615680.0, + "grad_norm": 1.9025479614784468, + "language_loss": 0.72853327, + "learning_rate": 8.619481583723399e-07, + "loss": 0.74996048, + "num_input_tokens_seen": 251861025, + "router_z_loss_clip": 0.69189453, + "router_z_loss_mlp": 0.11535645, + "step": 11674, + "time_per_iteration": 2.438016653060913 + }, + { + "auxiliary_loss_clip": 0.0111047, + "auxiliary_loss_mlp": 0.01029084, + "balance_loss_clip": 1.04227722, + "balance_loss_mlp": 1.01793218, + "epoch": 0.701938974898542, + "flos": 23915501481600.0, + "grad_norm": 1.5876931495032651, + "language_loss": 0.72369939, + "learning_rate": 8.616279179832329e-07, + "loss": 0.7450949, + "num_input_tokens_seen": 251880175, + "router_z_loss_clip": 0.68164062, + "router_z_loss_mlp": 0.11151123, + "step": 11675, + "time_per_iteration": 2.537294864654541 + }, + { + "auxiliary_loss_clip": 0.01113793, + "auxiliary_loss_mlp": 0.01023955, + "balance_loss_clip": 1.04163289, + "balance_loss_mlp": 1.01176012, + "epoch": 0.70199909815121, + "flos": 21795586652160.0, + "grad_norm": 2.836494526394779, + "language_loss": 0.51208252, + "learning_rate": 8.613077207613078e-07, + "loss": 0.53346002, + "num_input_tokens_seen": 251899005, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.12200928, + "step": 11676, + "time_per_iteration": 2.510363817214966 + }, + { + "auxiliary_loss_clip": 0.01067186, + "auxiliary_loss_mlp": 0.01006434, + "balance_loss_clip": 1.04288721, + "balance_loss_mlp": 1.0045296, + "epoch": 0.702059221403878, + "flos": 71715047109120.0, + "grad_norm": 0.8265732350653044, + "language_loss": 0.59160316, + "learning_rate": 8.609875667187079e-07, + "loss": 0.61233938, + "num_input_tokens_seen": 251966790, + "router_z_loss_clip": 0.24291992, + "router_z_loss_mlp": 0.01904297, + "step": 11677, + "time_per_iteration": 3.1454763412475586 + }, + { + "auxiliary_loss_clip": 0.01114294, + "auxiliary_loss_mlp": 0.01027672, + "balance_loss_clip": 1.04107094, + "balance_loss_mlp": 1.01608467, + "epoch": 0.7021193446565459, + "flos": 28111052649600.0, + "grad_norm": 2.745543803012987, + "language_loss": 0.62815046, + "learning_rate": 8.606674558675737e-07, + "loss": 0.64957011, + "num_input_tokens_seen": 251989315, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11590576, + "step": 11678, + "time_per_iteration": 2.625591278076172 + }, + { + "auxiliary_loss_clip": 0.01113974, + "auxiliary_loss_mlp": 0.01030919, + "balance_loss_clip": 1.04238224, + "balance_loss_mlp": 1.01924849, + "epoch": 0.7021794679092139, + "flos": 22924905229440.0, + "grad_norm": 1.6348467825854476, + "language_loss": 0.79282612, + "learning_rate": 8.603473882200444e-07, + "loss": 0.81427503, + "num_input_tokens_seen": 252006620, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11676025, + "step": 11679, + "time_per_iteration": 2.4970977306365967 + }, + { + "auxiliary_loss_clip": 0.01117308, + "auxiliary_loss_mlp": 0.0103399, + "balance_loss_clip": 1.04517603, + "balance_loss_mlp": 1.02295113, + "epoch": 0.7022395911618818, + "flos": 18077827219200.0, + "grad_norm": 2.4427963837982385, + "language_loss": 0.71484184, + "learning_rate": 8.600273637882567e-07, + "loss": 0.73635483, + "num_input_tokens_seen": 252024570, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11047363, + "step": 11680, + "time_per_iteration": 2.4595062732696533 + }, + { + "auxiliary_loss_clip": 0.01123525, + "auxiliary_loss_mlp": 0.01033958, + "balance_loss_clip": 1.04797673, + "balance_loss_mlp": 1.02187586, + "epoch": 0.7022997144145499, + "flos": 16034294661120.0, + "grad_norm": 1.7432774228290602, + "language_loss": 0.75037467, + "learning_rate": 8.597073825843446e-07, + "loss": 0.77194953, + "num_input_tokens_seen": 252042775, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12084961, + "step": 11681, + "time_per_iteration": 2.4163589477539062 + }, + { + "auxiliary_loss_clip": 0.0111551, + "auxiliary_loss_mlp": 0.01029057, + "balance_loss_clip": 1.0432893, + "balance_loss_mlp": 1.01778591, + "epoch": 0.7023598376672178, + "flos": 26468678160000.0, + "grad_norm": 1.5259588247639366, + "language_loss": 0.76917756, + "learning_rate": 8.593874446204434e-07, + "loss": 0.79062319, + "num_input_tokens_seen": 252063690, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11273193, + "step": 11682, + "time_per_iteration": 2.500972032546997 + }, + { + "auxiliary_loss_clip": 0.01117159, + "auxiliary_loss_mlp": 0.01030282, + "balance_loss_clip": 1.04299378, + "balance_loss_mlp": 1.01852238, + "epoch": 0.7024199609198858, + "flos": 17055917285760.0, + "grad_norm": 2.0243191018583175, + "language_loss": 0.72926927, + "learning_rate": 8.590675499086841e-07, + "loss": 0.75074363, + "num_input_tokens_seen": 252080335, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11761475, + "step": 11683, + "time_per_iteration": 2.4167520999908447 + }, + { + "auxiliary_loss_clip": 0.01120314, + "auxiliary_loss_mlp": 0.01030727, + "balance_loss_clip": 1.04837501, + "balance_loss_mlp": 1.01825786, + "epoch": 0.7024800841725537, + "flos": 25849039616640.0, + "grad_norm": 2.480385615128381, + "language_loss": 0.71798229, + "learning_rate": 8.587476984611976e-07, + "loss": 0.73949265, + "num_input_tokens_seen": 252101075, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.12475586, + "step": 11684, + "time_per_iteration": 2.5014240741729736 + }, + { + "auxiliary_loss_clip": 0.01113898, + "auxiliary_loss_mlp": 0.01032119, + "balance_loss_clip": 1.04252136, + "balance_loss_mlp": 1.01971495, + "epoch": 0.7025402074252217, + "flos": 23513014609920.0, + "grad_norm": 1.9471802909943263, + "language_loss": 0.72023845, + "learning_rate": 8.584278902901128e-07, + "loss": 0.74169862, + "num_input_tokens_seen": 252120510, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.12420654, + "step": 11685, + "time_per_iteration": 2.4691898822784424 + }, + { + "auxiliary_loss_clip": 0.01115775, + "auxiliary_loss_mlp": 0.01030198, + "balance_loss_clip": 1.04167688, + "balance_loss_mlp": 1.0187186, + "epoch": 0.7026003306778896, + "flos": 20150985519360.0, + "grad_norm": 1.6436460959756864, + "language_loss": 0.8453573, + "learning_rate": 8.581081254075582e-07, + "loss": 0.866817, + "num_input_tokens_seen": 252137590, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11480713, + "step": 11686, + "time_per_iteration": 2.4665114879608154 + }, + { + "auxiliary_loss_clip": 0.01051032, + "auxiliary_loss_mlp": 0.0100306, + "balance_loss_clip": 1.02626991, + "balance_loss_mlp": 1.00156498, + "epoch": 0.7026604539305576, + "flos": 64772400712320.0, + "grad_norm": 0.9810047123500168, + "language_loss": 0.69918144, + "learning_rate": 8.577884038256566e-07, + "loss": 0.71972239, + "num_input_tokens_seen": 252199830, + "router_z_loss_clip": 0.24755859, + "router_z_loss_mlp": 0.01493835, + "step": 11687, + "time_per_iteration": 3.245300531387329 + }, + { + "auxiliary_loss_clip": 0.01116128, + "auxiliary_loss_mlp": 0.01028937, + "balance_loss_clip": 1.04386723, + "balance_loss_mlp": 1.0170815, + "epoch": 0.7027205771832256, + "flos": 21871466133120.0, + "grad_norm": 2.031845046819703, + "language_loss": 0.77805007, + "learning_rate": 8.574687255565329e-07, + "loss": 0.7995007, + "num_input_tokens_seen": 252217200, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11834717, + "step": 11688, + "time_per_iteration": 3.904841899871826 + }, + { + "auxiliary_loss_clip": 0.01119365, + "auxiliary_loss_mlp": 0.01032009, + "balance_loss_clip": 1.04663205, + "balance_loss_mlp": 1.02015984, + "epoch": 0.7027807004358936, + "flos": 23367791923200.0, + "grad_norm": 4.37213009263251, + "language_loss": 0.68498647, + "learning_rate": 8.571490906123107e-07, + "loss": 0.70650023, + "num_input_tokens_seen": 252236105, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11859131, + "step": 11689, + "time_per_iteration": 2.479677438735962 + }, + { + "auxiliary_loss_clip": 0.01122195, + "auxiliary_loss_mlp": 0.01033512, + "balance_loss_clip": 1.04702592, + "balance_loss_mlp": 1.021281, + "epoch": 0.7028408236885616, + "flos": 15304266645120.0, + "grad_norm": 2.096090879315694, + "language_loss": 0.79657841, + "learning_rate": 8.568294990051086e-07, + "loss": 0.8181355, + "num_input_tokens_seen": 252253315, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.12237549, + "step": 11690, + "time_per_iteration": 2.426072835922241 + }, + { + "auxiliary_loss_clip": 0.01112015, + "auxiliary_loss_mlp": 0.01032345, + "balance_loss_clip": 1.0410248, + "balance_loss_mlp": 1.0207938, + "epoch": 0.7029009469412295, + "flos": 22018197191040.0, + "grad_norm": 2.0284279930143843, + "language_loss": 0.76047909, + "learning_rate": 8.56509950747047e-07, + "loss": 0.7819227, + "num_input_tokens_seen": 252272765, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11566162, + "step": 11691, + "time_per_iteration": 2.5445399284362793 + }, + { + "auxiliary_loss_clip": 0.01115699, + "auxiliary_loss_mlp": 0.01029106, + "balance_loss_clip": 1.04328179, + "balance_loss_mlp": 1.01716161, + "epoch": 0.7029610701938975, + "flos": 21835519597440.0, + "grad_norm": 2.4814015732116035, + "language_loss": 0.81801593, + "learning_rate": 8.561904458502429e-07, + "loss": 0.83946395, + "num_input_tokens_seen": 252290510, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11950684, + "step": 11692, + "time_per_iteration": 2.5205423831939697 + }, + { + "auxiliary_loss_clip": 0.01120285, + "auxiliary_loss_mlp": 0.01027351, + "balance_loss_clip": 1.04674351, + "balance_loss_mlp": 1.01519144, + "epoch": 0.7030211934465654, + "flos": 19135647774720.0, + "grad_norm": 1.7430291800204831, + "language_loss": 0.7647903, + "learning_rate": 8.558709843268111e-07, + "loss": 0.78626668, + "num_input_tokens_seen": 252309365, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.121521, + "step": 11693, + "time_per_iteration": 2.522559404373169 + }, + { + "auxiliary_loss_clip": 0.01110552, + "auxiliary_loss_mlp": 0.01030205, + "balance_loss_clip": 1.04078221, + "balance_loss_mlp": 1.01892734, + "epoch": 0.7030813166992335, + "flos": 38546010766080.0, + "grad_norm": 1.751443046060097, + "language_loss": 0.68524134, + "learning_rate": 8.55551566188866e-07, + "loss": 0.70664895, + "num_input_tokens_seen": 252333010, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.11273193, + "step": 11694, + "time_per_iteration": 2.6339030265808105 + }, + { + "auxiliary_loss_clip": 0.01119148, + "auxiliary_loss_mlp": 0.01029293, + "balance_loss_clip": 1.04673934, + "balance_loss_mlp": 1.01760495, + "epoch": 0.7031414399519014, + "flos": 14720897859840.0, + "grad_norm": 2.1738751024602445, + "language_loss": 0.75855136, + "learning_rate": 8.552321914485203e-07, + "loss": 0.78003585, + "num_input_tokens_seen": 252351330, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11682129, + "step": 11695, + "time_per_iteration": 2.438124179840088 + }, + { + "auxiliary_loss_clip": 0.01117794, + "auxiliary_loss_mlp": 0.01035073, + "balance_loss_clip": 1.04316258, + "balance_loss_mlp": 1.02265167, + "epoch": 0.7032015632045694, + "flos": 14027247342720.0, + "grad_norm": 2.0811192802377576, + "language_loss": 0.73642635, + "learning_rate": 8.549128601178852e-07, + "loss": 0.75795501, + "num_input_tokens_seen": 252369580, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.12420654, + "step": 11696, + "time_per_iteration": 2.3999204635620117 + }, + { + "auxiliary_loss_clip": 0.01114245, + "auxiliary_loss_mlp": 0.01029791, + "balance_loss_clip": 1.04000092, + "balance_loss_mlp": 1.01701224, + "epoch": 0.7032616864572373, + "flos": 27637175496960.0, + "grad_norm": 1.7232036896444847, + "language_loss": 0.75538158, + "learning_rate": 8.545935722090693e-07, + "loss": 0.77682197, + "num_input_tokens_seen": 252390525, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.12762451, + "step": 11697, + "time_per_iteration": 2.526653528213501 + }, + { + "auxiliary_loss_clip": 0.01120092, + "auxiliary_loss_mlp": 0.01033004, + "balance_loss_clip": 1.0466125, + "balance_loss_mlp": 1.01977146, + "epoch": 0.7033218097099053, + "flos": 17967294092160.0, + "grad_norm": 2.41253640230893, + "language_loss": 0.81129682, + "learning_rate": 8.542743277341793e-07, + "loss": 0.83282781, + "num_input_tokens_seen": 252407470, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.13232422, + "step": 11698, + "time_per_iteration": 2.4384708404541016 + }, + { + "auxiliary_loss_clip": 0.01109084, + "auxiliary_loss_mlp": 0.01031281, + "balance_loss_clip": 1.03650463, + "balance_loss_mlp": 1.019032, + "epoch": 0.7033819329625732, + "flos": 19501721233920.0, + "grad_norm": 1.5879857911620532, + "language_loss": 0.84637564, + "learning_rate": 8.539551267053222e-07, + "loss": 0.86777925, + "num_input_tokens_seen": 252427025, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.12249756, + "step": 11699, + "time_per_iteration": 2.495850086212158 + }, + { + "auxiliary_loss_clip": 0.01127313, + "auxiliary_loss_mlp": 0.0103055, + "balance_loss_clip": 1.0542289, + "balance_loss_mlp": 1.01736581, + "epoch": 0.7034420562152413, + "flos": 23987645948160.0, + "grad_norm": 1.952507312760565, + "language_loss": 0.7925846, + "learning_rate": 8.53635969134601e-07, + "loss": 0.81416321, + "num_input_tokens_seen": 252445410, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.13201904, + "step": 11700, + "time_per_iteration": 2.4387309551239014 + }, + { + "auxiliary_loss_clip": 0.01110226, + "auxiliary_loss_mlp": 0.01025489, + "balance_loss_clip": 1.03727627, + "balance_loss_mlp": 1.01288247, + "epoch": 0.7035021794679092, + "flos": 35043427756800.0, + "grad_norm": 4.5658204381716745, + "language_loss": 0.74610281, + "learning_rate": 8.533168550341186e-07, + "loss": 0.76745999, + "num_input_tokens_seen": 252463905, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.12609863, + "step": 11701, + "time_per_iteration": 2.670822858810425 + }, + { + "auxiliary_loss_clip": 0.01118574, + "auxiliary_loss_mlp": 0.01034022, + "balance_loss_clip": 1.04247904, + "balance_loss_mlp": 1.02056336, + "epoch": 0.7035623027205772, + "flos": 10997428164480.0, + "grad_norm": 3.1069707464219007, + "language_loss": 0.84544396, + "learning_rate": 8.529977844159769e-07, + "loss": 0.86696994, + "num_input_tokens_seen": 252478655, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.13458252, + "step": 11702, + "time_per_iteration": 2.5582358837127686 + }, + { + "auxiliary_loss_clip": 0.01117055, + "auxiliary_loss_mlp": 0.01033133, + "balance_loss_clip": 1.04392231, + "balance_loss_mlp": 1.02075934, + "epoch": 0.7036224259732452, + "flos": 23623727304960.0, + "grad_norm": 12.734118901224337, + "language_loss": 0.60614765, + "learning_rate": 8.526787572922738e-07, + "loss": 0.62764955, + "num_input_tokens_seen": 252498740, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.1237793, + "step": 11703, + "time_per_iteration": 3.910487651824951 + }, + { + "auxiliary_loss_clip": 0.01110336, + "auxiliary_loss_mlp": 0.0102962, + "balance_loss_clip": 1.03743124, + "balance_loss_mlp": 1.0171926, + "epoch": 0.7036825492259131, + "flos": 31686175175040.0, + "grad_norm": 2.3686535073601918, + "language_loss": 0.61171377, + "learning_rate": 8.523597736751067e-07, + "loss": 0.63311338, + "num_input_tokens_seen": 252517800, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.12432861, + "step": 11704, + "time_per_iteration": 2.559195041656494 + }, + { + "auxiliary_loss_clip": 0.01106471, + "auxiliary_loss_mlp": 0.01030918, + "balance_loss_clip": 1.03787541, + "balance_loss_mlp": 1.01965904, + "epoch": 0.7037426724785811, + "flos": 30192866127360.0, + "grad_norm": 1.7470230831955036, + "language_loss": 0.71201789, + "learning_rate": 8.520408335765719e-07, + "loss": 0.73339176, + "num_input_tokens_seen": 252539620, + "router_z_loss_clip": 0.68554688, + "router_z_loss_mlp": 0.1126709, + "step": 11705, + "time_per_iteration": 2.5266902446746826 + }, + { + "auxiliary_loss_clip": 0.01112016, + "auxiliary_loss_mlp": 0.01026277, + "balance_loss_clip": 1.0413022, + "balance_loss_mlp": 1.01448703, + "epoch": 0.703802795731249, + "flos": 24311523905280.0, + "grad_norm": 1.9459556129645357, + "language_loss": 0.6183182, + "learning_rate": 8.517219370087645e-07, + "loss": 0.63970113, + "num_input_tokens_seen": 252557300, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.11785889, + "step": 11706, + "time_per_iteration": 2.5169360637664795 + }, + { + "auxiliary_loss_clip": 0.01115362, + "auxiliary_loss_mlp": 0.01025646, + "balance_loss_clip": 1.04155755, + "balance_loss_mlp": 1.01424956, + "epoch": 0.7038629189839171, + "flos": 22528954632960.0, + "grad_norm": 1.9065621577664753, + "language_loss": 0.68399036, + "learning_rate": 8.514030839837756e-07, + "loss": 0.70540041, + "num_input_tokens_seen": 252576715, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.1138916, + "step": 11707, + "time_per_iteration": 2.4712796211242676 + }, + { + "auxiliary_loss_clip": 0.01115568, + "auxiliary_loss_mlp": 0.01027166, + "balance_loss_clip": 1.04550576, + "balance_loss_mlp": 1.01588869, + "epoch": 0.703923042236585, + "flos": 26250484993920.0, + "grad_norm": 1.9157458642500886, + "language_loss": 0.75829363, + "learning_rate": 8.510842745136974e-07, + "loss": 0.7797209, + "num_input_tokens_seen": 252596190, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.11279297, + "step": 11708, + "time_per_iteration": 2.5066170692443848 + }, + { + "auxiliary_loss_clip": 0.01114852, + "auxiliary_loss_mlp": 0.01026542, + "balance_loss_clip": 1.04381204, + "balance_loss_mlp": 1.01543188, + "epoch": 0.703983165489253, + "flos": 19390254353280.0, + "grad_norm": 1.7210477673288787, + "language_loss": 0.72203493, + "learning_rate": 8.50765508610619e-07, + "loss": 0.74344885, + "num_input_tokens_seen": 252613410, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.11114502, + "step": 11709, + "time_per_iteration": 3.9013867378234863 + }, + { + "auxiliary_loss_clip": 0.01106555, + "auxiliary_loss_mlp": 0.01027231, + "balance_loss_clip": 1.03610897, + "balance_loss_mlp": 1.01594234, + "epoch": 0.7040432887419209, + "flos": 16683630773760.0, + "grad_norm": 2.587156098859102, + "language_loss": 0.78866023, + "learning_rate": 8.504467862866267e-07, + "loss": 0.8099981, + "num_input_tokens_seen": 252629150, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.112854, + "step": 11710, + "time_per_iteration": 2.459794282913208 + }, + { + "auxiliary_loss_clip": 0.01118427, + "auxiliary_loss_mlp": 0.01032252, + "balance_loss_clip": 1.0458746, + "balance_loss_mlp": 1.01971674, + "epoch": 0.7041034119945889, + "flos": 21141402203520.0, + "grad_norm": 1.770693059212068, + "language_loss": 0.77543104, + "learning_rate": 8.501281075538076e-07, + "loss": 0.79693782, + "num_input_tokens_seen": 252648225, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.12530518, + "step": 11711, + "time_per_iteration": 2.459547519683838 + }, + { + "auxiliary_loss_clip": 0.0111265, + "auxiliary_loss_mlp": 0.0102704, + "balance_loss_clip": 1.04204261, + "balance_loss_mlp": 1.01631093, + "epoch": 0.7041635352472568, + "flos": 16910299549440.0, + "grad_norm": 3.0601559193409718, + "language_loss": 0.74932998, + "learning_rate": 8.498094724242457e-07, + "loss": 0.77072692, + "num_input_tokens_seen": 252665380, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.10717773, + "step": 11712, + "time_per_iteration": 2.4543228149414062 + }, + { + "auxiliary_loss_clip": 0.01045408, + "auxiliary_loss_mlp": 0.01002624, + "balance_loss_clip": 1.01961017, + "balance_loss_mlp": 1.00101769, + "epoch": 0.7042236584999249, + "flos": 71681219475840.0, + "grad_norm": 0.8796598204129081, + "language_loss": 0.64588428, + "learning_rate": 8.494908809100247e-07, + "loss": 0.66636467, + "num_input_tokens_seen": 252727950, + "router_z_loss_clip": 0.25830078, + "router_z_loss_mlp": 0.0160675, + "step": 11713, + "time_per_iteration": 3.1255452632904053 + }, + { + "auxiliary_loss_clip": 0.0111141, + "auxiliary_loss_mlp": 0.01027886, + "balance_loss_clip": 1.03816104, + "balance_loss_mlp": 1.01693106, + "epoch": 0.7042837817525928, + "flos": 28658187590400.0, + "grad_norm": 2.037602860374872, + "language_loss": 0.73065788, + "learning_rate": 8.49172333023225e-07, + "loss": 0.75205082, + "num_input_tokens_seen": 252746770, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.10968018, + "step": 11714, + "time_per_iteration": 2.520934581756592 + }, + { + "auxiliary_loss_clip": 0.0111032, + "auxiliary_loss_mlp": 0.0103126, + "balance_loss_clip": 1.03943372, + "balance_loss_mlp": 1.01919019, + "epoch": 0.7043439050052608, + "flos": 19753562465280.0, + "grad_norm": 1.6652893266057678, + "language_loss": 0.79699826, + "learning_rate": 8.488538287759248e-07, + "loss": 0.81841409, + "num_input_tokens_seen": 252765610, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.12072754, + "step": 11715, + "time_per_iteration": 4.0022783279418945 + }, + { + "auxiliary_loss_clip": 0.01119728, + "auxiliary_loss_mlp": 0.0103359, + "balance_loss_clip": 1.04335475, + "balance_loss_mlp": 1.02123451, + "epoch": 0.7044040282579288, + "flos": 11538529620480.0, + "grad_norm": 7.434919984894694, + "language_loss": 0.71453553, + "learning_rate": 8.485353681802037e-07, + "loss": 0.73606873, + "num_input_tokens_seen": 252781610, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.12341309, + "step": 11716, + "time_per_iteration": 2.4187991619110107 + }, + { + "auxiliary_loss_clip": 0.01123257, + "auxiliary_loss_mlp": 0.01025489, + "balance_loss_clip": 1.04829538, + "balance_loss_mlp": 1.01405656, + "epoch": 0.7044641515105967, + "flos": 33656126722560.0, + "grad_norm": 2.094331460418678, + "language_loss": 0.66161114, + "learning_rate": 8.482169512481358e-07, + "loss": 0.68309855, + "num_input_tokens_seen": 252800600, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.11437988, + "step": 11717, + "time_per_iteration": 2.5564382076263428 + }, + { + "auxiliary_loss_clip": 0.01117795, + "auxiliary_loss_mlp": 0.01029442, + "balance_loss_clip": 1.04599512, + "balance_loss_mlp": 1.01843309, + "epoch": 0.7045242747632647, + "flos": 26723859356160.0, + "grad_norm": 1.4455132042088228, + "language_loss": 0.74367845, + "learning_rate": 8.478985779917967e-07, + "loss": 0.76515079, + "num_input_tokens_seen": 252822310, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11010742, + "step": 11718, + "time_per_iteration": 2.4763810634613037 + }, + { + "auxiliary_loss_clip": 0.01114213, + "auxiliary_loss_mlp": 0.01034017, + "balance_loss_clip": 1.04256785, + "balance_loss_mlp": 1.02148211, + "epoch": 0.7045843980159326, + "flos": 26797655848320.0, + "grad_norm": 1.6436147701051853, + "language_loss": 0.79871583, + "learning_rate": 8.475802484232606e-07, + "loss": 0.82019812, + "num_input_tokens_seen": 252842355, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.12530518, + "step": 11719, + "time_per_iteration": 2.515148162841797 + }, + { + "auxiliary_loss_clip": 0.01120145, + "auxiliary_loss_mlp": 0.01031745, + "balance_loss_clip": 1.04863358, + "balance_loss_mlp": 1.01973486, + "epoch": 0.7046445212686007, + "flos": 41574824363520.0, + "grad_norm": 1.836342435094809, + "language_loss": 0.65548164, + "learning_rate": 8.472619625545951e-07, + "loss": 0.67700052, + "num_input_tokens_seen": 252866785, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.12005615, + "step": 11720, + "time_per_iteration": 2.620021343231201 + }, + { + "auxiliary_loss_clip": 0.01119899, + "auxiliary_loss_mlp": 0.01027883, + "balance_loss_clip": 1.04621494, + "balance_loss_mlp": 1.0156641, + "epoch": 0.7047046445212686, + "flos": 15560166113280.0, + "grad_norm": 2.194803547900781, + "language_loss": 0.80279839, + "learning_rate": 8.46943720397872e-07, + "loss": 0.82427615, + "num_input_tokens_seen": 252881870, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.12207031, + "step": 11721, + "time_per_iteration": 2.487807035446167 + }, + { + "auxiliary_loss_clip": 0.01048367, + "auxiliary_loss_mlp": 0.01005249, + "balance_loss_clip": 1.02228796, + "balance_loss_mlp": 1.00357389, + "epoch": 0.7047647677739366, + "flos": 70410269571840.0, + "grad_norm": 0.7610325655995741, + "language_loss": 0.64719677, + "learning_rate": 8.466255219651582e-07, + "loss": 0.66773295, + "num_input_tokens_seen": 252951300, + "router_z_loss_clip": 0.26025391, + "router_z_loss_mlp": 0.01675415, + "step": 11722, + "time_per_iteration": 3.1692662239074707 + }, + { + "auxiliary_loss_clip": 0.0112148, + "auxiliary_loss_mlp": 0.01037237, + "balance_loss_clip": 1.04781115, + "balance_loss_mlp": 1.02545309, + "epoch": 0.7048248910266045, + "flos": 23660032976640.0, + "grad_norm": 1.6381795883936257, + "language_loss": 0.66165864, + "learning_rate": 8.463073672685211e-07, + "loss": 0.68324584, + "num_input_tokens_seen": 252971400, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11785889, + "step": 11723, + "time_per_iteration": 2.5330278873443604 + }, + { + "auxiliary_loss_clip": 0.01113513, + "auxiliary_loss_mlp": 0.0103179, + "balance_loss_clip": 1.04008603, + "balance_loss_mlp": 1.0194875, + "epoch": 0.7048850142792725, + "flos": 21397158017280.0, + "grad_norm": 1.7371122864287887, + "language_loss": 0.80992579, + "learning_rate": 8.459892563200235e-07, + "loss": 0.83137882, + "num_input_tokens_seen": 252989475, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.12298584, + "step": 11724, + "time_per_iteration": 2.4552276134490967 + }, + { + "auxiliary_loss_clip": 0.01122975, + "auxiliary_loss_mlp": 0.01036437, + "balance_loss_clip": 1.04606581, + "balance_loss_mlp": 1.02448654, + "epoch": 0.7049451375319404, + "flos": 21648101408640.0, + "grad_norm": 1.6704449550381335, + "language_loss": 0.73247707, + "learning_rate": 8.456711891317296e-07, + "loss": 0.75407124, + "num_input_tokens_seen": 253007220, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.11956787, + "step": 11725, + "time_per_iteration": 2.5167880058288574 + }, + { + "auxiliary_loss_clip": 0.01117964, + "auxiliary_loss_mlp": 0.01027572, + "balance_loss_clip": 1.04447067, + "balance_loss_mlp": 1.01537108, + "epoch": 0.7050052607846085, + "flos": 14866802904960.0, + "grad_norm": 2.2635213680355313, + "language_loss": 0.78272611, + "learning_rate": 8.453531657156998e-07, + "loss": 0.80418146, + "num_input_tokens_seen": 253025410, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.12194824, + "step": 11726, + "time_per_iteration": 2.4031639099121094 + }, + { + "auxiliary_loss_clip": 0.01119503, + "auxiliary_loss_mlp": 0.01025765, + "balance_loss_clip": 1.04417026, + "balance_loss_mlp": 1.01442194, + "epoch": 0.7050653840372764, + "flos": 19241763528960.0, + "grad_norm": 1.845266565087398, + "language_loss": 0.70764893, + "learning_rate": 8.450351860839931e-07, + "loss": 0.72910166, + "num_input_tokens_seen": 253043305, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11352539, + "step": 11727, + "time_per_iteration": 2.477627754211426 + }, + { + "auxiliary_loss_clip": 0.01123419, + "auxiliary_loss_mlp": 0.01022427, + "balance_loss_clip": 1.05307567, + "balance_loss_mlp": 1.01203156, + "epoch": 0.7051255072899444, + "flos": 27780422935680.0, + "grad_norm": 1.6065897728090281, + "language_loss": 0.69038367, + "learning_rate": 8.44717250248668e-07, + "loss": 0.71184218, + "num_input_tokens_seen": 253062790, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.10388184, + "step": 11728, + "time_per_iteration": 2.4939749240875244 + }, + { + "auxiliary_loss_clip": 0.0112056, + "auxiliary_loss_mlp": 0.01030754, + "balance_loss_clip": 1.04877234, + "balance_loss_mlp": 1.01938748, + "epoch": 0.7051856305426124, + "flos": 27892033470720.0, + "grad_norm": 1.6715813081662267, + "language_loss": 0.73062891, + "learning_rate": 8.443993582217803e-07, + "loss": 0.75214207, + "num_input_tokens_seen": 253082055, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.1137085, + "step": 11729, + "time_per_iteration": 2.519505500793457 + }, + { + "auxiliary_loss_clip": 0.01121696, + "auxiliary_loss_mlp": 0.01031571, + "balance_loss_clip": 1.04477072, + "balance_loss_mlp": 1.0183624, + "epoch": 0.7052457537952803, + "flos": 25043563082880.0, + "grad_norm": 1.5972418314683339, + "language_loss": 0.78359854, + "learning_rate": 8.440815100153862e-07, + "loss": 0.8051312, + "num_input_tokens_seen": 253102575, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.13214111, + "step": 11730, + "time_per_iteration": 2.505023956298828 + }, + { + "auxiliary_loss_clip": 0.01118689, + "auxiliary_loss_mlp": 0.01031408, + "balance_loss_clip": 1.04598653, + "balance_loss_mlp": 1.01992178, + "epoch": 0.7053058770479483, + "flos": 21871717528320.0, + "grad_norm": 2.036655390319725, + "language_loss": 0.63105667, + "learning_rate": 8.437637056415359e-07, + "loss": 0.65255767, + "num_input_tokens_seen": 253121290, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11486816, + "step": 11731, + "time_per_iteration": 3.862356185913086 + }, + { + "auxiliary_loss_clip": 0.01115172, + "auxiliary_loss_mlp": 0.01030126, + "balance_loss_clip": 1.03964305, + "balance_loss_mlp": 1.01725698, + "epoch": 0.7053660003006162, + "flos": 16398716094720.0, + "grad_norm": 2.125474764213461, + "language_loss": 0.74640834, + "learning_rate": 8.434459451122815e-07, + "loss": 0.76786131, + "num_input_tokens_seen": 253139720, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12872314, + "step": 11732, + "time_per_iteration": 2.453111171722412 + }, + { + "auxiliary_loss_clip": 0.01113553, + "auxiliary_loss_mlp": 0.01025967, + "balance_loss_clip": 1.04429758, + "balance_loss_mlp": 1.01476717, + "epoch": 0.7054261235532843, + "flos": 22711560399360.0, + "grad_norm": 1.526270388672915, + "language_loss": 0.71240234, + "learning_rate": 8.431282284396735e-07, + "loss": 0.73379755, + "num_input_tokens_seen": 253160250, + "router_z_loss_clip": 0.69335938, + "router_z_loss_mlp": 0.11206055, + "step": 11733, + "time_per_iteration": 2.496000289916992 + }, + { + "auxiliary_loss_clip": 0.01116523, + "auxiliary_loss_mlp": 0.01032802, + "balance_loss_clip": 1.04433286, + "balance_loss_mlp": 1.0212034, + "epoch": 0.7054862468059522, + "flos": 13589711775360.0, + "grad_norm": 1.8250001595814498, + "language_loss": 0.73372495, + "learning_rate": 8.428105556357583e-07, + "loss": 0.75521827, + "num_input_tokens_seen": 253178710, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.1159668, + "step": 11734, + "time_per_iteration": 2.4697773456573486 + }, + { + "auxiliary_loss_clip": 0.01118366, + "auxiliary_loss_mlp": 0.01031396, + "balance_loss_clip": 1.04147792, + "balance_loss_mlp": 1.01878309, + "epoch": 0.7055463700586202, + "flos": 15880704105600.0, + "grad_norm": 2.7413717676621476, + "language_loss": 0.69154912, + "learning_rate": 8.424929267125829e-07, + "loss": 0.71304673, + "num_input_tokens_seen": 253194805, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.1260376, + "step": 11735, + "time_per_iteration": 2.4313833713531494 + }, + { + "auxiliary_loss_clip": 0.01117121, + "auxiliary_loss_mlp": 0.0103441, + "balance_loss_clip": 1.04253793, + "balance_loss_mlp": 1.02082562, + "epoch": 0.7056064933112881, + "flos": 23076161400960.0, + "grad_norm": 1.9121852683341187, + "language_loss": 0.72797191, + "learning_rate": 8.421753416821933e-07, + "loss": 0.74948728, + "num_input_tokens_seen": 253213895, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.13574219, + "step": 11736, + "time_per_iteration": 2.529487371444702 + }, + { + "auxiliary_loss_clip": 0.01117513, + "auxiliary_loss_mlp": 0.01024355, + "balance_loss_clip": 1.04709816, + "balance_loss_mlp": 1.0136559, + "epoch": 0.7056666165639561, + "flos": 24057168721920.0, + "grad_norm": 2.3587178443977295, + "language_loss": 0.68762171, + "learning_rate": 8.41857800556629e-07, + "loss": 0.7090404, + "num_input_tokens_seen": 253231620, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.10693359, + "step": 11737, + "time_per_iteration": 2.4761836528778076 + }, + { + "auxiliary_loss_clip": 0.01124895, + "auxiliary_loss_mlp": 0.01036622, + "balance_loss_clip": 1.05022085, + "balance_loss_mlp": 1.02401602, + "epoch": 0.705726739816624, + "flos": 17493237371520.0, + "grad_norm": 3.3404557524439378, + "language_loss": 0.67396605, + "learning_rate": 8.415403033479332e-07, + "loss": 0.69558126, + "num_input_tokens_seen": 253249590, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.12609863, + "step": 11738, + "time_per_iteration": 2.4563205242156982 + }, + { + "auxiliary_loss_clip": 0.01113867, + "auxiliary_loss_mlp": 0.01031891, + "balance_loss_clip": 1.04113376, + "balance_loss_mlp": 1.01901639, + "epoch": 0.7057868630692921, + "flos": 51350426472960.0, + "grad_norm": 1.8838651321288526, + "language_loss": 0.75210375, + "learning_rate": 8.41222850068145e-07, + "loss": 0.7735613, + "num_input_tokens_seen": 253273870, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.12884521, + "step": 11739, + "time_per_iteration": 2.72149658203125 + }, + { + "auxiliary_loss_clip": 0.01116911, + "auxiliary_loss_mlp": 0.01027421, + "balance_loss_clip": 1.04688716, + "balance_loss_mlp": 1.01545262, + "epoch": 0.70584698632196, + "flos": 26102963836800.0, + "grad_norm": 1.8529534838008996, + "language_loss": 0.71572518, + "learning_rate": 8.409054407293032e-07, + "loss": 0.73716855, + "num_input_tokens_seen": 253293720, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.11962891, + "step": 11740, + "time_per_iteration": 2.504567861557007 + }, + { + "auxiliary_loss_clip": 0.01119362, + "auxiliary_loss_mlp": 0.01024958, + "balance_loss_clip": 1.04768705, + "balance_loss_mlp": 1.01473582, + "epoch": 0.705907109574628, + "flos": 21543134889600.0, + "grad_norm": 1.6869223826096025, + "language_loss": 0.82047486, + "learning_rate": 8.405880753434434e-07, + "loss": 0.84191811, + "num_input_tokens_seen": 253313700, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.10223389, + "step": 11741, + "time_per_iteration": 2.450138568878174 + }, + { + "auxiliary_loss_clip": 0.01112229, + "auxiliary_loss_mlp": 0.01032867, + "balance_loss_clip": 1.04014754, + "balance_loss_mlp": 1.01955748, + "epoch": 0.705967232827296, + "flos": 22710842127360.0, + "grad_norm": 2.0855043190148606, + "language_loss": 0.78110582, + "learning_rate": 8.402707539225993e-07, + "loss": 0.80255675, + "num_input_tokens_seen": 253332425, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.13311768, + "step": 11742, + "time_per_iteration": 2.473500967025757 + }, + { + "auxiliary_loss_clip": 0.01123348, + "auxiliary_loss_mlp": 0.01027826, + "balance_loss_clip": 1.04826808, + "balance_loss_mlp": 1.01539207, + "epoch": 0.7060273560799639, + "flos": 28691225124480.0, + "grad_norm": 1.8149965346789567, + "language_loss": 0.6428678, + "learning_rate": 8.39953476478805e-07, + "loss": 0.6643796, + "num_input_tokens_seen": 253353620, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12451172, + "step": 11743, + "time_per_iteration": 2.533618688583374 + }, + { + "auxiliary_loss_clip": 0.01113006, + "auxiliary_loss_mlp": 0.01039256, + "balance_loss_clip": 1.03896356, + "balance_loss_mlp": 1.0241996, + "epoch": 0.7060874793326319, + "flos": 15706178899200.0, + "grad_norm": 2.755457353601261, + "language_loss": 0.65883672, + "learning_rate": 8.396362430240902e-07, + "loss": 0.68035936, + "num_input_tokens_seen": 253370930, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.15032959, + "step": 11744, + "time_per_iteration": 2.45959734916687 + }, + { + "auxiliary_loss_clip": 0.01118768, + "auxiliary_loss_mlp": 0.01030053, + "balance_loss_clip": 1.046749, + "balance_loss_mlp": 1.01823974, + "epoch": 0.7061476025852998, + "flos": 21506757390720.0, + "grad_norm": 2.4084971641750705, + "language_loss": 0.63635898, + "learning_rate": 8.393190535704857e-07, + "loss": 0.65784717, + "num_input_tokens_seen": 253389810, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11810303, + "step": 11745, + "time_per_iteration": 2.482578754425049 + }, + { + "auxiliary_loss_clip": 0.01114682, + "auxiliary_loss_mlp": 0.01034847, + "balance_loss_clip": 1.04249001, + "balance_loss_mlp": 1.021662, + "epoch": 0.7062077258379679, + "flos": 28181832399360.0, + "grad_norm": 1.8760883887195845, + "language_loss": 0.71691322, + "learning_rate": 8.390019081300188e-07, + "loss": 0.73840851, + "num_input_tokens_seen": 253408685, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.13171387, + "step": 11746, + "time_per_iteration": 3.8801207542419434 + }, + { + "auxiliary_loss_clip": 0.01120618, + "auxiliary_loss_mlp": 0.01033081, + "balance_loss_clip": 1.04591393, + "balance_loss_mlp": 1.01942611, + "epoch": 0.7062678490906358, + "flos": 27853680723840.0, + "grad_norm": 1.4359119582110011, + "language_loss": 0.79577374, + "learning_rate": 8.386848067147175e-07, + "loss": 0.81731069, + "num_input_tokens_seen": 253429685, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.13659668, + "step": 11747, + "time_per_iteration": 2.498518705368042 + }, + { + "auxiliary_loss_clip": 0.01122377, + "auxiliary_loss_mlp": 0.01026007, + "balance_loss_clip": 1.05061817, + "balance_loss_mlp": 1.01531434, + "epoch": 0.7063279723433038, + "flos": 23184862934400.0, + "grad_norm": 2.20669292559005, + "language_loss": 0.65281594, + "learning_rate": 8.383677493366031e-07, + "loss": 0.67429984, + "num_input_tokens_seen": 253448260, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.10693359, + "step": 11748, + "time_per_iteration": 2.487450361251831 + }, + { + "auxiliary_loss_clip": 0.01122865, + "auxiliary_loss_mlp": 0.01039506, + "balance_loss_clip": 1.04562593, + "balance_loss_mlp": 1.02694738, + "epoch": 0.7063880955959717, + "flos": 20188655907840.0, + "grad_norm": 1.9602107939952405, + "language_loss": 0.79275262, + "learning_rate": 8.380507360077003e-07, + "loss": 0.81437629, + "num_input_tokens_seen": 253467725, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.12561035, + "step": 11749, + "time_per_iteration": 2.444812297821045 + }, + { + "auxiliary_loss_clip": 0.01057196, + "auxiliary_loss_mlp": 0.01004918, + "balance_loss_clip": 1.03207695, + "balance_loss_mlp": 1.0036397, + "epoch": 0.7064482188486397, + "flos": 63668182763520.0, + "grad_norm": 0.7923166838413215, + "language_loss": 0.54067594, + "learning_rate": 8.377337667400304e-07, + "loss": 0.56129712, + "num_input_tokens_seen": 253526940, + "router_z_loss_clip": 0.25170898, + "router_z_loss_mlp": 0.01277161, + "step": 11750, + "time_per_iteration": 3.0553364753723145 + }, + { + "auxiliary_loss_clip": 0.01116534, + "auxiliary_loss_mlp": 0.01036643, + "balance_loss_clip": 1.04480636, + "balance_loss_mlp": 1.02289259, + "epoch": 0.7065083421013076, + "flos": 25191227894400.0, + "grad_norm": 1.6883501197959527, + "language_loss": 0.78193063, + "learning_rate": 8.37416841545612e-07, + "loss": 0.80346239, + "num_input_tokens_seen": 253546160, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.13745117, + "step": 11751, + "time_per_iteration": 2.6165688037872314 + }, + { + "auxiliary_loss_clip": 0.01115972, + "auxiliary_loss_mlp": 0.01029147, + "balance_loss_clip": 1.04238415, + "balance_loss_mlp": 1.01829278, + "epoch": 0.7065684653539757, + "flos": 22893699288960.0, + "grad_norm": 2.6488750278801403, + "language_loss": 0.68166238, + "learning_rate": 8.370999604364634e-07, + "loss": 0.7031135, + "num_input_tokens_seen": 253565505, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.10845947, + "step": 11752, + "time_per_iteration": 2.4537482261657715 + }, + { + "auxiliary_loss_clip": 0.01116723, + "auxiliary_loss_mlp": 0.01038955, + "balance_loss_clip": 1.04445887, + "balance_loss_mlp": 1.02675402, + "epoch": 0.7066285886066436, + "flos": 23550254035200.0, + "grad_norm": 1.9490895235124177, + "language_loss": 0.76657301, + "learning_rate": 8.367831234246025e-07, + "loss": 0.78812981, + "num_input_tokens_seen": 253585125, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.12207031, + "step": 11753, + "time_per_iteration": 3.91637921333313 + }, + { + "auxiliary_loss_clip": 0.01119267, + "auxiliary_loss_mlp": 0.01026005, + "balance_loss_clip": 1.04909241, + "balance_loss_mlp": 1.01468039, + "epoch": 0.7066887118593116, + "flos": 21069293650560.0, + "grad_norm": 1.732180530337763, + "language_loss": 0.71151054, + "learning_rate": 8.364663305220405e-07, + "loss": 0.7329632, + "num_input_tokens_seen": 253604815, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.11328125, + "step": 11754, + "time_per_iteration": 2.484067916870117 + }, + { + "auxiliary_loss_clip": 0.01118338, + "auxiliary_loss_mlp": 0.01030807, + "balance_loss_clip": 1.04400373, + "balance_loss_mlp": 1.01870775, + "epoch": 0.7067488351119796, + "flos": 21176307244800.0, + "grad_norm": 1.9173676297423412, + "language_loss": 0.89461035, + "learning_rate": 8.361495817407919e-07, + "loss": 0.91610181, + "num_input_tokens_seen": 253622855, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.12109375, + "step": 11755, + "time_per_iteration": 2.522545099258423 + }, + { + "auxiliary_loss_clip": 0.01111804, + "auxiliary_loss_mlp": 0.01028555, + "balance_loss_clip": 1.04166603, + "balance_loss_mlp": 1.01688445, + "epoch": 0.7068089583646475, + "flos": 20449224144000.0, + "grad_norm": 2.17507763522927, + "language_loss": 0.79610014, + "learning_rate": 8.358328770928678e-07, + "loss": 0.81750381, + "num_input_tokens_seen": 253642760, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.11676025, + "step": 11756, + "time_per_iteration": 2.4430086612701416 + }, + { + "auxiliary_loss_clip": 0.01045961, + "auxiliary_loss_mlp": 0.01004389, + "balance_loss_clip": 1.02051783, + "balance_loss_mlp": 1.00249374, + "epoch": 0.7068690816173155, + "flos": 59109179829120.0, + "grad_norm": 0.8332911542763703, + "language_loss": 0.60419464, + "learning_rate": 8.355162165902785e-07, + "loss": 0.62469816, + "num_input_tokens_seen": 253695685, + "router_z_loss_clip": 0.25488281, + "router_z_loss_mlp": 0.01898193, + "step": 11757, + "time_per_iteration": 2.8615059852600098 + }, + { + "auxiliary_loss_clip": 0.01116559, + "auxiliary_loss_mlp": 0.01032496, + "balance_loss_clip": 1.04512978, + "balance_loss_mlp": 1.02147496, + "epoch": 0.7069292048699835, + "flos": 16251554073600.0, + "grad_norm": 2.519249962391738, + "language_loss": 0.80495822, + "learning_rate": 8.351996002450307e-07, + "loss": 0.8264488, + "num_input_tokens_seen": 253713305, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.11022949, + "step": 11758, + "time_per_iteration": 2.4311604499816895 + }, + { + "auxiliary_loss_clip": 0.0111752, + "auxiliary_loss_mlp": 0.01032711, + "balance_loss_clip": 1.04776537, + "balance_loss_mlp": 1.02076054, + "epoch": 0.7069893281226515, + "flos": 41172768455040.0, + "grad_norm": 4.467245349515281, + "language_loss": 0.7726419, + "learning_rate": 8.348830280691304e-07, + "loss": 0.79414421, + "num_input_tokens_seen": 253736100, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.1194458, + "step": 11759, + "time_per_iteration": 4.0952308177948 + }, + { + "auxiliary_loss_clip": 0.0111441, + "auxiliary_loss_mlp": 0.01029249, + "balance_loss_clip": 1.04244447, + "balance_loss_mlp": 1.01701808, + "epoch": 0.7070494513753194, + "flos": 24207275658240.0, + "grad_norm": 1.5851195256696502, + "language_loss": 0.67768019, + "learning_rate": 8.34566500074583e-07, + "loss": 0.69911677, + "num_input_tokens_seen": 253757350, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.12231445, + "step": 11760, + "time_per_iteration": 2.4798264503479004 + }, + { + "auxiliary_loss_clip": 0.01116905, + "auxiliary_loss_mlp": 0.01031513, + "balance_loss_clip": 1.04376841, + "balance_loss_mlp": 1.02019989, + "epoch": 0.7071095746279874, + "flos": 20185675079040.0, + "grad_norm": 1.9511596643331368, + "language_loss": 0.80530465, + "learning_rate": 8.342500162733899e-07, + "loss": 0.82678878, + "num_input_tokens_seen": 253772855, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11309814, + "step": 11761, + "time_per_iteration": 2.4258596897125244 + }, + { + "auxiliary_loss_clip": 0.01117042, + "auxiliary_loss_mlp": 0.01037692, + "balance_loss_clip": 1.04201674, + "balance_loss_mlp": 1.02395356, + "epoch": 0.7071696978806553, + "flos": 18183045133440.0, + "grad_norm": 2.5604401323881394, + "language_loss": 0.75038779, + "learning_rate": 8.33933576677553e-07, + "loss": 0.77193511, + "num_input_tokens_seen": 253790360, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.13726807, + "step": 11762, + "time_per_iteration": 2.465834140777588 + }, + { + "auxiliary_loss_clip": 0.01119453, + "auxiliary_loss_mlp": 0.0102959, + "balance_loss_clip": 1.04652715, + "balance_loss_mlp": 1.01843202, + "epoch": 0.7072298211333233, + "flos": 24131719399680.0, + "grad_norm": 1.828110526765793, + "language_loss": 0.76988339, + "learning_rate": 8.336171812990724e-07, + "loss": 0.79137373, + "num_input_tokens_seen": 253810585, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11151123, + "step": 11763, + "time_per_iteration": 2.447486400604248 + }, + { + "auxiliary_loss_clip": 0.01118017, + "auxiliary_loss_mlp": 0.01033101, + "balance_loss_clip": 1.04603219, + "balance_loss_mlp": 1.02129912, + "epoch": 0.7072899443859912, + "flos": 27198418867200.0, + "grad_norm": 3.8184588317609953, + "language_loss": 0.78649241, + "learning_rate": 8.333008301499453e-07, + "loss": 0.8080036, + "num_input_tokens_seen": 253829080, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11816406, + "step": 11764, + "time_per_iteration": 2.5281221866607666 + }, + { + "auxiliary_loss_clip": 0.01121649, + "auxiliary_loss_mlp": 0.01035938, + "balance_loss_clip": 1.04728675, + "balance_loss_mlp": 1.02305722, + "epoch": 0.7073500676386593, + "flos": 16435596384000.0, + "grad_norm": 1.6905341377929382, + "language_loss": 0.79474366, + "learning_rate": 8.32984523242167e-07, + "loss": 0.81631947, + "num_input_tokens_seen": 253846780, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.12872314, + "step": 11765, + "time_per_iteration": 2.4251322746276855 + }, + { + "auxiliary_loss_clip": 0.01121017, + "auxiliary_loss_mlp": 0.0102636, + "balance_loss_clip": 1.04896069, + "balance_loss_mlp": 1.01585186, + "epoch": 0.7074101908913272, + "flos": 27673732563840.0, + "grad_norm": 3.7738858589893463, + "language_loss": 0.68586123, + "learning_rate": 8.326682605877324e-07, + "loss": 0.707335, + "num_input_tokens_seen": 253867075, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.10510254, + "step": 11766, + "time_per_iteration": 2.536517858505249 + }, + { + "auxiliary_loss_clip": 0.01119937, + "auxiliary_loss_mlp": 0.01029878, + "balance_loss_clip": 1.04590273, + "balance_loss_mlp": 1.01755834, + "epoch": 0.7074703141439952, + "flos": 22238078296320.0, + "grad_norm": 1.883659072626269, + "language_loss": 0.63963103, + "learning_rate": 8.323520421986352e-07, + "loss": 0.66112912, + "num_input_tokens_seen": 253885790, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.12310791, + "step": 11767, + "time_per_iteration": 2.504138708114624 + }, + { + "auxiliary_loss_clip": 0.01118243, + "auxiliary_loss_mlp": 0.01028106, + "balance_loss_clip": 1.04465234, + "balance_loss_mlp": 1.0166862, + "epoch": 0.7075304373966632, + "flos": 29643217234560.0, + "grad_norm": 1.6071637441441666, + "language_loss": 0.5293864, + "learning_rate": 8.320358680868646e-07, + "loss": 0.55084991, + "num_input_tokens_seen": 253907070, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11425781, + "step": 11768, + "time_per_iteration": 2.5601470470428467 + }, + { + "auxiliary_loss_clip": 0.01119351, + "auxiliary_loss_mlp": 0.01029271, + "balance_loss_clip": 1.04957271, + "balance_loss_mlp": 1.01810741, + "epoch": 0.7075905606493311, + "flos": 19755214490880.0, + "grad_norm": 1.5926234927557414, + "language_loss": 0.75923467, + "learning_rate": 8.317197382644119e-07, + "loss": 0.78072083, + "num_input_tokens_seen": 253927290, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.11157227, + "step": 11769, + "time_per_iteration": 2.4525651931762695 + }, + { + "auxiliary_loss_clip": 0.01056298, + "auxiliary_loss_mlp": 0.01003458, + "balance_loss_clip": 1.03059888, + "balance_loss_mlp": 1.00189328, + "epoch": 0.7076506839019991, + "flos": 65716132694400.0, + "grad_norm": 0.8489704967470955, + "language_loss": 0.61942112, + "learning_rate": 8.314036527432637e-07, + "loss": 0.6400187, + "num_input_tokens_seen": 253983440, + "router_z_loss_clip": 0.25634766, + "router_z_loss_mlp": 0.01564026, + "step": 11770, + "time_per_iteration": 3.0055179595947266 + }, + { + "auxiliary_loss_clip": 0.01121236, + "auxiliary_loss_mlp": 0.0103246, + "balance_loss_clip": 1.0466882, + "balance_loss_mlp": 1.01999044, + "epoch": 0.707710807154667, + "flos": 23765286804480.0, + "grad_norm": 1.821512186513965, + "language_loss": 0.76436138, + "learning_rate": 8.310876115354055e-07, + "loss": 0.78589833, + "num_input_tokens_seen": 254003825, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.12475586, + "step": 11771, + "time_per_iteration": 2.484755039215088 + }, + { + "auxiliary_loss_clip": 0.01114023, + "auxiliary_loss_mlp": 0.01032815, + "balance_loss_clip": 1.04422832, + "balance_loss_mlp": 1.02044129, + "epoch": 0.7077709304073351, + "flos": 21251360712960.0, + "grad_norm": 1.5107537303075171, + "language_loss": 0.70907485, + "learning_rate": 8.307716146528221e-07, + "loss": 0.73054326, + "num_input_tokens_seen": 254023345, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.12390137, + "step": 11772, + "time_per_iteration": 2.5184903144836426 + }, + { + "auxiliary_loss_clip": 0.01118967, + "auxiliary_loss_mlp": 0.01035123, + "balance_loss_clip": 1.04299867, + "balance_loss_mlp": 1.02124119, + "epoch": 0.707831053660003, + "flos": 20740746925440.0, + "grad_norm": 3.2858880109291206, + "language_loss": 0.69719601, + "learning_rate": 8.30455662107496e-07, + "loss": 0.71873689, + "num_input_tokens_seen": 254041815, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.13879395, + "step": 11773, + "time_per_iteration": 2.497114419937134 + }, + { + "auxiliary_loss_clip": 0.01120869, + "auxiliary_loss_mlp": 0.01033729, + "balance_loss_clip": 1.04825997, + "balance_loss_mlp": 1.02190912, + "epoch": 0.707891176912671, + "flos": 21980993679360.0, + "grad_norm": 2.094987775865029, + "language_loss": 0.7030766, + "learning_rate": 8.301397539114095e-07, + "loss": 0.72462261, + "num_input_tokens_seen": 254062065, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11810303, + "step": 11774, + "time_per_iteration": 3.921267509460449 + }, + { + "auxiliary_loss_clip": 0.0111302, + "auxiliary_loss_mlp": 0.01025442, + "balance_loss_clip": 1.04277873, + "balance_loss_mlp": 1.0141654, + "epoch": 0.7079513001653389, + "flos": 21068970428160.0, + "grad_norm": 1.528606385593315, + "language_loss": 0.74581003, + "learning_rate": 8.298238900765407e-07, + "loss": 0.76719463, + "num_input_tokens_seen": 254080605, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.112854, + "step": 11775, + "time_per_iteration": 2.453535795211792 + }, + { + "auxiliary_loss_clip": 0.01125558, + "auxiliary_loss_mlp": 0.01028703, + "balance_loss_clip": 1.04930067, + "balance_loss_mlp": 1.01658499, + "epoch": 0.7080114234180069, + "flos": 18040659621120.0, + "grad_norm": 1.663031217098037, + "language_loss": 0.86966211, + "learning_rate": 8.295080706148665e-07, + "loss": 0.89120471, + "num_input_tokens_seen": 254098710, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12121582, + "step": 11776, + "time_per_iteration": 2.440570116043091 + }, + { + "auxiliary_loss_clip": 0.01127193, + "auxiliary_loss_mlp": 0.01032048, + "balance_loss_clip": 1.0541296, + "balance_loss_mlp": 1.02083051, + "epoch": 0.7080715466706748, + "flos": 15122271409920.0, + "grad_norm": 1.549835361437984, + "language_loss": 0.75142431, + "learning_rate": 8.291922955383641e-07, + "loss": 0.77301669, + "num_input_tokens_seen": 254117200, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11212158, + "step": 11777, + "time_per_iteration": 2.4272713661193848 + }, + { + "auxiliary_loss_clip": 0.01121975, + "auxiliary_loss_mlp": 0.01030997, + "balance_loss_clip": 1.04589844, + "balance_loss_mlp": 1.01864719, + "epoch": 0.7081316699233429, + "flos": 14422802889600.0, + "grad_norm": 2.2095943585117905, + "language_loss": 0.8211472, + "learning_rate": 8.288765648590066e-07, + "loss": 0.84267688, + "num_input_tokens_seen": 254132115, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.12353516, + "step": 11778, + "time_per_iteration": 2.4451792240142822 + }, + { + "auxiliary_loss_clip": 0.01119192, + "auxiliary_loss_mlp": 0.0102886, + "balance_loss_clip": 1.05097127, + "balance_loss_mlp": 1.01846528, + "epoch": 0.7081917931760108, + "flos": 23222389668480.0, + "grad_norm": 1.8901289739929112, + "language_loss": 0.84848976, + "learning_rate": 8.285608785887673e-07, + "loss": 0.86997032, + "num_input_tokens_seen": 254152285, + "router_z_loss_clip": 0.68164062, + "router_z_loss_mlp": 0.10394287, + "step": 11779, + "time_per_iteration": 2.509908676147461 + }, + { + "auxiliary_loss_clip": 0.01113717, + "auxiliary_loss_mlp": 0.01032278, + "balance_loss_clip": 1.04111063, + "balance_loss_mlp": 1.020208, + "epoch": 0.7082519164286788, + "flos": 39308429871360.0, + "grad_norm": 2.1359131365819146, + "language_loss": 0.7175138, + "learning_rate": 8.28245236739618e-07, + "loss": 0.73897374, + "num_input_tokens_seen": 254172805, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.12060547, + "step": 11780, + "time_per_iteration": 2.5989742279052734 + }, + { + "auxiliary_loss_clip": 0.01111452, + "auxiliary_loss_mlp": 0.01029429, + "balance_loss_clip": 1.0404489, + "balance_loss_mlp": 1.01775241, + "epoch": 0.7083120396813467, + "flos": 21651154064640.0, + "grad_norm": 1.4111290665036709, + "language_loss": 0.72992641, + "learning_rate": 8.279296393235256e-07, + "loss": 0.7513352, + "num_input_tokens_seen": 254191890, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11688232, + "step": 11781, + "time_per_iteration": 2.4687511920928955 + }, + { + "auxiliary_loss_clip": 0.01114435, + "auxiliary_loss_mlp": 0.01028924, + "balance_loss_clip": 1.04418945, + "balance_loss_mlp": 1.01810527, + "epoch": 0.7083721629340147, + "flos": 17567033863680.0, + "grad_norm": 1.7204493591877648, + "language_loss": 0.77372873, + "learning_rate": 8.276140863524585e-07, + "loss": 0.79516232, + "num_input_tokens_seen": 254210150, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.10827637, + "step": 11782, + "time_per_iteration": 2.4083452224731445 + }, + { + "auxiliary_loss_clip": 0.01107956, + "auxiliary_loss_mlp": 0.01027045, + "balance_loss_clip": 1.037902, + "balance_loss_mlp": 1.01671004, + "epoch": 0.7084322861866827, + "flos": 29350509304320.0, + "grad_norm": 1.6891538687339411, + "language_loss": 0.69526243, + "learning_rate": 8.272985778383828e-07, + "loss": 0.71661246, + "num_input_tokens_seen": 254233015, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.10333252, + "step": 11783, + "time_per_iteration": 2.5165796279907227 + }, + { + "auxiliary_loss_clip": 0.01119637, + "auxiliary_loss_mlp": 0.01029759, + "balance_loss_clip": 1.04495358, + "balance_loss_mlp": 1.01814187, + "epoch": 0.7084924094393507, + "flos": 20194294343040.0, + "grad_norm": 1.7483461728087601, + "language_loss": 0.79306591, + "learning_rate": 8.269831137932632e-07, + "loss": 0.81455988, + "num_input_tokens_seen": 254251345, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.11627197, + "step": 11784, + "time_per_iteration": 2.4593703746795654 + }, + { + "auxiliary_loss_clip": 0.01113658, + "auxiliary_loss_mlp": 0.01033832, + "balance_loss_clip": 1.04143596, + "balance_loss_mlp": 1.02201843, + "epoch": 0.7085525326920187, + "flos": 23477211728640.0, + "grad_norm": 1.6912474518247254, + "language_loss": 0.7699309, + "learning_rate": 8.266676942290609e-07, + "loss": 0.7914058, + "num_input_tokens_seen": 254269905, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11828613, + "step": 11785, + "time_per_iteration": 2.4297494888305664 + }, + { + "auxiliary_loss_clip": 0.01113405, + "auxiliary_loss_mlp": 0.01034827, + "balance_loss_clip": 1.04282987, + "balance_loss_mlp": 1.02194619, + "epoch": 0.7086126559446866, + "flos": 25958818558080.0, + "grad_norm": 1.9027441717516582, + "language_loss": 0.78343749, + "learning_rate": 8.26352319157738e-07, + "loss": 0.80491984, + "num_input_tokens_seen": 254289990, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.12878418, + "step": 11786, + "time_per_iteration": 2.508972406387329 + }, + { + "auxiliary_loss_clip": 0.0111813, + "auxiliary_loss_mlp": 0.01028081, + "balance_loss_clip": 1.04332697, + "balance_loss_mlp": 1.01621413, + "epoch": 0.7086727791973546, + "flos": 26724793109760.0, + "grad_norm": 2.4437004267041686, + "language_loss": 0.78778487, + "learning_rate": 8.260369885912526e-07, + "loss": 0.80924702, + "num_input_tokens_seen": 254309085, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.11877441, + "step": 11787, + "time_per_iteration": 2.4895095825195312 + }, + { + "auxiliary_loss_clip": 0.01115008, + "auxiliary_loss_mlp": 0.01027393, + "balance_loss_clip": 1.04269409, + "balance_loss_mlp": 1.0158242, + "epoch": 0.7087329024500225, + "flos": 21683365585920.0, + "grad_norm": 2.0278258964216036, + "language_loss": 0.76791704, + "learning_rate": 8.257217025415615e-07, + "loss": 0.78934097, + "num_input_tokens_seen": 254327045, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11572266, + "step": 11788, + "time_per_iteration": 2.4616823196411133 + }, + { + "auxiliary_loss_clip": 0.01127689, + "auxiliary_loss_mlp": 0.01033452, + "balance_loss_clip": 1.04628897, + "balance_loss_mlp": 1.0198741, + "epoch": 0.7087930257026905, + "flos": 17931060247680.0, + "grad_norm": 2.0916681029495483, + "language_loss": 0.67963696, + "learning_rate": 8.254064610206212e-07, + "loss": 0.70124835, + "num_input_tokens_seen": 254344585, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.13580322, + "step": 11789, + "time_per_iteration": 2.3948371410369873 + }, + { + "auxiliary_loss_clip": 0.0112148, + "auxiliary_loss_mlp": 0.01029803, + "balance_loss_clip": 1.04429841, + "balance_loss_mlp": 1.01695216, + "epoch": 0.7088531489553584, + "flos": 18911528864640.0, + "grad_norm": 1.7348003138670447, + "language_loss": 0.77476978, + "learning_rate": 8.250912640403858e-07, + "loss": 0.79628259, + "num_input_tokens_seen": 254362470, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.12854004, + "step": 11790, + "time_per_iteration": 3.9619460105895996 + }, + { + "auxiliary_loss_clip": 0.01122161, + "auxiliary_loss_mlp": 0.01031863, + "balance_loss_clip": 1.04320514, + "balance_loss_mlp": 1.01871419, + "epoch": 0.7089132722080265, + "flos": 27380880979200.0, + "grad_norm": 1.6697298333281891, + "language_loss": 0.71063316, + "learning_rate": 8.247761116128085e-07, + "loss": 0.73217344, + "num_input_tokens_seen": 254383190, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.13146973, + "step": 11791, + "time_per_iteration": 2.5222442150115967 + }, + { + "auxiliary_loss_clip": 0.01115553, + "auxiliary_loss_mlp": 0.01028596, + "balance_loss_clip": 1.04379618, + "balance_loss_mlp": 1.01613832, + "epoch": 0.7089733954606944, + "flos": 22162917087360.0, + "grad_norm": 2.0698349029034007, + "language_loss": 0.8187294, + "learning_rate": 8.244610037498376e-07, + "loss": 0.84017092, + "num_input_tokens_seen": 254403115, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.12457275, + "step": 11792, + "time_per_iteration": 2.4309709072113037 + }, + { + "auxiliary_loss_clip": 0.01113429, + "auxiliary_loss_mlp": 0.01028565, + "balance_loss_clip": 1.03781533, + "balance_loss_mlp": 1.01594675, + "epoch": 0.7090335187133624, + "flos": 24425827960320.0, + "grad_norm": 3.0487718433600177, + "language_loss": 0.64528513, + "learning_rate": 8.241459404634232e-07, + "loss": 0.66670501, + "num_input_tokens_seen": 254421875, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.12628174, + "step": 11793, + "time_per_iteration": 2.481886148452759 + }, + { + "auxiliary_loss_clip": 0.01112701, + "auxiliary_loss_mlp": 0.01028327, + "balance_loss_clip": 1.0410285, + "balance_loss_mlp": 1.01687729, + "epoch": 0.7090936419660303, + "flos": 21835232288640.0, + "grad_norm": 2.3138286523735174, + "language_loss": 0.70261681, + "learning_rate": 8.238309217655133e-07, + "loss": 0.7240271, + "num_input_tokens_seen": 254440765, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11437988, + "step": 11794, + "time_per_iteration": 2.470612049102783 + }, + { + "auxiliary_loss_clip": 0.01117293, + "auxiliary_loss_mlp": 0.01035419, + "balance_loss_clip": 1.04528856, + "balance_loss_mlp": 1.02262163, + "epoch": 0.7091537652186983, + "flos": 20082360585600.0, + "grad_norm": 1.6984249346838753, + "language_loss": 0.76078606, + "learning_rate": 8.23515947668052e-07, + "loss": 0.78231311, + "num_input_tokens_seen": 254459480, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.12780762, + "step": 11795, + "time_per_iteration": 2.4383113384246826 + }, + { + "auxiliary_loss_clip": 0.01118771, + "auxiliary_loss_mlp": 0.01030204, + "balance_loss_clip": 1.0440228, + "balance_loss_mlp": 1.01844335, + "epoch": 0.7092138884713663, + "flos": 13151565676800.0, + "grad_norm": 2.2743307779072715, + "language_loss": 0.74764305, + "learning_rate": 8.232010181829838e-07, + "loss": 0.76913279, + "num_input_tokens_seen": 254473985, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.11749268, + "step": 11796, + "time_per_iteration": 2.363238573074341 + }, + { + "auxiliary_loss_clip": 0.01123859, + "auxiliary_loss_mlp": 0.01040198, + "balance_loss_clip": 1.04498184, + "balance_loss_mlp": 1.02477288, + "epoch": 0.7092740117240343, + "flos": 21645982506240.0, + "grad_norm": 1.6612564652712487, + "language_loss": 0.74646652, + "learning_rate": 8.228861333222523e-07, + "loss": 0.76810712, + "num_input_tokens_seen": 254492135, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.15441895, + "step": 11797, + "time_per_iteration": 3.8469557762145996 + }, + { + "auxiliary_loss_clip": 0.01115709, + "auxiliary_loss_mlp": 0.01033494, + "balance_loss_clip": 1.04288435, + "balance_loss_mlp": 1.0196538, + "epoch": 0.7093341349767023, + "flos": 21032521102080.0, + "grad_norm": 1.6067975435200215, + "language_loss": 0.7948209, + "learning_rate": 8.225712930977953e-07, + "loss": 0.81631303, + "num_input_tokens_seen": 254512865, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.1385498, + "step": 11798, + "time_per_iteration": 2.4317715167999268 + }, + { + "auxiliary_loss_clip": 0.01114509, + "auxiliary_loss_mlp": 0.01029848, + "balance_loss_clip": 1.04314566, + "balance_loss_mlp": 1.01806462, + "epoch": 0.7093942582293702, + "flos": 22017658487040.0, + "grad_norm": 1.8532037574615665, + "language_loss": 0.66500509, + "learning_rate": 8.222564975215529e-07, + "loss": 0.68644869, + "num_input_tokens_seen": 254532605, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.11791992, + "step": 11799, + "time_per_iteration": 2.452741861343384 + }, + { + "auxiliary_loss_clip": 0.01113523, + "auxiliary_loss_mlp": 0.01032903, + "balance_loss_clip": 1.04056954, + "balance_loss_mlp": 1.01873469, + "epoch": 0.7094543814820382, + "flos": 27235586465280.0, + "grad_norm": 3.4025155884413385, + "language_loss": 0.81856263, + "learning_rate": 8.219417466054622e-07, + "loss": 0.84002692, + "num_input_tokens_seen": 254553780, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.14172363, + "step": 11800, + "time_per_iteration": 2.590280294418335 + }, + { + "auxiliary_loss_clip": 0.01113355, + "auxiliary_loss_mlp": 0.01036162, + "balance_loss_clip": 1.04236794, + "balance_loss_mlp": 1.02315617, + "epoch": 0.7095145047347061, + "flos": 12089148180480.0, + "grad_norm": 1.8912658771362179, + "language_loss": 0.86748493, + "learning_rate": 8.21627040361459e-07, + "loss": 0.88898009, + "num_input_tokens_seen": 254567510, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.13012695, + "step": 11801, + "time_per_iteration": 2.5694191455841064 + }, + { + "auxiliary_loss_clip": 0.01119248, + "auxiliary_loss_mlp": 0.01032568, + "balance_loss_clip": 1.04390371, + "balance_loss_mlp": 1.02050424, + "epoch": 0.7095746279873741, + "flos": 19383789905280.0, + "grad_norm": 1.8713577894617734, + "language_loss": 0.76390851, + "learning_rate": 8.213123788014758e-07, + "loss": 0.78542662, + "num_input_tokens_seen": 254585565, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.12054443, + "step": 11802, + "time_per_iteration": 2.501967668533325 + }, + { + "auxiliary_loss_clip": 0.01128614, + "auxiliary_loss_mlp": 0.01039321, + "balance_loss_clip": 1.05260539, + "balance_loss_mlp": 1.02769756, + "epoch": 0.709634751240042, + "flos": 21360600950400.0, + "grad_norm": 1.6996710044938559, + "language_loss": 0.81839979, + "learning_rate": 8.209977619374462e-07, + "loss": 0.84007913, + "num_input_tokens_seen": 254603465, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.1161499, + "step": 11803, + "time_per_iteration": 3.8926713466644287 + }, + { + "auxiliary_loss_clip": 0.01118597, + "auxiliary_loss_mlp": 0.01027949, + "balance_loss_clip": 1.04405594, + "balance_loss_mlp": 1.01512253, + "epoch": 0.7096948744927101, + "flos": 13917037438080.0, + "grad_norm": 2.5546487959535904, + "language_loss": 0.67457008, + "learning_rate": 8.206831897812995e-07, + "loss": 0.69603562, + "num_input_tokens_seen": 254620500, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.12823486, + "step": 11804, + "time_per_iteration": 2.415822982788086 + }, + { + "auxiliary_loss_clip": 0.01116646, + "auxiliary_loss_mlp": 0.01024973, + "balance_loss_clip": 1.04765344, + "balance_loss_mlp": 1.01436365, + "epoch": 0.709754997745378, + "flos": 30298335436800.0, + "grad_norm": 3.073274780112177, + "language_loss": 0.78355157, + "learning_rate": 8.203686623449637e-07, + "loss": 0.80496776, + "num_input_tokens_seen": 254638565, + "router_z_loss_clip": 0.69042969, + "router_z_loss_mlp": 0.10614014, + "step": 11805, + "time_per_iteration": 2.4971771240234375 + }, + { + "auxiliary_loss_clip": 0.01124917, + "auxiliary_loss_mlp": 0.01031738, + "balance_loss_clip": 1.04909873, + "balance_loss_mlp": 1.01906586, + "epoch": 0.709815120998046, + "flos": 18515147304960.0, + "grad_norm": 3.349138867517634, + "language_loss": 0.79126585, + "learning_rate": 8.200541796403667e-07, + "loss": 0.81283236, + "num_input_tokens_seen": 254657505, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.12683105, + "step": 11806, + "time_per_iteration": 2.4177231788635254 + }, + { + "auxiliary_loss_clip": 0.01120332, + "auxiliary_loss_mlp": 0.01030384, + "balance_loss_clip": 1.04602098, + "balance_loss_mlp": 1.01929796, + "epoch": 0.7098752442507139, + "flos": 22272588288000.0, + "grad_norm": 2.324460931537336, + "language_loss": 0.56400812, + "learning_rate": 8.197397416794332e-07, + "loss": 0.58551532, + "num_input_tokens_seen": 254674730, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11090088, + "step": 11807, + "time_per_iteration": 2.409916877746582 + }, + { + "auxiliary_loss_clip": 0.01120922, + "auxiliary_loss_mlp": 0.01035744, + "balance_loss_clip": 1.04375172, + "balance_loss_mlp": 1.0235368, + "epoch": 0.7099353675033819, + "flos": 19275447507840.0, + "grad_norm": 2.0746067805125774, + "language_loss": 0.68646526, + "learning_rate": 8.194253484740882e-07, + "loss": 0.70803189, + "num_input_tokens_seen": 254691665, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.12200928, + "step": 11808, + "time_per_iteration": 2.4286930561065674 + }, + { + "auxiliary_loss_clip": 0.01116891, + "auxiliary_loss_mlp": 0.01028881, + "balance_loss_clip": 1.04283953, + "balance_loss_mlp": 1.01740086, + "epoch": 0.70999549075605, + "flos": 21908525990400.0, + "grad_norm": 2.5051145818081353, + "language_loss": 0.71298969, + "learning_rate": 8.191110000362513e-07, + "loss": 0.73444742, + "num_input_tokens_seen": 254711610, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11499023, + "step": 11809, + "time_per_iteration": 2.4198014736175537 + }, + { + "auxiliary_loss_clip": 0.01051229, + "auxiliary_loss_mlp": 0.01004705, + "balance_loss_clip": 1.02443385, + "balance_loss_mlp": 1.00301719, + "epoch": 0.7100556140087179, + "flos": 70456053456000.0, + "grad_norm": 0.7559114856850926, + "language_loss": 0.59477162, + "learning_rate": 8.187966963778435e-07, + "loss": 0.61533099, + "num_input_tokens_seen": 254772615, + "router_z_loss_clip": 0.26806641, + "router_z_loss_mlp": 0.01689148, + "step": 11810, + "time_per_iteration": 3.1366820335388184 + }, + { + "auxiliary_loss_clip": 0.0111942, + "auxiliary_loss_mlp": 0.01035387, + "balance_loss_clip": 1.04704261, + "balance_loss_mlp": 1.02398515, + "epoch": 0.7101157372613859, + "flos": 23039568420480.0, + "grad_norm": 1.6290263851262636, + "language_loss": 0.74162352, + "learning_rate": 8.18482437510784e-07, + "loss": 0.76317167, + "num_input_tokens_seen": 254791375, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11401367, + "step": 11811, + "time_per_iteration": 2.4404213428497314 + }, + { + "auxiliary_loss_clip": 0.01116182, + "auxiliary_loss_mlp": 0.01024026, + "balance_loss_clip": 1.04734063, + "balance_loss_mlp": 1.01282644, + "epoch": 0.7101758605140538, + "flos": 23185329811200.0, + "grad_norm": 3.235228102485966, + "language_loss": 0.83664358, + "learning_rate": 8.181682234469882e-07, + "loss": 0.85804564, + "num_input_tokens_seen": 254809300, + "router_z_loss_clip": 0.68896484, + "router_z_loss_mlp": 0.11199951, + "step": 11812, + "time_per_iteration": 2.4861788749694824 + }, + { + "auxiliary_loss_clip": 0.0112367, + "auxiliary_loss_mlp": 0.01030288, + "balance_loss_clip": 1.05021, + "balance_loss_mlp": 1.01732397, + "epoch": 0.7102359837667218, + "flos": 23696123166720.0, + "grad_norm": 1.822180389186508, + "language_loss": 0.69638228, + "learning_rate": 8.178540541983716e-07, + "loss": 0.71792185, + "num_input_tokens_seen": 254829325, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.1295166, + "step": 11813, + "time_per_iteration": 2.443711757659912 + }, + { + "auxiliary_loss_clip": 0.01116979, + "auxiliary_loss_mlp": 0.01033943, + "balance_loss_clip": 1.04222238, + "balance_loss_mlp": 1.02133095, + "epoch": 0.7102961070193897, + "flos": 19391116279680.0, + "grad_norm": 2.0408733990487677, + "language_loss": 0.81980836, + "learning_rate": 8.175399297768495e-07, + "loss": 0.84131759, + "num_input_tokens_seen": 254847690, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.12609863, + "step": 11814, + "time_per_iteration": 2.512927532196045 + }, + { + "auxiliary_loss_clip": 0.01114685, + "auxiliary_loss_mlp": 0.01028384, + "balance_loss_clip": 1.04294503, + "balance_loss_mlp": 1.01665437, + "epoch": 0.7103562302720577, + "flos": 21507511576320.0, + "grad_norm": 1.6415504327827786, + "language_loss": 0.75617158, + "learning_rate": 8.172258501943301e-07, + "loss": 0.77760231, + "num_input_tokens_seen": 254865960, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11712646, + "step": 11815, + "time_per_iteration": 2.4445903301239014 + }, + { + "auxiliary_loss_clip": 0.0111203, + "auxiliary_loss_mlp": 0.01029162, + "balance_loss_clip": 1.03972423, + "balance_loss_mlp": 1.01757455, + "epoch": 0.7104163535247257, + "flos": 14535059869440.0, + "grad_norm": 1.72693464127795, + "language_loss": 0.7856676, + "learning_rate": 8.16911815462725e-07, + "loss": 0.80707949, + "num_input_tokens_seen": 254882815, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11590576, + "step": 11816, + "time_per_iteration": 2.422452688217163 + }, + { + "auxiliary_loss_clip": 0.01118818, + "auxiliary_loss_mlp": 0.01039776, + "balance_loss_clip": 1.04385948, + "balance_loss_mlp": 1.02812934, + "epoch": 0.7104764767773937, + "flos": 11400310085760.0, + "grad_norm": 1.732252459208691, + "language_loss": 0.86524558, + "learning_rate": 8.165978255939426e-07, + "loss": 0.88683152, + "num_input_tokens_seen": 254898705, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11657715, + "step": 11817, + "time_per_iteration": 3.853419303894043 + }, + { + "auxiliary_loss_clip": 0.0112146, + "auxiliary_loss_mlp": 0.01029761, + "balance_loss_clip": 1.04628205, + "balance_loss_mlp": 1.01842439, + "epoch": 0.7105366000300616, + "flos": 11690432236800.0, + "grad_norm": 2.4758515770873637, + "language_loss": 0.84667361, + "learning_rate": 8.162838805998897e-07, + "loss": 0.86818576, + "num_input_tokens_seen": 254913665, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.11340332, + "step": 11818, + "time_per_iteration": 2.4143943786621094 + }, + { + "auxiliary_loss_clip": 0.01117891, + "auxiliary_loss_mlp": 0.01033991, + "balance_loss_clip": 1.04315829, + "balance_loss_mlp": 1.02159357, + "epoch": 0.7105967232827296, + "flos": 19354020508800.0, + "grad_norm": 3.2947171382735094, + "language_loss": 0.75582266, + "learning_rate": 8.159699804924709e-07, + "loss": 0.77734149, + "num_input_tokens_seen": 254932140, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.12390137, + "step": 11819, + "time_per_iteration": 2.4404635429382324 + }, + { + "auxiliary_loss_clip": 0.01122041, + "auxiliary_loss_mlp": 0.01028408, + "balance_loss_clip": 1.04712415, + "balance_loss_mlp": 1.01504445, + "epoch": 0.7106568465353975, + "flos": 22930400010240.0, + "grad_norm": 1.7699665308362238, + "language_loss": 0.71120059, + "learning_rate": 8.156561252835883e-07, + "loss": 0.73270512, + "num_input_tokens_seen": 254951580, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.13348389, + "step": 11820, + "time_per_iteration": 2.487576484680176 + }, + { + "auxiliary_loss_clip": 0.01119023, + "auxiliary_loss_mlp": 0.01025, + "balance_loss_clip": 1.04740572, + "balance_loss_mlp": 1.01382387, + "epoch": 0.7107169697880655, + "flos": 19099665325440.0, + "grad_norm": 1.7981058110266224, + "language_loss": 0.75275314, + "learning_rate": 8.153423149851449e-07, + "loss": 0.77419341, + "num_input_tokens_seen": 254969425, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11181641, + "step": 11821, + "time_per_iteration": 2.425675630569458 + }, + { + "auxiliary_loss_clip": 0.01057445, + "auxiliary_loss_mlp": 0.01003846, + "balance_loss_clip": 1.03171802, + "balance_loss_mlp": 1.00224257, + "epoch": 0.7107770930407336, + "flos": 63638054231040.0, + "grad_norm": 0.772312931331022, + "language_loss": 0.55080652, + "learning_rate": 8.150285496090388e-07, + "loss": 0.57141936, + "num_input_tokens_seen": 255032680, + "router_z_loss_clip": 0.25683594, + "router_z_loss_mlp": 0.01602173, + "step": 11822, + "time_per_iteration": 3.105417251586914 + }, + { + "auxiliary_loss_clip": 0.01107449, + "auxiliary_loss_mlp": 0.01025127, + "balance_loss_clip": 1.03884745, + "balance_loss_mlp": 1.01337886, + "epoch": 0.7108372162934015, + "flos": 22054466949120.0, + "grad_norm": 4.658270207056769, + "language_loss": 0.60412121, + "learning_rate": 8.147148291671688e-07, + "loss": 0.62544703, + "num_input_tokens_seen": 255054400, + "router_z_loss_clip": 0.68554688, + "router_z_loss_mlp": 0.11743164, + "step": 11823, + "time_per_iteration": 2.525651216506958 + }, + { + "auxiliary_loss_clip": 0.01118116, + "auxiliary_loss_mlp": 0.01027616, + "balance_loss_clip": 1.04575098, + "balance_loss_mlp": 1.01652956, + "epoch": 0.7108973395460695, + "flos": 19135144984320.0, + "grad_norm": 2.764632158282062, + "language_loss": 0.71303606, + "learning_rate": 8.144011536714322e-07, + "loss": 0.73449337, + "num_input_tokens_seen": 255072785, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11090088, + "step": 11824, + "time_per_iteration": 2.43881893157959 + }, + { + "auxiliary_loss_clip": 0.01113558, + "auxiliary_loss_mlp": 0.01028529, + "balance_loss_clip": 1.04370606, + "balance_loss_mlp": 1.01825929, + "epoch": 0.7109574627987374, + "flos": 17894431353600.0, + "grad_norm": 1.9128392147845918, + "language_loss": 0.72823107, + "learning_rate": 8.140875231337223e-07, + "loss": 0.74965203, + "num_input_tokens_seen": 255091820, + "router_z_loss_clip": 0.69775391, + "router_z_loss_mlp": 0.10272217, + "step": 11825, + "time_per_iteration": 2.486699104309082 + }, + { + "auxiliary_loss_clip": 0.011173, + "auxiliary_loss_mlp": 0.01033673, + "balance_loss_clip": 1.04449272, + "balance_loss_mlp": 1.02218091, + "epoch": 0.7110175860514054, + "flos": 28979623422720.0, + "grad_norm": 3.3052366210096844, + "language_loss": 0.79479778, + "learning_rate": 8.137739375659321e-07, + "loss": 0.81630754, + "num_input_tokens_seen": 255111720, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11486816, + "step": 11826, + "time_per_iteration": 2.4985742568969727 + }, + { + "auxiliary_loss_clip": 0.01114412, + "auxiliary_loss_mlp": 0.01031403, + "balance_loss_clip": 1.04428172, + "balance_loss_mlp": 1.02059686, + "epoch": 0.7110777093040733, + "flos": 26173312623360.0, + "grad_norm": 2.201900649496405, + "language_loss": 0.83251452, + "learning_rate": 8.134603969799527e-07, + "loss": 0.85397267, + "num_input_tokens_seen": 255133495, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.10809326, + "step": 11827, + "time_per_iteration": 2.5197527408599854 + }, + { + "auxiliary_loss_clip": 0.01110395, + "auxiliary_loss_mlp": 0.01034683, + "balance_loss_clip": 1.0385083, + "balance_loss_mlp": 1.02260721, + "epoch": 0.7111378325567413, + "flos": 26869943969280.0, + "grad_norm": 1.8477141800587036, + "language_loss": 0.62384719, + "learning_rate": 8.131469013876748e-07, + "loss": 0.645298, + "num_input_tokens_seen": 255156880, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.12097168, + "step": 11828, + "time_per_iteration": 2.515228509902954 + }, + { + "auxiliary_loss_clip": 0.01113997, + "auxiliary_loss_mlp": 0.01030771, + "balance_loss_clip": 1.04140091, + "balance_loss_mlp": 1.01839149, + "epoch": 0.7111979558094093, + "flos": 27271820309760.0, + "grad_norm": 1.4811843908165505, + "language_loss": 0.7227357, + "learning_rate": 8.128334508009846e-07, + "loss": 0.74418336, + "num_input_tokens_seen": 255178920, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.12371826, + "step": 11829, + "time_per_iteration": 2.508725166320801 + }, + { + "auxiliary_loss_clip": 0.01112822, + "auxiliary_loss_mlp": 0.01028229, + "balance_loss_clip": 1.041188, + "balance_loss_mlp": 1.01755404, + "epoch": 0.7112580790620773, + "flos": 25046938961280.0, + "grad_norm": 2.8900238164897694, + "language_loss": 0.80245948, + "learning_rate": 8.125200452317697e-07, + "loss": 0.82387, + "num_input_tokens_seen": 255198095, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.10668945, + "step": 11830, + "time_per_iteration": 2.4764726161956787 + }, + { + "auxiliary_loss_clip": 0.01108566, + "auxiliary_loss_mlp": 0.010368, + "balance_loss_clip": 1.03732491, + "balance_loss_mlp": 1.02546287, + "epoch": 0.7113182023147452, + "flos": 21646628951040.0, + "grad_norm": 2.1248421285620305, + "language_loss": 0.84196812, + "learning_rate": 8.122066846919138e-07, + "loss": 0.8634218, + "num_input_tokens_seen": 255215860, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.11340332, + "step": 11831, + "time_per_iteration": 2.483211040496826 + }, + { + "auxiliary_loss_clip": 0.01114314, + "auxiliary_loss_mlp": 0.01029914, + "balance_loss_clip": 1.04055452, + "balance_loss_mlp": 1.01841044, + "epoch": 0.7113783255674132, + "flos": 20996287257600.0, + "grad_norm": 2.4010424563167576, + "language_loss": 0.77137977, + "learning_rate": 8.118933691932985e-07, + "loss": 0.792822, + "num_input_tokens_seen": 255235425, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11505127, + "step": 11832, + "time_per_iteration": 2.425363302230835 + }, + { + "auxiliary_loss_clip": 0.01052251, + "auxiliary_loss_mlp": 0.01011054, + "balance_loss_clip": 1.02625561, + "balance_loss_mlp": 1.00967455, + "epoch": 0.7114384488200811, + "flos": 66771080161920.0, + "grad_norm": 0.7437551878307324, + "language_loss": 0.56595731, + "learning_rate": 8.115800987478059e-07, + "loss": 0.58659041, + "num_input_tokens_seen": 255291680, + "router_z_loss_clip": 0.26074219, + "router_z_loss_mlp": 0.0138092, + "step": 11833, + "time_per_iteration": 2.972372531890869 + }, + { + "auxiliary_loss_clip": 0.01114716, + "auxiliary_loss_mlp": 0.01037118, + "balance_loss_clip": 1.04201257, + "balance_loss_mlp": 1.02597165, + "epoch": 0.7114985720727491, + "flos": 25010058672000.0, + "grad_norm": 1.5265384858694262, + "language_loss": 0.71079159, + "learning_rate": 8.11266873367315e-07, + "loss": 0.73230994, + "num_input_tokens_seen": 255313880, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.1114502, + "step": 11834, + "time_per_iteration": 3.922353506088257 + }, + { + "auxiliary_loss_clip": 0.01113132, + "auxiliary_loss_mlp": 0.0103236, + "balance_loss_clip": 1.03937554, + "balance_loss_mlp": 1.01904464, + "epoch": 0.7115586953254172, + "flos": 21470128496640.0, + "grad_norm": 1.8914573830303434, + "language_loss": 0.79392415, + "learning_rate": 8.10953693063704e-07, + "loss": 0.81537902, + "num_input_tokens_seen": 255332390, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.13305664, + "step": 11835, + "time_per_iteration": 2.47890043258667 + }, + { + "auxiliary_loss_clip": 0.01116018, + "auxiliary_loss_mlp": 0.01029946, + "balance_loss_clip": 1.0452199, + "balance_loss_mlp": 1.01882339, + "epoch": 0.7116188185780851, + "flos": 28622600190720.0, + "grad_norm": 1.914130031234233, + "language_loss": 0.76010334, + "learning_rate": 8.10640557848848e-07, + "loss": 0.78156304, + "num_input_tokens_seen": 255354025, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.11132812, + "step": 11836, + "time_per_iteration": 2.4854071140289307 + }, + { + "auxiliary_loss_clip": 0.01107625, + "auxiliary_loss_mlp": 0.01029356, + "balance_loss_clip": 1.03694773, + "balance_loss_mlp": 1.0177207, + "epoch": 0.7116789418307531, + "flos": 25293608634240.0, + "grad_norm": 1.7108772592371566, + "language_loss": 0.70280182, + "learning_rate": 8.103274677346208e-07, + "loss": 0.72417164, + "num_input_tokens_seen": 255371400, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.11639404, + "step": 11837, + "time_per_iteration": 2.619741916656494 + }, + { + "auxiliary_loss_clip": 0.01117955, + "auxiliary_loss_mlp": 0.01036324, + "balance_loss_clip": 1.042858, + "balance_loss_mlp": 1.0229665, + "epoch": 0.711739065083421, + "flos": 25557301353600.0, + "grad_norm": 1.8688435283914908, + "language_loss": 0.61399961, + "learning_rate": 8.100144227328958e-07, + "loss": 0.63554239, + "num_input_tokens_seen": 255390710, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.13360596, + "step": 11838, + "time_per_iteration": 2.4908206462860107 + }, + { + "auxiliary_loss_clip": 0.01117512, + "auxiliary_loss_mlp": 0.01029842, + "balance_loss_clip": 1.04639876, + "balance_loss_mlp": 1.01825511, + "epoch": 0.711799188336089, + "flos": 26140993361280.0, + "grad_norm": 2.950663319093874, + "language_loss": 0.68046838, + "learning_rate": 8.097014228555426e-07, + "loss": 0.70194191, + "num_input_tokens_seen": 255408790, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.11590576, + "step": 11839, + "time_per_iteration": 2.4976449012756348 + }, + { + "auxiliary_loss_clip": 0.01117269, + "auxiliary_loss_mlp": 0.0103338, + "balance_loss_clip": 1.04376411, + "balance_loss_mlp": 1.02107763, + "epoch": 0.7118593115887569, + "flos": 21140648017920.0, + "grad_norm": 3.1551385854300875, + "language_loss": 0.84418708, + "learning_rate": 8.093884681144305e-07, + "loss": 0.86569351, + "num_input_tokens_seen": 255426280, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.12316895, + "step": 11840, + "time_per_iteration": 3.9722084999084473 + }, + { + "auxiliary_loss_clip": 0.01122816, + "auxiliary_loss_mlp": 0.01031906, + "balance_loss_clip": 1.04873013, + "balance_loss_mlp": 1.02024162, + "epoch": 0.711919434841425, + "flos": 14975684006400.0, + "grad_norm": 2.7244681167822127, + "language_loss": 0.76855135, + "learning_rate": 8.090755585214277e-07, + "loss": 0.79009855, + "num_input_tokens_seen": 255442935, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11663818, + "step": 11841, + "time_per_iteration": 2.4230892658233643 + }, + { + "auxiliary_loss_clip": 0.01114494, + "auxiliary_loss_mlp": 0.01033557, + "balance_loss_clip": 1.04110241, + "balance_loss_mlp": 1.02167225, + "epoch": 0.7119795580940929, + "flos": 16508997826560.0, + "grad_norm": 2.3157479086614305, + "language_loss": 0.74774408, + "learning_rate": 8.087626940883994e-07, + "loss": 0.76922452, + "num_input_tokens_seen": 255460925, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11877441, + "step": 11842, + "time_per_iteration": 2.4381651878356934 + }, + { + "auxiliary_loss_clip": 0.01046153, + "auxiliary_loss_mlp": 0.01003209, + "balance_loss_clip": 1.02064359, + "balance_loss_mlp": 1.00156724, + "epoch": 0.7120396813467609, + "flos": 66570736055040.0, + "grad_norm": 0.8023730182445546, + "language_loss": 0.61552632, + "learning_rate": 8.084498748272082e-07, + "loss": 0.63601995, + "num_input_tokens_seen": 255521360, + "router_z_loss_clip": 0.25537109, + "router_z_loss_mlp": 0.0164032, + "step": 11843, + "time_per_iteration": 3.015486001968384 + }, + { + "auxiliary_loss_clip": 0.01114218, + "auxiliary_loss_mlp": 0.01025339, + "balance_loss_clip": 1.04413879, + "balance_loss_mlp": 1.01393652, + "epoch": 0.7120998045994288, + "flos": 26432731624320.0, + "grad_norm": 1.9374010295561173, + "language_loss": 0.8062036, + "learning_rate": 8.081371007497171e-07, + "loss": 0.82759917, + "num_input_tokens_seen": 255541435, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.11407471, + "step": 11844, + "time_per_iteration": 2.4848713874816895 + }, + { + "auxiliary_loss_clip": 0.01114238, + "auxiliary_loss_mlp": 0.01033458, + "balance_loss_clip": 1.04027486, + "balance_loss_mlp": 1.01934338, + "epoch": 0.7121599278520968, + "flos": 16427982700800.0, + "grad_norm": 2.478462514901531, + "language_loss": 0.79450846, + "learning_rate": 8.078243718677873e-07, + "loss": 0.81598538, + "num_input_tokens_seen": 255558505, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.14129639, + "step": 11845, + "time_per_iteration": 2.3936381340026855 + }, + { + "auxiliary_loss_clip": 0.0112401, + "auxiliary_loss_mlp": 0.01033632, + "balance_loss_clip": 1.05335999, + "balance_loss_mlp": 1.02178824, + "epoch": 0.7122200511047647, + "flos": 28949889939840.0, + "grad_norm": 2.1771533731171613, + "language_loss": 0.77252293, + "learning_rate": 8.075116881932762e-07, + "loss": 0.79409933, + "num_input_tokens_seen": 255577815, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.11846924, + "step": 11846, + "time_per_iteration": 2.518644094467163 + }, + { + "auxiliary_loss_clip": 0.01122401, + "auxiliary_loss_mlp": 0.01032607, + "balance_loss_clip": 1.04835629, + "balance_loss_mlp": 1.02038789, + "epoch": 0.7122801743574327, + "flos": 16471866142080.0, + "grad_norm": 1.867846980848346, + "language_loss": 0.58828259, + "learning_rate": 8.071990497380421e-07, + "loss": 0.60983264, + "num_input_tokens_seen": 255595885, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12213135, + "step": 11847, + "time_per_iteration": 3.9727001190185547 + }, + { + "auxiliary_loss_clip": 0.01112364, + "auxiliary_loss_mlp": 0.01033062, + "balance_loss_clip": 1.04380155, + "balance_loss_mlp": 1.02131414, + "epoch": 0.7123402976101008, + "flos": 20631039811200.0, + "grad_norm": 1.6181994394824113, + "language_loss": 0.71481335, + "learning_rate": 8.068864565139395e-07, + "loss": 0.73626769, + "num_input_tokens_seen": 255616750, + "router_z_loss_clip": 0.68505859, + "router_z_loss_mlp": 0.11743164, + "step": 11848, + "time_per_iteration": 2.4972586631774902 + }, + { + "auxiliary_loss_clip": 0.01053502, + "auxiliary_loss_mlp": 0.01003676, + "balance_loss_clip": 1.02743578, + "balance_loss_mlp": 1.00209069, + "epoch": 0.7124004208627687, + "flos": 62325734837760.0, + "grad_norm": 0.8379691172358288, + "language_loss": 0.62964857, + "learning_rate": 8.065739085328211e-07, + "loss": 0.65022039, + "num_input_tokens_seen": 255677900, + "router_z_loss_clip": 0.26074219, + "router_z_loss_mlp": 0.01583862, + "step": 11849, + "time_per_iteration": 3.069200038909912 + }, + { + "auxiliary_loss_clip": 0.01123284, + "auxiliary_loss_mlp": 0.01030087, + "balance_loss_clip": 1.04988384, + "balance_loss_mlp": 1.01836824, + "epoch": 0.7124605441154367, + "flos": 39675975788160.0, + "grad_norm": 1.7360555115447611, + "language_loss": 0.63687342, + "learning_rate": 8.0626140580654e-07, + "loss": 0.65840715, + "num_input_tokens_seen": 255699140, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11712646, + "step": 11850, + "time_per_iteration": 2.675921678543091 + }, + { + "auxiliary_loss_clip": 0.01116803, + "auxiliary_loss_mlp": 0.01027818, + "balance_loss_clip": 1.04450214, + "balance_loss_mlp": 1.0162071, + "epoch": 0.7125206673681046, + "flos": 28181868312960.0, + "grad_norm": 1.6348733531049395, + "language_loss": 0.69888175, + "learning_rate": 8.05948948346946e-07, + "loss": 0.72032797, + "num_input_tokens_seen": 255719640, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11608887, + "step": 11851, + "time_per_iteration": 2.528027296066284 + }, + { + "auxiliary_loss_clip": 0.01117453, + "auxiliary_loss_mlp": 0.01030094, + "balance_loss_clip": 1.04692316, + "balance_loss_mlp": 1.01946044, + "epoch": 0.7125807906207726, + "flos": 26176939896960.0, + "grad_norm": 1.5587268140600474, + "language_loss": 0.8315959, + "learning_rate": 8.056365361658882e-07, + "loss": 0.85307139, + "num_input_tokens_seen": 255740450, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.10638428, + "step": 11852, + "time_per_iteration": 2.468027114868164 + }, + { + "auxiliary_loss_clip": 0.01121719, + "auxiliary_loss_mlp": 0.0103021, + "balance_loss_clip": 1.04665959, + "balance_loss_mlp": 1.01786602, + "epoch": 0.7126409138734405, + "flos": 17157328358400.0, + "grad_norm": 2.9366089531277537, + "language_loss": 0.72669584, + "learning_rate": 8.053241692752126e-07, + "loss": 0.7482152, + "num_input_tokens_seen": 255758070, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.12353516, + "step": 11853, + "time_per_iteration": 2.4350335597991943 + }, + { + "auxiliary_loss_clip": 0.0111156, + "auxiliary_loss_mlp": 0.01025621, + "balance_loss_clip": 1.04255211, + "balance_loss_mlp": 1.01510644, + "epoch": 0.7127010371261085, + "flos": 18769933451520.0, + "grad_norm": 2.085973880296645, + "language_loss": 0.92102712, + "learning_rate": 8.050118476867635e-07, + "loss": 0.94239897, + "num_input_tokens_seen": 255775685, + "router_z_loss_clip": 0.68896484, + "router_z_loss_mlp": 0.10510254, + "step": 11854, + "time_per_iteration": 2.426006555557251 + }, + { + "auxiliary_loss_clip": 0.01114843, + "auxiliary_loss_mlp": 0.01031114, + "balance_loss_clip": 1.04447627, + "balance_loss_mlp": 1.01977754, + "epoch": 0.7127611603787765, + "flos": 20376433232640.0, + "grad_norm": 1.8397840630726163, + "language_loss": 0.79673916, + "learning_rate": 8.046995714123856e-07, + "loss": 0.81819874, + "num_input_tokens_seen": 255794750, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.11340332, + "step": 11855, + "time_per_iteration": 2.473179817199707 + }, + { + "auxiliary_loss_clip": 0.0112007, + "auxiliary_loss_mlp": 0.01034636, + "balance_loss_clip": 1.04780674, + "balance_loss_mlp": 1.02201211, + "epoch": 0.7128212836314445, + "flos": 20449008662400.0, + "grad_norm": 1.703645485341625, + "language_loss": 0.72752869, + "learning_rate": 8.043873404639192e-07, + "loss": 0.74907577, + "num_input_tokens_seen": 255813325, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.12628174, + "step": 11856, + "time_per_iteration": 2.4226877689361572 + }, + { + "auxiliary_loss_clip": 0.01115252, + "auxiliary_loss_mlp": 0.01031295, + "balance_loss_clip": 1.04169989, + "balance_loss_mlp": 1.01961827, + "epoch": 0.7128814068841124, + "flos": 23440834229760.0, + "grad_norm": 1.8588763356962776, + "language_loss": 0.70220125, + "learning_rate": 8.040751548532046e-07, + "loss": 0.72366679, + "num_input_tokens_seen": 255832470, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11682129, + "step": 11857, + "time_per_iteration": 2.494889736175537 + }, + { + "auxiliary_loss_clip": 0.01105056, + "auxiliary_loss_mlp": 0.01029873, + "balance_loss_clip": 1.03566599, + "balance_loss_mlp": 1.01786852, + "epoch": 0.7129415301367804, + "flos": 18222942165120.0, + "grad_norm": 2.156064973576667, + "language_loss": 0.85135198, + "learning_rate": 8.03763014592081e-07, + "loss": 0.87270123, + "num_input_tokens_seen": 255849740, + "router_z_loss_clip": 0.69384766, + "router_z_loss_mlp": 0.12005615, + "step": 11858, + "time_per_iteration": 2.4248733520507812 + }, + { + "auxiliary_loss_clip": 0.01117066, + "auxiliary_loss_mlp": 0.01038452, + "balance_loss_clip": 1.04091215, + "balance_loss_mlp": 1.0242722, + "epoch": 0.7130016533894483, + "flos": 15523896355200.0, + "grad_norm": 1.7046023199747657, + "language_loss": 0.80608761, + "learning_rate": 8.034509196923829e-07, + "loss": 0.8276428, + "num_input_tokens_seen": 255866975, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.14178467, + "step": 11859, + "time_per_iteration": 2.455759286880493 + }, + { + "auxiliary_loss_clip": 0.01118676, + "auxiliary_loss_mlp": 0.01028479, + "balance_loss_clip": 1.04654455, + "balance_loss_mlp": 1.01688552, + "epoch": 0.7130617766421163, + "flos": 57115668960000.0, + "grad_norm": 1.244165349410975, + "language_loss": 0.68924773, + "learning_rate": 8.031388701659456e-07, + "loss": 0.71071923, + "num_input_tokens_seen": 255892915, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11590576, + "step": 11860, + "time_per_iteration": 2.7777507305145264 + }, + { + "auxiliary_loss_clip": 0.01116845, + "auxiliary_loss_mlp": 0.01036256, + "balance_loss_clip": 1.0419991, + "balance_loss_mlp": 1.02160561, + "epoch": 0.7131218998947844, + "flos": 19788252024960.0, + "grad_norm": 1.9967627864374655, + "language_loss": 0.6414488, + "learning_rate": 8.028268660246023e-07, + "loss": 0.66297978, + "num_input_tokens_seen": 255911480, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.14654541, + "step": 11861, + "time_per_iteration": 2.441805839538574 + }, + { + "auxiliary_loss_clip": 0.01125886, + "auxiliary_loss_mlp": 0.01033018, + "balance_loss_clip": 1.05044842, + "balance_loss_mlp": 1.02041149, + "epoch": 0.7131820231474523, + "flos": 26651894457600.0, + "grad_norm": 1.7833760507887015, + "language_loss": 0.67288339, + "learning_rate": 8.025149072801849e-07, + "loss": 0.69447243, + "num_input_tokens_seen": 255931140, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12615967, + "step": 11862, + "time_per_iteration": 4.0121259689331055 + }, + { + "auxiliary_loss_clip": 0.01121428, + "auxiliary_loss_mlp": 0.01035425, + "balance_loss_clip": 1.04863513, + "balance_loss_mlp": 1.02460694, + "epoch": 0.7132421464001203, + "flos": 29205609840000.0, + "grad_norm": 2.903128119393199, + "language_loss": 0.67178869, + "learning_rate": 8.022029939445214e-07, + "loss": 0.69335717, + "num_input_tokens_seen": 255951665, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.10821533, + "step": 11863, + "time_per_iteration": 2.5098109245300293 + }, + { + "auxiliary_loss_clip": 0.01130714, + "auxiliary_loss_mlp": 0.01033003, + "balance_loss_clip": 1.05364966, + "balance_loss_mlp": 1.02014661, + "epoch": 0.7133022696527882, + "flos": 23073611535360.0, + "grad_norm": 1.7480599031210826, + "language_loss": 0.65792644, + "learning_rate": 8.018911260294414e-07, + "loss": 0.67956364, + "num_input_tokens_seen": 255970055, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.12854004, + "step": 11864, + "time_per_iteration": 2.4894471168518066 + }, + { + "auxiliary_loss_clip": 0.01121288, + "auxiliary_loss_mlp": 0.01030748, + "balance_loss_clip": 1.04801345, + "balance_loss_mlp": 1.01831484, + "epoch": 0.7133623929054562, + "flos": 17457111267840.0, + "grad_norm": 1.9053710125044383, + "language_loss": 0.86132532, + "learning_rate": 8.015793035467697e-07, + "loss": 0.88284564, + "num_input_tokens_seen": 255987720, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.12438965, + "step": 11865, + "time_per_iteration": 2.401576280593872 + }, + { + "auxiliary_loss_clip": 0.01117466, + "auxiliary_loss_mlp": 0.01029949, + "balance_loss_clip": 1.0438261, + "balance_loss_mlp": 1.0174979, + "epoch": 0.7134225161581241, + "flos": 19536554448000.0, + "grad_norm": 3.510419127373339, + "language_loss": 0.74991244, + "learning_rate": 8.012675265083304e-07, + "loss": 0.77138662, + "num_input_tokens_seen": 256005490, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.12451172, + "step": 11866, + "time_per_iteration": 2.444033145904541 + }, + { + "auxiliary_loss_clip": 0.01123165, + "auxiliary_loss_mlp": 0.01031434, + "balance_loss_clip": 1.04944932, + "balance_loss_mlp": 1.01847005, + "epoch": 0.7134826394107922, + "flos": 26250089944320.0, + "grad_norm": 3.11452207205906, + "language_loss": 0.70753336, + "learning_rate": 8.009557949259464e-07, + "loss": 0.72907937, + "num_input_tokens_seen": 256026030, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.12963867, + "step": 11867, + "time_per_iteration": 2.5285916328430176 + }, + { + "auxiliary_loss_clip": 0.01108437, + "auxiliary_loss_mlp": 0.01027893, + "balance_loss_clip": 1.0385778, + "balance_loss_mlp": 1.01665199, + "epoch": 0.7135427626634601, + "flos": 15815311395840.0, + "grad_norm": 2.0149180203446915, + "language_loss": 0.7169444, + "learning_rate": 8.006441088114397e-07, + "loss": 0.73830771, + "num_input_tokens_seen": 256043680, + "router_z_loss_clip": 0.69775391, + "router_z_loss_mlp": 0.11242676, + "step": 11868, + "time_per_iteration": 2.4595301151275635 + }, + { + "auxiliary_loss_clip": 0.01120107, + "auxiliary_loss_mlp": 0.01027893, + "balance_loss_clip": 1.04521298, + "balance_loss_mlp": 1.01472688, + "epoch": 0.7136028859161281, + "flos": 18223409041920.0, + "grad_norm": 2.197770732219868, + "language_loss": 0.6595335, + "learning_rate": 8.003324681766286e-07, + "loss": 0.68101352, + "num_input_tokens_seen": 256059705, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.1315918, + "step": 11869, + "time_per_iteration": 2.4318723678588867 + }, + { + "auxiliary_loss_clip": 0.01116101, + "auxiliary_loss_mlp": 0.01027303, + "balance_loss_clip": 1.04332185, + "balance_loss_mlp": 1.01587105, + "epoch": 0.713663009168796, + "flos": 24314827956480.0, + "grad_norm": 2.3367213491619774, + "language_loss": 0.7799328, + "learning_rate": 8.000208730333298e-07, + "loss": 0.80136681, + "num_input_tokens_seen": 256079785, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11431885, + "step": 11870, + "time_per_iteration": 2.465925931930542 + }, + { + "auxiliary_loss_clip": 0.01123856, + "auxiliary_loss_mlp": 0.01028441, + "balance_loss_clip": 1.05184722, + "balance_loss_mlp": 1.01582837, + "epoch": 0.713723132421464, + "flos": 26538488242560.0, + "grad_norm": 1.7199482956905903, + "language_loss": 0.80983627, + "learning_rate": 7.997093233933597e-07, + "loss": 0.83135927, + "num_input_tokens_seen": 256099000, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.12609863, + "step": 11871, + "time_per_iteration": 2.4941816329956055 + }, + { + "auxiliary_loss_clip": 0.0111632, + "auxiliary_loss_mlp": 0.0103319, + "balance_loss_clip": 1.04210198, + "balance_loss_mlp": 1.02079237, + "epoch": 0.7137832556741319, + "flos": 19865675790720.0, + "grad_norm": 2.021407822636526, + "language_loss": 0.7884087, + "learning_rate": 7.993978192685331e-07, + "loss": 0.8099038, + "num_input_tokens_seen": 256117985, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.1239624, + "step": 11872, + "time_per_iteration": 2.419668436050415 + }, + { + "auxiliary_loss_clip": 0.01111048, + "auxiliary_loss_mlp": 0.01028268, + "balance_loss_clip": 1.0376761, + "balance_loss_mlp": 1.01537538, + "epoch": 0.7138433789267999, + "flos": 21688932193920.0, + "grad_norm": 2.973713434660129, + "language_loss": 0.83809555, + "learning_rate": 7.990863606706606e-07, + "loss": 0.85948867, + "num_input_tokens_seen": 256134350, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.12890625, + "step": 11873, + "time_per_iteration": 2.4539389610290527 + }, + { + "auxiliary_loss_clip": 0.01109825, + "auxiliary_loss_mlp": 0.01027863, + "balance_loss_clip": 1.03873587, + "balance_loss_mlp": 1.01708651, + "epoch": 0.713903502179468, + "flos": 17602729004160.0, + "grad_norm": 2.6089808073287597, + "language_loss": 0.8615104, + "learning_rate": 7.987749476115539e-07, + "loss": 0.8828873, + "num_input_tokens_seen": 256150610, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.10772705, + "step": 11874, + "time_per_iteration": 2.4144086837768555 + }, + { + "auxiliary_loss_clip": 0.01108562, + "auxiliary_loss_mlp": 0.01029923, + "balance_loss_clip": 1.03663015, + "balance_loss_mlp": 1.01770449, + "epoch": 0.7139636254321359, + "flos": 18040336398720.0, + "grad_norm": 2.1718996158707067, + "language_loss": 0.83105254, + "learning_rate": 7.984635801030228e-07, + "loss": 0.85243738, + "num_input_tokens_seen": 256168620, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.12231445, + "step": 11875, + "time_per_iteration": 2.4491264820098877 + }, + { + "auxiliary_loss_clip": 0.01122861, + "auxiliary_loss_mlp": 0.01030928, + "balance_loss_clip": 1.04392934, + "balance_loss_mlp": 1.01670074, + "epoch": 0.7140237486848039, + "flos": 23331127115520.0, + "grad_norm": 1.9013660834760089, + "language_loss": 0.69640791, + "learning_rate": 7.981522581568721e-07, + "loss": 0.71794575, + "num_input_tokens_seen": 256186700, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.14233398, + "step": 11876, + "time_per_iteration": 2.411998987197876 + }, + { + "auxiliary_loss_clip": 0.0112649, + "auxiliary_loss_mlp": 0.01035085, + "balance_loss_clip": 1.05218518, + "balance_loss_mlp": 1.02272868, + "epoch": 0.7140838719374718, + "flos": 16837077674880.0, + "grad_norm": 2.128553952330793, + "language_loss": 0.77530026, + "learning_rate": 7.978409817849079e-07, + "loss": 0.79691595, + "num_input_tokens_seen": 256205390, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.12353516, + "step": 11877, + "time_per_iteration": 2.4107131958007812 + }, + { + "auxiliary_loss_clip": 0.01114043, + "auxiliary_loss_mlp": 0.01039498, + "balance_loss_clip": 1.04166329, + "balance_loss_mlp": 1.02661753, + "epoch": 0.7141439951901398, + "flos": 21142012734720.0, + "grad_norm": 2.0702340814812707, + "language_loss": 0.69617736, + "learning_rate": 7.97529750998934e-07, + "loss": 0.71771276, + "num_input_tokens_seen": 256224575, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.12884521, + "step": 11878, + "time_per_iteration": 3.893789291381836 + }, + { + "auxiliary_loss_clip": 0.01114294, + "auxiliary_loss_mlp": 0.01027281, + "balance_loss_clip": 1.04394484, + "balance_loss_mlp": 1.01634312, + "epoch": 0.7142041184428077, + "flos": 24717709877760.0, + "grad_norm": 2.044984985988048, + "language_loss": 0.67641789, + "learning_rate": 7.972185658107535e-07, + "loss": 0.69783366, + "num_input_tokens_seen": 256242130, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.10943604, + "step": 11879, + "time_per_iteration": 2.4524550437927246 + }, + { + "auxiliary_loss_clip": 0.01110524, + "auxiliary_loss_mlp": 0.01030378, + "balance_loss_clip": 1.03911686, + "balance_loss_mlp": 1.01771188, + "epoch": 0.7142642416954758, + "flos": 21908202768000.0, + "grad_norm": 2.3208679784408144, + "language_loss": 0.69560075, + "learning_rate": 7.969074262321646e-07, + "loss": 0.71700978, + "num_input_tokens_seen": 256261920, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.12664795, + "step": 11880, + "time_per_iteration": 2.4696855545043945 + }, + { + "auxiliary_loss_clip": 0.01118102, + "auxiliary_loss_mlp": 0.01031446, + "balance_loss_clip": 1.04394054, + "balance_loss_mlp": 1.01909602, + "epoch": 0.7143243649481437, + "flos": 20805636844800.0, + "grad_norm": 2.7286473661234036, + "language_loss": 0.80502379, + "learning_rate": 7.965963322749674e-07, + "loss": 0.82651931, + "num_input_tokens_seen": 256277970, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.12347412, + "step": 11881, + "time_per_iteration": 2.4347078800201416 + }, + { + "auxiliary_loss_clip": 0.01111776, + "auxiliary_loss_mlp": 0.01031356, + "balance_loss_clip": 1.03934526, + "balance_loss_mlp": 1.02008533, + "epoch": 0.7143844882008117, + "flos": 27235011847680.0, + "grad_norm": 1.6626375969390794, + "language_loss": 0.63743532, + "learning_rate": 7.962852839509579e-07, + "loss": 0.65886664, + "num_input_tokens_seen": 256298205, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.1126709, + "step": 11882, + "time_per_iteration": 2.5278780460357666 + }, + { + "auxiliary_loss_clip": 0.01119184, + "auxiliary_loss_mlp": 0.01032839, + "balance_loss_clip": 1.04431713, + "balance_loss_mlp": 1.02004766, + "epoch": 0.7144446114534796, + "flos": 17929623703680.0, + "grad_norm": 1.780566952622261, + "language_loss": 0.68798018, + "learning_rate": 7.959742812719304e-07, + "loss": 0.70950043, + "num_input_tokens_seen": 256316685, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.12805176, + "step": 11883, + "time_per_iteration": 2.489280939102173 + }, + { + "auxiliary_loss_clip": 0.01120861, + "auxiliary_loss_mlp": 0.01037252, + "balance_loss_clip": 1.04675663, + "balance_loss_mlp": 1.02407944, + "epoch": 0.7145047347061476, + "flos": 20740962407040.0, + "grad_norm": 2.0558319184728067, + "language_loss": 0.77874058, + "learning_rate": 7.956633242496788e-07, + "loss": 0.8003217, + "num_input_tokens_seen": 256334205, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.13171387, + "step": 11884, + "time_per_iteration": 3.9170968532562256 + }, + { + "auxiliary_loss_clip": 0.01128541, + "auxiliary_loss_mlp": 0.0103025, + "balance_loss_clip": 1.04621196, + "balance_loss_mlp": 1.01595712, + "epoch": 0.7145648579588155, + "flos": 21178605715200.0, + "grad_norm": 1.958048650232683, + "language_loss": 0.73793244, + "learning_rate": 7.953524128959954e-07, + "loss": 0.75952041, + "num_input_tokens_seen": 256353340, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.1428833, + "step": 11885, + "time_per_iteration": 2.457411766052246 + }, + { + "auxiliary_loss_clip": 0.01044839, + "auxiliary_loss_mlp": 0.01002461, + "balance_loss_clip": 1.01980686, + "balance_loss_mlp": 1.0008707, + "epoch": 0.7146249812114835, + "flos": 64784539509120.0, + "grad_norm": 0.8985622108403062, + "language_loss": 0.66315216, + "learning_rate": 7.95041547222669e-07, + "loss": 0.68362522, + "num_input_tokens_seen": 256411550, + "router_z_loss_clip": 0.25024414, + "router_z_loss_mlp": 0.01591492, + "step": 11886, + "time_per_iteration": 3.066425323486328 + }, + { + "auxiliary_loss_clip": 0.01111671, + "auxiliary_loss_mlp": 0.01027596, + "balance_loss_clip": 1.03985691, + "balance_loss_mlp": 1.0150609, + "epoch": 0.7146851044641516, + "flos": 18113881495680.0, + "grad_norm": 1.6842431116912078, + "language_loss": 0.75246286, + "learning_rate": 7.947307272414874e-07, + "loss": 0.77385557, + "num_input_tokens_seen": 256430360, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.12524414, + "step": 11887, + "time_per_iteration": 2.4749977588653564 + }, + { + "auxiliary_loss_clip": 0.01107534, + "auxiliary_loss_mlp": 0.01029141, + "balance_loss_clip": 1.03610826, + "balance_loss_mlp": 1.01776195, + "epoch": 0.7147452277168195, + "flos": 19243846517760.0, + "grad_norm": 1.5406002061892965, + "language_loss": 0.71424526, + "learning_rate": 7.944199529642372e-07, + "loss": 0.73561203, + "num_input_tokens_seen": 256449750, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.1138916, + "step": 11888, + "time_per_iteration": 2.484752893447876 + }, + { + "auxiliary_loss_clip": 0.01115762, + "auxiliary_loss_mlp": 0.01035278, + "balance_loss_clip": 1.04033744, + "balance_loss_mlp": 1.02202797, + "epoch": 0.7148053509694875, + "flos": 23764712186880.0, + "grad_norm": 2.9413699817174916, + "language_loss": 0.84191394, + "learning_rate": 7.941092244027041e-07, + "loss": 0.8634243, + "num_input_tokens_seen": 256467330, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.13250732, + "step": 11889, + "time_per_iteration": 3.9503633975982666 + }, + { + "auxiliary_loss_clip": 0.01115463, + "auxiliary_loss_mlp": 0.01027174, + "balance_loss_clip": 1.04289901, + "balance_loss_mlp": 1.01536047, + "epoch": 0.7148654742221554, + "flos": 22485322586880.0, + "grad_norm": 1.865193686822575, + "language_loss": 0.76268351, + "learning_rate": 7.937985415686695e-07, + "loss": 0.78410995, + "num_input_tokens_seen": 256485705, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.1182251, + "step": 11890, + "time_per_iteration": 2.5115585327148438 + }, + { + "auxiliary_loss_clip": 0.01114916, + "auxiliary_loss_mlp": 0.01028344, + "balance_loss_clip": 1.043396, + "balance_loss_mlp": 1.01729965, + "epoch": 0.7149255974748234, + "flos": 24679213476480.0, + "grad_norm": 1.4789324621345568, + "language_loss": 0.7415601, + "learning_rate": 7.934879044739147e-07, + "loss": 0.76299274, + "num_input_tokens_seen": 256504755, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11047363, + "step": 11891, + "time_per_iteration": 2.483250617980957 + }, + { + "auxiliary_loss_clip": 0.01123727, + "auxiliary_loss_mlp": 0.01038301, + "balance_loss_clip": 1.04729891, + "balance_loss_mlp": 1.02628446, + "epoch": 0.7149857207274913, + "flos": 18405583845120.0, + "grad_norm": 1.7536902715240337, + "language_loss": 0.67587358, + "learning_rate": 7.931773131302211e-07, + "loss": 0.69749385, + "num_input_tokens_seen": 256523670, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12017822, + "step": 11892, + "time_per_iteration": 2.404965877532959 + }, + { + "auxiliary_loss_clip": 0.01116117, + "auxiliary_loss_mlp": 0.01033656, + "balance_loss_clip": 1.04031718, + "balance_loss_mlp": 1.02013814, + "epoch": 0.7150458439801594, + "flos": 24969515195520.0, + "grad_norm": 1.98827366208709, + "language_loss": 0.74030477, + "learning_rate": 7.928667675493632e-07, + "loss": 0.76180249, + "num_input_tokens_seen": 256542225, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.13519287, + "step": 11893, + "time_per_iteration": 2.500389337539673 + }, + { + "auxiliary_loss_clip": 0.01113884, + "auxiliary_loss_mlp": 0.01036495, + "balance_loss_clip": 1.03893876, + "balance_loss_mlp": 1.02179646, + "epoch": 0.7151059672328273, + "flos": 16690777580160.0, + "grad_norm": 3.1019360350703487, + "language_loss": 0.66327423, + "learning_rate": 7.925562677431185e-07, + "loss": 0.68477798, + "num_input_tokens_seen": 256560730, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.14715576, + "step": 11894, + "time_per_iteration": 2.4051811695098877 + }, + { + "auxiliary_loss_clip": 0.01121734, + "auxiliary_loss_mlp": 0.01032227, + "balance_loss_clip": 1.04604566, + "balance_loss_mlp": 1.02015734, + "epoch": 0.7151660904854953, + "flos": 27271820309760.0, + "grad_norm": 1.7266871960002417, + "language_loss": 0.77661502, + "learning_rate": 7.922458137232613e-07, + "loss": 0.79815465, + "num_input_tokens_seen": 256580505, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12072754, + "step": 11895, + "time_per_iteration": 2.5784292221069336 + }, + { + "auxiliary_loss_clip": 0.01122281, + "auxiliary_loss_mlp": 0.01030935, + "balance_loss_clip": 1.04566121, + "balance_loss_mlp": 1.01778603, + "epoch": 0.7152262137381632, + "flos": 18332254229760.0, + "grad_norm": 1.967303502231982, + "language_loss": 0.6963473, + "learning_rate": 7.919354055015643e-07, + "loss": 0.71787947, + "num_input_tokens_seen": 256597330, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.13153076, + "step": 11896, + "time_per_iteration": 2.428792715072632 + }, + { + "auxiliary_loss_clip": 0.01116043, + "auxiliary_loss_mlp": 0.01044248, + "balance_loss_clip": 1.03994656, + "balance_loss_mlp": 1.03094411, + "epoch": 0.7152863369908312, + "flos": 21799285752960.0, + "grad_norm": 2.195263180705152, + "language_loss": 0.86659259, + "learning_rate": 7.91625043089798e-07, + "loss": 0.88819546, + "num_input_tokens_seen": 256616030, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.13305664, + "step": 11897, + "time_per_iteration": 2.4572718143463135 + }, + { + "auxiliary_loss_clip": 0.01110184, + "auxiliary_loss_mlp": 0.01032702, + "balance_loss_clip": 1.03964174, + "balance_loss_mlp": 1.02066159, + "epoch": 0.7153464602434991, + "flos": 22158427887360.0, + "grad_norm": 1.9186826360623215, + "language_loss": 0.78224736, + "learning_rate": 7.913147264997304e-07, + "loss": 0.80367625, + "num_input_tokens_seen": 256635570, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.12036133, + "step": 11898, + "time_per_iteration": 2.4290730953216553 + }, + { + "auxiliary_loss_clip": 0.01113088, + "auxiliary_loss_mlp": 0.0103182, + "balance_loss_clip": 1.03832877, + "balance_loss_mlp": 1.01848078, + "epoch": 0.7154065834961671, + "flos": 24716057852160.0, + "grad_norm": 3.196950184756004, + "language_loss": 0.73236585, + "learning_rate": 7.910044557431302e-07, + "loss": 0.75381494, + "num_input_tokens_seen": 256655290, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.13323975, + "step": 11899, + "time_per_iteration": 2.5565555095672607 + }, + { + "auxiliary_loss_clip": 0.01111577, + "auxiliary_loss_mlp": 0.01036773, + "balance_loss_clip": 1.03778863, + "balance_loss_mlp": 1.02404773, + "epoch": 0.7154667067488351, + "flos": 22601494149120.0, + "grad_norm": 2.340071950790018, + "language_loss": 0.75570846, + "learning_rate": 7.906942308317614e-07, + "loss": 0.77719194, + "num_input_tokens_seen": 256671605, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.12731934, + "step": 11900, + "time_per_iteration": 2.428354501724243 + }, + { + "auxiliary_loss_clip": 0.01111708, + "auxiliary_loss_mlp": 0.01028842, + "balance_loss_clip": 1.03826308, + "balance_loss_mlp": 1.0168854, + "epoch": 0.7155268300015031, + "flos": 18771154513920.0, + "grad_norm": 1.880120224197966, + "language_loss": 0.81060022, + "learning_rate": 7.903840517773886e-07, + "loss": 0.83200574, + "num_input_tokens_seen": 256689680, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11956787, + "step": 11901, + "time_per_iteration": 2.6030261516571045 + }, + { + "auxiliary_loss_clip": 0.01125555, + "auxiliary_loss_mlp": 0.01038373, + "balance_loss_clip": 1.04519391, + "balance_loss_mlp": 1.025105, + "epoch": 0.7155869532541711, + "flos": 18296343607680.0, + "grad_norm": 1.8571522778086003, + "language_loss": 0.8142494, + "learning_rate": 7.900739185917744e-07, + "loss": 0.83588874, + "num_input_tokens_seen": 256707760, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.13262939, + "step": 11902, + "time_per_iteration": 2.43001389503479 + }, + { + "auxiliary_loss_clip": 0.01119446, + "auxiliary_loss_mlp": 0.01027926, + "balance_loss_clip": 1.04597127, + "balance_loss_mlp": 1.01642847, + "epoch": 0.715647076506839, + "flos": 11980805783040.0, + "grad_norm": 3.2700938655795415, + "language_loss": 0.67505765, + "learning_rate": 7.897638312866785e-07, + "loss": 0.69653141, + "num_input_tokens_seen": 256724150, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.1149292, + "step": 11903, + "time_per_iteration": 2.407081365585327 + }, + { + "auxiliary_loss_clip": 0.01114164, + "auxiliary_loss_mlp": 0.01031921, + "balance_loss_clip": 1.04264736, + "balance_loss_mlp": 1.02010751, + "epoch": 0.715707199759507, + "flos": 18951641377920.0, + "grad_norm": 1.7983688961046023, + "language_loss": 0.7622664, + "learning_rate": 7.894537898738589e-07, + "loss": 0.78372723, + "num_input_tokens_seen": 256742780, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.1182251, + "step": 11904, + "time_per_iteration": 2.4407620429992676 + }, + { + "auxiliary_loss_clip": 0.01115793, + "auxiliary_loss_mlp": 0.01039262, + "balance_loss_clip": 1.0424962, + "balance_loss_mlp": 1.02446842, + "epoch": 0.7157673230121749, + "flos": 15304410299520.0, + "grad_norm": 1.9210871199751556, + "language_loss": 0.72304618, + "learning_rate": 7.891437943650727e-07, + "loss": 0.74459672, + "num_input_tokens_seen": 256761355, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.14794922, + "step": 11905, + "time_per_iteration": 3.894883632659912 + }, + { + "auxiliary_loss_clip": 0.01121135, + "auxiliary_loss_mlp": 0.01032393, + "balance_loss_clip": 1.04784107, + "balance_loss_mlp": 1.02075219, + "epoch": 0.715827446264843, + "flos": 23221850964480.0, + "grad_norm": 1.7675014015611588, + "language_loss": 0.77621996, + "learning_rate": 7.88833844772076e-07, + "loss": 0.79775524, + "num_input_tokens_seen": 256781335, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11639404, + "step": 11906, + "time_per_iteration": 2.4549500942230225 + }, + { + "auxiliary_loss_clip": 0.01055258, + "auxiliary_loss_mlp": 0.0100861, + "balance_loss_clip": 1.030195, + "balance_loss_mlp": 1.00708413, + "epoch": 0.7158875695175109, + "flos": 60975421833600.0, + "grad_norm": 0.745665143789919, + "language_loss": 0.55261737, + "learning_rate": 7.885239411066205e-07, + "loss": 0.57325608, + "num_input_tokens_seen": 256838890, + "router_z_loss_clip": 0.25073242, + "router_z_loss_mlp": 0.01525879, + "step": 11907, + "time_per_iteration": 2.9834489822387695 + }, + { + "auxiliary_loss_clip": 0.01120735, + "auxiliary_loss_mlp": 0.01034125, + "balance_loss_clip": 1.04479003, + "balance_loss_mlp": 1.02178645, + "epoch": 0.7159476927701789, + "flos": 17128780024320.0, + "grad_norm": 6.006131314194357, + "language_loss": 0.69748944, + "learning_rate": 7.882140833804593e-07, + "loss": 0.71903801, + "num_input_tokens_seen": 256858145, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.12347412, + "step": 11908, + "time_per_iteration": 2.4275643825531006 + }, + { + "auxiliary_loss_clip": 0.011188, + "auxiliary_loss_mlp": 0.01028931, + "balance_loss_clip": 1.04596925, + "balance_loss_mlp": 1.01628888, + "epoch": 0.7160078160228468, + "flos": 22490601886080.0, + "grad_norm": 1.8803838195424432, + "language_loss": 0.71502882, + "learning_rate": 7.879042716053415e-07, + "loss": 0.7365061, + "num_input_tokens_seen": 256878545, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.12646484, + "step": 11909, + "time_per_iteration": 2.4642229080200195 + }, + { + "auxiliary_loss_clip": 0.01122116, + "auxiliary_loss_mlp": 0.01031516, + "balance_loss_clip": 1.04787707, + "balance_loss_mlp": 1.01970816, + "epoch": 0.7160679392755148, + "flos": 30590935626240.0, + "grad_norm": 1.731223580341429, + "language_loss": 0.75437343, + "learning_rate": 7.875945057930144e-07, + "loss": 0.77590978, + "num_input_tokens_seen": 256899920, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.11816406, + "step": 11910, + "time_per_iteration": 2.5337657928466797 + }, + { + "auxiliary_loss_clip": 0.0112526, + "auxiliary_loss_mlp": 0.01032959, + "balance_loss_clip": 1.05256748, + "balance_loss_mlp": 1.02211142, + "epoch": 0.7161280625281827, + "flos": 21323648833920.0, + "grad_norm": 1.5377005395273657, + "language_loss": 0.76766461, + "learning_rate": 7.872847859552251e-07, + "loss": 0.7892468, + "num_input_tokens_seen": 256918460, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.10858154, + "step": 11911, + "time_per_iteration": 2.4632010459899902 + }, + { + "auxiliary_loss_clip": 0.01116114, + "auxiliary_loss_mlp": 0.01034137, + "balance_loss_clip": 1.04245579, + "balance_loss_mlp": 1.02095246, + "epoch": 0.7161881857808508, + "flos": 61860078921600.0, + "grad_norm": 1.8247175777316043, + "language_loss": 0.58417261, + "learning_rate": 7.869751121037192e-07, + "loss": 0.6056751, + "num_input_tokens_seen": 256942015, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.13195801, + "step": 11912, + "time_per_iteration": 2.847970962524414 + }, + { + "auxiliary_loss_clip": 0.01122711, + "auxiliary_loss_mlp": 0.01032837, + "balance_loss_clip": 1.04932761, + "balance_loss_mlp": 1.01994419, + "epoch": 0.7162483090335187, + "flos": 20812101292800.0, + "grad_norm": 1.623411177486482, + "language_loss": 0.77802956, + "learning_rate": 7.866654842502376e-07, + "loss": 0.7995851, + "num_input_tokens_seen": 256961065, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.12896729, + "step": 11913, + "time_per_iteration": 2.450467824935913 + }, + { + "auxiliary_loss_clip": 0.01108257, + "auxiliary_loss_mlp": 0.01029273, + "balance_loss_clip": 1.03774548, + "balance_loss_mlp": 1.01852059, + "epoch": 0.7163084322861867, + "flos": 24097532630400.0, + "grad_norm": 1.8399762373801267, + "language_loss": 0.74176514, + "learning_rate": 7.863559024065234e-07, + "loss": 0.76314044, + "num_input_tokens_seen": 256982165, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.10760498, + "step": 11914, + "time_per_iteration": 2.490176200866699 + }, + { + "auxiliary_loss_clip": 0.01113437, + "auxiliary_loss_mlp": 0.01031292, + "balance_loss_clip": 1.04223311, + "balance_loss_mlp": 1.01968122, + "epoch": 0.7163685555388547, + "flos": 20080888128000.0, + "grad_norm": 3.162645367620132, + "language_loss": 0.74125284, + "learning_rate": 7.860463665843143e-07, + "loss": 0.76270014, + "num_input_tokens_seen": 256999825, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.11602783, + "step": 11915, + "time_per_iteration": 2.455652952194214 + }, + { + "auxiliary_loss_clip": 0.01118052, + "auxiliary_loss_mlp": 0.01033254, + "balance_loss_clip": 1.04437566, + "balance_loss_mlp": 1.02070713, + "epoch": 0.7164286787915226, + "flos": 17456967613440.0, + "grad_norm": 1.8499754103894823, + "language_loss": 0.80976534, + "learning_rate": 7.85736876795349e-07, + "loss": 0.83127844, + "num_input_tokens_seen": 257017450, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.12554932, + "step": 11916, + "time_per_iteration": 2.40169095993042 + }, + { + "auxiliary_loss_clip": 0.01119018, + "auxiliary_loss_mlp": 0.01036465, + "balance_loss_clip": 1.04415715, + "balance_loss_mlp": 1.02336383, + "epoch": 0.7164888020441906, + "flos": 19718908819200.0, + "grad_norm": 1.821534299812322, + "language_loss": 0.68527234, + "learning_rate": 7.854274330513626e-07, + "loss": 0.70682716, + "num_input_tokens_seen": 257035465, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.13116455, + "step": 11917, + "time_per_iteration": 2.4261202812194824 + }, + { + "auxiliary_loss_clip": 0.01116333, + "auxiliary_loss_mlp": 0.01030407, + "balance_loss_clip": 1.04384446, + "balance_loss_mlp": 1.01788974, + "epoch": 0.7165489252968585, + "flos": 21470523546240.0, + "grad_norm": 2.7879171246358965, + "language_loss": 0.75859976, + "learning_rate": 7.851180353640896e-07, + "loss": 0.78006715, + "num_input_tokens_seen": 257053750, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.12518311, + "step": 11918, + "time_per_iteration": 2.4188387393951416 + }, + { + "auxiliary_loss_clip": 0.01051196, + "auxiliary_loss_mlp": 0.01005345, + "balance_loss_clip": 1.02496982, + "balance_loss_mlp": 1.00368476, + "epoch": 0.7166090485495266, + "flos": 69928060464000.0, + "grad_norm": 0.6306203050658943, + "language_loss": 0.53926367, + "learning_rate": 7.848086837452639e-07, + "loss": 0.55982912, + "num_input_tokens_seen": 257121215, + "router_z_loss_clip": 0.26269531, + "router_z_loss_mlp": 0.01660156, + "step": 11919, + "time_per_iteration": 3.0885777473449707 + }, + { + "auxiliary_loss_clip": 0.01118598, + "auxiliary_loss_mlp": 0.0102899, + "balance_loss_clip": 1.04500794, + "balance_loss_mlp": 1.01720572, + "epoch": 0.7166691718021945, + "flos": 27343892949120.0, + "grad_norm": 1.6215279295148965, + "language_loss": 0.69403279, + "learning_rate": 7.844993782066132e-07, + "loss": 0.71550864, + "num_input_tokens_seen": 257143370, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11791992, + "step": 11920, + "time_per_iteration": 2.47363543510437 + }, + { + "auxiliary_loss_clip": 0.01116386, + "auxiliary_loss_mlp": 0.0103295, + "balance_loss_clip": 1.04381156, + "balance_loss_mlp": 1.02049828, + "epoch": 0.7167292950548625, + "flos": 30408868563840.0, + "grad_norm": 1.7966351314595055, + "language_loss": 0.74827564, + "learning_rate": 7.841901187598678e-07, + "loss": 0.76976895, + "num_input_tokens_seen": 257162160, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.12463379, + "step": 11921, + "time_per_iteration": 2.478795051574707 + }, + { + "auxiliary_loss_clip": 0.01125455, + "auxiliary_loss_mlp": 0.0103609, + "balance_loss_clip": 1.04836762, + "balance_loss_mlp": 1.02131414, + "epoch": 0.7167894183075304, + "flos": 14571257800320.0, + "grad_norm": 2.2853324635513244, + "language_loss": 0.75766289, + "learning_rate": 7.83880905416755e-07, + "loss": 0.77927834, + "num_input_tokens_seen": 257179300, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.14776611, + "step": 11922, + "time_per_iteration": 3.8129372596740723 + }, + { + "auxiliary_loss_clip": 0.01058438, + "auxiliary_loss_mlp": 0.01006833, + "balance_loss_clip": 1.03106427, + "balance_loss_mlp": 1.00536919, + "epoch": 0.7168495415601984, + "flos": 64110674407680.0, + "grad_norm": 0.7522996626225673, + "language_loss": 0.55103105, + "learning_rate": 7.83571738189001e-07, + "loss": 0.57168376, + "num_input_tokens_seen": 257235470, + "router_z_loss_clip": 0.27441406, + "router_z_loss_mlp": 0.01464844, + "step": 11923, + "time_per_iteration": 2.9050791263580322 + }, + { + "auxiliary_loss_clip": 0.01120593, + "auxiliary_loss_mlp": 0.01039221, + "balance_loss_clip": 1.04399645, + "balance_loss_mlp": 1.02617955, + "epoch": 0.7169096648128663, + "flos": 24681440119680.0, + "grad_norm": 1.638979523249326, + "language_loss": 0.76973313, + "learning_rate": 7.832626170883279e-07, + "loss": 0.79133123, + "num_input_tokens_seen": 257255850, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.13037109, + "step": 11924, + "time_per_iteration": 2.439707040786743 + }, + { + "auxiliary_loss_clip": 0.01117224, + "auxiliary_loss_mlp": 0.01032518, + "balance_loss_clip": 1.04534495, + "balance_loss_mlp": 1.02123523, + "epoch": 0.7169697880655344, + "flos": 20667525050880.0, + "grad_norm": 2.071900685347693, + "language_loss": 0.68534946, + "learning_rate": 7.829535421264588e-07, + "loss": 0.70684689, + "num_input_tokens_seen": 257275425, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11291504, + "step": 11925, + "time_per_iteration": 2.4694502353668213 + }, + { + "auxiliary_loss_clip": 0.01113056, + "auxiliary_loss_mlp": 0.01027536, + "balance_loss_clip": 1.0439136, + "balance_loss_mlp": 1.01628828, + "epoch": 0.7170299113182023, + "flos": 21032700670080.0, + "grad_norm": 1.577035150020675, + "language_loss": 0.77124351, + "learning_rate": 7.826445133151133e-07, + "loss": 0.79264939, + "num_input_tokens_seen": 257295740, + "router_z_loss_clip": 0.69091797, + "router_z_loss_mlp": 0.11254883, + "step": 11926, + "time_per_iteration": 2.4815125465393066 + }, + { + "auxiliary_loss_clip": 0.01130267, + "auxiliary_loss_mlp": 0.01031154, + "balance_loss_clip": 1.05107617, + "balance_loss_mlp": 1.01856542, + "epoch": 0.7170900345708703, + "flos": 22893304239360.0, + "grad_norm": 2.0491342798296817, + "language_loss": 0.77077854, + "learning_rate": 7.823355306660093e-07, + "loss": 0.79239273, + "num_input_tokens_seen": 257315970, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.12585449, + "step": 11927, + "time_per_iteration": 2.4595773220062256 + }, + { + "auxiliary_loss_clip": 0.01112785, + "auxiliary_loss_mlp": 0.0102866, + "balance_loss_clip": 1.04155159, + "balance_loss_mlp": 1.01627982, + "epoch": 0.7171501578235383, + "flos": 15518688883200.0, + "grad_norm": 1.963345735229703, + "language_loss": 0.69457912, + "learning_rate": 7.820265941908642e-07, + "loss": 0.71599364, + "num_input_tokens_seen": 257334230, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.12384033, + "step": 11928, + "time_per_iteration": 3.990954875946045 + }, + { + "auxiliary_loss_clip": 0.01109753, + "auxiliary_loss_mlp": 0.01035024, + "balance_loss_clip": 1.03963757, + "balance_loss_mlp": 1.02229857, + "epoch": 0.7172102810762062, + "flos": 26104292640000.0, + "grad_norm": 1.9248723363303022, + "language_loss": 0.65653872, + "learning_rate": 7.817177039013931e-07, + "loss": 0.6779865, + "num_input_tokens_seen": 257352145, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.12738037, + "step": 11929, + "time_per_iteration": 2.4964399337768555 + }, + { + "auxiliary_loss_clip": 0.01113846, + "auxiliary_loss_mlp": 0.01031647, + "balance_loss_clip": 1.04018259, + "balance_loss_mlp": 1.01843226, + "epoch": 0.7172704043288742, + "flos": 21506649649920.0, + "grad_norm": 2.098613866204856, + "language_loss": 0.69846964, + "learning_rate": 7.81408859809308e-07, + "loss": 0.71992457, + "num_input_tokens_seen": 257371460, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.13220215, + "step": 11930, + "time_per_iteration": 2.4661290645599365 + }, + { + "auxiliary_loss_clip": 0.0111396, + "auxiliary_loss_mlp": 0.01029459, + "balance_loss_clip": 1.04046702, + "balance_loss_mlp": 1.01663256, + "epoch": 0.7173305275815421, + "flos": 18770939032320.0, + "grad_norm": 2.1120157606429797, + "language_loss": 0.80773652, + "learning_rate": 7.811000619263219e-07, + "loss": 0.8291707, + "num_input_tokens_seen": 257390800, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.12835693, + "step": 11931, + "time_per_iteration": 2.4687671661376953 + }, + { + "auxiliary_loss_clip": 0.01118667, + "auxiliary_loss_mlp": 0.01032957, + "balance_loss_clip": 1.04482889, + "balance_loss_mlp": 1.02134609, + "epoch": 0.7173906508342102, + "flos": 16179876483840.0, + "grad_norm": 2.287503610218735, + "language_loss": 0.78163749, + "learning_rate": 7.80791310264143e-07, + "loss": 0.80315375, + "num_input_tokens_seen": 257407495, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11608887, + "step": 11932, + "time_per_iteration": 2.3878815174102783 + }, + { + "auxiliary_loss_clip": 0.01113868, + "auxiliary_loss_mlp": 0.01030243, + "balance_loss_clip": 1.04207754, + "balance_loss_mlp": 1.0176245, + "epoch": 0.7174507740868781, + "flos": 26613864933120.0, + "grad_norm": 1.483563962316415, + "language_loss": 0.75336635, + "learning_rate": 7.804826048344803e-07, + "loss": 0.77480739, + "num_input_tokens_seen": 257429675, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.12615967, + "step": 11933, + "time_per_iteration": 3.8809616565704346 + }, + { + "auxiliary_loss_clip": 0.01128768, + "auxiliary_loss_mlp": 0.01035634, + "balance_loss_clip": 1.04864907, + "balance_loss_mlp": 1.02011275, + "epoch": 0.7175108973395461, + "flos": 18432911116800.0, + "grad_norm": 2.3709444642867594, + "language_loss": 0.69772434, + "learning_rate": 7.801739456490388e-07, + "loss": 0.71936834, + "num_input_tokens_seen": 257442765, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.1550293, + "step": 11934, + "time_per_iteration": 2.381544828414917 + }, + { + "auxiliary_loss_clip": 0.01136964, + "auxiliary_loss_mlp": 0.01032703, + "balance_loss_clip": 1.05993557, + "balance_loss_mlp": 1.0208174, + "epoch": 0.717571020592214, + "flos": 23914962777600.0, + "grad_norm": 2.387540581614856, + "language_loss": 0.86238837, + "learning_rate": 7.798653327195237e-07, + "loss": 0.88408506, + "num_input_tokens_seen": 257459310, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.11889648, + "step": 11935, + "time_per_iteration": 2.4611294269561768 + }, + { + "auxiliary_loss_clip": 0.01123511, + "auxiliary_loss_mlp": 0.01030904, + "balance_loss_clip": 1.0497576, + "balance_loss_mlp": 1.01832187, + "epoch": 0.717631143844882, + "flos": 38256930109440.0, + "grad_norm": 1.626842835557262, + "language_loss": 0.74081814, + "learning_rate": 7.795567660576388e-07, + "loss": 0.76236236, + "num_input_tokens_seen": 257484750, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.12585449, + "step": 11936, + "time_per_iteration": 2.6759395599365234 + }, + { + "auxiliary_loss_clip": 0.01041647, + "auxiliary_loss_mlp": 0.01008904, + "balance_loss_clip": 1.01674485, + "balance_loss_mlp": 1.0070889, + "epoch": 0.7176912670975499, + "flos": 65515896328320.0, + "grad_norm": 0.7598127220246195, + "language_loss": 0.55904508, + "learning_rate": 7.79248245675082e-07, + "loss": 0.57955062, + "num_input_tokens_seen": 257543110, + "router_z_loss_clip": 0.2487793, + "router_z_loss_mlp": 0.01815796, + "step": 11937, + "time_per_iteration": 3.040391683578491 + }, + { + "auxiliary_loss_clip": 0.01124264, + "auxiliary_loss_mlp": 0.01038565, + "balance_loss_clip": 1.04631233, + "balance_loss_mlp": 1.02412248, + "epoch": 0.717751390350218, + "flos": 31281066610560.0, + "grad_norm": 2.495071862322591, + "language_loss": 0.55049479, + "learning_rate": 7.789397715835542e-07, + "loss": 0.57212305, + "num_input_tokens_seen": 257567410, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.14447021, + "step": 11938, + "time_per_iteration": 2.533177614212036 + }, + { + "auxiliary_loss_clip": 0.01112136, + "auxiliary_loss_mlp": 0.0103364, + "balance_loss_clip": 1.04038572, + "balance_loss_mlp": 1.02054501, + "epoch": 0.7178115136028859, + "flos": 19859031774720.0, + "grad_norm": 1.792725273290596, + "language_loss": 0.76557511, + "learning_rate": 7.786313437947527e-07, + "loss": 0.78703284, + "num_input_tokens_seen": 257586270, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.13110352, + "step": 11939, + "time_per_iteration": 2.3983588218688965 + }, + { + "auxiliary_loss_clip": 0.01050086, + "auxiliary_loss_mlp": 0.01005496, + "balance_loss_clip": 1.02418709, + "balance_loss_mlp": 1.00382698, + "epoch": 0.7178716368555539, + "flos": 64348655967360.0, + "grad_norm": 0.7556555708069546, + "language_loss": 0.61406767, + "learning_rate": 7.783229623203738e-07, + "loss": 0.63462353, + "num_input_tokens_seen": 257647415, + "router_z_loss_clip": 0.25927734, + "router_z_loss_mlp": 0.0166626, + "step": 11940, + "time_per_iteration": 3.004580497741699 + }, + { + "auxiliary_loss_clip": 0.01112598, + "auxiliary_loss_mlp": 0.01026856, + "balance_loss_clip": 1.04177558, + "balance_loss_mlp": 1.01586521, + "epoch": 0.7179317601082219, + "flos": 26762607152640.0, + "grad_norm": 1.695718911961325, + "language_loss": 0.59517372, + "learning_rate": 7.780146271721097e-07, + "loss": 0.61656821, + "num_input_tokens_seen": 257669795, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.10992432, + "step": 11941, + "time_per_iteration": 2.521822690963745 + }, + { + "auxiliary_loss_clip": 0.01112765, + "auxiliary_loss_mlp": 0.01034533, + "balance_loss_clip": 1.04043353, + "balance_loss_mlp": 1.02083588, + "epoch": 0.7179918833608898, + "flos": 23513804709120.0, + "grad_norm": 2.7137792673147536, + "language_loss": 0.79548705, + "learning_rate": 7.777063383616543e-07, + "loss": 0.81696004, + "num_input_tokens_seen": 257687415, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.137146, + "step": 11942, + "time_per_iteration": 2.4817559719085693 + }, + { + "auxiliary_loss_clip": 0.01120937, + "auxiliary_loss_mlp": 0.01032569, + "balance_loss_clip": 1.04711366, + "balance_loss_mlp": 1.02085102, + "epoch": 0.7180520066135578, + "flos": 17165588486400.0, + "grad_norm": 2.9314138170299433, + "language_loss": 0.66273373, + "learning_rate": 7.773980959006968e-07, + "loss": 0.68426883, + "num_input_tokens_seen": 257706215, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11712646, + "step": 11943, + "time_per_iteration": 2.4044556617736816 + }, + { + "auxiliary_loss_clip": 0.01123225, + "auxiliary_loss_mlp": 0.01028236, + "balance_loss_clip": 1.0513761, + "balance_loss_mlp": 1.01595783, + "epoch": 0.7181121298662257, + "flos": 17566638814080.0, + "grad_norm": 1.967440020556372, + "language_loss": 0.79092085, + "learning_rate": 7.770898998009254e-07, + "loss": 0.81243551, + "num_input_tokens_seen": 257724740, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.1229248, + "step": 11944, + "time_per_iteration": 2.430030584335327 + }, + { + "auxiliary_loss_clip": 0.01119057, + "auxiliary_loss_mlp": 0.01045879, + "balance_loss_clip": 1.0426842, + "balance_loss_mlp": 1.03072762, + "epoch": 0.7181722531188938, + "flos": 11947660508160.0, + "grad_norm": 2.10925192821395, + "language_loss": 0.62998652, + "learning_rate": 7.767817500740277e-07, + "loss": 0.65163589, + "num_input_tokens_seen": 257742060, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.15142822, + "step": 11945, + "time_per_iteration": 2.3916966915130615 + }, + { + "auxiliary_loss_clip": 0.0104175, + "auxiliary_loss_mlp": 0.01010694, + "balance_loss_clip": 1.01619494, + "balance_loss_mlp": 1.00899827, + "epoch": 0.7182323763715617, + "flos": 65503649790720.0, + "grad_norm": 0.7072313633380897, + "language_loss": 0.5106346, + "learning_rate": 7.76473646731689e-07, + "loss": 0.53115904, + "num_input_tokens_seen": 257802250, + "router_z_loss_clip": 0.25634766, + "router_z_loss_mlp": 0.01699829, + "step": 11946, + "time_per_iteration": 2.9802093505859375 + }, + { + "auxiliary_loss_clip": 0.01124431, + "auxiliary_loss_mlp": 0.01035514, + "balance_loss_clip": 1.04934812, + "balance_loss_mlp": 1.02131605, + "epoch": 0.7182924996242297, + "flos": 20630932070400.0, + "grad_norm": 3.0850844358300438, + "language_loss": 0.74807632, + "learning_rate": 7.761655897855925e-07, + "loss": 0.76967579, + "num_input_tokens_seen": 257821155, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.14196777, + "step": 11947, + "time_per_iteration": 2.408759832382202 + }, + { + "auxiliary_loss_clip": 0.01120361, + "auxiliary_loss_mlp": 0.01026342, + "balance_loss_clip": 1.0454154, + "balance_loss_mlp": 1.01403391, + "epoch": 0.7183526228768976, + "flos": 16216433550720.0, + "grad_norm": 1.518354247749342, + "language_loss": 0.72793841, + "learning_rate": 7.758575792474187e-07, + "loss": 0.7494055, + "num_input_tokens_seen": 257839905, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12304688, + "step": 11948, + "time_per_iteration": 2.511596918106079 + }, + { + "auxiliary_loss_clip": 0.01123411, + "auxiliary_loss_mlp": 0.0103825, + "balance_loss_clip": 1.04696357, + "balance_loss_mlp": 1.02461243, + "epoch": 0.7184127461295656, + "flos": 22232655342720.0, + "grad_norm": 1.4527895757741651, + "language_loss": 0.71386075, + "learning_rate": 7.755496151288483e-07, + "loss": 0.73547733, + "num_input_tokens_seen": 257860055, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.13635254, + "step": 11949, + "time_per_iteration": 3.9115493297576904 + }, + { + "auxiliary_loss_clip": 0.01119151, + "auxiliary_loss_mlp": 0.01026523, + "balance_loss_clip": 1.04803789, + "balance_loss_mlp": 1.01521564, + "epoch": 0.7184728693822335, + "flos": 27344503480320.0, + "grad_norm": 1.852210828384075, + "language_loss": 0.76561528, + "learning_rate": 7.752416974415598e-07, + "loss": 0.787072, + "num_input_tokens_seen": 257879315, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.11315918, + "step": 11950, + "time_per_iteration": 2.5414719581604004 + }, + { + "auxiliary_loss_clip": 0.01120488, + "auxiliary_loss_mlp": 0.01033899, + "balance_loss_clip": 1.04666591, + "balance_loss_mlp": 1.02088189, + "epoch": 0.7185329926349016, + "flos": 16508530949760.0, + "grad_norm": 2.616431844933155, + "language_loss": 0.67382777, + "learning_rate": 7.749338261972282e-07, + "loss": 0.69537163, + "num_input_tokens_seen": 257896570, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.13012695, + "step": 11951, + "time_per_iteration": 2.4533700942993164 + }, + { + "auxiliary_loss_clip": 0.01125245, + "auxiliary_loss_mlp": 0.01032559, + "balance_loss_clip": 1.04905772, + "balance_loss_mlp": 1.01915359, + "epoch": 0.7185931158875695, + "flos": 23951052967680.0, + "grad_norm": 1.9272391299752698, + "language_loss": 0.78204525, + "learning_rate": 7.746260014075286e-07, + "loss": 0.80362326, + "num_input_tokens_seen": 257916855, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.1340332, + "step": 11952, + "time_per_iteration": 2.4780242443084717 + }, + { + "auxiliary_loss_clip": 0.01122983, + "auxiliary_loss_mlp": 0.01035453, + "balance_loss_clip": 1.04672122, + "balance_loss_mlp": 1.02226853, + "epoch": 0.7186532391402375, + "flos": 26542007775360.0, + "grad_norm": 2.0014977411630204, + "language_loss": 0.75032258, + "learning_rate": 7.743182230841352e-07, + "loss": 0.77190697, + "num_input_tokens_seen": 257937140, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.13183594, + "step": 11953, + "time_per_iteration": 2.529201030731201 + }, + { + "auxiliary_loss_clip": 0.01115958, + "auxiliary_loss_mlp": 0.0102985, + "balance_loss_clip": 1.04263234, + "balance_loss_mlp": 1.01767886, + "epoch": 0.7187133623929055, + "flos": 22383049587840.0, + "grad_norm": 2.0999032526932786, + "language_loss": 0.72918904, + "learning_rate": 7.740104912387164e-07, + "loss": 0.75064713, + "num_input_tokens_seen": 257956785, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.1217041, + "step": 11954, + "time_per_iteration": 2.451078414916992 + }, + { + "auxiliary_loss_clip": 0.01124363, + "auxiliary_loss_mlp": 0.01036468, + "balance_loss_clip": 1.0490135, + "balance_loss_mlp": 1.02376652, + "epoch": 0.7187734856455734, + "flos": 15779580341760.0, + "grad_norm": 1.8920120838105188, + "language_loss": 0.7422905, + "learning_rate": 7.737028058829425e-07, + "loss": 0.76389885, + "num_input_tokens_seen": 257975455, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.12701416, + "step": 11955, + "time_per_iteration": 2.475914478302002 + }, + { + "auxiliary_loss_clip": 0.01125237, + "auxiliary_loss_mlp": 0.01027624, + "balance_loss_clip": 1.05027723, + "balance_loss_mlp": 1.01552987, + "epoch": 0.7188336088982414, + "flos": 31759612531200.0, + "grad_norm": 1.5480129892739463, + "language_loss": 0.73763663, + "learning_rate": 7.733951670284817e-07, + "loss": 0.75916523, + "num_input_tokens_seen": 257996850, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.12091064, + "step": 11956, + "time_per_iteration": 2.5453882217407227 + }, + { + "auxiliary_loss_clip": 0.01117664, + "auxiliary_loss_mlp": 0.01032191, + "balance_loss_clip": 1.04322314, + "balance_loss_mlp": 1.0199424, + "epoch": 0.7188937321509093, + "flos": 21465208333440.0, + "grad_norm": 1.76865002383632, + "language_loss": 0.70829237, + "learning_rate": 7.730875746869987e-07, + "loss": 0.72979093, + "num_input_tokens_seen": 258016145, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.12255859, + "step": 11957, + "time_per_iteration": 2.4794414043426514 + }, + { + "auxiliary_loss_clip": 0.0111953, + "auxiliary_loss_mlp": 0.01035884, + "balance_loss_clip": 1.04373169, + "balance_loss_mlp": 1.02255666, + "epoch": 0.7189538554035774, + "flos": 27271497087360.0, + "grad_norm": 1.774727693078068, + "language_loss": 0.73063231, + "learning_rate": 7.727800288701582e-07, + "loss": 0.75218642, + "num_input_tokens_seen": 258035420, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.13342285, + "step": 11958, + "time_per_iteration": 2.5288915634155273 + }, + { + "auxiliary_loss_clip": 0.01123959, + "auxiliary_loss_mlp": 0.01033467, + "balance_loss_clip": 1.05064225, + "balance_loss_mlp": 1.02123022, + "epoch": 0.7190139786562453, + "flos": 21580625710080.0, + "grad_norm": 1.8304176661972802, + "language_loss": 0.84066147, + "learning_rate": 7.724725295896215e-07, + "loss": 0.86223578, + "num_input_tokens_seen": 258053520, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.12225342, + "step": 11959, + "time_per_iteration": 2.5022895336151123 + }, + { + "auxiliary_loss_clip": 0.01127138, + "auxiliary_loss_mlp": 0.01029894, + "balance_loss_clip": 1.05049515, + "balance_loss_mlp": 1.01672101, + "epoch": 0.7190741019089133, + "flos": 26721237663360.0, + "grad_norm": 1.7337255913738905, + "language_loss": 0.81664717, + "learning_rate": 7.7216507685705e-07, + "loss": 0.8382175, + "num_input_tokens_seen": 258073020, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.1317749, + "step": 11960, + "time_per_iteration": 2.4762680530548096 + }, + { + "auxiliary_loss_clip": 0.01115157, + "auxiliary_loss_mlp": 0.01039199, + "balance_loss_clip": 1.04466617, + "balance_loss_mlp": 1.02653289, + "epoch": 0.7191342251615812, + "flos": 26104759516800.0, + "grad_norm": 2.0185553459422, + "language_loss": 0.77797282, + "learning_rate": 7.718576706841013e-07, + "loss": 0.79951638, + "num_input_tokens_seen": 258093155, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.12658691, + "step": 11961, + "time_per_iteration": 2.4927897453308105 + }, + { + "auxiliary_loss_clip": 0.0111146, + "auxiliary_loss_mlp": 0.0103149, + "balance_loss_clip": 1.04138947, + "balance_loss_mlp": 1.02031422, + "epoch": 0.7191943484142492, + "flos": 22967028904320.0, + "grad_norm": 1.5084811913933378, + "language_loss": 0.7508176, + "learning_rate": 7.715503110824326e-07, + "loss": 0.77224708, + "num_input_tokens_seen": 258113905, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.11175537, + "step": 11962, + "time_per_iteration": 2.447021961212158 + }, + { + "auxiliary_loss_clip": 0.01122264, + "auxiliary_loss_mlp": 0.01032227, + "balance_loss_clip": 1.04619718, + "balance_loss_mlp": 1.01810074, + "epoch": 0.7192544716669171, + "flos": 22565332131840.0, + "grad_norm": 4.389523296707008, + "language_loss": 0.75465369, + "learning_rate": 7.712429980637001e-07, + "loss": 0.77619857, + "num_input_tokens_seen": 258132820, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.14123535, + "step": 11963, + "time_per_iteration": 2.496722936630249 + }, + { + "auxiliary_loss_clip": 0.01123148, + "auxiliary_loss_mlp": 0.01034997, + "balance_loss_clip": 1.04524088, + "balance_loss_mlp": 1.02135336, + "epoch": 0.7193145949195852, + "flos": 18982200873600.0, + "grad_norm": 2.4623244053376228, + "language_loss": 0.80874664, + "learning_rate": 7.709357316395564e-07, + "loss": 0.83032811, + "num_input_tokens_seen": 258148055, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.13641357, + "step": 11964, + "time_per_iteration": 2.433311700820923 + }, + { + "auxiliary_loss_clip": 0.01118604, + "auxiliary_loss_mlp": 0.01033799, + "balance_loss_clip": 1.04625678, + "balance_loss_mlp": 1.02128768, + "epoch": 0.7193747181722531, + "flos": 18004246208640.0, + "grad_norm": 2.0440258865539906, + "language_loss": 0.74917305, + "learning_rate": 7.70628511821652e-07, + "loss": 0.77069712, + "num_input_tokens_seen": 258165995, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.12506104, + "step": 11965, + "time_per_iteration": 2.446084499359131 + }, + { + "auxiliary_loss_clip": 0.01125311, + "auxiliary_loss_mlp": 0.01033053, + "balance_loss_clip": 1.04728055, + "balance_loss_mlp": 1.02020192, + "epoch": 0.7194348414249211, + "flos": 24389414547840.0, + "grad_norm": 1.7420350424394835, + "language_loss": 0.77463382, + "learning_rate": 7.703213386216377e-07, + "loss": 0.79621744, + "num_input_tokens_seen": 258186165, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.12860107, + "step": 11966, + "time_per_iteration": 3.8263704776763916 + }, + { + "auxiliary_loss_clip": 0.01114941, + "auxiliary_loss_mlp": 0.0103056, + "balance_loss_clip": 1.0423727, + "balance_loss_mlp": 1.01870418, + "epoch": 0.7194949646775891, + "flos": 22163455791360.0, + "grad_norm": 2.1347398968151383, + "language_loss": 0.72903502, + "learning_rate": 7.700142120511619e-07, + "loss": 0.75049007, + "num_input_tokens_seen": 258204595, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11846924, + "step": 11967, + "time_per_iteration": 2.4477620124816895 + }, + { + "auxiliary_loss_clip": 0.01116058, + "auxiliary_loss_mlp": 0.01029895, + "balance_loss_clip": 1.04875994, + "balance_loss_mlp": 1.01931465, + "epoch": 0.719555087930257, + "flos": 20266366982400.0, + "grad_norm": 2.782931480910464, + "language_loss": 0.81753457, + "learning_rate": 7.6970713212187e-07, + "loss": 0.83899409, + "num_input_tokens_seen": 258223110, + "router_z_loss_clip": 0.67333984, + "router_z_loss_mlp": 0.10577393, + "step": 11968, + "time_per_iteration": 2.5162575244903564 + }, + { + "auxiliary_loss_clip": 0.0111773, + "auxiliary_loss_mlp": 0.01029074, + "balance_loss_clip": 1.04520178, + "balance_loss_mlp": 1.01714122, + "epoch": 0.719615211182925, + "flos": 24716309247360.0, + "grad_norm": 1.871734049347213, + "language_loss": 0.76709324, + "learning_rate": 7.69400098845407e-07, + "loss": 0.78856128, + "num_input_tokens_seen": 258242660, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11938477, + "step": 11969, + "time_per_iteration": 2.4740512371063232 + }, + { + "auxiliary_loss_clip": 0.01116277, + "auxiliary_loss_mlp": 0.01027867, + "balance_loss_clip": 1.04256415, + "balance_loss_mlp": 1.01495039, + "epoch": 0.719675334435593, + "flos": 20009641501440.0, + "grad_norm": 1.4559332361524095, + "language_loss": 0.71276516, + "learning_rate": 7.69093112233417e-07, + "loss": 0.73420662, + "num_input_tokens_seen": 258261850, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.12915039, + "step": 11970, + "time_per_iteration": 2.3949222564697266 + }, + { + "auxiliary_loss_clip": 0.01050297, + "auxiliary_loss_mlp": 0.01003396, + "balance_loss_clip": 1.02503586, + "balance_loss_mlp": 1.00184894, + "epoch": 0.719735457688261, + "flos": 44199861177600.0, + "grad_norm": 0.9122383852581815, + "language_loss": 0.60811818, + "learning_rate": 7.68786172297538e-07, + "loss": 0.62865508, + "num_input_tokens_seen": 258312570, + "router_z_loss_clip": 0.25244141, + "router_z_loss_mlp": 0.01547241, + "step": 11971, + "time_per_iteration": 4.370787858963013 + }, + { + "auxiliary_loss_clip": 0.01125343, + "auxiliary_loss_mlp": 0.01034216, + "balance_loss_clip": 1.04800367, + "balance_loss_mlp": 1.02101374, + "epoch": 0.7197955809409289, + "flos": 16802890905600.0, + "grad_norm": 2.3280269421893904, + "language_loss": 0.80281729, + "learning_rate": 7.684792790494105e-07, + "loss": 0.82441294, + "num_input_tokens_seen": 258331600, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.13189697, + "step": 11972, + "time_per_iteration": 2.4764671325683594 + }, + { + "auxiliary_loss_clip": 0.0112456, + "auxiliary_loss_mlp": 0.01036068, + "balance_loss_clip": 1.04895926, + "balance_loss_mlp": 1.02313411, + "epoch": 0.7198557041935969, + "flos": 24535391420160.0, + "grad_norm": 1.826557915852691, + "language_loss": 0.75540948, + "learning_rate": 7.681724325006733e-07, + "loss": 0.77701575, + "num_input_tokens_seen": 258351785, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12939453, + "step": 11973, + "time_per_iteration": 2.5073416233062744 + }, + { + "auxiliary_loss_clip": 0.01049557, + "auxiliary_loss_mlp": 0.00999903, + "balance_loss_clip": 1.02467656, + "balance_loss_mlp": 0.99850577, + "epoch": 0.7199158274462648, + "flos": 70710839602560.0, + "grad_norm": 0.8546941867341274, + "language_loss": 0.57263237, + "learning_rate": 7.6786563266296e-07, + "loss": 0.59312695, + "num_input_tokens_seen": 258404035, + "router_z_loss_clip": 0.2487793, + "router_z_loss_mlp": 0.01397705, + "step": 11974, + "time_per_iteration": 2.922130823135376 + }, + { + "auxiliary_loss_clip": 0.01118694, + "auxiliary_loss_mlp": 0.01033453, + "balance_loss_clip": 1.04246509, + "balance_loss_mlp": 1.02065003, + "epoch": 0.7199759506989328, + "flos": 29347995352320.0, + "grad_norm": 2.0803201002511265, + "language_loss": 0.61239707, + "learning_rate": 7.675588795479062e-07, + "loss": 0.63391852, + "num_input_tokens_seen": 258424850, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.12805176, + "step": 11975, + "time_per_iteration": 2.508955478668213 + }, + { + "auxiliary_loss_clip": 0.01115247, + "auxiliary_loss_mlp": 0.0103063, + "balance_loss_clip": 1.041152, + "balance_loss_mlp": 1.01894748, + "epoch": 0.7200360739516007, + "flos": 24640465680000.0, + "grad_norm": 1.8547376491410899, + "language_loss": 0.67680061, + "learning_rate": 7.672521731671425e-07, + "loss": 0.69825935, + "num_input_tokens_seen": 258445485, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11688232, + "step": 11976, + "time_per_iteration": 2.5578737258911133 + }, + { + "auxiliary_loss_clip": 0.01115202, + "auxiliary_loss_mlp": 0.01029254, + "balance_loss_clip": 1.04288888, + "balance_loss_mlp": 1.01785803, + "epoch": 0.7200961972042688, + "flos": 20812855478400.0, + "grad_norm": 1.8268163741571803, + "language_loss": 0.67235285, + "learning_rate": 7.669455135323004e-07, + "loss": 0.69379747, + "num_input_tokens_seen": 258464505, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11395264, + "step": 11977, + "time_per_iteration": 3.8912203311920166 + }, + { + "auxiliary_loss_clip": 0.01121052, + "auxiliary_loss_mlp": 0.01033969, + "balance_loss_clip": 1.04561448, + "balance_loss_mlp": 1.02191675, + "epoch": 0.7201563204569367, + "flos": 31245910174080.0, + "grad_norm": 1.6303447825630455, + "language_loss": 0.75519133, + "learning_rate": 7.666389006550074e-07, + "loss": 0.7767415, + "num_input_tokens_seen": 258487190, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12060547, + "step": 11978, + "time_per_iteration": 2.506805181503296 + }, + { + "auxiliary_loss_clip": 0.01117888, + "auxiliary_loss_mlp": 0.01030078, + "balance_loss_clip": 1.04561841, + "balance_loss_mlp": 1.01794863, + "epoch": 0.7202164437096047, + "flos": 26651391667200.0, + "grad_norm": 1.8737537580007277, + "language_loss": 0.78921974, + "learning_rate": 7.663323345468908e-07, + "loss": 0.8106994, + "num_input_tokens_seen": 258503790, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.12121582, + "step": 11979, + "time_per_iteration": 2.4836206436157227 + }, + { + "auxiliary_loss_clip": 0.0111882, + "auxiliary_loss_mlp": 0.01027957, + "balance_loss_clip": 1.04554665, + "balance_loss_mlp": 1.01563108, + "epoch": 0.7202765669622727, + "flos": 25959608657280.0, + "grad_norm": 1.8506340968696509, + "language_loss": 0.64635283, + "learning_rate": 7.660258152195767e-07, + "loss": 0.66782057, + "num_input_tokens_seen": 258527335, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.12335205, + "step": 11980, + "time_per_iteration": 2.5403432846069336 + }, + { + "auxiliary_loss_clip": 0.01113987, + "auxiliary_loss_mlp": 0.01033776, + "balance_loss_clip": 1.04055405, + "balance_loss_mlp": 1.02022791, + "epoch": 0.7203366902149406, + "flos": 28512354372480.0, + "grad_norm": 1.9593720599545175, + "language_loss": 0.67427802, + "learning_rate": 7.657193426846871e-07, + "loss": 0.6957556, + "num_input_tokens_seen": 258546690, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.13562012, + "step": 11981, + "time_per_iteration": 2.526365041732788 + }, + { + "auxiliary_loss_clip": 0.01120648, + "auxiliary_loss_mlp": 0.01032788, + "balance_loss_clip": 1.04493511, + "balance_loss_mlp": 1.01974034, + "epoch": 0.7203968134676086, + "flos": 21106030285440.0, + "grad_norm": 1.7886445355049394, + "language_loss": 0.73719108, + "learning_rate": 7.65412916953843e-07, + "loss": 0.75872546, + "num_input_tokens_seen": 258566340, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.13049316, + "step": 11982, + "time_per_iteration": 2.4084842205047607 + }, + { + "auxiliary_loss_clip": 0.01113756, + "auxiliary_loss_mlp": 0.01030191, + "balance_loss_clip": 1.04045987, + "balance_loss_mlp": 1.01954007, + "epoch": 0.7204569367202766, + "flos": 18332146488960.0, + "grad_norm": 2.064331261351503, + "language_loss": 0.65664452, + "learning_rate": 7.65106538038665e-07, + "loss": 0.67808402, + "num_input_tokens_seen": 258584455, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.10650635, + "step": 11983, + "time_per_iteration": 2.3984785079956055 + }, + { + "auxiliary_loss_clip": 0.01123287, + "auxiliary_loss_mlp": 0.01032883, + "balance_loss_clip": 1.04894447, + "balance_loss_mlp": 1.02064037, + "epoch": 0.7205170599729446, + "flos": 23255103980160.0, + "grad_norm": 1.6272207654764672, + "language_loss": 0.66511381, + "learning_rate": 7.648002059507715e-07, + "loss": 0.68667549, + "num_input_tokens_seen": 258604725, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.12237549, + "step": 11984, + "time_per_iteration": 2.430907726287842 + }, + { + "auxiliary_loss_clip": 0.01117163, + "auxiliary_loss_mlp": 0.01031522, + "balance_loss_clip": 1.04205966, + "balance_loss_mlp": 1.01873076, + "epoch": 0.7205771832256125, + "flos": 20120892900480.0, + "grad_norm": 1.7423995621174617, + "language_loss": 0.73783702, + "learning_rate": 7.644939207017771e-07, + "loss": 0.75932389, + "num_input_tokens_seen": 258622885, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.12799072, + "step": 11985, + "time_per_iteration": 2.411020040512085 + }, + { + "auxiliary_loss_clip": 0.01126639, + "auxiliary_loss_mlp": 0.010284, + "balance_loss_clip": 1.05283844, + "balance_loss_mlp": 1.01644289, + "epoch": 0.7206373064782805, + "flos": 27703250565120.0, + "grad_norm": 1.684880097783337, + "language_loss": 0.62762022, + "learning_rate": 7.641876823032977e-07, + "loss": 0.64917064, + "num_input_tokens_seen": 258644305, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11968994, + "step": 11986, + "time_per_iteration": 2.4965481758117676 + }, + { + "auxiliary_loss_clip": 0.01120949, + "auxiliary_loss_mlp": 0.01033084, + "balance_loss_clip": 1.04704642, + "balance_loss_mlp": 1.01801562, + "epoch": 0.7206974297309484, + "flos": 17968156018560.0, + "grad_norm": 1.9207743445275713, + "language_loss": 0.72696781, + "learning_rate": 7.638814907669455e-07, + "loss": 0.7485081, + "num_input_tokens_seen": 258661775, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.1506958, + "step": 11987, + "time_per_iteration": 2.40541934967041 + }, + { + "auxiliary_loss_clip": 0.01113532, + "auxiliary_loss_mlp": 0.01032989, + "balance_loss_clip": 1.03879118, + "balance_loss_mlp": 1.02008486, + "epoch": 0.7207575529836164, + "flos": 16983162288000.0, + "grad_norm": 1.9160205353415998, + "language_loss": 0.7877388, + "learning_rate": 7.635753461043301e-07, + "loss": 0.80920398, + "num_input_tokens_seen": 258679830, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.12915039, + "step": 11988, + "time_per_iteration": 2.4726099967956543 + }, + { + "auxiliary_loss_clip": 0.01120856, + "auxiliary_loss_mlp": 0.01031984, + "balance_loss_clip": 1.04678559, + "balance_loss_mlp": 1.01990831, + "epoch": 0.7208176762362843, + "flos": 18727594295040.0, + "grad_norm": 1.866316178073426, + "language_loss": 0.78643298, + "learning_rate": 7.632692483270618e-07, + "loss": 0.8079614, + "num_input_tokens_seen": 258697415, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12084961, + "step": 11989, + "time_per_iteration": 2.4208438396453857 + }, + { + "auxiliary_loss_clip": 0.01119649, + "auxiliary_loss_mlp": 0.01032526, + "balance_loss_clip": 1.04823232, + "balance_loss_mlp": 1.0204916, + "epoch": 0.7208777994889524, + "flos": 18734489706240.0, + "grad_norm": 1.7641145141791925, + "language_loss": 0.82205582, + "learning_rate": 7.629631974467481e-07, + "loss": 0.8435775, + "num_input_tokens_seen": 258716755, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.12042236, + "step": 11990, + "time_per_iteration": 2.468634605407715 + }, + { + "auxiliary_loss_clip": 0.01119693, + "auxiliary_loss_mlp": 0.01035227, + "balance_loss_clip": 1.04785991, + "balance_loss_mlp": 1.02425337, + "epoch": 0.7209379227416203, + "flos": 14793437376000.0, + "grad_norm": 1.8893657428367172, + "language_loss": 0.75998425, + "learning_rate": 7.626571934749931e-07, + "loss": 0.78153348, + "num_input_tokens_seen": 258733270, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.10986328, + "step": 11991, + "time_per_iteration": 2.441307544708252 + }, + { + "auxiliary_loss_clip": 0.01115066, + "auxiliary_loss_mlp": 0.01030449, + "balance_loss_clip": 1.0444603, + "balance_loss_mlp": 1.0183847, + "epoch": 0.7209980459942883, + "flos": 29636860527360.0, + "grad_norm": 1.7980372733423544, + "language_loss": 0.72690159, + "learning_rate": 7.623512364234022e-07, + "loss": 0.7483567, + "num_input_tokens_seen": 258755270, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.1206665, + "step": 11992, + "time_per_iteration": 3.9546191692352295 + }, + { + "auxiliary_loss_clip": 0.01112556, + "auxiliary_loss_mlp": 0.01034709, + "balance_loss_clip": 1.0395875, + "balance_loss_mlp": 1.02145863, + "epoch": 0.7210581692469563, + "flos": 23477175815040.0, + "grad_norm": 1.591264137492236, + "language_loss": 0.66376823, + "learning_rate": 7.620453263035755e-07, + "loss": 0.68524086, + "num_input_tokens_seen": 258775340, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.13244629, + "step": 11993, + "time_per_iteration": 2.5259382724761963 + }, + { + "auxiliary_loss_clip": 0.01112345, + "auxiliary_loss_mlp": 0.01034207, + "balance_loss_clip": 1.03977013, + "balance_loss_mlp": 1.02127838, + "epoch": 0.7211182924996242, + "flos": 26099839353600.0, + "grad_norm": 2.2782345334757177, + "language_loss": 0.65797448, + "learning_rate": 7.61739463127115e-07, + "loss": 0.67944002, + "num_input_tokens_seen": 258794580, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.12921143, + "step": 11994, + "time_per_iteration": 2.4849793910980225 + }, + { + "auxiliary_loss_clip": 0.01115228, + "auxiliary_loss_mlp": 0.01034671, + "balance_loss_clip": 1.0418489, + "balance_loss_mlp": 1.01976395, + "epoch": 0.7211784157522922, + "flos": 17712076982400.0, + "grad_norm": 1.777265198166603, + "language_loss": 0.66836751, + "learning_rate": 7.614336469056172e-07, + "loss": 0.68986654, + "num_input_tokens_seen": 258812330, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.14923096, + "step": 11995, + "time_per_iteration": 2.4875082969665527 + }, + { + "auxiliary_loss_clip": 0.01114189, + "auxiliary_loss_mlp": 0.01040043, + "balance_loss_clip": 1.04178095, + "balance_loss_mlp": 1.02464139, + "epoch": 0.7212385390049602, + "flos": 24423637230720.0, + "grad_norm": 1.754802299950762, + "language_loss": 0.7997036, + "learning_rate": 7.6112787765068e-07, + "loss": 0.82124591, + "num_input_tokens_seen": 258831770, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.1539917, + "step": 11996, + "time_per_iteration": 2.4677746295928955 + }, + { + "auxiliary_loss_clip": 0.01123817, + "auxiliary_loss_mlp": 0.01030416, + "balance_loss_clip": 1.0482285, + "balance_loss_mlp": 1.01819718, + "epoch": 0.7212986622576282, + "flos": 28147250580480.0, + "grad_norm": 2.452747856931207, + "language_loss": 0.81532842, + "learning_rate": 7.60822155373899e-07, + "loss": 0.83687073, + "num_input_tokens_seen": 258849090, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12213135, + "step": 11997, + "time_per_iteration": 2.5290260314941406 + }, + { + "auxiliary_loss_clip": 0.01125601, + "auxiliary_loss_mlp": 0.01033652, + "balance_loss_clip": 1.04716754, + "balance_loss_mlp": 1.02027655, + "epoch": 0.7213587855102961, + "flos": 21835770992640.0, + "grad_norm": 2.0246348426549963, + "language_loss": 0.67292386, + "learning_rate": 7.605164800868646e-07, + "loss": 0.69451642, + "num_input_tokens_seen": 258868230, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.13378906, + "step": 11998, + "time_per_iteration": 2.4867515563964844 + }, + { + "auxiliary_loss_clip": 0.01121234, + "auxiliary_loss_mlp": 0.01029885, + "balance_loss_clip": 1.04885077, + "balance_loss_mlp": 1.01870346, + "epoch": 0.7214189087629641, + "flos": 14611549881600.0, + "grad_norm": 1.9601225454699183, + "language_loss": 0.72481149, + "learning_rate": 7.602108518011696e-07, + "loss": 0.74632269, + "num_input_tokens_seen": 258885525, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11181641, + "step": 11999, + "time_per_iteration": 2.424537420272827 + }, + { + "auxiliary_loss_clip": 0.01118273, + "auxiliary_loss_mlp": 0.01027921, + "balance_loss_clip": 1.04451299, + "balance_loss_mlp": 1.01555872, + "epoch": 0.721479032015632, + "flos": 19390864884480.0, + "grad_norm": 2.9378058234889215, + "language_loss": 0.83256531, + "learning_rate": 7.599052705284039e-07, + "loss": 0.85402721, + "num_input_tokens_seen": 258903245, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.12365723, + "step": 12000, + "time_per_iteration": 2.4199182987213135 + }, + { + "auxiliary_loss_clip": 0.01118423, + "auxiliary_loss_mlp": 0.01032854, + "balance_loss_clip": 1.04567003, + "balance_loss_mlp": 1.02073622, + "epoch": 0.7215391552683, + "flos": 18512884748160.0, + "grad_norm": 4.02756812541454, + "language_loss": 0.77185071, + "learning_rate": 7.59599736280154e-07, + "loss": 0.79336345, + "num_input_tokens_seen": 258921245, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.12115479, + "step": 12001, + "time_per_iteration": 2.4241724014282227 + }, + { + "auxiliary_loss_clip": 0.01128096, + "auxiliary_loss_mlp": 0.01040298, + "balance_loss_clip": 1.056705, + "balance_loss_mlp": 1.02787685, + "epoch": 0.721599278520968, + "flos": 23258731253760.0, + "grad_norm": 1.8130428535061234, + "language_loss": 0.81674892, + "learning_rate": 7.592942490680066e-07, + "loss": 0.83843291, + "num_input_tokens_seen": 258939425, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.12426758, + "step": 12002, + "time_per_iteration": 2.4323179721832275 + }, + { + "auxiliary_loss_clip": 0.01126006, + "auxiliary_loss_mlp": 0.01031663, + "balance_loss_clip": 1.04854345, + "balance_loss_mlp": 1.01871085, + "epoch": 0.721659401773636, + "flos": 39199045979520.0, + "grad_norm": 2.1721587339097828, + "language_loss": 0.62266529, + "learning_rate": 7.589888089035462e-07, + "loss": 0.64424193, + "num_input_tokens_seen": 258960710, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12969971, + "step": 12003, + "time_per_iteration": 2.6928203105926514 + }, + { + "auxiliary_loss_clip": 0.01122042, + "auxiliary_loss_mlp": 0.0102887, + "balance_loss_clip": 1.04709959, + "balance_loss_mlp": 1.01648438, + "epoch": 0.7217195250263039, + "flos": 14939917038720.0, + "grad_norm": 32.28100078785026, + "language_loss": 0.68598557, + "learning_rate": 7.586834157983544e-07, + "loss": 0.70749462, + "num_input_tokens_seen": 258978475, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12390137, + "step": 12004, + "time_per_iteration": 2.4480926990509033 + }, + { + "auxiliary_loss_clip": 0.0105878, + "auxiliary_loss_mlp": 0.01010178, + "balance_loss_clip": 1.03424895, + "balance_loss_mlp": 1.00884449, + "epoch": 0.7217796482789719, + "flos": 70869206666880.0, + "grad_norm": 0.8581989119894202, + "language_loss": 0.54100215, + "learning_rate": 7.583780697640112e-07, + "loss": 0.56169176, + "num_input_tokens_seen": 259037520, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.01333618, + "step": 12005, + "time_per_iteration": 3.0267982482910156 + }, + { + "auxiliary_loss_clip": 0.01121703, + "auxiliary_loss_mlp": 0.01034189, + "balance_loss_clip": 1.04912651, + "balance_loss_mlp": 1.02183914, + "epoch": 0.7218397715316398, + "flos": 37451525402880.0, + "grad_norm": 1.9181931100237133, + "language_loss": 0.63534081, + "learning_rate": 7.580727708120962e-07, + "loss": 0.65689969, + "num_input_tokens_seen": 259061325, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.12347412, + "step": 12006, + "time_per_iteration": 2.5635428428649902 + }, + { + "auxiliary_loss_clip": 0.01125442, + "auxiliary_loss_mlp": 0.01032658, + "balance_loss_clip": 1.05260253, + "balance_loss_mlp": 1.02119052, + "epoch": 0.7218998947843078, + "flos": 22710662559360.0, + "grad_norm": 1.9258400280277075, + "language_loss": 0.9215368, + "learning_rate": 7.577675189541865e-07, + "loss": 0.94311786, + "num_input_tokens_seen": 259078135, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11456299, + "step": 12007, + "time_per_iteration": 2.4900715351104736 + }, + { + "auxiliary_loss_clip": 0.01109133, + "auxiliary_loss_mlp": 0.01030057, + "balance_loss_clip": 1.03554308, + "balance_loss_mlp": 1.01723623, + "epoch": 0.7219600180369758, + "flos": 12167182477440.0, + "grad_norm": 2.728646291307606, + "language_loss": 0.64096969, + "learning_rate": 7.574623142018568e-07, + "loss": 0.66236156, + "num_input_tokens_seen": 259095910, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.12841797, + "step": 12008, + "time_per_iteration": 2.412855863571167 + }, + { + "auxiliary_loss_clip": 0.01120514, + "auxiliary_loss_mlp": 0.01030107, + "balance_loss_clip": 1.04348826, + "balance_loss_mlp": 1.01755452, + "epoch": 0.7220201412896438, + "flos": 22596573985920.0, + "grad_norm": 2.3741749788679587, + "language_loss": 0.78627682, + "learning_rate": 7.57157156566681e-07, + "loss": 0.80778295, + "num_input_tokens_seen": 259114225, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12548828, + "step": 12009, + "time_per_iteration": 3.8892438411712646 + }, + { + "auxiliary_loss_clip": 0.01116272, + "auxiliary_loss_mlp": 0.01033683, + "balance_loss_clip": 1.04114544, + "balance_loss_mlp": 1.02035582, + "epoch": 0.7220802645423118, + "flos": 26718651884160.0, + "grad_norm": 7.59267504091234, + "language_loss": 0.63834655, + "learning_rate": 7.568520460602297e-07, + "loss": 0.65984619, + "num_input_tokens_seen": 259134660, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.13342285, + "step": 12010, + "time_per_iteration": 2.4644458293914795 + }, + { + "auxiliary_loss_clip": 0.01124328, + "auxiliary_loss_mlp": 0.01031783, + "balance_loss_clip": 1.05085266, + "balance_loss_mlp": 1.01868761, + "epoch": 0.7221403877949797, + "flos": 24420548661120.0, + "grad_norm": 1.8839545931745312, + "language_loss": 0.77038836, + "learning_rate": 7.565469826940742e-07, + "loss": 0.79194951, + "num_input_tokens_seen": 259153300, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.13085938, + "step": 12011, + "time_per_iteration": 2.480703592300415 + }, + { + "auxiliary_loss_clip": 0.01121558, + "auxiliary_loss_mlp": 0.01033502, + "balance_loss_clip": 1.04982281, + "balance_loss_mlp": 1.02158713, + "epoch": 0.7222005110476477, + "flos": 23514379326720.0, + "grad_norm": 1.713640057076887, + "language_loss": 0.79444528, + "learning_rate": 7.56241966479781e-07, + "loss": 0.81599587, + "num_input_tokens_seen": 259172115, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11920166, + "step": 12012, + "time_per_iteration": 2.509669780731201 + }, + { + "auxiliary_loss_clip": 0.01114406, + "auxiliary_loss_mlp": 0.01030979, + "balance_loss_clip": 1.04049468, + "balance_loss_mlp": 1.019261, + "epoch": 0.7222606343003156, + "flos": 23112538899840.0, + "grad_norm": 2.579791571130547, + "language_loss": 0.75896621, + "learning_rate": 7.559369974289171e-07, + "loss": 0.78042006, + "num_input_tokens_seen": 259191345, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.11730957, + "step": 12013, + "time_per_iteration": 2.477534055709839 + }, + { + "auxiliary_loss_clip": 0.01122315, + "auxiliary_loss_mlp": 0.01025304, + "balance_loss_clip": 1.04646897, + "balance_loss_mlp": 1.01319289, + "epoch": 0.7223207575529836, + "flos": 24351169541760.0, + "grad_norm": 1.5845512915901645, + "language_loss": 0.75773019, + "learning_rate": 7.556320755530484e-07, + "loss": 0.7792064, + "num_input_tokens_seen": 259211700, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12127686, + "step": 12014, + "time_per_iteration": 3.84297776222229 + }, + { + "auxiliary_loss_clip": 0.01118319, + "auxiliary_loss_mlp": 0.01030051, + "balance_loss_clip": 1.0437392, + "balance_loss_mlp": 1.01784945, + "epoch": 0.7223808808056515, + "flos": 28330179569280.0, + "grad_norm": 1.5947735391156528, + "language_loss": 0.86727571, + "learning_rate": 7.553272008637346e-07, + "loss": 0.88875937, + "num_input_tokens_seen": 259233825, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.12200928, + "step": 12015, + "time_per_iteration": 2.5156445503234863 + }, + { + "auxiliary_loss_clip": 0.01117061, + "auxiliary_loss_mlp": 0.01033729, + "balance_loss_clip": 1.04511452, + "balance_loss_mlp": 1.02201664, + "epoch": 0.7224410040583196, + "flos": 21069437304960.0, + "grad_norm": 1.8840070604173238, + "language_loss": 0.78203183, + "learning_rate": 7.55022373372538e-07, + "loss": 0.80353975, + "num_input_tokens_seen": 259253055, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.1171875, + "step": 12016, + "time_per_iteration": 2.495271921157837 + }, + { + "auxiliary_loss_clip": 0.01114932, + "auxiliary_loss_mlp": 0.01036975, + "balance_loss_clip": 1.0445329, + "balance_loss_mlp": 1.02545953, + "epoch": 0.7225011273109875, + "flos": 26795429205120.0, + "grad_norm": 1.5800975275020537, + "language_loss": 0.77814794, + "learning_rate": 7.547175930910186e-07, + "loss": 0.799667, + "num_input_tokens_seen": 259273420, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.11523438, + "step": 12017, + "time_per_iteration": 2.543332099914551 + }, + { + "auxiliary_loss_clip": 0.01107006, + "auxiliary_loss_mlp": 0.01028916, + "balance_loss_clip": 1.03768146, + "balance_loss_mlp": 1.01791322, + "epoch": 0.7225612505636555, + "flos": 23583578878080.0, + "grad_norm": 1.866375978872914, + "language_loss": 0.74264944, + "learning_rate": 7.54412860030732e-07, + "loss": 0.76400864, + "num_input_tokens_seen": 259291000, + "router_z_loss_clip": 0.69287109, + "router_z_loss_mlp": 0.11004639, + "step": 12018, + "time_per_iteration": 2.5629324913024902 + }, + { + "auxiliary_loss_clip": 0.01119737, + "auxiliary_loss_mlp": 0.01035019, + "balance_loss_clip": 1.04917264, + "balance_loss_mlp": 1.02380776, + "epoch": 0.7226213738163234, + "flos": 20777627214720.0, + "grad_norm": 1.7488824013458302, + "language_loss": 0.77555519, + "learning_rate": 7.541081742032347e-07, + "loss": 0.79710275, + "num_input_tokens_seen": 259312390, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.11218262, + "step": 12019, + "time_per_iteration": 2.474216938018799 + }, + { + "auxiliary_loss_clip": 0.0111225, + "auxiliary_loss_mlp": 0.01026583, + "balance_loss_clip": 1.03994012, + "balance_loss_mlp": 1.01441145, + "epoch": 0.7226814970689914, + "flos": 32635832901120.0, + "grad_norm": 2.2345864308498697, + "language_loss": 0.74297607, + "learning_rate": 7.53803535620081e-07, + "loss": 0.76436448, + "num_input_tokens_seen": 259332645, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.12158203, + "step": 12020, + "time_per_iteration": 2.5401835441589355 + }, + { + "auxiliary_loss_clip": 0.01114087, + "auxiliary_loss_mlp": 0.01032838, + "balance_loss_clip": 1.03977704, + "balance_loss_mlp": 1.01993966, + "epoch": 0.7227416203216595, + "flos": 22454368041600.0, + "grad_norm": 1.9653909697131742, + "language_loss": 0.77727216, + "learning_rate": 7.534989442928219e-07, + "loss": 0.7987414, + "num_input_tokens_seen": 259353810, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.12890625, + "step": 12021, + "time_per_iteration": 3.946596622467041 + }, + { + "auxiliary_loss_clip": 0.01120046, + "auxiliary_loss_mlp": 0.01031557, + "balance_loss_clip": 1.04773498, + "balance_loss_mlp": 1.01955295, + "epoch": 0.7228017435743274, + "flos": 21652303299840.0, + "grad_norm": 1.5322560805249499, + "language_loss": 0.68372202, + "learning_rate": 7.531944002330073e-07, + "loss": 0.7052381, + "num_input_tokens_seen": 259372460, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.12005615, + "step": 12022, + "time_per_iteration": 2.5641417503356934 + }, + { + "auxiliary_loss_clip": 0.01113836, + "auxiliary_loss_mlp": 0.01029287, + "balance_loss_clip": 1.04132986, + "balance_loss_mlp": 1.01722336, + "epoch": 0.7228618668269954, + "flos": 29533474206720.0, + "grad_norm": 1.7954838262180581, + "language_loss": 0.69143593, + "learning_rate": 7.528899034521858e-07, + "loss": 0.71286714, + "num_input_tokens_seen": 259393275, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.1206665, + "step": 12023, + "time_per_iteration": 2.637131929397583 + }, + { + "auxiliary_loss_clip": 0.01116192, + "auxiliary_loss_mlp": 0.01026477, + "balance_loss_clip": 1.0433445, + "balance_loss_mlp": 1.01435947, + "epoch": 0.7229219900796633, + "flos": 27453815544960.0, + "grad_norm": 1.710196123854113, + "language_loss": 0.71089292, + "learning_rate": 7.525854539619052e-07, + "loss": 0.73231959, + "num_input_tokens_seen": 259416205, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.12115479, + "step": 12024, + "time_per_iteration": 2.524444580078125 + }, + { + "auxiliary_loss_clip": 0.01122818, + "auxiliary_loss_mlp": 0.01031694, + "balance_loss_clip": 1.05095267, + "balance_loss_mlp": 1.02035105, + "epoch": 0.7229821133323313, + "flos": 16289368116480.0, + "grad_norm": 1.7475555237663953, + "language_loss": 0.75709236, + "learning_rate": 7.522810517737089e-07, + "loss": 0.77863747, + "num_input_tokens_seen": 259433115, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11346436, + "step": 12025, + "time_per_iteration": 2.5082123279571533 + }, + { + "auxiliary_loss_clip": 0.01121182, + "auxiliary_loss_mlp": 0.01026428, + "balance_loss_clip": 1.0475316, + "balance_loss_mlp": 1.01469159, + "epoch": 0.7230422365849992, + "flos": 20412343854720.0, + "grad_norm": 2.3039588455685154, + "language_loss": 0.76339078, + "learning_rate": 7.519766968991395e-07, + "loss": 0.78486687, + "num_input_tokens_seen": 259450475, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11743164, + "step": 12026, + "time_per_iteration": 2.4158103466033936 + }, + { + "auxiliary_loss_clip": 0.01117114, + "auxiliary_loss_mlp": 0.01036475, + "balance_loss_clip": 1.04456329, + "balance_loss_mlp": 1.0252161, + "epoch": 0.7231023598376672, + "flos": 25593499284480.0, + "grad_norm": 2.232367016087739, + "language_loss": 0.67739165, + "learning_rate": 7.516723893497388e-07, + "loss": 0.69892752, + "num_input_tokens_seen": 259469355, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11254883, + "step": 12027, + "time_per_iteration": 2.4988720417022705 + }, + { + "auxiliary_loss_clip": 0.01113837, + "auxiliary_loss_mlp": 0.01028494, + "balance_loss_clip": 1.04088342, + "balance_loss_mlp": 1.01632857, + "epoch": 0.7231624830903352, + "flos": 25149607009920.0, + "grad_norm": 2.2819687019644026, + "language_loss": 0.79336309, + "learning_rate": 7.513681291370469e-07, + "loss": 0.81478637, + "num_input_tokens_seen": 259486565, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.12164307, + "step": 12028, + "time_per_iteration": 2.455434799194336 + }, + { + "auxiliary_loss_clip": 0.01109555, + "auxiliary_loss_mlp": 0.01028029, + "balance_loss_clip": 1.0376761, + "balance_loss_mlp": 1.01563072, + "epoch": 0.7232226063430032, + "flos": 21725740656000.0, + "grad_norm": 1.9476990811635602, + "language_loss": 0.81599921, + "learning_rate": 7.510639162726e-07, + "loss": 0.83737504, + "num_input_tokens_seen": 259505070, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.1239624, + "step": 12029, + "time_per_iteration": 2.463855266571045 + }, + { + "auxiliary_loss_clip": 0.01051424, + "auxiliary_loss_mlp": 0.01004895, + "balance_loss_clip": 1.02699125, + "balance_loss_mlp": 1.00330091, + "epoch": 0.7232827295956711, + "flos": 68436798491520.0, + "grad_norm": 0.8080438616064997, + "language_loss": 0.61784327, + "learning_rate": 7.507597507679347e-07, + "loss": 0.63840646, + "num_input_tokens_seen": 259569135, + "router_z_loss_clip": 0.24462891, + "router_z_loss_mlp": 0.01597595, + "step": 12030, + "time_per_iteration": 3.112332582473755 + }, + { + "auxiliary_loss_clip": 0.01108269, + "auxiliary_loss_mlp": 0.01030875, + "balance_loss_clip": 1.03684008, + "balance_loss_mlp": 1.01724946, + "epoch": 0.7233428528483391, + "flos": 20192642317440.0, + "grad_norm": 1.7534947634288232, + "language_loss": 0.78042173, + "learning_rate": 7.504556326345859e-07, + "loss": 0.80181313, + "num_input_tokens_seen": 259587035, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.13623047, + "step": 12031, + "time_per_iteration": 2.4771604537963867 + }, + { + "auxiliary_loss_clip": 0.0112095, + "auxiliary_loss_mlp": 0.01029553, + "balance_loss_clip": 1.04539764, + "balance_loss_mlp": 1.01715481, + "epoch": 0.723402976101007, + "flos": 23949472769280.0, + "grad_norm": 2.1334136906852095, + "language_loss": 0.81681693, + "learning_rate": 7.501515618840834e-07, + "loss": 0.83832192, + "num_input_tokens_seen": 259606140, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.1239624, + "step": 12032, + "time_per_iteration": 2.4312918186187744 + }, + { + "auxiliary_loss_clip": 0.01120564, + "auxiliary_loss_mlp": 0.01031911, + "balance_loss_clip": 1.04534185, + "balance_loss_mlp": 1.0193404, + "epoch": 0.723463099353675, + "flos": 20813394182400.0, + "grad_norm": 1.8967076631711994, + "language_loss": 0.7513268, + "learning_rate": 7.498475385279592e-07, + "loss": 0.77285159, + "num_input_tokens_seen": 259624275, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.12573242, + "step": 12033, + "time_per_iteration": 2.4466135501861572 + }, + { + "auxiliary_loss_clip": 0.01114992, + "auxiliary_loss_mlp": 0.01025971, + "balance_loss_clip": 1.04441106, + "balance_loss_mlp": 1.01499152, + "epoch": 0.723523222606343, + "flos": 19098013299840.0, + "grad_norm": 2.3305173469324183, + "language_loss": 0.75176024, + "learning_rate": 7.495435625777423e-07, + "loss": 0.77316988, + "num_input_tokens_seen": 259643465, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.10980225, + "step": 12034, + "time_per_iteration": 2.4130289554595947 + }, + { + "auxiliary_loss_clip": 0.01113101, + "auxiliary_loss_mlp": 0.01027794, + "balance_loss_clip": 1.03965425, + "balance_loss_mlp": 1.01670122, + "epoch": 0.723583345859011, + "flos": 26506994993280.0, + "grad_norm": 3.6270752706648306, + "language_loss": 0.80795217, + "learning_rate": 7.492396340449578e-07, + "loss": 0.82936108, + "num_input_tokens_seen": 259662500, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11096191, + "step": 12035, + "time_per_iteration": 2.514406204223633 + }, + { + "auxiliary_loss_clip": 0.01120064, + "auxiliary_loss_mlp": 0.01029751, + "balance_loss_clip": 1.04409957, + "balance_loss_mlp": 1.01751435, + "epoch": 0.723643469111679, + "flos": 16033863697920.0, + "grad_norm": 3.0207551942411013, + "language_loss": 0.60594344, + "learning_rate": 7.489357529411326e-07, + "loss": 0.62744159, + "num_input_tokens_seen": 259680140, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12237549, + "step": 12036, + "time_per_iteration": 3.7778148651123047 + }, + { + "auxiliary_loss_clip": 0.0111556, + "auxiliary_loss_mlp": 0.01028845, + "balance_loss_clip": 1.04377651, + "balance_loss_mlp": 1.01821792, + "epoch": 0.7237035923643469, + "flos": 21945549934080.0, + "grad_norm": 1.7811296380121313, + "language_loss": 0.67328358, + "learning_rate": 7.486319192777883e-07, + "loss": 0.6947276, + "num_input_tokens_seen": 259700160, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.10632324, + "step": 12037, + "time_per_iteration": 2.4600746631622314 + }, + { + "auxiliary_loss_clip": 0.01118825, + "auxiliary_loss_mlp": 0.0103283, + "balance_loss_clip": 1.04718912, + "balance_loss_mlp": 1.02106392, + "epoch": 0.7237637156170149, + "flos": 23583112001280.0, + "grad_norm": 2.0092581392218696, + "language_loss": 0.72237176, + "learning_rate": 7.483281330664479e-07, + "loss": 0.74388838, + "num_input_tokens_seen": 259720525, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11767578, + "step": 12038, + "time_per_iteration": 2.479135513305664 + }, + { + "auxiliary_loss_clip": 0.0112199, + "auxiliary_loss_mlp": 0.01030635, + "balance_loss_clip": 1.0482918, + "balance_loss_mlp": 1.01820111, + "epoch": 0.7238238388696828, + "flos": 20594698225920.0, + "grad_norm": 2.0000283650227098, + "language_loss": 0.71891409, + "learning_rate": 7.480243943186293e-07, + "loss": 0.74044031, + "num_input_tokens_seen": 259738680, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.12432861, + "step": 12039, + "time_per_iteration": 2.4740824699401855 + }, + { + "auxiliary_loss_clip": 0.01122904, + "auxiliary_loss_mlp": 0.0102797, + "balance_loss_clip": 1.04890549, + "balance_loss_mlp": 1.0167172, + "epoch": 0.7238839621223508, + "flos": 24207024263040.0, + "grad_norm": 2.4468409278178145, + "language_loss": 0.76236641, + "learning_rate": 7.477207030458513e-07, + "loss": 0.78387511, + "num_input_tokens_seen": 259758790, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11254883, + "step": 12040, + "time_per_iteration": 2.4725637435913086 + }, + { + "auxiliary_loss_clip": 0.0112068, + "auxiliary_loss_mlp": 0.01029292, + "balance_loss_clip": 1.04746282, + "balance_loss_mlp": 1.01753247, + "epoch": 0.7239440853750188, + "flos": 14209745368320.0, + "grad_norm": 1.556542800349932, + "language_loss": 0.76755524, + "learning_rate": 7.474170592596301e-07, + "loss": 0.78905499, + "num_input_tokens_seen": 259777370, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11761475, + "step": 12041, + "time_per_iteration": 2.519805431365967 + }, + { + "auxiliary_loss_clip": 0.01112677, + "auxiliary_loss_mlp": 0.01028767, + "balance_loss_clip": 1.03822553, + "balance_loss_mlp": 1.01702499, + "epoch": 0.7240042086276868, + "flos": 21614812479360.0, + "grad_norm": 3.6077116548578405, + "language_loss": 0.63762307, + "learning_rate": 7.471134629714797e-07, + "loss": 0.65903747, + "num_input_tokens_seen": 259794665, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.11743164, + "step": 12042, + "time_per_iteration": 2.408478021621704 + }, + { + "auxiliary_loss_clip": 0.01118493, + "auxiliary_loss_mlp": 0.01031931, + "balance_loss_clip": 1.04249454, + "balance_loss_mlp": 1.01917577, + "epoch": 0.7240643318803547, + "flos": 23331450337920.0, + "grad_norm": 2.063810430440595, + "language_loss": 0.83918726, + "learning_rate": 7.468099141929116e-07, + "loss": 0.86069155, + "num_input_tokens_seen": 259811110, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.12744141, + "step": 12043, + "time_per_iteration": 2.487234115600586 + }, + { + "auxiliary_loss_clip": 0.01122042, + "auxiliary_loss_mlp": 0.01033914, + "balance_loss_clip": 1.04394639, + "balance_loss_mlp": 1.02027678, + "epoch": 0.7241244551330227, + "flos": 24024849459840.0, + "grad_norm": 2.4818657155329644, + "language_loss": 0.64114344, + "learning_rate": 7.465064129354379e-07, + "loss": 0.66270292, + "num_input_tokens_seen": 259831080, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.13647461, + "step": 12044, + "time_per_iteration": 2.451563835144043 + }, + { + "auxiliary_loss_clip": 0.01125117, + "auxiliary_loss_mlp": 0.01035431, + "balance_loss_clip": 1.05075204, + "balance_loss_mlp": 1.02269995, + "epoch": 0.7241845783856906, + "flos": 18730323728640.0, + "grad_norm": 1.5475318703991923, + "language_loss": 0.81673527, + "learning_rate": 7.462029592105658e-07, + "loss": 0.83834076, + "num_input_tokens_seen": 259850135, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.1272583, + "step": 12045, + "time_per_iteration": 2.50400447845459 + }, + { + "auxiliary_loss_clip": 0.01114911, + "auxiliary_loss_mlp": 0.01028006, + "balance_loss_clip": 1.04561663, + "balance_loss_mlp": 1.01631141, + "epoch": 0.7242447016383586, + "flos": 19498668577920.0, + "grad_norm": 2.1784604934412286, + "language_loss": 0.72033173, + "learning_rate": 7.458995530298034e-07, + "loss": 0.74176085, + "num_input_tokens_seen": 259868185, + "router_z_loss_clip": 0.69335938, + "router_z_loss_mlp": 0.11688232, + "step": 12046, + "time_per_iteration": 2.443439245223999 + }, + { + "auxiliary_loss_clip": 0.0112022, + "auxiliary_loss_mlp": 0.0103143, + "balance_loss_clip": 1.04668474, + "balance_loss_mlp": 1.01874661, + "epoch": 0.7243048248910267, + "flos": 22163491704960.0, + "grad_norm": 1.8716533808199003, + "language_loss": 0.71292114, + "learning_rate": 7.455961944046553e-07, + "loss": 0.7344377, + "num_input_tokens_seen": 259887055, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.12689209, + "step": 12047, + "time_per_iteration": 2.4665303230285645 + }, + { + "auxiliary_loss_clip": 0.01122594, + "auxiliary_loss_mlp": 0.01039575, + "balance_loss_clip": 1.04664242, + "balance_loss_mlp": 1.02550197, + "epoch": 0.7243649481436946, + "flos": 27672762896640.0, + "grad_norm": 10.023337583686818, + "language_loss": 0.70076317, + "learning_rate": 7.45292883346627e-07, + "loss": 0.72238487, + "num_input_tokens_seen": 259908295, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.14080811, + "step": 12048, + "time_per_iteration": 2.5470688343048096 + }, + { + "auxiliary_loss_clip": 0.01049072, + "auxiliary_loss_mlp": 0.01006429, + "balance_loss_clip": 1.02412379, + "balance_loss_mlp": 1.00489688, + "epoch": 0.7244250713963626, + "flos": 63244545759360.0, + "grad_norm": 0.8249761675096686, + "language_loss": 0.53773987, + "learning_rate": 7.449896198672168e-07, + "loss": 0.55829489, + "num_input_tokens_seen": 259968475, + "router_z_loss_clip": 0.24951172, + "router_z_loss_mlp": 0.01533508, + "step": 12049, + "time_per_iteration": 3.0981833934783936 + }, + { + "auxiliary_loss_clip": 0.01122173, + "auxiliary_loss_mlp": 0.01035528, + "balance_loss_clip": 1.04347825, + "balance_loss_mlp": 1.01994765, + "epoch": 0.7244851946490305, + "flos": 17967114524160.0, + "grad_norm": 2.5230069355545295, + "language_loss": 0.60311699, + "learning_rate": 7.446864039779258e-07, + "loss": 0.62469405, + "num_input_tokens_seen": 259984865, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.15582275, + "step": 12050, + "time_per_iteration": 2.416292905807495 + }, + { + "auxiliary_loss_clip": 0.01042871, + "auxiliary_loss_mlp": 0.0100538, + "balance_loss_clip": 1.01814866, + "balance_loss_mlp": 1.00384974, + "epoch": 0.7245453179016985, + "flos": 70943649603840.0, + "grad_norm": 0.7665347852047982, + "language_loss": 0.53275239, + "learning_rate": 7.443832356902528e-07, + "loss": 0.55323493, + "num_input_tokens_seen": 260046735, + "router_z_loss_clip": 0.24755859, + "router_z_loss_mlp": 0.01531982, + "step": 12051, + "time_per_iteration": 3.0776302814483643 + }, + { + "auxiliary_loss_clip": 0.01115216, + "auxiliary_loss_mlp": 0.01031347, + "balance_loss_clip": 1.04418373, + "balance_loss_mlp": 1.02013493, + "epoch": 0.7246054411543664, + "flos": 24568464867840.0, + "grad_norm": 1.6864010198978148, + "language_loss": 0.71966529, + "learning_rate": 7.440801150156927e-07, + "loss": 0.74113095, + "num_input_tokens_seen": 260067950, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.11212158, + "step": 12052, + "time_per_iteration": 2.505783796310425 + }, + { + "auxiliary_loss_clip": 0.01118702, + "auxiliary_loss_mlp": 0.01031797, + "balance_loss_clip": 1.04651082, + "balance_loss_mlp": 1.0183388, + "epoch": 0.7246655644070344, + "flos": 32338312548480.0, + "grad_norm": 1.926132515051984, + "language_loss": 0.74275035, + "learning_rate": 7.437770419657415e-07, + "loss": 0.7642554, + "num_input_tokens_seen": 260087730, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.13452148, + "step": 12053, + "time_per_iteration": 4.070875644683838 + }, + { + "auxiliary_loss_clip": 0.0111731, + "auxiliary_loss_mlp": 0.0102995, + "balance_loss_clip": 1.04562283, + "balance_loss_mlp": 1.01776087, + "epoch": 0.7247256876597024, + "flos": 21872471713920.0, + "grad_norm": 4.770240566553976, + "language_loss": 0.78329271, + "learning_rate": 7.434740165518898e-07, + "loss": 0.80476528, + "num_input_tokens_seen": 260107760, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.12182617, + "step": 12054, + "time_per_iteration": 2.4525814056396484 + }, + { + "auxiliary_loss_clip": 0.01109438, + "auxiliary_loss_mlp": 0.01032495, + "balance_loss_clip": 1.03811324, + "balance_loss_mlp": 1.02006793, + "epoch": 0.7247858109123704, + "flos": 16213093585920.0, + "grad_norm": 2.3070099140923697, + "language_loss": 0.68200576, + "learning_rate": 7.431710387856301e-07, + "loss": 0.70342505, + "num_input_tokens_seen": 260123660, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.12414551, + "step": 12055, + "time_per_iteration": 2.4456965923309326 + }, + { + "auxiliary_loss_clip": 0.01113657, + "auxiliary_loss_mlp": 0.01038152, + "balance_loss_clip": 1.04119873, + "balance_loss_mlp": 1.0243659, + "epoch": 0.7248459341650383, + "flos": 20850705434880.0, + "grad_norm": 1.8917576498623419, + "language_loss": 0.7414602, + "learning_rate": 7.428681086784496e-07, + "loss": 0.76297832, + "num_input_tokens_seen": 260142690, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.13800049, + "step": 12056, + "time_per_iteration": 2.4384264945983887 + }, + { + "auxiliary_loss_clip": 0.01117771, + "auxiliary_loss_mlp": 0.01025028, + "balance_loss_clip": 1.04653144, + "balance_loss_mlp": 1.01305366, + "epoch": 0.7249060574177063, + "flos": 25921794614400.0, + "grad_norm": 1.8765524628820918, + "language_loss": 0.70770168, + "learning_rate": 7.425652262418368e-07, + "loss": 0.72912973, + "num_input_tokens_seen": 260162590, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.11981201, + "step": 12057, + "time_per_iteration": 2.4840352535247803 + }, + { + "auxiliary_loss_clip": 0.01119448, + "auxiliary_loss_mlp": 0.01030267, + "balance_loss_clip": 1.04587364, + "balance_loss_mlp": 1.01802993, + "epoch": 0.7249661806703742, + "flos": 17345536646400.0, + "grad_norm": 2.0512431985548853, + "language_loss": 0.62310386, + "learning_rate": 7.42262391487277e-07, + "loss": 0.64460099, + "num_input_tokens_seen": 260181065, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.12237549, + "step": 12058, + "time_per_iteration": 3.855687379837036 + }, + { + "auxiliary_loss_clip": 0.01125952, + "auxiliary_loss_mlp": 0.01032427, + "balance_loss_clip": 1.04782939, + "balance_loss_mlp": 1.01978493, + "epoch": 0.7250263039230422, + "flos": 19574153009280.0, + "grad_norm": 2.053890483221917, + "language_loss": 0.74757373, + "learning_rate": 7.419596044262535e-07, + "loss": 0.76915753, + "num_input_tokens_seen": 260200330, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.12646484, + "step": 12059, + "time_per_iteration": 2.446981191635132 + }, + { + "auxiliary_loss_clip": 0.01114882, + "auxiliary_loss_mlp": 0.01031939, + "balance_loss_clip": 1.04359114, + "balance_loss_mlp": 1.02062607, + "epoch": 0.7250864271757103, + "flos": 21976648133760.0, + "grad_norm": 1.896048799199956, + "language_loss": 0.79720151, + "learning_rate": 7.416568650702472e-07, + "loss": 0.81866968, + "num_input_tokens_seen": 260219975, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.11303711, + "step": 12060, + "time_per_iteration": 2.4363443851470947 + }, + { + "auxiliary_loss_clip": 0.01121128, + "auxiliary_loss_mlp": 0.01028105, + "balance_loss_clip": 1.04742861, + "balance_loss_mlp": 1.01607096, + "epoch": 0.7251465504283782, + "flos": 25012608537600.0, + "grad_norm": 1.7002444961619632, + "language_loss": 0.76280546, + "learning_rate": 7.413541734307393e-07, + "loss": 0.78429782, + "num_input_tokens_seen": 260242025, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.12042236, + "step": 12061, + "time_per_iteration": 2.5368170738220215 + }, + { + "auxiliary_loss_clip": 0.0111349, + "auxiliary_loss_mlp": 0.01029927, + "balance_loss_clip": 1.04439473, + "balance_loss_mlp": 1.01859045, + "epoch": 0.7252066736810462, + "flos": 16690131135360.0, + "grad_norm": 2.8232683575591944, + "language_loss": 0.81353712, + "learning_rate": 7.410515295192068e-07, + "loss": 0.83497131, + "num_input_tokens_seen": 260260015, + "router_z_loss_clip": 0.69042969, + "router_z_loss_mlp": 0.11334229, + "step": 12062, + "time_per_iteration": 2.4302704334259033 + }, + { + "auxiliary_loss_clip": 0.01117797, + "auxiliary_loss_mlp": 0.01029796, + "balance_loss_clip": 1.04249215, + "balance_loss_mlp": 1.01640284, + "epoch": 0.7252667969337141, + "flos": 25703026830720.0, + "grad_norm": 3.142305445264046, + "language_loss": 0.69355547, + "learning_rate": 7.407489333471262e-07, + "loss": 0.71503139, + "num_input_tokens_seen": 260278635, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.13409424, + "step": 12063, + "time_per_iteration": 2.562464714050293 + }, + { + "auxiliary_loss_clip": 0.01107364, + "auxiliary_loss_mlp": 0.01024614, + "balance_loss_clip": 1.03859115, + "balance_loss_mlp": 1.01313972, + "epoch": 0.7253269201863821, + "flos": 18259930195200.0, + "grad_norm": 1.439408720572744, + "language_loss": 0.6983639, + "learning_rate": 7.40446384925973e-07, + "loss": 0.71968371, + "num_input_tokens_seen": 260298510, + "router_z_loss_clip": 0.68847656, + "router_z_loss_mlp": 0.11480713, + "step": 12064, + "time_per_iteration": 2.42175555229187 + }, + { + "auxiliary_loss_clip": 0.01121119, + "auxiliary_loss_mlp": 0.01027732, + "balance_loss_clip": 1.04751968, + "balance_loss_mlp": 1.01624608, + "epoch": 0.72538704343905, + "flos": 20411805150720.0, + "grad_norm": 1.804655671957962, + "language_loss": 0.90669048, + "learning_rate": 7.401438842672192e-07, + "loss": 0.92817891, + "num_input_tokens_seen": 260317405, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11499023, + "step": 12065, + "time_per_iteration": 3.9212074279785156 + }, + { + "auxiliary_loss_clip": 0.0103801, + "auxiliary_loss_mlp": 0.01006305, + "balance_loss_clip": 1.01388586, + "balance_loss_mlp": 1.00486839, + "epoch": 0.725447166691718, + "flos": 70151209706880.0, + "grad_norm": 0.6548834614991611, + "language_loss": 0.56077331, + "learning_rate": 7.398414313823349e-07, + "loss": 0.58121645, + "num_input_tokens_seen": 260388085, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.01434326, + "step": 12066, + "time_per_iteration": 3.235313653945923 + }, + { + "auxiliary_loss_clip": 0.01118714, + "auxiliary_loss_mlp": 0.01032074, + "balance_loss_clip": 1.04361343, + "balance_loss_mlp": 1.02062404, + "epoch": 0.725507289944386, + "flos": 27052334254080.0, + "grad_norm": 1.7162149291277768, + "language_loss": 0.76702464, + "learning_rate": 7.395390262827897e-07, + "loss": 0.78853261, + "num_input_tokens_seen": 260406165, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11462402, + "step": 12067, + "time_per_iteration": 2.48934268951416 + }, + { + "auxiliary_loss_clip": 0.01054011, + "auxiliary_loss_mlp": 0.01017592, + "balance_loss_clip": 1.02910304, + "balance_loss_mlp": 1.0161804, + "epoch": 0.725567413197054, + "flos": 62921924778240.0, + "grad_norm": 0.7320646661196547, + "language_loss": 0.56945717, + "learning_rate": 7.392366689800515e-07, + "loss": 0.59017324, + "num_input_tokens_seen": 260461365, + "router_z_loss_clip": 0.2487793, + "router_z_loss_mlp": 0.01409912, + "step": 12068, + "time_per_iteration": 2.9684853553771973 + }, + { + "auxiliary_loss_clip": 0.01075131, + "auxiliary_loss_mlp": 0.01004243, + "balance_loss_clip": 1.05043566, + "balance_loss_mlp": 1.00287163, + "epoch": 0.7256275364497219, + "flos": 60295957188480.0, + "grad_norm": 0.6593860359797561, + "language_loss": 0.55417299, + "learning_rate": 7.389343594855848e-07, + "loss": 0.57496673, + "num_input_tokens_seen": 260523795, + "router_z_loss_clip": 0.24682617, + "router_z_loss_mlp": 0.01371765, + "step": 12069, + "time_per_iteration": 3.1517651081085205 + }, + { + "auxiliary_loss_clip": 0.01122914, + "auxiliary_loss_mlp": 0.01030945, + "balance_loss_clip": 1.05202055, + "balance_loss_mlp": 1.01975131, + "epoch": 0.7256876597023899, + "flos": 24498511130880.0, + "grad_norm": 2.073992487894903, + "language_loss": 0.79446959, + "learning_rate": 7.38632097810854e-07, + "loss": 0.81600815, + "num_input_tokens_seen": 260544765, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.11193848, + "step": 12070, + "time_per_iteration": 2.5376791954040527 + }, + { + "auxiliary_loss_clip": 0.01108863, + "auxiliary_loss_mlp": 0.01028948, + "balance_loss_clip": 1.04068589, + "balance_loss_mlp": 1.01750386, + "epoch": 0.7257477829550578, + "flos": 24352749740160.0, + "grad_norm": 1.8045271895291222, + "language_loss": 0.72114635, + "learning_rate": 7.383298839673197e-07, + "loss": 0.7425245, + "num_input_tokens_seen": 260564340, + "router_z_loss_clip": 0.68261719, + "router_z_loss_mlp": 0.11450195, + "step": 12071, + "time_per_iteration": 2.4926257133483887 + }, + { + "auxiliary_loss_clip": 0.01110091, + "auxiliary_loss_mlp": 0.01034855, + "balance_loss_clip": 1.03995204, + "balance_loss_mlp": 1.02356017, + "epoch": 0.7258079062077258, + "flos": 17202217380480.0, + "grad_norm": 1.7899662386358155, + "language_loss": 0.70466208, + "learning_rate": 7.380277179664436e-07, + "loss": 0.72611153, + "num_input_tokens_seen": 260582565, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.11297607, + "step": 12072, + "time_per_iteration": 2.4979844093322754 + }, + { + "auxiliary_loss_clip": 0.01116173, + "auxiliary_loss_mlp": 0.01030774, + "balance_loss_clip": 1.0410707, + "balance_loss_mlp": 1.0181793, + "epoch": 0.7258680294603939, + "flos": 21580338401280.0, + "grad_norm": 1.9281471994939101, + "language_loss": 0.78385067, + "learning_rate": 7.377255998196821e-07, + "loss": 0.80532014, + "num_input_tokens_seen": 260601700, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.12609863, + "step": 12073, + "time_per_iteration": 2.471505641937256 + }, + { + "auxiliary_loss_clip": 0.01112583, + "auxiliary_loss_mlp": 0.01026971, + "balance_loss_clip": 1.04204726, + "balance_loss_mlp": 1.0148654, + "epoch": 0.7259281527130618, + "flos": 34855399036800.0, + "grad_norm": 1.4699576706110258, + "language_loss": 0.70179576, + "learning_rate": 7.374235295384923e-07, + "loss": 0.72319126, + "num_input_tokens_seen": 260623040, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.12109375, + "step": 12074, + "time_per_iteration": 2.538731336593628 + }, + { + "auxiliary_loss_clip": 0.01120496, + "auxiliary_loss_mlp": 0.01025108, + "balance_loss_clip": 1.04547954, + "balance_loss_mlp": 1.01262069, + "epoch": 0.7259882759657298, + "flos": 25404644551680.0, + "grad_norm": 1.748247795325758, + "language_loss": 0.74261326, + "learning_rate": 7.371215071343302e-07, + "loss": 0.76406932, + "num_input_tokens_seen": 260642735, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.12506104, + "step": 12075, + "time_per_iteration": 2.467914581298828 + }, + { + "auxiliary_loss_clip": 0.0111984, + "auxiliary_loss_mlp": 0.0102947, + "balance_loss_clip": 1.04704523, + "balance_loss_mlp": 1.0170362, + "epoch": 0.7260483992183977, + "flos": 62953630531200.0, + "grad_norm": 1.4812573356367622, + "language_loss": 0.63841391, + "learning_rate": 7.368195326186458e-07, + "loss": 0.65990698, + "num_input_tokens_seen": 260669935, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.12445068, + "step": 12076, + "time_per_iteration": 2.8526933193206787 + }, + { + "auxiliary_loss_clip": 0.01116037, + "auxiliary_loss_mlp": 0.01026676, + "balance_loss_clip": 1.04409432, + "balance_loss_mlp": 1.01490378, + "epoch": 0.7261085224710657, + "flos": 26467528924800.0, + "grad_norm": 1.9669160603226885, + "language_loss": 0.79109395, + "learning_rate": 7.365176060028912e-07, + "loss": 0.81252116, + "num_input_tokens_seen": 260689605, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.11767578, + "step": 12077, + "time_per_iteration": 2.465984582901001 + }, + { + "auxiliary_loss_clip": 0.01050713, + "auxiliary_loss_mlp": 0.010048, + "balance_loss_clip": 1.02496576, + "balance_loss_mlp": 1.00326228, + "epoch": 0.7261686457237336, + "flos": 66772732187520.0, + "grad_norm": 0.989937022271492, + "language_loss": 0.65002942, + "learning_rate": 7.362157272985163e-07, + "loss": 0.67058456, + "num_input_tokens_seen": 260748265, + "router_z_loss_clip": 0.25732422, + "router_z_loss_mlp": 0.01539612, + "step": 12078, + "time_per_iteration": 3.0501456260681152 + }, + { + "auxiliary_loss_clip": 0.01046721, + "auxiliary_loss_mlp": 0.01005108, + "balance_loss_clip": 1.02077246, + "balance_loss_mlp": 1.00361824, + "epoch": 0.7262287689764017, + "flos": 69999594399360.0, + "grad_norm": 0.76445950842708, + "language_loss": 0.5923351, + "learning_rate": 7.359138965169671e-07, + "loss": 0.61285341, + "num_input_tokens_seen": 260816715, + "router_z_loss_clip": 0.25927734, + "router_z_loss_mlp": 0.01489258, + "step": 12079, + "time_per_iteration": 3.1973345279693604 + }, + { + "auxiliary_loss_clip": 0.01112641, + "auxiliary_loss_mlp": 0.01028125, + "balance_loss_clip": 1.04055977, + "balance_loss_mlp": 1.01612687, + "epoch": 0.7262888922290696, + "flos": 23805435231360.0, + "grad_norm": 5.771524274614958, + "language_loss": 0.65017956, + "learning_rate": 7.356121136696895e-07, + "loss": 0.67158723, + "num_input_tokens_seen": 260836765, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11993408, + "step": 12080, + "time_per_iteration": 3.9572207927703857 + }, + { + "auxiliary_loss_clip": 0.01119797, + "auxiliary_loss_mlp": 0.01026671, + "balance_loss_clip": 1.04665709, + "balance_loss_mlp": 1.01438046, + "epoch": 0.7263490154817376, + "flos": 19500320603520.0, + "grad_norm": 4.593083288510441, + "language_loss": 0.70183158, + "learning_rate": 7.35310378768128e-07, + "loss": 0.72329623, + "num_input_tokens_seen": 260854610, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.12286377, + "step": 12081, + "time_per_iteration": 2.442500114440918 + }, + { + "auxiliary_loss_clip": 0.01117674, + "auxiliary_loss_mlp": 0.0102921, + "balance_loss_clip": 1.0431509, + "balance_loss_mlp": 1.01735497, + "epoch": 0.7264091387344055, + "flos": 16286243633280.0, + "grad_norm": 1.7623309532222566, + "language_loss": 0.81371224, + "learning_rate": 7.350086918237237e-07, + "loss": 0.835181, + "num_input_tokens_seen": 260871620, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11859131, + "step": 12082, + "time_per_iteration": 2.3981289863586426 + }, + { + "auxiliary_loss_clip": 0.01122234, + "auxiliary_loss_mlp": 0.01028801, + "balance_loss_clip": 1.0455327, + "balance_loss_mlp": 1.01584911, + "epoch": 0.7264692619870735, + "flos": 24352031468160.0, + "grad_norm": 1.7241434241210865, + "language_loss": 0.77147895, + "learning_rate": 7.347070528479158e-07, + "loss": 0.79298931, + "num_input_tokens_seen": 260890490, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.1293335, + "step": 12083, + "time_per_iteration": 2.525057554244995 + }, + { + "auxiliary_loss_clip": 0.01123502, + "auxiliary_loss_mlp": 0.01027953, + "balance_loss_clip": 1.04927754, + "balance_loss_mlp": 1.01568592, + "epoch": 0.7265293852397414, + "flos": 25119478477440.0, + "grad_norm": 1.7229245014557468, + "language_loss": 0.73086572, + "learning_rate": 7.344054618521433e-07, + "loss": 0.75238025, + "num_input_tokens_seen": 260909700, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12268066, + "step": 12084, + "time_per_iteration": 2.5065078735351562 + }, + { + "auxiliary_loss_clip": 0.01123312, + "auxiliary_loss_mlp": 0.01036634, + "balance_loss_clip": 1.04626727, + "balance_loss_mlp": 1.02365255, + "epoch": 0.7265895084924094, + "flos": 22638230784000.0, + "grad_norm": 1.7029223713897075, + "language_loss": 0.77755904, + "learning_rate": 7.34103918847843e-07, + "loss": 0.79915845, + "num_input_tokens_seen": 260929090, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.12982178, + "step": 12085, + "time_per_iteration": 2.5286521911621094 + }, + { + "auxiliary_loss_clip": 0.01117714, + "auxiliary_loss_mlp": 0.01030071, + "balance_loss_clip": 1.04444814, + "balance_loss_mlp": 1.01813209, + "epoch": 0.7266496317450775, + "flos": 23368222886400.0, + "grad_norm": 1.9589692587257659, + "language_loss": 0.72446382, + "learning_rate": 7.338024238464493e-07, + "loss": 0.74594164, + "num_input_tokens_seen": 260946615, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.1194458, + "step": 12086, + "time_per_iteration": 2.4372498989105225 + }, + { + "auxiliary_loss_clip": 0.01115956, + "auxiliary_loss_mlp": 0.01031428, + "balance_loss_clip": 1.04331243, + "balance_loss_mlp": 1.01930428, + "epoch": 0.7267097549977454, + "flos": 28074603323520.0, + "grad_norm": 1.588634797606115, + "language_loss": 0.69479203, + "learning_rate": 7.335009768593938e-07, + "loss": 0.71626592, + "num_input_tokens_seen": 260968515, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.12121582, + "step": 12087, + "time_per_iteration": 2.5200328826904297 + }, + { + "auxiliary_loss_clip": 0.01123193, + "auxiliary_loss_mlp": 0.01033104, + "balance_loss_clip": 1.04919338, + "balance_loss_mlp": 1.02036703, + "epoch": 0.7267698782504134, + "flos": 22195523658240.0, + "grad_norm": 2.0745533421717903, + "language_loss": 0.78612161, + "learning_rate": 7.331995778981088e-07, + "loss": 0.80768466, + "num_input_tokens_seen": 260986790, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.12744141, + "step": 12088, + "time_per_iteration": 2.4446253776550293 + }, + { + "auxiliary_loss_clip": 0.01116364, + "auxiliary_loss_mlp": 0.01036811, + "balance_loss_clip": 1.04249501, + "balance_loss_mlp": 1.02375197, + "epoch": 0.7268300015030813, + "flos": 18514859996160.0, + "grad_norm": 4.834725826651278, + "language_loss": 0.74137163, + "learning_rate": 7.328982269740221e-07, + "loss": 0.76290339, + "num_input_tokens_seen": 261004925, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.1307373, + "step": 12089, + "time_per_iteration": 2.445122718811035 + }, + { + "auxiliary_loss_clip": 0.01115612, + "auxiliary_loss_mlp": 0.01032271, + "balance_loss_clip": 1.04369831, + "balance_loss_mlp": 1.02044511, + "epoch": 0.7268901247557493, + "flos": 23986029836160.0, + "grad_norm": 1.824511681257002, + "language_loss": 0.7142539, + "learning_rate": 7.325969240985616e-07, + "loss": 0.73573267, + "num_input_tokens_seen": 261023895, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11834717, + "step": 12090, + "time_per_iteration": 2.4508183002471924 + }, + { + "auxiliary_loss_clip": 0.01119584, + "auxiliary_loss_mlp": 0.01032673, + "balance_loss_clip": 1.04504979, + "balance_loss_mlp": 1.01964951, + "epoch": 0.7269502480084172, + "flos": 32088087429120.0, + "grad_norm": 1.6342171307651587, + "language_loss": 0.76841843, + "learning_rate": 7.322956692831528e-07, + "loss": 0.78994101, + "num_input_tokens_seen": 261045445, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.13018799, + "step": 12091, + "time_per_iteration": 2.536163568496704 + }, + { + "auxiliary_loss_clip": 0.01118969, + "auxiliary_loss_mlp": 0.01032724, + "balance_loss_clip": 1.04385304, + "balance_loss_mlp": 1.02021253, + "epoch": 0.7270103712610853, + "flos": 19062785036160.0, + "grad_norm": 1.7254384851066764, + "language_loss": 0.7127623, + "learning_rate": 7.319944625392205e-07, + "loss": 0.73427927, + "num_input_tokens_seen": 261064275, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12524414, + "step": 12092, + "time_per_iteration": 2.4628806114196777 + }, + { + "auxiliary_loss_clip": 0.01120145, + "auxiliary_loss_mlp": 0.01029519, + "balance_loss_clip": 1.04744577, + "balance_loss_mlp": 1.01721025, + "epoch": 0.7270704945137532, + "flos": 34532921710080.0, + "grad_norm": 1.857261755524238, + "language_loss": 0.61132449, + "learning_rate": 7.31693303878184e-07, + "loss": 0.63282108, + "num_input_tokens_seen": 261083310, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.12316895, + "step": 12093, + "time_per_iteration": 2.554771661758423 + }, + { + "auxiliary_loss_clip": 0.01114293, + "auxiliary_loss_mlp": 0.01033325, + "balance_loss_clip": 1.04383159, + "balance_loss_mlp": 1.02142835, + "epoch": 0.7271306177664212, + "flos": 21507583403520.0, + "grad_norm": 1.5538335634395457, + "language_loss": 0.75525796, + "learning_rate": 7.313921933114644e-07, + "loss": 0.77673417, + "num_input_tokens_seen": 261103460, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.11889648, + "step": 12094, + "time_per_iteration": 2.4284677505493164 + }, + { + "auxiliary_loss_clip": 0.01104027, + "auxiliary_loss_mlp": 0.01027064, + "balance_loss_clip": 1.03408575, + "balance_loss_mlp": 1.01584673, + "epoch": 0.7271907410190891, + "flos": 22272444633600.0, + "grad_norm": 1.7848528906881742, + "language_loss": 0.84996271, + "learning_rate": 7.310911308504808e-07, + "loss": 0.87127364, + "num_input_tokens_seen": 261121375, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.11212158, + "step": 12095, + "time_per_iteration": 2.475046396255493 + }, + { + "auxiliary_loss_clip": 0.01113076, + "auxiliary_loss_mlp": 0.01033835, + "balance_loss_clip": 1.0413332, + "balance_loss_mlp": 1.02170563, + "epoch": 0.7272508642717571, + "flos": 22893124671360.0, + "grad_norm": 1.8051286362305878, + "language_loss": 0.77784109, + "learning_rate": 7.307901165066479e-07, + "loss": 0.79931021, + "num_input_tokens_seen": 261141105, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.12133789, + "step": 12096, + "time_per_iteration": 2.439352512359619 + }, + { + "auxiliary_loss_clip": 0.01111692, + "auxiliary_loss_mlp": 0.01029742, + "balance_loss_clip": 1.03954327, + "balance_loss_mlp": 1.01778507, + "epoch": 0.727310987524425, + "flos": 11655886331520.0, + "grad_norm": 1.8813063608815772, + "language_loss": 0.72794259, + "learning_rate": 7.30489150291381e-07, + "loss": 0.74935693, + "num_input_tokens_seen": 261159255, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11956787, + "step": 12097, + "time_per_iteration": 2.4538941383361816 + }, + { + "auxiliary_loss_clip": 0.01111978, + "auxiliary_loss_mlp": 0.01031718, + "balance_loss_clip": 1.03798175, + "balance_loss_mlp": 1.01865315, + "epoch": 0.727371110777093, + "flos": 24535319592960.0, + "grad_norm": 2.1758011329692266, + "language_loss": 0.76520777, + "learning_rate": 7.301882322160935e-07, + "loss": 0.7866447, + "num_input_tokens_seen": 261177960, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.13067627, + "step": 12098, + "time_per_iteration": 4.0529608726501465 + }, + { + "auxiliary_loss_clip": 0.01117574, + "auxiliary_loss_mlp": 0.01033395, + "balance_loss_clip": 1.0402987, + "balance_loss_mlp": 1.02026975, + "epoch": 0.7274312340297611, + "flos": 74739835405440.0, + "grad_norm": 2.270519442620609, + "language_loss": 0.67734581, + "learning_rate": 7.298873622921952e-07, + "loss": 0.69885546, + "num_input_tokens_seen": 261205660, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.13116455, + "step": 12099, + "time_per_iteration": 2.8175363540649414 + }, + { + "auxiliary_loss_clip": 0.01115226, + "auxiliary_loss_mlp": 0.01032179, + "balance_loss_clip": 1.03743589, + "balance_loss_mlp": 1.0181365, + "epoch": 0.727491357282429, + "flos": 22342865247360.0, + "grad_norm": 1.588073857890023, + "language_loss": 0.72390139, + "learning_rate": 7.29586540531095e-07, + "loss": 0.74537545, + "num_input_tokens_seen": 261225185, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.14031982, + "step": 12100, + "time_per_iteration": 2.4528086185455322 + }, + { + "auxiliary_loss_clip": 0.01117944, + "auxiliary_loss_mlp": 0.0103573, + "balance_loss_clip": 1.04437804, + "balance_loss_mlp": 1.02418423, + "epoch": 0.727551480535097, + "flos": 23297550877440.0, + "grad_norm": 2.5598466342392485, + "language_loss": 0.74851775, + "learning_rate": 7.292857669442005e-07, + "loss": 0.77005446, + "num_input_tokens_seen": 261247965, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11547852, + "step": 12101, + "time_per_iteration": 2.479656934738159 + }, + { + "auxiliary_loss_clip": 0.01120899, + "auxiliary_loss_mlp": 0.01028773, + "balance_loss_clip": 1.04863715, + "balance_loss_mlp": 1.0179311, + "epoch": 0.7276116037877649, + "flos": 21470559459840.0, + "grad_norm": 2.000592306682282, + "language_loss": 0.82742018, + "learning_rate": 7.289850415429177e-07, + "loss": 0.84891689, + "num_input_tokens_seen": 261267585, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.10845947, + "step": 12102, + "time_per_iteration": 3.84096097946167 + }, + { + "auxiliary_loss_clip": 0.01115146, + "auxiliary_loss_mlp": 0.01028309, + "balance_loss_clip": 1.04352558, + "balance_loss_mlp": 1.01708531, + "epoch": 0.7276717270404329, + "flos": 21464059098240.0, + "grad_norm": 2.027243668577583, + "language_loss": 0.81580377, + "learning_rate": 7.286843643386495e-07, + "loss": 0.83723831, + "num_input_tokens_seen": 261285200, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11224365, + "step": 12103, + "time_per_iteration": 2.4221436977386475 + }, + { + "auxiliary_loss_clip": 0.01119702, + "auxiliary_loss_mlp": 0.01028566, + "balance_loss_clip": 1.04711032, + "balance_loss_mlp": 1.01594758, + "epoch": 0.7277318502931008, + "flos": 16837221329280.0, + "grad_norm": 1.7135338062292984, + "language_loss": 0.66348672, + "learning_rate": 7.283837353427968e-07, + "loss": 0.68496931, + "num_input_tokens_seen": 261303645, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.1262207, + "step": 12104, + "time_per_iteration": 2.423631191253662 + }, + { + "auxiliary_loss_clip": 0.01116354, + "auxiliary_loss_mlp": 0.01026356, + "balance_loss_clip": 1.04712355, + "balance_loss_mlp": 1.01465595, + "epoch": 0.7277919735457689, + "flos": 33400550476800.0, + "grad_norm": 2.280743272573897, + "language_loss": 0.66564119, + "learning_rate": 7.280831545667611e-07, + "loss": 0.68706828, + "num_input_tokens_seen": 261323265, + "router_z_loss_clip": 0.69238281, + "router_z_loss_mlp": 0.11706543, + "step": 12105, + "time_per_iteration": 2.5182511806488037 + }, + { + "auxiliary_loss_clip": 0.01114135, + "auxiliary_loss_mlp": 0.01030935, + "balance_loss_clip": 1.04189062, + "balance_loss_mlp": 1.01888847, + "epoch": 0.7278520967984368, + "flos": 19206499351680.0, + "grad_norm": 2.1150769990410447, + "language_loss": 0.75835443, + "learning_rate": 7.27782622021939e-07, + "loss": 0.77980512, + "num_input_tokens_seen": 261339745, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.12042236, + "step": 12106, + "time_per_iteration": 2.4401023387908936 + }, + { + "auxiliary_loss_clip": 0.01116257, + "auxiliary_loss_mlp": 0.01031544, + "balance_loss_clip": 1.04200864, + "balance_loss_mlp": 1.01675558, + "epoch": 0.7279122200511048, + "flos": 34094667870720.0, + "grad_norm": 4.628264127063718, + "language_loss": 0.70327449, + "learning_rate": 7.274821377197273e-07, + "loss": 0.72475255, + "num_input_tokens_seen": 261359310, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.14794922, + "step": 12107, + "time_per_iteration": 2.55493426322937 + }, + { + "auxiliary_loss_clip": 0.0110845, + "auxiliary_loss_mlp": 0.01028945, + "balance_loss_clip": 1.03785157, + "balance_loss_mlp": 1.01713717, + "epoch": 0.7279723433037727, + "flos": 54599049348480.0, + "grad_norm": 1.4862650654749134, + "language_loss": 0.75288367, + "learning_rate": 7.271817016715205e-07, + "loss": 0.7742576, + "num_input_tokens_seen": 261384640, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.11810303, + "step": 12108, + "time_per_iteration": 4.301839828491211 + }, + { + "auxiliary_loss_clip": 0.01113994, + "auxiliary_loss_mlp": 0.01029799, + "balance_loss_clip": 1.04177308, + "balance_loss_mlp": 1.01752043, + "epoch": 0.7280324665564407, + "flos": 36137482156800.0, + "grad_norm": 1.976188423254504, + "language_loss": 0.66800499, + "learning_rate": 7.268813138887124e-07, + "loss": 0.68944293, + "num_input_tokens_seen": 261405290, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.12280273, + "step": 12109, + "time_per_iteration": 2.622417449951172 + }, + { + "auxiliary_loss_clip": 0.01111987, + "auxiliary_loss_mlp": 0.01036215, + "balance_loss_clip": 1.0392729, + "balance_loss_mlp": 1.02208233, + "epoch": 0.7280925898091086, + "flos": 11618539165440.0, + "grad_norm": 1.9928223072856268, + "language_loss": 0.63454515, + "learning_rate": 7.265809743826912e-07, + "loss": 0.6560272, + "num_input_tokens_seen": 261419710, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.14135742, + "step": 12110, + "time_per_iteration": 2.4096579551696777 + }, + { + "auxiliary_loss_clip": 0.01118195, + "auxiliary_loss_mlp": 0.01025327, + "balance_loss_clip": 1.04272449, + "balance_loss_mlp": 1.01252413, + "epoch": 0.7281527130617766, + "flos": 34277094069120.0, + "grad_norm": 1.7862018139782478, + "language_loss": 0.58089554, + "learning_rate": 7.26280683164847e-07, + "loss": 0.6023308, + "num_input_tokens_seen": 261442385, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.12805176, + "step": 12111, + "time_per_iteration": 2.5499024391174316 + }, + { + "auxiliary_loss_clip": 0.01118023, + "auxiliary_loss_mlp": 0.01029318, + "balance_loss_clip": 1.04450858, + "balance_loss_mlp": 1.01628876, + "epoch": 0.7282128363144446, + "flos": 13918043018880.0, + "grad_norm": 2.360745362378547, + "language_loss": 0.7391957, + "learning_rate": 7.259804402465677e-07, + "loss": 0.76066911, + "num_input_tokens_seen": 261459805, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.13037109, + "step": 12112, + "time_per_iteration": 2.4088222980499268 + }, + { + "auxiliary_loss_clip": 0.01110027, + "auxiliary_loss_mlp": 0.01031484, + "balance_loss_clip": 1.03994429, + "balance_loss_mlp": 1.01899111, + "epoch": 0.7282729595671126, + "flos": 20777627214720.0, + "grad_norm": 3.4734456091308696, + "language_loss": 0.66590655, + "learning_rate": 7.25680245639237e-07, + "loss": 0.68732166, + "num_input_tokens_seen": 261477175, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.12487793, + "step": 12113, + "time_per_iteration": 2.462521553039551 + }, + { + "auxiliary_loss_clip": 0.01119699, + "auxiliary_loss_mlp": 0.01028664, + "balance_loss_clip": 1.04580426, + "balance_loss_mlp": 1.01609898, + "epoch": 0.7283330828197806, + "flos": 16325422392960.0, + "grad_norm": 1.8019261568112774, + "language_loss": 0.73147678, + "learning_rate": 7.253800993542399e-07, + "loss": 0.75296044, + "num_input_tokens_seen": 261494990, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.12567139, + "step": 12114, + "time_per_iteration": 2.43390154838562 + }, + { + "auxiliary_loss_clip": 0.01112228, + "auxiliary_loss_mlp": 0.01029015, + "balance_loss_clip": 1.03960681, + "balance_loss_mlp": 1.01659894, + "epoch": 0.7283932060724485, + "flos": 27490193043840.0, + "grad_norm": 2.107265021383691, + "language_loss": 0.67795968, + "learning_rate": 7.250800014029564e-07, + "loss": 0.69937205, + "num_input_tokens_seen": 261514445, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.12426758, + "step": 12115, + "time_per_iteration": 2.49519419670105 + }, + { + "auxiliary_loss_clip": 0.01128924, + "auxiliary_loss_mlp": 0.01028332, + "balance_loss_clip": 1.05131698, + "balance_loss_mlp": 1.01593971, + "epoch": 0.7284533293251165, + "flos": 18367877543040.0, + "grad_norm": 3.040364871865975, + "language_loss": 0.60004091, + "learning_rate": 7.247799517967674e-07, + "loss": 0.62161344, + "num_input_tokens_seen": 261533565, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.12390137, + "step": 12116, + "time_per_iteration": 2.4409127235412598 + }, + { + "auxiliary_loss_clip": 0.01117031, + "auxiliary_loss_mlp": 0.01028843, + "balance_loss_clip": 1.04547131, + "balance_loss_mlp": 1.0173279, + "epoch": 0.7285134525777844, + "flos": 21725525174400.0, + "grad_norm": 2.3844508971137675, + "language_loss": 0.7331785, + "learning_rate": 7.2447995054705e-07, + "loss": 0.75463724, + "num_input_tokens_seen": 261553795, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11523438, + "step": 12117, + "time_per_iteration": 2.447627067565918 + }, + { + "auxiliary_loss_clip": 0.0111723, + "auxiliary_loss_mlp": 0.01028386, + "balance_loss_clip": 1.04553616, + "balance_loss_mlp": 1.01669121, + "epoch": 0.7285735758304525, + "flos": 20741357456640.0, + "grad_norm": 1.8663641464927583, + "language_loss": 0.6957711, + "learning_rate": 7.241799976651807e-07, + "loss": 0.71722722, + "num_input_tokens_seen": 261572565, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11688232, + "step": 12118, + "time_per_iteration": 2.4227378368377686 + }, + { + "auxiliary_loss_clip": 0.01112016, + "auxiliary_loss_mlp": 0.01035093, + "balance_loss_clip": 1.04329598, + "balance_loss_mlp": 1.02404213, + "epoch": 0.7286336990831204, + "flos": 17310954827520.0, + "grad_norm": 1.808679603772988, + "language_loss": 0.84253776, + "learning_rate": 7.238800931625346e-07, + "loss": 0.8640089, + "num_input_tokens_seen": 261590910, + "router_z_loss_clip": 0.68554688, + "router_z_loss_mlp": 0.1105957, + "step": 12119, + "time_per_iteration": 2.424603223800659 + }, + { + "auxiliary_loss_clip": 0.01123218, + "auxiliary_loss_mlp": 0.01029816, + "balance_loss_clip": 1.05047512, + "balance_loss_mlp": 1.01808023, + "epoch": 0.7286938223357884, + "flos": 19787390098560.0, + "grad_norm": 2.0010768146475555, + "language_loss": 0.81581122, + "learning_rate": 7.235802370504831e-07, + "loss": 0.83734155, + "num_input_tokens_seen": 261606005, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11730957, + "step": 12120, + "time_per_iteration": 2.38277530670166 + }, + { + "auxiliary_loss_clip": 0.01115962, + "auxiliary_loss_mlp": 0.01035498, + "balance_loss_clip": 1.04290843, + "balance_loss_mlp": 1.02273679, + "epoch": 0.7287539455884563, + "flos": 15340859625600.0, + "grad_norm": 2.091416893920687, + "language_loss": 0.78437662, + "learning_rate": 7.232804293403963e-07, + "loss": 0.80589122, + "num_input_tokens_seen": 261622305, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.12768555, + "step": 12121, + "time_per_iteration": 2.4056971073150635 + }, + { + "auxiliary_loss_clip": 0.01115354, + "auxiliary_loss_mlp": 0.0103735, + "balance_loss_clip": 1.03977919, + "balance_loss_mlp": 1.0234499, + "epoch": 0.7288140688411243, + "flos": 25192484870400.0, + "grad_norm": 1.723343574918168, + "language_loss": 0.68847096, + "learning_rate": 7.229806700436441e-07, + "loss": 0.70999795, + "num_input_tokens_seen": 261642465, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.13909912, + "step": 12122, + "time_per_iteration": 2.5532186031341553 + }, + { + "auxiliary_loss_clip": 0.01116241, + "auxiliary_loss_mlp": 0.01033455, + "balance_loss_clip": 1.04286671, + "balance_loss_mlp": 1.02211189, + "epoch": 0.7288741920937922, + "flos": 23984162328960.0, + "grad_norm": 1.9173849232717777, + "language_loss": 0.871975, + "learning_rate": 7.226809591715923e-07, + "loss": 0.89347196, + "num_input_tokens_seen": 261661420, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11340332, + "step": 12123, + "time_per_iteration": 2.4604873657226562 + }, + { + "auxiliary_loss_clip": 0.01114795, + "auxiliary_loss_mlp": 0.01028321, + "balance_loss_clip": 1.04246962, + "balance_loss_mlp": 1.01644206, + "epoch": 0.7289343153464602, + "flos": 22744921155840.0, + "grad_norm": 2.740841128240617, + "language_loss": 0.82759047, + "learning_rate": 7.223812967356065e-07, + "loss": 0.84902161, + "num_input_tokens_seen": 261680865, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11871338, + "step": 12124, + "time_per_iteration": 3.847109079360962 + }, + { + "auxiliary_loss_clip": 0.01124184, + "auxiliary_loss_mlp": 0.01027523, + "balance_loss_clip": 1.05199862, + "balance_loss_mlp": 1.01575089, + "epoch": 0.7289944385991282, + "flos": 24900028335360.0, + "grad_norm": 1.893401132986313, + "language_loss": 0.67083621, + "learning_rate": 7.220816827470499e-07, + "loss": 0.69235331, + "num_input_tokens_seen": 261701455, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11779785, + "step": 12125, + "time_per_iteration": 2.497976541519165 + }, + { + "auxiliary_loss_clip": 0.01122009, + "auxiliary_loss_mlp": 0.01034862, + "balance_loss_clip": 1.04644799, + "balance_loss_mlp": 1.02162337, + "epoch": 0.7290545618517962, + "flos": 22967064817920.0, + "grad_norm": 1.7900369806571637, + "language_loss": 0.75110275, + "learning_rate": 7.217821172172855e-07, + "loss": 0.77267146, + "num_input_tokens_seen": 261721260, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.13226318, + "step": 12126, + "time_per_iteration": 2.477257251739502 + }, + { + "auxiliary_loss_clip": 0.01073154, + "auxiliary_loss_mlp": 0.01005435, + "balance_loss_clip": 1.04841232, + "balance_loss_mlp": 1.00388646, + "epoch": 0.7291146851044642, + "flos": 61901523216000.0, + "grad_norm": 0.819466288753483, + "language_loss": 0.58583379, + "learning_rate": 7.2148260015767e-07, + "loss": 0.60661972, + "num_input_tokens_seen": 261779370, + "router_z_loss_clip": 0.24731445, + "router_z_loss_mlp": 0.01548767, + "step": 12127, + "time_per_iteration": 2.9971232414245605 + }, + { + "auxiliary_loss_clip": 0.01109453, + "auxiliary_loss_mlp": 0.01031427, + "balance_loss_clip": 1.03907084, + "balance_loss_mlp": 1.01920235, + "epoch": 0.7291748083571321, + "flos": 23330947547520.0, + "grad_norm": 2.0046231341889404, + "language_loss": 0.68342519, + "learning_rate": 7.21183131579562e-07, + "loss": 0.70483404, + "num_input_tokens_seen": 261798050, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.12225342, + "step": 12128, + "time_per_iteration": 2.4730217456817627 + }, + { + "auxiliary_loss_clip": 0.01112289, + "auxiliary_loss_mlp": 0.01028771, + "balance_loss_clip": 1.03826785, + "balance_loss_mlp": 1.01592588, + "epoch": 0.7292349316098001, + "flos": 28330000001280.0, + "grad_norm": 2.157217117142977, + "language_loss": 0.65635395, + "learning_rate": 7.20883711494319e-07, + "loss": 0.67776453, + "num_input_tokens_seen": 261817660, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.128479, + "step": 12129, + "time_per_iteration": 2.5100040435791016 + }, + { + "auxiliary_loss_clip": 0.01116664, + "auxiliary_loss_mlp": 0.01025392, + "balance_loss_clip": 1.04482698, + "balance_loss_mlp": 1.01319647, + "epoch": 0.729295054862468, + "flos": 24132222190080.0, + "grad_norm": 2.634105557008407, + "language_loss": 0.74333966, + "learning_rate": 7.205843399132927e-07, + "loss": 0.76476026, + "num_input_tokens_seen": 261837935, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.12194824, + "step": 12130, + "time_per_iteration": 2.508517265319824 + }, + { + "auxiliary_loss_clip": 0.01118733, + "auxiliary_loss_mlp": 0.01029658, + "balance_loss_clip": 1.0466826, + "balance_loss_mlp": 1.01783848, + "epoch": 0.7293551781151361, + "flos": 22816239609600.0, + "grad_norm": 1.6767997690185836, + "language_loss": 0.70008326, + "learning_rate": 7.202850168478374e-07, + "loss": 0.72156715, + "num_input_tokens_seen": 261857575, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11828613, + "step": 12131, + "time_per_iteration": 2.4918105602264404 + }, + { + "auxiliary_loss_clip": 0.01112268, + "auxiliary_loss_mlp": 0.01032717, + "balance_loss_clip": 1.04116154, + "balance_loss_mlp": 1.02140367, + "epoch": 0.729415301367804, + "flos": 22126683242880.0, + "grad_norm": 2.6931708008856963, + "language_loss": 0.77462173, + "learning_rate": 7.199857423093025e-07, + "loss": 0.79607153, + "num_input_tokens_seen": 261877265, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.11322021, + "step": 12132, + "time_per_iteration": 2.478551149368286 + }, + { + "auxiliary_loss_clip": 0.01115585, + "auxiliary_loss_mlp": 0.01035208, + "balance_loss_clip": 1.04216301, + "balance_loss_mlp": 1.02372253, + "epoch": 0.729475424620472, + "flos": 12349608675840.0, + "grad_norm": 2.425701925353356, + "language_loss": 0.78676832, + "learning_rate": 7.196865163090358e-07, + "loss": 0.8082763, + "num_input_tokens_seen": 261893695, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11486816, + "step": 12133, + "time_per_iteration": 2.4252991676330566 + }, + { + "auxiliary_loss_clip": 0.01111737, + "auxiliary_loss_mlp": 0.01024294, + "balance_loss_clip": 1.04031062, + "balance_loss_mlp": 1.01273084, + "epoch": 0.7295355478731399, + "flos": 22195308176640.0, + "grad_norm": 2.0986162028137474, + "language_loss": 0.72179723, + "learning_rate": 7.193873388583846e-07, + "loss": 0.74315751, + "num_input_tokens_seen": 261911825, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11560059, + "step": 12134, + "time_per_iteration": 2.5088682174682617 + }, + { + "auxiliary_loss_clip": 0.01123578, + "auxiliary_loss_mlp": 0.01039136, + "balance_loss_clip": 1.04800606, + "balance_loss_mlp": 1.02616048, + "epoch": 0.7295956711258079, + "flos": 23222030532480.0, + "grad_norm": 1.8415577456454129, + "language_loss": 0.71136534, + "learning_rate": 7.190882099686939e-07, + "loss": 0.73299253, + "num_input_tokens_seen": 261931190, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12976074, + "step": 12135, + "time_per_iteration": 2.4490952491760254 + }, + { + "auxiliary_loss_clip": 0.01113894, + "auxiliary_loss_mlp": 0.01045338, + "balance_loss_clip": 1.0397203, + "balance_loss_mlp": 1.03100348, + "epoch": 0.7296557943784758, + "flos": 31869104163840.0, + "grad_norm": 2.690067448534329, + "language_loss": 0.62375098, + "learning_rate": 7.187891296513075e-07, + "loss": 0.6453433, + "num_input_tokens_seen": 261951240, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.14337158, + "step": 12136, + "time_per_iteration": 2.5283591747283936 + }, + { + "auxiliary_loss_clip": 0.0111185, + "auxiliary_loss_mlp": 0.01037802, + "balance_loss_clip": 1.04092574, + "balance_loss_mlp": 1.02521968, + "epoch": 0.7297159176311439, + "flos": 26651714889600.0, + "grad_norm": 1.9811513262225866, + "language_loss": 0.74872899, + "learning_rate": 7.184900979175654e-07, + "loss": 0.77022547, + "num_input_tokens_seen": 261971605, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.12579346, + "step": 12137, + "time_per_iteration": 2.475700616836548 + }, + { + "auxiliary_loss_clip": 0.01119285, + "auxiliary_loss_mlp": 0.01034563, + "balance_loss_clip": 1.04486728, + "balance_loss_mlp": 1.02267826, + "epoch": 0.7297760408838118, + "flos": 24749562263040.0, + "grad_norm": 1.5489448452868846, + "language_loss": 0.74142718, + "learning_rate": 7.181911147788069e-07, + "loss": 0.76296562, + "num_input_tokens_seen": 261990830, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11889648, + "step": 12138, + "time_per_iteration": 2.5420572757720947 + }, + { + "auxiliary_loss_clip": 0.01124318, + "auxiliary_loss_mlp": 0.01028138, + "balance_loss_clip": 1.04935396, + "balance_loss_mlp": 1.01679564, + "epoch": 0.7298361641364798, + "flos": 18073768982400.0, + "grad_norm": 2.4434306178332412, + "language_loss": 0.72455627, + "learning_rate": 7.178921802463702e-07, + "loss": 0.74608088, + "num_input_tokens_seen": 262008190, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11340332, + "step": 12139, + "time_per_iteration": 2.4806809425354004 + }, + { + "auxiliary_loss_clip": 0.0111113, + "auxiliary_loss_mlp": 0.01026453, + "balance_loss_clip": 1.04305375, + "balance_loss_mlp": 1.01536059, + "epoch": 0.7298962873891478, + "flos": 29895597169920.0, + "grad_norm": 1.8829204808873798, + "language_loss": 0.73798501, + "learning_rate": 7.175932943315898e-07, + "loss": 0.75936079, + "num_input_tokens_seen": 262030460, + "router_z_loss_clip": 0.68017578, + "router_z_loss_mlp": 0.11090088, + "step": 12140, + "time_per_iteration": 2.5176358222961426 + }, + { + "auxiliary_loss_clip": 0.01111395, + "auxiliary_loss_mlp": 0.01031637, + "balance_loss_clip": 1.03986645, + "balance_loss_mlp": 1.01932263, + "epoch": 0.7299564106418157, + "flos": 32266096254720.0, + "grad_norm": 1.7209620902468452, + "language_loss": 0.55754948, + "learning_rate": 7.172944570458003e-07, + "loss": 0.57897973, + "num_input_tokens_seen": 262050830, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.12310791, + "step": 12141, + "time_per_iteration": 2.530273914337158 + }, + { + "auxiliary_loss_clip": 0.01111398, + "auxiliary_loss_mlp": 0.01026724, + "balance_loss_clip": 1.04126287, + "balance_loss_mlp": 1.01521993, + "epoch": 0.7300165338944837, + "flos": 22930292269440.0, + "grad_norm": 1.6133297512307874, + "language_loss": 0.72745258, + "learning_rate": 7.169956684003342e-07, + "loss": 0.74883378, + "num_input_tokens_seen": 262071245, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.11499023, + "step": 12142, + "time_per_iteration": 3.898263454437256 + }, + { + "auxiliary_loss_clip": 0.01113959, + "auxiliary_loss_mlp": 0.0103478, + "balance_loss_clip": 1.04275119, + "balance_loss_mlp": 1.02340722, + "epoch": 0.7300766571471516, + "flos": 19828795501440.0, + "grad_norm": 1.9016321712264046, + "language_loss": 0.73732829, + "learning_rate": 7.16696928406521e-07, + "loss": 0.75881565, + "num_input_tokens_seen": 262087525, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.11364746, + "step": 12143, + "time_per_iteration": 2.419071912765503 + }, + { + "auxiliary_loss_clip": 0.0111254, + "auxiliary_loss_mlp": 0.01035796, + "balance_loss_clip": 1.0403986, + "balance_loss_mlp": 1.02262366, + "epoch": 0.7301367803998197, + "flos": 24347829576960.0, + "grad_norm": 1.982492610795792, + "language_loss": 0.66201401, + "learning_rate": 7.163982370756882e-07, + "loss": 0.68349737, + "num_input_tokens_seen": 262107355, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.13165283, + "step": 12144, + "time_per_iteration": 2.443094253540039 + }, + { + "auxiliary_loss_clip": 0.01118818, + "auxiliary_loss_mlp": 0.01034242, + "balance_loss_clip": 1.04401875, + "balance_loss_mlp": 1.02142072, + "epoch": 0.7301969036524876, + "flos": 15304518040320.0, + "grad_norm": 1.5863689343755516, + "language_loss": 0.78885221, + "learning_rate": 7.160995944191627e-07, + "loss": 0.81038284, + "num_input_tokens_seen": 262125645, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.12835693, + "step": 12145, + "time_per_iteration": 2.391606569290161 + }, + { + "auxiliary_loss_clip": 0.01112771, + "auxiliary_loss_mlp": 0.01032771, + "balance_loss_clip": 1.04134178, + "balance_loss_mlp": 1.02023005, + "epoch": 0.7302570269051556, + "flos": 23507268433920.0, + "grad_norm": 1.7116062618136128, + "language_loss": 0.91740721, + "learning_rate": 7.158010004482702e-07, + "loss": 0.93886256, + "num_input_tokens_seen": 262144075, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.12554932, + "step": 12146, + "time_per_iteration": 3.8664374351501465 + }, + { + "auxiliary_loss_clip": 0.01113328, + "auxiliary_loss_mlp": 0.01027103, + "balance_loss_clip": 1.04322159, + "balance_loss_mlp": 1.01547992, + "epoch": 0.7303171501578235, + "flos": 20523056549760.0, + "grad_norm": 1.843833197230541, + "language_loss": 0.6193161, + "learning_rate": 7.155024551743316e-07, + "loss": 0.64072043, + "num_input_tokens_seen": 262165940, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.11621094, + "step": 12147, + "time_per_iteration": 2.4624998569488525 + }, + { + "auxiliary_loss_clip": 0.01126268, + "auxiliary_loss_mlp": 0.01035161, + "balance_loss_clip": 1.04887938, + "balance_loss_mlp": 1.02247715, + "epoch": 0.7303772734104915, + "flos": 18332613365760.0, + "grad_norm": 2.0943449004201113, + "language_loss": 0.75453281, + "learning_rate": 7.152039586086693e-07, + "loss": 0.77614707, + "num_input_tokens_seen": 262184520, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12683105, + "step": 12148, + "time_per_iteration": 2.409999132156372 + }, + { + "auxiliary_loss_clip": 0.01045572, + "auxiliary_loss_mlp": 0.0100114, + "balance_loss_clip": 1.02092969, + "balance_loss_mlp": 0.99969637, + "epoch": 0.7304373966631594, + "flos": 60654776100480.0, + "grad_norm": 1.5817663786868035, + "language_loss": 0.56784666, + "learning_rate": 7.149055107626017e-07, + "loss": 0.58831382, + "num_input_tokens_seen": 262247070, + "router_z_loss_clip": 0.24633789, + "router_z_loss_mlp": 0.01445007, + "step": 12149, + "time_per_iteration": 3.060979127883911 + }, + { + "auxiliary_loss_clip": 0.01118052, + "auxiliary_loss_mlp": 0.01032177, + "balance_loss_clip": 1.04418373, + "balance_loss_mlp": 1.01993418, + "epoch": 0.7304975199158275, + "flos": 19828077229440.0, + "grad_norm": 2.0841963044391862, + "language_loss": 0.73687017, + "learning_rate": 7.146071116474451e-07, + "loss": 0.75837243, + "num_input_tokens_seen": 262266605, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.12255859, + "step": 12150, + "time_per_iteration": 2.433387517929077 + }, + { + "auxiliary_loss_clip": 0.01117153, + "auxiliary_loss_mlp": 0.01029277, + "balance_loss_clip": 1.04409504, + "balance_loss_mlp": 1.01670671, + "epoch": 0.7305576431684954, + "flos": 13223997452160.0, + "grad_norm": 9.889395641421753, + "language_loss": 0.8391059, + "learning_rate": 7.143087612745158e-07, + "loss": 0.86057019, + "num_input_tokens_seen": 262283880, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.12567139, + "step": 12151, + "time_per_iteration": 2.4290668964385986 + }, + { + "auxiliary_loss_clip": 0.0112977, + "auxiliary_loss_mlp": 0.01035547, + "balance_loss_clip": 1.05483556, + "balance_loss_mlp": 1.02307737, + "epoch": 0.7306177664211634, + "flos": 24060472773120.0, + "grad_norm": 2.9048197883982905, + "language_loss": 0.77537352, + "learning_rate": 7.14010459655127e-07, + "loss": 0.79702669, + "num_input_tokens_seen": 262304155, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12457275, + "step": 12152, + "time_per_iteration": 3.944225549697876 + }, + { + "auxiliary_loss_clip": 0.01116256, + "auxiliary_loss_mlp": 0.01028257, + "balance_loss_clip": 1.0441488, + "balance_loss_mlp": 1.01655066, + "epoch": 0.7306778896738314, + "flos": 27089106802560.0, + "grad_norm": 1.585668025325582, + "language_loss": 0.79478985, + "learning_rate": 7.137122068005919e-07, + "loss": 0.81623507, + "num_input_tokens_seen": 262325660, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11694336, + "step": 12153, + "time_per_iteration": 2.5078914165496826 + }, + { + "auxiliary_loss_clip": 0.0112291, + "auxiliary_loss_mlp": 0.01033491, + "balance_loss_clip": 1.04802346, + "balance_loss_mlp": 1.02153385, + "epoch": 0.7307380129264993, + "flos": 16690669839360.0, + "grad_norm": 1.710039171004026, + "language_loss": 0.67582089, + "learning_rate": 7.134140027222173e-07, + "loss": 0.69738483, + "num_input_tokens_seen": 262344075, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11950684, + "step": 12154, + "time_per_iteration": 2.413239002227783 + }, + { + "auxiliary_loss_clip": 0.01117681, + "auxiliary_loss_mlp": 0.01031405, + "balance_loss_clip": 1.04511905, + "balance_loss_mlp": 1.01898885, + "epoch": 0.7307981361791673, + "flos": 21725740656000.0, + "grad_norm": 2.1199332647934055, + "language_loss": 0.66082841, + "learning_rate": 7.131158474313128e-07, + "loss": 0.68231928, + "num_input_tokens_seen": 262363305, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.12420654, + "step": 12155, + "time_per_iteration": 2.4690542221069336 + }, + { + "auxiliary_loss_clip": 0.01103648, + "auxiliary_loss_mlp": 0.01029424, + "balance_loss_clip": 1.0337646, + "balance_loss_mlp": 1.01766992, + "epoch": 0.7308582594318352, + "flos": 18040659621120.0, + "grad_norm": 1.6267325918480782, + "language_loss": 0.81670839, + "learning_rate": 7.128177409391851e-07, + "loss": 0.8380391, + "num_input_tokens_seen": 262380730, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.11767578, + "step": 12156, + "time_per_iteration": 2.4275310039520264 + }, + { + "auxiliary_loss_clip": 0.01110388, + "auxiliary_loss_mlp": 0.01030326, + "balance_loss_clip": 1.04140544, + "balance_loss_mlp": 1.01947784, + "epoch": 0.7309183826845033, + "flos": 13844964798720.0, + "grad_norm": 2.6924770783506697, + "language_loss": 0.75526774, + "learning_rate": 7.125196832571367e-07, + "loss": 0.77667487, + "num_input_tokens_seen": 262395480, + "router_z_loss_clip": 0.68994141, + "router_z_loss_mlp": 0.10839844, + "step": 12157, + "time_per_iteration": 2.4405934810638428 + }, + { + "auxiliary_loss_clip": 0.01106039, + "auxiliary_loss_mlp": 0.01027824, + "balance_loss_clip": 1.03763771, + "balance_loss_mlp": 1.01764917, + "epoch": 0.7309785059371712, + "flos": 17019216564480.0, + "grad_norm": 2.758435543874312, + "language_loss": 0.73455733, + "learning_rate": 7.122216743964713e-07, + "loss": 0.75589597, + "num_input_tokens_seen": 262413340, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 0.10174561, + "step": 12158, + "time_per_iteration": 2.3946831226348877 + }, + { + "auxiliary_loss_clip": 0.01130364, + "auxiliary_loss_mlp": 0.01030649, + "balance_loss_clip": 1.05402267, + "balance_loss_mlp": 1.01938331, + "epoch": 0.7310386291898392, + "flos": 26502398052480.0, + "grad_norm": 1.9188070213535506, + "language_loss": 0.85289121, + "learning_rate": 7.119237143684896e-07, + "loss": 0.87450135, + "num_input_tokens_seen": 262433455, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.1126709, + "step": 12159, + "time_per_iteration": 2.4857664108276367 + }, + { + "auxiliary_loss_clip": 0.01114954, + "auxiliary_loss_mlp": 0.01034107, + "balance_loss_clip": 1.03982139, + "balance_loss_mlp": 1.01976061, + "epoch": 0.7310987524425071, + "flos": 16945922862720.0, + "grad_norm": 6.291854175886561, + "language_loss": 0.73510075, + "learning_rate": 7.116258031844895e-07, + "loss": 0.75659138, + "num_input_tokens_seen": 262450335, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.14349365, + "step": 12160, + "time_per_iteration": 2.397655963897705 + }, + { + "auxiliary_loss_clip": 0.01123691, + "auxiliary_loss_mlp": 0.01043215, + "balance_loss_clip": 1.0446434, + "balance_loss_mlp": 1.02932084, + "epoch": 0.7311588756951751, + "flos": 13845288021120.0, + "grad_norm": 1.8823611145353094, + "language_loss": 0.72493625, + "learning_rate": 7.113279408557675e-07, + "loss": 0.74660528, + "num_input_tokens_seen": 262468240, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.13885498, + "step": 12161, + "time_per_iteration": 2.4484009742736816 + }, + { + "auxiliary_loss_clip": 0.01123962, + "auxiliary_loss_mlp": 0.01030946, + "balance_loss_clip": 1.04520476, + "balance_loss_mlp": 1.01755917, + "epoch": 0.731218998947843, + "flos": 28767894704640.0, + "grad_norm": 2.7147468161946113, + "language_loss": 0.69802761, + "learning_rate": 7.110301273936192e-07, + "loss": 0.71957666, + "num_input_tokens_seen": 262487045, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.1338501, + "step": 12162, + "time_per_iteration": 2.4757120609283447 + }, + { + "auxiliary_loss_clip": 0.01115657, + "auxiliary_loss_mlp": 0.01029466, + "balance_loss_clip": 1.04192889, + "balance_loss_mlp": 1.01693726, + "epoch": 0.7312791222005111, + "flos": 27088783580160.0, + "grad_norm": 1.689293932601844, + "language_loss": 0.6654709, + "learning_rate": 7.107323628093382e-07, + "loss": 0.68692219, + "num_input_tokens_seen": 262504855, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.12524414, + "step": 12163, + "time_per_iteration": 2.4599554538726807 + }, + { + "auxiliary_loss_clip": 0.01111017, + "auxiliary_loss_mlp": 0.01033641, + "balance_loss_clip": 1.03895354, + "balance_loss_mlp": 1.02121973, + "epoch": 0.731339245453179, + "flos": 20924035050240.0, + "grad_norm": 1.45836975245543, + "language_loss": 0.68307954, + "learning_rate": 7.104346471142153e-07, + "loss": 0.70452607, + "num_input_tokens_seen": 262524920, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.12408447, + "step": 12164, + "time_per_iteration": 2.4279537200927734 + }, + { + "auxiliary_loss_clip": 0.01118782, + "auxiliary_loss_mlp": 0.0103665, + "balance_loss_clip": 1.0457567, + "balance_loss_mlp": 1.02500892, + "epoch": 0.731399368705847, + "flos": 23075694524160.0, + "grad_norm": 1.516671350485393, + "language_loss": 0.7254203, + "learning_rate": 7.101369803195391e-07, + "loss": 0.74697459, + "num_input_tokens_seen": 262545725, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11639404, + "step": 12165, + "time_per_iteration": 2.4923930168151855 + }, + { + "auxiliary_loss_clip": 0.01127147, + "auxiliary_loss_mlp": 0.01029832, + "balance_loss_clip": 1.05113745, + "balance_loss_mlp": 1.01758933, + "epoch": 0.731459491958515, + "flos": 23582681038080.0, + "grad_norm": 2.2969439434424017, + "language_loss": 0.76936126, + "learning_rate": 7.098393624365988e-07, + "loss": 0.79093111, + "num_input_tokens_seen": 262565480, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12237549, + "step": 12166, + "time_per_iteration": 2.4543333053588867 + }, + { + "auxiliary_loss_clip": 0.01118409, + "auxiliary_loss_mlp": 0.01031947, + "balance_loss_clip": 1.04759836, + "balance_loss_mlp": 1.02054489, + "epoch": 0.7315196152111829, + "flos": 22379278659840.0, + "grad_norm": 1.7562995709339257, + "language_loss": 0.79816711, + "learning_rate": 7.095417934766781e-07, + "loss": 0.81967062, + "num_input_tokens_seen": 262584145, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11401367, + "step": 12167, + "time_per_iteration": 3.962615489959717 + }, + { + "auxiliary_loss_clip": 0.01116689, + "auxiliary_loss_mlp": 0.01033189, + "balance_loss_clip": 1.04655361, + "balance_loss_mlp": 1.02251339, + "epoch": 0.7315797384638509, + "flos": 26177047637760.0, + "grad_norm": 2.0055100447020946, + "language_loss": 0.77110273, + "learning_rate": 7.092442734510622e-07, + "loss": 0.79260153, + "num_input_tokens_seen": 262604045, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.10668945, + "step": 12168, + "time_per_iteration": 2.4739015102386475 + }, + { + "auxiliary_loss_clip": 0.01117347, + "auxiliary_loss_mlp": 0.01035009, + "balance_loss_clip": 1.04484272, + "balance_loss_mlp": 1.02184832, + "epoch": 0.7316398617165188, + "flos": 21506326427520.0, + "grad_norm": 1.5305160443612218, + "language_loss": 0.8181271, + "learning_rate": 7.089468023710326e-07, + "loss": 0.83965069, + "num_input_tokens_seen": 262624540, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.1315918, + "step": 12169, + "time_per_iteration": 2.5029447078704834 + }, + { + "auxiliary_loss_clip": 0.01120261, + "auxiliary_loss_mlp": 0.01047444, + "balance_loss_clip": 1.04269123, + "balance_loss_mlp": 1.03398538, + "epoch": 0.7316999849691869, + "flos": 30482557315200.0, + "grad_norm": 1.7785073067818846, + "language_loss": 0.70375508, + "learning_rate": 7.08649380247871e-07, + "loss": 0.72543216, + "num_input_tokens_seen": 262644545, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.13464355, + "step": 12170, + "time_per_iteration": 2.499265193939209 + }, + { + "auxiliary_loss_clip": 0.01114014, + "auxiliary_loss_mlp": 0.01030782, + "balance_loss_clip": 1.04299855, + "balance_loss_mlp": 1.0178721, + "epoch": 0.7317601082218548, + "flos": 21543781334400.0, + "grad_norm": 2.0244130159588645, + "language_loss": 0.69837773, + "learning_rate": 7.083520070928533e-07, + "loss": 0.71982569, + "num_input_tokens_seen": 262662570, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.12890625, + "step": 12171, + "time_per_iteration": 2.4750452041625977 + }, + { + "auxiliary_loss_clip": 0.01120391, + "auxiliary_loss_mlp": 0.0103681, + "balance_loss_clip": 1.04771328, + "balance_loss_mlp": 1.02466273, + "epoch": 0.7318202314745228, + "flos": 33251592775680.0, + "grad_norm": 1.7149974141387052, + "language_loss": 0.65623063, + "learning_rate": 7.080546829172564e-07, + "loss": 0.67780268, + "num_input_tokens_seen": 262683245, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.12145996, + "step": 12172, + "time_per_iteration": 2.586127758026123 + }, + { + "auxiliary_loss_clip": 0.01124144, + "auxiliary_loss_mlp": 0.01029337, + "balance_loss_clip": 1.04837942, + "balance_loss_mlp": 1.01696861, + "epoch": 0.7318803547271907, + "flos": 20157054917760.0, + "grad_norm": 2.241561554867889, + "language_loss": 0.6114428, + "learning_rate": 7.077574077323564e-07, + "loss": 0.6329776, + "num_input_tokens_seen": 262701585, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12359619, + "step": 12173, + "time_per_iteration": 2.487710475921631 + }, + { + "auxiliary_loss_clip": 0.01123932, + "auxiliary_loss_mlp": 0.0102814, + "balance_loss_clip": 1.05034769, + "balance_loss_mlp": 1.0162909, + "epoch": 0.7319404779798587, + "flos": 20558536208640.0, + "grad_norm": 3.100560882045325, + "language_loss": 0.73856741, + "learning_rate": 7.074601815494243e-07, + "loss": 0.76008809, + "num_input_tokens_seen": 262719295, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11834717, + "step": 12174, + "time_per_iteration": 2.465183973312378 + }, + { + "auxiliary_loss_clip": 0.0111266, + "auxiliary_loss_mlp": 0.01023916, + "balance_loss_clip": 1.04250956, + "balance_loss_mlp": 1.01280522, + "epoch": 0.7320006012325266, + "flos": 28695391102080.0, + "grad_norm": 1.653231963820771, + "language_loss": 0.80812758, + "learning_rate": 7.071630043797317e-07, + "loss": 0.82949334, + "num_input_tokens_seen": 262739995, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.11108398, + "step": 12175, + "time_per_iteration": 2.5219759941101074 + }, + { + "auxiliary_loss_clip": 0.01113797, + "auxiliary_loss_mlp": 0.01027564, + "balance_loss_clip": 1.04137635, + "balance_loss_mlp": 1.01607251, + "epoch": 0.7320607244851947, + "flos": 16362697731840.0, + "grad_norm": 2.1866688545114688, + "language_loss": 0.76482666, + "learning_rate": 7.068658762345488e-07, + "loss": 0.78624022, + "num_input_tokens_seen": 262757680, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11499023, + "step": 12176, + "time_per_iteration": 2.512617826461792 + }, + { + "auxiliary_loss_clip": 0.01118572, + "auxiliary_loss_mlp": 0.01033865, + "balance_loss_clip": 1.04851496, + "balance_loss_mlp": 1.02212334, + "epoch": 0.7321208477378626, + "flos": 20955097336320.0, + "grad_norm": 5.341862920148035, + "language_loss": 0.76495332, + "learning_rate": 7.065687971251399e-07, + "loss": 0.78647768, + "num_input_tokens_seen": 262776990, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.11737061, + "step": 12177, + "time_per_iteration": 2.5565993785858154 + }, + { + "auxiliary_loss_clip": 0.01123417, + "auxiliary_loss_mlp": 0.01033763, + "balance_loss_clip": 1.05037999, + "balance_loss_mlp": 1.0221045, + "epoch": 0.7321809709905306, + "flos": 13845072539520.0, + "grad_norm": 2.139110008667949, + "language_loss": 0.74939609, + "learning_rate": 7.06271767062772e-07, + "loss": 0.7709679, + "num_input_tokens_seen": 262795440, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11657715, + "step": 12178, + "time_per_iteration": 2.4948391914367676 + }, + { + "auxiliary_loss_clip": 0.01119511, + "auxiliary_loss_mlp": 0.01032851, + "balance_loss_clip": 1.04454899, + "balance_loss_mlp": 1.02122808, + "epoch": 0.7322410942431986, + "flos": 26979938392320.0, + "grad_norm": 2.2414920023084313, + "language_loss": 0.82366908, + "learning_rate": 7.059747860587084e-07, + "loss": 0.84519273, + "num_input_tokens_seen": 262816385, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11621094, + "step": 12179, + "time_per_iteration": 2.5664947032928467 + }, + { + "auxiliary_loss_clip": 0.01116195, + "auxiliary_loss_mlp": 0.01027102, + "balance_loss_clip": 1.04891002, + "balance_loss_mlp": 1.0164628, + "epoch": 0.7323012174958665, + "flos": 17639717034240.0, + "grad_norm": 1.9172015536386415, + "language_loss": 0.74778032, + "learning_rate": 7.056778541242115e-07, + "loss": 0.76921332, + "num_input_tokens_seen": 262834955, + "router_z_loss_clip": 0.67285156, + "router_z_loss_mlp": 0.10638428, + "step": 12180, + "time_per_iteration": 2.4527690410614014 + }, + { + "auxiliary_loss_clip": 0.01116205, + "auxiliary_loss_mlp": 0.01028259, + "balance_loss_clip": 1.03952587, + "balance_loss_mlp": 1.01558673, + "epoch": 0.7323613407485345, + "flos": 32342765834880.0, + "grad_norm": 1.9753048647181815, + "language_loss": 0.78918833, + "learning_rate": 7.053809712705396e-07, + "loss": 0.810633, + "num_input_tokens_seen": 262853555, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.12677002, + "step": 12181, + "time_per_iteration": 2.546471357345581 + }, + { + "auxiliary_loss_clip": 0.01121662, + "auxiliary_loss_mlp": 0.01029366, + "balance_loss_clip": 1.04852641, + "balance_loss_mlp": 1.01760578, + "epoch": 0.7324214640012024, + "flos": 18362777811840.0, + "grad_norm": 2.013857694037487, + "language_loss": 0.71880651, + "learning_rate": 7.050841375089506e-07, + "loss": 0.74031687, + "num_input_tokens_seen": 262870975, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11773682, + "step": 12182, + "time_per_iteration": 2.4433741569519043 + }, + { + "auxiliary_loss_clip": 0.01122025, + "auxiliary_loss_mlp": 0.01036386, + "balance_loss_clip": 1.0467577, + "balance_loss_mlp": 1.02385688, + "epoch": 0.7324815872538705, + "flos": 30812289189120.0, + "grad_norm": 1.5195071038030452, + "language_loss": 0.71345496, + "learning_rate": 7.047873528507015e-07, + "loss": 0.73503911, + "num_input_tokens_seen": 262892635, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.12512207, + "step": 12183, + "time_per_iteration": 2.551107406616211 + }, + { + "auxiliary_loss_clip": 0.01113404, + "auxiliary_loss_mlp": 0.01033261, + "balance_loss_clip": 1.04012442, + "balance_loss_mlp": 1.02030301, + "epoch": 0.7325417105065384, + "flos": 21505069451520.0, + "grad_norm": 1.8898911651717496, + "language_loss": 0.7311095, + "learning_rate": 7.04490617307045e-07, + "loss": 0.75257611, + "num_input_tokens_seen": 262910725, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.1295166, + "step": 12184, + "time_per_iteration": 2.4759626388549805 + }, + { + "auxiliary_loss_clip": 0.0104282, + "auxiliary_loss_mlp": 0.01001735, + "balance_loss_clip": 1.01799834, + "balance_loss_mlp": 1.00024176, + "epoch": 0.7326018337592064, + "flos": 67257742556160.0, + "grad_norm": 0.7530444774008973, + "language_loss": 0.65180278, + "learning_rate": 7.041939308892344e-07, + "loss": 0.6722483, + "num_input_tokens_seen": 262974150, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.0149231, + "step": 12185, + "time_per_iteration": 3.0955636501312256 + }, + { + "auxiliary_loss_clip": 0.01111048, + "auxiliary_loss_mlp": 0.01025605, + "balance_loss_clip": 1.03793502, + "balance_loss_mlp": 1.0130527, + "epoch": 0.7326619570118743, + "flos": 22857070394880.0, + "grad_norm": 1.8615952827653004, + "language_loss": 0.80404884, + "learning_rate": 7.038972936085197e-07, + "loss": 0.82541537, + "num_input_tokens_seen": 262993370, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.12542725, + "step": 12186, + "time_per_iteration": 3.9062821865081787 + }, + { + "auxiliary_loss_clip": 0.01113306, + "auxiliary_loss_mlp": 0.01029423, + "balance_loss_clip": 1.0402832, + "balance_loss_mlp": 1.01634538, + "epoch": 0.7327220802645423, + "flos": 23327499841920.0, + "grad_norm": 1.7921063234517924, + "language_loss": 0.73202598, + "learning_rate": 7.036007054761508e-07, + "loss": 0.75345331, + "num_input_tokens_seen": 263012665, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.1307373, + "step": 12187, + "time_per_iteration": 2.444363832473755 + }, + { + "auxiliary_loss_clip": 0.01113248, + "auxiliary_loss_mlp": 0.01033485, + "balance_loss_clip": 1.0398463, + "balance_loss_mlp": 1.02161133, + "epoch": 0.7327822035172102, + "flos": 23180661043200.0, + "grad_norm": 2.7523638519601192, + "language_loss": 0.88964045, + "learning_rate": 7.033041665033716e-07, + "loss": 0.91110778, + "num_input_tokens_seen": 263031475, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11865234, + "step": 12188, + "time_per_iteration": 2.4324264526367188 + }, + { + "auxiliary_loss_clip": 0.01117597, + "auxiliary_loss_mlp": 0.01039573, + "balance_loss_clip": 1.0424521, + "balance_loss_mlp": 1.02505314, + "epoch": 0.7328423267698783, + "flos": 21066600130560.0, + "grad_norm": 1.8884050821823022, + "language_loss": 0.74521238, + "learning_rate": 7.030076767014284e-07, + "loss": 0.76678407, + "num_input_tokens_seen": 263051445, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.14538574, + "step": 12189, + "time_per_iteration": 2.4285647869110107 + }, + { + "auxiliary_loss_clip": 0.01114478, + "auxiliary_loss_mlp": 0.01026019, + "balance_loss_clip": 1.04056025, + "balance_loss_mlp": 1.0134182, + "epoch": 0.7329024500225462, + "flos": 21689578638720.0, + "grad_norm": 1.5926377456802046, + "language_loss": 0.82490653, + "learning_rate": 7.027112360815648e-07, + "loss": 0.84631157, + "num_input_tokens_seen": 263070835, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.12591553, + "step": 12190, + "time_per_iteration": 3.819413423538208 + }, + { + "auxiliary_loss_clip": 0.01117999, + "auxiliary_loss_mlp": 0.0103632, + "balance_loss_clip": 1.04207373, + "balance_loss_mlp": 1.02297425, + "epoch": 0.7329625732752142, + "flos": 24164038661760.0, + "grad_norm": 1.934522933536527, + "language_loss": 0.71843231, + "learning_rate": 7.024148446550204e-07, + "loss": 0.73997545, + "num_input_tokens_seen": 263090070, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.13348389, + "step": 12191, + "time_per_iteration": 2.4528121948242188 + }, + { + "auxiliary_loss_clip": 0.01118134, + "auxiliary_loss_mlp": 0.01035067, + "balance_loss_clip": 1.04462278, + "balance_loss_mlp": 1.02272236, + "epoch": 0.7330226965278822, + "flos": 30077915627520.0, + "grad_norm": 1.5693275150698904, + "language_loss": 0.69336772, + "learning_rate": 7.021185024330361e-07, + "loss": 0.71489972, + "num_input_tokens_seen": 263110030, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.12335205, + "step": 12192, + "time_per_iteration": 2.496317148208618 + }, + { + "auxiliary_loss_clip": 0.01116819, + "auxiliary_loss_mlp": 0.01030343, + "balance_loss_clip": 1.04463911, + "balance_loss_mlp": 1.01876783, + "epoch": 0.7330828197805501, + "flos": 23368294713600.0, + "grad_norm": 1.6476510737489, + "language_loss": 0.73149037, + "learning_rate": 7.01822209426848e-07, + "loss": 0.75296199, + "num_input_tokens_seen": 263129735, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11572266, + "step": 12193, + "time_per_iteration": 2.4900729656219482 + }, + { + "auxiliary_loss_clip": 0.01119346, + "auxiliary_loss_mlp": 0.01035744, + "balance_loss_clip": 1.04218364, + "balance_loss_mlp": 1.02261305, + "epoch": 0.7331429430332181, + "flos": 21032808410880.0, + "grad_norm": 2.041208939200246, + "language_loss": 0.7704345, + "learning_rate": 7.015259656476911e-07, + "loss": 0.79198539, + "num_input_tokens_seen": 263149100, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.13134766, + "step": 12194, + "time_per_iteration": 2.4404094219207764 + }, + { + "auxiliary_loss_clip": 0.01113025, + "auxiliary_loss_mlp": 0.01030469, + "balance_loss_clip": 1.04114342, + "balance_loss_mlp": 1.01737428, + "epoch": 0.733203066285886, + "flos": 14647891466880.0, + "grad_norm": 2.1373863329488922, + "language_loss": 0.7057544, + "learning_rate": 7.012297711067998e-07, + "loss": 0.72718936, + "num_input_tokens_seen": 263166620, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.13092041, + "step": 12195, + "time_per_iteration": 2.4241316318511963 + }, + { + "auxiliary_loss_clip": 0.01123539, + "auxiliary_loss_mlp": 0.0103676, + "balance_loss_clip": 1.04709888, + "balance_loss_mlp": 1.02532792, + "epoch": 0.7332631895385541, + "flos": 17165301177600.0, + "grad_norm": 2.1599754217252993, + "language_loss": 0.72596312, + "learning_rate": 7.009336258154057e-07, + "loss": 0.7475661, + "num_input_tokens_seen": 263184780, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.11444092, + "step": 12196, + "time_per_iteration": 2.403128147125244 + }, + { + "auxiliary_loss_clip": 0.01121472, + "auxiliary_loss_mlp": 0.01027237, + "balance_loss_clip": 1.05015838, + "balance_loss_mlp": 1.01527452, + "epoch": 0.733323312791222, + "flos": 28658151676800.0, + "grad_norm": 1.6928105186042401, + "language_loss": 0.71677828, + "learning_rate": 7.006375297847394e-07, + "loss": 0.73826534, + "num_input_tokens_seen": 263204625, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11962891, + "step": 12197, + "time_per_iteration": 3.9315855503082275 + }, + { + "auxiliary_loss_clip": 0.01124916, + "auxiliary_loss_mlp": 0.01038076, + "balance_loss_clip": 1.04799366, + "balance_loss_mlp": 1.02480233, + "epoch": 0.73338343604389, + "flos": 16618417632000.0, + "grad_norm": 2.20119890204034, + "language_loss": 0.77774036, + "learning_rate": 7.003414830260282e-07, + "loss": 0.79937029, + "num_input_tokens_seen": 263221565, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.13256836, + "step": 12198, + "time_per_iteration": 2.4503376483917236 + }, + { + "auxiliary_loss_clip": 0.01121457, + "auxiliary_loss_mlp": 0.01032246, + "balance_loss_clip": 1.04909527, + "balance_loss_mlp": 1.02070689, + "epoch": 0.7334435592965579, + "flos": 21142084561920.0, + "grad_norm": 1.833295882948694, + "language_loss": 0.74039692, + "learning_rate": 7.000454855504974e-07, + "loss": 0.76193392, + "num_input_tokens_seen": 263240620, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11541748, + "step": 12199, + "time_per_iteration": 2.486907482147217 + }, + { + "auxiliary_loss_clip": 0.01125179, + "auxiliary_loss_mlp": 0.01032287, + "balance_loss_clip": 1.04841805, + "balance_loss_mlp": 1.01932287, + "epoch": 0.7335036825492259, + "flos": 17125332318720.0, + "grad_norm": 2.414713162720585, + "language_loss": 0.77066672, + "learning_rate": 6.997495373693729e-07, + "loss": 0.79224133, + "num_input_tokens_seen": 263254365, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.1295166, + "step": 12200, + "time_per_iteration": 2.376185417175293 + }, + { + "auxiliary_loss_clip": 0.01119331, + "auxiliary_loss_mlp": 0.01030032, + "balance_loss_clip": 1.04671311, + "balance_loss_mlp": 1.01815307, + "epoch": 0.7335638058018938, + "flos": 23731818307200.0, + "grad_norm": 2.2026889598290134, + "language_loss": 0.61894238, + "learning_rate": 6.994536384938754e-07, + "loss": 0.64043599, + "num_input_tokens_seen": 263275880, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11871338, + "step": 12201, + "time_per_iteration": 2.4848198890686035 + }, + { + "auxiliary_loss_clip": 0.01120278, + "auxiliary_loss_mlp": 0.01024465, + "balance_loss_clip": 1.04936254, + "balance_loss_mlp": 1.01342666, + "epoch": 0.7336239290545619, + "flos": 34933289679360.0, + "grad_norm": 1.6079788781440276, + "language_loss": 0.51530337, + "learning_rate": 6.991577889352264e-07, + "loss": 0.53675079, + "num_input_tokens_seen": 263298315, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.11035156, + "step": 12202, + "time_per_iteration": 2.536757707595825 + }, + { + "auxiliary_loss_clip": 0.01118355, + "auxiliary_loss_mlp": 0.01028839, + "balance_loss_clip": 1.04678166, + "balance_loss_mlp": 1.01714492, + "epoch": 0.7336840523072298, + "flos": 21103049456640.0, + "grad_norm": 1.6420914879840929, + "language_loss": 0.68711573, + "learning_rate": 6.98861988704645e-07, + "loss": 0.70858765, + "num_input_tokens_seen": 263318615, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11700439, + "step": 12203, + "time_per_iteration": 2.452150583267212 + }, + { + "auxiliary_loss_clip": 0.0111881, + "auxiliary_loss_mlp": 0.01035157, + "balance_loss_clip": 1.04122829, + "balance_loss_mlp": 1.02200246, + "epoch": 0.7337441755598978, + "flos": 24024418496640.0, + "grad_norm": 2.3102871834407948, + "language_loss": 0.65683806, + "learning_rate": 6.985662378133474e-07, + "loss": 0.67837769, + "num_input_tokens_seen": 263336705, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.13171387, + "step": 12204, + "time_per_iteration": 2.424609661102295 + }, + { + "auxiliary_loss_clip": 0.0111496, + "auxiliary_loss_mlp": 0.01035298, + "balance_loss_clip": 1.0419836, + "balance_loss_mlp": 1.02238166, + "epoch": 0.7338042988125658, + "flos": 22711309004160.0, + "grad_norm": 2.0125280348861594, + "language_loss": 0.77309531, + "learning_rate": 6.982705362725479e-07, + "loss": 0.79459786, + "num_input_tokens_seen": 263355065, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.12908936, + "step": 12205, + "time_per_iteration": 2.422924518585205 + }, + { + "auxiliary_loss_clip": 0.01112934, + "auxiliary_loss_mlp": 0.01032989, + "balance_loss_clip": 1.04334903, + "balance_loss_mlp": 1.0201385, + "epoch": 0.7338644220652337, + "flos": 21360996000000.0, + "grad_norm": 1.6385999706166512, + "language_loss": 0.79457837, + "learning_rate": 6.979748840934601e-07, + "loss": 0.8160376, + "num_input_tokens_seen": 263374460, + "router_z_loss_clip": 0.69580078, + "router_z_loss_mlp": 0.128479, + "step": 12206, + "time_per_iteration": 2.4917995929718018 + }, + { + "auxiliary_loss_clip": 0.01114537, + "auxiliary_loss_mlp": 0.01029409, + "balance_loss_clip": 1.04207182, + "balance_loss_mlp": 1.0172075, + "epoch": 0.7339245453179017, + "flos": 30920236536960.0, + "grad_norm": 2.1815347554093654, + "language_loss": 0.71563739, + "learning_rate": 6.976792812872958e-07, + "loss": 0.73707688, + "num_input_tokens_seen": 263393610, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.12200928, + "step": 12207, + "time_per_iteration": 2.5583505630493164 + }, + { + "auxiliary_loss_clip": 0.01040022, + "auxiliary_loss_mlp": 0.01005714, + "balance_loss_clip": 1.01522756, + "balance_loss_mlp": 1.00424159, + "epoch": 0.7339846685705697, + "flos": 67899429072000.0, + "grad_norm": 0.7796019868817953, + "language_loss": 0.54788893, + "learning_rate": 6.97383727865263e-07, + "loss": 0.56834632, + "num_input_tokens_seen": 263450340, + "router_z_loss_clip": 0.24780273, + "router_z_loss_mlp": 0.01472473, + "step": 12208, + "time_per_iteration": 3.146855354309082 + }, + { + "auxiliary_loss_clip": 0.01119911, + "auxiliary_loss_mlp": 0.01027859, + "balance_loss_clip": 1.04773712, + "balance_loss_mlp": 1.01698732, + "epoch": 0.7340447918232377, + "flos": 22236749493120.0, + "grad_norm": 1.380463399248008, + "language_loss": 0.80618358, + "learning_rate": 6.970882238385703e-07, + "loss": 0.82766128, + "num_input_tokens_seen": 263471735, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.10876465, + "step": 12209, + "time_per_iteration": 2.465439796447754 + }, + { + "auxiliary_loss_clip": 0.01115883, + "auxiliary_loss_mlp": 0.01027637, + "balance_loss_clip": 1.04398966, + "balance_loss_mlp": 1.01637161, + "epoch": 0.7341049150759056, + "flos": 23764784014080.0, + "grad_norm": 1.5604000218338638, + "language_loss": 0.79000181, + "learning_rate": 6.96792769218423e-07, + "loss": 0.81143707, + "num_input_tokens_seen": 263493245, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.1126709, + "step": 12210, + "time_per_iteration": 2.472728729248047 + }, + { + "auxiliary_loss_clip": 0.01110172, + "auxiliary_loss_mlp": 0.01028772, + "balance_loss_clip": 1.03897381, + "balance_loss_mlp": 1.01662481, + "epoch": 0.7341650383285736, + "flos": 17236547804160.0, + "grad_norm": 2.1201598721206714, + "language_loss": 0.76473391, + "learning_rate": 6.964973640160236e-07, + "loss": 0.78612339, + "num_input_tokens_seen": 263511660, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.12133789, + "step": 12211, + "time_per_iteration": 3.8265721797943115 + }, + { + "auxiliary_loss_clip": 0.01121453, + "auxiliary_loss_mlp": 0.01026295, + "balance_loss_clip": 1.0469805, + "balance_loss_mlp": 1.01413524, + "epoch": 0.7342251615812415, + "flos": 23403953940480.0, + "grad_norm": 4.3213686184572335, + "language_loss": 0.71777046, + "learning_rate": 6.962020082425748e-07, + "loss": 0.73924792, + "num_input_tokens_seen": 263530875, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.1217041, + "step": 12212, + "time_per_iteration": 2.4569597244262695 + }, + { + "auxiliary_loss_clip": 0.01118222, + "auxiliary_loss_mlp": 0.01032413, + "balance_loss_clip": 1.04547691, + "balance_loss_mlp": 1.02077222, + "epoch": 0.7342852848339095, + "flos": 22747183712640.0, + "grad_norm": 1.5434011080736427, + "language_loss": 0.68740344, + "learning_rate": 6.959067019092766e-07, + "loss": 0.70890975, + "num_input_tokens_seen": 263551585, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11627197, + "step": 12213, + "time_per_iteration": 2.4419453144073486 + }, + { + "auxiliary_loss_clip": 0.0103866, + "auxiliary_loss_mlp": 0.01001871, + "balance_loss_clip": 1.01392376, + "balance_loss_mlp": 1.0004704, + "epoch": 0.7343454080865774, + "flos": 53942353925760.0, + "grad_norm": 0.7239378601706804, + "language_loss": 0.54272878, + "learning_rate": 6.956114450273276e-07, + "loss": 0.56313407, + "num_input_tokens_seen": 263609545, + "router_z_loss_clip": 0.24755859, + "router_z_loss_mlp": 0.01400757, + "step": 12214, + "time_per_iteration": 2.9641809463500977 + }, + { + "auxiliary_loss_clip": 0.01113206, + "auxiliary_loss_mlp": 0.01028412, + "balance_loss_clip": 1.03778672, + "balance_loss_mlp": 1.01652098, + "epoch": 0.7344055313392455, + "flos": 12166859255040.0, + "grad_norm": 2.2449367346148144, + "language_loss": 0.70244992, + "learning_rate": 6.953162376079233e-07, + "loss": 0.72386611, + "num_input_tokens_seen": 263627880, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.11901855, + "step": 12215, + "time_per_iteration": 2.4117586612701416 + }, + { + "auxiliary_loss_clip": 0.01111652, + "auxiliary_loss_mlp": 0.01025755, + "balance_loss_clip": 1.04313254, + "balance_loss_mlp": 1.01529396, + "epoch": 0.7344656545919134, + "flos": 18550052346240.0, + "grad_norm": 1.6037443089314252, + "language_loss": 0.72614574, + "learning_rate": 6.950210796622573e-07, + "loss": 0.74751985, + "num_input_tokens_seen": 263645665, + "router_z_loss_clip": 0.68554688, + "router_z_loss_mlp": 0.10449219, + "step": 12216, + "time_per_iteration": 2.427468776702881 + }, + { + "auxiliary_loss_clip": 0.01124631, + "auxiliary_loss_mlp": 0.01038816, + "balance_loss_clip": 1.04664803, + "balance_loss_mlp": 1.0231638, + "epoch": 0.7345257778445814, + "flos": 23661649088640.0, + "grad_norm": 2.2988536888994684, + "language_loss": 0.78194308, + "learning_rate": 6.947259712015236e-07, + "loss": 0.8035776, + "num_input_tokens_seen": 263668170, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.15649414, + "step": 12217, + "time_per_iteration": 2.469353437423706 + }, + { + "auxiliary_loss_clip": 0.01116841, + "auxiliary_loss_mlp": 0.01024288, + "balance_loss_clip": 1.04445839, + "balance_loss_mlp": 1.01336801, + "epoch": 0.7345859010972494, + "flos": 13808659127040.0, + "grad_norm": 2.185561547373076, + "language_loss": 0.77921373, + "learning_rate": 6.94430912236911e-07, + "loss": 0.80062503, + "num_input_tokens_seen": 263684190, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.10919189, + "step": 12218, + "time_per_iteration": 2.420926094055176 + }, + { + "auxiliary_loss_clip": 0.01115745, + "auxiliary_loss_mlp": 0.0102872, + "balance_loss_clip": 1.04503894, + "balance_loss_mlp": 1.01667953, + "epoch": 0.7346460243499173, + "flos": 22272731942400.0, + "grad_norm": 2.3254297042973864, + "language_loss": 0.72039706, + "learning_rate": 6.941359027796092e-07, + "loss": 0.74184167, + "num_input_tokens_seen": 263702095, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.12042236, + "step": 12219, + "time_per_iteration": 2.5917835235595703 + }, + { + "auxiliary_loss_clip": 0.01109543, + "auxiliary_loss_mlp": 0.01029313, + "balance_loss_clip": 1.03951263, + "balance_loss_mlp": 1.01801825, + "epoch": 0.7347061476025853, + "flos": 23255247634560.0, + "grad_norm": 2.855010717495066, + "language_loss": 0.75167358, + "learning_rate": 6.938409428408061e-07, + "loss": 0.77306211, + "num_input_tokens_seen": 263721385, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.11303711, + "step": 12220, + "time_per_iteration": 2.5550851821899414 + }, + { + "auxiliary_loss_clip": 0.01122856, + "auxiliary_loss_mlp": 0.01025587, + "balance_loss_clip": 1.04811764, + "balance_loss_mlp": 1.014256, + "epoch": 0.7347662708552533, + "flos": 15267565923840.0, + "grad_norm": 1.682308609178172, + "language_loss": 0.65615273, + "learning_rate": 6.93546032431684e-07, + "loss": 0.6776371, + "num_input_tokens_seen": 263737835, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.11334229, + "step": 12221, + "time_per_iteration": 2.504639148712158 + }, + { + "auxiliary_loss_clip": 0.01116032, + "auxiliary_loss_mlp": 0.0104217, + "balance_loss_clip": 1.04164815, + "balance_loss_mlp": 1.02833617, + "epoch": 0.7348263941079213, + "flos": 24859987649280.0, + "grad_norm": 2.2119165434471775, + "language_loss": 0.69199419, + "learning_rate": 6.932511715634273e-07, + "loss": 0.7135762, + "num_input_tokens_seen": 263756480, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.13830566, + "step": 12222, + "time_per_iteration": 2.555262804031372 + }, + { + "auxiliary_loss_clip": 0.01111244, + "auxiliary_loss_mlp": 0.01028383, + "balance_loss_clip": 1.03966975, + "balance_loss_mlp": 1.01762986, + "epoch": 0.7348865173605892, + "flos": 24352103295360.0, + "grad_norm": 1.931970912076261, + "language_loss": 0.66241717, + "learning_rate": 6.92956360247217e-07, + "loss": 0.68381345, + "num_input_tokens_seen": 263776440, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.10748291, + "step": 12223, + "time_per_iteration": 2.4613821506500244 + }, + { + "auxiliary_loss_clip": 0.01119721, + "auxiliary_loss_mlp": 0.01034929, + "balance_loss_clip": 1.04190588, + "balance_loss_mlp": 1.02309799, + "epoch": 0.7349466406132572, + "flos": 20004613597440.0, + "grad_norm": 1.9218143736824433, + "language_loss": 0.72311455, + "learning_rate": 6.926615984942332e-07, + "loss": 0.74466109, + "num_input_tokens_seen": 263793700, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.1182251, + "step": 12224, + "time_per_iteration": 2.4331793785095215 + }, + { + "auxiliary_loss_clip": 0.01122621, + "auxiliary_loss_mlp": 0.01030719, + "balance_loss_clip": 1.04648554, + "balance_loss_mlp": 1.0183512, + "epoch": 0.7350067638659251, + "flos": 29825068815360.0, + "grad_norm": 1.7641672558135773, + "language_loss": 0.72613132, + "learning_rate": 6.92366886315652e-07, + "loss": 0.74766469, + "num_input_tokens_seen": 263814620, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.1237793, + "step": 12225, + "time_per_iteration": 2.4655449390411377 + }, + { + "auxiliary_loss_clip": 0.01124937, + "auxiliary_loss_mlp": 0.01034993, + "balance_loss_clip": 1.04582298, + "balance_loss_mlp": 1.02125978, + "epoch": 0.7350668871185931, + "flos": 21866150920320.0, + "grad_norm": 1.9395303448521313, + "language_loss": 0.76285547, + "learning_rate": 6.920722237226501e-07, + "loss": 0.78445482, + "num_input_tokens_seen": 263832725, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.13739014, + "step": 12226, + "time_per_iteration": 2.4627268314361572 + }, + { + "auxiliary_loss_clip": 0.01124166, + "auxiliary_loss_mlp": 0.0102483, + "balance_loss_clip": 1.05004609, + "balance_loss_mlp": 1.01291466, + "epoch": 0.735127010371261, + "flos": 22566122231040.0, + "grad_norm": 1.4638225694001752, + "language_loss": 0.66704345, + "learning_rate": 6.917776107264008e-07, + "loss": 0.68853348, + "num_input_tokens_seen": 263853850, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.1192627, + "step": 12227, + "time_per_iteration": 2.4848461151123047 + }, + { + "auxiliary_loss_clip": 0.01122742, + "auxiliary_loss_mlp": 0.01040334, + "balance_loss_clip": 1.04369628, + "balance_loss_mlp": 1.02838302, + "epoch": 0.7351871336239291, + "flos": 25884339707520.0, + "grad_norm": 1.492340467110389, + "language_loss": 0.63705122, + "learning_rate": 6.914830473380749e-07, + "loss": 0.65868193, + "num_input_tokens_seen": 263874760, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.11950684, + "step": 12228, + "time_per_iteration": 2.4900362491607666 + }, + { + "auxiliary_loss_clip": 0.01114633, + "auxiliary_loss_mlp": 0.01032973, + "balance_loss_clip": 1.04264176, + "balance_loss_mlp": 1.02197051, + "epoch": 0.735247256876597, + "flos": 17932173569280.0, + "grad_norm": 1.655470014050093, + "language_loss": 0.63401735, + "learning_rate": 6.911885335688427e-07, + "loss": 0.65549344, + "num_input_tokens_seen": 263893390, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11004639, + "step": 12229, + "time_per_iteration": 2.402348518371582 + }, + { + "auxiliary_loss_clip": 0.01124264, + "auxiliary_loss_mlp": 0.01034125, + "balance_loss_clip": 1.04782271, + "balance_loss_mlp": 1.02122056, + "epoch": 0.735307380129265, + "flos": 28875159694080.0, + "grad_norm": 1.632510204281967, + "language_loss": 0.73699152, + "learning_rate": 6.908940694298726e-07, + "loss": 0.75857538, + "num_input_tokens_seen": 263911180, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.12908936, + "step": 12230, + "time_per_iteration": 3.950106382369995 + }, + { + "auxiliary_loss_clip": 0.01119544, + "auxiliary_loss_mlp": 0.01035801, + "balance_loss_clip": 1.04613042, + "balance_loss_mlp": 1.02382016, + "epoch": 0.7353675033819329, + "flos": 13625658311040.0, + "grad_norm": 1.9929833758648043, + "language_loss": 0.71402538, + "learning_rate": 6.90599654932332e-07, + "loss": 0.73557878, + "num_input_tokens_seen": 263928975, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11975098, + "step": 12231, + "time_per_iteration": 2.4160690307617188 + }, + { + "auxiliary_loss_clip": 0.01120837, + "auxiliary_loss_mlp": 0.01038317, + "balance_loss_clip": 1.04734325, + "balance_loss_mlp": 1.02489376, + "epoch": 0.7354276266346009, + "flos": 19463081178240.0, + "grad_norm": 2.5818949687765484, + "language_loss": 0.63831538, + "learning_rate": 6.903052900873823e-07, + "loss": 0.65990692, + "num_input_tokens_seen": 263944495, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.13415527, + "step": 12232, + "time_per_iteration": 2.437093496322632 + }, + { + "auxiliary_loss_clip": 0.01123461, + "auxiliary_loss_mlp": 0.01035843, + "balance_loss_clip": 1.04731989, + "balance_loss_mlp": 1.02366006, + "epoch": 0.735487749887269, + "flos": 15771858917760.0, + "grad_norm": 1.8143712923862219, + "language_loss": 0.75194716, + "learning_rate": 6.900109749061874e-07, + "loss": 0.77354026, + "num_input_tokens_seen": 263961325, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.12194824, + "step": 12233, + "time_per_iteration": 2.3858816623687744 + }, + { + "auxiliary_loss_clip": 0.01133508, + "auxiliary_loss_mlp": 0.01026439, + "balance_loss_clip": 1.05686045, + "balance_loss_mlp": 1.01442921, + "epoch": 0.7355478731399369, + "flos": 18260648467200.0, + "grad_norm": 1.9452851997455913, + "language_loss": 0.73528004, + "learning_rate": 6.897167093999079e-07, + "loss": 0.75687951, + "num_input_tokens_seen": 263980445, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.12030029, + "step": 12234, + "time_per_iteration": 3.915034055709839 + }, + { + "auxiliary_loss_clip": 0.01117995, + "auxiliary_loss_mlp": 0.010276, + "balance_loss_clip": 1.0457648, + "balance_loss_mlp": 1.01578665, + "epoch": 0.7356079963926049, + "flos": 26542043688960.0, + "grad_norm": 2.4505338720007086, + "language_loss": 0.59553373, + "learning_rate": 6.894224935797017e-07, + "loss": 0.61698961, + "num_input_tokens_seen": 263999330, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11810303, + "step": 12235, + "time_per_iteration": 2.4729599952697754 + }, + { + "auxiliary_loss_clip": 0.01123208, + "auxiliary_loss_mlp": 0.01026149, + "balance_loss_clip": 1.05103564, + "balance_loss_mlp": 1.01444304, + "epoch": 0.7356681196452728, + "flos": 10778624467200.0, + "grad_norm": 2.19301444495208, + "language_loss": 0.86277533, + "learning_rate": 6.891283274567259e-07, + "loss": 0.88426888, + "num_input_tokens_seen": 264014150, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11712646, + "step": 12236, + "time_per_iteration": 2.382417917251587 + }, + { + "auxiliary_loss_clip": 0.01119015, + "auxiliary_loss_mlp": 0.01026183, + "balance_loss_clip": 1.04631937, + "balance_loss_mlp": 1.01463151, + "epoch": 0.7357282428979408, + "flos": 19718693337600.0, + "grad_norm": 1.8636806000372306, + "language_loss": 0.69732612, + "learning_rate": 6.888342110421364e-07, + "loss": 0.71877813, + "num_input_tokens_seen": 264033140, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11560059, + "step": 12237, + "time_per_iteration": 2.4362730979919434 + }, + { + "auxiliary_loss_clip": 0.01117142, + "auxiliary_loss_mlp": 0.01027264, + "balance_loss_clip": 1.04504585, + "balance_loss_mlp": 1.01565325, + "epoch": 0.7357883661506087, + "flos": 19464014931840.0, + "grad_norm": 1.6061234102762512, + "language_loss": 0.72494078, + "learning_rate": 6.885401443470839e-07, + "loss": 0.74638486, + "num_input_tokens_seen": 264052105, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.1161499, + "step": 12238, + "time_per_iteration": 2.4086625576019287 + }, + { + "auxiliary_loss_clip": 0.01119537, + "auxiliary_loss_mlp": 0.01030082, + "balance_loss_clip": 1.04334235, + "balance_loss_mlp": 1.01775622, + "epoch": 0.7358484894032767, + "flos": 27123006263040.0, + "grad_norm": 1.8690132819754846, + "language_loss": 0.72825533, + "learning_rate": 6.882461273827205e-07, + "loss": 0.74975151, + "num_input_tokens_seen": 264070690, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.12341309, + "step": 12239, + "time_per_iteration": 2.491899013519287 + }, + { + "auxiliary_loss_clip": 0.01111441, + "auxiliary_loss_mlp": 0.01028349, + "balance_loss_clip": 1.04288459, + "balance_loss_mlp": 1.01695812, + "epoch": 0.7359086126559446, + "flos": 24502282058880.0, + "grad_norm": 1.4405873795575395, + "language_loss": 0.7899611, + "learning_rate": 6.879521601601954e-07, + "loss": 0.81135893, + "num_input_tokens_seen": 264094225, + "router_z_loss_clip": 0.68554688, + "router_z_loss_mlp": 0.11376953, + "step": 12240, + "time_per_iteration": 3.9197254180908203 + }, + { + "auxiliary_loss_clip": 0.0111459, + "auxiliary_loss_mlp": 0.01030635, + "balance_loss_clip": 1.04370117, + "balance_loss_mlp": 1.01918507, + "epoch": 0.7359687359086127, + "flos": 23331270769920.0, + "grad_norm": 2.635224886031936, + "language_loss": 0.83255088, + "learning_rate": 6.876582426906565e-07, + "loss": 0.85400307, + "num_input_tokens_seen": 264113190, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11444092, + "step": 12241, + "time_per_iteration": 2.4389281272888184 + }, + { + "auxiliary_loss_clip": 0.01122272, + "auxiliary_loss_mlp": 0.01028722, + "balance_loss_clip": 1.05050063, + "balance_loss_mlp": 1.01711082, + "epoch": 0.7360288591612806, + "flos": 20193396503040.0, + "grad_norm": 2.082523823775671, + "language_loss": 0.79091763, + "learning_rate": 6.873643749852484e-07, + "loss": 0.81242752, + "num_input_tokens_seen": 264132050, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.1161499, + "step": 12242, + "time_per_iteration": 2.501608371734619 + }, + { + "auxiliary_loss_clip": 0.01117551, + "auxiliary_loss_mlp": 0.01026564, + "balance_loss_clip": 1.04640615, + "balance_loss_mlp": 1.01557839, + "epoch": 0.7360889824139486, + "flos": 24972783333120.0, + "grad_norm": 3.0278229685026097, + "language_loss": 0.79547644, + "learning_rate": 6.870705570551145e-07, + "loss": 0.8169176, + "num_input_tokens_seen": 264152800, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.10998535, + "step": 12243, + "time_per_iteration": 2.5426864624023438 + }, + { + "auxiliary_loss_clip": 0.011185, + "auxiliary_loss_mlp": 0.01032909, + "balance_loss_clip": 1.04532683, + "balance_loss_mlp": 1.02063012, + "epoch": 0.7361491056666165, + "flos": 15012312900480.0, + "grad_norm": 2.323958496694838, + "language_loss": 0.73950922, + "learning_rate": 6.867767889113969e-07, + "loss": 0.76102328, + "num_input_tokens_seen": 264169650, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.12280273, + "step": 12244, + "time_per_iteration": 2.436062812805176 + }, + { + "auxiliary_loss_clip": 0.01114938, + "auxiliary_loss_mlp": 0.01030372, + "balance_loss_clip": 1.04279983, + "balance_loss_mlp": 1.01809359, + "epoch": 0.7362092289192845, + "flos": 22930400010240.0, + "grad_norm": 1.8920570584209815, + "language_loss": 0.69782281, + "learning_rate": 6.864830705652347e-07, + "loss": 0.71927595, + "num_input_tokens_seen": 264190530, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.12280273, + "step": 12245, + "time_per_iteration": 2.5079505443573 + }, + { + "auxiliary_loss_clip": 0.01113451, + "auxiliary_loss_mlp": 0.01029586, + "balance_loss_clip": 1.04370832, + "balance_loss_mlp": 1.01718259, + "epoch": 0.7362693521719526, + "flos": 20702681487360.0, + "grad_norm": 1.486866124188068, + "language_loss": 0.73357117, + "learning_rate": 6.861894020277658e-07, + "loss": 0.75500154, + "num_input_tokens_seen": 264210820, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.12408447, + "step": 12246, + "time_per_iteration": 2.5030314922332764 + }, + { + "auxiliary_loss_clip": 0.01108796, + "auxiliary_loss_mlp": 0.01021007, + "balance_loss_clip": 1.03889608, + "balance_loss_mlp": 1.00989103, + "epoch": 0.7363294754246205, + "flos": 13111381336320.0, + "grad_norm": 2.1029875604119193, + "language_loss": 0.73124552, + "learning_rate": 6.858957833101266e-07, + "loss": 0.75254351, + "num_input_tokens_seen": 264227430, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.11114502, + "step": 12247, + "time_per_iteration": 2.4332122802734375 + }, + { + "auxiliary_loss_clip": 0.01115897, + "auxiliary_loss_mlp": 0.01028516, + "balance_loss_clip": 1.04678273, + "balance_loss_mlp": 1.01712561, + "epoch": 0.7363895986772885, + "flos": 14027426910720.0, + "grad_norm": 1.665968844540561, + "language_loss": 0.74266779, + "learning_rate": 6.856022144234526e-07, + "loss": 0.76411188, + "num_input_tokens_seen": 264245230, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 0.1138916, + "step": 12248, + "time_per_iteration": 2.463920831680298 + }, + { + "auxiliary_loss_clip": 0.01114065, + "auxiliary_loss_mlp": 0.01034699, + "balance_loss_clip": 1.0396328, + "balance_loss_mlp": 1.0226295, + "epoch": 0.7364497219299564, + "flos": 19719986227200.0, + "grad_norm": 2.9521874182320094, + "language_loss": 0.72635096, + "learning_rate": 6.853086953788727e-07, + "loss": 0.74783868, + "num_input_tokens_seen": 264263945, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.1206665, + "step": 12249, + "time_per_iteration": 2.422677516937256 + }, + { + "auxiliary_loss_clip": 0.01118859, + "auxiliary_loss_mlp": 0.01031509, + "balance_loss_clip": 1.04470527, + "balance_loss_mlp": 1.01901579, + "epoch": 0.7365098451826244, + "flos": 21361391049600.0, + "grad_norm": 1.9145771087510484, + "language_loss": 0.77154124, + "learning_rate": 6.850152261875189e-07, + "loss": 0.79304492, + "num_input_tokens_seen": 264281500, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.12493896, + "step": 12250, + "time_per_iteration": 2.463750123977661 + }, + { + "auxiliary_loss_clip": 0.01117592, + "auxiliary_loss_mlp": 0.01027895, + "balance_loss_clip": 1.04331601, + "balance_loss_mlp": 1.01598048, + "epoch": 0.7365699684352923, + "flos": 23368222886400.0, + "grad_norm": 1.595407854328932, + "language_loss": 0.71223301, + "learning_rate": 6.8472180686052e-07, + "loss": 0.73368788, + "num_input_tokens_seen": 264301625, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11920166, + "step": 12251, + "time_per_iteration": 2.497804641723633 + }, + { + "auxiliary_loss_clip": 0.01125033, + "auxiliary_loss_mlp": 0.010292, + "balance_loss_clip": 1.05153251, + "balance_loss_mlp": 1.01718915, + "epoch": 0.7366300916879603, + "flos": 59524879927680.0, + "grad_norm": 1.493581485557777, + "language_loss": 0.65714246, + "learning_rate": 6.844284374090015e-07, + "loss": 0.67868477, + "num_input_tokens_seen": 264323975, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.12017822, + "step": 12252, + "time_per_iteration": 2.8303942680358887 + }, + { + "auxiliary_loss_clip": 0.01120254, + "auxiliary_loss_mlp": 0.01029285, + "balance_loss_clip": 1.04613352, + "balance_loss_mlp": 1.01727426, + "epoch": 0.7366902149406283, + "flos": 20923137210240.0, + "grad_norm": 1.561295517839661, + "language_loss": 0.79278266, + "learning_rate": 6.841351178440884e-07, + "loss": 0.81427801, + "num_input_tokens_seen": 264343785, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.12005615, + "step": 12253, + "time_per_iteration": 2.4921154975891113 + }, + { + "auxiliary_loss_clip": 0.01111525, + "auxiliary_loss_mlp": 0.01024179, + "balance_loss_clip": 1.04327893, + "balance_loss_mlp": 1.01299691, + "epoch": 0.7367503381932963, + "flos": 17348158339200.0, + "grad_norm": 4.22163833315176, + "language_loss": 0.75847971, + "learning_rate": 6.83841848176905e-07, + "loss": 0.77983677, + "num_input_tokens_seen": 264361130, + "router_z_loss_clip": 0.68261719, + "router_z_loss_mlp": 0.11175537, + "step": 12254, + "time_per_iteration": 2.432638645172119 + }, + { + "auxiliary_loss_clip": 0.01118693, + "auxiliary_loss_mlp": 0.01031395, + "balance_loss_clip": 1.0467875, + "balance_loss_mlp": 1.0190922, + "epoch": 0.7368104614459642, + "flos": 17821317219840.0, + "grad_norm": 2.5860642159524776, + "language_loss": 0.69800234, + "learning_rate": 6.835486284185692e-07, + "loss": 0.71950322, + "num_input_tokens_seen": 264376965, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.12304688, + "step": 12255, + "time_per_iteration": 3.946059465408325 + }, + { + "auxiliary_loss_clip": 0.01121152, + "auxiliary_loss_mlp": 0.01028813, + "balance_loss_clip": 1.04609942, + "balance_loss_mlp": 1.01661241, + "epoch": 0.7368705846986322, + "flos": 24606099342720.0, + "grad_norm": 2.134924352946164, + "language_loss": 0.75648588, + "learning_rate": 6.832554585802012e-07, + "loss": 0.77798557, + "num_input_tokens_seen": 264396310, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.12194824, + "step": 12256, + "time_per_iteration": 2.4579598903656006 + }, + { + "auxiliary_loss_clip": 0.01108664, + "auxiliary_loss_mlp": 0.01029641, + "balance_loss_clip": 1.0361774, + "balance_loss_mlp": 1.01673079, + "epoch": 0.7369307079513001, + "flos": 34970169968640.0, + "grad_norm": 1.6791724827536443, + "language_loss": 0.74097085, + "learning_rate": 6.829623386729182e-07, + "loss": 0.7623539, + "num_input_tokens_seen": 264418085, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.12896729, + "step": 12257, + "time_per_iteration": 2.5685269832611084 + }, + { + "auxiliary_loss_clip": 0.01110339, + "auxiliary_loss_mlp": 0.01030529, + "balance_loss_clip": 1.03813624, + "balance_loss_mlp": 1.01929307, + "epoch": 0.7369908312039681, + "flos": 21214588164480.0, + "grad_norm": 1.807478444662749, + "language_loss": 0.78101158, + "learning_rate": 6.826692687078362e-07, + "loss": 0.80242026, + "num_input_tokens_seen": 264437595, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11230469, + "step": 12258, + "time_per_iteration": 2.442566156387329 + }, + { + "auxiliary_loss_clip": 0.01113643, + "auxiliary_loss_mlp": 0.0102957, + "balance_loss_clip": 1.04025817, + "balance_loss_mlp": 1.0177443, + "epoch": 0.7370509544566362, + "flos": 23623655477760.0, + "grad_norm": 1.517646662189253, + "language_loss": 0.66621459, + "learning_rate": 6.823762486960674e-07, + "loss": 0.68764669, + "num_input_tokens_seen": 264457385, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11834717, + "step": 12259, + "time_per_iteration": 2.454167604446411 + }, + { + "auxiliary_loss_clip": 0.01114787, + "auxiliary_loss_mlp": 0.01030535, + "balance_loss_clip": 1.04174447, + "balance_loss_mlp": 1.01701665, + "epoch": 0.7371110777093041, + "flos": 24827704300800.0, + "grad_norm": 1.8759907582586925, + "language_loss": 0.73441803, + "learning_rate": 6.820832786487225e-07, + "loss": 0.75587124, + "num_input_tokens_seen": 264477205, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.13519287, + "step": 12260, + "time_per_iteration": 2.45849347114563 + }, + { + "auxiliary_loss_clip": 0.01116589, + "auxiliary_loss_mlp": 0.01027093, + "balance_loss_clip": 1.04195905, + "balance_loss_mlp": 1.01550031, + "epoch": 0.7371712009619721, + "flos": 23149491016320.0, + "grad_norm": 1.5607616490950011, + "language_loss": 0.73581076, + "learning_rate": 6.817903585769125e-07, + "loss": 0.75724763, + "num_input_tokens_seen": 264497195, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.11602783, + "step": 12261, + "time_per_iteration": 2.4326651096343994 + }, + { + "auxiliary_loss_clip": 0.01119393, + "auxiliary_loss_mlp": 0.01031136, + "balance_loss_clip": 1.04491758, + "balance_loss_mlp": 1.0185113, + "epoch": 0.73723132421464, + "flos": 23112898035840.0, + "grad_norm": 2.562218947492304, + "language_loss": 0.66793936, + "learning_rate": 6.814974884917438e-07, + "loss": 0.68944466, + "num_input_tokens_seen": 264516950, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.1262207, + "step": 12262, + "time_per_iteration": 2.4787209033966064 + }, + { + "auxiliary_loss_clip": 0.01127074, + "auxiliary_loss_mlp": 0.01033172, + "balance_loss_clip": 1.05316401, + "balance_loss_mlp": 1.01931977, + "epoch": 0.737291447467308, + "flos": 19273328605440.0, + "grad_norm": 1.772158790707059, + "language_loss": 0.88626897, + "learning_rate": 6.81204668404322e-07, + "loss": 0.90787143, + "num_input_tokens_seen": 264532675, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.1385498, + "step": 12263, + "time_per_iteration": 2.3975419998168945 + }, + { + "auxiliary_loss_clip": 0.0110944, + "auxiliary_loss_mlp": 0.01025631, + "balance_loss_clip": 1.04172146, + "balance_loss_mlp": 1.01540923, + "epoch": 0.7373515707199759, + "flos": 25118257415040.0, + "grad_norm": 1.494680511270852, + "language_loss": 0.67294806, + "learning_rate": 6.809118983257522e-07, + "loss": 0.69429874, + "num_input_tokens_seen": 264555635, + "router_z_loss_clip": 0.67724609, + "router_z_loss_mlp": 0.10223389, + "step": 12264, + "time_per_iteration": 2.464247226715088 + }, + { + "auxiliary_loss_clip": 0.01109772, + "auxiliary_loss_mlp": 0.01030993, + "balance_loss_clip": 1.03943503, + "balance_loss_mlp": 1.01812482, + "epoch": 0.737411693972644, + "flos": 32408481767040.0, + "grad_norm": 1.7624622348698107, + "language_loss": 0.80456495, + "learning_rate": 6.806191782671356e-07, + "loss": 0.82597262, + "num_input_tokens_seen": 264573140, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.12860107, + "step": 12265, + "time_per_iteration": 2.5453433990478516 + }, + { + "auxiliary_loss_clip": 0.01118025, + "auxiliary_loss_mlp": 0.01029855, + "balance_loss_clip": 1.04138899, + "balance_loss_mlp": 1.017892, + "epoch": 0.7374718172253119, + "flos": 24315797623680.0, + "grad_norm": 1.7296276687065149, + "language_loss": 0.74577844, + "learning_rate": 6.803265082395711e-07, + "loss": 0.76725733, + "num_input_tokens_seen": 264591610, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.11975098, + "step": 12266, + "time_per_iteration": 2.4841160774230957 + }, + { + "auxiliary_loss_clip": 0.01112141, + "auxiliary_loss_mlp": 0.01031623, + "balance_loss_clip": 1.0390451, + "balance_loss_mlp": 1.0185101, + "epoch": 0.7375319404779799, + "flos": 27156115624320.0, + "grad_norm": 1.7224533593781326, + "language_loss": 0.73659152, + "learning_rate": 6.800338882541576e-07, + "loss": 0.75802916, + "num_input_tokens_seen": 264611170, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.13110352, + "step": 12267, + "time_per_iteration": 2.588874578475952 + }, + { + "auxiliary_loss_clip": 0.01112243, + "auxiliary_loss_mlp": 0.01030717, + "balance_loss_clip": 1.04163957, + "balance_loss_mlp": 1.01939189, + "epoch": 0.7375920637306478, + "flos": 18879999701760.0, + "grad_norm": 2.0347923745584007, + "language_loss": 0.82797801, + "learning_rate": 6.797413183219923e-07, + "loss": 0.84940761, + "num_input_tokens_seen": 264629365, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.11328125, + "step": 12268, + "time_per_iteration": 2.466635227203369 + }, + { + "auxiliary_loss_clip": 0.0111979, + "auxiliary_loss_mlp": 0.01033101, + "balance_loss_clip": 1.04456961, + "balance_loss_mlp": 1.02115011, + "epoch": 0.7376521869833158, + "flos": 15669765486720.0, + "grad_norm": 1.852814471227356, + "language_loss": 0.733845, + "learning_rate": 6.794487984541677e-07, + "loss": 0.75537395, + "num_input_tokens_seen": 264647915, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.11962891, + "step": 12269, + "time_per_iteration": 2.4932985305786133 + }, + { + "auxiliary_loss_clip": 0.01113212, + "auxiliary_loss_mlp": 0.0103558, + "balance_loss_clip": 1.03876305, + "balance_loss_mlp": 1.0216856, + "epoch": 0.7377123102359837, + "flos": 36971973901440.0, + "grad_norm": 1.9463672492477535, + "language_loss": 0.7000618, + "learning_rate": 6.791563286617776e-07, + "loss": 0.72154963, + "num_input_tokens_seen": 264669620, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.13891602, + "step": 12270, + "time_per_iteration": 2.6247177124023438 + }, + { + "auxiliary_loss_clip": 0.01114439, + "auxiliary_loss_mlp": 0.01030482, + "balance_loss_clip": 1.04209018, + "balance_loss_mlp": 1.01937163, + "epoch": 0.7377724334886517, + "flos": 24496284487680.0, + "grad_norm": 1.9195998789542121, + "language_loss": 0.69457841, + "learning_rate": 6.788639089559119e-07, + "loss": 0.71602762, + "num_input_tokens_seen": 264689345, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11108398, + "step": 12271, + "time_per_iteration": 2.4603428840637207 + }, + { + "auxiliary_loss_clip": 0.01113309, + "auxiliary_loss_mlp": 0.01031469, + "balance_loss_clip": 1.03870356, + "balance_loss_mlp": 1.01896381, + "epoch": 0.7378325567413198, + "flos": 24390025079040.0, + "grad_norm": 2.316662278697389, + "language_loss": 0.68084037, + "learning_rate": 6.785715393476586e-07, + "loss": 0.70228815, + "num_input_tokens_seen": 264707625, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.12518311, + "step": 12272, + "time_per_iteration": 2.509002685546875 + }, + { + "auxiliary_loss_clip": 0.0111389, + "auxiliary_loss_mlp": 0.01037783, + "balance_loss_clip": 1.0418961, + "balance_loss_mlp": 1.02465177, + "epoch": 0.7378926799939877, + "flos": 17416388223360.0, + "grad_norm": 2.4109368835915133, + "language_loss": 0.78241432, + "learning_rate": 6.782792198481049e-07, + "loss": 0.80393106, + "num_input_tokens_seen": 264725575, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.13146973, + "step": 12273, + "time_per_iteration": 2.5390334129333496 + }, + { + "auxiliary_loss_clip": 0.01111828, + "auxiliary_loss_mlp": 0.01028195, + "balance_loss_clip": 1.04038644, + "balance_loss_mlp": 1.01635122, + "epoch": 0.7379528032466557, + "flos": 18474208778880.0, + "grad_norm": 3.1841926839869332, + "language_loss": 0.83497858, + "learning_rate": 6.779869504683355e-07, + "loss": 0.85637879, + "num_input_tokens_seen": 264742855, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.11834717, + "step": 12274, + "time_per_iteration": 3.8719348907470703 + }, + { + "auxiliary_loss_clip": 0.01119722, + "auxiliary_loss_mlp": 0.01026513, + "balance_loss_clip": 1.04277313, + "balance_loss_mlp": 1.0139308, + "epoch": 0.7380129264993236, + "flos": 17821999578240.0, + "grad_norm": 1.807479517486468, + "language_loss": 0.73666549, + "learning_rate": 6.776947312194341e-07, + "loss": 0.75812781, + "num_input_tokens_seen": 264761155, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.12585449, + "step": 12275, + "time_per_iteration": 2.531001567840576 + }, + { + "auxiliary_loss_clip": 0.01129001, + "auxiliary_loss_mlp": 0.01037003, + "balance_loss_clip": 1.04967177, + "balance_loss_mlp": 1.02418828, + "epoch": 0.7380730497519916, + "flos": 22997372918400.0, + "grad_norm": 1.7487505819493043, + "language_loss": 0.73512769, + "learning_rate": 6.774025621124813e-07, + "loss": 0.75678766, + "num_input_tokens_seen": 264780660, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.12811279, + "step": 12276, + "time_per_iteration": 2.5019521713256836 + }, + { + "auxiliary_loss_clip": 0.01115468, + "auxiliary_loss_mlp": 0.01027554, + "balance_loss_clip": 1.04316783, + "balance_loss_mlp": 1.01598489, + "epoch": 0.7381331730046595, + "flos": 20266259241600.0, + "grad_norm": 1.9079985591835291, + "language_loss": 0.77138126, + "learning_rate": 6.771104431585551e-07, + "loss": 0.79281139, + "num_input_tokens_seen": 264798850, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11560059, + "step": 12277, + "time_per_iteration": 2.4466865062713623 + }, + { + "auxiliary_loss_clip": 0.01106063, + "auxiliary_loss_mlp": 0.01034873, + "balance_loss_clip": 1.0369041, + "balance_loss_mlp": 1.02298808, + "epoch": 0.7381932962573275, + "flos": 19754532132480.0, + "grad_norm": 1.763356623870088, + "language_loss": 0.78424287, + "learning_rate": 6.768183743687338e-07, + "loss": 0.80565226, + "num_input_tokens_seen": 264816795, + "router_z_loss_clip": 0.69238281, + "router_z_loss_mlp": 0.11877441, + "step": 12278, + "time_per_iteration": 2.448265790939331 + }, + { + "auxiliary_loss_clip": 0.01126524, + "auxiliary_loss_mlp": 0.0103666, + "balance_loss_clip": 1.05004275, + "balance_loss_mlp": 1.02396417, + "epoch": 0.7382534195099955, + "flos": 17305316392320.0, + "grad_norm": 2.0853213205622794, + "language_loss": 0.71948385, + "learning_rate": 6.765263557540921e-07, + "loss": 0.74111569, + "num_input_tokens_seen": 264834105, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.12701416, + "step": 12279, + "time_per_iteration": 3.8212027549743652 + }, + { + "auxiliary_loss_clip": 0.0111586, + "auxiliary_loss_mlp": 0.01036657, + "balance_loss_clip": 1.04069626, + "balance_loss_mlp": 1.02319813, + "epoch": 0.7383135427626635, + "flos": 18697358021760.0, + "grad_norm": 2.331866372201481, + "language_loss": 0.85953319, + "learning_rate": 6.762343873257034e-07, + "loss": 0.88105834, + "num_input_tokens_seen": 264850895, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.13452148, + "step": 12280, + "time_per_iteration": 2.391735553741455 + }, + { + "auxiliary_loss_clip": 0.01114966, + "auxiliary_loss_mlp": 0.01032078, + "balance_loss_clip": 1.04119945, + "balance_loss_mlp": 1.01859546, + "epoch": 0.7383736660153314, + "flos": 20881300844160.0, + "grad_norm": 1.9134268580400444, + "language_loss": 0.72630417, + "learning_rate": 6.759424690946408e-07, + "loss": 0.7477746, + "num_input_tokens_seen": 264869505, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.1348877, + "step": 12281, + "time_per_iteration": 2.418186902999878 + }, + { + "auxiliary_loss_clip": 0.01115267, + "auxiliary_loss_mlp": 0.01030058, + "balance_loss_clip": 1.04009616, + "balance_loss_mlp": 1.01774955, + "epoch": 0.7384337892679994, + "flos": 20663215418880.0, + "grad_norm": 1.5260186204136756, + "language_loss": 0.60778725, + "learning_rate": 6.756506010719711e-07, + "loss": 0.62924051, + "num_input_tokens_seen": 264886915, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.12304688, + "step": 12282, + "time_per_iteration": 2.426941156387329 + }, + { + "auxiliary_loss_clip": 0.01120895, + "auxiliary_loss_mlp": 0.01032662, + "balance_loss_clip": 1.0448544, + "balance_loss_mlp": 1.01979363, + "epoch": 0.7384939125206673, + "flos": 29169627390720.0, + "grad_norm": 1.8767334564625269, + "language_loss": 0.676929, + "learning_rate": 6.753587832687632e-07, + "loss": 0.69846451, + "num_input_tokens_seen": 264910350, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12872314, + "step": 12283, + "time_per_iteration": 3.898775577545166 + }, + { + "auxiliary_loss_clip": 0.01115436, + "auxiliary_loss_mlp": 0.01034147, + "balance_loss_clip": 1.04325068, + "balance_loss_mlp": 1.02217209, + "epoch": 0.7385540357733353, + "flos": 36312833376000.0, + "grad_norm": 1.7410392233229353, + "language_loss": 0.75799024, + "learning_rate": 6.750670156960832e-07, + "loss": 0.77948612, + "num_input_tokens_seen": 264930705, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11975098, + "step": 12284, + "time_per_iteration": 2.5850770473480225 + }, + { + "auxiliary_loss_clip": 0.0111133, + "auxiliary_loss_mlp": 0.0103511, + "balance_loss_clip": 1.03791916, + "balance_loss_mlp": 1.02078056, + "epoch": 0.7386141590260034, + "flos": 20302600826880.0, + "grad_norm": 2.0264558182772685, + "language_loss": 0.69553643, + "learning_rate": 6.747752983649954e-07, + "loss": 0.71700084, + "num_input_tokens_seen": 264946975, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.14337158, + "step": 12285, + "time_per_iteration": 2.4298901557922363 + }, + { + "auxiliary_loss_clip": 0.01118437, + "auxiliary_loss_mlp": 0.01030849, + "balance_loss_clip": 1.04276896, + "balance_loss_mlp": 1.01803994, + "epoch": 0.7386742822786713, + "flos": 25483792170240.0, + "grad_norm": 2.033250539977999, + "language_loss": 0.79736406, + "learning_rate": 6.744836312865602e-07, + "loss": 0.81885689, + "num_input_tokens_seen": 264967665, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12805176, + "step": 12286, + "time_per_iteration": 2.4592065811157227 + }, + { + "auxiliary_loss_clip": 0.01112985, + "auxiliary_loss_mlp": 0.01027774, + "balance_loss_clip": 1.04220891, + "balance_loss_mlp": 1.01558483, + "epoch": 0.7387344055313393, + "flos": 13771958405760.0, + "grad_norm": 3.357320578619325, + "language_loss": 0.6568135, + "learning_rate": 6.741920144718396e-07, + "loss": 0.67822111, + "num_input_tokens_seen": 264985480, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.12182617, + "step": 12287, + "time_per_iteration": 2.429119110107422 + }, + { + "auxiliary_loss_clip": 0.01109792, + "auxiliary_loss_mlp": 0.01024695, + "balance_loss_clip": 1.04029465, + "balance_loss_mlp": 1.01356733, + "epoch": 0.7387945287840072, + "flos": 27855189095040.0, + "grad_norm": 1.784560348698202, + "language_loss": 0.76531672, + "learning_rate": 6.739004479318903e-07, + "loss": 0.78666162, + "num_input_tokens_seen": 265004790, + "router_z_loss_clip": 0.69482422, + "router_z_loss_mlp": 0.11126709, + "step": 12288, + "time_per_iteration": 2.4875192642211914 + }, + { + "auxiliary_loss_clip": 0.01112712, + "auxiliary_loss_mlp": 0.01030744, + "balance_loss_clip": 1.03808022, + "balance_loss_mlp": 1.01730299, + "epoch": 0.7388546520366752, + "flos": 44233039388160.0, + "grad_norm": 1.7932842175368335, + "language_loss": 0.58425796, + "learning_rate": 6.736089316777684e-07, + "loss": 0.60569251, + "num_input_tokens_seen": 265028790, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.13446045, + "step": 12289, + "time_per_iteration": 2.6616315841674805 + }, + { + "auxiliary_loss_clip": 0.01048369, + "auxiliary_loss_mlp": 0.01010415, + "balance_loss_clip": 1.02382874, + "balance_loss_mlp": 1.0088923, + "epoch": 0.7389147752893431, + "flos": 70680890638080.0, + "grad_norm": 0.6427894566960363, + "language_loss": 0.49258679, + "learning_rate": 6.733174657205287e-07, + "loss": 0.51317465, + "num_input_tokens_seen": 265096660, + "router_z_loss_clip": 0.24584961, + "router_z_loss_mlp": 0.01522827, + "step": 12290, + "time_per_iteration": 3.2560384273529053 + }, + { + "auxiliary_loss_clip": 0.01114075, + "auxiliary_loss_mlp": 0.01030306, + "balance_loss_clip": 1.03981948, + "balance_loss_mlp": 1.01690733, + "epoch": 0.7389748985420111, + "flos": 25994980575360.0, + "grad_norm": 1.8104691854356083, + "language_loss": 0.67283976, + "learning_rate": 6.730260500712237e-07, + "loss": 0.6942836, + "num_input_tokens_seen": 265116375, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.1340332, + "step": 12291, + "time_per_iteration": 2.4663174152374268 + }, + { + "auxiliary_loss_clip": 0.01039699, + "auxiliary_loss_mlp": 0.0101152, + "balance_loss_clip": 1.01489282, + "balance_loss_mlp": 1.01001048, + "epoch": 0.7390350217946791, + "flos": 54403661318400.0, + "grad_norm": 0.9738591988607913, + "language_loss": 0.60819662, + "learning_rate": 6.727346847409052e-07, + "loss": 0.62870884, + "num_input_tokens_seen": 265161230, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.0151062, + "step": 12292, + "time_per_iteration": 2.7288336753845215 + }, + { + "auxiliary_loss_clip": 0.01120698, + "auxiliary_loss_mlp": 0.01032929, + "balance_loss_clip": 1.04619074, + "balance_loss_mlp": 1.02133036, + "epoch": 0.7390951450473471, + "flos": 32196968530560.0, + "grad_norm": 1.8164357626244438, + "language_loss": 0.67396009, + "learning_rate": 6.724433697406191e-07, + "loss": 0.69549644, + "num_input_tokens_seen": 265182515, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.1159668, + "step": 12293, + "time_per_iteration": 2.507342576980591 + }, + { + "auxiliary_loss_clip": 0.01116336, + "auxiliary_loss_mlp": 0.01033764, + "balance_loss_clip": 1.04327822, + "balance_loss_mlp": 1.01973283, + "epoch": 0.739155268300015, + "flos": 16684241304960.0, + "grad_norm": 1.951509613688628, + "language_loss": 0.83393002, + "learning_rate": 6.721521050814134e-07, + "loss": 0.85543096, + "num_input_tokens_seen": 265198160, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.14019775, + "step": 12294, + "time_per_iteration": 2.4159536361694336 + }, + { + "auxiliary_loss_clip": 0.01112534, + "auxiliary_loss_mlp": 0.0103116, + "balance_loss_clip": 1.04277694, + "balance_loss_mlp": 1.01864934, + "epoch": 0.739215391552683, + "flos": 31649761762560.0, + "grad_norm": 2.4479684651843145, + "language_loss": 0.73142123, + "learning_rate": 6.718608907743337e-07, + "loss": 0.75285816, + "num_input_tokens_seen": 265218480, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.12506104, + "step": 12295, + "time_per_iteration": 2.5349538326263428 + }, + { + "auxiliary_loss_clip": 0.01112281, + "auxiliary_loss_mlp": 0.01034897, + "balance_loss_clip": 1.04357696, + "balance_loss_mlp": 1.02355456, + "epoch": 0.7392755148053509, + "flos": 29718522097920.0, + "grad_norm": 2.053963522315817, + "language_loss": 0.78733182, + "learning_rate": 6.715697268304215e-07, + "loss": 0.80880356, + "num_input_tokens_seen": 265240165, + "router_z_loss_clip": 0.68701172, + "router_z_loss_mlp": 0.11346436, + "step": 12296, + "time_per_iteration": 2.5198116302490234 + }, + { + "auxiliary_loss_clip": 0.01113234, + "auxiliary_loss_mlp": 0.01034595, + "balance_loss_clip": 1.04033244, + "balance_loss_mlp": 1.02059364, + "epoch": 0.7393356380580189, + "flos": 37050475075200.0, + "grad_norm": 1.9716301303093307, + "language_loss": 0.66649526, + "learning_rate": 6.712786132607182e-07, + "loss": 0.68797356, + "num_input_tokens_seen": 265263295, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.14013672, + "step": 12297, + "time_per_iteration": 2.5853798389434814 + }, + { + "auxiliary_loss_clip": 0.01117352, + "auxiliary_loss_mlp": 0.01035692, + "balance_loss_clip": 1.04286981, + "balance_loss_mlp": 1.0228653, + "epoch": 0.739395761310687, + "flos": 19719627091200.0, + "grad_norm": 1.5688756764443568, + "language_loss": 0.68724096, + "learning_rate": 6.709875500762645e-07, + "loss": 0.70877135, + "num_input_tokens_seen": 265282740, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.1282959, + "step": 12298, + "time_per_iteration": 3.870903730392456 + }, + { + "auxiliary_loss_clip": 0.0111538, + "auxiliary_loss_mlp": 0.01030946, + "balance_loss_clip": 1.0430603, + "balance_loss_mlp": 1.01876879, + "epoch": 0.7394558845633549, + "flos": 11801504067840.0, + "grad_norm": 1.915399460531852, + "language_loss": 0.74293214, + "learning_rate": 6.706965372880946e-07, + "loss": 0.76439542, + "num_input_tokens_seen": 265300175, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.12176514, + "step": 12299, + "time_per_iteration": 2.4170188903808594 + }, + { + "auxiliary_loss_clip": 0.01049918, + "auxiliary_loss_mlp": 0.01002286, + "balance_loss_clip": 1.02525449, + "balance_loss_mlp": 1.00092983, + "epoch": 0.7395160078160229, + "flos": 66195827850240.0, + "grad_norm": 0.7234467716468349, + "language_loss": 0.60832554, + "learning_rate": 6.704055749072455e-07, + "loss": 0.6288476, + "num_input_tokens_seen": 265363275, + "router_z_loss_clip": 0.24682617, + "router_z_loss_mlp": 0.01353455, + "step": 12300, + "time_per_iteration": 3.1171722412109375 + }, + { + "auxiliary_loss_clip": 0.01115222, + "auxiliary_loss_mlp": 0.01027473, + "balance_loss_clip": 1.04306674, + "balance_loss_mlp": 1.01473522, + "epoch": 0.7395761310686908, + "flos": 21249708687360.0, + "grad_norm": 1.685969077394449, + "language_loss": 0.80257201, + "learning_rate": 6.7011466294475e-07, + "loss": 0.82399899, + "num_input_tokens_seen": 265382935, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.12744141, + "step": 12301, + "time_per_iteration": 2.496673822402954 + }, + { + "auxiliary_loss_clip": 0.01116631, + "auxiliary_loss_mlp": 0.01025464, + "balance_loss_clip": 1.04518867, + "balance_loss_mlp": 1.01408553, + "epoch": 0.7396362543213588, + "flos": 25955299025280.0, + "grad_norm": 1.4906301060264888, + "language_loss": 0.73137188, + "learning_rate": 6.698238014116406e-07, + "loss": 0.75279284, + "num_input_tokens_seen": 265403245, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.1137085, + "step": 12302, + "time_per_iteration": 2.4608993530273438 + }, + { + "auxiliary_loss_clip": 0.01115083, + "auxiliary_loss_mlp": 0.01046322, + "balance_loss_clip": 1.04048634, + "balance_loss_mlp": 1.03151071, + "epoch": 0.7396963775740267, + "flos": 27377936064000.0, + "grad_norm": 2.271964123060381, + "language_loss": 0.7408908, + "learning_rate": 6.695329903189451e-07, + "loss": 0.76250488, + "num_input_tokens_seen": 265423105, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.14819336, + "step": 12303, + "time_per_iteration": 2.4833364486694336 + }, + { + "auxiliary_loss_clip": 0.01116277, + "auxiliary_loss_mlp": 0.01031266, + "balance_loss_clip": 1.04292846, + "balance_loss_mlp": 1.01939261, + "epoch": 0.7397565008266948, + "flos": 25520133755520.0, + "grad_norm": 1.7540943741388928, + "language_loss": 0.5428583, + "learning_rate": 6.692422296776927e-07, + "loss": 0.56433374, + "num_input_tokens_seen": 265443445, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11877441, + "step": 12304, + "time_per_iteration": 2.479308843612671 + }, + { + "auxiliary_loss_clip": 0.0111618, + "auxiliary_loss_mlp": 0.01029019, + "balance_loss_clip": 1.04375911, + "balance_loss_mlp": 1.01720524, + "epoch": 0.7398166240793627, + "flos": 23727760070400.0, + "grad_norm": 1.8962299025955893, + "language_loss": 0.84693027, + "learning_rate": 6.689515194989084e-07, + "loss": 0.86838222, + "num_input_tokens_seen": 265462085, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11810303, + "step": 12305, + "time_per_iteration": 2.465219736099243 + }, + { + "auxiliary_loss_clip": 0.01054534, + "auxiliary_loss_mlp": 0.01001971, + "balance_loss_clip": 1.02996695, + "balance_loss_mlp": 1.00056899, + "epoch": 0.7398767473320307, + "flos": 67267582882560.0, + "grad_norm": 0.873389378848187, + "language_loss": 0.57670814, + "learning_rate": 6.68660859793615e-07, + "loss": 0.59727323, + "num_input_tokens_seen": 265521190, + "router_z_loss_clip": 0.24560547, + "router_z_loss_mlp": 0.01402283, + "step": 12306, + "time_per_iteration": 3.0535616874694824 + }, + { + "auxiliary_loss_clip": 0.0112432, + "auxiliary_loss_mlp": 0.01028055, + "balance_loss_clip": 1.05031848, + "balance_loss_mlp": 1.01594973, + "epoch": 0.7399368705846986, + "flos": 22018699981440.0, + "grad_norm": 3.390756213014766, + "language_loss": 0.81699848, + "learning_rate": 6.683702505728355e-07, + "loss": 0.83852232, + "num_input_tokens_seen": 265539705, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.12109375, + "step": 12307, + "time_per_iteration": 2.4446849822998047 + }, + { + "auxiliary_loss_clip": 0.01125009, + "auxiliary_loss_mlp": 0.01032662, + "balance_loss_clip": 1.05402231, + "balance_loss_mlp": 1.02085412, + "epoch": 0.7399969938373666, + "flos": 14173870659840.0, + "grad_norm": 2.008278419780899, + "language_loss": 0.7001878, + "learning_rate": 6.680796918475893e-07, + "loss": 0.7217645, + "num_input_tokens_seen": 265555855, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11804199, + "step": 12308, + "time_per_iteration": 2.407345771789551 + }, + { + "auxiliary_loss_clip": 0.01106104, + "auxiliary_loss_mlp": 0.01029138, + "balance_loss_clip": 1.03639793, + "balance_loss_mlp": 1.01609039, + "epoch": 0.7400571170900345, + "flos": 25301473712640.0, + "grad_norm": 2.0452850150958986, + "language_loss": 0.81748915, + "learning_rate": 6.67789183628896e-07, + "loss": 0.83884162, + "num_input_tokens_seen": 265575455, + "router_z_loss_clip": 0.69677734, + "router_z_loss_mlp": 0.13037109, + "step": 12309, + "time_per_iteration": 2.5014615058898926 + }, + { + "auxiliary_loss_clip": 0.01119141, + "auxiliary_loss_mlp": 0.01029637, + "balance_loss_clip": 1.04408741, + "balance_loss_mlp": 1.01710224, + "epoch": 0.7401172403427025, + "flos": 22711344917760.0, + "grad_norm": 4.188389030167782, + "language_loss": 0.72663879, + "learning_rate": 6.674987259277692e-07, + "loss": 0.74812663, + "num_input_tokens_seen": 265595250, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.12542725, + "step": 12310, + "time_per_iteration": 2.4439914226531982 + }, + { + "auxiliary_loss_clip": 0.01120738, + "auxiliary_loss_mlp": 0.0103335, + "balance_loss_clip": 1.04511094, + "balance_loss_mlp": 1.02011764, + "epoch": 0.7401773635953706, + "flos": 18067448188800.0, + "grad_norm": 2.9337624577188026, + "language_loss": 0.88225675, + "learning_rate": 6.672083187552239e-07, + "loss": 0.90379763, + "num_input_tokens_seen": 265606945, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.13226318, + "step": 12311, + "time_per_iteration": 2.411268711090088 + }, + { + "auxiliary_loss_clip": 0.01111718, + "auxiliary_loss_mlp": 0.01026266, + "balance_loss_clip": 1.04011512, + "balance_loss_mlp": 1.01529288, + "epoch": 0.7402374868480385, + "flos": 22712135016960.0, + "grad_norm": 1.5113417752536917, + "language_loss": 0.80331343, + "learning_rate": 6.669179621222738e-07, + "loss": 0.8246932, + "num_input_tokens_seen": 265626115, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.10986328, + "step": 12312, + "time_per_iteration": 2.446115732192993 + }, + { + "auxiliary_loss_clip": 0.01109335, + "auxiliary_loss_mlp": 0.01032823, + "balance_loss_clip": 1.03928304, + "balance_loss_mlp": 1.0201807, + "epoch": 0.7402976101007065, + "flos": 22856675345280.0, + "grad_norm": 1.6915622693386116, + "language_loss": 0.77761734, + "learning_rate": 6.666276560399273e-07, + "loss": 0.79903889, + "num_input_tokens_seen": 265646520, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.12646484, + "step": 12313, + "time_per_iteration": 2.5782582759857178 + }, + { + "auxiliary_loss_clip": 0.01113269, + "auxiliary_loss_mlp": 0.01037222, + "balance_loss_clip": 1.03814769, + "balance_loss_mlp": 1.02280974, + "epoch": 0.7403577333533744, + "flos": 12345801834240.0, + "grad_norm": 1.8711758100069495, + "language_loss": 0.78396565, + "learning_rate": 6.663374005191937e-07, + "loss": 0.80547059, + "num_input_tokens_seen": 265661875, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.14416504, + "step": 12314, + "time_per_iteration": 2.410741090774536 + }, + { + "auxiliary_loss_clip": 0.0104318, + "auxiliary_loss_mlp": 0.0100228, + "balance_loss_clip": 1.01782382, + "balance_loss_mlp": 1.00085831, + "epoch": 0.7404178566060424, + "flos": 60327270869760.0, + "grad_norm": 0.8816582471750225, + "language_loss": 0.55162585, + "learning_rate": 6.660471955710809e-07, + "loss": 0.57208049, + "num_input_tokens_seen": 265721255, + "router_z_loss_clip": 0.25317383, + "router_z_loss_mlp": 0.01423645, + "step": 12315, + "time_per_iteration": 3.058337926864624 + }, + { + "auxiliary_loss_clip": 0.01114624, + "auxiliary_loss_mlp": 0.01029988, + "balance_loss_clip": 1.04591417, + "balance_loss_mlp": 1.01843643, + "epoch": 0.7404779798587103, + "flos": 32014650072960.0, + "grad_norm": 1.7575881094325647, + "language_loss": 0.79631788, + "learning_rate": 6.65757041206591e-07, + "loss": 0.81776392, + "num_input_tokens_seen": 265743970, + "router_z_loss_clip": 0.68798828, + "router_z_loss_mlp": 0.11553955, + "step": 12316, + "time_per_iteration": 2.5554118156433105 + }, + { + "auxiliary_loss_clip": 0.01112505, + "auxiliary_loss_mlp": 0.01029239, + "balance_loss_clip": 1.04055977, + "balance_loss_mlp": 1.01748502, + "epoch": 0.7405381031113784, + "flos": 12889704551040.0, + "grad_norm": 1.6892834178017484, + "language_loss": 0.74943465, + "learning_rate": 6.654669374367275e-07, + "loss": 0.77085209, + "num_input_tokens_seen": 265760890, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11767578, + "step": 12317, + "time_per_iteration": 3.9474217891693115 + }, + { + "auxiliary_loss_clip": 0.01109931, + "auxiliary_loss_mlp": 0.0103396, + "balance_loss_clip": 1.04177785, + "balance_loss_mlp": 1.02308202, + "epoch": 0.7405982263640463, + "flos": 20229127557120.0, + "grad_norm": 1.6770586549790794, + "language_loss": 0.81521052, + "learning_rate": 6.651768842724917e-07, + "loss": 0.83664942, + "num_input_tokens_seen": 265779600, + "router_z_loss_clip": 0.68115234, + "router_z_loss_mlp": 0.10882568, + "step": 12318, + "time_per_iteration": 2.5268328189849854 + }, + { + "auxiliary_loss_clip": 0.01114122, + "auxiliary_loss_mlp": 0.01027723, + "balance_loss_clip": 1.04007232, + "balance_loss_mlp": 1.01567721, + "epoch": 0.7406583496167143, + "flos": 17567213431680.0, + "grad_norm": 1.8547143095998138, + "language_loss": 0.76699328, + "learning_rate": 6.648868817248827e-07, + "loss": 0.78841174, + "num_input_tokens_seen": 265797030, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.1204834, + "step": 12319, + "time_per_iteration": 2.4233508110046387 + }, + { + "auxiliary_loss_clip": 0.01111483, + "auxiliary_loss_mlp": 0.01027839, + "balance_loss_clip": 1.04085541, + "balance_loss_mlp": 1.01693761, + "epoch": 0.7407184728693822, + "flos": 18295733076480.0, + "grad_norm": 2.7484560430270175, + "language_loss": 0.64256996, + "learning_rate": 6.64596929804897e-07, + "loss": 0.6639632, + "num_input_tokens_seen": 265815055, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.10900879, + "step": 12320, + "time_per_iteration": 2.423461437225342 + }, + { + "auxiliary_loss_clip": 0.01132802, + "auxiliary_loss_mlp": 0.01032344, + "balance_loss_clip": 1.05520403, + "balance_loss_mlp": 1.01998806, + "epoch": 0.7407785961220502, + "flos": 16690562098560.0, + "grad_norm": 2.679939393930697, + "language_loss": 0.82413435, + "learning_rate": 6.643070285235288e-07, + "loss": 0.84578586, + "num_input_tokens_seen": 265828480, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.12347412, + "step": 12321, + "time_per_iteration": 3.8656764030456543 + }, + { + "auxiliary_loss_clip": 0.01122138, + "auxiliary_loss_mlp": 0.01047628, + "balance_loss_clip": 1.04303241, + "balance_loss_mlp": 1.03188038, + "epoch": 0.7408387193747181, + "flos": 22088330496000.0, + "grad_norm": 1.8667552860914203, + "language_loss": 0.72475135, + "learning_rate": 6.640171778917727e-07, + "loss": 0.74644899, + "num_input_tokens_seen": 265845825, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.15759277, + "step": 12322, + "time_per_iteration": 2.4397270679473877 + }, + { + "auxiliary_loss_clip": 0.01118089, + "auxiliary_loss_mlp": 0.01029484, + "balance_loss_clip": 1.04427958, + "balance_loss_mlp": 1.01748586, + "epoch": 0.7408988426273861, + "flos": 24236721832320.0, + "grad_norm": 2.1056429316172554, + "language_loss": 0.64051807, + "learning_rate": 6.637273779206183e-07, + "loss": 0.6619938, + "num_input_tokens_seen": 265866335, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.12005615, + "step": 12323, + "time_per_iteration": 2.4710500240325928 + }, + { + "auxiliary_loss_clip": 0.01117372, + "auxiliary_loss_mlp": 0.01027245, + "balance_loss_clip": 1.04374921, + "balance_loss_mlp": 1.01485872, + "epoch": 0.7409589658800542, + "flos": 29023004073600.0, + "grad_norm": 1.408502998438283, + "language_loss": 0.75851554, + "learning_rate": 6.634376286210559e-07, + "loss": 0.77996171, + "num_input_tokens_seen": 265888945, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.1239624, + "step": 12324, + "time_per_iteration": 2.545370101928711 + }, + { + "auxiliary_loss_clip": 0.01124767, + "auxiliary_loss_mlp": 0.01024212, + "balance_loss_clip": 1.05043268, + "balance_loss_mlp": 1.01263058, + "epoch": 0.7410190891327221, + "flos": 19351362902400.0, + "grad_norm": 2.44225400670347, + "language_loss": 0.74746084, + "learning_rate": 6.63147930004073e-07, + "loss": 0.76895058, + "num_input_tokens_seen": 265908030, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11584473, + "step": 12325, + "time_per_iteration": 2.419538736343384 + }, + { + "auxiliary_loss_clip": 0.01124065, + "auxiliary_loss_mlp": 0.01033085, + "balance_loss_clip": 1.04699826, + "balance_loss_mlp": 1.02018642, + "epoch": 0.7410792123853901, + "flos": 22747650589440.0, + "grad_norm": 1.9083881114505046, + "language_loss": 0.69244528, + "learning_rate": 6.628582820806545e-07, + "loss": 0.71401674, + "num_input_tokens_seen": 265927030, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.12902832, + "step": 12326, + "time_per_iteration": 2.443896770477295 + }, + { + "auxiliary_loss_clip": 0.01117254, + "auxiliary_loss_mlp": 0.01028098, + "balance_loss_clip": 1.04474199, + "balance_loss_mlp": 1.01611745, + "epoch": 0.741139335638058, + "flos": 25372433030400.0, + "grad_norm": 1.741909110049372, + "language_loss": 0.89594269, + "learning_rate": 6.625686848617835e-07, + "loss": 0.91739619, + "num_input_tokens_seen": 265945490, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11981201, + "step": 12327, + "time_per_iteration": 3.883025884628296 + }, + { + "auxiliary_loss_clip": 0.01109429, + "auxiliary_loss_mlp": 0.01030153, + "balance_loss_clip": 1.03769636, + "balance_loss_mlp": 1.01776123, + "epoch": 0.741199458890726, + "flos": 18585639745920.0, + "grad_norm": 1.8247208100939474, + "language_loss": 0.85410678, + "learning_rate": 6.62279138358442e-07, + "loss": 0.87550271, + "num_input_tokens_seen": 265963265, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.1239624, + "step": 12328, + "time_per_iteration": 2.435798168182373 + }, + { + "auxiliary_loss_clip": 0.01108918, + "auxiliary_loss_mlp": 0.01029712, + "balance_loss_clip": 1.03817236, + "balance_loss_mlp": 1.01665235, + "epoch": 0.7412595821433939, + "flos": 22127078292480.0, + "grad_norm": 1.8011600081472752, + "language_loss": 0.67057592, + "learning_rate": 6.619896425816103e-07, + "loss": 0.69196224, + "num_input_tokens_seen": 265982270, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.1305542, + "step": 12329, + "time_per_iteration": 2.4599528312683105 + }, + { + "auxiliary_loss_clip": 0.01117219, + "auxiliary_loss_mlp": 0.01032532, + "balance_loss_clip": 1.03991413, + "balance_loss_mlp": 1.02075446, + "epoch": 0.741319705396062, + "flos": 29169699217920.0, + "grad_norm": 1.7267175580901801, + "language_loss": 0.66934597, + "learning_rate": 6.617001975422647e-07, + "loss": 0.69084346, + "num_input_tokens_seen": 266003835, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.11773682, + "step": 12330, + "time_per_iteration": 2.507594108581543 + }, + { + "auxiliary_loss_clip": 0.01126723, + "auxiliary_loss_mlp": 0.01032968, + "balance_loss_clip": 1.04715061, + "balance_loss_mlp": 1.01846027, + "epoch": 0.7413798286487299, + "flos": 20667489137280.0, + "grad_norm": 2.0506304155219706, + "language_loss": 0.85701782, + "learning_rate": 6.614108032513823e-07, + "loss": 0.87861472, + "num_input_tokens_seen": 266021595, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.1449585, + "step": 12331, + "time_per_iteration": 2.4957504272460938 + }, + { + "auxiliary_loss_clip": 0.0111946, + "auxiliary_loss_mlp": 0.01031061, + "balance_loss_clip": 1.04540896, + "balance_loss_mlp": 1.0187825, + "epoch": 0.7414399519013979, + "flos": 16398895662720.0, + "grad_norm": 2.227112887206376, + "language_loss": 0.7003746, + "learning_rate": 6.611214597199364e-07, + "loss": 0.72187978, + "num_input_tokens_seen": 266039860, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.1227417, + "step": 12332, + "time_per_iteration": 2.4451918601989746 + }, + { + "auxiliary_loss_clip": 0.01117488, + "auxiliary_loss_mlp": 0.01033514, + "balance_loss_clip": 1.04488254, + "balance_loss_mlp": 1.02047229, + "epoch": 0.7415000751540658, + "flos": 25630235919360.0, + "grad_norm": 1.9882949744049987, + "language_loss": 0.63456535, + "learning_rate": 6.608321669588984e-07, + "loss": 0.65607536, + "num_input_tokens_seen": 266058050, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.13037109, + "step": 12333, + "time_per_iteration": 2.465688705444336 + }, + { + "auxiliary_loss_clip": 0.01121077, + "auxiliary_loss_mlp": 0.0103218, + "balance_loss_clip": 1.05062628, + "balance_loss_mlp": 1.02022922, + "epoch": 0.7415601984067338, + "flos": 24499732193280.0, + "grad_norm": 1.6643108190441722, + "language_loss": 0.71079975, + "learning_rate": 6.605429249792387e-07, + "loss": 0.73233229, + "num_input_tokens_seen": 266078060, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.11950684, + "step": 12334, + "time_per_iteration": 2.499681234359741 + }, + { + "auxiliary_loss_clip": 0.0111399, + "auxiliary_loss_mlp": 0.01025141, + "balance_loss_clip": 1.04144549, + "balance_loss_mlp": 1.01348257, + "epoch": 0.7416203216594017, + "flos": 20887154760960.0, + "grad_norm": 1.8067711275070855, + "language_loss": 0.82891172, + "learning_rate": 6.602537337919257e-07, + "loss": 0.85030299, + "num_input_tokens_seen": 266097110, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11657715, + "step": 12335, + "time_per_iteration": 2.4511749744415283 + }, + { + "auxiliary_loss_clip": 0.01118588, + "auxiliary_loss_mlp": 0.01030146, + "balance_loss_clip": 1.04483509, + "balance_loss_mlp": 1.01718247, + "epoch": 0.7416804449120697, + "flos": 15624265933440.0, + "grad_norm": 2.674656199424718, + "language_loss": 0.75284165, + "learning_rate": 6.599645934079259e-07, + "loss": 0.77432901, + "num_input_tokens_seen": 266110870, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.12957764, + "step": 12336, + "time_per_iteration": 2.497047185897827 + }, + { + "auxiliary_loss_clip": 0.01125176, + "auxiliary_loss_mlp": 0.01029959, + "balance_loss_clip": 1.04806471, + "balance_loss_mlp": 1.01790726, + "epoch": 0.7417405681647377, + "flos": 17120483982720.0, + "grad_norm": 1.897934941996042, + "language_loss": 0.73677248, + "learning_rate": 6.596755038382029e-07, + "loss": 0.75832379, + "num_input_tokens_seen": 266127845, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.1204834, + "step": 12337, + "time_per_iteration": 2.5201141834259033 + }, + { + "auxiliary_loss_clip": 0.01116551, + "auxiliary_loss_mlp": 0.0103165, + "balance_loss_clip": 1.04638994, + "balance_loss_mlp": 1.01995575, + "epoch": 0.7418006914174057, + "flos": 18880322924160.0, + "grad_norm": 1.6205739263528716, + "language_loss": 0.76405966, + "learning_rate": 6.593864650937186e-07, + "loss": 0.78554177, + "num_input_tokens_seen": 266145400, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.11694336, + "step": 12338, + "time_per_iteration": 2.4526686668395996 + }, + { + "auxiliary_loss_clip": 0.01105232, + "auxiliary_loss_mlp": 0.01026899, + "balance_loss_clip": 1.03520882, + "balance_loss_mlp": 1.01609874, + "epoch": 0.7418608146700737, + "flos": 21580733450880.0, + "grad_norm": 1.7425589721196386, + "language_loss": 0.731125, + "learning_rate": 6.590974771854345e-07, + "loss": 0.75244629, + "num_input_tokens_seen": 266164430, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.10803223, + "step": 12339, + "time_per_iteration": 2.513392925262451 + }, + { + "auxiliary_loss_clip": 0.01113978, + "auxiliary_loss_mlp": 0.01029691, + "balance_loss_clip": 1.04146051, + "balance_loss_mlp": 1.01737714, + "epoch": 0.7419209379227416, + "flos": 22340459036160.0, + "grad_norm": 1.6028268011340567, + "language_loss": 0.79644012, + "learning_rate": 6.588085401243077e-07, + "loss": 0.81787676, + "num_input_tokens_seen": 266183855, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.12322998, + "step": 12340, + "time_per_iteration": 2.465952157974243 + }, + { + "auxiliary_loss_clip": 0.01107979, + "auxiliary_loss_mlp": 0.01030014, + "balance_loss_clip": 1.03585124, + "balance_loss_mlp": 1.01786649, + "epoch": 0.7419810611754096, + "flos": 16762275601920.0, + "grad_norm": 1.6800781240729552, + "language_loss": 0.75780308, + "learning_rate": 6.585196539212958e-07, + "loss": 0.77918303, + "num_input_tokens_seen": 266202085, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.12145996, + "step": 12341, + "time_per_iteration": 3.9916436672210693 + }, + { + "auxiliary_loss_clip": 0.01102431, + "auxiliary_loss_mlp": 0.01031069, + "balance_loss_clip": 1.03698575, + "balance_loss_mlp": 1.01911223, + "epoch": 0.7420411844280775, + "flos": 26212958259840.0, + "grad_norm": 1.4816133460597818, + "language_loss": 0.79987907, + "learning_rate": 6.582308185873535e-07, + "loss": 0.82121408, + "num_input_tokens_seen": 266223445, + "router_z_loss_clip": 0.65380859, + "router_z_loss_mlp": 0.11956787, + "step": 12342, + "time_per_iteration": 2.495680570602417 + }, + { + "auxiliary_loss_clip": 0.01110217, + "auxiliary_loss_mlp": 0.0102801, + "balance_loss_clip": 1.03801394, + "balance_loss_mlp": 1.01640522, + "epoch": 0.7421013076807456, + "flos": 68529371840640.0, + "grad_norm": 2.0350828327595676, + "language_loss": 0.77593029, + "learning_rate": 6.57942034133433e-07, + "loss": 0.79731262, + "num_input_tokens_seen": 266246575, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.1159668, + "step": 12343, + "time_per_iteration": 2.8273329734802246 + }, + { + "auxiliary_loss_clip": 0.01117303, + "auxiliary_loss_mlp": 0.01029818, + "balance_loss_clip": 1.04339111, + "balance_loss_mlp": 1.01843369, + "epoch": 0.7421614309334135, + "flos": 24425325169920.0, + "grad_norm": 2.136492378101711, + "language_loss": 0.67893147, + "learning_rate": 6.576533005704843e-07, + "loss": 0.70040268, + "num_input_tokens_seen": 266266055, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.1138916, + "step": 12344, + "time_per_iteration": 2.4768054485321045 + }, + { + "auxiliary_loss_clip": 0.01110978, + "auxiliary_loss_mlp": 0.01033684, + "balance_loss_clip": 1.03755641, + "balance_loss_mlp": 1.02043414, + "epoch": 0.7422215541860815, + "flos": 12311076360960.0, + "grad_norm": 2.528653238123176, + "language_loss": 0.81488502, + "learning_rate": 6.573646179094572e-07, + "loss": 0.83633161, + "num_input_tokens_seen": 266282240, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.13250732, + "step": 12345, + "time_per_iteration": 2.419883966445923 + }, + { + "auxiliary_loss_clip": 0.01113356, + "auxiliary_loss_mlp": 0.01031251, + "balance_loss_clip": 1.0422554, + "balance_loss_mlp": 1.0191927, + "epoch": 0.7422816774387494, + "flos": 19645579203840.0, + "grad_norm": 1.9554425520620384, + "language_loss": 0.71037412, + "learning_rate": 6.570759861612988e-07, + "loss": 0.73182023, + "num_input_tokens_seen": 266300980, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.1206665, + "step": 12346, + "time_per_iteration": 2.4677281379699707 + }, + { + "auxiliary_loss_clip": 0.01113471, + "auxiliary_loss_mlp": 0.01029135, + "balance_loss_clip": 1.04062057, + "balance_loss_mlp": 1.01738691, + "epoch": 0.7423418006914174, + "flos": 32015978876160.0, + "grad_norm": 1.7637032351643713, + "language_loss": 0.73095477, + "learning_rate": 6.56787405336953e-07, + "loss": 0.75238085, + "num_input_tokens_seen": 266322215, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11761475, + "step": 12347, + "time_per_iteration": 2.516890287399292 + }, + { + "auxiliary_loss_clip": 0.0111891, + "auxiliary_loss_mlp": 0.0103128, + "balance_loss_clip": 1.04320061, + "balance_loss_mlp": 1.01919794, + "epoch": 0.7424019239440853, + "flos": 18916951818240.0, + "grad_norm": 1.7853345819561068, + "language_loss": 0.81278241, + "learning_rate": 6.564988754473642e-07, + "loss": 0.83428431, + "num_input_tokens_seen": 266341600, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12078857, + "step": 12348, + "time_per_iteration": 2.474438190460205 + }, + { + "auxiliary_loss_clip": 0.01111602, + "auxiliary_loss_mlp": 0.01030952, + "balance_loss_clip": 1.03978646, + "balance_loss_mlp": 1.0192039, + "epoch": 0.7424620471967533, + "flos": 35876518871040.0, + "grad_norm": 1.7203939853883983, + "language_loss": 0.72367084, + "learning_rate": 6.562103965034724e-07, + "loss": 0.74509645, + "num_input_tokens_seen": 266362895, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11755371, + "step": 12349, + "time_per_iteration": 2.566751718521118 + }, + { + "auxiliary_loss_clip": 0.01112953, + "auxiliary_loss_mlp": 0.01030845, + "balance_loss_clip": 1.0363723, + "balance_loss_mlp": 1.01763105, + "epoch": 0.7425221704494213, + "flos": 27016603200000.0, + "grad_norm": 2.0187539318069705, + "language_loss": 0.79116917, + "learning_rate": 6.559219685162165e-07, + "loss": 0.81260711, + "num_input_tokens_seen": 266384015, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.13232422, + "step": 12350, + "time_per_iteration": 2.478576898574829 + }, + { + "auxiliary_loss_clip": 0.01106112, + "auxiliary_loss_mlp": 0.01032775, + "balance_loss_clip": 1.03562689, + "balance_loss_mlp": 1.02124739, + "epoch": 0.7425822937020893, + "flos": 34167135559680.0, + "grad_norm": 2.74817069315392, + "language_loss": 0.75286853, + "learning_rate": 6.556335914965343e-07, + "loss": 0.77425742, + "num_input_tokens_seen": 266405990, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.11529541, + "step": 12351, + "time_per_iteration": 2.5489695072174072 + }, + { + "auxiliary_loss_clip": 0.01112543, + "auxiliary_loss_mlp": 0.01026687, + "balance_loss_clip": 1.04132891, + "balance_loss_mlp": 1.01421165, + "epoch": 0.7426424169547573, + "flos": 21283572234240.0, + "grad_norm": 1.9970115488134683, + "language_loss": 0.81281418, + "learning_rate": 6.553452654553611e-07, + "loss": 0.83420646, + "num_input_tokens_seen": 266424260, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.12487793, + "step": 12352, + "time_per_iteration": 2.4146387577056885 + }, + { + "auxiliary_loss_clip": 0.01122874, + "auxiliary_loss_mlp": 0.01036293, + "balance_loss_clip": 1.05004799, + "balance_loss_mlp": 1.02465165, + "epoch": 0.7427025402074252, + "flos": 22448442297600.0, + "grad_norm": 2.1014807380778904, + "language_loss": 0.71673167, + "learning_rate": 6.550569904036307e-07, + "loss": 0.73832339, + "num_input_tokens_seen": 266444580, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11645508, + "step": 12353, + "time_per_iteration": 2.459104299545288 + }, + { + "auxiliary_loss_clip": 0.01124902, + "auxiliary_loss_mlp": 0.01032516, + "balance_loss_clip": 1.05306101, + "balance_loss_mlp": 1.02133405, + "epoch": 0.7427626634600932, + "flos": 22524609087360.0, + "grad_norm": 1.733806775373175, + "language_loss": 0.72411406, + "learning_rate": 6.547687663522739e-07, + "loss": 0.74568832, + "num_input_tokens_seen": 266465640, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11187744, + "step": 12354, + "time_per_iteration": 2.473158359527588 + }, + { + "auxiliary_loss_clip": 0.0106796, + "auxiliary_loss_mlp": 0.01004318, + "balance_loss_clip": 1.0432415, + "balance_loss_mlp": 1.00280535, + "epoch": 0.7428227867127611, + "flos": 67209477655680.0, + "grad_norm": 0.6949212319452779, + "language_loss": 0.59533471, + "learning_rate": 6.544805933122199e-07, + "loss": 0.61605746, + "num_input_tokens_seen": 266531950, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.01512146, + "step": 12355, + "time_per_iteration": 3.250601053237915 + }, + { + "auxiliary_loss_clip": 0.01116421, + "auxiliary_loss_mlp": 0.0102823, + "balance_loss_clip": 1.0442431, + "balance_loss_mlp": 1.01590931, + "epoch": 0.7428829099654292, + "flos": 14721221082240.0, + "grad_norm": 1.6004465051006211, + "language_loss": 0.67797494, + "learning_rate": 6.541924712943971e-07, + "loss": 0.69942147, + "num_input_tokens_seen": 266550665, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.12298584, + "step": 12356, + "time_per_iteration": 2.4785377979278564 + }, + { + "auxiliary_loss_clip": 0.01116756, + "auxiliary_loss_mlp": 0.01032635, + "balance_loss_clip": 1.04004467, + "balance_loss_mlp": 1.0204699, + "epoch": 0.7429430332180971, + "flos": 48646496413440.0, + "grad_norm": 4.722943540844455, + "language_loss": 0.72275048, + "learning_rate": 6.539044003097301e-07, + "loss": 0.74424446, + "num_input_tokens_seen": 266572455, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.12158203, + "step": 12357, + "time_per_iteration": 2.701443672180176 + }, + { + "auxiliary_loss_clip": 0.01113995, + "auxiliary_loss_mlp": 0.01027884, + "balance_loss_clip": 1.04419398, + "balance_loss_mlp": 1.01683974, + "epoch": 0.7430031564707651, + "flos": 16764071281920.0, + "grad_norm": 2.2095918026204577, + "language_loss": 0.65343344, + "learning_rate": 6.53616380369143e-07, + "loss": 0.67485219, + "num_input_tokens_seen": 266590895, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.11053467, + "step": 12358, + "time_per_iteration": 2.4686810970306396 + }, + { + "auxiliary_loss_clip": 0.01121171, + "auxiliary_loss_mlp": 0.01032654, + "balance_loss_clip": 1.04670739, + "balance_loss_mlp": 1.01963079, + "epoch": 0.743063279723433, + "flos": 23870576545920.0, + "grad_norm": 2.1097489864072796, + "language_loss": 0.80767733, + "learning_rate": 6.533284114835591e-07, + "loss": 0.82921553, + "num_input_tokens_seen": 266607660, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.13024902, + "step": 12359, + "time_per_iteration": 2.4860475063323975 + }, + { + "auxiliary_loss_clip": 0.01112562, + "auxiliary_loss_mlp": 0.01028721, + "balance_loss_clip": 1.04055035, + "balance_loss_mlp": 1.0172298, + "epoch": 0.743123402976101, + "flos": 14391704689920.0, + "grad_norm": 1.9558955884778033, + "language_loss": 0.6858021, + "learning_rate": 6.530404936638956e-07, + "loss": 0.70721495, + "num_input_tokens_seen": 266624260, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11499023, + "step": 12360, + "time_per_iteration": 2.4664316177368164 + }, + { + "auxiliary_loss_clip": 0.01106174, + "auxiliary_loss_mlp": 0.010311, + "balance_loss_clip": 1.03490841, + "balance_loss_mlp": 1.01889253, + "epoch": 0.7431835262287689, + "flos": 27454318335360.0, + "grad_norm": 2.156894939531461, + "language_loss": 0.72693372, + "learning_rate": 6.527526269210715e-07, + "loss": 0.74830639, + "num_input_tokens_seen": 266644210, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.12200928, + "step": 12361, + "time_per_iteration": 3.935516595840454 + }, + { + "auxiliary_loss_clip": 0.01114425, + "auxiliary_loss_mlp": 0.01027528, + "balance_loss_clip": 1.04147196, + "balance_loss_mlp": 1.01597106, + "epoch": 0.743243649481437, + "flos": 20959514709120.0, + "grad_norm": 2.0728179124981114, + "language_loss": 0.55753577, + "learning_rate": 6.524648112660027e-07, + "loss": 0.57895529, + "num_input_tokens_seen": 266664230, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11553955, + "step": 12362, + "time_per_iteration": 2.472804069519043 + }, + { + "auxiliary_loss_clip": 0.01112951, + "auxiliary_loss_mlp": 0.01027454, + "balance_loss_clip": 1.0415504, + "balance_loss_mlp": 1.01598597, + "epoch": 0.7433037727341049, + "flos": 22783166161920.0, + "grad_norm": 1.722801425498523, + "language_loss": 0.77582371, + "learning_rate": 6.521770467096039e-07, + "loss": 0.79722774, + "num_input_tokens_seen": 266683270, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.11474609, + "step": 12363, + "time_per_iteration": 2.465000629425049 + }, + { + "auxiliary_loss_clip": 0.01110799, + "auxiliary_loss_mlp": 0.01034579, + "balance_loss_clip": 1.04012108, + "balance_loss_mlp": 1.02361214, + "epoch": 0.7433638959867729, + "flos": 22196708807040.0, + "grad_norm": 1.8760465049118997, + "language_loss": 0.78002107, + "learning_rate": 6.518893332627862e-07, + "loss": 0.80147493, + "num_input_tokens_seen": 266701235, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.10980225, + "step": 12364, + "time_per_iteration": 2.4470887184143066 + }, + { + "auxiliary_loss_clip": 0.01119874, + "auxiliary_loss_mlp": 0.01033115, + "balance_loss_clip": 1.04725885, + "balance_loss_mlp": 1.02151036, + "epoch": 0.7434240192394409, + "flos": 23296760778240.0, + "grad_norm": 1.6330689406824308, + "language_loss": 0.78313071, + "learning_rate": 6.516016709364604e-07, + "loss": 0.80466062, + "num_input_tokens_seen": 266721495, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11608887, + "step": 12365, + "time_per_iteration": 3.8938868045806885 + }, + { + "auxiliary_loss_clip": 0.01115992, + "auxiliary_loss_mlp": 0.010337, + "balance_loss_clip": 1.04049659, + "balance_loss_mlp": 1.02116489, + "epoch": 0.7434841424921088, + "flos": 54009575251200.0, + "grad_norm": 1.5691157816091275, + "language_loss": 0.76961029, + "learning_rate": 6.513140597415346e-07, + "loss": 0.79110718, + "num_input_tokens_seen": 266747400, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.12536621, + "step": 12366, + "time_per_iteration": 2.7467052936553955 + }, + { + "auxiliary_loss_clip": 0.01110057, + "auxiliary_loss_mlp": 0.01031921, + "balance_loss_clip": 1.04072773, + "balance_loss_mlp": 1.01957095, + "epoch": 0.7435442657447768, + "flos": 21433966479360.0, + "grad_norm": 1.5882637894227933, + "language_loss": 0.71235096, + "learning_rate": 6.510264996889141e-07, + "loss": 0.73377085, + "num_input_tokens_seen": 266767630, + "router_z_loss_clip": 0.69287109, + "router_z_loss_mlp": 0.12335205, + "step": 12367, + "time_per_iteration": 2.4834210872650146 + }, + { + "auxiliary_loss_clip": 0.01119838, + "auxiliary_loss_mlp": 0.01030671, + "balance_loss_clip": 1.04338884, + "balance_loss_mlp": 1.01892924, + "epoch": 0.7436043889974447, + "flos": 24499408970880.0, + "grad_norm": 1.7406868400743878, + "language_loss": 0.74556744, + "learning_rate": 6.507389907895038e-07, + "loss": 0.76707256, + "num_input_tokens_seen": 266788015, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.11749268, + "step": 12368, + "time_per_iteration": 2.505239486694336 + }, + { + "auxiliary_loss_clip": 0.01112725, + "auxiliary_loss_mlp": 0.01030095, + "balance_loss_clip": 1.0416888, + "balance_loss_mlp": 1.01921153, + "epoch": 0.7436645122501128, + "flos": 40698388512000.0, + "grad_norm": 1.7490773295030422, + "language_loss": 0.69484156, + "learning_rate": 6.50451533054207e-07, + "loss": 0.71626979, + "num_input_tokens_seen": 266809010, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.10882568, + "step": 12369, + "time_per_iteration": 2.6490817070007324 + }, + { + "auxiliary_loss_clip": 0.01109353, + "auxiliary_loss_mlp": 0.01030204, + "balance_loss_clip": 1.0386579, + "balance_loss_mlp": 1.01861119, + "epoch": 0.7437246355027807, + "flos": 18908835344640.0, + "grad_norm": 2.20826182887781, + "language_loss": 0.75685358, + "learning_rate": 6.501641264939233e-07, + "loss": 0.77824914, + "num_input_tokens_seen": 266825390, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.11602783, + "step": 12370, + "time_per_iteration": 4.07944393157959 + }, + { + "auxiliary_loss_clip": 0.0111157, + "auxiliary_loss_mlp": 0.01037471, + "balance_loss_clip": 1.04009962, + "balance_loss_mlp": 1.02554977, + "epoch": 0.7437847587554487, + "flos": 21543817248000.0, + "grad_norm": 1.355082440716435, + "language_loss": 0.78251243, + "learning_rate": 6.498767711195503e-07, + "loss": 0.80400288, + "num_input_tokens_seen": 266844675, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11920166, + "step": 12371, + "time_per_iteration": 2.470897912979126 + }, + { + "auxiliary_loss_clip": 0.01115748, + "auxiliary_loss_mlp": 0.01028056, + "balance_loss_clip": 1.04102087, + "balance_loss_mlp": 1.01568198, + "epoch": 0.7438448820081166, + "flos": 27782470010880.0, + "grad_norm": 2.0710570796001595, + "language_loss": 0.69772065, + "learning_rate": 6.495894669419857e-07, + "loss": 0.71915865, + "num_input_tokens_seen": 266865160, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.12384033, + "step": 12372, + "time_per_iteration": 2.49603533744812 + }, + { + "auxiliary_loss_clip": 0.01112532, + "auxiliary_loss_mlp": 0.01029481, + "balance_loss_clip": 1.04137444, + "balance_loss_mlp": 1.01816189, + "epoch": 0.7439050052607846, + "flos": 17967832796160.0, + "grad_norm": 2.5113923751985383, + "language_loss": 0.75655437, + "learning_rate": 6.493022139721245e-07, + "loss": 0.77797443, + "num_input_tokens_seen": 266883285, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.11334229, + "step": 12373, + "time_per_iteration": 2.390582799911499 + }, + { + "auxiliary_loss_clip": 0.01123626, + "auxiliary_loss_mlp": 0.01034337, + "balance_loss_clip": 1.04536057, + "balance_loss_mlp": 1.02140927, + "epoch": 0.7439651285134525, + "flos": 22958696949120.0, + "grad_norm": 1.8368792774399636, + "language_loss": 0.77525234, + "learning_rate": 6.49015012220858e-07, + "loss": 0.79683197, + "num_input_tokens_seen": 266900960, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.1293335, + "step": 12374, + "time_per_iteration": 2.440380573272705 + }, + { + "auxiliary_loss_clip": 0.01121433, + "auxiliary_loss_mlp": 0.01033391, + "balance_loss_clip": 1.0483278, + "balance_loss_mlp": 1.0211184, + "epoch": 0.7440252517661206, + "flos": 18806777827200.0, + "grad_norm": 2.04255849914135, + "language_loss": 0.7635383, + "learning_rate": 6.487278616990774e-07, + "loss": 0.78508651, + "num_input_tokens_seen": 266917710, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.12280273, + "step": 12375, + "time_per_iteration": 2.438823938369751 + }, + { + "auxiliary_loss_clip": 0.0111572, + "auxiliary_loss_mlp": 0.01025942, + "balance_loss_clip": 1.04595673, + "balance_loss_mlp": 1.01550508, + "epoch": 0.7440853750187885, + "flos": 20266295155200.0, + "grad_norm": 1.8759980006048709, + "language_loss": 0.77297235, + "learning_rate": 6.484407624176733e-07, + "loss": 0.79438895, + "num_input_tokens_seen": 266934220, + "router_z_loss_clip": 0.69775391, + "router_z_loss_mlp": 0.10437012, + "step": 12376, + "time_per_iteration": 2.4118614196777344 + }, + { + "auxiliary_loss_clip": 0.01119555, + "auxiliary_loss_mlp": 0.01026756, + "balance_loss_clip": 1.04663813, + "balance_loss_mlp": 1.01465023, + "epoch": 0.7441454982714565, + "flos": 25337276593920.0, + "grad_norm": 1.7969588416326385, + "language_loss": 0.79246891, + "learning_rate": 6.481537143875296e-07, + "loss": 0.81393206, + "num_input_tokens_seen": 266955210, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.12097168, + "step": 12377, + "time_per_iteration": 2.495159149169922 + }, + { + "auxiliary_loss_clip": 0.01119105, + "auxiliary_loss_mlp": 0.01026462, + "balance_loss_clip": 1.04561114, + "balance_loss_mlp": 1.01470208, + "epoch": 0.7442056215241245, + "flos": 64480910866560.0, + "grad_norm": 2.228409162219023, + "language_loss": 0.67417228, + "learning_rate": 6.478667176195322e-07, + "loss": 0.69562793, + "num_input_tokens_seen": 266976555, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11773682, + "step": 12378, + "time_per_iteration": 2.8429365158081055 + }, + { + "auxiliary_loss_clip": 0.01118258, + "auxiliary_loss_mlp": 0.01038257, + "balance_loss_clip": 1.04176867, + "balance_loss_mlp": 1.02512014, + "epoch": 0.7442657447767924, + "flos": 31285376242560.0, + "grad_norm": 1.8781262310130955, + "language_loss": 0.71867847, + "learning_rate": 6.475797721245648e-07, + "loss": 0.74024367, + "num_input_tokens_seen": 266997640, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.13140869, + "step": 12379, + "time_per_iteration": 2.5093135833740234 + }, + { + "auxiliary_loss_clip": 0.01119906, + "auxiliary_loss_mlp": 0.01036145, + "balance_loss_clip": 1.0466404, + "balance_loss_mlp": 1.02366352, + "epoch": 0.7443258680294604, + "flos": 20807899401600.0, + "grad_norm": 1.8012366561701418, + "language_loss": 0.65287161, + "learning_rate": 6.472928779135085e-07, + "loss": 0.6744321, + "num_input_tokens_seen": 267016165, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.12475586, + "step": 12380, + "time_per_iteration": 2.477118492126465 + }, + { + "auxiliary_loss_clip": 0.01121585, + "auxiliary_loss_mlp": 0.01028793, + "balance_loss_clip": 1.04856086, + "balance_loss_mlp": 1.01655602, + "epoch": 0.7443859912821283, + "flos": 22199833290240.0, + "grad_norm": 2.0304194185229276, + "language_loss": 0.78618038, + "learning_rate": 6.470060349972411e-07, + "loss": 0.80768418, + "num_input_tokens_seen": 267034075, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.12231445, + "step": 12381, + "time_per_iteration": 2.471465587615967 + }, + { + "auxiliary_loss_clip": 0.01120849, + "auxiliary_loss_mlp": 0.01034125, + "balance_loss_clip": 1.04643655, + "balance_loss_mlp": 1.02170396, + "epoch": 0.7444461145347964, + "flos": 22017838055040.0, + "grad_norm": 1.9896166400068742, + "language_loss": 0.73115921, + "learning_rate": 6.467192433866411e-07, + "loss": 0.75270903, + "num_input_tokens_seen": 267053645, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.12414551, + "step": 12382, + "time_per_iteration": 2.5352163314819336 + }, + { + "auxiliary_loss_clip": 0.0104086, + "auxiliary_loss_mlp": 0.01009309, + "balance_loss_clip": 1.0150497, + "balance_loss_mlp": 1.00788188, + "epoch": 0.7445062377874643, + "flos": 70559047704960.0, + "grad_norm": 0.6524620026082745, + "language_loss": 0.54625952, + "learning_rate": 6.464325030925831e-07, + "loss": 0.56676126, + "num_input_tokens_seen": 267121830, + "router_z_loss_clip": 0.25830078, + "router_z_loss_mlp": 0.01428223, + "step": 12383, + "time_per_iteration": 3.269881010055542 + }, + { + "auxiliary_loss_clip": 0.01116988, + "auxiliary_loss_mlp": 0.01030372, + "balance_loss_clip": 1.04231811, + "balance_loss_mlp": 1.01833177, + "epoch": 0.7445663610401323, + "flos": 22164425458560.0, + "grad_norm": 1.973799275570762, + "language_loss": 0.75993037, + "learning_rate": 6.461458141259395e-07, + "loss": 0.78140396, + "num_input_tokens_seen": 267141145, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.12030029, + "step": 12384, + "time_per_iteration": 3.962553024291992 + }, + { + "auxiliary_loss_clip": 0.01115131, + "auxiliary_loss_mlp": 0.01030206, + "balance_loss_clip": 1.04364324, + "balance_loss_mlp": 1.01879811, + "epoch": 0.7446264842928002, + "flos": 24170251714560.0, + "grad_norm": 2.114896607694523, + "language_loss": 0.79523546, + "learning_rate": 6.458591764975823e-07, + "loss": 0.81668884, + "num_input_tokens_seen": 267159280, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11401367, + "step": 12385, + "time_per_iteration": 2.494826078414917 + }, + { + "auxiliary_loss_clip": 0.0112121, + "auxiliary_loss_mlp": 0.01033231, + "balance_loss_clip": 1.04464555, + "balance_loss_mlp": 1.01986134, + "epoch": 0.7446866075454682, + "flos": 24134556574080.0, + "grad_norm": 1.765647431980292, + "language_loss": 0.81981939, + "learning_rate": 6.455725902183813e-07, + "loss": 0.84136379, + "num_input_tokens_seen": 267179390, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.13366699, + "step": 12386, + "time_per_iteration": 2.4619505405426025 + }, + { + "auxiliary_loss_clip": 0.01121027, + "auxiliary_loss_mlp": 0.01027972, + "balance_loss_clip": 1.04981661, + "balance_loss_mlp": 1.01687956, + "epoch": 0.7447467307981361, + "flos": 23548063305600.0, + "grad_norm": 1.8016763836588405, + "language_loss": 0.71572101, + "learning_rate": 6.452860552992037e-07, + "loss": 0.73721099, + "num_input_tokens_seen": 267198165, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.11102295, + "step": 12387, + "time_per_iteration": 2.4604415893554688 + }, + { + "auxiliary_loss_clip": 0.01116671, + "auxiliary_loss_mlp": 0.0103178, + "balance_loss_clip": 1.04149926, + "balance_loss_mlp": 1.02050304, + "epoch": 0.7448068540508042, + "flos": 19567832215680.0, + "grad_norm": 2.091042802159712, + "language_loss": 0.7043997, + "learning_rate": 6.449995717509138e-07, + "loss": 0.7258842, + "num_input_tokens_seen": 267214520, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.11273193, + "step": 12388, + "time_per_iteration": 2.4112179279327393 + }, + { + "auxiliary_loss_clip": 0.0111417, + "auxiliary_loss_mlp": 0.01031063, + "balance_loss_clip": 1.04035449, + "balance_loss_mlp": 1.01883221, + "epoch": 0.7448669773034721, + "flos": 21839721488640.0, + "grad_norm": 2.017363772366918, + "language_loss": 0.8521443, + "learning_rate": 6.447131395843761e-07, + "loss": 0.87359667, + "num_input_tokens_seen": 267236555, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.12231445, + "step": 12389, + "time_per_iteration": 2.46590256690979 + }, + { + "auxiliary_loss_clip": 0.01115771, + "auxiliary_loss_mlp": 0.01033765, + "balance_loss_clip": 1.0433042, + "balance_loss_mlp": 1.02206469, + "epoch": 0.7449271005561401, + "flos": 25155389099520.0, + "grad_norm": 4.873488677758931, + "language_loss": 0.7916224, + "learning_rate": 6.444267588104526e-07, + "loss": 0.81311774, + "num_input_tokens_seen": 267254800, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11706543, + "step": 12390, + "time_per_iteration": 2.488279104232788 + }, + { + "auxiliary_loss_clip": 0.01114217, + "auxiliary_loss_mlp": 0.01026447, + "balance_loss_clip": 1.04039884, + "balance_loss_mlp": 1.01390624, + "epoch": 0.7449872238088081, + "flos": 22273342473600.0, + "grad_norm": 2.0519004960602456, + "language_loss": 0.84924555, + "learning_rate": 6.441404294400014e-07, + "loss": 0.8706522, + "num_input_tokens_seen": 267274610, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.12548828, + "step": 12391, + "time_per_iteration": 2.4290590286254883 + }, + { + "auxiliary_loss_clip": 0.01113376, + "auxiliary_loss_mlp": 0.01029503, + "balance_loss_clip": 1.04053867, + "balance_loss_mlp": 1.01791, + "epoch": 0.745047347061476, + "flos": 20594805966720.0, + "grad_norm": 2.0226016244871734, + "language_loss": 0.73827636, + "learning_rate": 6.438541514838811e-07, + "loss": 0.75970513, + "num_input_tokens_seen": 267292600, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11590576, + "step": 12392, + "time_per_iteration": 2.4130473136901855 + }, + { + "auxiliary_loss_clip": 0.01104936, + "auxiliary_loss_mlp": 0.01032749, + "balance_loss_clip": 1.03588367, + "balance_loss_mlp": 1.01992226, + "epoch": 0.745107470314144, + "flos": 22127545169280.0, + "grad_norm": 1.7514043806310686, + "language_loss": 0.76908338, + "learning_rate": 6.435679249529487e-07, + "loss": 0.79046017, + "num_input_tokens_seen": 267311295, + "router_z_loss_clip": 0.69091797, + "router_z_loss_mlp": 0.12817383, + "step": 12393, + "time_per_iteration": 2.421928882598877 + }, + { + "auxiliary_loss_clip": 0.01112626, + "auxiliary_loss_mlp": 0.01034578, + "balance_loss_clip": 1.0400188, + "balance_loss_mlp": 1.02152491, + "epoch": 0.745167593566812, + "flos": 22236498097920.0, + "grad_norm": 2.1749137266808973, + "language_loss": 0.72717142, + "learning_rate": 6.432817498580552e-07, + "loss": 0.74864352, + "num_input_tokens_seen": 267328390, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.13049316, + "step": 12394, + "time_per_iteration": 2.495513439178467 + }, + { + "auxiliary_loss_clip": 0.01117473, + "auxiliary_loss_mlp": 0.01028243, + "balance_loss_clip": 1.04295564, + "balance_loss_mlp": 1.01598239, + "epoch": 0.74522771681948, + "flos": 20666232161280.0, + "grad_norm": 1.719189542508859, + "language_loss": 0.81703824, + "learning_rate": 6.429956262100535e-07, + "loss": 0.83849537, + "num_input_tokens_seen": 267348185, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.12268066, + "step": 12395, + "time_per_iteration": 2.40755558013916 + }, + { + "auxiliary_loss_clip": 0.01115842, + "auxiliary_loss_mlp": 0.01033092, + "balance_loss_clip": 1.04202604, + "balance_loss_mlp": 1.02084279, + "epoch": 0.7452878400721479, + "flos": 21106999952640.0, + "grad_norm": 2.159355841249028, + "language_loss": 0.71335489, + "learning_rate": 6.427095540197937e-07, + "loss": 0.73484421, + "num_input_tokens_seen": 267367010, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.12243652, + "step": 12396, + "time_per_iteration": 2.4918107986450195 + }, + { + "auxiliary_loss_clip": 0.01114432, + "auxiliary_loss_mlp": 0.01031543, + "balance_loss_clip": 1.03973997, + "balance_loss_mlp": 1.0192467, + "epoch": 0.7453479633248159, + "flos": 26688056474880.0, + "grad_norm": 2.519533983124939, + "language_loss": 0.68508792, + "learning_rate": 6.424235332981245e-07, + "loss": 0.70654774, + "num_input_tokens_seen": 267386605, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.1229248, + "step": 12397, + "time_per_iteration": 2.4685981273651123 + }, + { + "auxiliary_loss_clip": 0.01105905, + "auxiliary_loss_mlp": 0.01037106, + "balance_loss_clip": 1.03530562, + "balance_loss_mlp": 1.02506554, + "epoch": 0.7454080865774838, + "flos": 17016056167680.0, + "grad_norm": 2.806759743857977, + "language_loss": 0.77283406, + "learning_rate": 6.421375640558908e-07, + "loss": 0.7942642, + "num_input_tokens_seen": 267404135, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.12054443, + "step": 12398, + "time_per_iteration": 2.4673306941986084 + }, + { + "auxiliary_loss_clip": 0.01114653, + "auxiliary_loss_mlp": 0.01028416, + "balance_loss_clip": 1.04426217, + "balance_loss_mlp": 1.01619148, + "epoch": 0.7454682098301518, + "flos": 21323900229120.0, + "grad_norm": 1.7775890431781292, + "language_loss": 0.77984476, + "learning_rate": 6.418516463039363e-07, + "loss": 0.80127543, + "num_input_tokens_seen": 267423120, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.12231445, + "step": 12399, + "time_per_iteration": 2.456482172012329 + }, + { + "auxiliary_loss_clip": 0.01112252, + "auxiliary_loss_mlp": 0.01029006, + "balance_loss_clip": 1.0443362, + "balance_loss_mlp": 1.01719284, + "epoch": 0.7455283330828197, + "flos": 17858341163520.0, + "grad_norm": 2.0907422230406123, + "language_loss": 0.74238861, + "learning_rate": 6.415657800531038e-07, + "loss": 0.76380116, + "num_input_tokens_seen": 267441250, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 0.11816406, + "step": 12400, + "time_per_iteration": 2.4483423233032227 + }, + { + "auxiliary_loss_clip": 0.0110781, + "auxiliary_loss_mlp": 0.01030366, + "balance_loss_clip": 1.03656542, + "balance_loss_mlp": 1.01898158, + "epoch": 0.7455884563354878, + "flos": 30774259664640.0, + "grad_norm": 1.7322575412963708, + "language_loss": 0.82032454, + "learning_rate": 6.412799653142327e-07, + "loss": 0.84170628, + "num_input_tokens_seen": 267462820, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.1137085, + "step": 12401, + "time_per_iteration": 2.529266119003296 + }, + { + "auxiliary_loss_clip": 0.01117597, + "auxiliary_loss_mlp": 0.01034566, + "balance_loss_clip": 1.04378867, + "balance_loss_mlp": 1.02275229, + "epoch": 0.7456485795881557, + "flos": 23185545292800.0, + "grad_norm": 1.9072616387227042, + "language_loss": 0.65109801, + "learning_rate": 6.409942020981611e-07, + "loss": 0.6726197, + "num_input_tokens_seen": 267483065, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11816406, + "step": 12402, + "time_per_iteration": 2.5238096714019775 + }, + { + "auxiliary_loss_clip": 0.01112534, + "auxiliary_loss_mlp": 0.01032659, + "balance_loss_clip": 1.04093885, + "balance_loss_mlp": 1.02052391, + "epoch": 0.7457087028408237, + "flos": 38727144074880.0, + "grad_norm": 1.6274182022576216, + "language_loss": 0.73406863, + "learning_rate": 6.407084904157265e-07, + "loss": 0.75552052, + "num_input_tokens_seen": 267504825, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.12145996, + "step": 12403, + "time_per_iteration": 2.578899621963501 + }, + { + "auxiliary_loss_clip": 0.01044409, + "auxiliary_loss_mlp": 0.01008182, + "balance_loss_clip": 1.01900601, + "balance_loss_mlp": 1.00671232, + "epoch": 0.7457688260934917, + "flos": 56043737337600.0, + "grad_norm": 0.8363047109094853, + "language_loss": 0.58790892, + "learning_rate": 6.404228302777621e-07, + "loss": 0.6084348, + "num_input_tokens_seen": 267559260, + "router_z_loss_clip": 0.25341797, + "router_z_loss_mlp": 0.01469421, + "step": 12404, + "time_per_iteration": 4.328583478927612 + }, + { + "auxiliary_loss_clip": 0.01115415, + "auxiliary_loss_mlp": 0.01027661, + "balance_loss_clip": 1.04444146, + "balance_loss_mlp": 1.01705182, + "epoch": 0.7458289493461596, + "flos": 20116152305280.0, + "grad_norm": 1.771388464860694, + "language_loss": 0.77825463, + "learning_rate": 6.401372216950995e-07, + "loss": 0.79968536, + "num_input_tokens_seen": 267578720, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.10614014, + "step": 12405, + "time_per_iteration": 2.537973165512085 + }, + { + "auxiliary_loss_clip": 0.01109369, + "auxiliary_loss_mlp": 0.01029283, + "balance_loss_clip": 1.04098344, + "balance_loss_mlp": 1.01817822, + "epoch": 0.7458890725988276, + "flos": 20193073280640.0, + "grad_norm": 1.6018261310391593, + "language_loss": 0.6938774, + "learning_rate": 6.398516646785698e-07, + "loss": 0.7152639, + "num_input_tokens_seen": 267598250, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 0.11108398, + "step": 12406, + "time_per_iteration": 2.4701573848724365 + }, + { + "auxiliary_loss_clip": 0.01123773, + "auxiliary_loss_mlp": 0.01034324, + "balance_loss_clip": 1.0471375, + "balance_loss_mlp": 1.02169347, + "epoch": 0.7459491958514956, + "flos": 17018749687680.0, + "grad_norm": 1.8054358991640795, + "language_loss": 0.64793634, + "learning_rate": 6.39566159239002e-07, + "loss": 0.66951728, + "num_input_tokens_seen": 267615430, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.12628174, + "step": 12407, + "time_per_iteration": 2.443491220474243 + }, + { + "auxiliary_loss_clip": 0.01111771, + "auxiliary_loss_mlp": 0.01037795, + "balance_loss_clip": 1.03850555, + "balance_loss_mlp": 1.02364445, + "epoch": 0.7460093191041636, + "flos": 25078719519360.0, + "grad_norm": 1.868388216288447, + "language_loss": 0.72066408, + "learning_rate": 6.392807053872212e-07, + "loss": 0.74215972, + "num_input_tokens_seen": 267635075, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.14160156, + "step": 12408, + "time_per_iteration": 3.8875927925109863 + }, + { + "auxiliary_loss_clip": 0.01124226, + "auxiliary_loss_mlp": 0.01032907, + "balance_loss_clip": 1.046435, + "balance_loss_mlp": 1.01921582, + "epoch": 0.7460694423568315, + "flos": 21908525990400.0, + "grad_norm": 2.796089060575372, + "language_loss": 0.72743213, + "learning_rate": 6.38995303134053e-07, + "loss": 0.74900341, + "num_input_tokens_seen": 267654105, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.13684082, + "step": 12409, + "time_per_iteration": 2.49520206451416 + }, + { + "auxiliary_loss_clip": 0.01116722, + "auxiliary_loss_mlp": 0.01037025, + "balance_loss_clip": 1.04377794, + "balance_loss_mlp": 1.02599823, + "epoch": 0.7461295656094995, + "flos": 21215737399680.0, + "grad_norm": 1.6768231020842561, + "language_loss": 0.65964139, + "learning_rate": 6.38709952490319e-07, + "loss": 0.68117881, + "num_input_tokens_seen": 267673090, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11029053, + "step": 12410, + "time_per_iteration": 2.4529285430908203 + }, + { + "auxiliary_loss_clip": 0.01111592, + "auxiliary_loss_mlp": 0.01027936, + "balance_loss_clip": 1.04235697, + "balance_loss_mlp": 1.01608086, + "epoch": 0.7461896888621674, + "flos": 22346851656960.0, + "grad_norm": 2.060732210028184, + "language_loss": 0.84414899, + "learning_rate": 6.384246534668396e-07, + "loss": 0.86554426, + "num_input_tokens_seen": 267690605, + "router_z_loss_clip": 0.69287109, + "router_z_loss_mlp": 0.11853027, + "step": 12411, + "time_per_iteration": 2.4386022090911865 + }, + { + "auxiliary_loss_clip": 0.01117511, + "auxiliary_loss_mlp": 0.01025112, + "balance_loss_clip": 1.04517007, + "balance_loss_mlp": 1.0131377, + "epoch": 0.7462498121148354, + "flos": 25482930243840.0, + "grad_norm": 1.5798095933430916, + "language_loss": 0.78290093, + "learning_rate": 6.381394060744339e-07, + "loss": 0.80432713, + "num_input_tokens_seen": 267710540, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11975098, + "step": 12412, + "time_per_iteration": 2.456343173980713 + }, + { + "auxiliary_loss_clip": 0.01127164, + "auxiliary_loss_mlp": 0.01031877, + "balance_loss_clip": 1.05155301, + "balance_loss_mlp": 1.02054596, + "epoch": 0.7463099353675033, + "flos": 33947936812800.0, + "grad_norm": 1.9715738169240515, + "language_loss": 0.62626088, + "learning_rate": 6.378542103239188e-07, + "loss": 0.64785135, + "num_input_tokens_seen": 267730780, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.11322021, + "step": 12413, + "time_per_iteration": 2.5377213954925537 + }, + { + "auxiliary_loss_clip": 0.01056684, + "auxiliary_loss_mlp": 0.01005177, + "balance_loss_clip": 1.03217435, + "balance_loss_mlp": 1.00386548, + "epoch": 0.7463700586201714, + "flos": 62767723691520.0, + "grad_norm": 0.7540744112567843, + "language_loss": 0.54887789, + "learning_rate": 6.375690662261082e-07, + "loss": 0.56949651, + "num_input_tokens_seen": 267794240, + "router_z_loss_clip": 0.24487305, + "router_z_loss_mlp": 0.01312256, + "step": 12414, + "time_per_iteration": 4.49786114692688 + }, + { + "auxiliary_loss_clip": 0.01114963, + "auxiliary_loss_mlp": 0.01026616, + "balance_loss_clip": 1.04285991, + "balance_loss_mlp": 1.0148263, + "epoch": 0.7464301818728393, + "flos": 33432654257280.0, + "grad_norm": 1.918572637555755, + "language_loss": 0.55126733, + "learning_rate": 6.372839737918154e-07, + "loss": 0.57268316, + "num_input_tokens_seen": 267817190, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11791992, + "step": 12415, + "time_per_iteration": 2.542088747024536 + }, + { + "auxiliary_loss_clip": 0.01118095, + "auxiliary_loss_mlp": 0.01030278, + "balance_loss_clip": 1.04757762, + "balance_loss_mlp": 1.0183208, + "epoch": 0.7464903051255073, + "flos": 26869872142080.0, + "grad_norm": 1.6673649802609762, + "language_loss": 0.75023919, + "learning_rate": 6.369989330318506e-07, + "loss": 0.77172291, + "num_input_tokens_seen": 267836245, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.11956787, + "step": 12416, + "time_per_iteration": 2.5582070350646973 + }, + { + "auxiliary_loss_clip": 0.0111296, + "auxiliary_loss_mlp": 0.01034417, + "balance_loss_clip": 1.04066479, + "balance_loss_mlp": 1.02233505, + "epoch": 0.7465504283781753, + "flos": 44086954775040.0, + "grad_norm": 1.6129388641317908, + "language_loss": 0.69537604, + "learning_rate": 6.367139439570233e-07, + "loss": 0.7168498, + "num_input_tokens_seen": 267858310, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.12084961, + "step": 12417, + "time_per_iteration": 2.6733014583587646 + }, + { + "auxiliary_loss_clip": 0.01119156, + "auxiliary_loss_mlp": 0.0103195, + "balance_loss_clip": 1.04527235, + "balance_loss_mlp": 1.01911712, + "epoch": 0.7466105516308432, + "flos": 19676102785920.0, + "grad_norm": 1.7401763991047359, + "language_loss": 0.7375493, + "learning_rate": 6.364290065781392e-07, + "loss": 0.75906038, + "num_input_tokens_seen": 267876345, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.12835693, + "step": 12418, + "time_per_iteration": 2.431044101715088 + }, + { + "auxiliary_loss_clip": 0.01113885, + "auxiliary_loss_mlp": 0.01030873, + "balance_loss_clip": 1.04216337, + "balance_loss_mlp": 1.01851678, + "epoch": 0.7466706748835112, + "flos": 20520722165760.0, + "grad_norm": 1.772064934959386, + "language_loss": 0.6937536, + "learning_rate": 6.361441209060039e-07, + "loss": 0.71520114, + "num_input_tokens_seen": 267896740, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.12359619, + "step": 12419, + "time_per_iteration": 2.4378252029418945 + }, + { + "auxiliary_loss_clip": 0.01106384, + "auxiliary_loss_mlp": 0.01029195, + "balance_loss_clip": 1.03808737, + "balance_loss_mlp": 1.01818621, + "epoch": 0.7467307981361792, + "flos": 21690260997120.0, + "grad_norm": 1.7922121265486874, + "language_loss": 0.74821299, + "learning_rate": 6.358592869514216e-07, + "loss": 0.7695688, + "num_input_tokens_seen": 267914765, + "router_z_loss_clip": 0.68310547, + "router_z_loss_mlp": 0.11004639, + "step": 12420, + "time_per_iteration": 2.5266993045806885 + }, + { + "auxiliary_loss_clip": 0.01120701, + "auxiliary_loss_mlp": 0.01028854, + "balance_loss_clip": 1.04611278, + "balance_loss_mlp": 1.01640844, + "epoch": 0.7467909213888472, + "flos": 19573686132480.0, + "grad_norm": 1.7033154267199886, + "language_loss": 0.67253369, + "learning_rate": 6.355745047251904e-07, + "loss": 0.69402921, + "num_input_tokens_seen": 267934085, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.12445068, + "step": 12421, + "time_per_iteration": 2.4430854320526123 + }, + { + "auxiliary_loss_clip": 0.0111955, + "auxiliary_loss_mlp": 0.01037375, + "balance_loss_clip": 1.04222608, + "balance_loss_mlp": 1.02309394, + "epoch": 0.7468510446415151, + "flos": 23695225326720.0, + "grad_norm": 2.3802516047083757, + "language_loss": 0.72416461, + "learning_rate": 6.352897742381107e-07, + "loss": 0.74573386, + "num_input_tokens_seen": 267955170, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.14282227, + "step": 12422, + "time_per_iteration": 2.4975173473358154 + }, + { + "auxiliary_loss_clip": 0.01114508, + "auxiliary_loss_mlp": 0.01030999, + "balance_loss_clip": 1.04393506, + "balance_loss_mlp": 1.01959622, + "epoch": 0.7469111678941831, + "flos": 29315783831040.0, + "grad_norm": 1.9365726832525731, + "language_loss": 0.74572635, + "learning_rate": 6.350050955009796e-07, + "loss": 0.7671814, + "num_input_tokens_seen": 267974980, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.11407471, + "step": 12423, + "time_per_iteration": 2.501737117767334 + }, + { + "auxiliary_loss_clip": 0.01111235, + "auxiliary_loss_mlp": 0.01022093, + "balance_loss_clip": 1.04207659, + "balance_loss_mlp": 1.01146555, + "epoch": 0.746971291146851, + "flos": 21798639308160.0, + "grad_norm": 1.310397346725759, + "language_loss": 0.67498398, + "learning_rate": 6.347204685245929e-07, + "loss": 0.69631726, + "num_input_tokens_seen": 267994985, + "router_z_loss_clip": 0.69189453, + "router_z_loss_mlp": 0.10638428, + "step": 12424, + "time_per_iteration": 2.508493661880493 + }, + { + "auxiliary_loss_clip": 0.01122474, + "auxiliary_loss_mlp": 0.01033739, + "balance_loss_clip": 1.04839003, + "balance_loss_mlp": 1.0217762, + "epoch": 0.747031414399519, + "flos": 36245070368640.0, + "grad_norm": 1.6986132533002656, + "language_loss": 0.75069684, + "learning_rate": 6.344358933197418e-07, + "loss": 0.772259, + "num_input_tokens_seen": 268014985, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11968994, + "step": 12425, + "time_per_iteration": 2.534414768218994 + }, + { + "auxiliary_loss_clip": 0.01119513, + "auxiliary_loss_mlp": 0.01029464, + "balance_loss_clip": 1.0468874, + "balance_loss_mlp": 1.01740575, + "epoch": 0.7470915376521869, + "flos": 19974916028160.0, + "grad_norm": 2.509956411108191, + "language_loss": 0.6981287, + "learning_rate": 6.341513698972194e-07, + "loss": 0.71961844, + "num_input_tokens_seen": 268034395, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.12060547, + "step": 12426, + "time_per_iteration": 2.5102221965789795 + }, + { + "auxiliary_loss_clip": 0.01113715, + "auxiliary_loss_mlp": 0.01031151, + "balance_loss_clip": 1.04356527, + "balance_loss_mlp": 1.01999307, + "epoch": 0.747151660904855, + "flos": 20084299920000.0, + "grad_norm": 1.4439183782279832, + "language_loss": 0.65435445, + "learning_rate": 6.338668982678139e-07, + "loss": 0.67580312, + "num_input_tokens_seen": 268054485, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.11157227, + "step": 12427, + "time_per_iteration": 2.5341594219207764 + }, + { + "auxiliary_loss_clip": 0.01113487, + "auxiliary_loss_mlp": 0.01025561, + "balance_loss_clip": 1.04223466, + "balance_loss_mlp": 1.01341987, + "epoch": 0.7472117841575229, + "flos": 16290373697280.0, + "grad_norm": 1.5791531971931703, + "language_loss": 0.74696654, + "learning_rate": 6.335824784423118e-07, + "loss": 0.76835704, + "num_input_tokens_seen": 268072250, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.12133789, + "step": 12428, + "time_per_iteration": 3.9312634468078613 + }, + { + "auxiliary_loss_clip": 0.01115028, + "auxiliary_loss_mlp": 0.01041542, + "balance_loss_clip": 1.04021239, + "balance_loss_mlp": 1.02683139, + "epoch": 0.7472719074101909, + "flos": 21389939383680.0, + "grad_norm": 2.225775083561819, + "language_loss": 0.58558494, + "learning_rate": 6.33298110431499e-07, + "loss": 0.60715061, + "num_input_tokens_seen": 268089840, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.14709473, + "step": 12429, + "time_per_iteration": 2.466977834701538 + }, + { + "auxiliary_loss_clip": 0.01118421, + "auxiliary_loss_mlp": 0.01029182, + "balance_loss_clip": 1.04242206, + "balance_loss_mlp": 1.01724327, + "epoch": 0.7473320306628589, + "flos": 29643289061760.0, + "grad_norm": 1.9090025385818097, + "language_loss": 0.60572243, + "learning_rate": 6.330137942461595e-07, + "loss": 0.62719846, + "num_input_tokens_seen": 268109360, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.11932373, + "step": 12430, + "time_per_iteration": 2.5056042671203613 + }, + { + "auxiliary_loss_clip": 0.0111354, + "auxiliary_loss_mlp": 0.01028226, + "balance_loss_clip": 1.04233277, + "balance_loss_mlp": 1.01628709, + "epoch": 0.7473921539155268, + "flos": 24136100858880.0, + "grad_norm": 1.3964293283426348, + "language_loss": 0.75419849, + "learning_rate": 6.327295298970734e-07, + "loss": 0.77561617, + "num_input_tokens_seen": 268131840, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.11938477, + "step": 12431, + "time_per_iteration": 2.4863297939300537 + }, + { + "auxiliary_loss_clip": 0.01113723, + "auxiliary_loss_mlp": 0.01027102, + "balance_loss_clip": 1.04098284, + "balance_loss_mlp": 1.01539528, + "epoch": 0.7474522771681948, + "flos": 17487958072320.0, + "grad_norm": 1.9100197470143097, + "language_loss": 0.75484371, + "learning_rate": 6.32445317395021e-07, + "loss": 0.77625203, + "num_input_tokens_seen": 268148300, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11712646, + "step": 12432, + "time_per_iteration": 2.389547109603882 + }, + { + "auxiliary_loss_clip": 0.01114379, + "auxiliary_loss_mlp": 0.01036169, + "balance_loss_clip": 1.03906107, + "balance_loss_mlp": 1.02165496, + "epoch": 0.7475124004208628, + "flos": 16727298733440.0, + "grad_norm": 2.961565482727629, + "language_loss": 0.69856441, + "learning_rate": 6.321611567507787e-07, + "loss": 0.72006989, + "num_input_tokens_seen": 268166450, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.14526367, + "step": 12433, + "time_per_iteration": 2.404839038848877 + }, + { + "auxiliary_loss_clip": 0.01111532, + "auxiliary_loss_mlp": 0.01038798, + "balance_loss_clip": 1.03898478, + "balance_loss_mlp": 1.02503562, + "epoch": 0.7475725236735308, + "flos": 19720237622400.0, + "grad_norm": 1.7125252250710397, + "language_loss": 0.67373836, + "learning_rate": 6.318770479751232e-07, + "loss": 0.69524169, + "num_input_tokens_seen": 268186165, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.13769531, + "step": 12434, + "time_per_iteration": 2.4046926498413086 + }, + { + "auxiliary_loss_clip": 0.01107992, + "auxiliary_loss_mlp": 0.01026024, + "balance_loss_clip": 1.04174495, + "balance_loss_mlp": 1.01571822, + "epoch": 0.7476326469261987, + "flos": 26286000566400.0, + "grad_norm": 1.4122899014994497, + "language_loss": 0.79495144, + "learning_rate": 6.315929910788263e-07, + "loss": 0.81629163, + "num_input_tokens_seen": 268208145, + "router_z_loss_clip": 0.66259766, + "router_z_loss_mlp": 0.10308838, + "step": 12435, + "time_per_iteration": 2.4954662322998047 + }, + { + "auxiliary_loss_clip": 0.01117945, + "auxiliary_loss_mlp": 0.01031911, + "balance_loss_clip": 1.04330516, + "balance_loss_mlp": 1.01994216, + "epoch": 0.7476927701788667, + "flos": 31831828824960.0, + "grad_norm": 14.48062672694107, + "language_loss": 0.67967141, + "learning_rate": 6.313089860726604e-07, + "loss": 0.70116997, + "num_input_tokens_seen": 268228345, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11968994, + "step": 12436, + "time_per_iteration": 2.521224021911621 + }, + { + "auxiliary_loss_clip": 0.01118083, + "auxiliary_loss_mlp": 0.01030604, + "balance_loss_clip": 1.04366589, + "balance_loss_mlp": 1.01858735, + "epoch": 0.7477528934315346, + "flos": 31795487239680.0, + "grad_norm": 1.5301963235864404, + "language_loss": 0.70588058, + "learning_rate": 6.31025032967396e-07, + "loss": 0.72736746, + "num_input_tokens_seen": 268250260, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.12011719, + "step": 12437, + "time_per_iteration": 2.5348734855651855 + }, + { + "auxiliary_loss_clip": 0.01110695, + "auxiliary_loss_mlp": 0.01025237, + "balance_loss_clip": 1.04277599, + "balance_loss_mlp": 1.01473486, + "epoch": 0.7478130166842026, + "flos": 20371979946240.0, + "grad_norm": 2.2437191648555217, + "language_loss": 0.67031252, + "learning_rate": 6.307411317737986e-07, + "loss": 0.69167185, + "num_input_tokens_seen": 268268440, + "router_z_loss_clip": 0.67919922, + "router_z_loss_mlp": 0.10498047, + "step": 12438, + "time_per_iteration": 2.47053861618042 + }, + { + "auxiliary_loss_clip": 0.01113946, + "auxiliary_loss_mlp": 0.01036074, + "balance_loss_clip": 1.04002643, + "balance_loss_mlp": 1.02383125, + "epoch": 0.7478731399368705, + "flos": 18148930191360.0, + "grad_norm": 1.5834519094720267, + "language_loss": 0.80951172, + "learning_rate": 6.304572825026344e-07, + "loss": 0.83101195, + "num_input_tokens_seen": 268285765, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.12255859, + "step": 12439, + "time_per_iteration": 2.4002037048339844 + }, + { + "auxiliary_loss_clip": 0.01114644, + "auxiliary_loss_mlp": 0.01034121, + "balance_loss_clip": 1.04350996, + "balance_loss_mlp": 1.02267098, + "epoch": 0.7479332631895386, + "flos": 15267889146240.0, + "grad_norm": 2.1638150369226743, + "language_loss": 0.71082431, + "learning_rate": 6.301734851646674e-07, + "loss": 0.73231196, + "num_input_tokens_seen": 268304015, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.11450195, + "step": 12440, + "time_per_iteration": 2.4035961627960205 + }, + { + "auxiliary_loss_clip": 0.01112403, + "auxiliary_loss_mlp": 0.01026324, + "balance_loss_clip": 1.04264688, + "balance_loss_mlp": 1.01524925, + "epoch": 0.7479933864422065, + "flos": 21142515525120.0, + "grad_norm": 1.6034542974089976, + "language_loss": 0.74410951, + "learning_rate": 6.298897397706597e-07, + "loss": 0.76549679, + "num_input_tokens_seen": 268323290, + "router_z_loss_clip": 0.69775391, + "router_z_loss_mlp": 0.11077881, + "step": 12441, + "time_per_iteration": 2.449793577194214 + }, + { + "auxiliary_loss_clip": 0.01120007, + "auxiliary_loss_mlp": 0.01028776, + "balance_loss_clip": 1.04585314, + "balance_loss_mlp": 1.01641393, + "epoch": 0.7480535096948745, + "flos": 14392027912320.0, + "grad_norm": 2.1647497523267005, + "language_loss": 0.82886374, + "learning_rate": 6.296060463313698e-07, + "loss": 0.85035151, + "num_input_tokens_seen": 268339490, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12353516, + "step": 12442, + "time_per_iteration": 2.399614095687866 + }, + { + "auxiliary_loss_clip": 0.01116134, + "auxiliary_loss_mlp": 0.01032754, + "balance_loss_clip": 1.04282546, + "balance_loss_mlp": 1.02015328, + "epoch": 0.7481136329475425, + "flos": 27344683048320.0, + "grad_norm": 2.703462520514475, + "language_loss": 0.62821561, + "learning_rate": 6.293224048575565e-07, + "loss": 0.64970452, + "num_input_tokens_seen": 268359865, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.12591553, + "step": 12443, + "time_per_iteration": 2.4811758995056152 + }, + { + "auxiliary_loss_clip": 0.01115188, + "auxiliary_loss_mlp": 0.01026672, + "balance_loss_clip": 1.04529297, + "balance_loss_mlp": 1.01559758, + "epoch": 0.7481737562002104, + "flos": 19531454716800.0, + "grad_norm": 1.885575605927081, + "language_loss": 0.71668684, + "learning_rate": 6.29038815359975e-07, + "loss": 0.73810548, + "num_input_tokens_seen": 268377065, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.11077881, + "step": 12444, + "time_per_iteration": 2.438059091567993 + }, + { + "auxiliary_loss_clip": 0.011066, + "auxiliary_loss_mlp": 0.01027096, + "balance_loss_clip": 1.03606749, + "balance_loss_mlp": 1.0152998, + "epoch": 0.7482338794528784, + "flos": 21760035166080.0, + "grad_norm": 1.4175844087545986, + "language_loss": 0.68906319, + "learning_rate": 6.287552778493786e-07, + "loss": 0.7104001, + "num_input_tokens_seen": 268396935, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.11798096, + "step": 12445, + "time_per_iteration": 2.4584906101226807 + }, + { + "auxiliary_loss_clip": 0.01116256, + "auxiliary_loss_mlp": 0.01022819, + "balance_loss_clip": 1.0428071, + "balance_loss_mlp": 1.01148188, + "epoch": 0.7482940027055464, + "flos": 18697358021760.0, + "grad_norm": 1.7036417929763867, + "language_loss": 0.74445826, + "learning_rate": 6.28471792336519e-07, + "loss": 0.76584899, + "num_input_tokens_seen": 268414460, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11340332, + "step": 12446, + "time_per_iteration": 2.420518398284912 + }, + { + "auxiliary_loss_clip": 0.01125299, + "auxiliary_loss_mlp": 0.0102894, + "balance_loss_clip": 1.04835129, + "balance_loss_mlp": 1.01608956, + "epoch": 0.7483541259582144, + "flos": 15998024903040.0, + "grad_norm": 2.162323049291806, + "language_loss": 0.72788978, + "learning_rate": 6.281883588321475e-07, + "loss": 0.74943209, + "num_input_tokens_seen": 268432225, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.12854004, + "step": 12447, + "time_per_iteration": 2.448960304260254 + }, + { + "auxiliary_loss_clip": 0.01117961, + "auxiliary_loss_mlp": 0.01031372, + "balance_loss_clip": 1.04439628, + "balance_loss_mlp": 1.01928973, + "epoch": 0.7484142492108823, + "flos": 25556295772800.0, + "grad_norm": 2.9628769105977106, + "language_loss": 0.72082794, + "learning_rate": 6.279049773470109e-07, + "loss": 0.74232131, + "num_input_tokens_seen": 268449270, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.12078857, + "step": 12448, + "time_per_iteration": 3.9229512214660645 + }, + { + "auxiliary_loss_clip": 0.0112077, + "auxiliary_loss_mlp": 0.01033783, + "balance_loss_clip": 1.04773295, + "balance_loss_mlp": 1.02194536, + "epoch": 0.7484743724635503, + "flos": 22887737631360.0, + "grad_norm": 2.058095950620671, + "language_loss": 0.739779, + "learning_rate": 6.276216478918543e-07, + "loss": 0.76132452, + "num_input_tokens_seen": 268467250, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11846924, + "step": 12449, + "time_per_iteration": 2.476979970932007 + }, + { + "auxiliary_loss_clip": 0.01126069, + "auxiliary_loss_mlp": 0.01034211, + "balance_loss_clip": 1.04939401, + "balance_loss_mlp": 1.02169383, + "epoch": 0.7485344957162182, + "flos": 25300288563840.0, + "grad_norm": 1.9685859870361029, + "language_loss": 0.60960287, + "learning_rate": 6.273383704774225e-07, + "loss": 0.63120568, + "num_input_tokens_seen": 268487270, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.12512207, + "step": 12450, + "time_per_iteration": 2.521897554397583 + }, + { + "auxiliary_loss_clip": 0.01110034, + "auxiliary_loss_mlp": 0.01025108, + "balance_loss_clip": 1.04247499, + "balance_loss_mlp": 1.01448631, + "epoch": 0.7485946189688862, + "flos": 27053016612480.0, + "grad_norm": 1.887756562244993, + "language_loss": 0.70502973, + "learning_rate": 6.270551451144577e-07, + "loss": 0.72638112, + "num_input_tokens_seen": 268508020, + "router_z_loss_clip": 0.67578125, + "router_z_loss_mlp": 0.10614014, + "step": 12451, + "time_per_iteration": 3.9522383213043213 + }, + { + "auxiliary_loss_clip": 0.01125498, + "auxiliary_loss_mlp": 0.01027622, + "balance_loss_clip": 1.04909611, + "balance_loss_mlp": 1.01598072, + "epoch": 0.7486547422215541, + "flos": 26906752431360.0, + "grad_norm": 2.1667571779590653, + "language_loss": 0.8024112, + "learning_rate": 6.267719718136988e-07, + "loss": 0.82394242, + "num_input_tokens_seen": 268527375, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.11645508, + "step": 12452, + "time_per_iteration": 2.5396809577941895 + }, + { + "auxiliary_loss_clip": 0.01121029, + "auxiliary_loss_mlp": 0.01030752, + "balance_loss_clip": 1.04588044, + "balance_loss_mlp": 1.01885462, + "epoch": 0.7487148654742222, + "flos": 22346277039360.0, + "grad_norm": 2.566176934489125, + "language_loss": 0.71515548, + "learning_rate": 6.264888505858843e-07, + "loss": 0.7366733, + "num_input_tokens_seen": 268544870, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.11895752, + "step": 12453, + "time_per_iteration": 2.462831497192383 + }, + { + "auxiliary_loss_clip": 0.01119803, + "auxiliary_loss_mlp": 0.01032775, + "balance_loss_clip": 1.04730082, + "balance_loss_mlp": 1.02072918, + "epoch": 0.7487749887268901, + "flos": 23038814234880.0, + "grad_norm": 1.629195949805835, + "language_loss": 0.73811376, + "learning_rate": 6.262057814417517e-07, + "loss": 0.7596395, + "num_input_tokens_seen": 268564580, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.12060547, + "step": 12454, + "time_per_iteration": 2.444517135620117 + }, + { + "auxiliary_loss_clip": 0.01046787, + "auxiliary_loss_mlp": 0.01002627, + "balance_loss_clip": 1.02229333, + "balance_loss_mlp": 1.00147069, + "epoch": 0.7488351119795581, + "flos": 71525294536320.0, + "grad_norm": 0.7328506248621497, + "language_loss": 0.59413528, + "learning_rate": 6.259227643920322e-07, + "loss": 0.61462939, + "num_input_tokens_seen": 268629550, + "router_z_loss_clip": 0.24438477, + "router_z_loss_mlp": 0.01158142, + "step": 12455, + "time_per_iteration": 3.186350107192993 + }, + { + "auxiliary_loss_clip": 0.01108508, + "auxiliary_loss_mlp": 0.01030181, + "balance_loss_clip": 1.03785682, + "balance_loss_mlp": 1.01862383, + "epoch": 0.748895235232226, + "flos": 17196255722880.0, + "grad_norm": 1.9618273005489288, + "language_loss": 0.79739738, + "learning_rate": 6.256397994474592e-07, + "loss": 0.8187843, + "num_input_tokens_seen": 268646645, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.11566162, + "step": 12456, + "time_per_iteration": 2.445420026779175 + }, + { + "auxiliary_loss_clip": 0.01044747, + "auxiliary_loss_mlp": 0.01007983, + "balance_loss_clip": 1.02057278, + "balance_loss_mlp": 1.00676739, + "epoch": 0.748955358484894, + "flos": 58979256336000.0, + "grad_norm": 0.8366385014720905, + "language_loss": 0.61423934, + "learning_rate": 6.25356886618763e-07, + "loss": 0.63476658, + "num_input_tokens_seen": 268702275, + "router_z_loss_clip": 0.24194336, + "router_z_loss_mlp": 0.012146, + "step": 12457, + "time_per_iteration": 3.0083601474761963 + }, + { + "auxiliary_loss_clip": 0.01127315, + "auxiliary_loss_mlp": 0.01029608, + "balance_loss_clip": 1.05321467, + "balance_loss_mlp": 1.0185219, + "epoch": 0.749015481737562, + "flos": 11360413054080.0, + "grad_norm": 2.095173953670709, + "language_loss": 0.68485689, + "learning_rate": 6.250740259166711e-07, + "loss": 0.70642614, + "num_input_tokens_seen": 268716265, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11090088, + "step": 12458, + "time_per_iteration": 3.814009666442871 + }, + { + "auxiliary_loss_clip": 0.01127882, + "auxiliary_loss_mlp": 0.01027785, + "balance_loss_clip": 1.05496478, + "balance_loss_mlp": 1.01682353, + "epoch": 0.74907560499023, + "flos": 21106497162240.0, + "grad_norm": 1.7170032642950062, + "language_loss": 0.80106544, + "learning_rate": 6.247912173519106e-07, + "loss": 0.82262212, + "num_input_tokens_seen": 268734330, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.10961914, + "step": 12459, + "time_per_iteration": 2.4450266361236572 + }, + { + "auxiliary_loss_clip": 0.01122629, + "auxiliary_loss_mlp": 0.01030056, + "balance_loss_clip": 1.04962063, + "balance_loss_mlp": 1.01806939, + "epoch": 0.749135728242898, + "flos": 22268027260800.0, + "grad_norm": 1.6285827276833091, + "language_loss": 0.80426466, + "learning_rate": 6.245084609352043e-07, + "loss": 0.8257916, + "num_input_tokens_seen": 268753500, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11975098, + "step": 12460, + "time_per_iteration": 2.4382803440093994 + }, + { + "auxiliary_loss_clip": 0.01109334, + "auxiliary_loss_mlp": 0.01029389, + "balance_loss_clip": 1.03836417, + "balance_loss_mlp": 1.01684856, + "epoch": 0.7491958514955659, + "flos": 24057527857920.0, + "grad_norm": 1.888564466494051, + "language_loss": 0.86340117, + "learning_rate": 6.242257566772755e-07, + "loss": 0.88478839, + "num_input_tokens_seen": 268772055, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.12530518, + "step": 12461, + "time_per_iteration": 2.505418539047241 + }, + { + "auxiliary_loss_clip": 0.01116011, + "auxiliary_loss_mlp": 0.01028772, + "balance_loss_clip": 1.04521537, + "balance_loss_mlp": 1.01697612, + "epoch": 0.7492559747482339, + "flos": 24492118510080.0, + "grad_norm": 2.0388986150293764, + "language_loss": 0.69549799, + "learning_rate": 6.239431045888435e-07, + "loss": 0.71694583, + "num_input_tokens_seen": 268792265, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.11779785, + "step": 12462, + "time_per_iteration": 2.534137010574341 + }, + { + "auxiliary_loss_clip": 0.01115013, + "auxiliary_loss_mlp": 0.01028946, + "balance_loss_clip": 1.04313803, + "balance_loss_mlp": 1.0163517, + "epoch": 0.7493160980009018, + "flos": 27745338326400.0, + "grad_norm": 2.088309456940866, + "language_loss": 0.70609027, + "learning_rate": 6.236605046806267e-07, + "loss": 0.72752988, + "num_input_tokens_seen": 268812735, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.12597656, + "step": 12463, + "time_per_iteration": 2.6118805408477783 + }, + { + "auxiliary_loss_clip": 0.01114811, + "auxiliary_loss_mlp": 0.0103407, + "balance_loss_clip": 1.04193687, + "balance_loss_mlp": 1.02254212, + "epoch": 0.7493762212535698, + "flos": 30226190970240.0, + "grad_norm": 1.9148172532988665, + "language_loss": 0.77598631, + "learning_rate": 6.233779569633419e-07, + "loss": 0.7974751, + "num_input_tokens_seen": 268833090, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11529541, + "step": 12464, + "time_per_iteration": 2.6465675830841064 + }, + { + "auxiliary_loss_clip": 0.01106881, + "auxiliary_loss_mlp": 0.01026157, + "balance_loss_clip": 1.0364567, + "balance_loss_mlp": 1.01484942, + "epoch": 0.7494363445062378, + "flos": 21944472526080.0, + "grad_norm": 1.683729984640327, + "language_loss": 0.7875253, + "learning_rate": 6.230954614477034e-07, + "loss": 0.80885565, + "num_input_tokens_seen": 268851880, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.11297607, + "step": 12465, + "time_per_iteration": 2.4662368297576904 + }, + { + "auxiliary_loss_clip": 0.01126364, + "auxiliary_loss_mlp": 0.01031386, + "balance_loss_clip": 1.04762268, + "balance_loss_mlp": 1.01801658, + "epoch": 0.7494964677589058, + "flos": 12490342162560.0, + "grad_norm": 2.4216627531043406, + "language_loss": 0.74442923, + "learning_rate": 6.22813018144422e-07, + "loss": 0.76600671, + "num_input_tokens_seen": 268867910, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.13366699, + "step": 12466, + "time_per_iteration": 2.456413984298706 + }, + { + "auxiliary_loss_clip": 0.01111673, + "auxiliary_loss_mlp": 0.01030552, + "balance_loss_clip": 1.03928542, + "balance_loss_mlp": 1.01901889, + "epoch": 0.7495565910115737, + "flos": 21653057485440.0, + "grad_norm": 2.0227266925659717, + "language_loss": 0.6652087, + "learning_rate": 6.22530627064209e-07, + "loss": 0.68663096, + "num_input_tokens_seen": 268887260, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11541748, + "step": 12467, + "time_per_iteration": 2.4950802326202393 + }, + { + "auxiliary_loss_clip": 0.01114088, + "auxiliary_loss_mlp": 0.01033478, + "balance_loss_clip": 1.04146457, + "balance_loss_mlp": 1.0205617, + "epoch": 0.7496167142642417, + "flos": 15268535591040.0, + "grad_norm": 2.3955826148476023, + "language_loss": 0.75911301, + "learning_rate": 6.222482882177735e-07, + "loss": 0.78058863, + "num_input_tokens_seen": 268902520, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.12921143, + "step": 12468, + "time_per_iteration": 2.4083988666534424 + }, + { + "auxiliary_loss_clip": 0.01116042, + "auxiliary_loss_mlp": 0.01033194, + "balance_loss_clip": 1.04199791, + "balance_loss_mlp": 1.02036691, + "epoch": 0.7496768375169096, + "flos": 22054933825920.0, + "grad_norm": 24.72263476121939, + "language_loss": 0.69537097, + "learning_rate": 6.219660016158201e-07, + "loss": 0.71686327, + "num_input_tokens_seen": 268920970, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.1282959, + "step": 12469, + "time_per_iteration": 2.452785015106201 + }, + { + "auxiliary_loss_clip": 0.01117607, + "auxiliary_loss_mlp": 0.01034464, + "balance_loss_clip": 1.04347372, + "balance_loss_mlp": 1.02215576, + "epoch": 0.7497369607695776, + "flos": 19057038860160.0, + "grad_norm": 2.017123987168362, + "language_loss": 0.69618094, + "learning_rate": 6.216837672690543e-07, + "loss": 0.71770167, + "num_input_tokens_seen": 268936600, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12298584, + "step": 12470, + "time_per_iteration": 2.5270628929138184 + }, + { + "auxiliary_loss_clip": 0.01112842, + "auxiliary_loss_mlp": 0.01033857, + "balance_loss_clip": 1.03725624, + "balance_loss_mlp": 1.01977825, + "epoch": 0.7497970840222457, + "flos": 21617434172160.0, + "grad_norm": 1.7098609430799698, + "language_loss": 0.75403631, + "learning_rate": 6.214015851881793e-07, + "loss": 0.77550328, + "num_input_tokens_seen": 268956560, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.14074707, + "step": 12471, + "time_per_iteration": 3.871826410293579 + }, + { + "auxiliary_loss_clip": 0.01123029, + "auxiliary_loss_mlp": 0.01025125, + "balance_loss_clip": 1.04841566, + "balance_loss_mlp": 1.0127157, + "epoch": 0.7498572072749136, + "flos": 13735580906880.0, + "grad_norm": 2.9323454116512986, + "language_loss": 0.77138615, + "learning_rate": 6.211194553838929e-07, + "loss": 0.79286766, + "num_input_tokens_seen": 268973945, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.12402344, + "step": 12472, + "time_per_iteration": 2.383338689804077 + }, + { + "auxiliary_loss_clip": 0.01110895, + "auxiliary_loss_mlp": 0.01029868, + "balance_loss_clip": 1.04083538, + "balance_loss_mlp": 1.01854897, + "epoch": 0.7499173305275816, + "flos": 22966526113920.0, + "grad_norm": 1.493630935629013, + "language_loss": 0.84217346, + "learning_rate": 6.208373778668951e-07, + "loss": 0.86358106, + "num_input_tokens_seen": 268993245, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.11328125, + "step": 12473, + "time_per_iteration": 2.4500229358673096 + }, + { + "auxiliary_loss_clip": 0.01125146, + "auxiliary_loss_mlp": 0.01036706, + "balance_loss_clip": 1.04851079, + "balance_loss_mlp": 1.02346206, + "epoch": 0.7499774537802495, + "flos": 22740467869440.0, + "grad_norm": 1.9520063861870462, + "language_loss": 0.73943937, + "learning_rate": 6.205553526478829e-07, + "loss": 0.76105785, + "num_input_tokens_seen": 269012125, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.13244629, + "step": 12474, + "time_per_iteration": 2.4710116386413574 + }, + { + "auxiliary_loss_clip": 0.01120802, + "auxiliary_loss_mlp": 0.01037425, + "balance_loss_clip": 1.04402447, + "balance_loss_mlp": 1.0233407, + "epoch": 0.7500375770329175, + "flos": 18296559089280.0, + "grad_norm": 2.277576636643511, + "language_loss": 0.74904114, + "learning_rate": 6.202733797375492e-07, + "loss": 0.77062345, + "num_input_tokens_seen": 269030545, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.14093018, + "step": 12475, + "time_per_iteration": 2.490290880203247 + }, + { + "auxiliary_loss_clip": 0.01125661, + "auxiliary_loss_mlp": 0.01037838, + "balance_loss_clip": 1.04405069, + "balance_loss_mlp": 1.02432585, + "epoch": 0.7500977002855854, + "flos": 19169978198400.0, + "grad_norm": 2.291043838956338, + "language_loss": 0.80324322, + "learning_rate": 6.199914591465878e-07, + "loss": 0.82487822, + "num_input_tokens_seen": 269048180, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.13494873, + "step": 12476, + "time_per_iteration": 2.3869805335998535 + }, + { + "auxiliary_loss_clip": 0.01111533, + "auxiliary_loss_mlp": 0.0102662, + "balance_loss_clip": 1.03993523, + "balance_loss_mlp": 1.01487803, + "epoch": 0.7501578235382534, + "flos": 22163886754560.0, + "grad_norm": 1.8626921535655305, + "language_loss": 0.77692699, + "learning_rate": 6.19709590885688e-07, + "loss": 0.79830855, + "num_input_tokens_seen": 269068600, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11743164, + "step": 12477, + "time_per_iteration": 2.478334665298462 + }, + { + "auxiliary_loss_clip": 0.01071923, + "auxiliary_loss_mlp": 0.01002675, + "balance_loss_clip": 1.04733157, + "balance_loss_mlp": 1.00119662, + "epoch": 0.7502179467909214, + "flos": 64465040033280.0, + "grad_norm": 0.80820794518327, + "language_loss": 0.54379201, + "learning_rate": 6.194277749655394e-07, + "loss": 0.564538, + "num_input_tokens_seen": 269119045, + "router_z_loss_clip": 0.24584961, + "router_z_loss_mlp": 0.01477051, + "step": 12478, + "time_per_iteration": 3.045955181121826 + }, + { + "auxiliary_loss_clip": 0.0111427, + "auxiliary_loss_mlp": 0.01030133, + "balance_loss_clip": 1.0422523, + "balance_loss_mlp": 1.0184325, + "epoch": 0.7502780700435894, + "flos": 20478275268480.0, + "grad_norm": 1.9789849761357834, + "language_loss": 0.7983551, + "learning_rate": 6.191460113968272e-07, + "loss": 0.81979918, + "num_input_tokens_seen": 269136755, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11700439, + "step": 12479, + "time_per_iteration": 2.431641101837158 + }, + { + "auxiliary_loss_clip": 0.01117675, + "auxiliary_loss_mlp": 0.01036241, + "balance_loss_clip": 1.04120648, + "balance_loss_mlp": 1.02148867, + "epoch": 0.7503381932962573, + "flos": 20445273648000.0, + "grad_norm": 2.1656225135835525, + "language_loss": 0.62961096, + "learning_rate": 6.188643001902369e-07, + "loss": 0.65115011, + "num_input_tokens_seen": 269156120, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.14752197, + "step": 12480, + "time_per_iteration": 2.4246222972869873 + }, + { + "auxiliary_loss_clip": 0.01110076, + "auxiliary_loss_mlp": 0.01032384, + "balance_loss_clip": 1.04152918, + "balance_loss_mlp": 1.02137506, + "epoch": 0.7503983165489253, + "flos": 22381936266240.0, + "grad_norm": 1.7109180657017649, + "language_loss": 0.7808423, + "learning_rate": 6.185826413564512e-07, + "loss": 0.8022669, + "num_input_tokens_seen": 269175650, + "router_z_loss_clip": 0.68652344, + "router_z_loss_mlp": 0.11010742, + "step": 12481, + "time_per_iteration": 2.4551258087158203 + }, + { + "auxiliary_loss_clip": 0.01112839, + "auxiliary_loss_mlp": 0.01032254, + "balance_loss_clip": 1.03860497, + "balance_loss_mlp": 1.01980901, + "epoch": 0.7504584398015932, + "flos": 24899453717760.0, + "grad_norm": 1.7195780659923847, + "language_loss": 0.71444654, + "learning_rate": 6.183010349061501e-07, + "loss": 0.73589754, + "num_input_tokens_seen": 269197080, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.12438965, + "step": 12482, + "time_per_iteration": 2.441107988357544 + }, + { + "auxiliary_loss_clip": 0.01117904, + "auxiliary_loss_mlp": 0.0103525, + "balance_loss_clip": 1.04474044, + "balance_loss_mlp": 1.02304912, + "epoch": 0.7505185630542612, + "flos": 25885237547520.0, + "grad_norm": 1.8758292673442967, + "language_loss": 0.70614755, + "learning_rate": 6.180194808500118e-07, + "loss": 0.72767907, + "num_input_tokens_seen": 269218600, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.12194824, + "step": 12483, + "time_per_iteration": 2.4976563453674316 + }, + { + "auxiliary_loss_clip": 0.01110874, + "auxiliary_loss_mlp": 0.01034464, + "balance_loss_clip": 1.03877032, + "balance_loss_mlp": 1.02155936, + "epoch": 0.7505786863069293, + "flos": 23143852581120.0, + "grad_norm": 1.7647996560146813, + "language_loss": 0.74323285, + "learning_rate": 6.177379791987131e-07, + "loss": 0.76468629, + "num_input_tokens_seen": 269239245, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.12902832, + "step": 12484, + "time_per_iteration": 2.426461935043335 + }, + { + "auxiliary_loss_clip": 0.01118176, + "auxiliary_loss_mlp": 0.01035827, + "balance_loss_clip": 1.04242325, + "balance_loss_mlp": 1.02289271, + "epoch": 0.7506388095595972, + "flos": 16983377769600.0, + "grad_norm": 2.5077807301470285, + "language_loss": 0.84800881, + "learning_rate": 6.174565299629295e-07, + "loss": 0.8695488, + "num_input_tokens_seen": 269258520, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.12939453, + "step": 12485, + "time_per_iteration": 2.413583278656006 + }, + { + "auxiliary_loss_clip": 0.01120227, + "auxiliary_loss_mlp": 0.010296, + "balance_loss_clip": 1.04452932, + "balance_loss_mlp": 1.01761973, + "epoch": 0.7506989328122652, + "flos": 22344984149760.0, + "grad_norm": 1.4697000711554942, + "language_loss": 0.7813853, + "learning_rate": 6.171751331533323e-07, + "loss": 0.80288357, + "num_input_tokens_seen": 269278320, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.11993408, + "step": 12486, + "time_per_iteration": 2.423201560974121 + }, + { + "auxiliary_loss_clip": 0.01115975, + "auxiliary_loss_mlp": 0.01030807, + "balance_loss_clip": 1.04199898, + "balance_loss_mlp": 1.01771164, + "epoch": 0.7507590560649331, + "flos": 25776069137280.0, + "grad_norm": 2.5623425208811335, + "language_loss": 0.73089868, + "learning_rate": 6.168937887805932e-07, + "loss": 0.75236654, + "num_input_tokens_seen": 269298025, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.13092041, + "step": 12487, + "time_per_iteration": 2.4517927169799805 + }, + { + "auxiliary_loss_clip": 0.01114731, + "auxiliary_loss_mlp": 0.01025042, + "balance_loss_clip": 1.04168248, + "balance_loss_mlp": 1.01331735, + "epoch": 0.7508191793176011, + "flos": 24279420124800.0, + "grad_norm": 1.9523487387372005, + "language_loss": 0.67901516, + "learning_rate": 6.166124968553801e-07, + "loss": 0.70041287, + "num_input_tokens_seen": 269316770, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11730957, + "step": 12488, + "time_per_iteration": 2.473583221435547 + }, + { + "auxiliary_loss_clip": 0.01114504, + "auxiliary_loss_mlp": 0.01034699, + "balance_loss_clip": 1.04336035, + "balance_loss_mlp": 1.02237916, + "epoch": 0.750879302570269, + "flos": 19899575251200.0, + "grad_norm": 1.7893692010844666, + "language_loss": 0.77441263, + "learning_rate": 6.163312573883592e-07, + "loss": 0.79590476, + "num_input_tokens_seen": 269334755, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.12316895, + "step": 12489, + "time_per_iteration": 2.4451799392700195 + }, + { + "auxiliary_loss_clip": 0.0111535, + "auxiliary_loss_mlp": 0.01029795, + "balance_loss_clip": 1.04311609, + "balance_loss_mlp": 1.01870251, + "epoch": 0.750939425822937, + "flos": 29205681667200.0, + "grad_norm": 2.1198890646928215, + "language_loss": 0.74910629, + "learning_rate": 6.160500703901956e-07, + "loss": 0.7705577, + "num_input_tokens_seen": 269353810, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11090088, + "step": 12490, + "time_per_iteration": 2.4801549911499023 + }, + { + "auxiliary_loss_clip": 0.01118857, + "auxiliary_loss_mlp": 0.01029642, + "balance_loss_clip": 1.04704952, + "balance_loss_mlp": 1.01738751, + "epoch": 0.750999549075605, + "flos": 21142300043520.0, + "grad_norm": 1.9556488901310005, + "language_loss": 0.78536677, + "learning_rate": 6.157689358715527e-07, + "loss": 0.80685174, + "num_input_tokens_seen": 269372910, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.12249756, + "step": 12491, + "time_per_iteration": 3.988168716430664 + }, + { + "auxiliary_loss_clip": 0.0111134, + "auxiliary_loss_mlp": 0.01030121, + "balance_loss_clip": 1.04091883, + "balance_loss_mlp": 1.01934433, + "epoch": 0.751059672328273, + "flos": 23547740083200.0, + "grad_norm": 1.707925002553535, + "language_loss": 0.76964319, + "learning_rate": 6.154878538430899e-07, + "loss": 0.79105783, + "num_input_tokens_seen": 269391545, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.10778809, + "step": 12492, + "time_per_iteration": 2.461195468902588 + }, + { + "auxiliary_loss_clip": 0.01115034, + "auxiliary_loss_mlp": 0.01029449, + "balance_loss_clip": 1.04347789, + "balance_loss_mlp": 1.0185411, + "epoch": 0.7511197955809409, + "flos": 18989742729600.0, + "grad_norm": 1.914572361198578, + "language_loss": 0.71261436, + "learning_rate": 6.152068243154671e-07, + "loss": 0.73405915, + "num_input_tokens_seen": 269408530, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.10919189, + "step": 12493, + "time_per_iteration": 2.4981067180633545 + }, + { + "auxiliary_loss_clip": 0.01118112, + "auxiliary_loss_mlp": 0.01026216, + "balance_loss_clip": 1.04461145, + "balance_loss_mlp": 1.01443243, + "epoch": 0.7511799188336089, + "flos": 22046961006720.0, + "grad_norm": 1.754331265918814, + "language_loss": 0.80845606, + "learning_rate": 6.149258472993395e-07, + "loss": 0.82989943, + "num_input_tokens_seen": 269425930, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11785889, + "step": 12494, + "time_per_iteration": 2.462204694747925 + }, + { + "auxiliary_loss_clip": 0.01116239, + "auxiliary_loss_mlp": 0.01034639, + "balance_loss_clip": 1.04247499, + "balance_loss_mlp": 1.02110314, + "epoch": 0.7512400420862768, + "flos": 16467125546880.0, + "grad_norm": 2.515151302799068, + "language_loss": 0.78386569, + "learning_rate": 6.146449228053634e-07, + "loss": 0.8053745, + "num_input_tokens_seen": 269443945, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.13531494, + "step": 12495, + "time_per_iteration": 3.896636724472046 + }, + { + "auxiliary_loss_clip": 0.01118208, + "auxiliary_loss_mlp": 0.01033529, + "balance_loss_clip": 1.04540777, + "balance_loss_mlp": 1.02188826, + "epoch": 0.7513001653389448, + "flos": 20448326304000.0, + "grad_norm": 2.4103975911540476, + "language_loss": 0.71091717, + "learning_rate": 6.143640508441898e-07, + "loss": 0.73243457, + "num_input_tokens_seen": 269463625, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11645508, + "step": 12496, + "time_per_iteration": 2.490901470184326 + }, + { + "auxiliary_loss_clip": 0.0111491, + "auxiliary_loss_mlp": 0.01028552, + "balance_loss_clip": 1.04348183, + "balance_loss_mlp": 1.01732862, + "epoch": 0.7513602885916129, + "flos": 23476816679040.0, + "grad_norm": 1.6087147340269337, + "language_loss": 0.7826615, + "learning_rate": 6.140832314264705e-07, + "loss": 0.8040961, + "num_input_tokens_seen": 269483415, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11212158, + "step": 12497, + "time_per_iteration": 2.4715309143066406 + }, + { + "auxiliary_loss_clip": 0.01118645, + "auxiliary_loss_mlp": 0.01032771, + "balance_loss_clip": 1.04359674, + "balance_loss_mlp": 1.02055824, + "epoch": 0.7514204118442808, + "flos": 26797224885120.0, + "grad_norm": 1.612713439072265, + "language_loss": 0.76935446, + "learning_rate": 6.13802464562855e-07, + "loss": 0.79086864, + "num_input_tokens_seen": 269504635, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.12213135, + "step": 12498, + "time_per_iteration": 2.4799582958221436 + }, + { + "auxiliary_loss_clip": 0.011207, + "auxiliary_loss_mlp": 0.01029694, + "balance_loss_clip": 1.05195427, + "balance_loss_mlp": 1.01914394, + "epoch": 0.7514805350969488, + "flos": 19865639877120.0, + "grad_norm": 1.8111033467359, + "language_loss": 0.74414366, + "learning_rate": 6.135217502639878e-07, + "loss": 0.76564765, + "num_input_tokens_seen": 269523955, + "router_z_loss_clip": 0.68701172, + "router_z_loss_mlp": 0.10546875, + "step": 12499, + "time_per_iteration": 2.439340114593506 + }, + { + "auxiliary_loss_clip": 0.01121487, + "auxiliary_loss_mlp": 0.01025696, + "balance_loss_clip": 1.04891121, + "balance_loss_mlp": 1.01431155, + "epoch": 0.7515406583496167, + "flos": 24571553437440.0, + "grad_norm": 2.1320562012774316, + "language_loss": 0.79741925, + "learning_rate": 6.132410885405148e-07, + "loss": 0.81889105, + "num_input_tokens_seen": 269544410, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11383057, + "step": 12500, + "time_per_iteration": 2.46039080619812 + }, + { + "auxiliary_loss_clip": 0.0112777, + "auxiliary_loss_mlp": 0.01033636, + "balance_loss_clip": 1.04809356, + "balance_loss_mlp": 1.01940286, + "epoch": 0.7516007816022847, + "flos": 20120246455680.0, + "grad_norm": 2.40083811476804, + "language_loss": 0.73900652, + "learning_rate": 6.129604794030794e-07, + "loss": 0.76062059, + "num_input_tokens_seen": 269563315, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.14239502, + "step": 12501, + "time_per_iteration": 3.955064535140991 + }, + { + "auxiliary_loss_clip": 0.01114995, + "auxiliary_loss_mlp": 0.01023452, + "balance_loss_clip": 1.04389596, + "balance_loss_mlp": 1.01204932, + "epoch": 0.7516609048549526, + "flos": 22784638619520.0, + "grad_norm": 1.6420051361077204, + "language_loss": 0.785694, + "learning_rate": 6.126799228623207e-07, + "loss": 0.80707848, + "num_input_tokens_seen": 269583950, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.11401367, + "step": 12502, + "time_per_iteration": 2.4453535079956055 + }, + { + "auxiliary_loss_clip": 0.01121842, + "auxiliary_loss_mlp": 0.01031831, + "balance_loss_clip": 1.04937267, + "balance_loss_mlp": 1.01973724, + "epoch": 0.7517210281076206, + "flos": 10634012311680.0, + "grad_norm": 2.1346147774826028, + "language_loss": 0.71027434, + "learning_rate": 6.123994189288786e-07, + "loss": 0.73181105, + "num_input_tokens_seen": 269600120, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.12103271, + "step": 12503, + "time_per_iteration": 2.4422595500946045 + }, + { + "auxiliary_loss_clip": 0.01046388, + "auxiliary_loss_mlp": 0.01008799, + "balance_loss_clip": 1.02186227, + "balance_loss_mlp": 1.00752389, + "epoch": 0.7517811513602886, + "flos": 66052221275520.0, + "grad_norm": 0.989204926356361, + "language_loss": 0.63965505, + "learning_rate": 6.121189676133903e-07, + "loss": 0.66020691, + "num_input_tokens_seen": 269659815, + "router_z_loss_clip": 0.24536133, + "router_z_loss_mlp": 0.01275635, + "step": 12504, + "time_per_iteration": 3.024372100830078 + }, + { + "auxiliary_loss_clip": 0.01102139, + "auxiliary_loss_mlp": 0.01029665, + "balance_loss_clip": 1.03431797, + "balance_loss_mlp": 1.01745236, + "epoch": 0.7518412746129566, + "flos": 37268345018880.0, + "grad_norm": 1.5137712110331285, + "language_loss": 0.68807304, + "learning_rate": 6.118385689264896e-07, + "loss": 0.70939106, + "num_input_tokens_seen": 269684565, + "router_z_loss_clip": 0.67822266, + "router_z_loss_mlp": 0.12213135, + "step": 12505, + "time_per_iteration": 2.6367297172546387 + }, + { + "auxiliary_loss_clip": 0.01040989, + "auxiliary_loss_mlp": 0.01006344, + "balance_loss_clip": 1.01636946, + "balance_loss_mlp": 1.00476146, + "epoch": 0.7519013978656245, + "flos": 60518567727360.0, + "grad_norm": 0.6430204376921332, + "language_loss": 0.55110371, + "learning_rate": 6.11558222878809e-07, + "loss": 0.57157695, + "num_input_tokens_seen": 269752325, + "router_z_loss_clip": 0.24658203, + "router_z_loss_mlp": 0.01582336, + "step": 12506, + "time_per_iteration": 3.1598072052001953 + }, + { + "auxiliary_loss_clip": 0.01115161, + "auxiliary_loss_mlp": 0.01037139, + "balance_loss_clip": 1.0411818, + "balance_loss_mlp": 1.02371621, + "epoch": 0.7519615211182925, + "flos": 18806885568000.0, + "grad_norm": 1.7387186157219474, + "language_loss": 0.78562599, + "learning_rate": 6.112779294809796e-07, + "loss": 0.80714893, + "num_input_tokens_seen": 269770630, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.13415527, + "step": 12507, + "time_per_iteration": 2.4338998794555664 + }, + { + "auxiliary_loss_clip": 0.01122086, + "auxiliary_loss_mlp": 0.01030987, + "balance_loss_clip": 1.04799879, + "balance_loss_mlp": 1.0193764, + "epoch": 0.7520216443709604, + "flos": 14575244209920.0, + "grad_norm": 1.8259907668066075, + "language_loss": 0.71024585, + "learning_rate": 6.10997688743631e-07, + "loss": 0.7317766, + "num_input_tokens_seen": 269787280, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11608887, + "step": 12508, + "time_per_iteration": 2.3778393268585205 + }, + { + "auxiliary_loss_clip": 0.01111075, + "auxiliary_loss_mlp": 0.01026793, + "balance_loss_clip": 1.04148126, + "balance_loss_mlp": 1.01526546, + "epoch": 0.7520817676236284, + "flos": 17056599644160.0, + "grad_norm": 1.7973230105061695, + "language_loss": 0.72273886, + "learning_rate": 6.107175006773885e-07, + "loss": 0.7441175, + "num_input_tokens_seen": 269805205, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.11529541, + "step": 12509, + "time_per_iteration": 2.4518351554870605 + }, + { + "auxiliary_loss_clip": 0.01122284, + "auxiliary_loss_mlp": 0.01033222, + "balance_loss_clip": 1.04649985, + "balance_loss_mlp": 1.02014506, + "epoch": 0.7521418908762965, + "flos": 25666397936640.0, + "grad_norm": 1.669415194551313, + "language_loss": 0.6208272, + "learning_rate": 6.104373652928785e-07, + "loss": 0.64238226, + "num_input_tokens_seen": 269824820, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.13085938, + "step": 12510, + "time_per_iteration": 2.5160608291625977 + }, + { + "auxiliary_loss_clip": 0.01117387, + "auxiliary_loss_mlp": 0.01028437, + "balance_loss_clip": 1.04619718, + "balance_loss_mlp": 1.01699901, + "epoch": 0.7522020141289644, + "flos": 20886759711360.0, + "grad_norm": 1.7781392279613353, + "language_loss": 0.81451321, + "learning_rate": 6.10157282600722e-07, + "loss": 0.83597147, + "num_input_tokens_seen": 269842825, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.11437988, + "step": 12511, + "time_per_iteration": 2.447420120239258 + }, + { + "auxiliary_loss_clip": 0.01118334, + "auxiliary_loss_mlp": 0.01032225, + "balance_loss_clip": 1.04213881, + "balance_loss_mlp": 1.01940441, + "epoch": 0.7522621373816324, + "flos": 12640305444480.0, + "grad_norm": 1.7081136466983498, + "language_loss": 0.75983202, + "learning_rate": 6.098772526115412e-07, + "loss": 0.78133762, + "num_input_tokens_seen": 269859000, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.12811279, + "step": 12512, + "time_per_iteration": 2.403012990951538 + }, + { + "auxiliary_loss_clip": 0.01112408, + "auxiliary_loss_mlp": 0.01027376, + "balance_loss_clip": 1.04449344, + "balance_loss_mlp": 1.0166055, + "epoch": 0.7523222606343003, + "flos": 25626141768960.0, + "grad_norm": 1.699546193929408, + "language_loss": 0.82559031, + "learning_rate": 6.095972753359537e-07, + "loss": 0.8469882, + "num_input_tokens_seen": 269878895, + "router_z_loss_clip": 0.67919922, + "router_z_loss_mlp": 0.10766602, + "step": 12513, + "time_per_iteration": 2.4802231788635254 + }, + { + "auxiliary_loss_clip": 0.01115387, + "auxiliary_loss_mlp": 0.01033834, + "balance_loss_clip": 1.04130638, + "balance_loss_mlp": 1.02100146, + "epoch": 0.7523823838869683, + "flos": 20448900921600.0, + "grad_norm": 7.068050724432461, + "language_loss": 0.75031334, + "learning_rate": 6.093173507845771e-07, + "loss": 0.77180552, + "num_input_tokens_seen": 269897280, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.128479, + "step": 12514, + "time_per_iteration": 3.8587210178375244 + }, + { + "auxiliary_loss_clip": 0.0111608, + "auxiliary_loss_mlp": 0.01027774, + "balance_loss_clip": 1.04615355, + "balance_loss_mlp": 1.01723611, + "epoch": 0.7524425071396362, + "flos": 14720610551040.0, + "grad_norm": 1.8982458433178984, + "language_loss": 0.68980885, + "learning_rate": 6.090374789680271e-07, + "loss": 0.71124732, + "num_input_tokens_seen": 269914640, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.10546875, + "step": 12515, + "time_per_iteration": 2.4343864917755127 + }, + { + "auxiliary_loss_clip": 0.01114057, + "auxiliary_loss_mlp": 0.01034739, + "balance_loss_clip": 1.0431118, + "balance_loss_mlp": 1.02373016, + "epoch": 0.7525026303923043, + "flos": 30592048947840.0, + "grad_norm": 1.6357931324030495, + "language_loss": 0.70085359, + "learning_rate": 6.087576598969137e-07, + "loss": 0.7223416, + "num_input_tokens_seen": 269934960, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.11016846, + "step": 12516, + "time_per_iteration": 2.6361753940582275 + }, + { + "auxiliary_loss_clip": 0.01115964, + "auxiliary_loss_mlp": 0.01031277, + "balance_loss_clip": 1.04796433, + "balance_loss_mlp": 1.01951098, + "epoch": 0.7525627536449722, + "flos": 24791757765120.0, + "grad_norm": 1.475826674959354, + "language_loss": 0.89637315, + "learning_rate": 6.084778935818495e-07, + "loss": 0.91784561, + "num_input_tokens_seen": 269956655, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 0.11755371, + "step": 12517, + "time_per_iteration": 2.586402416229248 + }, + { + "auxiliary_loss_clip": 0.01118756, + "auxiliary_loss_mlp": 0.01035003, + "balance_loss_clip": 1.04549861, + "balance_loss_mlp": 1.02339768, + "epoch": 0.7526228768976402, + "flos": 20779782030720.0, + "grad_norm": 1.8127562168898497, + "language_loss": 0.74385399, + "learning_rate": 6.081981800334437e-07, + "loss": 0.76539153, + "num_input_tokens_seen": 269976835, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11608887, + "step": 12518, + "time_per_iteration": 2.4646291732788086 + }, + { + "auxiliary_loss_clip": 0.01047722, + "auxiliary_loss_mlp": 0.01007685, + "balance_loss_clip": 1.02225101, + "balance_loss_mlp": 1.00625205, + "epoch": 0.7526830001503081, + "flos": 66559243703040.0, + "grad_norm": 0.743991593198899, + "language_loss": 0.55746579, + "learning_rate": 6.079185192623017e-07, + "loss": 0.57801986, + "num_input_tokens_seen": 270040630, + "router_z_loss_clip": 0.25463867, + "router_z_loss_mlp": 0.01434326, + "step": 12519, + "time_per_iteration": 3.2668304443359375 + }, + { + "auxiliary_loss_clip": 0.01106464, + "auxiliary_loss_mlp": 0.01027172, + "balance_loss_clip": 1.03603172, + "balance_loss_mlp": 1.01685476, + "epoch": 0.7527431234029761, + "flos": 23477894087040.0, + "grad_norm": 1.484600557491821, + "language_loss": 0.77775395, + "learning_rate": 6.07638911279029e-07, + "loss": 0.79909033, + "num_input_tokens_seen": 270059695, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.10321045, + "step": 12520, + "time_per_iteration": 2.504387855529785 + }, + { + "auxiliary_loss_clip": 0.01116209, + "auxiliary_loss_mlp": 0.01038413, + "balance_loss_clip": 1.04306424, + "balance_loss_mlp": 1.02710569, + "epoch": 0.752803246655644, + "flos": 22049546785920.0, + "grad_norm": 3.967460267489708, + "language_loss": 0.74221468, + "learning_rate": 6.07359356094229e-07, + "loss": 0.76376092, + "num_input_tokens_seen": 270078420, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11315918, + "step": 12521, + "time_per_iteration": 2.4224283695220947 + }, + { + "auxiliary_loss_clip": 0.01122542, + "auxiliary_loss_mlp": 0.01037643, + "balance_loss_clip": 1.04342067, + "balance_loss_mlp": 1.02407718, + "epoch": 0.752863369908312, + "flos": 30153795108480.0, + "grad_norm": 1.7968646803089976, + "language_loss": 0.67440665, + "learning_rate": 6.070798537185016e-07, + "loss": 0.6960085, + "num_input_tokens_seen": 270097040, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.13574219, + "step": 12522, + "time_per_iteration": 2.5172767639160156 + }, + { + "auxiliary_loss_clip": 0.01129435, + "auxiliary_loss_mlp": 0.01038209, + "balance_loss_clip": 1.05373228, + "balance_loss_mlp": 1.02578139, + "epoch": 0.7529234931609801, + "flos": 24567638855040.0, + "grad_norm": 5.56107433344238, + "language_loss": 0.78198421, + "learning_rate": 6.068004041624453e-07, + "loss": 0.80366063, + "num_input_tokens_seen": 270116365, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.12420654, + "step": 12523, + "time_per_iteration": 2.45440411567688 + }, + { + "auxiliary_loss_clip": 0.01118765, + "auxiliary_loss_mlp": 0.01026924, + "balance_loss_clip": 1.04694867, + "balance_loss_mlp": 1.01589751, + "epoch": 0.752983616413648, + "flos": 23112395245440.0, + "grad_norm": 2.0045600337632568, + "language_loss": 0.80713183, + "learning_rate": 6.065210074366571e-07, + "loss": 0.82858872, + "num_input_tokens_seen": 270135395, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11035156, + "step": 12524, + "time_per_iteration": 2.45361328125 + }, + { + "auxiliary_loss_clip": 0.01115542, + "auxiliary_loss_mlp": 0.01026969, + "balance_loss_clip": 1.044644, + "balance_loss_mlp": 1.01581144, + "epoch": 0.753043739666316, + "flos": 24316946858880.0, + "grad_norm": 1.5559098784813998, + "language_loss": 0.73833954, + "learning_rate": 6.062416635517326e-07, + "loss": 0.75976467, + "num_input_tokens_seen": 270156425, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.1116333, + "step": 12525, + "time_per_iteration": 2.458625316619873 + }, + { + "auxiliary_loss_clip": 0.01117518, + "auxiliary_loss_mlp": 0.01028791, + "balance_loss_clip": 1.04522491, + "balance_loss_mlp": 1.01695359, + "epoch": 0.7531038629189839, + "flos": 24243294021120.0, + "grad_norm": 1.8382671146577858, + "language_loss": 0.72278237, + "learning_rate": 6.059623725182641e-07, + "loss": 0.74424553, + "num_input_tokens_seen": 270176905, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.1184082, + "step": 12526, + "time_per_iteration": 2.471508264541626 + }, + { + "auxiliary_loss_clip": 0.0111564, + "auxiliary_loss_mlp": 0.01027616, + "balance_loss_clip": 1.04295981, + "balance_loss_mlp": 1.01633298, + "epoch": 0.7531639861716519, + "flos": 30188807890560.0, + "grad_norm": 1.7061064457808195, + "language_loss": 0.72122562, + "learning_rate": 6.056831343468414e-07, + "loss": 0.74265814, + "num_input_tokens_seen": 270196640, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.112854, + "step": 12527, + "time_per_iteration": 2.4825057983398438 + }, + { + "auxiliary_loss_clip": 0.01123906, + "auxiliary_loss_mlp": 0.01028819, + "balance_loss_clip": 1.05113173, + "balance_loss_mlp": 1.01794684, + "epoch": 0.7532241094243198, + "flos": 18223193560320.0, + "grad_norm": 1.8372151896623141, + "language_loss": 0.81291449, + "learning_rate": 6.054039490480539e-07, + "loss": 0.83444172, + "num_input_tokens_seen": 270213905, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.10876465, + "step": 12528, + "time_per_iteration": 2.434089422225952 + }, + { + "auxiliary_loss_clip": 0.01115805, + "auxiliary_loss_mlp": 0.01032243, + "balance_loss_clip": 1.04469287, + "balance_loss_mlp": 1.01997638, + "epoch": 0.7532842326769879, + "flos": 20881049448960.0, + "grad_norm": 2.1611381438368125, + "language_loss": 0.85143536, + "learning_rate": 6.051248166324892e-07, + "loss": 0.87291586, + "num_input_tokens_seen": 270231995, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.12261963, + "step": 12529, + "time_per_iteration": 2.432528018951416 + }, + { + "auxiliary_loss_clip": 0.01123012, + "auxiliary_loss_mlp": 0.0103082, + "balance_loss_clip": 1.04878187, + "balance_loss_mlp": 1.01888132, + "epoch": 0.7533443559296558, + "flos": 18078689145600.0, + "grad_norm": 1.8583777411198779, + "language_loss": 0.73906273, + "learning_rate": 6.048457371107303e-07, + "loss": 0.76060098, + "num_input_tokens_seen": 270251480, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.1194458, + "step": 12530, + "time_per_iteration": 2.4817755222320557 + }, + { + "auxiliary_loss_clip": 0.01060775, + "auxiliary_loss_mlp": 0.01000788, + "balance_loss_clip": 1.03654218, + "balance_loss_mlp": 0.99957055, + "epoch": 0.7534044791823238, + "flos": 50254830766080.0, + "grad_norm": 0.8262669617813972, + "language_loss": 0.63647574, + "learning_rate": 6.045667104933612e-07, + "loss": 0.65709138, + "num_input_tokens_seen": 270306480, + "router_z_loss_clip": 0.24194336, + "router_z_loss_mlp": 0.01217651, + "step": 12531, + "time_per_iteration": 2.934159517288208 + }, + { + "auxiliary_loss_clip": 0.01120718, + "auxiliary_loss_mlp": 0.01029048, + "balance_loss_clip": 1.04489446, + "balance_loss_mlp": 1.01650083, + "epoch": 0.7534646024349917, + "flos": 20850274471680.0, + "grad_norm": 2.074266968964322, + "language_loss": 0.70432246, + "learning_rate": 6.042877367909633e-07, + "loss": 0.72582018, + "num_input_tokens_seen": 270324595, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.12542725, + "step": 12532, + "time_per_iteration": 2.4810492992401123 + }, + { + "auxiliary_loss_clip": 0.01105011, + "auxiliary_loss_mlp": 0.01027328, + "balance_loss_clip": 1.03802359, + "balance_loss_mlp": 1.01738608, + "epoch": 0.7535247256876597, + "flos": 23071779941760.0, + "grad_norm": 1.4966600655679057, + "language_loss": 0.77531505, + "learning_rate": 6.040088160141132e-07, + "loss": 0.79663849, + "num_input_tokens_seen": 270344375, + "router_z_loss_clip": 0.67138672, + "router_z_loss_mlp": 0.0994873, + "step": 12533, + "time_per_iteration": 2.4477148056030273 + }, + { + "auxiliary_loss_clip": 0.01042367, + "auxiliary_loss_mlp": 0.01002708, + "balance_loss_clip": 1.01786327, + "balance_loss_mlp": 1.00128913, + "epoch": 0.7535848489403276, + "flos": 58623418252800.0, + "grad_norm": 0.7833247541873581, + "language_loss": 0.57343519, + "learning_rate": 6.037299481733886e-07, + "loss": 0.5938859, + "num_input_tokens_seen": 270405235, + "router_z_loss_clip": 0.24536133, + "router_z_loss_mlp": 0.01419067, + "step": 12534, + "time_per_iteration": 3.168273448944092 + }, + { + "auxiliary_loss_clip": 0.01111949, + "auxiliary_loss_mlp": 0.0102484, + "balance_loss_clip": 1.04010701, + "balance_loss_mlp": 1.01282954, + "epoch": 0.7536449721929956, + "flos": 26577882483840.0, + "grad_norm": 1.3833212586404637, + "language_loss": 0.71136999, + "learning_rate": 6.03451133279365e-07, + "loss": 0.7327379, + "num_input_tokens_seen": 270425820, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.12005615, + "step": 12535, + "time_per_iteration": 4.0379815101623535 + }, + { + "auxiliary_loss_clip": 0.01109881, + "auxiliary_loss_mlp": 0.01026991, + "balance_loss_clip": 1.0357796, + "balance_loss_mlp": 1.0145452, + "epoch": 0.7537050954456637, + "flos": 25735992537600.0, + "grad_norm": 1.817354015775105, + "language_loss": 0.80916971, + "learning_rate": 6.031723713426135e-07, + "loss": 0.83053845, + "num_input_tokens_seen": 270447120, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.12451172, + "step": 12536, + "time_per_iteration": 2.4921600818634033 + }, + { + "auxiliary_loss_clip": 0.01104875, + "auxiliary_loss_mlp": 0.0102851, + "balance_loss_clip": 1.03660035, + "balance_loss_mlp": 1.01725101, + "epoch": 0.7537652186983316, + "flos": 30224431203840.0, + "grad_norm": 2.0680471636927638, + "language_loss": 0.74251413, + "learning_rate": 6.028936623737067e-07, + "loss": 0.76384795, + "num_input_tokens_seen": 270468680, + "router_z_loss_clip": 0.68212891, + "router_z_loss_mlp": 0.1126709, + "step": 12537, + "time_per_iteration": 2.5857739448547363 + }, + { + "auxiliary_loss_clip": 0.01127636, + "auxiliary_loss_mlp": 0.01034086, + "balance_loss_clip": 1.05065048, + "balance_loss_mlp": 1.02198052, + "epoch": 0.7538253419509996, + "flos": 12641239198080.0, + "grad_norm": 1.891138850272362, + "language_loss": 0.74494922, + "learning_rate": 6.026150063832111e-07, + "loss": 0.7665664, + "num_input_tokens_seen": 270486310, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.12103271, + "step": 12538, + "time_per_iteration": 2.401442289352417 + }, + { + "auxiliary_loss_clip": 0.01114818, + "auxiliary_loss_mlp": 0.01028175, + "balance_loss_clip": 1.04221082, + "balance_loss_mlp": 1.01632595, + "epoch": 0.7538854652036675, + "flos": 23185976256000.0, + "grad_norm": 1.485579293021538, + "language_loss": 0.67940092, + "learning_rate": 6.023364033816956e-07, + "loss": 0.70083082, + "num_input_tokens_seen": 270507210, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.1184082, + "step": 12539, + "time_per_iteration": 3.8943238258361816 + }, + { + "auxiliary_loss_clip": 0.01114601, + "auxiliary_loss_mlp": 0.01026166, + "balance_loss_clip": 1.04538238, + "balance_loss_mlp": 1.01442945, + "epoch": 0.7539455884563355, + "flos": 23186227651200.0, + "grad_norm": 1.996790957233065, + "language_loss": 0.74755597, + "learning_rate": 6.020578533797229e-07, + "loss": 0.76896358, + "num_input_tokens_seen": 270525250, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 0.11737061, + "step": 12540, + "time_per_iteration": 2.481917381286621 + }, + { + "auxiliary_loss_clip": 0.01115863, + "auxiliary_loss_mlp": 0.01030179, + "balance_loss_clip": 1.04005086, + "balance_loss_mlp": 1.01794195, + "epoch": 0.7540057117090034, + "flos": 13181155505280.0, + "grad_norm": 2.2865788163951954, + "language_loss": 0.73269844, + "learning_rate": 6.017793563878566e-07, + "loss": 0.75415874, + "num_input_tokens_seen": 270539295, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.12231445, + "step": 12541, + "time_per_iteration": 2.3918981552124023 + }, + { + "auxiliary_loss_clip": 0.01107687, + "auxiliary_loss_mlp": 0.01028692, + "balance_loss_clip": 1.03629208, + "balance_loss_mlp": 1.01694429, + "epoch": 0.7540658349616715, + "flos": 45478134478080.0, + "grad_norm": 1.8029677885647402, + "language_loss": 0.72407568, + "learning_rate": 6.015009124166576e-07, + "loss": 0.74543941, + "num_input_tokens_seen": 270562815, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.11730957, + "step": 12542, + "time_per_iteration": 2.6935510635375977 + }, + { + "auxiliary_loss_clip": 0.0111189, + "auxiliary_loss_mlp": 0.01026949, + "balance_loss_clip": 1.04154813, + "balance_loss_mlp": 1.01498079, + "epoch": 0.7541259582143394, + "flos": 19930817105280.0, + "grad_norm": 2.1105323257675113, + "language_loss": 0.84691429, + "learning_rate": 6.012225214766844e-07, + "loss": 0.8683027, + "num_input_tokens_seen": 270579055, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.11975098, + "step": 12543, + "time_per_iteration": 2.455036163330078 + }, + { + "auxiliary_loss_clip": 0.01119199, + "auxiliary_loss_mlp": 0.01031899, + "balance_loss_clip": 1.04683077, + "balance_loss_mlp": 1.01952481, + "epoch": 0.7541860814670074, + "flos": 27198239299200.0, + "grad_norm": 2.3485435342574643, + "language_loss": 0.73902094, + "learning_rate": 6.009441835784927e-07, + "loss": 0.7605319, + "num_input_tokens_seen": 270599080, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.12353516, + "step": 12544, + "time_per_iteration": 2.479557991027832 + }, + { + "auxiliary_loss_clip": 0.01113126, + "auxiliary_loss_mlp": 0.01030953, + "balance_loss_clip": 1.04012084, + "balance_loss_mlp": 1.01991451, + "epoch": 0.7542462047196753, + "flos": 21324151624320.0, + "grad_norm": 1.7869318555143439, + "language_loss": 0.68392986, + "learning_rate": 6.006658987326383e-07, + "loss": 0.70537066, + "num_input_tokens_seen": 270618715, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.1104126, + "step": 12545, + "time_per_iteration": 3.8871829509735107 + }, + { + "auxiliary_loss_clip": 0.01117887, + "auxiliary_loss_mlp": 0.01030161, + "balance_loss_clip": 1.04494739, + "balance_loss_mlp": 1.01894915, + "epoch": 0.7543063279723433, + "flos": 11940944664960.0, + "grad_norm": 1.8357275920508769, + "language_loss": 0.68677521, + "learning_rate": 6.003876669496728e-07, + "loss": 0.70825571, + "num_input_tokens_seen": 270635695, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11212158, + "step": 12546, + "time_per_iteration": 2.4241344928741455 + }, + { + "auxiliary_loss_clip": 0.01114533, + "auxiliary_loss_mlp": 0.01030461, + "balance_loss_clip": 1.04333889, + "balance_loss_mlp": 1.01802778, + "epoch": 0.7543664512250112, + "flos": 22819974624000.0, + "grad_norm": 3.33671280504384, + "language_loss": 0.73641437, + "learning_rate": 6.00109488240147e-07, + "loss": 0.75786436, + "num_input_tokens_seen": 270654325, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.12438965, + "step": 12547, + "time_per_iteration": 2.4453396797180176 + }, + { + "auxiliary_loss_clip": 0.01119165, + "auxiliary_loss_mlp": 0.01027642, + "balance_loss_clip": 1.04768693, + "balance_loss_mlp": 1.01510763, + "epoch": 0.7544265744776792, + "flos": 20923855482240.0, + "grad_norm": 1.8064104768265101, + "language_loss": 0.67720342, + "learning_rate": 5.998313626146099e-07, + "loss": 0.69867146, + "num_input_tokens_seen": 270674260, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.12542725, + "step": 12548, + "time_per_iteration": 2.4732935428619385 + }, + { + "auxiliary_loss_clip": 0.01113587, + "auxiliary_loss_mlp": 0.01033781, + "balance_loss_clip": 1.04071081, + "balance_loss_mlp": 1.02175283, + "epoch": 0.7544866977303473, + "flos": 15195493284480.0, + "grad_norm": 1.6901510721008246, + "language_loss": 0.87016928, + "learning_rate": 5.995532900836088e-07, + "loss": 0.89164299, + "num_input_tokens_seen": 270692200, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.12023926, + "step": 12549, + "time_per_iteration": 2.401763916015625 + }, + { + "auxiliary_loss_clip": 0.01101972, + "auxiliary_loss_mlp": 0.01032589, + "balance_loss_clip": 1.03476977, + "balance_loss_mlp": 1.02133512, + "epoch": 0.7545468209830152, + "flos": 27083683848960.0, + "grad_norm": 1.9833544755001382, + "language_loss": 0.77359653, + "learning_rate": 5.992752706576865e-07, + "loss": 0.79494214, + "num_input_tokens_seen": 270709675, + "router_z_loss_clip": 0.67236328, + "router_z_loss_mlp": 0.11254883, + "step": 12550, + "time_per_iteration": 2.5139567852020264 + }, + { + "auxiliary_loss_clip": 0.01113958, + "auxiliary_loss_mlp": 0.01023569, + "balance_loss_clip": 1.04203916, + "balance_loss_mlp": 1.01247621, + "epoch": 0.7546069442356832, + "flos": 26871703735680.0, + "grad_norm": 1.458296542600206, + "language_loss": 0.69703633, + "learning_rate": 5.98997304347386e-07, + "loss": 0.71841156, + "num_input_tokens_seen": 270733055, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11096191, + "step": 12551, + "time_per_iteration": 2.5032923221588135 + }, + { + "auxiliary_loss_clip": 0.0111336, + "auxiliary_loss_mlp": 0.01034524, + "balance_loss_clip": 1.04231739, + "balance_loss_mlp": 1.02052307, + "epoch": 0.7546670674883511, + "flos": 15743131015680.0, + "grad_norm": 2.34965491809651, + "language_loss": 0.86546266, + "learning_rate": 5.987193911632487e-07, + "loss": 0.88694143, + "num_input_tokens_seen": 270749275, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.14001465, + "step": 12552, + "time_per_iteration": 2.408005714416504 + }, + { + "auxiliary_loss_clip": 0.01115975, + "auxiliary_loss_mlp": 0.01028714, + "balance_loss_clip": 1.04392827, + "balance_loss_mlp": 1.01737738, + "epoch": 0.7547271907410191, + "flos": 23477714519040.0, + "grad_norm": 3.5517451424799145, + "language_loss": 0.77831042, + "learning_rate": 5.98441531115812e-07, + "loss": 0.79975724, + "num_input_tokens_seen": 270768230, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.11340332, + "step": 12553, + "time_per_iteration": 2.440297842025757 + }, + { + "auxiliary_loss_clip": 0.01111866, + "auxiliary_loss_mlp": 0.01030304, + "balance_loss_clip": 1.04040277, + "balance_loss_mlp": 1.01853764, + "epoch": 0.754787313993687, + "flos": 31722804069120.0, + "grad_norm": 2.2262959453372435, + "language_loss": 0.63209277, + "learning_rate": 5.981637242156135e-07, + "loss": 0.6535145, + "num_input_tokens_seen": 270786285, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.11779785, + "step": 12554, + "time_per_iteration": 2.5129497051239014 + }, + { + "auxiliary_loss_clip": 0.01108467, + "auxiliary_loss_mlp": 0.01030968, + "balance_loss_clip": 1.03799951, + "balance_loss_mlp": 1.01996493, + "epoch": 0.7548474372463551, + "flos": 27563055782400.0, + "grad_norm": 1.689887414393976, + "language_loss": 0.7359609, + "learning_rate": 5.978859704731864e-07, + "loss": 0.75735521, + "num_input_tokens_seen": 270805505, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.11004639, + "step": 12555, + "time_per_iteration": 2.466564178466797 + }, + { + "auxiliary_loss_clip": 0.01112775, + "auxiliary_loss_mlp": 0.01035526, + "balance_loss_clip": 1.04057312, + "balance_loss_mlp": 1.02169156, + "epoch": 0.754907560499023, + "flos": 19318576763520.0, + "grad_norm": 1.749049870281269, + "language_loss": 0.78484398, + "learning_rate": 5.976082698990645e-07, + "loss": 0.80632699, + "num_input_tokens_seen": 270824610, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.13830566, + "step": 12556, + "time_per_iteration": 2.4635157585144043 + }, + { + "auxiliary_loss_clip": 0.01048907, + "auxiliary_loss_mlp": 0.01003231, + "balance_loss_clip": 1.02342868, + "balance_loss_mlp": 1.0017097, + "epoch": 0.754967683751691, + "flos": 69744628684800.0, + "grad_norm": 0.7195071274351594, + "language_loss": 0.50423229, + "learning_rate": 5.973306225037769e-07, + "loss": 0.52475369, + "num_input_tokens_seen": 270886155, + "router_z_loss_clip": 0.25439453, + "router_z_loss_mlp": 0.01521301, + "step": 12557, + "time_per_iteration": 3.048833131790161 + }, + { + "auxiliary_loss_clip": 0.01121315, + "auxiliary_loss_mlp": 0.01035734, + "balance_loss_clip": 1.04500139, + "balance_loss_mlp": 1.02321124, + "epoch": 0.7550278070043589, + "flos": 24421913377920.0, + "grad_norm": 1.7686219366255254, + "language_loss": 0.71284944, + "learning_rate": 5.970530282978525e-07, + "loss": 0.73441994, + "num_input_tokens_seen": 270905325, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.12536621, + "step": 12558, + "time_per_iteration": 3.9182488918304443 + }, + { + "auxiliary_loss_clip": 0.01117206, + "auxiliary_loss_mlp": 0.01025712, + "balance_loss_clip": 1.04563606, + "balance_loss_mlp": 1.01370811, + "epoch": 0.7550879302570269, + "flos": 32634611838720.0, + "grad_norm": 1.6278858211243046, + "language_loss": 0.79687572, + "learning_rate": 5.967754872918187e-07, + "loss": 0.81830484, + "num_input_tokens_seen": 270927535, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.12005615, + "step": 12559, + "time_per_iteration": 2.558339834213257 + }, + { + "auxiliary_loss_clip": 0.01113415, + "auxiliary_loss_mlp": 0.01028481, + "balance_loss_clip": 1.04017794, + "balance_loss_mlp": 1.01628613, + "epoch": 0.7551480535096948, + "flos": 21795550738560.0, + "grad_norm": 2.6444343203106606, + "language_loss": 0.78695536, + "learning_rate": 5.96497999496199e-07, + "loss": 0.80837429, + "num_input_tokens_seen": 270946920, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.12213135, + "step": 12560, + "time_per_iteration": 2.4807395935058594 + }, + { + "auxiliary_loss_clip": 0.01108238, + "auxiliary_loss_mlp": 0.01030987, + "balance_loss_clip": 1.03800249, + "balance_loss_mlp": 1.01940012, + "epoch": 0.7552081767623628, + "flos": 18515111391360.0, + "grad_norm": 4.213334831942003, + "language_loss": 0.70993924, + "learning_rate": 5.96220564921515e-07, + "loss": 0.73133147, + "num_input_tokens_seen": 270965705, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.11584473, + "step": 12561, + "time_per_iteration": 2.4400899410247803 + }, + { + "auxiliary_loss_clip": 0.01110948, + "auxiliary_loss_mlp": 0.01030848, + "balance_loss_clip": 1.04004562, + "balance_loss_mlp": 1.01909971, + "epoch": 0.7552683000150308, + "flos": 27634805199360.0, + "grad_norm": 1.5521370358724884, + "language_loss": 0.756495, + "learning_rate": 5.959431835782889e-07, + "loss": 0.77791297, + "num_input_tokens_seen": 270986550, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11755371, + "step": 12562, + "time_per_iteration": 2.5067312717437744 + }, + { + "auxiliary_loss_clip": 0.01106765, + "auxiliary_loss_mlp": 0.01028142, + "balance_loss_clip": 1.03635335, + "balance_loss_mlp": 1.0161792, + "epoch": 0.7553284232676988, + "flos": 20302924049280.0, + "grad_norm": 4.730246287220033, + "language_loss": 0.75893033, + "learning_rate": 5.956658554770371e-07, + "loss": 0.7802794, + "num_input_tokens_seen": 271006250, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.11962891, + "step": 12563, + "time_per_iteration": 2.481055736541748 + }, + { + "auxiliary_loss_clip": 0.01122855, + "auxiliary_loss_mlp": 0.01030962, + "balance_loss_clip": 1.04457617, + "balance_loss_mlp": 1.01708651, + "epoch": 0.7553885465203668, + "flos": 33255471444480.0, + "grad_norm": 2.21090196288547, + "language_loss": 0.67277682, + "learning_rate": 5.953885806282768e-07, + "loss": 0.69431496, + "num_input_tokens_seen": 271025575, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.13885498, + "step": 12564, + "time_per_iteration": 2.5472347736358643 + }, + { + "auxiliary_loss_clip": 0.01112345, + "auxiliary_loss_mlp": 0.0103338, + "balance_loss_clip": 1.03869796, + "balance_loss_mlp": 1.02098787, + "epoch": 0.7554486697730347, + "flos": 21616249023360.0, + "grad_norm": 1.8725070873544765, + "language_loss": 0.68767977, + "learning_rate": 5.951113590425228e-07, + "loss": 0.70913708, + "num_input_tokens_seen": 271045805, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.12390137, + "step": 12565, + "time_per_iteration": 2.5077977180480957 + }, + { + "auxiliary_loss_clip": 0.01115808, + "auxiliary_loss_mlp": 0.01028635, + "balance_loss_clip": 1.04021645, + "balance_loss_mlp": 1.01585591, + "epoch": 0.7555087930257027, + "flos": 27632973605760.0, + "grad_norm": 2.2690338761612856, + "language_loss": 0.75204945, + "learning_rate": 5.94834190730287e-07, + "loss": 0.77349389, + "num_input_tokens_seen": 271066065, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12786865, + "step": 12566, + "time_per_iteration": 2.51650071144104 + }, + { + "auxiliary_loss_clip": 0.01120842, + "auxiliary_loss_mlp": 0.01035076, + "balance_loss_clip": 1.04575086, + "balance_loss_mlp": 1.02173698, + "epoch": 0.7555689162783706, + "flos": 23621644316160.0, + "grad_norm": 23.060669400002038, + "language_loss": 0.73830241, + "learning_rate": 5.945570757020789e-07, + "loss": 0.75986159, + "num_input_tokens_seen": 271085870, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.13342285, + "step": 12567, + "time_per_iteration": 2.438657283782959 + }, + { + "auxiliary_loss_clip": 0.01106531, + "auxiliary_loss_mlp": 0.01028277, + "balance_loss_clip": 1.03516698, + "balance_loss_mlp": 1.01658297, + "epoch": 0.7556290395310387, + "flos": 24863076218880.0, + "grad_norm": 1.7449227382164665, + "language_loss": 0.63016903, + "learning_rate": 5.942800139684073e-07, + "loss": 0.65151715, + "num_input_tokens_seen": 271104260, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11688232, + "step": 12568, + "time_per_iteration": 2.51491379737854 + }, + { + "auxiliary_loss_clip": 0.01109304, + "auxiliary_loss_mlp": 0.0103325, + "balance_loss_clip": 1.03854358, + "balance_loss_mlp": 1.02193117, + "epoch": 0.7556891627837066, + "flos": 43543770330240.0, + "grad_norm": 2.4010233879711094, + "language_loss": 0.6594224, + "learning_rate": 5.940030055397789e-07, + "loss": 0.68084788, + "num_input_tokens_seen": 271125745, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.11315918, + "step": 12569, + "time_per_iteration": 2.736651659011841 + }, + { + "auxiliary_loss_clip": 0.0111135, + "auxiliary_loss_mlp": 0.01036963, + "balance_loss_clip": 1.03701651, + "balance_loss_mlp": 1.02331352, + "epoch": 0.7557492860363746, + "flos": 26650924790400.0, + "grad_norm": 1.9362195995193736, + "language_loss": 0.67896581, + "learning_rate": 5.93726050426697e-07, + "loss": 0.70044899, + "num_input_tokens_seen": 271147145, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.13635254, + "step": 12570, + "time_per_iteration": 2.4988231658935547 + }, + { + "auxiliary_loss_clip": 0.01115112, + "auxiliary_loss_mlp": 0.01030673, + "balance_loss_clip": 1.04184246, + "balance_loss_mlp": 1.01828766, + "epoch": 0.7558094092890425, + "flos": 55182885010560.0, + "grad_norm": 2.178528458512976, + "language_loss": 0.71931952, + "learning_rate": 5.934491486396647e-07, + "loss": 0.74077737, + "num_input_tokens_seen": 271170865, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.12390137, + "step": 12571, + "time_per_iteration": 2.74878191947937 + }, + { + "auxiliary_loss_clip": 0.01119549, + "auxiliary_loss_mlp": 0.01032844, + "balance_loss_clip": 1.04410458, + "balance_loss_mlp": 1.02032709, + "epoch": 0.7558695325417105, + "flos": 23988292392960.0, + "grad_norm": 1.6300865485355855, + "language_loss": 0.73717982, + "learning_rate": 5.931723001891811e-07, + "loss": 0.75870371, + "num_input_tokens_seen": 271191450, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12536621, + "step": 12572, + "time_per_iteration": 2.5539889335632324 + }, + { + "auxiliary_loss_clip": 0.01119925, + "auxiliary_loss_mlp": 0.01032633, + "balance_loss_clip": 1.04475093, + "balance_loss_mlp": 1.02050924, + "epoch": 0.7559296557943784, + "flos": 14611262572800.0, + "grad_norm": 2.5788599212616212, + "language_loss": 0.77260637, + "learning_rate": 5.928955050857456e-07, + "loss": 0.79413199, + "num_input_tokens_seen": 271207335, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12127686, + "step": 12573, + "time_per_iteration": 2.4684174060821533 + }, + { + "auxiliary_loss_clip": 0.01117651, + "auxiliary_loss_mlp": 0.01032185, + "balance_loss_clip": 1.04306877, + "balance_loss_mlp": 1.02027631, + "epoch": 0.7559897790470465, + "flos": 18550483309440.0, + "grad_norm": 1.5672051048871491, + "language_loss": 0.69297528, + "learning_rate": 5.926187633398527e-07, + "loss": 0.71447366, + "num_input_tokens_seen": 271226895, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.11914062, + "step": 12574, + "time_per_iteration": 2.4297807216644287 + }, + { + "auxiliary_loss_clip": 0.01106925, + "auxiliary_loss_mlp": 0.01031182, + "balance_loss_clip": 1.03634214, + "balance_loss_mlp": 1.01874268, + "epoch": 0.7560499022997144, + "flos": 17967868709760.0, + "grad_norm": 4.0492741075635195, + "language_loss": 0.71566415, + "learning_rate": 5.923420749619974e-07, + "loss": 0.73704517, + "num_input_tokens_seen": 271244375, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.12438965, + "step": 12575, + "time_per_iteration": 2.431419849395752 + }, + { + "auxiliary_loss_clip": 0.01114389, + "auxiliary_loss_mlp": 0.0102953, + "balance_loss_clip": 1.04260731, + "balance_loss_mlp": 1.01828277, + "epoch": 0.7561100255523824, + "flos": 15737815802880.0, + "grad_norm": 2.121653847558859, + "language_loss": 0.72432482, + "learning_rate": 5.92065439962673e-07, + "loss": 0.74576402, + "num_input_tokens_seen": 271259530, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11248779, + "step": 12576, + "time_per_iteration": 2.397655487060547 + }, + { + "auxiliary_loss_clip": 0.01117731, + "auxiliary_loss_mlp": 0.01031635, + "balance_loss_clip": 1.04616606, + "balance_loss_mlp": 1.01936221, + "epoch": 0.7561701488050504, + "flos": 15888102307200.0, + "grad_norm": 1.959123900170359, + "language_loss": 0.67501932, + "learning_rate": 5.917888583523669e-07, + "loss": 0.69651294, + "num_input_tokens_seen": 271276835, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.12280273, + "step": 12577, + "time_per_iteration": 2.4370992183685303 + }, + { + "auxiliary_loss_clip": 0.01118561, + "auxiliary_loss_mlp": 0.01032209, + "balance_loss_clip": 1.04673648, + "balance_loss_mlp": 1.02092552, + "epoch": 0.7562302720577183, + "flos": 20339157893760.0, + "grad_norm": 1.8041835772556223, + "language_loss": 0.7819109, + "learning_rate": 5.915123301415685e-07, + "loss": 0.80341864, + "num_input_tokens_seen": 271296275, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11279297, + "step": 12578, + "time_per_iteration": 3.8152949810028076 + }, + { + "auxiliary_loss_clip": 0.01114742, + "auxiliary_loss_mlp": 0.01028897, + "balance_loss_clip": 1.04272401, + "balance_loss_mlp": 1.01677322, + "epoch": 0.7562903953103863, + "flos": 20812209033600.0, + "grad_norm": 1.5762667009932871, + "language_loss": 0.7569952, + "learning_rate": 5.912358553407641e-07, + "loss": 0.77843159, + "num_input_tokens_seen": 271315685, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.12115479, + "step": 12579, + "time_per_iteration": 2.4786880016326904 + }, + { + "auxiliary_loss_clip": 0.01121111, + "auxiliary_loss_mlp": 0.01031431, + "balance_loss_clip": 1.04457271, + "balance_loss_mlp": 1.01853275, + "epoch": 0.7563505185630542, + "flos": 37596999484800.0, + "grad_norm": 2.214304483029317, + "language_loss": 0.62642688, + "learning_rate": 5.90959433960437e-07, + "loss": 0.64795232, + "num_input_tokens_seen": 271336790, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.12902832, + "step": 12580, + "time_per_iteration": 2.5962517261505127 + }, + { + "auxiliary_loss_clip": 0.01122001, + "auxiliary_loss_mlp": 0.01031559, + "balance_loss_clip": 1.05012643, + "balance_loss_mlp": 1.01926887, + "epoch": 0.7564106418157223, + "flos": 20230995064320.0, + "grad_norm": 1.6127512692759498, + "language_loss": 0.74972886, + "learning_rate": 5.906830660110691e-07, + "loss": 0.77126449, + "num_input_tokens_seen": 271355470, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.1227417, + "step": 12581, + "time_per_iteration": 2.4826176166534424 + }, + { + "auxiliary_loss_clip": 0.01110445, + "auxiliary_loss_mlp": 0.01030434, + "balance_loss_clip": 1.03715968, + "balance_loss_mlp": 1.01801789, + "epoch": 0.7564707650683902, + "flos": 24754877475840.0, + "grad_norm": 2.1310895008994772, + "language_loss": 0.62860739, + "learning_rate": 5.904067515031412e-07, + "loss": 0.65001619, + "num_input_tokens_seen": 271375810, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.12420654, + "step": 12582, + "time_per_iteration": 3.909710168838501 + }, + { + "auxiliary_loss_clip": 0.01065162, + "auxiliary_loss_mlp": 0.01005645, + "balance_loss_clip": 1.04077649, + "balance_loss_mlp": 1.00420797, + "epoch": 0.7565308883210582, + "flos": 48530076433920.0, + "grad_norm": 0.9618518465886065, + "language_loss": 0.60663462, + "learning_rate": 5.901304904471307e-07, + "loss": 0.62734264, + "num_input_tokens_seen": 271424775, + "router_z_loss_clip": 0.24389648, + "router_z_loss_mlp": 0.014328, + "step": 12583, + "time_per_iteration": 2.8783884048461914 + }, + { + "auxiliary_loss_clip": 0.01115294, + "auxiliary_loss_mlp": 0.01036519, + "balance_loss_clip": 1.04317617, + "balance_loss_mlp": 1.02449656, + "epoch": 0.7565910115737261, + "flos": 12495082757760.0, + "grad_norm": 2.238403691600813, + "language_loss": 0.79561508, + "learning_rate": 5.898542828535125e-07, + "loss": 0.81713319, + "num_input_tokens_seen": 271440500, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.12017822, + "step": 12584, + "time_per_iteration": 2.4984331130981445 + }, + { + "auxiliary_loss_clip": 0.01114318, + "auxiliary_loss_mlp": 0.01034729, + "balance_loss_clip": 1.04471278, + "balance_loss_mlp": 1.02185416, + "epoch": 0.7566511348263941, + "flos": 21173003193600.0, + "grad_norm": 2.0229474844845923, + "language_loss": 0.77893192, + "learning_rate": 5.895781287327612e-07, + "loss": 0.80042237, + "num_input_tokens_seen": 271458180, + "router_z_loss_clip": 0.69677734, + "router_z_loss_mlp": 0.12866211, + "step": 12585, + "time_per_iteration": 2.457948684692383 + }, + { + "auxiliary_loss_clip": 0.01121841, + "auxiliary_loss_mlp": 0.01043984, + "balance_loss_clip": 1.04420364, + "balance_loss_mlp": 1.02954817, + "epoch": 0.756711258079062, + "flos": 21754827694080.0, + "grad_norm": 1.9517963088033594, + "language_loss": 0.83090723, + "learning_rate": 5.893020280953493e-07, + "loss": 0.85256541, + "num_input_tokens_seen": 271475730, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.14459229, + "step": 12586, + "time_per_iteration": 2.5022432804107666 + }, + { + "auxiliary_loss_clip": 0.01118287, + "auxiliary_loss_mlp": 0.0103437, + "balance_loss_clip": 1.04396236, + "balance_loss_mlp": 1.02200854, + "epoch": 0.75677138133173, + "flos": 22382905933440.0, + "grad_norm": 2.081222135543341, + "language_loss": 0.83903116, + "learning_rate": 5.890259809517459e-07, + "loss": 0.86055773, + "num_input_tokens_seen": 271495030, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.1237793, + "step": 12587, + "time_per_iteration": 2.4675862789154053 + }, + { + "auxiliary_loss_clip": 0.01108161, + "auxiliary_loss_mlp": 0.0103157, + "balance_loss_clip": 1.03713751, + "balance_loss_mlp": 1.01802182, + "epoch": 0.756831504584398, + "flos": 22708974620160.0, + "grad_norm": 1.571453102151266, + "language_loss": 0.71243566, + "learning_rate": 5.88749987312418e-07, + "loss": 0.73383301, + "num_input_tokens_seen": 271515355, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.13543701, + "step": 12588, + "time_per_iteration": 3.9455883502960205 + }, + { + "auxiliary_loss_clip": 0.01120621, + "auxiliary_loss_mlp": 0.01029649, + "balance_loss_clip": 1.04525399, + "balance_loss_mlp": 1.01679182, + "epoch": 0.756891627837066, + "flos": 24098358643200.0, + "grad_norm": 1.7633759670685458, + "language_loss": 0.68674934, + "learning_rate": 5.884740471878327e-07, + "loss": 0.70825201, + "num_input_tokens_seen": 271535090, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.12866211, + "step": 12589, + "time_per_iteration": 2.5068068504333496 + }, + { + "auxiliary_loss_clip": 0.01113206, + "auxiliary_loss_mlp": 0.01035714, + "balance_loss_clip": 1.04085445, + "balance_loss_mlp": 1.02167106, + "epoch": 0.756951751089734, + "flos": 19749001438080.0, + "grad_norm": 1.588762326388138, + "language_loss": 0.92376345, + "learning_rate": 5.881981605884522e-07, + "loss": 0.94525266, + "num_input_tokens_seen": 271551075, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.14031982, + "step": 12590, + "time_per_iteration": 2.403717279434204 + }, + { + "auxiliary_loss_clip": 0.01112155, + "auxiliary_loss_mlp": 0.01034291, + "balance_loss_clip": 1.04005265, + "balance_loss_mlp": 1.02050424, + "epoch": 0.7570118743424019, + "flos": 35079266551680.0, + "grad_norm": 2.260422262287541, + "language_loss": 0.65376455, + "learning_rate": 5.879223275247391e-07, + "loss": 0.67522895, + "num_input_tokens_seen": 271571035, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.13787842, + "step": 12591, + "time_per_iteration": 2.5611629486083984 + }, + { + "auxiliary_loss_clip": 0.01115494, + "auxiliary_loss_mlp": 0.01027828, + "balance_loss_clip": 1.04261184, + "balance_loss_mlp": 1.01621127, + "epoch": 0.7570719975950699, + "flos": 25594540778880.0, + "grad_norm": 1.5742031336409843, + "language_loss": 0.73567569, + "learning_rate": 5.876465480071528e-07, + "loss": 0.75710893, + "num_input_tokens_seen": 271592950, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.1161499, + "step": 12592, + "time_per_iteration": 2.4623260498046875 + }, + { + "auxiliary_loss_clip": 0.01120538, + "auxiliary_loss_mlp": 0.01040486, + "balance_loss_clip": 1.0442332, + "balance_loss_mlp": 1.02768862, + "epoch": 0.7571321208477378, + "flos": 10816223028480.0, + "grad_norm": 2.349119036850127, + "language_loss": 0.71394157, + "learning_rate": 5.873708220461522e-07, + "loss": 0.73555177, + "num_input_tokens_seen": 271608835, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.12817383, + "step": 12593, + "time_per_iteration": 2.4304544925689697 + }, + { + "auxiliary_loss_clip": 0.01117663, + "auxiliary_loss_mlp": 0.01029345, + "balance_loss_clip": 1.04368424, + "balance_loss_mlp": 1.01720917, + "epoch": 0.7571922441004059, + "flos": 18260109763200.0, + "grad_norm": 2.974724887853922, + "language_loss": 0.66262364, + "learning_rate": 5.870951496521903e-07, + "loss": 0.68409371, + "num_input_tokens_seen": 271627730, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.12133789, + "step": 12594, + "time_per_iteration": 2.3845341205596924 + }, + { + "auxiliary_loss_clip": 0.01121905, + "auxiliary_loss_mlp": 0.01030453, + "balance_loss_clip": 1.04633951, + "balance_loss_mlp": 1.0183475, + "epoch": 0.7572523673530738, + "flos": 22890502978560.0, + "grad_norm": 1.6963131421220148, + "language_loss": 0.81074357, + "learning_rate": 5.86819530835722e-07, + "loss": 0.83226717, + "num_input_tokens_seen": 271646415, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12109375, + "step": 12595, + "time_per_iteration": 2.439892053604126 + }, + { + "auxiliary_loss_clip": 0.01116885, + "auxiliary_loss_mlp": 0.01027996, + "balance_loss_clip": 1.04582381, + "balance_loss_mlp": 1.01732719, + "epoch": 0.7573124906057418, + "flos": 20996323171200.0, + "grad_norm": 1.8668390103290204, + "language_loss": 0.71634632, + "learning_rate": 5.865439656071993e-07, + "loss": 0.73779511, + "num_input_tokens_seen": 271666240, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.10662842, + "step": 12596, + "time_per_iteration": 2.430185079574585 + }, + { + "auxiliary_loss_clip": 0.01114621, + "auxiliary_loss_mlp": 0.01030438, + "balance_loss_clip": 1.04468393, + "balance_loss_mlp": 1.01886868, + "epoch": 0.7573726138584097, + "flos": 20886292834560.0, + "grad_norm": 1.5449270747652144, + "language_loss": 0.80278486, + "learning_rate": 5.862684539770706e-07, + "loss": 0.82423544, + "num_input_tokens_seen": 271686370, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.11566162, + "step": 12597, + "time_per_iteration": 2.4249284267425537 + }, + { + "auxiliary_loss_clip": 0.01124053, + "auxiliary_loss_mlp": 0.01029846, + "balance_loss_clip": 1.04562294, + "balance_loss_mlp": 1.01686454, + "epoch": 0.7574327371110777, + "flos": 24530507170560.0, + "grad_norm": 1.6706998328840503, + "language_loss": 0.83641684, + "learning_rate": 5.859929959557835e-07, + "loss": 0.85795581, + "num_input_tokens_seen": 271705050, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.12994385, + "step": 12598, + "time_per_iteration": 2.4472877979278564 + }, + { + "auxiliary_loss_clip": 0.01113361, + "auxiliary_loss_mlp": 0.01023699, + "balance_loss_clip": 1.0419811, + "balance_loss_mlp": 1.01276183, + "epoch": 0.7574928603637456, + "flos": 23364523785600.0, + "grad_norm": 1.7652783173901683, + "language_loss": 0.62808895, + "learning_rate": 5.857175915537845e-07, + "loss": 0.6494596, + "num_input_tokens_seen": 271724915, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.10931396, + "step": 12599, + "time_per_iteration": 2.4163429737091064 + }, + { + "auxiliary_loss_clip": 0.01123287, + "auxiliary_loss_mlp": 0.01030372, + "balance_loss_clip": 1.04717064, + "balance_loss_mlp": 1.01674664, + "epoch": 0.7575529836164137, + "flos": 13516274419200.0, + "grad_norm": 2.9215359559982104, + "language_loss": 0.63034785, + "learning_rate": 5.854422407815161e-07, + "loss": 0.65188444, + "num_input_tokens_seen": 271742410, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.13635254, + "step": 12600, + "time_per_iteration": 2.4301207065582275 + }, + { + "auxiliary_loss_clip": 0.01115543, + "auxiliary_loss_mlp": 0.01030208, + "balance_loss_clip": 1.04294503, + "balance_loss_mlp": 1.0179894, + "epoch": 0.7576131068690816, + "flos": 19646584784640.0, + "grad_norm": 1.7983167522551893, + "language_loss": 0.66389132, + "learning_rate": 5.851669436494191e-07, + "loss": 0.68534881, + "num_input_tokens_seen": 271761425, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.12207031, + "step": 12601, + "time_per_iteration": 2.4286091327667236 + }, + { + "auxiliary_loss_clip": 0.01116712, + "auxiliary_loss_mlp": 0.01027428, + "balance_loss_clip": 1.04621673, + "balance_loss_mlp": 1.01644897, + "epoch": 0.7576732301217496, + "flos": 20048245643520.0, + "grad_norm": 1.632306725306395, + "language_loss": 0.68431413, + "learning_rate": 5.848917001679335e-07, + "loss": 0.70575559, + "num_input_tokens_seen": 271780875, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.10992432, + "step": 12602, + "time_per_iteration": 3.883967161178589 + }, + { + "auxiliary_loss_clip": 0.01115231, + "auxiliary_loss_mlp": 0.010338, + "balance_loss_clip": 1.04310417, + "balance_loss_mlp": 1.0205853, + "epoch": 0.7577333533744176, + "flos": 15377093470080.0, + "grad_norm": 3.637020303162043, + "language_loss": 0.66636419, + "learning_rate": 5.846165103474967e-07, + "loss": 0.68785453, + "num_input_tokens_seen": 271799490, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.13214111, + "step": 12603, + "time_per_iteration": 2.489278793334961 + }, + { + "auxiliary_loss_clip": 0.01106967, + "auxiliary_loss_mlp": 0.01027212, + "balance_loss_clip": 1.03783047, + "balance_loss_mlp": 1.01637542, + "epoch": 0.7577934766270855, + "flos": 17894862316800.0, + "grad_norm": 2.0688285071231456, + "language_loss": 0.61118066, + "learning_rate": 5.843413741985439e-07, + "loss": 0.63252246, + "num_input_tokens_seen": 271817040, + "router_z_loss_clip": 0.69042969, + "router_z_loss_mlp": 0.10845947, + "step": 12604, + "time_per_iteration": 2.4342498779296875 + }, + { + "auxiliary_loss_clip": 0.01111109, + "auxiliary_loss_mlp": 0.01037181, + "balance_loss_clip": 1.040694, + "balance_loss_mlp": 1.02484286, + "epoch": 0.7578535998797535, + "flos": 21613770984960.0, + "grad_norm": 1.7560516305785157, + "language_loss": 0.79813343, + "learning_rate": 5.840662917315076e-07, + "loss": 0.81961632, + "num_input_tokens_seen": 271835480, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.12329102, + "step": 12605, + "time_per_iteration": 2.502173662185669 + }, + { + "auxiliary_loss_clip": 0.0111394, + "auxiliary_loss_mlp": 0.01029191, + "balance_loss_clip": 1.03957582, + "balance_loss_mlp": 1.01681662, + "epoch": 0.7579137231324214, + "flos": 18478374756480.0, + "grad_norm": 3.419117058553349, + "language_loss": 0.80020571, + "learning_rate": 5.837912629568198e-07, + "loss": 0.82163703, + "num_input_tokens_seen": 271849835, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12384033, + "step": 12606, + "time_per_iteration": 2.432217836380005 + }, + { + "auxiliary_loss_clip": 0.0111485, + "auxiliary_loss_mlp": 0.01027073, + "balance_loss_clip": 1.04486156, + "balance_loss_mlp": 1.01677334, + "epoch": 0.7579738463850895, + "flos": 23255032152960.0, + "grad_norm": 1.3187331498536337, + "language_loss": 0.73176348, + "learning_rate": 5.835162878849087e-07, + "loss": 0.75318277, + "num_input_tokens_seen": 271869560, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.10290527, + "step": 12607, + "time_per_iteration": 2.47062611579895 + }, + { + "auxiliary_loss_clip": 0.0111937, + "auxiliary_loss_mlp": 0.01038766, + "balance_loss_clip": 1.04057956, + "balance_loss_mlp": 1.02543843, + "epoch": 0.7580339696377574, + "flos": 14027031861120.0, + "grad_norm": 1.8222903272078252, + "language_loss": 0.75223464, + "learning_rate": 5.83241366526202e-07, + "loss": 0.77381599, + "num_input_tokens_seen": 271887950, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.13348389, + "step": 12608, + "time_per_iteration": 2.3923041820526123 + }, + { + "auxiliary_loss_clip": 0.01111444, + "auxiliary_loss_mlp": 0.01030388, + "balance_loss_clip": 1.04023981, + "balance_loss_mlp": 1.01859856, + "epoch": 0.7580940928904254, + "flos": 25082777756160.0, + "grad_norm": 1.5488283004762227, + "language_loss": 0.7145077, + "learning_rate": 5.829664988911245e-07, + "loss": 0.73592603, + "num_input_tokens_seen": 271907700, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.11798096, + "step": 12609, + "time_per_iteration": 2.470404863357544 + }, + { + "auxiliary_loss_clip": 0.01113956, + "auxiliary_loss_mlp": 0.01027853, + "balance_loss_clip": 1.03977036, + "balance_loss_mlp": 1.01523435, + "epoch": 0.7581542161430933, + "flos": 23836425690240.0, + "grad_norm": 1.6914507288005962, + "language_loss": 0.814987, + "learning_rate": 5.826916849901007e-07, + "loss": 0.83640504, + "num_input_tokens_seen": 271926840, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.1262207, + "step": 12610, + "time_per_iteration": 2.4574520587921143 + }, + { + "auxiliary_loss_clip": 0.01118228, + "auxiliary_loss_mlp": 0.01034169, + "balance_loss_clip": 1.04128599, + "balance_loss_mlp": 1.02180076, + "epoch": 0.7582143393957613, + "flos": 22237000888320.0, + "grad_norm": 1.6109119424536111, + "language_loss": 0.7067349, + "learning_rate": 5.824169248335488e-07, + "loss": 0.72825885, + "num_input_tokens_seen": 271946465, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.12371826, + "step": 12611, + "time_per_iteration": 2.4724059104919434 + }, + { + "auxiliary_loss_clip": 0.01113851, + "auxiliary_loss_mlp": 0.01029951, + "balance_loss_clip": 1.04208207, + "balance_loss_mlp": 1.01831627, + "epoch": 0.7582744626484292, + "flos": 21106389421440.0, + "grad_norm": 1.5914600149778697, + "language_loss": 0.71020412, + "learning_rate": 5.821422184318893e-07, + "loss": 0.73164213, + "num_input_tokens_seen": 271967295, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11627197, + "step": 12612, + "time_per_iteration": 2.4402716159820557 + }, + { + "auxiliary_loss_clip": 0.01124801, + "auxiliary_loss_mlp": 0.01036658, + "balance_loss_clip": 1.04949653, + "balance_loss_mlp": 1.02526748, + "epoch": 0.7583345859010973, + "flos": 24604770539520.0, + "grad_norm": 1.5266269790503761, + "language_loss": 0.59582996, + "learning_rate": 5.818675657955397e-07, + "loss": 0.61744457, + "num_input_tokens_seen": 271987960, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11383057, + "step": 12613, + "time_per_iteration": 2.4984517097473145 + }, + { + "auxiliary_loss_clip": 0.0110945, + "auxiliary_loss_mlp": 0.01031183, + "balance_loss_clip": 1.03766894, + "balance_loss_mlp": 1.01924455, + "epoch": 0.7583947091537652, + "flos": 33546814657920.0, + "grad_norm": 1.5723461656002697, + "language_loss": 0.60122985, + "learning_rate": 5.815929669349135e-07, + "loss": 0.62263614, + "num_input_tokens_seen": 272011780, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.1194458, + "step": 12614, + "time_per_iteration": 2.5666494369506836 + }, + { + "auxiliary_loss_clip": 0.01110996, + "auxiliary_loss_mlp": 0.01026002, + "balance_loss_clip": 1.03732574, + "balance_loss_mlp": 1.01339555, + "epoch": 0.7584548324064332, + "flos": 20121000641280.0, + "grad_norm": 1.9611032204188612, + "language_loss": 0.73295808, + "learning_rate": 5.813184218604246e-07, + "loss": 0.75432807, + "num_input_tokens_seen": 272030825, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.12609863, + "step": 12615, + "time_per_iteration": 2.5335381031036377 + }, + { + "auxiliary_loss_clip": 0.01042371, + "auxiliary_loss_mlp": 0.01003485, + "balance_loss_clip": 1.01744306, + "balance_loss_mlp": 1.00195742, + "epoch": 0.7585149556591012, + "flos": 70402584061440.0, + "grad_norm": 0.8128141618598195, + "language_loss": 0.677104, + "learning_rate": 5.810439305824828e-07, + "loss": 0.69756258, + "num_input_tokens_seen": 272095825, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.01527405, + "step": 12616, + "time_per_iteration": 3.1360127925872803 + }, + { + "auxiliary_loss_clip": 0.01129986, + "auxiliary_loss_mlp": 0.01032712, + "balance_loss_clip": 1.05341446, + "balance_loss_mlp": 1.02039766, + "epoch": 0.7585750789117691, + "flos": 16143786293760.0, + "grad_norm": 1.888659174443893, + "language_loss": 0.84641737, + "learning_rate": 5.807694931114979e-07, + "loss": 0.86804426, + "num_input_tokens_seen": 272113950, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12316895, + "step": 12617, + "time_per_iteration": 2.4364562034606934 + }, + { + "auxiliary_loss_clip": 0.01112923, + "auxiliary_loss_mlp": 0.01032215, + "balance_loss_clip": 1.04052544, + "balance_loss_mlp": 1.02097332, + "epoch": 0.7586352021644371, + "flos": 17493165544320.0, + "grad_norm": 2.520441100539312, + "language_loss": 0.74534452, + "learning_rate": 5.804951094578757e-07, + "loss": 0.76679593, + "num_input_tokens_seen": 272130315, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11242676, + "step": 12618, + "time_per_iteration": 2.3848507404327393 + }, + { + "auxiliary_loss_clip": 0.01112528, + "auxiliary_loss_mlp": 0.0102921, + "balance_loss_clip": 1.03614652, + "balance_loss_mlp": 1.01655042, + "epoch": 0.758695325417105, + "flos": 17275187859840.0, + "grad_norm": 38.11264601402193, + "language_loss": 0.77521288, + "learning_rate": 5.802207796320209e-07, + "loss": 0.79663026, + "num_input_tokens_seen": 272149080, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.12652588, + "step": 12619, + "time_per_iteration": 2.5685832500457764 + }, + { + "auxiliary_loss_clip": 0.01114323, + "auxiliary_loss_mlp": 0.01034186, + "balance_loss_clip": 1.04255104, + "balance_loss_mlp": 1.0215497, + "epoch": 0.7587554486697731, + "flos": 29495660163840.0, + "grad_norm": 1.819114691435764, + "language_loss": 0.82560647, + "learning_rate": 5.79946503644337e-07, + "loss": 0.84709156, + "num_input_tokens_seen": 272168285, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.12640381, + "step": 12620, + "time_per_iteration": 2.48431658744812 + }, + { + "auxiliary_loss_clip": 0.01125474, + "auxiliary_loss_mlp": 0.01038476, + "balance_loss_clip": 1.04494739, + "balance_loss_mlp": 1.02464151, + "epoch": 0.758815571922441, + "flos": 16100800692480.0, + "grad_norm": 2.582314623780815, + "language_loss": 0.82418907, + "learning_rate": 5.796722815052242e-07, + "loss": 0.84582859, + "num_input_tokens_seen": 272184585, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.13818359, + "step": 12621, + "time_per_iteration": 2.418433904647827 + }, + { + "auxiliary_loss_clip": 0.01114329, + "auxiliary_loss_mlp": 0.01033334, + "balance_loss_clip": 1.04200077, + "balance_loss_mlp": 1.02122784, + "epoch": 0.758875695175109, + "flos": 16143714466560.0, + "grad_norm": 2.106450811438222, + "language_loss": 0.73428679, + "learning_rate": 5.7939811322508e-07, + "loss": 0.75576347, + "num_input_tokens_seen": 272200205, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.12109375, + "step": 12622, + "time_per_iteration": 2.383765459060669 + }, + { + "auxiliary_loss_clip": 0.01056042, + "auxiliary_loss_mlp": 0.0100166, + "balance_loss_clip": 1.03117895, + "balance_loss_mlp": 1.00032771, + "epoch": 0.7589358184277769, + "flos": 68462006860800.0, + "grad_norm": 0.8361989861826673, + "language_loss": 0.60795259, + "learning_rate": 5.791239988143024e-07, + "loss": 0.62852967, + "num_input_tokens_seen": 272259670, + "router_z_loss_clip": 0.24853516, + "router_z_loss_mlp": 0.01332092, + "step": 12623, + "time_per_iteration": 4.51767635345459 + }, + { + "auxiliary_loss_clip": 0.01113737, + "auxiliary_loss_mlp": 0.01033845, + "balance_loss_clip": 1.04387176, + "balance_loss_mlp": 1.0224787, + "epoch": 0.7589959416804449, + "flos": 20047311889920.0, + "grad_norm": 2.1293594165315697, + "language_loss": 0.67436445, + "learning_rate": 5.788499382832847e-07, + "loss": 0.69584018, + "num_input_tokens_seen": 272277925, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.11383057, + "step": 12624, + "time_per_iteration": 2.456456184387207 + }, + { + "auxiliary_loss_clip": 0.01112497, + "auxiliary_loss_mlp": 0.01027325, + "balance_loss_clip": 1.04290295, + "balance_loss_mlp": 1.01498103, + "epoch": 0.7590560649331128, + "flos": 18771800958720.0, + "grad_norm": 1.6921928150564578, + "language_loss": 0.76124758, + "learning_rate": 5.785759316424196e-07, + "loss": 0.78264576, + "num_input_tokens_seen": 272296010, + "router_z_loss_clip": 0.69580078, + "router_z_loss_mlp": 0.12347412, + "step": 12625, + "time_per_iteration": 2.4651169776916504 + }, + { + "auxiliary_loss_clip": 0.01118857, + "auxiliary_loss_mlp": 0.01038587, + "balance_loss_clip": 1.0453769, + "balance_loss_mlp": 1.02623057, + "epoch": 0.7591161881857809, + "flos": 29825284296960.0, + "grad_norm": 1.7330893098957414, + "language_loss": 0.62828672, + "learning_rate": 5.783019789020977e-07, + "loss": 0.64986122, + "num_input_tokens_seen": 272318330, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.12353516, + "step": 12626, + "time_per_iteration": 3.975918769836426 + }, + { + "auxiliary_loss_clip": 0.01120971, + "auxiliary_loss_mlp": 0.01035694, + "balance_loss_clip": 1.04626322, + "balance_loss_mlp": 1.02329063, + "epoch": 0.7591763114384488, + "flos": 20302708567680.0, + "grad_norm": 1.947421350265359, + "language_loss": 0.73975843, + "learning_rate": 5.780280800727084e-07, + "loss": 0.76132512, + "num_input_tokens_seen": 272335265, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.12414551, + "step": 12627, + "time_per_iteration": 2.435917615890503 + }, + { + "auxiliary_loss_clip": 0.01118546, + "auxiliary_loss_mlp": 0.01026721, + "balance_loss_clip": 1.045017, + "balance_loss_mlp": 1.01502657, + "epoch": 0.7592364346911168, + "flos": 20813609664000.0, + "grad_norm": 2.183523814806486, + "language_loss": 0.68846244, + "learning_rate": 5.777542351646356e-07, + "loss": 0.70991516, + "num_input_tokens_seen": 272354795, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11700439, + "step": 12628, + "time_per_iteration": 2.491382122039795 + }, + { + "auxiliary_loss_clip": 0.01127798, + "auxiliary_loss_mlp": 0.01036276, + "balance_loss_clip": 1.04867721, + "balance_loss_mlp": 1.02300787, + "epoch": 0.7592965579437848, + "flos": 21251504367360.0, + "grad_norm": 1.77119728241927, + "language_loss": 0.63270259, + "learning_rate": 5.774804441882648e-07, + "loss": 0.65434337, + "num_input_tokens_seen": 272372875, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.13250732, + "step": 12629, + "time_per_iteration": 2.449720859527588 + }, + { + "auxiliary_loss_clip": 0.01103113, + "auxiliary_loss_mlp": 0.01031159, + "balance_loss_clip": 1.03478193, + "balance_loss_mlp": 1.01966679, + "epoch": 0.7593566811964527, + "flos": 26213604704640.0, + "grad_norm": 1.5181462932124696, + "language_loss": 0.7779448, + "learning_rate": 5.772067071539786e-07, + "loss": 0.79928756, + "num_input_tokens_seen": 272394715, + "router_z_loss_clip": 0.68212891, + "router_z_loss_mlp": 0.1149292, + "step": 12630, + "time_per_iteration": 2.5150108337402344 + }, + { + "auxiliary_loss_clip": 0.0103999, + "auxiliary_loss_mlp": 0.01011678, + "balance_loss_clip": 1.0151782, + "balance_loss_mlp": 1.01019943, + "epoch": 0.7594168044491207, + "flos": 71237255374080.0, + "grad_norm": 0.8091749100523682, + "language_loss": 0.61478227, + "learning_rate": 5.769330240721562e-07, + "loss": 0.63529891, + "num_input_tokens_seen": 272458775, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.01478577, + "step": 12631, + "time_per_iteration": 4.521322011947632 + }, + { + "auxiliary_loss_clip": 0.01119075, + "auxiliary_loss_mlp": 0.0104073, + "balance_loss_clip": 1.04171395, + "balance_loss_mlp": 1.02635288, + "epoch": 0.7594769277017887, + "flos": 26613326229120.0, + "grad_norm": 1.7355410086404517, + "language_loss": 0.74255407, + "learning_rate": 5.766593949531767e-07, + "loss": 0.76415211, + "num_input_tokens_seen": 272479355, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.14385986, + "step": 12632, + "time_per_iteration": 2.4933223724365234 + }, + { + "auxiliary_loss_clip": 0.01113729, + "auxiliary_loss_mlp": 0.01029203, + "balance_loss_clip": 1.04161239, + "balance_loss_mlp": 1.01695466, + "epoch": 0.7595370509544567, + "flos": 17595941333760.0, + "grad_norm": 1.8518535063947017, + "language_loss": 0.75106001, + "learning_rate": 5.763858198074154e-07, + "loss": 0.77248937, + "num_input_tokens_seen": 272493555, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.12255859, + "step": 12633, + "time_per_iteration": 2.4723713397979736 + }, + { + "auxiliary_loss_clip": 0.01111076, + "auxiliary_loss_mlp": 0.0102791, + "balance_loss_clip": 1.03960586, + "balance_loss_mlp": 1.01695478, + "epoch": 0.7595971742071246, + "flos": 18002953319040.0, + "grad_norm": 2.5640210599818194, + "language_loss": 0.73005009, + "learning_rate": 5.76112298645246e-07, + "loss": 0.75143993, + "num_input_tokens_seen": 272508925, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.10961914, + "step": 12634, + "time_per_iteration": 2.4346537590026855 + }, + { + "auxiliary_loss_clip": 0.0111412, + "auxiliary_loss_mlp": 0.01035586, + "balance_loss_clip": 1.04050958, + "balance_loss_mlp": 1.02346897, + "epoch": 0.7596572974597926, + "flos": 28840326480000.0, + "grad_norm": 1.6774888981131864, + "language_loss": 0.64618951, + "learning_rate": 5.758388314770408e-07, + "loss": 0.66768658, + "num_input_tokens_seen": 272528805, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.12115479, + "step": 12635, + "time_per_iteration": 2.5209403038024902 + }, + { + "auxiliary_loss_clip": 0.01112786, + "auxiliary_loss_mlp": 0.01028618, + "balance_loss_clip": 1.03875399, + "balance_loss_mlp": 1.01650596, + "epoch": 0.7597174207124605, + "flos": 14282823588480.0, + "grad_norm": 1.6751327840713774, + "language_loss": 0.69199073, + "learning_rate": 5.7556541831317e-07, + "loss": 0.71340477, + "num_input_tokens_seen": 272546655, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12115479, + "step": 12636, + "time_per_iteration": 2.4128997325897217 + }, + { + "auxiliary_loss_clip": 0.01116126, + "auxiliary_loss_mlp": 0.01034219, + "balance_loss_clip": 1.04281139, + "balance_loss_mlp": 1.02263141, + "epoch": 0.7597775439651285, + "flos": 21688932193920.0, + "grad_norm": 1.9293814944215606, + "language_loss": 0.81371987, + "learning_rate": 5.752920591640018e-07, + "loss": 0.83522332, + "num_input_tokens_seen": 272564010, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11590576, + "step": 12637, + "time_per_iteration": 2.4719088077545166 + }, + { + "auxiliary_loss_clip": 0.0112384, + "auxiliary_loss_mlp": 0.01029202, + "balance_loss_clip": 1.04741108, + "balance_loss_mlp": 1.01717377, + "epoch": 0.7598376672177964, + "flos": 36101248312320.0, + "grad_norm": 1.8851535221825033, + "language_loss": 0.66384614, + "learning_rate": 5.750187540399017e-07, + "loss": 0.68537652, + "num_input_tokens_seen": 272585840, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.12017822, + "step": 12638, + "time_per_iteration": 2.5797555446624756 + }, + { + "auxiliary_loss_clip": 0.01119166, + "auxiliary_loss_mlp": 0.010327, + "balance_loss_clip": 1.04297543, + "balance_loss_mlp": 1.01879382, + "epoch": 0.7598977904704645, + "flos": 18332326056960.0, + "grad_norm": 2.0672066726919285, + "language_loss": 0.65280449, + "learning_rate": 5.747455029512323e-07, + "loss": 0.6743232, + "num_input_tokens_seen": 272602300, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.13903809, + "step": 12639, + "time_per_iteration": 2.4257302284240723 + }, + { + "auxiliary_loss_clip": 0.01119402, + "auxiliary_loss_mlp": 0.0102849, + "balance_loss_clip": 1.04770494, + "balance_loss_mlp": 1.0165931, + "epoch": 0.7599579137231324, + "flos": 20192642317440.0, + "grad_norm": 2.1079140111861614, + "language_loss": 0.70314741, + "learning_rate": 5.744723059083572e-07, + "loss": 0.7246263, + "num_input_tokens_seen": 272619595, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11907959, + "step": 12640, + "time_per_iteration": 2.4146838188171387 + }, + { + "auxiliary_loss_clip": 0.01126476, + "auxiliary_loss_mlp": 0.01031719, + "balance_loss_clip": 1.04885674, + "balance_loss_mlp": 1.0187788, + "epoch": 0.7600180369758004, + "flos": 24024849459840.0, + "grad_norm": 1.7522702591044452, + "language_loss": 0.67253506, + "learning_rate": 5.741991629216343e-07, + "loss": 0.69411707, + "num_input_tokens_seen": 272638825, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.12939453, + "step": 12641, + "time_per_iteration": 2.441736936569214 + }, + { + "auxiliary_loss_clip": 0.01120704, + "auxiliary_loss_mlp": 0.01033141, + "balance_loss_clip": 1.04645264, + "balance_loss_mlp": 1.0205102, + "epoch": 0.7600781602284684, + "flos": 18989527248000.0, + "grad_norm": 2.0690151585763794, + "language_loss": 0.67087597, + "learning_rate": 5.73926074001422e-07, + "loss": 0.6924144, + "num_input_tokens_seen": 272657240, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.12628174, + "step": 12642, + "time_per_iteration": 2.4005062580108643 + }, + { + "auxiliary_loss_clip": 0.01121144, + "auxiliary_loss_mlp": 0.01031587, + "balance_loss_clip": 1.04988086, + "balance_loss_mlp": 1.01957715, + "epoch": 0.7601382834811363, + "flos": 26067520091520.0, + "grad_norm": 1.9071447881295458, + "language_loss": 0.75531781, + "learning_rate": 5.736530391580765e-07, + "loss": 0.77684516, + "num_input_tokens_seen": 272677520, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.12011719, + "step": 12643, + "time_per_iteration": 2.4971017837524414 + }, + { + "auxiliary_loss_clip": 0.01115988, + "auxiliary_loss_mlp": 0.01037759, + "balance_loss_clip": 1.03947103, + "balance_loss_mlp": 1.0243423, + "epoch": 0.7601984067338043, + "flos": 18844232734080.0, + "grad_norm": 4.7475339535094765, + "language_loss": 0.78928947, + "learning_rate": 5.733800584019508e-07, + "loss": 0.8108269, + "num_input_tokens_seen": 272696770, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.13409424, + "step": 12644, + "time_per_iteration": 2.4424004554748535 + }, + { + "auxiliary_loss_clip": 0.0111412, + "auxiliary_loss_mlp": 0.01028311, + "balance_loss_clip": 1.04073966, + "balance_loss_mlp": 1.01665246, + "epoch": 0.7602585299864723, + "flos": 24646391424000.0, + "grad_norm": 1.4898650120333974, + "language_loss": 0.80467981, + "learning_rate": 5.731071317433957e-07, + "loss": 0.8261041, + "num_input_tokens_seen": 272718340, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11663818, + "step": 12645, + "time_per_iteration": 3.8836441040039062 + }, + { + "auxiliary_loss_clip": 0.01117001, + "auxiliary_loss_mlp": 0.01030773, + "balance_loss_clip": 1.04101014, + "balance_loss_mlp": 1.0181073, + "epoch": 0.7603186532391403, + "flos": 23842100039040.0, + "grad_norm": 1.5738569654828107, + "language_loss": 0.72678912, + "learning_rate": 5.728342591927611e-07, + "loss": 0.74826688, + "num_input_tokens_seen": 272739575, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.12670898, + "step": 12646, + "time_per_iteration": 2.4487547874450684 + }, + { + "auxiliary_loss_clip": 0.01106408, + "auxiliary_loss_mlp": 0.01034211, + "balance_loss_clip": 1.03651142, + "balance_loss_mlp": 1.0213784, + "epoch": 0.7603787764918082, + "flos": 22199905117440.0, + "grad_norm": 2.0881625165460758, + "language_loss": 0.67334795, + "learning_rate": 5.725614407603949e-07, + "loss": 0.69475412, + "num_input_tokens_seen": 272758710, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.12823486, + "step": 12647, + "time_per_iteration": 2.472285270690918 + }, + { + "auxiliary_loss_clip": 0.01048992, + "auxiliary_loss_mlp": 0.01009327, + "balance_loss_clip": 1.02374673, + "balance_loss_mlp": 1.00779235, + "epoch": 0.7604388997444762, + "flos": 54086894254080.0, + "grad_norm": 0.6697681263918025, + "language_loss": 0.49019933, + "learning_rate": 5.722886764566415e-07, + "loss": 0.51078248, + "num_input_tokens_seen": 272814855, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.0153656, + "step": 12648, + "time_per_iteration": 3.026773452758789 + }, + { + "auxiliary_loss_clip": 0.01111324, + "auxiliary_loss_mlp": 0.01032915, + "balance_loss_clip": 1.04053104, + "balance_loss_mlp": 1.01993322, + "epoch": 0.7604990229971441, + "flos": 19681920789120.0, + "grad_norm": 4.556329405439653, + "language_loss": 0.76470584, + "learning_rate": 5.720159662918451e-07, + "loss": 0.78614825, + "num_input_tokens_seen": 272834400, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.12994385, + "step": 12649, + "time_per_iteration": 2.4836432933807373 + }, + { + "auxiliary_loss_clip": 0.01106566, + "auxiliary_loss_mlp": 0.01035447, + "balance_loss_clip": 1.03570759, + "balance_loss_mlp": 1.02214336, + "epoch": 0.7605591462498121, + "flos": 25228036356480.0, + "grad_norm": 1.5945532455584954, + "language_loss": 0.68979394, + "learning_rate": 5.717433102763462e-07, + "loss": 0.71121407, + "num_input_tokens_seen": 272854760, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.13299561, + "step": 12650, + "time_per_iteration": 2.5104405879974365 + }, + { + "auxiliary_loss_clip": 0.0104417, + "auxiliary_loss_mlp": 0.01005048, + "balance_loss_clip": 1.01889777, + "balance_loss_mlp": 1.00370407, + "epoch": 0.76061926950248, + "flos": 66783757662720.0, + "grad_norm": 0.7536930452065379, + "language_loss": 0.62791556, + "learning_rate": 5.714707084204838e-07, + "loss": 0.64840776, + "num_input_tokens_seen": 272919030, + "router_z_loss_clip": 0.25292969, + "router_z_loss_mlp": 0.01344299, + "step": 12651, + "time_per_iteration": 3.0762410163879395 + }, + { + "auxiliary_loss_clip": 0.01114505, + "auxiliary_loss_mlp": 0.01049443, + "balance_loss_clip": 1.04155922, + "balance_loss_mlp": 1.03505409, + "epoch": 0.7606793927551481, + "flos": 25338354001920.0, + "grad_norm": 1.4764076311477288, + "language_loss": 0.71501887, + "learning_rate": 5.711981607345951e-07, + "loss": 0.73665833, + "num_input_tokens_seen": 272938925, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.1439209, + "step": 12652, + "time_per_iteration": 2.4687318801879883 + }, + { + "auxiliary_loss_clip": 0.01115261, + "auxiliary_loss_mlp": 0.01037504, + "balance_loss_clip": 1.04099023, + "balance_loss_mlp": 1.02478981, + "epoch": 0.760739516007816, + "flos": 18223624523520.0, + "grad_norm": 2.2651562892125763, + "language_loss": 0.80512309, + "learning_rate": 5.709256672290152e-07, + "loss": 0.82665074, + "num_input_tokens_seen": 272954945, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.12713623, + "step": 12653, + "time_per_iteration": 2.423994779586792 + }, + { + "auxiliary_loss_clip": 0.01119875, + "auxiliary_loss_mlp": 0.01033151, + "balance_loss_clip": 1.0425539, + "balance_loss_mlp": 1.02094984, + "epoch": 0.760799639260484, + "flos": 22559119079040.0, + "grad_norm": 1.7543171668370594, + "language_loss": 0.79750323, + "learning_rate": 5.706532279140785e-07, + "loss": 0.8190335, + "num_input_tokens_seen": 272972855, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.12194824, + "step": 12654, + "time_per_iteration": 2.4715640544891357 + }, + { + "auxiliary_loss_clip": 0.01118547, + "auxiliary_loss_mlp": 0.01036141, + "balance_loss_clip": 1.04186296, + "balance_loss_mlp": 1.02347469, + "epoch": 0.760859762513152, + "flos": 22309324922880.0, + "grad_norm": 2.1325034120607453, + "language_loss": 0.79203188, + "learning_rate": 5.703808428001136e-07, + "loss": 0.81357872, + "num_input_tokens_seen": 272989895, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.12670898, + "step": 12655, + "time_per_iteration": 2.4335861206054688 + }, + { + "auxiliary_loss_clip": 0.01115318, + "auxiliary_loss_mlp": 0.01024532, + "balance_loss_clip": 1.04442501, + "balance_loss_mlp": 1.01456606, + "epoch": 0.7609198857658199, + "flos": 24863902231680.0, + "grad_norm": 1.4759072641761528, + "language_loss": 0.6823647, + "learning_rate": 5.701085118974505e-07, + "loss": 0.70376325, + "num_input_tokens_seen": 273011695, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.09960938, + "step": 12656, + "time_per_iteration": 2.506060838699341 + }, + { + "auxiliary_loss_clip": 0.0112917, + "auxiliary_loss_mlp": 0.01026517, + "balance_loss_clip": 1.0507077, + "balance_loss_mlp": 1.01438737, + "epoch": 0.760980009018488, + "flos": 16836790366080.0, + "grad_norm": 2.550027086722204, + "language_loss": 0.72983521, + "learning_rate": 5.698362352164164e-07, + "loss": 0.75139207, + "num_input_tokens_seen": 273028815, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.12133789, + "step": 12657, + "time_per_iteration": 2.493021011352539 + }, + { + "auxiliary_loss_clip": 0.01048199, + "auxiliary_loss_mlp": 0.01006641, + "balance_loss_clip": 1.02141726, + "balance_loss_mlp": 1.00507784, + "epoch": 0.7610401322711559, + "flos": 61230603029760.0, + "grad_norm": 0.8554593450767634, + "language_loss": 0.64882302, + "learning_rate": 5.695640127673347e-07, + "loss": 0.66937149, + "num_input_tokens_seen": 273084080, + "router_z_loss_clip": 0.26855469, + "router_z_loss_mlp": 0.01564026, + "step": 12658, + "time_per_iteration": 3.00935435295105 + }, + { + "auxiliary_loss_clip": 0.0111721, + "auxiliary_loss_mlp": 0.01033037, + "balance_loss_clip": 1.04836416, + "balance_loss_mlp": 1.02124715, + "epoch": 0.7611002555238239, + "flos": 19640730867840.0, + "grad_norm": 1.7770408024678324, + "language_loss": 0.79351008, + "learning_rate": 5.692918445605293e-07, + "loss": 0.81501245, + "num_input_tokens_seen": 273102295, + "router_z_loss_clip": 0.68847656, + "router_z_loss_mlp": 0.11791992, + "step": 12659, + "time_per_iteration": 2.422123670578003 + }, + { + "auxiliary_loss_clip": 0.01114708, + "auxiliary_loss_mlp": 0.0102454, + "balance_loss_clip": 1.0436058, + "balance_loss_mlp": 1.01270866, + "epoch": 0.7611603787764918, + "flos": 26872206526080.0, + "grad_norm": 1.4991510854204402, + "language_loss": 0.69080043, + "learning_rate": 5.690197306063209e-07, + "loss": 0.71219289, + "num_input_tokens_seen": 273123400, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.11834717, + "step": 12660, + "time_per_iteration": 2.4945735931396484 + }, + { + "auxiliary_loss_clip": 0.0111209, + "auxiliary_loss_mlp": 0.010342, + "balance_loss_clip": 1.03887331, + "balance_loss_mlp": 1.02105677, + "epoch": 0.7612205020291598, + "flos": 27344252085120.0, + "grad_norm": 1.8883001326399376, + "language_loss": 0.70539999, + "learning_rate": 5.687476709150281e-07, + "loss": 0.72686285, + "num_input_tokens_seen": 273145150, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.1315918, + "step": 12661, + "time_per_iteration": 2.497969388961792 + }, + { + "auxiliary_loss_clip": 0.01115709, + "auxiliary_loss_mlp": 0.01029689, + "balance_loss_clip": 1.04231191, + "balance_loss_mlp": 1.01794732, + "epoch": 0.7612806252818277, + "flos": 29314598682240.0, + "grad_norm": 1.8448421794001009, + "language_loss": 0.83443749, + "learning_rate": 5.68475665496966e-07, + "loss": 0.85589147, + "num_input_tokens_seen": 273165180, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11737061, + "step": 12662, + "time_per_iteration": 2.5124988555908203 + }, + { + "auxiliary_loss_clip": 0.01117, + "auxiliary_loss_mlp": 0.01037999, + "balance_loss_clip": 1.04317665, + "balance_loss_mlp": 1.02614963, + "epoch": 0.7613407485344957, + "flos": 19026048401280.0, + "grad_norm": 1.80988811587216, + "language_loss": 0.68864691, + "learning_rate": 5.682037143624505e-07, + "loss": 0.71019685, + "num_input_tokens_seen": 273184005, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11853027, + "step": 12663, + "time_per_iteration": 2.3987672328948975 + }, + { + "auxiliary_loss_clip": 0.01109135, + "auxiliary_loss_mlp": 0.01023391, + "balance_loss_clip": 1.03833532, + "balance_loss_mlp": 1.01174426, + "epoch": 0.7614008717871636, + "flos": 23256037733760.0, + "grad_norm": 1.7110940762573943, + "language_loss": 0.70378244, + "learning_rate": 5.67931817521794e-07, + "loss": 0.72510767, + "num_input_tokens_seen": 273203565, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.11639404, + "step": 12664, + "time_per_iteration": 2.460843324661255 + }, + { + "auxiliary_loss_clip": 0.01117459, + "auxiliary_loss_mlp": 0.01038681, + "balance_loss_clip": 1.0408783, + "balance_loss_mlp": 1.02587771, + "epoch": 0.7614609950398317, + "flos": 21579907438080.0, + "grad_norm": 1.7746442449197264, + "language_loss": 0.7938835, + "learning_rate": 5.676599749853066e-07, + "loss": 0.81544489, + "num_input_tokens_seen": 273221645, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.12805176, + "step": 12665, + "time_per_iteration": 3.8302366733551025 + }, + { + "auxiliary_loss_clip": 0.01111921, + "auxiliary_loss_mlp": 0.0103263, + "balance_loss_clip": 1.04314864, + "balance_loss_mlp": 1.02107275, + "epoch": 0.7615211182924996, + "flos": 29277897960960.0, + "grad_norm": 2.3181784106835472, + "language_loss": 0.87834245, + "learning_rate": 5.673881867632959e-07, + "loss": 0.89978796, + "num_input_tokens_seen": 273242040, + "router_z_loss_clip": 0.68701172, + "router_z_loss_mlp": 0.11553955, + "step": 12666, + "time_per_iteration": 2.533411741256714 + }, + { + "auxiliary_loss_clip": 0.01110091, + "auxiliary_loss_mlp": 0.01044014, + "balance_loss_clip": 1.03680158, + "balance_loss_mlp": 1.02965498, + "epoch": 0.7615812415451676, + "flos": 13261129136640.0, + "grad_norm": 3.0501720459731425, + "language_loss": 0.83841062, + "learning_rate": 5.671164528660693e-07, + "loss": 0.85995167, + "num_input_tokens_seen": 273257365, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.14361572, + "step": 12667, + "time_per_iteration": 2.4232685565948486 + }, + { + "auxiliary_loss_clip": 0.0111047, + "auxiliary_loss_mlp": 0.01032566, + "balance_loss_clip": 1.04127502, + "balance_loss_mlp": 1.02153897, + "epoch": 0.7616413647978356, + "flos": 18584741905920.0, + "grad_norm": 1.9367822244074084, + "language_loss": 0.78530633, + "learning_rate": 5.668447733039296e-07, + "loss": 0.80673671, + "num_input_tokens_seen": 273274710, + "router_z_loss_clip": 0.69238281, + "router_z_loss_mlp": 0.11016846, + "step": 12668, + "time_per_iteration": 2.4860098361968994 + }, + { + "auxiliary_loss_clip": 0.0111573, + "auxiliary_loss_mlp": 0.01028509, + "balance_loss_clip": 1.04361773, + "balance_loss_mlp": 1.01645088, + "epoch": 0.7617014880505035, + "flos": 18516188799360.0, + "grad_norm": 1.6955219370130743, + "language_loss": 0.63936251, + "learning_rate": 5.6657314808718e-07, + "loss": 0.66080487, + "num_input_tokens_seen": 273292870, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.12060547, + "step": 12669, + "time_per_iteration": 3.946300745010376 + }, + { + "auxiliary_loss_clip": 0.01118675, + "auxiliary_loss_mlp": 0.01034483, + "balance_loss_clip": 1.04294133, + "balance_loss_mlp": 1.0208391, + "epoch": 0.7617616113031715, + "flos": 24973178382720.0, + "grad_norm": 2.16889045564102, + "language_loss": 0.667413, + "learning_rate": 5.663015772261202e-07, + "loss": 0.68894458, + "num_input_tokens_seen": 273312375, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.13647461, + "step": 12670, + "time_per_iteration": 2.4384593963623047 + }, + { + "auxiliary_loss_clip": 0.01117115, + "auxiliary_loss_mlp": 0.01032646, + "balance_loss_clip": 1.04304421, + "balance_loss_mlp": 1.02077246, + "epoch": 0.7618217345558395, + "flos": 23295036925440.0, + "grad_norm": 1.9142486721566578, + "language_loss": 0.73324376, + "learning_rate": 5.660300607310493e-07, + "loss": 0.75474137, + "num_input_tokens_seen": 273332590, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11865234, + "step": 12671, + "time_per_iteration": 2.4824728965759277 + }, + { + "auxiliary_loss_clip": 0.01107298, + "auxiliary_loss_mlp": 0.01027998, + "balance_loss_clip": 1.03793168, + "balance_loss_mlp": 1.01719141, + "epoch": 0.7618818578085075, + "flos": 25482894330240.0, + "grad_norm": 1.6031347711901758, + "language_loss": 0.73028731, + "learning_rate": 5.657585986122613e-07, + "loss": 0.75164026, + "num_input_tokens_seen": 273352885, + "router_z_loss_clip": 0.69287109, + "router_z_loss_mlp": 0.10809326, + "step": 12672, + "time_per_iteration": 2.4893672466278076 + }, + { + "auxiliary_loss_clip": 0.01042374, + "auxiliary_loss_mlp": 0.0100376, + "balance_loss_clip": 1.01709199, + "balance_loss_mlp": 1.00224495, + "epoch": 0.7619419810611754, + "flos": 61151994115200.0, + "grad_norm": 0.7654504001180059, + "language_loss": 0.56679678, + "learning_rate": 5.654871908800506e-07, + "loss": 0.5872581, + "num_input_tokens_seen": 273411730, + "router_z_loss_clip": 0.25341797, + "router_z_loss_mlp": 0.01513672, + "step": 12673, + "time_per_iteration": 3.0606932640075684 + }, + { + "auxiliary_loss_clip": 0.01117247, + "auxiliary_loss_mlp": 0.01031737, + "balance_loss_clip": 1.04342794, + "balance_loss_mlp": 1.01914859, + "epoch": 0.7620021043138434, + "flos": 23258659426560.0, + "grad_norm": 2.9949031805197577, + "language_loss": 0.74599671, + "learning_rate": 5.652158375447102e-07, + "loss": 0.76748657, + "num_input_tokens_seen": 273430020, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.12579346, + "step": 12674, + "time_per_iteration": 3.9769179821014404 + }, + { + "auxiliary_loss_clip": 0.0111584, + "auxiliary_loss_mlp": 0.01026067, + "balance_loss_clip": 1.0449338, + "balance_loss_mlp": 1.0149982, + "epoch": 0.7620622275665113, + "flos": 25082490447360.0, + "grad_norm": 2.0298108188534996, + "language_loss": 0.72255605, + "learning_rate": 5.649445386165286e-07, + "loss": 0.74397504, + "num_input_tokens_seen": 273448690, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.11065674, + "step": 12675, + "time_per_iteration": 2.520036220550537 + }, + { + "auxiliary_loss_clip": 0.01117283, + "auxiliary_loss_mlp": 0.01029158, + "balance_loss_clip": 1.04758501, + "balance_loss_mlp": 1.01689088, + "epoch": 0.7621223508191793, + "flos": 20155007842560.0, + "grad_norm": 2.073877310420016, + "language_loss": 0.73099583, + "learning_rate": 5.646732941057936e-07, + "loss": 0.75246024, + "num_input_tokens_seen": 273465190, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.1227417, + "step": 12676, + "time_per_iteration": 2.450519561767578 + }, + { + "auxiliary_loss_clip": 0.01128246, + "auxiliary_loss_mlp": 0.01032218, + "balance_loss_clip": 1.04778004, + "balance_loss_mlp": 1.02003479, + "epoch": 0.7621824740718472, + "flos": 18000187971840.0, + "grad_norm": 2.5653801083361936, + "language_loss": 0.53780973, + "learning_rate": 5.644021040227927e-07, + "loss": 0.55941439, + "num_input_tokens_seen": 273478620, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.12176514, + "step": 12677, + "time_per_iteration": 2.3897643089294434 + }, + { + "auxiliary_loss_clip": 0.01119452, + "auxiliary_loss_mlp": 0.01033497, + "balance_loss_clip": 1.04682744, + "balance_loss_mlp": 1.02112293, + "epoch": 0.7622425973245153, + "flos": 21725668828800.0, + "grad_norm": 2.0404682769533533, + "language_loss": 0.78767401, + "learning_rate": 5.641309683778064e-07, + "loss": 0.80920351, + "num_input_tokens_seen": 273497635, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.12371826, + "step": 12678, + "time_per_iteration": 2.4395458698272705 + }, + { + "auxiliary_loss_clip": 0.01110957, + "auxiliary_loss_mlp": 0.01033597, + "balance_loss_clip": 1.03720522, + "balance_loss_mlp": 1.02104974, + "epoch": 0.7623027205771832, + "flos": 19718549683200.0, + "grad_norm": 1.9193634796915422, + "language_loss": 0.77903932, + "learning_rate": 5.638598871811175e-07, + "loss": 0.80048478, + "num_input_tokens_seen": 273513955, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.12548828, + "step": 12679, + "time_per_iteration": 2.4424750804901123 + }, + { + "auxiliary_loss_clip": 0.01110828, + "auxiliary_loss_mlp": 0.01024715, + "balance_loss_clip": 1.03868103, + "balance_loss_mlp": 1.01274014, + "epoch": 0.7623628438298512, + "flos": 23988831096960.0, + "grad_norm": 1.52716799983525, + "language_loss": 0.79938507, + "learning_rate": 5.635888604430059e-07, + "loss": 0.82074046, + "num_input_tokens_seen": 273533970, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11968994, + "step": 12680, + "time_per_iteration": 2.5402653217315674 + }, + { + "auxiliary_loss_clip": 0.01109162, + "auxiliary_loss_mlp": 0.01029587, + "balance_loss_clip": 1.03669453, + "balance_loss_mlp": 1.01500821, + "epoch": 0.7624229670825191, + "flos": 22345702421760.0, + "grad_norm": 2.3431110928466152, + "language_loss": 0.62896311, + "learning_rate": 5.633178881737493e-07, + "loss": 0.65035057, + "num_input_tokens_seen": 273553090, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.14581299, + "step": 12681, + "time_per_iteration": 2.4629836082458496 + }, + { + "auxiliary_loss_clip": 0.01106686, + "auxiliary_loss_mlp": 0.01029702, + "balance_loss_clip": 1.03746927, + "balance_loss_mlp": 1.01875269, + "epoch": 0.7624830903351871, + "flos": 22711775880960.0, + "grad_norm": 2.8322806524816073, + "language_loss": 0.75746053, + "learning_rate": 5.63046970383622e-07, + "loss": 0.77882439, + "num_input_tokens_seen": 273572460, + "router_z_loss_clip": 0.69238281, + "router_z_loss_mlp": 0.10949707, + "step": 12682, + "time_per_iteration": 2.4816970825195312 + }, + { + "auxiliary_loss_clip": 0.01117197, + "auxiliary_loss_mlp": 0.0102915, + "balance_loss_clip": 1.04440951, + "balance_loss_mlp": 1.01781929, + "epoch": 0.7625432135878552, + "flos": 25593714766080.0, + "grad_norm": 1.6907345196410017, + "language_loss": 0.68020391, + "learning_rate": 5.627761070828974e-07, + "loss": 0.70166731, + "num_input_tokens_seen": 273592815, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11340332, + "step": 12683, + "time_per_iteration": 2.4777748584747314 + }, + { + "auxiliary_loss_clip": 0.01111799, + "auxiliary_loss_mlp": 0.01033979, + "balance_loss_clip": 1.03879714, + "balance_loss_mlp": 1.02054393, + "epoch": 0.7626033368405231, + "flos": 23987645948160.0, + "grad_norm": 2.4447067434268894, + "language_loss": 0.83420318, + "learning_rate": 5.625052982818472e-07, + "loss": 0.85566092, + "num_input_tokens_seen": 273611790, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.13433838, + "step": 12684, + "time_per_iteration": 2.5643444061279297 + }, + { + "auxiliary_loss_clip": 0.01113846, + "auxiliary_loss_mlp": 0.01036026, + "balance_loss_clip": 1.04122794, + "balance_loss_mlp": 1.02303219, + "epoch": 0.7626634600931911, + "flos": 12599115523200.0, + "grad_norm": 2.0821929085284787, + "language_loss": 0.82698905, + "learning_rate": 5.622345439907396e-07, + "loss": 0.84848773, + "num_input_tokens_seen": 273628340, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.12988281, + "step": 12685, + "time_per_iteration": 2.4230194091796875 + }, + { + "auxiliary_loss_clip": 0.01113946, + "auxiliary_loss_mlp": 0.01024665, + "balance_loss_clip": 1.04091763, + "balance_loss_mlp": 1.01286316, + "epoch": 0.762723583345859, + "flos": 26322593546880.0, + "grad_norm": 1.9180398432806929, + "language_loss": 0.77021682, + "learning_rate": 5.619638442198422e-07, + "loss": 0.79160291, + "num_input_tokens_seen": 273646585, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11804199, + "step": 12686, + "time_per_iteration": 2.461663007736206 + }, + { + "auxiliary_loss_clip": 0.01112686, + "auxiliary_loss_mlp": 0.01038207, + "balance_loss_clip": 1.03773379, + "balance_loss_mlp": 1.0245223, + "epoch": 0.762783706598527, + "flos": 21907053532800.0, + "grad_norm": 1.6736928104635542, + "language_loss": 0.72435153, + "learning_rate": 5.616931989794198e-07, + "loss": 0.74586046, + "num_input_tokens_seen": 273665410, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.13684082, + "step": 12687, + "time_per_iteration": 2.4180381298065186 + }, + { + "auxiliary_loss_clip": 0.01113717, + "auxiliary_loss_mlp": 0.01033818, + "balance_loss_clip": 1.04098034, + "balance_loss_mlp": 1.02067494, + "epoch": 0.7628438298511949, + "flos": 15339782217600.0, + "grad_norm": 1.9969667943527485, + "language_loss": 0.64937919, + "learning_rate": 5.614226082797369e-07, + "loss": 0.67085451, + "num_input_tokens_seen": 273683035, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.13146973, + "step": 12688, + "time_per_iteration": 2.4224328994750977 + }, + { + "auxiliary_loss_clip": 0.01106543, + "auxiliary_loss_mlp": 0.01025685, + "balance_loss_clip": 1.03745413, + "balance_loss_mlp": 1.01450968, + "epoch": 0.7629039531038629, + "flos": 13006307076480.0, + "grad_norm": 2.122065370813275, + "language_loss": 0.70992404, + "learning_rate": 5.611520721310515e-07, + "loss": 0.73124635, + "num_input_tokens_seen": 273700130, + "router_z_loss_clip": 0.69091797, + "router_z_loss_mlp": 0.11169434, + "step": 12689, + "time_per_iteration": 3.8527119159698486 + }, + { + "auxiliary_loss_clip": 0.01117201, + "auxiliary_loss_mlp": 0.0103933, + "balance_loss_clip": 1.04022276, + "balance_loss_mlp": 1.0267719, + "epoch": 0.7629640763565309, + "flos": 26171660597760.0, + "grad_norm": 2.7113754645268173, + "language_loss": 0.70115173, + "learning_rate": 5.608815905436238e-07, + "loss": 0.72271705, + "num_input_tokens_seen": 273720310, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.12561035, + "step": 12690, + "time_per_iteration": 2.492884635925293 + }, + { + "auxiliary_loss_clip": 0.01110871, + "auxiliary_loss_mlp": 0.01035199, + "balance_loss_clip": 1.03804946, + "balance_loss_mlp": 1.0223186, + "epoch": 0.7630241996091989, + "flos": 36793713680640.0, + "grad_norm": 1.7363697810030272, + "language_loss": 0.69609141, + "learning_rate": 5.606111635277109e-07, + "loss": 0.71755219, + "num_input_tokens_seen": 273744475, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.12890625, + "step": 12691, + "time_per_iteration": 2.5358223915100098 + }, + { + "auxiliary_loss_clip": 0.01118419, + "auxiliary_loss_mlp": 0.01031092, + "balance_loss_clip": 1.04524636, + "balance_loss_mlp": 1.02011275, + "epoch": 0.7630843228618668, + "flos": 21835160461440.0, + "grad_norm": 1.6947674387881912, + "language_loss": 0.81979084, + "learning_rate": 5.603407910935662e-07, + "loss": 0.841286, + "num_input_tokens_seen": 273764635, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.10986328, + "step": 12692, + "time_per_iteration": 2.44574236869812 + }, + { + "auxiliary_loss_clip": 0.01125963, + "auxiliary_loss_mlp": 0.01031889, + "balance_loss_clip": 1.05134702, + "balance_loss_mlp": 1.02067721, + "epoch": 0.7631444461145348, + "flos": 12640520926080.0, + "grad_norm": 3.9270180208670324, + "language_loss": 0.76836038, + "learning_rate": 5.600704732514438e-07, + "loss": 0.78993887, + "num_input_tokens_seen": 273780115, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.11224365, + "step": 12693, + "time_per_iteration": 2.3936047554016113 + }, + { + "auxiliary_loss_clip": 0.01120507, + "auxiliary_loss_mlp": 0.01028542, + "balance_loss_clip": 1.04523909, + "balance_loss_mlp": 1.01606059, + "epoch": 0.7632045693672027, + "flos": 16836610798080.0, + "grad_norm": 2.1086774984271086, + "language_loss": 0.72512382, + "learning_rate": 5.598002100115933e-07, + "loss": 0.74661434, + "num_input_tokens_seen": 273796605, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.12475586, + "step": 12694, + "time_per_iteration": 2.4083948135375977 + }, + { + "auxiliary_loss_clip": 0.01113002, + "auxiliary_loss_mlp": 0.01026195, + "balance_loss_clip": 1.04218459, + "balance_loss_mlp": 1.0141784, + "epoch": 0.7632646926198707, + "flos": 22017335264640.0, + "grad_norm": 1.697453773749256, + "language_loss": 0.70966536, + "learning_rate": 5.595300013842625e-07, + "loss": 0.73105735, + "num_input_tokens_seen": 273816515, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.12017822, + "step": 12695, + "time_per_iteration": 2.440157651901245 + }, + { + "auxiliary_loss_clip": 0.01112728, + "auxiliary_loss_mlp": 0.01028663, + "balance_loss_clip": 1.04144025, + "balance_loss_mlp": 1.01748681, + "epoch": 0.7633248158725388, + "flos": 23114011357440.0, + "grad_norm": 1.580420485494377, + "language_loss": 0.72682786, + "learning_rate": 5.592598473796985e-07, + "loss": 0.74824178, + "num_input_tokens_seen": 273837060, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.11175537, + "step": 12696, + "time_per_iteration": 2.575486183166504 + }, + { + "auxiliary_loss_clip": 0.01116956, + "auxiliary_loss_mlp": 0.01030957, + "balance_loss_clip": 1.04421234, + "balance_loss_mlp": 1.01866055, + "epoch": 0.7633849391252067, + "flos": 10889839952640.0, + "grad_norm": 2.1044224569610006, + "language_loss": 0.7199465, + "learning_rate": 5.589897480081453e-07, + "loss": 0.74142563, + "num_input_tokens_seen": 273853365, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.1229248, + "step": 12697, + "time_per_iteration": 2.3806703090667725 + }, + { + "auxiliary_loss_clip": 0.01110587, + "auxiliary_loss_mlp": 0.01024834, + "balance_loss_clip": 1.04034066, + "balance_loss_mlp": 1.01355636, + "epoch": 0.7634450623778747, + "flos": 20994168355200.0, + "grad_norm": 12.622869323059293, + "language_loss": 0.66623425, + "learning_rate": 5.587197032798461e-07, + "loss": 0.68758851, + "num_input_tokens_seen": 273870750, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.112854, + "step": 12698, + "time_per_iteration": 2.5252299308776855 + }, + { + "auxiliary_loss_clip": 0.01112088, + "auxiliary_loss_mlp": 0.01027106, + "balance_loss_clip": 1.03944016, + "balance_loss_mlp": 1.01500082, + "epoch": 0.7635051856305426, + "flos": 18882046776960.0, + "grad_norm": 2.246822429752872, + "language_loss": 0.71926081, + "learning_rate": 5.5844971320504e-07, + "loss": 0.74065274, + "num_input_tokens_seen": 273890890, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.12103271, + "step": 12699, + "time_per_iteration": 2.47820782661438 + }, + { + "auxiliary_loss_clip": 0.01108714, + "auxiliary_loss_mlp": 0.01030016, + "balance_loss_clip": 1.03800583, + "balance_loss_mlp": 1.01904297, + "epoch": 0.7635653088832106, + "flos": 34786989584640.0, + "grad_norm": 2.1939691140799673, + "language_loss": 0.72952032, + "learning_rate": 5.581797777939648e-07, + "loss": 0.75090754, + "num_input_tokens_seen": 273914015, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.10980225, + "step": 12700, + "time_per_iteration": 2.6209466457366943 + }, + { + "auxiliary_loss_clip": 0.0111339, + "auxiliary_loss_mlp": 0.01028528, + "balance_loss_clip": 1.04103279, + "balance_loss_mlp": 1.01706576, + "epoch": 0.7636254321358785, + "flos": 23178434400000.0, + "grad_norm": 2.0223567470183132, + "language_loss": 0.69410855, + "learning_rate": 5.579098970568574e-07, + "loss": 0.71552765, + "num_input_tokens_seen": 273927415, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11462402, + "step": 12701, + "time_per_iteration": 2.4216959476470947 + }, + { + "auxiliary_loss_clip": 0.01111665, + "auxiliary_loss_mlp": 0.01033291, + "balance_loss_clip": 1.03924, + "balance_loss_mlp": 1.02005863, + "epoch": 0.7636855553885465, + "flos": 21325229032320.0, + "grad_norm": 1.6319239745757488, + "language_loss": 0.6471262, + "learning_rate": 5.576400710039508e-07, + "loss": 0.66857576, + "num_input_tokens_seen": 273946690, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.13244629, + "step": 12702, + "time_per_iteration": 2.4356508255004883 + }, + { + "auxiliary_loss_clip": 0.01111984, + "auxiliary_loss_mlp": 0.01029626, + "balance_loss_clip": 1.03921735, + "balance_loss_mlp": 1.01800323, + "epoch": 0.7637456786412145, + "flos": 28658079849600.0, + "grad_norm": 2.385654120406099, + "language_loss": 0.65807015, + "learning_rate": 5.57370299645477e-07, + "loss": 0.67948622, + "num_input_tokens_seen": 273966870, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11621094, + "step": 12703, + "time_per_iteration": 2.540377140045166 + }, + { + "auxiliary_loss_clip": 0.01110266, + "auxiliary_loss_mlp": 0.01029887, + "balance_loss_clip": 1.03772831, + "balance_loss_mlp": 1.01685715, + "epoch": 0.7638058018938825, + "flos": 21907269014400.0, + "grad_norm": 1.9262911155338256, + "language_loss": 0.83403915, + "learning_rate": 5.571005829916668e-07, + "loss": 0.85544068, + "num_input_tokens_seen": 273986360, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.13018799, + "step": 12704, + "time_per_iteration": 2.448350191116333 + }, + { + "auxiliary_loss_clip": 0.01118243, + "auxiliary_loss_mlp": 0.01037095, + "balance_loss_clip": 1.04552102, + "balance_loss_mlp": 1.02448893, + "epoch": 0.7638659251465504, + "flos": 29643899592960.0, + "grad_norm": 1.529548380853032, + "language_loss": 0.68103862, + "learning_rate": 5.568309210527469e-07, + "loss": 0.70259196, + "num_input_tokens_seen": 274009745, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.1260376, + "step": 12705, + "time_per_iteration": 2.4778530597686768 + }, + { + "auxiliary_loss_clip": 0.01113695, + "auxiliary_loss_mlp": 0.0102381, + "balance_loss_clip": 1.04304874, + "balance_loss_mlp": 1.01224625, + "epoch": 0.7639260483992184, + "flos": 26141172929280.0, + "grad_norm": 1.8723370634560375, + "language_loss": 0.74139714, + "learning_rate": 5.565613138389427e-07, + "loss": 0.7627722, + "num_input_tokens_seen": 274028775, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.11566162, + "step": 12706, + "time_per_iteration": 2.4738519191741943 + }, + { + "auxiliary_loss_clip": 0.0112163, + "auxiliary_loss_mlp": 0.01031584, + "balance_loss_clip": 1.04808497, + "balance_loss_mlp": 1.01967549, + "epoch": 0.7639861716518863, + "flos": 20156695781760.0, + "grad_norm": 1.8067561126490022, + "language_loss": 0.7868585, + "learning_rate": 5.562917613604781e-07, + "loss": 0.80839068, + "num_input_tokens_seen": 274047520, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11920166, + "step": 12707, + "time_per_iteration": 2.4218568801879883 + }, + { + "auxiliary_loss_clip": 0.01116487, + "auxiliary_loss_mlp": 0.01032539, + "balance_loss_clip": 1.041026, + "balance_loss_mlp": 1.01965261, + "epoch": 0.7640462949045543, + "flos": 18583125793920.0, + "grad_norm": 1.9437614385388997, + "language_loss": 0.80094063, + "learning_rate": 5.560222636275751e-07, + "loss": 0.82243091, + "num_input_tokens_seen": 274065350, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12890625, + "step": 12708, + "time_per_iteration": 2.4512596130371094 + }, + { + "auxiliary_loss_clip": 0.0107751, + "auxiliary_loss_mlp": 0.01004564, + "balance_loss_clip": 1.05305541, + "balance_loss_mlp": 1.00313187, + "epoch": 0.7641064181572224, + "flos": 68321991646080.0, + "grad_norm": 0.8129215652209341, + "language_loss": 0.56454891, + "learning_rate": 5.557528206504521e-07, + "loss": 0.58536965, + "num_input_tokens_seen": 274122315, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.014328, + "step": 12709, + "time_per_iteration": 4.486665964126587 + }, + { + "auxiliary_loss_clip": 0.01119181, + "auxiliary_loss_mlp": 0.01033694, + "balance_loss_clip": 1.04395008, + "balance_loss_mlp": 1.0210458, + "epoch": 0.7641665414098903, + "flos": 17968982031360.0, + "grad_norm": 1.9318410162920967, + "language_loss": 0.63532227, + "learning_rate": 5.554834324393271e-07, + "loss": 0.65685099, + "num_input_tokens_seen": 274140555, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.12658691, + "step": 12710, + "time_per_iteration": 2.4342753887176514 + }, + { + "auxiliary_loss_clip": 0.01119625, + "auxiliary_loss_mlp": 0.01036436, + "balance_loss_clip": 1.04192019, + "balance_loss_mlp": 1.0229001, + "epoch": 0.7642266646625583, + "flos": 21252078984960.0, + "grad_norm": 2.21402739629379, + "language_loss": 0.64870477, + "learning_rate": 5.552140990044154e-07, + "loss": 0.67026544, + "num_input_tokens_seen": 274161125, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.13543701, + "step": 12711, + "time_per_iteration": 2.454387903213501 + }, + { + "auxiliary_loss_clip": 0.01127445, + "auxiliary_loss_mlp": 0.01029324, + "balance_loss_clip": 1.05310619, + "balance_loss_mlp": 1.01792204, + "epoch": 0.7642867879152262, + "flos": 22747794243840.0, + "grad_norm": 1.5880113969552305, + "language_loss": 0.73089391, + "learning_rate": 5.549448203559293e-07, + "loss": 0.75246155, + "num_input_tokens_seen": 274180835, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.11401367, + "step": 12712, + "time_per_iteration": 3.8304054737091064 + }, + { + "auxiliary_loss_clip": 0.01112038, + "auxiliary_loss_mlp": 0.01029244, + "balance_loss_clip": 1.04150486, + "balance_loss_mlp": 1.01791859, + "epoch": 0.7643469111678942, + "flos": 23332132696320.0, + "grad_norm": 1.9613141914979022, + "language_loss": 0.80435598, + "learning_rate": 5.546755965040804e-07, + "loss": 0.82576883, + "num_input_tokens_seen": 274201190, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.11340332, + "step": 12713, + "time_per_iteration": 2.4650449752807617 + }, + { + "auxiliary_loss_clip": 0.01117455, + "auxiliary_loss_mlp": 0.01035005, + "balance_loss_clip": 1.0425806, + "balance_loss_mlp": 1.02051508, + "epoch": 0.7644070344205621, + "flos": 19857092440320.0, + "grad_norm": 2.463877990624412, + "language_loss": 0.83423495, + "learning_rate": 5.544064274590776e-07, + "loss": 0.8557595, + "num_input_tokens_seen": 274217595, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.14508057, + "step": 12714, + "time_per_iteration": 2.471484422683716 + }, + { + "auxiliary_loss_clip": 0.01119978, + "auxiliary_loss_mlp": 0.01036091, + "balance_loss_clip": 1.04563963, + "balance_loss_mlp": 1.0237236, + "epoch": 0.7644671576732301, + "flos": 22090628966400.0, + "grad_norm": 1.5203293506574094, + "language_loss": 0.73049033, + "learning_rate": 5.541373132311287e-07, + "loss": 0.752051, + "num_input_tokens_seen": 274237885, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.1237793, + "step": 12715, + "time_per_iteration": 2.4967942237854004 + }, + { + "auxiliary_loss_clip": 0.01118025, + "auxiliary_loss_mlp": 0.01024754, + "balance_loss_clip": 1.04545999, + "balance_loss_mlp": 1.01309538, + "epoch": 0.7645272809258981, + "flos": 25481421872640.0, + "grad_norm": 1.752168430585331, + "language_loss": 0.63378525, + "learning_rate": 5.538682538304376e-07, + "loss": 0.655213, + "num_input_tokens_seen": 274258820, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11651611, + "step": 12716, + "time_per_iteration": 2.4676358699798584 + }, + { + "auxiliary_loss_clip": 0.01116541, + "auxiliary_loss_mlp": 0.01030607, + "balance_loss_clip": 1.04186535, + "balance_loss_mlp": 1.01803052, + "epoch": 0.7645874041785661, + "flos": 21541877913600.0, + "grad_norm": 1.5745085650055435, + "language_loss": 0.79859638, + "learning_rate": 5.535992492672068e-07, + "loss": 0.82006782, + "num_input_tokens_seen": 274278835, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.12579346, + "step": 12717, + "time_per_iteration": 2.470139741897583 + }, + { + "auxiliary_loss_clip": 0.0111577, + "auxiliary_loss_mlp": 0.01030293, + "balance_loss_clip": 1.04261398, + "balance_loss_mlp": 1.01860428, + "epoch": 0.764647527431234, + "flos": 20630896156800.0, + "grad_norm": 2.6930454092252916, + "language_loss": 0.66680968, + "learning_rate": 5.53330299551638e-07, + "loss": 0.68827033, + "num_input_tokens_seen": 274297110, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11694336, + "step": 12718, + "time_per_iteration": 3.9724888801574707 + }, + { + "auxiliary_loss_clip": 0.01111791, + "auxiliary_loss_mlp": 0.010349, + "balance_loss_clip": 1.03946579, + "balance_loss_mlp": 1.02368248, + "epoch": 0.764707650683902, + "flos": 21434074220160.0, + "grad_norm": 1.880874798569003, + "language_loss": 0.77147901, + "learning_rate": 5.530614046939286e-07, + "loss": 0.79294598, + "num_input_tokens_seen": 274315610, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11218262, + "step": 12719, + "time_per_iteration": 2.4954283237457275 + }, + { + "auxiliary_loss_clip": 0.01118389, + "auxiliary_loss_mlp": 0.01028403, + "balance_loss_clip": 1.04320002, + "balance_loss_mlp": 1.01559365, + "epoch": 0.7647677739365699, + "flos": 22711201263360.0, + "grad_norm": 1.665012795743479, + "language_loss": 0.70012069, + "learning_rate": 5.527925647042754e-07, + "loss": 0.72158861, + "num_input_tokens_seen": 274333975, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12811279, + "step": 12720, + "time_per_iteration": 2.4517886638641357 + }, + { + "auxiliary_loss_clip": 0.01122677, + "auxiliary_loss_mlp": 0.01032065, + "balance_loss_clip": 1.04968429, + "balance_loss_mlp": 1.02022791, + "epoch": 0.7648278971892379, + "flos": 21324115710720.0, + "grad_norm": 1.6505209952127653, + "language_loss": 0.73939085, + "learning_rate": 5.52523779592875e-07, + "loss": 0.76093823, + "num_input_tokens_seen": 274353695, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11853027, + "step": 12721, + "time_per_iteration": 2.5428543090820312 + }, + { + "auxiliary_loss_clip": 0.01112369, + "auxiliary_loss_mlp": 0.01028254, + "balance_loss_clip": 1.039096, + "balance_loss_mlp": 1.01607084, + "epoch": 0.764888020441906, + "flos": 20667345482880.0, + "grad_norm": 1.9626881236910134, + "language_loss": 0.73840785, + "learning_rate": 5.522550493699163e-07, + "loss": 0.75981414, + "num_input_tokens_seen": 274371120, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.12176514, + "step": 12722, + "time_per_iteration": 2.487325668334961 + }, + { + "auxiliary_loss_clip": 0.01106301, + "auxiliary_loss_mlp": 0.01029771, + "balance_loss_clip": 1.03546548, + "balance_loss_mlp": 1.01807022, + "epoch": 0.7649481436945739, + "flos": 25082526360960.0, + "grad_norm": 2.369142288108205, + "language_loss": 0.74153793, + "learning_rate": 5.519863740455912e-07, + "loss": 0.76289856, + "num_input_tokens_seen": 274389665, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.11706543, + "step": 12723, + "time_per_iteration": 2.5066311359405518 + }, + { + "auxiliary_loss_clip": 0.0111251, + "auxiliary_loss_mlp": 0.01033173, + "balance_loss_clip": 1.03832507, + "balance_loss_mlp": 1.02118623, + "epoch": 0.7650082669472419, + "flos": 24900890261760.0, + "grad_norm": 1.7516366234156173, + "language_loss": 0.73490632, + "learning_rate": 5.517177536300881e-07, + "loss": 0.75636315, + "num_input_tokens_seen": 274408750, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11987305, + "step": 12724, + "time_per_iteration": 2.5117604732513428 + }, + { + "auxiliary_loss_clip": 0.0110415, + "auxiliary_loss_mlp": 0.01027552, + "balance_loss_clip": 1.03520513, + "balance_loss_mlp": 1.01638222, + "epoch": 0.7650683901999098, + "flos": 14647388676480.0, + "grad_norm": 13.046108177717128, + "language_loss": 0.84022254, + "learning_rate": 5.514491881335935e-07, + "loss": 0.86153954, + "num_input_tokens_seen": 274424600, + "router_z_loss_clip": 0.68896484, + "router_z_loss_mlp": 0.11175537, + "step": 12725, + "time_per_iteration": 2.4936301708221436 + }, + { + "auxiliary_loss_clip": 0.01123297, + "auxiliary_loss_mlp": 0.01032229, + "balance_loss_clip": 1.04962027, + "balance_loss_mlp": 1.01957512, + "epoch": 0.7651285134525778, + "flos": 26352434770560.0, + "grad_norm": 1.7426490518367141, + "language_loss": 0.77157503, + "learning_rate": 5.511806775662901e-07, + "loss": 0.79313028, + "num_input_tokens_seen": 274443075, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.12652588, + "step": 12726, + "time_per_iteration": 2.5494494438171387 + }, + { + "auxiliary_loss_clip": 0.01108374, + "auxiliary_loss_mlp": 0.01032043, + "balance_loss_clip": 1.03566718, + "balance_loss_mlp": 1.01988387, + "epoch": 0.7651886367052457, + "flos": 26646866553600.0, + "grad_norm": 1.9209054426861347, + "language_loss": 0.70735383, + "learning_rate": 5.509122219383615e-07, + "loss": 0.72875804, + "num_input_tokens_seen": 274463240, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.121521, + "step": 12727, + "time_per_iteration": 2.5296761989593506 + }, + { + "auxiliary_loss_clip": 0.01106147, + "auxiliary_loss_mlp": 0.01023706, + "balance_loss_clip": 1.03607106, + "balance_loss_mlp": 1.01217294, + "epoch": 0.7652487599579137, + "flos": 25702847262720.0, + "grad_norm": 1.702152197741265, + "language_loss": 0.79357052, + "learning_rate": 5.506438212599864e-07, + "loss": 0.81486905, + "num_input_tokens_seen": 274482750, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.11535645, + "step": 12728, + "time_per_iteration": 2.4801783561706543 + }, + { + "auxiliary_loss_clip": 0.01118952, + "auxiliary_loss_mlp": 0.01027486, + "balance_loss_clip": 1.04511476, + "balance_loss_mlp": 1.01532626, + "epoch": 0.7653088832105817, + "flos": 28585576247040.0, + "grad_norm": 2.0772519213685507, + "language_loss": 0.5554353, + "learning_rate": 5.503754755413424e-07, + "loss": 0.57689965, + "num_input_tokens_seen": 274503545, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.12158203, + "step": 12729, + "time_per_iteration": 2.5779950618743896 + }, + { + "auxiliary_loss_clip": 0.01117843, + "auxiliary_loss_mlp": 0.01030273, + "balance_loss_clip": 1.04340303, + "balance_loss_mlp": 1.01789355, + "epoch": 0.7653690064632497, + "flos": 23366750428800.0, + "grad_norm": 1.8874409941673775, + "language_loss": 0.78119069, + "learning_rate": 5.501071847926055e-07, + "loss": 0.80267185, + "num_input_tokens_seen": 274523825, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.12384033, + "step": 12730, + "time_per_iteration": 2.4738686084747314 + }, + { + "auxiliary_loss_clip": 0.01115147, + "auxiliary_loss_mlp": 0.01043764, + "balance_loss_clip": 1.0415113, + "balance_loss_mlp": 1.03117573, + "epoch": 0.7654291297159176, + "flos": 15773905992960.0, + "grad_norm": 1.8987235664881525, + "language_loss": 0.68899751, + "learning_rate": 5.498389490239495e-07, + "loss": 0.71058661, + "num_input_tokens_seen": 274541625, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.12585449, + "step": 12731, + "time_per_iteration": 2.4825241565704346 + }, + { + "auxiliary_loss_clip": 0.01128153, + "auxiliary_loss_mlp": 0.01031311, + "balance_loss_clip": 1.05319834, + "balance_loss_mlp": 1.01899624, + "epoch": 0.7654892529685856, + "flos": 18033800123520.0, + "grad_norm": 2.7989067858457997, + "language_loss": 0.70072341, + "learning_rate": 5.495707682455471e-07, + "loss": 0.72231799, + "num_input_tokens_seen": 274557580, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12322998, + "step": 12732, + "time_per_iteration": 3.8358957767486572 + }, + { + "auxiliary_loss_clip": 0.01125016, + "auxiliary_loss_mlp": 0.01028955, + "balance_loss_clip": 1.04949164, + "balance_loss_mlp": 1.01618755, + "epoch": 0.7655493762212535, + "flos": 27236017428480.0, + "grad_norm": 1.5093376842296227, + "language_loss": 0.78046834, + "learning_rate": 5.493026424675653e-07, + "loss": 0.80200809, + "num_input_tokens_seen": 274578135, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12780762, + "step": 12733, + "time_per_iteration": 2.4839560985565186 + }, + { + "auxiliary_loss_clip": 0.01112427, + "auxiliary_loss_mlp": 0.01030846, + "balance_loss_clip": 1.04203248, + "balance_loss_mlp": 1.01906228, + "epoch": 0.7656094994739215, + "flos": 20773964027520.0, + "grad_norm": 1.7165876937266205, + "language_loss": 0.77537572, + "learning_rate": 5.490345717001726e-07, + "loss": 0.79680842, + "num_input_tokens_seen": 274595655, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.11791992, + "step": 12734, + "time_per_iteration": 2.4243314266204834 + }, + { + "auxiliary_loss_clip": 0.01116967, + "auxiliary_loss_mlp": 0.0103038, + "balance_loss_clip": 1.04110646, + "balance_loss_mlp": 1.0174582, + "epoch": 0.7656696227265896, + "flos": 23039245198080.0, + "grad_norm": 2.058587258701411, + "language_loss": 0.73246038, + "learning_rate": 5.48766555953535e-07, + "loss": 0.75393391, + "num_input_tokens_seen": 274616305, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.12945557, + "step": 12735, + "time_per_iteration": 2.4916844367980957 + }, + { + "auxiliary_loss_clip": 0.01114787, + "auxiliary_loss_mlp": 0.0103228, + "balance_loss_clip": 1.0418818, + "balance_loss_mlp": 1.02062082, + "epoch": 0.7657297459792575, + "flos": 27525636789120.0, + "grad_norm": 1.4351271876846456, + "language_loss": 0.72670317, + "learning_rate": 5.484985952378145e-07, + "loss": 0.74817383, + "num_input_tokens_seen": 274638110, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11669922, + "step": 12736, + "time_per_iteration": 2.524980068206787 + }, + { + "auxiliary_loss_clip": 0.01111847, + "auxiliary_loss_mlp": 0.01040254, + "balance_loss_clip": 1.03659368, + "balance_loss_mlp": 1.02588928, + "epoch": 0.7657898692319255, + "flos": 17128456801920.0, + "grad_norm": 2.6731396421022646, + "language_loss": 0.77863592, + "learning_rate": 5.482306895631728e-07, + "loss": 0.80015695, + "num_input_tokens_seen": 274656565, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.14367676, + "step": 12737, + "time_per_iteration": 2.4278993606567383 + }, + { + "auxiliary_loss_clip": 0.01113625, + "auxiliary_loss_mlp": 0.01032103, + "balance_loss_clip": 1.04052401, + "balance_loss_mlp": 1.01958621, + "epoch": 0.7658499924845934, + "flos": 21465747037440.0, + "grad_norm": 1.5976373428340316, + "language_loss": 0.76341748, + "learning_rate": 5.479628389397699e-07, + "loss": 0.78487474, + "num_input_tokens_seen": 274674215, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.12518311, + "step": 12738, + "time_per_iteration": 2.475888252258301 + }, + { + "auxiliary_loss_clip": 0.01114496, + "auxiliary_loss_mlp": 0.01030281, + "balance_loss_clip": 1.03985882, + "balance_loss_mlp": 1.01787734, + "epoch": 0.7659101157372614, + "flos": 29496665744640.0, + "grad_norm": 2.055595366771679, + "language_loss": 0.63043255, + "learning_rate": 5.476950433777603e-07, + "loss": 0.65188038, + "num_input_tokens_seen": 274693445, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.12384033, + "step": 12739, + "time_per_iteration": 2.474449396133423 + }, + { + "auxiliary_loss_clip": 0.01110854, + "auxiliary_loss_mlp": 0.01033947, + "balance_loss_clip": 1.03851819, + "balance_loss_mlp": 1.02016044, + "epoch": 0.7659702389899293, + "flos": 18551812112640.0, + "grad_norm": 2.0768536032694414, + "language_loss": 0.79903942, + "learning_rate": 5.474273028873004e-07, + "loss": 0.8204875, + "num_input_tokens_seen": 274712815, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.13793945, + "step": 12740, + "time_per_iteration": 2.4819424152374268 + }, + { + "auxiliary_loss_clip": 0.01109467, + "auxiliary_loss_mlp": 0.010358, + "balance_loss_clip": 1.03690243, + "balance_loss_mlp": 1.02185845, + "epoch": 0.7660303622425974, + "flos": 23549176627200.0, + "grad_norm": 2.088249533679058, + "language_loss": 0.65852106, + "learning_rate": 5.471596174785429e-07, + "loss": 0.67997372, + "num_input_tokens_seen": 274732690, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.13946533, + "step": 12741, + "time_per_iteration": 2.5716373920440674 + }, + { + "auxiliary_loss_clip": 0.01114537, + "auxiliary_loss_mlp": 0.01026596, + "balance_loss_clip": 1.04201376, + "balance_loss_mlp": 1.01455617, + "epoch": 0.7660904854952653, + "flos": 18916736336640.0, + "grad_norm": 1.8220789070135797, + "language_loss": 0.75547808, + "learning_rate": 5.468919871616386e-07, + "loss": 0.77688944, + "num_input_tokens_seen": 274752460, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.1204834, + "step": 12742, + "time_per_iteration": 2.441047191619873 + }, + { + "auxiliary_loss_clip": 0.01113312, + "auxiliary_loss_mlp": 0.01033133, + "balance_loss_clip": 1.04065609, + "balance_loss_mlp": 1.02029395, + "epoch": 0.7661506087479333, + "flos": 23147515768320.0, + "grad_norm": 1.4066204347979723, + "language_loss": 0.76659584, + "learning_rate": 5.46624411946736e-07, + "loss": 0.78806019, + "num_input_tokens_seen": 274773070, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.12854004, + "step": 12743, + "time_per_iteration": 2.5006978511810303 + }, + { + "auxiliary_loss_clip": 0.01115546, + "auxiliary_loss_mlp": 0.01030198, + "balance_loss_clip": 1.04400921, + "balance_loss_mlp": 1.01816368, + "epoch": 0.7662107320006012, + "flos": 17565776887680.0, + "grad_norm": 1.8881325217493212, + "language_loss": 0.75121719, + "learning_rate": 5.463568918439805e-07, + "loss": 0.77267462, + "num_input_tokens_seen": 274790220, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.12042236, + "step": 12744, + "time_per_iteration": 2.482180595397949 + }, + { + "auxiliary_loss_clip": 0.01121894, + "auxiliary_loss_mlp": 0.01031063, + "balance_loss_clip": 1.04567504, + "balance_loss_mlp": 1.01800966, + "epoch": 0.7662708552532692, + "flos": 22303075956480.0, + "grad_norm": 2.2472309357645357, + "language_loss": 0.71301973, + "learning_rate": 5.460894268635181e-07, + "loss": 0.7345494, + "num_input_tokens_seen": 274805095, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.1305542, + "step": 12745, + "time_per_iteration": 2.427255630493164 + }, + { + "auxiliary_loss_clip": 0.01120334, + "auxiliary_loss_mlp": 0.0103826, + "balance_loss_clip": 1.04115248, + "balance_loss_mlp": 1.02519464, + "epoch": 0.7663309785059371, + "flos": 15742053607680.0, + "grad_norm": 3.346175681670436, + "language_loss": 0.77256012, + "learning_rate": 5.458220170154896e-07, + "loss": 0.794146, + "num_input_tokens_seen": 274821800, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.13061523, + "step": 12746, + "time_per_iteration": 2.429644823074341 + }, + { + "auxiliary_loss_clip": 0.01055187, + "auxiliary_loss_mlp": 0.01005967, + "balance_loss_clip": 1.02972436, + "balance_loss_mlp": 1.00467086, + "epoch": 0.7663911017586051, + "flos": 62163312514560.0, + "grad_norm": 0.6621251796169291, + "language_loss": 0.56805164, + "learning_rate": 5.455546623100362e-07, + "loss": 0.58866316, + "num_input_tokens_seen": 274886970, + "router_z_loss_clip": 0.25463867, + "router_z_loss_mlp": 0.01296997, + "step": 12747, + "time_per_iteration": 3.133760929107666 + }, + { + "auxiliary_loss_clip": 0.01119898, + "auxiliary_loss_mlp": 0.01030895, + "balance_loss_clip": 1.04789591, + "balance_loss_mlp": 1.01990426, + "epoch": 0.7664512250112732, + "flos": 26506025326080.0, + "grad_norm": 1.4983884581406657, + "language_loss": 0.72260928, + "learning_rate": 5.452873627572956e-07, + "loss": 0.74411726, + "num_input_tokens_seen": 274907240, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.10986328, + "step": 12748, + "time_per_iteration": 2.526904821395874 + }, + { + "auxiliary_loss_clip": 0.01112454, + "auxiliary_loss_mlp": 0.01030398, + "balance_loss_clip": 1.04025149, + "balance_loss_mlp": 1.01817322, + "epoch": 0.7665113482639411, + "flos": 16249542912000.0, + "grad_norm": 1.9772031161237407, + "language_loss": 0.69385731, + "learning_rate": 5.450201183674052e-07, + "loss": 0.7152859, + "num_input_tokens_seen": 274924650, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.12219238, + "step": 12749, + "time_per_iteration": 2.459463357925415 + }, + { + "auxiliary_loss_clip": 0.01123489, + "auxiliary_loss_mlp": 0.010295, + "balance_loss_clip": 1.04983222, + "balance_loss_mlp": 1.01645851, + "epoch": 0.7665714715166091, + "flos": 27197880163200.0, + "grad_norm": 1.8325814285918811, + "language_loss": 0.73686826, + "learning_rate": 5.447529291504967e-07, + "loss": 0.75839812, + "num_input_tokens_seen": 274944550, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.1305542, + "step": 12750, + "time_per_iteration": 2.50138258934021 + }, + { + "auxiliary_loss_clip": 0.01112614, + "auxiliary_loss_mlp": 0.01028504, + "balance_loss_clip": 1.04178381, + "balance_loss_mlp": 1.01716697, + "epoch": 0.766631594769277, + "flos": 21067785279360.0, + "grad_norm": 2.136471508703676, + "language_loss": 0.76128691, + "learning_rate": 5.444857951167026e-07, + "loss": 0.78269804, + "num_input_tokens_seen": 274961330, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.11334229, + "step": 12751, + "time_per_iteration": 2.4250235557556152 + }, + { + "auxiliary_loss_clip": 0.01122243, + "auxiliary_loss_mlp": 0.01035678, + "balance_loss_clip": 1.0479691, + "balance_loss_mlp": 1.0235666, + "epoch": 0.766691718021945, + "flos": 24097963593600.0, + "grad_norm": 2.6243811971511475, + "language_loss": 0.6147573, + "learning_rate": 5.442187162761537e-07, + "loss": 0.63633657, + "num_input_tokens_seen": 274981655, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12109375, + "step": 12752, + "time_per_iteration": 2.488759994506836 + }, + { + "auxiliary_loss_clip": 0.01117209, + "auxiliary_loss_mlp": 0.01033067, + "balance_loss_clip": 1.04394472, + "balance_loss_mlp": 1.0201211, + "epoch": 0.7667518412746129, + "flos": 23440654661760.0, + "grad_norm": 1.745018406414463, + "language_loss": 0.68873858, + "learning_rate": 5.439516926389767e-07, + "loss": 0.71024132, + "num_input_tokens_seen": 274999970, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.1295166, + "step": 12753, + "time_per_iteration": 3.9055733680725098 + }, + { + "auxiliary_loss_clip": 0.01112469, + "auxiliary_loss_mlp": 0.01035019, + "balance_loss_clip": 1.03947902, + "balance_loss_mlp": 1.02353907, + "epoch": 0.766811964527281, + "flos": 18148786536960.0, + "grad_norm": 3.368812624475305, + "language_loss": 0.6283409, + "learning_rate": 5.436847242152971e-07, + "loss": 0.64981574, + "num_input_tokens_seen": 275015805, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11480713, + "step": 12754, + "time_per_iteration": 2.4077343940734863 + }, + { + "auxiliary_loss_clip": 0.01122652, + "auxiliary_loss_mlp": 0.01030705, + "balance_loss_clip": 1.05104041, + "balance_loss_mlp": 1.01881433, + "epoch": 0.7668720877799489, + "flos": 19536051657600.0, + "grad_norm": 2.2672577133032035, + "language_loss": 0.79997516, + "learning_rate": 5.434178110152401e-07, + "loss": 0.82150877, + "num_input_tokens_seen": 275031810, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11889648, + "step": 12755, + "time_per_iteration": 3.901595115661621 + }, + { + "auxiliary_loss_clip": 0.01113459, + "auxiliary_loss_mlp": 0.01028781, + "balance_loss_clip": 1.04202604, + "balance_loss_mlp": 1.0167408, + "epoch": 0.7669322110326169, + "flos": 22674320974080.0, + "grad_norm": 2.027353513139015, + "language_loss": 0.70476931, + "learning_rate": 5.431509530489242e-07, + "loss": 0.7261917, + "num_input_tokens_seen": 275049325, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.12036133, + "step": 12756, + "time_per_iteration": 2.462681770324707 + }, + { + "auxiliary_loss_clip": 0.01117766, + "auxiliary_loss_mlp": 0.01036211, + "balance_loss_clip": 1.04542255, + "balance_loss_mlp": 1.02465355, + "epoch": 0.7669923342852848, + "flos": 26469396432000.0, + "grad_norm": 1.5209164434665603, + "language_loss": 0.70281136, + "learning_rate": 5.428841503264706e-07, + "loss": 0.72435111, + "num_input_tokens_seen": 275070865, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11560059, + "step": 12757, + "time_per_iteration": 2.4877254962921143 + }, + { + "auxiliary_loss_clip": 0.01116621, + "auxiliary_loss_mlp": 0.01031851, + "balance_loss_clip": 1.04424214, + "balance_loss_mlp": 1.01969719, + "epoch": 0.7670524575379528, + "flos": 22856136641280.0, + "grad_norm": 3.247428340575318, + "language_loss": 0.76231885, + "learning_rate": 5.426174028579955e-07, + "loss": 0.78380358, + "num_input_tokens_seen": 275088015, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.12145996, + "step": 12758, + "time_per_iteration": 2.4467570781707764 + }, + { + "auxiliary_loss_clip": 0.01113078, + "auxiliary_loss_mlp": 0.01036113, + "balance_loss_clip": 1.04209232, + "balance_loss_mlp": 1.02438247, + "epoch": 0.7671125807906207, + "flos": 22452141398400.0, + "grad_norm": 1.8374377268259285, + "language_loss": 0.76253885, + "learning_rate": 5.423507106536156e-07, + "loss": 0.7840308, + "num_input_tokens_seen": 275106975, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.11724854, + "step": 12759, + "time_per_iteration": 2.4570887088775635 + }, + { + "auxiliary_loss_clip": 0.01115658, + "auxiliary_loss_mlp": 0.01028273, + "balance_loss_clip": 1.04191637, + "balance_loss_mlp": 1.01672745, + "epoch": 0.7671727040432887, + "flos": 35371543518720.0, + "grad_norm": 2.4309789217695603, + "language_loss": 0.68225551, + "learning_rate": 5.420840737234425e-07, + "loss": 0.70369488, + "num_input_tokens_seen": 275129560, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11541748, + "step": 12760, + "time_per_iteration": 2.568363904953003 + }, + { + "auxiliary_loss_clip": 0.01115425, + "auxiliary_loss_mlp": 0.01030078, + "balance_loss_clip": 1.04240012, + "balance_loss_mlp": 1.01771569, + "epoch": 0.7672328272959568, + "flos": 22494947431680.0, + "grad_norm": 1.5321776726695404, + "language_loss": 0.79367661, + "learning_rate": 5.418174920775871e-07, + "loss": 0.81513166, + "num_input_tokens_seen": 275151180, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.12365723, + "step": 12761, + "time_per_iteration": 3.9696478843688965 + }, + { + "auxiliary_loss_clip": 0.01112735, + "auxiliary_loss_mlp": 0.01030349, + "balance_loss_clip": 1.04228044, + "balance_loss_mlp": 1.0185535, + "epoch": 0.7672929505486247, + "flos": 22815557251200.0, + "grad_norm": 1.8292070570043297, + "language_loss": 0.66057205, + "learning_rate": 5.415509657261589e-07, + "loss": 0.6820029, + "num_input_tokens_seen": 275170605, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.11798096, + "step": 12762, + "time_per_iteration": 2.4760828018188477 + }, + { + "auxiliary_loss_clip": 0.01111531, + "auxiliary_loss_mlp": 0.0102938, + "balance_loss_clip": 1.03733826, + "balance_loss_mlp": 1.0164814, + "epoch": 0.7673530738012927, + "flos": 20338834671360.0, + "grad_norm": 2.229986526132047, + "language_loss": 0.7445277, + "learning_rate": 5.412844946792639e-07, + "loss": 0.76593679, + "num_input_tokens_seen": 275188750, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.12908936, + "step": 12763, + "time_per_iteration": 2.4597718715667725 + }, + { + "auxiliary_loss_clip": 0.01119823, + "auxiliary_loss_mlp": 0.01030431, + "balance_loss_clip": 1.04602361, + "balance_loss_mlp": 1.01842034, + "epoch": 0.7674131970539606, + "flos": 34933576988160.0, + "grad_norm": 1.399424410360446, + "language_loss": 0.70705897, + "learning_rate": 5.410180789470067e-07, + "loss": 0.72856152, + "num_input_tokens_seen": 275211365, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.12011719, + "step": 12764, + "time_per_iteration": 2.6863701343536377 + }, + { + "auxiliary_loss_clip": 0.01114403, + "auxiliary_loss_mlp": 0.01034557, + "balance_loss_clip": 1.03996861, + "balance_loss_mlp": 1.02209997, + "epoch": 0.7674733203066286, + "flos": 28328850766080.0, + "grad_norm": 1.8024216690667803, + "language_loss": 0.69401914, + "learning_rate": 5.40751718539491e-07, + "loss": 0.71550876, + "num_input_tokens_seen": 275231670, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.12475586, + "step": 12765, + "time_per_iteration": 2.482001781463623 + }, + { + "auxiliary_loss_clip": 0.0111764, + "auxiliary_loss_mlp": 0.01028595, + "balance_loss_clip": 1.04416788, + "balance_loss_mlp": 1.01771712, + "epoch": 0.7675334435592965, + "flos": 16289727252480.0, + "grad_norm": 2.35815268686876, + "language_loss": 0.60712689, + "learning_rate": 5.404854134668162e-07, + "loss": 0.62858927, + "num_input_tokens_seen": 275249425, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.10876465, + "step": 12766, + "time_per_iteration": 2.47847318649292 + }, + { + "auxiliary_loss_clip": 0.01057732, + "auxiliary_loss_mlp": 0.01000775, + "balance_loss_clip": 1.03312898, + "balance_loss_mlp": 0.99939048, + "epoch": 0.7675935668119646, + "flos": 64826232220800.0, + "grad_norm": 0.7350481500928677, + "language_loss": 0.6079759, + "learning_rate": 5.402191637390803e-07, + "loss": 0.62856096, + "num_input_tokens_seen": 275312485, + "router_z_loss_clip": 0.24658203, + "router_z_loss_mlp": 0.01383972, + "step": 12767, + "time_per_iteration": 3.2010881900787354 + }, + { + "auxiliary_loss_clip": 0.0111591, + "auxiliary_loss_mlp": 0.01027316, + "balance_loss_clip": 1.04554355, + "balance_loss_mlp": 1.016289, + "epoch": 0.7676536900646325, + "flos": 22675398382080.0, + "grad_norm": 2.6033071552026996, + "language_loss": 0.69431353, + "learning_rate": 5.399529693663801e-07, + "loss": 0.71574575, + "num_input_tokens_seen": 275331680, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.11035156, + "step": 12768, + "time_per_iteration": 2.673825979232788 + }, + { + "auxiliary_loss_clip": 0.01118027, + "auxiliary_loss_mlp": 0.01039753, + "balance_loss_clip": 1.04196143, + "balance_loss_mlp": 1.02649128, + "epoch": 0.7677138133173005, + "flos": 26939682224640.0, + "grad_norm": 1.700705260812844, + "language_loss": 0.70276213, + "learning_rate": 5.3968683035881e-07, + "loss": 0.72433996, + "num_input_tokens_seen": 275351615, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.13262939, + "step": 12769, + "time_per_iteration": 2.5566511154174805 + }, + { + "auxiliary_loss_clip": 0.01115024, + "auxiliary_loss_mlp": 0.01028045, + "balance_loss_clip": 1.04227853, + "balance_loss_mlp": 1.01584387, + "epoch": 0.7677739365699684, + "flos": 23799545400960.0, + "grad_norm": 2.534413223132685, + "language_loss": 0.80740798, + "learning_rate": 5.394207467264611e-07, + "loss": 0.82883871, + "num_input_tokens_seen": 275368815, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.12207031, + "step": 12770, + "time_per_iteration": 2.4727072715759277 + }, + { + "auxiliary_loss_clip": 0.01115355, + "auxiliary_loss_mlp": 0.01027702, + "balance_loss_clip": 1.04391885, + "balance_loss_mlp": 1.01683629, + "epoch": 0.7678340598226364, + "flos": 34455497944320.0, + "grad_norm": 1.6841999722404257, + "language_loss": 0.7855947, + "learning_rate": 5.391547184794245e-07, + "loss": 0.80702531, + "num_input_tokens_seen": 275389345, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.10864258, + "step": 12771, + "time_per_iteration": 2.5769405364990234 + }, + { + "auxiliary_loss_clip": 0.0111279, + "auxiliary_loss_mlp": 0.01030246, + "balance_loss_clip": 1.04055595, + "balance_loss_mlp": 1.01753819, + "epoch": 0.7678941830753043, + "flos": 23841740903040.0, + "grad_norm": 1.4780804458290655, + "language_loss": 0.68575668, + "learning_rate": 5.388887456277876e-07, + "loss": 0.707187, + "num_input_tokens_seen": 275411240, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.1270752, + "step": 12772, + "time_per_iteration": 2.471785306930542 + }, + { + "auxiliary_loss_clip": 0.01115622, + "auxiliary_loss_mlp": 0.01025191, + "balance_loss_clip": 1.04663754, + "balance_loss_mlp": 1.0141108, + "epoch": 0.7679543063279723, + "flos": 25410929431680.0, + "grad_norm": 2.070965685590417, + "language_loss": 0.73568237, + "learning_rate": 5.386228281816349e-07, + "loss": 0.75709051, + "num_input_tokens_seen": 275432010, + "router_z_loss_clip": 0.68945312, + "router_z_loss_mlp": 0.11083984, + "step": 12773, + "time_per_iteration": 2.583479166030884 + }, + { + "auxiliary_loss_clip": 0.01118999, + "auxiliary_loss_mlp": 0.01029107, + "balance_loss_clip": 1.04456651, + "balance_loss_mlp": 1.01798475, + "epoch": 0.7680144295806404, + "flos": 27962382257280.0, + "grad_norm": 1.6561144053278907, + "language_loss": 0.80778861, + "learning_rate": 5.383569661510512e-07, + "loss": 0.82926965, + "num_input_tokens_seen": 275453710, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.11120605, + "step": 12774, + "time_per_iteration": 2.4677867889404297 + }, + { + "auxiliary_loss_clip": 0.01118927, + "auxiliary_loss_mlp": 0.01030842, + "balance_loss_clip": 1.04820907, + "balance_loss_mlp": 1.01886201, + "epoch": 0.7680745528333083, + "flos": 20412810731520.0, + "grad_norm": 1.6701291815820527, + "language_loss": 0.698183, + "learning_rate": 5.380911595461177e-07, + "loss": 0.71968067, + "num_input_tokens_seen": 275472915, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.11975098, + "step": 12775, + "time_per_iteration": 2.450056314468384 + }, + { + "auxiliary_loss_clip": 0.01051549, + "auxiliary_loss_mlp": 0.01002087, + "balance_loss_clip": 1.024863, + "balance_loss_mlp": 1.00054491, + "epoch": 0.7681346760859763, + "flos": 68401103351040.0, + "grad_norm": 0.6913176911953429, + "language_loss": 0.56813121, + "learning_rate": 5.378254083769147e-07, + "loss": 0.58866757, + "num_input_tokens_seen": 275534785, + "router_z_loss_clip": 0.26708984, + "router_z_loss_mlp": 0.01542664, + "step": 12776, + "time_per_iteration": 4.628050088882446 + }, + { + "auxiliary_loss_clip": 0.01114469, + "auxiliary_loss_mlp": 0.01031701, + "balance_loss_clip": 1.04442751, + "balance_loss_mlp": 1.02050745, + "epoch": 0.7681947993386442, + "flos": 21251468453760.0, + "grad_norm": 1.91101229256682, + "language_loss": 0.73618394, + "learning_rate": 5.375597126535188e-07, + "loss": 0.75764561, + "num_input_tokens_seen": 275553205, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.11193848, + "step": 12777, + "time_per_iteration": 2.5112321376800537 + }, + { + "auxiliary_loss_clip": 0.01121222, + "auxiliary_loss_mlp": 0.01032197, + "balance_loss_clip": 1.04938245, + "balance_loss_mlp": 1.02073479, + "epoch": 0.7682549225913122, + "flos": 21397696721280.0, + "grad_norm": 4.495834639075943, + "language_loss": 0.70208395, + "learning_rate": 5.372940723860043e-07, + "loss": 0.72361815, + "num_input_tokens_seen": 275571490, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11462402, + "step": 12778, + "time_per_iteration": 2.426192045211792 + }, + { + "auxiliary_loss_clip": 0.01121919, + "auxiliary_loss_mlp": 0.01028603, + "balance_loss_clip": 1.04728222, + "balance_loss_mlp": 1.01739764, + "epoch": 0.7683150458439801, + "flos": 23038921975680.0, + "grad_norm": 1.7039821015993775, + "language_loss": 0.70300156, + "learning_rate": 5.37028487584446e-07, + "loss": 0.7245068, + "num_input_tokens_seen": 275589665, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11206055, + "step": 12779, + "time_per_iteration": 2.4515364170074463 + }, + { + "auxiliary_loss_clip": 0.01113218, + "auxiliary_loss_mlp": 0.01028267, + "balance_loss_clip": 1.04249454, + "balance_loss_mlp": 1.01642919, + "epoch": 0.7683751690966482, + "flos": 67332397996800.0, + "grad_norm": 1.6728023166271733, + "language_loss": 0.59015381, + "learning_rate": 5.367629582589133e-07, + "loss": 0.61156869, + "num_input_tokens_seen": 275615605, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.11828613, + "step": 12780, + "time_per_iteration": 2.8381810188293457 + }, + { + "auxiliary_loss_clip": 0.01119107, + "auxiliary_loss_mlp": 0.01038556, + "balance_loss_clip": 1.04090285, + "balance_loss_mlp": 1.02431083, + "epoch": 0.7684352923493161, + "flos": 21798890703360.0, + "grad_norm": 2.6327958364126265, + "language_loss": 0.68328857, + "learning_rate": 5.364974844194759e-07, + "loss": 0.70486516, + "num_input_tokens_seen": 275634965, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.14245605, + "step": 12781, + "time_per_iteration": 2.4533400535583496 + }, + { + "auxiliary_loss_clip": 0.01117827, + "auxiliary_loss_mlp": 0.0103343, + "balance_loss_clip": 1.04143667, + "balance_loss_mlp": 1.02060306, + "epoch": 0.7684954156019841, + "flos": 25847603072640.0, + "grad_norm": 1.5055068805973741, + "language_loss": 0.79522491, + "learning_rate": 5.362320660762016e-07, + "loss": 0.81673747, + "num_input_tokens_seen": 275655785, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.12817383, + "step": 12782, + "time_per_iteration": 2.4500041007995605 + }, + { + "auxiliary_loss_clip": 0.01113956, + "auxiliary_loss_mlp": 0.01028745, + "balance_loss_clip": 1.04086399, + "balance_loss_mlp": 1.01653814, + "epoch": 0.768555538854652, + "flos": 25447378757760.0, + "grad_norm": 1.7379578254892085, + "language_loss": 0.66630578, + "learning_rate": 5.35966703239153e-07, + "loss": 0.68773288, + "num_input_tokens_seen": 275676160, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.12213135, + "step": 12783, + "time_per_iteration": 2.4821972846984863 + }, + { + "auxiliary_loss_clip": 0.01115449, + "auxiliary_loss_mlp": 0.01031359, + "balance_loss_clip": 1.04269791, + "balance_loss_mlp": 1.01894331, + "epoch": 0.76861566210732, + "flos": 19646369303040.0, + "grad_norm": 1.8952187200570052, + "language_loss": 0.69154125, + "learning_rate": 5.357013959183938e-07, + "loss": 0.71300936, + "num_input_tokens_seen": 275695660, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.12432861, + "step": 12784, + "time_per_iteration": 2.4526708126068115 + }, + { + "auxiliary_loss_clip": 0.01118561, + "auxiliary_loss_mlp": 0.01026486, + "balance_loss_clip": 1.04366541, + "balance_loss_mlp": 1.01586461, + "epoch": 0.7686757853599879, + "flos": 22419032037120.0, + "grad_norm": 1.8352236944461313, + "language_loss": 0.80558079, + "learning_rate": 5.354361441239843e-07, + "loss": 0.82703131, + "num_input_tokens_seen": 275714025, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.10620117, + "step": 12785, + "time_per_iteration": 2.4699132442474365 + }, + { + "auxiliary_loss_clip": 0.01113044, + "auxiliary_loss_mlp": 0.0103012, + "balance_loss_clip": 1.04137063, + "balance_loss_mlp": 1.01774561, + "epoch": 0.768735908612656, + "flos": 47774262453120.0, + "grad_norm": 1.7107561332719252, + "language_loss": 0.77386373, + "learning_rate": 5.351709478659836e-07, + "loss": 0.79529536, + "num_input_tokens_seen": 275737300, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.1237793, + "step": 12786, + "time_per_iteration": 2.676520824432373 + }, + { + "auxiliary_loss_clip": 0.01112227, + "auxiliary_loss_mlp": 0.01028914, + "balance_loss_clip": 1.04119778, + "balance_loss_mlp": 1.01792264, + "epoch": 0.7687960318653239, + "flos": 30263179000320.0, + "grad_norm": 1.9179374832851857, + "language_loss": 0.58961594, + "learning_rate": 5.349058071544468e-07, + "loss": 0.61102736, + "num_input_tokens_seen": 275757895, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.10986328, + "step": 12787, + "time_per_iteration": 2.534982204437256 + }, + { + "auxiliary_loss_clip": 0.01108665, + "auxiliary_loss_mlp": 0.01028696, + "balance_loss_clip": 1.0385226, + "balance_loss_mlp": 1.01740086, + "epoch": 0.7688561551179919, + "flos": 19573434737280.0, + "grad_norm": 1.642495690376676, + "language_loss": 0.76066804, + "learning_rate": 5.346407219994292e-07, + "loss": 0.78204173, + "num_input_tokens_seen": 275776745, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.11303711, + "step": 12788, + "time_per_iteration": 2.420703649520874 + }, + { + "auxiliary_loss_clip": 0.01118063, + "auxiliary_loss_mlp": 0.01037978, + "balance_loss_clip": 1.0451324, + "balance_loss_mlp": 1.02504981, + "epoch": 0.7689162783706599, + "flos": 22783776693120.0, + "grad_norm": 1.6618507117153638, + "language_loss": 0.66614759, + "learning_rate": 5.343756924109821e-07, + "loss": 0.68770802, + "num_input_tokens_seen": 275797205, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.12939453, + "step": 12789, + "time_per_iteration": 2.5773963928222656 + }, + { + "auxiliary_loss_clip": 0.01117557, + "auxiliary_loss_mlp": 0.01034987, + "balance_loss_clip": 1.04336917, + "balance_loss_mlp": 1.02222002, + "epoch": 0.7689764016233278, + "flos": 34204195416960.0, + "grad_norm": 1.8176266584767433, + "language_loss": 0.68640989, + "learning_rate": 5.341107183991553e-07, + "loss": 0.70793533, + "num_input_tokens_seen": 275817935, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12768555, + "step": 12790, + "time_per_iteration": 2.5429041385650635 + }, + { + "auxiliary_loss_clip": 0.01114421, + "auxiliary_loss_mlp": 0.01033149, + "balance_loss_clip": 1.04250348, + "balance_loss_mlp": 1.02023232, + "epoch": 0.7690365248759958, + "flos": 17274469587840.0, + "grad_norm": 1.5465686351213381, + "language_loss": 0.6849848, + "learning_rate": 5.338457999739969e-07, + "loss": 0.70646048, + "num_input_tokens_seen": 275837145, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.12921143, + "step": 12791, + "time_per_iteration": 2.4319820404052734 + }, + { + "auxiliary_loss_clip": 0.01106695, + "auxiliary_loss_mlp": 0.01032524, + "balance_loss_clip": 1.03681171, + "balance_loss_mlp": 1.02019131, + "epoch": 0.7690966481286637, + "flos": 18223157646720.0, + "grad_norm": 1.8662890596646864, + "language_loss": 0.79760647, + "learning_rate": 5.335809371455526e-07, + "loss": 0.81899869, + "num_input_tokens_seen": 275855705, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.12329102, + "step": 12792, + "time_per_iteration": 2.4188735485076904 + }, + { + "auxiliary_loss_clip": 0.01121242, + "auxiliary_loss_mlp": 0.01028904, + "balance_loss_clip": 1.04468775, + "balance_loss_mlp": 1.01642895, + "epoch": 0.7691567713813318, + "flos": 21537568281600.0, + "grad_norm": 1.7940196155738626, + "language_loss": 0.72832549, + "learning_rate": 5.333161299238673e-07, + "loss": 0.74982697, + "num_input_tokens_seen": 275873930, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.12463379, + "step": 12793, + "time_per_iteration": 2.567082405090332 + }, + { + "auxiliary_loss_clip": 0.01118669, + "auxiliary_loss_mlp": 0.01033009, + "balance_loss_clip": 1.0443399, + "balance_loss_mlp": 1.02116585, + "epoch": 0.7692168946339997, + "flos": 39379999720320.0, + "grad_norm": 1.741666359162754, + "language_loss": 0.63535273, + "learning_rate": 5.330513783189803e-07, + "loss": 0.65686947, + "num_input_tokens_seen": 275895895, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.1184082, + "step": 12794, + "time_per_iteration": 2.590622901916504 + }, + { + "auxiliary_loss_clip": 0.0111574, + "auxiliary_loss_mlp": 0.01031641, + "balance_loss_clip": 1.0414449, + "balance_loss_mlp": 1.01992309, + "epoch": 0.7692770178866677, + "flos": 25009950931200.0, + "grad_norm": 1.4100241554879713, + "language_loss": 0.76697087, + "learning_rate": 5.327866823409319e-07, + "loss": 0.78844464, + "num_input_tokens_seen": 275917825, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.1171875, + "step": 12795, + "time_per_iteration": 2.470386505126953 + }, + { + "auxiliary_loss_clip": 0.01120906, + "auxiliary_loss_mlp": 0.01027632, + "balance_loss_clip": 1.04636943, + "balance_loss_mlp": 1.01552653, + "epoch": 0.7693371411393356, + "flos": 24716273333760.0, + "grad_norm": 1.9111327465172052, + "language_loss": 0.71762812, + "learning_rate": 5.325220419997601e-07, + "loss": 0.73911351, + "num_input_tokens_seen": 275937890, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.12109375, + "step": 12796, + "time_per_iteration": 2.4782729148864746 + }, + { + "auxiliary_loss_clip": 0.01120356, + "auxiliary_loss_mlp": 0.01028302, + "balance_loss_clip": 1.04641366, + "balance_loss_mlp": 1.016518, + "epoch": 0.7693972643920036, + "flos": 15924803028480.0, + "grad_norm": 1.9196498212633062, + "language_loss": 0.65268481, + "learning_rate": 5.32257457305499e-07, + "loss": 0.67417139, + "num_input_tokens_seen": 275954495, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11785889, + "step": 12797, + "time_per_iteration": 2.4105489253997803 + }, + { + "auxiliary_loss_clip": 0.01117104, + "auxiliary_loss_mlp": 0.01032102, + "balance_loss_clip": 1.04334795, + "balance_loss_mlp": 1.01973963, + "epoch": 0.7694573876446715, + "flos": 25405901527680.0, + "grad_norm": 2.2907472528504624, + "language_loss": 0.91537392, + "learning_rate": 5.319929282681823e-07, + "loss": 0.93686593, + "num_input_tokens_seen": 275972395, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.12359619, + "step": 12798, + "time_per_iteration": 3.9844346046447754 + }, + { + "auxiliary_loss_clip": 0.01110845, + "auxiliary_loss_mlp": 0.0103098, + "balance_loss_clip": 1.03769708, + "balance_loss_mlp": 1.01811767, + "epoch": 0.7695175108973396, + "flos": 16654220513280.0, + "grad_norm": 1.8524797083691493, + "language_loss": 0.82354373, + "learning_rate": 5.317284548978418e-07, + "loss": 0.844962, + "num_input_tokens_seen": 275989020, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.12872314, + "step": 12799, + "time_per_iteration": 3.8726532459259033 + }, + { + "auxiliary_loss_clip": 0.01117851, + "auxiliary_loss_mlp": 0.01028937, + "balance_loss_clip": 1.04536116, + "balance_loss_mlp": 1.01693261, + "epoch": 0.7695776341500075, + "flos": 13626520237440.0, + "grad_norm": 2.331438166325265, + "language_loss": 0.78385103, + "learning_rate": 5.314640372045045e-07, + "loss": 0.80531889, + "num_input_tokens_seen": 276006525, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11999512, + "step": 12800, + "time_per_iteration": 2.4235761165618896 + }, + { + "auxiliary_loss_clip": 0.01121938, + "auxiliary_loss_mlp": 0.0103265, + "balance_loss_clip": 1.04222846, + "balance_loss_mlp": 1.0190419, + "epoch": 0.7696377574026755, + "flos": 24276690691200.0, + "grad_norm": 5.799681816233148, + "language_loss": 0.83761805, + "learning_rate": 5.31199675198198e-07, + "loss": 0.85916394, + "num_input_tokens_seen": 276027130, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.13616943, + "step": 12801, + "time_per_iteration": 2.449885129928589 + }, + { + "auxiliary_loss_clip": 0.0111812, + "auxiliary_loss_mlp": 0.01026251, + "balance_loss_clip": 1.04520178, + "balance_loss_mlp": 1.01441324, + "epoch": 0.7696978806553435, + "flos": 20923137210240.0, + "grad_norm": 1.9111060040885324, + "language_loss": 0.72005141, + "learning_rate": 5.30935368888947e-07, + "loss": 0.74149513, + "num_input_tokens_seen": 276045715, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.1184082, + "step": 12802, + "time_per_iteration": 2.4103164672851562 + }, + { + "auxiliary_loss_clip": 0.01111393, + "auxiliary_loss_mlp": 0.01030608, + "balance_loss_clip": 1.03971505, + "balance_loss_mlp": 1.01906228, + "epoch": 0.7697580039080114, + "flos": 22929609911040.0, + "grad_norm": 2.193644807211392, + "language_loss": 0.76295859, + "learning_rate": 5.306711182867747e-07, + "loss": 0.78437859, + "num_input_tokens_seen": 276065375, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11547852, + "step": 12803, + "time_per_iteration": 2.455573558807373 + }, + { + "auxiliary_loss_clip": 0.01087557, + "auxiliary_loss_mlp": 0.0100331, + "balance_loss_clip": 1.06300235, + "balance_loss_mlp": 1.00190508, + "epoch": 0.7698181271606794, + "flos": 68717654933760.0, + "grad_norm": 0.7393948347177504, + "language_loss": 0.55783904, + "learning_rate": 5.304069234017001e-07, + "loss": 0.57874769, + "num_input_tokens_seen": 276131405, + "router_z_loss_clip": 0.24560547, + "router_z_loss_mlp": 0.01405334, + "step": 12804, + "time_per_iteration": 3.1084578037261963 + }, + { + "auxiliary_loss_clip": 0.0104177, + "auxiliary_loss_mlp": 0.01001253, + "balance_loss_clip": 1.01559114, + "balance_loss_mlp": 0.99970734, + "epoch": 0.7698782504133473, + "flos": 67409716999680.0, + "grad_norm": 0.7445302354230877, + "language_loss": 0.53985912, + "learning_rate": 5.301427842437429e-07, + "loss": 0.56028938, + "num_input_tokens_seen": 276200755, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.01545715, + "step": 12805, + "time_per_iteration": 4.632069826126099 + }, + { + "auxiliary_loss_clip": 0.01119265, + "auxiliary_loss_mlp": 0.01032144, + "balance_loss_clip": 1.04541135, + "balance_loss_mlp": 1.02004385, + "epoch": 0.7699383736660154, + "flos": 22488842119680.0, + "grad_norm": 2.021288889615618, + "language_loss": 0.72998095, + "learning_rate": 5.298787008229187e-07, + "loss": 0.75149512, + "num_input_tokens_seen": 276217880, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.12097168, + "step": 12806, + "time_per_iteration": 2.4460155963897705 + }, + { + "auxiliary_loss_clip": 0.0111629, + "auxiliary_loss_mlp": 0.01032419, + "balance_loss_clip": 1.04343593, + "balance_loss_mlp": 1.02058792, + "epoch": 0.7699984969186833, + "flos": 21539723097600.0, + "grad_norm": 1.844497766232243, + "language_loss": 0.7499854, + "learning_rate": 5.296146731492408e-07, + "loss": 0.77147251, + "num_input_tokens_seen": 276234810, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11834717, + "step": 12807, + "time_per_iteration": 2.4373152256011963 + }, + { + "auxiliary_loss_clip": 0.01113865, + "auxiliary_loss_mlp": 0.01031453, + "balance_loss_clip": 1.03867936, + "balance_loss_mlp": 1.01837003, + "epoch": 0.7700586201713513, + "flos": 21719096640000.0, + "grad_norm": 3.7886972104804357, + "language_loss": 0.80239755, + "learning_rate": 5.293507012327218e-07, + "loss": 0.82385075, + "num_input_tokens_seen": 276252850, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.13085938, + "step": 12808, + "time_per_iteration": 2.467902898788452 + }, + { + "auxiliary_loss_clip": 0.01119747, + "auxiliary_loss_mlp": 0.01036536, + "balance_loss_clip": 1.04436576, + "balance_loss_mlp": 1.0237211, + "epoch": 0.7701187434240192, + "flos": 27856015107840.0, + "grad_norm": 2.1073104699739353, + "language_loss": 0.79109955, + "learning_rate": 5.290867850833718e-07, + "loss": 0.81266242, + "num_input_tokens_seen": 276272525, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.12811279, + "step": 12809, + "time_per_iteration": 2.518627166748047 + }, + { + "auxiliary_loss_clip": 0.01104993, + "auxiliary_loss_mlp": 0.01031875, + "balance_loss_clip": 1.0364387, + "balance_loss_mlp": 1.01895285, + "epoch": 0.7701788666766872, + "flos": 28621307301120.0, + "grad_norm": 2.0006407345709376, + "language_loss": 0.70497775, + "learning_rate": 5.288229247111993e-07, + "loss": 0.72634637, + "num_input_tokens_seen": 276294210, + "router_z_loss_clip": 0.68603516, + "router_z_loss_mlp": 0.12945557, + "step": 12810, + "time_per_iteration": 2.540903329849243 + }, + { + "auxiliary_loss_clip": 0.01122575, + "auxiliary_loss_mlp": 0.0103157, + "balance_loss_clip": 1.04415679, + "balance_loss_mlp": 1.01784301, + "epoch": 0.7702389899293551, + "flos": 14246446089600.0, + "grad_norm": 3.9080399760625943, + "language_loss": 0.78877091, + "learning_rate": 5.285591201262079e-07, + "loss": 0.81031239, + "num_input_tokens_seen": 276310290, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.1373291, + "step": 12811, + "time_per_iteration": 2.4641051292419434 + }, + { + "auxiliary_loss_clip": 0.01041211, + "auxiliary_loss_mlp": 0.01003794, + "balance_loss_clip": 1.01566291, + "balance_loss_mlp": 1.00227833, + "epoch": 0.7702991131820232, + "flos": 70574128439040.0, + "grad_norm": 0.8089255489325411, + "language_loss": 0.56664944, + "learning_rate": 5.28295371338402e-07, + "loss": 0.58709955, + "num_input_tokens_seen": 276371715, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.01516724, + "step": 12812, + "time_per_iteration": 3.143472671508789 + }, + { + "auxiliary_loss_clip": 0.01116517, + "auxiliary_loss_mlp": 0.01030278, + "balance_loss_clip": 1.04317749, + "balance_loss_mlp": 1.01847672, + "epoch": 0.7703592364346911, + "flos": 25480021242240.0, + "grad_norm": 1.6379547478275032, + "language_loss": 0.72109628, + "learning_rate": 5.280316783577836e-07, + "loss": 0.74256426, + "num_input_tokens_seen": 276389895, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11804199, + "step": 12813, + "time_per_iteration": 2.5180327892303467 + }, + { + "auxiliary_loss_clip": 0.01117322, + "auxiliary_loss_mlp": 0.01028925, + "balance_loss_clip": 1.04393578, + "balance_loss_mlp": 1.01658058, + "epoch": 0.7704193596873591, + "flos": 19280906375040.0, + "grad_norm": 1.6636422525744279, + "language_loss": 0.66266477, + "learning_rate": 5.27768041194351e-07, + "loss": 0.68412721, + "num_input_tokens_seen": 276408990, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.12341309, + "step": 12814, + "time_per_iteration": 2.444469451904297 + }, + { + "auxiliary_loss_clip": 0.01130013, + "auxiliary_loss_mlp": 0.01030985, + "balance_loss_clip": 1.05301666, + "balance_loss_mlp": 1.01924932, + "epoch": 0.7704794829400271, + "flos": 23658452778240.0, + "grad_norm": 1.874951481439897, + "language_loss": 0.65297735, + "learning_rate": 5.275044598581018e-07, + "loss": 0.67458737, + "num_input_tokens_seen": 276428190, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.11743164, + "step": 12815, + "time_per_iteration": 2.47733998298645 + }, + { + "auxiliary_loss_clip": 0.01114662, + "auxiliary_loss_mlp": 0.01028192, + "balance_loss_clip": 1.04166532, + "balance_loss_mlp": 1.01618791, + "epoch": 0.770539606192695, + "flos": 18989311766400.0, + "grad_norm": 5.632010217270234, + "language_loss": 0.64666027, + "learning_rate": 5.272409343590322e-07, + "loss": 0.66808879, + "num_input_tokens_seen": 276446855, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.12011719, + "step": 12816, + "time_per_iteration": 2.420891523361206 + }, + { + "auxiliary_loss_clip": 0.01118702, + "auxiliary_loss_mlp": 0.01029034, + "balance_loss_clip": 1.04383206, + "balance_loss_mlp": 1.01742876, + "epoch": 0.770599729445363, + "flos": 11830160142720.0, + "grad_norm": 2.688887896983161, + "language_loss": 0.7217896, + "learning_rate": 5.26977464707133e-07, + "loss": 0.743267, + "num_input_tokens_seen": 276462000, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.1161499, + "step": 12817, + "time_per_iteration": 2.454738140106201 + }, + { + "auxiliary_loss_clip": 0.01122818, + "auxiliary_loss_mlp": 0.0102752, + "balance_loss_clip": 1.04996049, + "balance_loss_mlp": 1.01624262, + "epoch": 0.770659852698031, + "flos": 17822610109440.0, + "grad_norm": 1.8115453981405207, + "language_loss": 0.61451089, + "learning_rate": 5.267140509123957e-07, + "loss": 0.63601422, + "num_input_tokens_seen": 276481190, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11279297, + "step": 12818, + "time_per_iteration": 2.515007972717285 + }, + { + "auxiliary_loss_clip": 0.01113754, + "auxiliary_loss_mlp": 0.01026217, + "balance_loss_clip": 1.0443058, + "balance_loss_mlp": 1.01572633, + "epoch": 0.770719975950699, + "flos": 21871968923520.0, + "grad_norm": 1.9546976622145404, + "language_loss": 0.67315358, + "learning_rate": 5.264506929848093e-07, + "loss": 0.69455332, + "num_input_tokens_seen": 276499520, + "router_z_loss_clip": 0.69433594, + "router_z_loss_mlp": 0.1048584, + "step": 12819, + "time_per_iteration": 3.8693926334381104 + }, + { + "auxiliary_loss_clip": 0.01114652, + "auxiliary_loss_mlp": 0.01029756, + "balance_loss_clip": 1.04163194, + "balance_loss_mlp": 1.01806188, + "epoch": 0.7707800992033669, + "flos": 21325049464320.0, + "grad_norm": 2.0627142882930354, + "language_loss": 0.57851225, + "learning_rate": 5.261873909343608e-07, + "loss": 0.59995639, + "num_input_tokens_seen": 276519110, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11706543, + "step": 12820, + "time_per_iteration": 2.450665235519409 + }, + { + "auxiliary_loss_clip": 0.01115501, + "auxiliary_loss_mlp": 0.01031867, + "balance_loss_clip": 1.04165292, + "balance_loss_mlp": 1.01876593, + "epoch": 0.7708402224560349, + "flos": 28179426188160.0, + "grad_norm": 2.138196689815019, + "language_loss": 0.81165189, + "learning_rate": 5.259241447710343e-07, + "loss": 0.83312559, + "num_input_tokens_seen": 276538805, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.13104248, + "step": 12821, + "time_per_iteration": 2.494588613510132 + }, + { + "auxiliary_loss_clip": 0.01111795, + "auxiliary_loss_mlp": 0.0103366, + "balance_loss_clip": 1.03900981, + "balance_loss_mlp": 1.02144098, + "epoch": 0.7709003457087028, + "flos": 15377057556480.0, + "grad_norm": 2.3834157137204524, + "language_loss": 0.6911099, + "learning_rate": 5.256609545048114e-07, + "loss": 0.71256441, + "num_input_tokens_seen": 276554770, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.12213135, + "step": 12822, + "time_per_iteration": 2.451582670211792 + }, + { + "auxiliary_loss_clip": 0.01108969, + "auxiliary_loss_mlp": 0.01035279, + "balance_loss_clip": 1.037148, + "balance_loss_mlp": 1.02295268, + "epoch": 0.7709604689613708, + "flos": 30621854257920.0, + "grad_norm": 1.9937360647352094, + "language_loss": 0.72664392, + "learning_rate": 5.253978201456733e-07, + "loss": 0.74808645, + "num_input_tokens_seen": 276574535, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.12335205, + "step": 12823, + "time_per_iteration": 2.5827908515930176 + }, + { + "auxiliary_loss_clip": 0.01123564, + "auxiliary_loss_mlp": 0.01037686, + "balance_loss_clip": 1.04482341, + "balance_loss_mlp": 1.02395284, + "epoch": 0.7710205922140387, + "flos": 20301272023680.0, + "grad_norm": 1.651589439691931, + "language_loss": 0.76411414, + "learning_rate": 5.251347417035969e-07, + "loss": 0.78572667, + "num_input_tokens_seen": 276592925, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.13720703, + "step": 12824, + "time_per_iteration": 2.4909818172454834 + }, + { + "auxiliary_loss_clip": 0.01115042, + "auxiliary_loss_mlp": 0.01027516, + "balance_loss_clip": 1.04113698, + "balance_loss_mlp": 1.01517212, + "epoch": 0.7710807154667068, + "flos": 19644214487040.0, + "grad_norm": 1.8640868699835318, + "language_loss": 0.726964, + "learning_rate": 5.248717191885592e-07, + "loss": 0.74838948, + "num_input_tokens_seen": 276610540, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.12353516, + "step": 12825, + "time_per_iteration": 2.4126462936401367 + }, + { + "auxiliary_loss_clip": 0.01110072, + "auxiliary_loss_mlp": 0.01030825, + "balance_loss_clip": 1.04224479, + "balance_loss_mlp": 1.02078712, + "epoch": 0.7711408387193747, + "flos": 20006337450240.0, + "grad_norm": 1.5323177480631687, + "language_loss": 0.73741794, + "learning_rate": 5.246087526105343e-07, + "loss": 0.75882691, + "num_input_tokens_seen": 276629200, + "router_z_loss_clip": 0.67871094, + "router_z_loss_mlp": 0.10040283, + "step": 12826, + "time_per_iteration": 2.4347243309020996 + }, + { + "auxiliary_loss_clip": 0.01110907, + "auxiliary_loss_mlp": 0.01029351, + "balance_loss_clip": 1.03629959, + "balance_loss_mlp": 1.01705456, + "epoch": 0.7712009619720427, + "flos": 24971131307520.0, + "grad_norm": 3.9786490890511272, + "language_loss": 0.81016457, + "learning_rate": 5.243458419794933e-07, + "loss": 0.83156717, + "num_input_tokens_seen": 276648655, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.12286377, + "step": 12827, + "time_per_iteration": 2.4684102535247803 + }, + { + "auxiliary_loss_clip": 0.01043131, + "auxiliary_loss_mlp": 0.01004711, + "balance_loss_clip": 1.01814473, + "balance_loss_mlp": 1.00327015, + "epoch": 0.7712610852247107, + "flos": 63249681404160.0, + "grad_norm": 0.857267135811234, + "language_loss": 0.5517894, + "learning_rate": 5.240829873054051e-07, + "loss": 0.57226777, + "num_input_tokens_seen": 276716500, + "router_z_loss_clip": 0.24975586, + "router_z_loss_mlp": 0.01441956, + "step": 12828, + "time_per_iteration": 3.2164318561553955 + }, + { + "auxiliary_loss_clip": 0.0110443, + "auxiliary_loss_mlp": 0.01027566, + "balance_loss_clip": 1.03493571, + "balance_loss_mlp": 1.01637185, + "epoch": 0.7713212084773786, + "flos": 18697860812160.0, + "grad_norm": 1.8408464480227251, + "language_loss": 0.69684458, + "learning_rate": 5.23820188598238e-07, + "loss": 0.71816456, + "num_input_tokens_seen": 276733535, + "router_z_loss_clip": 0.69482422, + "router_z_loss_mlp": 0.11193848, + "step": 12829, + "time_per_iteration": 2.453187942504883 + }, + { + "auxiliary_loss_clip": 0.01113497, + "auxiliary_loss_mlp": 0.01030565, + "balance_loss_clip": 1.0393362, + "balance_loss_mlp": 1.01821494, + "epoch": 0.7713813317300466, + "flos": 14173367869440.0, + "grad_norm": 2.966382444509992, + "language_loss": 0.7937243, + "learning_rate": 5.235574458679579e-07, + "loss": 0.81516492, + "num_input_tokens_seen": 276749575, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.12347412, + "step": 12830, + "time_per_iteration": 2.4227066040039062 + }, + { + "auxiliary_loss_clip": 0.01114635, + "auxiliary_loss_mlp": 0.01032444, + "balance_loss_clip": 1.03988755, + "balance_loss_mlp": 1.01947403, + "epoch": 0.7714414549827145, + "flos": 25703960584320.0, + "grad_norm": 1.6327885475250326, + "language_loss": 0.77899933, + "learning_rate": 5.232947591245269e-07, + "loss": 0.80047011, + "num_input_tokens_seen": 276769460, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.12976074, + "step": 12831, + "time_per_iteration": 2.4612176418304443 + }, + { + "auxiliary_loss_clip": 0.01114323, + "auxiliary_loss_mlp": 0.01029106, + "balance_loss_clip": 1.04309916, + "balance_loss_mlp": 1.01738787, + "epoch": 0.7715015782353826, + "flos": 30555312312960.0, + "grad_norm": 1.5551182131069876, + "language_loss": 0.61067408, + "learning_rate": 5.230321283779071e-07, + "loss": 0.63210833, + "num_input_tokens_seen": 276790820, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.11724854, + "step": 12832, + "time_per_iteration": 2.521306276321411 + }, + { + "auxiliary_loss_clip": 0.01117302, + "auxiliary_loss_mlp": 0.01035031, + "balance_loss_clip": 1.04066825, + "balance_loss_mlp": 1.02286518, + "epoch": 0.7715617014880505, + "flos": 20229343038720.0, + "grad_norm": 1.690976579248931, + "language_loss": 0.79498988, + "learning_rate": 5.227695536380572e-07, + "loss": 0.81651318, + "num_input_tokens_seen": 276811345, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.12164307, + "step": 12833, + "time_per_iteration": 2.4892001152038574 + }, + { + "auxiliary_loss_clip": 0.01045956, + "auxiliary_loss_mlp": 0.01003648, + "balance_loss_clip": 1.01901388, + "balance_loss_mlp": 1.00226915, + "epoch": 0.7716218247407185, + "flos": 63664770971520.0, + "grad_norm": 0.8427759728619488, + "language_loss": 0.55340117, + "learning_rate": 5.22507034914933e-07, + "loss": 0.57389724, + "num_input_tokens_seen": 276870950, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.01377869, + "step": 12834, + "time_per_iteration": 3.0705578327178955 + }, + { + "auxiliary_loss_clip": 0.01120092, + "auxiliary_loss_mlp": 0.0103048, + "balance_loss_clip": 1.04594088, + "balance_loss_mlp": 1.0180757, + "epoch": 0.7716819479933864, + "flos": 19791807471360.0, + "grad_norm": 2.7073066163406807, + "language_loss": 0.73089296, + "learning_rate": 5.222445722184903e-07, + "loss": 0.75239861, + "num_input_tokens_seen": 276890760, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12408447, + "step": 12835, + "time_per_iteration": 2.514439582824707 + }, + { + "auxiliary_loss_clip": 0.0111666, + "auxiliary_loss_mlp": 0.01036758, + "balance_loss_clip": 1.04167283, + "balance_loss_mlp": 1.02425909, + "epoch": 0.7717420712460544, + "flos": 18442176825600.0, + "grad_norm": 2.0518362250193816, + "language_loss": 0.7010361, + "learning_rate": 5.219821655586814e-07, + "loss": 0.7225703, + "num_input_tokens_seen": 276909625, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.125, + "step": 12836, + "time_per_iteration": 2.4384753704071045 + }, + { + "auxiliary_loss_clip": 0.01107515, + "auxiliary_loss_mlp": 0.01031897, + "balance_loss_clip": 1.03754485, + "balance_loss_mlp": 1.01929688, + "epoch": 0.7718021944987223, + "flos": 35189476456320.0, + "grad_norm": 3.3431770288522795, + "language_loss": 0.59516203, + "learning_rate": 5.217198149454575e-07, + "loss": 0.61655617, + "num_input_tokens_seen": 276930760, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.12609863, + "step": 12837, + "time_per_iteration": 2.629747152328491 + }, + { + "auxiliary_loss_clip": 0.01039789, + "auxiliary_loss_mlp": 0.0100469, + "balance_loss_clip": 1.01473022, + "balance_loss_mlp": 1.00314927, + "epoch": 0.7718623177513904, + "flos": 67923167961600.0, + "grad_norm": 0.861046330022209, + "language_loss": 0.55762792, + "learning_rate": 5.214575203887666e-07, + "loss": 0.57807273, + "num_input_tokens_seen": 276989580, + "router_z_loss_clip": 0.25097656, + "router_z_loss_mlp": 0.01541138, + "step": 12838, + "time_per_iteration": 3.0434818267822266 + }, + { + "auxiliary_loss_clip": 0.01114975, + "auxiliary_loss_mlp": 0.0103276, + "balance_loss_clip": 1.04340982, + "balance_loss_mlp": 1.02169752, + "epoch": 0.7719224410040583, + "flos": 18581401941120.0, + "grad_norm": 2.7612697334177776, + "language_loss": 0.69951022, + "learning_rate": 5.211952818985538e-07, + "loss": 0.7209875, + "num_input_tokens_seen": 277005450, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.1105957, + "step": 12839, + "time_per_iteration": 2.4099981784820557 + }, + { + "auxiliary_loss_clip": 0.01106821, + "auxiliary_loss_mlp": 0.01029707, + "balance_loss_clip": 1.0369525, + "balance_loss_mlp": 1.01715994, + "epoch": 0.7719825642567263, + "flos": 23075802264960.0, + "grad_norm": 1.8054846498559098, + "language_loss": 0.80116987, + "learning_rate": 5.209330994847647e-07, + "loss": 0.82253516, + "num_input_tokens_seen": 277023055, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.12548828, + "step": 12840, + "time_per_iteration": 2.437344551086426 + }, + { + "auxiliary_loss_clip": 0.01117109, + "auxiliary_loss_mlp": 0.01030099, + "balance_loss_clip": 1.04468179, + "balance_loss_mlp": 1.01740897, + "epoch": 0.7720426875093943, + "flos": 20339086066560.0, + "grad_norm": 2.063638114293588, + "language_loss": 0.80060852, + "learning_rate": 5.206709731573402e-07, + "loss": 0.82208061, + "num_input_tokens_seen": 277041150, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.12713623, + "step": 12841, + "time_per_iteration": 3.8822169303894043 + }, + { + "auxiliary_loss_clip": 0.01112206, + "auxiliary_loss_mlp": 0.01027818, + "balance_loss_clip": 1.04008865, + "balance_loss_mlp": 1.01518214, + "epoch": 0.7721028107620622, + "flos": 23880704181120.0, + "grad_norm": 2.032731839952734, + "language_loss": 0.76757157, + "learning_rate": 5.204089029262208e-07, + "loss": 0.78897184, + "num_input_tokens_seen": 277063895, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.12628174, + "step": 12842, + "time_per_iteration": 3.883577585220337 + }, + { + "auxiliary_loss_clip": 0.01116958, + "auxiliary_loss_mlp": 0.01040713, + "balance_loss_clip": 1.04144967, + "balance_loss_mlp": 1.0267117, + "epoch": 0.7721629340147302, + "flos": 26651571235200.0, + "grad_norm": 1.8568681197477082, + "language_loss": 0.68680769, + "learning_rate": 5.201468888013445e-07, + "loss": 0.70838439, + "num_input_tokens_seen": 277084045, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.14013672, + "step": 12843, + "time_per_iteration": 2.4484312534332275 + }, + { + "auxiliary_loss_clip": 0.01116761, + "auxiliary_loss_mlp": 0.01028761, + "balance_loss_clip": 1.04134381, + "balance_loss_mlp": 1.01772261, + "epoch": 0.7722230572673981, + "flos": 21178857110400.0, + "grad_norm": 7.330580885609492, + "language_loss": 0.73794842, + "learning_rate": 5.198849307926465e-07, + "loss": 0.75940359, + "num_input_tokens_seen": 277102625, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.11053467, + "step": 12844, + "time_per_iteration": 2.435702085494995 + }, + { + "auxiliary_loss_clip": 0.01114883, + "auxiliary_loss_mlp": 0.0102838, + "balance_loss_clip": 1.04302883, + "balance_loss_mlp": 1.01644731, + "epoch": 0.7722831805200662, + "flos": 27964644814080.0, + "grad_norm": 1.605103450238086, + "language_loss": 0.71655327, + "learning_rate": 5.196230289100596e-07, + "loss": 0.73798585, + "num_input_tokens_seen": 277123210, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11938477, + "step": 12845, + "time_per_iteration": 2.4895310401916504 + }, + { + "auxiliary_loss_clip": 0.01115192, + "auxiliary_loss_mlp": 0.01034482, + "balance_loss_clip": 1.04387498, + "balance_loss_mlp": 1.0229845, + "epoch": 0.7723433037727341, + "flos": 33875576864640.0, + "grad_norm": 2.141976276681346, + "language_loss": 0.64249283, + "learning_rate": 5.193611831635159e-07, + "loss": 0.66398954, + "num_input_tokens_seen": 277144895, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.1149292, + "step": 12846, + "time_per_iteration": 2.5605666637420654 + }, + { + "auxiliary_loss_clip": 0.01040409, + "auxiliary_loss_mlp": 0.01005237, + "balance_loss_clip": 1.01480603, + "balance_loss_mlp": 1.00373483, + "epoch": 0.7724034270254021, + "flos": 62848271940480.0, + "grad_norm": 0.7780966653576041, + "language_loss": 0.61696124, + "learning_rate": 5.19099393562945e-07, + "loss": 0.63741767, + "num_input_tokens_seen": 277205160, + "router_z_loss_clip": 0.25634766, + "router_z_loss_mlp": 0.01501465, + "step": 12847, + "time_per_iteration": 3.0008769035339355 + }, + { + "auxiliary_loss_clip": 0.01114589, + "auxiliary_loss_mlp": 0.01025382, + "balance_loss_clip": 1.0419004, + "balance_loss_mlp": 1.01352108, + "epoch": 0.77246355027807, + "flos": 23295467888640.0, + "grad_norm": 1.7546227098303335, + "language_loss": 0.79552138, + "learning_rate": 5.188376601182732e-07, + "loss": 0.81692117, + "num_input_tokens_seen": 277223005, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11853027, + "step": 12848, + "time_per_iteration": 2.4773619174957275 + }, + { + "auxiliary_loss_clip": 0.0111849, + "auxiliary_loss_mlp": 0.01032213, + "balance_loss_clip": 1.04324532, + "balance_loss_mlp": 1.02044153, + "epoch": 0.772523673530738, + "flos": 20121287950080.0, + "grad_norm": 1.9528512372748248, + "language_loss": 0.72464526, + "learning_rate": 5.185759828394261e-07, + "loss": 0.74615234, + "num_input_tokens_seen": 277241785, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.11767578, + "step": 12849, + "time_per_iteration": 3.8499388694763184 + }, + { + "auxiliary_loss_clip": 0.01117602, + "auxiliary_loss_mlp": 0.01029327, + "balance_loss_clip": 1.04417002, + "balance_loss_mlp": 1.0175488, + "epoch": 0.7725837967834059, + "flos": 17820096157440.0, + "grad_norm": 1.7749180518065977, + "language_loss": 0.78282702, + "learning_rate": 5.183143617363261e-07, + "loss": 0.80429631, + "num_input_tokens_seen": 277259050, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11779785, + "step": 12850, + "time_per_iteration": 2.4957404136657715 + }, + { + "auxiliary_loss_clip": 0.01118907, + "auxiliary_loss_mlp": 0.01034451, + "balance_loss_clip": 1.04355907, + "balance_loss_mlp": 1.02230334, + "epoch": 0.772643920036074, + "flos": 27198921657600.0, + "grad_norm": 1.6566318213603735, + "language_loss": 0.80145824, + "learning_rate": 5.180527968188935e-07, + "loss": 0.82299185, + "num_input_tokens_seen": 277278235, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.12158203, + "step": 12851, + "time_per_iteration": 2.5240085124969482 + }, + { + "auxiliary_loss_clip": 0.01115433, + "auxiliary_loss_mlp": 0.01028212, + "balance_loss_clip": 1.04216909, + "balance_loss_mlp": 1.01506281, + "epoch": 0.7727040432887419, + "flos": 21579512388480.0, + "grad_norm": 2.179571023691577, + "language_loss": 0.73677546, + "learning_rate": 5.177912880970474e-07, + "loss": 0.75821191, + "num_input_tokens_seen": 277298355, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.13140869, + "step": 12852, + "time_per_iteration": 2.467414140701294 + }, + { + "auxiliary_loss_clip": 0.01111955, + "auxiliary_loss_mlp": 0.01046383, + "balance_loss_clip": 1.03883862, + "balance_loss_mlp": 1.03256631, + "epoch": 0.7727641665414099, + "flos": 22236641752320.0, + "grad_norm": 1.8981748840122106, + "language_loss": 0.82242382, + "learning_rate": 5.17529835580704e-07, + "loss": 0.84400719, + "num_input_tokens_seen": 277316095, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.13806152, + "step": 12853, + "time_per_iteration": 2.479979991912842 + }, + { + "auxiliary_loss_clip": 0.010508, + "auxiliary_loss_mlp": 0.01001349, + "balance_loss_clip": 1.02486694, + "balance_loss_mlp": 0.99983644, + "epoch": 0.7728242897940779, + "flos": 54832221463680.0, + "grad_norm": 0.8090788242976781, + "language_loss": 0.54510373, + "learning_rate": 5.172684392797786e-07, + "loss": 0.56562519, + "num_input_tokens_seen": 277380130, + "router_z_loss_clip": 0.25878906, + "router_z_loss_mlp": 0.01512146, + "step": 12854, + "time_per_iteration": 3.1468088626861572 + }, + { + "auxiliary_loss_clip": 0.01121909, + "auxiliary_loss_mlp": 0.01026976, + "balance_loss_clip": 1.04549098, + "balance_loss_mlp": 1.01429188, + "epoch": 0.7728844130467458, + "flos": 34461962392320.0, + "grad_norm": 1.5474954387236135, + "language_loss": 0.71928298, + "learning_rate": 5.170070992041826e-07, + "loss": 0.74077189, + "num_input_tokens_seen": 277404015, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.12695312, + "step": 12855, + "time_per_iteration": 2.687574863433838 + }, + { + "auxiliary_loss_clip": 0.01116024, + "auxiliary_loss_mlp": 0.01027931, + "balance_loss_clip": 1.04344654, + "balance_loss_mlp": 1.01550984, + "epoch": 0.7729445362994138, + "flos": 18916341287040.0, + "grad_norm": 2.0681718709000405, + "language_loss": 0.6787557, + "learning_rate": 5.167458153638254e-07, + "loss": 0.70019525, + "num_input_tokens_seen": 277421375, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.12420654, + "step": 12856, + "time_per_iteration": 2.439256191253662 + }, + { + "auxiliary_loss_clip": 0.01125168, + "auxiliary_loss_mlp": 0.0102888, + "balance_loss_clip": 1.05105519, + "balance_loss_mlp": 1.0175674, + "epoch": 0.7730046595520818, + "flos": 22200048771840.0, + "grad_norm": 1.7166044878256481, + "language_loss": 0.79362559, + "learning_rate": 5.164845877686162e-07, + "loss": 0.81516612, + "num_input_tokens_seen": 277440170, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11297607, + "step": 12857, + "time_per_iteration": 2.5638182163238525 + }, + { + "auxiliary_loss_clip": 0.01112351, + "auxiliary_loss_mlp": 0.01027209, + "balance_loss_clip": 1.04162908, + "balance_loss_mlp": 1.01519299, + "epoch": 0.7730647828047498, + "flos": 13552328695680.0, + "grad_norm": 1.7937806734721904, + "language_loss": 0.78389621, + "learning_rate": 5.162234164284591e-07, + "loss": 0.80529189, + "num_input_tokens_seen": 277456880, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.12017822, + "step": 12858, + "time_per_iteration": 2.4219343662261963 + }, + { + "auxiliary_loss_clip": 0.01119051, + "auxiliary_loss_mlp": 0.01030602, + "balance_loss_clip": 1.04538465, + "balance_loss_mlp": 1.01843119, + "epoch": 0.7731249060574177, + "flos": 21976037602560.0, + "grad_norm": 2.007505995991686, + "language_loss": 0.76917279, + "learning_rate": 5.159623013532591e-07, + "loss": 0.79066932, + "num_input_tokens_seen": 277475365, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.1217041, + "step": 12859, + "time_per_iteration": 2.440427541732788 + }, + { + "auxiliary_loss_clip": 0.01114201, + "auxiliary_loss_mlp": 0.01026736, + "balance_loss_clip": 1.04586267, + "balance_loss_mlp": 1.01634073, + "epoch": 0.7731850293100857, + "flos": 22601817371520.0, + "grad_norm": 1.5252185236782814, + "language_loss": 0.67685974, + "learning_rate": 5.157012425529186e-07, + "loss": 0.69826913, + "num_input_tokens_seen": 277494975, + "router_z_loss_clip": 0.68261719, + "router_z_loss_mlp": 0.10394287, + "step": 12860, + "time_per_iteration": 2.52647066116333 + }, + { + "auxiliary_loss_clip": 0.01115108, + "auxiliary_loss_mlp": 0.01038282, + "balance_loss_clip": 1.03792787, + "balance_loss_mlp": 1.02562761, + "epoch": 0.7732451525627536, + "flos": 14098422142080.0, + "grad_norm": 2.757561156226721, + "language_loss": 0.7486065, + "learning_rate": 5.154402400373343e-07, + "loss": 0.77014041, + "num_input_tokens_seen": 277510520, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.12664795, + "step": 12861, + "time_per_iteration": 2.4026882648468018 + }, + { + "auxiliary_loss_clip": 0.01123487, + "auxiliary_loss_mlp": 0.01032008, + "balance_loss_clip": 1.04453921, + "balance_loss_mlp": 1.01952696, + "epoch": 0.7733052758154216, + "flos": 21470020755840.0, + "grad_norm": 1.6542931687823375, + "language_loss": 0.74972761, + "learning_rate": 5.15179293816405e-07, + "loss": 0.77128261, + "num_input_tokens_seen": 277530505, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.12481689, + "step": 12862, + "time_per_iteration": 2.537376880645752 + }, + { + "auxiliary_loss_clip": 0.01112723, + "auxiliary_loss_mlp": 0.01030855, + "balance_loss_clip": 1.04135561, + "balance_loss_mlp": 1.01880336, + "epoch": 0.7733653990680895, + "flos": 21394284929280.0, + "grad_norm": 1.56680448115144, + "language_loss": 0.82670748, + "learning_rate": 5.149184039000256e-07, + "loss": 0.84814322, + "num_input_tokens_seen": 277550810, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.1204834, + "step": 12863, + "time_per_iteration": 3.907125949859619 + }, + { + "auxiliary_loss_clip": 0.01117867, + "auxiliary_loss_mlp": 0.01032758, + "balance_loss_clip": 1.04256368, + "balance_loss_mlp": 1.02067637, + "epoch": 0.7734255223207576, + "flos": 17676058619520.0, + "grad_norm": 1.6157149902037902, + "language_loss": 0.73294592, + "learning_rate": 5.146575702980898e-07, + "loss": 0.75445223, + "num_input_tokens_seen": 277567680, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.12084961, + "step": 12864, + "time_per_iteration": 2.4278430938720703 + }, + { + "auxiliary_loss_clip": 0.01115078, + "auxiliary_loss_mlp": 0.01031118, + "balance_loss_clip": 1.04299569, + "balance_loss_mlp": 1.02026951, + "epoch": 0.7734856455734255, + "flos": 25230837617280.0, + "grad_norm": 1.769398712319479, + "language_loss": 0.82011044, + "learning_rate": 5.143967930204871e-07, + "loss": 0.8415724, + "num_input_tokens_seen": 277588970, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.10852051, + "step": 12865, + "time_per_iteration": 2.480496644973755 + }, + { + "auxiliary_loss_clip": 0.01118881, + "auxiliary_loss_mlp": 0.0104237, + "balance_loss_clip": 1.04122818, + "balance_loss_mlp": 1.02647913, + "epoch": 0.7735457688260935, + "flos": 23433112805760.0, + "grad_norm": 2.2408565429287814, + "language_loss": 0.71817559, + "learning_rate": 5.141360720771077e-07, + "loss": 0.73978806, + "num_input_tokens_seen": 277605450, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.15893555, + "step": 12866, + "time_per_iteration": 2.5351126194000244 + }, + { + "auxiliary_loss_clip": 0.0112279, + "auxiliary_loss_mlp": 0.01033748, + "balance_loss_clip": 1.04791021, + "balance_loss_mlp": 1.02084374, + "epoch": 0.7736058920787615, + "flos": 18729246320640.0, + "grad_norm": 2.5694718746014145, + "language_loss": 0.64930463, + "learning_rate": 5.138754074778371e-07, + "loss": 0.67086995, + "num_input_tokens_seen": 277622530, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.12902832, + "step": 12867, + "time_per_iteration": 2.4409549236297607 + }, + { + "auxiliary_loss_clip": 0.01116538, + "auxiliary_loss_mlp": 0.01044928, + "balance_loss_clip": 1.04103243, + "balance_loss_mlp": 1.0316962, + "epoch": 0.7736660153314294, + "flos": 22893304239360.0, + "grad_norm": 1.434099472370122, + "language_loss": 0.70899504, + "learning_rate": 5.136147992325595e-07, + "loss": 0.73060966, + "num_input_tokens_seen": 277642700, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.13226318, + "step": 12868, + "time_per_iteration": 2.484588623046875 + }, + { + "auxiliary_loss_clip": 0.01122718, + "auxiliary_loss_mlp": 0.01030068, + "balance_loss_clip": 1.04318893, + "balance_loss_mlp": 1.01793838, + "epoch": 0.7737261385840974, + "flos": 13800901789440.0, + "grad_norm": 2.7012604636930435, + "language_loss": 0.77300239, + "learning_rate": 5.133542473511578e-07, + "loss": 0.79453027, + "num_input_tokens_seen": 277660005, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.12133789, + "step": 12869, + "time_per_iteration": 2.4915542602539062 + }, + { + "auxiliary_loss_clip": 0.01109939, + "auxiliary_loss_mlp": 0.01026529, + "balance_loss_clip": 1.04027081, + "balance_loss_mlp": 1.01492977, + "epoch": 0.7737862618367654, + "flos": 28730727106560.0, + "grad_norm": 1.5667704129607158, + "language_loss": 0.73596585, + "learning_rate": 5.130937518435124e-07, + "loss": 0.75733054, + "num_input_tokens_seen": 277682890, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.11602783, + "step": 12870, + "time_per_iteration": 2.556851625442505 + }, + { + "auxiliary_loss_clip": 0.01116219, + "auxiliary_loss_mlp": 0.01029637, + "balance_loss_clip": 1.04363346, + "balance_loss_mlp": 1.01794255, + "epoch": 0.7738463850894334, + "flos": 17018570119680.0, + "grad_norm": 2.0095385381654878, + "language_loss": 0.75495487, + "learning_rate": 5.12833312719501e-07, + "loss": 0.77641344, + "num_input_tokens_seen": 277699330, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11694336, + "step": 12871, + "time_per_iteration": 2.3897297382354736 + }, + { + "auxiliary_loss_clip": 0.01112425, + "auxiliary_loss_mlp": 0.01032031, + "balance_loss_clip": 1.03940511, + "balance_loss_mlp": 1.02079558, + "epoch": 0.7739065083421013, + "flos": 20704010290560.0, + "grad_norm": 1.6008881785202551, + "language_loss": 0.68689823, + "learning_rate": 5.12572929988999e-07, + "loss": 0.70834279, + "num_input_tokens_seen": 277718750, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11236572, + "step": 12872, + "time_per_iteration": 2.4353647232055664 + }, + { + "auxiliary_loss_clip": 0.0111731, + "auxiliary_loss_mlp": 0.0103213, + "balance_loss_clip": 1.04406309, + "balance_loss_mlp": 1.01914239, + "epoch": 0.7739666315947693, + "flos": 20697222620160.0, + "grad_norm": 1.9700922133989403, + "language_loss": 0.84977221, + "learning_rate": 5.123126036618804e-07, + "loss": 0.87126666, + "num_input_tokens_seen": 277734645, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.12988281, + "step": 12873, + "time_per_iteration": 2.465801954269409 + }, + { + "auxiliary_loss_clip": 0.01115731, + "auxiliary_loss_mlp": 0.01034424, + "balance_loss_clip": 1.0428679, + "balance_loss_mlp": 1.02275932, + "epoch": 0.7740267548474372, + "flos": 29570677718400.0, + "grad_norm": 2.6651790919237692, + "language_loss": 0.65380812, + "learning_rate": 5.120523337480174e-07, + "loss": 0.67530966, + "num_input_tokens_seen": 277755535, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11663818, + "step": 12874, + "time_per_iteration": 2.5036673545837402 + }, + { + "auxiliary_loss_clip": 0.01113351, + "auxiliary_loss_mlp": 0.01029337, + "balance_loss_clip": 1.04098439, + "balance_loss_mlp": 1.01767218, + "epoch": 0.7740868781001052, + "flos": 23659099223040.0, + "grad_norm": 2.9339490165392417, + "language_loss": 0.62120616, + "learning_rate": 5.117921202572785e-07, + "loss": 0.64263308, + "num_input_tokens_seen": 277775585, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11669922, + "step": 12875, + "time_per_iteration": 2.4624228477478027 + }, + { + "auxiliary_loss_clip": 0.01115325, + "auxiliary_loss_mlp": 0.01030362, + "balance_loss_clip": 1.0421102, + "balance_loss_mlp": 1.01838732, + "epoch": 0.7741470013527731, + "flos": 24717314828160.0, + "grad_norm": 1.8916914711362174, + "language_loss": 0.6541934, + "learning_rate": 5.115319631995318e-07, + "loss": 0.67565024, + "num_input_tokens_seen": 277794795, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11981201, + "step": 12876, + "time_per_iteration": 2.477994441986084 + }, + { + "auxiliary_loss_clip": 0.01115756, + "auxiliary_loss_mlp": 0.01033821, + "balance_loss_clip": 1.04295421, + "balance_loss_mlp": 1.0224309, + "epoch": 0.7742071246054412, + "flos": 21871645701120.0, + "grad_norm": 1.9236512577666596, + "language_loss": 0.71051991, + "learning_rate": 5.112718625846433e-07, + "loss": 0.73201573, + "num_input_tokens_seen": 277813235, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11376953, + "step": 12877, + "time_per_iteration": 2.489464521408081 + }, + { + "auxiliary_loss_clip": 0.01122088, + "auxiliary_loss_mlp": 0.01030124, + "balance_loss_clip": 1.04567719, + "balance_loss_mlp": 1.01782203, + "epoch": 0.7742672478581091, + "flos": 22674249146880.0, + "grad_norm": 1.7055735937453094, + "language_loss": 0.82914877, + "learning_rate": 5.110118184224736e-07, + "loss": 0.85067093, + "num_input_tokens_seen": 277832560, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.12298584, + "step": 12878, + "time_per_iteration": 2.4672164916992188 + }, + { + "auxiliary_loss_clip": 0.01115484, + "auxiliary_loss_mlp": 0.01036146, + "balance_loss_clip": 1.04294968, + "balance_loss_mlp": 1.02325344, + "epoch": 0.7743273711107771, + "flos": 18840892769280.0, + "grad_norm": 1.6885067087153436, + "language_loss": 0.73216605, + "learning_rate": 5.10751830722885e-07, + "loss": 0.75368237, + "num_input_tokens_seen": 277850120, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.12896729, + "step": 12879, + "time_per_iteration": 2.4706215858459473 + }, + { + "auxiliary_loss_clip": 0.01110591, + "auxiliary_loss_mlp": 0.01028831, + "balance_loss_clip": 1.04157495, + "balance_loss_mlp": 1.01721382, + "epoch": 0.7743874943634451, + "flos": 28729326476160.0, + "grad_norm": 1.6756741402809632, + "language_loss": 0.79576772, + "learning_rate": 5.104918994957364e-07, + "loss": 0.81716192, + "num_input_tokens_seen": 277871020, + "router_z_loss_clip": 0.69042969, + "router_z_loss_mlp": 0.11608887, + "step": 12880, + "time_per_iteration": 2.4835193157196045 + }, + { + "auxiliary_loss_clip": 0.01115471, + "auxiliary_loss_mlp": 0.01034461, + "balance_loss_clip": 1.04293489, + "balance_loss_mlp": 1.02279019, + "epoch": 0.774447617616113, + "flos": 21909639312000.0, + "grad_norm": 2.7462209802635495, + "language_loss": 0.70158237, + "learning_rate": 5.102320247508847e-07, + "loss": 0.72308171, + "num_input_tokens_seen": 277891525, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11657715, + "step": 12881, + "time_per_iteration": 2.723651885986328 + }, + { + "auxiliary_loss_clip": 0.01118407, + "auxiliary_loss_mlp": 0.01040013, + "balance_loss_clip": 1.04198372, + "balance_loss_mlp": 1.026721, + "epoch": 0.774507740868781, + "flos": 19500643825920.0, + "grad_norm": 2.549639800795656, + "language_loss": 0.84233463, + "learning_rate": 5.099722064981832e-07, + "loss": 0.86391884, + "num_input_tokens_seen": 277910425, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.1328125, + "step": 12882, + "time_per_iteration": 2.4722766876220703 + }, + { + "auxiliary_loss_clip": 0.01049224, + "auxiliary_loss_mlp": 0.01001044, + "balance_loss_clip": 1.02415562, + "balance_loss_mlp": 0.99964291, + "epoch": 0.774567864121449, + "flos": 59426560402560.0, + "grad_norm": 0.7650470534480254, + "language_loss": 0.60462892, + "learning_rate": 5.097124447474858e-07, + "loss": 0.62513155, + "num_input_tokens_seen": 277972795, + "router_z_loss_clip": 0.25097656, + "router_z_loss_mlp": 0.01400757, + "step": 12883, + "time_per_iteration": 3.0531225204467773 + }, + { + "auxiliary_loss_clip": 0.01120362, + "auxiliary_loss_mlp": 0.01028244, + "balance_loss_clip": 1.04618168, + "balance_loss_mlp": 1.01581037, + "epoch": 0.774627987374117, + "flos": 13225326255360.0, + "grad_norm": 3.232541266929934, + "language_loss": 0.72636712, + "learning_rate": 5.094527395086416e-07, + "loss": 0.74785322, + "num_input_tokens_seen": 277990675, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.12426758, + "step": 12884, + "time_per_iteration": 2.413738489151001 + }, + { + "auxiliary_loss_clip": 0.0111309, + "auxiliary_loss_mlp": 0.01032617, + "balance_loss_clip": 1.0415833, + "balance_loss_mlp": 1.02182841, + "epoch": 0.7746881106267849, + "flos": 21394033534080.0, + "grad_norm": 2.6151878098947443, + "language_loss": 0.81280315, + "learning_rate": 5.091930907914986e-07, + "loss": 0.83426023, + "num_input_tokens_seen": 278010050, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.10778809, + "step": 12885, + "time_per_iteration": 3.8760428428649902 + }, + { + "auxiliary_loss_clip": 0.0111725, + "auxiliary_loss_mlp": 0.01031436, + "balance_loss_clip": 1.04562056, + "balance_loss_mlp": 1.02067709, + "epoch": 0.7747482338794529, + "flos": 25629338079360.0, + "grad_norm": 1.7558258309741133, + "language_loss": 0.63866353, + "learning_rate": 5.089334986059029e-07, + "loss": 0.66015041, + "num_input_tokens_seen": 278030660, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.10760498, + "step": 12886, + "time_per_iteration": 2.5685598850250244 + }, + { + "auxiliary_loss_clip": 0.01115714, + "auxiliary_loss_mlp": 0.01033678, + "balance_loss_clip": 1.04204917, + "balance_loss_mlp": 1.02293777, + "epoch": 0.7748083571321208, + "flos": 11546933402880.0, + "grad_norm": 3.3196090514636896, + "language_loss": 0.69785941, + "learning_rate": 5.086739629616987e-07, + "loss": 0.71935332, + "num_input_tokens_seen": 278047645, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.10742188, + "step": 12887, + "time_per_iteration": 3.834542989730835 + }, + { + "auxiliary_loss_clip": 0.01111255, + "auxiliary_loss_mlp": 0.01027605, + "balance_loss_clip": 1.04087353, + "balance_loss_mlp": 1.01668549, + "epoch": 0.7748684803847888, + "flos": 19062425900160.0, + "grad_norm": 1.96840890916956, + "language_loss": 0.70192099, + "learning_rate": 5.084144838687275e-07, + "loss": 0.72330964, + "num_input_tokens_seen": 278066170, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.10913086, + "step": 12888, + "time_per_iteration": 2.4532132148742676 + }, + { + "auxiliary_loss_clip": 0.0111312, + "auxiliary_loss_mlp": 0.01028903, + "balance_loss_clip": 1.03938913, + "balance_loss_mlp": 1.01709557, + "epoch": 0.7749286036374567, + "flos": 22273162905600.0, + "grad_norm": 1.6972557803163937, + "language_loss": 0.8205412, + "learning_rate": 5.081550613368279e-07, + "loss": 0.84196144, + "num_input_tokens_seen": 278085545, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.1182251, + "step": 12889, + "time_per_iteration": 2.4846956729888916 + }, + { + "auxiliary_loss_clip": 0.01126579, + "auxiliary_loss_mlp": 0.01029839, + "balance_loss_clip": 1.05401051, + "balance_loss_mlp": 1.01828146, + "epoch": 0.7749887268901248, + "flos": 20192462749440.0, + "grad_norm": 3.031642079228375, + "language_loss": 0.79866809, + "learning_rate": 5.07895695375838e-07, + "loss": 0.82023227, + "num_input_tokens_seen": 278102995, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11560059, + "step": 12890, + "time_per_iteration": 2.456223487854004 + }, + { + "auxiliary_loss_clip": 0.01120047, + "auxiliary_loss_mlp": 0.01030081, + "balance_loss_clip": 1.04461026, + "balance_loss_mlp": 1.01789188, + "epoch": 0.7750488501427927, + "flos": 20337541781760.0, + "grad_norm": 2.2188016265603836, + "language_loss": 0.66205418, + "learning_rate": 5.076363859955932e-07, + "loss": 0.68355542, + "num_input_tokens_seen": 278121460, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12194824, + "step": 12891, + "time_per_iteration": 2.4577999114990234 + }, + { + "auxiliary_loss_clip": 0.0111634, + "auxiliary_loss_mlp": 0.01030892, + "balance_loss_clip": 1.0419718, + "balance_loss_mlp": 1.01876879, + "epoch": 0.7751089733954607, + "flos": 28364043116160.0, + "grad_norm": 1.847798834110224, + "language_loss": 0.7897324, + "learning_rate": 5.073771332059257e-07, + "loss": 0.81120467, + "num_input_tokens_seen": 278143905, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.12127686, + "step": 12892, + "time_per_iteration": 3.987574577331543 + }, + { + "auxiliary_loss_clip": 0.01121633, + "auxiliary_loss_mlp": 0.01043075, + "balance_loss_clip": 1.04323149, + "balance_loss_mlp": 1.02839446, + "epoch": 0.7751690966481286, + "flos": 16943803960320.0, + "grad_norm": 2.651714231030843, + "language_loss": 0.6722858, + "learning_rate": 5.071179370166669e-07, + "loss": 0.69393289, + "num_input_tokens_seen": 278160850, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.14697266, + "step": 12893, + "time_per_iteration": 2.410356044769287 + }, + { + "auxiliary_loss_clip": 0.01048786, + "auxiliary_loss_mlp": 0.01005924, + "balance_loss_clip": 1.02312517, + "balance_loss_mlp": 1.00428009, + "epoch": 0.7752292199007966, + "flos": 65668050339840.0, + "grad_norm": 0.8063671568299086, + "language_loss": 0.58440346, + "learning_rate": 5.068587974376468e-07, + "loss": 0.60495055, + "num_input_tokens_seen": 278219950, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.01643372, + "step": 12894, + "time_per_iteration": 3.1358730792999268 + }, + { + "auxiliary_loss_clip": 0.01120951, + "auxiliary_loss_mlp": 0.0103379, + "balance_loss_clip": 1.04707146, + "balance_loss_mlp": 1.02078426, + "epoch": 0.7752893431534646, + "flos": 20594662312320.0, + "grad_norm": 2.749360583856106, + "language_loss": 0.7854147, + "learning_rate": 5.065997144786895e-07, + "loss": 0.80696207, + "num_input_tokens_seen": 278237805, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.13006592, + "step": 12895, + "time_per_iteration": 2.4415669441223145 + }, + { + "auxiliary_loss_clip": 0.0111821, + "auxiliary_loss_mlp": 0.01034992, + "balance_loss_clip": 1.04308176, + "balance_loss_mlp": 1.02187347, + "epoch": 0.7753494664061326, + "flos": 20485350247680.0, + "grad_norm": 2.203461133711119, + "language_loss": 0.67713094, + "learning_rate": 5.063406881496209e-07, + "loss": 0.69866294, + "num_input_tokens_seen": 278257660, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.13116455, + "step": 12896, + "time_per_iteration": 2.4521596431732178 + }, + { + "auxiliary_loss_clip": 0.01114573, + "auxiliary_loss_mlp": 0.01033846, + "balance_loss_clip": 1.04121017, + "balance_loss_mlp": 1.02295649, + "epoch": 0.7754095896588006, + "flos": 20265900105600.0, + "grad_norm": 1.7034490931489288, + "language_loss": 0.68899029, + "learning_rate": 5.060817184602629e-07, + "loss": 0.71047449, + "num_input_tokens_seen": 278275110, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.10888672, + "step": 12897, + "time_per_iteration": 2.4365203380584717 + }, + { + "auxiliary_loss_clip": 0.01110689, + "auxiliary_loss_mlp": 0.01037123, + "balance_loss_clip": 1.03749335, + "balance_loss_mlp": 1.02383709, + "epoch": 0.7754697129114685, + "flos": 23331091201920.0, + "grad_norm": 1.7498486298935476, + "language_loss": 0.7516942, + "learning_rate": 5.058228054204364e-07, + "loss": 0.77317232, + "num_input_tokens_seen": 278293035, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.13293457, + "step": 12898, + "time_per_iteration": 2.495652675628662 + }, + { + "auxiliary_loss_clip": 0.01116112, + "auxiliary_loss_mlp": 0.01029837, + "balance_loss_clip": 1.04301345, + "balance_loss_mlp": 1.01698613, + "epoch": 0.7755298361641365, + "flos": 17347619635200.0, + "grad_norm": 1.868992724846569, + "language_loss": 0.69839567, + "learning_rate": 5.055639490399588e-07, + "loss": 0.71985519, + "num_input_tokens_seen": 278311010, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.128479, + "step": 12899, + "time_per_iteration": 2.434767961502075 + }, + { + "auxiliary_loss_clip": 0.01115934, + "auxiliary_loss_mlp": 0.01034686, + "balance_loss_clip": 1.04357505, + "balance_loss_mlp": 1.02232981, + "epoch": 0.7755899594168044, + "flos": 19645866512640.0, + "grad_norm": 2.4241282629977694, + "language_loss": 0.75475287, + "learning_rate": 5.053051493286453e-07, + "loss": 0.77625906, + "num_input_tokens_seen": 278329900, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.12371826, + "step": 12900, + "time_per_iteration": 2.429136037826538 + }, + { + "auxiliary_loss_clip": 0.01112875, + "auxiliary_loss_mlp": 0.0103351, + "balance_loss_clip": 1.03991485, + "balance_loss_mlp": 1.02213705, + "epoch": 0.7756500826694724, + "flos": 27414457217280.0, + "grad_norm": 2.19798555497809, + "language_loss": 0.77300763, + "learning_rate": 5.050464062963113e-07, + "loss": 0.7944715, + "num_input_tokens_seen": 278349980, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11376953, + "step": 12901, + "time_per_iteration": 2.5610625743865967 + }, + { + "auxiliary_loss_clip": 0.01127359, + "auxiliary_loss_mlp": 0.01029731, + "balance_loss_clip": 1.05283117, + "balance_loss_mlp": 1.01732111, + "epoch": 0.7757102059221404, + "flos": 28730511624960.0, + "grad_norm": 1.4800355725101038, + "language_loss": 0.77144766, + "learning_rate": 5.047877199527666e-07, + "loss": 0.79301858, + "num_input_tokens_seen": 278372485, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.12408447, + "step": 12902, + "time_per_iteration": 2.5147311687469482 + }, + { + "auxiliary_loss_clip": 0.01118065, + "auxiliary_loss_mlp": 0.01027632, + "balance_loss_clip": 1.04628897, + "balance_loss_mlp": 1.0158596, + "epoch": 0.7757703291748084, + "flos": 22486795044480.0, + "grad_norm": 1.7424387795549494, + "language_loss": 0.73355162, + "learning_rate": 5.045290903078215e-07, + "loss": 0.75500858, + "num_input_tokens_seen": 278391660, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11767578, + "step": 12903, + "time_per_iteration": 2.5076675415039062 + }, + { + "auxiliary_loss_clip": 0.01108239, + "auxiliary_loss_mlp": 0.01030422, + "balance_loss_clip": 1.03677511, + "balance_loss_mlp": 1.0170114, + "epoch": 0.7758304524274763, + "flos": 21430159637760.0, + "grad_norm": 4.314651552907949, + "language_loss": 0.76509476, + "learning_rate": 5.042705173712835e-07, + "loss": 0.78648138, + "num_input_tokens_seen": 278409125, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.13415527, + "step": 12904, + "time_per_iteration": 2.5266101360321045 + }, + { + "auxiliary_loss_clip": 0.01108951, + "auxiliary_loss_mlp": 0.01027379, + "balance_loss_clip": 1.04110026, + "balance_loss_mlp": 1.01653671, + "epoch": 0.7758905756801443, + "flos": 23659242877440.0, + "grad_norm": 2.3876450287847693, + "language_loss": 0.6825707, + "learning_rate": 5.040120011529576e-07, + "loss": 0.70393395, + "num_input_tokens_seen": 278429450, + "router_z_loss_clip": 0.67871094, + "router_z_loss_mlp": 0.10852051, + "step": 12905, + "time_per_iteration": 2.489551544189453 + }, + { + "auxiliary_loss_clip": 0.01110686, + "auxiliary_loss_mlp": 0.01028207, + "balance_loss_clip": 1.04199076, + "balance_loss_mlp": 1.01651287, + "epoch": 0.7759506989328122, + "flos": 28365479660160.0, + "grad_norm": 2.241715623886821, + "language_loss": 0.67541087, + "learning_rate": 5.037535416626459e-07, + "loss": 0.69679976, + "num_input_tokens_seen": 278449925, + "router_z_loss_clip": 0.68701172, + "router_z_loss_mlp": 0.11706543, + "step": 12906, + "time_per_iteration": 2.488957643508911 + }, + { + "auxiliary_loss_clip": 0.01112885, + "auxiliary_loss_mlp": 0.0102854, + "balance_loss_clip": 1.04141557, + "balance_loss_mlp": 1.01699424, + "epoch": 0.7760108221854802, + "flos": 14902785354240.0, + "grad_norm": 8.048073423052843, + "language_loss": 0.81449473, + "learning_rate": 5.034951389101498e-07, + "loss": 0.83590901, + "num_input_tokens_seen": 278467255, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.11547852, + "step": 12907, + "time_per_iteration": 3.820080041885376 + }, + { + "auxiliary_loss_clip": 0.01114674, + "auxiliary_loss_mlp": 0.01040176, + "balance_loss_clip": 1.04374063, + "balance_loss_mlp": 1.02712297, + "epoch": 0.7760709454381483, + "flos": 14792503622400.0, + "grad_norm": 2.1857252812792005, + "language_loss": 0.67370301, + "learning_rate": 5.032367929052685e-07, + "loss": 0.69525158, + "num_input_tokens_seen": 278484250, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.1305542, + "step": 12908, + "time_per_iteration": 2.411519765853882 + }, + { + "auxiliary_loss_clip": 0.01124183, + "auxiliary_loss_mlp": 0.01032379, + "balance_loss_clip": 1.04963672, + "balance_loss_mlp": 1.02105999, + "epoch": 0.7761310686908162, + "flos": 17379831156480.0, + "grad_norm": 1.6004678482370407, + "language_loss": 0.70385528, + "learning_rate": 5.029785036577976e-07, + "loss": 0.72542083, + "num_input_tokens_seen": 278502740, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11315918, + "step": 12909, + "time_per_iteration": 2.420025587081909 + }, + { + "auxiliary_loss_clip": 0.01114585, + "auxiliary_loss_mlp": 0.01036912, + "balance_loss_clip": 1.04274976, + "balance_loss_mlp": 1.02534914, + "epoch": 0.7761911919434842, + "flos": 25556547168000.0, + "grad_norm": 1.597701735539304, + "language_loss": 0.68171507, + "learning_rate": 5.027202711775324e-07, + "loss": 0.70323002, + "num_input_tokens_seen": 278523890, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11547852, + "step": 12910, + "time_per_iteration": 2.4978983402252197 + }, + { + "auxiliary_loss_clip": 0.01110585, + "auxiliary_loss_mlp": 0.01031042, + "balance_loss_clip": 1.03857112, + "balance_loss_mlp": 1.02009225, + "epoch": 0.7762513151961521, + "flos": 23179763203200.0, + "grad_norm": 2.6955323438467302, + "language_loss": 0.71741045, + "learning_rate": 5.024620954742646e-07, + "loss": 0.73882675, + "num_input_tokens_seen": 278543185, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.10943604, + "step": 12911, + "time_per_iteration": 2.562906503677368 + }, + { + "auxiliary_loss_clip": 0.01118513, + "auxiliary_loss_mlp": 0.01032703, + "balance_loss_clip": 1.04289341, + "balance_loss_mlp": 1.01948285, + "epoch": 0.7763114384488201, + "flos": 21689614552320.0, + "grad_norm": 3.1380249517892715, + "language_loss": 0.62988544, + "learning_rate": 5.022039765577836e-07, + "loss": 0.65139759, + "num_input_tokens_seen": 278559220, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.13226318, + "step": 12912, + "time_per_iteration": 2.4542276859283447 + }, + { + "auxiliary_loss_clip": 0.01047911, + "auxiliary_loss_mlp": 0.01004208, + "balance_loss_clip": 1.02218318, + "balance_loss_mlp": 1.00284886, + "epoch": 0.776371561701488, + "flos": 69025554316800.0, + "grad_norm": 0.7688315113306563, + "language_loss": 0.53199899, + "learning_rate": 5.019459144378779e-07, + "loss": 0.55252016, + "num_input_tokens_seen": 278618185, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.01359558, + "step": 12913, + "time_per_iteration": 3.130092144012451 + }, + { + "auxiliary_loss_clip": 0.01120984, + "auxiliary_loss_mlp": 0.01034045, + "balance_loss_clip": 1.04708922, + "balance_loss_mlp": 1.02169526, + "epoch": 0.776431684954156, + "flos": 22893914770560.0, + "grad_norm": 1.8362344950352587, + "language_loss": 0.62184048, + "learning_rate": 5.016879091243338e-07, + "loss": 0.64339072, + "num_input_tokens_seen": 278636210, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.12353516, + "step": 12914, + "time_per_iteration": 2.4733870029449463 + }, + { + "auxiliary_loss_clip": 0.01117478, + "auxiliary_loss_mlp": 0.01030265, + "balance_loss_clip": 1.04339671, + "balance_loss_mlp": 1.01834965, + "epoch": 0.776491808206824, + "flos": 20261554560000.0, + "grad_norm": 3.176759315445082, + "language_loss": 0.82286382, + "learning_rate": 5.014299606269339e-07, + "loss": 0.84434122, + "num_input_tokens_seen": 278653305, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11914062, + "step": 12915, + "time_per_iteration": 2.438220739364624 + }, + { + "auxiliary_loss_clip": 0.01113452, + "auxiliary_loss_mlp": 0.01032257, + "balance_loss_clip": 1.03727031, + "balance_loss_mlp": 1.01922107, + "epoch": 0.776551931459492, + "flos": 26759051706240.0, + "grad_norm": 1.6735177455666843, + "language_loss": 0.74822617, + "learning_rate": 5.011720689554603e-07, + "loss": 0.76968324, + "num_input_tokens_seen": 278671850, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.13037109, + "step": 12916, + "time_per_iteration": 2.709099531173706 + }, + { + "auxiliary_loss_clip": 0.01112567, + "auxiliary_loss_mlp": 0.01030519, + "balance_loss_clip": 1.03852725, + "balance_loss_mlp": 1.0186038, + "epoch": 0.7766120547121599, + "flos": 52665080250240.0, + "grad_norm": 1.4998206988759937, + "language_loss": 0.65607244, + "learning_rate": 5.009142341196919e-07, + "loss": 0.67750329, + "num_input_tokens_seen": 278697860, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.1192627, + "step": 12917, + "time_per_iteration": 2.788208246231079 + }, + { + "auxiliary_loss_clip": 0.01108827, + "auxiliary_loss_mlp": 0.01033776, + "balance_loss_clip": 1.03478432, + "balance_loss_mlp": 1.02193284, + "epoch": 0.7766721779648279, + "flos": 25156215112320.0, + "grad_norm": 1.8402234851781907, + "language_loss": 0.64384699, + "learning_rate": 5.006564561294065e-07, + "loss": 0.66527301, + "num_input_tokens_seen": 278720655, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11853027, + "step": 12918, + "time_per_iteration": 2.541132688522339 + }, + { + "auxiliary_loss_clip": 0.01112393, + "auxiliary_loss_mlp": 0.01029046, + "balance_loss_clip": 1.04052329, + "balance_loss_mlp": 1.01766777, + "epoch": 0.7767323012174958, + "flos": 23760761690880.0, + "grad_norm": 2.2644043748361358, + "language_loss": 0.73772144, + "learning_rate": 5.003987349943777e-07, + "loss": 0.75913584, + "num_input_tokens_seen": 278737375, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11383057, + "step": 12919, + "time_per_iteration": 2.466200351715088 + }, + { + "auxiliary_loss_clip": 0.01114536, + "auxiliary_loss_mlp": 0.01033627, + "balance_loss_clip": 1.04161608, + "balance_loss_mlp": 1.02123559, + "epoch": 0.7767924244701638, + "flos": 22086642556800.0, + "grad_norm": 3.0411640065930423, + "language_loss": 0.79524863, + "learning_rate": 5.001410707243792e-07, + "loss": 0.81673026, + "num_input_tokens_seen": 278756510, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.12384033, + "step": 12920, + "time_per_iteration": 2.4979050159454346 + }, + { + "auxiliary_loss_clip": 0.0111452, + "auxiliary_loss_mlp": 0.01029528, + "balance_loss_clip": 1.04111385, + "balance_loss_mlp": 1.01636147, + "epoch": 0.7768525477228319, + "flos": 21981640124160.0, + "grad_norm": 1.9222504685740998, + "language_loss": 0.70887345, + "learning_rate": 4.998834633291829e-07, + "loss": 0.7303139, + "num_input_tokens_seen": 278775410, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.13153076, + "step": 12921, + "time_per_iteration": 2.4308815002441406 + }, + { + "auxiliary_loss_clip": 0.01124288, + "auxiliary_loss_mlp": 0.0103996, + "balance_loss_clip": 1.04388177, + "balance_loss_mlp": 1.02676928, + "epoch": 0.7769126709754998, + "flos": 21794581071360.0, + "grad_norm": 1.9191746700025623, + "language_loss": 0.7633487, + "learning_rate": 4.996259128185547e-07, + "loss": 0.78499115, + "num_input_tokens_seen": 278794260, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.13208008, + "step": 12922, + "time_per_iteration": 2.460679531097412 + }, + { + "auxiliary_loss_clip": 0.01134528, + "auxiliary_loss_mlp": 0.01030216, + "balance_loss_clip": 1.05868125, + "balance_loss_mlp": 1.01806259, + "epoch": 0.7769727942281678, + "flos": 20047994248320.0, + "grad_norm": 1.615656026670855, + "language_loss": 0.80643713, + "learning_rate": 4.993684192022625e-07, + "loss": 0.82808459, + "num_input_tokens_seen": 278813290, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.1217041, + "step": 12923, + "time_per_iteration": 2.4330010414123535 + }, + { + "auxiliary_loss_clip": 0.01119658, + "auxiliary_loss_mlp": 0.01032834, + "balance_loss_clip": 1.04720759, + "balance_loss_mlp": 1.02217638, + "epoch": 0.7770329174808357, + "flos": 21686777377920.0, + "grad_norm": 10.92130404377246, + "language_loss": 0.92399096, + "learning_rate": 4.991109824900699e-07, + "loss": 0.94551581, + "num_input_tokens_seen": 278830610, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.10656738, + "step": 12924, + "time_per_iteration": 2.525999069213867 + }, + { + "auxiliary_loss_clip": 0.01110794, + "auxiliary_loss_mlp": 0.01027945, + "balance_loss_clip": 1.03859711, + "balance_loss_mlp": 1.01592922, + "epoch": 0.7770930407335037, + "flos": 25849255098240.0, + "grad_norm": 25.879659705503876, + "language_loss": 0.66249573, + "learning_rate": 4.988536026917401e-07, + "loss": 0.68388307, + "num_input_tokens_seen": 278849530, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.12017822, + "step": 12925, + "time_per_iteration": 2.4617955684661865 + }, + { + "auxiliary_loss_clip": 0.01117692, + "auxiliary_loss_mlp": 0.01029819, + "balance_loss_clip": 1.04424286, + "balance_loss_mlp": 1.01810098, + "epoch": 0.7771531639861716, + "flos": 24347865490560.0, + "grad_norm": 2.4496672611756134, + "language_loss": 0.72139812, + "learning_rate": 4.985962798170314e-07, + "loss": 0.74287331, + "num_input_tokens_seen": 278869005, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11712646, + "step": 12926, + "time_per_iteration": 2.4969277381896973 + }, + { + "auxiliary_loss_clip": 0.0111852, + "auxiliary_loss_mlp": 0.01028736, + "balance_loss_clip": 1.04381502, + "balance_loss_mlp": 1.01542664, + "epoch": 0.7772132872388396, + "flos": 25629948610560.0, + "grad_norm": 1.822368818251396, + "language_loss": 0.65783668, + "learning_rate": 4.983390138757027e-07, + "loss": 0.67930925, + "num_input_tokens_seen": 278888790, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.13311768, + "step": 12927, + "time_per_iteration": 2.484880208969116 + }, + { + "auxiliary_loss_clip": 0.01121175, + "auxiliary_loss_mlp": 0.010357, + "balance_loss_clip": 1.04863274, + "balance_loss_mlp": 1.02288496, + "epoch": 0.7772734104915076, + "flos": 26067412350720.0, + "grad_norm": 2.131028868961606, + "language_loss": 0.72573322, + "learning_rate": 4.980818048775093e-07, + "loss": 0.74730194, + "num_input_tokens_seen": 278908150, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.12823486, + "step": 12928, + "time_per_iteration": 3.940680980682373 + }, + { + "auxiliary_loss_clip": 0.01109759, + "auxiliary_loss_mlp": 0.01031607, + "balance_loss_clip": 1.03870082, + "balance_loss_mlp": 1.01871467, + "epoch": 0.7773335337441756, + "flos": 22925048883840.0, + "grad_norm": 1.8777565414339246, + "language_loss": 0.74272013, + "learning_rate": 4.978246528322036e-07, + "loss": 0.76413381, + "num_input_tokens_seen": 278927425, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.12890625, + "step": 12929, + "time_per_iteration": 3.8474392890930176 + }, + { + "auxiliary_loss_clip": 0.01115088, + "auxiliary_loss_mlp": 0.01033479, + "balance_loss_clip": 1.04127216, + "balance_loss_mlp": 1.0214808, + "epoch": 0.7773936569968435, + "flos": 20776765288320.0, + "grad_norm": 2.220249537123541, + "language_loss": 0.77570498, + "learning_rate": 4.975675577495377e-07, + "loss": 0.79719067, + "num_input_tokens_seen": 278946475, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.12005615, + "step": 12930, + "time_per_iteration": 2.4473183155059814 + }, + { + "auxiliary_loss_clip": 0.01115497, + "auxiliary_loss_mlp": 0.01032215, + "balance_loss_clip": 1.04344058, + "balance_loss_mlp": 1.02003753, + "epoch": 0.7774537802495115, + "flos": 20372267255040.0, + "grad_norm": 2.1026288901581216, + "language_loss": 0.79445267, + "learning_rate": 4.973105196392613e-07, + "loss": 0.81592983, + "num_input_tokens_seen": 278964345, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.12182617, + "step": 12931, + "time_per_iteration": 2.4863805770874023 + }, + { + "auxiliary_loss_clip": 0.01071826, + "auxiliary_loss_mlp": 0.010035, + "balance_loss_clip": 1.04730976, + "balance_loss_mlp": 1.00208783, + "epoch": 0.7775139035021794, + "flos": 53912081738880.0, + "grad_norm": 0.833341905340339, + "language_loss": 0.5973475, + "learning_rate": 4.970535385111199e-07, + "loss": 0.61810082, + "num_input_tokens_seen": 279022380, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.01409912, + "step": 12932, + "time_per_iteration": 3.0410659313201904 + }, + { + "auxiliary_loss_clip": 0.01118672, + "auxiliary_loss_mlp": 0.01032827, + "balance_loss_clip": 1.04404879, + "balance_loss_mlp": 1.02076888, + "epoch": 0.7775740267548474, + "flos": 28842481296000.0, + "grad_norm": 1.5240150010252713, + "language_loss": 0.760203, + "learning_rate": 4.967966143748595e-07, + "loss": 0.7817179, + "num_input_tokens_seen": 279044275, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.12054443, + "step": 12933, + "time_per_iteration": 2.4957785606384277 + }, + { + "auxiliary_loss_clip": 0.01116016, + "auxiliary_loss_mlp": 0.01035435, + "balance_loss_clip": 1.04146326, + "balance_loss_mlp": 1.02293599, + "epoch": 0.7776341500075155, + "flos": 21872471713920.0, + "grad_norm": 2.2283335263719337, + "language_loss": 0.7360819, + "learning_rate": 4.965397472402215e-07, + "loss": 0.75759637, + "num_input_tokens_seen": 279063375, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.12506104, + "step": 12934, + "time_per_iteration": 2.476609706878662 + }, + { + "auxiliary_loss_clip": 0.01111807, + "auxiliary_loss_mlp": 0.01028258, + "balance_loss_clip": 1.03946507, + "balance_loss_mlp": 1.01568723, + "epoch": 0.7776942732601834, + "flos": 20229845829120.0, + "grad_norm": 2.5029390230149917, + "language_loss": 0.7034291, + "learning_rate": 4.962829371169475e-07, + "loss": 0.72482967, + "num_input_tokens_seen": 279082680, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.12567139, + "step": 12935, + "time_per_iteration": 2.4324686527252197 + }, + { + "auxiliary_loss_clip": 0.01119439, + "auxiliary_loss_mlp": 0.01038645, + "balance_loss_clip": 1.04293656, + "balance_loss_mlp": 1.02560401, + "epoch": 0.7777543965128514, + "flos": 22231829329920.0, + "grad_norm": 1.7108413986176363, + "language_loss": 0.83610308, + "learning_rate": 4.960261840147746e-07, + "loss": 0.85768402, + "num_input_tokens_seen": 279099805, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.13037109, + "step": 12936, + "time_per_iteration": 3.820065975189209 + }, + { + "auxiliary_loss_clip": 0.01121388, + "auxiliary_loss_mlp": 0.01030137, + "balance_loss_clip": 1.04420352, + "balance_loss_mlp": 1.01851392, + "epoch": 0.7778145197655193, + "flos": 14501950508160.0, + "grad_norm": 1.920336357127306, + "language_loss": 0.67662013, + "learning_rate": 4.957694879434397e-07, + "loss": 0.69813538, + "num_input_tokens_seen": 279117975, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.11633301, + "step": 12937, + "time_per_iteration": 2.4359447956085205 + }, + { + "auxiliary_loss_clip": 0.01118606, + "auxiliary_loss_mlp": 0.01028268, + "balance_loss_clip": 1.04478991, + "balance_loss_mlp": 1.01621628, + "epoch": 0.7778746430181873, + "flos": 21140288881920.0, + "grad_norm": 1.5569940681780408, + "language_loss": 0.87245047, + "learning_rate": 4.955128489126777e-07, + "loss": 0.89391923, + "num_input_tokens_seen": 279137255, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.12054443, + "step": 12938, + "time_per_iteration": 2.4810092449188232 + }, + { + "auxiliary_loss_clip": 0.01112802, + "auxiliary_loss_mlp": 0.01033406, + "balance_loss_clip": 1.03958607, + "balance_loss_mlp": 1.01932144, + "epoch": 0.7779347662708552, + "flos": 20266366982400.0, + "grad_norm": 2.163595834703442, + "language_loss": 0.85457629, + "learning_rate": 4.95256266932218e-07, + "loss": 0.87603831, + "num_input_tokens_seen": 279154500, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.14056396, + "step": 12939, + "time_per_iteration": 2.437375545501709 + }, + { + "auxiliary_loss_clip": 0.01115695, + "auxiliary_loss_mlp": 0.01030323, + "balance_loss_clip": 1.0433681, + "balance_loss_mlp": 1.0189507, + "epoch": 0.7779948895235232, + "flos": 19209013303680.0, + "grad_norm": 1.8174733228491637, + "language_loss": 0.68973374, + "learning_rate": 4.949997420117915e-07, + "loss": 0.71119392, + "num_input_tokens_seen": 279173635, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.1138916, + "step": 12940, + "time_per_iteration": 2.4388210773468018 + }, + { + "auxiliary_loss_clip": 0.01113914, + "auxiliary_loss_mlp": 0.01028389, + "balance_loss_clip": 1.03975463, + "balance_loss_mlp": 1.01711178, + "epoch": 0.7780550127761912, + "flos": 23914711382400.0, + "grad_norm": 1.7437581919276723, + "language_loss": 0.77769369, + "learning_rate": 4.947432741611255e-07, + "loss": 0.79911673, + "num_input_tokens_seen": 279194430, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11291504, + "step": 12941, + "time_per_iteration": 2.474076747894287 + }, + { + "auxiliary_loss_clip": 0.01116444, + "auxiliary_loss_mlp": 0.01035956, + "balance_loss_clip": 1.04091239, + "balance_loss_mlp": 1.02262235, + "epoch": 0.7781151360288592, + "flos": 32415951795840.0, + "grad_norm": 4.588872114038905, + "language_loss": 0.73445857, + "learning_rate": 4.944868633899462e-07, + "loss": 0.75598258, + "num_input_tokens_seen": 279212920, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.13317871, + "step": 12942, + "time_per_iteration": 2.529797077178955 + }, + { + "auxiliary_loss_clip": 0.01107513, + "auxiliary_loss_mlp": 0.01031303, + "balance_loss_clip": 1.03618896, + "balance_loss_mlp": 1.01976991, + "epoch": 0.7781752592815271, + "flos": 22346384780160.0, + "grad_norm": 2.48303847831404, + "language_loss": 0.67775649, + "learning_rate": 4.942305097079751e-07, + "loss": 0.6991446, + "num_input_tokens_seen": 279232310, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.11535645, + "step": 12943, + "time_per_iteration": 2.478273868560791 + }, + { + "auxiliary_loss_clip": 0.01044129, + "auxiliary_loss_mlp": 0.01000678, + "balance_loss_clip": 1.01858556, + "balance_loss_mlp": 0.9992519, + "epoch": 0.7782353825341951, + "flos": 70460183520000.0, + "grad_norm": 0.7741940814401512, + "language_loss": 0.58504307, + "learning_rate": 4.939742131249347e-07, + "loss": 0.60549122, + "num_input_tokens_seen": 279295375, + "router_z_loss_clip": 0.25537109, + "router_z_loss_mlp": 0.01426697, + "step": 12944, + "time_per_iteration": 3.2153406143188477 + }, + { + "auxiliary_loss_clip": 0.01122338, + "auxiliary_loss_mlp": 0.01032423, + "balance_loss_clip": 1.04728174, + "balance_loss_mlp": 1.01931047, + "epoch": 0.778295505786863, + "flos": 19062569554560.0, + "grad_norm": 2.0264054679553944, + "language_loss": 0.67863405, + "learning_rate": 4.937179736505428e-07, + "loss": 0.70018166, + "num_input_tokens_seen": 279313660, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.13110352, + "step": 12945, + "time_per_iteration": 2.4604411125183105 + }, + { + "auxiliary_loss_clip": 0.01124088, + "auxiliary_loss_mlp": 0.01037165, + "balance_loss_clip": 1.04977107, + "balance_loss_mlp": 1.02477944, + "epoch": 0.778355629039531, + "flos": 20999734963200.0, + "grad_norm": 1.807229424090848, + "language_loss": 0.68662584, + "learning_rate": 4.93461791294516e-07, + "loss": 0.70823836, + "num_input_tokens_seen": 279334495, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.12384033, + "step": 12946, + "time_per_iteration": 2.5815422534942627 + }, + { + "auxiliary_loss_clip": 0.01125515, + "auxiliary_loss_mlp": 0.01029551, + "balance_loss_clip": 1.05103219, + "balance_loss_mlp": 1.01732588, + "epoch": 0.7784157522921991, + "flos": 21398091770880.0, + "grad_norm": 1.8446068462555392, + "language_loss": 0.65564406, + "learning_rate": 4.932056660665689e-07, + "loss": 0.6771946, + "num_input_tokens_seen": 279352985, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.12219238, + "step": 12947, + "time_per_iteration": 2.4471774101257324 + }, + { + "auxiliary_loss_clip": 0.0111512, + "auxiliary_loss_mlp": 0.0103234, + "balance_loss_clip": 1.04386282, + "balance_loss_mlp": 1.02053785, + "epoch": 0.778475875544867, + "flos": 20813861059200.0, + "grad_norm": 2.1053956736082777, + "language_loss": 0.65278244, + "learning_rate": 4.929495979764147e-07, + "loss": 0.67425698, + "num_input_tokens_seen": 279371360, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.11804199, + "step": 12948, + "time_per_iteration": 2.4390058517456055 + }, + { + "auxiliary_loss_clip": 0.01116048, + "auxiliary_loss_mlp": 0.0103156, + "balance_loss_clip": 1.04418325, + "balance_loss_mlp": 1.01937032, + "epoch": 0.778535998797535, + "flos": 14355363104640.0, + "grad_norm": 1.9094988295588207, + "language_loss": 0.75173664, + "learning_rate": 4.926935870337625e-07, + "loss": 0.77321273, + "num_input_tokens_seen": 279389400, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.12176514, + "step": 12949, + "time_per_iteration": 2.4527177810668945 + }, + { + "auxiliary_loss_clip": 0.01128902, + "auxiliary_loss_mlp": 0.01032534, + "balance_loss_clip": 1.053303, + "balance_loss_mlp": 1.01959991, + "epoch": 0.7785961220502029, + "flos": 19209552007680.0, + "grad_norm": 1.9925005525546204, + "language_loss": 0.69015819, + "learning_rate": 4.924376332483202e-07, + "loss": 0.71177256, + "num_input_tokens_seen": 279409715, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.1293335, + "step": 12950, + "time_per_iteration": 2.52181077003479 + }, + { + "auxiliary_loss_clip": 0.01114319, + "auxiliary_loss_mlp": 0.01028967, + "balance_loss_clip": 1.03913033, + "balance_loss_mlp": 1.01709342, + "epoch": 0.7786562453028709, + "flos": 25738757884800.0, + "grad_norm": 1.7043095868952634, + "language_loss": 0.71644437, + "learning_rate": 4.921817366297938e-07, + "loss": 0.73787725, + "num_input_tokens_seen": 279427705, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.11883545, + "step": 12951, + "time_per_iteration": 3.947885513305664 + }, + { + "auxiliary_loss_clip": 0.01118307, + "auxiliary_loss_mlp": 0.01033613, + "balance_loss_clip": 1.04830933, + "balance_loss_mlp": 1.02132261, + "epoch": 0.7787163685555388, + "flos": 25739440243200.0, + "grad_norm": 1.7330074127418946, + "language_loss": 0.65567291, + "learning_rate": 4.919258971878877e-07, + "loss": 0.67719209, + "num_input_tokens_seen": 279448215, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.12286377, + "step": 12952, + "time_per_iteration": 2.4812583923339844 + }, + { + "auxiliary_loss_clip": 0.01110578, + "auxiliary_loss_mlp": 0.01030535, + "balance_loss_clip": 1.04295969, + "balance_loss_mlp": 1.01833439, + "epoch": 0.7787764918082068, + "flos": 22747722416640.0, + "grad_norm": 1.6140775102540037, + "language_loss": 0.81085986, + "learning_rate": 4.916701149323022e-07, + "loss": 0.83227098, + "num_input_tokens_seen": 279466260, + "router_z_loss_clip": 0.67626953, + "router_z_loss_mlp": 0.12194824, + "step": 12953, + "time_per_iteration": 2.4904541969299316 + }, + { + "auxiliary_loss_clip": 0.01120435, + "auxiliary_loss_mlp": 0.01030127, + "balance_loss_clip": 1.04578853, + "balance_loss_mlp": 1.0181818, + "epoch": 0.7788366150608748, + "flos": 15190860430080.0, + "grad_norm": 2.262797190787493, + "language_loss": 0.76851237, + "learning_rate": 4.91414389872737e-07, + "loss": 0.79001808, + "num_input_tokens_seen": 279484520, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.11950684, + "step": 12954, + "time_per_iteration": 2.4301350116729736 + }, + { + "auxiliary_loss_clip": 0.01116491, + "auxiliary_loss_mlp": 0.01035609, + "balance_loss_clip": 1.04053187, + "balance_loss_mlp": 1.02340174, + "epoch": 0.7788967383135428, + "flos": 21210242618880.0, + "grad_norm": 1.8122496229985907, + "language_loss": 0.73137343, + "learning_rate": 4.911587220188905e-07, + "loss": 0.7528944, + "num_input_tokens_seen": 279503130, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12200928, + "step": 12955, + "time_per_iteration": 2.4440536499023438 + }, + { + "auxiliary_loss_clip": 0.01116018, + "auxiliary_loss_mlp": 0.01029838, + "balance_loss_clip": 1.0431242, + "balance_loss_mlp": 1.01841784, + "epoch": 0.7789568615662107, + "flos": 21682970536320.0, + "grad_norm": 1.4203285065832447, + "language_loss": 0.68813992, + "learning_rate": 4.909031113804551e-07, + "loss": 0.70959848, + "num_input_tokens_seen": 279521930, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11425781, + "step": 12956, + "time_per_iteration": 2.502784013748169 + }, + { + "auxiliary_loss_clip": 0.01114419, + "auxiliary_loss_mlp": 0.01031223, + "balance_loss_clip": 1.04140162, + "balance_loss_mlp": 1.01984406, + "epoch": 0.7790169848188787, + "flos": 26360371676160.0, + "grad_norm": 1.5570391047021224, + "language_loss": 0.7625066, + "learning_rate": 4.906475579671252e-07, + "loss": 0.78396297, + "num_input_tokens_seen": 279542375, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11376953, + "step": 12957, + "time_per_iteration": 2.515782356262207 + }, + { + "auxiliary_loss_clip": 0.01114825, + "auxiliary_loss_mlp": 0.01026089, + "balance_loss_clip": 1.04079795, + "balance_loss_mlp": 1.01362574, + "epoch": 0.7790771080715466, + "flos": 25516183259520.0, + "grad_norm": 1.6131029795825784, + "language_loss": 0.770697, + "learning_rate": 4.903920617885917e-07, + "loss": 0.79210615, + "num_input_tokens_seen": 279561885, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.12457275, + "step": 12958, + "time_per_iteration": 2.515172004699707 + }, + { + "auxiliary_loss_clip": 0.01116775, + "auxiliary_loss_mlp": 0.01034365, + "balance_loss_clip": 1.04305995, + "balance_loss_mlp": 1.0220747, + "epoch": 0.7791372313242146, + "flos": 16034186920320.0, + "grad_norm": 2.9365216451844356, + "language_loss": 0.72090203, + "learning_rate": 4.901366228545418e-07, + "loss": 0.7424134, + "num_input_tokens_seen": 279579965, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.12298584, + "step": 12959, + "time_per_iteration": 2.407813310623169 + }, + { + "auxiliary_loss_clip": 0.01112756, + "auxiliary_loss_mlp": 0.01034238, + "balance_loss_clip": 1.04156423, + "balance_loss_mlp": 1.02271628, + "epoch": 0.7791973545768827, + "flos": 23842207779840.0, + "grad_norm": 1.561026899234285, + "language_loss": 0.7817328, + "learning_rate": 4.898812411746632e-07, + "loss": 0.80320275, + "num_input_tokens_seen": 279599030, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.11523438, + "step": 12960, + "time_per_iteration": 2.4384515285491943 + }, + { + "auxiliary_loss_clip": 0.011196, + "auxiliary_loss_mlp": 0.0103097, + "balance_loss_clip": 1.04613376, + "balance_loss_mlp": 1.01884687, + "epoch": 0.7792574778295506, + "flos": 24168384207360.0, + "grad_norm": 2.0228184142979777, + "language_loss": 0.7526266, + "learning_rate": 4.896259167586385e-07, + "loss": 0.77413231, + "num_input_tokens_seen": 279614400, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.12139893, + "step": 12961, + "time_per_iteration": 2.5326569080352783 + }, + { + "auxiliary_loss_clip": 0.01109355, + "auxiliary_loss_mlp": 0.01038083, + "balance_loss_clip": 1.04104996, + "balance_loss_mlp": 1.02549481, + "epoch": 0.7793176010822186, + "flos": 21464921024640.0, + "grad_norm": 1.6626020299215774, + "language_loss": 0.73534322, + "learning_rate": 4.893706496161511e-07, + "loss": 0.75681758, + "num_input_tokens_seen": 279633745, + "router_z_loss_clip": 0.68212891, + "router_z_loss_mlp": 0.12591553, + "step": 12962, + "time_per_iteration": 2.4765422344207764 + }, + { + "auxiliary_loss_clip": 0.0111454, + "auxiliary_loss_mlp": 0.01023686, + "balance_loss_clip": 1.04405403, + "balance_loss_mlp": 1.01252198, + "epoch": 0.7793777243348865, + "flos": 20666699038080.0, + "grad_norm": 1.8861578480162164, + "language_loss": 0.69777471, + "learning_rate": 4.891154397568795e-07, + "loss": 0.71915698, + "num_input_tokens_seen": 279651165, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.11157227, + "step": 12963, + "time_per_iteration": 2.4294066429138184 + }, + { + "auxiliary_loss_clip": 0.01114584, + "auxiliary_loss_mlp": 0.01029995, + "balance_loss_clip": 1.04296923, + "balance_loss_mlp": 1.01858032, + "epoch": 0.7794378475875545, + "flos": 27125771610240.0, + "grad_norm": 1.9319829629386371, + "language_loss": 0.63828748, + "learning_rate": 4.888602871905019e-07, + "loss": 0.65973324, + "num_input_tokens_seen": 279671175, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11425781, + "step": 12964, + "time_per_iteration": 2.477997303009033 + }, + { + "auxiliary_loss_clip": 0.01114661, + "auxiliary_loss_mlp": 0.01030664, + "balance_loss_clip": 1.0411973, + "balance_loss_mlp": 1.01903558, + "epoch": 0.7794979708402224, + "flos": 28074136446720.0, + "grad_norm": 3.0277264220432034, + "language_loss": 0.76869714, + "learning_rate": 4.88605191926694e-07, + "loss": 0.7901504, + "num_input_tokens_seen": 279688675, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11633301, + "step": 12965, + "time_per_iteration": 2.516335964202881 + }, + { + "auxiliary_loss_clip": 0.01105389, + "auxiliary_loss_mlp": 0.01030114, + "balance_loss_clip": 1.03944063, + "balance_loss_mlp": 1.01931977, + "epoch": 0.7795580940928905, + "flos": 26869548919680.0, + "grad_norm": 1.5452427425067186, + "language_loss": 0.73020959, + "learning_rate": 4.883501539751289e-07, + "loss": 0.75156462, + "num_input_tokens_seen": 279710245, + "router_z_loss_clip": 0.65869141, + "router_z_loss_mlp": 0.10791016, + "step": 12966, + "time_per_iteration": 2.6430325508117676 + }, + { + "auxiliary_loss_clip": 0.01119562, + "auxiliary_loss_mlp": 0.01025257, + "balance_loss_clip": 1.0502789, + "balance_loss_mlp": 1.01519597, + "epoch": 0.7796182173455584, + "flos": 23835384195840.0, + "grad_norm": 1.4619512748330998, + "language_loss": 0.74481052, + "learning_rate": 4.880951733454768e-07, + "loss": 0.76625872, + "num_input_tokens_seen": 279729045, + "router_z_loss_clip": 0.69189453, + "router_z_loss_mlp": 0.10058594, + "step": 12967, + "time_per_iteration": 2.485032081604004 + }, + { + "auxiliary_loss_clip": 0.01109318, + "auxiliary_loss_mlp": 0.01029555, + "balance_loss_clip": 1.03709245, + "balance_loss_mlp": 1.01743126, + "epoch": 0.7796783405982264, + "flos": 19792238434560.0, + "grad_norm": 3.9114715034769487, + "language_loss": 0.72414917, + "learning_rate": 4.878402500474073e-07, + "loss": 0.74553788, + "num_input_tokens_seen": 279748350, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.12133789, + "step": 12968, + "time_per_iteration": 2.5204808712005615 + }, + { + "auxiliary_loss_clip": 0.01109184, + "auxiliary_loss_mlp": 0.01039262, + "balance_loss_clip": 1.03865123, + "balance_loss_mlp": 1.02591681, + "epoch": 0.7797384638508943, + "flos": 15450207603840.0, + "grad_norm": 1.9672196791791854, + "language_loss": 0.60684782, + "learning_rate": 4.875853840905874e-07, + "loss": 0.62833226, + "num_input_tokens_seen": 279765620, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.13323975, + "step": 12969, + "time_per_iteration": 2.4427638053894043 + }, + { + "auxiliary_loss_clip": 0.01113088, + "auxiliary_loss_mlp": 0.01030523, + "balance_loss_clip": 1.04200351, + "balance_loss_mlp": 1.01909637, + "epoch": 0.7797985871035623, + "flos": 20922742160640.0, + "grad_norm": 1.9101769585204578, + "language_loss": 0.70015848, + "learning_rate": 4.873305754846811e-07, + "loss": 0.72159457, + "num_input_tokens_seen": 279782485, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11425781, + "step": 12970, + "time_per_iteration": 2.4724247455596924 + }, + { + "auxiliary_loss_clip": 0.01124543, + "auxiliary_loss_mlp": 0.01030371, + "balance_loss_clip": 1.05056667, + "balance_loss_mlp": 1.01786625, + "epoch": 0.7798587103562302, + "flos": 36937212514560.0, + "grad_norm": 1.6057030810213924, + "language_loss": 0.72371519, + "learning_rate": 4.870758242393507e-07, + "loss": 0.74526435, + "num_input_tokens_seen": 279804170, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.12506104, + "step": 12971, + "time_per_iteration": 2.660917043685913 + }, + { + "auxiliary_loss_clip": 0.01123126, + "auxiliary_loss_mlp": 0.01026141, + "balance_loss_clip": 1.0466485, + "balance_loss_mlp": 1.01420808, + "epoch": 0.7799188336088982, + "flos": 22419283432320.0, + "grad_norm": 1.719790200312521, + "language_loss": 0.74330235, + "learning_rate": 4.868211303642578e-07, + "loss": 0.76479501, + "num_input_tokens_seen": 279823730, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.11932373, + "step": 12972, + "time_per_iteration": 3.9397964477539062 + }, + { + "auxiliary_loss_clip": 0.0111711, + "auxiliary_loss_mlp": 0.01024494, + "balance_loss_clip": 1.04517019, + "balance_loss_mlp": 1.01238835, + "epoch": 0.7799789568615663, + "flos": 18880466578560.0, + "grad_norm": 1.8971072892257144, + "language_loss": 0.71436238, + "learning_rate": 4.865664938690584e-07, + "loss": 0.73577845, + "num_input_tokens_seen": 279843035, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.12109375, + "step": 12973, + "time_per_iteration": 3.87752628326416 + }, + { + "auxiliary_loss_clip": 0.01115613, + "auxiliary_loss_mlp": 0.01030258, + "balance_loss_clip": 1.0452956, + "balance_loss_mlp": 1.0194695, + "epoch": 0.7800390801142342, + "flos": 20262272832000.0, + "grad_norm": 2.283787019351409, + "language_loss": 0.78048044, + "learning_rate": 4.863119147634089e-07, + "loss": 0.80193913, + "num_input_tokens_seen": 279861450, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.10784912, + "step": 12974, + "time_per_iteration": 2.4325056076049805 + }, + { + "auxiliary_loss_clip": 0.01111123, + "auxiliary_loss_mlp": 0.01028298, + "balance_loss_clip": 1.03972363, + "balance_loss_mlp": 1.01629984, + "epoch": 0.7800992033669022, + "flos": 16690310703360.0, + "grad_norm": 1.6347928254848065, + "language_loss": 0.69187742, + "learning_rate": 4.86057393056964e-07, + "loss": 0.71327162, + "num_input_tokens_seen": 279878660, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.12005615, + "step": 12975, + "time_per_iteration": 2.391244411468506 + }, + { + "auxiliary_loss_clip": 0.0111417, + "auxiliary_loss_mlp": 0.01031838, + "balance_loss_clip": 1.04214799, + "balance_loss_mlp": 1.0202564, + "epoch": 0.7801593266195701, + "flos": 18585208782720.0, + "grad_norm": 2.1365405390358765, + "language_loss": 0.82576847, + "learning_rate": 4.858029287593739e-07, + "loss": 0.84722853, + "num_input_tokens_seen": 279895685, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.11572266, + "step": 12976, + "time_per_iteration": 2.4018213748931885 + }, + { + "auxiliary_loss_clip": 0.01116222, + "auxiliary_loss_mlp": 0.01030254, + "balance_loss_clip": 1.04105926, + "balance_loss_mlp": 1.01807046, + "epoch": 0.7802194498722381, + "flos": 25484941405440.0, + "grad_norm": 1.4512108035603635, + "language_loss": 0.65905195, + "learning_rate": 4.85548521880289e-07, + "loss": 0.68051672, + "num_input_tokens_seen": 279917240, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.12188721, + "step": 12977, + "time_per_iteration": 2.4524505138397217 + }, + { + "auxiliary_loss_clip": 0.01112051, + "auxiliary_loss_mlp": 0.01031674, + "balance_loss_clip": 1.04210865, + "balance_loss_mlp": 1.01955605, + "epoch": 0.780279573124906, + "flos": 31176315573120.0, + "grad_norm": 2.2129997257485012, + "language_loss": 0.74599469, + "learning_rate": 4.852941724293554e-07, + "loss": 0.76743197, + "num_input_tokens_seen": 279938665, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.12121582, + "step": 12978, + "time_per_iteration": 2.4955692291259766 + }, + { + "auxiliary_loss_clip": 0.01115177, + "auxiliary_loss_mlp": 0.0103203, + "balance_loss_clip": 1.04126501, + "balance_loss_mlp": 1.01957273, + "epoch": 0.780339696377574, + "flos": 26944027770240.0, + "grad_norm": 2.11557768603608, + "language_loss": 0.61956501, + "learning_rate": 4.85039880416219e-07, + "loss": 0.64103705, + "num_input_tokens_seen": 279957965, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.12432861, + "step": 12979, + "time_per_iteration": 3.9582364559173584 + }, + { + "auxiliary_loss_clip": 0.01116277, + "auxiliary_loss_mlp": 0.01027532, + "balance_loss_clip": 1.04488993, + "balance_loss_mlp": 1.01520634, + "epoch": 0.780399819630242, + "flos": 27957426180480.0, + "grad_norm": 2.152493788419947, + "language_loss": 0.77517599, + "learning_rate": 4.847856458505217e-07, + "loss": 0.79661405, + "num_input_tokens_seen": 279977490, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.12329102, + "step": 12980, + "time_per_iteration": 2.4756762981414795 + }, + { + "auxiliary_loss_clip": 0.0111522, + "auxiliary_loss_mlp": 0.01028833, + "balance_loss_clip": 1.04071712, + "balance_loss_mlp": 1.0174247, + "epoch": 0.78045994288291, + "flos": 22486795044480.0, + "grad_norm": 2.5486247060624625, + "language_loss": 0.77951044, + "learning_rate": 4.845314687419046e-07, + "loss": 0.800951, + "num_input_tokens_seen": 279994220, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11407471, + "step": 12981, + "time_per_iteration": 2.430269956588745 + }, + { + "auxiliary_loss_clip": 0.01123178, + "auxiliary_loss_mlp": 0.01029941, + "balance_loss_clip": 1.04828119, + "balance_loss_mlp": 1.01827621, + "epoch": 0.7805200661355779, + "flos": 20850849089280.0, + "grad_norm": 2.18108867312494, + "language_loss": 0.72779286, + "learning_rate": 4.842773491000067e-07, + "loss": 0.74932402, + "num_input_tokens_seen": 280012590, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11657715, + "step": 12982, + "time_per_iteration": 2.4648218154907227 + }, + { + "auxiliary_loss_clip": 0.01115781, + "auxiliary_loss_mlp": 0.01028932, + "balance_loss_clip": 1.042979, + "balance_loss_mlp": 1.01810753, + "epoch": 0.7805801893882459, + "flos": 25665966973440.0, + "grad_norm": 1.3585753673272725, + "language_loss": 0.73410648, + "learning_rate": 4.840232869344636e-07, + "loss": 0.7555536, + "num_input_tokens_seen": 280033700, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.10821533, + "step": 12983, + "time_per_iteration": 2.482961893081665 + }, + { + "auxiliary_loss_clip": 0.0111969, + "auxiliary_loss_mlp": 0.01027933, + "balance_loss_clip": 1.04707789, + "balance_loss_mlp": 1.01666212, + "epoch": 0.7806403126409138, + "flos": 11327806483200.0, + "grad_norm": 1.7439657504583934, + "language_loss": 0.74973929, + "learning_rate": 4.837692822549086e-07, + "loss": 0.77121556, + "num_input_tokens_seen": 280052215, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11279297, + "step": 12984, + "time_per_iteration": 2.429091691970825 + }, + { + "auxiliary_loss_clip": 0.01108948, + "auxiliary_loss_mlp": 0.01033909, + "balance_loss_clip": 1.03677392, + "balance_loss_mlp": 1.02167225, + "epoch": 0.7807004358935818, + "flos": 19573362910080.0, + "grad_norm": 1.8824256708346077, + "language_loss": 0.81273717, + "learning_rate": 4.835153350709746e-07, + "loss": 0.83416575, + "num_input_tokens_seen": 280070525, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.12231445, + "step": 12985, + "time_per_iteration": 2.4751408100128174 + }, + { + "auxiliary_loss_clip": 0.01109834, + "auxiliary_loss_mlp": 0.01033192, + "balance_loss_clip": 1.03949738, + "balance_loss_mlp": 1.02149701, + "epoch": 0.7807605591462499, + "flos": 19135827342720.0, + "grad_norm": 1.6433766259923046, + "language_loss": 0.76697683, + "learning_rate": 4.832614453922915e-07, + "loss": 0.78840709, + "num_input_tokens_seen": 280089855, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.11706543, + "step": 12986, + "time_per_iteration": 2.4635062217712402 + }, + { + "auxiliary_loss_clip": 0.0111354, + "auxiliary_loss_mlp": 0.01032108, + "balance_loss_clip": 1.04005754, + "balance_loss_mlp": 1.0203892, + "epoch": 0.7808206823989178, + "flos": 32374654133760.0, + "grad_norm": 1.7944341310396172, + "language_loss": 0.73884797, + "learning_rate": 4.830076132284859e-07, + "loss": 0.76030445, + "num_input_tokens_seen": 280109960, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11730957, + "step": 12987, + "time_per_iteration": 2.515380382537842 + }, + { + "auxiliary_loss_clip": 0.01044053, + "auxiliary_loss_mlp": 0.01012005, + "balance_loss_clip": 1.01823044, + "balance_loss_mlp": 1.01051521, + "epoch": 0.7808808056515858, + "flos": 55050235061760.0, + "grad_norm": 0.7313367376939559, + "language_loss": 0.55064392, + "learning_rate": 4.82753838589184e-07, + "loss": 0.57120448, + "num_input_tokens_seen": 280169805, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.01490784, + "step": 12988, + "time_per_iteration": 3.0950260162353516 + }, + { + "auxiliary_loss_clip": 0.01115563, + "auxiliary_loss_mlp": 0.01030367, + "balance_loss_clip": 1.04649782, + "balance_loss_mlp": 1.01945972, + "epoch": 0.7809409289042537, + "flos": 12859468277760.0, + "grad_norm": 2.44222601881018, + "language_loss": 0.80878991, + "learning_rate": 4.82500121484009e-07, + "loss": 0.83024913, + "num_input_tokens_seen": 280184630, + "router_z_loss_clip": 0.69042969, + "router_z_loss_mlp": 0.10906982, + "step": 12989, + "time_per_iteration": 2.4590704441070557 + }, + { + "auxiliary_loss_clip": 0.01108468, + "auxiliary_loss_mlp": 0.01027489, + "balance_loss_clip": 1.03864336, + "balance_loss_mlp": 1.01571679, + "epoch": 0.7810010521569217, + "flos": 21687244254720.0, + "grad_norm": 1.6743695454143228, + "language_loss": 0.70378095, + "learning_rate": 4.822464619225806e-07, + "loss": 0.72514051, + "num_input_tokens_seen": 280203880, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.11767578, + "step": 12990, + "time_per_iteration": 2.447849750518799 + }, + { + "auxiliary_loss_clip": 0.01112672, + "auxiliary_loss_mlp": 0.01030696, + "balance_loss_clip": 1.0405643, + "balance_loss_mlp": 1.01766634, + "epoch": 0.7810611754095896, + "flos": 16757068129920.0, + "grad_norm": 2.080313260433669, + "language_loss": 0.78102505, + "learning_rate": 4.819928599145184e-07, + "loss": 0.80245876, + "num_input_tokens_seen": 280220460, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.13018799, + "step": 12991, + "time_per_iteration": 2.4095778465270996 + }, + { + "auxiliary_loss_clip": 0.01120616, + "auxiliary_loss_mlp": 0.01034044, + "balance_loss_clip": 1.04649901, + "balance_loss_mlp": 1.02193284, + "epoch": 0.7811212986622577, + "flos": 43507464658560.0, + "grad_norm": 1.435016860291261, + "language_loss": 0.65644622, + "learning_rate": 4.817393154694398e-07, + "loss": 0.67799282, + "num_input_tokens_seen": 280242680, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.12121582, + "step": 12992, + "time_per_iteration": 2.659592390060425 + }, + { + "auxiliary_loss_clip": 0.01117717, + "auxiliary_loss_mlp": 0.0102923, + "balance_loss_clip": 1.04531336, + "balance_loss_mlp": 1.01767266, + "epoch": 0.7811814219149256, + "flos": 21757700782080.0, + "grad_norm": 2.0869753137951115, + "language_loss": 0.61885822, + "learning_rate": 4.814858285969578e-07, + "loss": 0.64032769, + "num_input_tokens_seen": 280260655, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11566162, + "step": 12993, + "time_per_iteration": 2.4629948139190674 + }, + { + "auxiliary_loss_clip": 0.0111393, + "auxiliary_loss_mlp": 0.01031383, + "balance_loss_clip": 1.04037488, + "balance_loss_mlp": 1.01918232, + "epoch": 0.7812415451675936, + "flos": 24061514267520.0, + "grad_norm": 1.6159689901738787, + "language_loss": 0.68616915, + "learning_rate": 4.812323993066862e-07, + "loss": 0.70762223, + "num_input_tokens_seen": 280281185, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.12194824, + "step": 12994, + "time_per_iteration": 2.43603253364563 + }, + { + "auxiliary_loss_clip": 0.01112085, + "auxiliary_loss_mlp": 0.01024997, + "balance_loss_clip": 1.0406177, + "balance_loss_mlp": 1.01372528, + "epoch": 0.7813016684202615, + "flos": 18989706816000.0, + "grad_norm": 1.9530478558920488, + "language_loss": 0.69331896, + "learning_rate": 4.809790276082335e-07, + "loss": 0.71468985, + "num_input_tokens_seen": 280298255, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11273193, + "step": 12995, + "time_per_iteration": 3.8447265625 + }, + { + "auxiliary_loss_clip": 0.01105467, + "auxiliary_loss_mlp": 0.01033012, + "balance_loss_clip": 1.03578591, + "balance_loss_mlp": 1.02081716, + "epoch": 0.7813617916729295, + "flos": 25260786581760.0, + "grad_norm": 1.7376205127419242, + "language_loss": 0.74919134, + "learning_rate": 4.807257135112088e-07, + "loss": 0.77057612, + "num_input_tokens_seen": 280319000, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.12194824, + "step": 12996, + "time_per_iteration": 2.4565954208374023 + }, + { + "auxiliary_loss_clip": 0.01119049, + "auxiliary_loss_mlp": 0.01033276, + "balance_loss_clip": 1.04164064, + "balance_loss_mlp": 1.02097321, + "epoch": 0.7814219149255974, + "flos": 17966037116160.0, + "grad_norm": 2.2677608466094603, + "language_loss": 0.68326294, + "learning_rate": 4.804724570252167e-07, + "loss": 0.70478618, + "num_input_tokens_seen": 280336375, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.12310791, + "step": 12997, + "time_per_iteration": 2.4350874423980713 + }, + { + "auxiliary_loss_clip": 0.01126944, + "auxiliary_loss_mlp": 0.01034087, + "balance_loss_clip": 1.04972363, + "balance_loss_mlp": 1.02125955, + "epoch": 0.7814820381782654, + "flos": 25776176878080.0, + "grad_norm": 1.7584035305047936, + "language_loss": 0.8215059, + "learning_rate": 4.802192581598614e-07, + "loss": 0.84311622, + "num_input_tokens_seen": 280358760, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12805176, + "step": 12998, + "time_per_iteration": 2.4951536655426025 + }, + { + "auxiliary_loss_clip": 0.01114363, + "auxiliary_loss_mlp": 0.01029569, + "balance_loss_clip": 1.04087913, + "balance_loss_mlp": 1.01764202, + "epoch": 0.7815421614309335, + "flos": 20519572930560.0, + "grad_norm": 2.8207506882972164, + "language_loss": 0.74609804, + "learning_rate": 4.799661169247453e-07, + "loss": 0.76753736, + "num_input_tokens_seen": 280377085, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11932373, + "step": 12999, + "time_per_iteration": 2.4597668647766113 + }, + { + "auxiliary_loss_clip": 0.01125968, + "auxiliary_loss_mlp": 0.01034875, + "balance_loss_clip": 1.04915762, + "balance_loss_mlp": 1.02223265, + "epoch": 0.7816022846836014, + "flos": 21287666384640.0, + "grad_norm": 1.52104946471477, + "language_loss": 0.841272, + "learning_rate": 4.797130333294652e-07, + "loss": 0.86288047, + "num_input_tokens_seen": 280395465, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.12646484, + "step": 13000, + "time_per_iteration": 2.4234397411346436 + }, + { + "auxiliary_loss_clip": 0.01119056, + "auxiliary_loss_mlp": 0.01029591, + "balance_loss_clip": 1.0457685, + "balance_loss_mlp": 1.0177002, + "epoch": 0.7816624079362694, + "flos": 19208402772480.0, + "grad_norm": 2.0498100947029014, + "language_loss": 0.66300631, + "learning_rate": 4.794600073836192e-07, + "loss": 0.68449283, + "num_input_tokens_seen": 280412775, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11883545, + "step": 13001, + "time_per_iteration": 2.406001329421997 + }, + { + "auxiliary_loss_clip": 0.01116765, + "auxiliary_loss_mlp": 0.01028221, + "balance_loss_clip": 1.04361558, + "balance_loss_mlp": 1.01688409, + "epoch": 0.7817225311889373, + "flos": 26104687689600.0, + "grad_norm": 1.7858404733534807, + "language_loss": 0.66760731, + "learning_rate": 4.792070390968027e-07, + "loss": 0.68905723, + "num_input_tokens_seen": 280432905, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11346436, + "step": 13002, + "time_per_iteration": 2.454843521118164 + }, + { + "auxiliary_loss_clip": 0.01120344, + "auxiliary_loss_mlp": 0.01032393, + "balance_loss_clip": 1.04699278, + "balance_loss_mlp": 1.0198288, + "epoch": 0.7817826544416053, + "flos": 21250929749760.0, + "grad_norm": 2.4958964678300397, + "language_loss": 0.73079765, + "learning_rate": 4.78954128478607e-07, + "loss": 0.75232506, + "num_input_tokens_seen": 280450785, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.12567139, + "step": 13003, + "time_per_iteration": 2.43854022026062 + }, + { + "auxiliary_loss_clip": 0.01125349, + "auxiliary_loss_mlp": 0.01028935, + "balance_loss_clip": 1.05229735, + "balance_loss_mlp": 1.01790833, + "epoch": 0.7818427776942732, + "flos": 19932181822080.0, + "grad_norm": 2.034925587787853, + "language_loss": 0.62252426, + "learning_rate": 4.787012755386233e-07, + "loss": 0.64406705, + "num_input_tokens_seen": 280468400, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11022949, + "step": 13004, + "time_per_iteration": 2.4266786575317383 + }, + { + "auxiliary_loss_clip": 0.01109249, + "auxiliary_loss_mlp": 0.01030186, + "balance_loss_clip": 1.04217434, + "balance_loss_mlp": 1.01925421, + "epoch": 0.7819029009469413, + "flos": 11363753018880.0, + "grad_norm": 1.917807958025593, + "language_loss": 0.83175367, + "learning_rate": 4.784484802864403e-07, + "loss": 0.85314798, + "num_input_tokens_seen": 280483930, + "router_z_loss_clip": 0.67089844, + "router_z_loss_mlp": 0.109375, + "step": 13005, + "time_per_iteration": 2.4106154441833496 + }, + { + "auxiliary_loss_clip": 0.0110927, + "auxiliary_loss_mlp": 0.01028837, + "balance_loss_clip": 1.03713441, + "balance_loss_mlp": 1.01706469, + "epoch": 0.7819630241996092, + "flos": 24279276470400.0, + "grad_norm": 2.0957951754246618, + "language_loss": 0.72468901, + "learning_rate": 4.781957427316432e-07, + "loss": 0.74607003, + "num_input_tokens_seen": 280503465, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11773682, + "step": 13006, + "time_per_iteration": 2.455355405807495 + }, + { + "auxiliary_loss_clip": 0.01109194, + "auxiliary_loss_mlp": 0.01027351, + "balance_loss_clip": 1.03635585, + "balance_loss_mlp": 1.01541209, + "epoch": 0.7820231474522772, + "flos": 22708902792960.0, + "grad_norm": 1.7451548206694885, + "language_loss": 0.72165757, + "learning_rate": 4.779430628838157e-07, + "loss": 0.74302304, + "num_input_tokens_seen": 280523375, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11938477, + "step": 13007, + "time_per_iteration": 2.4561634063720703 + }, + { + "auxiliary_loss_clip": 0.01110297, + "auxiliary_loss_mlp": 0.01031427, + "balance_loss_clip": 1.03545892, + "balance_loss_mlp": 1.01747978, + "epoch": 0.7820832707049451, + "flos": 20047419630720.0, + "grad_norm": 1.9669863525850937, + "language_loss": 0.6949572, + "learning_rate": 4.776904407525397e-07, + "loss": 0.7163744, + "num_input_tokens_seen": 280542920, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.13952637, + "step": 13008, + "time_per_iteration": 2.437934160232544 + }, + { + "auxiliary_loss_clip": 0.01113142, + "auxiliary_loss_mlp": 0.01024683, + "balance_loss_clip": 1.04231167, + "balance_loss_mlp": 1.0128392, + "epoch": 0.7821433939576131, + "flos": 27162795553920.0, + "grad_norm": 1.9851433751916137, + "language_loss": 0.69824266, + "learning_rate": 4.774378763473954e-07, + "loss": 0.71962094, + "num_input_tokens_seen": 280561700, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.1184082, + "step": 13009, + "time_per_iteration": 2.5104727745056152 + }, + { + "auxiliary_loss_clip": 0.01105151, + "auxiliary_loss_mlp": 0.01026774, + "balance_loss_clip": 1.03440785, + "balance_loss_mlp": 1.01480508, + "epoch": 0.782203517210281, + "flos": 22602068766720.0, + "grad_norm": 2.140090458781339, + "language_loss": 0.81949145, + "learning_rate": 4.771853696779586e-07, + "loss": 0.84081066, + "num_input_tokens_seen": 280580605, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.11968994, + "step": 13010, + "time_per_iteration": 2.4449145793914795 + }, + { + "auxiliary_loss_clip": 0.01115361, + "auxiliary_loss_mlp": 0.01034894, + "balance_loss_clip": 1.04533005, + "balance_loss_mlp": 1.02228808, + "epoch": 0.782263640462949, + "flos": 29059812535680.0, + "grad_norm": 1.4962108142325057, + "language_loss": 0.62193286, + "learning_rate": 4.76932920753806e-07, + "loss": 0.64343542, + "num_input_tokens_seen": 280601495, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.12591553, + "step": 13011, + "time_per_iteration": 2.5560615062713623 + }, + { + "auxiliary_loss_clip": 0.01114448, + "auxiliary_loss_mlp": 0.01029456, + "balance_loss_clip": 1.04359913, + "balance_loss_mlp": 1.01814878, + "epoch": 0.782323763715617, + "flos": 25299498464640.0, + "grad_norm": 1.6092666762041001, + "language_loss": 0.7019996, + "learning_rate": 4.7668052958450913e-07, + "loss": 0.72343862, + "num_input_tokens_seen": 280622760, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11309814, + "step": 13012, + "time_per_iteration": 2.515199899673462 + }, + { + "auxiliary_loss_clip": 0.01040649, + "auxiliary_loss_mlp": 0.01007068, + "balance_loss_clip": 1.01486242, + "balance_loss_mlp": 1.00554657, + "epoch": 0.782383886968285, + "flos": 65194388668800.0, + "grad_norm": 0.7046874234288104, + "language_loss": 0.55045354, + "learning_rate": 4.764281961796395e-07, + "loss": 0.57093072, + "num_input_tokens_seen": 280687115, + "router_z_loss_clip": 0.25830078, + "router_z_loss_mlp": 0.01521301, + "step": 13013, + "time_per_iteration": 3.1914260387420654 + }, + { + "auxiliary_loss_clip": 0.01117081, + "auxiliary_loss_mlp": 0.01038617, + "balance_loss_clip": 1.04319811, + "balance_loss_mlp": 1.02592123, + "epoch": 0.782444010220953, + "flos": 18405440190720.0, + "grad_norm": 2.073940709799806, + "language_loss": 0.65553892, + "learning_rate": 4.76175920548765e-07, + "loss": 0.67709589, + "num_input_tokens_seen": 280705000, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.12719727, + "step": 13014, + "time_per_iteration": 2.4092376232147217 + }, + { + "auxiliary_loss_clip": 0.01040292, + "auxiliary_loss_mlp": 0.01000661, + "balance_loss_clip": 1.01472664, + "balance_loss_mlp": 0.99912214, + "epoch": 0.7825041334736209, + "flos": 63955003841280.0, + "grad_norm": 0.7211288393641128, + "language_loss": 0.5843116, + "learning_rate": 4.759237027014524e-07, + "loss": 0.60472113, + "num_input_tokens_seen": 280773525, + "router_z_loss_clip": 0.25537109, + "router_z_loss_mlp": 0.01539612, + "step": 13015, + "time_per_iteration": 4.635339736938477 + }, + { + "auxiliary_loss_clip": 0.01117124, + "auxiliary_loss_mlp": 0.0103126, + "balance_loss_clip": 1.04508173, + "balance_loss_mlp": 1.02023327, + "epoch": 0.7825642567262889, + "flos": 20339373375360.0, + "grad_norm": 1.7114852213059675, + "language_loss": 0.74747175, + "learning_rate": 4.756715426472666e-07, + "loss": 0.76895559, + "num_input_tokens_seen": 280791915, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11029053, + "step": 13016, + "time_per_iteration": 2.499971389770508 + }, + { + "auxiliary_loss_clip": 0.0111689, + "auxiliary_loss_mlp": 0.0103149, + "balance_loss_clip": 1.04401898, + "balance_loss_mlp": 1.01845479, + "epoch": 0.7826243799789568, + "flos": 20262955190400.0, + "grad_norm": 2.421934160975032, + "language_loss": 0.74876946, + "learning_rate": 4.7541944039576766e-07, + "loss": 0.7702533, + "num_input_tokens_seen": 280811460, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.13043213, + "step": 13017, + "time_per_iteration": 3.8578662872314453 + }, + { + "auxiliary_loss_clip": 0.01119656, + "auxiliary_loss_mlp": 0.01029622, + "balance_loss_clip": 1.04559946, + "balance_loss_mlp": 1.01698017, + "epoch": 0.7826845032316249, + "flos": 21132926593920.0, + "grad_norm": 2.2613685408547237, + "language_loss": 0.75729322, + "learning_rate": 4.7516739595651636e-07, + "loss": 0.77878594, + "num_input_tokens_seen": 280825415, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.12640381, + "step": 13018, + "time_per_iteration": 2.3917973041534424 + }, + { + "auxiliary_loss_clip": 0.01116693, + "auxiliary_loss_mlp": 0.01029896, + "balance_loss_clip": 1.04284835, + "balance_loss_mlp": 1.01726556, + "epoch": 0.7827446264842928, + "flos": 22492253911680.0, + "grad_norm": 1.4262158568458085, + "language_loss": 0.7732603, + "learning_rate": 4.749154093390708e-07, + "loss": 0.79472625, + "num_input_tokens_seen": 280845335, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.12634277, + "step": 13019, + "time_per_iteration": 2.439720392227173 + }, + { + "auxiliary_loss_clip": 0.01113168, + "auxiliary_loss_mlp": 0.01026001, + "balance_loss_clip": 1.0404098, + "balance_loss_mlp": 1.01464665, + "epoch": 0.7828047497369608, + "flos": 28840649702400.0, + "grad_norm": 1.4384354279974718, + "language_loss": 0.67673516, + "learning_rate": 4.746634805529852e-07, + "loss": 0.69812679, + "num_input_tokens_seen": 280867145, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.1137085, + "step": 13020, + "time_per_iteration": 2.5185205936431885 + }, + { + "auxiliary_loss_clip": 0.01120353, + "auxiliary_loss_mlp": 0.01029696, + "balance_loss_clip": 1.04474759, + "balance_loss_mlp": 1.01770973, + "epoch": 0.7828648729896287, + "flos": 23257689759360.0, + "grad_norm": 2.5290625382142062, + "language_loss": 0.62846309, + "learning_rate": 4.7441160960781325e-07, + "loss": 0.64996362, + "num_input_tokens_seen": 280886185, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.11993408, + "step": 13021, + "time_per_iteration": 2.424856185913086 + }, + { + "auxiliary_loss_clip": 0.01115412, + "auxiliary_loss_mlp": 0.01029305, + "balance_loss_clip": 1.04385996, + "balance_loss_mlp": 1.01876724, + "epoch": 0.7829249962422967, + "flos": 25265670831360.0, + "grad_norm": 2.0040786878260377, + "language_loss": 0.68932593, + "learning_rate": 4.7415979651310636e-07, + "loss": 0.71077311, + "num_input_tokens_seen": 280907665, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.10552979, + "step": 13022, + "time_per_iteration": 2.454385995864868 + }, + { + "auxiliary_loss_clip": 0.01072611, + "auxiliary_loss_mlp": 0.01004713, + "balance_loss_clip": 1.04743242, + "balance_loss_mlp": 1.003075, + "epoch": 0.7829851194949646, + "flos": 70722044645760.0, + "grad_norm": 0.653304029591942, + "language_loss": 0.5612126, + "learning_rate": 4.739080412784131e-07, + "loss": 0.58198583, + "num_input_tokens_seen": 280971405, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.01638794, + "step": 13023, + "time_per_iteration": 4.640944957733154 + }, + { + "auxiliary_loss_clip": 0.01113432, + "auxiliary_loss_mlp": 0.01027153, + "balance_loss_clip": 1.04468989, + "balance_loss_mlp": 1.01646543, + "epoch": 0.7830452427476327, + "flos": 25660795415040.0, + "grad_norm": 1.6846571920124276, + "language_loss": 0.67544532, + "learning_rate": 4.736563439132792e-07, + "loss": 0.69685113, + "num_input_tokens_seen": 280989615, + "router_z_loss_clip": 0.68701172, + "router_z_loss_mlp": 0.10681152, + "step": 13024, + "time_per_iteration": 2.4485185146331787 + }, + { + "auxiliary_loss_clip": 0.01118257, + "auxiliary_loss_mlp": 0.01027482, + "balance_loss_clip": 1.04490876, + "balance_loss_mlp": 1.01524496, + "epoch": 0.7831053660003006, + "flos": 22784315397120.0, + "grad_norm": 2.3462466545962664, + "language_loss": 0.77813113, + "learning_rate": 4.734047044272498e-07, + "loss": 0.79958844, + "num_input_tokens_seen": 281009450, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.12231445, + "step": 13025, + "time_per_iteration": 2.480306386947632 + }, + { + "auxiliary_loss_clip": 0.01112064, + "auxiliary_loss_mlp": 0.0103611, + "balance_loss_clip": 1.04037595, + "balance_loss_mlp": 1.02333105, + "epoch": 0.7831654892529686, + "flos": 25812267068160.0, + "grad_norm": 1.7145788707555518, + "language_loss": 0.77609324, + "learning_rate": 4.731531228298673e-07, + "loss": 0.797575, + "num_input_tokens_seen": 281028120, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.12792969, + "step": 13026, + "time_per_iteration": 2.458204746246338 + }, + { + "auxiliary_loss_clip": 0.01118199, + "auxiliary_loss_mlp": 0.01025038, + "balance_loss_clip": 1.04666018, + "balance_loss_mlp": 1.01384425, + "epoch": 0.7832256125056366, + "flos": 20771557816320.0, + "grad_norm": 2.358876746778571, + "language_loss": 0.75297189, + "learning_rate": 4.729015991306715e-07, + "loss": 0.77440423, + "num_input_tokens_seen": 281042130, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11199951, + "step": 13027, + "time_per_iteration": 2.3990209102630615 + }, + { + "auxiliary_loss_clip": 0.01116249, + "auxiliary_loss_mlp": 0.01027702, + "balance_loss_clip": 1.04575181, + "balance_loss_mlp": 1.01660347, + "epoch": 0.7832857357583045, + "flos": 21506541909120.0, + "grad_norm": 2.763932057875512, + "language_loss": 0.70606649, + "learning_rate": 4.726501333391997e-07, + "loss": 0.72750598, + "num_input_tokens_seen": 281060945, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.11083984, + "step": 13028, + "time_per_iteration": 2.4330973625183105 + }, + { + "auxiliary_loss_clip": 0.01118559, + "auxiliary_loss_mlp": 0.0103645, + "balance_loss_clip": 1.04131103, + "balance_loss_mlp": 1.02406383, + "epoch": 0.7833458590109725, + "flos": 18077791305600.0, + "grad_norm": 2.1177984981814695, + "language_loss": 0.69121993, + "learning_rate": 4.7239872546498774e-07, + "loss": 0.71276999, + "num_input_tokens_seen": 281079270, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.1237793, + "step": 13029, + "time_per_iteration": 2.414073944091797 + }, + { + "auxiliary_loss_clip": 0.01119109, + "auxiliary_loss_mlp": 0.01026274, + "balance_loss_clip": 1.04491353, + "balance_loss_mlp": 1.01398993, + "epoch": 0.7834059822636404, + "flos": 28288738252800.0, + "grad_norm": 2.540173704400774, + "language_loss": 0.80847186, + "learning_rate": 4.721473755175698e-07, + "loss": 0.82992572, + "num_input_tokens_seen": 281099500, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.12286377, + "step": 13030, + "time_per_iteration": 2.5071961879730225 + }, + { + "auxiliary_loss_clip": 0.0111554, + "auxiliary_loss_mlp": 0.01028762, + "balance_loss_clip": 1.04221547, + "balance_loss_mlp": 1.01698387, + "epoch": 0.7834661055163085, + "flos": 31686211088640.0, + "grad_norm": 3.6229698678511517, + "language_loss": 0.70433569, + "learning_rate": 4.71896083506476e-07, + "loss": 0.72577864, + "num_input_tokens_seen": 281121250, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11767578, + "step": 13031, + "time_per_iteration": 2.537513017654419 + }, + { + "auxiliary_loss_clip": 0.01111307, + "auxiliary_loss_mlp": 0.01037875, + "balance_loss_clip": 1.03786302, + "balance_loss_mlp": 1.02432108, + "epoch": 0.7835262287689764, + "flos": 12933192942720.0, + "grad_norm": 1.816408339075763, + "language_loss": 0.78783101, + "learning_rate": 4.7164484944123574e-07, + "loss": 0.80932283, + "num_input_tokens_seen": 281138760, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.13562012, + "step": 13032, + "time_per_iteration": 2.3899292945861816 + }, + { + "auxiliary_loss_clip": 0.011218, + "auxiliary_loss_mlp": 0.0103492, + "balance_loss_clip": 1.04617977, + "balance_loss_mlp": 1.02239704, + "epoch": 0.7835863520216444, + "flos": 16143211676160.0, + "grad_norm": 1.8168101891085395, + "language_loss": 0.62873858, + "learning_rate": 4.7139367333137726e-07, + "loss": 0.65030581, + "num_input_tokens_seen": 281157420, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12530518, + "step": 13033, + "time_per_iteration": 2.4165923595428467 + }, + { + "auxiliary_loss_clip": 0.01117942, + "auxiliary_loss_mlp": 0.01033352, + "balance_loss_clip": 1.04523778, + "balance_loss_mlp": 1.02138352, + "epoch": 0.7836464752743123, + "flos": 11509909459200.0, + "grad_norm": 1.5960340121651446, + "language_loss": 0.72134662, + "learning_rate": 4.7114255518642255e-07, + "loss": 0.74285948, + "num_input_tokens_seen": 281174620, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11975098, + "step": 13034, + "time_per_iteration": 2.3999950885772705 + }, + { + "auxiliary_loss_clip": 0.01115445, + "auxiliary_loss_mlp": 0.01030085, + "balance_loss_clip": 1.04193497, + "balance_loss_mlp": 1.01794934, + "epoch": 0.7837065985269803, + "flos": 18223696350720.0, + "grad_norm": 2.207866671445772, + "language_loss": 0.72390789, + "learning_rate": 4.7089149501589555e-07, + "loss": 0.74536318, + "num_input_tokens_seen": 281193865, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.12133789, + "step": 13035, + "time_per_iteration": 2.48407244682312 + }, + { + "auxiliary_loss_clip": 0.01112503, + "auxiliary_loss_mlp": 0.01038614, + "balance_loss_clip": 1.03928268, + "balance_loss_mlp": 1.02567959, + "epoch": 0.7837667217796482, + "flos": 24754410599040.0, + "grad_norm": 1.8064298096934441, + "language_loss": 0.66271043, + "learning_rate": 4.7064049282931664e-07, + "loss": 0.68422157, + "num_input_tokens_seen": 281212250, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.1293335, + "step": 13036, + "time_per_iteration": 2.4961166381835938 + }, + { + "auxiliary_loss_clip": 0.01116566, + "auxiliary_loss_mlp": 0.01035937, + "balance_loss_clip": 1.0399816, + "balance_loss_mlp": 1.02303839, + "epoch": 0.7838268450323163, + "flos": 22383121415040.0, + "grad_norm": 2.0471682334802397, + "language_loss": 0.72340226, + "learning_rate": 4.703895486362031e-07, + "loss": 0.74492723, + "num_input_tokens_seen": 281230850, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.12908936, + "step": 13037, + "time_per_iteration": 2.4456543922424316 + }, + { + "auxiliary_loss_clip": 0.01109324, + "auxiliary_loss_mlp": 0.01030264, + "balance_loss_clip": 1.03604698, + "balance_loss_mlp": 1.0176934, + "epoch": 0.7838869682849842, + "flos": 19500284689920.0, + "grad_norm": 2.747969850026065, + "language_loss": 0.60239977, + "learning_rate": 4.701386624460717e-07, + "loss": 0.62379563, + "num_input_tokens_seen": 281249810, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.12579346, + "step": 13038, + "time_per_iteration": 2.419376850128174 + }, + { + "auxiliary_loss_clip": 0.01108916, + "auxiliary_loss_mlp": 0.01029763, + "balance_loss_clip": 1.03812051, + "balance_loss_mlp": 1.01867032, + "epoch": 0.7839470915376522, + "flos": 32892845690880.0, + "grad_norm": 3.5923871470172877, + "language_loss": 0.67937946, + "learning_rate": 4.698878342684349e-07, + "loss": 0.70076627, + "num_input_tokens_seen": 281273730, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.11096191, + "step": 13039, + "time_per_iteration": 2.6094486713409424 + }, + { + "auxiliary_loss_clip": 0.01110774, + "auxiliary_loss_mlp": 0.01023822, + "balance_loss_clip": 1.04053783, + "balance_loss_mlp": 1.01289654, + "epoch": 0.7840072147903202, + "flos": 29676003373440.0, + "grad_norm": 1.8427366784400738, + "language_loss": 0.69399083, + "learning_rate": 4.6963706411280537e-07, + "loss": 0.7153368, + "num_input_tokens_seen": 281293670, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.10931396, + "step": 13040, + "time_per_iteration": 3.977637767791748 + }, + { + "auxiliary_loss_clip": 0.01118429, + "auxiliary_loss_mlp": 0.01032057, + "balance_loss_clip": 1.04513264, + "balance_loss_mlp": 1.01905751, + "epoch": 0.7840673380429881, + "flos": 18186744234240.0, + "grad_norm": 1.5112291084535103, + "language_loss": 0.67536503, + "learning_rate": 4.6938635198869116e-07, + "loss": 0.69686997, + "num_input_tokens_seen": 281313070, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.13006592, + "step": 13041, + "time_per_iteration": 2.3901143074035645 + }, + { + "auxiliary_loss_clip": 0.01034786, + "auxiliary_loss_mlp": 0.01003839, + "balance_loss_clip": 1.01024771, + "balance_loss_mlp": 1.00244451, + "epoch": 0.7841274612956561, + "flos": 66346006613760.0, + "grad_norm": 0.6596415961556856, + "language_loss": 0.57394397, + "learning_rate": 4.691356979055998e-07, + "loss": 0.59433019, + "num_input_tokens_seen": 281374880, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.01397705, + "step": 13042, + "time_per_iteration": 3.051729202270508 + }, + { + "auxiliary_loss_clip": 0.011153, + "auxiliary_loss_mlp": 0.01027202, + "balance_loss_clip": 1.04116392, + "balance_loss_mlp": 1.01510215, + "epoch": 0.784187584548324, + "flos": 26648482665600.0, + "grad_norm": 1.9940760796387063, + "language_loss": 0.83905053, + "learning_rate": 4.688851018730369e-07, + "loss": 0.8604756, + "num_input_tokens_seen": 281392620, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.12097168, + "step": 13043, + "time_per_iteration": 2.502821445465088 + }, + { + "auxiliary_loss_clip": 0.01108245, + "auxiliary_loss_mlp": 0.01027926, + "balance_loss_clip": 1.03893542, + "balance_loss_mlp": 1.01709032, + "epoch": 0.7842477078009921, + "flos": 25740158515200.0, + "grad_norm": 1.5318208533571722, + "language_loss": 0.88399196, + "learning_rate": 4.6863456390050425e-07, + "loss": 0.90535361, + "num_input_tokens_seen": 281413140, + "router_z_loss_clip": 0.69287109, + "router_z_loss_mlp": 0.10839844, + "step": 13044, + "time_per_iteration": 2.4791104793548584 + }, + { + "auxiliary_loss_clip": 0.01121027, + "auxiliary_loss_mlp": 0.01030481, + "balance_loss_clip": 1.0427227, + "balance_loss_mlp": 1.01818419, + "epoch": 0.78430783105366, + "flos": 21980957765760.0, + "grad_norm": 1.622311819009267, + "language_loss": 0.79124331, + "learning_rate": 4.6838408399750195e-07, + "loss": 0.81275833, + "num_input_tokens_seen": 281430860, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.12286377, + "step": 13045, + "time_per_iteration": 2.4338550567626953 + }, + { + "auxiliary_loss_clip": 0.01106275, + "auxiliary_loss_mlp": 0.01029366, + "balance_loss_clip": 1.0360837, + "balance_loss_mlp": 1.01791573, + "epoch": 0.784367954306328, + "flos": 23842279607040.0, + "grad_norm": 1.5289698920499724, + "language_loss": 0.72459275, + "learning_rate": 4.6813366217352925e-07, + "loss": 0.74594915, + "num_input_tokens_seen": 281451385, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.11456299, + "step": 13046, + "time_per_iteration": 2.4686126708984375 + }, + { + "auxiliary_loss_clip": 0.01110365, + "auxiliary_loss_mlp": 0.01042917, + "balance_loss_clip": 1.03941607, + "balance_loss_mlp": 1.02830839, + "epoch": 0.7844280775589959, + "flos": 24826662806400.0, + "grad_norm": 1.6032759004510706, + "language_loss": 0.63698518, + "learning_rate": 4.678832984380809e-07, + "loss": 0.65851796, + "num_input_tokens_seen": 281472255, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.14605713, + "step": 13047, + "time_per_iteration": 2.4597721099853516 + }, + { + "auxiliary_loss_clip": 0.0111658, + "auxiliary_loss_mlp": 0.01027462, + "balance_loss_clip": 1.04509366, + "balance_loss_mlp": 1.01601195, + "epoch": 0.7844882008116639, + "flos": 22455660931200.0, + "grad_norm": 1.5178608004667302, + "language_loss": 0.73098433, + "learning_rate": 4.676329928006515e-07, + "loss": 0.75242472, + "num_input_tokens_seen": 281492860, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11462402, + "step": 13048, + "time_per_iteration": 2.450418472290039 + }, + { + "auxiliary_loss_clip": 0.01126148, + "auxiliary_loss_mlp": 0.01033639, + "balance_loss_clip": 1.04955065, + "balance_loss_mlp": 1.02069843, + "epoch": 0.7845483240643318, + "flos": 26104041244800.0, + "grad_norm": 1.7105090254713817, + "language_loss": 0.74560726, + "learning_rate": 4.6738274527073243e-07, + "loss": 0.76720518, + "num_input_tokens_seen": 281511815, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.12927246, + "step": 13049, + "time_per_iteration": 2.4449174404144287 + }, + { + "auxiliary_loss_clip": 0.01119495, + "auxiliary_loss_mlp": 0.01032311, + "balance_loss_clip": 1.04298019, + "balance_loss_mlp": 1.01954317, + "epoch": 0.7846084473169999, + "flos": 19354307817600.0, + "grad_norm": 2.6620173664573046, + "language_loss": 0.72793108, + "learning_rate": 4.6713255585781454e-07, + "loss": 0.74944913, + "num_input_tokens_seen": 281530090, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12768555, + "step": 13050, + "time_per_iteration": 2.4290947914123535 + }, + { + "auxiliary_loss_clip": 0.01118657, + "auxiliary_loss_mlp": 0.01032991, + "balance_loss_clip": 1.0473237, + "balance_loss_mlp": 1.02102804, + "epoch": 0.7846685705696678, + "flos": 23325811902720.0, + "grad_norm": 2.222197832035738, + "language_loss": 0.74051595, + "learning_rate": 4.668824245713825e-07, + "loss": 0.76203245, + "num_input_tokens_seen": 281547075, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.11968994, + "step": 13051, + "time_per_iteration": 2.4269485473632812 + }, + { + "auxiliary_loss_clip": 0.01110354, + "auxiliary_loss_mlp": 0.01030745, + "balance_loss_clip": 1.03779531, + "balance_loss_mlp": 1.01825166, + "epoch": 0.7847286938223358, + "flos": 35809545962880.0, + "grad_norm": 1.8050277421898886, + "language_loss": 0.72922742, + "learning_rate": 4.666323514209227e-07, + "loss": 0.75063837, + "num_input_tokens_seen": 281568080, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.12475586, + "step": 13052, + "time_per_iteration": 2.610048294067383 + }, + { + "auxiliary_loss_clip": 0.0111161, + "auxiliary_loss_mlp": 0.01031724, + "balance_loss_clip": 1.04243088, + "balance_loss_mlp": 1.02002931, + "epoch": 0.7847888170750038, + "flos": 18478159274880.0, + "grad_norm": 2.1625569430824627, + "language_loss": 0.68849957, + "learning_rate": 4.663823364159183e-07, + "loss": 0.70993286, + "num_input_tokens_seen": 281586925, + "router_z_loss_clip": 0.69189453, + "router_z_loss_mlp": 0.11694336, + "step": 13053, + "time_per_iteration": 2.417444944381714 + }, + { + "auxiliary_loss_clip": 0.01109082, + "auxiliary_loss_mlp": 0.01025398, + "balance_loss_clip": 1.03907752, + "balance_loss_mlp": 1.01459169, + "epoch": 0.7848489403276717, + "flos": 25119155255040.0, + "grad_norm": 1.9822497811899256, + "language_loss": 0.70365751, + "learning_rate": 4.6613237956584893e-07, + "loss": 0.72500229, + "num_input_tokens_seen": 281603915, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.10809326, + "step": 13054, + "time_per_iteration": 2.484443187713623 + }, + { + "auxiliary_loss_clip": 0.01114118, + "auxiliary_loss_mlp": 0.01032448, + "balance_loss_clip": 1.03952205, + "balance_loss_mlp": 1.01915073, + "epoch": 0.7849090635803397, + "flos": 26502433966080.0, + "grad_norm": 1.7052167891232715, + "language_loss": 0.76071358, + "learning_rate": 4.658824808801938e-07, + "loss": 0.78217924, + "num_input_tokens_seen": 281624220, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.13287354, + "step": 13055, + "time_per_iteration": 2.4887583255767822 + }, + { + "auxiliary_loss_clip": 0.01111358, + "auxiliary_loss_mlp": 0.01032881, + "balance_loss_clip": 1.0367378, + "balance_loss_mlp": 1.01860523, + "epoch": 0.7849691868330076, + "flos": 20959658363520.0, + "grad_norm": 1.8075841291900137, + "language_loss": 0.74597782, + "learning_rate": 4.656326403684283e-07, + "loss": 0.76742017, + "num_input_tokens_seen": 281642325, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.1427002, + "step": 13056, + "time_per_iteration": 2.4908385276794434 + }, + { + "auxiliary_loss_clip": 0.01118371, + "auxiliary_loss_mlp": 0.01027959, + "balance_loss_clip": 1.04508054, + "balance_loss_mlp": 1.01567495, + "epoch": 0.7850293100856757, + "flos": 26067484177920.0, + "grad_norm": 4.396610738738971, + "language_loss": 0.70179701, + "learning_rate": 4.6538285804002744e-07, + "loss": 0.72326028, + "num_input_tokens_seen": 281663065, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.12286377, + "step": 13057, + "time_per_iteration": 2.454423427581787 + }, + { + "auxiliary_loss_clip": 0.01120047, + "auxiliary_loss_mlp": 0.01032228, + "balance_loss_clip": 1.04694605, + "balance_loss_mlp": 1.0205816, + "epoch": 0.7850894333383436, + "flos": 22491894775680.0, + "grad_norm": 2.6075642934696663, + "language_loss": 0.76736879, + "learning_rate": 4.6513313390446175e-07, + "loss": 0.78889155, + "num_input_tokens_seen": 281681005, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11645508, + "step": 13058, + "time_per_iteration": 3.851511240005493 + }, + { + "auxiliary_loss_clip": 0.01110802, + "auxiliary_loss_mlp": 0.01029942, + "balance_loss_clip": 1.04005444, + "balance_loss_mlp": 1.01809871, + "epoch": 0.7851495565910116, + "flos": 20558643949440.0, + "grad_norm": 1.5382839924167613, + "language_loss": 0.71118015, + "learning_rate": 4.6488346797120146e-07, + "loss": 0.73258764, + "num_input_tokens_seen": 281697965, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.11834717, + "step": 13059, + "time_per_iteration": 3.896242380142212 + }, + { + "auxiliary_loss_clip": 0.0111901, + "auxiliary_loss_mlp": 0.01037729, + "balance_loss_clip": 1.04241848, + "balance_loss_mlp": 1.02502179, + "epoch": 0.7852096798436795, + "flos": 15924838942080.0, + "grad_norm": 1.8584694835631383, + "language_loss": 0.76706988, + "learning_rate": 4.646338602497144e-07, + "loss": 0.78863728, + "num_input_tokens_seen": 281716035, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.1270752, + "step": 13060, + "time_per_iteration": 2.510115623474121 + }, + { + "auxiliary_loss_clip": 0.01113219, + "auxiliary_loss_mlp": 0.01036472, + "balance_loss_clip": 1.04121518, + "balance_loss_mlp": 1.02313292, + "epoch": 0.7852698030963475, + "flos": 19062282245760.0, + "grad_norm": 2.112418760781799, + "language_loss": 0.77364278, + "learning_rate": 4.643843107494654e-07, + "loss": 0.79513967, + "num_input_tokens_seen": 281732815, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.13342285, + "step": 13061, + "time_per_iteration": 2.4580461978912354 + }, + { + "auxiliary_loss_clip": 0.01120523, + "auxiliary_loss_mlp": 0.01027082, + "balance_loss_clip": 1.04488885, + "balance_loss_mlp": 1.01463079, + "epoch": 0.7853299263490154, + "flos": 24644380262400.0, + "grad_norm": 1.868758903165542, + "language_loss": 0.74298865, + "learning_rate": 4.641348194799164e-07, + "loss": 0.76446468, + "num_input_tokens_seen": 281751980, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.12457275, + "step": 13062, + "time_per_iteration": 2.4719934463500977 + }, + { + "auxiliary_loss_clip": 0.0111772, + "auxiliary_loss_mlp": 0.01028544, + "balance_loss_clip": 1.04687738, + "balance_loss_mlp": 1.01698697, + "epoch": 0.7853900496016835, + "flos": 22017981709440.0, + "grad_norm": 1.745210118109633, + "language_loss": 0.68726987, + "learning_rate": 4.638853864505297e-07, + "loss": 0.70873255, + "num_input_tokens_seen": 281772670, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.11572266, + "step": 13063, + "time_per_iteration": 2.4444541931152344 + }, + { + "auxiliary_loss_clip": 0.01122704, + "auxiliary_loss_mlp": 0.01034757, + "balance_loss_clip": 1.05170727, + "balance_loss_mlp": 1.02275825, + "epoch": 0.7854501728543514, + "flos": 30227412032640.0, + "grad_norm": 2.1520557656768062, + "language_loss": 0.73130858, + "learning_rate": 4.636360116707625e-07, + "loss": 0.7528832, + "num_input_tokens_seen": 281792930, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.12005615, + "step": 13064, + "time_per_iteration": 2.472764730453491 + }, + { + "auxiliary_loss_clip": 0.01116687, + "auxiliary_loss_mlp": 0.01034404, + "balance_loss_clip": 1.04151082, + "balance_loss_mlp": 1.0223161, + "epoch": 0.7855102961070194, + "flos": 18843694030080.0, + "grad_norm": 2.121644310767893, + "language_loss": 0.67712891, + "learning_rate": 4.633866951500718e-07, + "loss": 0.69863981, + "num_input_tokens_seen": 281811805, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12091064, + "step": 13065, + "time_per_iteration": 2.6099274158477783 + }, + { + "auxiliary_loss_clip": 0.01117745, + "auxiliary_loss_mlp": 0.01035892, + "balance_loss_clip": 1.04454947, + "balance_loss_mlp": 1.02411962, + "epoch": 0.7855704193596874, + "flos": 22309971367680.0, + "grad_norm": 2.387192923404952, + "language_loss": 0.76137328, + "learning_rate": 4.6313743689791196e-07, + "loss": 0.78290975, + "num_input_tokens_seen": 281831885, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11779785, + "step": 13066, + "time_per_iteration": 2.5093491077423096 + }, + { + "auxiliary_loss_clip": 0.01048267, + "auxiliary_loss_mlp": 0.01005079, + "balance_loss_clip": 1.02215219, + "balance_loss_mlp": 1.0036099, + "epoch": 0.7856305426123553, + "flos": 60004434407040.0, + "grad_norm": 0.7287624761154495, + "language_loss": 0.53433681, + "learning_rate": 4.628882369237346e-07, + "loss": 0.55487025, + "num_input_tokens_seen": 281900310, + "router_z_loss_clip": 0.26025391, + "router_z_loss_mlp": 0.01469421, + "step": 13067, + "time_per_iteration": 4.618236541748047 + }, + { + "auxiliary_loss_clip": 0.0111365, + "auxiliary_loss_mlp": 0.01032508, + "balance_loss_clip": 1.04051626, + "balance_loss_mlp": 1.0199492, + "epoch": 0.7856906658650233, + "flos": 21868593045120.0, + "grad_norm": 1.5454467984089466, + "language_loss": 0.6727131, + "learning_rate": 4.62639095236989e-07, + "loss": 0.69417465, + "num_input_tokens_seen": 281918870, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.12573242, + "step": 13068, + "time_per_iteration": 2.4267561435699463 + }, + { + "auxiliary_loss_clip": 0.01118448, + "auxiliary_loss_mlp": 0.01031776, + "balance_loss_clip": 1.04825306, + "balance_loss_mlp": 1.02062416, + "epoch": 0.7857507891176913, + "flos": 23622937205760.0, + "grad_norm": 2.2932952809839717, + "language_loss": 0.68007416, + "learning_rate": 4.6239001184712267e-07, + "loss": 0.70157641, + "num_input_tokens_seen": 281936905, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.11138916, + "step": 13069, + "time_per_iteration": 2.469187021255493 + }, + { + "auxiliary_loss_clip": 0.01115787, + "auxiliary_loss_mlp": 0.01036055, + "balance_loss_clip": 1.04026532, + "balance_loss_mlp": 1.02342463, + "epoch": 0.7858109123703593, + "flos": 25520061928320.0, + "grad_norm": 1.8229443249148838, + "language_loss": 0.77067143, + "learning_rate": 4.6214098676358195e-07, + "loss": 0.79218984, + "num_input_tokens_seen": 281955625, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.12634277, + "step": 13070, + "time_per_iteration": 2.44158935546875 + }, + { + "auxiliary_loss_clip": 0.01114266, + "auxiliary_loss_mlp": 0.01027528, + "balance_loss_clip": 1.04284859, + "balance_loss_mlp": 1.01648295, + "epoch": 0.7858710356230272, + "flos": 17457398576640.0, + "grad_norm": 1.7311090573617234, + "language_loss": 0.65600181, + "learning_rate": 4.618920199958083e-07, + "loss": 0.67741972, + "num_input_tokens_seen": 281973285, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11047363, + "step": 13071, + "time_per_iteration": 2.423936605453491 + }, + { + "auxiliary_loss_clip": 0.01113898, + "auxiliary_loss_mlp": 0.01033976, + "balance_loss_clip": 1.04027987, + "balance_loss_mlp": 1.0224067, + "epoch": 0.7859311588756952, + "flos": 24679680353280.0, + "grad_norm": 1.6287823045928547, + "language_loss": 0.7399506, + "learning_rate": 4.616431115532442e-07, + "loss": 0.76142931, + "num_input_tokens_seen": 281991410, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11572266, + "step": 13072, + "time_per_iteration": 2.440228223800659 + }, + { + "auxiliary_loss_clip": 0.01116469, + "auxiliary_loss_mlp": 0.01027801, + "balance_loss_clip": 1.04286551, + "balance_loss_mlp": 1.01537955, + "epoch": 0.7859912821283631, + "flos": 21799142098560.0, + "grad_norm": 2.025166266215531, + "language_loss": 0.71257353, + "learning_rate": 4.613942614453268e-07, + "loss": 0.73401618, + "num_input_tokens_seen": 282010845, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.12426758, + "step": 13073, + "time_per_iteration": 2.4748034477233887 + }, + { + "auxiliary_loss_clip": 0.01117637, + "auxiliary_loss_mlp": 0.0103039, + "balance_loss_clip": 1.04263151, + "balance_loss_mlp": 1.01771212, + "epoch": 0.7860514053810311, + "flos": 20847293642880.0, + "grad_norm": 2.2902227643084245, + "language_loss": 0.76787168, + "learning_rate": 4.611454696814938e-07, + "loss": 0.78935194, + "num_input_tokens_seen": 282029635, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.12670898, + "step": 13074, + "time_per_iteration": 2.4339687824249268 + }, + { + "auxiliary_loss_clip": 0.01116797, + "auxiliary_loss_mlp": 0.01030002, + "balance_loss_clip": 1.04497278, + "balance_loss_mlp": 1.01867151, + "epoch": 0.786111528633699, + "flos": 24315689882880.0, + "grad_norm": 1.7413738959896523, + "language_loss": 0.74775821, + "learning_rate": 4.608967362711782e-07, + "loss": 0.76922619, + "num_input_tokens_seen": 282050285, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11322021, + "step": 13075, + "time_per_iteration": 2.51076340675354 + }, + { + "auxiliary_loss_clip": 0.01116951, + "auxiliary_loss_mlp": 0.01026597, + "balance_loss_clip": 1.04560542, + "balance_loss_mlp": 1.01534319, + "epoch": 0.7861716518863671, + "flos": 24353180703360.0, + "grad_norm": 2.309032170799184, + "language_loss": 0.68828273, + "learning_rate": 4.6064806122381283e-07, + "loss": 0.70971817, + "num_input_tokens_seen": 282071040, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.11248779, + "step": 13076, + "time_per_iteration": 2.4660534858703613 + }, + { + "auxiliary_loss_clip": 0.01116208, + "auxiliary_loss_mlp": 0.01030315, + "balance_loss_clip": 1.04505658, + "balance_loss_mlp": 1.01856649, + "epoch": 0.786231775139035, + "flos": 14022399006720.0, + "grad_norm": 1.945080152852906, + "language_loss": 0.80021548, + "learning_rate": 4.603994445488282e-07, + "loss": 0.82168072, + "num_input_tokens_seen": 282086610, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.11749268, + "step": 13077, + "time_per_iteration": 2.459303855895996 + }, + { + "auxiliary_loss_clip": 0.01120515, + "auxiliary_loss_mlp": 0.01031298, + "balance_loss_clip": 1.0472213, + "balance_loss_mlp": 1.01894176, + "epoch": 0.786291898391703, + "flos": 33724248865920.0, + "grad_norm": 1.8339896544791099, + "language_loss": 0.70897812, + "learning_rate": 4.6015088625564956e-07, + "loss": 0.73049629, + "num_input_tokens_seen": 282107440, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.12353516, + "step": 13078, + "time_per_iteration": 2.575477123260498 + }, + { + "auxiliary_loss_clip": 0.01118417, + "auxiliary_loss_mlp": 0.01031108, + "balance_loss_clip": 1.04412723, + "balance_loss_mlp": 1.01957464, + "epoch": 0.786352021644371, + "flos": 25811476968960.0, + "grad_norm": 1.477635474947732, + "language_loss": 0.81449574, + "learning_rate": 4.599023863537039e-07, + "loss": 0.83599102, + "num_input_tokens_seen": 282127290, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11541748, + "step": 13079, + "time_per_iteration": 2.5125515460968018 + }, + { + "auxiliary_loss_clip": 0.01116502, + "auxiliary_loss_mlp": 0.01035159, + "balance_loss_clip": 1.04247355, + "balance_loss_mlp": 1.02280903, + "epoch": 0.7864121448970389, + "flos": 28910818920960.0, + "grad_norm": 1.5155793227019474, + "language_loss": 0.68254006, + "learning_rate": 4.596539448524146e-07, + "loss": 0.70405668, + "num_input_tokens_seen": 282147505, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.12347412, + "step": 13080, + "time_per_iteration": 2.4815726280212402 + }, + { + "auxiliary_loss_clip": 0.01114448, + "auxiliary_loss_mlp": 0.01031359, + "balance_loss_clip": 1.04216039, + "balance_loss_mlp": 1.01860356, + "epoch": 0.7864722681497069, + "flos": 19208833735680.0, + "grad_norm": 1.9021238855859623, + "language_loss": 0.69412339, + "learning_rate": 4.594055617612016e-07, + "loss": 0.71558142, + "num_input_tokens_seen": 282166450, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.12756348, + "step": 13081, + "time_per_iteration": 2.5005319118499756 + }, + { + "auxiliary_loss_clip": 0.01115113, + "auxiliary_loss_mlp": 0.01031623, + "balance_loss_clip": 1.04147434, + "balance_loss_mlp": 1.02002978, + "epoch": 0.7865323914023749, + "flos": 21871573873920.0, + "grad_norm": 1.6178017076863558, + "language_loss": 0.68157279, + "learning_rate": 4.591572370894838e-07, + "loss": 0.70304012, + "num_input_tokens_seen": 282186465, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.1159668, + "step": 13082, + "time_per_iteration": 2.4206979274749756 + }, + { + "auxiliary_loss_clip": 0.01118828, + "auxiliary_loss_mlp": 0.01034438, + "balance_loss_clip": 1.04365253, + "balance_loss_mlp": 1.02214146, + "epoch": 0.7865925146550429, + "flos": 25520313323520.0, + "grad_norm": 1.871231865809576, + "language_loss": 0.66006207, + "learning_rate": 4.589089708466789e-07, + "loss": 0.68159473, + "num_input_tokens_seen": 282207180, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.12286377, + "step": 13083, + "time_per_iteration": 3.9610068798065186 + }, + { + "auxiliary_loss_clip": 0.0112289, + "auxiliary_loss_mlp": 0.01034296, + "balance_loss_clip": 1.04712558, + "balance_loss_mlp": 1.0215342, + "epoch": 0.7866526379077108, + "flos": 19097366855040.0, + "grad_norm": 3.024784361743039, + "language_loss": 0.74824864, + "learning_rate": 4.5866076304220015e-07, + "loss": 0.76982045, + "num_input_tokens_seen": 282225865, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12768555, + "step": 13084, + "time_per_iteration": 2.4683563709259033 + }, + { + "auxiliary_loss_clip": 0.01118823, + "auxiliary_loss_mlp": 0.01029913, + "balance_loss_clip": 1.04799831, + "balance_loss_mlp": 1.01849234, + "epoch": 0.7867127611603788, + "flos": 16173771171840.0, + "grad_norm": 2.673428516096565, + "language_loss": 0.70361829, + "learning_rate": 4.584126136854591e-07, + "loss": 0.7251057, + "num_input_tokens_seen": 282242895, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11419678, + "step": 13085, + "time_per_iteration": 2.410203218460083 + }, + { + "auxiliary_loss_clip": 0.01117442, + "auxiliary_loss_mlp": 0.01029496, + "balance_loss_clip": 1.04251432, + "balance_loss_mlp": 1.01746774, + "epoch": 0.7867728844130467, + "flos": 20773640805120.0, + "grad_norm": 2.323185935008732, + "language_loss": 0.72230899, + "learning_rate": 4.5816452278586617e-07, + "loss": 0.74377841, + "num_input_tokens_seen": 282260425, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12042236, + "step": 13086, + "time_per_iteration": 2.4254629611968994 + }, + { + "auxiliary_loss_clip": 0.0110736, + "auxiliary_loss_mlp": 0.01026442, + "balance_loss_clip": 1.03581405, + "balance_loss_mlp": 1.01523018, + "epoch": 0.7868330076657147, + "flos": 21760106993280.0, + "grad_norm": 1.8983818697590953, + "language_loss": 0.74572527, + "learning_rate": 4.5791649035282965e-07, + "loss": 0.76706326, + "num_input_tokens_seen": 282279335, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11212158, + "step": 13087, + "time_per_iteration": 2.474045753479004 + }, + { + "auxiliary_loss_clip": 0.01106481, + "auxiliary_loss_mlp": 0.01029991, + "balance_loss_clip": 1.0352813, + "balance_loss_mlp": 1.01866579, + "epoch": 0.7868931309183826, + "flos": 25700692446720.0, + "grad_norm": 1.6004897240124378, + "language_loss": 0.71364707, + "learning_rate": 4.5766851639575456e-07, + "loss": 0.73501182, + "num_input_tokens_seen": 282299905, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.11328125, + "step": 13088, + "time_per_iteration": 2.4792706966400146 + }, + { + "auxiliary_loss_clip": 0.01038184, + "auxiliary_loss_mlp": 0.01000913, + "balance_loss_clip": 1.01317692, + "balance_loss_mlp": 0.99952716, + "epoch": 0.7869532541710507, + "flos": 64644883430400.0, + "grad_norm": 0.674568578112315, + "language_loss": 0.55427349, + "learning_rate": 4.574206009240431e-07, + "loss": 0.57466441, + "num_input_tokens_seen": 282367620, + "router_z_loss_clip": 0.25024414, + "router_z_loss_mlp": 0.01385498, + "step": 13089, + "time_per_iteration": 3.190437078475952 + }, + { + "auxiliary_loss_clip": 0.0104532, + "auxiliary_loss_mlp": 0.01003538, + "balance_loss_clip": 1.01994586, + "balance_loss_mlp": 1.00205529, + "epoch": 0.7870133774237186, + "flos": 67453600440960.0, + "grad_norm": 0.732384492206356, + "language_loss": 0.49991244, + "learning_rate": 4.571727439470976e-07, + "loss": 0.520401, + "num_input_tokens_seen": 282435695, + "router_z_loss_clip": 0.25366211, + "router_z_loss_mlp": 0.01481628, + "step": 13090, + "time_per_iteration": 3.159290313720703 + }, + { + "auxiliary_loss_clip": 0.01106792, + "auxiliary_loss_mlp": 0.01027237, + "balance_loss_clip": 1.03691983, + "balance_loss_mlp": 1.01588786, + "epoch": 0.7870735006763866, + "flos": 26068310190720.0, + "grad_norm": 1.514741579263258, + "language_loss": 0.83731282, + "learning_rate": 4.5692494547431583e-07, + "loss": 0.85865319, + "num_input_tokens_seen": 282456025, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.11340332, + "step": 13091, + "time_per_iteration": 2.5454397201538086 + }, + { + "auxiliary_loss_clip": 0.01067369, + "auxiliary_loss_mlp": 0.01011522, + "balance_loss_clip": 1.042822, + "balance_loss_mlp": 1.00991225, + "epoch": 0.7871336239290546, + "flos": 70289572896000.0, + "grad_norm": 0.713519864777517, + "language_loss": 0.63980144, + "learning_rate": 4.566772055150947e-07, + "loss": 0.66059041, + "num_input_tokens_seen": 282520995, + "router_z_loss_clip": 0.24560547, + "router_z_loss_mlp": 0.01608276, + "step": 13092, + "time_per_iteration": 3.132638692855835 + }, + { + "auxiliary_loss_clip": 0.0111625, + "auxiliary_loss_mlp": 0.01033283, + "balance_loss_clip": 1.04181147, + "balance_loss_mlp": 1.02072465, + "epoch": 0.7871937471817225, + "flos": 15778574760960.0, + "grad_norm": 2.0216624675914283, + "language_loss": 0.78939247, + "learning_rate": 4.564295240788285e-07, + "loss": 0.81088781, + "num_input_tokens_seen": 282539355, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.12567139, + "step": 13093, + "time_per_iteration": 2.5028674602508545 + }, + { + "auxiliary_loss_clip": 0.01111552, + "auxiliary_loss_mlp": 0.01027006, + "balance_loss_clip": 1.04078197, + "balance_loss_mlp": 1.01518643, + "epoch": 0.7872538704343905, + "flos": 20485242506880.0, + "grad_norm": 1.7338605614103262, + "language_loss": 0.75761938, + "learning_rate": 4.561819011749106e-07, + "loss": 0.77900493, + "num_input_tokens_seen": 282555735, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11828613, + "step": 13094, + "time_per_iteration": 2.4532740116119385 + }, + { + "auxiliary_loss_clip": 0.01116301, + "auxiliary_loss_mlp": 0.01035066, + "balance_loss_clip": 1.04177904, + "balance_loss_mlp": 1.02335978, + "epoch": 0.7873139936870585, + "flos": 25082670015360.0, + "grad_norm": 1.6671137705122843, + "language_loss": 0.79713809, + "learning_rate": 4.5593433681272884e-07, + "loss": 0.81865174, + "num_input_tokens_seen": 282574550, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11712646, + "step": 13095, + "time_per_iteration": 2.497220039367676 + }, + { + "auxiliary_loss_clip": 0.01111635, + "auxiliary_loss_mlp": 0.01032493, + "balance_loss_clip": 1.03772867, + "balance_loss_mlp": 1.01996398, + "epoch": 0.7873741169397265, + "flos": 30883176679680.0, + "grad_norm": 2.1307253601330256, + "language_loss": 0.67947268, + "learning_rate": 4.556868310016715e-07, + "loss": 0.70091397, + "num_input_tokens_seen": 282596520, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.12530518, + "step": 13096, + "time_per_iteration": 2.598210334777832 + }, + { + "auxiliary_loss_clip": 0.01106239, + "auxiliary_loss_mlp": 0.01024528, + "balance_loss_clip": 1.03759837, + "balance_loss_mlp": 1.01454473, + "epoch": 0.7874342401923944, + "flos": 46791962242560.0, + "grad_norm": 1.4143929337369512, + "language_loss": 0.70320898, + "learning_rate": 4.55439383751125e-07, + "loss": 0.72451663, + "num_input_tokens_seen": 282620560, + "router_z_loss_clip": 0.68701172, + "router_z_loss_mlp": 0.09979248, + "step": 13097, + "time_per_iteration": 2.767240047454834 + }, + { + "auxiliary_loss_clip": 0.01121009, + "auxiliary_loss_mlp": 0.01032785, + "balance_loss_clip": 1.0462364, + "balance_loss_mlp": 1.02060723, + "epoch": 0.7874943634450624, + "flos": 23584548545280.0, + "grad_norm": 1.5482672294390385, + "language_loss": 0.80483496, + "learning_rate": 4.5519199507047126e-07, + "loss": 0.82637286, + "num_input_tokens_seen": 282639830, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.12176514, + "step": 13098, + "time_per_iteration": 2.5051891803741455 + }, + { + "auxiliary_loss_clip": 0.01108864, + "auxiliary_loss_mlp": 0.01028938, + "balance_loss_clip": 1.03749514, + "balance_loss_mlp": 1.01819122, + "epoch": 0.7875544866977303, + "flos": 20191169859840.0, + "grad_norm": 5.06835258037191, + "language_loss": 0.74193132, + "learning_rate": 4.5494466496909177e-07, + "loss": 0.76330936, + "num_input_tokens_seen": 282660130, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.10754395, + "step": 13099, + "time_per_iteration": 2.5213370323181152 + }, + { + "auxiliary_loss_clip": 0.01108298, + "auxiliary_loss_mlp": 0.01025296, + "balance_loss_clip": 1.03702974, + "balance_loss_mlp": 1.01315427, + "epoch": 0.7876146099503983, + "flos": 22602571557120.0, + "grad_norm": 1.5148722634140248, + "language_loss": 0.78226686, + "learning_rate": 4.5469739345636603e-07, + "loss": 0.80360281, + "num_input_tokens_seen": 282681125, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.12139893, + "step": 13100, + "time_per_iteration": 2.501121759414673 + }, + { + "auxiliary_loss_clip": 0.01122285, + "auxiliary_loss_mlp": 0.01030822, + "balance_loss_clip": 1.04238546, + "balance_loss_mlp": 1.01741123, + "epoch": 0.7876747332030662, + "flos": 10705833555840.0, + "grad_norm": 2.2206590395926074, + "language_loss": 0.66527724, + "learning_rate": 4.5445018054167007e-07, + "loss": 0.68680829, + "num_input_tokens_seen": 282696690, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.13409424, + "step": 13101, + "time_per_iteration": 3.9193851947784424 + }, + { + "auxiliary_loss_clip": 0.01120684, + "auxiliary_loss_mlp": 0.01031148, + "balance_loss_clip": 1.04573798, + "balance_loss_mlp": 1.01947165, + "epoch": 0.7877348564557343, + "flos": 38399315621760.0, + "grad_norm": 1.4362894479277966, + "language_loss": 0.77694029, + "learning_rate": 4.5420302623437745e-07, + "loss": 0.79845864, + "num_input_tokens_seen": 282721210, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11688232, + "step": 13102, + "time_per_iteration": 2.5695137977600098 + }, + { + "auxiliary_loss_clip": 0.01120051, + "auxiliary_loss_mlp": 0.01033824, + "balance_loss_clip": 1.04772353, + "balance_loss_mlp": 1.0229466, + "epoch": 0.7877949797084022, + "flos": 18329524796160.0, + "grad_norm": 1.9469395266939014, + "language_loss": 0.82654762, + "learning_rate": 4.5395593054386093e-07, + "loss": 0.84808642, + "num_input_tokens_seen": 282738505, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.10882568, + "step": 13103, + "time_per_iteration": 3.8761630058288574 + }, + { + "auxiliary_loss_clip": 0.0112237, + "auxiliary_loss_mlp": 0.01031225, + "balance_loss_clip": 1.04886627, + "balance_loss_mlp": 1.01909494, + "epoch": 0.7878551029610702, + "flos": 25806736373760.0, + "grad_norm": 2.27787525518032, + "language_loss": 0.80731982, + "learning_rate": 4.537088934794913e-07, + "loss": 0.82885575, + "num_input_tokens_seen": 282756895, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.12145996, + "step": 13104, + "time_per_iteration": 2.4804248809814453 + }, + { + "auxiliary_loss_clip": 0.01121339, + "auxiliary_loss_mlp": 0.0103613, + "balance_loss_clip": 1.04732025, + "balance_loss_mlp": 1.02369022, + "epoch": 0.7879152262137382, + "flos": 22342685679360.0, + "grad_norm": 1.7400970100469273, + "language_loss": 0.74059379, + "learning_rate": 4.5346191505063515e-07, + "loss": 0.76216847, + "num_input_tokens_seen": 282774955, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12432861, + "step": 13105, + "time_per_iteration": 2.571835517883301 + }, + { + "auxiliary_loss_clip": 0.01123669, + "auxiliary_loss_mlp": 0.01033744, + "balance_loss_clip": 1.0492208, + "balance_loss_mlp": 1.02126861, + "epoch": 0.7879753494664061, + "flos": 24785329230720.0, + "grad_norm": 1.5616695016419881, + "language_loss": 0.75713122, + "learning_rate": 4.5321499526665776e-07, + "loss": 0.77870536, + "num_input_tokens_seen": 282793165, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.12463379, + "step": 13106, + "time_per_iteration": 2.504422903060913 + }, + { + "auxiliary_loss_clip": 0.0111922, + "auxiliary_loss_mlp": 0.01032882, + "balance_loss_clip": 1.04511595, + "balance_loss_mlp": 1.02130723, + "epoch": 0.7880354727190741, + "flos": 16909078487040.0, + "grad_norm": 2.282466522019848, + "language_loss": 0.73693609, + "learning_rate": 4.5296813413692337e-07, + "loss": 0.75845706, + "num_input_tokens_seen": 282809820, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11578369, + "step": 13107, + "time_per_iteration": 2.4273438453674316 + }, + { + "auxiliary_loss_clip": 0.01115511, + "auxiliary_loss_mlp": 0.01032792, + "balance_loss_clip": 1.04396796, + "balance_loss_mlp": 1.02073956, + "epoch": 0.7880955959717421, + "flos": 22230500526720.0, + "grad_norm": 1.9849535551579889, + "language_loss": 0.73410857, + "learning_rate": 4.5272133167079165e-07, + "loss": 0.75559163, + "num_input_tokens_seen": 282828600, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.12054443, + "step": 13108, + "time_per_iteration": 2.4755570888519287 + }, + { + "auxiliary_loss_clip": 0.01048067, + "auxiliary_loss_mlp": 0.01001241, + "balance_loss_clip": 1.02295828, + "balance_loss_mlp": 1.00002229, + "epoch": 0.7881557192244101, + "flos": 69183200131200.0, + "grad_norm": 0.8815895741570935, + "language_loss": 0.60279536, + "learning_rate": 4.5247458787762216e-07, + "loss": 0.62328839, + "num_input_tokens_seen": 282882775, + "router_z_loss_clip": 0.2512207, + "router_z_loss_mlp": 0.01217651, + "step": 13109, + "time_per_iteration": 3.0757699012756348 + }, + { + "auxiliary_loss_clip": 0.01113926, + "auxiliary_loss_mlp": 0.01030441, + "balance_loss_clip": 1.04456246, + "balance_loss_mlp": 1.01905036, + "epoch": 0.788215842477078, + "flos": 24935436167040.0, + "grad_norm": 1.9447911331685512, + "language_loss": 0.72428966, + "learning_rate": 4.5222790276677126e-07, + "loss": 0.74573338, + "num_input_tokens_seen": 282902680, + "router_z_loss_clip": 0.69384766, + "router_z_loss_mlp": 0.1138916, + "step": 13110, + "time_per_iteration": 3.899162769317627 + }, + { + "auxiliary_loss_clip": 0.01113098, + "auxiliary_loss_mlp": 0.01029364, + "balance_loss_clip": 1.04288363, + "balance_loss_mlp": 1.01782501, + "epoch": 0.788275965729746, + "flos": 26106483369600.0, + "grad_norm": 1.332330103847199, + "language_loss": 0.75198114, + "learning_rate": 4.5198127634759455e-07, + "loss": 0.77340579, + "num_input_tokens_seen": 282923625, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.11541748, + "step": 13111, + "time_per_iteration": 2.5114595890045166 + }, + { + "auxiliary_loss_clip": 0.01115142, + "auxiliary_loss_mlp": 0.01032344, + "balance_loss_clip": 1.04305339, + "balance_loss_mlp": 1.02011931, + "epoch": 0.7883360889824139, + "flos": 21214803646080.0, + "grad_norm": 2.2446474661917626, + "language_loss": 0.61186635, + "learning_rate": 4.5173470862944206e-07, + "loss": 0.63334125, + "num_input_tokens_seen": 282941955, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.12225342, + "step": 13112, + "time_per_iteration": 2.488154172897339 + }, + { + "auxiliary_loss_clip": 0.01110037, + "auxiliary_loss_mlp": 0.01029536, + "balance_loss_clip": 1.03769612, + "balance_loss_mlp": 1.01648903, + "epoch": 0.7883962122350819, + "flos": 21142551438720.0, + "grad_norm": 1.7252685092639855, + "language_loss": 0.67585433, + "learning_rate": 4.514881996216644e-07, + "loss": 0.69725001, + "num_input_tokens_seen": 282961280, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.13049316, + "step": 13113, + "time_per_iteration": 2.540958881378174 + }, + { + "auxiliary_loss_clip": 0.01114903, + "auxiliary_loss_mlp": 0.01029024, + "balance_loss_clip": 1.0426172, + "balance_loss_mlp": 1.01762784, + "epoch": 0.7884563354877498, + "flos": 15302901928320.0, + "grad_norm": 2.1369850580542997, + "language_loss": 0.58188367, + "learning_rate": 4.5124174933361e-07, + "loss": 0.60332298, + "num_input_tokens_seen": 282978210, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11407471, + "step": 13114, + "time_per_iteration": 2.425149440765381 + }, + { + "auxiliary_loss_clip": 0.01114821, + "auxiliary_loss_mlp": 0.01029587, + "balance_loss_clip": 1.04000545, + "balance_loss_mlp": 1.01726627, + "epoch": 0.7885164587404179, + "flos": 24388301226240.0, + "grad_norm": 2.327532396857868, + "language_loss": 0.67027795, + "learning_rate": 4.5099535777462306e-07, + "loss": 0.69172204, + "num_input_tokens_seen": 282998845, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.12322998, + "step": 13115, + "time_per_iteration": 2.4858274459838867 + }, + { + "auxiliary_loss_clip": 0.01119883, + "auxiliary_loss_mlp": 0.01023337, + "balance_loss_clip": 1.04814076, + "balance_loss_mlp": 1.0117085, + "epoch": 0.7885765819930858, + "flos": 14385886686720.0, + "grad_norm": 2.085238322720729, + "language_loss": 0.88811958, + "learning_rate": 4.50749024954048e-07, + "loss": 0.90955174, + "num_input_tokens_seen": 283015200, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11627197, + "step": 13116, + "time_per_iteration": 2.419020175933838 + }, + { + "auxiliary_loss_clip": 0.01120319, + "auxiliary_loss_mlp": 0.0103345, + "balance_loss_clip": 1.04122126, + "balance_loss_mlp": 1.02038431, + "epoch": 0.7886367052457538, + "flos": 18259930195200.0, + "grad_norm": 2.235089238125252, + "language_loss": 0.72604883, + "learning_rate": 4.505027508812245e-07, + "loss": 0.74758649, + "num_input_tokens_seen": 283033680, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.13061523, + "step": 13117, + "time_per_iteration": 2.447390079498291 + }, + { + "auxiliary_loss_clip": 0.01113247, + "auxiliary_loss_mlp": 0.01026006, + "balance_loss_clip": 1.0426209, + "balance_loss_mlp": 1.01490164, + "epoch": 0.7886968284984217, + "flos": 15305092657920.0, + "grad_norm": 1.5937653980044484, + "language_loss": 0.80064106, + "learning_rate": 4.502565355654926e-07, + "loss": 0.82203364, + "num_input_tokens_seen": 283050620, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.11096191, + "step": 13118, + "time_per_iteration": 2.389582872390747 + }, + { + "auxiliary_loss_clip": 0.01116969, + "auxiliary_loss_mlp": 0.01025136, + "balance_loss_clip": 1.044873, + "balance_loss_mlp": 1.01307845, + "epoch": 0.7887569517510897, + "flos": 21215450090880.0, + "grad_norm": 1.702821452744025, + "language_loss": 0.72836339, + "learning_rate": 4.500103790161878e-07, + "loss": 0.74978447, + "num_input_tokens_seen": 283070215, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.12054443, + "step": 13119, + "time_per_iteration": 2.4658303260803223 + }, + { + "auxiliary_loss_clip": 0.0110932, + "auxiliary_loss_mlp": 0.01026489, + "balance_loss_clip": 1.036834, + "balance_loss_mlp": 1.01475286, + "epoch": 0.7888170750037578, + "flos": 22711237176960.0, + "grad_norm": 1.4831577055416512, + "language_loss": 0.71966285, + "learning_rate": 4.4976428124264454e-07, + "loss": 0.74102092, + "num_input_tokens_seen": 283091485, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11737061, + "step": 13120, + "time_per_iteration": 2.5860602855682373 + }, + { + "auxiliary_loss_clip": 0.01114362, + "auxiliary_loss_mlp": 0.01031162, + "balance_loss_clip": 1.04195976, + "balance_loss_mlp": 1.01896071, + "epoch": 0.7888771982564257, + "flos": 36429148592640.0, + "grad_norm": 1.660906634423871, + "language_loss": 0.78747356, + "learning_rate": 4.4951824225419564e-07, + "loss": 0.80892873, + "num_input_tokens_seen": 283115040, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.12219238, + "step": 13121, + "time_per_iteration": 2.587526559829712 + }, + { + "auxiliary_loss_clip": 0.01113848, + "auxiliary_loss_mlp": 0.01028405, + "balance_loss_clip": 1.04207873, + "balance_loss_mlp": 1.01674008, + "epoch": 0.7889373215090937, + "flos": 27309993488640.0, + "grad_norm": 1.4660512126488663, + "language_loss": 0.80185795, + "learning_rate": 4.4927226206017057e-07, + "loss": 0.82328045, + "num_input_tokens_seen": 283136925, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11663818, + "step": 13122, + "time_per_iteration": 2.528630018234253 + }, + { + "auxiliary_loss_clip": 0.01118672, + "auxiliary_loss_mlp": 0.01026562, + "balance_loss_clip": 1.04444242, + "balance_loss_mlp": 1.0148437, + "epoch": 0.7889974447617616, + "flos": 19829010983040.0, + "grad_norm": 1.9790593875157543, + "language_loss": 0.78201991, + "learning_rate": 4.4902634066989597e-07, + "loss": 0.80347228, + "num_input_tokens_seen": 283155725, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11724854, + "step": 13123, + "time_per_iteration": 2.4664382934570312 + }, + { + "auxiliary_loss_clip": 0.0111818, + "auxiliary_loss_mlp": 0.01029079, + "balance_loss_clip": 1.0431633, + "balance_loss_mlp": 1.01697946, + "epoch": 0.7890575680144296, + "flos": 17271201450240.0, + "grad_norm": 11.754381761075257, + "language_loss": 0.67352796, + "learning_rate": 4.487804780926985e-07, + "loss": 0.69500053, + "num_input_tokens_seen": 283173845, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.12103271, + "step": 13124, + "time_per_iteration": 2.4629197120666504 + }, + { + "auxiliary_loss_clip": 0.01121951, + "auxiliary_loss_mlp": 0.01025032, + "balance_loss_clip": 1.04732203, + "balance_loss_mlp": 1.01239526, + "epoch": 0.7891176912670975, + "flos": 27600151553280.0, + "grad_norm": 1.9558379304867488, + "language_loss": 0.72439939, + "learning_rate": 4.4853467433790036e-07, + "loss": 0.74586922, + "num_input_tokens_seen": 283191985, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.12634277, + "step": 13125, + "time_per_iteration": 2.501842737197876 + }, + { + "auxiliary_loss_clip": 0.01108858, + "auxiliary_loss_mlp": 0.01027257, + "balance_loss_clip": 1.03516293, + "balance_loss_mlp": 1.01497269, + "epoch": 0.7891778145197655, + "flos": 22711668140160.0, + "grad_norm": 1.813359768783274, + "language_loss": 0.7284956, + "learning_rate": 4.4828892941482267e-07, + "loss": 0.74985677, + "num_input_tokens_seen": 283210855, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.12286377, + "step": 13126, + "time_per_iteration": 2.4559783935546875 + }, + { + "auxiliary_loss_clip": 0.01124412, + "auxiliary_loss_mlp": 0.01030209, + "balance_loss_clip": 1.04631519, + "balance_loss_mlp": 1.01745927, + "epoch": 0.7892379377724335, + "flos": 17310775259520.0, + "grad_norm": 1.986752059165984, + "language_loss": 0.76925743, + "learning_rate": 4.480432433327845e-07, + "loss": 0.79080367, + "num_input_tokens_seen": 283229665, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.12762451, + "step": 13127, + "time_per_iteration": 3.851759672164917 + }, + { + "auxiliary_loss_clip": 0.01111398, + "auxiliary_loss_mlp": 0.01035092, + "balance_loss_clip": 1.04133093, + "balance_loss_mlp": 1.0225991, + "epoch": 0.7892980610251015, + "flos": 25775674087680.0, + "grad_norm": 1.6500813340099412, + "language_loss": 0.85931545, + "learning_rate": 4.47797616101103e-07, + "loss": 0.88078034, + "num_input_tokens_seen": 283248615, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.12493896, + "step": 13128, + "time_per_iteration": 2.4672656059265137 + }, + { + "auxiliary_loss_clip": 0.01114532, + "auxiliary_loss_mlp": 0.01033838, + "balance_loss_clip": 1.04409432, + "balance_loss_mlp": 1.02324605, + "epoch": 0.7893581842777694, + "flos": 21579943351680.0, + "grad_norm": 3.43641745553469, + "language_loss": 0.6914897, + "learning_rate": 4.475520477290904e-07, + "loss": 0.71297342, + "num_input_tokens_seen": 283267135, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.10595703, + "step": 13129, + "time_per_iteration": 2.4474565982818604 + }, + { + "auxiliary_loss_clip": 0.01044393, + "auxiliary_loss_mlp": 0.01006516, + "balance_loss_clip": 1.01896882, + "balance_loss_mlp": 1.00513291, + "epoch": 0.7894183075304374, + "flos": 69016468176000.0, + "grad_norm": 0.7213064978135779, + "language_loss": 0.61642563, + "learning_rate": 4.473065382260597e-07, + "loss": 0.63693476, + "num_input_tokens_seen": 283328940, + "router_z_loss_clip": 0.25439453, + "router_z_loss_mlp": 0.01383972, + "step": 13130, + "time_per_iteration": 3.0405304431915283 + }, + { + "auxiliary_loss_clip": 0.01115825, + "auxiliary_loss_mlp": 0.01028518, + "balance_loss_clip": 1.04336143, + "balance_loss_mlp": 1.01707995, + "epoch": 0.7894784307831053, + "flos": 24243258107520.0, + "grad_norm": 1.632135440968758, + "language_loss": 0.73856032, + "learning_rate": 4.4706108760132124e-07, + "loss": 0.76000381, + "num_input_tokens_seen": 283350000, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11450195, + "step": 13131, + "time_per_iteration": 2.473111629486084 + }, + { + "auxiliary_loss_clip": 0.01125181, + "auxiliary_loss_mlp": 0.01029889, + "balance_loss_clip": 1.04193783, + "balance_loss_mlp": 1.01562572, + "epoch": 0.7895385540357733, + "flos": 20266546550400.0, + "grad_norm": 2.379345851363859, + "language_loss": 0.69883871, + "learning_rate": 4.4681569586418153e-07, + "loss": 0.72038943, + "num_input_tokens_seen": 283368020, + "router_z_loss_clip": 0.83251953, + "router_z_loss_mlp": 0.1428833, + "step": 13132, + "time_per_iteration": 2.431593418121338 + }, + { + "auxiliary_loss_clip": 0.01119645, + "auxiliary_loss_mlp": 0.01035894, + "balance_loss_clip": 1.04478681, + "balance_loss_mlp": 1.02356803, + "epoch": 0.7895986772884414, + "flos": 20996574566400.0, + "grad_norm": 2.543742677245761, + "language_loss": 0.62347865, + "learning_rate": 4.465703630239468e-07, + "loss": 0.64503407, + "num_input_tokens_seen": 283387030, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.12335205, + "step": 13133, + "time_per_iteration": 2.4732136726379395 + }, + { + "auxiliary_loss_clip": 0.01114609, + "auxiliary_loss_mlp": 0.01038265, + "balance_loss_clip": 1.03858614, + "balance_loss_mlp": 1.02489543, + "epoch": 0.7896588005411093, + "flos": 18657999694080.0, + "grad_norm": 2.170678145193999, + "language_loss": 0.79792476, + "learning_rate": 4.463250890899195e-07, + "loss": 0.81945354, + "num_input_tokens_seen": 283402090, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.1338501, + "step": 13134, + "time_per_iteration": 2.4304728507995605 + }, + { + "auxiliary_loss_clip": 0.01113375, + "auxiliary_loss_mlp": 0.01030598, + "balance_loss_clip": 1.03926933, + "balance_loss_mlp": 1.01877832, + "epoch": 0.7897189237937773, + "flos": 18405907067520.0, + "grad_norm": 1.8550510745192508, + "language_loss": 0.80309939, + "learning_rate": 4.460798740713998e-07, + "loss": 0.82453907, + "num_input_tokens_seen": 283421035, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11810303, + "step": 13135, + "time_per_iteration": 2.417590618133545 + }, + { + "auxiliary_loss_clip": 0.01112882, + "auxiliary_loss_mlp": 0.01026475, + "balance_loss_clip": 1.0403688, + "balance_loss_mlp": 1.01397645, + "epoch": 0.7897790470464452, + "flos": 23731602825600.0, + "grad_norm": 1.9389330306615853, + "language_loss": 0.72494996, + "learning_rate": 4.4583471797768733e-07, + "loss": 0.74634355, + "num_input_tokens_seen": 283441830, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.12518311, + "step": 13136, + "time_per_iteration": 2.4926538467407227 + }, + { + "auxiliary_loss_clip": 0.01125151, + "auxiliary_loss_mlp": 0.01034768, + "balance_loss_clip": 1.04566824, + "balance_loss_mlp": 1.02209592, + "epoch": 0.7898391702991132, + "flos": 15918949111680.0, + "grad_norm": 2.4471340569055755, + "language_loss": 0.70821339, + "learning_rate": 4.455896208180778e-07, + "loss": 0.72981262, + "num_input_tokens_seen": 283459540, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.12670898, + "step": 13137, + "time_per_iteration": 2.4588327407836914 + }, + { + "auxiliary_loss_clip": 0.01111117, + "auxiliary_loss_mlp": 0.01031952, + "balance_loss_clip": 1.03957534, + "balance_loss_mlp": 1.01979828, + "epoch": 0.7898992935517811, + "flos": 19829046896640.0, + "grad_norm": 1.6279121639172631, + "language_loss": 0.74363315, + "learning_rate": 4.4534458260186645e-07, + "loss": 0.76506388, + "num_input_tokens_seen": 283478790, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.12158203, + "step": 13138, + "time_per_iteration": 2.413623094558716 + }, + { + "auxiliary_loss_clip": 0.01117032, + "auxiliary_loss_mlp": 0.01029527, + "balance_loss_clip": 1.04475617, + "balance_loss_mlp": 1.01799369, + "epoch": 0.7899594168044491, + "flos": 16216253982720.0, + "grad_norm": 1.9857514694208287, + "language_loss": 0.68394303, + "learning_rate": 4.4509960333834426e-07, + "loss": 0.70540863, + "num_input_tokens_seen": 283495720, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11535645, + "step": 13139, + "time_per_iteration": 2.3977465629577637 + }, + { + "auxiliary_loss_clip": 0.01036874, + "auxiliary_loss_mlp": 0.01003949, + "balance_loss_clip": 1.01176882, + "balance_loss_mlp": 1.00232446, + "epoch": 0.790019540057117, + "flos": 68331005959680.0, + "grad_norm": 0.8922730291696638, + "language_loss": 0.60171282, + "learning_rate": 4.448546830368003e-07, + "loss": 0.62212104, + "num_input_tokens_seen": 283558795, + "router_z_loss_clip": 0.25170898, + "router_z_loss_mlp": 0.01625061, + "step": 13140, + "time_per_iteration": 3.1405985355377197 + }, + { + "auxiliary_loss_clip": 0.01119259, + "auxiliary_loss_mlp": 0.01032531, + "balance_loss_clip": 1.04454529, + "balance_loss_mlp": 1.02007961, + "epoch": 0.7900796633097851, + "flos": 30332773601280.0, + "grad_norm": 1.618085560840669, + "language_loss": 0.75807273, + "learning_rate": 4.4460982170652304e-07, + "loss": 0.77959061, + "num_input_tokens_seen": 283579305, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.12420654, + "step": 13141, + "time_per_iteration": 2.5268123149871826 + }, + { + "auxiliary_loss_clip": 0.01120573, + "auxiliary_loss_mlp": 0.01038686, + "balance_loss_clip": 1.04365528, + "balance_loss_mlp": 1.0256741, + "epoch": 0.790139786562453, + "flos": 22126790983680.0, + "grad_norm": 1.9956094549160674, + "language_loss": 0.68560743, + "learning_rate": 4.4436501935679694e-07, + "loss": 0.70720005, + "num_input_tokens_seen": 283597840, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.13012695, + "step": 13142, + "time_per_iteration": 2.431713581085205 + }, + { + "auxiliary_loss_clip": 0.01052694, + "auxiliary_loss_mlp": 0.01002168, + "balance_loss_clip": 1.02784622, + "balance_loss_mlp": 1.00091362, + "epoch": 0.790199909815121, + "flos": 58207284213120.0, + "grad_norm": 0.824512372324468, + "language_loss": 0.59959459, + "learning_rate": 4.441202759969049e-07, + "loss": 0.62014323, + "num_input_tokens_seen": 283647950, + "router_z_loss_clip": 0.24829102, + "router_z_loss_mlp": 0.01254272, + "step": 13143, + "time_per_iteration": 2.8675737380981445 + }, + { + "auxiliary_loss_clip": 0.01127102, + "auxiliary_loss_mlp": 0.01030851, + "balance_loss_clip": 1.05134094, + "balance_loss_mlp": 1.01836967, + "epoch": 0.7902600330677889, + "flos": 34533316759680.0, + "grad_norm": 1.563521384334357, + "language_loss": 0.74562061, + "learning_rate": 4.4387559163612875e-07, + "loss": 0.76720011, + "num_input_tokens_seen": 283670645, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.12487793, + "step": 13144, + "time_per_iteration": 2.5584352016448975 + }, + { + "auxiliary_loss_clip": 0.01133899, + "auxiliary_loss_mlp": 0.01035569, + "balance_loss_clip": 1.05369675, + "balance_loss_mlp": 1.02210486, + "epoch": 0.7903201563204569, + "flos": 22346384780160.0, + "grad_norm": 1.825635498688725, + "language_loss": 0.83152074, + "learning_rate": 4.4363096628374605e-07, + "loss": 0.85321546, + "num_input_tokens_seen": 283688830, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.13464355, + "step": 13145, + "time_per_iteration": 3.895559549331665 + }, + { + "auxiliary_loss_clip": 0.01115854, + "auxiliary_loss_mlp": 0.01025311, + "balance_loss_clip": 1.04606581, + "balance_loss_mlp": 1.01452255, + "epoch": 0.790380279573125, + "flos": 22053533195520.0, + "grad_norm": 1.6388798673734941, + "language_loss": 0.73089635, + "learning_rate": 4.4338639994903235e-07, + "loss": 0.75230801, + "num_input_tokens_seen": 283708625, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.10797119, + "step": 13146, + "time_per_iteration": 4.0361878871917725 + }, + { + "auxiliary_loss_clip": 0.01114899, + "auxiliary_loss_mlp": 0.01031177, + "balance_loss_clip": 1.04108477, + "balance_loss_mlp": 1.01938736, + "epoch": 0.7904404028257929, + "flos": 20302600826880.0, + "grad_norm": 2.11537777685167, + "language_loss": 0.75724441, + "learning_rate": 4.4314189264126246e-07, + "loss": 0.77870524, + "num_input_tokens_seen": 283725710, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11791992, + "step": 13147, + "time_per_iteration": 2.5009279251098633 + }, + { + "auxiliary_loss_clip": 0.01113279, + "auxiliary_loss_mlp": 0.01033122, + "balance_loss_clip": 1.04250693, + "balance_loss_mlp": 1.02074218, + "epoch": 0.7905005260784609, + "flos": 20008923229440.0, + "grad_norm": 3.0239020530302287, + "language_loss": 0.7206285, + "learning_rate": 4.428974443697087e-07, + "loss": 0.74209249, + "num_input_tokens_seen": 283744150, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.12384033, + "step": 13148, + "time_per_iteration": 2.438647985458374 + }, + { + "auxiliary_loss_clip": 0.01127438, + "auxiliary_loss_mlp": 0.01030833, + "balance_loss_clip": 1.05271363, + "balance_loss_mlp": 1.01849461, + "epoch": 0.7905606493311288, + "flos": 26905926418560.0, + "grad_norm": 2.2118038730172747, + "language_loss": 0.71838462, + "learning_rate": 4.4265305514363913e-07, + "loss": 0.73996735, + "num_input_tokens_seen": 283764170, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.12341309, + "step": 13149, + "time_per_iteration": 2.506598711013794 + }, + { + "auxiliary_loss_clip": 0.01119675, + "auxiliary_loss_mlp": 0.01031774, + "balance_loss_clip": 1.04407001, + "balance_loss_mlp": 1.01816666, + "epoch": 0.7906207725837968, + "flos": 23696230907520.0, + "grad_norm": 1.9220359046064435, + "language_loss": 0.65490484, + "learning_rate": 4.424087249723225e-07, + "loss": 0.67641926, + "num_input_tokens_seen": 283784305, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.13616943, + "step": 13150, + "time_per_iteration": 2.510105609893799 + }, + { + "auxiliary_loss_clip": 0.01118085, + "auxiliary_loss_mlp": 0.01028425, + "balance_loss_clip": 1.04636443, + "balance_loss_mlp": 1.01687312, + "epoch": 0.7906808958364647, + "flos": 20848837927680.0, + "grad_norm": 1.9047097003875444, + "language_loss": 0.69812775, + "learning_rate": 4.421644538650231e-07, + "loss": 0.71959281, + "num_input_tokens_seen": 283804040, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11553955, + "step": 13151, + "time_per_iteration": 2.4480936527252197 + }, + { + "auxiliary_loss_clip": 0.01115072, + "auxiliary_loss_mlp": 0.01031951, + "balance_loss_clip": 1.03968465, + "balance_loss_mlp": 1.01962447, + "epoch": 0.7907410190891327, + "flos": 40735196974080.0, + "grad_norm": 1.7647794015031115, + "language_loss": 0.70123416, + "learning_rate": 4.4192024183100306e-07, + "loss": 0.72270441, + "num_input_tokens_seen": 283827120, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.12322998, + "step": 13152, + "time_per_iteration": 2.612642526626587 + }, + { + "auxiliary_loss_clip": 0.01112599, + "auxiliary_loss_mlp": 0.01031888, + "balance_loss_clip": 1.03955698, + "balance_loss_mlp": 1.01989579, + "epoch": 0.7908011423418007, + "flos": 13261165050240.0, + "grad_norm": 1.8751096239903087, + "language_loss": 0.73060775, + "learning_rate": 4.4167608887952367e-07, + "loss": 0.75205261, + "num_input_tokens_seen": 283844820, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11981201, + "step": 13153, + "time_per_iteration": 2.453322410583496 + }, + { + "auxiliary_loss_clip": 0.01120007, + "auxiliary_loss_mlp": 0.01028198, + "balance_loss_clip": 1.0460726, + "balance_loss_mlp": 1.01618159, + "epoch": 0.7908612655944687, + "flos": 19754747614080.0, + "grad_norm": 1.651967303689085, + "language_loss": 0.78800142, + "learning_rate": 4.4143199501984306e-07, + "loss": 0.80948341, + "num_input_tokens_seen": 283862870, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.12011719, + "step": 13154, + "time_per_iteration": 3.9171056747436523 + }, + { + "auxiliary_loss_clip": 0.0111997, + "auxiliary_loss_mlp": 0.01026869, + "balance_loss_clip": 1.04150271, + "balance_loss_mlp": 1.01326084, + "epoch": 0.7909213888471366, + "flos": 21287738211840.0, + "grad_norm": 2.2169862160769447, + "language_loss": 0.70288503, + "learning_rate": 4.411879602612185e-07, + "loss": 0.72435337, + "num_input_tokens_seen": 283882405, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.13616943, + "step": 13155, + "time_per_iteration": 2.4551050662994385 + }, + { + "auxiliary_loss_clip": 0.0112096, + "auxiliary_loss_mlp": 0.01027907, + "balance_loss_clip": 1.04514146, + "balance_loss_mlp": 1.01518714, + "epoch": 0.7909815120998046, + "flos": 22528882805760.0, + "grad_norm": 1.720598087424, + "language_loss": 0.77364159, + "learning_rate": 4.4094398461290174e-07, + "loss": 0.79513019, + "num_input_tokens_seen": 283902070, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.12713623, + "step": 13156, + "time_per_iteration": 2.4315993785858154 + }, + { + "auxiliary_loss_clip": 0.01115295, + "auxiliary_loss_mlp": 0.01029092, + "balance_loss_clip": 1.0422765, + "balance_loss_mlp": 1.01687884, + "epoch": 0.7910416353524725, + "flos": 26727702111360.0, + "grad_norm": 1.596605594069786, + "language_loss": 0.65392923, + "learning_rate": 4.4070006808414526e-07, + "loss": 0.67537308, + "num_input_tokens_seen": 283924100, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.12225342, + "step": 13157, + "time_per_iteration": 2.4836819171905518 + }, + { + "auxiliary_loss_clip": 0.01125888, + "auxiliary_loss_mlp": 0.01037943, + "balance_loss_clip": 1.05186033, + "balance_loss_mlp": 1.02432299, + "epoch": 0.7911017586051405, + "flos": 24644847139200.0, + "grad_norm": 1.7717851960829616, + "language_loss": 0.74311608, + "learning_rate": 4.4045621068419894e-07, + "loss": 0.76475441, + "num_input_tokens_seen": 283944955, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.1362915, + "step": 13158, + "time_per_iteration": 2.5592007637023926 + }, + { + "auxiliary_loss_clip": 0.01109495, + "auxiliary_loss_mlp": 0.01028712, + "balance_loss_clip": 1.038342, + "balance_loss_mlp": 1.01750016, + "epoch": 0.7911618818578086, + "flos": 17565489578880.0, + "grad_norm": 2.0648400331567127, + "language_loss": 0.6702503, + "learning_rate": 4.40212412422309e-07, + "loss": 0.69163239, + "num_input_tokens_seen": 283963125, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.11212158, + "step": 13159, + "time_per_iteration": 2.4308340549468994 + }, + { + "auxiliary_loss_clip": 0.011139, + "auxiliary_loss_mlp": 0.01030388, + "balance_loss_clip": 1.04312682, + "balance_loss_mlp": 1.01841974, + "epoch": 0.7912220051104765, + "flos": 16721660298240.0, + "grad_norm": 2.0382856872268533, + "language_loss": 0.67210662, + "learning_rate": 4.399686733077206e-07, + "loss": 0.69354951, + "num_input_tokens_seen": 283982850, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11968994, + "step": 13160, + "time_per_iteration": 2.4399778842926025 + }, + { + "auxiliary_loss_clip": 0.0110848, + "auxiliary_loss_mlp": 0.01032795, + "balance_loss_clip": 1.0380919, + "balance_loss_mlp": 1.02052271, + "epoch": 0.7912821283631445, + "flos": 13698736531200.0, + "grad_norm": 1.8850273992024116, + "language_loss": 0.72562742, + "learning_rate": 4.3972499334967694e-07, + "loss": 0.74704021, + "num_input_tokens_seen": 283998275, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.12280273, + "step": 13161, + "time_per_iteration": 2.452730655670166 + }, + { + "auxiliary_loss_clip": 0.01112832, + "auxiliary_loss_mlp": 0.01028929, + "balance_loss_clip": 1.04070246, + "balance_loss_mlp": 1.01682377, + "epoch": 0.7913422516158124, + "flos": 23769021818880.0, + "grad_norm": 1.8945620181387393, + "language_loss": 0.73356229, + "learning_rate": 4.39481372557418e-07, + "loss": 0.75497997, + "num_input_tokens_seen": 284018750, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.12103271, + "step": 13162, + "time_per_iteration": 2.5046579837799072 + }, + { + "auxiliary_loss_clip": 0.01112704, + "auxiliary_loss_mlp": 0.01031147, + "balance_loss_clip": 1.03838301, + "balance_loss_mlp": 1.01909494, + "epoch": 0.7914023748684804, + "flos": 19938251220480.0, + "grad_norm": 1.7612336429770177, + "language_loss": 0.71905583, + "learning_rate": 4.392378109401811e-07, + "loss": 0.74049437, + "num_input_tokens_seen": 284037850, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.12054443, + "step": 13163, + "time_per_iteration": 2.567054510116577 + }, + { + "auxiliary_loss_clip": 0.0111436, + "auxiliary_loss_mlp": 0.01030333, + "balance_loss_clip": 1.04133952, + "balance_loss_mlp": 1.01723742, + "epoch": 0.7914624981211483, + "flos": 20594805966720.0, + "grad_norm": 3.2686574263075445, + "language_loss": 0.70240867, + "learning_rate": 4.3899430850720296e-07, + "loss": 0.72385567, + "num_input_tokens_seen": 284056380, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.13098145, + "step": 13164, + "time_per_iteration": 2.539259672164917 + }, + { + "auxiliary_loss_clip": 0.01112655, + "auxiliary_loss_mlp": 0.0103118, + "balance_loss_clip": 1.0388335, + "balance_loss_mlp": 1.01949716, + "epoch": 0.7915226213738163, + "flos": 21799465320960.0, + "grad_norm": 2.150010427438764, + "language_loss": 0.66989982, + "learning_rate": 4.387508652677177e-07, + "loss": 0.69133818, + "num_input_tokens_seen": 284074945, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11688232, + "step": 13165, + "time_per_iteration": 2.440150022506714 + }, + { + "auxiliary_loss_clip": 0.01105595, + "auxiliary_loss_mlp": 0.01024971, + "balance_loss_clip": 1.035936, + "balance_loss_mlp": 1.01403952, + "epoch": 0.7915827446264843, + "flos": 16288362535680.0, + "grad_norm": 1.8499083237453617, + "language_loss": 0.72480839, + "learning_rate": 4.385074812309557e-07, + "loss": 0.74611402, + "num_input_tokens_seen": 284092070, + "router_z_loss_clip": 0.69677734, + "router_z_loss_mlp": 0.109375, + "step": 13166, + "time_per_iteration": 2.40295672416687 + }, + { + "auxiliary_loss_clip": 0.01113552, + "auxiliary_loss_mlp": 0.01031474, + "balance_loss_clip": 1.04128957, + "balance_loss_mlp": 1.01846874, + "epoch": 0.7916428678791523, + "flos": 25702595867520.0, + "grad_norm": 2.3186938631210587, + "language_loss": 0.7750082, + "learning_rate": 4.382641564061462e-07, + "loss": 0.79645842, + "num_input_tokens_seen": 284112255, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.13012695, + "step": 13167, + "time_per_iteration": 2.5043880939483643 + }, + { + "auxiliary_loss_clip": 0.01111478, + "auxiliary_loss_mlp": 0.01032173, + "balance_loss_clip": 1.04017019, + "balance_loss_mlp": 1.0196321, + "epoch": 0.7917029911318202, + "flos": 23878513451520.0, + "grad_norm": 1.6499174758466464, + "language_loss": 0.84473217, + "learning_rate": 4.3802089080251713e-07, + "loss": 0.86616868, + "num_input_tokens_seen": 284132330, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.12548828, + "step": 13168, + "time_per_iteration": 2.5076568126678467 + }, + { + "auxiliary_loss_clip": 0.01112049, + "auxiliary_loss_mlp": 0.01033321, + "balance_loss_clip": 1.03943825, + "balance_loss_mlp": 1.02036273, + "epoch": 0.7917631143844882, + "flos": 21646593037440.0, + "grad_norm": 1.836804762812616, + "language_loss": 0.72797096, + "learning_rate": 4.3777768442929155e-07, + "loss": 0.7494247, + "num_input_tokens_seen": 284150640, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.12969971, + "step": 13169, + "time_per_iteration": 2.444613456726074 + }, + { + "auxiliary_loss_clip": 0.01124625, + "auxiliary_loss_mlp": 0.0103309, + "balance_loss_clip": 1.04934597, + "balance_loss_mlp": 1.02067995, + "epoch": 0.7918232376371561, + "flos": 38874198355200.0, + "grad_norm": 1.795866978944702, + "language_loss": 0.6755442, + "learning_rate": 4.3753453729569287e-07, + "loss": 0.69712138, + "num_input_tokens_seen": 284171910, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.12420654, + "step": 13170, + "time_per_iteration": 2.5964925289154053 + }, + { + "auxiliary_loss_clip": 0.0111924, + "auxiliary_loss_mlp": 0.01025506, + "balance_loss_clip": 1.04596472, + "balance_loss_mlp": 1.0141151, + "epoch": 0.7918833608898241, + "flos": 20775544225920.0, + "grad_norm": 1.6979863352848747, + "language_loss": 0.71144831, + "learning_rate": 4.372914494109412e-07, + "loss": 0.73289579, + "num_input_tokens_seen": 284191340, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.1138916, + "step": 13171, + "time_per_iteration": 3.9563379287719727 + }, + { + "auxiliary_loss_clip": 0.01116732, + "auxiliary_loss_mlp": 0.01028819, + "balance_loss_clip": 1.04376209, + "balance_loss_mlp": 1.01647496, + "epoch": 0.7919434841424922, + "flos": 33910122769920.0, + "grad_norm": 1.9457081028911363, + "language_loss": 0.67007267, + "learning_rate": 4.370484207842553e-07, + "loss": 0.69152814, + "num_input_tokens_seen": 284212495, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.12353516, + "step": 13172, + "time_per_iteration": 2.5716428756713867 + }, + { + "auxiliary_loss_clip": 0.01117957, + "auxiliary_loss_mlp": 0.01035027, + "balance_loss_clip": 1.04378843, + "balance_loss_mlp": 1.02272463, + "epoch": 0.7920036073951601, + "flos": 21064660796160.0, + "grad_norm": 1.7493786199063883, + "language_loss": 0.79549909, + "learning_rate": 4.3680545142484893e-07, + "loss": 0.817029, + "num_input_tokens_seen": 284230825, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12298584, + "step": 13173, + "time_per_iteration": 2.52785325050354 + }, + { + "auxiliary_loss_clip": 0.01113771, + "auxiliary_loss_mlp": 0.01028667, + "balance_loss_clip": 1.0407145, + "balance_loss_mlp": 1.01790261, + "epoch": 0.7920637306478281, + "flos": 23655974739840.0, + "grad_norm": 1.8581770382402465, + "language_loss": 0.76384801, + "learning_rate": 4.365625413419365e-07, + "loss": 0.78527236, + "num_input_tokens_seen": 284250365, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.10760498, + "step": 13174, + "time_per_iteration": 2.4713082313537598 + }, + { + "auxiliary_loss_clip": 0.01117968, + "auxiliary_loss_mlp": 0.01042283, + "balance_loss_clip": 1.04395163, + "balance_loss_mlp": 1.03026116, + "epoch": 0.792123853900496, + "flos": 27195438038400.0, + "grad_norm": 1.600662474291059, + "language_loss": 0.71765321, + "learning_rate": 4.363196905447297e-07, + "loss": 0.73925573, + "num_input_tokens_seen": 284269635, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.12023926, + "step": 13175, + "time_per_iteration": 2.4984819889068604 + }, + { + "auxiliary_loss_clip": 0.01120248, + "auxiliary_loss_mlp": 0.01028999, + "balance_loss_clip": 1.04587936, + "balance_loss_mlp": 1.01630342, + "epoch": 0.792183977153164, + "flos": 19098659744640.0, + "grad_norm": 2.249209848110205, + "language_loss": 0.59611404, + "learning_rate": 4.360768990424364e-07, + "loss": 0.61760652, + "num_input_tokens_seen": 284288380, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.12701416, + "step": 13176, + "time_per_iteration": 2.422579765319824 + }, + { + "auxiliary_loss_clip": 0.0112333, + "auxiliary_loss_mlp": 0.0103267, + "balance_loss_clip": 1.05216539, + "balance_loss_mlp": 1.02092242, + "epoch": 0.7922441004058319, + "flos": 17128851851520.0, + "grad_norm": 1.8198514950314022, + "language_loss": 0.73398244, + "learning_rate": 4.3583416684426376e-07, + "loss": 0.7555424, + "num_input_tokens_seen": 284306920, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.11743164, + "step": 13177, + "time_per_iteration": 2.4294941425323486 + }, + { + "auxiliary_loss_clip": 0.01120549, + "auxiliary_loss_mlp": 0.01031689, + "balance_loss_clip": 1.04685664, + "balance_loss_mlp": 1.01963735, + "epoch": 0.7923042236585, + "flos": 17821640442240.0, + "grad_norm": 3.2513419880769665, + "language_loss": 0.64468384, + "learning_rate": 4.355914939594174e-07, + "loss": 0.66620624, + "num_input_tokens_seen": 284324700, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.1204834, + "step": 13178, + "time_per_iteration": 2.4267704486846924 + }, + { + "auxiliary_loss_clip": 0.01113854, + "auxiliary_loss_mlp": 0.01029052, + "balance_loss_clip": 1.04179454, + "balance_loss_mlp": 1.01853764, + "epoch": 0.7923643469111679, + "flos": 29935206892800.0, + "grad_norm": 1.5976486222444606, + "language_loss": 0.68878013, + "learning_rate": 4.3534888039709726e-07, + "loss": 0.71020919, + "num_input_tokens_seen": 284345985, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.10510254, + "step": 13179, + "time_per_iteration": 2.50140118598938 + }, + { + "auxiliary_loss_clip": 0.01123255, + "auxiliary_loss_mlp": 0.01029897, + "balance_loss_clip": 1.04731023, + "balance_loss_mlp": 1.01742721, + "epoch": 0.7924244701638359, + "flos": 22674716023680.0, + "grad_norm": 2.3442173225547287, + "language_loss": 0.74395281, + "learning_rate": 4.3510632616650444e-07, + "loss": 0.76548439, + "num_input_tokens_seen": 284364475, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.12481689, + "step": 13180, + "time_per_iteration": 2.4432363510131836 + }, + { + "auxiliary_loss_clip": 0.01123269, + "auxiliary_loss_mlp": 0.01035015, + "balance_loss_clip": 1.04808557, + "balance_loss_mlp": 1.02211094, + "epoch": 0.7924845934165038, + "flos": 17968156018560.0, + "grad_norm": 1.9408910475085845, + "language_loss": 0.8135581, + "learning_rate": 4.3486383127683646e-07, + "loss": 0.835141, + "num_input_tokens_seen": 284382125, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.12890625, + "step": 13181, + "time_per_iteration": 2.449522018432617 + }, + { + "auxiliary_loss_clip": 0.01109957, + "auxiliary_loss_mlp": 0.01034074, + "balance_loss_clip": 1.03961074, + "balance_loss_mlp": 1.0219624, + "epoch": 0.7925447166691718, + "flos": 23476960333440.0, + "grad_norm": 1.836058507824754, + "language_loss": 0.77753353, + "learning_rate": 4.346213957372895e-07, + "loss": 0.79897386, + "num_input_tokens_seen": 284401585, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.12109375, + "step": 13182, + "time_per_iteration": 2.49324107170105 + }, + { + "auxiliary_loss_clip": 0.01121046, + "auxiliary_loss_mlp": 0.01035276, + "balance_loss_clip": 1.0454638, + "balance_loss_mlp": 1.02166855, + "epoch": 0.7926048399218397, + "flos": 20447572118400.0, + "grad_norm": 2.3733622458248496, + "language_loss": 0.74433959, + "learning_rate": 4.34379019557056e-07, + "loss": 0.76590282, + "num_input_tokens_seen": 284419125, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.13616943, + "step": 13183, + "time_per_iteration": 2.469282388687134 + }, + { + "auxiliary_loss_clip": 0.0111543, + "auxiliary_loss_mlp": 0.01023877, + "balance_loss_clip": 1.04119635, + "balance_loss_mlp": 1.01171207, + "epoch": 0.7926649631745077, + "flos": 37160038535040.0, + "grad_norm": 1.8464506438027497, + "language_loss": 0.68460536, + "learning_rate": 4.341367027453264e-07, + "loss": 0.70599842, + "num_input_tokens_seen": 284440445, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.12176514, + "step": 13184, + "time_per_iteration": 2.595019817352295 + }, + { + "auxiliary_loss_clip": 0.01124676, + "auxiliary_loss_mlp": 0.01031278, + "balance_loss_clip": 1.04855514, + "balance_loss_mlp": 1.01927352, + "epoch": 0.7927250864271758, + "flos": 17018606033280.0, + "grad_norm": 2.3759537511253632, + "language_loss": 0.70670605, + "learning_rate": 4.338944453112907e-07, + "loss": 0.72826558, + "num_input_tokens_seen": 284459370, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12011719, + "step": 13185, + "time_per_iteration": 2.4620628356933594 + }, + { + "auxiliary_loss_clip": 0.0111513, + "auxiliary_loss_mlp": 0.0102814, + "balance_loss_clip": 1.04149437, + "balance_loss_mlp": 1.01617718, + "epoch": 0.7927852096798437, + "flos": 17749208666880.0, + "grad_norm": 1.8602628198416387, + "language_loss": 0.65590763, + "learning_rate": 4.3365224726413375e-07, + "loss": 0.67734027, + "num_input_tokens_seen": 284477525, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11962891, + "step": 13186, + "time_per_iteration": 2.4288322925567627 + }, + { + "auxiliary_loss_clip": 0.01119931, + "auxiliary_loss_mlp": 0.01033032, + "balance_loss_clip": 1.04713154, + "balance_loss_mlp": 1.02146888, + "epoch": 0.7928453329325117, + "flos": 23838436851840.0, + "grad_norm": 2.0318424228912426, + "language_loss": 0.76834822, + "learning_rate": 4.334101086130408e-07, + "loss": 0.78987783, + "num_input_tokens_seen": 284496590, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11566162, + "step": 13187, + "time_per_iteration": 2.49763822555542 + }, + { + "auxiliary_loss_clip": 0.01118739, + "auxiliary_loss_mlp": 0.01027052, + "balance_loss_clip": 1.04579139, + "balance_loss_mlp": 1.01546526, + "epoch": 0.7929054561851796, + "flos": 17454920538240.0, + "grad_norm": 2.0045868975005066, + "language_loss": 0.72145391, + "learning_rate": 4.3316802936719334e-07, + "loss": 0.74291182, + "num_input_tokens_seen": 284511470, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11602783, + "step": 13188, + "time_per_iteration": 3.8662381172180176 + }, + { + "auxiliary_loss_clip": 0.01118962, + "auxiliary_loss_mlp": 0.01040512, + "balance_loss_clip": 1.04212701, + "balance_loss_mlp": 1.02686834, + "epoch": 0.7929655794378476, + "flos": 21981280988160.0, + "grad_norm": 2.968475414337568, + "language_loss": 0.6327126, + "learning_rate": 4.329260095357725e-07, + "loss": 0.65430737, + "num_input_tokens_seen": 284531125, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.13653564, + "step": 13189, + "time_per_iteration": 2.4348206520080566 + }, + { + "auxiliary_loss_clip": 0.01113937, + "auxiliary_loss_mlp": 0.01028626, + "balance_loss_clip": 1.0412581, + "balance_loss_mlp": 1.01709867, + "epoch": 0.7930257026905155, + "flos": 17273930883840.0, + "grad_norm": 1.813442202193339, + "language_loss": 0.7231195, + "learning_rate": 4.3268404912795307e-07, + "loss": 0.7445451, + "num_input_tokens_seen": 284549340, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11517334, + "step": 13190, + "time_per_iteration": 2.4417190551757812 + }, + { + "auxiliary_loss_clip": 0.01115519, + "auxiliary_loss_mlp": 0.01025379, + "balance_loss_clip": 1.04765344, + "balance_loss_mlp": 1.01503134, + "epoch": 0.7930858259431836, + "flos": 27300584125440.0, + "grad_norm": 2.5793050374717037, + "language_loss": 0.73712879, + "learning_rate": 4.3244214815291166e-07, + "loss": 0.75853777, + "num_input_tokens_seen": 284567060, + "router_z_loss_clip": 0.67822266, + "router_z_loss_mlp": 0.10345459, + "step": 13191, + "time_per_iteration": 3.8830089569091797 + }, + { + "auxiliary_loss_clip": 0.01114161, + "auxiliary_loss_mlp": 0.01036377, + "balance_loss_clip": 1.04182708, + "balance_loss_mlp": 1.02378213, + "epoch": 0.7931459491958515, + "flos": 19863736456320.0, + "grad_norm": 1.730490835300643, + "language_loss": 0.6916545, + "learning_rate": 4.322003066198219e-07, + "loss": 0.71315992, + "num_input_tokens_seen": 284586600, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.12585449, + "step": 13192, + "time_per_iteration": 2.483719825744629 + }, + { + "auxiliary_loss_clip": 0.01109752, + "auxiliary_loss_mlp": 0.01039907, + "balance_loss_clip": 1.0367918, + "balance_loss_mlp": 1.027843, + "epoch": 0.7932060724485195, + "flos": 23147120718720.0, + "grad_norm": 1.6098743481059057, + "language_loss": 0.75069225, + "learning_rate": 4.3195852453785274e-07, + "loss": 0.77218884, + "num_input_tokens_seen": 284605715, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.12072754, + "step": 13193, + "time_per_iteration": 2.5200958251953125 + }, + { + "auxiliary_loss_clip": 0.01116274, + "auxiliary_loss_mlp": 0.01037166, + "balance_loss_clip": 1.0426352, + "balance_loss_mlp": 1.02376699, + "epoch": 0.7932661957011874, + "flos": 29934847756800.0, + "grad_norm": 1.5375740599551444, + "language_loss": 0.71841604, + "learning_rate": 4.317168019161741e-07, + "loss": 0.73995048, + "num_input_tokens_seen": 284628540, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.1340332, + "step": 13194, + "time_per_iteration": 2.535130500793457 + }, + { + "auxiliary_loss_clip": 0.01119343, + "auxiliary_loss_mlp": 0.01031134, + "balance_loss_clip": 1.04096889, + "balance_loss_mlp": 1.01861703, + "epoch": 0.7933263189538554, + "flos": 22559119079040.0, + "grad_norm": 2.1330070974269346, + "language_loss": 0.70360231, + "learning_rate": 4.314751387639517e-07, + "loss": 0.72510707, + "num_input_tokens_seen": 284646040, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.12518311, + "step": 13195, + "time_per_iteration": 2.455331802368164 + }, + { + "auxiliary_loss_clip": 0.01113564, + "auxiliary_loss_mlp": 0.01031622, + "balance_loss_clip": 1.04266787, + "balance_loss_mlp": 1.01961792, + "epoch": 0.7933864422065233, + "flos": 25479051575040.0, + "grad_norm": 2.083299421665837, + "language_loss": 0.77415282, + "learning_rate": 4.3123353509034844e-07, + "loss": 0.79560465, + "num_input_tokens_seen": 284665110, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.12005615, + "step": 13196, + "time_per_iteration": 2.517491102218628 + }, + { + "auxiliary_loss_clip": 0.0112033, + "auxiliary_loss_mlp": 0.01050924, + "balance_loss_clip": 1.04289985, + "balance_loss_mlp": 1.03702378, + "epoch": 0.7934465654591913, + "flos": 33583156243200.0, + "grad_norm": 2.233851330880103, + "language_loss": 0.68904394, + "learning_rate": 4.309919909045268e-07, + "loss": 0.71075642, + "num_input_tokens_seen": 284686515, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.13891602, + "step": 13197, + "time_per_iteration": 2.5555615425109863 + }, + { + "auxiliary_loss_clip": 0.01114488, + "auxiliary_loss_mlp": 0.0102975, + "balance_loss_clip": 1.04284692, + "balance_loss_mlp": 1.01844299, + "epoch": 0.7935066887118594, + "flos": 31432538263680.0, + "grad_norm": 1.6852441551361257, + "language_loss": 0.64858168, + "learning_rate": 4.30750506215646e-07, + "loss": 0.6700241, + "num_input_tokens_seen": 284707300, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11322021, + "step": 13198, + "time_per_iteration": 4.046200275421143 + }, + { + "auxiliary_loss_clip": 0.01116724, + "auxiliary_loss_mlp": 0.01029411, + "balance_loss_clip": 1.04105234, + "balance_loss_mlp": 1.01605392, + "epoch": 0.7935668119645273, + "flos": 14682616940160.0, + "grad_norm": 2.554269588478949, + "language_loss": 0.72733033, + "learning_rate": 4.30509081032864e-07, + "loss": 0.74879169, + "num_input_tokens_seen": 284723545, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.13360596, + "step": 13199, + "time_per_iteration": 2.4396560192108154 + }, + { + "auxiliary_loss_clip": 0.01117975, + "auxiliary_loss_mlp": 0.01027706, + "balance_loss_clip": 1.04486346, + "balance_loss_mlp": 1.01614285, + "epoch": 0.7936269352171953, + "flos": 18004246208640.0, + "grad_norm": 2.2781098343753685, + "language_loss": 0.80286229, + "learning_rate": 4.302677153653349e-07, + "loss": 0.82431906, + "num_input_tokens_seen": 284742650, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11572266, + "step": 13200, + "time_per_iteration": 2.445671558380127 + }, + { + "auxiliary_loss_clip": 0.01109545, + "auxiliary_loss_mlp": 0.01032109, + "balance_loss_clip": 1.0394491, + "balance_loss_mlp": 1.0195744, + "epoch": 0.7936870584698632, + "flos": 18880215183360.0, + "grad_norm": 1.7886473802647955, + "language_loss": 0.77919173, + "learning_rate": 4.3002640922221077e-07, + "loss": 0.80060828, + "num_input_tokens_seen": 284760955, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.12524414, + "step": 13201, + "time_per_iteration": 2.3970911502838135 + }, + { + "auxiliary_loss_clip": 0.01113572, + "auxiliary_loss_mlp": 0.01032761, + "balance_loss_clip": 1.0408746, + "balance_loss_mlp": 1.02099526, + "epoch": 0.7937471817225312, + "flos": 23367001824000.0, + "grad_norm": 1.5737116454342714, + "language_loss": 0.67093658, + "learning_rate": 4.2978516261264296e-07, + "loss": 0.69239998, + "num_input_tokens_seen": 284780745, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11755371, + "step": 13202, + "time_per_iteration": 2.478624105453491 + }, + { + "auxiliary_loss_clip": 0.01117658, + "auxiliary_loss_mlp": 0.010303, + "balance_loss_clip": 1.04326773, + "balance_loss_mlp": 1.01844501, + "epoch": 0.7938073049751991, + "flos": 22674428714880.0, + "grad_norm": 1.799448520281967, + "language_loss": 0.7480135, + "learning_rate": 4.2954397554577884e-07, + "loss": 0.76949304, + "num_input_tokens_seen": 284799000, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.11871338, + "step": 13203, + "time_per_iteration": 2.4363057613372803 + }, + { + "auxiliary_loss_clip": 0.01115217, + "auxiliary_loss_mlp": 0.01030856, + "balance_loss_clip": 1.04327822, + "balance_loss_mlp": 1.0192337, + "epoch": 0.7938674282278672, + "flos": 22851431959680.0, + "grad_norm": 1.8391170700294701, + "language_loss": 0.66337687, + "learning_rate": 4.293028480307643e-07, + "loss": 0.68483764, + "num_input_tokens_seen": 284817450, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11633301, + "step": 13204, + "time_per_iteration": 2.4447062015533447 + }, + { + "auxiliary_loss_clip": 0.01110959, + "auxiliary_loss_mlp": 0.0102874, + "balance_loss_clip": 1.04008389, + "balance_loss_mlp": 1.01652098, + "epoch": 0.7939275514805351, + "flos": 27012509049600.0, + "grad_norm": 1.7526112496999187, + "language_loss": 0.79530662, + "learning_rate": 4.290617800767438e-07, + "loss": 0.81670368, + "num_input_tokens_seen": 284838865, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.12225342, + "step": 13205, + "time_per_iteration": 2.495025634765625 + }, + { + "auxiliary_loss_clip": 0.0110835, + "auxiliary_loss_mlp": 0.01028973, + "balance_loss_clip": 1.03831041, + "balance_loss_mlp": 1.01715934, + "epoch": 0.7939876747332031, + "flos": 21142838747520.0, + "grad_norm": 1.8339945434329774, + "language_loss": 0.78054094, + "learning_rate": 4.28820771692858e-07, + "loss": 0.80191416, + "num_input_tokens_seen": 284857975, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.11810303, + "step": 13206, + "time_per_iteration": 2.4575514793395996 + }, + { + "auxiliary_loss_clip": 0.01114809, + "auxiliary_loss_mlp": 0.01031989, + "balance_loss_clip": 1.03907752, + "balance_loss_mlp": 1.01928163, + "epoch": 0.794047797985871, + "flos": 23289075267840.0, + "grad_norm": 2.3580504592332043, + "language_loss": 0.79284871, + "learning_rate": 4.285798228882456e-07, + "loss": 0.81431663, + "num_input_tokens_seen": 284877145, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.12713623, + "step": 13207, + "time_per_iteration": 2.491943359375 + }, + { + "auxiliary_loss_clip": 0.01125065, + "auxiliary_loss_mlp": 0.01032758, + "balance_loss_clip": 1.05066633, + "balance_loss_mlp": 1.02053308, + "epoch": 0.794107921238539, + "flos": 24608074590720.0, + "grad_norm": 1.7969911389723274, + "language_loss": 0.84117997, + "learning_rate": 4.2833893367204375e-07, + "loss": 0.86275822, + "num_input_tokens_seen": 284895560, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.12207031, + "step": 13208, + "time_per_iteration": 2.512491226196289 + }, + { + "auxiliary_loss_clip": 0.01050013, + "auxiliary_loss_mlp": 0.01004471, + "balance_loss_clip": 1.02515018, + "balance_loss_mlp": 1.0032227, + "epoch": 0.7941680444912069, + "flos": 64093690252800.0, + "grad_norm": 0.7302844876991994, + "language_loss": 0.58327633, + "learning_rate": 4.280981040533875e-07, + "loss": 0.60382116, + "num_input_tokens_seen": 284963135, + "router_z_loss_clip": 0.24829102, + "router_z_loss_mlp": 0.01249695, + "step": 13209, + "time_per_iteration": 3.190016984939575 + }, + { + "auxiliary_loss_clip": 0.01124692, + "auxiliary_loss_mlp": 0.01029695, + "balance_loss_clip": 1.04866624, + "balance_loss_mlp": 1.01708841, + "epoch": 0.794228167743875, + "flos": 24388839930240.0, + "grad_norm": 2.117005747553373, + "language_loss": 0.62412226, + "learning_rate": 4.2785733404140825e-07, + "loss": 0.64566612, + "num_input_tokens_seen": 284981755, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.12609863, + "step": 13210, + "time_per_iteration": 2.502622127532959 + }, + { + "auxiliary_loss_clip": 0.01115585, + "auxiliary_loss_mlp": 0.01035527, + "balance_loss_clip": 1.04217756, + "balance_loss_mlp": 1.02374947, + "epoch": 0.794288290996543, + "flos": 28512498026880.0, + "grad_norm": 1.5711775661996663, + "language_loss": 0.6968863, + "learning_rate": 4.2761662364523676e-07, + "loss": 0.71839738, + "num_input_tokens_seen": 285003060, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11779785, + "step": 13211, + "time_per_iteration": 2.511474370956421 + }, + { + "auxiliary_loss_clip": 0.01121018, + "auxiliary_loss_mlp": 0.01036094, + "balance_loss_clip": 1.04643977, + "balance_loss_mlp": 1.02327859, + "epoch": 0.7943484142492109, + "flos": 25922117836800.0, + "grad_norm": 1.742674372076576, + "language_loss": 0.72314608, + "learning_rate": 4.2737597287400074e-07, + "loss": 0.74471724, + "num_input_tokens_seen": 285021640, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.12817383, + "step": 13212, + "time_per_iteration": 2.5250351428985596 + }, + { + "auxiliary_loss_clip": 0.01117514, + "auxiliary_loss_mlp": 0.0102732, + "balance_loss_clip": 1.04780793, + "balance_loss_mlp": 1.01567888, + "epoch": 0.7944085375018789, + "flos": 23915286000000.0, + "grad_norm": 2.1880849455312816, + "language_loss": 0.80821824, + "learning_rate": 4.271353817368246e-07, + "loss": 0.82966661, + "num_input_tokens_seen": 285040490, + "router_z_loss_clip": 0.69677734, + "router_z_loss_mlp": 0.11645508, + "step": 13213, + "time_per_iteration": 2.619239091873169 + }, + { + "auxiliary_loss_clip": 0.01115756, + "auxiliary_loss_mlp": 0.01028915, + "balance_loss_clip": 1.03954673, + "balance_loss_mlp": 1.01617134, + "epoch": 0.7944686607545468, + "flos": 20229953569920.0, + "grad_norm": 2.6646309710121043, + "language_loss": 0.68542659, + "learning_rate": 4.268948502428327e-07, + "loss": 0.70687324, + "num_input_tokens_seen": 285059270, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.12744141, + "step": 13214, + "time_per_iteration": 4.001784324645996 + }, + { + "auxiliary_loss_clip": 0.01115584, + "auxiliary_loss_mlp": 0.01032115, + "balance_loss_clip": 1.04226923, + "balance_loss_mlp": 1.02055788, + "epoch": 0.7945287840072148, + "flos": 21980993679360.0, + "grad_norm": 2.181008356735102, + "language_loss": 0.72422945, + "learning_rate": 4.2665437840114535e-07, + "loss": 0.74570644, + "num_input_tokens_seen": 285075390, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11553955, + "step": 13215, + "time_per_iteration": 2.438288927078247 + }, + { + "auxiliary_loss_clip": 0.0111126, + "auxiliary_loss_mlp": 0.01024995, + "balance_loss_clip": 1.04011369, + "balance_loss_mlp": 1.01331294, + "epoch": 0.7945889072598827, + "flos": 26397718842240.0, + "grad_norm": 1.6068243459446547, + "language_loss": 0.78635114, + "learning_rate": 4.2641396622088253e-07, + "loss": 0.80771369, + "num_input_tokens_seen": 285096290, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.11676025, + "step": 13216, + "time_per_iteration": 2.4826831817626953 + }, + { + "auxiliary_loss_clip": 0.01114615, + "auxiliary_loss_mlp": 0.01031383, + "balance_loss_clip": 1.04240727, + "balance_loss_mlp": 1.01979017, + "epoch": 0.7946490305125508, + "flos": 25810255906560.0, + "grad_norm": 1.5466564738998447, + "language_loss": 0.73872399, + "learning_rate": 4.261736137111598e-07, + "loss": 0.76018393, + "num_input_tokens_seen": 285116020, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11602783, + "step": 13217, + "time_per_iteration": 2.5752291679382324 + }, + { + "auxiliary_loss_clip": 0.01112384, + "auxiliary_loss_mlp": 0.01031968, + "balance_loss_clip": 1.04259491, + "balance_loss_mlp": 1.02034545, + "epoch": 0.7947091537652187, + "flos": 15960965045760.0, + "grad_norm": 2.5708253380895276, + "language_loss": 0.73860383, + "learning_rate": 4.259333208810907e-07, + "loss": 0.76004732, + "num_input_tokens_seen": 285133510, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.11621094, + "step": 13218, + "time_per_iteration": 2.4398343563079834 + }, + { + "auxiliary_loss_clip": 0.01109328, + "auxiliary_loss_mlp": 0.01033155, + "balance_loss_clip": 1.03541732, + "balance_loss_mlp": 1.02001238, + "epoch": 0.7947692770178867, + "flos": 18587866389120.0, + "grad_norm": 2.139937048007792, + "language_loss": 0.8371346, + "learning_rate": 4.2569308773978817e-07, + "loss": 0.85855943, + "num_input_tokens_seen": 285151690, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.13140869, + "step": 13219, + "time_per_iteration": 2.4135921001434326 + }, + { + "auxiliary_loss_clip": 0.0111999, + "auxiliary_loss_mlp": 0.01033434, + "balance_loss_clip": 1.04382503, + "balance_loss_mlp": 1.02035069, + "epoch": 0.7948294002705546, + "flos": 20442220992000.0, + "grad_norm": 2.7798779769069943, + "language_loss": 0.75332725, + "learning_rate": 4.2545291429636123e-07, + "loss": 0.77486145, + "num_input_tokens_seen": 285170485, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.13092041, + "step": 13220, + "time_per_iteration": 2.46000337600708 + }, + { + "auxiliary_loss_clip": 0.01111336, + "auxiliary_loss_mlp": 0.01035789, + "balance_loss_clip": 1.03758097, + "balance_loss_mlp": 1.02309287, + "epoch": 0.7948895235232226, + "flos": 38181194282880.0, + "grad_norm": 2.2904028328653405, + "language_loss": 0.72909242, + "learning_rate": 4.252128005599176e-07, + "loss": 0.75056374, + "num_input_tokens_seen": 285191050, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.12695312, + "step": 13221, + "time_per_iteration": 2.600444793701172 + }, + { + "auxiliary_loss_clip": 0.01116891, + "auxiliary_loss_mlp": 0.01030219, + "balance_loss_clip": 1.04555225, + "balance_loss_mlp": 1.01871502, + "epoch": 0.7949496467758905, + "flos": 15559806977280.0, + "grad_norm": 2.001395855211079, + "language_loss": 0.74728954, + "learning_rate": 4.249727465395634e-07, + "loss": 0.76876062, + "num_input_tokens_seen": 285208750, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.1151123, + "step": 13222, + "time_per_iteration": 2.4127144813537598 + }, + { + "auxiliary_loss_clip": 0.01043728, + "auxiliary_loss_mlp": 0.01006198, + "balance_loss_clip": 1.01823068, + "balance_loss_mlp": 1.00491095, + "epoch": 0.7950097700285585, + "flos": 70897036728960.0, + "grad_norm": 0.7690666876059384, + "language_loss": 0.6697728, + "learning_rate": 4.247327522443993e-07, + "loss": 0.69027203, + "num_input_tokens_seen": 285264605, + "router_z_loss_clip": 0.25439453, + "router_z_loss_mlp": 0.01287842, + "step": 13223, + "time_per_iteration": 2.9134681224823 + }, + { + "auxiliary_loss_clip": 0.01109768, + "auxiliary_loss_mlp": 0.01032589, + "balance_loss_clip": 1.03823543, + "balance_loss_mlp": 1.01942205, + "epoch": 0.7950698932812266, + "flos": 23951627585280.0, + "grad_norm": 1.6465031380165396, + "language_loss": 0.71227443, + "learning_rate": 4.2449281768352717e-07, + "loss": 0.73369801, + "num_input_tokens_seen": 285283940, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.1315918, + "step": 13224, + "time_per_iteration": 2.4449496269226074 + }, + { + "auxiliary_loss_clip": 0.01044909, + "auxiliary_loss_mlp": 0.01005871, + "balance_loss_clip": 1.01939297, + "balance_loss_mlp": 1.00460124, + "epoch": 0.7951300165338945, + "flos": 60282561415680.0, + "grad_norm": 0.6676034826537367, + "language_loss": 0.54978061, + "learning_rate": 4.2425294286604527e-07, + "loss": 0.57028836, + "num_input_tokens_seen": 285349525, + "router_z_loss_clip": 0.25488281, + "router_z_loss_mlp": 0.01269531, + "step": 13225, + "time_per_iteration": 3.1186904907226562 + }, + { + "auxiliary_loss_clip": 0.01110288, + "auxiliary_loss_mlp": 0.01022977, + "balance_loss_clip": 1.03854704, + "balance_loss_mlp": 1.01200962, + "epoch": 0.7951901397865625, + "flos": 22819004956800.0, + "grad_norm": 2.1612915900605323, + "language_loss": 0.64864302, + "learning_rate": 4.2401312780105034e-07, + "loss": 0.66997564, + "num_input_tokens_seen": 285367355, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.10974121, + "step": 13226, + "time_per_iteration": 2.438051223754883 + }, + { + "auxiliary_loss_clip": 0.01121298, + "auxiliary_loss_mlp": 0.01043338, + "balance_loss_clip": 1.04461813, + "balance_loss_mlp": 1.03079081, + "epoch": 0.7952502630392304, + "flos": 35695672871040.0, + "grad_norm": 2.3363717641795194, + "language_loss": 0.70219004, + "learning_rate": 4.237733724976349e-07, + "loss": 0.72383642, + "num_input_tokens_seen": 285386190, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.12542725, + "step": 13227, + "time_per_iteration": 2.560725450515747 + }, + { + "auxiliary_loss_clip": 0.01116466, + "auxiliary_loss_mlp": 0.01026896, + "balance_loss_clip": 1.04542351, + "balance_loss_mlp": 1.01619661, + "epoch": 0.7953103862918984, + "flos": 25629840869760.0, + "grad_norm": 1.9088245453697708, + "language_loss": 0.69248474, + "learning_rate": 4.2353367696489184e-07, + "loss": 0.71391839, + "num_input_tokens_seen": 285406150, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.10699463, + "step": 13228, + "time_per_iteration": 2.4471933841705322 + }, + { + "auxiliary_loss_clip": 0.01117762, + "auxiliary_loss_mlp": 0.01036433, + "balance_loss_clip": 1.04351997, + "balance_loss_mlp": 1.02464938, + "epoch": 0.7953705095445663, + "flos": 40551980676480.0, + "grad_norm": 1.5372709275940275, + "language_loss": 0.70786619, + "learning_rate": 4.232940412119095e-07, + "loss": 0.72940814, + "num_input_tokens_seen": 285429900, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11767578, + "step": 13229, + "time_per_iteration": 2.6159095764160156 + }, + { + "auxiliary_loss_clip": 0.01122484, + "auxiliary_loss_mlp": 0.01032517, + "balance_loss_clip": 1.0462184, + "balance_loss_mlp": 1.02001834, + "epoch": 0.7954306327972344, + "flos": 27636672706560.0, + "grad_norm": 1.7889331018219858, + "language_loss": 0.71854985, + "learning_rate": 4.2305446524777457e-07, + "loss": 0.74009979, + "num_input_tokens_seen": 285452555, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.125, + "step": 13230, + "time_per_iteration": 2.472930669784546 + }, + { + "auxiliary_loss_clip": 0.01069093, + "auxiliary_loss_mlp": 0.01004692, + "balance_loss_clip": 1.04528522, + "balance_loss_mlp": 1.00324035, + "epoch": 0.7954907560499023, + "flos": 59504055995520.0, + "grad_norm": 0.9018594353727429, + "language_loss": 0.63590944, + "learning_rate": 4.2281494908157247e-07, + "loss": 0.65664726, + "num_input_tokens_seen": 285515700, + "router_z_loss_clip": 0.23803711, + "router_z_loss_mlp": 0.01452637, + "step": 13231, + "time_per_iteration": 4.500465393066406 + }, + { + "auxiliary_loss_clip": 0.01119006, + "auxiliary_loss_mlp": 0.01027097, + "balance_loss_clip": 1.04694176, + "balance_loss_mlp": 1.01594448, + "epoch": 0.7955508793025703, + "flos": 20120533764480.0, + "grad_norm": 1.6770718505416449, + "language_loss": 0.69708359, + "learning_rate": 4.2257549272238566e-07, + "loss": 0.7185446, + "num_input_tokens_seen": 285533910, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.11151123, + "step": 13232, + "time_per_iteration": 2.4310505390167236 + }, + { + "auxiliary_loss_clip": 0.0112042, + "auxiliary_loss_mlp": 0.01028194, + "balance_loss_clip": 1.04764509, + "balance_loss_mlp": 1.01615953, + "epoch": 0.7956110025552382, + "flos": 26505378881280.0, + "grad_norm": 1.9661538305219173, + "language_loss": 0.78041649, + "learning_rate": 4.223360961792952e-07, + "loss": 0.80190271, + "num_input_tokens_seen": 285554080, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.12023926, + "step": 13233, + "time_per_iteration": 2.4921605587005615 + }, + { + "auxiliary_loss_clip": 0.01109511, + "auxiliary_loss_mlp": 0.01042522, + "balance_loss_clip": 1.035676, + "balance_loss_mlp": 1.02893162, + "epoch": 0.7956711258079062, + "flos": 22565475786240.0, + "grad_norm": 1.9372235234808464, + "language_loss": 0.78899658, + "learning_rate": 4.220967594613769e-07, + "loss": 0.81051689, + "num_input_tokens_seen": 285572325, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.13574219, + "step": 13234, + "time_per_iteration": 2.462338447570801 + }, + { + "auxiliary_loss_clip": 0.01112808, + "auxiliary_loss_mlp": 0.0102889, + "balance_loss_clip": 1.03993917, + "balance_loss_mlp": 1.01785684, + "epoch": 0.7957312490605741, + "flos": 17379005143680.0, + "grad_norm": 1.7432338403334298, + "language_loss": 0.7012794, + "learning_rate": 4.218574825777077e-07, + "loss": 0.72269636, + "num_input_tokens_seen": 285589770, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.1104126, + "step": 13235, + "time_per_iteration": 3.8739612102508545 + }, + { + "auxiliary_loss_clip": 0.01117934, + "auxiliary_loss_mlp": 0.01029083, + "balance_loss_clip": 1.04424906, + "balance_loss_mlp": 1.01685214, + "epoch": 0.7957913723132422, + "flos": 22491427898880.0, + "grad_norm": 1.5899999864757512, + "language_loss": 0.67933023, + "learning_rate": 4.2161826553736145e-07, + "loss": 0.70080042, + "num_input_tokens_seen": 285610065, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.12237549, + "step": 13236, + "time_per_iteration": 2.4723215103149414 + }, + { + "auxiliary_loss_clip": 0.01118214, + "auxiliary_loss_mlp": 0.01028374, + "balance_loss_clip": 1.04346204, + "balance_loss_mlp": 1.01682246, + "epoch": 0.7958514955659101, + "flos": 22638087129600.0, + "grad_norm": 1.7838566249125187, + "language_loss": 0.75393069, + "learning_rate": 4.2137910834940826e-07, + "loss": 0.77539659, + "num_input_tokens_seen": 285628480, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11547852, + "step": 13237, + "time_per_iteration": 2.412672281265259 + }, + { + "auxiliary_loss_clip": 0.01118511, + "auxiliary_loss_mlp": 0.01033719, + "balance_loss_clip": 1.04517174, + "balance_loss_mlp": 1.02121973, + "epoch": 0.7959116188185781, + "flos": 20704225772160.0, + "grad_norm": 2.417172128350236, + "language_loss": 0.71533197, + "learning_rate": 4.211400110229175e-07, + "loss": 0.73685426, + "num_input_tokens_seen": 285647805, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.12512207, + "step": 13238, + "time_per_iteration": 2.440021514892578 + }, + { + "auxiliary_loss_clip": 0.01118676, + "auxiliary_loss_mlp": 0.01025059, + "balance_loss_clip": 1.04425621, + "balance_loss_mlp": 1.0134356, + "epoch": 0.7959717420712461, + "flos": 19024683684480.0, + "grad_norm": 1.7523159648780082, + "language_loss": 0.73815858, + "learning_rate": 4.2090097356695684e-07, + "loss": 0.75959587, + "num_input_tokens_seen": 285665505, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.11602783, + "step": 13239, + "time_per_iteration": 2.470374822616577 + }, + { + "auxiliary_loss_clip": 0.01116448, + "auxiliary_loss_mlp": 0.0103532, + "balance_loss_clip": 1.04094243, + "balance_loss_mlp": 1.02301788, + "epoch": 0.796031865323914, + "flos": 26356636661760.0, + "grad_norm": 1.9759560600954, + "language_loss": 0.69406736, + "learning_rate": 4.2066199599058814e-07, + "loss": 0.71558511, + "num_input_tokens_seen": 285685855, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12316895, + "step": 13240, + "time_per_iteration": 2.492077350616455 + }, + { + "auxiliary_loss_clip": 0.01056133, + "auxiliary_loss_mlp": 0.01005774, + "balance_loss_clip": 1.03023994, + "balance_loss_mlp": 1.00452399, + "epoch": 0.796091988576582, + "flos": 62069440320000.0, + "grad_norm": 0.8911864953139096, + "language_loss": 0.58659881, + "learning_rate": 4.2042307830287526e-07, + "loss": 0.60721791, + "num_input_tokens_seen": 285735710, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 0.01248169, + "step": 13241, + "time_per_iteration": 4.253690481185913 + }, + { + "auxiliary_loss_clip": 0.0111699, + "auxiliary_loss_mlp": 0.01030398, + "balance_loss_clip": 1.04474926, + "balance_loss_mlp": 1.01942492, + "epoch": 0.7961521118292499, + "flos": 39020103400320.0, + "grad_norm": 2.261460291867976, + "language_loss": 0.64403629, + "learning_rate": 4.201842205128772e-07, + "loss": 0.66551018, + "num_input_tokens_seen": 285757045, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.10974121, + "step": 13242, + "time_per_iteration": 2.573519706726074 + }, + { + "auxiliary_loss_clip": 0.01118345, + "auxiliary_loss_mlp": 0.01031681, + "balance_loss_clip": 1.04359603, + "balance_loss_mlp": 1.01963449, + "epoch": 0.796212235081918, + "flos": 21762836426880.0, + "grad_norm": 2.9875149079628347, + "language_loss": 0.76153457, + "learning_rate": 4.199454226296526e-07, + "loss": 0.7830348, + "num_input_tokens_seen": 285776050, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.12060547, + "step": 13243, + "time_per_iteration": 2.4363038539886475 + }, + { + "auxiliary_loss_clip": 0.01121482, + "auxiliary_loss_mlp": 0.01029972, + "balance_loss_clip": 1.04715514, + "balance_loss_mlp": 1.01822984, + "epoch": 0.7962723583345859, + "flos": 21178857110400.0, + "grad_norm": 1.793817542656559, + "language_loss": 0.79518467, + "learning_rate": 4.1970668466225565e-07, + "loss": 0.81669921, + "num_input_tokens_seen": 285796830, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11743164, + "step": 13244, + "time_per_iteration": 2.4676876068115234 + }, + { + "auxiliary_loss_clip": 0.01119703, + "auxiliary_loss_mlp": 0.0103123, + "balance_loss_clip": 1.04371023, + "balance_loss_mlp": 1.01888025, + "epoch": 0.7963324815872539, + "flos": 17128636369920.0, + "grad_norm": 2.0986884866934217, + "language_loss": 0.68534476, + "learning_rate": 4.1946800661973934e-07, + "loss": 0.7068541, + "num_input_tokens_seen": 285814755, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12353516, + "step": 13245, + "time_per_iteration": 2.4045145511627197 + }, + { + "auxiliary_loss_clip": 0.01118524, + "auxiliary_loss_mlp": 0.01031719, + "balance_loss_clip": 1.04574895, + "balance_loss_mlp": 1.02008438, + "epoch": 0.7963926048399218, + "flos": 21397481239680.0, + "grad_norm": 1.418394719531265, + "language_loss": 0.7907747, + "learning_rate": 4.192293885111549e-07, + "loss": 0.81227708, + "num_input_tokens_seen": 285834255, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11639404, + "step": 13246, + "time_per_iteration": 2.428502082824707 + }, + { + "auxiliary_loss_clip": 0.01115823, + "auxiliary_loss_mlp": 0.01028865, + "balance_loss_clip": 1.04099965, + "balance_loss_mlp": 1.01683724, + "epoch": 0.7964527280925898, + "flos": 25184188828800.0, + "grad_norm": 1.8159338524173883, + "language_loss": 0.66187763, + "learning_rate": 4.1899083034555007e-07, + "loss": 0.68332458, + "num_input_tokens_seen": 285853540, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.12011719, + "step": 13247, + "time_per_iteration": 2.4653801918029785 + }, + { + "auxiliary_loss_clip": 0.01120975, + "auxiliary_loss_mlp": 0.01031082, + "balance_loss_clip": 1.05006969, + "balance_loss_mlp": 1.01994205, + "epoch": 0.7965128513452577, + "flos": 27015884928000.0, + "grad_norm": 2.1407913490722, + "language_loss": 0.71677881, + "learning_rate": 4.1875233213197123e-07, + "loss": 0.73829943, + "num_input_tokens_seen": 285872705, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.1114502, + "step": 13248, + "time_per_iteration": 2.519676923751831 + }, + { + "auxiliary_loss_clip": 0.01113836, + "auxiliary_loss_mlp": 0.01029977, + "balance_loss_clip": 1.04048467, + "balance_loss_mlp": 1.01800275, + "epoch": 0.7965729745979258, + "flos": 24419578993920.0, + "grad_norm": 6.451443684796649, + "language_loss": 0.76267648, + "learning_rate": 4.1851389387946255e-07, + "loss": 0.7841146, + "num_input_tokens_seen": 285890290, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11968994, + "step": 13249, + "time_per_iteration": 2.459336996078491 + }, + { + "auxiliary_loss_clip": 0.01118974, + "auxiliary_loss_mlp": 0.01031956, + "balance_loss_clip": 1.04794455, + "balance_loss_mlp": 1.02032709, + "epoch": 0.7966330978505937, + "flos": 18840389978880.0, + "grad_norm": 2.0803039449206975, + "language_loss": 0.61800849, + "learning_rate": 4.1827551559706674e-07, + "loss": 0.63951784, + "num_input_tokens_seen": 285909190, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.11633301, + "step": 13250, + "time_per_iteration": 2.4742531776428223 + }, + { + "auxiliary_loss_clip": 0.01116053, + "auxiliary_loss_mlp": 0.01026515, + "balance_loss_clip": 1.04290819, + "balance_loss_mlp": 1.01445103, + "epoch": 0.7966932211032617, + "flos": 13152319862400.0, + "grad_norm": 2.59858778664317, + "language_loss": 0.71886313, + "learning_rate": 4.180371972938206e-07, + "loss": 0.74028879, + "num_input_tokens_seen": 285927570, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.12072754, + "step": 13251, + "time_per_iteration": 2.4524433612823486 + }, + { + "auxiliary_loss_clip": 0.01122947, + "auxiliary_loss_mlp": 0.01031846, + "balance_loss_clip": 1.04750824, + "balance_loss_mlp": 1.01894712, + "epoch": 0.7967533443559297, + "flos": 23949760078080.0, + "grad_norm": 2.7756066120608973, + "language_loss": 0.73155123, + "learning_rate": 4.177989389787624e-07, + "loss": 0.75309914, + "num_input_tokens_seen": 285945810, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12915039, + "step": 13252, + "time_per_iteration": 2.549236536026001 + }, + { + "auxiliary_loss_clip": 0.01110675, + "auxiliary_loss_mlp": 0.01030046, + "balance_loss_clip": 1.04159403, + "balance_loss_mlp": 1.01854873, + "epoch": 0.7968134676085976, + "flos": 30368791964160.0, + "grad_norm": 1.646362751154606, + "language_loss": 0.65830272, + "learning_rate": 4.175607406609278e-07, + "loss": 0.67970991, + "num_input_tokens_seen": 285964235, + "router_z_loss_clip": 0.69042969, + "router_z_loss_mlp": 0.11486816, + "step": 13253, + "time_per_iteration": 2.545945405960083 + }, + { + "auxiliary_loss_clip": 0.01120249, + "auxiliary_loss_mlp": 0.01036208, + "balance_loss_clip": 1.04648924, + "balance_loss_mlp": 1.02359593, + "epoch": 0.7968735908612656, + "flos": 23075048079360.0, + "grad_norm": 1.6882598867347571, + "language_loss": 0.67845792, + "learning_rate": 4.1732260234934767e-07, + "loss": 0.70002252, + "num_input_tokens_seen": 285983710, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.1260376, + "step": 13254, + "time_per_iteration": 2.473806858062744 + }, + { + "auxiliary_loss_clip": 0.01106694, + "auxiliary_loss_mlp": 0.01034713, + "balance_loss_clip": 1.03571939, + "balance_loss_mlp": 1.0235728, + "epoch": 0.7969337141139335, + "flos": 23582250074880.0, + "grad_norm": 4.884338979850827, + "language_loss": 0.69382262, + "learning_rate": 4.1708452405305314e-07, + "loss": 0.71523666, + "num_input_tokens_seen": 286003425, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.11138916, + "step": 13255, + "time_per_iteration": 2.506160259246826 + }, + { + "auxiliary_loss_clip": 0.01111507, + "auxiliary_loss_mlp": 0.01029304, + "balance_loss_clip": 1.03995275, + "balance_loss_mlp": 1.01829553, + "epoch": 0.7969938373666016, + "flos": 19755860935680.0, + "grad_norm": 2.0372766413594796, + "language_loss": 0.79333341, + "learning_rate": 4.168465057810733e-07, + "loss": 0.81474149, + "num_input_tokens_seen": 286020130, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11004639, + "step": 13256, + "time_per_iteration": 2.4235756397247314 + }, + { + "auxiliary_loss_clip": 0.01117529, + "auxiliary_loss_mlp": 0.01028396, + "balance_loss_clip": 1.04463696, + "balance_loss_mlp": 1.01634359, + "epoch": 0.7970539606192695, + "flos": 24134089697280.0, + "grad_norm": 2.8195656386261723, + "language_loss": 0.66005963, + "learning_rate": 4.166085475424315e-07, + "loss": 0.68151891, + "num_input_tokens_seen": 286040230, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.12060547, + "step": 13257, + "time_per_iteration": 3.9297146797180176 + }, + { + "auxiliary_loss_clip": 0.01114593, + "auxiliary_loss_mlp": 0.01031076, + "balance_loss_clip": 1.03947115, + "balance_loss_mlp": 1.01897001, + "epoch": 0.7971140838719375, + "flos": 17968622895360.0, + "grad_norm": 1.9149806015987403, + "language_loss": 0.72424167, + "learning_rate": 4.163706493461523e-07, + "loss": 0.74569833, + "num_input_tokens_seen": 286059475, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.12115479, + "step": 13258, + "time_per_iteration": 2.5179190635681152 + }, + { + "auxiliary_loss_clip": 0.01111222, + "auxiliary_loss_mlp": 0.0103218, + "balance_loss_clip": 1.03660989, + "balance_loss_mlp": 1.01996136, + "epoch": 0.7971742071246054, + "flos": 19169547235200.0, + "grad_norm": 2.0593779464817494, + "language_loss": 0.6888448, + "learning_rate": 4.1613281120125655e-07, + "loss": 0.71027887, + "num_input_tokens_seen": 286077820, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.12225342, + "step": 13259, + "time_per_iteration": 2.437415361404419 + }, + { + "auxiliary_loss_clip": 0.0111266, + "auxiliary_loss_mlp": 0.01028128, + "balance_loss_clip": 1.04311728, + "balance_loss_mlp": 1.01742351, + "epoch": 0.7972343303772734, + "flos": 27125951178240.0, + "grad_norm": 1.7545240161052458, + "language_loss": 0.73472548, + "learning_rate": 4.158950331167641e-07, + "loss": 0.75613338, + "num_input_tokens_seen": 286097285, + "router_z_loss_clip": 0.69384766, + "router_z_loss_mlp": 0.10699463, + "step": 13260, + "time_per_iteration": 2.4907824993133545 + }, + { + "auxiliary_loss_clip": 0.01105092, + "auxiliary_loss_mlp": 0.01028134, + "balance_loss_clip": 1.03549564, + "balance_loss_mlp": 1.01733375, + "epoch": 0.7972944536299413, + "flos": 20996646393600.0, + "grad_norm": 1.8222616114673986, + "language_loss": 0.78712332, + "learning_rate": 4.1565731510169065e-07, + "loss": 0.80845559, + "num_input_tokens_seen": 286116000, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.10809326, + "step": 13261, + "time_per_iteration": 2.517402172088623 + }, + { + "auxiliary_loss_clip": 0.01109801, + "auxiliary_loss_mlp": 0.01032797, + "balance_loss_clip": 1.04100895, + "balance_loss_mlp": 1.0220443, + "epoch": 0.7973545768826094, + "flos": 21580015178880.0, + "grad_norm": 1.4933479092489013, + "language_loss": 0.76127446, + "learning_rate": 4.154196571650501e-07, + "loss": 0.78270042, + "num_input_tokens_seen": 286135110, + "router_z_loss_clip": 0.68847656, + "router_z_loss_mlp": 0.10748291, + "step": 13262, + "time_per_iteration": 2.4855778217315674 + }, + { + "auxiliary_loss_clip": 0.01115044, + "auxiliary_loss_mlp": 0.01036221, + "balance_loss_clip": 1.03974509, + "balance_loss_mlp": 1.02151608, + "epoch": 0.7974147001352773, + "flos": 20558536208640.0, + "grad_norm": 2.1504682958048758, + "language_loss": 0.70610124, + "learning_rate": 4.1518205931585524e-07, + "loss": 0.72761393, + "num_input_tokens_seen": 286152835, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.14703369, + "step": 13263, + "time_per_iteration": 2.534883499145508 + }, + { + "auxiliary_loss_clip": 0.01114434, + "auxiliary_loss_mlp": 0.01034645, + "balance_loss_clip": 1.0375824, + "balance_loss_mlp": 1.02118063, + "epoch": 0.7974748233879453, + "flos": 20996790048000.0, + "grad_norm": 1.7333704814215694, + "language_loss": 0.71481383, + "learning_rate": 4.149445215631153e-07, + "loss": 0.73630464, + "num_input_tokens_seen": 286171785, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.13458252, + "step": 13264, + "time_per_iteration": 2.4682977199554443 + }, + { + "auxiliary_loss_clip": 0.01108712, + "auxiliary_loss_mlp": 0.01033492, + "balance_loss_clip": 1.03777742, + "balance_loss_mlp": 1.02091503, + "epoch": 0.7975349466406133, + "flos": 22565188477440.0, + "grad_norm": 1.9123542109393754, + "language_loss": 0.77077746, + "learning_rate": 4.1470704391583776e-07, + "loss": 0.79219949, + "num_input_tokens_seen": 286190420, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.12579346, + "step": 13265, + "time_per_iteration": 2.469403028488159 + }, + { + "auxiliary_loss_clip": 0.01116194, + "auxiliary_loss_mlp": 0.0102897, + "balance_loss_clip": 1.04118395, + "balance_loss_mlp": 1.0176332, + "epoch": 0.7975950698932812, + "flos": 21689542725120.0, + "grad_norm": 1.879585055261318, + "language_loss": 0.75718844, + "learning_rate": 4.144696263830285e-07, + "loss": 0.77864003, + "num_input_tokens_seen": 286210105, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11334229, + "step": 13266, + "time_per_iteration": 2.4607677459716797 + }, + { + "auxiliary_loss_clip": 0.01114772, + "auxiliary_loss_mlp": 0.01024752, + "balance_loss_clip": 1.0427568, + "balance_loss_mlp": 1.01357031, + "epoch": 0.7976551931459492, + "flos": 19604568850560.0, + "grad_norm": 2.702671317265273, + "language_loss": 0.84124511, + "learning_rate": 4.1423226897369015e-07, + "loss": 0.86264038, + "num_input_tokens_seen": 286228180, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11187744, + "step": 13267, + "time_per_iteration": 2.4464540481567383 + }, + { + "auxiliary_loss_clip": 0.01108247, + "auxiliary_loss_mlp": 0.01036939, + "balance_loss_clip": 1.03729033, + "balance_loss_mlp": 1.02337909, + "epoch": 0.7977153163986171, + "flos": 21687603390720.0, + "grad_norm": 1.6067505202298904, + "language_loss": 0.76039696, + "learning_rate": 4.139949716968223e-07, + "loss": 0.78184879, + "num_input_tokens_seen": 286247305, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.13574219, + "step": 13268, + "time_per_iteration": 2.410323143005371 + }, + { + "auxiliary_loss_clip": 0.01121623, + "auxiliary_loss_mlp": 0.01032658, + "balance_loss_clip": 1.04710364, + "balance_loss_mlp": 1.01998007, + "epoch": 0.7977754396512852, + "flos": 23476780765440.0, + "grad_norm": 1.7300130907204587, + "language_loss": 0.78273088, + "learning_rate": 4.1375773456142403e-07, + "loss": 0.80427366, + "num_input_tokens_seen": 286268145, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.12683105, + "step": 13269, + "time_per_iteration": 2.45988130569458 + }, + { + "auxiliary_loss_clip": 0.01111235, + "auxiliary_loss_mlp": 0.01036405, + "balance_loss_clip": 1.04140258, + "balance_loss_mlp": 1.02523518, + "epoch": 0.7978355629039531, + "flos": 22382223575040.0, + "grad_norm": 1.834088188058617, + "language_loss": 0.82575947, + "learning_rate": 4.135205575764922e-07, + "loss": 0.84723586, + "num_input_tokens_seen": 286286775, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.11175537, + "step": 13270, + "time_per_iteration": 2.4197452068328857 + }, + { + "auxiliary_loss_clip": 0.01117923, + "auxiliary_loss_mlp": 0.01029901, + "balance_loss_clip": 1.04373789, + "balance_loss_mlp": 1.01770604, + "epoch": 0.7978956861566211, + "flos": 20266331068800.0, + "grad_norm": 2.418988589675045, + "language_loss": 0.59775233, + "learning_rate": 4.1328344075101905e-07, + "loss": 0.61923057, + "num_input_tokens_seen": 286305590, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.12194824, + "step": 13271, + "time_per_iteration": 2.4242000579833984 + }, + { + "auxiliary_loss_clip": 0.01129044, + "auxiliary_loss_mlp": 0.01028229, + "balance_loss_clip": 1.05075979, + "balance_loss_mlp": 1.01632547, + "epoch": 0.797955809409289, + "flos": 28112417366400.0, + "grad_norm": 2.63854294158625, + "language_loss": 0.73509663, + "learning_rate": 4.130463840939975e-07, + "loss": 0.75666934, + "num_input_tokens_seen": 286328050, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.11895752, + "step": 13272, + "time_per_iteration": 2.50565505027771 + }, + { + "auxiliary_loss_clip": 0.01112434, + "auxiliary_loss_mlp": 0.01038308, + "balance_loss_clip": 1.04134679, + "balance_loss_mlp": 1.02421689, + "epoch": 0.798015932661957, + "flos": 15559591495680.0, + "grad_norm": 1.8708781782945476, + "language_loss": 0.71383417, + "learning_rate": 4.128093876144161e-07, + "loss": 0.73534155, + "num_input_tokens_seen": 286345265, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.14099121, + "step": 13273, + "time_per_iteration": 2.4080395698547363 + }, + { + "auxiliary_loss_clip": 0.01121528, + "auxiliary_loss_mlp": 0.01038746, + "balance_loss_clip": 1.04626179, + "balance_loss_mlp": 1.02660489, + "epoch": 0.7980760559146249, + "flos": 23951196622080.0, + "grad_norm": 2.1850122858772116, + "language_loss": 0.76084948, + "learning_rate": 4.1257245132126117e-07, + "loss": 0.78245223, + "num_input_tokens_seen": 286364465, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.121521, + "step": 13274, + "time_per_iteration": 3.9270284175872803 + }, + { + "auxiliary_loss_clip": 0.01106098, + "auxiliary_loss_mlp": 0.01031348, + "balance_loss_clip": 1.03836441, + "balance_loss_mlp": 1.02010655, + "epoch": 0.798136179167293, + "flos": 28038082170240.0, + "grad_norm": 1.315126779448966, + "language_loss": 0.77697194, + "learning_rate": 4.12335575223518e-07, + "loss": 0.7983464, + "num_input_tokens_seen": 286385565, + "router_z_loss_clip": 0.67675781, + "router_z_loss_mlp": 0.11248779, + "step": 13275, + "time_per_iteration": 2.5300004482269287 + }, + { + "auxiliary_loss_clip": 0.01116676, + "auxiliary_loss_mlp": 0.01035356, + "balance_loss_clip": 1.04023266, + "balance_loss_mlp": 1.02208173, + "epoch": 0.7981963024199609, + "flos": 35984538046080.0, + "grad_norm": 2.4168805202317536, + "language_loss": 0.6389398, + "learning_rate": 4.1209875933016877e-07, + "loss": 0.66046011, + "num_input_tokens_seen": 286403950, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.13287354, + "step": 13276, + "time_per_iteration": 2.5156502723693848 + }, + { + "auxiliary_loss_clip": 0.01106246, + "auxiliary_loss_mlp": 0.01034055, + "balance_loss_clip": 1.03650069, + "balance_loss_mlp": 1.02093029, + "epoch": 0.7982564256726289, + "flos": 25884914325120.0, + "grad_norm": 1.5915419831305102, + "language_loss": 0.61063778, + "learning_rate": 4.118620036501945e-07, + "loss": 0.63204086, + "num_input_tokens_seen": 286426160, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.13128662, + "step": 13277, + "time_per_iteration": 2.4751670360565186 + }, + { + "auxiliary_loss_clip": 0.01126495, + "auxiliary_loss_mlp": 0.01038207, + "balance_loss_clip": 1.04565072, + "balance_loss_mlp": 1.02527249, + "epoch": 0.7983165489252969, + "flos": 25739152934400.0, + "grad_norm": 2.0212182653120547, + "language_loss": 0.79714954, + "learning_rate": 4.1162530819257227e-07, + "loss": 0.81879663, + "num_input_tokens_seen": 286446610, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.1293335, + "step": 13278, + "time_per_iteration": 3.869243860244751 + }, + { + "auxiliary_loss_clip": 0.01117626, + "auxiliary_loss_mlp": 0.01034368, + "balance_loss_clip": 1.0439322, + "balance_loss_mlp": 1.02164865, + "epoch": 0.7983766721779648, + "flos": 21908202768000.0, + "grad_norm": 3.737834475916312, + "language_loss": 0.63462698, + "learning_rate": 4.113886729662768e-07, + "loss": 0.65614694, + "num_input_tokens_seen": 286465460, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.12719727, + "step": 13279, + "time_per_iteration": 2.463625431060791 + }, + { + "auxiliary_loss_clip": 0.01107405, + "auxiliary_loss_mlp": 0.01028157, + "balance_loss_clip": 1.03945267, + "balance_loss_mlp": 1.01741076, + "epoch": 0.7984367954306328, + "flos": 29347420734720.0, + "grad_norm": 1.6495713706875725, + "language_loss": 0.70685029, + "learning_rate": 4.111520979802825e-07, + "loss": 0.72820598, + "num_input_tokens_seen": 286485720, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 0.10754395, + "step": 13280, + "time_per_iteration": 2.693469285964966 + }, + { + "auxiliary_loss_clip": 0.01120851, + "auxiliary_loss_mlp": 0.0103031, + "balance_loss_clip": 1.04670453, + "balance_loss_mlp": 1.01772749, + "epoch": 0.7984969186833007, + "flos": 31357772104320.0, + "grad_norm": 2.9114689973596577, + "language_loss": 0.62999523, + "learning_rate": 4.1091558324355955e-07, + "loss": 0.6515069, + "num_input_tokens_seen": 286507465, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.12573242, + "step": 13281, + "time_per_iteration": 2.65667724609375 + }, + { + "auxiliary_loss_clip": 0.01120048, + "auxiliary_loss_mlp": 0.01033006, + "balance_loss_clip": 1.04414129, + "balance_loss_mlp": 1.02035153, + "epoch": 0.7985570419359688, + "flos": 24312924535680.0, + "grad_norm": 1.7095784273243395, + "language_loss": 0.80452907, + "learning_rate": 4.1067912876507683e-07, + "loss": 0.82605958, + "num_input_tokens_seen": 286526345, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12652588, + "step": 13282, + "time_per_iteration": 2.4748804569244385 + }, + { + "auxiliary_loss_clip": 0.01123133, + "auxiliary_loss_mlp": 0.01027883, + "balance_loss_clip": 1.04667938, + "balance_loss_mlp": 1.01516914, + "epoch": 0.7986171651886367, + "flos": 15742233175680.0, + "grad_norm": 2.4380053430164175, + "language_loss": 0.7145046, + "learning_rate": 4.10442734553802e-07, + "loss": 0.73601484, + "num_input_tokens_seen": 286544095, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.12713623, + "step": 13283, + "time_per_iteration": 2.4335927963256836 + }, + { + "auxiliary_loss_clip": 0.01114038, + "auxiliary_loss_mlp": 0.01026663, + "balance_loss_clip": 1.04282236, + "balance_loss_mlp": 1.01565456, + "epoch": 0.7986772884413047, + "flos": 11619401091840.0, + "grad_norm": 1.7907830351208431, + "language_loss": 0.73348445, + "learning_rate": 4.102064006186967e-07, + "loss": 0.75489151, + "num_input_tokens_seen": 286560960, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.10998535, + "step": 13284, + "time_per_iteration": 3.8864500522613525 + }, + { + "auxiliary_loss_clip": 0.011203, + "auxiliary_loss_mlp": 0.0103077, + "balance_loss_clip": 1.04915619, + "balance_loss_mlp": 1.0201602, + "epoch": 0.7987374116939726, + "flos": 22091059929600.0, + "grad_norm": 1.4863655345549094, + "language_loss": 0.70419151, + "learning_rate": 4.0997012696872415e-07, + "loss": 0.72570217, + "num_input_tokens_seen": 286579865, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.10614014, + "step": 13285, + "time_per_iteration": 2.472945213317871 + }, + { + "auxiliary_loss_clip": 0.01111632, + "auxiliary_loss_mlp": 0.01026272, + "balance_loss_clip": 1.04027724, + "balance_loss_mlp": 1.0155071, + "epoch": 0.7987975349466406, + "flos": 17890696339200.0, + "grad_norm": 1.7049145761034383, + "language_loss": 0.73623216, + "learning_rate": 4.097339136128437e-07, + "loss": 0.75761116, + "num_input_tokens_seen": 286597295, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.10760498, + "step": 13286, + "time_per_iteration": 2.4535741806030273 + }, + { + "auxiliary_loss_clip": 0.01112551, + "auxiliary_loss_mlp": 0.01030209, + "balance_loss_clip": 1.04087293, + "balance_loss_mlp": 1.01857471, + "epoch": 0.7988576581993085, + "flos": 19719232041600.0, + "grad_norm": 1.6252927323362933, + "language_loss": 0.75214684, + "learning_rate": 4.0949776056001296e-07, + "loss": 0.77357447, + "num_input_tokens_seen": 286616270, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11651611, + "step": 13287, + "time_per_iteration": 2.417945146560669 + }, + { + "auxiliary_loss_clip": 0.01117628, + "auxiliary_loss_mlp": 0.01027769, + "balance_loss_clip": 1.04666829, + "balance_loss_mlp": 1.0161581, + "epoch": 0.7989177814519766, + "flos": 28036358317440.0, + "grad_norm": 1.4615046371845342, + "language_loss": 0.61865079, + "learning_rate": 4.092616678191863e-07, + "loss": 0.64010483, + "num_input_tokens_seen": 286638315, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.11602783, + "step": 13288, + "time_per_iteration": 2.52409029006958 + }, + { + "auxiliary_loss_clip": 0.01116687, + "auxiliary_loss_mlp": 0.01031264, + "balance_loss_clip": 1.04530406, + "balance_loss_mlp": 1.02051175, + "epoch": 0.7989779047046445, + "flos": 28871029630080.0, + "grad_norm": 2.076026017222969, + "language_loss": 0.70510638, + "learning_rate": 4.090256353993169e-07, + "loss": 0.72658592, + "num_input_tokens_seen": 286658630, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.10742188, + "step": 13289, + "time_per_iteration": 2.523101568222046 + }, + { + "auxiliary_loss_clip": 0.01114623, + "auxiliary_loss_mlp": 0.01033657, + "balance_loss_clip": 1.04473937, + "balance_loss_mlp": 1.02172995, + "epoch": 0.7990380279573125, + "flos": 18186887888640.0, + "grad_norm": 2.2272653033640095, + "language_loss": 0.62187624, + "learning_rate": 4.0878966330935506e-07, + "loss": 0.64335907, + "num_input_tokens_seen": 286676870, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.1192627, + "step": 13290, + "time_per_iteration": 2.405862808227539 + }, + { + "auxiliary_loss_clip": 0.01116101, + "auxiliary_loss_mlp": 0.01027184, + "balance_loss_clip": 1.04312479, + "balance_loss_mlp": 1.01484013, + "epoch": 0.7990981512099805, + "flos": 20879936127360.0, + "grad_norm": 1.9117921325468272, + "language_loss": 0.71560264, + "learning_rate": 4.08553751558248e-07, + "loss": 0.73703551, + "num_input_tokens_seen": 286694300, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.12329102, + "step": 13291, + "time_per_iteration": 2.460592269897461 + }, + { + "auxiliary_loss_clip": 0.01114845, + "auxiliary_loss_mlp": 0.01030763, + "balance_loss_clip": 1.04219604, + "balance_loss_mlp": 1.01967072, + "epoch": 0.7991582744626484, + "flos": 26099911180800.0, + "grad_norm": 1.5334496010288932, + "language_loss": 0.6365236, + "learning_rate": 4.083179001549422e-07, + "loss": 0.65797973, + "num_input_tokens_seen": 286714545, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11102295, + "step": 13292, + "time_per_iteration": 2.440162420272827 + }, + { + "auxiliary_loss_clip": 0.01120169, + "auxiliary_loss_mlp": 0.01029652, + "balance_loss_clip": 1.04509544, + "balance_loss_mlp": 1.01841092, + "epoch": 0.7992183977153164, + "flos": 35295843605760.0, + "grad_norm": 2.00966875897576, + "language_loss": 0.55911386, + "learning_rate": 4.0808210910838105e-07, + "loss": 0.58061206, + "num_input_tokens_seen": 286734525, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.11242676, + "step": 13293, + "time_per_iteration": 2.556769847869873 + }, + { + "auxiliary_loss_clip": 0.01123779, + "auxiliary_loss_mlp": 0.0103216, + "balance_loss_clip": 1.04862738, + "balance_loss_mlp": 1.02031088, + "epoch": 0.7992785209679844, + "flos": 51853426577280.0, + "grad_norm": 2.503425264141301, + "language_loss": 0.71328545, + "learning_rate": 4.0784637842750704e-07, + "loss": 0.7348448, + "num_input_tokens_seen": 286753430, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.1184082, + "step": 13294, + "time_per_iteration": 2.6979050636291504 + }, + { + "auxiliary_loss_clip": 0.0112363, + "auxiliary_loss_mlp": 0.01031069, + "balance_loss_clip": 1.05138183, + "balance_loss_mlp": 1.01958907, + "epoch": 0.7993386442206524, + "flos": 22565116650240.0, + "grad_norm": 1.8240558971145495, + "language_loss": 0.72678161, + "learning_rate": 4.0761070812125675e-07, + "loss": 0.74832863, + "num_input_tokens_seen": 286771915, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11480713, + "step": 13295, + "time_per_iteration": 2.5047597885131836 + }, + { + "auxiliary_loss_clip": 0.01111881, + "auxiliary_loss_mlp": 0.0103194, + "balance_loss_clip": 1.04334736, + "balance_loss_mlp": 1.02130651, + "epoch": 0.7993987674733203, + "flos": 18800277465600.0, + "grad_norm": 1.8447290678318158, + "language_loss": 0.76409781, + "learning_rate": 4.0737509819856797e-07, + "loss": 0.78553593, + "num_input_tokens_seen": 286789835, + "router_z_loss_clip": 0.68505859, + "router_z_loss_mlp": 0.10632324, + "step": 13296, + "time_per_iteration": 2.3996622562408447 + }, + { + "auxiliary_loss_clip": 0.01042745, + "auxiliary_loss_mlp": 0.01005446, + "balance_loss_clip": 1.01788437, + "balance_loss_mlp": 1.00421488, + "epoch": 0.7994588907259883, + "flos": 69421720394880.0, + "grad_norm": 0.8854903528981073, + "language_loss": 0.6074689, + "learning_rate": 4.0713954866837573e-07, + "loss": 0.62795079, + "num_input_tokens_seen": 286855580, + "router_z_loss_clip": 0.24829102, + "router_z_loss_mlp": 0.01231384, + "step": 13297, + "time_per_iteration": 3.145660877227783 + }, + { + "auxiliary_loss_clip": 0.01111099, + "auxiliary_loss_mlp": 0.01030035, + "balance_loss_clip": 1.03922343, + "balance_loss_mlp": 1.01903772, + "epoch": 0.7995190139786562, + "flos": 13480327883520.0, + "grad_norm": 2.1514206929862985, + "language_loss": 0.70435047, + "learning_rate": 4.0690405953961073e-07, + "loss": 0.72576183, + "num_input_tokens_seen": 286874360, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.10998535, + "step": 13298, + "time_per_iteration": 2.4631450176239014 + }, + { + "auxiliary_loss_clip": 0.0112209, + "auxiliary_loss_mlp": 0.01035227, + "balance_loss_clip": 1.04525626, + "balance_loss_mlp": 1.02198255, + "epoch": 0.7995791372313242, + "flos": 21652842003840.0, + "grad_norm": 1.913178554246418, + "language_loss": 0.75738841, + "learning_rate": 4.066686308212037e-07, + "loss": 0.77896166, + "num_input_tokens_seen": 286891950, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.13256836, + "step": 13299, + "time_per_iteration": 2.450894832611084 + }, + { + "auxiliary_loss_clip": 0.01104208, + "auxiliary_loss_mlp": 0.01028995, + "balance_loss_clip": 1.03565121, + "balance_loss_mlp": 1.01778901, + "epoch": 0.7996392604839921, + "flos": 26068130622720.0, + "grad_norm": 1.781653694508621, + "language_loss": 0.77539766, + "learning_rate": 4.064332625220828e-07, + "loss": 0.79672968, + "num_input_tokens_seen": 286911725, + "router_z_loss_clip": 0.68505859, + "router_z_loss_mlp": 0.11206055, + "step": 13300, + "time_per_iteration": 2.488616466522217 + }, + { + "auxiliary_loss_clip": 0.0112391, + "auxiliary_loss_mlp": 0.01025493, + "balance_loss_clip": 1.0495975, + "balance_loss_mlp": 1.0133456, + "epoch": 0.7996993837366602, + "flos": 24606889441920.0, + "grad_norm": 1.707782414831273, + "language_loss": 0.63562465, + "learning_rate": 4.0619795465117115e-07, + "loss": 0.65711868, + "num_input_tokens_seen": 286931400, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.12145996, + "step": 13301, + "time_per_iteration": 3.8778374195098877 + }, + { + "auxiliary_loss_clip": 0.01107892, + "auxiliary_loss_mlp": 0.01033385, + "balance_loss_clip": 1.03909433, + "balance_loss_mlp": 1.02182817, + "epoch": 0.7997595069893281, + "flos": 20992049452800.0, + "grad_norm": 1.6962205565667912, + "language_loss": 0.71681929, + "learning_rate": 4.059627072173928e-07, + "loss": 0.73823202, + "num_input_tokens_seen": 286949795, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.11566162, + "step": 13302, + "time_per_iteration": 2.4410595893859863 + }, + { + "auxiliary_loss_clip": 0.01122227, + "auxiliary_loss_mlp": 0.01030995, + "balance_loss_clip": 1.04363883, + "balance_loss_mlp": 1.01874614, + "epoch": 0.7998196302419961, + "flos": 24426510318720.0, + "grad_norm": 1.7693042217406036, + "language_loss": 0.83707613, + "learning_rate": 4.057275202296684e-07, + "loss": 0.85860837, + "num_input_tokens_seen": 286968805, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.12261963, + "step": 13303, + "time_per_iteration": 2.4637491703033447 + }, + { + "auxiliary_loss_clip": 0.01112374, + "auxiliary_loss_mlp": 0.01031507, + "balance_loss_clip": 1.04154253, + "balance_loss_mlp": 1.02020574, + "epoch": 0.7998797534946641, + "flos": 30264651457920.0, + "grad_norm": 1.7868143351003447, + "language_loss": 0.59240592, + "learning_rate": 4.054923936969166e-07, + "loss": 0.61384475, + "num_input_tokens_seen": 286990235, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.11315918, + "step": 13304, + "time_per_iteration": 2.503960371017456 + }, + { + "auxiliary_loss_clip": 0.0110833, + "auxiliary_loss_mlp": 0.010282, + "balance_loss_clip": 1.03423369, + "balance_loss_mlp": 1.01603484, + "epoch": 0.799939876747332, + "flos": 23513984277120.0, + "grad_norm": 1.8373424968797878, + "language_loss": 0.69085848, + "learning_rate": 4.0525732762805265e-07, + "loss": 0.71222377, + "num_input_tokens_seen": 287011060, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.12164307, + "step": 13305, + "time_per_iteration": 2.46958589553833 + }, + { + "auxiliary_loss_clip": 0.01108806, + "auxiliary_loss_mlp": 0.01027258, + "balance_loss_clip": 1.03939676, + "balance_loss_mlp": 1.01626098, + "epoch": 0.8, + "flos": 19318109886720.0, + "grad_norm": 1.5858895628835494, + "language_loss": 0.69473851, + "learning_rate": 4.0502232203199107e-07, + "loss": 0.71609914, + "num_input_tokens_seen": 287029215, + "router_z_loss_clip": 0.69335938, + "router_z_loss_mlp": 0.10998535, + "step": 13306, + "time_per_iteration": 2.4330599308013916 + }, + { + "auxiliary_loss_clip": 0.01112391, + "auxiliary_loss_mlp": 0.01030362, + "balance_loss_clip": 1.04065895, + "balance_loss_mlp": 1.01872683, + "epoch": 0.800060123252668, + "flos": 32412432263040.0, + "grad_norm": 1.3998482200165798, + "language_loss": 0.69736391, + "learning_rate": 4.0478737691764286e-07, + "loss": 0.71879143, + "num_input_tokens_seen": 287050855, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11651611, + "step": 13307, + "time_per_iteration": 2.642883062362671 + }, + { + "auxiliary_loss_clip": 0.01111085, + "auxiliary_loss_mlp": 0.01031967, + "balance_loss_clip": 1.03945112, + "balance_loss_mlp": 1.02060652, + "epoch": 0.800120246505336, + "flos": 20010611168640.0, + "grad_norm": 2.225013241006888, + "language_loss": 0.76739407, + "learning_rate": 4.0455249229391677e-07, + "loss": 0.78882456, + "num_input_tokens_seen": 287069915, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11358643, + "step": 13308, + "time_per_iteration": 2.4951024055480957 + }, + { + "auxiliary_loss_clip": 0.01122104, + "auxiliary_loss_mlp": 0.01034564, + "balance_loss_clip": 1.04587686, + "balance_loss_mlp": 1.02158165, + "epoch": 0.8001803697580039, + "flos": 31868278151040.0, + "grad_norm": 1.5297226276446934, + "language_loss": 0.78615409, + "learning_rate": 4.0431766816972e-07, + "loss": 0.80772078, + "num_input_tokens_seen": 287091450, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12976074, + "step": 13309, + "time_per_iteration": 2.538165330886841 + }, + { + "auxiliary_loss_clip": 0.01064238, + "auxiliary_loss_mlp": 0.01002676, + "balance_loss_clip": 1.04075408, + "balance_loss_mlp": 1.00127864, + "epoch": 0.8002404930106719, + "flos": 63392066916480.0, + "grad_norm": 0.9249001096815457, + "language_loss": 0.6472553, + "learning_rate": 4.040829045539571e-07, + "loss": 0.6679244, + "num_input_tokens_seen": 287148365, + "router_z_loss_clip": 0.23486328, + "router_z_loss_mlp": 0.01397705, + "step": 13310, + "time_per_iteration": 3.0453908443450928 + }, + { + "auxiliary_loss_clip": 0.01114003, + "auxiliary_loss_mlp": 0.0103601, + "balance_loss_clip": 1.04215693, + "balance_loss_mlp": 1.02301049, + "epoch": 0.8003006162633398, + "flos": 27855476403840.0, + "grad_norm": 1.8936220664482233, + "language_loss": 0.82943296, + "learning_rate": 4.0384820145553156e-07, + "loss": 0.85093307, + "num_input_tokens_seen": 287168280, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.13000488, + "step": 13311, + "time_per_iteration": 2.491577625274658 + }, + { + "auxiliary_loss_clip": 0.01115785, + "auxiliary_loss_mlp": 0.01032976, + "balance_loss_clip": 1.04458296, + "balance_loss_mlp": 1.02116179, + "epoch": 0.8003607395160078, + "flos": 18223337214720.0, + "grad_norm": 2.1046021283520004, + "language_loss": 0.6592173, + "learning_rate": 4.0361355888334116e-07, + "loss": 0.68070495, + "num_input_tokens_seen": 287185980, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.11816406, + "step": 13312, + "time_per_iteration": 2.5134541988372803 + }, + { + "auxiliary_loss_clip": 0.01115788, + "auxiliary_loss_mlp": 0.01035773, + "balance_loss_clip": 1.04104984, + "balance_loss_mlp": 1.02184939, + "epoch": 0.8004208627686757, + "flos": 20886975192960.0, + "grad_norm": 1.8235601914096082, + "language_loss": 0.75305575, + "learning_rate": 4.033789768462843e-07, + "loss": 0.77457136, + "num_input_tokens_seen": 287203875, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.13934326, + "step": 13313, + "time_per_iteration": 2.5352933406829834 + }, + { + "auxiliary_loss_clip": 0.01108018, + "auxiliary_loss_mlp": 0.01028792, + "balance_loss_clip": 1.03666461, + "balance_loss_mlp": 1.01732397, + "epoch": 0.8004809860213438, + "flos": 26436143416320.0, + "grad_norm": 1.3560753297072974, + "language_loss": 0.75874197, + "learning_rate": 4.031444553532575e-07, + "loss": 0.78011006, + "num_input_tokens_seen": 287226445, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.11468506, + "step": 13314, + "time_per_iteration": 2.5942466259002686 + }, + { + "auxiliary_loss_clip": 0.01041498, + "auxiliary_loss_mlp": 0.01005365, + "balance_loss_clip": 1.01687264, + "balance_loss_mlp": 1.00412083, + "epoch": 0.8005411092740117, + "flos": 63648612829440.0, + "grad_norm": 0.7989187019720424, + "language_loss": 0.53797483, + "learning_rate": 4.029099944131522e-07, + "loss": 0.55844349, + "num_input_tokens_seen": 287286240, + "router_z_loss_clip": 0.24633789, + "router_z_loss_mlp": 0.01243591, + "step": 13315, + "time_per_iteration": 3.0880239009857178 + }, + { + "auxiliary_loss_clip": 0.01113902, + "auxiliary_loss_mlp": 0.01031846, + "balance_loss_clip": 1.04256797, + "balance_loss_mlp": 1.01965094, + "epoch": 0.8006012325266797, + "flos": 36138056774400.0, + "grad_norm": 1.643182841044259, + "language_loss": 0.71073687, + "learning_rate": 4.026755940348603e-07, + "loss": 0.73219442, + "num_input_tokens_seen": 287310265, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.12188721, + "step": 13316, + "time_per_iteration": 2.6172988414764404 + }, + { + "auxiliary_loss_clip": 0.01111509, + "auxiliary_loss_mlp": 0.01029421, + "balance_loss_clip": 1.03756571, + "balance_loss_mlp": 1.01748824, + "epoch": 0.8006613557793477, + "flos": 33838947970560.0, + "grad_norm": 2.0412722829051506, + "language_loss": 0.64933246, + "learning_rate": 4.024412542272706e-07, + "loss": 0.67074174, + "num_input_tokens_seen": 287331610, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.11950684, + "step": 13317, + "time_per_iteration": 3.988067865371704 + }, + { + "auxiliary_loss_clip": 0.01038354, + "auxiliary_loss_mlp": 0.01006524, + "balance_loss_clip": 1.01320362, + "balance_loss_mlp": 1.00535238, + "epoch": 0.8007214790320156, + "flos": 67348310699520.0, + "grad_norm": 0.7639817601691548, + "language_loss": 0.58978832, + "learning_rate": 4.0220697499926783e-07, + "loss": 0.61023706, + "num_input_tokens_seen": 287394795, + "router_z_loss_clip": 0.25146484, + "router_z_loss_mlp": 0.01170349, + "step": 13318, + "time_per_iteration": 3.17549467086792 + }, + { + "auxiliary_loss_clip": 0.01107054, + "auxiliary_loss_mlp": 0.01022368, + "balance_loss_clip": 1.0368228, + "balance_loss_mlp": 1.01100743, + "epoch": 0.8007816022846836, + "flos": 23185653033600.0, + "grad_norm": 1.6944819450487782, + "language_loss": 0.66448379, + "learning_rate": 4.019727563597366e-07, + "loss": 0.68577802, + "num_input_tokens_seen": 287414595, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.1137085, + "step": 13319, + "time_per_iteration": 2.515841007232666 + }, + { + "auxiliary_loss_clip": 0.01116023, + "auxiliary_loss_mlp": 0.01037449, + "balance_loss_clip": 1.04070282, + "balance_loss_mlp": 1.02369273, + "epoch": 0.8008417255373516, + "flos": 21981388728960.0, + "grad_norm": 2.1624717461733796, + "language_loss": 0.74150378, + "learning_rate": 4.0173859831755873e-07, + "loss": 0.76303852, + "num_input_tokens_seen": 287434395, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.13757324, + "step": 13320, + "time_per_iteration": 2.5489182472229004 + }, + { + "auxiliary_loss_clip": 0.01116831, + "auxiliary_loss_mlp": 0.01025622, + "balance_loss_clip": 1.0434103, + "balance_loss_mlp": 1.01342118, + "epoch": 0.8009018487900196, + "flos": 16727334647040.0, + "grad_norm": 1.8437118344227408, + "language_loss": 0.80197036, + "learning_rate": 4.015045008816138e-07, + "loss": 0.82339489, + "num_input_tokens_seen": 287450590, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.12188721, + "step": 13321, + "time_per_iteration": 2.4260196685791016 + }, + { + "auxiliary_loss_clip": 0.01114748, + "auxiliary_loss_mlp": 0.01027171, + "balance_loss_clip": 1.04279137, + "balance_loss_mlp": 1.01589942, + "epoch": 0.8009619720426875, + "flos": 20813609664000.0, + "grad_norm": 1.9365764601342799, + "language_loss": 0.66096807, + "learning_rate": 4.0127046406077825e-07, + "loss": 0.68238723, + "num_input_tokens_seen": 287468455, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11273193, + "step": 13322, + "time_per_iteration": 4.206987619400024 + }, + { + "auxiliary_loss_clip": 0.01113138, + "auxiliary_loss_mlp": 0.01024955, + "balance_loss_clip": 1.04195023, + "balance_loss_mlp": 1.01360059, + "epoch": 0.8010220952953555, + "flos": 17931096161280.0, + "grad_norm": 3.832388219788361, + "language_loss": 0.78167599, + "learning_rate": 4.010364878639265e-07, + "loss": 0.80305696, + "num_input_tokens_seen": 287486485, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.11346436, + "step": 13323, + "time_per_iteration": 2.49664306640625 + }, + { + "auxiliary_loss_clip": 0.01117843, + "auxiliary_loss_mlp": 0.01039242, + "balance_loss_clip": 1.0427022, + "balance_loss_mlp": 1.02514601, + "epoch": 0.8010822185480234, + "flos": 24572235795840.0, + "grad_norm": 2.539902744729488, + "language_loss": 0.71870887, + "learning_rate": 4.00802572299932e-07, + "loss": 0.74027967, + "num_input_tokens_seen": 287503940, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.14111328, + "step": 13324, + "time_per_iteration": 2.525014877319336 + }, + { + "auxiliary_loss_clip": 0.01115451, + "auxiliary_loss_mlp": 0.01032882, + "balance_loss_clip": 1.04032075, + "balance_loss_mlp": 1.02035308, + "epoch": 0.8011423418006914, + "flos": 21829988903040.0, + "grad_norm": 2.4384564165181435, + "language_loss": 0.76523972, + "learning_rate": 4.005687173776635e-07, + "loss": 0.78672308, + "num_input_tokens_seen": 287521660, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12524414, + "step": 13325, + "time_per_iteration": 2.4348983764648438 + }, + { + "auxiliary_loss_clip": 0.01109385, + "auxiliary_loss_mlp": 0.0102435, + "balance_loss_clip": 1.04244971, + "balance_loss_mlp": 1.01378202, + "epoch": 0.8012024650533593, + "flos": 23915178259200.0, + "grad_norm": 1.5532837872067384, + "language_loss": 0.79915833, + "learning_rate": 4.003349231059898e-07, + "loss": 0.82049567, + "num_input_tokens_seen": 287541505, + "router_z_loss_clip": 0.66943359, + "router_z_loss_mlp": 0.10571289, + "step": 13326, + "time_per_iteration": 2.495887517929077 + }, + { + "auxiliary_loss_clip": 0.01114153, + "auxiliary_loss_mlp": 0.01032585, + "balance_loss_clip": 1.04381442, + "balance_loss_mlp": 1.02174938, + "epoch": 0.8012625883060274, + "flos": 23587062497280.0, + "grad_norm": 1.9950203921788066, + "language_loss": 0.66520852, + "learning_rate": 4.001011894937765e-07, + "loss": 0.68667591, + "num_input_tokens_seen": 287560015, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.10839844, + "step": 13327, + "time_per_iteration": 2.428999662399292 + }, + { + "auxiliary_loss_clip": 0.01115883, + "auxiliary_loss_mlp": 0.01030156, + "balance_loss_clip": 1.04470849, + "balance_loss_mlp": 1.01872361, + "epoch": 0.8013227115586953, + "flos": 20813932886400.0, + "grad_norm": 1.7042440345137326, + "language_loss": 0.74058354, + "learning_rate": 3.9986751654988636e-07, + "loss": 0.76204395, + "num_input_tokens_seen": 287579150, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.11431885, + "step": 13328, + "time_per_iteration": 3.933720827102661 + }, + { + "auxiliary_loss_clip": 0.01121867, + "auxiliary_loss_mlp": 0.01032623, + "balance_loss_clip": 1.04631448, + "balance_loss_mlp": 1.02001715, + "epoch": 0.8013828348113633, + "flos": 15888317788800.0, + "grad_norm": 1.9047056901279387, + "language_loss": 0.74236733, + "learning_rate": 3.996339042831798e-07, + "loss": 0.7639122, + "num_input_tokens_seen": 287597420, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.12609863, + "step": 13329, + "time_per_iteration": 2.4203941822052 + }, + { + "auxiliary_loss_clip": 0.01057627, + "auxiliary_loss_mlp": 0.01003814, + "balance_loss_clip": 1.03333545, + "balance_loss_mlp": 1.00263667, + "epoch": 0.8014429580640313, + "flos": 71062981562880.0, + "grad_norm": 0.6976977410555706, + "language_loss": 0.52877116, + "learning_rate": 3.9940035270251605e-07, + "loss": 0.54938555, + "num_input_tokens_seen": 287667280, + "router_z_loss_clip": 0.24291992, + "router_z_loss_mlp": 0.01176453, + "step": 13330, + "time_per_iteration": 3.153247833251953 + }, + { + "auxiliary_loss_clip": 0.01109772, + "auxiliary_loss_mlp": 0.01035999, + "balance_loss_clip": 1.0353967, + "balance_loss_mlp": 1.02267122, + "epoch": 0.8015030813166992, + "flos": 23076340968960.0, + "grad_norm": 1.8572435457495862, + "language_loss": 0.72757131, + "learning_rate": 3.991668618167519e-07, + "loss": 0.74902898, + "num_input_tokens_seen": 287687375, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.13342285, + "step": 13331, + "time_per_iteration": 2.49749493598938 + }, + { + "auxiliary_loss_clip": 0.01111163, + "auxiliary_loss_mlp": 0.01028715, + "balance_loss_clip": 1.03906679, + "balance_loss_mlp": 1.01784933, + "epoch": 0.8015632045693672, + "flos": 21872328059520.0, + "grad_norm": 1.8256598099219168, + "language_loss": 0.77550477, + "learning_rate": 3.989334316347401e-07, + "loss": 0.79690355, + "num_input_tokens_seen": 287707895, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.10858154, + "step": 13332, + "time_per_iteration": 2.4847071170806885 + }, + { + "auxiliary_loss_clip": 0.01115618, + "auxiliary_loss_mlp": 0.01028691, + "balance_loss_clip": 1.04265201, + "balance_loss_mlp": 1.01716328, + "epoch": 0.8016233278220352, + "flos": 23656728925440.0, + "grad_norm": 1.9647302342732769, + "language_loss": 0.83457345, + "learning_rate": 3.987000621653338e-07, + "loss": 0.85601658, + "num_input_tokens_seen": 287723990, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11529541, + "step": 13333, + "time_per_iteration": 2.591582775115967 + }, + { + "auxiliary_loss_clip": 0.01111144, + "auxiliary_loss_mlp": 0.01029581, + "balance_loss_clip": 1.03785288, + "balance_loss_mlp": 1.01770759, + "epoch": 0.8016834510747032, + "flos": 16253170185600.0, + "grad_norm": 1.648109138735047, + "language_loss": 0.73602223, + "learning_rate": 3.9846675341738133e-07, + "loss": 0.75742948, + "num_input_tokens_seen": 287742380, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11871338, + "step": 13334, + "time_per_iteration": 2.493624448776245 + }, + { + "auxiliary_loss_clip": 0.01116029, + "auxiliary_loss_mlp": 0.01030386, + "balance_loss_clip": 1.04586756, + "balance_loss_mlp": 1.01852512, + "epoch": 0.8017435743273711, + "flos": 12276027665280.0, + "grad_norm": 2.2803011456696947, + "language_loss": 0.74802923, + "learning_rate": 3.9823350539972967e-07, + "loss": 0.76949334, + "num_input_tokens_seen": 287760130, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.11871338, + "step": 13335, + "time_per_iteration": 2.496882200241089 + }, + { + "auxiliary_loss_clip": 0.01109522, + "auxiliary_loss_mlp": 0.01028808, + "balance_loss_clip": 1.03891611, + "balance_loss_mlp": 1.01634479, + "epoch": 0.8018036975800391, + "flos": 17196112068480.0, + "grad_norm": 2.083728202940875, + "language_loss": 0.75723052, + "learning_rate": 3.9800031812122416e-07, + "loss": 0.77861381, + "num_input_tokens_seen": 287777565, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.12481689, + "step": 13336, + "time_per_iteration": 2.4660165309906006 + }, + { + "auxiliary_loss_clip": 0.01127127, + "auxiliary_loss_mlp": 0.01035176, + "balance_loss_clip": 1.04883862, + "balance_loss_mlp": 1.02254009, + "epoch": 0.801863820832707, + "flos": 20631865824000.0, + "grad_norm": 2.1454774726941785, + "language_loss": 0.7560246, + "learning_rate": 3.977671915907068e-07, + "loss": 0.77764761, + "num_input_tokens_seen": 287796310, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.12628174, + "step": 13337, + "time_per_iteration": 2.4497742652893066 + }, + { + "auxiliary_loss_clip": 0.01117603, + "auxiliary_loss_mlp": 0.01032885, + "balance_loss_clip": 1.0434041, + "balance_loss_mlp": 1.02046371, + "epoch": 0.801923944085375, + "flos": 30445569285120.0, + "grad_norm": 2.982895244223122, + "language_loss": 0.80012608, + "learning_rate": 3.9753412581701883e-07, + "loss": 0.82163095, + "num_input_tokens_seen": 287817330, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.12420654, + "step": 13338, + "time_per_iteration": 2.50956392288208 + }, + { + "auxiliary_loss_clip": 0.01111261, + "auxiliary_loss_mlp": 0.01031208, + "balance_loss_clip": 1.03736353, + "balance_loss_mlp": 1.01695645, + "epoch": 0.801984067338043, + "flos": 20010575255040.0, + "grad_norm": 2.7427224648483333, + "language_loss": 0.75036532, + "learning_rate": 3.9730112080899733e-07, + "loss": 0.77179003, + "num_input_tokens_seen": 287835095, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.14233398, + "step": 13339, + "time_per_iteration": 2.469665765762329 + }, + { + "auxiliary_loss_clip": 0.01110081, + "auxiliary_loss_mlp": 0.01027549, + "balance_loss_clip": 1.04134548, + "balance_loss_mlp": 1.01674879, + "epoch": 0.802044190590711, + "flos": 22784028088320.0, + "grad_norm": 1.7688752528153295, + "language_loss": 0.79052019, + "learning_rate": 3.970681765754775e-07, + "loss": 0.8118965, + "num_input_tokens_seen": 287854595, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.10803223, + "step": 13340, + "time_per_iteration": 2.472574234008789 + }, + { + "auxiliary_loss_clip": 0.01111671, + "auxiliary_loss_mlp": 0.01027005, + "balance_loss_clip": 1.03944886, + "balance_loss_mlp": 1.01590109, + "epoch": 0.8021043138433789, + "flos": 27600115639680.0, + "grad_norm": 1.6651767897646428, + "language_loss": 0.68187207, + "learning_rate": 3.968352931252936e-07, + "loss": 0.70325881, + "num_input_tokens_seen": 287876960, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11102295, + "step": 13341, + "time_per_iteration": 2.48781156539917 + }, + { + "auxiliary_loss_clip": 0.01056946, + "auxiliary_loss_mlp": 0.01006358, + "balance_loss_clip": 1.03200495, + "balance_loss_mlp": 1.00515282, + "epoch": 0.8021644370960469, + "flos": 62063730057600.0, + "grad_norm": 0.8133277217974251, + "language_loss": 0.61643493, + "learning_rate": 3.9660247046727547e-07, + "loss": 0.63706797, + "num_input_tokens_seen": 287936530, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.01205444, + "step": 13342, + "time_per_iteration": 3.026583671569824 + }, + { + "auxiliary_loss_clip": 0.01118255, + "auxiliary_loss_mlp": 0.01035139, + "balance_loss_clip": 1.04475713, + "balance_loss_mlp": 1.02260983, + "epoch": 0.8022245603487148, + "flos": 23361794352000.0, + "grad_norm": 2.2931315066277094, + "language_loss": 0.63835907, + "learning_rate": 3.963697086102522e-07, + "loss": 0.65989304, + "num_input_tokens_seen": 287954285, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.12530518, + "step": 13343, + "time_per_iteration": 3.8945231437683105 + }, + { + "auxiliary_loss_clip": 0.0111077, + "auxiliary_loss_mlp": 0.01040774, + "balance_loss_clip": 1.03950655, + "balance_loss_mlp": 1.02840626, + "epoch": 0.8022846836013828, + "flos": 10853354712960.0, + "grad_norm": 1.925651368187303, + "language_loss": 0.69554591, + "learning_rate": 3.96137007563051e-07, + "loss": 0.7170614, + "num_input_tokens_seen": 287971595, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.1237793, + "step": 13344, + "time_per_iteration": 2.4288885593414307 + }, + { + "auxiliary_loss_clip": 0.01124297, + "auxiliary_loss_mlp": 0.01028446, + "balance_loss_clip": 1.04981279, + "balance_loss_mlp": 1.01564872, + "epoch": 0.8023448068540509, + "flos": 29240443054080.0, + "grad_norm": 2.0146163665629735, + "language_loss": 0.70181692, + "learning_rate": 3.9590436733449506e-07, + "loss": 0.72334433, + "num_input_tokens_seen": 287992540, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.12792969, + "step": 13345, + "time_per_iteration": 2.6216351985931396 + }, + { + "auxiliary_loss_clip": 0.01046674, + "auxiliary_loss_mlp": 0.01004187, + "balance_loss_clip": 1.02206039, + "balance_loss_mlp": 1.002846, + "epoch": 0.8024049301067188, + "flos": 64153588181760.0, + "grad_norm": 0.8916922114733022, + "language_loss": 0.6294862, + "learning_rate": 3.956717879334059e-07, + "loss": 0.64999479, + "num_input_tokens_seen": 288052810, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.01342773, + "step": 13346, + "time_per_iteration": 3.1164870262145996 + }, + { + "auxiliary_loss_clip": 0.01110472, + "auxiliary_loss_mlp": 0.01030134, + "balance_loss_clip": 1.04080606, + "balance_loss_mlp": 1.01869011, + "epoch": 0.8024650533593868, + "flos": 28585360765440.0, + "grad_norm": 1.527240059848841, + "language_loss": 0.72579575, + "learning_rate": 3.9543926936860327e-07, + "loss": 0.7472018, + "num_input_tokens_seen": 288073045, + "router_z_loss_clip": 0.69677734, + "router_z_loss_mlp": 0.11456299, + "step": 13347, + "time_per_iteration": 2.643406391143799 + }, + { + "auxiliary_loss_clip": 0.01115834, + "auxiliary_loss_mlp": 0.01028355, + "balance_loss_clip": 1.04281819, + "balance_loss_mlp": 1.01599932, + "epoch": 0.8025251766120547, + "flos": 16982264448000.0, + "grad_norm": 1.9994215055034972, + "language_loss": 0.7282964, + "learning_rate": 3.9520681164890493e-07, + "loss": 0.74973822, + "num_input_tokens_seen": 288091165, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.12353516, + "step": 13348, + "time_per_iteration": 2.458111524581909 + }, + { + "auxiliary_loss_clip": 0.01114166, + "auxiliary_loss_mlp": 0.0103265, + "balance_loss_clip": 1.04042566, + "balance_loss_mlp": 1.01927423, + "epoch": 0.8025852998647227, + "flos": 22163671272960.0, + "grad_norm": 1.8444998524237712, + "language_loss": 0.7602126, + "learning_rate": 3.9497441478312444e-07, + "loss": 0.7816807, + "num_input_tokens_seen": 288110595, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.1338501, + "step": 13349, + "time_per_iteration": 2.4294116497039795 + }, + { + "auxiliary_loss_clip": 0.01120834, + "auxiliary_loss_mlp": 0.01031489, + "balance_loss_clip": 1.04843354, + "balance_loss_mlp": 1.02080774, + "epoch": 0.8026454231173906, + "flos": 22017012042240.0, + "grad_norm": 2.383807973436691, + "language_loss": 0.83303642, + "learning_rate": 3.947420787800755e-07, + "loss": 0.8545596, + "num_input_tokens_seen": 288128995, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.10693359, + "step": 13350, + "time_per_iteration": 2.4524037837982178 + }, + { + "auxiliary_loss_clip": 0.01117114, + "auxiliary_loss_mlp": 0.01036987, + "balance_loss_clip": 1.04509795, + "balance_loss_mlp": 1.0254302, + "epoch": 0.8027055463700586, + "flos": 22491320158080.0, + "grad_norm": 1.635575758563308, + "language_loss": 0.71476293, + "learning_rate": 3.945098036485679e-07, + "loss": 0.73630404, + "num_input_tokens_seen": 288149265, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11566162, + "step": 13351, + "time_per_iteration": 2.45402193069458 + }, + { + "auxiliary_loss_clip": 0.01115007, + "auxiliary_loss_mlp": 0.01034565, + "balance_loss_clip": 1.04388547, + "balance_loss_mlp": 1.02142835, + "epoch": 0.8027656696227266, + "flos": 28912901909760.0, + "grad_norm": 2.325507448109976, + "language_loss": 0.61690652, + "learning_rate": 3.9427758939740885e-07, + "loss": 0.63840222, + "num_input_tokens_seen": 288170745, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.13122559, + "step": 13352, + "time_per_iteration": 2.5171122550964355 + }, + { + "auxiliary_loss_clip": 0.01117701, + "auxiliary_loss_mlp": 0.01034114, + "balance_loss_clip": 1.04650438, + "balance_loss_mlp": 1.02236009, + "epoch": 0.8028257928753946, + "flos": 18589374760320.0, + "grad_norm": 2.8338557227077192, + "language_loss": 0.77234709, + "learning_rate": 3.940454360354046e-07, + "loss": 0.79386532, + "num_input_tokens_seen": 288189415, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.11749268, + "step": 13353, + "time_per_iteration": 2.424490451812744 + }, + { + "auxiliary_loss_clip": 0.01119965, + "auxiliary_loss_mlp": 0.01030463, + "balance_loss_clip": 1.04061413, + "balance_loss_mlp": 1.01683712, + "epoch": 0.8028859161280625, + "flos": 19130009339520.0, + "grad_norm": 2.339549372377918, + "language_loss": 0.73641187, + "learning_rate": 3.938133435713582e-07, + "loss": 0.75791615, + "num_input_tokens_seen": 288206900, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.13623047, + "step": 13354, + "time_per_iteration": 2.4467954635620117 + }, + { + "auxiliary_loss_clip": 0.01116607, + "auxiliary_loss_mlp": 0.01032155, + "balance_loss_clip": 1.04312885, + "balance_loss_mlp": 1.02068067, + "epoch": 0.8029460393807305, + "flos": 20229881742720.0, + "grad_norm": 2.146383460299956, + "language_loss": 0.6592955, + "learning_rate": 3.935813120140714e-07, + "loss": 0.68078309, + "num_input_tokens_seen": 288224800, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11474609, + "step": 13355, + "time_per_iteration": 2.4080138206481934 + }, + { + "auxiliary_loss_clip": 0.01125383, + "auxiliary_loss_mlp": 0.01028301, + "balance_loss_clip": 1.04900336, + "balance_loss_mlp": 1.01538503, + "epoch": 0.8030061626333984, + "flos": 49783320933120.0, + "grad_norm": 2.1197048018603146, + "language_loss": 0.68806207, + "learning_rate": 3.9334934137234235e-07, + "loss": 0.70959896, + "num_input_tokens_seen": 288249400, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.12915039, + "step": 13356, + "time_per_iteration": 2.683330774307251 + }, + { + "auxiliary_loss_clip": 0.01112833, + "auxiliary_loss_mlp": 0.01028557, + "balance_loss_clip": 1.04069304, + "balance_loss_mlp": 1.01706481, + "epoch": 0.8030662858860664, + "flos": 21615243442560.0, + "grad_norm": 1.5308994756976586, + "language_loss": 0.77652836, + "learning_rate": 3.931174316549666e-07, + "loss": 0.79794228, + "num_input_tokens_seen": 288268780, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.11499023, + "step": 13357, + "time_per_iteration": 2.4763705730438232 + }, + { + "auxiliary_loss_clip": 0.01109597, + "auxiliary_loss_mlp": 0.01031693, + "balance_loss_clip": 1.03510571, + "balance_loss_mlp": 1.01871705, + "epoch": 0.8031264091387345, + "flos": 25630056351360.0, + "grad_norm": 1.4213732692338956, + "language_loss": 0.77033603, + "learning_rate": 3.9288558287073937e-07, + "loss": 0.79174888, + "num_input_tokens_seen": 288290830, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.12982178, + "step": 13358, + "time_per_iteration": 2.5047667026519775 + }, + { + "auxiliary_loss_clip": 0.01109331, + "auxiliary_loss_mlp": 0.01033504, + "balance_loss_clip": 1.03945649, + "balance_loss_mlp": 1.02111793, + "epoch": 0.8031865323914024, + "flos": 19646225648640.0, + "grad_norm": 1.5131121690591283, + "language_loss": 0.84805429, + "learning_rate": 3.9265379502845143e-07, + "loss": 0.86948264, + "num_input_tokens_seen": 288308865, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.1239624, + "step": 13359, + "time_per_iteration": 2.4530410766601562 + }, + { + "auxiliary_loss_clip": 0.01107209, + "auxiliary_loss_mlp": 0.01026161, + "balance_loss_clip": 1.0371685, + "balance_loss_mlp": 1.01527715, + "epoch": 0.8032466556440704, + "flos": 26169110732160.0, + "grad_norm": 2.6458828703719264, + "language_loss": 0.73602849, + "learning_rate": 3.924220681368928e-07, + "loss": 0.75736219, + "num_input_tokens_seen": 288327325, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.10876465, + "step": 13360, + "time_per_iteration": 4.083327770233154 + }, + { + "auxiliary_loss_clip": 0.01121764, + "auxiliary_loss_mlp": 0.01040172, + "balance_loss_clip": 1.04443645, + "balance_loss_mlp": 1.02781606, + "epoch": 0.8033067788967383, + "flos": 25520026014720.0, + "grad_norm": 2.1606077896568228, + "language_loss": 0.69676578, + "learning_rate": 3.921904022048512e-07, + "loss": 0.71838516, + "num_input_tokens_seen": 288347285, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.12359619, + "step": 13361, + "time_per_iteration": 2.557708501815796 + }, + { + "auxiliary_loss_clip": 0.01118057, + "auxiliary_loss_mlp": 0.01036957, + "balance_loss_clip": 1.04187536, + "balance_loss_mlp": 1.0243988, + "epoch": 0.8033669021494063, + "flos": 24024274842240.0, + "grad_norm": 1.5273536344874294, + "language_loss": 0.70329964, + "learning_rate": 3.919587972411098e-07, + "loss": 0.72484982, + "num_input_tokens_seen": 288367785, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.12561035, + "step": 13362, + "time_per_iteration": 2.4682888984680176 + }, + { + "auxiliary_loss_clip": 0.01122518, + "auxiliary_loss_mlp": 0.01039611, + "balance_loss_clip": 1.04207516, + "balance_loss_mlp": 1.02482343, + "epoch": 0.8034270254020742, + "flos": 13588059749760.0, + "grad_norm": 2.4717891788089634, + "language_loss": 0.78756469, + "learning_rate": 3.91727253254452e-07, + "loss": 0.80918598, + "num_input_tokens_seen": 288384135, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.14788818, + "step": 13363, + "time_per_iteration": 2.497358798980713 + }, + { + "auxiliary_loss_clip": 0.01110752, + "auxiliary_loss_mlp": 0.01029865, + "balance_loss_clip": 1.03801656, + "balance_loss_mlp": 1.01760995, + "epoch": 0.8034871486547422, + "flos": 27412661537280.0, + "grad_norm": 2.0492599358159427, + "language_loss": 0.74936056, + "learning_rate": 3.9149577025365787e-07, + "loss": 0.77076674, + "num_input_tokens_seen": 288403805, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.12261963, + "step": 13364, + "time_per_iteration": 2.4947500228881836 + }, + { + "auxiliary_loss_clip": 0.01122834, + "auxiliary_loss_mlp": 0.01032845, + "balance_loss_clip": 1.05037951, + "balance_loss_mlp": 1.02103758, + "epoch": 0.8035472719074102, + "flos": 32598593475840.0, + "grad_norm": 2.248236809070111, + "language_loss": 0.60436559, + "learning_rate": 3.9126434824750596e-07, + "loss": 0.62592232, + "num_input_tokens_seen": 288424895, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11816406, + "step": 13365, + "time_per_iteration": 3.9348702430725098 + }, + { + "auxiliary_loss_clip": 0.01120614, + "auxiliary_loss_mlp": 0.01032508, + "balance_loss_clip": 1.04507232, + "balance_loss_mlp": 1.01978874, + "epoch": 0.8036073951600782, + "flos": 21287989607040.0, + "grad_norm": 2.49659633275392, + "language_loss": 0.664675, + "learning_rate": 3.910329872447706e-07, + "loss": 0.68620622, + "num_input_tokens_seen": 288443865, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.1272583, + "step": 13366, + "time_per_iteration": 2.437485933303833 + }, + { + "auxiliary_loss_clip": 0.0111534, + "auxiliary_loss_mlp": 0.01026672, + "balance_loss_clip": 1.04388201, + "balance_loss_mlp": 1.0153296, + "epoch": 0.8036675184127461, + "flos": 18113845582080.0, + "grad_norm": 2.4184588092275114, + "language_loss": 0.75460947, + "learning_rate": 3.908016872542259e-07, + "loss": 0.77602959, + "num_input_tokens_seen": 288461065, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11346436, + "step": 13367, + "time_per_iteration": 2.444234609603882 + }, + { + "auxiliary_loss_clip": 0.01115069, + "auxiliary_loss_mlp": 0.01028831, + "balance_loss_clip": 1.04114354, + "balance_loss_mlp": 1.01704705, + "epoch": 0.8037276416654141, + "flos": 26030280666240.0, + "grad_norm": 1.5162606442815056, + "language_loss": 0.74137759, + "learning_rate": 3.905704482846428e-07, + "loss": 0.76281661, + "num_input_tokens_seen": 288481865, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11779785, + "step": 13368, + "time_per_iteration": 2.56860613822937 + }, + { + "auxiliary_loss_clip": 0.01122115, + "auxiliary_loss_mlp": 0.01030535, + "balance_loss_clip": 1.04644966, + "balance_loss_mlp": 1.01844752, + "epoch": 0.803787764918082, + "flos": 18802180886400.0, + "grad_norm": 1.9380295646240713, + "language_loss": 0.69979131, + "learning_rate": 3.90339270344789e-07, + "loss": 0.72131777, + "num_input_tokens_seen": 288499345, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.12078857, + "step": 13369, + "time_per_iteration": 2.462184190750122 + }, + { + "auxiliary_loss_clip": 0.01115205, + "auxiliary_loss_mlp": 0.01028618, + "balance_loss_clip": 1.04466724, + "balance_loss_mlp": 1.01741242, + "epoch": 0.80384788817075, + "flos": 20225787592320.0, + "grad_norm": 2.4869204451454814, + "language_loss": 0.73620611, + "learning_rate": 3.901081534434312e-07, + "loss": 0.75764436, + "num_input_tokens_seen": 288517660, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.11193848, + "step": 13370, + "time_per_iteration": 2.4258029460906982 + }, + { + "auxiliary_loss_clip": 0.0112103, + "auxiliary_loss_mlp": 0.01033058, + "balance_loss_clip": 1.04546356, + "balance_loss_mlp": 1.02039838, + "epoch": 0.8039080114234181, + "flos": 18515290959360.0, + "grad_norm": 13.198180888120186, + "language_loss": 0.87583321, + "learning_rate": 3.898770975893342e-07, + "loss": 0.89737409, + "num_input_tokens_seen": 288534180, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.12664795, + "step": 13371, + "time_per_iteration": 2.4285945892333984 + }, + { + "auxiliary_loss_clip": 0.01117839, + "auxiliary_loss_mlp": 0.01030433, + "balance_loss_clip": 1.04160202, + "balance_loss_mlp": 1.01780856, + "epoch": 0.803968134676086, + "flos": 22382510883840.0, + "grad_norm": 1.8213621163796019, + "language_loss": 0.74655914, + "learning_rate": 3.89646102791259e-07, + "loss": 0.76804185, + "num_input_tokens_seen": 288553350, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.12628174, + "step": 13372, + "time_per_iteration": 3.9309558868408203 + }, + { + "auxiliary_loss_clip": 0.01106345, + "auxiliary_loss_mlp": 0.0102885, + "balance_loss_clip": 1.03624487, + "balance_loss_mlp": 1.0163455, + "epoch": 0.804028257928754, + "flos": 23842566915840.0, + "grad_norm": 3.713082654587592, + "language_loss": 0.79031479, + "learning_rate": 3.894151690579646e-07, + "loss": 0.81166673, + "num_input_tokens_seen": 288571325, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.12512207, + "step": 13373, + "time_per_iteration": 2.5630242824554443 + }, + { + "auxiliary_loss_clip": 0.01117337, + "auxiliary_loss_mlp": 0.01032425, + "balance_loss_clip": 1.04696572, + "balance_loss_mlp": 1.02113605, + "epoch": 0.8040883811814219, + "flos": 23550720912000.0, + "grad_norm": 1.511420009421458, + "language_loss": 0.74587774, + "learning_rate": 3.8918429639820815e-07, + "loss": 0.76737535, + "num_input_tokens_seen": 288592100, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.11291504, + "step": 13374, + "time_per_iteration": 2.4760193824768066 + }, + { + "auxiliary_loss_clip": 0.01120408, + "auxiliary_loss_mlp": 0.01029432, + "balance_loss_clip": 1.04569674, + "balance_loss_mlp": 1.01732659, + "epoch": 0.8041485044340899, + "flos": 19026263882880.0, + "grad_norm": 2.0736861773416657, + "language_loss": 0.68798345, + "learning_rate": 3.889534848207452e-07, + "loss": 0.70948184, + "num_input_tokens_seen": 288612305, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.12109375, + "step": 13375, + "time_per_iteration": 2.4319369792938232 + }, + { + "auxiliary_loss_clip": 0.01037366, + "auxiliary_loss_mlp": 0.01001201, + "balance_loss_clip": 1.01271129, + "balance_loss_mlp": 0.99984914, + "epoch": 0.8042086276867578, + "flos": 70005663797760.0, + "grad_norm": 0.7263378141153154, + "language_loss": 0.55680275, + "learning_rate": 3.887227343343271e-07, + "loss": 0.57718849, + "num_input_tokens_seen": 288676015, + "router_z_loss_clip": 0.24658203, + "router_z_loss_mlp": 0.01351929, + "step": 13376, + "time_per_iteration": 3.1445772647857666 + }, + { + "auxiliary_loss_clip": 0.0111094, + "auxiliary_loss_mlp": 0.01027567, + "balance_loss_clip": 1.03788376, + "balance_loss_mlp": 1.01558065, + "epoch": 0.8042687509394258, + "flos": 21872435800320.0, + "grad_norm": 1.6097393162764277, + "language_loss": 0.7308526, + "learning_rate": 3.8849204494770425e-07, + "loss": 0.75223774, + "num_input_tokens_seen": 288696455, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11993408, + "step": 13377, + "time_per_iteration": 2.4402544498443604 + }, + { + "auxiliary_loss_clip": 0.01111141, + "auxiliary_loss_mlp": 0.01028798, + "balance_loss_clip": 1.03797781, + "balance_loss_mlp": 1.01696038, + "epoch": 0.8043288741920938, + "flos": 26614870513920.0, + "grad_norm": 1.7420607479473544, + "language_loss": 0.70492172, + "learning_rate": 3.8826141666962567e-07, + "loss": 0.7263211, + "num_input_tokens_seen": 288715560, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11834717, + "step": 13378, + "time_per_iteration": 2.5591540336608887 + }, + { + "auxiliary_loss_clip": 0.01114521, + "auxiliary_loss_mlp": 0.0103175, + "balance_loss_clip": 1.04133701, + "balance_loss_mlp": 1.01836932, + "epoch": 0.8043889974447618, + "flos": 33403387651200.0, + "grad_norm": 1.454552415085394, + "language_loss": 0.69326288, + "learning_rate": 3.880308495088347e-07, + "loss": 0.71472561, + "num_input_tokens_seen": 288739485, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.13378906, + "step": 13379, + "time_per_iteration": 2.56463623046875 + }, + { + "auxiliary_loss_clip": 0.0112135, + "auxiliary_loss_mlp": 0.01038781, + "balance_loss_clip": 1.04307365, + "balance_loss_mlp": 1.02474976, + "epoch": 0.8044491206974297, + "flos": 20375966355840.0, + "grad_norm": 1.7716515129411905, + "language_loss": 0.76265383, + "learning_rate": 3.8780034347407533e-07, + "loss": 0.78425515, + "num_input_tokens_seen": 288757420, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.14025879, + "step": 13380, + "time_per_iteration": 2.479344606399536 + }, + { + "auxiliary_loss_clip": 0.01114774, + "auxiliary_loss_mlp": 0.01027797, + "balance_loss_clip": 1.03981531, + "balance_loss_mlp": 1.01639462, + "epoch": 0.8045092439500977, + "flos": 23403810286080.0, + "grad_norm": 2.2259787337036205, + "language_loss": 0.6913538, + "learning_rate": 3.875698985740887e-07, + "loss": 0.71277952, + "num_input_tokens_seen": 288775535, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11401367, + "step": 13381, + "time_per_iteration": 2.423489570617676 + }, + { + "auxiliary_loss_clip": 0.01117525, + "auxiliary_loss_mlp": 0.01030374, + "balance_loss_clip": 1.04488969, + "balance_loss_mlp": 1.01859617, + "epoch": 0.8045693672027656, + "flos": 24097245321600.0, + "grad_norm": 1.8366876157068865, + "language_loss": 0.6408428, + "learning_rate": 3.873395148176135e-07, + "loss": 0.66232175, + "num_input_tokens_seen": 288795035, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11779785, + "step": 13382, + "time_per_iteration": 2.501218318939209 + }, + { + "auxiliary_loss_clip": 0.01115953, + "auxiliary_loss_mlp": 0.01032126, + "balance_loss_clip": 1.04385138, + "balance_loss_mlp": 1.02128994, + "epoch": 0.8046294904554336, + "flos": 27707165147520.0, + "grad_norm": 3.2611274896047444, + "language_loss": 0.7695387, + "learning_rate": 3.8710919221338487e-07, + "loss": 0.79101956, + "num_input_tokens_seen": 288816270, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.10839844, + "step": 13383, + "time_per_iteration": 2.4774279594421387 + }, + { + "auxiliary_loss_clip": 0.01115027, + "auxiliary_loss_mlp": 0.01035275, + "balance_loss_clip": 1.04100323, + "balance_loss_mlp": 1.0231576, + "epoch": 0.8046896137081017, + "flos": 24972998814720.0, + "grad_norm": 1.7676378338968715, + "language_loss": 0.69776541, + "learning_rate": 3.868789307701381e-07, + "loss": 0.71926844, + "num_input_tokens_seen": 288836050, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.12127686, + "step": 13384, + "time_per_iteration": 2.470005989074707 + }, + { + "auxiliary_loss_clip": 0.01115072, + "auxiliary_loss_mlp": 0.0103421, + "balance_loss_clip": 1.03887725, + "balance_loss_mlp": 1.02091801, + "epoch": 0.8047497369607696, + "flos": 17675484001920.0, + "grad_norm": 2.1922082200733732, + "language_loss": 0.79915524, + "learning_rate": 3.8664873049660375e-07, + "loss": 0.82064801, + "num_input_tokens_seen": 288852900, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.13287354, + "step": 13385, + "time_per_iteration": 2.4180078506469727 + }, + { + "auxiliary_loss_clip": 0.01110172, + "auxiliary_loss_mlp": 0.01033658, + "balance_loss_clip": 1.03743815, + "balance_loss_mlp": 1.0207653, + "epoch": 0.8048098602134376, + "flos": 22382079920640.0, + "grad_norm": 1.6951002793120844, + "language_loss": 0.72320712, + "learning_rate": 3.864185914015108e-07, + "loss": 0.74464542, + "num_input_tokens_seen": 288872625, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.12890625, + "step": 13386, + "time_per_iteration": 2.5107827186584473 + }, + { + "auxiliary_loss_clip": 0.01036139, + "auxiliary_loss_mlp": 0.0100299, + "balance_loss_clip": 1.01146138, + "balance_loss_mlp": 1.00175285, + "epoch": 0.8048699834661055, + "flos": 71200949702400.0, + "grad_norm": 0.6851344195342169, + "language_loss": 0.51237231, + "learning_rate": 3.861885134935865e-07, + "loss": 0.5327636, + "num_input_tokens_seen": 288939180, + "router_z_loss_clip": 0.24682617, + "router_z_loss_mlp": 0.01235962, + "step": 13387, + "time_per_iteration": 4.63828182220459 + }, + { + "auxiliary_loss_clip": 0.01108424, + "auxiliary_loss_mlp": 0.01032984, + "balance_loss_clip": 1.0362587, + "balance_loss_mlp": 1.01979947, + "epoch": 0.8049301067187735, + "flos": 23660320285440.0, + "grad_norm": 2.6857045958533283, + "language_loss": 0.73917431, + "learning_rate": 3.859584967815559e-07, + "loss": 0.76058835, + "num_input_tokens_seen": 288958925, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.13183594, + "step": 13388, + "time_per_iteration": 2.4744136333465576 + }, + { + "auxiliary_loss_clip": 0.01110782, + "auxiliary_loss_mlp": 0.0102846, + "balance_loss_clip": 1.04000032, + "balance_loss_mlp": 1.01680183, + "epoch": 0.8049902299714414, + "flos": 24426330750720.0, + "grad_norm": 2.116207075544422, + "language_loss": 0.71880174, + "learning_rate": 3.857285412741411e-07, + "loss": 0.74019414, + "num_input_tokens_seen": 288980935, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.11663818, + "step": 13389, + "time_per_iteration": 2.518172264099121 + }, + { + "auxiliary_loss_clip": 0.01120292, + "auxiliary_loss_mlp": 0.01036155, + "balance_loss_clip": 1.04320717, + "balance_loss_mlp": 1.02335167, + "epoch": 0.8050503532241094, + "flos": 17492626840320.0, + "grad_norm": 2.0502068546676626, + "language_loss": 0.83059835, + "learning_rate": 3.8549864698006097e-07, + "loss": 0.85216284, + "num_input_tokens_seen": 288996780, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.12792969, + "step": 13390, + "time_per_iteration": 2.411332845687866 + }, + { + "auxiliary_loss_clip": 0.01045466, + "auxiliary_loss_mlp": 0.01005781, + "balance_loss_clip": 1.02037704, + "balance_loss_mlp": 1.00440705, + "epoch": 0.8051104764767774, + "flos": 57658030369920.0, + "grad_norm": 0.7738952883177418, + "language_loss": 0.55526578, + "learning_rate": 3.8526881390803424e-07, + "loss": 0.57577825, + "num_input_tokens_seen": 289057590, + "router_z_loss_clip": 0.25097656, + "router_z_loss_mlp": 0.01374817, + "step": 13391, + "time_per_iteration": 3.1069676876068115 + }, + { + "auxiliary_loss_clip": 0.01110198, + "auxiliary_loss_mlp": 0.01031682, + "balance_loss_clip": 1.04098105, + "balance_loss_mlp": 1.01895046, + "epoch": 0.8051705997294454, + "flos": 18003456109440.0, + "grad_norm": 1.5802390806319908, + "language_loss": 0.8491289, + "learning_rate": 3.850390420667762e-07, + "loss": 0.87054765, + "num_input_tokens_seen": 289076285, + "router_z_loss_clip": 0.69238281, + "router_z_loss_mlp": 0.12731934, + "step": 13392, + "time_per_iteration": 2.42641282081604 + }, + { + "auxiliary_loss_clip": 0.01106486, + "auxiliary_loss_mlp": 0.01033322, + "balance_loss_clip": 1.03506148, + "balance_loss_mlp": 1.0205723, + "epoch": 0.8052307229821133, + "flos": 26397754755840.0, + "grad_norm": 1.4517342695747286, + "language_loss": 0.70260191, + "learning_rate": 3.8480933146499914e-07, + "loss": 0.72399998, + "num_input_tokens_seen": 289097585, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.12762451, + "step": 13393, + "time_per_iteration": 2.5313546657562256 + }, + { + "auxiliary_loss_clip": 0.01117976, + "auxiliary_loss_mlp": 0.01029281, + "balance_loss_clip": 1.04250503, + "balance_loss_mlp": 1.01646638, + "epoch": 0.8052908462347813, + "flos": 21757018423680.0, + "grad_norm": 2.5169811785598863, + "language_loss": 0.76052308, + "learning_rate": 3.84579682111414e-07, + "loss": 0.78199565, + "num_input_tokens_seen": 289116890, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.12817383, + "step": 13394, + "time_per_iteration": 2.4409210681915283 + }, + { + "auxiliary_loss_clip": 0.01119069, + "auxiliary_loss_mlp": 0.01029192, + "balance_loss_clip": 1.04484117, + "balance_loss_mlp": 1.01823044, + "epoch": 0.8053509694874492, + "flos": 25442279026560.0, + "grad_norm": 1.8422705111488082, + "language_loss": 0.64919496, + "learning_rate": 3.843500940147304e-07, + "loss": 0.6706776, + "num_input_tokens_seen": 289136670, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.10968018, + "step": 13395, + "time_per_iteration": 2.5166375637054443 + }, + { + "auxiliary_loss_clip": 0.01052065, + "auxiliary_loss_mlp": 0.01002138, + "balance_loss_clip": 1.02637565, + "balance_loss_mlp": 1.00072801, + "epoch": 0.8054110927401172, + "flos": 57668122091520.0, + "grad_norm": 0.7485880185809004, + "language_loss": 0.57369214, + "learning_rate": 3.8412056718365206e-07, + "loss": 0.59423423, + "num_input_tokens_seen": 289200150, + "router_z_loss_clip": 0.25732422, + "router_z_loss_mlp": 0.01408386, + "step": 13396, + "time_per_iteration": 3.18990421295166 + }, + { + "auxiliary_loss_clip": 0.01114944, + "auxiliary_loss_mlp": 0.01036262, + "balance_loss_clip": 1.04129791, + "balance_loss_mlp": 1.02310169, + "epoch": 0.8054712159927853, + "flos": 19276201693440.0, + "grad_norm": 1.8572400108327656, + "language_loss": 0.77577806, + "learning_rate": 3.8389110162688353e-07, + "loss": 0.79729009, + "num_input_tokens_seen": 289218125, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.13165283, + "step": 13397, + "time_per_iteration": 2.4321060180664062 + }, + { + "auxiliary_loss_clip": 0.01119563, + "auxiliary_loss_mlp": 0.01025756, + "balance_loss_clip": 1.0463208, + "balance_loss_mlp": 1.01465201, + "epoch": 0.8055313392454532, + "flos": 17967617314560.0, + "grad_norm": 1.6459743634330006, + "language_loss": 0.70172065, + "learning_rate": 3.836616973531266e-07, + "loss": 0.72317386, + "num_input_tokens_seen": 289237115, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11108398, + "step": 13398, + "time_per_iteration": 2.407564878463745 + }, + { + "auxiliary_loss_clip": 0.01121126, + "auxiliary_loss_mlp": 0.01024778, + "balance_loss_clip": 1.04818308, + "balance_loss_mlp": 1.01426375, + "epoch": 0.8055914624981212, + "flos": 13478352635520.0, + "grad_norm": 2.841140190431025, + "language_loss": 0.68924659, + "learning_rate": 3.834323543710805e-07, + "loss": 0.71070564, + "num_input_tokens_seen": 289253635, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.10516357, + "step": 13399, + "time_per_iteration": 2.436173439025879 + }, + { + "auxiliary_loss_clip": 0.0111405, + "auxiliary_loss_mlp": 0.01030947, + "balance_loss_clip": 1.04285431, + "balance_loss_mlp": 1.01977158, + "epoch": 0.8056515857507891, + "flos": 13224787551360.0, + "grad_norm": 2.546614185570762, + "language_loss": 0.7239455, + "learning_rate": 3.8320307268944153e-07, + "loss": 0.74539542, + "num_input_tokens_seen": 289270085, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.11169434, + "step": 13400, + "time_per_iteration": 2.4960100650787354 + }, + { + "auxiliary_loss_clip": 0.01119211, + "auxiliary_loss_mlp": 0.01024998, + "balance_loss_clip": 1.04802084, + "balance_loss_mlp": 1.01336884, + "epoch": 0.8057117090034571, + "flos": 23878190229120.0, + "grad_norm": 1.8051863599622866, + "language_loss": 0.63860941, + "learning_rate": 3.829738523169037e-07, + "loss": 0.66005147, + "num_input_tokens_seen": 289289645, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.11608887, + "step": 13401, + "time_per_iteration": 2.4481616020202637 + }, + { + "auxiliary_loss_clip": 0.01116197, + "auxiliary_loss_mlp": 0.01028128, + "balance_loss_clip": 1.04288745, + "balance_loss_mlp": 1.01664257, + "epoch": 0.805771832256125, + "flos": 21214300855680.0, + "grad_norm": 2.07335085663783, + "language_loss": 0.83955586, + "learning_rate": 3.8274469326215985e-07, + "loss": 0.86099911, + "num_input_tokens_seen": 289306630, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11474609, + "step": 13402, + "time_per_iteration": 2.405625104904175 + }, + { + "auxiliary_loss_clip": 0.01122022, + "auxiliary_loss_mlp": 0.01028096, + "balance_loss_clip": 1.04835343, + "balance_loss_mlp": 1.01643133, + "epoch": 0.805831955508793, + "flos": 17566818382080.0, + "grad_norm": 1.9257785539172325, + "language_loss": 0.68105304, + "learning_rate": 3.8251559553389876e-07, + "loss": 0.70255423, + "num_input_tokens_seen": 289324960, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11669922, + "step": 13403, + "time_per_iteration": 2.4842123985290527 + }, + { + "auxiliary_loss_clip": 0.01121099, + "auxiliary_loss_mlp": 0.01033043, + "balance_loss_clip": 1.05088997, + "balance_loss_mlp": 1.02156866, + "epoch": 0.805892078761461, + "flos": 26907542530560.0, + "grad_norm": 1.6454131462854253, + "language_loss": 0.84984052, + "learning_rate": 3.822865591408084e-07, + "loss": 0.87138194, + "num_input_tokens_seen": 289344980, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.11474609, + "step": 13404, + "time_per_iteration": 2.4889211654663086 + }, + { + "auxiliary_loss_clip": 0.0111536, + "auxiliary_loss_mlp": 0.01032635, + "balance_loss_clip": 1.04490852, + "balance_loss_mlp": 1.02054691, + "epoch": 0.805952202014129, + "flos": 31506442496640.0, + "grad_norm": 2.2299201178374157, + "language_loss": 0.70442951, + "learning_rate": 3.820575840915743e-07, + "loss": 0.72590959, + "num_input_tokens_seen": 289367500, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.12091064, + "step": 13405, + "time_per_iteration": 3.9749341011047363 + }, + { + "auxiliary_loss_clip": 0.01110026, + "auxiliary_loss_mlp": 0.0103038, + "balance_loss_clip": 1.03895593, + "balance_loss_mlp": 1.01817274, + "epoch": 0.8060123252667969, + "flos": 24389953251840.0, + "grad_norm": 3.120302405034808, + "language_loss": 0.75119841, + "learning_rate": 3.818286703948788e-07, + "loss": 0.77260256, + "num_input_tokens_seen": 289385930, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.12200928, + "step": 13406, + "time_per_iteration": 2.5149848461151123 + }, + { + "auxiliary_loss_clip": 0.01115991, + "auxiliary_loss_mlp": 0.01034977, + "balance_loss_clip": 1.04197228, + "balance_loss_mlp": 1.02232313, + "epoch": 0.8060724485194649, + "flos": 23479941162240.0, + "grad_norm": 1.5635668136312015, + "language_loss": 0.76094174, + "learning_rate": 3.815998180594018e-07, + "loss": 0.78245151, + "num_input_tokens_seen": 289408025, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12646484, + "step": 13407, + "time_per_iteration": 2.5124690532684326 + }, + { + "auxiliary_loss_clip": 0.01108776, + "auxiliary_loss_mlp": 0.01035016, + "balance_loss_clip": 1.03749847, + "balance_loss_mlp": 1.02205765, + "epoch": 0.8061325717721328, + "flos": 18624495283200.0, + "grad_norm": 1.6930754524719727, + "language_loss": 0.73923969, + "learning_rate": 3.81371027093822e-07, + "loss": 0.76067758, + "num_input_tokens_seen": 289426575, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.12957764, + "step": 13408, + "time_per_iteration": 3.8222575187683105 + }, + { + "auxiliary_loss_clip": 0.0111416, + "auxiliary_loss_mlp": 0.01035148, + "balance_loss_clip": 1.04339719, + "balance_loss_mlp": 1.02214825, + "epoch": 0.8061926950248008, + "flos": 23582752865280.0, + "grad_norm": 2.0622371941113538, + "language_loss": 0.7083745, + "learning_rate": 3.8114229750681523e-07, + "loss": 0.72986758, + "num_input_tokens_seen": 289447760, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.13012695, + "step": 13409, + "time_per_iteration": 2.493417501449585 + }, + { + "auxiliary_loss_clip": 0.01113507, + "auxiliary_loss_mlp": 0.01031915, + "balance_loss_clip": 1.03847861, + "balance_loss_mlp": 1.01780653, + "epoch": 0.8062528182774689, + "flos": 11143333209600.0, + "grad_norm": 3.0178733463185385, + "language_loss": 0.77435076, + "learning_rate": 3.809136293070545e-07, + "loss": 0.79580498, + "num_input_tokens_seen": 289463920, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.14105225, + "step": 13410, + "time_per_iteration": 2.4059736728668213 + }, + { + "auxiliary_loss_clip": 0.01119389, + "auxiliary_loss_mlp": 0.01032689, + "balance_loss_clip": 1.04832292, + "balance_loss_mlp": 1.02109599, + "epoch": 0.8063129415301368, + "flos": 22346815743360.0, + "grad_norm": 1.8755080279097613, + "language_loss": 0.68788028, + "learning_rate": 3.806850225032117e-07, + "loss": 0.70940101, + "num_input_tokens_seen": 289482635, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11602783, + "step": 13411, + "time_per_iteration": 2.456033229827881 + }, + { + "auxiliary_loss_clip": 0.01112705, + "auxiliary_loss_mlp": 0.01027105, + "balance_loss_clip": 1.04274821, + "balance_loss_mlp": 1.01544619, + "epoch": 0.8063730647828048, + "flos": 23988400133760.0, + "grad_norm": 1.7305289883055397, + "language_loss": 0.68017846, + "learning_rate": 3.804564771039551e-07, + "loss": 0.70157653, + "num_input_tokens_seen": 289502040, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.11651611, + "step": 13412, + "time_per_iteration": 2.637322187423706 + }, + { + "auxiliary_loss_clip": 0.01123488, + "auxiliary_loss_mlp": 0.01036417, + "balance_loss_clip": 1.04658341, + "balance_loss_mlp": 1.0227077, + "epoch": 0.8064331880354727, + "flos": 21321494017920.0, + "grad_norm": 1.637745463162651, + "language_loss": 0.81607872, + "learning_rate": 3.8022799311795064e-07, + "loss": 0.83767778, + "num_input_tokens_seen": 289520740, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.137146, + "step": 13413, + "time_per_iteration": 2.4670445919036865 + }, + { + "auxiliary_loss_clip": 0.01116876, + "auxiliary_loss_mlp": 0.01032773, + "balance_loss_clip": 1.04621387, + "balance_loss_mlp": 1.01979136, + "epoch": 0.8064933112881407, + "flos": 19682890456320.0, + "grad_norm": 1.8750517088539977, + "language_loss": 0.8496539, + "learning_rate": 3.7999957055386303e-07, + "loss": 0.87115037, + "num_input_tokens_seen": 289535840, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.12994385, + "step": 13414, + "time_per_iteration": 2.4756460189819336 + }, + { + "auxiliary_loss_clip": 0.01110799, + "auxiliary_loss_mlp": 0.01034006, + "balance_loss_clip": 1.04053211, + "balance_loss_mlp": 1.02142286, + "epoch": 0.8065534345408086, + "flos": 19279721226240.0, + "grad_norm": 1.9262695254650744, + "language_loss": 0.66739118, + "learning_rate": 3.7977120942035467e-07, + "loss": 0.6888392, + "num_input_tokens_seen": 289555205, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.12579346, + "step": 13415, + "time_per_iteration": 2.423651933670044 + }, + { + "auxiliary_loss_clip": 0.01111621, + "auxiliary_loss_mlp": 0.01032134, + "balance_loss_clip": 1.04125786, + "balance_loss_mlp": 1.01928961, + "epoch": 0.8066135577934767, + "flos": 19677718897920.0, + "grad_norm": 1.5575006333088675, + "language_loss": 0.76783466, + "learning_rate": 3.7954290972608383e-07, + "loss": 0.78927219, + "num_input_tokens_seen": 289573000, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.12860107, + "step": 13416, + "time_per_iteration": 3.8528120517730713 + }, + { + "auxiliary_loss_clip": 0.01121464, + "auxiliary_loss_mlp": 0.01037756, + "balance_loss_clip": 1.04373074, + "balance_loss_mlp": 1.02580571, + "epoch": 0.8066736810461446, + "flos": 21143592933120.0, + "grad_norm": 1.452125694972457, + "language_loss": 0.65435958, + "learning_rate": 3.793146714797086e-07, + "loss": 0.67595178, + "num_input_tokens_seen": 289592625, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.11932373, + "step": 13417, + "time_per_iteration": 2.5398364067077637 + }, + { + "auxiliary_loss_clip": 0.01119294, + "auxiliary_loss_mlp": 0.01043596, + "balance_loss_clip": 1.04208875, + "balance_loss_mlp": 1.03172278, + "epoch": 0.8067338042988126, + "flos": 22598261925120.0, + "grad_norm": 1.8179059607113464, + "language_loss": 0.80594987, + "learning_rate": 3.7908649468988306e-07, + "loss": 0.82757878, + "num_input_tokens_seen": 289610780, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.11859131, + "step": 13418, + "time_per_iteration": 2.4801580905914307 + }, + { + "auxiliary_loss_clip": 0.01116876, + "auxiliary_loss_mlp": 0.01038167, + "balance_loss_clip": 1.04130483, + "balance_loss_mlp": 1.02348089, + "epoch": 0.8067939275514805, + "flos": 16508423208960.0, + "grad_norm": 2.709054572958763, + "language_loss": 0.8500464, + "learning_rate": 3.7885837936526066e-07, + "loss": 0.87159681, + "num_input_tokens_seen": 289628890, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.14703369, + "step": 13419, + "time_per_iteration": 2.471717119216919 + }, + { + "auxiliary_loss_clip": 0.01126234, + "auxiliary_loss_mlp": 0.0102845, + "balance_loss_clip": 1.05033731, + "balance_loss_mlp": 1.01654088, + "epoch": 0.8068540508041485, + "flos": 28541836460160.0, + "grad_norm": 1.8443728503699213, + "language_loss": 0.75747806, + "learning_rate": 3.7863032551449047e-07, + "loss": 0.77902496, + "num_input_tokens_seen": 289647220, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.11914062, + "step": 13420, + "time_per_iteration": 2.5310850143432617 + }, + { + "auxiliary_loss_clip": 0.01110029, + "auxiliary_loss_mlp": 0.01023795, + "balance_loss_clip": 1.0388757, + "balance_loss_mlp": 1.01341236, + "epoch": 0.8069141740568164, + "flos": 21652482867840.0, + "grad_norm": 1.9417294601735804, + "language_loss": 0.78521323, + "learning_rate": 3.784023331462207e-07, + "loss": 0.80655146, + "num_input_tokens_seen": 289665800, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.1038208, + "step": 13421, + "time_per_iteration": 2.4501678943634033 + }, + { + "auxiliary_loss_clip": 0.01118204, + "auxiliary_loss_mlp": 0.01025659, + "balance_loss_clip": 1.04415059, + "balance_loss_mlp": 1.01347017, + "epoch": 0.8069742973094844, + "flos": 17529327561600.0, + "grad_norm": 1.7512812973575356, + "language_loss": 0.80083179, + "learning_rate": 3.78174402269098e-07, + "loss": 0.82227039, + "num_input_tokens_seen": 289682705, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.12194824, + "step": 13422, + "time_per_iteration": 2.3879919052124023 + }, + { + "auxiliary_loss_clip": 0.01114348, + "auxiliary_loss_mlp": 0.0102884, + "balance_loss_clip": 1.04305172, + "balance_loss_mlp": 1.0175333, + "epoch": 0.8070344205621525, + "flos": 23367037737600.0, + "grad_norm": 1.8180072377829872, + "language_loss": 0.68327266, + "learning_rate": 3.7794653289176347e-07, + "loss": 0.70470452, + "num_input_tokens_seen": 289702920, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.11303711, + "step": 13423, + "time_per_iteration": 2.480196952819824 + }, + { + "auxiliary_loss_clip": 0.01120722, + "auxiliary_loss_mlp": 0.01031652, + "balance_loss_clip": 1.04452693, + "balance_loss_mlp": 1.01918268, + "epoch": 0.8070945438148204, + "flos": 22930184528640.0, + "grad_norm": 1.6719543388568388, + "language_loss": 0.80123234, + "learning_rate": 3.7771872502285904e-07, + "loss": 0.82275605, + "num_input_tokens_seen": 289723280, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12475586, + "step": 13424, + "time_per_iteration": 2.445216178894043 + }, + { + "auxiliary_loss_clip": 0.01119829, + "auxiliary_loss_mlp": 0.01028053, + "balance_loss_clip": 1.04468441, + "balance_loss_mlp": 1.01581657, + "epoch": 0.8071546670674884, + "flos": 25300683613440.0, + "grad_norm": 2.0961377724042793, + "language_loss": 0.78882205, + "learning_rate": 3.774909786710232e-07, + "loss": 0.81030083, + "num_input_tokens_seen": 289743475, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.12231445, + "step": 13425, + "time_per_iteration": 2.6520273685455322 + }, + { + "auxiliary_loss_clip": 0.01113956, + "auxiliary_loss_mlp": 0.01030627, + "balance_loss_clip": 1.04062605, + "balance_loss_mlp": 1.01908135, + "epoch": 0.8072147903201563, + "flos": 18113701927680.0, + "grad_norm": 2.9043743777719886, + "language_loss": 0.75507188, + "learning_rate": 3.772632938448923e-07, + "loss": 0.77651775, + "num_input_tokens_seen": 289761400, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11547852, + "step": 13426, + "time_per_iteration": 2.4465622901916504 + }, + { + "auxiliary_loss_clip": 0.01119285, + "auxiliary_loss_mlp": 0.01025287, + "balance_loss_clip": 1.04582989, + "balance_loss_mlp": 1.0135448, + "epoch": 0.8072749135728243, + "flos": 26688164215680.0, + "grad_norm": 2.026387050239671, + "language_loss": 0.73030996, + "learning_rate": 3.770356705530997e-07, + "loss": 0.75175571, + "num_input_tokens_seen": 289781025, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11761475, + "step": 13427, + "time_per_iteration": 2.4937708377838135 + }, + { + "auxiliary_loss_clip": 0.01119009, + "auxiliary_loss_mlp": 0.01035806, + "balance_loss_clip": 1.04536784, + "balance_loss_mlp": 1.0229193, + "epoch": 0.8073350368254922, + "flos": 19240291071360.0, + "grad_norm": 1.6708989937348036, + "language_loss": 0.70761871, + "learning_rate": 3.768081088042774e-07, + "loss": 0.72916687, + "num_input_tokens_seen": 289798380, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.12878418, + "step": 13428, + "time_per_iteration": 2.4147658348083496 + }, + { + "auxiliary_loss_clip": 0.01117184, + "auxiliary_loss_mlp": 0.01024043, + "balance_loss_clip": 1.04437709, + "balance_loss_mlp": 1.01341581, + "epoch": 0.8073951600781603, + "flos": 13334530579200.0, + "grad_norm": 1.9058908744122671, + "language_loss": 0.75069982, + "learning_rate": 3.765806086070544e-07, + "loss": 0.77211213, + "num_input_tokens_seen": 289814515, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.10632324, + "step": 13429, + "time_per_iteration": 2.4379003047943115 + }, + { + "auxiliary_loss_clip": 0.01117821, + "auxiliary_loss_mlp": 0.01027999, + "balance_loss_clip": 1.04580808, + "balance_loss_mlp": 1.01618516, + "epoch": 0.8074552833308282, + "flos": 22853191726080.0, + "grad_norm": 2.2695145675427195, + "language_loss": 0.67017627, + "learning_rate": 3.763531699700568e-07, + "loss": 0.69163448, + "num_input_tokens_seen": 289834315, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.1182251, + "step": 13430, + "time_per_iteration": 2.453080892562866 + }, + { + "auxiliary_loss_clip": 0.01117107, + "auxiliary_loss_mlp": 0.01024518, + "balance_loss_clip": 1.0452106, + "balance_loss_mlp": 1.01352692, + "epoch": 0.8075154065834962, + "flos": 20339409288960.0, + "grad_norm": 2.0395896452373754, + "language_loss": 0.80237573, + "learning_rate": 3.7612579290190994e-07, + "loss": 0.82379192, + "num_input_tokens_seen": 289853770, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.10992432, + "step": 13431, + "time_per_iteration": 3.9127197265625 + }, + { + "auxiliary_loss_clip": 0.01114972, + "auxiliary_loss_mlp": 0.01026774, + "balance_loss_clip": 1.04329658, + "balance_loss_mlp": 1.0141257, + "epoch": 0.8075755298361641, + "flos": 21908059113600.0, + "grad_norm": 2.151535420095041, + "language_loss": 0.80650234, + "learning_rate": 3.7589847741123593e-07, + "loss": 0.82791978, + "num_input_tokens_seen": 289870480, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.12646484, + "step": 13432, + "time_per_iteration": 2.436192035675049 + }, + { + "auxiliary_loss_clip": 0.01122708, + "auxiliary_loss_mlp": 0.0103188, + "balance_loss_clip": 1.04683578, + "balance_loss_mlp": 1.01975644, + "epoch": 0.8076356530888321, + "flos": 15669298609920.0, + "grad_norm": 1.88022712943773, + "language_loss": 0.69879961, + "learning_rate": 3.7567122350665415e-07, + "loss": 0.7203455, + "num_input_tokens_seen": 289888275, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12133789, + "step": 13433, + "time_per_iteration": 2.4599509239196777 + }, + { + "auxiliary_loss_clip": 0.01117064, + "auxiliary_loss_mlp": 0.01025018, + "balance_loss_clip": 1.04584646, + "balance_loss_mlp": 1.01394987, + "epoch": 0.8076957763415, + "flos": 37777414521600.0, + "grad_norm": 1.5115789534443655, + "language_loss": 0.72179699, + "learning_rate": 3.754440311967828e-07, + "loss": 0.74321783, + "num_input_tokens_seen": 289911495, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.11065674, + "step": 13434, + "time_per_iteration": 2.5938100814819336 + }, + { + "auxiliary_loss_clip": 0.01112281, + "auxiliary_loss_mlp": 0.01025448, + "balance_loss_clip": 1.04213834, + "balance_loss_mlp": 1.01445663, + "epoch": 0.807755899594168, + "flos": 19610781903360.0, + "grad_norm": 1.9065711161717374, + "language_loss": 0.68138933, + "learning_rate": 3.752169004902361e-07, + "loss": 0.70276666, + "num_input_tokens_seen": 289930045, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.10992432, + "step": 13435, + "time_per_iteration": 2.4369819164276123 + }, + { + "auxiliary_loss_clip": 0.01125144, + "auxiliary_loss_mlp": 0.01039748, + "balance_loss_clip": 1.04604745, + "balance_loss_mlp": 1.02488291, + "epoch": 0.8078160228468361, + "flos": 23294893271040.0, + "grad_norm": 1.5804917308232005, + "language_loss": 0.75296825, + "learning_rate": 3.749898313956279e-07, + "loss": 0.77461714, + "num_input_tokens_seen": 289950815, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.1484375, + "step": 13436, + "time_per_iteration": 2.4462525844573975 + }, + { + "auxiliary_loss_clip": 0.01112203, + "auxiliary_loss_mlp": 0.01026923, + "balance_loss_clip": 1.04086983, + "balance_loss_mlp": 1.01518083, + "epoch": 0.807876146099504, + "flos": 27162651899520.0, + "grad_norm": 1.7133100477591532, + "language_loss": 0.70278859, + "learning_rate": 3.747628239215674e-07, + "loss": 0.72417986, + "num_input_tokens_seen": 289971730, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.11737061, + "step": 13437, + "time_per_iteration": 2.469287395477295 + }, + { + "auxiliary_loss_clip": 0.01117083, + "auxiliary_loss_mlp": 0.01033168, + "balance_loss_clip": 1.04339719, + "balance_loss_mlp": 1.02204537, + "epoch": 0.807936269352172, + "flos": 27160030206720.0, + "grad_norm": 1.6854031535350849, + "language_loss": 0.72932708, + "learning_rate": 3.745358780766636e-07, + "loss": 0.75082958, + "num_input_tokens_seen": 289992995, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11132812, + "step": 13438, + "time_per_iteration": 2.510556697845459 + }, + { + "auxiliary_loss_clip": 0.01116691, + "auxiliary_loss_mlp": 0.01028304, + "balance_loss_clip": 1.04598284, + "balance_loss_mlp": 1.0169251, + "epoch": 0.8079963926048399, + "flos": 20740423703040.0, + "grad_norm": 1.910789056772061, + "language_loss": 0.77298284, + "learning_rate": 3.7430899386952344e-07, + "loss": 0.79443282, + "num_input_tokens_seen": 290009405, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.11376953, + "step": 13439, + "time_per_iteration": 2.5453789234161377 + }, + { + "auxiliary_loss_clip": 0.01116312, + "auxiliary_loss_mlp": 0.01029654, + "balance_loss_clip": 1.04441285, + "balance_loss_mlp": 1.01813269, + "epoch": 0.8080565158575079, + "flos": 25009663622400.0, + "grad_norm": 1.563383430812445, + "language_loss": 0.78789175, + "learning_rate": 3.7408217130874786e-07, + "loss": 0.80935144, + "num_input_tokens_seen": 290031085, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11529541, + "step": 13440, + "time_per_iteration": 2.4909324645996094 + }, + { + "auxiliary_loss_clip": 0.01120296, + "auxiliary_loss_mlp": 0.01028549, + "balance_loss_clip": 1.04533577, + "balance_loss_mlp": 1.0162226, + "epoch": 0.8081166391101758, + "flos": 18698076293760.0, + "grad_norm": 1.8988916915883276, + "language_loss": 0.59076166, + "learning_rate": 3.7385541040293946e-07, + "loss": 0.61225015, + "num_input_tokens_seen": 290048670, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12316895, + "step": 13441, + "time_per_iteration": 2.4320905208587646 + }, + { + "auxiliary_loss_clip": 0.011171, + "auxiliary_loss_mlp": 0.01029221, + "balance_loss_clip": 1.04345191, + "balance_loss_mlp": 1.01721644, + "epoch": 0.8081767623628439, + "flos": 19828651847040.0, + "grad_norm": 15.17855130856064, + "language_loss": 0.76084954, + "learning_rate": 3.7362871116069684e-07, + "loss": 0.78231275, + "num_input_tokens_seen": 290064085, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11999512, + "step": 13442, + "time_per_iteration": 2.408660888671875 + }, + { + "auxiliary_loss_clip": 0.01113711, + "auxiliary_loss_mlp": 0.01030398, + "balance_loss_clip": 1.04072046, + "balance_loss_mlp": 1.01887012, + "epoch": 0.8082368856155118, + "flos": 35772952982400.0, + "grad_norm": 1.4131304037325618, + "language_loss": 0.70794821, + "learning_rate": 3.734020735906169e-07, + "loss": 0.72938925, + "num_input_tokens_seen": 290086255, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11523438, + "step": 13443, + "time_per_iteration": 2.7180516719818115 + }, + { + "auxiliary_loss_clip": 0.01116732, + "auxiliary_loss_mlp": 0.01035861, + "balance_loss_clip": 1.04536533, + "balance_loss_mlp": 1.02463162, + "epoch": 0.8082970088681798, + "flos": 17198015489280.0, + "grad_norm": 1.7757024714092153, + "language_loss": 0.82482088, + "learning_rate": 3.7317549770129286e-07, + "loss": 0.8463468, + "num_input_tokens_seen": 290103995, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.11242676, + "step": 13444, + "time_per_iteration": 2.409677028656006 + }, + { + "auxiliary_loss_clip": 0.01064761, + "auxiliary_loss_mlp": 0.01005791, + "balance_loss_clip": 1.04001904, + "balance_loss_mlp": 1.00425434, + "epoch": 0.8083571321208477, + "flos": 63555207511680.0, + "grad_norm": 0.8478449563940226, + "language_loss": 0.53659022, + "learning_rate": 3.7294898350131754e-07, + "loss": 0.55729568, + "num_input_tokens_seen": 290157245, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.01535034, + "step": 13445, + "time_per_iteration": 2.932462692260742 + }, + { + "auxiliary_loss_clip": 0.01118344, + "auxiliary_loss_mlp": 0.01030407, + "balance_loss_clip": 1.04445195, + "balance_loss_mlp": 1.0174849, + "epoch": 0.8084172553735157, + "flos": 17930701111680.0, + "grad_norm": 4.561510903611339, + "language_loss": 0.72430432, + "learning_rate": 3.7272253099927964e-07, + "loss": 0.74579179, + "num_input_tokens_seen": 290174970, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.12915039, + "step": 13446, + "time_per_iteration": 2.409959077835083 + }, + { + "auxiliary_loss_clip": 0.01123403, + "auxiliary_loss_mlp": 0.01033186, + "balance_loss_clip": 1.04694271, + "balance_loss_mlp": 1.02043676, + "epoch": 0.8084773786261836, + "flos": 24097999507200.0, + "grad_norm": 2.372224721467911, + "language_loss": 0.71277392, + "learning_rate": 3.7249614020376606e-07, + "loss": 0.73433983, + "num_input_tokens_seen": 290194395, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.12738037, + "step": 13447, + "time_per_iteration": 2.453484296798706 + }, + { + "auxiliary_loss_clip": 0.01120824, + "auxiliary_loss_mlp": 0.01034307, + "balance_loss_clip": 1.04607868, + "balance_loss_mlp": 1.02100897, + "epoch": 0.8085375018788516, + "flos": 15588211656960.0, + "grad_norm": 2.2421585434186238, + "language_loss": 0.7516011, + "learning_rate": 3.7226981112336197e-07, + "loss": 0.77315235, + "num_input_tokens_seen": 290209200, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.13299561, + "step": 13448, + "time_per_iteration": 3.888963460922241 + }, + { + "auxiliary_loss_clip": 0.01041232, + "auxiliary_loss_mlp": 0.01004555, + "balance_loss_clip": 1.01557875, + "balance_loss_mlp": 1.00317383, + "epoch": 0.8085976251315197, + "flos": 67561296393600.0, + "grad_norm": 0.7345515316140798, + "language_loss": 0.63901716, + "learning_rate": 3.7204354376665024e-07, + "loss": 0.65947497, + "num_input_tokens_seen": 290274565, + "router_z_loss_clip": 0.25634766, + "router_z_loss_mlp": 0.01382446, + "step": 13449, + "time_per_iteration": 3.1028740406036377 + }, + { + "auxiliary_loss_clip": 0.01119203, + "auxiliary_loss_mlp": 0.01026523, + "balance_loss_clip": 1.04723442, + "balance_loss_mlp": 1.01447093, + "epoch": 0.8086577483841876, + "flos": 22561453463040.0, + "grad_norm": 2.2531409666900566, + "language_loss": 0.74122316, + "learning_rate": 3.718173381422105e-07, + "loss": 0.76268041, + "num_input_tokens_seen": 290293630, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.1206665, + "step": 13450, + "time_per_iteration": 2.4905426502227783 + }, + { + "auxiliary_loss_clip": 0.01123702, + "auxiliary_loss_mlp": 0.01028534, + "balance_loss_clip": 1.04828203, + "balance_loss_mlp": 1.017048, + "epoch": 0.8087178716368556, + "flos": 17968084191360.0, + "grad_norm": 1.6154684630067586, + "language_loss": 0.73914623, + "learning_rate": 3.7159119425861986e-07, + "loss": 0.76066864, + "num_input_tokens_seen": 290311450, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.1149292, + "step": 13451, + "time_per_iteration": 2.4444773197174072 + }, + { + "auxiliary_loss_clip": 0.01114675, + "auxiliary_loss_mlp": 0.01029393, + "balance_loss_clip": 1.03993702, + "balance_loss_mlp": 1.01619053, + "epoch": 0.8087779948895235, + "flos": 21719527603200.0, + "grad_norm": 1.870170397895457, + "language_loss": 0.8030625, + "learning_rate": 3.713651121244543e-07, + "loss": 0.82450318, + "num_input_tokens_seen": 290330165, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.13208008, + "step": 13452, + "time_per_iteration": 3.9858853816986084 + }, + { + "auxiliary_loss_clip": 0.01122413, + "auxiliary_loss_mlp": 0.01033161, + "balance_loss_clip": 1.04932904, + "balance_loss_mlp": 1.02197945, + "epoch": 0.8088381181421915, + "flos": 29092885983360.0, + "grad_norm": 1.7306505182587202, + "language_loss": 0.78371692, + "learning_rate": 3.711390917482875e-07, + "loss": 0.80527258, + "num_input_tokens_seen": 290350815, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11181641, + "step": 13453, + "time_per_iteration": 2.521350860595703 + }, + { + "auxiliary_loss_clip": 0.01111157, + "auxiliary_loss_mlp": 0.01028133, + "balance_loss_clip": 1.03956199, + "balance_loss_mlp": 1.01620626, + "epoch": 0.8088982413948594, + "flos": 22198432659840.0, + "grad_norm": 2.590425681768146, + "language_loss": 0.77409828, + "learning_rate": 3.709131331386892e-07, + "loss": 0.79549122, + "num_input_tokens_seen": 290367380, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11932373, + "step": 13454, + "time_per_iteration": 2.457237720489502 + }, + { + "auxiliary_loss_clip": 0.01122483, + "auxiliary_loss_mlp": 0.0103293, + "balance_loss_clip": 1.05081916, + "balance_loss_mlp": 1.01991856, + "epoch": 0.8089583646475275, + "flos": 28036717453440.0, + "grad_norm": 1.780262096229046, + "language_loss": 0.76564157, + "learning_rate": 3.7068723630422795e-07, + "loss": 0.78719568, + "num_input_tokens_seen": 290387965, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.13018799, + "step": 13455, + "time_per_iteration": 2.5747625827789307 + }, + { + "auxiliary_loss_clip": 0.01120997, + "auxiliary_loss_mlp": 0.01028352, + "balance_loss_clip": 1.04914117, + "balance_loss_mlp": 1.01693201, + "epoch": 0.8090184879001954, + "flos": 16617735273600.0, + "grad_norm": 2.8634636760254675, + "language_loss": 0.79119486, + "learning_rate": 3.70461401253471e-07, + "loss": 0.81268835, + "num_input_tokens_seen": 290404150, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11413574, + "step": 13456, + "time_per_iteration": 2.427152633666992 + }, + { + "auxiliary_loss_clip": 0.01119345, + "auxiliary_loss_mlp": 0.01031701, + "balance_loss_clip": 1.04726279, + "balance_loss_mlp": 1.02011418, + "epoch": 0.8090786111528634, + "flos": 27340804379520.0, + "grad_norm": 1.8307896512798247, + "language_loss": 0.71069813, + "learning_rate": 3.702356279949801e-07, + "loss": 0.73220861, + "num_input_tokens_seen": 290422370, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11590576, + "step": 13457, + "time_per_iteration": 2.551276683807373 + }, + { + "auxiliary_loss_clip": 0.01122614, + "auxiliary_loss_mlp": 0.01031495, + "balance_loss_clip": 1.05117369, + "balance_loss_mlp": 1.02051616, + "epoch": 0.8091387344055313, + "flos": 21105742976640.0, + "grad_norm": 2.4837006796465113, + "language_loss": 0.73175627, + "learning_rate": 3.700099165373176e-07, + "loss": 0.75329733, + "num_input_tokens_seen": 290442645, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.10980225, + "step": 13458, + "time_per_iteration": 2.4489731788635254 + }, + { + "auxiliary_loss_clip": 0.01117517, + "auxiliary_loss_mlp": 0.01035567, + "balance_loss_clip": 1.04575253, + "balance_loss_mlp": 1.02234697, + "epoch": 0.8091988576581993, + "flos": 11655060318720.0, + "grad_norm": 3.157234997574924, + "language_loss": 0.78864163, + "learning_rate": 3.6978426688904275e-07, + "loss": 0.8101725, + "num_input_tokens_seen": 290458520, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.13226318, + "step": 13459, + "time_per_iteration": 2.4305009841918945 + }, + { + "auxiliary_loss_clip": 0.01114986, + "auxiliary_loss_mlp": 0.01026975, + "balance_loss_clip": 1.04160404, + "balance_loss_mlp": 1.0149827, + "epoch": 0.8092589809108672, + "flos": 22963329803520.0, + "grad_norm": 1.836156167375992, + "language_loss": 0.80003572, + "learning_rate": 3.695586790587113e-07, + "loss": 0.82145524, + "num_input_tokens_seen": 290474465, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11993408, + "step": 13460, + "time_per_iteration": 3.935328245162964 + }, + { + "auxiliary_loss_clip": 0.01117727, + "auxiliary_loss_mlp": 0.01030985, + "balance_loss_clip": 1.0438925, + "balance_loss_mlp": 1.01830077, + "epoch": 0.8093191041635353, + "flos": 13260985482240.0, + "grad_norm": 1.8088886756316582, + "language_loss": 0.84618133, + "learning_rate": 3.693331530548789e-07, + "loss": 0.86766839, + "num_input_tokens_seen": 290492060, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.12689209, + "step": 13461, + "time_per_iteration": 2.4591684341430664 + }, + { + "auxiliary_loss_clip": 0.01117369, + "auxiliary_loss_mlp": 0.01035204, + "balance_loss_clip": 1.04243493, + "balance_loss_mlp": 1.02287138, + "epoch": 0.8093792274162032, + "flos": 25516003691520.0, + "grad_norm": 1.814812572142153, + "language_loss": 0.762802, + "learning_rate": 3.69107688886096e-07, + "loss": 0.78432775, + "num_input_tokens_seen": 290511510, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12329102, + "step": 13462, + "time_per_iteration": 2.4967596530914307 + }, + { + "auxiliary_loss_clip": 0.01118585, + "auxiliary_loss_mlp": 0.01034917, + "balance_loss_clip": 1.04563117, + "balance_loss_mlp": 1.02166724, + "epoch": 0.8094393506688712, + "flos": 23546483107200.0, + "grad_norm": 1.5674342071216583, + "language_loss": 0.82826251, + "learning_rate": 3.6888228656091357e-07, + "loss": 0.84979749, + "num_input_tokens_seen": 290530035, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.13256836, + "step": 13463, + "time_per_iteration": 2.471383571624756 + }, + { + "auxiliary_loss_clip": 0.01112995, + "auxiliary_loss_mlp": 0.01032772, + "balance_loss_clip": 1.04029763, + "balance_loss_mlp": 1.02175689, + "epoch": 0.8094994739215392, + "flos": 17055917285760.0, + "grad_norm": 2.2613991254228663, + "language_loss": 0.62359631, + "learning_rate": 3.686569460878779e-07, + "loss": 0.64505398, + "num_input_tokens_seen": 290548245, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11022949, + "step": 13464, + "time_per_iteration": 2.4103353023529053 + }, + { + "auxiliary_loss_clip": 0.0111018, + "auxiliary_loss_mlp": 0.01026619, + "balance_loss_clip": 1.04055655, + "balance_loss_mlp": 1.01608729, + "epoch": 0.8095595971742071, + "flos": 23551223702400.0, + "grad_norm": 1.6166054296782213, + "language_loss": 0.61788797, + "learning_rate": 3.684316674755341e-07, + "loss": 0.639256, + "num_input_tokens_seen": 290568625, + "router_z_loss_clip": 0.69580078, + "router_z_loss_mlp": 0.10534668, + "step": 13465, + "time_per_iteration": 2.584688186645508 + }, + { + "auxiliary_loss_clip": 0.01122552, + "auxiliary_loss_mlp": 0.01032967, + "balance_loss_clip": 1.0498507, + "balance_loss_mlp": 1.02166653, + "epoch": 0.8096197204268751, + "flos": 20373201008640.0, + "grad_norm": 1.691971288263805, + "language_loss": 0.822914, + "learning_rate": 3.682064507324256e-07, + "loss": 0.84446919, + "num_input_tokens_seen": 290586575, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11303711, + "step": 13466, + "time_per_iteration": 2.4116766452789307 + }, + { + "auxiliary_loss_clip": 0.01121119, + "auxiliary_loss_mlp": 0.01038377, + "balance_loss_clip": 1.04542339, + "balance_loss_mlp": 1.02655697, + "epoch": 0.809679843679543, + "flos": 27818775682560.0, + "grad_norm": 3.0150353643546417, + "language_loss": 0.76051766, + "learning_rate": 3.6798129586709204e-07, + "loss": 0.7821126, + "num_input_tokens_seen": 290606790, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.1182251, + "step": 13467, + "time_per_iteration": 2.5743088722229004 + }, + { + "auxiliary_loss_clip": 0.01110889, + "auxiliary_loss_mlp": 0.0102792, + "balance_loss_clip": 1.0384686, + "balance_loss_mlp": 1.01612473, + "epoch": 0.8097399669322111, + "flos": 22014103040640.0, + "grad_norm": 2.0384525263743605, + "language_loss": 0.79106677, + "learning_rate": 3.6775620288807073e-07, + "loss": 0.81245482, + "num_input_tokens_seen": 290625525, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11804199, + "step": 13468, + "time_per_iteration": 2.481154441833496 + }, + { + "auxiliary_loss_clip": 0.01120577, + "auxiliary_loss_mlp": 0.0102816, + "balance_loss_clip": 1.04861104, + "balance_loss_mlp": 1.01703811, + "epoch": 0.809800090184879, + "flos": 18988988544000.0, + "grad_norm": 1.6790461543720714, + "language_loss": 0.67536861, + "learning_rate": 3.675311718038978e-07, + "loss": 0.69685602, + "num_input_tokens_seen": 290644935, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11132812, + "step": 13469, + "time_per_iteration": 2.411423921585083 + }, + { + "auxiliary_loss_clip": 0.01053516, + "auxiliary_loss_mlp": 0.01004194, + "balance_loss_clip": 1.02647316, + "balance_loss_mlp": 1.0027647, + "epoch": 0.809860213437547, + "flos": 66099516508800.0, + "grad_norm": 0.721157260220426, + "language_loss": 0.54708308, + "learning_rate": 3.6730620262310683e-07, + "loss": 0.56766015, + "num_input_tokens_seen": 290710735, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.01428223, + "step": 13470, + "time_per_iteration": 3.1232032775878906 + }, + { + "auxiliary_loss_clip": 0.01112593, + "auxiliary_loss_mlp": 0.01027133, + "balance_loss_clip": 1.04173088, + "balance_loss_mlp": 1.01622522, + "epoch": 0.8099203366902149, + "flos": 20882485992960.0, + "grad_norm": 1.9704328843235535, + "language_loss": 0.69718468, + "learning_rate": 3.670812953542279e-07, + "loss": 0.71858191, + "num_input_tokens_seen": 290729565, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.10906982, + "step": 13471, + "time_per_iteration": 2.449355125427246 + }, + { + "auxiliary_loss_clip": 0.01114147, + "auxiliary_loss_mlp": 0.01024497, + "balance_loss_clip": 1.04298615, + "balance_loss_mlp": 1.0131247, + "epoch": 0.8099804599428829, + "flos": 26030927111040.0, + "grad_norm": 2.0020842092942637, + "language_loss": 0.80166209, + "learning_rate": 3.6685645000579003e-07, + "loss": 0.82304859, + "num_input_tokens_seen": 290749360, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.1137085, + "step": 13472, + "time_per_iteration": 2.5064303874969482 + }, + { + "auxiliary_loss_clip": 0.01043482, + "auxiliary_loss_mlp": 0.0100708, + "balance_loss_clip": 1.0193578, + "balance_loss_mlp": 1.00585842, + "epoch": 0.8100405831955508, + "flos": 69303573584640.0, + "grad_norm": 0.7471357257835337, + "language_loss": 0.57787359, + "learning_rate": 3.666316665863201e-07, + "loss": 0.59837919, + "num_input_tokens_seen": 290812145, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.01220703, + "step": 13473, + "time_per_iteration": 3.0898947715759277 + }, + { + "auxiliary_loss_clip": 0.01110717, + "auxiliary_loss_mlp": 0.01025566, + "balance_loss_clip": 1.03838408, + "balance_loss_mlp": 1.01321554, + "epoch": 0.8101007064482189, + "flos": 15012492468480.0, + "grad_norm": 2.031506342700455, + "language_loss": 0.74450064, + "learning_rate": 3.664069451043399e-07, + "loss": 0.76586342, + "num_input_tokens_seen": 290829845, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.12347412, + "step": 13474, + "time_per_iteration": 2.451681137084961 + }, + { + "auxiliary_loss_clip": 0.01120707, + "auxiliary_loss_mlp": 0.01031971, + "balance_loss_clip": 1.04644716, + "balance_loss_mlp": 1.02044964, + "epoch": 0.8101608297008868, + "flos": 21067210661760.0, + "grad_norm": 1.8460507253633112, + "language_loss": 0.78686726, + "learning_rate": 3.661822855683723e-07, + "loss": 0.80839407, + "num_input_tokens_seen": 290848815, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11517334, + "step": 13475, + "time_per_iteration": 3.9329993724823 + }, + { + "auxiliary_loss_clip": 0.011182, + "auxiliary_loss_mlp": 0.01031628, + "balance_loss_clip": 1.04781055, + "balance_loss_mlp": 1.02036285, + "epoch": 0.8102209529535548, + "flos": 23731279603200.0, + "grad_norm": 1.6351507562669103, + "language_loss": 0.75362444, + "learning_rate": 3.659576879869364e-07, + "loss": 0.77512276, + "num_input_tokens_seen": 290868580, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.11273193, + "step": 13476, + "time_per_iteration": 2.452953815460205 + }, + { + "auxiliary_loss_clip": 0.0111699, + "auxiliary_loss_mlp": 0.01048834, + "balance_loss_clip": 1.04221535, + "balance_loss_mlp": 1.03444529, + "epoch": 0.8102810762062228, + "flos": 10955879107200.0, + "grad_norm": 3.9628342625429367, + "language_loss": 0.73875356, + "learning_rate": 3.657331523685485e-07, + "loss": 0.76041174, + "num_input_tokens_seen": 290883540, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.14404297, + "step": 13477, + "time_per_iteration": 2.3870184421539307 + }, + { + "auxiliary_loss_clip": 0.01112396, + "auxiliary_loss_mlp": 0.01029698, + "balance_loss_clip": 1.03960657, + "balance_loss_mlp": 1.01834309, + "epoch": 0.8103411994588907, + "flos": 14648825220480.0, + "grad_norm": 1.9393079781738136, + "language_loss": 0.70321476, + "learning_rate": 3.6550867872172365e-07, + "loss": 0.72463572, + "num_input_tokens_seen": 290901560, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11352539, + "step": 13478, + "time_per_iteration": 2.5233168601989746 + }, + { + "auxiliary_loss_clip": 0.01054218, + "auxiliary_loss_mlp": 0.01003346, + "balance_loss_clip": 1.02946055, + "balance_loss_mlp": 1.0021317, + "epoch": 0.8104013227115587, + "flos": 59153314665600.0, + "grad_norm": 0.6824295701536202, + "language_loss": 0.52204072, + "learning_rate": 3.6528426705497293e-07, + "loss": 0.54261637, + "num_input_tokens_seen": 290959185, + "router_z_loss_clip": 0.24780273, + "router_z_loss_mlp": 0.012146, + "step": 13479, + "time_per_iteration": 3.0054352283477783 + }, + { + "auxiliary_loss_clip": 0.01115851, + "auxiliary_loss_mlp": 0.01030381, + "balance_loss_clip": 1.04309845, + "balance_loss_mlp": 1.01864529, + "epoch": 0.8104614459642266, + "flos": 19828687760640.0, + "grad_norm": 1.5948200082360549, + "language_loss": 0.71426332, + "learning_rate": 3.650599173768072e-07, + "loss": 0.73572564, + "num_input_tokens_seen": 290979585, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11724854, + "step": 13480, + "time_per_iteration": 2.4694933891296387 + }, + { + "auxiliary_loss_clip": 0.01114508, + "auxiliary_loss_mlp": 0.01031005, + "balance_loss_clip": 1.04218125, + "balance_loss_mlp": 1.01975155, + "epoch": 0.8105215692168947, + "flos": 25374264624000.0, + "grad_norm": 1.813613845986909, + "language_loss": 0.80103099, + "learning_rate": 3.648356296957327e-07, + "loss": 0.82248616, + "num_input_tokens_seen": 291000865, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.1126709, + "step": 13481, + "time_per_iteration": 2.46077299118042 + }, + { + "auxiliary_loss_clip": 0.01118861, + "auxiliary_loss_mlp": 0.01030324, + "balance_loss_clip": 1.04593647, + "balance_loss_mlp": 1.01874304, + "epoch": 0.8105816924695626, + "flos": 20481722974080.0, + "grad_norm": 1.736706364602222, + "language_loss": 0.72610462, + "learning_rate": 3.646114040202548e-07, + "loss": 0.74759644, + "num_input_tokens_seen": 291018285, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11584473, + "step": 13482, + "time_per_iteration": 2.4448695182800293 + }, + { + "auxiliary_loss_clip": 0.01113707, + "auxiliary_loss_mlp": 0.01027167, + "balance_loss_clip": 1.03996468, + "balance_loss_mlp": 1.01525807, + "epoch": 0.8106418157222306, + "flos": 14538687143040.0, + "grad_norm": 2.0040685256578765, + "language_loss": 0.65361804, + "learning_rate": 3.6438724035887705e-07, + "loss": 0.67502677, + "num_input_tokens_seen": 291035745, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11914062, + "step": 13483, + "time_per_iteration": 2.3790078163146973 + }, + { + "auxiliary_loss_clip": 0.01112978, + "auxiliary_loss_mlp": 0.01025753, + "balance_loss_clip": 1.04099464, + "balance_loss_mlp": 1.01387358, + "epoch": 0.8107019389748985, + "flos": 22564470205440.0, + "grad_norm": 1.6134586508705728, + "language_loss": 0.76308608, + "learning_rate": 3.641631387200992e-07, + "loss": 0.78447342, + "num_input_tokens_seen": 291053280, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11883545, + "step": 13484, + "time_per_iteration": 2.4580295085906982 + }, + { + "auxiliary_loss_clip": 0.01118888, + "auxiliary_loss_mlp": 0.01031924, + "balance_loss_clip": 1.04251695, + "balance_loss_mlp": 1.01934767, + "epoch": 0.8107620622275665, + "flos": 19609560840960.0, + "grad_norm": 1.491866900304948, + "language_loss": 0.72473389, + "learning_rate": 3.639390991124183e-07, + "loss": 0.74624205, + "num_input_tokens_seen": 291072855, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.12591553, + "step": 13485, + "time_per_iteration": 2.4513680934906006 + }, + { + "auxiliary_loss_clip": 0.01106792, + "auxiliary_loss_mlp": 0.01025182, + "balance_loss_clip": 1.03770638, + "balance_loss_mlp": 1.0142926, + "epoch": 0.8108221854802344, + "flos": 16143498984960.0, + "grad_norm": 1.9651993041559654, + "language_loss": 0.75971162, + "learning_rate": 3.637151215443308e-07, + "loss": 0.78103131, + "num_input_tokens_seen": 291090285, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 0.10888672, + "step": 13486, + "time_per_iteration": 2.4244041442871094 + }, + { + "auxiliary_loss_clip": 0.01119013, + "auxiliary_loss_mlp": 0.01034927, + "balance_loss_clip": 1.04358876, + "balance_loss_mlp": 1.02174854, + "epoch": 0.8108823087329025, + "flos": 21106209853440.0, + "grad_norm": 3.7733150941561426, + "language_loss": 0.72447038, + "learning_rate": 3.6349120602433045e-07, + "loss": 0.74600983, + "num_input_tokens_seen": 291107675, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.1315918, + "step": 13487, + "time_per_iteration": 2.4534811973571777 + }, + { + "auxiliary_loss_clip": 0.0110873, + "auxiliary_loss_mlp": 0.01029408, + "balance_loss_clip": 1.04000807, + "balance_loss_mlp": 1.01785707, + "epoch": 0.8109424319855704, + "flos": 29199648182400.0, + "grad_norm": 1.6334563855449777, + "language_loss": 0.843786, + "learning_rate": 3.6326735256090715e-07, + "loss": 0.86516738, + "num_input_tokens_seen": 291126900, + "router_z_loss_clip": 0.68701172, + "router_z_loss_mlp": 0.11553955, + "step": 13488, + "time_per_iteration": 2.6590771675109863 + }, + { + "auxiliary_loss_clip": 0.01120109, + "auxiliary_loss_mlp": 0.01028783, + "balance_loss_clip": 1.04662824, + "balance_loss_mlp": 1.01689768, + "epoch": 0.8110025552382384, + "flos": 23111856541440.0, + "grad_norm": 1.9020893206760379, + "language_loss": 0.73688805, + "learning_rate": 3.630435611625502e-07, + "loss": 0.75837696, + "num_input_tokens_seen": 291145285, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11877441, + "step": 13489, + "time_per_iteration": 2.5836374759674072 + }, + { + "auxiliary_loss_clip": 0.01116614, + "auxiliary_loss_mlp": 0.0103253, + "balance_loss_clip": 1.04721916, + "balance_loss_mlp": 1.02114546, + "epoch": 0.8110626784909064, + "flos": 22379961018240.0, + "grad_norm": 1.7124633968801377, + "language_loss": 0.71995163, + "learning_rate": 3.628198318377453e-07, + "loss": 0.74144304, + "num_input_tokens_seen": 291163485, + "router_z_loss_clip": 0.69335938, + "router_z_loss_mlp": 0.1138916, + "step": 13490, + "time_per_iteration": 2.5278055667877197 + }, + { + "auxiliary_loss_clip": 0.01112166, + "auxiliary_loss_mlp": 0.01039435, + "balance_loss_clip": 1.03957415, + "balance_loss_mlp": 1.02694201, + "epoch": 0.8111228017435743, + "flos": 23368043318400.0, + "grad_norm": 3.0500009580673675, + "language_loss": 0.71702325, + "learning_rate": 3.625961645949762e-07, + "loss": 0.73853934, + "num_input_tokens_seen": 291182215, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.12493896, + "step": 13491, + "time_per_iteration": 3.8773865699768066 + }, + { + "auxiliary_loss_clip": 0.0110927, + "auxiliary_loss_mlp": 0.0103054, + "balance_loss_clip": 1.03725743, + "balance_loss_mlp": 1.01928091, + "epoch": 0.8111829249962423, + "flos": 21286553063040.0, + "grad_norm": 1.4513728058493045, + "language_loss": 0.6774615, + "learning_rate": 3.623725594427245e-07, + "loss": 0.69885957, + "num_input_tokens_seen": 291203145, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.11260986, + "step": 13492, + "time_per_iteration": 2.443735122680664 + }, + { + "auxiliary_loss_clip": 0.01110746, + "auxiliary_loss_mlp": 0.01029421, + "balance_loss_clip": 1.03788686, + "balance_loss_mlp": 1.01698744, + "epoch": 0.8112430482489102, + "flos": 22345558767360.0, + "grad_norm": 1.6890192731331193, + "language_loss": 0.72075069, + "learning_rate": 3.6214901638947006e-07, + "loss": 0.74215239, + "num_input_tokens_seen": 291220600, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.12451172, + "step": 13493, + "time_per_iteration": 2.4651970863342285 + }, + { + "auxiliary_loss_clip": 0.01118348, + "auxiliary_loss_mlp": 0.01036925, + "balance_loss_clip": 1.0444473, + "balance_loss_mlp": 1.02371073, + "epoch": 0.8113031715015783, + "flos": 31138321962240.0, + "grad_norm": 1.677444902434574, + "language_loss": 0.70691884, + "learning_rate": 3.619255354436885e-07, + "loss": 0.72847158, + "num_input_tokens_seen": 291241195, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.13232422, + "step": 13494, + "time_per_iteration": 2.4880223274230957 + }, + { + "auxiliary_loss_clip": 0.0111623, + "auxiliary_loss_mlp": 0.01033615, + "balance_loss_clip": 1.04166806, + "balance_loss_mlp": 1.0203526, + "epoch": 0.8113632947542462, + "flos": 25335445000320.0, + "grad_norm": 1.877178085633766, + "language_loss": 0.76686478, + "learning_rate": 3.6170211661385543e-07, + "loss": 0.78836322, + "num_input_tokens_seen": 291258715, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.13275146, + "step": 13495, + "time_per_iteration": 3.832402229309082 + }, + { + "auxiliary_loss_clip": 0.01122278, + "auxiliary_loss_mlp": 0.01034336, + "balance_loss_clip": 1.04762089, + "balance_loss_mlp": 1.02178907, + "epoch": 0.8114234180069142, + "flos": 28439168411520.0, + "grad_norm": 1.8289871223410261, + "language_loss": 0.79911119, + "learning_rate": 3.614787599084417e-07, + "loss": 0.8206774, + "num_input_tokens_seen": 291278030, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.12542725, + "step": 13496, + "time_per_iteration": 2.4699575901031494 + }, + { + "auxiliary_loss_clip": 0.0111267, + "auxiliary_loss_mlp": 0.01030044, + "balance_loss_clip": 1.0409205, + "balance_loss_mlp": 1.01736033, + "epoch": 0.8114835412595821, + "flos": 20338870584960.0, + "grad_norm": 1.5585213953942885, + "language_loss": 0.71070629, + "learning_rate": 3.6125546533591787e-07, + "loss": 0.73213345, + "num_input_tokens_seen": 291296740, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.12689209, + "step": 13497, + "time_per_iteration": 2.4520726203918457 + }, + { + "auxiliary_loss_clip": 0.01117097, + "auxiliary_loss_mlp": 0.01031225, + "balance_loss_clip": 1.0434413, + "balance_loss_mlp": 1.02001977, + "epoch": 0.8115436645122501, + "flos": 22490889194880.0, + "grad_norm": 1.6563939979727091, + "language_loss": 0.76895589, + "learning_rate": 3.610322329047508e-07, + "loss": 0.79043913, + "num_input_tokens_seen": 291318730, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11199951, + "step": 13498, + "time_per_iteration": 2.4638631343841553 + }, + { + "auxiliary_loss_clip": 0.0111889, + "auxiliary_loss_mlp": 0.01032354, + "balance_loss_clip": 1.04703414, + "balance_loss_mlp": 1.0208261, + "epoch": 0.811603787764918, + "flos": 13845288021120.0, + "grad_norm": 2.3006081648684367, + "language_loss": 0.84111041, + "learning_rate": 3.608090626234055e-07, + "loss": 0.86262286, + "num_input_tokens_seen": 291336755, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11523438, + "step": 13499, + "time_per_iteration": 2.4396884441375732 + }, + { + "auxiliary_loss_clip": 0.01109718, + "auxiliary_loss_mlp": 0.01032458, + "balance_loss_clip": 1.03940797, + "balance_loss_mlp": 1.01861215, + "epoch": 0.8116639110175861, + "flos": 21614632911360.0, + "grad_norm": 1.7054573342185146, + "language_loss": 0.76173347, + "learning_rate": 3.6058595450034603e-07, + "loss": 0.7831552, + "num_input_tokens_seen": 291356795, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.13861084, + "step": 13500, + "time_per_iteration": 2.447497844696045 + }, + { + "auxiliary_loss_clip": 0.0104418, + "auxiliary_loss_mlp": 0.01003029, + "balance_loss_clip": 1.01960588, + "balance_loss_mlp": 1.00180006, + "epoch": 0.811724034270254, + "flos": 64459799625600.0, + "grad_norm": 0.8047944825500236, + "language_loss": 0.59932017, + "learning_rate": 3.603629085440303e-07, + "loss": 0.61979234, + "num_input_tokens_seen": 291416005, + "router_z_loss_clip": 0.24584961, + "router_z_loss_mlp": 0.01228333, + "step": 13501, + "time_per_iteration": 3.099191665649414 + }, + { + "auxiliary_loss_clip": 0.01111593, + "auxiliary_loss_mlp": 0.01031763, + "balance_loss_clip": 1.04011083, + "balance_loss_mlp": 1.01952052, + "epoch": 0.811784157522922, + "flos": 24754123290240.0, + "grad_norm": 1.489714128620807, + "language_loss": 0.79207015, + "learning_rate": 3.6013992476291753e-07, + "loss": 0.81350374, + "num_input_tokens_seen": 291434870, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.12249756, + "step": 13502, + "time_per_iteration": 2.5191731452941895 + }, + { + "auxiliary_loss_clip": 0.01118877, + "auxiliary_loss_mlp": 0.01033554, + "balance_loss_clip": 1.04715741, + "balance_loss_mlp": 1.02243137, + "epoch": 0.81184428077559, + "flos": 12167146563840.0, + "grad_norm": 1.9795261478364816, + "language_loss": 0.71109784, + "learning_rate": 3.599170031654635e-07, + "loss": 0.73262215, + "num_input_tokens_seen": 291452230, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11120605, + "step": 13503, + "time_per_iteration": 2.4510562419891357 + }, + { + "auxiliary_loss_clip": 0.01116766, + "auxiliary_loss_mlp": 0.01029564, + "balance_loss_clip": 1.04530287, + "balance_loss_mlp": 1.01617658, + "epoch": 0.8119044040282579, + "flos": 44422037775360.0, + "grad_norm": 1.4521111510189173, + "language_loss": 0.67932725, + "learning_rate": 3.5969414376012065e-07, + "loss": 0.70079052, + "num_input_tokens_seen": 291477425, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.1338501, + "step": 13504, + "time_per_iteration": 4.271721601486206 + }, + { + "auxiliary_loss_clip": 0.01112538, + "auxiliary_loss_mlp": 0.01030509, + "balance_loss_clip": 1.0386889, + "balance_loss_mlp": 1.01764596, + "epoch": 0.8119645272809259, + "flos": 52155507957120.0, + "grad_norm": 1.9407942762684025, + "language_loss": 0.74651593, + "learning_rate": 3.594713465553403e-07, + "loss": 0.76794636, + "num_input_tokens_seen": 291501070, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.128479, + "step": 13505, + "time_per_iteration": 2.6803500652313232 + }, + { + "auxiliary_loss_clip": 0.01112624, + "auxiliary_loss_mlp": 0.01026372, + "balance_loss_clip": 1.03954291, + "balance_loss_mlp": 1.01319897, + "epoch": 0.8120246505335939, + "flos": 30232978640640.0, + "grad_norm": 2.13149065583486, + "language_loss": 0.72625721, + "learning_rate": 3.5924861155957123e-07, + "loss": 0.74764723, + "num_input_tokens_seen": 291524945, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.1317749, + "step": 13506, + "time_per_iteration": 2.544452667236328 + }, + { + "auxiliary_loss_clip": 0.01120173, + "auxiliary_loss_mlp": 0.01028382, + "balance_loss_clip": 1.04313397, + "balance_loss_mlp": 1.01593637, + "epoch": 0.8120847737862619, + "flos": 22127652910080.0, + "grad_norm": 2.1688190016170212, + "language_loss": 0.75976378, + "learning_rate": 3.590259387812593e-07, + "loss": 0.78124928, + "num_input_tokens_seen": 291544605, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.12445068, + "step": 13507, + "time_per_iteration": 2.4058620929718018 + }, + { + "auxiliary_loss_clip": 0.01121579, + "auxiliary_loss_mlp": 0.01026732, + "balance_loss_clip": 1.0451858, + "balance_loss_mlp": 1.01492393, + "epoch": 0.8121448970389298, + "flos": 23295180579840.0, + "grad_norm": 1.8459024066150547, + "language_loss": 0.69834971, + "learning_rate": 3.5880332822884783e-07, + "loss": 0.71983284, + "num_input_tokens_seen": 291563850, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.11810303, + "step": 13508, + "time_per_iteration": 2.4473204612731934 + }, + { + "auxiliary_loss_clip": 0.01109424, + "auxiliary_loss_mlp": 0.01031691, + "balance_loss_clip": 1.03909266, + "balance_loss_mlp": 1.02065253, + "epoch": 0.8122050202915978, + "flos": 22164138149760.0, + "grad_norm": 1.6392160190211857, + "language_loss": 0.76068246, + "learning_rate": 3.585807799107785e-07, + "loss": 0.78209358, + "num_input_tokens_seen": 291581730, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.11047363, + "step": 13509, + "time_per_iteration": 2.4266116619110107 + }, + { + "auxiliary_loss_clip": 0.01116727, + "auxiliary_loss_mlp": 0.01032533, + "balance_loss_clip": 1.04299998, + "balance_loss_mlp": 1.0196172, + "epoch": 0.8122651435442657, + "flos": 23258946735360.0, + "grad_norm": 1.8989270784291774, + "language_loss": 0.77092475, + "learning_rate": 3.58358293835491e-07, + "loss": 0.79241735, + "num_input_tokens_seen": 291601225, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.12921143, + "step": 13510, + "time_per_iteration": 2.4875543117523193 + }, + { + "auxiliary_loss_clip": 0.01120538, + "auxiliary_loss_mlp": 0.01033949, + "balance_loss_clip": 1.04558229, + "balance_loss_mlp": 1.02171755, + "epoch": 0.8123252667969337, + "flos": 16140015365760.0, + "grad_norm": 1.7750033131621892, + "language_loss": 0.69849718, + "learning_rate": 3.581358700114212e-07, + "loss": 0.72004211, + "num_input_tokens_seen": 291616995, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.12231445, + "step": 13511, + "time_per_iteration": 2.446495294570923 + }, + { + "auxiliary_loss_clip": 0.01115927, + "auxiliary_loss_mlp": 0.01034672, + "balance_loss_clip": 1.04042888, + "balance_loss_mlp": 1.02232206, + "epoch": 0.8123853900496016, + "flos": 21245399055360.0, + "grad_norm": 1.8146138063220805, + "language_loss": 0.79727447, + "learning_rate": 3.57913508447004e-07, + "loss": 0.81878048, + "num_input_tokens_seen": 291636145, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.12359619, + "step": 13512, + "time_per_iteration": 2.450762987136841 + }, + { + "auxiliary_loss_clip": 0.01111774, + "auxiliary_loss_mlp": 0.01029645, + "balance_loss_clip": 1.0407033, + "balance_loss_mlp": 1.01809335, + "epoch": 0.8124455133022697, + "flos": 64377596373120.0, + "grad_norm": 1.6258107386996876, + "language_loss": 0.63672262, + "learning_rate": 3.5769120915067076e-07, + "loss": 0.65813684, + "num_input_tokens_seen": 291662440, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.11547852, + "step": 13513, + "time_per_iteration": 2.8184449672698975 + }, + { + "auxiliary_loss_clip": 0.01119702, + "auxiliary_loss_mlp": 0.01034837, + "balance_loss_clip": 1.04249215, + "balance_loss_mlp": 1.02249312, + "epoch": 0.8125056365549376, + "flos": 23842207779840.0, + "grad_norm": 1.6688014523847727, + "language_loss": 0.71311295, + "learning_rate": 3.5746897213085194e-07, + "loss": 0.73465836, + "num_input_tokens_seen": 291680950, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12347412, + "step": 13514, + "time_per_iteration": 2.4534521102905273 + }, + { + "auxiliary_loss_clip": 0.01110128, + "auxiliary_loss_mlp": 0.01032714, + "balance_loss_clip": 1.03900683, + "balance_loss_mlp": 1.02084708, + "epoch": 0.8125657598076056, + "flos": 23550325862400.0, + "grad_norm": 1.7687494096380236, + "language_loss": 0.62947905, + "learning_rate": 3.5724679739597364e-07, + "loss": 0.65090746, + "num_input_tokens_seen": 291702395, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.11865234, + "step": 13515, + "time_per_iteration": 2.496101140975952 + }, + { + "auxiliary_loss_clip": 0.01112346, + "auxiliary_loss_mlp": 0.01027324, + "balance_loss_clip": 1.04350817, + "balance_loss_mlp": 1.01596379, + "epoch": 0.8126258830602736, + "flos": 20704225772160.0, + "grad_norm": 1.4812695500468283, + "language_loss": 0.7527861, + "learning_rate": 3.570246849544616e-07, + "loss": 0.7741828, + "num_input_tokens_seen": 291721135, + "router_z_loss_clip": 0.68847656, + "router_z_loss_mlp": 0.11358643, + "step": 13516, + "time_per_iteration": 2.479059934616089 + }, + { + "auxiliary_loss_clip": 0.01120861, + "auxiliary_loss_mlp": 0.0103144, + "balance_loss_clip": 1.04598153, + "balance_loss_mlp": 1.01979876, + "epoch": 0.8126860063129415, + "flos": 23618160696960.0, + "grad_norm": 2.211058401687289, + "language_loss": 0.91419971, + "learning_rate": 3.5680263481473907e-07, + "loss": 0.93572277, + "num_input_tokens_seen": 291741235, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.11639404, + "step": 13517, + "time_per_iteration": 2.654397964477539 + }, + { + "auxiliary_loss_clip": 0.01124815, + "auxiliary_loss_mlp": 0.01032125, + "balance_loss_clip": 1.04982173, + "balance_loss_mlp": 1.02053773, + "epoch": 0.8127461295656095, + "flos": 25007149670400.0, + "grad_norm": 1.5088475374028891, + "language_loss": 0.78258741, + "learning_rate": 3.565806469852244e-07, + "loss": 0.80415678, + "num_input_tokens_seen": 291761430, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11578369, + "step": 13518, + "time_per_iteration": 2.519754409790039 + }, + { + "auxiliary_loss_clip": 0.01114368, + "auxiliary_loss_mlp": 0.01027924, + "balance_loss_clip": 1.04400527, + "balance_loss_mlp": 1.01781487, + "epoch": 0.8128062528182775, + "flos": 27342169096320.0, + "grad_norm": 1.6343137296326498, + "language_loss": 0.79197049, + "learning_rate": 3.56358721474336e-07, + "loss": 0.81339341, + "num_input_tokens_seen": 291781755, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.10107422, + "step": 13519, + "time_per_iteration": 2.5105838775634766 + }, + { + "auxiliary_loss_clip": 0.0111701, + "auxiliary_loss_mlp": 0.01034571, + "balance_loss_clip": 1.04341936, + "balance_loss_mlp": 1.02322841, + "epoch": 0.8128663760709455, + "flos": 26506312634880.0, + "grad_norm": 1.6834466736128508, + "language_loss": 0.70471966, + "learning_rate": 3.561368582904905e-07, + "loss": 0.72623551, + "num_input_tokens_seen": 291804410, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11334229, + "step": 13520, + "time_per_iteration": 3.9114880561828613 + }, + { + "auxiliary_loss_clip": 0.01116489, + "auxiliary_loss_mlp": 0.01033193, + "balance_loss_clip": 1.04413378, + "balance_loss_mlp": 1.02131391, + "epoch": 0.8129264993236134, + "flos": 17931239815680.0, + "grad_norm": 1.4779056622152735, + "language_loss": 0.7272653, + "learning_rate": 3.5591505744209925e-07, + "loss": 0.74876213, + "num_input_tokens_seen": 291823285, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11883545, + "step": 13521, + "time_per_iteration": 2.4496593475341797 + }, + { + "auxiliary_loss_clip": 0.01119936, + "auxiliary_loss_mlp": 0.01029836, + "balance_loss_clip": 1.0452379, + "balance_loss_mlp": 1.01788557, + "epoch": 0.8129866225762814, + "flos": 26177694082560.0, + "grad_norm": 1.6330016019838784, + "language_loss": 0.70306903, + "learning_rate": 3.5569331893757394e-07, + "loss": 0.72456682, + "num_input_tokens_seen": 291845305, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.11950684, + "step": 13522, + "time_per_iteration": 2.5205132961273193 + }, + { + "auxiliary_loss_clip": 0.01112208, + "auxiliary_loss_mlp": 0.01031061, + "balance_loss_clip": 1.0436089, + "balance_loss_mlp": 1.02020645, + "epoch": 0.8130467458289493, + "flos": 21032197879680.0, + "grad_norm": 1.5584547810035203, + "language_loss": 0.70520532, + "learning_rate": 3.554716427853233e-07, + "loss": 0.72663796, + "num_input_tokens_seen": 291863715, + "router_z_loss_clip": 0.68652344, + "router_z_loss_mlp": 0.10852051, + "step": 13523, + "time_per_iteration": 2.4523918628692627 + }, + { + "auxiliary_loss_clip": 0.01105468, + "auxiliary_loss_mlp": 0.01028777, + "balance_loss_clip": 1.03386152, + "balance_loss_mlp": 1.01699901, + "epoch": 0.8131068690816173, + "flos": 15487051979520.0, + "grad_norm": 2.256270066995164, + "language_loss": 0.70894927, + "learning_rate": 3.5525002899375256e-07, + "loss": 0.73029172, + "num_input_tokens_seen": 291880735, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11779785, + "step": 13524, + "time_per_iteration": 2.40173602104187 + }, + { + "auxiliary_loss_clip": 0.01115731, + "auxiliary_loss_mlp": 0.01030011, + "balance_loss_clip": 1.0434742, + "balance_loss_mlp": 1.01862025, + "epoch": 0.8131669923342852, + "flos": 29351227576320.0, + "grad_norm": 1.8305766834807722, + "language_loss": 0.62913322, + "learning_rate": 3.550284775712653e-07, + "loss": 0.65059066, + "num_input_tokens_seen": 291900535, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11401367, + "step": 13525, + "time_per_iteration": 2.5104129314422607 + }, + { + "auxiliary_loss_clip": 0.01112497, + "auxiliary_loss_mlp": 0.01038346, + "balance_loss_clip": 1.039626, + "balance_loss_mlp": 1.02470279, + "epoch": 0.8132271155869533, + "flos": 35256162055680.0, + "grad_norm": 3.9147781623357396, + "language_loss": 0.65486997, + "learning_rate": 3.548069885262628e-07, + "loss": 0.67637837, + "num_input_tokens_seen": 291919760, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.13647461, + "step": 13526, + "time_per_iteration": 2.541184663772583 + }, + { + "auxiliary_loss_clip": 0.01112598, + "auxiliary_loss_mlp": 0.01035473, + "balance_loss_clip": 1.04036736, + "balance_loss_mlp": 1.02237177, + "epoch": 0.8132872388396212, + "flos": 27781895393280.0, + "grad_norm": 2.5525242434783437, + "language_loss": 0.75422513, + "learning_rate": 3.5458556186714473e-07, + "loss": 0.77570593, + "num_input_tokens_seen": 291938915, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.13098145, + "step": 13527, + "time_per_iteration": 2.5032851696014404 + }, + { + "auxiliary_loss_clip": 0.01113704, + "auxiliary_loss_mlp": 0.01028745, + "balance_loss_clip": 1.03956246, + "balance_loss_mlp": 1.01713943, + "epoch": 0.8133473620922892, + "flos": 27819601695360.0, + "grad_norm": 1.8184751483748485, + "language_loss": 0.70362681, + "learning_rate": 3.5436419760230706e-07, + "loss": 0.72505128, + "num_input_tokens_seen": 291958145, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11590576, + "step": 13528, + "time_per_iteration": 2.5151703357696533 + }, + { + "auxiliary_loss_clip": 0.01115131, + "auxiliary_loss_mlp": 0.01030261, + "balance_loss_clip": 1.04116213, + "balance_loss_mlp": 1.01907361, + "epoch": 0.8134074853449572, + "flos": 18989527248000.0, + "grad_norm": 2.78041691762859, + "language_loss": 0.68932545, + "learning_rate": 3.5414289574014357e-07, + "loss": 0.71077943, + "num_input_tokens_seen": 291976860, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11175537, + "step": 13529, + "time_per_iteration": 2.4595491886138916 + }, + { + "auxiliary_loss_clip": 0.0110998, + "auxiliary_loss_mlp": 0.01025126, + "balance_loss_clip": 1.03941846, + "balance_loss_mlp": 1.01419437, + "epoch": 0.8134676085976251, + "flos": 24242863057920.0, + "grad_norm": 1.3566832022839004, + "language_loss": 0.77293181, + "learning_rate": 3.5392165628904635e-07, + "loss": 0.79428291, + "num_input_tokens_seen": 291998085, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.10943604, + "step": 13530, + "time_per_iteration": 2.451460838317871 + }, + { + "auxiliary_loss_clip": 0.01114178, + "auxiliary_loss_mlp": 0.01031429, + "balance_loss_clip": 1.04299235, + "balance_loss_mlp": 1.01903105, + "epoch": 0.8135277318502931, + "flos": 19062389986560.0, + "grad_norm": 1.786736373643584, + "language_loss": 0.82240391, + "learning_rate": 3.537004792574052e-07, + "loss": 0.84385991, + "num_input_tokens_seen": 292016585, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.12402344, + "step": 13531, + "time_per_iteration": 2.610924482345581 + }, + { + "auxiliary_loss_clip": 0.01114984, + "auxiliary_loss_mlp": 0.01026572, + "balance_loss_clip": 1.04072523, + "balance_loss_mlp": 1.01420403, + "epoch": 0.813587855102961, + "flos": 17269728992640.0, + "grad_norm": 2.081784706695319, + "language_loss": 0.71324408, + "learning_rate": 3.534793646536065e-07, + "loss": 0.73465967, + "num_input_tokens_seen": 292033255, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12365723, + "step": 13532, + "time_per_iteration": 2.361362934112549 + }, + { + "auxiliary_loss_clip": 0.0111303, + "auxiliary_loss_mlp": 0.01030268, + "balance_loss_clip": 1.0418458, + "balance_loss_mlp": 1.01900244, + "epoch": 0.8136479783556291, + "flos": 20157593621760.0, + "grad_norm": 2.074001733870441, + "language_loss": 0.76281321, + "learning_rate": 3.5325831248603533e-07, + "loss": 0.78424615, + "num_input_tokens_seen": 292051800, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.11273193, + "step": 13533, + "time_per_iteration": 2.425790786743164 + }, + { + "auxiliary_loss_clip": 0.01119619, + "auxiliary_loss_mlp": 0.01036787, + "balance_loss_clip": 1.04258704, + "balance_loss_mlp": 1.0238049, + "epoch": 0.813708101608297, + "flos": 22052348046720.0, + "grad_norm": 2.937045626788104, + "language_loss": 0.76647329, + "learning_rate": 3.5303732276307495e-07, + "loss": 0.7880373, + "num_input_tokens_seen": 292072215, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.12969971, + "step": 13534, + "time_per_iteration": 2.4364638328552246 + }, + { + "auxiliary_loss_clip": 0.01122265, + "auxiliary_loss_mlp": 0.01027813, + "balance_loss_clip": 1.04969394, + "balance_loss_mlp": 1.01745379, + "epoch": 0.813768224860965, + "flos": 16173412035840.0, + "grad_norm": 2.114545353259075, + "language_loss": 0.928738, + "learning_rate": 3.5281639549310336e-07, + "loss": 0.95023876, + "num_input_tokens_seen": 292088830, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.1036377, + "step": 13535, + "time_per_iteration": 3.8362417221069336 + }, + { + "auxiliary_loss_clip": 0.01123305, + "auxiliary_loss_mlp": 0.0102905, + "balance_loss_clip": 1.05335498, + "balance_loss_mlp": 1.0175941, + "epoch": 0.8138283481136329, + "flos": 24352318776960.0, + "grad_norm": 1.5676805415686974, + "language_loss": 0.70304096, + "learning_rate": 3.52595530684499e-07, + "loss": 0.72456455, + "num_input_tokens_seen": 292109225, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.11468506, + "step": 13536, + "time_per_iteration": 2.4752986431121826 + }, + { + "auxiliary_loss_clip": 0.01114704, + "auxiliary_loss_mlp": 0.01032136, + "balance_loss_clip": 1.04375231, + "balance_loss_mlp": 1.02050674, + "epoch": 0.8138884713663009, + "flos": 25516362827520.0, + "grad_norm": 1.5096396796143434, + "language_loss": 0.75569427, + "learning_rate": 3.5237472834563775e-07, + "loss": 0.77716267, + "num_input_tokens_seen": 292129660, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11633301, + "step": 13537, + "time_per_iteration": 2.458421468734741 + }, + { + "auxiliary_loss_clip": 0.01113868, + "auxiliary_loss_mlp": 0.01033332, + "balance_loss_clip": 1.04164875, + "balance_loss_mlp": 1.02152419, + "epoch": 0.8139485946189688, + "flos": 22454368041600.0, + "grad_norm": 1.4715294315152998, + "language_loss": 0.76257777, + "learning_rate": 3.5215398848489163e-07, + "loss": 0.78404975, + "num_input_tokens_seen": 292149090, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11810303, + "step": 13538, + "time_per_iteration": 2.4169952869415283 + }, + { + "auxiliary_loss_clip": 0.01121863, + "auxiliary_loss_mlp": 0.01027774, + "balance_loss_clip": 1.04817748, + "balance_loss_mlp": 1.01647341, + "epoch": 0.8140087178716369, + "flos": 21250391045760.0, + "grad_norm": 1.6304264588544908, + "language_loss": 0.77403355, + "learning_rate": 3.5193331111063176e-07, + "loss": 0.79552984, + "num_input_tokens_seen": 292169260, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11309814, + "step": 13539, + "time_per_iteration": 3.842560052871704 + }, + { + "auxiliary_loss_clip": 0.01123082, + "auxiliary_loss_mlp": 0.01031107, + "balance_loss_clip": 1.05356538, + "balance_loss_mlp": 1.0196631, + "epoch": 0.8140688411243048, + "flos": 39415730774400.0, + "grad_norm": 2.571565444201279, + "language_loss": 0.66170669, + "learning_rate": 3.5171269623122533e-07, + "loss": 0.68324852, + "num_input_tokens_seen": 292188145, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.11437988, + "step": 13540, + "time_per_iteration": 2.600062370300293 + }, + { + "auxiliary_loss_clip": 0.01110251, + "auxiliary_loss_mlp": 0.01033355, + "balance_loss_clip": 1.03938508, + "balance_loss_mlp": 1.02273905, + "epoch": 0.8141289643769728, + "flos": 25415885508480.0, + "grad_norm": 1.6119152234864143, + "language_loss": 0.6727553, + "learning_rate": 3.5149214385503913e-07, + "loss": 0.69419134, + "num_input_tokens_seen": 292212135, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.10632324, + "step": 13541, + "time_per_iteration": 2.5287137031555176 + }, + { + "auxiliary_loss_clip": 0.01117866, + "auxiliary_loss_mlp": 0.01033699, + "balance_loss_clip": 1.04646862, + "balance_loss_mlp": 1.02153993, + "epoch": 0.8141890876296408, + "flos": 12568053237120.0, + "grad_norm": 1.7888157392542172, + "language_loss": 0.68854427, + "learning_rate": 3.512716539904355e-07, + "loss": 0.71005988, + "num_input_tokens_seen": 292230645, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.12164307, + "step": 13542, + "time_per_iteration": 2.5052709579467773 + }, + { + "auxiliary_loss_clip": 0.01109981, + "auxiliary_loss_mlp": 0.01032124, + "balance_loss_clip": 1.03553677, + "balance_loss_mlp": 1.01837945, + "epoch": 0.8142492108823087, + "flos": 14967172483200.0, + "grad_norm": 3.8006880335006454, + "language_loss": 0.80003083, + "learning_rate": 3.5105122664577613e-07, + "loss": 0.8214519, + "num_input_tokens_seen": 292243540, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.13739014, + "step": 13543, + "time_per_iteration": 2.387831926345825 + }, + { + "auxiliary_loss_clip": 0.01116304, + "auxiliary_loss_mlp": 0.01047155, + "balance_loss_clip": 1.04081845, + "balance_loss_mlp": 1.03239727, + "epoch": 0.8143093341349767, + "flos": 12422004537600.0, + "grad_norm": 2.220464472488652, + "language_loss": 0.77821958, + "learning_rate": 3.5083086182942003e-07, + "loss": 0.79985416, + "num_input_tokens_seen": 292261715, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.14758301, + "step": 13544, + "time_per_iteration": 2.563214063644409 + }, + { + "auxiliary_loss_clip": 0.01125635, + "auxiliary_loss_mlp": 0.01033694, + "balance_loss_clip": 1.04528844, + "balance_loss_mlp": 1.0200026, + "epoch": 0.8143694573876447, + "flos": 11910564737280.0, + "grad_norm": 3.3442687798066353, + "language_loss": 0.74120718, + "learning_rate": 3.5061055954972264e-07, + "loss": 0.76280046, + "num_input_tokens_seen": 292275080, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.13696289, + "step": 13545, + "time_per_iteration": 2.507551431655884 + }, + { + "auxiliary_loss_clip": 0.01116089, + "auxiliary_loss_mlp": 0.01027614, + "balance_loss_clip": 1.044487, + "balance_loss_mlp": 1.01624751, + "epoch": 0.8144295806403127, + "flos": 21212900225280.0, + "grad_norm": 1.59839887875446, + "language_loss": 0.76710832, + "learning_rate": 3.5039031981503776e-07, + "loss": 0.78854537, + "num_input_tokens_seen": 292294635, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11376953, + "step": 13546, + "time_per_iteration": 2.525571346282959 + }, + { + "auxiliary_loss_clip": 0.01120924, + "auxiliary_loss_mlp": 0.0103214, + "balance_loss_clip": 1.04535842, + "balance_loss_mlp": 1.02086234, + "epoch": 0.8144897038929806, + "flos": 19865280741120.0, + "grad_norm": 2.2789569838519332, + "language_loss": 0.70786077, + "learning_rate": 3.501701426337178e-07, + "loss": 0.7293914, + "num_input_tokens_seen": 292312695, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.11291504, + "step": 13547, + "time_per_iteration": 3.884549140930176 + }, + { + "auxiliary_loss_clip": 0.01119347, + "auxiliary_loss_mlp": 0.01034458, + "balance_loss_clip": 1.04402757, + "balance_loss_mlp": 1.02117169, + "epoch": 0.8145498271456486, + "flos": 24571733005440.0, + "grad_norm": 1.878057077589705, + "language_loss": 0.7089023, + "learning_rate": 3.49950028014111e-07, + "loss": 0.73044038, + "num_input_tokens_seen": 292332005, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.13299561, + "step": 13548, + "time_per_iteration": 2.4944190979003906 + }, + { + "auxiliary_loss_clip": 0.0112373, + "auxiliary_loss_mlp": 0.01031444, + "balance_loss_clip": 1.04624557, + "balance_loss_mlp": 1.01823545, + "epoch": 0.8146099503983165, + "flos": 20193037367040.0, + "grad_norm": 2.071539828458204, + "language_loss": 0.76603568, + "learning_rate": 3.4972997596456444e-07, + "loss": 0.7875874, + "num_input_tokens_seen": 292348365, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.13208008, + "step": 13549, + "time_per_iteration": 2.4286398887634277 + }, + { + "auxiliary_loss_clip": 0.01119333, + "auxiliary_loss_mlp": 0.01027704, + "balance_loss_clip": 1.047562, + "balance_loss_mlp": 1.01583111, + "epoch": 0.8146700736509845, + "flos": 19536949497600.0, + "grad_norm": 2.929048289237846, + "language_loss": 0.71335566, + "learning_rate": 3.4950998649342233e-07, + "loss": 0.73482597, + "num_input_tokens_seen": 292368050, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11871338, + "step": 13550, + "time_per_iteration": 2.4829835891723633 + }, + { + "auxiliary_loss_clip": 0.01117367, + "auxiliary_loss_mlp": 0.01021702, + "balance_loss_clip": 1.04935193, + "balance_loss_mlp": 1.01129484, + "epoch": 0.8147301969036524, + "flos": 18041341979520.0, + "grad_norm": 1.8635900730404331, + "language_loss": 0.71933365, + "learning_rate": 3.4929005960902826e-07, + "loss": 0.74072438, + "num_input_tokens_seen": 292385315, + "router_z_loss_clip": 0.68017578, + "router_z_loss_mlp": 0.10406494, + "step": 13551, + "time_per_iteration": 2.394498825073242 + }, + { + "auxiliary_loss_clip": 0.01126475, + "auxiliary_loss_mlp": 0.01030284, + "balance_loss_clip": 1.0497067, + "balance_loss_mlp": 1.01742148, + "epoch": 0.8147903201563205, + "flos": 18004713085440.0, + "grad_norm": 1.9772263347038639, + "language_loss": 0.686566, + "learning_rate": 3.4907019531971926e-07, + "loss": 0.70813358, + "num_input_tokens_seen": 292403375, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.12866211, + "step": 13552, + "time_per_iteration": 2.448242664337158 + }, + { + "auxiliary_loss_clip": 0.01109521, + "auxiliary_loss_mlp": 0.01039959, + "balance_loss_clip": 1.0384872, + "balance_loss_mlp": 1.02841353, + "epoch": 0.8148504434089884, + "flos": 20259327916800.0, + "grad_norm": 1.8235448336993065, + "language_loss": 0.82361573, + "learning_rate": 3.4885039363383407e-07, + "loss": 0.84511054, + "num_input_tokens_seen": 292419260, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.11553955, + "step": 13553, + "time_per_iteration": 2.3971168994903564 + }, + { + "auxiliary_loss_clip": 0.01121671, + "auxiliary_loss_mlp": 0.01028742, + "balance_loss_clip": 1.0502876, + "balance_loss_mlp": 1.01732183, + "epoch": 0.8149105666616564, + "flos": 12494723621760.0, + "grad_norm": 1.9680448602715062, + "language_loss": 0.68058217, + "learning_rate": 3.4863065455970795e-07, + "loss": 0.70208633, + "num_input_tokens_seen": 292436095, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.11413574, + "step": 13554, + "time_per_iteration": 2.4352545738220215 + }, + { + "auxiliary_loss_clip": 0.01120362, + "auxiliary_loss_mlp": 0.01031981, + "balance_loss_clip": 1.04681921, + "balance_loss_mlp": 1.01933265, + "epoch": 0.8149706899143244, + "flos": 32523683662080.0, + "grad_norm": 1.7941728772161885, + "language_loss": 0.66433287, + "learning_rate": 3.484109781056723e-07, + "loss": 0.68585634, + "num_input_tokens_seen": 292457190, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.12652588, + "step": 13555, + "time_per_iteration": 2.4912590980529785 + }, + { + "auxiliary_loss_clip": 0.01117871, + "auxiliary_loss_mlp": 0.01044461, + "balance_loss_clip": 1.04059291, + "balance_loss_mlp": 1.02906561, + "epoch": 0.8150308131669923, + "flos": 19386088375680.0, + "grad_norm": 2.632134954641455, + "language_loss": 0.72907102, + "learning_rate": 3.4819136428005844e-07, + "loss": 0.75069427, + "num_input_tokens_seen": 292474300, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.15386963, + "step": 13556, + "time_per_iteration": 2.4353790283203125 + }, + { + "auxiliary_loss_clip": 0.01113922, + "auxiliary_loss_mlp": 0.010273, + "balance_loss_clip": 1.04257703, + "balance_loss_mlp": 1.01677942, + "epoch": 0.8150909364196604, + "flos": 17421380213760.0, + "grad_norm": 1.6096935741365916, + "language_loss": 0.80262685, + "learning_rate": 3.4797181309119307e-07, + "loss": 0.82403904, + "num_input_tokens_seen": 292492420, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.10522461, + "step": 13557, + "time_per_iteration": 2.487597703933716 + }, + { + "auxiliary_loss_clip": 0.01122283, + "auxiliary_loss_mlp": 0.01035293, + "balance_loss_clip": 1.04438221, + "balance_loss_mlp": 1.0229075, + "epoch": 0.8151510596723283, + "flos": 27162795553920.0, + "grad_norm": 1.8512202789469017, + "language_loss": 0.65690064, + "learning_rate": 3.4775232454740255e-07, + "loss": 0.67847639, + "num_input_tokens_seen": 292512895, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.12384033, + "step": 13558, + "time_per_iteration": 2.4526665210723877 + }, + { + "auxiliary_loss_clip": 0.01041186, + "auxiliary_loss_mlp": 0.01003839, + "balance_loss_clip": 1.01566219, + "balance_loss_mlp": 1.00238323, + "epoch": 0.8152111829249963, + "flos": 64219052718720.0, + "grad_norm": 1.147504743161026, + "language_loss": 0.56921649, + "learning_rate": 3.4753289865700896e-07, + "loss": 0.58966678, + "num_input_tokens_seen": 292566580, + "router_z_loss_clip": 0.25439453, + "router_z_loss_mlp": 0.01454163, + "step": 13559, + "time_per_iteration": 2.97895884513855 + }, + { + "auxiliary_loss_clip": 0.01040672, + "auxiliary_loss_mlp": 0.01004184, + "balance_loss_clip": 1.01492548, + "balance_loss_mlp": 1.00284457, + "epoch": 0.8152713061776642, + "flos": 67072012306560.0, + "grad_norm": 0.6759819146051423, + "language_loss": 0.55301368, + "learning_rate": 3.473135354283334e-07, + "loss": 0.57346225, + "num_input_tokens_seen": 292621490, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.01339722, + "step": 13560, + "time_per_iteration": 2.911686420440674 + }, + { + "auxiliary_loss_clip": 0.01118634, + "auxiliary_loss_mlp": 0.01029499, + "balance_loss_clip": 1.04782534, + "balance_loss_mlp": 1.01844859, + "epoch": 0.8153314294303322, + "flos": 14391130072320.0, + "grad_norm": 1.7882909387069053, + "language_loss": 0.67886162, + "learning_rate": 3.470942348696948e-07, + "loss": 0.70034289, + "num_input_tokens_seen": 292638660, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11053467, + "step": 13561, + "time_per_iteration": 2.4724462032318115 + }, + { + "auxiliary_loss_clip": 0.01126375, + "auxiliary_loss_mlp": 0.01034436, + "balance_loss_clip": 1.04584634, + "balance_loss_mlp": 1.02219915, + "epoch": 0.8153915526830001, + "flos": 25623520076160.0, + "grad_norm": 1.765933047235193, + "language_loss": 0.81394851, + "learning_rate": 3.468749969894085e-07, + "loss": 0.83555663, + "num_input_tokens_seen": 292658545, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.12243652, + "step": 13562, + "time_per_iteration": 2.44230318069458 + }, + { + "auxiliary_loss_clip": 0.01122483, + "auxiliary_loss_mlp": 0.01040608, + "balance_loss_clip": 1.04381776, + "balance_loss_mlp": 1.0282166, + "epoch": 0.8154516759356681, + "flos": 23369156640000.0, + "grad_norm": 1.553609007495868, + "language_loss": 0.71952152, + "learning_rate": 3.4665582179578734e-07, + "loss": 0.74115241, + "num_input_tokens_seen": 292678460, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.1239624, + "step": 13563, + "time_per_iteration": 3.94350266456604 + }, + { + "auxiliary_loss_clip": 0.01116638, + "auxiliary_loss_mlp": 0.01029556, + "balance_loss_clip": 1.04395771, + "balance_loss_mlp": 1.01673555, + "epoch": 0.815511799188336, + "flos": 28149189914880.0, + "grad_norm": 1.8901956652507546, + "language_loss": 0.69836593, + "learning_rate": 3.4643670929714387e-07, + "loss": 0.71982783, + "num_input_tokens_seen": 292699815, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.12835693, + "step": 13564, + "time_per_iteration": 2.4938557147979736 + }, + { + "auxiliary_loss_clip": 0.01116383, + "auxiliary_loss_mlp": 0.01024683, + "balance_loss_clip": 1.04181838, + "balance_loss_mlp": 1.01273203, + "epoch": 0.8155719224410041, + "flos": 16983413683200.0, + "grad_norm": 1.9612362480343355, + "language_loss": 0.70252085, + "learning_rate": 3.462176595017854e-07, + "loss": 0.72393143, + "num_input_tokens_seen": 292717370, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11962891, + "step": 13565, + "time_per_iteration": 2.489614486694336 + }, + { + "auxiliary_loss_clip": 0.01111971, + "auxiliary_loss_mlp": 0.01032525, + "balance_loss_clip": 1.04216385, + "balance_loss_mlp": 1.02081275, + "epoch": 0.815632045693672, + "flos": 24681727428480.0, + "grad_norm": 1.8230066996727745, + "language_loss": 0.78849113, + "learning_rate": 3.459986724180188e-07, + "loss": 0.80993605, + "num_input_tokens_seen": 292737110, + "router_z_loss_clip": 0.69677734, + "router_z_loss_mlp": 0.1171875, + "step": 13566, + "time_per_iteration": 2.462228536605835 + }, + { + "auxiliary_loss_clip": 0.01114299, + "auxiliary_loss_mlp": 0.01027513, + "balance_loss_clip": 1.04442191, + "balance_loss_mlp": 1.01718402, + "epoch": 0.81569216894634, + "flos": 19938323047680.0, + "grad_norm": 1.6491869070597314, + "language_loss": 0.82184744, + "learning_rate": 3.457797480541491e-07, + "loss": 0.84326553, + "num_input_tokens_seen": 292756510, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.10333252, + "step": 13567, + "time_per_iteration": 2.534396171569824 + }, + { + "auxiliary_loss_clip": 0.01111976, + "auxiliary_loss_mlp": 0.01024826, + "balance_loss_clip": 1.04283309, + "balance_loss_mlp": 1.01464558, + "epoch": 0.8157522921990079, + "flos": 21799393493760.0, + "grad_norm": 2.566970469923295, + "language_loss": 0.79739034, + "learning_rate": 3.455608864184771e-07, + "loss": 0.81875837, + "num_input_tokens_seen": 292776710, + "router_z_loss_clip": 0.69042969, + "router_z_loss_mlp": 0.10180664, + "step": 13568, + "time_per_iteration": 2.528026819229126 + }, + { + "auxiliary_loss_clip": 0.01118959, + "auxiliary_loss_mlp": 0.0102733, + "balance_loss_clip": 1.04969037, + "balance_loss_mlp": 1.01607621, + "epoch": 0.8158124154516759, + "flos": 18508323720960.0, + "grad_norm": 1.7886777074259868, + "language_loss": 0.77334952, + "learning_rate": 3.453420875193016e-07, + "loss": 0.79481238, + "num_input_tokens_seen": 292794350, + "router_z_loss_clip": 0.69238281, + "router_z_loss_mlp": 0.11260986, + "step": 13569, + "time_per_iteration": 2.461480140686035 + }, + { + "auxiliary_loss_clip": 0.0111458, + "auxiliary_loss_mlp": 0.01034242, + "balance_loss_clip": 1.04172873, + "balance_loss_mlp": 1.02286983, + "epoch": 0.815872538704344, + "flos": 26830801123200.0, + "grad_norm": 2.4785678276964402, + "language_loss": 0.5856005, + "learning_rate": 3.451233513649199e-07, + "loss": 0.60708869, + "num_input_tokens_seen": 292814005, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11376953, + "step": 13570, + "time_per_iteration": 2.6362569332122803 + }, + { + "auxiliary_loss_clip": 0.01119415, + "auxiliary_loss_mlp": 0.01038699, + "balance_loss_clip": 1.04427207, + "balance_loss_mlp": 1.02481747, + "epoch": 0.8159326619570119, + "flos": 21725704742400.0, + "grad_norm": 1.8247767160453248, + "language_loss": 0.82335448, + "learning_rate": 3.4490467796362687e-07, + "loss": 0.84493566, + "num_input_tokens_seen": 292833485, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.13891602, + "step": 13571, + "time_per_iteration": 2.4688730239868164 + }, + { + "auxiliary_loss_clip": 0.01125426, + "auxiliary_loss_mlp": 0.01037286, + "balance_loss_clip": 1.05154729, + "balance_loss_mlp": 1.02548397, + "epoch": 0.8159927852096799, + "flos": 13840726993920.0, + "grad_norm": 2.6375407346630593, + "language_loss": 0.78733194, + "learning_rate": 3.446860673237142e-07, + "loss": 0.80895907, + "num_input_tokens_seen": 292848045, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11804199, + "step": 13572, + "time_per_iteration": 2.407738208770752 + }, + { + "auxiliary_loss_clip": 0.01121645, + "auxiliary_loss_mlp": 0.01028902, + "balance_loss_clip": 1.04748249, + "balance_loss_mlp": 1.01791644, + "epoch": 0.8160529084623478, + "flos": 24499516711680.0, + "grad_norm": 1.444190545440613, + "language_loss": 0.65206403, + "learning_rate": 3.4446751945347186e-07, + "loss": 0.6735695, + "num_input_tokens_seen": 292869965, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.10986328, + "step": 13573, + "time_per_iteration": 2.5722503662109375 + }, + { + "auxiliary_loss_clip": 0.01115757, + "auxiliary_loss_mlp": 0.01029253, + "balance_loss_clip": 1.04457951, + "balance_loss_mlp": 1.01847625, + "epoch": 0.8161130317150158, + "flos": 24826339584000.0, + "grad_norm": 1.8976025187977923, + "language_loss": 0.75262868, + "learning_rate": 3.442490343611868e-07, + "loss": 0.77407873, + "num_input_tokens_seen": 292889680, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.10784912, + "step": 13574, + "time_per_iteration": 2.4735498428344727 + }, + { + "auxiliary_loss_clip": 0.01117041, + "auxiliary_loss_mlp": 0.01031635, + "balance_loss_clip": 1.04478478, + "balance_loss_mlp": 1.0194819, + "epoch": 0.8161731549676837, + "flos": 30956542208640.0, + "grad_norm": 2.058967706107604, + "language_loss": 0.59672868, + "learning_rate": 3.4403061205514485e-07, + "loss": 0.61821538, + "num_input_tokens_seen": 292912360, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.12139893, + "step": 13575, + "time_per_iteration": 2.5320870876312256 + }, + { + "auxiliary_loss_clip": 0.01120031, + "auxiliary_loss_mlp": 0.01031976, + "balance_loss_clip": 1.04549706, + "balance_loss_mlp": 1.01991177, + "epoch": 0.8162332782203517, + "flos": 18551991680640.0, + "grad_norm": 1.8429399956289887, + "language_loss": 0.74190438, + "learning_rate": 3.4381225254362736e-07, + "loss": 0.76342446, + "num_input_tokens_seen": 292928325, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.12072754, + "step": 13576, + "time_per_iteration": 2.4001996517181396 + }, + { + "auxiliary_loss_clip": 0.01038936, + "auxiliary_loss_mlp": 0.01000371, + "balance_loss_clip": 1.01370609, + "balance_loss_mlp": 0.99892974, + "epoch": 0.8162934014730197, + "flos": 70386853904640.0, + "grad_norm": 0.8339017118552627, + "language_loss": 0.58683085, + "learning_rate": 3.435939558349155e-07, + "loss": 0.60722399, + "num_input_tokens_seen": 292992795, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.0144043, + "step": 13577, + "time_per_iteration": 3.0786936283111572 + }, + { + "auxiliary_loss_clip": 0.01116296, + "auxiliary_loss_mlp": 0.01028334, + "balance_loss_clip": 1.04655409, + "balance_loss_mlp": 1.01704454, + "epoch": 0.8163535247256877, + "flos": 21214839559680.0, + "grad_norm": 1.5576823120689156, + "language_loss": 0.71237481, + "learning_rate": 3.4337572193728747e-07, + "loss": 0.73382115, + "num_input_tokens_seen": 293011950, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.11291504, + "step": 13578, + "time_per_iteration": 2.442172050476074 + }, + { + "auxiliary_loss_clip": 0.01115939, + "auxiliary_loss_mlp": 0.01030229, + "balance_loss_clip": 1.04448581, + "balance_loss_mlp": 1.01859403, + "epoch": 0.8164136479783556, + "flos": 21098847565440.0, + "grad_norm": 1.871592227672363, + "language_loss": 0.73679686, + "learning_rate": 3.431575508590172e-07, + "loss": 0.75825852, + "num_input_tokens_seen": 293030175, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11627197, + "step": 13579, + "time_per_iteration": 3.8731260299682617 + }, + { + "auxiliary_loss_clip": 0.01113246, + "auxiliary_loss_mlp": 0.01029009, + "balance_loss_clip": 1.04014039, + "balance_loss_mlp": 1.0161283, + "epoch": 0.8164737712310236, + "flos": 21720640924800.0, + "grad_norm": 2.0983759968867908, + "language_loss": 0.79477632, + "learning_rate": 3.4293944260837873e-07, + "loss": 0.81619883, + "num_input_tokens_seen": 293047980, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.12878418, + "step": 13580, + "time_per_iteration": 2.4709115028381348 + }, + { + "auxiliary_loss_clip": 0.01111625, + "auxiliary_loss_mlp": 0.01028909, + "balance_loss_clip": 1.04155421, + "balance_loss_mlp": 1.01779258, + "epoch": 0.8165338944836915, + "flos": 19536805843200.0, + "grad_norm": 2.4316075964914594, + "language_loss": 0.68922216, + "learning_rate": 3.4272139719364314e-07, + "loss": 0.71062756, + "num_input_tokens_seen": 293067030, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.11120605, + "step": 13581, + "time_per_iteration": 2.4187042713165283 + }, + { + "auxiliary_loss_clip": 0.0111622, + "auxiliary_loss_mlp": 0.01027129, + "balance_loss_clip": 1.04430664, + "balance_loss_mlp": 1.01594138, + "epoch": 0.8165940177363595, + "flos": 22928568416640.0, + "grad_norm": 1.7192984646396425, + "language_loss": 0.59769684, + "learning_rate": 3.4250341462307786e-07, + "loss": 0.61913031, + "num_input_tokens_seen": 293085575, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11193848, + "step": 13582, + "time_per_iteration": 2.4583773612976074 + }, + { + "auxiliary_loss_clip": 0.01106875, + "auxiliary_loss_mlp": 0.01031365, + "balance_loss_clip": 1.03936172, + "balance_loss_mlp": 1.02030277, + "epoch": 0.8166541409890276, + "flos": 23370377702400.0, + "grad_norm": 1.3940130893755538, + "language_loss": 0.82561302, + "learning_rate": 3.4228549490494897e-07, + "loss": 0.84699547, + "num_input_tokens_seen": 293108200, + "router_z_loss_clip": 0.67529297, + "router_z_loss_mlp": 0.11065674, + "step": 13583, + "time_per_iteration": 4.009417295455933 + }, + { + "auxiliary_loss_clip": 0.01111865, + "auxiliary_loss_mlp": 0.01029397, + "balance_loss_clip": 1.04003036, + "balance_loss_mlp": 1.01732659, + "epoch": 0.8167142642416955, + "flos": 18441997257600.0, + "grad_norm": 2.997192716527467, + "language_loss": 0.74800068, + "learning_rate": 3.4206763804752093e-07, + "loss": 0.76941329, + "num_input_tokens_seen": 293126020, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.12078857, + "step": 13584, + "time_per_iteration": 2.4573888778686523 + }, + { + "auxiliary_loss_clip": 0.01117405, + "auxiliary_loss_mlp": 0.0103035, + "balance_loss_clip": 1.04427934, + "balance_loss_mlp": 1.0189178, + "epoch": 0.8167743874943635, + "flos": 21214983214080.0, + "grad_norm": 1.6761643720059913, + "language_loss": 0.74535835, + "learning_rate": 3.4184984405905405e-07, + "loss": 0.76683593, + "num_input_tokens_seen": 293144620, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11431885, + "step": 13585, + "time_per_iteration": 2.4577789306640625 + }, + { + "auxiliary_loss_clip": 0.01114886, + "auxiliary_loss_mlp": 0.01035474, + "balance_loss_clip": 1.04030275, + "balance_loss_mlp": 1.02305865, + "epoch": 0.8168345107470314, + "flos": 18697681244160.0, + "grad_norm": 1.718983664908311, + "language_loss": 0.69600803, + "learning_rate": 3.416321129478068e-07, + "loss": 0.71751165, + "num_input_tokens_seen": 293162850, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.12402344, + "step": 13586, + "time_per_iteration": 2.452381134033203 + }, + { + "auxiliary_loss_clip": 0.01114942, + "auxiliary_loss_mlp": 0.01029449, + "balance_loss_clip": 1.04197073, + "balance_loss_mlp": 1.0186727, + "epoch": 0.8168946339996994, + "flos": 16253098358400.0, + "grad_norm": 1.5143532766144, + "language_loss": 0.60903829, + "learning_rate": 3.4141444472203594e-07, + "loss": 0.63048226, + "num_input_tokens_seen": 293181620, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.10760498, + "step": 13587, + "time_per_iteration": 2.4223389625549316 + }, + { + "auxiliary_loss_clip": 0.01124888, + "auxiliary_loss_mlp": 0.0103257, + "balance_loss_clip": 1.05033684, + "balance_loss_mlp": 1.02067864, + "epoch": 0.8169547572523673, + "flos": 26941585645440.0, + "grad_norm": 2.3565316243651218, + "language_loss": 0.70019901, + "learning_rate": 3.4119683938999624e-07, + "loss": 0.72177351, + "num_input_tokens_seen": 293200270, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11883545, + "step": 13588, + "time_per_iteration": 2.5270559787750244 + }, + { + "auxiliary_loss_clip": 0.01122396, + "auxiliary_loss_mlp": 0.01030918, + "balance_loss_clip": 1.04800713, + "balance_loss_mlp": 1.01849675, + "epoch": 0.8170148805050353, + "flos": 18952323736320.0, + "grad_norm": 1.5718246662790047, + "language_loss": 0.72801191, + "learning_rate": 3.4097929695993854e-07, + "loss": 0.74954498, + "num_input_tokens_seen": 293218960, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.12414551, + "step": 13589, + "time_per_iteration": 2.4539644718170166 + }, + { + "auxiliary_loss_clip": 0.01111579, + "auxiliary_loss_mlp": 0.01033462, + "balance_loss_clip": 1.0391382, + "balance_loss_mlp": 1.02043819, + "epoch": 0.8170750037577033, + "flos": 21834909066240.0, + "grad_norm": 1.7813658723259675, + "language_loss": 0.73769021, + "learning_rate": 3.4076181744011166e-07, + "loss": 0.75914061, + "num_input_tokens_seen": 293236450, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.13000488, + "step": 13590, + "time_per_iteration": 2.442288875579834 + }, + { + "auxiliary_loss_clip": 0.01126078, + "auxiliary_loss_mlp": 0.01033196, + "balance_loss_clip": 1.04936945, + "balance_loss_mlp": 1.02007675, + "epoch": 0.8171351270103713, + "flos": 33507169021440.0, + "grad_norm": 1.792677595059096, + "language_loss": 0.6517424, + "learning_rate": 3.4054440083876345e-07, + "loss": 0.67333508, + "num_input_tokens_seen": 293256480, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.13122559, + "step": 13591, + "time_per_iteration": 4.0154054164886475 + }, + { + "auxiliary_loss_clip": 0.0111694, + "auxiliary_loss_mlp": 0.01032467, + "balance_loss_clip": 1.04098773, + "balance_loss_mlp": 1.02072477, + "epoch": 0.8171952502630392, + "flos": 22708184520960.0, + "grad_norm": 2.363647170779584, + "language_loss": 0.68116862, + "learning_rate": 3.403270471641373e-07, + "loss": 0.70266271, + "num_input_tokens_seen": 293274960, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.11761475, + "step": 13592, + "time_per_iteration": 2.436096429824829 + }, + { + "auxiliary_loss_clip": 0.01119552, + "auxiliary_loss_mlp": 0.01027403, + "balance_loss_clip": 1.04506969, + "balance_loss_mlp": 1.01504111, + "epoch": 0.8172553735157072, + "flos": 26723715701760.0, + "grad_norm": 2.997393725237532, + "language_loss": 0.66233498, + "learning_rate": 3.401097564244759e-07, + "loss": 0.68380451, + "num_input_tokens_seen": 293295945, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.12371826, + "step": 13593, + "time_per_iteration": 2.4878695011138916 + }, + { + "auxiliary_loss_clip": 0.01107533, + "auxiliary_loss_mlp": 0.0103449, + "balance_loss_clip": 1.03648829, + "balance_loss_mlp": 1.02199686, + "epoch": 0.8173154967683751, + "flos": 15961072786560.0, + "grad_norm": 1.843608153482534, + "language_loss": 0.69352061, + "learning_rate": 3.398925286280188e-07, + "loss": 0.71494085, + "num_input_tokens_seen": 293313300, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.12487793, + "step": 13594, + "time_per_iteration": 2.4404001235961914 + }, + { + "auxiliary_loss_clip": 0.01117315, + "auxiliary_loss_mlp": 0.01031155, + "balance_loss_clip": 1.04239786, + "balance_loss_mlp": 1.01929367, + "epoch": 0.8173756200210431, + "flos": 25986720447360.0, + "grad_norm": 1.81181919768834, + "language_loss": 0.65993637, + "learning_rate": 3.3967536378300456e-07, + "loss": 0.68142104, + "num_input_tokens_seen": 293333085, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11853027, + "step": 13595, + "time_per_iteration": 2.4951515197753906 + }, + { + "auxiliary_loss_clip": 0.01119396, + "auxiliary_loss_mlp": 0.0103212, + "balance_loss_clip": 1.04356444, + "balance_loss_mlp": 1.01821423, + "epoch": 0.8174357432737112, + "flos": 25664422688640.0, + "grad_norm": 1.6430337534288826, + "language_loss": 0.78811342, + "learning_rate": 3.394582618976658e-07, + "loss": 0.80962861, + "num_input_tokens_seen": 293351895, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.13909912, + "step": 13596, + "time_per_iteration": 2.685596227645874 + }, + { + "auxiliary_loss_clip": 0.01120315, + "auxiliary_loss_mlp": 0.0102907, + "balance_loss_clip": 1.04828143, + "balance_loss_mlp": 1.01660025, + "epoch": 0.8174958665263791, + "flos": 21835088634240.0, + "grad_norm": 2.207933891766527, + "language_loss": 0.58770144, + "learning_rate": 3.392412229802362e-07, + "loss": 0.60919529, + "num_input_tokens_seen": 293371165, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.12469482, + "step": 13597, + "time_per_iteration": 2.4772839546203613 + }, + { + "auxiliary_loss_clip": 0.01112799, + "auxiliary_loss_mlp": 0.01030214, + "balance_loss_clip": 1.04237282, + "balance_loss_mlp": 1.01887155, + "epoch": 0.8175559897790471, + "flos": 22455517276800.0, + "grad_norm": 1.5393937760704817, + "language_loss": 0.82421744, + "learning_rate": 3.390242470389462e-07, + "loss": 0.84564757, + "num_input_tokens_seen": 293391150, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.11340332, + "step": 13598, + "time_per_iteration": 2.436511754989624 + }, + { + "auxiliary_loss_clip": 0.01121155, + "auxiliary_loss_mlp": 0.01028916, + "balance_loss_clip": 1.04575717, + "balance_loss_mlp": 1.01727557, + "epoch": 0.817616113031715, + "flos": 23615790399360.0, + "grad_norm": 2.5062341056926716, + "language_loss": 0.82245755, + "learning_rate": 3.3880733408202277e-07, + "loss": 0.8439582, + "num_input_tokens_seen": 293409440, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11651611, + "step": 13599, + "time_per_iteration": 2.493255376815796 + }, + { + "auxiliary_loss_clip": 0.01114283, + "auxiliary_loss_mlp": 0.01031259, + "balance_loss_clip": 1.04199505, + "balance_loss_mlp": 1.01967764, + "epoch": 0.817676236284383, + "flos": 27672260106240.0, + "grad_norm": 1.826482684221956, + "language_loss": 0.8374843, + "learning_rate": 3.3859048411769186e-07, + "loss": 0.85893977, + "num_input_tokens_seen": 293428995, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11572266, + "step": 13600, + "time_per_iteration": 2.602456569671631 + }, + { + "auxiliary_loss_clip": 0.0112068, + "auxiliary_loss_mlp": 0.01034342, + "balance_loss_clip": 1.04674506, + "balance_loss_mlp": 1.02207565, + "epoch": 0.8177363595370509, + "flos": 24681009156480.0, + "grad_norm": 2.038065781371288, + "language_loss": 0.74018043, + "learning_rate": 3.383736971541766e-07, + "loss": 0.76173067, + "num_input_tokens_seen": 293449155, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.12280273, + "step": 13601, + "time_per_iteration": 2.461470365524292 + }, + { + "auxiliary_loss_clip": 0.01123104, + "auxiliary_loss_mlp": 0.0103173, + "balance_loss_clip": 1.04678607, + "balance_loss_mlp": 1.01986849, + "epoch": 0.817796482789719, + "flos": 17346326745600.0, + "grad_norm": 2.292830508546275, + "language_loss": 0.68112493, + "learning_rate": 3.3815697319969737e-07, + "loss": 0.70267332, + "num_input_tokens_seen": 293466125, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.11865234, + "step": 13602, + "time_per_iteration": 2.393915891647339 + }, + { + "auxiliary_loss_clip": 0.01114351, + "auxiliary_loss_mlp": 0.01026415, + "balance_loss_clip": 1.04316568, + "balance_loss_mlp": 1.0152452, + "epoch": 0.8178566060423869, + "flos": 17778475272960.0, + "grad_norm": 1.9982985448654176, + "language_loss": 0.83232486, + "learning_rate": 3.379403122624718e-07, + "loss": 0.85373259, + "num_input_tokens_seen": 293481345, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.11175537, + "step": 13603, + "time_per_iteration": 2.399346113204956 + }, + { + "auxiliary_loss_clip": 0.01107177, + "auxiliary_loss_mlp": 0.01028693, + "balance_loss_clip": 1.03592372, + "balance_loss_mlp": 1.01742148, + "epoch": 0.8179167292950549, + "flos": 24973250209920.0, + "grad_norm": 1.8175410047977527, + "language_loss": 0.69639909, + "learning_rate": 3.377237143507159e-07, + "loss": 0.71775782, + "num_input_tokens_seen": 293502330, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.1126709, + "step": 13604, + "time_per_iteration": 2.465190887451172 + }, + { + "auxiliary_loss_clip": 0.0111667, + "auxiliary_loss_mlp": 0.01030247, + "balance_loss_clip": 1.04469514, + "balance_loss_mlp": 1.01871932, + "epoch": 0.8179768525477228, + "flos": 22856783086080.0, + "grad_norm": 4.911866891701376, + "language_loss": 0.74065912, + "learning_rate": 3.3750717947264406e-07, + "loss": 0.76212829, + "num_input_tokens_seen": 293521415, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11523438, + "step": 13605, + "time_per_iteration": 2.444803476333618 + }, + { + "auxiliary_loss_clip": 0.01121697, + "auxiliary_loss_mlp": 0.01041131, + "balance_loss_clip": 1.04988599, + "balance_loss_mlp": 1.02857232, + "epoch": 0.8180369758003908, + "flos": 18515147304960.0, + "grad_norm": 1.8986626905351627, + "language_loss": 0.74009615, + "learning_rate": 3.372907076364666e-07, + "loss": 0.76172441, + "num_input_tokens_seen": 293539245, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.12561035, + "step": 13606, + "time_per_iteration": 3.8434786796569824 + }, + { + "auxiliary_loss_clip": 0.01111627, + "auxiliary_loss_mlp": 0.01028692, + "balance_loss_clip": 1.04045939, + "balance_loss_mlp": 1.01728964, + "epoch": 0.8180970990530587, + "flos": 33182105915520.0, + "grad_norm": 1.7196188402730843, + "language_loss": 0.6537528, + "learning_rate": 3.370742988503916e-07, + "loss": 0.675156, + "num_input_tokens_seen": 293560640, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.11401367, + "step": 13607, + "time_per_iteration": 2.540123224258423 + }, + { + "auxiliary_loss_clip": 0.01113351, + "auxiliary_loss_mlp": 0.01030858, + "balance_loss_clip": 1.04138303, + "balance_loss_mlp": 1.01885366, + "epoch": 0.8181572223057267, + "flos": 25010022758400.0, + "grad_norm": 1.6881549750478235, + "language_loss": 0.70473254, + "learning_rate": 3.3685795312262634e-07, + "loss": 0.72617459, + "num_input_tokens_seen": 293579465, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.12011719, + "step": 13608, + "time_per_iteration": 2.4592647552490234 + }, + { + "auxiliary_loss_clip": 0.01110727, + "auxiliary_loss_mlp": 0.01038618, + "balance_loss_clip": 1.03920615, + "balance_loss_mlp": 1.02546966, + "epoch": 0.8182173455583948, + "flos": 28548731871360.0, + "grad_norm": 1.7928855570406594, + "language_loss": 0.79372168, + "learning_rate": 3.366416704613735e-07, + "loss": 0.81521523, + "num_input_tokens_seen": 293600540, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.1315918, + "step": 13609, + "time_per_iteration": 2.485907793045044 + }, + { + "auxiliary_loss_clip": 0.0105647, + "auxiliary_loss_mlp": 0.01004024, + "balance_loss_clip": 1.03235388, + "balance_loss_mlp": 1.00276613, + "epoch": 0.8182774688110627, + "flos": 72028043245440.0, + "grad_norm": 0.7446786222865888, + "language_loss": 0.55842543, + "learning_rate": 3.3642545087483544e-07, + "loss": 0.57903039, + "num_input_tokens_seen": 293665160, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.01257324, + "step": 13610, + "time_per_iteration": 3.2017698287963867 + }, + { + "auxiliary_loss_clip": 0.01112747, + "auxiliary_loss_mlp": 0.01029753, + "balance_loss_clip": 1.04259181, + "balance_loss_mlp": 1.01810002, + "epoch": 0.8183375920637307, + "flos": 19755358145280.0, + "grad_norm": 2.128702332938811, + "language_loss": 0.77489918, + "learning_rate": 3.362092943712107e-07, + "loss": 0.79632419, + "num_input_tokens_seen": 293683995, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.11657715, + "step": 13611, + "time_per_iteration": 2.4606854915618896 + }, + { + "auxiliary_loss_clip": 0.01119773, + "auxiliary_loss_mlp": 0.01039688, + "balance_loss_clip": 1.04029298, + "balance_loss_mlp": 1.02441168, + "epoch": 0.8183977153163986, + "flos": 22341895580160.0, + "grad_norm": 1.794673920553239, + "language_loss": 0.77145338, + "learning_rate": 3.3599320095869745e-07, + "loss": 0.79304796, + "num_input_tokens_seen": 293704115, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.152771, + "step": 13612, + "time_per_iteration": 2.4989001750946045 + }, + { + "auxiliary_loss_clip": 0.01117846, + "auxiliary_loss_mlp": 0.01026574, + "balance_loss_clip": 1.04682279, + "balance_loss_mlp": 1.01519525, + "epoch": 0.8184578385690666, + "flos": 17712472032000.0, + "grad_norm": 2.166498058794308, + "language_loss": 0.86324275, + "learning_rate": 3.3577717064548793e-07, + "loss": 0.88468695, + "num_input_tokens_seen": 293722225, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.11376953, + "step": 13613, + "time_per_iteration": 2.403041124343872 + }, + { + "auxiliary_loss_clip": 0.01119362, + "auxiliary_loss_mlp": 0.01038827, + "balance_loss_clip": 1.04825914, + "balance_loss_mlp": 1.02774596, + "epoch": 0.8185179618217345, + "flos": 25701159323520.0, + "grad_norm": 1.5131485221386651, + "language_loss": 0.72749645, + "learning_rate": 3.355612034397746e-07, + "loss": 0.74907833, + "num_input_tokens_seen": 293743995, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.11083984, + "step": 13614, + "time_per_iteration": 2.537689447402954 + }, + { + "auxiliary_loss_clip": 0.01118075, + "auxiliary_loss_mlp": 0.01036546, + "balance_loss_clip": 1.04208875, + "balance_loss_mlp": 1.02454782, + "epoch": 0.8185780850744026, + "flos": 25960326929280.0, + "grad_norm": 1.628071994540878, + "language_loss": 0.80954707, + "learning_rate": 3.353452993497479e-07, + "loss": 0.83109331, + "num_input_tokens_seen": 293764935, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12005615, + "step": 13615, + "time_per_iteration": 2.48102068901062 + }, + { + "auxiliary_loss_clip": 0.01118094, + "auxiliary_loss_mlp": 0.01030365, + "balance_loss_clip": 1.04433739, + "balance_loss_mlp": 1.01855135, + "epoch": 0.8186382083270705, + "flos": 25228431406080.0, + "grad_norm": 1.8177434016449652, + "language_loss": 0.75976658, + "learning_rate": 3.3512945838359375e-07, + "loss": 0.78125119, + "num_input_tokens_seen": 293784035, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11810303, + "step": 13616, + "time_per_iteration": 2.491367816925049 + }, + { + "auxiliary_loss_clip": 0.01108802, + "auxiliary_loss_mlp": 0.01029412, + "balance_loss_clip": 1.03927493, + "balance_loss_mlp": 1.01724648, + "epoch": 0.8186983315797385, + "flos": 22415009713920.0, + "grad_norm": 1.9081600250902906, + "language_loss": 0.75907874, + "learning_rate": 3.349136805494979e-07, + "loss": 0.78046089, + "num_input_tokens_seen": 293803360, + "router_z_loss_clip": 0.69580078, + "router_z_loss_mlp": 0.12164307, + "step": 13617, + "time_per_iteration": 2.5300800800323486 + }, + { + "auxiliary_loss_clip": 0.01117963, + "auxiliary_loss_mlp": 0.01030504, + "balance_loss_clip": 1.04785299, + "balance_loss_mlp": 1.01947105, + "epoch": 0.8187584548324064, + "flos": 22018017623040.0, + "grad_norm": 2.0016552348173153, + "language_loss": 0.6859411, + "learning_rate": 3.346979658556415e-07, + "loss": 0.70742571, + "num_input_tokens_seen": 293821325, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.11029053, + "step": 13618, + "time_per_iteration": 2.477137565612793 + }, + { + "auxiliary_loss_clip": 0.01119439, + "auxiliary_loss_mlp": 0.01031626, + "balance_loss_clip": 1.0441016, + "balance_loss_mlp": 1.01906705, + "epoch": 0.8188185780850744, + "flos": 29241664116480.0, + "grad_norm": 4.256083323330025, + "language_loss": 0.69882953, + "learning_rate": 3.344823143102058e-07, + "loss": 0.72034019, + "num_input_tokens_seen": 293840315, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.12561035, + "step": 13619, + "time_per_iteration": 2.481954336166382 + }, + { + "auxiliary_loss_clip": 0.01120434, + "auxiliary_loss_mlp": 0.01029153, + "balance_loss_clip": 1.0464828, + "balance_loss_mlp": 1.01734567, + "epoch": 0.8188787013377423, + "flos": 20696504348160.0, + "grad_norm": 1.7978067931343154, + "language_loss": 0.74001026, + "learning_rate": 3.3426672592136694e-07, + "loss": 0.76150614, + "num_input_tokens_seen": 293855685, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11810303, + "step": 13620, + "time_per_iteration": 2.4519567489624023 + }, + { + "auxiliary_loss_clip": 0.01115015, + "auxiliary_loss_mlp": 0.01026542, + "balance_loss_clip": 1.04445744, + "balance_loss_mlp": 1.01546097, + "epoch": 0.8189388245904103, + "flos": 23732967542400.0, + "grad_norm": 1.6558607025714862, + "language_loss": 0.76246166, + "learning_rate": 3.340512006973011e-07, + "loss": 0.78387725, + "num_input_tokens_seen": 293875540, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.11083984, + "step": 13621, + "time_per_iteration": 2.438551902770996 + }, + { + "auxiliary_loss_clip": 0.01111731, + "auxiliary_loss_mlp": 0.0102539, + "balance_loss_clip": 1.04029274, + "balance_loss_mlp": 1.01333141, + "epoch": 0.8189989478430784, + "flos": 28255090187520.0, + "grad_norm": 2.354431590331769, + "language_loss": 0.65763152, + "learning_rate": 3.3383573864618076e-07, + "loss": 0.67900276, + "num_input_tokens_seen": 293896570, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.12054443, + "step": 13622, + "time_per_iteration": 2.506664276123047 + }, + { + "auxiliary_loss_clip": 0.01118913, + "auxiliary_loss_mlp": 0.01025742, + "balance_loss_clip": 1.04566503, + "balance_loss_mlp": 1.01360095, + "epoch": 0.8190590710957463, + "flos": 21397696721280.0, + "grad_norm": 1.799320837062756, + "language_loss": 0.74957776, + "learning_rate": 3.3362033977617653e-07, + "loss": 0.77102435, + "num_input_tokens_seen": 293914680, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.121521, + "step": 13623, + "time_per_iteration": 3.8482022285461426 + }, + { + "auxiliary_loss_clip": 0.0111746, + "auxiliary_loss_mlp": 0.01034432, + "balance_loss_clip": 1.04447675, + "balance_loss_mlp": 1.02192664, + "epoch": 0.8191191943484143, + "flos": 38796451367040.0, + "grad_norm": 4.71149821082018, + "language_loss": 0.63654292, + "learning_rate": 3.3340500409545527e-07, + "loss": 0.6580618, + "num_input_tokens_seen": 293936480, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.12518311, + "step": 13624, + "time_per_iteration": 2.6051487922668457 + }, + { + "auxiliary_loss_clip": 0.01114667, + "auxiliary_loss_mlp": 0.01028197, + "balance_loss_clip": 1.04380989, + "balance_loss_mlp": 1.01707458, + "epoch": 0.8191793176010822, + "flos": 25446516831360.0, + "grad_norm": 1.4962228812965312, + "language_loss": 0.78350163, + "learning_rate": 3.3318973161218386e-07, + "loss": 0.80493021, + "num_input_tokens_seen": 293957815, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.11120605, + "step": 13625, + "time_per_iteration": 2.5351667404174805 + }, + { + "auxiliary_loss_clip": 0.01116495, + "auxiliary_loss_mlp": 0.01029678, + "balance_loss_clip": 1.03956091, + "balance_loss_mlp": 1.01732206, + "epoch": 0.8192394408537502, + "flos": 25083029151360.0, + "grad_norm": 2.3949152984673128, + "language_loss": 0.76057565, + "learning_rate": 3.329745223345244e-07, + "loss": 0.78203738, + "num_input_tokens_seen": 293975440, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.12353516, + "step": 13626, + "time_per_iteration": 2.506970167160034 + }, + { + "auxiliary_loss_clip": 0.01119266, + "auxiliary_loss_mlp": 0.01032324, + "balance_loss_clip": 1.04841769, + "balance_loss_mlp": 1.02134442, + "epoch": 0.8192995641064181, + "flos": 27673732563840.0, + "grad_norm": 1.7276277488719058, + "language_loss": 0.73658836, + "learning_rate": 3.3275937627063823e-07, + "loss": 0.75810432, + "num_input_tokens_seen": 293997540, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.10974121, + "step": 13627, + "time_per_iteration": 3.927985429763794 + }, + { + "auxiliary_loss_clip": 0.01115488, + "auxiliary_loss_mlp": 0.01031459, + "balance_loss_clip": 1.04394925, + "balance_loss_mlp": 1.01960945, + "epoch": 0.8193596873590862, + "flos": 21288492397440.0, + "grad_norm": 1.9368560794418757, + "language_loss": 0.69179547, + "learning_rate": 3.3254429342868353e-07, + "loss": 0.71326494, + "num_input_tokens_seen": 294017030, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11865234, + "step": 13628, + "time_per_iteration": 2.4189083576202393 + }, + { + "auxiliary_loss_clip": 0.01121228, + "auxiliary_loss_mlp": 0.01030874, + "balance_loss_clip": 1.04449952, + "balance_loss_mlp": 1.01878643, + "epoch": 0.8194198106117541, + "flos": 17492626840320.0, + "grad_norm": 1.6276245629639894, + "language_loss": 0.85480058, + "learning_rate": 3.323292738168171e-07, + "loss": 0.87632167, + "num_input_tokens_seen": 294035700, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.12091064, + "step": 13629, + "time_per_iteration": 2.4058563709259033 + }, + { + "auxiliary_loss_clip": 0.01115128, + "auxiliary_loss_mlp": 0.010263, + "balance_loss_clip": 1.04400206, + "balance_loss_mlp": 1.01477885, + "epoch": 0.8194799338644221, + "flos": 15267925059840.0, + "grad_norm": 2.236916208931446, + "language_loss": 0.74316078, + "learning_rate": 3.3211431744319084e-07, + "loss": 0.76457506, + "num_input_tokens_seen": 294049730, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.11517334, + "step": 13630, + "time_per_iteration": 2.3991963863372803 + }, + { + "auxiliary_loss_clip": 0.01110274, + "auxiliary_loss_mlp": 0.01028127, + "balance_loss_clip": 1.03801012, + "balance_loss_mlp": 1.01654029, + "epoch": 0.81954005711709, + "flos": 14718814871040.0, + "grad_norm": 1.809145224401001, + "language_loss": 0.72057331, + "learning_rate": 3.31899424315957e-07, + "loss": 0.74195737, + "num_input_tokens_seen": 294066545, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.1159668, + "step": 13631, + "time_per_iteration": 2.4586129188537598 + }, + { + "auxiliary_loss_clip": 0.01115787, + "auxiliary_loss_mlp": 0.01027901, + "balance_loss_clip": 1.04180789, + "balance_loss_mlp": 1.0167315, + "epoch": 0.819600180369758, + "flos": 23074042498560.0, + "grad_norm": 2.0121999882847446, + "language_loss": 0.77207255, + "learning_rate": 3.3168459444326447e-07, + "loss": 0.79350948, + "num_input_tokens_seen": 294087455, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.1116333, + "step": 13632, + "time_per_iteration": 2.4612245559692383 + }, + { + "auxiliary_loss_clip": 0.01107106, + "auxiliary_loss_mlp": 0.01030178, + "balance_loss_clip": 1.03684354, + "balance_loss_mlp": 1.01856065, + "epoch": 0.8196603036224259, + "flos": 27599792417280.0, + "grad_norm": 1.78495039868647, + "language_loss": 0.66120327, + "learning_rate": 3.314698278332588e-07, + "loss": 0.68257612, + "num_input_tokens_seen": 294107480, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.1161499, + "step": 13633, + "time_per_iteration": 2.5102148056030273 + }, + { + "auxiliary_loss_clip": 0.01105055, + "auxiliary_loss_mlp": 0.01033452, + "balance_loss_clip": 1.03663325, + "balance_loss_mlp": 1.0226934, + "epoch": 0.8197204268750939, + "flos": 28582020800640.0, + "grad_norm": 2.2367987217602128, + "language_loss": 0.75756067, + "learning_rate": 3.3125512449408513e-07, + "loss": 0.77894568, + "num_input_tokens_seen": 294130115, + "router_z_loss_clip": 0.68408203, + "router_z_loss_mlp": 0.10754395, + "step": 13634, + "time_per_iteration": 2.4881045818328857 + }, + { + "auxiliary_loss_clip": 0.01104541, + "auxiliary_loss_mlp": 0.01028238, + "balance_loss_clip": 1.03648961, + "balance_loss_mlp": 1.01744318, + "epoch": 0.819780550127762, + "flos": 23258300290560.0, + "grad_norm": 2.208698623081712, + "language_loss": 0.81767154, + "learning_rate": 3.310404844338841e-07, + "loss": 0.83899927, + "num_input_tokens_seen": 294148495, + "router_z_loss_clip": 0.68017578, + "router_z_loss_mlp": 0.10791016, + "step": 13635, + "time_per_iteration": 3.8689730167388916 + }, + { + "auxiliary_loss_clip": 0.01116999, + "auxiliary_loss_mlp": 0.01034645, + "balance_loss_clip": 1.04435837, + "balance_loss_mlp": 1.0212698, + "epoch": 0.8198406733804299, + "flos": 26685255214080.0, + "grad_norm": 1.5874632317962918, + "language_loss": 0.75816262, + "learning_rate": 3.308259076607949e-07, + "loss": 0.77967906, + "num_input_tokens_seen": 294169595, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.13366699, + "step": 13636, + "time_per_iteration": 2.579277992248535 + }, + { + "auxiliary_loss_clip": 0.01109048, + "auxiliary_loss_mlp": 0.0103047, + "balance_loss_clip": 1.03894019, + "balance_loss_mlp": 1.01835275, + "epoch": 0.8199007966330979, + "flos": 20084084438400.0, + "grad_norm": 4.029998062677756, + "language_loss": 0.81308246, + "learning_rate": 3.3061139418295445e-07, + "loss": 0.83447766, + "num_input_tokens_seen": 294183885, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.12121582, + "step": 13637, + "time_per_iteration": 2.4784348011016846 + }, + { + "auxiliary_loss_clip": 0.01115177, + "auxiliary_loss_mlp": 0.01031213, + "balance_loss_clip": 1.04442453, + "balance_loss_mlp": 1.01882756, + "epoch": 0.8199609198857658, + "flos": 31902788142720.0, + "grad_norm": 2.748028320100838, + "language_loss": 0.71292323, + "learning_rate": 3.3039694400849725e-07, + "loss": 0.7343871, + "num_input_tokens_seen": 294200150, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.1237793, + "step": 13638, + "time_per_iteration": 2.51005220413208 + }, + { + "auxiliary_loss_clip": 0.01114399, + "auxiliary_loss_mlp": 0.0103726, + "balance_loss_clip": 1.03917074, + "balance_loss_mlp": 1.02168489, + "epoch": 0.8200210431384338, + "flos": 26470150617600.0, + "grad_norm": 2.1548367812704163, + "language_loss": 0.7972914, + "learning_rate": 3.3018255714555564e-07, + "loss": 0.81880796, + "num_input_tokens_seen": 294220385, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.15570068, + "step": 13639, + "time_per_iteration": 2.4608378410339355 + }, + { + "auxiliary_loss_clip": 0.01116683, + "auxiliary_loss_mlp": 0.01033084, + "balance_loss_clip": 1.04569912, + "balance_loss_mlp": 1.02057886, + "epoch": 0.8200811663911017, + "flos": 22091454979200.0, + "grad_norm": 1.7673315928043798, + "language_loss": 0.79270858, + "learning_rate": 3.299682336022589e-07, + "loss": 0.81420624, + "num_input_tokens_seen": 294239355, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.12518311, + "step": 13640, + "time_per_iteration": 2.4264864921569824 + }, + { + "auxiliary_loss_clip": 0.01133391, + "auxiliary_loss_mlp": 0.0102762, + "balance_loss_clip": 1.05559576, + "balance_loss_mlp": 1.01574087, + "epoch": 0.8201412896437698, + "flos": 37593659520000.0, + "grad_norm": 1.840537890568367, + "language_loss": 0.63576394, + "learning_rate": 3.297539733867336e-07, + "loss": 0.65737402, + "num_input_tokens_seen": 294259395, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.11871338, + "step": 13641, + "time_per_iteration": 2.5866451263427734 + }, + { + "auxiliary_loss_clip": 0.01114928, + "auxiliary_loss_mlp": 0.01028974, + "balance_loss_clip": 1.04267037, + "balance_loss_mlp": 1.01682675, + "epoch": 0.8202014128964377, + "flos": 19646333389440.0, + "grad_norm": 1.870958110293398, + "language_loss": 0.73742688, + "learning_rate": 3.295397765071055e-07, + "loss": 0.75886589, + "num_input_tokens_seen": 294277365, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.12139893, + "step": 13642, + "time_per_iteration": 2.399238348007202 + }, + { + "auxiliary_loss_clip": 0.01116952, + "auxiliary_loss_mlp": 0.01030033, + "balance_loss_clip": 1.04648972, + "balance_loss_mlp": 1.01839161, + "epoch": 0.8202615361491057, + "flos": 31467335564160.0, + "grad_norm": 1.6786066668559216, + "language_loss": 0.70238912, + "learning_rate": 3.2932564297149615e-07, + "loss": 0.72385901, + "num_input_tokens_seen": 294297555, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.11639404, + "step": 13643, + "time_per_iteration": 2.495455503463745 + }, + { + "auxiliary_loss_clip": 0.01114077, + "auxiliary_loss_mlp": 0.01034962, + "balance_loss_clip": 1.04182029, + "balance_loss_mlp": 1.02280879, + "epoch": 0.8203216594017736, + "flos": 24715555061760.0, + "grad_norm": 1.8941940608909749, + "language_loss": 0.65651721, + "learning_rate": 3.291115727880256e-07, + "loss": 0.6780076, + "num_input_tokens_seen": 294317600, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.12139893, + "step": 13644, + "time_per_iteration": 2.4243879318237305 + }, + { + "auxiliary_loss_clip": 0.01116385, + "auxiliary_loss_mlp": 0.01030124, + "balance_loss_clip": 1.04301667, + "balance_loss_mlp": 1.01822722, + "epoch": 0.8203817826544416, + "flos": 26031824951040.0, + "grad_norm": 1.4892822580785399, + "language_loss": 0.70588219, + "learning_rate": 3.2889756596481234e-07, + "loss": 0.72734731, + "num_input_tokens_seen": 294340215, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11889648, + "step": 13645, + "time_per_iteration": 2.4980874061584473 + }, + { + "auxiliary_loss_clip": 0.01119055, + "auxiliary_loss_mlp": 0.01027265, + "balance_loss_clip": 1.04703689, + "balance_loss_mlp": 1.01588058, + "epoch": 0.8204419059071095, + "flos": 25954544839680.0, + "grad_norm": 1.7774499517505902, + "language_loss": 0.71495134, + "learning_rate": 3.286836225099707e-07, + "loss": 0.73641455, + "num_input_tokens_seen": 294358590, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11383057, + "step": 13646, + "time_per_iteration": 2.4531655311584473 + }, + { + "auxiliary_loss_clip": 0.01124567, + "auxiliary_loss_mlp": 0.01028728, + "balance_loss_clip": 1.05003166, + "balance_loss_mlp": 1.01712871, + "epoch": 0.8205020291597775, + "flos": 23580059345280.0, + "grad_norm": 2.7499755020232692, + "language_loss": 0.78952932, + "learning_rate": 3.284697424316132e-07, + "loss": 0.81106228, + "num_input_tokens_seen": 294375825, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11590576, + "step": 13647, + "time_per_iteration": 2.5125553607940674 + }, + { + "auxiliary_loss_clip": 0.01114659, + "auxiliary_loss_mlp": 0.01032681, + "balance_loss_clip": 1.04410958, + "balance_loss_mlp": 1.02146316, + "epoch": 0.8205621524124456, + "flos": 26799164219520.0, + "grad_norm": 1.5989668736048115, + "language_loss": 0.67830908, + "learning_rate": 3.2825592573785034e-07, + "loss": 0.69978249, + "num_input_tokens_seen": 294398500, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.11206055, + "step": 13648, + "time_per_iteration": 2.4869630336761475 + }, + { + "auxiliary_loss_clip": 0.01111817, + "auxiliary_loss_mlp": 0.01029733, + "balance_loss_clip": 1.03830051, + "balance_loss_mlp": 1.0177108, + "epoch": 0.8206222756651135, + "flos": 27527863432320.0, + "grad_norm": 1.8962017817434684, + "language_loss": 0.80138958, + "learning_rate": 3.28042172436791e-07, + "loss": 0.82280511, + "num_input_tokens_seen": 294418840, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.12023926, + "step": 13649, + "time_per_iteration": 2.511773109436035 + }, + { + "auxiliary_loss_clip": 0.0111795, + "auxiliary_loss_mlp": 0.01038274, + "balance_loss_clip": 1.04414642, + "balance_loss_mlp": 1.02434468, + "epoch": 0.8206823989177815, + "flos": 21178605715200.0, + "grad_norm": 1.6013757691502817, + "language_loss": 0.6904496, + "learning_rate": 3.278284825365396e-07, + "loss": 0.71201181, + "num_input_tokens_seen": 294438215, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.1394043, + "step": 13650, + "time_per_iteration": 3.9787845611572266 + }, + { + "auxiliary_loss_clip": 0.0112741, + "auxiliary_loss_mlp": 0.01032495, + "balance_loss_clip": 1.05248463, + "balance_loss_mlp": 1.01963782, + "epoch": 0.8207425221704494, + "flos": 11509622150400.0, + "grad_norm": 2.023577940188227, + "language_loss": 0.61126912, + "learning_rate": 3.276148560452001e-07, + "loss": 0.63286817, + "num_input_tokens_seen": 294455260, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.128479, + "step": 13651, + "time_per_iteration": 2.4104511737823486 + }, + { + "auxiliary_loss_clip": 0.01122761, + "auxiliary_loss_mlp": 0.01032253, + "balance_loss_clip": 1.04823864, + "balance_loss_mlp": 1.02015948, + "epoch": 0.8208026454231174, + "flos": 19791987039360.0, + "grad_norm": 1.943614117107671, + "language_loss": 0.72143161, + "learning_rate": 3.2740129297087293e-07, + "loss": 0.74298179, + "num_input_tokens_seen": 294473205, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.12091064, + "step": 13652, + "time_per_iteration": 2.441006898880005 + }, + { + "auxiliary_loss_clip": 0.01109898, + "auxiliary_loss_mlp": 0.01042347, + "balance_loss_clip": 1.04002225, + "balance_loss_mlp": 1.0293895, + "epoch": 0.8208627686757853, + "flos": 15667538843520.0, + "grad_norm": 3.039333603413735, + "language_loss": 0.73007309, + "learning_rate": 3.271877933216558e-07, + "loss": 0.75159562, + "num_input_tokens_seen": 294490645, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.12963867, + "step": 13653, + "time_per_iteration": 2.386240243911743 + }, + { + "auxiliary_loss_clip": 0.0112724, + "auxiliary_loss_mlp": 0.01029415, + "balance_loss_clip": 1.04963589, + "balance_loss_mlp": 1.01625431, + "epoch": 0.8209228919284534, + "flos": 37482659516160.0, + "grad_norm": 13.037725333838633, + "language_loss": 0.63050568, + "learning_rate": 3.269743571056451e-07, + "loss": 0.65207225, + "num_input_tokens_seen": 294513500, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.13171387, + "step": 13654, + "time_per_iteration": 2.578533172607422 + }, + { + "auxiliary_loss_clip": 0.01114479, + "auxiliary_loss_mlp": 0.01025976, + "balance_loss_clip": 1.04171765, + "balance_loss_mlp": 1.01530671, + "epoch": 0.8209830151811213, + "flos": 23112969863040.0, + "grad_norm": 1.4808406121251267, + "language_loss": 0.70191252, + "learning_rate": 3.2676098433093447e-07, + "loss": 0.72331703, + "num_input_tokens_seen": 294535710, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.10675049, + "step": 13655, + "time_per_iteration": 2.457524299621582 + }, + { + "auxiliary_loss_clip": 0.01112518, + "auxiliary_loss_mlp": 0.01036286, + "balance_loss_clip": 1.04274535, + "balance_loss_mlp": 1.02475286, + "epoch": 0.8210431384337893, + "flos": 21288169175040.0, + "grad_norm": 2.1225421823209802, + "language_loss": 0.81999338, + "learning_rate": 3.265476750056162e-07, + "loss": 0.84148145, + "num_input_tokens_seen": 294554055, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.11535645, + "step": 13656, + "time_per_iteration": 2.456099271774292 + }, + { + "auxiliary_loss_clip": 0.01115666, + "auxiliary_loss_mlp": 0.01031112, + "balance_loss_clip": 1.04640555, + "balance_loss_mlp": 1.01956105, + "epoch": 0.8211032616864572, + "flos": 11502403516800.0, + "grad_norm": 2.7151035251761177, + "language_loss": 0.74003041, + "learning_rate": 3.2633442913777654e-07, + "loss": 0.76149815, + "num_input_tokens_seen": 294570390, + "router_z_loss_clip": 0.69287109, + "router_z_loss_mlp": 0.11547852, + "step": 13657, + "time_per_iteration": 2.3965885639190674 + }, + { + "auxiliary_loss_clip": 0.01108192, + "auxiliary_loss_mlp": 0.01033521, + "balance_loss_clip": 1.03783762, + "balance_loss_mlp": 1.02216053, + "epoch": 0.8211633849391252, + "flos": 29821477455360.0, + "grad_norm": 2.3987575476890965, + "language_loss": 0.56036443, + "learning_rate": 3.2612124673550325e-07, + "loss": 0.58178151, + "num_input_tokens_seen": 294593050, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.11364746, + "step": 13658, + "time_per_iteration": 2.5103280544281006 + }, + { + "auxiliary_loss_clip": 0.01112736, + "auxiliary_loss_mlp": 0.01035018, + "balance_loss_clip": 1.03860998, + "balance_loss_mlp": 1.02166045, + "epoch": 0.8212235081917931, + "flos": 13115439573120.0, + "grad_norm": 2.4689945621483886, + "language_loss": 0.79231834, + "learning_rate": 3.259081278068805e-07, + "loss": 0.8137958, + "num_input_tokens_seen": 294608550, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.13348389, + "step": 13659, + "time_per_iteration": 2.3866453170776367 + }, + { + "auxiliary_loss_clip": 0.01109279, + "auxiliary_loss_mlp": 0.01025955, + "balance_loss_clip": 1.04117107, + "balance_loss_mlp": 1.01572108, + "epoch": 0.8212836314444611, + "flos": 40515351782400.0, + "grad_norm": 1.8595278019996802, + "language_loss": 0.59894979, + "learning_rate": 3.256950723599887e-07, + "loss": 0.6203022, + "num_input_tokens_seen": 294630380, + "router_z_loss_clip": 0.68066406, + "router_z_loss_mlp": 0.10229492, + "step": 13660, + "time_per_iteration": 2.605285406112671 + }, + { + "auxiliary_loss_clip": 0.01121764, + "auxiliary_loss_mlp": 0.01032232, + "balance_loss_clip": 1.04635334, + "balance_loss_mlp": 1.01951277, + "epoch": 0.8213437546971292, + "flos": 18770543982720.0, + "grad_norm": 1.9360824146282851, + "language_loss": 0.72441679, + "learning_rate": 3.254820804029075e-07, + "loss": 0.74595678, + "num_input_tokens_seen": 294648655, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12719727, + "step": 13661, + "time_per_iteration": 2.492098569869995 + }, + { + "auxiliary_loss_clip": 0.01116802, + "auxiliary_loss_mlp": 0.01030481, + "balance_loss_clip": 1.04295754, + "balance_loss_mlp": 1.01811326, + "epoch": 0.8214038779497971, + "flos": 19682279925120.0, + "grad_norm": 2.053740832363713, + "language_loss": 0.74626935, + "learning_rate": 3.252691519437143e-07, + "loss": 0.76774222, + "num_input_tokens_seen": 294666915, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.12371826, + "step": 13662, + "time_per_iteration": 2.445045232772827 + }, + { + "auxiliary_loss_clip": 0.01043699, + "auxiliary_loss_mlp": 0.01004955, + "balance_loss_clip": 1.01800096, + "balance_loss_mlp": 1.00368714, + "epoch": 0.8214640012024651, + "flos": 71602969697280.0, + "grad_norm": 0.9076977346308372, + "language_loss": 0.53979015, + "learning_rate": 3.250562869904825e-07, + "loss": 0.56027663, + "num_input_tokens_seen": 294731545, + "router_z_loss_clip": 0.25732422, + "router_z_loss_mlp": 0.01268005, + "step": 13663, + "time_per_iteration": 3.3414738178253174 + }, + { + "auxiliary_loss_clip": 0.01110636, + "auxiliary_loss_mlp": 0.01031118, + "balance_loss_clip": 1.0395416, + "balance_loss_mlp": 1.01950145, + "epoch": 0.821524124455133, + "flos": 14757203531520.0, + "grad_norm": 2.535919808784134, + "language_loss": 0.65927494, + "learning_rate": 3.248434855512838e-07, + "loss": 0.68069243, + "num_input_tokens_seen": 294748745, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.1161499, + "step": 13664, + "time_per_iteration": 2.4453179836273193 + }, + { + "auxiliary_loss_clip": 0.01119565, + "auxiliary_loss_mlp": 0.01030436, + "balance_loss_clip": 1.04804444, + "balance_loss_mlp": 1.01965332, + "epoch": 0.821584247707801, + "flos": 25082274965760.0, + "grad_norm": 1.5705161122277154, + "language_loss": 0.75314742, + "learning_rate": 3.246307476341881e-07, + "loss": 0.77464747, + "num_input_tokens_seen": 294768955, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.10784912, + "step": 13665, + "time_per_iteration": 2.472606658935547 + }, + { + "auxiliary_loss_clip": 0.01119319, + "auxiliary_loss_mlp": 0.01028588, + "balance_loss_clip": 1.04618168, + "balance_loss_mlp": 1.01737618, + "epoch": 0.8216443709604689, + "flos": 36830701710720.0, + "grad_norm": 2.071605124364987, + "language_loss": 0.64965147, + "learning_rate": 3.2441807324726256e-07, + "loss": 0.67113054, + "num_input_tokens_seen": 294789250, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11199951, + "step": 13666, + "time_per_iteration": 3.954801559448242 + }, + { + "auxiliary_loss_clip": 0.01120637, + "auxiliary_loss_mlp": 0.0102871, + "balance_loss_clip": 1.04778767, + "balance_loss_mlp": 1.01751006, + "epoch": 0.821704494213137, + "flos": 25081808088960.0, + "grad_norm": 1.9455763510461086, + "language_loss": 0.76939934, + "learning_rate": 3.2420546239857174e-07, + "loss": 0.79089284, + "num_input_tokens_seen": 294809760, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11199951, + "step": 13667, + "time_per_iteration": 2.488980770111084 + }, + { + "auxiliary_loss_clip": 0.01110406, + "auxiliary_loss_mlp": 0.01028996, + "balance_loss_clip": 1.03640246, + "balance_loss_mlp": 1.01678896, + "epoch": 0.8217646174658049, + "flos": 14356117290240.0, + "grad_norm": 1.8581395368983777, + "language_loss": 0.77152956, + "learning_rate": 3.239929150961773e-07, + "loss": 0.79292357, + "num_input_tokens_seen": 294826495, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.12207031, + "step": 13668, + "time_per_iteration": 2.5150084495544434 + }, + { + "auxiliary_loss_clip": 0.01109357, + "auxiliary_loss_mlp": 0.01031868, + "balance_loss_clip": 1.03797054, + "balance_loss_mlp": 1.01988149, + "epoch": 0.8218247407184729, + "flos": 22090557139200.0, + "grad_norm": 2.1461036362556376, + "language_loss": 0.73868221, + "learning_rate": 3.2378043134813984e-07, + "loss": 0.7600944, + "num_input_tokens_seen": 294845370, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.11999512, + "step": 13669, + "time_per_iteration": 2.419036865234375 + }, + { + "auxiliary_loss_clip": 0.01112749, + "auxiliary_loss_mlp": 0.01027109, + "balance_loss_clip": 1.04106903, + "balance_loss_mlp": 1.01517606, + "epoch": 0.8218848639711408, + "flos": 16764035368320.0, + "grad_norm": 1.867457381018812, + "language_loss": 0.78680938, + "learning_rate": 3.235680111625161e-07, + "loss": 0.80820793, + "num_input_tokens_seen": 294863740, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11938477, + "step": 13670, + "time_per_iteration": 3.8292126655578613 + }, + { + "auxiliary_loss_clip": 0.01117821, + "auxiliary_loss_mlp": 0.01034782, + "balance_loss_clip": 1.04332352, + "balance_loss_mlp": 1.02238464, + "epoch": 0.8219449872238088, + "flos": 25994801007360.0, + "grad_norm": 1.8532193417861709, + "language_loss": 0.754004, + "learning_rate": 3.2335565454736123e-07, + "loss": 0.77553004, + "num_input_tokens_seen": 294882815, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.12402344, + "step": 13671, + "time_per_iteration": 2.501183271408081 + }, + { + "auxiliary_loss_clip": 0.01122986, + "auxiliary_loss_mlp": 0.01030155, + "balance_loss_clip": 1.04646516, + "balance_loss_mlp": 1.01763785, + "epoch": 0.8220051104764767, + "flos": 20778094091520.0, + "grad_norm": 1.9376185294842156, + "language_loss": 0.76313239, + "learning_rate": 3.23143361510728e-07, + "loss": 0.78466386, + "num_input_tokens_seen": 294901985, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12506104, + "step": 13672, + "time_per_iteration": 2.436270236968994 + }, + { + "auxiliary_loss_clip": 0.01117568, + "auxiliary_loss_mlp": 0.01032023, + "balance_loss_clip": 1.04588103, + "balance_loss_mlp": 1.0189991, + "epoch": 0.8220652337291448, + "flos": 14574849160320.0, + "grad_norm": 2.0144475350276503, + "language_loss": 0.75101578, + "learning_rate": 3.2293113206066733e-07, + "loss": 0.77251166, + "num_input_tokens_seen": 294919705, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.13018799, + "step": 13673, + "time_per_iteration": 2.410273313522339 + }, + { + "auxiliary_loss_clip": 0.01123256, + "auxiliary_loss_mlp": 0.01031194, + "balance_loss_clip": 1.05025458, + "balance_loss_mlp": 1.01894498, + "epoch": 0.8221253569818128, + "flos": 23805866194560.0, + "grad_norm": 1.639649192167819, + "language_loss": 0.79525864, + "learning_rate": 3.227189662052254e-07, + "loss": 0.81680316, + "num_input_tokens_seen": 294939900, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.12243652, + "step": 13674, + "time_per_iteration": 2.4314727783203125 + }, + { + "auxiliary_loss_clip": 0.01111603, + "auxiliary_loss_mlp": 0.01031355, + "balance_loss_clip": 1.03989911, + "balance_loss_mlp": 1.01947606, + "epoch": 0.8221854802344807, + "flos": 21288241002240.0, + "grad_norm": 1.8666644186949708, + "language_loss": 0.7061379, + "learning_rate": 3.225068639524484e-07, + "loss": 0.72756749, + "num_input_tokens_seen": 294959110, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11871338, + "step": 13675, + "time_per_iteration": 2.437598466873169 + }, + { + "auxiliary_loss_clip": 0.01108365, + "auxiliary_loss_mlp": 0.01041981, + "balance_loss_clip": 1.03872466, + "balance_loss_mlp": 1.02817631, + "epoch": 0.8222456034871487, + "flos": 20956785275520.0, + "grad_norm": 1.9563392995530988, + "language_loss": 0.74388397, + "learning_rate": 3.2229482531037965e-07, + "loss": 0.76538742, + "num_input_tokens_seen": 294978660, + "router_z_loss_clip": 0.69580078, + "router_z_loss_mlp": 0.13806152, + "step": 13676, + "time_per_iteration": 2.523991107940674 + }, + { + "auxiliary_loss_clip": 0.0111542, + "auxiliary_loss_mlp": 0.01029446, + "balance_loss_clip": 1.0430665, + "balance_loss_mlp": 1.01828229, + "epoch": 0.8223057267398166, + "flos": 21397517153280.0, + "grad_norm": 1.778086969722773, + "language_loss": 0.80716723, + "learning_rate": 3.2208285028705893e-07, + "loss": 0.8286159, + "num_input_tokens_seen": 294998075, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11169434, + "step": 13677, + "time_per_iteration": 2.453460216522217 + }, + { + "auxiliary_loss_clip": 0.01120644, + "auxiliary_loss_mlp": 0.01033938, + "balance_loss_clip": 1.04629672, + "balance_loss_mlp": 1.02174854, + "epoch": 0.8223658499924846, + "flos": 15268212368640.0, + "grad_norm": 2.0171900601716337, + "language_loss": 0.70462722, + "learning_rate": 3.218709388905245e-07, + "loss": 0.72617304, + "num_input_tokens_seen": 295015950, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12200928, + "step": 13678, + "time_per_iteration": 3.8239893913269043 + }, + { + "auxiliary_loss_clip": 0.01117183, + "auxiliary_loss_mlp": 0.01041454, + "balance_loss_clip": 1.04185438, + "balance_loss_mlp": 1.02791166, + "epoch": 0.8224259732451525, + "flos": 31249537447680.0, + "grad_norm": 1.426623119358652, + "language_loss": 0.71477622, + "learning_rate": 3.216590911288133e-07, + "loss": 0.73636258, + "num_input_tokens_seen": 295036800, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.13555908, + "step": 13679, + "time_per_iteration": 2.5255579948425293 + }, + { + "auxiliary_loss_clip": 0.0111115, + "auxiliary_loss_mlp": 0.01024767, + "balance_loss_clip": 1.04066825, + "balance_loss_mlp": 1.01327562, + "epoch": 0.8224860964978206, + "flos": 21574628138880.0, + "grad_norm": 2.1398886483307185, + "language_loss": 0.69944954, + "learning_rate": 3.214473070099564e-07, + "loss": 0.72080874, + "num_input_tokens_seen": 295055300, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.11486816, + "step": 13680, + "time_per_iteration": 2.4127697944641113 + }, + { + "auxiliary_loss_clip": 0.01117473, + "auxiliary_loss_mlp": 0.0103066, + "balance_loss_clip": 1.04537284, + "balance_loss_mlp": 1.01951385, + "epoch": 0.8225462197504885, + "flos": 25483217552640.0, + "grad_norm": 1.8248131919676136, + "language_loss": 0.59573674, + "learning_rate": 3.21235586541986e-07, + "loss": 0.61721802, + "num_input_tokens_seen": 295076420, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11157227, + "step": 13681, + "time_per_iteration": 2.4752273559570312 + }, + { + "auxiliary_loss_clip": 0.01110385, + "auxiliary_loss_mlp": 0.01031571, + "balance_loss_clip": 1.0364759, + "balance_loss_mlp": 1.01953065, + "epoch": 0.8226063430031565, + "flos": 39385458587520.0, + "grad_norm": 1.9285241154221926, + "language_loss": 0.69760263, + "learning_rate": 3.2102392973293047e-07, + "loss": 0.71902221, + "num_input_tokens_seen": 295100540, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.12030029, + "step": 13682, + "time_per_iteration": 2.6199588775634766 + }, + { + "auxiliary_loss_clip": 0.01117239, + "auxiliary_loss_mlp": 0.0103184, + "balance_loss_clip": 1.04210913, + "balance_loss_mlp": 1.01901913, + "epoch": 0.8226664662558244, + "flos": 22815269942400.0, + "grad_norm": 1.8337013885292157, + "language_loss": 0.79168427, + "learning_rate": 3.20812336590816e-07, + "loss": 0.81317508, + "num_input_tokens_seen": 295120180, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12811279, + "step": 13683, + "time_per_iteration": 2.512583017349243 + }, + { + "auxiliary_loss_clip": 0.01106022, + "auxiliary_loss_mlp": 0.01027131, + "balance_loss_clip": 1.03750563, + "balance_loss_mlp": 1.01661038, + "epoch": 0.8227265895084924, + "flos": 25665607837440.0, + "grad_norm": 2.08496175667257, + "language_loss": 0.86682117, + "learning_rate": 3.206008071236661e-07, + "loss": 0.88815272, + "num_input_tokens_seen": 295138530, + "router_z_loss_clip": 0.68505859, + "router_z_loss_mlp": 0.10528564, + "step": 13684, + "time_per_iteration": 2.5137946605682373 + }, + { + "auxiliary_loss_clip": 0.01112118, + "auxiliary_loss_mlp": 0.01033105, + "balance_loss_clip": 1.0400306, + "balance_loss_mlp": 1.02129722, + "epoch": 0.8227867127611603, + "flos": 26179274280960.0, + "grad_norm": 1.460062593473341, + "language_loss": 0.79673314, + "learning_rate": 3.2038934133950157e-07, + "loss": 0.81818533, + "num_input_tokens_seen": 295160260, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11810303, + "step": 13685, + "time_per_iteration": 2.4857358932495117 + }, + { + "auxiliary_loss_clip": 0.01112461, + "auxiliary_loss_mlp": 0.0102927, + "balance_loss_clip": 1.04210567, + "balance_loss_mlp": 1.01778376, + "epoch": 0.8228468360138284, + "flos": 22018053536640.0, + "grad_norm": 1.9889880963368098, + "language_loss": 0.68484831, + "learning_rate": 3.2017793924634194e-07, + "loss": 0.70626563, + "num_input_tokens_seen": 295177055, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.11499023, + "step": 13686, + "time_per_iteration": 2.4881229400634766 + }, + { + "auxiliary_loss_clip": 0.01114637, + "auxiliary_loss_mlp": 0.01036573, + "balance_loss_clip": 1.04090858, + "balance_loss_mlp": 1.02264929, + "epoch": 0.8229069592664963, + "flos": 14903359971840.0, + "grad_norm": 3.3612744321806676, + "language_loss": 0.78191906, + "learning_rate": 3.1996660085220263e-07, + "loss": 0.80343115, + "num_input_tokens_seen": 295193870, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.13934326, + "step": 13687, + "time_per_iteration": 2.3721277713775635 + }, + { + "auxiliary_loss_clip": 0.01115548, + "auxiliary_loss_mlp": 0.01033438, + "balance_loss_clip": 1.04224825, + "balance_loss_mlp": 1.02078998, + "epoch": 0.8229670825191643, + "flos": 15669478177920.0, + "grad_norm": 1.7195099165334804, + "language_loss": 0.72663784, + "learning_rate": 3.1975532616509825e-07, + "loss": 0.7481277, + "num_input_tokens_seen": 295211040, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.12658691, + "step": 13688, + "time_per_iteration": 2.447347402572632 + }, + { + "auxiliary_loss_clip": 0.01121918, + "auxiliary_loss_mlp": 0.01029464, + "balance_loss_clip": 1.04866195, + "balance_loss_mlp": 1.01814532, + "epoch": 0.8230272057718323, + "flos": 23183498217600.0, + "grad_norm": 1.5302078222056645, + "language_loss": 0.72806942, + "learning_rate": 3.1954411519304025e-07, + "loss": 0.74958324, + "num_input_tokens_seen": 295231300, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11309814, + "step": 13689, + "time_per_iteration": 2.572523832321167 + }, + { + "auxiliary_loss_clip": 0.01107478, + "auxiliary_loss_mlp": 0.01029818, + "balance_loss_clip": 1.0353055, + "balance_loss_mlp": 1.01793897, + "epoch": 0.8230873290245002, + "flos": 21032413361280.0, + "grad_norm": 2.194521939916853, + "language_loss": 0.69202358, + "learning_rate": 3.1933296794403887e-07, + "loss": 0.71339655, + "num_input_tokens_seen": 295251045, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11877441, + "step": 13690, + "time_per_iteration": 2.527186393737793 + }, + { + "auxiliary_loss_clip": 0.01110567, + "auxiliary_loss_mlp": 0.01029163, + "balance_loss_clip": 1.03895187, + "balance_loss_mlp": 1.01737893, + "epoch": 0.8231474522771682, + "flos": 21250139650560.0, + "grad_norm": 1.7904302334698017, + "language_loss": 0.85287505, + "learning_rate": 3.191218844260988e-07, + "loss": 0.87427235, + "num_input_tokens_seen": 295270225, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11779785, + "step": 13691, + "time_per_iteration": 2.449112892150879 + }, + { + "auxiliary_loss_clip": 0.01112739, + "auxiliary_loss_mlp": 0.01030163, + "balance_loss_clip": 1.03995728, + "balance_loss_mlp": 1.01846266, + "epoch": 0.8232075755298361, + "flos": 23842028211840.0, + "grad_norm": 1.9108873345135988, + "language_loss": 0.76965213, + "learning_rate": 3.189108646472252e-07, + "loss": 0.79108113, + "num_input_tokens_seen": 295288950, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11694336, + "step": 13692, + "time_per_iteration": 2.46492338180542 + }, + { + "auxiliary_loss_clip": 0.01112639, + "auxiliary_loss_mlp": 0.01025724, + "balance_loss_clip": 1.04178572, + "balance_loss_mlp": 1.01431024, + "epoch": 0.8232676987825042, + "flos": 21653955325440.0, + "grad_norm": 1.6572691254803367, + "language_loss": 0.71623784, + "learning_rate": 3.186999086154205e-07, + "loss": 0.73762143, + "num_input_tokens_seen": 295309405, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.11413574, + "step": 13693, + "time_per_iteration": 2.4827873706817627 + }, + { + "auxiliary_loss_clip": 0.01102519, + "auxiliary_loss_mlp": 0.01030269, + "balance_loss_clip": 1.0342989, + "balance_loss_mlp": 1.02012968, + "epoch": 0.8233278220351721, + "flos": 26322701287680.0, + "grad_norm": 1.3342592342173007, + "language_loss": 0.83733845, + "learning_rate": 3.1848901633868355e-07, + "loss": 0.85866642, + "num_input_tokens_seen": 295331115, + "router_z_loss_clip": 0.68212891, + "router_z_loss_mlp": 0.10125732, + "step": 13694, + "time_per_iteration": 4.02690577507019 + }, + { + "auxiliary_loss_clip": 0.01112905, + "auxiliary_loss_mlp": 0.0103121, + "balance_loss_clip": 1.03818917, + "balance_loss_mlp": 1.01852608, + "epoch": 0.8233879452878401, + "flos": 21725812483200.0, + "grad_norm": 1.7073164093097433, + "language_loss": 0.77001733, + "learning_rate": 3.182781878250118e-07, + "loss": 0.79145849, + "num_input_tokens_seen": 295350495, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.12695312, + "step": 13695, + "time_per_iteration": 2.477961301803589 + }, + { + "auxiliary_loss_clip": 0.01115015, + "auxiliary_loss_mlp": 0.01032135, + "balance_loss_clip": 1.04316926, + "balance_loss_mlp": 1.02030897, + "epoch": 0.823448068540508, + "flos": 20557746109440.0, + "grad_norm": 2.179464188538528, + "language_loss": 0.81026673, + "learning_rate": 3.1806742308239985e-07, + "loss": 0.83173823, + "num_input_tokens_seen": 295368225, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11834717, + "step": 13696, + "time_per_iteration": 2.502987861633301 + }, + { + "auxiliary_loss_clip": 0.01072224, + "auxiliary_loss_mlp": 0.01012947, + "balance_loss_clip": 1.04713929, + "balance_loss_mlp": 1.01121545, + "epoch": 0.823508191793176, + "flos": 67273688194560.0, + "grad_norm": 0.7404663795566264, + "language_loss": 0.63896704, + "learning_rate": 3.178567221188393e-07, + "loss": 0.65981877, + "num_input_tokens_seen": 295430035, + "router_z_loss_clip": 0.25097656, + "router_z_loss_mlp": 0.01731873, + "step": 13697, + "time_per_iteration": 3.1371917724609375 + }, + { + "auxiliary_loss_clip": 0.01103995, + "auxiliary_loss_mlp": 0.01024542, + "balance_loss_clip": 1.035344, + "balance_loss_mlp": 1.01393819, + "epoch": 0.8235683150458439, + "flos": 17928402641280.0, + "grad_norm": 1.5591517746036814, + "language_loss": 0.73301101, + "learning_rate": 3.1764608494232037e-07, + "loss": 0.75429642, + "num_input_tokens_seen": 295447765, + "router_z_loss_clip": 0.68554688, + "router_z_loss_mlp": 0.10614014, + "step": 13698, + "time_per_iteration": 2.4392874240875244 + }, + { + "auxiliary_loss_clip": 0.01110056, + "auxiliary_loss_mlp": 0.01029282, + "balance_loss_clip": 1.03863144, + "balance_loss_mlp": 1.01748013, + "epoch": 0.823628438298512, + "flos": 18916089891840.0, + "grad_norm": 2.2141157932793365, + "language_loss": 0.72233963, + "learning_rate": 3.174355115608305e-07, + "loss": 0.74373293, + "num_input_tokens_seen": 295464810, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11810303, + "step": 13699, + "time_per_iteration": 2.4219963550567627 + }, + { + "auxiliary_loss_clip": 0.01104809, + "auxiliary_loss_mlp": 0.01027139, + "balance_loss_clip": 1.03515196, + "balance_loss_mlp": 1.01552212, + "epoch": 0.8236885615511799, + "flos": 18696460181760.0, + "grad_norm": 2.0088608059113358, + "language_loss": 0.8195762, + "learning_rate": 3.1722500198235526e-07, + "loss": 0.84089565, + "num_input_tokens_seen": 295482605, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.11608887, + "step": 13700, + "time_per_iteration": 2.4340994358062744 + }, + { + "auxiliary_loss_clip": 0.01113923, + "auxiliary_loss_mlp": 0.01035101, + "balance_loss_clip": 1.04108202, + "balance_loss_mlp": 1.02365077, + "epoch": 0.8237486848038479, + "flos": 23695009845120.0, + "grad_norm": 1.75594848519375, + "language_loss": 0.72913146, + "learning_rate": 3.170145562148763e-07, + "loss": 0.7506218, + "num_input_tokens_seen": 295503780, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11450195, + "step": 13701, + "time_per_iteration": 2.448559284210205 + }, + { + "auxiliary_loss_clip": 0.01113616, + "auxiliary_loss_mlp": 0.01034339, + "balance_loss_clip": 1.04087102, + "balance_loss_mlp": 1.02134562, + "epoch": 0.8238088080565159, + "flos": 23441301106560.0, + "grad_norm": 2.158290122839171, + "language_loss": 0.69315469, + "learning_rate": 3.1680417426637384e-07, + "loss": 0.7146343, + "num_input_tokens_seen": 295522035, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.13012695, + "step": 13702, + "time_per_iteration": 2.5799498558044434 + }, + { + "auxiliary_loss_clip": 0.01124498, + "auxiliary_loss_mlp": 0.01033156, + "balance_loss_clip": 1.05051422, + "balance_loss_mlp": 1.02111614, + "epoch": 0.8238689313091838, + "flos": 22746537267840.0, + "grad_norm": 1.8650965483558013, + "language_loss": 0.75056022, + "learning_rate": 3.1659385614482603e-07, + "loss": 0.77213681, + "num_input_tokens_seen": 295541190, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.12054443, + "step": 13703, + "time_per_iteration": 2.466252088546753 + }, + { + "auxiliary_loss_clip": 0.01121481, + "auxiliary_loss_mlp": 0.01036866, + "balance_loss_clip": 1.04532933, + "balance_loss_mlp": 1.02392626, + "epoch": 0.8239290545618518, + "flos": 25630092264960.0, + "grad_norm": 2.4968184345061633, + "language_loss": 0.70090145, + "learning_rate": 3.1638360185820755e-07, + "loss": 0.72248495, + "num_input_tokens_seen": 295558860, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.12921143, + "step": 13704, + "time_per_iteration": 2.491832733154297 + }, + { + "auxiliary_loss_clip": 0.01109646, + "auxiliary_loss_mlp": 0.0103075, + "balance_loss_clip": 1.03751385, + "balance_loss_mlp": 1.01849508, + "epoch": 0.8239891778145197, + "flos": 26026473824640.0, + "grad_norm": 3.4655194984773727, + "language_loss": 0.6436913, + "learning_rate": 3.161734114144916e-07, + "loss": 0.66509533, + "num_input_tokens_seen": 295578155, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.12249756, + "step": 13705, + "time_per_iteration": 2.462114095687866 + }, + { + "auxiliary_loss_clip": 0.01112697, + "auxiliary_loss_mlp": 0.0103708, + "balance_loss_clip": 1.03822351, + "balance_loss_mlp": 1.02301359, + "epoch": 0.8240493010671878, + "flos": 21833257040640.0, + "grad_norm": 1.531727592822887, + "language_loss": 0.69367516, + "learning_rate": 3.1596328482164915e-07, + "loss": 0.71517295, + "num_input_tokens_seen": 295599170, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.14074707, + "step": 13706, + "time_per_iteration": 2.465632915496826 + }, + { + "auxiliary_loss_clip": 0.01117403, + "auxiliary_loss_mlp": 0.01032902, + "balance_loss_clip": 1.04506278, + "balance_loss_mlp": 1.02064121, + "epoch": 0.8241094243198557, + "flos": 18551919853440.0, + "grad_norm": 1.8231936245206, + "language_loss": 0.69578397, + "learning_rate": 3.157532220876475e-07, + "loss": 0.717287, + "num_input_tokens_seen": 295617465, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.12268066, + "step": 13707, + "time_per_iteration": 2.462347984313965 + }, + { + "auxiliary_loss_clip": 0.01125284, + "auxiliary_loss_mlp": 0.01027002, + "balance_loss_clip": 1.0505836, + "balance_loss_mlp": 1.01539063, + "epoch": 0.8241695475725237, + "flos": 25447163276160.0, + "grad_norm": 2.1264154832106628, + "language_loss": 0.79581106, + "learning_rate": 3.1554322322045226e-07, + "loss": 0.81733394, + "num_input_tokens_seen": 295634960, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.11608887, + "step": 13708, + "time_per_iteration": 2.503652334213257 + }, + { + "auxiliary_loss_clip": 0.01121777, + "auxiliary_loss_mlp": 0.01026981, + "balance_loss_clip": 1.04651499, + "balance_loss_mlp": 1.01486909, + "epoch": 0.8242296708251916, + "flos": 18989670902400.0, + "grad_norm": 5.147906469474935, + "language_loss": 0.68693304, + "learning_rate": 3.1533328822802664e-07, + "loss": 0.70842063, + "num_input_tokens_seen": 295652725, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.12115479, + "step": 13709, + "time_per_iteration": 4.008811950683594 + }, + { + "auxiliary_loss_clip": 0.01115265, + "auxiliary_loss_mlp": 0.01030332, + "balance_loss_clip": 1.04370546, + "balance_loss_mlp": 1.01931751, + "epoch": 0.8242897940778596, + "flos": 22600883617920.0, + "grad_norm": 1.9877955523198891, + "language_loss": 0.82517838, + "learning_rate": 3.151234171183319e-07, + "loss": 0.84663433, + "num_input_tokens_seen": 295671195, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11022949, + "step": 13710, + "time_per_iteration": 2.504713296890259 + }, + { + "auxiliary_loss_clip": 0.01125159, + "auxiliary_loss_mlp": 0.01031509, + "balance_loss_clip": 1.04919803, + "balance_loss_mlp": 1.01913548, + "epoch": 0.8243499173305275, + "flos": 21468153248640.0, + "grad_norm": 2.4418021695327448, + "language_loss": 0.78430718, + "learning_rate": 3.149136098993257e-07, + "loss": 0.80587387, + "num_input_tokens_seen": 295689130, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.1237793, + "step": 13711, + "time_per_iteration": 2.4735465049743652 + }, + { + "auxiliary_loss_clip": 0.01112872, + "auxiliary_loss_mlp": 0.01028017, + "balance_loss_clip": 1.04115486, + "balance_loss_mlp": 1.0159291, + "epoch": 0.8244100405831956, + "flos": 20010359773440.0, + "grad_norm": 2.90431371436779, + "language_loss": 0.6568594, + "learning_rate": 3.1470386657896473e-07, + "loss": 0.67826831, + "num_input_tokens_seen": 295706385, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.12091064, + "step": 13712, + "time_per_iteration": 2.4022555351257324 + }, + { + "auxiliary_loss_clip": 0.01114279, + "auxiliary_loss_mlp": 0.01027412, + "balance_loss_clip": 1.04282868, + "balance_loss_mlp": 1.01627159, + "epoch": 0.8244701638358635, + "flos": 26430684549120.0, + "grad_norm": 1.540559836572991, + "language_loss": 0.74264145, + "learning_rate": 3.14494187165202e-07, + "loss": 0.76405835, + "num_input_tokens_seen": 295727925, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11138916, + "step": 13713, + "time_per_iteration": 2.482491970062256 + }, + { + "auxiliary_loss_clip": 0.01126064, + "auxiliary_loss_mlp": 0.01026036, + "balance_loss_clip": 1.05167723, + "balance_loss_mlp": 1.01435375, + "epoch": 0.8245302870885315, + "flos": 17640004343040.0, + "grad_norm": 1.863466272744078, + "language_loss": 0.81137919, + "learning_rate": 3.1428457166598833e-07, + "loss": 0.83290023, + "num_input_tokens_seen": 295744420, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.11682129, + "step": 13714, + "time_per_iteration": 2.433554172515869 + }, + { + "auxiliary_loss_clip": 0.01115449, + "auxiliary_loss_mlp": 0.01030391, + "balance_loss_clip": 1.04494905, + "balance_loss_mlp": 1.01795781, + "epoch": 0.8245904103411995, + "flos": 26209510554240.0, + "grad_norm": 2.1082368462476033, + "language_loss": 0.66585267, + "learning_rate": 3.1407502008927235e-07, + "loss": 0.68731111, + "num_input_tokens_seen": 295765105, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.12438965, + "step": 13715, + "time_per_iteration": 4.09520149230957 + }, + { + "auxiliary_loss_clip": 0.01113775, + "auxiliary_loss_mlp": 0.01029077, + "balance_loss_clip": 1.04084468, + "balance_loss_mlp": 1.01701331, + "epoch": 0.8246505335938674, + "flos": 24205084928640.0, + "grad_norm": 3.4669962179298732, + "language_loss": 0.75182146, + "learning_rate": 3.1386553244300086e-07, + "loss": 0.77324998, + "num_input_tokens_seen": 295784200, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.1204834, + "step": 13716, + "time_per_iteration": 2.4428369998931885 + }, + { + "auxiliary_loss_clip": 0.01069265, + "auxiliary_loss_mlp": 0.01006069, + "balance_loss_clip": 1.04422593, + "balance_loss_mlp": 1.00443006, + "epoch": 0.8247106568465354, + "flos": 67092195749760.0, + "grad_norm": 0.7136901663746696, + "language_loss": 0.58942151, + "learning_rate": 3.136561087351175e-07, + "loss": 0.61017483, + "num_input_tokens_seen": 295846555, + "router_z_loss_clip": 0.25024414, + "router_z_loss_mlp": 0.0164032, + "step": 13717, + "time_per_iteration": 3.2472174167633057 + }, + { + "auxiliary_loss_clip": 0.01115126, + "auxiliary_loss_mlp": 0.01030301, + "balance_loss_clip": 1.04361439, + "balance_loss_mlp": 1.01980448, + "epoch": 0.8247707800992033, + "flos": 12568232805120.0, + "grad_norm": 1.9865209540736917, + "language_loss": 0.7965951, + "learning_rate": 3.1344674897356373e-07, + "loss": 0.81804943, + "num_input_tokens_seen": 295863425, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.10498047, + "step": 13718, + "time_per_iteration": 2.4626424312591553 + }, + { + "auxiliary_loss_clip": 0.01115304, + "auxiliary_loss_mlp": 0.01034545, + "balance_loss_clip": 1.0425545, + "balance_loss_mlp": 1.02301145, + "epoch": 0.8248309033518714, + "flos": 15923617879680.0, + "grad_norm": 2.1197989861026056, + "language_loss": 0.69063926, + "learning_rate": 3.132374531662778e-07, + "loss": 0.7121377, + "num_input_tokens_seen": 295880925, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11529541, + "step": 13719, + "time_per_iteration": 2.4206159114837646 + }, + { + "auxiliary_loss_clip": 0.01115512, + "auxiliary_loss_mlp": 0.01035637, + "balance_loss_clip": 1.04005301, + "balance_loss_mlp": 1.0212605, + "epoch": 0.8248910266045393, + "flos": 17564735393280.0, + "grad_norm": 2.509799029848021, + "language_loss": 0.69817531, + "learning_rate": 3.13028221321197e-07, + "loss": 0.71968687, + "num_input_tokens_seen": 295898205, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.1439209, + "step": 13720, + "time_per_iteration": 2.4046406745910645 + }, + { + "auxiliary_loss_clip": 0.01113216, + "auxiliary_loss_mlp": 0.01028661, + "balance_loss_clip": 1.03861237, + "balance_loss_mlp": 1.01703835, + "epoch": 0.8249511498572073, + "flos": 28619655275520.0, + "grad_norm": 1.6986294520496468, + "language_loss": 0.75830746, + "learning_rate": 3.1281905344625467e-07, + "loss": 0.77972621, + "num_input_tokens_seen": 295918130, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11627197, + "step": 13721, + "time_per_iteration": 2.5075087547302246 + }, + { + "auxiliary_loss_clip": 0.01119145, + "auxiliary_loss_mlp": 0.01027151, + "balance_loss_clip": 1.04739106, + "balance_loss_mlp": 1.01580262, + "epoch": 0.8250112731098752, + "flos": 25556583081600.0, + "grad_norm": 1.901312401563757, + "language_loss": 0.78225732, + "learning_rate": 3.1260994954938305e-07, + "loss": 0.80372036, + "num_input_tokens_seen": 295937760, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11346436, + "step": 13722, + "time_per_iteration": 3.9153740406036377 + }, + { + "auxiliary_loss_clip": 0.01122506, + "auxiliary_loss_mlp": 0.01030107, + "balance_loss_clip": 1.05055296, + "balance_loss_mlp": 1.01888907, + "epoch": 0.8250713963625432, + "flos": 27746164339200.0, + "grad_norm": 1.9264549480232063, + "language_loss": 0.62749052, + "learning_rate": 3.1240090963851205e-07, + "loss": 0.64901662, + "num_input_tokens_seen": 295957585, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11218262, + "step": 13723, + "time_per_iteration": 2.4893157482147217 + }, + { + "auxiliary_loss_clip": 0.01127001, + "auxiliary_loss_mlp": 0.01032083, + "balance_loss_clip": 1.05321157, + "balance_loss_mlp": 1.02052557, + "epoch": 0.8251315196152111, + "flos": 21610610588160.0, + "grad_norm": 1.436244389573082, + "language_loss": 0.74328876, + "learning_rate": 3.121919337215666e-07, + "loss": 0.76487958, + "num_input_tokens_seen": 295977135, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11566162, + "step": 13724, + "time_per_iteration": 2.5616087913513184 + }, + { + "auxiliary_loss_clip": 0.01118129, + "auxiliary_loss_mlp": 0.01033376, + "balance_loss_clip": 1.04600263, + "balance_loss_mlp": 1.02053154, + "epoch": 0.8251916428678792, + "flos": 28579363194240.0, + "grad_norm": 1.832221182137576, + "language_loss": 0.63529772, + "learning_rate": 3.1198302180647253e-07, + "loss": 0.65681279, + "num_input_tokens_seen": 295996265, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.12835693, + "step": 13725, + "time_per_iteration": 2.507356643676758 + }, + { + "auxiliary_loss_clip": 0.01124481, + "auxiliary_loss_mlp": 0.01029756, + "balance_loss_clip": 1.05290151, + "balance_loss_mlp": 1.01811504, + "epoch": 0.8252517661205471, + "flos": 23075191733760.0, + "grad_norm": 1.6345466516107605, + "language_loss": 0.82109547, + "learning_rate": 3.1177417390115125e-07, + "loss": 0.84263778, + "num_input_tokens_seen": 296014745, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11639404, + "step": 13726, + "time_per_iteration": 2.518670082092285 + }, + { + "auxiliary_loss_clip": 0.01110608, + "auxiliary_loss_mlp": 0.01032722, + "balance_loss_clip": 1.04262936, + "balance_loss_mlp": 1.02213025, + "epoch": 0.8253118893732151, + "flos": 31759576617600.0, + "grad_norm": 1.6954741893287513, + "language_loss": 0.70464778, + "learning_rate": 3.1156539001352286e-07, + "loss": 0.72608107, + "num_input_tokens_seen": 296036960, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 0.10601807, + "step": 13727, + "time_per_iteration": 2.5269196033477783 + }, + { + "auxiliary_loss_clip": 0.01118537, + "auxiliary_loss_mlp": 0.0103607, + "balance_loss_clip": 1.04342341, + "balance_loss_mlp": 1.02265263, + "epoch": 0.8253720126258831, + "flos": 18296415434880.0, + "grad_norm": 1.7635735111409074, + "language_loss": 0.62446511, + "learning_rate": 3.113566701515036e-07, + "loss": 0.64601117, + "num_input_tokens_seen": 296056540, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.13421631, + "step": 13728, + "time_per_iteration": 2.5918374061584473 + }, + { + "auxiliary_loss_clip": 0.01126113, + "auxiliary_loss_mlp": 0.01028047, + "balance_loss_clip": 1.0490334, + "balance_loss_mlp": 1.01633477, + "epoch": 0.825432135878551, + "flos": 26797332625920.0, + "grad_norm": 1.637369512544058, + "language_loss": 0.71487045, + "learning_rate": 3.111480143230092e-07, + "loss": 0.73641205, + "num_input_tokens_seen": 296077950, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.11700439, + "step": 13729, + "time_per_iteration": 2.469257116317749 + }, + { + "auxiliary_loss_clip": 0.01043543, + "auxiliary_loss_mlp": 0.01004743, + "balance_loss_clip": 1.01790094, + "balance_loss_mlp": 1.00344312, + "epoch": 0.825492259131219, + "flos": 54219116217600.0, + "grad_norm": 0.8685965092857799, + "language_loss": 0.6270628, + "learning_rate": 3.109394225359514e-07, + "loss": 0.64754564, + "num_input_tokens_seen": 296127060, + "router_z_loss_clip": 0.25683594, + "router_z_loss_mlp": 0.01298523, + "step": 13730, + "time_per_iteration": 2.9571118354797363 + }, + { + "auxiliary_loss_clip": 0.01121029, + "auxiliary_loss_mlp": 0.0102973, + "balance_loss_clip": 1.04965687, + "balance_loss_mlp": 1.01835132, + "epoch": 0.825552382383887, + "flos": 43756145493120.0, + "grad_norm": 2.043432195425647, + "language_loss": 0.63361013, + "learning_rate": 3.1073089479823945e-07, + "loss": 0.65511775, + "num_input_tokens_seen": 296147775, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.11376953, + "step": 13731, + "time_per_iteration": 2.6566925048828125 + }, + { + "auxiliary_loss_clip": 0.01122985, + "auxiliary_loss_mlp": 0.01030618, + "balance_loss_clip": 1.04496038, + "balance_loss_mlp": 1.01841116, + "epoch": 0.825612505636555, + "flos": 12602814624000.0, + "grad_norm": 2.191946605659833, + "language_loss": 0.70125651, + "learning_rate": 3.105224311177812e-07, + "loss": 0.72279263, + "num_input_tokens_seen": 296163560, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.12213135, + "step": 13732, + "time_per_iteration": 2.439770460128784 + }, + { + "auxiliary_loss_clip": 0.01118413, + "auxiliary_loss_mlp": 0.01032674, + "balance_loss_clip": 1.04262137, + "balance_loss_mlp": 1.02040768, + "epoch": 0.8256726288892229, + "flos": 17595618111360.0, + "grad_norm": 2.6463853167083067, + "language_loss": 0.7123515, + "learning_rate": 3.103140315024817e-07, + "loss": 0.73386234, + "num_input_tokens_seen": 296178730, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.12268066, + "step": 13733, + "time_per_iteration": 2.392012357711792 + }, + { + "auxiliary_loss_clip": 0.01115757, + "auxiliary_loss_mlp": 0.01026981, + "balance_loss_clip": 1.04521406, + "balance_loss_mlp": 1.01488698, + "epoch": 0.8257327521418909, + "flos": 23805794367360.0, + "grad_norm": 1.7318165329796047, + "language_loss": 0.82512027, + "learning_rate": 3.1010569596024437e-07, + "loss": 0.84654766, + "num_input_tokens_seen": 296200175, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.12078857, + "step": 13734, + "time_per_iteration": 2.4818575382232666 + }, + { + "auxiliary_loss_clip": 0.01117967, + "auxiliary_loss_mlp": 0.01027277, + "balance_loss_clip": 1.0473665, + "balance_loss_mlp": 1.01560593, + "epoch": 0.8257928753945588, + "flos": 19281121856640.0, + "grad_norm": 1.8841792744013555, + "language_loss": 0.83014584, + "learning_rate": 3.098974244989676e-07, + "loss": 0.85159826, + "num_input_tokens_seen": 296219305, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.11676025, + "step": 13735, + "time_per_iteration": 2.4125163555145264 + }, + { + "auxiliary_loss_clip": 0.01115153, + "auxiliary_loss_mlp": 0.01025891, + "balance_loss_clip": 1.04169381, + "balance_loss_mlp": 1.01537085, + "epoch": 0.8258529986472268, + "flos": 18478841633280.0, + "grad_norm": 2.1491220717812243, + "language_loss": 0.71097809, + "learning_rate": 3.096892171265497e-07, + "loss": 0.7323885, + "num_input_tokens_seen": 296236945, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.10516357, + "step": 13736, + "time_per_iteration": 2.4236745834350586 + }, + { + "auxiliary_loss_clip": 0.01045979, + "auxiliary_loss_mlp": 0.01003868, + "balance_loss_clip": 1.02187943, + "balance_loss_mlp": 1.0025599, + "epoch": 0.8259131218998947, + "flos": 62137957512960.0, + "grad_norm": 0.8532311212006659, + "language_loss": 0.6799612, + "learning_rate": 3.0948107385088665e-07, + "loss": 0.7004596, + "num_input_tokens_seen": 296294685, + "router_z_loss_clip": 0.2409668, + "router_z_loss_mlp": 0.01307678, + "step": 13737, + "time_per_iteration": 3.0577709674835205 + }, + { + "auxiliary_loss_clip": 0.01109468, + "auxiliary_loss_mlp": 0.01030219, + "balance_loss_clip": 1.03786516, + "balance_loss_mlp": 1.01929355, + "epoch": 0.8259732451525628, + "flos": 22159038418560.0, + "grad_norm": 1.814340727421597, + "language_loss": 0.69787252, + "learning_rate": 3.0927299467987e-07, + "loss": 0.71926945, + "num_input_tokens_seen": 296314790, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.10931396, + "step": 13738, + "time_per_iteration": 3.88215970993042 + }, + { + "auxiliary_loss_clip": 0.01119422, + "auxiliary_loss_mlp": 0.01032272, + "balance_loss_clip": 1.04415059, + "balance_loss_mlp": 1.01856351, + "epoch": 0.8260333684052307, + "flos": 38361645233280.0, + "grad_norm": 2.5016433549340418, + "language_loss": 0.63073707, + "learning_rate": 3.090649796213911e-07, + "loss": 0.65225399, + "num_input_tokens_seen": 296335355, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.13696289, + "step": 13739, + "time_per_iteration": 2.571298599243164 + }, + { + "auxiliary_loss_clip": 0.0104676, + "auxiliary_loss_mlp": 0.01005464, + "balance_loss_clip": 1.02177525, + "balance_loss_mlp": 1.00393176, + "epoch": 0.8260934916578987, + "flos": 62185611882240.0, + "grad_norm": 0.8313377282890864, + "language_loss": 0.59302151, + "learning_rate": 3.0885702868333853e-07, + "loss": 0.61354375, + "num_input_tokens_seen": 296399885, + "router_z_loss_clip": 0.24951172, + "router_z_loss_mlp": 0.01530457, + "step": 13740, + "time_per_iteration": 3.110193967819214 + }, + { + "auxiliary_loss_clip": 0.01120398, + "auxiliary_loss_mlp": 0.01031898, + "balance_loss_clip": 1.04298222, + "balance_loss_mlp": 1.01863039, + "epoch": 0.8261536149105667, + "flos": 22565475786240.0, + "grad_norm": 2.7828821420856054, + "language_loss": 0.75581431, + "learning_rate": 3.086491418735959e-07, + "loss": 0.77733731, + "num_input_tokens_seen": 296417660, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.13262939, + "step": 13741, + "time_per_iteration": 2.5906898975372314 + }, + { + "auxiliary_loss_clip": 0.01109779, + "auxiliary_loss_mlp": 0.01031216, + "balance_loss_clip": 1.03804803, + "balance_loss_mlp": 1.01926494, + "epoch": 0.8262137381632346, + "flos": 32525479342080.0, + "grad_norm": 2.047371627755438, + "language_loss": 0.62591535, + "learning_rate": 3.0844131920004726e-07, + "loss": 0.64732534, + "num_input_tokens_seen": 296438255, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11956787, + "step": 13742, + "time_per_iteration": 2.5124025344848633 + }, + { + "auxiliary_loss_clip": 0.01116963, + "auxiliary_loss_mlp": 0.01037953, + "balance_loss_clip": 1.04055381, + "balance_loss_mlp": 1.02190781, + "epoch": 0.8262738614159026, + "flos": 14136451666560.0, + "grad_norm": 2.308749951432583, + "language_loss": 0.66283929, + "learning_rate": 3.0823356067057327e-07, + "loss": 0.68438846, + "num_input_tokens_seen": 296454485, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.16040039, + "step": 13743, + "time_per_iteration": 2.405196189880371 + }, + { + "auxiliary_loss_clip": 0.01117936, + "auxiliary_loss_mlp": 0.01032028, + "balance_loss_clip": 1.0457139, + "balance_loss_mlp": 1.02030325, + "epoch": 0.8263339846685706, + "flos": 19825347795840.0, + "grad_norm": 1.875891231800951, + "language_loss": 0.66688061, + "learning_rate": 3.0802586629305283e-07, + "loss": 0.68838024, + "num_input_tokens_seen": 296473740, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11730957, + "step": 13744, + "time_per_iteration": 2.429105758666992 + }, + { + "auxiliary_loss_clip": 0.01109206, + "auxiliary_loss_mlp": 0.01030358, + "balance_loss_clip": 1.03785861, + "balance_loss_mlp": 1.01889014, + "epoch": 0.8263941079212386, + "flos": 22745962650240.0, + "grad_norm": 1.904547229205614, + "language_loss": 0.75291252, + "learning_rate": 3.078182360753612e-07, + "loss": 0.7743082, + "num_input_tokens_seen": 296493355, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.11480713, + "step": 13745, + "time_per_iteration": 2.459841251373291 + }, + { + "auxiliary_loss_clip": 0.01110531, + "auxiliary_loss_mlp": 0.01029479, + "balance_loss_clip": 1.03948212, + "balance_loss_mlp": 1.01910138, + "epoch": 0.8264542311739065, + "flos": 20120641505280.0, + "grad_norm": 1.9224080482503754, + "language_loss": 0.78904796, + "learning_rate": 3.076106700253709e-07, + "loss": 0.81044805, + "num_input_tokens_seen": 296510520, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.1038208, + "step": 13746, + "time_per_iteration": 2.4124550819396973 + }, + { + "auxiliary_loss_clip": 0.01115271, + "auxiliary_loss_mlp": 0.0103623, + "balance_loss_clip": 1.04098213, + "balance_loss_mlp": 1.02381432, + "epoch": 0.8265143544265745, + "flos": 16837149502080.0, + "grad_norm": 2.0100664443168608, + "language_loss": 0.68749964, + "learning_rate": 3.0740316815095415e-07, + "loss": 0.70901465, + "num_input_tokens_seen": 296528265, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.12414551, + "step": 13747, + "time_per_iteration": 2.4421114921569824 + }, + { + "auxiliary_loss_clip": 0.01113376, + "auxiliary_loss_mlp": 0.01031316, + "balance_loss_clip": 1.03963971, + "balance_loss_mlp": 1.01909745, + "epoch": 0.8265744776792424, + "flos": 22018592240640.0, + "grad_norm": 2.2936321368852006, + "language_loss": 0.75694132, + "learning_rate": 3.0719573045997835e-07, + "loss": 0.77838814, + "num_input_tokens_seen": 296547810, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.12219238, + "step": 13748, + "time_per_iteration": 2.4496116638183594 + }, + { + "auxiliary_loss_clip": 0.01113893, + "auxiliary_loss_mlp": 0.01037787, + "balance_loss_clip": 1.04383302, + "balance_loss_mlp": 1.02659345, + "epoch": 0.8266346009319104, + "flos": 19244852098560.0, + "grad_norm": 1.8445062995865356, + "language_loss": 0.64261431, + "learning_rate": 3.069883569603102e-07, + "loss": 0.6641311, + "num_input_tokens_seen": 296565940, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.11193848, + "step": 13749, + "time_per_iteration": 2.4387714862823486 + }, + { + "auxiliary_loss_clip": 0.01111473, + "auxiliary_loss_mlp": 0.01024896, + "balance_loss_clip": 1.03985858, + "balance_loss_mlp": 1.01379812, + "epoch": 0.8266947241845783, + "flos": 24166768095360.0, + "grad_norm": 1.8430250368246752, + "language_loss": 0.73955405, + "learning_rate": 3.067810476598132e-07, + "loss": 0.76091766, + "num_input_tokens_seen": 296585090, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11108398, + "step": 13750, + "time_per_iteration": 2.4671778678894043 + }, + { + "auxiliary_loss_clip": 0.01118451, + "auxiliary_loss_mlp": 0.01033989, + "balance_loss_clip": 1.04411161, + "balance_loss_mlp": 1.02132344, + "epoch": 0.8267548474372464, + "flos": 21105814803840.0, + "grad_norm": 1.8971065412112036, + "language_loss": 0.65850711, + "learning_rate": 3.065738025663496e-07, + "loss": 0.68003154, + "num_input_tokens_seen": 296604950, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12664795, + "step": 13751, + "time_per_iteration": 2.4703872203826904 + }, + { + "auxiliary_loss_clip": 0.01110561, + "auxiliary_loss_mlp": 0.01028162, + "balance_loss_clip": 1.04007912, + "balance_loss_mlp": 1.01712286, + "epoch": 0.8268149706899143, + "flos": 39968288668800.0, + "grad_norm": 1.46348135394343, + "language_loss": 0.60712838, + "learning_rate": 3.0636662168777607e-07, + "loss": 0.6285156, + "num_input_tokens_seen": 296627780, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.1104126, + "step": 13752, + "time_per_iteration": 2.596140146255493 + }, + { + "auxiliary_loss_clip": 0.0108506, + "auxiliary_loss_mlp": 0.01001306, + "balance_loss_clip": 1.06087685, + "balance_loss_mlp": 0.99951321, + "epoch": 0.8268750939425823, + "flos": 65782423244160.0, + "grad_norm": 0.7724650894414454, + "language_loss": 0.57415366, + "learning_rate": 3.0615950503194986e-07, + "loss": 0.59501731, + "num_input_tokens_seen": 296683850, + "router_z_loss_clip": 0.24169922, + "router_z_loss_mlp": 0.01791382, + "step": 13753, + "time_per_iteration": 4.545742034912109 + }, + { + "auxiliary_loss_clip": 0.01044402, + "auxiliary_loss_mlp": 0.01004479, + "balance_loss_clip": 1.01849449, + "balance_loss_mlp": 1.00309312, + "epoch": 0.8269352171952503, + "flos": 52981455242880.0, + "grad_norm": 0.7004548435439213, + "language_loss": 0.55004704, + "learning_rate": 3.0595245260672563e-07, + "loss": 0.5705359, + "num_input_tokens_seen": 296741420, + "router_z_loss_clip": 0.25927734, + "router_z_loss_mlp": 0.01387024, + "step": 13754, + "time_per_iteration": 3.2037336826324463 + }, + { + "auxiliary_loss_clip": 0.0111064, + "auxiliary_loss_mlp": 0.01030003, + "balance_loss_clip": 1.04098928, + "balance_loss_mlp": 1.01998937, + "epoch": 0.8269953404479182, + "flos": 23076125487360.0, + "grad_norm": 1.9764889607225058, + "language_loss": 0.69285107, + "learning_rate": 3.0574546441995354e-07, + "loss": 0.71425748, + "num_input_tokens_seen": 296759620, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.10009766, + "step": 13755, + "time_per_iteration": 2.453206777572632 + }, + { + "auxiliary_loss_clip": 0.01113121, + "auxiliary_loss_mlp": 0.010292, + "balance_loss_clip": 1.04320741, + "balance_loss_mlp": 1.01865578, + "epoch": 0.8270554637005862, + "flos": 14209996763520.0, + "grad_norm": 2.3346236127714097, + "language_loss": 0.69703138, + "learning_rate": 3.0553854047948324e-07, + "loss": 0.7184546, + "num_input_tokens_seen": 296777275, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.10546875, + "step": 13756, + "time_per_iteration": 2.4275758266448975 + }, + { + "auxiliary_loss_clip": 0.01121154, + "auxiliary_loss_mlp": 0.01031098, + "balance_loss_clip": 1.04745793, + "balance_loss_mlp": 1.01907563, + "epoch": 0.8271155869532542, + "flos": 21762046327680.0, + "grad_norm": 1.7701264859167507, + "language_loss": 0.72419459, + "learning_rate": 3.053316807931623e-07, + "loss": 0.74571717, + "num_input_tokens_seen": 296796655, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.12023926, + "step": 13757, + "time_per_iteration": 3.8807373046875 + }, + { + "auxiliary_loss_clip": 0.01124463, + "auxiliary_loss_mlp": 0.01031772, + "balance_loss_clip": 1.04852545, + "balance_loss_mlp": 1.01808667, + "epoch": 0.8271757102059222, + "flos": 15120475729920.0, + "grad_norm": 2.2693383251838988, + "language_loss": 0.69028473, + "learning_rate": 3.0512488536883283e-07, + "loss": 0.71184713, + "num_input_tokens_seen": 296813705, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.13696289, + "step": 13758, + "time_per_iteration": 2.4579927921295166 + }, + { + "auxiliary_loss_clip": 0.01107343, + "auxiliary_loss_mlp": 0.01028145, + "balance_loss_clip": 1.03873956, + "balance_loss_mlp": 1.01723099, + "epoch": 0.8272358334585901, + "flos": 24133730561280.0, + "grad_norm": 1.601249292105055, + "language_loss": 0.69431782, + "learning_rate": 3.0491815421433775e-07, + "loss": 0.71567273, + "num_input_tokens_seen": 296833985, + "router_z_loss_clip": 0.68652344, + "router_z_loss_mlp": 0.10913086, + "step": 13759, + "time_per_iteration": 2.521740198135376 + }, + { + "auxiliary_loss_clip": 0.011119, + "auxiliary_loss_mlp": 0.01028893, + "balance_loss_clip": 1.04191327, + "balance_loss_mlp": 1.01743674, + "epoch": 0.8272959567112581, + "flos": 18990712396800.0, + "grad_norm": 1.7770039183312112, + "language_loss": 0.70920974, + "learning_rate": 3.047114873375161e-07, + "loss": 0.73061764, + "num_input_tokens_seen": 296850150, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.11456299, + "step": 13760, + "time_per_iteration": 2.4169607162475586 + }, + { + "auxiliary_loss_clip": 0.01115683, + "auxiliary_loss_mlp": 0.0102727, + "balance_loss_clip": 1.04635966, + "balance_loss_mlp": 1.01643407, + "epoch": 0.827356079963926, + "flos": 20631614428800.0, + "grad_norm": 1.6539302174266428, + "language_loss": 0.77631551, + "learning_rate": 3.0450488474620505e-07, + "loss": 0.79774505, + "num_input_tokens_seen": 296869585, + "router_z_loss_clip": 0.69335938, + "router_z_loss_mlp": 0.10839844, + "step": 13761, + "time_per_iteration": 2.449542760848999 + }, + { + "auxiliary_loss_clip": 0.01106931, + "auxiliary_loss_mlp": 0.01029746, + "balance_loss_clip": 1.03791237, + "balance_loss_mlp": 1.01890445, + "epoch": 0.827416203216594, + "flos": 22416625825920.0, + "grad_norm": 1.6899512669194527, + "language_loss": 0.69684666, + "learning_rate": 3.042983464482387e-07, + "loss": 0.71821344, + "num_input_tokens_seen": 296887710, + "router_z_loss_clip": 0.69091797, + "router_z_loss_mlp": 0.10839844, + "step": 13762, + "time_per_iteration": 2.4521424770355225 + }, + { + "auxiliary_loss_clip": 0.01122658, + "auxiliary_loss_mlp": 0.01025801, + "balance_loss_clip": 1.05048358, + "balance_loss_mlp": 1.01563239, + "epoch": 0.827476326469262, + "flos": 19026192055680.0, + "grad_norm": 1.9162401428648803, + "language_loss": 0.69854724, + "learning_rate": 3.0409187245144853e-07, + "loss": 0.7200318, + "num_input_tokens_seen": 296906265, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.10174561, + "step": 13763, + "time_per_iteration": 2.441986322402954 + }, + { + "auxiliary_loss_clip": 0.01036577, + "auxiliary_loss_mlp": 0.01003838, + "balance_loss_clip": 1.01200104, + "balance_loss_mlp": 1.00242555, + "epoch": 0.82753644972193, + "flos": 68500575089280.0, + "grad_norm": 0.8385387465286059, + "language_loss": 0.65142542, + "learning_rate": 3.038854627636651e-07, + "loss": 0.67182958, + "num_input_tokens_seen": 296971290, + "router_z_loss_clip": 0.24560547, + "router_z_loss_mlp": 0.01412964, + "step": 13764, + "time_per_iteration": 3.1571638584136963 + }, + { + "auxiliary_loss_clip": 0.01115261, + "auxiliary_loss_mlp": 0.0103151, + "balance_loss_clip": 1.04103708, + "balance_loss_mlp": 1.01932716, + "epoch": 0.8275965729745979, + "flos": 18405404277120.0, + "grad_norm": 2.348841060869967, + "language_loss": 0.77628261, + "learning_rate": 3.0367911739271423e-07, + "loss": 0.79775029, + "num_input_tokens_seen": 296989060, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.1217041, + "step": 13765, + "time_per_iteration": 2.419126510620117 + }, + { + "auxiliary_loss_clip": 0.01117327, + "auxiliary_loss_mlp": 0.0103114, + "balance_loss_clip": 1.04351902, + "balance_loss_mlp": 1.01859307, + "epoch": 0.8276566962272659, + "flos": 28512067063680.0, + "grad_norm": 1.8698649812161101, + "language_loss": 0.62820846, + "learning_rate": 3.034728363464214e-07, + "loss": 0.64969313, + "num_input_tokens_seen": 297011300, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.12530518, + "step": 13766, + "time_per_iteration": 3.9553170204162598 + }, + { + "auxiliary_loss_clip": 0.01115412, + "auxiliary_loss_mlp": 0.01031533, + "balance_loss_clip": 1.04248476, + "balance_loss_mlp": 1.01906931, + "epoch": 0.8277168194799339, + "flos": 20230240878720.0, + "grad_norm": 1.8858115214249966, + "language_loss": 0.82550693, + "learning_rate": 3.03266619632609e-07, + "loss": 0.8469764, + "num_input_tokens_seen": 297030350, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.12463379, + "step": 13767, + "time_per_iteration": 2.530191659927368 + }, + { + "auxiliary_loss_clip": 0.01116218, + "auxiliary_loss_mlp": 0.01032184, + "balance_loss_clip": 1.04181695, + "balance_loss_mlp": 1.02032852, + "epoch": 0.8277769427326018, + "flos": 28476623318400.0, + "grad_norm": 22.412469215904828, + "language_loss": 0.69061434, + "learning_rate": 3.030604672590964e-07, + "loss": 0.71209848, + "num_input_tokens_seen": 297049710, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11853027, + "step": 13768, + "time_per_iteration": 2.4740233421325684 + }, + { + "auxiliary_loss_clip": 0.01113838, + "auxiliary_loss_mlp": 0.01026491, + "balance_loss_clip": 1.04346514, + "balance_loss_mlp": 1.01579785, + "epoch": 0.8278370659852698, + "flos": 27197628768000.0, + "grad_norm": 1.8530813612095876, + "language_loss": 0.74625874, + "learning_rate": 3.028543792337006e-07, + "loss": 0.76766205, + "num_input_tokens_seen": 297070510, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.10699463, + "step": 13769, + "time_per_iteration": 2.493457317352295 + }, + { + "auxiliary_loss_clip": 0.01109864, + "auxiliary_loss_mlp": 0.01026479, + "balance_loss_clip": 1.03739953, + "balance_loss_mlp": 1.01490951, + "epoch": 0.8278971892379378, + "flos": 37816126404480.0, + "grad_norm": 3.205869376754474, + "language_loss": 0.74588579, + "learning_rate": 3.0264835556423675e-07, + "loss": 0.76724923, + "num_input_tokens_seen": 297092585, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11566162, + "step": 13770, + "time_per_iteration": 2.585533618927002 + }, + { + "auxiliary_loss_clip": 0.01110226, + "auxiliary_loss_mlp": 0.01029933, + "balance_loss_clip": 1.03779793, + "balance_loss_mlp": 1.01773787, + "epoch": 0.8279573124906058, + "flos": 22560160573440.0, + "grad_norm": 1.661610524153284, + "language_loss": 0.76347506, + "learning_rate": 3.0244239625851785e-07, + "loss": 0.7848767, + "num_input_tokens_seen": 297110055, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.12188721, + "step": 13771, + "time_per_iteration": 2.4708478450775146 + }, + { + "auxiliary_loss_clip": 0.01112758, + "auxiliary_loss_mlp": 0.01028458, + "balance_loss_clip": 1.04139304, + "balance_loss_mlp": 1.01731193, + "epoch": 0.8280174357432737, + "flos": 36064619418240.0, + "grad_norm": 1.6349522853563159, + "language_loss": 0.72420204, + "learning_rate": 3.0223650132435284e-07, + "loss": 0.74561423, + "num_input_tokens_seen": 297132170, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11138916, + "step": 13772, + "time_per_iteration": 2.6194281578063965 + }, + { + "auxiliary_loss_clip": 0.01111384, + "auxiliary_loss_mlp": 0.01038609, + "balance_loss_clip": 1.03934789, + "balance_loss_mlp": 1.02388644, + "epoch": 0.8280775589959417, + "flos": 22961067246720.0, + "grad_norm": 2.2008704684429903, + "language_loss": 0.74163949, + "learning_rate": 3.0203067076955035e-07, + "loss": 0.76313937, + "num_input_tokens_seen": 297149515, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.14733887, + "step": 13773, + "time_per_iteration": 2.4186859130859375 + }, + { + "auxiliary_loss_clip": 0.01115695, + "auxiliary_loss_mlp": 0.01039148, + "balance_loss_clip": 1.04261601, + "balance_loss_mlp": 1.02750731, + "epoch": 0.8281376822486096, + "flos": 26063282286720.0, + "grad_norm": 1.8959352013980861, + "language_loss": 0.7560038, + "learning_rate": 3.01824904601915e-07, + "loss": 0.77755225, + "num_input_tokens_seen": 297170320, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11639404, + "step": 13774, + "time_per_iteration": 2.474884510040283 + }, + { + "auxiliary_loss_clip": 0.01119414, + "auxiliary_loss_mlp": 0.0102632, + "balance_loss_clip": 1.04534781, + "balance_loss_mlp": 1.01453042, + "epoch": 0.8281978055012776, + "flos": 20667776446080.0, + "grad_norm": 1.7053689836366792, + "language_loss": 0.74952567, + "learning_rate": 3.01619202829249e-07, + "loss": 0.77098298, + "num_input_tokens_seen": 297189935, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11804199, + "step": 13775, + "time_per_iteration": 2.4293179512023926 + }, + { + "auxiliary_loss_clip": 0.01120176, + "auxiliary_loss_mlp": 0.01030019, + "balance_loss_clip": 1.04369378, + "balance_loss_mlp": 1.01737118, + "epoch": 0.8282579287539455, + "flos": 29315281040640.0, + "grad_norm": 2.1967388191499015, + "language_loss": 0.73524129, + "learning_rate": 3.01413565459353e-07, + "loss": 0.75674319, + "num_input_tokens_seen": 297210885, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.12658691, + "step": 13776, + "time_per_iteration": 2.5079329013824463 + }, + { + "auxiliary_loss_clip": 0.01117871, + "auxiliary_loss_mlp": 0.01023758, + "balance_loss_clip": 1.04576361, + "balance_loss_mlp": 1.01297522, + "epoch": 0.8283180520066136, + "flos": 15706178899200.0, + "grad_norm": 2.0847106407349165, + "language_loss": 0.77681231, + "learning_rate": 3.0120799250002483e-07, + "loss": 0.79822862, + "num_input_tokens_seen": 297228500, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.10784912, + "step": 13777, + "time_per_iteration": 2.4036004543304443 + }, + { + "auxiliary_loss_clip": 0.01114719, + "auxiliary_loss_mlp": 0.01028967, + "balance_loss_clip": 1.0433383, + "balance_loss_mlp": 1.01771343, + "epoch": 0.8283781752592815, + "flos": 24791470456320.0, + "grad_norm": 2.038033512688903, + "language_loss": 0.82627529, + "learning_rate": 3.010024839590604e-07, + "loss": 0.8477121, + "num_input_tokens_seen": 297249470, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.1126709, + "step": 13778, + "time_per_iteration": 2.4308583736419678 + }, + { + "auxiliary_loss_clip": 0.01115558, + "auxiliary_loss_mlp": 0.01026129, + "balance_loss_clip": 1.04549897, + "balance_loss_mlp": 1.0144403, + "epoch": 0.8284382985119495, + "flos": 18982811404800.0, + "grad_norm": 1.8967520082985518, + "language_loss": 0.74284536, + "learning_rate": 3.0079703984425187e-07, + "loss": 0.7642622, + "num_input_tokens_seen": 297265970, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.11694336, + "step": 13779, + "time_per_iteration": 2.424863815307617 + }, + { + "auxiliary_loss_clip": 0.01066307, + "auxiliary_loss_mlp": 0.01012278, + "balance_loss_clip": 1.04123592, + "balance_loss_mlp": 1.01052237, + "epoch": 0.8284984217646175, + "flos": 61034460814080.0, + "grad_norm": 0.7694830568994535, + "language_loss": 0.56773722, + "learning_rate": 3.0059166016338954e-07, + "loss": 0.58852303, + "num_input_tokens_seen": 297325525, + "router_z_loss_clip": 0.25097656, + "router_z_loss_mlp": 0.01754761, + "step": 13780, + "time_per_iteration": 3.1802327632904053 + }, + { + "auxiliary_loss_clip": 0.01118161, + "auxiliary_loss_mlp": 0.01025745, + "balance_loss_clip": 1.0440824, + "balance_loss_mlp": 1.0142889, + "epoch": 0.8285585450172854, + "flos": 19714635100800.0, + "grad_norm": 1.653470280651902, + "language_loss": 0.79878205, + "learning_rate": 3.0038634492426205e-07, + "loss": 0.82022113, + "num_input_tokens_seen": 297345025, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.11450195, + "step": 13781, + "time_per_iteration": 3.9989662170410156 + }, + { + "auxiliary_loss_clip": 0.01115583, + "auxiliary_loss_mlp": 0.01028322, + "balance_loss_clip": 1.04416013, + "balance_loss_mlp": 1.01551902, + "epoch": 0.8286186682699535, + "flos": 21688896280320.0, + "grad_norm": 2.8481043770277594, + "language_loss": 0.75786972, + "learning_rate": 3.001810941346543e-07, + "loss": 0.7793088, + "num_input_tokens_seen": 297363570, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.12805176, + "step": 13782, + "time_per_iteration": 2.5701968669891357 + }, + { + "auxiliary_loss_clip": 0.01114878, + "auxiliary_loss_mlp": 0.01025351, + "balance_loss_clip": 1.04192448, + "balance_loss_mlp": 1.0142169, + "epoch": 0.8286787915226214, + "flos": 25775566346880.0, + "grad_norm": 1.6805816182249533, + "language_loss": 0.76231664, + "learning_rate": 2.9997590780234983e-07, + "loss": 0.78371894, + "num_input_tokens_seen": 297385385, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.1114502, + "step": 13783, + "time_per_iteration": 2.4551055431365967 + }, + { + "auxiliary_loss_clip": 0.01116612, + "auxiliary_loss_mlp": 0.01023967, + "balance_loss_clip": 1.04492199, + "balance_loss_mlp": 1.01260602, + "epoch": 0.8287389147752894, + "flos": 21288348743040.0, + "grad_norm": 1.6493654614140876, + "language_loss": 0.73633581, + "learning_rate": 2.997707859351304e-07, + "loss": 0.75774157, + "num_input_tokens_seen": 297403950, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11358643, + "step": 13784, + "time_per_iteration": 2.4765167236328125 + }, + { + "auxiliary_loss_clip": 0.01119546, + "auxiliary_loss_mlp": 0.01031625, + "balance_loss_clip": 1.04227245, + "balance_loss_mlp": 1.01865554, + "epoch": 0.8287990380279573, + "flos": 33544875323520.0, + "grad_norm": 1.5134728959254358, + "language_loss": 0.70071375, + "learning_rate": 2.99565728540772e-07, + "loss": 0.72222543, + "num_input_tokens_seen": 297424565, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.12963867, + "step": 13785, + "time_per_iteration": 2.524911642074585 + }, + { + "auxiliary_loss_clip": 0.01113543, + "auxiliary_loss_mlp": 0.01028435, + "balance_loss_clip": 1.04344654, + "balance_loss_mlp": 1.01711631, + "epoch": 0.8288591612806253, + "flos": 22966346545920.0, + "grad_norm": 1.4229496140706859, + "language_loss": 0.68638694, + "learning_rate": 2.993607356270516e-07, + "loss": 0.70780671, + "num_input_tokens_seen": 297445180, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.11309814, + "step": 13786, + "time_per_iteration": 2.4631776809692383 + }, + { + "auxiliary_loss_clip": 0.01122119, + "auxiliary_loss_mlp": 0.0103111, + "balance_loss_clip": 1.04521394, + "balance_loss_mlp": 1.01923656, + "epoch": 0.8289192845332932, + "flos": 18588979710720.0, + "grad_norm": 1.71707913593104, + "language_loss": 0.77570826, + "learning_rate": 2.991558072017426e-07, + "loss": 0.7972405, + "num_input_tokens_seen": 297463790, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.11883545, + "step": 13787, + "time_per_iteration": 2.4437012672424316 + }, + { + "auxiliary_loss_clip": 0.01114652, + "auxiliary_loss_mlp": 0.01034963, + "balance_loss_clip": 1.04105067, + "balance_loss_mlp": 1.02317905, + "epoch": 0.8289794077859612, + "flos": 15450423085440.0, + "grad_norm": 1.7150055281139822, + "language_loss": 0.80746895, + "learning_rate": 2.989509432726163e-07, + "loss": 0.82896507, + "num_input_tokens_seen": 297480100, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11798096, + "step": 13788, + "time_per_iteration": 2.408118963241577 + }, + { + "auxiliary_loss_clip": 0.01113799, + "auxiliary_loss_mlp": 0.01029741, + "balance_loss_clip": 1.04393601, + "balance_loss_mlp": 1.01894665, + "epoch": 0.8290395310386292, + "flos": 28877853214080.0, + "grad_norm": 1.666463273987045, + "language_loss": 0.71046448, + "learning_rate": 2.9874614384744014e-07, + "loss": 0.73189986, + "num_input_tokens_seen": 297499890, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.10809326, + "step": 13789, + "time_per_iteration": 2.4938998222351074 + }, + { + "auxiliary_loss_clip": 0.01120305, + "auxiliary_loss_mlp": 0.01027416, + "balance_loss_clip": 1.0441494, + "balance_loss_mlp": 1.01544178, + "epoch": 0.8290996542912972, + "flos": 36576274700160.0, + "grad_norm": 1.6918889149652896, + "language_loss": 0.68415403, + "learning_rate": 2.985414089339813e-07, + "loss": 0.7056312, + "num_input_tokens_seen": 297521440, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.11968994, + "step": 13790, + "time_per_iteration": 2.5569639205932617 + }, + { + "auxiliary_loss_clip": 0.01121296, + "auxiliary_loss_mlp": 0.01031874, + "balance_loss_clip": 1.04369926, + "balance_loss_mlp": 1.01824284, + "epoch": 0.8291597775439651, + "flos": 23623009032960.0, + "grad_norm": 1.8576197655300652, + "language_loss": 0.77418542, + "learning_rate": 2.9833673854000265e-07, + "loss": 0.79571712, + "num_input_tokens_seen": 297539920, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.13635254, + "step": 13791, + "time_per_iteration": 2.449082612991333 + }, + { + "auxiliary_loss_clip": 0.01113755, + "auxiliary_loss_mlp": 0.01024872, + "balance_loss_clip": 1.04511905, + "balance_loss_mlp": 1.01305795, + "epoch": 0.8292199007966331, + "flos": 21397481239680.0, + "grad_norm": 1.511428619167062, + "language_loss": 0.70048827, + "learning_rate": 2.981321326732651e-07, + "loss": 0.72187448, + "num_input_tokens_seen": 297560000, + "router_z_loss_clip": 0.68603516, + "router_z_loss_mlp": 0.1182251, + "step": 13792, + "time_per_iteration": 2.4486024379730225 + }, + { + "auxiliary_loss_clip": 0.01114752, + "auxiliary_loss_mlp": 0.01034997, + "balance_loss_clip": 1.04170036, + "balance_loss_mlp": 1.02156794, + "epoch": 0.829280024049301, + "flos": 28767607395840.0, + "grad_norm": 1.5557955872736662, + "language_loss": 0.64851892, + "learning_rate": 2.9792759134152736e-07, + "loss": 0.67001641, + "num_input_tokens_seen": 297579300, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.13439941, + "step": 13793, + "time_per_iteration": 2.488657236099243 + }, + { + "auxiliary_loss_clip": 0.01117713, + "auxiliary_loss_mlp": 0.0103346, + "balance_loss_clip": 1.0438441, + "balance_loss_mlp": 1.01938128, + "epoch": 0.829340147301969, + "flos": 19938071652480.0, + "grad_norm": 2.7536003334646417, + "language_loss": 0.66551185, + "learning_rate": 2.977231145525461e-07, + "loss": 0.68702364, + "num_input_tokens_seen": 297598095, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.14068604, + "step": 13794, + "time_per_iteration": 2.5570883750915527 + }, + { + "auxiliary_loss_clip": 0.0110835, + "auxiliary_loss_mlp": 0.01038406, + "balance_loss_clip": 1.03516161, + "balance_loss_mlp": 1.02466118, + "epoch": 0.829400270554637, + "flos": 25228575060480.0, + "grad_norm": 1.9453067112638165, + "language_loss": 0.66072583, + "learning_rate": 2.975187023140757e-07, + "loss": 0.6821934, + "num_input_tokens_seen": 297615955, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.1373291, + "step": 13795, + "time_per_iteration": 2.46136736869812 + }, + { + "auxiliary_loss_clip": 0.01108193, + "auxiliary_loss_mlp": 0.01036998, + "balance_loss_clip": 1.04023778, + "balance_loss_mlp": 1.02355134, + "epoch": 0.829460393807305, + "flos": 24463570176000.0, + "grad_norm": 2.04660081614665, + "language_loss": 0.6671598, + "learning_rate": 2.973143546338661e-07, + "loss": 0.68861175, + "num_input_tokens_seen": 297636285, + "router_z_loss_clip": 0.68017578, + "router_z_loss_mlp": 0.13458252, + "step": 13796, + "time_per_iteration": 2.468578577041626 + }, + { + "auxiliary_loss_clip": 0.01109346, + "auxiliary_loss_mlp": 0.01036287, + "balance_loss_clip": 1.03868055, + "balance_loss_mlp": 1.02307272, + "epoch": 0.829520517059973, + "flos": 15122486891520.0, + "grad_norm": 1.7076357184477344, + "language_loss": 0.71734083, + "learning_rate": 2.971100715196666e-07, + "loss": 0.73879719, + "num_input_tokens_seen": 297653315, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.13226318, + "step": 13797, + "time_per_iteration": 3.8530547618865967 + }, + { + "auxiliary_loss_clip": 0.01116432, + "auxiliary_loss_mlp": 0.01043427, + "balance_loss_clip": 1.04204059, + "balance_loss_mlp": 1.03029013, + "epoch": 0.8295806403126409, + "flos": 21579979265280.0, + "grad_norm": 1.816710746294799, + "language_loss": 0.72175407, + "learning_rate": 2.969058529792243e-07, + "loss": 0.74335265, + "num_input_tokens_seen": 297673480, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.13146973, + "step": 13798, + "time_per_iteration": 2.4732065200805664 + }, + { + "auxiliary_loss_clip": 0.01113633, + "auxiliary_loss_mlp": 0.01028901, + "balance_loss_clip": 1.04568756, + "balance_loss_mlp": 1.01755786, + "epoch": 0.8296407635653089, + "flos": 21726566668800.0, + "grad_norm": 1.6747412801749078, + "language_loss": 0.7641052, + "learning_rate": 2.967016990202822e-07, + "loss": 0.78553057, + "num_input_tokens_seen": 297693250, + "router_z_loss_clip": 0.68017578, + "router_z_loss_mlp": 0.11358643, + "step": 13799, + "time_per_iteration": 2.540116786956787 + }, + { + "auxiliary_loss_clip": 0.01115626, + "auxiliary_loss_mlp": 0.01038782, + "balance_loss_clip": 1.04344749, + "balance_loss_mlp": 1.02685511, + "epoch": 0.8297008868179768, + "flos": 11181147252480.0, + "grad_norm": 2.122617270383677, + "language_loss": 0.67944574, + "learning_rate": 2.9649760965058245e-07, + "loss": 0.70098984, + "num_input_tokens_seen": 297710975, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11938477, + "step": 13800, + "time_per_iteration": 2.3996031284332275 + }, + { + "auxiliary_loss_clip": 0.01120184, + "auxiliary_loss_mlp": 0.01029373, + "balance_loss_clip": 1.04493821, + "balance_loss_mlp": 1.01603329, + "epoch": 0.8297610100706448, + "flos": 20664041431680.0, + "grad_norm": 1.762540414362239, + "language_loss": 0.74328947, + "learning_rate": 2.9629358487786515e-07, + "loss": 0.76478505, + "num_input_tokens_seen": 297730860, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.13354492, + "step": 13801, + "time_per_iteration": 3.7961928844451904 + }, + { + "auxiliary_loss_clip": 0.01114393, + "auxiliary_loss_mlp": 0.01030524, + "balance_loss_clip": 1.0419085, + "balance_loss_mlp": 1.01957488, + "epoch": 0.8298211333233128, + "flos": 20376325491840.0, + "grad_norm": 2.0274820770078477, + "language_loss": 0.73899251, + "learning_rate": 2.9608962470986476e-07, + "loss": 0.76044166, + "num_input_tokens_seen": 297749765, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.10955811, + "step": 13802, + "time_per_iteration": 2.407541036605835 + }, + { + "auxiliary_loss_clip": 0.01119556, + "auxiliary_loss_mlp": 0.01029351, + "balance_loss_clip": 1.04699516, + "balance_loss_mlp": 1.0183301, + "epoch": 0.8298812565759808, + "flos": 21508696725120.0, + "grad_norm": 1.494274566909017, + "language_loss": 0.74903715, + "learning_rate": 2.9588572915431644e-07, + "loss": 0.77052617, + "num_input_tokens_seen": 297770380, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11022949, + "step": 13803, + "time_per_iteration": 2.492497682571411 + }, + { + "auxiliary_loss_clip": 0.0111882, + "auxiliary_loss_mlp": 0.01043526, + "balance_loss_clip": 1.04307711, + "balance_loss_mlp": 1.03075314, + "epoch": 0.8299413798286487, + "flos": 22818681734400.0, + "grad_norm": 1.636626973867754, + "language_loss": 0.79310989, + "learning_rate": 2.9568189821895215e-07, + "loss": 0.81473339, + "num_input_tokens_seen": 297789440, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12768555, + "step": 13804, + "time_per_iteration": 2.4056050777435303 + }, + { + "auxiliary_loss_clip": 0.01126241, + "auxiliary_loss_mlp": 0.0102591, + "balance_loss_clip": 1.05173421, + "balance_loss_mlp": 1.01472855, + "epoch": 0.8300015030813167, + "flos": 29679199683840.0, + "grad_norm": 1.6481140597977957, + "language_loss": 0.73131084, + "learning_rate": 2.954781319115016e-07, + "loss": 0.75283235, + "num_input_tokens_seen": 297810425, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11181641, + "step": 13805, + "time_per_iteration": 2.5006256103515625 + }, + { + "auxiliary_loss_clip": 0.01116759, + "auxiliary_loss_mlp": 0.01029016, + "balance_loss_clip": 1.04409313, + "balance_loss_mlp": 1.0174408, + "epoch": 0.8300616263339846, + "flos": 19719483436800.0, + "grad_norm": 1.9845759627606585, + "language_loss": 0.77669966, + "learning_rate": 2.952744302396906e-07, + "loss": 0.79815745, + "num_input_tokens_seen": 297827680, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11578369, + "step": 13806, + "time_per_iteration": 2.4157047271728516 + }, + { + "auxiliary_loss_clip": 0.01116814, + "auxiliary_loss_mlp": 0.01034787, + "balance_loss_clip": 1.04128814, + "balance_loss_mlp": 1.02221, + "epoch": 0.8301217495866526, + "flos": 19901945548800.0, + "grad_norm": 3.0842407452951774, + "language_loss": 0.63415724, + "learning_rate": 2.950707932112444e-07, + "loss": 0.65567327, + "num_input_tokens_seen": 297848005, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.12573242, + "step": 13807, + "time_per_iteration": 2.584202766418457 + }, + { + "auxiliary_loss_clip": 0.01123343, + "auxiliary_loss_mlp": 0.01030519, + "balance_loss_clip": 1.04823101, + "balance_loss_mlp": 1.01845527, + "epoch": 0.8301818728393207, + "flos": 19715784336000.0, + "grad_norm": 1.6479007480542294, + "language_loss": 0.72755826, + "learning_rate": 2.948672208338847e-07, + "loss": 0.74909687, + "num_input_tokens_seen": 297866730, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12054443, + "step": 13808, + "time_per_iteration": 2.411588668823242 + }, + { + "auxiliary_loss_clip": 0.01126251, + "auxiliary_loss_mlp": 0.01041175, + "balance_loss_clip": 1.0487442, + "balance_loss_mlp": 1.02790117, + "epoch": 0.8302419960919886, + "flos": 28293658416000.0, + "grad_norm": 1.7943236368975786, + "language_loss": 0.66686678, + "learning_rate": 2.9466371311533046e-07, + "loss": 0.68854105, + "num_input_tokens_seen": 297886390, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.1328125, + "step": 13809, + "time_per_iteration": 2.5244028568267822 + }, + { + "auxiliary_loss_clip": 0.01117722, + "auxiliary_loss_mlp": 0.01027365, + "balance_loss_clip": 1.04502451, + "balance_loss_mlp": 1.01611137, + "epoch": 0.8303021193446566, + "flos": 18223444955520.0, + "grad_norm": 2.4071447731473556, + "language_loss": 0.74124956, + "learning_rate": 2.9446027006329896e-07, + "loss": 0.76270044, + "num_input_tokens_seen": 297905110, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11260986, + "step": 13810, + "time_per_iteration": 3.8646185398101807 + }, + { + "auxiliary_loss_clip": 0.01117636, + "auxiliary_loss_mlp": 0.01034811, + "balance_loss_clip": 1.04758143, + "balance_loss_mlp": 1.02402854, + "epoch": 0.8303622425973245, + "flos": 23111425578240.0, + "grad_norm": 1.5583022785956964, + "language_loss": 0.81359303, + "learning_rate": 2.94256891685505e-07, + "loss": 0.83511752, + "num_input_tokens_seen": 297925460, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.10778809, + "step": 13811, + "time_per_iteration": 2.467594861984253 + }, + { + "auxiliary_loss_clip": 0.01121985, + "auxiliary_loss_mlp": 0.01032744, + "balance_loss_clip": 1.04883289, + "balance_loss_mlp": 1.02133584, + "epoch": 0.8304223658499925, + "flos": 19572860119680.0, + "grad_norm": 1.9442907598543244, + "language_loss": 0.73578191, + "learning_rate": 2.9405357798966156e-07, + "loss": 0.75732923, + "num_input_tokens_seen": 297941760, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11413574, + "step": 13812, + "time_per_iteration": 2.4406068325042725 + }, + { + "auxiliary_loss_clip": 0.01112694, + "auxiliary_loss_mlp": 0.01025467, + "balance_loss_clip": 1.04356861, + "balance_loss_mlp": 1.01413655, + "epoch": 0.8304824891026604, + "flos": 24426115269120.0, + "grad_norm": 1.542177365145778, + "language_loss": 0.78323525, + "learning_rate": 2.9385032898347664e-07, + "loss": 0.80461687, + "num_input_tokens_seen": 297959745, + "router_z_loss_clip": 0.69091797, + "router_z_loss_mlp": 0.11328125, + "step": 13813, + "time_per_iteration": 2.460502862930298 + }, + { + "auxiliary_loss_clip": 0.01124146, + "auxiliary_loss_mlp": 0.0102629, + "balance_loss_clip": 1.04662812, + "balance_loss_mlp": 1.01379085, + "epoch": 0.8305426123553284, + "flos": 22381792611840.0, + "grad_norm": 1.780469929055121, + "language_loss": 0.70985484, + "learning_rate": 2.93647144674658e-07, + "loss": 0.73135918, + "num_input_tokens_seen": 297977665, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.125, + "step": 13814, + "time_per_iteration": 2.4300894737243652 + }, + { + "auxiliary_loss_clip": 0.01123684, + "auxiliary_loss_mlp": 0.01037921, + "balance_loss_clip": 1.0444597, + "balance_loss_mlp": 1.02465308, + "epoch": 0.8306027356079964, + "flos": 14903575453440.0, + "grad_norm": 2.0049569253856427, + "language_loss": 0.67887998, + "learning_rate": 2.9344402507091116e-07, + "loss": 0.70049602, + "num_input_tokens_seen": 297993525, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.13269043, + "step": 13815, + "time_per_iteration": 2.436904191970825 + }, + { + "auxiliary_loss_clip": 0.01120685, + "auxiliary_loss_mlp": 0.01028841, + "balance_loss_clip": 1.04802692, + "balance_loss_mlp": 1.01691377, + "epoch": 0.8306628588606644, + "flos": 19644573623040.0, + "grad_norm": 1.8224200839215747, + "language_loss": 0.75709975, + "learning_rate": 2.9324097017993745e-07, + "loss": 0.77859497, + "num_input_tokens_seen": 298012920, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11920166, + "step": 13816, + "time_per_iteration": 2.555856704711914 + }, + { + "auxiliary_loss_clip": 0.01112221, + "auxiliary_loss_mlp": 0.01030234, + "balance_loss_clip": 1.04179966, + "balance_loss_mlp": 1.01974368, + "epoch": 0.8307229821133323, + "flos": 24389737770240.0, + "grad_norm": 1.9222907482326295, + "language_loss": 0.81157726, + "learning_rate": 2.930379800094371e-07, + "loss": 0.83300185, + "num_input_tokens_seen": 298033310, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.10498047, + "step": 13817, + "time_per_iteration": 2.4611284732818604 + }, + { + "auxiliary_loss_clip": 0.0111777, + "auxiliary_loss_mlp": 0.01034063, + "balance_loss_clip": 1.0449059, + "balance_loss_mlp": 1.02121282, + "epoch": 0.8307831053660003, + "flos": 20996933702400.0, + "grad_norm": 1.5575906475413503, + "language_loss": 0.78101444, + "learning_rate": 2.9283505456710875e-07, + "loss": 0.80253279, + "num_input_tokens_seen": 298053530, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.12854004, + "step": 13818, + "time_per_iteration": 2.461042881011963 + }, + { + "auxiliary_loss_clip": 0.01119682, + "auxiliary_loss_mlp": 0.01032673, + "balance_loss_clip": 1.04717922, + "balance_loss_mlp": 1.02087128, + "epoch": 0.8308432286186682, + "flos": 21397301671680.0, + "grad_norm": 2.5424002170130366, + "language_loss": 0.82052588, + "learning_rate": 2.926321938606453e-07, + "loss": 0.84204948, + "num_input_tokens_seen": 298069305, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11804199, + "step": 13819, + "time_per_iteration": 2.4348552227020264 + }, + { + "auxiliary_loss_clip": 0.01043898, + "auxiliary_loss_mlp": 0.01005076, + "balance_loss_clip": 1.01816058, + "balance_loss_mlp": 1.00360215, + "epoch": 0.8309033518713362, + "flos": 62533656714240.0, + "grad_norm": 0.7553203358632907, + "language_loss": 0.56201756, + "learning_rate": 2.924293978977399e-07, + "loss": 0.58250731, + "num_input_tokens_seen": 298125830, + "router_z_loss_clip": 0.25708008, + "router_z_loss_mlp": 0.01473999, + "step": 13820, + "time_per_iteration": 3.089012861251831 + }, + { + "auxiliary_loss_clip": 0.01116506, + "auxiliary_loss_mlp": 0.01023264, + "balance_loss_clip": 1.04623342, + "balance_loss_mlp": 1.01164699, + "epoch": 0.8309634751240043, + "flos": 16979104051200.0, + "grad_norm": 1.909083265428042, + "language_loss": 0.68278909, + "learning_rate": 2.922266666860831e-07, + "loss": 0.7041868, + "num_input_tokens_seen": 298142320, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.1161499, + "step": 13821, + "time_per_iteration": 2.4446771144866943 + }, + { + "auxiliary_loss_clip": 0.01115741, + "auxiliary_loss_mlp": 0.01031763, + "balance_loss_clip": 1.04204941, + "balance_loss_mlp": 1.01953816, + "epoch": 0.8310235983766722, + "flos": 22674464628480.0, + "grad_norm": 1.8638485098474054, + "language_loss": 0.691522, + "learning_rate": 2.920240002333625e-07, + "loss": 0.71299696, + "num_input_tokens_seen": 298161845, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.12237549, + "step": 13822, + "time_per_iteration": 2.468219041824341 + }, + { + "auxiliary_loss_clip": 0.01110094, + "auxiliary_loss_mlp": 0.01029951, + "balance_loss_clip": 1.03947735, + "balance_loss_mlp": 1.01919854, + "epoch": 0.8310837216293402, + "flos": 30811463176320.0, + "grad_norm": 1.727576623650988, + "language_loss": 0.62168777, + "learning_rate": 2.918213985472631e-07, + "loss": 0.64308822, + "num_input_tokens_seen": 298184165, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.10754395, + "step": 13823, + "time_per_iteration": 2.5490853786468506 + }, + { + "auxiliary_loss_clip": 0.0104127, + "auxiliary_loss_mlp": 0.01008562, + "balance_loss_clip": 1.01605439, + "balance_loss_mlp": 1.00714791, + "epoch": 0.8311438448820081, + "flos": 71276074997760.0, + "grad_norm": 0.8640967042123757, + "language_loss": 0.6184783, + "learning_rate": 2.916188616354669e-07, + "loss": 0.63897657, + "num_input_tokens_seen": 298251720, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.01412964, + "step": 13824, + "time_per_iteration": 3.1363117694854736 + }, + { + "auxiliary_loss_clip": 0.01115497, + "auxiliary_loss_mlp": 0.01027183, + "balance_loss_clip": 1.04469895, + "balance_loss_mlp": 1.01612628, + "epoch": 0.8312039681346761, + "flos": 20887082933760.0, + "grad_norm": 1.5431315425963281, + "language_loss": 0.7373572, + "learning_rate": 2.914163895056552e-07, + "loss": 0.758784, + "num_input_tokens_seen": 298271910, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.1105957, + "step": 13825, + "time_per_iteration": 3.8605945110321045 + }, + { + "auxiliary_loss_clip": 0.01121742, + "auxiliary_loss_mlp": 0.01036782, + "balance_loss_clip": 1.04462826, + "balance_loss_mlp": 1.02370441, + "epoch": 0.831264091387344, + "flos": 17017528625280.0, + "grad_norm": 2.003625913739484, + "language_loss": 0.80181026, + "learning_rate": 2.9121398216550486e-07, + "loss": 0.82339555, + "num_input_tokens_seen": 298288105, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.13079834, + "step": 13826, + "time_per_iteration": 2.475264072418213 + }, + { + "auxiliary_loss_clip": 0.01117909, + "auxiliary_loss_mlp": 0.01026573, + "balance_loss_clip": 1.04571044, + "balance_loss_mlp": 1.01446104, + "epoch": 0.831324214640012, + "flos": 24419578993920.0, + "grad_norm": 1.6731018593999132, + "language_loss": 0.67834842, + "learning_rate": 2.910116396226914e-07, + "loss": 0.69979322, + "num_input_tokens_seen": 298307600, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.12115479, + "step": 13827, + "time_per_iteration": 2.479236364364624 + }, + { + "auxiliary_loss_clip": 0.01118378, + "auxiliary_loss_mlp": 0.0102397, + "balance_loss_clip": 1.04706895, + "balance_loss_mlp": 1.0133065, + "epoch": 0.83138433789268, + "flos": 13545576938880.0, + "grad_norm": 1.8768780226286303, + "language_loss": 0.74131393, + "learning_rate": 2.9080936188488834e-07, + "loss": 0.76273739, + "num_input_tokens_seen": 298323055, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.10675049, + "step": 13828, + "time_per_iteration": 2.4177987575531006 + }, + { + "auxiliary_loss_clip": 0.01118069, + "auxiliary_loss_mlp": 0.01034731, + "balance_loss_clip": 1.04550743, + "balance_loss_mlp": 1.02267909, + "epoch": 0.831444461145348, + "flos": 44492386561920.0, + "grad_norm": 2.0853841853482757, + "language_loss": 0.67154145, + "learning_rate": 2.906071489597657e-07, + "loss": 0.69306946, + "num_input_tokens_seen": 298346950, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.12042236, + "step": 13829, + "time_per_iteration": 2.6421456336975098 + }, + { + "auxiliary_loss_clip": 0.0111854, + "auxiliary_loss_mlp": 0.01029356, + "balance_loss_clip": 1.04433739, + "balance_loss_mlp": 1.01715493, + "epoch": 0.8315045843980159, + "flos": 22705024124160.0, + "grad_norm": 1.7095668473095205, + "language_loss": 0.82732749, + "learning_rate": 2.9040500085499054e-07, + "loss": 0.84880656, + "num_input_tokens_seen": 298366315, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.12213135, + "step": 13830, + "time_per_iteration": 2.4260077476501465 + }, + { + "auxiliary_loss_clip": 0.01115365, + "auxiliary_loss_mlp": 0.01034246, + "balance_loss_clip": 1.04412746, + "balance_loss_mlp": 1.02285004, + "epoch": 0.8315647076506839, + "flos": 16873491087360.0, + "grad_norm": 2.186331557043752, + "language_loss": 0.74747449, + "learning_rate": 2.9020291757822925e-07, + "loss": 0.76897055, + "num_input_tokens_seen": 298385185, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.11401367, + "step": 13831, + "time_per_iteration": 2.3994884490966797 + }, + { + "auxiliary_loss_clip": 0.0111791, + "auxiliary_loss_mlp": 0.01030303, + "balance_loss_clip": 1.04549384, + "balance_loss_mlp": 1.01802433, + "epoch": 0.8316248309033518, + "flos": 13808730954240.0, + "grad_norm": 1.6754316187476743, + "language_loss": 0.71494532, + "learning_rate": 2.9000089913714523e-07, + "loss": 0.73642743, + "num_input_tokens_seen": 298402335, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.1227417, + "step": 13832, + "time_per_iteration": 2.4473352432250977 + }, + { + "auxiliary_loss_clip": 0.01111881, + "auxiliary_loss_mlp": 0.01030879, + "balance_loss_clip": 1.04028237, + "balance_loss_mlp": 1.01908922, + "epoch": 0.8316849541560198, + "flos": 23512511819520.0, + "grad_norm": 1.9410758981317162, + "language_loss": 0.84479678, + "learning_rate": 2.897989455393979e-07, + "loss": 0.86622441, + "num_input_tokens_seen": 298423370, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11798096, + "step": 13833, + "time_per_iteration": 2.5706565380096436 + }, + { + "auxiliary_loss_clip": 0.01122774, + "auxiliary_loss_mlp": 0.01031617, + "balance_loss_clip": 1.04896474, + "balance_loss_mlp": 1.01911139, + "epoch": 0.8317450774086879, + "flos": 23771356202880.0, + "grad_norm": 2.5330697569126546, + "language_loss": 0.76019508, + "learning_rate": 2.8959705679264625e-07, + "loss": 0.781739, + "num_input_tokens_seen": 298444835, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.12506104, + "step": 13834, + "time_per_iteration": 2.532583475112915 + }, + { + "auxiliary_loss_clip": 0.01112963, + "auxiliary_loss_mlp": 0.01027437, + "balance_loss_clip": 1.04318321, + "balance_loss_mlp": 1.0165, + "epoch": 0.8318052006613558, + "flos": 16215535710720.0, + "grad_norm": 2.309842494619403, + "language_loss": 0.79270244, + "learning_rate": 2.893952329045459e-07, + "loss": 0.8141064, + "num_input_tokens_seen": 298461845, + "router_z_loss_clip": 0.69775391, + "router_z_loss_mlp": 0.109375, + "step": 13835, + "time_per_iteration": 2.409931182861328 + }, + { + "auxiliary_loss_clip": 0.0112002, + "auxiliary_loss_mlp": 0.01038852, + "balance_loss_clip": 1.04544091, + "balance_loss_mlp": 1.02507162, + "epoch": 0.8318653239140238, + "flos": 19974556892160.0, + "grad_norm": 1.982140804598188, + "language_loss": 0.81099194, + "learning_rate": 2.8919347388274905e-07, + "loss": 0.83258063, + "num_input_tokens_seen": 298479095, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.13775635, + "step": 13836, + "time_per_iteration": 2.424621105194092 + }, + { + "auxiliary_loss_clip": 0.01113643, + "auxiliary_loss_mlp": 0.01028254, + "balance_loss_clip": 1.04190075, + "balance_loss_mlp": 1.01725698, + "epoch": 0.8319254471666917, + "flos": 17704714694400.0, + "grad_norm": 1.9196857906712321, + "language_loss": 0.77539927, + "learning_rate": 2.8899177973490727e-07, + "loss": 0.7968182, + "num_input_tokens_seen": 298494475, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.10992432, + "step": 13837, + "time_per_iteration": 2.404266119003296 + }, + { + "auxiliary_loss_clip": 0.01122823, + "auxiliary_loss_mlp": 0.01029687, + "balance_loss_clip": 1.04486585, + "balance_loss_mlp": 1.01663339, + "epoch": 0.8319855704193597, + "flos": 19536554448000.0, + "grad_norm": 1.7654992709128305, + "language_loss": 0.83325547, + "learning_rate": 2.887901504686685e-07, + "loss": 0.85478061, + "num_input_tokens_seen": 298513185, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.13049316, + "step": 13838, + "time_per_iteration": 2.4324305057525635 + }, + { + "auxiliary_loss_clip": 0.01116434, + "auxiliary_loss_mlp": 0.01031009, + "balance_loss_clip": 1.04580653, + "balance_loss_mlp": 1.01890934, + "epoch": 0.8320456936720276, + "flos": 21178067011200.0, + "grad_norm": 2.933327302312677, + "language_loss": 0.74736488, + "learning_rate": 2.885885860916795e-07, + "loss": 0.76883924, + "num_input_tokens_seen": 298531885, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.12109375, + "step": 13839, + "time_per_iteration": 2.4602084159851074 + }, + { + "auxiliary_loss_clip": 0.01118773, + "auxiliary_loss_mlp": 0.01030952, + "balance_loss_clip": 1.04632103, + "balance_loss_mlp": 1.0187571, + "epoch": 0.8321058169246957, + "flos": 33250874503680.0, + "grad_norm": 1.566244838207566, + "language_loss": 0.67814708, + "learning_rate": 2.8838708661158253e-07, + "loss": 0.69964433, + "num_input_tokens_seen": 298554905, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.12188721, + "step": 13840, + "time_per_iteration": 2.5407822132110596 + }, + { + "auxiliary_loss_clip": 0.01113574, + "auxiliary_loss_mlp": 0.01025345, + "balance_loss_clip": 1.041767, + "balance_loss_mlp": 1.01387739, + "epoch": 0.8321659401773636, + "flos": 14208129256320.0, + "grad_norm": 2.2101526934533795, + "language_loss": 0.79422933, + "learning_rate": 2.8818565203601843e-07, + "loss": 0.81561852, + "num_input_tokens_seen": 298571185, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11474609, + "step": 13841, + "time_per_iteration": 3.835014820098877 + }, + { + "auxiliary_loss_clip": 0.01119791, + "auxiliary_loss_mlp": 0.01026639, + "balance_loss_clip": 1.05002236, + "balance_loss_mlp": 1.01495647, + "epoch": 0.8322260634300316, + "flos": 15158253859200.0, + "grad_norm": 1.7174697863967983, + "language_loss": 0.68471605, + "learning_rate": 2.879842823726262e-07, + "loss": 0.70618033, + "num_input_tokens_seen": 298588505, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.11688232, + "step": 13842, + "time_per_iteration": 2.3913822174072266 + }, + { + "auxiliary_loss_clip": 0.01115052, + "auxiliary_loss_mlp": 0.01025719, + "balance_loss_clip": 1.04338932, + "balance_loss_mlp": 1.01377988, + "epoch": 0.8322861866826995, + "flos": 25300827267840.0, + "grad_norm": 1.695777177084833, + "language_loss": 0.73236418, + "learning_rate": 2.8778297762904124e-07, + "loss": 0.75377178, + "num_input_tokens_seen": 298609295, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11932373, + "step": 13843, + "time_per_iteration": 2.495994806289673 + }, + { + "auxiliary_loss_clip": 0.0111282, + "auxiliary_loss_mlp": 0.01026996, + "balance_loss_clip": 1.04240119, + "balance_loss_mlp": 1.01528358, + "epoch": 0.8323463099353675, + "flos": 17019360218880.0, + "grad_norm": 2.589187909098583, + "language_loss": 0.77592659, + "learning_rate": 2.875817378128975e-07, + "loss": 0.79732472, + "num_input_tokens_seen": 298625765, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.1171875, + "step": 13844, + "time_per_iteration": 2.4166438579559326 + }, + { + "auxiliary_loss_clip": 0.01042692, + "auxiliary_loss_mlp": 0.01005668, + "balance_loss_clip": 1.01674497, + "balance_loss_mlp": 1.00428104, + "epoch": 0.8324064331880354, + "flos": 55607889709440.0, + "grad_norm": 0.8818185925081267, + "language_loss": 0.55246657, + "learning_rate": 2.8738056293182624e-07, + "loss": 0.57295012, + "num_input_tokens_seen": 298683005, + "router_z_loss_clip": 0.25878906, + "router_z_loss_mlp": 0.01387024, + "step": 13845, + "time_per_iteration": 2.959540367126465 + }, + { + "auxiliary_loss_clip": 0.01120741, + "auxiliary_loss_mlp": 0.01040829, + "balance_loss_clip": 1.04710853, + "balance_loss_mlp": 1.02853823, + "epoch": 0.8324665564407034, + "flos": 26138623063680.0, + "grad_norm": 2.1582272722136504, + "language_loss": 0.75446355, + "learning_rate": 2.871794529934555e-07, + "loss": 0.77607924, + "num_input_tokens_seen": 298703060, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.1229248, + "step": 13846, + "time_per_iteration": 4.060023546218872 + }, + { + "auxiliary_loss_clip": 0.01113386, + "auxiliary_loss_mlp": 0.01032553, + "balance_loss_clip": 1.03857744, + "balance_loss_mlp": 1.01871276, + "epoch": 0.8325266796933715, + "flos": 22049187649920.0, + "grad_norm": 1.7061614780609062, + "language_loss": 0.78848481, + "learning_rate": 2.8697840800541115e-07, + "loss": 0.80994421, + "num_input_tokens_seen": 298721765, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.13861084, + "step": 13847, + "time_per_iteration": 2.454918146133423 + }, + { + "auxiliary_loss_clip": 0.01112423, + "auxiliary_loss_mlp": 0.01024403, + "balance_loss_clip": 1.04160154, + "balance_loss_mlp": 1.01382291, + "epoch": 0.8325868029460394, + "flos": 22816634659200.0, + "grad_norm": 1.5918768184447616, + "language_loss": 0.74419785, + "learning_rate": 2.867774279753175e-07, + "loss": 0.76556611, + "num_input_tokens_seen": 298740825, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.105896, + "step": 13848, + "time_per_iteration": 2.433783531188965 + }, + { + "auxiliary_loss_clip": 0.0111042, + "auxiliary_loss_mlp": 0.01027714, + "balance_loss_clip": 1.03948772, + "balance_loss_mlp": 1.0165379, + "epoch": 0.8326469261987074, + "flos": 14757454926720.0, + "grad_norm": 1.8004513327421525, + "language_loss": 0.63294965, + "learning_rate": 2.8657651291079554e-07, + "loss": 0.65433097, + "num_input_tokens_seen": 298758515, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.11181641, + "step": 13849, + "time_per_iteration": 2.399573802947998 + }, + { + "auxiliary_loss_clip": 0.01116864, + "auxiliary_loss_mlp": 0.01030298, + "balance_loss_clip": 1.04299366, + "balance_loss_mlp": 1.018538, + "epoch": 0.8327070494513753, + "flos": 22926126291840.0, + "grad_norm": 3.1303901959021205, + "language_loss": 0.80070287, + "learning_rate": 2.863756628194638e-07, + "loss": 0.82217455, + "num_input_tokens_seen": 298776375, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11755371, + "step": 13850, + "time_per_iteration": 2.4478559494018555 + }, + { + "auxiliary_loss_clip": 0.01112021, + "auxiliary_loss_mlp": 0.01028636, + "balance_loss_clip": 1.04396999, + "balance_loss_mlp": 1.01817584, + "epoch": 0.8327671727040433, + "flos": 20665334321280.0, + "grad_norm": 1.613380678463222, + "language_loss": 0.7809695, + "learning_rate": 2.8617487770893877e-07, + "loss": 0.80237603, + "num_input_tokens_seen": 298795135, + "router_z_loss_clip": 0.68017578, + "router_z_loss_mlp": 0.10467529, + "step": 13851, + "time_per_iteration": 2.4584171772003174 + }, + { + "auxiliary_loss_clip": 0.01060031, + "auxiliary_loss_mlp": 0.01005475, + "balance_loss_clip": 1.03590798, + "balance_loss_mlp": 1.00402808, + "epoch": 0.8328272959567112, + "flos": 56060760384000.0, + "grad_norm": 0.7571327260180306, + "language_loss": 0.55829298, + "learning_rate": 2.859741575868344e-07, + "loss": 0.57894808, + "num_input_tokens_seen": 298855475, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.01448059, + "step": 13852, + "time_per_iteration": 3.1268703937530518 + }, + { + "auxiliary_loss_clip": 0.01110656, + "auxiliary_loss_mlp": 0.01033346, + "balance_loss_clip": 1.04202676, + "balance_loss_mlp": 1.02061439, + "epoch": 0.8328874192093793, + "flos": 32303084284800.0, + "grad_norm": 1.8408798007215055, + "language_loss": 0.67387772, + "learning_rate": 2.8577350246076125e-07, + "loss": 0.69531775, + "num_input_tokens_seen": 298875875, + "router_z_loss_clip": 0.68603516, + "router_z_loss_mlp": 0.12738037, + "step": 13853, + "time_per_iteration": 3.966212034225464 + }, + { + "auxiliary_loss_clip": 0.01118048, + "auxiliary_loss_mlp": 0.01030238, + "balance_loss_clip": 1.04509485, + "balance_loss_mlp": 1.01884127, + "epoch": 0.8329475424620472, + "flos": 23512691387520.0, + "grad_norm": 1.5781431485772437, + "language_loss": 0.78771937, + "learning_rate": 2.855729123383286e-07, + "loss": 0.80920231, + "num_input_tokens_seen": 298895950, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11401367, + "step": 13854, + "time_per_iteration": 2.4532968997955322 + }, + { + "auxiliary_loss_clip": 0.01045962, + "auxiliary_loss_mlp": 0.01006326, + "balance_loss_clip": 1.0216186, + "balance_loss_mlp": 1.00496244, + "epoch": 0.8330076657147152, + "flos": 67840680378240.0, + "grad_norm": 0.7634109651105745, + "language_loss": 0.58688706, + "learning_rate": 2.8537238722714295e-07, + "loss": 0.60740995, + "num_input_tokens_seen": 298955770, + "router_z_loss_clip": 0.24365234, + "router_z_loss_mlp": 0.01364136, + "step": 13855, + "time_per_iteration": 2.9654808044433594 + }, + { + "auxiliary_loss_clip": 0.01112343, + "auxiliary_loss_mlp": 0.01027152, + "balance_loss_clip": 1.04123712, + "balance_loss_mlp": 1.01545155, + "epoch": 0.8330677889673831, + "flos": 22892801448960.0, + "grad_norm": 1.6288451902675423, + "language_loss": 0.7176069, + "learning_rate": 2.8517192713480853e-07, + "loss": 0.73900181, + "num_input_tokens_seen": 298976545, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.11700439, + "step": 13856, + "time_per_iteration": 2.4726901054382324 + }, + { + "auxiliary_loss_clip": 0.01107533, + "auxiliary_loss_mlp": 0.01026128, + "balance_loss_clip": 1.03708494, + "balance_loss_mlp": 1.01438642, + "epoch": 0.8331279122200511, + "flos": 27345042184320.0, + "grad_norm": 1.5380822563687395, + "language_loss": 0.75404465, + "learning_rate": 2.8497153206892677e-07, + "loss": 0.77538127, + "num_input_tokens_seen": 298996750, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.11743164, + "step": 13857, + "time_per_iteration": 2.597795009613037 + }, + { + "auxiliary_loss_clip": 0.01106667, + "auxiliary_loss_mlp": 0.01025385, + "balance_loss_clip": 1.03958082, + "balance_loss_mlp": 1.01370883, + "epoch": 0.833188035472719, + "flos": 19938179393280.0, + "grad_norm": 1.4900991053217993, + "language_loss": 0.73498154, + "learning_rate": 2.847712020370958e-07, + "loss": 0.75630212, + "num_input_tokens_seen": 299014895, + "router_z_loss_clip": 0.67041016, + "router_z_loss_mlp": 0.11676025, + "step": 13858, + "time_per_iteration": 2.501861572265625 + }, + { + "auxiliary_loss_clip": 0.01119742, + "auxiliary_loss_mlp": 0.01034411, + "balance_loss_clip": 1.04209244, + "balance_loss_mlp": 1.02189422, + "epoch": 0.833248158725387, + "flos": 15232624968960.0, + "grad_norm": 1.9596983281415994, + "language_loss": 0.73294926, + "learning_rate": 2.8457093704691316e-07, + "loss": 0.75449079, + "num_input_tokens_seen": 299032855, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.12518311, + "step": 13859, + "time_per_iteration": 2.5166633129119873 + }, + { + "auxiliary_loss_clip": 0.01110664, + "auxiliary_loss_mlp": 0.01026919, + "balance_loss_clip": 1.04122865, + "balance_loss_mlp": 1.01583886, + "epoch": 0.8333082819780551, + "flos": 24535535074560.0, + "grad_norm": 1.6498195978670616, + "language_loss": 0.79577255, + "learning_rate": 2.8437073710597205e-07, + "loss": 0.81714845, + "num_input_tokens_seen": 299052055, + "router_z_loss_clip": 0.69433594, + "router_z_loss_mlp": 0.11077881, + "step": 13860, + "time_per_iteration": 2.4916505813598633 + }, + { + "auxiliary_loss_clip": 0.01109927, + "auxiliary_loss_mlp": 0.01030652, + "balance_loss_clip": 1.04002583, + "balance_loss_mlp": 1.01898718, + "epoch": 0.833368405230723, + "flos": 31467407391360.0, + "grad_norm": 1.6605580769122286, + "language_loss": 0.82289088, + "learning_rate": 2.841706022218644e-07, + "loss": 0.84429663, + "num_input_tokens_seen": 299075285, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.11663818, + "step": 13861, + "time_per_iteration": 2.533057451248169 + }, + { + "auxiliary_loss_clip": 0.01115657, + "auxiliary_loss_mlp": 0.01028447, + "balance_loss_clip": 1.04339528, + "balance_loss_mlp": 1.0165379, + "epoch": 0.833428528483391, + "flos": 14902713527040.0, + "grad_norm": 1.8805841856230283, + "language_loss": 0.79017228, + "learning_rate": 2.839705324021806e-07, + "loss": 0.81161332, + "num_input_tokens_seen": 299092520, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11914062, + "step": 13862, + "time_per_iteration": 2.4134066104888916 + }, + { + "auxiliary_loss_clip": 0.01111582, + "auxiliary_loss_mlp": 0.01033387, + "balance_loss_clip": 1.03937769, + "balance_loss_mlp": 1.02193093, + "epoch": 0.8334886517360589, + "flos": 22199833290240.0, + "grad_norm": 2.0239596368572874, + "language_loss": 0.75377166, + "learning_rate": 2.83770527654505e-07, + "loss": 0.77522141, + "num_input_tokens_seen": 299109450, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11456299, + "step": 13863, + "time_per_iteration": 2.4584574699401855 + }, + { + "auxiliary_loss_clip": 0.01109861, + "auxiliary_loss_mlp": 0.0102715, + "balance_loss_clip": 1.04211688, + "balance_loss_mlp": 1.01626611, + "epoch": 0.8335487749887269, + "flos": 30372562892160.0, + "grad_norm": 2.2920640729838815, + "language_loss": 0.75321668, + "learning_rate": 2.835705879864232e-07, + "loss": 0.7745868, + "num_input_tokens_seen": 299129540, + "router_z_loss_clip": 0.67822266, + "router_z_loss_mlp": 0.10876465, + "step": 13864, + "time_per_iteration": 2.5191190242767334 + }, + { + "auxiliary_loss_clip": 0.01108208, + "auxiliary_loss_mlp": 0.01031632, + "balance_loss_clip": 1.03693366, + "balance_loss_mlp": 1.01941919, + "epoch": 0.8336088982413948, + "flos": 24681152810880.0, + "grad_norm": 1.7914299767897466, + "language_loss": 0.69168997, + "learning_rate": 2.833707134055168e-07, + "loss": 0.71308839, + "num_input_tokens_seen": 299148670, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.12213135, + "step": 13865, + "time_per_iteration": 2.617783546447754 + }, + { + "auxiliary_loss_clip": 0.01111195, + "auxiliary_loss_mlp": 0.01035371, + "balance_loss_clip": 1.03854287, + "balance_loss_mlp": 1.02368283, + "epoch": 0.8336690214940629, + "flos": 38177207873280.0, + "grad_norm": 1.6209189781480515, + "language_loss": 0.756612, + "learning_rate": 2.831709039193653e-07, + "loss": 0.77807772, + "num_input_tokens_seen": 299169330, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11700439, + "step": 13866, + "time_per_iteration": 2.659946918487549 + }, + { + "auxiliary_loss_clip": 0.01035687, + "auxiliary_loss_mlp": 0.01003432, + "balance_loss_clip": 1.01129544, + "balance_loss_mlp": 1.00193715, + "epoch": 0.8337291447467308, + "flos": 55565119589760.0, + "grad_norm": 0.9074242637998464, + "language_loss": 0.63093466, + "learning_rate": 2.8297115953554465e-07, + "loss": 0.65132582, + "num_input_tokens_seen": 299220980, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.01495361, + "step": 13867, + "time_per_iteration": 2.997534990310669 + }, + { + "auxiliary_loss_clip": 0.01115064, + "auxiliary_loss_mlp": 0.01028449, + "balance_loss_clip": 1.04451847, + "balance_loss_mlp": 1.01786923, + "epoch": 0.8337892679993988, + "flos": 24133550993280.0, + "grad_norm": 1.7213301690856384, + "language_loss": 0.72089541, + "learning_rate": 2.827714802616301e-07, + "loss": 0.74233049, + "num_input_tokens_seen": 299240130, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.10577393, + "step": 13868, + "time_per_iteration": 2.4769394397735596 + }, + { + "auxiliary_loss_clip": 0.01115603, + "auxiliary_loss_mlp": 0.01032232, + "balance_loss_clip": 1.04466724, + "balance_loss_mlp": 1.02069294, + "epoch": 0.8338493912520667, + "flos": 28183915388160.0, + "grad_norm": 1.4749393792629706, + "language_loss": 0.80229294, + "learning_rate": 2.8257186610519325e-07, + "loss": 0.82377136, + "num_input_tokens_seen": 299260705, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11535645, + "step": 13869, + "time_per_iteration": 3.9661972522735596 + }, + { + "auxiliary_loss_clip": 0.01114954, + "auxiliary_loss_mlp": 0.0103195, + "balance_loss_clip": 1.04363227, + "balance_loss_mlp": 1.02023184, + "epoch": 0.8339095145047347, + "flos": 22158356060160.0, + "grad_norm": 1.6624009480986064, + "language_loss": 0.82658672, + "learning_rate": 2.823723170738028e-07, + "loss": 0.84805572, + "num_input_tokens_seen": 299278925, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.1171875, + "step": 13870, + "time_per_iteration": 2.4233005046844482 + }, + { + "auxiliary_loss_clip": 0.01110753, + "auxiliary_loss_mlp": 0.01031511, + "balance_loss_clip": 1.03708136, + "balance_loss_mlp": 1.01853454, + "epoch": 0.8339696377574026, + "flos": 17307112072320.0, + "grad_norm": 3.0604153246383174, + "language_loss": 0.71481276, + "learning_rate": 2.821728331750264e-07, + "loss": 0.73623538, + "num_input_tokens_seen": 299291580, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.12969971, + "step": 13871, + "time_per_iteration": 2.3921570777893066 + }, + { + "auxiliary_loss_clip": 0.01119541, + "auxiliary_loss_mlp": 0.01030034, + "balance_loss_clip": 1.0474751, + "balance_loss_mlp": 1.01821494, + "epoch": 0.8340297610100706, + "flos": 20668351063680.0, + "grad_norm": 1.845179724391678, + "language_loss": 0.69116092, + "learning_rate": 2.8197341441642853e-07, + "loss": 0.71265674, + "num_input_tokens_seen": 299310385, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.11816406, + "step": 13872, + "time_per_iteration": 2.5177841186523438 + }, + { + "auxiliary_loss_clip": 0.01127434, + "auxiliary_loss_mlp": 0.01026026, + "balance_loss_clip": 1.05388498, + "balance_loss_mlp": 1.01439667, + "epoch": 0.8340898842627387, + "flos": 20515442866560.0, + "grad_norm": 1.8030247351615671, + "language_loss": 0.73221356, + "learning_rate": 2.817740608055712e-07, + "loss": 0.75374818, + "num_input_tokens_seen": 299327660, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11633301, + "step": 13873, + "time_per_iteration": 2.435401201248169 + }, + { + "auxiliary_loss_clip": 0.01127792, + "auxiliary_loss_mlp": 0.01031358, + "balance_loss_clip": 1.0513103, + "balance_loss_mlp": 1.01769042, + "epoch": 0.8341500075154066, + "flos": 21425850005760.0, + "grad_norm": 2.2573545734041986, + "language_loss": 0.75320345, + "learning_rate": 2.81574772350013e-07, + "loss": 0.77479494, + "num_input_tokens_seen": 299343685, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.13665771, + "step": 13874, + "time_per_iteration": 2.413712978363037 + }, + { + "auxiliary_loss_clip": 0.01114223, + "auxiliary_loss_mlp": 0.01023432, + "balance_loss_clip": 1.04457486, + "balance_loss_mlp": 1.01230407, + "epoch": 0.8342101307680746, + "flos": 22090988102400.0, + "grad_norm": 1.821363972344488, + "language_loss": 0.66002089, + "learning_rate": 2.813755490573118e-07, + "loss": 0.68139744, + "num_input_tokens_seen": 299363305, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.11126709, + "step": 13875, + "time_per_iteration": 2.4821388721466064 + }, + { + "auxiliary_loss_clip": 0.0111392, + "auxiliary_loss_mlp": 0.01033366, + "balance_loss_clip": 1.04299068, + "balance_loss_mlp": 1.02191663, + "epoch": 0.8342702540207425, + "flos": 21871466133120.0, + "grad_norm": 1.9783326410520672, + "language_loss": 0.79550505, + "learning_rate": 2.8117639093502243e-07, + "loss": 0.81697792, + "num_input_tokens_seen": 299382630, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.11437988, + "step": 13876, + "time_per_iteration": 2.4361066818237305 + }, + { + "auxiliary_loss_clip": 0.01117054, + "auxiliary_loss_mlp": 0.01031466, + "balance_loss_clip": 1.04699612, + "balance_loss_mlp": 1.01992047, + "epoch": 0.8343303772734105, + "flos": 22528487756160.0, + "grad_norm": 2.100865990375246, + "language_loss": 0.8721751, + "learning_rate": 2.8097729799069615e-07, + "loss": 0.89366031, + "num_input_tokens_seen": 299402385, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.11541748, + "step": 13877, + "time_per_iteration": 2.4305968284606934 + }, + { + "auxiliary_loss_clip": 0.01111496, + "auxiliary_loss_mlp": 0.01028896, + "balance_loss_clip": 1.04109061, + "balance_loss_mlp": 1.01803017, + "epoch": 0.8343905005260784, + "flos": 14939773384320.0, + "grad_norm": 1.6829779416304214, + "language_loss": 0.69119275, + "learning_rate": 2.807782702318828e-07, + "loss": 0.71259665, + "num_input_tokens_seen": 299419820, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.10858154, + "step": 13878, + "time_per_iteration": 2.3891870975494385 + }, + { + "auxiliary_loss_clip": 0.01112952, + "auxiliary_loss_mlp": 0.01028165, + "balance_loss_clip": 1.04301095, + "balance_loss_mlp": 1.01692367, + "epoch": 0.8344506237787465, + "flos": 15012456554880.0, + "grad_norm": 1.9433361600845163, + "language_loss": 0.79554832, + "learning_rate": 2.805793076661309e-07, + "loss": 0.81695944, + "num_input_tokens_seen": 299436265, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.11254883, + "step": 13879, + "time_per_iteration": 2.448986768722534 + }, + { + "auxiliary_loss_clip": 0.01112283, + "auxiliary_loss_mlp": 0.0102546, + "balance_loss_clip": 1.04234052, + "balance_loss_mlp": 1.01508868, + "epoch": 0.8345107470314144, + "flos": 17560389847680.0, + "grad_norm": 2.948780783647477, + "language_loss": 0.83695954, + "learning_rate": 2.803804103009828e-07, + "loss": 0.85833699, + "num_input_tokens_seen": 299451660, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.1036377, + "step": 13880, + "time_per_iteration": 2.42576265335083 + }, + { + "auxiliary_loss_clip": 0.01116457, + "auxiliary_loss_mlp": 0.01028314, + "balance_loss_clip": 1.04246902, + "balance_loss_mlp": 1.01713252, + "epoch": 0.8345708702840824, + "flos": 25187277398400.0, + "grad_norm": 1.5251707047219512, + "language_loss": 0.77956259, + "learning_rate": 2.80181578143982e-07, + "loss": 0.80101025, + "num_input_tokens_seen": 299472070, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11169434, + "step": 13881, + "time_per_iteration": 2.5249273777008057 + }, + { + "auxiliary_loss_clip": 0.0111605, + "auxiliary_loss_mlp": 0.01027721, + "balance_loss_clip": 1.04685283, + "balance_loss_mlp": 1.01702762, + "epoch": 0.8346309935367503, + "flos": 15083559527040.0, + "grad_norm": 4.472240040150256, + "language_loss": 0.79079282, + "learning_rate": 2.7998281120266807e-07, + "loss": 0.81223053, + "num_input_tokens_seen": 299486725, + "router_z_loss_clip": 0.69189453, + "router_z_loss_mlp": 0.10693359, + "step": 13882, + "time_per_iteration": 2.4260990619659424 + }, + { + "auxiliary_loss_clip": 0.0110672, + "auxiliary_loss_mlp": 0.01039108, + "balance_loss_clip": 1.03659725, + "balance_loss_mlp": 1.02775288, + "epoch": 0.8346911167894183, + "flos": 22930615491840.0, + "grad_norm": 2.020823178798833, + "language_loss": 0.80784595, + "learning_rate": 2.79784109484579e-07, + "loss": 0.82930428, + "num_input_tokens_seen": 299505435, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.11358643, + "step": 13883, + "time_per_iteration": 2.5209264755249023 + }, + { + "auxiliary_loss_clip": 0.01107208, + "auxiliary_loss_mlp": 0.01030393, + "balance_loss_clip": 1.03448057, + "balance_loss_mlp": 1.01858521, + "epoch": 0.8347512400420862, + "flos": 20193037367040.0, + "grad_norm": 7.162326353158478, + "language_loss": 0.74505168, + "learning_rate": 2.795854729972482e-07, + "loss": 0.7664277, + "num_input_tokens_seen": 299523555, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11816406, + "step": 13884, + "time_per_iteration": 3.9419314861297607 + }, + { + "auxiliary_loss_clip": 0.01120381, + "auxiliary_loss_mlp": 0.01035412, + "balance_loss_clip": 1.0428822, + "balance_loss_mlp": 1.02200687, + "epoch": 0.8348113632947542, + "flos": 25954832148480.0, + "grad_norm": 1.8137810048946157, + "language_loss": 0.70429623, + "learning_rate": 2.7938690174820913e-07, + "loss": 0.72585416, + "num_input_tokens_seen": 299541660, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.1340332, + "step": 13885, + "time_per_iteration": 2.4744081497192383 + }, + { + "auxiliary_loss_clip": 0.0111387, + "auxiliary_loss_mlp": 0.01033755, + "balance_loss_clip": 1.04076529, + "balance_loss_mlp": 1.02051139, + "epoch": 0.8348714865474223, + "flos": 34204554552960.0, + "grad_norm": 1.9147416313486907, + "language_loss": 0.69973278, + "learning_rate": 2.791883957449912e-07, + "loss": 0.72120905, + "num_input_tokens_seen": 299562465, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.13238525, + "step": 13886, + "time_per_iteration": 2.6112074851989746 + }, + { + "auxiliary_loss_clip": 0.0111427, + "auxiliary_loss_mlp": 0.01033928, + "balance_loss_clip": 1.04251122, + "balance_loss_mlp": 1.02078533, + "epoch": 0.8349316098000902, + "flos": 24390132819840.0, + "grad_norm": 1.5023657658017482, + "language_loss": 0.79145885, + "learning_rate": 2.7898995499512134e-07, + "loss": 0.81294084, + "num_input_tokens_seen": 299582700, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.13153076, + "step": 13887, + "time_per_iteration": 2.4734253883361816 + }, + { + "auxiliary_loss_clip": 0.01119771, + "auxiliary_loss_mlp": 0.01036759, + "balance_loss_clip": 1.04344237, + "balance_loss_mlp": 1.02403891, + "epoch": 0.8349917330527582, + "flos": 23032744836480.0, + "grad_norm": 2.0790264932936298, + "language_loss": 0.64240903, + "learning_rate": 2.7879157950612467e-07, + "loss": 0.66397434, + "num_input_tokens_seen": 299600310, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.12719727, + "step": 13888, + "time_per_iteration": 2.514542579650879 + }, + { + "auxiliary_loss_clip": 0.0112193, + "auxiliary_loss_mlp": 0.01027136, + "balance_loss_clip": 1.04739833, + "balance_loss_mlp": 1.0157578, + "epoch": 0.8350518563054261, + "flos": 13625873792640.0, + "grad_norm": 2.5122992469842123, + "language_loss": 0.67121553, + "learning_rate": 2.785932692855244e-07, + "loss": 0.69270611, + "num_input_tokens_seen": 299617025, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11364746, + "step": 13889, + "time_per_iteration": 3.842193126678467 + }, + { + "auxiliary_loss_clip": 0.0111032, + "auxiliary_loss_mlp": 0.01025316, + "balance_loss_clip": 1.03875113, + "balance_loss_mlp": 1.01424742, + "epoch": 0.8351119795580941, + "flos": 21579799697280.0, + "grad_norm": 2.4508071360001344, + "language_loss": 0.68302786, + "learning_rate": 2.783950243408399e-07, + "loss": 0.70438427, + "num_input_tokens_seen": 299633050, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11065674, + "step": 13890, + "time_per_iteration": 2.409097194671631 + }, + { + "auxiliary_loss_clip": 0.01116275, + "auxiliary_loss_mlp": 0.01037349, + "balance_loss_clip": 1.04471922, + "balance_loss_mlp": 1.02554727, + "epoch": 0.835172102810762, + "flos": 20038297576320.0, + "grad_norm": 2.570303248108815, + "language_loss": 0.58866996, + "learning_rate": 2.7819684467958817e-07, + "loss": 0.61020619, + "num_input_tokens_seen": 299646445, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11804199, + "step": 13891, + "time_per_iteration": 2.408257246017456 + }, + { + "auxiliary_loss_clip": 0.01112273, + "auxiliary_loss_mlp": 0.01027604, + "balance_loss_clip": 1.04081118, + "balance_loss_mlp": 1.01597548, + "epoch": 0.8352322260634301, + "flos": 25111577485440.0, + "grad_norm": 1.615972865513749, + "language_loss": 0.7174238, + "learning_rate": 2.779987303092846e-07, + "loss": 0.73882258, + "num_input_tokens_seen": 299662665, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11633301, + "step": 13892, + "time_per_iteration": 2.493682384490967 + }, + { + "auxiliary_loss_clip": 0.01104287, + "auxiliary_loss_mlp": 0.0102701, + "balance_loss_clip": 1.03588486, + "balance_loss_mlp": 1.01566744, + "epoch": 0.835292349316098, + "flos": 24863758577280.0, + "grad_norm": 1.6373493862417676, + "language_loss": 0.66300189, + "learning_rate": 2.7780068123744207e-07, + "loss": 0.68431491, + "num_input_tokens_seen": 299683585, + "router_z_loss_clip": 0.68457031, + "router_z_loss_mlp": 0.11340332, + "step": 13893, + "time_per_iteration": 2.5141236782073975 + }, + { + "auxiliary_loss_clip": 0.01110576, + "auxiliary_loss_mlp": 0.01025364, + "balance_loss_clip": 1.03790271, + "balance_loss_mlp": 1.0135448, + "epoch": 0.835352472568766, + "flos": 19865568049920.0, + "grad_norm": 1.9649629481221684, + "language_loss": 0.78575134, + "learning_rate": 2.7760269747156996e-07, + "loss": 0.80711073, + "num_input_tokens_seen": 299702680, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.1182251, + "step": 13894, + "time_per_iteration": 2.4869163036346436 + }, + { + "auxiliary_loss_clip": 0.01109879, + "auxiliary_loss_mlp": 0.01024534, + "balance_loss_clip": 1.04209924, + "balance_loss_mlp": 1.01332808, + "epoch": 0.8354125958214339, + "flos": 22054754257920.0, + "grad_norm": 3.3892914778199565, + "language_loss": 0.72697872, + "learning_rate": 2.7740477901917625e-07, + "loss": 0.74832284, + "num_input_tokens_seen": 299721050, + "router_z_loss_clip": 0.67773438, + "router_z_loss_mlp": 0.11199951, + "step": 13895, + "time_per_iteration": 2.4556050300598145 + }, + { + "auxiliary_loss_clip": 0.0111017, + "auxiliary_loss_mlp": 0.01034166, + "balance_loss_clip": 1.03772354, + "balance_loss_mlp": 1.02120852, + "epoch": 0.8354727190741019, + "flos": 21397804462080.0, + "grad_norm": 2.0265094875836525, + "language_loss": 0.72185665, + "learning_rate": 2.772069258877667e-07, + "loss": 0.74330002, + "num_input_tokens_seen": 299738255, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.12963867, + "step": 13896, + "time_per_iteration": 2.456820011138916 + }, + { + "auxiliary_loss_clip": 0.01112448, + "auxiliary_loss_mlp": 0.01026957, + "balance_loss_clip": 1.04160726, + "balance_loss_mlp": 1.01537037, + "epoch": 0.8355328423267698, + "flos": 50840997834240.0, + "grad_norm": 2.43793666657705, + "language_loss": 0.59059554, + "learning_rate": 2.770091380848423e-07, + "loss": 0.61198962, + "num_input_tokens_seen": 299761315, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.11590576, + "step": 13897, + "time_per_iteration": 4.219532012939453 + }, + { + "auxiliary_loss_clip": 0.01035993, + "auxiliary_loss_mlp": 0.01004072, + "balance_loss_clip": 1.01158142, + "balance_loss_mlp": 1.00280356, + "epoch": 0.8355929655794379, + "flos": 65551052764800.0, + "grad_norm": 0.7020104852647083, + "language_loss": 0.57706702, + "learning_rate": 2.7681141561790423e-07, + "loss": 0.59746766, + "num_input_tokens_seen": 299828735, + "router_z_loss_clip": 0.24438477, + "router_z_loss_mlp": 0.01268005, + "step": 13898, + "time_per_iteration": 3.164592742919922 + }, + { + "auxiliary_loss_clip": 0.01111415, + "auxiliary_loss_mlp": 0.01035693, + "balance_loss_clip": 1.03814626, + "balance_loss_mlp": 1.0225265, + "epoch": 0.8356530888321058, + "flos": 19170516902400.0, + "grad_norm": 1.8320189363828887, + "language_loss": 0.79719824, + "learning_rate": 2.7661375849444967e-07, + "loss": 0.81866938, + "num_input_tokens_seen": 299848395, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.1315918, + "step": 13899, + "time_per_iteration": 2.4826648235321045 + }, + { + "auxiliary_loss_clip": 0.01111443, + "auxiliary_loss_mlp": 0.01027835, + "balance_loss_clip": 1.03954124, + "balance_loss_mlp": 1.01723742, + "epoch": 0.8357132120847738, + "flos": 44126672238720.0, + "grad_norm": 1.8370609073459376, + "language_loss": 0.69034028, + "learning_rate": 2.764161667219749e-07, + "loss": 0.71173304, + "num_input_tokens_seen": 299871665, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.10601807, + "step": 13900, + "time_per_iteration": 2.6273772716522217 + }, + { + "auxiliary_loss_clip": 0.01109936, + "auxiliary_loss_mlp": 0.01031656, + "balance_loss_clip": 1.0403564, + "balance_loss_mlp": 1.02043223, + "epoch": 0.8357733353374418, + "flos": 24389701856640.0, + "grad_norm": 1.491189463540431, + "language_loss": 0.71067631, + "learning_rate": 2.762186403079716e-07, + "loss": 0.73209226, + "num_input_tokens_seen": 299891960, + "router_z_loss_clip": 0.69677734, + "router_z_loss_mlp": 0.11224365, + "step": 13901, + "time_per_iteration": 2.4812231063842773 + }, + { + "auxiliary_loss_clip": 0.01112584, + "auxiliary_loss_mlp": 0.01037506, + "balance_loss_clip": 1.03789818, + "balance_loss_mlp": 1.02470863, + "epoch": 0.8358334585901097, + "flos": 20916313626240.0, + "grad_norm": 2.1083342649071684, + "language_loss": 0.79872429, + "learning_rate": 2.7602117925992963e-07, + "loss": 0.82022518, + "num_input_tokens_seen": 299905070, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.12799072, + "step": 13902, + "time_per_iteration": 2.3996613025665283 + }, + { + "auxiliary_loss_clip": 0.01116256, + "auxiliary_loss_mlp": 0.01032846, + "balance_loss_clip": 1.04443836, + "balance_loss_mlp": 1.02090681, + "epoch": 0.8358935818427777, + "flos": 19244169740160.0, + "grad_norm": 1.619446394493506, + "language_loss": 0.62572336, + "learning_rate": 2.758237835853379e-07, + "loss": 0.64721441, + "num_input_tokens_seen": 299925130, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.1194458, + "step": 13903, + "time_per_iteration": 2.4818129539489746 + }, + { + "auxiliary_loss_clip": 0.0110943, + "auxiliary_loss_mlp": 0.01028644, + "balance_loss_clip": 1.03793216, + "balance_loss_mlp": 1.01736641, + "epoch": 0.8359537050954456, + "flos": 24134053783680.0, + "grad_norm": 1.6937293661904802, + "language_loss": 0.74209547, + "learning_rate": 2.7562645329168054e-07, + "loss": 0.76347619, + "num_input_tokens_seen": 299943845, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11273193, + "step": 13904, + "time_per_iteration": 2.4513726234436035 + }, + { + "auxiliary_loss_clip": 0.01106126, + "auxiliary_loss_mlp": 0.01032158, + "balance_loss_clip": 1.03680456, + "balance_loss_mlp": 1.02070189, + "epoch": 0.8360138283481137, + "flos": 16180415187840.0, + "grad_norm": 1.8717330294960086, + "language_loss": 0.7275871, + "learning_rate": 2.7542918838644104e-07, + "loss": 0.74896997, + "num_input_tokens_seen": 299961620, + "router_z_loss_clip": 0.69287109, + "router_z_loss_mlp": 0.11450195, + "step": 13905, + "time_per_iteration": 2.442934036254883 + }, + { + "auxiliary_loss_clip": 0.0111715, + "auxiliary_loss_mlp": 0.01036313, + "balance_loss_clip": 1.04767132, + "balance_loss_mlp": 1.0256381, + "epoch": 0.8360739516007816, + "flos": 22198899536640.0, + "grad_norm": 3.286357267754811, + "language_loss": 0.66217136, + "learning_rate": 2.752319888771e-07, + "loss": 0.68370593, + "num_input_tokens_seen": 299982170, + "router_z_loss_clip": 0.69433594, + "router_z_loss_mlp": 0.10668945, + "step": 13906, + "time_per_iteration": 2.4855592250823975 + }, + { + "auxiliary_loss_clip": 0.01105789, + "auxiliary_loss_mlp": 0.01026622, + "balance_loss_clip": 1.03459167, + "balance_loss_mlp": 1.01495695, + "epoch": 0.8361340748534496, + "flos": 20923137210240.0, + "grad_norm": 1.461486882605776, + "language_loss": 0.7394309, + "learning_rate": 2.7503485477113475e-07, + "loss": 0.76075506, + "num_input_tokens_seen": 300001330, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.11645508, + "step": 13907, + "time_per_iteration": 2.561617612838745 + }, + { + "auxiliary_loss_clip": 0.01114191, + "auxiliary_loss_mlp": 0.01033572, + "balance_loss_clip": 1.04026985, + "balance_loss_mlp": 1.021276, + "epoch": 0.8361941981061175, + "flos": 26173599932160.0, + "grad_norm": 1.7181279256103865, + "language_loss": 0.75163442, + "learning_rate": 2.7483778607602005e-07, + "loss": 0.77311206, + "num_input_tokens_seen": 300020645, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.12316895, + "step": 13908, + "time_per_iteration": 2.519195318222046 + }, + { + "auxiliary_loss_clip": 0.01112533, + "auxiliary_loss_mlp": 0.01034054, + "balance_loss_clip": 1.03983665, + "balance_loss_mlp": 1.02007627, + "epoch": 0.8362543213587855, + "flos": 24419363512320.0, + "grad_norm": 2.185472351589092, + "language_loss": 0.71301615, + "learning_rate": 2.7464078279922964e-07, + "loss": 0.73448199, + "num_input_tokens_seen": 300039945, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.13983154, + "step": 13909, + "time_per_iteration": 2.482283115386963 + }, + { + "auxiliary_loss_clip": 0.01117276, + "auxiliary_loss_mlp": 0.01038437, + "balance_loss_clip": 1.04147816, + "balance_loss_mlp": 1.0263077, + "epoch": 0.8363144446114534, + "flos": 17202396948480.0, + "grad_norm": 2.0233473597257134, + "language_loss": 0.73129469, + "learning_rate": 2.744438449482338e-07, + "loss": 0.75285184, + "num_input_tokens_seen": 300058260, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12121582, + "step": 13910, + "time_per_iteration": 2.428831100463867 + }, + { + "auxiliary_loss_clip": 0.01116262, + "auxiliary_loss_mlp": 0.01031261, + "balance_loss_clip": 1.04273009, + "balance_loss_mlp": 1.01988304, + "epoch": 0.8363745678641215, + "flos": 19279398003840.0, + "grad_norm": 1.7143406148840385, + "language_loss": 0.7355783, + "learning_rate": 2.742469725305001e-07, + "loss": 0.75705349, + "num_input_tokens_seen": 300076720, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11364746, + "step": 13911, + "time_per_iteration": 3.8613250255584717 + }, + { + "auxiliary_loss_clip": 0.01120748, + "auxiliary_loss_mlp": 0.01035602, + "balance_loss_clip": 1.04621375, + "balance_loss_mlp": 1.02400851, + "epoch": 0.8364346911167894, + "flos": 11874869596800.0, + "grad_norm": 2.243774396331736, + "language_loss": 0.79240572, + "learning_rate": 2.740501655534946e-07, + "loss": 0.81396925, + "num_input_tokens_seen": 300092950, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.1159668, + "step": 13912, + "time_per_iteration": 2.516557455062866 + }, + { + "auxiliary_loss_clip": 0.01118949, + "auxiliary_loss_mlp": 0.01031351, + "balance_loss_clip": 1.04708052, + "balance_loss_mlp": 1.02039564, + "epoch": 0.8364948143694574, + "flos": 20225212974720.0, + "grad_norm": 1.6676182785289206, + "language_loss": 0.78825736, + "learning_rate": 2.738534240246797e-07, + "loss": 0.80976039, + "num_input_tokens_seen": 300110950, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.10949707, + "step": 13913, + "time_per_iteration": 2.4237220287323 + }, + { + "auxiliary_loss_clip": 0.01110862, + "auxiliary_loss_mlp": 0.0103021, + "balance_loss_clip": 1.03790891, + "balance_loss_mlp": 1.01772881, + "epoch": 0.8365549376221254, + "flos": 21612909058560.0, + "grad_norm": 2.225998686045784, + "language_loss": 0.73624074, + "learning_rate": 2.736567479515153e-07, + "loss": 0.75765151, + "num_input_tokens_seen": 300128705, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.12493896, + "step": 13914, + "time_per_iteration": 2.5265166759490967 + }, + { + "auxiliary_loss_clip": 0.01116576, + "auxiliary_loss_mlp": 0.01032737, + "balance_loss_clip": 1.04474592, + "balance_loss_mlp": 1.0204227, + "epoch": 0.8366150608747933, + "flos": 23294210912640.0, + "grad_norm": 1.6322360920605015, + "language_loss": 0.71194243, + "learning_rate": 2.7346013734146025e-07, + "loss": 0.73343551, + "num_input_tokens_seen": 300148635, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.12310791, + "step": 13915, + "time_per_iteration": 2.5789949893951416 + }, + { + "auxiliary_loss_clip": 0.01114826, + "auxiliary_loss_mlp": 0.01030645, + "balance_loss_clip": 1.04138565, + "balance_loss_mlp": 1.01889694, + "epoch": 0.8366751841274613, + "flos": 15267673664640.0, + "grad_norm": 1.7363362944780338, + "language_loss": 0.72015882, + "learning_rate": 2.7326359220197035e-07, + "loss": 0.74161357, + "num_input_tokens_seen": 300165490, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11761475, + "step": 13916, + "time_per_iteration": 2.4289493560791016 + }, + { + "auxiliary_loss_clip": 0.01108367, + "auxiliary_loss_mlp": 0.01027077, + "balance_loss_clip": 1.03618443, + "balance_loss_mlp": 1.01545453, + "epoch": 0.8367353073801292, + "flos": 13224931205760.0, + "grad_norm": 1.9528957397585172, + "language_loss": 0.74940979, + "learning_rate": 2.7306711254049755e-07, + "loss": 0.77076423, + "num_input_tokens_seen": 300182130, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11627197, + "step": 13917, + "time_per_iteration": 2.415282964706421 + }, + { + "auxiliary_loss_clip": 0.01108061, + "auxiliary_loss_mlp": 0.01030216, + "balance_loss_clip": 1.03967309, + "balance_loss_mlp": 1.01929021, + "epoch": 0.8367954306327973, + "flos": 24205084928640.0, + "grad_norm": 1.7441355914232346, + "language_loss": 0.7915923, + "learning_rate": 2.728706983644933e-07, + "loss": 0.81297505, + "num_input_tokens_seen": 300203050, + "router_z_loss_clip": 0.68310547, + "router_z_loss_mlp": 0.10913086, + "step": 13918, + "time_per_iteration": 2.48403000831604 + }, + { + "auxiliary_loss_clip": 0.01118146, + "auxiliary_loss_mlp": 0.01036866, + "balance_loss_clip": 1.04548383, + "balance_loss_mlp": 1.0246706, + "epoch": 0.8368555538854652, + "flos": 24534744975360.0, + "grad_norm": 1.5784579555392149, + "language_loss": 0.68080246, + "learning_rate": 2.7267434968140457e-07, + "loss": 0.70235252, + "num_input_tokens_seen": 300224380, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.12207031, + "step": 13919, + "time_per_iteration": 2.464801073074341 + }, + { + "auxiliary_loss_clip": 0.01117747, + "auxiliary_loss_mlp": 0.01028165, + "balance_loss_clip": 1.04594576, + "balance_loss_mlp": 1.01689398, + "epoch": 0.8369156771381332, + "flos": 20259363830400.0, + "grad_norm": 1.6969530881923933, + "language_loss": 0.74231052, + "learning_rate": 2.7247806649867835e-07, + "loss": 0.76376963, + "num_input_tokens_seen": 300242915, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.112854, + "step": 13920, + "time_per_iteration": 2.4537618160247803 + }, + { + "auxiliary_loss_clip": 0.01119001, + "auxiliary_loss_mlp": 0.01033049, + "balance_loss_clip": 1.04740977, + "balance_loss_mlp": 1.02132511, + "epoch": 0.8369758003908011, + "flos": 21835555511040.0, + "grad_norm": 1.8381597187702448, + "language_loss": 0.69128287, + "learning_rate": 2.722818488237566e-07, + "loss": 0.71280336, + "num_input_tokens_seen": 300261905, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11730957, + "step": 13921, + "time_per_iteration": 2.4383816719055176 + }, + { + "auxiliary_loss_clip": 0.01119274, + "auxiliary_loss_mlp": 0.01031689, + "balance_loss_clip": 1.04313111, + "balance_loss_mlp": 1.01992857, + "epoch": 0.8370359236434691, + "flos": 21719312121600.0, + "grad_norm": 2.0384256952898903, + "language_loss": 0.85488117, + "learning_rate": 2.720856966640801e-07, + "loss": 0.87639081, + "num_input_tokens_seen": 300281145, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.11761475, + "step": 13922, + "time_per_iteration": 2.4595181941986084 + }, + { + "auxiliary_loss_clip": 0.011115, + "auxiliary_loss_mlp": 0.01029652, + "balance_loss_clip": 1.04246819, + "balance_loss_mlp": 1.0189712, + "epoch": 0.837096046896137, + "flos": 23148880485120.0, + "grad_norm": 1.6008790756276425, + "language_loss": 0.71835154, + "learning_rate": 2.71889610027088e-07, + "loss": 0.73976308, + "num_input_tokens_seen": 300301610, + "router_z_loss_clip": 0.69042969, + "router_z_loss_mlp": 0.10681152, + "step": 13923, + "time_per_iteration": 2.4462993144989014 + }, + { + "auxiliary_loss_clip": 0.01114504, + "auxiliary_loss_mlp": 0.01028078, + "balance_loss_clip": 1.04514432, + "balance_loss_mlp": 1.01580572, + "epoch": 0.8371561701488051, + "flos": 24492872695680.0, + "grad_norm": 1.963120639713674, + "language_loss": 0.76030868, + "learning_rate": 2.7169358892021433e-07, + "loss": 0.78173447, + "num_input_tokens_seen": 300319420, + "router_z_loss_clip": 0.69335938, + "router_z_loss_mlp": 0.1227417, + "step": 13924, + "time_per_iteration": 2.467707633972168 + }, + { + "auxiliary_loss_clip": 0.01111192, + "auxiliary_loss_mlp": 0.01025785, + "balance_loss_clip": 1.03943706, + "balance_loss_mlp": 1.01437068, + "epoch": 0.837216293401473, + "flos": 29206723161600.0, + "grad_norm": 1.5620016327024202, + "language_loss": 0.64400458, + "learning_rate": 2.7149763335089293e-07, + "loss": 0.6653744, + "num_input_tokens_seen": 300341325, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11413574, + "step": 13925, + "time_per_iteration": 2.5142619609832764 + }, + { + "auxiliary_loss_clip": 0.01117605, + "auxiliary_loss_mlp": 0.01030744, + "balance_loss_clip": 1.04206383, + "balance_loss_mlp": 1.01902533, + "epoch": 0.837276416654141, + "flos": 25265275781760.0, + "grad_norm": 1.6596196912211876, + "language_loss": 0.74493849, + "learning_rate": 2.713017433265543e-07, + "loss": 0.76642197, + "num_input_tokens_seen": 300361620, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.11724854, + "step": 13926, + "time_per_iteration": 2.576570987701416 + }, + { + "auxiliary_loss_clip": 0.01114323, + "auxiliary_loss_mlp": 0.01034403, + "balance_loss_clip": 1.04020381, + "balance_loss_mlp": 1.02179027, + "epoch": 0.837336539906809, + "flos": 13882024656000.0, + "grad_norm": 1.7913461316280241, + "language_loss": 0.7132917, + "learning_rate": 2.711059188546274e-07, + "loss": 0.734779, + "num_input_tokens_seen": 300378675, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12615967, + "step": 13927, + "time_per_iteration": 2.4344563484191895 + }, + { + "auxiliary_loss_clip": 0.01049694, + "auxiliary_loss_mlp": 0.01005199, + "balance_loss_clip": 1.02500343, + "balance_loss_mlp": 1.00394762, + "epoch": 0.8373966631594769, + "flos": 68870599044480.0, + "grad_norm": 0.7016199215681704, + "language_loss": 0.58831733, + "learning_rate": 2.7091015994253695e-07, + "loss": 0.60886633, + "num_input_tokens_seen": 300449740, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.01252747, + "step": 13928, + "time_per_iteration": 4.607122421264648 + }, + { + "auxiliary_loss_clip": 0.01120489, + "auxiliary_loss_mlp": 0.0103163, + "balance_loss_clip": 1.04877734, + "balance_loss_mlp": 1.01970291, + "epoch": 0.8374567864121449, + "flos": 20448972748800.0, + "grad_norm": 1.5997603308506307, + "language_loss": 0.69462687, + "learning_rate": 2.707144665977068e-07, + "loss": 0.71614802, + "num_input_tokens_seen": 300470000, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11914062, + "step": 13929, + "time_per_iteration": 2.4408364295959473 + }, + { + "auxiliary_loss_clip": 0.01121074, + "auxiliary_loss_mlp": 0.01026955, + "balance_loss_clip": 1.04696155, + "balance_loss_mlp": 1.01458097, + "epoch": 0.8375169096648128, + "flos": 41904197101440.0, + "grad_norm": 1.640469797785559, + "language_loss": 0.66924453, + "learning_rate": 2.705188388275574e-07, + "loss": 0.69072485, + "num_input_tokens_seen": 300494975, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.12365723, + "step": 13930, + "time_per_iteration": 2.6482675075531006 + }, + { + "auxiliary_loss_clip": 0.01113555, + "auxiliary_loss_mlp": 0.0102809, + "balance_loss_clip": 1.04314923, + "balance_loss_mlp": 1.01572192, + "epoch": 0.8375770329174809, + "flos": 20009354192640.0, + "grad_norm": 2.4367670262086625, + "language_loss": 0.71372294, + "learning_rate": 2.703232766395067e-07, + "loss": 0.73513937, + "num_input_tokens_seen": 300513175, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.1237793, + "step": 13931, + "time_per_iteration": 2.4278087615966797 + }, + { + "auxiliary_loss_clip": 0.01111803, + "auxiliary_loss_mlp": 0.01040767, + "balance_loss_clip": 1.03937721, + "balance_loss_mlp": 1.02634299, + "epoch": 0.8376371561701488, + "flos": 22783597125120.0, + "grad_norm": 1.6888302475377135, + "language_loss": 0.71775103, + "learning_rate": 2.701277800409705e-07, + "loss": 0.73927671, + "num_input_tokens_seen": 300533770, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.144104, + "step": 13932, + "time_per_iteration": 2.4844765663146973 + }, + { + "auxiliary_loss_clip": 0.01117406, + "auxiliary_loss_mlp": 0.01034883, + "balance_loss_clip": 1.04299259, + "balance_loss_mlp": 1.02379656, + "epoch": 0.8376972794228168, + "flos": 23914459987200.0, + "grad_norm": 2.0390842919853305, + "language_loss": 0.66906321, + "learning_rate": 2.699323490393628e-07, + "loss": 0.69058615, + "num_input_tokens_seen": 300552995, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11077881, + "step": 13933, + "time_per_iteration": 3.9385430812835693 + }, + { + "auxiliary_loss_clip": 0.01117315, + "auxiliary_loss_mlp": 0.01034165, + "balance_loss_clip": 1.04745984, + "balance_loss_mlp": 1.02246416, + "epoch": 0.8377574026754847, + "flos": 13734718980480.0, + "grad_norm": 2.248619950860052, + "language_loss": 0.76621705, + "learning_rate": 2.697369836420933e-07, + "loss": 0.78773183, + "num_input_tokens_seen": 300570275, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.11706543, + "step": 13934, + "time_per_iteration": 2.400761365890503 + }, + { + "auxiliary_loss_clip": 0.01122363, + "auxiliary_loss_mlp": 0.01029005, + "balance_loss_clip": 1.05121911, + "balance_loss_mlp": 1.01703608, + "epoch": 0.8378175259281527, + "flos": 21651333632640.0, + "grad_norm": 1.686394937230823, + "language_loss": 0.77454007, + "learning_rate": 2.6954168385657115e-07, + "loss": 0.79605371, + "num_input_tokens_seen": 300590875, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.11981201, + "step": 13935, + "time_per_iteration": 2.4852559566497803 + }, + { + "auxiliary_loss_clip": 0.01114613, + "auxiliary_loss_mlp": 0.01027449, + "balance_loss_clip": 1.04286432, + "balance_loss_mlp": 1.01585567, + "epoch": 0.8378776491808206, + "flos": 15448806973440.0, + "grad_norm": 2.702961149311161, + "language_loss": 0.55904746, + "learning_rate": 2.6934644969020135e-07, + "loss": 0.58046806, + "num_input_tokens_seen": 300607490, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11590576, + "step": 13936, + "time_per_iteration": 2.381887912750244 + }, + { + "auxiliary_loss_clip": 0.01116616, + "auxiliary_loss_mlp": 0.01025442, + "balance_loss_clip": 1.04513884, + "balance_loss_mlp": 1.01472521, + "epoch": 0.8379377724334887, + "flos": 14720395069440.0, + "grad_norm": 1.8991770118305182, + "language_loss": 0.89578795, + "learning_rate": 2.691512811503882e-07, + "loss": 0.91720861, + "num_input_tokens_seen": 300623635, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.10717773, + "step": 13937, + "time_per_iteration": 2.4558515548706055 + }, + { + "auxiliary_loss_clip": 0.01115853, + "auxiliary_loss_mlp": 0.01027984, + "balance_loss_clip": 1.04472041, + "balance_loss_mlp": 1.01611698, + "epoch": 0.8379978956861566, + "flos": 24535247765760.0, + "grad_norm": 1.6781703435461321, + "language_loss": 0.81623393, + "learning_rate": 2.689561782445313e-07, + "loss": 0.83767223, + "num_input_tokens_seen": 300643835, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.11883545, + "step": 13938, + "time_per_iteration": 2.596726655960083 + }, + { + "auxiliary_loss_clip": 0.01118181, + "auxiliary_loss_mlp": 0.01029665, + "balance_loss_clip": 1.04418612, + "balance_loss_mlp": 1.01706493, + "epoch": 0.8380580189388246, + "flos": 18952611045120.0, + "grad_norm": 1.7719363702527426, + "language_loss": 0.70863497, + "learning_rate": 2.6876114098002965e-07, + "loss": 0.73011345, + "num_input_tokens_seen": 300662500, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12591553, + "step": 13939, + "time_per_iteration": 2.4644393920898438 + }, + { + "auxiliary_loss_clip": 0.01124588, + "auxiliary_loss_mlp": 0.01034456, + "balance_loss_clip": 1.04821205, + "balance_loss_mlp": 1.02209425, + "epoch": 0.8381181421914926, + "flos": 26540283922560.0, + "grad_norm": 1.596226664787947, + "language_loss": 0.75899088, + "learning_rate": 2.6856616936428e-07, + "loss": 0.78058136, + "num_input_tokens_seen": 300681480, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.12371826, + "step": 13940, + "time_per_iteration": 3.921950578689575 + }, + { + "auxiliary_loss_clip": 0.01109707, + "auxiliary_loss_mlp": 0.01034635, + "balance_loss_clip": 1.03879666, + "balance_loss_mlp": 1.02310145, + "epoch": 0.8381782654441605, + "flos": 23291481479040.0, + "grad_norm": 1.7024908517030548, + "language_loss": 0.76476848, + "learning_rate": 2.6837126340467374e-07, + "loss": 0.78621197, + "num_input_tokens_seen": 300699165, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.11529541, + "step": 13941, + "time_per_iteration": 2.437533378601074 + }, + { + "auxiliary_loss_clip": 0.01117394, + "auxiliary_loss_mlp": 0.01029838, + "balance_loss_clip": 1.04062271, + "balance_loss_mlp": 1.01701665, + "epoch": 0.8382383886968285, + "flos": 26758800311040.0, + "grad_norm": 2.3752342262442503, + "language_loss": 0.73405617, + "learning_rate": 2.6817642310860276e-07, + "loss": 0.75552845, + "num_input_tokens_seen": 300714615, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.12823486, + "step": 13942, + "time_per_iteration": 2.445096492767334 + }, + { + "auxiliary_loss_clip": 0.01123083, + "auxiliary_loss_mlp": 0.01034022, + "balance_loss_clip": 1.04615831, + "balance_loss_mlp": 1.02144527, + "epoch": 0.8382985119494964, + "flos": 26104544035200.0, + "grad_norm": 1.5331067109705885, + "language_loss": 0.79255724, + "learning_rate": 2.679816484834554e-07, + "loss": 0.81412828, + "num_input_tokens_seen": 300734860, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.12567139, + "step": 13943, + "time_per_iteration": 2.4926304817199707 + }, + { + "auxiliary_loss_clip": 0.01121193, + "auxiliary_loss_mlp": 0.01029151, + "balance_loss_clip": 1.04971075, + "balance_loss_mlp": 1.01760578, + "epoch": 0.8383586352021645, + "flos": 16435129507200.0, + "grad_norm": 1.8302551613260625, + "language_loss": 0.85225356, + "learning_rate": 2.6778693953661766e-07, + "loss": 0.873757, + "num_input_tokens_seen": 300752735, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.11553955, + "step": 13944, + "time_per_iteration": 2.4135537147521973 + }, + { + "auxiliary_loss_clip": 0.01041303, + "auxiliary_loss_mlp": 0.01004522, + "balance_loss_clip": 1.01575899, + "balance_loss_mlp": 1.00306785, + "epoch": 0.8384187584548324, + "flos": 64195532288640.0, + "grad_norm": 0.6152346731582494, + "language_loss": 0.50228357, + "learning_rate": 2.6759229627547263e-07, + "loss": 0.52274185, + "num_input_tokens_seen": 300820760, + "router_z_loss_clip": 0.25488281, + "router_z_loss_mlp": 0.01454163, + "step": 13945, + "time_per_iteration": 3.17964768409729 + }, + { + "auxiliary_loss_clip": 0.01110383, + "auxiliary_loss_mlp": 0.01026903, + "balance_loss_clip": 1.03996074, + "balance_loss_mlp": 1.01571488, + "epoch": 0.8384788817075004, + "flos": 22382905933440.0, + "grad_norm": 2.686967606739855, + "language_loss": 0.64870846, + "learning_rate": 2.673977187074017e-07, + "loss": 0.67008132, + "num_input_tokens_seen": 300840025, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.11193848, + "step": 13946, + "time_per_iteration": 2.4810690879821777 + }, + { + "auxiliary_loss_clip": 0.01119599, + "auxiliary_loss_mlp": 0.01027761, + "balance_loss_clip": 1.04791152, + "balance_loss_mlp": 1.01574445, + "epoch": 0.8385390049601683, + "flos": 29496845312640.0, + "grad_norm": 1.6977040222878723, + "language_loss": 0.67381227, + "learning_rate": 2.672032068397829e-07, + "loss": 0.6952858, + "num_input_tokens_seen": 300860380, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.12030029, + "step": 13947, + "time_per_iteration": 2.5091426372528076 + }, + { + "auxiliary_loss_clip": 0.01119268, + "auxiliary_loss_mlp": 0.01028339, + "balance_loss_clip": 1.04492927, + "balance_loss_mlp": 1.01587605, + "epoch": 0.8385991282128363, + "flos": 32707797799680.0, + "grad_norm": 1.8865818447462592, + "language_loss": 0.69811869, + "learning_rate": 2.6700876067999176e-07, + "loss": 0.71959478, + "num_input_tokens_seen": 300881895, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12469482, + "step": 13948, + "time_per_iteration": 2.542379140853882 + }, + { + "auxiliary_loss_clip": 0.0111242, + "auxiliary_loss_mlp": 0.01030171, + "balance_loss_clip": 1.0430063, + "balance_loss_mlp": 1.0195024, + "epoch": 0.8386592514655042, + "flos": 25441022050560.0, + "grad_norm": 2.029530451034656, + "language_loss": 0.85144156, + "learning_rate": 2.6681438023540194e-07, + "loss": 0.87286747, + "num_input_tokens_seen": 300901575, + "router_z_loss_clip": 0.69482422, + "router_z_loss_mlp": 0.10675049, + "step": 13949, + "time_per_iteration": 2.4605038166046143 + }, + { + "auxiliary_loss_clip": 0.01114449, + "auxiliary_loss_mlp": 0.01026904, + "balance_loss_clip": 1.04481649, + "balance_loss_mlp": 1.01578212, + "epoch": 0.8387193747181723, + "flos": 22015898720640.0, + "grad_norm": 3.9335649803785566, + "language_loss": 0.70620501, + "learning_rate": 2.66620065513385e-07, + "loss": 0.72761846, + "num_input_tokens_seen": 300919735, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.11126709, + "step": 13950, + "time_per_iteration": 2.474043846130371 + }, + { + "auxiliary_loss_clip": 0.01109373, + "auxiliary_loss_mlp": 0.01029347, + "balance_loss_clip": 1.03888464, + "balance_loss_mlp": 1.01740193, + "epoch": 0.8387794979708402, + "flos": 18150223080960.0, + "grad_norm": 1.872310358354566, + "language_loss": 0.6469909, + "learning_rate": 2.6642581652130913e-07, + "loss": 0.66837811, + "num_input_tokens_seen": 300939150, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.1194458, + "step": 13951, + "time_per_iteration": 2.4764275550842285 + }, + { + "auxiliary_loss_clip": 0.01111076, + "auxiliary_loss_mlp": 0.01029428, + "balance_loss_clip": 1.04028022, + "balance_loss_mlp": 1.01871681, + "epoch": 0.8388396212235082, + "flos": 25411216740480.0, + "grad_norm": 1.6011033839917348, + "language_loss": 0.70194048, + "learning_rate": 2.662316332665393e-07, + "loss": 0.72334552, + "num_input_tokens_seen": 300959730, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.10717773, + "step": 13952, + "time_per_iteration": 2.5303475856781006 + }, + { + "auxiliary_loss_clip": 0.01109903, + "auxiliary_loss_mlp": 0.01025547, + "balance_loss_clip": 1.03978181, + "balance_loss_mlp": 1.01474047, + "epoch": 0.8388997444761762, + "flos": 22273055164800.0, + "grad_norm": 4.416037052537848, + "language_loss": 0.73118234, + "learning_rate": 2.6603751575643987e-07, + "loss": 0.75253689, + "num_input_tokens_seen": 300976120, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.10809326, + "step": 13953, + "time_per_iteration": 2.4252266883850098 + }, + { + "auxiliary_loss_clip": 0.01112271, + "auxiliary_loss_mlp": 0.01025245, + "balance_loss_clip": 1.04290628, + "balance_loss_mlp": 1.01409245, + "epoch": 0.8389598677288441, + "flos": 19573219255680.0, + "grad_norm": 2.5597344284079395, + "language_loss": 0.682392, + "learning_rate": 2.6584346399837176e-07, + "loss": 0.70376718, + "num_input_tokens_seen": 300995080, + "router_z_loss_clip": 0.69482422, + "router_z_loss_mlp": 0.11151123, + "step": 13954, + "time_per_iteration": 2.4382081031799316 + }, + { + "auxiliary_loss_clip": 0.01115337, + "auxiliary_loss_mlp": 0.01028046, + "balance_loss_clip": 1.0438782, + "balance_loss_mlp": 1.01737118, + "epoch": 0.8390199909815121, + "flos": 17384715406080.0, + "grad_norm": 1.7082647416713266, + "language_loss": 0.73164201, + "learning_rate": 2.656494779996932e-07, + "loss": 0.75307584, + "num_input_tokens_seen": 301012920, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.10675049, + "step": 13955, + "time_per_iteration": 2.4327876567840576 + }, + { + "auxiliary_loss_clip": 0.01112199, + "auxiliary_loss_mlp": 0.01027381, + "balance_loss_clip": 1.04007673, + "balance_loss_mlp": 1.01589513, + "epoch": 0.83908011423418, + "flos": 24639639667200.0, + "grad_norm": 2.9667422026148578, + "language_loss": 0.66648334, + "learning_rate": 2.6545555776775995e-07, + "loss": 0.68787909, + "num_input_tokens_seen": 301028875, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11480713, + "step": 13956, + "time_per_iteration": 3.8514323234558105 + }, + { + "auxiliary_loss_clip": 0.01112806, + "auxiliary_loss_mlp": 0.01031579, + "balance_loss_clip": 1.03951645, + "balance_loss_mlp": 1.0192765, + "epoch": 0.8391402374868481, + "flos": 24718356322560.0, + "grad_norm": 1.7914905831022243, + "language_loss": 0.79775524, + "learning_rate": 2.6526170330992667e-07, + "loss": 0.81919909, + "num_input_tokens_seen": 301050115, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.12298584, + "step": 13957, + "time_per_iteration": 2.5405123233795166 + }, + { + "auxiliary_loss_clip": 0.01037059, + "auxiliary_loss_mlp": 0.01001903, + "balance_loss_clip": 1.01258826, + "balance_loss_mlp": 1.00074387, + "epoch": 0.839200360739516, + "flos": 56871695784960.0, + "grad_norm": 0.7616774564900761, + "language_loss": 0.53361547, + "learning_rate": 2.6506791463354283e-07, + "loss": 0.55400509, + "num_input_tokens_seen": 301114155, + "router_z_loss_clip": 0.24487305, + "router_z_loss_mlp": 0.01159668, + "step": 13958, + "time_per_iteration": 3.2216129302978516 + }, + { + "auxiliary_loss_clip": 0.01115455, + "auxiliary_loss_mlp": 0.01033692, + "balance_loss_clip": 1.04437423, + "balance_loss_mlp": 1.02143157, + "epoch": 0.839260483992184, + "flos": 18332792933760.0, + "grad_norm": 1.7245187678771812, + "language_loss": 0.73808253, + "learning_rate": 2.648741917459574e-07, + "loss": 0.759574, + "num_input_tokens_seen": 301133150, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.12268066, + "step": 13959, + "time_per_iteration": 2.419145107269287 + }, + { + "auxiliary_loss_clip": 0.01109003, + "auxiliary_loss_mlp": 0.01023628, + "balance_loss_clip": 1.04019737, + "balance_loss_mlp": 1.01276183, + "epoch": 0.8393206072448519, + "flos": 27087921653760.0, + "grad_norm": 1.7884657847874355, + "language_loss": 0.55561203, + "learning_rate": 2.646805346545169e-07, + "loss": 0.57693833, + "num_input_tokens_seen": 301153600, + "router_z_loss_clip": 0.68847656, + "router_z_loss_mlp": 0.10864258, + "step": 13960, + "time_per_iteration": 2.4980382919311523 + }, + { + "auxiliary_loss_clip": 0.01033537, + "auxiliary_loss_mlp": 0.0100186, + "balance_loss_clip": 1.00914323, + "balance_loss_mlp": 1.0005486, + "epoch": 0.8393807304975199, + "flos": 61521192057600.0, + "grad_norm": 1.2904907008148703, + "language_loss": 0.60720861, + "learning_rate": 2.6448694336656397e-07, + "loss": 0.62756264, + "num_input_tokens_seen": 301214335, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.01312256, + "step": 13961, + "time_per_iteration": 3.168978691101074 + }, + { + "auxiliary_loss_clip": 0.01110961, + "auxiliary_loss_mlp": 0.01029971, + "balance_loss_clip": 1.04009759, + "balance_loss_mlp": 1.01921272, + "epoch": 0.8394408537501878, + "flos": 14894848448640.0, + "grad_norm": 2.146634602739971, + "language_loss": 0.68305415, + "learning_rate": 2.642934178894405e-07, + "loss": 0.70446348, + "num_input_tokens_seen": 301228960, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.10754395, + "step": 13962, + "time_per_iteration": 2.4248862266540527 + }, + { + "auxiliary_loss_clip": 0.01114314, + "auxiliary_loss_mlp": 0.01028815, + "balance_loss_clip": 1.03950667, + "balance_loss_mlp": 1.01724625, + "epoch": 0.8395009770028559, + "flos": 17412186332160.0, + "grad_norm": 1.9623139587133056, + "language_loss": 0.73513824, + "learning_rate": 2.640999582304841e-07, + "loss": 0.7565695, + "num_input_tokens_seen": 301245875, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.11578369, + "step": 13963, + "time_per_iteration": 2.4073636531829834 + }, + { + "auxiliary_loss_clip": 0.01111855, + "auxiliary_loss_mlp": 0.01030683, + "balance_loss_clip": 1.04030681, + "balance_loss_mlp": 1.01976371, + "epoch": 0.8395611002555238, + "flos": 27924747782400.0, + "grad_norm": 1.5558646191126313, + "language_loss": 0.76586151, + "learning_rate": 2.6390656439703173e-07, + "loss": 0.78728688, + "num_input_tokens_seen": 301265550, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.10925293, + "step": 13964, + "time_per_iteration": 2.6472246646881104 + }, + { + "auxiliary_loss_clip": 0.01111615, + "auxiliary_loss_mlp": 0.01033582, + "balance_loss_clip": 1.03780127, + "balance_loss_mlp": 1.02052236, + "epoch": 0.8396212235081918, + "flos": 11100922225920.0, + "grad_norm": 1.8802112269047253, + "language_loss": 0.7841388, + "learning_rate": 2.637132363964161e-07, + "loss": 0.80559081, + "num_input_tokens_seen": 301282035, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.1305542, + "step": 13965, + "time_per_iteration": 2.5289149284362793 + }, + { + "auxiliary_loss_clip": 0.01110532, + "auxiliary_loss_mlp": 0.01022876, + "balance_loss_clip": 1.0407598, + "balance_loss_mlp": 1.01212347, + "epoch": 0.8396813467608598, + "flos": 35735641729920.0, + "grad_norm": 1.5970673445917238, + "language_loss": 0.66071612, + "learning_rate": 2.635199742359684e-07, + "loss": 0.68205023, + "num_input_tokens_seen": 301305210, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.10754395, + "step": 13966, + "time_per_iteration": 2.588500738143921 + }, + { + "auxiliary_loss_clip": 0.01114741, + "auxiliary_loss_mlp": 0.01031104, + "balance_loss_clip": 1.04331899, + "balance_loss_mlp": 1.01963031, + "epoch": 0.8397414700135277, + "flos": 26176724415360.0, + "grad_norm": 2.4031260267149164, + "language_loss": 0.74385035, + "learning_rate": 2.633267779230177e-07, + "loss": 0.7653088, + "num_input_tokens_seen": 301324885, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.11468506, + "step": 13967, + "time_per_iteration": 2.508200168609619 + }, + { + "auxiliary_loss_clip": 0.01108408, + "auxiliary_loss_mlp": 0.01032266, + "balance_loss_clip": 1.03815424, + "balance_loss_mlp": 1.02047622, + "epoch": 0.8398015932661957, + "flos": 18333116156160.0, + "grad_norm": 3.0713405603062434, + "language_loss": 0.83301008, + "learning_rate": 2.6313364746488974e-07, + "loss": 0.85441685, + "num_input_tokens_seen": 301343070, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.11785889, + "step": 13968, + "time_per_iteration": 2.418562650680542 + }, + { + "auxiliary_loss_clip": 0.01110929, + "auxiliary_loss_mlp": 0.01032877, + "balance_loss_clip": 1.03900814, + "balance_loss_mlp": 1.02139139, + "epoch": 0.8398617165188637, + "flos": 17379507934080.0, + "grad_norm": 2.1432427432831234, + "language_loss": 0.77446371, + "learning_rate": 2.629405828689075e-07, + "loss": 0.79590178, + "num_input_tokens_seen": 301359280, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11480713, + "step": 13969, + "time_per_iteration": 2.4402694702148438 + }, + { + "auxiliary_loss_clip": 0.01108545, + "auxiliary_loss_mlp": 0.01027942, + "balance_loss_clip": 1.03533816, + "balance_loss_mlp": 1.01538873, + "epoch": 0.8399218397715317, + "flos": 22929681738240.0, + "grad_norm": 2.110114010085767, + "language_loss": 0.7677592, + "learning_rate": 2.627475841423923e-07, + "loss": 0.78912401, + "num_input_tokens_seen": 301376465, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.12548828, + "step": 13970, + "time_per_iteration": 2.4613301753997803 + }, + { + "auxiliary_loss_clip": 0.01108694, + "auxiliary_loss_mlp": 0.01032136, + "balance_loss_clip": 1.03671014, + "balance_loss_mlp": 1.02063203, + "epoch": 0.8399819630241996, + "flos": 23149562843520.0, + "grad_norm": 2.200819007978348, + "language_loss": 0.72190166, + "learning_rate": 2.625546512926633e-07, + "loss": 0.74330997, + "num_input_tokens_seen": 301396000, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11499023, + "step": 13971, + "time_per_iteration": 3.906008720397949 + }, + { + "auxiliary_loss_clip": 0.01111736, + "auxiliary_loss_mlp": 0.01026921, + "balance_loss_clip": 1.03855217, + "balance_loss_mlp": 1.0143981, + "epoch": 0.8400420862768676, + "flos": 16397423205120.0, + "grad_norm": 2.4009959275369903, + "language_loss": 0.77276295, + "learning_rate": 2.623617843270358e-07, + "loss": 0.79414952, + "num_input_tokens_seen": 301413160, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.12530518, + "step": 13972, + "time_per_iteration": 2.4091179370880127 + }, + { + "auxiliary_loss_clip": 0.01111718, + "auxiliary_loss_mlp": 0.01036405, + "balance_loss_clip": 1.04143047, + "balance_loss_mlp": 1.02385807, + "epoch": 0.8401022095295355, + "flos": 21287486816640.0, + "grad_norm": 1.2998365076443947, + "language_loss": 0.68677986, + "learning_rate": 2.6216898325282333e-07, + "loss": 0.70826107, + "num_input_tokens_seen": 301433325, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.12554932, + "step": 13973, + "time_per_iteration": 2.5365347862243652 + }, + { + "auxiliary_loss_clip": 0.0111485, + "auxiliary_loss_mlp": 0.01026189, + "balance_loss_clip": 1.0420301, + "balance_loss_mlp": 1.01454234, + "epoch": 0.8401623327822035, + "flos": 17311313963520.0, + "grad_norm": 2.34218914158588, + "language_loss": 0.78388125, + "learning_rate": 2.619762480773382e-07, + "loss": 0.80529165, + "num_input_tokens_seen": 301450265, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11645508, + "step": 13974, + "time_per_iteration": 2.4257516860961914 + }, + { + "auxiliary_loss_clip": 0.01112457, + "auxiliary_loss_mlp": 0.01027272, + "balance_loss_clip": 1.03981256, + "balance_loss_mlp": 1.01625061, + "epoch": 0.8402224560348714, + "flos": 22236677665920.0, + "grad_norm": 1.5586172208155522, + "language_loss": 0.72904825, + "learning_rate": 2.617835788078868e-07, + "loss": 0.75044554, + "num_input_tokens_seen": 301470760, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11010742, + "step": 13975, + "time_per_iteration": 2.4730722904205322 + }, + { + "auxiliary_loss_clip": 0.01113644, + "auxiliary_loss_mlp": 0.01035603, + "balance_loss_clip": 1.04297078, + "balance_loss_mlp": 1.02234697, + "epoch": 0.8402825792875395, + "flos": 20229953569920.0, + "grad_norm": 1.7247308006600028, + "language_loss": 0.72420418, + "learning_rate": 2.6159097545177645e-07, + "loss": 0.74569666, + "num_input_tokens_seen": 301489425, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.13256836, + "step": 13976, + "time_per_iteration": 4.002760410308838 + }, + { + "auxiliary_loss_clip": 0.01114372, + "auxiliary_loss_mlp": 0.01029909, + "balance_loss_clip": 1.04061365, + "balance_loss_mlp": 1.01800632, + "epoch": 0.8403427025402074, + "flos": 23289973107840.0, + "grad_norm": 1.7482136462937832, + "language_loss": 0.72278154, + "learning_rate": 2.61398438016311e-07, + "loss": 0.74422431, + "num_input_tokens_seen": 301508885, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11907959, + "step": 13977, + "time_per_iteration": 2.54225492477417 + }, + { + "auxiliary_loss_clip": 0.01116982, + "auxiliary_loss_mlp": 0.01027664, + "balance_loss_clip": 1.04277635, + "balance_loss_mlp": 1.01612425, + "epoch": 0.8404028257928754, + "flos": 32675586278400.0, + "grad_norm": 1.4645483076194337, + "language_loss": 0.6859169, + "learning_rate": 2.6120596650879043e-07, + "loss": 0.70736337, + "num_input_tokens_seen": 301533780, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.11547852, + "step": 13978, + "time_per_iteration": 2.5439882278442383 + }, + { + "auxiliary_loss_clip": 0.011142, + "auxiliary_loss_mlp": 0.01027079, + "balance_loss_clip": 1.04592943, + "balance_loss_mlp": 1.0159148, + "epoch": 0.8404629490455434, + "flos": 16180522928640.0, + "grad_norm": 1.731986083219096, + "language_loss": 0.77601624, + "learning_rate": 2.610135609365145e-07, + "loss": 0.79742908, + "num_input_tokens_seen": 301551775, + "router_z_loss_clip": 0.68310547, + "router_z_loss_mlp": 0.11157227, + "step": 13979, + "time_per_iteration": 2.3914906978607178 + }, + { + "auxiliary_loss_clip": 0.01119363, + "auxiliary_loss_mlp": 0.01026909, + "balance_loss_clip": 1.0486784, + "balance_loss_mlp": 1.01578093, + "epoch": 0.8405230722982113, + "flos": 15194451790080.0, + "grad_norm": 2.323958782471739, + "language_loss": 0.78186852, + "learning_rate": 2.60821221306778e-07, + "loss": 0.80333126, + "num_input_tokens_seen": 301570495, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.11120605, + "step": 13980, + "time_per_iteration": 2.442678213119507 + }, + { + "auxiliary_loss_clip": 0.0111123, + "auxiliary_loss_mlp": 0.0102616, + "balance_loss_clip": 1.04261398, + "balance_loss_mlp": 1.01500165, + "epoch": 0.8405831955508793, + "flos": 27812418975360.0, + "grad_norm": 1.5721447298554698, + "language_loss": 0.86941886, + "learning_rate": 2.606289476268757e-07, + "loss": 0.89079285, + "num_input_tokens_seen": 301591705, + "router_z_loss_clip": 0.68603516, + "router_z_loss_mlp": 0.11157227, + "step": 13981, + "time_per_iteration": 2.4831159114837646 + }, + { + "auxiliary_loss_clip": 0.01112357, + "auxiliary_loss_mlp": 0.01032601, + "balance_loss_clip": 1.04115641, + "balance_loss_mlp": 1.02098417, + "epoch": 0.8406433188035473, + "flos": 23769452782080.0, + "grad_norm": 2.4912499052542567, + "language_loss": 0.67411673, + "learning_rate": 2.6043673990409745e-07, + "loss": 0.69556636, + "num_input_tokens_seen": 301611670, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.11621094, + "step": 13982, + "time_per_iteration": 2.4702982902526855 + }, + { + "auxiliary_loss_clip": 0.01114566, + "auxiliary_loss_mlp": 0.01035858, + "balance_loss_clip": 1.0415318, + "balance_loss_mlp": 1.02279282, + "epoch": 0.8407034420562153, + "flos": 29205681667200.0, + "grad_norm": 1.6314955742291444, + "language_loss": 0.6809147, + "learning_rate": 2.602445981457324e-07, + "loss": 0.70241892, + "num_input_tokens_seen": 301632540, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.13061523, + "step": 13983, + "time_per_iteration": 3.9405436515808105 + }, + { + "auxiliary_loss_clip": 0.01113281, + "auxiliary_loss_mlp": 0.01032144, + "balance_loss_clip": 1.03789902, + "balance_loss_mlp": 1.01972246, + "epoch": 0.8407635653088832, + "flos": 26360084367360.0, + "grad_norm": 1.8029782605688778, + "language_loss": 0.79276013, + "learning_rate": 2.6005252235906684e-07, + "loss": 0.81421435, + "num_input_tokens_seen": 301651480, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.12420654, + "step": 13984, + "time_per_iteration": 2.525219440460205 + }, + { + "auxiliary_loss_clip": 0.01108207, + "auxiliary_loss_mlp": 0.01030442, + "balance_loss_clip": 1.03574741, + "balance_loss_mlp": 1.01899147, + "epoch": 0.8408236885615512, + "flos": 21468799693440.0, + "grad_norm": 1.8054202295092645, + "language_loss": 0.60677719, + "learning_rate": 2.598605125513842e-07, + "loss": 0.6281637, + "num_input_tokens_seen": 301670010, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11437988, + "step": 13985, + "time_per_iteration": 2.4206035137176514 + }, + { + "auxiliary_loss_clip": 0.01115748, + "auxiliary_loss_mlp": 0.01027109, + "balance_loss_clip": 1.04337239, + "balance_loss_mlp": 1.01549244, + "epoch": 0.8408838118142191, + "flos": 22963724853120.0, + "grad_norm": 1.9330734322493917, + "language_loss": 0.81890404, + "learning_rate": 2.5966856872996467e-07, + "loss": 0.84033263, + "num_input_tokens_seen": 301689785, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11621094, + "step": 13986, + "time_per_iteration": 2.451042890548706 + }, + { + "auxiliary_loss_clip": 0.01116946, + "auxiliary_loss_mlp": 0.01026893, + "balance_loss_clip": 1.04532349, + "balance_loss_mlp": 1.01520443, + "epoch": 0.8409439350668871, + "flos": 26800026145920.0, + "grad_norm": 1.7802533141612147, + "language_loss": 0.65734833, + "learning_rate": 2.5947669090208755e-07, + "loss": 0.67878675, + "num_input_tokens_seen": 301712225, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11694336, + "step": 13987, + "time_per_iteration": 2.503467082977295 + }, + { + "auxiliary_loss_clip": 0.01110706, + "auxiliary_loss_mlp": 0.01030341, + "balance_loss_clip": 1.04047525, + "balance_loss_mlp": 1.01908183, + "epoch": 0.841004058319555, + "flos": 26578672583040.0, + "grad_norm": 5.537693729789411, + "language_loss": 0.67151517, + "learning_rate": 2.5928487907502906e-07, + "loss": 0.69292563, + "num_input_tokens_seen": 301730955, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.11260986, + "step": 13988, + "time_per_iteration": 2.4755938053131104 + }, + { + "auxiliary_loss_clip": 0.01114045, + "auxiliary_loss_mlp": 0.01034938, + "balance_loss_clip": 1.03910458, + "balance_loss_mlp": 1.02225387, + "epoch": 0.8410641815722231, + "flos": 14501878680960.0, + "grad_norm": 4.760630017345518, + "language_loss": 0.80983341, + "learning_rate": 2.590931332560622e-07, + "loss": 0.83132327, + "num_input_tokens_seen": 301746930, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12689209, + "step": 13989, + "time_per_iteration": 2.4072067737579346 + }, + { + "auxiliary_loss_clip": 0.01109375, + "auxiliary_loss_mlp": 0.01026049, + "balance_loss_clip": 1.03813541, + "balance_loss_mlp": 1.01420569, + "epoch": 0.841124304824891, + "flos": 29166682475520.0, + "grad_norm": 1.8713069758128202, + "language_loss": 0.75125271, + "learning_rate": 2.5890145345245826e-07, + "loss": 0.77260691, + "num_input_tokens_seen": 301766945, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.11846924, + "step": 13990, + "time_per_iteration": 2.5255115032196045 + }, + { + "auxiliary_loss_clip": 0.01104052, + "auxiliary_loss_mlp": 0.0102878, + "balance_loss_clip": 1.03694069, + "balance_loss_mlp": 1.01782477, + "epoch": 0.841184428077559, + "flos": 22412028885120.0, + "grad_norm": 1.5503865466616396, + "language_loss": 0.8090468, + "learning_rate": 2.5870983967148597e-07, + "loss": 0.83037508, + "num_input_tokens_seen": 301785460, + "router_z_loss_clip": 0.67138672, + "router_z_loss_mlp": 0.10961914, + "step": 13991, + "time_per_iteration": 2.513535261154175 + }, + { + "auxiliary_loss_clip": 0.01113402, + "auxiliary_loss_mlp": 0.01030136, + "balance_loss_clip": 1.04253602, + "balance_loss_mlp": 1.01890683, + "epoch": 0.841244551330227, + "flos": 22962791099520.0, + "grad_norm": 2.000702010282502, + "language_loss": 0.70780969, + "learning_rate": 2.585182919204105e-07, + "loss": 0.72924501, + "num_input_tokens_seen": 301804180, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.11218262, + "step": 13992, + "time_per_iteration": 2.4990835189819336 + }, + { + "auxiliary_loss_clip": 0.01115687, + "auxiliary_loss_mlp": 0.01033112, + "balance_loss_clip": 1.04155087, + "balance_loss_mlp": 1.02112508, + "epoch": 0.8413046745828949, + "flos": 21032736583680.0, + "grad_norm": 3.8243257408467626, + "language_loss": 0.76625752, + "learning_rate": 2.583268102064959e-07, + "loss": 0.78774548, + "num_input_tokens_seen": 301823670, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11993408, + "step": 13993, + "time_per_iteration": 2.423332452774048 + }, + { + "auxiliary_loss_clip": 0.01123555, + "auxiliary_loss_mlp": 0.01035644, + "balance_loss_clip": 1.04325688, + "balance_loss_mlp": 1.02185118, + "epoch": 0.841364797835563, + "flos": 27052082858880.0, + "grad_norm": 1.967974345888446, + "language_loss": 0.74196678, + "learning_rate": 2.5813539453700393e-07, + "loss": 0.7635588, + "num_input_tokens_seen": 301845890, + "router_z_loss_clip": 0.80273438, + "router_z_loss_mlp": 0.13806152, + "step": 13994, + "time_per_iteration": 2.496962070465088 + }, + { + "auxiliary_loss_clip": 0.01110356, + "auxiliary_loss_mlp": 0.01030667, + "balance_loss_clip": 1.04020321, + "balance_loss_mlp": 1.01921678, + "epoch": 0.8414249210882309, + "flos": 17895688329600.0, + "grad_norm": 1.625294054144877, + "language_loss": 0.59507376, + "learning_rate": 2.5794404491919163e-07, + "loss": 0.61648405, + "num_input_tokens_seen": 301863985, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.11456299, + "step": 13995, + "time_per_iteration": 2.4484217166900635 + }, + { + "auxiliary_loss_clip": 0.01115358, + "auxiliary_loss_mlp": 0.01028858, + "balance_loss_clip": 1.04421675, + "balance_loss_mlp": 1.01646018, + "epoch": 0.8414850443408989, + "flos": 25441201618560.0, + "grad_norm": 1.6233582057308658, + "language_loss": 0.71683133, + "learning_rate": 2.577527613603163e-07, + "loss": 0.7382735, + "num_input_tokens_seen": 301882765, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.12408447, + "step": 13996, + "time_per_iteration": 2.5024540424346924 + }, + { + "auxiliary_loss_clip": 0.01107901, + "auxiliary_loss_mlp": 0.01028191, + "balance_loss_clip": 1.03671575, + "balance_loss_mlp": 1.01691389, + "epoch": 0.8415451675935668, + "flos": 23220055284480.0, + "grad_norm": 1.755703108842961, + "language_loss": 0.63965631, + "learning_rate": 2.5756154386763017e-07, + "loss": 0.66101724, + "num_input_tokens_seen": 301902720, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.112854, + "step": 13997, + "time_per_iteration": 2.4598212242126465 + }, + { + "auxiliary_loss_clip": 0.01122056, + "auxiliary_loss_mlp": 0.010323, + "balance_loss_clip": 1.046422, + "balance_loss_mlp": 1.01941323, + "epoch": 0.8416052908462348, + "flos": 18546496899840.0, + "grad_norm": 2.1073310704804022, + "language_loss": 0.82284433, + "learning_rate": 2.5737039244838565e-07, + "loss": 0.84438789, + "num_input_tokens_seen": 301921245, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.12902832, + "step": 13998, + "time_per_iteration": 2.4684579372406006 + }, + { + "auxiliary_loss_clip": 0.01112994, + "auxiliary_loss_mlp": 0.01032165, + "balance_loss_clip": 1.04074144, + "balance_loss_mlp": 1.01985645, + "epoch": 0.8416654140989027, + "flos": 26105190480000.0, + "grad_norm": 1.5578943040754378, + "language_loss": 0.79953527, + "learning_rate": 2.5717930710982984e-07, + "loss": 0.82098687, + "num_input_tokens_seen": 301942320, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.12316895, + "step": 13999, + "time_per_iteration": 2.4639108180999756 + }, + { + "auxiliary_loss_clip": 0.01119539, + "auxiliary_loss_mlp": 0.010314, + "balance_loss_clip": 1.04447138, + "balance_loss_mlp": 1.01882982, + "epoch": 0.8417255373515707, + "flos": 26433270328320.0, + "grad_norm": 3.0523991511197464, + "language_loss": 0.66926432, + "learning_rate": 2.569882878592096e-07, + "loss": 0.69077373, + "num_input_tokens_seen": 301963110, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.12573242, + "step": 14000, + "time_per_iteration": 3.9636919498443604 + }, + { + "auxiliary_loss_clip": 0.01117371, + "auxiliary_loss_mlp": 0.01027165, + "balance_loss_clip": 1.0435046, + "balance_loss_mlp": 1.01498818, + "epoch": 0.8417856606042387, + "flos": 24717745791360.0, + "grad_norm": 1.8078602261760315, + "language_loss": 0.79627132, + "learning_rate": 2.5679733470376885e-07, + "loss": 0.81771672, + "num_input_tokens_seen": 301984915, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.12188721, + "step": 14001, + "time_per_iteration": 2.4894590377807617 + }, + { + "auxiliary_loss_clip": 0.01112371, + "auxiliary_loss_mlp": 0.01031692, + "balance_loss_clip": 1.03975737, + "balance_loss_mlp": 1.01850188, + "epoch": 0.8418457838569067, + "flos": 20850849089280.0, + "grad_norm": 1.6295816696281655, + "language_loss": 0.78767586, + "learning_rate": 2.5660644765074703e-07, + "loss": 0.80911648, + "num_input_tokens_seen": 302004095, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.13195801, + "step": 14002, + "time_per_iteration": 2.4097208976745605 + }, + { + "auxiliary_loss_clip": 0.01123537, + "auxiliary_loss_mlp": 0.01027524, + "balance_loss_clip": 1.04894066, + "balance_loss_mlp": 1.01529872, + "epoch": 0.8419059071095746, + "flos": 28660629715200.0, + "grad_norm": 1.5405170909058172, + "language_loss": 0.78313738, + "learning_rate": 2.5641562670738334e-07, + "loss": 0.80464798, + "num_input_tokens_seen": 302027250, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.12213135, + "step": 14003, + "time_per_iteration": 2.5099895000457764 + }, + { + "auxiliary_loss_clip": 0.0112748, + "auxiliary_loss_mlp": 0.01026046, + "balance_loss_clip": 1.0544641, + "balance_loss_mlp": 1.01444721, + "epoch": 0.8419660303622426, + "flos": 21653596189440.0, + "grad_norm": 3.5881436631707158, + "language_loss": 0.65995312, + "learning_rate": 2.5622487188091436e-07, + "loss": 0.68148839, + "num_input_tokens_seen": 302046950, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11602783, + "step": 14004, + "time_per_iteration": 2.5393950939178467 + }, + { + "auxiliary_loss_clip": 0.01117026, + "auxiliary_loss_mlp": 0.01031437, + "balance_loss_clip": 1.0442996, + "balance_loss_mlp": 1.01911104, + "epoch": 0.8420261536149106, + "flos": 25301114576640.0, + "grad_norm": 2.26356324780971, + "language_loss": 0.76082951, + "learning_rate": 2.560341831785724e-07, + "loss": 0.78231418, + "num_input_tokens_seen": 302065470, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.12329102, + "step": 14005, + "time_per_iteration": 2.4670956134796143 + }, + { + "auxiliary_loss_clip": 0.01115094, + "auxiliary_loss_mlp": 0.01029823, + "balance_loss_clip": 1.042521, + "balance_loss_mlp": 1.01756811, + "epoch": 0.8420862768675785, + "flos": 18763397176320.0, + "grad_norm": 1.7730242085746912, + "language_loss": 0.77316284, + "learning_rate": 2.5584356060758906e-07, + "loss": 0.79461205, + "num_input_tokens_seen": 302083190, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.12249756, + "step": 14006, + "time_per_iteration": 2.410100221633911 + }, + { + "auxiliary_loss_clip": 0.0112057, + "auxiliary_loss_mlp": 0.01033251, + "balance_loss_clip": 1.04843998, + "balance_loss_mlp": 1.02147937, + "epoch": 0.8421464001202466, + "flos": 18328052338560.0, + "grad_norm": 1.9973478652264403, + "language_loss": 0.77462828, + "learning_rate": 2.556530041751932e-07, + "loss": 0.79616642, + "num_input_tokens_seen": 302098820, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11761475, + "step": 14007, + "time_per_iteration": 2.4905216693878174 + }, + { + "auxiliary_loss_clip": 0.01115269, + "auxiliary_loss_mlp": 0.01031731, + "balance_loss_clip": 1.03984559, + "balance_loss_mlp": 1.01926816, + "epoch": 0.8422065233729145, + "flos": 31537181560320.0, + "grad_norm": 1.871809669594422, + "language_loss": 0.66024965, + "learning_rate": 2.554625138886102e-07, + "loss": 0.6817196, + "num_input_tokens_seen": 302117075, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.12445068, + "step": 14008, + "time_per_iteration": 2.5696611404418945 + }, + { + "auxiliary_loss_clip": 0.01041798, + "auxiliary_loss_mlp": 0.01005174, + "balance_loss_clip": 1.01705217, + "balance_loss_mlp": 1.00373757, + "epoch": 0.8422666466255825, + "flos": 64298128510080.0, + "grad_norm": 0.719005888380301, + "language_loss": 0.5688163, + "learning_rate": 2.552720897550631e-07, + "loss": 0.58928609, + "num_input_tokens_seen": 302179735, + "router_z_loss_clip": 0.24755859, + "router_z_loss_mlp": 0.01435852, + "step": 14009, + "time_per_iteration": 3.1259796619415283 + }, + { + "auxiliary_loss_clip": 0.01110255, + "auxiliary_loss_mlp": 0.01030467, + "balance_loss_clip": 1.03790522, + "balance_loss_mlp": 1.01967871, + "epoch": 0.8423267698782504, + "flos": 24316731377280.0, + "grad_norm": 2.0210945243791425, + "language_loss": 0.78055084, + "learning_rate": 2.5508173178177304e-07, + "loss": 0.80195808, + "num_input_tokens_seen": 302202055, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.10791016, + "step": 14010, + "time_per_iteration": 2.475193500518799 + }, + { + "auxiliary_loss_clip": 0.01119527, + "auxiliary_loss_mlp": 0.01034243, + "balance_loss_clip": 1.04447317, + "balance_loss_mlp": 1.02169633, + "epoch": 0.8423868931309184, + "flos": 18296092212480.0, + "grad_norm": 1.6713298592813979, + "language_loss": 0.72429788, + "learning_rate": 2.548914399759592e-07, + "loss": 0.74583554, + "num_input_tokens_seen": 302221360, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.12548828, + "step": 14011, + "time_per_iteration": 2.464385986328125 + }, + { + "auxiliary_loss_clip": 0.0110607, + "auxiliary_loss_mlp": 0.01037933, + "balance_loss_clip": 1.03482807, + "balance_loss_mlp": 1.02631056, + "epoch": 0.8424470163835863, + "flos": 23550218121600.0, + "grad_norm": 1.9632854261635688, + "language_loss": 0.84507632, + "learning_rate": 2.5470121434483636e-07, + "loss": 0.86651635, + "num_input_tokens_seen": 302240715, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.11627197, + "step": 14012, + "time_per_iteration": 2.4728267192840576 + }, + { + "auxiliary_loss_clip": 0.01102026, + "auxiliary_loss_mlp": 0.01029697, + "balance_loss_clip": 1.03589916, + "balance_loss_mlp": 1.01964784, + "epoch": 0.8425071396362543, + "flos": 23769488695680.0, + "grad_norm": 1.536617784299576, + "language_loss": 0.68011302, + "learning_rate": 2.5451105489561884e-07, + "loss": 0.7014302, + "num_input_tokens_seen": 302260950, + "router_z_loss_clip": 0.66113281, + "router_z_loss_mlp": 0.1005249, + "step": 14013, + "time_per_iteration": 2.512286424636841 + }, + { + "auxiliary_loss_clip": 0.01118866, + "auxiliary_loss_mlp": 0.01030984, + "balance_loss_clip": 1.04111028, + "balance_loss_mlp": 1.01809144, + "epoch": 0.8425672628889223, + "flos": 16178906816640.0, + "grad_norm": 3.0984258633105646, + "language_loss": 0.78996658, + "learning_rate": 2.5432096163551644e-07, + "loss": 0.81146508, + "num_input_tokens_seen": 302277500, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.12896729, + "step": 14014, + "time_per_iteration": 2.5711135864257812 + }, + { + "auxiliary_loss_clip": 0.01114216, + "auxiliary_loss_mlp": 0.01030683, + "balance_loss_clip": 1.04082608, + "balance_loss_mlp": 1.01929903, + "epoch": 0.8426273861415903, + "flos": 23149131880320.0, + "grad_norm": 1.8737949833492982, + "language_loss": 0.6771754, + "learning_rate": 2.5413093457173884e-07, + "loss": 0.69862443, + "num_input_tokens_seen": 302297930, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11383057, + "step": 14015, + "time_per_iteration": 3.939906120300293 + }, + { + "auxiliary_loss_clip": 0.01115438, + "auxiliary_loss_mlp": 0.01027749, + "balance_loss_clip": 1.0430541, + "balance_loss_mlp": 1.01528525, + "epoch": 0.8426875093942582, + "flos": 17457757712640.0, + "grad_norm": 2.7583437674328826, + "language_loss": 0.76065487, + "learning_rate": 2.5394097371149036e-07, + "loss": 0.78208673, + "num_input_tokens_seen": 302315735, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.12469482, + "step": 14016, + "time_per_iteration": 2.4472029209136963 + }, + { + "auxiliary_loss_clip": 0.01116189, + "auxiliary_loss_mlp": 0.01029441, + "balance_loss_clip": 1.04467404, + "balance_loss_mlp": 1.01794362, + "epoch": 0.8427476326469262, + "flos": 19640551299840.0, + "grad_norm": 2.1580682763974948, + "language_loss": 0.79511786, + "learning_rate": 2.5375107906197544e-07, + "loss": 0.81657422, + "num_input_tokens_seen": 302332790, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.1151123, + "step": 14017, + "time_per_iteration": 2.5548455715179443 + }, + { + "auxiliary_loss_clip": 0.01113842, + "auxiliary_loss_mlp": 0.01030832, + "balance_loss_clip": 1.04251409, + "balance_loss_mlp": 1.01956654, + "epoch": 0.8428077558995941, + "flos": 11941160146560.0, + "grad_norm": 2.138245684481029, + "language_loss": 0.6252625, + "learning_rate": 2.5356125063039525e-07, + "loss": 0.6467092, + "num_input_tokens_seen": 302346490, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.11254883, + "step": 14018, + "time_per_iteration": 2.391826868057251 + }, + { + "auxiliary_loss_clip": 0.01116476, + "auxiliary_loss_mlp": 0.01030417, + "balance_loss_clip": 1.04368496, + "balance_loss_mlp": 1.01919913, + "epoch": 0.8428678791522621, + "flos": 10451729767680.0, + "grad_norm": 1.9851739320397328, + "language_loss": 0.79747057, + "learning_rate": 2.5337148842394687e-07, + "loss": 0.81893945, + "num_input_tokens_seen": 302363235, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11224365, + "step": 14019, + "time_per_iteration": 2.476085662841797 + }, + { + "auxiliary_loss_clip": 0.01111964, + "auxiliary_loss_mlp": 0.0103145, + "balance_loss_clip": 1.03844237, + "balance_loss_mlp": 1.01961243, + "epoch": 0.8429280024049302, + "flos": 28767248259840.0, + "grad_norm": 1.7749481600379473, + "language_loss": 0.78802305, + "learning_rate": 2.531817924498265e-07, + "loss": 0.80945712, + "num_input_tokens_seen": 302383270, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11846924, + "step": 14020, + "time_per_iteration": 3.993533134460449 + }, + { + "auxiliary_loss_clip": 0.01110009, + "auxiliary_loss_mlp": 0.01029322, + "balance_loss_clip": 1.03773856, + "balance_loss_mlp": 1.01802695, + "epoch": 0.8429881256575981, + "flos": 19537093152000.0, + "grad_norm": 1.7319769558587985, + "language_loss": 0.71481824, + "learning_rate": 2.5299216271522805e-07, + "loss": 0.73621154, + "num_input_tokens_seen": 302401355, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11303711, + "step": 14021, + "time_per_iteration": 2.4103927612304688 + }, + { + "auxiliary_loss_clip": 0.01114194, + "auxiliary_loss_mlp": 0.01039725, + "balance_loss_clip": 1.04187179, + "balance_loss_mlp": 1.0275774, + "epoch": 0.8430482489102661, + "flos": 24790931752320.0, + "grad_norm": 1.9659892261406973, + "language_loss": 0.6975224, + "learning_rate": 2.5280259922734125e-07, + "loss": 0.71906161, + "num_input_tokens_seen": 302419515, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.12158203, + "step": 14022, + "time_per_iteration": 2.5335798263549805 + }, + { + "auxiliary_loss_clip": 0.01118486, + "auxiliary_loss_mlp": 0.0103982, + "balance_loss_clip": 1.04416764, + "balance_loss_mlp": 1.02530038, + "epoch": 0.843108372162934, + "flos": 21544248211200.0, + "grad_norm": 1.882270148549512, + "language_loss": 0.72233009, + "learning_rate": 2.526131019933553e-07, + "loss": 0.74391323, + "num_input_tokens_seen": 302438280, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.14501953, + "step": 14023, + "time_per_iteration": 2.4785280227661133 + }, + { + "auxiliary_loss_clip": 0.0111371, + "auxiliary_loss_mlp": 0.01035, + "balance_loss_clip": 1.04307246, + "balance_loss_mlp": 1.02186871, + "epoch": 0.843168495415602, + "flos": 24608792862720.0, + "grad_norm": 1.4263004545667957, + "language_loss": 0.66972262, + "learning_rate": 2.524236710204559e-07, + "loss": 0.69120967, + "num_input_tokens_seen": 302460860, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.13128662, + "step": 14024, + "time_per_iteration": 2.6020641326904297 + }, + { + "auxiliary_loss_clip": 0.01107188, + "auxiliary_loss_mlp": 0.01030404, + "balance_loss_clip": 1.03726816, + "balance_loss_mlp": 1.01705253, + "epoch": 0.8432286186682699, + "flos": 15122738286720.0, + "grad_norm": 1.986757402203235, + "language_loss": 0.8070932, + "learning_rate": 2.522343063158261e-07, + "loss": 0.82846916, + "num_input_tokens_seen": 302476980, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.13348389, + "step": 14025, + "time_per_iteration": 2.441878318786621 + }, + { + "auxiliary_loss_clip": 0.0111035, + "auxiliary_loss_mlp": 0.01031714, + "balance_loss_clip": 1.04033041, + "balance_loss_mlp": 1.0217123, + "epoch": 0.843288741920938, + "flos": 20301882554880.0, + "grad_norm": 1.6148542521671128, + "language_loss": 0.77623761, + "learning_rate": 2.5204500788664606e-07, + "loss": 0.79765826, + "num_input_tokens_seen": 302496380, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.10003662, + "step": 14026, + "time_per_iteration": 2.5481786727905273 + }, + { + "auxiliary_loss_clip": 0.01115141, + "auxiliary_loss_mlp": 0.01032981, + "balance_loss_clip": 1.04495347, + "balance_loss_mlp": 1.02179909, + "epoch": 0.8433488651736059, + "flos": 23332096782720.0, + "grad_norm": 1.8807292879355395, + "language_loss": 0.82889247, + "learning_rate": 2.518557757400945e-07, + "loss": 0.85037369, + "num_input_tokens_seen": 302516845, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.11187744, + "step": 14027, + "time_per_iteration": 4.034904718399048 + }, + { + "auxiliary_loss_clip": 0.01112811, + "auxiliary_loss_mlp": 0.01035875, + "balance_loss_clip": 1.04058206, + "balance_loss_mlp": 1.02388239, + "epoch": 0.8434089884262739, + "flos": 39458105844480.0, + "grad_norm": 1.841482616836198, + "language_loss": 0.56684804, + "learning_rate": 2.5166660988334754e-07, + "loss": 0.58833492, + "num_input_tokens_seen": 302538865, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11993408, + "step": 14028, + "time_per_iteration": 2.5828890800476074 + }, + { + "auxiliary_loss_clip": 0.01113736, + "auxiliary_loss_mlp": 0.01028367, + "balance_loss_clip": 1.04203176, + "balance_loss_mlp": 1.01723254, + "epoch": 0.8434691116789418, + "flos": 23768842250880.0, + "grad_norm": 1.7340277564934858, + "language_loss": 0.63637149, + "learning_rate": 2.51477510323578e-07, + "loss": 0.65779251, + "num_input_tokens_seen": 302557970, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11132812, + "step": 14029, + "time_per_iteration": 2.453216552734375 + }, + { + "auxiliary_loss_clip": 0.0111077, + "auxiliary_loss_mlp": 0.01031646, + "balance_loss_clip": 1.04118383, + "balance_loss_mlp": 1.02102458, + "epoch": 0.8435292349316098, + "flos": 22671411972480.0, + "grad_norm": 1.5631675052044054, + "language_loss": 0.75075877, + "learning_rate": 2.51288477067956e-07, + "loss": 0.77218282, + "num_input_tokens_seen": 302578915, + "router_z_loss_clip": 0.69580078, + "router_z_loss_mlp": 0.10620117, + "step": 14030, + "time_per_iteration": 2.5257298946380615 + }, + { + "auxiliary_loss_clip": 0.01114262, + "auxiliary_loss_mlp": 0.01028133, + "balance_loss_clip": 1.04418755, + "balance_loss_mlp": 1.01699901, + "epoch": 0.8435893581842777, + "flos": 18843622202880.0, + "grad_norm": 1.9188732892746772, + "language_loss": 0.8361932, + "learning_rate": 2.510995101236502e-07, + "loss": 0.85761714, + "num_input_tokens_seen": 302596300, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.11138916, + "step": 14031, + "time_per_iteration": 2.47873854637146 + }, + { + "auxiliary_loss_clip": 0.0111613, + "auxiliary_loss_mlp": 0.0102623, + "balance_loss_clip": 1.04542387, + "balance_loss_mlp": 1.01566231, + "epoch": 0.8436494814369457, + "flos": 20704225772160.0, + "grad_norm": 1.7205369541115814, + "language_loss": 0.80369294, + "learning_rate": 2.509106094978266e-07, + "loss": 0.82511652, + "num_input_tokens_seen": 302614975, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.10571289, + "step": 14032, + "time_per_iteration": 2.4306347370147705 + }, + { + "auxiliary_loss_clip": 0.0111121, + "auxiliary_loss_mlp": 0.01030981, + "balance_loss_clip": 1.03893209, + "balance_loss_mlp": 1.01801729, + "epoch": 0.8437096046896138, + "flos": 22674177319680.0, + "grad_norm": 1.441069075278378, + "language_loss": 0.75358599, + "learning_rate": 2.507217751976478e-07, + "loss": 0.7750079, + "num_input_tokens_seen": 302636415, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.12976074, + "step": 14033, + "time_per_iteration": 2.4503681659698486 + }, + { + "auxiliary_loss_clip": 0.01109789, + "auxiliary_loss_mlp": 0.01032364, + "balance_loss_clip": 1.03859472, + "balance_loss_mlp": 1.0217309, + "epoch": 0.8437697279422817, + "flos": 16180127879040.0, + "grad_norm": 1.8920145247782718, + "language_loss": 0.83612835, + "learning_rate": 2.505330072302743e-07, + "loss": 0.85754991, + "num_input_tokens_seen": 302653605, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.10632324, + "step": 14034, + "time_per_iteration": 2.39064884185791 + }, + { + "auxiliary_loss_clip": 0.0111935, + "auxiliary_loss_mlp": 0.01024849, + "balance_loss_clip": 1.04667759, + "balance_loss_mlp": 1.01343513, + "epoch": 0.8438298511949497, + "flos": 28765847629440.0, + "grad_norm": 2.124059193085324, + "language_loss": 0.78417623, + "learning_rate": 2.503443056028656e-07, + "loss": 0.80561829, + "num_input_tokens_seen": 302673965, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11407471, + "step": 14035, + "time_per_iteration": 2.5698189735412598 + }, + { + "auxiliary_loss_clip": 0.01117944, + "auxiliary_loss_mlp": 0.01029087, + "balance_loss_clip": 1.04656482, + "balance_loss_mlp": 1.01779771, + "epoch": 0.8438899744476176, + "flos": 33724284779520.0, + "grad_norm": 1.4063604663353038, + "language_loss": 0.72237682, + "learning_rate": 2.501556703225751e-07, + "loss": 0.74384707, + "num_input_tokens_seen": 302695560, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.112854, + "step": 14036, + "time_per_iteration": 2.5680465698242188 + }, + { + "auxiliary_loss_clip": 0.01109257, + "auxiliary_loss_mlp": 0.01023118, + "balance_loss_clip": 1.04273963, + "balance_loss_mlp": 1.01324785, + "epoch": 0.8439500977002856, + "flos": 25110787386240.0, + "grad_norm": 1.765236710933104, + "language_loss": 0.69445848, + "learning_rate": 2.49967101396557e-07, + "loss": 0.71578228, + "num_input_tokens_seen": 302713480, + "router_z_loss_clip": 0.66455078, + "router_z_loss_mlp": 0.09881592, + "step": 14037, + "time_per_iteration": 2.5260300636291504 + }, + { + "auxiliary_loss_clip": 0.0111196, + "auxiliary_loss_mlp": 0.01024659, + "balance_loss_clip": 1.04109502, + "balance_loss_mlp": 1.01406765, + "epoch": 0.8440102209529535, + "flos": 32850362880000.0, + "grad_norm": 1.6509517696264497, + "language_loss": 0.69164234, + "learning_rate": 2.4977859883196227e-07, + "loss": 0.71300852, + "num_input_tokens_seen": 302736860, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.10595703, + "step": 14038, + "time_per_iteration": 2.5617096424102783 + }, + { + "auxiliary_loss_clip": 0.0111591, + "auxiliary_loss_mlp": 0.01030179, + "balance_loss_clip": 1.04405117, + "balance_loss_mlp": 1.01889539, + "epoch": 0.8440703442056215, + "flos": 23730202195200.0, + "grad_norm": 1.604480328307267, + "language_loss": 0.76449466, + "learning_rate": 2.49590162635938e-07, + "loss": 0.78595549, + "num_input_tokens_seen": 302757745, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11291504, + "step": 14039, + "time_per_iteration": 2.5965981483459473 + }, + { + "auxiliary_loss_clip": 0.01120176, + "auxiliary_loss_mlp": 0.01028515, + "balance_loss_clip": 1.04360938, + "balance_loss_mlp": 1.01674271, + "epoch": 0.8441304674582895, + "flos": 20193719725440.0, + "grad_norm": 2.281536651150995, + "language_loss": 0.7915076, + "learning_rate": 2.4940179281563046e-07, + "loss": 0.81299454, + "num_input_tokens_seen": 302774885, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.11761475, + "step": 14040, + "time_per_iteration": 2.5109376907348633 + }, + { + "auxiliary_loss_clip": 0.01122973, + "auxiliary_loss_mlp": 0.0103409, + "balance_loss_clip": 1.04910374, + "balance_loss_mlp": 1.02166283, + "epoch": 0.8441905907109575, + "flos": 20219897761920.0, + "grad_norm": 1.9291423320297172, + "language_loss": 0.69508761, + "learning_rate": 2.492134893781821e-07, + "loss": 0.71665823, + "num_input_tokens_seen": 302791035, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.12426758, + "step": 14041, + "time_per_iteration": 2.500830888748169 + }, + { + "auxiliary_loss_clip": 0.01108204, + "auxiliary_loss_mlp": 0.01032308, + "balance_loss_clip": 1.03652978, + "balance_loss_mlp": 1.0206852, + "epoch": 0.8442507139636254, + "flos": 13516453987200.0, + "grad_norm": 2.442509818561997, + "language_loss": 0.69181693, + "learning_rate": 2.490252523307341e-07, + "loss": 0.71322203, + "num_input_tokens_seen": 302808650, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11627197, + "step": 14042, + "time_per_iteration": 2.496737480163574 + }, + { + "auxiliary_loss_clip": 0.01111184, + "auxiliary_loss_mlp": 0.01029898, + "balance_loss_clip": 1.04232049, + "balance_loss_mlp": 1.01922917, + "epoch": 0.8443108372162934, + "flos": 18220212731520.0, + "grad_norm": 1.7944406836412825, + "language_loss": 0.74667537, + "learning_rate": 2.4883708168042373e-07, + "loss": 0.76808619, + "num_input_tokens_seen": 302824605, + "router_z_loss_clip": 0.68847656, + "router_z_loss_mlp": 0.10668945, + "step": 14043, + "time_per_iteration": 4.050813436508179 + }, + { + "auxiliary_loss_clip": 0.01113909, + "auxiliary_loss_mlp": 0.01026587, + "balance_loss_clip": 1.04295754, + "balance_loss_mlp": 1.0157454, + "epoch": 0.8443709604689613, + "flos": 16105110324480.0, + "grad_norm": 2.0462780462021057, + "language_loss": 0.7193135, + "learning_rate": 2.486489774343865e-07, + "loss": 0.74071842, + "num_input_tokens_seen": 302840170, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.10852051, + "step": 14044, + "time_per_iteration": 2.4804747104644775 + }, + { + "auxiliary_loss_clip": 0.0110732, + "auxiliary_loss_mlp": 0.01025101, + "balance_loss_clip": 1.03813589, + "balance_loss_mlp": 1.01397276, + "epoch": 0.8444310837216293, + "flos": 18512130562560.0, + "grad_norm": 1.6175046109303528, + "language_loss": 0.7501142, + "learning_rate": 2.484609395997559e-07, + "loss": 0.77143836, + "num_input_tokens_seen": 302858320, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 0.11120605, + "step": 14045, + "time_per_iteration": 2.514373779296875 + }, + { + "auxiliary_loss_clip": 0.01111824, + "auxiliary_loss_mlp": 0.01028555, + "balance_loss_clip": 1.04096711, + "balance_loss_mlp": 1.01753998, + "epoch": 0.8444912069742974, + "flos": 14939845211520.0, + "grad_norm": 2.1234907212790515, + "language_loss": 0.78711367, + "learning_rate": 2.4827296818366216e-07, + "loss": 0.80851746, + "num_input_tokens_seen": 302875255, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.11016846, + "step": 14046, + "time_per_iteration": 2.3953006267547607 + }, + { + "auxiliary_loss_clip": 0.01117047, + "auxiliary_loss_mlp": 0.01024111, + "balance_loss_clip": 1.04413462, + "balance_loss_mlp": 1.01214218, + "epoch": 0.8445513302269653, + "flos": 20120318282880.0, + "grad_norm": 1.9787050984384027, + "language_loss": 0.78481686, + "learning_rate": 2.4808506319323255e-07, + "loss": 0.80622852, + "num_input_tokens_seen": 302894690, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11962891, + "step": 14047, + "time_per_iteration": 2.496929407119751 + }, + { + "auxiliary_loss_clip": 0.01120605, + "auxiliary_loss_mlp": 0.01039921, + "balance_loss_clip": 1.04610419, + "balance_loss_mlp": 1.02746415, + "epoch": 0.8446114534796333, + "flos": 31170928533120.0, + "grad_norm": 1.7991684658611147, + "language_loss": 0.7259472, + "learning_rate": 2.478972246355935e-07, + "loss": 0.74755245, + "num_input_tokens_seen": 302912405, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.12457275, + "step": 14048, + "time_per_iteration": 2.497499704360962 + }, + { + "auxiliary_loss_clip": 0.01113053, + "auxiliary_loss_mlp": 0.01031198, + "balance_loss_clip": 1.0419116, + "balance_loss_mlp": 1.01967061, + "epoch": 0.8446715767323012, + "flos": 23948323534080.0, + "grad_norm": 1.5910604616024837, + "language_loss": 0.73351049, + "learning_rate": 2.477094525178667e-07, + "loss": 0.75495303, + "num_input_tokens_seen": 302932525, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.11529541, + "step": 14049, + "time_per_iteration": 2.4602627754211426 + }, + { + "auxiliary_loss_clip": 0.01043246, + "auxiliary_loss_mlp": 0.01000682, + "balance_loss_clip": 1.01784837, + "balance_loss_mlp": 0.99927247, + "epoch": 0.8447316999849692, + "flos": 67984897484160.0, + "grad_norm": 0.8014994071640642, + "language_loss": 0.60694194, + "learning_rate": 2.475217468471729e-07, + "loss": 0.62738127, + "num_input_tokens_seen": 302991285, + "router_z_loss_clip": 0.25341797, + "router_z_loss_mlp": 0.01408386, + "step": 14050, + "time_per_iteration": 3.0344998836517334 + }, + { + "auxiliary_loss_clip": 0.01115041, + "auxiliary_loss_mlp": 0.01030817, + "balance_loss_clip": 1.04473186, + "balance_loss_mlp": 1.01874733, + "epoch": 0.8447918232376371, + "flos": 22418924296320.0, + "grad_norm": 2.3759989017880887, + "language_loss": 0.72209477, + "learning_rate": 2.473341076306303e-07, + "loss": 0.74355334, + "num_input_tokens_seen": 303009515, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.12072754, + "step": 14051, + "time_per_iteration": 2.4340574741363525 + }, + { + "auxiliary_loss_clip": 0.01113191, + "auxiliary_loss_mlp": 0.01025744, + "balance_loss_clip": 1.04159212, + "balance_loss_mlp": 1.01446724, + "epoch": 0.8448519464903052, + "flos": 23694147918720.0, + "grad_norm": 2.017720699048936, + "language_loss": 0.74339068, + "learning_rate": 2.471465348753547e-07, + "loss": 0.76477998, + "num_input_tokens_seen": 303026905, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11273193, + "step": 14052, + "time_per_iteration": 2.456310272216797 + }, + { + "auxiliary_loss_clip": 0.01106551, + "auxiliary_loss_mlp": 0.01029006, + "balance_loss_clip": 1.03943753, + "balance_loss_mlp": 1.0184437, + "epoch": 0.8449120697429731, + "flos": 13735904129280.0, + "grad_norm": 1.7248897485130301, + "language_loss": 0.74219358, + "learning_rate": 2.469590285884575e-07, + "loss": 0.76354909, + "num_input_tokens_seen": 303045245, + "router_z_loss_clip": 0.67138672, + "router_z_loss_mlp": 0.10559082, + "step": 14053, + "time_per_iteration": 2.4146769046783447 + }, + { + "auxiliary_loss_clip": 0.01118616, + "auxiliary_loss_mlp": 0.01032994, + "balance_loss_clip": 1.04340887, + "balance_loss_mlp": 1.0210191, + "epoch": 0.8449721929956411, + "flos": 20886795624960.0, + "grad_norm": 2.1995978733234596, + "language_loss": 0.74147129, + "learning_rate": 2.467715887770494e-07, + "loss": 0.76298738, + "num_input_tokens_seen": 303065205, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.11981201, + "step": 14054, + "time_per_iteration": 2.4099011421203613 + }, + { + "auxiliary_loss_clip": 0.01122149, + "auxiliary_loss_mlp": 0.01028907, + "balance_loss_clip": 1.04762542, + "balance_loss_mlp": 1.01714122, + "epoch": 0.845032316248309, + "flos": 33216939129600.0, + "grad_norm": 2.5606173631330034, + "language_loss": 0.78487504, + "learning_rate": 2.4658421544823895e-07, + "loss": 0.80638558, + "num_input_tokens_seen": 303088250, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11761475, + "step": 14055, + "time_per_iteration": 2.6344032287597656 + }, + { + "auxiliary_loss_clip": 0.01116666, + "auxiliary_loss_mlp": 0.01030238, + "balance_loss_clip": 1.04671907, + "balance_loss_mlp": 1.01919317, + "epoch": 0.845092439500977, + "flos": 23585230903680.0, + "grad_norm": 1.8527516903203696, + "language_loss": 0.73028064, + "learning_rate": 2.463969086091302e-07, + "loss": 0.75174963, + "num_input_tokens_seen": 303109280, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.1104126, + "step": 14056, + "time_per_iteration": 2.5277884006500244 + }, + { + "auxiliary_loss_clip": 0.01124864, + "auxiliary_loss_mlp": 0.01030928, + "balance_loss_clip": 1.04965532, + "balance_loss_mlp": 1.01903701, + "epoch": 0.8451525627536449, + "flos": 13333920048000.0, + "grad_norm": 2.443368469883425, + "language_loss": 0.67820096, + "learning_rate": 2.4620966826682686e-07, + "loss": 0.69975889, + "num_input_tokens_seen": 303126075, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.11895752, + "step": 14057, + "time_per_iteration": 2.5144805908203125 + }, + { + "auxiliary_loss_clip": 0.01117906, + "auxiliary_loss_mlp": 0.01028477, + "balance_loss_clip": 1.04654789, + "balance_loss_mlp": 1.01719368, + "epoch": 0.8452126860063129, + "flos": 27817985583360.0, + "grad_norm": 1.6589810661755517, + "language_loss": 0.77319831, + "learning_rate": 2.460224944284284e-07, + "loss": 0.79466212, + "num_input_tokens_seen": 303146920, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.11273193, + "step": 14058, + "time_per_iteration": 4.005910634994507 + }, + { + "auxiliary_loss_clip": 0.01114315, + "auxiliary_loss_mlp": 0.01029448, + "balance_loss_clip": 1.04219341, + "balance_loss_mlp": 1.01839697, + "epoch": 0.845272809258981, + "flos": 27124694202240.0, + "grad_norm": 1.5304257839165452, + "language_loss": 0.69896519, + "learning_rate": 2.45835387101033e-07, + "loss": 0.72040284, + "num_input_tokens_seen": 303167885, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.1105957, + "step": 14059, + "time_per_iteration": 2.497392177581787 + }, + { + "auxiliary_loss_clip": 0.01119784, + "auxiliary_loss_mlp": 0.01034458, + "balance_loss_clip": 1.04436076, + "balance_loss_mlp": 1.02213728, + "epoch": 0.8453329325116489, + "flos": 18332577452160.0, + "grad_norm": 1.9564667811193106, + "language_loss": 0.57292175, + "learning_rate": 2.4564834629173516e-07, + "loss": 0.59446418, + "num_input_tokens_seen": 303185000, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12329102, + "step": 14060, + "time_per_iteration": 2.4618325233459473 + }, + { + "auxiliary_loss_clip": 0.01116433, + "auxiliary_loss_mlp": 0.01032018, + "balance_loss_clip": 1.04183435, + "balance_loss_mlp": 1.01902997, + "epoch": 0.8453930557643169, + "flos": 22675254727680.0, + "grad_norm": 1.5861221836574597, + "language_loss": 0.75974494, + "learning_rate": 2.454613720076277e-07, + "loss": 0.78122944, + "num_input_tokens_seen": 303205210, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.12982178, + "step": 14061, + "time_per_iteration": 2.434497117996216 + }, + { + "auxiliary_loss_clip": 0.01123539, + "auxiliary_loss_mlp": 0.01029136, + "balance_loss_clip": 1.04915023, + "balance_loss_mlp": 1.01659477, + "epoch": 0.8454531790169848, + "flos": 22487261921280.0, + "grad_norm": 5.151095675491781, + "language_loss": 0.71038032, + "learning_rate": 2.452744642558013e-07, + "loss": 0.73190701, + "num_input_tokens_seen": 303224655, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.12542725, + "step": 14062, + "time_per_iteration": 2.455336093902588 + }, + { + "auxiliary_loss_clip": 0.01046823, + "auxiliary_loss_mlp": 0.01003273, + "balance_loss_clip": 1.02172613, + "balance_loss_mlp": 1.00180066, + "epoch": 0.8455133022696528, + "flos": 58277848481280.0, + "grad_norm": 0.6321421629641069, + "language_loss": 0.52652967, + "learning_rate": 2.450876230433432e-07, + "loss": 0.54703063, + "num_input_tokens_seen": 303289645, + "router_z_loss_clip": 0.2512207, + "router_z_loss_mlp": 0.01470947, + "step": 14063, + "time_per_iteration": 4.587769031524658 + }, + { + "auxiliary_loss_clip": 0.0110766, + "auxiliary_loss_mlp": 0.0102266, + "balance_loss_clip": 1.03992867, + "balance_loss_mlp": 1.0126102, + "epoch": 0.8455734255223207, + "flos": 21361283308800.0, + "grad_norm": 2.051596149416633, + "language_loss": 0.82475746, + "learning_rate": 2.449008483773378e-07, + "loss": 0.84606069, + "num_input_tokens_seen": 303308350, + "router_z_loss_clip": 0.67724609, + "router_z_loss_mlp": 0.10040283, + "step": 14064, + "time_per_iteration": 2.4450137615203857 + }, + { + "auxiliary_loss_clip": 0.01115087, + "auxiliary_loss_mlp": 0.01035453, + "balance_loss_clip": 1.04149497, + "balance_loss_mlp": 1.02151775, + "epoch": 0.8456335487749888, + "flos": 20449260057600.0, + "grad_norm": 2.0852162591568044, + "language_loss": 0.72780764, + "learning_rate": 2.447141402648685e-07, + "loss": 0.74931312, + "num_input_tokens_seen": 303325230, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.13928223, + "step": 14065, + "time_per_iteration": 2.448451042175293 + }, + { + "auxiliary_loss_clip": 0.01114625, + "auxiliary_loss_mlp": 0.01027205, + "balance_loss_clip": 1.04539716, + "balance_loss_mlp": 1.01621413, + "epoch": 0.8456936720276567, + "flos": 28840901097600.0, + "grad_norm": 1.451667962739559, + "language_loss": 0.77665067, + "learning_rate": 2.445274987130146e-07, + "loss": 0.798069, + "num_input_tokens_seen": 303345810, + "router_z_loss_clip": 0.69287109, + "router_z_loss_mlp": 0.10992432, + "step": 14066, + "time_per_iteration": 2.5228381156921387 + }, + { + "auxiliary_loss_clip": 0.01116596, + "auxiliary_loss_mlp": 0.01028181, + "balance_loss_clip": 1.04432368, + "balance_loss_mlp": 1.01617706, + "epoch": 0.8457537952803247, + "flos": 22672884430080.0, + "grad_norm": 1.5875083217809374, + "language_loss": 0.70315337, + "learning_rate": 2.4434092372885363e-07, + "loss": 0.72460115, + "num_input_tokens_seen": 303365140, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11993408, + "step": 14067, + "time_per_iteration": 2.490299701690674 + }, + { + "auxiliary_loss_clip": 0.01112815, + "auxiliary_loss_mlp": 0.0102601, + "balance_loss_clip": 1.0404501, + "balance_loss_mlp": 1.01451838, + "epoch": 0.8458139185329926, + "flos": 33802929607680.0, + "grad_norm": 2.0445443472024305, + "language_loss": 0.71318257, + "learning_rate": 2.4415441531946144e-07, + "loss": 0.73457086, + "num_input_tokens_seen": 303386150, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11486816, + "step": 14068, + "time_per_iteration": 2.512277603149414 + }, + { + "auxiliary_loss_clip": 0.01067755, + "auxiliary_loss_mlp": 0.01008571, + "balance_loss_clip": 1.04322553, + "balance_loss_mlp": 1.00696492, + "epoch": 0.8458740417856606, + "flos": 70295929603200.0, + "grad_norm": 0.6949305802423857, + "language_loss": 0.60492206, + "learning_rate": 2.4396797349190976e-07, + "loss": 0.62568533, + "num_input_tokens_seen": 303453770, + "router_z_loss_clip": 0.24536133, + "router_z_loss_mlp": 0.01605225, + "step": 14069, + "time_per_iteration": 3.237640857696533 + }, + { + "auxiliary_loss_clip": 0.01121253, + "auxiliary_loss_mlp": 0.01028824, + "balance_loss_clip": 1.04730487, + "balance_loss_mlp": 1.01742196, + "epoch": 0.8459341650383285, + "flos": 24170862245760.0, + "grad_norm": 1.6416577776008163, + "language_loss": 0.74574757, + "learning_rate": 2.4378159825326804e-07, + "loss": 0.76724827, + "num_input_tokens_seen": 303474520, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11401367, + "step": 14070, + "time_per_iteration": 2.464723825454712 + }, + { + "auxiliary_loss_clip": 0.01113118, + "auxiliary_loss_mlp": 0.0102981, + "balance_loss_clip": 1.04194212, + "balance_loss_mlp": 1.01663709, + "epoch": 0.8459942882909965, + "flos": 38181158369280.0, + "grad_norm": 1.5805053417487498, + "language_loss": 0.6690191, + "learning_rate": 2.435952896106039e-07, + "loss": 0.6904484, + "num_input_tokens_seen": 303497345, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.1317749, + "step": 14071, + "time_per_iteration": 4.0816709995269775 + }, + { + "auxiliary_loss_clip": 0.01048575, + "auxiliary_loss_mlp": 0.01007112, + "balance_loss_clip": 1.02351356, + "balance_loss_mlp": 1.00573647, + "epoch": 0.8460544115436646, + "flos": 64118252177280.0, + "grad_norm": 0.7346355679854004, + "language_loss": 0.60982418, + "learning_rate": 2.4340904757098313e-07, + "loss": 0.63038099, + "num_input_tokens_seen": 303554890, + "router_z_loss_clip": 0.25024414, + "router_z_loss_mlp": 0.01376343, + "step": 14072, + "time_per_iteration": 2.955857038497925 + }, + { + "auxiliary_loss_clip": 0.01116882, + "auxiliary_loss_mlp": 0.01027515, + "balance_loss_clip": 1.04313576, + "balance_loss_mlp": 1.01431274, + "epoch": 0.8461145347963325, + "flos": 24170826332160.0, + "grad_norm": 1.7518687663310981, + "language_loss": 0.72248775, + "learning_rate": 2.4322287214146664e-07, + "loss": 0.74393171, + "num_input_tokens_seen": 303574380, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.13189697, + "step": 14073, + "time_per_iteration": 2.4675040245056152 + }, + { + "auxiliary_loss_clip": 0.01121722, + "auxiliary_loss_mlp": 0.01035887, + "balance_loss_clip": 1.04445255, + "balance_loss_mlp": 1.02277374, + "epoch": 0.8461746580490005, + "flos": 34893787697280.0, + "grad_norm": 2.2309263649634756, + "language_loss": 0.77732593, + "learning_rate": 2.430367633291155e-07, + "loss": 0.79890203, + "num_input_tokens_seen": 303594910, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.13104248, + "step": 14074, + "time_per_iteration": 2.6086347103118896 + }, + { + "auxiliary_loss_clip": 0.01118042, + "auxiliary_loss_mlp": 0.01030898, + "balance_loss_clip": 1.04666841, + "balance_loss_mlp": 1.01920962, + "epoch": 0.8462347813016684, + "flos": 25557014044800.0, + "grad_norm": 2.2299777089653134, + "language_loss": 0.75309795, + "learning_rate": 2.4285072114098583e-07, + "loss": 0.77458733, + "num_input_tokens_seen": 303613520, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.11694336, + "step": 14075, + "time_per_iteration": 2.4522039890289307 + }, + { + "auxiliary_loss_clip": 0.01108342, + "auxiliary_loss_mlp": 0.01026368, + "balance_loss_clip": 1.03728008, + "balance_loss_mlp": 1.01453042, + "epoch": 0.8462949045543364, + "flos": 21325336773120.0, + "grad_norm": 2.103468763799052, + "language_loss": 0.72944868, + "learning_rate": 2.4266474558413355e-07, + "loss": 0.75079578, + "num_input_tokens_seen": 303631225, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.1184082, + "step": 14076, + "time_per_iteration": 2.4370665550231934 + }, + { + "auxiliary_loss_clip": 0.0112055, + "auxiliary_loss_mlp": 0.01031389, + "balance_loss_clip": 1.04504299, + "balance_loss_mlp": 1.01900303, + "epoch": 0.8463550278070043, + "flos": 22637440684800.0, + "grad_norm": 2.3540661499211577, + "language_loss": 0.78158087, + "learning_rate": 2.4247883666560945e-07, + "loss": 0.80310029, + "num_input_tokens_seen": 303649175, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12384033, + "step": 14077, + "time_per_iteration": 2.4856619834899902 + }, + { + "auxiliary_loss_clip": 0.01122405, + "auxiliary_loss_mlp": 0.01033473, + "balance_loss_clip": 1.04573643, + "balance_loss_mlp": 1.02138567, + "epoch": 0.8464151510596724, + "flos": 13005588804480.0, + "grad_norm": 3.0445357477287263, + "language_loss": 0.75080431, + "learning_rate": 2.422929943924643e-07, + "loss": 0.77236307, + "num_input_tokens_seen": 303665915, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.12084961, + "step": 14078, + "time_per_iteration": 2.4356191158294678 + }, + { + "auxiliary_loss_clip": 0.01111087, + "auxiliary_loss_mlp": 0.01021896, + "balance_loss_clip": 1.03967762, + "balance_loss_mlp": 1.01013601, + "epoch": 0.8464752743123403, + "flos": 15704921923200.0, + "grad_norm": 2.452108120632482, + "language_loss": 0.85307401, + "learning_rate": 2.4210721877174565e-07, + "loss": 0.87440383, + "num_input_tokens_seen": 303679985, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.11755371, + "step": 14079, + "time_per_iteration": 2.3808376789093018 + }, + { + "auxiliary_loss_clip": 0.0111585, + "auxiliary_loss_mlp": 0.01033392, + "balance_loss_clip": 1.03844726, + "balance_loss_mlp": 1.02055895, + "epoch": 0.8465353975650083, + "flos": 21653955325440.0, + "grad_norm": 2.42229545698127, + "language_loss": 0.58864796, + "learning_rate": 2.419215098104965e-07, + "loss": 0.61014038, + "num_input_tokens_seen": 303698470, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.1282959, + "step": 14080, + "time_per_iteration": 2.4604671001434326 + }, + { + "auxiliary_loss_clip": 0.01125564, + "auxiliary_loss_mlp": 0.01030837, + "balance_loss_clip": 1.05025148, + "balance_loss_mlp": 1.01829624, + "epoch": 0.8465955208176762, + "flos": 18515650095360.0, + "grad_norm": 2.1588978960139666, + "language_loss": 0.66364062, + "learning_rate": 2.4173586751576014e-07, + "loss": 0.68520463, + "num_input_tokens_seen": 303716415, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.12530518, + "step": 14081, + "time_per_iteration": 2.411558151245117 + }, + { + "auxiliary_loss_clip": 0.01116538, + "auxiliary_loss_mlp": 0.01029326, + "balance_loss_clip": 1.04361844, + "balance_loss_mlp": 1.0182035, + "epoch": 0.8466556440703442, + "flos": 24200559815040.0, + "grad_norm": 2.024630690956552, + "language_loss": 0.72906041, + "learning_rate": 2.41550291894576e-07, + "loss": 0.75051904, + "num_input_tokens_seen": 303734490, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11120605, + "step": 14082, + "time_per_iteration": 2.5968594551086426 + }, + { + "auxiliary_loss_clip": 0.01116636, + "auxiliary_loss_mlp": 0.01026702, + "balance_loss_clip": 1.04335737, + "balance_loss_mlp": 1.01551974, + "epoch": 0.8467157673230121, + "flos": 20375894528640.0, + "grad_norm": 2.0322157592487264, + "language_loss": 0.75654358, + "learning_rate": 2.413647829539809e-07, + "loss": 0.77797699, + "num_input_tokens_seen": 303752310, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11175537, + "step": 14083, + "time_per_iteration": 2.438552141189575 + }, + { + "auxiliary_loss_clip": 0.0112325, + "auxiliary_loss_mlp": 0.01027916, + "balance_loss_clip": 1.04711616, + "balance_loss_mlp": 1.01500571, + "epoch": 0.8467758905756801, + "flos": 28473642489600.0, + "grad_norm": 1.8851445180785649, + "language_loss": 0.65712178, + "learning_rate": 2.411793407010092e-07, + "loss": 0.67863345, + "num_input_tokens_seen": 303776065, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.12921143, + "step": 14084, + "time_per_iteration": 2.517256736755371 + }, + { + "auxiliary_loss_clip": 0.01120198, + "auxiliary_loss_mlp": 0.01033741, + "balance_loss_clip": 1.0488497, + "balance_loss_mlp": 1.02209425, + "epoch": 0.8468360138283482, + "flos": 11692551139200.0, + "grad_norm": 4.017649049621743, + "language_loss": 0.70105982, + "learning_rate": 2.409939651426938e-07, + "loss": 0.72259915, + "num_input_tokens_seen": 303793500, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.11639404, + "step": 14085, + "time_per_iteration": 2.4012703895568848 + }, + { + "auxiliary_loss_clip": 0.01114703, + "auxiliary_loss_mlp": 0.01033908, + "balance_loss_clip": 1.04067659, + "balance_loss_mlp": 1.02243412, + "epoch": 0.8468961370810161, + "flos": 24607859109120.0, + "grad_norm": 1.4668962873586926, + "language_loss": 0.71072543, + "learning_rate": 2.408086562860634e-07, + "loss": 0.73221159, + "num_input_tokens_seen": 303814835, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11486816, + "step": 14086, + "time_per_iteration": 2.5237128734588623 + }, + { + "auxiliary_loss_clip": 0.01116033, + "auxiliary_loss_mlp": 0.01033602, + "balance_loss_clip": 1.04373145, + "balance_loss_mlp": 1.02106071, + "epoch": 0.8469562603336841, + "flos": 19609812236160.0, + "grad_norm": 1.9400645162599535, + "language_loss": 0.74673802, + "learning_rate": 2.4062341413814445e-07, + "loss": 0.76823437, + "num_input_tokens_seen": 303834505, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.12536621, + "step": 14087, + "time_per_iteration": 4.035231590270996 + }, + { + "auxiliary_loss_clip": 0.01114725, + "auxiliary_loss_mlp": 0.01024461, + "balance_loss_clip": 1.04422927, + "balance_loss_mlp": 1.01125264, + "epoch": 0.847016383586352, + "flos": 22638949056000.0, + "grad_norm": 1.3626272230476077, + "language_loss": 0.73735487, + "learning_rate": 2.4043823870596227e-07, + "loss": 0.75874668, + "num_input_tokens_seen": 303855050, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.13214111, + "step": 14088, + "time_per_iteration": 2.510451078414917 + }, + { + "auxiliary_loss_clip": 0.01115842, + "auxiliary_loss_mlp": 0.01031196, + "balance_loss_clip": 1.04351509, + "balance_loss_mlp": 1.0186379, + "epoch": 0.84707650683902, + "flos": 20960161153920.0, + "grad_norm": 1.9534622306532414, + "language_loss": 0.72269505, + "learning_rate": 2.402531299965387e-07, + "loss": 0.74416542, + "num_input_tokens_seen": 303875635, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.12561035, + "step": 14089, + "time_per_iteration": 2.5380687713623047 + }, + { + "auxiliary_loss_clip": 0.01115238, + "auxiliary_loss_mlp": 0.01032733, + "balance_loss_clip": 1.04354894, + "balance_loss_mlp": 1.02098453, + "epoch": 0.8471366300916879, + "flos": 24093007516800.0, + "grad_norm": 1.549379283877797, + "language_loss": 0.79324126, + "learning_rate": 2.400680880168928e-07, + "loss": 0.81472099, + "num_input_tokens_seen": 303896750, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11761475, + "step": 14090, + "time_per_iteration": 2.4472463130950928 + }, + { + "auxiliary_loss_clip": 0.01116353, + "auxiliary_loss_mlp": 0.01038184, + "balance_loss_clip": 1.04252315, + "balance_loss_mlp": 1.02489257, + "epoch": 0.847196753344356, + "flos": 18332900674560.0, + "grad_norm": 2.263172065395685, + "language_loss": 0.77216905, + "learning_rate": 2.3988311277404085e-07, + "loss": 0.7937144, + "num_input_tokens_seen": 303915435, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.13287354, + "step": 14091, + "time_per_iteration": 2.4004690647125244 + }, + { + "auxiliary_loss_clip": 0.01050413, + "auxiliary_loss_mlp": 0.01002857, + "balance_loss_clip": 1.02610946, + "balance_loss_mlp": 1.00161123, + "epoch": 0.8472568765970239, + "flos": 49567536956160.0, + "grad_norm": 0.8205447113772821, + "language_loss": 0.59408438, + "learning_rate": 2.396982042749982e-07, + "loss": 0.61461711, + "num_input_tokens_seen": 303977245, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.01246643, + "step": 14092, + "time_per_iteration": 3.1368563175201416 + }, + { + "auxiliary_loss_clip": 0.01118849, + "auxiliary_loss_mlp": 0.01032536, + "balance_loss_clip": 1.04703546, + "balance_loss_mlp": 1.02063322, + "epoch": 0.8473169998496919, + "flos": 19279074781440.0, + "grad_norm": 1.981341421742123, + "language_loss": 0.70707762, + "learning_rate": 2.395133625267756e-07, + "loss": 0.7285915, + "num_input_tokens_seen": 303996055, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11901855, + "step": 14093, + "time_per_iteration": 2.4348456859588623 + }, + { + "auxiliary_loss_clip": 0.01111475, + "auxiliary_loss_mlp": 0.01026677, + "balance_loss_clip": 1.04242992, + "balance_loss_mlp": 1.01581144, + "epoch": 0.8473771231023598, + "flos": 17675555829120.0, + "grad_norm": 1.8824704661917262, + "language_loss": 0.83867115, + "learning_rate": 2.3932858753638263e-07, + "loss": 0.86005265, + "num_input_tokens_seen": 304012205, + "router_z_loss_clip": 0.68994141, + "router_z_loss_mlp": 0.10864258, + "step": 14094, + "time_per_iteration": 2.523042678833008 + }, + { + "auxiliary_loss_clip": 0.01109041, + "auxiliary_loss_mlp": 0.01028553, + "balance_loss_clip": 1.04134822, + "balance_loss_mlp": 1.01752615, + "epoch": 0.8474372463550278, + "flos": 26359761144960.0, + "grad_norm": 1.6687212942905212, + "language_loss": 0.71142185, + "learning_rate": 2.3914387931082626e-07, + "loss": 0.73279786, + "num_input_tokens_seen": 304033475, + "router_z_loss_clip": 0.67529297, + "router_z_loss_mlp": 0.11022949, + "step": 14095, + "time_per_iteration": 2.498687982559204 + }, + { + "auxiliary_loss_clip": 0.01114594, + "auxiliary_loss_mlp": 0.01043204, + "balance_loss_clip": 1.04226756, + "balance_loss_mlp": 1.03115797, + "epoch": 0.8474973696076957, + "flos": 23402050519680.0, + "grad_norm": 1.842992408135798, + "language_loss": 0.8074007, + "learning_rate": 2.3895923785711105e-07, + "loss": 0.82897866, + "num_input_tokens_seen": 304051845, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.12054443, + "step": 14096, + "time_per_iteration": 2.4450080394744873 + }, + { + "auxiliary_loss_clip": 0.0112347, + "auxiliary_loss_mlp": 0.01034624, + "balance_loss_clip": 1.04389083, + "balance_loss_mlp": 1.02140355, + "epoch": 0.8475574928603637, + "flos": 25075666863360.0, + "grad_norm": 1.9129484427759296, + "language_loss": 0.77706593, + "learning_rate": 2.387746631822374e-07, + "loss": 0.79864681, + "num_input_tokens_seen": 304069965, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.13220215, + "step": 14097, + "time_per_iteration": 2.461010694503784 + }, + { + "auxiliary_loss_clip": 0.01123232, + "auxiliary_loss_mlp": 0.01022734, + "balance_loss_clip": 1.05067801, + "balance_loss_mlp": 1.01183879, + "epoch": 0.8476176161130318, + "flos": 19966691813760.0, + "grad_norm": 1.7050324416520115, + "language_loss": 0.80853581, + "learning_rate": 2.385901552932048e-07, + "loss": 0.82999551, + "num_input_tokens_seen": 304086805, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.10894775, + "step": 14098, + "time_per_iteration": 2.415402889251709 + }, + { + "auxiliary_loss_clip": 0.01117697, + "auxiliary_loss_mlp": 0.0103558, + "balance_loss_clip": 1.04383707, + "balance_loss_mlp": 1.02302694, + "epoch": 0.8476777393656997, + "flos": 21285834791040.0, + "grad_norm": 1.7265526308468655, + "language_loss": 0.72054917, + "learning_rate": 2.3840571419701062e-07, + "loss": 0.74208188, + "num_input_tokens_seen": 304105865, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.12554932, + "step": 14099, + "time_per_iteration": 2.465075969696045 + }, + { + "auxiliary_loss_clip": 0.01115878, + "auxiliary_loss_mlp": 0.01030242, + "balance_loss_clip": 1.04295874, + "balance_loss_mlp": 1.01728976, + "epoch": 0.8477378626183677, + "flos": 29971476650880.0, + "grad_norm": 1.8557389659135615, + "language_loss": 0.63589233, + "learning_rate": 2.3822133990064787e-07, + "loss": 0.65735352, + "num_input_tokens_seen": 304128300, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.12963867, + "step": 14100, + "time_per_iteration": 2.609935760498047 + }, + { + "auxiliary_loss_clip": 0.01114802, + "auxiliary_loss_mlp": 0.01030047, + "balance_loss_clip": 1.04032815, + "balance_loss_mlp": 1.01764393, + "epoch": 0.8477979858710356, + "flos": 24237727413120.0, + "grad_norm": 3.108536465530467, + "language_loss": 0.73575473, + "learning_rate": 2.380370324111085e-07, + "loss": 0.75720322, + "num_input_tokens_seen": 304143695, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.1239624, + "step": 14101, + "time_per_iteration": 2.5133707523345947 + }, + { + "auxiliary_loss_clip": 0.01115952, + "auxiliary_loss_mlp": 0.01031089, + "balance_loss_clip": 1.04286122, + "balance_loss_mlp": 1.01975238, + "epoch": 0.8478581091237036, + "flos": 25593678852480.0, + "grad_norm": 2.445333202640738, + "language_loss": 0.71376455, + "learning_rate": 2.3785279173538163e-07, + "loss": 0.73523498, + "num_input_tokens_seen": 304165800, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11334229, + "step": 14102, + "time_per_iteration": 3.8926913738250732 + }, + { + "auxiliary_loss_clip": 0.01119645, + "auxiliary_loss_mlp": 0.01035018, + "balance_loss_clip": 1.04252207, + "balance_loss_mlp": 1.02218544, + "epoch": 0.8479182323763715, + "flos": 12057116227200.0, + "grad_norm": 2.082137600440622, + "language_loss": 0.81560862, + "learning_rate": 2.3766861788045366e-07, + "loss": 0.83715522, + "num_input_tokens_seen": 304182910, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.12823486, + "step": 14103, + "time_per_iteration": 2.4695091247558594 + }, + { + "auxiliary_loss_clip": 0.01123856, + "auxiliary_loss_mlp": 0.01028204, + "balance_loss_clip": 1.05134821, + "balance_loss_mlp": 1.01674235, + "epoch": 0.8479783556290396, + "flos": 21433391861760.0, + "grad_norm": 2.1957901625279455, + "language_loss": 0.78384566, + "learning_rate": 2.374845108533079e-07, + "loss": 0.80536634, + "num_input_tokens_seen": 304200175, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11456299, + "step": 14104, + "time_per_iteration": 2.4320225715637207 + }, + { + "auxiliary_loss_clip": 0.01117625, + "auxiliary_loss_mlp": 0.01036117, + "balance_loss_clip": 1.0449295, + "balance_loss_mlp": 1.02410686, + "epoch": 0.8480384788817075, + "flos": 19642634288640.0, + "grad_norm": 2.295243855004551, + "language_loss": 0.78834891, + "learning_rate": 2.3730047066092607e-07, + "loss": 0.80988634, + "num_input_tokens_seen": 304217775, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.12011719, + "step": 14105, + "time_per_iteration": 2.533785820007324 + }, + { + "auxiliary_loss_clip": 0.01122904, + "auxiliary_loss_mlp": 0.01029109, + "balance_loss_clip": 1.04685473, + "balance_loss_mlp": 1.01635945, + "epoch": 0.8480986021343755, + "flos": 22489201255680.0, + "grad_norm": 10.268308107279628, + "language_loss": 0.50178248, + "learning_rate": 2.3711649731028749e-07, + "loss": 0.52330261, + "num_input_tokens_seen": 304235760, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.12750244, + "step": 14106, + "time_per_iteration": 2.5101001262664795 + }, + { + "auxiliary_loss_clip": 0.01113829, + "auxiliary_loss_mlp": 0.01029407, + "balance_loss_clip": 1.04283714, + "balance_loss_mlp": 1.01851726, + "epoch": 0.8481587253870434, + "flos": 22090557139200.0, + "grad_norm": 1.982526244014867, + "language_loss": 0.75613511, + "learning_rate": 2.3693259080836792e-07, + "loss": 0.77756739, + "num_input_tokens_seen": 304253985, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.10888672, + "step": 14107, + "time_per_iteration": 4.004260301589966 + }, + { + "auxiliary_loss_clip": 0.01115444, + "auxiliary_loss_mlp": 0.01027423, + "balance_loss_clip": 1.04321623, + "balance_loss_mlp": 1.01593125, + "epoch": 0.8482188486397114, + "flos": 33582689366400.0, + "grad_norm": 1.7074553173871425, + "language_loss": 0.73540509, + "learning_rate": 2.3674875116214087e-07, + "loss": 0.75683379, + "num_input_tokens_seen": 304276785, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11486816, + "step": 14108, + "time_per_iteration": 2.570063829421997 + }, + { + "auxiliary_loss_clip": 0.01115087, + "auxiliary_loss_mlp": 0.01025343, + "balance_loss_clip": 1.04431152, + "balance_loss_mlp": 1.01256979, + "epoch": 0.8482789718923793, + "flos": 20919402195840.0, + "grad_norm": 3.0783961597931833, + "language_loss": 0.72607398, + "learning_rate": 2.3656497837857836e-07, + "loss": 0.74747825, + "num_input_tokens_seen": 304296310, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.12750244, + "step": 14109, + "time_per_iteration": 2.4341583251953125 + }, + { + "auxiliary_loss_clip": 0.01112745, + "auxiliary_loss_mlp": 0.01031579, + "balance_loss_clip": 1.04325414, + "balance_loss_mlp": 1.01958084, + "epoch": 0.8483390951450474, + "flos": 12896204912640.0, + "grad_norm": 3.0993859924495863, + "language_loss": 0.73720765, + "learning_rate": 2.3638127246464811e-07, + "loss": 0.7586509, + "num_input_tokens_seen": 304311715, + "router_z_loss_clip": 0.69482422, + "router_z_loss_mlp": 0.12005615, + "step": 14110, + "time_per_iteration": 2.4432451725006104 + }, + { + "auxiliary_loss_clip": 0.01118098, + "auxiliary_loss_mlp": 0.01031351, + "balance_loss_clip": 1.04575157, + "balance_loss_mlp": 1.01969278, + "epoch": 0.8483992183977154, + "flos": 25081628520960.0, + "grad_norm": 1.799354715158686, + "language_loss": 0.76316786, + "learning_rate": 2.3619763342731658e-07, + "loss": 0.78466237, + "num_input_tokens_seen": 304331910, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11663818, + "step": 14111, + "time_per_iteration": 2.514833450317383 + }, + { + "auxiliary_loss_clip": 0.01117859, + "auxiliary_loss_mlp": 0.01026686, + "balance_loss_clip": 1.04706407, + "balance_loss_mlp": 1.01630878, + "epoch": 0.8484593416503833, + "flos": 25557445008000.0, + "grad_norm": 2.333665665581655, + "language_loss": 0.67102361, + "learning_rate": 2.3601406127354772e-07, + "loss": 0.692469, + "num_input_tokens_seen": 304351405, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.10375977, + "step": 14112, + "time_per_iteration": 2.594015598297119 + }, + { + "auxiliary_loss_clip": 0.01123312, + "auxiliary_loss_mlp": 0.01030476, + "balance_loss_clip": 1.04875863, + "balance_loss_mlp": 1.01898479, + "epoch": 0.8485194649030513, + "flos": 27198454780800.0, + "grad_norm": 2.6396960708762762, + "language_loss": 0.73720038, + "learning_rate": 2.3583055601030312e-07, + "loss": 0.75873828, + "num_input_tokens_seen": 304372935, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11499023, + "step": 14113, + "time_per_iteration": 2.519824981689453 + }, + { + "auxiliary_loss_clip": 0.01111417, + "auxiliary_loss_mlp": 0.01028003, + "balance_loss_clip": 1.04157948, + "balance_loss_mlp": 1.01694083, + "epoch": 0.8485795881557192, + "flos": 24205910941440.0, + "grad_norm": 2.4526948887506883, + "language_loss": 0.66370749, + "learning_rate": 2.3564711764454003e-07, + "loss": 0.68510175, + "num_input_tokens_seen": 304393070, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.1105957, + "step": 14114, + "time_per_iteration": 2.50990629196167 + }, + { + "auxiliary_loss_clip": 0.01121148, + "auxiliary_loss_mlp": 0.01032425, + "balance_loss_clip": 1.04776466, + "balance_loss_mlp": 1.02025342, + "epoch": 0.8486397114083872, + "flos": 21141653598720.0, + "grad_norm": 1.7730954684166476, + "language_loss": 0.78544259, + "learning_rate": 2.3546374618321495e-07, + "loss": 0.80697834, + "num_input_tokens_seen": 304411195, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.12176514, + "step": 14115, + "time_per_iteration": 3.8364651203155518 + }, + { + "auxiliary_loss_clip": 0.01119157, + "auxiliary_loss_mlp": 0.01028597, + "balance_loss_clip": 1.04712152, + "balance_loss_mlp": 1.01726651, + "epoch": 0.8486998346610551, + "flos": 19974772373760.0, + "grad_norm": 2.4239335669936115, + "language_loss": 0.79374826, + "learning_rate": 2.3528044163328187e-07, + "loss": 0.81522584, + "num_input_tokens_seen": 304429425, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.11328125, + "step": 14116, + "time_per_iteration": 2.4604263305664062 + }, + { + "auxiliary_loss_clip": 0.01113579, + "auxiliary_loss_mlp": 0.01028768, + "balance_loss_clip": 1.03982186, + "balance_loss_mlp": 1.01735985, + "epoch": 0.8487599579137232, + "flos": 19792310261760.0, + "grad_norm": 1.7962208547597716, + "language_loss": 0.68468738, + "learning_rate": 2.3509720400169076e-07, + "loss": 0.70611084, + "num_input_tokens_seen": 304447460, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11413574, + "step": 14117, + "time_per_iteration": 2.4023022651672363 + }, + { + "auxiliary_loss_clip": 0.01106748, + "auxiliary_loss_mlp": 0.01025186, + "balance_loss_clip": 1.0336777, + "balance_loss_mlp": 1.01332498, + "epoch": 0.8488200811663911, + "flos": 26396030903040.0, + "grad_norm": 2.5197942610773394, + "language_loss": 0.65424407, + "learning_rate": 2.3491403329539096e-07, + "loss": 0.67556345, + "num_input_tokens_seen": 304468230, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11865234, + "step": 14118, + "time_per_iteration": 2.5090646743774414 + }, + { + "auxiliary_loss_clip": 0.01109612, + "auxiliary_loss_mlp": 0.01029344, + "balance_loss_clip": 1.0391705, + "balance_loss_mlp": 1.01859164, + "epoch": 0.8488802044190591, + "flos": 16359285939840.0, + "grad_norm": 1.553802788564441, + "language_loss": 0.73346353, + "learning_rate": 2.3473092952132757e-07, + "loss": 0.75485313, + "num_input_tokens_seen": 304484860, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.10742188, + "step": 14119, + "time_per_iteration": 2.5460948944091797 + }, + { + "auxiliary_loss_clip": 0.01118845, + "auxiliary_loss_mlp": 0.01028572, + "balance_loss_clip": 1.04485607, + "balance_loss_mlp": 1.01587605, + "epoch": 0.848940327671727, + "flos": 19208869649280.0, + "grad_norm": 1.768083437308993, + "language_loss": 0.77822888, + "learning_rate": 2.345478926864446e-07, + "loss": 0.799703, + "num_input_tokens_seen": 304503575, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.1270752, + "step": 14120, + "time_per_iteration": 2.4644525051116943 + }, + { + "auxiliary_loss_clip": 0.01111023, + "auxiliary_loss_mlp": 0.0103008, + "balance_loss_clip": 1.03893447, + "balance_loss_mlp": 1.01700902, + "epoch": 0.849000450924395, + "flos": 21871178824320.0, + "grad_norm": 2.0981406761101793, + "language_loss": 0.7609939, + "learning_rate": 2.3436492279768227e-07, + "loss": 0.7824049, + "num_input_tokens_seen": 304525005, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.13085938, + "step": 14121, + "time_per_iteration": 2.468080759048462 + }, + { + "auxiliary_loss_clip": 0.01038274, + "auxiliary_loss_mlp": 0.00999621, + "balance_loss_clip": 1.01316476, + "balance_loss_mlp": 0.99817759, + "epoch": 0.8490605741770629, + "flos": 71166475624320.0, + "grad_norm": 0.8090818478987551, + "language_loss": 0.60080022, + "learning_rate": 2.3418201986197883e-07, + "loss": 0.62117916, + "num_input_tokens_seen": 304585220, + "router_z_loss_clip": 0.25146484, + "router_z_loss_mlp": 0.01443481, + "step": 14122, + "time_per_iteration": 3.1076014041900635 + }, + { + "auxiliary_loss_clip": 0.01119741, + "auxiliary_loss_mlp": 0.01031634, + "balance_loss_clip": 1.0458324, + "balance_loss_mlp": 1.01981461, + "epoch": 0.849120697429731, + "flos": 24973357950720.0, + "grad_norm": 2.956841600096265, + "language_loss": 0.79847562, + "learning_rate": 2.3399918388627048e-07, + "loss": 0.81998944, + "num_input_tokens_seen": 304604665, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.1182251, + "step": 14123, + "time_per_iteration": 2.5868945121765137 + }, + { + "auxiliary_loss_clip": 0.01105563, + "auxiliary_loss_mlp": 0.01029707, + "balance_loss_clip": 1.03796947, + "balance_loss_mlp": 1.0174818, + "epoch": 0.8491808206823989, + "flos": 23032277959680.0, + "grad_norm": 1.992819595823086, + "language_loss": 0.83109105, + "learning_rate": 2.3381641487749016e-07, + "loss": 0.8524437, + "num_input_tokens_seen": 304620600, + "router_z_loss_clip": 0.67626953, + "router_z_loss_mlp": 0.12219238, + "step": 14124, + "time_per_iteration": 2.495818853378296 + }, + { + "auxiliary_loss_clip": 0.01111623, + "auxiliary_loss_mlp": 0.01030495, + "balance_loss_clip": 1.03950906, + "balance_loss_mlp": 1.01853251, + "epoch": 0.8492409439350669, + "flos": 23878549365120.0, + "grad_norm": 1.8726959374456043, + "language_loss": 0.71497715, + "learning_rate": 2.3363371284256805e-07, + "loss": 0.73639834, + "num_input_tokens_seen": 304639540, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11962891, + "step": 14125, + "time_per_iteration": 2.5236778259277344 + }, + { + "auxiliary_loss_clip": 0.01117826, + "auxiliary_loss_mlp": 0.01036691, + "balance_loss_clip": 1.04070449, + "balance_loss_mlp": 1.02400732, + "epoch": 0.8493010671877349, + "flos": 22419893963520.0, + "grad_norm": 1.6399507479508526, + "language_loss": 0.73954391, + "learning_rate": 2.3345107778843288e-07, + "loss": 0.76108909, + "num_input_tokens_seen": 304660595, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12689209, + "step": 14126, + "time_per_iteration": 2.488358497619629 + }, + { + "auxiliary_loss_clip": 0.01110036, + "auxiliary_loss_mlp": 0.01029851, + "balance_loss_clip": 1.03896141, + "balance_loss_mlp": 1.01791215, + "epoch": 0.8493611904404028, + "flos": 17529435302400.0, + "grad_norm": 1.4509475010203823, + "language_loss": 0.67605263, + "learning_rate": 2.3326850972200928e-07, + "loss": 0.69745153, + "num_input_tokens_seen": 304679580, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.11920166, + "step": 14127, + "time_per_iteration": 2.503436803817749 + }, + { + "auxiliary_loss_clip": 0.01120845, + "auxiliary_loss_mlp": 0.01027895, + "balance_loss_clip": 1.04670274, + "balance_loss_mlp": 1.01576543, + "epoch": 0.8494213136930708, + "flos": 19462937523840.0, + "grad_norm": 1.7675546929853312, + "language_loss": 0.6910547, + "learning_rate": 2.330860086502211e-07, + "loss": 0.71254206, + "num_input_tokens_seen": 304698385, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.12139893, + "step": 14128, + "time_per_iteration": 2.486022472381592 + }, + { + "auxiliary_loss_clip": 0.01111403, + "auxiliary_loss_mlp": 0.01032491, + "balance_loss_clip": 1.04175544, + "balance_loss_mlp": 1.02106476, + "epoch": 0.8494814369457387, + "flos": 18770292587520.0, + "grad_norm": 2.0297537828826844, + "language_loss": 0.7818985, + "learning_rate": 2.3290357457998855e-07, + "loss": 0.80333745, + "num_input_tokens_seen": 304715430, + "router_z_loss_clip": 0.69580078, + "router_z_loss_mlp": 0.11413574, + "step": 14129, + "time_per_iteration": 2.4430482387542725 + }, + { + "auxiliary_loss_clip": 0.01118756, + "auxiliary_loss_mlp": 0.01030594, + "balance_loss_clip": 1.0466578, + "balance_loss_mlp": 1.01916158, + "epoch": 0.8495415601984068, + "flos": 23331486251520.0, + "grad_norm": 1.6251851671187123, + "language_loss": 0.67950678, + "learning_rate": 2.3272120751823031e-07, + "loss": 0.70100033, + "num_input_tokens_seen": 304734345, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11431885, + "step": 14130, + "time_per_iteration": 2.4904778003692627 + }, + { + "auxiliary_loss_clip": 0.01108673, + "auxiliary_loss_mlp": 0.01028889, + "balance_loss_clip": 1.03690052, + "balance_loss_mlp": 1.01745725, + "epoch": 0.8496016834510747, + "flos": 26612859352320.0, + "grad_norm": 1.9931104953796137, + "language_loss": 0.71021903, + "learning_rate": 2.3253890747186e-07, + "loss": 0.73159468, + "num_input_tokens_seen": 304755030, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11431885, + "step": 14131, + "time_per_iteration": 4.164042949676514 + }, + { + "auxiliary_loss_clip": 0.01121238, + "auxiliary_loss_mlp": 0.01025433, + "balance_loss_clip": 1.04644668, + "balance_loss_mlp": 1.01399517, + "epoch": 0.8496618067037427, + "flos": 25480380378240.0, + "grad_norm": 1.9117184071847444, + "language_loss": 0.68060154, + "learning_rate": 2.3235667444779162e-07, + "loss": 0.70206827, + "num_input_tokens_seen": 304774320, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.11437988, + "step": 14132, + "time_per_iteration": 2.4842264652252197 + }, + { + "auxiliary_loss_clip": 0.01104439, + "auxiliary_loss_mlp": 0.01031207, + "balance_loss_clip": 1.03542268, + "balance_loss_mlp": 1.02042437, + "epoch": 0.8497219299564106, + "flos": 25374587846400.0, + "grad_norm": 2.57689944321179, + "language_loss": 0.70477784, + "learning_rate": 2.3217450845293564e-07, + "loss": 0.7261343, + "num_input_tokens_seen": 304795355, + "router_z_loss_clip": 0.69042969, + "router_z_loss_mlp": 0.10784912, + "step": 14133, + "time_per_iteration": 2.5054593086242676 + }, + { + "auxiliary_loss_clip": 0.01038399, + "auxiliary_loss_mlp": 0.0100277, + "balance_loss_clip": 1.01383567, + "balance_loss_mlp": 1.00118709, + "epoch": 0.8497820532090786, + "flos": 67780279658880.0, + "grad_norm": 0.7227211272389268, + "language_loss": 0.57562083, + "learning_rate": 2.3199240949419918e-07, + "loss": 0.5960325, + "num_input_tokens_seen": 304863915, + "router_z_loss_clip": 0.24536133, + "router_z_loss_mlp": 0.01585388, + "step": 14134, + "time_per_iteration": 3.1858835220336914 + }, + { + "auxiliary_loss_clip": 0.01117232, + "auxiliary_loss_mlp": 0.0102847, + "balance_loss_clip": 1.04307604, + "balance_loss_mlp": 1.01604879, + "epoch": 0.8498421764617465, + "flos": 23440546920960.0, + "grad_norm": 2.195546852076095, + "language_loss": 0.79181528, + "learning_rate": 2.3181037757848787e-07, + "loss": 0.8132723, + "num_input_tokens_seen": 304881555, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.12420654, + "step": 14135, + "time_per_iteration": 2.4717214107513428 + }, + { + "auxiliary_loss_clip": 0.01117186, + "auxiliary_loss_mlp": 0.01029375, + "balance_loss_clip": 1.04140139, + "balance_loss_mlp": 1.01700127, + "epoch": 0.8499022997144146, + "flos": 17712615686400.0, + "grad_norm": 1.8988360729933018, + "language_loss": 0.63794744, + "learning_rate": 2.316284127127044e-07, + "loss": 0.6594131, + "num_input_tokens_seen": 304898760, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12365723, + "step": 14136, + "time_per_iteration": 2.4153921604156494 + }, + { + "auxiliary_loss_clip": 0.01118286, + "auxiliary_loss_mlp": 0.0104199, + "balance_loss_clip": 1.04450262, + "balance_loss_mlp": 1.02705264, + "epoch": 0.8499624229670825, + "flos": 18588512833920.0, + "grad_norm": 2.0494201227594164, + "language_loss": 0.84311187, + "learning_rate": 2.3144651490374835e-07, + "loss": 0.86471462, + "num_input_tokens_seen": 304915465, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.1494751, + "step": 14137, + "time_per_iteration": 2.432555675506592 + }, + { + "auxiliary_loss_clip": 0.01119149, + "auxiliary_loss_mlp": 0.01027112, + "balance_loss_clip": 1.04637575, + "balance_loss_mlp": 1.01648426, + "epoch": 0.8500225462197505, + "flos": 24345854328960.0, + "grad_norm": 2.3383931920818997, + "language_loss": 0.78916967, + "learning_rate": 2.3126468415851773e-07, + "loss": 0.81063229, + "num_input_tokens_seen": 304933190, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.10632324, + "step": 14138, + "time_per_iteration": 2.4389703273773193 + }, + { + "auxiliary_loss_clip": 0.01117965, + "auxiliary_loss_mlp": 0.01024612, + "balance_loss_clip": 1.04501617, + "balance_loss_mlp": 1.01302481, + "epoch": 0.8500826694724185, + "flos": 16545518979840.0, + "grad_norm": 1.9153054010520696, + "language_loss": 0.64815223, + "learning_rate": 2.310829204839073e-07, + "loss": 0.66957808, + "num_input_tokens_seen": 304951110, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11590576, + "step": 14139, + "time_per_iteration": 2.4400134086608887 + }, + { + "auxiliary_loss_clip": 0.01120682, + "auxiliary_loss_mlp": 0.01028665, + "balance_loss_clip": 1.04886007, + "balance_loss_mlp": 1.0177753, + "epoch": 0.8501427927250864, + "flos": 16289404030080.0, + "grad_norm": 1.649264943486243, + "language_loss": 0.70708418, + "learning_rate": 2.3090122388681043e-07, + "loss": 0.72857761, + "num_input_tokens_seen": 304969095, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.10888672, + "step": 14140, + "time_per_iteration": 2.4188435077667236 + }, + { + "auxiliary_loss_clip": 0.01120378, + "auxiliary_loss_mlp": 0.01029885, + "balance_loss_clip": 1.04559386, + "balance_loss_mlp": 1.01785636, + "epoch": 0.8502029159777544, + "flos": 26687912820480.0, + "grad_norm": 1.750174628369344, + "language_loss": 0.64217502, + "learning_rate": 2.3071959437411648e-07, + "loss": 0.66367763, + "num_input_tokens_seen": 304989315, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.12023926, + "step": 14141, + "time_per_iteration": 2.5035898685455322 + }, + { + "auxiliary_loss_clip": 0.01116906, + "auxiliary_loss_mlp": 0.01033272, + "balance_loss_clip": 1.0449177, + "balance_loss_mlp": 1.02160144, + "epoch": 0.8502630392304223, + "flos": 35590778179200.0, + "grad_norm": 2.5851470241831684, + "language_loss": 0.70706111, + "learning_rate": 2.3053803195271214e-07, + "loss": 0.72856289, + "num_input_tokens_seen": 305011020, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.11676025, + "step": 14142, + "time_per_iteration": 2.5588159561157227 + }, + { + "auxiliary_loss_clip": 0.01111839, + "auxiliary_loss_mlp": 0.01025413, + "balance_loss_clip": 1.03967452, + "balance_loss_mlp": 1.01436782, + "epoch": 0.8503231624830904, + "flos": 21649466125440.0, + "grad_norm": 7.655471229741175, + "language_loss": 0.65615642, + "learning_rate": 2.3035653662948375e-07, + "loss": 0.67752898, + "num_input_tokens_seen": 305033550, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11035156, + "step": 14143, + "time_per_iteration": 2.52058482170105 + }, + { + "auxiliary_loss_clip": 0.01114008, + "auxiliary_loss_mlp": 0.01030301, + "balance_loss_clip": 1.03876102, + "balance_loss_mlp": 1.01798725, + "epoch": 0.8503832857357583, + "flos": 22417451838720.0, + "grad_norm": 2.2909955343518744, + "language_loss": 0.67836398, + "learning_rate": 2.3017510841131216e-07, + "loss": 0.69980705, + "num_input_tokens_seen": 305052885, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.12322998, + "step": 14144, + "time_per_iteration": 2.48641037940979 + }, + { + "auxiliary_loss_clip": 0.01111501, + "auxiliary_loss_mlp": 0.01031265, + "balance_loss_clip": 1.04080462, + "balance_loss_mlp": 1.01912344, + "epoch": 0.8504434089884263, + "flos": 18697968552960.0, + "grad_norm": 2.9209406651594327, + "language_loss": 0.64585954, + "learning_rate": 2.299937473050777e-07, + "loss": 0.66728711, + "num_input_tokens_seen": 305071995, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.12133789, + "step": 14145, + "time_per_iteration": 3.8506956100463867 + }, + { + "auxiliary_loss_clip": 0.01119485, + "auxiliary_loss_mlp": 0.01034872, + "balance_loss_clip": 1.04817808, + "balance_loss_mlp": 1.02231884, + "epoch": 0.8505035322410942, + "flos": 20007989475840.0, + "grad_norm": 1.9110292730276328, + "language_loss": 0.85132289, + "learning_rate": 2.2981245331765842e-07, + "loss": 0.87286645, + "num_input_tokens_seen": 305090190, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.12554932, + "step": 14146, + "time_per_iteration": 2.5120480060577393 + }, + { + "auxiliary_loss_clip": 0.01114022, + "auxiliary_loss_mlp": 0.01027689, + "balance_loss_clip": 1.04318047, + "balance_loss_mlp": 1.0163281, + "epoch": 0.8505636554937622, + "flos": 20812173120000.0, + "grad_norm": 4.783058451049038, + "language_loss": 0.83841217, + "learning_rate": 2.2963122645592814e-07, + "loss": 0.85982931, + "num_input_tokens_seen": 305109355, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.11352539, + "step": 14147, + "time_per_iteration": 2.4463307857513428 + }, + { + "auxiliary_loss_clip": 0.01118376, + "auxiliary_loss_mlp": 0.01029543, + "balance_loss_clip": 1.04355037, + "balance_loss_mlp": 1.01799726, + "epoch": 0.8506237787464301, + "flos": 14174445277440.0, + "grad_norm": 2.9154188167984207, + "language_loss": 0.857862, + "learning_rate": 2.2945006672675894e-07, + "loss": 0.87934119, + "num_input_tokens_seen": 305124165, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.11541748, + "step": 14148, + "time_per_iteration": 2.421020269393921 + }, + { + "auxiliary_loss_clip": 0.01115709, + "auxiliary_loss_mlp": 0.01029945, + "balance_loss_clip": 1.04354024, + "balance_loss_mlp": 1.01782179, + "epoch": 0.8506839019990982, + "flos": 23258372117760.0, + "grad_norm": 2.1323952897707454, + "language_loss": 0.72037756, + "learning_rate": 2.292689741370204e-07, + "loss": 0.7418341, + "num_input_tokens_seen": 305143940, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.12127686, + "step": 14149, + "time_per_iteration": 2.4357266426086426 + }, + { + "auxiliary_loss_clip": 0.01117461, + "auxiliary_loss_mlp": 0.01032352, + "balance_loss_clip": 1.04413342, + "balance_loss_mlp": 1.02060413, + "epoch": 0.8507440252517661, + "flos": 23659206963840.0, + "grad_norm": 1.7390604335895499, + "language_loss": 0.76184422, + "learning_rate": 2.290879486935804e-07, + "loss": 0.78334236, + "num_input_tokens_seen": 305163505, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11761475, + "step": 14150, + "time_per_iteration": 2.5045483112335205 + }, + { + "auxiliary_loss_clip": 0.01115636, + "auxiliary_loss_mlp": 0.01030396, + "balance_loss_clip": 1.04537857, + "balance_loss_mlp": 1.01923227, + "epoch": 0.8508041485044341, + "flos": 18661339658880.0, + "grad_norm": 1.732546754883835, + "language_loss": 0.72471881, + "learning_rate": 2.2890699040330231e-07, + "loss": 0.74617922, + "num_input_tokens_seen": 305182325, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.11169434, + "step": 14151, + "time_per_iteration": 3.9883224964141846 + }, + { + "auxiliary_loss_clip": 0.01042653, + "auxiliary_loss_mlp": 0.01005796, + "balance_loss_clip": 1.01750362, + "balance_loss_mlp": 1.0043726, + "epoch": 0.8508642717571021, + "flos": 52510918055040.0, + "grad_norm": 0.8852232640377913, + "language_loss": 0.59638184, + "learning_rate": 2.2872609927304909e-07, + "loss": 0.61686641, + "num_input_tokens_seen": 305230775, + "router_z_loss_clip": 0.2512207, + "router_z_loss_mlp": 0.01423645, + "step": 14152, + "time_per_iteration": 2.84964919090271 + }, + { + "auxiliary_loss_clip": 0.01052098, + "auxiliary_loss_mlp": 0.0100395, + "balance_loss_clip": 1.02695239, + "balance_loss_mlp": 1.00264931, + "epoch": 0.85092439500977, + "flos": 69297145050240.0, + "grad_norm": 0.6919894235918588, + "language_loss": 0.61082864, + "learning_rate": 2.285452753096797e-07, + "loss": 0.63138914, + "num_input_tokens_seen": 305296000, + "router_z_loss_clip": 0.2512207, + "router_z_loss_mlp": 0.01300049, + "step": 14153, + "time_per_iteration": 3.132270336151123 + }, + { + "auxiliary_loss_clip": 0.0111718, + "auxiliary_loss_mlp": 0.01030477, + "balance_loss_clip": 1.04653454, + "balance_loss_mlp": 1.01871097, + "epoch": 0.850984518262438, + "flos": 24389737770240.0, + "grad_norm": 2.2624586046581485, + "language_loss": 0.80727863, + "learning_rate": 2.2836451852005067e-07, + "loss": 0.8287552, + "num_input_tokens_seen": 305314705, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.11773682, + "step": 14154, + "time_per_iteration": 2.5336947441101074 + }, + { + "auxiliary_loss_clip": 0.01112651, + "auxiliary_loss_mlp": 0.01029553, + "balance_loss_clip": 1.04521394, + "balance_loss_mlp": 1.0193367, + "epoch": 0.851044641515106, + "flos": 23294821443840.0, + "grad_norm": 1.6041642562128569, + "language_loss": 0.79633075, + "learning_rate": 2.281838289110165e-07, + "loss": 0.81775278, + "num_input_tokens_seen": 305333870, + "router_z_loss_clip": 0.67382812, + "router_z_loss_mlp": 0.10217285, + "step": 14155, + "time_per_iteration": 2.5712766647338867 + }, + { + "auxiliary_loss_clip": 0.01118423, + "auxiliary_loss_mlp": 0.01027951, + "balance_loss_clip": 1.04101634, + "balance_loss_mlp": 1.01591659, + "epoch": 0.851104764767774, + "flos": 22050085489920.0, + "grad_norm": 1.7327474574955029, + "language_loss": 0.70619291, + "learning_rate": 2.2800320648942904e-07, + "loss": 0.72765672, + "num_input_tokens_seen": 305352780, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.12042236, + "step": 14156, + "time_per_iteration": 2.4340341091156006 + }, + { + "auxiliary_loss_clip": 0.01112213, + "auxiliary_loss_mlp": 0.01026049, + "balance_loss_clip": 1.0426321, + "balance_loss_mlp": 1.01503956, + "epoch": 0.8511648880204419, + "flos": 20704728562560.0, + "grad_norm": 1.7750065804584827, + "language_loss": 0.73350167, + "learning_rate": 2.278226512621386e-07, + "loss": 0.75488424, + "num_input_tokens_seen": 305371370, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.11016846, + "step": 14157, + "time_per_iteration": 2.463561773300171 + }, + { + "auxiliary_loss_clip": 0.01113847, + "auxiliary_loss_mlp": 0.01026657, + "balance_loss_clip": 1.04520833, + "balance_loss_mlp": 1.01619673, + "epoch": 0.8512250112731099, + "flos": 24024669891840.0, + "grad_norm": 2.319363496187538, + "language_loss": 0.79613984, + "learning_rate": 2.2764216323598995e-07, + "loss": 0.81754488, + "num_input_tokens_seen": 305387955, + "router_z_loss_clip": 0.68701172, + "router_z_loss_mlp": 0.10461426, + "step": 14158, + "time_per_iteration": 2.4574975967407227 + }, + { + "auxiliary_loss_clip": 0.01111055, + "auxiliary_loss_mlp": 0.01031537, + "balance_loss_clip": 1.03998756, + "balance_loss_mlp": 1.01861477, + "epoch": 0.8512851345257778, + "flos": 22015467757440.0, + "grad_norm": 1.944550405856579, + "language_loss": 0.78908187, + "learning_rate": 2.27461742417828e-07, + "loss": 0.81050777, + "num_input_tokens_seen": 305406285, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.12915039, + "step": 14159, + "time_per_iteration": 3.880204677581787 + }, + { + "auxiliary_loss_clip": 0.01112704, + "auxiliary_loss_mlp": 0.01030434, + "balance_loss_clip": 1.04090714, + "balance_loss_mlp": 1.01880538, + "epoch": 0.8513452577784458, + "flos": 14830209924480.0, + "grad_norm": 6.996008902638425, + "language_loss": 0.70859456, + "learning_rate": 2.2728138881449488e-07, + "loss": 0.73002595, + "num_input_tokens_seen": 305424500, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11633301, + "step": 14160, + "time_per_iteration": 2.401853084564209 + }, + { + "auxiliary_loss_clip": 0.01121967, + "auxiliary_loss_mlp": 0.01030144, + "balance_loss_clip": 1.04485536, + "balance_loss_mlp": 1.01762152, + "epoch": 0.8514053810311137, + "flos": 33035662166400.0, + "grad_norm": 1.9313563887322682, + "language_loss": 0.70093071, + "learning_rate": 2.2710110243282866e-07, + "loss": 0.72245181, + "num_input_tokens_seen": 305442990, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.12518311, + "step": 14161, + "time_per_iteration": 2.640679121017456 + }, + { + "auxiliary_loss_clip": 0.01112619, + "auxiliary_loss_mlp": 0.01028865, + "balance_loss_clip": 1.03794611, + "balance_loss_mlp": 1.0173018, + "epoch": 0.8514655042837818, + "flos": 27564456412800.0, + "grad_norm": 2.872516597510504, + "language_loss": 0.77834332, + "learning_rate": 2.2692088327966653e-07, + "loss": 0.7997582, + "num_input_tokens_seen": 305463065, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11560059, + "step": 14162, + "time_per_iteration": 2.5312533378601074 + }, + { + "auxiliary_loss_clip": 0.0112009, + "auxiliary_loss_mlp": 0.01035141, + "balance_loss_clip": 1.04709172, + "balance_loss_mlp": 1.02267778, + "epoch": 0.8515256275364497, + "flos": 35556052705920.0, + "grad_norm": 2.490805416347065, + "language_loss": 0.76520216, + "learning_rate": 2.2674073136184235e-07, + "loss": 0.78675449, + "num_input_tokens_seen": 305489070, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.12469482, + "step": 14163, + "time_per_iteration": 2.60496187210083 + }, + { + "auxiliary_loss_clip": 0.01056191, + "auxiliary_loss_mlp": 0.01004887, + "balance_loss_clip": 1.03243613, + "balance_loss_mlp": 1.00363207, + "epoch": 0.8515857507891177, + "flos": 70207372621440.0, + "grad_norm": 0.7016053129869826, + "language_loss": 0.5500952, + "learning_rate": 2.2656064668618735e-07, + "loss": 0.57070589, + "num_input_tokens_seen": 305551490, + "router_z_loss_clip": 0.23754883, + "router_z_loss_mlp": 0.01254272, + "step": 14164, + "time_per_iteration": 3.128241539001465 + }, + { + "auxiliary_loss_clip": 0.01111592, + "auxiliary_loss_mlp": 0.01035456, + "balance_loss_clip": 1.0395391, + "balance_loss_mlp": 1.02203333, + "epoch": 0.8516458740417857, + "flos": 22675290641280.0, + "grad_norm": 2.629649378100821, + "language_loss": 0.72769165, + "learning_rate": 2.2638062925953005e-07, + "loss": 0.7491622, + "num_input_tokens_seen": 305570535, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.13439941, + "step": 14165, + "time_per_iteration": 2.4751291275024414 + }, + { + "auxiliary_loss_clip": 0.01115106, + "auxiliary_loss_mlp": 0.01030318, + "balance_loss_clip": 1.04259181, + "balance_loss_mlp": 1.01835537, + "epoch": 0.8517059972944536, + "flos": 22747435107840.0, + "grad_norm": 1.5844711710731483, + "language_loss": 0.67436647, + "learning_rate": 2.26200679088697e-07, + "loss": 0.69582075, + "num_input_tokens_seen": 305590800, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11968994, + "step": 14166, + "time_per_iteration": 2.5089855194091797 + }, + { + "auxiliary_loss_clip": 0.01114533, + "auxiliary_loss_mlp": 0.01029199, + "balance_loss_clip": 1.04239202, + "balance_loss_mlp": 1.01849389, + "epoch": 0.8517661205471216, + "flos": 21689147675520.0, + "grad_norm": 1.8293412746657205, + "language_loss": 0.73348111, + "learning_rate": 2.260207961805125e-07, + "loss": 0.75491846, + "num_input_tokens_seen": 305609495, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.10705566, + "step": 14167, + "time_per_iteration": 2.4771552085876465 + }, + { + "auxiliary_loss_clip": 0.01115757, + "auxiliary_loss_mlp": 0.01028516, + "balance_loss_clip": 1.0439167, + "balance_loss_mlp": 1.01729202, + "epoch": 0.8518262437997896, + "flos": 25374839241600.0, + "grad_norm": 1.5311286643825979, + "language_loss": 0.80375195, + "learning_rate": 2.258409805417969e-07, + "loss": 0.82519466, + "num_input_tokens_seen": 305629420, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11230469, + "step": 14168, + "time_per_iteration": 2.6440234184265137 + }, + { + "auxiliary_loss_clip": 0.0111714, + "auxiliary_loss_mlp": 0.01024937, + "balance_loss_clip": 1.045349, + "balance_loss_mlp": 1.01370692, + "epoch": 0.8518863670524576, + "flos": 27235406897280.0, + "grad_norm": 2.1651802320502354, + "language_loss": 0.75935853, + "learning_rate": 2.2566123217936893e-07, + "loss": 0.78077924, + "num_input_tokens_seen": 305649835, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11230469, + "step": 14169, + "time_per_iteration": 2.561293601989746 + }, + { + "auxiliary_loss_clip": 0.01122008, + "auxiliary_loss_mlp": 0.01029175, + "balance_loss_clip": 1.04714823, + "balance_loss_mlp": 1.01669383, + "epoch": 0.8519464903051255, + "flos": 20959514709120.0, + "grad_norm": 1.704851908039883, + "language_loss": 0.63545561, + "learning_rate": 2.254815511000452e-07, + "loss": 0.6569674, + "num_input_tokens_seen": 305668840, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.12481689, + "step": 14170, + "time_per_iteration": 2.4818801879882812 + }, + { + "auxiliary_loss_clip": 0.01112, + "auxiliary_loss_mlp": 0.01028258, + "balance_loss_clip": 1.03782368, + "balance_loss_mlp": 1.01681399, + "epoch": 0.8520066135577935, + "flos": 18441745862400.0, + "grad_norm": 2.1969684973073935, + "language_loss": 0.86503673, + "learning_rate": 2.253019373106384e-07, + "loss": 0.88643932, + "num_input_tokens_seen": 305686955, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11444092, + "step": 14171, + "time_per_iteration": 2.438142776489258 + }, + { + "auxiliary_loss_clip": 0.01118073, + "auxiliary_loss_mlp": 0.01034846, + "balance_loss_clip": 1.04460013, + "balance_loss_mlp": 1.02318776, + "epoch": 0.8520667368104614, + "flos": 29130233149440.0, + "grad_norm": 1.8861071023142248, + "language_loss": 0.54792923, + "learning_rate": 2.2512239081796003e-07, + "loss": 0.56945843, + "num_input_tokens_seen": 305706290, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11663818, + "step": 14172, + "time_per_iteration": 2.541477680206299 + }, + { + "auxiliary_loss_clip": 0.0111199, + "auxiliary_loss_mlp": 0.01025126, + "balance_loss_clip": 1.04204762, + "balance_loss_mlp": 1.0153209, + "epoch": 0.8521268600631294, + "flos": 16034366488320.0, + "grad_norm": 2.7010004760355284, + "language_loss": 0.69453955, + "learning_rate": 2.2494291162881862e-07, + "loss": 0.71591067, + "num_input_tokens_seen": 305723835, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.09802246, + "step": 14173, + "time_per_iteration": 3.8952372074127197 + }, + { + "auxiliary_loss_clip": 0.01112062, + "auxiliary_loss_mlp": 0.01035457, + "balance_loss_clip": 1.03886986, + "balance_loss_mlp": 1.02145004, + "epoch": 0.8521869833157973, + "flos": 22454870832000.0, + "grad_norm": 2.7418973191702536, + "language_loss": 0.77422124, + "learning_rate": 2.247634997500205e-07, + "loss": 0.79569644, + "num_input_tokens_seen": 305741655, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.13995361, + "step": 14174, + "time_per_iteration": 2.48475980758667 + }, + { + "auxiliary_loss_clip": 0.01123707, + "auxiliary_loss_mlp": 0.01032053, + "balance_loss_clip": 1.04773331, + "balance_loss_mlp": 1.02028751, + "epoch": 0.8522471065684654, + "flos": 24972029147520.0, + "grad_norm": 1.6820085659840194, + "language_loss": 0.82119083, + "learning_rate": 2.245841551883676e-07, + "loss": 0.84274852, + "num_input_tokens_seen": 305761890, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.11773682, + "step": 14175, + "time_per_iteration": 2.4807209968566895 + }, + { + "auxiliary_loss_clip": 0.01122651, + "auxiliary_loss_mlp": 0.0103151, + "balance_loss_clip": 1.04789829, + "balance_loss_mlp": 1.0189873, + "epoch": 0.8523072298211333, + "flos": 17710604524800.0, + "grad_norm": 9.104982530337107, + "language_loss": 0.65778357, + "learning_rate": 2.2440487795066153e-07, + "loss": 0.67932516, + "num_input_tokens_seen": 305779190, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.12536621, + "step": 14176, + "time_per_iteration": 2.4574697017669678 + }, + { + "auxiliary_loss_clip": 0.01113667, + "auxiliary_loss_mlp": 0.01028209, + "balance_loss_clip": 1.04362631, + "balance_loss_mlp": 1.01639545, + "epoch": 0.8523673530738013, + "flos": 25446193608960.0, + "grad_norm": 1.7710964909637466, + "language_loss": 0.78417552, + "learning_rate": 2.2422566804370068e-07, + "loss": 0.80559433, + "num_input_tokens_seen": 305799870, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.11810303, + "step": 14177, + "time_per_iteration": 2.4848334789276123 + }, + { + "auxiliary_loss_clip": 0.01117706, + "auxiliary_loss_mlp": 0.01029615, + "balance_loss_clip": 1.04517746, + "balance_loss_mlp": 1.01785541, + "epoch": 0.8524274763264693, + "flos": 31429593348480.0, + "grad_norm": 1.682851150445018, + "language_loss": 0.73396236, + "learning_rate": 2.2404652547428026e-07, + "loss": 0.75543553, + "num_input_tokens_seen": 305819695, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11761475, + "step": 14178, + "time_per_iteration": 2.520559549331665 + }, + { + "auxiliary_loss_clip": 0.01114768, + "auxiliary_loss_mlp": 0.01037, + "balance_loss_clip": 1.04149652, + "balance_loss_mlp": 1.02537143, + "epoch": 0.8524875995791372, + "flos": 17712651600000.0, + "grad_norm": 1.755688768368071, + "language_loss": 0.75078273, + "learning_rate": 2.238674502491935e-07, + "loss": 0.77230048, + "num_input_tokens_seen": 305837270, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11621094, + "step": 14179, + "time_per_iteration": 2.452789068222046 + }, + { + "auxiliary_loss_clip": 0.01116038, + "auxiliary_loss_mlp": 0.01026104, + "balance_loss_clip": 1.04471695, + "balance_loss_mlp": 1.01454639, + "epoch": 0.8525477228318052, + "flos": 21687316081920.0, + "grad_norm": 1.9014965867656068, + "language_loss": 0.81853414, + "learning_rate": 2.2368844237523165e-07, + "loss": 0.83995551, + "num_input_tokens_seen": 305855250, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.11547852, + "step": 14180, + "time_per_iteration": 2.5607612133026123 + }, + { + "auxiliary_loss_clip": 0.01114811, + "auxiliary_loss_mlp": 0.01029797, + "balance_loss_clip": 1.04286635, + "balance_loss_mlp": 1.01913977, + "epoch": 0.8526078460844732, + "flos": 24827057856000.0, + "grad_norm": 2.227304756023522, + "language_loss": 0.60776031, + "learning_rate": 2.235095018591815e-07, + "loss": 0.62920642, + "num_input_tokens_seen": 305875660, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.10656738, + "step": 14181, + "time_per_iteration": 2.5225541591644287 + }, + { + "auxiliary_loss_clip": 0.01113151, + "auxiliary_loss_mlp": 0.01030581, + "balance_loss_clip": 1.04371226, + "balance_loss_mlp": 1.0198642, + "epoch": 0.8526679693371412, + "flos": 13516418073600.0, + "grad_norm": 2.2218373754635588, + "language_loss": 0.72451532, + "learning_rate": 2.2333062870782894e-07, + "loss": 0.74595261, + "num_input_tokens_seen": 305892415, + "router_z_loss_clip": 0.69482422, + "router_z_loss_mlp": 0.1071167, + "step": 14182, + "time_per_iteration": 2.4542973041534424 + }, + { + "auxiliary_loss_clip": 0.01111798, + "auxiliary_loss_mlp": 0.01034483, + "balance_loss_clip": 1.04035926, + "balance_loss_mlp": 1.02181137, + "epoch": 0.8527280925898091, + "flos": 23514092017920.0, + "grad_norm": 3.5162113825147, + "language_loss": 0.7094363, + "learning_rate": 2.2315182292795697e-07, + "loss": 0.7308991, + "num_input_tokens_seen": 305912665, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.12658691, + "step": 14183, + "time_per_iteration": 2.4585084915161133 + }, + { + "auxiliary_loss_clip": 0.01110355, + "auxiliary_loss_mlp": 0.01028683, + "balance_loss_clip": 1.04218471, + "balance_loss_mlp": 1.01847315, + "epoch": 0.8527882158424771, + "flos": 20303031790080.0, + "grad_norm": 2.0780604452568956, + "language_loss": 0.72476995, + "learning_rate": 2.2297308452634644e-07, + "loss": 0.74616039, + "num_input_tokens_seen": 305931515, + "router_z_loss_clip": 0.68261719, + "router_z_loss_mlp": 0.10211182, + "step": 14184, + "time_per_iteration": 2.4449386596679688 + }, + { + "auxiliary_loss_clip": 0.01118882, + "auxiliary_loss_mlp": 0.01035837, + "balance_loss_clip": 1.04336774, + "balance_loss_mlp": 1.02329636, + "epoch": 0.852848339095145, + "flos": 17202504689280.0, + "grad_norm": 1.926584135897264, + "language_loss": 0.7665984, + "learning_rate": 2.2279441350977457e-07, + "loss": 0.7881456, + "num_input_tokens_seen": 305949965, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12536621, + "step": 14185, + "time_per_iteration": 2.403204917907715 + }, + { + "auxiliary_loss_clip": 0.01118241, + "auxiliary_loss_mlp": 0.01026935, + "balance_loss_clip": 1.04465079, + "balance_loss_mlp": 1.01535428, + "epoch": 0.852908462347813, + "flos": 18368990864640.0, + "grad_norm": 2.117547269818943, + "language_loss": 0.79477453, + "learning_rate": 2.2261580988501637e-07, + "loss": 0.81622624, + "num_input_tokens_seen": 305967820, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11578369, + "step": 14186, + "time_per_iteration": 2.413652181625366 + }, + { + "auxiliary_loss_clip": 0.01109553, + "auxiliary_loss_mlp": 0.01028732, + "balance_loss_clip": 1.03691411, + "balance_loss_mlp": 1.01625049, + "epoch": 0.8529685856004809, + "flos": 18624890332800.0, + "grad_norm": 1.9341278358388991, + "language_loss": 0.62843597, + "learning_rate": 2.224372736588449e-07, + "loss": 0.64981878, + "num_input_tokens_seen": 305985505, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.12481689, + "step": 14187, + "time_per_iteration": 2.393028497695923 + }, + { + "auxiliary_loss_clip": 0.0112709, + "auxiliary_loss_mlp": 0.01030063, + "balance_loss_clip": 1.04784667, + "balance_loss_mlp": 1.01745057, + "epoch": 0.853028708853149, + "flos": 29607665748480.0, + "grad_norm": 1.7899134521200286, + "language_loss": 0.76303512, + "learning_rate": 2.2225880483803005e-07, + "loss": 0.7846067, + "num_input_tokens_seen": 306005220, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.1262207, + "step": 14188, + "time_per_iteration": 2.520968198776245 + }, + { + "auxiliary_loss_clip": 0.01115122, + "auxiliary_loss_mlp": 0.01027749, + "balance_loss_clip": 1.04153061, + "balance_loss_mlp": 1.01526821, + "epoch": 0.8530888321058169, + "flos": 26353153042560.0, + "grad_norm": 2.160782237216904, + "language_loss": 0.78386116, + "learning_rate": 2.2208040342933932e-07, + "loss": 0.80528986, + "num_input_tokens_seen": 306023785, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.12481689, + "step": 14189, + "time_per_iteration": 2.462981700897217 + }, + { + "auxiliary_loss_clip": 0.01117776, + "auxiliary_loss_mlp": 0.01028181, + "balance_loss_clip": 1.04463291, + "balance_loss_mlp": 1.016433, + "epoch": 0.8531489553584849, + "flos": 20521979141760.0, + "grad_norm": 2.072154601127411, + "language_loss": 0.79529142, + "learning_rate": 2.2190206943953793e-07, + "loss": 0.816751, + "num_input_tokens_seen": 306041600, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11743164, + "step": 14190, + "time_per_iteration": 3.824352741241455 + }, + { + "auxiliary_loss_clip": 0.01110544, + "auxiliary_loss_mlp": 0.01037164, + "balance_loss_clip": 1.03960276, + "balance_loss_mlp": 1.02248907, + "epoch": 0.8532090786111529, + "flos": 20704297599360.0, + "grad_norm": 3.3020179928356796, + "language_loss": 0.75908542, + "learning_rate": 2.2172380287538894e-07, + "loss": 0.78056252, + "num_input_tokens_seen": 306060345, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.14685059, + "step": 14191, + "time_per_iteration": 2.4386048316955566 + }, + { + "auxiliary_loss_clip": 0.01112399, + "auxiliary_loss_mlp": 0.01030865, + "balance_loss_clip": 1.04115236, + "balance_loss_mlp": 1.01874137, + "epoch": 0.8532692018638208, + "flos": 19828903242240.0, + "grad_norm": 1.9229687558000539, + "language_loss": 0.69119012, + "learning_rate": 2.2154560374365073e-07, + "loss": 0.7126227, + "num_input_tokens_seen": 306078285, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.12115479, + "step": 14192, + "time_per_iteration": 2.4356484413146973 + }, + { + "auxiliary_loss_clip": 0.01126356, + "auxiliary_loss_mlp": 0.01033276, + "balance_loss_clip": 1.0483048, + "balance_loss_mlp": 1.02009761, + "epoch": 0.8533293251164888, + "flos": 20996790048000.0, + "grad_norm": 5.280123107480918, + "language_loss": 0.63202846, + "learning_rate": 2.2136747205108164e-07, + "loss": 0.65362477, + "num_input_tokens_seen": 306093760, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.13195801, + "step": 14193, + "time_per_iteration": 2.5851056575775146 + }, + { + "auxiliary_loss_clip": 0.01118384, + "auxiliary_loss_mlp": 0.01032654, + "balance_loss_clip": 1.04378748, + "balance_loss_mlp": 1.02078688, + "epoch": 0.8533894483691568, + "flos": 22419606654720.0, + "grad_norm": 2.188863483545897, + "language_loss": 0.76846921, + "learning_rate": 2.211894078044365e-07, + "loss": 0.78997964, + "num_input_tokens_seen": 306112595, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.11871338, + "step": 14194, + "time_per_iteration": 3.9571468830108643 + }, + { + "auxiliary_loss_clip": 0.0111782, + "auxiliary_loss_mlp": 0.01029783, + "balance_loss_clip": 1.04234004, + "balance_loss_mlp": 1.0178678, + "epoch": 0.8534495716218248, + "flos": 21616536332160.0, + "grad_norm": 12.233913010354682, + "language_loss": 0.69879955, + "learning_rate": 2.2101141101046705e-07, + "loss": 0.72027564, + "num_input_tokens_seen": 306131800, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.11920166, + "step": 14195, + "time_per_iteration": 2.450622797012329 + }, + { + "auxiliary_loss_clip": 0.01112844, + "auxiliary_loss_mlp": 0.0102995, + "balance_loss_clip": 1.03941715, + "balance_loss_mlp": 1.01810622, + "epoch": 0.8535096948744927, + "flos": 22346277039360.0, + "grad_norm": 2.6255487658413212, + "language_loss": 0.85929787, + "learning_rate": 2.2083348167592343e-07, + "loss": 0.88072574, + "num_input_tokens_seen": 306150590, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11846924, + "step": 14196, + "time_per_iteration": 2.4366049766540527 + }, + { + "auxiliary_loss_clip": 0.01050416, + "auxiliary_loss_mlp": 0.01003494, + "balance_loss_clip": 1.02559733, + "balance_loss_mlp": 1.00229597, + "epoch": 0.8535698181271607, + "flos": 52762507891200.0, + "grad_norm": 0.7646013726468716, + "language_loss": 0.55081642, + "learning_rate": 2.2065561980755243e-07, + "loss": 0.57135552, + "num_input_tokens_seen": 306205850, + "router_z_loss_clip": 0.24780273, + "router_z_loss_mlp": 0.01199341, + "step": 14197, + "time_per_iteration": 3.042616128921509 + }, + { + "auxiliary_loss_clip": 0.0111452, + "auxiliary_loss_mlp": 0.01033592, + "balance_loss_clip": 1.04456973, + "balance_loss_mlp": 1.02235103, + "epoch": 0.8536299413798286, + "flos": 19062892776960.0, + "grad_norm": 8.217786235890323, + "language_loss": 0.81771255, + "learning_rate": 2.2047782541209826e-07, + "loss": 0.83919364, + "num_input_tokens_seen": 306225220, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.11242676, + "step": 14198, + "time_per_iteration": 2.481078624725342 + }, + { + "auxiliary_loss_clip": 0.01118777, + "auxiliary_loss_mlp": 0.01028174, + "balance_loss_clip": 1.04717112, + "balance_loss_mlp": 1.01718879, + "epoch": 0.8536900646324966, + "flos": 49344743871360.0, + "grad_norm": 2.517564035601769, + "language_loss": 0.68402398, + "learning_rate": 2.203000984963035e-07, + "loss": 0.70549357, + "num_input_tokens_seen": 306249865, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.10998535, + "step": 14199, + "time_per_iteration": 2.708235025405884 + }, + { + "auxiliary_loss_clip": 0.01109035, + "auxiliary_loss_mlp": 0.01025535, + "balance_loss_clip": 1.03921556, + "balance_loss_mlp": 1.01525915, + "epoch": 0.8537501878851645, + "flos": 21762333636480.0, + "grad_norm": 1.529046339944739, + "language_loss": 0.86277843, + "learning_rate": 2.201224390669072e-07, + "loss": 0.88412416, + "num_input_tokens_seen": 306270215, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.1027832, + "step": 14200, + "time_per_iteration": 2.4762187004089355 + }, + { + "auxiliary_loss_clip": 0.01117239, + "auxiliary_loss_mlp": 0.01026648, + "balance_loss_clip": 1.04320717, + "balance_loss_mlp": 1.01534677, + "epoch": 0.8538103111378326, + "flos": 22269176496000.0, + "grad_norm": 1.8346277577414092, + "language_loss": 0.77967805, + "learning_rate": 2.1994484713064666e-07, + "loss": 0.80111688, + "num_input_tokens_seen": 306288960, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11303711, + "step": 14201, + "time_per_iteration": 2.457943916320801 + }, + { + "auxiliary_loss_clip": 0.0111605, + "auxiliary_loss_mlp": 0.01027253, + "balance_loss_clip": 1.04360545, + "balance_loss_mlp": 1.01626205, + "epoch": 0.8538704343905005, + "flos": 20303929630080.0, + "grad_norm": 3.377617436745533, + "language_loss": 0.68836659, + "learning_rate": 2.19767322694256e-07, + "loss": 0.70979965, + "num_input_tokens_seen": 306308735, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.10992432, + "step": 14202, + "time_per_iteration": 2.507392644882202 + }, + { + "auxiliary_loss_clip": 0.01116429, + "auxiliary_loss_mlp": 0.01033218, + "balance_loss_clip": 1.04378653, + "balance_loss_mlp": 1.02185798, + "epoch": 0.8539305576431685, + "flos": 24755164784640.0, + "grad_norm": 2.0946854335600618, + "language_loss": 0.80215383, + "learning_rate": 2.195898657644666e-07, + "loss": 0.82365036, + "num_input_tokens_seen": 306329015, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11364746, + "step": 14203, + "time_per_iteration": 3.961622476577759 + }, + { + "auxiliary_loss_clip": 0.01116317, + "auxiliary_loss_mlp": 0.01030138, + "balance_loss_clip": 1.04297197, + "balance_loss_mlp": 1.01789546, + "epoch": 0.8539906808958365, + "flos": 26687625511680.0, + "grad_norm": 2.610729075920314, + "language_loss": 0.66203511, + "learning_rate": 2.1941247634800808e-07, + "loss": 0.68349969, + "num_input_tokens_seen": 306349085, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.12249756, + "step": 14204, + "time_per_iteration": 2.562558174133301 + }, + { + "auxiliary_loss_clip": 0.01116619, + "auxiliary_loss_mlp": 0.01033339, + "balance_loss_clip": 1.04378903, + "balance_loss_mlp": 1.02126944, + "epoch": 0.8540508041485044, + "flos": 13365521038080.0, + "grad_norm": 2.084909861836519, + "language_loss": 0.60239315, + "learning_rate": 2.1923515445160667e-07, + "loss": 0.62389266, + "num_input_tokens_seen": 306365385, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.12078857, + "step": 14205, + "time_per_iteration": 2.4859790802001953 + }, + { + "auxiliary_loss_clip": 0.0112192, + "auxiliary_loss_mlp": 0.01027098, + "balance_loss_clip": 1.05024958, + "balance_loss_mlp": 1.01532626, + "epoch": 0.8541109274011724, + "flos": 32780876019840.0, + "grad_norm": 2.092329180747077, + "language_loss": 0.72364116, + "learning_rate": 2.1905790008198655e-07, + "loss": 0.74513137, + "num_input_tokens_seen": 306384585, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11767578, + "step": 14206, + "time_per_iteration": 2.5631628036499023 + }, + { + "auxiliary_loss_clip": 0.01117453, + "auxiliary_loss_mlp": 0.0103074, + "balance_loss_clip": 1.04494643, + "balance_loss_mlp": 1.01898623, + "epoch": 0.8541710506538404, + "flos": 17639286071040.0, + "grad_norm": 3.326950101844525, + "language_loss": 0.76548076, + "learning_rate": 2.1888071324586987e-07, + "loss": 0.78696269, + "num_input_tokens_seen": 306401565, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11749268, + "step": 14207, + "time_per_iteration": 2.406419515609741 + }, + { + "auxiliary_loss_clip": 0.01116031, + "auxiliary_loss_mlp": 0.01029496, + "balance_loss_clip": 1.04142308, + "balance_loss_mlp": 1.01726544, + "epoch": 0.8542311739065084, + "flos": 20263062931200.0, + "grad_norm": 1.7920515750639072, + "language_loss": 0.85290623, + "learning_rate": 2.1870359394997485e-07, + "loss": 0.87436152, + "num_input_tokens_seen": 306419995, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.12231445, + "step": 14208, + "time_per_iteration": 2.484168767929077 + }, + { + "auxiliary_loss_clip": 0.01120735, + "auxiliary_loss_mlp": 0.01031971, + "balance_loss_clip": 1.04779649, + "balance_loss_mlp": 1.02061617, + "epoch": 0.8542912971591763, + "flos": 17785657992960.0, + "grad_norm": 1.456733436038661, + "language_loss": 0.66181713, + "learning_rate": 2.1852654220101785e-07, + "loss": 0.68334424, + "num_input_tokens_seen": 306439240, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11352539, + "step": 14209, + "time_per_iteration": 2.4286019802093506 + }, + { + "auxiliary_loss_clip": 0.01111853, + "auxiliary_loss_mlp": 0.01027466, + "balance_loss_clip": 1.04110479, + "balance_loss_mlp": 1.01667786, + "epoch": 0.8543514204118443, + "flos": 26979507429120.0, + "grad_norm": 2.239694920582549, + "language_loss": 0.70374072, + "learning_rate": 2.1834955800571287e-07, + "loss": 0.7251339, + "num_input_tokens_seen": 306458425, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.10791016, + "step": 14210, + "time_per_iteration": 2.536323308944702 + }, + { + "auxiliary_loss_clip": 0.01104756, + "auxiliary_loss_mlp": 0.01028973, + "balance_loss_clip": 1.03487408, + "balance_loss_mlp": 1.01764202, + "epoch": 0.8544115436645122, + "flos": 24024598064640.0, + "grad_norm": 1.4306322983145987, + "language_loss": 0.70437264, + "learning_rate": 2.1817264137077141e-07, + "loss": 0.72570992, + "num_input_tokens_seen": 306477210, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.11328125, + "step": 14211, + "time_per_iteration": 2.511521816253662 + }, + { + "auxiliary_loss_clip": 0.01117107, + "auxiliary_loss_mlp": 0.01030666, + "balance_loss_clip": 1.04384398, + "balance_loss_mlp": 1.01917386, + "epoch": 0.8544716669171802, + "flos": 16617986668800.0, + "grad_norm": 2.2339361859190574, + "language_loss": 0.82176906, + "learning_rate": 2.1799579230290166e-07, + "loss": 0.84324682, + "num_input_tokens_seen": 306495820, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11505127, + "step": 14212, + "time_per_iteration": 2.48915433883667 + }, + { + "auxiliary_loss_clip": 0.01109441, + "auxiliary_loss_mlp": 0.01031319, + "balance_loss_clip": 1.03737891, + "balance_loss_mlp": 1.01847994, + "epoch": 0.8545317901698481, + "flos": 40005779489280.0, + "grad_norm": 2.104831954793471, + "language_loss": 0.66473299, + "learning_rate": 2.178190108088105e-07, + "loss": 0.68614066, + "num_input_tokens_seen": 306516420, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.12841797, + "step": 14213, + "time_per_iteration": 2.6275503635406494 + }, + { + "auxiliary_loss_clip": 0.01110653, + "auxiliary_loss_mlp": 0.01027889, + "balance_loss_clip": 1.03813064, + "balance_loss_mlp": 1.01620042, + "epoch": 0.8545919134225162, + "flos": 19902520166400.0, + "grad_norm": 2.1495476181582007, + "language_loss": 0.78054482, + "learning_rate": 2.1764229689520098e-07, + "loss": 0.80193025, + "num_input_tokens_seen": 306534785, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11694336, + "step": 14214, + "time_per_iteration": 2.5034267902374268 + }, + { + "auxiliary_loss_clip": 0.01118525, + "auxiliary_loss_mlp": 0.01027262, + "balance_loss_clip": 1.0433284, + "balance_loss_mlp": 1.01457763, + "epoch": 0.8546520366751841, + "flos": 18952970181120.0, + "grad_norm": 3.320064674652861, + "language_loss": 0.6649366, + "learning_rate": 2.1746565056877397e-07, + "loss": 0.68639451, + "num_input_tokens_seen": 306552440, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12695312, + "step": 14215, + "time_per_iteration": 2.431427240371704 + }, + { + "auxiliary_loss_clip": 0.01121089, + "auxiliary_loss_mlp": 0.01027679, + "balance_loss_clip": 1.0502665, + "balance_loss_mlp": 1.01610351, + "epoch": 0.8547121599278521, + "flos": 35621445415680.0, + "grad_norm": 1.7846138413932366, + "language_loss": 0.62648499, + "learning_rate": 2.172890718362279e-07, + "loss": 0.6479727, + "num_input_tokens_seen": 306573600, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.11584473, + "step": 14216, + "time_per_iteration": 2.6508736610412598 + }, + { + "auxiliary_loss_clip": 0.01111596, + "auxiliary_loss_mlp": 0.01032468, + "balance_loss_clip": 1.03744102, + "balance_loss_mlp": 1.02085757, + "epoch": 0.8547722831805201, + "flos": 16910048154240.0, + "grad_norm": 1.8667659043621, + "language_loss": 0.65834475, + "learning_rate": 2.17112560704259e-07, + "loss": 0.67978537, + "num_input_tokens_seen": 306592840, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.1161499, + "step": 14217, + "time_per_iteration": 2.4173240661621094 + }, + { + "auxiliary_loss_clip": 0.01111273, + "auxiliary_loss_mlp": 0.01029768, + "balance_loss_clip": 1.04119658, + "balance_loss_mlp": 1.01868749, + "epoch": 0.854832406433188, + "flos": 23002616304000.0, + "grad_norm": 1.6786973065317596, + "language_loss": 0.65039015, + "learning_rate": 2.1693611717956072e-07, + "loss": 0.67180061, + "num_input_tokens_seen": 306613210, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.11090088, + "step": 14218, + "time_per_iteration": 3.9986941814422607 + }, + { + "auxiliary_loss_clip": 0.01114117, + "auxiliary_loss_mlp": 0.01031405, + "balance_loss_clip": 1.03902388, + "balance_loss_mlp": 1.01863182, + "epoch": 0.854892529685856, + "flos": 20412595249920.0, + "grad_norm": 2.128840637255463, + "language_loss": 0.70598507, + "learning_rate": 2.167597412688238e-07, + "loss": 0.72744024, + "num_input_tokens_seen": 306631620, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.12762451, + "step": 14219, + "time_per_iteration": 2.4383978843688965 + }, + { + "auxiliary_loss_clip": 0.01120399, + "auxiliary_loss_mlp": 0.01034851, + "balance_loss_clip": 1.04464185, + "balance_loss_mlp": 1.02291775, + "epoch": 0.854952652938524, + "flos": 16398716094720.0, + "grad_norm": 2.3123328091259685, + "language_loss": 0.67084205, + "learning_rate": 2.1658343297873549e-07, + "loss": 0.69239455, + "num_input_tokens_seen": 306646695, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.11932373, + "step": 14220, + "time_per_iteration": 2.4087936878204346 + }, + { + "auxiliary_loss_clip": 0.01108865, + "auxiliary_loss_mlp": 0.01028461, + "balance_loss_clip": 1.03939009, + "balance_loss_mlp": 1.01743448, + "epoch": 0.855012776191192, + "flos": 21178677542400.0, + "grad_norm": 2.1080726106773473, + "language_loss": 0.71640718, + "learning_rate": 2.164071923159827e-07, + "loss": 0.73778045, + "num_input_tokens_seen": 306665465, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.11029053, + "step": 14221, + "time_per_iteration": 2.46652889251709 + }, + { + "auxiliary_loss_clip": 0.0111458, + "auxiliary_loss_mlp": 0.01033305, + "balance_loss_clip": 1.04248071, + "balance_loss_mlp": 1.02155113, + "epoch": 0.8550728994438599, + "flos": 26140993361280.0, + "grad_norm": 1.8764142849836187, + "language_loss": 0.60351026, + "learning_rate": 2.1623101928724763e-07, + "loss": 0.62498909, + "num_input_tokens_seen": 306685950, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11743164, + "step": 14222, + "time_per_iteration": 2.504237413406372 + }, + { + "auxiliary_loss_clip": 0.01108644, + "auxiliary_loss_mlp": 0.01028449, + "balance_loss_clip": 1.03867769, + "balance_loss_mlp": 1.01724279, + "epoch": 0.8551330226965279, + "flos": 22786793435520.0, + "grad_norm": 2.057275789099042, + "language_loss": 0.84148192, + "learning_rate": 2.1605491389921093e-07, + "loss": 0.86285281, + "num_input_tokens_seen": 306705740, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.11212158, + "step": 14223, + "time_per_iteration": 2.5436463356018066 + }, + { + "auxiliary_loss_clip": 0.01112863, + "auxiliary_loss_mlp": 0.0103734, + "balance_loss_clip": 1.04143798, + "balance_loss_mlp": 1.02442968, + "epoch": 0.8551931459491958, + "flos": 22419032037120.0, + "grad_norm": 1.52137331909126, + "language_loss": 0.73920405, + "learning_rate": 2.158788761585515e-07, + "loss": 0.76070607, + "num_input_tokens_seen": 306725065, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.12902832, + "step": 14224, + "time_per_iteration": 2.445617198944092 + }, + { + "auxiliary_loss_clip": 0.01121884, + "auxiliary_loss_mlp": 0.01027483, + "balance_loss_clip": 1.04806209, + "balance_loss_mlp": 1.01603866, + "epoch": 0.8552532692018638, + "flos": 19573183342080.0, + "grad_norm": 1.878818288977308, + "language_loss": 0.7541787, + "learning_rate": 2.1570290607194307e-07, + "loss": 0.77567232, + "num_input_tokens_seen": 306743630, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.11450195, + "step": 14225, + "time_per_iteration": 2.4292030334472656 + }, + { + "auxiliary_loss_clip": 0.01114908, + "auxiliary_loss_mlp": 0.01033923, + "balance_loss_clip": 1.04468954, + "balance_loss_mlp": 1.023283, + "epoch": 0.8553133924545318, + "flos": 26432767537920.0, + "grad_norm": 1.5641955627863084, + "language_loss": 0.76821971, + "learning_rate": 2.1552700364605925e-07, + "loss": 0.78970802, + "num_input_tokens_seen": 306763105, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.10644531, + "step": 14226, + "time_per_iteration": 2.4893620014190674 + }, + { + "auxiliary_loss_clip": 0.01110816, + "auxiliary_loss_mlp": 0.01037002, + "balance_loss_clip": 1.03711247, + "balance_loss_mlp": 1.02370381, + "epoch": 0.8553735157071998, + "flos": 16362446336640.0, + "grad_norm": 2.2637783685395925, + "language_loss": 0.55090249, + "learning_rate": 2.153511688875702e-07, + "loss": 0.57238066, + "num_input_tokens_seen": 306779875, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.13305664, + "step": 14227, + "time_per_iteration": 2.428786516189575 + }, + { + "auxiliary_loss_clip": 0.01114539, + "auxiliary_loss_mlp": 0.01028505, + "balance_loss_clip": 1.04295921, + "balance_loss_mlp": 1.01684093, + "epoch": 0.8554336389598677, + "flos": 20887334328960.0, + "grad_norm": 2.20846950954805, + "language_loss": 0.65478158, + "learning_rate": 2.151754018031442e-07, + "loss": 0.67621201, + "num_input_tokens_seen": 306800015, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11663818, + "step": 14228, + "time_per_iteration": 2.4541099071502686 + }, + { + "auxiliary_loss_clip": 0.01119908, + "auxiliary_loss_mlp": 0.01033621, + "balance_loss_clip": 1.04616308, + "balance_loss_mlp": 1.02103245, + "epoch": 0.8554937622125357, + "flos": 21284721469440.0, + "grad_norm": 2.1455026575101663, + "language_loss": 0.73845011, + "learning_rate": 2.1499970239944542e-07, + "loss": 0.75998545, + "num_input_tokens_seen": 306814160, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.12579346, + "step": 14229, + "time_per_iteration": 2.421252965927124 + }, + { + "auxiliary_loss_clip": 0.01113119, + "auxiliary_loss_mlp": 0.01027436, + "balance_loss_clip": 1.04172802, + "balance_loss_mlp": 1.01661134, + "epoch": 0.8555538854652037, + "flos": 22413178120320.0, + "grad_norm": 1.7478247336909736, + "language_loss": 0.73086184, + "learning_rate": 2.1482407068313724e-07, + "loss": 0.75226742, + "num_input_tokens_seen": 306833310, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.10821533, + "step": 14230, + "time_per_iteration": 2.4948651790618896 + }, + { + "auxiliary_loss_clip": 0.0111706, + "auxiliary_loss_mlp": 0.0102465, + "balance_loss_clip": 1.04493666, + "balance_loss_mlp": 1.01361132, + "epoch": 0.8556140087178716, + "flos": 20193719725440.0, + "grad_norm": 1.8294693394098436, + "language_loss": 0.82209444, + "learning_rate": 2.1464850666087897e-07, + "loss": 0.84351158, + "num_input_tokens_seen": 306851345, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.1104126, + "step": 14231, + "time_per_iteration": 2.429835081100464 + }, + { + "auxiliary_loss_clip": 0.01120439, + "auxiliary_loss_mlp": 0.01035971, + "balance_loss_clip": 1.04589963, + "balance_loss_mlp": 1.02239358, + "epoch": 0.8556741319705397, + "flos": 22638123043200.0, + "grad_norm": 1.8860074128199031, + "language_loss": 0.67705286, + "learning_rate": 2.1447301033932796e-07, + "loss": 0.69861698, + "num_input_tokens_seen": 306871040, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.13592529, + "step": 14232, + "time_per_iteration": 2.405748128890991 + }, + { + "auxiliary_loss_clip": 0.0112903, + "auxiliary_loss_mlp": 0.01033076, + "balance_loss_clip": 1.05376232, + "balance_loss_mlp": 1.02051163, + "epoch": 0.8557342552232076, + "flos": 23549320281600.0, + "grad_norm": 1.4757627086414395, + "language_loss": 0.67162305, + "learning_rate": 2.1429758172513955e-07, + "loss": 0.6932441, + "num_input_tokens_seen": 306891625, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.12561035, + "step": 14233, + "time_per_iteration": 3.872325897216797 + }, + { + "auxiliary_loss_clip": 0.01110997, + "auxiliary_loss_mlp": 0.01031202, + "balance_loss_clip": 1.04088473, + "balance_loss_mlp": 1.01997244, + "epoch": 0.8557943784758756, + "flos": 19609884063360.0, + "grad_norm": 1.8392034762511478, + "language_loss": 0.7693578, + "learning_rate": 2.1412222082496556e-07, + "loss": 0.79077983, + "num_input_tokens_seen": 306910020, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.11230469, + "step": 14234, + "time_per_iteration": 2.414595365524292 + }, + { + "auxiliary_loss_clip": 0.01041269, + "auxiliary_loss_mlp": 0.01005438, + "balance_loss_clip": 1.01649404, + "balance_loss_mlp": 1.00402939, + "epoch": 0.8558545017285435, + "flos": 70641891446400.0, + "grad_norm": 0.7470219419229236, + "language_loss": 0.57997191, + "learning_rate": 2.1394692764545684e-07, + "loss": 0.60043907, + "num_input_tokens_seen": 306969505, + "router_z_loss_clip": 0.24755859, + "router_z_loss_mlp": 0.01408386, + "step": 14235, + "time_per_iteration": 3.0565803050994873 + }, + { + "auxiliary_loss_clip": 0.01051784, + "auxiliary_loss_mlp": 0.01005105, + "balance_loss_clip": 1.02751064, + "balance_loss_mlp": 1.00386071, + "epoch": 0.8559146249812115, + "flos": 56649983086080.0, + "grad_norm": 0.7763550167574823, + "language_loss": 0.56601757, + "learning_rate": 2.1377170219325858e-07, + "loss": 0.58658648, + "num_input_tokens_seen": 307027710, + "router_z_loss_clip": 0.24267578, + "router_z_loss_mlp": 0.01243591, + "step": 14236, + "time_per_iteration": 2.9690654277801514 + }, + { + "auxiliary_loss_clip": 0.01114954, + "auxiliary_loss_mlp": 0.01030086, + "balance_loss_clip": 1.04283679, + "balance_loss_mlp": 1.01807594, + "epoch": 0.8559747482338794, + "flos": 22888240421760.0, + "grad_norm": 1.6839552614948472, + "language_loss": 0.70320243, + "learning_rate": 2.1359654447501673e-07, + "loss": 0.72465289, + "num_input_tokens_seen": 307045515, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.12011719, + "step": 14237, + "time_per_iteration": 2.4995429515838623 + }, + { + "auxiliary_loss_clip": 0.01109901, + "auxiliary_loss_mlp": 0.01029028, + "balance_loss_clip": 1.03906441, + "balance_loss_mlp": 1.01771522, + "epoch": 0.8560348714865474, + "flos": 22601925112320.0, + "grad_norm": 2.372803490729078, + "language_loss": 0.64028609, + "learning_rate": 2.1342145449737314e-07, + "loss": 0.66167545, + "num_input_tokens_seen": 307064470, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11315918, + "step": 14238, + "time_per_iteration": 3.907799005508423 + }, + { + "auxiliary_loss_clip": 0.01106162, + "auxiliary_loss_mlp": 0.01033056, + "balance_loss_clip": 1.03777671, + "balance_loss_mlp": 1.02175546, + "epoch": 0.8560949947392154, + "flos": 17931455297280.0, + "grad_norm": 1.4025480779503552, + "language_loss": 0.6917156, + "learning_rate": 2.1324643226696648e-07, + "loss": 0.71310782, + "num_input_tokens_seen": 307083900, + "router_z_loss_clip": 0.68310547, + "router_z_loss_mlp": 0.11303711, + "step": 14239, + "time_per_iteration": 2.475708246231079 + }, + { + "auxiliary_loss_clip": 0.01117537, + "auxiliary_loss_mlp": 0.01031417, + "balance_loss_clip": 1.04352844, + "balance_loss_mlp": 1.01959109, + "epoch": 0.8561551179918834, + "flos": 31026208636800.0, + "grad_norm": 2.2035092204995985, + "language_loss": 0.66597128, + "learning_rate": 2.1307147779043455e-07, + "loss": 0.68746084, + "num_input_tokens_seen": 307104590, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11834717, + "step": 14240, + "time_per_iteration": 2.5866756439208984 + }, + { + "auxiliary_loss_clip": 0.01122399, + "auxiliary_loss_mlp": 0.01035754, + "balance_loss_clip": 1.04461741, + "balance_loss_mlp": 1.02309394, + "epoch": 0.8562152412445513, + "flos": 30665198995200.0, + "grad_norm": 1.6109710653718985, + "language_loss": 0.62072861, + "learning_rate": 2.1289659107441182e-07, + "loss": 0.64231008, + "num_input_tokens_seen": 307125580, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.12658691, + "step": 14241, + "time_per_iteration": 2.536651611328125 + }, + { + "auxiliary_loss_clip": 0.01119185, + "auxiliary_loss_mlp": 0.01038522, + "balance_loss_clip": 1.04078913, + "balance_loss_mlp": 1.02556443, + "epoch": 0.8562753644972193, + "flos": 31576144838400.0, + "grad_norm": 1.5565569330850908, + "language_loss": 0.74631733, + "learning_rate": 2.1272177212552855e-07, + "loss": 0.76789439, + "num_input_tokens_seen": 307147625, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.12957764, + "step": 14242, + "time_per_iteration": 2.630509853363037 + }, + { + "auxiliary_loss_clip": 0.01115098, + "auxiliary_loss_mlp": 0.01040927, + "balance_loss_clip": 1.04091835, + "balance_loss_mlp": 1.02874994, + "epoch": 0.8563354877498872, + "flos": 26213640618240.0, + "grad_norm": 3.5484428015652973, + "language_loss": 0.76384747, + "learning_rate": 2.1254702095041498e-07, + "loss": 0.78540772, + "num_input_tokens_seen": 307164665, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.1217041, + "step": 14243, + "time_per_iteration": 2.474231004714966 + }, + { + "auxiliary_loss_clip": 0.01110135, + "auxiliary_loss_mlp": 0.01030207, + "balance_loss_clip": 1.03735948, + "balance_loss_mlp": 1.01848841, + "epoch": 0.8563956110025552, + "flos": 24134341092480.0, + "grad_norm": 1.7369194147309666, + "language_loss": 0.68637645, + "learning_rate": 2.123723375556974e-07, + "loss": 0.70777994, + "num_input_tokens_seen": 307182530, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11712646, + "step": 14244, + "time_per_iteration": 2.5347273349761963 + }, + { + "auxiliary_loss_clip": 0.01033114, + "auxiliary_loss_mlp": 0.01005255, + "balance_loss_clip": 1.00916743, + "balance_loss_mlp": 1.00375307, + "epoch": 0.8564557342552233, + "flos": 56271986311680.0, + "grad_norm": 0.753711346213865, + "language_loss": 0.58448899, + "learning_rate": 2.1219772194800046e-07, + "loss": 0.6048727, + "num_input_tokens_seen": 307241240, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.01499939, + "step": 14245, + "time_per_iteration": 2.990520715713501 + }, + { + "auxiliary_loss_clip": 0.01120842, + "auxiliary_loss_mlp": 0.010363, + "balance_loss_clip": 1.04397035, + "balance_loss_mlp": 1.02227533, + "epoch": 0.8565158575078912, + "flos": 23440618748160.0, + "grad_norm": 2.0329647493301146, + "language_loss": 0.77398634, + "learning_rate": 2.1202317413394488e-07, + "loss": 0.79555774, + "num_input_tokens_seen": 307261485, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.14019775, + "step": 14246, + "time_per_iteration": 3.878798246383667 + }, + { + "auxiliary_loss_clip": 0.01112428, + "auxiliary_loss_mlp": 0.01025819, + "balance_loss_clip": 1.03939772, + "balance_loss_mlp": 1.01426733, + "epoch": 0.8565759807605592, + "flos": 20375930442240.0, + "grad_norm": 2.6023694984926866, + "language_loss": 0.81916988, + "learning_rate": 2.1184869412014938e-07, + "loss": 0.84055245, + "num_input_tokens_seen": 307279160, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11566162, + "step": 14247, + "time_per_iteration": 2.4461097717285156 + }, + { + "auxiliary_loss_clip": 0.0112016, + "auxiliary_loss_mlp": 0.01027744, + "balance_loss_clip": 1.04660273, + "balance_loss_mlp": 1.01559639, + "epoch": 0.8566361040132271, + "flos": 18807101049600.0, + "grad_norm": 3.948492975721483, + "language_loss": 0.77676839, + "learning_rate": 2.1167428191323112e-07, + "loss": 0.7982474, + "num_input_tokens_seen": 307297920, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.121521, + "step": 14248, + "time_per_iteration": 2.423224687576294 + }, + { + "auxiliary_loss_clip": 0.0111306, + "auxiliary_loss_mlp": 0.01031817, + "balance_loss_clip": 1.03952718, + "balance_loss_mlp": 1.01910913, + "epoch": 0.8566962272658951, + "flos": 24535355506560.0, + "grad_norm": 3.5020649909380825, + "language_loss": 0.77907634, + "learning_rate": 2.1149993751980278e-07, + "loss": 0.80052519, + "num_input_tokens_seen": 307318320, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.12713623, + "step": 14249, + "time_per_iteration": 2.499256134033203 + }, + { + "auxiliary_loss_clip": 0.0111764, + "auxiliary_loss_mlp": 0.01032403, + "balance_loss_clip": 1.04684711, + "balance_loss_mlp": 1.02113199, + "epoch": 0.856756350518563, + "flos": 23178506227200.0, + "grad_norm": 1.857792654550398, + "language_loss": 0.78513825, + "learning_rate": 2.1132566094647597e-07, + "loss": 0.80663866, + "num_input_tokens_seen": 307336720, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.11279297, + "step": 14250, + "time_per_iteration": 2.5062062740325928 + }, + { + "auxiliary_loss_clip": 0.01112494, + "auxiliary_loss_mlp": 0.01026695, + "balance_loss_clip": 1.043594, + "balance_loss_mlp": 1.01652658, + "epoch": 0.856816473771231, + "flos": 20808581760000.0, + "grad_norm": 2.1509489914196043, + "language_loss": 0.79765058, + "learning_rate": 2.1115145219985942e-07, + "loss": 0.81904244, + "num_input_tokens_seen": 307354120, + "router_z_loss_clip": 0.68847656, + "router_z_loss_mlp": 0.10174561, + "step": 14251, + "time_per_iteration": 2.4585487842559814 + }, + { + "auxiliary_loss_clip": 0.01114603, + "auxiliary_loss_mlp": 0.01029069, + "balance_loss_clip": 1.04462135, + "balance_loss_mlp": 1.01861417, + "epoch": 0.856876597023899, + "flos": 20228157889920.0, + "grad_norm": 1.9247419173472338, + "language_loss": 0.6163168, + "learning_rate": 2.1097731128656005e-07, + "loss": 0.63775349, + "num_input_tokens_seen": 307373165, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.10455322, + "step": 14252, + "time_per_iteration": 2.424659252166748 + }, + { + "auxiliary_loss_clip": 0.01116855, + "auxiliary_loss_mlp": 0.01032538, + "balance_loss_clip": 1.0439415, + "balance_loss_mlp": 1.01875162, + "epoch": 0.856936720276567, + "flos": 18296128126080.0, + "grad_norm": 2.0623988110034284, + "language_loss": 0.69742036, + "learning_rate": 2.1080323821317924e-07, + "loss": 0.71891427, + "num_input_tokens_seen": 307391000, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.13800049, + "step": 14253, + "time_per_iteration": 2.530970573425293 + }, + { + "auxiliary_loss_clip": 0.01034362, + "auxiliary_loss_mlp": 0.01005675, + "balance_loss_clip": 1.01027918, + "balance_loss_mlp": 1.00411057, + "epoch": 0.8569968435292349, + "flos": 69878394933120.0, + "grad_norm": 0.7822207570293741, + "language_loss": 0.59172153, + "learning_rate": 2.1062923298631907e-07, + "loss": 0.61212194, + "num_input_tokens_seen": 307452865, + "router_z_loss_clip": 0.24047852, + "router_z_loss_mlp": 0.01564026, + "step": 14254, + "time_per_iteration": 3.1328516006469727 + }, + { + "auxiliary_loss_clip": 0.01114098, + "auxiliary_loss_mlp": 0.01034112, + "balance_loss_clip": 1.04266047, + "balance_loss_mlp": 1.02114785, + "epoch": 0.8570569667819029, + "flos": 25848572739840.0, + "grad_norm": 1.7332067538426752, + "language_loss": 0.81226575, + "learning_rate": 2.1045529561257825e-07, + "loss": 0.83374786, + "num_input_tokens_seen": 307471940, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.1295166, + "step": 14255, + "time_per_iteration": 2.5034148693084717 + }, + { + "auxiliary_loss_clip": 0.01114059, + "auxiliary_loss_mlp": 0.01024174, + "balance_loss_clip": 1.04441798, + "balance_loss_mlp": 1.01324868, + "epoch": 0.8571170900345708, + "flos": 23257115141760.0, + "grad_norm": 3.0111393250335174, + "language_loss": 0.67600721, + "learning_rate": 2.1028142609855126e-07, + "loss": 0.69738954, + "num_input_tokens_seen": 307488745, + "router_z_loss_clip": 0.69580078, + "router_z_loss_mlp": 0.109375, + "step": 14256, + "time_per_iteration": 2.4266645908355713 + }, + { + "auxiliary_loss_clip": 0.01111258, + "auxiliary_loss_mlp": 0.01030236, + "balance_loss_clip": 1.03980911, + "balance_loss_mlp": 1.01797581, + "epoch": 0.8571772132872388, + "flos": 18917670090240.0, + "grad_norm": 1.527693200444249, + "language_loss": 0.69927782, + "learning_rate": 2.1010762445083218e-07, + "loss": 0.72069275, + "num_input_tokens_seen": 307506855, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.12249756, + "step": 14257, + "time_per_iteration": 2.477447509765625 + }, + { + "auxiliary_loss_clip": 0.01116194, + "auxiliary_loss_mlp": 0.01033536, + "balance_loss_clip": 1.04561114, + "balance_loss_mlp": 1.02125692, + "epoch": 0.8572373365399069, + "flos": 33250120318080.0, + "grad_norm": 4.690807454804978, + "language_loss": 0.77046639, + "learning_rate": 2.0993389067601197e-07, + "loss": 0.7919637, + "num_input_tokens_seen": 307526115, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.1227417, + "step": 14258, + "time_per_iteration": 2.564523458480835 + }, + { + "auxiliary_loss_clip": 0.01112972, + "auxiliary_loss_mlp": 0.01027231, + "balance_loss_clip": 1.04377007, + "balance_loss_mlp": 1.01624608, + "epoch": 0.8572974597925748, + "flos": 23327535755520.0, + "grad_norm": 1.779063514069444, + "language_loss": 0.67869723, + "learning_rate": 2.0976022478067735e-07, + "loss": 0.70009923, + "num_input_tokens_seen": 307545230, + "router_z_loss_clip": 0.69238281, + "router_z_loss_mlp": 0.10986328, + "step": 14259, + "time_per_iteration": 2.561293601989746 + }, + { + "auxiliary_loss_clip": 0.0111721, + "auxiliary_loss_mlp": 0.01031669, + "balance_loss_clip": 1.0456953, + "balance_loss_mlp": 1.01985574, + "epoch": 0.8573575830452428, + "flos": 24535858296960.0, + "grad_norm": 2.122244927469555, + "language_loss": 0.77292174, + "learning_rate": 2.0958662677141437e-07, + "loss": 0.79441059, + "num_input_tokens_seen": 307564900, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.11804199, + "step": 14260, + "time_per_iteration": 2.600001573562622 + }, + { + "auxiliary_loss_clip": 0.01110375, + "auxiliary_loss_mlp": 0.01031432, + "balance_loss_clip": 1.03765273, + "balance_loss_mlp": 1.01777065, + "epoch": 0.8574177062979107, + "flos": 24165403378560.0, + "grad_norm": 1.7722359147195657, + "language_loss": 0.73991394, + "learning_rate": 2.09413096654806e-07, + "loss": 0.76133198, + "num_input_tokens_seen": 307583500, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.13671875, + "step": 14261, + "time_per_iteration": 4.134501218795776 + }, + { + "auxiliary_loss_clip": 0.0112458, + "auxiliary_loss_mlp": 0.01035017, + "balance_loss_clip": 1.04694772, + "balance_loss_mlp": 1.02248859, + "epoch": 0.8574778295505787, + "flos": 17930737025280.0, + "grad_norm": 1.9161166055974441, + "language_loss": 0.79266423, + "learning_rate": 2.0923963443743276e-07, + "loss": 0.81426024, + "num_input_tokens_seen": 307601430, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.12542725, + "step": 14262, + "time_per_iteration": 2.7190637588500977 + }, + { + "auxiliary_loss_clip": 0.01117533, + "auxiliary_loss_mlp": 0.01032341, + "balance_loss_clip": 1.04842138, + "balance_loss_mlp": 1.02173781, + "epoch": 0.8575379528032466, + "flos": 21580697537280.0, + "grad_norm": 1.5116736252486909, + "language_loss": 0.68253696, + "learning_rate": 2.0906624012587203e-07, + "loss": 0.7040357, + "num_input_tokens_seen": 307621495, + "router_z_loss_clip": 0.69091797, + "router_z_loss_mlp": 0.1060791, + "step": 14263, + "time_per_iteration": 2.6550793647766113 + }, + { + "auxiliary_loss_clip": 0.01121357, + "auxiliary_loss_mlp": 0.01029183, + "balance_loss_clip": 1.04809284, + "balance_loss_mlp": 1.01788819, + "epoch": 0.8575980760559146, + "flos": 21761579450880.0, + "grad_norm": 1.427811053338208, + "language_loss": 0.7961123, + "learning_rate": 2.088929137266986e-07, + "loss": 0.81761771, + "num_input_tokens_seen": 307640840, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11297607, + "step": 14264, + "time_per_iteration": 2.662005662918091 + }, + { + "auxiliary_loss_clip": 0.01116955, + "auxiliary_loss_mlp": 0.01030973, + "balance_loss_clip": 1.04580164, + "balance_loss_mlp": 1.0195353, + "epoch": 0.8576581993085826, + "flos": 34386442047360.0, + "grad_norm": 1.321586997384202, + "language_loss": 0.69707072, + "learning_rate": 2.0871965524648582e-07, + "loss": 0.71854997, + "num_input_tokens_seen": 307663820, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.11431885, + "step": 14265, + "time_per_iteration": 2.706655979156494 + }, + { + "auxiliary_loss_clip": 0.01107893, + "auxiliary_loss_mlp": 0.01024912, + "balance_loss_clip": 1.04002011, + "balance_loss_mlp": 1.01457632, + "epoch": 0.8577183225612506, + "flos": 23222497409280.0, + "grad_norm": 1.6464676803570653, + "language_loss": 0.66216862, + "learning_rate": 2.085464646918027e-07, + "loss": 0.68349671, + "num_input_tokens_seen": 307682385, + "router_z_loss_clip": 0.67871094, + "router_z_loss_mlp": 0.10339355, + "step": 14266, + "time_per_iteration": 2.7434377670288086 + }, + { + "auxiliary_loss_clip": 0.01110732, + "auxiliary_loss_mlp": 0.01035904, + "balance_loss_clip": 1.04018843, + "balance_loss_mlp": 1.02279067, + "epoch": 0.8577784458139185, + "flos": 28804164462720.0, + "grad_norm": 1.7558821496846524, + "language_loss": 0.75531131, + "learning_rate": 2.0837334206921731e-07, + "loss": 0.77677768, + "num_input_tokens_seen": 307704680, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.13110352, + "step": 14267, + "time_per_iteration": 2.6590161323547363 + }, + { + "auxiliary_loss_clip": 0.01111087, + "auxiliary_loss_mlp": 0.01024738, + "balance_loss_clip": 1.04159021, + "balance_loss_mlp": 1.01386654, + "epoch": 0.8578385690665865, + "flos": 19755573626880.0, + "grad_norm": 1.6457171980073202, + "language_loss": 0.87603498, + "learning_rate": 2.082002873852946e-07, + "loss": 0.89739323, + "num_input_tokens_seen": 307723245, + "router_z_loss_clip": 0.69482422, + "router_z_loss_mlp": 0.10864258, + "step": 14268, + "time_per_iteration": 2.6555540561676025 + }, + { + "auxiliary_loss_clip": 0.01120071, + "auxiliary_loss_mlp": 0.01032833, + "balance_loss_clip": 1.04475975, + "balance_loss_mlp": 1.02069151, + "epoch": 0.8578986923192544, + "flos": 20704082117760.0, + "grad_norm": 1.7532181243981788, + "language_loss": 0.72639275, + "learning_rate": 2.0802730064659667e-07, + "loss": 0.74792176, + "num_input_tokens_seen": 307742510, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.12145996, + "step": 14269, + "time_per_iteration": 2.4759106636047363 + }, + { + "auxiliary_loss_clip": 0.01117413, + "auxiliary_loss_mlp": 0.01030404, + "balance_loss_clip": 1.045017, + "balance_loss_mlp": 1.01861978, + "epoch": 0.8579588155719224, + "flos": 36101715189120.0, + "grad_norm": 1.5376056640877542, + "language_loss": 0.66588569, + "learning_rate": 2.0785438185968252e-07, + "loss": 0.68736386, + "num_input_tokens_seen": 307766030, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11785889, + "step": 14270, + "time_per_iteration": 2.5524096488952637 + }, + { + "auxiliary_loss_clip": 0.01113676, + "auxiliary_loss_mlp": 0.01026256, + "balance_loss_clip": 1.0429244, + "balance_loss_mlp": 1.01494944, + "epoch": 0.8580189388245905, + "flos": 22853479034880.0, + "grad_norm": 2.111683862101411, + "language_loss": 0.74064219, + "learning_rate": 2.0768153103110997e-07, + "loss": 0.76204157, + "num_input_tokens_seen": 307785800, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.11291504, + "step": 14271, + "time_per_iteration": 2.4424397945404053 + }, + { + "auxiliary_loss_clip": 0.01044667, + "auxiliary_loss_mlp": 0.01003664, + "balance_loss_clip": 1.01981771, + "balance_loss_mlp": 1.00222862, + "epoch": 0.8580790620772584, + "flos": 69642104290560.0, + "grad_norm": 0.7996903456883911, + "language_loss": 0.59429759, + "learning_rate": 2.0750874816743358e-07, + "loss": 0.6147809, + "num_input_tokens_seen": 307850995, + "router_z_loss_clip": 0.24829102, + "router_z_loss_mlp": 0.01434326, + "step": 14272, + "time_per_iteration": 3.113895893096924 + }, + { + "auxiliary_loss_clip": 0.01115179, + "auxiliary_loss_mlp": 0.01030914, + "balance_loss_clip": 1.04016018, + "balance_loss_mlp": 1.01893294, + "epoch": 0.8581391853299264, + "flos": 13334243270400.0, + "grad_norm": 2.1147146986659116, + "language_loss": 0.75494838, + "learning_rate": 2.0733603327520499e-07, + "loss": 0.77640927, + "num_input_tokens_seen": 307868585, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.11981201, + "step": 14273, + "time_per_iteration": 2.4533498287200928 + }, + { + "auxiliary_loss_clip": 0.01109824, + "auxiliary_loss_mlp": 0.01031464, + "balance_loss_clip": 1.03688145, + "balance_loss_mlp": 1.01923311, + "epoch": 0.8581993085825943, + "flos": 19645651031040.0, + "grad_norm": 2.922831910849676, + "language_loss": 0.81995314, + "learning_rate": 2.0716338636097385e-07, + "loss": 0.84136605, + "num_input_tokens_seen": 307886820, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.12237549, + "step": 14274, + "time_per_iteration": 2.4353880882263184 + }, + { + "auxiliary_loss_clip": 0.01032551, + "auxiliary_loss_mlp": 0.01003313, + "balance_loss_clip": 1.00860953, + "balance_loss_mlp": 1.00200737, + "epoch": 0.8582594318352623, + "flos": 55825077294720.0, + "grad_norm": 0.7930227058203592, + "language_loss": 0.60834384, + "learning_rate": 2.0699080743128672e-07, + "loss": 0.6287024, + "num_input_tokens_seen": 307944020, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.01306152, + "step": 14275, + "time_per_iteration": 3.1725029945373535 + }, + { + "auxiliary_loss_clip": 0.01118368, + "auxiliary_loss_mlp": 0.01023213, + "balance_loss_clip": 1.04269683, + "balance_loss_mlp": 1.01074386, + "epoch": 0.8583195550879302, + "flos": 24279563779200.0, + "grad_norm": 2.337104865963605, + "language_loss": 0.59248209, + "learning_rate": 2.0681829649268768e-07, + "loss": 0.61389792, + "num_input_tokens_seen": 307961055, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12475586, + "step": 14276, + "time_per_iteration": 2.479361057281494 + }, + { + "auxiliary_loss_clip": 0.01107292, + "auxiliary_loss_mlp": 0.01029171, + "balance_loss_clip": 1.03555512, + "balance_loss_mlp": 1.01735699, + "epoch": 0.8583796783405983, + "flos": 13444129952640.0, + "grad_norm": 1.8881252280407623, + "language_loss": 0.76079565, + "learning_rate": 2.0664585355171838e-07, + "loss": 0.78216028, + "num_input_tokens_seen": 307978690, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11810303, + "step": 14277, + "time_per_iteration": 3.874847173690796 + }, + { + "auxiliary_loss_clip": 0.01109144, + "auxiliary_loss_mlp": 0.01031172, + "balance_loss_clip": 1.03729081, + "balance_loss_mlp": 1.01728415, + "epoch": 0.8584398015932662, + "flos": 16180271533440.0, + "grad_norm": 1.5779881067102106, + "language_loss": 0.8378731, + "learning_rate": 2.0647347861491803e-07, + "loss": 0.85927629, + "num_input_tokens_seen": 307995870, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.13897705, + "step": 14278, + "time_per_iteration": 2.461089849472046 + }, + { + "auxiliary_loss_clip": 0.01117286, + "auxiliary_loss_mlp": 0.01037826, + "balance_loss_clip": 1.04256177, + "balance_loss_mlp": 1.0234015, + "epoch": 0.8584999248459342, + "flos": 17450431338240.0, + "grad_norm": 2.516929176528054, + "language_loss": 0.74413645, + "learning_rate": 2.0630117168882366e-07, + "loss": 0.76568758, + "num_input_tokens_seen": 308013645, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.14428711, + "step": 14279, + "time_per_iteration": 2.419936418533325 + }, + { + "auxiliary_loss_clip": 0.01117401, + "auxiliary_loss_mlp": 0.01036847, + "balance_loss_clip": 1.04364157, + "balance_loss_mlp": 1.02491438, + "epoch": 0.8585600480986021, + "flos": 23441013797760.0, + "grad_norm": 2.446010609042726, + "language_loss": 0.66368508, + "learning_rate": 2.0612893277996845e-07, + "loss": 0.68522763, + "num_input_tokens_seen": 308032490, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.1192627, + "step": 14280, + "time_per_iteration": 2.466607093811035 + }, + { + "auxiliary_loss_clip": 0.01116759, + "auxiliary_loss_mlp": 0.01026275, + "balance_loss_clip": 1.0456599, + "balance_loss_mlp": 1.01499748, + "epoch": 0.8586201713512701, + "flos": 19937927998080.0, + "grad_norm": 2.159201081340535, + "language_loss": 0.62619072, + "learning_rate": 2.0595676189488343e-07, + "loss": 0.64762104, + "num_input_tokens_seen": 308052110, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.11279297, + "step": 14281, + "time_per_iteration": 3.9062812328338623 + }, + { + "auxiliary_loss_clip": 0.01116725, + "auxiliary_loss_mlp": 0.01030104, + "balance_loss_clip": 1.04190266, + "balance_loss_mlp": 1.01785517, + "epoch": 0.858680294603938, + "flos": 15304769435520.0, + "grad_norm": 1.7325066222696732, + "language_loss": 0.73217154, + "learning_rate": 2.0578465904009845e-07, + "loss": 0.75363982, + "num_input_tokens_seen": 308070660, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.12243652, + "step": 14282, + "time_per_iteration": 2.4316115379333496 + }, + { + "auxiliary_loss_clip": 0.01113697, + "auxiliary_loss_mlp": 0.01022042, + "balance_loss_clip": 1.03991306, + "balance_loss_mlp": 1.01121211, + "epoch": 0.858740417856606, + "flos": 22711237176960.0, + "grad_norm": 1.759910925241174, + "language_loss": 0.75533921, + "learning_rate": 2.0561262422213832e-07, + "loss": 0.77669656, + "num_input_tokens_seen": 308089520, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.1083374, + "step": 14283, + "time_per_iteration": 2.441405773162842 + }, + { + "auxiliary_loss_clip": 0.01114414, + "auxiliary_loss_mlp": 0.01028052, + "balance_loss_clip": 1.04184031, + "balance_loss_mlp": 1.01655412, + "epoch": 0.8588005411092741, + "flos": 34054303962240.0, + "grad_norm": 1.950842305763519, + "language_loss": 0.59765184, + "learning_rate": 2.0544065744752736e-07, + "loss": 0.61907649, + "num_input_tokens_seen": 308111545, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.1149292, + "step": 14284, + "time_per_iteration": 2.599006175994873 + }, + { + "auxiliary_loss_clip": 0.01112857, + "auxiliary_loss_mlp": 0.01028427, + "balance_loss_clip": 1.04388189, + "balance_loss_mlp": 1.01712608, + "epoch": 0.858860664361942, + "flos": 28913584268160.0, + "grad_norm": 1.9884126908705175, + "language_loss": 0.76135463, + "learning_rate": 2.0526875872278749e-07, + "loss": 0.78276747, + "num_input_tokens_seen": 308129690, + "router_z_loss_clip": 0.69042969, + "router_z_loss_mlp": 0.11297607, + "step": 14285, + "time_per_iteration": 2.500969409942627 + }, + { + "auxiliary_loss_clip": 0.01120882, + "auxiliary_loss_mlp": 0.01032758, + "balance_loss_clip": 1.04863477, + "balance_loss_mlp": 1.02059293, + "epoch": 0.85892078761461, + "flos": 19792525743360.0, + "grad_norm": 1.815917676845117, + "language_loss": 0.74431193, + "learning_rate": 2.0509692805443524e-07, + "loss": 0.76584828, + "num_input_tokens_seen": 308147410, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.12158203, + "step": 14286, + "time_per_iteration": 2.4549529552459717 + }, + { + "auxiliary_loss_clip": 0.01032466, + "auxiliary_loss_mlp": 0.01002739, + "balance_loss_clip": 1.00833273, + "balance_loss_mlp": 1.00134587, + "epoch": 0.8589809108672779, + "flos": 67106630039040.0, + "grad_norm": 0.7689611805982263, + "language_loss": 0.49387988, + "learning_rate": 2.0492516544898718e-07, + "loss": 0.51423192, + "num_input_tokens_seen": 308204875, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.01393127, + "step": 14287, + "time_per_iteration": 3.0366389751434326 + }, + { + "auxiliary_loss_clip": 0.01119307, + "auxiliary_loss_mlp": 0.01027503, + "balance_loss_clip": 1.04617691, + "balance_loss_mlp": 1.01564765, + "epoch": 0.8590410341199459, + "flos": 29716259541120.0, + "grad_norm": 1.815800865347078, + "language_loss": 0.79167342, + "learning_rate": 2.0475347091295704e-07, + "loss": 0.81314152, + "num_input_tokens_seen": 308225690, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11853027, + "step": 14288, + "time_per_iteration": 2.5558629035949707 + }, + { + "auxiliary_loss_clip": 0.01111904, + "auxiliary_loss_mlp": 0.01032342, + "balance_loss_clip": 1.03819883, + "balance_loss_mlp": 1.01983666, + "epoch": 0.8591011573726138, + "flos": 23987430466560.0, + "grad_norm": 2.1998803020537276, + "language_loss": 0.80698323, + "learning_rate": 2.045818444528553e-07, + "loss": 0.82842571, + "num_input_tokens_seen": 308245255, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.12506104, + "step": 14289, + "time_per_iteration": 4.0012993812561035 + }, + { + "auxiliary_loss_clip": 0.01114436, + "auxiliary_loss_mlp": 0.01028811, + "balance_loss_clip": 1.04310393, + "balance_loss_mlp": 1.01746833, + "epoch": 0.8591612806252819, + "flos": 14428656806400.0, + "grad_norm": 1.8974534962366008, + "language_loss": 0.65175158, + "learning_rate": 2.0441028607518973e-07, + "loss": 0.67318404, + "num_input_tokens_seen": 308261755, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.11346436, + "step": 14290, + "time_per_iteration": 2.50124192237854 + }, + { + "auxiliary_loss_clip": 0.01122551, + "auxiliary_loss_mlp": 0.01030978, + "balance_loss_clip": 1.04742837, + "balance_loss_mlp": 1.0183357, + "epoch": 0.8592214038779498, + "flos": 31577150419200.0, + "grad_norm": 3.9229356686855037, + "language_loss": 0.54802847, + "learning_rate": 2.0423879578646642e-07, + "loss": 0.56956375, + "num_input_tokens_seen": 308285145, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12640381, + "step": 14291, + "time_per_iteration": 2.6152145862579346 + }, + { + "auxiliary_loss_clip": 0.01115129, + "auxiliary_loss_mlp": 0.01029321, + "balance_loss_clip": 1.04313576, + "balance_loss_mlp": 1.01750755, + "epoch": 0.8592815271306178, + "flos": 17457290835840.0, + "grad_norm": 2.3174259101238825, + "language_loss": 0.71554118, + "learning_rate": 2.0406737359318792e-07, + "loss": 0.73698568, + "num_input_tokens_seen": 308304130, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11816406, + "step": 14292, + "time_per_iteration": 2.5365850925445557 + }, + { + "auxiliary_loss_clip": 0.01111416, + "auxiliary_loss_mlp": 0.01030038, + "balance_loss_clip": 1.0391922, + "balance_loss_mlp": 1.01736033, + "epoch": 0.8593416503832857, + "flos": 25411360394880.0, + "grad_norm": 1.8849503356305495, + "language_loss": 0.71340513, + "learning_rate": 2.038960195018542e-07, + "loss": 0.73481971, + "num_input_tokens_seen": 308324670, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.12670898, + "step": 14293, + "time_per_iteration": 2.5580687522888184 + }, + { + "auxiliary_loss_clip": 0.01114528, + "auxiliary_loss_mlp": 0.010301, + "balance_loss_clip": 1.0429107, + "balance_loss_mlp": 1.01870966, + "epoch": 0.8594017736359537, + "flos": 20996646393600.0, + "grad_norm": 1.6226593181206184, + "language_loss": 0.68508995, + "learning_rate": 2.0372473351896358e-07, + "loss": 0.70653629, + "num_input_tokens_seen": 308344215, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.1138916, + "step": 14294, + "time_per_iteration": 2.4337377548217773 + }, + { + "auxiliary_loss_clip": 0.01117374, + "auxiliary_loss_mlp": 0.01027067, + "balance_loss_clip": 1.04646111, + "balance_loss_mlp": 1.01625443, + "epoch": 0.8594618968886216, + "flos": 22091059929600.0, + "grad_norm": 1.8752095285886552, + "language_loss": 0.78107834, + "learning_rate": 2.0355351565101087e-07, + "loss": 0.80252278, + "num_input_tokens_seen": 308360520, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.1081543, + "step": 14295, + "time_per_iteration": 2.5046417713165283 + }, + { + "auxiliary_loss_clip": 0.01123563, + "auxiliary_loss_mlp": 0.01033513, + "balance_loss_clip": 1.04760444, + "balance_loss_mlp": 1.02025676, + "epoch": 0.8595220201412896, + "flos": 11656245467520.0, + "grad_norm": 7.077929437972557, + "language_loss": 0.6962316, + "learning_rate": 2.0338236590448975e-07, + "loss": 0.71780241, + "num_input_tokens_seen": 308376865, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.13262939, + "step": 14296, + "time_per_iteration": 2.4415247440338135 + }, + { + "auxiliary_loss_clip": 0.0111705, + "auxiliary_loss_mlp": 0.01033016, + "balance_loss_clip": 1.04255211, + "balance_loss_mlp": 1.02065969, + "epoch": 0.8595821433939577, + "flos": 25040366772480.0, + "grad_norm": 2.0011461473997865, + "language_loss": 0.79890704, + "learning_rate": 2.0321128428588842e-07, + "loss": 0.82040763, + "num_input_tokens_seen": 308395870, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.12359619, + "step": 14297, + "time_per_iteration": 2.474240779876709 + }, + { + "auxiliary_loss_clip": 0.01108672, + "auxiliary_loss_mlp": 0.01029571, + "balance_loss_clip": 1.03884721, + "balance_loss_mlp": 1.01912785, + "epoch": 0.8596422666466256, + "flos": 28511528359680.0, + "grad_norm": 1.547708126544743, + "language_loss": 0.67980301, + "learning_rate": 2.030402708016954e-07, + "loss": 0.70118546, + "num_input_tokens_seen": 308417250, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.10437012, + "step": 14298, + "time_per_iteration": 2.4692277908325195 + }, + { + "auxiliary_loss_clip": 0.01110486, + "auxiliary_loss_mlp": 0.01034557, + "balance_loss_clip": 1.04208803, + "balance_loss_mlp": 1.02308261, + "epoch": 0.8597023898992936, + "flos": 13589137157760.0, + "grad_norm": 2.197450221605079, + "language_loss": 0.68852419, + "learning_rate": 2.0286932545839576e-07, + "loss": 0.70997459, + "num_input_tokens_seen": 308434565, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 0.11462402, + "step": 14299, + "time_per_iteration": 2.422118663787842 + }, + { + "auxiliary_loss_clip": 0.01121858, + "auxiliary_loss_mlp": 0.01031663, + "balance_loss_clip": 1.04719353, + "balance_loss_mlp": 1.01975441, + "epoch": 0.8597625131519615, + "flos": 32300821728000.0, + "grad_norm": 2.572182013468477, + "language_loss": 0.71429491, + "learning_rate": 2.0269844826247096e-07, + "loss": 0.73583019, + "num_input_tokens_seen": 308450040, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11907959, + "step": 14300, + "time_per_iteration": 2.586099863052368 + }, + { + "auxiliary_loss_clip": 0.01110107, + "auxiliary_loss_mlp": 0.01024225, + "balance_loss_clip": 1.04032493, + "balance_loss_mlp": 1.01352644, + "epoch": 0.8598226364046295, + "flos": 28730367970560.0, + "grad_norm": 1.4784094826638867, + "language_loss": 0.69275415, + "learning_rate": 2.0252763922040116e-07, + "loss": 0.7140975, + "num_input_tokens_seen": 308470545, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.10699463, + "step": 14301, + "time_per_iteration": 2.5712974071502686 + }, + { + "auxiliary_loss_clip": 0.0111365, + "auxiliary_loss_mlp": 0.01035947, + "balance_loss_clip": 1.04168046, + "balance_loss_mlp": 1.02334702, + "epoch": 0.8598827596572974, + "flos": 21871825269120.0, + "grad_norm": 1.6181830144251748, + "language_loss": 0.74330336, + "learning_rate": 2.023568983386641e-07, + "loss": 0.76479936, + "num_input_tokens_seen": 308490020, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.12585449, + "step": 14302, + "time_per_iteration": 2.502154588699341 + }, + { + "auxiliary_loss_clip": 0.01107522, + "auxiliary_loss_mlp": 0.01032968, + "balance_loss_clip": 1.03926516, + "balance_loss_mlp": 1.02114868, + "epoch": 0.8599428829099655, + "flos": 23767297966080.0, + "grad_norm": 1.7962209069263275, + "language_loss": 0.84055233, + "learning_rate": 2.02186225623733e-07, + "loss": 0.86195719, + "num_input_tokens_seen": 308509065, + "router_z_loss_clip": 0.68164062, + "router_z_loss_mlp": 0.11810303, + "step": 14303, + "time_per_iteration": 2.60233736038208 + }, + { + "auxiliary_loss_clip": 0.01115838, + "auxiliary_loss_mlp": 0.01033425, + "balance_loss_clip": 1.04354858, + "balance_loss_mlp": 1.02161145, + "epoch": 0.8600030061626334, + "flos": 16212770363520.0, + "grad_norm": 2.029002361745856, + "language_loss": 0.77312213, + "learning_rate": 2.0201562108208025e-07, + "loss": 0.79461479, + "num_input_tokens_seen": 308524725, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11816406, + "step": 14304, + "time_per_iteration": 2.4912240505218506 + }, + { + "auxiliary_loss_clip": 0.01113745, + "auxiliary_loss_mlp": 0.01035675, + "balance_loss_clip": 1.04168224, + "balance_loss_mlp": 1.02240753, + "epoch": 0.8600631294153014, + "flos": 15669370437120.0, + "grad_norm": 2.176826642159735, + "language_loss": 0.53890008, + "learning_rate": 2.0184508472017537e-07, + "loss": 0.56039429, + "num_input_tokens_seen": 308543525, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.13262939, + "step": 14305, + "time_per_iteration": 3.9685168266296387 + }, + { + "auxiliary_loss_clip": 0.01110435, + "auxiliary_loss_mlp": 0.01028713, + "balance_loss_clip": 1.03845596, + "balance_loss_mlp": 1.01601076, + "epoch": 0.8601232526679693, + "flos": 17493093717120.0, + "grad_norm": 2.070006449447888, + "language_loss": 0.83743227, + "learning_rate": 2.0167461654448558e-07, + "loss": 0.85882378, + "num_input_tokens_seen": 308557995, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.12713623, + "step": 14306, + "time_per_iteration": 2.413970947265625 + }, + { + "auxiliary_loss_clip": 0.01108254, + "auxiliary_loss_mlp": 0.0102829, + "balance_loss_clip": 1.03871584, + "balance_loss_mlp": 1.01748943, + "epoch": 0.8601833759206373, + "flos": 26985935963520.0, + "grad_norm": 2.1019152200642677, + "language_loss": 0.7120598, + "learning_rate": 2.01504216561474e-07, + "loss": 0.73342526, + "num_input_tokens_seen": 308582750, + "router_z_loss_clip": 0.69482422, + "router_z_loss_mlp": 0.10797119, + "step": 14307, + "time_per_iteration": 2.603405237197876 + }, + { + "auxiliary_loss_clip": 0.01115607, + "auxiliary_loss_mlp": 0.01032551, + "balance_loss_clip": 1.04012787, + "balance_loss_mlp": 1.0197953, + "epoch": 0.8602434991733052, + "flos": 25229760209280.0, + "grad_norm": 1.7679835594309357, + "language_loss": 0.63842106, + "learning_rate": 2.0133388477760316e-07, + "loss": 0.65990263, + "num_input_tokens_seen": 308603770, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12756348, + "step": 14308, + "time_per_iteration": 2.5858819484710693 + }, + { + "auxiliary_loss_clip": 0.01035605, + "auxiliary_loss_mlp": 0.00999914, + "balance_loss_clip": 1.01155257, + "balance_loss_mlp": 0.99857551, + "epoch": 0.8603036224259732, + "flos": 71015363107200.0, + "grad_norm": 0.6287912626401218, + "language_loss": 0.48451716, + "learning_rate": 2.0116362119933172e-07, + "loss": 0.50487238, + "num_input_tokens_seen": 308667735, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.01338196, + "step": 14309, + "time_per_iteration": 3.1706345081329346 + }, + { + "auxiliary_loss_clip": 0.01119943, + "auxiliary_loss_mlp": 0.01035971, + "balance_loss_clip": 1.04544318, + "balance_loss_mlp": 1.02272117, + "epoch": 0.8603637456786413, + "flos": 20300625578880.0, + "grad_norm": 1.9481904803897065, + "language_loss": 0.67271376, + "learning_rate": 2.0099342583311563e-07, + "loss": 0.69427288, + "num_input_tokens_seen": 308686300, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.13262939, + "step": 14310, + "time_per_iteration": 2.46185564994812 + }, + { + "auxiliary_loss_clip": 0.01111202, + "auxiliary_loss_mlp": 0.01028572, + "balance_loss_clip": 1.038445, + "balance_loss_mlp": 1.01750934, + "epoch": 0.8604238689313092, + "flos": 21835842819840.0, + "grad_norm": 1.8534067687768563, + "language_loss": 0.78320146, + "learning_rate": 2.0082329868540905e-07, + "loss": 0.80459917, + "num_input_tokens_seen": 308705825, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.1105957, + "step": 14311, + "time_per_iteration": 2.479508876800537 + }, + { + "auxiliary_loss_clip": 0.01110772, + "auxiliary_loss_mlp": 0.01028135, + "balance_loss_clip": 1.03963268, + "balance_loss_mlp": 1.01654744, + "epoch": 0.8604839921839772, + "flos": 18004210295040.0, + "grad_norm": 2.0641438046993392, + "language_loss": 0.71718442, + "learning_rate": 2.006532397626639e-07, + "loss": 0.73857355, + "num_input_tokens_seen": 308723340, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.1159668, + "step": 14312, + "time_per_iteration": 2.4502370357513428 + }, + { + "auxiliary_loss_clip": 0.01105548, + "auxiliary_loss_mlp": 0.01027609, + "balance_loss_clip": 1.03484344, + "balance_loss_mlp": 1.01616549, + "epoch": 0.8605441154366451, + "flos": 16252164604800.0, + "grad_norm": 2.2668197460854147, + "language_loss": 0.78043205, + "learning_rate": 2.0048324907132797e-07, + "loss": 0.80176365, + "num_input_tokens_seen": 308741280, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.11450195, + "step": 14313, + "time_per_iteration": 2.422661304473877 + }, + { + "auxiliary_loss_clip": 0.01110395, + "auxiliary_loss_mlp": 0.01033222, + "balance_loss_clip": 1.04107285, + "balance_loss_mlp": 1.02079475, + "epoch": 0.8606042386893131, + "flos": 32267065921920.0, + "grad_norm": 2.8944463999401897, + "language_loss": 0.72857887, + "learning_rate": 2.003133266178474e-07, + "loss": 0.75001508, + "num_input_tokens_seen": 308762875, + "router_z_loss_clip": 0.69287109, + "router_z_loss_mlp": 0.12414551, + "step": 14314, + "time_per_iteration": 2.5682191848754883 + }, + { + "auxiliary_loss_clip": 0.01108235, + "auxiliary_loss_mlp": 0.0102722, + "balance_loss_clip": 1.03680444, + "balance_loss_mlp": 1.01550758, + "epoch": 0.860664361941981, + "flos": 20229774001920.0, + "grad_norm": 1.895624119073007, + "language_loss": 0.68995726, + "learning_rate": 2.001434724086657e-07, + "loss": 0.71131182, + "num_input_tokens_seen": 308780315, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.11694336, + "step": 14315, + "time_per_iteration": 2.5043187141418457 + }, + { + "auxiliary_loss_clip": 0.01105764, + "auxiliary_loss_mlp": 0.01033591, + "balance_loss_clip": 1.03555143, + "balance_loss_mlp": 1.02220082, + "epoch": 0.8607244851946491, + "flos": 25191622944000.0, + "grad_norm": 2.152527952732161, + "language_loss": 0.7210201, + "learning_rate": 1.9997368645022418e-07, + "loss": 0.74241364, + "num_input_tokens_seen": 308799435, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.11401367, + "step": 14316, + "time_per_iteration": 2.5387134552001953 + }, + { + "auxiliary_loss_clip": 0.01114992, + "auxiliary_loss_mlp": 0.01029022, + "balance_loss_clip": 1.04142547, + "balance_loss_mlp": 1.01767313, + "epoch": 0.860784608447317, + "flos": 20482082110080.0, + "grad_norm": 4.1447992579053725, + "language_loss": 0.82668722, + "learning_rate": 1.9980396874896056e-07, + "loss": 0.84812737, + "num_input_tokens_seen": 308817730, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11358643, + "step": 14317, + "time_per_iteration": 2.4469122886657715 + }, + { + "auxiliary_loss_clip": 0.01105821, + "auxiliary_loss_mlp": 0.01028488, + "balance_loss_clip": 1.03656304, + "balance_loss_mlp": 1.0169189, + "epoch": 0.860844731699985, + "flos": 50476037696640.0, + "grad_norm": 1.6558325390746533, + "language_loss": 0.67484564, + "learning_rate": 1.996343193113108e-07, + "loss": 0.69618875, + "num_input_tokens_seen": 308841735, + "router_z_loss_clip": 0.69335938, + "router_z_loss_mlp": 0.11578369, + "step": 14318, + "time_per_iteration": 2.705655097961426 + }, + { + "auxiliary_loss_clip": 0.01108932, + "auxiliary_loss_mlp": 0.01027022, + "balance_loss_clip": 1.03874648, + "balance_loss_mlp": 1.01648366, + "epoch": 0.8609048549526529, + "flos": 41172768455040.0, + "grad_norm": 1.583065053356612, + "language_loss": 0.71299815, + "learning_rate": 1.9946473814370911e-07, + "loss": 0.73435771, + "num_input_tokens_seen": 308865050, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.10540771, + "step": 14319, + "time_per_iteration": 2.5952579975128174 + }, + { + "auxiliary_loss_clip": 0.01120238, + "auxiliary_loss_mlp": 0.01039607, + "balance_loss_clip": 1.0444845, + "balance_loss_mlp": 1.02669704, + "epoch": 0.8609649782053209, + "flos": 23951196622080.0, + "grad_norm": 1.6210142559674423, + "language_loss": 0.67167449, + "learning_rate": 1.992952252525839e-07, + "loss": 0.69327295, + "num_input_tokens_seen": 308885375, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.12927246, + "step": 14320, + "time_per_iteration": 3.8359882831573486 + }, + { + "auxiliary_loss_clip": 0.01118632, + "auxiliary_loss_mlp": 0.01033594, + "balance_loss_clip": 1.04280174, + "balance_loss_mlp": 1.02130306, + "epoch": 0.8610251014579888, + "flos": 23112574813440.0, + "grad_norm": 1.955662343347644, + "language_loss": 0.8006326, + "learning_rate": 1.9912578064436446e-07, + "loss": 0.82215488, + "num_input_tokens_seen": 308904700, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.12280273, + "step": 14321, + "time_per_iteration": 2.443943738937378 + }, + { + "auxiliary_loss_clip": 0.01110995, + "auxiliary_loss_mlp": 0.0102882, + "balance_loss_clip": 1.04207444, + "balance_loss_mlp": 1.01663065, + "epoch": 0.8610852247106568, + "flos": 19426811420160.0, + "grad_norm": 1.8573171225066325, + "language_loss": 0.71153396, + "learning_rate": 1.9895640432547567e-07, + "loss": 0.73293209, + "num_input_tokens_seen": 308922985, + "router_z_loss_clip": 0.68945312, + "router_z_loss_mlp": 0.12200928, + "step": 14322, + "time_per_iteration": 2.439556837081909 + }, + { + "auxiliary_loss_clip": 0.01120056, + "auxiliary_loss_mlp": 0.0103564, + "balance_loss_clip": 1.0453229, + "balance_loss_mlp": 1.02158535, + "epoch": 0.8611453479633249, + "flos": 19312076401920.0, + "grad_norm": 2.0378340898575775, + "language_loss": 0.56046093, + "learning_rate": 1.9878709630234102e-07, + "loss": 0.5820179, + "num_input_tokens_seen": 308940765, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.14050293, + "step": 14323, + "time_per_iteration": 2.4261057376861572 + }, + { + "auxiliary_loss_clip": 0.01112228, + "auxiliary_loss_mlp": 0.01030034, + "balance_loss_clip": 1.04169142, + "balance_loss_mlp": 1.01717687, + "epoch": 0.8612054712159928, + "flos": 23253667436160.0, + "grad_norm": 1.6062401905057124, + "language_loss": 0.7557897, + "learning_rate": 1.986178565813801e-07, + "loss": 0.77721226, + "num_input_tokens_seen": 308960110, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.128479, + "step": 14324, + "time_per_iteration": 3.894463300704956 + }, + { + "auxiliary_loss_clip": 0.01108649, + "auxiliary_loss_mlp": 0.01030781, + "balance_loss_clip": 1.03749561, + "balance_loss_mlp": 1.01691127, + "epoch": 0.8612655944686608, + "flos": 16028440744320.0, + "grad_norm": 2.127361860758386, + "language_loss": 0.66194719, + "learning_rate": 1.9844868516901036e-07, + "loss": 0.6833415, + "num_input_tokens_seen": 308976665, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.13867188, + "step": 14325, + "time_per_iteration": 2.4680583477020264 + }, + { + "auxiliary_loss_clip": 0.0111759, + "auxiliary_loss_mlp": 0.01032149, + "balance_loss_clip": 1.04446983, + "balance_loss_mlp": 1.01947153, + "epoch": 0.8613257177213287, + "flos": 22492720788480.0, + "grad_norm": 1.5847422603973782, + "language_loss": 0.6457504, + "learning_rate": 1.982795820716472e-07, + "loss": 0.66724777, + "num_input_tokens_seen": 308997015, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.12683105, + "step": 14326, + "time_per_iteration": 2.450188636779785 + }, + { + "auxiliary_loss_clip": 0.01119065, + "auxiliary_loss_mlp": 0.01031784, + "balance_loss_clip": 1.04620111, + "balance_loss_mlp": 1.01968455, + "epoch": 0.8613858409739967, + "flos": 17238056175360.0, + "grad_norm": 2.03587493130875, + "language_loss": 0.84315693, + "learning_rate": 1.9811054729570253e-07, + "loss": 0.86466545, + "num_input_tokens_seen": 309015250, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.12097168, + "step": 14327, + "time_per_iteration": 2.5825936794281006 + }, + { + "auxiliary_loss_clip": 0.01110645, + "auxiliary_loss_mlp": 0.01028877, + "balance_loss_clip": 1.03931248, + "balance_loss_mlp": 1.01731992, + "epoch": 0.8614459642266646, + "flos": 22821123859200.0, + "grad_norm": 2.2113774541286206, + "language_loss": 0.75082648, + "learning_rate": 1.9794158084758661e-07, + "loss": 0.77222168, + "num_input_tokens_seen": 309034140, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.11541748, + "step": 14328, + "time_per_iteration": 2.474309206008911 + }, + { + "auxiliary_loss_clip": 0.01107722, + "auxiliary_loss_mlp": 0.01024608, + "balance_loss_clip": 1.03804719, + "balance_loss_mlp": 1.01309252, + "epoch": 0.8615060874793327, + "flos": 26504301473280.0, + "grad_norm": 1.6451071507334567, + "language_loss": 0.80099386, + "learning_rate": 1.9777268273370673e-07, + "loss": 0.82231712, + "num_input_tokens_seen": 309055075, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.11517334, + "step": 14329, + "time_per_iteration": 2.6124463081359863 + }, + { + "auxiliary_loss_clip": 0.0111219, + "auxiliary_loss_mlp": 0.01025458, + "balance_loss_clip": 1.04070187, + "balance_loss_mlp": 1.01397777, + "epoch": 0.8615662107320006, + "flos": 24061011477120.0, + "grad_norm": 2.235909476792249, + "language_loss": 0.76770616, + "learning_rate": 1.9760385296046757e-07, + "loss": 0.78908265, + "num_input_tokens_seen": 309074650, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.1149292, + "step": 14330, + "time_per_iteration": 2.5174038410186768 + }, + { + "auxiliary_loss_clip": 0.01112915, + "auxiliary_loss_mlp": 0.01031315, + "balance_loss_clip": 1.03829479, + "balance_loss_mlp": 1.0194416, + "epoch": 0.8616263339846686, + "flos": 24165044242560.0, + "grad_norm": 1.8794070052723688, + "language_loss": 0.65377843, + "learning_rate": 1.974350915342702e-07, + "loss": 0.67522073, + "num_input_tokens_seen": 309094385, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11877441, + "step": 14331, + "time_per_iteration": 2.5240979194641113 + }, + { + "auxiliary_loss_clip": 0.01120161, + "auxiliary_loss_mlp": 0.01029603, + "balance_loss_clip": 1.04826593, + "balance_loss_mlp": 1.0190587, + "epoch": 0.8616864572373365, + "flos": 21724340025600.0, + "grad_norm": 1.9041079262707072, + "language_loss": 0.75851297, + "learning_rate": 1.9726639846151506e-07, + "loss": 0.78001064, + "num_input_tokens_seen": 309111815, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.10552979, + "step": 14332, + "time_per_iteration": 2.4411544799804688 + }, + { + "auxiliary_loss_clip": 0.01114145, + "auxiliary_loss_mlp": 0.01030858, + "balance_loss_clip": 1.04020143, + "balance_loss_mlp": 1.0179832, + "epoch": 0.8617465804900045, + "flos": 23766651521280.0, + "grad_norm": 5.692004083871485, + "language_loss": 0.67305732, + "learning_rate": 1.9709777374859904e-07, + "loss": 0.69450736, + "num_input_tokens_seen": 309131385, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.12878418, + "step": 14333, + "time_per_iteration": 3.901055335998535 + }, + { + "auxiliary_loss_clip": 0.01117399, + "auxiliary_loss_mlp": 0.01032288, + "balance_loss_clip": 1.04118633, + "balance_loss_mlp": 1.01881695, + "epoch": 0.8618067037426724, + "flos": 37703941251840.0, + "grad_norm": 1.727085817610478, + "language_loss": 0.62274504, + "learning_rate": 1.969292174019157e-07, + "loss": 0.64424193, + "num_input_tokens_seen": 309155020, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.13470459, + "step": 14334, + "time_per_iteration": 2.5560224056243896 + }, + { + "auxiliary_loss_clip": 0.01119618, + "auxiliary_loss_mlp": 0.01040677, + "balance_loss_clip": 1.04486883, + "balance_loss_mlp": 1.02805841, + "epoch": 0.8618668269953405, + "flos": 21471026336640.0, + "grad_norm": 2.4138931218053754, + "language_loss": 0.69217503, + "learning_rate": 1.967607294278577e-07, + "loss": 0.71377796, + "num_input_tokens_seen": 309172865, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.12609863, + "step": 14335, + "time_per_iteration": 2.474022388458252 + }, + { + "auxiliary_loss_clip": 0.01115089, + "auxiliary_loss_mlp": 0.01027614, + "balance_loss_clip": 1.04179025, + "balance_loss_mlp": 1.01609254, + "epoch": 0.8619269502480085, + "flos": 22232691256320.0, + "grad_norm": 1.4162007020168705, + "language_loss": 0.82730246, + "learning_rate": 1.965923098328135e-07, + "loss": 0.84872949, + "num_input_tokens_seen": 309193575, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11517334, + "step": 14336, + "time_per_iteration": 2.5181281566619873 + }, + { + "auxiliary_loss_clip": 0.01117527, + "auxiliary_loss_mlp": 0.01030654, + "balance_loss_clip": 1.04198146, + "balance_loss_mlp": 1.01797628, + "epoch": 0.8619870735006764, + "flos": 22710626645760.0, + "grad_norm": 1.8620906339169896, + "language_loss": 0.67750102, + "learning_rate": 1.9642395862316907e-07, + "loss": 0.69898283, + "num_input_tokens_seen": 309212680, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12683105, + "step": 14337, + "time_per_iteration": 2.4871768951416016 + }, + { + "auxiliary_loss_clip": 0.01106755, + "auxiliary_loss_mlp": 0.01028528, + "balance_loss_clip": 1.03591871, + "balance_loss_mlp": 1.01701796, + "epoch": 0.8620471967533444, + "flos": 37520293991040.0, + "grad_norm": 1.9794767930581403, + "language_loss": 0.67214882, + "learning_rate": 1.962556758053089e-07, + "loss": 0.69350171, + "num_input_tokens_seen": 309234485, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11517334, + "step": 14338, + "time_per_iteration": 2.5745606422424316 + }, + { + "auxiliary_loss_clip": 0.01111525, + "auxiliary_loss_mlp": 0.01027827, + "balance_loss_clip": 1.03878021, + "balance_loss_mlp": 1.01666331, + "epoch": 0.8621073200060123, + "flos": 19682459493120.0, + "grad_norm": 2.0065412878308937, + "language_loss": 0.61956012, + "learning_rate": 1.9608746138561448e-07, + "loss": 0.64095366, + "num_input_tokens_seen": 309253630, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11169434, + "step": 14339, + "time_per_iteration": 2.4555399417877197 + }, + { + "auxiliary_loss_clip": 0.01106925, + "auxiliary_loss_mlp": 0.0102958, + "balance_loss_clip": 1.03667951, + "balance_loss_mlp": 1.01783776, + "epoch": 0.8621674432586803, + "flos": 14536855549440.0, + "grad_norm": 1.9603239407240962, + "language_loss": 0.62725079, + "learning_rate": 1.9591931537046458e-07, + "loss": 0.64861584, + "num_input_tokens_seen": 309270950, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.11761475, + "step": 14340, + "time_per_iteration": 2.5035805702209473 + }, + { + "auxiliary_loss_clip": 0.01104614, + "auxiliary_loss_mlp": 0.01031212, + "balance_loss_clip": 1.03816628, + "balance_loss_mlp": 1.01861739, + "epoch": 0.8622275665113482, + "flos": 20740100480640.0, + "grad_norm": 1.6249678269104304, + "language_loss": 0.79905713, + "learning_rate": 1.9575123776623493e-07, + "loss": 0.82041538, + "num_input_tokens_seen": 309288780, + "router_z_loss_clip": 0.66503906, + "router_z_loss_mlp": 0.12591553, + "step": 14341, + "time_per_iteration": 2.4338409900665283 + }, + { + "auxiliary_loss_clip": 0.01114527, + "auxiliary_loss_mlp": 0.01025219, + "balance_loss_clip": 1.04434466, + "balance_loss_mlp": 1.01452589, + "epoch": 0.8622876897640163, + "flos": 24715914197760.0, + "grad_norm": 3.385380958254199, + "language_loss": 0.74948394, + "learning_rate": 1.9558322857929887e-07, + "loss": 0.77088147, + "num_input_tokens_seen": 309310875, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.10687256, + "step": 14342, + "time_per_iteration": 2.5587286949157715 + }, + { + "auxiliary_loss_clip": 0.0111191, + "auxiliary_loss_mlp": 0.01029254, + "balance_loss_clip": 1.04008269, + "balance_loss_mlp": 1.01613498, + "epoch": 0.8623478130166842, + "flos": 17457362663040.0, + "grad_norm": 3.0625066764595803, + "language_loss": 0.68515003, + "learning_rate": 1.95415287816028e-07, + "loss": 0.70656168, + "num_input_tokens_seen": 309329900, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.13122559, + "step": 14343, + "time_per_iteration": 2.4724700450897217 + }, + { + "auxiliary_loss_clip": 0.01110152, + "auxiliary_loss_mlp": 0.01045609, + "balance_loss_clip": 1.03728962, + "balance_loss_mlp": 1.0311842, + "epoch": 0.8624079362693522, + "flos": 18109176814080.0, + "grad_norm": 1.729778550730863, + "language_loss": 0.679196, + "learning_rate": 1.9524741548278967e-07, + "loss": 0.70075357, + "num_input_tokens_seen": 309347870, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.144104, + "step": 14344, + "time_per_iteration": 2.4875388145446777 + }, + { + "auxiliary_loss_clip": 0.01118249, + "auxiliary_loss_mlp": 0.01036314, + "balance_loss_clip": 1.043715, + "balance_loss_mlp": 1.02405298, + "epoch": 0.8624680595220201, + "flos": 30666455971200.0, + "grad_norm": 1.3959683762378445, + "language_loss": 0.81566322, + "learning_rate": 1.9507961158595054e-07, + "loss": 0.83720887, + "num_input_tokens_seen": 309371695, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.12255859, + "step": 14345, + "time_per_iteration": 2.5391204357147217 + }, + { + "auxiliary_loss_clip": 0.01123137, + "auxiliary_loss_mlp": 0.01029425, + "balance_loss_clip": 1.04781175, + "balance_loss_mlp": 1.01724792, + "epoch": 0.8625281827746881, + "flos": 37998588516480.0, + "grad_norm": 2.333336962542392, + "language_loss": 0.50784814, + "learning_rate": 1.9491187613187355e-07, + "loss": 0.52937376, + "num_input_tokens_seen": 309394645, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.12176514, + "step": 14346, + "time_per_iteration": 2.5627105236053467 + }, + { + "auxiliary_loss_clip": 0.01119352, + "auxiliary_loss_mlp": 0.01030017, + "balance_loss_clip": 1.04694414, + "balance_loss_mlp": 1.01801836, + "epoch": 0.862588306027356, + "flos": 26249730808320.0, + "grad_norm": 1.4958161026816872, + "language_loss": 0.75125688, + "learning_rate": 1.9474420912691913e-07, + "loss": 0.7727505, + "num_input_tokens_seen": 309413170, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11993408, + "step": 14347, + "time_per_iteration": 2.4915592670440674 + }, + { + "auxiliary_loss_clip": 0.01116261, + "auxiliary_loss_mlp": 0.01027261, + "balance_loss_clip": 1.04447699, + "balance_loss_mlp": 1.0149231, + "epoch": 0.862648429280024, + "flos": 25878809013120.0, + "grad_norm": 1.9346882845564466, + "language_loss": 0.81127453, + "learning_rate": 1.945766105774449e-07, + "loss": 0.83270979, + "num_input_tokens_seen": 309431315, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.12347412, + "step": 14348, + "time_per_iteration": 3.911118507385254 + }, + { + "auxiliary_loss_clip": 0.0110573, + "auxiliary_loss_mlp": 0.01026679, + "balance_loss_clip": 1.03747964, + "balance_loss_mlp": 1.01602221, + "epoch": 0.862708552532692, + "flos": 37816413713280.0, + "grad_norm": 1.7831087129881351, + "language_loss": 0.66641641, + "learning_rate": 1.9440908048980665e-07, + "loss": 0.6877405, + "num_input_tokens_seen": 309453020, + "router_z_loss_clip": 0.68310547, + "router_z_loss_mlp": 0.10650635, + "step": 14349, + "time_per_iteration": 2.582301616668701 + }, + { + "auxiliary_loss_clip": 0.01113922, + "auxiliary_loss_mlp": 0.01035177, + "balance_loss_clip": 1.04144692, + "balance_loss_mlp": 1.02269602, + "epoch": 0.86276867578536, + "flos": 19091800247040.0, + "grad_norm": 17.956330515477127, + "language_loss": 0.69413102, + "learning_rate": 1.942416188703573e-07, + "loss": 0.71562207, + "num_input_tokens_seen": 309469780, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.12493896, + "step": 14350, + "time_per_iteration": 2.5017683506011963 + }, + { + "auxiliary_loss_clip": 0.01118277, + "auxiliary_loss_mlp": 0.01029269, + "balance_loss_clip": 1.04518914, + "balance_loss_mlp": 1.01786685, + "epoch": 0.862828799038028, + "flos": 22164281804160.0, + "grad_norm": 1.7755032133692032, + "language_loss": 0.77208972, + "learning_rate": 1.9407422572544618e-07, + "loss": 0.79356515, + "num_input_tokens_seen": 309489610, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11395264, + "step": 14351, + "time_per_iteration": 2.4655613899230957 + }, + { + "auxiliary_loss_clip": 0.01117614, + "auxiliary_loss_mlp": 0.01031721, + "balance_loss_clip": 1.04380488, + "balance_loss_mlp": 1.02028883, + "epoch": 0.8628889222906959, + "flos": 23145576433920.0, + "grad_norm": 1.9463329023354314, + "language_loss": 0.84633625, + "learning_rate": 1.9390690106142204e-07, + "loss": 0.86782956, + "num_input_tokens_seen": 309508295, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11431885, + "step": 14352, + "time_per_iteration": 2.611985445022583 + }, + { + "auxiliary_loss_clip": 0.01047714, + "auxiliary_loss_mlp": 0.01001494, + "balance_loss_clip": 1.02296448, + "balance_loss_mlp": 1.00019264, + "epoch": 0.8629490455433639, + "flos": 57817762151040.0, + "grad_norm": 0.7856724601692184, + "language_loss": 0.61873835, + "learning_rate": 1.9373964488462913e-07, + "loss": 0.63923049, + "num_input_tokens_seen": 309567960, + "router_z_loss_clip": 0.24731445, + "router_z_loss_mlp": 0.01301575, + "step": 14353, + "time_per_iteration": 3.1375224590301514 + }, + { + "auxiliary_loss_clip": 0.0111516, + "auxiliary_loss_mlp": 0.01030118, + "balance_loss_clip": 1.04264593, + "balance_loss_mlp": 1.01882851, + "epoch": 0.8630091687960318, + "flos": 15919667383680.0, + "grad_norm": 3.7983755640643704, + "language_loss": 0.81878614, + "learning_rate": 1.9357245720140948e-07, + "loss": 0.84023887, + "num_input_tokens_seen": 309586050, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11291504, + "step": 14354, + "time_per_iteration": 2.4986534118652344 + }, + { + "auxiliary_loss_clip": 0.0111818, + "auxiliary_loss_mlp": 0.01030408, + "balance_loss_clip": 1.04491222, + "balance_loss_mlp": 1.01826048, + "epoch": 0.8630692920486999, + "flos": 17961691570560.0, + "grad_norm": 1.949100832250633, + "language_loss": 0.85835022, + "learning_rate": 1.934053380181031e-07, + "loss": 0.87983608, + "num_input_tokens_seen": 309602910, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.121521, + "step": 14355, + "time_per_iteration": 2.4250152111053467 + }, + { + "auxiliary_loss_clip": 0.01119025, + "auxiliary_loss_mlp": 0.01027153, + "balance_loss_clip": 1.04625332, + "balance_loss_mlp": 1.01511288, + "epoch": 0.8631294153013678, + "flos": 22455158140800.0, + "grad_norm": 1.9434458237103474, + "language_loss": 0.5888716, + "learning_rate": 1.9323828734104763e-07, + "loss": 0.61033338, + "num_input_tokens_seen": 309621175, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.12042236, + "step": 14356, + "time_per_iteration": 2.5181593894958496 + }, + { + "auxiliary_loss_clip": 0.01116951, + "auxiliary_loss_mlp": 0.01035568, + "balance_loss_clip": 1.0393424, + "balance_loss_mlp": 1.0224905, + "epoch": 0.8631895385540358, + "flos": 16837005847680.0, + "grad_norm": 1.909444985856539, + "language_loss": 0.77195793, + "learning_rate": 1.9307130517657756e-07, + "loss": 0.79348314, + "num_input_tokens_seen": 309639395, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.13079834, + "step": 14357, + "time_per_iteration": 2.45355224609375 + }, + { + "auxiliary_loss_clip": 0.0111943, + "auxiliary_loss_mlp": 0.01031967, + "balance_loss_clip": 1.0462141, + "balance_loss_mlp": 1.02000475, + "epoch": 0.8632496618067037, + "flos": 18697214367360.0, + "grad_norm": 2.8735701393222737, + "language_loss": 0.77817774, + "learning_rate": 1.9290439153102468e-07, + "loss": 0.79969168, + "num_input_tokens_seen": 309657265, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11968994, + "step": 14358, + "time_per_iteration": 2.564815044403076 + }, + { + "auxiliary_loss_clip": 0.01122144, + "auxiliary_loss_mlp": 0.01026337, + "balance_loss_clip": 1.04667997, + "balance_loss_mlp": 1.0144937, + "epoch": 0.8633097850593717, + "flos": 24279922915200.0, + "grad_norm": 1.4254853541134818, + "language_loss": 0.75188941, + "learning_rate": 1.9273754641071816e-07, + "loss": 0.7733742, + "num_input_tokens_seen": 309678610, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.11853027, + "step": 14359, + "time_per_iteration": 2.4916887283325195 + }, + { + "auxiliary_loss_clip": 0.011097, + "auxiliary_loss_mlp": 0.01029793, + "balance_loss_clip": 1.04136682, + "balance_loss_mlp": 1.01841402, + "epoch": 0.8633699083120396, + "flos": 21178569801600.0, + "grad_norm": 1.8853347571195929, + "language_loss": 0.70730013, + "learning_rate": 1.9257076982198517e-07, + "loss": 0.72869503, + "num_input_tokens_seen": 309697710, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 0.1138916, + "step": 14360, + "time_per_iteration": 2.4818613529205322 + }, + { + "auxiliary_loss_clip": 0.01115188, + "auxiliary_loss_mlp": 0.01031323, + "balance_loss_clip": 1.0419383, + "balance_loss_mlp": 1.01884174, + "epoch": 0.8634300315647077, + "flos": 19244888012160.0, + "grad_norm": 1.8160153620473363, + "language_loss": 0.76315439, + "learning_rate": 1.9240406177114953e-07, + "loss": 0.78461957, + "num_input_tokens_seen": 309715985, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.12481689, + "step": 14361, + "time_per_iteration": 2.4214632511138916 + }, + { + "auxiliary_loss_clip": 0.01042083, + "auxiliary_loss_mlp": 0.01007406, + "balance_loss_clip": 1.01722467, + "balance_loss_mlp": 1.00591159, + "epoch": 0.8634901548173756, + "flos": 66195648282240.0, + "grad_norm": 0.961921374875494, + "language_loss": 0.58808213, + "learning_rate": 1.922374222645329e-07, + "loss": 0.60857707, + "num_input_tokens_seen": 309779930, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.01493835, + "step": 14362, + "time_per_iteration": 3.0826644897460938 + }, + { + "auxiliary_loss_clip": 0.01122148, + "auxiliary_loss_mlp": 0.01032363, + "balance_loss_clip": 1.04698372, + "balance_loss_mlp": 1.01922643, + "epoch": 0.8635502780700436, + "flos": 24789531121920.0, + "grad_norm": 1.6003337093492818, + "language_loss": 0.80470824, + "learning_rate": 1.9207085130845524e-07, + "loss": 0.82625341, + "num_input_tokens_seen": 309800580, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.13146973, + "step": 14363, + "time_per_iteration": 2.4740209579467773 + }, + { + "auxiliary_loss_clip": 0.01117206, + "auxiliary_loss_mlp": 0.01034831, + "balance_loss_clip": 1.04378438, + "balance_loss_mlp": 1.02273762, + "epoch": 0.8636104013227116, + "flos": 25189970918400.0, + "grad_norm": 2.9406233146337426, + "language_loss": 0.72356832, + "learning_rate": 1.9190434890923112e-07, + "loss": 0.7450887, + "num_input_tokens_seen": 309821725, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.12115479, + "step": 14364, + "time_per_iteration": 3.9454705715179443 + }, + { + "auxiliary_loss_clip": 0.01120092, + "auxiliary_loss_mlp": 0.01032218, + "balance_loss_clip": 1.04465532, + "balance_loss_mlp": 1.02130401, + "epoch": 0.8636705245753795, + "flos": 23878441624320.0, + "grad_norm": 1.5651990053422595, + "language_loss": 0.71537995, + "learning_rate": 1.917379150731755e-07, + "loss": 0.73690307, + "num_input_tokens_seen": 309841565, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.10913086, + "step": 14365, + "time_per_iteration": 2.570734739303589 + }, + { + "auxiliary_loss_clip": 0.01116978, + "auxiliary_loss_mlp": 0.01034808, + "balance_loss_clip": 1.04278708, + "balance_loss_mlp": 1.02253532, + "epoch": 0.8637306478280475, + "flos": 23110455911040.0, + "grad_norm": 2.144180103995408, + "language_loss": 0.71081614, + "learning_rate": 1.915715498065993e-07, + "loss": 0.73233396, + "num_input_tokens_seen": 309858635, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.12280273, + "step": 14366, + "time_per_iteration": 2.4256203174591064 + }, + { + "auxiliary_loss_clip": 0.01114902, + "auxiliary_loss_mlp": 0.01024236, + "balance_loss_clip": 1.04455078, + "balance_loss_mlp": 1.01367366, + "epoch": 0.8637907710807154, + "flos": 21906802137600.0, + "grad_norm": 1.5831854613495715, + "language_loss": 0.81729591, + "learning_rate": 1.9140525311581146e-07, + "loss": 0.8386873, + "num_input_tokens_seen": 309877885, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.10565186, + "step": 14367, + "time_per_iteration": 2.475430488586426 + }, + { + "auxiliary_loss_clip": 0.0111815, + "auxiliary_loss_mlp": 0.01030758, + "balance_loss_clip": 1.04357398, + "balance_loss_mlp": 1.01824713, + "epoch": 0.8638508943333835, + "flos": 23580526222080.0, + "grad_norm": 2.3536734646517505, + "language_loss": 0.61821425, + "learning_rate": 1.9123902500711743e-07, + "loss": 0.63970327, + "num_input_tokens_seen": 309893140, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.12518311, + "step": 14368, + "time_per_iteration": 4.006869792938232 + }, + { + "auxiliary_loss_clip": 0.01122073, + "auxiliary_loss_mlp": 0.01029479, + "balance_loss_clip": 1.04954922, + "balance_loss_mlp": 1.01795197, + "epoch": 0.8639110175860514, + "flos": 25775853655680.0, + "grad_norm": 2.5419207575226075, + "language_loss": 0.76115763, + "learning_rate": 1.91072865486821e-07, + "loss": 0.78267318, + "num_input_tokens_seen": 309914175, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11535645, + "step": 14369, + "time_per_iteration": 2.4944634437561035 + }, + { + "auxiliary_loss_clip": 0.01117439, + "auxiliary_loss_mlp": 0.01033425, + "balance_loss_clip": 1.04238129, + "balance_loss_mlp": 1.02059257, + "epoch": 0.8639711408387194, + "flos": 23369443948800.0, + "grad_norm": 5.126561085927019, + "language_loss": 0.64576769, + "learning_rate": 1.9090677456122294e-07, + "loss": 0.66727632, + "num_input_tokens_seen": 309932395, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.12841797, + "step": 14370, + "time_per_iteration": 2.5085816383361816 + }, + { + "auxiliary_loss_clip": 0.01124271, + "auxiliary_loss_mlp": 0.01027929, + "balance_loss_clip": 1.05202699, + "balance_loss_mlp": 1.01650918, + "epoch": 0.8640312640913873, + "flos": 22127221946880.0, + "grad_norm": 2.039945381897894, + "language_loss": 0.66575044, + "learning_rate": 1.907407522366209e-07, + "loss": 0.68727243, + "num_input_tokens_seen": 309951720, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11413574, + "step": 14371, + "time_per_iteration": 2.467906951904297 + }, + { + "auxiliary_loss_clip": 0.01047543, + "auxiliary_loss_mlp": 0.01005002, + "balance_loss_clip": 1.02179217, + "balance_loss_mlp": 1.00360477, + "epoch": 0.8640913873440553, + "flos": 57571735944960.0, + "grad_norm": 0.858807251938975, + "language_loss": 0.569107, + "learning_rate": 1.905747985193107e-07, + "loss": 0.58963239, + "num_input_tokens_seen": 310006120, + "router_z_loss_clip": 0.25732422, + "router_z_loss_mlp": 0.01397705, + "step": 14372, + "time_per_iteration": 2.9550657272338867 + }, + { + "auxiliary_loss_clip": 0.01117525, + "auxiliary_loss_mlp": 0.01030278, + "balance_loss_clip": 1.04866791, + "balance_loss_mlp": 1.01836324, + "epoch": 0.8641515105967232, + "flos": 23987430466560.0, + "grad_norm": 1.8660788442677512, + "language_loss": 0.79324573, + "learning_rate": 1.9040891341558597e-07, + "loss": 0.81472385, + "num_input_tokens_seen": 310026740, + "router_z_loss_clip": 0.68798828, + "router_z_loss_mlp": 0.11907959, + "step": 14373, + "time_per_iteration": 2.463916063308716 + }, + { + "auxiliary_loss_clip": 0.01119595, + "auxiliary_loss_mlp": 0.01028016, + "balance_loss_clip": 1.04773974, + "balance_loss_mlp": 1.01641095, + "epoch": 0.8642116338493913, + "flos": 19062749122560.0, + "grad_norm": 1.822224102485695, + "language_loss": 0.63802683, + "learning_rate": 1.9024309693173656e-07, + "loss": 0.65950298, + "num_input_tokens_seen": 310044135, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.1159668, + "step": 14374, + "time_per_iteration": 2.488830804824829 + }, + { + "auxiliary_loss_clip": 0.01118213, + "auxiliary_loss_mlp": 0.01037922, + "balance_loss_clip": 1.04248977, + "balance_loss_mlp": 1.02591789, + "epoch": 0.8642717571020592, + "flos": 18254148105600.0, + "grad_norm": 1.7091503031886743, + "language_loss": 0.77056062, + "learning_rate": 1.9007734907404993e-07, + "loss": 0.79212195, + "num_input_tokens_seen": 310061560, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12005615, + "step": 14375, + "time_per_iteration": 2.4080512523651123 + }, + { + "auxiliary_loss_clip": 0.01111591, + "auxiliary_loss_mlp": 0.01030016, + "balance_loss_clip": 1.03900862, + "balance_loss_mlp": 1.01822603, + "epoch": 0.8643318803547272, + "flos": 57663270777600.0, + "grad_norm": 1.8091120587278067, + "language_loss": 0.60779369, + "learning_rate": 1.899116698488117e-07, + "loss": 0.62920976, + "num_input_tokens_seen": 310087310, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11804199, + "step": 14376, + "time_per_iteration": 4.3192431926727295 + }, + { + "auxiliary_loss_clip": 0.01107057, + "auxiliary_loss_mlp": 0.01034161, + "balance_loss_clip": 1.03643095, + "balance_loss_mlp": 1.02303886, + "epoch": 0.8643920036073952, + "flos": 19609524927360.0, + "grad_norm": 1.6010717260146592, + "language_loss": 0.66388679, + "learning_rate": 1.8974605926230457e-07, + "loss": 0.68529892, + "num_input_tokens_seen": 310106260, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.11126709, + "step": 14377, + "time_per_iteration": 2.4440531730651855 + }, + { + "auxiliary_loss_clip": 0.01117684, + "auxiliary_loss_mlp": 0.01030138, + "balance_loss_clip": 1.04186201, + "balance_loss_mlp": 1.01815772, + "epoch": 0.8644521268600631, + "flos": 20850346298880.0, + "grad_norm": 1.558877999406299, + "language_loss": 0.70422667, + "learning_rate": 1.8958051732080804e-07, + "loss": 0.72570491, + "num_input_tokens_seen": 310125305, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.11981201, + "step": 14378, + "time_per_iteration": 2.4576752185821533 + }, + { + "auxiliary_loss_clip": 0.01033978, + "auxiliary_loss_mlp": 0.01000412, + "balance_loss_clip": 1.00965691, + "balance_loss_mlp": 0.99913371, + "epoch": 0.8645122501127311, + "flos": 66719550101760.0, + "grad_norm": 0.810210867588384, + "language_loss": 0.60318089, + "learning_rate": 1.894150440305995e-07, + "loss": 0.62352479, + "num_input_tokens_seen": 310189270, + "router_z_loss_clip": 0.24291992, + "router_z_loss_mlp": 0.01278687, + "step": 14379, + "time_per_iteration": 3.099693775177002 + }, + { + "auxiliary_loss_clip": 0.01112081, + "auxiliary_loss_mlp": 0.01033413, + "balance_loss_clip": 1.03909373, + "balance_loss_mlp": 1.0215162, + "epoch": 0.864572373365399, + "flos": 21690009601920.0, + "grad_norm": 1.6853374004692383, + "language_loss": 0.74579257, + "learning_rate": 1.8924963939795478e-07, + "loss": 0.76724756, + "num_input_tokens_seen": 310208395, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11901855, + "step": 14380, + "time_per_iteration": 2.4727869033813477 + }, + { + "auxiliary_loss_clip": 0.01109945, + "auxiliary_loss_mlp": 0.01027507, + "balance_loss_clip": 1.03607798, + "balance_loss_mlp": 1.01552045, + "epoch": 0.8646324966180671, + "flos": 20266402896000.0, + "grad_norm": 2.502307290020392, + "language_loss": 0.75168526, + "learning_rate": 1.8908430342914473e-07, + "loss": 0.77305979, + "num_input_tokens_seen": 310227415, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11993408, + "step": 14381, + "time_per_iteration": 2.5277209281921387 + }, + { + "auxiliary_loss_clip": 0.01108817, + "auxiliary_loss_mlp": 0.01031595, + "balance_loss_clip": 1.03938699, + "balance_loss_mlp": 1.02090847, + "epoch": 0.864692619870735, + "flos": 11946188050560.0, + "grad_norm": 3.1162919373137967, + "language_loss": 0.84551692, + "learning_rate": 1.8891903613043892e-07, + "loss": 0.86692101, + "num_input_tokens_seen": 310242625, + "router_z_loss_clip": 0.69482422, + "router_z_loss_mlp": 0.10687256, + "step": 14382, + "time_per_iteration": 2.4448487758636475 + }, + { + "auxiliary_loss_clip": 0.01121358, + "auxiliary_loss_mlp": 0.01029003, + "balance_loss_clip": 1.04714024, + "balance_loss_mlp": 1.0166409, + "epoch": 0.864752743123403, + "flos": 21470703114240.0, + "grad_norm": 2.3834312825832056, + "language_loss": 0.7583034, + "learning_rate": 1.8875383750810504e-07, + "loss": 0.77980697, + "num_input_tokens_seen": 310260585, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.12365723, + "step": 14383, + "time_per_iteration": 2.477480173110962 + }, + { + "auxiliary_loss_clip": 0.0111258, + "auxiliary_loss_mlp": 0.01030054, + "balance_loss_clip": 1.04344702, + "balance_loss_mlp": 1.01833534, + "epoch": 0.8648128663760709, + "flos": 19530018172800.0, + "grad_norm": 1.7615308314592701, + "language_loss": 0.85355949, + "learning_rate": 1.8858870756840738e-07, + "loss": 0.87498593, + "num_input_tokens_seen": 310277210, + "router_z_loss_clip": 0.69189453, + "router_z_loss_mlp": 0.1171875, + "step": 14384, + "time_per_iteration": 2.444221019744873 + }, + { + "auxiliary_loss_clip": 0.0110477, + "auxiliary_loss_mlp": 0.01030753, + "balance_loss_clip": 1.03530765, + "balance_loss_mlp": 1.01954687, + "epoch": 0.8648729896287389, + "flos": 21287953693440.0, + "grad_norm": 1.6873400793816966, + "language_loss": 0.80805898, + "learning_rate": 1.884236463176072e-07, + "loss": 0.82941425, + "num_input_tokens_seen": 310296610, + "router_z_loss_clip": 0.69433594, + "router_z_loss_mlp": 0.11199951, + "step": 14385, + "time_per_iteration": 2.5117905139923096 + }, + { + "auxiliary_loss_clip": 0.01114355, + "auxiliary_loss_mlp": 0.0103667, + "balance_loss_clip": 1.04107583, + "balance_loss_mlp": 1.02280021, + "epoch": 0.8649331128814068, + "flos": 24604483230720.0, + "grad_norm": 2.0428142944513286, + "language_loss": 0.72605586, + "learning_rate": 1.8825865376196437e-07, + "loss": 0.7475661, + "num_input_tokens_seen": 310316830, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.13867188, + "step": 14386, + "time_per_iteration": 2.466122627258301 + }, + { + "auxiliary_loss_clip": 0.01116324, + "auxiliary_loss_mlp": 0.01032385, + "balance_loss_clip": 1.04338408, + "balance_loss_mlp": 1.0206964, + "epoch": 0.8649932361340749, + "flos": 15377811742080.0, + "grad_norm": 1.8279033244303908, + "language_loss": 0.81751823, + "learning_rate": 1.8809372990773476e-07, + "loss": 0.83900523, + "num_input_tokens_seen": 310334355, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11682129, + "step": 14387, + "time_per_iteration": 2.409862518310547 + }, + { + "auxiliary_loss_clip": 0.01113462, + "auxiliary_loss_mlp": 0.01027622, + "balance_loss_clip": 1.04250717, + "balance_loss_mlp": 1.01623189, + "epoch": 0.8650533593867428, + "flos": 19901227276800.0, + "grad_norm": 1.8320015344764322, + "language_loss": 0.68408746, + "learning_rate": 1.8792887476117224e-07, + "loss": 0.70549828, + "num_input_tokens_seen": 310352900, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.11401367, + "step": 14388, + "time_per_iteration": 2.468008041381836 + }, + { + "auxiliary_loss_clip": 0.01113081, + "auxiliary_loss_mlp": 0.01033186, + "balance_loss_clip": 1.04442453, + "balance_loss_mlp": 1.02288055, + "epoch": 0.8651134826394108, + "flos": 25626931868160.0, + "grad_norm": 1.580706002974487, + "language_loss": 0.90456986, + "learning_rate": 1.877640883285283e-07, + "loss": 0.92603248, + "num_input_tokens_seen": 310372855, + "router_z_loss_clip": 0.68701172, + "router_z_loss_mlp": 0.10308838, + "step": 14389, + "time_per_iteration": 2.58866810798645 + }, + { + "auxiliary_loss_clip": 0.01113772, + "auxiliary_loss_mlp": 0.0102933, + "balance_loss_clip": 1.04330742, + "balance_loss_mlp": 1.01824999, + "epoch": 0.8651736058920788, + "flos": 18734525619840.0, + "grad_norm": 1.7791158830092164, + "language_loss": 0.70867878, + "learning_rate": 1.8759937061605212e-07, + "loss": 0.73010981, + "num_input_tokens_seen": 310391595, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.11083984, + "step": 14390, + "time_per_iteration": 2.4304072856903076 + }, + { + "auxiliary_loss_clip": 0.01121543, + "auxiliary_loss_mlp": 0.01039635, + "balance_loss_clip": 1.04340267, + "balance_loss_mlp": 1.02726114, + "epoch": 0.8652337291447467, + "flos": 20776765288320.0, + "grad_norm": 2.098255942315957, + "language_loss": 0.82706547, + "learning_rate": 1.8743472162998941e-07, + "loss": 0.84867728, + "num_input_tokens_seen": 310410090, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.12365723, + "step": 14391, + "time_per_iteration": 3.8844692707061768 + }, + { + "auxiliary_loss_clip": 0.01038789, + "auxiliary_loss_mlp": 0.01001938, + "balance_loss_clip": 1.01373923, + "balance_loss_mlp": 1.00049806, + "epoch": 0.8652938523974147, + "flos": 64227887464320.0, + "grad_norm": 0.817158892446686, + "language_loss": 0.68036664, + "learning_rate": 1.8727014137658337e-07, + "loss": 0.70077384, + "num_input_tokens_seen": 310470055, + "router_z_loss_clip": 0.25048828, + "router_z_loss_mlp": 0.01438904, + "step": 14392, + "time_per_iteration": 2.9606611728668213 + }, + { + "auxiliary_loss_clip": 0.01119474, + "auxiliary_loss_mlp": 0.01034671, + "balance_loss_clip": 1.0437187, + "balance_loss_mlp": 1.02187991, + "epoch": 0.8653539756500827, + "flos": 18040587793920.0, + "grad_norm": 1.9334600780411304, + "language_loss": 0.75642705, + "learning_rate": 1.8710562986207523e-07, + "loss": 0.77796853, + "num_input_tokens_seen": 310487665, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.12792969, + "step": 14393, + "time_per_iteration": 2.42742657661438 + }, + { + "auxiliary_loss_clip": 0.01110943, + "auxiliary_loss_mlp": 0.01031919, + "balance_loss_clip": 1.03662336, + "balance_loss_mlp": 1.01976526, + "epoch": 0.8654140989027507, + "flos": 17382416935680.0, + "grad_norm": 1.939177220192268, + "language_loss": 0.7402395, + "learning_rate": 1.8694118709270357e-07, + "loss": 0.76166821, + "num_input_tokens_seen": 310506130, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.121521, + "step": 14394, + "time_per_iteration": 2.4482433795928955 + }, + { + "auxiliary_loss_clip": 0.0111485, + "auxiliary_loss_mlp": 0.01033934, + "balance_loss_clip": 1.03986764, + "balance_loss_mlp": 1.02142298, + "epoch": 0.8654742221554186, + "flos": 53284862448000.0, + "grad_norm": 1.8220065241126755, + "language_loss": 0.65342224, + "learning_rate": 1.867768130747036e-07, + "loss": 0.67491007, + "num_input_tokens_seen": 310532445, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12512207, + "step": 14395, + "time_per_iteration": 2.753408670425415 + }, + { + "auxiliary_loss_clip": 0.01115932, + "auxiliary_loss_mlp": 0.01034516, + "balance_loss_clip": 1.04376435, + "balance_loss_mlp": 1.02285171, + "epoch": 0.8655343454080866, + "flos": 23914711382400.0, + "grad_norm": 1.863628576509986, + "language_loss": 0.6784246, + "learning_rate": 1.8661250781430838e-07, + "loss": 0.699929, + "num_input_tokens_seen": 310552300, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11669922, + "step": 14396, + "time_per_iteration": 2.4514315128326416 + }, + { + "auxiliary_loss_clip": 0.01123156, + "auxiliary_loss_mlp": 0.01033763, + "balance_loss_clip": 1.04904914, + "balance_loss_mlp": 1.02166295, + "epoch": 0.8655944686607545, + "flos": 24097209408000.0, + "grad_norm": 2.4110992966631564, + "language_loss": 0.69275647, + "learning_rate": 1.8644827131774954e-07, + "loss": 0.71432567, + "num_input_tokens_seen": 310572710, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.12097168, + "step": 14397, + "time_per_iteration": 2.4736838340759277 + }, + { + "auxiliary_loss_clip": 0.01109115, + "auxiliary_loss_mlp": 0.01028266, + "balance_loss_clip": 1.03791606, + "balance_loss_mlp": 1.01717913, + "epoch": 0.8656545919134225, + "flos": 23112718467840.0, + "grad_norm": 6.051946664801431, + "language_loss": 0.63650727, + "learning_rate": 1.86284103591253e-07, + "loss": 0.65788102, + "num_input_tokens_seen": 310592460, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.11083984, + "step": 14398, + "time_per_iteration": 2.4746172428131104 + }, + { + "auxiliary_loss_clip": 0.01115467, + "auxiliary_loss_mlp": 0.01031473, + "balance_loss_clip": 1.04112697, + "balance_loss_mlp": 1.0196178, + "epoch": 0.8657147151660904, + "flos": 21141761339520.0, + "grad_norm": 2.618755170072776, + "language_loss": 0.76626378, + "learning_rate": 1.8612000464104517e-07, + "loss": 0.78773326, + "num_input_tokens_seen": 310609375, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11853027, + "step": 14399, + "time_per_iteration": 2.5265235900878906 + }, + { + "auxiliary_loss_clip": 0.01116132, + "auxiliary_loss_mlp": 0.01025083, + "balance_loss_clip": 1.04512227, + "balance_loss_mlp": 1.01447344, + "epoch": 0.8657748384187585, + "flos": 16289439943680.0, + "grad_norm": 1.9722383545513207, + "language_loss": 0.9330461, + "learning_rate": 1.8595597447334855e-07, + "loss": 0.95445824, + "num_input_tokens_seen": 310627405, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.1060791, + "step": 14400, + "time_per_iteration": 2.4845380783081055 + }, + { + "auxiliary_loss_clip": 0.01118173, + "auxiliary_loss_mlp": 0.01034598, + "balance_loss_clip": 1.04322338, + "balance_loss_mlp": 1.02279615, + "epoch": 0.8658349616714264, + "flos": 30843890179200.0, + "grad_norm": 2.0229956255703554, + "language_loss": 0.67627734, + "learning_rate": 1.8579201309438353e-07, + "loss": 0.69780505, + "num_input_tokens_seen": 310649945, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11804199, + "step": 14401, + "time_per_iteration": 2.63299822807312 + }, + { + "auxiliary_loss_clip": 0.01119055, + "auxiliary_loss_mlp": 0.01030514, + "balance_loss_clip": 1.04512179, + "balance_loss_mlp": 1.01849806, + "epoch": 0.8658950849240944, + "flos": 18952862440320.0, + "grad_norm": 2.0272336149731274, + "language_loss": 0.73251402, + "learning_rate": 1.8562812051036714e-07, + "loss": 0.75400972, + "num_input_tokens_seen": 310668285, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.12011719, + "step": 14402, + "time_per_iteration": 2.427384614944458 + }, + { + "auxiliary_loss_clip": 0.01117345, + "auxiliary_loss_mlp": 0.01030948, + "balance_loss_clip": 1.04538298, + "balance_loss_mlp": 1.0198977, + "epoch": 0.8659552081767624, + "flos": 23364344217600.0, + "grad_norm": 1.6296264820582487, + "language_loss": 0.75149196, + "learning_rate": 1.8546429672751397e-07, + "loss": 0.77297485, + "num_input_tokens_seen": 310687015, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.11053467, + "step": 14403, + "time_per_iteration": 2.460468053817749 + }, + { + "auxiliary_loss_clip": 0.01117704, + "auxiliary_loss_mlp": 0.01030101, + "balance_loss_clip": 1.0438931, + "balance_loss_mlp": 1.01756048, + "epoch": 0.8660153314294303, + "flos": 23841992298240.0, + "grad_norm": 1.6876116711409417, + "language_loss": 0.73154122, + "learning_rate": 1.853005417520368e-07, + "loss": 0.75301921, + "num_input_tokens_seen": 310707580, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.12536621, + "step": 14404, + "time_per_iteration": 2.4728896617889404 + }, + { + "auxiliary_loss_clip": 0.01118194, + "auxiliary_loss_mlp": 0.01033738, + "balance_loss_clip": 1.0474633, + "balance_loss_mlp": 1.02178144, + "epoch": 0.8660754546820983, + "flos": 23112467072640.0, + "grad_norm": 2.139903884485034, + "language_loss": 0.7066952, + "learning_rate": 1.851368555901447e-07, + "loss": 0.7282145, + "num_input_tokens_seen": 310727300, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.11962891, + "step": 14405, + "time_per_iteration": 2.480802059173584 + }, + { + "auxiliary_loss_clip": 0.01118183, + "auxiliary_loss_mlp": 0.01029796, + "balance_loss_clip": 1.04227567, + "balance_loss_mlp": 1.01786911, + "epoch": 0.8661355779347663, + "flos": 14391991998720.0, + "grad_norm": 5.982453589910874, + "language_loss": 0.66600013, + "learning_rate": 1.8497323824804467e-07, + "loss": 0.68747997, + "num_input_tokens_seen": 310744935, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.1192627, + "step": 14406, + "time_per_iteration": 2.383486270904541 + }, + { + "auxiliary_loss_clip": 0.01108559, + "auxiliary_loss_mlp": 0.01023172, + "balance_loss_clip": 1.03742981, + "balance_loss_mlp": 1.01238966, + "epoch": 0.8661957011874343, + "flos": 21870137329920.0, + "grad_norm": 1.725284563625856, + "language_loss": 0.82871699, + "learning_rate": 1.8480968973194177e-07, + "loss": 0.8500343, + "num_input_tokens_seen": 310765085, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.10778809, + "step": 14407, + "time_per_iteration": 3.924062490463257 + }, + { + "auxiliary_loss_clip": 0.01119924, + "auxiliary_loss_mlp": 0.01031989, + "balance_loss_clip": 1.04886305, + "balance_loss_mlp": 1.02074766, + "epoch": 0.8662558244401022, + "flos": 21835160461440.0, + "grad_norm": 9.373695266184782, + "language_loss": 0.69827855, + "learning_rate": 1.8464621004803748e-07, + "loss": 0.71979767, + "num_input_tokens_seen": 310783260, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.11242676, + "step": 14408, + "time_per_iteration": 2.4615213871002197 + }, + { + "auxiliary_loss_clip": 0.01109271, + "auxiliary_loss_mlp": 0.01027491, + "balance_loss_clip": 1.04112601, + "balance_loss_mlp": 1.01677966, + "epoch": 0.8663159476927702, + "flos": 17384104874880.0, + "grad_norm": 1.9185230296008478, + "language_loss": 0.77114397, + "learning_rate": 1.844827992025304e-07, + "loss": 0.79251158, + "num_input_tokens_seen": 310801970, + "router_z_loss_clip": 0.68115234, + "router_z_loss_mlp": 0.1071167, + "step": 14409, + "time_per_iteration": 2.40470290184021 + }, + { + "auxiliary_loss_clip": 0.01112303, + "auxiliary_loss_mlp": 0.01033885, + "balance_loss_clip": 1.03890991, + "balance_loss_mlp": 1.0200572, + "epoch": 0.8663760709454381, + "flos": 22747722416640.0, + "grad_norm": 1.7923466626942355, + "language_loss": 0.77380347, + "learning_rate": 1.8431945720161757e-07, + "loss": 0.79526532, + "num_input_tokens_seen": 310822070, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.13824463, + "step": 14410, + "time_per_iteration": 3.8616747856140137 + }, + { + "auxiliary_loss_clip": 0.01109611, + "auxiliary_loss_mlp": 0.01029852, + "balance_loss_clip": 1.0379144, + "balance_loss_mlp": 1.01815748, + "epoch": 0.8664361941981061, + "flos": 17376850327680.0, + "grad_norm": 1.8527455301794213, + "language_loss": 0.77916563, + "learning_rate": 1.8415618405149315e-07, + "loss": 0.80056024, + "num_input_tokens_seen": 310838355, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11700439, + "step": 14411, + "time_per_iteration": 2.455700397491455 + }, + { + "auxiliary_loss_clip": 0.01109365, + "auxiliary_loss_mlp": 0.01030466, + "balance_loss_clip": 1.03835511, + "balance_loss_mlp": 1.01961803, + "epoch": 0.866496317450774, + "flos": 16034438315520.0, + "grad_norm": 1.7853394538919078, + "language_loss": 0.73599243, + "learning_rate": 1.8399297975834794e-07, + "loss": 0.7573908, + "num_input_tokens_seen": 310856055, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.10845947, + "step": 14412, + "time_per_iteration": 2.4791617393493652 + }, + { + "auxiliary_loss_clip": 0.01110083, + "auxiliary_loss_mlp": 0.01027084, + "balance_loss_clip": 1.04141128, + "balance_loss_mlp": 1.01667738, + "epoch": 0.8665564407034421, + "flos": 20814830726400.0, + "grad_norm": 1.6209440167834985, + "language_loss": 0.69823253, + "learning_rate": 1.83829844328371e-07, + "loss": 0.71960413, + "num_input_tokens_seen": 310876695, + "router_z_loss_clip": 0.68652344, + "router_z_loss_mlp": 0.10412598, + "step": 14413, + "time_per_iteration": 2.5231149196624756 + }, + { + "auxiliary_loss_clip": 0.01111203, + "auxiliary_loss_mlp": 0.01029951, + "balance_loss_clip": 1.03937161, + "balance_loss_mlp": 1.01783347, + "epoch": 0.86661656395611, + "flos": 15815167741440.0, + "grad_norm": 2.3333103301059714, + "language_loss": 0.63319147, + "learning_rate": 1.8366677776774874e-07, + "loss": 0.65460294, + "num_input_tokens_seen": 310893880, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.12115479, + "step": 14414, + "time_per_iteration": 2.4166269302368164 + }, + { + "auxiliary_loss_clip": 0.01113583, + "auxiliary_loss_mlp": 0.01035041, + "balance_loss_clip": 1.04106867, + "balance_loss_mlp": 1.0217433, + "epoch": 0.866676687208778, + "flos": 23036910814080.0, + "grad_norm": 1.7342846094467175, + "language_loss": 0.63674057, + "learning_rate": 1.8350378008266377e-07, + "loss": 0.65822685, + "num_input_tokens_seen": 310914145, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.13305664, + "step": 14415, + "time_per_iteration": 2.45611310005188 + }, + { + "auxiliary_loss_clip": 0.01044543, + "auxiliary_loss_mlp": 0.0100225, + "balance_loss_clip": 1.02004433, + "balance_loss_mlp": 1.0010612, + "epoch": 0.866736810461446, + "flos": 63802275212160.0, + "grad_norm": 0.80102447358561, + "language_loss": 0.60401297, + "learning_rate": 1.8334085127929754e-07, + "loss": 0.62448096, + "num_input_tokens_seen": 310972825, + "router_z_loss_clip": 0.24462891, + "router_z_loss_mlp": 0.01187134, + "step": 14416, + "time_per_iteration": 3.1299655437469482 + }, + { + "auxiliary_loss_clip": 0.01122176, + "auxiliary_loss_mlp": 0.0103458, + "balance_loss_clip": 1.0466578, + "balance_loss_mlp": 1.02197981, + "epoch": 0.8667969337141139, + "flos": 20449367798400.0, + "grad_norm": 1.986465330282044, + "language_loss": 0.74551439, + "learning_rate": 1.831779913638285e-07, + "loss": 0.76708198, + "num_input_tokens_seen": 310992050, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.1260376, + "step": 14417, + "time_per_iteration": 2.454557180404663 + }, + { + "auxiliary_loss_clip": 0.01113127, + "auxiliary_loss_mlp": 0.01035313, + "balance_loss_clip": 1.04057848, + "balance_loss_mlp": 1.02339208, + "epoch": 0.866857056966782, + "flos": 21653703930240.0, + "grad_norm": 1.5014187168569415, + "language_loss": 0.75119901, + "learning_rate": 1.830152003424319e-07, + "loss": 0.77268338, + "num_input_tokens_seen": 311011105, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.1192627, + "step": 14418, + "time_per_iteration": 2.443559169769287 + }, + { + "auxiliary_loss_clip": 0.01107628, + "auxiliary_loss_mlp": 0.01031829, + "balance_loss_clip": 1.03616726, + "balance_loss_mlp": 1.01859069, + "epoch": 0.8669171802194499, + "flos": 22852832590080.0, + "grad_norm": 2.329031739296952, + "language_loss": 0.68128276, + "learning_rate": 1.8285247822128126e-07, + "loss": 0.70267737, + "num_input_tokens_seen": 311032080, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.13256836, + "step": 14419, + "time_per_iteration": 3.931699752807617 + }, + { + "auxiliary_loss_clip": 0.01115174, + "auxiliary_loss_mlp": 0.01043191, + "balance_loss_clip": 1.04030776, + "balance_loss_mlp": 1.02958882, + "epoch": 0.8669773034721179, + "flos": 18734166483840.0, + "grad_norm": 1.7698883826693372, + "language_loss": 0.78824306, + "learning_rate": 1.826898250065465e-07, + "loss": 0.80982667, + "num_input_tokens_seen": 311049735, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.13604736, + "step": 14420, + "time_per_iteration": 2.398237705230713 + }, + { + "auxiliary_loss_clip": 0.01115496, + "auxiliary_loss_mlp": 0.01023618, + "balance_loss_clip": 1.0438242, + "balance_loss_mlp": 1.01260328, + "epoch": 0.8670374267247858, + "flos": 18916018064640.0, + "grad_norm": 1.5661027576248951, + "language_loss": 0.83864141, + "learning_rate": 1.8252724070439586e-07, + "loss": 0.86003256, + "num_input_tokens_seen": 311067675, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11016846, + "step": 14421, + "time_per_iteration": 2.436399459838867 + }, + { + "auxiliary_loss_clip": 0.0103784, + "auxiliary_loss_mlp": 0.01001145, + "balance_loss_clip": 1.01318932, + "balance_loss_mlp": 0.99979508, + "epoch": 0.8670975499774538, + "flos": 48814527214080.0, + "grad_norm": 0.705445657286265, + "language_loss": 0.491285, + "learning_rate": 1.823647253209941e-07, + "loss": 0.51167482, + "num_input_tokens_seen": 311126605, + "router_z_loss_clip": 0.24658203, + "router_z_loss_mlp": 0.01350403, + "step": 14422, + "time_per_iteration": 3.0416066646575928 + }, + { + "auxiliary_loss_clip": 0.01118238, + "auxiliary_loss_mlp": 0.01023483, + "balance_loss_clip": 1.04550338, + "balance_loss_mlp": 1.01298666, + "epoch": 0.8671576732301217, + "flos": 26136145025280.0, + "grad_norm": 3.2658911636857186, + "language_loss": 0.73318821, + "learning_rate": 1.8220227886250417e-07, + "loss": 0.75460541, + "num_input_tokens_seen": 311147325, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.10498047, + "step": 14423, + "time_per_iteration": 2.5265016555786133 + }, + { + "auxiliary_loss_clip": 0.01108562, + "auxiliary_loss_mlp": 0.01022098, + "balance_loss_clip": 1.04116678, + "balance_loss_mlp": 1.01198876, + "epoch": 0.8672177964827897, + "flos": 18367446579840.0, + "grad_norm": 1.802411089025129, + "language_loss": 0.77160627, + "learning_rate": 1.8203990133508684e-07, + "loss": 0.79291284, + "num_input_tokens_seen": 311165385, + "router_z_loss_clip": 0.67431641, + "router_z_loss_mlp": 0.10113525, + "step": 14424, + "time_per_iteration": 2.4274685382843018 + }, + { + "auxiliary_loss_clip": 0.01111165, + "auxiliary_loss_mlp": 0.0103419, + "balance_loss_clip": 1.043993, + "balance_loss_mlp": 1.02377677, + "epoch": 0.8672779197354576, + "flos": 28545355992960.0, + "grad_norm": 2.0031334880055756, + "language_loss": 0.71470106, + "learning_rate": 1.8187759274489767e-07, + "loss": 0.73615468, + "num_input_tokens_seen": 311185860, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.10406494, + "step": 14425, + "time_per_iteration": 2.6188642978668213 + }, + { + "auxiliary_loss_clip": 0.0111914, + "auxiliary_loss_mlp": 0.01029771, + "balance_loss_clip": 1.04575729, + "balance_loss_mlp": 1.01762938, + "epoch": 0.8673380429881257, + "flos": 22382474970240.0, + "grad_norm": 1.7980251310876132, + "language_loss": 0.67787015, + "learning_rate": 1.817153530980926e-07, + "loss": 0.6993593, + "num_input_tokens_seen": 311205810, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.121521, + "step": 14426, + "time_per_iteration": 2.453615665435791 + }, + { + "auxiliary_loss_clip": 0.0112219, + "auxiliary_loss_mlp": 0.01027883, + "balance_loss_clip": 1.04854703, + "balance_loss_mlp": 1.01550317, + "epoch": 0.8673981662407936, + "flos": 20996430912000.0, + "grad_norm": 1.6988313106189936, + "language_loss": 0.70417047, + "learning_rate": 1.815531824008234e-07, + "loss": 0.72567117, + "num_input_tokens_seen": 311226080, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.1237793, + "step": 14427, + "time_per_iteration": 2.4645800590515137 + }, + { + "auxiliary_loss_clip": 0.01112694, + "auxiliary_loss_mlp": 0.01027993, + "balance_loss_clip": 1.04212403, + "balance_loss_mlp": 1.01679313, + "epoch": 0.8674582894934616, + "flos": 24426797627520.0, + "grad_norm": 1.6507926173208576, + "language_loss": 0.68312734, + "learning_rate": 1.8139108065924004e-07, + "loss": 0.70453423, + "num_input_tokens_seen": 311246380, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.11206055, + "step": 14428, + "time_per_iteration": 2.4406473636627197 + }, + { + "auxiliary_loss_clip": 0.01119937, + "auxiliary_loss_mlp": 0.01027844, + "balance_loss_clip": 1.04895544, + "balance_loss_mlp": 1.01691866, + "epoch": 0.8675184127461296, + "flos": 20737514701440.0, + "grad_norm": 1.7857801113393872, + "language_loss": 0.70686728, + "learning_rate": 1.812290478794889e-07, + "loss": 0.7283451, + "num_input_tokens_seen": 311266465, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.10931396, + "step": 14429, + "time_per_iteration": 2.4389140605926514 + }, + { + "auxiliary_loss_clip": 0.01112423, + "auxiliary_loss_mlp": 0.01024363, + "balance_loss_clip": 1.04159987, + "balance_loss_mlp": 1.01248348, + "epoch": 0.8675785359987975, + "flos": 19135647774720.0, + "grad_norm": 2.3832554463815017, + "language_loss": 0.66947937, + "learning_rate": 1.810670840677151e-07, + "loss": 0.69084728, + "num_input_tokens_seen": 311285075, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.11877441, + "step": 14430, + "time_per_iteration": 2.393188238143921 + }, + { + "auxiliary_loss_clip": 0.01118322, + "auxiliary_loss_mlp": 0.01040223, + "balance_loss_clip": 1.04196167, + "balance_loss_mlp": 1.02689505, + "epoch": 0.8676386592514655, + "flos": 22710662559360.0, + "grad_norm": 2.216856730261745, + "language_loss": 0.69334584, + "learning_rate": 1.8090518923005948e-07, + "loss": 0.71493125, + "num_input_tokens_seen": 311303230, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.13317871, + "step": 14431, + "time_per_iteration": 2.495588779449463 + }, + { + "auxiliary_loss_clip": 0.01116749, + "auxiliary_loss_mlp": 0.01037968, + "balance_loss_clip": 1.04167414, + "balance_loss_mlp": 1.02562428, + "epoch": 0.8676987825041335, + "flos": 14209853109120.0, + "grad_norm": 2.1325478133131672, + "language_loss": 0.63212037, + "learning_rate": 1.8074336337266116e-07, + "loss": 0.65366751, + "num_input_tokens_seen": 311318070, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12347412, + "step": 14432, + "time_per_iteration": 2.3956644535064697 + }, + { + "auxiliary_loss_clip": 0.0113189, + "auxiliary_loss_mlp": 0.01034234, + "balance_loss_clip": 1.05749583, + "balance_loss_mlp": 1.02346945, + "epoch": 0.8677589057568015, + "flos": 13589927256960.0, + "grad_norm": 1.9843446561330829, + "language_loss": 0.78532493, + "learning_rate": 1.8058160650165656e-07, + "loss": 0.80698609, + "num_input_tokens_seen": 311334885, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.10772705, + "step": 14433, + "time_per_iteration": 2.4871981143951416 + }, + { + "auxiliary_loss_clip": 0.01041092, + "auxiliary_loss_mlp": 0.0100399, + "balance_loss_clip": 1.01636696, + "balance_loss_mlp": 1.00261045, + "epoch": 0.8678190290094694, + "flos": 68933657370240.0, + "grad_norm": 0.7023452706100979, + "language_loss": 0.58444947, + "learning_rate": 1.804199186231805e-07, + "loss": 0.6049003, + "num_input_tokens_seen": 311399780, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.01379395, + "step": 14434, + "time_per_iteration": 4.629953384399414 + }, + { + "auxiliary_loss_clip": 0.01115621, + "auxiliary_loss_mlp": 0.01030872, + "balance_loss_clip": 1.04597831, + "balance_loss_mlp": 1.01988649, + "epoch": 0.8678791522621374, + "flos": 32557726776960.0, + "grad_norm": 1.7157158405474722, + "language_loss": 0.80076844, + "learning_rate": 1.802582997433628e-07, + "loss": 0.82223344, + "num_input_tokens_seen": 311419610, + "router_z_loss_clip": 0.69677734, + "router_z_loss_mlp": 0.10986328, + "step": 14435, + "time_per_iteration": 2.5202488899230957 + }, + { + "auxiliary_loss_clip": 0.01112125, + "auxiliary_loss_mlp": 0.01026624, + "balance_loss_clip": 1.03950787, + "balance_loss_mlp": 1.01495361, + "epoch": 0.8679392755148053, + "flos": 35042637657600.0, + "grad_norm": 5.87230094429401, + "language_loss": 0.61983681, + "learning_rate": 1.8009674986833322e-07, + "loss": 0.64122427, + "num_input_tokens_seen": 311440045, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11663818, + "step": 14436, + "time_per_iteration": 2.6005728244781494 + }, + { + "auxiliary_loss_clip": 0.01113675, + "auxiliary_loss_mlp": 0.01029815, + "balance_loss_clip": 1.04079962, + "balance_loss_mlp": 1.01720858, + "epoch": 0.8679993987674733, + "flos": 18552494471040.0, + "grad_norm": 7.9280282424504955, + "language_loss": 0.71026266, + "learning_rate": 1.7993526900421706e-07, + "loss": 0.73169756, + "num_input_tokens_seen": 311456660, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.12609863, + "step": 14437, + "time_per_iteration": 2.461003065109253 + }, + { + "auxiliary_loss_clip": 0.01113294, + "auxiliary_loss_mlp": 0.01028629, + "balance_loss_clip": 1.04252243, + "balance_loss_mlp": 1.01698804, + "epoch": 0.8680595220201412, + "flos": 27454390162560.0, + "grad_norm": 2.4554316396749902, + "language_loss": 0.80754507, + "learning_rate": 1.797738571571381e-07, + "loss": 0.82896423, + "num_input_tokens_seen": 311475460, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.11639404, + "step": 14438, + "time_per_iteration": 2.6150221824645996 + }, + { + "auxiliary_loss_clip": 0.01109544, + "auxiliary_loss_mlp": 0.01024728, + "balance_loss_clip": 1.04001915, + "balance_loss_mlp": 1.01319456, + "epoch": 0.8681196452728093, + "flos": 19208797822080.0, + "grad_norm": 2.0107210798074333, + "language_loss": 0.67781955, + "learning_rate": 1.7961251433321656e-07, + "loss": 0.69916224, + "num_input_tokens_seen": 311494575, + "router_z_loss_clip": 0.69433594, + "router_z_loss_mlp": 0.11529541, + "step": 14439, + "time_per_iteration": 2.4698939323425293 + }, + { + "auxiliary_loss_clip": 0.01110833, + "auxiliary_loss_mlp": 0.01035259, + "balance_loss_clip": 1.03955317, + "balance_loss_mlp": 1.02295637, + "epoch": 0.8681797685254772, + "flos": 37560442417920.0, + "grad_norm": 1.569225508806264, + "language_loss": 0.63779753, + "learning_rate": 1.7945124053857085e-07, + "loss": 0.65925848, + "num_input_tokens_seen": 311515805, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.12298584, + "step": 14440, + "time_per_iteration": 2.592484951019287 + }, + { + "auxiliary_loss_clip": 0.01112952, + "auxiliary_loss_mlp": 0.01036915, + "balance_loss_clip": 1.04202902, + "balance_loss_mlp": 1.02328312, + "epoch": 0.8682398917781452, + "flos": 23289937194240.0, + "grad_norm": 1.7981865720639354, + "language_loss": 0.65662658, + "learning_rate": 1.7929003577931722e-07, + "loss": 0.67812526, + "num_input_tokens_seen": 311536000, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.1362915, + "step": 14441, + "time_per_iteration": 2.5064539909362793 + }, + { + "auxiliary_loss_clip": 0.01116158, + "auxiliary_loss_mlp": 0.01030343, + "balance_loss_clip": 1.04480791, + "balance_loss_mlp": 1.01970959, + "epoch": 0.8683000150308132, + "flos": 21872794936320.0, + "grad_norm": 1.6824403393376803, + "language_loss": 0.66603607, + "learning_rate": 1.7912890006156722e-07, + "loss": 0.68750107, + "num_input_tokens_seen": 311556220, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.10632324, + "step": 14442, + "time_per_iteration": 2.5066373348236084 + }, + { + "auxiliary_loss_clip": 0.01121092, + "auxiliary_loss_mlp": 0.01029534, + "balance_loss_clip": 1.04708028, + "balance_loss_mlp": 1.01693916, + "epoch": 0.8683601382834811, + "flos": 14647209108480.0, + "grad_norm": 1.8987404115010684, + "language_loss": 0.72298813, + "learning_rate": 1.7896783339143195e-07, + "loss": 0.74449438, + "num_input_tokens_seen": 311572530, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.12597656, + "step": 14443, + "time_per_iteration": 2.3955605030059814 + }, + { + "auxiliary_loss_clip": 0.01113649, + "auxiliary_loss_mlp": 0.01029803, + "balance_loss_clip": 1.04112434, + "balance_loss_mlp": 1.018049, + "epoch": 0.8684202615361492, + "flos": 26359904799360.0, + "grad_norm": 1.7012124292513617, + "language_loss": 0.83573866, + "learning_rate": 1.7880683577501877e-07, + "loss": 0.85717314, + "num_input_tokens_seen": 311591105, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11755371, + "step": 14444, + "time_per_iteration": 2.5317513942718506 + }, + { + "auxiliary_loss_clip": 0.01117772, + "auxiliary_loss_mlp": 0.01032154, + "balance_loss_clip": 1.04535055, + "balance_loss_mlp": 1.02002442, + "epoch": 0.8684803847888171, + "flos": 20704010290560.0, + "grad_norm": 1.9619105737391385, + "language_loss": 0.77469707, + "learning_rate": 1.7864590721843342e-07, + "loss": 0.79619628, + "num_input_tokens_seen": 311608350, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.12115479, + "step": 14445, + "time_per_iteration": 2.4236319065093994 + }, + { + "auxiliary_loss_clip": 0.01111285, + "auxiliary_loss_mlp": 0.0103861, + "balance_loss_clip": 1.0387032, + "balance_loss_mlp": 1.02516365, + "epoch": 0.8685405080414851, + "flos": 22638123043200.0, + "grad_norm": 3.7462562385211147, + "language_loss": 0.67763901, + "learning_rate": 1.7848504772777728e-07, + "loss": 0.69913793, + "num_input_tokens_seen": 311626380, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.13464355, + "step": 14446, + "time_per_iteration": 2.5044338703155518 + }, + { + "auxiliary_loss_clip": 0.01114019, + "auxiliary_loss_mlp": 0.01029896, + "balance_loss_clip": 1.0408951, + "balance_loss_mlp": 1.01779032, + "epoch": 0.868600631294153, + "flos": 24822065865600.0, + "grad_norm": 1.6756097262205154, + "language_loss": 0.82745105, + "learning_rate": 1.7832425730915102e-07, + "loss": 0.84889019, + "num_input_tokens_seen": 311644345, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.12103271, + "step": 14447, + "time_per_iteration": 2.6259663105010986 + }, + { + "auxiliary_loss_clip": 0.01114876, + "auxiliary_loss_mlp": 0.01026427, + "balance_loss_clip": 1.04277921, + "balance_loss_mlp": 1.01551926, + "epoch": 0.868660754546821, + "flos": 25113983696640.0, + "grad_norm": 11.649241576867393, + "language_loss": 0.73979837, + "learning_rate": 1.781635359686515e-07, + "loss": 0.7612114, + "num_input_tokens_seen": 311663340, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.10913086, + "step": 14448, + "time_per_iteration": 2.5606117248535156 + }, + { + "auxiliary_loss_clip": 0.01115805, + "auxiliary_loss_mlp": 0.01027462, + "balance_loss_clip": 1.04414129, + "balance_loss_mlp": 1.01564837, + "epoch": 0.8687208777994889, + "flos": 12677832178560.0, + "grad_norm": 2.4238415689550368, + "language_loss": 0.80429226, + "learning_rate": 1.7800288371237303e-07, + "loss": 0.82572496, + "num_input_tokens_seen": 311679860, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.1182251, + "step": 14449, + "time_per_iteration": 2.4255592823028564 + }, + { + "auxiliary_loss_clip": 0.01045664, + "auxiliary_loss_mlp": 0.01005679, + "balance_loss_clip": 1.0211525, + "balance_loss_mlp": 1.00432754, + "epoch": 0.8687810010521569, + "flos": 65617235573760.0, + "grad_norm": 0.8029744794336972, + "language_loss": 0.605955, + "learning_rate": 1.7784230054640758e-07, + "loss": 0.62646842, + "num_input_tokens_seen": 311738135, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 0.01351929, + "step": 14450, + "time_per_iteration": 3.00870418548584 + }, + { + "auxiliary_loss_clip": 0.01119568, + "auxiliary_loss_mlp": 0.01035422, + "balance_loss_clip": 1.04337668, + "balance_loss_mlp": 1.02283406, + "epoch": 0.8688411243048249, + "flos": 24244012293120.0, + "grad_norm": 1.6799458440678054, + "language_loss": 0.76225239, + "learning_rate": 1.7768178647684517e-07, + "loss": 0.78380227, + "num_input_tokens_seen": 311756975, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12591553, + "step": 14451, + "time_per_iteration": 2.5573172569274902 + }, + { + "auxiliary_loss_clip": 0.01115932, + "auxiliary_loss_mlp": 0.01028063, + "balance_loss_clip": 1.04416323, + "balance_loss_mlp": 1.01679134, + "epoch": 0.8689012475574929, + "flos": 18221828843520.0, + "grad_norm": 2.399922229389546, + "language_loss": 0.7212559, + "learning_rate": 1.7752134150977205e-07, + "loss": 0.74269581, + "num_input_tokens_seen": 311771830, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11273193, + "step": 14452, + "time_per_iteration": 3.8309407234191895 + }, + { + "auxiliary_loss_clip": 0.01120837, + "auxiliary_loss_mlp": 0.01031626, + "balance_loss_clip": 1.04664922, + "balance_loss_mlp": 1.01912141, + "epoch": 0.8689613708101608, + "flos": 19646728439040.0, + "grad_norm": 1.9484427503528303, + "language_loss": 0.72445273, + "learning_rate": 1.7736096565127201e-07, + "loss": 0.74597734, + "num_input_tokens_seen": 311790130, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.12487793, + "step": 14453, + "time_per_iteration": 2.446096658706665 + }, + { + "auxiliary_loss_clip": 0.0111664, + "auxiliary_loss_mlp": 0.01035096, + "balance_loss_clip": 1.04389882, + "balance_loss_mlp": 1.02300811, + "epoch": 0.8690214940628288, + "flos": 11728749070080.0, + "grad_norm": 2.643439037331028, + "language_loss": 0.73446292, + "learning_rate": 1.7720065890742664e-07, + "loss": 0.75598025, + "num_input_tokens_seen": 311808360, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.12097168, + "step": 14454, + "time_per_iteration": 2.4251697063446045 + }, + { + "auxiliary_loss_clip": 0.01119186, + "auxiliary_loss_mlp": 0.01031715, + "balance_loss_clip": 1.04611254, + "balance_loss_mlp": 1.02041411, + "epoch": 0.8690816173154968, + "flos": 34936450076160.0, + "grad_norm": 2.4978677050105933, + "language_loss": 0.59211004, + "learning_rate": 1.7704042128431552e-07, + "loss": 0.61361897, + "num_input_tokens_seen": 311831325, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11303711, + "step": 14455, + "time_per_iteration": 4.022251605987549 + }, + { + "auxiliary_loss_clip": 0.01120939, + "auxiliary_loss_mlp": 0.01032287, + "balance_loss_clip": 1.04634857, + "balance_loss_mlp": 1.02000237, + "epoch": 0.8691417405681647, + "flos": 11614804151040.0, + "grad_norm": 2.1409609350031844, + "language_loss": 0.7976225, + "learning_rate": 1.7688025278801378e-07, + "loss": 0.81915474, + "num_input_tokens_seen": 311848090, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.1229248, + "step": 14456, + "time_per_iteration": 2.4681613445281982 + }, + { + "auxiliary_loss_clip": 0.01132348, + "auxiliary_loss_mlp": 0.01034749, + "balance_loss_clip": 1.0565033, + "balance_loss_mlp": 1.02148104, + "epoch": 0.8692018638208328, + "flos": 24608038677120.0, + "grad_norm": 2.5023606988084133, + "language_loss": 0.75096369, + "learning_rate": 1.7672015342459568e-07, + "loss": 0.77263469, + "num_input_tokens_seen": 311867855, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.13262939, + "step": 14457, + "time_per_iteration": 2.544715166091919 + }, + { + "auxiliary_loss_clip": 0.01110431, + "auxiliary_loss_mlp": 0.01025344, + "balance_loss_clip": 1.04077673, + "balance_loss_mlp": 1.01438308, + "epoch": 0.8692619870735007, + "flos": 25995124229760.0, + "grad_norm": 1.5633214502912898, + "language_loss": 0.78548497, + "learning_rate": 1.765601232001328e-07, + "loss": 0.80684268, + "num_input_tokens_seen": 311888675, + "router_z_loss_clip": 0.69677734, + "router_z_loss_mlp": 0.10955811, + "step": 14458, + "time_per_iteration": 2.4469449520111084 + }, + { + "auxiliary_loss_clip": 0.01113907, + "auxiliary_loss_mlp": 0.01036804, + "balance_loss_clip": 1.04240525, + "balance_loss_mlp": 1.02435207, + "epoch": 0.8693221103261687, + "flos": 18041808856320.0, + "grad_norm": 1.9505546383160015, + "language_loss": 0.70957518, + "learning_rate": 1.7640016212069187e-07, + "loss": 0.73108226, + "num_input_tokens_seen": 311907310, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.12457275, + "step": 14459, + "time_per_iteration": 2.5517852306365967 + }, + { + "auxiliary_loss_clip": 0.0111893, + "auxiliary_loss_mlp": 0.01029387, + "balance_loss_clip": 1.04974794, + "balance_loss_mlp": 1.01892638, + "epoch": 0.8693822335788366, + "flos": 27492347859840.0, + "grad_norm": 1.6164431545673, + "language_loss": 0.74083734, + "learning_rate": 1.762402701923398e-07, + "loss": 0.76232046, + "num_input_tokens_seen": 311929635, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 0.10455322, + "step": 14460, + "time_per_iteration": 2.507192611694336 + }, + { + "auxiliary_loss_clip": 0.01119194, + "auxiliary_loss_mlp": 0.01034721, + "balance_loss_clip": 1.04501593, + "balance_loss_mlp": 1.02333093, + "epoch": 0.8694423568315046, + "flos": 24097712198400.0, + "grad_norm": 1.8780981155361214, + "language_loss": 0.64787805, + "learning_rate": 1.7608044742113947e-07, + "loss": 0.66941726, + "num_input_tokens_seen": 311948800, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.1138916, + "step": 14461, + "time_per_iteration": 2.4892165660858154 + }, + { + "auxiliary_loss_clip": 0.01104859, + "auxiliary_loss_mlp": 0.0103123, + "balance_loss_clip": 1.03370976, + "balance_loss_mlp": 1.01923203, + "epoch": 0.8695024800841725, + "flos": 18362131367040.0, + "grad_norm": 1.982714148035695, + "language_loss": 0.82644224, + "learning_rate": 1.7592069381315123e-07, + "loss": 0.84780318, + "num_input_tokens_seen": 311964090, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.12005615, + "step": 14462, + "time_per_iteration": 2.406597375869751 + }, + { + "auxiliary_loss_clip": 0.01115184, + "auxiliary_loss_mlp": 0.01042848, + "balance_loss_clip": 1.04025984, + "balance_loss_mlp": 1.02840543, + "epoch": 0.8695626033368405, + "flos": 14027750133120.0, + "grad_norm": 1.7408126821876562, + "language_loss": 0.65722412, + "learning_rate": 1.757610093744335e-07, + "loss": 0.6788044, + "num_input_tokens_seen": 311981460, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.14422607, + "step": 14463, + "time_per_iteration": 2.5102856159210205 + }, + { + "auxiliary_loss_clip": 0.01121768, + "auxiliary_loss_mlp": 0.01034838, + "balance_loss_clip": 1.04543018, + "balance_loss_mlp": 1.02241671, + "epoch": 0.8696227265895085, + "flos": 16836862193280.0, + "grad_norm": 1.9650029382262106, + "language_loss": 0.6647473, + "learning_rate": 1.7560139411104058e-07, + "loss": 0.68631339, + "num_input_tokens_seen": 312000115, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.12414551, + "step": 14464, + "time_per_iteration": 3.996967315673828 + }, + { + "auxiliary_loss_clip": 0.0111159, + "auxiliary_loss_mlp": 0.01039847, + "balance_loss_clip": 1.03752971, + "balance_loss_mlp": 1.02747917, + "epoch": 0.8696828498421765, + "flos": 21799070271360.0, + "grad_norm": 2.8988152343565794, + "language_loss": 0.62370408, + "learning_rate": 1.7544184802902607e-07, + "loss": 0.64521837, + "num_input_tokens_seen": 312020770, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.12371826, + "step": 14465, + "time_per_iteration": 2.5004613399505615 + }, + { + "auxiliary_loss_clip": 0.01102537, + "auxiliary_loss_mlp": 0.01033963, + "balance_loss_clip": 1.03511441, + "balance_loss_mlp": 1.02341914, + "epoch": 0.8697429730948444, + "flos": 22894812610560.0, + "grad_norm": 1.9014294123008826, + "language_loss": 0.84684116, + "learning_rate": 1.7528237113443934e-07, + "loss": 0.86820614, + "num_input_tokens_seen": 312041870, + "router_z_loss_clip": 0.67382812, + "router_z_loss_mlp": 0.10528564, + "step": 14466, + "time_per_iteration": 2.510922908782959 + }, + { + "auxiliary_loss_clip": 0.01113743, + "auxiliary_loss_mlp": 0.01037187, + "balance_loss_clip": 1.03999615, + "balance_loss_mlp": 1.02454484, + "epoch": 0.8698030963475124, + "flos": 24717458482560.0, + "grad_norm": 2.452712850022134, + "language_loss": 0.6185621, + "learning_rate": 1.7512296343332779e-07, + "loss": 0.64007139, + "num_input_tokens_seen": 312058210, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.12640381, + "step": 14467, + "time_per_iteration": 2.4723165035247803 + }, + { + "auxiliary_loss_clip": 0.01109561, + "auxiliary_loss_mlp": 0.01027629, + "balance_loss_clip": 1.04168212, + "balance_loss_mlp": 1.0166738, + "epoch": 0.8698632196001803, + "flos": 28442221067520.0, + "grad_norm": 1.630940455938208, + "language_loss": 0.68882704, + "learning_rate": 1.7496362493173655e-07, + "loss": 0.71019888, + "num_input_tokens_seen": 312082665, + "router_z_loss_clip": 0.67822266, + "router_z_loss_mlp": 0.10943604, + "step": 14468, + "time_per_iteration": 2.575773239135742 + }, + { + "auxiliary_loss_clip": 0.0110626, + "auxiliary_loss_mlp": 0.01032746, + "balance_loss_clip": 1.03532457, + "balance_loss_mlp": 1.02137923, + "epoch": 0.8699233428528483, + "flos": 27636457224960.0, + "grad_norm": 1.794670429696558, + "language_loss": 0.70962834, + "learning_rate": 1.7480435563570773e-07, + "loss": 0.73101842, + "num_input_tokens_seen": 312101960, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.11364746, + "step": 14469, + "time_per_iteration": 2.592071056365967 + }, + { + "auxiliary_loss_clip": 0.01108875, + "auxiliary_loss_mlp": 0.01024533, + "balance_loss_clip": 1.04231632, + "balance_loss_mlp": 1.01422191, + "epoch": 0.8699834661055164, + "flos": 20045659864320.0, + "grad_norm": 4.221692009512017, + "language_loss": 0.84340006, + "learning_rate": 1.7464515555128024e-07, + "loss": 0.86473417, + "num_input_tokens_seen": 312117125, + "router_z_loss_clip": 0.66552734, + "router_z_loss_mlp": 0.10308838, + "step": 14470, + "time_per_iteration": 2.5584542751312256 + }, + { + "auxiliary_loss_clip": 0.0111196, + "auxiliary_loss_mlp": 0.01027962, + "balance_loss_clip": 1.04173064, + "balance_loss_mlp": 1.01673198, + "epoch": 0.8700435893581843, + "flos": 23732787974400.0, + "grad_norm": 1.8279918902296397, + "language_loss": 0.73077929, + "learning_rate": 1.7448602468449148e-07, + "loss": 0.75217855, + "num_input_tokens_seen": 312135775, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.11224365, + "step": 14471, + "time_per_iteration": 2.5291051864624023 + }, + { + "auxiliary_loss_clip": 0.01114463, + "auxiliary_loss_mlp": 0.01027702, + "balance_loss_clip": 1.04342961, + "balance_loss_mlp": 1.01551294, + "epoch": 0.8701037126108523, + "flos": 23548422441600.0, + "grad_norm": 1.510319170810566, + "language_loss": 0.79061753, + "learning_rate": 1.7432696304137573e-07, + "loss": 0.8120392, + "num_input_tokens_seen": 312156070, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.12188721, + "step": 14472, + "time_per_iteration": 2.5670924186706543 + }, + { + "auxiliary_loss_clip": 0.01112423, + "auxiliary_loss_mlp": 0.01023686, + "balance_loss_clip": 1.04018259, + "balance_loss_mlp": 1.01202142, + "epoch": 0.8701638358635202, + "flos": 18843442634880.0, + "grad_norm": 2.2999057673412495, + "language_loss": 0.72652495, + "learning_rate": 1.741679706279644e-07, + "loss": 0.747886, + "num_input_tokens_seen": 312174380, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11657715, + "step": 14473, + "time_per_iteration": 2.446249485015869 + }, + { + "auxiliary_loss_clip": 0.01119854, + "auxiliary_loss_mlp": 0.01032155, + "balance_loss_clip": 1.04556835, + "balance_loss_mlp": 1.02040124, + "epoch": 0.8702239591161882, + "flos": 27928339142400.0, + "grad_norm": 1.766042553079093, + "language_loss": 0.72702569, + "learning_rate": 1.7400904745028644e-07, + "loss": 0.74854577, + "num_input_tokens_seen": 312195130, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.11761475, + "step": 14474, + "time_per_iteration": 2.5429723262786865 + }, + { + "auxiliary_loss_clip": 0.01110431, + "auxiliary_loss_mlp": 0.01033496, + "balance_loss_clip": 1.03900409, + "balance_loss_mlp": 1.02102101, + "epoch": 0.8702840823688561, + "flos": 17233997938560.0, + "grad_norm": 1.9322341210698233, + "language_loss": 0.66982186, + "learning_rate": 1.7385019351436925e-07, + "loss": 0.69126111, + "num_input_tokens_seen": 312212300, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.12475586, + "step": 14475, + "time_per_iteration": 2.5000181198120117 + }, + { + "auxiliary_loss_clip": 0.01107826, + "auxiliary_loss_mlp": 0.01027476, + "balance_loss_clip": 1.03563511, + "balance_loss_mlp": 1.01485205, + "epoch": 0.8703442056215241, + "flos": 19427565605760.0, + "grad_norm": 4.211168854123461, + "language_loss": 0.77432472, + "learning_rate": 1.736914088262349e-07, + "loss": 0.79567772, + "num_input_tokens_seen": 312231735, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.12628174, + "step": 14476, + "time_per_iteration": 2.4577982425689697 + }, + { + "auxiliary_loss_clip": 0.01107628, + "auxiliary_loss_mlp": 0.01027499, + "balance_loss_clip": 1.03926098, + "balance_loss_mlp": 1.01687741, + "epoch": 0.8704043288741921, + "flos": 22273845264000.0, + "grad_norm": 1.4933750445020901, + "language_loss": 0.72519791, + "learning_rate": 1.7353269339190525e-07, + "loss": 0.74654925, + "num_input_tokens_seen": 312253060, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 0.10620117, + "step": 14477, + "time_per_iteration": 2.5154693126678467 + }, + { + "auxiliary_loss_clip": 0.01109391, + "auxiliary_loss_mlp": 0.01026602, + "balance_loss_clip": 1.03683746, + "balance_loss_mlp": 1.01506901, + "epoch": 0.8704644521268601, + "flos": 16648725732480.0, + "grad_norm": 1.7655400785064823, + "language_loss": 0.59300029, + "learning_rate": 1.7337404721739946e-07, + "loss": 0.61436021, + "num_input_tokens_seen": 312269460, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11541748, + "step": 14478, + "time_per_iteration": 2.4916579723358154 + }, + { + "auxiliary_loss_clip": 0.01122384, + "auxiliary_loss_mlp": 0.01023317, + "balance_loss_clip": 1.0535655, + "balance_loss_mlp": 1.01381588, + "epoch": 0.870524575379528, + "flos": 24280210224000.0, + "grad_norm": 1.5962879943170027, + "language_loss": 0.71804738, + "learning_rate": 1.732154703087323e-07, + "loss": 0.73950446, + "num_input_tokens_seen": 312289830, + "router_z_loss_clip": 0.68847656, + "router_z_loss_mlp": 0.09503174, + "step": 14479, + "time_per_iteration": 4.038214206695557 + }, + { + "auxiliary_loss_clip": 0.01116544, + "auxiliary_loss_mlp": 0.01030988, + "balance_loss_clip": 1.04575944, + "balance_loss_mlp": 1.0187217, + "epoch": 0.870584698632196, + "flos": 28768684803840.0, + "grad_norm": 1.4563538777723248, + "language_loss": 0.71031332, + "learning_rate": 1.7305696267191805e-07, + "loss": 0.73178864, + "num_input_tokens_seen": 312311320, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.12255859, + "step": 14480, + "time_per_iteration": 2.5987353324890137 + }, + { + "auxiliary_loss_clip": 0.01113107, + "auxiliary_loss_mlp": 0.01031337, + "balance_loss_clip": 1.03994894, + "balance_loss_mlp": 1.02006626, + "epoch": 0.8706448218848639, + "flos": 32449635774720.0, + "grad_norm": 1.6644446470903842, + "language_loss": 0.69710886, + "learning_rate": 1.728985243129666e-07, + "loss": 0.7185533, + "num_input_tokens_seen": 312332095, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11273193, + "step": 14481, + "time_per_iteration": 2.527942657470703 + }, + { + "auxiliary_loss_clip": 0.01106141, + "auxiliary_loss_mlp": 0.01030721, + "balance_loss_clip": 1.03536797, + "balance_loss_mlp": 1.01878834, + "epoch": 0.8707049451375319, + "flos": 22748009725440.0, + "grad_norm": 1.5964061557485785, + "language_loss": 0.76900864, + "learning_rate": 1.7274015523788643e-07, + "loss": 0.79037732, + "num_input_tokens_seen": 312351225, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11932373, + "step": 14482, + "time_per_iteration": 2.502748966217041 + }, + { + "auxiliary_loss_clip": 0.0111194, + "auxiliary_loss_mlp": 0.01030599, + "balance_loss_clip": 1.03947449, + "balance_loss_mlp": 1.01872003, + "epoch": 0.8707650683902, + "flos": 15851976203520.0, + "grad_norm": 2.5359345770206856, + "language_loss": 0.76446593, + "learning_rate": 1.7258185545268234e-07, + "loss": 0.78589135, + "num_input_tokens_seen": 312369730, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11877441, + "step": 14483, + "time_per_iteration": 2.4413740634918213 + }, + { + "auxiliary_loss_clip": 0.01126852, + "auxiliary_loss_mlp": 0.01038389, + "balance_loss_clip": 1.04930568, + "balance_loss_mlp": 1.02497745, + "epoch": 0.8708251916428679, + "flos": 16468131127680.0, + "grad_norm": 1.9718480407603098, + "language_loss": 0.6192497, + "learning_rate": 1.7242362496335749e-07, + "loss": 0.6409021, + "num_input_tokens_seen": 312386780, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.13409424, + "step": 14484, + "time_per_iteration": 2.5026228427886963 + }, + { + "auxiliary_loss_clip": 0.0111483, + "auxiliary_loss_mlp": 0.01035564, + "balance_loss_clip": 1.04268062, + "balance_loss_mlp": 1.02279699, + "epoch": 0.8708853148955359, + "flos": 15377847655680.0, + "grad_norm": 2.495088529407778, + "language_loss": 0.68021518, + "learning_rate": 1.7226546377591222e-07, + "loss": 0.70171916, + "num_input_tokens_seen": 312404875, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.12768555, + "step": 14485, + "time_per_iteration": 2.468823194503784 + }, + { + "auxiliary_loss_clip": 0.01115812, + "auxiliary_loss_mlp": 0.01033445, + "balance_loss_clip": 1.0420773, + "balance_loss_mlp": 1.02125025, + "epoch": 0.8709454381482038, + "flos": 30551325903360.0, + "grad_norm": 2.1676539539402673, + "language_loss": 0.62885368, + "learning_rate": 1.7210737189634373e-07, + "loss": 0.65034622, + "num_input_tokens_seen": 312425280, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.12194824, + "step": 14486, + "time_per_iteration": 2.4924156665802 + }, + { + "auxiliary_loss_clip": 0.01118397, + "auxiliary_loss_mlp": 0.01032497, + "balance_loss_clip": 1.04349279, + "balance_loss_mlp": 1.01971149, + "epoch": 0.8710055614008718, + "flos": 22601422321920.0, + "grad_norm": 2.3987795505240697, + "language_loss": 0.61945045, + "learning_rate": 1.7194934933064653e-07, + "loss": 0.64095932, + "num_input_tokens_seen": 312443835, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12786865, + "step": 14487, + "time_per_iteration": 2.593196392059326 + }, + { + "auxiliary_loss_clip": 0.0110987, + "auxiliary_loss_mlp": 0.01024494, + "balance_loss_clip": 1.04001427, + "balance_loss_mlp": 1.01460576, + "epoch": 0.8710656846535397, + "flos": 18443146492800.0, + "grad_norm": 1.9813043394477037, + "language_loss": 0.67828202, + "learning_rate": 1.7179139608481318e-07, + "loss": 0.69962567, + "num_input_tokens_seen": 312460830, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.09893799, + "step": 14488, + "time_per_iteration": 2.426114797592163 + }, + { + "auxiliary_loss_clip": 0.01115677, + "auxiliary_loss_mlp": 0.01029405, + "balance_loss_clip": 1.04341936, + "balance_loss_mlp": 1.01803279, + "epoch": 0.8711258079062077, + "flos": 16503862181760.0, + "grad_norm": 1.9540869873086186, + "language_loss": 0.85733116, + "learning_rate": 1.716335121648338e-07, + "loss": 0.87878191, + "num_input_tokens_seen": 312477575, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11376953, + "step": 14489, + "time_per_iteration": 2.4290857315063477 + }, + { + "auxiliary_loss_clip": 0.0112415, + "auxiliary_loss_mlp": 0.0103207, + "balance_loss_clip": 1.0473392, + "balance_loss_mlp": 1.01917768, + "epoch": 0.8711859311588757, + "flos": 15663336952320.0, + "grad_norm": 3.0765020053612773, + "language_loss": 0.75422287, + "learning_rate": 1.7147569757669445e-07, + "loss": 0.77578509, + "num_input_tokens_seen": 312492140, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.12896729, + "step": 14490, + "time_per_iteration": 2.394454002380371 + }, + { + "auxiliary_loss_clip": 0.01123562, + "auxiliary_loss_mlp": 0.010299, + "balance_loss_clip": 1.04879737, + "balance_loss_mlp": 1.01759791, + "epoch": 0.8712460544115437, + "flos": 15557544420480.0, + "grad_norm": 2.559369067176042, + "language_loss": 0.76515758, + "learning_rate": 1.7131795232638012e-07, + "loss": 0.78669214, + "num_input_tokens_seen": 312508400, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.1229248, + "step": 14491, + "time_per_iteration": 2.4819133281707764 + }, + { + "auxiliary_loss_clip": 0.01124358, + "auxiliary_loss_mlp": 0.0102462, + "balance_loss_clip": 1.05185068, + "balance_loss_mlp": 1.01297283, + "epoch": 0.8713061776642116, + "flos": 16763568491520.0, + "grad_norm": 1.485738247037222, + "language_loss": 0.67002404, + "learning_rate": 1.711602764198723e-07, + "loss": 0.69151378, + "num_input_tokens_seen": 312525915, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11639404, + "step": 14492, + "time_per_iteration": 2.5241200923919678 + }, + { + "auxiliary_loss_clip": 0.01111456, + "auxiliary_loss_mlp": 0.0102571, + "balance_loss_clip": 1.04167449, + "balance_loss_mlp": 1.01477838, + "epoch": 0.8713663009168796, + "flos": 24279887001600.0, + "grad_norm": 1.8225779030017657, + "language_loss": 0.69550788, + "learning_rate": 1.7100266986314992e-07, + "loss": 0.71687961, + "num_input_tokens_seen": 312544735, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.109375, + "step": 14493, + "time_per_iteration": 2.5521700382232666 + }, + { + "auxiliary_loss_clip": 0.01117474, + "auxiliary_loss_mlp": 0.01033435, + "balance_loss_clip": 1.04550803, + "balance_loss_mlp": 1.02063179, + "epoch": 0.8714264241695475, + "flos": 23795594904960.0, + "grad_norm": 2.842696513380476, + "language_loss": 0.89014161, + "learning_rate": 1.7084513266218936e-07, + "loss": 0.91165072, + "num_input_tokens_seen": 312557910, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.12817383, + "step": 14494, + "time_per_iteration": 3.8912787437438965 + }, + { + "auxiliary_loss_clip": 0.01120242, + "auxiliary_loss_mlp": 0.01031711, + "balance_loss_clip": 1.04814184, + "balance_loss_mlp": 1.02048111, + "epoch": 0.8714865474222155, + "flos": 37997942071680.0, + "grad_norm": 3.3437935003596615, + "language_loss": 0.59193444, + "learning_rate": 1.7068766482296514e-07, + "loss": 0.61345398, + "num_input_tokens_seen": 312580360, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11224365, + "step": 14495, + "time_per_iteration": 2.622354745864868 + }, + { + "auxiliary_loss_clip": 0.01108145, + "auxiliary_loss_mlp": 0.01033056, + "balance_loss_clip": 1.03592181, + "balance_loss_mlp": 1.02082503, + "epoch": 0.8715466706748836, + "flos": 22455696844800.0, + "grad_norm": 1.9204846723660527, + "language_loss": 0.8041364, + "learning_rate": 1.7053026635144762e-07, + "loss": 0.82554841, + "num_input_tokens_seen": 312597550, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.12237549, + "step": 14496, + "time_per_iteration": 2.4526515007019043 + }, + { + "auxiliary_loss_clip": 0.01112567, + "auxiliary_loss_mlp": 0.01032221, + "balance_loss_clip": 1.03915131, + "balance_loss_mlp": 1.01928675, + "epoch": 0.8716067939275515, + "flos": 21215126868480.0, + "grad_norm": 2.6016030719665952, + "language_loss": 0.78989303, + "learning_rate": 1.7037293725360624e-07, + "loss": 0.81134093, + "num_input_tokens_seen": 312616435, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.1293335, + "step": 14497, + "time_per_iteration": 2.446103096008301 + }, + { + "auxiliary_loss_clip": 0.01112769, + "auxiliary_loss_mlp": 0.01028529, + "balance_loss_clip": 1.0386498, + "balance_loss_mlp": 1.01622105, + "epoch": 0.8716669171802195, + "flos": 22997732054400.0, + "grad_norm": 2.1052506486148768, + "language_loss": 0.66952103, + "learning_rate": 1.70215677535406e-07, + "loss": 0.690934, + "num_input_tokens_seen": 312632770, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12304688, + "step": 14498, + "time_per_iteration": 3.8333940505981445 + }, + { + "auxiliary_loss_clip": 0.01109591, + "auxiliary_loss_mlp": 0.01029134, + "balance_loss_clip": 1.03859568, + "balance_loss_mlp": 1.01779115, + "epoch": 0.8717270404328874, + "flos": 29784058462080.0, + "grad_norm": 1.717075211834222, + "language_loss": 0.57366979, + "learning_rate": 1.700584872028108e-07, + "loss": 0.59505701, + "num_input_tokens_seen": 312651900, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.11346436, + "step": 14499, + "time_per_iteration": 2.59812068939209 + }, + { + "auxiliary_loss_clip": 0.01115916, + "auxiliary_loss_mlp": 0.01031931, + "balance_loss_clip": 1.04125631, + "balance_loss_mlp": 1.01949179, + "epoch": 0.8717871636855554, + "flos": 22018125363840.0, + "grad_norm": 2.613575588027992, + "language_loss": 0.79784435, + "learning_rate": 1.6990136626178097e-07, + "loss": 0.81932282, + "num_input_tokens_seen": 312671380, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.12445068, + "step": 14500, + "time_per_iteration": 2.4657063484191895 + }, + { + "auxiliary_loss_clip": 0.01115407, + "auxiliary_loss_mlp": 0.01024521, + "balance_loss_clip": 1.04472625, + "balance_loss_mlp": 1.01314247, + "epoch": 0.8718472869382233, + "flos": 16654256426880.0, + "grad_norm": 1.9415949428668966, + "language_loss": 0.72698438, + "learning_rate": 1.6974431471827466e-07, + "loss": 0.74838376, + "num_input_tokens_seen": 312689215, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.11364746, + "step": 14501, + "time_per_iteration": 2.397779941558838 + }, + { + "auxiliary_loss_clip": 0.01117466, + "auxiliary_loss_mlp": 0.01029258, + "balance_loss_clip": 1.04375553, + "balance_loss_mlp": 1.01680052, + "epoch": 0.8719074101908914, + "flos": 19495328613120.0, + "grad_norm": 1.709441091926352, + "language_loss": 0.64459908, + "learning_rate": 1.695873325782482e-07, + "loss": 0.66606635, + "num_input_tokens_seen": 312706400, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.12463379, + "step": 14502, + "time_per_iteration": 2.442732095718384 + }, + { + "auxiliary_loss_clip": 0.01119873, + "auxiliary_loss_mlp": 0.01031294, + "balance_loss_clip": 1.04403257, + "balance_loss_mlp": 1.0196234, + "epoch": 0.8719675334435593, + "flos": 33070890430080.0, + "grad_norm": 1.7299836141548075, + "language_loss": 0.69103163, + "learning_rate": 1.6943041984765262e-07, + "loss": 0.71254331, + "num_input_tokens_seen": 312727985, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.11676025, + "step": 14503, + "time_per_iteration": 2.5578973293304443 + }, + { + "auxiliary_loss_clip": 0.01108415, + "auxiliary_loss_mlp": 0.01028681, + "balance_loss_clip": 1.03775573, + "balance_loss_mlp": 1.0170517, + "epoch": 0.8720276566962273, + "flos": 13626268842240.0, + "grad_norm": 2.201849902493915, + "language_loss": 0.69909632, + "learning_rate": 1.6927357653243912e-07, + "loss": 0.72046733, + "num_input_tokens_seen": 312745025, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.11621094, + "step": 14504, + "time_per_iteration": 2.4723520278930664 + }, + { + "auxiliary_loss_clip": 0.01110214, + "auxiliary_loss_mlp": 0.01026689, + "balance_loss_clip": 1.03874254, + "balance_loss_mlp": 1.01510811, + "epoch": 0.8720877799488952, + "flos": 23514163845120.0, + "grad_norm": 1.8556682832787244, + "language_loss": 0.70186234, + "learning_rate": 1.691168026385552e-07, + "loss": 0.72323132, + "num_input_tokens_seen": 312764170, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11590576, + "step": 14505, + "time_per_iteration": 2.5150949954986572 + }, + { + "auxiliary_loss_clip": 0.01109231, + "auxiliary_loss_mlp": 0.01027769, + "balance_loss_clip": 1.03918839, + "balance_loss_mlp": 1.01683784, + "epoch": 0.8721479032015632, + "flos": 20814148368000.0, + "grad_norm": 1.7275548707492168, + "language_loss": 0.785128, + "learning_rate": 1.6896009817194545e-07, + "loss": 0.80649805, + "num_input_tokens_seen": 312783830, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.10931396, + "step": 14506, + "time_per_iteration": 2.5692083835601807 + }, + { + "auxiliary_loss_clip": 0.011148, + "auxiliary_loss_mlp": 0.01027549, + "balance_loss_clip": 1.04180729, + "balance_loss_mlp": 1.01609349, + "epoch": 0.8722080264542311, + "flos": 19463655795840.0, + "grad_norm": 5.206763495759889, + "language_loss": 0.74077433, + "learning_rate": 1.6880346313855221e-07, + "loss": 0.76219785, + "num_input_tokens_seen": 312802015, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11468506, + "step": 14507, + "time_per_iteration": 3.9967222213745117 + }, + { + "auxiliary_loss_clip": 0.0111272, + "auxiliary_loss_mlp": 0.01029918, + "balance_loss_clip": 1.03794932, + "balance_loss_mlp": 1.01715684, + "epoch": 0.8722681497068991, + "flos": 21761866759680.0, + "grad_norm": 1.9730234443960972, + "language_loss": 0.72329605, + "learning_rate": 1.686468975443156e-07, + "loss": 0.74472249, + "num_input_tokens_seen": 312820650, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.12774658, + "step": 14508, + "time_per_iteration": 2.4431991577148438 + }, + { + "auxiliary_loss_clip": 0.01114091, + "auxiliary_loss_mlp": 0.01035418, + "balance_loss_clip": 1.03861666, + "balance_loss_mlp": 1.02238226, + "epoch": 0.8723282729595672, + "flos": 28877134942080.0, + "grad_norm": 1.7920688919191923, + "language_loss": 0.68601406, + "learning_rate": 1.6849040139517202e-07, + "loss": 0.7075091, + "num_input_tokens_seen": 312841310, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.13043213, + "step": 14509, + "time_per_iteration": 2.4950180053710938 + }, + { + "auxiliary_loss_clip": 0.01115035, + "auxiliary_loss_mlp": 0.01034723, + "balance_loss_clip": 1.04026127, + "balance_loss_mlp": 1.02314222, + "epoch": 0.8723883962122351, + "flos": 26469145036800.0, + "grad_norm": 13.656949135552813, + "language_loss": 0.58463907, + "learning_rate": 1.683339746970558e-07, + "loss": 0.60613668, + "num_input_tokens_seen": 312862100, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.11572266, + "step": 14510, + "time_per_iteration": 2.5153725147247314 + }, + { + "auxiliary_loss_clip": 0.01124845, + "auxiliary_loss_mlp": 0.01028468, + "balance_loss_clip": 1.04657793, + "balance_loss_mlp": 1.01545608, + "epoch": 0.8724485194649031, + "flos": 20521476351360.0, + "grad_norm": 2.331954671144867, + "language_loss": 0.67443728, + "learning_rate": 1.6817761745589865e-07, + "loss": 0.69597042, + "num_input_tokens_seen": 312880220, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.13000488, + "step": 14511, + "time_per_iteration": 2.455382823944092 + }, + { + "auxiliary_loss_clip": 0.01116784, + "auxiliary_loss_mlp": 0.01033389, + "balance_loss_clip": 1.03897643, + "balance_loss_mlp": 1.02080059, + "epoch": 0.872508642717571, + "flos": 24353360271360.0, + "grad_norm": 1.5990068011696719, + "language_loss": 0.81883776, + "learning_rate": 1.6802132967763027e-07, + "loss": 0.84033942, + "num_input_tokens_seen": 312900765, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.12579346, + "step": 14512, + "time_per_iteration": 2.584547519683838 + }, + { + "auxiliary_loss_clip": 0.01049636, + "auxiliary_loss_mlp": 0.01008539, + "balance_loss_clip": 1.02411544, + "balance_loss_mlp": 1.00714874, + "epoch": 0.872568765970239, + "flos": 61410012485760.0, + "grad_norm": 0.7949898648806378, + "language_loss": 0.58593667, + "learning_rate": 1.6786511136817617e-07, + "loss": 0.60651839, + "num_input_tokens_seen": 312955840, + "router_z_loss_clip": 0.25488281, + "router_z_loss_mlp": 0.01390076, + "step": 14513, + "time_per_iteration": 2.9760918617248535 + }, + { + "auxiliary_loss_clip": 0.01114447, + "auxiliary_loss_mlp": 0.01030245, + "balance_loss_clip": 1.04290211, + "balance_loss_mlp": 1.01802003, + "epoch": 0.8726288892229069, + "flos": 22598046443520.0, + "grad_norm": 1.7250562685470572, + "language_loss": 0.76614594, + "learning_rate": 1.6770896253346112e-07, + "loss": 0.78759289, + "num_input_tokens_seen": 312973565, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.12219238, + "step": 14514, + "time_per_iteration": 2.482844591140747 + }, + { + "auxiliary_loss_clip": 0.01119438, + "auxiliary_loss_mlp": 0.01025621, + "balance_loss_clip": 1.04515815, + "balance_loss_mlp": 1.01447511, + "epoch": 0.872689012475575, + "flos": 25885201633920.0, + "grad_norm": 1.9061751862477243, + "language_loss": 0.65673143, + "learning_rate": 1.675528831794055e-07, + "loss": 0.67818207, + "num_input_tokens_seen": 312994660, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11151123, + "step": 14515, + "time_per_iteration": 2.5244016647338867 + }, + { + "auxiliary_loss_clip": 0.01119122, + "auxiliary_loss_mlp": 0.01031851, + "balance_loss_clip": 1.04595232, + "balance_loss_mlp": 1.01966166, + "epoch": 0.8727491357282429, + "flos": 21506721477120.0, + "grad_norm": 2.0554403558969683, + "language_loss": 0.78911489, + "learning_rate": 1.6739687331192842e-07, + "loss": 0.8106246, + "num_input_tokens_seen": 313009860, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.12194824, + "step": 14516, + "time_per_iteration": 2.4761011600494385 + }, + { + "auxiliary_loss_clip": 0.01116862, + "auxiliary_loss_mlp": 0.01029965, + "balance_loss_clip": 1.04254401, + "balance_loss_mlp": 1.01804471, + "epoch": 0.8728092589809109, + "flos": 19207504932480.0, + "grad_norm": 2.557517485372083, + "language_loss": 0.72134751, + "learning_rate": 1.672409329369453e-07, + "loss": 0.74281573, + "num_input_tokens_seen": 313027025, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.1192627, + "step": 14517, + "time_per_iteration": 2.450584650039673 + }, + { + "auxiliary_loss_clip": 0.01106157, + "auxiliary_loss_mlp": 0.01024518, + "balance_loss_clip": 1.03646803, + "balance_loss_mlp": 1.01368773, + "epoch": 0.8728693822335788, + "flos": 20595308757120.0, + "grad_norm": 4.315987122469369, + "language_loss": 0.72605133, + "learning_rate": 1.6708506206036966e-07, + "loss": 0.74735808, + "num_input_tokens_seen": 313046830, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.1083374, + "step": 14518, + "time_per_iteration": 2.417985200881958 + }, + { + "auxiliary_loss_clip": 0.01111035, + "auxiliary_loss_mlp": 0.01030017, + "balance_loss_clip": 1.04122019, + "balance_loss_mlp": 1.01937759, + "epoch": 0.8729295054862468, + "flos": 21728613744000.0, + "grad_norm": 1.346114683729848, + "language_loss": 0.74335063, + "learning_rate": 1.6692926068811275e-07, + "loss": 0.76476109, + "num_input_tokens_seen": 313067715, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.10644531, + "step": 14519, + "time_per_iteration": 2.48056960105896 + }, + { + "auxiliary_loss_clip": 0.01112373, + "auxiliary_loss_mlp": 0.01032257, + "balance_loss_clip": 1.03770876, + "balance_loss_mlp": 1.01906025, + "epoch": 0.8729896287389147, + "flos": 17673436926720.0, + "grad_norm": 4.614201049776367, + "language_loss": 0.76703882, + "learning_rate": 1.6677352882608142e-07, + "loss": 0.78848511, + "num_input_tokens_seen": 313082305, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.13189697, + "step": 14520, + "time_per_iteration": 2.4808881282806396 + }, + { + "auxiliary_loss_clip": 0.01110641, + "auxiliary_loss_mlp": 0.01037332, + "balance_loss_clip": 1.03770852, + "balance_loss_mlp": 1.02420723, + "epoch": 0.8730497519915827, + "flos": 24571804832640.0, + "grad_norm": 1.994670257940042, + "language_loss": 0.82375145, + "learning_rate": 1.666178664801816e-07, + "loss": 0.84523118, + "num_input_tokens_seen": 313101190, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.13128662, + "step": 14521, + "time_per_iteration": 2.5324182510375977 + }, + { + "auxiliary_loss_clip": 0.01117015, + "auxiliary_loss_mlp": 0.01030978, + "balance_loss_clip": 1.0429728, + "balance_loss_mlp": 1.01830602, + "epoch": 0.8731098752442508, + "flos": 13443734903040.0, + "grad_norm": 2.0843051878978174, + "language_loss": 0.76177824, + "learning_rate": 1.6646227365631616e-07, + "loss": 0.7832582, + "num_input_tokens_seen": 313118965, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12683105, + "step": 14522, + "time_per_iteration": 4.007592439651489 + }, + { + "auxiliary_loss_clip": 0.01106859, + "auxiliary_loss_mlp": 0.01028654, + "balance_loss_clip": 1.03831971, + "balance_loss_mlp": 1.01639915, + "epoch": 0.8731699984969187, + "flos": 23474446381440.0, + "grad_norm": 1.8206403922846273, + "language_loss": 0.75809896, + "learning_rate": 1.66306750360385e-07, + "loss": 0.77945405, + "num_input_tokens_seen": 313139280, + "router_z_loss_clip": 0.68554688, + "router_z_loss_mlp": 0.12261963, + "step": 14523, + "time_per_iteration": 2.495105028152466 + }, + { + "auxiliary_loss_clip": 0.01113878, + "auxiliary_loss_mlp": 0.01025257, + "balance_loss_clip": 1.04331648, + "balance_loss_mlp": 1.01442111, + "epoch": 0.8732301217495867, + "flos": 17712651600000.0, + "grad_norm": 2.272495444786175, + "language_loss": 0.78745639, + "learning_rate": 1.6615129659828542e-07, + "loss": 0.80884778, + "num_input_tokens_seen": 313156655, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.10839844, + "step": 14524, + "time_per_iteration": 2.5288519859313965 + }, + { + "auxiliary_loss_clip": 0.01107471, + "auxiliary_loss_mlp": 0.01025662, + "balance_loss_clip": 1.03854346, + "balance_loss_mlp": 1.01527858, + "epoch": 0.8732902450022546, + "flos": 22054359208320.0, + "grad_norm": 2.12467718602948, + "language_loss": 0.77937889, + "learning_rate": 1.6599591237591272e-07, + "loss": 0.8007102, + "num_input_tokens_seen": 313174050, + "router_z_loss_clip": 0.68994141, + "router_z_loss_mlp": 0.1038208, + "step": 14525, + "time_per_iteration": 2.4766793251037598 + }, + { + "auxiliary_loss_clip": 0.01112952, + "auxiliary_loss_mlp": 0.01032154, + "balance_loss_clip": 1.03834486, + "balance_loss_mlp": 1.02011979, + "epoch": 0.8733503682549226, + "flos": 22272983337600.0, + "grad_norm": 1.6665692454829137, + "language_loss": 0.69609243, + "learning_rate": 1.6584059769915902e-07, + "loss": 0.71754342, + "num_input_tokens_seen": 313192765, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.12017822, + "step": 14526, + "time_per_iteration": 2.4529788494110107 + }, + { + "auxiliary_loss_clip": 0.01118217, + "auxiliary_loss_mlp": 0.01041654, + "balance_loss_clip": 1.04167032, + "balance_loss_mlp": 1.02796292, + "epoch": 0.8734104915075905, + "flos": 23364344217600.0, + "grad_norm": 2.13696347070039, + "language_loss": 0.61172807, + "learning_rate": 1.6568535257391326e-07, + "loss": 0.63332677, + "num_input_tokens_seen": 313210925, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.13696289, + "step": 14527, + "time_per_iteration": 2.487542152404785 + }, + { + "auxiliary_loss_clip": 0.01120479, + "auxiliary_loss_mlp": 0.01032755, + "balance_loss_clip": 1.04301512, + "balance_loss_mlp": 1.01904023, + "epoch": 0.8734706147602586, + "flos": 17712292464000.0, + "grad_norm": 1.955056602154459, + "language_loss": 0.65701973, + "learning_rate": 1.6553017700606265e-07, + "loss": 0.67855203, + "num_input_tokens_seen": 313228250, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.13726807, + "step": 14528, + "time_per_iteration": 2.4167051315307617 + }, + { + "auxiliary_loss_clip": 0.01116684, + "auxiliary_loss_mlp": 0.01028551, + "balance_loss_clip": 1.04515469, + "balance_loss_mlp": 1.0169102, + "epoch": 0.8735307380129265, + "flos": 22049367217920.0, + "grad_norm": 3.0057217540386847, + "language_loss": 0.8951388, + "learning_rate": 1.6537507100149205e-07, + "loss": 0.91659123, + "num_input_tokens_seen": 313247880, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.11633301, + "step": 14529, + "time_per_iteration": 2.4799726009368896 + }, + { + "auxiliary_loss_clip": 0.0110989, + "auxiliary_loss_mlp": 0.01028673, + "balance_loss_clip": 1.0395267, + "balance_loss_mlp": 1.01728868, + "epoch": 0.8735908612655945, + "flos": 25338425829120.0, + "grad_norm": 1.8851481744777931, + "language_loss": 0.84753114, + "learning_rate": 1.6522003456608258e-07, + "loss": 0.86891675, + "num_input_tokens_seen": 313266790, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.11383057, + "step": 14530, + "time_per_iteration": 2.470921754837036 + }, + { + "auxiliary_loss_clip": 0.01117986, + "auxiliary_loss_mlp": 0.01032427, + "balance_loss_clip": 1.046453, + "balance_loss_mlp": 1.02135825, + "epoch": 0.8736509845182624, + "flos": 21540908246400.0, + "grad_norm": 1.7011076629000308, + "language_loss": 0.74516267, + "learning_rate": 1.650650677057128e-07, + "loss": 0.76666677, + "num_input_tokens_seen": 313286805, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.1105957, + "step": 14531, + "time_per_iteration": 2.4820077419281006 + }, + { + "auxiliary_loss_clip": 0.01113697, + "auxiliary_loss_mlp": 0.01029006, + "balance_loss_clip": 1.04394555, + "balance_loss_mlp": 1.01798534, + "epoch": 0.8737111077709304, + "flos": 22017227523840.0, + "grad_norm": 1.7689653876730247, + "language_loss": 0.61516011, + "learning_rate": 1.6491017042625966e-07, + "loss": 0.63658708, + "num_input_tokens_seen": 313305415, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.11029053, + "step": 14532, + "time_per_iteration": 2.4358391761779785 + }, + { + "auxiliary_loss_clip": 0.01034108, + "auxiliary_loss_mlp": 0.00999986, + "balance_loss_clip": 1.00997949, + "balance_loss_mlp": 0.99865061, + "epoch": 0.8737712310235983, + "flos": 70066315912320.0, + "grad_norm": 0.8223474557219344, + "language_loss": 0.586869, + "learning_rate": 1.6475534273359704e-07, + "loss": 0.60720998, + "num_input_tokens_seen": 313369940, + "router_z_loss_clip": 0.24169922, + "router_z_loss_mlp": 0.0133667, + "step": 14533, + "time_per_iteration": 3.1986873149871826 + }, + { + "auxiliary_loss_clip": 0.01106146, + "auxiliary_loss_mlp": 0.01032291, + "balance_loss_clip": 1.03648901, + "balance_loss_mlp": 1.01985729, + "epoch": 0.8738313542762663, + "flos": 28658331244800.0, + "grad_norm": 1.5073721946225875, + "language_loss": 0.76919353, + "learning_rate": 1.646005846335954e-07, + "loss": 0.79057795, + "num_input_tokens_seen": 313390965, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.12432861, + "step": 14534, + "time_per_iteration": 2.5189356803894043 + }, + { + "auxiliary_loss_clip": 0.01109642, + "auxiliary_loss_mlp": 0.01034202, + "balance_loss_clip": 1.03715229, + "balance_loss_mlp": 1.02114272, + "epoch": 0.8738914775289344, + "flos": 22346384780160.0, + "grad_norm": 2.108372928196476, + "language_loss": 0.7518937, + "learning_rate": 1.6444589613212357e-07, + "loss": 0.77333212, + "num_input_tokens_seen": 313409680, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.1305542, + "step": 14535, + "time_per_iteration": 2.4604783058166504 + }, + { + "auxiliary_loss_clip": 0.01107624, + "auxiliary_loss_mlp": 0.01030352, + "balance_loss_clip": 1.03602982, + "balance_loss_mlp": 1.01711941, + "epoch": 0.8739516007816023, + "flos": 31759648444800.0, + "grad_norm": 1.962956760860117, + "language_loss": 0.74278879, + "learning_rate": 1.64291277235048e-07, + "loss": 0.76416862, + "num_input_tokens_seen": 313431335, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.13238525, + "step": 14536, + "time_per_iteration": 2.513385534286499 + }, + { + "auxiliary_loss_clip": 0.0111386, + "auxiliary_loss_mlp": 0.0103187, + "balance_loss_clip": 1.03980947, + "balance_loss_mlp": 1.01962686, + "epoch": 0.8740117240342703, + "flos": 21211715076480.0, + "grad_norm": 1.5523684303963654, + "language_loss": 0.6429317, + "learning_rate": 1.641367279482304e-07, + "loss": 0.66438895, + "num_input_tokens_seen": 313449225, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.12243652, + "step": 14537, + "time_per_iteration": 3.876588821411133 + }, + { + "auxiliary_loss_clip": 0.01115157, + "auxiliary_loss_mlp": 0.01027266, + "balance_loss_clip": 1.04317284, + "balance_loss_mlp": 1.01483834, + "epoch": 0.8740718472869382, + "flos": 25186666867200.0, + "grad_norm": 2.156284019995475, + "language_loss": 0.58190513, + "learning_rate": 1.6398224827753216e-07, + "loss": 0.60332936, + "num_input_tokens_seen": 313467715, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.12420654, + "step": 14538, + "time_per_iteration": 2.573007106781006 + }, + { + "auxiliary_loss_clip": 0.0111898, + "auxiliary_loss_mlp": 0.01029842, + "balance_loss_clip": 1.04902518, + "balance_loss_mlp": 1.01871371, + "epoch": 0.8741319705396062, + "flos": 19500931134720.0, + "grad_norm": 2.162686614504631, + "language_loss": 0.68564034, + "learning_rate": 1.6382783822881142e-07, + "loss": 0.70712858, + "num_input_tokens_seen": 313486805, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.11126709, + "step": 14539, + "time_per_iteration": 2.419658660888672 + }, + { + "auxiliary_loss_clip": 0.01114201, + "auxiliary_loss_mlp": 0.01031898, + "balance_loss_clip": 1.03919089, + "balance_loss_mlp": 1.01976252, + "epoch": 0.8741920937922741, + "flos": 14100900180480.0, + "grad_norm": 2.043308650212973, + "language_loss": 0.74508786, + "learning_rate": 1.6367349780792262e-07, + "loss": 0.76654887, + "num_input_tokens_seen": 313504880, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.12145996, + "step": 14540, + "time_per_iteration": 2.4408626556396484 + }, + { + "auxiliary_loss_clip": 0.01114421, + "auxiliary_loss_mlp": 0.01036854, + "balance_loss_clip": 1.04109931, + "balance_loss_mlp": 1.024629, + "epoch": 0.8742522170449422, + "flos": 27709858667520.0, + "grad_norm": 1.6232817551857481, + "language_loss": 0.79144782, + "learning_rate": 1.635192270207193e-07, + "loss": 0.81296062, + "num_input_tokens_seen": 313524995, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.12225342, + "step": 14541, + "time_per_iteration": 2.4714128971099854 + }, + { + "auxiliary_loss_clip": 0.01122902, + "auxiliary_loss_mlp": 0.01030755, + "balance_loss_clip": 1.04574513, + "balance_loss_mlp": 1.01715362, + "epoch": 0.8743123402976101, + "flos": 21142587352320.0, + "grad_norm": 2.125407382818177, + "language_loss": 0.67059183, + "learning_rate": 1.6336502587305035e-07, + "loss": 0.69212836, + "num_input_tokens_seen": 313541740, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.13604736, + "step": 14542, + "time_per_iteration": 3.8696422576904297 + }, + { + "auxiliary_loss_clip": 0.01044269, + "auxiliary_loss_mlp": 0.01003356, + "balance_loss_clip": 1.0189538, + "balance_loss_mlp": 1.00199354, + "epoch": 0.8743724635502781, + "flos": 60870024351360.0, + "grad_norm": 0.7863051214317377, + "language_loss": 0.54482555, + "learning_rate": 1.632108943707642e-07, + "loss": 0.56530178, + "num_input_tokens_seen": 313593445, + "router_z_loss_clip": 0.25244141, + "router_z_loss_mlp": 0.01361084, + "step": 14543, + "time_per_iteration": 2.891770124435425 + }, + { + "auxiliary_loss_clip": 0.01120083, + "auxiliary_loss_mlp": 0.01031001, + "balance_loss_clip": 1.04659081, + "balance_loss_mlp": 1.01855576, + "epoch": 0.874432586802946, + "flos": 28109292883200.0, + "grad_norm": 1.963927953410882, + "language_loss": 0.69637609, + "learning_rate": 1.6305683251970458e-07, + "loss": 0.71788692, + "num_input_tokens_seen": 313615640, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.12451172, + "step": 14544, + "time_per_iteration": 2.5022928714752197 + }, + { + "auxiliary_loss_clip": 0.01113491, + "auxiliary_loss_mlp": 0.01027906, + "balance_loss_clip": 1.04501319, + "balance_loss_mlp": 1.01681924, + "epoch": 0.874492710055614, + "flos": 23550289948800.0, + "grad_norm": 1.644104886752756, + "language_loss": 0.75684917, + "learning_rate": 1.62902840325714e-07, + "loss": 0.77826309, + "num_input_tokens_seen": 313635550, + "router_z_loss_clip": 0.68457031, + "router_z_loss_mlp": 0.11077881, + "step": 14545, + "time_per_iteration": 2.4873058795928955 + }, + { + "auxiliary_loss_clip": 0.01109945, + "auxiliary_loss_mlp": 0.01038663, + "balance_loss_clip": 1.03649902, + "balance_loss_mlp": 1.02435791, + "epoch": 0.8745528333082819, + "flos": 40915647924480.0, + "grad_norm": 1.596407838274368, + "language_loss": 0.66265321, + "learning_rate": 1.6274891779463217e-07, + "loss": 0.68413931, + "num_input_tokens_seen": 313659275, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.14306641, + "step": 14546, + "time_per_iteration": 2.597538709640503 + }, + { + "auxiliary_loss_clip": 0.0111952, + "auxiliary_loss_mlp": 0.01030795, + "balance_loss_clip": 1.04611433, + "balance_loss_mlp": 1.01838493, + "epoch": 0.87461295656095, + "flos": 23622901292160.0, + "grad_norm": 1.8289065612074398, + "language_loss": 0.72786033, + "learning_rate": 1.6259506493229536e-07, + "loss": 0.74936354, + "num_input_tokens_seen": 313680595, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.12414551, + "step": 14547, + "time_per_iteration": 2.506520986557007 + }, + { + "auxiliary_loss_clip": 0.01124023, + "auxiliary_loss_mlp": 0.01035728, + "balance_loss_clip": 1.04366028, + "balance_loss_mlp": 1.02256143, + "epoch": 0.874673079813618, + "flos": 38794116983040.0, + "grad_norm": 2.882347418142789, + "language_loss": 0.7019347, + "learning_rate": 1.6244128174453752e-07, + "loss": 0.7235322, + "num_input_tokens_seen": 313699730, + "router_z_loss_clip": 0.80273438, + "router_z_loss_mlp": 0.1317749, + "step": 14548, + "time_per_iteration": 2.569744825363159 + }, + { + "auxiliary_loss_clip": 0.0111793, + "auxiliary_loss_mlp": 0.01035385, + "balance_loss_clip": 1.04167116, + "balance_loss_mlp": 1.02279675, + "epoch": 0.8747332030662859, + "flos": 23696159080320.0, + "grad_norm": 2.135907196180904, + "language_loss": 0.7047441, + "learning_rate": 1.6228756823719093e-07, + "loss": 0.72627723, + "num_input_tokens_seen": 313720090, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.12591553, + "step": 14549, + "time_per_iteration": 2.5797581672668457 + }, + { + "auxiliary_loss_clip": 0.01127361, + "auxiliary_loss_mlp": 0.01031431, + "balance_loss_clip": 1.04896212, + "balance_loss_mlp": 1.0178647, + "epoch": 0.8747933263189539, + "flos": 24462456854400.0, + "grad_norm": 2.3604947541172963, + "language_loss": 0.84043372, + "learning_rate": 1.6213392441608352e-07, + "loss": 0.86202163, + "num_input_tokens_seen": 313736795, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.13562012, + "step": 14550, + "time_per_iteration": 2.5226669311523438 + }, + { + "auxiliary_loss_clip": 0.01120888, + "auxiliary_loss_mlp": 0.01034376, + "balance_loss_clip": 1.04627371, + "balance_loss_mlp": 1.02288389, + "epoch": 0.8748534495716218, + "flos": 13809161917440.0, + "grad_norm": 2.1857695330245845, + "language_loss": 0.71498513, + "learning_rate": 1.6198035028704183e-07, + "loss": 0.73653781, + "num_input_tokens_seen": 313754820, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.1149292, + "step": 14551, + "time_per_iteration": 2.433439254760742 + }, + { + "auxiliary_loss_clip": 0.01113151, + "auxiliary_loss_mlp": 0.01034608, + "balance_loss_clip": 1.04067397, + "balance_loss_mlp": 1.02184629, + "epoch": 0.8749135728242898, + "flos": 29862092759040.0, + "grad_norm": 2.7949026451253047, + "language_loss": 0.63900232, + "learning_rate": 1.6182684585588934e-07, + "loss": 0.6604799, + "num_input_tokens_seen": 313775830, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.12756348, + "step": 14552, + "time_per_iteration": 3.9945149421691895 + }, + { + "auxiliary_loss_clip": 0.01116739, + "auxiliary_loss_mlp": 0.01026595, + "balance_loss_clip": 1.04174137, + "balance_loss_mlp": 1.01293957, + "epoch": 0.8749736960769577, + "flos": 24133479166080.0, + "grad_norm": 2.591456429735761, + "language_loss": 0.79437959, + "learning_rate": 1.616734111284479e-07, + "loss": 0.81581295, + "num_input_tokens_seen": 313795745, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.13659668, + "step": 14553, + "time_per_iteration": 2.483694553375244 + }, + { + "auxiliary_loss_clip": 0.01119705, + "auxiliary_loss_mlp": 0.01032203, + "balance_loss_clip": 1.04432774, + "balance_loss_mlp": 1.02049053, + "epoch": 0.8750338193296258, + "flos": 17202540602880.0, + "grad_norm": 2.1566032743512684, + "language_loss": 0.70149374, + "learning_rate": 1.6152004611053416e-07, + "loss": 0.7230128, + "num_input_tokens_seen": 313813895, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.1171875, + "step": 14554, + "time_per_iteration": 2.4048643112182617 + }, + { + "auxiliary_loss_clip": 0.01114115, + "auxiliary_loss_mlp": 0.01026785, + "balance_loss_clip": 1.04089403, + "balance_loss_mlp": 1.01487064, + "epoch": 0.8750939425822937, + "flos": 23733218937600.0, + "grad_norm": 1.6624292812197876, + "language_loss": 0.83560443, + "learning_rate": 1.6136675080796457e-07, + "loss": 0.85701346, + "num_input_tokens_seen": 313834225, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11914062, + "step": 14555, + "time_per_iteration": 2.5155229568481445 + }, + { + "auxiliary_loss_clip": 0.01115756, + "auxiliary_loss_mlp": 0.01034539, + "balance_loss_clip": 1.04069591, + "balance_loss_mlp": 1.02211761, + "epoch": 0.8751540658349617, + "flos": 26541684552960.0, + "grad_norm": 1.542052705687908, + "language_loss": 0.71083015, + "learning_rate": 1.6121352522655252e-07, + "loss": 0.73233312, + "num_input_tokens_seen": 313854430, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.12420654, + "step": 14556, + "time_per_iteration": 2.5741326808929443 + }, + { + "auxiliary_loss_clip": 0.01117649, + "auxiliary_loss_mlp": 0.01029956, + "balance_loss_clip": 1.04272652, + "balance_loss_mlp": 1.01661015, + "epoch": 0.8752141890876296, + "flos": 19386806647680.0, + "grad_norm": 1.8120594837026531, + "language_loss": 0.76942444, + "learning_rate": 1.6106036937210732e-07, + "loss": 0.79090047, + "num_input_tokens_seen": 313871600, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.13336182, + "step": 14557, + "time_per_iteration": 2.4671413898468018 + }, + { + "auxiliary_loss_clip": 0.01118583, + "auxiliary_loss_mlp": 0.01034776, + "balance_loss_clip": 1.04504251, + "balance_loss_mlp": 1.022331, + "epoch": 0.8752743123402976, + "flos": 25374408278400.0, + "grad_norm": 1.9456415649485046, + "language_loss": 0.82859921, + "learning_rate": 1.6090728325043767e-07, + "loss": 0.85013282, + "num_input_tokens_seen": 313891570, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.12457275, + "step": 14558, + "time_per_iteration": 2.4823429584503174 + }, + { + "auxiliary_loss_clip": 0.01041149, + "auxiliary_loss_mlp": 0.01002351, + "balance_loss_clip": 1.01612186, + "balance_loss_mlp": 1.00112128, + "epoch": 0.8753344355929655, + "flos": 59952398578560.0, + "grad_norm": 0.8113791010763532, + "language_loss": 0.56043231, + "learning_rate": 1.6075426686734784e-07, + "loss": 0.58086729, + "num_input_tokens_seen": 313951290, + "router_z_loss_clip": 0.25073242, + "router_z_loss_mlp": 0.01229858, + "step": 14559, + "time_per_iteration": 3.076702117919922 + }, + { + "auxiliary_loss_clip": 0.01115516, + "auxiliary_loss_mlp": 0.01029447, + "balance_loss_clip": 1.04466963, + "balance_loss_mlp": 1.01830077, + "epoch": 0.8753945588456336, + "flos": 17894646835200.0, + "grad_norm": 1.6495460269076947, + "language_loss": 0.66226244, + "learning_rate": 1.606013202286407e-07, + "loss": 0.68371207, + "num_input_tokens_seen": 313968645, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.1114502, + "step": 14560, + "time_per_iteration": 2.442976474761963 + }, + { + "auxiliary_loss_clip": 0.01120283, + "auxiliary_loss_mlp": 0.0102891, + "balance_loss_clip": 1.04861331, + "balance_loss_mlp": 1.01794875, + "epoch": 0.8754546820983016, + "flos": 30914885410560.0, + "grad_norm": 1.8978559757700866, + "language_loss": 0.78827792, + "learning_rate": 1.6044844334011541e-07, + "loss": 0.80976981, + "num_input_tokens_seen": 313987580, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.10961914, + "step": 14561, + "time_per_iteration": 2.6625125408172607 + }, + { + "auxiliary_loss_clip": 0.01116219, + "auxiliary_loss_mlp": 0.01030957, + "balance_loss_clip": 1.04179215, + "balance_loss_mlp": 1.01833868, + "epoch": 0.8755148053509695, + "flos": 20631075724800.0, + "grad_norm": 2.6078905862842143, + "language_loss": 0.77166611, + "learning_rate": 1.6029563620756982e-07, + "loss": 0.79313785, + "num_input_tokens_seen": 314004460, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.12615967, + "step": 14562, + "time_per_iteration": 2.449075698852539 + }, + { + "auxiliary_loss_clip": 0.01113666, + "auxiliary_loss_mlp": 0.01025573, + "balance_loss_clip": 1.04525995, + "balance_loss_mlp": 1.01451659, + "epoch": 0.8755749286036375, + "flos": 34969739005440.0, + "grad_norm": 2.0722426749856506, + "language_loss": 0.7196784, + "learning_rate": 1.601428988367981e-07, + "loss": 0.74107081, + "num_input_tokens_seen": 314026855, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 0.1105957, + "step": 14563, + "time_per_iteration": 2.582829475402832 + }, + { + "auxiliary_loss_clip": 0.01119094, + "auxiliary_loss_mlp": 0.01034021, + "balance_loss_clip": 1.044631, + "balance_loss_mlp": 1.02207041, + "epoch": 0.8756350518563054, + "flos": 18186456925440.0, + "grad_norm": 2.2962934466804668, + "language_loss": 0.6546945, + "learning_rate": 1.5999023123359235e-07, + "loss": 0.67622566, + "num_input_tokens_seen": 314042830, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11950684, + "step": 14564, + "time_per_iteration": 2.4294819831848145 + }, + { + "auxiliary_loss_clip": 0.01113848, + "auxiliary_loss_mlp": 0.01036713, + "balance_loss_clip": 1.04188454, + "balance_loss_mlp": 1.02421343, + "epoch": 0.8756951751089734, + "flos": 20084012611200.0, + "grad_norm": 1.960465639785533, + "language_loss": 0.70572364, + "learning_rate": 1.598376334037408e-07, + "loss": 0.72722924, + "num_input_tokens_seen": 314062225, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.12518311, + "step": 14565, + "time_per_iteration": 2.4343814849853516 + }, + { + "auxiliary_loss_clip": 0.01124448, + "auxiliary_loss_mlp": 0.01034576, + "balance_loss_clip": 1.04403734, + "balance_loss_mlp": 1.02083087, + "epoch": 0.8757552983616413, + "flos": 27525241739520.0, + "grad_norm": 3.0401753975659305, + "language_loss": 0.7764101, + "learning_rate": 1.5968510535303102e-07, + "loss": 0.79800034, + "num_input_tokens_seen": 314082325, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.13751221, + "step": 14566, + "time_per_iteration": 3.9682743549346924 + }, + { + "auxiliary_loss_clip": 0.01113976, + "auxiliary_loss_mlp": 0.01039609, + "balance_loss_clip": 1.04091787, + "balance_loss_mlp": 1.02651429, + "epoch": 0.8758154216143094, + "flos": 18073014796800.0, + "grad_norm": 2.051813413124519, + "language_loss": 0.71275562, + "learning_rate": 1.5953264708724624e-07, + "loss": 0.73429143, + "num_input_tokens_seen": 314100310, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.13098145, + "step": 14567, + "time_per_iteration": 2.500436782836914 + }, + { + "auxiliary_loss_clip": 0.01122553, + "auxiliary_loss_mlp": 0.01029665, + "balance_loss_clip": 1.04879522, + "balance_loss_mlp": 1.01776791, + "epoch": 0.8758755448669773, + "flos": 25045681985280.0, + "grad_norm": 2.5644247562759355, + "language_loss": 0.74381667, + "learning_rate": 1.5938025861216776e-07, + "loss": 0.76533884, + "num_input_tokens_seen": 314121330, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11901855, + "step": 14568, + "time_per_iteration": 2.51051664352417 + }, + { + "auxiliary_loss_clip": 0.01114118, + "auxiliary_loss_mlp": 0.01036722, + "balance_loss_clip": 1.04072034, + "balance_loss_mlp": 1.02476501, + "epoch": 0.8759356681196453, + "flos": 22856818999680.0, + "grad_norm": 5.901026058888379, + "language_loss": 0.86853313, + "learning_rate": 1.5922793993357475e-07, + "loss": 0.89004159, + "num_input_tokens_seen": 314139875, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11956787, + "step": 14569, + "time_per_iteration": 2.421855926513672 + }, + { + "auxiliary_loss_clip": 0.01116896, + "auxiliary_loss_mlp": 0.01041204, + "balance_loss_clip": 1.04059768, + "balance_loss_mlp": 1.02910423, + "epoch": 0.8759957913723132, + "flos": 21032521102080.0, + "grad_norm": 1.9176948050560767, + "language_loss": 0.7407915, + "learning_rate": 1.5907569105724284e-07, + "loss": 0.76237249, + "num_input_tokens_seen": 314157850, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.12097168, + "step": 14570, + "time_per_iteration": 2.469034433364868 + }, + { + "auxiliary_loss_clip": 0.01118596, + "auxiliary_loss_mlp": 0.01028523, + "balance_loss_clip": 1.04464269, + "balance_loss_mlp": 1.01566625, + "epoch": 0.8760559146249812, + "flos": 20010467514240.0, + "grad_norm": 1.805556137199942, + "language_loss": 0.67909229, + "learning_rate": 1.5892351198894472e-07, + "loss": 0.70056349, + "num_input_tokens_seen": 314176720, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.12872314, + "step": 14571, + "time_per_iteration": 2.5814054012298584 + }, + { + "auxiliary_loss_clip": 0.01113581, + "auxiliary_loss_mlp": 0.01029556, + "balance_loss_clip": 1.04099083, + "balance_loss_mlp": 1.0180285, + "epoch": 0.8761160378776491, + "flos": 19974161842560.0, + "grad_norm": 1.8646138513239825, + "language_loss": 0.62532187, + "learning_rate": 1.5877140273445156e-07, + "loss": 0.64675319, + "num_input_tokens_seen": 314196645, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11541748, + "step": 14572, + "time_per_iteration": 2.471364736557007 + }, + { + "auxiliary_loss_clip": 0.01112973, + "auxiliary_loss_mlp": 0.01030717, + "balance_loss_clip": 1.0409646, + "balance_loss_mlp": 1.02014303, + "epoch": 0.8761761611303172, + "flos": 28804415857920.0, + "grad_norm": 1.7610608638665024, + "language_loss": 0.73917615, + "learning_rate": 1.5861936329953162e-07, + "loss": 0.76061308, + "num_input_tokens_seen": 314217430, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.10565186, + "step": 14573, + "time_per_iteration": 2.543008327484131 + }, + { + "auxiliary_loss_clip": 0.01113289, + "auxiliary_loss_mlp": 0.01027211, + "balance_loss_clip": 1.04317391, + "balance_loss_mlp": 1.01638722, + "epoch": 0.8762362843829851, + "flos": 18332505624960.0, + "grad_norm": 3.7439271700091687, + "language_loss": 0.72749352, + "learning_rate": 1.5846739368994966e-07, + "loss": 0.74889851, + "num_input_tokens_seen": 314235310, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.10827637, + "step": 14574, + "time_per_iteration": 2.561527729034424 + }, + { + "auxiliary_loss_clip": 0.01117002, + "auxiliary_loss_mlp": 0.0103265, + "balance_loss_clip": 1.04447901, + "balance_loss_mlp": 1.02074718, + "epoch": 0.8762964076356531, + "flos": 15779149378560.0, + "grad_norm": 2.092483739641489, + "language_loss": 0.75723362, + "learning_rate": 1.5831549391146903e-07, + "loss": 0.77873015, + "num_input_tokens_seen": 314252355, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11914062, + "step": 14575, + "time_per_iteration": 2.4508564472198486 + }, + { + "auxiliary_loss_clip": 0.01117674, + "auxiliary_loss_mlp": 0.01033364, + "balance_loss_clip": 1.04717255, + "balance_loss_mlp": 1.02217019, + "epoch": 0.8763565308883211, + "flos": 33176754789120.0, + "grad_norm": 1.771493122829326, + "language_loss": 0.66605997, + "learning_rate": 1.5816366396984916e-07, + "loss": 0.68757033, + "num_input_tokens_seen": 314272755, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.11187744, + "step": 14576, + "time_per_iteration": 2.600209951400757 + }, + { + "auxiliary_loss_clip": 0.01115465, + "auxiliary_loss_mlp": 0.01026994, + "balance_loss_clip": 1.04293752, + "balance_loss_mlp": 1.01580608, + "epoch": 0.876416654140989, + "flos": 15888102307200.0, + "grad_norm": 2.473442117561195, + "language_loss": 0.6734103, + "learning_rate": 1.5801190387084806e-07, + "loss": 0.69483495, + "num_input_tokens_seen": 314291365, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11181641, + "step": 14577, + "time_per_iteration": 2.3982152938842773 + }, + { + "auxiliary_loss_clip": 0.01130764, + "auxiliary_loss_mlp": 0.01029128, + "balance_loss_clip": 1.05616295, + "balance_loss_mlp": 1.01739192, + "epoch": 0.876476777393657, + "flos": 25885237547520.0, + "grad_norm": 2.3612198148091585, + "language_loss": 0.70942295, + "learning_rate": 1.5786021362021962e-07, + "loss": 0.73102188, + "num_input_tokens_seen": 314310075, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11743164, + "step": 14578, + "time_per_iteration": 2.517796754837036 + }, + { + "auxiliary_loss_clip": 0.01118789, + "auxiliary_loss_mlp": 0.01033028, + "balance_loss_clip": 1.04514337, + "balance_loss_mlp": 1.02104783, + "epoch": 0.876536900646325, + "flos": 13589675861760.0, + "grad_norm": 2.9352166765442678, + "language_loss": 0.71042591, + "learning_rate": 1.5770859322371676e-07, + "loss": 0.73194408, + "num_input_tokens_seen": 314325695, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11968994, + "step": 14579, + "time_per_iteration": 2.4713425636291504 + }, + { + "auxiliary_loss_clip": 0.01120732, + "auxiliary_loss_mlp": 0.01028285, + "balance_loss_clip": 1.05164552, + "balance_loss_mlp": 1.01722836, + "epoch": 0.876597023898993, + "flos": 12203344494720.0, + "grad_norm": 2.1011177242408055, + "language_loss": 0.699646, + "learning_rate": 1.5755704268708912e-07, + "loss": 0.72113609, + "num_input_tokens_seen": 314343605, + "router_z_loss_clip": 0.69091797, + "router_z_loss_mlp": 0.11071777, + "step": 14580, + "time_per_iteration": 2.5013134479522705 + }, + { + "auxiliary_loss_clip": 0.01112374, + "auxiliary_loss_mlp": 0.01024604, + "balance_loss_clip": 1.04144311, + "balance_loss_mlp": 1.01357746, + "epoch": 0.8766571471516609, + "flos": 25336773803520.0, + "grad_norm": 1.624767546399875, + "language_loss": 0.65765417, + "learning_rate": 1.5740556201608256e-07, + "loss": 0.67902398, + "num_input_tokens_seen": 314364275, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11010742, + "step": 14581, + "time_per_iteration": 2.4854061603546143 + }, + { + "auxiliary_loss_clip": 0.01116912, + "auxiliary_loss_mlp": 0.01032556, + "balance_loss_clip": 1.04738641, + "balance_loss_mlp": 1.02127302, + "epoch": 0.8767172704043289, + "flos": 30113287545600.0, + "grad_norm": 1.466743518779177, + "language_loss": 0.73691678, + "learning_rate": 1.572541512164416e-07, + "loss": 0.75841147, + "num_input_tokens_seen": 314385140, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.11273193, + "step": 14582, + "time_per_iteration": 3.9241530895233154 + }, + { + "auxiliary_loss_clip": 0.01111618, + "auxiliary_loss_mlp": 0.01025592, + "balance_loss_clip": 1.03946877, + "balance_loss_mlp": 1.01376009, + "epoch": 0.8767773936569968, + "flos": 19281157770240.0, + "grad_norm": 2.5972861146611654, + "language_loss": 0.66923696, + "learning_rate": 1.5710281029390826e-07, + "loss": 0.69060898, + "num_input_tokens_seen": 314403715, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11834717, + "step": 14583, + "time_per_iteration": 2.450953245162964 + }, + { + "auxiliary_loss_clip": 0.01110914, + "auxiliary_loss_mlp": 0.01027218, + "balance_loss_clip": 1.03687692, + "balance_loss_mlp": 1.01545739, + "epoch": 0.8768375169096648, + "flos": 21247230648960.0, + "grad_norm": 1.850801537592269, + "language_loss": 0.79235005, + "learning_rate": 1.5695153925422067e-07, + "loss": 0.81373131, + "num_input_tokens_seen": 314421880, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11749268, + "step": 14584, + "time_per_iteration": 2.4432132244110107 + }, + { + "auxiliary_loss_clip": 0.01114618, + "auxiliary_loss_mlp": 0.0102998, + "balance_loss_clip": 1.04004812, + "balance_loss_mlp": 1.01850581, + "epoch": 0.8768976401623327, + "flos": 23295539715840.0, + "grad_norm": 1.5355163124718094, + "language_loss": 0.72409028, + "learning_rate": 1.5680033810311555e-07, + "loss": 0.74553621, + "num_input_tokens_seen": 314441585, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11468506, + "step": 14585, + "time_per_iteration": 4.055616617202759 + }, + { + "auxiliary_loss_clip": 0.01108057, + "auxiliary_loss_mlp": 0.0102684, + "balance_loss_clip": 1.03672683, + "balance_loss_mlp": 1.01437044, + "epoch": 0.8769577634150008, + "flos": 21361247395200.0, + "grad_norm": 1.9836258766801473, + "language_loss": 0.74452698, + "learning_rate": 1.5664920684632654e-07, + "loss": 0.765876, + "num_input_tokens_seen": 314459020, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.12481689, + "step": 14586, + "time_per_iteration": 2.5710947513580322 + }, + { + "auxiliary_loss_clip": 0.01114588, + "auxiliary_loss_mlp": 0.01027413, + "balance_loss_clip": 1.04195142, + "balance_loss_mlp": 1.01536107, + "epoch": 0.8770178866676687, + "flos": 23514056104320.0, + "grad_norm": 1.7972662523781493, + "language_loss": 0.78709918, + "learning_rate": 1.564981454895844e-07, + "loss": 0.80851924, + "num_input_tokens_seen": 314478935, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.12054443, + "step": 14587, + "time_per_iteration": 2.4904699325561523 + }, + { + "auxiliary_loss_clip": 0.01115043, + "auxiliary_loss_mlp": 0.01033517, + "balance_loss_clip": 1.04211235, + "balance_loss_mlp": 1.01856852, + "epoch": 0.8770780099203367, + "flos": 19719052473600.0, + "grad_norm": 1.90339018824523, + "language_loss": 0.73534751, + "learning_rate": 1.5634715403861697e-07, + "loss": 0.75683302, + "num_input_tokens_seen": 314497635, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.14953613, + "step": 14588, + "time_per_iteration": 2.438110589981079 + }, + { + "auxiliary_loss_clip": 0.01114778, + "auxiliary_loss_mlp": 0.01030372, + "balance_loss_clip": 1.04235172, + "balance_loss_mlp": 1.01945829, + "epoch": 0.8771381331730047, + "flos": 21395901041280.0, + "grad_norm": 1.7755192077204516, + "language_loss": 0.6663661, + "learning_rate": 1.5619623249915016e-07, + "loss": 0.68781763, + "num_input_tokens_seen": 314515445, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.10913086, + "step": 14589, + "time_per_iteration": 2.4839205741882324 + }, + { + "auxiliary_loss_clip": 0.01119536, + "auxiliary_loss_mlp": 0.01027776, + "balance_loss_clip": 1.04585361, + "balance_loss_mlp": 1.01631987, + "epoch": 0.8771982564256726, + "flos": 20261770041600.0, + "grad_norm": 2.231109484704426, + "language_loss": 0.71348464, + "learning_rate": 1.5604538087690732e-07, + "loss": 0.73495775, + "num_input_tokens_seen": 314533040, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11468506, + "step": 14590, + "time_per_iteration": 2.4117841720581055 + }, + { + "auxiliary_loss_clip": 0.01123434, + "auxiliary_loss_mlp": 0.01037001, + "balance_loss_clip": 1.04622233, + "balance_loss_mlp": 1.02390563, + "epoch": 0.8772583796783406, + "flos": 12489372495360.0, + "grad_norm": 2.28122890530385, + "language_loss": 0.74860924, + "learning_rate": 1.558945991776086e-07, + "loss": 0.7702136, + "num_input_tokens_seen": 314548280, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.13104248, + "step": 14591, + "time_per_iteration": 2.438493013381958 + }, + { + "auxiliary_loss_clip": 0.01110001, + "auxiliary_loss_mlp": 0.01023971, + "balance_loss_clip": 1.04321527, + "balance_loss_mlp": 1.01304007, + "epoch": 0.8773185029310085, + "flos": 15921103927680.0, + "grad_norm": 1.720686291615256, + "language_loss": 0.80263638, + "learning_rate": 1.5574388740697096e-07, + "loss": 0.82397604, + "num_input_tokens_seen": 314565345, + "router_z_loss_clip": 0.66748047, + "router_z_loss_mlp": 0.10919189, + "step": 14592, + "time_per_iteration": 2.489102602005005 + }, + { + "auxiliary_loss_clip": 0.01106209, + "auxiliary_loss_mlp": 0.01029961, + "balance_loss_clip": 1.03772616, + "balance_loss_mlp": 1.01934564, + "epoch": 0.8773786261836766, + "flos": 21504530747520.0, + "grad_norm": 1.6197309874546189, + "language_loss": 0.82762814, + "learning_rate": 1.5559324557071052e-07, + "loss": 0.84898984, + "num_input_tokens_seen": 314584190, + "router_z_loss_clip": 0.68505859, + "router_z_loss_mlp": 0.10620117, + "step": 14593, + "time_per_iteration": 2.4781556129455566 + }, + { + "auxiliary_loss_clip": 0.01110477, + "auxiliary_loss_mlp": 0.01026309, + "balance_loss_clip": 1.04034257, + "balance_loss_mlp": 1.01502597, + "epoch": 0.8774387494363445, + "flos": 26761493831040.0, + "grad_norm": 2.207402423280266, + "language_loss": 0.76150584, + "learning_rate": 1.5544267367453845e-07, + "loss": 0.78287375, + "num_input_tokens_seen": 314605625, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.11279297, + "step": 14594, + "time_per_iteration": 2.495680809020996 + }, + { + "auxiliary_loss_clip": 0.01111674, + "auxiliary_loss_mlp": 0.010447, + "balance_loss_clip": 1.03697586, + "balance_loss_mlp": 1.03031778, + "epoch": 0.8774988726890125, + "flos": 18478841633280.0, + "grad_norm": 2.2958175528337947, + "language_loss": 0.78052998, + "learning_rate": 1.552921717241651e-07, + "loss": 0.80209374, + "num_input_tokens_seen": 314622630, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.14379883, + "step": 14595, + "time_per_iteration": 3.839722156524658 + }, + { + "auxiliary_loss_clip": 0.01118744, + "auxiliary_loss_mlp": 0.01037879, + "balance_loss_clip": 1.0435791, + "balance_loss_mlp": 1.0249145, + "epoch": 0.8775589959416804, + "flos": 24426366664320.0, + "grad_norm": 1.3570675560907925, + "language_loss": 0.70419282, + "learning_rate": 1.5514173972529743e-07, + "loss": 0.72575903, + "num_input_tokens_seen": 314642460, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12969971, + "step": 14596, + "time_per_iteration": 2.519561529159546 + }, + { + "auxiliary_loss_clip": 0.01120606, + "auxiliary_loss_mlp": 0.01027277, + "balance_loss_clip": 1.04662085, + "balance_loss_mlp": 1.01625609, + "epoch": 0.8776191191943484, + "flos": 23440151871360.0, + "grad_norm": 1.6377314129236669, + "language_loss": 0.85994804, + "learning_rate": 1.5499137768364067e-07, + "loss": 0.88142681, + "num_input_tokens_seen": 314659875, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11010742, + "step": 14597, + "time_per_iteration": 2.526723623275757 + }, + { + "auxiliary_loss_clip": 0.01115567, + "auxiliary_loss_mlp": 0.01027256, + "balance_loss_clip": 1.04390645, + "balance_loss_mlp": 1.01579976, + "epoch": 0.8776792424470163, + "flos": 26830872950400.0, + "grad_norm": 1.5874092179373238, + "language_loss": 0.72617388, + "learning_rate": 1.5484108560489494e-07, + "loss": 0.74760211, + "num_input_tokens_seen": 314680260, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11444092, + "step": 14598, + "time_per_iteration": 2.6473052501678467 + }, + { + "auxiliary_loss_clip": 0.01118014, + "auxiliary_loss_mlp": 0.01026066, + "balance_loss_clip": 1.04490864, + "balance_loss_mlp": 1.01437724, + "epoch": 0.8777393656996844, + "flos": 15626169354240.0, + "grad_norm": 2.745813161598533, + "language_loss": 0.77583653, + "learning_rate": 1.5469086349476036e-07, + "loss": 0.79727733, + "num_input_tokens_seen": 314696260, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11700439, + "step": 14599, + "time_per_iteration": 2.3977558612823486 + }, + { + "auxiliary_loss_clip": 0.01122808, + "auxiliary_loss_mlp": 0.01030295, + "balance_loss_clip": 1.04897714, + "balance_loss_mlp": 1.01904202, + "epoch": 0.8777994889523523, + "flos": 18879999701760.0, + "grad_norm": 2.291884272584071, + "language_loss": 0.68042511, + "learning_rate": 1.545407113589332e-07, + "loss": 0.70195615, + "num_input_tokens_seen": 314714215, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11254883, + "step": 14600, + "time_per_iteration": 2.4348535537719727 + }, + { + "auxiliary_loss_clip": 0.01124972, + "auxiliary_loss_mlp": 0.01038039, + "balance_loss_clip": 1.05156314, + "balance_loss_mlp": 1.02620792, + "epoch": 0.8778596122050203, + "flos": 48826516400640.0, + "grad_norm": 1.718003615048635, + "language_loss": 0.69230926, + "learning_rate": 1.543906292031072e-07, + "loss": 0.71393943, + "num_input_tokens_seen": 314735700, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11828613, + "step": 14601, + "time_per_iteration": 2.6779448986053467 + }, + { + "auxiliary_loss_clip": 0.01119245, + "auxiliary_loss_mlp": 0.01031412, + "balance_loss_clip": 1.04419494, + "balance_loss_mlp": 1.01958632, + "epoch": 0.8779197354576883, + "flos": 25660184883840.0, + "grad_norm": 2.4329391319264917, + "language_loss": 0.73265445, + "learning_rate": 1.542406170329733e-07, + "loss": 0.754161, + "num_input_tokens_seen": 314753335, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.11816406, + "step": 14602, + "time_per_iteration": 2.5130515098571777 + }, + { + "auxiliary_loss_clip": 0.01112783, + "auxiliary_loss_mlp": 0.0102954, + "balance_loss_clip": 1.04282045, + "balance_loss_mlp": 1.01918697, + "epoch": 0.8779798587103562, + "flos": 18843227153280.0, + "grad_norm": 2.0227420045276956, + "language_loss": 0.71495581, + "learning_rate": 1.5409067485422056e-07, + "loss": 0.73637903, + "num_input_tokens_seen": 314770800, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.10345459, + "step": 14603, + "time_per_iteration": 2.409649610519409 + }, + { + "auxiliary_loss_clip": 0.01050665, + "auxiliary_loss_mlp": 0.01006602, + "balance_loss_clip": 1.02636158, + "balance_loss_mlp": 1.00530291, + "epoch": 0.8780399819630242, + "flos": 68613119377920.0, + "grad_norm": 0.7348123921479782, + "language_loss": 0.54192281, + "learning_rate": 1.539408026725344e-07, + "loss": 0.56249547, + "num_input_tokens_seen": 314837275, + "router_z_loss_clip": 0.24291992, + "router_z_loss_mlp": 0.01300049, + "step": 14604, + "time_per_iteration": 3.0925629138946533 + }, + { + "auxiliary_loss_clip": 0.0104597, + "auxiliary_loss_mlp": 0.01010076, + "balance_loss_clip": 1.02112687, + "balance_loss_mlp": 1.0087558, + "epoch": 0.8781001052156922, + "flos": 65734807766400.0, + "grad_norm": 0.7057931563147533, + "language_loss": 0.59224439, + "learning_rate": 1.537910004935976e-07, + "loss": 0.61280489, + "num_input_tokens_seen": 314902220, + "router_z_loss_clip": 0.24853516, + "router_z_loss_mlp": 0.01321411, + "step": 14605, + "time_per_iteration": 3.082228422164917 + }, + { + "auxiliary_loss_clip": 0.01114936, + "auxiliary_loss_mlp": 0.01030523, + "balance_loss_clip": 1.0422945, + "balance_loss_mlp": 1.01887631, + "epoch": 0.8781602284683602, + "flos": 22049654526720.0, + "grad_norm": 1.681701981961507, + "language_loss": 0.85340357, + "learning_rate": 1.536412683230912e-07, + "loss": 0.87485814, + "num_input_tokens_seen": 314921645, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11651611, + "step": 14606, + "time_per_iteration": 2.599998950958252 + }, + { + "auxiliary_loss_clip": 0.01116379, + "auxiliary_loss_mlp": 0.01031717, + "balance_loss_clip": 1.04340541, + "balance_loss_mlp": 1.01895571, + "epoch": 0.8782203517210281, + "flos": 17562939713280.0, + "grad_norm": 1.7648079010538666, + "language_loss": 0.70495105, + "learning_rate": 1.534916061666931e-07, + "loss": 0.72643197, + "num_input_tokens_seen": 314939390, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.12750244, + "step": 14607, + "time_per_iteration": 2.4439761638641357 + }, + { + "auxiliary_loss_clip": 0.01111414, + "auxiliary_loss_mlp": 0.010323, + "balance_loss_clip": 1.04116368, + "balance_loss_mlp": 1.02176142, + "epoch": 0.8782804749736961, + "flos": 25520421064320.0, + "grad_norm": 1.8140404558207082, + "language_loss": 0.72069681, + "learning_rate": 1.533420140300785e-07, + "loss": 0.74213386, + "num_input_tokens_seen": 314959205, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.10534668, + "step": 14608, + "time_per_iteration": 2.4849636554718018 + }, + { + "auxiliary_loss_clip": 0.01117003, + "auxiliary_loss_mlp": 0.01036741, + "balance_loss_clip": 1.04051769, + "balance_loss_mlp": 1.02364635, + "epoch": 0.878340598226364, + "flos": 21798747048960.0, + "grad_norm": 2.180513986308703, + "language_loss": 0.87529033, + "learning_rate": 1.5319249191891936e-07, + "loss": 0.89682782, + "num_input_tokens_seen": 314977485, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.1307373, + "step": 14609, + "time_per_iteration": 2.399195671081543 + }, + { + "auxiliary_loss_clip": 0.01109903, + "auxiliary_loss_mlp": 0.01030114, + "balance_loss_clip": 1.03858876, + "balance_loss_mlp": 1.01844978, + "epoch": 0.878400721479032, + "flos": 21102403011840.0, + "grad_norm": 2.0605222257361056, + "language_loss": 0.70323813, + "learning_rate": 1.5304303983888643e-07, + "loss": 0.72463822, + "num_input_tokens_seen": 314997830, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.11663818, + "step": 14610, + "time_per_iteration": 3.9873697757720947 + }, + { + "auxiliary_loss_clip": 0.0111084, + "auxiliary_loss_mlp": 0.01031905, + "balance_loss_clip": 1.04092884, + "balance_loss_mlp": 1.02064538, + "epoch": 0.8784608447316999, + "flos": 20923532259840.0, + "grad_norm": 2.8002374512387624, + "language_loss": 0.80473012, + "learning_rate": 1.5289365779564612e-07, + "loss": 0.82615757, + "num_input_tokens_seen": 315016480, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.11260986, + "step": 14611, + "time_per_iteration": 2.4328761100769043 + }, + { + "auxiliary_loss_clip": 0.01120073, + "auxiliary_loss_mlp": 0.01028346, + "balance_loss_clip": 1.0464977, + "balance_loss_mlp": 1.01698565, + "epoch": 0.878520967984368, + "flos": 23330660238720.0, + "grad_norm": 1.653503113042204, + "language_loss": 0.76700413, + "learning_rate": 1.5274434579486338e-07, + "loss": 0.78848839, + "num_input_tokens_seen": 315036135, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11364746, + "step": 14612, + "time_per_iteration": 2.42972993850708 + }, + { + "auxiliary_loss_clip": 0.01111126, + "auxiliary_loss_mlp": 0.01032049, + "balance_loss_clip": 1.04170322, + "balance_loss_mlp": 1.02090287, + "epoch": 0.8785810912370359, + "flos": 25518984520320.0, + "grad_norm": 1.3931068238092315, + "language_loss": 0.72530019, + "learning_rate": 1.525951038422002e-07, + "loss": 0.74673194, + "num_input_tokens_seen": 315057995, + "router_z_loss_clip": 0.69482422, + "router_z_loss_mlp": 0.1114502, + "step": 14613, + "time_per_iteration": 2.5065906047821045 + }, + { + "auxiliary_loss_clip": 0.0106701, + "auxiliary_loss_mlp": 0.01005356, + "balance_loss_clip": 1.04347134, + "balance_loss_mlp": 1.00392604, + "epoch": 0.8786412144897039, + "flos": 61841047691520.0, + "grad_norm": 1.0328087420229786, + "language_loss": 0.64555943, + "learning_rate": 1.5244593194331667e-07, + "loss": 0.66628313, + "num_input_tokens_seen": 315104010, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.01431274, + "step": 14614, + "time_per_iteration": 2.8134264945983887 + }, + { + "auxiliary_loss_clip": 0.01040453, + "auxiliary_loss_mlp": 0.01003104, + "balance_loss_clip": 1.01588666, + "balance_loss_mlp": 1.001683, + "epoch": 0.8787013377423719, + "flos": 70989364638720.0, + "grad_norm": 0.6893544323669859, + "language_loss": 0.5854007, + "learning_rate": 1.5229683010386762e-07, + "loss": 0.60583627, + "num_input_tokens_seen": 315174550, + "router_z_loss_clip": 0.24584961, + "router_z_loss_mlp": 0.01420593, + "step": 14615, + "time_per_iteration": 3.1442208290100098 + }, + { + "auxiliary_loss_clip": 0.01114355, + "auxiliary_loss_mlp": 0.01026784, + "balance_loss_clip": 1.04229045, + "balance_loss_mlp": 1.01534617, + "epoch": 0.8787614609950398, + "flos": 17347404153600.0, + "grad_norm": 1.813614637733685, + "language_loss": 0.72974527, + "learning_rate": 1.5214779832950807e-07, + "loss": 0.75115669, + "num_input_tokens_seen": 315191825, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11437988, + "step": 14616, + "time_per_iteration": 2.3811304569244385 + }, + { + "auxiliary_loss_clip": 0.01043815, + "auxiliary_loss_mlp": 0.01004092, + "balance_loss_clip": 1.01906395, + "balance_loss_mlp": 1.00273442, + "epoch": 0.8788215842477078, + "flos": 72511401588480.0, + "grad_norm": 0.832037207068046, + "language_loss": 0.58010852, + "learning_rate": 1.5199883662588953e-07, + "loss": 0.60058767, + "num_input_tokens_seen": 315255075, + "router_z_loss_clip": 0.24731445, + "router_z_loss_mlp": 0.01358032, + "step": 14617, + "time_per_iteration": 3.1599066257476807 + }, + { + "auxiliary_loss_clip": 0.01115527, + "auxiliary_loss_mlp": 0.0103169, + "balance_loss_clip": 1.04383564, + "balance_loss_mlp": 1.02044296, + "epoch": 0.8788817075003758, + "flos": 24827452905600.0, + "grad_norm": 1.757991721385275, + "language_loss": 0.84128684, + "learning_rate": 1.5184994499865987e-07, + "loss": 0.86275899, + "num_input_tokens_seen": 315273995, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11248779, + "step": 14618, + "time_per_iteration": 2.461016893386841 + }, + { + "auxiliary_loss_clip": 0.01112041, + "auxiliary_loss_mlp": 0.01028609, + "balance_loss_clip": 1.04488242, + "balance_loss_mlp": 1.01776648, + "epoch": 0.8789418307530438, + "flos": 22638769488000.0, + "grad_norm": 2.1095228529253953, + "language_loss": 0.69336021, + "learning_rate": 1.5170112345346598e-07, + "loss": 0.71476668, + "num_input_tokens_seen": 315294485, + "router_z_loss_clip": 0.67138672, + "router_z_loss_mlp": 0.10839844, + "step": 14619, + "time_per_iteration": 2.575667381286621 + }, + { + "auxiliary_loss_clip": 0.01110803, + "auxiliary_loss_mlp": 0.0103453, + "balance_loss_clip": 1.03751159, + "balance_loss_mlp": 1.02330685, + "epoch": 0.8790019540057117, + "flos": 19785738072960.0, + "grad_norm": 1.8523212848758113, + "language_loss": 0.77507699, + "learning_rate": 1.5155237199595016e-07, + "loss": 0.79653037, + "num_input_tokens_seen": 315310420, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11218262, + "step": 14620, + "time_per_iteration": 2.5500006675720215 + }, + { + "auxiliary_loss_clip": 0.01116237, + "auxiliary_loss_mlp": 0.01038842, + "balance_loss_clip": 1.04236174, + "balance_loss_mlp": 1.02456641, + "epoch": 0.8790620772583797, + "flos": 20229774001920.0, + "grad_norm": 1.7039182370386186, + "language_loss": 0.79262066, + "learning_rate": 1.514036906317542e-07, + "loss": 0.81417143, + "num_input_tokens_seen": 315330110, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.1427002, + "step": 14621, + "time_per_iteration": 2.4597792625427246 + }, + { + "auxiliary_loss_clip": 0.0111034, + "auxiliary_loss_mlp": 0.0103367, + "balance_loss_clip": 1.03686929, + "balance_loss_mlp": 1.02201772, + "epoch": 0.8791222005110476, + "flos": 24130785646080.0, + "grad_norm": 1.5835735463753655, + "language_loss": 0.66560847, + "learning_rate": 1.5125507936651506e-07, + "loss": 0.68704855, + "num_input_tokens_seen": 315350080, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11651611, + "step": 14622, + "time_per_iteration": 2.453380584716797 + }, + { + "auxiliary_loss_clip": 0.0110905, + "auxiliary_loss_mlp": 0.01032579, + "balance_loss_clip": 1.03819418, + "balance_loss_mlp": 1.02148092, + "epoch": 0.8791823237637156, + "flos": 21614201948160.0, + "grad_norm": 1.8507184834402561, + "language_loss": 0.73321998, + "learning_rate": 1.511065382058687e-07, + "loss": 0.75463617, + "num_input_tokens_seen": 315366360, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11108398, + "step": 14623, + "time_per_iteration": 2.5132386684417725 + }, + { + "auxiliary_loss_clip": 0.0110518, + "auxiliary_loss_mlp": 0.01028584, + "balance_loss_clip": 1.03538835, + "balance_loss_mlp": 1.0170269, + "epoch": 0.8792424470163835, + "flos": 24243401761920.0, + "grad_norm": 1.9104117609204834, + "language_loss": 0.78371012, + "learning_rate": 1.5095806715544801e-07, + "loss": 0.80504775, + "num_input_tokens_seen": 315385890, + "router_z_loss_clip": 0.69775391, + "router_z_loss_mlp": 0.11560059, + "step": 14624, + "time_per_iteration": 2.441460371017456 + }, + { + "auxiliary_loss_clip": 0.01119591, + "auxiliary_loss_mlp": 0.01030531, + "balance_loss_clip": 1.04624867, + "balance_loss_mlp": 1.01822877, + "epoch": 0.8793025702690516, + "flos": 24893204751360.0, + "grad_norm": 1.7475411752614693, + "language_loss": 0.80245042, + "learning_rate": 1.5080966622088265e-07, + "loss": 0.8239516, + "num_input_tokens_seen": 315403400, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.12304688, + "step": 14625, + "time_per_iteration": 3.8913259506225586 + }, + { + "auxiliary_loss_clip": 0.01110971, + "auxiliary_loss_mlp": 0.01034729, + "balance_loss_clip": 1.04211307, + "balance_loss_mlp": 1.02359533, + "epoch": 0.8793626935217195, + "flos": 25373115388800.0, + "grad_norm": 1.457455549105381, + "language_loss": 0.74203217, + "learning_rate": 1.5066133540779967e-07, + "loss": 0.76348913, + "num_input_tokens_seen": 315423670, + "router_z_loss_clip": 0.68847656, + "router_z_loss_mlp": 0.11138916, + "step": 14626, + "time_per_iteration": 2.4724762439727783 + }, + { + "auxiliary_loss_clip": 0.01107575, + "auxiliary_loss_mlp": 0.01030337, + "balance_loss_clip": 1.03479314, + "balance_loss_mlp": 1.01844621, + "epoch": 0.8794228167743875, + "flos": 34678000742400.0, + "grad_norm": 1.4862022423252266, + "language_loss": 0.71107304, + "learning_rate": 1.505130747218246e-07, + "loss": 0.73245215, + "num_input_tokens_seen": 315446265, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11901855, + "step": 14627, + "time_per_iteration": 2.5605738162994385 + }, + { + "auxiliary_loss_clip": 0.01114263, + "auxiliary_loss_mlp": 0.0103292, + "balance_loss_clip": 1.04135346, + "balance_loss_mlp": 1.02070713, + "epoch": 0.8794829400270555, + "flos": 19464014931840.0, + "grad_norm": 2.2772733716839455, + "language_loss": 0.72390515, + "learning_rate": 1.5036488416857873e-07, + "loss": 0.74537694, + "num_input_tokens_seen": 315464655, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.12213135, + "step": 14628, + "time_per_iteration": 3.830392599105835 + }, + { + "auxiliary_loss_clip": 0.01115435, + "auxiliary_loss_mlp": 0.01030484, + "balance_loss_clip": 1.04309499, + "balance_loss_mlp": 1.0182296, + "epoch": 0.8795430632797234, + "flos": 15231403906560.0, + "grad_norm": 2.5186269700282455, + "language_loss": 0.68738365, + "learning_rate": 1.5021676375368175e-07, + "loss": 0.70884287, + "num_input_tokens_seen": 315481090, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.12249756, + "step": 14629, + "time_per_iteration": 2.3960790634155273 + }, + { + "auxiliary_loss_clip": 0.0111417, + "auxiliary_loss_mlp": 0.0104239, + "balance_loss_clip": 1.03985929, + "balance_loss_mlp": 1.03029013, + "epoch": 0.8796031865323914, + "flos": 27744727795200.0, + "grad_norm": 1.5799626552201604, + "language_loss": 0.68652987, + "learning_rate": 1.5006871348275053e-07, + "loss": 0.70809549, + "num_input_tokens_seen": 315502010, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12084961, + "step": 14630, + "time_per_iteration": 2.487048387527466 + }, + { + "auxiliary_loss_clip": 0.01117129, + "auxiliary_loss_mlp": 0.01030711, + "balance_loss_clip": 1.04620528, + "balance_loss_mlp": 1.01850414, + "epoch": 0.8796633097850594, + "flos": 31285412156160.0, + "grad_norm": 1.5570848490650666, + "language_loss": 0.74287635, + "learning_rate": 1.499207333613999e-07, + "loss": 0.76435471, + "num_input_tokens_seen": 315523040, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.12207031, + "step": 14631, + "time_per_iteration": 2.4976882934570312 + }, + { + "auxiliary_loss_clip": 0.01111835, + "auxiliary_loss_mlp": 0.01032342, + "balance_loss_clip": 1.04326582, + "balance_loss_mlp": 1.02065396, + "epoch": 0.8797234330377274, + "flos": 24243150366720.0, + "grad_norm": 2.269657912528108, + "language_loss": 0.69321406, + "learning_rate": 1.4977282339523954e-07, + "loss": 0.71465576, + "num_input_tokens_seen": 315541865, + "router_z_loss_clip": 0.68554688, + "router_z_loss_mlp": 0.11688232, + "step": 14632, + "time_per_iteration": 2.4494822025299072 + }, + { + "auxiliary_loss_clip": 0.01114981, + "auxiliary_loss_mlp": 0.01027043, + "balance_loss_clip": 1.043787, + "balance_loss_mlp": 1.01662958, + "epoch": 0.8797835562903953, + "flos": 24167414540160.0, + "grad_norm": 1.8245743971785724, + "language_loss": 0.64645833, + "learning_rate": 1.4962498358987929e-07, + "loss": 0.66787857, + "num_input_tokens_seen": 315561470, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.10412598, + "step": 14633, + "time_per_iteration": 2.428523540496826 + }, + { + "auxiliary_loss_clip": 0.01115636, + "auxiliary_loss_mlp": 0.01033676, + "balance_loss_clip": 1.04524446, + "balance_loss_mlp": 1.02227926, + "epoch": 0.8798436795430633, + "flos": 19284677303040.0, + "grad_norm": 1.5481488938493988, + "language_loss": 0.84083271, + "learning_rate": 1.4947721395092528e-07, + "loss": 0.86232579, + "num_input_tokens_seen": 315583140, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.11395264, + "step": 14634, + "time_per_iteration": 2.498871326446533 + }, + { + "auxiliary_loss_clip": 0.01107349, + "auxiliary_loss_mlp": 0.01036929, + "balance_loss_clip": 1.03584719, + "balance_loss_mlp": 1.02462685, + "epoch": 0.8799038027957312, + "flos": 28179390274560.0, + "grad_norm": 1.7947934177333125, + "language_loss": 0.79539794, + "learning_rate": 1.4932951448398056e-07, + "loss": 0.81684077, + "num_input_tokens_seen": 315601935, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.12298584, + "step": 14635, + "time_per_iteration": 2.5457603931427 + }, + { + "auxiliary_loss_clip": 0.01119198, + "auxiliary_loss_mlp": 0.01024357, + "balance_loss_clip": 1.04707837, + "balance_loss_mlp": 1.0125674, + "epoch": 0.8799639260483992, + "flos": 24644703484800.0, + "grad_norm": 2.0376553703822067, + "language_loss": 0.65331835, + "learning_rate": 1.4918188519464648e-07, + "loss": 0.6747539, + "num_input_tokens_seen": 315619995, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11785889, + "step": 14636, + "time_per_iteration": 2.4771904945373535 + }, + { + "auxiliary_loss_clip": 0.01116778, + "auxiliary_loss_mlp": 0.01037834, + "balance_loss_clip": 1.04157925, + "balance_loss_mlp": 1.02494788, + "epoch": 0.8800240493010671, + "flos": 22200479735040.0, + "grad_norm": 1.569513344691041, + "language_loss": 0.70432729, + "learning_rate": 1.4903432608852074e-07, + "loss": 0.72587335, + "num_input_tokens_seen": 315637895, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.12878418, + "step": 14637, + "time_per_iteration": 2.425128936767578 + }, + { + "auxiliary_loss_clip": 0.01119122, + "auxiliary_loss_mlp": 0.01029957, + "balance_loss_clip": 1.04746366, + "balance_loss_mlp": 1.0182265, + "epoch": 0.8800841725537352, + "flos": 14246086953600.0, + "grad_norm": 1.8909885255962338, + "language_loss": 0.65854633, + "learning_rate": 1.4888683717119843e-07, + "loss": 0.68003714, + "num_input_tokens_seen": 315655520, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11737061, + "step": 14638, + "time_per_iteration": 3.8713161945343018 + }, + { + "auxiliary_loss_clip": 0.01116311, + "auxiliary_loss_mlp": 0.01029464, + "balance_loss_clip": 1.04393315, + "balance_loss_mlp": 1.01785278, + "epoch": 0.8801442958064031, + "flos": 37415794348800.0, + "grad_norm": 1.729917080895641, + "language_loss": 0.58070558, + "learning_rate": 1.4873941844827286e-07, + "loss": 0.60216331, + "num_input_tokens_seen": 315678955, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11621094, + "step": 14639, + "time_per_iteration": 2.5849828720092773 + }, + { + "auxiliary_loss_clip": 0.01118342, + "auxiliary_loss_mlp": 0.01034059, + "balance_loss_clip": 1.04320455, + "balance_loss_mlp": 1.02182198, + "epoch": 0.8802044190590711, + "flos": 25047334010880.0, + "grad_norm": 1.5643378231133573, + "language_loss": 0.74342203, + "learning_rate": 1.4859206992533402e-07, + "loss": 0.76494598, + "num_input_tokens_seen": 315700360, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.12225342, + "step": 14640, + "time_per_iteration": 2.462829828262329 + }, + { + "auxiliary_loss_clip": 0.01119647, + "auxiliary_loss_mlp": 0.01036297, + "balance_loss_clip": 1.04660237, + "balance_loss_mlp": 1.02470958, + "epoch": 0.8802645423117391, + "flos": 24133874215680.0, + "grad_norm": 2.0071106392438653, + "language_loss": 0.69807744, + "learning_rate": 1.4844479160796985e-07, + "loss": 0.71963692, + "num_input_tokens_seen": 315719270, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.1159668, + "step": 14641, + "time_per_iteration": 2.507807493209839 + }, + { + "auxiliary_loss_clip": 0.01121064, + "auxiliary_loss_mlp": 0.01027191, + "balance_loss_clip": 1.04806185, + "balance_loss_mlp": 1.01464415, + "epoch": 0.880324665564407, + "flos": 17931203902080.0, + "grad_norm": 2.1830145630095896, + "language_loss": 0.85349452, + "learning_rate": 1.4829758350176457e-07, + "loss": 0.87497699, + "num_input_tokens_seen": 315737425, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.12554932, + "step": 14642, + "time_per_iteration": 2.454076051712036 + }, + { + "auxiliary_loss_clip": 0.01116939, + "auxiliary_loss_mlp": 0.0104349, + "balance_loss_clip": 1.04189062, + "balance_loss_mlp": 1.03018618, + "epoch": 0.880384788817075, + "flos": 21287630471040.0, + "grad_norm": 1.8493713925921988, + "language_loss": 0.78895032, + "learning_rate": 1.4815044561230038e-07, + "loss": 0.81055462, + "num_input_tokens_seen": 315755725, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.13305664, + "step": 14643, + "time_per_iteration": 2.4314868450164795 + }, + { + "auxiliary_loss_clip": 0.01118324, + "auxiliary_loss_mlp": 0.01025888, + "balance_loss_clip": 1.04892921, + "balance_loss_mlp": 1.01528478, + "epoch": 0.880444912069743, + "flos": 12458489777280.0, + "grad_norm": 1.6357651086770375, + "language_loss": 0.73053467, + "learning_rate": 1.4800337794515705e-07, + "loss": 0.75197679, + "num_input_tokens_seen": 315773835, + "router_z_loss_clip": 0.69433594, + "router_z_loss_mlp": 0.10614014, + "step": 14644, + "time_per_iteration": 2.447493076324463 + }, + { + "auxiliary_loss_clip": 0.01114105, + "auxiliary_loss_mlp": 0.01030942, + "balance_loss_clip": 1.03959274, + "balance_loss_mlp": 1.01703668, + "epoch": 0.880505035322411, + "flos": 13625945619840.0, + "grad_norm": 4.6261263219172415, + "language_loss": 0.7973634, + "learning_rate": 1.47856380505911e-07, + "loss": 0.81881386, + "num_input_tokens_seen": 315790615, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.13897705, + "step": 14645, + "time_per_iteration": 2.413637399673462 + }, + { + "auxiliary_loss_clip": 0.01112252, + "auxiliary_loss_mlp": 0.01030401, + "balance_loss_clip": 1.04277635, + "balance_loss_mlp": 1.0192132, + "epoch": 0.8805651585750789, + "flos": 23183067254400.0, + "grad_norm": 1.5701015322321756, + "language_loss": 0.64127338, + "learning_rate": 1.477094533001364e-07, + "loss": 0.66269994, + "num_input_tokens_seen": 315811010, + "router_z_loss_clip": 0.69433594, + "router_z_loss_mlp": 0.11181641, + "step": 14646, + "time_per_iteration": 2.4403624534606934 + }, + { + "auxiliary_loss_clip": 0.01119839, + "auxiliary_loss_mlp": 0.01031811, + "balance_loss_clip": 1.04528499, + "balance_loss_mlp": 1.01941895, + "epoch": 0.8806252818277469, + "flos": 14903000835840.0, + "grad_norm": 2.332692125709849, + "language_loss": 0.77721941, + "learning_rate": 1.475625963334055e-07, + "loss": 0.79873598, + "num_input_tokens_seen": 315828130, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.12390137, + "step": 14647, + "time_per_iteration": 2.463381052017212 + }, + { + "auxiliary_loss_clip": 0.01113119, + "auxiliary_loss_mlp": 0.0103223, + "balance_loss_clip": 1.04271078, + "balance_loss_mlp": 1.02101827, + "epoch": 0.8806854050804148, + "flos": 17639178330240.0, + "grad_norm": 2.1434485939934187, + "language_loss": 0.75397062, + "learning_rate": 1.4741580961128652e-07, + "loss": 0.77542412, + "num_input_tokens_seen": 315844900, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.11218262, + "step": 14648, + "time_per_iteration": 2.5243775844573975 + }, + { + "auxiliary_loss_clip": 0.01111329, + "auxiliary_loss_mlp": 0.01029249, + "balance_loss_clip": 1.03859127, + "balance_loss_mlp": 1.01775098, + "epoch": 0.8807455283330828, + "flos": 25332392344320.0, + "grad_norm": 1.774634463171863, + "language_loss": 0.6556232, + "learning_rate": 1.4726909313934522e-07, + "loss": 0.67702895, + "num_input_tokens_seen": 315863745, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11499023, + "step": 14649, + "time_per_iteration": 2.5177507400512695 + }, + { + "auxiliary_loss_clip": 0.01113163, + "auxiliary_loss_mlp": 0.01029362, + "balance_loss_clip": 1.0410732, + "balance_loss_mlp": 1.01776874, + "epoch": 0.8808056515857507, + "flos": 25265168040960.0, + "grad_norm": 1.3320270965616263, + "language_loss": 0.6252659, + "learning_rate": 1.4712244692314578e-07, + "loss": 0.64669114, + "num_input_tokens_seen": 315885765, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.1159668, + "step": 14650, + "time_per_iteration": 2.4852397441864014 + }, + { + "auxiliary_loss_clip": 0.01106444, + "auxiliary_loss_mlp": 0.01030518, + "balance_loss_clip": 1.03720033, + "balance_loss_mlp": 1.0196048, + "epoch": 0.8808657748384188, + "flos": 26578852151040.0, + "grad_norm": 1.5734262338564955, + "language_loss": 0.72844696, + "learning_rate": 1.4697587096824914e-07, + "loss": 0.74981654, + "num_input_tokens_seen": 315907340, + "router_z_loss_clip": 0.69335938, + "router_z_loss_mlp": 0.10913086, + "step": 14651, + "time_per_iteration": 2.5129165649414062 + }, + { + "auxiliary_loss_clip": 0.01115723, + "auxiliary_loss_mlp": 0.01036767, + "balance_loss_clip": 1.04234529, + "balance_loss_mlp": 1.02340364, + "epoch": 0.8809258980910867, + "flos": 18661231918080.0, + "grad_norm": 1.768926683065919, + "language_loss": 0.71748871, + "learning_rate": 1.4682936528021284e-07, + "loss": 0.73901367, + "num_input_tokens_seen": 315924935, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.13360596, + "step": 14652, + "time_per_iteration": 2.432433605194092 + }, + { + "auxiliary_loss_clip": 0.01109507, + "auxiliary_loss_mlp": 0.01031874, + "balance_loss_clip": 1.03838921, + "balance_loss_mlp": 1.02010822, + "epoch": 0.8809860213437547, + "flos": 19792274348160.0, + "grad_norm": 2.8855908732792273, + "language_loss": 0.74451298, + "learning_rate": 1.4668292986459286e-07, + "loss": 0.76592684, + "num_input_tokens_seen": 315943165, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.11755371, + "step": 14653, + "time_per_iteration": 2.438216209411621 + }, + { + "auxiliary_loss_clip": 0.01114716, + "auxiliary_loss_mlp": 0.01027203, + "balance_loss_clip": 1.04006839, + "balance_loss_mlp": 1.01488233, + "epoch": 0.8810461445964227, + "flos": 17894467267200.0, + "grad_norm": 2.036644621684332, + "language_loss": 0.7213726, + "learning_rate": 1.465365647269421e-07, + "loss": 0.74279177, + "num_input_tokens_seen": 315961340, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.12316895, + "step": 14654, + "time_per_iteration": 3.909174919128418 + }, + { + "auxiliary_loss_clip": 0.01111075, + "auxiliary_loss_mlp": 0.01031066, + "balance_loss_clip": 1.03994751, + "balance_loss_mlp": 1.01814973, + "epoch": 0.8811062678490906, + "flos": 29163917128320.0, + "grad_norm": 4.245362052062845, + "language_loss": 0.71651673, + "learning_rate": 1.4639026987281012e-07, + "loss": 0.73793805, + "num_input_tokens_seen": 315981335, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.12915039, + "step": 14655, + "time_per_iteration": 2.537886381149292 + }, + { + "auxiliary_loss_clip": 0.01118487, + "auxiliary_loss_mlp": 0.01028098, + "balance_loss_clip": 1.0466218, + "balance_loss_mlp": 1.01687443, + "epoch": 0.8811663911017587, + "flos": 20338834671360.0, + "grad_norm": 1.7412382163118445, + "language_loss": 0.81267571, + "learning_rate": 1.462440453077449e-07, + "loss": 0.83414161, + "num_input_tokens_seen": 316001325, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11236572, + "step": 14656, + "time_per_iteration": 2.5031814575195312 + }, + { + "auxiliary_loss_clip": 0.01117323, + "auxiliary_loss_mlp": 0.01030784, + "balance_loss_clip": 1.04573131, + "balance_loss_mlp": 1.01958466, + "epoch": 0.8812265143544266, + "flos": 25885704424320.0, + "grad_norm": 1.8687787607139206, + "language_loss": 0.68491852, + "learning_rate": 1.460978910372914e-07, + "loss": 0.70639956, + "num_input_tokens_seen": 316022540, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11199951, + "step": 14657, + "time_per_iteration": 2.4830267429351807 + }, + { + "auxiliary_loss_clip": 0.0111122, + "auxiliary_loss_mlp": 0.01035203, + "balance_loss_clip": 1.03830934, + "balance_loss_mlp": 1.0239141, + "epoch": 0.8812866376070946, + "flos": 27195509865600.0, + "grad_norm": 2.135526286753189, + "language_loss": 0.84428871, + "learning_rate": 1.4595180706699207e-07, + "loss": 0.86575294, + "num_input_tokens_seen": 316037735, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11291504, + "step": 14658, + "time_per_iteration": 2.4588942527770996 + }, + { + "auxiliary_loss_clip": 0.01115083, + "auxiliary_loss_mlp": 0.01033231, + "balance_loss_clip": 1.03931046, + "balance_loss_mlp": 1.02038574, + "epoch": 0.8813467608597625, + "flos": 23807194997760.0, + "grad_norm": 1.8832218211672174, + "language_loss": 0.77562284, + "learning_rate": 1.4580579340238554e-07, + "loss": 0.79710591, + "num_input_tokens_seen": 316058105, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.12841797, + "step": 14659, + "time_per_iteration": 2.4652512073516846 + }, + { + "auxiliary_loss_clip": 0.01111368, + "auxiliary_loss_mlp": 0.01032975, + "balance_loss_clip": 1.0403595, + "balance_loss_mlp": 1.02121496, + "epoch": 0.8814068841124305, + "flos": 21105455667840.0, + "grad_norm": 2.015953018232782, + "language_loss": 0.60370386, + "learning_rate": 1.4565985004900894e-07, + "loss": 0.62514728, + "num_input_tokens_seen": 316074415, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.11773682, + "step": 14660, + "time_per_iteration": 2.5004358291625977 + }, + { + "auxiliary_loss_clip": 0.01109496, + "auxiliary_loss_mlp": 0.01038396, + "balance_loss_clip": 1.03689623, + "balance_loss_mlp": 1.02353692, + "epoch": 0.8814670073650984, + "flos": 24716991605760.0, + "grad_norm": 1.9095833856732127, + "language_loss": 0.78153849, + "learning_rate": 1.455139770123972e-07, + "loss": 0.80301738, + "num_input_tokens_seen": 316094405, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.14849854, + "step": 14661, + "time_per_iteration": 2.4557363986968994 + }, + { + "auxiliary_loss_clip": 0.01113951, + "auxiliary_loss_mlp": 0.01034397, + "balance_loss_clip": 1.04167116, + "balance_loss_mlp": 1.02270317, + "epoch": 0.8815271306177664, + "flos": 22966274718720.0, + "grad_norm": 2.0026232054224145, + "language_loss": 0.76804876, + "learning_rate": 1.45368174298081e-07, + "loss": 0.7895323, + "num_input_tokens_seen": 316113390, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11694336, + "step": 14662, + "time_per_iteration": 2.4512813091278076 + }, + { + "auxiliary_loss_clip": 0.01112664, + "auxiliary_loss_mlp": 0.01024267, + "balance_loss_clip": 1.04275167, + "balance_loss_mlp": 1.01403832, + "epoch": 0.8815872538704344, + "flos": 19460064435840.0, + "grad_norm": 1.8154429514105577, + "language_loss": 0.74055731, + "learning_rate": 1.4522244191158929e-07, + "loss": 0.76192653, + "num_input_tokens_seen": 316131085, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.10229492, + "step": 14663, + "time_per_iteration": 2.4202845096588135 + }, + { + "auxiliary_loss_clip": 0.01111038, + "auxiliary_loss_mlp": 0.01030813, + "balance_loss_clip": 1.04081428, + "balance_loss_mlp": 1.01967287, + "epoch": 0.8816473771231024, + "flos": 32156604622080.0, + "grad_norm": 1.6149398518048907, + "language_loss": 0.70066965, + "learning_rate": 1.450767798584489e-07, + "loss": 0.7220881, + "num_input_tokens_seen": 316151440, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.1114502, + "step": 14664, + "time_per_iteration": 2.5438554286956787 + }, + { + "auxiliary_loss_clip": 0.01110656, + "auxiliary_loss_mlp": 0.0102596, + "balance_loss_clip": 1.04187894, + "balance_loss_mlp": 1.01587486, + "epoch": 0.8817075003757703, + "flos": 19682279925120.0, + "grad_norm": 1.7265758764865902, + "language_loss": 0.81262928, + "learning_rate": 1.449311881441828e-07, + "loss": 0.8339954, + "num_input_tokens_seen": 316170750, + "router_z_loss_clip": 0.68701172, + "router_z_loss_mlp": 0.10083008, + "step": 14665, + "time_per_iteration": 2.4534220695495605 + }, + { + "auxiliary_loss_clip": 0.0111851, + "auxiliary_loss_mlp": 0.01035818, + "balance_loss_clip": 1.04617941, + "balance_loss_mlp": 1.02464855, + "epoch": 0.8817676236284383, + "flos": 15668616251520.0, + "grad_norm": 1.9183415167231817, + "language_loss": 0.58577096, + "learning_rate": 1.447856667743117e-07, + "loss": 0.60731423, + "num_input_tokens_seen": 316187265, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11169434, + "step": 14666, + "time_per_iteration": 2.4219062328338623 + }, + { + "auxiliary_loss_clip": 0.01113132, + "auxiliary_loss_mlp": 0.01029604, + "balance_loss_clip": 1.04085684, + "balance_loss_mlp": 1.01674128, + "epoch": 0.8818277468811063, + "flos": 17895185539200.0, + "grad_norm": 1.8833194373297724, + "language_loss": 0.83714068, + "learning_rate": 1.4464021575435403e-07, + "loss": 0.85856801, + "num_input_tokens_seen": 316206555, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.12860107, + "step": 14667, + "time_per_iteration": 2.458235740661621 + }, + { + "auxiliary_loss_clip": 0.01117635, + "auxiliary_loss_mlp": 0.01035812, + "balance_loss_clip": 1.04637182, + "balance_loss_mlp": 1.0237, + "epoch": 0.8818878701337742, + "flos": 18770508069120.0, + "grad_norm": 1.921325080260823, + "language_loss": 0.623523, + "learning_rate": 1.4449483508982563e-07, + "loss": 0.6450575, + "num_input_tokens_seen": 316225210, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.12109375, + "step": 14668, + "time_per_iteration": 2.450928211212158 + }, + { + "auxiliary_loss_clip": 0.0110997, + "auxiliary_loss_mlp": 0.01027089, + "balance_loss_clip": 1.04081702, + "balance_loss_mlp": 1.01708102, + "epoch": 0.8819479933864423, + "flos": 17712292464000.0, + "grad_norm": 2.057227068436692, + "language_loss": 0.56879812, + "learning_rate": 1.4434952478623918e-07, + "loss": 0.59016871, + "num_input_tokens_seen": 316242685, + "router_z_loss_clip": 0.69091797, + "router_z_loss_mlp": 0.10003662, + "step": 14669, + "time_per_iteration": 3.8438162803649902 + }, + { + "auxiliary_loss_clip": 0.01110492, + "auxiliary_loss_mlp": 0.01032535, + "balance_loss_clip": 1.03824377, + "balance_loss_mlp": 1.02104926, + "epoch": 0.8820081166391102, + "flos": 11728749070080.0, + "grad_norm": 1.8950825855687532, + "language_loss": 0.71544278, + "learning_rate": 1.442042848491043e-07, + "loss": 0.73687303, + "num_input_tokens_seen": 316260935, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11486816, + "step": 14670, + "time_per_iteration": 2.5777206420898438 + }, + { + "auxiliary_loss_clip": 0.01118129, + "auxiliary_loss_mlp": 0.01027966, + "balance_loss_clip": 1.04511642, + "balance_loss_mlp": 1.0166229, + "epoch": 0.8820682398917782, + "flos": 27490372611840.0, + "grad_norm": 2.0108791625181546, + "language_loss": 0.738217, + "learning_rate": 1.44059115283929e-07, + "loss": 0.75967801, + "num_input_tokens_seen": 316281190, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11334229, + "step": 14671, + "time_per_iteration": 2.5003116130828857 + }, + { + "auxiliary_loss_clip": 0.01119152, + "auxiliary_loss_mlp": 0.01028667, + "balance_loss_clip": 1.0418222, + "balance_loss_mlp": 1.01623392, + "epoch": 0.8821283631444461, + "flos": 16873850223360.0, + "grad_norm": 2.679480272238084, + "language_loss": 0.8524102, + "learning_rate": 1.43914016096218e-07, + "loss": 0.87388837, + "num_input_tokens_seen": 316297115, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.12438965, + "step": 14672, + "time_per_iteration": 3.8962745666503906 + }, + { + "auxiliary_loss_clip": 0.01104406, + "auxiliary_loss_mlp": 0.01031562, + "balance_loss_clip": 1.03521681, + "balance_loss_mlp": 1.01988578, + "epoch": 0.8821884863971141, + "flos": 24280964409600.0, + "grad_norm": 1.5004550384380173, + "language_loss": 0.72793162, + "learning_rate": 1.4376898729147336e-07, + "loss": 0.7492913, + "num_input_tokens_seen": 316318235, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 0.11676025, + "step": 14673, + "time_per_iteration": 2.5036377906799316 + }, + { + "auxiliary_loss_clip": 0.01040647, + "auxiliary_loss_mlp": 0.01005552, + "balance_loss_clip": 1.01579487, + "balance_loss_mlp": 1.0041585, + "epoch": 0.882248609649782, + "flos": 59432342492160.0, + "grad_norm": 0.8008880173431357, + "language_loss": 0.49367642, + "learning_rate": 1.4362402887519487e-07, + "loss": 0.51413846, + "num_input_tokens_seen": 316384705, + "router_z_loss_clip": 0.2487793, + "router_z_loss_mlp": 0.01393127, + "step": 14674, + "time_per_iteration": 3.1552016735076904 + }, + { + "auxiliary_loss_clip": 0.01116614, + "auxiliary_loss_mlp": 0.01029362, + "balance_loss_clip": 1.04341865, + "balance_loss_mlp": 1.01766777, + "epoch": 0.88230873290245, + "flos": 19937784343680.0, + "grad_norm": 2.7660043861113275, + "language_loss": 0.76394308, + "learning_rate": 1.4347914085287971e-07, + "loss": 0.78540289, + "num_input_tokens_seen": 316401165, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11682129, + "step": 14675, + "time_per_iteration": 2.454692840576172 + }, + { + "auxiliary_loss_clip": 0.01116938, + "auxiliary_loss_mlp": 0.01033643, + "balance_loss_clip": 1.0465169, + "balance_loss_mlp": 1.02094698, + "epoch": 0.882368856155118, + "flos": 16362769559040.0, + "grad_norm": 2.0718464352058956, + "language_loss": 0.79636121, + "learning_rate": 1.4333432323002105e-07, + "loss": 0.81786692, + "num_input_tokens_seen": 316418780, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.1270752, + "step": 14676, + "time_per_iteration": 2.4132163524627686 + }, + { + "auxiliary_loss_clip": 0.01044265, + "auxiliary_loss_mlp": 0.01003957, + "balance_loss_clip": 1.02022958, + "balance_loss_mlp": 1.00254166, + "epoch": 0.882428979407786, + "flos": 70594563277440.0, + "grad_norm": 0.6957384057592393, + "language_loss": 0.54684979, + "learning_rate": 1.431895760121109e-07, + "loss": 0.56733203, + "num_input_tokens_seen": 316482030, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.0141449, + "step": 14677, + "time_per_iteration": 3.1881139278411865 + }, + { + "auxiliary_loss_clip": 0.01110812, + "auxiliary_loss_mlp": 0.01028051, + "balance_loss_clip": 1.04063547, + "balance_loss_mlp": 1.01660657, + "epoch": 0.8824891026604539, + "flos": 18150294908160.0, + "grad_norm": 2.2258412552352356, + "language_loss": 0.65692854, + "learning_rate": 1.4304489920463847e-07, + "loss": 0.67831713, + "num_input_tokens_seen": 316499175, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.11450195, + "step": 14678, + "time_per_iteration": 2.3963258266448975 + }, + { + "auxiliary_loss_clip": 0.0110959, + "auxiliary_loss_mlp": 0.01032549, + "balance_loss_clip": 1.03590262, + "balance_loss_mlp": 1.02060437, + "epoch": 0.8825492259131219, + "flos": 27232713377280.0, + "grad_norm": 2.1197552490188802, + "language_loss": 0.71355367, + "learning_rate": 1.4290029281308936e-07, + "loss": 0.7349751, + "num_input_tokens_seen": 316519495, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11956787, + "step": 14679, + "time_per_iteration": 2.50939679145813 + }, + { + "auxiliary_loss_clip": 0.01118804, + "auxiliary_loss_mlp": 0.01028084, + "balance_loss_clip": 1.0458796, + "balance_loss_mlp": 1.01759923, + "epoch": 0.8826093491657898, + "flos": 22274419881600.0, + "grad_norm": 1.6838298603938315, + "language_loss": 0.63695979, + "learning_rate": 1.4275575684294694e-07, + "loss": 0.65842867, + "num_input_tokens_seen": 316538180, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.10491943, + "step": 14680, + "time_per_iteration": 2.425908088684082 + }, + { + "auxiliary_loss_clip": 0.01113378, + "auxiliary_loss_mlp": 0.0103068, + "balance_loss_clip": 1.04284382, + "balance_loss_mlp": 1.01951003, + "epoch": 0.8826694724184578, + "flos": 14204753377920.0, + "grad_norm": 2.1680501794170706, + "language_loss": 0.77079725, + "learning_rate": 1.4261129129969328e-07, + "loss": 0.79223782, + "num_input_tokens_seen": 316551750, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.1116333, + "step": 14681, + "time_per_iteration": 3.8274898529052734 + }, + { + "auxiliary_loss_clip": 0.01114068, + "auxiliary_loss_mlp": 0.01027195, + "balance_loss_clip": 1.04027176, + "balance_loss_mlp": 1.01490426, + "epoch": 0.8827295956711259, + "flos": 20631686256000.0, + "grad_norm": 1.673630622709486, + "language_loss": 0.73132706, + "learning_rate": 1.424668961888047e-07, + "loss": 0.75273979, + "num_input_tokens_seen": 316570680, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.12304688, + "step": 14682, + "time_per_iteration": 2.4654901027679443 + }, + { + "auxiliary_loss_clip": 0.01118924, + "auxiliary_loss_mlp": 0.01029457, + "balance_loss_clip": 1.04367852, + "balance_loss_mlp": 1.01666594, + "epoch": 0.8827897189237938, + "flos": 18513064316160.0, + "grad_norm": 1.9201331494928038, + "language_loss": 0.74525845, + "learning_rate": 1.4232257151575765e-07, + "loss": 0.76674235, + "num_input_tokens_seen": 316588635, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12792969, + "step": 14683, + "time_per_iteration": 2.473090171813965 + }, + { + "auxiliary_loss_clip": 0.01108601, + "auxiliary_loss_mlp": 0.01027603, + "balance_loss_clip": 1.0375421, + "balance_loss_mlp": 1.01539016, + "epoch": 0.8828498421764618, + "flos": 22747399194240.0, + "grad_norm": 1.6595827843873476, + "language_loss": 0.65683496, + "learning_rate": 1.4217831728602492e-07, + "loss": 0.67819703, + "num_input_tokens_seen": 316607550, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.12207031, + "step": 14684, + "time_per_iteration": 2.537977933883667 + }, + { + "auxiliary_loss_clip": 0.01109374, + "auxiliary_loss_mlp": 0.01027014, + "balance_loss_clip": 1.03933692, + "balance_loss_mlp": 1.01536691, + "epoch": 0.8829099654291297, + "flos": 15012384727680.0, + "grad_norm": 1.7225467473575775, + "language_loss": 0.6972813, + "learning_rate": 1.4203413350507677e-07, + "loss": 0.71864516, + "num_input_tokens_seen": 316624460, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.11639404, + "step": 14685, + "time_per_iteration": 2.500075340270996 + }, + { + "auxiliary_loss_clip": 0.01116634, + "auxiliary_loss_mlp": 0.01039718, + "balance_loss_clip": 1.04271984, + "balance_loss_mlp": 1.0255146, + "epoch": 0.8829700886817977, + "flos": 16720546976640.0, + "grad_norm": 2.0801108415732528, + "language_loss": 0.74191606, + "learning_rate": 1.418900201783806e-07, + "loss": 0.76347959, + "num_input_tokens_seen": 316640765, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.14221191, + "step": 14686, + "time_per_iteration": 2.3993561267852783 + }, + { + "auxiliary_loss_clip": 0.01112452, + "auxiliary_loss_mlp": 0.0102652, + "balance_loss_clip": 1.04024374, + "balance_loss_mlp": 1.01493883, + "epoch": 0.8830302119344656, + "flos": 15263256291840.0, + "grad_norm": 1.7732760162464214, + "language_loss": 0.63099915, + "learning_rate": 1.417459773114007e-07, + "loss": 0.65238887, + "num_input_tokens_seen": 316656120, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11578369, + "step": 14687, + "time_per_iteration": 2.444310426712036 + }, + { + "auxiliary_loss_clip": 0.01118387, + "auxiliary_loss_mlp": 0.01028842, + "balance_loss_clip": 1.0435915, + "balance_loss_mlp": 1.01730239, + "epoch": 0.8830903351871336, + "flos": 28617751854720.0, + "grad_norm": 2.524654778333922, + "language_loss": 0.68825269, + "learning_rate": 1.4160200490959984e-07, + "loss": 0.70972496, + "num_input_tokens_seen": 316676095, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11541748, + "step": 14688, + "time_per_iteration": 2.472205638885498 + }, + { + "auxiliary_loss_clip": 0.01108928, + "auxiliary_loss_mlp": 0.01026242, + "balance_loss_clip": 1.04072356, + "balance_loss_mlp": 1.01483393, + "epoch": 0.8831504584398016, + "flos": 28001632844160.0, + "grad_norm": 1.5151076761793425, + "language_loss": 0.66984808, + "learning_rate": 1.4145810297843697e-07, + "loss": 0.69119978, + "num_input_tokens_seen": 316696235, + "router_z_loss_clip": 0.68212891, + "router_z_loss_mlp": 0.11413574, + "step": 14689, + "time_per_iteration": 2.507938861846924 + }, + { + "auxiliary_loss_clip": 0.0111365, + "auxiliary_loss_mlp": 0.01030279, + "balance_loss_clip": 1.04422438, + "balance_loss_mlp": 1.01902592, + "epoch": 0.8832105816924696, + "flos": 26579642250240.0, + "grad_norm": 1.364902958830705, + "language_loss": 0.74551511, + "learning_rate": 1.4131427152336905e-07, + "loss": 0.76695442, + "num_input_tokens_seen": 316719680, + "router_z_loss_clip": 0.69384766, + "router_z_loss_mlp": 0.11248779, + "step": 14690, + "time_per_iteration": 2.537278890609741 + }, + { + "auxiliary_loss_clip": 0.01114128, + "auxiliary_loss_mlp": 0.01037704, + "balance_loss_clip": 1.04031742, + "balance_loss_mlp": 1.0252049, + "epoch": 0.8832707049451375, + "flos": 24898771359360.0, + "grad_norm": 1.4684636833597555, + "language_loss": 0.73042721, + "learning_rate": 1.4117051054985018e-07, + "loss": 0.7519455, + "num_input_tokens_seen": 316739830, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.12493896, + "step": 14691, + "time_per_iteration": 2.4987576007843018 + }, + { + "auxiliary_loss_clip": 0.0111763, + "auxiliary_loss_mlp": 0.01028706, + "balance_loss_clip": 1.04205608, + "balance_loss_mlp": 1.01643395, + "epoch": 0.8833308281978055, + "flos": 15451141357440.0, + "grad_norm": 2.065263237643993, + "language_loss": 0.5245176, + "learning_rate": 1.4102682006333243e-07, + "loss": 0.54598099, + "num_input_tokens_seen": 316758105, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.12286377, + "step": 14692, + "time_per_iteration": 2.408217430114746 + }, + { + "auxiliary_loss_clip": 0.01115434, + "auxiliary_loss_mlp": 0.01032771, + "balance_loss_clip": 1.04370356, + "balance_loss_mlp": 1.02147627, + "epoch": 0.8833909514504734, + "flos": 20301523418880.0, + "grad_norm": 2.2431668978893495, + "language_loss": 0.61035752, + "learning_rate": 1.4088320006926346e-07, + "loss": 0.63183963, + "num_input_tokens_seen": 316777455, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11279297, + "step": 14693, + "time_per_iteration": 2.4767062664031982 + }, + { + "auxiliary_loss_clip": 0.01115385, + "auxiliary_loss_mlp": 0.0102463, + "balance_loss_clip": 1.04581285, + "balance_loss_mlp": 1.01422322, + "epoch": 0.8834510747031414, + "flos": 20374027021440.0, + "grad_norm": 1.563961497635509, + "language_loss": 0.75255346, + "learning_rate": 1.407396505730898e-07, + "loss": 0.77395356, + "num_input_tokens_seen": 316796300, + "router_z_loss_clip": 0.69580078, + "router_z_loss_mlp": 0.10406494, + "step": 14694, + "time_per_iteration": 2.450880289077759 + }, + { + "auxiliary_loss_clip": 0.01108372, + "auxiliary_loss_mlp": 0.01029515, + "balance_loss_clip": 1.03393674, + "balance_loss_mlp": 1.01861906, + "epoch": 0.8835111979558095, + "flos": 29752026508800.0, + "grad_norm": 1.6740747034206256, + "language_loss": 0.72955716, + "learning_rate": 1.4059617158025527e-07, + "loss": 0.75093597, + "num_input_tokens_seen": 316819090, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.10888672, + "step": 14695, + "time_per_iteration": 2.5437023639678955 + }, + { + "auxiliary_loss_clip": 0.0110461, + "auxiliary_loss_mlp": 0.01025184, + "balance_loss_clip": 1.03686094, + "balance_loss_mlp": 1.01425242, + "epoch": 0.8835713212084774, + "flos": 24134556574080.0, + "grad_norm": 1.6274625167127712, + "language_loss": 0.79770303, + "learning_rate": 1.404527630961998e-07, + "loss": 0.81900096, + "num_input_tokens_seen": 316839250, + "router_z_loss_clip": 0.67724609, + "router_z_loss_mlp": 0.10919189, + "step": 14696, + "time_per_iteration": 2.5622971057891846 + }, + { + "auxiliary_loss_clip": 0.01115736, + "auxiliary_loss_mlp": 0.01040335, + "balance_loss_clip": 1.04095006, + "balance_loss_mlp": 1.02855694, + "epoch": 0.8836314444611454, + "flos": 27672331933440.0, + "grad_norm": 1.5362575126904265, + "language_loss": 0.74791682, + "learning_rate": 1.4030942512636236e-07, + "loss": 0.76947749, + "num_input_tokens_seen": 316861315, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.11791992, + "step": 14697, + "time_per_iteration": 4.101220607757568 + }, + { + "auxiliary_loss_clip": 0.01116575, + "auxiliary_loss_mlp": 0.01028109, + "balance_loss_clip": 1.0439558, + "balance_loss_mlp": 1.01651597, + "epoch": 0.8836915677138133, + "flos": 16836969934080.0, + "grad_norm": 2.044130476617491, + "language_loss": 0.72196782, + "learning_rate": 1.401661576761779e-07, + "loss": 0.7434147, + "num_input_tokens_seen": 316879325, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11590576, + "step": 14698, + "time_per_iteration": 2.44862699508667 + }, + { + "auxiliary_loss_clip": 0.01041712, + "auxiliary_loss_mlp": 0.01000363, + "balance_loss_clip": 1.01718795, + "balance_loss_mlp": 0.99900532, + "epoch": 0.8837516909664813, + "flos": 69310540823040.0, + "grad_norm": 0.7989250395871589, + "language_loss": 0.53663301, + "learning_rate": 1.4002296075107856e-07, + "loss": 0.55705369, + "num_input_tokens_seen": 316936425, + "router_z_loss_clip": 0.24560547, + "router_z_loss_mlp": 0.01358032, + "step": 14699, + "time_per_iteration": 3.100475549697876 + }, + { + "auxiliary_loss_clip": 0.01112888, + "auxiliary_loss_mlp": 0.01027741, + "balance_loss_clip": 1.03846335, + "balance_loss_mlp": 1.01560533, + "epoch": 0.8838118142191492, + "flos": 21324726241920.0, + "grad_norm": 1.857562595554051, + "language_loss": 0.77417493, + "learning_rate": 1.3987983435649508e-07, + "loss": 0.79558122, + "num_input_tokens_seen": 316956360, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.12133789, + "step": 14700, + "time_per_iteration": 2.545316457748413 + }, + { + "auxiliary_loss_clip": 0.01108065, + "auxiliary_loss_mlp": 0.01028589, + "balance_loss_clip": 1.03882062, + "balance_loss_mlp": 1.01707971, + "epoch": 0.8838719374718172, + "flos": 21470559459840.0, + "grad_norm": 1.8384284926831407, + "language_loss": 0.73216552, + "learning_rate": 1.3973677849785494e-07, + "loss": 0.75353205, + "num_input_tokens_seen": 316975295, + "router_z_loss_clip": 0.69238281, + "router_z_loss_mlp": 0.1151123, + "step": 14701, + "time_per_iteration": 2.4515323638916016 + }, + { + "auxiliary_loss_clip": 0.01117666, + "auxiliary_loss_mlp": 0.01029493, + "balance_loss_clip": 1.04201055, + "balance_loss_mlp": 1.01626694, + "epoch": 0.8839320607244852, + "flos": 26468929555200.0, + "grad_norm": 1.6957727857361078, + "language_loss": 0.70979989, + "learning_rate": 1.3959379318058262e-07, + "loss": 0.73127151, + "num_input_tokens_seen": 316994520, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.13238525, + "step": 14702, + "time_per_iteration": 2.489982843399048 + }, + { + "auxiliary_loss_clip": 0.01120288, + "auxiliary_loss_mlp": 0.01036841, + "balance_loss_clip": 1.04440093, + "balance_loss_mlp": 1.02486598, + "epoch": 0.8839921839771532, + "flos": 45222270923520.0, + "grad_norm": 1.7896374436155629, + "language_loss": 0.71800339, + "learning_rate": 1.3945087841010006e-07, + "loss": 0.73957467, + "num_input_tokens_seen": 317018095, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.11981201, + "step": 14703, + "time_per_iteration": 2.6901423931121826 + }, + { + "auxiliary_loss_clip": 0.01113852, + "auxiliary_loss_mlp": 0.01028027, + "balance_loss_clip": 1.04407144, + "balance_loss_mlp": 1.01761997, + "epoch": 0.8840523072298211, + "flos": 20006876154240.0, + "grad_norm": 1.8305519827052787, + "language_loss": 0.66625887, + "learning_rate": 1.3930803419182645e-07, + "loss": 0.68767762, + "num_input_tokens_seen": 317035755, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.10406494, + "step": 14704, + "time_per_iteration": 2.477283239364624 + }, + { + "auxiliary_loss_clip": 0.01110701, + "auxiliary_loss_mlp": 0.01026616, + "balance_loss_clip": 1.04169786, + "balance_loss_mlp": 1.01582801, + "epoch": 0.8841124304824891, + "flos": 24426007528320.0, + "grad_norm": 1.5854470696523868, + "language_loss": 0.70746565, + "learning_rate": 1.3916526053117905e-07, + "loss": 0.7288388, + "num_input_tokens_seen": 317055765, + "router_z_loss_clip": 0.69091797, + "router_z_loss_mlp": 0.10778809, + "step": 14705, + "time_per_iteration": 2.538121223449707 + }, + { + "auxiliary_loss_clip": 0.01111323, + "auxiliary_loss_mlp": 0.01028978, + "balance_loss_clip": 1.04120755, + "balance_loss_mlp": 1.01877379, + "epoch": 0.884172553735157, + "flos": 31284622056960.0, + "grad_norm": 1.8164651626738162, + "language_loss": 0.71212882, + "learning_rate": 1.3902255743357104e-07, + "loss": 0.73353183, + "num_input_tokens_seen": 317077955, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.10205078, + "step": 14706, + "time_per_iteration": 2.586350202560425 + }, + { + "auxiliary_loss_clip": 0.01113644, + "auxiliary_loss_mlp": 0.01028341, + "balance_loss_clip": 1.04023767, + "balance_loss_mlp": 1.01698637, + "epoch": 0.884232676987825, + "flos": 21391160446080.0, + "grad_norm": 1.6251379057408115, + "language_loss": 0.74410057, + "learning_rate": 1.3887992490441413e-07, + "loss": 0.76552045, + "num_input_tokens_seen": 317095825, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11346436, + "step": 14707, + "time_per_iteration": 2.446854829788208 + }, + { + "auxiliary_loss_clip": 0.01039023, + "auxiliary_loss_mlp": 0.01003499, + "balance_loss_clip": 1.01450491, + "balance_loss_mlp": 1.00212216, + "epoch": 0.8842928002404931, + "flos": 57911451799680.0, + "grad_norm": 0.7980154365508509, + "language_loss": 0.60445446, + "learning_rate": 1.387373629491173e-07, + "loss": 0.62487972, + "num_input_tokens_seen": 317152875, + "router_z_loss_clip": 0.24536133, + "router_z_loss_mlp": 0.01377869, + "step": 14708, + "time_per_iteration": 2.962139844894409 + }, + { + "auxiliary_loss_clip": 0.01109182, + "auxiliary_loss_mlp": 0.01028061, + "balance_loss_clip": 1.04130363, + "balance_loss_mlp": 1.01752353, + "epoch": 0.884352923493161, + "flos": 41463896186880.0, + "grad_norm": 2.2740783025021747, + "language_loss": 0.67265606, + "learning_rate": 1.3859487157308625e-07, + "loss": 0.6940285, + "num_input_tokens_seen": 317176725, + "router_z_loss_clip": 0.67919922, + "router_z_loss_mlp": 0.10540771, + "step": 14709, + "time_per_iteration": 2.646604061126709 + }, + { + "auxiliary_loss_clip": 0.01124139, + "auxiliary_loss_mlp": 0.01037989, + "balance_loss_clip": 1.04438972, + "balance_loss_mlp": 1.02447069, + "epoch": 0.884413046745829, + "flos": 46541234332800.0, + "grad_norm": 1.6254115543296341, + "language_loss": 0.62697208, + "learning_rate": 1.3845245078172373e-07, + "loss": 0.64859337, + "num_input_tokens_seen": 317206880, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.13519287, + "step": 14710, + "time_per_iteration": 2.862993001937866 + }, + { + "auxiliary_loss_clip": 0.01111679, + "auxiliary_loss_mlp": 0.0102382, + "balance_loss_clip": 1.04250169, + "balance_loss_mlp": 1.01342511, + "epoch": 0.8844731699984969, + "flos": 19135324552320.0, + "grad_norm": 5.527394823448021, + "language_loss": 0.63443446, + "learning_rate": 1.38310100580431e-07, + "loss": 0.65578949, + "num_input_tokens_seen": 317224135, + "router_z_loss_clip": 0.69091797, + "router_z_loss_mlp": 0.10394287, + "step": 14711, + "time_per_iteration": 2.503634452819824 + }, + { + "auxiliary_loss_clip": 0.01120054, + "auxiliary_loss_mlp": 0.01030914, + "balance_loss_clip": 1.04499102, + "balance_loss_mlp": 1.01880789, + "epoch": 0.8845332932511649, + "flos": 23260634674560.0, + "grad_norm": 1.8877366085833442, + "language_loss": 0.76039004, + "learning_rate": 1.38167820974606e-07, + "loss": 0.78189969, + "num_input_tokens_seen": 317244505, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.12109375, + "step": 14712, + "time_per_iteration": 4.043498516082764 + }, + { + "auxiliary_loss_clip": 0.01107534, + "auxiliary_loss_mlp": 0.01024086, + "balance_loss_clip": 1.03606427, + "balance_loss_mlp": 1.01246929, + "epoch": 0.8845934165038328, + "flos": 17564591738880.0, + "grad_norm": 2.473441885707561, + "language_loss": 0.81266934, + "learning_rate": 1.3802561196964368e-07, + "loss": 0.83398557, + "num_input_tokens_seen": 317257830, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11627197, + "step": 14713, + "time_per_iteration": 2.4588534832000732 + }, + { + "auxiliary_loss_clip": 0.01115841, + "auxiliary_loss_mlp": 0.01029874, + "balance_loss_clip": 1.04221511, + "balance_loss_mlp": 1.01730967, + "epoch": 0.8846535397565009, + "flos": 27485739757440.0, + "grad_norm": 2.3018677635810496, + "language_loss": 0.55806166, + "learning_rate": 1.3788347357093688e-07, + "loss": 0.5795188, + "num_input_tokens_seen": 317278430, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.12579346, + "step": 14714, + "time_per_iteration": 2.5168535709381104 + }, + { + "auxiliary_loss_clip": 0.01116482, + "auxiliary_loss_mlp": 0.01035043, + "balance_loss_clip": 1.04315996, + "balance_loss_mlp": 1.02213871, + "epoch": 0.8847136630091688, + "flos": 28761430256640.0, + "grad_norm": 1.9655985299247216, + "language_loss": 0.73999476, + "learning_rate": 1.377414057838755e-07, + "loss": 0.76151001, + "num_input_tokens_seen": 317295970, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.12915039, + "step": 14715, + "time_per_iteration": 2.571500062942505 + }, + { + "auxiliary_loss_clip": 0.01108814, + "auxiliary_loss_mlp": 0.01035726, + "balance_loss_clip": 1.03722191, + "balance_loss_mlp": 1.02304769, + "epoch": 0.8847737862618368, + "flos": 23476924419840.0, + "grad_norm": 1.5987472562094345, + "language_loss": 0.75164926, + "learning_rate": 1.375994086138461e-07, + "loss": 0.77309465, + "num_input_tokens_seen": 317316185, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.12670898, + "step": 14716, + "time_per_iteration": 3.941624402999878 + }, + { + "auxiliary_loss_clip": 0.01111952, + "auxiliary_loss_mlp": 0.01033633, + "balance_loss_clip": 1.0418762, + "balance_loss_mlp": 1.02239132, + "epoch": 0.8848339095145047, + "flos": 18660872782080.0, + "grad_norm": 2.288365555116427, + "language_loss": 0.71200764, + "learning_rate": 1.3745748206623397e-07, + "loss": 0.73346341, + "num_input_tokens_seen": 317333275, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.11242676, + "step": 14717, + "time_per_iteration": 2.4609971046447754 + }, + { + "auxiliary_loss_clip": 0.01111119, + "auxiliary_loss_mlp": 0.01035147, + "balance_loss_clip": 1.04167461, + "balance_loss_mlp": 1.02358973, + "epoch": 0.8848940327671727, + "flos": 32270298145920.0, + "grad_norm": 1.9703202829801, + "language_loss": 0.74010175, + "learning_rate": 1.373156261464208e-07, + "loss": 0.76156443, + "num_input_tokens_seen": 317351245, + "router_z_loss_clip": 0.69384766, + "router_z_loss_mlp": 0.11553955, + "step": 14718, + "time_per_iteration": 2.610673189163208 + }, + { + "auxiliary_loss_clip": 0.01115102, + "auxiliary_loss_mlp": 0.01029865, + "balance_loss_clip": 1.04172444, + "balance_loss_mlp": 1.01806355, + "epoch": 0.8849541560198406, + "flos": 24021832717440.0, + "grad_norm": 1.8227495963040146, + "language_loss": 0.78508085, + "learning_rate": 1.3717384085978602e-07, + "loss": 0.80653059, + "num_input_tokens_seen": 317370740, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11804199, + "step": 14719, + "time_per_iteration": 2.5437140464782715 + }, + { + "auxiliary_loss_clip": 0.01116713, + "auxiliary_loss_mlp": 0.01023958, + "balance_loss_clip": 1.04428327, + "balance_loss_mlp": 1.01257348, + "epoch": 0.8850142792725086, + "flos": 16873060124160.0, + "grad_norm": 1.6438201110921626, + "language_loss": 0.72127634, + "learning_rate": 1.3703212621170579e-07, + "loss": 0.74268305, + "num_input_tokens_seen": 317388370, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.1138916, + "step": 14720, + "time_per_iteration": 2.4600424766540527 + }, + { + "auxiliary_loss_clip": 0.01115149, + "auxiliary_loss_mlp": 0.01028916, + "balance_loss_clip": 1.04019761, + "balance_loss_mlp": 1.01598775, + "epoch": 0.8850744025251767, + "flos": 24024059360640.0, + "grad_norm": 1.8002853693913465, + "language_loss": 0.82527757, + "learning_rate": 1.3689048220755383e-07, + "loss": 0.84671819, + "num_input_tokens_seen": 317407390, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.12927246, + "step": 14721, + "time_per_iteration": 2.5363733768463135 + }, + { + "auxiliary_loss_clip": 0.01108256, + "auxiliary_loss_mlp": 0.01031245, + "balance_loss_clip": 1.03533053, + "balance_loss_mlp": 1.01890135, + "epoch": 0.8851345257778446, + "flos": 47955575329920.0, + "grad_norm": 2.3914362707983607, + "language_loss": 0.62362003, + "learning_rate": 1.3674890885270186e-07, + "loss": 0.645015, + "num_input_tokens_seen": 317430825, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.12341309, + "step": 14722, + "time_per_iteration": 2.670551300048828 + }, + { + "auxiliary_loss_clip": 0.01115225, + "auxiliary_loss_mlp": 0.01026661, + "balance_loss_clip": 1.03954375, + "balance_loss_mlp": 1.0150317, + "epoch": 0.8851946490305126, + "flos": 36611000173440.0, + "grad_norm": 1.9525108669677083, + "language_loss": 0.68860269, + "learning_rate": 1.3660740615251754e-07, + "loss": 0.71002156, + "num_input_tokens_seen": 317451905, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.11633301, + "step": 14723, + "time_per_iteration": 2.5927891731262207 + }, + { + "auxiliary_loss_clip": 0.01115618, + "auxiliary_loss_mlp": 0.01029818, + "balance_loss_clip": 1.0433023, + "balance_loss_mlp": 1.01846945, + "epoch": 0.8852547722831805, + "flos": 21544248211200.0, + "grad_norm": 1.8556816145544033, + "language_loss": 0.77794468, + "learning_rate": 1.3646597411236703e-07, + "loss": 0.79939914, + "num_input_tokens_seen": 317470030, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11346436, + "step": 14724, + "time_per_iteration": 3.909376621246338 + }, + { + "auxiliary_loss_clip": 0.01042746, + "auxiliary_loss_mlp": 0.01001225, + "balance_loss_clip": 1.01796925, + "balance_loss_mlp": 0.99989301, + "epoch": 0.8853148955358485, + "flos": 63059246472960.0, + "grad_norm": 0.8045224489241203, + "language_loss": 0.58886009, + "learning_rate": 1.363246127376143e-07, + "loss": 0.60929978, + "num_input_tokens_seen": 317527460, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.01332092, + "step": 14725, + "time_per_iteration": 2.9835762977600098 + }, + { + "auxiliary_loss_clip": 0.01116138, + "auxiliary_loss_mlp": 0.01038769, + "balance_loss_clip": 1.0400393, + "balance_loss_mlp": 1.02662194, + "epoch": 0.8853750187885164, + "flos": 18149828031360.0, + "grad_norm": 2.1774535368967394, + "language_loss": 0.68868232, + "learning_rate": 1.3618332203361837e-07, + "loss": 0.71023136, + "num_input_tokens_seen": 317544070, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.121521, + "step": 14726, + "time_per_iteration": 2.4053406715393066 + }, + { + "auxiliary_loss_clip": 0.01115872, + "auxiliary_loss_mlp": 0.01032542, + "balance_loss_clip": 1.04650021, + "balance_loss_mlp": 1.02057958, + "epoch": 0.8854351420411845, + "flos": 39570542392320.0, + "grad_norm": 1.371223404787945, + "language_loss": 0.69698036, + "learning_rate": 1.3604210200573785e-07, + "loss": 0.71846449, + "num_input_tokens_seen": 317570275, + "router_z_loss_clip": 0.69433594, + "router_z_loss_mlp": 0.11956787, + "step": 14727, + "time_per_iteration": 2.630146026611328 + }, + { + "auxiliary_loss_clip": 0.01119854, + "auxiliary_loss_mlp": 0.01037842, + "balance_loss_clip": 1.04609561, + "balance_loss_mlp": 1.02512836, + "epoch": 0.8854952652938524, + "flos": 23769309127680.0, + "grad_norm": 1.6795666243178728, + "language_loss": 0.70163274, + "learning_rate": 1.3590095265932733e-07, + "loss": 0.72320968, + "num_input_tokens_seen": 317590160, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.1270752, + "step": 14728, + "time_per_iteration": 2.442107677459717 + }, + { + "auxiliary_loss_clip": 0.01122751, + "auxiliary_loss_mlp": 0.01028429, + "balance_loss_clip": 1.04838836, + "balance_loss_mlp": 1.01732492, + "epoch": 0.8855553885465204, + "flos": 18290310122880.0, + "grad_norm": 2.1191188293434187, + "language_loss": 0.66092575, + "learning_rate": 1.3575987399973987e-07, + "loss": 0.68243754, + "num_input_tokens_seen": 317608340, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.11102295, + "step": 14729, + "time_per_iteration": 2.450122117996216 + }, + { + "auxiliary_loss_clip": 0.01117923, + "auxiliary_loss_mlp": 0.01034143, + "balance_loss_clip": 1.0474124, + "balance_loss_mlp": 1.02326512, + "epoch": 0.8856155117991883, + "flos": 36867402432000.0, + "grad_norm": 1.6126853121499072, + "language_loss": 0.63389379, + "learning_rate": 1.3561886603232453e-07, + "loss": 0.65541446, + "num_input_tokens_seen": 317629910, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.10882568, + "step": 14730, + "time_per_iteration": 2.6669883728027344 + }, + { + "auxiliary_loss_clip": 0.01109251, + "auxiliary_loss_mlp": 0.01031586, + "balance_loss_clip": 1.03975379, + "balance_loss_mlp": 1.01949799, + "epoch": 0.8856756350518563, + "flos": 22163886754560.0, + "grad_norm": 1.599058562993226, + "language_loss": 0.79190886, + "learning_rate": 1.3547792876242904e-07, + "loss": 0.81331724, + "num_input_tokens_seen": 317650265, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.12084961, + "step": 14731, + "time_per_iteration": 2.5373599529266357 + }, + { + "auxiliary_loss_clip": 0.01114268, + "auxiliary_loss_mlp": 0.01033324, + "balance_loss_clip": 1.04206181, + "balance_loss_mlp": 1.02174258, + "epoch": 0.8857357583045242, + "flos": 20740962407040.0, + "grad_norm": 1.6361116200186852, + "language_loss": 0.83265841, + "learning_rate": 1.3533706219539708e-07, + "loss": 0.85413432, + "num_input_tokens_seen": 317669045, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11566162, + "step": 14732, + "time_per_iteration": 2.47689151763916 + }, + { + "auxiliary_loss_clip": 0.01041498, + "auxiliary_loss_mlp": 0.01008563, + "balance_loss_clip": 1.0168612, + "balance_loss_mlp": 1.00718176, + "epoch": 0.8857958815571922, + "flos": 69892329409920.0, + "grad_norm": 0.8902633649949842, + "language_loss": 0.59918213, + "learning_rate": 1.3519626633657045e-07, + "loss": 0.61968267, + "num_input_tokens_seen": 317728065, + "router_z_loss_clip": 0.24658203, + "router_z_loss_mlp": 0.0138092, + "step": 14733, + "time_per_iteration": 3.1139461994171143 + }, + { + "auxiliary_loss_clip": 0.01111946, + "auxiliary_loss_mlp": 0.01035528, + "balance_loss_clip": 1.04017329, + "balance_loss_mlp": 1.0237205, + "epoch": 0.8858560048098603, + "flos": 15121948187520.0, + "grad_norm": 1.732420024811169, + "language_loss": 0.66493213, + "learning_rate": 1.3505554119128838e-07, + "loss": 0.68640685, + "num_input_tokens_seen": 317746120, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11804199, + "step": 14734, + "time_per_iteration": 2.476513624191284 + }, + { + "auxiliary_loss_clip": 0.0112054, + "auxiliary_loss_mlp": 0.01037741, + "balance_loss_clip": 1.04736197, + "balance_loss_mlp": 1.02634454, + "epoch": 0.8859161280625282, + "flos": 16611019430400.0, + "grad_norm": 1.9244264991124596, + "language_loss": 0.75521326, + "learning_rate": 1.3491488676488682e-07, + "loss": 0.77679598, + "num_input_tokens_seen": 317762280, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11395264, + "step": 14735, + "time_per_iteration": 2.4713292121887207 + }, + { + "auxiliary_loss_clip": 0.01121349, + "auxiliary_loss_mlp": 0.01031999, + "balance_loss_clip": 1.04461372, + "balance_loss_mlp": 1.02025676, + "epoch": 0.8859762513151962, + "flos": 18694484933760.0, + "grad_norm": 1.8192080450284855, + "language_loss": 0.70543116, + "learning_rate": 1.3477430306270066e-07, + "loss": 0.72696459, + "num_input_tokens_seen": 317780615, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.11730957, + "step": 14736, + "time_per_iteration": 2.4412431716918945 + }, + { + "auxiliary_loss_clip": 0.01116318, + "auxiliary_loss_mlp": 0.01028957, + "balance_loss_clip": 1.04298925, + "balance_loss_mlp": 1.01735795, + "epoch": 0.8860363745678641, + "flos": 19536877670400.0, + "grad_norm": 1.8370081455662606, + "language_loss": 0.84172451, + "learning_rate": 1.3463379009005892e-07, + "loss": 0.86317718, + "num_input_tokens_seen": 317798830, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11602783, + "step": 14737, + "time_per_iteration": 2.5653018951416016 + }, + { + "auxiliary_loss_clip": 0.01126614, + "auxiliary_loss_mlp": 0.01031136, + "balance_loss_clip": 1.04662418, + "balance_loss_mlp": 1.01764786, + "epoch": 0.8860964978205321, + "flos": 35954912304000.0, + "grad_norm": 4.15215897752117, + "language_loss": 0.68177247, + "learning_rate": 1.3449334785229093e-07, + "loss": 0.70334995, + "num_input_tokens_seen": 317819235, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.1350708, + "step": 14738, + "time_per_iteration": 2.557739019393921 + }, + { + "auxiliary_loss_clip": 0.01121979, + "auxiliary_loss_mlp": 0.01027342, + "balance_loss_clip": 1.04449821, + "balance_loss_mlp": 1.01506388, + "epoch": 0.8861566210732, + "flos": 21212577002880.0, + "grad_norm": 1.6530046770522253, + "language_loss": 0.75163269, + "learning_rate": 1.343529763547222e-07, + "loss": 0.77312595, + "num_input_tokens_seen": 317836785, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.12286377, + "step": 14739, + "time_per_iteration": 2.479079008102417 + }, + { + "auxiliary_loss_clip": 0.01115391, + "auxiliary_loss_mlp": 0.01031056, + "balance_loss_clip": 1.04482353, + "balance_loss_mlp": 1.01971889, + "epoch": 0.886216744325868, + "flos": 14609071843200.0, + "grad_norm": 4.96410817072419, + "language_loss": 0.87398869, + "learning_rate": 1.3421267560267559e-07, + "loss": 0.8954531, + "num_input_tokens_seen": 317854225, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.11352539, + "step": 14740, + "time_per_iteration": 2.4419608116149902 + }, + { + "auxiliary_loss_clip": 0.01113333, + "auxiliary_loss_mlp": 0.01028567, + "balance_loss_clip": 1.0421257, + "balance_loss_mlp": 1.01683712, + "epoch": 0.886276867578536, + "flos": 26651643062400.0, + "grad_norm": 1.8269089324373677, + "language_loss": 0.6329571, + "learning_rate": 1.34072445601471e-07, + "loss": 0.65437615, + "num_input_tokens_seen": 317874865, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.11737061, + "step": 14741, + "time_per_iteration": 3.9725825786590576 + }, + { + "auxiliary_loss_clip": 0.01119141, + "auxiliary_loss_mlp": 0.01028571, + "balance_loss_clip": 1.04688382, + "balance_loss_mlp": 1.01721036, + "epoch": 0.886336990831204, + "flos": 16764071281920.0, + "grad_norm": 1.7567407706743818, + "language_loss": 0.72686571, + "learning_rate": 1.3393228635642717e-07, + "loss": 0.74834281, + "num_input_tokens_seen": 317892830, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11352539, + "step": 14742, + "time_per_iteration": 2.4919486045837402 + }, + { + "auxiliary_loss_clip": 0.01115665, + "auxiliary_loss_mlp": 0.01032077, + "balance_loss_clip": 1.04504001, + "balance_loss_mlp": 1.02056718, + "epoch": 0.8863971140838719, + "flos": 25265275781760.0, + "grad_norm": 1.8344427426487868, + "language_loss": 0.59275174, + "learning_rate": 1.3379219787285733e-07, + "loss": 0.6142292, + "num_input_tokens_seen": 317911780, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.1151123, + "step": 14743, + "time_per_iteration": 2.49543833732605 + }, + { + "auxiliary_loss_clip": 0.0111837, + "auxiliary_loss_mlp": 0.01035131, + "balance_loss_clip": 1.04362226, + "balance_loss_mlp": 1.02194667, + "epoch": 0.8864572373365399, + "flos": 23404313076480.0, + "grad_norm": 3.354516559370379, + "language_loss": 0.59915316, + "learning_rate": 1.3365218015607437e-07, + "loss": 0.6206882, + "num_input_tokens_seen": 317932855, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.13201904, + "step": 14744, + "time_per_iteration": 2.5014121532440186 + }, + { + "auxiliary_loss_clip": 0.01114487, + "auxiliary_loss_mlp": 0.01029617, + "balance_loss_clip": 1.04316723, + "balance_loss_mlp": 1.01783919, + "epoch": 0.8865173605892078, + "flos": 18548759456640.0, + "grad_norm": 1.7988908020815886, + "language_loss": 0.76746404, + "learning_rate": 1.3351223321138762e-07, + "loss": 0.78890502, + "num_input_tokens_seen": 317952090, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.11785889, + "step": 14745, + "time_per_iteration": 2.4562606811523438 + }, + { + "auxiliary_loss_clip": 0.01109036, + "auxiliary_loss_mlp": 0.01033153, + "balance_loss_clip": 1.03952229, + "balance_loss_mlp": 1.0218637, + "epoch": 0.8865774838418758, + "flos": 19025868833280.0, + "grad_norm": 2.8072637994793563, + "language_loss": 0.774665, + "learning_rate": 1.3337235704410454e-07, + "loss": 0.79608685, + "num_input_tokens_seen": 317970370, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.11303711, + "step": 14746, + "time_per_iteration": 2.6009933948516846 + }, + { + "auxiliary_loss_clip": 0.0111502, + "auxiliary_loss_mlp": 0.01032767, + "balance_loss_clip": 1.04193926, + "balance_loss_mlp": 1.02059555, + "epoch": 0.8866376070945439, + "flos": 22163168482560.0, + "grad_norm": 2.572377288963187, + "language_loss": 0.76639938, + "learning_rate": 1.3323255165952873e-07, + "loss": 0.78787726, + "num_input_tokens_seen": 317989125, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.12164307, + "step": 14747, + "time_per_iteration": 2.478247880935669 + }, + { + "auxiliary_loss_clip": 0.01111765, + "auxiliary_loss_mlp": 0.01025166, + "balance_loss_clip": 1.0425967, + "balance_loss_mlp": 1.01409793, + "epoch": 0.8866977303472118, + "flos": 20704261685760.0, + "grad_norm": 1.7089339354332602, + "language_loss": 0.82585931, + "learning_rate": 1.3309281706296127e-07, + "loss": 0.84722859, + "num_input_tokens_seen": 318007820, + "router_z_loss_clip": 0.69189453, + "router_z_loss_mlp": 0.11065674, + "step": 14748, + "time_per_iteration": 2.4632771015167236 + }, + { + "auxiliary_loss_clip": 0.01120197, + "auxiliary_loss_mlp": 0.01032982, + "balance_loss_clip": 1.04678655, + "balance_loss_mlp": 1.02080488, + "epoch": 0.8867578535998798, + "flos": 48794448533760.0, + "grad_norm": 8.356081751606705, + "language_loss": 0.77143788, + "learning_rate": 1.3295315325970148e-07, + "loss": 0.79296964, + "num_input_tokens_seen": 318030435, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.12182617, + "step": 14749, + "time_per_iteration": 2.6604456901550293 + }, + { + "auxiliary_loss_clip": 0.01115915, + "auxiliary_loss_mlp": 0.01030066, + "balance_loss_clip": 1.03896713, + "balance_loss_mlp": 1.01736379, + "epoch": 0.8868179768525477, + "flos": 21105312013440.0, + "grad_norm": 1.9350412901829646, + "language_loss": 0.69858801, + "learning_rate": 1.328135602550451e-07, + "loss": 0.72004783, + "num_input_tokens_seen": 318049465, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.1270752, + "step": 14750, + "time_per_iteration": 2.4837608337402344 + }, + { + "auxiliary_loss_clip": 0.01105466, + "auxiliary_loss_mlp": 0.0103295, + "balance_loss_clip": 1.03511095, + "balance_loss_mlp": 1.0214994, + "epoch": 0.8868781001052157, + "flos": 21830922656640.0, + "grad_norm": 3.723426986666637, + "language_loss": 0.59406257, + "learning_rate": 1.3267403805428546e-07, + "loss": 0.61544669, + "num_input_tokens_seen": 318067760, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.11444092, + "step": 14751, + "time_per_iteration": 2.5028038024902344 + }, + { + "auxiliary_loss_clip": 0.01112663, + "auxiliary_loss_mlp": 0.0103424, + "balance_loss_clip": 1.04166198, + "balance_loss_mlp": 1.0208652, + "epoch": 0.8869382233578836, + "flos": 13516418073600.0, + "grad_norm": 3.6815107602966637, + "language_loss": 0.81444627, + "learning_rate": 1.3253458666271344e-07, + "loss": 0.83591533, + "num_input_tokens_seen": 318082785, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.1338501, + "step": 14752, + "time_per_iteration": 2.452829122543335 + }, + { + "auxiliary_loss_clip": 0.01124145, + "auxiliary_loss_mlp": 0.01032997, + "balance_loss_clip": 1.04645562, + "balance_loss_mlp": 1.01995504, + "epoch": 0.8869983466105517, + "flos": 22704988210560.0, + "grad_norm": 1.8137305114818747, + "language_loss": 0.8043139, + "learning_rate": 1.3239520608561793e-07, + "loss": 0.8258853, + "num_input_tokens_seen": 318101925, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.13043213, + "step": 14753, + "time_per_iteration": 2.456597328186035 + }, + { + "auxiliary_loss_clip": 0.01119538, + "auxiliary_loss_mlp": 0.01035096, + "balance_loss_clip": 1.04690588, + "balance_loss_mlp": 1.02356839, + "epoch": 0.8870584698632196, + "flos": 15340751884800.0, + "grad_norm": 1.6293272946409851, + "language_loss": 0.65438974, + "learning_rate": 1.3225589632828248e-07, + "loss": 0.6759361, + "num_input_tokens_seen": 318119945, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11535645, + "step": 14754, + "time_per_iteration": 2.4674394130706787 + }, + { + "auxiliary_loss_clip": 0.0111682, + "auxiliary_loss_mlp": 0.01030927, + "balance_loss_clip": 1.0426147, + "balance_loss_mlp": 1.01901841, + "epoch": 0.8871185931158876, + "flos": 26615624699520.0, + "grad_norm": 8.600518382156654, + "language_loss": 0.74615246, + "learning_rate": 1.3211665739599065e-07, + "loss": 0.76762992, + "num_input_tokens_seen": 318139685, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11901855, + "step": 14755, + "time_per_iteration": 2.61806058883667 + }, + { + "auxiliary_loss_clip": 0.01117096, + "auxiliary_loss_mlp": 0.01036157, + "balance_loss_clip": 1.04325843, + "balance_loss_mlp": 1.02156603, + "epoch": 0.8871787163685555, + "flos": 21799034357760.0, + "grad_norm": 1.427815015827507, + "language_loss": 0.78145427, + "learning_rate": 1.3197748929402262e-07, + "loss": 0.8029868, + "num_input_tokens_seen": 318160375, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.14587402, + "step": 14756, + "time_per_iteration": 2.524837017059326 + }, + { + "auxiliary_loss_clip": 0.01109615, + "auxiliary_loss_mlp": 0.01035653, + "balance_loss_clip": 1.03741014, + "balance_loss_mlp": 1.02285576, + "epoch": 0.8872388396212235, + "flos": 14902964922240.0, + "grad_norm": 2.2883764945852465, + "language_loss": 0.76725018, + "learning_rate": 1.3183839202765535e-07, + "loss": 0.78870285, + "num_input_tokens_seen": 318177995, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.12799072, + "step": 14757, + "time_per_iteration": 3.8423118591308594 + }, + { + "auxiliary_loss_clip": 0.01109565, + "auxiliary_loss_mlp": 0.01035481, + "balance_loss_clip": 1.04098189, + "balance_loss_mlp": 1.02295244, + "epoch": 0.8872989628738914, + "flos": 26432157006720.0, + "grad_norm": 2.0408974852769304, + "language_loss": 0.68187165, + "learning_rate": 1.316993656021632e-07, + "loss": 0.70332211, + "num_input_tokens_seen": 318197030, + "router_z_loss_clip": 0.68603516, + "router_z_loss_mlp": 0.12536621, + "step": 14758, + "time_per_iteration": 2.609713077545166 + }, + { + "auxiliary_loss_clip": 0.01116726, + "auxiliary_loss_mlp": 0.01031897, + "balance_loss_clip": 1.04373097, + "balance_loss_mlp": 1.01938605, + "epoch": 0.8873590861265594, + "flos": 48142562555520.0, + "grad_norm": 1.8758439713619033, + "language_loss": 0.6910708, + "learning_rate": 1.3156041002281915e-07, + "loss": 0.71255702, + "num_input_tokens_seen": 318221780, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.12512207, + "step": 14759, + "time_per_iteration": 2.693066120147705 + }, + { + "auxiliary_loss_clip": 0.01111947, + "auxiliary_loss_mlp": 0.0102761, + "balance_loss_clip": 1.04201198, + "balance_loss_mlp": 1.0168097, + "epoch": 0.8874192093792275, + "flos": 18332972501760.0, + "grad_norm": 2.140167691518221, + "language_loss": 0.74301493, + "learning_rate": 1.3142152529489092e-07, + "loss": 0.76441056, + "num_input_tokens_seen": 318239710, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.10797119, + "step": 14760, + "time_per_iteration": 3.857480764389038 + }, + { + "auxiliary_loss_clip": 0.01119218, + "auxiliary_loss_mlp": 0.01036131, + "balance_loss_clip": 1.04473495, + "balance_loss_mlp": 1.02338767, + "epoch": 0.8874793326318954, + "flos": 17894215872000.0, + "grad_norm": 2.1512933061296833, + "language_loss": 0.75666469, + "learning_rate": 1.3128271142364565e-07, + "loss": 0.77821815, + "num_input_tokens_seen": 318257425, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.1272583, + "step": 14761, + "time_per_iteration": 2.4622132778167725 + }, + { + "auxiliary_loss_clip": 0.01116346, + "auxiliary_loss_mlp": 0.01032756, + "balance_loss_clip": 1.04321337, + "balance_loss_mlp": 1.02094805, + "epoch": 0.8875394558845634, + "flos": 31102231772160.0, + "grad_norm": 2.0080758493359014, + "language_loss": 0.61541069, + "learning_rate": 1.3114396841434717e-07, + "loss": 0.63690174, + "num_input_tokens_seen": 318278485, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11816406, + "step": 14762, + "time_per_iteration": 2.5249276161193848 + }, + { + "auxiliary_loss_clip": 0.01113839, + "auxiliary_loss_mlp": 0.01036225, + "balance_loss_clip": 1.03943694, + "balance_loss_mlp": 1.02196765, + "epoch": 0.8875995791372313, + "flos": 21142048648320.0, + "grad_norm": 1.834946370315946, + "language_loss": 0.64295948, + "learning_rate": 1.3100529627225697e-07, + "loss": 0.66446006, + "num_input_tokens_seen": 318297560, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.14245605, + "step": 14763, + "time_per_iteration": 2.4598376750946045 + }, + { + "auxiliary_loss_clip": 0.01116412, + "auxiliary_loss_mlp": 0.01031345, + "balance_loss_clip": 1.04398012, + "balance_loss_mlp": 1.01882815, + "epoch": 0.8876597023898993, + "flos": 17455136019840.0, + "grad_norm": 2.7348486341473515, + "language_loss": 0.7094847, + "learning_rate": 1.3086669500263335e-07, + "loss": 0.73096222, + "num_input_tokens_seen": 318313060, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.12524414, + "step": 14764, + "time_per_iteration": 2.4130992889404297 + }, + { + "auxiliary_loss_clip": 0.01115892, + "auxiliary_loss_mlp": 0.01035431, + "balance_loss_clip": 1.04077482, + "balance_loss_mlp": 1.02388525, + "epoch": 0.8877198256425672, + "flos": 22707933125760.0, + "grad_norm": 2.0977617905673998, + "language_loss": 0.65908694, + "learning_rate": 1.3072816461073166e-07, + "loss": 0.68060017, + "num_input_tokens_seen": 318332030, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.11547852, + "step": 14765, + "time_per_iteration": 2.5187902450561523 + }, + { + "auxiliary_loss_clip": 0.01107541, + "auxiliary_loss_mlp": 0.01030764, + "balance_loss_clip": 1.03965628, + "balance_loss_mlp": 1.02064872, + "epoch": 0.8877799488952353, + "flos": 24535104111360.0, + "grad_norm": 1.567223760566723, + "language_loss": 0.76635873, + "learning_rate": 1.3058970510180568e-07, + "loss": 0.78774178, + "num_input_tokens_seen": 318351090, + "router_z_loss_clip": 0.67871094, + "router_z_loss_mlp": 0.10113525, + "step": 14766, + "time_per_iteration": 2.4532692432403564 + }, + { + "auxiliary_loss_clip": 0.01113483, + "auxiliary_loss_mlp": 0.0102579, + "balance_loss_clip": 1.04404998, + "balance_loss_mlp": 1.01470351, + "epoch": 0.8878400721479032, + "flos": 20959191486720.0, + "grad_norm": 2.076208533633342, + "language_loss": 0.73620087, + "learning_rate": 1.3045131648110496e-07, + "loss": 0.75759357, + "num_input_tokens_seen": 318372000, + "router_z_loss_clip": 0.69580078, + "router_z_loss_mlp": 0.11090088, + "step": 14767, + "time_per_iteration": 2.5042436122894287 + }, + { + "auxiliary_loss_clip": 0.01110495, + "auxiliary_loss_mlp": 0.01027766, + "balance_loss_clip": 1.04174209, + "balance_loss_mlp": 1.01700115, + "epoch": 0.8879001954005712, + "flos": 25295260659840.0, + "grad_norm": 1.8300490392910784, + "language_loss": 0.71387529, + "learning_rate": 1.303129987538778e-07, + "loss": 0.73525786, + "num_input_tokens_seen": 318391530, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.10772705, + "step": 14768, + "time_per_iteration": 4.091295480728149 + }, + { + "auxiliary_loss_clip": 0.01115532, + "auxiliary_loss_mlp": 0.01031323, + "balance_loss_clip": 1.04446936, + "balance_loss_mlp": 1.02025485, + "epoch": 0.8879603186532391, + "flos": 23185329811200.0, + "grad_norm": 2.2222917017226504, + "language_loss": 0.70148468, + "learning_rate": 1.3017475192536932e-07, + "loss": 0.72295332, + "num_input_tokens_seen": 318410690, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.11077881, + "step": 14769, + "time_per_iteration": 2.532621145248413 + }, + { + "auxiliary_loss_clip": 0.01112527, + "auxiliary_loss_mlp": 0.01026609, + "balance_loss_clip": 1.04146767, + "balance_loss_mlp": 1.01590359, + "epoch": 0.8880204419059071, + "flos": 13655427707520.0, + "grad_norm": 2.2602426172804133, + "language_loss": 0.67454028, + "learning_rate": 1.3003657600082174e-07, + "loss": 0.69593161, + "num_input_tokens_seen": 318427380, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.10705566, + "step": 14770, + "time_per_iteration": 2.5286006927490234 + }, + { + "auxiliary_loss_clip": 0.01108264, + "auxiliary_loss_mlp": 0.01029808, + "balance_loss_clip": 1.03960395, + "balance_loss_mlp": 1.01878667, + "epoch": 0.888080565158575, + "flos": 20631865824000.0, + "grad_norm": 6.012886598459536, + "language_loss": 0.65731692, + "learning_rate": 1.2989847098547424e-07, + "loss": 0.67869759, + "num_input_tokens_seen": 318448530, + "router_z_loss_clip": 0.68701172, + "router_z_loss_mlp": 0.11022949, + "step": 14771, + "time_per_iteration": 2.5393691062927246 + }, + { + "auxiliary_loss_clip": 0.01106887, + "auxiliary_loss_mlp": 0.01027741, + "balance_loss_clip": 1.03653145, + "balance_loss_mlp": 1.01523542, + "epoch": 0.888140688411243, + "flos": 28620014411520.0, + "grad_norm": 1.6190358477415936, + "language_loss": 0.82516491, + "learning_rate": 1.2976043688456396e-07, + "loss": 0.84651119, + "num_input_tokens_seen": 318468655, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.12506104, + "step": 14772, + "time_per_iteration": 2.499863624572754 + }, + { + "auxiliary_loss_clip": 0.01105626, + "auxiliary_loss_mlp": 0.01026756, + "balance_loss_clip": 1.03755856, + "balance_loss_mlp": 1.01564574, + "epoch": 0.8882008116639111, + "flos": 25520241496320.0, + "grad_norm": 1.524095522275184, + "language_loss": 0.76715529, + "learning_rate": 1.296224737033258e-07, + "loss": 0.78847909, + "num_input_tokens_seen": 318488740, + "router_z_loss_clip": 0.68066406, + "router_z_loss_mlp": 0.11114502, + "step": 14773, + "time_per_iteration": 2.5129995346069336 + }, + { + "auxiliary_loss_clip": 0.01113337, + "auxiliary_loss_mlp": 0.01027937, + "balance_loss_clip": 1.04230523, + "balance_loss_mlp": 1.01664805, + "epoch": 0.888260934916579, + "flos": 27673696650240.0, + "grad_norm": 1.9742868048473219, + "language_loss": 0.75075632, + "learning_rate": 1.294845814469907e-07, + "loss": 0.77216905, + "num_input_tokens_seen": 318508810, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11291504, + "step": 14774, + "time_per_iteration": 2.477736234664917 + }, + { + "auxiliary_loss_clip": 0.01118118, + "auxiliary_loss_mlp": 0.0103073, + "balance_loss_clip": 1.04518747, + "balance_loss_mlp": 1.01852894, + "epoch": 0.888321058169247, + "flos": 21611077464960.0, + "grad_norm": 3.0292526983914327, + "language_loss": 0.72398001, + "learning_rate": 1.2934676012078783e-07, + "loss": 0.7454685, + "num_input_tokens_seen": 318526860, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.12194824, + "step": 14775, + "time_per_iteration": 2.520475387573242 + }, + { + "auxiliary_loss_clip": 0.01113592, + "auxiliary_loss_mlp": 0.01026232, + "balance_loss_clip": 1.04308212, + "balance_loss_mlp": 1.01473415, + "epoch": 0.8883811814219149, + "flos": 18149109759360.0, + "grad_norm": 2.2496752888817273, + "language_loss": 0.80183762, + "learning_rate": 1.292090097299432e-07, + "loss": 0.82323581, + "num_input_tokens_seen": 318545180, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.1151123, + "step": 14776, + "time_per_iteration": 2.4239344596862793 + }, + { + "auxiliary_loss_clip": 0.01119818, + "auxiliary_loss_mlp": 0.0103049, + "balance_loss_clip": 1.04424882, + "balance_loss_mlp": 1.01842642, + "epoch": 0.8884413046745829, + "flos": 28324648874880.0, + "grad_norm": 2.5342813777914786, + "language_loss": 0.69424278, + "learning_rate": 1.290713302796802e-07, + "loss": 0.71574587, + "num_input_tokens_seen": 318564350, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12072754, + "step": 14777, + "time_per_iteration": 2.5701987743377686 + }, + { + "auxiliary_loss_clip": 0.01107492, + "auxiliary_loss_mlp": 0.01037728, + "balance_loss_clip": 1.03655601, + "balance_loss_mlp": 1.02468085, + "epoch": 0.8885014279272508, + "flos": 15158756649600.0, + "grad_norm": 1.6917134907772131, + "language_loss": 0.70717502, + "learning_rate": 1.2893372177522e-07, + "loss": 0.7286272, + "num_input_tokens_seen": 318582275, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.13061523, + "step": 14778, + "time_per_iteration": 2.4485998153686523 + }, + { + "auxiliary_loss_clip": 0.01107392, + "auxiliary_loss_mlp": 0.0103079, + "balance_loss_clip": 1.03629136, + "balance_loss_mlp": 1.01839852, + "epoch": 0.8885615511799189, + "flos": 19099593498240.0, + "grad_norm": 2.1697587449466487, + "language_loss": 0.77574593, + "learning_rate": 1.287961842217804e-07, + "loss": 0.79712772, + "num_input_tokens_seen": 318601230, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.12402344, + "step": 14779, + "time_per_iteration": 2.510587215423584 + }, + { + "auxiliary_loss_clip": 0.0103345, + "auxiliary_loss_mlp": 0.01002916, + "balance_loss_clip": 1.00957668, + "balance_loss_mlp": 1.00136304, + "epoch": 0.8886216744325868, + "flos": 51186567605760.0, + "grad_norm": 0.8657885906712008, + "language_loss": 0.56768042, + "learning_rate": 1.2865871762457747e-07, + "loss": 0.58804405, + "num_input_tokens_seen": 318645595, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.01551819, + "step": 14780, + "time_per_iteration": 2.9050536155700684 + }, + { + "auxiliary_loss_clip": 0.01065886, + "auxiliary_loss_mlp": 0.01005916, + "balance_loss_clip": 1.04114783, + "balance_loss_mlp": 1.00428176, + "epoch": 0.8886817976852548, + "flos": 61612981263360.0, + "grad_norm": 0.7943483354775014, + "language_loss": 0.62411267, + "learning_rate": 1.2852132198882326e-07, + "loss": 0.6448307, + "num_input_tokens_seen": 318707850, + "router_z_loss_clip": 0.24731445, + "router_z_loss_mlp": 0.01634216, + "step": 14781, + "time_per_iteration": 3.1980395317077637 + }, + { + "auxiliary_loss_clip": 0.0103983, + "auxiliary_loss_mlp": 0.01002214, + "balance_loss_clip": 1.01481557, + "balance_loss_mlp": 1.00079668, + "epoch": 0.8887419209379227, + "flos": 60646946935680.0, + "grad_norm": 0.8339508475690224, + "language_loss": 0.58153629, + "learning_rate": 1.2838399731972805e-07, + "loss": 0.60195673, + "num_input_tokens_seen": 318764915, + "router_z_loss_clip": 0.25048828, + "router_z_loss_mlp": 0.01419067, + "step": 14782, + "time_per_iteration": 2.929863214492798 + }, + { + "auxiliary_loss_clip": 0.01110518, + "auxiliary_loss_mlp": 0.01026923, + "balance_loss_clip": 1.04160571, + "balance_loss_mlp": 1.01639128, + "epoch": 0.8888020441905907, + "flos": 29205861235200.0, + "grad_norm": 1.8152919589359566, + "language_loss": 0.65753508, + "learning_rate": 1.2824674362249922e-07, + "loss": 0.67890948, + "num_input_tokens_seen": 318785660, + "router_z_loss_clip": 0.68896484, + "router_z_loss_mlp": 0.10528564, + "step": 14783, + "time_per_iteration": 2.6025850772857666 + }, + { + "auxiliary_loss_clip": 0.01121915, + "auxiliary_loss_mlp": 0.01040312, + "balance_loss_clip": 1.04288578, + "balance_loss_mlp": 1.026752, + "epoch": 0.8888621674432586, + "flos": 22162701605760.0, + "grad_norm": 1.6247234639663115, + "language_loss": 0.77762961, + "learning_rate": 1.281095609023415e-07, + "loss": 0.79925191, + "num_input_tokens_seen": 318806080, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.13562012, + "step": 14784, + "time_per_iteration": 3.896158218383789 + }, + { + "auxiliary_loss_clip": 0.01117789, + "auxiliary_loss_mlp": 0.01029865, + "balance_loss_clip": 1.0425241, + "balance_loss_mlp": 1.0179317, + "epoch": 0.8889222906959267, + "flos": 27672834723840.0, + "grad_norm": 2.296443050703811, + "language_loss": 0.60505283, + "learning_rate": 1.279724491644565e-07, + "loss": 0.62652934, + "num_input_tokens_seen": 318826445, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.11938477, + "step": 14785, + "time_per_iteration": 2.530801773071289 + }, + { + "auxiliary_loss_clip": 0.01112767, + "auxiliary_loss_mlp": 0.01029261, + "balance_loss_clip": 1.04169238, + "balance_loss_mlp": 1.01760185, + "epoch": 0.8889824139485947, + "flos": 14168627274240.0, + "grad_norm": 1.8618292986570015, + "language_loss": 0.65140051, + "learning_rate": 1.278354084140445e-07, + "loss": 0.67282081, + "num_input_tokens_seen": 318843915, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11651611, + "step": 14786, + "time_per_iteration": 2.3964309692382812 + }, + { + "auxiliary_loss_clip": 0.01118539, + "auxiliary_loss_mlp": 0.01030658, + "balance_loss_clip": 1.04112887, + "balance_loss_mlp": 1.0177238, + "epoch": 0.8890425372012626, + "flos": 12853003829760.0, + "grad_norm": 3.0362412976824333, + "language_loss": 0.85764515, + "learning_rate": 1.276984386563009e-07, + "loss": 0.87913716, + "num_input_tokens_seen": 318859670, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.12939453, + "step": 14787, + "time_per_iteration": 2.442974805831909 + }, + { + "auxiliary_loss_clip": 0.01120247, + "auxiliary_loss_mlp": 0.01027216, + "balance_loss_clip": 1.0479815, + "balance_loss_mlp": 1.01611733, + "epoch": 0.8891026604539306, + "flos": 21689291329920.0, + "grad_norm": 2.660818868611802, + "language_loss": 0.70621318, + "learning_rate": 1.2756153989642027e-07, + "loss": 0.72768784, + "num_input_tokens_seen": 318877855, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11102295, + "step": 14788, + "time_per_iteration": 2.4339101314544678 + }, + { + "auxiliary_loss_clip": 0.01112445, + "auxiliary_loss_mlp": 0.01031242, + "balance_loss_clip": 1.04189742, + "balance_loss_mlp": 1.01996446, + "epoch": 0.8891627837065985, + "flos": 21871430219520.0, + "grad_norm": 1.801386569184462, + "language_loss": 0.70006037, + "learning_rate": 1.274247121395935e-07, + "loss": 0.72149718, + "num_input_tokens_seen": 318896045, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.11279297, + "step": 14789, + "time_per_iteration": 2.4571425914764404 + }, + { + "auxiliary_loss_clip": 0.01116755, + "auxiliary_loss_mlp": 0.01026581, + "balance_loss_clip": 1.04448271, + "balance_loss_mlp": 1.01457071, + "epoch": 0.8892229069592665, + "flos": 21580230660480.0, + "grad_norm": 1.519430345239056, + "language_loss": 0.70615327, + "learning_rate": 1.2728795539100956e-07, + "loss": 0.72758663, + "num_input_tokens_seen": 318915515, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11999512, + "step": 14790, + "time_per_iteration": 2.447679281234741 + }, + { + "auxiliary_loss_clip": 0.01114838, + "auxiliary_loss_mlp": 0.01026511, + "balance_loss_clip": 1.04379249, + "balance_loss_mlp": 1.01636589, + "epoch": 0.8892830302119344, + "flos": 23075981832960.0, + "grad_norm": 1.8382085859638129, + "language_loss": 0.73128927, + "learning_rate": 1.2715126965585387e-07, + "loss": 0.75270271, + "num_input_tokens_seen": 318934305, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.10150146, + "step": 14791, + "time_per_iteration": 2.475029945373535 + }, + { + "auxiliary_loss_clip": 0.01109731, + "auxiliary_loss_mlp": 0.01029079, + "balance_loss_clip": 1.04201424, + "balance_loss_mlp": 1.0181241, + "epoch": 0.8893431534646025, + "flos": 23072139077760.0, + "grad_norm": 1.524778964374418, + "language_loss": 0.73966545, + "learning_rate": 1.2701465493931008e-07, + "loss": 0.76105362, + "num_input_tokens_seen": 318953880, + "router_z_loss_clip": 0.67724609, + "router_z_loss_mlp": 0.10943604, + "step": 14792, + "time_per_iteration": 2.541215419769287 + }, + { + "auxiliary_loss_clip": 0.01115692, + "auxiliary_loss_mlp": 0.01035104, + "balance_loss_clip": 1.03999281, + "balance_loss_mlp": 1.02293897, + "epoch": 0.8894032767172704, + "flos": 22454978572800.0, + "grad_norm": 2.231123640967746, + "language_loss": 0.66019243, + "learning_rate": 1.2687811124655801e-07, + "loss": 0.68170035, + "num_input_tokens_seen": 318971395, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12164307, + "step": 14793, + "time_per_iteration": 2.4476869106292725 + }, + { + "auxiliary_loss_clip": 0.01116415, + "auxiliary_loss_mlp": 0.01031591, + "balance_loss_clip": 1.04044926, + "balance_loss_mlp": 1.01913953, + "epoch": 0.8894633999699384, + "flos": 25338246261120.0, + "grad_norm": 1.9154233877192604, + "language_loss": 0.71889806, + "learning_rate": 1.2674163858277552e-07, + "loss": 0.74037814, + "num_input_tokens_seen": 318990580, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.12438965, + "step": 14794, + "time_per_iteration": 2.4698855876922607 + }, + { + "auxiliary_loss_clip": 0.01121384, + "auxiliary_loss_mlp": 0.01033747, + "balance_loss_clip": 1.04516482, + "balance_loss_mlp": 1.0217185, + "epoch": 0.8895235232226063, + "flos": 20994096528000.0, + "grad_norm": 1.7429816470255115, + "language_loss": 0.7553426, + "learning_rate": 1.2660523695313785e-07, + "loss": 0.77689391, + "num_input_tokens_seen": 319010040, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12042236, + "step": 14795, + "time_per_iteration": 2.5435962677001953 + }, + { + "auxiliary_loss_clip": 0.01039167, + "auxiliary_loss_mlp": 0.01006605, + "balance_loss_clip": 1.01451564, + "balance_loss_mlp": 1.00530076, + "epoch": 0.8895836464752743, + "flos": 69732956764800.0, + "grad_norm": 0.7691386034128682, + "language_loss": 0.56049311, + "learning_rate": 1.2646890636281727e-07, + "loss": 0.58095086, + "num_input_tokens_seen": 319063860, + "router_z_loss_clip": 0.24633789, + "router_z_loss_mlp": 0.01304626, + "step": 14796, + "time_per_iteration": 2.9470977783203125 + }, + { + "auxiliary_loss_clip": 0.01115489, + "auxiliary_loss_mlp": 0.01029452, + "balance_loss_clip": 1.04243374, + "balance_loss_mlp": 1.01666737, + "epoch": 0.8896437697279422, + "flos": 23221815050880.0, + "grad_norm": 1.9398967059952694, + "language_loss": 0.70388651, + "learning_rate": 1.263326468169843e-07, + "loss": 0.7253359, + "num_input_tokens_seen": 319082335, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.12786865, + "step": 14797, + "time_per_iteration": 2.4521324634552 + }, + { + "auxiliary_loss_clip": 0.01044543, + "auxiliary_loss_mlp": 0.01002631, + "balance_loss_clip": 1.02027059, + "balance_loss_mlp": 1.00135994, + "epoch": 0.8897038929806103, + "flos": 70752711882240.0, + "grad_norm": 0.7497405043233687, + "language_loss": 0.57990253, + "learning_rate": 1.2619645832080417e-07, + "loss": 0.60037434, + "num_input_tokens_seen": 319147075, + "router_z_loss_clip": 0.24291992, + "router_z_loss_mlp": 0.01269531, + "step": 14798, + "time_per_iteration": 3.1196413040161133 + }, + { + "auxiliary_loss_clip": 0.01114958, + "auxiliary_loss_mlp": 0.01032882, + "balance_loss_clip": 1.04141474, + "balance_loss_mlp": 1.02028799, + "epoch": 0.8897640162332782, + "flos": 19245103493760.0, + "grad_norm": 1.6585079506278573, + "language_loss": 0.79018587, + "learning_rate": 1.2606034087944251e-07, + "loss": 0.81166422, + "num_input_tokens_seen": 319166630, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.1260376, + "step": 14799, + "time_per_iteration": 2.439134359359741 + }, + { + "auxiliary_loss_clip": 0.01071299, + "auxiliary_loss_mlp": 0.01008733, + "balance_loss_clip": 1.04618073, + "balance_loss_mlp": 1.00715768, + "epoch": 0.8898241394859462, + "flos": 41356275039360.0, + "grad_norm": 0.903530500515464, + "language_loss": 0.58102179, + "learning_rate": 1.2592429449806053e-07, + "loss": 0.60182214, + "num_input_tokens_seen": 319221865, + "router_z_loss_clip": 0.25146484, + "router_z_loss_mlp": 0.01576233, + "step": 14800, + "time_per_iteration": 4.430742263793945 + }, + { + "auxiliary_loss_clip": 0.01112755, + "auxiliary_loss_mlp": 0.01037341, + "balance_loss_clip": 1.04092669, + "balance_loss_mlp": 1.02619505, + "epoch": 0.8898842627386142, + "flos": 18986295024000.0, + "grad_norm": 2.154587161391576, + "language_loss": 0.6608665, + "learning_rate": 1.2578831918181698e-07, + "loss": 0.68236744, + "num_input_tokens_seen": 319240710, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11151123, + "step": 14801, + "time_per_iteration": 2.4314911365509033 + }, + { + "auxiliary_loss_clip": 0.01120478, + "auxiliary_loss_mlp": 0.01035019, + "balance_loss_clip": 1.04485357, + "balance_loss_mlp": 1.02164412, + "epoch": 0.8899443859912821, + "flos": 13217173868160.0, + "grad_norm": 2.7864511913381294, + "language_loss": 0.75519621, + "learning_rate": 1.256524149358682e-07, + "loss": 0.77675116, + "num_input_tokens_seen": 319256495, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.13391113, + "step": 14802, + "time_per_iteration": 2.4329519271850586 + }, + { + "auxiliary_loss_clip": 0.01119047, + "auxiliary_loss_mlp": 0.01028175, + "balance_loss_clip": 1.04740858, + "balance_loss_mlp": 1.01717758, + "epoch": 0.8900045092439501, + "flos": 22674680110080.0, + "grad_norm": 2.0187846937477056, + "language_loss": 0.73397338, + "learning_rate": 1.2551658176536805e-07, + "loss": 0.75544566, + "num_input_tokens_seen": 319273620, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11004639, + "step": 14803, + "time_per_iteration": 2.45904541015625 + }, + { + "auxiliary_loss_clip": 0.01118298, + "auxiliary_loss_mlp": 0.01031319, + "balance_loss_clip": 1.0472523, + "balance_loss_mlp": 1.01998258, + "epoch": 0.890064632496618, + "flos": 21141617685120.0, + "grad_norm": 2.049660561722578, + "language_loss": 0.71826512, + "learning_rate": 1.2538081967546664e-07, + "loss": 0.73976129, + "num_input_tokens_seen": 319291720, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11334229, + "step": 14804, + "time_per_iteration": 3.988046646118164 + }, + { + "auxiliary_loss_clip": 0.01120749, + "auxiliary_loss_mlp": 0.01030647, + "balance_loss_clip": 1.04701936, + "balance_loss_mlp": 1.01864862, + "epoch": 0.8901247557492861, + "flos": 23397058529280.0, + "grad_norm": 1.82919464147682, + "language_loss": 0.81214011, + "learning_rate": 1.252451286713123e-07, + "loss": 0.83365405, + "num_input_tokens_seen": 319310380, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.12005615, + "step": 14805, + "time_per_iteration": 2.4776740074157715 + }, + { + "auxiliary_loss_clip": 0.01117983, + "auxiliary_loss_mlp": 0.01032303, + "balance_loss_clip": 1.04394817, + "balance_loss_mlp": 1.01997089, + "epoch": 0.890184879001954, + "flos": 29169591477120.0, + "grad_norm": 1.9205906693444854, + "language_loss": 0.67398524, + "learning_rate": 1.251095087580505e-07, + "loss": 0.69548815, + "num_input_tokens_seen": 319331765, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.12329102, + "step": 14806, + "time_per_iteration": 2.50154972076416 + }, + { + "auxiliary_loss_clip": 0.01123917, + "auxiliary_loss_mlp": 0.01034177, + "balance_loss_clip": 1.05124474, + "balance_loss_mlp": 1.02211297, + "epoch": 0.890245002254622, + "flos": 14427830793600.0, + "grad_norm": 1.8627776713291522, + "language_loss": 0.6722607, + "learning_rate": 1.2497395994082438e-07, + "loss": 0.6938417, + "num_input_tokens_seen": 319349135, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.12060547, + "step": 14807, + "time_per_iteration": 2.5372586250305176 + }, + { + "auxiliary_loss_clip": 0.01113979, + "auxiliary_loss_mlp": 0.01030602, + "balance_loss_clip": 1.04109299, + "balance_loss_mlp": 1.01920033, + "epoch": 0.8903051255072899, + "flos": 22382187661440.0, + "grad_norm": 1.7631379091448285, + "language_loss": 0.75390851, + "learning_rate": 1.248384822247732e-07, + "loss": 0.77535433, + "num_input_tokens_seen": 319368410, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11401367, + "step": 14808, + "time_per_iteration": 2.4730820655822754 + }, + { + "auxiliary_loss_clip": 0.01115546, + "auxiliary_loss_mlp": 0.01028487, + "balance_loss_clip": 1.04362214, + "balance_loss_mlp": 1.01751971, + "epoch": 0.8903652487599579, + "flos": 20777375819520.0, + "grad_norm": 1.8725517388371449, + "language_loss": 0.8111974, + "learning_rate": 1.2470307561503513e-07, + "loss": 0.83263773, + "num_input_tokens_seen": 319387535, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.10980225, + "step": 14809, + "time_per_iteration": 2.445525646209717 + }, + { + "auxiliary_loss_clip": 0.01114965, + "auxiliary_loss_mlp": 0.01025536, + "balance_loss_clip": 1.04377878, + "balance_loss_mlp": 1.01493883, + "epoch": 0.8904253720126258, + "flos": 24424499157120.0, + "grad_norm": 1.9039074167345698, + "language_loss": 0.68466389, + "learning_rate": 1.2456774011674442e-07, + "loss": 0.70606887, + "num_input_tokens_seen": 319407210, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.10601807, + "step": 14810, + "time_per_iteration": 2.4934206008911133 + }, + { + "auxiliary_loss_clip": 0.01114801, + "auxiliary_loss_mlp": 0.01026351, + "balance_loss_clip": 1.04172683, + "balance_loss_mlp": 1.01433456, + "epoch": 0.8904854952652939, + "flos": 19463871277440.0, + "grad_norm": 1.8827990210278427, + "language_loss": 0.70567226, + "learning_rate": 1.2443247573503257e-07, + "loss": 0.7270838, + "num_input_tokens_seen": 319425340, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.12030029, + "step": 14811, + "time_per_iteration": 2.4733216762542725 + }, + { + "auxiliary_loss_clip": 0.01118955, + "auxiliary_loss_mlp": 0.01031693, + "balance_loss_clip": 1.04307032, + "balance_loss_mlp": 1.0198555, + "epoch": 0.8905456185179618, + "flos": 50800741666560.0, + "grad_norm": 2.833796967408588, + "language_loss": 0.656075, + "learning_rate": 1.2429728247502924e-07, + "loss": 0.67758143, + "num_input_tokens_seen": 319448150, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.11834717, + "step": 14812, + "time_per_iteration": 4.19562554359436 + }, + { + "auxiliary_loss_clip": 0.01114024, + "auxiliary_loss_mlp": 0.01029646, + "balance_loss_clip": 1.04145527, + "balance_loss_mlp": 1.01904273, + "epoch": 0.8906057417706298, + "flos": 17784867893760.0, + "grad_norm": 1.7625113217278958, + "language_loss": 0.6849854, + "learning_rate": 1.24162160341861e-07, + "loss": 0.70642203, + "num_input_tokens_seen": 319466115, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.10614014, + "step": 14813, + "time_per_iteration": 2.392616033554077 + }, + { + "auxiliary_loss_clip": 0.01119016, + "auxiliary_loss_mlp": 0.01034297, + "balance_loss_clip": 1.04190028, + "balance_loss_mlp": 1.02085054, + "epoch": 0.8906658650232978, + "flos": 21944867575680.0, + "grad_norm": 2.3831913091871226, + "language_loss": 0.75163019, + "learning_rate": 1.2402710934065198e-07, + "loss": 0.77316338, + "num_input_tokens_seen": 319485255, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.13446045, + "step": 14814, + "time_per_iteration": 2.4379937648773193 + }, + { + "auxiliary_loss_clip": 0.01124096, + "auxiliary_loss_mlp": 0.01026382, + "balance_loss_clip": 1.0492394, + "balance_loss_mlp": 1.01458049, + "epoch": 0.8907259882759657, + "flos": 21287810039040.0, + "grad_norm": 2.2560295543471187, + "language_loss": 0.74427682, + "learning_rate": 1.2389212947652229e-07, + "loss": 0.76578164, + "num_input_tokens_seen": 319501800, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.11804199, + "step": 14815, + "time_per_iteration": 2.4341671466827393 + }, + { + "auxiliary_loss_clip": 0.01114236, + "auxiliary_loss_mlp": 0.01029053, + "balance_loss_clip": 1.04556262, + "balance_loss_mlp": 1.01772261, + "epoch": 0.8907861115286337, + "flos": 20120426023680.0, + "grad_norm": 2.0159108171945004, + "language_loss": 0.75005579, + "learning_rate": 1.237572207545914e-07, + "loss": 0.77148879, + "num_input_tokens_seen": 319520415, + "router_z_loss_clip": 0.68652344, + "router_z_loss_mlp": 0.11322021, + "step": 14816, + "time_per_iteration": 2.5093111991882324 + }, + { + "auxiliary_loss_clip": 0.01111776, + "auxiliary_loss_mlp": 0.01026659, + "balance_loss_clip": 1.03970003, + "balance_loss_mlp": 1.01506567, + "epoch": 0.8908462347813016, + "flos": 20084156265600.0, + "grad_norm": 1.773168284626876, + "language_loss": 0.77902174, + "learning_rate": 1.2362238317997476e-07, + "loss": 0.8004061, + "num_input_tokens_seen": 319538410, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.1159668, + "step": 14817, + "time_per_iteration": 2.4882843494415283 + }, + { + "auxiliary_loss_clip": 0.01042196, + "auxiliary_loss_mlp": 0.01004329, + "balance_loss_clip": 1.01715147, + "balance_loss_mlp": 1.00313997, + "epoch": 0.8909063580339697, + "flos": 65503649790720.0, + "grad_norm": 0.8132228635035531, + "language_loss": 0.56504011, + "learning_rate": 1.2348761675778517e-07, + "loss": 0.58550537, + "num_input_tokens_seen": 319602565, + "router_z_loss_clip": 0.25048828, + "router_z_loss_mlp": 0.01190186, + "step": 14818, + "time_per_iteration": 3.110410451889038 + }, + { + "auxiliary_loss_clip": 0.01112353, + "auxiliary_loss_mlp": 0.0103194, + "balance_loss_clip": 1.04048276, + "balance_loss_mlp": 1.02022147, + "epoch": 0.8909664812866376, + "flos": 29863062426240.0, + "grad_norm": 1.7337602079296976, + "language_loss": 0.64453173, + "learning_rate": 1.2335292149313325e-07, + "loss": 0.66597462, + "num_input_tokens_seen": 319624645, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.1171875, + "step": 14819, + "time_per_iteration": 2.6551525592803955 + }, + { + "auxiliary_loss_clip": 0.01112934, + "auxiliary_loss_mlp": 0.01031285, + "balance_loss_clip": 1.03975177, + "balance_loss_mlp": 1.01875639, + "epoch": 0.8910266045393056, + "flos": 25447127362560.0, + "grad_norm": 1.9536057023638922, + "language_loss": 0.78131974, + "learning_rate": 1.2321829739112731e-07, + "loss": 0.80276191, + "num_input_tokens_seen": 319644040, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.12548828, + "step": 14820, + "time_per_iteration": 2.46333909034729 + }, + { + "auxiliary_loss_clip": 0.01112392, + "auxiliary_loss_mlp": 0.01033295, + "balance_loss_clip": 1.03960049, + "balance_loss_mlp": 1.02180338, + "epoch": 0.8910867277919735, + "flos": 24499121662080.0, + "grad_norm": 1.9615048839949007, + "language_loss": 0.76462436, + "learning_rate": 1.2308374445687087e-07, + "loss": 0.78608119, + "num_input_tokens_seen": 319663930, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.1149292, + "step": 14821, + "time_per_iteration": 2.510546922683716 + }, + { + "auxiliary_loss_clip": 0.01062609, + "auxiliary_loss_mlp": 0.01005352, + "balance_loss_clip": 1.03814209, + "balance_loss_mlp": 1.00388098, + "epoch": 0.8911468510446415, + "flos": 60688136856960.0, + "grad_norm": 0.7927379334948972, + "language_loss": 0.59288025, + "learning_rate": 1.2294926269546712e-07, + "loss": 0.61355984, + "num_input_tokens_seen": 319721245, + "router_z_loss_clip": 0.24462891, + "router_z_loss_mlp": 0.01470947, + "step": 14822, + "time_per_iteration": 2.9590818881988525 + }, + { + "auxiliary_loss_clip": 0.01117309, + "auxiliary_loss_mlp": 0.01028576, + "balance_loss_clip": 1.04513586, + "balance_loss_mlp": 1.01687539, + "epoch": 0.8912069742973094, + "flos": 25337492075520.0, + "grad_norm": 1.8862329717956958, + "language_loss": 0.69433737, + "learning_rate": 1.2281485211201515e-07, + "loss": 0.71579623, + "num_input_tokens_seen": 319741200, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11700439, + "step": 14823, + "time_per_iteration": 2.49837589263916 + }, + { + "auxiliary_loss_clip": 0.01113398, + "auxiliary_loss_mlp": 0.01028671, + "balance_loss_clip": 1.04174697, + "balance_loss_mlp": 1.0170486, + "epoch": 0.8912670975499775, + "flos": 18223516782720.0, + "grad_norm": 1.618632157535248, + "language_loss": 0.69320589, + "learning_rate": 1.2268051271161262e-07, + "loss": 0.71462667, + "num_input_tokens_seen": 319759265, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11608887, + "step": 14824, + "time_per_iteration": 2.42356276512146 + }, + { + "auxiliary_loss_clip": 0.01109747, + "auxiliary_loss_mlp": 0.01031841, + "balance_loss_clip": 1.03645825, + "balance_loss_mlp": 1.01927614, + "epoch": 0.8913272208026454, + "flos": 26504481041280.0, + "grad_norm": 2.0305555553210475, + "language_loss": 0.7043075, + "learning_rate": 1.2254624449935303e-07, + "loss": 0.72572339, + "num_input_tokens_seen": 319777560, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.12561035, + "step": 14825, + "time_per_iteration": 2.506021738052368 + }, + { + "auxiliary_loss_clip": 0.01112399, + "auxiliary_loss_mlp": 0.01027491, + "balance_loss_clip": 1.04091668, + "balance_loss_mlp": 1.01610029, + "epoch": 0.8913873440553134, + "flos": 18802324540800.0, + "grad_norm": 1.717565601164098, + "language_loss": 0.71100235, + "learning_rate": 1.2241204748032786e-07, + "loss": 0.73240125, + "num_input_tokens_seen": 319794125, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.11383057, + "step": 14826, + "time_per_iteration": 2.4396753311157227 + }, + { + "auxiliary_loss_clip": 0.01115384, + "auxiliary_loss_mlp": 0.01025858, + "balance_loss_clip": 1.04422975, + "balance_loss_mlp": 1.01425934, + "epoch": 0.8914474673079814, + "flos": 20884892204160.0, + "grad_norm": 1.9825276756075456, + "language_loss": 0.75390732, + "learning_rate": 1.2227792165962615e-07, + "loss": 0.77531976, + "num_input_tokens_seen": 319810310, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.11590576, + "step": 14827, + "time_per_iteration": 2.4445250034332275 + }, + { + "auxiliary_loss_clip": 0.01112404, + "auxiliary_loss_mlp": 0.01031137, + "balance_loss_clip": 1.04010034, + "balance_loss_mlp": 1.01837587, + "epoch": 0.8915075905606493, + "flos": 20952439729920.0, + "grad_norm": 1.7893981585420604, + "language_loss": 0.78177935, + "learning_rate": 1.221438670423336e-07, + "loss": 0.80321467, + "num_input_tokens_seen": 319828505, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.12768555, + "step": 14828, + "time_per_iteration": 2.4040257930755615 + }, + { + "auxiliary_loss_clip": 0.01111848, + "auxiliary_loss_mlp": 0.01037476, + "balance_loss_clip": 1.0396651, + "balance_loss_mlp": 1.02394557, + "epoch": 0.8915677138133173, + "flos": 23076305055360.0, + "grad_norm": 1.8972761865170134, + "language_loss": 0.75361109, + "learning_rate": 1.2200988363353392e-07, + "loss": 0.77510434, + "num_input_tokens_seen": 319848680, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.13537598, + "step": 14829, + "time_per_iteration": 3.8932900428771973 + }, + { + "auxiliary_loss_clip": 0.01112683, + "auxiliary_loss_mlp": 0.01032282, + "balance_loss_clip": 1.04089379, + "balance_loss_mlp": 1.02189898, + "epoch": 0.8916278370659853, + "flos": 23440259612160.0, + "grad_norm": 1.5326907141560446, + "language_loss": 0.84568018, + "learning_rate": 1.2187597143830773e-07, + "loss": 0.86712986, + "num_input_tokens_seen": 319868835, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.1038208, + "step": 14830, + "time_per_iteration": 2.475275993347168 + }, + { + "auxiliary_loss_clip": 0.01106358, + "auxiliary_loss_mlp": 0.01028648, + "balance_loss_clip": 1.0375948, + "balance_loss_mlp": 1.01748419, + "epoch": 0.8916879603186533, + "flos": 25160488830720.0, + "grad_norm": 1.4399487576108811, + "language_loss": 0.74837303, + "learning_rate": 1.2174213046173299e-07, + "loss": 0.76972306, + "num_input_tokens_seen": 319891585, + "router_z_loss_clip": 0.68798828, + "router_z_loss_mlp": 0.11157227, + "step": 14831, + "time_per_iteration": 2.5872461795806885 + }, + { + "auxiliary_loss_clip": 0.0111278, + "auxiliary_loss_mlp": 0.01028218, + "balance_loss_clip": 1.03869963, + "balance_loss_mlp": 1.01613665, + "epoch": 0.8917480835713212, + "flos": 20229845829120.0, + "grad_norm": 1.7747457054562568, + "language_loss": 0.72903496, + "learning_rate": 1.216083607088847e-07, + "loss": 0.75044489, + "num_input_tokens_seen": 319910315, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.12078857, + "step": 14832, + "time_per_iteration": 2.4943668842315674 + }, + { + "auxiliary_loss_clip": 0.01107349, + "auxiliary_loss_mlp": 0.01029299, + "balance_loss_clip": 1.03428292, + "balance_loss_mlp": 1.01683545, + "epoch": 0.8918082068239892, + "flos": 26101922342400.0, + "grad_norm": 2.228543309876199, + "language_loss": 0.67126566, + "learning_rate": 1.214746621848355e-07, + "loss": 0.69263214, + "num_input_tokens_seen": 319932275, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.12481689, + "step": 14833, + "time_per_iteration": 2.491170883178711 + }, + { + "auxiliary_loss_clip": 0.01120764, + "auxiliary_loss_mlp": 0.01033442, + "balance_loss_clip": 1.04453969, + "balance_loss_mlp": 1.02077651, + "epoch": 0.8918683300766571, + "flos": 24831439315200.0, + "grad_norm": 1.766024999490014, + "language_loss": 0.73728418, + "learning_rate": 1.2134103489465575e-07, + "loss": 0.75882626, + "num_input_tokens_seen": 319955335, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.12677002, + "step": 14834, + "time_per_iteration": 2.519130229949951 + }, + { + "auxiliary_loss_clip": 0.01113474, + "auxiliary_loss_mlp": 0.01033052, + "balance_loss_clip": 1.04225171, + "balance_loss_mlp": 1.02014208, + "epoch": 0.8919284533293251, + "flos": 22305158945280.0, + "grad_norm": 2.478545322721722, + "language_loss": 0.79145098, + "learning_rate": 1.2120747884341188e-07, + "loss": 0.81291628, + "num_input_tokens_seen": 319973990, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.12921143, + "step": 14835, + "time_per_iteration": 2.414212226867676 + }, + { + "auxiliary_loss_clip": 0.01109373, + "auxiliary_loss_mlp": 0.01033669, + "balance_loss_clip": 1.03881335, + "balance_loss_mlp": 1.02121758, + "epoch": 0.891988576581993, + "flos": 30373532559360.0, + "grad_norm": 1.494412470181592, + "language_loss": 0.74003541, + "learning_rate": 1.210739940361689e-07, + "loss": 0.76146585, + "num_input_tokens_seen": 319995555, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.12451172, + "step": 14836, + "time_per_iteration": 2.492537260055542 + }, + { + "auxiliary_loss_clip": 0.01115384, + "auxiliary_loss_mlp": 0.01028898, + "balance_loss_clip": 1.04386222, + "balance_loss_mlp": 1.01702511, + "epoch": 0.8920486998346611, + "flos": 15552947479680.0, + "grad_norm": 2.3380231967925207, + "language_loss": 0.68528318, + "learning_rate": 1.2094058047798838e-07, + "loss": 0.70672596, + "num_input_tokens_seen": 320012385, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11865234, + "step": 14837, + "time_per_iteration": 2.3948237895965576 + }, + { + "auxiliary_loss_clip": 0.01121177, + "auxiliary_loss_mlp": 0.01038872, + "balance_loss_clip": 1.04347885, + "balance_loss_mlp": 1.02554989, + "epoch": 0.892108823087329, + "flos": 21214983214080.0, + "grad_norm": 1.5716929016187617, + "language_loss": 0.67667699, + "learning_rate": 1.2080723817392913e-07, + "loss": 0.69827747, + "num_input_tokens_seen": 320032390, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.13305664, + "step": 14838, + "time_per_iteration": 2.476348876953125 + }, + { + "auxiliary_loss_clip": 0.01116229, + "auxiliary_loss_mlp": 0.01034671, + "balance_loss_clip": 1.03968883, + "balance_loss_mlp": 1.0217967, + "epoch": 0.892168946339997, + "flos": 21978982517760.0, + "grad_norm": 1.8950219613150208, + "language_loss": 0.76657468, + "learning_rate": 1.2067396712904777e-07, + "loss": 0.78808367, + "num_input_tokens_seen": 320052885, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.12872314, + "step": 14839, + "time_per_iteration": 2.4385461807250977 + }, + { + "auxiliary_loss_clip": 0.01041246, + "auxiliary_loss_mlp": 0.01001587, + "balance_loss_clip": 1.01656604, + "balance_loss_mlp": 1.00022674, + "epoch": 0.892229069592665, + "flos": 67475289277440.0, + "grad_norm": 0.6830093772712104, + "language_loss": 0.49342543, + "learning_rate": 1.205407673483978e-07, + "loss": 0.51385379, + "num_input_tokens_seen": 320113685, + "router_z_loss_clip": 0.24731445, + "router_z_loss_mlp": 0.01361084, + "step": 14840, + "time_per_iteration": 3.0546875 + }, + { + "auxiliary_loss_clip": 0.01118219, + "auxiliary_loss_mlp": 0.01031261, + "balance_loss_clip": 1.04056168, + "balance_loss_mlp": 1.0182786, + "epoch": 0.8922891928453329, + "flos": 19459561645440.0, + "grad_norm": 2.081733469332718, + "language_loss": 0.63915014, + "learning_rate": 1.2040763883703074e-07, + "loss": 0.66064501, + "num_input_tokens_seen": 320130810, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.12976074, + "step": 14841, + "time_per_iteration": 2.4096388816833496 + }, + { + "auxiliary_loss_clip": 0.01113339, + "auxiliary_loss_mlp": 0.01036301, + "balance_loss_clip": 1.04297245, + "balance_loss_mlp": 1.02547073, + "epoch": 0.8923493160980009, + "flos": 23367396873600.0, + "grad_norm": 1.6426532655129822, + "language_loss": 0.6831165, + "learning_rate": 1.2027458159999438e-07, + "loss": 0.70461291, + "num_input_tokens_seen": 320152170, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.10821533, + "step": 14842, + "time_per_iteration": 2.473745107650757 + }, + { + "auxiliary_loss_clip": 0.0111267, + "auxiliary_loss_mlp": 0.01033437, + "balance_loss_clip": 1.04253078, + "balance_loss_mlp": 1.02252936, + "epoch": 0.8924094393506689, + "flos": 26177047637760.0, + "grad_norm": 1.9500075814750077, + "language_loss": 0.80067873, + "learning_rate": 1.2014159564233373e-07, + "loss": 0.8221398, + "num_input_tokens_seen": 320172360, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.10906982, + "step": 14843, + "time_per_iteration": 2.476642370223999 + }, + { + "auxiliary_loss_clip": 0.01118824, + "auxiliary_loss_mlp": 0.01028907, + "balance_loss_clip": 1.04398167, + "balance_loss_mlp": 1.01672363, + "epoch": 0.8924695626033369, + "flos": 22018520413440.0, + "grad_norm": 2.1293697003721266, + "language_loss": 0.68255252, + "learning_rate": 1.2000868096909257e-07, + "loss": 0.7040298, + "num_input_tokens_seen": 320192130, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.12176514, + "step": 14844, + "time_per_iteration": 3.9564645290374756 + }, + { + "auxiliary_loss_clip": 0.01117702, + "auxiliary_loss_mlp": 0.01028687, + "balance_loss_clip": 1.04446959, + "balance_loss_mlp": 1.01746392, + "epoch": 0.8925296858560048, + "flos": 14793940166400.0, + "grad_norm": 2.133816982235472, + "language_loss": 0.91155112, + "learning_rate": 1.1987583758531038e-07, + "loss": 0.93301499, + "num_input_tokens_seen": 320207760, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11230469, + "step": 14845, + "time_per_iteration": 2.4149036407470703 + }, + { + "auxiliary_loss_clip": 0.01113152, + "auxiliary_loss_mlp": 0.01028605, + "balance_loss_clip": 1.04147494, + "balance_loss_mlp": 1.01791215, + "epoch": 0.8925898091086728, + "flos": 22346636175360.0, + "grad_norm": 2.2767619121771703, + "language_loss": 0.72547311, + "learning_rate": 1.1974306549602476e-07, + "loss": 0.74689066, + "num_input_tokens_seen": 320225325, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.10693359, + "step": 14846, + "time_per_iteration": 2.438800096511841 + }, + { + "auxiliary_loss_clip": 0.01125834, + "auxiliary_loss_mlp": 0.01030716, + "balance_loss_clip": 1.0512017, + "balance_loss_mlp": 1.0191164, + "epoch": 0.8926499323613407, + "flos": 45806322067200.0, + "grad_norm": 1.6934700665624745, + "language_loss": 0.56812841, + "learning_rate": 1.1961036470627094e-07, + "loss": 0.5896939, + "num_input_tokens_seen": 320247645, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.11608887, + "step": 14847, + "time_per_iteration": 2.7138519287109375 + }, + { + "auxiliary_loss_clip": 0.01118913, + "auxiliary_loss_mlp": 0.01032405, + "balance_loss_clip": 1.04631031, + "balance_loss_mlp": 1.02174211, + "epoch": 0.8927100556140087, + "flos": 22127042378880.0, + "grad_norm": 1.9134242910218355, + "language_loss": 0.76778567, + "learning_rate": 1.1947773522108052e-07, + "loss": 0.78929877, + "num_input_tokens_seen": 320266005, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.10662842, + "step": 14848, + "time_per_iteration": 3.8584961891174316 + }, + { + "auxiliary_loss_clip": 0.01117057, + "auxiliary_loss_mlp": 0.01028826, + "balance_loss_clip": 1.04437017, + "balance_loss_mlp": 1.01729274, + "epoch": 0.8927701788666766, + "flos": 28330143655680.0, + "grad_norm": 2.1399370102348643, + "language_loss": 0.69191992, + "learning_rate": 1.1934517704548251e-07, + "loss": 0.71337879, + "num_input_tokens_seen": 320285555, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.1151123, + "step": 14849, + "time_per_iteration": 2.5366570949554443 + }, + { + "auxiliary_loss_clip": 0.0111689, + "auxiliary_loss_mlp": 0.01031716, + "balance_loss_clip": 1.04503536, + "balance_loss_mlp": 1.02015293, + "epoch": 0.8928303021193447, + "flos": 25294973351040.0, + "grad_norm": 1.6099839050386622, + "language_loss": 0.80937785, + "learning_rate": 1.1921269018450364e-07, + "loss": 0.83086389, + "num_input_tokens_seen": 320305395, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11572266, + "step": 14850, + "time_per_iteration": 2.4650049209594727 + }, + { + "auxiliary_loss_clip": 0.01119547, + "auxiliary_loss_mlp": 0.01031463, + "balance_loss_clip": 1.04726434, + "balance_loss_mlp": 1.02032876, + "epoch": 0.8928904253720126, + "flos": 22236713579520.0, + "grad_norm": 1.5409166510316494, + "language_loss": 0.74420404, + "learning_rate": 1.1908027464316872e-07, + "loss": 0.76571417, + "num_input_tokens_seen": 320324220, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11138916, + "step": 14851, + "time_per_iteration": 2.470978260040283 + }, + { + "auxiliary_loss_clip": 0.01110013, + "auxiliary_loss_mlp": 0.01029163, + "balance_loss_clip": 1.04011512, + "balance_loss_mlp": 1.01754022, + "epoch": 0.8929505486246806, + "flos": 27092374940160.0, + "grad_norm": 1.5379388462224677, + "language_loss": 0.78362656, + "learning_rate": 1.1894793042649775e-07, + "loss": 0.80501837, + "num_input_tokens_seen": 320347195, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.11627197, + "step": 14852, + "time_per_iteration": 2.4901649951934814 + }, + { + "auxiliary_loss_clip": 0.01112839, + "auxiliary_loss_mlp": 0.01029739, + "balance_loss_clip": 1.04414141, + "balance_loss_mlp": 1.01847339, + "epoch": 0.8930106718773486, + "flos": 23039352938880.0, + "grad_norm": 1.6102348971851586, + "language_loss": 0.69749439, + "learning_rate": 1.1881565753951006e-07, + "loss": 0.71892017, + "num_input_tokens_seen": 320366850, + "router_z_loss_clip": 0.68701172, + "router_z_loss_mlp": 0.1126709, + "step": 14853, + "time_per_iteration": 2.5355782508850098 + }, + { + "auxiliary_loss_clip": 0.01122749, + "auxiliary_loss_mlp": 0.01032567, + "balance_loss_clip": 1.04723418, + "balance_loss_mlp": 1.02000225, + "epoch": 0.8930707951300165, + "flos": 35626652887680.0, + "grad_norm": 1.6183740691388795, + "language_loss": 0.67283368, + "learning_rate": 1.1868345598722118e-07, + "loss": 0.69438684, + "num_input_tokens_seen": 320388895, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12561035, + "step": 14854, + "time_per_iteration": 2.649777889251709 + }, + { + "auxiliary_loss_clip": 0.01114188, + "auxiliary_loss_mlp": 0.01029819, + "balance_loss_clip": 1.04322255, + "balance_loss_mlp": 1.01901269, + "epoch": 0.8931309183826845, + "flos": 23039891642880.0, + "grad_norm": 1.4378950227658103, + "language_loss": 0.74806601, + "learning_rate": 1.1855132577464399e-07, + "loss": 0.7695061, + "num_input_tokens_seen": 320408520, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.10809326, + "step": 14855, + "time_per_iteration": 3.968839406967163 + }, + { + "auxiliary_loss_clip": 0.01112192, + "auxiliary_loss_mlp": 0.01025007, + "balance_loss_clip": 1.04172504, + "balance_loss_mlp": 1.0135808, + "epoch": 0.8931910416353525, + "flos": 26504624695680.0, + "grad_norm": 1.8919215425170488, + "language_loss": 0.6400795, + "learning_rate": 1.1841926690678893e-07, + "loss": 0.66145146, + "num_input_tokens_seen": 320427400, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.11425781, + "step": 14856, + "time_per_iteration": 2.530107021331787 + }, + { + "auxiliary_loss_clip": 0.01115744, + "auxiliary_loss_mlp": 0.01026287, + "balance_loss_clip": 1.04460406, + "balance_loss_mlp": 1.0150516, + "epoch": 0.8932511648880205, + "flos": 24973609345920.0, + "grad_norm": 1.8538690321870872, + "language_loss": 0.66467071, + "learning_rate": 1.1828727938866378e-07, + "loss": 0.68609095, + "num_input_tokens_seen": 320447570, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.11242676, + "step": 14857, + "time_per_iteration": 2.4481394290924072 + }, + { + "auxiliary_loss_clip": 0.01124952, + "auxiliary_loss_mlp": 0.01031394, + "balance_loss_clip": 1.04877377, + "balance_loss_mlp": 1.01946735, + "epoch": 0.8933112881406884, + "flos": 24460733001600.0, + "grad_norm": 2.629749975320658, + "language_loss": 0.75221956, + "learning_rate": 1.1815536322527408e-07, + "loss": 0.77378297, + "num_input_tokens_seen": 320464405, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.11932373, + "step": 14858, + "time_per_iteration": 2.4733853340148926 + }, + { + "auxiliary_loss_clip": 0.01117903, + "auxiliary_loss_mlp": 0.01026984, + "balance_loss_clip": 1.04673195, + "balance_loss_mlp": 1.01484251, + "epoch": 0.8933714113933564, + "flos": 28293083798400.0, + "grad_norm": 1.7644152307059935, + "language_loss": 0.69586897, + "learning_rate": 1.1802351842162139e-07, + "loss": 0.71731782, + "num_input_tokens_seen": 320485525, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.12139893, + "step": 14859, + "time_per_iteration": 2.5058765411376953 + }, + { + "auxiliary_loss_clip": 0.01111932, + "auxiliary_loss_mlp": 0.01025611, + "balance_loss_clip": 1.04568779, + "balance_loss_mlp": 1.0150193, + "epoch": 0.8934315346460243, + "flos": 21434864319360.0, + "grad_norm": 1.647395400367965, + "language_loss": 0.7561478, + "learning_rate": 1.1789174498270526e-07, + "loss": 0.77752322, + "num_input_tokens_seen": 320506725, + "router_z_loss_clip": 0.66259766, + "router_z_loss_mlp": 0.10595703, + "step": 14860, + "time_per_iteration": 2.4569356441497803 + }, + { + "auxiliary_loss_clip": 0.01120126, + "auxiliary_loss_mlp": 0.01033048, + "balance_loss_clip": 1.04712045, + "balance_loss_mlp": 1.02043533, + "epoch": 0.8934916578986923, + "flos": 23769596436480.0, + "grad_norm": 1.825759800716576, + "language_loss": 0.57648802, + "learning_rate": 1.1776004291352303e-07, + "loss": 0.59801972, + "num_input_tokens_seen": 320525425, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.12615967, + "step": 14861, + "time_per_iteration": 2.4623188972473145 + }, + { + "auxiliary_loss_clip": 0.0111731, + "auxiliary_loss_mlp": 0.01026483, + "balance_loss_clip": 1.04586434, + "balance_loss_mlp": 1.01534855, + "epoch": 0.8935517811513602, + "flos": 18916161719040.0, + "grad_norm": 1.9201972160127232, + "language_loss": 0.6378566, + "learning_rate": 1.176284122190685e-07, + "loss": 0.65929449, + "num_input_tokens_seen": 320543010, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.11126709, + "step": 14862, + "time_per_iteration": 2.420423984527588 + }, + { + "auxiliary_loss_clip": 0.01107295, + "auxiliary_loss_mlp": 0.01025872, + "balance_loss_clip": 1.03842795, + "balance_loss_mlp": 1.01473236, + "epoch": 0.8936119044040283, + "flos": 24061370613120.0, + "grad_norm": 1.7483360628133309, + "language_loss": 0.77977961, + "learning_rate": 1.1749685290433298e-07, + "loss": 0.80111128, + "num_input_tokens_seen": 320562180, + "router_z_loss_clip": 0.68798828, + "router_z_loss_mlp": 0.11132812, + "step": 14863, + "time_per_iteration": 2.434572458267212 + }, + { + "auxiliary_loss_clip": 0.0110802, + "auxiliary_loss_mlp": 0.01028516, + "balance_loss_clip": 1.03970385, + "balance_loss_mlp": 1.01794195, + "epoch": 0.8936720276566962, + "flos": 21324079797120.0, + "grad_norm": 1.8181111719446585, + "language_loss": 0.70839405, + "learning_rate": 1.1736536497430627e-07, + "loss": 0.72975934, + "num_input_tokens_seen": 320580395, + "router_z_loss_clip": 0.68261719, + "router_z_loss_mlp": 0.10571289, + "step": 14864, + "time_per_iteration": 2.4275546073913574 + }, + { + "auxiliary_loss_clip": 0.01121356, + "auxiliary_loss_mlp": 0.01036199, + "balance_loss_clip": 1.04590952, + "balance_loss_mlp": 1.02395606, + "epoch": 0.8937321509093642, + "flos": 18406122549120.0, + "grad_norm": 2.2765844012263527, + "language_loss": 0.76203007, + "learning_rate": 1.1723394843397283e-07, + "loss": 0.78360558, + "num_input_tokens_seen": 320599505, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12249756, + "step": 14865, + "time_per_iteration": 2.450880289077759 + }, + { + "auxiliary_loss_clip": 0.01114847, + "auxiliary_loss_mlp": 0.01032111, + "balance_loss_clip": 1.04411721, + "balance_loss_mlp": 1.02092898, + "epoch": 0.8937922741620322, + "flos": 22054754257920.0, + "grad_norm": 1.6857331131829585, + "language_loss": 0.71688724, + "learning_rate": 1.1710260328831668e-07, + "loss": 0.73835683, + "num_input_tokens_seen": 320619825, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11175537, + "step": 14866, + "time_per_iteration": 2.5551774501800537 + }, + { + "auxiliary_loss_clip": 0.01120173, + "auxiliary_loss_mlp": 0.0102959, + "balance_loss_clip": 1.04615784, + "balance_loss_mlp": 1.01650703, + "epoch": 0.8938523974147001, + "flos": 25664386775040.0, + "grad_norm": 1.7393367282964998, + "language_loss": 0.84090316, + "learning_rate": 1.1697132954231869e-07, + "loss": 0.86240077, + "num_input_tokens_seen": 320638515, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.13067627, + "step": 14867, + "time_per_iteration": 2.4674477577209473 + }, + { + "auxiliary_loss_clip": 0.01115317, + "auxiliary_loss_mlp": 0.01026303, + "balance_loss_clip": 1.04347789, + "balance_loss_mlp": 1.01611698, + "epoch": 0.8939125206673681, + "flos": 25742852035200.0, + "grad_norm": 1.77197783813031, + "language_loss": 0.80804056, + "learning_rate": 1.168401272009567e-07, + "loss": 0.82945675, + "num_input_tokens_seen": 320659430, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.10186768, + "step": 14868, + "time_per_iteration": 2.4981892108917236 + }, + { + "auxiliary_loss_clip": 0.01120761, + "auxiliary_loss_mlp": 0.01030641, + "balance_loss_clip": 1.0480907, + "balance_loss_mlp": 1.01866043, + "epoch": 0.8939726439200361, + "flos": 27344503480320.0, + "grad_norm": 1.7731010439714914, + "language_loss": 0.77235359, + "learning_rate": 1.167089962692056e-07, + "loss": 0.79386759, + "num_input_tokens_seen": 320679295, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11993408, + "step": 14869, + "time_per_iteration": 2.5791842937469482 + }, + { + "auxiliary_loss_clip": 0.01113471, + "auxiliary_loss_mlp": 0.01024622, + "balance_loss_clip": 1.04291487, + "balance_loss_mlp": 1.0130049, + "epoch": 0.8940327671727041, + "flos": 20338834671360.0, + "grad_norm": 1.5195845504115262, + "language_loss": 0.65415877, + "learning_rate": 1.1657793675203853e-07, + "loss": 0.67553973, + "num_input_tokens_seen": 320697535, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.1161499, + "step": 14870, + "time_per_iteration": 2.446721076965332 + }, + { + "auxiliary_loss_clip": 0.01050179, + "auxiliary_loss_mlp": 0.01003191, + "balance_loss_clip": 1.02559376, + "balance_loss_mlp": 1.00190258, + "epoch": 0.894092890425372, + "flos": 58410573235200.0, + "grad_norm": 0.7937946223173957, + "language_loss": 0.55936384, + "learning_rate": 1.1644694865442461e-07, + "loss": 0.57989758, + "num_input_tokens_seen": 320758635, + "router_z_loss_clip": 0.24584961, + "router_z_loss_mlp": 0.01289368, + "step": 14871, + "time_per_iteration": 3.117185354232788 + }, + { + "auxiliary_loss_clip": 0.01112228, + "auxiliary_loss_mlp": 0.01035215, + "balance_loss_clip": 1.04409003, + "balance_loss_mlp": 1.02446258, + "epoch": 0.89415301367804, + "flos": 19829657427840.0, + "grad_norm": 2.215731559882344, + "language_loss": 0.76285851, + "learning_rate": 1.16316031981331e-07, + "loss": 0.78433293, + "num_input_tokens_seen": 320777175, + "router_z_loss_clip": 0.68115234, + "router_z_loss_mlp": 0.10754395, + "step": 14872, + "time_per_iteration": 3.882413148880005 + }, + { + "auxiliary_loss_clip": 0.01112345, + "auxiliary_loss_mlp": 0.0102667, + "balance_loss_clip": 1.04404843, + "balance_loss_mlp": 1.01648927, + "epoch": 0.8942131369307079, + "flos": 25775781828480.0, + "grad_norm": 1.9689303322279206, + "language_loss": 0.67095995, + "learning_rate": 1.1618518673772215e-07, + "loss": 0.69235015, + "num_input_tokens_seen": 320797670, + "router_z_loss_clip": 0.68261719, + "router_z_loss_mlp": 0.10186768, + "step": 14873, + "time_per_iteration": 2.503068208694458 + }, + { + "auxiliary_loss_clip": 0.01112101, + "auxiliary_loss_mlp": 0.01032549, + "balance_loss_clip": 1.04269743, + "balance_loss_mlp": 1.02085423, + "epoch": 0.8942732601833759, + "flos": 23149024139520.0, + "grad_norm": 1.6459795757739886, + "language_loss": 0.60080504, + "learning_rate": 1.1605441292856033e-07, + "loss": 0.62225151, + "num_input_tokens_seen": 320817410, + "router_z_loss_clip": 0.69433594, + "router_z_loss_mlp": 0.11694336, + "step": 14874, + "time_per_iteration": 2.4778077602386475 + }, + { + "auxiliary_loss_clip": 0.01112582, + "auxiliary_loss_mlp": 0.01030776, + "balance_loss_clip": 1.03843367, + "balance_loss_mlp": 1.01841354, + "epoch": 0.8943333834360438, + "flos": 27855548231040.0, + "grad_norm": 2.0212056387002746, + "language_loss": 0.75867712, + "learning_rate": 1.1592371055880356e-07, + "loss": 0.78011072, + "num_input_tokens_seen": 320836745, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12365723, + "step": 14875, + "time_per_iteration": 2.4985008239746094 + }, + { + "auxiliary_loss_clip": 0.01116955, + "auxiliary_loss_mlp": 0.01032413, + "balance_loss_clip": 1.04120767, + "balance_loss_mlp": 1.017416, + "epoch": 0.8943935066887119, + "flos": 22163958581760.0, + "grad_norm": 5.012614088006888, + "language_loss": 0.77820075, + "learning_rate": 1.1579307963340857e-07, + "loss": 0.79969442, + "num_input_tokens_seen": 320853305, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.14996338, + "step": 14876, + "time_per_iteration": 2.429861545562744 + }, + { + "auxiliary_loss_clip": 0.01107131, + "auxiliary_loss_mlp": 0.01028122, + "balance_loss_clip": 1.03662753, + "balance_loss_mlp": 1.01735151, + "epoch": 0.8944536299413798, + "flos": 21470056669440.0, + "grad_norm": 1.8654798395410122, + "language_loss": 0.78645802, + "learning_rate": 1.156625201573287e-07, + "loss": 0.80781054, + "num_input_tokens_seen": 320872885, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.10772705, + "step": 14877, + "time_per_iteration": 2.4455044269561768 + }, + { + "auxiliary_loss_clip": 0.01110655, + "auxiliary_loss_mlp": 0.01027458, + "balance_loss_clip": 1.04015541, + "balance_loss_mlp": 1.01463079, + "epoch": 0.8945137531940478, + "flos": 17748777703680.0, + "grad_norm": 2.393747440267487, + "language_loss": 0.75367272, + "learning_rate": 1.155320321355151e-07, + "loss": 0.7750538, + "num_input_tokens_seen": 320889755, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.12823486, + "step": 14878, + "time_per_iteration": 2.4348080158233643 + }, + { + "auxiliary_loss_clip": 0.01119125, + "auxiliary_loss_mlp": 0.01026293, + "balance_loss_clip": 1.04524028, + "balance_loss_mlp": 1.01359141, + "epoch": 0.8945738764467158, + "flos": 21142264129920.0, + "grad_norm": 1.9527906462986524, + "language_loss": 0.76368046, + "learning_rate": 1.1540161557291539e-07, + "loss": 0.78513467, + "num_input_tokens_seen": 320907860, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.1270752, + "step": 14879, + "time_per_iteration": 2.4277915954589844 + }, + { + "auxiliary_loss_clip": 0.01127541, + "auxiliary_loss_mlp": 0.01028376, + "balance_loss_clip": 1.05260098, + "balance_loss_mlp": 1.01671171, + "epoch": 0.8946339996993837, + "flos": 14903000835840.0, + "grad_norm": 1.9514174510871805, + "language_loss": 0.74570036, + "learning_rate": 1.1527127047447538e-07, + "loss": 0.7672596, + "num_input_tokens_seen": 320925825, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11663818, + "step": 14880, + "time_per_iteration": 2.419550895690918 + }, + { + "auxiliary_loss_clip": 0.0110678, + "auxiliary_loss_mlp": 0.010275, + "balance_loss_clip": 1.03608382, + "balance_loss_mlp": 1.015692, + "epoch": 0.8946941229520518, + "flos": 27382173868800.0, + "grad_norm": 1.9231693014908375, + "language_loss": 0.82922971, + "learning_rate": 1.1514099684513822e-07, + "loss": 0.85057247, + "num_input_tokens_seen": 320946165, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.11804199, + "step": 14881, + "time_per_iteration": 2.595905303955078 + }, + { + "auxiliary_loss_clip": 0.01101173, + "auxiliary_loss_mlp": 0.01027006, + "balance_loss_clip": 1.03255701, + "balance_loss_mlp": 1.01551437, + "epoch": 0.8947542462047197, + "flos": 31796277338880.0, + "grad_norm": 1.996847199974498, + "language_loss": 0.67572045, + "learning_rate": 1.1501079468984287e-07, + "loss": 0.69700229, + "num_input_tokens_seen": 320969330, + "router_z_loss_clip": 0.68701172, + "router_z_loss_mlp": 0.1149292, + "step": 14882, + "time_per_iteration": 2.5200281143188477 + }, + { + "auxiliary_loss_clip": 0.01123322, + "auxiliary_loss_mlp": 0.01030929, + "balance_loss_clip": 1.04506493, + "balance_loss_mlp": 1.01723742, + "epoch": 0.8948143694573877, + "flos": 20883599314560.0, + "grad_norm": 3.3341756399870084, + "language_loss": 0.75340098, + "learning_rate": 1.1488066401352691e-07, + "loss": 0.77494347, + "num_input_tokens_seen": 320985055, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.13677979, + "step": 14883, + "time_per_iteration": 2.3839616775512695 + }, + { + "auxiliary_loss_clip": 0.01109632, + "auxiliary_loss_mlp": 0.01027739, + "balance_loss_clip": 1.04153526, + "balance_loss_mlp": 1.01692057, + "epoch": 0.8948744927100556, + "flos": 28215552291840.0, + "grad_norm": 1.7063968580044742, + "language_loss": 0.72091216, + "learning_rate": 1.147506048211253e-07, + "loss": 0.74228585, + "num_input_tokens_seen": 321004720, + "router_z_loss_clip": 0.68066406, + "router_z_loss_mlp": 0.10827637, + "step": 14884, + "time_per_iteration": 2.4932913780212402 + }, + { + "auxiliary_loss_clip": 0.01107861, + "auxiliary_loss_mlp": 0.01025261, + "balance_loss_clip": 1.03842783, + "balance_loss_mlp": 1.01437747, + "epoch": 0.8949346159627236, + "flos": 21902672073600.0, + "grad_norm": 1.629662779157454, + "language_loss": 0.75446028, + "learning_rate": 1.1462061711756987e-07, + "loss": 0.77579153, + "num_input_tokens_seen": 321022350, + "router_z_loss_clip": 0.69433594, + "router_z_loss_mlp": 0.10882568, + "step": 14885, + "time_per_iteration": 2.4374940395355225 + }, + { + "auxiliary_loss_clip": 0.01114248, + "auxiliary_loss_mlp": 0.01030347, + "balance_loss_clip": 1.03871274, + "balance_loss_mlp": 1.01801467, + "epoch": 0.8949947392153915, + "flos": 21359128492800.0, + "grad_norm": 1.881452144930569, + "language_loss": 0.81799746, + "learning_rate": 1.1449070090778911e-07, + "loss": 0.83944339, + "num_input_tokens_seen": 321040450, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12335205, + "step": 14886, + "time_per_iteration": 2.4776461124420166 + }, + { + "auxiliary_loss_clip": 0.01113414, + "auxiliary_loss_mlp": 0.01026533, + "balance_loss_clip": 1.04182875, + "balance_loss_mlp": 1.01531529, + "epoch": 0.8950548624680595, + "flos": 52445342799360.0, + "grad_norm": 1.5637911619750318, + "language_loss": 0.63455832, + "learning_rate": 1.1436085619671043e-07, + "loss": 0.65595782, + "num_input_tokens_seen": 321063970, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11218262, + "step": 14887, + "time_per_iteration": 4.200463056564331 + }, + { + "auxiliary_loss_clip": 0.01112228, + "auxiliary_loss_mlp": 0.01033632, + "balance_loss_clip": 1.03968537, + "balance_loss_mlp": 1.02069211, + "epoch": 0.8951149857207275, + "flos": 20121323863680.0, + "grad_norm": 1.6366254536834441, + "language_loss": 0.60620302, + "learning_rate": 1.1423108298925698e-07, + "loss": 0.62766165, + "num_input_tokens_seen": 321083840, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.12945557, + "step": 14888, + "time_per_iteration": 2.465959072113037 + }, + { + "auxiliary_loss_clip": 0.01110847, + "auxiliary_loss_mlp": 0.01027992, + "balance_loss_clip": 1.03799379, + "balance_loss_mlp": 1.01589799, + "epoch": 0.8951751089733955, + "flos": 29862631463040.0, + "grad_norm": 2.4053458369942238, + "language_loss": 0.70390785, + "learning_rate": 1.1410138129034952e-07, + "loss": 0.72529626, + "num_input_tokens_seen": 321104165, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.12084961, + "step": 14889, + "time_per_iteration": 2.5084280967712402 + }, + { + "auxiliary_loss_clip": 0.01120211, + "auxiliary_loss_mlp": 0.01033171, + "balance_loss_clip": 1.04498649, + "balance_loss_mlp": 1.02071929, + "epoch": 0.8952352322260634, + "flos": 15262789415040.0, + "grad_norm": 2.5097558930835113, + "language_loss": 0.71925896, + "learning_rate": 1.1397175110490676e-07, + "loss": 0.74079281, + "num_input_tokens_seen": 321117290, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.12445068, + "step": 14890, + "time_per_iteration": 2.3840041160583496 + }, + { + "auxiliary_loss_clip": 0.01116636, + "auxiliary_loss_mlp": 0.01028562, + "balance_loss_clip": 1.04507387, + "balance_loss_mlp": 1.01670671, + "epoch": 0.8952953554787314, + "flos": 26798338206720.0, + "grad_norm": 1.5358608565411738, + "language_loss": 0.75994766, + "learning_rate": 1.1384219243784454e-07, + "loss": 0.78139961, + "num_input_tokens_seen": 321137115, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11859131, + "step": 14891, + "time_per_iteration": 3.9242522716522217 + }, + { + "auxiliary_loss_clip": 0.01116651, + "auxiliary_loss_mlp": 0.01043463, + "balance_loss_clip": 1.03879428, + "balance_loss_mlp": 1.0284369, + "epoch": 0.8953554787313994, + "flos": 14137205852160.0, + "grad_norm": 1.8590336493483999, + "language_loss": 0.76812893, + "learning_rate": 1.1371270529407517e-07, + "loss": 0.78973007, + "num_input_tokens_seen": 321154490, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.15032959, + "step": 14892, + "time_per_iteration": 2.3969781398773193 + }, + { + "auxiliary_loss_clip": 0.01116355, + "auxiliary_loss_mlp": 0.01030394, + "balance_loss_clip": 1.04462385, + "balance_loss_mlp": 1.01889682, + "epoch": 0.8954156019840673, + "flos": 25703314139520.0, + "grad_norm": 1.465698618601443, + "language_loss": 0.81691897, + "learning_rate": 1.1358328967850895e-07, + "loss": 0.83838654, + "num_input_tokens_seen": 321175625, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11505127, + "step": 14893, + "time_per_iteration": 2.5690231323242188 + }, + { + "auxiliary_loss_clip": 0.01114345, + "auxiliary_loss_mlp": 0.01034207, + "balance_loss_clip": 1.04249454, + "balance_loss_mlp": 1.02297735, + "epoch": 0.8954757252367354, + "flos": 21907987286400.0, + "grad_norm": 1.7840853121080378, + "language_loss": 0.74773407, + "learning_rate": 1.1345394559605348e-07, + "loss": 0.76921958, + "num_input_tokens_seen": 321193895, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11236572, + "step": 14894, + "time_per_iteration": 2.4062020778656006 + }, + { + "auxiliary_loss_clip": 0.01118912, + "auxiliary_loss_mlp": 0.01031475, + "balance_loss_clip": 1.04523575, + "balance_loss_mlp": 1.01987576, + "epoch": 0.8955358484894033, + "flos": 12970396454400.0, + "grad_norm": 1.6781185574897808, + "language_loss": 0.66855001, + "learning_rate": 1.1332467305161352e-07, + "loss": 0.69005388, + "num_input_tokens_seen": 321211610, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.1159668, + "step": 14895, + "time_per_iteration": 2.4108901023864746 + }, + { + "auxiliary_loss_clip": 0.0111981, + "auxiliary_loss_mlp": 0.01027592, + "balance_loss_clip": 1.04492795, + "balance_loss_mlp": 1.01486015, + "epoch": 0.8955959717420713, + "flos": 17273966797440.0, + "grad_norm": 1.5944694096771626, + "language_loss": 0.66996551, + "learning_rate": 1.1319547205009094e-07, + "loss": 0.69143951, + "num_input_tokens_seen": 321229805, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12738037, + "step": 14896, + "time_per_iteration": 2.453456163406372 + }, + { + "auxiliary_loss_clip": 0.01112252, + "auxiliary_loss_mlp": 0.01026756, + "balance_loss_clip": 1.04044628, + "balance_loss_mlp": 1.01546073, + "epoch": 0.8956560949947392, + "flos": 14793868339200.0, + "grad_norm": 1.65275552640619, + "language_loss": 0.7543205, + "learning_rate": 1.1306634259638492e-07, + "loss": 0.77571058, + "num_input_tokens_seen": 321247165, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11291504, + "step": 14897, + "time_per_iteration": 2.431605339050293 + }, + { + "auxiliary_loss_clip": 0.01048299, + "auxiliary_loss_mlp": 0.01005401, + "balance_loss_clip": 1.02379799, + "balance_loss_mlp": 1.00417638, + "epoch": 0.8957162182474072, + "flos": 63607817957760.0, + "grad_norm": 0.7419332123288452, + "language_loss": 0.55354404, + "learning_rate": 1.129372846953931e-07, + "loss": 0.57408106, + "num_input_tokens_seen": 321308425, + "router_z_loss_clip": 0.24536133, + "router_z_loss_mlp": 0.01225281, + "step": 14898, + "time_per_iteration": 3.0775113105773926 + }, + { + "auxiliary_loss_clip": 0.01126178, + "auxiliary_loss_mlp": 0.01029494, + "balance_loss_clip": 1.04944384, + "balance_loss_mlp": 1.01761448, + "epoch": 0.8957763415000751, + "flos": 25009843190400.0, + "grad_norm": 1.5673622021429032, + "language_loss": 0.70066959, + "learning_rate": 1.12808298352008e-07, + "loss": 0.72222626, + "num_input_tokens_seen": 321329295, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.11877441, + "step": 14899, + "time_per_iteration": 3.992710590362549 + }, + { + "auxiliary_loss_clip": 0.01130437, + "auxiliary_loss_mlp": 0.01031118, + "balance_loss_clip": 1.05446172, + "balance_loss_mlp": 1.01820219, + "epoch": 0.8958364647527431, + "flos": 19828615933440.0, + "grad_norm": 2.0376096766451552, + "language_loss": 0.73780221, + "learning_rate": 1.1267938357112106e-07, + "loss": 0.75941777, + "num_input_tokens_seen": 321347580, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12927246, + "step": 14900, + "time_per_iteration": 2.4541921615600586 + }, + { + "auxiliary_loss_clip": 0.01040862, + "auxiliary_loss_mlp": 0.01002208, + "balance_loss_clip": 1.01609993, + "balance_loss_mlp": 1.00081468, + "epoch": 0.895896588005411, + "flos": 65537190115200.0, + "grad_norm": 0.7653328508822433, + "language_loss": 0.61767018, + "learning_rate": 1.1255054035762124e-07, + "loss": 0.63810086, + "num_input_tokens_seen": 321407820, + "router_z_loss_clip": 0.24755859, + "router_z_loss_mlp": 0.01394653, + "step": 14901, + "time_per_iteration": 3.033092498779297 + }, + { + "auxiliary_loss_clip": 0.0111653, + "auxiliary_loss_mlp": 0.01029864, + "balance_loss_clip": 1.0429914, + "balance_loss_mlp": 1.01819301, + "epoch": 0.8959567112580791, + "flos": 25591021246080.0, + "grad_norm": 1.6224662688512328, + "language_loss": 0.70451593, + "learning_rate": 1.1242176871639441e-07, + "loss": 0.72597992, + "num_input_tokens_seen": 321426745, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11663818, + "step": 14902, + "time_per_iteration": 2.5364835262298584 + }, + { + "auxiliary_loss_clip": 0.01106524, + "auxiliary_loss_mlp": 0.01027905, + "balance_loss_clip": 1.03852582, + "balance_loss_mlp": 1.01704502, + "epoch": 0.896016834510747, + "flos": 24201780877440.0, + "grad_norm": 1.853985450219374, + "language_loss": 0.77871495, + "learning_rate": 1.1229306865232313e-07, + "loss": 0.80005926, + "num_input_tokens_seen": 321446165, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 0.10870361, + "step": 14903, + "time_per_iteration": 2.5130529403686523 + }, + { + "auxiliary_loss_clip": 0.01114769, + "auxiliary_loss_mlp": 0.01038146, + "balance_loss_clip": 1.0399754, + "balance_loss_mlp": 1.02399004, + "epoch": 0.896076957763415, + "flos": 23075945919360.0, + "grad_norm": 2.3878227855340333, + "language_loss": 0.72951025, + "learning_rate": 1.121644401702877e-07, + "loss": 0.75103939, + "num_input_tokens_seen": 321465285, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.14147949, + "step": 14904, + "time_per_iteration": 2.4663538932800293 + }, + { + "auxiliary_loss_clip": 0.01115546, + "auxiliary_loss_mlp": 0.0102679, + "balance_loss_clip": 1.04121375, + "balance_loss_mlp": 1.01362383, + "epoch": 0.8961370810160829, + "flos": 22236605838720.0, + "grad_norm": 2.1947823566113707, + "language_loss": 0.74877769, + "learning_rate": 1.12035883275166e-07, + "loss": 0.77020109, + "num_input_tokens_seen": 321483670, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.13165283, + "step": 14905, + "time_per_iteration": 2.4381234645843506 + }, + { + "auxiliary_loss_clip": 0.01114538, + "auxiliary_loss_mlp": 0.0102924, + "balance_loss_clip": 1.04219103, + "balance_loss_mlp": 1.01802278, + "epoch": 0.8961972042687509, + "flos": 23072318645760.0, + "grad_norm": 1.6901533474099715, + "language_loss": 0.76722509, + "learning_rate": 1.1190739797183279e-07, + "loss": 0.78866279, + "num_input_tokens_seen": 321501190, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11218262, + "step": 14906, + "time_per_iteration": 2.5149948596954346 + }, + { + "auxiliary_loss_clip": 0.0112071, + "auxiliary_loss_mlp": 0.01035587, + "balance_loss_clip": 1.04514694, + "balance_loss_mlp": 1.02356434, + "epoch": 0.896257327521419, + "flos": 18185882307840.0, + "grad_norm": 1.7406867944905104, + "language_loss": 0.74132031, + "learning_rate": 1.1177898426515996e-07, + "loss": 0.76288331, + "num_input_tokens_seen": 321518540, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12011719, + "step": 14907, + "time_per_iteration": 2.4511613845825195 + }, + { + "auxiliary_loss_clip": 0.0111987, + "auxiliary_loss_mlp": 0.01037029, + "balance_loss_clip": 1.04734743, + "balance_loss_mlp": 1.02520919, + "epoch": 0.8963174507740869, + "flos": 17895472848000.0, + "grad_norm": 1.704074392319001, + "language_loss": 0.8328476, + "learning_rate": 1.1165064216001785e-07, + "loss": 0.85441661, + "num_input_tokens_seen": 321536555, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11816406, + "step": 14908, + "time_per_iteration": 2.426344156265259 + }, + { + "auxiliary_loss_clip": 0.01122295, + "auxiliary_loss_mlp": 0.01031161, + "balance_loss_clip": 1.04607224, + "balance_loss_mlp": 1.0184232, + "epoch": 0.8963775740267549, + "flos": 21032269706880.0, + "grad_norm": 2.0544029484004107, + "language_loss": 0.70562088, + "learning_rate": 1.1152237166127232e-07, + "loss": 0.72715545, + "num_input_tokens_seen": 321557655, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.1272583, + "step": 14909, + "time_per_iteration": 2.4334161281585693 + }, + { + "auxiliary_loss_clip": 0.01125287, + "auxiliary_loss_mlp": 0.01033045, + "balance_loss_clip": 1.05084908, + "balance_loss_mlp": 1.02041435, + "epoch": 0.8964376972794228, + "flos": 23179619548800.0, + "grad_norm": 2.2457342071734563, + "language_loss": 0.72147077, + "learning_rate": 1.113941727737877e-07, + "loss": 0.74305409, + "num_input_tokens_seen": 321576160, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.12634277, + "step": 14910, + "time_per_iteration": 2.4434452056884766 + }, + { + "auxiliary_loss_clip": 0.01110758, + "auxiliary_loss_mlp": 0.01024705, + "balance_loss_clip": 1.04047775, + "balance_loss_mlp": 1.01407766, + "epoch": 0.8964978205320908, + "flos": 24972998814720.0, + "grad_norm": 3.3946093606707692, + "language_loss": 0.62926787, + "learning_rate": 1.1126604550242502e-07, + "loss": 0.65062249, + "num_input_tokens_seen": 321596205, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.10632324, + "step": 14911, + "time_per_iteration": 2.4642581939697266 + }, + { + "auxiliary_loss_clip": 0.01116026, + "auxiliary_loss_mlp": 0.01030058, + "balance_loss_clip": 1.04368007, + "balance_loss_mlp": 1.0184412, + "epoch": 0.8965579437847587, + "flos": 19172025273600.0, + "grad_norm": 1.901379808360029, + "language_loss": 0.74925536, + "learning_rate": 1.111379898520437e-07, + "loss": 0.77071619, + "num_input_tokens_seen": 321614800, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11627197, + "step": 14912, + "time_per_iteration": 2.5307695865631104 + }, + { + "auxiliary_loss_clip": 0.01124016, + "auxiliary_loss_mlp": 0.01034305, + "balance_loss_clip": 1.05118942, + "balance_loss_mlp": 1.02231848, + "epoch": 0.8966180670374267, + "flos": 24276690691200.0, + "grad_norm": 1.8141861539465103, + "language_loss": 0.81905448, + "learning_rate": 1.1101000582749876e-07, + "loss": 0.84063768, + "num_input_tokens_seen": 321633445, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11981201, + "step": 14913, + "time_per_iteration": 2.4701874256134033 + }, + { + "auxiliary_loss_clip": 0.0111755, + "auxiliary_loss_mlp": 0.01035235, + "balance_loss_clip": 1.0438652, + "balance_loss_mlp": 1.02241385, + "epoch": 0.8966781902900947, + "flos": 13553190622080.0, + "grad_norm": 2.6641099291467456, + "language_loss": 0.61140919, + "learning_rate": 1.1088209343364407e-07, + "loss": 0.63293707, + "num_input_tokens_seen": 321650890, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.12835693, + "step": 14914, + "time_per_iteration": 2.455266237258911 + }, + { + "auxiliary_loss_clip": 0.01045662, + "auxiliary_loss_mlp": 0.01002697, + "balance_loss_clip": 1.02129674, + "balance_loss_mlp": 1.00148833, + "epoch": 0.8967383135427627, + "flos": 65066114223360.0, + "grad_norm": 0.7191898858827992, + "language_loss": 0.55050743, + "learning_rate": 1.1075425267532956e-07, + "loss": 0.57099104, + "num_input_tokens_seen": 321710960, + "router_z_loss_clip": 0.2434082, + "router_z_loss_mlp": 0.01208496, + "step": 14915, + "time_per_iteration": 3.075106143951416 + }, + { + "auxiliary_loss_clip": 0.01114048, + "auxiliary_loss_mlp": 0.01029431, + "balance_loss_clip": 1.04449916, + "balance_loss_mlp": 1.01861835, + "epoch": 0.8967984367954306, + "flos": 29713027317120.0, + "grad_norm": 1.4458958618014992, + "language_loss": 0.71620792, + "learning_rate": 1.1062648355740289e-07, + "loss": 0.73764271, + "num_input_tokens_seen": 321733290, + "router_z_loss_clip": 0.69580078, + "router_z_loss_mlp": 0.10809326, + "step": 14916, + "time_per_iteration": 4.00837779045105 + }, + { + "auxiliary_loss_clip": 0.01113693, + "auxiliary_loss_mlp": 0.01027713, + "balance_loss_clip": 1.04216218, + "balance_loss_mlp": 1.0169189, + "epoch": 0.8968585600480986, + "flos": 25702488126720.0, + "grad_norm": 1.8699355912882534, + "language_loss": 0.77857262, + "learning_rate": 1.1049878608470931e-07, + "loss": 0.79998672, + "num_input_tokens_seen": 321753120, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.10797119, + "step": 14917, + "time_per_iteration": 2.473447799682617 + }, + { + "auxiliary_loss_clip": 0.01122356, + "auxiliary_loss_mlp": 0.01043488, + "balance_loss_clip": 1.0438081, + "balance_loss_mlp": 1.02967739, + "epoch": 0.8969186833007665, + "flos": 30044698525440.0, + "grad_norm": 2.658887457118824, + "language_loss": 0.68642354, + "learning_rate": 1.1037116026209137e-07, + "loss": 0.70808202, + "num_input_tokens_seen": 321772840, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.13818359, + "step": 14918, + "time_per_iteration": 2.603980541229248 + }, + { + "auxiliary_loss_clip": 0.01118477, + "auxiliary_loss_mlp": 0.01029402, + "balance_loss_clip": 1.04400623, + "balance_loss_mlp": 1.01832116, + "epoch": 0.8969788065534345, + "flos": 22818143030400.0, + "grad_norm": 1.7335472785466595, + "language_loss": 0.83559132, + "learning_rate": 1.102436060943881e-07, + "loss": 0.85707009, + "num_input_tokens_seen": 321791020, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11077881, + "step": 14919, + "time_per_iteration": 2.4910614490509033 + }, + { + "auxiliary_loss_clip": 0.01117011, + "auxiliary_loss_mlp": 0.0102716, + "balance_loss_clip": 1.04243112, + "balance_loss_mlp": 1.01503658, + "epoch": 0.8970389298061026, + "flos": 13261488272640.0, + "grad_norm": 3.3352391149145695, + "language_loss": 0.72448313, + "learning_rate": 1.1011612358643696e-07, + "loss": 0.74592483, + "num_input_tokens_seen": 321810075, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.12121582, + "step": 14920, + "time_per_iteration": 2.4123008251190186 + }, + { + "auxiliary_loss_clip": 0.01120348, + "auxiliary_loss_mlp": 0.01030083, + "balance_loss_clip": 1.04790068, + "balance_loss_mlp": 1.01760197, + "epoch": 0.8970990530587705, + "flos": 10266071345280.0, + "grad_norm": 2.310301781787661, + "language_loss": 0.91059911, + "learning_rate": 1.0998871274307164e-07, + "loss": 0.9321034, + "num_input_tokens_seen": 321822635, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.12469482, + "step": 14921, + "time_per_iteration": 2.395012378692627 + }, + { + "auxiliary_loss_clip": 0.01115419, + "auxiliary_loss_mlp": 0.01027444, + "balance_loss_clip": 1.04208803, + "balance_loss_mlp": 1.01586294, + "epoch": 0.8971591763114385, + "flos": 20302708567680.0, + "grad_norm": 1.9185626974568433, + "language_loss": 0.73988354, + "learning_rate": 1.0986137356912384e-07, + "loss": 0.76131213, + "num_input_tokens_seen": 321841130, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11578369, + "step": 14922, + "time_per_iteration": 2.4205009937286377 + }, + { + "auxiliary_loss_clip": 0.01113815, + "auxiliary_loss_mlp": 0.01029725, + "balance_loss_clip": 1.04278719, + "balance_loss_mlp": 1.01732135, + "epoch": 0.8972192995641064, + "flos": 23257043314560.0, + "grad_norm": 2.0770123505436158, + "language_loss": 0.70344478, + "learning_rate": 1.097341060694219e-07, + "loss": 0.7248801, + "num_input_tokens_seen": 321859855, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.12414551, + "step": 14923, + "time_per_iteration": 2.4443721771240234 + }, + { + "auxiliary_loss_clip": 0.01122311, + "auxiliary_loss_mlp": 0.0103094, + "balance_loss_clip": 1.04875565, + "balance_loss_mlp": 1.0179342, + "epoch": 0.8972794228167744, + "flos": 18369601395840.0, + "grad_norm": 2.516934808543511, + "language_loss": 0.70428979, + "learning_rate": 1.0960691024879221e-07, + "loss": 0.72582233, + "num_input_tokens_seen": 321877990, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.13024902, + "step": 14924, + "time_per_iteration": 2.3846986293792725 + }, + { + "auxiliary_loss_clip": 0.01115322, + "auxiliary_loss_mlp": 0.01031896, + "balance_loss_clip": 1.04276443, + "balance_loss_mlp": 1.02086914, + "epoch": 0.8973395460694423, + "flos": 23952058548480.0, + "grad_norm": 1.3596329097765596, + "language_loss": 0.71941233, + "learning_rate": 1.0947978611205844e-07, + "loss": 0.74088448, + "num_input_tokens_seen": 321898120, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11022949, + "step": 14925, + "time_per_iteration": 2.4787983894348145 + }, + { + "auxiliary_loss_clip": 0.01122333, + "auxiliary_loss_mlp": 0.01025617, + "balance_loss_clip": 1.04649949, + "balance_loss_mlp": 1.01337373, + "epoch": 0.8973996693221103, + "flos": 24970843998720.0, + "grad_norm": 1.550194831831337, + "language_loss": 0.82311535, + "learning_rate": 1.0935273366404008e-07, + "loss": 0.84459484, + "num_input_tokens_seen": 321918140, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12237549, + "step": 14926, + "time_per_iteration": 2.4543254375457764 + }, + { + "auxiliary_loss_clip": 0.01125162, + "auxiliary_loss_mlp": 0.01027916, + "balance_loss_clip": 1.0498662, + "balance_loss_mlp": 1.01644826, + "epoch": 0.8974597925747783, + "flos": 25738937452800.0, + "grad_norm": 1.5268661295067323, + "language_loss": 0.79285198, + "learning_rate": 1.092257529095555e-07, + "loss": 0.81438279, + "num_input_tokens_seen": 321938580, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.11462402, + "step": 14927, + "time_per_iteration": 2.504885673522949 + }, + { + "auxiliary_loss_clip": 0.01114992, + "auxiliary_loss_mlp": 0.01024311, + "balance_loss_clip": 1.0440439, + "balance_loss_mlp": 1.01383245, + "epoch": 0.8975199158274463, + "flos": 38071918131840.0, + "grad_norm": 1.4746924836570396, + "language_loss": 0.6624006, + "learning_rate": 1.0909884385341994e-07, + "loss": 0.68379366, + "num_input_tokens_seen": 321961135, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.1048584, + "step": 14928, + "time_per_iteration": 2.5732390880584717 + }, + { + "auxiliary_loss_clip": 0.01123674, + "auxiliary_loss_mlp": 0.01037956, + "balance_loss_clip": 1.04749131, + "balance_loss_mlp": 1.02371049, + "epoch": 0.8975800390801142, + "flos": 25411683617280.0, + "grad_norm": 1.7461736348516608, + "language_loss": 0.7068851, + "learning_rate": 1.0897200650044602e-07, + "loss": 0.72850144, + "num_input_tokens_seen": 321980945, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.14251709, + "step": 14929, + "time_per_iteration": 2.476088047027588 + }, + { + "auxiliary_loss_clip": 0.01115155, + "auxiliary_loss_mlp": 0.01029407, + "balance_loss_clip": 1.0441705, + "balance_loss_mlp": 1.0182786, + "epoch": 0.8976401623327822, + "flos": 21759604202880.0, + "grad_norm": 1.710252706539232, + "language_loss": 0.68260705, + "learning_rate": 1.0884524085544256e-07, + "loss": 0.70405269, + "num_input_tokens_seen": 322000350, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11126709, + "step": 14930, + "time_per_iteration": 2.413128137588501 + }, + { + "auxiliary_loss_clip": 0.01111945, + "auxiliary_loss_mlp": 0.01030317, + "balance_loss_clip": 1.04116869, + "balance_loss_mlp": 1.01853895, + "epoch": 0.8977002855854501, + "flos": 13845323934720.0, + "grad_norm": 1.6977221189300769, + "language_loss": 0.74839729, + "learning_rate": 1.0871854692321769e-07, + "loss": 0.76981992, + "num_input_tokens_seen": 322018980, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.11785889, + "step": 14931, + "time_per_iteration": 3.8421285152435303 + }, + { + "auxiliary_loss_clip": 0.01118922, + "auxiliary_loss_mlp": 0.01028036, + "balance_loss_clip": 1.04748392, + "balance_loss_mlp": 1.01702666, + "epoch": 0.8977604088381181, + "flos": 19427529692160.0, + "grad_norm": 1.5623661168284046, + "language_loss": 0.62890553, + "learning_rate": 1.0859192470857492e-07, + "loss": 0.65037513, + "num_input_tokens_seen": 322037675, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11010742, + "step": 14932, + "time_per_iteration": 2.432941436767578 + }, + { + "auxiliary_loss_clip": 0.01115884, + "auxiliary_loss_mlp": 0.01023821, + "balance_loss_clip": 1.0464766, + "balance_loss_mlp": 1.01337838, + "epoch": 0.8978205320907862, + "flos": 22742083981440.0, + "grad_norm": 1.8317245295258673, + "language_loss": 0.71743751, + "learning_rate": 1.0846537421631552e-07, + "loss": 0.73883462, + "num_input_tokens_seen": 322055130, + "router_z_loss_clip": 0.69384766, + "router_z_loss_mlp": 0.10449219, + "step": 14933, + "time_per_iteration": 2.4440736770629883 + }, + { + "auxiliary_loss_clip": 0.01114986, + "auxiliary_loss_mlp": 0.01030926, + "balance_loss_clip": 1.04255533, + "balance_loss_mlp": 1.01878452, + "epoch": 0.8978806553434541, + "flos": 21360529123200.0, + "grad_norm": 1.4216661119284846, + "language_loss": 0.74493057, + "learning_rate": 1.0833889545123898e-07, + "loss": 0.76638973, + "num_input_tokens_seen": 322074850, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.121521, + "step": 14934, + "time_per_iteration": 2.4811840057373047 + }, + { + "auxiliary_loss_clip": 0.01110856, + "auxiliary_loss_mlp": 0.01029494, + "balance_loss_clip": 1.04047108, + "balance_loss_mlp": 1.0178293, + "epoch": 0.8979407785961221, + "flos": 20924178704640.0, + "grad_norm": 1.747413717525634, + "language_loss": 0.60787034, + "learning_rate": 1.0821248841814123e-07, + "loss": 0.62927389, + "num_input_tokens_seen": 322093315, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.11651611, + "step": 14935, + "time_per_iteration": 3.89041805267334 + }, + { + "auxiliary_loss_clip": 0.01108483, + "auxiliary_loss_mlp": 0.01023097, + "balance_loss_clip": 1.03946495, + "balance_loss_mlp": 1.01155162, + "epoch": 0.89800090184879, + "flos": 25228934196480.0, + "grad_norm": 2.9811952539262783, + "language_loss": 0.76716787, + "learning_rate": 1.0808615312181512e-07, + "loss": 0.78848368, + "num_input_tokens_seen": 322112555, + "router_z_loss_clip": 0.69091797, + "router_z_loss_mlp": 0.11560059, + "step": 14936, + "time_per_iteration": 2.481614828109741 + }, + { + "auxiliary_loss_clip": 0.01109754, + "auxiliary_loss_mlp": 0.01030333, + "balance_loss_clip": 1.03807783, + "balance_loss_mlp": 1.01876986, + "epoch": 0.898061025101458, + "flos": 22562674525440.0, + "grad_norm": 1.6442766844326078, + "language_loss": 0.73895693, + "learning_rate": 1.0795988956705193e-07, + "loss": 0.76035786, + "num_input_tokens_seen": 322130440, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11566162, + "step": 14937, + "time_per_iteration": 2.4462685585021973 + }, + { + "auxiliary_loss_clip": 0.01047847, + "auxiliary_loss_mlp": 0.01003301, + "balance_loss_clip": 1.02362287, + "balance_loss_mlp": 1.00200713, + "epoch": 0.8981211483541259, + "flos": 56192551384320.0, + "grad_norm": 0.8450522831564089, + "language_loss": 0.63388896, + "learning_rate": 1.0783369775863915e-07, + "loss": 0.65440047, + "num_input_tokens_seen": 322187295, + "router_z_loss_clip": 0.24194336, + "router_z_loss_mlp": 0.01292419, + "step": 14938, + "time_per_iteration": 2.990461826324463 + }, + { + "auxiliary_loss_clip": 0.01111704, + "auxiliary_loss_mlp": 0.01028748, + "balance_loss_clip": 1.04077268, + "balance_loss_mlp": 1.01686263, + "epoch": 0.898181271606794, + "flos": 16392718523520.0, + "grad_norm": 2.1227165755501787, + "language_loss": 0.80362332, + "learning_rate": 1.0770757770136251e-07, + "loss": 0.82502782, + "num_input_tokens_seen": 322202965, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.11895752, + "step": 14939, + "time_per_iteration": 2.4228739738464355 + }, + { + "auxiliary_loss_clip": 0.0103429, + "auxiliary_loss_mlp": 0.01000281, + "balance_loss_clip": 1.0102129, + "balance_loss_mlp": 0.99890614, + "epoch": 0.8982413948594619, + "flos": 63440259989760.0, + "grad_norm": 0.7221416370784826, + "language_loss": 0.52895224, + "learning_rate": 1.0758152940000375e-07, + "loss": 0.54929793, + "num_input_tokens_seen": 322269490, + "router_z_loss_clip": 0.24072266, + "router_z_loss_mlp": 0.01376343, + "step": 14940, + "time_per_iteration": 3.2093634605407715 + }, + { + "auxiliary_loss_clip": 0.01111403, + "auxiliary_loss_mlp": 0.01028546, + "balance_loss_clip": 1.03952861, + "balance_loss_mlp": 1.01634467, + "epoch": 0.8983015181121299, + "flos": 21835340029440.0, + "grad_norm": 1.8269555791943106, + "language_loss": 0.77615428, + "learning_rate": 1.0745555285934327e-07, + "loss": 0.79755384, + "num_input_tokens_seen": 322288060, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.12194824, + "step": 14941, + "time_per_iteration": 2.433631658554077 + }, + { + "auxiliary_loss_clip": 0.01114925, + "auxiliary_loss_mlp": 0.01035215, + "balance_loss_clip": 1.04174376, + "balance_loss_mlp": 1.02321672, + "epoch": 0.8983616413647978, + "flos": 28949961767040.0, + "grad_norm": 2.3612534755365826, + "language_loss": 0.73436546, + "learning_rate": 1.0732964808415834e-07, + "loss": 0.75586683, + "num_input_tokens_seen": 322307930, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11999512, + "step": 14942, + "time_per_iteration": 4.033549785614014 + }, + { + "auxiliary_loss_clip": 0.01121634, + "auxiliary_loss_mlp": 0.010364, + "balance_loss_clip": 1.0471673, + "balance_loss_mlp": 1.02371597, + "epoch": 0.8984217646174658, + "flos": 17785083375360.0, + "grad_norm": 2.0032875130712338, + "language_loss": 0.79518032, + "learning_rate": 1.0720381507922205e-07, + "loss": 0.81676066, + "num_input_tokens_seen": 322326155, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.12695312, + "step": 14943, + "time_per_iteration": 2.4441821575164795 + }, + { + "auxiliary_loss_clip": 0.01114303, + "auxiliary_loss_mlp": 0.01034165, + "balance_loss_clip": 1.03947377, + "balance_loss_mlp": 1.0212723, + "epoch": 0.8984818878701337, + "flos": 23404528558080.0, + "grad_norm": 1.4746460091928713, + "language_loss": 0.71188009, + "learning_rate": 1.0707805384930701e-07, + "loss": 0.73336482, + "num_input_tokens_seen": 322345850, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.12896729, + "step": 14944, + "time_per_iteration": 2.4441771507263184 + }, + { + "auxiliary_loss_clip": 0.01117299, + "auxiliary_loss_mlp": 0.01026854, + "balance_loss_clip": 1.0430733, + "balance_loss_mlp": 1.01467645, + "epoch": 0.8985420111228017, + "flos": 22346061557760.0, + "grad_norm": 6.272724431015358, + "language_loss": 0.75867891, + "learning_rate": 1.0695236439918187e-07, + "loss": 0.78012049, + "num_input_tokens_seen": 322364715, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.12176514, + "step": 14945, + "time_per_iteration": 2.434347152709961 + }, + { + "auxiliary_loss_clip": 0.01124098, + "auxiliary_loss_mlp": 0.01038441, + "balance_loss_clip": 1.04398155, + "balance_loss_mlp": 1.02495265, + "epoch": 0.8986021343754698, + "flos": 21392776558080.0, + "grad_norm": 2.1690702735695493, + "language_loss": 0.73672473, + "learning_rate": 1.0682674673361302e-07, + "loss": 0.75835013, + "num_input_tokens_seen": 322383570, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.13500977, + "step": 14946, + "time_per_iteration": 2.4293675422668457 + }, + { + "auxiliary_loss_clip": 0.01113708, + "auxiliary_loss_mlp": 0.01026869, + "balance_loss_clip": 1.04132962, + "balance_loss_mlp": 1.01495981, + "epoch": 0.8986622576281377, + "flos": 21325372686720.0, + "grad_norm": 2.1585493465082686, + "language_loss": 0.64360678, + "learning_rate": 1.0670120085736334e-07, + "loss": 0.6650126, + "num_input_tokens_seen": 322401375, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11914062, + "step": 14947, + "time_per_iteration": 2.447575807571411 + }, + { + "auxiliary_loss_clip": 0.01113683, + "auxiliary_loss_mlp": 0.01031558, + "balance_loss_clip": 1.04218745, + "balance_loss_mlp": 1.0200603, + "epoch": 0.8987223808808057, + "flos": 23988292392960.0, + "grad_norm": 2.7440895245605237, + "language_loss": 0.69796777, + "learning_rate": 1.0657572677519411e-07, + "loss": 0.71942019, + "num_input_tokens_seen": 322421890, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.11499023, + "step": 14948, + "time_per_iteration": 2.4509644508361816 + }, + { + "auxiliary_loss_clip": 0.01113079, + "auxiliary_loss_mlp": 0.01031736, + "balance_loss_clip": 1.0392561, + "balance_loss_mlp": 1.01943922, + "epoch": 0.8987825041334736, + "flos": 41500956044160.0, + "grad_norm": 5.791598210172341, + "language_loss": 0.74617231, + "learning_rate": 1.0645032449186309e-07, + "loss": 0.76762044, + "num_input_tokens_seen": 322445730, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.1229248, + "step": 14949, + "time_per_iteration": 2.6203155517578125 + }, + { + "auxiliary_loss_clip": 0.01120448, + "auxiliary_loss_mlp": 0.01033575, + "balance_loss_clip": 1.04632509, + "balance_loss_mlp": 1.01977623, + "epoch": 0.8988426273861416, + "flos": 27564276844800.0, + "grad_norm": 1.6521709926270551, + "language_loss": 0.758506, + "learning_rate": 1.0632499401212513e-07, + "loss": 0.78004622, + "num_input_tokens_seen": 322464595, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.13800049, + "step": 14950, + "time_per_iteration": 2.5041539669036865 + }, + { + "auxiliary_loss_clip": 0.0111721, + "auxiliary_loss_mlp": 0.0102853, + "balance_loss_clip": 1.04710996, + "balance_loss_mlp": 1.01771212, + "epoch": 0.8989027506388095, + "flos": 17092653920640.0, + "grad_norm": 2.9790133888673664, + "language_loss": 0.66197062, + "learning_rate": 1.0619973534073334e-07, + "loss": 0.68342799, + "num_input_tokens_seen": 322483305, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.10809326, + "step": 14951, + "time_per_iteration": 2.438019275665283 + }, + { + "auxiliary_loss_clip": 0.01119295, + "auxiliary_loss_mlp": 0.01027797, + "balance_loss_clip": 1.04110336, + "balance_loss_mlp": 1.01629305, + "epoch": 0.8989628738914776, + "flos": 20555124416640.0, + "grad_norm": 2.207291699212228, + "language_loss": 0.73853397, + "learning_rate": 1.0607454848243769e-07, + "loss": 0.76000488, + "num_input_tokens_seen": 322501905, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.11499023, + "step": 14952, + "time_per_iteration": 2.4555678367614746 + }, + { + "auxiliary_loss_clip": 0.011214, + "auxiliary_loss_mlp": 0.01031754, + "balance_loss_clip": 1.04831409, + "balance_loss_mlp": 1.02052462, + "epoch": 0.8990229971441455, + "flos": 16251087196800.0, + "grad_norm": 2.5120336969688006, + "language_loss": 0.56892943, + "learning_rate": 1.0594943344198481e-07, + "loss": 0.59046096, + "num_input_tokens_seen": 322518135, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11230469, + "step": 14953, + "time_per_iteration": 2.499509811401367 + }, + { + "auxiliary_loss_clip": 0.0112737, + "auxiliary_loss_mlp": 0.01032333, + "balance_loss_clip": 1.0536449, + "balance_loss_mlp": 1.02059686, + "epoch": 0.8990831203968135, + "flos": 21981316901760.0, + "grad_norm": 2.2567308062635822, + "language_loss": 0.82604063, + "learning_rate": 1.0582439022411915e-07, + "loss": 0.84763765, + "num_input_tokens_seen": 322537905, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11743164, + "step": 14954, + "time_per_iteration": 2.489762783050537 + }, + { + "auxiliary_loss_clip": 0.01114353, + "auxiliary_loss_mlp": 0.01031172, + "balance_loss_clip": 1.04301023, + "balance_loss_mlp": 1.01884544, + "epoch": 0.8991432436494814, + "flos": 27447171528960.0, + "grad_norm": 1.8019739379472801, + "language_loss": 0.60188496, + "learning_rate": 1.0569941883358224e-07, + "loss": 0.62334019, + "num_input_tokens_seen": 322557945, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.12335205, + "step": 14955, + "time_per_iteration": 2.527863025665283 + }, + { + "auxiliary_loss_clip": 0.01109154, + "auxiliary_loss_mlp": 0.01029801, + "balance_loss_clip": 1.03882289, + "balance_loss_mlp": 1.01904869, + "epoch": 0.8992033669021494, + "flos": 21579835610880.0, + "grad_norm": 2.6648639771219123, + "language_loss": 0.55250168, + "learning_rate": 1.0557451927511341e-07, + "loss": 0.57389122, + "num_input_tokens_seen": 322575765, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.10754395, + "step": 14956, + "time_per_iteration": 2.4629809856414795 + }, + { + "auxiliary_loss_clip": 0.01108905, + "auxiliary_loss_mlp": 0.0103313, + "balance_loss_clip": 1.03716683, + "balance_loss_mlp": 1.0211494, + "epoch": 0.8992634901548173, + "flos": 28584211530240.0, + "grad_norm": 2.4927804119917645, + "language_loss": 0.80023748, + "learning_rate": 1.0544969155344863e-07, + "loss": 0.82165778, + "num_input_tokens_seen": 322595665, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11981201, + "step": 14957, + "time_per_iteration": 2.524341583251953 + }, + { + "auxiliary_loss_clip": 0.01122103, + "auxiliary_loss_mlp": 0.01029845, + "balance_loss_clip": 1.04410219, + "balance_loss_mlp": 1.01737607, + "epoch": 0.8993236134074853, + "flos": 19867435557120.0, + "grad_norm": 1.6245831800916501, + "language_loss": 0.78811455, + "learning_rate": 1.0532493567332123e-07, + "loss": 0.80963403, + "num_input_tokens_seen": 322614755, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.12481689, + "step": 14958, + "time_per_iteration": 2.4250383377075195 + }, + { + "auxiliary_loss_clip": 0.01112611, + "auxiliary_loss_mlp": 0.01029026, + "balance_loss_clip": 1.04172635, + "balance_loss_mlp": 1.01803446, + "epoch": 0.8993837366601534, + "flos": 19390649402880.0, + "grad_norm": 1.4309182328841288, + "language_loss": 0.74890018, + "learning_rate": 1.0520025163946277e-07, + "loss": 0.7703166, + "num_input_tokens_seen": 322633425, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.10992432, + "step": 14959, + "time_per_iteration": 2.4759795665740967 + }, + { + "auxiliary_loss_clip": 0.0110744, + "auxiliary_loss_mlp": 0.01029782, + "balance_loss_clip": 1.03827691, + "balance_loss_mlp": 1.0183084, + "epoch": 0.8994438599128213, + "flos": 18551740285440.0, + "grad_norm": 2.426427126647021, + "language_loss": 0.68324447, + "learning_rate": 1.0507563945660015e-07, + "loss": 0.70461667, + "num_input_tokens_seen": 322652065, + "router_z_loss_clip": 0.69238281, + "router_z_loss_mlp": 0.11474609, + "step": 14960, + "time_per_iteration": 3.8612780570983887 + }, + { + "auxiliary_loss_clip": 0.01112189, + "auxiliary_loss_mlp": 0.01025778, + "balance_loss_clip": 1.04041743, + "balance_loss_mlp": 1.01463223, + "epoch": 0.8995039831654893, + "flos": 24427587726720.0, + "grad_norm": 1.7592498107734584, + "language_loss": 0.65678746, + "learning_rate": 1.049510991294591e-07, + "loss": 0.6781671, + "num_input_tokens_seen": 322673275, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11151123, + "step": 14961, + "time_per_iteration": 2.47892689704895 + }, + { + "auxiliary_loss_clip": 0.01103693, + "auxiliary_loss_mlp": 0.01028311, + "balance_loss_clip": 1.0346719, + "balance_loss_mlp": 1.01690817, + "epoch": 0.8995641064181572, + "flos": 21251324799360.0, + "grad_norm": 1.5989202684560835, + "language_loss": 0.83014613, + "learning_rate": 1.0482663066276254e-07, + "loss": 0.85146618, + "num_input_tokens_seen": 322693375, + "router_z_loss_clip": 0.69042969, + "router_z_loss_mlp": 0.11407471, + "step": 14962, + "time_per_iteration": 2.4532053470611572 + }, + { + "auxiliary_loss_clip": 0.01122343, + "auxiliary_loss_mlp": 0.01027841, + "balance_loss_clip": 1.04514003, + "balance_loss_mlp": 1.01473427, + "epoch": 0.8996242296708252, + "flos": 23513661054720.0, + "grad_norm": 1.9577101336754277, + "language_loss": 0.76391894, + "learning_rate": 1.047022340612298e-07, + "loss": 0.78542078, + "num_input_tokens_seen": 322712615, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.13122559, + "step": 14963, + "time_per_iteration": 2.4775309562683105 + }, + { + "auxiliary_loss_clip": 0.01042381, + "auxiliary_loss_mlp": 0.01005862, + "balance_loss_clip": 1.01771975, + "balance_loss_mlp": 1.0044359, + "epoch": 0.8996843529234931, + "flos": 62403230430720.0, + "grad_norm": 0.7798107964424182, + "language_loss": 0.57496154, + "learning_rate": 1.0457790932957867e-07, + "loss": 0.59544396, + "num_input_tokens_seen": 322766855, + "router_z_loss_clip": 0.24731445, + "router_z_loss_mlp": 0.01425171, + "step": 14964, + "time_per_iteration": 2.911869764328003 + }, + { + "auxiliary_loss_clip": 0.01125016, + "auxiliary_loss_mlp": 0.01030675, + "balance_loss_clip": 1.0463624, + "balance_loss_mlp": 1.01800859, + "epoch": 0.8997444761761612, + "flos": 24236829573120.0, + "grad_norm": 2.697760895872397, + "language_loss": 0.6748513, + "learning_rate": 1.0445365647252269e-07, + "loss": 0.69640815, + "num_input_tokens_seen": 322781130, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.12658691, + "step": 14965, + "time_per_iteration": 2.56984281539917 + }, + { + "auxiliary_loss_clip": 0.01108804, + "auxiliary_loss_mlp": 0.01031701, + "balance_loss_clip": 1.0363934, + "balance_loss_mlp": 1.01994693, + "epoch": 0.8998045994288291, + "flos": 21361103740800.0, + "grad_norm": 1.8980508716939564, + "language_loss": 0.71622729, + "learning_rate": 1.0432947549477433e-07, + "loss": 0.73763233, + "num_input_tokens_seen": 322800310, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11755371, + "step": 14966, + "time_per_iteration": 2.4518942832946777 + }, + { + "auxiliary_loss_clip": 0.01116088, + "auxiliary_loss_mlp": 0.01031363, + "balance_loss_clip": 1.04173565, + "balance_loss_mlp": 1.01841068, + "epoch": 0.8998647226814971, + "flos": 28986159697920.0, + "grad_norm": 1.6564078168755463, + "language_loss": 0.73928374, + "learning_rate": 1.0420536640104205e-07, + "loss": 0.76075828, + "num_input_tokens_seen": 322820955, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.12963867, + "step": 14967, + "time_per_iteration": 2.5479443073272705 + }, + { + "auxiliary_loss_clip": 0.01107327, + "auxiliary_loss_mlp": 0.010308, + "balance_loss_clip": 1.03529143, + "balance_loss_mlp": 1.01912332, + "epoch": 0.899924845934165, + "flos": 13625909706240.0, + "grad_norm": 2.0720683943661724, + "language_loss": 0.72508234, + "learning_rate": 1.040813291960323e-07, + "loss": 0.74646354, + "num_input_tokens_seen": 322838780, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11676025, + "step": 14968, + "time_per_iteration": 2.511378049850464 + }, + { + "auxiliary_loss_clip": 0.01112324, + "auxiliary_loss_mlp": 0.01034163, + "balance_loss_clip": 1.03993189, + "balance_loss_mlp": 1.02038264, + "epoch": 0.899984969186833, + "flos": 20882629647360.0, + "grad_norm": 2.2563877519220394, + "language_loss": 0.71665615, + "learning_rate": 1.0395736388444864e-07, + "loss": 0.73812103, + "num_input_tokens_seen": 322856710, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.13787842, + "step": 14969, + "time_per_iteration": 2.5132670402526855 + }, + { + "auxiliary_loss_clip": 0.01122711, + "auxiliary_loss_mlp": 0.01026794, + "balance_loss_clip": 1.04818153, + "balance_loss_mlp": 1.01540935, + "epoch": 0.9000450924395009, + "flos": 20921808407040.0, + "grad_norm": 2.109524805847142, + "language_loss": 0.7653017, + "learning_rate": 1.0383347047099201e-07, + "loss": 0.78679675, + "num_input_tokens_seen": 322876070, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11395264, + "step": 14970, + "time_per_iteration": 2.4184672832489014 + }, + { + "auxiliary_loss_clip": 0.01127182, + "auxiliary_loss_mlp": 0.01028595, + "balance_loss_clip": 1.05193901, + "balance_loss_mlp": 1.01787221, + "epoch": 0.900105215692169, + "flos": 17165049782400.0, + "grad_norm": 1.83061867359436, + "language_loss": 0.73326713, + "learning_rate": 1.0370964896035972e-07, + "loss": 0.75482488, + "num_input_tokens_seen": 322895095, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.10717773, + "step": 14971, + "time_per_iteration": 2.4192214012145996 + }, + { + "auxiliary_loss_clip": 0.01110268, + "auxiliary_loss_mlp": 0.01027814, + "balance_loss_clip": 1.03894103, + "balance_loss_mlp": 1.0153147, + "epoch": 0.900165338944837, + "flos": 19931930426880.0, + "grad_norm": 2.4426400463543443, + "language_loss": 0.82292283, + "learning_rate": 1.035858993572476e-07, + "loss": 0.84430367, + "num_input_tokens_seen": 322911845, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.12475586, + "step": 14972, + "time_per_iteration": 2.4033291339874268 + }, + { + "auxiliary_loss_clip": 0.01118384, + "auxiliary_loss_mlp": 0.01032207, + "balance_loss_clip": 1.04256332, + "balance_loss_mlp": 1.0191896, + "epoch": 0.9002254621975049, + "flos": 16107085572480.0, + "grad_norm": 1.907604199643538, + "language_loss": 0.82182342, + "learning_rate": 1.0346222166634855e-07, + "loss": 0.84332931, + "num_input_tokens_seen": 322928170, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.13006592, + "step": 14973, + "time_per_iteration": 2.3910889625549316 + }, + { + "auxiliary_loss_clip": 0.01117988, + "auxiliary_loss_mlp": 0.01041444, + "balance_loss_clip": 1.04321361, + "balance_loss_mlp": 1.02873635, + "epoch": 0.9002855854501729, + "flos": 28476120528000.0, + "grad_norm": 1.8769502986555127, + "language_loss": 0.58340758, + "learning_rate": 1.0333861589235193e-07, + "loss": 0.60500193, + "num_input_tokens_seen": 322948165, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.12701416, + "step": 14974, + "time_per_iteration": 2.4543585777282715 + }, + { + "auxiliary_loss_clip": 0.01116982, + "auxiliary_loss_mlp": 0.01035427, + "balance_loss_clip": 1.04399681, + "balance_loss_mlp": 1.0236963, + "epoch": 0.9003457087028408, + "flos": 25630307746560.0, + "grad_norm": 1.8748693854999403, + "language_loss": 0.63634539, + "learning_rate": 1.0321508203994489e-07, + "loss": 0.65786946, + "num_input_tokens_seen": 322968880, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11730957, + "step": 14975, + "time_per_iteration": 2.4607088565826416 + }, + { + "auxiliary_loss_clip": 0.01122078, + "auxiliary_loss_mlp": 0.01031991, + "balance_loss_clip": 1.04696941, + "balance_loss_mlp": 1.01989698, + "epoch": 0.9004058319555088, + "flos": 24389414547840.0, + "grad_norm": 1.8807796945878044, + "language_loss": 0.7333976, + "learning_rate": 1.0309162011381257e-07, + "loss": 0.75493836, + "num_input_tokens_seen": 322989395, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.12097168, + "step": 14976, + "time_per_iteration": 3.8609418869018555 + }, + { + "auxiliary_loss_clip": 0.01119022, + "auxiliary_loss_mlp": 0.01030112, + "balance_loss_clip": 1.04643917, + "balance_loss_mlp": 1.01901937, + "epoch": 0.9004659552081767, + "flos": 29059345658880.0, + "grad_norm": 1.8723918145544896, + "language_loss": 0.6990636, + "learning_rate": 1.0296823011863565e-07, + "loss": 0.72055495, + "num_input_tokens_seen": 323009060, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11096191, + "step": 14977, + "time_per_iteration": 2.538297176361084 + }, + { + "auxiliary_loss_clip": 0.01110336, + "auxiliary_loss_mlp": 0.01033662, + "balance_loss_clip": 1.03731132, + "balance_loss_mlp": 1.0205189, + "epoch": 0.9005260784608448, + "flos": 16763855800320.0, + "grad_norm": 2.147268885056048, + "language_loss": 0.65683329, + "learning_rate": 1.0284491205909351e-07, + "loss": 0.67827332, + "num_input_tokens_seen": 323027530, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.13140869, + "step": 14978, + "time_per_iteration": 3.949300765991211 + }, + { + "auxiliary_loss_clip": 0.01116722, + "auxiliary_loss_mlp": 0.01034291, + "balance_loss_clip": 1.04018068, + "balance_loss_mlp": 1.02132738, + "epoch": 0.9005862017135127, + "flos": 20376002269440.0, + "grad_norm": 1.7818188713350303, + "language_loss": 0.7922132, + "learning_rate": 1.0272166593986286e-07, + "loss": 0.81372333, + "num_input_tokens_seen": 323045370, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.12963867, + "step": 14979, + "time_per_iteration": 2.4321177005767822 + }, + { + "auxiliary_loss_clip": 0.01034328, + "auxiliary_loss_mlp": 0.01003787, + "balance_loss_clip": 1.00957584, + "balance_loss_mlp": 1.00242543, + "epoch": 0.9006463249661807, + "flos": 67580255796480.0, + "grad_norm": 0.7188713774222524, + "language_loss": 0.5356797, + "learning_rate": 1.0259849176561642e-07, + "loss": 0.55606079, + "num_input_tokens_seen": 323105660, + "router_z_loss_clip": 0.24731445, + "router_z_loss_mlp": 0.0136261, + "step": 14980, + "time_per_iteration": 3.1801609992980957 + }, + { + "auxiliary_loss_clip": 0.01121248, + "auxiliary_loss_mlp": 0.01037845, + "balance_loss_clip": 1.04549229, + "balance_loss_mlp": 1.02520239, + "epoch": 0.9007064482188486, + "flos": 28293335193600.0, + "grad_norm": 1.7556880891588955, + "language_loss": 0.82384521, + "learning_rate": 1.0247538954102553e-07, + "loss": 0.8454361, + "num_input_tokens_seen": 323126365, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.12634277, + "step": 14981, + "time_per_iteration": 2.563481092453003 + }, + { + "auxiliary_loss_clip": 0.01110316, + "auxiliary_loss_mlp": 0.01031173, + "balance_loss_clip": 1.04017138, + "balance_loss_mlp": 1.01982474, + "epoch": 0.9007665714715166, + "flos": 21616320850560.0, + "grad_norm": 1.742799625868724, + "language_loss": 0.81466627, + "learning_rate": 1.0235235927075758e-07, + "loss": 0.83608127, + "num_input_tokens_seen": 323145655, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.11340332, + "step": 14982, + "time_per_iteration": 2.468600034713745 + }, + { + "auxiliary_loss_clip": 0.01113561, + "auxiliary_loss_mlp": 0.0103155, + "balance_loss_clip": 1.04336667, + "balance_loss_mlp": 1.02061868, + "epoch": 0.9008266947241845, + "flos": 26541864120960.0, + "grad_norm": 1.8228320741677644, + "language_loss": 0.72006083, + "learning_rate": 1.0222940095947885e-07, + "loss": 0.74151188, + "num_input_tokens_seen": 323164540, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.109375, + "step": 14983, + "time_per_iteration": 2.4918336868286133 + }, + { + "auxiliary_loss_clip": 0.01110073, + "auxiliary_loss_mlp": 0.01024419, + "balance_loss_clip": 1.04050851, + "balance_loss_mlp": 1.01363063, + "epoch": 0.9008868179768525, + "flos": 23110527738240.0, + "grad_norm": 1.452998985021295, + "language_loss": 0.74963051, + "learning_rate": 1.0210651461185115e-07, + "loss": 0.77097547, + "num_input_tokens_seen": 323186960, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.10784912, + "step": 14984, + "time_per_iteration": 2.453284978866577 + }, + { + "auxiliary_loss_clip": 0.01103927, + "auxiliary_loss_mlp": 0.01029045, + "balance_loss_clip": 1.0354135, + "balance_loss_mlp": 1.01669455, + "epoch": 0.9009469412295206, + "flos": 19060809788160.0, + "grad_norm": 1.5283370500514513, + "language_loss": 0.70413089, + "learning_rate": 1.0198370023253456e-07, + "loss": 0.72546065, + "num_input_tokens_seen": 323206135, + "router_z_loss_clip": 0.68457031, + "router_z_loss_mlp": 0.12347412, + "step": 14985, + "time_per_iteration": 2.4562199115753174 + }, + { + "auxiliary_loss_clip": 0.01111913, + "auxiliary_loss_mlp": 0.01031606, + "balance_loss_clip": 1.03796995, + "balance_loss_mlp": 1.01947057, + "epoch": 0.9010070644821885, + "flos": 23222281927680.0, + "grad_norm": 2.196480847644549, + "language_loss": 0.70865613, + "learning_rate": 1.0186095782618643e-07, + "loss": 0.73009133, + "num_input_tokens_seen": 323225980, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.12127686, + "step": 14986, + "time_per_iteration": 3.8481340408325195 + }, + { + "auxiliary_loss_clip": 0.01114122, + "auxiliary_loss_mlp": 0.01038255, + "balance_loss_clip": 1.03781629, + "balance_loss_mlp": 1.02602422, + "epoch": 0.9010671877348565, + "flos": 17384823146880.0, + "grad_norm": 1.752475714791285, + "language_loss": 0.76851869, + "learning_rate": 1.0173828739746104e-07, + "loss": 0.79004246, + "num_input_tokens_seen": 323243700, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.12225342, + "step": 14987, + "time_per_iteration": 2.4332823753356934 + }, + { + "auxiliary_loss_clip": 0.01113298, + "auxiliary_loss_mlp": 0.01034892, + "balance_loss_clip": 1.04132056, + "balance_loss_mlp": 1.02270293, + "epoch": 0.9011273109875244, + "flos": 21908166854400.0, + "grad_norm": 2.0522323294021496, + "language_loss": 0.73942792, + "learning_rate": 1.0161568895100981e-07, + "loss": 0.7609098, + "num_input_tokens_seen": 323261535, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.12188721, + "step": 14988, + "time_per_iteration": 2.4498274326324463 + }, + { + "auxiliary_loss_clip": 0.01122427, + "auxiliary_loss_mlp": 0.01027442, + "balance_loss_clip": 1.04631615, + "balance_loss_mlp": 1.01448965, + "epoch": 0.9011874342401924, + "flos": 24060831909120.0, + "grad_norm": 1.972240127652511, + "language_loss": 0.6952191, + "learning_rate": 1.0149316249148188e-07, + "loss": 0.71671778, + "num_input_tokens_seen": 323281855, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12969971, + "step": 14989, + "time_per_iteration": 2.4967007637023926 + }, + { + "auxiliary_loss_clip": 0.01114949, + "auxiliary_loss_mlp": 0.01026138, + "balance_loss_clip": 1.0431546, + "balance_loss_mlp": 1.01480746, + "epoch": 0.9012475574928603, + "flos": 16758791982720.0, + "grad_norm": 2.2552084474292617, + "language_loss": 0.80541587, + "learning_rate": 1.0137070802352376e-07, + "loss": 0.82682675, + "num_input_tokens_seen": 323299505, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11328125, + "step": 14990, + "time_per_iteration": 2.39270281791687 + }, + { + "auxiliary_loss_clip": 0.011229, + "auxiliary_loss_mlp": 0.01030617, + "balance_loss_clip": 1.04748011, + "balance_loss_mlp": 1.01838601, + "epoch": 0.9013076807455284, + "flos": 19971109186560.0, + "grad_norm": 3.956687702146012, + "language_loss": 0.77784288, + "learning_rate": 1.0124832555177842e-07, + "loss": 0.79937804, + "num_input_tokens_seen": 323318365, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.12225342, + "step": 14991, + "time_per_iteration": 2.4229393005371094 + }, + { + "auxiliary_loss_clip": 0.01041472, + "auxiliary_loss_mlp": 0.0100628, + "balance_loss_clip": 1.01621652, + "balance_loss_mlp": 1.00481367, + "epoch": 0.9013678039981963, + "flos": 65180274624000.0, + "grad_norm": 0.8008580558138287, + "language_loss": 0.60191822, + "learning_rate": 1.0112601508088726e-07, + "loss": 0.62239569, + "num_input_tokens_seen": 323371835, + "router_z_loss_clip": 0.25244141, + "router_z_loss_mlp": 0.0146637, + "step": 14992, + "time_per_iteration": 2.9971067905426025 + }, + { + "auxiliary_loss_clip": 0.01106162, + "auxiliary_loss_mlp": 0.01025976, + "balance_loss_clip": 1.0351541, + "balance_loss_mlp": 1.01449621, + "epoch": 0.9014279272508643, + "flos": 20521224956160.0, + "grad_norm": 1.9181041875796367, + "language_loss": 0.83241761, + "learning_rate": 1.0100377661548764e-07, + "loss": 0.85373896, + "num_input_tokens_seen": 323388495, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.11480713, + "step": 14993, + "time_per_iteration": 2.440945625305176 + }, + { + "auxiliary_loss_clip": 0.01114004, + "auxiliary_loss_mlp": 0.01030144, + "balance_loss_clip": 1.04122806, + "balance_loss_mlp": 1.01814592, + "epoch": 0.9014880505035322, + "flos": 17309051406720.0, + "grad_norm": 2.089492284862674, + "language_loss": 0.72950011, + "learning_rate": 1.0088161016021502e-07, + "loss": 0.75094151, + "num_input_tokens_seen": 323405280, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11999512, + "step": 14994, + "time_per_iteration": 2.3908941745758057 + }, + { + "auxiliary_loss_clip": 0.01108744, + "auxiliary_loss_mlp": 0.01029462, + "balance_loss_clip": 1.03922415, + "balance_loss_mlp": 1.01907921, + "epoch": 0.9015481737562002, + "flos": 28402862739840.0, + "grad_norm": 1.965982075964998, + "language_loss": 0.6435892, + "learning_rate": 1.0075951571970187e-07, + "loss": 0.66497123, + "num_input_tokens_seen": 323425310, + "router_z_loss_clip": 0.69433594, + "router_z_loss_mlp": 0.10388184, + "step": 14995, + "time_per_iteration": 2.5071933269500732 + }, + { + "auxiliary_loss_clip": 0.01113535, + "auxiliary_loss_mlp": 0.01030602, + "balance_loss_clip": 1.04016924, + "balance_loss_mlp": 1.01759601, + "epoch": 0.9016082970088681, + "flos": 29752672953600.0, + "grad_norm": 2.001059311680989, + "language_loss": 0.66477084, + "learning_rate": 1.0063749329857873e-07, + "loss": 0.68621224, + "num_input_tokens_seen": 323447805, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.13006592, + "step": 14996, + "time_per_iteration": 2.489809989929199 + }, + { + "auxiliary_loss_clip": 0.01121604, + "auxiliary_loss_mlp": 0.010259, + "balance_loss_clip": 1.04988992, + "balance_loss_mlp": 1.0149864, + "epoch": 0.9016684202615362, + "flos": 23513230091520.0, + "grad_norm": 1.8173093837331122, + "language_loss": 0.66012841, + "learning_rate": 1.0051554290147168e-07, + "loss": 0.68160349, + "num_input_tokens_seen": 323467150, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.10913086, + "step": 14997, + "time_per_iteration": 2.4623970985412598 + }, + { + "auxiliary_loss_clip": 0.01111486, + "auxiliary_loss_mlp": 0.01034092, + "balance_loss_clip": 1.0388577, + "balance_loss_mlp": 1.02227879, + "epoch": 0.9017285435142042, + "flos": 16979247705600.0, + "grad_norm": 5.523266435852438, + "language_loss": 0.77679378, + "learning_rate": 1.0039366453300613e-07, + "loss": 0.79824954, + "num_input_tokens_seen": 323484250, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.1182251, + "step": 14998, + "time_per_iteration": 2.3945608139038086 + }, + { + "auxiliary_loss_clip": 0.01113087, + "auxiliary_loss_mlp": 0.01027121, + "balance_loss_clip": 1.04098511, + "balance_loss_mlp": 1.01595068, + "epoch": 0.9017886667668721, + "flos": 21393351175680.0, + "grad_norm": 1.6755848670205697, + "language_loss": 0.75139606, + "learning_rate": 1.0027185819780281e-07, + "loss": 0.77279812, + "num_input_tokens_seen": 323502910, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.11175537, + "step": 14999, + "time_per_iteration": 2.4301297664642334 + }, + { + "auxiliary_loss_clip": 0.01117942, + "auxiliary_loss_mlp": 0.01029295, + "balance_loss_clip": 1.04626632, + "balance_loss_mlp": 1.01659322, + "epoch": 0.9018487900195401, + "flos": 20996574566400.0, + "grad_norm": 2.095812565813333, + "language_loss": 0.76351738, + "learning_rate": 1.0015012390048117e-07, + "loss": 0.78498977, + "num_input_tokens_seen": 323521820, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.12701416, + "step": 15000, + "time_per_iteration": 2.431731700897217 + }, + { + "auxiliary_loss_clip": 0.0110887, + "auxiliary_loss_mlp": 0.01025769, + "balance_loss_clip": 1.03880334, + "balance_loss_mlp": 1.01379418, + "epoch": 0.901908913272208, + "flos": 53358443458560.0, + "grad_norm": 2.084458104926697, + "language_loss": 0.8079828, + "learning_rate": 1.0002846164565704e-07, + "loss": 0.82932913, + "num_input_tokens_seen": 323543200, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.11975098, + "step": 15001, + "time_per_iteration": 2.7358288764953613 + }, + { + "auxiliary_loss_clip": 0.01113978, + "auxiliary_loss_mlp": 0.01026623, + "balance_loss_clip": 1.04242301, + "balance_loss_mlp": 1.01563191, + "epoch": 0.901969036524876, + "flos": 22089838867200.0, + "grad_norm": 1.3450985295135547, + "language_loss": 0.78539556, + "learning_rate": 9.990687143794407e-08, + "loss": 0.80680162, + "num_input_tokens_seen": 323563075, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.10998535, + "step": 15002, + "time_per_iteration": 2.5145740509033203 + }, + { + "auxiliary_loss_clip": 0.0112066, + "auxiliary_loss_mlp": 0.01030019, + "balance_loss_clip": 1.04715037, + "balance_loss_mlp": 1.01759744, + "epoch": 0.9020291597775439, + "flos": 23835025059840.0, + "grad_norm": 2.092594230655747, + "language_loss": 0.68306518, + "learning_rate": 9.978535328195347e-08, + "loss": 0.70457196, + "num_input_tokens_seen": 323579065, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.12414551, + "step": 15003, + "time_per_iteration": 2.4574928283691406 + }, + { + "auxiliary_loss_clip": 0.01119296, + "auxiliary_loss_mlp": 0.01034597, + "balance_loss_clip": 1.0448674, + "balance_loss_mlp": 1.02271819, + "epoch": 0.902089283030212, + "flos": 18326005263360.0, + "grad_norm": 1.799707064602105, + "language_loss": 0.86298382, + "learning_rate": 9.9663907182292e-08, + "loss": 0.88452268, + "num_input_tokens_seen": 323594835, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.11889648, + "step": 15004, + "time_per_iteration": 3.952200174331665 + }, + { + "auxiliary_loss_clip": 0.01114844, + "auxiliary_loss_mlp": 0.01033693, + "balance_loss_clip": 1.04062665, + "balance_loss_mlp": 1.02183783, + "epoch": 0.9021494062828799, + "flos": 24170359455360.0, + "grad_norm": 2.585974669083136, + "language_loss": 0.72568786, + "learning_rate": 9.954253314356575e-08, + "loss": 0.74717319, + "num_input_tokens_seen": 323611475, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.11859131, + "step": 15005, + "time_per_iteration": 2.5360851287841797 + }, + { + "auxiliary_loss_clip": 0.0111537, + "auxiliary_loss_mlp": 0.01031581, + "balance_loss_clip": 1.03767693, + "balance_loss_mlp": 1.01898658, + "epoch": 0.9022095295355479, + "flos": 21616859554560.0, + "grad_norm": 2.019877781740608, + "language_loss": 0.7137562, + "learning_rate": 9.942123117037748e-08, + "loss": 0.73522568, + "num_input_tokens_seen": 323629730, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.12585449, + "step": 15006, + "time_per_iteration": 2.464235305786133 + }, + { + "auxiliary_loss_clip": 0.01124177, + "auxiliary_loss_mlp": 0.0102814, + "balance_loss_clip": 1.04883492, + "balance_loss_mlp": 1.0170356, + "epoch": 0.9022696527882158, + "flos": 18726229578240.0, + "grad_norm": 1.925705520345355, + "language_loss": 0.84213394, + "learning_rate": 9.930000126732618e-08, + "loss": 0.86365712, + "num_input_tokens_seen": 323646000, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11108398, + "step": 15007, + "time_per_iteration": 2.4118967056274414 + }, + { + "auxiliary_loss_clip": 0.01118143, + "auxiliary_loss_mlp": 0.01030723, + "balance_loss_clip": 1.04772115, + "balance_loss_mlp": 1.0190165, + "epoch": 0.9023297760408838, + "flos": 26761206522240.0, + "grad_norm": 1.620404066021739, + "language_loss": 0.7869162, + "learning_rate": 9.917884343900928e-08, + "loss": 0.80840486, + "num_input_tokens_seen": 323667250, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.11706543, + "step": 15008, + "time_per_iteration": 2.529480218887329 + }, + { + "auxiliary_loss_clip": 0.01117768, + "auxiliary_loss_mlp": 0.01031148, + "balance_loss_clip": 1.04828119, + "balance_loss_mlp": 1.01966858, + "epoch": 0.9023898992935517, + "flos": 20522553759360.0, + "grad_norm": 1.816072194918417, + "language_loss": 0.73697615, + "learning_rate": 9.905775769002156e-08, + "loss": 0.75846529, + "num_input_tokens_seen": 323687150, + "router_z_loss_clip": 0.69433594, + "router_z_loss_mlp": 0.11474609, + "step": 15009, + "time_per_iteration": 2.433828115463257 + }, + { + "auxiliary_loss_clip": 0.01120606, + "auxiliary_loss_mlp": 0.01027916, + "balance_loss_clip": 1.04851532, + "balance_loss_mlp": 1.01646626, + "epoch": 0.9024500225462198, + "flos": 17456644391040.0, + "grad_norm": 1.8374736072691364, + "language_loss": 0.73286879, + "learning_rate": 9.893674402495399e-08, + "loss": 0.754354, + "num_input_tokens_seen": 323703660, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11456299, + "step": 15010, + "time_per_iteration": 2.471836805343628 + }, + { + "auxiliary_loss_clip": 0.011155, + "auxiliary_loss_mlp": 0.01028929, + "balance_loss_clip": 1.04333758, + "balance_loss_mlp": 1.01636434, + "epoch": 0.9025101457988878, + "flos": 20813609664000.0, + "grad_norm": 1.9489485862473452, + "language_loss": 0.73934913, + "learning_rate": 9.881580244839538e-08, + "loss": 0.76079339, + "num_input_tokens_seen": 323722060, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.12561035, + "step": 15011, + "time_per_iteration": 2.4369893074035645 + }, + { + "auxiliary_loss_clip": 0.01121426, + "auxiliary_loss_mlp": 0.01030864, + "balance_loss_clip": 1.04630947, + "balance_loss_mlp": 1.01886582, + "epoch": 0.9025702690515557, + "flos": 19026371623680.0, + "grad_norm": 3.16079205348327, + "language_loss": 0.72792256, + "learning_rate": 9.869493296493204e-08, + "loss": 0.74944544, + "num_input_tokens_seen": 323740645, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11999512, + "step": 15012, + "time_per_iteration": 2.5094857215881348 + }, + { + "auxiliary_loss_clip": 0.01114681, + "auxiliary_loss_mlp": 0.0103341, + "balance_loss_clip": 1.0440042, + "balance_loss_mlp": 1.02251399, + "epoch": 0.9026303923042237, + "flos": 19682818629120.0, + "grad_norm": 1.8177813707478294, + "language_loss": 0.69327384, + "learning_rate": 9.857413557914763e-08, + "loss": 0.71475482, + "num_input_tokens_seen": 323758905, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.10900879, + "step": 15013, + "time_per_iteration": 2.432389497756958 + }, + { + "auxiliary_loss_clip": 0.0110705, + "auxiliary_loss_mlp": 0.01028334, + "balance_loss_clip": 1.03870082, + "balance_loss_mlp": 1.01733685, + "epoch": 0.9026905155568916, + "flos": 24608110504320.0, + "grad_norm": 1.8565534984564265, + "language_loss": 0.73075688, + "learning_rate": 9.845341029562249e-08, + "loss": 0.75211072, + "num_input_tokens_seen": 323780595, + "router_z_loss_clip": 0.68310547, + "router_z_loss_mlp": 0.11004639, + "step": 15014, + "time_per_iteration": 2.5184922218322754 + }, + { + "auxiliary_loss_clip": 0.01117217, + "auxiliary_loss_mlp": 0.01028712, + "balance_loss_clip": 1.04512596, + "balance_loss_mlp": 1.0163734, + "epoch": 0.9027506388095596, + "flos": 20521799573760.0, + "grad_norm": 1.863305762436232, + "language_loss": 0.72011787, + "learning_rate": 9.833275711893474e-08, + "loss": 0.74157715, + "num_input_tokens_seen": 323798160, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.12341309, + "step": 15015, + "time_per_iteration": 2.583817481994629 + }, + { + "auxiliary_loss_clip": 0.01113438, + "auxiliary_loss_mlp": 0.01025595, + "balance_loss_clip": 1.04119372, + "balance_loss_mlp": 1.01468146, + "epoch": 0.9028107620622275, + "flos": 22784494965120.0, + "grad_norm": 1.9175073304824506, + "language_loss": 0.69253659, + "learning_rate": 9.821217605365895e-08, + "loss": 0.71392703, + "num_input_tokens_seen": 323816810, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.10919189, + "step": 15016, + "time_per_iteration": 2.5014994144439697 + }, + { + "auxiliary_loss_clip": 0.01111231, + "auxiliary_loss_mlp": 0.01028288, + "balance_loss_clip": 1.04111886, + "balance_loss_mlp": 1.0176481, + "epoch": 0.9028708853148956, + "flos": 25410534382080.0, + "grad_norm": 1.7648194761965623, + "language_loss": 0.70316744, + "learning_rate": 9.809166710436855e-08, + "loss": 0.72456253, + "num_input_tokens_seen": 323836900, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.10644531, + "step": 15017, + "time_per_iteration": 2.477121114730835 + }, + { + "auxiliary_loss_clip": 0.01111658, + "auxiliary_loss_mlp": 0.01033412, + "balance_loss_clip": 1.04041362, + "balance_loss_mlp": 1.02203345, + "epoch": 0.9029310085675635, + "flos": 21871322478720.0, + "grad_norm": 1.7100740564371562, + "language_loss": 0.6983968, + "learning_rate": 9.797123027563237e-08, + "loss": 0.71984756, + "num_input_tokens_seen": 323855325, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.11383057, + "step": 15018, + "time_per_iteration": 3.899169921875 + }, + { + "auxiliary_loss_clip": 0.01117931, + "auxiliary_loss_mlp": 0.01036444, + "balance_loss_clip": 1.04172587, + "balance_loss_mlp": 1.02385569, + "epoch": 0.9029911318202315, + "flos": 26214394803840.0, + "grad_norm": 1.9212315133868636, + "language_loss": 0.68969655, + "learning_rate": 9.785086557201782e-08, + "loss": 0.71124029, + "num_input_tokens_seen": 323875650, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.12591553, + "step": 15019, + "time_per_iteration": 2.4494469165802 + }, + { + "auxiliary_loss_clip": 0.01115915, + "auxiliary_loss_mlp": 0.01028031, + "balance_loss_clip": 1.04492474, + "balance_loss_mlp": 1.01683772, + "epoch": 0.9030512550728994, + "flos": 15961360095360.0, + "grad_norm": 1.6998966858526143, + "language_loss": 0.71929497, + "learning_rate": 9.773057299808951e-08, + "loss": 0.74073446, + "num_input_tokens_seen": 323892920, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.11199951, + "step": 15020, + "time_per_iteration": 2.4071242809295654 + }, + { + "auxiliary_loss_clip": 0.01106397, + "auxiliary_loss_mlp": 0.0103073, + "balance_loss_clip": 1.03434622, + "balance_loss_mlp": 1.01883316, + "epoch": 0.9031113783255674, + "flos": 23987610034560.0, + "grad_norm": 1.5290652566262122, + "language_loss": 0.74374759, + "learning_rate": 9.7610352558408e-08, + "loss": 0.76511884, + "num_input_tokens_seen": 323913835, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11895752, + "step": 15021, + "time_per_iteration": 2.467862129211426 + }, + { + "auxiliary_loss_clip": 0.0112083, + "auxiliary_loss_mlp": 0.01031322, + "balance_loss_clip": 1.04688025, + "balance_loss_mlp": 1.01875103, + "epoch": 0.9031715015782353, + "flos": 22237216369920.0, + "grad_norm": 2.146335496103789, + "language_loss": 0.72528732, + "learning_rate": 9.749020425753251e-08, + "loss": 0.74680889, + "num_input_tokens_seen": 323933440, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.12567139, + "step": 15022, + "time_per_iteration": 2.5003819465637207 + }, + { + "auxiliary_loss_clip": 0.01101344, + "auxiliary_loss_mlp": 0.0102508, + "balance_loss_clip": 1.03503287, + "balance_loss_mlp": 1.01409483, + "epoch": 0.9032316248309034, + "flos": 26323168164480.0, + "grad_norm": 1.8672318669474914, + "language_loss": 0.72557032, + "learning_rate": 9.737012810001943e-08, + "loss": 0.74683458, + "num_input_tokens_seen": 323954090, + "router_z_loss_clip": 0.66308594, + "router_z_loss_mlp": 0.10986328, + "step": 15023, + "time_per_iteration": 4.033831357955933 + }, + { + "auxiliary_loss_clip": 0.0110921, + "auxiliary_loss_mlp": 0.01034553, + "balance_loss_clip": 1.03811359, + "balance_loss_mlp": 1.02169645, + "epoch": 0.9032917480835713, + "flos": 22636686499200.0, + "grad_norm": 1.665165471837928, + "language_loss": 0.82927895, + "learning_rate": 9.725012409042155e-08, + "loss": 0.85071653, + "num_input_tokens_seen": 323974040, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.128479, + "step": 15024, + "time_per_iteration": 2.4697320461273193 + }, + { + "auxiliary_loss_clip": 0.0111119, + "auxiliary_loss_mlp": 0.01029511, + "balance_loss_clip": 1.03850591, + "balance_loss_mlp": 1.01736927, + "epoch": 0.9033518713362393, + "flos": 23878764846720.0, + "grad_norm": 1.5218898932432736, + "language_loss": 0.70060533, + "learning_rate": 9.713019223328966e-08, + "loss": 0.72201228, + "num_input_tokens_seen": 323996125, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.12145996, + "step": 15025, + "time_per_iteration": 2.492131233215332 + }, + { + "auxiliary_loss_clip": 0.0111424, + "auxiliary_loss_mlp": 0.01034509, + "balance_loss_clip": 1.0441134, + "balance_loss_mlp": 1.02160478, + "epoch": 0.9034119945889073, + "flos": 26905279973760.0, + "grad_norm": 1.6954850641267731, + "language_loss": 0.76805317, + "learning_rate": 9.70103325331717e-08, + "loss": 0.78954065, + "num_input_tokens_seen": 324017645, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.12896729, + "step": 15026, + "time_per_iteration": 2.469550609588623 + }, + { + "auxiliary_loss_clip": 0.01115174, + "auxiliary_loss_mlp": 0.01032889, + "balance_loss_clip": 1.04202008, + "balance_loss_mlp": 1.02109313, + "epoch": 0.9034721178415752, + "flos": 20850166730880.0, + "grad_norm": 1.673212093641502, + "language_loss": 0.68392277, + "learning_rate": 9.68905449946129e-08, + "loss": 0.70540345, + "num_input_tokens_seen": 324036875, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11810303, + "step": 15027, + "time_per_iteration": 2.462904930114746 + }, + { + "auxiliary_loss_clip": 0.0111057, + "auxiliary_loss_mlp": 0.01025716, + "balance_loss_clip": 1.04180658, + "balance_loss_mlp": 1.01493955, + "epoch": 0.9035322410942432, + "flos": 22234307368320.0, + "grad_norm": 2.302912704690658, + "language_loss": 0.75868583, + "learning_rate": 9.677082962215477e-08, + "loss": 0.78004873, + "num_input_tokens_seen": 324057045, + "router_z_loss_clip": 0.68798828, + "router_z_loss_mlp": 0.10766602, + "step": 15028, + "time_per_iteration": 2.4510536193847656 + }, + { + "auxiliary_loss_clip": 0.01119569, + "auxiliary_loss_mlp": 0.01036661, + "balance_loss_clip": 1.04376125, + "balance_loss_mlp": 1.02454925, + "epoch": 0.9035923643469111, + "flos": 25923410726400.0, + "grad_norm": 1.7764043440697963, + "language_loss": 0.69154263, + "learning_rate": 9.665118642033765e-08, + "loss": 0.71310496, + "num_input_tokens_seen": 324079735, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.12109375, + "step": 15029, + "time_per_iteration": 3.899757146835327 + }, + { + "auxiliary_loss_clip": 0.01117971, + "auxiliary_loss_mlp": 0.01028038, + "balance_loss_clip": 1.04379427, + "balance_loss_mlp": 1.01571822, + "epoch": 0.9036524875995792, + "flos": 20339804338560.0, + "grad_norm": 2.1976054927298674, + "language_loss": 0.73920846, + "learning_rate": 9.653161539369858e-08, + "loss": 0.76066852, + "num_input_tokens_seen": 324097785, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12329102, + "step": 15030, + "time_per_iteration": 2.5019919872283936 + }, + { + "auxiliary_loss_clip": 0.01118174, + "auxiliary_loss_mlp": 0.01028868, + "balance_loss_clip": 1.04357183, + "balance_loss_mlp": 1.01690578, + "epoch": 0.9037126108522471, + "flos": 40114624677120.0, + "grad_norm": 2.560071910784893, + "language_loss": 0.68364334, + "learning_rate": 9.641211654677151e-08, + "loss": 0.70511371, + "num_input_tokens_seen": 324121625, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11968994, + "step": 15031, + "time_per_iteration": 2.5886306762695312 + }, + { + "auxiliary_loss_clip": 0.0111191, + "auxiliary_loss_mlp": 0.01021277, + "balance_loss_clip": 1.04206824, + "balance_loss_mlp": 1.01066148, + "epoch": 0.9037727341049151, + "flos": 23332024955520.0, + "grad_norm": 1.8171657455208523, + "language_loss": 0.76311707, + "learning_rate": 9.629268988408723e-08, + "loss": 0.78444898, + "num_input_tokens_seen": 324142535, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.10614014, + "step": 15032, + "time_per_iteration": 2.4537768363952637 + }, + { + "auxiliary_loss_clip": 0.01125591, + "auxiliary_loss_mlp": 0.01029533, + "balance_loss_clip": 1.0493803, + "balance_loss_mlp": 1.01745152, + "epoch": 0.903832857357583, + "flos": 12822659815680.0, + "grad_norm": 2.0739984114140455, + "language_loss": 0.75434303, + "learning_rate": 9.617333541017502e-08, + "loss": 0.77589428, + "num_input_tokens_seen": 324159610, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.12097168, + "step": 15033, + "time_per_iteration": 2.4113893508911133 + }, + { + "auxiliary_loss_clip": 0.01117549, + "auxiliary_loss_mlp": 0.01028532, + "balance_loss_clip": 1.04458511, + "balance_loss_mlp": 1.0168674, + "epoch": 0.903892980610251, + "flos": 25703026830720.0, + "grad_norm": 1.9403416497170434, + "language_loss": 0.73821831, + "learning_rate": 9.605405312956105e-08, + "loss": 0.75967914, + "num_input_tokens_seen": 324182510, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11663818, + "step": 15034, + "time_per_iteration": 2.499415636062622 + }, + { + "auxiliary_loss_clip": 0.01121388, + "auxiliary_loss_mlp": 0.01034973, + "balance_loss_clip": 1.04947019, + "balance_loss_mlp": 1.02332616, + "epoch": 0.9039531038629189, + "flos": 14684089397760.0, + "grad_norm": 1.5812553291819147, + "language_loss": 0.63400483, + "learning_rate": 9.593484304676791e-08, + "loss": 0.65556836, + "num_input_tokens_seen": 324200555, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11651611, + "step": 15035, + "time_per_iteration": 2.497457265853882 + }, + { + "auxiliary_loss_clip": 0.01113669, + "auxiliary_loss_mlp": 0.01029526, + "balance_loss_clip": 1.04187334, + "balance_loss_mlp": 1.01710391, + "epoch": 0.904013227115587, + "flos": 24024921287040.0, + "grad_norm": 2.583260396447395, + "language_loss": 0.62114239, + "learning_rate": 9.581570516631643e-08, + "loss": 0.64257431, + "num_input_tokens_seen": 324220255, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.12420654, + "step": 15036, + "time_per_iteration": 2.498786211013794 + }, + { + "auxiliary_loss_clip": 0.01115373, + "auxiliary_loss_mlp": 0.01023616, + "balance_loss_clip": 1.04709375, + "balance_loss_mlp": 1.01251185, + "epoch": 0.9040733503682549, + "flos": 22856459863680.0, + "grad_norm": 1.6521524281669226, + "language_loss": 0.82192254, + "learning_rate": 9.569663949272455e-08, + "loss": 0.8433125, + "num_input_tokens_seen": 324237855, + "router_z_loss_clip": 0.68261719, + "router_z_loss_mlp": 0.11102295, + "step": 15037, + "time_per_iteration": 2.450993061065674 + }, + { + "auxiliary_loss_clip": 0.01116085, + "auxiliary_loss_mlp": 0.0103075, + "balance_loss_clip": 1.04287291, + "balance_loss_mlp": 1.01898372, + "epoch": 0.9041334736209229, + "flos": 19974951941760.0, + "grad_norm": 4.5246226161804675, + "language_loss": 0.67680514, + "learning_rate": 9.557764603050667e-08, + "loss": 0.69827354, + "num_input_tokens_seen": 324257050, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11749268, + "step": 15038, + "time_per_iteration": 2.418229341506958 + }, + { + "auxiliary_loss_clip": 0.01108706, + "auxiliary_loss_mlp": 0.01032673, + "balance_loss_clip": 1.03642654, + "balance_loss_mlp": 1.02076387, + "epoch": 0.9041935968735909, + "flos": 17530548624000.0, + "grad_norm": 2.9451144395933926, + "language_loss": 0.75087118, + "learning_rate": 9.545872478417494e-08, + "loss": 0.77228498, + "num_input_tokens_seen": 324275510, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11907959, + "step": 15039, + "time_per_iteration": 2.4411845207214355 + }, + { + "auxiliary_loss_clip": 0.01119788, + "auxiliary_loss_mlp": 0.01027281, + "balance_loss_clip": 1.04930508, + "balance_loss_mlp": 1.01597381, + "epoch": 0.9042537201262588, + "flos": 22780149419520.0, + "grad_norm": 1.5734150822614614, + "language_loss": 0.70247579, + "learning_rate": 9.533987575823977e-08, + "loss": 0.72394645, + "num_input_tokens_seen": 324295150, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.11309814, + "step": 15040, + "time_per_iteration": 2.4341583251953125 + }, + { + "auxiliary_loss_clip": 0.01114051, + "auxiliary_loss_mlp": 0.01033553, + "balance_loss_clip": 1.04274106, + "balance_loss_mlp": 1.02081537, + "epoch": 0.9043138433789268, + "flos": 20595416497920.0, + "grad_norm": 1.6802703185509773, + "language_loss": 0.67989266, + "learning_rate": 9.522109895720709e-08, + "loss": 0.70136863, + "num_input_tokens_seen": 324313855, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.12744141, + "step": 15041, + "time_per_iteration": 2.432154417037964 + }, + { + "auxiliary_loss_clip": 0.01110572, + "auxiliary_loss_mlp": 0.0102769, + "balance_loss_clip": 1.03965521, + "balance_loss_mlp": 1.01555431, + "epoch": 0.9043739666315948, + "flos": 32962978995840.0, + "grad_norm": 2.309148745932902, + "language_loss": 0.57020259, + "learning_rate": 9.510239438558155e-08, + "loss": 0.59158516, + "num_input_tokens_seen": 324338465, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.12127686, + "step": 15042, + "time_per_iteration": 2.546816110610962 + }, + { + "auxiliary_loss_clip": 0.01036851, + "auxiliary_loss_mlp": 0.01000362, + "balance_loss_clip": 1.01193094, + "balance_loss_mlp": 0.99898398, + "epoch": 0.9044340898842628, + "flos": 67296418525440.0, + "grad_norm": 0.7981648304991628, + "language_loss": 0.56921017, + "learning_rate": 9.498376204786351e-08, + "loss": 0.58958232, + "num_input_tokens_seen": 324398740, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.01377869, + "step": 15043, + "time_per_iteration": 3.0305113792419434 + }, + { + "auxiliary_loss_clip": 0.01115582, + "auxiliary_loss_mlp": 0.01027846, + "balance_loss_clip": 1.04271877, + "balance_loss_mlp": 1.01497769, + "epoch": 0.9044942131369307, + "flos": 17713154390400.0, + "grad_norm": 1.7283883564661564, + "language_loss": 0.70371068, + "learning_rate": 9.486520194855274e-08, + "loss": 0.72514498, + "num_input_tokens_seen": 324417335, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.12866211, + "step": 15044, + "time_per_iteration": 2.420956611633301 + }, + { + "auxiliary_loss_clip": 0.01117365, + "auxiliary_loss_mlp": 0.01036007, + "balance_loss_clip": 1.04462004, + "balance_loss_mlp": 1.02279234, + "epoch": 0.9045543363895987, + "flos": 17820563034240.0, + "grad_norm": 2.2189830366758025, + "language_loss": 0.70293862, + "learning_rate": 9.474671409214407e-08, + "loss": 0.72447234, + "num_input_tokens_seen": 324433240, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.13214111, + "step": 15045, + "time_per_iteration": 2.4210243225097656 + }, + { + "auxiliary_loss_clip": 0.01120282, + "auxiliary_loss_mlp": 0.01034028, + "balance_loss_clip": 1.04590714, + "balance_loss_mlp": 1.02097499, + "epoch": 0.9046144596422666, + "flos": 21872723109120.0, + "grad_norm": 1.9673654700495053, + "language_loss": 0.65867734, + "learning_rate": 9.462829848313081e-08, + "loss": 0.68022037, + "num_input_tokens_seen": 324452675, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.13061523, + "step": 15046, + "time_per_iteration": 2.4512789249420166 + }, + { + "auxiliary_loss_clip": 0.01115141, + "auxiliary_loss_mlp": 0.0103529, + "balance_loss_clip": 1.04005837, + "balance_loss_mlp": 1.02195632, + "epoch": 0.9046745828949346, + "flos": 17672646827520.0, + "grad_norm": 1.9821548817580252, + "language_loss": 0.62393337, + "learning_rate": 9.450995512600379e-08, + "loss": 0.64543766, + "num_input_tokens_seen": 324467865, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.13342285, + "step": 15047, + "time_per_iteration": 3.941960334777832 + }, + { + "auxiliary_loss_clip": 0.01116932, + "auxiliary_loss_mlp": 0.01025401, + "balance_loss_clip": 1.04428267, + "balance_loss_mlp": 1.01415968, + "epoch": 0.9047347061476025, + "flos": 25702559953920.0, + "grad_norm": 1.7548499893541512, + "language_loss": 0.71662915, + "learning_rate": 9.439168402525032e-08, + "loss": 0.73805255, + "num_input_tokens_seen": 324490430, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11230469, + "step": 15048, + "time_per_iteration": 2.515282154083252 + }, + { + "auxiliary_loss_clip": 0.01112033, + "auxiliary_loss_mlp": 0.01031051, + "balance_loss_clip": 1.03901911, + "balance_loss_mlp": 1.01836109, + "epoch": 0.9047948294002706, + "flos": 15158146118400.0, + "grad_norm": 3.2604907264087517, + "language_loss": 0.74896836, + "learning_rate": 9.427348518535483e-08, + "loss": 0.77039921, + "num_input_tokens_seen": 324506620, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.12701416, + "step": 15049, + "time_per_iteration": 2.4169180393218994 + }, + { + "auxiliary_loss_clip": 0.01118783, + "auxiliary_loss_mlp": 0.01033238, + "balance_loss_clip": 1.04740214, + "balance_loss_mlp": 1.02118587, + "epoch": 0.9048549526529385, + "flos": 21872292145920.0, + "grad_norm": 1.7353483819621638, + "language_loss": 0.75888735, + "learning_rate": 9.415535861079993e-08, + "loss": 0.78040755, + "num_input_tokens_seen": 324525505, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.1204834, + "step": 15050, + "time_per_iteration": 2.4894626140594482 + }, + { + "auxiliary_loss_clip": 0.01111896, + "auxiliary_loss_mlp": 0.01031411, + "balance_loss_clip": 1.03990781, + "balance_loss_mlp": 1.02046704, + "epoch": 0.9049150759056065, + "flos": 23546626761600.0, + "grad_norm": 1.664859456545867, + "language_loss": 0.82089639, + "learning_rate": 9.403730430606472e-08, + "loss": 0.8423295, + "num_input_tokens_seen": 324544415, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.10925293, + "step": 15051, + "time_per_iteration": 2.4473485946655273 + }, + { + "auxiliary_loss_clip": 0.01116312, + "auxiliary_loss_mlp": 0.01029328, + "balance_loss_clip": 1.04330087, + "balance_loss_mlp": 1.0181762, + "epoch": 0.9049751991582745, + "flos": 19645902426240.0, + "grad_norm": 2.236100691614751, + "language_loss": 0.89310306, + "learning_rate": 9.391932227562582e-08, + "loss": 0.91455948, + "num_input_tokens_seen": 324562555, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11151123, + "step": 15052, + "time_per_iteration": 2.6237151622772217 + }, + { + "auxiliary_loss_clip": 0.01123674, + "auxiliary_loss_mlp": 0.01029135, + "balance_loss_clip": 1.04838371, + "balance_loss_mlp": 1.01779866, + "epoch": 0.9050353224109424, + "flos": 15596220389760.0, + "grad_norm": 1.8872643638857616, + "language_loss": 0.76857686, + "learning_rate": 9.380141252395724e-08, + "loss": 0.79010499, + "num_input_tokens_seen": 324580865, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.11340332, + "step": 15053, + "time_per_iteration": 2.443002462387085 + }, + { + "auxiliary_loss_clip": 0.01109166, + "auxiliary_loss_mlp": 0.01030993, + "balance_loss_clip": 1.03872097, + "balance_loss_mlp": 1.01932251, + "epoch": 0.9050954456636104, + "flos": 28183592165760.0, + "grad_norm": 1.6228806529785922, + "language_loss": 0.72995961, + "learning_rate": 9.368357505553049e-08, + "loss": 0.75136119, + "num_input_tokens_seen": 324600665, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.11663818, + "step": 15054, + "time_per_iteration": 2.50154972076416 + }, + { + "auxiliary_loss_clip": 0.01114236, + "auxiliary_loss_mlp": 0.01031106, + "balance_loss_clip": 1.04171395, + "balance_loss_mlp": 1.01942945, + "epoch": 0.9051555689162784, + "flos": 25731611078400.0, + "grad_norm": 1.7238464181007795, + "language_loss": 0.83274108, + "learning_rate": 9.356580987481333e-08, + "loss": 0.85419446, + "num_input_tokens_seen": 324618145, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11682129, + "step": 15055, + "time_per_iteration": 2.5538344383239746 + }, + { + "auxiliary_loss_clip": 0.01111578, + "auxiliary_loss_mlp": 0.01031983, + "balance_loss_clip": 1.04196084, + "balance_loss_mlp": 1.02082539, + "epoch": 0.9052156921689464, + "flos": 23257258796160.0, + "grad_norm": 2.1051890719846926, + "language_loss": 0.85079855, + "learning_rate": 9.344811698627176e-08, + "loss": 0.87223411, + "num_input_tokens_seen": 324638165, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.1116333, + "step": 15056, + "time_per_iteration": 2.476931095123291 + }, + { + "auxiliary_loss_clip": 0.01112644, + "auxiliary_loss_mlp": 0.01032301, + "balance_loss_clip": 1.04108131, + "balance_loss_mlp": 1.02096438, + "epoch": 0.9052758154216143, + "flos": 29564285097600.0, + "grad_norm": 1.9668560411276756, + "language_loss": 0.72115183, + "learning_rate": 9.333049639436863e-08, + "loss": 0.74260128, + "num_input_tokens_seen": 324658560, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11328125, + "step": 15057, + "time_per_iteration": 2.508385419845581 + }, + { + "auxiliary_loss_clip": 0.01112561, + "auxiliary_loss_mlp": 0.01026668, + "balance_loss_clip": 1.04099035, + "balance_loss_mlp": 1.01536655, + "epoch": 0.9053359386742823, + "flos": 22127688823680.0, + "grad_norm": 1.918799210405426, + "language_loss": 0.81142354, + "learning_rate": 9.321294810356418e-08, + "loss": 0.83281583, + "num_input_tokens_seen": 324679185, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11297607, + "step": 15058, + "time_per_iteration": 2.4971461296081543 + }, + { + "auxiliary_loss_clip": 0.01039283, + "auxiliary_loss_mlp": 0.01001855, + "balance_loss_clip": 1.01474047, + "balance_loss_mlp": 1.00053, + "epoch": 0.9053960619269502, + "flos": 67090112760960.0, + "grad_norm": 0.6743568582076992, + "language_loss": 0.5138486, + "learning_rate": 9.309547211831592e-08, + "loss": 0.53425992, + "num_input_tokens_seen": 324744830, + "router_z_loss_clip": 0.24536133, + "router_z_loss_mlp": 0.01324463, + "step": 15059, + "time_per_iteration": 3.201606273651123 + }, + { + "auxiliary_loss_clip": 0.01111223, + "auxiliary_loss_mlp": 0.01030562, + "balance_loss_clip": 1.04073548, + "balance_loss_mlp": 1.01785386, + "epoch": 0.9054561851796182, + "flos": 15815419136640.0, + "grad_norm": 1.950756073882903, + "language_loss": 0.67509902, + "learning_rate": 9.297806844307831e-08, + "loss": 0.69651687, + "num_input_tokens_seen": 324762905, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.1270752, + "step": 15060, + "time_per_iteration": 2.4363958835601807 + }, + { + "auxiliary_loss_clip": 0.01120748, + "auxiliary_loss_mlp": 0.01029955, + "balance_loss_clip": 1.0462817, + "balance_loss_mlp": 1.01785493, + "epoch": 0.9055163084322861, + "flos": 17566997950080.0, + "grad_norm": 2.407551126621336, + "language_loss": 0.64015299, + "learning_rate": 9.286073708230357e-08, + "loss": 0.66166008, + "num_input_tokens_seen": 324781905, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.12091064, + "step": 15061, + "time_per_iteration": 2.4317209720611572 + }, + { + "auxiliary_loss_clip": 0.0111828, + "auxiliary_loss_mlp": 0.01035275, + "balance_loss_clip": 1.04567671, + "balance_loss_mlp": 1.02331281, + "epoch": 0.9055764316849542, + "flos": 17639573379840.0, + "grad_norm": 1.7799998516784339, + "language_loss": 0.71932912, + "learning_rate": 9.274347804044058e-08, + "loss": 0.74086475, + "num_input_tokens_seen": 324799260, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11975098, + "step": 15062, + "time_per_iteration": 3.8933184146881104 + }, + { + "auxiliary_loss_clip": 0.01107872, + "auxiliary_loss_mlp": 0.01030713, + "balance_loss_clip": 1.03627849, + "balance_loss_mlp": 1.01948357, + "epoch": 0.9056365549376221, + "flos": 20120856986880.0, + "grad_norm": 1.6774730122533146, + "language_loss": 0.711465, + "learning_rate": 9.2626291321936e-08, + "loss": 0.73285091, + "num_input_tokens_seen": 324817800, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11230469, + "step": 15063, + "time_per_iteration": 2.4682605266571045 + }, + { + "auxiliary_loss_clip": 0.01108339, + "auxiliary_loss_mlp": 0.01029621, + "balance_loss_clip": 1.03914309, + "balance_loss_mlp": 1.01721144, + "epoch": 0.9056966781902901, + "flos": 27598786836480.0, + "grad_norm": 1.735953807781871, + "language_loss": 0.72365546, + "learning_rate": 9.250917693123406e-08, + "loss": 0.74503505, + "num_input_tokens_seen": 324838445, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 0.12408447, + "step": 15064, + "time_per_iteration": 2.5552923679351807 + }, + { + "auxiliary_loss_clip": 0.01111256, + "auxiliary_loss_mlp": 0.01034495, + "balance_loss_clip": 1.03711569, + "balance_loss_mlp": 1.02172136, + "epoch": 0.9057568014429581, + "flos": 25920106675200.0, + "grad_norm": 2.925915572004128, + "language_loss": 0.69831711, + "learning_rate": 9.23921348727752e-08, + "loss": 0.7197746, + "num_input_tokens_seen": 324859895, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.12774658, + "step": 15065, + "time_per_iteration": 2.5590498447418213 + }, + { + "auxiliary_loss_clip": 0.01117889, + "auxiliary_loss_mlp": 0.01033419, + "balance_loss_clip": 1.04562283, + "balance_loss_mlp": 1.02253485, + "epoch": 0.905816924695626, + "flos": 22930364096640.0, + "grad_norm": 1.6236454729740946, + "language_loss": 0.62909776, + "learning_rate": 9.227516515099743e-08, + "loss": 0.6506108, + "num_input_tokens_seen": 324879580, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.10876465, + "step": 15066, + "time_per_iteration": 3.8568308353424072 + }, + { + "auxiliary_loss_clip": 0.01117572, + "auxiliary_loss_mlp": 0.0103673, + "balance_loss_clip": 1.03882194, + "balance_loss_mlp": 1.02254426, + "epoch": 0.905877047948294, + "flos": 22157422306560.0, + "grad_norm": 2.071336708891335, + "language_loss": 0.80283856, + "learning_rate": 9.215826777033675e-08, + "loss": 0.82438159, + "num_input_tokens_seen": 324898950, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.14172363, + "step": 15067, + "time_per_iteration": 2.680433988571167 + }, + { + "auxiliary_loss_clip": 0.01120227, + "auxiliary_loss_mlp": 0.01030026, + "balance_loss_clip": 1.04515767, + "balance_loss_mlp": 1.01767612, + "epoch": 0.905937171200962, + "flos": 15304805349120.0, + "grad_norm": 1.6376827991065213, + "language_loss": 0.69921464, + "learning_rate": 9.204144273522563e-08, + "loss": 0.72071719, + "num_input_tokens_seen": 324917455, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.12365723, + "step": 15068, + "time_per_iteration": 2.3911292552948 + }, + { + "auxiliary_loss_clip": 0.0111271, + "auxiliary_loss_mlp": 0.01030915, + "balance_loss_clip": 1.04115283, + "balance_loss_mlp": 1.01923823, + "epoch": 0.90599729445363, + "flos": 19462973437440.0, + "grad_norm": 1.9262298136233862, + "language_loss": 0.85223937, + "learning_rate": 9.19246900500943e-08, + "loss": 0.87367558, + "num_input_tokens_seen": 324934495, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11682129, + "step": 15069, + "time_per_iteration": 2.431766986846924 + }, + { + "auxiliary_loss_clip": 0.01121167, + "auxiliary_loss_mlp": 0.01032383, + "balance_loss_clip": 1.04342556, + "balance_loss_mlp": 1.01959157, + "epoch": 0.9060574177062979, + "flos": 23732967542400.0, + "grad_norm": 2.1949083017835824, + "language_loss": 0.59585428, + "learning_rate": 9.180800971936987e-08, + "loss": 0.6173898, + "num_input_tokens_seen": 324953230, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.12786865, + "step": 15070, + "time_per_iteration": 2.4906790256500244 + }, + { + "auxiliary_loss_clip": 0.01114252, + "auxiliary_loss_mlp": 0.01024601, + "balance_loss_clip": 1.04041481, + "balance_loss_mlp": 1.01222134, + "epoch": 0.9061175409589659, + "flos": 17311134395520.0, + "grad_norm": 2.2248555989853562, + "language_loss": 0.81768727, + "learning_rate": 9.169140174747724e-08, + "loss": 0.8390758, + "num_input_tokens_seen": 324969880, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.12384033, + "step": 15071, + "time_per_iteration": 2.436887741088867 + }, + { + "auxiliary_loss_clip": 0.01125931, + "auxiliary_loss_mlp": 0.010404, + "balance_loss_clip": 1.0462954, + "balance_loss_mlp": 1.02770448, + "epoch": 0.9061776642116338, + "flos": 17778439359360.0, + "grad_norm": 2.0068217668516044, + "language_loss": 0.61684042, + "learning_rate": 9.157486613883758e-08, + "loss": 0.63850373, + "num_input_tokens_seen": 324987005, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.1270752, + "step": 15072, + "time_per_iteration": 2.3916826248168945 + }, + { + "auxiliary_loss_clip": 0.01114729, + "auxiliary_loss_mlp": 0.01028189, + "balance_loss_clip": 1.0423646, + "balance_loss_mlp": 1.01690555, + "epoch": 0.9062377874643018, + "flos": 42777688037760.0, + "grad_norm": 2.06525555430541, + "language_loss": 0.72805452, + "learning_rate": 9.145840289787021e-08, + "loss": 0.7494837, + "num_input_tokens_seen": 325010700, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.112854, + "step": 15073, + "time_per_iteration": 4.048861265182495 + }, + { + "auxiliary_loss_clip": 0.01113551, + "auxiliary_loss_mlp": 0.01027015, + "balance_loss_clip": 1.04385817, + "balance_loss_mlp": 1.01607132, + "epoch": 0.9062979107169697, + "flos": 16361620323840.0, + "grad_norm": 1.943184020843994, + "language_loss": 0.80672145, + "learning_rate": 9.134201202899161e-08, + "loss": 0.82812715, + "num_input_tokens_seen": 325028760, + "router_z_loss_clip": 0.69677734, + "router_z_loss_mlp": 0.10949707, + "step": 15074, + "time_per_iteration": 2.4566893577575684 + }, + { + "auxiliary_loss_clip": 0.01038549, + "auxiliary_loss_mlp": 0.01001032, + "balance_loss_clip": 1.01349998, + "balance_loss_mlp": 0.99976051, + "epoch": 0.9063580339696378, + "flos": 69313988528640.0, + "grad_norm": 0.7563828036012418, + "language_loss": 0.52356851, + "learning_rate": 9.122569353661513e-08, + "loss": 0.54396433, + "num_input_tokens_seen": 325093545, + "router_z_loss_clip": 0.25073242, + "router_z_loss_mlp": 0.01271057, + "step": 15075, + "time_per_iteration": 3.126239538192749 + }, + { + "auxiliary_loss_clip": 0.01045755, + "auxiliary_loss_mlp": 0.01004322, + "balance_loss_clip": 1.01998234, + "balance_loss_mlp": 1.00291073, + "epoch": 0.9064181572223057, + "flos": 58794747148800.0, + "grad_norm": 0.7407457648639221, + "language_loss": 0.62055016, + "learning_rate": 9.11094474251517e-08, + "loss": 0.64105093, + "num_input_tokens_seen": 325152295, + "router_z_loss_clip": 0.25830078, + "router_z_loss_mlp": 0.01409912, + "step": 15076, + "time_per_iteration": 3.0306737422943115 + }, + { + "auxiliary_loss_clip": 0.01114813, + "auxiliary_loss_mlp": 0.01034615, + "balance_loss_clip": 1.04409468, + "balance_loss_mlp": 1.02296817, + "epoch": 0.9064782804749737, + "flos": 21762692772480.0, + "grad_norm": 2.1047941113758424, + "language_loss": 0.82369334, + "learning_rate": 9.09932736990091e-08, + "loss": 0.8451876, + "num_input_tokens_seen": 325169705, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.11657715, + "step": 15077, + "time_per_iteration": 2.4617526531219482 + }, + { + "auxiliary_loss_clip": 0.01107501, + "auxiliary_loss_mlp": 0.01022749, + "balance_loss_clip": 1.03845239, + "balance_loss_mlp": 1.01177549, + "epoch": 0.9065384037276417, + "flos": 21397373498880.0, + "grad_norm": 1.6845075071360445, + "language_loss": 0.83986115, + "learning_rate": 9.08771723625934e-08, + "loss": 0.86116374, + "num_input_tokens_seen": 325189175, + "router_z_loss_clip": 0.68994141, + "router_z_loss_mlp": 0.10961914, + "step": 15078, + "time_per_iteration": 2.4926352500915527 + }, + { + "auxiliary_loss_clip": 0.01116042, + "auxiliary_loss_mlp": 0.01029588, + "balance_loss_clip": 1.04809737, + "balance_loss_mlp": 1.0185492, + "epoch": 0.9065985269803096, + "flos": 38283646849920.0, + "grad_norm": 1.7284427985590447, + "language_loss": 0.65383577, + "learning_rate": 9.076114342030617e-08, + "loss": 0.67529213, + "num_input_tokens_seen": 325211020, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 0.11047363, + "step": 15079, + "time_per_iteration": 2.7030954360961914 + }, + { + "auxiliary_loss_clip": 0.01110959, + "auxiliary_loss_mlp": 0.01028388, + "balance_loss_clip": 1.03961468, + "balance_loss_mlp": 1.01594901, + "epoch": 0.9066586502329776, + "flos": 44818562989440.0, + "grad_norm": 1.6054424057387222, + "language_loss": 0.70969689, + "learning_rate": 9.064518687654765e-08, + "loss": 0.73109037, + "num_input_tokens_seen": 325236970, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.12438965, + "step": 15080, + "time_per_iteration": 2.666595697402954 + }, + { + "auxiliary_loss_clip": 0.01116925, + "auxiliary_loss_mlp": 0.01031253, + "balance_loss_clip": 1.04158306, + "balance_loss_mlp": 1.0185926, + "epoch": 0.9067187734856456, + "flos": 18623992492800.0, + "grad_norm": 2.2211909100987897, + "language_loss": 0.71234822, + "learning_rate": 9.052930273571547e-08, + "loss": 0.73383003, + "num_input_tokens_seen": 325252670, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.12670898, + "step": 15081, + "time_per_iteration": 2.404245376586914 + }, + { + "auxiliary_loss_clip": 0.01112119, + "auxiliary_loss_mlp": 0.01031738, + "balance_loss_clip": 1.04043877, + "balance_loss_mlp": 1.01902461, + "epoch": 0.9067788967383136, + "flos": 22747578762240.0, + "grad_norm": 1.8487240250486792, + "language_loss": 0.74430686, + "learning_rate": 9.04134910022032e-08, + "loss": 0.7657454, + "num_input_tokens_seen": 325273860, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.12731934, + "step": 15082, + "time_per_iteration": 2.4981095790863037 + }, + { + "auxiliary_loss_clip": 0.01122134, + "auxiliary_loss_mlp": 0.01031085, + "balance_loss_clip": 1.05133414, + "balance_loss_mlp": 1.01975429, + "epoch": 0.9068390199909815, + "flos": 27670787648640.0, + "grad_norm": 1.6653324544183226, + "language_loss": 0.78166234, + "learning_rate": 9.029775168040266e-08, + "loss": 0.80319452, + "num_input_tokens_seen": 325294140, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.11334229, + "step": 15083, + "time_per_iteration": 2.5066211223602295 + }, + { + "auxiliary_loss_clip": 0.01105684, + "auxiliary_loss_mlp": 0.01042121, + "balance_loss_clip": 1.0375464, + "balance_loss_mlp": 1.02924013, + "epoch": 0.9068991432436495, + "flos": 24244012293120.0, + "grad_norm": 1.5946247311913326, + "language_loss": 0.68889558, + "learning_rate": 9.01820847747028e-08, + "loss": 0.71037364, + "num_input_tokens_seen": 325313130, + "router_z_loss_clip": 0.68115234, + "router_z_loss_mlp": 0.12884521, + "step": 15084, + "time_per_iteration": 2.477647304534912 + }, + { + "auxiliary_loss_clip": 0.01117998, + "auxiliary_loss_mlp": 0.01028656, + "balance_loss_clip": 1.04478085, + "balance_loss_mlp": 1.01685476, + "epoch": 0.9069592664963174, + "flos": 28033305661440.0, + "grad_norm": 1.6085824415269356, + "language_loss": 0.66782099, + "learning_rate": 9.006649028948965e-08, + "loss": 0.68928754, + "num_input_tokens_seen": 325334880, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11804199, + "step": 15085, + "time_per_iteration": 2.4810400009155273 + }, + { + "auxiliary_loss_clip": 0.01036904, + "auxiliary_loss_mlp": 0.0100728, + "balance_loss_clip": 1.0112915, + "balance_loss_mlp": 1.00590563, + "epoch": 0.9070193897489854, + "flos": 68778414789120.0, + "grad_norm": 0.7637328867138174, + "language_loss": 0.61287236, + "learning_rate": 8.995096822914638e-08, + "loss": 0.63331425, + "num_input_tokens_seen": 325394175, + "router_z_loss_clip": 0.25537109, + "router_z_loss_mlp": 0.01374817, + "step": 15086, + "time_per_iteration": 3.058926582336426 + }, + { + "auxiliary_loss_clip": 0.01119138, + "auxiliary_loss_mlp": 0.01034516, + "balance_loss_clip": 1.04746222, + "balance_loss_mlp": 1.02181458, + "epoch": 0.9070795130016533, + "flos": 23441624328960.0, + "grad_norm": 1.3977372059059998, + "language_loss": 0.719657, + "learning_rate": 8.983551859805416e-08, + "loss": 0.74119353, + "num_input_tokens_seen": 325415020, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.1272583, + "step": 15087, + "time_per_iteration": 2.4710967540740967 + }, + { + "auxiliary_loss_clip": 0.01116085, + "auxiliary_loss_mlp": 0.0102889, + "balance_loss_clip": 1.04502761, + "balance_loss_mlp": 1.01777947, + "epoch": 0.9071396362543214, + "flos": 18916413114240.0, + "grad_norm": 4.5520279475512, + "language_loss": 0.76984578, + "learning_rate": 8.972014140059058e-08, + "loss": 0.79129553, + "num_input_tokens_seen": 325433595, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11114502, + "step": 15088, + "time_per_iteration": 2.43957257270813 + }, + { + "auxiliary_loss_clip": 0.01114077, + "auxiliary_loss_mlp": 0.01027782, + "balance_loss_clip": 1.04247928, + "balance_loss_mlp": 1.01660633, + "epoch": 0.9071997595069893, + "flos": 25228646887680.0, + "grad_norm": 1.8725102909324498, + "language_loss": 0.73195893, + "learning_rate": 8.960483664113038e-08, + "loss": 0.75337756, + "num_input_tokens_seen": 325451605, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11175537, + "step": 15089, + "time_per_iteration": 2.516335964202881 + }, + { + "auxiliary_loss_clip": 0.01118286, + "auxiliary_loss_mlp": 0.01033251, + "balance_loss_clip": 1.04917693, + "balance_loss_mlp": 1.02235579, + "epoch": 0.9072598827596573, + "flos": 24346608514560.0, + "grad_norm": 1.8127051250827455, + "language_loss": 0.75632334, + "learning_rate": 8.948960432404628e-08, + "loss": 0.77783871, + "num_input_tokens_seen": 325470645, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 0.10894775, + "step": 15090, + "time_per_iteration": 3.9007697105407715 + }, + { + "auxiliary_loss_clip": 0.01112271, + "auxiliary_loss_mlp": 0.01026826, + "balance_loss_clip": 1.03913152, + "balance_loss_mlp": 1.01454735, + "epoch": 0.9073200060123253, + "flos": 22674967418880.0, + "grad_norm": 2.6702247373286534, + "language_loss": 0.77868474, + "learning_rate": 8.93744444537079e-08, + "loss": 0.80007571, + "num_input_tokens_seen": 325488070, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.1229248, + "step": 15091, + "time_per_iteration": 2.5404410362243652 + }, + { + "auxiliary_loss_clip": 0.01106234, + "auxiliary_loss_mlp": 0.01029385, + "balance_loss_clip": 1.03876972, + "balance_loss_mlp": 1.01930022, + "epoch": 0.9073801292649932, + "flos": 23695476721920.0, + "grad_norm": 1.5499704952416538, + "language_loss": 0.86179745, + "learning_rate": 8.925935703448217e-08, + "loss": 0.88315362, + "num_input_tokens_seen": 325509285, + "router_z_loss_clip": 0.67431641, + "router_z_loss_mlp": 0.10089111, + "step": 15092, + "time_per_iteration": 2.4833059310913086 + }, + { + "auxiliary_loss_clip": 0.01116754, + "auxiliary_loss_mlp": 0.01031158, + "balance_loss_clip": 1.04556131, + "balance_loss_mlp": 1.01969576, + "epoch": 0.9074402525176612, + "flos": 25375413859200.0, + "grad_norm": 1.796579145586846, + "language_loss": 0.78842533, + "learning_rate": 8.914434207073296e-08, + "loss": 0.80990446, + "num_input_tokens_seen": 325529360, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.11468506, + "step": 15093, + "time_per_iteration": 2.6194674968719482 + }, + { + "auxiliary_loss_clip": 0.01036463, + "auxiliary_loss_mlp": 0.01003872, + "balance_loss_clip": 1.01167953, + "balance_loss_mlp": 1.00244617, + "epoch": 0.9075003757703292, + "flos": 67649024384640.0, + "grad_norm": 0.7377433365618172, + "language_loss": 0.57005703, + "learning_rate": 8.902939956682188e-08, + "loss": 0.59046036, + "num_input_tokens_seen": 325583565, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.01425171, + "step": 15094, + "time_per_iteration": 2.9826629161834717 + }, + { + "auxiliary_loss_clip": 0.01113845, + "auxiliary_loss_mlp": 0.01036161, + "balance_loss_clip": 1.04022503, + "balance_loss_mlp": 1.02232051, + "epoch": 0.9075604990229972, + "flos": 22453649769600.0, + "grad_norm": 2.334107920281882, + "language_loss": 0.71489507, + "learning_rate": 8.891452952710742e-08, + "loss": 0.73639512, + "num_input_tokens_seen": 325603690, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.13842773, + "step": 15095, + "time_per_iteration": 2.455381155014038 + }, + { + "auxiliary_loss_clip": 0.01115259, + "auxiliary_loss_mlp": 0.01032573, + "balance_loss_clip": 1.04286003, + "balance_loss_mlp": 1.02075291, + "epoch": 0.9076206222756651, + "flos": 19536662188800.0, + "grad_norm": 1.6786444858688143, + "language_loss": 0.73887157, + "learning_rate": 8.879973195594526e-08, + "loss": 0.76034987, + "num_input_tokens_seen": 325622255, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11828613, + "step": 15096, + "time_per_iteration": 2.3961362838745117 + }, + { + "auxiliary_loss_clip": 0.01120077, + "auxiliary_loss_mlp": 0.0103621, + "balance_loss_clip": 1.04447007, + "balance_loss_mlp": 1.02293611, + "epoch": 0.9076807455283331, + "flos": 30116914819200.0, + "grad_norm": 2.073695080910223, + "language_loss": 0.57284659, + "learning_rate": 8.868500685768898e-08, + "loss": 0.59440947, + "num_input_tokens_seen": 325640165, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.13269043, + "step": 15097, + "time_per_iteration": 2.4910061359405518 + }, + { + "auxiliary_loss_clip": 0.0110755, + "auxiliary_loss_mlp": 0.01023788, + "balance_loss_clip": 1.0381124, + "balance_loss_mlp": 1.0132854, + "epoch": 0.907740868781001, + "flos": 18697537589760.0, + "grad_norm": 1.6594288124856849, + "language_loss": 0.79662192, + "learning_rate": 8.857035423668935e-08, + "loss": 0.81793523, + "num_input_tokens_seen": 325659455, + "router_z_loss_clip": 0.69482422, + "router_z_loss_mlp": 0.1050415, + "step": 15098, + "time_per_iteration": 2.411912202835083 + }, + { + "auxiliary_loss_clip": 0.01117327, + "auxiliary_loss_mlp": 0.01031482, + "balance_loss_clip": 1.04255748, + "balance_loss_mlp": 1.01950765, + "epoch": 0.907800992033669, + "flos": 22638805401600.0, + "grad_norm": 1.8542796299490631, + "language_loss": 0.66419911, + "learning_rate": 8.845577409729266e-08, + "loss": 0.68568724, + "num_input_tokens_seen": 325678095, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.11968994, + "step": 15099, + "time_per_iteration": 2.4468817710876465 + }, + { + "auxiliary_loss_clip": 0.01118793, + "auxiliary_loss_mlp": 0.01032929, + "balance_loss_clip": 1.04332471, + "balance_loss_mlp": 1.02050781, + "epoch": 0.907861115286337, + "flos": 21287666384640.0, + "grad_norm": 2.668671261517341, + "language_loss": 0.70659328, + "learning_rate": 8.834126644384477e-08, + "loss": 0.72811049, + "num_input_tokens_seen": 325695825, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.12420654, + "step": 15100, + "time_per_iteration": 2.418830633163452 + }, + { + "auxiliary_loss_clip": 0.01044479, + "auxiliary_loss_mlp": 0.01005481, + "balance_loss_clip": 1.01903629, + "balance_loss_mlp": 1.00408316, + "epoch": 0.907921238539005, + "flos": 69739493040000.0, + "grad_norm": 0.7147119341894802, + "language_loss": 0.53424227, + "learning_rate": 8.822683128068775e-08, + "loss": 0.55474186, + "num_input_tokens_seen": 325764515, + "router_z_loss_clip": 0.25439453, + "router_z_loss_mlp": 0.01397705, + "step": 15101, + "time_per_iteration": 3.163748025894165 + }, + { + "auxiliary_loss_clip": 0.01120789, + "auxiliary_loss_mlp": 0.01028113, + "balance_loss_clip": 1.04943907, + "balance_loss_mlp": 1.01662087, + "epoch": 0.9079813617916729, + "flos": 23477391296640.0, + "grad_norm": 1.7337952930381137, + "language_loss": 0.68370992, + "learning_rate": 8.811246861216081e-08, + "loss": 0.70519894, + "num_input_tokens_seen": 325783235, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.1149292, + "step": 15102, + "time_per_iteration": 2.467648506164551 + }, + { + "auxiliary_loss_clip": 0.01104937, + "auxiliary_loss_mlp": 0.01025849, + "balance_loss_clip": 1.0358057, + "balance_loss_mlp": 1.01480436, + "epoch": 0.9080414850443409, + "flos": 22929933133440.0, + "grad_norm": 1.8826016134650672, + "language_loss": 0.78990424, + "learning_rate": 8.799817844260049e-08, + "loss": 0.81121212, + "num_input_tokens_seen": 325800195, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 0.11047363, + "step": 15103, + "time_per_iteration": 2.4961771965026855 + }, + { + "auxiliary_loss_clip": 0.01116212, + "auxiliary_loss_mlp": 0.01026552, + "balance_loss_clip": 1.04319906, + "balance_loss_mlp": 1.01503611, + "epoch": 0.9081016082970089, + "flos": 26177083551360.0, + "grad_norm": 1.7351823598481833, + "language_loss": 0.71741927, + "learning_rate": 8.78839607763413e-08, + "loss": 0.7388469, + "num_input_tokens_seen": 325820215, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11517334, + "step": 15104, + "time_per_iteration": 2.5671730041503906 + }, + { + "auxiliary_loss_clip": 0.01116723, + "auxiliary_loss_mlp": 0.01023879, + "balance_loss_clip": 1.04645622, + "balance_loss_mlp": 1.01302505, + "epoch": 0.9081617315496768, + "flos": 24462169545600.0, + "grad_norm": 1.7942590230983477, + "language_loss": 0.77543133, + "learning_rate": 8.77698156177138e-08, + "loss": 0.79683733, + "num_input_tokens_seen": 325838415, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.10858154, + "step": 15105, + "time_per_iteration": 2.5191807746887207 + }, + { + "auxiliary_loss_clip": 0.01112407, + "auxiliary_loss_mlp": 0.01036869, + "balance_loss_clip": 1.03991091, + "balance_loss_mlp": 1.02539539, + "epoch": 0.9082218548023449, + "flos": 24746868743040.0, + "grad_norm": 1.866919278466545, + "language_loss": 0.73858118, + "learning_rate": 8.765574297104628e-08, + "loss": 0.7600739, + "num_input_tokens_seen": 325855580, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11474609, + "step": 15106, + "time_per_iteration": 3.926839828491211 + }, + { + "auxiliary_loss_clip": 0.01111841, + "auxiliary_loss_mlp": 0.01033644, + "balance_loss_clip": 1.03885078, + "balance_loss_mlp": 1.02168715, + "epoch": 0.9082819780550128, + "flos": 24421302846720.0, + "grad_norm": 1.7336675515422626, + "language_loss": 0.80300885, + "learning_rate": 8.754174284066462e-08, + "loss": 0.82446373, + "num_input_tokens_seen": 325874890, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11956787, + "step": 15107, + "time_per_iteration": 2.47347354888916 + }, + { + "auxiliary_loss_clip": 0.01032451, + "auxiliary_loss_mlp": 0.01002655, + "balance_loss_clip": 1.00827026, + "balance_loss_mlp": 1.00136042, + "epoch": 0.9083421013076808, + "flos": 59609704872960.0, + "grad_norm": 0.8108860491496028, + "language_loss": 0.59689498, + "learning_rate": 8.742781523089205e-08, + "loss": 0.61724609, + "num_input_tokens_seen": 325935835, + "router_z_loss_clip": 0.24194336, + "router_z_loss_mlp": 0.01293945, + "step": 15108, + "time_per_iteration": 3.075308084487915 + }, + { + "auxiliary_loss_clip": 0.01114664, + "auxiliary_loss_mlp": 0.01026493, + "balance_loss_clip": 1.04265761, + "balance_loss_mlp": 1.01365972, + "epoch": 0.9084022245603487, + "flos": 33620216100480.0, + "grad_norm": 1.9916423260028473, + "language_loss": 0.74010611, + "learning_rate": 8.73139601460482e-08, + "loss": 0.7615177, + "num_input_tokens_seen": 325958035, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.1282959, + "step": 15109, + "time_per_iteration": 2.555539608001709 + }, + { + "auxiliary_loss_clip": 0.01110742, + "auxiliary_loss_mlp": 0.01026899, + "balance_loss_clip": 1.04005551, + "balance_loss_mlp": 1.01606321, + "epoch": 0.9084623478130167, + "flos": 24971705925120.0, + "grad_norm": 1.7076206220380026, + "language_loss": 0.71804184, + "learning_rate": 8.720017759045073e-08, + "loss": 0.73941827, + "num_input_tokens_seen": 325979870, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.10827637, + "step": 15110, + "time_per_iteration": 3.8614866733551025 + }, + { + "auxiliary_loss_clip": 0.0110774, + "auxiliary_loss_mlp": 0.01025288, + "balance_loss_clip": 1.03765178, + "balance_loss_mlp": 1.01422536, + "epoch": 0.9085224710656846, + "flos": 31461804869760.0, + "grad_norm": 2.180655955338284, + "language_loss": 0.69068933, + "learning_rate": 8.708646756841421e-08, + "loss": 0.71201956, + "num_input_tokens_seen": 325998245, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.11065674, + "step": 15111, + "time_per_iteration": 2.5202529430389404 + }, + { + "auxiliary_loss_clip": 0.01036613, + "auxiliary_loss_mlp": 0.01002096, + "balance_loss_clip": 1.01169634, + "balance_loss_mlp": 1.00068891, + "epoch": 0.9085825943183526, + "flos": 64917012867840.0, + "grad_norm": 0.689602957688728, + "language_loss": 0.51664913, + "learning_rate": 8.697283008425026e-08, + "loss": 0.53703618, + "num_input_tokens_seen": 326061770, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.01405334, + "step": 15112, + "time_per_iteration": 3.1194911003112793 + }, + { + "auxiliary_loss_clip": 0.01108776, + "auxiliary_loss_mlp": 0.01031101, + "balance_loss_clip": 1.0374248, + "balance_loss_mlp": 1.01941824, + "epoch": 0.9086427175710206, + "flos": 18953221576320.0, + "grad_norm": 1.960818051860486, + "language_loss": 0.6997155, + "learning_rate": 8.685926514226837e-08, + "loss": 0.72111428, + "num_input_tokens_seen": 326080945, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.11676025, + "step": 15113, + "time_per_iteration": 2.4429824352264404 + }, + { + "auxiliary_loss_clip": 0.01113874, + "auxiliary_loss_mlp": 0.01029276, + "balance_loss_clip": 1.04156637, + "balance_loss_mlp": 1.01797509, + "epoch": 0.9087028408236886, + "flos": 34014873807360.0, + "grad_norm": 2.3341604295177842, + "language_loss": 0.79087299, + "learning_rate": 8.674577274677508e-08, + "loss": 0.8123045, + "num_input_tokens_seen": 326100630, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11297607, + "step": 15114, + "time_per_iteration": 2.551816701889038 + }, + { + "auxiliary_loss_clip": 0.01117839, + "auxiliary_loss_mlp": 0.0102863, + "balance_loss_clip": 1.04421091, + "balance_loss_mlp": 1.01574326, + "epoch": 0.9087629640763565, + "flos": 21944580266880.0, + "grad_norm": 1.9306556645146242, + "language_loss": 0.70078093, + "learning_rate": 8.663235290207405e-08, + "loss": 0.72224557, + "num_input_tokens_seen": 326120145, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.12878418, + "step": 15115, + "time_per_iteration": 2.5816872119903564 + }, + { + "auxiliary_loss_clip": 0.01124357, + "auxiliary_loss_mlp": 0.01033191, + "balance_loss_clip": 1.04870641, + "balance_loss_mlp": 1.02071619, + "epoch": 0.9088230873290245, + "flos": 21762908254080.0, + "grad_norm": 1.5531936263524517, + "language_loss": 0.65728033, + "learning_rate": 8.651900561246561e-08, + "loss": 0.67885584, + "num_input_tokens_seen": 326140715, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12475586, + "step": 15116, + "time_per_iteration": 2.492241382598877 + }, + { + "auxiliary_loss_clip": 0.01113191, + "auxiliary_loss_mlp": 0.01031002, + "balance_loss_clip": 1.04194725, + "balance_loss_mlp": 1.01878285, + "epoch": 0.9088832105816925, + "flos": 21541267382400.0, + "grad_norm": 1.591763419638081, + "language_loss": 0.69730473, + "learning_rate": 8.640573088224812e-08, + "loss": 0.71874666, + "num_input_tokens_seen": 326159130, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.12213135, + "step": 15117, + "time_per_iteration": 3.8583884239196777 + }, + { + "auxiliary_loss_clip": 0.01108514, + "auxiliary_loss_mlp": 0.01026719, + "balance_loss_clip": 1.03748584, + "balance_loss_mlp": 1.01439309, + "epoch": 0.9089433338343604, + "flos": 25996704428160.0, + "grad_norm": 1.4626545352989093, + "language_loss": 0.74572802, + "learning_rate": 8.629252871571745e-08, + "loss": 0.76708031, + "num_input_tokens_seen": 326181375, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.12335205, + "step": 15118, + "time_per_iteration": 2.5705525875091553 + }, + { + "auxiliary_loss_clip": 0.01114456, + "auxiliary_loss_mlp": 0.01035416, + "balance_loss_clip": 1.03757238, + "balance_loss_mlp": 1.02095008, + "epoch": 0.9090034570870285, + "flos": 21178426147200.0, + "grad_norm": 5.632344958861396, + "language_loss": 0.73392117, + "learning_rate": 8.617939911716554e-08, + "loss": 0.75541985, + "num_input_tokens_seen": 326199740, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.14471436, + "step": 15119, + "time_per_iteration": 2.4379351139068604 + }, + { + "auxiliary_loss_clip": 0.01122601, + "auxiliary_loss_mlp": 0.01032118, + "balance_loss_clip": 1.0466466, + "balance_loss_mlp": 1.01873088, + "epoch": 0.9090635803396964, + "flos": 16141811045760.0, + "grad_norm": 2.6234162720982197, + "language_loss": 0.71260226, + "learning_rate": 8.60663420908827e-08, + "loss": 0.7341494, + "num_input_tokens_seen": 326214350, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.13378906, + "step": 15120, + "time_per_iteration": 2.511507987976074 + }, + { + "auxiliary_loss_clip": 0.01117027, + "auxiliary_loss_mlp": 0.01022901, + "balance_loss_clip": 1.0437901, + "balance_loss_mlp": 1.01119471, + "epoch": 0.9091237035923644, + "flos": 20591537829120.0, + "grad_norm": 2.0269265216465993, + "language_loss": 0.65729892, + "learning_rate": 8.595335764115596e-08, + "loss": 0.67869812, + "num_input_tokens_seen": 326234580, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11712646, + "step": 15121, + "time_per_iteration": 2.4792957305908203 + }, + { + "auxiliary_loss_clip": 0.01110783, + "auxiliary_loss_mlp": 0.01032433, + "balance_loss_clip": 1.04017007, + "balance_loss_mlp": 1.02075064, + "epoch": 0.9091838268450323, + "flos": 52227760164480.0, + "grad_norm": 1.6335531585956826, + "language_loss": 0.70005643, + "learning_rate": 8.58404457722699e-08, + "loss": 0.72148854, + "num_input_tokens_seen": 326259080, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.11688232, + "step": 15122, + "time_per_iteration": 2.7331418991088867 + }, + { + "auxiliary_loss_clip": 0.01119695, + "auxiliary_loss_mlp": 0.01027584, + "balance_loss_clip": 1.04790103, + "balance_loss_mlp": 1.01614046, + "epoch": 0.9092439500977003, + "flos": 20559613616640.0, + "grad_norm": 1.974493983299242, + "language_loss": 0.7457841, + "learning_rate": 8.572760648850575e-08, + "loss": 0.76725686, + "num_input_tokens_seen": 326280175, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11444092, + "step": 15123, + "time_per_iteration": 2.4794256687164307 + }, + { + "auxiliary_loss_clip": 0.01117531, + "auxiliary_loss_mlp": 0.01026443, + "balance_loss_clip": 1.04713893, + "balance_loss_mlp": 1.01567245, + "epoch": 0.9093040733503682, + "flos": 28617859595520.0, + "grad_norm": 1.9850418009914295, + "language_loss": 0.75930154, + "learning_rate": 8.561483979414253e-08, + "loss": 0.78074133, + "num_input_tokens_seen": 326297990, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.10778809, + "step": 15124, + "time_per_iteration": 2.523716688156128 + }, + { + "auxiliary_loss_clip": 0.01111978, + "auxiliary_loss_mlp": 0.01029791, + "balance_loss_clip": 1.04140878, + "balance_loss_mlp": 1.01795411, + "epoch": 0.9093641966030362, + "flos": 23440187784960.0, + "grad_norm": 2.075592530439762, + "language_loss": 0.72071999, + "learning_rate": 8.55021456934566e-08, + "loss": 0.74213773, + "num_input_tokens_seen": 326316735, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.11846924, + "step": 15125, + "time_per_iteration": 2.42867112159729 + }, + { + "auxiliary_loss_clip": 0.01122987, + "auxiliary_loss_mlp": 0.01039563, + "balance_loss_clip": 1.05275548, + "balance_loss_mlp": 1.02786267, + "epoch": 0.9094243198557042, + "flos": 16800197385600.0, + "grad_norm": 1.8489479049808828, + "language_loss": 0.79645497, + "learning_rate": 8.538952419072143e-08, + "loss": 0.81808043, + "num_input_tokens_seen": 326334370, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.11706543, + "step": 15126, + "time_per_iteration": 2.4217686653137207 + }, + { + "auxiliary_loss_clip": 0.01110714, + "auxiliary_loss_mlp": 0.01032926, + "balance_loss_clip": 1.04125738, + "balance_loss_mlp": 1.02041471, + "epoch": 0.9094844431083722, + "flos": 24273278899200.0, + "grad_norm": 1.6875279265517968, + "language_loss": 0.75530404, + "learning_rate": 8.527697529020694e-08, + "loss": 0.77674043, + "num_input_tokens_seen": 326353435, + "router_z_loss_clip": 0.69482422, + "router_z_loss_mlp": 0.12512207, + "step": 15127, + "time_per_iteration": 2.5255353450775146 + }, + { + "auxiliary_loss_clip": 0.01112789, + "auxiliary_loss_mlp": 0.01030033, + "balance_loss_clip": 1.04073501, + "balance_loss_mlp": 1.0177958, + "epoch": 0.9095445663610401, + "flos": 21944652094080.0, + "grad_norm": 1.8529489197240276, + "language_loss": 0.62830377, + "learning_rate": 8.516449899618173e-08, + "loss": 0.64973199, + "num_input_tokens_seen": 326371810, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.12243652, + "step": 15128, + "time_per_iteration": 2.558521270751953 + }, + { + "auxiliary_loss_clip": 0.01118342, + "auxiliary_loss_mlp": 0.01026437, + "balance_loss_clip": 1.04613435, + "balance_loss_mlp": 1.01502275, + "epoch": 0.9096046896137081, + "flos": 19792848965760.0, + "grad_norm": 1.594055419576254, + "language_loss": 0.76483506, + "learning_rate": 8.505209531291013e-08, + "loss": 0.7862829, + "num_input_tokens_seen": 326391380, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11413574, + "step": 15129, + "time_per_iteration": 2.456857442855835 + }, + { + "auxiliary_loss_clip": 0.01113691, + "auxiliary_loss_mlp": 0.01034058, + "balance_loss_clip": 1.04111385, + "balance_loss_mlp": 1.0207659, + "epoch": 0.909664812866376, + "flos": 22638087129600.0, + "grad_norm": 6.964141826787434, + "language_loss": 0.83376712, + "learning_rate": 8.49397642446552e-08, + "loss": 0.85524458, + "num_input_tokens_seen": 326408800, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.13287354, + "step": 15130, + "time_per_iteration": 2.4317078590393066 + }, + { + "auxiliary_loss_clip": 0.0111352, + "auxiliary_loss_mlp": 0.01042675, + "balance_loss_clip": 1.04033315, + "balance_loss_mlp": 1.02808976, + "epoch": 0.909724936119044, + "flos": 39852153020160.0, + "grad_norm": 2.0561923022752913, + "language_loss": 0.75257397, + "learning_rate": 8.482750579567644e-08, + "loss": 0.77413589, + "num_input_tokens_seen": 326431565, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.14581299, + "step": 15131, + "time_per_iteration": 2.576565742492676 + }, + { + "auxiliary_loss_clip": 0.01112928, + "auxiliary_loss_mlp": 0.0103785, + "balance_loss_clip": 1.04073656, + "balance_loss_mlp": 1.02409327, + "epoch": 0.9097850593717121, + "flos": 35071616954880.0, + "grad_norm": 2.155224829492751, + "language_loss": 0.5989123, + "learning_rate": 8.471531997023085e-08, + "loss": 0.6204201, + "num_input_tokens_seen": 326451715, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.13763428, + "step": 15132, + "time_per_iteration": 2.5067105293273926 + }, + { + "auxiliary_loss_clip": 0.01124097, + "auxiliary_loss_mlp": 0.01030833, + "balance_loss_clip": 1.05238104, + "balance_loss_mlp": 1.01943088, + "epoch": 0.90984518262438, + "flos": 23367468700800.0, + "grad_norm": 1.3852582878106272, + "language_loss": 0.82624221, + "learning_rate": 8.460320677257193e-08, + "loss": 0.84779155, + "num_input_tokens_seen": 326470855, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11395264, + "step": 15133, + "time_per_iteration": 2.4627034664154053 + }, + { + "auxiliary_loss_clip": 0.01114538, + "auxiliary_loss_mlp": 0.01035707, + "balance_loss_clip": 1.0399878, + "balance_loss_mlp": 1.0228796, + "epoch": 0.909905305877048, + "flos": 27523302405120.0, + "grad_norm": 1.6002142951044858, + "language_loss": 0.74071288, + "learning_rate": 8.449116620695118e-08, + "loss": 0.76221526, + "num_input_tokens_seen": 326490480, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.12835693, + "step": 15134, + "time_per_iteration": 3.8711047172546387 + }, + { + "auxiliary_loss_clip": 0.01122754, + "auxiliary_loss_mlp": 0.01033506, + "balance_loss_clip": 1.04391181, + "balance_loss_mlp": 1.02100718, + "epoch": 0.9099654291297159, + "flos": 24347865490560.0, + "grad_norm": 2.5056547817099584, + "language_loss": 0.7260083, + "learning_rate": 8.437919827761786e-08, + "loss": 0.74757093, + "num_input_tokens_seen": 326509445, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.12506104, + "step": 15135, + "time_per_iteration": 2.434628963470459 + }, + { + "auxiliary_loss_clip": 0.01114223, + "auxiliary_loss_mlp": 0.01029431, + "balance_loss_clip": 1.0433991, + "balance_loss_mlp": 1.01815939, + "epoch": 0.9100255523823839, + "flos": 21215234609280.0, + "grad_norm": 2.212184074776379, + "language_loss": 0.7009697, + "learning_rate": 8.426730298881702e-08, + "loss": 0.72240627, + "num_input_tokens_seen": 326528380, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.1126709, + "step": 15136, + "time_per_iteration": 2.542428970336914 + }, + { + "auxiliary_loss_clip": 0.01069577, + "auxiliary_loss_mlp": 0.01008302, + "balance_loss_clip": 1.04552627, + "balance_loss_mlp": 1.00681508, + "epoch": 0.9100856756350518, + "flos": 46052276446080.0, + "grad_norm": 0.819590355345096, + "language_loss": 0.59308374, + "learning_rate": 8.415548034479214e-08, + "loss": 0.61386251, + "num_input_tokens_seen": 326576940, + "router_z_loss_clip": 0.24047852, + "router_z_loss_mlp": 0.01487732, + "step": 15137, + "time_per_iteration": 2.848628282546997 + }, + { + "auxiliary_loss_clip": 0.01120726, + "auxiliary_loss_mlp": 0.01036089, + "balance_loss_clip": 1.0460062, + "balance_loss_mlp": 1.02435899, + "epoch": 0.9101457988877198, + "flos": 20229917656320.0, + "grad_norm": 2.5153845220976736, + "language_loss": 0.82668841, + "learning_rate": 8.40437303497834e-08, + "loss": 0.84825647, + "num_input_tokens_seen": 326596100, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11737061, + "step": 15138, + "time_per_iteration": 2.501462459564209 + }, + { + "auxiliary_loss_clip": 0.01110115, + "auxiliary_loss_mlp": 0.01023165, + "balance_loss_clip": 1.04231083, + "balance_loss_mlp": 1.0125792, + "epoch": 0.9102059221403878, + "flos": 26615157822720.0, + "grad_norm": 1.499140100353093, + "language_loss": 0.81332099, + "learning_rate": 8.39320530080283e-08, + "loss": 0.83465374, + "num_input_tokens_seen": 326615700, + "router_z_loss_clip": 0.67773438, + "router_z_loss_mlp": 0.10595703, + "step": 15139, + "time_per_iteration": 2.5052480697631836 + }, + { + "auxiliary_loss_clip": 0.01115793, + "auxiliary_loss_mlp": 0.01030211, + "balance_loss_clip": 1.04228699, + "balance_loss_mlp": 1.01869512, + "epoch": 0.9102660453930558, + "flos": 21908561904000.0, + "grad_norm": 1.6709034597236987, + "language_loss": 0.77492464, + "learning_rate": 8.382044832376167e-08, + "loss": 0.79638463, + "num_input_tokens_seen": 326635905, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.1151123, + "step": 15140, + "time_per_iteration": 2.5705156326293945 + }, + { + "auxiliary_loss_clip": 0.01118449, + "auxiliary_loss_mlp": 0.01032428, + "balance_loss_clip": 1.04233789, + "balance_loss_mlp": 1.02065635, + "epoch": 0.9103261686457237, + "flos": 36176660916480.0, + "grad_norm": 4.370673826609985, + "language_loss": 0.6667645, + "learning_rate": 8.370891630121569e-08, + "loss": 0.68827331, + "num_input_tokens_seen": 326661855, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.11785889, + "step": 15141, + "time_per_iteration": 2.590984344482422 + }, + { + "auxiliary_loss_clip": 0.01120133, + "auxiliary_loss_mlp": 0.0103441, + "balance_loss_clip": 1.04484928, + "balance_loss_mlp": 1.02284658, + "epoch": 0.9103862918983917, + "flos": 23878549365120.0, + "grad_norm": 1.7060192565204944, + "language_loss": 0.74924767, + "learning_rate": 8.359745694462005e-08, + "loss": 0.77079308, + "num_input_tokens_seen": 326679320, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.11578369, + "step": 15142, + "time_per_iteration": 2.485126256942749 + }, + { + "auxiliary_loss_clip": 0.0111234, + "auxiliary_loss_mlp": 0.01026172, + "balance_loss_clip": 1.04154313, + "balance_loss_mlp": 1.01564598, + "epoch": 0.9104464151510596, + "flos": 14939521989120.0, + "grad_norm": 1.801836906983293, + "language_loss": 0.64609611, + "learning_rate": 8.348607025820076e-08, + "loss": 0.66748118, + "num_input_tokens_seen": 326698110, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.10516357, + "step": 15143, + "time_per_iteration": 2.429227352142334 + }, + { + "auxiliary_loss_clip": 0.0111343, + "auxiliary_loss_mlp": 0.01032399, + "balance_loss_clip": 1.03999293, + "balance_loss_mlp": 1.01999569, + "epoch": 0.9105065384037276, + "flos": 33655803500160.0, + "grad_norm": 1.7037191056837646, + "language_loss": 0.60760111, + "learning_rate": 8.337475624618152e-08, + "loss": 0.62905937, + "num_input_tokens_seen": 326718370, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.12402344, + "step": 15144, + "time_per_iteration": 2.534263849258423 + }, + { + "auxiliary_loss_clip": 0.01116477, + "auxiliary_loss_mlp": 0.01027574, + "balance_loss_clip": 1.04702377, + "balance_loss_mlp": 1.01609969, + "epoch": 0.9105666616563957, + "flos": 24316695463680.0, + "grad_norm": 1.5902480112261672, + "language_loss": 0.7083115, + "learning_rate": 8.326351491278382e-08, + "loss": 0.72975194, + "num_input_tokens_seen": 326738445, + "router_z_loss_clip": 0.69433594, + "router_z_loss_mlp": 0.11480713, + "step": 15145, + "time_per_iteration": 2.4978036880493164 + }, + { + "auxiliary_loss_clip": 0.01110826, + "auxiliary_loss_mlp": 0.01029664, + "balance_loss_clip": 1.04156661, + "balance_loss_mlp": 1.01871467, + "epoch": 0.9106267849090636, + "flos": 29971692132480.0, + "grad_norm": 1.6325277283910715, + "language_loss": 0.70921475, + "learning_rate": 8.315234626222545e-08, + "loss": 0.73061967, + "num_input_tokens_seen": 326758855, + "router_z_loss_clip": 0.69287109, + "router_z_loss_mlp": 0.10955811, + "step": 15146, + "time_per_iteration": 2.489353895187378 + }, + { + "auxiliary_loss_clip": 0.01110614, + "auxiliary_loss_mlp": 0.0102978, + "balance_loss_clip": 1.04004145, + "balance_loss_mlp": 1.01912856, + "epoch": 0.9106869081617316, + "flos": 25337743470720.0, + "grad_norm": 1.8776498906595742, + "language_loss": 0.7263115, + "learning_rate": 8.304125029872233e-08, + "loss": 0.74771547, + "num_input_tokens_seen": 326777140, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.10662842, + "step": 15147, + "time_per_iteration": 2.4713473320007324 + }, + { + "auxiliary_loss_clip": 0.01114614, + "auxiliary_loss_mlp": 0.01027527, + "balance_loss_clip": 1.03944242, + "balance_loss_mlp": 1.01578522, + "epoch": 0.9107470314143995, + "flos": 18187031543040.0, + "grad_norm": 2.2749183203488372, + "language_loss": 0.80104911, + "learning_rate": 8.293022702648711e-08, + "loss": 0.82247055, + "num_input_tokens_seen": 326794070, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.11737061, + "step": 15148, + "time_per_iteration": 2.4710824489593506 + }, + { + "auxiliary_loss_clip": 0.01118536, + "auxiliary_loss_mlp": 0.01038366, + "balance_loss_clip": 1.04319477, + "balance_loss_mlp": 1.02628446, + "epoch": 0.9108071546670675, + "flos": 23550828652800.0, + "grad_norm": 2.2188669697083383, + "language_loss": 0.67958015, + "learning_rate": 8.281927644972996e-08, + "loss": 0.70114917, + "num_input_tokens_seen": 326814695, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.12078857, + "step": 15149, + "time_per_iteration": 2.426276922225952 + }, + { + "auxiliary_loss_clip": 0.01120753, + "auxiliary_loss_mlp": 0.01024578, + "balance_loss_clip": 1.04792845, + "balance_loss_mlp": 1.01261544, + "epoch": 0.9108672779197354, + "flos": 25630307746560.0, + "grad_norm": 1.6095877161734728, + "language_loss": 0.63158554, + "learning_rate": 8.270839857265776e-08, + "loss": 0.65303886, + "num_input_tokens_seen": 326835295, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11962891, + "step": 15150, + "time_per_iteration": 3.951775074005127 + }, + { + "auxiliary_loss_clip": 0.01109072, + "auxiliary_loss_mlp": 0.01031424, + "balance_loss_clip": 1.03769588, + "balance_loss_mlp": 1.01928866, + "epoch": 0.9109274011724035, + "flos": 22339094319360.0, + "grad_norm": 1.9631268156342794, + "language_loss": 0.72873986, + "learning_rate": 8.259759339947514e-08, + "loss": 0.75014472, + "num_input_tokens_seen": 326853350, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.12145996, + "step": 15151, + "time_per_iteration": 2.4865260124206543 + }, + { + "auxiliary_loss_clip": 0.0111423, + "auxiliary_loss_mlp": 0.01027817, + "balance_loss_clip": 1.04350984, + "balance_loss_mlp": 1.01634908, + "epoch": 0.9109875244250714, + "flos": 26688200129280.0, + "grad_norm": 1.724105058569063, + "language_loss": 0.64604688, + "learning_rate": 8.248686093438429e-08, + "loss": 0.6674673, + "num_input_tokens_seen": 326873425, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11474609, + "step": 15152, + "time_per_iteration": 2.474667549133301 + }, + { + "auxiliary_loss_clip": 0.01110097, + "auxiliary_loss_mlp": 0.01027251, + "balance_loss_clip": 1.03876472, + "balance_loss_mlp": 1.01485348, + "epoch": 0.9110476476777394, + "flos": 22930112701440.0, + "grad_norm": 1.9443778518828385, + "language_loss": 0.73738211, + "learning_rate": 8.23762011815834e-08, + "loss": 0.75875556, + "num_input_tokens_seen": 326893455, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.12402344, + "step": 15153, + "time_per_iteration": 3.9899544715881348 + }, + { + "auxiliary_loss_clip": 0.01112087, + "auxiliary_loss_mlp": 0.010277, + "balance_loss_clip": 1.04067016, + "balance_loss_mlp": 1.0163753, + "epoch": 0.9111077709304073, + "flos": 13472857854720.0, + "grad_norm": 2.3687389471821114, + "language_loss": 0.72148091, + "learning_rate": 8.226561414526956e-08, + "loss": 0.74287879, + "num_input_tokens_seen": 326910210, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.11340332, + "step": 15154, + "time_per_iteration": 2.4487404823303223 + }, + { + "auxiliary_loss_clip": 0.01112245, + "auxiliary_loss_mlp": 0.01030081, + "balance_loss_clip": 1.04186606, + "balance_loss_mlp": 1.01892877, + "epoch": 0.9111678941830753, + "flos": 20850561780480.0, + "grad_norm": 3.726896447894254, + "language_loss": 0.81825423, + "learning_rate": 8.215509982963564e-08, + "loss": 0.83967745, + "num_input_tokens_seen": 326929350, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.1114502, + "step": 15155, + "time_per_iteration": 2.4351181983947754 + }, + { + "auxiliary_loss_clip": 0.0111343, + "auxiliary_loss_mlp": 0.01029126, + "balance_loss_clip": 1.04278076, + "balance_loss_mlp": 1.01721728, + "epoch": 0.9112280174357432, + "flos": 19682244011520.0, + "grad_norm": 1.520112052813614, + "language_loss": 0.596398, + "learning_rate": 8.204465823887252e-08, + "loss": 0.6178236, + "num_input_tokens_seen": 326949060, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.11914062, + "step": 15156, + "time_per_iteration": 2.43693208694458 + }, + { + "auxiliary_loss_clip": 0.01110526, + "auxiliary_loss_mlp": 0.01028074, + "balance_loss_clip": 1.03496301, + "balance_loss_mlp": 1.01520562, + "epoch": 0.9112881406884112, + "flos": 25447163276160.0, + "grad_norm": 9.333265599181289, + "language_loss": 0.73716247, + "learning_rate": 8.193428937716796e-08, + "loss": 0.7585485, + "num_input_tokens_seen": 326968950, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12878418, + "step": 15157, + "time_per_iteration": 2.502052068710327 + }, + { + "auxiliary_loss_clip": 0.0111995, + "auxiliary_loss_mlp": 0.01034665, + "balance_loss_clip": 1.0472846, + "balance_loss_mlp": 1.02390623, + "epoch": 0.9113482639410793, + "flos": 33066975847680.0, + "grad_norm": 2.076042970712567, + "language_loss": 0.59525949, + "learning_rate": 8.182399324870747e-08, + "loss": 0.61680567, + "num_input_tokens_seen": 326989455, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.10766602, + "step": 15158, + "time_per_iteration": 2.5263359546661377 + }, + { + "auxiliary_loss_clip": 0.01106832, + "auxiliary_loss_mlp": 0.01034572, + "balance_loss_clip": 1.03585339, + "balance_loss_mlp": 1.02336013, + "epoch": 0.9114083871937472, + "flos": 21835591424640.0, + "grad_norm": 1.7598348254483767, + "language_loss": 0.6783787, + "learning_rate": 8.171376985767375e-08, + "loss": 0.69979274, + "num_input_tokens_seen": 327009640, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11206055, + "step": 15159, + "time_per_iteration": 2.4726486206054688 + }, + { + "auxiliary_loss_clip": 0.01105185, + "auxiliary_loss_mlp": 0.0103002, + "balance_loss_clip": 1.03460097, + "balance_loss_mlp": 1.0183022, + "epoch": 0.9114685104464152, + "flos": 27088999061760.0, + "grad_norm": 1.9455301052147471, + "language_loss": 0.78314221, + "learning_rate": 8.160361920824588e-08, + "loss": 0.80449426, + "num_input_tokens_seen": 327027690, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.11712646, + "step": 15160, + "time_per_iteration": 2.4936437606811523 + }, + { + "auxiliary_loss_clip": 0.01115077, + "auxiliary_loss_mlp": 0.01028008, + "balance_loss_clip": 1.04173136, + "balance_loss_mlp": 1.01541984, + "epoch": 0.9115286336990831, + "flos": 17967042696960.0, + "grad_norm": 1.6722663291439686, + "language_loss": 0.68791479, + "learning_rate": 8.149354130460073e-08, + "loss": 0.7093457, + "num_input_tokens_seen": 327045915, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.12597656, + "step": 15161, + "time_per_iteration": 3.9240312576293945 + }, + { + "auxiliary_loss_clip": 0.01110025, + "auxiliary_loss_mlp": 0.01029489, + "balance_loss_clip": 1.03674603, + "balance_loss_mlp": 1.01719832, + "epoch": 0.9115887569517511, + "flos": 22929861306240.0, + "grad_norm": 2.5053952280473384, + "language_loss": 0.76582599, + "learning_rate": 8.138353615091321e-08, + "loss": 0.78722113, + "num_input_tokens_seen": 327066355, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.12286377, + "step": 15162, + "time_per_iteration": 2.6206464767456055 + }, + { + "auxiliary_loss_clip": 0.0110968, + "auxiliary_loss_mlp": 0.01030227, + "balance_loss_clip": 1.03812063, + "balance_loss_mlp": 1.01847243, + "epoch": 0.911648880204419, + "flos": 23988436047360.0, + "grad_norm": 3.11596741808107, + "language_loss": 0.67223418, + "learning_rate": 8.127360375135395e-08, + "loss": 0.69363326, + "num_input_tokens_seen": 327086735, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11743164, + "step": 15163, + "time_per_iteration": 2.4550750255584717 + }, + { + "auxiliary_loss_clip": 0.01113714, + "auxiliary_loss_mlp": 0.01030111, + "balance_loss_clip": 1.03943276, + "balance_loss_mlp": 1.01784456, + "epoch": 0.911709003457087, + "flos": 17055306754560.0, + "grad_norm": 2.1566354198884534, + "language_loss": 0.70332766, + "learning_rate": 8.116374411009186e-08, + "loss": 0.7247659, + "num_input_tokens_seen": 327104035, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.12268066, + "step": 15164, + "time_per_iteration": 2.4241814613342285 + }, + { + "auxiliary_loss_clip": 0.01112826, + "auxiliary_loss_mlp": 0.01031402, + "balance_loss_clip": 1.04230809, + "balance_loss_mlp": 1.01995802, + "epoch": 0.911769126709755, + "flos": 21653344794240.0, + "grad_norm": 1.4843139041022753, + "language_loss": 0.76163042, + "learning_rate": 8.105395723129315e-08, + "loss": 0.78307265, + "num_input_tokens_seen": 327124370, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.11450195, + "step": 15165, + "time_per_iteration": 2.5174336433410645 + }, + { + "auxiliary_loss_clip": 0.0111031, + "auxiliary_loss_mlp": 0.01036227, + "balance_loss_clip": 1.0383954, + "balance_loss_mlp": 1.02307832, + "epoch": 0.911829249962423, + "flos": 24790321221120.0, + "grad_norm": 2.102081691788182, + "language_loss": 0.72502089, + "learning_rate": 8.094424311912074e-08, + "loss": 0.74648619, + "num_input_tokens_seen": 327140915, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.13153076, + "step": 15166, + "time_per_iteration": 2.4811747074127197 + }, + { + "auxiliary_loss_clip": 0.01121252, + "auxiliary_loss_mlp": 0.01033739, + "balance_loss_clip": 1.04824758, + "balance_loss_mlp": 1.02100158, + "epoch": 0.9118893732150909, + "flos": 20959406968320.0, + "grad_norm": 1.9158031655676564, + "language_loss": 0.7302264, + "learning_rate": 8.083460177773482e-08, + "loss": 0.75177622, + "num_input_tokens_seen": 327158940, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.12731934, + "step": 15167, + "time_per_iteration": 2.4286439418792725 + }, + { + "auxiliary_loss_clip": 0.01040238, + "auxiliary_loss_mlp": 0.01001154, + "balance_loss_clip": 1.01576269, + "balance_loss_mlp": 0.99980521, + "epoch": 0.9119494964677589, + "flos": 67917385872000.0, + "grad_norm": 0.7685071879368122, + "language_loss": 0.65549624, + "learning_rate": 8.072503321129298e-08, + "loss": 0.67591012, + "num_input_tokens_seen": 327217450, + "router_z_loss_clip": 0.24438477, + "router_z_loss_mlp": 0.01348877, + "step": 15168, + "time_per_iteration": 3.0375442504882812 + }, + { + "auxiliary_loss_clip": 0.0110928, + "auxiliary_loss_mlp": 0.01034497, + "balance_loss_clip": 1.0385859, + "balance_loss_mlp": 1.02192616, + "epoch": 0.9120096197204268, + "flos": 18551524803840.0, + "grad_norm": 2.1633500610137806, + "language_loss": 0.78515691, + "learning_rate": 8.061553742395033e-08, + "loss": 0.80659467, + "num_input_tokens_seen": 327233905, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.12561035, + "step": 15169, + "time_per_iteration": 2.470186233520508 + }, + { + "auxiliary_loss_clip": 0.01111928, + "auxiliary_loss_mlp": 0.01033191, + "balance_loss_clip": 1.04020023, + "balance_loss_mlp": 1.02041209, + "epoch": 0.9120697429730948, + "flos": 19025725178880.0, + "grad_norm": 1.6150065235356568, + "language_loss": 0.81950712, + "learning_rate": 8.05061144198591e-08, + "loss": 0.84095836, + "num_input_tokens_seen": 327252430, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.12774658, + "step": 15170, + "time_per_iteration": 2.451641321182251 + }, + { + "auxiliary_loss_clip": 0.01117367, + "auxiliary_loss_mlp": 0.01030107, + "balance_loss_clip": 1.04560828, + "balance_loss_mlp": 1.0180012, + "epoch": 0.9121298662257629, + "flos": 17163685065600.0, + "grad_norm": 1.969322808942209, + "language_loss": 0.7721355, + "learning_rate": 8.039676420316799e-08, + "loss": 0.79361022, + "num_input_tokens_seen": 327269215, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.12115479, + "step": 15171, + "time_per_iteration": 2.4135630130767822 + }, + { + "auxiliary_loss_clip": 0.01108323, + "auxiliary_loss_mlp": 0.01030931, + "balance_loss_clip": 1.03822923, + "balance_loss_mlp": 1.0194633, + "epoch": 0.9121899894784308, + "flos": 19682710888320.0, + "grad_norm": 1.3292511187845875, + "language_loss": 0.67050028, + "learning_rate": 8.02874867780241e-08, + "loss": 0.69189286, + "num_input_tokens_seen": 327290320, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.11468506, + "step": 15172, + "time_per_iteration": 2.437014102935791 + }, + { + "auxiliary_loss_clip": 0.01119845, + "auxiliary_loss_mlp": 0.01035302, + "balance_loss_clip": 1.04645038, + "balance_loss_mlp": 1.02274907, + "epoch": 0.9122501127310988, + "flos": 22235743912320.0, + "grad_norm": 2.201704119599536, + "language_loss": 0.75185066, + "learning_rate": 8.017828214857103e-08, + "loss": 0.77340209, + "num_input_tokens_seen": 327310150, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.12554932, + "step": 15173, + "time_per_iteration": 2.4149088859558105 + }, + { + "auxiliary_loss_clip": 0.01118852, + "auxiliary_loss_mlp": 0.01037327, + "balance_loss_clip": 1.04250169, + "balance_loss_mlp": 1.02184153, + "epoch": 0.9123102359837667, + "flos": 15957122290560.0, + "grad_norm": 2.2874407934524412, + "language_loss": 0.65996164, + "learning_rate": 8.00691503189499e-08, + "loss": 0.68152344, + "num_input_tokens_seen": 327326660, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.15490723, + "step": 15174, + "time_per_iteration": 2.4510657787323 + }, + { + "auxiliary_loss_clip": 0.01121661, + "auxiliary_loss_mlp": 0.01030865, + "balance_loss_clip": 1.04629445, + "balance_loss_mlp": 1.01795435, + "epoch": 0.9123703592364347, + "flos": 25155784149120.0, + "grad_norm": 1.9795084122120965, + "language_loss": 0.75257999, + "learning_rate": 7.996009129329894e-08, + "loss": 0.77410525, + "num_input_tokens_seen": 327346700, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12908936, + "step": 15175, + "time_per_iteration": 2.4305732250213623 + }, + { + "auxiliary_loss_clip": 0.01047623, + "auxiliary_loss_mlp": 0.01004391, + "balance_loss_clip": 1.02350497, + "balance_loss_mlp": 1.00322759, + "epoch": 0.9124304824891026, + "flos": 60801650812800.0, + "grad_norm": 0.9649104204966903, + "language_loss": 0.58394486, + "learning_rate": 7.985110507575421e-08, + "loss": 0.60446501, + "num_input_tokens_seen": 327403050, + "router_z_loss_clip": 0.24145508, + "router_z_loss_mlp": 0.01164246, + "step": 15176, + "time_per_iteration": 3.100468158721924 + }, + { + "auxiliary_loss_clip": 0.01129344, + "auxiliary_loss_mlp": 0.01031675, + "balance_loss_clip": 1.05418229, + "balance_loss_mlp": 1.02039218, + "epoch": 0.9124906057417707, + "flos": 18150941352960.0, + "grad_norm": 1.8977676914383352, + "language_loss": 0.65608001, + "learning_rate": 7.97421916704475e-08, + "loss": 0.67769021, + "num_input_tokens_seen": 327422225, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.112854, + "step": 15177, + "time_per_iteration": 3.936816930770874 + }, + { + "auxiliary_loss_clip": 0.01113328, + "auxiliary_loss_mlp": 0.01025768, + "balance_loss_clip": 1.04193473, + "balance_loss_mlp": 1.01460433, + "epoch": 0.9125507289944386, + "flos": 11686769049600.0, + "grad_norm": 1.9966621685502994, + "language_loss": 0.81026226, + "learning_rate": 7.963335108150926e-08, + "loss": 0.83165324, + "num_input_tokens_seen": 327437025, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.1116333, + "step": 15178, + "time_per_iteration": 2.3830487728118896 + }, + { + "auxiliary_loss_clip": 0.01111479, + "auxiliary_loss_mlp": 0.01026642, + "balance_loss_clip": 1.04072261, + "balance_loss_mlp": 1.01559746, + "epoch": 0.9126108522471066, + "flos": 17748813617280.0, + "grad_norm": 1.9832674915714072, + "language_loss": 0.79205275, + "learning_rate": 7.952458331306711e-08, + "loss": 0.81343406, + "num_input_tokens_seen": 327453915, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11053467, + "step": 15179, + "time_per_iteration": 2.3968963623046875 + }, + { + "auxiliary_loss_clip": 0.01105013, + "auxiliary_loss_mlp": 0.01033345, + "balance_loss_clip": 1.03485048, + "balance_loss_mlp": 1.02215099, + "epoch": 0.9126709754997745, + "flos": 27635738952960.0, + "grad_norm": 1.703271351959681, + "language_loss": 0.68153036, + "learning_rate": 7.941588836924507e-08, + "loss": 0.70291394, + "num_input_tokens_seen": 327474415, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.11193848, + "step": 15180, + "time_per_iteration": 2.4800031185150146 + }, + { + "auxiliary_loss_clip": 0.01104555, + "auxiliary_loss_mlp": 0.01026595, + "balance_loss_clip": 1.03454518, + "balance_loss_mlp": 1.01566994, + "epoch": 0.9127310987524425, + "flos": 15924982596480.0, + "grad_norm": 1.761920137991812, + "language_loss": 0.7509743, + "learning_rate": 7.930726625416495e-08, + "loss": 0.77228582, + "num_input_tokens_seen": 327492750, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.10925293, + "step": 15181, + "time_per_iteration": 2.439173698425293 + }, + { + "auxiliary_loss_clip": 0.01121763, + "auxiliary_loss_mlp": 0.01029639, + "balance_loss_clip": 1.0467298, + "balance_loss_mlp": 1.0179143, + "epoch": 0.9127912220051104, + "flos": 21536885923200.0, + "grad_norm": 1.7618440570920075, + "language_loss": 0.74806988, + "learning_rate": 7.919871697194614e-08, + "loss": 0.76958388, + "num_input_tokens_seen": 327509470, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11724854, + "step": 15182, + "time_per_iteration": 2.476186513900757 + }, + { + "auxiliary_loss_clip": 0.0111258, + "auxiliary_loss_mlp": 0.0103038, + "balance_loss_clip": 1.0397948, + "balance_loss_mlp": 1.01875114, + "epoch": 0.9128513452577784, + "flos": 24063561342720.0, + "grad_norm": 1.544151505503698, + "language_loss": 0.76583606, + "learning_rate": 7.909024052670421e-08, + "loss": 0.78726566, + "num_input_tokens_seen": 327530520, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11633301, + "step": 15183, + "time_per_iteration": 2.5245871543884277 + }, + { + "auxiliary_loss_clip": 0.0111353, + "auxiliary_loss_mlp": 0.01036078, + "balance_loss_clip": 1.03897583, + "balance_loss_mlp": 1.02275038, + "epoch": 0.9129114685104465, + "flos": 16216469464320.0, + "grad_norm": 2.4754806809107266, + "language_loss": 0.77081686, + "learning_rate": 7.898183692255256e-08, + "loss": 0.79231292, + "num_input_tokens_seen": 327546960, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.13342285, + "step": 15184, + "time_per_iteration": 2.435163974761963 + }, + { + "auxiliary_loss_clip": 0.01109209, + "auxiliary_loss_mlp": 0.01031785, + "balance_loss_clip": 1.03766954, + "balance_loss_mlp": 1.01992977, + "epoch": 0.9129715917631144, + "flos": 19384364522880.0, + "grad_norm": 2.022421636256193, + "language_loss": 0.74522799, + "learning_rate": 7.887350616360233e-08, + "loss": 0.76663792, + "num_input_tokens_seen": 327564830, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11846924, + "step": 15185, + "time_per_iteration": 2.479752540588379 + }, + { + "auxiliary_loss_clip": 0.01112687, + "auxiliary_loss_mlp": 0.01032478, + "balance_loss_clip": 1.04028559, + "balance_loss_mlp": 1.01980567, + "epoch": 0.9130317150157824, + "flos": 20590460421120.0, + "grad_norm": 2.177160593878594, + "language_loss": 0.6885972, + "learning_rate": 7.876524825396158e-08, + "loss": 0.71004891, + "num_input_tokens_seen": 327583675, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.12652588, + "step": 15186, + "time_per_iteration": 2.4401581287384033 + }, + { + "auxiliary_loss_clip": 0.01124353, + "auxiliary_loss_mlp": 0.01033505, + "balance_loss_clip": 1.04645991, + "balance_loss_mlp": 1.02076185, + "epoch": 0.9130918382684503, + "flos": 20189230525440.0, + "grad_norm": 1.979987206296395, + "language_loss": 0.77415264, + "learning_rate": 7.865706319773502e-08, + "loss": 0.79573119, + "num_input_tokens_seen": 327602280, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.12750244, + "step": 15187, + "time_per_iteration": 2.443927526473999 + }, + { + "auxiliary_loss_clip": 0.01115033, + "auxiliary_loss_mlp": 0.01028516, + "balance_loss_clip": 1.04309833, + "balance_loss_mlp": 1.01797771, + "epoch": 0.9131519615211183, + "flos": 25556870390400.0, + "grad_norm": 1.9734189872859889, + "language_loss": 0.65719551, + "learning_rate": 7.854895099902515e-08, + "loss": 0.67863101, + "num_input_tokens_seen": 327623515, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.10534668, + "step": 15188, + "time_per_iteration": 2.4714574813842773 + }, + { + "auxiliary_loss_clip": 0.01105442, + "auxiliary_loss_mlp": 0.0102746, + "balance_loss_clip": 1.03466272, + "balance_loss_mlp": 1.01652861, + "epoch": 0.9132120847737862, + "flos": 17931563038080.0, + "grad_norm": 1.7774389262114805, + "language_loss": 0.76761341, + "learning_rate": 7.844091166193157e-08, + "loss": 0.78894246, + "num_input_tokens_seen": 327642875, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.10919189, + "step": 15189, + "time_per_iteration": 2.415513753890991 + }, + { + "auxiliary_loss_clip": 0.01104574, + "auxiliary_loss_mlp": 0.01027857, + "balance_loss_clip": 1.03540134, + "balance_loss_mlp": 1.0174377, + "epoch": 0.9132722080264543, + "flos": 20047635112320.0, + "grad_norm": 1.669385477275456, + "language_loss": 0.75927907, + "learning_rate": 7.8332945190551e-08, + "loss": 0.78060341, + "num_input_tokens_seen": 327662450, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 0.10412598, + "step": 15190, + "time_per_iteration": 2.471383810043335 + }, + { + "auxiliary_loss_clip": 0.01037627, + "auxiliary_loss_mlp": 0.0100324, + "balance_loss_clip": 1.0128516, + "balance_loss_mlp": 1.00187027, + "epoch": 0.9133323312791222, + "flos": 70439967141120.0, + "grad_norm": 0.6928719673958451, + "language_loss": 0.57334322, + "learning_rate": 7.822505158897797e-08, + "loss": 0.59375191, + "num_input_tokens_seen": 327723845, + "router_z_loss_clip": 0.24780273, + "router_z_loss_mlp": 0.01368713, + "step": 15191, + "time_per_iteration": 3.156212091445923 + }, + { + "auxiliary_loss_clip": 0.01114464, + "auxiliary_loss_mlp": 0.010383, + "balance_loss_clip": 1.04152322, + "balance_loss_mlp": 1.02486467, + "epoch": 0.9133924545317902, + "flos": 25483792170240.0, + "grad_norm": 9.6991129906409, + "language_loss": 0.74408174, + "learning_rate": 7.81172308613034e-08, + "loss": 0.76560938, + "num_input_tokens_seen": 327742590, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.13439941, + "step": 15192, + "time_per_iteration": 2.472882032394409 + }, + { + "auxiliary_loss_clip": 0.01106497, + "auxiliary_loss_mlp": 0.01031489, + "balance_loss_clip": 1.03791583, + "balance_loss_mlp": 1.0184176, + "epoch": 0.9134525777844581, + "flos": 39930690107520.0, + "grad_norm": 1.5711520285298757, + "language_loss": 0.69449592, + "learning_rate": 7.800948301161647e-08, + "loss": 0.71587586, + "num_input_tokens_seen": 327764350, + "router_z_loss_clip": 0.68554688, + "router_z_loss_mlp": 0.13079834, + "step": 15193, + "time_per_iteration": 4.050630569458008 + }, + { + "auxiliary_loss_clip": 0.01113446, + "auxiliary_loss_mlp": 0.01035978, + "balance_loss_clip": 1.0434916, + "balance_loss_mlp": 1.0246532, + "epoch": 0.9135127010371261, + "flos": 20886723797760.0, + "grad_norm": 1.649239485866699, + "language_loss": 0.73310161, + "learning_rate": 7.790180804400215e-08, + "loss": 0.75459588, + "num_input_tokens_seen": 327783120, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.11322021, + "step": 15194, + "time_per_iteration": 2.44238543510437 + }, + { + "auxiliary_loss_clip": 0.01111972, + "auxiliary_loss_mlp": 0.01034728, + "balance_loss_clip": 1.03798223, + "balance_loss_mlp": 1.01994002, + "epoch": 0.913572824289794, + "flos": 20813250528000.0, + "grad_norm": 1.8403264312973169, + "language_loss": 0.61694103, + "learning_rate": 7.779420596254383e-08, + "loss": 0.63840806, + "num_input_tokens_seen": 327801960, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.14788818, + "step": 15195, + "time_per_iteration": 2.424696445465088 + }, + { + "auxiliary_loss_clip": 0.01114656, + "auxiliary_loss_mlp": 0.01031864, + "balance_loss_clip": 1.04165566, + "balance_loss_mlp": 1.01900744, + "epoch": 0.913632947542462, + "flos": 25703278225920.0, + "grad_norm": 1.535215364567886, + "language_loss": 0.71409762, + "learning_rate": 7.768667677132201e-08, + "loss": 0.7355628, + "num_input_tokens_seen": 327823795, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.12866211, + "step": 15196, + "time_per_iteration": 2.473538398742676 + }, + { + "auxiliary_loss_clip": 0.01118176, + "auxiliary_loss_mlp": 0.01032341, + "balance_loss_clip": 1.04464436, + "balance_loss_mlp": 1.02039027, + "epoch": 0.9136930707951301, + "flos": 26286216048000.0, + "grad_norm": 1.568488920951521, + "language_loss": 0.71204132, + "learning_rate": 7.757922047441411e-08, + "loss": 0.7335465, + "num_input_tokens_seen": 327845175, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11950684, + "step": 15197, + "time_per_iteration": 3.9788882732391357 + }, + { + "auxiliary_loss_clip": 0.01123437, + "auxiliary_loss_mlp": 0.01032521, + "balance_loss_clip": 1.04900384, + "balance_loss_mlp": 1.01892507, + "epoch": 0.913753194047798, + "flos": 22091885942400.0, + "grad_norm": 1.9279962078774566, + "language_loss": 0.78036052, + "learning_rate": 7.747183707589489e-08, + "loss": 0.80192012, + "num_input_tokens_seen": 327863150, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.13598633, + "step": 15198, + "time_per_iteration": 2.462153911590576 + }, + { + "auxiliary_loss_clip": 0.01109201, + "auxiliary_loss_mlp": 0.01029391, + "balance_loss_clip": 1.03954339, + "balance_loss_mlp": 1.01823878, + "epoch": 0.913813317300466, + "flos": 23587206151680.0, + "grad_norm": 1.5383397212800702, + "language_loss": 0.6777761, + "learning_rate": 7.736452657983616e-08, + "loss": 0.69916201, + "num_input_tokens_seen": 327883445, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.11151123, + "step": 15199, + "time_per_iteration": 2.5092077255249023 + }, + { + "auxiliary_loss_clip": 0.01114798, + "auxiliary_loss_mlp": 0.01031315, + "balance_loss_clip": 1.04306936, + "balance_loss_mlp": 1.02014518, + "epoch": 0.9138734405531339, + "flos": 28876452583680.0, + "grad_norm": 1.4913680300896528, + "language_loss": 0.67463005, + "learning_rate": 7.725728899030714e-08, + "loss": 0.69609118, + "num_input_tokens_seen": 327905745, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.1116333, + "step": 15200, + "time_per_iteration": 2.4918947219848633 + }, + { + "auxiliary_loss_clip": 0.01118766, + "auxiliary_loss_mlp": 0.01029357, + "balance_loss_clip": 1.04836035, + "balance_loss_mlp": 1.01878333, + "epoch": 0.9139335638058019, + "flos": 22821087945600.0, + "grad_norm": 3.217781776027631, + "language_loss": 0.7090959, + "learning_rate": 7.715012431137435e-08, + "loss": 0.73057711, + "num_input_tokens_seen": 327925435, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.10571289, + "step": 15201, + "time_per_iteration": 2.4428656101226807 + }, + { + "auxiliary_loss_clip": 0.01110747, + "auxiliary_loss_mlp": 0.0102501, + "balance_loss_clip": 1.04047084, + "balance_loss_mlp": 1.01441789, + "epoch": 0.9139936870584698, + "flos": 18004174381440.0, + "grad_norm": 1.8057136804054204, + "language_loss": 0.71030784, + "learning_rate": 7.704303254710165e-08, + "loss": 0.73166537, + "num_input_tokens_seen": 327944145, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.105896, + "step": 15202, + "time_per_iteration": 2.5334692001342773 + }, + { + "auxiliary_loss_clip": 0.01111044, + "auxiliary_loss_mlp": 0.01029292, + "balance_loss_clip": 1.03918386, + "balance_loss_mlp": 1.01709735, + "epoch": 0.9140538103111379, + "flos": 15813767111040.0, + "grad_norm": 1.890025057638321, + "language_loss": 0.66736281, + "learning_rate": 7.693601370155001e-08, + "loss": 0.68876618, + "num_input_tokens_seen": 327960565, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.12188721, + "step": 15203, + "time_per_iteration": 2.3866748809814453 + }, + { + "auxiliary_loss_clip": 0.01119657, + "auxiliary_loss_mlp": 0.01029516, + "balance_loss_clip": 1.04383683, + "balance_loss_mlp": 1.01690912, + "epoch": 0.9141139335638058, + "flos": 23987035416960.0, + "grad_norm": 1.849596056684508, + "language_loss": 0.69179773, + "learning_rate": 7.682906777877751e-08, + "loss": 0.71328944, + "num_input_tokens_seen": 327981180, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.1262207, + "step": 15204, + "time_per_iteration": 2.5156936645507812 + }, + { + "auxiliary_loss_clip": 0.01118477, + "auxiliary_loss_mlp": 0.01025832, + "balance_loss_clip": 1.0453229, + "balance_loss_mlp": 1.01368415, + "epoch": 0.9141740568164738, + "flos": 24024418496640.0, + "grad_norm": 1.8922148419846654, + "language_loss": 0.59292591, + "learning_rate": 7.672219478283915e-08, + "loss": 0.61436898, + "num_input_tokens_seen": 328001500, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.12145996, + "step": 15205, + "time_per_iteration": 3.891430377960205 + }, + { + "auxiliary_loss_clip": 0.0110906, + "auxiliary_loss_mlp": 0.01030003, + "balance_loss_clip": 1.04150784, + "balance_loss_mlp": 1.01833189, + "epoch": 0.9142341800691417, + "flos": 27018291139200.0, + "grad_norm": 3.103823742827317, + "language_loss": 0.81211472, + "learning_rate": 7.661539471778811e-08, + "loss": 0.83350539, + "num_input_tokens_seen": 328023025, + "router_z_loss_clip": 0.67480469, + "router_z_loss_mlp": 0.11682129, + "step": 15206, + "time_per_iteration": 2.500159740447998 + }, + { + "auxiliary_loss_clip": 0.01120201, + "auxiliary_loss_mlp": 0.01027368, + "balance_loss_clip": 1.04663479, + "balance_loss_mlp": 1.01545286, + "epoch": 0.9142943033218097, + "flos": 20412487509120.0, + "grad_norm": 2.539896606824441, + "language_loss": 0.74051213, + "learning_rate": 7.650866758767382e-08, + "loss": 0.76198781, + "num_input_tokens_seen": 328041410, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11920166, + "step": 15207, + "time_per_iteration": 2.426978349685669 + }, + { + "auxiliary_loss_clip": 0.0111225, + "auxiliary_loss_mlp": 0.01033894, + "balance_loss_clip": 1.04163122, + "balance_loss_mlp": 1.02174079, + "epoch": 0.9143544265744776, + "flos": 19755322231680.0, + "grad_norm": 2.5343547568289915, + "language_loss": 0.72809321, + "learning_rate": 7.640201339654373e-08, + "loss": 0.74955463, + "num_input_tokens_seen": 328060495, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.12158203, + "step": 15208, + "time_per_iteration": 2.4325902462005615 + }, + { + "auxiliary_loss_clip": 0.01122098, + "auxiliary_loss_mlp": 0.01027821, + "balance_loss_clip": 1.05008614, + "balance_loss_mlp": 1.01689601, + "epoch": 0.9144145498271457, + "flos": 17165444832000.0, + "grad_norm": 2.3530797601429203, + "language_loss": 0.86858594, + "learning_rate": 7.629543214844237e-08, + "loss": 0.8900851, + "num_input_tokens_seen": 328076905, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.10919189, + "step": 15209, + "time_per_iteration": 2.4566166400909424 + }, + { + "auxiliary_loss_clip": 0.01114414, + "auxiliary_loss_mlp": 0.01031234, + "balance_loss_clip": 1.0446018, + "balance_loss_mlp": 1.02024865, + "epoch": 0.9144746730798137, + "flos": 23726072131200.0, + "grad_norm": 1.6857172973213268, + "language_loss": 0.75286919, + "learning_rate": 7.618892384741093e-08, + "loss": 0.77432567, + "num_input_tokens_seen": 328096960, + "router_z_loss_clip": 0.69775391, + "router_z_loss_mlp": 0.10986328, + "step": 15210, + "time_per_iteration": 2.467021942138672 + }, + { + "auxiliary_loss_clip": 0.01110069, + "auxiliary_loss_mlp": 0.01032968, + "balance_loss_clip": 1.03803504, + "balance_loss_mlp": 1.02107131, + "epoch": 0.9145347963324816, + "flos": 25847854467840.0, + "grad_norm": 1.8370044701674222, + "language_loss": 0.78673351, + "learning_rate": 7.6082488497488e-08, + "loss": 0.80816388, + "num_input_tokens_seen": 328115445, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11895752, + "step": 15211, + "time_per_iteration": 2.542013645172119 + }, + { + "auxiliary_loss_clip": 0.01122469, + "auxiliary_loss_mlp": 0.0102719, + "balance_loss_clip": 1.04783154, + "balance_loss_mlp": 1.01609778, + "epoch": 0.9145949195851496, + "flos": 19242769109760.0, + "grad_norm": 1.7134199956659273, + "language_loss": 0.82898021, + "learning_rate": 7.597612610270986e-08, + "loss": 0.85047686, + "num_input_tokens_seen": 328133965, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11102295, + "step": 15212, + "time_per_iteration": 2.5233893394470215 + }, + { + "auxiliary_loss_clip": 0.01108896, + "auxiliary_loss_mlp": 0.01028722, + "balance_loss_clip": 1.03862953, + "balance_loss_mlp": 1.01771307, + "epoch": 0.9146550428378175, + "flos": 18296379521280.0, + "grad_norm": 1.8272598832655833, + "language_loss": 0.83870387, + "learning_rate": 7.586983666711022e-08, + "loss": 0.86008012, + "num_input_tokens_seen": 328151520, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.11010742, + "step": 15213, + "time_per_iteration": 2.4436697959899902 + }, + { + "auxiliary_loss_clip": 0.01124731, + "auxiliary_loss_mlp": 0.01028065, + "balance_loss_clip": 1.05025578, + "balance_loss_mlp": 1.01704967, + "epoch": 0.9147151660904855, + "flos": 20084264006400.0, + "grad_norm": 1.945328331302103, + "language_loss": 0.70737112, + "learning_rate": 7.576362019471894e-08, + "loss": 0.72889912, + "num_input_tokens_seen": 328171275, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11016846, + "step": 15214, + "time_per_iteration": 2.4460630416870117 + }, + { + "auxiliary_loss_clip": 0.01122842, + "auxiliary_loss_mlp": 0.01035656, + "balance_loss_clip": 1.04832458, + "balance_loss_mlp": 1.02301383, + "epoch": 0.9147752893431534, + "flos": 24389127239040.0, + "grad_norm": 1.898775256979432, + "language_loss": 0.62450171, + "learning_rate": 7.565747668956413e-08, + "loss": 0.64608663, + "num_input_tokens_seen": 328192115, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.12652588, + "step": 15215, + "time_per_iteration": 2.583104372024536 + }, + { + "auxiliary_loss_clip": 0.01125962, + "auxiliary_loss_mlp": 0.01029815, + "balance_loss_clip": 1.04846346, + "balance_loss_mlp": 1.01759565, + "epoch": 0.9148354125958215, + "flos": 18150402648960.0, + "grad_norm": 2.386838188012298, + "language_loss": 0.76843143, + "learning_rate": 7.555140615567058e-08, + "loss": 0.78998917, + "num_input_tokens_seen": 328208990, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.12219238, + "step": 15216, + "time_per_iteration": 2.4183309078216553 + }, + { + "auxiliary_loss_clip": 0.01121082, + "auxiliary_loss_mlp": 0.0103435, + "balance_loss_clip": 1.0465138, + "balance_loss_mlp": 1.02191663, + "epoch": 0.9148955358484894, + "flos": 23367540528000.0, + "grad_norm": 43.91595658341342, + "language_loss": 0.68444824, + "learning_rate": 7.544540859706062e-08, + "loss": 0.70600259, + "num_input_tokens_seen": 328227840, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.12457275, + "step": 15217, + "time_per_iteration": 2.4607410430908203 + }, + { + "auxiliary_loss_clip": 0.01120821, + "auxiliary_loss_mlp": 0.01029374, + "balance_loss_clip": 1.04794264, + "balance_loss_mlp": 1.01791763, + "epoch": 0.9149556591011574, + "flos": 18076498416000.0, + "grad_norm": 1.9812710178994375, + "language_loss": 0.80026233, + "learning_rate": 7.533948401775347e-08, + "loss": 0.82176429, + "num_input_tokens_seen": 328246250, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11450195, + "step": 15218, + "time_per_iteration": 2.407285451889038 + }, + { + "auxiliary_loss_clip": 0.01039308, + "auxiliary_loss_mlp": 0.01001511, + "balance_loss_clip": 1.01434481, + "balance_loss_mlp": 1.00009203, + "epoch": 0.9150157823538253, + "flos": 54586374825600.0, + "grad_norm": 0.8469244706221537, + "language_loss": 0.59155607, + "learning_rate": 7.523363242176595e-08, + "loss": 0.61196423, + "num_input_tokens_seen": 328303625, + "router_z_loss_clip": 0.24926758, + "router_z_loss_mlp": 0.01417542, + "step": 15219, + "time_per_iteration": 3.0522427558898926 + }, + { + "auxiliary_loss_clip": 0.01106407, + "auxiliary_loss_mlp": 0.01029214, + "balance_loss_clip": 1.03819215, + "balance_loss_mlp": 1.01823485, + "epoch": 0.9150759056064933, + "flos": 17893102550400.0, + "grad_norm": 2.2849978065669223, + "language_loss": 0.7853446, + "learning_rate": 7.512785381311216e-08, + "loss": 0.80670083, + "num_input_tokens_seen": 328322135, + "router_z_loss_clip": 0.68212891, + "router_z_loss_mlp": 0.10980225, + "step": 15220, + "time_per_iteration": 2.410907030105591 + }, + { + "auxiliary_loss_clip": 0.0112187, + "auxiliary_loss_mlp": 0.01030773, + "balance_loss_clip": 1.04748225, + "balance_loss_mlp": 1.01793981, + "epoch": 0.9151360288591612, + "flos": 18073517587200.0, + "grad_norm": 1.8222574462096182, + "language_loss": 0.65958142, + "learning_rate": 7.50221481958031e-08, + "loss": 0.68110782, + "num_input_tokens_seen": 328340750, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.12835693, + "step": 15221, + "time_per_iteration": 3.9969356060028076 + }, + { + "auxiliary_loss_clip": 0.01118427, + "auxiliary_loss_mlp": 0.01028632, + "balance_loss_clip": 1.04623497, + "balance_loss_mlp": 1.01815367, + "epoch": 0.9151961521118293, + "flos": 19354523299200.0, + "grad_norm": 2.2058812076734653, + "language_loss": 0.8430109, + "learning_rate": 7.491651557384692e-08, + "loss": 0.86448151, + "num_input_tokens_seen": 328359995, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.10479736, + "step": 15222, + "time_per_iteration": 2.411332845687866 + }, + { + "auxiliary_loss_clip": 0.01050537, + "auxiliary_loss_mlp": 0.01006066, + "balance_loss_clip": 1.02552724, + "balance_loss_mlp": 1.00471461, + "epoch": 0.9152562753644973, + "flos": 72146621018880.0, + "grad_norm": 0.718514627926108, + "language_loss": 0.49589589, + "learning_rate": 7.481095595124953e-08, + "loss": 0.51646191, + "num_input_tokens_seen": 328426865, + "router_z_loss_clip": 0.25097656, + "router_z_loss_mlp": 0.01350403, + "step": 15223, + "time_per_iteration": 3.1008574962615967 + }, + { + "auxiliary_loss_clip": 0.01117743, + "auxiliary_loss_mlp": 0.01037353, + "balance_loss_clip": 1.04477, + "balance_loss_mlp": 1.02537262, + "epoch": 0.9153163986171652, + "flos": 20777016683520.0, + "grad_norm": 2.077323657422068, + "language_loss": 0.72415543, + "learning_rate": 7.470546933201349e-08, + "loss": 0.74570638, + "num_input_tokens_seen": 328445970, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11975098, + "step": 15224, + "time_per_iteration": 2.4601216316223145 + }, + { + "auxiliary_loss_clip": 0.01113838, + "auxiliary_loss_mlp": 0.01025363, + "balance_loss_clip": 1.04307079, + "balance_loss_mlp": 1.01320934, + "epoch": 0.9153765218698332, + "flos": 23040107124480.0, + "grad_norm": 1.9087472344757774, + "language_loss": 0.81256938, + "learning_rate": 7.460005572013895e-08, + "loss": 0.83396143, + "num_input_tokens_seen": 328464585, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.121521, + "step": 15225, + "time_per_iteration": 2.4872822761535645 + }, + { + "auxiliary_loss_clip": 0.01108916, + "auxiliary_loss_mlp": 0.0102734, + "balance_loss_clip": 1.03719246, + "balance_loss_mlp": 1.0148946, + "epoch": 0.9154366451225011, + "flos": 28990900293120.0, + "grad_norm": 1.4239196436438895, + "language_loss": 0.71364349, + "learning_rate": 7.44947151196238e-08, + "loss": 0.73500603, + "num_input_tokens_seen": 328490155, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.12445068, + "step": 15226, + "time_per_iteration": 2.5220978260040283 + }, + { + "auxiliary_loss_clip": 0.01117899, + "auxiliary_loss_mlp": 0.01036879, + "balance_loss_clip": 1.04078388, + "balance_loss_mlp": 1.02443326, + "epoch": 0.9154967683751691, + "flos": 22309504490880.0, + "grad_norm": 2.605012354436974, + "language_loss": 0.74942172, + "learning_rate": 7.43894475344613e-08, + "loss": 0.77096957, + "num_input_tokens_seen": 328508275, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.12469482, + "step": 15227, + "time_per_iteration": 2.5753655433654785 + }, + { + "auxiliary_loss_clip": 0.01113346, + "auxiliary_loss_mlp": 0.01027112, + "balance_loss_clip": 1.04331517, + "balance_loss_mlp": 1.01630592, + "epoch": 0.915556891627837, + "flos": 24571481610240.0, + "grad_norm": 1.4381693976995424, + "language_loss": 0.74049151, + "learning_rate": 7.428425296864404e-08, + "loss": 0.76189613, + "num_input_tokens_seen": 328529425, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.1081543, + "step": 15228, + "time_per_iteration": 2.501323699951172 + }, + { + "auxiliary_loss_clip": 0.01108899, + "auxiliary_loss_mlp": 0.01033916, + "balance_loss_clip": 1.03806067, + "balance_loss_mlp": 1.02260292, + "epoch": 0.9156170148805051, + "flos": 22164676853760.0, + "grad_norm": 2.0219179745504703, + "language_loss": 0.72207892, + "learning_rate": 7.417913142616106e-08, + "loss": 0.74350709, + "num_input_tokens_seen": 328550200, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11315918, + "step": 15229, + "time_per_iteration": 2.5025408267974854 + }, + { + "auxiliary_loss_clip": 0.01116628, + "auxiliary_loss_mlp": 0.01031998, + "balance_loss_clip": 1.04474473, + "balance_loss_mlp": 1.01993966, + "epoch": 0.915677138133173, + "flos": 20920659171840.0, + "grad_norm": 1.547214158820024, + "language_loss": 0.8318522, + "learning_rate": 7.407408291099848e-08, + "loss": 0.85333842, + "num_input_tokens_seen": 328568540, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.12060547, + "step": 15230, + "time_per_iteration": 2.4445881843566895 + }, + { + "auxiliary_loss_clip": 0.01116822, + "auxiliary_loss_mlp": 0.01036179, + "balance_loss_clip": 1.04185188, + "balance_loss_mlp": 1.02441859, + "epoch": 0.915737261385841, + "flos": 24345136056960.0, + "grad_norm": 1.895450267074514, + "language_loss": 0.83620632, + "learning_rate": 7.396910742713957e-08, + "loss": 0.85773629, + "num_input_tokens_seen": 328587300, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11755371, + "step": 15231, + "time_per_iteration": 2.4589505195617676 + }, + { + "auxiliary_loss_clip": 0.01112668, + "auxiliary_loss_mlp": 0.01023151, + "balance_loss_clip": 1.04048872, + "balance_loss_mlp": 1.01207042, + "epoch": 0.9157973846385089, + "flos": 26761386090240.0, + "grad_norm": 1.615466127641185, + "language_loss": 0.72506607, + "learning_rate": 7.386420497856516e-08, + "loss": 0.7464242, + "num_input_tokens_seen": 328610055, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11083984, + "step": 15232, + "time_per_iteration": 2.4980950355529785 + }, + { + "auxiliary_loss_clip": 0.01115925, + "auxiliary_loss_mlp": 0.01032353, + "balance_loss_clip": 1.04286838, + "balance_loss_mlp": 1.02124298, + "epoch": 0.9158575078911769, + "flos": 18478733892480.0, + "grad_norm": 4.030797839386708, + "language_loss": 0.67683893, + "learning_rate": 7.375937556925338e-08, + "loss": 0.69832176, + "num_input_tokens_seen": 328626815, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11114502, + "step": 15233, + "time_per_iteration": 2.4276280403137207 + }, + { + "auxiliary_loss_clip": 0.01117455, + "auxiliary_loss_mlp": 0.01029557, + "balance_loss_clip": 1.04344177, + "balance_loss_mlp": 1.01723671, + "epoch": 0.9159176311438448, + "flos": 21798926616960.0, + "grad_norm": 3.21387676887765, + "language_loss": 0.69539219, + "learning_rate": 7.365461920317861e-08, + "loss": 0.71686232, + "num_input_tokens_seen": 328643995, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12322998, + "step": 15234, + "time_per_iteration": 2.426774263381958 + }, + { + "auxiliary_loss_clip": 0.01116731, + "auxiliary_loss_mlp": 0.01035088, + "balance_loss_clip": 1.04163265, + "balance_loss_mlp": 1.02288103, + "epoch": 0.9159777543965129, + "flos": 24783749032320.0, + "grad_norm": 1.6124865795367997, + "language_loss": 0.88143992, + "learning_rate": 7.354993588431391e-08, + "loss": 0.90295815, + "num_input_tokens_seen": 328659565, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12194824, + "step": 15235, + "time_per_iteration": 2.4582877159118652 + }, + { + "auxiliary_loss_clip": 0.01114286, + "auxiliary_loss_mlp": 0.01027584, + "balance_loss_clip": 1.04218769, + "balance_loss_mlp": 1.01633072, + "epoch": 0.9160378776491809, + "flos": 26868758820480.0, + "grad_norm": 1.6428795080284202, + "language_loss": 0.76883852, + "learning_rate": 7.344532561662853e-08, + "loss": 0.79025722, + "num_input_tokens_seen": 328679045, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11248779, + "step": 15236, + "time_per_iteration": 2.464513063430786 + }, + { + "auxiliary_loss_clip": 0.01050631, + "auxiliary_loss_mlp": 0.01001147, + "balance_loss_clip": 1.02637887, + "balance_loss_mlp": 0.99998891, + "epoch": 0.9160980009018488, + "flos": 70578222589440.0, + "grad_norm": 0.6733288335618249, + "language_loss": 0.62161076, + "learning_rate": 7.334078840409019e-08, + "loss": 0.64212859, + "num_input_tokens_seen": 328744565, + "router_z_loss_clip": 0.24267578, + "router_z_loss_mlp": 0.01156616, + "step": 15237, + "time_per_iteration": 4.443428993225098 + }, + { + "auxiliary_loss_clip": 0.01122849, + "auxiliary_loss_mlp": 0.01028725, + "balance_loss_clip": 1.04889941, + "balance_loss_mlp": 1.01638663, + "epoch": 0.9161581241545168, + "flos": 16289332202880.0, + "grad_norm": 1.9718300480386635, + "language_loss": 0.74528503, + "learning_rate": 7.323632425066151e-08, + "loss": 0.76680076, + "num_input_tokens_seen": 328762455, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.12335205, + "step": 15238, + "time_per_iteration": 2.506683588027954 + }, + { + "auxiliary_loss_clip": 0.01120652, + "auxiliary_loss_mlp": 0.01023475, + "balance_loss_clip": 1.047014, + "balance_loss_mlp": 1.01233435, + "epoch": 0.9162182474071847, + "flos": 18438154502400.0, + "grad_norm": 1.6369425985377608, + "language_loss": 0.74754238, + "learning_rate": 7.313193316030464e-08, + "loss": 0.7689836, + "num_input_tokens_seen": 328780320, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11138916, + "step": 15239, + "time_per_iteration": 2.4103944301605225 + }, + { + "auxiliary_loss_clip": 0.01114153, + "auxiliary_loss_mlp": 0.01032315, + "balance_loss_clip": 1.04133606, + "balance_loss_mlp": 1.02001286, + "epoch": 0.9162783706598527, + "flos": 19167248764800.0, + "grad_norm": 2.194069379629493, + "language_loss": 0.63369161, + "learning_rate": 7.302761513697819e-08, + "loss": 0.65515625, + "num_input_tokens_seen": 328797570, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.12298584, + "step": 15240, + "time_per_iteration": 2.5215766429901123 + }, + { + "auxiliary_loss_clip": 0.01118671, + "auxiliary_loss_mlp": 0.01023256, + "balance_loss_clip": 1.04852152, + "balance_loss_mlp": 1.01246727, + "epoch": 0.9163384939125206, + "flos": 20412990299520.0, + "grad_norm": 1.944080464476372, + "language_loss": 0.76223898, + "learning_rate": 7.292337018463746e-08, + "loss": 0.78365827, + "num_input_tokens_seen": 328814075, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.10791016, + "step": 15241, + "time_per_iteration": 3.8660354614257812 + }, + { + "auxiliary_loss_clip": 0.01121087, + "auxiliary_loss_mlp": 0.01029819, + "balance_loss_clip": 1.04157591, + "balance_loss_mlp": 1.01629448, + "epoch": 0.9163986171651887, + "flos": 19645902426240.0, + "grad_norm": 2.5096961279566723, + "language_loss": 0.67228818, + "learning_rate": 7.281919830723549e-08, + "loss": 0.69379723, + "num_input_tokens_seen": 328831990, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.13519287, + "step": 15242, + "time_per_iteration": 2.4033472537994385 + }, + { + "auxiliary_loss_clip": 0.01111554, + "auxiliary_loss_mlp": 0.01027589, + "balance_loss_clip": 1.03971767, + "balance_loss_mlp": 1.01583505, + "epoch": 0.9164587404178566, + "flos": 12823054865280.0, + "grad_norm": 1.7770537111115776, + "language_loss": 0.80913377, + "learning_rate": 7.271509950872334e-08, + "loss": 0.83052522, + "num_input_tokens_seen": 328849105, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11749268, + "step": 15243, + "time_per_iteration": 2.433703660964966 + }, + { + "auxiliary_loss_clip": 0.01122095, + "auxiliary_loss_mlp": 0.01028925, + "balance_loss_clip": 1.04523218, + "balance_loss_mlp": 1.01719475, + "epoch": 0.9165188636705246, + "flos": 22309396750080.0, + "grad_norm": 1.9692932184089458, + "language_loss": 0.81917202, + "learning_rate": 7.261107379304721e-08, + "loss": 0.84068227, + "num_input_tokens_seen": 328866810, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.11743164, + "step": 15244, + "time_per_iteration": 2.417884111404419 + }, + { + "auxiliary_loss_clip": 0.01115384, + "auxiliary_loss_mlp": 0.01032337, + "balance_loss_clip": 1.04063082, + "balance_loss_mlp": 1.01991558, + "epoch": 0.9165789869231925, + "flos": 18223337214720.0, + "grad_norm": 2.5070457658178107, + "language_loss": 0.72442675, + "learning_rate": 7.250712116415214e-08, + "loss": 0.74590397, + "num_input_tokens_seen": 328885325, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.12414551, + "step": 15245, + "time_per_iteration": 2.4365463256835938 + }, + { + "auxiliary_loss_clip": 0.0111696, + "auxiliary_loss_mlp": 0.01029438, + "balance_loss_clip": 1.04439199, + "balance_loss_mlp": 1.01851273, + "epoch": 0.9166391101758605, + "flos": 13691553811200.0, + "grad_norm": 1.9052298323021415, + "language_loss": 0.74760026, + "learning_rate": 7.240324162598033e-08, + "loss": 0.76906419, + "num_input_tokens_seen": 328902655, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.10919189, + "step": 15246, + "time_per_iteration": 2.414515256881714 + }, + { + "auxiliary_loss_clip": 0.01108622, + "auxiliary_loss_mlp": 0.01030845, + "balance_loss_clip": 1.03666437, + "balance_loss_mlp": 1.01862025, + "epoch": 0.9166992334285284, + "flos": 17346793622400.0, + "grad_norm": 2.6925328636304946, + "language_loss": 0.75441086, + "learning_rate": 7.229943518247106e-08, + "loss": 0.77580547, + "num_input_tokens_seen": 328918440, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.12231445, + "step": 15247, + "time_per_iteration": 2.419968843460083 + }, + { + "auxiliary_loss_clip": 0.01118603, + "auxiliary_loss_mlp": 0.0102644, + "balance_loss_clip": 1.04548037, + "balance_loss_mlp": 1.0143702, + "epoch": 0.9167593566811965, + "flos": 23731135948800.0, + "grad_norm": 1.8883121973348782, + "language_loss": 0.75910401, + "learning_rate": 7.219570183756052e-08, + "loss": 0.78055441, + "num_input_tokens_seen": 328938055, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.12060547, + "step": 15248, + "time_per_iteration": 2.4694881439208984 + }, + { + "auxiliary_loss_clip": 0.01109381, + "auxiliary_loss_mlp": 0.01031864, + "balance_loss_clip": 1.03747845, + "balance_loss_mlp": 1.01932919, + "epoch": 0.9168194799338644, + "flos": 27818201064960.0, + "grad_norm": 5.364845846212463, + "language_loss": 0.72810692, + "learning_rate": 7.209204159518178e-08, + "loss": 0.74951935, + "num_input_tokens_seen": 328957895, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.12524414, + "step": 15249, + "time_per_iteration": 3.991572141647339 + }, + { + "auxiliary_loss_clip": 0.01110268, + "auxiliary_loss_mlp": 0.01028232, + "balance_loss_clip": 1.03748178, + "balance_loss_mlp": 1.01591778, + "epoch": 0.9168796031865324, + "flos": 21717552355200.0, + "grad_norm": 2.093148125582712, + "language_loss": 0.759345, + "learning_rate": 7.198845445926616e-08, + "loss": 0.78073001, + "num_input_tokens_seen": 328971365, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.12329102, + "step": 15250, + "time_per_iteration": 2.449061870574951 + }, + { + "auxiliary_loss_clip": 0.01112313, + "auxiliary_loss_mlp": 0.01026056, + "balance_loss_clip": 1.04200649, + "balance_loss_mlp": 1.01432002, + "epoch": 0.9169397264392004, + "flos": 23404420817280.0, + "grad_norm": 1.553683565643078, + "language_loss": 0.75930494, + "learning_rate": 7.188494043374138e-08, + "loss": 0.78068864, + "num_input_tokens_seen": 328990830, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.11743164, + "step": 15251, + "time_per_iteration": 2.5186092853546143 + }, + { + "auxiliary_loss_clip": 0.01113467, + "auxiliary_loss_mlp": 0.01032955, + "balance_loss_clip": 1.04057813, + "balance_loss_mlp": 1.01969314, + "epoch": 0.9169998496918683, + "flos": 23950981140480.0, + "grad_norm": 2.3402437711487725, + "language_loss": 0.80452359, + "learning_rate": 7.178149952253298e-08, + "loss": 0.82598776, + "num_input_tokens_seen": 329008345, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.13262939, + "step": 15252, + "time_per_iteration": 2.5634803771972656 + }, + { + "auxiliary_loss_clip": 0.01107026, + "auxiliary_loss_mlp": 0.01031601, + "balance_loss_clip": 1.03565764, + "balance_loss_mlp": 1.01995456, + "epoch": 0.9170599729445363, + "flos": 18332469711360.0, + "grad_norm": 1.8821233990073698, + "language_loss": 0.77228308, + "learning_rate": 7.167813172956316e-08, + "loss": 0.7936694, + "num_input_tokens_seen": 329027440, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11657715, + "step": 15253, + "time_per_iteration": 2.4574997425079346 + }, + { + "auxiliary_loss_clip": 0.01115296, + "auxiliary_loss_mlp": 0.01027356, + "balance_loss_clip": 1.04144275, + "balance_loss_mlp": 1.01622224, + "epoch": 0.9171200961972042, + "flos": 22674859678080.0, + "grad_norm": 1.8499297596103086, + "language_loss": 0.72907817, + "learning_rate": 7.157483705875256e-08, + "loss": 0.75050473, + "num_input_tokens_seen": 329046445, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11126709, + "step": 15254, + "time_per_iteration": 2.429443359375 + }, + { + "auxiliary_loss_clip": 0.01106569, + "auxiliary_loss_mlp": 0.01024785, + "balance_loss_clip": 1.03776932, + "balance_loss_mlp": 1.01380563, + "epoch": 0.9171802194498723, + "flos": 26719298328960.0, + "grad_norm": 1.5386971501280102, + "language_loss": 0.79521883, + "learning_rate": 7.14716155140167e-08, + "loss": 0.81653237, + "num_input_tokens_seen": 329065555, + "router_z_loss_clip": 0.68896484, + "router_z_loss_mlp": 0.10974121, + "step": 15255, + "time_per_iteration": 2.566540479660034 + }, + { + "auxiliary_loss_clip": 0.01120461, + "auxiliary_loss_mlp": 0.01033624, + "balance_loss_clip": 1.04610991, + "balance_loss_mlp": 1.02052283, + "epoch": 0.9172403427025402, + "flos": 37889240538240.0, + "grad_norm": 2.14966661184434, + "language_loss": 0.68657345, + "learning_rate": 7.136846709927047e-08, + "loss": 0.70811427, + "num_input_tokens_seen": 329087515, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.13110352, + "step": 15256, + "time_per_iteration": 2.5690765380859375 + }, + { + "auxiliary_loss_clip": 0.01106976, + "auxiliary_loss_mlp": 0.01029691, + "balance_loss_clip": 1.0373528, + "balance_loss_mlp": 1.01895642, + "epoch": 0.9173004659552082, + "flos": 17055163100160.0, + "grad_norm": 1.702092080668885, + "language_loss": 0.83872479, + "learning_rate": 7.126539181842561e-08, + "loss": 0.86009139, + "num_input_tokens_seen": 329106820, + "router_z_loss_clip": 0.69677734, + "router_z_loss_mlp": 0.10723877, + "step": 15257, + "time_per_iteration": 2.469701051712036 + }, + { + "auxiliary_loss_clip": 0.01107408, + "auxiliary_loss_mlp": 0.01029633, + "balance_loss_clip": 1.03788066, + "balance_loss_mlp": 1.01886797, + "epoch": 0.9173605892078761, + "flos": 22201593056640.0, + "grad_norm": 2.0891981018160584, + "language_loss": 0.77602553, + "learning_rate": 7.116238967539012e-08, + "loss": 0.79739594, + "num_input_tokens_seen": 329126515, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.10766602, + "step": 15258, + "time_per_iteration": 2.467012882232666 + }, + { + "auxiliary_loss_clip": 0.01115337, + "auxiliary_loss_mlp": 0.01029012, + "balance_loss_clip": 1.04434717, + "balance_loss_mlp": 1.01734114, + "epoch": 0.9174207124605441, + "flos": 16507776764160.0, + "grad_norm": 6.183288570637782, + "language_loss": 0.78786159, + "learning_rate": 7.105946067406999e-08, + "loss": 0.80930507, + "num_input_tokens_seen": 329142660, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.11669922, + "step": 15259, + "time_per_iteration": 2.419156312942505 + }, + { + "auxiliary_loss_clip": 0.01109347, + "auxiliary_loss_mlp": 0.01032006, + "balance_loss_clip": 1.03722966, + "balance_loss_mlp": 1.02096128, + "epoch": 0.917480835713212, + "flos": 24535606901760.0, + "grad_norm": 1.5489483743665242, + "language_loss": 0.76331127, + "learning_rate": 7.095660481836895e-08, + "loss": 0.78472477, + "num_input_tokens_seen": 329162575, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.11035156, + "step": 15260, + "time_per_iteration": 2.467891216278076 + }, + { + "auxiliary_loss_clip": 0.01108142, + "auxiliary_loss_mlp": 0.01032267, + "balance_loss_clip": 1.03586054, + "balance_loss_mlp": 1.0205667, + "epoch": 0.9175409589658801, + "flos": 20880726226560.0, + "grad_norm": 1.56251957054244, + "language_loss": 0.61214614, + "learning_rate": 7.085382211218637e-08, + "loss": 0.63355023, + "num_input_tokens_seen": 329182090, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11706543, + "step": 15261, + "time_per_iteration": 2.5605878829956055 + }, + { + "auxiliary_loss_clip": 0.01105908, + "auxiliary_loss_mlp": 0.01026319, + "balance_loss_clip": 1.03434181, + "balance_loss_mlp": 1.01497591, + "epoch": 0.917601082218548, + "flos": 14276035918080.0, + "grad_norm": 1.8565686855903651, + "language_loss": 0.73637605, + "learning_rate": 7.075111255942002e-08, + "loss": 0.7576983, + "num_input_tokens_seen": 329196535, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11346436, + "step": 15262, + "time_per_iteration": 2.4939517974853516 + }, + { + "auxiliary_loss_clip": 0.01113401, + "auxiliary_loss_mlp": 0.01038309, + "balance_loss_clip": 1.03649735, + "balance_loss_mlp": 1.02604842, + "epoch": 0.917661205471216, + "flos": 19099234362240.0, + "grad_norm": 1.97236290052155, + "language_loss": 0.77958846, + "learning_rate": 7.064847616396496e-08, + "loss": 0.80110562, + "num_input_tokens_seen": 329215135, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.12268066, + "step": 15263, + "time_per_iteration": 2.4354536533355713 + }, + { + "auxiliary_loss_clip": 0.0111911, + "auxiliary_loss_mlp": 0.01030003, + "balance_loss_clip": 1.04480577, + "balance_loss_mlp": 1.01825488, + "epoch": 0.917721328723884, + "flos": 21106568989440.0, + "grad_norm": 1.93043099480518, + "language_loss": 0.75505406, + "learning_rate": 7.054591292971324e-08, + "loss": 0.77654517, + "num_input_tokens_seen": 329235150, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11755371, + "step": 15264, + "time_per_iteration": 3.9593963623046875 + }, + { + "auxiliary_loss_clip": 0.01111131, + "auxiliary_loss_mlp": 0.01033663, + "balance_loss_clip": 1.03892684, + "balance_loss_mlp": 1.02273142, + "epoch": 0.9177814519765519, + "flos": 21943215550080.0, + "grad_norm": 2.2844073310412982, + "language_loss": 0.83438611, + "learning_rate": 7.044342286055394e-08, + "loss": 0.85583407, + "num_input_tokens_seen": 329254365, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.109375, + "step": 15265, + "time_per_iteration": 2.4243178367614746 + }, + { + "auxiliary_loss_clip": 0.011118, + "auxiliary_loss_mlp": 0.01039487, + "balance_loss_clip": 1.03761506, + "balance_loss_mlp": 1.02681482, + "epoch": 0.9178415752292199, + "flos": 24205982768640.0, + "grad_norm": 1.6116634354095316, + "language_loss": 0.73410654, + "learning_rate": 7.034100596037306e-08, + "loss": 0.75561947, + "num_input_tokens_seen": 329274385, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.12683105, + "step": 15266, + "time_per_iteration": 2.531498432159424 + }, + { + "auxiliary_loss_clip": 0.01116053, + "auxiliary_loss_mlp": 0.01030821, + "balance_loss_clip": 1.04297376, + "balance_loss_mlp": 1.01954365, + "epoch": 0.9179016984818879, + "flos": 20042068504320.0, + "grad_norm": 1.6221448430143444, + "language_loss": 0.77871001, + "learning_rate": 7.023866223305486e-08, + "loss": 0.80017877, + "num_input_tokens_seen": 329292160, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.112854, + "step": 15267, + "time_per_iteration": 2.503526449203491 + }, + { + "auxiliary_loss_clip": 0.01062963, + "auxiliary_loss_mlp": 0.0100611, + "balance_loss_clip": 1.03841567, + "balance_loss_mlp": 1.00454128, + "epoch": 0.9179618217345559, + "flos": 65555901100800.0, + "grad_norm": 0.7366387535845828, + "language_loss": 0.56212401, + "learning_rate": 7.013639168247975e-08, + "loss": 0.58281475, + "num_input_tokens_seen": 329351870, + "router_z_loss_clip": 0.24536133, + "router_z_loss_mlp": 0.01568604, + "step": 15268, + "time_per_iteration": 3.1290335655212402 + }, + { + "auxiliary_loss_clip": 0.01116928, + "auxiliary_loss_mlp": 0.01029197, + "balance_loss_clip": 1.04229259, + "balance_loss_mlp": 1.01583958, + "epoch": 0.9180219449872238, + "flos": 21324618501120.0, + "grad_norm": 2.048408541749834, + "language_loss": 0.76250231, + "learning_rate": 7.0034194312526e-08, + "loss": 0.78396356, + "num_input_tokens_seen": 329370930, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.13354492, + "step": 15269, + "time_per_iteration": 2.4108052253723145 + }, + { + "auxiliary_loss_clip": 0.01117327, + "auxiliary_loss_mlp": 0.01031561, + "balance_loss_clip": 1.04559243, + "balance_loss_mlp": 1.01901388, + "epoch": 0.9180820682398918, + "flos": 41060008684800.0, + "grad_norm": 2.304820461638219, + "language_loss": 0.72630793, + "learning_rate": 6.993207012706936e-08, + "loss": 0.74779677, + "num_input_tokens_seen": 329391275, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.12536621, + "step": 15270, + "time_per_iteration": 2.607632875442505 + }, + { + "auxiliary_loss_clip": 0.01111554, + "auxiliary_loss_mlp": 0.01032797, + "balance_loss_clip": 1.03841352, + "balance_loss_mlp": 1.02100706, + "epoch": 0.9181421914925597, + "flos": 28072915384320.0, + "grad_norm": 1.6180795706449058, + "language_loss": 0.79982781, + "learning_rate": 6.98300191299821e-08, + "loss": 0.82127136, + "num_input_tokens_seen": 329412775, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11791992, + "step": 15271, + "time_per_iteration": 2.463569402694702 + }, + { + "auxiliary_loss_clip": 0.01115029, + "auxiliary_loss_mlp": 0.01033482, + "balance_loss_clip": 1.04117274, + "balance_loss_mlp": 1.02150178, + "epoch": 0.9182023147452277, + "flos": 29169411909120.0, + "grad_norm": 1.8588674627056883, + "language_loss": 0.72837424, + "learning_rate": 6.972804132513355e-08, + "loss": 0.74985933, + "num_input_tokens_seen": 329432440, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11993408, + "step": 15272, + "time_per_iteration": 2.493314266204834 + }, + { + "auxiliary_loss_clip": 0.01105323, + "auxiliary_loss_mlp": 0.01033096, + "balance_loss_clip": 1.03422344, + "balance_loss_mlp": 1.02186084, + "epoch": 0.9182624379978956, + "flos": 24060831909120.0, + "grad_norm": 1.9373857536829782, + "language_loss": 0.72736758, + "learning_rate": 6.962613671639105e-08, + "loss": 0.74875176, + "num_input_tokens_seen": 329450605, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.11236572, + "step": 15273, + "time_per_iteration": 2.4794440269470215 + }, + { + "auxiliary_loss_clip": 0.01103826, + "auxiliary_loss_mlp": 0.010258, + "balance_loss_clip": 1.03721058, + "balance_loss_mlp": 1.01539874, + "epoch": 0.9183225612505637, + "flos": 23293528554240.0, + "grad_norm": 1.8364388066893018, + "language_loss": 0.74429739, + "learning_rate": 6.952430530761933e-08, + "loss": 0.76559365, + "num_input_tokens_seen": 329470550, + "router_z_loss_clip": 0.66601562, + "router_z_loss_mlp": 0.10400391, + "step": 15274, + "time_per_iteration": 2.498835802078247 + }, + { + "auxiliary_loss_clip": 0.01105597, + "auxiliary_loss_mlp": 0.01032458, + "balance_loss_clip": 1.0334928, + "balance_loss_mlp": 1.02115083, + "epoch": 0.9183826845032316, + "flos": 19609237618560.0, + "grad_norm": 1.5252626958833706, + "language_loss": 0.68651253, + "learning_rate": 6.942254710267902e-08, + "loss": 0.70789301, + "num_input_tokens_seen": 329489765, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11303711, + "step": 15275, + "time_per_iteration": 2.440244436264038 + }, + { + "auxiliary_loss_clip": 0.01106539, + "auxiliary_loss_mlp": 0.01030439, + "balance_loss_clip": 1.03583097, + "balance_loss_mlp": 1.01860797, + "epoch": 0.9184428077558996, + "flos": 18479057114880.0, + "grad_norm": 1.893291323125131, + "language_loss": 0.72568679, + "learning_rate": 6.932086210542953e-08, + "loss": 0.7470566, + "num_input_tokens_seen": 329507040, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.11834717, + "step": 15276, + "time_per_iteration": 2.428719997406006 + }, + { + "auxiliary_loss_clip": 0.01115528, + "auxiliary_loss_mlp": 0.01031867, + "balance_loss_clip": 1.04294181, + "balance_loss_mlp": 1.02035129, + "epoch": 0.9185029310085676, + "flos": 20741034234240.0, + "grad_norm": 2.0702264160433237, + "language_loss": 0.73774588, + "learning_rate": 6.921925031972642e-08, + "loss": 0.75921977, + "num_input_tokens_seen": 329525540, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11505127, + "step": 15277, + "time_per_iteration": 2.5060062408447266 + }, + { + "auxiliary_loss_clip": 0.0104629, + "auxiliary_loss_mlp": 0.01003731, + "balance_loss_clip": 1.0208497, + "balance_loss_mlp": 1.00231993, + "epoch": 0.9185630542612355, + "flos": 68209231875840.0, + "grad_norm": 0.7154472156965723, + "language_loss": 0.59189528, + "learning_rate": 6.91177117494226e-08, + "loss": 0.61239541, + "num_input_tokens_seen": 329592905, + "router_z_loss_clip": 0.25439453, + "router_z_loss_mlp": 0.01412964, + "step": 15278, + "time_per_iteration": 3.1488497257232666 + }, + { + "auxiliary_loss_clip": 0.0110977, + "auxiliary_loss_mlp": 0.01026872, + "balance_loss_clip": 1.03754735, + "balance_loss_mlp": 1.01670933, + "epoch": 0.9186231775139035, + "flos": 12239470598400.0, + "grad_norm": 1.9158250834388268, + "language_loss": 0.64608961, + "learning_rate": 6.901624639836879e-08, + "loss": 0.66745603, + "num_input_tokens_seen": 329610150, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.10168457, + "step": 15279, + "time_per_iteration": 2.4381656646728516 + }, + { + "auxiliary_loss_clip": 0.01042706, + "auxiliary_loss_mlp": 0.01001258, + "balance_loss_clip": 1.01790774, + "balance_loss_mlp": 0.99984372, + "epoch": 0.9186833007665715, + "flos": 63939237770880.0, + "grad_norm": 0.853736607253029, + "language_loss": 0.60138726, + "learning_rate": 6.891485427041211e-08, + "loss": 0.62182689, + "num_input_tokens_seen": 329673650, + "router_z_loss_clip": 0.24780273, + "router_z_loss_mlp": 0.01412964, + "step": 15280, + "time_per_iteration": 4.44871973991394 + }, + { + "auxiliary_loss_clip": 0.01126966, + "auxiliary_loss_mlp": 0.01035161, + "balance_loss_clip": 1.04959011, + "balance_loss_mlp": 1.02309752, + "epoch": 0.9187434240192395, + "flos": 19974700546560.0, + "grad_norm": 1.793521612914704, + "language_loss": 0.69574368, + "learning_rate": 6.881353536939815e-08, + "loss": 0.71736497, + "num_input_tokens_seen": 329692520, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.1206665, + "step": 15281, + "time_per_iteration": 2.4586386680603027 + }, + { + "auxiliary_loss_clip": 0.0111379, + "auxiliary_loss_mlp": 0.01028944, + "balance_loss_clip": 1.04101992, + "balance_loss_mlp": 1.01674902, + "epoch": 0.9188035472719074, + "flos": 25227820874880.0, + "grad_norm": 1.6733253168089288, + "language_loss": 0.84997618, + "learning_rate": 6.871228969916831e-08, + "loss": 0.87140346, + "num_input_tokens_seen": 329713750, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.12213135, + "step": 15282, + "time_per_iteration": 2.4738152027130127 + }, + { + "auxiliary_loss_clip": 0.01116186, + "auxiliary_loss_mlp": 0.01034366, + "balance_loss_clip": 1.04575491, + "balance_loss_mlp": 1.02220643, + "epoch": 0.9188636705245754, + "flos": 18405547931520.0, + "grad_norm": 1.8904725868589374, + "language_loss": 0.60493129, + "learning_rate": 6.861111726356194e-08, + "loss": 0.62643683, + "num_input_tokens_seen": 329730960, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.121521, + "step": 15283, + "time_per_iteration": 2.4092376232147217 + }, + { + "auxiliary_loss_clip": 0.01118434, + "auxiliary_loss_mlp": 0.01031156, + "balance_loss_clip": 1.04328716, + "balance_loss_mlp": 1.01931298, + "epoch": 0.9189237937772433, + "flos": 23769129559680.0, + "grad_norm": 1.7079100987566718, + "language_loss": 0.6544717, + "learning_rate": 6.851001806641554e-08, + "loss": 0.67596757, + "num_input_tokens_seen": 329750975, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.11846924, + "step": 15284, + "time_per_iteration": 3.907470941543579 + }, + { + "auxiliary_loss_clip": 0.01106223, + "auxiliary_loss_mlp": 0.01029725, + "balance_loss_clip": 1.03473032, + "balance_loss_mlp": 1.01772082, + "epoch": 0.9189839170299113, + "flos": 21214624078080.0, + "grad_norm": 1.8837111475454376, + "language_loss": 0.73492265, + "learning_rate": 6.840899211156292e-08, + "loss": 0.75628209, + "num_input_tokens_seen": 329769645, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.12017822, + "step": 15285, + "time_per_iteration": 2.4661998748779297 + }, + { + "auxiliary_loss_clip": 0.01109882, + "auxiliary_loss_mlp": 0.0103274, + "balance_loss_clip": 1.03968215, + "balance_loss_mlp": 1.01990104, + "epoch": 0.9190440402825792, + "flos": 16727370560640.0, + "grad_norm": 2.1151122561793887, + "language_loss": 0.72001946, + "learning_rate": 6.830803940283458e-08, + "loss": 0.74144572, + "num_input_tokens_seen": 329788185, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.12841797, + "step": 15286, + "time_per_iteration": 2.400109052658081 + }, + { + "auxiliary_loss_clip": 0.01113292, + "auxiliary_loss_mlp": 0.0103114, + "balance_loss_clip": 1.04052174, + "balance_loss_mlp": 1.01929665, + "epoch": 0.9191041635352473, + "flos": 23441193365760.0, + "grad_norm": 2.560504167411755, + "language_loss": 0.7339974, + "learning_rate": 6.820715994405945e-08, + "loss": 0.75544167, + "num_input_tokens_seen": 329806780, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11834717, + "step": 15287, + "time_per_iteration": 2.4877521991729736 + }, + { + "auxiliary_loss_clip": 0.0112019, + "auxiliary_loss_mlp": 0.01034344, + "balance_loss_clip": 1.04722238, + "balance_loss_mlp": 1.02130222, + "epoch": 0.9191642867879152, + "flos": 18807532012800.0, + "grad_norm": 2.012909249370959, + "language_loss": 0.65501326, + "learning_rate": 6.810635373906226e-08, + "loss": 0.67655861, + "num_input_tokens_seen": 329826350, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.13043213, + "step": 15288, + "time_per_iteration": 2.412946939468384 + }, + { + "auxiliary_loss_clip": 0.01121669, + "auxiliary_loss_mlp": 0.01030983, + "balance_loss_clip": 1.04681516, + "balance_loss_mlp": 1.01927638, + "epoch": 0.9192244100405832, + "flos": 32160950167680.0, + "grad_norm": 2.0745633605653384, + "language_loss": 0.71550035, + "learning_rate": 6.800562079166549e-08, + "loss": 0.73702681, + "num_input_tokens_seen": 329846160, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.11724854, + "step": 15289, + "time_per_iteration": 2.566445827484131 + }, + { + "auxiliary_loss_clip": 0.01119477, + "auxiliary_loss_mlp": 0.01031156, + "balance_loss_clip": 1.04711866, + "balance_loss_mlp": 1.01941431, + "epoch": 0.9192845332932512, + "flos": 16357669827840.0, + "grad_norm": 1.9478035535345632, + "language_loss": 0.74611306, + "learning_rate": 6.790496110568921e-08, + "loss": 0.76761937, + "num_input_tokens_seen": 329862020, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11743164, + "step": 15290, + "time_per_iteration": 2.4177660942077637 + }, + { + "auxiliary_loss_clip": 0.01109798, + "auxiliary_loss_mlp": 0.01034029, + "balance_loss_clip": 1.03947282, + "balance_loss_mlp": 1.0216372, + "epoch": 0.9193446565459191, + "flos": 26614475464320.0, + "grad_norm": 1.8314989579152066, + "language_loss": 0.71982485, + "learning_rate": 6.78043746849506e-08, + "loss": 0.74126315, + "num_input_tokens_seen": 329880185, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.12402344, + "step": 15291, + "time_per_iteration": 2.4421849250793457 + }, + { + "auxiliary_loss_clip": 0.01118415, + "auxiliary_loss_mlp": 0.01027913, + "balance_loss_clip": 1.0463953, + "balance_loss_mlp": 1.0165112, + "epoch": 0.9194047797985871, + "flos": 22492182084480.0, + "grad_norm": 1.6245918476122883, + "language_loss": 0.70939732, + "learning_rate": 6.770386153326346e-08, + "loss": 0.73086059, + "num_input_tokens_seen": 329900255, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.11413574, + "step": 15292, + "time_per_iteration": 3.971813440322876 + }, + { + "auxiliary_loss_clip": 0.01113733, + "auxiliary_loss_mlp": 0.01033354, + "balance_loss_clip": 1.04108191, + "balance_loss_mlp": 1.02134943, + "epoch": 0.9194649030512551, + "flos": 25078791346560.0, + "grad_norm": 1.6805690255590782, + "language_loss": 0.72940254, + "learning_rate": 6.760342165443988e-08, + "loss": 0.75087339, + "num_input_tokens_seen": 329919095, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11999512, + "step": 15293, + "time_per_iteration": 2.4517571926116943 + }, + { + "auxiliary_loss_clip": 0.01112091, + "auxiliary_loss_mlp": 0.01027903, + "balance_loss_clip": 1.04150915, + "balance_loss_mlp": 1.01620865, + "epoch": 0.9195250263039231, + "flos": 11911139354880.0, + "grad_norm": 1.7607676911944574, + "language_loss": 0.77949989, + "learning_rate": 6.750305505228837e-08, + "loss": 0.80089986, + "num_input_tokens_seen": 329936505, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.11688232, + "step": 15294, + "time_per_iteration": 2.42793345451355 + }, + { + "auxiliary_loss_clip": 0.01116721, + "auxiliary_loss_mlp": 0.01031329, + "balance_loss_clip": 1.04147482, + "balance_loss_mlp": 1.0183059, + "epoch": 0.919585149556591, + "flos": 21834154880640.0, + "grad_norm": 1.59221794068179, + "language_loss": 0.77316689, + "learning_rate": 6.74027617306141e-08, + "loss": 0.79464746, + "num_input_tokens_seen": 329956795, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.13018799, + "step": 15295, + "time_per_iteration": 2.479174852371216 + }, + { + "auxiliary_loss_clip": 0.01112585, + "auxiliary_loss_mlp": 0.01026206, + "balance_loss_clip": 1.0437386, + "balance_loss_mlp": 1.01585317, + "epoch": 0.919645272809259, + "flos": 28184059042560.0, + "grad_norm": 2.072369035914449, + "language_loss": 0.71753204, + "learning_rate": 6.730254169322114e-08, + "loss": 0.73891997, + "num_input_tokens_seen": 329977195, + "router_z_loss_clip": 0.68798828, + "router_z_loss_mlp": 0.10345459, + "step": 15296, + "time_per_iteration": 2.5079989433288574 + }, + { + "auxiliary_loss_clip": 0.01116741, + "auxiliary_loss_mlp": 0.01038255, + "balance_loss_clip": 1.04381371, + "balance_loss_mlp": 1.02652454, + "epoch": 0.9197053960619269, + "flos": 18332828847360.0, + "grad_norm": 2.7904407072038344, + "language_loss": 0.75209701, + "learning_rate": 6.720239494390912e-08, + "loss": 0.77364695, + "num_input_tokens_seen": 329992095, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11730957, + "step": 15297, + "time_per_iteration": 2.4264345169067383 + }, + { + "auxiliary_loss_clip": 0.01114216, + "auxiliary_loss_mlp": 0.01025477, + "balance_loss_clip": 1.04192054, + "balance_loss_mlp": 1.01338339, + "epoch": 0.9197655193145949, + "flos": 28183448511360.0, + "grad_norm": 2.4223279597857044, + "language_loss": 0.73796999, + "learning_rate": 6.710232148647676e-08, + "loss": 0.75936687, + "num_input_tokens_seen": 330011490, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.12103271, + "step": 15298, + "time_per_iteration": 2.5795023441314697 + }, + { + "auxiliary_loss_clip": 0.01121278, + "auxiliary_loss_mlp": 0.01030998, + "balance_loss_clip": 1.04891908, + "balance_loss_mlp": 1.01910663, + "epoch": 0.9198256425672628, + "flos": 17306321973120.0, + "grad_norm": 2.0110757625998965, + "language_loss": 0.79383916, + "learning_rate": 6.70023213247175e-08, + "loss": 0.81536186, + "num_input_tokens_seen": 330027885, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11895752, + "step": 15299, + "time_per_iteration": 2.4665558338165283 + }, + { + "auxiliary_loss_clip": 0.01114972, + "auxiliary_loss_mlp": 0.01025816, + "balance_loss_clip": 1.04244077, + "balance_loss_mlp": 1.01480675, + "epoch": 0.9198857658199309, + "flos": 17858520731520.0, + "grad_norm": 1.9689766532324493, + "language_loss": 0.64171374, + "learning_rate": 6.690239446242385e-08, + "loss": 0.66312158, + "num_input_tokens_seen": 330046230, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11010742, + "step": 15300, + "time_per_iteration": 2.4182722568511963 + }, + { + "auxiliary_loss_clip": 0.01110426, + "auxiliary_loss_mlp": 0.0102791, + "balance_loss_clip": 1.04206872, + "balance_loss_mlp": 1.01826012, + "epoch": 0.9199458890725988, + "flos": 22127545169280.0, + "grad_norm": 1.835817979862791, + "language_loss": 0.69496226, + "learning_rate": 6.680254090338545e-08, + "loss": 0.71634567, + "num_input_tokens_seen": 330065535, + "router_z_loss_clip": 0.68408203, + "router_z_loss_mlp": 0.09649658, + "step": 15301, + "time_per_iteration": 2.451773166656494 + }, + { + "auxiliary_loss_clip": 0.01126919, + "auxiliary_loss_mlp": 0.01033906, + "balance_loss_clip": 1.05046618, + "balance_loss_mlp": 1.0212698, + "epoch": 0.9200060123252668, + "flos": 16034043265920.0, + "grad_norm": 2.3264221324995993, + "language_loss": 0.71336555, + "learning_rate": 6.670276065138814e-08, + "loss": 0.73497379, + "num_input_tokens_seen": 330082920, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.12634277, + "step": 15302, + "time_per_iteration": 2.5120346546173096 + }, + { + "auxiliary_loss_clip": 0.01125745, + "auxiliary_loss_mlp": 0.01033636, + "balance_loss_clip": 1.05136323, + "balance_loss_mlp": 1.02203727, + "epoch": 0.9200661355779348, + "flos": 26864521015680.0, + "grad_norm": 1.7772855185038168, + "language_loss": 0.76912338, + "learning_rate": 6.660305371021579e-08, + "loss": 0.79071718, + "num_input_tokens_seen": 330101165, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.1159668, + "step": 15303, + "time_per_iteration": 2.490402936935425 + }, + { + "auxiliary_loss_clip": 0.01118482, + "auxiliary_loss_mlp": 0.01028698, + "balance_loss_clip": 1.04814005, + "balance_loss_mlp": 1.0176537, + "epoch": 0.9201262588306027, + "flos": 12786749193600.0, + "grad_norm": 2.7719304892708876, + "language_loss": 0.87550175, + "learning_rate": 6.650342008365006e-08, + "loss": 0.89697349, + "num_input_tokens_seen": 330118775, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.1104126, + "step": 15304, + "time_per_iteration": 2.448392868041992 + }, + { + "auxiliary_loss_clip": 0.0112443, + "auxiliary_loss_mlp": 0.01034992, + "balance_loss_clip": 1.04956722, + "balance_loss_mlp": 1.02069271, + "epoch": 0.9201863820832707, + "flos": 20631614428800.0, + "grad_norm": 1.904809956426782, + "language_loss": 0.77419484, + "learning_rate": 6.64038597754677e-08, + "loss": 0.795789, + "num_input_tokens_seen": 330135570, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.1428833, + "step": 15305, + "time_per_iteration": 2.4326891899108887 + }, + { + "auxiliary_loss_clip": 0.01118559, + "auxiliary_loss_mlp": 0.01034539, + "balance_loss_clip": 1.04669523, + "balance_loss_mlp": 1.02263594, + "epoch": 0.9202465053359387, + "flos": 26395815421440.0, + "grad_norm": 1.9949652921649872, + "language_loss": 0.81031954, + "learning_rate": 6.630437278944501e-08, + "loss": 0.83185053, + "num_input_tokens_seen": 330152840, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11889648, + "step": 15306, + "time_per_iteration": 2.4948651790618896 + }, + { + "auxiliary_loss_clip": 0.01122196, + "auxiliary_loss_mlp": 0.0103299, + "balance_loss_clip": 1.05208957, + "balance_loss_mlp": 1.02213025, + "epoch": 0.9203066285886067, + "flos": 10488179093760.0, + "grad_norm": 2.014973019521936, + "language_loss": 0.72522056, + "learning_rate": 6.62049591293541e-08, + "loss": 0.74677247, + "num_input_tokens_seen": 330168605, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.10858154, + "step": 15307, + "time_per_iteration": 3.9486873149871826 + }, + { + "auxiliary_loss_clip": 0.01110295, + "auxiliary_loss_mlp": 0.01034007, + "balance_loss_clip": 1.03711224, + "balance_loss_mlp": 1.02061999, + "epoch": 0.9203667518412746, + "flos": 19390721230080.0, + "grad_norm": 1.7997951392207714, + "language_loss": 0.7890358, + "learning_rate": 6.610561879896526e-08, + "loss": 0.81047881, + "num_input_tokens_seen": 330186160, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.13409424, + "step": 15308, + "time_per_iteration": 2.454188585281372 + }, + { + "auxiliary_loss_clip": 0.01108891, + "auxiliary_loss_mlp": 0.01028495, + "balance_loss_clip": 1.03824079, + "balance_loss_mlp": 1.01662803, + "epoch": 0.9204268750939426, + "flos": 15924982596480.0, + "grad_norm": 2.136317252610953, + "language_loss": 0.77813065, + "learning_rate": 6.600635180204484e-08, + "loss": 0.79950446, + "num_input_tokens_seen": 330201780, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.11877441, + "step": 15309, + "time_per_iteration": 2.415113687515259 + }, + { + "auxiliary_loss_clip": 0.01110799, + "auxiliary_loss_mlp": 0.01028031, + "balance_loss_clip": 1.03848314, + "balance_loss_mlp": 1.0158118, + "epoch": 0.9204869983466105, + "flos": 16471758401280.0, + "grad_norm": 2.15652139733791, + "language_loss": 0.66249096, + "learning_rate": 6.590715814235781e-08, + "loss": 0.68387932, + "num_input_tokens_seen": 330219165, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.12225342, + "step": 15310, + "time_per_iteration": 2.4528088569641113 + }, + { + "auxiliary_loss_clip": 0.0111326, + "auxiliary_loss_mlp": 0.01029831, + "balance_loss_clip": 1.03781652, + "balance_loss_mlp": 1.01802349, + "epoch": 0.9205471215992785, + "flos": 21539220307200.0, + "grad_norm": 1.6641970932216428, + "language_loss": 0.66317517, + "learning_rate": 6.580803782366495e-08, + "loss": 0.68460608, + "num_input_tokens_seen": 330238975, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.11798096, + "step": 15311, + "time_per_iteration": 2.5176525115966797 + }, + { + "auxiliary_loss_clip": 0.01114129, + "auxiliary_loss_mlp": 0.01039707, + "balance_loss_clip": 1.04137778, + "balance_loss_mlp": 1.02621841, + "epoch": 0.9206072448519464, + "flos": 25005892694400.0, + "grad_norm": 1.69236653083451, + "language_loss": 0.76191366, + "learning_rate": 6.570899084972503e-08, + "loss": 0.78345197, + "num_input_tokens_seen": 330259755, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.13494873, + "step": 15312, + "time_per_iteration": 2.50364351272583 + }, + { + "auxiliary_loss_clip": 0.01111245, + "auxiliary_loss_mlp": 0.01039403, + "balance_loss_clip": 1.04216266, + "balance_loss_mlp": 1.02744675, + "epoch": 0.9206673681046145, + "flos": 20522661500160.0, + "grad_norm": 1.7536654608932887, + "language_loss": 0.7937749, + "learning_rate": 6.561001722429394e-08, + "loss": 0.81528139, + "num_input_tokens_seen": 330277660, + "router_z_loss_clip": 0.69042969, + "router_z_loss_mlp": 0.11950684, + "step": 15313, + "time_per_iteration": 2.450819492340088 + }, + { + "auxiliary_loss_clip": 0.01114274, + "auxiliary_loss_mlp": 0.01033823, + "balance_loss_clip": 1.04044473, + "balance_loss_mlp": 1.02173555, + "epoch": 0.9207274913572824, + "flos": 20883455660160.0, + "grad_norm": 1.7901579717703313, + "language_loss": 0.78387636, + "learning_rate": 6.55111169511251e-08, + "loss": 0.80535734, + "num_input_tokens_seen": 330295455, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.12084961, + "step": 15314, + "time_per_iteration": 2.5464491844177246 + }, + { + "auxiliary_loss_clip": 0.01120353, + "auxiliary_loss_mlp": 0.01032332, + "balance_loss_clip": 1.04401088, + "balance_loss_mlp": 1.01880753, + "epoch": 0.9207876146099504, + "flos": 22708256348160.0, + "grad_norm": 3.53938912026744, + "language_loss": 0.79078496, + "learning_rate": 6.541229003396864e-08, + "loss": 0.81231177, + "num_input_tokens_seen": 330315310, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.13525391, + "step": 15315, + "time_per_iteration": 2.448601484298706 + }, + { + "auxiliary_loss_clip": 0.01124605, + "auxiliary_loss_mlp": 0.01033107, + "balance_loss_clip": 1.04618096, + "balance_loss_mlp": 1.02043521, + "epoch": 0.9208477378626184, + "flos": 18507354053760.0, + "grad_norm": 1.8789777255491396, + "language_loss": 0.76264429, + "learning_rate": 6.531353647657156e-08, + "loss": 0.78422141, + "num_input_tokens_seen": 330333260, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.12677002, + "step": 15316, + "time_per_iteration": 2.48240327835083 + }, + { + "auxiliary_loss_clip": 0.01111633, + "auxiliary_loss_mlp": 0.01031063, + "balance_loss_clip": 1.03847122, + "balance_loss_mlp": 1.01857615, + "epoch": 0.9209078611152863, + "flos": 22999635475200.0, + "grad_norm": 1.6841920026335564, + "language_loss": 0.69238585, + "learning_rate": 6.521485628267931e-08, + "loss": 0.71381283, + "num_input_tokens_seen": 330352465, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.12469482, + "step": 15317, + "time_per_iteration": 2.4227280616760254 + }, + { + "auxiliary_loss_clip": 0.01112944, + "auxiliary_loss_mlp": 0.01027309, + "balance_loss_clip": 1.0417484, + "balance_loss_mlp": 1.01543558, + "epoch": 0.9209679843679544, + "flos": 24061514267520.0, + "grad_norm": 1.7760836203284143, + "language_loss": 0.83585203, + "learning_rate": 6.511624945603378e-08, + "loss": 0.85725451, + "num_input_tokens_seen": 330372685, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.11883545, + "step": 15318, + "time_per_iteration": 2.480560541152954 + }, + { + "auxiliary_loss_clip": 0.01108031, + "auxiliary_loss_mlp": 0.01031294, + "balance_loss_clip": 1.03682077, + "balance_loss_mlp": 1.01939082, + "epoch": 0.9210281076206223, + "flos": 13553370190080.0, + "grad_norm": 1.953220749532169, + "language_loss": 0.85950583, + "learning_rate": 6.501771600037354e-08, + "loss": 0.88089907, + "num_input_tokens_seen": 330388860, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.11914062, + "step": 15319, + "time_per_iteration": 2.415778636932373 + }, + { + "auxiliary_loss_clip": 0.01045042, + "auxiliary_loss_mlp": 0.01002, + "balance_loss_clip": 1.01988268, + "balance_loss_mlp": 1.00067043, + "epoch": 0.9210882308732903, + "flos": 71426289674880.0, + "grad_norm": 0.7768889228110072, + "language_loss": 0.56174874, + "learning_rate": 6.491925591943559e-08, + "loss": 0.58221918, + "num_input_tokens_seen": 330448735, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.01329041, + "step": 15320, + "time_per_iteration": 3.1187856197357178 + }, + { + "auxiliary_loss_clip": 0.01111762, + "auxiliary_loss_mlp": 0.01032909, + "balance_loss_clip": 1.03681016, + "balance_loss_mlp": 1.01981974, + "epoch": 0.9211483541259582, + "flos": 18509113820160.0, + "grad_norm": 2.2567421201610363, + "language_loss": 0.63800651, + "learning_rate": 6.482086921695384e-08, + "loss": 0.65945327, + "num_input_tokens_seen": 330465600, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.13092041, + "step": 15321, + "time_per_iteration": 2.4245786666870117 + }, + { + "auxiliary_loss_clip": 0.01111078, + "auxiliary_loss_mlp": 0.01026993, + "balance_loss_clip": 1.04198074, + "balance_loss_mlp": 1.01593029, + "epoch": 0.9212084773786262, + "flos": 23258228463360.0, + "grad_norm": 1.5089523715370603, + "language_loss": 0.71854901, + "learning_rate": 6.47225558966582e-08, + "loss": 0.7399298, + "num_input_tokens_seen": 330485770, + "router_z_loss_clip": 0.69091797, + "router_z_loss_mlp": 0.11065674, + "step": 15322, + "time_per_iteration": 2.5030031204223633 + }, + { + "auxiliary_loss_clip": 0.01107465, + "auxiliary_loss_mlp": 0.01030746, + "balance_loss_clip": 1.03657126, + "balance_loss_mlp": 1.01948047, + "epoch": 0.9212686006312941, + "flos": 16289511770880.0, + "grad_norm": 3.268889282806992, + "language_loss": 0.69896865, + "learning_rate": 6.462431596227725e-08, + "loss": 0.72035074, + "num_input_tokens_seen": 330504255, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.1126709, + "step": 15323, + "time_per_iteration": 2.4305002689361572 + }, + { + "auxiliary_loss_clip": 0.01112078, + "auxiliary_loss_mlp": 0.01034951, + "balance_loss_clip": 1.03662932, + "balance_loss_mlp": 1.02139103, + "epoch": 0.9213287238839621, + "flos": 19785773986560.0, + "grad_norm": 1.7958452852681088, + "language_loss": 0.7483651, + "learning_rate": 6.452614941753597e-08, + "loss": 0.76983535, + "num_input_tokens_seen": 330520705, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.13568115, + "step": 15324, + "time_per_iteration": 3.8649849891662598 + }, + { + "auxiliary_loss_clip": 0.01115484, + "auxiliary_loss_mlp": 0.01044851, + "balance_loss_clip": 1.04222202, + "balance_loss_mlp": 1.03102279, + "epoch": 0.92138884713663, + "flos": 21030402199680.0, + "grad_norm": 1.8899632558602466, + "language_loss": 0.71219301, + "learning_rate": 6.442805626615744e-08, + "loss": 0.7337963, + "num_input_tokens_seen": 330539245, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.1383667, + "step": 15325, + "time_per_iteration": 2.4799904823303223 + }, + { + "auxiliary_loss_clip": 0.01124056, + "auxiliary_loss_mlp": 0.01033521, + "balance_loss_clip": 1.04771364, + "balance_loss_mlp": 1.0217793, + "epoch": 0.9214489703892981, + "flos": 28587264186240.0, + "grad_norm": 1.6343008697430923, + "language_loss": 0.77957511, + "learning_rate": 6.433003651186109e-08, + "loss": 0.80115086, + "num_input_tokens_seen": 330561815, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.11743164, + "step": 15326, + "time_per_iteration": 2.494062662124634 + }, + { + "auxiliary_loss_clip": 0.01115197, + "auxiliary_loss_mlp": 0.01037654, + "balance_loss_clip": 1.04070854, + "balance_loss_mlp": 1.02528596, + "epoch": 0.921509093641966, + "flos": 16361476669440.0, + "grad_norm": 2.3203922011573983, + "language_loss": 0.71365052, + "learning_rate": 6.42320901583635e-08, + "loss": 0.73517901, + "num_input_tokens_seen": 330579760, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.1237793, + "step": 15327, + "time_per_iteration": 2.5156452655792236 + }, + { + "auxiliary_loss_clip": 0.01119687, + "auxiliary_loss_mlp": 0.01036557, + "balance_loss_clip": 1.04394281, + "balance_loss_mlp": 1.02357531, + "epoch": 0.921569216894634, + "flos": 26830837036800.0, + "grad_norm": 17.065022079784423, + "language_loss": 0.78129041, + "learning_rate": 6.413421720937906e-08, + "loss": 0.80285287, + "num_input_tokens_seen": 330598545, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12982178, + "step": 15328, + "time_per_iteration": 3.9216084480285645 + }, + { + "auxiliary_loss_clip": 0.01114344, + "auxiliary_loss_mlp": 0.01027538, + "balance_loss_clip": 1.04386234, + "balance_loss_mlp": 1.01624334, + "epoch": 0.921629340147302, + "flos": 24645134448000.0, + "grad_norm": 2.284320343363452, + "language_loss": 0.70891339, + "learning_rate": 6.4036417668619e-08, + "loss": 0.7303322, + "num_input_tokens_seen": 330616700, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.11297607, + "step": 15329, + "time_per_iteration": 2.486624240875244 + }, + { + "auxiliary_loss_clip": 0.0110997, + "auxiliary_loss_mlp": 0.01024121, + "balance_loss_clip": 1.03849626, + "balance_loss_mlp": 1.01343369, + "epoch": 0.9216894633999699, + "flos": 15086504442240.0, + "grad_norm": 1.86076783082363, + "language_loss": 0.86569417, + "learning_rate": 6.393869153979192e-08, + "loss": 0.88703507, + "num_input_tokens_seen": 330633355, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.10693359, + "step": 15330, + "time_per_iteration": 2.394029140472412 + }, + { + "auxiliary_loss_clip": 0.01122133, + "auxiliary_loss_mlp": 0.01027517, + "balance_loss_clip": 1.04666615, + "balance_loss_mlp": 1.01606143, + "epoch": 0.921749586652638, + "flos": 19204524103680.0, + "grad_norm": 2.2916163876181637, + "language_loss": 0.76529384, + "learning_rate": 6.384103882660397e-08, + "loss": 0.78679031, + "num_input_tokens_seen": 330651470, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.11444092, + "step": 15331, + "time_per_iteration": 2.4303221702575684 + }, + { + "auxiliary_loss_clip": 0.01116296, + "auxiliary_loss_mlp": 0.0102622, + "balance_loss_clip": 1.04423833, + "balance_loss_mlp": 1.01519871, + "epoch": 0.9218097099053059, + "flos": 20522446018560.0, + "grad_norm": 3.0334463056166094, + "language_loss": 0.7522887, + "learning_rate": 6.374345953275794e-08, + "loss": 0.77371389, + "num_input_tokens_seen": 330669170, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11016846, + "step": 15332, + "time_per_iteration": 2.43572998046875 + }, + { + "auxiliary_loss_clip": 0.01112486, + "auxiliary_loss_mlp": 0.01029087, + "balance_loss_clip": 1.04167342, + "balance_loss_mlp": 1.01841784, + "epoch": 0.9218698331579739, + "flos": 17348625216000.0, + "grad_norm": 1.812121242299169, + "language_loss": 0.74896014, + "learning_rate": 6.364595366195358e-08, + "loss": 0.77037585, + "num_input_tokens_seen": 330686635, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.10687256, + "step": 15333, + "time_per_iteration": 2.461435556411743 + }, + { + "auxiliary_loss_clip": 0.01043054, + "auxiliary_loss_mlp": 0.01003133, + "balance_loss_clip": 1.01733255, + "balance_loss_mlp": 1.00177276, + "epoch": 0.9219299564106418, + "flos": 61958332575360.0, + "grad_norm": 0.8070641363597513, + "language_loss": 0.52898335, + "learning_rate": 6.354852121788879e-08, + "loss": 0.54944527, + "num_input_tokens_seen": 330749160, + "router_z_loss_clip": 0.25683594, + "router_z_loss_mlp": 0.01361084, + "step": 15334, + "time_per_iteration": 3.0907185077667236 + }, + { + "auxiliary_loss_clip": 0.01114099, + "auxiliary_loss_mlp": 0.01030771, + "balance_loss_clip": 1.04518926, + "balance_loss_mlp": 1.0198698, + "epoch": 0.9219900796633098, + "flos": 15701761526400.0, + "grad_norm": 1.9089917132758518, + "language_loss": 0.6241883, + "learning_rate": 6.345116220425839e-08, + "loss": 0.64563704, + "num_input_tokens_seen": 330766840, + "router_z_loss_clip": 0.68847656, + "router_z_loss_mlp": 0.10906982, + "step": 15335, + "time_per_iteration": 2.4254226684570312 + }, + { + "auxiliary_loss_clip": 0.01120987, + "auxiliary_loss_mlp": 0.01031197, + "balance_loss_clip": 1.04954183, + "balance_loss_mlp": 1.01910293, + "epoch": 0.9220502029159777, + "flos": 24932670819840.0, + "grad_norm": 1.7017343342950226, + "language_loss": 0.71535897, + "learning_rate": 6.335387662475366e-08, + "loss": 0.73688078, + "num_input_tokens_seen": 330785585, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.12084961, + "step": 15336, + "time_per_iteration": 3.974557638168335 + }, + { + "auxiliary_loss_clip": 0.01116754, + "auxiliary_loss_mlp": 0.01029912, + "balance_loss_clip": 1.04614544, + "balance_loss_mlp": 1.01933229, + "epoch": 0.9221103261686457, + "flos": 15667215621120.0, + "grad_norm": 1.9264714831887713, + "language_loss": 0.7185477, + "learning_rate": 6.325666448306433e-08, + "loss": 0.74001443, + "num_input_tokens_seen": 330800750, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.105896, + "step": 15337, + "time_per_iteration": 2.394601821899414 + }, + { + "auxiliary_loss_clip": 0.01043585, + "auxiliary_loss_mlp": 0.01003061, + "balance_loss_clip": 1.01886129, + "balance_loss_mlp": 1.00173759, + "epoch": 0.9221704494213137, + "flos": 67516299630720.0, + "grad_norm": 0.8831481909063673, + "language_loss": 0.65301096, + "learning_rate": 6.31595257828763e-08, + "loss": 0.67347741, + "num_input_tokens_seen": 330863640, + "router_z_loss_clip": 0.24682617, + "router_z_loss_mlp": 0.01322937, + "step": 15338, + "time_per_iteration": 3.061556577682495 + }, + { + "auxiliary_loss_clip": 0.01116803, + "auxiliary_loss_mlp": 0.01028756, + "balance_loss_clip": 1.04393625, + "balance_loss_mlp": 1.01673341, + "epoch": 0.9222305726739817, + "flos": 30226945155840.0, + "grad_norm": 1.6925886287092475, + "language_loss": 0.6740014, + "learning_rate": 6.306246052787289e-08, + "loss": 0.69545698, + "num_input_tokens_seen": 330884675, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.12011719, + "step": 15339, + "time_per_iteration": 2.506989002227783 + }, + { + "auxiliary_loss_clip": 0.01111375, + "auxiliary_loss_mlp": 0.01031552, + "balance_loss_clip": 1.04041266, + "balance_loss_mlp": 1.01990509, + "epoch": 0.9222906959266496, + "flos": 25337204766720.0, + "grad_norm": 2.04217074583718, + "language_loss": 0.71808666, + "learning_rate": 6.296546872173513e-08, + "loss": 0.7395159, + "num_input_tokens_seen": 330904125, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11657715, + "step": 15340, + "time_per_iteration": 2.467910051345825 + }, + { + "auxiliary_loss_clip": 0.01112867, + "auxiliary_loss_mlp": 0.01029037, + "balance_loss_clip": 1.04273188, + "balance_loss_mlp": 1.01781344, + "epoch": 0.9223508191793176, + "flos": 27599864244480.0, + "grad_norm": 1.5329634597114947, + "language_loss": 0.70180392, + "learning_rate": 6.286855036814098e-08, + "loss": 0.72322297, + "num_input_tokens_seen": 330925140, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.11224365, + "step": 15341, + "time_per_iteration": 2.4966509342193604 + }, + { + "auxiliary_loss_clip": 0.01107567, + "auxiliary_loss_mlp": 0.01027157, + "balance_loss_clip": 1.04055119, + "balance_loss_mlp": 1.01670802, + "epoch": 0.9224109424319856, + "flos": 27307587277440.0, + "grad_norm": 1.5976097407182348, + "language_loss": 0.67347318, + "learning_rate": 6.277170547076571e-08, + "loss": 0.6948204, + "num_input_tokens_seen": 330946625, + "router_z_loss_clip": 0.66992188, + "router_z_loss_mlp": 0.10449219, + "step": 15342, + "time_per_iteration": 2.489957332611084 + }, + { + "auxiliary_loss_clip": 0.01116155, + "auxiliary_loss_mlp": 0.0103363, + "balance_loss_clip": 1.04472017, + "balance_loss_mlp": 1.02279973, + "epoch": 0.9224710656846535, + "flos": 48208314401280.0, + "grad_norm": 1.9270255620073151, + "language_loss": 0.69820106, + "learning_rate": 6.26749340332815e-08, + "loss": 0.71969891, + "num_input_tokens_seen": 330967795, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.10827637, + "step": 15343, + "time_per_iteration": 2.6649599075317383 + }, + { + "auxiliary_loss_clip": 0.0103328, + "auxiliary_loss_mlp": 0.01000411, + "balance_loss_clip": 1.00918984, + "balance_loss_mlp": 0.9991073, + "epoch": 0.9225311889373216, + "flos": 66722171794560.0, + "grad_norm": 0.7342952334592157, + "language_loss": 0.51989257, + "learning_rate": 6.257823605935786e-08, + "loss": 0.54022956, + "num_input_tokens_seen": 331040850, + "router_z_loss_clip": 0.24047852, + "router_z_loss_mlp": 0.01304626, + "step": 15344, + "time_per_iteration": 3.235145092010498 + }, + { + "auxiliary_loss_clip": 0.01112247, + "auxiliary_loss_mlp": 0.01029785, + "balance_loss_clip": 1.04527164, + "balance_loss_mlp": 1.01910353, + "epoch": 0.9225913121899895, + "flos": 22271295398400.0, + "grad_norm": 1.6288275378571384, + "language_loss": 0.71073103, + "learning_rate": 6.248161155266162e-08, + "loss": 0.73215139, + "num_input_tokens_seen": 331060595, + "router_z_loss_clip": 0.66894531, + "router_z_loss_mlp": 0.10681152, + "step": 15345, + "time_per_iteration": 2.4911508560180664 + }, + { + "auxiliary_loss_clip": 0.01110182, + "auxiliary_loss_mlp": 0.01036527, + "balance_loss_clip": 1.03814578, + "balance_loss_mlp": 1.02448082, + "epoch": 0.9226514354426575, + "flos": 20082719721600.0, + "grad_norm": 1.8455173944246654, + "language_loss": 0.7719872, + "learning_rate": 6.238506051685677e-08, + "loss": 0.79345435, + "num_input_tokens_seen": 331080195, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.1204834, + "step": 15346, + "time_per_iteration": 2.4648404121398926 + }, + { + "auxiliary_loss_clip": 0.01114361, + "auxiliary_loss_mlp": 0.01036493, + "balance_loss_clip": 1.03963208, + "balance_loss_mlp": 1.02408886, + "epoch": 0.9227115586953254, + "flos": 16070851728000.0, + "grad_norm": 2.0646300255634853, + "language_loss": 0.75761729, + "learning_rate": 6.228858295560457e-08, + "loss": 0.77912581, + "num_input_tokens_seen": 331097645, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.12402344, + "step": 15347, + "time_per_iteration": 2.4642858505249023 + }, + { + "auxiliary_loss_clip": 0.01103428, + "auxiliary_loss_mlp": 0.01029272, + "balance_loss_clip": 1.03631115, + "balance_loss_mlp": 1.01863265, + "epoch": 0.9227716819479934, + "flos": 20446027833600.0, + "grad_norm": 1.488637498621779, + "language_loss": 0.76870924, + "learning_rate": 6.219217887256367e-08, + "loss": 0.7900362, + "num_input_tokens_seen": 331116830, + "router_z_loss_clip": 0.67138672, + "router_z_loss_mlp": 0.10638428, + "step": 15348, + "time_per_iteration": 2.6055195331573486 + }, + { + "auxiliary_loss_clip": 0.01112845, + "auxiliary_loss_mlp": 0.01028789, + "balance_loss_clip": 1.04009414, + "balance_loss_mlp": 1.01649237, + "epoch": 0.9228318052006613, + "flos": 25007401065600.0, + "grad_norm": 1.9258991557716698, + "language_loss": 0.67823935, + "learning_rate": 6.209584827138959e-08, + "loss": 0.69965571, + "num_input_tokens_seen": 331137235, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.1229248, + "step": 15349, + "time_per_iteration": 2.5070862770080566 + }, + { + "auxiliary_loss_clip": 0.01107156, + "auxiliary_loss_mlp": 0.01025955, + "balance_loss_clip": 1.03528833, + "balance_loss_mlp": 1.01451087, + "epoch": 0.9228919284533293, + "flos": 12677257560960.0, + "grad_norm": 2.4862002684305393, + "language_loss": 0.86989868, + "learning_rate": 6.199959115573495e-08, + "loss": 0.89122975, + "num_input_tokens_seen": 331153155, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11437988, + "step": 15350, + "time_per_iteration": 2.448733329772949 + }, + { + "auxiliary_loss_clip": 0.01031703, + "auxiliary_loss_mlp": 0.0099968, + "balance_loss_clip": 1.00763822, + "balance_loss_mlp": 0.99844611, + "epoch": 0.9229520517059973, + "flos": 69986162712960.0, + "grad_norm": 0.7731608338639999, + "language_loss": 0.60355759, + "learning_rate": 6.190340752924994e-08, + "loss": 0.62387139, + "num_input_tokens_seen": 331214895, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.0123291, + "step": 15351, + "time_per_iteration": 3.17617130279541 + }, + { + "auxiliary_loss_clip": 0.01112076, + "auxiliary_loss_mlp": 0.01025588, + "balance_loss_clip": 1.03908026, + "balance_loss_mlp": 1.01451969, + "epoch": 0.9230121749586653, + "flos": 14793832425600.0, + "grad_norm": 2.084678454284269, + "language_loss": 0.7755689, + "learning_rate": 6.180729739558233e-08, + "loss": 0.79694557, + "num_input_tokens_seen": 331232185, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11071777, + "step": 15352, + "time_per_iteration": 3.863351345062256 + }, + { + "auxiliary_loss_clip": 0.01117835, + "auxiliary_loss_mlp": 0.01048095, + "balance_loss_clip": 1.0423398, + "balance_loss_mlp": 1.03384376, + "epoch": 0.9230722982113332, + "flos": 22967208472320.0, + "grad_norm": 1.9600473564269543, + "language_loss": 0.59990561, + "learning_rate": 6.171126075837585e-08, + "loss": 0.62156487, + "num_input_tokens_seen": 331251065, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.14251709, + "step": 15353, + "time_per_iteration": 2.481494903564453 + }, + { + "auxiliary_loss_clip": 0.01115415, + "auxiliary_loss_mlp": 0.01028539, + "balance_loss_clip": 1.04514301, + "balance_loss_mlp": 1.01744032, + "epoch": 0.9231324214640012, + "flos": 18551452976640.0, + "grad_norm": 1.6195985778364672, + "language_loss": 0.74785578, + "learning_rate": 6.161529762127293e-08, + "loss": 0.76929533, + "num_input_tokens_seen": 331269110, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.11102295, + "step": 15354, + "time_per_iteration": 2.443397283554077 + }, + { + "auxiliary_loss_clip": 0.01122506, + "auxiliary_loss_mlp": 0.01030481, + "balance_loss_clip": 1.04663396, + "balance_loss_mlp": 1.01859546, + "epoch": 0.9231925447166691, + "flos": 22082727974400.0, + "grad_norm": 4.328613983396666, + "language_loss": 0.65190756, + "learning_rate": 6.1519407987912e-08, + "loss": 0.67343742, + "num_input_tokens_seen": 331286555, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.11889648, + "step": 15355, + "time_per_iteration": 2.4306914806365967 + }, + { + "auxiliary_loss_clip": 0.01116335, + "auxiliary_loss_mlp": 0.01028216, + "balance_loss_clip": 1.04734635, + "balance_loss_mlp": 1.01777959, + "epoch": 0.9232526679693371, + "flos": 26541145848960.0, + "grad_norm": 1.5540723887998575, + "language_loss": 0.74110729, + "learning_rate": 6.142359186192947e-08, + "loss": 0.76255274, + "num_input_tokens_seen": 331307660, + "router_z_loss_clip": 0.69042969, + "router_z_loss_mlp": 0.10437012, + "step": 15356, + "time_per_iteration": 2.481451988220215 + }, + { + "auxiliary_loss_clip": 0.01109623, + "auxiliary_loss_mlp": 0.01031195, + "balance_loss_clip": 1.03750134, + "balance_loss_mlp": 1.01912475, + "epoch": 0.9233127912220052, + "flos": 14756664827520.0, + "grad_norm": 2.543087160908738, + "language_loss": 0.61544538, + "learning_rate": 6.132784924695844e-08, + "loss": 0.63685358, + "num_input_tokens_seen": 331324885, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.12078857, + "step": 15357, + "time_per_iteration": 2.454188108444214 + }, + { + "auxiliary_loss_clip": 0.0111446, + "auxiliary_loss_mlp": 0.01030344, + "balance_loss_clip": 1.03778899, + "balance_loss_mlp": 1.01768959, + "epoch": 0.9233729144746731, + "flos": 25261792162560.0, + "grad_norm": 1.3957140051807702, + "language_loss": 0.70116085, + "learning_rate": 6.123218014662956e-08, + "loss": 0.72260892, + "num_input_tokens_seen": 331345885, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.12646484, + "step": 15358, + "time_per_iteration": 2.4559977054595947 + }, + { + "auxiliary_loss_clip": 0.01114806, + "auxiliary_loss_mlp": 0.01029016, + "balance_loss_clip": 1.04247844, + "balance_loss_mlp": 1.01769745, + "epoch": 0.9234330377273411, + "flos": 27849837968640.0, + "grad_norm": 1.9226726675927548, + "language_loss": 0.72998869, + "learning_rate": 6.113658456457104e-08, + "loss": 0.75142694, + "num_input_tokens_seen": 331364320, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11328125, + "step": 15359, + "time_per_iteration": 2.5035338401794434 + }, + { + "auxiliary_loss_clip": 0.01121033, + "auxiliary_loss_mlp": 0.01028381, + "balance_loss_clip": 1.04600167, + "balance_loss_mlp": 1.01708603, + "epoch": 0.923493160980009, + "flos": 24608361899520.0, + "grad_norm": 1.8700989195685531, + "language_loss": 0.6471318, + "learning_rate": 6.104106250440732e-08, + "loss": 0.66862589, + "num_input_tokens_seen": 331384135, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.11291504, + "step": 15360, + "time_per_iteration": 2.507230520248413 + }, + { + "auxiliary_loss_clip": 0.01046184, + "auxiliary_loss_mlp": 0.01004168, + "balance_loss_clip": 1.02143669, + "balance_loss_mlp": 1.00291014, + "epoch": 0.923553284232677, + "flos": 67700916558720.0, + "grad_norm": 0.8000404853001842, + "language_loss": 0.55071586, + "learning_rate": 6.094561396976083e-08, + "loss": 0.57121933, + "num_input_tokens_seen": 331440645, + "router_z_loss_clip": 0.24755859, + "router_z_loss_mlp": 0.01257324, + "step": 15361, + "time_per_iteration": 3.0276176929473877 + }, + { + "auxiliary_loss_clip": 0.01124756, + "auxiliary_loss_mlp": 0.01028684, + "balance_loss_clip": 1.04942107, + "balance_loss_mlp": 1.01635814, + "epoch": 0.9236134074853449, + "flos": 18807244704000.0, + "grad_norm": 2.155309140903002, + "language_loss": 0.69804907, + "learning_rate": 6.085023896425112e-08, + "loss": 0.71958345, + "num_input_tokens_seen": 331459580, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.12329102, + "step": 15362, + "time_per_iteration": 2.443408250808716 + }, + { + "auxiliary_loss_clip": 0.01117369, + "auxiliary_loss_mlp": 0.01028233, + "balance_loss_clip": 1.04236031, + "balance_loss_mlp": 1.01479816, + "epoch": 0.923673530738013, + "flos": 27782362270080.0, + "grad_norm": 1.4963952750666338, + "language_loss": 0.75874126, + "learning_rate": 6.075493749149463e-08, + "loss": 0.78019726, + "num_input_tokens_seen": 331481560, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.13433838, + "step": 15363, + "time_per_iteration": 2.5765726566314697 + }, + { + "auxiliary_loss_clip": 0.01128982, + "auxiliary_loss_mlp": 0.01027863, + "balance_loss_clip": 1.05365896, + "balance_loss_mlp": 1.01595449, + "epoch": 0.9237336539906809, + "flos": 26797117144320.0, + "grad_norm": 2.0979230972985756, + "language_loss": 0.83546054, + "learning_rate": 6.065970955510514e-08, + "loss": 0.85702896, + "num_input_tokens_seen": 331499090, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11907959, + "step": 15364, + "time_per_iteration": 2.476858377456665 + }, + { + "auxiliary_loss_clip": 0.01112221, + "auxiliary_loss_mlp": 0.01024511, + "balance_loss_clip": 1.04164147, + "balance_loss_mlp": 1.01380634, + "epoch": 0.9237937772433489, + "flos": 23587708942080.0, + "grad_norm": 2.2119137907938993, + "language_loss": 0.68240857, + "learning_rate": 6.056455515869419e-08, + "loss": 0.70377588, + "num_input_tokens_seen": 331519420, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.10705566, + "step": 15365, + "time_per_iteration": 2.455033540725708 + }, + { + "auxiliary_loss_clip": 0.01112761, + "auxiliary_loss_mlp": 0.01035214, + "balance_loss_clip": 1.04111397, + "balance_loss_mlp": 1.02168953, + "epoch": 0.9238539004960168, + "flos": 26140562398080.0, + "grad_norm": 2.088326909186843, + "language_loss": 0.63354218, + "learning_rate": 6.046947430586913e-08, + "loss": 0.65502191, + "num_input_tokens_seen": 331538720, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.13525391, + "step": 15366, + "time_per_iteration": 2.4759085178375244 + }, + { + "auxiliary_loss_clip": 0.01117293, + "auxiliary_loss_mlp": 0.01028318, + "balance_loss_clip": 1.04545951, + "balance_loss_mlp": 1.01606369, + "epoch": 0.9239140237486848, + "flos": 21068000760960.0, + "grad_norm": 1.47664297598205, + "language_loss": 0.74434447, + "learning_rate": 6.037446700023619e-08, + "loss": 0.76580054, + "num_input_tokens_seen": 331558505, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.12249756, + "step": 15367, + "time_per_iteration": 3.8937718868255615 + }, + { + "auxiliary_loss_clip": 0.01109119, + "auxiliary_loss_mlp": 0.01032117, + "balance_loss_clip": 1.04060447, + "balance_loss_mlp": 1.02113819, + "epoch": 0.9239741470013527, + "flos": 24607930936320.0, + "grad_norm": 1.7246905914131563, + "language_loss": 0.64698803, + "learning_rate": 6.027953324539759e-08, + "loss": 0.66840041, + "num_input_tokens_seen": 331578440, + "router_z_loss_clip": 0.68457031, + "router_z_loss_mlp": 0.10974121, + "step": 15368, + "time_per_iteration": 2.4915382862091064 + }, + { + "auxiliary_loss_clip": 0.01123311, + "auxiliary_loss_mlp": 0.01029516, + "balance_loss_clip": 1.04724002, + "balance_loss_mlp": 1.01743412, + "epoch": 0.9240342702540207, + "flos": 24718248581760.0, + "grad_norm": 2.1047804472160805, + "language_loss": 0.74603105, + "learning_rate": 6.018467304495401e-08, + "loss": 0.76755929, + "num_input_tokens_seen": 331598945, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.12072754, + "step": 15369, + "time_per_iteration": 2.458343982696533 + }, + { + "auxiliary_loss_clip": 0.01125759, + "auxiliary_loss_mlp": 0.01033539, + "balance_loss_clip": 1.04878211, + "balance_loss_mlp": 1.02044427, + "epoch": 0.9240943935066888, + "flos": 20849987162880.0, + "grad_norm": 2.3935510952120067, + "language_loss": 0.76444709, + "learning_rate": 6.008988640250145e-08, + "loss": 0.78604007, + "num_input_tokens_seen": 331616700, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.13104248, + "step": 15370, + "time_per_iteration": 3.899261951446533 + }, + { + "auxiliary_loss_clip": 0.01113163, + "auxiliary_loss_mlp": 0.01031464, + "balance_loss_clip": 1.04159188, + "balance_loss_mlp": 1.0199666, + "epoch": 0.9241545167593567, + "flos": 24462313200000.0, + "grad_norm": 2.092730171952161, + "language_loss": 0.67092836, + "learning_rate": 5.999517332163528e-08, + "loss": 0.69237471, + "num_input_tokens_seen": 331635625, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11499023, + "step": 15371, + "time_per_iteration": 2.4966089725494385 + }, + { + "auxiliary_loss_clip": 0.01056311, + "auxiliary_loss_mlp": 0.01005587, + "balance_loss_clip": 1.03268707, + "balance_loss_mlp": 1.00427151, + "epoch": 0.9242146400120247, + "flos": 61827259847040.0, + "grad_norm": 0.9888847191317036, + "language_loss": 0.57630682, + "learning_rate": 5.99005338059464e-08, + "loss": 0.59692574, + "num_input_tokens_seen": 331698595, + "router_z_loss_clip": 0.23583984, + "router_z_loss_mlp": 0.01315308, + "step": 15372, + "time_per_iteration": 3.008849620819092 + }, + { + "auxiliary_loss_clip": 0.01118422, + "auxiliary_loss_mlp": 0.01035189, + "balance_loss_clip": 1.04666138, + "balance_loss_mlp": 1.02434731, + "epoch": 0.9242747632646926, + "flos": 22048397550720.0, + "grad_norm": 1.9808017076315454, + "language_loss": 0.69977736, + "learning_rate": 5.98059678590237e-08, + "loss": 0.72131348, + "num_input_tokens_seen": 331717975, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.10839844, + "step": 15373, + "time_per_iteration": 2.490583896636963 + }, + { + "auxiliary_loss_clip": 0.01112742, + "auxiliary_loss_mlp": 0.0103684, + "balance_loss_clip": 1.04183316, + "balance_loss_mlp": 1.02547359, + "epoch": 0.9243348865173606, + "flos": 18478338842880.0, + "grad_norm": 2.2757776382433064, + "language_loss": 0.75107187, + "learning_rate": 5.971147548445299e-08, + "loss": 0.77256763, + "num_input_tokens_seen": 331737220, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.11352539, + "step": 15374, + "time_per_iteration": 2.428067445755005 + }, + { + "auxiliary_loss_clip": 0.01112807, + "auxiliary_loss_mlp": 0.01030022, + "balance_loss_clip": 1.04123545, + "balance_loss_mlp": 1.01906717, + "epoch": 0.9243950097700285, + "flos": 23258767167360.0, + "grad_norm": 1.962975274433352, + "language_loss": 0.64784843, + "learning_rate": 5.961705668581784e-08, + "loss": 0.66927671, + "num_input_tokens_seen": 331757300, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.10955811, + "step": 15375, + "time_per_iteration": 2.6160497665405273 + }, + { + "auxiliary_loss_clip": 0.01110781, + "auxiliary_loss_mlp": 0.01032982, + "balance_loss_clip": 1.03974152, + "balance_loss_mlp": 1.02100146, + "epoch": 0.9244551330226966, + "flos": 29749081593600.0, + "grad_norm": 1.9593926366153471, + "language_loss": 0.66192323, + "learning_rate": 5.952271146669829e-08, + "loss": 0.68336087, + "num_input_tokens_seen": 331776995, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.11987305, + "step": 15376, + "time_per_iteration": 2.54399037361145 + }, + { + "auxiliary_loss_clip": 0.01048259, + "auxiliary_loss_mlp": 0.01002704, + "balance_loss_clip": 1.02355993, + "balance_loss_mlp": 1.00140572, + "epoch": 0.9245152562753645, + "flos": 68864960609280.0, + "grad_norm": 0.6505082543408947, + "language_loss": 0.61101675, + "learning_rate": 5.94284398306717e-08, + "loss": 0.63152641, + "num_input_tokens_seen": 331845015, + "router_z_loss_clip": 0.24682617, + "router_z_loss_mlp": 0.01296997, + "step": 15377, + "time_per_iteration": 3.1670639514923096 + }, + { + "auxiliary_loss_clip": 0.0110504, + "auxiliary_loss_mlp": 0.01034914, + "balance_loss_clip": 1.03427005, + "balance_loss_mlp": 1.02330923, + "epoch": 0.9245753795280325, + "flos": 21579260993280.0, + "grad_norm": 1.646967535934354, + "language_loss": 0.73816383, + "learning_rate": 5.933424178131341e-08, + "loss": 0.75956339, + "num_input_tokens_seen": 331862795, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11602783, + "step": 15378, + "time_per_iteration": 2.4471583366394043 + }, + { + "auxiliary_loss_clip": 0.01114319, + "auxiliary_loss_mlp": 0.01030816, + "balance_loss_clip": 1.04049134, + "balance_loss_mlp": 1.01863885, + "epoch": 0.9246355027807004, + "flos": 34496077334400.0, + "grad_norm": 1.937144763427924, + "language_loss": 0.62296271, + "learning_rate": 5.924011732219503e-08, + "loss": 0.64441413, + "num_input_tokens_seen": 331882535, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.1217041, + "step": 15379, + "time_per_iteration": 3.987487316131592 + }, + { + "auxiliary_loss_clip": 0.01113853, + "auxiliary_loss_mlp": 0.01025707, + "balance_loss_clip": 1.04306245, + "balance_loss_mlp": 1.01427436, + "epoch": 0.9246956260333684, + "flos": 15953854152960.0, + "grad_norm": 1.8722295856084343, + "language_loss": 0.8384096, + "learning_rate": 5.914606645688591e-08, + "loss": 0.85980523, + "num_input_tokens_seen": 331899335, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11437988, + "step": 15380, + "time_per_iteration": 2.414358615875244 + }, + { + "auxiliary_loss_clip": 0.01117351, + "auxiliary_loss_mlp": 0.01029854, + "balance_loss_clip": 1.04254806, + "balance_loss_mlp": 1.01749825, + "epoch": 0.9247557492860363, + "flos": 23368366540800.0, + "grad_norm": 1.9072484300731143, + "language_loss": 0.73523086, + "learning_rate": 5.905208918895233e-08, + "loss": 0.7567029, + "num_input_tokens_seen": 331919030, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.12341309, + "step": 15381, + "time_per_iteration": 2.442511796951294 + }, + { + "auxiliary_loss_clip": 0.01105789, + "auxiliary_loss_mlp": 0.01030492, + "balance_loss_clip": 1.03562784, + "balance_loss_mlp": 1.01958406, + "epoch": 0.9248158725387043, + "flos": 23039855729280.0, + "grad_norm": 5.406769937289633, + "language_loss": 0.78589344, + "learning_rate": 5.8958185521958524e-08, + "loss": 0.80725622, + "num_input_tokens_seen": 331936465, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.10900879, + "step": 15382, + "time_per_iteration": 2.5539586544036865 + }, + { + "auxiliary_loss_clip": 0.01106488, + "auxiliary_loss_mlp": 0.01033222, + "balance_loss_clip": 1.03419042, + "balance_loss_mlp": 1.02115798, + "epoch": 0.9248759957913724, + "flos": 22522418357760.0, + "grad_norm": 1.721148022887284, + "language_loss": 0.74999213, + "learning_rate": 5.886435545946455e-08, + "loss": 0.77138925, + "num_input_tokens_seen": 331954625, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.12054443, + "step": 15383, + "time_per_iteration": 2.5358402729034424 + }, + { + "auxiliary_loss_clip": 0.01108413, + "auxiliary_loss_mlp": 0.01025891, + "balance_loss_clip": 1.03765965, + "balance_loss_mlp": 1.01494205, + "epoch": 0.9249361190440403, + "flos": 25447271016960.0, + "grad_norm": 1.5546263142955539, + "language_loss": 0.75435722, + "learning_rate": 5.8770599005028456e-08, + "loss": 0.77570027, + "num_input_tokens_seen": 331975865, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.10943604, + "step": 15384, + "time_per_iteration": 2.5078041553497314 + }, + { + "auxiliary_loss_clip": 0.01117316, + "auxiliary_loss_mlp": 0.01033298, + "balance_loss_clip": 1.04648376, + "balance_loss_mlp": 1.02143669, + "epoch": 0.9249962422967083, + "flos": 12378623886720.0, + "grad_norm": 1.9306852329320023, + "language_loss": 0.66302794, + "learning_rate": 5.8676916162206045e-08, + "loss": 0.68453413, + "num_input_tokens_seen": 331992760, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11853027, + "step": 15385, + "time_per_iteration": 2.463545560836792 + }, + { + "auxiliary_loss_clip": 0.01120345, + "auxiliary_loss_mlp": 0.01031833, + "balance_loss_clip": 1.04796171, + "balance_loss_mlp": 1.02063918, + "epoch": 0.9250563655493762, + "flos": 22929430343040.0, + "grad_norm": 1.7786951073256465, + "language_loss": 0.80475026, + "learning_rate": 5.85833069345496e-08, + "loss": 0.82627207, + "num_input_tokens_seen": 332011890, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11193848, + "step": 15386, + "time_per_iteration": 2.486264705657959 + }, + { + "auxiliary_loss_clip": 0.0111462, + "auxiliary_loss_mlp": 0.01037571, + "balance_loss_clip": 1.04519415, + "balance_loss_mlp": 1.02597177, + "epoch": 0.9251164888020442, + "flos": 18478662065280.0, + "grad_norm": 1.9928616597197317, + "language_loss": 0.75403821, + "learning_rate": 5.8489771325608504e-08, + "loss": 0.77556014, + "num_input_tokens_seen": 332029485, + "router_z_loss_clip": 0.69335938, + "router_z_loss_mlp": 0.1159668, + "step": 15387, + "time_per_iteration": 2.475525379180908 + }, + { + "auxiliary_loss_clip": 0.01107881, + "auxiliary_loss_mlp": 0.01029923, + "balance_loss_clip": 1.03804278, + "balance_loss_mlp": 1.0191468, + "epoch": 0.9251766120547121, + "flos": 33037062796800.0, + "grad_norm": 1.3437491035759925, + "language_loss": 0.70140362, + "learning_rate": 5.839630933893014e-08, + "loss": 0.72278166, + "num_input_tokens_seen": 332052970, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.10778809, + "step": 15388, + "time_per_iteration": 2.556262731552124 + }, + { + "auxiliary_loss_clip": 0.01115523, + "auxiliary_loss_mlp": 0.01027356, + "balance_loss_clip": 1.04122722, + "balance_loss_mlp": 1.015733, + "epoch": 0.9252367353073802, + "flos": 24387906176640.0, + "grad_norm": 1.6651881531583665, + "language_loss": 0.81984365, + "learning_rate": 5.8302920978058115e-08, + "loss": 0.84127235, + "num_input_tokens_seen": 332070395, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.11633301, + "step": 15389, + "time_per_iteration": 2.4667105674743652 + }, + { + "auxiliary_loss_clip": 0.01129484, + "auxiliary_loss_mlp": 0.01033536, + "balance_loss_clip": 1.04946375, + "balance_loss_mlp": 1.02057815, + "epoch": 0.9252968585600481, + "flos": 18916844077440.0, + "grad_norm": 1.6646505659007798, + "language_loss": 0.78998327, + "learning_rate": 5.820960624653381e-08, + "loss": 0.81161356, + "num_input_tokens_seen": 332090185, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.12976074, + "step": 15390, + "time_per_iteration": 2.4543707370758057 + }, + { + "auxiliary_loss_clip": 0.0112912, + "auxiliary_loss_mlp": 0.01035949, + "balance_loss_clip": 1.05163741, + "balance_loss_mlp": 1.02329493, + "epoch": 0.9253569818127161, + "flos": 21725345606400.0, + "grad_norm": 1.695897408799537, + "language_loss": 0.75246453, + "learning_rate": 5.811636514789597e-08, + "loss": 0.7741152, + "num_input_tokens_seen": 332109050, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12658691, + "step": 15391, + "time_per_iteration": 2.4491045475006104 + }, + { + "auxiliary_loss_clip": 0.01112384, + "auxiliary_loss_mlp": 0.01031485, + "balance_loss_clip": 1.04007411, + "balance_loss_mlp": 1.0186398, + "epoch": 0.925417105065384, + "flos": 34240357434240.0, + "grad_norm": 25.230336380235077, + "language_loss": 0.52609694, + "learning_rate": 5.80231976856802e-08, + "loss": 0.54753566, + "num_input_tokens_seen": 332131180, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.12835693, + "step": 15392, + "time_per_iteration": 2.5836007595062256 + }, + { + "auxiliary_loss_clip": 0.01107942, + "auxiliary_loss_mlp": 0.01027359, + "balance_loss_clip": 1.03515518, + "balance_loss_mlp": 1.01593912, + "epoch": 0.925477228318052, + "flos": 25959536830080.0, + "grad_norm": 1.7160837074280102, + "language_loss": 0.77078372, + "learning_rate": 5.7930103863419454e-08, + "loss": 0.79213673, + "num_input_tokens_seen": 332149555, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11413574, + "step": 15393, + "time_per_iteration": 2.494832754135132 + }, + { + "auxiliary_loss_clip": 0.01112706, + "auxiliary_loss_mlp": 0.01030976, + "balance_loss_clip": 1.04237723, + "balance_loss_mlp": 1.01940095, + "epoch": 0.9255373515707199, + "flos": 11838240702720.0, + "grad_norm": 1.8220989207597884, + "language_loss": 0.6925211, + "learning_rate": 5.783708368464357e-08, + "loss": 0.71395797, + "num_input_tokens_seen": 332165830, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.11566162, + "step": 15394, + "time_per_iteration": 2.462780237197876 + }, + { + "auxiliary_loss_clip": 0.01113802, + "auxiliary_loss_mlp": 0.01028016, + "balance_loss_clip": 1.04018068, + "balance_loss_mlp": 1.01597607, + "epoch": 0.925597474823388, + "flos": 21434325615360.0, + "grad_norm": 1.6604379977143753, + "language_loss": 0.72808385, + "learning_rate": 5.7744137152879956e-08, + "loss": 0.749502, + "num_input_tokens_seen": 332185130, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.12036133, + "step": 15395, + "time_per_iteration": 3.8616299629211426 + }, + { + "auxiliary_loss_clip": 0.01113508, + "auxiliary_loss_mlp": 0.01027655, + "balance_loss_clip": 1.04390967, + "balance_loss_mlp": 1.01724803, + "epoch": 0.925657598076056, + "flos": 22857573185280.0, + "grad_norm": 2.0437249334415295, + "language_loss": 0.71570253, + "learning_rate": 5.7651264271653785e-08, + "loss": 0.73711413, + "num_input_tokens_seen": 332203695, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.10406494, + "step": 15396, + "time_per_iteration": 2.471973419189453 + }, + { + "auxiliary_loss_clip": 0.01109994, + "auxiliary_loss_mlp": 0.01028795, + "balance_loss_clip": 1.04004693, + "balance_loss_mlp": 1.01705337, + "epoch": 0.9257177213287239, + "flos": 25704032411520.0, + "grad_norm": 1.6652711787185663, + "language_loss": 0.87176818, + "learning_rate": 5.755846504448603e-08, + "loss": 0.89315605, + "num_input_tokens_seen": 332224850, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.11737061, + "step": 15397, + "time_per_iteration": 2.611518383026123 + }, + { + "auxiliary_loss_clip": 0.01040403, + "auxiliary_loss_mlp": 0.01003635, + "balance_loss_clip": 1.01554942, + "balance_loss_mlp": 1.0022831, + "epoch": 0.9257778445813919, + "flos": 59592933221760.0, + "grad_norm": 0.8064931403154729, + "language_loss": 0.55168825, + "learning_rate": 5.746573947489586e-08, + "loss": 0.57212859, + "num_input_tokens_seen": 332278085, + "router_z_loss_clip": 0.24829102, + "router_z_loss_mlp": 0.01350403, + "step": 15398, + "time_per_iteration": 2.962545394897461 + }, + { + "auxiliary_loss_clip": 0.01116207, + "auxiliary_loss_mlp": 0.01032982, + "balance_loss_clip": 1.03929043, + "balance_loss_mlp": 1.01940453, + "epoch": 0.9258379678340598, + "flos": 27709427704320.0, + "grad_norm": 1.8799045663226512, + "language_loss": 0.75925517, + "learning_rate": 5.7373087566400025e-08, + "loss": 0.78074712, + "num_input_tokens_seen": 332297875, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.13580322, + "step": 15399, + "time_per_iteration": 2.4755678176879883 + }, + { + "auxiliary_loss_clip": 0.01106749, + "auxiliary_loss_mlp": 0.01026839, + "balance_loss_clip": 1.03812218, + "balance_loss_mlp": 1.01633716, + "epoch": 0.9258980910867278, + "flos": 24863543095680.0, + "grad_norm": 1.7426316323694517, + "language_loss": 0.78212988, + "learning_rate": 5.7280509322510826e-08, + "loss": 0.80346584, + "num_input_tokens_seen": 332318500, + "router_z_loss_clip": 0.68652344, + "router_z_loss_mlp": 0.1050415, + "step": 15400, + "time_per_iteration": 2.5173590183258057 + }, + { + "auxiliary_loss_clip": 0.01035794, + "auxiliary_loss_mlp": 0.01001324, + "balance_loss_clip": 1.01137638, + "balance_loss_mlp": 0.99999201, + "epoch": 0.9259582143393957, + "flos": 63134587249920.0, + "grad_norm": 0.7266555544367584, + "language_loss": 0.51324546, + "learning_rate": 5.718800474673946e-08, + "loss": 0.53361666, + "num_input_tokens_seen": 332381980, + "router_z_loss_clip": 0.24462891, + "router_z_loss_mlp": 0.01332092, + "step": 15401, + "time_per_iteration": 3.0398104190826416 + }, + { + "auxiliary_loss_clip": 0.01113723, + "auxiliary_loss_mlp": 0.01030045, + "balance_loss_clip": 1.04451561, + "balance_loss_mlp": 1.01900625, + "epoch": 0.9260183375920638, + "flos": 24127122458880.0, + "grad_norm": 1.630005451877698, + "language_loss": 0.82120371, + "learning_rate": 5.709557384259378e-08, + "loss": 0.84264135, + "num_input_tokens_seen": 332399510, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 0.11029053, + "step": 15402, + "time_per_iteration": 2.4473252296447754 + }, + { + "auxiliary_loss_clip": 0.01050311, + "auxiliary_loss_mlp": 0.01003382, + "balance_loss_clip": 1.0260818, + "balance_loss_mlp": 1.00216961, + "epoch": 0.9260784608447317, + "flos": 63042872849280.0, + "grad_norm": 0.7324431496628464, + "language_loss": 0.51045173, + "learning_rate": 5.700321661357876e-08, + "loss": 0.53098863, + "num_input_tokens_seen": 332459130, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.01213074, + "step": 15403, + "time_per_iteration": 3.154452085494995 + }, + { + "auxiliary_loss_clip": 0.01040878, + "auxiliary_loss_mlp": 0.01004693, + "balance_loss_clip": 1.01562285, + "balance_loss_mlp": 1.00326371, + "epoch": 0.9261385840973997, + "flos": 70585979927040.0, + "grad_norm": 0.6827990242637686, + "language_loss": 0.58734715, + "learning_rate": 5.69109330631965e-08, + "loss": 0.60780287, + "num_input_tokens_seen": 332526555, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.01428223, + "step": 15404, + "time_per_iteration": 3.073364734649658 + }, + { + "auxiliary_loss_clip": 0.01114123, + "auxiliary_loss_mlp": 0.01033697, + "balance_loss_clip": 1.0411036, + "balance_loss_mlp": 1.0214479, + "epoch": 0.9261987073500676, + "flos": 20229917656320.0, + "grad_norm": 2.4999444735396916, + "language_loss": 0.71651149, + "learning_rate": 5.681872319494596e-08, + "loss": 0.73798972, + "num_input_tokens_seen": 332544005, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.12255859, + "step": 15405, + "time_per_iteration": 2.479001760482788 + }, + { + "auxiliary_loss_clip": 0.01121434, + "auxiliary_loss_mlp": 0.01032658, + "balance_loss_clip": 1.04814196, + "balance_loss_mlp": 1.02012336, + "epoch": 0.9262588306027356, + "flos": 20954163582720.0, + "grad_norm": 1.8398237412416907, + "language_loss": 0.68624532, + "learning_rate": 5.672658701232458e-08, + "loss": 0.70778626, + "num_input_tokens_seen": 332563070, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.12542725, + "step": 15406, + "time_per_iteration": 2.433481454849243 + }, + { + "auxiliary_loss_clip": 0.01113276, + "auxiliary_loss_mlp": 0.01036715, + "balance_loss_clip": 1.04087663, + "balance_loss_mlp": 1.02410841, + "epoch": 0.9263189538554035, + "flos": 22158679282560.0, + "grad_norm": 9.257377511202865, + "language_loss": 0.7684195, + "learning_rate": 5.663452451882555e-08, + "loss": 0.78991944, + "num_input_tokens_seen": 332579620, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.12615967, + "step": 15407, + "time_per_iteration": 2.437953472137451 + }, + { + "auxiliary_loss_clip": 0.01117598, + "auxiliary_loss_mlp": 0.01037037, + "balance_loss_clip": 1.03985715, + "balance_loss_mlp": 1.02419782, + "epoch": 0.9263790771080715, + "flos": 18187211111040.0, + "grad_norm": 1.9848907627901988, + "language_loss": 0.72908384, + "learning_rate": 5.6542535717940096e-08, + "loss": 0.75063014, + "num_input_tokens_seen": 332597795, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.12841797, + "step": 15408, + "time_per_iteration": 2.4577465057373047 + }, + { + "auxiliary_loss_clip": 0.01114626, + "auxiliary_loss_mlp": 0.01026739, + "balance_loss_clip": 1.04512703, + "balance_loss_mlp": 1.0164814, + "epoch": 0.9264392003607396, + "flos": 48178545004800.0, + "grad_norm": 1.70421702612214, + "language_loss": 0.68276727, + "learning_rate": 5.645062061315675e-08, + "loss": 0.7041809, + "num_input_tokens_seen": 332620375, + "router_z_loss_clip": 0.69482422, + "router_z_loss_mlp": 0.10247803, + "step": 15409, + "time_per_iteration": 2.6792678833007812 + }, + { + "auxiliary_loss_clip": 0.01113771, + "auxiliary_loss_mlp": 0.01031649, + "balance_loss_clip": 1.04101205, + "balance_loss_mlp": 1.01957321, + "epoch": 0.9264993236134075, + "flos": 26389458714240.0, + "grad_norm": 2.9413553238850363, + "language_loss": 0.75534415, + "learning_rate": 5.6358779207960506e-08, + "loss": 0.77679837, + "num_input_tokens_seen": 332639510, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.1206665, + "step": 15410, + "time_per_iteration": 3.89420223236084 + }, + { + "auxiliary_loss_clip": 0.01114043, + "auxiliary_loss_mlp": 0.01027651, + "balance_loss_clip": 1.04148436, + "balance_loss_mlp": 1.01636815, + "epoch": 0.9265594468660755, + "flos": 20920084554240.0, + "grad_norm": 1.5604883771080929, + "language_loss": 0.82136154, + "learning_rate": 5.6267011505833905e-08, + "loss": 0.84277844, + "num_input_tokens_seen": 332658350, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11279297, + "step": 15411, + "time_per_iteration": 2.4368324279785156 + }, + { + "auxiliary_loss_clip": 0.01115899, + "auxiliary_loss_mlp": 0.01033142, + "balance_loss_clip": 1.04389405, + "balance_loss_mlp": 1.02178776, + "epoch": 0.9266195701187434, + "flos": 17525017929600.0, + "grad_norm": 1.9743569860843078, + "language_loss": 0.75457245, + "learning_rate": 5.617531751025728e-08, + "loss": 0.77606291, + "num_input_tokens_seen": 332676715, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.11358643, + "step": 15412, + "time_per_iteration": 2.4688196182250977 + }, + { + "auxiliary_loss_clip": 0.01114324, + "auxiliary_loss_mlp": 0.01026605, + "balance_loss_clip": 1.04221201, + "balance_loss_mlp": 1.01463652, + "epoch": 0.9266796933714114, + "flos": 33688733293440.0, + "grad_norm": 2.1224514223409123, + "language_loss": 0.67205739, + "learning_rate": 5.6083697224707406e-08, + "loss": 0.69346672, + "num_input_tokens_seen": 332701470, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11956787, + "step": 15413, + "time_per_iteration": 2.536252498626709 + }, + { + "auxiliary_loss_clip": 0.0111284, + "auxiliary_loss_mlp": 0.01033885, + "balance_loss_clip": 1.03862786, + "balance_loss_mlp": 1.02164769, + "epoch": 0.9267398166240793, + "flos": 18916520855040.0, + "grad_norm": 1.9424236782713502, + "language_loss": 0.76081526, + "learning_rate": 5.5992150652658167e-08, + "loss": 0.78228253, + "num_input_tokens_seen": 332719060, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.12231445, + "step": 15414, + "time_per_iteration": 3.888140916824341 + }, + { + "auxiliary_loss_clip": 0.01114376, + "auxiliary_loss_mlp": 0.01027893, + "balance_loss_clip": 1.04398775, + "balance_loss_mlp": 1.01675844, + "epoch": 0.9267999398767474, + "flos": 20478957626880.0, + "grad_norm": 2.578040591883895, + "language_loss": 0.81301093, + "learning_rate": 5.59006777975819e-08, + "loss": 0.83443367, + "num_input_tokens_seen": 332736345, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.11126709, + "step": 15415, + "time_per_iteration": 2.435452699661255 + }, + { + "auxiliary_loss_clip": 0.0111474, + "auxiliary_loss_mlp": 0.01033165, + "balance_loss_clip": 1.04260445, + "balance_loss_mlp": 1.02161384, + "epoch": 0.9268600631294153, + "flos": 24789351553920.0, + "grad_norm": 2.296291457160062, + "language_loss": 0.54208714, + "learning_rate": 5.580927866294671e-08, + "loss": 0.56356621, + "num_input_tokens_seen": 332756270, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11547852, + "step": 15416, + "time_per_iteration": 2.5469868183135986 + }, + { + "auxiliary_loss_clip": 0.01109557, + "auxiliary_loss_mlp": 0.01040557, + "balance_loss_clip": 1.03912294, + "balance_loss_mlp": 1.0271461, + "epoch": 0.9269201863820833, + "flos": 18697178453760.0, + "grad_norm": 1.4415493859315343, + "language_loss": 0.71866572, + "learning_rate": 5.571795325221807e-08, + "loss": 0.74016684, + "num_input_tokens_seen": 332775185, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.13427734, + "step": 15417, + "time_per_iteration": 2.4201438426971436 + }, + { + "auxiliary_loss_clip": 0.01113422, + "auxiliary_loss_mlp": 0.01030808, + "balance_loss_clip": 1.04271626, + "balance_loss_mlp": 1.01873827, + "epoch": 0.9269803096347512, + "flos": 20923999136640.0, + "grad_norm": 2.0142704753511564, + "language_loss": 0.7596091, + "learning_rate": 5.5626701568859624e-08, + "loss": 0.7810514, + "num_input_tokens_seen": 332794320, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.12084961, + "step": 15418, + "time_per_iteration": 2.4924745559692383 + }, + { + "auxiliary_loss_clip": 0.01104867, + "auxiliary_loss_mlp": 0.01029736, + "balance_loss_clip": 1.03473878, + "balance_loss_mlp": 1.01631284, + "epoch": 0.9270404328874192, + "flos": 28002710252160.0, + "grad_norm": 1.4946875681731717, + "language_loss": 0.76345897, + "learning_rate": 5.553552361633174e-08, + "loss": 0.784805, + "num_input_tokens_seen": 332818095, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.13409424, + "step": 15419, + "time_per_iteration": 2.526557207107544 + }, + { + "auxiliary_loss_clip": 0.01102775, + "auxiliary_loss_mlp": 0.01027681, + "balance_loss_clip": 1.03526068, + "balance_loss_mlp": 1.01694024, + "epoch": 0.9271005561400871, + "flos": 25889870401920.0, + "grad_norm": 1.8715135742202882, + "language_loss": 0.76327753, + "learning_rate": 5.5444419398091636e-08, + "loss": 0.78458208, + "num_input_tokens_seen": 332839860, + "router_z_loss_clip": 0.67431641, + "router_z_loss_mlp": 0.10742188, + "step": 15420, + "time_per_iteration": 2.5389044284820557 + }, + { + "auxiliary_loss_clip": 0.01123162, + "auxiliary_loss_mlp": 0.01035251, + "balance_loss_clip": 1.04522312, + "balance_loss_mlp": 1.0229013, + "epoch": 0.9271606793927551, + "flos": 27053914452480.0, + "grad_norm": 1.718765938944448, + "language_loss": 0.7692889, + "learning_rate": 5.535338891759389e-08, + "loss": 0.79087299, + "num_input_tokens_seen": 332861155, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.12365723, + "step": 15421, + "time_per_iteration": 2.4861576557159424 + }, + { + "auxiliary_loss_clip": 0.01106893, + "auxiliary_loss_mlp": 0.01027594, + "balance_loss_clip": 1.03606129, + "balance_loss_mlp": 1.01594758, + "epoch": 0.9272208026454232, + "flos": 26209869690240.0, + "grad_norm": 1.9771829966580101, + "language_loss": 0.72651041, + "learning_rate": 5.526243217829041e-08, + "loss": 0.74785531, + "num_input_tokens_seen": 332881110, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.11633301, + "step": 15422, + "time_per_iteration": 2.5213046073913574 + }, + { + "auxiliary_loss_clip": 0.01119108, + "auxiliary_loss_mlp": 0.01038391, + "balance_loss_clip": 1.0458709, + "balance_loss_mlp": 1.02620196, + "epoch": 0.9272809258980911, + "flos": 12458453863680.0, + "grad_norm": 1.9931489498377484, + "language_loss": 0.77218533, + "learning_rate": 5.517154918363065e-08, + "loss": 0.7937603, + "num_input_tokens_seen": 332899350, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.12188721, + "step": 15423, + "time_per_iteration": 3.834799289703369 + }, + { + "auxiliary_loss_clip": 0.01116112, + "auxiliary_loss_mlp": 0.01030504, + "balance_loss_clip": 1.04334903, + "balance_loss_mlp": 1.01866686, + "epoch": 0.9273410491507591, + "flos": 22856890826880.0, + "grad_norm": 1.826039285135585, + "language_loss": 0.75304508, + "learning_rate": 5.508073993706053e-08, + "loss": 0.77451122, + "num_input_tokens_seen": 332918105, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.1184082, + "step": 15424, + "time_per_iteration": 2.441309690475464 + }, + { + "auxiliary_loss_clip": 0.01031727, + "auxiliary_loss_mlp": 0.0100028, + "balance_loss_clip": 1.00763321, + "balance_loss_mlp": 0.99898034, + "epoch": 0.927401172403427, + "flos": 47665384329600.0, + "grad_norm": 0.7776469833500353, + "language_loss": 0.60675532, + "learning_rate": 5.499000444202351e-08, + "loss": 0.62707537, + "num_input_tokens_seen": 332969490, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.01300049, + "step": 15425, + "time_per_iteration": 2.8867998123168945 + }, + { + "auxiliary_loss_clip": 0.01114672, + "auxiliary_loss_mlp": 0.01028593, + "balance_loss_clip": 1.04292727, + "balance_loss_mlp": 1.0169642, + "epoch": 0.927461295656095, + "flos": 29972374490880.0, + "grad_norm": 1.7640603813414164, + "language_loss": 0.70491219, + "learning_rate": 5.489934270196106e-08, + "loss": 0.72634482, + "num_input_tokens_seen": 332988805, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11633301, + "step": 15426, + "time_per_iteration": 2.510930299758911 + }, + { + "auxiliary_loss_clip": 0.01110207, + "auxiliary_loss_mlp": 0.01026481, + "balance_loss_clip": 1.0383153, + "balance_loss_mlp": 1.01534057, + "epoch": 0.9275214189087629, + "flos": 20375427651840.0, + "grad_norm": 1.8970320540019656, + "language_loss": 0.82959521, + "learning_rate": 5.480875472030977e-08, + "loss": 0.8509621, + "num_input_tokens_seen": 333007960, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11132812, + "step": 15427, + "time_per_iteration": 2.4754021167755127 + }, + { + "auxiliary_loss_clip": 0.01120155, + "auxiliary_loss_mlp": 0.01032076, + "balance_loss_clip": 1.04710364, + "balance_loss_mlp": 1.02076912, + "epoch": 0.927581542161431, + "flos": 22383193242240.0, + "grad_norm": 2.079371932361592, + "language_loss": 0.77012897, + "learning_rate": 5.471824050050555e-08, + "loss": 0.79165131, + "num_input_tokens_seen": 333026035, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11309814, + "step": 15428, + "time_per_iteration": 2.4713823795318604 + }, + { + "auxiliary_loss_clip": 0.01113935, + "auxiliary_loss_mlp": 0.01027913, + "balance_loss_clip": 1.04315853, + "balance_loss_mlp": 1.0164988, + "epoch": 0.9276416654140989, + "flos": 23952453598080.0, + "grad_norm": 2.167498521289618, + "language_loss": 0.74617988, + "learning_rate": 5.4627800045980555e-08, + "loss": 0.76759839, + "num_input_tokens_seen": 333045590, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.11419678, + "step": 15429, + "time_per_iteration": 2.4768874645233154 + }, + { + "auxiliary_loss_clip": 0.01110383, + "auxiliary_loss_mlp": 0.01032996, + "balance_loss_clip": 1.04041731, + "balance_loss_mlp": 1.02213573, + "epoch": 0.9277017886667669, + "flos": 13917719796480.0, + "grad_norm": 2.004633164593324, + "language_loss": 0.74774015, + "learning_rate": 5.45374333601647e-08, + "loss": 0.76917386, + "num_input_tokens_seen": 333063355, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.10870361, + "step": 15430, + "time_per_iteration": 2.4018466472625732 + }, + { + "auxiliary_loss_clip": 0.0110823, + "auxiliary_loss_mlp": 0.01030952, + "balance_loss_clip": 1.03676903, + "balance_loss_mlp": 1.01854801, + "epoch": 0.9277619119194348, + "flos": 35666478092160.0, + "grad_norm": 1.383621696004785, + "language_loss": 0.76666629, + "learning_rate": 5.444714044648391e-08, + "loss": 0.7880581, + "num_input_tokens_seen": 333088045, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.12414551, + "step": 15431, + "time_per_iteration": 2.618443012237549 + }, + { + "auxiliary_loss_clip": 0.01108466, + "auxiliary_loss_mlp": 0.01031477, + "balance_loss_clip": 1.03800392, + "balance_loss_mlp": 1.01983023, + "epoch": 0.9278220351721028, + "flos": 23841238112640.0, + "grad_norm": 1.8477395745093477, + "language_loss": 0.70904982, + "learning_rate": 5.4356921308363e-08, + "loss": 0.73044932, + "num_input_tokens_seen": 333108005, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.11639404, + "step": 15432, + "time_per_iteration": 2.5934834480285645 + }, + { + "auxiliary_loss_clip": 0.01111076, + "auxiliary_loss_mlp": 0.01032479, + "balance_loss_clip": 1.03652358, + "balance_loss_mlp": 1.02051067, + "epoch": 0.9278821584247707, + "flos": 15228135768960.0, + "grad_norm": 2.770628304630394, + "language_loss": 0.82115716, + "learning_rate": 5.4266775949222354e-08, + "loss": 0.84259272, + "num_input_tokens_seen": 333124335, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11968994, + "step": 15433, + "time_per_iteration": 2.4703590869903564 + }, + { + "auxiliary_loss_clip": 0.01116176, + "auxiliary_loss_mlp": 0.01032069, + "balance_loss_clip": 1.04691982, + "balance_loss_mlp": 1.02142358, + "epoch": 0.9279422816774388, + "flos": 24681404206080.0, + "grad_norm": 1.9111266763875563, + "language_loss": 0.66681141, + "learning_rate": 5.417670437248056e-08, + "loss": 0.68829381, + "num_input_tokens_seen": 333143995, + "router_z_loss_clip": 0.69238281, + "router_z_loss_mlp": 0.10638428, + "step": 15434, + "time_per_iteration": 2.478238821029663 + }, + { + "auxiliary_loss_clip": 0.0111366, + "auxiliary_loss_mlp": 0.01027486, + "balance_loss_clip": 1.04685664, + "balance_loss_mlp": 1.01657295, + "epoch": 0.9280024049301068, + "flos": 19169188099200.0, + "grad_norm": 2.082265576762006, + "language_loss": 0.69148576, + "learning_rate": 5.40867065815529e-08, + "loss": 0.71289724, + "num_input_tokens_seen": 333162805, + "router_z_loss_clip": 0.66748047, + "router_z_loss_mlp": 0.10919189, + "step": 15435, + "time_per_iteration": 2.4917030334472656 + }, + { + "auxiliary_loss_clip": 0.0110902, + "auxiliary_loss_mlp": 0.01027508, + "balance_loss_clip": 1.03598905, + "balance_loss_mlp": 1.01549125, + "epoch": 0.9280625281827747, + "flos": 11393701983360.0, + "grad_norm": 1.9722220850866923, + "language_loss": 0.7229144, + "learning_rate": 5.399678257985263e-08, + "loss": 0.74427968, + "num_input_tokens_seen": 333175770, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.12017822, + "step": 15436, + "time_per_iteration": 2.379542350769043 + }, + { + "auxiliary_loss_clip": 0.01111942, + "auxiliary_loss_mlp": 0.01031945, + "balance_loss_clip": 1.04026878, + "balance_loss_mlp": 1.01895738, + "epoch": 0.9281226514354427, + "flos": 24785616539520.0, + "grad_norm": 2.3770378789993867, + "language_loss": 0.67284256, + "learning_rate": 5.390693237078925e-08, + "loss": 0.69428134, + "num_input_tokens_seen": 333194775, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.12988281, + "step": 15437, + "time_per_iteration": 2.4958243370056152 + }, + { + "auxiliary_loss_clip": 0.01108484, + "auxiliary_loss_mlp": 0.0103644, + "balance_loss_clip": 1.03520036, + "balance_loss_mlp": 1.02351809, + "epoch": 0.9281827746881106, + "flos": 15083128563840.0, + "grad_norm": 2.115998381590604, + "language_loss": 0.71795541, + "learning_rate": 5.3817155957770254e-08, + "loss": 0.73940468, + "num_input_tokens_seen": 333208920, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.12927246, + "step": 15438, + "time_per_iteration": 2.433882236480713 + }, + { + "auxiliary_loss_clip": 0.01115163, + "auxiliary_loss_mlp": 0.01033669, + "balance_loss_clip": 1.04221928, + "balance_loss_mlp": 1.02119374, + "epoch": 0.9282428979407786, + "flos": 24135059364480.0, + "grad_norm": 1.8279498696339795, + "language_loss": 0.64621425, + "learning_rate": 5.3727453344199366e-08, + "loss": 0.6677025, + "num_input_tokens_seen": 333229350, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.12457275, + "step": 15439, + "time_per_iteration": 4.009876251220703 + }, + { + "auxiliary_loss_clip": 0.01118399, + "auxiliary_loss_mlp": 0.01029491, + "balance_loss_clip": 1.04361677, + "balance_loss_mlp": 1.01752853, + "epoch": 0.9283030211934465, + "flos": 24823215100800.0, + "grad_norm": 1.7995194529870935, + "language_loss": 0.70504081, + "learning_rate": 5.363782453347876e-08, + "loss": 0.7265197, + "num_input_tokens_seen": 333246125, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.11962891, + "step": 15440, + "time_per_iteration": 2.4822471141815186 + }, + { + "auxiliary_loss_clip": 0.01125424, + "auxiliary_loss_mlp": 0.01032095, + "balance_loss_clip": 1.05012441, + "balance_loss_mlp": 1.02029312, + "epoch": 0.9283631444461146, + "flos": 23981037845760.0, + "grad_norm": 1.6148892890525017, + "language_loss": 0.77005565, + "learning_rate": 5.354826952900682e-08, + "loss": 0.7916308, + "num_input_tokens_seen": 333263685, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.11804199, + "step": 15441, + "time_per_iteration": 2.4882242679595947 + }, + { + "auxiliary_loss_clip": 0.01111552, + "auxiliary_loss_mlp": 0.01026308, + "balance_loss_clip": 1.04408562, + "balance_loss_mlp": 1.01681888, + "epoch": 0.9284232676987825, + "flos": 22784530878720.0, + "grad_norm": 1.754495237401661, + "language_loss": 0.63878191, + "learning_rate": 5.345878833417949e-08, + "loss": 0.66016048, + "num_input_tokens_seen": 333282435, + "router_z_loss_clip": 0.67480469, + "router_z_loss_mlp": 0.09490967, + "step": 15442, + "time_per_iteration": 2.4757561683654785 + }, + { + "auxiliary_loss_clip": 0.01119653, + "auxiliary_loss_mlp": 0.01037426, + "balance_loss_clip": 1.04478443, + "balance_loss_mlp": 1.02577877, + "epoch": 0.9284833909514505, + "flos": 19500500171520.0, + "grad_norm": 1.7756338650479282, + "language_loss": 0.81174368, + "learning_rate": 5.3369380952390295e-08, + "loss": 0.83331448, + "num_input_tokens_seen": 333300400, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11645508, + "step": 15443, + "time_per_iteration": 2.438127279281616 + }, + { + "auxiliary_loss_clip": 0.0111188, + "auxiliary_loss_mlp": 0.01030923, + "balance_loss_clip": 1.04083693, + "balance_loss_mlp": 1.01869845, + "epoch": 0.9285435142041184, + "flos": 23185976256000.0, + "grad_norm": 1.952244678138863, + "language_loss": 0.65623724, + "learning_rate": 5.328004738702896e-08, + "loss": 0.67766529, + "num_input_tokens_seen": 333318980, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.12213135, + "step": 15444, + "time_per_iteration": 2.4253320693969727 + }, + { + "auxiliary_loss_clip": 0.01118049, + "auxiliary_loss_mlp": 0.0103289, + "balance_loss_clip": 1.0437398, + "balance_loss_mlp": 1.01971161, + "epoch": 0.9286036374567864, + "flos": 17675519915520.0, + "grad_norm": 2.2407352993517207, + "language_loss": 0.73577309, + "learning_rate": 5.3190787641483215e-08, + "loss": 0.7572825, + "num_input_tokens_seen": 333334135, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.13171387, + "step": 15445, + "time_per_iteration": 2.396249294281006 + }, + { + "auxiliary_loss_clip": 0.0111968, + "auxiliary_loss_mlp": 0.01038708, + "balance_loss_clip": 1.04466212, + "balance_loss_mlp": 1.02638769, + "epoch": 0.9286637607094543, + "flos": 20886687884160.0, + "grad_norm": 1.5999080147332074, + "language_loss": 0.71363473, + "learning_rate": 5.3101601719138135e-08, + "loss": 0.73521864, + "num_input_tokens_seen": 333353325, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.12304688, + "step": 15446, + "time_per_iteration": 2.4369311332702637 + }, + { + "auxiliary_loss_clip": 0.01121118, + "auxiliary_loss_mlp": 0.01028134, + "balance_loss_clip": 1.04625571, + "balance_loss_mlp": 1.01593876, + "epoch": 0.9287238839621224, + "flos": 19026012487680.0, + "grad_norm": 1.9445685670612116, + "language_loss": 0.69326121, + "learning_rate": 5.301248962337523e-08, + "loss": 0.71475375, + "num_input_tokens_seen": 333371110, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.12188721, + "step": 15447, + "time_per_iteration": 2.5092382431030273 + }, + { + "auxiliary_loss_clip": 0.01110444, + "auxiliary_loss_mlp": 0.01026849, + "balance_loss_clip": 1.04369426, + "balance_loss_mlp": 1.01662695, + "epoch": 0.9287840072147904, + "flos": 20557027837440.0, + "grad_norm": 1.5630298723524476, + "language_loss": 0.72230673, + "learning_rate": 5.292345135757403e-08, + "loss": 0.74367964, + "num_input_tokens_seen": 333391420, + "router_z_loss_clip": 0.66796875, + "router_z_loss_mlp": 0.10217285, + "step": 15448, + "time_per_iteration": 2.4508235454559326 + }, + { + "auxiliary_loss_clip": 0.01115687, + "auxiliary_loss_mlp": 0.01027603, + "balance_loss_clip": 1.04242456, + "balance_loss_mlp": 1.0148654, + "epoch": 0.9288441304674583, + "flos": 21250822008960.0, + "grad_norm": 1.770714451089635, + "language_loss": 0.7421186, + "learning_rate": 5.283448692511072e-08, + "loss": 0.76355147, + "num_input_tokens_seen": 333410365, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.12738037, + "step": 15449, + "time_per_iteration": 2.449023962020874 + }, + { + "auxiliary_loss_clip": 0.01116635, + "auxiliary_loss_mlp": 0.0102623, + "balance_loss_clip": 1.04445529, + "balance_loss_mlp": 1.01396334, + "epoch": 0.9289042537201263, + "flos": 27669853895040.0, + "grad_norm": 2.143624397927063, + "language_loss": 0.67657745, + "learning_rate": 5.27455963293586e-08, + "loss": 0.69800603, + "num_input_tokens_seen": 333430000, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.12268066, + "step": 15450, + "time_per_iteration": 2.4836299419403076 + }, + { + "auxiliary_loss_clip": 0.01116268, + "auxiliary_loss_mlp": 0.01026409, + "balance_loss_clip": 1.04381478, + "balance_loss_mlp": 1.01466739, + "epoch": 0.9289643769727942, + "flos": 19317750750720.0, + "grad_norm": 2.2525289040285768, + "language_loss": 0.71821636, + "learning_rate": 5.265677957368875e-08, + "loss": 0.7396431, + "num_input_tokens_seen": 333445800, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11737061, + "step": 15451, + "time_per_iteration": 2.428516387939453 + }, + { + "auxiliary_loss_clip": 0.01114791, + "auxiliary_loss_mlp": 0.0103019, + "balance_loss_clip": 1.04125094, + "balance_loss_mlp": 1.01872218, + "epoch": 0.9290245002254622, + "flos": 14058058233600.0, + "grad_norm": 2.175077152644801, + "language_loss": 0.73916507, + "learning_rate": 5.25680366614687e-08, + "loss": 0.76061481, + "num_input_tokens_seen": 333461550, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11462402, + "step": 15452, + "time_per_iteration": 2.438488483428955 + }, + { + "auxiliary_loss_clip": 0.01120097, + "auxiliary_loss_mlp": 0.01026885, + "balance_loss_clip": 1.04760003, + "balance_loss_mlp": 1.01508343, + "epoch": 0.9290846234781301, + "flos": 20047132321920.0, + "grad_norm": 1.6980229682423484, + "language_loss": 0.74229908, + "learning_rate": 5.2479367596064196e-08, + "loss": 0.76376891, + "num_input_tokens_seen": 333478835, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11804199, + "step": 15453, + "time_per_iteration": 2.473832607269287 + }, + { + "auxiliary_loss_clip": 0.01050873, + "auxiliary_loss_mlp": 0.01004235, + "balance_loss_clip": 1.02665281, + "balance_loss_mlp": 1.00292349, + "epoch": 0.9291447467307982, + "flos": 61227514460160.0, + "grad_norm": 0.8126529165923452, + "language_loss": 0.60674083, + "learning_rate": 5.2390772380837226e-08, + "loss": 0.62729192, + "num_input_tokens_seen": 333535250, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.0131073, + "step": 15454, + "time_per_iteration": 4.41094970703125 + }, + { + "auxiliary_loss_clip": 0.01112039, + "auxiliary_loss_mlp": 0.01031496, + "balance_loss_clip": 1.03832841, + "balance_loss_mlp": 1.01989663, + "epoch": 0.9292048699834661, + "flos": 20553328736640.0, + "grad_norm": 1.678126460041923, + "language_loss": 0.68882346, + "learning_rate": 5.230225101914709e-08, + "loss": 0.71025872, + "num_input_tokens_seen": 333553805, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11590576, + "step": 15455, + "time_per_iteration": 2.475149631500244 + }, + { + "auxiliary_loss_clip": 0.01118265, + "auxiliary_loss_mlp": 0.01035007, + "balance_loss_clip": 1.04437149, + "balance_loss_mlp": 1.02163744, + "epoch": 0.9292649932361341, + "flos": 23623655477760.0, + "grad_norm": 1.6925670402481312, + "language_loss": 0.6487093, + "learning_rate": 5.22138035143509e-08, + "loss": 0.67024207, + "num_input_tokens_seen": 333572800, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.13378906, + "step": 15456, + "time_per_iteration": 2.485323190689087 + }, + { + "auxiliary_loss_clip": 0.01116101, + "auxiliary_loss_mlp": 0.0103237, + "balance_loss_clip": 1.04332411, + "balance_loss_mlp": 1.01988316, + "epoch": 0.929325116488802, + "flos": 15009942602880.0, + "grad_norm": 2.078731592991508, + "language_loss": 0.68395352, + "learning_rate": 5.2125429869802615e-08, + "loss": 0.70543826, + "num_input_tokens_seen": 333588520, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.125, + "step": 15457, + "time_per_iteration": 3.8743767738342285 + }, + { + "auxiliary_loss_clip": 0.01114359, + "auxiliary_loss_mlp": 0.01026377, + "balance_loss_clip": 1.04058599, + "balance_loss_mlp": 1.01524234, + "epoch": 0.92938523974147, + "flos": 17967365919360.0, + "grad_norm": 2.0314469115817158, + "language_loss": 0.80867279, + "learning_rate": 5.203713008885291e-08, + "loss": 0.83008009, + "num_input_tokens_seen": 333603435, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11132812, + "step": 15458, + "time_per_iteration": 2.5012478828430176 + }, + { + "auxiliary_loss_clip": 0.01120545, + "auxiliary_loss_mlp": 0.01032373, + "balance_loss_clip": 1.04796743, + "balance_loss_mlp": 1.02020788, + "epoch": 0.9294453629941379, + "flos": 23003047267200.0, + "grad_norm": 1.5585612681460024, + "language_loss": 0.7221669, + "learning_rate": 5.194890417485065e-08, + "loss": 0.74369609, + "num_input_tokens_seen": 333623305, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.1217041, + "step": 15459, + "time_per_iteration": 2.4714012145996094 + }, + { + "auxiliary_loss_clip": 0.01110251, + "auxiliary_loss_mlp": 0.01032195, + "balance_loss_clip": 1.03746414, + "balance_loss_mlp": 1.02063227, + "epoch": 0.929505486246806, + "flos": 17055234927360.0, + "grad_norm": 2.4296623904553445, + "language_loss": 0.59192038, + "learning_rate": 5.1860752131141384e-08, + "loss": 0.61334491, + "num_input_tokens_seen": 333641205, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11560059, + "step": 15460, + "time_per_iteration": 2.487661600112915 + }, + { + "auxiliary_loss_clip": 0.01119679, + "auxiliary_loss_mlp": 0.01031224, + "balance_loss_clip": 1.04580402, + "balance_loss_mlp": 1.01916027, + "epoch": 0.9295656094994739, + "flos": 27340409329920.0, + "grad_norm": 2.086110024584471, + "language_loss": 0.8048228, + "learning_rate": 5.177267396106733e-08, + "loss": 0.82633185, + "num_input_tokens_seen": 333659615, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.12054443, + "step": 15461, + "time_per_iteration": 2.5312178134918213 + }, + { + "auxiliary_loss_clip": 0.0110408, + "auxiliary_loss_mlp": 0.01025378, + "balance_loss_clip": 1.03509235, + "balance_loss_mlp": 1.01419055, + "epoch": 0.9296257327521419, + "flos": 21470954509440.0, + "grad_norm": 10.2372258997823, + "language_loss": 0.78433955, + "learning_rate": 5.168466966796869e-08, + "loss": 0.80563414, + "num_input_tokens_seen": 333678985, + "router_z_loss_clip": 0.68994141, + "router_z_loss_mlp": 0.11187744, + "step": 15462, + "time_per_iteration": 2.4917640686035156 + }, + { + "auxiliary_loss_clip": 0.01106276, + "auxiliary_loss_mlp": 0.01023717, + "balance_loss_clip": 1.03455305, + "balance_loss_mlp": 1.01258254, + "epoch": 0.9296858560048099, + "flos": 16362661818240.0, + "grad_norm": 2.094034648283338, + "language_loss": 0.63053334, + "learning_rate": 5.159673925518282e-08, + "loss": 0.6518333, + "num_input_tokens_seen": 333696410, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11138916, + "step": 15463, + "time_per_iteration": 2.4650990962982178 + }, + { + "auxiliary_loss_clip": 0.01102862, + "auxiliary_loss_mlp": 0.01029339, + "balance_loss_clip": 1.03308702, + "balance_loss_mlp": 1.01865804, + "epoch": 0.9297459792574778, + "flos": 29858609139840.0, + "grad_norm": 1.4152472132856992, + "language_loss": 0.71207166, + "learning_rate": 5.15088827260437e-08, + "loss": 0.73339361, + "num_input_tokens_seen": 333716615, + "router_z_loss_clip": 0.69677734, + "router_z_loss_mlp": 0.10675049, + "step": 15464, + "time_per_iteration": 2.5483946800231934 + }, + { + "auxiliary_loss_clip": 0.01112165, + "auxiliary_loss_mlp": 0.01029177, + "balance_loss_clip": 1.03866696, + "balance_loss_mlp": 1.01780438, + "epoch": 0.9298061025101458, + "flos": 15924838942080.0, + "grad_norm": 2.835437218985497, + "language_loss": 0.77493531, + "learning_rate": 5.1421100083883115e-08, + "loss": 0.79634869, + "num_input_tokens_seen": 333732800, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.1137085, + "step": 15465, + "time_per_iteration": 2.390031337738037 + }, + { + "auxiliary_loss_clip": 0.01038246, + "auxiliary_loss_mlp": 0.01002599, + "balance_loss_clip": 1.0133146, + "balance_loss_mlp": 1.00125039, + "epoch": 0.9298662257628137, + "flos": 64096994304000.0, + "grad_norm": 0.6951628345280002, + "language_loss": 0.56495172, + "learning_rate": 5.133339133202952e-08, + "loss": 0.58536017, + "num_input_tokens_seen": 333799300, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.01348877, + "step": 15466, + "time_per_iteration": 4.570564031600952 + }, + { + "auxiliary_loss_clip": 0.01113213, + "auxiliary_loss_mlp": 0.01036574, + "balance_loss_clip": 1.04145932, + "balance_loss_mlp": 1.02383041, + "epoch": 0.9299263490154818, + "flos": 24280210224000.0, + "grad_norm": 1.6972445920921932, + "language_loss": 0.72546339, + "learning_rate": 5.1245756473809355e-08, + "loss": 0.74696124, + "num_input_tokens_seen": 333820360, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.12744141, + "step": 15467, + "time_per_iteration": 2.531172275543213 + }, + { + "auxiliary_loss_clip": 0.01111606, + "auxiliary_loss_mlp": 0.01033041, + "balance_loss_clip": 1.03872228, + "balance_loss_mlp": 1.01976693, + "epoch": 0.9299864722681497, + "flos": 23294354567040.0, + "grad_norm": 1.6326874341276034, + "language_loss": 0.7171495, + "learning_rate": 5.1158195512545076e-08, + "loss": 0.73859596, + "num_input_tokens_seen": 333840415, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.13269043, + "step": 15468, + "time_per_iteration": 2.4762985706329346 + }, + { + "auxiliary_loss_clip": 0.01115513, + "auxiliary_loss_mlp": 0.01033719, + "balance_loss_clip": 1.04094946, + "balance_loss_mlp": 1.0198487, + "epoch": 0.9300465955208177, + "flos": 21395972868480.0, + "grad_norm": 1.734686585505781, + "language_loss": 0.75563228, + "learning_rate": 5.107070845155737e-08, + "loss": 0.77712464, + "num_input_tokens_seen": 333859910, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.13873291, + "step": 15469, + "time_per_iteration": 2.4805281162261963 + }, + { + "auxiliary_loss_clip": 0.01115939, + "auxiliary_loss_mlp": 0.0104063, + "balance_loss_clip": 1.0404861, + "balance_loss_mlp": 1.02780938, + "epoch": 0.9301067187734856, + "flos": 24571445696640.0, + "grad_norm": 2.2903665905513186, + "language_loss": 0.75895667, + "learning_rate": 5.098329529416379e-08, + "loss": 0.78052241, + "num_input_tokens_seen": 333880495, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.12823486, + "step": 15470, + "time_per_iteration": 2.4823007583618164 + }, + { + "auxiliary_loss_clip": 0.01117522, + "auxiliary_loss_mlp": 0.0102825, + "balance_loss_clip": 1.04594266, + "balance_loss_mlp": 1.01721096, + "epoch": 0.9301668420261536, + "flos": 22196960202240.0, + "grad_norm": 1.7369226042642034, + "language_loss": 0.74422711, + "learning_rate": 5.089595604367902e-08, + "loss": 0.76568484, + "num_input_tokens_seen": 333897640, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11035156, + "step": 15471, + "time_per_iteration": 2.4659533500671387 + }, + { + "auxiliary_loss_clip": 0.01114124, + "auxiliary_loss_mlp": 0.01028971, + "balance_loss_clip": 1.04181457, + "balance_loss_mlp": 1.01707363, + "epoch": 0.9302269652788215, + "flos": 17747628468480.0, + "grad_norm": 4.231968925027768, + "language_loss": 0.69377244, + "learning_rate": 5.080869070341487e-08, + "loss": 0.7152034, + "num_input_tokens_seen": 333913670, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11901855, + "step": 15472, + "time_per_iteration": 2.4374849796295166 + }, + { + "auxiliary_loss_clip": 0.01115237, + "auxiliary_loss_mlp": 0.01028923, + "balance_loss_clip": 1.04563069, + "balance_loss_mlp": 1.01842093, + "epoch": 0.9302870885314896, + "flos": 19390793057280.0, + "grad_norm": 1.962502161613921, + "language_loss": 0.88504994, + "learning_rate": 5.0721499276680233e-08, + "loss": 0.90649152, + "num_input_tokens_seen": 333934105, + "router_z_loss_clip": 0.69580078, + "router_z_loss_mlp": 0.10491943, + "step": 15473, + "time_per_iteration": 2.480074405670166 + }, + { + "auxiliary_loss_clip": 0.01119624, + "auxiliary_loss_mlp": 0.01032274, + "balance_loss_clip": 1.04468203, + "balance_loss_mlp": 1.01911902, + "epoch": 0.9303472117841575, + "flos": 21760286561280.0, + "grad_norm": 2.2838580667038664, + "language_loss": 0.64118421, + "learning_rate": 5.063438176678203e-08, + "loss": 0.66270316, + "num_input_tokens_seen": 333953635, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.13140869, + "step": 15474, + "time_per_iteration": 2.444810152053833 + }, + { + "auxiliary_loss_clip": 0.01119075, + "auxiliary_loss_mlp": 0.01031052, + "balance_loss_clip": 1.04686642, + "balance_loss_mlp": 1.01932168, + "epoch": 0.9304073350368255, + "flos": 19609740408960.0, + "grad_norm": 2.539682998128521, + "language_loss": 0.75187767, + "learning_rate": 5.054733817702339e-08, + "loss": 0.77337891, + "num_input_tokens_seen": 333971825, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11743164, + "step": 15475, + "time_per_iteration": 2.641901731491089 + }, + { + "auxiliary_loss_clip": 0.01118881, + "auxiliary_loss_mlp": 0.01028172, + "balance_loss_clip": 1.04744887, + "balance_loss_mlp": 1.0171628, + "epoch": 0.9304674582894935, + "flos": 30441582875520.0, + "grad_norm": 2.1042849873324445, + "language_loss": 0.66144526, + "learning_rate": 5.0460368510704786e-08, + "loss": 0.68291581, + "num_input_tokens_seen": 333990120, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11004639, + "step": 15476, + "time_per_iteration": 2.5279159545898438 + }, + { + "auxiliary_loss_clip": 0.01115225, + "auxiliary_loss_mlp": 0.01034234, + "balance_loss_clip": 1.04167247, + "balance_loss_mlp": 1.02215791, + "epoch": 0.9305275815421614, + "flos": 17785693906560.0, + "grad_norm": 2.3647392018393436, + "language_loss": 0.6889798, + "learning_rate": 5.0373472771124914e-08, + "loss": 0.71047437, + "num_input_tokens_seen": 334007970, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.1206665, + "step": 15477, + "time_per_iteration": 2.5042648315429688 + }, + { + "auxiliary_loss_clip": 0.01110636, + "auxiliary_loss_mlp": 0.01030763, + "balance_loss_clip": 1.04078412, + "balance_loss_mlp": 1.01953924, + "epoch": 0.9305877047948294, + "flos": 25298456970240.0, + "grad_norm": 1.7209961320368763, + "language_loss": 0.58433163, + "learning_rate": 5.0286650961578027e-08, + "loss": 0.60574561, + "num_input_tokens_seen": 334027120, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.11230469, + "step": 15478, + "time_per_iteration": 2.4668846130371094 + }, + { + "auxiliary_loss_clip": 0.01123379, + "auxiliary_loss_mlp": 0.01031492, + "balance_loss_clip": 1.04565954, + "balance_loss_mlp": 1.01831913, + "epoch": 0.9306478280474973, + "flos": 16977236544000.0, + "grad_norm": 1.8069672866213746, + "language_loss": 0.79115939, + "learning_rate": 5.01999030853566e-08, + "loss": 0.81270802, + "num_input_tokens_seen": 334042785, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.1315918, + "step": 15479, + "time_per_iteration": 2.41652774810791 + }, + { + "auxiliary_loss_clip": 0.01114602, + "auxiliary_loss_mlp": 0.01028492, + "balance_loss_clip": 1.04264104, + "balance_loss_mlp": 1.0177573, + "epoch": 0.9307079513001654, + "flos": 35663353608960.0, + "grad_norm": 1.7522228905200523, + "language_loss": 0.68839806, + "learning_rate": 5.0113229145750445e-08, + "loss": 0.70982897, + "num_input_tokens_seen": 334063480, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.10736084, + "step": 15480, + "time_per_iteration": 2.578617572784424 + }, + { + "auxiliary_loss_clip": 0.01112525, + "auxiliary_loss_mlp": 0.01028175, + "balance_loss_clip": 1.04123759, + "balance_loss_mlp": 1.01639163, + "epoch": 0.9307680745528333, + "flos": 19208151377280.0, + "grad_norm": 1.6597253321365266, + "language_loss": 0.6780169, + "learning_rate": 5.002662914604583e-08, + "loss": 0.69942391, + "num_input_tokens_seen": 334082005, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.11779785, + "step": 15481, + "time_per_iteration": 2.4768011569976807 + }, + { + "auxiliary_loss_clip": 0.01111101, + "auxiliary_loss_mlp": 0.01031058, + "balance_loss_clip": 1.03943264, + "balance_loss_mlp": 1.01973915, + "epoch": 0.9308281978055013, + "flos": 19062641381760.0, + "grad_norm": 1.9013572640604208, + "language_loss": 0.74655384, + "learning_rate": 4.994010308952701e-08, + "loss": 0.76797545, + "num_input_tokens_seen": 334101375, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11322021, + "step": 15482, + "time_per_iteration": 2.5025408267974854 + }, + { + "auxiliary_loss_clip": 0.01115177, + "auxiliary_loss_mlp": 0.01031554, + "balance_loss_clip": 1.04190266, + "balance_loss_mlp": 1.02029455, + "epoch": 0.9308883210581692, + "flos": 20521548178560.0, + "grad_norm": 1.842394236677548, + "language_loss": 0.8010456, + "learning_rate": 4.985365097947469e-08, + "loss": 0.82251287, + "num_input_tokens_seen": 334119460, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11260986, + "step": 15483, + "time_per_iteration": 3.98512864112854 + }, + { + "auxiliary_loss_clip": 0.01115139, + "auxiliary_loss_mlp": 0.01030635, + "balance_loss_clip": 1.04121971, + "balance_loss_mlp": 1.01865494, + "epoch": 0.9309484443108372, + "flos": 13001422826880.0, + "grad_norm": 1.9991455182850897, + "language_loss": 0.74499637, + "learning_rate": 4.976727281916782e-08, + "loss": 0.7664541, + "num_input_tokens_seen": 334136065, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11987305, + "step": 15484, + "time_per_iteration": 2.4523842334747314 + }, + { + "auxiliary_loss_clip": 0.0111465, + "auxiliary_loss_mlp": 0.01028397, + "balance_loss_clip": 1.041821, + "balance_loss_mlp": 1.01655912, + "epoch": 0.9310085675635051, + "flos": 12567765928320.0, + "grad_norm": 3.902089626117028, + "language_loss": 0.76553845, + "learning_rate": 4.968096861188087e-08, + "loss": 0.78696889, + "num_input_tokens_seen": 334153690, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.1182251, + "step": 15485, + "time_per_iteration": 2.4420981407165527 + }, + { + "auxiliary_loss_clip": 0.01116363, + "auxiliary_loss_mlp": 0.01030114, + "balance_loss_clip": 1.04122579, + "balance_loss_mlp": 1.01748943, + "epoch": 0.9310686908161732, + "flos": 23477570864640.0, + "grad_norm": 1.795713683891391, + "language_loss": 0.78284776, + "learning_rate": 4.959473836088723e-08, + "loss": 0.80431259, + "num_input_tokens_seen": 334171880, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.1262207, + "step": 15486, + "time_per_iteration": 2.554112434387207 + }, + { + "auxiliary_loss_clip": 0.01118867, + "auxiliary_loss_mlp": 0.01030725, + "balance_loss_clip": 1.04461563, + "balance_loss_mlp": 1.018435, + "epoch": 0.9311288140688411, + "flos": 24170287628160.0, + "grad_norm": 7.316990001831047, + "language_loss": 0.77027941, + "learning_rate": 4.950858206945674e-08, + "loss": 0.79177535, + "num_input_tokens_seen": 334190005, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.12310791, + "step": 15487, + "time_per_iteration": 2.4485151767730713 + }, + { + "auxiliary_loss_clip": 0.01112237, + "auxiliary_loss_mlp": 0.01026383, + "balance_loss_clip": 1.04220009, + "balance_loss_mlp": 1.01434326, + "epoch": 0.9311889373215091, + "flos": 35590203561600.0, + "grad_norm": 2.1314087577666454, + "language_loss": 0.67022288, + "learning_rate": 4.942249974085633e-08, + "loss": 0.69160908, + "num_input_tokens_seen": 334209545, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.12030029, + "step": 15488, + "time_per_iteration": 2.5692877769470215 + }, + { + "auxiliary_loss_clip": 0.01112918, + "auxiliary_loss_mlp": 0.01031362, + "balance_loss_clip": 1.04435468, + "balance_loss_mlp": 1.01924992, + "epoch": 0.9312490605741771, + "flos": 20230528187520.0, + "grad_norm": 1.9560739496439061, + "language_loss": 0.7507422, + "learning_rate": 4.933649137834983e-08, + "loss": 0.77218503, + "num_input_tokens_seen": 334228900, + "router_z_loss_clip": 0.68554688, + "router_z_loss_mlp": 0.12103271, + "step": 15489, + "time_per_iteration": 2.4448089599609375 + }, + { + "auxiliary_loss_clip": 0.01122693, + "auxiliary_loss_mlp": 0.01030709, + "balance_loss_clip": 1.04832482, + "balance_loss_mlp": 1.01833487, + "epoch": 0.931309183826845, + "flos": 13950577762560.0, + "grad_norm": 3.6713304506967375, + "language_loss": 0.81135499, + "learning_rate": 4.925055698519931e-08, + "loss": 0.83288908, + "num_input_tokens_seen": 334245500, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.1237793, + "step": 15490, + "time_per_iteration": 2.461717367172241 + }, + { + "auxiliary_loss_clip": 0.01121475, + "auxiliary_loss_mlp": 0.01032525, + "balance_loss_clip": 1.04766309, + "balance_loss_mlp": 1.02008533, + "epoch": 0.931369307079513, + "flos": 20156731695360.0, + "grad_norm": 1.594501585717491, + "language_loss": 0.71632379, + "learning_rate": 4.9164696564663264e-08, + "loss": 0.73786378, + "num_input_tokens_seen": 334264370, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.12445068, + "step": 15491, + "time_per_iteration": 2.4498817920684814 + }, + { + "auxiliary_loss_clip": 0.01112962, + "auxiliary_loss_mlp": 0.01022881, + "balance_loss_clip": 1.04368854, + "balance_loss_mlp": 1.01264703, + "epoch": 0.931429430332181, + "flos": 25338569483520.0, + "grad_norm": 1.7182535329059416, + "language_loss": 0.74502152, + "learning_rate": 4.9078910119997096e-08, + "loss": 0.76637995, + "num_input_tokens_seen": 334283905, + "router_z_loss_clip": 0.69287109, + "router_z_loss_mlp": 0.10241699, + "step": 15492, + "time_per_iteration": 2.5164906978607178 + }, + { + "auxiliary_loss_clip": 0.01040406, + "auxiliary_loss_mlp": 0.01004925, + "balance_loss_clip": 1.01591206, + "balance_loss_mlp": 1.00363612, + "epoch": 0.931489553584849, + "flos": 71226193985280.0, + "grad_norm": 0.710873378763162, + "language_loss": 0.53397918, + "learning_rate": 4.899319765445442e-08, + "loss": 0.55443251, + "num_input_tokens_seen": 334339925, + "router_z_loss_clip": 0.24487305, + "router_z_loss_mlp": 0.01289368, + "step": 15493, + "time_per_iteration": 2.9309728145599365 + }, + { + "auxiliary_loss_clip": 0.0111328, + "auxiliary_loss_mlp": 0.01028781, + "balance_loss_clip": 1.04259014, + "balance_loss_mlp": 1.01786709, + "epoch": 0.9315496768375169, + "flos": 14643653662080.0, + "grad_norm": 1.7538916200872696, + "language_loss": 0.71005559, + "learning_rate": 4.890755917128531e-08, + "loss": 0.73147619, + "num_input_tokens_seen": 334357225, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.10906982, + "step": 15494, + "time_per_iteration": 2.4480502605438232 + }, + { + "auxiliary_loss_clip": 0.01116529, + "auxiliary_loss_mlp": 0.01028104, + "balance_loss_clip": 1.04298019, + "balance_loss_mlp": 1.01618934, + "epoch": 0.9316098000901849, + "flos": 28329928174080.0, + "grad_norm": 1.764911267225783, + "language_loss": 0.68409449, + "learning_rate": 4.882199467373671e-08, + "loss": 0.70554084, + "num_input_tokens_seen": 334375945, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11920166, + "step": 15495, + "time_per_iteration": 2.5106184482574463 + }, + { + "auxiliary_loss_clip": 0.01107052, + "auxiliary_loss_mlp": 0.01033465, + "balance_loss_clip": 1.0370115, + "balance_loss_mlp": 1.0220927, + "epoch": 0.9316699233428528, + "flos": 28512677594880.0, + "grad_norm": 1.8084370653582453, + "language_loss": 0.61807317, + "learning_rate": 4.8736504165053815e-08, + "loss": 0.63947833, + "num_input_tokens_seen": 334395310, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.11376953, + "step": 15496, + "time_per_iteration": 2.52592134475708 + }, + { + "auxiliary_loss_clip": 0.01106135, + "auxiliary_loss_mlp": 0.01030893, + "balance_loss_clip": 1.03516698, + "balance_loss_mlp": 1.01891804, + "epoch": 0.9317300465955208, + "flos": 33693402061440.0, + "grad_norm": 1.8623034988875502, + "language_loss": 0.76781058, + "learning_rate": 4.865108764847825e-08, + "loss": 0.78918087, + "num_input_tokens_seen": 334416965, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11987305, + "step": 15497, + "time_per_iteration": 4.059306383132935 + }, + { + "auxiliary_loss_clip": 0.01114582, + "auxiliary_loss_mlp": 0.010323, + "balance_loss_clip": 1.04087162, + "balance_loss_mlp": 1.01996815, + "epoch": 0.9317901698481887, + "flos": 23658237296640.0, + "grad_norm": 1.7973470036697323, + "language_loss": 0.66297925, + "learning_rate": 4.856574512724898e-08, + "loss": 0.68444806, + "num_input_tokens_seen": 334435620, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.12347412, + "step": 15498, + "time_per_iteration": 2.509568214416504 + }, + { + "auxiliary_loss_clip": 0.01107914, + "auxiliary_loss_mlp": 0.01034457, + "balance_loss_clip": 1.0366416, + "balance_loss_mlp": 1.02216673, + "epoch": 0.9318502931008568, + "flos": 20960017499520.0, + "grad_norm": 1.7131603942637732, + "language_loss": 0.80014706, + "learning_rate": 4.8480476604602305e-08, + "loss": 0.82157075, + "num_input_tokens_seen": 334456210, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.12298584, + "step": 15499, + "time_per_iteration": 2.509979486465454 + }, + { + "auxiliary_loss_clip": 0.0111247, + "auxiliary_loss_mlp": 0.01031538, + "balance_loss_clip": 1.04315281, + "balance_loss_mlp": 1.01882434, + "epoch": 0.9319104163535247, + "flos": 23441049711360.0, + "grad_norm": 1.584395260479743, + "language_loss": 0.7655555, + "learning_rate": 4.8395282083771196e-08, + "loss": 0.78699553, + "num_input_tokens_seen": 334475485, + "router_z_loss_clip": 0.69384766, + "router_z_loss_mlp": 0.1270752, + "step": 15500, + "time_per_iteration": 3.962249517440796 + }, + { + "auxiliary_loss_clip": 0.01108143, + "auxiliary_loss_mlp": 0.01031225, + "balance_loss_clip": 1.03756285, + "balance_loss_mlp": 1.01859426, + "epoch": 0.9319705396061927, + "flos": 22347426274560.0, + "grad_norm": 3.8839826775948154, + "language_loss": 0.72363245, + "learning_rate": 4.8310161567987064e-08, + "loss": 0.74502617, + "num_input_tokens_seen": 334494740, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.12640381, + "step": 15501, + "time_per_iteration": 2.542881488800049 + }, + { + "auxiliary_loss_clip": 0.01113624, + "auxiliary_loss_mlp": 0.0103234, + "balance_loss_clip": 1.04007518, + "balance_loss_mlp": 1.02087164, + "epoch": 0.9320306628588607, + "flos": 20993557824000.0, + "grad_norm": 1.7751246061741517, + "language_loss": 0.66096079, + "learning_rate": 4.822511506047666e-08, + "loss": 0.68242037, + "num_input_tokens_seen": 334511910, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11480713, + "step": 15502, + "time_per_iteration": 2.48714017868042 + }, + { + "auxiliary_loss_clip": 0.01122271, + "auxiliary_loss_mlp": 0.0103358, + "balance_loss_clip": 1.04702914, + "balance_loss_mlp": 1.02216029, + "epoch": 0.9320907861115286, + "flos": 24538300421760.0, + "grad_norm": 1.3707381799483045, + "language_loss": 0.65648687, + "learning_rate": 4.814014256446586e-08, + "loss": 0.67804527, + "num_input_tokens_seen": 334533150, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.11425781, + "step": 15503, + "time_per_iteration": 2.5116729736328125 + }, + { + "auxiliary_loss_clip": 0.01118603, + "auxiliary_loss_mlp": 0.01031735, + "balance_loss_clip": 1.04354787, + "balance_loss_mlp": 1.01834202, + "epoch": 0.9321509093641966, + "flos": 19785414850560.0, + "grad_norm": 1.6735634625354618, + "language_loss": 0.75104862, + "learning_rate": 4.805524408317652e-08, + "loss": 0.77255201, + "num_input_tokens_seen": 334550940, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.13397217, + "step": 15504, + "time_per_iteration": 2.4432733058929443 + }, + { + "auxiliary_loss_clip": 0.01111791, + "auxiliary_loss_mlp": 0.0102787, + "balance_loss_clip": 1.03959954, + "balance_loss_mlp": 1.01365995, + "epoch": 0.9322110326168646, + "flos": 24972675592320.0, + "grad_norm": 2.4120568862913316, + "language_loss": 0.71425104, + "learning_rate": 4.797041961982762e-08, + "loss": 0.73564768, + "num_input_tokens_seen": 334570935, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.14215088, + "step": 15505, + "time_per_iteration": 2.5055437088012695 + }, + { + "auxiliary_loss_clip": 0.01114406, + "auxiliary_loss_mlp": 0.01034516, + "balance_loss_clip": 1.04135907, + "balance_loss_mlp": 1.02034163, + "epoch": 0.9322711558695326, + "flos": 16143642639360.0, + "grad_norm": 1.7045244359491902, + "language_loss": 0.75472105, + "learning_rate": 4.788566917763614e-08, + "loss": 0.77621025, + "num_input_tokens_seen": 334589315, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.14147949, + "step": 15506, + "time_per_iteration": 2.410118341445923 + }, + { + "auxiliary_loss_clip": 0.01114173, + "auxiliary_loss_mlp": 0.01027572, + "balance_loss_clip": 1.0431869, + "balance_loss_mlp": 1.01649785, + "epoch": 0.9323312791222005, + "flos": 23732428838400.0, + "grad_norm": 2.0002818706089958, + "language_loss": 0.83325148, + "learning_rate": 4.780099275981597e-08, + "loss": 0.85466892, + "num_input_tokens_seen": 334608990, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.11077881, + "step": 15507, + "time_per_iteration": 2.526907444000244 + }, + { + "auxiliary_loss_clip": 0.01115398, + "auxiliary_loss_mlp": 0.01028067, + "balance_loss_clip": 1.04255855, + "balance_loss_mlp": 1.01659322, + "epoch": 0.9323914023748685, + "flos": 20777914523520.0, + "grad_norm": 1.5599123116930118, + "language_loss": 0.67971706, + "learning_rate": 4.771639036957742e-08, + "loss": 0.70115173, + "num_input_tokens_seen": 334628655, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11474609, + "step": 15508, + "time_per_iteration": 2.5365402698516846 + }, + { + "auxiliary_loss_clip": 0.01115897, + "auxiliary_loss_mlp": 0.01027883, + "balance_loss_clip": 1.04315567, + "balance_loss_mlp": 1.01629567, + "epoch": 0.9324515256275364, + "flos": 23915178259200.0, + "grad_norm": 1.6922374947540024, + "language_loss": 0.72274464, + "learning_rate": 4.7631862010129033e-08, + "loss": 0.74418247, + "num_input_tokens_seen": 334648295, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11584473, + "step": 15509, + "time_per_iteration": 3.9080393314361572 + }, + { + "auxiliary_loss_clip": 0.01114306, + "auxiliary_loss_mlp": 0.01029533, + "balance_loss_clip": 1.04234886, + "balance_loss_mlp": 1.01780248, + "epoch": 0.9325116488802044, + "flos": 18005215875840.0, + "grad_norm": 38.39209581582018, + "language_loss": 0.74593067, + "learning_rate": 4.754740768467624e-08, + "loss": 0.76736903, + "num_input_tokens_seen": 334666280, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11724854, + "step": 15510, + "time_per_iteration": 2.4095215797424316 + }, + { + "auxiliary_loss_clip": 0.01120294, + "auxiliary_loss_mlp": 0.01028294, + "balance_loss_clip": 1.04557514, + "balance_loss_mlp": 1.0166297, + "epoch": 0.9325717721328723, + "flos": 29021603443200.0, + "grad_norm": 1.8831976480033121, + "language_loss": 0.70399189, + "learning_rate": 4.746302739642161e-08, + "loss": 0.72547776, + "num_input_tokens_seen": 334688830, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.11663818, + "step": 15511, + "time_per_iteration": 2.4890573024749756 + }, + { + "auxiliary_loss_clip": 0.01113247, + "auxiliary_loss_mlp": 0.01035488, + "balance_loss_clip": 1.0413177, + "balance_loss_mlp": 1.02371597, + "epoch": 0.9326318953855404, + "flos": 21646341642240.0, + "grad_norm": 1.949186615919696, + "language_loss": 0.77976102, + "learning_rate": 4.737872114856412e-08, + "loss": 0.80124837, + "num_input_tokens_seen": 334705205, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11767578, + "step": 15512, + "time_per_iteration": 2.4544949531555176 + }, + { + "auxiliary_loss_clip": 0.0111282, + "auxiliary_loss_mlp": 0.01034015, + "balance_loss_clip": 1.03965986, + "balance_loss_mlp": 1.02136731, + "epoch": 0.9326920186382083, + "flos": 26065724411520.0, + "grad_norm": 1.6952590478378051, + "language_loss": 0.80725807, + "learning_rate": 4.7294488944301436e-08, + "loss": 0.82872641, + "num_input_tokens_seen": 334723830, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.12646484, + "step": 15513, + "time_per_iteration": 2.459317922592163 + }, + { + "auxiliary_loss_clip": 0.01126444, + "auxiliary_loss_mlp": 0.01033035, + "balance_loss_clip": 1.05064857, + "balance_loss_mlp": 1.02031541, + "epoch": 0.9327521418908763, + "flos": 12057116227200.0, + "grad_norm": 2.038792664839446, + "language_loss": 0.80293548, + "learning_rate": 4.721033078682768e-08, + "loss": 0.8245303, + "num_input_tokens_seen": 334740825, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12719727, + "step": 15514, + "time_per_iteration": 2.424879312515259 + }, + { + "auxiliary_loss_clip": 0.01110852, + "auxiliary_loss_mlp": 0.01034672, + "balance_loss_clip": 1.04177547, + "balance_loss_mlp": 1.02379429, + "epoch": 0.9328122651435443, + "flos": 43834395271680.0, + "grad_norm": 1.8833730106327224, + "language_loss": 0.71981359, + "learning_rate": 4.7126246679333626e-08, + "loss": 0.74126887, + "num_input_tokens_seen": 334765825, + "router_z_loss_clip": 0.69091797, + "router_z_loss_mlp": 0.10876465, + "step": 15515, + "time_per_iteration": 2.6349453926086426 + }, + { + "auxiliary_loss_clip": 0.01121111, + "auxiliary_loss_mlp": 0.01032635, + "balance_loss_clip": 1.04686236, + "balance_loss_mlp": 1.02023733, + "epoch": 0.9328723883962122, + "flos": 15194954580480.0, + "grad_norm": 2.6723748091736055, + "language_loss": 0.81229007, + "learning_rate": 4.704223662500806e-08, + "loss": 0.8338275, + "num_input_tokens_seen": 334782680, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.12402344, + "step": 15516, + "time_per_iteration": 2.4284121990203857 + }, + { + "auxiliary_loss_clip": 0.01115356, + "auxiliary_loss_mlp": 0.01032519, + "balance_loss_clip": 1.04149461, + "balance_loss_mlp": 1.0207355, + "epoch": 0.9329325116488802, + "flos": 20261770041600.0, + "grad_norm": 1.892237111493623, + "language_loss": 0.80957186, + "learning_rate": 4.695830062703643e-08, + "loss": 0.83105063, + "num_input_tokens_seen": 334800160, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11779785, + "step": 15517, + "time_per_iteration": 2.416027545928955 + }, + { + "auxiliary_loss_clip": 0.01121196, + "auxiliary_loss_mlp": 0.01031428, + "balance_loss_clip": 1.04757047, + "balance_loss_mlp": 1.01922655, + "epoch": 0.9329926349015482, + "flos": 13115008609920.0, + "grad_norm": 2.0685819945801236, + "language_loss": 0.74942571, + "learning_rate": 4.687443868860219e-08, + "loss": 0.77095193, + "num_input_tokens_seen": 334815840, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.12200928, + "step": 15518, + "time_per_iteration": 2.4197070598602295 + }, + { + "auxiliary_loss_clip": 0.01118407, + "auxiliary_loss_mlp": 0.01036527, + "balance_loss_clip": 1.04634356, + "balance_loss_mlp": 1.02466559, + "epoch": 0.9330527581542162, + "flos": 23040250778880.0, + "grad_norm": 2.1884261002215037, + "language_loss": 0.75488347, + "learning_rate": 4.679065081288458e-08, + "loss": 0.77643275, + "num_input_tokens_seen": 334834735, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11877441, + "step": 15519, + "time_per_iteration": 2.4368748664855957 + }, + { + "auxiliary_loss_clip": 0.01112616, + "auxiliary_loss_mlp": 0.01032129, + "balance_loss_clip": 1.0407933, + "balance_loss_mlp": 1.01966012, + "epoch": 0.9331128814068841, + "flos": 15559627409280.0, + "grad_norm": 2.0950926817303723, + "language_loss": 0.83245724, + "learning_rate": 4.6706937003061275e-08, + "loss": 0.85390472, + "num_input_tokens_seen": 334853490, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.12463379, + "step": 15520, + "time_per_iteration": 2.4173357486724854 + }, + { + "auxiliary_loss_clip": 0.01111836, + "auxiliary_loss_mlp": 0.0102874, + "balance_loss_clip": 1.03951681, + "balance_loss_mlp": 1.01744473, + "epoch": 0.9331730046595521, + "flos": 22271762275200.0, + "grad_norm": 1.5334551936895278, + "language_loss": 0.76118714, + "learning_rate": 4.6623297262306846e-08, + "loss": 0.78259289, + "num_input_tokens_seen": 334873675, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11303711, + "step": 15521, + "time_per_iteration": 2.4948203563690186 + }, + { + "auxiliary_loss_clip": 0.01112754, + "auxiliary_loss_mlp": 0.01026728, + "balance_loss_clip": 1.0415678, + "balance_loss_mlp": 1.01586807, + "epoch": 0.93323312791222, + "flos": 15777641007360.0, + "grad_norm": 19.539963973541052, + "language_loss": 0.77670735, + "learning_rate": 4.6539731593792545e-08, + "loss": 0.79810214, + "num_input_tokens_seen": 334890970, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.10858154, + "step": 15522, + "time_per_iteration": 2.441236972808838 + }, + { + "auxiliary_loss_clip": 0.01113499, + "auxiliary_loss_mlp": 0.01027734, + "balance_loss_clip": 1.03989446, + "balance_loss_mlp": 1.01628971, + "epoch": 0.933293251164888, + "flos": 22010978557440.0, + "grad_norm": 2.0271173491356964, + "language_loss": 0.63059074, + "learning_rate": 4.6456240000687373e-08, + "loss": 0.65200305, + "num_input_tokens_seen": 334906635, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11437988, + "step": 15523, + "time_per_iteration": 2.4286184310913086 + }, + { + "auxiliary_loss_clip": 0.0112135, + "auxiliary_loss_mlp": 0.01029767, + "balance_loss_clip": 1.04894543, + "balance_loss_mlp": 1.01838839, + "epoch": 0.933353374417556, + "flos": 26031358074240.0, + "grad_norm": 1.6433060616176942, + "language_loss": 0.68268836, + "learning_rate": 4.63728224861577e-08, + "loss": 0.70419955, + "num_input_tokens_seen": 334926230, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11376953, + "step": 15524, + "time_per_iteration": 2.47633957862854 + }, + { + "auxiliary_loss_clip": 0.01116783, + "auxiliary_loss_mlp": 0.01032377, + "balance_loss_clip": 1.04487169, + "balance_loss_mlp": 1.02029502, + "epoch": 0.933413497670224, + "flos": 24900100162560.0, + "grad_norm": 1.8106171682560037, + "language_loss": 0.74061775, + "learning_rate": 4.628947905336589e-08, + "loss": 0.7621094, + "num_input_tokens_seen": 334946680, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.12084961, + "step": 15525, + "time_per_iteration": 2.487985610961914 + }, + { + "auxiliary_loss_clip": 0.01119266, + "auxiliary_loss_mlp": 0.01034507, + "balance_loss_clip": 1.0486213, + "balance_loss_mlp": 1.02334929, + "epoch": 0.9334736209228919, + "flos": 23688689051520.0, + "grad_norm": 1.7046224280981561, + "language_loss": 0.84051168, + "learning_rate": 4.6206209705473175e-08, + "loss": 0.8620494, + "num_input_tokens_seen": 334964785, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.1116333, + "step": 15526, + "time_per_iteration": 2.477240562438965 + }, + { + "auxiliary_loss_clip": 0.01116174, + "auxiliary_loss_mlp": 0.01027393, + "balance_loss_clip": 1.04402781, + "balance_loss_mlp": 1.01563859, + "epoch": 0.9335337441755599, + "flos": 15377344865280.0, + "grad_norm": 2.0903862864076626, + "language_loss": 0.69385624, + "learning_rate": 4.61230144456366e-08, + "loss": 0.71529192, + "num_input_tokens_seen": 334982400, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11761475, + "step": 15527, + "time_per_iteration": 2.4229519367218018 + }, + { + "auxiliary_loss_clip": 0.01118982, + "auxiliary_loss_mlp": 0.01026859, + "balance_loss_clip": 1.04226732, + "balance_loss_mlp": 1.01376367, + "epoch": 0.9335938674282279, + "flos": 16106726436480.0, + "grad_norm": 3.603267006786963, + "language_loss": 0.65267777, + "learning_rate": 4.603989327701141e-08, + "loss": 0.67413616, + "num_input_tokens_seen": 334999685, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.13085938, + "step": 15528, + "time_per_iteration": 3.8334896564483643 + }, + { + "auxiliary_loss_clip": 0.01116342, + "auxiliary_loss_mlp": 0.01028536, + "balance_loss_clip": 1.04298222, + "balance_loss_mlp": 1.01625192, + "epoch": 0.9336539906808958, + "flos": 18952898353920.0, + "grad_norm": 2.103954449286149, + "language_loss": 0.75055164, + "learning_rate": 4.5956846202748867e-08, + "loss": 0.77200043, + "num_input_tokens_seen": 335019160, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.1229248, + "step": 15529, + "time_per_iteration": 2.43074107170105 + }, + { + "auxiliary_loss_clip": 0.01113217, + "auxiliary_loss_mlp": 0.01026768, + "balance_loss_clip": 1.04238081, + "balance_loss_mlp": 1.01586676, + "epoch": 0.9337141139335638, + "flos": 18109104986880.0, + "grad_norm": 1.851869527831587, + "language_loss": 0.63209218, + "learning_rate": 4.5873873225998674e-08, + "loss": 0.65349197, + "num_input_tokens_seen": 335037350, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.10906982, + "step": 15530, + "time_per_iteration": 2.4432661533355713 + }, + { + "auxiliary_loss_clip": 0.01117172, + "auxiliary_loss_mlp": 0.01027285, + "balance_loss_clip": 1.04732251, + "balance_loss_mlp": 1.0160253, + "epoch": 0.9337742371862318, + "flos": 17345716214400.0, + "grad_norm": 1.7791469522456105, + "language_loss": 0.72747636, + "learning_rate": 4.5790974349907194e-08, + "loss": 0.74892092, + "num_input_tokens_seen": 335056060, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.11254883, + "step": 15531, + "time_per_iteration": 2.5107321739196777 + }, + { + "auxiliary_loss_clip": 0.01112436, + "auxiliary_loss_mlp": 0.01030522, + "balance_loss_clip": 1.04184794, + "balance_loss_mlp": 1.01889896, + "epoch": 0.9338343604388998, + "flos": 29058986522880.0, + "grad_norm": 1.700694132255289, + "language_loss": 0.70805365, + "learning_rate": 4.5708149577617925e-08, + "loss": 0.72948325, + "num_input_tokens_seen": 335075410, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.11621094, + "step": 15532, + "time_per_iteration": 2.6115076541900635 + }, + { + "auxiliary_loss_clip": 0.01117568, + "auxiliary_loss_mlp": 0.01029091, + "balance_loss_clip": 1.04304945, + "balance_loss_mlp": 1.01707435, + "epoch": 0.9338944836915677, + "flos": 18660908695680.0, + "grad_norm": 1.6795233888605385, + "language_loss": 0.73362982, + "learning_rate": 4.5625398912271016e-08, + "loss": 0.75509644, + "num_input_tokens_seen": 335095190, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.12011719, + "step": 15533, + "time_per_iteration": 2.479088544845581 + }, + { + "auxiliary_loss_clip": 0.01112207, + "auxiliary_loss_mlp": 0.01024602, + "balance_loss_clip": 1.04262805, + "balance_loss_mlp": 1.01381373, + "epoch": 0.9339546069442357, + "flos": 16617735273600.0, + "grad_norm": 1.916495560606678, + "language_loss": 0.79349136, + "learning_rate": 4.554272235700507e-08, + "loss": 0.81485945, + "num_input_tokens_seen": 335113825, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.10797119, + "step": 15534, + "time_per_iteration": 2.4482922554016113 + }, + { + "auxiliary_loss_clip": 0.01112065, + "auxiliary_loss_mlp": 0.01026375, + "balance_loss_clip": 1.04616344, + "balance_loss_mlp": 1.01598048, + "epoch": 0.9340147301969036, + "flos": 23693106424320.0, + "grad_norm": 1.8071827409369996, + "language_loss": 0.74127042, + "learning_rate": 4.546011991495513e-08, + "loss": 0.76265484, + "num_input_tokens_seen": 335136425, + "router_z_loss_clip": 0.65869141, + "router_z_loss_mlp": 0.10400391, + "step": 15535, + "time_per_iteration": 2.484043836593628 + }, + { + "auxiliary_loss_clip": 0.01119046, + "auxiliary_loss_mlp": 0.01030622, + "balance_loss_clip": 1.04564869, + "balance_loss_mlp": 1.01849294, + "epoch": 0.9340748534495716, + "flos": 28654452576000.0, + "grad_norm": 2.003121149413723, + "language_loss": 0.77768004, + "learning_rate": 4.537759158925292e-08, + "loss": 0.79917669, + "num_input_tokens_seen": 335157925, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.12145996, + "step": 15536, + "time_per_iteration": 2.5501859188079834 + }, + { + "auxiliary_loss_clip": 0.01114639, + "auxiliary_loss_mlp": 0.01030522, + "balance_loss_clip": 1.04126215, + "balance_loss_mlp": 1.01925075, + "epoch": 0.9341349767022396, + "flos": 24899633285760.0, + "grad_norm": 1.4690306372521542, + "language_loss": 0.80693817, + "learning_rate": 4.5295137383028593e-08, + "loss": 0.82838976, + "num_input_tokens_seen": 335177840, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11279297, + "step": 15537, + "time_per_iteration": 2.454848051071167 + }, + { + "auxiliary_loss_clip": 0.01115182, + "auxiliary_loss_mlp": 0.01028337, + "balance_loss_clip": 1.0426451, + "balance_loss_mlp": 1.01670194, + "epoch": 0.9341950999549076, + "flos": 29059525226880.0, + "grad_norm": 1.9732565634414894, + "language_loss": 0.77860039, + "learning_rate": 4.5212757299408764e-08, + "loss": 0.8000356, + "num_input_tokens_seen": 335199470, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11621094, + "step": 15538, + "time_per_iteration": 2.513885021209717 + }, + { + "auxiliary_loss_clip": 0.01115912, + "auxiliary_loss_mlp": 0.01027136, + "balance_loss_clip": 1.04477346, + "balance_loss_mlp": 1.01559114, + "epoch": 0.9342552232075755, + "flos": 23587062497280.0, + "grad_norm": 1.5084348584424037, + "language_loss": 0.73248482, + "learning_rate": 4.513045134151672e-08, + "loss": 0.75391531, + "num_input_tokens_seen": 335218885, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.11541748, + "step": 15539, + "time_per_iteration": 2.4537618160247803 + }, + { + "auxiliary_loss_clip": 0.01112453, + "auxiliary_loss_mlp": 0.01028482, + "balance_loss_clip": 1.04195189, + "balance_loss_mlp": 1.01774716, + "epoch": 0.9343153464602435, + "flos": 36721389646080.0, + "grad_norm": 1.6881107144304477, + "language_loss": 0.64539433, + "learning_rate": 4.504821951247373e-08, + "loss": 0.66680372, + "num_input_tokens_seen": 335239485, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.1072998, + "step": 15540, + "time_per_iteration": 2.5803165435791016 + }, + { + "auxiliary_loss_clip": 0.01107614, + "auxiliary_loss_mlp": 0.01026667, + "balance_loss_clip": 1.03649831, + "balance_loss_mlp": 1.01536584, + "epoch": 0.9343754697129115, + "flos": 22236498097920.0, + "grad_norm": 1.628449970039402, + "language_loss": 0.76742363, + "learning_rate": 4.496606181539864e-08, + "loss": 0.78876638, + "num_input_tokens_seen": 335258355, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.11303711, + "step": 15541, + "time_per_iteration": 3.8856353759765625 + }, + { + "auxiliary_loss_clip": 0.01118346, + "auxiliary_loss_mlp": 0.01031226, + "balance_loss_clip": 1.04674685, + "balance_loss_mlp": 1.02003872, + "epoch": 0.9344355929655794, + "flos": 29710333797120.0, + "grad_norm": 1.9256978555806867, + "language_loss": 0.66784191, + "learning_rate": 4.4883978253406066e-08, + "loss": 0.68933761, + "num_input_tokens_seen": 335276835, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11181641, + "step": 15542, + "time_per_iteration": 2.4942514896392822 + }, + { + "auxiliary_loss_clip": 0.01114253, + "auxiliary_loss_mlp": 0.01025915, + "balance_loss_clip": 1.04309213, + "balance_loss_mlp": 1.01441169, + "epoch": 0.9344957162182475, + "flos": 18880394751360.0, + "grad_norm": 2.121002225955145, + "language_loss": 0.6976366, + "learning_rate": 4.480196882960907e-08, + "loss": 0.71903825, + "num_input_tokens_seen": 335296220, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.1151123, + "step": 15543, + "time_per_iteration": 3.8637638092041016 + }, + { + "auxiliary_loss_clip": 0.01121665, + "auxiliary_loss_mlp": 0.01031209, + "balance_loss_clip": 1.04606676, + "balance_loss_mlp": 1.01870966, + "epoch": 0.9345558394709154, + "flos": 27417761268480.0, + "grad_norm": 2.091184131074237, + "language_loss": 0.69488525, + "learning_rate": 4.4720033547117394e-08, + "loss": 0.71641397, + "num_input_tokens_seen": 335316335, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12506104, + "step": 15544, + "time_per_iteration": 2.5593948364257812 + }, + { + "auxiliary_loss_clip": 0.01116771, + "auxiliary_loss_mlp": 0.01035401, + "balance_loss_clip": 1.04400468, + "balance_loss_mlp": 1.023772, + "epoch": 0.9346159627235834, + "flos": 20741285629440.0, + "grad_norm": 1.8568744410169085, + "language_loss": 0.77095222, + "learning_rate": 4.463817240903789e-08, + "loss": 0.79247391, + "num_input_tokens_seen": 335335545, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11633301, + "step": 15545, + "time_per_iteration": 2.5471155643463135 + }, + { + "auxiliary_loss_clip": 0.01107384, + "auxiliary_loss_mlp": 0.01025529, + "balance_loss_clip": 1.0353868, + "balance_loss_mlp": 1.01434731, + "epoch": 0.9346760859762513, + "flos": 21069221823360.0, + "grad_norm": 1.535313470674709, + "language_loss": 0.68996888, + "learning_rate": 4.455638541847495e-08, + "loss": 0.71129799, + "num_input_tokens_seen": 335355350, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11187744, + "step": 15546, + "time_per_iteration": 2.5311639308929443 + }, + { + "auxiliary_loss_clip": 0.01103617, + "auxiliary_loss_mlp": 0.01026646, + "balance_loss_clip": 1.03479016, + "balance_loss_mlp": 1.01583958, + "epoch": 0.9347362092289193, + "flos": 29204927481600.0, + "grad_norm": 3.4934913515632586, + "language_loss": 0.82414126, + "learning_rate": 4.447467257852966e-08, + "loss": 0.84544396, + "num_input_tokens_seen": 335375160, + "router_z_loss_clip": 0.68847656, + "router_z_loss_mlp": 0.10803223, + "step": 15547, + "time_per_iteration": 2.643364667892456 + }, + { + "auxiliary_loss_clip": 0.01107352, + "auxiliary_loss_mlp": 0.01028986, + "balance_loss_clip": 1.03810787, + "balance_loss_mlp": 1.01814413, + "epoch": 0.9347963324815872, + "flos": 19427350124160.0, + "grad_norm": 1.827139686271581, + "language_loss": 0.83414328, + "learning_rate": 4.439303389230087e-08, + "loss": 0.85550672, + "num_input_tokens_seen": 335394080, + "router_z_loss_clip": 0.69189453, + "router_z_loss_mlp": 0.10839844, + "step": 15548, + "time_per_iteration": 2.5032808780670166 + }, + { + "auxiliary_loss_clip": 0.01117748, + "auxiliary_loss_mlp": 0.01037049, + "balance_loss_clip": 1.04056454, + "balance_loss_mlp": 1.02230883, + "epoch": 0.9348564557342552, + "flos": 36901840596480.0, + "grad_norm": 1.7475885611712891, + "language_loss": 0.65751088, + "learning_rate": 4.4311469362884326e-08, + "loss": 0.67905885, + "num_input_tokens_seen": 335414230, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.14733887, + "step": 15549, + "time_per_iteration": 2.624659538269043 + }, + { + "auxiliary_loss_clip": 0.01123344, + "auxiliary_loss_mlp": 0.01032513, + "balance_loss_clip": 1.04948258, + "balance_loss_mlp": 1.02026463, + "epoch": 0.9349165789869232, + "flos": 21690117342720.0, + "grad_norm": 1.6763612145217475, + "language_loss": 0.80337059, + "learning_rate": 4.4229978993372665e-08, + "loss": 0.82492912, + "num_input_tokens_seen": 335432890, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.12255859, + "step": 15550, + "time_per_iteration": 2.4652068614959717 + }, + { + "auxiliary_loss_clip": 0.01111029, + "auxiliary_loss_mlp": 0.01033274, + "balance_loss_clip": 1.0398922, + "balance_loss_mlp": 1.02151453, + "epoch": 0.9349767022395912, + "flos": 18844053166080.0, + "grad_norm": 1.8944711980438764, + "language_loss": 0.7577529, + "learning_rate": 4.4148562786856524e-08, + "loss": 0.77919596, + "num_input_tokens_seen": 335452085, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.11755371, + "step": 15551, + "time_per_iteration": 2.472730875015259 + }, + { + "auxiliary_loss_clip": 0.01118195, + "auxiliary_loss_mlp": 0.01029206, + "balance_loss_clip": 1.04855871, + "balance_loss_mlp": 1.01977646, + "epoch": 0.9350368254922591, + "flos": 24973429777920.0, + "grad_norm": 1.657284329578258, + "language_loss": 0.73796958, + "learning_rate": 4.406722074642255e-08, + "loss": 0.75944358, + "num_input_tokens_seen": 335472130, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.09429932, + "step": 15552, + "time_per_iteration": 2.5034658908843994 + }, + { + "auxiliary_loss_clip": 0.0111238, + "auxiliary_loss_mlp": 0.01031503, + "balance_loss_clip": 1.04131436, + "balance_loss_mlp": 1.02008855, + "epoch": 0.9350969487449271, + "flos": 23070594792960.0, + "grad_norm": 1.85367587240579, + "language_loss": 0.77401173, + "learning_rate": 4.3985952875155386e-08, + "loss": 0.79545057, + "num_input_tokens_seen": 335489970, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.11413574, + "step": 15553, + "time_per_iteration": 3.9388041496276855 + }, + { + "auxiliary_loss_clip": 0.01114647, + "auxiliary_loss_mlp": 0.01034104, + "balance_loss_clip": 1.03990984, + "balance_loss_mlp": 1.02165258, + "epoch": 0.9351570719975951, + "flos": 18625177641600.0, + "grad_norm": 1.606340721102624, + "language_loss": 0.78221405, + "learning_rate": 4.390475917613723e-08, + "loss": 0.80370152, + "num_input_tokens_seen": 335509125, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.12463379, + "step": 15554, + "time_per_iteration": 2.444932699203491 + }, + { + "auxiliary_loss_clip": 0.01101514, + "auxiliary_loss_mlp": 0.0102594, + "balance_loss_clip": 1.03489769, + "balance_loss_mlp": 1.01583064, + "epoch": 0.935217195250263, + "flos": 15888353702400.0, + "grad_norm": 1.5205611672893449, + "language_loss": 0.69179189, + "learning_rate": 4.382363965244695e-08, + "loss": 0.71306646, + "num_input_tokens_seen": 335525620, + "router_z_loss_clip": 0.66650391, + "router_z_loss_mlp": 0.10113525, + "step": 15555, + "time_per_iteration": 2.406167507171631 + }, + { + "auxiliary_loss_clip": 0.01105792, + "auxiliary_loss_mlp": 0.01031347, + "balance_loss_clip": 1.03610992, + "balance_loss_mlp": 1.01965833, + "epoch": 0.935277318502931, + "flos": 24390312387840.0, + "grad_norm": 1.9063866647799788, + "language_loss": 0.75565016, + "learning_rate": 4.374259430715965e-08, + "loss": 0.77702153, + "num_input_tokens_seen": 335547565, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.11694336, + "step": 15556, + "time_per_iteration": 2.5308420658111572 + }, + { + "auxiliary_loss_clip": 0.01115202, + "auxiliary_loss_mlp": 0.01030987, + "balance_loss_clip": 1.04229355, + "balance_loss_mlp": 1.01932216, + "epoch": 0.935337441755599, + "flos": 27600259294080.0, + "grad_norm": 1.5475810926930849, + "language_loss": 0.72162604, + "learning_rate": 4.366162314334953e-08, + "loss": 0.74308795, + "num_input_tokens_seen": 335570285, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11651611, + "step": 15557, + "time_per_iteration": 2.52181077003479 + }, + { + "auxiliary_loss_clip": 0.01115379, + "auxiliary_loss_mlp": 0.01029786, + "balance_loss_clip": 1.0439055, + "balance_loss_mlp": 1.01766801, + "epoch": 0.935397565008267, + "flos": 20482872209280.0, + "grad_norm": 1.9557271018141011, + "language_loss": 0.63227773, + "learning_rate": 4.358072616408681e-08, + "loss": 0.65372938, + "num_input_tokens_seen": 335588600, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.12121582, + "step": 15558, + "time_per_iteration": 2.4317691326141357 + }, + { + "auxiliary_loss_clip": 0.01123357, + "auxiliary_loss_mlp": 0.01029455, + "balance_loss_clip": 1.04981971, + "balance_loss_mlp": 1.01700902, + "epoch": 0.9354576882609349, + "flos": 23654394541440.0, + "grad_norm": 2.1008181456473287, + "language_loss": 0.73228514, + "learning_rate": 4.34999033724388e-08, + "loss": 0.75381327, + "num_input_tokens_seen": 335606235, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.12432861, + "step": 15559, + "time_per_iteration": 2.455789566040039 + }, + { + "auxiliary_loss_clip": 0.01115207, + "auxiliary_loss_mlp": 0.01024163, + "balance_loss_clip": 1.04432666, + "balance_loss_mlp": 1.01391685, + "epoch": 0.9355178115136029, + "flos": 36684904406400.0, + "grad_norm": 1.664389007391199, + "language_loss": 0.63807201, + "learning_rate": 4.341915477147062e-08, + "loss": 0.65946567, + "num_input_tokens_seen": 335628240, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.10229492, + "step": 15560, + "time_per_iteration": 2.5961008071899414 + }, + { + "auxiliary_loss_clip": 0.01132824, + "auxiliary_loss_mlp": 0.01038561, + "balance_loss_clip": 1.05009341, + "balance_loss_mlp": 1.0245595, + "epoch": 0.9355779347662708, + "flos": 14460401450880.0, + "grad_norm": 2.1410142123263203, + "language_loss": 0.63660401, + "learning_rate": 4.3338480364244034e-08, + "loss": 0.6583178, + "num_input_tokens_seen": 335643755, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.14007568, + "step": 15561, + "time_per_iteration": 2.4078426361083984 + }, + { + "auxiliary_loss_clip": 0.01113701, + "auxiliary_loss_mlp": 0.01029566, + "balance_loss_clip": 1.0435586, + "balance_loss_mlp": 1.01786542, + "epoch": 0.9356380580189388, + "flos": 23185976256000.0, + "grad_norm": 1.8568946434304912, + "language_loss": 0.7544601, + "learning_rate": 4.325788015381859e-08, + "loss": 0.77589273, + "num_input_tokens_seen": 335665160, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.11688232, + "step": 15562, + "time_per_iteration": 2.4952619075775146 + }, + { + "auxiliary_loss_clip": 0.010456, + "auxiliary_loss_mlp": 0.01000165, + "balance_loss_clip": 1.02112317, + "balance_loss_mlp": 0.99887455, + "epoch": 0.9356981812716068, + "flos": 67471626090240.0, + "grad_norm": 0.9409490772974989, + "language_loss": 0.6232487, + "learning_rate": 4.31773541432503e-08, + "loss": 0.64370632, + "num_input_tokens_seen": 335715240, + "router_z_loss_clip": 0.24487305, + "router_z_loss_mlp": 0.01290894, + "step": 15563, + "time_per_iteration": 2.9101598262786865 + }, + { + "auxiliary_loss_clip": 0.01115226, + "auxiliary_loss_mlp": 0.01035124, + "balance_loss_clip": 1.04458201, + "balance_loss_mlp": 1.02337027, + "epoch": 0.9357583045242748, + "flos": 24681619687680.0, + "grad_norm": 1.8019517991641494, + "language_loss": 0.78336531, + "learning_rate": 4.3096902335592714e-08, + "loss": 0.80486882, + "num_input_tokens_seen": 335734970, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.11749268, + "step": 15564, + "time_per_iteration": 2.5261948108673096 + }, + { + "auxiliary_loss_clip": 0.01124275, + "auxiliary_loss_mlp": 0.01029649, + "balance_loss_clip": 1.04916835, + "balance_loss_mlp": 1.01671457, + "epoch": 0.9358184277769427, + "flos": 19463727623040.0, + "grad_norm": 1.7318760386124707, + "language_loss": 0.7833569, + "learning_rate": 4.301652473389694e-08, + "loss": 0.80489624, + "num_input_tokens_seen": 335753435, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.1293335, + "step": 15565, + "time_per_iteration": 2.4327406883239746 + }, + { + "auxiliary_loss_clip": 0.01107067, + "auxiliary_loss_mlp": 0.01031499, + "balance_loss_clip": 1.03652358, + "balance_loss_mlp": 1.0183804, + "epoch": 0.9358785510296107, + "flos": 18916987731840.0, + "grad_norm": 3.251695918782872, + "language_loss": 0.72249484, + "learning_rate": 4.2936221341210774e-08, + "loss": 0.74388051, + "num_input_tokens_seen": 335772105, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.13098145, + "step": 15566, + "time_per_iteration": 2.4460792541503906 + }, + { + "auxiliary_loss_clip": 0.01109576, + "auxiliary_loss_mlp": 0.01027135, + "balance_loss_clip": 1.03732908, + "balance_loss_mlp": 1.01599467, + "epoch": 0.9359386742822787, + "flos": 23441265192960.0, + "grad_norm": 1.995830400198295, + "language_loss": 0.67654878, + "learning_rate": 4.285599216057889e-08, + "loss": 0.69791591, + "num_input_tokens_seen": 335789125, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11132812, + "step": 15567, + "time_per_iteration": 2.4583888053894043 + }, + { + "auxiliary_loss_clip": 0.01118222, + "auxiliary_loss_mlp": 0.01033007, + "balance_loss_clip": 1.04700112, + "balance_loss_mlp": 1.02133656, + "epoch": 0.9359987975349466, + "flos": 32744067557760.0, + "grad_norm": 2.350887375422468, + "language_loss": 0.62116599, + "learning_rate": 4.277583719504418e-08, + "loss": 0.64267832, + "num_input_tokens_seen": 335810995, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.11669922, + "step": 15568, + "time_per_iteration": 2.559864044189453 + }, + { + "auxiliary_loss_clip": 0.01113617, + "auxiliary_loss_mlp": 0.01030755, + "balance_loss_clip": 1.04257298, + "balance_loss_mlp": 1.01935828, + "epoch": 0.9360589207876147, + "flos": 22819651401600.0, + "grad_norm": 1.694698244090471, + "language_loss": 0.78787708, + "learning_rate": 4.269575644764556e-08, + "loss": 0.80932081, + "num_input_tokens_seen": 335830580, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.11407471, + "step": 15569, + "time_per_iteration": 2.4799365997314453 + }, + { + "auxiliary_loss_clip": 0.01115545, + "auxiliary_loss_mlp": 0.01031393, + "balance_loss_clip": 1.04168439, + "balance_loss_mlp": 1.01951981, + "epoch": 0.9361190440402826, + "flos": 20885251340160.0, + "grad_norm": 2.5453283582159134, + "language_loss": 0.69739568, + "learning_rate": 4.261574992142014e-08, + "loss": 0.7188651, + "num_input_tokens_seen": 335846515, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.11877441, + "step": 15570, + "time_per_iteration": 2.48612904548645 + }, + { + "auxiliary_loss_clip": 0.01115024, + "auxiliary_loss_mlp": 0.01028269, + "balance_loss_clip": 1.04182935, + "balance_loss_mlp": 1.01658607, + "epoch": 0.9361791672929506, + "flos": 19317822577920.0, + "grad_norm": 2.8627259574776898, + "language_loss": 0.78790355, + "learning_rate": 4.2535817619401726e-08, + "loss": 0.80933654, + "num_input_tokens_seen": 335863350, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11676025, + "step": 15571, + "time_per_iteration": 2.4285457134246826 + }, + { + "auxiliary_loss_clip": 0.01113195, + "auxiliary_loss_mlp": 0.01029897, + "balance_loss_clip": 1.04052758, + "balance_loss_mlp": 1.01811337, + "epoch": 0.9362392905456185, + "flos": 15158182032000.0, + "grad_norm": 1.7488423398864734, + "language_loss": 0.77495384, + "learning_rate": 4.2455959544621224e-08, + "loss": 0.79638469, + "num_input_tokens_seen": 335880510, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11798096, + "step": 15572, + "time_per_iteration": 3.8119189739227295 + }, + { + "auxiliary_loss_clip": 0.01102608, + "auxiliary_loss_mlp": 0.01036979, + "balance_loss_clip": 1.03428674, + "balance_loss_mlp": 1.02557623, + "epoch": 0.9362994137982865, + "flos": 22085888371200.0, + "grad_norm": 3.086044563513898, + "language_loss": 0.7806437, + "learning_rate": 4.237617570010688e-08, + "loss": 0.80203956, + "num_input_tokens_seen": 335899440, + "router_z_loss_clip": 0.68261719, + "router_z_loss_mlp": 0.11395264, + "step": 15573, + "time_per_iteration": 2.463955879211426 + }, + { + "auxiliary_loss_clip": 0.01105851, + "auxiliary_loss_mlp": 0.01029459, + "balance_loss_clip": 1.0366838, + "balance_loss_mlp": 1.01710343, + "epoch": 0.9363595370509544, + "flos": 23512260424320.0, + "grad_norm": 1.856184045247937, + "language_loss": 0.74603772, + "learning_rate": 4.2296466088884044e-08, + "loss": 0.76739085, + "num_input_tokens_seen": 335919540, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 0.12353516, + "step": 15574, + "time_per_iteration": 2.5063536167144775 + }, + { + "auxiliary_loss_clip": 0.01115316, + "auxiliary_loss_mlp": 0.01033649, + "balance_loss_clip": 1.04256451, + "balance_loss_mlp": 1.02160263, + "epoch": 0.9364196603036224, + "flos": 27123473139840.0, + "grad_norm": 1.937419475665583, + "language_loss": 0.68146574, + "learning_rate": 4.221683071397564e-08, + "loss": 0.70295548, + "num_input_tokens_seen": 335939665, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.12042236, + "step": 15575, + "time_per_iteration": 2.49820876121521 + }, + { + "auxiliary_loss_clip": 0.01111864, + "auxiliary_loss_mlp": 0.01029503, + "balance_loss_clip": 1.0433836, + "balance_loss_mlp": 1.01793385, + "epoch": 0.9364797835562904, + "flos": 18479057114880.0, + "grad_norm": 1.5975965268530359, + "language_loss": 0.65455073, + "learning_rate": 4.2137269578401026e-08, + "loss": 0.67596442, + "num_input_tokens_seen": 335958580, + "router_z_loss_clip": 0.68505859, + "router_z_loss_mlp": 0.11572266, + "step": 15576, + "time_per_iteration": 2.4781458377838135 + }, + { + "auxiliary_loss_clip": 0.01110468, + "auxiliary_loss_mlp": 0.01024759, + "balance_loss_clip": 1.03826284, + "balance_loss_mlp": 1.01245105, + "epoch": 0.9365399068089584, + "flos": 13005552890880.0, + "grad_norm": 2.4913607574367713, + "language_loss": 0.76046312, + "learning_rate": 4.2057782685177566e-08, + "loss": 0.78181541, + "num_input_tokens_seen": 335974965, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.12316895, + "step": 15577, + "time_per_iteration": 2.426278591156006 + }, + { + "auxiliary_loss_clip": 0.01110916, + "auxiliary_loss_mlp": 0.01023614, + "balance_loss_clip": 1.03990912, + "balance_loss_mlp": 1.01198554, + "epoch": 0.9366000300616263, + "flos": 25666433850240.0, + "grad_norm": 1.9029989817242823, + "language_loss": 0.52275443, + "learning_rate": 4.1978370037318855e-08, + "loss": 0.54409975, + "num_input_tokens_seen": 335996575, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.11621094, + "step": 15578, + "time_per_iteration": 2.49463152885437 + }, + { + "auxiliary_loss_clip": 0.01107239, + "auxiliary_loss_mlp": 0.01038999, + "balance_loss_clip": 1.03700447, + "balance_loss_mlp": 1.02602339, + "epoch": 0.9366601533142943, + "flos": 21433355948160.0, + "grad_norm": 1.5289551996677995, + "language_loss": 0.70608222, + "learning_rate": 4.189903163783692e-08, + "loss": 0.72754461, + "num_input_tokens_seen": 336017265, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.12969971, + "step": 15579, + "time_per_iteration": 2.436837911605835 + }, + { + "auxiliary_loss_clip": 0.01115083, + "auxiliary_loss_mlp": 0.01025073, + "balance_loss_clip": 1.04251909, + "balance_loss_mlp": 1.0138557, + "epoch": 0.9367202765669622, + "flos": 24093222998400.0, + "grad_norm": 1.9025856481470433, + "language_loss": 0.76551187, + "learning_rate": 4.181976748973959e-08, + "loss": 0.78691339, + "num_input_tokens_seen": 336035905, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11224365, + "step": 15580, + "time_per_iteration": 2.5021350383758545 + }, + { + "auxiliary_loss_clip": 0.01116463, + "auxiliary_loss_mlp": 0.01030649, + "balance_loss_clip": 1.04203928, + "balance_loss_mlp": 1.01811445, + "epoch": 0.9367803998196302, + "flos": 20888842700160.0, + "grad_norm": 1.6127191325491104, + "language_loss": 0.66166425, + "learning_rate": 4.1740577596033114e-08, + "loss": 0.68313533, + "num_input_tokens_seen": 336055585, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.12536621, + "step": 15581, + "time_per_iteration": 2.474893093109131 + }, + { + "auxiliary_loss_clip": 0.01118585, + "auxiliary_loss_mlp": 0.01028106, + "balance_loss_clip": 1.04472923, + "balance_loss_mlp": 1.01579797, + "epoch": 0.9368405230722983, + "flos": 22564362464640.0, + "grad_norm": 2.308628306015048, + "language_loss": 0.76549858, + "learning_rate": 4.166146195972042e-08, + "loss": 0.78696549, + "num_input_tokens_seen": 336076695, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.12316895, + "step": 15582, + "time_per_iteration": 2.543896436691284 + }, + { + "auxiliary_loss_clip": 0.01105906, + "auxiliary_loss_mlp": 0.01035427, + "balance_loss_clip": 1.03651321, + "balance_loss_mlp": 1.02212381, + "epoch": 0.9369006463249662, + "flos": 18880215183360.0, + "grad_norm": 1.6617774863980208, + "language_loss": 0.73919725, + "learning_rate": 4.1582420583800905e-08, + "loss": 0.76061058, + "num_input_tokens_seen": 336094740, + "router_z_loss_clip": 0.69384766, + "router_z_loss_mlp": 0.13293457, + "step": 15583, + "time_per_iteration": 3.966038465499878 + }, + { + "auxiliary_loss_clip": 0.01116115, + "auxiliary_loss_mlp": 0.01042245, + "balance_loss_clip": 1.040838, + "balance_loss_mlp": 1.02783895, + "epoch": 0.9369607695776342, + "flos": 26432516142720.0, + "grad_norm": 2.8994143679994187, + "language_loss": 0.8475858, + "learning_rate": 4.1503453471272376e-08, + "loss": 0.86916935, + "num_input_tokens_seen": 336113985, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.14416504, + "step": 15584, + "time_per_iteration": 2.471771240234375 + }, + { + "auxiliary_loss_clip": 0.01122121, + "auxiliary_loss_mlp": 0.01034931, + "balance_loss_clip": 1.04595792, + "balance_loss_mlp": 1.02208602, + "epoch": 0.9370208928303021, + "flos": 39567346081920.0, + "grad_norm": 1.498681667821627, + "language_loss": 0.72188461, + "learning_rate": 4.1424560625129334e-08, + "loss": 0.74345517, + "num_input_tokens_seen": 336136395, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.128479, + "step": 15585, + "time_per_iteration": 2.608722686767578 + }, + { + "auxiliary_loss_clip": 0.0111133, + "auxiliary_loss_mlp": 0.01022986, + "balance_loss_clip": 1.04236042, + "balance_loss_mlp": 1.01243544, + "epoch": 0.9370810160829701, + "flos": 22963114321920.0, + "grad_norm": 1.8419468570002342, + "language_loss": 0.8063218, + "learning_rate": 4.134574204836316e-08, + "loss": 0.82766497, + "num_input_tokens_seen": 336156345, + "router_z_loss_clip": 0.68896484, + "router_z_loss_mlp": 0.10546875, + "step": 15586, + "time_per_iteration": 2.4317731857299805 + }, + { + "auxiliary_loss_clip": 0.01119346, + "auxiliary_loss_mlp": 0.01034586, + "balance_loss_clip": 1.04572821, + "balance_loss_mlp": 1.02230787, + "epoch": 0.937141139335638, + "flos": 23075048079360.0, + "grad_norm": 1.630291753242165, + "language_loss": 0.76830488, + "learning_rate": 4.126699774396258e-08, + "loss": 0.78984416, + "num_input_tokens_seen": 336176760, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.1227417, + "step": 15587, + "time_per_iteration": 3.924158811569214 + }, + { + "auxiliary_loss_clip": 0.01122167, + "auxiliary_loss_mlp": 0.01029659, + "balance_loss_clip": 1.04550433, + "balance_loss_mlp": 1.01702929, + "epoch": 0.937201262588306, + "flos": 16356664247040.0, + "grad_norm": 2.2610604863703445, + "language_loss": 0.87416816, + "learning_rate": 4.118832771491387e-08, + "loss": 0.89568651, + "num_input_tokens_seen": 336193285, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.12634277, + "step": 15588, + "time_per_iteration": 2.4067776203155518 + }, + { + "auxiliary_loss_clip": 0.0111316, + "auxiliary_loss_mlp": 0.01025084, + "balance_loss_clip": 1.04454935, + "balance_loss_mlp": 1.0143609, + "epoch": 0.937261385840974, + "flos": 20194078861440.0, + "grad_norm": 1.7407260499768535, + "language_loss": 0.77802056, + "learning_rate": 4.11097319642002e-08, + "loss": 0.79940295, + "num_input_tokens_seen": 336211425, + "router_z_loss_clip": 0.68652344, + "router_z_loss_mlp": 0.10723877, + "step": 15589, + "time_per_iteration": 2.4616873264312744 + }, + { + "auxiliary_loss_clip": 0.01115661, + "auxiliary_loss_mlp": 0.01035174, + "balance_loss_clip": 1.04601407, + "balance_loss_mlp": 1.02300239, + "epoch": 0.937321509093642, + "flos": 18295948558080.0, + "grad_norm": 1.7588135207108977, + "language_loss": 0.77728754, + "learning_rate": 4.103121049480163e-08, + "loss": 0.79879594, + "num_input_tokens_seen": 336230205, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.1217041, + "step": 15590, + "time_per_iteration": 2.41782283782959 + }, + { + "auxiliary_loss_clip": 0.01123387, + "auxiliary_loss_mlp": 0.01039889, + "balance_loss_clip": 1.04814303, + "balance_loss_mlp": 1.02653205, + "epoch": 0.9373816323463099, + "flos": 25884662929920.0, + "grad_norm": 2.48262629900546, + "language_loss": 0.71351546, + "learning_rate": 4.095276330969577e-08, + "loss": 0.73514825, + "num_input_tokens_seen": 336252440, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.13360596, + "step": 15591, + "time_per_iteration": 2.571216583251953 + }, + { + "auxiliary_loss_clip": 0.01116693, + "auxiliary_loss_mlp": 0.01034586, + "balance_loss_clip": 1.04109323, + "balance_loss_mlp": 1.02060866, + "epoch": 0.9374417555989779, + "flos": 27198849830400.0, + "grad_norm": 2.2332688935186287, + "language_loss": 0.53438026, + "learning_rate": 4.0874390411857804e-08, + "loss": 0.55589306, + "num_input_tokens_seen": 336273845, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.13977051, + "step": 15592, + "time_per_iteration": 2.4814531803131104 + }, + { + "auxiliary_loss_clip": 0.01109043, + "auxiliary_loss_mlp": 0.01028872, + "balance_loss_clip": 1.03970265, + "balance_loss_mlp": 1.01758242, + "epoch": 0.9375018788516458, + "flos": 23621249266560.0, + "grad_norm": 2.9073565136300683, + "language_loss": 0.67361045, + "learning_rate": 4.0796091804259136e-08, + "loss": 0.69498956, + "num_input_tokens_seen": 336292790, + "router_z_loss_clip": 0.69335938, + "router_z_loss_mlp": 0.112854, + "step": 15593, + "time_per_iteration": 2.53058123588562 + }, + { + "auxiliary_loss_clip": 0.01110923, + "auxiliary_loss_mlp": 0.01030993, + "balance_loss_clip": 1.03692639, + "balance_loss_mlp": 1.01929832, + "epoch": 0.9375620021043138, + "flos": 22678774260480.0, + "grad_norm": 1.4969505705713848, + "language_loss": 0.74088085, + "learning_rate": 4.0717867489868715e-08, + "loss": 0.76229995, + "num_input_tokens_seen": 336312600, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.11682129, + "step": 15594, + "time_per_iteration": 2.455449342727661 + }, + { + "auxiliary_loss_clip": 0.01109008, + "auxiliary_loss_mlp": 0.01025392, + "balance_loss_clip": 1.04034746, + "balance_loss_mlp": 1.01455593, + "epoch": 0.9376221253569819, + "flos": 27560254521600.0, + "grad_norm": 1.6468090054730116, + "language_loss": 0.73741329, + "learning_rate": 4.063971747165351e-08, + "loss": 0.75875723, + "num_input_tokens_seen": 336332770, + "router_z_loss_clip": 0.68652344, + "router_z_loss_mlp": 0.10845947, + "step": 15595, + "time_per_iteration": 2.5244650840759277 + }, + { + "auxiliary_loss_clip": 0.01116302, + "auxiliary_loss_mlp": 0.01024282, + "balance_loss_clip": 1.04178786, + "balance_loss_mlp": 1.01301718, + "epoch": 0.9376822486096498, + "flos": 24129887806080.0, + "grad_norm": 7.043828455400936, + "language_loss": 0.76409554, + "learning_rate": 4.056164175257626e-08, + "loss": 0.78550136, + "num_input_tokens_seen": 336351445, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11260986, + "step": 15596, + "time_per_iteration": 4.127736568450928 + }, + { + "auxiliary_loss_clip": 0.01108384, + "auxiliary_loss_mlp": 0.01032541, + "balance_loss_clip": 1.0366112, + "balance_loss_mlp": 1.02056658, + "epoch": 0.9377423718623178, + "flos": 22784028088320.0, + "grad_norm": 1.7489953829760945, + "language_loss": 0.79079628, + "learning_rate": 4.0483640335597926e-08, + "loss": 0.81220555, + "num_input_tokens_seen": 336368690, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11975098, + "step": 15597, + "time_per_iteration": 2.513394594192505 + }, + { + "auxiliary_loss_clip": 0.01117046, + "auxiliary_loss_mlp": 0.01028901, + "balance_loss_clip": 1.04210472, + "balance_loss_mlp": 1.01672339, + "epoch": 0.9378024951149857, + "flos": 19168900790400.0, + "grad_norm": 1.4360137339590018, + "language_loss": 0.81142676, + "learning_rate": 4.0405713223676363e-08, + "loss": 0.83288622, + "num_input_tokens_seen": 336388165, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.12182617, + "step": 15598, + "time_per_iteration": 2.521822929382324 + }, + { + "auxiliary_loss_clip": 0.0111381, + "auxiliary_loss_mlp": 0.01032027, + "balance_loss_clip": 1.03779292, + "balance_loss_mlp": 1.01956344, + "epoch": 0.9378626183676537, + "flos": 23505508667520.0, + "grad_norm": 2.0401921065800366, + "language_loss": 0.63269293, + "learning_rate": 4.0327860419766994e-08, + "loss": 0.65415132, + "num_input_tokens_seen": 336406475, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.12469482, + "step": 15599, + "time_per_iteration": 2.4512574672698975 + }, + { + "auxiliary_loss_clip": 0.01111653, + "auxiliary_loss_mlp": 0.01029783, + "balance_loss_clip": 1.03854513, + "balance_loss_mlp": 1.01830363, + "epoch": 0.9379227416203216, + "flos": 18405655672320.0, + "grad_norm": 2.4644120949276362, + "language_loss": 0.73503232, + "learning_rate": 4.0250081926821e-08, + "loss": 0.75644672, + "num_input_tokens_seen": 336424690, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11474609, + "step": 15600, + "time_per_iteration": 2.538658857345581 + }, + { + "auxiliary_loss_clip": 0.0111031, + "auxiliary_loss_mlp": 0.01030707, + "balance_loss_clip": 1.03933489, + "balance_loss_mlp": 1.01976347, + "epoch": 0.9379828648729897, + "flos": 17821855923840.0, + "grad_norm": 1.8124797934938972, + "language_loss": 0.69636804, + "learning_rate": 4.0172377747788474e-08, + "loss": 0.71777821, + "num_input_tokens_seen": 336443055, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.109375, + "step": 15601, + "time_per_iteration": 2.4654884338378906 + }, + { + "auxiliary_loss_clip": 0.01035385, + "auxiliary_loss_mlp": 0.01000118, + "balance_loss_clip": 1.01098847, + "balance_loss_mlp": 0.99886197, + "epoch": 0.9380429881256576, + "flos": 68024399466240.0, + "grad_norm": 0.7547623578170285, + "language_loss": 0.58067334, + "learning_rate": 4.009474788561573e-08, + "loss": 0.60102838, + "num_input_tokens_seen": 336510190, + "router_z_loss_clip": 0.24389648, + "router_z_loss_mlp": 0.01255798, + "step": 15602, + "time_per_iteration": 3.2780120372772217 + }, + { + "auxiliary_loss_clip": 0.01116891, + "auxiliary_loss_mlp": 0.01031001, + "balance_loss_clip": 1.04167962, + "balance_loss_mlp": 1.01947975, + "epoch": 0.9381031113783256, + "flos": 20776980769920.0, + "grad_norm": 1.9829216160563068, + "language_loss": 0.72013241, + "learning_rate": 4.001719234324663e-08, + "loss": 0.74161136, + "num_input_tokens_seen": 336529250, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.11523438, + "step": 15603, + "time_per_iteration": 2.5613551139831543 + }, + { + "auxiliary_loss_clip": 0.0110961, + "auxiliary_loss_mlp": 0.01029368, + "balance_loss_clip": 1.04390943, + "balance_loss_mlp": 1.01879978, + "epoch": 0.9381632346309935, + "flos": 19025078734080.0, + "grad_norm": 2.5833800487569496, + "language_loss": 0.76229638, + "learning_rate": 3.993971112362171e-08, + "loss": 0.78368616, + "num_input_tokens_seen": 336548530, + "router_z_loss_clip": 0.65673828, + "router_z_loss_mlp": 0.10577393, + "step": 15604, + "time_per_iteration": 2.454760789871216 + }, + { + "auxiliary_loss_clip": 0.01116739, + "auxiliary_loss_mlp": 0.01031948, + "balance_loss_clip": 1.04368234, + "balance_loss_mlp": 1.01952648, + "epoch": 0.9382233578836615, + "flos": 23513840622720.0, + "grad_norm": 2.1002285998497396, + "language_loss": 0.65186131, + "learning_rate": 3.9862304229679734e-08, + "loss": 0.67334819, + "num_input_tokens_seen": 336568510, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.12420654, + "step": 15605, + "time_per_iteration": 2.453117847442627 + }, + { + "auxiliary_loss_clip": 0.01121857, + "auxiliary_loss_mlp": 0.0103395, + "balance_loss_clip": 1.04396677, + "balance_loss_mlp": 1.02137899, + "epoch": 0.9382834811363294, + "flos": 43067882016000.0, + "grad_norm": 1.7247957120260138, + "language_loss": 0.67630386, + "learning_rate": 3.9784971664355683e-08, + "loss": 0.69786191, + "num_input_tokens_seen": 336592020, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.12585449, + "step": 15606, + "time_per_iteration": 2.6403417587280273 + }, + { + "auxiliary_loss_clip": 0.01107642, + "auxiliary_loss_mlp": 0.01026362, + "balance_loss_clip": 1.04000533, + "balance_loss_mlp": 1.01549053, + "epoch": 0.9383436043889974, + "flos": 16436242828800.0, + "grad_norm": 1.7670148495416915, + "language_loss": 0.77242279, + "learning_rate": 3.970771343058166e-08, + "loss": 0.7937628, + "num_input_tokens_seen": 336610010, + "router_z_loss_clip": 0.67626953, + "router_z_loss_mlp": 0.10870361, + "step": 15607, + "time_per_iteration": 2.4258508682250977 + }, + { + "auxiliary_loss_clip": 0.01113372, + "auxiliary_loss_mlp": 0.01026636, + "balance_loss_clip": 1.04118633, + "balance_loss_mlp": 1.01543593, + "epoch": 0.9384037276416655, + "flos": 20740603271040.0, + "grad_norm": 1.7321560629709176, + "language_loss": 0.82747138, + "learning_rate": 3.963052953128776e-08, + "loss": 0.84887147, + "num_input_tokens_seen": 336628520, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11193848, + "step": 15608, + "time_per_iteration": 2.46291184425354 + }, + { + "auxiliary_loss_clip": 0.011173, + "auxiliary_loss_mlp": 0.0103482, + "balance_loss_clip": 1.0438211, + "balance_loss_mlp": 1.02257729, + "epoch": 0.9384638508943334, + "flos": 19062677295360.0, + "grad_norm": 2.0558156938065046, + "language_loss": 0.69192308, + "learning_rate": 3.9553419969400536e-08, + "loss": 0.71344435, + "num_input_tokens_seen": 336647365, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.12231445, + "step": 15609, + "time_per_iteration": 2.40691876411438 + }, + { + "auxiliary_loss_clip": 0.01122676, + "auxiliary_loss_mlp": 0.01031168, + "balance_loss_clip": 1.04767156, + "balance_loss_mlp": 1.01857972, + "epoch": 0.9385239741470014, + "flos": 23404887694080.0, + "grad_norm": 1.936521945199814, + "language_loss": 0.75294524, + "learning_rate": 3.9476384747844316e-08, + "loss": 0.77448368, + "num_input_tokens_seen": 336667165, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.12597656, + "step": 15610, + "time_per_iteration": 2.4709174633026123 + }, + { + "auxiliary_loss_clip": 0.01115566, + "auxiliary_loss_mlp": 0.0102777, + "balance_loss_clip": 1.04321122, + "balance_loss_mlp": 1.01655233, + "epoch": 0.9385840973996693, + "flos": 12824742804480.0, + "grad_norm": 2.2228173697181095, + "language_loss": 0.75414491, + "learning_rate": 3.939942386953987e-08, + "loss": 0.77557826, + "num_input_tokens_seen": 336684130, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11224365, + "step": 15611, + "time_per_iteration": 2.4285616874694824 + }, + { + "auxiliary_loss_clip": 0.01106758, + "auxiliary_loss_mlp": 0.01027981, + "balance_loss_clip": 1.03646016, + "balance_loss_mlp": 1.01658499, + "epoch": 0.9386442206523373, + "flos": 15486980152320.0, + "grad_norm": 1.8177470672378488, + "language_loss": 0.66063589, + "learning_rate": 3.9322537337405756e-08, + "loss": 0.68198329, + "num_input_tokens_seen": 336701520, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.11407471, + "step": 15612, + "time_per_iteration": 2.47159743309021 + }, + { + "auxiliary_loss_clip": 0.01109315, + "auxiliary_loss_mlp": 0.01030563, + "balance_loss_clip": 1.04005384, + "balance_loss_mlp": 1.01803398, + "epoch": 0.9387043439050052, + "flos": 21178821196800.0, + "grad_norm": 1.9478559039901746, + "language_loss": 0.57118976, + "learning_rate": 3.924572515435742e-08, + "loss": 0.59258854, + "num_input_tokens_seen": 336720675, + "router_z_loss_clip": 0.69238281, + "router_z_loss_mlp": 0.12524414, + "step": 15613, + "time_per_iteration": 2.4464468955993652 + }, + { + "auxiliary_loss_clip": 0.01110984, + "auxiliary_loss_mlp": 0.01037137, + "balance_loss_clip": 1.03763008, + "balance_loss_mlp": 1.02501988, + "epoch": 0.9387644671576733, + "flos": 27668273696640.0, + "grad_norm": 2.210640099762353, + "language_loss": 0.71022344, + "learning_rate": 3.916898732330764e-08, + "loss": 0.73170471, + "num_input_tokens_seen": 336741005, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.12121582, + "step": 15614, + "time_per_iteration": 2.514009952545166 + }, + { + "auxiliary_loss_clip": 0.01121467, + "auxiliary_loss_mlp": 0.01028099, + "balance_loss_clip": 1.0469079, + "balance_loss_mlp": 1.01598752, + "epoch": 0.9388245904103412, + "flos": 18836331742080.0, + "grad_norm": 1.97970910731154, + "language_loss": 0.81067932, + "learning_rate": 3.9092323847166544e-08, + "loss": 0.83217502, + "num_input_tokens_seen": 336757990, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.12121582, + "step": 15615, + "time_per_iteration": 4.002379655838013 + }, + { + "auxiliary_loss_clip": 0.0111375, + "auxiliary_loss_mlp": 0.01027644, + "balance_loss_clip": 1.04378593, + "balance_loss_mlp": 1.01649833, + "epoch": 0.9388847136630092, + "flos": 25483828083840.0, + "grad_norm": 1.5443030852292559, + "language_loss": 0.71949381, + "learning_rate": 3.901573472884134e-08, + "loss": 0.74090773, + "num_input_tokens_seen": 336777705, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.11138916, + "step": 15616, + "time_per_iteration": 2.5439600944519043 + }, + { + "auxiliary_loss_clip": 0.01117616, + "auxiliary_loss_mlp": 0.01027826, + "balance_loss_clip": 1.04575479, + "balance_loss_mlp": 1.01642942, + "epoch": 0.9389448369156771, + "flos": 18734992496640.0, + "grad_norm": 1.9112769295532552, + "language_loss": 0.6596961, + "learning_rate": 3.89392199712355e-08, + "loss": 0.68115044, + "num_input_tokens_seen": 336798275, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11401367, + "step": 15617, + "time_per_iteration": 2.4820570945739746 + }, + { + "auxiliary_loss_clip": 0.01109725, + "auxiliary_loss_mlp": 0.01033121, + "balance_loss_clip": 1.03578115, + "balance_loss_mlp": 1.02011538, + "epoch": 0.9390049601683451, + "flos": 21717839664000.0, + "grad_norm": 2.036372898978411, + "language_loss": 0.73426867, + "learning_rate": 3.886277957725092e-08, + "loss": 0.75569713, + "num_input_tokens_seen": 336813835, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.13031006, + "step": 15618, + "time_per_iteration": 2.4574837684631348 + }, + { + "auxiliary_loss_clip": 0.01111426, + "auxiliary_loss_mlp": 0.01031164, + "balance_loss_clip": 1.03644049, + "balance_loss_mlp": 1.01783073, + "epoch": 0.939065083421013, + "flos": 19391224020480.0, + "grad_norm": 1.8537093502209918, + "language_loss": 0.70070255, + "learning_rate": 3.878641354978662e-08, + "loss": 0.72212851, + "num_input_tokens_seen": 336832210, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.13342285, + "step": 15619, + "time_per_iteration": 2.471867799758911 + }, + { + "auxiliary_loss_clip": 0.0111687, + "auxiliary_loss_mlp": 0.0102941, + "balance_loss_clip": 1.04319048, + "balance_loss_mlp": 1.01745939, + "epoch": 0.939125206673681, + "flos": 24681511946880.0, + "grad_norm": 1.7393929928683554, + "language_loss": 0.77732319, + "learning_rate": 3.8710121891737834e-08, + "loss": 0.79878598, + "num_input_tokens_seen": 336851380, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11962891, + "step": 15620, + "time_per_iteration": 2.5067808628082275 + }, + { + "auxiliary_loss_clip": 0.01112073, + "auxiliary_loss_mlp": 0.01026183, + "balance_loss_clip": 1.04320323, + "balance_loss_mlp": 1.01502514, + "epoch": 0.9391853299263491, + "flos": 16325961096960.0, + "grad_norm": 6.575461308974157, + "language_loss": 0.74238515, + "learning_rate": 3.8633904605998025e-08, + "loss": 0.76376772, + "num_input_tokens_seen": 336868525, + "router_z_loss_clip": 0.68847656, + "router_z_loss_mlp": 0.11157227, + "step": 15621, + "time_per_iteration": 2.4492745399475098 + }, + { + "auxiliary_loss_clip": 0.01120113, + "auxiliary_loss_mlp": 0.01035558, + "balance_loss_clip": 1.04427457, + "balance_loss_mlp": 1.02312446, + "epoch": 0.939245453179017, + "flos": 11655778590720.0, + "grad_norm": 2.1445610225023466, + "language_loss": 0.66157097, + "learning_rate": 3.855776169545688e-08, + "loss": 0.68312764, + "num_input_tokens_seen": 336886200, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.12445068, + "step": 15622, + "time_per_iteration": 2.4096148014068604 + }, + { + "auxiliary_loss_clip": 0.01107534, + "auxiliary_loss_mlp": 0.01032196, + "balance_loss_clip": 1.03730273, + "balance_loss_mlp": 1.01938105, + "epoch": 0.939305576431685, + "flos": 23148700917120.0, + "grad_norm": 1.6145976136477471, + "language_loss": 0.71797901, + "learning_rate": 3.848169316300209e-08, + "loss": 0.73937631, + "num_input_tokens_seen": 336905815, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.1282959, + "step": 15623, + "time_per_iteration": 2.4623913764953613 + }, + { + "auxiliary_loss_clip": 0.0111586, + "auxiliary_loss_mlp": 0.01030041, + "balance_loss_clip": 1.04325604, + "balance_loss_mlp": 1.01879382, + "epoch": 0.9393656996843529, + "flos": 33287790706560.0, + "grad_norm": 2.047606434748681, + "language_loss": 0.72202277, + "learning_rate": 3.84056990115178e-08, + "loss": 0.74348176, + "num_input_tokens_seen": 336928460, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11248779, + "step": 15624, + "time_per_iteration": 2.557170867919922 + }, + { + "auxiliary_loss_clip": 0.01112806, + "auxiliary_loss_mlp": 0.01030105, + "balance_loss_clip": 1.04195237, + "balance_loss_mlp": 1.01808286, + "epoch": 0.9394258229370209, + "flos": 21689434984320.0, + "grad_norm": 1.807520375119615, + "language_loss": 0.8947401, + "learning_rate": 3.832977924388614e-08, + "loss": 0.91616917, + "num_input_tokens_seen": 336948320, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.12023926, + "step": 15625, + "time_per_iteration": 2.4502854347229004 + }, + { + "auxiliary_loss_clip": 0.01117638, + "auxiliary_loss_mlp": 0.01031579, + "balance_loss_clip": 1.04505801, + "balance_loss_mlp": 1.01932979, + "epoch": 0.9394859461896888, + "flos": 23874203819520.0, + "grad_norm": 1.9488874303227404, + "language_loss": 0.83816469, + "learning_rate": 3.825393386298592e-08, + "loss": 0.85965687, + "num_input_tokens_seen": 336967670, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.12261963, + "step": 15626, + "time_per_iteration": 2.445937156677246 + }, + { + "auxiliary_loss_clip": 0.01043869, + "auxiliary_loss_mlp": 0.01002235, + "balance_loss_clip": 1.01967847, + "balance_loss_mlp": 1.00104713, + "epoch": 0.9395460694423569, + "flos": 61566116993280.0, + "grad_norm": 0.7731674544309896, + "language_loss": 0.56126171, + "learning_rate": 3.8178162871693284e-08, + "loss": 0.5817228, + "num_input_tokens_seen": 337028395, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.0118866, + "step": 15627, + "time_per_iteration": 4.5155932903289795 + }, + { + "auxiliary_loss_clip": 0.01108418, + "auxiliary_loss_mlp": 0.01029101, + "balance_loss_clip": 1.03943384, + "balance_loss_mlp": 1.01806211, + "epoch": 0.9396061926950248, + "flos": 20995712640000.0, + "grad_norm": 1.4365097438960488, + "language_loss": 0.7016468, + "learning_rate": 3.810246627288105e-08, + "loss": 0.72302204, + "num_input_tokens_seen": 337048150, + "router_z_loss_clip": 0.68994141, + "router_z_loss_mlp": 0.11022949, + "step": 15628, + "time_per_iteration": 2.4630911350250244 + }, + { + "auxiliary_loss_clip": 0.01111353, + "auxiliary_loss_mlp": 0.01026754, + "balance_loss_clip": 1.04053998, + "balance_loss_mlp": 1.01472545, + "epoch": 0.9396663159476928, + "flos": 27487786832640.0, + "grad_norm": 1.554753041560225, + "language_loss": 0.75392318, + "learning_rate": 3.8026844069420025e-08, + "loss": 0.7753042, + "num_input_tokens_seen": 337069315, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.12017822, + "step": 15629, + "time_per_iteration": 2.4976446628570557 + }, + { + "auxiliary_loss_clip": 0.01111793, + "auxiliary_loss_mlp": 0.01025629, + "balance_loss_clip": 1.04423273, + "balance_loss_mlp": 1.01524019, + "epoch": 0.9397264392003607, + "flos": 19427457864960.0, + "grad_norm": 1.950143324064461, + "language_loss": 0.74594378, + "learning_rate": 3.795129626417748e-08, + "loss": 0.76731801, + "num_input_tokens_seen": 337087765, + "router_z_loss_clip": 0.67578125, + "router_z_loss_mlp": 0.10388184, + "step": 15630, + "time_per_iteration": 4.059911489486694 + }, + { + "auxiliary_loss_clip": 0.01115964, + "auxiliary_loss_mlp": 0.01031723, + "balance_loss_clip": 1.04774976, + "balance_loss_mlp": 1.02085125, + "epoch": 0.9397865624530287, + "flos": 18004820826240.0, + "grad_norm": 1.8283937921704603, + "language_loss": 0.69183207, + "learning_rate": 3.787582286001845e-08, + "loss": 0.71330893, + "num_input_tokens_seen": 337106265, + "router_z_loss_clip": 0.68212891, + "router_z_loss_mlp": 0.10864258, + "step": 15631, + "time_per_iteration": 2.4338698387145996 + }, + { + "auxiliary_loss_clip": 0.01109206, + "auxiliary_loss_mlp": 0.01033533, + "balance_loss_clip": 1.03935385, + "balance_loss_mlp": 1.02273858, + "epoch": 0.9398466857056966, + "flos": 22564613859840.0, + "grad_norm": 1.456084422742979, + "language_loss": 0.75140965, + "learning_rate": 3.7800423859805086e-08, + "loss": 0.77283704, + "num_input_tokens_seen": 337126090, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.10797119, + "step": 15632, + "time_per_iteration": 2.5467073917388916 + }, + { + "auxiliary_loss_clip": 0.01121005, + "auxiliary_loss_mlp": 0.01031862, + "balance_loss_clip": 1.04466653, + "balance_loss_mlp": 1.01837993, + "epoch": 0.9399068089583646, + "flos": 24535678728960.0, + "grad_norm": 1.7977749415503297, + "language_loss": 0.74258709, + "learning_rate": 3.772509926639622e-08, + "loss": 0.76411575, + "num_input_tokens_seen": 337145655, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.13482666, + "step": 15633, + "time_per_iteration": 2.4560654163360596 + }, + { + "auxiliary_loss_clip": 0.01121037, + "auxiliary_loss_mlp": 0.01035376, + "balance_loss_clip": 1.04429173, + "balance_loss_mlp": 1.02220309, + "epoch": 0.9399669322110327, + "flos": 25630343660160.0, + "grad_norm": 2.219751966313502, + "language_loss": 0.72372544, + "learning_rate": 3.764984908264823e-08, + "loss": 0.74528962, + "num_input_tokens_seen": 337164805, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.13153076, + "step": 15634, + "time_per_iteration": 2.488046169281006 + }, + { + "auxiliary_loss_clip": 0.01117651, + "auxiliary_loss_mlp": 0.01027583, + "balance_loss_clip": 1.04384398, + "balance_loss_mlp": 1.01610887, + "epoch": 0.9400270554637006, + "flos": 17089385783040.0, + "grad_norm": 1.7379720397985916, + "language_loss": 0.68833113, + "learning_rate": 3.75746733114144e-08, + "loss": 0.70978343, + "num_input_tokens_seen": 337182280, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11474609, + "step": 15635, + "time_per_iteration": 2.4315450191497803 + }, + { + "auxiliary_loss_clip": 0.01107684, + "auxiliary_loss_mlp": 0.01024433, + "balance_loss_clip": 1.03902495, + "balance_loss_mlp": 1.01364446, + "epoch": 0.9400871787163686, + "flos": 22055113393920.0, + "grad_norm": 1.6415217628286585, + "language_loss": 0.74099898, + "learning_rate": 3.7499571955545985e-08, + "loss": 0.76232016, + "num_input_tokens_seen": 337203495, + "router_z_loss_clip": 0.68652344, + "router_z_loss_mlp": 0.10791016, + "step": 15636, + "time_per_iteration": 2.483032464981079 + }, + { + "auxiliary_loss_clip": 0.01116985, + "auxiliary_loss_mlp": 0.01031732, + "balance_loss_clip": 1.04477906, + "balance_loss_mlp": 1.01993024, + "epoch": 0.9401473019690365, + "flos": 16982767238400.0, + "grad_norm": 2.051379497038898, + "language_loss": 0.82978642, + "learning_rate": 3.7424545017890054e-08, + "loss": 0.8512736, + "num_input_tokens_seen": 337220435, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11791992, + "step": 15637, + "time_per_iteration": 2.4290525913238525 + }, + { + "auxiliary_loss_clip": 0.01112275, + "auxiliary_loss_mlp": 0.01031618, + "balance_loss_clip": 1.0412457, + "balance_loss_mlp": 1.01804614, + "epoch": 0.9402074252217045, + "flos": 19681956702720.0, + "grad_norm": 2.032717144712472, + "language_loss": 0.68986535, + "learning_rate": 3.7349592501292325e-08, + "loss": 0.71130431, + "num_input_tokens_seen": 337238095, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.13574219, + "step": 15638, + "time_per_iteration": 2.5018885135650635 + }, + { + "auxiliary_loss_clip": 0.01112316, + "auxiliary_loss_mlp": 0.01034646, + "balance_loss_clip": 1.04294419, + "balance_loss_mlp": 1.02312398, + "epoch": 0.9402675484743724, + "flos": 24754302858240.0, + "grad_norm": 1.7471199517000038, + "language_loss": 0.85172832, + "learning_rate": 3.727471440859498e-08, + "loss": 0.87319791, + "num_input_tokens_seen": 337256645, + "router_z_loss_clip": 0.69335938, + "router_z_loss_mlp": 0.11529541, + "step": 15639, + "time_per_iteration": 2.4543678760528564 + }, + { + "auxiliary_loss_clip": 0.01114972, + "auxiliary_loss_mlp": 0.01034103, + "balance_loss_clip": 1.03949511, + "balance_loss_mlp": 1.02219999, + "epoch": 0.9403276717270405, + "flos": 25558630156800.0, + "grad_norm": 1.5321823862206025, + "language_loss": 0.78573328, + "learning_rate": 3.719991074263662e-08, + "loss": 0.80722404, + "num_input_tokens_seen": 337278360, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.11889648, + "step": 15640, + "time_per_iteration": 3.8699541091918945 + }, + { + "auxiliary_loss_clip": 0.01117995, + "auxiliary_loss_mlp": 0.01033563, + "balance_loss_clip": 1.0401876, + "balance_loss_mlp": 1.02122533, + "epoch": 0.9403877949797084, + "flos": 26689852154880.0, + "grad_norm": 1.4607337128118223, + "language_loss": 0.74350297, + "learning_rate": 3.7125181506254544e-08, + "loss": 0.76501858, + "num_input_tokens_seen": 337302480, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.12322998, + "step": 15641, + "time_per_iteration": 2.518397331237793 + }, + { + "auxiliary_loss_clip": 0.01117201, + "auxiliary_loss_mlp": 0.01030636, + "balance_loss_clip": 1.04233885, + "balance_loss_mlp": 1.01788032, + "epoch": 0.9404479182323764, + "flos": 15011666455680.0, + "grad_norm": 1.9321316618572404, + "language_loss": 0.81842536, + "learning_rate": 3.7050526702282256e-08, + "loss": 0.83990371, + "num_input_tokens_seen": 337316600, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.12762451, + "step": 15642, + "time_per_iteration": 2.432703971862793 + }, + { + "auxiliary_loss_clip": 0.01110957, + "auxiliary_loss_mlp": 0.01026876, + "balance_loss_clip": 1.04242373, + "balance_loss_mlp": 1.01636744, + "epoch": 0.9405080414850443, + "flos": 24973573432320.0, + "grad_norm": 2.146667390203878, + "language_loss": 0.68412018, + "learning_rate": 3.697594633355084e-08, + "loss": 0.70549846, + "num_input_tokens_seen": 337336895, + "router_z_loss_clip": 0.68505859, + "router_z_loss_mlp": 0.10516357, + "step": 15643, + "time_per_iteration": 2.543457508087158 + }, + { + "auxiliary_loss_clip": 0.0112382, + "auxiliary_loss_mlp": 0.01039978, + "balance_loss_clip": 1.045506, + "balance_loss_mlp": 1.02700245, + "epoch": 0.9405681647377123, + "flos": 20844743777280.0, + "grad_norm": 1.9510709603982819, + "language_loss": 0.76817143, + "learning_rate": 3.6901440402888226e-08, + "loss": 0.78980935, + "num_input_tokens_seen": 337355105, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.12988281, + "step": 15644, + "time_per_iteration": 2.4864609241485596 + }, + { + "auxiliary_loss_clip": 0.01112369, + "auxiliary_loss_mlp": 0.01028728, + "balance_loss_clip": 1.04329705, + "balance_loss_mlp": 1.01817751, + "epoch": 0.9406282879903802, + "flos": 23805578885760.0, + "grad_norm": 1.7570816777875533, + "language_loss": 0.672526, + "learning_rate": 3.682700891311974e-08, + "loss": 0.69393694, + "num_input_tokens_seen": 337374905, + "router_z_loss_clip": 0.69042969, + "router_z_loss_mlp": 0.10546875, + "step": 15645, + "time_per_iteration": 2.5473010540008545 + }, + { + "auxiliary_loss_clip": 0.01109424, + "auxiliary_loss_mlp": 0.01029389, + "balance_loss_clip": 1.04220963, + "balance_loss_mlp": 1.0183506, + "epoch": 0.9406884112430483, + "flos": 27674953626240.0, + "grad_norm": 1.5387578237007762, + "language_loss": 0.70261794, + "learning_rate": 3.6752651867067774e-08, + "loss": 0.72400606, + "num_input_tokens_seen": 337397130, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.11047363, + "step": 15646, + "time_per_iteration": 2.5406653881073 + }, + { + "auxiliary_loss_clip": 0.01113047, + "auxiliary_loss_mlp": 0.01029897, + "balance_loss_clip": 1.04357529, + "balance_loss_mlp": 1.01884651, + "epoch": 0.9407485344957163, + "flos": 23075048079360.0, + "grad_norm": 1.557547549146386, + "language_loss": 0.74055505, + "learning_rate": 3.667836926755208e-08, + "loss": 0.76198447, + "num_input_tokens_seen": 337418660, + "router_z_loss_clip": 0.69580078, + "router_z_loss_mlp": 0.11047363, + "step": 15647, + "time_per_iteration": 2.4954705238342285 + }, + { + "auxiliary_loss_clip": 0.01040472, + "auxiliary_loss_mlp": 0.01004227, + "balance_loss_clip": 1.01519704, + "balance_loss_mlp": 1.00286341, + "epoch": 0.9408086577483842, + "flos": 71014034304000.0, + "grad_norm": 0.8837558517358128, + "language_loss": 0.63461834, + "learning_rate": 3.660416111738907e-08, + "loss": 0.6550653, + "num_input_tokens_seen": 337478055, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.0136261, + "step": 15648, + "time_per_iteration": 3.1437973976135254 + }, + { + "auxiliary_loss_clip": 0.01114026, + "auxiliary_loss_mlp": 0.01028308, + "balance_loss_clip": 1.04574823, + "balance_loss_mlp": 1.01789522, + "epoch": 0.9408687810010522, + "flos": 23730956380800.0, + "grad_norm": 1.6574383159496087, + "language_loss": 0.66420138, + "learning_rate": 3.653002741939337e-08, + "loss": 0.68562478, + "num_input_tokens_seen": 337499405, + "router_z_loss_clip": 0.68261719, + "router_z_loss_mlp": 0.10406494, + "step": 15649, + "time_per_iteration": 2.5303688049316406 + }, + { + "auxiliary_loss_clip": 0.01114196, + "auxiliary_loss_mlp": 0.01025824, + "balance_loss_clip": 1.04356158, + "balance_loss_mlp": 1.0148505, + "epoch": 0.9409289042537201, + "flos": 18369314087040.0, + "grad_norm": 2.4511334816297143, + "language_loss": 0.77631879, + "learning_rate": 3.645596817637586e-08, + "loss": 0.797719, + "num_input_tokens_seen": 337517195, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.10980225, + "step": 15650, + "time_per_iteration": 2.4031174182891846 + }, + { + "auxiliary_loss_clip": 0.01114066, + "auxiliary_loss_mlp": 0.0103038, + "balance_loss_clip": 1.04225016, + "balance_loss_mlp": 1.01931727, + "epoch": 0.9409890275063881, + "flos": 23878333883520.0, + "grad_norm": 1.759203184070568, + "language_loss": 0.74399114, + "learning_rate": 3.638198339114451e-08, + "loss": 0.76543558, + "num_input_tokens_seen": 337535245, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11071777, + "step": 15651, + "time_per_iteration": 2.4702935218811035 + }, + { + "auxiliary_loss_clip": 0.01114353, + "auxiliary_loss_mlp": 0.01030399, + "balance_loss_clip": 1.04192352, + "balance_loss_mlp": 1.0188719, + "epoch": 0.941049150759056, + "flos": 16545088016640.0, + "grad_norm": 1.7516866664863076, + "language_loss": 0.72458208, + "learning_rate": 3.630807306650507e-08, + "loss": 0.74602962, + "num_input_tokens_seen": 337553040, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11529541, + "step": 15652, + "time_per_iteration": 2.401435375213623 + }, + { + "auxiliary_loss_clip": 0.01120432, + "auxiliary_loss_mlp": 0.01034768, + "balance_loss_clip": 1.04354191, + "balance_loss_mlp": 1.02247715, + "epoch": 0.9411092740117241, + "flos": 25118401069440.0, + "grad_norm": 1.9089755284159555, + "language_loss": 0.66581762, + "learning_rate": 3.6234237205260645e-08, + "loss": 0.68736959, + "num_input_tokens_seen": 337574580, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.12298584, + "step": 15653, + "time_per_iteration": 2.493945598602295 + }, + { + "auxiliary_loss_clip": 0.01120094, + "auxiliary_loss_mlp": 0.0103252, + "balance_loss_clip": 1.04696107, + "balance_loss_mlp": 1.02022374, + "epoch": 0.941169397264392, + "flos": 21142264129920.0, + "grad_norm": 2.031560569166215, + "language_loss": 0.78118247, + "learning_rate": 3.6160475810210536e-08, + "loss": 0.80270863, + "num_input_tokens_seen": 337593010, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.12304688, + "step": 15654, + "time_per_iteration": 2.4297239780426025 + }, + { + "auxiliary_loss_clip": 0.01115872, + "auxiliary_loss_mlp": 0.01028497, + "balance_loss_clip": 1.04083133, + "balance_loss_mlp": 1.01750565, + "epoch": 0.94122952051706, + "flos": 38508914995200.0, + "grad_norm": 1.456161906660957, + "language_loss": 0.70195079, + "learning_rate": 3.6086788884152065e-08, + "loss": 0.72339451, + "num_input_tokens_seen": 337616170, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.10992432, + "step": 15655, + "time_per_iteration": 2.690302848815918 + }, + { + "auxiliary_loss_clip": 0.01113665, + "auxiliary_loss_mlp": 0.01029967, + "balance_loss_clip": 1.04221427, + "balance_loss_mlp": 1.01742029, + "epoch": 0.9412896437697279, + "flos": 18369206346240.0, + "grad_norm": 2.0129209640300436, + "language_loss": 0.72030085, + "learning_rate": 3.601317642987944e-08, + "loss": 0.74173713, + "num_input_tokens_seen": 337635215, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.12542725, + "step": 15656, + "time_per_iteration": 2.4067542552948 + }, + { + "auxiliary_loss_clip": 0.01113414, + "auxiliary_loss_mlp": 0.0102254, + "balance_loss_clip": 1.04282117, + "balance_loss_mlp": 1.01176333, + "epoch": 0.9413497670223959, + "flos": 25884950238720.0, + "grad_norm": 2.659626745223871, + "language_loss": 0.77680027, + "learning_rate": 3.593963845018377e-08, + "loss": 0.79815984, + "num_input_tokens_seen": 337654195, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.10772705, + "step": 15657, + "time_per_iteration": 2.541592836380005 + }, + { + "auxiliary_loss_clip": 0.01112297, + "auxiliary_loss_mlp": 0.01026531, + "balance_loss_clip": 1.04011774, + "balance_loss_mlp": 1.01436627, + "epoch": 0.9414098902750638, + "flos": 16618309891200.0, + "grad_norm": 9.052628810608669, + "language_loss": 0.84262228, + "learning_rate": 3.586617494785371e-08, + "loss": 0.86401057, + "num_input_tokens_seen": 337671810, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.1217041, + "step": 15658, + "time_per_iteration": 2.4442431926727295 + }, + { + "auxiliary_loss_clip": 0.01121054, + "auxiliary_loss_mlp": 0.01039083, + "balance_loss_clip": 1.04297376, + "balance_loss_mlp": 1.02421713, + "epoch": 0.9414700135277319, + "flos": 18625033987200.0, + "grad_norm": 1.7463808909912173, + "language_loss": 0.705073, + "learning_rate": 3.5792785925675254e-08, + "loss": 0.72667444, + "num_input_tokens_seen": 337689410, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.14855957, + "step": 15659, + "time_per_iteration": 4.00462794303894 + }, + { + "auxiliary_loss_clip": 0.01112684, + "auxiliary_loss_mlp": 0.01042897, + "balance_loss_clip": 1.0399158, + "balance_loss_mlp": 1.03131557, + "epoch": 0.9415301367803999, + "flos": 26280146649600.0, + "grad_norm": 1.795967165776979, + "language_loss": 0.79523224, + "learning_rate": 3.571947138643172e-08, + "loss": 0.81678802, + "num_input_tokens_seen": 337709950, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11572266, + "step": 15660, + "time_per_iteration": 2.499831199645996 + }, + { + "auxiliary_loss_clip": 0.01113549, + "auxiliary_loss_mlp": 0.01028023, + "balance_loss_clip": 1.04386628, + "balance_loss_mlp": 1.01706135, + "epoch": 0.9415902600330678, + "flos": 23261388860160.0, + "grad_norm": 1.7508467487018673, + "language_loss": 0.6818316, + "learning_rate": 3.564623133290201e-08, + "loss": 0.70324731, + "num_input_tokens_seen": 337731320, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.10961914, + "step": 15661, + "time_per_iteration": 2.603196144104004 + }, + { + "auxiliary_loss_clip": 0.01112455, + "auxiliary_loss_mlp": 0.01025073, + "balance_loss_clip": 1.04109466, + "balance_loss_mlp": 1.01375437, + "epoch": 0.9416503832857358, + "flos": 14719138093440.0, + "grad_norm": 2.3080448203317796, + "language_loss": 0.66547602, + "learning_rate": 3.557306576786434e-08, + "loss": 0.68685132, + "num_input_tokens_seen": 337747720, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.11322021, + "step": 15662, + "time_per_iteration": 2.472174882888794 + }, + { + "auxiliary_loss_clip": 0.01090265, + "auxiliary_loss_mlp": 0.01006868, + "balance_loss_clip": 1.06503606, + "balance_loss_mlp": 1.00519371, + "epoch": 0.9417105065384037, + "flos": 70312698276480.0, + "grad_norm": 0.7776385631780194, + "language_loss": 0.59264863, + "learning_rate": 3.5499974694092935e-08, + "loss": 0.61361992, + "num_input_tokens_seen": 337806930, + "router_z_loss_clip": 0.25244141, + "router_z_loss_mlp": 0.01673889, + "step": 15663, + "time_per_iteration": 3.19331955909729 + }, + { + "auxiliary_loss_clip": 0.01116204, + "auxiliary_loss_mlp": 0.01033142, + "balance_loss_clip": 1.04141951, + "balance_loss_mlp": 1.02069688, + "epoch": 0.9417706297910717, + "flos": 34057895322240.0, + "grad_norm": 1.8018555612885434, + "language_loss": 0.66554201, + "learning_rate": 3.542695811435914e-08, + "loss": 0.68703544, + "num_input_tokens_seen": 337828100, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.12451172, + "step": 15664, + "time_per_iteration": 2.6062326431274414 + }, + { + "auxiliary_loss_clip": 0.01114362, + "auxiliary_loss_mlp": 0.0103602, + "balance_loss_clip": 1.04294574, + "balance_loss_mlp": 1.02292514, + "epoch": 0.9418307530437396, + "flos": 16471614746880.0, + "grad_norm": 2.0528079578708858, + "language_loss": 0.72951186, + "learning_rate": 3.535401603143207e-08, + "loss": 0.75101566, + "num_input_tokens_seen": 337844805, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.13098145, + "step": 15665, + "time_per_iteration": 2.4513306617736816 + }, + { + "auxiliary_loss_clip": 0.01108771, + "auxiliary_loss_mlp": 0.01030111, + "balance_loss_clip": 1.04067802, + "balance_loss_mlp": 1.01761234, + "epoch": 0.9418908762964077, + "flos": 11253543114240.0, + "grad_norm": 2.2912265482792344, + "language_loss": 0.63870943, + "learning_rate": 3.528114844807773e-08, + "loss": 0.66009831, + "num_input_tokens_seen": 337860490, + "router_z_loss_clip": 0.68164062, + "router_z_loss_mlp": 0.12506104, + "step": 15666, + "time_per_iteration": 2.440823793411255 + }, + { + "auxiliary_loss_clip": 0.01105747, + "auxiliary_loss_mlp": 0.01036787, + "balance_loss_clip": 1.03469229, + "balance_loss_mlp": 1.0232811, + "epoch": 0.9419509995490756, + "flos": 18438836860800.0, + "grad_norm": 1.5946135280772813, + "language_loss": 0.78975141, + "learning_rate": 3.520835536705902e-08, + "loss": 0.81117672, + "num_input_tokens_seen": 337878360, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.13500977, + "step": 15667, + "time_per_iteration": 2.4790420532226562 + }, + { + "auxiliary_loss_clip": 0.01120481, + "auxiliary_loss_mlp": 0.01030382, + "balance_loss_clip": 1.04926527, + "balance_loss_mlp": 1.01946807, + "epoch": 0.9420111228017436, + "flos": 20737945664640.0, + "grad_norm": 1.8978010292050345, + "language_loss": 0.75576371, + "learning_rate": 3.5135636791136404e-08, + "loss": 0.77727234, + "num_input_tokens_seen": 337895635, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.10919189, + "step": 15668, + "time_per_iteration": 2.476224422454834 + }, + { + "auxiliary_loss_clip": 0.01117891, + "auxiliary_loss_mlp": 0.0103166, + "balance_loss_clip": 1.04334235, + "balance_loss_mlp": 1.01942301, + "epoch": 0.9420712460544115, + "flos": 21141940907520.0, + "grad_norm": 2.2009869236766173, + "language_loss": 0.58795106, + "learning_rate": 3.506299272306723e-08, + "loss": 0.60944653, + "num_input_tokens_seen": 337913940, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.12243652, + "step": 15669, + "time_per_iteration": 2.47896671295166 + }, + { + "auxiliary_loss_clip": 0.01110377, + "auxiliary_loss_mlp": 0.01028375, + "balance_loss_clip": 1.04023385, + "balance_loss_mlp": 1.01735997, + "epoch": 0.9421313693070795, + "flos": 15851760721920.0, + "grad_norm": 1.560126472046131, + "language_loss": 0.7685473, + "learning_rate": 3.4990423165606406e-08, + "loss": 0.78993475, + "num_input_tokens_seen": 337932015, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.11016846, + "step": 15670, + "time_per_iteration": 3.850792646408081 + }, + { + "auxiliary_loss_clip": 0.0111233, + "auxiliary_loss_mlp": 0.01033102, + "balance_loss_clip": 1.04097211, + "balance_loss_mlp": 1.02049589, + "epoch": 0.9421914925597474, + "flos": 32415915882240.0, + "grad_norm": 2.0215406881526516, + "language_loss": 0.65098548, + "learning_rate": 3.491792812150574e-08, + "loss": 0.67243981, + "num_input_tokens_seen": 337953345, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.12609863, + "step": 15671, + "time_per_iteration": 2.551788091659546 + }, + { + "auxiliary_loss_clip": 0.01111427, + "auxiliary_loss_mlp": 0.01037116, + "balance_loss_clip": 1.04017437, + "balance_loss_mlp": 1.02443182, + "epoch": 0.9422516158124155, + "flos": 19718513769600.0, + "grad_norm": 1.698617662801945, + "language_loss": 0.79633999, + "learning_rate": 3.48455075935139e-08, + "loss": 0.81782538, + "num_input_tokens_seen": 337973685, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.12677002, + "step": 15672, + "time_per_iteration": 2.5329411029815674 + }, + { + "auxiliary_loss_clip": 0.01124381, + "auxiliary_loss_mlp": 0.0103545, + "balance_loss_clip": 1.04820001, + "balance_loss_mlp": 1.02234936, + "epoch": 0.9423117390650835, + "flos": 16253277926400.0, + "grad_norm": 2.4264894677337616, + "language_loss": 0.73343205, + "learning_rate": 3.47731615843776e-08, + "loss": 0.75503033, + "num_input_tokens_seen": 337989175, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.13098145, + "step": 15673, + "time_per_iteration": 2.435180425643921 + }, + { + "auxiliary_loss_clip": 0.01116614, + "auxiliary_loss_mlp": 0.01039325, + "balance_loss_clip": 1.04134417, + "balance_loss_mlp": 1.02538323, + "epoch": 0.9423718623177514, + "flos": 31796564647680.0, + "grad_norm": 1.4770180433780815, + "language_loss": 0.70177007, + "learning_rate": 3.470089009683974e-08, + "loss": 0.72332942, + "num_input_tokens_seen": 338011800, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.13946533, + "step": 15674, + "time_per_iteration": 3.9701693058013916 + }, + { + "auxiliary_loss_clip": 0.01115779, + "auxiliary_loss_mlp": 0.01024527, + "balance_loss_clip": 1.04238534, + "balance_loss_mlp": 1.01342249, + "epoch": 0.9424319855704194, + "flos": 23331809473920.0, + "grad_norm": 1.8358718036950556, + "language_loss": 0.81435889, + "learning_rate": 3.462869313364125e-08, + "loss": 0.8357619, + "num_input_tokens_seen": 338032120, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11114502, + "step": 15675, + "time_per_iteration": 2.487334728240967 + }, + { + "auxiliary_loss_clip": 0.01117372, + "auxiliary_loss_mlp": 0.01029678, + "balance_loss_clip": 1.0443821, + "balance_loss_mlp": 1.01827562, + "epoch": 0.9424921088230873, + "flos": 20777627214720.0, + "grad_norm": 1.7599250906026624, + "language_loss": 0.62816179, + "learning_rate": 3.4556570697519494e-08, + "loss": 0.64963233, + "num_input_tokens_seen": 338051880, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11401367, + "step": 15676, + "time_per_iteration": 2.4410462379455566 + }, + { + "auxiliary_loss_clip": 0.01117578, + "auxiliary_loss_mlp": 0.01034308, + "balance_loss_clip": 1.04471231, + "balance_loss_mlp": 1.02300668, + "epoch": 0.9425522320757553, + "flos": 19026658932480.0, + "grad_norm": 2.119889159793296, + "language_loss": 0.67297959, + "learning_rate": 3.448452279120984e-08, + "loss": 0.69449848, + "num_input_tokens_seen": 338069665, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11303711, + "step": 15677, + "time_per_iteration": 2.4765431880950928 + }, + { + "auxiliary_loss_clip": 0.01114922, + "auxiliary_loss_mlp": 0.01045042, + "balance_loss_clip": 1.03894472, + "balance_loss_mlp": 1.02875161, + "epoch": 0.9426123553284232, + "flos": 25155353185920.0, + "grad_norm": 1.8933409664374314, + "language_loss": 0.64707947, + "learning_rate": 3.441254941744387e-08, + "loss": 0.66867912, + "num_input_tokens_seen": 338090490, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.16290283, + "step": 15678, + "time_per_iteration": 2.4676527976989746 + }, + { + "auxiliary_loss_clip": 0.01116586, + "auxiliary_loss_mlp": 0.01028988, + "balance_loss_clip": 1.04332221, + "balance_loss_mlp": 1.01715636, + "epoch": 0.9426724785810913, + "flos": 21179359900800.0, + "grad_norm": 1.5742292737311498, + "language_loss": 0.74071139, + "learning_rate": 3.434065057895097e-08, + "loss": 0.7621671, + "num_input_tokens_seen": 338109825, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11834717, + "step": 15679, + "time_per_iteration": 2.4718382358551025 + }, + { + "auxiliary_loss_clip": 0.01115909, + "auxiliary_loss_mlp": 0.01031875, + "balance_loss_clip": 1.04139781, + "balance_loss_mlp": 1.01999569, + "epoch": 0.9427326018337592, + "flos": 14756916222720.0, + "grad_norm": 3.277637444312636, + "language_loss": 0.77689779, + "learning_rate": 3.426882627845762e-08, + "loss": 0.79837561, + "num_input_tokens_seen": 338125790, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11865234, + "step": 15680, + "time_per_iteration": 2.5213840007781982 + }, + { + "auxiliary_loss_clip": 0.01112892, + "auxiliary_loss_mlp": 0.01032415, + "balance_loss_clip": 1.04174483, + "balance_loss_mlp": 1.02092981, + "epoch": 0.9427927250864272, + "flos": 20923640000640.0, + "grad_norm": 1.7488780007102394, + "language_loss": 0.75216645, + "learning_rate": 3.419707651868742e-08, + "loss": 0.77361953, + "num_input_tokens_seen": 338145610, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11474609, + "step": 15681, + "time_per_iteration": 2.528554916381836 + }, + { + "auxiliary_loss_clip": 0.01123342, + "auxiliary_loss_mlp": 0.01033389, + "balance_loss_clip": 1.05096483, + "balance_loss_mlp": 1.02127695, + "epoch": 0.9428528483390951, + "flos": 19752520970880.0, + "grad_norm": 4.651324086675529, + "language_loss": 0.65609407, + "learning_rate": 3.412540130236086e-08, + "loss": 0.6776613, + "num_input_tokens_seen": 338165960, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.12115479, + "step": 15682, + "time_per_iteration": 2.483771324157715 + }, + { + "auxiliary_loss_clip": 0.0111531, + "auxiliary_loss_mlp": 0.01029574, + "balance_loss_clip": 1.0444504, + "balance_loss_mlp": 1.01822567, + "epoch": 0.9429129715917631, + "flos": 24534996370560.0, + "grad_norm": 2.2121591802495266, + "language_loss": 0.76906419, + "learning_rate": 3.405380063219665e-08, + "loss": 0.79051304, + "num_input_tokens_seen": 338187215, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.11352539, + "step": 15683, + "time_per_iteration": 2.5257060527801514 + }, + { + "auxiliary_loss_clip": 0.0112779, + "auxiliary_loss_mlp": 0.01033244, + "balance_loss_clip": 1.05178213, + "balance_loss_mlp": 1.02103162, + "epoch": 0.942973094844431, + "flos": 17959824063360.0, + "grad_norm": 4.128459885473254, + "language_loss": 0.7590524, + "learning_rate": 3.398227451090885e-08, + "loss": 0.78066272, + "num_input_tokens_seen": 338201825, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.12200928, + "step": 15684, + "time_per_iteration": 3.8876953125 + }, + { + "auxiliary_loss_clip": 0.01114583, + "auxiliary_loss_mlp": 0.01025059, + "balance_loss_clip": 1.04531074, + "balance_loss_mlp": 1.01382399, + "epoch": 0.9430332180970991, + "flos": 26137689310080.0, + "grad_norm": 1.7136078587423558, + "language_loss": 0.77147323, + "learning_rate": 3.391082294121017e-08, + "loss": 0.79286969, + "num_input_tokens_seen": 338220865, + "router_z_loss_clip": 0.69238281, + "router_z_loss_mlp": 0.11236572, + "step": 15685, + "time_per_iteration": 2.531597852706909 + }, + { + "auxiliary_loss_clip": 0.01110693, + "auxiliary_loss_mlp": 0.01024827, + "balance_loss_clip": 1.04027057, + "balance_loss_mlp": 1.01409197, + "epoch": 0.943093341349767, + "flos": 23951376190080.0, + "grad_norm": 1.812000400731469, + "language_loss": 0.75225043, + "learning_rate": 3.383944592581023e-08, + "loss": 0.7736057, + "num_input_tokens_seen": 338240160, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.10736084, + "step": 15686, + "time_per_iteration": 2.4655325412750244 + }, + { + "auxiliary_loss_clip": 0.01116366, + "auxiliary_loss_mlp": 0.01029452, + "balance_loss_clip": 1.04292107, + "balance_loss_mlp": 1.0172745, + "epoch": 0.943153464602435, + "flos": 17968407413760.0, + "grad_norm": 1.7614001591009496, + "language_loss": 0.80809712, + "learning_rate": 3.376814346741575e-08, + "loss": 0.82955527, + "num_input_tokens_seen": 338259305, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.12176514, + "step": 15687, + "time_per_iteration": 2.4849190711975098 + }, + { + "auxiliary_loss_clip": 0.01126094, + "auxiliary_loss_mlp": 0.01033008, + "balance_loss_clip": 1.05066252, + "balance_loss_mlp": 1.01987135, + "epoch": 0.943213587855103, + "flos": 14501519544960.0, + "grad_norm": 3.602510044935189, + "language_loss": 0.75675327, + "learning_rate": 3.369691556873011e-08, + "loss": 0.77834427, + "num_input_tokens_seen": 338274950, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.13122559, + "step": 15688, + "time_per_iteration": 2.4427437782287598 + }, + { + "auxiliary_loss_clip": 0.01120031, + "auxiliary_loss_mlp": 0.01027619, + "balance_loss_clip": 1.0518657, + "balance_loss_mlp": 1.01533401, + "epoch": 0.9432737111077709, + "flos": 28986411093120.0, + "grad_norm": 1.623790349091508, + "language_loss": 0.68055427, + "learning_rate": 3.3625762232454504e-08, + "loss": 0.70203078, + "num_input_tokens_seen": 338295585, + "router_z_loss_clip": 0.68066406, + "router_z_loss_mlp": 0.12286377, + "step": 15689, + "time_per_iteration": 2.5626814365386963 + }, + { + "auxiliary_loss_clip": 0.01112319, + "auxiliary_loss_mlp": 0.01033195, + "balance_loss_clip": 1.04131317, + "balance_loss_mlp": 1.02251399, + "epoch": 0.9433338343604389, + "flos": 21609066303360.0, + "grad_norm": 1.982193311019864, + "language_loss": 0.8050344, + "learning_rate": 3.35546834612872e-08, + "loss": 0.82648945, + "num_input_tokens_seen": 338314555, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.10675049, + "step": 15690, + "time_per_iteration": 2.4613852500915527 + }, + { + "auxiliary_loss_clip": 0.01121084, + "auxiliary_loss_mlp": 0.01033933, + "balance_loss_clip": 1.05028939, + "balance_loss_mlp": 1.02237558, + "epoch": 0.9433939576131068, + "flos": 33182285483520.0, + "grad_norm": 2.073437541252317, + "language_loss": 0.59966725, + "learning_rate": 3.348367925792317e-08, + "loss": 0.62121743, + "num_input_tokens_seen": 338336260, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.11535645, + "step": 15691, + "time_per_iteration": 2.5855019092559814 + }, + { + "auxiliary_loss_clip": 0.01114934, + "auxiliary_loss_mlp": 0.01026561, + "balance_loss_clip": 1.04274106, + "balance_loss_mlp": 1.01529574, + "epoch": 0.9434540808657749, + "flos": 20486391742080.0, + "grad_norm": 1.7648636509454185, + "language_loss": 0.66879344, + "learning_rate": 3.341274962505514e-08, + "loss": 0.69020838, + "num_input_tokens_seen": 338354680, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11254883, + "step": 15692, + "time_per_iteration": 2.457906723022461 + }, + { + "auxiliary_loss_clip": 0.01108308, + "auxiliary_loss_mlp": 0.01028926, + "balance_loss_clip": 1.03662777, + "balance_loss_mlp": 1.01728535, + "epoch": 0.9435142041184428, + "flos": 21542955321600.0, + "grad_norm": 2.672241504197253, + "language_loss": 0.74955273, + "learning_rate": 3.334189456537251e-08, + "loss": 0.77092505, + "num_input_tokens_seen": 338372490, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11639404, + "step": 15693, + "time_per_iteration": 2.516101360321045 + }, + { + "auxiliary_loss_clip": 0.01106703, + "auxiliary_loss_mlp": 0.01028506, + "balance_loss_clip": 1.03641415, + "balance_loss_mlp": 1.01681209, + "epoch": 0.9435743273711108, + "flos": 25009089004800.0, + "grad_norm": 1.8603298302286415, + "language_loss": 0.73257005, + "learning_rate": 3.327111408156291e-08, + "loss": 0.75392216, + "num_input_tokens_seen": 338390870, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.11688232, + "step": 15694, + "time_per_iteration": 2.5231728553771973 + }, + { + "auxiliary_loss_clip": 0.01040893, + "auxiliary_loss_mlp": 0.01003239, + "balance_loss_clip": 1.01590383, + "balance_loss_mlp": 1.00189912, + "epoch": 0.9436344506237787, + "flos": 60158707320960.0, + "grad_norm": 0.7269953846716672, + "language_loss": 0.50599682, + "learning_rate": 3.3200408176309316e-08, + "loss": 0.52643812, + "num_input_tokens_seen": 338453075, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.01339722, + "step": 15695, + "time_per_iteration": 3.1517174243927 + }, + { + "auxiliary_loss_clip": 0.01104663, + "auxiliary_loss_mlp": 0.01029828, + "balance_loss_clip": 1.03676009, + "balance_loss_mlp": 1.01862264, + "epoch": 0.9436945738764467, + "flos": 22237252283520.0, + "grad_norm": 1.7243177992166947, + "language_loss": 0.65018135, + "learning_rate": 3.312977685229335e-08, + "loss": 0.67152625, + "num_input_tokens_seen": 338471770, + "router_z_loss_clip": 0.67871094, + "router_z_loss_mlp": 0.11212158, + "step": 15696, + "time_per_iteration": 2.532588481903076 + }, + { + "auxiliary_loss_clip": 0.01116517, + "auxiliary_loss_mlp": 0.01027345, + "balance_loss_clip": 1.0441432, + "balance_loss_mlp": 1.01616895, + "epoch": 0.9437546971291146, + "flos": 25045179194880.0, + "grad_norm": 2.4898083832071416, + "language_loss": 0.66138035, + "learning_rate": 3.305922011219353e-08, + "loss": 0.68281901, + "num_input_tokens_seen": 338492190, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11169434, + "step": 15697, + "time_per_iteration": 2.47751522064209 + }, + { + "auxiliary_loss_clip": 0.01034834, + "auxiliary_loss_mlp": 0.01008297, + "balance_loss_clip": 1.0103035, + "balance_loss_mlp": 1.00671732, + "epoch": 0.9438148203817827, + "flos": 56790788400000.0, + "grad_norm": 0.8527909410024224, + "language_loss": 0.63127291, + "learning_rate": 3.298873795868506e-08, + "loss": 0.65170419, + "num_input_tokens_seen": 338552560, + "router_z_loss_clip": 0.24536133, + "router_z_loss_mlp": 0.01579285, + "step": 15698, + "time_per_iteration": 3.0118296146392822 + }, + { + "auxiliary_loss_clip": 0.01114632, + "auxiliary_loss_mlp": 0.01042153, + "balance_loss_clip": 1.03939819, + "balance_loss_mlp": 1.02802062, + "epoch": 0.9438749436344506, + "flos": 22346384780160.0, + "grad_norm": 1.7656909825944045, + "language_loss": 0.69637203, + "learning_rate": 3.291833039444092e-08, + "loss": 0.71793991, + "num_input_tokens_seen": 338571770, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.14141846, + "step": 15699, + "time_per_iteration": 2.4523561000823975 + }, + { + "auxiliary_loss_clip": 0.01107351, + "auxiliary_loss_mlp": 0.01028472, + "balance_loss_clip": 1.03798997, + "balance_loss_mlp": 1.01734352, + "epoch": 0.9439350668871186, + "flos": 13370800337280.0, + "grad_norm": 2.351405465632552, + "language_loss": 0.74399579, + "learning_rate": 3.2847997422130734e-08, + "loss": 0.76535404, + "num_input_tokens_seen": 338587310, + "router_z_loss_clip": 0.69335938, + "router_z_loss_mlp": 0.11138916, + "step": 15700, + "time_per_iteration": 2.4229915142059326 + }, + { + "auxiliary_loss_clip": 0.01122032, + "auxiliary_loss_mlp": 0.01030517, + "balance_loss_clip": 1.04739928, + "balance_loss_mlp": 1.01916265, + "epoch": 0.9439951901397866, + "flos": 17785334770560.0, + "grad_norm": 1.5577292210695584, + "language_loss": 0.70276445, + "learning_rate": 3.2777739044421495e-08, + "loss": 0.72429001, + "num_input_tokens_seen": 338606235, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11358643, + "step": 15701, + "time_per_iteration": 2.406320095062256 + }, + { + "auxiliary_loss_clip": 0.01121284, + "auxiliary_loss_mlp": 0.01027333, + "balance_loss_clip": 1.04518151, + "balance_loss_mlp": 1.01545417, + "epoch": 0.9440553133924545, + "flos": 18879568738560.0, + "grad_norm": 1.818286572772149, + "language_loss": 0.77587843, + "learning_rate": 3.2707555263977505e-08, + "loss": 0.79736453, + "num_input_tokens_seen": 338624090, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.11889648, + "step": 15702, + "time_per_iteration": 2.457918405532837 + }, + { + "auxiliary_loss_clip": 0.01114579, + "auxiliary_loss_mlp": 0.01032015, + "balance_loss_clip": 1.04043126, + "balance_loss_mlp": 1.02057743, + "epoch": 0.9441154366451225, + "flos": 19572967860480.0, + "grad_norm": 3.6601677928593537, + "language_loss": 0.66459274, + "learning_rate": 3.2637446083460194e-08, + "loss": 0.6860587, + "num_input_tokens_seen": 338643695, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11444092, + "step": 15703, + "time_per_iteration": 3.8708012104034424 + }, + { + "auxiliary_loss_clip": 0.01113387, + "auxiliary_loss_mlp": 0.01029735, + "balance_loss_clip": 1.04099154, + "balance_loss_mlp": 1.01731944, + "epoch": 0.9441755598977905, + "flos": 30294995472000.0, + "grad_norm": 1.69483767445655, + "language_loss": 0.73100615, + "learning_rate": 3.256741150552833e-08, + "loss": 0.75243735, + "num_input_tokens_seen": 338664725, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.12426758, + "step": 15704, + "time_per_iteration": 2.589783191680908 + }, + { + "auxiliary_loss_clip": 0.01111555, + "auxiliary_loss_mlp": 0.0103021, + "balance_loss_clip": 1.04042268, + "balance_loss_mlp": 1.01843834, + "epoch": 0.9442356831504585, + "flos": 20667884186880.0, + "grad_norm": 1.8914247806008613, + "language_loss": 0.74368459, + "learning_rate": 3.2497451532837336e-08, + "loss": 0.76510227, + "num_input_tokens_seen": 338683990, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.11785889, + "step": 15705, + "time_per_iteration": 2.4693048000335693 + }, + { + "auxiliary_loss_clip": 0.01113087, + "auxiliary_loss_mlp": 0.0104258, + "balance_loss_clip": 1.04142809, + "balance_loss_mlp": 1.02996767, + "epoch": 0.9442958064031264, + "flos": 16107265140480.0, + "grad_norm": 2.2636278033522177, + "language_loss": 0.76784259, + "learning_rate": 3.2427566168039986e-08, + "loss": 0.78939927, + "num_input_tokens_seen": 338702025, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.12609863, + "step": 15706, + "time_per_iteration": 2.51678729057312 + }, + { + "auxiliary_loss_clip": 0.01110902, + "auxiliary_loss_mlp": 0.01027631, + "balance_loss_clip": 1.04135084, + "balance_loss_mlp": 1.01675892, + "epoch": 0.9443559296557944, + "flos": 20447392550400.0, + "grad_norm": 1.706917766052026, + "language_loss": 0.69382596, + "learning_rate": 3.23577554137866e-08, + "loss": 0.71521127, + "num_input_tokens_seen": 338720920, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.10876465, + "step": 15707, + "time_per_iteration": 2.555662155151367 + }, + { + "auxiliary_loss_clip": 0.01103292, + "auxiliary_loss_mlp": 0.01027353, + "balance_loss_clip": 1.03608251, + "balance_loss_mlp": 1.01720214, + "epoch": 0.9444160529084623, + "flos": 21610897896960.0, + "grad_norm": 2.1392479061043024, + "language_loss": 0.68999398, + "learning_rate": 3.22880192727244e-08, + "loss": 0.71130049, + "num_input_tokens_seen": 338739590, + "router_z_loss_clip": 0.67236328, + "router_z_loss_mlp": 0.10150146, + "step": 15708, + "time_per_iteration": 2.499284267425537 + }, + { + "auxiliary_loss_clip": 0.01116732, + "auxiliary_loss_mlp": 0.0102956, + "balance_loss_clip": 1.04655099, + "balance_loss_mlp": 1.01801443, + "epoch": 0.9444761761611303, + "flos": 18441781776000.0, + "grad_norm": 2.3634283568324053, + "language_loss": 0.70755768, + "learning_rate": 3.221835774749748e-08, + "loss": 0.7290206, + "num_input_tokens_seen": 338757240, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.11560059, + "step": 15709, + "time_per_iteration": 2.4254209995269775 + }, + { + "auxiliary_loss_clip": 0.01107288, + "auxiliary_loss_mlp": 0.01030745, + "balance_loss_clip": 1.03868055, + "balance_loss_mlp": 1.01949143, + "epoch": 0.9445362994137982, + "flos": 20957144411520.0, + "grad_norm": 1.8793554722681145, + "language_loss": 0.8457613, + "learning_rate": 3.214877084074774e-08, + "loss": 0.8671416, + "num_input_tokens_seen": 338773750, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.11248779, + "step": 15710, + "time_per_iteration": 2.4799883365631104 + }, + { + "auxiliary_loss_clip": 0.01112082, + "auxiliary_loss_mlp": 0.01031929, + "balance_loss_clip": 1.03730404, + "balance_loss_mlp": 1.01956737, + "epoch": 0.9445964226664663, + "flos": 20303283185280.0, + "grad_norm": 1.6825858425826188, + "language_loss": 0.71687078, + "learning_rate": 3.2079258555113956e-08, + "loss": 0.73831087, + "num_input_tokens_seen": 338792115, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.12371826, + "step": 15711, + "time_per_iteration": 2.494019031524658 + }, + { + "auxiliary_loss_clip": 0.01125886, + "auxiliary_loss_mlp": 0.01027454, + "balance_loss_clip": 1.05221796, + "balance_loss_mlp": 1.0158782, + "epoch": 0.9446565459191342, + "flos": 26396030903040.0, + "grad_norm": 1.6702947284020615, + "language_loss": 0.69258511, + "learning_rate": 3.200982089323179e-08, + "loss": 0.71411854, + "num_input_tokens_seen": 338812480, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11578369, + "step": 15712, + "time_per_iteration": 2.628082752227783 + }, + { + "auxiliary_loss_clip": 0.01115397, + "auxiliary_loss_mlp": 0.0103203, + "balance_loss_clip": 1.04128456, + "balance_loss_mlp": 1.01900053, + "epoch": 0.9447166691718022, + "flos": 16544764794240.0, + "grad_norm": 2.413074085693627, + "language_loss": 0.70977652, + "learning_rate": 3.1940457857734246e-08, + "loss": 0.73125076, + "num_input_tokens_seen": 338829105, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.13024902, + "step": 15713, + "time_per_iteration": 2.428640842437744 + }, + { + "auxiliary_loss_clip": 0.01105396, + "auxiliary_loss_mlp": 0.01034631, + "balance_loss_clip": 1.03557491, + "balance_loss_mlp": 1.0217979, + "epoch": 0.9447767924244702, + "flos": 29164635400320.0, + "grad_norm": 1.5902739734033609, + "language_loss": 0.7671169, + "learning_rate": 3.187116945125212e-08, + "loss": 0.78851724, + "num_input_tokens_seen": 338850670, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.12835693, + "step": 15714, + "time_per_iteration": 3.982274293899536 + }, + { + "auxiliary_loss_clip": 0.01108454, + "auxiliary_loss_mlp": 0.01032347, + "balance_loss_clip": 1.03611827, + "balance_loss_mlp": 1.02042615, + "epoch": 0.9448369156771381, + "flos": 19274908803840.0, + "grad_norm": 1.8377312110962989, + "language_loss": 0.67820203, + "learning_rate": 3.1801955676412194e-08, + "loss": 0.69961005, + "num_input_tokens_seen": 338867795, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11920166, + "step": 15715, + "time_per_iteration": 2.45597767829895 + }, + { + "auxiliary_loss_clip": 0.01116355, + "auxiliary_loss_mlp": 0.01031535, + "balance_loss_clip": 1.04239964, + "balance_loss_mlp": 1.01895201, + "epoch": 0.9448970389298061, + "flos": 23841166285440.0, + "grad_norm": 1.8901415721626722, + "language_loss": 0.7473774, + "learning_rate": 3.173281653583948e-08, + "loss": 0.76885623, + "num_input_tokens_seen": 338887205, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.12573242, + "step": 15716, + "time_per_iteration": 2.481210708618164 + }, + { + "auxiliary_loss_clip": 0.01111879, + "auxiliary_loss_mlp": 0.01029311, + "balance_loss_clip": 1.03981495, + "balance_loss_mlp": 1.01730049, + "epoch": 0.944957162182474, + "flos": 22382259488640.0, + "grad_norm": 1.9731438335472749, + "language_loss": 0.62519222, + "learning_rate": 3.166375203215565e-08, + "loss": 0.64660412, + "num_input_tokens_seen": 338906130, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.12017822, + "step": 15717, + "time_per_iteration": 2.513240098953247 + }, + { + "auxiliary_loss_clip": 0.01113197, + "auxiliary_loss_mlp": 0.01033243, + "balance_loss_clip": 1.04182196, + "balance_loss_mlp": 1.02096987, + "epoch": 0.9450172854351421, + "flos": 17383889393280.0, + "grad_norm": 1.7432537659471266, + "language_loss": 0.79306984, + "learning_rate": 3.1594762167979514e-08, + "loss": 0.81453425, + "num_input_tokens_seen": 338923045, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.12280273, + "step": 15718, + "time_per_iteration": 3.9054648876190186 + }, + { + "auxiliary_loss_clip": 0.01040565, + "auxiliary_loss_mlp": 0.01001543, + "balance_loss_clip": 1.01633322, + "balance_loss_mlp": 1.00024343, + "epoch": 0.94507740868781, + "flos": 68466352406400.0, + "grad_norm": 0.7114427256691268, + "language_loss": 0.57737589, + "learning_rate": 3.152584694592719e-08, + "loss": 0.59779698, + "num_input_tokens_seen": 338987545, + "router_z_loss_clip": 0.24243164, + "router_z_loss_mlp": 0.01300049, + "step": 15719, + "time_per_iteration": 3.1224193572998047 + }, + { + "auxiliary_loss_clip": 0.0111216, + "auxiliary_loss_mlp": 0.01031015, + "balance_loss_clip": 1.03992844, + "balance_loss_mlp": 1.01952326, + "epoch": 0.945137531940478, + "flos": 21142479611520.0, + "grad_norm": 1.794428766791107, + "language_loss": 0.75899768, + "learning_rate": 3.145700636861193e-08, + "loss": 0.78042948, + "num_input_tokens_seen": 339007830, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.1149292, + "step": 15720, + "time_per_iteration": 2.496103048324585 + }, + { + "auxiliary_loss_clip": 0.01110201, + "auxiliary_loss_mlp": 0.01023637, + "balance_loss_clip": 1.03991365, + "balance_loss_mlp": 1.01380777, + "epoch": 0.9451976551931459, + "flos": 24533918962560.0, + "grad_norm": 1.774724112305775, + "language_loss": 0.73088682, + "learning_rate": 3.138824043864452e-08, + "loss": 0.75222516, + "num_input_tokens_seen": 339028980, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.0982666, + "step": 15721, + "time_per_iteration": 2.4911515712738037 + }, + { + "auxiliary_loss_clip": 0.01109068, + "auxiliary_loss_mlp": 0.01034865, + "balance_loss_clip": 1.03651726, + "balance_loss_mlp": 1.02232385, + "epoch": 0.9452577784458139, + "flos": 23440582834560.0, + "grad_norm": 1.8686072248028633, + "language_loss": 0.85360688, + "learning_rate": 3.131954915863244e-08, + "loss": 0.87504619, + "num_input_tokens_seen": 339047950, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.12542725, + "step": 15722, + "time_per_iteration": 2.5036373138427734 + }, + { + "auxiliary_loss_clip": 0.01048077, + "auxiliary_loss_mlp": 0.01003657, + "balance_loss_clip": 1.02319121, + "balance_loss_mlp": 1.00240552, + "epoch": 0.9453179016984818, + "flos": 52017686449920.0, + "grad_norm": 0.8955234095532055, + "language_loss": 0.64478624, + "learning_rate": 3.125093253118005e-08, + "loss": 0.66530359, + "num_input_tokens_seen": 339104535, + "router_z_loss_clip": 0.2487793, + "router_z_loss_mlp": 0.01251221, + "step": 15723, + "time_per_iteration": 3.046339988708496 + }, + { + "auxiliary_loss_clip": 0.01120712, + "auxiliary_loss_mlp": 0.01031768, + "balance_loss_clip": 1.04323983, + "balance_loss_mlp": 1.0191679, + "epoch": 0.9453780249511499, + "flos": 13473001509120.0, + "grad_norm": 1.9509380695274368, + "language_loss": 0.72800428, + "learning_rate": 3.1182390558889715e-08, + "loss": 0.74952906, + "num_input_tokens_seen": 339122050, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.1260376, + "step": 15724, + "time_per_iteration": 2.5384140014648438 + }, + { + "auxiliary_loss_clip": 0.01117405, + "auxiliary_loss_mlp": 0.01024388, + "balance_loss_clip": 1.04636943, + "balance_loss_mlp": 1.01317096, + "epoch": 0.9454381482038178, + "flos": 23258515772160.0, + "grad_norm": 2.53187304772496, + "language_loss": 0.85084534, + "learning_rate": 3.111392324436024e-08, + "loss": 0.87226331, + "num_input_tokens_seen": 339138940, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.11218262, + "step": 15725, + "time_per_iteration": 2.488722085952759 + }, + { + "auxiliary_loss_clip": 0.01117133, + "auxiliary_loss_mlp": 0.01029149, + "balance_loss_clip": 1.04361594, + "balance_loss_mlp": 1.01777053, + "epoch": 0.9454982714564858, + "flos": 19496621502720.0, + "grad_norm": 2.011649993321602, + "language_loss": 0.70925415, + "learning_rate": 3.104553059018822e-08, + "loss": 0.73071694, + "num_input_tokens_seen": 339158245, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11376953, + "step": 15726, + "time_per_iteration": 2.50429368019104 + }, + { + "auxiliary_loss_clip": 0.01116083, + "auxiliary_loss_mlp": 0.01025042, + "balance_loss_clip": 1.04422784, + "balance_loss_mlp": 1.0127461, + "epoch": 0.9455583947091538, + "flos": 23258120722560.0, + "grad_norm": 1.7664083723302222, + "language_loss": 0.61021149, + "learning_rate": 3.097721259896735e-08, + "loss": 0.63162279, + "num_input_tokens_seen": 339178200, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.12304688, + "step": 15727, + "time_per_iteration": 3.917971134185791 + }, + { + "auxiliary_loss_clip": 0.01117908, + "auxiliary_loss_mlp": 0.01034437, + "balance_loss_clip": 1.04842246, + "balance_loss_mlp": 1.02334487, + "epoch": 0.9456185179618217, + "flos": 17673041877120.0, + "grad_norm": 1.8376043827969082, + "language_loss": 0.81857657, + "learning_rate": 3.0908969273287566e-08, + "loss": 0.84010005, + "num_input_tokens_seen": 339193950, + "router_z_loss_clip": 0.69384766, + "router_z_loss_mlp": 0.11096191, + "step": 15728, + "time_per_iteration": 2.461411237716675 + }, + { + "auxiliary_loss_clip": 0.01035047, + "auxiliary_loss_mlp": 0.0099909, + "balance_loss_clip": 1.01089334, + "balance_loss_mlp": 0.9978472, + "epoch": 0.9456786412144897, + "flos": 61415040389760.0, + "grad_norm": 0.7480081219416285, + "language_loss": 0.59073412, + "learning_rate": 3.08408006157368e-08, + "loss": 0.61107552, + "num_input_tokens_seen": 339252330, + "router_z_loss_clip": 0.24169922, + "router_z_loss_mlp": 0.01242065, + "step": 15729, + "time_per_iteration": 3.121856451034546 + }, + { + "auxiliary_loss_clip": 0.01106258, + "auxiliary_loss_mlp": 0.01025206, + "balance_loss_clip": 1.03491831, + "balance_loss_mlp": 1.01280808, + "epoch": 0.9457387644671577, + "flos": 18588369179520.0, + "grad_norm": 2.750998349991159, + "language_loss": 0.76231807, + "learning_rate": 3.077270662890052e-08, + "loss": 0.7836327, + "num_input_tokens_seen": 339270325, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.1239624, + "step": 15730, + "time_per_iteration": 2.4717981815338135 + }, + { + "auxiliary_loss_clip": 0.01113738, + "auxiliary_loss_mlp": 0.0102996, + "balance_loss_clip": 1.04092312, + "balance_loss_mlp": 1.01758623, + "epoch": 0.9457988877198257, + "flos": 21108544237440.0, + "grad_norm": 1.497237589296086, + "language_loss": 0.62372828, + "learning_rate": 3.070468731536047e-08, + "loss": 0.64516521, + "num_input_tokens_seen": 339291980, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.12371826, + "step": 15731, + "time_per_iteration": 2.4788050651550293 + }, + { + "auxiliary_loss_clip": 0.01120494, + "auxiliary_loss_mlp": 0.01029949, + "balance_loss_clip": 1.04588699, + "balance_loss_mlp": 1.01771259, + "epoch": 0.9458590109724936, + "flos": 26688379697280.0, + "grad_norm": 1.9104393138129219, + "language_loss": 0.63847709, + "learning_rate": 3.063674267769589e-08, + "loss": 0.65998155, + "num_input_tokens_seen": 339311795, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.12231445, + "step": 15732, + "time_per_iteration": 2.521848678588867 + }, + { + "auxiliary_loss_clip": 0.01119631, + "auxiliary_loss_mlp": 0.01029739, + "balance_loss_clip": 1.04319668, + "balance_loss_mlp": 1.01712024, + "epoch": 0.9459191342251616, + "flos": 18661591054080.0, + "grad_norm": 1.9946321755965621, + "language_loss": 0.8434974, + "learning_rate": 3.056887271848363e-08, + "loss": 0.86499107, + "num_input_tokens_seen": 339327745, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.1262207, + "step": 15733, + "time_per_iteration": 2.4945030212402344 + }, + { + "auxiliary_loss_clip": 0.01110672, + "auxiliary_loss_mlp": 0.01028343, + "balance_loss_clip": 1.04143178, + "balance_loss_mlp": 1.01772141, + "epoch": 0.9459792574778295, + "flos": 23398459159680.0, + "grad_norm": 5.032373075864468, + "language_loss": 0.72163701, + "learning_rate": 3.0501077440297173e-08, + "loss": 0.74302709, + "num_input_tokens_seen": 339346445, + "router_z_loss_clip": 0.69238281, + "router_z_loss_mlp": 0.10626221, + "step": 15734, + "time_per_iteration": 2.4677789211273193 + }, + { + "auxiliary_loss_clip": 0.01104934, + "auxiliary_loss_mlp": 0.0102935, + "balance_loss_clip": 1.03763807, + "balance_loss_mlp": 1.01978898, + "epoch": 0.9460393807304975, + "flos": 24392969994240.0, + "grad_norm": 1.4221251490979978, + "language_loss": 0.87032926, + "learning_rate": 3.043335684570692e-08, + "loss": 0.89167213, + "num_input_tokens_seen": 339367945, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.09558105, + "step": 15735, + "time_per_iteration": 2.5130951404571533 + }, + { + "auxiliary_loss_clip": 0.01112744, + "auxiliary_loss_mlp": 0.01030543, + "balance_loss_clip": 1.04144192, + "balance_loss_mlp": 1.0193013, + "epoch": 0.9460995039831654, + "flos": 21939408708480.0, + "grad_norm": 1.8958443441245465, + "language_loss": 0.67334878, + "learning_rate": 3.036571093728102e-08, + "loss": 0.69478166, + "num_input_tokens_seen": 339386060, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.11236572, + "step": 15736, + "time_per_iteration": 2.4740984439849854 + }, + { + "auxiliary_loss_clip": 0.01035723, + "auxiliary_loss_mlp": 0.01001401, + "balance_loss_clip": 1.01140165, + "balance_loss_mlp": 1.00011039, + "epoch": 0.9461596272358335, + "flos": 70322466775680.0, + "grad_norm": 0.8629711503406915, + "language_loss": 0.65226376, + "learning_rate": 3.029813971758499e-08, + "loss": 0.67263502, + "num_input_tokens_seen": 339446695, + "router_z_loss_clip": 0.2434082, + "router_z_loss_mlp": 0.01290894, + "step": 15737, + "time_per_iteration": 3.1526448726654053 + }, + { + "auxiliary_loss_clip": 0.01063296, + "auxiliary_loss_mlp": 0.01004775, + "balance_loss_clip": 1.03876221, + "balance_loss_mlp": 1.00321317, + "epoch": 0.9462197504885014, + "flos": 58591242645120.0, + "grad_norm": 0.7963240117645921, + "language_loss": 0.58793163, + "learning_rate": 3.0230643189181225e-08, + "loss": 0.60861242, + "num_input_tokens_seen": 339510080, + "router_z_loss_clip": 0.24536133, + "router_z_loss_mlp": 0.01560974, + "step": 15738, + "time_per_iteration": 3.1610801219940186 + }, + { + "auxiliary_loss_clip": 0.01114331, + "auxiliary_loss_mlp": 0.01032465, + "balance_loss_clip": 1.0421865, + "balance_loss_mlp": 1.02175403, + "epoch": 0.9462798737411694, + "flos": 23433759250560.0, + "grad_norm": 2.083231978398734, + "language_loss": 0.71819496, + "learning_rate": 3.016322135462834e-08, + "loss": 0.73966289, + "num_input_tokens_seen": 339529335, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.10705566, + "step": 15739, + "time_per_iteration": 2.5239176750183105 + }, + { + "auxiliary_loss_clip": 0.01110054, + "auxiliary_loss_mlp": 0.01036516, + "balance_loss_clip": 1.03703833, + "balance_loss_mlp": 1.02280092, + "epoch": 0.9463399969938374, + "flos": 25046077034880.0, + "grad_norm": 3.0367314117813837, + "language_loss": 0.64882863, + "learning_rate": 3.009587421648363e-08, + "loss": 0.6702944, + "num_input_tokens_seen": 339548820, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.13726807, + "step": 15740, + "time_per_iteration": 2.4852945804595947 + }, + { + "auxiliary_loss_clip": 0.01116588, + "auxiliary_loss_mlp": 0.01029214, + "balance_loss_clip": 1.04513168, + "balance_loss_mlp": 1.01769257, + "epoch": 0.9464001202465053, + "flos": 24352606085760.0, + "grad_norm": 1.7136768603479937, + "language_loss": 0.66463047, + "learning_rate": 3.0028601777301045e-08, + "loss": 0.6860885, + "num_input_tokens_seen": 339566775, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11529541, + "step": 15741, + "time_per_iteration": 2.524078130722046 + }, + { + "auxiliary_loss_clip": 0.01116337, + "auxiliary_loss_mlp": 0.01026628, + "balance_loss_clip": 1.04345226, + "balance_loss_mlp": 1.01513052, + "epoch": 0.9464602434991733, + "flos": 17165444832000.0, + "grad_norm": 2.4593166184300954, + "language_loss": 0.75987411, + "learning_rate": 2.9961404039630987e-08, + "loss": 0.78130376, + "num_input_tokens_seen": 339581905, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.1149292, + "step": 15742, + "time_per_iteration": 2.4458343982696533 + }, + { + "auxiliary_loss_clip": 0.01123629, + "auxiliary_loss_mlp": 0.0102603, + "balance_loss_clip": 1.05218446, + "balance_loss_mlp": 1.01492584, + "epoch": 0.9465203667518413, + "flos": 19938107566080.0, + "grad_norm": 1.784840432731344, + "language_loss": 0.71999073, + "learning_rate": 2.989428100602187e-08, + "loss": 0.74148732, + "num_input_tokens_seen": 339599870, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11108398, + "step": 15743, + "time_per_iteration": 2.4642515182495117 + }, + { + "auxiliary_loss_clip": 0.01119241, + "auxiliary_loss_mlp": 0.0102953, + "balance_loss_clip": 1.04606318, + "balance_loss_mlp": 1.01772809, + "epoch": 0.9465804900045093, + "flos": 20120318282880.0, + "grad_norm": 2.0381258058073284, + "language_loss": 0.79839468, + "learning_rate": 2.982723267901943e-08, + "loss": 0.81988233, + "num_input_tokens_seen": 339620250, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11798096, + "step": 15744, + "time_per_iteration": 2.557265520095825 + }, + { + "auxiliary_loss_clip": 0.01117958, + "auxiliary_loss_mlp": 0.01033163, + "balance_loss_clip": 1.04255867, + "balance_loss_mlp": 1.02098596, + "epoch": 0.9466406132571772, + "flos": 23911622812800.0, + "grad_norm": 1.6047104854285028, + "language_loss": 0.7807548, + "learning_rate": 2.9760259061165417e-08, + "loss": 0.80226606, + "num_input_tokens_seen": 339639900, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.12176514, + "step": 15745, + "time_per_iteration": 2.594749689102173 + }, + { + "auxiliary_loss_clip": 0.01115683, + "auxiliary_loss_mlp": 0.01030343, + "balance_loss_clip": 1.04167581, + "balance_loss_mlp": 1.01883376, + "epoch": 0.9467007365098452, + "flos": 19933223316480.0, + "grad_norm": 1.631280267474202, + "language_loss": 0.70348883, + "learning_rate": 2.9693360155000014e-08, + "loss": 0.72494912, + "num_input_tokens_seen": 339658970, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.1149292, + "step": 15746, + "time_per_iteration": 3.981367826461792 + }, + { + "auxiliary_loss_clip": 0.01116336, + "auxiliary_loss_mlp": 0.01027896, + "balance_loss_clip": 1.04554975, + "balance_loss_mlp": 1.01574218, + "epoch": 0.9467608597625131, + "flos": 19310496203520.0, + "grad_norm": 2.2024182221937454, + "language_loss": 0.56558859, + "learning_rate": 2.962653596305964e-08, + "loss": 0.58703089, + "num_input_tokens_seen": 339675600, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.12139893, + "step": 15747, + "time_per_iteration": 2.447554588317871 + }, + { + "auxiliary_loss_clip": 0.01043566, + "auxiliary_loss_mlp": 0.01001657, + "balance_loss_clip": 1.01906991, + "balance_loss_mlp": 1.00037742, + "epoch": 0.9468209830151811, + "flos": 69630252802560.0, + "grad_norm": 0.6606827452191516, + "language_loss": 0.5328424, + "learning_rate": 2.955978648787871e-08, + "loss": 0.55329466, + "num_input_tokens_seen": 339744505, + "router_z_loss_clip": 0.24487305, + "router_z_loss_mlp": 0.01280212, + "step": 15748, + "time_per_iteration": 3.305391311645508 + }, + { + "auxiliary_loss_clip": 0.01123807, + "auxiliary_loss_mlp": 0.01035303, + "balance_loss_clip": 1.05202508, + "balance_loss_mlp": 1.02372742, + "epoch": 0.946881106267849, + "flos": 27016639113600.0, + "grad_norm": 1.8271465062744727, + "language_loss": 0.66304451, + "learning_rate": 2.9493111731988096e-08, + "loss": 0.68463564, + "num_input_tokens_seen": 339765810, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11566162, + "step": 15749, + "time_per_iteration": 2.503565788269043 + }, + { + "auxiliary_loss_clip": 0.0111632, + "auxiliary_loss_mlp": 0.01028946, + "balance_loss_clip": 1.04150319, + "balance_loss_mlp": 1.01509976, + "epoch": 0.9469412295205171, + "flos": 20190092451840.0, + "grad_norm": 2.299118683587372, + "language_loss": 0.76188439, + "learning_rate": 2.942651169791621e-08, + "loss": 0.78333706, + "num_input_tokens_seen": 339784125, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.13848877, + "step": 15750, + "time_per_iteration": 2.5374865531921387 + }, + { + "auxiliary_loss_clip": 0.0111508, + "auxiliary_loss_mlp": 0.01026095, + "balance_loss_clip": 1.04392099, + "balance_loss_mlp": 1.01450801, + "epoch": 0.947001352773185, + "flos": 21324905809920.0, + "grad_norm": 1.773803756280046, + "language_loss": 0.68331355, + "learning_rate": 2.9359986388188372e-08, + "loss": 0.70472527, + "num_input_tokens_seen": 339803450, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.1159668, + "step": 15751, + "time_per_iteration": 2.56467866897583 + }, + { + "auxiliary_loss_clip": 0.01109272, + "auxiliary_loss_mlp": 0.01028533, + "balance_loss_clip": 1.03818429, + "balance_loss_mlp": 1.01686239, + "epoch": 0.947061476025853, + "flos": 21944041562880.0, + "grad_norm": 1.5914103870703649, + "language_loss": 0.65717214, + "learning_rate": 2.929353580532723e-08, + "loss": 0.67855018, + "num_input_tokens_seen": 339823215, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.11669922, + "step": 15752, + "time_per_iteration": 2.626555919647217 + }, + { + "auxiliary_loss_clip": 0.01114058, + "auxiliary_loss_mlp": 0.01029619, + "balance_loss_clip": 1.04148793, + "balance_loss_mlp": 1.01748347, + "epoch": 0.947121599278521, + "flos": 21394715892480.0, + "grad_norm": 1.4822120971744013, + "language_loss": 0.71489125, + "learning_rate": 2.9227159951852764e-08, + "loss": 0.73632801, + "num_input_tokens_seen": 339842230, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.12139893, + "step": 15753, + "time_per_iteration": 2.46743106842041 + }, + { + "auxiliary_loss_clip": 0.01114393, + "auxiliary_loss_mlp": 0.01032518, + "balance_loss_clip": 1.03884959, + "balance_loss_mlp": 1.01898146, + "epoch": 0.9471817225311889, + "flos": 23075730437760.0, + "grad_norm": 1.8620810430077164, + "language_loss": 0.70703542, + "learning_rate": 2.9160858830281855e-08, + "loss": 0.72850454, + "num_input_tokens_seen": 339861640, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.13531494, + "step": 15754, + "time_per_iteration": 2.5321574211120605 + }, + { + "auxiliary_loss_clip": 0.01116599, + "auxiliary_loss_mlp": 0.01031048, + "balance_loss_clip": 1.04310989, + "balance_loss_mlp": 1.01948488, + "epoch": 0.947241845783857, + "flos": 11910744305280.0, + "grad_norm": 2.287941207777184, + "language_loss": 0.7908175, + "learning_rate": 2.9094632443129153e-08, + "loss": 0.81229395, + "num_input_tokens_seen": 339878210, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11560059, + "step": 15755, + "time_per_iteration": 2.4526638984680176 + }, + { + "auxiliary_loss_clip": 0.01117239, + "auxiliary_loss_mlp": 0.01035036, + "balance_loss_clip": 1.04038739, + "balance_loss_mlp": 1.02107072, + "epoch": 0.9473019690365249, + "flos": 20740675098240.0, + "grad_norm": 2.8681610347771858, + "language_loss": 0.75318938, + "learning_rate": 2.9028480792904876e-08, + "loss": 0.77471215, + "num_input_tokens_seen": 339894255, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.13964844, + "step": 15756, + "time_per_iteration": 2.4817886352539062 + }, + { + "auxiliary_loss_clip": 0.01107364, + "auxiliary_loss_mlp": 0.01031807, + "balance_loss_clip": 1.03574872, + "balance_loss_mlp": 1.02063727, + "epoch": 0.9473620922891929, + "flos": 17639896602240.0, + "grad_norm": 2.8662442218716024, + "language_loss": 0.74887168, + "learning_rate": 2.8962403882118347e-08, + "loss": 0.77026337, + "num_input_tokens_seen": 339912425, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11181641, + "step": 15757, + "time_per_iteration": 3.8682682514190674 + }, + { + "auxiliary_loss_clip": 0.01117894, + "auxiliary_loss_mlp": 0.01033, + "balance_loss_clip": 1.04370272, + "balance_loss_mlp": 1.0190407, + "epoch": 0.9474222155418608, + "flos": 23550002640000.0, + "grad_norm": 2.097136952314848, + "language_loss": 0.79613972, + "learning_rate": 2.889640171327512e-08, + "loss": 0.81764865, + "num_input_tokens_seen": 339929635, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.13952637, + "step": 15758, + "time_per_iteration": 2.4644412994384766 + }, + { + "auxiliary_loss_clip": 0.01110196, + "auxiliary_loss_mlp": 0.01028016, + "balance_loss_clip": 1.04138899, + "balance_loss_mlp": 1.01724541, + "epoch": 0.9474823387945288, + "flos": 27089753247360.0, + "grad_norm": 1.4327138968105895, + "language_loss": 0.72120512, + "learning_rate": 2.8830474288877638e-08, + "loss": 0.74258721, + "num_input_tokens_seen": 339951200, + "router_z_loss_clip": 0.68798828, + "router_z_loss_mlp": 0.10772705, + "step": 15759, + "time_per_iteration": 2.5017411708831787 + }, + { + "auxiliary_loss_clip": 0.01105105, + "auxiliary_loss_mlp": 0.01026061, + "balance_loss_clip": 1.03866649, + "balance_loss_mlp": 1.01666141, + "epoch": 0.9475424620471967, + "flos": 22966526113920.0, + "grad_norm": 1.482371917980197, + "language_loss": 0.75778997, + "learning_rate": 2.8764621611426344e-08, + "loss": 0.77910161, + "num_input_tokens_seen": 339971820, + "router_z_loss_clip": 0.66455078, + "router_z_loss_mlp": 0.09393311, + "step": 15760, + "time_per_iteration": 2.5070879459381104 + }, + { + "auxiliary_loss_clip": 0.01105807, + "auxiliary_loss_mlp": 0.01033692, + "balance_loss_clip": 1.03555393, + "balance_loss_mlp": 1.02220595, + "epoch": 0.9476025852998647, + "flos": 20047671025920.0, + "grad_norm": 1.8322055234364625, + "language_loss": 0.72917938, + "learning_rate": 2.8698843683418128e-08, + "loss": 0.75057435, + "num_input_tokens_seen": 339989420, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.1149292, + "step": 15761, + "time_per_iteration": 2.482797622680664 + }, + { + "auxiliary_loss_clip": 0.0110814, + "auxiliary_loss_mlp": 0.01035102, + "balance_loss_clip": 1.03875101, + "balance_loss_mlp": 1.0234015, + "epoch": 0.9476627085525327, + "flos": 14975468524800.0, + "grad_norm": 2.5383084829655447, + "language_loss": 0.71923923, + "learning_rate": 2.863314050734722e-08, + "loss": 0.74067163, + "num_input_tokens_seen": 340006690, + "router_z_loss_clip": 0.69384766, + "router_z_loss_mlp": 0.11688232, + "step": 15762, + "time_per_iteration": 3.825145721435547 + }, + { + "auxiliary_loss_clip": 0.01110564, + "auxiliary_loss_mlp": 0.01034045, + "balance_loss_clip": 1.03583074, + "balance_loss_mlp": 1.02128327, + "epoch": 0.9477228318052007, + "flos": 18697788984960.0, + "grad_norm": 1.8811755699776147, + "language_loss": 0.67357004, + "learning_rate": 2.856751208570518e-08, + "loss": 0.69501609, + "num_input_tokens_seen": 340025480, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.12762451, + "step": 15763, + "time_per_iteration": 2.5246875286102295 + }, + { + "auxiliary_loss_clip": 0.01111706, + "auxiliary_loss_mlp": 0.01034369, + "balance_loss_clip": 1.03846097, + "balance_loss_mlp": 1.02172661, + "epoch": 0.9477829550578686, + "flos": 23875065745920.0, + "grad_norm": 1.760093387099639, + "language_loss": 0.69784516, + "learning_rate": 2.8501958420980466e-08, + "loss": 0.71930587, + "num_input_tokens_seen": 340043785, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.12646484, + "step": 15764, + "time_per_iteration": 2.475703239440918 + }, + { + "auxiliary_loss_clip": 0.01110185, + "auxiliary_loss_mlp": 0.01028162, + "balance_loss_clip": 1.04286766, + "balance_loss_mlp": 1.01807666, + "epoch": 0.9478430783105366, + "flos": 22562890007040.0, + "grad_norm": 1.6028389946175934, + "language_loss": 0.71411747, + "learning_rate": 2.8436479515659306e-08, + "loss": 0.73550093, + "num_input_tokens_seen": 340064360, + "router_z_loss_clip": 0.67333984, + "router_z_loss_mlp": 0.10089111, + "step": 15765, + "time_per_iteration": 2.4649195671081543 + }, + { + "auxiliary_loss_clip": 0.01043733, + "auxiliary_loss_mlp": 0.01003196, + "balance_loss_clip": 1.01859736, + "balance_loss_mlp": 1.0018425, + "epoch": 0.9479032015632046, + "flos": 60857885554560.0, + "grad_norm": 0.8041674641422908, + "language_loss": 0.59089547, + "learning_rate": 2.8371075372224384e-08, + "loss": 0.61136472, + "num_input_tokens_seen": 340114425, + "router_z_loss_clip": 0.25170898, + "router_z_loss_mlp": 0.01353455, + "step": 15766, + "time_per_iteration": 2.8557920455932617 + }, + { + "auxiliary_loss_clip": 0.01113956, + "auxiliary_loss_mlp": 0.01032949, + "balance_loss_clip": 1.04264367, + "balance_loss_mlp": 1.02220237, + "epoch": 0.9479633248158725, + "flos": 14683873916160.0, + "grad_norm": 1.8624122123443898, + "language_loss": 0.74434894, + "learning_rate": 2.8305745993155938e-08, + "loss": 0.765818, + "num_input_tokens_seen": 340132200, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.10748291, + "step": 15767, + "time_per_iteration": 2.454627752304077 + }, + { + "auxiliary_loss_clip": 0.0111575, + "auxiliary_loss_mlp": 0.01038011, + "balance_loss_clip": 1.041556, + "balance_loss_mlp": 1.02425969, + "epoch": 0.9480234480685406, + "flos": 20333878594560.0, + "grad_norm": 2.2370008245503317, + "language_loss": 0.7302919, + "learning_rate": 2.8240491380931096e-08, + "loss": 0.7518295, + "num_input_tokens_seen": 340149175, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.13763428, + "step": 15768, + "time_per_iteration": 2.459257125854492 + }, + { + "auxiliary_loss_clip": 0.01054456, + "auxiliary_loss_mlp": 0.01010301, + "balance_loss_clip": 1.03077269, + "balance_loss_mlp": 1.00910306, + "epoch": 0.9480835713212085, + "flos": 70293092428800.0, + "grad_norm": 0.7379393086993459, + "language_loss": 0.5533663, + "learning_rate": 2.8175311538024326e-08, + "loss": 0.57401383, + "num_input_tokens_seen": 340208155, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.01197815, + "step": 15769, + "time_per_iteration": 3.090848922729492 + }, + { + "auxiliary_loss_clip": 0.01116933, + "auxiliary_loss_mlp": 0.01029005, + "balance_loss_clip": 1.04295063, + "balance_loss_mlp": 1.01722765, + "epoch": 0.9481436945738765, + "flos": 25449749055360.0, + "grad_norm": 2.032261279921524, + "language_loss": 0.77036792, + "learning_rate": 2.8110206466907428e-08, + "loss": 0.79182732, + "num_input_tokens_seen": 340229275, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11779785, + "step": 15770, + "time_per_iteration": 2.5270915031433105 + }, + { + "auxiliary_loss_clip": 0.01122723, + "auxiliary_loss_mlp": 0.0103176, + "balance_loss_clip": 1.04870224, + "balance_loss_mlp": 1.01874185, + "epoch": 0.9482038178265444, + "flos": 26979902478720.0, + "grad_norm": 3.0209663947016847, + "language_loss": 0.8028459, + "learning_rate": 2.8045176170049313e-08, + "loss": 0.82439077, + "num_input_tokens_seen": 340248920, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.13024902, + "step": 15771, + "time_per_iteration": 3.9197771549224854 + }, + { + "auxiliary_loss_clip": 0.0111638, + "auxiliary_loss_mlp": 0.0102995, + "balance_loss_clip": 1.04613721, + "balance_loss_mlp": 1.0184226, + "epoch": 0.9482639410792124, + "flos": 17785442511360.0, + "grad_norm": 2.0720050253588784, + "language_loss": 0.69674748, + "learning_rate": 2.7980220649915566e-08, + "loss": 0.71821076, + "num_input_tokens_seen": 340266775, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.11517334, + "step": 15772, + "time_per_iteration": 2.4704368114471436 + }, + { + "auxiliary_loss_clip": 0.01108169, + "auxiliary_loss_mlp": 0.01028745, + "balance_loss_clip": 1.03761268, + "balance_loss_mlp": 1.01678848, + "epoch": 0.9483240643318803, + "flos": 20996682307200.0, + "grad_norm": 1.468171612096248, + "language_loss": 0.73220462, + "learning_rate": 2.7915339908969327e-08, + "loss": 0.75357378, + "num_input_tokens_seen": 340285295, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.11975098, + "step": 15773, + "time_per_iteration": 2.4477624893188477 + }, + { + "auxiliary_loss_clip": 0.01112275, + "auxiliary_loss_mlp": 0.01034524, + "balance_loss_clip": 1.03930187, + "balance_loss_mlp": 1.0211066, + "epoch": 0.9483841875845483, + "flos": 20083294339200.0, + "grad_norm": 2.14539924289849, + "language_loss": 0.62850833, + "learning_rate": 2.7850533949671072e-08, + "loss": 0.64997637, + "num_input_tokens_seen": 340304265, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.13427734, + "step": 15774, + "time_per_iteration": 2.483847141265869 + }, + { + "auxiliary_loss_clip": 0.01111568, + "auxiliary_loss_mlp": 0.01034197, + "balance_loss_clip": 1.03953171, + "balance_loss_mlp": 1.02073884, + "epoch": 0.9484443108372163, + "flos": 20813645577600.0, + "grad_norm": 3.1221525164563806, + "language_loss": 0.59168708, + "learning_rate": 2.7785802774478396e-08, + "loss": 0.61314476, + "num_input_tokens_seen": 340323690, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.13439941, + "step": 15775, + "time_per_iteration": 2.488377809524536 + }, + { + "auxiliary_loss_clip": 0.01108965, + "auxiliary_loss_mlp": 0.01031368, + "balance_loss_clip": 1.03660762, + "balance_loss_mlp": 1.01770627, + "epoch": 0.9485044340898843, + "flos": 36429184506240.0, + "grad_norm": 1.7451534794621149, + "language_loss": 0.62209868, + "learning_rate": 2.772114638584555e-08, + "loss": 0.643502, + "num_input_tokens_seen": 340345830, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.13665771, + "step": 15776, + "time_per_iteration": 2.640963077545166 + }, + { + "auxiliary_loss_clip": 0.01116485, + "auxiliary_loss_mlp": 0.0103419, + "balance_loss_clip": 1.04291511, + "balance_loss_mlp": 1.02200687, + "epoch": 0.9485645573425522, + "flos": 22602535643520.0, + "grad_norm": 2.2400922510533103, + "language_loss": 0.73871553, + "learning_rate": 2.765656478622458e-08, + "loss": 0.76022232, + "num_input_tokens_seen": 340365910, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.12176514, + "step": 15777, + "time_per_iteration": 2.581529378890991 + }, + { + "auxiliary_loss_clip": 0.01123572, + "auxiliary_loss_mlp": 0.01036897, + "balance_loss_clip": 1.04395735, + "balance_loss_mlp": 1.02426708, + "epoch": 0.9486246805952202, + "flos": 22017766227840.0, + "grad_norm": 2.470663437804905, + "language_loss": 0.72216737, + "learning_rate": 2.759205797806441e-08, + "loss": 0.74377203, + "num_input_tokens_seen": 340383935, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.12628174, + "step": 15778, + "time_per_iteration": 2.531740665435791 + }, + { + "auxiliary_loss_clip": 0.01112324, + "auxiliary_loss_mlp": 0.01036543, + "balance_loss_clip": 1.04229414, + "balance_loss_mlp": 1.02590919, + "epoch": 0.9486848038478882, + "flos": 16508674604160.0, + "grad_norm": 2.016227076756611, + "language_loss": 0.70042419, + "learning_rate": 2.7527625963810865e-08, + "loss": 0.72191286, + "num_input_tokens_seen": 340402760, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.10632324, + "step": 15779, + "time_per_iteration": 2.3875083923339844 + }, + { + "auxiliary_loss_clip": 0.01121934, + "auxiliary_loss_mlp": 0.01033308, + "balance_loss_clip": 1.04521084, + "balance_loss_mlp": 1.02085614, + "epoch": 0.9487449271005561, + "flos": 19244385221760.0, + "grad_norm": 1.9889629379093048, + "language_loss": 0.78227562, + "learning_rate": 2.7463268745907542e-08, + "loss": 0.80382812, + "num_input_tokens_seen": 340422105, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.12445068, + "step": 15780, + "time_per_iteration": 2.4627933502197266 + }, + { + "auxiliary_loss_clip": 0.01120367, + "auxiliary_loss_mlp": 0.01029834, + "balance_loss_clip": 1.04820144, + "balance_loss_mlp": 1.01808572, + "epoch": 0.9488050503532242, + "flos": 21762692772480.0, + "grad_norm": 1.8444328787934043, + "language_loss": 0.66212976, + "learning_rate": 2.7398986326794494e-08, + "loss": 0.68363178, + "num_input_tokens_seen": 340441160, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11743164, + "step": 15781, + "time_per_iteration": 2.4254820346832275 + }, + { + "auxiliary_loss_clip": 0.01110896, + "auxiliary_loss_mlp": 0.01028999, + "balance_loss_clip": 1.04064822, + "balance_loss_mlp": 1.01750112, + "epoch": 0.9488651736058921, + "flos": 18368919037440.0, + "grad_norm": 2.194975410327012, + "language_loss": 0.79771292, + "learning_rate": 2.733477870890999e-08, + "loss": 0.81911188, + "num_input_tokens_seen": 340458200, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.11486816, + "step": 15782, + "time_per_iteration": 2.456367254257202 + }, + { + "auxiliary_loss_clip": 0.01038078, + "auxiliary_loss_mlp": 0.01000917, + "balance_loss_clip": 1.0130651, + "balance_loss_mlp": 0.99956691, + "epoch": 0.9489252968585601, + "flos": 70084057230720.0, + "grad_norm": 0.7184492079257692, + "language_loss": 0.59826815, + "learning_rate": 2.7270645894688082e-08, + "loss": 0.61865813, + "num_input_tokens_seen": 340526420, + "router_z_loss_clip": 0.25048828, + "router_z_loss_mlp": 0.01350403, + "step": 15783, + "time_per_iteration": 3.178661823272705 + }, + { + "auxiliary_loss_clip": 0.01117686, + "auxiliary_loss_mlp": 0.0103367, + "balance_loss_clip": 1.04493463, + "balance_loss_mlp": 1.02147448, + "epoch": 0.948985420111228, + "flos": 27855440490240.0, + "grad_norm": 1.6742290190505522, + "language_loss": 0.7384125, + "learning_rate": 2.720658788656105e-08, + "loss": 0.75992602, + "num_input_tokens_seen": 340546325, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.12207031, + "step": 15784, + "time_per_iteration": 2.502006769180298 + }, + { + "auxiliary_loss_clip": 0.01110969, + "auxiliary_loss_mlp": 0.01029068, + "balance_loss_clip": 1.03946733, + "balance_loss_mlp": 1.01694465, + "epoch": 0.949045543363896, + "flos": 24316049018880.0, + "grad_norm": 1.7925162725351256, + "language_loss": 0.6952458, + "learning_rate": 2.714260468695806e-08, + "loss": 0.71664619, + "num_input_tokens_seen": 340565145, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.12127686, + "step": 15785, + "time_per_iteration": 2.4905312061309814 + }, + { + "auxiliary_loss_clip": 0.01118264, + "auxiliary_loss_mlp": 0.01024205, + "balance_loss_clip": 1.04581785, + "balance_loss_mlp": 1.01273131, + "epoch": 0.9491056666165639, + "flos": 24241677909120.0, + "grad_norm": 1.721719503742346, + "language_loss": 0.75832832, + "learning_rate": 2.707869629830495e-08, + "loss": 0.77975297, + "num_input_tokens_seen": 340585465, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11474609, + "step": 15786, + "time_per_iteration": 2.4903602600097656 + }, + { + "auxiliary_loss_clip": 0.0111319, + "auxiliary_loss_mlp": 0.01027191, + "balance_loss_clip": 1.04231942, + "balance_loss_mlp": 1.01618767, + "epoch": 0.949165789869232, + "flos": 24531261356160.0, + "grad_norm": 1.8439675087561551, + "language_loss": 0.7902782, + "learning_rate": 2.7014862723025335e-08, + "loss": 0.81168199, + "num_input_tokens_seen": 340606010, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11004639, + "step": 15787, + "time_per_iteration": 2.512911558151245 + }, + { + "auxiliary_loss_clip": 0.01110218, + "auxiliary_loss_mlp": 0.01031358, + "balance_loss_clip": 1.03964627, + "balance_loss_mlp": 1.01925254, + "epoch": 0.9492259131218999, + "flos": 22235348862720.0, + "grad_norm": 1.5183893715504995, + "language_loss": 0.76569688, + "learning_rate": 2.6951103963540388e-08, + "loss": 0.78711259, + "num_input_tokens_seen": 340626135, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.12115479, + "step": 15788, + "time_per_iteration": 2.450669288635254 + }, + { + "auxiliary_loss_clip": 0.01116268, + "auxiliary_loss_mlp": 0.01027364, + "balance_loss_clip": 1.04253018, + "balance_loss_mlp": 1.01490664, + "epoch": 0.9492860363745679, + "flos": 22966310632320.0, + "grad_norm": 1.7042531285400182, + "language_loss": 0.71921992, + "learning_rate": 2.6887420022266848e-08, + "loss": 0.74065632, + "num_input_tokens_seen": 340644870, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.12445068, + "step": 15789, + "time_per_iteration": 2.4464375972747803 + }, + { + "auxiliary_loss_clip": 0.01120697, + "auxiliary_loss_mlp": 0.01028191, + "balance_loss_clip": 1.04641509, + "balance_loss_mlp": 1.01590061, + "epoch": 0.9493461596272358, + "flos": 18370283754240.0, + "grad_norm": 1.861810331873219, + "language_loss": 0.73047048, + "learning_rate": 2.682381090161989e-08, + "loss": 0.75195932, + "num_input_tokens_seen": 340663695, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12298584, + "step": 15790, + "time_per_iteration": 3.9004878997802734 + }, + { + "auxiliary_loss_clip": 0.01117297, + "auxiliary_loss_mlp": 0.01041459, + "balance_loss_clip": 1.04017138, + "balance_loss_mlp": 1.02726746, + "epoch": 0.9494062828799038, + "flos": 20011724490240.0, + "grad_norm": 26.498422411532353, + "language_loss": 0.78069806, + "learning_rate": 2.6760276604012033e-08, + "loss": 0.80228567, + "num_input_tokens_seen": 340682970, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.14196777, + "step": 15791, + "time_per_iteration": 2.5264391899108887 + }, + { + "auxiliary_loss_clip": 0.01123033, + "auxiliary_loss_mlp": 0.01034316, + "balance_loss_clip": 1.04590797, + "balance_loss_mlp": 1.02166188, + "epoch": 0.9494664061325718, + "flos": 27228583313280.0, + "grad_norm": 2.5041459677815996, + "language_loss": 0.73603988, + "learning_rate": 2.6696817131852234e-08, + "loss": 0.75761342, + "num_input_tokens_seen": 340702275, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12658691, + "step": 15792, + "time_per_iteration": 2.498917579650879 + }, + { + "auxiliary_loss_clip": 0.01122144, + "auxiliary_loss_mlp": 0.010315, + "balance_loss_clip": 1.05109334, + "balance_loss_mlp": 1.02001476, + "epoch": 0.9495265293852397, + "flos": 18369816877440.0, + "grad_norm": 1.9260511490670227, + "language_loss": 0.78403246, + "learning_rate": 2.663343248754679e-08, + "loss": 0.80556893, + "num_input_tokens_seen": 340719060, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.11486816, + "step": 15793, + "time_per_iteration": 2.4405407905578613 + }, + { + "auxiliary_loss_clip": 0.01113757, + "auxiliary_loss_mlp": 0.01026198, + "balance_loss_clip": 1.04248834, + "balance_loss_mlp": 1.01531994, + "epoch": 0.9495866526379078, + "flos": 23075766351360.0, + "grad_norm": 1.77538768097165, + "language_loss": 0.7754041, + "learning_rate": 2.6570122673499562e-08, + "loss": 0.79680365, + "num_input_tokens_seen": 340737815, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.10882568, + "step": 15794, + "time_per_iteration": 2.5715672969818115 + }, + { + "auxiliary_loss_clip": 0.01117121, + "auxiliary_loss_mlp": 0.01029328, + "balance_loss_clip": 1.04251492, + "balance_loss_mlp": 1.01617289, + "epoch": 0.9496467758905757, + "flos": 17529902179200.0, + "grad_norm": 1.8762610722571254, + "language_loss": 0.61001873, + "learning_rate": 2.650688769211107e-08, + "loss": 0.6314832, + "num_input_tokens_seen": 340756035, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.13140869, + "step": 15795, + "time_per_iteration": 2.4502439498901367 + }, + { + "auxiliary_loss_clip": 0.01109718, + "auxiliary_loss_mlp": 0.01033006, + "balance_loss_clip": 1.03955722, + "balance_loss_mlp": 1.02067995, + "epoch": 0.9497068991432437, + "flos": 24133910129280.0, + "grad_norm": 1.9373312159363645, + "language_loss": 0.79552674, + "learning_rate": 2.644372754577895e-08, + "loss": 0.81695396, + "num_input_tokens_seen": 340775620, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.12347412, + "step": 15796, + "time_per_iteration": 2.4687387943267822 + }, + { + "auxiliary_loss_clip": 0.01114858, + "auxiliary_loss_mlp": 0.01028605, + "balance_loss_clip": 1.04185462, + "balance_loss_mlp": 1.01614726, + "epoch": 0.9497670223959116, + "flos": 20303319098880.0, + "grad_norm": 1.819127307109077, + "language_loss": 0.75575942, + "learning_rate": 2.6380642236898398e-08, + "loss": 0.77719402, + "num_input_tokens_seen": 340794510, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.12463379, + "step": 15797, + "time_per_iteration": 2.452348232269287 + }, + { + "auxiliary_loss_clip": 0.01117432, + "auxiliary_loss_mlp": 0.01031047, + "balance_loss_clip": 1.04443002, + "balance_loss_mlp": 1.01960897, + "epoch": 0.9498271456485796, + "flos": 13698916099200.0, + "grad_norm": 1.78931431736011, + "language_loss": 0.65787512, + "learning_rate": 2.6317631767861727e-08, + "loss": 0.67935991, + "num_input_tokens_seen": 340812955, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11444092, + "step": 15798, + "time_per_iteration": 2.4174482822418213 + }, + { + "auxiliary_loss_clip": 0.01109774, + "auxiliary_loss_mlp": 0.01033528, + "balance_loss_clip": 1.03695107, + "balance_loss_mlp": 1.0213387, + "epoch": 0.9498872689012475, + "flos": 20814004713600.0, + "grad_norm": 1.9682342650314666, + "language_loss": 0.77321208, + "learning_rate": 2.6254696141058575e-08, + "loss": 0.79464513, + "num_input_tokens_seen": 340829200, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.12194824, + "step": 15799, + "time_per_iteration": 2.5705716609954834 + }, + { + "auxiliary_loss_clip": 0.011109, + "auxiliary_loss_mlp": 0.01028435, + "balance_loss_clip": 1.04147625, + "balance_loss_mlp": 1.01708007, + "epoch": 0.9499473921539155, + "flos": 21032700670080.0, + "grad_norm": 9.4538256776306, + "language_loss": 0.7122637, + "learning_rate": 2.6191835358874814e-08, + "loss": 0.73365706, + "num_input_tokens_seen": 340848035, + "router_z_loss_clip": 0.69433594, + "router_z_loss_mlp": 0.11346436, + "step": 15800, + "time_per_iteration": 3.916278600692749 + }, + { + "auxiliary_loss_clip": 0.01117442, + "auxiliary_loss_mlp": 0.01027509, + "balance_loss_clip": 1.04548335, + "balance_loss_mlp": 1.01529551, + "epoch": 0.9500075154065835, + "flos": 20998693468800.0, + "grad_norm": 1.8235848581265715, + "language_loss": 0.7183094, + "learning_rate": 2.6129049423694315e-08, + "loss": 0.73975885, + "num_input_tokens_seen": 340870025, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.12219238, + "step": 15801, + "time_per_iteration": 2.5301971435546875 + }, + { + "auxiliary_loss_clip": 0.01110379, + "auxiliary_loss_mlp": 0.0102865, + "balance_loss_clip": 1.03910315, + "balance_loss_mlp": 1.01726544, + "epoch": 0.9500676386592515, + "flos": 25121956515840.0, + "grad_norm": 1.5577743304908418, + "language_loss": 0.80873561, + "learning_rate": 2.6066338337898508e-08, + "loss": 0.83012593, + "num_input_tokens_seen": 340892290, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.11383057, + "step": 15802, + "time_per_iteration": 2.5388574600219727 + }, + { + "auxiliary_loss_clip": 0.01112894, + "auxiliary_loss_mlp": 0.0103056, + "balance_loss_clip": 1.0396055, + "balance_loss_mlp": 1.01910436, + "epoch": 0.9501277619119194, + "flos": 27523625627520.0, + "grad_norm": 1.6250509325133804, + "language_loss": 0.67455745, + "learning_rate": 2.60037021038646e-08, + "loss": 0.69599199, + "num_input_tokens_seen": 340912260, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11462402, + "step": 15803, + "time_per_iteration": 2.4864988327026367 + }, + { + "auxiliary_loss_clip": 0.01110002, + "auxiliary_loss_mlp": 0.01029963, + "balance_loss_clip": 1.03877091, + "balance_loss_mlp": 1.01826239, + "epoch": 0.9501878851645874, + "flos": 20813968800000.0, + "grad_norm": 1.560340135016793, + "language_loss": 0.76011813, + "learning_rate": 2.5941140723968247e-08, + "loss": 0.78151774, + "num_input_tokens_seen": 340928930, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.11694336, + "step": 15804, + "time_per_iteration": 2.4398505687713623 + }, + { + "auxiliary_loss_clip": 0.01119231, + "auxiliary_loss_mlp": 0.01029384, + "balance_loss_clip": 1.04619753, + "balance_loss_mlp": 1.01773679, + "epoch": 0.9502480084172553, + "flos": 18369385914240.0, + "grad_norm": 2.10992556561311, + "language_loss": 0.73055023, + "learning_rate": 2.5878654200581775e-08, + "loss": 0.75203639, + "num_input_tokens_seen": 340946615, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11645508, + "step": 15805, + "time_per_iteration": 3.83486270904541 + }, + { + "auxiliary_loss_clip": 0.01114974, + "auxiliary_loss_mlp": 0.01030637, + "balance_loss_clip": 1.04167485, + "balance_loss_mlp": 1.01882899, + "epoch": 0.9503081316699233, + "flos": 23549607590400.0, + "grad_norm": 1.4990038823490397, + "language_loss": 0.80339527, + "learning_rate": 2.5816242536074618e-08, + "loss": 0.82485139, + "num_input_tokens_seen": 340967545, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11810303, + "step": 15806, + "time_per_iteration": 2.456216812133789 + }, + { + "auxiliary_loss_clip": 0.01118231, + "auxiliary_loss_mlp": 0.01031283, + "balance_loss_clip": 1.04307675, + "balance_loss_mlp": 1.01955247, + "epoch": 0.9503682549225914, + "flos": 18040444139520.0, + "grad_norm": 2.24013243676016, + "language_loss": 0.82449806, + "learning_rate": 2.5753905732813108e-08, + "loss": 0.84599316, + "num_input_tokens_seen": 340984955, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.11737061, + "step": 15807, + "time_per_iteration": 2.4225234985351562 + }, + { + "auxiliary_loss_clip": 0.01105713, + "auxiliary_loss_mlp": 0.01039242, + "balance_loss_clip": 1.03508663, + "balance_loss_mlp": 1.02551484, + "epoch": 0.9504283781752593, + "flos": 25886135387520.0, + "grad_norm": 1.9092682491397472, + "language_loss": 0.71934128, + "learning_rate": 2.5691643793161355e-08, + "loss": 0.74079078, + "num_input_tokens_seen": 341007300, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.13726807, + "step": 15808, + "time_per_iteration": 2.469066619873047 + }, + { + "auxiliary_loss_clip": 0.01117298, + "auxiliary_loss_mlp": 0.0102924, + "balance_loss_clip": 1.04515219, + "balance_loss_mlp": 1.01743197, + "epoch": 0.9504885014279273, + "flos": 22124025636480.0, + "grad_norm": 1.401730783299443, + "language_loss": 0.69823134, + "learning_rate": 2.562945671948058e-08, + "loss": 0.71969676, + "num_input_tokens_seen": 341026695, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11810303, + "step": 15809, + "time_per_iteration": 2.4371440410614014 + }, + { + "auxiliary_loss_clip": 0.01118443, + "auxiliary_loss_mlp": 0.0102599, + "balance_loss_clip": 1.04535043, + "balance_loss_mlp": 1.01408076, + "epoch": 0.9505486246805952, + "flos": 21615961714560.0, + "grad_norm": 1.7167627475106224, + "language_loss": 0.7562173, + "learning_rate": 2.5567344514128452e-08, + "loss": 0.77766168, + "num_input_tokens_seen": 341047080, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11907959, + "step": 15810, + "time_per_iteration": 2.502504587173462 + }, + { + "auxiliary_loss_clip": 0.01105947, + "auxiliary_loss_mlp": 0.0104466, + "balance_loss_clip": 1.03479636, + "balance_loss_mlp": 1.03054595, + "epoch": 0.9506087479332632, + "flos": 22528236360960.0, + "grad_norm": 1.4256511715385307, + "language_loss": 0.8024714, + "learning_rate": 2.5505307179460643e-08, + "loss": 0.82397747, + "num_input_tokens_seen": 341067310, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.14111328, + "step": 15811, + "time_per_iteration": 2.4805665016174316 + }, + { + "auxiliary_loss_clip": 0.01109987, + "auxiliary_loss_mlp": 0.0103165, + "balance_loss_clip": 1.03807092, + "balance_loss_mlp": 1.0200386, + "epoch": 0.9506688711859311, + "flos": 27527360641920.0, + "grad_norm": 2.2838126020304617, + "language_loss": 0.69883239, + "learning_rate": 2.5443344717829495e-08, + "loss": 0.72024882, + "num_input_tokens_seen": 341085110, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11602783, + "step": 15812, + "time_per_iteration": 2.5039680004119873 + }, + { + "auxiliary_loss_clip": 0.01119059, + "auxiliary_loss_mlp": 0.01029507, + "balance_loss_clip": 1.04600489, + "balance_loss_mlp": 1.01791978, + "epoch": 0.9507289944385992, + "flos": 19865783531520.0, + "grad_norm": 1.6524144373871226, + "language_loss": 0.65459681, + "learning_rate": 2.538145713158446e-08, + "loss": 0.67608249, + "num_input_tokens_seen": 341103190, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11590576, + "step": 15813, + "time_per_iteration": 2.4050445556640625 + }, + { + "auxiliary_loss_clip": 0.01108064, + "auxiliary_loss_mlp": 0.01033363, + "balance_loss_clip": 1.03567243, + "balance_loss_mlp": 1.02123332, + "epoch": 0.9507891176912671, + "flos": 25193274969600.0, + "grad_norm": 1.3294976872839455, + "language_loss": 0.70485705, + "learning_rate": 2.5319644423072327e-08, + "loss": 0.72627127, + "num_input_tokens_seen": 341125695, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.12133789, + "step": 15814, + "time_per_iteration": 2.630713939666748 + }, + { + "auxiliary_loss_clip": 0.01110047, + "auxiliary_loss_mlp": 0.0102583, + "balance_loss_clip": 1.04077017, + "balance_loss_mlp": 1.01494014, + "epoch": 0.9508492409439351, + "flos": 24899561458560.0, + "grad_norm": 1.922640258415821, + "language_loss": 0.63065475, + "learning_rate": 2.5257906594637445e-08, + "loss": 0.65201354, + "num_input_tokens_seen": 341143930, + "router_z_loss_clip": 0.69287109, + "router_z_loss_mlp": 0.10894775, + "step": 15815, + "time_per_iteration": 3.9167964458465576 + }, + { + "auxiliary_loss_clip": 0.01109793, + "auxiliary_loss_mlp": 0.01027264, + "balance_loss_clip": 1.03706098, + "balance_loss_mlp": 1.01623118, + "epoch": 0.950909364196603, + "flos": 29784094375680.0, + "grad_norm": 1.798178476932394, + "language_loss": 0.58614171, + "learning_rate": 2.519624364862061e-08, + "loss": 0.60751235, + "num_input_tokens_seen": 341164280, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11035156, + "step": 15816, + "time_per_iteration": 2.5125644207000732 + }, + { + "auxiliary_loss_clip": 0.01113375, + "auxiliary_loss_mlp": 0.01035619, + "balance_loss_clip": 1.040838, + "balance_loss_mlp": 1.02419841, + "epoch": 0.950969487449271, + "flos": 24717781704960.0, + "grad_norm": 1.5077297545547443, + "language_loss": 0.73400521, + "learning_rate": 2.513465558735994e-08, + "loss": 0.75549519, + "num_input_tokens_seen": 341183670, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11425781, + "step": 15817, + "time_per_iteration": 2.513885498046875 + }, + { + "auxiliary_loss_clip": 0.01117753, + "auxiliary_loss_mlp": 0.01035706, + "balance_loss_clip": 1.04489493, + "balance_loss_mlp": 1.02194321, + "epoch": 0.9510296107019389, + "flos": 13699167494400.0, + "grad_norm": 1.585230331866288, + "language_loss": 0.60092038, + "learning_rate": 2.5073142413190918e-08, + "loss": 0.622455, + "num_input_tokens_seen": 341201900, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.13757324, + "step": 15818, + "time_per_iteration": 2.4747891426086426 + }, + { + "auxiliary_loss_clip": 0.01114278, + "auxiliary_loss_mlp": 0.01034364, + "balance_loss_clip": 1.04004073, + "balance_loss_mlp": 1.02169788, + "epoch": 0.9510897339546069, + "flos": 17311852667520.0, + "grad_norm": 1.9484478674454881, + "language_loss": 0.69396555, + "learning_rate": 2.5011704128446552e-08, + "loss": 0.71545196, + "num_input_tokens_seen": 341218340, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12670898, + "step": 15819, + "time_per_iteration": 2.432887077331543 + }, + { + "auxiliary_loss_clip": 0.01118286, + "auxiliary_loss_mlp": 0.01027789, + "balance_loss_clip": 1.04458857, + "balance_loss_mlp": 1.01664281, + "epoch": 0.951149857207275, + "flos": 14793940166400.0, + "grad_norm": 1.7714254286129996, + "language_loss": 0.74199802, + "learning_rate": 2.49503407354561e-08, + "loss": 0.76345879, + "num_input_tokens_seen": 341235885, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11151123, + "step": 15820, + "time_per_iteration": 2.427631378173828 + }, + { + "auxiliary_loss_clip": 0.01117568, + "auxiliary_loss_mlp": 0.01036263, + "balance_loss_clip": 1.04261172, + "balance_loss_mlp": 1.02354944, + "epoch": 0.9512099804599429, + "flos": 19391152193280.0, + "grad_norm": 1.631894858264701, + "language_loss": 0.78702664, + "learning_rate": 2.4889052236546804e-08, + "loss": 0.80856502, + "num_input_tokens_seen": 341255280, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.12713623, + "step": 15821, + "time_per_iteration": 2.425996780395508 + }, + { + "auxiliary_loss_clip": 0.01108928, + "auxiliary_loss_mlp": 0.01033555, + "balance_loss_clip": 1.03758454, + "balance_loss_mlp": 1.0195719, + "epoch": 0.9512701037126109, + "flos": 36757874885760.0, + "grad_norm": 1.5687660531805125, + "language_loss": 0.71153194, + "learning_rate": 2.4827838634042586e-08, + "loss": 0.73295677, + "num_input_tokens_seen": 341279055, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.13983154, + "step": 15822, + "time_per_iteration": 2.602736473083496 + }, + { + "auxiliary_loss_clip": 0.0112056, + "auxiliary_loss_mlp": 0.01030842, + "balance_loss_clip": 1.05008364, + "balance_loss_mlp": 1.01956522, + "epoch": 0.9513302269652788, + "flos": 22638266697600.0, + "grad_norm": 1.6084359394156564, + "language_loss": 0.66335261, + "learning_rate": 2.47666999302647e-08, + "loss": 0.68486667, + "num_input_tokens_seen": 341298560, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.1126709, + "step": 15823, + "time_per_iteration": 2.455052137374878 + }, + { + "auxiliary_loss_clip": 0.01113247, + "auxiliary_loss_mlp": 0.01028256, + "balance_loss_clip": 1.04245985, + "balance_loss_mlp": 1.01706266, + "epoch": 0.9513903502179468, + "flos": 22893232412160.0, + "grad_norm": 1.5930895902494058, + "language_loss": 0.77249002, + "learning_rate": 2.4705636127531292e-08, + "loss": 0.79390502, + "num_input_tokens_seen": 341316650, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.11187744, + "step": 15824, + "time_per_iteration": 2.440856695175171 + }, + { + "auxiliary_loss_clip": 0.01119161, + "auxiliary_loss_mlp": 0.01027838, + "balance_loss_clip": 1.04383409, + "balance_loss_mlp": 1.01547015, + "epoch": 0.9514504734706147, + "flos": 27928626451200.0, + "grad_norm": 1.9307397584298691, + "language_loss": 0.73479527, + "learning_rate": 2.4644647228158065e-08, + "loss": 0.75626528, + "num_input_tokens_seen": 341336185, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.12365723, + "step": 15825, + "time_per_iteration": 2.5129566192626953 + }, + { + "auxiliary_loss_clip": 0.0103495, + "auxiliary_loss_mlp": 0.01001534, + "balance_loss_clip": 1.00990486, + "balance_loss_mlp": 1.00013828, + "epoch": 0.9515105967232828, + "flos": 67366767312000.0, + "grad_norm": 0.9010610209295914, + "language_loss": 0.53427339, + "learning_rate": 2.458373323445806e-08, + "loss": 0.55463815, + "num_input_tokens_seen": 341395795, + "router_z_loss_clip": 0.25048828, + "router_z_loss_mlp": 0.01396179, + "step": 15826, + "time_per_iteration": 3.0178730487823486 + }, + { + "auxiliary_loss_clip": 0.01110156, + "auxiliary_loss_mlp": 0.01035721, + "balance_loss_clip": 1.03848743, + "balance_loss_mlp": 1.02351367, + "epoch": 0.9515707199759507, + "flos": 25846525664640.0, + "grad_norm": 1.9502152598947557, + "language_loss": 0.72942317, + "learning_rate": 2.452289414874076e-08, + "loss": 0.75088191, + "num_input_tokens_seen": 341415675, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.12200928, + "step": 15827, + "time_per_iteration": 2.599844455718994 + }, + { + "auxiliary_loss_clip": 0.01113457, + "auxiliary_loss_mlp": 0.01030648, + "balance_loss_clip": 1.04067194, + "balance_loss_mlp": 1.01801169, + "epoch": 0.9516308432286187, + "flos": 21828983322240.0, + "grad_norm": 1.8399400584395127, + "language_loss": 0.74648213, + "learning_rate": 2.4462129973313207e-08, + "loss": 0.76792312, + "num_input_tokens_seen": 341432990, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.12634277, + "step": 15828, + "time_per_iteration": 2.459183931350708 + }, + { + "auxiliary_loss_clip": 0.01113322, + "auxiliary_loss_mlp": 0.01028752, + "balance_loss_clip": 1.04422057, + "balance_loss_mlp": 1.01795137, + "epoch": 0.9516909664812866, + "flos": 27269593666560.0, + "grad_norm": 1.6300987367972937, + "language_loss": 0.72961682, + "learning_rate": 2.440144071047978e-08, + "loss": 0.75103754, + "num_input_tokens_seen": 341454100, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 0.10803223, + "step": 15829, + "time_per_iteration": 2.5091350078582764 + }, + { + "auxiliary_loss_clip": 0.01112301, + "auxiliary_loss_mlp": 0.01027557, + "balance_loss_clip": 1.04062366, + "balance_loss_mlp": 1.0160234, + "epoch": 0.9517510897339546, + "flos": 21215342350080.0, + "grad_norm": 1.8040418572562131, + "language_loss": 0.61314255, + "learning_rate": 2.4340826362541533e-08, + "loss": 0.63454115, + "num_input_tokens_seen": 341472955, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11541748, + "step": 15830, + "time_per_iteration": 2.445950984954834 + }, + { + "auxiliary_loss_clip": 0.01111863, + "auxiliary_loss_mlp": 0.01031034, + "balance_loss_clip": 1.03720093, + "balance_loss_mlp": 1.01739657, + "epoch": 0.9518112129866225, + "flos": 18733986915840.0, + "grad_norm": 1.8347588856310633, + "language_loss": 0.73109329, + "learning_rate": 2.428028693179729e-08, + "loss": 0.75252223, + "num_input_tokens_seen": 341490165, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.1362915, + "step": 15831, + "time_per_iteration": 2.514193058013916 + }, + { + "auxiliary_loss_clip": 0.01111155, + "auxiliary_loss_mlp": 0.01024589, + "balance_loss_clip": 1.04051089, + "balance_loss_mlp": 1.01422358, + "epoch": 0.9518713362392905, + "flos": 16763676232320.0, + "grad_norm": 1.6350638408996698, + "language_loss": 0.65575892, + "learning_rate": 2.4219822420542545e-08, + "loss": 0.67711639, + "num_input_tokens_seen": 341508055, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.10369873, + "step": 15832, + "time_per_iteration": 2.4562764167785645 + }, + { + "auxiliary_loss_clip": 0.01111111, + "auxiliary_loss_mlp": 0.01030827, + "balance_loss_clip": 1.0426141, + "balance_loss_mlp": 1.01956129, + "epoch": 0.9519314594919586, + "flos": 15230649720960.0, + "grad_norm": 1.832486909070097, + "language_loss": 0.77921945, + "learning_rate": 2.4159432831070135e-08, + "loss": 0.80063879, + "num_input_tokens_seen": 341526155, + "router_z_loss_clip": 0.68408203, + "router_z_loss_mlp": 0.11260986, + "step": 15833, + "time_per_iteration": 2.4660210609436035 + }, + { + "auxiliary_loss_clip": 0.01105403, + "auxiliary_loss_mlp": 0.01028917, + "balance_loss_clip": 1.03667855, + "balance_loss_mlp": 1.01763415, + "epoch": 0.9519915827446265, + "flos": 19352943100800.0, + "grad_norm": 2.232953842725241, + "language_loss": 0.74940836, + "learning_rate": 2.4099118165670007e-08, + "loss": 0.77075154, + "num_input_tokens_seen": 341540450, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.11291504, + "step": 15834, + "time_per_iteration": 3.902404308319092 + }, + { + "auxiliary_loss_clip": 0.01118125, + "auxiliary_loss_mlp": 0.01035909, + "balance_loss_clip": 1.04231405, + "balance_loss_mlp": 1.02331996, + "epoch": 0.9520517059972945, + "flos": 22266303408000.0, + "grad_norm": 1.9561920811535685, + "language_loss": 0.76114285, + "learning_rate": 2.4038878426629216e-08, + "loss": 0.78268319, + "num_input_tokens_seen": 341557865, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12597656, + "step": 15835, + "time_per_iteration": 2.473698139190674 + }, + { + "auxiliary_loss_clip": 0.0111382, + "auxiliary_loss_mlp": 0.01029349, + "balance_loss_clip": 1.03921032, + "balance_loss_mlp": 1.01707649, + "epoch": 0.9521118292499624, + "flos": 14862313704960.0, + "grad_norm": 1.8468535280053286, + "language_loss": 0.65801084, + "learning_rate": 2.397871361623238e-08, + "loss": 0.67944252, + "num_input_tokens_seen": 341573890, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.12286377, + "step": 15836, + "time_per_iteration": 2.3896756172180176 + }, + { + "auxiliary_loss_clip": 0.01115271, + "auxiliary_loss_mlp": 0.01030834, + "balance_loss_clip": 1.04533887, + "balance_loss_mlp": 1.0174526, + "epoch": 0.9521719525026304, + "flos": 23508812718720.0, + "grad_norm": 1.6168596314413632, + "language_loss": 0.70554656, + "learning_rate": 2.391862373676057e-08, + "loss": 0.72700763, + "num_input_tokens_seen": 341593770, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.13378906, + "step": 15837, + "time_per_iteration": 2.4740610122680664 + }, + { + "auxiliary_loss_clip": 0.01120873, + "auxiliary_loss_mlp": 0.01033844, + "balance_loss_clip": 1.04470503, + "balance_loss_mlp": 1.01970553, + "epoch": 0.9522320757552983, + "flos": 19714922409600.0, + "grad_norm": 2.0475429496513424, + "language_loss": 0.73591256, + "learning_rate": 2.3858608790492617e-08, + "loss": 0.75745976, + "num_input_tokens_seen": 341612065, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.14147949, + "step": 15838, + "time_per_iteration": 2.4301674365997314 + }, + { + "auxiliary_loss_clip": 0.01109368, + "auxiliary_loss_mlp": 0.01028853, + "balance_loss_clip": 1.03730714, + "balance_loss_mlp": 1.01731324, + "epoch": 0.9522921990079664, + "flos": 25921291824000.0, + "grad_norm": 1.9059708099854131, + "language_loss": 0.78305864, + "learning_rate": 2.379866877970449e-08, + "loss": 0.80444086, + "num_input_tokens_seen": 341631365, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11541748, + "step": 15839, + "time_per_iteration": 2.5093283653259277 + }, + { + "auxiliary_loss_clip": 0.01113091, + "auxiliary_loss_mlp": 0.01032638, + "balance_loss_clip": 1.03974998, + "balance_loss_mlp": 1.02053213, + "epoch": 0.9523523222606343, + "flos": 19208115463680.0, + "grad_norm": 1.5696759647110405, + "language_loss": 0.80357504, + "learning_rate": 2.3738803706668585e-08, + "loss": 0.82503235, + "num_input_tokens_seen": 341650300, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.12109375, + "step": 15840, + "time_per_iteration": 2.440260171890259 + }, + { + "auxiliary_loss_clip": 0.01102379, + "auxiliary_loss_mlp": 0.01026962, + "balance_loss_clip": 1.03595018, + "balance_loss_mlp": 1.01725817, + "epoch": 0.9524124455133023, + "flos": 20921269703040.0, + "grad_norm": 2.4051779745052975, + "language_loss": 0.73025519, + "learning_rate": 2.3679013573655314e-08, + "loss": 0.75154865, + "num_input_tokens_seen": 341667680, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 0.09698486, + "step": 15841, + "time_per_iteration": 2.448629140853882 + }, + { + "auxiliary_loss_clip": 0.01110225, + "auxiliary_loss_mlp": 0.01026333, + "balance_loss_clip": 1.04286993, + "balance_loss_mlp": 1.01573551, + "epoch": 0.9524725687659702, + "flos": 18843550375680.0, + "grad_norm": 4.7798846346508705, + "language_loss": 0.79486686, + "learning_rate": 2.3619298382931972e-08, + "loss": 0.81623244, + "num_input_tokens_seen": 341685760, + "router_z_loss_clip": 0.67382812, + "router_z_loss_mlp": 0.10595703, + "step": 15842, + "time_per_iteration": 2.4237253665924072 + }, + { + "auxiliary_loss_clip": 0.01107836, + "auxiliary_loss_mlp": 0.01030968, + "balance_loss_clip": 1.03746378, + "balance_loss_mlp": 1.01885629, + "epoch": 0.9525326920186382, + "flos": 22674680110080.0, + "grad_norm": 2.5300856567167544, + "language_loss": 0.72651815, + "learning_rate": 2.3559658136762973e-08, + "loss": 0.74790615, + "num_input_tokens_seen": 341705300, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.12109375, + "step": 15843, + "time_per_iteration": 2.48685884475708 + }, + { + "auxiliary_loss_clip": 0.01113575, + "auxiliary_loss_mlp": 0.01026351, + "balance_loss_clip": 1.03970361, + "balance_loss_mlp": 1.01360118, + "epoch": 0.9525928152713061, + "flos": 22086642556800.0, + "grad_norm": 1.7166837108979212, + "language_loss": 0.78456783, + "learning_rate": 2.3500092837409612e-08, + "loss": 0.80596709, + "num_input_tokens_seen": 341724565, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.12750244, + "step": 15844, + "time_per_iteration": 3.9579453468322754 + }, + { + "auxiliary_loss_clip": 0.01121951, + "auxiliary_loss_mlp": 0.01029557, + "balance_loss_clip": 1.0437876, + "balance_loss_mlp": 1.01593161, + "epoch": 0.9526529385239741, + "flos": 20704728562560.0, + "grad_norm": 2.6035996945814697, + "language_loss": 0.70322108, + "learning_rate": 2.3440602487130977e-08, + "loss": 0.72473609, + "num_input_tokens_seen": 341743605, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.13635254, + "step": 15845, + "time_per_iteration": 2.6254398822784424 + }, + { + "auxiliary_loss_clip": 0.0111572, + "auxiliary_loss_mlp": 0.01029726, + "balance_loss_clip": 1.04240274, + "balance_loss_mlp": 1.01828218, + "epoch": 0.9527130617766422, + "flos": 23368043318400.0, + "grad_norm": 1.491870025601101, + "language_loss": 0.75406134, + "learning_rate": 2.338118708818282e-08, + "loss": 0.77551579, + "num_input_tokens_seen": 341763475, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11444092, + "step": 15846, + "time_per_iteration": 2.5227959156036377 + }, + { + "auxiliary_loss_clip": 0.01114066, + "auxiliary_loss_mlp": 0.01026751, + "balance_loss_clip": 1.04273272, + "balance_loss_mlp": 1.01536679, + "epoch": 0.9527731850293101, + "flos": 18985935888000.0, + "grad_norm": 1.8207800860576129, + "language_loss": 0.77962095, + "learning_rate": 2.3321846642817998e-08, + "loss": 0.80102909, + "num_input_tokens_seen": 341781265, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.1138916, + "step": 15847, + "time_per_iteration": 2.473320245742798 + }, + { + "auxiliary_loss_clip": 0.01118312, + "auxiliary_loss_mlp": 0.01032825, + "balance_loss_clip": 1.04864156, + "balance_loss_mlp": 1.0222218, + "epoch": 0.9528333082819781, + "flos": 19318038059520.0, + "grad_norm": 2.3647056949921414, + "language_loss": 0.77795088, + "learning_rate": 2.326258115328672e-08, + "loss": 0.79946232, + "num_input_tokens_seen": 341798825, + "router_z_loss_clip": 0.69677734, + "router_z_loss_mlp": 0.1060791, + "step": 15848, + "time_per_iteration": 3.8768157958984375 + }, + { + "auxiliary_loss_clip": 0.01116977, + "auxiliary_loss_mlp": 0.01035893, + "balance_loss_clip": 1.04142487, + "balance_loss_mlp": 1.02331674, + "epoch": 0.952893431534646, + "flos": 23951340276480.0, + "grad_norm": 1.5494769042799272, + "language_loss": 0.71953094, + "learning_rate": 2.320339062183674e-08, + "loss": 0.7410596, + "num_input_tokens_seen": 341819480, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.12585449, + "step": 15849, + "time_per_iteration": 2.4547226428985596 + }, + { + "auxiliary_loss_clip": 0.0111583, + "auxiliary_loss_mlp": 0.01034182, + "balance_loss_clip": 1.03961205, + "balance_loss_mlp": 1.02123582, + "epoch": 0.952953554787314, + "flos": 21030545854080.0, + "grad_norm": 1.6565152011730973, + "language_loss": 0.75197113, + "learning_rate": 2.314427505071226e-08, + "loss": 0.77347124, + "num_input_tokens_seen": 341838035, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.1293335, + "step": 15850, + "time_per_iteration": 2.4602878093719482 + }, + { + "auxiliary_loss_clip": 0.01112822, + "auxiliary_loss_mlp": 0.01031202, + "balance_loss_clip": 1.03821945, + "balance_loss_mlp": 1.01998484, + "epoch": 0.9530136780399819, + "flos": 22382870019840.0, + "grad_norm": 2.7543315214610677, + "language_loss": 0.72556132, + "learning_rate": 2.308523444215482e-08, + "loss": 0.74700153, + "num_input_tokens_seen": 341855895, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11212158, + "step": 15851, + "time_per_iteration": 2.4039769172668457 + }, + { + "auxiliary_loss_clip": 0.01112159, + "auxiliary_loss_mlp": 0.01026158, + "balance_loss_clip": 1.04194081, + "balance_loss_mlp": 1.01460695, + "epoch": 0.95307380129265, + "flos": 22159613036160.0, + "grad_norm": 1.8844524225079666, + "language_loss": 0.79771686, + "learning_rate": 2.3026268798403525e-08, + "loss": 0.81910002, + "num_input_tokens_seen": 341875240, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.11553955, + "step": 15852, + "time_per_iteration": 2.4360547065734863 + }, + { + "auxiliary_loss_clip": 0.01112175, + "auxiliary_loss_mlp": 0.01030337, + "balance_loss_clip": 1.03858304, + "balance_loss_mlp": 1.01828527, + "epoch": 0.9531339245453179, + "flos": 44022747214080.0, + "grad_norm": 1.6915298673429549, + "language_loss": 0.59560794, + "learning_rate": 2.2967378121694138e-08, + "loss": 0.61703306, + "num_input_tokens_seen": 341901020, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.12060547, + "step": 15853, + "time_per_iteration": 2.638500452041626 + }, + { + "auxiliary_loss_clip": 0.01105104, + "auxiliary_loss_mlp": 0.01029599, + "balance_loss_clip": 1.0362488, + "balance_loss_mlp": 1.01857769, + "epoch": 0.9531940477979859, + "flos": 20266690204800.0, + "grad_norm": 1.867975951082455, + "language_loss": 0.72642529, + "learning_rate": 2.290856241425998e-08, + "loss": 0.74777234, + "num_input_tokens_seen": 341919365, + "router_z_loss_clip": 0.68798828, + "router_z_loss_mlp": 0.11029053, + "step": 15854, + "time_per_iteration": 2.430711269378662 + }, + { + "auxiliary_loss_clip": 0.01111697, + "auxiliary_loss_mlp": 0.01031222, + "balance_loss_clip": 1.0393002, + "balance_loss_mlp": 1.01977158, + "epoch": 0.9532541710506538, + "flos": 25335732309120.0, + "grad_norm": 3.076345823015004, + "language_loss": 0.67409062, + "learning_rate": 2.284982167833127e-08, + "loss": 0.6955198, + "num_input_tokens_seen": 341939985, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11462402, + "step": 15855, + "time_per_iteration": 2.4515798091888428 + }, + { + "auxiliary_loss_clip": 0.01119106, + "auxiliary_loss_mlp": 0.01027117, + "balance_loss_clip": 1.0460422, + "balance_loss_mlp": 1.01562524, + "epoch": 0.9533142943033218, + "flos": 26469288691200.0, + "grad_norm": 1.4891850620058749, + "language_loss": 0.76474667, + "learning_rate": 2.279115591613556e-08, + "loss": 0.78620887, + "num_input_tokens_seen": 341959255, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11486816, + "step": 15856, + "time_per_iteration": 2.4975426197052 + }, + { + "auxiliary_loss_clip": 0.01114566, + "auxiliary_loss_mlp": 0.01029902, + "balance_loss_clip": 1.04361558, + "balance_loss_mlp": 1.01890481, + "epoch": 0.9533744175559897, + "flos": 23656944407040.0, + "grad_norm": 1.963322331131384, + "language_loss": 0.78153175, + "learning_rate": 2.2732565129897075e-08, + "loss": 0.80297649, + "num_input_tokens_seen": 341977205, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.10992432, + "step": 15857, + "time_per_iteration": 3.9341988563537598 + }, + { + "auxiliary_loss_clip": 0.01040428, + "auxiliary_loss_mlp": 0.0100144, + "balance_loss_clip": 1.01588249, + "balance_loss_mlp": 1.0001545, + "epoch": 0.9534345408086577, + "flos": 61052055500160.0, + "grad_norm": 0.7122352446263102, + "language_loss": 0.62631595, + "learning_rate": 2.267404932183803e-08, + "loss": 0.64673465, + "num_input_tokens_seen": 342038545, + "router_z_loss_clip": 0.24560547, + "router_z_loss_mlp": 0.01286316, + "step": 15858, + "time_per_iteration": 3.013793468475342 + }, + { + "auxiliary_loss_clip": 0.01112293, + "auxiliary_loss_mlp": 0.01027105, + "balance_loss_clip": 1.04118133, + "balance_loss_mlp": 1.01622725, + "epoch": 0.9534946640613258, + "flos": 18951677291520.0, + "grad_norm": 1.6025548854855323, + "language_loss": 0.56786615, + "learning_rate": 2.2615608494177097e-08, + "loss": 0.5892601, + "num_input_tokens_seen": 342058195, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.10882568, + "step": 15859, + "time_per_iteration": 2.4896349906921387 + }, + { + "auxiliary_loss_clip": 0.01110342, + "auxiliary_loss_mlp": 0.01027095, + "balance_loss_clip": 1.04174614, + "balance_loss_mlp": 1.01649094, + "epoch": 0.9535547873139937, + "flos": 16654292340480.0, + "grad_norm": 2.081904020412875, + "language_loss": 0.81829786, + "learning_rate": 2.2557242649130504e-08, + "loss": 0.83967221, + "num_input_tokens_seen": 342075025, + "router_z_loss_clip": 0.68652344, + "router_z_loss_mlp": 0.1060791, + "step": 15860, + "time_per_iteration": 2.4223568439483643 + }, + { + "auxiliary_loss_clip": 0.01112247, + "auxiliary_loss_mlp": 0.01025176, + "balance_loss_clip": 1.04048371, + "balance_loss_mlp": 1.01456594, + "epoch": 0.9536149105666617, + "flos": 20667776446080.0, + "grad_norm": 1.8306277814543688, + "language_loss": 0.66511631, + "learning_rate": 2.249895178891159e-08, + "loss": 0.68649054, + "num_input_tokens_seen": 342094595, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.1060791, + "step": 15861, + "time_per_iteration": 2.507289409637451 + }, + { + "auxiliary_loss_clip": 0.0111452, + "auxiliary_loss_mlp": 0.01034238, + "balance_loss_clip": 1.04206276, + "balance_loss_mlp": 1.02205479, + "epoch": 0.9536750338193296, + "flos": 30700499086080.0, + "grad_norm": 2.0978198651274313, + "language_loss": 0.65639687, + "learning_rate": 2.244073591573037e-08, + "loss": 0.67788446, + "num_input_tokens_seen": 342115970, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.12182617, + "step": 15862, + "time_per_iteration": 2.524043321609497 + }, + { + "auxiliary_loss_clip": 0.01115252, + "auxiliary_loss_mlp": 0.01026629, + "balance_loss_clip": 1.04604614, + "balance_loss_mlp": 1.01574469, + "epoch": 0.9537351570719976, + "flos": 20405484357120.0, + "grad_norm": 1.567114316491417, + "language_loss": 0.67559779, + "learning_rate": 2.238259503179485e-08, + "loss": 0.69701654, + "num_input_tokens_seen": 342134080, + "router_z_loss_clip": 0.69189453, + "router_z_loss_mlp": 0.10888672, + "step": 15863, + "time_per_iteration": 2.4824912548065186 + }, + { + "auxiliary_loss_clip": 0.01114865, + "auxiliary_loss_mlp": 0.01025864, + "balance_loss_clip": 1.04215086, + "balance_loss_mlp": 1.01430643, + "epoch": 0.9537952803246655, + "flos": 29929245235200.0, + "grad_norm": 1.9076589690226107, + "language_loss": 0.78166467, + "learning_rate": 2.2324529139309267e-08, + "loss": 0.80307198, + "num_input_tokens_seen": 342154725, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11560059, + "step": 15864, + "time_per_iteration": 2.5240824222564697 + }, + { + "auxiliary_loss_clip": 0.01111212, + "auxiliary_loss_mlp": 0.01025825, + "balance_loss_clip": 1.04103947, + "balance_loss_mlp": 1.01482224, + "epoch": 0.9538554035773336, + "flos": 20521404524160.0, + "grad_norm": 1.9083461564786912, + "language_loss": 0.59707069, + "learning_rate": 2.226653824047586e-08, + "loss": 0.6184411, + "num_input_tokens_seen": 342172275, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.11004639, + "step": 15865, + "time_per_iteration": 2.4518887996673584 + }, + { + "auxiliary_loss_clip": 0.0111978, + "auxiliary_loss_mlp": 0.01027868, + "balance_loss_clip": 1.04711342, + "balance_loss_mlp": 1.01636386, + "epoch": 0.9539155268300015, + "flos": 18406517598720.0, + "grad_norm": 1.8196483899334595, + "language_loss": 0.7008121, + "learning_rate": 2.2208622337493765e-08, + "loss": 0.72228855, + "num_input_tokens_seen": 342190880, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.1149292, + "step": 15866, + "time_per_iteration": 2.4484450817108154 + }, + { + "auxiliary_loss_clip": 0.01109401, + "auxiliary_loss_mlp": 0.01030251, + "balance_loss_clip": 1.0370959, + "balance_loss_mlp": 1.01759672, + "epoch": 0.9539756500826695, + "flos": 26213281482240.0, + "grad_norm": 2.7479305210293288, + "language_loss": 0.84656936, + "learning_rate": 2.215078143255855e-08, + "loss": 0.86796594, + "num_input_tokens_seen": 342208165, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.12652588, + "step": 15867, + "time_per_iteration": 2.5139598846435547 + }, + { + "auxiliary_loss_clip": 0.01037103, + "auxiliary_loss_mlp": 0.01002978, + "balance_loss_clip": 1.01256573, + "balance_loss_mlp": 1.00136971, + "epoch": 0.9540357733353374, + "flos": 68289097766400.0, + "grad_norm": 0.7681989114263449, + "language_loss": 0.61813849, + "learning_rate": 2.2093015527864024e-08, + "loss": 0.63853931, + "num_input_tokens_seen": 342277110, + "router_z_loss_clip": 0.24536133, + "router_z_loss_mlp": 0.01609802, + "step": 15868, + "time_per_iteration": 3.1630001068115234 + }, + { + "auxiliary_loss_clip": 0.01113859, + "auxiliary_loss_mlp": 0.01030898, + "balance_loss_clip": 1.04220104, + "balance_loss_mlp": 1.0171175, + "epoch": 0.9540958965880054, + "flos": 21288276915840.0, + "grad_norm": 2.2423505109199247, + "language_loss": 0.60059196, + "learning_rate": 2.2035324625600425e-08, + "loss": 0.6220395, + "num_input_tokens_seen": 342294695, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.13775635, + "step": 15869, + "time_per_iteration": 2.4834070205688477 + }, + { + "auxiliary_loss_clip": 0.01108114, + "auxiliary_loss_mlp": 0.01034845, + "balance_loss_clip": 1.03702652, + "balance_loss_mlp": 1.02316236, + "epoch": 0.9541560198406733, + "flos": 19751407649280.0, + "grad_norm": 1.8158982304416946, + "language_loss": 0.71148098, + "learning_rate": 2.197770872795579e-08, + "loss": 0.73291057, + "num_input_tokens_seen": 342314970, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.11682129, + "step": 15870, + "time_per_iteration": 2.4616620540618896 + }, + { + "auxiliary_loss_clip": 0.01110519, + "auxiliary_loss_mlp": 0.0103306, + "balance_loss_clip": 1.0390079, + "balance_loss_mlp": 1.02019179, + "epoch": 0.9542161430933414, + "flos": 24715626888960.0, + "grad_norm": 1.9739256331065556, + "language_loss": 0.76899803, + "learning_rate": 2.1920167837114368e-08, + "loss": 0.79043376, + "num_input_tokens_seen": 342334255, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.12860107, + "step": 15871, + "time_per_iteration": 2.486213445663452 + }, + { + "auxiliary_loss_clip": 0.01120278, + "auxiliary_loss_mlp": 0.01029337, + "balance_loss_clip": 1.04557383, + "balance_loss_mlp": 1.01710594, + "epoch": 0.9542762663460094, + "flos": 31065818359680.0, + "grad_norm": 1.942673704852555, + "language_loss": 0.58276367, + "learning_rate": 2.1862701955258634e-08, + "loss": 0.60425979, + "num_input_tokens_seen": 342354730, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.12243652, + "step": 15872, + "time_per_iteration": 2.561336040496826 + }, + { + "auxiliary_loss_clip": 0.01121691, + "auxiliary_loss_mlp": 0.01031114, + "balance_loss_clip": 1.04636514, + "balance_loss_mlp": 1.01783371, + "epoch": 0.9543363895986773, + "flos": 20776729374720.0, + "grad_norm": 1.5222843385198086, + "language_loss": 0.74898839, + "learning_rate": 2.1805311084567514e-08, + "loss": 0.77051651, + "num_input_tokens_seen": 342374565, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.13287354, + "step": 15873, + "time_per_iteration": 2.4932403564453125 + }, + { + "auxiliary_loss_clip": 0.01122143, + "auxiliary_loss_mlp": 0.01030871, + "balance_loss_clip": 1.05007863, + "balance_loss_mlp": 1.01895583, + "epoch": 0.9543965128513453, + "flos": 24462744163200.0, + "grad_norm": 1.7848021769033398, + "language_loss": 0.62337172, + "learning_rate": 2.1747995227217265e-08, + "loss": 0.64490187, + "num_input_tokens_seen": 342394590, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11914062, + "step": 15874, + "time_per_iteration": 2.4513418674468994 + }, + { + "auxiliary_loss_clip": 0.01111306, + "auxiliary_loss_mlp": 0.01033284, + "balance_loss_clip": 1.04148304, + "balance_loss_mlp": 1.0221858, + "epoch": 0.9544566361040132, + "flos": 15261532439040.0, + "grad_norm": 1.9367361899810094, + "language_loss": 0.89711672, + "learning_rate": 2.169075438538104e-08, + "loss": 0.91856265, + "num_input_tokens_seen": 342410445, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.11096191, + "step": 15875, + "time_per_iteration": 2.410435676574707 + }, + { + "auxiliary_loss_clip": 0.01115365, + "auxiliary_loss_mlp": 0.01035508, + "balance_loss_clip": 1.04069483, + "balance_loss_mlp": 1.02155471, + "epoch": 0.9545167593566812, + "flos": 25918777872000.0, + "grad_norm": 1.6780417750340575, + "language_loss": 0.67682695, + "learning_rate": 2.1633588561229765e-08, + "loss": 0.69833565, + "num_input_tokens_seen": 342430970, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.1394043, + "step": 15876, + "time_per_iteration": 2.453185558319092 + }, + { + "auxiliary_loss_clip": 0.01114478, + "auxiliary_loss_mlp": 0.0102776, + "balance_loss_clip": 1.04035974, + "balance_loss_mlp": 1.01483142, + "epoch": 0.9545768826093491, + "flos": 25628188844160.0, + "grad_norm": 1.872629148336086, + "language_loss": 0.69364387, + "learning_rate": 2.1576497756931267e-08, + "loss": 0.71506625, + "num_input_tokens_seen": 342449505, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.12927246, + "step": 15877, + "time_per_iteration": 3.867607831954956 + }, + { + "auxiliary_loss_clip": 0.01120104, + "auxiliary_loss_mlp": 0.01031267, + "balance_loss_clip": 1.04460549, + "balance_loss_mlp": 1.01885116, + "epoch": 0.9546370058620172, + "flos": 22491499726080.0, + "grad_norm": 1.620394297094295, + "language_loss": 0.71060354, + "learning_rate": 2.1519481974650035e-08, + "loss": 0.7321173, + "num_input_tokens_seen": 342470390, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.1239624, + "step": 15878, + "time_per_iteration": 2.4820525646209717 + }, + { + "auxiliary_loss_clip": 0.01113077, + "auxiliary_loss_mlp": 0.0102635, + "balance_loss_clip": 1.04284096, + "balance_loss_mlp": 1.01495993, + "epoch": 0.9546971291146851, + "flos": 24609582961920.0, + "grad_norm": 1.3815972219055277, + "language_loss": 0.68414694, + "learning_rate": 2.1462541216548335e-08, + "loss": 0.70554125, + "num_input_tokens_seen": 342492560, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.11401367, + "step": 15879, + "time_per_iteration": 2.4881293773651123 + }, + { + "auxiliary_loss_clip": 0.01104619, + "auxiliary_loss_mlp": 0.01026393, + "balance_loss_clip": 1.03539312, + "balance_loss_mlp": 1.01515198, + "epoch": 0.9547572523673531, + "flos": 28657756627200.0, + "grad_norm": 2.096825042937398, + "language_loss": 0.85215878, + "learning_rate": 2.1405675484785334e-08, + "loss": 0.87346882, + "num_input_tokens_seen": 342512315, + "router_z_loss_clip": 0.69189453, + "router_z_loss_mlp": 0.11248779, + "step": 15880, + "time_per_iteration": 2.520642042160034 + }, + { + "auxiliary_loss_clip": 0.01114918, + "auxiliary_loss_mlp": 0.01028142, + "balance_loss_clip": 1.04175806, + "balance_loss_mlp": 1.016096, + "epoch": 0.954817375620021, + "flos": 33802606385280.0, + "grad_norm": 2.171131043059789, + "language_loss": 0.72172385, + "learning_rate": 2.134888478151753e-08, + "loss": 0.74315447, + "num_input_tokens_seen": 342533060, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.12054443, + "step": 15881, + "time_per_iteration": 2.545368194580078 + }, + { + "auxiliary_loss_clip": 0.01116779, + "auxiliary_loss_mlp": 0.01034357, + "balance_loss_clip": 1.04685378, + "balance_loss_mlp": 1.02254343, + "epoch": 0.954877498872689, + "flos": 14428225843200.0, + "grad_norm": 1.8507035283796498, + "language_loss": 0.71428835, + "learning_rate": 2.1292169108898083e-08, + "loss": 0.73579973, + "num_input_tokens_seen": 342550830, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.1182251, + "step": 15882, + "time_per_iteration": 2.417062759399414 + }, + { + "auxiliary_loss_clip": 0.01113042, + "auxiliary_loss_mlp": 0.01029592, + "balance_loss_clip": 1.04167843, + "balance_loss_mlp": 1.01801038, + "epoch": 0.9549376221253569, + "flos": 59269447336320.0, + "grad_norm": 1.8244909207197848, + "language_loss": 0.6605351, + "learning_rate": 2.1235528469078168e-08, + "loss": 0.68196142, + "num_input_tokens_seen": 342575070, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.11572266, + "step": 15883, + "time_per_iteration": 2.852433919906616 + }, + { + "auxiliary_loss_clip": 0.01116652, + "auxiliary_loss_mlp": 0.01029713, + "balance_loss_clip": 1.04336452, + "balance_loss_mlp": 1.01708853, + "epoch": 0.954997745378025, + "flos": 17274397760640.0, + "grad_norm": 3.618745967484809, + "language_loss": 0.77843046, + "learning_rate": 2.1178962864205175e-08, + "loss": 0.79989409, + "num_input_tokens_seen": 342592215, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.1262207, + "step": 15884, + "time_per_iteration": 2.4244251251220703 + }, + { + "auxiliary_loss_clip": 0.01115017, + "auxiliary_loss_mlp": 0.01029119, + "balance_loss_clip": 1.04127336, + "balance_loss_mlp": 1.01544607, + "epoch": 0.955057868630693, + "flos": 13006378903680.0, + "grad_norm": 1.7337095941943577, + "language_loss": 0.77905244, + "learning_rate": 2.1122472296424054e-08, + "loss": 0.80049384, + "num_input_tokens_seen": 342610030, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.13671875, + "step": 15885, + "time_per_iteration": 2.422206401824951 + }, + { + "auxiliary_loss_clip": 0.01113506, + "auxiliary_loss_mlp": 0.01031453, + "balance_loss_clip": 1.04056001, + "balance_loss_mlp": 1.02019382, + "epoch": 0.9551179918833609, + "flos": 22637692080000.0, + "grad_norm": 1.8789414249181609, + "language_loss": 0.70517719, + "learning_rate": 2.1066056767877317e-08, + "loss": 0.72662675, + "num_input_tokens_seen": 342626475, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.1126709, + "step": 15886, + "time_per_iteration": 2.486875534057617 + }, + { + "auxiliary_loss_clip": 0.01115084, + "auxiliary_loss_mlp": 0.01037396, + "balance_loss_clip": 1.03921306, + "balance_loss_mlp": 1.02366364, + "epoch": 0.9551781151360289, + "flos": 21542811667200.0, + "grad_norm": 1.5915135918766294, + "language_loss": 0.72717571, + "learning_rate": 2.1009716280703916e-08, + "loss": 0.7487005, + "num_input_tokens_seen": 342646645, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.13720703, + "step": 15887, + "time_per_iteration": 2.500295400619507 + }, + { + "auxiliary_loss_clip": 0.0110282, + "auxiliary_loss_mlp": 0.01026597, + "balance_loss_clip": 1.03446889, + "balance_loss_mlp": 1.01588058, + "epoch": 0.9552382383886968, + "flos": 20702250524160.0, + "grad_norm": 2.5724629743662586, + "language_loss": 0.57258856, + "learning_rate": 2.0953450837040364e-08, + "loss": 0.5938828, + "num_input_tokens_seen": 342663615, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 0.10717773, + "step": 15888, + "time_per_iteration": 3.866438388824463 + }, + { + "auxiliary_loss_clip": 0.01034112, + "auxiliary_loss_mlp": 0.01001887, + "balance_loss_clip": 1.00988817, + "balance_loss_mlp": 1.00058126, + "epoch": 0.9552983616413648, + "flos": 67769792887680.0, + "grad_norm": 0.7097053123400656, + "language_loss": 0.57777643, + "learning_rate": 2.0897260439020514e-08, + "loss": 0.59813643, + "num_input_tokens_seen": 342728275, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.01306152, + "step": 15889, + "time_per_iteration": 3.134124517440796 + }, + { + "auxiliary_loss_clip": 0.01107496, + "auxiliary_loss_mlp": 0.01029837, + "balance_loss_clip": 1.03456044, + "balance_loss_mlp": 1.01646805, + "epoch": 0.9553584848940327, + "flos": 21579979265280.0, + "grad_norm": 1.7817939877995905, + "language_loss": 0.6695866, + "learning_rate": 2.084114508877466e-08, + "loss": 0.69095993, + "num_input_tokens_seen": 342748860, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.13360596, + "step": 15890, + "time_per_iteration": 2.4941070079803467 + }, + { + "auxiliary_loss_clip": 0.01112741, + "auxiliary_loss_mlp": 0.01033147, + "balance_loss_clip": 1.0417037, + "balance_loss_mlp": 1.02198935, + "epoch": 0.9554186081467008, + "flos": 24208173498240.0, + "grad_norm": 1.529438758375386, + "language_loss": 0.74033213, + "learning_rate": 2.0785104788430874e-08, + "loss": 0.76179099, + "num_input_tokens_seen": 342769705, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.11157227, + "step": 15891, + "time_per_iteration": 2.502499580383301 + }, + { + "auxiliary_loss_clip": 0.01105022, + "auxiliary_loss_mlp": 0.01034928, + "balance_loss_clip": 1.03688323, + "balance_loss_mlp": 1.02319217, + "epoch": 0.9554787313993687, + "flos": 16251554073600.0, + "grad_norm": 1.9359111027063902, + "language_loss": 0.78155529, + "learning_rate": 2.072913954011435e-08, + "loss": 0.80295473, + "num_input_tokens_seen": 342787000, + "router_z_loss_clip": 0.68066406, + "router_z_loss_mlp": 0.11724854, + "step": 15892, + "time_per_iteration": 3.8522002696990967 + }, + { + "auxiliary_loss_clip": 0.01117304, + "auxiliary_loss_mlp": 0.01032179, + "balance_loss_clip": 1.04411435, + "balance_loss_mlp": 1.01975763, + "epoch": 0.9555388546520367, + "flos": 23404133508480.0, + "grad_norm": 1.3936832499796883, + "language_loss": 0.69915885, + "learning_rate": 2.0673249345947386e-08, + "loss": 0.72065371, + "num_input_tokens_seen": 342807795, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.12420654, + "step": 15893, + "time_per_iteration": 2.5502443313598633 + }, + { + "auxiliary_loss_clip": 0.01120664, + "auxiliary_loss_mlp": 0.01029397, + "balance_loss_clip": 1.04999471, + "balance_loss_mlp": 1.01634359, + "epoch": 0.9555989779047046, + "flos": 14794047907200.0, + "grad_norm": 2.25900890674103, + "language_loss": 0.65628558, + "learning_rate": 2.0617434208048955e-08, + "loss": 0.67778617, + "num_input_tokens_seen": 342825490, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.1305542, + "step": 15894, + "time_per_iteration": 2.4632298946380615 + }, + { + "auxiliary_loss_clip": 0.01114129, + "auxiliary_loss_mlp": 0.01030006, + "balance_loss_clip": 1.04102755, + "balance_loss_mlp": 1.0177871, + "epoch": 0.9556591011573726, + "flos": 22236749493120.0, + "grad_norm": 1.7399733968061795, + "language_loss": 0.81979418, + "learning_rate": 2.056169412853581e-08, + "loss": 0.84123552, + "num_input_tokens_seen": 342844965, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.12213135, + "step": 15895, + "time_per_iteration": 2.498997688293457 + }, + { + "auxiliary_loss_clip": 0.01112717, + "auxiliary_loss_mlp": 0.01036158, + "balance_loss_clip": 1.04013872, + "balance_loss_mlp": 1.02251458, + "epoch": 0.9557192244100405, + "flos": 27855296835840.0, + "grad_norm": 2.1195902384604923, + "language_loss": 0.72425532, + "learning_rate": 2.0506029109521593e-08, + "loss": 0.74574405, + "num_input_tokens_seen": 342865915, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.13659668, + "step": 15896, + "time_per_iteration": 2.4716641902923584 + }, + { + "auxiliary_loss_clip": 0.01110042, + "auxiliary_loss_mlp": 0.0102933, + "balance_loss_clip": 1.04005992, + "balance_loss_mlp": 1.01770711, + "epoch": 0.9557793476627086, + "flos": 17602800831360.0, + "grad_norm": 1.7754220706424537, + "language_loss": 0.79504752, + "learning_rate": 2.045043915311706e-08, + "loss": 0.81644118, + "num_input_tokens_seen": 342884000, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.1161499, + "step": 15897, + "time_per_iteration": 2.4089574813842773 + }, + { + "auxiliary_loss_clip": 0.01107483, + "auxiliary_loss_mlp": 0.01031298, + "balance_loss_clip": 1.03661883, + "balance_loss_mlp": 1.01748168, + "epoch": 0.9558394709153766, + "flos": 23875496709120.0, + "grad_norm": 1.5095591457411193, + "language_loss": 0.72932971, + "learning_rate": 2.03949242614303e-08, + "loss": 0.75071752, + "num_input_tokens_seen": 342903095, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.13812256, + "step": 15898, + "time_per_iteration": 2.4472336769104004 + }, + { + "auxiliary_loss_clip": 0.01046151, + "auxiliary_loss_mlp": 0.01003242, + "balance_loss_clip": 1.02155423, + "balance_loss_mlp": 1.00170732, + "epoch": 0.9558995941680445, + "flos": 53682001171200.0, + "grad_norm": 0.876813082878347, + "language_loss": 0.52364945, + "learning_rate": 2.033948443656652e-08, + "loss": 0.54414338, + "num_input_tokens_seen": 342958155, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.01535034, + "step": 15899, + "time_per_iteration": 3.026768922805786 + }, + { + "auxiliary_loss_clip": 0.01122117, + "auxiliary_loss_mlp": 0.01031116, + "balance_loss_clip": 1.0456326, + "balance_loss_mlp": 1.0176816, + "epoch": 0.9559597174207125, + "flos": 13764488376960.0, + "grad_norm": 2.390029046495402, + "language_loss": 0.68509793, + "learning_rate": 2.028411968062782e-08, + "loss": 0.70663029, + "num_input_tokens_seen": 342972500, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.13439941, + "step": 15900, + "time_per_iteration": 3.824904680252075 + }, + { + "auxiliary_loss_clip": 0.01119213, + "auxiliary_loss_mlp": 0.01028454, + "balance_loss_clip": 1.04516816, + "balance_loss_mlp": 1.01652169, + "epoch": 0.9560198406733804, + "flos": 19936347799680.0, + "grad_norm": 2.1249317591446637, + "language_loss": 0.82906955, + "learning_rate": 2.0228829995713627e-08, + "loss": 0.85054624, + "num_input_tokens_seen": 342989035, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11932373, + "step": 15901, + "time_per_iteration": 2.394632577896118 + }, + { + "auxiliary_loss_clip": 0.01032887, + "auxiliary_loss_mlp": 0.01002736, + "balance_loss_clip": 1.00850821, + "balance_loss_mlp": 1.00146663, + "epoch": 0.9560799639260484, + "flos": 57289550699520.0, + "grad_norm": 0.7057624440178173, + "language_loss": 0.54314101, + "learning_rate": 2.0173615383920485e-08, + "loss": 0.56349725, + "num_input_tokens_seen": 343051675, + "router_z_loss_clip": 0.24365234, + "router_z_loss_mlp": 0.01269531, + "step": 15902, + "time_per_iteration": 3.171741008758545 + }, + { + "auxiliary_loss_clip": 0.0110599, + "auxiliary_loss_mlp": 0.0103741, + "balance_loss_clip": 1.03786755, + "balance_loss_mlp": 1.02537012, + "epoch": 0.9561400871787163, + "flos": 18917167299840.0, + "grad_norm": 1.8590931173468914, + "language_loss": 0.85504133, + "learning_rate": 2.01184758473425e-08, + "loss": 0.87647533, + "num_input_tokens_seen": 343068895, + "router_z_loss_clip": 0.68066406, + "router_z_loss_mlp": 0.1204834, + "step": 15903, + "time_per_iteration": 2.4220330715179443 + }, + { + "auxiliary_loss_clip": 0.01118059, + "auxiliary_loss_mlp": 0.01024638, + "balance_loss_clip": 1.04541051, + "balance_loss_mlp": 1.01395082, + "epoch": 0.9562002104313844, + "flos": 18038576632320.0, + "grad_norm": 1.8757121080476218, + "language_loss": 0.80313206, + "learning_rate": 2.0063411388070217e-08, + "loss": 0.82455909, + "num_input_tokens_seen": 343087115, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.10681152, + "step": 15904, + "time_per_iteration": 2.416250228881836 + }, + { + "auxiliary_loss_clip": 0.01115589, + "auxiliary_loss_mlp": 0.01037817, + "balance_loss_clip": 1.03963232, + "balance_loss_mlp": 1.02411938, + "epoch": 0.9562603336840523, + "flos": 24717673964160.0, + "grad_norm": 2.759931301697979, + "language_loss": 0.59916687, + "learning_rate": 2.0008422008191972e-08, + "loss": 0.6207009, + "num_input_tokens_seen": 343105575, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.13677979, + "step": 15905, + "time_per_iteration": 2.441035032272339 + }, + { + "auxiliary_loss_clip": 0.01111812, + "auxiliary_loss_mlp": 0.01033219, + "balance_loss_clip": 1.04010761, + "balance_loss_mlp": 1.01996887, + "epoch": 0.9563204569367203, + "flos": 21177205084800.0, + "grad_norm": 1.9065162356372505, + "language_loss": 0.70631593, + "learning_rate": 1.995350770979254e-08, + "loss": 0.72776628, + "num_input_tokens_seen": 343123025, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.13256836, + "step": 15906, + "time_per_iteration": 2.4439985752105713 + }, + { + "auxiliary_loss_clip": 0.01123329, + "auxiliary_loss_mlp": 0.01028481, + "balance_loss_clip": 1.04524541, + "balance_loss_mlp": 1.01596427, + "epoch": 0.9563805801893882, + "flos": 20229738088320.0, + "grad_norm": 1.6813991430186257, + "language_loss": 0.71065402, + "learning_rate": 1.9898668494954473e-08, + "loss": 0.73217213, + "num_input_tokens_seen": 343141625, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.12506104, + "step": 15907, + "time_per_iteration": 2.4370815753936768 + }, + { + "auxiliary_loss_clip": 0.01113285, + "auxiliary_loss_mlp": 0.01030047, + "balance_loss_clip": 1.04244089, + "balance_loss_mlp": 1.01878166, + "epoch": 0.9564407034420562, + "flos": 25411001258880.0, + "grad_norm": 1.9974289480901923, + "language_loss": 0.7013613, + "learning_rate": 1.9843904365757447e-08, + "loss": 0.72279453, + "num_input_tokens_seen": 343161300, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11273193, + "step": 15908, + "time_per_iteration": 2.494288444519043 + }, + { + "auxiliary_loss_clip": 0.01117219, + "auxiliary_loss_mlp": 0.01031151, + "balance_loss_clip": 1.0451653, + "balance_loss_mlp": 1.01940298, + "epoch": 0.9565008266947241, + "flos": 18623884752000.0, + "grad_norm": 1.8054692573931541, + "language_loss": 0.83222318, + "learning_rate": 1.978921532427802e-08, + "loss": 0.85370684, + "num_input_tokens_seen": 343177815, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.11755371, + "step": 15909, + "time_per_iteration": 2.4395320415496826 + }, + { + "auxiliary_loss_clip": 0.0111174, + "auxiliary_loss_mlp": 0.01028232, + "balance_loss_clip": 1.03815973, + "balance_loss_mlp": 1.01660335, + "epoch": 0.9565609499473922, + "flos": 24862142465280.0, + "grad_norm": 1.8447736369713617, + "language_loss": 0.67866087, + "learning_rate": 1.9734601372590086e-08, + "loss": 0.70006055, + "num_input_tokens_seen": 343198140, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11639404, + "step": 15910, + "time_per_iteration": 2.506863594055176 + }, + { + "auxiliary_loss_clip": 0.01123211, + "auxiliary_loss_mlp": 0.01034202, + "balance_loss_clip": 1.04754961, + "balance_loss_mlp": 1.02275765, + "epoch": 0.9566210732000601, + "flos": 21798459740160.0, + "grad_norm": 1.812910823956587, + "language_loss": 0.74436486, + "learning_rate": 1.968006251276444e-08, + "loss": 0.765939, + "num_input_tokens_seen": 343218280, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.11450195, + "step": 15911, + "time_per_iteration": 2.4566869735717773 + }, + { + "auxiliary_loss_clip": 0.01119579, + "auxiliary_loss_mlp": 0.01027947, + "balance_loss_clip": 1.04749751, + "balance_loss_mlp": 1.01691437, + "epoch": 0.9566811964527281, + "flos": 18697609416960.0, + "grad_norm": 1.6625504498109331, + "language_loss": 0.69381529, + "learning_rate": 1.9625598746869198e-08, + "loss": 0.71529055, + "num_input_tokens_seen": 343236850, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.1104126, + "step": 15912, + "time_per_iteration": 2.5583996772766113 + }, + { + "auxiliary_loss_clip": 0.01117614, + "auxiliary_loss_mlp": 0.0103435, + "balance_loss_clip": 1.04341912, + "balance_loss_mlp": 1.02225041, + "epoch": 0.9567413197053961, + "flos": 13000632727680.0, + "grad_norm": 2.6754086925708065, + "language_loss": 0.72532642, + "learning_rate": 1.95712100769696e-08, + "loss": 0.74684608, + "num_input_tokens_seen": 343253065, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.12103271, + "step": 15913, + "time_per_iteration": 2.452643871307373 + }, + { + "auxiliary_loss_clip": 0.01115793, + "auxiliary_loss_mlp": 0.01026009, + "balance_loss_clip": 1.04561424, + "balance_loss_mlp": 1.01513124, + "epoch": 0.956801442958064, + "flos": 19719267955200.0, + "grad_norm": 1.9255333665911682, + "language_loss": 0.73492831, + "learning_rate": 1.9516896505128444e-08, + "loss": 0.75634634, + "num_input_tokens_seen": 343270330, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.10876465, + "step": 15914, + "time_per_iteration": 2.5033445358276367 + }, + { + "auxiliary_loss_clip": 0.0110969, + "auxiliary_loss_mlp": 0.01026214, + "balance_loss_clip": 1.03850114, + "balance_loss_mlp": 1.01503849, + "epoch": 0.956861566210732, + "flos": 18222834424320.0, + "grad_norm": 1.605273062507106, + "language_loss": 0.6718933, + "learning_rate": 1.9462658033404965e-08, + "loss": 0.69325233, + "num_input_tokens_seen": 343289625, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.11169434, + "step": 15915, + "time_per_iteration": 2.5088253021240234 + }, + { + "auxiliary_loss_clip": 0.01113744, + "auxiliary_loss_mlp": 0.01029433, + "balance_loss_clip": 1.04212773, + "balance_loss_mlp": 1.01782238, + "epoch": 0.9569216894634, + "flos": 22196960202240.0, + "grad_norm": 1.7745400398041913, + "language_loss": 0.64428157, + "learning_rate": 1.9408494663855967e-08, + "loss": 0.66571331, + "num_input_tokens_seen": 343309200, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11627197, + "step": 15916, + "time_per_iteration": 2.4352309703826904 + }, + { + "auxiliary_loss_clip": 0.0110999, + "auxiliary_loss_mlp": 0.01027873, + "balance_loss_clip": 1.04408801, + "balance_loss_mlp": 1.0171504, + "epoch": 0.956981812716068, + "flos": 21689291329920.0, + "grad_norm": 2.190137871815911, + "language_loss": 0.80939209, + "learning_rate": 1.935440639853536e-08, + "loss": 0.83077073, + "num_input_tokens_seen": 343326270, + "router_z_loss_clip": 0.65869141, + "router_z_loss_mlp": 0.10717773, + "step": 15917, + "time_per_iteration": 2.4407925605773926 + }, + { + "auxiliary_loss_clip": 0.01118235, + "auxiliary_loss_mlp": 0.01033505, + "balance_loss_clip": 1.0479157, + "balance_loss_mlp": 1.02204919, + "epoch": 0.9570419359687359, + "flos": 13990905757440.0, + "grad_norm": 2.44694589084869, + "language_loss": 0.73153353, + "learning_rate": 1.9300393239494172e-08, + "loss": 0.75305092, + "num_input_tokens_seen": 343344430, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.11474609, + "step": 15918, + "time_per_iteration": 2.4396183490753174 + }, + { + "auxiliary_loss_clip": 0.01051338, + "auxiliary_loss_mlp": 0.01003485, + "balance_loss_clip": 1.02687049, + "balance_loss_mlp": 1.00218105, + "epoch": 0.9571020592214039, + "flos": 65196938534400.0, + "grad_norm": 0.6283915372019796, + "language_loss": 0.53120607, + "learning_rate": 1.924645518878032e-08, + "loss": 0.5517543, + "num_input_tokens_seen": 343416155, + "router_z_loss_clip": 0.24462891, + "router_z_loss_mlp": 0.01303101, + "step": 15919, + "time_per_iteration": 3.2114171981811523 + }, + { + "auxiliary_loss_clip": 0.01116803, + "auxiliary_loss_mlp": 0.01037598, + "balance_loss_clip": 1.04315972, + "balance_loss_mlp": 1.02371573, + "epoch": 0.9571621824740718, + "flos": 17384068961280.0, + "grad_norm": 2.8724324038227445, + "language_loss": 0.75238967, + "learning_rate": 1.919259224843972e-08, + "loss": 0.77393365, + "num_input_tokens_seen": 343431715, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.13885498, + "step": 15920, + "time_per_iteration": 2.417696714401245 + }, + { + "auxiliary_loss_clip": 0.0112286, + "auxiliary_loss_mlp": 0.01031862, + "balance_loss_clip": 1.04759836, + "balance_loss_mlp": 1.01910686, + "epoch": 0.9572223057267398, + "flos": 14538184352640.0, + "grad_norm": 1.8438884985094053, + "language_loss": 0.78851783, + "learning_rate": 1.9138804420514298e-08, + "loss": 0.81006515, + "num_input_tokens_seen": 343450425, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.12744141, + "step": 15921, + "time_per_iteration": 3.867800712585449 + }, + { + "auxiliary_loss_clip": 0.01115775, + "auxiliary_loss_mlp": 0.01030224, + "balance_loss_clip": 1.03831935, + "balance_loss_mlp": 1.01718235, + "epoch": 0.9572824289794077, + "flos": 33947793158400.0, + "grad_norm": 2.053504527607409, + "language_loss": 0.51323909, + "learning_rate": 1.9085091707044197e-08, + "loss": 0.53469902, + "num_input_tokens_seen": 343470445, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.13043213, + "step": 15922, + "time_per_iteration": 2.5218284130096436 + }, + { + "auxiliary_loss_clip": 0.01113324, + "auxiliary_loss_mlp": 0.0103029, + "balance_loss_clip": 1.04055667, + "balance_loss_mlp": 1.01850009, + "epoch": 0.9573425522320758, + "flos": 18694915896960.0, + "grad_norm": 2.033610880431809, + "language_loss": 0.83434331, + "learning_rate": 1.903145411006557e-08, + "loss": 0.85577947, + "num_input_tokens_seen": 343485200, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11798096, + "step": 15923, + "time_per_iteration": 2.4094412326812744 + }, + { + "auxiliary_loss_clip": 0.01115181, + "auxiliary_loss_mlp": 0.01032852, + "balance_loss_clip": 1.04067945, + "balance_loss_mlp": 1.02186716, + "epoch": 0.9574026754847437, + "flos": 28510307297280.0, + "grad_norm": 1.6980889051603545, + "language_loss": 0.75422692, + "learning_rate": 1.8977891631613008e-08, + "loss": 0.77570724, + "num_input_tokens_seen": 343505080, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.10992432, + "step": 15924, + "time_per_iteration": 2.480508804321289 + }, + { + "auxiliary_loss_clip": 0.01119478, + "auxiliary_loss_mlp": 0.01033485, + "balance_loss_clip": 1.0464319, + "balance_loss_mlp": 1.02123618, + "epoch": 0.9574627987374117, + "flos": 24352390604160.0, + "grad_norm": 2.0531817061376847, + "language_loss": 0.85794127, + "learning_rate": 1.892440427371711e-08, + "loss": 0.87947088, + "num_input_tokens_seen": 343523995, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.12261963, + "step": 15925, + "time_per_iteration": 2.466005563735962 + }, + { + "auxiliary_loss_clip": 0.01115854, + "auxiliary_loss_mlp": 0.01038525, + "balance_loss_clip": 1.03995454, + "balance_loss_mlp": 1.02405834, + "epoch": 0.9575229219900797, + "flos": 23510680225920.0, + "grad_norm": 1.7280396448243835, + "language_loss": 0.75818384, + "learning_rate": 1.8870992038406474e-08, + "loss": 0.77972758, + "num_input_tokens_seen": 343542015, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.14471436, + "step": 15926, + "time_per_iteration": 2.666165590286255 + }, + { + "auxiliary_loss_clip": 0.01120066, + "auxiliary_loss_mlp": 0.01029909, + "balance_loss_clip": 1.04389095, + "balance_loss_mlp": 1.01888204, + "epoch": 0.9575830452427476, + "flos": 22674823764480.0, + "grad_norm": 1.6597347927271193, + "language_loss": 0.77831197, + "learning_rate": 1.8817654927706373e-08, + "loss": 0.79981172, + "num_input_tokens_seen": 343561680, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.11035156, + "step": 15927, + "time_per_iteration": 2.536299705505371 + }, + { + "auxiliary_loss_clip": 0.01117426, + "auxiliary_loss_mlp": 0.0103361, + "balance_loss_clip": 1.04282665, + "balance_loss_mlp": 1.0201869, + "epoch": 0.9576431684954156, + "flos": 30485250835200.0, + "grad_norm": 1.72402972922145, + "language_loss": 0.6897943, + "learning_rate": 1.8764392943639183e-08, + "loss": 0.71130466, + "num_input_tokens_seen": 343585290, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.13415527, + "step": 15928, + "time_per_iteration": 2.543009042739868 + }, + { + "auxiliary_loss_clip": 0.01114706, + "auxiliary_loss_mlp": 0.01033064, + "balance_loss_clip": 1.04095685, + "balance_loss_mlp": 1.02089894, + "epoch": 0.9577032917480836, + "flos": 21687387909120.0, + "grad_norm": 1.7577622724787472, + "language_loss": 0.82129908, + "learning_rate": 1.871120608822485e-08, + "loss": 0.84277678, + "num_input_tokens_seen": 343604045, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.12158203, + "step": 15929, + "time_per_iteration": 2.4338057041168213 + }, + { + "auxiliary_loss_clip": 0.01120371, + "auxiliary_loss_mlp": 0.01038128, + "balance_loss_clip": 1.04391026, + "balance_loss_mlp": 1.02616525, + "epoch": 0.9577634150007516, + "flos": 29023147728000.0, + "grad_norm": 1.6961471680142453, + "language_loss": 0.72425586, + "learning_rate": 1.8658094363480202e-08, + "loss": 0.74584085, + "num_input_tokens_seen": 343626595, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.11968994, + "step": 15930, + "time_per_iteration": 2.48923397064209 + }, + { + "auxiliary_loss_clip": 0.01112527, + "auxiliary_loss_mlp": 0.01027333, + "balance_loss_clip": 1.0423491, + "balance_loss_mlp": 1.01628804, + "epoch": 0.9578235382534195, + "flos": 19282235178240.0, + "grad_norm": 1.4874983987059733, + "language_loss": 0.62295055, + "learning_rate": 1.8605057771419185e-08, + "loss": 0.64434916, + "num_input_tokens_seen": 343646195, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.11047363, + "step": 15931, + "time_per_iteration": 3.8509438037872314 + }, + { + "auxiliary_loss_clip": 0.01116568, + "auxiliary_loss_mlp": 0.01025126, + "balance_loss_clip": 1.04680979, + "balance_loss_mlp": 1.01474857, + "epoch": 0.9578836615060875, + "flos": 13699275235200.0, + "grad_norm": 1.7956723191948145, + "language_loss": 0.69107878, + "learning_rate": 1.8552096314052633e-08, + "loss": 0.71249574, + "num_input_tokens_seen": 343663665, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.10375977, + "step": 15932, + "time_per_iteration": 2.416409492492676 + }, + { + "auxiliary_loss_clip": 0.01115968, + "auxiliary_loss_mlp": 0.01034622, + "balance_loss_clip": 1.04095376, + "balance_loss_mlp": 1.02166963, + "epoch": 0.9579437847587554, + "flos": 17054516655360.0, + "grad_norm": 2.1550904240167803, + "language_loss": 0.75418484, + "learning_rate": 1.849920999338961e-08, + "loss": 0.77569067, + "num_input_tokens_seen": 343682145, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.12945557, + "step": 15933, + "time_per_iteration": 2.4354724884033203 + }, + { + "auxiliary_loss_clip": 0.01050133, + "auxiliary_loss_mlp": 0.01002507, + "balance_loss_clip": 1.02526999, + "balance_loss_mlp": 1.00122881, + "epoch": 0.9580039080114234, + "flos": 60570887886720.0, + "grad_norm": 0.7036829693653629, + "language_loss": 0.57249498, + "learning_rate": 1.8446398811434948e-08, + "loss": 0.59302139, + "num_input_tokens_seen": 343744685, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.01278687, + "step": 15934, + "time_per_iteration": 3.17767333984375 + }, + { + "auxiliary_loss_clip": 0.01043291, + "auxiliary_loss_mlp": 0.01003312, + "balance_loss_clip": 1.01853824, + "balance_loss_mlp": 1.00191474, + "epoch": 0.9580640312640913, + "flos": 66235365745920.0, + "grad_norm": 0.9028330626185469, + "language_loss": 0.65939856, + "learning_rate": 1.8393662770191277e-08, + "loss": 0.67986459, + "num_input_tokens_seen": 343801835, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.01396179, + "step": 15935, + "time_per_iteration": 4.4126176834106445 + }, + { + "auxiliary_loss_clip": 0.01047174, + "auxiliary_loss_mlp": 0.01001471, + "balance_loss_clip": 1.02107167, + "balance_loss_mlp": 1.00007367, + "epoch": 0.9581241545167594, + "flos": 62218002971520.0, + "grad_norm": 0.7788895276476921, + "language_loss": 0.56974488, + "learning_rate": 1.8341001871658546e-08, + "loss": 0.59023136, + "num_input_tokens_seen": 343861515, + "router_z_loss_clip": 0.26123047, + "router_z_loss_mlp": 0.01396179, + "step": 15936, + "time_per_iteration": 3.0393126010894775 + }, + { + "auxiliary_loss_clip": 0.01118342, + "auxiliary_loss_mlp": 0.01029567, + "balance_loss_clip": 1.04480767, + "balance_loss_mlp": 1.01699591, + "epoch": 0.9581842777694273, + "flos": 23768088065280.0, + "grad_norm": 1.5821595211226225, + "language_loss": 0.78434658, + "learning_rate": 1.8288416117833825e-08, + "loss": 0.80582571, + "num_input_tokens_seen": 343881240, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.12567139, + "step": 15937, + "time_per_iteration": 2.544719696044922 + }, + { + "auxiliary_loss_clip": 0.01129854, + "auxiliary_loss_mlp": 0.01027089, + "balance_loss_clip": 1.05519533, + "balance_loss_mlp": 1.01563287, + "epoch": 0.9582444010220953, + "flos": 21213079793280.0, + "grad_norm": 2.1747625315847943, + "language_loss": 0.68282753, + "learning_rate": 1.8235905510710636e-08, + "loss": 0.70439696, + "num_input_tokens_seen": 343900885, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11450195, + "step": 15938, + "time_per_iteration": 2.4912843704223633 + }, + { + "auxiliary_loss_clip": 0.01110525, + "auxiliary_loss_mlp": 0.01026935, + "balance_loss_clip": 1.03907168, + "balance_loss_mlp": 1.0151571, + "epoch": 0.9583045242747633, + "flos": 23805147922560.0, + "grad_norm": 3.413225616647145, + "language_loss": 0.65709352, + "learning_rate": 1.8183470052280712e-08, + "loss": 0.67846811, + "num_input_tokens_seen": 343918460, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.11785889, + "step": 15939, + "time_per_iteration": 2.528813600540161 + }, + { + "auxiliary_loss_clip": 0.01112494, + "auxiliary_loss_mlp": 0.01032322, + "balance_loss_clip": 1.04200363, + "balance_loss_mlp": 1.02121162, + "epoch": 0.9583646475274312, + "flos": 24131468004480.0, + "grad_norm": 1.5939117191669918, + "language_loss": 0.73566973, + "learning_rate": 1.8131109744532025e-08, + "loss": 0.75711787, + "num_input_tokens_seen": 343938030, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.11108398, + "step": 15940, + "time_per_iteration": 2.4933385848999023 + }, + { + "auxiliary_loss_clip": 0.0111604, + "auxiliary_loss_mlp": 0.01032627, + "balance_loss_clip": 1.04185462, + "balance_loss_mlp": 1.01913881, + "epoch": 0.9584247707800992, + "flos": 20886651970560.0, + "grad_norm": 9.134283160293252, + "language_loss": 0.72648752, + "learning_rate": 1.8078824589450535e-08, + "loss": 0.74797416, + "num_input_tokens_seen": 343956635, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.13476562, + "step": 15941, + "time_per_iteration": 2.4556589126586914 + }, + { + "auxiliary_loss_clip": 0.01116902, + "auxiliary_loss_mlp": 0.01032175, + "balance_loss_clip": 1.04424596, + "balance_loss_mlp": 1.02070093, + "epoch": 0.9584848940327672, + "flos": 26067591918720.0, + "grad_norm": 1.6400022021934986, + "language_loss": 0.70821929, + "learning_rate": 1.8026614589018442e-08, + "loss": 0.72971004, + "num_input_tokens_seen": 343976625, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11480713, + "step": 15942, + "time_per_iteration": 2.5826125144958496 + }, + { + "auxiliary_loss_clip": 0.01118104, + "auxiliary_loss_mlp": 0.01034375, + "balance_loss_clip": 1.0444324, + "balance_loss_mlp": 1.02140522, + "epoch": 0.9585450172854352, + "flos": 34492988764800.0, + "grad_norm": 1.4879564472899998, + "language_loss": 0.72069329, + "learning_rate": 1.797447974521571e-08, + "loss": 0.74221802, + "num_input_tokens_seen": 343997790, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.12963867, + "step": 15943, + "time_per_iteration": 2.6063225269317627 + }, + { + "auxiliary_loss_clip": 0.01134156, + "auxiliary_loss_mlp": 0.01033753, + "balance_loss_clip": 1.05558467, + "balance_loss_mlp": 1.0207473, + "epoch": 0.9586051405381031, + "flos": 23110743219840.0, + "grad_norm": 1.847710550354498, + "language_loss": 0.68446004, + "learning_rate": 1.792242006001965e-08, + "loss": 0.70613909, + "num_input_tokens_seen": 344016935, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.13000488, + "step": 15944, + "time_per_iteration": 3.9425442218780518 + }, + { + "auxiliary_loss_clip": 0.01117531, + "auxiliary_loss_mlp": 0.01034494, + "balance_loss_clip": 1.04413724, + "balance_loss_mlp": 1.02224493, + "epoch": 0.9586652637907711, + "flos": 19603994232960.0, + "grad_norm": 1.6329813040181058, + "language_loss": 0.66265738, + "learning_rate": 1.7870435535403795e-08, + "loss": 0.68417764, + "num_input_tokens_seen": 344035590, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.12243652, + "step": 15945, + "time_per_iteration": 2.448391914367676 + }, + { + "auxiliary_loss_clip": 0.0104429, + "auxiliary_loss_mlp": 0.0100203, + "balance_loss_clip": 1.0192368, + "balance_loss_mlp": 1.00067365, + "epoch": 0.958725387043439, + "flos": 72073327317120.0, + "grad_norm": 0.7376955693261529, + "language_loss": 0.61859035, + "learning_rate": 1.7818526173339678e-08, + "loss": 0.63905358, + "num_input_tokens_seen": 344100845, + "router_z_loss_clip": 0.25073242, + "router_z_loss_mlp": 0.01356506, + "step": 15946, + "time_per_iteration": 3.117884874343872 + }, + { + "auxiliary_loss_clip": 0.01110995, + "auxiliary_loss_mlp": 0.01027625, + "balance_loss_clip": 1.04178166, + "balance_loss_mlp": 1.01686037, + "epoch": 0.958785510296107, + "flos": 28911932242560.0, + "grad_norm": 1.6304104820743772, + "language_loss": 0.75581622, + "learning_rate": 1.7766691975795723e-08, + "loss": 0.77720237, + "num_input_tokens_seen": 344121780, + "router_z_loss_clip": 0.69189453, + "router_z_loss_mlp": 0.10760498, + "step": 15947, + "time_per_iteration": 2.495349168777466 + }, + { + "auxiliary_loss_clip": 0.01107959, + "auxiliary_loss_mlp": 0.01031058, + "balance_loss_clip": 1.03637588, + "balance_loss_mlp": 1.0196383, + "epoch": 0.958845633548775, + "flos": 18477189607680.0, + "grad_norm": 2.346423462093721, + "language_loss": 0.69951868, + "learning_rate": 1.771493294473747e-08, + "loss": 0.72090888, + "num_input_tokens_seen": 344140150, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11413574, + "step": 15948, + "time_per_iteration": 2.4714553356170654 + }, + { + "auxiliary_loss_clip": 0.01112529, + "auxiliary_loss_mlp": 0.01027391, + "balance_loss_clip": 1.04089332, + "balance_loss_mlp": 1.01631618, + "epoch": 0.958905756801443, + "flos": 24206916522240.0, + "grad_norm": 1.8733672127044183, + "language_loss": 0.78695273, + "learning_rate": 1.7663249082127574e-08, + "loss": 0.80835193, + "num_input_tokens_seen": 344158200, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11077881, + "step": 15949, + "time_per_iteration": 2.454909324645996 + }, + { + "auxiliary_loss_clip": 0.0111352, + "auxiliary_loss_mlp": 0.01030091, + "balance_loss_clip": 1.04038787, + "balance_loss_mlp": 1.01789033, + "epoch": 0.9589658800541109, + "flos": 25007939769600.0, + "grad_norm": 1.8828581559371949, + "language_loss": 0.68443906, + "learning_rate": 1.761164038992602e-08, + "loss": 0.70587516, + "num_input_tokens_seen": 344174720, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.12194824, + "step": 15950, + "time_per_iteration": 2.4550998210906982 + }, + { + "auxiliary_loss_clip": 0.01116159, + "auxiliary_loss_mlp": 0.01034811, + "balance_loss_clip": 1.04219961, + "balance_loss_mlp": 1.02242517, + "epoch": 0.9590260033067789, + "flos": 23514558894720.0, + "grad_norm": 1.7147233586447546, + "language_loss": 0.86081886, + "learning_rate": 1.7560106870089687e-08, + "loss": 0.88232851, + "num_input_tokens_seen": 344192580, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.12384033, + "step": 15951, + "time_per_iteration": 2.4765055179595947 + }, + { + "auxiliary_loss_clip": 0.01125399, + "auxiliary_loss_mlp": 0.01033478, + "balance_loss_clip": 1.04985499, + "balance_loss_mlp": 1.02144384, + "epoch": 0.9590861265594469, + "flos": 25520349237120.0, + "grad_norm": 2.151835264920985, + "language_loss": 0.79530537, + "learning_rate": 1.7508648524572568e-08, + "loss": 0.81689417, + "num_input_tokens_seen": 344210345, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12036133, + "step": 15952, + "time_per_iteration": 2.4521546363830566 + }, + { + "auxiliary_loss_clip": 0.01115614, + "auxiliary_loss_mlp": 0.01030875, + "balance_loss_clip": 1.04360533, + "balance_loss_mlp": 1.01857865, + "epoch": 0.9591462498121148, + "flos": 21179323987200.0, + "grad_norm": 2.3909862845596197, + "language_loss": 0.6983633, + "learning_rate": 1.7457265355326434e-08, + "loss": 0.71982813, + "num_input_tokens_seen": 344229540, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.12298584, + "step": 15953, + "time_per_iteration": 2.4551327228546143 + }, + { + "auxiliary_loss_clip": 0.01119247, + "auxiliary_loss_mlp": 0.01030127, + "balance_loss_clip": 1.04504919, + "balance_loss_mlp": 1.01724637, + "epoch": 0.9592063730647828, + "flos": 21723047136000.0, + "grad_norm": 2.9790910065827947, + "language_loss": 0.58276415, + "learning_rate": 1.7405957364299285e-08, + "loss": 0.60425788, + "num_input_tokens_seen": 344247830, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.12884521, + "step": 15954, + "time_per_iteration": 2.4769721031188965 + }, + { + "auxiliary_loss_clip": 0.0112058, + "auxiliary_loss_mlp": 0.01033623, + "balance_loss_clip": 1.04575646, + "balance_loss_mlp": 1.02042627, + "epoch": 0.9592664963174508, + "flos": 29891395278720.0, + "grad_norm": 2.004229517728058, + "language_loss": 0.73618323, + "learning_rate": 1.7354724553437117e-08, + "loss": 0.75772524, + "num_input_tokens_seen": 344267760, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.13201904, + "step": 15955, + "time_per_iteration": 2.5205607414245605 + }, + { + "auxiliary_loss_clip": 0.01117866, + "auxiliary_loss_mlp": 0.01033676, + "balance_loss_clip": 1.04580855, + "balance_loss_mlp": 1.02148724, + "epoch": 0.9593266195701188, + "flos": 17999613354240.0, + "grad_norm": 1.8659122457211925, + "language_loss": 0.62289417, + "learning_rate": 1.7303566924682378e-08, + "loss": 0.6444096, + "num_input_tokens_seen": 344284905, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.12188721, + "step": 15956, + "time_per_iteration": 2.4197845458984375 + }, + { + "auxiliary_loss_clip": 0.01112146, + "auxiliary_loss_mlp": 0.01030478, + "balance_loss_clip": 1.04091346, + "balance_loss_mlp": 1.01874781, + "epoch": 0.9593867428227867, + "flos": 18838271076480.0, + "grad_norm": 1.8714820171250317, + "language_loss": 0.60168058, + "learning_rate": 1.725248447997507e-08, + "loss": 0.62310684, + "num_input_tokens_seen": 344302025, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.11724854, + "step": 15957, + "time_per_iteration": 2.4219272136688232 + }, + { + "auxiliary_loss_clip": 0.01117613, + "auxiliary_loss_mlp": 0.01035814, + "balance_loss_clip": 1.04546857, + "balance_loss_mlp": 1.02348828, + "epoch": 0.9594468660754547, + "flos": 29567050444800.0, + "grad_norm": 1.9692492381740918, + "language_loss": 0.7421211, + "learning_rate": 1.7201477221252314e-08, + "loss": 0.76365542, + "num_input_tokens_seen": 344321935, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.12335205, + "step": 15958, + "time_per_iteration": 2.518312454223633 + }, + { + "auxiliary_loss_clip": 0.0111339, + "auxiliary_loss_mlp": 0.01029252, + "balance_loss_clip": 1.04196286, + "balance_loss_mlp": 1.01653194, + "epoch": 0.9595069893281226, + "flos": 20703256104960.0, + "grad_norm": 1.898245873572159, + "language_loss": 0.74673128, + "learning_rate": 1.7150545150448116e-08, + "loss": 0.76815772, + "num_input_tokens_seen": 344340405, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.12719727, + "step": 15959, + "time_per_iteration": 2.4522838592529297 + }, + { + "auxiliary_loss_clip": 0.0111432, + "auxiliary_loss_mlp": 0.0102709, + "balance_loss_clip": 1.04089296, + "balance_loss_mlp": 1.01559854, + "epoch": 0.9595671125807906, + "flos": 22453613856000.0, + "grad_norm": 2.082703900084245, + "language_loss": 0.6512748, + "learning_rate": 1.7099688269493816e-08, + "loss": 0.6726889, + "num_input_tokens_seen": 344359925, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11499023, + "step": 15960, + "time_per_iteration": 2.416999340057373 + }, + { + "auxiliary_loss_clip": 0.01110055, + "auxiliary_loss_mlp": 0.01030825, + "balance_loss_clip": 1.04084563, + "balance_loss_mlp": 1.01920819, + "epoch": 0.9596272358334585, + "flos": 23915214172800.0, + "grad_norm": 1.7024322818569477, + "language_loss": 0.78103817, + "learning_rate": 1.7048906580318544e-08, + "loss": 0.80244696, + "num_input_tokens_seen": 344379100, + "router_z_loss_clip": 0.69189453, + "router_z_loss_mlp": 0.1161499, + "step": 15961, + "time_per_iteration": 2.4919416904449463 + }, + { + "auxiliary_loss_clip": 0.01110629, + "auxiliary_loss_mlp": 0.01025388, + "balance_loss_clip": 1.0405817, + "balance_loss_mlp": 1.01403964, + "epoch": 0.9596873590861266, + "flos": 17672539086720.0, + "grad_norm": 2.2751285053184, + "language_loss": 0.76107752, + "learning_rate": 1.699820008484698e-08, + "loss": 0.78243768, + "num_input_tokens_seen": 344396895, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.11358643, + "step": 15962, + "time_per_iteration": 2.432258129119873 + }, + { + "auxiliary_loss_clip": 0.01125766, + "auxiliary_loss_mlp": 0.0103377, + "balance_loss_clip": 1.0510236, + "balance_loss_mlp": 1.0211581, + "epoch": 0.9597474823387945, + "flos": 25808532053760.0, + "grad_norm": 2.0718133220430484, + "language_loss": 0.7157737, + "learning_rate": 1.6947568785002698e-08, + "loss": 0.73736906, + "num_input_tokens_seen": 344415115, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.12615967, + "step": 15963, + "time_per_iteration": 2.4881255626678467 + }, + { + "auxiliary_loss_clip": 0.01112017, + "auxiliary_loss_mlp": 0.01030729, + "balance_loss_clip": 1.04230499, + "balance_loss_mlp": 1.01949334, + "epoch": 0.9598076055914625, + "flos": 23768519028480.0, + "grad_norm": 1.6139756596381207, + "language_loss": 0.74262613, + "learning_rate": 1.689701268270527e-08, + "loss": 0.76405358, + "num_input_tokens_seen": 344435185, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.11230469, + "step": 15964, + "time_per_iteration": 3.9391424655914307 + }, + { + "auxiliary_loss_clip": 0.01048435, + "auxiliary_loss_mlp": 0.01002911, + "balance_loss_clip": 1.0242281, + "balance_loss_mlp": 1.00183046, + "epoch": 0.9598677288441305, + "flos": 56515962464640.0, + "grad_norm": 0.8773997389489154, + "language_loss": 0.57587373, + "learning_rate": 1.684653177987161e-08, + "loss": 0.59638721, + "num_input_tokens_seen": 344488950, + "router_z_loss_clip": 0.24194336, + "router_z_loss_mlp": 0.01081848, + "step": 15965, + "time_per_iteration": 3.0301482677459717 + }, + { + "auxiliary_loss_clip": 0.01113909, + "auxiliary_loss_mlp": 0.01033111, + "balance_loss_clip": 1.0404104, + "balance_loss_mlp": 1.02178669, + "epoch": 0.9599278520967984, + "flos": 22997480659200.0, + "grad_norm": 1.6164388236699563, + "language_loss": 0.78703916, + "learning_rate": 1.6796126078416627e-08, + "loss": 0.80850935, + "num_input_tokens_seen": 344506740, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11328125, + "step": 15966, + "time_per_iteration": 2.463101863861084 + }, + { + "auxiliary_loss_clip": 0.01106205, + "auxiliary_loss_mlp": 0.01029569, + "balance_loss_clip": 1.03581941, + "balance_loss_mlp": 1.01757038, + "epoch": 0.9599879753494664, + "flos": 23039676161280.0, + "grad_norm": 1.5433820326561032, + "language_loss": 0.79486096, + "learning_rate": 1.674579558025102e-08, + "loss": 0.81621873, + "num_input_tokens_seen": 344526670, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.11999512, + "step": 15967, + "time_per_iteration": 2.4327166080474854 + }, + { + "auxiliary_loss_clip": 0.01122356, + "auxiliary_loss_mlp": 0.01029052, + "balance_loss_clip": 1.04800844, + "balance_loss_mlp": 1.01639795, + "epoch": 0.9600480986021344, + "flos": 16392287560320.0, + "grad_norm": 1.9144141336603502, + "language_loss": 0.80362034, + "learning_rate": 1.669554028728348e-08, + "loss": 0.8251344, + "num_input_tokens_seen": 344541995, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.12658691, + "step": 15968, + "time_per_iteration": 2.409214735031128 + }, + { + "auxiliary_loss_clip": 0.0112127, + "auxiliary_loss_mlp": 0.01041443, + "balance_loss_clip": 1.04511762, + "balance_loss_mlp": 1.02686977, + "epoch": 0.9601082218548024, + "flos": 24276439296000.0, + "grad_norm": 2.798780107545006, + "language_loss": 0.67217106, + "learning_rate": 1.6645360201420044e-08, + "loss": 0.69379818, + "num_input_tokens_seen": 344559980, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.14569092, + "step": 15969, + "time_per_iteration": 2.433649778366089 + }, + { + "auxiliary_loss_clip": 0.01108139, + "auxiliary_loss_mlp": 0.01035714, + "balance_loss_clip": 1.03836918, + "balance_loss_mlp": 1.02501464, + "epoch": 0.9601683451074703, + "flos": 19609991804160.0, + "grad_norm": 2.5818769302435403, + "language_loss": 0.79582274, + "learning_rate": 1.6595255324563186e-08, + "loss": 0.81726134, + "num_input_tokens_seen": 344577765, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.10693359, + "step": 15970, + "time_per_iteration": 2.430032730102539 + }, + { + "auxiliary_loss_clip": 0.01116662, + "auxiliary_loss_mlp": 0.01035278, + "balance_loss_clip": 1.04528248, + "balance_loss_mlp": 1.02314878, + "epoch": 0.9602284683601383, + "flos": 26651104358400.0, + "grad_norm": 1.6930340412555243, + "language_loss": 0.77346802, + "learning_rate": 1.654522565861316e-08, + "loss": 0.79498744, + "num_input_tokens_seen": 344597650, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.12127686, + "step": 15971, + "time_per_iteration": 2.4635751247406006 + }, + { + "auxiliary_loss_clip": 0.01116356, + "auxiliary_loss_mlp": 0.01028791, + "balance_loss_clip": 1.03969955, + "balance_loss_mlp": 1.01663733, + "epoch": 0.9602885916128062, + "flos": 15554096714880.0, + "grad_norm": 2.1464554801165603, + "language_loss": 0.67224407, + "learning_rate": 1.64952712054669e-08, + "loss": 0.69369555, + "num_input_tokens_seen": 344613580, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.12139893, + "step": 15972, + "time_per_iteration": 2.424846649169922 + }, + { + "auxiliary_loss_clip": 0.01111185, + "auxiliary_loss_mlp": 0.01036744, + "balance_loss_clip": 1.03846395, + "balance_loss_mlp": 1.02246857, + "epoch": 0.9603487148654742, + "flos": 16502353810560.0, + "grad_norm": 2.464375294796974, + "language_loss": 0.76200187, + "learning_rate": 1.644539196701844e-08, + "loss": 0.78348112, + "num_input_tokens_seen": 344626910, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.1428833, + "step": 15973, + "time_per_iteration": 2.388594150543213 + }, + { + "auxiliary_loss_clip": 0.01120514, + "auxiliary_loss_mlp": 0.01038945, + "balance_loss_clip": 1.0489254, + "balance_loss_mlp": 1.02709532, + "epoch": 0.9604088381181421, + "flos": 20845354308480.0, + "grad_norm": 1.6370007730302243, + "language_loss": 0.69218111, + "learning_rate": 1.639558794515983e-08, + "loss": 0.71377575, + "num_input_tokens_seen": 344644330, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11853027, + "step": 15974, + "time_per_iteration": 2.440887928009033 + }, + { + "auxiliary_loss_clip": 0.01115971, + "auxiliary_loss_mlp": 0.01030624, + "balance_loss_clip": 1.04039788, + "balance_loss_mlp": 1.01833391, + "epoch": 0.9604689613708102, + "flos": 19683105937920.0, + "grad_norm": 1.747593199654058, + "language_loss": 0.68104601, + "learning_rate": 1.6345859141779105e-08, + "loss": 0.70251197, + "num_input_tokens_seen": 344663910, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.1229248, + "step": 15975, + "time_per_iteration": 3.969810962677002 + }, + { + "auxiliary_loss_clip": 0.01106647, + "auxiliary_loss_mlp": 0.01027032, + "balance_loss_clip": 1.03946388, + "balance_loss_mlp": 1.01613081, + "epoch": 0.9605290846234781, + "flos": 24097568544000.0, + "grad_norm": 1.8771185823573975, + "language_loss": 0.55200177, + "learning_rate": 1.6296205558762322e-08, + "loss": 0.57333857, + "num_input_tokens_seen": 344682320, + "router_z_loss_clip": 0.67236328, + "router_z_loss_mlp": 0.10900879, + "step": 15976, + "time_per_iteration": 2.5136868953704834 + }, + { + "auxiliary_loss_clip": 0.01108015, + "auxiliary_loss_mlp": 0.01027012, + "balance_loss_clip": 1.03842008, + "balance_loss_mlp": 1.01552629, + "epoch": 0.9605892078761461, + "flos": 27122575299840.0, + "grad_norm": 2.656789892609146, + "language_loss": 0.68272579, + "learning_rate": 1.624662719799219e-08, + "loss": 0.70407605, + "num_input_tokens_seen": 344701355, + "router_z_loss_clip": 0.69482422, + "router_z_loss_mlp": 0.11486816, + "step": 15977, + "time_per_iteration": 2.501581907272339 + }, + { + "auxiliary_loss_clip": 0.0110789, + "auxiliary_loss_mlp": 0.01036011, + "balance_loss_clip": 1.03684199, + "balance_loss_mlp": 1.02447128, + "epoch": 0.9606493311288141, + "flos": 14136918543360.0, + "grad_norm": 1.7315825224496078, + "language_loss": 0.82319021, + "learning_rate": 1.6197124061348766e-08, + "loss": 0.84462929, + "num_input_tokens_seen": 344717980, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11547852, + "step": 15978, + "time_per_iteration": 2.4213104248046875 + }, + { + "auxiliary_loss_clip": 0.01113674, + "auxiliary_loss_mlp": 0.0103194, + "balance_loss_clip": 1.04020178, + "balance_loss_mlp": 1.01966786, + "epoch": 0.960709454381482, + "flos": 15813336147840.0, + "grad_norm": 2.631133642464773, + "language_loss": 0.83356243, + "learning_rate": 1.614769615070921e-08, + "loss": 0.85501862, + "num_input_tokens_seen": 344733480, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.12280273, + "step": 15979, + "time_per_iteration": 3.837737560272217 + }, + { + "auxiliary_loss_clip": 0.01114971, + "auxiliary_loss_mlp": 0.01034538, + "balance_loss_clip": 1.04120731, + "balance_loss_mlp": 1.02343392, + "epoch": 0.96076957763415, + "flos": 22565403959040.0, + "grad_norm": 1.5193963516251985, + "language_loss": 0.79937565, + "learning_rate": 1.6098343467947805e-08, + "loss": 0.82087076, + "num_input_tokens_seen": 344752130, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11114502, + "step": 15980, + "time_per_iteration": 2.4641506671905518 + }, + { + "auxiliary_loss_clip": 0.0110783, + "auxiliary_loss_mlp": 0.01030568, + "balance_loss_clip": 1.03503704, + "balance_loss_mlp": 1.01765811, + "epoch": 0.960829700886818, + "flos": 24681260551680.0, + "grad_norm": 1.8265879912793423, + "language_loss": 0.68463057, + "learning_rate": 1.6049066014935942e-08, + "loss": 0.70601457, + "num_input_tokens_seen": 344771195, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.12908936, + "step": 15981, + "time_per_iteration": 2.511657953262329 + }, + { + "auxiliary_loss_clip": 0.01111772, + "auxiliary_loss_mlp": 0.0102357, + "balance_loss_clip": 1.04143333, + "balance_loss_mlp": 1.01271629, + "epoch": 0.960889824139486, + "flos": 26542223256960.0, + "grad_norm": 1.452818027652106, + "language_loss": 0.69421589, + "learning_rate": 1.5999863793542344e-08, + "loss": 0.71556938, + "num_input_tokens_seen": 344793150, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.10864258, + "step": 15982, + "time_per_iteration": 2.493608236312866 + }, + { + "auxiliary_loss_clip": 0.01040346, + "auxiliary_loss_mlp": 0.01007692, + "balance_loss_clip": 1.01553857, + "balance_loss_mlp": 1.00634325, + "epoch": 0.9609499473921539, + "flos": 71114942586240.0, + "grad_norm": 0.6701598339214456, + "language_loss": 0.53274059, + "learning_rate": 1.595073680563286e-08, + "loss": 0.55322093, + "num_input_tokens_seen": 344852855, + "router_z_loss_clip": 0.24829102, + "router_z_loss_mlp": 0.01348877, + "step": 15983, + "time_per_iteration": 3.1713457107543945 + }, + { + "auxiliary_loss_clip": 0.01117863, + "auxiliary_loss_mlp": 0.01035836, + "balance_loss_clip": 1.04227614, + "balance_loss_mlp": 1.02345037, + "epoch": 0.9610100706448219, + "flos": 20552466810240.0, + "grad_norm": 2.146383298738071, + "language_loss": 0.68059653, + "learning_rate": 1.5901685053070212e-08, + "loss": 0.70213354, + "num_input_tokens_seen": 344869830, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.12390137, + "step": 15984, + "time_per_iteration": 2.481206178665161 + }, + { + "auxiliary_loss_clip": 0.01108232, + "auxiliary_loss_mlp": 0.01030022, + "balance_loss_clip": 1.04006171, + "balance_loss_mlp": 1.01961517, + "epoch": 0.9610701938974898, + "flos": 14064199459200.0, + "grad_norm": 1.7106156285072966, + "language_loss": 0.67485905, + "learning_rate": 1.5852708537714477e-08, + "loss": 0.69624162, + "num_input_tokens_seen": 344888905, + "router_z_loss_clip": 0.68212891, + "router_z_loss_mlp": 0.10412598, + "step": 15985, + "time_per_iteration": 2.411231279373169 + }, + { + "auxiliary_loss_clip": 0.01119593, + "auxiliary_loss_mlp": 0.01031019, + "balance_loss_clip": 1.04547322, + "balance_loss_mlp": 1.01963425, + "epoch": 0.9611303171501578, + "flos": 20229989483520.0, + "grad_norm": 6.981364076184965, + "language_loss": 0.7891984, + "learning_rate": 1.580380726142283e-08, + "loss": 0.81070447, + "num_input_tokens_seen": 344907160, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11383057, + "step": 15986, + "time_per_iteration": 2.4001407623291016 + }, + { + "auxiliary_loss_clip": 0.01119521, + "auxiliary_loss_mlp": 0.01031435, + "balance_loss_clip": 1.04770839, + "balance_loss_mlp": 1.01831651, + "epoch": 0.9611904404028258, + "flos": 20951075013120.0, + "grad_norm": 3.95461584161993, + "language_loss": 0.64126313, + "learning_rate": 1.5754981226049792e-08, + "loss": 0.66277266, + "num_input_tokens_seen": 344922400, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.13110352, + "step": 15987, + "time_per_iteration": 3.80552077293396 + }, + { + "auxiliary_loss_clip": 0.01106607, + "auxiliary_loss_mlp": 0.01026261, + "balance_loss_clip": 1.03809643, + "balance_loss_mlp": 1.0155319, + "epoch": 0.9612505636554938, + "flos": 24827740214400.0, + "grad_norm": 2.327513372832294, + "language_loss": 0.66791302, + "learning_rate": 1.5706230433446544e-08, + "loss": 0.68924171, + "num_input_tokens_seen": 344941910, + "router_z_loss_clip": 0.68505859, + "router_z_loss_mlp": 0.1072998, + "step": 15988, + "time_per_iteration": 2.455418825149536 + }, + { + "auxiliary_loss_clip": 0.01113365, + "auxiliary_loss_mlp": 0.01036381, + "balance_loss_clip": 1.04132557, + "balance_loss_mlp": 1.02534819, + "epoch": 0.9613106869081617, + "flos": 17164977955200.0, + "grad_norm": 2.5345093638834286, + "language_loss": 0.74632615, + "learning_rate": 1.5657554885462055e-08, + "loss": 0.76782358, + "num_input_tokens_seen": 344960020, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11035156, + "step": 15989, + "time_per_iteration": 2.4248135089874268 + }, + { + "auxiliary_loss_clip": 0.01033846, + "auxiliary_loss_mlp": 0.00999064, + "balance_loss_clip": 1.00929332, + "balance_loss_mlp": 0.99776793, + "epoch": 0.9613708101608297, + "flos": 61563818522880.0, + "grad_norm": 0.8347142371905559, + "language_loss": 0.63108742, + "learning_rate": 1.5608954583941737e-08, + "loss": 0.65141654, + "num_input_tokens_seen": 345018290, + "router_z_loss_clip": 0.24560547, + "router_z_loss_mlp": 0.01296997, + "step": 15990, + "time_per_iteration": 2.978853225708008 + }, + { + "auxiliary_loss_clip": 0.01116892, + "auxiliary_loss_mlp": 0.01028712, + "balance_loss_clip": 1.04380655, + "balance_loss_mlp": 1.01752448, + "epoch": 0.9614309334134977, + "flos": 27417904922880.0, + "grad_norm": 1.726997396660343, + "language_loss": 0.77903819, + "learning_rate": 1.5560429530729003e-08, + "loss": 0.80049425, + "num_input_tokens_seen": 345040235, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11187744, + "step": 15991, + "time_per_iteration": 2.5180418491363525 + }, + { + "auxiliary_loss_clip": 0.01121005, + "auxiliary_loss_mlp": 0.01036756, + "balance_loss_clip": 1.04220831, + "balance_loss_mlp": 1.0232197, + "epoch": 0.9614910566661656, + "flos": 22819148611200.0, + "grad_norm": 2.480590181992426, + "language_loss": 0.84900951, + "learning_rate": 1.5511979727663493e-08, + "loss": 0.87058711, + "num_input_tokens_seen": 345054540, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.13531494, + "step": 15992, + "time_per_iteration": 2.4203412532806396 + }, + { + "auxiliary_loss_clip": 0.01108494, + "auxiliary_loss_mlp": 0.010384, + "balance_loss_clip": 1.03617036, + "balance_loss_mlp": 1.02438724, + "epoch": 0.9615511799188337, + "flos": 20667812359680.0, + "grad_norm": 1.9660602895346817, + "language_loss": 0.72441709, + "learning_rate": 1.5463605176582406e-08, + "loss": 0.74588609, + "num_input_tokens_seen": 345074035, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.14019775, + "step": 15993, + "time_per_iteration": 2.452709197998047 + }, + { + "auxiliary_loss_clip": 0.01113741, + "auxiliary_loss_mlp": 0.0104108, + "balance_loss_clip": 1.03934956, + "balance_loss_mlp": 1.02689385, + "epoch": 0.9616113031715016, + "flos": 33149212035840.0, + "grad_norm": 9.476666503331055, + "language_loss": 0.68169641, + "learning_rate": 1.5415305879320716e-08, + "loss": 0.70324457, + "num_input_tokens_seen": 345099270, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.14190674, + "step": 15994, + "time_per_iteration": 2.5627365112304688 + }, + { + "auxiliary_loss_clip": 0.01112714, + "auxiliary_loss_mlp": 0.01030662, + "balance_loss_clip": 1.04111576, + "balance_loss_mlp": 1.01871729, + "epoch": 0.9616714264241696, + "flos": 25009807276800.0, + "grad_norm": 1.8066076556389357, + "language_loss": 0.84837127, + "learning_rate": 1.5367081837709183e-08, + "loss": 0.8698051, + "num_input_tokens_seen": 345116975, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11938477, + "step": 15995, + "time_per_iteration": 2.474466562271118 + }, + { + "auxiliary_loss_clip": 0.01116818, + "auxiliary_loss_mlp": 0.01034822, + "balance_loss_clip": 1.04160416, + "balance_loss_mlp": 1.02233517, + "epoch": 0.9617315496768375, + "flos": 13547480359680.0, + "grad_norm": 1.8220083295934342, + "language_loss": 0.75979465, + "learning_rate": 1.5318933053576788e-08, + "loss": 0.78131104, + "num_input_tokens_seen": 345133645, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.12481689, + "step": 15996, + "time_per_iteration": 2.407519817352295 + }, + { + "auxiliary_loss_clip": 0.01113814, + "auxiliary_loss_mlp": 0.01028676, + "balance_loss_clip": 1.04269767, + "balance_loss_mlp": 1.01656461, + "epoch": 0.9617916729295055, + "flos": 11254512781440.0, + "grad_norm": 2.021391119483335, + "language_loss": 0.7700603, + "learning_rate": 1.52708595287494e-08, + "loss": 0.79148519, + "num_input_tokens_seen": 345150740, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.12103271, + "step": 15997, + "time_per_iteration": 2.4478871822357178 + }, + { + "auxiliary_loss_clip": 0.01122807, + "auxiliary_loss_mlp": 0.01025401, + "balance_loss_clip": 1.05141878, + "balance_loss_mlp": 1.01466656, + "epoch": 0.9618517961821734, + "flos": 22819723228800.0, + "grad_norm": 3.13064602619085, + "language_loss": 0.67121452, + "learning_rate": 1.522286126505001e-08, + "loss": 0.69269657, + "num_input_tokens_seen": 345170365, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.10736084, + "step": 15998, + "time_per_iteration": 2.4765431880950928 + }, + { + "auxiliary_loss_clip": 0.01124549, + "auxiliary_loss_mlp": 0.01026873, + "balance_loss_clip": 1.05003321, + "balance_loss_mlp": 1.01462436, + "epoch": 0.9619119194348414, + "flos": 16617340224000.0, + "grad_norm": 1.686397389312547, + "language_loss": 0.72937465, + "learning_rate": 1.5174938264298498e-08, + "loss": 0.75088882, + "num_input_tokens_seen": 345188930, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.12261963, + "step": 15999, + "time_per_iteration": 2.4031026363372803 + }, + { + "auxiliary_loss_clip": 0.01112669, + "auxiliary_loss_mlp": 0.01022974, + "balance_loss_clip": 1.0442642, + "balance_loss_mlp": 1.01272821, + "epoch": 0.9619720426875094, + "flos": 24535140024960.0, + "grad_norm": 1.866829847646505, + "language_loss": 0.65665388, + "learning_rate": 1.5127090528312514e-08, + "loss": 0.67801034, + "num_input_tokens_seen": 345209615, + "router_z_loss_clip": 0.68408203, + "router_z_loss_mlp": 0.10247803, + "step": 16000, + "time_per_iteration": 2.4929163455963135 + }, + { + "auxiliary_loss_clip": 0.01115656, + "auxiliary_loss_mlp": 0.01027461, + "balance_loss_clip": 1.04379201, + "balance_loss_mlp": 1.01583242, + "epoch": 0.9620321659401774, + "flos": 20632224960000.0, + "grad_norm": 1.71774344316348, + "language_loss": 0.75494713, + "learning_rate": 1.5079318058905723e-08, + "loss": 0.77637833, + "num_input_tokens_seen": 345229175, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11621094, + "step": 16001, + "time_per_iteration": 2.4274470806121826 + }, + { + "auxiliary_loss_clip": 0.01110849, + "auxiliary_loss_mlp": 0.01028688, + "balance_loss_clip": 1.03995979, + "balance_loss_mlp": 1.01677275, + "epoch": 0.9620922891928453, + "flos": 18515290959360.0, + "grad_norm": 1.5739959463652757, + "language_loss": 0.68388599, + "learning_rate": 1.5031620857890447e-08, + "loss": 0.70528138, + "num_input_tokens_seen": 345247815, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.11920166, + "step": 16002, + "time_per_iteration": 2.4586384296417236 + }, + { + "auxiliary_loss_clip": 0.01111608, + "auxiliary_loss_mlp": 0.01029629, + "balance_loss_clip": 1.04188752, + "balance_loss_mlp": 1.01818526, + "epoch": 0.9621524124455133, + "flos": 28767391914240.0, + "grad_norm": 1.2849842513077492, + "language_loss": 0.64558768, + "learning_rate": 1.4983998927074804e-08, + "loss": 0.66700006, + "num_input_tokens_seen": 345269935, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.11450195, + "step": 16003, + "time_per_iteration": 2.4991252422332764 + }, + { + "auxiliary_loss_clip": 0.01113842, + "auxiliary_loss_mlp": 0.01034769, + "balance_loss_clip": 1.04246628, + "balance_loss_mlp": 1.0237124, + "epoch": 0.9622125356981813, + "flos": 19098875226240.0, + "grad_norm": 19.82354782740192, + "language_loss": 0.75868696, + "learning_rate": 1.493645226826512e-08, + "loss": 0.78017306, + "num_input_tokens_seen": 345288310, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.1105957, + "step": 16004, + "time_per_iteration": 2.470937490463257 + }, + { + "auxiliary_loss_clip": 0.01115825, + "auxiliary_loss_mlp": 0.01028536, + "balance_loss_clip": 1.04536629, + "balance_loss_mlp": 1.01647162, + "epoch": 0.9622726589508492, + "flos": 20302816308480.0, + "grad_norm": 2.1448299732809835, + "language_loss": 0.79699135, + "learning_rate": 1.4888980883263958e-08, + "loss": 0.81843495, + "num_input_tokens_seen": 345306615, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.12054443, + "step": 16005, + "time_per_iteration": 2.4702115058898926 + }, + { + "auxiliary_loss_clip": 0.01113316, + "auxiliary_loss_mlp": 0.01025905, + "balance_loss_clip": 1.04346824, + "balance_loss_mlp": 1.01459801, + "epoch": 0.9623327822035173, + "flos": 54929750889600.0, + "grad_norm": 2.293632112801073, + "language_loss": 0.67490101, + "learning_rate": 1.4841584773871652e-08, + "loss": 0.69629323, + "num_input_tokens_seen": 345331935, + "router_z_loss_clip": 0.69775391, + "router_z_loss_mlp": 0.11309814, + "step": 16006, + "time_per_iteration": 2.8033030033111572 + }, + { + "auxiliary_loss_clip": 0.01108125, + "auxiliary_loss_mlp": 0.01032842, + "balance_loss_clip": 1.04120529, + "balance_loss_mlp": 1.02181494, + "epoch": 0.9623929054561852, + "flos": 21759029585280.0, + "grad_norm": 1.5671345267191332, + "language_loss": 0.78105861, + "learning_rate": 1.479426394188521e-08, + "loss": 0.8024683, + "num_input_tokens_seen": 345351510, + "router_z_loss_clip": 0.66845703, + "router_z_loss_mlp": 0.11016846, + "step": 16007, + "time_per_iteration": 2.609370470046997 + }, + { + "auxiliary_loss_clip": 0.01118441, + "auxiliary_loss_mlp": 0.0102971, + "balance_loss_clip": 1.04537952, + "balance_loss_mlp": 1.01757431, + "epoch": 0.9624530287088532, + "flos": 17931563038080.0, + "grad_norm": 2.6606117444044894, + "language_loss": 0.67808992, + "learning_rate": 1.4747018389099198e-08, + "loss": 0.69957143, + "num_input_tokens_seen": 345367750, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.121521, + "step": 16008, + "time_per_iteration": 2.503352165222168 + }, + { + "auxiliary_loss_clip": 0.01120825, + "auxiliary_loss_mlp": 0.01032601, + "balance_loss_clip": 1.04758918, + "balance_loss_mlp": 1.01923752, + "epoch": 0.9625131519615211, + "flos": 23253739263360.0, + "grad_norm": 2.9534094518071115, + "language_loss": 0.72925079, + "learning_rate": 1.469984811730529e-08, + "loss": 0.75078511, + "num_input_tokens_seen": 345384790, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.13348389, + "step": 16009, + "time_per_iteration": 3.8650026321411133 + }, + { + "auxiliary_loss_clip": 0.01109303, + "auxiliary_loss_mlp": 0.01028558, + "balance_loss_clip": 1.03846908, + "balance_loss_mlp": 1.01731086, + "epoch": 0.9625732752141891, + "flos": 18916628595840.0, + "grad_norm": 2.1096074592769365, + "language_loss": 0.75649822, + "learning_rate": 1.4652753128292061e-08, + "loss": 0.77787679, + "num_input_tokens_seen": 345403390, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11242676, + "step": 16010, + "time_per_iteration": 2.4423305988311768 + }, + { + "auxiliary_loss_clip": 0.0112133, + "auxiliary_loss_mlp": 0.01034438, + "balance_loss_clip": 1.04575419, + "balance_loss_mlp": 1.0198884, + "epoch": 0.962633398466857, + "flos": 16252918790400.0, + "grad_norm": 7.775381676310632, + "language_loss": 0.69481164, + "learning_rate": 1.4605733423845635e-08, + "loss": 0.71636933, + "num_input_tokens_seen": 345418685, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.14544678, + "step": 16011, + "time_per_iteration": 2.3806495666503906 + }, + { + "auxiliary_loss_clip": 0.011078, + "auxiliary_loss_mlp": 0.01030657, + "balance_loss_clip": 1.03718519, + "balance_loss_mlp": 1.01942742, + "epoch": 0.962693521719525, + "flos": 54197424403200.0, + "grad_norm": 1.8396547014974078, + "language_loss": 0.68838513, + "learning_rate": 1.4558789005748585e-08, + "loss": 0.70976973, + "num_input_tokens_seen": 345442380, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.11236572, + "step": 16012, + "time_per_iteration": 2.7519359588623047 + }, + { + "auxiliary_loss_clip": 0.01121508, + "auxiliary_loss_mlp": 0.01030544, + "balance_loss_clip": 1.04516149, + "balance_loss_mlp": 1.01752627, + "epoch": 0.962753644972193, + "flos": 33105795471360.0, + "grad_norm": 1.8203670199400896, + "language_loss": 0.72414076, + "learning_rate": 1.4511919875781264e-08, + "loss": 0.74566126, + "num_input_tokens_seen": 345463815, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.13012695, + "step": 16013, + "time_per_iteration": 2.5474956035614014 + }, + { + "auxiliary_loss_clip": 0.01112272, + "auxiliary_loss_mlp": 0.01032092, + "balance_loss_clip": 1.03998494, + "balance_loss_mlp": 1.01925933, + "epoch": 0.962813768224861, + "flos": 42230660837760.0, + "grad_norm": 2.3620712950470355, + "language_loss": 0.63138044, + "learning_rate": 1.4465126035720698e-08, + "loss": 0.65282404, + "num_input_tokens_seen": 345484525, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.12823486, + "step": 16014, + "time_per_iteration": 2.607773780822754 + }, + { + "auxiliary_loss_clip": 0.01105937, + "auxiliary_loss_mlp": 0.01028205, + "balance_loss_clip": 1.03848791, + "balance_loss_mlp": 1.01775646, + "epoch": 0.9628738914775289, + "flos": 43944677003520.0, + "grad_norm": 4.25018088513851, + "language_loss": 0.7217409, + "learning_rate": 1.4418407487341688e-08, + "loss": 0.74308234, + "num_input_tokens_seen": 345508295, + "router_z_loss_clip": 0.67382812, + "router_z_loss_mlp": 0.10455322, + "step": 16015, + "time_per_iteration": 2.614725112915039 + }, + { + "auxiliary_loss_clip": 0.011094, + "auxiliary_loss_mlp": 0.0102929, + "balance_loss_clip": 1.03897285, + "balance_loss_mlp": 1.01633835, + "epoch": 0.9629340147301969, + "flos": 15596184476160.0, + "grad_norm": 2.0619456837249177, + "language_loss": 0.77447647, + "learning_rate": 1.4371764232415707e-08, + "loss": 0.79586339, + "num_input_tokens_seen": 345525155, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.12939453, + "step": 16016, + "time_per_iteration": 2.3954806327819824 + }, + { + "auxiliary_loss_clip": 0.01038607, + "auxiliary_loss_mlp": 0.01007883, + "balance_loss_clip": 1.01384079, + "balance_loss_mlp": 1.00646245, + "epoch": 0.9629941379828649, + "flos": 62951011816320.0, + "grad_norm": 0.809784271377842, + "language_loss": 0.63034308, + "learning_rate": 1.4325196272711337e-08, + "loss": 0.65080798, + "num_input_tokens_seen": 345578905, + "router_z_loss_clip": 0.24755859, + "router_z_loss_mlp": 0.01417542, + "step": 16017, + "time_per_iteration": 2.9519855976104736 + }, + { + "auxiliary_loss_clip": 0.01118411, + "auxiliary_loss_mlp": 0.01028718, + "balance_loss_clip": 1.04528749, + "balance_loss_mlp": 1.01695752, + "epoch": 0.9630542612355328, + "flos": 29899116702720.0, + "grad_norm": 1.9447742668060655, + "language_loss": 0.66082782, + "learning_rate": 1.4278703609994502e-08, + "loss": 0.68229914, + "num_input_tokens_seen": 345598965, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11749268, + "step": 16018, + "time_per_iteration": 2.506342887878418 + }, + { + "auxiliary_loss_clip": 0.01114902, + "auxiliary_loss_mlp": 0.01035643, + "balance_loss_clip": 1.04159975, + "balance_loss_mlp": 1.02371585, + "epoch": 0.9631143844882009, + "flos": 17894575008000.0, + "grad_norm": 1.8684492658146983, + "language_loss": 0.79658806, + "learning_rate": 1.4232286246028457e-08, + "loss": 0.81809354, + "num_input_tokens_seen": 345617945, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11932373, + "step": 16019, + "time_per_iteration": 3.879185676574707 + }, + { + "auxiliary_loss_clip": 0.01110063, + "auxiliary_loss_mlp": 0.01030087, + "balance_loss_clip": 1.04030418, + "balance_loss_mlp": 1.01991272, + "epoch": 0.9631745077408688, + "flos": 26139161767680.0, + "grad_norm": 1.4194061022758377, + "language_loss": 0.71321696, + "learning_rate": 1.4185944182572907e-08, + "loss": 0.73461848, + "num_input_tokens_seen": 345637920, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.10168457, + "step": 16020, + "time_per_iteration": 2.475606679916382 + }, + { + "auxiliary_loss_clip": 0.01115064, + "auxiliary_loss_mlp": 0.01028047, + "balance_loss_clip": 1.04380369, + "balance_loss_mlp": 1.01731205, + "epoch": 0.9632346309935368, + "flos": 24973645259520.0, + "grad_norm": 1.6928056927688702, + "language_loss": 0.7684719, + "learning_rate": 1.4139677421385331e-08, + "loss": 0.78990299, + "num_input_tokens_seen": 345656195, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.1072998, + "step": 16021, + "time_per_iteration": 2.5237135887145996 + }, + { + "auxiliary_loss_clip": 0.01120094, + "auxiliary_loss_mlp": 0.01029639, + "balance_loss_clip": 1.04458022, + "balance_loss_mlp": 1.01634717, + "epoch": 0.9632947542462047, + "flos": 23617226943360.0, + "grad_norm": 2.8872236547172316, + "language_loss": 0.65037835, + "learning_rate": 1.4093485964220331e-08, + "loss": 0.67187572, + "num_input_tokens_seen": 345676700, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.13293457, + "step": 16022, + "time_per_iteration": 2.4594225883483887 + }, + { + "auxiliary_loss_clip": 0.01109372, + "auxiliary_loss_mlp": 0.01032172, + "balance_loss_clip": 1.03879046, + "balance_loss_mlp": 1.02110982, + "epoch": 0.9633548774988727, + "flos": 26395599939840.0, + "grad_norm": 2.014649489340194, + "language_loss": 0.73123264, + "learning_rate": 1.4047369812829168e-08, + "loss": 0.75264806, + "num_input_tokens_seen": 345696725, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.1105957, + "step": 16023, + "time_per_iteration": 3.933541774749756 + }, + { + "auxiliary_loss_clip": 0.01114114, + "auxiliary_loss_mlp": 0.01025315, + "balance_loss_clip": 1.04160881, + "balance_loss_mlp": 1.01445484, + "epoch": 0.9634150007515406, + "flos": 23767728929280.0, + "grad_norm": 1.7075564462700106, + "language_loss": 0.81521189, + "learning_rate": 1.4001328968960891e-08, + "loss": 0.83660614, + "num_input_tokens_seen": 345716245, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.10858154, + "step": 16024, + "time_per_iteration": 2.5728464126586914 + }, + { + "auxiliary_loss_clip": 0.01119261, + "auxiliary_loss_mlp": 0.01034729, + "balance_loss_clip": 1.04421663, + "balance_loss_mlp": 1.02224231, + "epoch": 0.9634751240042086, + "flos": 24135346673280.0, + "grad_norm": 2.0711952709355823, + "language_loss": 0.81514782, + "learning_rate": 1.3955363434361212e-08, + "loss": 0.83668774, + "num_input_tokens_seen": 345739060, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.12481689, + "step": 16025, + "time_per_iteration": 2.493326425552368 + }, + { + "auxiliary_loss_clip": 0.01109988, + "auxiliary_loss_mlp": 0.01029279, + "balance_loss_clip": 1.03610706, + "balance_loss_mlp": 1.01735818, + "epoch": 0.9635352472568766, + "flos": 24349086552960.0, + "grad_norm": 1.820040381249332, + "language_loss": 0.76344597, + "learning_rate": 1.3909473210773181e-08, + "loss": 0.78483862, + "num_input_tokens_seen": 345758325, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11920166, + "step": 16026, + "time_per_iteration": 2.5452592372894287 + }, + { + "auxiliary_loss_clip": 0.01107401, + "auxiliary_loss_mlp": 0.01034733, + "balance_loss_clip": 1.03519058, + "balance_loss_mlp": 1.02219796, + "epoch": 0.9635953705095446, + "flos": 23984772860160.0, + "grad_norm": 1.6327226985802137, + "language_loss": 0.63033921, + "learning_rate": 1.3863658299936965e-08, + "loss": 0.65176058, + "num_input_tokens_seen": 345778530, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.12554932, + "step": 16027, + "time_per_iteration": 2.466792583465576 + }, + { + "auxiliary_loss_clip": 0.01118314, + "auxiliary_loss_mlp": 0.01026394, + "balance_loss_clip": 1.04294038, + "balance_loss_mlp": 1.01422918, + "epoch": 0.9636554937622125, + "flos": 19828436365440.0, + "grad_norm": 1.9747295248691883, + "language_loss": 0.87120324, + "learning_rate": 1.3817918703589837e-08, + "loss": 0.89265037, + "num_input_tokens_seen": 345796535, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.1217041, + "step": 16028, + "time_per_iteration": 2.4588096141815186 + }, + { + "auxiliary_loss_clip": 0.01076773, + "auxiliary_loss_mlp": 0.01005737, + "balance_loss_clip": 1.05259275, + "balance_loss_mlp": 1.00436282, + "epoch": 0.9637156170148805, + "flos": 67435499986560.0, + "grad_norm": 0.6797285947778277, + "language_loss": 0.53115618, + "learning_rate": 1.3772254423466412e-08, + "loss": 0.55198127, + "num_input_tokens_seen": 345859700, + "router_z_loss_clip": 0.24145508, + "router_z_loss_mlp": 0.01373291, + "step": 16029, + "time_per_iteration": 3.0669524669647217 + }, + { + "auxiliary_loss_clip": 0.01116277, + "auxiliary_loss_mlp": 0.01024521, + "balance_loss_clip": 1.04406571, + "balance_loss_mlp": 1.01344061, + "epoch": 0.9637757402675484, + "flos": 20300912887680.0, + "grad_norm": 1.8548109254056653, + "language_loss": 0.74109256, + "learning_rate": 1.372666546129797e-08, + "loss": 0.76250058, + "num_input_tokens_seen": 345878760, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11083984, + "step": 16030, + "time_per_iteration": 3.975435256958008 + }, + { + "auxiliary_loss_clip": 0.01108782, + "auxiliary_loss_mlp": 0.01027124, + "balance_loss_clip": 1.03990102, + "balance_loss_mlp": 1.01599622, + "epoch": 0.9638358635202164, + "flos": 27234544970880.0, + "grad_norm": 1.7494661183747322, + "language_loss": 0.66282082, + "learning_rate": 1.3681151818813575e-08, + "loss": 0.6841799, + "num_input_tokens_seen": 345900445, + "router_z_loss_clip": 0.68896484, + "router_z_loss_mlp": 0.11126709, + "step": 16031, + "time_per_iteration": 2.460026741027832 + }, + { + "auxiliary_loss_clip": 0.01043525, + "auxiliary_loss_mlp": 0.01001635, + "balance_loss_clip": 1.01949668, + "balance_loss_mlp": 1.00045919, + "epoch": 0.9638959867728845, + "flos": 70288998278400.0, + "grad_norm": 0.8374419085007733, + "language_loss": 0.60797995, + "learning_rate": 1.3635713497738955e-08, + "loss": 0.62843156, + "num_input_tokens_seen": 345961020, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.01174927, + "step": 16032, + "time_per_iteration": 3.126173257827759 + }, + { + "auxiliary_loss_clip": 0.01101408, + "auxiliary_loss_mlp": 0.0102732, + "balance_loss_clip": 1.0356369, + "balance_loss_mlp": 1.01708591, + "epoch": 0.9639561100255524, + "flos": 25407517639680.0, + "grad_norm": 1.8700473111088916, + "language_loss": 0.66448116, + "learning_rate": 1.3590350499796954e-08, + "loss": 0.68576837, + "num_input_tokens_seen": 345980210, + "router_z_loss_clip": 0.65771484, + "router_z_loss_mlp": 0.10235596, + "step": 16033, + "time_per_iteration": 2.5198376178741455 + }, + { + "auxiliary_loss_clip": 0.01112106, + "auxiliary_loss_mlp": 0.01029771, + "balance_loss_clip": 1.04134083, + "balance_loss_mlp": 1.01802933, + "epoch": 0.9640162332782204, + "flos": 18113881495680.0, + "grad_norm": 1.6231139893389759, + "language_loss": 0.65393317, + "learning_rate": 1.3545062826707976e-08, + "loss": 0.67535192, + "num_input_tokens_seen": 345998280, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11749268, + "step": 16034, + "time_per_iteration": 2.449511766433716 + }, + { + "auxiliary_loss_clip": 0.0111413, + "auxiliary_loss_mlp": 0.01029949, + "balance_loss_clip": 1.04114127, + "balance_loss_mlp": 1.018332, + "epoch": 0.9640763565308883, + "flos": 23440295525760.0, + "grad_norm": 2.206220275175307, + "language_loss": 0.74252474, + "learning_rate": 1.3499850480189313e-08, + "loss": 0.76396549, + "num_input_tokens_seen": 346015545, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11627197, + "step": 16035, + "time_per_iteration": 2.454188585281372 + }, + { + "auxiliary_loss_clip": 0.01121066, + "auxiliary_loss_mlp": 0.01029824, + "balance_loss_clip": 1.04972339, + "balance_loss_mlp": 1.01780772, + "epoch": 0.9641364797835563, + "flos": 22419355259520.0, + "grad_norm": 2.8624766570035565, + "language_loss": 0.82132936, + "learning_rate": 1.3454713461955591e-08, + "loss": 0.84283823, + "num_input_tokens_seen": 346034055, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.12017822, + "step": 16036, + "time_per_iteration": 2.4358744621276855 + }, + { + "auxiliary_loss_clip": 0.01110468, + "auxiliary_loss_mlp": 0.01027819, + "balance_loss_clip": 1.03954792, + "balance_loss_mlp": 1.01613057, + "epoch": 0.9641966030362242, + "flos": 30622357048320.0, + "grad_norm": 1.860448908486801, + "language_loss": 0.69963646, + "learning_rate": 1.340965177371789e-08, + "loss": 0.72101927, + "num_input_tokens_seen": 346054130, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.11700439, + "step": 16037, + "time_per_iteration": 2.500824213027954 + }, + { + "auxiliary_loss_clip": 0.01116555, + "auxiliary_loss_mlp": 0.01032666, + "balance_loss_clip": 1.04216361, + "balance_loss_mlp": 1.02016068, + "epoch": 0.9642567262888923, + "flos": 20953122088320.0, + "grad_norm": 1.9375405394025143, + "language_loss": 0.6313777, + "learning_rate": 1.3364665417185506e-08, + "loss": 0.65286994, + "num_input_tokens_seen": 346072990, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.12512207, + "step": 16038, + "time_per_iteration": 2.441030263900757 + }, + { + "auxiliary_loss_clip": 0.01120933, + "auxiliary_loss_mlp": 0.01033854, + "balance_loss_clip": 1.04388499, + "balance_loss_mlp": 1.02149844, + "epoch": 0.9643168495415602, + "flos": 22639415932800.0, + "grad_norm": 1.829334625505124, + "language_loss": 0.70889187, + "learning_rate": 1.3319754394064187e-08, + "loss": 0.73043978, + "num_input_tokens_seen": 346093745, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.12371826, + "step": 16039, + "time_per_iteration": 2.4611353874206543 + }, + { + "auxiliary_loss_clip": 0.01116706, + "auxiliary_loss_mlp": 0.01027953, + "balance_loss_clip": 1.04430223, + "balance_loss_mlp": 1.01619887, + "epoch": 0.9643769727942282, + "flos": 20266259241600.0, + "grad_norm": 2.2654663367970116, + "language_loss": 0.73315954, + "learning_rate": 1.327491870605657e-08, + "loss": 0.75460613, + "num_input_tokens_seen": 346110115, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11743164, + "step": 16040, + "time_per_iteration": 2.4051709175109863 + }, + { + "auxiliary_loss_clip": 0.0112243, + "auxiliary_loss_mlp": 0.0103352, + "balance_loss_clip": 1.04507494, + "balance_loss_mlp": 1.02099705, + "epoch": 0.9644370960468961, + "flos": 13881845088000.0, + "grad_norm": 1.9652114205053837, + "language_loss": 0.73070562, + "learning_rate": 1.3230158354863296e-08, + "loss": 0.75226504, + "num_input_tokens_seen": 346127165, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.12530518, + "step": 16041, + "time_per_iteration": 2.403975009918213 + }, + { + "auxiliary_loss_clip": 0.01111708, + "auxiliary_loss_mlp": 0.01029721, + "balance_loss_clip": 1.04309535, + "balance_loss_mlp": 1.01879501, + "epoch": 0.9644972192995641, + "flos": 17238199829760.0, + "grad_norm": 1.8052061313287748, + "language_loss": 0.71798289, + "learning_rate": 1.3185473342181674e-08, + "loss": 0.73939717, + "num_input_tokens_seen": 346145950, + "router_z_loss_clip": 0.68652344, + "router_z_loss_mlp": 0.10931396, + "step": 16042, + "time_per_iteration": 2.455390214920044 + }, + { + "auxiliary_loss_clip": 0.01114477, + "auxiliary_loss_mlp": 0.01029199, + "balance_loss_clip": 1.04135263, + "balance_loss_mlp": 1.01835704, + "epoch": 0.964557342552232, + "flos": 23840340272640.0, + "grad_norm": 1.696036756017191, + "language_loss": 0.81099552, + "learning_rate": 1.3140863669705683e-08, + "loss": 0.83243227, + "num_input_tokens_seen": 346165005, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.10845947, + "step": 16043, + "time_per_iteration": 2.447362184524536 + }, + { + "auxiliary_loss_clip": 0.0111556, + "auxiliary_loss_mlp": 0.01030248, + "balance_loss_clip": 1.04511583, + "balance_loss_mlp": 1.01857138, + "epoch": 0.9646174658049, + "flos": 21653129312640.0, + "grad_norm": 2.0949220257789802, + "language_loss": 0.71662289, + "learning_rate": 1.3096329339127522e-08, + "loss": 0.73808098, + "num_input_tokens_seen": 346185095, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.11682129, + "step": 16044, + "time_per_iteration": 2.471895933151245 + }, + { + "auxiliary_loss_clip": 0.01111912, + "auxiliary_loss_mlp": 0.01026804, + "balance_loss_clip": 1.04178095, + "balance_loss_mlp": 1.01509786, + "epoch": 0.9646775890575681, + "flos": 17129570123520.0, + "grad_norm": 1.8344916021119921, + "language_loss": 0.69997144, + "learning_rate": 1.3051870352135397e-08, + "loss": 0.72135866, + "num_input_tokens_seen": 346202580, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.1171875, + "step": 16045, + "time_per_iteration": 2.394514560699463 + }, + { + "auxiliary_loss_clip": 0.01112195, + "auxiliary_loss_mlp": 0.01030572, + "balance_loss_clip": 1.03903461, + "balance_loss_mlp": 1.01794779, + "epoch": 0.964737712310236, + "flos": 13005732458880.0, + "grad_norm": 1.855909654960304, + "language_loss": 0.75478834, + "learning_rate": 1.3007486710415737e-08, + "loss": 0.77621603, + "num_input_tokens_seen": 346219395, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.12628174, + "step": 16046, + "time_per_iteration": 2.43375301361084 + }, + { + "auxiliary_loss_clip": 0.01111202, + "auxiliary_loss_mlp": 0.01032861, + "balance_loss_clip": 1.0378933, + "balance_loss_mlp": 1.01852012, + "epoch": 0.964797835562904, + "flos": 24279240556800.0, + "grad_norm": 1.8623132251244838, + "language_loss": 0.62690502, + "learning_rate": 1.2963178415651199e-08, + "loss": 0.64834559, + "num_input_tokens_seen": 346239715, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.14331055, + "step": 16047, + "time_per_iteration": 2.456573247909546 + }, + { + "auxiliary_loss_clip": 0.01116509, + "auxiliary_loss_mlp": 0.01030588, + "balance_loss_clip": 1.04351616, + "balance_loss_mlp": 1.01854801, + "epoch": 0.9648579588155719, + "flos": 20522697413760.0, + "grad_norm": 1.807589502270695, + "language_loss": 0.69178057, + "learning_rate": 1.2918945469521992e-08, + "loss": 0.71325153, + "num_input_tokens_seen": 346258500, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.12042236, + "step": 16048, + "time_per_iteration": 2.437981128692627 + }, + { + "auxiliary_loss_clip": 0.01108414, + "auxiliary_loss_mlp": 0.01029885, + "balance_loss_clip": 1.03563833, + "balance_loss_mlp": 1.01745784, + "epoch": 0.9649180820682399, + "flos": 32154844855680.0, + "grad_norm": 2.200473918853508, + "language_loss": 0.63420081, + "learning_rate": 1.2874787873705662e-08, + "loss": 0.65558386, + "num_input_tokens_seen": 346279110, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.12451172, + "step": 16049, + "time_per_iteration": 2.512615442276001 + }, + { + "auxiliary_loss_clip": 0.01117596, + "auxiliary_loss_mlp": 0.01026069, + "balance_loss_clip": 1.04548061, + "balance_loss_mlp": 1.01419616, + "epoch": 0.9649782053209078, + "flos": 20522589672960.0, + "grad_norm": 6.774005891888178, + "language_loss": 0.71040964, + "learning_rate": 1.2830705629876427e-08, + "loss": 0.73184621, + "num_input_tokens_seen": 346297860, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11889648, + "step": 16050, + "time_per_iteration": 2.447018623352051 + }, + { + "auxiliary_loss_clip": 0.01118996, + "auxiliary_loss_mlp": 0.01039782, + "balance_loss_clip": 1.04087043, + "balance_loss_mlp": 1.02446926, + "epoch": 0.9650383285735759, + "flos": 43067953843200.0, + "grad_norm": 4.788656001973859, + "language_loss": 0.70183235, + "learning_rate": 1.278669873970606e-08, + "loss": 0.72342014, + "num_input_tokens_seen": 346319860, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.15313721, + "step": 16051, + "time_per_iteration": 2.604400157928467 + }, + { + "auxiliary_loss_clip": 0.01037347, + "auxiliary_loss_mlp": 0.01004617, + "balance_loss_clip": 1.0127331, + "balance_loss_mlp": 1.00331593, + "epoch": 0.9650984518262438, + "flos": 61748255882880.0, + "grad_norm": 0.8449849779137825, + "language_loss": 0.59131896, + "learning_rate": 1.2742767204863004e-08, + "loss": 0.61173868, + "num_input_tokens_seen": 346379025, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 0.01300049, + "step": 16052, + "time_per_iteration": 4.554698944091797 + }, + { + "auxiliary_loss_clip": 0.01109925, + "auxiliary_loss_mlp": 0.01033348, + "balance_loss_clip": 1.03957844, + "balance_loss_mlp": 1.01958537, + "epoch": 0.9651585750789118, + "flos": 29789337761280.0, + "grad_norm": 1.7187595506915525, + "language_loss": 0.74143755, + "learning_rate": 1.2698911027013482e-08, + "loss": 0.76287031, + "num_input_tokens_seen": 346402250, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.13751221, + "step": 16053, + "time_per_iteration": 2.5266356468200684 + }, + { + "auxiliary_loss_clip": 0.0111809, + "auxiliary_loss_mlp": 0.01028772, + "balance_loss_clip": 1.04579842, + "balance_loss_mlp": 1.01723862, + "epoch": 0.9652186983315797, + "flos": 16873060124160.0, + "grad_norm": 2.2830939357987687, + "language_loss": 0.68596804, + "learning_rate": 1.2655130207820386e-08, + "loss": 0.70743662, + "num_input_tokens_seen": 346419555, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11529541, + "step": 16054, + "time_per_iteration": 2.3921594619750977 + }, + { + "auxiliary_loss_clip": 0.01117057, + "auxiliary_loss_mlp": 0.01030681, + "balance_loss_clip": 1.04406297, + "balance_loss_mlp": 1.01964235, + "epoch": 0.9652788215842477, + "flos": 31649761762560.0, + "grad_norm": 1.5194522922964204, + "language_loss": 0.61865306, + "learning_rate": 1.2611424748943944e-08, + "loss": 0.6401304, + "num_input_tokens_seen": 346441245, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11047363, + "step": 16055, + "time_per_iteration": 2.523888349533081 + }, + { + "auxiliary_loss_clip": 0.01111861, + "auxiliary_loss_mlp": 0.0103004, + "balance_loss_clip": 1.04047751, + "balance_loss_mlp": 1.01838732, + "epoch": 0.9653389448369156, + "flos": 24754266944640.0, + "grad_norm": 2.6426618272895603, + "language_loss": 0.76996386, + "learning_rate": 1.2567794652041719e-08, + "loss": 0.79138285, + "num_input_tokens_seen": 346460065, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11645508, + "step": 16056, + "time_per_iteration": 2.4375059604644775 + }, + { + "auxiliary_loss_clip": 0.01110873, + "auxiliary_loss_mlp": 0.0102862, + "balance_loss_clip": 1.03872871, + "balance_loss_mlp": 1.01765335, + "epoch": 0.9653990680895836, + "flos": 20297249700480.0, + "grad_norm": 1.616686583534229, + "language_loss": 0.71620524, + "learning_rate": 1.2524239918767498e-08, + "loss": 0.73760015, + "num_input_tokens_seen": 346478005, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.10961914, + "step": 16057, + "time_per_iteration": 2.436765670776367 + }, + { + "auxiliary_loss_clip": 0.01108194, + "auxiliary_loss_mlp": 0.01036161, + "balance_loss_clip": 1.03702974, + "balance_loss_mlp": 1.02519345, + "epoch": 0.9654591913422517, + "flos": 22528775064960.0, + "grad_norm": 1.98263029279352, + "language_loss": 0.71708584, + "learning_rate": 1.2480760550773295e-08, + "loss": 0.73852932, + "num_input_tokens_seen": 346497575, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.10968018, + "step": 16058, + "time_per_iteration": 2.415285110473633 + }, + { + "auxiliary_loss_clip": 0.01105146, + "auxiliary_loss_mlp": 0.01035087, + "balance_loss_clip": 1.03569293, + "balance_loss_mlp": 1.02376819, + "epoch": 0.9655193145949196, + "flos": 26763002202240.0, + "grad_norm": 1.557302129192367, + "language_loss": 0.73802292, + "learning_rate": 1.2437356549708011e-08, + "loss": 0.75942522, + "num_input_tokens_seen": 346520000, + "router_z_loss_clip": 0.69482422, + "router_z_loss_mlp": 0.11322021, + "step": 16059, + "time_per_iteration": 2.5067737102508545 + }, + { + "auxiliary_loss_clip": 0.01117167, + "auxiliary_loss_mlp": 0.01030448, + "balance_loss_clip": 1.04197621, + "balance_loss_mlp": 1.01861012, + "epoch": 0.9655794378475876, + "flos": 41970703132800.0, + "grad_norm": 2.1156201109352435, + "language_loss": 0.73762453, + "learning_rate": 1.239402791721722e-08, + "loss": 0.75910068, + "num_input_tokens_seen": 346541605, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.1184082, + "step": 16060, + "time_per_iteration": 2.591097354888916 + }, + { + "auxiliary_loss_clip": 0.01111221, + "auxiliary_loss_mlp": 0.01028085, + "balance_loss_clip": 1.0423162, + "balance_loss_mlp": 1.01755285, + "epoch": 0.9656395611002555, + "flos": 27709427704320.0, + "grad_norm": 2.577411750631464, + "language_loss": 0.76770008, + "learning_rate": 1.2350774654944273e-08, + "loss": 0.78909314, + "num_input_tokens_seen": 346560955, + "router_z_loss_clip": 0.68896484, + "router_z_loss_mlp": 0.10528564, + "step": 16061, + "time_per_iteration": 2.4975035190582275 + }, + { + "auxiliary_loss_clip": 0.01067583, + "auxiliary_loss_mlp": 0.01009213, + "balance_loss_clip": 1.04301023, + "balance_loss_mlp": 1.00776291, + "epoch": 0.9656996843529235, + "flos": 68968562411520.0, + "grad_norm": 0.7317873980895488, + "language_loss": 0.64190483, + "learning_rate": 1.2307596764528749e-08, + "loss": 0.66267288, + "num_input_tokens_seen": 346621615, + "router_z_loss_clip": 0.24584961, + "router_z_loss_mlp": 0.01448059, + "step": 16062, + "time_per_iteration": 4.463664293289185 + }, + { + "auxiliary_loss_clip": 0.01103908, + "auxiliary_loss_mlp": 0.01027438, + "balance_loss_clip": 1.03534913, + "balance_loss_mlp": 1.01709676, + "epoch": 0.9657598076055914, + "flos": 20631327120000.0, + "grad_norm": 2.3124316602294384, + "language_loss": 0.93585861, + "learning_rate": 1.226449424760867e-08, + "loss": 0.9571721, + "num_input_tokens_seen": 346637460, + "router_z_loss_clip": 0.68603516, + "router_z_loss_mlp": 0.10345459, + "step": 16063, + "time_per_iteration": 2.4199910163879395 + }, + { + "auxiliary_loss_clip": 0.01112542, + "auxiliary_loss_mlp": 0.01030114, + "balance_loss_clip": 1.04091823, + "balance_loss_mlp": 1.01858687, + "epoch": 0.9658199308582595, + "flos": 20448577699200.0, + "grad_norm": 2.000014320629008, + "language_loss": 0.82262146, + "learning_rate": 1.2221467105818062e-08, + "loss": 0.84404802, + "num_input_tokens_seen": 346655625, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11529541, + "step": 16064, + "time_per_iteration": 2.520289182662964 + }, + { + "auxiliary_loss_clip": 0.01114985, + "auxiliary_loss_mlp": 0.01030918, + "balance_loss_clip": 1.04292798, + "balance_loss_mlp": 1.01970029, + "epoch": 0.9658800541109274, + "flos": 24718033100160.0, + "grad_norm": 1.5205831427394019, + "language_loss": 0.84309882, + "learning_rate": 1.2178515340788731e-08, + "loss": 0.86455786, + "num_input_tokens_seen": 346675220, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11218262, + "step": 16065, + "time_per_iteration": 2.6661834716796875 + }, + { + "auxiliary_loss_clip": 0.01107375, + "auxiliary_loss_mlp": 0.01026254, + "balance_loss_clip": 1.03652978, + "balance_loss_mlp": 1.0144701, + "epoch": 0.9659401773635954, + "flos": 21610035970560.0, + "grad_norm": 1.926559118816241, + "language_loss": 0.67589533, + "learning_rate": 1.2135638954149151e-08, + "loss": 0.69723165, + "num_input_tokens_seen": 346694710, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11791992, + "step": 16066, + "time_per_iteration": 3.8984434604644775 + }, + { + "auxiliary_loss_clip": 0.01110827, + "auxiliary_loss_mlp": 0.01026071, + "balance_loss_clip": 1.04024243, + "balance_loss_mlp": 1.01499665, + "epoch": 0.9660003006162633, + "flos": 20301200196480.0, + "grad_norm": 1.8655072454518382, + "language_loss": 0.82217026, + "learning_rate": 1.209283794752558e-08, + "loss": 0.84353924, + "num_input_tokens_seen": 346712645, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.11065674, + "step": 16067, + "time_per_iteration": 2.4646925926208496 + }, + { + "auxiliary_loss_clip": 0.01114626, + "auxiliary_loss_mlp": 0.01026822, + "balance_loss_clip": 1.04164612, + "balance_loss_mlp": 1.01473379, + "epoch": 0.9660604238689313, + "flos": 24461954064000.0, + "grad_norm": 1.7637137562345844, + "language_loss": 0.68896115, + "learning_rate": 1.2050112322540496e-08, + "loss": 0.71037567, + "num_input_tokens_seen": 346732375, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.12091064, + "step": 16068, + "time_per_iteration": 2.472400665283203 + }, + { + "auxiliary_loss_clip": 0.01107624, + "auxiliary_loss_mlp": 0.01031129, + "balance_loss_clip": 1.04108882, + "balance_loss_mlp": 1.02109098, + "epoch": 0.9661205471215992, + "flos": 19864023765120.0, + "grad_norm": 1.7100071634270326, + "language_loss": 0.68480492, + "learning_rate": 1.20074620808146e-08, + "loss": 0.70619249, + "num_input_tokens_seen": 346750430, + "router_z_loss_clip": 0.66455078, + "router_z_loss_mlp": 0.10040283, + "step": 16069, + "time_per_iteration": 2.442074775695801 + }, + { + "auxiliary_loss_clip": 0.0112831, + "auxiliary_loss_mlp": 0.01025949, + "balance_loss_clip": 1.05236495, + "balance_loss_mlp": 1.01424909, + "epoch": 0.9661806703742672, + "flos": 20557889763840.0, + "grad_norm": 1.7740752451956412, + "language_loss": 0.89593548, + "learning_rate": 1.1964887223964826e-08, + "loss": 0.91747808, + "num_input_tokens_seen": 346768455, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.11694336, + "step": 16070, + "time_per_iteration": 2.4307873249053955 + }, + { + "auxiliary_loss_clip": 0.01118154, + "auxiliary_loss_mlp": 0.01033074, + "balance_loss_clip": 1.04512453, + "balance_loss_mlp": 1.02093291, + "epoch": 0.9662407936269353, + "flos": 21430949736960.0, + "grad_norm": 1.8582477529749568, + "language_loss": 0.771007, + "learning_rate": 1.1922387753605878e-08, + "loss": 0.79251933, + "num_input_tokens_seen": 346786530, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.12158203, + "step": 16071, + "time_per_iteration": 2.4301257133483887 + }, + { + "auxiliary_loss_clip": 0.01115732, + "auxiliary_loss_mlp": 0.01030176, + "balance_loss_clip": 1.04428005, + "balance_loss_mlp": 1.0175755, + "epoch": 0.9663009168796032, + "flos": 14902893095040.0, + "grad_norm": 1.9164904669442393, + "language_loss": 0.65839756, + "learning_rate": 1.1879963671349137e-08, + "loss": 0.67985666, + "num_input_tokens_seen": 346804635, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.12609863, + "step": 16072, + "time_per_iteration": 2.4562573432922363 + }, + { + "auxiliary_loss_clip": 0.01121195, + "auxiliary_loss_mlp": 0.01031245, + "balance_loss_clip": 1.04868078, + "balance_loss_mlp": 1.01969934, + "epoch": 0.9663610401322712, + "flos": 24310877460480.0, + "grad_norm": 1.7891129152746008, + "language_loss": 0.77492702, + "learning_rate": 1.1837614978803534e-08, + "loss": 0.79645145, + "num_input_tokens_seen": 346823070, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11553955, + "step": 16073, + "time_per_iteration": 3.9487409591674805 + }, + { + "auxiliary_loss_clip": 0.01121174, + "auxiliary_loss_mlp": 0.01032049, + "balance_loss_clip": 1.04703641, + "balance_loss_mlp": 1.01969326, + "epoch": 0.9664211633849391, + "flos": 17637849527040.0, + "grad_norm": 2.075212009418734, + "language_loss": 0.76167119, + "learning_rate": 1.1795341677574677e-08, + "loss": 0.78320336, + "num_input_tokens_seen": 346841180, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.12359619, + "step": 16074, + "time_per_iteration": 2.4551141262054443 + }, + { + "auxiliary_loss_clip": 0.0111684, + "auxiliary_loss_mlp": 0.01026999, + "balance_loss_clip": 1.04330873, + "balance_loss_mlp": 1.01535201, + "epoch": 0.9664812866376071, + "flos": 29789409588480.0, + "grad_norm": 2.0642008390439344, + "language_loss": 0.75677198, + "learning_rate": 1.1753143769265728e-08, + "loss": 0.77821034, + "num_input_tokens_seen": 346864250, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11645508, + "step": 16075, + "time_per_iteration": 2.571664571762085 + }, + { + "auxiliary_loss_clip": 0.01113629, + "auxiliary_loss_mlp": 0.01030254, + "balance_loss_clip": 1.04108167, + "balance_loss_mlp": 1.0186193, + "epoch": 0.966541409890275, + "flos": 14282320798080.0, + "grad_norm": 1.9785651055857474, + "language_loss": 0.79086196, + "learning_rate": 1.171102125547696e-08, + "loss": 0.8123008, + "num_input_tokens_seen": 346881955, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11639404, + "step": 16076, + "time_per_iteration": 2.427400588989258 + }, + { + "auxiliary_loss_clip": 0.01112473, + "auxiliary_loss_mlp": 0.01048341, + "balance_loss_clip": 1.04021835, + "balance_loss_mlp": 1.03422642, + "epoch": 0.9666015331429431, + "flos": 19860432405120.0, + "grad_norm": 1.6495045353358548, + "language_loss": 0.71957225, + "learning_rate": 1.166897413780532e-08, + "loss": 0.74118042, + "num_input_tokens_seen": 346900445, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.14117432, + "step": 16077, + "time_per_iteration": 2.4087588787078857 + }, + { + "auxiliary_loss_clip": 0.01106098, + "auxiliary_loss_mlp": 0.01034642, + "balance_loss_clip": 1.03485274, + "balance_loss_mlp": 1.02120745, + "epoch": 0.966661656395611, + "flos": 27125951178240.0, + "grad_norm": 1.9857604879046686, + "language_loss": 0.59422982, + "learning_rate": 1.1627002417845533e-08, + "loss": 0.61563724, + "num_input_tokens_seen": 346920135, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.13446045, + "step": 16078, + "time_per_iteration": 2.4587438106536865 + }, + { + "auxiliary_loss_clip": 0.0112628, + "auxiliary_loss_mlp": 0.01032323, + "balance_loss_clip": 1.04975152, + "balance_loss_mlp": 1.01960909, + "epoch": 0.966721779648279, + "flos": 21508229848320.0, + "grad_norm": 3.2589088317023904, + "language_loss": 0.71952039, + "learning_rate": 1.158510609718899e-08, + "loss": 0.74110639, + "num_input_tokens_seen": 346940450, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.12719727, + "step": 16079, + "time_per_iteration": 2.5105574131011963 + }, + { + "auxiliary_loss_clip": 0.01110627, + "auxiliary_loss_mlp": 0.01028089, + "balance_loss_clip": 1.04166722, + "balance_loss_mlp": 1.01712811, + "epoch": 0.9667819029009469, + "flos": 23878118401920.0, + "grad_norm": 1.6504172464523268, + "language_loss": 0.72371507, + "learning_rate": 1.1543285177424644e-08, + "loss": 0.74510229, + "num_input_tokens_seen": 346960935, + "router_z_loss_clip": 0.68994141, + "router_z_loss_mlp": 0.10968018, + "step": 16080, + "time_per_iteration": 2.423133134841919 + }, + { + "auxiliary_loss_clip": 0.01114398, + "auxiliary_loss_mlp": 0.01025156, + "balance_loss_clip": 1.04349899, + "balance_loss_mlp": 1.01455879, + "epoch": 0.9668420261536149, + "flos": 21507224267520.0, + "grad_norm": 2.660712065943819, + "language_loss": 0.74101877, + "learning_rate": 1.1501539660138115e-08, + "loss": 0.76241434, + "num_input_tokens_seen": 346980100, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.1060791, + "step": 16081, + "time_per_iteration": 2.441284418106079 + }, + { + "auxiliary_loss_clip": 0.01107304, + "auxiliary_loss_mlp": 0.01026897, + "balance_loss_clip": 1.03489852, + "balance_loss_mlp": 1.01498222, + "epoch": 0.9669021494062828, + "flos": 26687266375680.0, + "grad_norm": 1.66380781542722, + "language_loss": 0.67128313, + "learning_rate": 1.145986954691236e-08, + "loss": 0.69262516, + "num_input_tokens_seen": 347001250, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.1192627, + "step": 16082, + "time_per_iteration": 2.4668004512786865 + }, + { + "auxiliary_loss_clip": 0.01107103, + "auxiliary_loss_mlp": 0.01027683, + "balance_loss_clip": 1.0376308, + "balance_loss_mlp": 1.01635778, + "epoch": 0.9669622726589508, + "flos": 29825032901760.0, + "grad_norm": 1.5789940486302416, + "language_loss": 0.76952356, + "learning_rate": 1.141827483932789e-08, + "loss": 0.79087144, + "num_input_tokens_seen": 347022975, + "router_z_loss_clip": 0.69482422, + "router_z_loss_mlp": 0.11322021, + "step": 16083, + "time_per_iteration": 2.5318140983581543 + }, + { + "auxiliary_loss_clip": 0.01109843, + "auxiliary_loss_mlp": 0.01032263, + "balance_loss_clip": 1.03681922, + "balance_loss_mlp": 1.01954913, + "epoch": 0.9670223959116189, + "flos": 22922499018240.0, + "grad_norm": 1.962670845291268, + "language_loss": 0.79686755, + "learning_rate": 1.1376755538961669e-08, + "loss": 0.81828856, + "num_input_tokens_seen": 347038780, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.12719727, + "step": 16084, + "time_per_iteration": 2.4380197525024414 + }, + { + "auxiliary_loss_clip": 0.01111788, + "auxiliary_loss_mlp": 0.01031822, + "balance_loss_clip": 1.03782368, + "balance_loss_mlp": 1.01835144, + "epoch": 0.9670825191642868, + "flos": 18624495283200.0, + "grad_norm": 2.3461329419159718, + "language_loss": 0.68178779, + "learning_rate": 1.1335311647387991e-08, + "loss": 0.70322388, + "num_input_tokens_seen": 347056705, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.13494873, + "step": 16085, + "time_per_iteration": 2.3910958766937256 + }, + { + "auxiliary_loss_clip": 0.01115122, + "auxiliary_loss_mlp": 0.01029191, + "balance_loss_clip": 1.03935397, + "balance_loss_mlp": 1.0164057, + "epoch": 0.9671426424169548, + "flos": 24497936513280.0, + "grad_norm": 2.3269643250687553, + "language_loss": 0.69506991, + "learning_rate": 1.1293943166178709e-08, + "loss": 0.71651304, + "num_input_tokens_seen": 347075710, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.12780762, + "step": 16086, + "time_per_iteration": 2.461693048477173 + }, + { + "auxiliary_loss_clip": 0.01113611, + "auxiliary_loss_mlp": 0.01037277, + "balance_loss_clip": 1.04208779, + "balance_loss_mlp": 1.02369308, + "epoch": 0.9672027656696227, + "flos": 20371189847040.0, + "grad_norm": 1.5883254841666552, + "language_loss": 0.78614759, + "learning_rate": 1.125265009690235e-08, + "loss": 0.80765647, + "num_input_tokens_seen": 347092325, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.13592529, + "step": 16087, + "time_per_iteration": 2.4084324836730957 + }, + { + "auxiliary_loss_clip": 0.01110128, + "auxiliary_loss_mlp": 0.01023562, + "balance_loss_clip": 1.04025006, + "balance_loss_mlp": 1.01301801, + "epoch": 0.9672628889222907, + "flos": 18880179269760.0, + "grad_norm": 1.8785250658629482, + "language_loss": 0.71155882, + "learning_rate": 1.1211432441124769e-08, + "loss": 0.73289573, + "num_input_tokens_seen": 347110595, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.10552979, + "step": 16088, + "time_per_iteration": 2.456418514251709 + }, + { + "auxiliary_loss_clip": 0.01115191, + "auxiliary_loss_mlp": 0.01030723, + "balance_loss_clip": 1.04616022, + "balance_loss_mlp": 1.01964223, + "epoch": 0.9673230121749586, + "flos": 28695247447680.0, + "grad_norm": 1.4606523210168276, + "language_loss": 0.70769811, + "learning_rate": 1.117029020040916e-08, + "loss": 0.72915733, + "num_input_tokens_seen": 347131625, + "router_z_loss_clip": 0.68994141, + "router_z_loss_mlp": 0.11077881, + "step": 16089, + "time_per_iteration": 2.535642385482788 + }, + { + "auxiliary_loss_clip": 0.01112653, + "auxiliary_loss_mlp": 0.01038149, + "balance_loss_clip": 1.03828359, + "balance_loss_mlp": 1.02450001, + "epoch": 0.9673831354276267, + "flos": 20484452407680.0, + "grad_norm": 2.1746899707705767, + "language_loss": 0.74649322, + "learning_rate": 1.1129223376315167e-08, + "loss": 0.7680012, + "num_input_tokens_seen": 347147910, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.13653564, + "step": 16090, + "time_per_iteration": 2.412174940109253 + }, + { + "auxiliary_loss_clip": 0.01120505, + "auxiliary_loss_mlp": 0.01029162, + "balance_loss_clip": 1.04285645, + "balance_loss_mlp": 1.01699662, + "epoch": 0.9674432586802946, + "flos": 26797548107520.0, + "grad_norm": 1.7908002608707512, + "language_loss": 0.68712461, + "learning_rate": 1.1088231970400653e-08, + "loss": 0.70862132, + "num_input_tokens_seen": 347168805, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.1217041, + "step": 16091, + "time_per_iteration": 2.4876227378845215 + }, + { + "auxiliary_loss_clip": 0.01111725, + "auxiliary_loss_mlp": 0.01031733, + "balance_loss_clip": 1.04022419, + "balance_loss_mlp": 1.01984763, + "epoch": 0.9675033819329626, + "flos": 22310941034880.0, + "grad_norm": 1.6992222113982838, + "language_loss": 0.77050376, + "learning_rate": 1.1047315984219484e-08, + "loss": 0.79193836, + "num_input_tokens_seen": 347189455, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11883545, + "step": 16092, + "time_per_iteration": 2.4823460578918457 + }, + { + "auxiliary_loss_clip": 0.01116067, + "auxiliary_loss_mlp": 0.010269, + "balance_loss_clip": 1.0454483, + "balance_loss_mlp": 1.01562893, + "epoch": 0.9675635051856305, + "flos": 12675713276160.0, + "grad_norm": 1.8177780257517164, + "language_loss": 0.76717877, + "learning_rate": 1.1006475419323313e-08, + "loss": 0.78860843, + "num_input_tokens_seen": 347206030, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.1126709, + "step": 16093, + "time_per_iteration": 2.439051628112793 + }, + { + "auxiliary_loss_clip": 0.0112074, + "auxiliary_loss_mlp": 0.01025615, + "balance_loss_clip": 1.04915476, + "balance_loss_mlp": 1.01333094, + "epoch": 0.9676236284382985, + "flos": 24608469640320.0, + "grad_norm": 1.6185338141227135, + "language_loss": 0.69440901, + "learning_rate": 1.096571027726112e-08, + "loss": 0.71587253, + "num_input_tokens_seen": 347226250, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.1229248, + "step": 16094, + "time_per_iteration": 2.4643008708953857 + }, + { + "auxiliary_loss_clip": 0.0111767, + "auxiliary_loss_mlp": 0.01027779, + "balance_loss_clip": 1.04133081, + "balance_loss_mlp": 1.01629877, + "epoch": 0.9676837516909664, + "flos": 23367145478400.0, + "grad_norm": 1.4850692313179052, + "language_loss": 0.75795555, + "learning_rate": 1.0925020559578557e-08, + "loss": 0.77941, + "num_input_tokens_seen": 347247350, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.11486816, + "step": 16095, + "time_per_iteration": 2.444676160812378 + }, + { + "auxiliary_loss_clip": 0.01116597, + "auxiliary_loss_mlp": 0.01033149, + "balance_loss_clip": 1.04188454, + "balance_loss_mlp": 1.0212822, + "epoch": 0.9677438749436345, + "flos": 20486894532480.0, + "grad_norm": 1.7116473765449831, + "language_loss": 0.70451331, + "learning_rate": 1.0884406267818392e-08, + "loss": 0.72601068, + "num_input_tokens_seen": 347266870, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11871338, + "step": 16096, + "time_per_iteration": 3.9291958808898926 + }, + { + "auxiliary_loss_clip": 0.01109523, + "auxiliary_loss_mlp": 0.01026979, + "balance_loss_clip": 1.03756094, + "balance_loss_mlp": 1.01499844, + "epoch": 0.9678039981963025, + "flos": 47555889719040.0, + "grad_norm": 2.5932584772679204, + "language_loss": 0.71973157, + "learning_rate": 1.0843867403520946e-08, + "loss": 0.74109656, + "num_input_tokens_seen": 347290120, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11987305, + "step": 16097, + "time_per_iteration": 2.7206764221191406 + }, + { + "auxiliary_loss_clip": 0.01113635, + "auxiliary_loss_mlp": 0.010356, + "balance_loss_clip": 1.04141879, + "balance_loss_mlp": 1.02399468, + "epoch": 0.9678641214489704, + "flos": 25040474513280.0, + "grad_norm": 1.611444249086221, + "language_loss": 0.78399193, + "learning_rate": 1.0803403968223434e-08, + "loss": 0.80548429, + "num_input_tokens_seen": 347308785, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.1161499, + "step": 16098, + "time_per_iteration": 2.447512626647949 + }, + { + "auxiliary_loss_clip": 0.01120929, + "auxiliary_loss_mlp": 0.01026636, + "balance_loss_clip": 1.05099249, + "balance_loss_mlp": 1.01571071, + "epoch": 0.9679242447016384, + "flos": 19240937516160.0, + "grad_norm": 2.555863485428665, + "language_loss": 0.91008151, + "learning_rate": 1.0763015963459965e-08, + "loss": 0.93155706, + "num_input_tokens_seen": 347326375, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.10919189, + "step": 16099, + "time_per_iteration": 2.4382646083831787 + }, + { + "auxiliary_loss_clip": 0.01117237, + "auxiliary_loss_mlp": 0.01031475, + "balance_loss_clip": 1.04379058, + "balance_loss_mlp": 1.01932752, + "epoch": 0.9679843679543063, + "flos": 33254681345280.0, + "grad_norm": 2.1583142210151336, + "language_loss": 0.66106141, + "learning_rate": 1.0722703390762643e-08, + "loss": 0.68254852, + "num_input_tokens_seen": 347348250, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.12145996, + "step": 16100, + "time_per_iteration": 2.5862655639648438 + }, + { + "auxiliary_loss_clip": 0.01116112, + "auxiliary_loss_mlp": 0.01029628, + "balance_loss_clip": 1.04361546, + "balance_loss_mlp": 1.01781988, + "epoch": 0.9680444912069743, + "flos": 22783633038720.0, + "grad_norm": 1.5914451444123865, + "language_loss": 0.73674619, + "learning_rate": 1.0682466251659584e-08, + "loss": 0.75820357, + "num_input_tokens_seen": 347367400, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11804199, + "step": 16101, + "time_per_iteration": 2.4728856086730957 + }, + { + "auxiliary_loss_clip": 0.01114618, + "auxiliary_loss_mlp": 0.01027761, + "balance_loss_clip": 1.04328537, + "balance_loss_mlp": 1.01609051, + "epoch": 0.9681046144596422, + "flos": 24024095274240.0, + "grad_norm": 2.032948913588853, + "language_loss": 0.73692822, + "learning_rate": 1.0642304547676672e-08, + "loss": 0.75835204, + "num_input_tokens_seen": 347387600, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.11663818, + "step": 16102, + "time_per_iteration": 2.4519927501678467 + }, + { + "auxiliary_loss_clip": 0.01117524, + "auxiliary_loss_mlp": 0.01036301, + "balance_loss_clip": 1.04517543, + "balance_loss_mlp": 1.02195978, + "epoch": 0.9681647377123103, + "flos": 23441013797760.0, + "grad_norm": 1.7773861675321838, + "language_loss": 0.77356482, + "learning_rate": 1.0602218280337139e-08, + "loss": 0.79510307, + "num_input_tokens_seen": 347406915, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.14331055, + "step": 16103, + "time_per_iteration": 2.4922847747802734 + }, + { + "auxiliary_loss_clip": 0.01108929, + "auxiliary_loss_mlp": 0.0102673, + "balance_loss_clip": 1.03722143, + "balance_loss_mlp": 1.01542258, + "epoch": 0.9682248609649782, + "flos": 22675075159680.0, + "grad_norm": 1.848999662834719, + "language_loss": 0.80317008, + "learning_rate": 1.0562207451160655e-08, + "loss": 0.82452673, + "num_input_tokens_seen": 347425140, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11309814, + "step": 16104, + "time_per_iteration": 2.4389290809631348 + }, + { + "auxiliary_loss_clip": 0.0111819, + "auxiliary_loss_mlp": 0.01030757, + "balance_loss_clip": 1.04630911, + "balance_loss_mlp": 1.02027845, + "epoch": 0.9682849842176462, + "flos": 24428413739520.0, + "grad_norm": 1.4482970806883109, + "language_loss": 0.7800498, + "learning_rate": 1.0522272061664672e-08, + "loss": 0.80153924, + "num_input_tokens_seen": 347446350, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.10479736, + "step": 16105, + "time_per_iteration": 2.468576192855835 + }, + { + "auxiliary_loss_clip": 0.01055253, + "auxiliary_loss_mlp": 0.01008321, + "balance_loss_clip": 1.03155136, + "balance_loss_mlp": 1.00700378, + "epoch": 0.9683451074703141, + "flos": 59995132784640.0, + "grad_norm": 0.824407064040837, + "language_loss": 0.56751454, + "learning_rate": 1.0482412113363536e-08, + "loss": 0.58815026, + "num_input_tokens_seen": 347510135, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.01318359, + "step": 16106, + "time_per_iteration": 4.506986141204834 + }, + { + "auxiliary_loss_clip": 0.01035111, + "auxiliary_loss_mlp": 0.01006144, + "balance_loss_clip": 1.01024902, + "balance_loss_mlp": 1.00483716, + "epoch": 0.9684052307229821, + "flos": 52696145514240.0, + "grad_norm": 0.8726540766440463, + "language_loss": 0.61584091, + "learning_rate": 1.0442627607768707e-08, + "loss": 0.63625348, + "num_input_tokens_seen": 347562505, + "router_z_loss_clip": 0.24829102, + "router_z_loss_mlp": 0.01307678, + "step": 16107, + "time_per_iteration": 2.9309020042419434 + }, + { + "auxiliary_loss_clip": 0.01118097, + "auxiliary_loss_mlp": 0.0103517, + "balance_loss_clip": 1.04640675, + "balance_loss_mlp": 1.02231336, + "epoch": 0.96846535397565, + "flos": 22783848520320.0, + "grad_norm": 2.2672013999839384, + "language_loss": 0.73921371, + "learning_rate": 1.040291854638875e-08, + "loss": 0.76074636, + "num_input_tokens_seen": 347579150, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.12866211, + "step": 16108, + "time_per_iteration": 2.445812225341797 + }, + { + "auxiliary_loss_clip": 0.01124064, + "auxiliary_loss_mlp": 0.01027242, + "balance_loss_clip": 1.04807079, + "balance_loss_mlp": 1.01520813, + "epoch": 0.968525477228318, + "flos": 23323980309120.0, + "grad_norm": 3.9933186448145728, + "language_loss": 0.56925738, + "learning_rate": 1.0363284930729576e-08, + "loss": 0.59077048, + "num_input_tokens_seen": 347596705, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12036133, + "step": 16109, + "time_per_iteration": 3.8668270111083984 + }, + { + "auxiliary_loss_clip": 0.01038646, + "auxiliary_loss_mlp": 0.01002421, + "balance_loss_clip": 1.0139904, + "balance_loss_mlp": 1.00099504, + "epoch": 0.9685856004809861, + "flos": 67882947707520.0, + "grad_norm": 0.6670016142688227, + "language_loss": 0.54225421, + "learning_rate": 1.0323726762294205e-08, + "loss": 0.56266487, + "num_input_tokens_seen": 347661870, + "router_z_loss_clip": 0.24658203, + "router_z_loss_mlp": 0.01423645, + "step": 16110, + "time_per_iteration": 3.0489325523376465 + }, + { + "auxiliary_loss_clip": 0.01116433, + "auxiliary_loss_mlp": 0.01034011, + "balance_loss_clip": 1.04225397, + "balance_loss_mlp": 1.02104104, + "epoch": 0.968645723733654, + "flos": 33947900899200.0, + "grad_norm": 1.3578211338825017, + "language_loss": 0.62639785, + "learning_rate": 1.0284244042582325e-08, + "loss": 0.64790231, + "num_input_tokens_seen": 347684295, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.1295166, + "step": 16111, + "time_per_iteration": 2.5746994018554688 + }, + { + "auxiliary_loss_clip": 0.01114082, + "auxiliary_loss_mlp": 0.01025773, + "balance_loss_clip": 1.04324698, + "balance_loss_mlp": 1.01526487, + "epoch": 0.968705846986322, + "flos": 18551488890240.0, + "grad_norm": 1.9427321302448175, + "language_loss": 0.74924433, + "learning_rate": 1.024483677309118e-08, + "loss": 0.77064282, + "num_input_tokens_seen": 347702585, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.10516357, + "step": 16112, + "time_per_iteration": 2.407452344894409 + }, + { + "auxiliary_loss_clip": 0.0110917, + "auxiliary_loss_mlp": 0.01025242, + "balance_loss_clip": 1.04052436, + "balance_loss_mlp": 1.0146203, + "epoch": 0.9687659702389899, + "flos": 17420913336960.0, + "grad_norm": 2.9719824926395306, + "language_loss": 0.66876113, + "learning_rate": 1.020550495531558e-08, + "loss": 0.69010532, + "num_input_tokens_seen": 347721810, + "router_z_loss_clip": 0.68505859, + "router_z_loss_mlp": 0.10614014, + "step": 16113, + "time_per_iteration": 2.465233087539673 + }, + { + "auxiliary_loss_clip": 0.01039866, + "auxiliary_loss_mlp": 0.01001296, + "balance_loss_clip": 1.01538908, + "balance_loss_mlp": 1.0000608, + "epoch": 0.9688260934916579, + "flos": 62047176865920.0, + "grad_norm": 0.692146349522649, + "language_loss": 0.56481397, + "learning_rate": 1.0166248590746329e-08, + "loss": 0.5852257, + "num_input_tokens_seen": 347782330, + "router_z_loss_clip": 0.24487305, + "router_z_loss_mlp": 0.01234436, + "step": 16114, + "time_per_iteration": 3.0872879028320312 + }, + { + "auxiliary_loss_clip": 0.01122195, + "auxiliary_loss_mlp": 0.01039616, + "balance_loss_clip": 1.04635143, + "balance_loss_mlp": 1.02705121, + "epoch": 0.9688862167443258, + "flos": 15076520461440.0, + "grad_norm": 1.9631797786511869, + "language_loss": 0.82908499, + "learning_rate": 1.0127067680872458e-08, + "loss": 0.85070312, + "num_input_tokens_seen": 347794835, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12561035, + "step": 16115, + "time_per_iteration": 2.526183605194092 + }, + { + "auxiliary_loss_clip": 0.01113143, + "auxiliary_loss_mlp": 0.01029408, + "balance_loss_clip": 1.04239535, + "balance_loss_mlp": 1.01819003, + "epoch": 0.9689463399969939, + "flos": 19938215306880.0, + "grad_norm": 1.5949377078916305, + "language_loss": 0.7204833, + "learning_rate": 1.0087962227179448e-08, + "loss": 0.74190879, + "num_input_tokens_seen": 347814320, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.11212158, + "step": 16116, + "time_per_iteration": 2.4664039611816406 + }, + { + "auxiliary_loss_clip": 0.01116398, + "auxiliary_loss_mlp": 0.01028045, + "balance_loss_clip": 1.04379678, + "balance_loss_mlp": 1.01607633, + "epoch": 0.9690064632496618, + "flos": 19573039687680.0, + "grad_norm": 2.083514583421382, + "language_loss": 0.7576372, + "learning_rate": 1.0048932231150553e-08, + "loss": 0.77908164, + "num_input_tokens_seen": 347832125, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11975098, + "step": 16117, + "time_per_iteration": 3.8077051639556885 + }, + { + "auxiliary_loss_clip": 0.01119163, + "auxiliary_loss_mlp": 0.01033153, + "balance_loss_clip": 1.04148197, + "balance_loss_mlp": 1.02055812, + "epoch": 0.9690665865023298, + "flos": 21872292145920.0, + "grad_norm": 2.61982324649844, + "language_loss": 0.77450418, + "learning_rate": 1.000997769426548e-08, + "loss": 0.7960273, + "num_input_tokens_seen": 347850765, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.12585449, + "step": 16118, + "time_per_iteration": 2.439896821975708 + }, + { + "auxiliary_loss_clip": 0.01115611, + "auxiliary_loss_mlp": 0.01028907, + "balance_loss_clip": 1.04364848, + "balance_loss_mlp": 1.01750445, + "epoch": 0.9691267097549977, + "flos": 20994491577600.0, + "grad_norm": 1.9478710146912874, + "language_loss": 0.7784766, + "learning_rate": 9.971098618001272e-09, + "loss": 0.79992175, + "num_input_tokens_seen": 347870125, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11401367, + "step": 16119, + "time_per_iteration": 2.44711971282959 + }, + { + "auxiliary_loss_clip": 0.01121242, + "auxiliary_loss_mlp": 0.01026739, + "balance_loss_clip": 1.04855394, + "balance_loss_mlp": 1.01626611, + "epoch": 0.9691868330076657, + "flos": 24279132816000.0, + "grad_norm": 1.5300287685748943, + "language_loss": 0.75739622, + "learning_rate": 9.932295003832747e-09, + "loss": 0.77887595, + "num_input_tokens_seen": 347890615, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.10473633, + "step": 16120, + "time_per_iteration": 2.4469640254974365 + }, + { + "auxiliary_loss_clip": 0.01118511, + "auxiliary_loss_mlp": 0.01024335, + "balance_loss_clip": 1.04700232, + "balance_loss_mlp": 1.01366019, + "epoch": 0.9692469562603336, + "flos": 17675699483520.0, + "grad_norm": 1.837340653482996, + "language_loss": 0.69750309, + "learning_rate": 9.89356685323095e-09, + "loss": 0.71893156, + "num_input_tokens_seen": 347908685, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.10675049, + "step": 16121, + "time_per_iteration": 2.397791862487793 + }, + { + "auxiliary_loss_clip": 0.01114533, + "auxiliary_loss_mlp": 0.01028128, + "balance_loss_clip": 1.04477143, + "balance_loss_mlp": 1.01697588, + "epoch": 0.9693070795130017, + "flos": 26834392483200.0, + "grad_norm": 1.970204018635071, + "language_loss": 0.68936008, + "learning_rate": 9.854914167664486e-09, + "loss": 0.7107867, + "num_input_tokens_seen": 347926385, + "router_z_loss_clip": 0.69775391, + "router_z_loss_mlp": 0.1116333, + "step": 16122, + "time_per_iteration": 2.484997510910034 + }, + { + "auxiliary_loss_clip": 0.01109728, + "auxiliary_loss_mlp": 0.01025542, + "balance_loss_clip": 1.03795552, + "balance_loss_mlp": 1.01453948, + "epoch": 0.9693672027656697, + "flos": 18077288515200.0, + "grad_norm": 2.4583816762623187, + "language_loss": 0.7599594, + "learning_rate": 9.81633694859907e-09, + "loss": 0.78131211, + "num_input_tokens_seen": 347945290, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11004639, + "step": 16123, + "time_per_iteration": 2.5264363288879395 + }, + { + "auxiliary_loss_clip": 0.01116882, + "auxiliary_loss_mlp": 0.01028651, + "balance_loss_clip": 1.04445922, + "balance_loss_mlp": 1.01645613, + "epoch": 0.9694273260183376, + "flos": 21763015994880.0, + "grad_norm": 1.7665859709821883, + "language_loss": 0.74779034, + "learning_rate": 9.777835197497753e-09, + "loss": 0.76924562, + "num_input_tokens_seen": 347966330, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.12194824, + "step": 16124, + "time_per_iteration": 2.4754555225372314 + }, + { + "auxiliary_loss_clip": 0.01117696, + "auxiliary_loss_mlp": 0.01036473, + "balance_loss_clip": 1.04260874, + "balance_loss_mlp": 1.02437353, + "epoch": 0.9694874492710056, + "flos": 24426115269120.0, + "grad_norm": 2.0604776612519884, + "language_loss": 0.74519795, + "learning_rate": 9.739408915820258e-09, + "loss": 0.76673961, + "num_input_tokens_seen": 347982590, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12097168, + "step": 16125, + "time_per_iteration": 2.4396848678588867 + }, + { + "auxiliary_loss_clip": 0.01031702, + "auxiliary_loss_mlp": 0.01001623, + "balance_loss_clip": 1.00771904, + "balance_loss_mlp": 1.00035942, + "epoch": 0.9695475725236735, + "flos": 67650748237440.0, + "grad_norm": 0.8813325482995156, + "language_loss": 0.61486268, + "learning_rate": 9.70105810502364e-09, + "loss": 0.63519591, + "num_input_tokens_seen": 348043310, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.01263428, + "step": 16126, + "time_per_iteration": 3.0256404876708984 + }, + { + "auxiliary_loss_clip": 0.01107028, + "auxiliary_loss_mlp": 0.01032222, + "balance_loss_clip": 1.03865528, + "balance_loss_mlp": 1.02105832, + "epoch": 0.9696076957763415, + "flos": 19129326981120.0, + "grad_norm": 1.7883600444190444, + "language_loss": 0.74655843, + "learning_rate": 9.662782766562738e-09, + "loss": 0.76795095, + "num_input_tokens_seen": 348062200, + "router_z_loss_clip": 0.68457031, + "router_z_loss_mlp": 0.11157227, + "step": 16127, + "time_per_iteration": 2.4033820629119873 + }, + { + "auxiliary_loss_clip": 0.01110074, + "auxiliary_loss_mlp": 0.0103378, + "balance_loss_clip": 1.03535712, + "balance_loss_mlp": 1.02116764, + "epoch": 0.9696678190290094, + "flos": 15486836497920.0, + "grad_norm": 1.6252388917137695, + "language_loss": 0.69696426, + "learning_rate": 9.62458290188839e-09, + "loss": 0.71840274, + "num_input_tokens_seen": 348080685, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.12609863, + "step": 16128, + "time_per_iteration": 2.422461986541748 + }, + { + "auxiliary_loss_clip": 0.01113267, + "auxiliary_loss_mlp": 0.01031222, + "balance_loss_clip": 1.04093552, + "balance_loss_mlp": 1.01888382, + "epoch": 0.9697279422816775, + "flos": 36208692869760.0, + "grad_norm": 2.0222999006979414, + "language_loss": 0.65135407, + "learning_rate": 9.586458512449213e-09, + "loss": 0.67279887, + "num_input_tokens_seen": 348102500, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.12335205, + "step": 16129, + "time_per_iteration": 2.58558988571167 + }, + { + "auxiliary_loss_clip": 0.01116259, + "auxiliary_loss_mlp": 0.01031925, + "balance_loss_clip": 1.0413357, + "balance_loss_mlp": 1.01950359, + "epoch": 0.9697880655343454, + "flos": 25484007651840.0, + "grad_norm": 2.588180661731214, + "language_loss": 0.6299026, + "learning_rate": 9.548409599691166e-09, + "loss": 0.65138447, + "num_input_tokens_seen": 348122515, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12426758, + "step": 16130, + "time_per_iteration": 2.4670474529266357 + }, + { + "auxiliary_loss_clip": 0.01118236, + "auxiliary_loss_mlp": 0.01026524, + "balance_loss_clip": 1.04358721, + "balance_loss_mlp": 1.01466274, + "epoch": 0.9698481887870134, + "flos": 15333533251200.0, + "grad_norm": 3.007740017694756, + "language_loss": 0.70096648, + "learning_rate": 9.510436165056867e-09, + "loss": 0.72241408, + "num_input_tokens_seen": 348138775, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11865234, + "step": 16131, + "time_per_iteration": 2.403815507888794 + }, + { + "auxiliary_loss_clip": 0.01119743, + "auxiliary_loss_mlp": 0.01030974, + "balance_loss_clip": 1.04671824, + "balance_loss_mlp": 1.01893425, + "epoch": 0.9699083120396813, + "flos": 21982250655360.0, + "grad_norm": 1.703451409194588, + "language_loss": 0.76265687, + "learning_rate": 9.472538209986058e-09, + "loss": 0.78416407, + "num_input_tokens_seen": 348157115, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.1204834, + "step": 16132, + "time_per_iteration": 2.458139419555664 + }, + { + "auxiliary_loss_clip": 0.01119827, + "auxiliary_loss_mlp": 0.0103168, + "balance_loss_clip": 1.04641974, + "balance_loss_mlp": 1.01935995, + "epoch": 0.9699684352923493, + "flos": 15664055224320.0, + "grad_norm": 3.300671238622323, + "language_loss": 0.78902209, + "learning_rate": 9.434715735916477e-09, + "loss": 0.81053716, + "num_input_tokens_seen": 348173035, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.12316895, + "step": 16133, + "time_per_iteration": 2.4238388538360596 + }, + { + "auxiliary_loss_clip": 0.01106769, + "auxiliary_loss_mlp": 0.01026625, + "balance_loss_clip": 1.03831494, + "balance_loss_mlp": 1.01571774, + "epoch": 0.9700285585450172, + "flos": 21908382336000.0, + "grad_norm": 1.840526723190282, + "language_loss": 0.64744294, + "learning_rate": 9.396968744281863e-09, + "loss": 0.66877693, + "num_input_tokens_seen": 348192960, + "router_z_loss_clip": 0.68408203, + "router_z_loss_mlp": 0.10913086, + "step": 16134, + "time_per_iteration": 2.492077589035034 + }, + { + "auxiliary_loss_clip": 0.01109608, + "auxiliary_loss_mlp": 0.01029473, + "balance_loss_clip": 1.03849947, + "balance_loss_mlp": 1.01755202, + "epoch": 0.9700886817976853, + "flos": 23914890950400.0, + "grad_norm": 2.202922290938917, + "language_loss": 0.81060648, + "learning_rate": 9.359297236513519e-09, + "loss": 0.83199728, + "num_input_tokens_seen": 348212805, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.1192627, + "step": 16135, + "time_per_iteration": 2.5475692749023438 + }, + { + "auxiliary_loss_clip": 0.01117687, + "auxiliary_loss_mlp": 0.01032013, + "balance_loss_clip": 1.04224026, + "balance_loss_mlp": 1.01747525, + "epoch": 0.9701488050503532, + "flos": 25447845634560.0, + "grad_norm": 2.0187429577042093, + "language_loss": 0.72940421, + "learning_rate": 9.321701214040079e-09, + "loss": 0.75090116, + "num_input_tokens_seen": 348232900, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.14526367, + "step": 16136, + "time_per_iteration": 2.469336748123169 + }, + { + "auxiliary_loss_clip": 0.01108674, + "auxiliary_loss_mlp": 0.01030284, + "balance_loss_clip": 1.03908026, + "balance_loss_mlp": 1.01862514, + "epoch": 0.9702089283030212, + "flos": 20590855470720.0, + "grad_norm": 1.6005635082390333, + "language_loss": 0.76304752, + "learning_rate": 9.28418067828729e-09, + "loss": 0.78443712, + "num_input_tokens_seen": 348253065, + "router_z_loss_clip": 0.69580078, + "router_z_loss_mlp": 0.11651611, + "step": 16137, + "time_per_iteration": 2.4803965091705322 + }, + { + "auxiliary_loss_clip": 0.01035525, + "auxiliary_loss_mlp": 0.01000693, + "balance_loss_clip": 1.01083899, + "balance_loss_mlp": 0.9991107, + "epoch": 0.9702690515556892, + "flos": 70651516291200.0, + "grad_norm": 0.7653267391557943, + "language_loss": 0.54983819, + "learning_rate": 9.246735630678015e-09, + "loss": 0.57020032, + "num_input_tokens_seen": 348316075, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 0.01582336, + "step": 16138, + "time_per_iteration": 3.173799753189087 + }, + { + "auxiliary_loss_clip": 0.01118361, + "auxiliary_loss_mlp": 0.01041073, + "balance_loss_clip": 1.04096055, + "balance_loss_mlp": 1.02822208, + "epoch": 0.9703291748083571, + "flos": 35881439034240.0, + "grad_norm": 1.9628211280867476, + "language_loss": 0.70929182, + "learning_rate": 9.209366072632007e-09, + "loss": 0.73088616, + "num_input_tokens_seen": 348337605, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12841797, + "step": 16139, + "time_per_iteration": 2.5322110652923584 + }, + { + "auxiliary_loss_clip": 0.01125225, + "auxiliary_loss_mlp": 0.01032954, + "balance_loss_clip": 1.04962194, + "balance_loss_mlp": 1.02077067, + "epoch": 0.9703892980610251, + "flos": 24316479982080.0, + "grad_norm": 2.518958687437283, + "language_loss": 0.72883904, + "learning_rate": 9.172072005566134e-09, + "loss": 0.75042087, + "num_input_tokens_seen": 348359430, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.12182617, + "step": 16140, + "time_per_iteration": 3.913207769393921 + }, + { + "auxiliary_loss_clip": 0.01123002, + "auxiliary_loss_mlp": 0.0104096, + "balance_loss_clip": 1.04339886, + "balance_loss_mlp": 1.02805614, + "epoch": 0.970449421313693, + "flos": 18003743418240.0, + "grad_norm": 2.7178940222538377, + "language_loss": 0.68428403, + "learning_rate": 9.13485343089504e-09, + "loss": 0.70592368, + "num_input_tokens_seen": 348377890, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.12908936, + "step": 16141, + "time_per_iteration": 2.388465166091919 + }, + { + "auxiliary_loss_clip": 0.0111249, + "auxiliary_loss_mlp": 0.01029335, + "balance_loss_clip": 1.04308724, + "balance_loss_mlp": 1.01844525, + "epoch": 0.9705095445663611, + "flos": 25337994865920.0, + "grad_norm": 1.9887954824368892, + "language_loss": 0.68814445, + "learning_rate": 9.097710350029597e-09, + "loss": 0.70956266, + "num_input_tokens_seen": 348396550, + "router_z_loss_clip": 0.69433594, + "router_z_loss_mlp": 0.10882568, + "step": 16142, + "time_per_iteration": 2.474773645401001 + }, + { + "auxiliary_loss_clip": 0.01119322, + "auxiliary_loss_mlp": 0.01026965, + "balance_loss_clip": 1.04736102, + "balance_loss_mlp": 1.01551497, + "epoch": 0.970569667819029, + "flos": 26833602384000.0, + "grad_norm": 1.7923186768769488, + "language_loss": 0.55260026, + "learning_rate": 9.060642764378457e-09, + "loss": 0.57406312, + "num_input_tokens_seen": 348417120, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11444092, + "step": 16143, + "time_per_iteration": 2.5094306468963623 + }, + { + "auxiliary_loss_clip": 0.01122345, + "auxiliary_loss_mlp": 0.01027394, + "balance_loss_clip": 1.04897678, + "balance_loss_mlp": 1.01601589, + "epoch": 0.970629791071697, + "flos": 25848644567040.0, + "grad_norm": 2.2494052183410083, + "language_loss": 0.67921722, + "learning_rate": 9.023650675347382e-09, + "loss": 0.70071459, + "num_input_tokens_seen": 348437750, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.1138916, + "step": 16144, + "time_per_iteration": 2.513068914413452 + }, + { + "auxiliary_loss_clip": 0.01115804, + "auxiliary_loss_mlp": 0.01040786, + "balance_loss_clip": 1.04528165, + "balance_loss_mlp": 1.03026557, + "epoch": 0.9706899143243649, + "flos": 36540184510080.0, + "grad_norm": 2.187021982858476, + "language_loss": 0.72720242, + "learning_rate": 8.986734084339253e-09, + "loss": 0.74876833, + "num_input_tokens_seen": 348460935, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.10522461, + "step": 16145, + "time_per_iteration": 2.5738556385040283 + }, + { + "auxiliary_loss_clip": 0.01114206, + "auxiliary_loss_mlp": 0.01029475, + "balance_loss_clip": 1.03954196, + "balance_loss_mlp": 1.01680279, + "epoch": 0.9707500375770329, + "flos": 12268234414080.0, + "grad_norm": 4.504573027552557, + "language_loss": 0.80165708, + "learning_rate": 8.949892992753395e-09, + "loss": 0.82309389, + "num_input_tokens_seen": 348474480, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.12664795, + "step": 16146, + "time_per_iteration": 2.461127519607544 + }, + { + "auxiliary_loss_clip": 0.01065161, + "auxiliary_loss_mlp": 0.01004258, + "balance_loss_clip": 1.04082108, + "balance_loss_mlp": 1.00283515, + "epoch": 0.9708101608297008, + "flos": 60853040196480.0, + "grad_norm": 2.8197116838858403, + "language_loss": 0.54552042, + "learning_rate": 8.91312740198713e-09, + "loss": 0.56621456, + "num_input_tokens_seen": 348541220, + "router_z_loss_clip": 0.24365234, + "router_z_loss_mlp": 0.01420593, + "step": 16147, + "time_per_iteration": 3.090127944946289 + }, + { + "auxiliary_loss_clip": 0.01119189, + "auxiliary_loss_mlp": 0.01031961, + "balance_loss_clip": 1.04256868, + "balance_loss_mlp": 1.01920557, + "epoch": 0.9708702840823689, + "flos": 27124766029440.0, + "grad_norm": 3.649774675591276, + "language_loss": 0.60985684, + "learning_rate": 8.876437313434682e-09, + "loss": 0.6313684, + "num_input_tokens_seen": 348559230, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.12768555, + "step": 16148, + "time_per_iteration": 2.4596102237701416 + }, + { + "auxiliary_loss_clip": 0.01117622, + "auxiliary_loss_mlp": 0.010385, + "balance_loss_clip": 1.04339743, + "balance_loss_mlp": 1.0273, + "epoch": 0.9709304073350368, + "flos": 20777699041920.0, + "grad_norm": 1.9678278694949045, + "language_loss": 0.74247456, + "learning_rate": 8.839822728487155e-09, + "loss": 0.7640357, + "num_input_tokens_seen": 348577850, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11199951, + "step": 16149, + "time_per_iteration": 3.8309781551361084 + }, + { + "auxiliary_loss_clip": 0.01109234, + "auxiliary_loss_mlp": 0.01035865, + "balance_loss_clip": 1.03596735, + "balance_loss_mlp": 1.02403927, + "epoch": 0.9709905305877048, + "flos": 41934541115520.0, + "grad_norm": 2.254050939644913, + "language_loss": 0.75120306, + "learning_rate": 8.803283648533222e-09, + "loss": 0.77265406, + "num_input_tokens_seen": 348598345, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11834717, + "step": 16150, + "time_per_iteration": 2.60141658782959 + }, + { + "auxiliary_loss_clip": 0.01123485, + "auxiliary_loss_mlp": 0.01034141, + "balance_loss_clip": 1.04555666, + "balance_loss_mlp": 1.01959777, + "epoch": 0.9710506538403728, + "flos": 17165588486400.0, + "grad_norm": 2.1888424810953993, + "language_loss": 0.73578465, + "learning_rate": 8.766820074958214e-09, + "loss": 0.75736088, + "num_input_tokens_seen": 348616300, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.14556885, + "step": 16151, + "time_per_iteration": 2.432431697845459 + }, + { + "auxiliary_loss_clip": 0.01111675, + "auxiliary_loss_mlp": 0.01026325, + "balance_loss_clip": 1.04244161, + "balance_loss_mlp": 1.01540494, + "epoch": 0.9711107770930407, + "flos": 21173470070400.0, + "grad_norm": 1.777348241458551, + "language_loss": 0.74780941, + "learning_rate": 8.730432009145027e-09, + "loss": 0.76918936, + "num_input_tokens_seen": 348633845, + "router_z_loss_clip": 0.69189453, + "router_z_loss_mlp": 0.10931396, + "step": 16152, + "time_per_iteration": 2.4029388427734375 + }, + { + "auxiliary_loss_clip": 0.01111138, + "auxiliary_loss_mlp": 0.01031653, + "balance_loss_clip": 1.04066181, + "balance_loss_mlp": 1.01997042, + "epoch": 0.9711709003457087, + "flos": 22237072715520.0, + "grad_norm": 1.9447057146668314, + "language_loss": 0.67020237, + "learning_rate": 8.694119452473448e-09, + "loss": 0.6916303, + "num_input_tokens_seen": 348653070, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.11669922, + "step": 16153, + "time_per_iteration": 3.8760480880737305 + }, + { + "auxiliary_loss_clip": 0.01115112, + "auxiliary_loss_mlp": 0.01028062, + "balance_loss_clip": 1.04340482, + "balance_loss_mlp": 1.01769686, + "epoch": 0.9712310235983767, + "flos": 26213856099840.0, + "grad_norm": 1.747306481937839, + "language_loss": 0.70544577, + "learning_rate": 8.65788240632037e-09, + "loss": 0.72687757, + "num_input_tokens_seen": 348672145, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.10369873, + "step": 16154, + "time_per_iteration": 2.521465301513672 + }, + { + "auxiliary_loss_clip": 0.01115933, + "auxiliary_loss_mlp": 0.01034496, + "balance_loss_clip": 1.04366219, + "balance_loss_mlp": 1.02016711, + "epoch": 0.9712911468510447, + "flos": 20668171495680.0, + "grad_norm": 2.038413605711529, + "language_loss": 0.8096326, + "learning_rate": 8.621720872059812e-09, + "loss": 0.83113688, + "num_input_tokens_seen": 348690615, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.14324951, + "step": 16155, + "time_per_iteration": 2.4877853393554688 + }, + { + "auxiliary_loss_clip": 0.01118147, + "auxiliary_loss_mlp": 0.01038866, + "balance_loss_clip": 1.0411284, + "balance_loss_mlp": 1.02569938, + "epoch": 0.9713512701037126, + "flos": 13552903313280.0, + "grad_norm": 2.7286242878703146, + "language_loss": 0.67565763, + "learning_rate": 8.58563485106334e-09, + "loss": 0.69722778, + "num_input_tokens_seen": 348708665, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.13165283, + "step": 16156, + "time_per_iteration": 2.412057638168335 + }, + { + "auxiliary_loss_clip": 0.01113259, + "auxiliary_loss_mlp": 0.01030224, + "balance_loss_clip": 1.03942657, + "balance_loss_mlp": 1.01897049, + "epoch": 0.9714113933563806, + "flos": 25848752307840.0, + "grad_norm": 2.693505293375461, + "language_loss": 0.90995389, + "learning_rate": 8.54962434469919e-09, + "loss": 0.93138874, + "num_input_tokens_seen": 348726105, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11254883, + "step": 16157, + "time_per_iteration": 2.5276854038238525 + }, + { + "auxiliary_loss_clip": 0.01118489, + "auxiliary_loss_mlp": 0.01029929, + "balance_loss_clip": 1.04539108, + "balance_loss_mlp": 1.01835358, + "epoch": 0.9714715166090485, + "flos": 12743081233920.0, + "grad_norm": 1.8697163361240923, + "language_loss": 0.72471148, + "learning_rate": 8.513689354332721e-09, + "loss": 0.74619567, + "num_input_tokens_seen": 348743360, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11578369, + "step": 16158, + "time_per_iteration": 2.422907590866089 + }, + { + "auxiliary_loss_clip": 0.01113074, + "auxiliary_loss_mlp": 0.01030301, + "balance_loss_clip": 1.04288435, + "balance_loss_mlp": 1.01897049, + "epoch": 0.9715316398617165, + "flos": 18405547931520.0, + "grad_norm": 2.2302796594867256, + "language_loss": 0.60324502, + "learning_rate": 8.477829881326836e-09, + "loss": 0.62467873, + "num_input_tokens_seen": 348759045, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.11334229, + "step": 16159, + "time_per_iteration": 2.409698247909546 + }, + { + "auxiliary_loss_clip": 0.0110824, + "auxiliary_loss_mlp": 0.01027869, + "balance_loss_clip": 1.0392096, + "balance_loss_mlp": 1.01741982, + "epoch": 0.9715917631143844, + "flos": 28913799749760.0, + "grad_norm": 1.6885408950413567, + "language_loss": 0.78980279, + "learning_rate": 8.44204592704112e-09, + "loss": 0.8111639, + "num_input_tokens_seen": 348779910, + "router_z_loss_clip": 0.69091797, + "router_z_loss_mlp": 0.10455322, + "step": 16160, + "time_per_iteration": 3.9382386207580566 + }, + { + "auxiliary_loss_clip": 0.01048973, + "auxiliary_loss_mlp": 0.01002133, + "balance_loss_clip": 1.02467847, + "balance_loss_mlp": 1.00096822, + "epoch": 0.9716518863670525, + "flos": 65939712900480.0, + "grad_norm": 0.7649202831578343, + "language_loss": 0.54236341, + "learning_rate": 8.406337492832704e-09, + "loss": 0.56287456, + "num_input_tokens_seen": 348838995, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 0.01164246, + "step": 16161, + "time_per_iteration": 3.098344564437866 + }, + { + "auxiliary_loss_clip": 0.0111703, + "auxiliary_loss_mlp": 0.0103023, + "balance_loss_clip": 1.04863179, + "balance_loss_mlp": 1.01908362, + "epoch": 0.9717120096197204, + "flos": 17712759340800.0, + "grad_norm": 2.041437405185413, + "language_loss": 0.72216779, + "learning_rate": 8.3707045800554e-09, + "loss": 0.74364042, + "num_input_tokens_seen": 348858090, + "router_z_loss_clip": 0.68310547, + "router_z_loss_mlp": 0.11151123, + "step": 16162, + "time_per_iteration": 2.4079458713531494 + }, + { + "auxiliary_loss_clip": 0.01106619, + "auxiliary_loss_mlp": 0.01033705, + "balance_loss_clip": 1.03657532, + "balance_loss_mlp": 1.01959682, + "epoch": 0.9717721328723884, + "flos": 24463426521600.0, + "grad_norm": 1.6740936010541245, + "language_loss": 0.78918529, + "learning_rate": 8.335147190060787e-09, + "loss": 0.81058848, + "num_input_tokens_seen": 348877885, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.14086914, + "step": 16163, + "time_per_iteration": 2.4828786849975586 + }, + { + "auxiliary_loss_clip": 0.01115707, + "auxiliary_loss_mlp": 0.01025936, + "balance_loss_clip": 1.04508781, + "balance_loss_mlp": 1.01504672, + "epoch": 0.9718322561250564, + "flos": 20776477979520.0, + "grad_norm": 1.8790805216740443, + "language_loss": 0.72766376, + "learning_rate": 8.299665324196903e-09, + "loss": 0.74908018, + "num_input_tokens_seen": 348897720, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.10894775, + "step": 16164, + "time_per_iteration": 2.532813310623169 + }, + { + "auxiliary_loss_clip": 0.0111927, + "auxiliary_loss_mlp": 0.01036506, + "balance_loss_clip": 1.04280663, + "balance_loss_mlp": 1.02397084, + "epoch": 0.9718923793777243, + "flos": 19025904746880.0, + "grad_norm": 2.096920954205463, + "language_loss": 0.84465337, + "learning_rate": 8.264258983809114e-09, + "loss": 0.86621118, + "num_input_tokens_seen": 348915410, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12542725, + "step": 16165, + "time_per_iteration": 2.5077359676361084 + }, + { + "auxiliary_loss_clip": 0.01115201, + "auxiliary_loss_mlp": 0.01025764, + "balance_loss_clip": 1.04429591, + "balance_loss_mlp": 1.01587558, + "epoch": 0.9719525026303923, + "flos": 21871717528320.0, + "grad_norm": 1.5567456937226865, + "language_loss": 0.7911073, + "learning_rate": 8.228928170240345e-09, + "loss": 0.81251693, + "num_input_tokens_seen": 348934335, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.09881592, + "step": 16166, + "time_per_iteration": 2.462846040725708 + }, + { + "auxiliary_loss_clip": 0.01113079, + "auxiliary_loss_mlp": 0.01023967, + "balance_loss_clip": 1.04121196, + "balance_loss_mlp": 1.01298833, + "epoch": 0.9720126258830603, + "flos": 14429303251200.0, + "grad_norm": 1.7617647279871922, + "language_loss": 0.7076031, + "learning_rate": 8.193672884830195e-09, + "loss": 0.72897357, + "num_input_tokens_seen": 348952405, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.10980225, + "step": 16167, + "time_per_iteration": 2.431520700454712 + }, + { + "auxiliary_loss_clip": 0.01118019, + "auxiliary_loss_mlp": 0.01033673, + "balance_loss_clip": 1.04702759, + "balance_loss_mlp": 1.02250922, + "epoch": 0.9720727491357283, + "flos": 26251167352320.0, + "grad_norm": 2.829732984898647, + "language_loss": 0.75932544, + "learning_rate": 8.158493128915812e-09, + "loss": 0.78084236, + "num_input_tokens_seen": 348973580, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.1116333, + "step": 16168, + "time_per_iteration": 2.494882345199585 + }, + { + "auxiliary_loss_clip": 0.01112083, + "auxiliary_loss_mlp": 0.01033816, + "balance_loss_clip": 1.03864062, + "balance_loss_mlp": 1.02092934, + "epoch": 0.9721328723883962, + "flos": 22674105492480.0, + "grad_norm": 3.142683363991101, + "language_loss": 0.73007536, + "learning_rate": 8.123388903830797e-09, + "loss": 0.75153434, + "num_input_tokens_seen": 348992035, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.12890625, + "step": 16169, + "time_per_iteration": 2.4825875759124756 + }, + { + "auxiliary_loss_clip": 0.01113212, + "auxiliary_loss_mlp": 0.01030692, + "balance_loss_clip": 1.03830421, + "balance_loss_mlp": 1.01858044, + "epoch": 0.9721929956410642, + "flos": 28074172360320.0, + "grad_norm": 1.8033025800996854, + "language_loss": 0.57999587, + "learning_rate": 8.088360210906309e-09, + "loss": 0.60143495, + "num_input_tokens_seen": 349013160, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.12115479, + "step": 16170, + "time_per_iteration": 2.5303544998168945 + }, + { + "auxiliary_loss_clip": 0.01107989, + "auxiliary_loss_mlp": 0.01029206, + "balance_loss_clip": 1.03575087, + "balance_loss_mlp": 1.01686788, + "epoch": 0.9722531188937321, + "flos": 20996251344000.0, + "grad_norm": 2.3427140845116066, + "language_loss": 0.71879983, + "learning_rate": 8.053407051471062e-09, + "loss": 0.74017173, + "num_input_tokens_seen": 349033485, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.12335205, + "step": 16171, + "time_per_iteration": 2.439349889755249 + }, + { + "auxiliary_loss_clip": 0.01114267, + "auxiliary_loss_mlp": 0.01036759, + "balance_loss_clip": 1.04030812, + "balance_loss_mlp": 1.02492738, + "epoch": 0.9723132421464001, + "flos": 16070600332800.0, + "grad_norm": 1.7657736857661337, + "language_loss": 0.68902183, + "learning_rate": 8.018529426850218e-09, + "loss": 0.71053207, + "num_input_tokens_seen": 349051705, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11834717, + "step": 16172, + "time_per_iteration": 2.4359195232391357 + }, + { + "auxiliary_loss_clip": 0.01118256, + "auxiliary_loss_mlp": 0.01030617, + "balance_loss_clip": 1.04324722, + "balance_loss_mlp": 1.0188036, + "epoch": 0.972373365399068, + "flos": 27745769289600.0, + "grad_norm": 1.7510155206932927, + "language_loss": 0.86172271, + "learning_rate": 7.983727338366274e-09, + "loss": 0.88321143, + "num_input_tokens_seen": 349070825, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.11804199, + "step": 16173, + "time_per_iteration": 2.4692161083221436 + }, + { + "auxiliary_loss_clip": 0.011197, + "auxiliary_loss_mlp": 0.0102961, + "balance_loss_clip": 1.04235363, + "balance_loss_mlp": 1.01599073, + "epoch": 0.9724334886517361, + "flos": 23002939526400.0, + "grad_norm": 1.9471401047957169, + "language_loss": 0.64358819, + "learning_rate": 7.949000787339289e-09, + "loss": 0.66508126, + "num_input_tokens_seen": 349089730, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.1362915, + "step": 16174, + "time_per_iteration": 2.482839822769165 + }, + { + "auxiliary_loss_clip": 0.0110982, + "auxiliary_loss_mlp": 0.01028478, + "balance_loss_clip": 1.03859377, + "balance_loss_mlp": 1.01751685, + "epoch": 0.972493611904404, + "flos": 25447055535360.0, + "grad_norm": 1.787384707793782, + "language_loss": 0.77927649, + "learning_rate": 7.914349775085538e-09, + "loss": 0.80065948, + "num_input_tokens_seen": 349111315, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.10955811, + "step": 16175, + "time_per_iteration": 2.4675469398498535 + }, + { + "auxiliary_loss_clip": 0.01109645, + "auxiliary_loss_mlp": 0.01031729, + "balance_loss_clip": 1.03877878, + "balance_loss_mlp": 1.01906896, + "epoch": 0.972553735157072, + "flos": 16983054547200.0, + "grad_norm": 3.044043283603745, + "language_loss": 0.56981897, + "learning_rate": 7.879774302919307e-09, + "loss": 0.59123266, + "num_input_tokens_seen": 349129495, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.12658691, + "step": 16176, + "time_per_iteration": 2.4468302726745605 + }, + { + "auxiliary_loss_clip": 0.01112939, + "auxiliary_loss_mlp": 0.01027969, + "balance_loss_clip": 1.04155338, + "balance_loss_mlp": 1.01705551, + "epoch": 0.97261385840974, + "flos": 26104651776000.0, + "grad_norm": 2.0938824169218817, + "language_loss": 0.72403646, + "learning_rate": 7.845274372151545e-09, + "loss": 0.74544549, + "num_input_tokens_seen": 349148850, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.10919189, + "step": 16177, + "time_per_iteration": 2.4753525257110596 + }, + { + "auxiliary_loss_clip": 0.01110719, + "auxiliary_loss_mlp": 0.01026587, + "balance_loss_clip": 1.03795779, + "balance_loss_mlp": 1.01520848, + "epoch": 0.9726739816624079, + "flos": 25447881548160.0, + "grad_norm": 1.8161683424652486, + "language_loss": 0.68890679, + "learning_rate": 7.810849984090984e-09, + "loss": 0.71027982, + "num_input_tokens_seen": 349167620, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11383057, + "step": 16178, + "time_per_iteration": 2.51846981048584 + }, + { + "auxiliary_loss_clip": 0.01118138, + "auxiliary_loss_mlp": 0.01031433, + "balance_loss_clip": 1.04410136, + "balance_loss_mlp": 1.01944041, + "epoch": 0.972734104915076, + "flos": 29014923513600.0, + "grad_norm": 2.1273983048849474, + "language_loss": 0.67288709, + "learning_rate": 7.776501140042358e-09, + "loss": 0.69438279, + "num_input_tokens_seen": 349185845, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.11987305, + "step": 16179, + "time_per_iteration": 2.5000686645507812 + }, + { + "auxiliary_loss_clip": 0.01105511, + "auxiliary_loss_mlp": 0.01037078, + "balance_loss_clip": 1.03668022, + "balance_loss_mlp": 1.024966, + "epoch": 0.9727942281677439, + "flos": 23437637919360.0, + "grad_norm": 1.7698192216439905, + "language_loss": 0.77267265, + "learning_rate": 7.742227841308624e-09, + "loss": 0.79409862, + "num_input_tokens_seen": 349204525, + "router_z_loss_clip": 0.68896484, + "router_z_loss_mlp": 0.12109375, + "step": 16180, + "time_per_iteration": 2.4923715591430664 + }, + { + "auxiliary_loss_clip": 0.01117632, + "auxiliary_loss_mlp": 0.01030674, + "balance_loss_clip": 1.04277325, + "balance_loss_mlp": 1.01873565, + "epoch": 0.9728543514204119, + "flos": 31724599749120.0, + "grad_norm": 1.4993131006084448, + "language_loss": 0.76559019, + "learning_rate": 7.708030089189188e-09, + "loss": 0.78707325, + "num_input_tokens_seen": 349228075, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11938477, + "step": 16181, + "time_per_iteration": 2.5697691440582275 + }, + { + "auxiliary_loss_clip": 0.01104605, + "auxiliary_loss_mlp": 0.01032442, + "balance_loss_clip": 1.03439474, + "balance_loss_mlp": 1.0210278, + "epoch": 0.9729144746730798, + "flos": 16289368116480.0, + "grad_norm": 1.6334927687509773, + "language_loss": 0.63539863, + "learning_rate": 7.67390788498079e-09, + "loss": 0.65676904, + "num_input_tokens_seen": 349246990, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.11419678, + "step": 16182, + "time_per_iteration": 2.4556686878204346 + }, + { + "auxiliary_loss_clip": 0.01121064, + "auxiliary_loss_mlp": 0.01033551, + "balance_loss_clip": 1.04478836, + "balance_loss_mlp": 1.02155232, + "epoch": 0.9729745979257478, + "flos": 25041408266880.0, + "grad_norm": 4.837038935056898, + "language_loss": 0.62828052, + "learning_rate": 7.639861229977507e-09, + "loss": 0.64982671, + "num_input_tokens_seen": 349265890, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.12005615, + "step": 16183, + "time_per_iteration": 3.988959789276123 + }, + { + "auxiliary_loss_clip": 0.01120238, + "auxiliary_loss_mlp": 0.0102863, + "balance_loss_clip": 1.04851723, + "balance_loss_mlp": 1.01670885, + "epoch": 0.9730347211784157, + "flos": 22638733574400.0, + "grad_norm": 1.5769197978190868, + "language_loss": 0.78005975, + "learning_rate": 7.605890125470527e-09, + "loss": 0.80154836, + "num_input_tokens_seen": 349285275, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11914062, + "step": 16184, + "time_per_iteration": 2.4638307094573975 + }, + { + "auxiliary_loss_clip": 0.01116666, + "auxiliary_loss_mlp": 0.01028711, + "balance_loss_clip": 1.0447818, + "balance_loss_mlp": 1.01737976, + "epoch": 0.9730948444310837, + "flos": 10998613313280.0, + "grad_norm": 2.4418811445583555, + "language_loss": 0.79595232, + "learning_rate": 7.571994572747709e-09, + "loss": 0.81740606, + "num_input_tokens_seen": 349301515, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11334229, + "step": 16185, + "time_per_iteration": 2.4351470470428467 + }, + { + "auxiliary_loss_clip": 0.01114745, + "auxiliary_loss_mlp": 0.01028261, + "balance_loss_clip": 1.04243636, + "balance_loss_mlp": 1.01676977, + "epoch": 0.9731549676837516, + "flos": 16799479113600.0, + "grad_norm": 2.0846974519300687, + "language_loss": 0.77981925, + "learning_rate": 7.538174573094469e-09, + "loss": 0.80124938, + "num_input_tokens_seen": 349319590, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11499023, + "step": 16186, + "time_per_iteration": 2.4121222496032715 + }, + { + "auxiliary_loss_clip": 0.0110933, + "auxiliary_loss_mlp": 0.01027062, + "balance_loss_clip": 1.03837276, + "balance_loss_mlp": 1.01542127, + "epoch": 0.9732150909364197, + "flos": 21141761339520.0, + "grad_norm": 1.8162918039009504, + "language_loss": 0.65581328, + "learning_rate": 7.504430127793337e-09, + "loss": 0.67717719, + "num_input_tokens_seen": 349339230, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.11651611, + "step": 16187, + "time_per_iteration": 2.467698097229004 + }, + { + "auxiliary_loss_clip": 0.01104216, + "auxiliary_loss_mlp": 0.0102878, + "balance_loss_clip": 1.03467894, + "balance_loss_mlp": 1.01741338, + "epoch": 0.9732752141890876, + "flos": 33727337435520.0, + "grad_norm": 1.9329575072362293, + "language_loss": 0.80587029, + "learning_rate": 7.47076123812418e-09, + "loss": 0.82720029, + "num_input_tokens_seen": 349361155, + "router_z_loss_clip": 0.69580078, + "router_z_loss_mlp": 0.11364746, + "step": 16188, + "time_per_iteration": 2.5840344429016113 + }, + { + "auxiliary_loss_clip": 0.01111189, + "auxiliary_loss_mlp": 0.01028047, + "balance_loss_clip": 1.04235935, + "balance_loss_mlp": 1.01753902, + "epoch": 0.9733353374417556, + "flos": 23404384903680.0, + "grad_norm": 1.7557758099334775, + "language_loss": 0.77771223, + "learning_rate": 7.437167905363084e-09, + "loss": 0.79910463, + "num_input_tokens_seen": 349379335, + "router_z_loss_clip": 0.68945312, + "router_z_loss_mlp": 0.10516357, + "step": 16189, + "time_per_iteration": 2.4576947689056396 + }, + { + "auxiliary_loss_clip": 0.01106835, + "auxiliary_loss_mlp": 0.0102565, + "balance_loss_clip": 1.0358386, + "balance_loss_mlp": 1.01403856, + "epoch": 0.9733954606944236, + "flos": 39165792963840.0, + "grad_norm": 1.71246864938362, + "language_loss": 0.51394498, + "learning_rate": 7.403650130784367e-09, + "loss": 0.53526986, + "num_input_tokens_seen": 349401575, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.11608887, + "step": 16190, + "time_per_iteration": 2.584221124649048 + }, + { + "auxiliary_loss_clip": 0.01119662, + "auxiliary_loss_mlp": 0.01027771, + "balance_loss_clip": 1.04711604, + "balance_loss_mlp": 1.01610589, + "epoch": 0.9734555839470915, + "flos": 21981819692160.0, + "grad_norm": 1.8885936793778366, + "language_loss": 0.81233841, + "learning_rate": 7.3702079156590105e-09, + "loss": 0.83381271, + "num_input_tokens_seen": 349420650, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11663818, + "step": 16191, + "time_per_iteration": 2.4476282596588135 + }, + { + "auxiliary_loss_clip": 0.01108812, + "auxiliary_loss_mlp": 0.01027766, + "balance_loss_clip": 1.03974795, + "balance_loss_mlp": 1.01722789, + "epoch": 0.9735157071997596, + "flos": 16575539771520.0, + "grad_norm": 1.6412408730581918, + "language_loss": 0.82851744, + "learning_rate": 7.336841261255111e-09, + "loss": 0.84988326, + "num_input_tokens_seen": 349436830, + "router_z_loss_clip": 0.69042969, + "router_z_loss_mlp": 0.10540771, + "step": 16192, + "time_per_iteration": 3.8156988620758057 + }, + { + "auxiliary_loss_clip": 0.01119108, + "auxiliary_loss_mlp": 0.01030493, + "balance_loss_clip": 1.04566479, + "balance_loss_mlp": 1.01934123, + "epoch": 0.9735758304524275, + "flos": 20223237726720.0, + "grad_norm": 1.7857849638120598, + "language_loss": 0.75193357, + "learning_rate": 7.303550168837658e-09, + "loss": 0.77342963, + "num_input_tokens_seen": 349454325, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11151123, + "step": 16193, + "time_per_iteration": 2.479999542236328 + }, + { + "auxiliary_loss_clip": 0.01111311, + "auxiliary_loss_mlp": 0.01029981, + "balance_loss_clip": 1.04287148, + "balance_loss_mlp": 1.01959229, + "epoch": 0.9736359537050955, + "flos": 23653353047040.0, + "grad_norm": 1.6547590089013449, + "language_loss": 0.84939247, + "learning_rate": 7.270334639669417e-09, + "loss": 0.87080538, + "num_input_tokens_seen": 349470230, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 0.10394287, + "step": 16194, + "time_per_iteration": 2.436518430709839 + }, + { + "auxiliary_loss_clip": 0.01109665, + "auxiliary_loss_mlp": 0.01032629, + "balance_loss_clip": 1.03988945, + "balance_loss_mlp": 1.02141786, + "epoch": 0.9736960769577634, + "flos": 15560202026880.0, + "grad_norm": 1.5188515103353564, + "language_loss": 0.75877273, + "learning_rate": 7.237194675009828e-09, + "loss": 0.78019571, + "num_input_tokens_seen": 349486250, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.11206055, + "step": 16195, + "time_per_iteration": 2.415759563446045 + }, + { + "auxiliary_loss_clip": 0.01036677, + "auxiliary_loss_mlp": 0.01001089, + "balance_loss_clip": 1.0118376, + "balance_loss_mlp": 0.99974632, + "epoch": 0.9737562002104314, + "flos": 65351783088000.0, + "grad_norm": 0.7068701157793058, + "language_loss": 0.52461898, + "learning_rate": 7.204130276115439e-09, + "loss": 0.54499662, + "num_input_tokens_seen": 349545865, + "router_z_loss_clip": 0.24853516, + "router_z_loss_mlp": 0.01342773, + "step": 16196, + "time_per_iteration": 4.420827150344849 + }, + { + "auxiliary_loss_clip": 0.0111718, + "auxiliary_loss_mlp": 0.0102622, + "balance_loss_clip": 1.04560852, + "balance_loss_mlp": 1.014961, + "epoch": 0.9738163234630993, + "flos": 27196730928000.0, + "grad_norm": 1.4662369216866074, + "language_loss": 0.7604093, + "learning_rate": 7.171141444240136e-09, + "loss": 0.7818433, + "num_input_tokens_seen": 349566080, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11260986, + "step": 16197, + "time_per_iteration": 2.4986090660095215 + }, + { + "auxiliary_loss_clip": 0.01111642, + "auxiliary_loss_mlp": 0.01031485, + "balance_loss_clip": 1.03705001, + "balance_loss_mlp": 1.01943254, + "epoch": 0.9738764467157673, + "flos": 21069365477760.0, + "grad_norm": 1.7220177187723347, + "language_loss": 0.68100333, + "learning_rate": 7.13822818063492e-09, + "loss": 0.7024346, + "num_input_tokens_seen": 349585665, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.1206665, + "step": 16198, + "time_per_iteration": 2.440432071685791 + }, + { + "auxiliary_loss_clip": 0.0111392, + "auxiliary_loss_mlp": 0.01031537, + "balance_loss_clip": 1.0420835, + "balance_loss_mlp": 1.01922846, + "epoch": 0.9739365699684353, + "flos": 21361211481600.0, + "grad_norm": 2.0214851485613483, + "language_loss": 0.78253031, + "learning_rate": 7.10539048654768e-09, + "loss": 0.80398488, + "num_input_tokens_seen": 349605125, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.12310791, + "step": 16199, + "time_per_iteration": 2.4791507720947266 + }, + { + "auxiliary_loss_clip": 0.01120761, + "auxiliary_loss_mlp": 0.01029908, + "balance_loss_clip": 1.04684067, + "balance_loss_mlp": 1.01786244, + "epoch": 0.9739966932211033, + "flos": 21902061542400.0, + "grad_norm": 2.4241196196937382, + "language_loss": 0.79477352, + "learning_rate": 7.072628363223865e-09, + "loss": 0.81628025, + "num_input_tokens_seen": 349623360, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.12042236, + "step": 16200, + "time_per_iteration": 2.4503281116485596 + }, + { + "auxiliary_loss_clip": 0.01123214, + "auxiliary_loss_mlp": 0.01036795, + "balance_loss_clip": 1.04589272, + "balance_loss_mlp": 1.02453446, + "epoch": 0.9740568164737712, + "flos": 24827345164800.0, + "grad_norm": 2.0117106634108044, + "language_loss": 0.6827544, + "learning_rate": 7.039941811905592e-09, + "loss": 0.70435447, + "num_input_tokens_seen": 349644390, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.12268066, + "step": 16201, + "time_per_iteration": 2.4650352001190186 + }, + { + "auxiliary_loss_clip": 0.01113843, + "auxiliary_loss_mlp": 0.01027106, + "balance_loss_clip": 1.04096675, + "balance_loss_mlp": 1.01627564, + "epoch": 0.9741169397264392, + "flos": 23623583650560.0, + "grad_norm": 1.4149170302566985, + "language_loss": 0.726165, + "learning_rate": 7.0073308338325364e-09, + "loss": 0.74757445, + "num_input_tokens_seen": 349663200, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.1083374, + "step": 16202, + "time_per_iteration": 2.471647262573242 + }, + { + "auxiliary_loss_clip": 0.01123811, + "auxiliary_loss_mlp": 0.01029573, + "balance_loss_clip": 1.04917216, + "balance_loss_mlp": 1.0173068, + "epoch": 0.9741770629791072, + "flos": 18841144164480.0, + "grad_norm": 2.0313507559722783, + "language_loss": 0.73036265, + "learning_rate": 6.974795430241265e-09, + "loss": 0.7518965, + "num_input_tokens_seen": 349681975, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.12249756, + "step": 16203, + "time_per_iteration": 2.427340507507324 + }, + { + "auxiliary_loss_clip": 0.01111209, + "auxiliary_loss_mlp": 0.01031516, + "balance_loss_clip": 1.04084396, + "balance_loss_mlp": 1.02013707, + "epoch": 0.9742371862317751, + "flos": 22346241125760.0, + "grad_norm": 2.098852848624863, + "language_loss": 0.77657926, + "learning_rate": 6.942335602365235e-09, + "loss": 0.79800647, + "num_input_tokens_seen": 349701185, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.11383057, + "step": 16204, + "time_per_iteration": 3.8560643196105957 + }, + { + "auxiliary_loss_clip": 0.01117735, + "auxiliary_loss_mlp": 0.01030863, + "balance_loss_clip": 1.04437482, + "balance_loss_mlp": 1.01869142, + "epoch": 0.9742973094844432, + "flos": 21762764599680.0, + "grad_norm": 1.8650783398361377, + "language_loss": 0.79437804, + "learning_rate": 6.909951351435905e-09, + "loss": 0.81586409, + "num_input_tokens_seen": 349720360, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.121521, + "step": 16205, + "time_per_iteration": 2.4581313133239746 + }, + { + "auxiliary_loss_clip": 0.01110752, + "auxiliary_loss_mlp": 0.01026141, + "balance_loss_clip": 1.04046559, + "balance_loss_mlp": 1.01505494, + "epoch": 0.9743574327371111, + "flos": 26248725227520.0, + "grad_norm": 1.8199171468197333, + "language_loss": 0.74238259, + "learning_rate": 6.87764267868074e-09, + "loss": 0.76375151, + "num_input_tokens_seen": 349741040, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.11083984, + "step": 16206, + "time_per_iteration": 2.4668405055999756 + }, + { + "auxiliary_loss_clip": 0.01116547, + "auxiliary_loss_mlp": 0.01033561, + "balance_loss_clip": 1.04306555, + "balance_loss_mlp": 1.02106845, + "epoch": 0.9744175559897791, + "flos": 12349321367040.0, + "grad_norm": 2.150600725039441, + "language_loss": 0.83471531, + "learning_rate": 6.8454095853252015e-09, + "loss": 0.85621643, + "num_input_tokens_seen": 349758895, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.12487793, + "step": 16207, + "time_per_iteration": 2.4424054622650146 + }, + { + "auxiliary_loss_clip": 0.01114664, + "auxiliary_loss_mlp": 0.01031448, + "balance_loss_clip": 1.04394412, + "balance_loss_mlp": 1.02002144, + "epoch": 0.974477679242447, + "flos": 28397834835840.0, + "grad_norm": 1.577348411960532, + "language_loss": 0.70780563, + "learning_rate": 6.813252072591425e-09, + "loss": 0.72926676, + "num_input_tokens_seen": 349779740, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11419678, + "step": 16208, + "time_per_iteration": 2.5038130283355713 + }, + { + "auxiliary_loss_clip": 0.01100983, + "auxiliary_loss_mlp": 0.0102237, + "balance_loss_clip": 1.03553522, + "balance_loss_mlp": 1.01247537, + "epoch": 0.974537802495115, + "flos": 17785370684160.0, + "grad_norm": 1.6726348098806567, + "language_loss": 0.77459836, + "learning_rate": 6.781170141698878e-09, + "loss": 0.79583192, + "num_input_tokens_seen": 349796820, + "router_z_loss_clip": 0.65429688, + "router_z_loss_mlp": 0.09887695, + "step": 16209, + "time_per_iteration": 2.476741313934326 + }, + { + "auxiliary_loss_clip": 0.01114293, + "auxiliary_loss_mlp": 0.01036761, + "balance_loss_clip": 1.03790522, + "balance_loss_mlp": 1.02398133, + "epoch": 0.9745979257477829, + "flos": 23842315520640.0, + "grad_norm": 2.758892216306205, + "language_loss": 0.79250622, + "learning_rate": 6.749163793864144e-09, + "loss": 0.8140167, + "num_input_tokens_seen": 349816550, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.12774658, + "step": 16210, + "time_per_iteration": 2.511885166168213 + }, + { + "auxiliary_loss_clip": 0.01116441, + "auxiliary_loss_mlp": 0.01032261, + "balance_loss_clip": 1.04396033, + "balance_loss_mlp": 1.02118635, + "epoch": 0.9746580490004509, + "flos": 27016172236800.0, + "grad_norm": 2.107101111442171, + "language_loss": 0.78143013, + "learning_rate": 6.7172330303009176e-09, + "loss": 0.80291712, + "num_input_tokens_seen": 349834350, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11077881, + "step": 16211, + "time_per_iteration": 2.578568458557129 + }, + { + "auxiliary_loss_clip": 0.0112169, + "auxiliary_loss_mlp": 0.01026842, + "balance_loss_clip": 1.04570746, + "balance_loss_mlp": 1.0139972, + "epoch": 0.9747181722531189, + "flos": 19792022952960.0, + "grad_norm": 2.082627656183036, + "language_loss": 0.78380406, + "learning_rate": 6.685377852219787e-09, + "loss": 0.80528939, + "num_input_tokens_seen": 349853460, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.128479, + "step": 16212, + "time_per_iteration": 2.432417869567871 + }, + { + "auxiliary_loss_clip": 0.01108172, + "auxiliary_loss_mlp": 0.01026734, + "balance_loss_clip": 1.03879356, + "balance_loss_mlp": 1.01567769, + "epoch": 0.9747782955057869, + "flos": 31430598929280.0, + "grad_norm": 1.4821653778278099, + "language_loss": 0.80391258, + "learning_rate": 6.653598260829118e-09, + "loss": 0.82526159, + "num_input_tokens_seen": 349874830, + "router_z_loss_clip": 0.69384766, + "router_z_loss_mlp": 0.11053467, + "step": 16213, + "time_per_iteration": 2.650381565093994 + }, + { + "auxiliary_loss_clip": 0.01106883, + "auxiliary_loss_mlp": 0.01026029, + "balance_loss_clip": 1.03638768, + "balance_loss_mlp": 1.01485312, + "epoch": 0.9748384187584548, + "flos": 15961288268160.0, + "grad_norm": 1.8073500595303527, + "language_loss": 0.66345578, + "learning_rate": 6.6218942573335044e-09, + "loss": 0.68478489, + "num_input_tokens_seen": 349893690, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.11181641, + "step": 16214, + "time_per_iteration": 2.4181129932403564 + }, + { + "auxiliary_loss_clip": 0.01113888, + "auxiliary_loss_mlp": 0.01033546, + "balance_loss_clip": 1.04037082, + "balance_loss_mlp": 1.02011681, + "epoch": 0.9748985420111228, + "flos": 20558715776640.0, + "grad_norm": 1.641676958629192, + "language_loss": 0.74498922, + "learning_rate": 6.5902658429355386e-09, + "loss": 0.76646352, + "num_input_tokens_seen": 349912480, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.13427734, + "step": 16215, + "time_per_iteration": 2.5024147033691406 + }, + { + "auxiliary_loss_clip": 0.01111801, + "auxiliary_loss_mlp": 0.01036112, + "balance_loss_clip": 1.03973353, + "balance_loss_mlp": 1.02278483, + "epoch": 0.9749586652637908, + "flos": 36721605127680.0, + "grad_norm": 1.9803723261176671, + "language_loss": 0.67046428, + "learning_rate": 6.558713018834483e-09, + "loss": 0.69194341, + "num_input_tokens_seen": 349932470, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.13311768, + "step": 16216, + "time_per_iteration": 2.64843487739563 + }, + { + "auxiliary_loss_clip": 0.01120057, + "auxiliary_loss_mlp": 0.01027404, + "balance_loss_clip": 1.04569733, + "balance_loss_mlp": 1.01538205, + "epoch": 0.9750187885164587, + "flos": 10999223844480.0, + "grad_norm": 2.4228240463929667, + "language_loss": 0.72015601, + "learning_rate": 6.527235786226937e-09, + "loss": 0.74163067, + "num_input_tokens_seen": 349949060, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.12017822, + "step": 16217, + "time_per_iteration": 2.4184751510620117 + }, + { + "auxiliary_loss_clip": 0.01119821, + "auxiliary_loss_mlp": 0.0102937, + "balance_loss_clip": 1.04624796, + "balance_loss_mlp": 1.01758671, + "epoch": 0.9750789117691268, + "flos": 25739512070400.0, + "grad_norm": 1.6178043539131712, + "language_loss": 0.78515935, + "learning_rate": 6.495834146306167e-09, + "loss": 0.80665129, + "num_input_tokens_seen": 349968010, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11791992, + "step": 16218, + "time_per_iteration": 2.473045825958252 + }, + { + "auxiliary_loss_clip": 0.01107649, + "auxiliary_loss_mlp": 0.01030037, + "balance_loss_clip": 1.03839517, + "balance_loss_mlp": 1.01870656, + "epoch": 0.9751390350217947, + "flos": 13333955961600.0, + "grad_norm": 1.9974185957663326, + "language_loss": 0.77461118, + "learning_rate": 6.464508100263222e-09, + "loss": 0.79598802, + "num_input_tokens_seen": 349985270, + "router_z_loss_clip": 0.69238281, + "router_z_loss_mlp": 0.11334229, + "step": 16219, + "time_per_iteration": 2.451188802719116 + }, + { + "auxiliary_loss_clip": 0.01116756, + "auxiliary_loss_mlp": 0.01032373, + "balance_loss_clip": 1.04136872, + "balance_loss_mlp": 1.02083349, + "epoch": 0.9751991582744627, + "flos": 22820621068800.0, + "grad_norm": 1.7411952300308629, + "language_loss": 0.81529343, + "learning_rate": 6.433257649285817e-09, + "loss": 0.83678472, + "num_input_tokens_seen": 350003935, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.11535645, + "step": 16220, + "time_per_iteration": 2.427450656890869 + }, + { + "auxiliary_loss_clip": 0.01109106, + "auxiliary_loss_mlp": 0.01027693, + "balance_loss_clip": 1.03935289, + "balance_loss_mlp": 1.01670229, + "epoch": 0.9752592815271306, + "flos": 19646189735040.0, + "grad_norm": 2.148565329795925, + "language_loss": 0.75318807, + "learning_rate": 6.402082794559227e-09, + "loss": 0.77455604, + "num_input_tokens_seen": 350023595, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.11004639, + "step": 16221, + "time_per_iteration": 2.454024314880371 + }, + { + "auxiliary_loss_clip": 0.01113188, + "auxiliary_loss_mlp": 0.01029665, + "balance_loss_clip": 1.04396081, + "balance_loss_mlp": 1.01780319, + "epoch": 0.9753194047797986, + "flos": 26690462686080.0, + "grad_norm": 1.506809417510919, + "language_loss": 0.66672039, + "learning_rate": 6.370983537265395e-09, + "loss": 0.68814898, + "num_input_tokens_seen": 350045920, + "router_z_loss_clip": 0.69287109, + "router_z_loss_mlp": 0.11871338, + "step": 16222, + "time_per_iteration": 2.502044439315796 + }, + { + "auxiliary_loss_clip": 0.01117334, + "auxiliary_loss_mlp": 0.01031243, + "balance_loss_clip": 1.04675162, + "balance_loss_mlp": 1.02019787, + "epoch": 0.9753795280324665, + "flos": 23221779137280.0, + "grad_norm": 1.7067922873245047, + "language_loss": 0.88510507, + "learning_rate": 6.3399598785836004e-09, + "loss": 0.90659082, + "num_input_tokens_seen": 350063925, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.11047363, + "step": 16223, + "time_per_iteration": 2.561955451965332 + }, + { + "auxiliary_loss_clip": 0.01111231, + "auxiliary_loss_mlp": 0.01030356, + "balance_loss_clip": 1.03957009, + "balance_loss_mlp": 1.01892376, + "epoch": 0.9754396512851345, + "flos": 19463835363840.0, + "grad_norm": 1.944377327896991, + "language_loss": 0.74719238, + "learning_rate": 6.309011819690457e-09, + "loss": 0.76860821, + "num_input_tokens_seen": 350080900, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11425781, + "step": 16224, + "time_per_iteration": 2.510430097579956 + }, + { + "auxiliary_loss_clip": 0.0103693, + "auxiliary_loss_mlp": 0.01005895, + "balance_loss_clip": 1.01169753, + "balance_loss_mlp": 1.00457644, + "epoch": 0.9754997745378025, + "flos": 68459313340800.0, + "grad_norm": 0.8793715805068639, + "language_loss": 0.59106725, + "learning_rate": 6.278139361759249e-09, + "loss": 0.61149549, + "num_input_tokens_seen": 350144550, + "router_z_loss_clip": 0.25244141, + "router_z_loss_mlp": 0.01318359, + "step": 16225, + "time_per_iteration": 3.035979747772217 + }, + { + "auxiliary_loss_clip": 0.01111467, + "auxiliary_loss_mlp": 0.01036982, + "balance_loss_clip": 1.04058623, + "balance_loss_mlp": 1.02585936, + "epoch": 0.9755598977904705, + "flos": 26395168976640.0, + "grad_norm": 1.7944100761685988, + "language_loss": 0.69125718, + "learning_rate": 6.247342505960818e-09, + "loss": 0.71274161, + "num_input_tokens_seen": 350164050, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.11126709, + "step": 16226, + "time_per_iteration": 3.8993425369262695 + }, + { + "auxiliary_loss_clip": 0.01117243, + "auxiliary_loss_mlp": 0.01036261, + "balance_loss_clip": 1.04437065, + "balance_loss_mlp": 1.02446532, + "epoch": 0.9756200210431384, + "flos": 16617663446400.0, + "grad_norm": 1.71526527108422, + "language_loss": 0.8317647, + "learning_rate": 6.216621253462894e-09, + "loss": 0.8532998, + "num_input_tokens_seen": 350181350, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11791992, + "step": 16227, + "time_per_iteration": 2.4522910118103027 + }, + { + "auxiliary_loss_clip": 0.01116485, + "auxiliary_loss_mlp": 0.01025322, + "balance_loss_clip": 1.04617298, + "balance_loss_mlp": 1.0142597, + "epoch": 0.9756801442958064, + "flos": 23623044946560.0, + "grad_norm": 1.964955302937323, + "language_loss": 0.77959192, + "learning_rate": 6.185975605430549e-09, + "loss": 0.80100995, + "num_input_tokens_seen": 350199765, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.11065674, + "step": 16228, + "time_per_iteration": 2.4540634155273438 + }, + { + "auxiliary_loss_clip": 0.01045835, + "auxiliary_loss_mlp": 0.01003997, + "balance_loss_clip": 1.02025127, + "balance_loss_mlp": 1.00260651, + "epoch": 0.9757402675484744, + "flos": 61625799440640.0, + "grad_norm": 0.8453160725716121, + "language_loss": 0.55842817, + "learning_rate": 6.155405563025962e-09, + "loss": 0.5789265, + "num_input_tokens_seen": 350256420, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 0.01391602, + "step": 16229, + "time_per_iteration": 3.00160551071167 + }, + { + "auxiliary_loss_clip": 0.01115467, + "auxiliary_loss_mlp": 0.01027827, + "balance_loss_clip": 1.04410231, + "balance_loss_mlp": 1.01631761, + "epoch": 0.9758003908011423, + "flos": 24058964401920.0, + "grad_norm": 1.6288204690963262, + "language_loss": 0.74951756, + "learning_rate": 6.124911127407984e-09, + "loss": 0.7709505, + "num_input_tokens_seen": 350276270, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.11505127, + "step": 16230, + "time_per_iteration": 2.483259439468384 + }, + { + "auxiliary_loss_clip": 0.01117823, + "auxiliary_loss_mlp": 0.01026786, + "balance_loss_clip": 1.04900455, + "balance_loss_mlp": 1.01590228, + "epoch": 0.9758605140538104, + "flos": 17493093717120.0, + "grad_norm": 1.827189148182232, + "language_loss": 0.71862268, + "learning_rate": 6.094492299733245e-09, + "loss": 0.74006879, + "num_input_tokens_seen": 350295000, + "router_z_loss_clip": 0.68896484, + "router_z_loss_mlp": 0.10888672, + "step": 16231, + "time_per_iteration": 2.412290096282959 + }, + { + "auxiliary_loss_clip": 0.01117377, + "auxiliary_loss_mlp": 0.01030182, + "balance_loss_clip": 1.04396844, + "balance_loss_mlp": 1.01787972, + "epoch": 0.9759206373064783, + "flos": 24826950115200.0, + "grad_norm": 2.039480942334928, + "language_loss": 0.76589304, + "learning_rate": 6.064149081155267e-09, + "loss": 0.78736866, + "num_input_tokens_seen": 350314980, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.12304688, + "step": 16232, + "time_per_iteration": 2.48413348197937 + }, + { + "auxiliary_loss_clip": 0.01049344, + "auxiliary_loss_mlp": 0.01004293, + "balance_loss_clip": 1.0251112, + "balance_loss_mlp": 1.00310087, + "epoch": 0.9759807605591463, + "flos": 68161182456960.0, + "grad_norm": 0.7378727012671937, + "language_loss": 0.53797555, + "learning_rate": 6.033881472824465e-09, + "loss": 0.55851191, + "num_input_tokens_seen": 350371985, + "router_z_loss_clip": 0.24243164, + "router_z_loss_mlp": 0.01190186, + "step": 16233, + "time_per_iteration": 2.9369611740112305 + }, + { + "auxiliary_loss_clip": 0.01112948, + "auxiliary_loss_mlp": 0.0103456, + "balance_loss_clip": 1.04051232, + "balance_loss_mlp": 1.0216434, + "epoch": 0.9760408838118142, + "flos": 18989239939200.0, + "grad_norm": 2.4855509787877894, + "language_loss": 0.71585619, + "learning_rate": 6.003689475888807e-09, + "loss": 0.73733127, + "num_input_tokens_seen": 350390590, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.12921143, + "step": 16234, + "time_per_iteration": 2.42578125 + }, + { + "auxiliary_loss_clip": 0.01113545, + "auxiliary_loss_mlp": 0.01034561, + "balance_loss_clip": 1.03862762, + "balance_loss_mlp": 1.02110851, + "epoch": 0.9761010070644822, + "flos": 17125978763520.0, + "grad_norm": 2.7942287588991888, + "language_loss": 0.78515929, + "learning_rate": 5.973573091493156e-09, + "loss": 0.80664033, + "num_input_tokens_seen": 350403770, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.13458252, + "step": 16235, + "time_per_iteration": 2.380481719970703 + }, + { + "auxiliary_loss_clip": 0.0111774, + "auxiliary_loss_mlp": 0.01031668, + "balance_loss_clip": 1.046345, + "balance_loss_mlp": 1.01861405, + "epoch": 0.9761611303171501, + "flos": 22052599441920.0, + "grad_norm": 1.758964069581062, + "language_loss": 0.77276486, + "learning_rate": 5.943532320779265e-09, + "loss": 0.79425895, + "num_input_tokens_seen": 350421870, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.13049316, + "step": 16236, + "time_per_iteration": 3.8418784141540527 + }, + { + "auxiliary_loss_clip": 0.01112838, + "auxiliary_loss_mlp": 0.01026858, + "balance_loss_clip": 1.04086256, + "balance_loss_mlp": 1.01555145, + "epoch": 0.9762212535698181, + "flos": 21757521214080.0, + "grad_norm": 1.765085360634993, + "language_loss": 0.75622511, + "learning_rate": 5.913567164886446e-09, + "loss": 0.7776221, + "num_input_tokens_seen": 350440025, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11315918, + "step": 16237, + "time_per_iteration": 2.4207444190979004 + }, + { + "auxiliary_loss_clip": 0.01113506, + "auxiliary_loss_mlp": 0.01031144, + "balance_loss_clip": 1.04133129, + "balance_loss_mlp": 1.01794171, + "epoch": 0.9762813768224861, + "flos": 25921615046400.0, + "grad_norm": 1.6209543894774183, + "language_loss": 0.72792614, + "learning_rate": 5.8836776249509e-09, + "loss": 0.74937266, + "num_input_tokens_seen": 350459435, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.13195801, + "step": 16238, + "time_per_iteration": 2.4951744079589844 + }, + { + "auxiliary_loss_clip": 0.01118172, + "auxiliary_loss_mlp": 0.01027698, + "balance_loss_clip": 1.04347789, + "balance_loss_mlp": 1.01498377, + "epoch": 0.9763415000751541, + "flos": 24051853509120.0, + "grad_norm": 2.0918141607007175, + "language_loss": 0.83674741, + "learning_rate": 5.8538637021063875e-09, + "loss": 0.85820615, + "num_input_tokens_seen": 350472655, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.1270752, + "step": 16239, + "time_per_iteration": 2.4883522987365723 + }, + { + "auxiliary_loss_clip": 0.0111696, + "auxiliary_loss_mlp": 0.0103258, + "balance_loss_clip": 1.04153669, + "balance_loss_mlp": 1.0206356, + "epoch": 0.976401623327822, + "flos": 17018677860480.0, + "grad_norm": 6.676156826611172, + "language_loss": 0.60169911, + "learning_rate": 5.824125397483115e-09, + "loss": 0.62319452, + "num_input_tokens_seen": 350488160, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.11950684, + "step": 16240, + "time_per_iteration": 2.3985230922698975 + }, + { + "auxiliary_loss_clip": 0.01126874, + "auxiliary_loss_mlp": 0.01027322, + "balance_loss_clip": 1.0537653, + "balance_loss_mlp": 1.01588416, + "epoch": 0.97646174658049, + "flos": 16106941918080.0, + "grad_norm": 2.2655448273082652, + "language_loss": 0.82814848, + "learning_rate": 5.7944627122088474e-09, + "loss": 0.8496905, + "num_input_tokens_seen": 350506065, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11450195, + "step": 16241, + "time_per_iteration": 3.9006810188293457 + }, + { + "auxiliary_loss_clip": 0.01115729, + "auxiliary_loss_mlp": 0.01028831, + "balance_loss_clip": 1.04464614, + "balance_loss_mlp": 1.01812601, + "epoch": 0.9765218698331579, + "flos": 21252725429760.0, + "grad_norm": 1.995497219143815, + "language_loss": 0.83090293, + "learning_rate": 5.764875647408463e-09, + "loss": 0.85234857, + "num_input_tokens_seen": 350524495, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.10705566, + "step": 16242, + "time_per_iteration": 2.4299676418304443 + }, + { + "auxiliary_loss_clip": 0.01116239, + "auxiliary_loss_mlp": 0.01026515, + "balance_loss_clip": 1.0442946, + "balance_loss_mlp": 1.01524425, + "epoch": 0.9765819930858259, + "flos": 18588045957120.0, + "grad_norm": 1.5201085032909014, + "language_loss": 0.75507426, + "learning_rate": 5.7353642042037294e-09, + "loss": 0.77650177, + "num_input_tokens_seen": 350544185, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.1126709, + "step": 16243, + "time_per_iteration": 2.4731767177581787 + }, + { + "auxiliary_loss_clip": 0.01112005, + "auxiliary_loss_mlp": 0.01037442, + "balance_loss_clip": 1.03972852, + "balance_loss_mlp": 1.02488923, + "epoch": 0.976642116338494, + "flos": 20266833859200.0, + "grad_norm": 1.6857786643064894, + "language_loss": 0.70066869, + "learning_rate": 5.705928383713754e-09, + "loss": 0.72216314, + "num_input_tokens_seen": 350562675, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.12561035, + "step": 16244, + "time_per_iteration": 2.486259698867798 + }, + { + "auxiliary_loss_clip": 0.01118195, + "auxiliary_loss_mlp": 0.0103047, + "balance_loss_clip": 1.04523039, + "balance_loss_mlp": 1.01823926, + "epoch": 0.9767022395911619, + "flos": 25550477769600.0, + "grad_norm": 1.9252974915167675, + "language_loss": 0.83535111, + "learning_rate": 5.676568187055197e-09, + "loss": 0.85683775, + "num_input_tokens_seen": 350581535, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.12243652, + "step": 16245, + "time_per_iteration": 2.4932422637939453 + }, + { + "auxiliary_loss_clip": 0.01112288, + "auxiliary_loss_mlp": 0.01026752, + "balance_loss_clip": 1.04151857, + "balance_loss_mlp": 1.01537967, + "epoch": 0.9767623628438299, + "flos": 21762656858880.0, + "grad_norm": 1.387751449825575, + "language_loss": 0.78534722, + "learning_rate": 5.647283615340726e-09, + "loss": 0.8067376, + "num_input_tokens_seen": 350601615, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.11376953, + "step": 16246, + "time_per_iteration": 2.446610927581787 + }, + { + "auxiliary_loss_clip": 0.01113019, + "auxiliary_loss_mlp": 0.01030807, + "balance_loss_clip": 1.04798985, + "balance_loss_mlp": 1.02063274, + "epoch": 0.9768224860964978, + "flos": 15851114277120.0, + "grad_norm": 1.4453719934213403, + "language_loss": 0.74224436, + "learning_rate": 5.6180746696812275e-09, + "loss": 0.7636826, + "num_input_tokens_seen": 350619580, + "router_z_loss_clip": 0.64941406, + "router_z_loss_mlp": 0.10168457, + "step": 16247, + "time_per_iteration": 3.89070987701416 + }, + { + "auxiliary_loss_clip": 0.01108766, + "auxiliary_loss_mlp": 0.01028897, + "balance_loss_clip": 1.03778505, + "balance_loss_mlp": 1.01708925, + "epoch": 0.9768826093491658, + "flos": 25151151294720.0, + "grad_norm": 1.4878729335595717, + "language_loss": 0.80114901, + "learning_rate": 5.58894135118404e-09, + "loss": 0.82252562, + "num_input_tokens_seen": 350640015, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11791992, + "step": 16248, + "time_per_iteration": 2.547696828842163 + }, + { + "auxiliary_loss_clip": 0.01126789, + "auxiliary_loss_mlp": 0.0104327, + "balance_loss_clip": 1.05047572, + "balance_loss_mlp": 1.0300914, + "epoch": 0.9769427326018337, + "flos": 22967028904320.0, + "grad_norm": 1.9305583143605627, + "language_loss": 0.79129767, + "learning_rate": 5.559883660954278e-09, + "loss": 0.81299824, + "num_input_tokens_seen": 350659155, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.1317749, + "step": 16249, + "time_per_iteration": 2.5159757137298584 + }, + { + "auxiliary_loss_clip": 0.01117412, + "auxiliary_loss_mlp": 0.01036464, + "balance_loss_clip": 1.04412961, + "balance_loss_mlp": 1.02425075, + "epoch": 0.9770028558545018, + "flos": 15264297786240.0, + "grad_norm": 1.8353664021740073, + "language_loss": 0.66990954, + "learning_rate": 5.530901600093507e-09, + "loss": 0.69144827, + "num_input_tokens_seen": 350676615, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.12213135, + "step": 16250, + "time_per_iteration": 2.436631917953491 + }, + { + "auxiliary_loss_clip": 0.01036328, + "auxiliary_loss_mlp": 0.01000605, + "balance_loss_clip": 1.01136208, + "balance_loss_mlp": 0.99928045, + "epoch": 0.9770629791071697, + "flos": 71450348808960.0, + "grad_norm": 0.7752295799484655, + "language_loss": 0.59830797, + "learning_rate": 5.501995169700846e-09, + "loss": 0.61867738, + "num_input_tokens_seen": 350736805, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.01324463, + "step": 16251, + "time_per_iteration": 3.1242430210113525 + }, + { + "auxiliary_loss_clip": 0.01113423, + "auxiliary_loss_mlp": 0.01031042, + "balance_loss_clip": 1.04054618, + "balance_loss_mlp": 1.0190438, + "epoch": 0.9771231023598377, + "flos": 22412854897920.0, + "grad_norm": 1.9547612360385491, + "language_loss": 0.78841734, + "learning_rate": 5.473164370872307e-09, + "loss": 0.80986196, + "num_input_tokens_seen": 350753600, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.12005615, + "step": 16252, + "time_per_iteration": 2.490496873855591 + }, + { + "auxiliary_loss_clip": 0.01116384, + "auxiliary_loss_mlp": 0.01030088, + "balance_loss_clip": 1.0432539, + "balance_loss_mlp": 1.01817286, + "epoch": 0.9771832256125056, + "flos": 19025940660480.0, + "grad_norm": 2.9696409453380643, + "language_loss": 0.6479547, + "learning_rate": 5.444409204701461e-09, + "loss": 0.66941941, + "num_input_tokens_seen": 350771225, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11914062, + "step": 16253, + "time_per_iteration": 2.5131113529205322 + }, + { + "auxiliary_loss_clip": 0.01122196, + "auxiliary_loss_mlp": 0.01029422, + "balance_loss_clip": 1.04869378, + "balance_loss_mlp": 1.01567125, + "epoch": 0.9772433488651736, + "flos": 17822143232640.0, + "grad_norm": 2.2790737269858488, + "language_loss": 0.76626205, + "learning_rate": 5.415729672278324e-09, + "loss": 0.78777826, + "num_input_tokens_seen": 350789100, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.13739014, + "step": 16254, + "time_per_iteration": 2.433643102645874 + }, + { + "auxiliary_loss_clip": 0.01120709, + "auxiliary_loss_mlp": 0.01031112, + "balance_loss_clip": 1.04781139, + "balance_loss_mlp": 1.01925087, + "epoch": 0.9773034721178415, + "flos": 37629785623680.0, + "grad_norm": 1.743864974931917, + "language_loss": 0.64078313, + "learning_rate": 5.387125774690471e-09, + "loss": 0.66230136, + "num_input_tokens_seen": 350811085, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11846924, + "step": 16255, + "time_per_iteration": 2.6250782012939453 + }, + { + "auxiliary_loss_clip": 0.01109424, + "auxiliary_loss_mlp": 0.01034224, + "balance_loss_clip": 1.03587437, + "balance_loss_mlp": 1.02145052, + "epoch": 0.9773635953705095, + "flos": 20302457172480.0, + "grad_norm": 1.654521142811564, + "language_loss": 0.75545371, + "learning_rate": 5.358597513023033e-09, + "loss": 0.77689016, + "num_input_tokens_seen": 350831065, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.12774658, + "step": 16256, + "time_per_iteration": 2.5343663692474365 + }, + { + "auxiliary_loss_clip": 0.01116299, + "auxiliary_loss_mlp": 0.01034326, + "balance_loss_clip": 1.04592597, + "balance_loss_mlp": 1.02170813, + "epoch": 0.9774237186231776, + "flos": 22309253095680.0, + "grad_norm": 2.4999473702002337, + "language_loss": 0.78031802, + "learning_rate": 5.330144888357369e-09, + "loss": 0.80182421, + "num_input_tokens_seen": 350849675, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.12609863, + "step": 16257, + "time_per_iteration": 2.4567675590515137 + }, + { + "auxiliary_loss_clip": 0.01107125, + "auxiliary_loss_mlp": 0.01028685, + "balance_loss_clip": 1.03697622, + "balance_loss_mlp": 1.01660943, + "epoch": 0.9774838418758455, + "flos": 24204905360640.0, + "grad_norm": 1.6043666697064325, + "language_loss": 0.74866617, + "learning_rate": 5.301767901772391e-09, + "loss": 0.7700243, + "num_input_tokens_seen": 350868955, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.1206665, + "step": 16258, + "time_per_iteration": 2.460075616836548 + }, + { + "auxiliary_loss_clip": 0.01042116, + "auxiliary_loss_mlp": 0.01005746, + "balance_loss_clip": 1.01681197, + "balance_loss_mlp": 1.00440335, + "epoch": 0.9775439651285135, + "flos": 66357139829760.0, + "grad_norm": 0.6782263044401055, + "language_loss": 0.59761649, + "learning_rate": 5.273466554344353e-09, + "loss": 0.61809516, + "num_input_tokens_seen": 350935110, + "router_z_loss_clip": 0.25244141, + "router_z_loss_mlp": 0.01342773, + "step": 16259, + "time_per_iteration": 3.121438980102539 + }, + { + "auxiliary_loss_clip": 0.01121069, + "auxiliary_loss_mlp": 0.01031692, + "balance_loss_clip": 1.04753613, + "balance_loss_mlp": 1.01921034, + "epoch": 0.9776040883811814, + "flos": 22601565976320.0, + "grad_norm": 1.617481001946991, + "language_loss": 0.73504323, + "learning_rate": 5.2452408471461705e-09, + "loss": 0.75657082, + "num_input_tokens_seen": 350953220, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.12481689, + "step": 16260, + "time_per_iteration": 2.4435760974884033 + }, + { + "auxiliary_loss_clip": 0.01113124, + "auxiliary_loss_mlp": 0.01034848, + "balance_loss_clip": 1.04118061, + "balance_loss_mlp": 1.02281404, + "epoch": 0.9776642116338494, + "flos": 18442176825600.0, + "grad_norm": 2.5472936245380606, + "language_loss": 0.79329944, + "learning_rate": 5.2170907812485456e-09, + "loss": 0.81477904, + "num_input_tokens_seen": 350971915, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.12042236, + "step": 16261, + "time_per_iteration": 2.4273252487182617 + }, + { + "auxiliary_loss_clip": 0.01107211, + "auxiliary_loss_mlp": 0.01024567, + "balance_loss_clip": 1.03535056, + "balance_loss_mlp": 1.01267612, + "epoch": 0.9777243348865173, + "flos": 22638446265600.0, + "grad_norm": 3.0475830339727885, + "language_loss": 0.74595612, + "learning_rate": 5.189016357718845e-09, + "loss": 0.7672739, + "num_input_tokens_seen": 350990470, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11895752, + "step": 16262, + "time_per_iteration": 2.524061679840088 + }, + { + "auxiliary_loss_clip": 0.01117723, + "auxiliary_loss_mlp": 0.01029633, + "balance_loss_clip": 1.04268765, + "balance_loss_mlp": 1.01626348, + "epoch": 0.9777844581391854, + "flos": 31321394605440.0, + "grad_norm": 2.2496554270479905, + "language_loss": 0.70308781, + "learning_rate": 5.16101757762133e-09, + "loss": 0.72456133, + "num_input_tokens_seen": 351010755, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.13372803, + "step": 16263, + "time_per_iteration": 2.504542589187622 + }, + { + "auxiliary_loss_clip": 0.01113675, + "auxiliary_loss_mlp": 0.01026129, + "balance_loss_clip": 1.04194212, + "balance_loss_mlp": 1.01505423, + "epoch": 0.9778445813918533, + "flos": 23039101543680.0, + "grad_norm": 1.629824572137664, + "language_loss": 0.66411495, + "learning_rate": 5.133094442018038e-09, + "loss": 0.68551302, + "num_input_tokens_seen": 351029965, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11071777, + "step": 16264, + "time_per_iteration": 2.452399492263794 + }, + { + "auxiliary_loss_clip": 0.0111561, + "auxiliary_loss_mlp": 0.01032358, + "balance_loss_clip": 1.04019511, + "balance_loss_mlp": 1.01841664, + "epoch": 0.9779047046445213, + "flos": 17566351505280.0, + "grad_norm": 1.84047592785009, + "language_loss": 0.73164964, + "learning_rate": 5.105246951967679e-09, + "loss": 0.7531293, + "num_input_tokens_seen": 351046205, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.1394043, + "step": 16265, + "time_per_iteration": 2.5281484127044678 + }, + { + "auxiliary_loss_clip": 0.01109483, + "auxiliary_loss_mlp": 0.01030526, + "balance_loss_clip": 1.03992391, + "balance_loss_mlp": 1.0187602, + "epoch": 0.9779648278971892, + "flos": 20741141975040.0, + "grad_norm": 2.9751393815983715, + "language_loss": 0.68324548, + "learning_rate": 5.077475108526297e-09, + "loss": 0.70464557, + "num_input_tokens_seen": 351065390, + "router_z_loss_clip": 0.69482422, + "router_z_loss_mlp": 0.11773682, + "step": 16266, + "time_per_iteration": 2.4847841262817383 + }, + { + "auxiliary_loss_clip": 0.01105389, + "auxiliary_loss_mlp": 0.01028116, + "balance_loss_clip": 1.03716886, + "balance_loss_mlp": 1.01785231, + "epoch": 0.9780249511498572, + "flos": 21026954494080.0, + "grad_norm": 1.7677473581442742, + "language_loss": 0.8681078, + "learning_rate": 5.049778912747049e-09, + "loss": 0.8894428, + "num_input_tokens_seen": 351084355, + "router_z_loss_clip": 0.68164062, + "router_z_loss_mlp": 0.10266113, + "step": 16267, + "time_per_iteration": 2.493286609649658 + }, + { + "auxiliary_loss_clip": 0.01111504, + "auxiliary_loss_mlp": 0.01027364, + "balance_loss_clip": 1.03776288, + "balance_loss_mlp": 1.0149008, + "epoch": 0.9780850744025251, + "flos": 30774223751040.0, + "grad_norm": 1.8036900415023691, + "language_loss": 0.70075113, + "learning_rate": 5.022158365679985e-09, + "loss": 0.72213984, + "num_input_tokens_seen": 351105870, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.12469482, + "step": 16268, + "time_per_iteration": 2.543156385421753 + }, + { + "auxiliary_loss_clip": 0.01120936, + "auxiliary_loss_mlp": 0.01031809, + "balance_loss_clip": 1.04481602, + "balance_loss_mlp": 1.02034056, + "epoch": 0.9781451976551931, + "flos": 20302995876480.0, + "grad_norm": 1.5890027026507072, + "language_loss": 0.73773301, + "learning_rate": 4.994613468372711e-09, + "loss": 0.75926048, + "num_input_tokens_seen": 351124760, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.11462402, + "step": 16269, + "time_per_iteration": 3.8785204887390137 + }, + { + "auxiliary_loss_clip": 0.0111468, + "auxiliary_loss_mlp": 0.01026304, + "balance_loss_clip": 1.04198325, + "balance_loss_mlp": 1.01375663, + "epoch": 0.9782053209078612, + "flos": 24316479982080.0, + "grad_norm": 1.9445215983813815, + "language_loss": 0.70904535, + "learning_rate": 4.967144221869501e-09, + "loss": 0.73045522, + "num_input_tokens_seen": 351142820, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.12548828, + "step": 16270, + "time_per_iteration": 2.4908382892608643 + }, + { + "auxiliary_loss_clip": 0.01125097, + "auxiliary_loss_mlp": 0.01032204, + "balance_loss_clip": 1.05106068, + "balance_loss_mlp": 1.02035451, + "epoch": 0.9782654441605291, + "flos": 32489425065600.0, + "grad_norm": 2.311356665420141, + "language_loss": 0.63899302, + "learning_rate": 4.939750627212191e-09, + "loss": 0.66056609, + "num_input_tokens_seen": 351164805, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11859131, + "step": 16271, + "time_per_iteration": 2.5271053314208984 + }, + { + "auxiliary_loss_clip": 0.01114007, + "auxiliary_loss_mlp": 0.01031941, + "balance_loss_clip": 1.04577911, + "balance_loss_mlp": 1.01996064, + "epoch": 0.9783255674131971, + "flos": 26979076465920.0, + "grad_norm": 1.501146984028184, + "language_loss": 0.70547426, + "learning_rate": 4.912432685439505e-09, + "loss": 0.72693378, + "num_input_tokens_seen": 351187005, + "router_z_loss_clip": 0.68212891, + "router_z_loss_mlp": 0.11975098, + "step": 16272, + "time_per_iteration": 2.6496448516845703 + }, + { + "auxiliary_loss_clip": 0.01117777, + "auxiliary_loss_mlp": 0.01035396, + "balance_loss_clip": 1.04423392, + "balance_loss_mlp": 1.02318263, + "epoch": 0.978385690665865, + "flos": 23112251591040.0, + "grad_norm": 2.6477130552907595, + "language_loss": 0.66661704, + "learning_rate": 4.88519039758728e-09, + "loss": 0.6881488, + "num_input_tokens_seen": 351208450, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.12207031, + "step": 16273, + "time_per_iteration": 2.521707057952881 + }, + { + "auxiliary_loss_clip": 0.01115335, + "auxiliary_loss_mlp": 0.01022494, + "balance_loss_clip": 1.04187799, + "balance_loss_mlp": 1.00988173, + "epoch": 0.978445813918533, + "flos": 25409672455680.0, + "grad_norm": 1.6071794990458306, + "language_loss": 0.73937631, + "learning_rate": 4.85802376468869e-09, + "loss": 0.76075459, + "num_input_tokens_seen": 351229585, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.12615967, + "step": 16274, + "time_per_iteration": 2.5040764808654785 + }, + { + "auxiliary_loss_clip": 0.01113639, + "auxiliary_loss_mlp": 0.0103283, + "balance_loss_clip": 1.04204488, + "balance_loss_mlp": 1.02134991, + "epoch": 0.9785059371712009, + "flos": 23550218121600.0, + "grad_norm": 1.594459242543973, + "language_loss": 0.77592546, + "learning_rate": 4.830932787773579e-09, + "loss": 0.7973901, + "num_input_tokens_seen": 351249525, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11474609, + "step": 16275, + "time_per_iteration": 2.4987499713897705 + }, + { + "auxiliary_loss_clip": 0.01113891, + "auxiliary_loss_mlp": 0.01030806, + "balance_loss_clip": 1.03946114, + "balance_loss_mlp": 1.01850903, + "epoch": 0.978566060423869, + "flos": 34351177870080.0, + "grad_norm": 1.6647365012499753, + "language_loss": 0.71130073, + "learning_rate": 4.803917467869567e-09, + "loss": 0.73274773, + "num_input_tokens_seen": 351272530, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.1229248, + "step": 16276, + "time_per_iteration": 2.5575990676879883 + }, + { + "auxiliary_loss_clip": 0.01107066, + "auxiliary_loss_mlp": 0.0103001, + "balance_loss_clip": 1.0379262, + "balance_loss_mlp": 1.01898265, + "epoch": 0.9786261836765369, + "flos": 11618862387840.0, + "grad_norm": 2.2755419685217806, + "language_loss": 0.85826343, + "learning_rate": 4.776977806000726e-09, + "loss": 0.87963414, + "num_input_tokens_seen": 351288530, + "router_z_loss_clip": 0.69091797, + "router_z_loss_mlp": 0.11022949, + "step": 16277, + "time_per_iteration": 2.5133419036865234 + }, + { + "auxiliary_loss_clip": 0.01112009, + "auxiliary_loss_mlp": 0.0103579, + "balance_loss_clip": 1.04096711, + "balance_loss_mlp": 1.02336788, + "epoch": 0.9786863069292049, + "flos": 17420949250560.0, + "grad_norm": 2.3155107698624935, + "language_loss": 0.71129757, + "learning_rate": 4.7501138031891264e-09, + "loss": 0.73277557, + "num_input_tokens_seen": 351305890, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.12420654, + "step": 16278, + "time_per_iteration": 2.43146014213562 + }, + { + "auxiliary_loss_clip": 0.01113096, + "auxiliary_loss_mlp": 0.010248, + "balance_loss_clip": 1.04299831, + "balance_loss_mlp": 1.01275349, + "epoch": 0.9787464301818728, + "flos": 20844923345280.0, + "grad_norm": 1.9312279135597632, + "language_loss": 0.84592992, + "learning_rate": 4.723325460453065e-09, + "loss": 0.86730891, + "num_input_tokens_seen": 351325010, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.12036133, + "step": 16279, + "time_per_iteration": 3.8010153770446777 + }, + { + "auxiliary_loss_clip": 0.01117822, + "auxiliary_loss_mlp": 0.01029139, + "balance_loss_clip": 1.0427314, + "balance_loss_mlp": 1.01663435, + "epoch": 0.9788065534345408, + "flos": 18222942165120.0, + "grad_norm": 1.9564982808386042, + "language_loss": 0.79145861, + "learning_rate": 4.696612778808395e-09, + "loss": 0.8129282, + "num_input_tokens_seen": 351343060, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.12506104, + "step": 16280, + "time_per_iteration": 2.4248151779174805 + }, + { + "auxiliary_loss_clip": 0.01114265, + "auxiliary_loss_mlp": 0.01034261, + "balance_loss_clip": 1.04632664, + "balance_loss_mlp": 1.02412271, + "epoch": 0.9788666766872087, + "flos": 21578219498880.0, + "grad_norm": 1.9137798703381073, + "language_loss": 0.79358399, + "learning_rate": 4.669975759268085e-09, + "loss": 0.81506926, + "num_input_tokens_seen": 351363260, + "router_z_loss_clip": 0.67919922, + "router_z_loss_mlp": 0.10144043, + "step": 16281, + "time_per_iteration": 2.5001490116119385 + }, + { + "auxiliary_loss_clip": 0.01116772, + "auxiliary_loss_mlp": 0.0103261, + "balance_loss_clip": 1.04548287, + "balance_loss_mlp": 1.02069521, + "epoch": 0.9789267999398767, + "flos": 24900495212160.0, + "grad_norm": 1.7280293012174244, + "language_loss": 0.80159104, + "learning_rate": 4.643414402842216e-09, + "loss": 0.82308483, + "num_input_tokens_seen": 351382610, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.11920166, + "step": 16282, + "time_per_iteration": 2.449801206588745 + }, + { + "auxiliary_loss_clip": 0.01117783, + "auxiliary_loss_mlp": 0.01034624, + "balance_loss_clip": 1.04387379, + "balance_loss_mlp": 1.02230382, + "epoch": 0.9789869231925448, + "flos": 19573111514880.0, + "grad_norm": 2.007523620338006, + "language_loss": 0.83170146, + "learning_rate": 4.616928710538204e-09, + "loss": 0.85322559, + "num_input_tokens_seen": 351401075, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.12316895, + "step": 16283, + "time_per_iteration": 2.4177002906799316 + }, + { + "auxiliary_loss_clip": 0.01111038, + "auxiliary_loss_mlp": 0.01033155, + "balance_loss_clip": 1.03954399, + "balance_loss_mlp": 1.02158546, + "epoch": 0.9790470464452127, + "flos": 16796641939200.0, + "grad_norm": 2.4761339544814778, + "language_loss": 0.71772885, + "learning_rate": 4.590518683360134e-09, + "loss": 0.73917073, + "num_input_tokens_seen": 351419275, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11560059, + "step": 16284, + "time_per_iteration": 2.3944950103759766 + }, + { + "auxiliary_loss_clip": 0.01108018, + "auxiliary_loss_mlp": 0.01031634, + "balance_loss_clip": 1.03988576, + "balance_loss_mlp": 1.02082801, + "epoch": 0.9791071696978807, + "flos": 18369350000640.0, + "grad_norm": 1.990060972609676, + "language_loss": 0.64523834, + "learning_rate": 4.56418432230965e-09, + "loss": 0.66663486, + "num_input_tokens_seen": 351437375, + "router_z_loss_clip": 0.68164062, + "router_z_loss_mlp": 0.10803223, + "step": 16285, + "time_per_iteration": 3.9699926376342773 + }, + { + "auxiliary_loss_clip": 0.01106285, + "auxiliary_loss_mlp": 0.01029207, + "balance_loss_clip": 1.03658533, + "balance_loss_mlp": 1.01680899, + "epoch": 0.9791672929505486, + "flos": 24170323541760.0, + "grad_norm": 1.9064364827426, + "language_loss": 0.70650274, + "learning_rate": 4.537925628385286e-09, + "loss": 0.72785765, + "num_input_tokens_seen": 351457810, + "router_z_loss_clip": 0.69580078, + "router_z_loss_mlp": 0.12402344, + "step": 16286, + "time_per_iteration": 2.470313310623169 + }, + { + "auxiliary_loss_clip": 0.01116874, + "auxiliary_loss_mlp": 0.01027387, + "balance_loss_clip": 1.04559851, + "balance_loss_mlp": 1.01640189, + "epoch": 0.9792274162032166, + "flos": 24354114456960.0, + "grad_norm": 1.4224636431004962, + "language_loss": 0.58175862, + "learning_rate": 4.511742602582691e-09, + "loss": 0.60320127, + "num_input_tokens_seen": 351478825, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.10992432, + "step": 16287, + "time_per_iteration": 2.485045909881592 + }, + { + "auxiliary_loss_clip": 0.01119605, + "auxiliary_loss_mlp": 0.01034777, + "balance_loss_clip": 1.0476625, + "balance_loss_mlp": 1.02253997, + "epoch": 0.9792875394558845, + "flos": 26395779507840.0, + "grad_norm": 1.690914544346633, + "language_loss": 0.81708395, + "learning_rate": 4.485635245894626e-09, + "loss": 0.83862782, + "num_input_tokens_seen": 351498785, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.12243652, + "step": 16288, + "time_per_iteration": 2.5005691051483154 + }, + { + "auxiliary_loss_clip": 0.01109218, + "auxiliary_loss_mlp": 0.01024665, + "balance_loss_clip": 1.03695345, + "balance_loss_mlp": 1.01291084, + "epoch": 0.9793476627085526, + "flos": 28148004766080.0, + "grad_norm": 1.392258476244431, + "language_loss": 0.72059458, + "learning_rate": 4.459603559311631e-09, + "loss": 0.74193347, + "num_input_tokens_seen": 351520235, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11761475, + "step": 16289, + "time_per_iteration": 2.5247769355773926 + }, + { + "auxiliary_loss_clip": 0.01115169, + "auxiliary_loss_mlp": 0.01034407, + "balance_loss_clip": 1.04429042, + "balance_loss_mlp": 1.02232516, + "epoch": 0.9794077859612205, + "flos": 16763927627520.0, + "grad_norm": 2.3214478210434484, + "language_loss": 0.75886822, + "learning_rate": 4.43364754382003e-09, + "loss": 0.78036392, + "num_input_tokens_seen": 351538900, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.12084961, + "step": 16290, + "time_per_iteration": 2.4582204818725586 + }, + { + "auxiliary_loss_clip": 0.01109538, + "auxiliary_loss_mlp": 0.01035847, + "balance_loss_clip": 1.0371058, + "balance_loss_mlp": 1.02160144, + "epoch": 0.9794679092138885, + "flos": 19280834547840.0, + "grad_norm": 1.7343656220243402, + "language_loss": 0.67376578, + "learning_rate": 4.4077672004048105e-09, + "loss": 0.69521964, + "num_input_tokens_seen": 351558715, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.14233398, + "step": 16291, + "time_per_iteration": 3.876847743988037 + }, + { + "auxiliary_loss_clip": 0.01110176, + "auxiliary_loss_mlp": 0.01024854, + "balance_loss_clip": 1.03630912, + "balance_loss_mlp": 1.01255751, + "epoch": 0.9795280324665564, + "flos": 32156640535680.0, + "grad_norm": 1.8128349378212674, + "language_loss": 0.62917674, + "learning_rate": 4.3819625300467456e-09, + "loss": 0.65052712, + "num_input_tokens_seen": 351578450, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.12286377, + "step": 16292, + "time_per_iteration": 2.538930892944336 + }, + { + "auxiliary_loss_clip": 0.01109894, + "auxiliary_loss_mlp": 0.01032311, + "balance_loss_clip": 1.03765619, + "balance_loss_mlp": 1.02082539, + "epoch": 0.9795881557192244, + "flos": 19060953442560.0, + "grad_norm": 1.9562356859385313, + "language_loss": 0.73369849, + "learning_rate": 4.356233533724829e-09, + "loss": 0.75512052, + "num_input_tokens_seen": 351597195, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11480713, + "step": 16293, + "time_per_iteration": 2.438706398010254 + }, + { + "auxiliary_loss_clip": 0.01116243, + "auxiliary_loss_mlp": 0.01027083, + "balance_loss_clip": 1.04231024, + "balance_loss_mlp": 1.01522803, + "epoch": 0.9796482789718923, + "flos": 28329928174080.0, + "grad_norm": 1.6283082469394308, + "language_loss": 0.84176171, + "learning_rate": 4.330580212414503e-09, + "loss": 0.863195, + "num_input_tokens_seen": 351617460, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.11846924, + "step": 16294, + "time_per_iteration": 2.5112152099609375 + }, + { + "auxiliary_loss_clip": 0.01106078, + "auxiliary_loss_mlp": 0.0102907, + "balance_loss_clip": 1.03833508, + "balance_loss_mlp": 1.01832318, + "epoch": 0.9797084022245603, + "flos": 17967976450560.0, + "grad_norm": 2.065854545431599, + "language_loss": 0.7154398, + "learning_rate": 4.305002567088767e-09, + "loss": 0.73679125, + "num_input_tokens_seen": 351635900, + "router_z_loss_clip": 0.67675781, + "router_z_loss_mlp": 0.10742188, + "step": 16295, + "time_per_iteration": 2.440929651260376 + }, + { + "auxiliary_loss_clip": 0.0111819, + "auxiliary_loss_mlp": 0.01029943, + "balance_loss_clip": 1.04349947, + "balance_loss_mlp": 1.01806366, + "epoch": 0.9797685254772284, + "flos": 20266726118400.0, + "grad_norm": 1.7945738178711297, + "language_loss": 0.80740535, + "learning_rate": 4.2795005987170674e-09, + "loss": 0.82888669, + "num_input_tokens_seen": 351655400, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.11889648, + "step": 16296, + "time_per_iteration": 2.4990687370300293 + }, + { + "auxiliary_loss_clip": 0.01109538, + "auxiliary_loss_mlp": 0.0103237, + "balance_loss_clip": 1.03875446, + "balance_loss_mlp": 1.02100933, + "epoch": 0.9798286487298963, + "flos": 26907147480960.0, + "grad_norm": 2.3023205445585258, + "language_loss": 0.75875938, + "learning_rate": 4.254074308266853e-09, + "loss": 0.78017843, + "num_input_tokens_seen": 351675505, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11364746, + "step": 16297, + "time_per_iteration": 2.5287184715270996 + }, + { + "auxiliary_loss_clip": 0.01115037, + "auxiliary_loss_mlp": 0.01034257, + "balance_loss_clip": 1.04028416, + "balance_loss_mlp": 1.02219343, + "epoch": 0.9798887719825643, + "flos": 27161071701120.0, + "grad_norm": 1.5980468351590944, + "language_loss": 0.78284454, + "learning_rate": 4.228723696702019e-09, + "loss": 0.8043375, + "num_input_tokens_seen": 351697920, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.12060547, + "step": 16298, + "time_per_iteration": 2.5101211071014404 + }, + { + "auxiliary_loss_clip": 0.01103036, + "auxiliary_loss_mlp": 0.01025236, + "balance_loss_clip": 1.03582561, + "balance_loss_mlp": 1.01459086, + "epoch": 0.9799488952352322, + "flos": 20668422890880.0, + "grad_norm": 1.6123680027796647, + "language_loss": 0.72704148, + "learning_rate": 4.203448764984019e-09, + "loss": 0.74832416, + "num_input_tokens_seen": 351717615, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.10644531, + "step": 16299, + "time_per_iteration": 2.4772517681121826 + }, + { + "auxiliary_loss_clip": 0.01113061, + "auxiliary_loss_mlp": 0.01027202, + "balance_loss_clip": 1.03975487, + "balance_loss_mlp": 1.01500654, + "epoch": 0.9800090184879002, + "flos": 21981209160960.0, + "grad_norm": 3.195794799878242, + "language_loss": 0.89229167, + "learning_rate": 4.178249514071419e-09, + "loss": 0.91369426, + "num_input_tokens_seen": 351735260, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.12200928, + "step": 16300, + "time_per_iteration": 2.511932611465454 + }, + { + "auxiliary_loss_clip": 0.01112272, + "auxiliary_loss_mlp": 0.01028535, + "balance_loss_clip": 1.03869581, + "balance_loss_mlp": 1.0166204, + "epoch": 0.9800691417405681, + "flos": 21288420570240.0, + "grad_norm": 2.565117746402517, + "language_loss": 0.78197789, + "learning_rate": 4.1531259449194555e-09, + "loss": 0.80338591, + "num_input_tokens_seen": 351755800, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11907959, + "step": 16301, + "time_per_iteration": 2.5277647972106934 + }, + { + "auxiliary_loss_clip": 0.01113286, + "auxiliary_loss_mlp": 0.01033536, + "balance_loss_clip": 1.03976774, + "balance_loss_mlp": 1.0208286, + "epoch": 0.9801292649932362, + "flos": 18439878355200.0, + "grad_norm": 2.23285203739004, + "language_loss": 0.74978757, + "learning_rate": 4.128078058480921e-09, + "loss": 0.77125579, + "num_input_tokens_seen": 351774790, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.12695312, + "step": 16302, + "time_per_iteration": 2.4275553226470947 + }, + { + "auxiliary_loss_clip": 0.01123262, + "auxiliary_loss_mlp": 0.01025733, + "balance_loss_clip": 1.05063438, + "balance_loss_mlp": 1.01380575, + "epoch": 0.9801893882459041, + "flos": 25046364343680.0, + "grad_norm": 1.6979226980418531, + "language_loss": 0.79905427, + "learning_rate": 4.103105855705724e-09, + "loss": 0.82054424, + "num_input_tokens_seen": 351792855, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11932373, + "step": 16303, + "time_per_iteration": 2.4632532596588135 + }, + { + "auxiliary_loss_clip": 0.01119002, + "auxiliary_loss_mlp": 0.01031906, + "balance_loss_clip": 1.04482627, + "balance_loss_mlp": 1.01968706, + "epoch": 0.9802495114985721, + "flos": 18511484117760.0, + "grad_norm": 2.2732997401054904, + "language_loss": 0.82925177, + "learning_rate": 4.078209337540883e-09, + "loss": 0.85076082, + "num_input_tokens_seen": 351811450, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.12219238, + "step": 16304, + "time_per_iteration": 2.4166979789733887 + }, + { + "auxiliary_loss_clip": 0.01114729, + "auxiliary_loss_mlp": 0.01027232, + "balance_loss_clip": 1.04661894, + "balance_loss_mlp": 1.01680112, + "epoch": 0.98030963475124, + "flos": 21469841187840.0, + "grad_norm": 1.8843867429826104, + "language_loss": 0.70328468, + "learning_rate": 4.053388504930089e-09, + "loss": 0.72470427, + "num_input_tokens_seen": 351831960, + "router_z_loss_clip": 0.68066406, + "router_z_loss_mlp": 0.10424805, + "step": 16305, + "time_per_iteration": 2.4925239086151123 + }, + { + "auxiliary_loss_clip": 0.01116748, + "auxiliary_loss_mlp": 0.0103213, + "balance_loss_clip": 1.04337335, + "balance_loss_mlp": 1.02001238, + "epoch": 0.980369758003908, + "flos": 20412272027520.0, + "grad_norm": 1.947088529718304, + "language_loss": 0.72019988, + "learning_rate": 4.028643358815032e-09, + "loss": 0.74168861, + "num_input_tokens_seen": 351851585, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.12115479, + "step": 16306, + "time_per_iteration": 2.4613444805145264 + }, + { + "auxiliary_loss_clip": 0.01107678, + "auxiliary_loss_mlp": 0.01033982, + "balance_loss_clip": 1.03852665, + "balance_loss_mlp": 1.02243638, + "epoch": 0.9804298812565759, + "flos": 23399177431680.0, + "grad_norm": 1.5386404824067863, + "language_loss": 0.73591924, + "learning_rate": 4.00397390013385e-09, + "loss": 0.75733578, + "num_input_tokens_seen": 351871085, + "router_z_loss_clip": 0.68994141, + "router_z_loss_mlp": 0.11541748, + "step": 16307, + "time_per_iteration": 2.4405007362365723 + }, + { + "auxiliary_loss_clip": 0.01108836, + "auxiliary_loss_mlp": 0.01028199, + "balance_loss_clip": 1.04264057, + "balance_loss_mlp": 1.01816773, + "epoch": 0.980490004509244, + "flos": 23292666627840.0, + "grad_norm": 1.504185849413808, + "language_loss": 0.74700439, + "learning_rate": 3.979380129822018e-09, + "loss": 0.76837474, + "num_input_tokens_seen": 351891775, + "router_z_loss_clip": 0.66162109, + "router_z_loss_mlp": 0.10028076, + "step": 16308, + "time_per_iteration": 2.4523653984069824 + }, + { + "auxiliary_loss_clip": 0.01031664, + "auxiliary_loss_mlp": 0.01001338, + "balance_loss_clip": 1.00745392, + "balance_loss_mlp": 1.00005651, + "epoch": 0.980550127761912, + "flos": 56051027798400.0, + "grad_norm": 0.7519634794774102, + "language_loss": 0.57778537, + "learning_rate": 3.954862048811902e-09, + "loss": 0.59811538, + "num_input_tokens_seen": 351946770, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 0.01281738, + "step": 16309, + "time_per_iteration": 2.9804024696350098 + }, + { + "auxiliary_loss_clip": 0.01112968, + "auxiliary_loss_mlp": 0.01030264, + "balance_loss_clip": 1.03998876, + "balance_loss_mlp": 1.01831317, + "epoch": 0.9806102510145799, + "flos": 25333290184320.0, + "grad_norm": 1.7407764258621055, + "language_loss": 0.66611689, + "learning_rate": 3.930419658033646e-09, + "loss": 0.68754923, + "num_input_tokens_seen": 351966155, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11950684, + "step": 16310, + "time_per_iteration": 2.47872257232666 + }, + { + "auxiliary_loss_clip": 0.01041175, + "auxiliary_loss_mlp": 0.01007145, + "balance_loss_clip": 1.01604879, + "balance_loss_mlp": 1.005795, + "epoch": 0.9806703742672479, + "flos": 67274837429760.0, + "grad_norm": 0.8180872733560783, + "language_loss": 0.54551113, + "learning_rate": 3.906052958413841e-09, + "loss": 0.56599438, + "num_input_tokens_seen": 352031655, + "router_z_loss_clip": 0.2512207, + "router_z_loss_mlp": 0.01350403, + "step": 16311, + "time_per_iteration": 3.1095221042633057 + }, + { + "auxiliary_loss_clip": 0.01116299, + "auxiliary_loss_mlp": 0.01026426, + "balance_loss_clip": 1.04465342, + "balance_loss_mlp": 1.01509488, + "epoch": 0.9807304975199158, + "flos": 25228970110080.0, + "grad_norm": 1.5644991612167165, + "language_loss": 0.79731762, + "learning_rate": 3.881761950876638e-09, + "loss": 0.8187449, + "num_input_tokens_seen": 352051920, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11352539, + "step": 16312, + "time_per_iteration": 2.474146604537964 + }, + { + "auxiliary_loss_clip": 0.01109487, + "auxiliary_loss_mlp": 0.01029144, + "balance_loss_clip": 1.04071736, + "balance_loss_mlp": 1.01851046, + "epoch": 0.9807906207725838, + "flos": 17456392995840.0, + "grad_norm": 2.3418694036776264, + "language_loss": 0.6304065, + "learning_rate": 3.8575466363430785e-09, + "loss": 0.65179276, + "num_input_tokens_seen": 352069315, + "router_z_loss_clip": 0.68798828, + "router_z_loss_mlp": 0.10632324, + "step": 16313, + "time_per_iteration": 3.8445212841033936 + }, + { + "auxiliary_loss_clip": 0.01117821, + "auxiliary_loss_mlp": 0.01028795, + "balance_loss_clip": 1.04729486, + "balance_loss_mlp": 1.01699948, + "epoch": 0.9808507440252517, + "flos": 21032413361280.0, + "grad_norm": 1.8002686254998879, + "language_loss": 0.72721791, + "learning_rate": 3.833407015731316e-09, + "loss": 0.74868405, + "num_input_tokens_seen": 352089480, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.11804199, + "step": 16314, + "time_per_iteration": 2.4975571632385254 + }, + { + "auxiliary_loss_clip": 0.01041094, + "auxiliary_loss_mlp": 0.01003726, + "balance_loss_clip": 1.01617074, + "balance_loss_mlp": 1.00242662, + "epoch": 0.9809108672779198, + "flos": 64044491598720.0, + "grad_norm": 0.6909559423392666, + "language_loss": 0.51682895, + "learning_rate": 3.80934308995684e-09, + "loss": 0.53727716, + "num_input_tokens_seen": 352150000, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.01298523, + "step": 16315, + "time_per_iteration": 3.1028637886047363 + }, + { + "auxiliary_loss_clip": 0.01111164, + "auxiliary_loss_mlp": 0.01027297, + "balance_loss_clip": 1.03974307, + "balance_loss_mlp": 1.01632965, + "epoch": 0.9809709905305877, + "flos": 22780616296320.0, + "grad_norm": 1.7254552715570353, + "language_loss": 0.6971972, + "learning_rate": 3.785354859932033e-09, + "loss": 0.7185818, + "num_input_tokens_seen": 352170990, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.10961914, + "step": 16316, + "time_per_iteration": 2.4478747844696045 + }, + { + "auxiliary_loss_clip": 0.01112082, + "auxiliary_loss_mlp": 0.01025514, + "balance_loss_clip": 1.03921413, + "balance_loss_mlp": 1.01441526, + "epoch": 0.9810311137832557, + "flos": 37013415217920.0, + "grad_norm": 1.6963520368582286, + "language_loss": 0.54988968, + "learning_rate": 3.76144232656661e-09, + "loss": 0.57126564, + "num_input_tokens_seen": 352195335, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11090088, + "step": 16317, + "time_per_iteration": 2.6310906410217285 + }, + { + "auxiliary_loss_clip": 0.01103175, + "auxiliary_loss_mlp": 0.01029136, + "balance_loss_clip": 1.03404188, + "balance_loss_mlp": 1.01808548, + "epoch": 0.9810912370359236, + "flos": 18916305373440.0, + "grad_norm": 1.6366941843058656, + "language_loss": 0.73521793, + "learning_rate": 3.737605490767404e-09, + "loss": 0.75654101, + "num_input_tokens_seen": 352214170, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 0.11047363, + "step": 16318, + "time_per_iteration": 2.482158899307251 + }, + { + "auxiliary_loss_clip": 0.01106516, + "auxiliary_loss_mlp": 0.01027129, + "balance_loss_clip": 1.03728878, + "balance_loss_mlp": 1.01615548, + "epoch": 0.9811513602885916, + "flos": 18441602208000.0, + "grad_norm": 2.1124225292801135, + "language_loss": 0.81888378, + "learning_rate": 3.7138443534383555e-09, + "loss": 0.84022021, + "num_input_tokens_seen": 352231470, + "router_z_loss_clip": 0.69189453, + "router_z_loss_mlp": 0.10980225, + "step": 16319, + "time_per_iteration": 2.477325439453125 + }, + { + "auxiliary_loss_clip": 0.01040167, + "auxiliary_loss_mlp": 0.01004671, + "balance_loss_clip": 1.01551068, + "balance_loss_mlp": 1.00336719, + "epoch": 0.9812114835412595, + "flos": 68058945371520.0, + "grad_norm": 0.7138203140736594, + "language_loss": 0.53569514, + "learning_rate": 3.6901589154803014e-09, + "loss": 0.55614352, + "num_input_tokens_seen": 352291770, + "router_z_loss_clip": 0.24658203, + "router_z_loss_mlp": 0.01304626, + "step": 16320, + "time_per_iteration": 2.984489917755127 + }, + { + "auxiliary_loss_clip": 0.01119626, + "auxiliary_loss_mlp": 0.01034711, + "balance_loss_clip": 1.04547834, + "balance_loss_mlp": 1.02278972, + "epoch": 0.9812716067939276, + "flos": 25373007648000.0, + "grad_norm": 1.896650333771669, + "language_loss": 0.73805302, + "learning_rate": 3.6665491777914116e-09, + "loss": 0.75959641, + "num_input_tokens_seen": 352310735, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11914062, + "step": 16321, + "time_per_iteration": 2.5510618686676025 + }, + { + "auxiliary_loss_clip": 0.01108317, + "auxiliary_loss_mlp": 0.01031341, + "balance_loss_clip": 1.03977227, + "balance_loss_mlp": 1.02012944, + "epoch": 0.9813317300465956, + "flos": 22856818999680.0, + "grad_norm": 1.5923613658261984, + "language_loss": 0.7857635, + "learning_rate": 3.6430151412669698e-09, + "loss": 0.80716008, + "num_input_tokens_seen": 352329545, + "router_z_loss_clip": 0.68505859, + "router_z_loss_mlp": 0.11218262, + "step": 16322, + "time_per_iteration": 4.010015487670898 + }, + { + "auxiliary_loss_clip": 0.01117034, + "auxiliary_loss_mlp": 0.01031365, + "balance_loss_clip": 1.04369664, + "balance_loss_mlp": 1.01942062, + "epoch": 0.9813918532992635, + "flos": 23586954756480.0, + "grad_norm": 1.5244090357984246, + "language_loss": 0.80945456, + "learning_rate": 3.619556806799595e-09, + "loss": 0.83093858, + "num_input_tokens_seen": 352352080, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.1194458, + "step": 16323, + "time_per_iteration": 2.4708454608917236 + }, + { + "auxiliary_loss_clip": 0.01117025, + "auxiliary_loss_mlp": 0.01041153, + "balance_loss_clip": 1.0415225, + "balance_loss_mlp": 1.02814138, + "epoch": 0.9814519765519315, + "flos": 19606328616960.0, + "grad_norm": 2.774530010579637, + "language_loss": 0.85289472, + "learning_rate": 3.596174175278799e-09, + "loss": 0.87447649, + "num_input_tokens_seen": 352366455, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.13012695, + "step": 16324, + "time_per_iteration": 2.4254372119903564 + }, + { + "auxiliary_loss_clip": 0.01120684, + "auxiliary_loss_mlp": 0.0102866, + "balance_loss_clip": 1.04863977, + "balance_loss_mlp": 1.01716828, + "epoch": 0.9815120998045994, + "flos": 33946284787200.0, + "grad_norm": 1.474945962407035, + "language_loss": 0.74706376, + "learning_rate": 3.5728672475909827e-09, + "loss": 0.76855713, + "num_input_tokens_seen": 352386090, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.1151123, + "step": 16325, + "time_per_iteration": 2.530439615249634 + }, + { + "auxiliary_loss_clip": 0.01105187, + "auxiliary_loss_mlp": 0.01032685, + "balance_loss_clip": 1.03783119, + "balance_loss_mlp": 1.02210534, + "epoch": 0.9815722230572674, + "flos": 20850023076480.0, + "grad_norm": 1.8208701190147232, + "language_loss": 0.76785564, + "learning_rate": 3.5496360246201063e-09, + "loss": 0.78923434, + "num_input_tokens_seen": 352404000, + "router_z_loss_clip": 0.67333984, + "router_z_loss_mlp": 0.10577393, + "step": 16326, + "time_per_iteration": 2.4252545833587646 + }, + { + "auxiliary_loss_clip": 0.01116142, + "auxiliary_loss_mlp": 0.01031826, + "balance_loss_clip": 1.04139519, + "balance_loss_mlp": 1.01990557, + "epoch": 0.9816323463099353, + "flos": 22894525301760.0, + "grad_norm": 1.6576218242663037, + "language_loss": 0.67131472, + "learning_rate": 3.5264805072470205e-09, + "loss": 0.69279444, + "num_input_tokens_seen": 352423540, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.1192627, + "step": 16327, + "time_per_iteration": 2.4260928630828857 + }, + { + "auxiliary_loss_clip": 0.01121061, + "auxiliary_loss_mlp": 0.0103531, + "balance_loss_clip": 1.04470587, + "balance_loss_mlp": 1.02238166, + "epoch": 0.9816924695626034, + "flos": 31539444117120.0, + "grad_norm": 1.7599086582269927, + "language_loss": 0.74109727, + "learning_rate": 3.5034006963501337e-09, + "loss": 0.76266098, + "num_input_tokens_seen": 352445530, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.12921143, + "step": 16328, + "time_per_iteration": 3.968733310699463 + }, + { + "auxiliary_loss_clip": 0.01119337, + "auxiliary_loss_mlp": 0.01039018, + "balance_loss_clip": 1.04082394, + "balance_loss_mlp": 1.0242424, + "epoch": 0.9817525928152713, + "flos": 21506901045120.0, + "grad_norm": 3.881448999554852, + "language_loss": 0.81484449, + "learning_rate": 3.4803965928040802e-09, + "loss": 0.83642805, + "num_input_tokens_seen": 352466325, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.14776611, + "step": 16329, + "time_per_iteration": 2.459826707839966 + }, + { + "auxiliary_loss_clip": 0.01112189, + "auxiliary_loss_mlp": 0.01031641, + "balance_loss_clip": 1.03632247, + "balance_loss_mlp": 1.01852775, + "epoch": 0.9818127160679393, + "flos": 25550513683200.0, + "grad_norm": 2.9348151042174475, + "language_loss": 0.76244241, + "learning_rate": 3.4574681974817168e-09, + "loss": 0.78388077, + "num_input_tokens_seen": 352485505, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.13110352, + "step": 16330, + "time_per_iteration": 2.492436647415161 + }, + { + "auxiliary_loss_clip": 0.01119258, + "auxiliary_loss_mlp": 0.01031221, + "balance_loss_clip": 1.03944254, + "balance_loss_mlp": 1.01631355, + "epoch": 0.9818728393206072, + "flos": 28803661672320.0, + "grad_norm": 2.9124226269750784, + "language_loss": 0.66461509, + "learning_rate": 3.434615511252126e-09, + "loss": 0.68611991, + "num_input_tokens_seen": 352505360, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.14904785, + "step": 16331, + "time_per_iteration": 2.532172203063965 + }, + { + "auxiliary_loss_clip": 0.01105509, + "auxiliary_loss_mlp": 0.01032137, + "balance_loss_clip": 1.03479397, + "balance_loss_mlp": 1.02077031, + "epoch": 0.9819329625732752, + "flos": 23222246014080.0, + "grad_norm": 1.8512463317355659, + "language_loss": 0.73518848, + "learning_rate": 3.411838534981948e-09, + "loss": 0.75656497, + "num_input_tokens_seen": 352524035, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.11358643, + "step": 16332, + "time_per_iteration": 2.524623155593872 + }, + { + "auxiliary_loss_clip": 0.01111757, + "auxiliary_loss_mlp": 0.01030094, + "balance_loss_clip": 1.0420593, + "balance_loss_mlp": 1.01941895, + "epoch": 0.9819930858259431, + "flos": 17530440883200.0, + "grad_norm": 1.642638219381997, + "language_loss": 0.76687098, + "learning_rate": 3.389137269534936e-09, + "loss": 0.78828943, + "num_input_tokens_seen": 352543210, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.10681152, + "step": 16333, + "time_per_iteration": 2.4342668056488037 + }, + { + "auxiliary_loss_clip": 0.01105003, + "auxiliary_loss_mlp": 0.01027431, + "balance_loss_clip": 1.03563333, + "balance_loss_mlp": 1.01584935, + "epoch": 0.9820532090786112, + "flos": 12529915971840.0, + "grad_norm": 2.6222025091280305, + "language_loss": 0.73042834, + "learning_rate": 3.366511715771958e-09, + "loss": 0.75175267, + "num_input_tokens_seen": 352559770, + "router_z_loss_clip": 0.69433594, + "router_z_loss_mlp": 0.11590576, + "step": 16334, + "time_per_iteration": 3.896761655807495 + }, + { + "auxiliary_loss_clip": 0.01110231, + "auxiliary_loss_mlp": 0.01034714, + "balance_loss_clip": 1.0380156, + "balance_loss_mlp": 1.02309155, + "epoch": 0.9821133323312792, + "flos": 18840174497280.0, + "grad_norm": 1.9065021805463336, + "language_loss": 0.78744733, + "learning_rate": 3.3439618745509934e-09, + "loss": 0.80889672, + "num_input_tokens_seen": 352577690, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.1161499, + "step": 16335, + "time_per_iteration": 2.4728353023529053 + }, + { + "auxiliary_loss_clip": 0.01118728, + "auxiliary_loss_mlp": 0.01033305, + "balance_loss_clip": 1.04076314, + "balance_loss_mlp": 1.01986456, + "epoch": 0.9821734555839471, + "flos": 34824013528320.0, + "grad_norm": 2.3785054796043608, + "language_loss": 0.63826269, + "learning_rate": 3.3214877467271362e-09, + "loss": 0.65978301, + "num_input_tokens_seen": 352598850, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.13439941, + "step": 16336, + "time_per_iteration": 2.8193678855895996 + }, + { + "auxiliary_loss_clip": 0.01119206, + "auxiliary_loss_mlp": 0.01032897, + "balance_loss_clip": 1.04286897, + "balance_loss_mlp": 1.0196768, + "epoch": 0.9822335788366151, + "flos": 17128169493120.0, + "grad_norm": 1.8759265905394868, + "language_loss": 0.73243165, + "learning_rate": 3.299089333152372e-09, + "loss": 0.75395274, + "num_input_tokens_seen": 352616130, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.13232422, + "step": 16337, + "time_per_iteration": 2.4556612968444824 + }, + { + "auxiliary_loss_clip": 0.01113768, + "auxiliary_loss_mlp": 0.01035405, + "balance_loss_clip": 1.03790283, + "balance_loss_mlp": 1.02227998, + "epoch": 0.982293702089283, + "flos": 20813250528000.0, + "grad_norm": 1.7064690956376136, + "language_loss": 0.73232263, + "learning_rate": 3.2767666346764645e-09, + "loss": 0.75381434, + "num_input_tokens_seen": 352636885, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.13128662, + "step": 16338, + "time_per_iteration": 2.421724319458008 + }, + { + "auxiliary_loss_clip": 0.01104959, + "auxiliary_loss_mlp": 0.0102842, + "balance_loss_clip": 1.03405726, + "balance_loss_mlp": 1.01634979, + "epoch": 0.982353825341951, + "flos": 24680829588480.0, + "grad_norm": 1.602835539057131, + "language_loss": 0.81691384, + "learning_rate": 3.2545196521454045e-09, + "loss": 0.8382476, + "num_input_tokens_seen": 352657905, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.1206665, + "step": 16339, + "time_per_iteration": 2.4807934761047363 + }, + { + "auxiliary_loss_clip": 0.01109643, + "auxiliary_loss_mlp": 0.0102828, + "balance_loss_clip": 1.0411365, + "balance_loss_mlp": 1.01759887, + "epoch": 0.982413948594619, + "flos": 20850489953280.0, + "grad_norm": 3.3952041057011186, + "language_loss": 0.62756097, + "learning_rate": 3.232348386403405e-09, + "loss": 0.64894027, + "num_input_tokens_seen": 352676320, + "router_z_loss_clip": 0.68505859, + "router_z_loss_mlp": 0.10675049, + "step": 16340, + "time_per_iteration": 2.4433577060699463 + }, + { + "auxiliary_loss_clip": 0.0111219, + "auxiliary_loss_mlp": 0.01032048, + "balance_loss_clip": 1.03886962, + "balance_loss_mlp": 1.01986444, + "epoch": 0.982474071847287, + "flos": 15377380778880.0, + "grad_norm": 2.4822497191018043, + "language_loss": 0.86021578, + "learning_rate": 3.2102528382904613e-09, + "loss": 0.8816582, + "num_input_tokens_seen": 352692665, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.12188721, + "step": 16341, + "time_per_iteration": 2.4251937866210938 + }, + { + "auxiliary_loss_clip": 0.01108757, + "auxiliary_loss_mlp": 0.01028669, + "balance_loss_clip": 1.04070997, + "balance_loss_mlp": 1.0165875, + "epoch": 0.9825341950999549, + "flos": 23774732081280.0, + "grad_norm": 1.4422444709376914, + "language_loss": 0.67309463, + "learning_rate": 3.188233008645014e-09, + "loss": 0.69446886, + "num_input_tokens_seen": 352716130, + "router_z_loss_clip": 0.68017578, + "router_z_loss_mlp": 0.12078857, + "step": 16342, + "time_per_iteration": 2.596240282058716 + }, + { + "auxiliary_loss_clip": 0.01114122, + "auxiliary_loss_mlp": 0.01029069, + "balance_loss_clip": 1.03921366, + "balance_loss_mlp": 1.0171777, + "epoch": 0.9825943183526229, + "flos": 22746285872640.0, + "grad_norm": 1.6027729972279707, + "language_loss": 0.77375925, + "learning_rate": 3.16628889830195e-09, + "loss": 0.79519117, + "num_input_tokens_seen": 352734705, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11889648, + "step": 16343, + "time_per_iteration": 2.6823110580444336 + }, + { + "auxiliary_loss_clip": 0.01112359, + "auxiliary_loss_mlp": 0.01028444, + "balance_loss_clip": 1.04052877, + "balance_loss_mlp": 1.01767981, + "epoch": 0.9826544416052908, + "flos": 27709966408320.0, + "grad_norm": 1.569047793786621, + "language_loss": 0.75593281, + "learning_rate": 3.1444205080932707e-09, + "loss": 0.77734089, + "num_input_tokens_seen": 352756225, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.10766602, + "step": 16344, + "time_per_iteration": 2.6730575561523438 + }, + { + "auxiliary_loss_clip": 0.01115189, + "auxiliary_loss_mlp": 0.01031477, + "balance_loss_clip": 1.04376316, + "balance_loss_mlp": 1.01915622, + "epoch": 0.9827145648579588, + "flos": 26941657472640.0, + "grad_norm": 2.2682102310412042, + "language_loss": 0.666731, + "learning_rate": 3.122627838848313e-09, + "loss": 0.68819767, + "num_input_tokens_seen": 352776210, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.12329102, + "step": 16345, + "time_per_iteration": 2.7689759731292725 + }, + { + "auxiliary_loss_clip": 0.01100965, + "auxiliary_loss_mlp": 0.01024847, + "balance_loss_clip": 1.03330636, + "balance_loss_mlp": 1.01432109, + "epoch": 0.9827746881106267, + "flos": 21866545969920.0, + "grad_norm": 1.3456498801495569, + "language_loss": 0.79290283, + "learning_rate": 3.1009108913933045e-09, + "loss": 0.81416094, + "num_input_tokens_seen": 352795455, + "router_z_loss_clip": 0.67626953, + "router_z_loss_mlp": 0.10528564, + "step": 16346, + "time_per_iteration": 2.657125234603882 + }, + { + "auxiliary_loss_clip": 0.01119516, + "auxiliary_loss_mlp": 0.01031808, + "balance_loss_clip": 1.04274976, + "balance_loss_mlp": 1.01927352, + "epoch": 0.9828348113632948, + "flos": 20850777262080.0, + "grad_norm": 2.0181095708151044, + "language_loss": 0.75114095, + "learning_rate": 3.079269666552031e-09, + "loss": 0.77265418, + "num_input_tokens_seen": 352812895, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.12542725, + "step": 16347, + "time_per_iteration": 2.6613826751708984 + }, + { + "auxiliary_loss_clip": 0.01108152, + "auxiliary_loss_mlp": 0.01036968, + "balance_loss_clip": 1.03792894, + "balance_loss_mlp": 1.0247612, + "epoch": 0.9828949346159628, + "flos": 34569227381760.0, + "grad_norm": 3.7532111646696413, + "language_loss": 0.67135215, + "learning_rate": 3.0577041651449474e-09, + "loss": 0.69280338, + "num_input_tokens_seen": 352835470, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.12207031, + "step": 16348, + "time_per_iteration": 2.535407304763794 + }, + { + "auxiliary_loss_clip": 0.01118053, + "auxiliary_loss_mlp": 0.01029932, + "balance_loss_clip": 1.04267108, + "balance_loss_mlp": 1.01789236, + "epoch": 0.9829550578686307, + "flos": 24457464864000.0, + "grad_norm": 1.92216780310187, + "language_loss": 0.68990499, + "learning_rate": 3.0362143879898437e-09, + "loss": 0.71138483, + "num_input_tokens_seen": 352854295, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.1204834, + "step": 16349, + "time_per_iteration": 2.5424375534057617 + }, + { + "auxiliary_loss_clip": 0.01112047, + "auxiliary_loss_mlp": 0.01027988, + "balance_loss_clip": 1.04452682, + "balance_loss_mlp": 1.01780128, + "epoch": 0.9830151811212987, + "flos": 16910084067840.0, + "grad_norm": 1.7522452683676366, + "language_loss": 0.75781953, + "learning_rate": 3.0148003359014018e-09, + "loss": 0.77921993, + "num_input_tokens_seen": 352869695, + "router_z_loss_clip": 0.67529297, + "router_z_loss_mlp": 0.10186768, + "step": 16350, + "time_per_iteration": 2.407984495162964 + }, + { + "auxiliary_loss_clip": 0.01109873, + "auxiliary_loss_mlp": 0.01037303, + "balance_loss_clip": 1.03726077, + "balance_loss_mlp": 1.02367759, + "epoch": 0.9830753043739666, + "flos": 21288312829440.0, + "grad_norm": 2.019399689712569, + "language_loss": 0.84236079, + "learning_rate": 2.9934620096920826e-09, + "loss": 0.86383259, + "num_input_tokens_seen": 352887430, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.13616943, + "step": 16351, + "time_per_iteration": 2.4654717445373535 + }, + { + "auxiliary_loss_clip": 0.01111811, + "auxiliary_loss_mlp": 0.01027564, + "balance_loss_clip": 1.04014659, + "balance_loss_mlp": 1.01548839, + "epoch": 0.9831354276266346, + "flos": 31723522341120.0, + "grad_norm": 1.5995012760783163, + "language_loss": 0.68748552, + "learning_rate": 2.972199410170795e-09, + "loss": 0.70887923, + "num_input_tokens_seen": 352907555, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.12084961, + "step": 16352, + "time_per_iteration": 2.4895477294921875 + }, + { + "auxiliary_loss_clip": 0.01112729, + "auxiliary_loss_mlp": 0.01031352, + "balance_loss_clip": 1.04205513, + "balance_loss_mlp": 1.02032506, + "epoch": 0.9831955508793025, + "flos": 21619050284160.0, + "grad_norm": 2.017303641732685, + "language_loss": 0.66446579, + "learning_rate": 2.951012538143782e-09, + "loss": 0.68590665, + "num_input_tokens_seen": 352928670, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.11029053, + "step": 16353, + "time_per_iteration": 2.475471019744873 + }, + { + "auxiliary_loss_clip": 0.01113257, + "auxiliary_loss_mlp": 0.01036905, + "balance_loss_clip": 1.0411458, + "balance_loss_mlp": 1.02528787, + "epoch": 0.9832556741319706, + "flos": 22968214053120.0, + "grad_norm": 1.4723482756406574, + "language_loss": 0.74521351, + "learning_rate": 2.9299013944144025e-09, + "loss": 0.76671511, + "num_input_tokens_seen": 352948345, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11627197, + "step": 16354, + "time_per_iteration": 2.43894100189209 + }, + { + "auxiliary_loss_clip": 0.01108535, + "auxiliary_loss_mlp": 0.01027175, + "balance_loss_clip": 1.03849673, + "balance_loss_mlp": 1.01565933, + "epoch": 0.9833157973846385, + "flos": 21323900229120.0, + "grad_norm": 2.5521915518279426, + "language_loss": 0.77804697, + "learning_rate": 2.9088659797835702e-09, + "loss": 0.79940403, + "num_input_tokens_seen": 352967250, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.11517334, + "step": 16355, + "time_per_iteration": 2.4939754009246826 + }, + { + "auxiliary_loss_clip": 0.01119208, + "auxiliary_loss_mlp": 0.01030284, + "balance_loss_clip": 1.04835236, + "balance_loss_mlp": 1.01845217, + "epoch": 0.9833759206373065, + "flos": 21068719032960.0, + "grad_norm": 1.9302192219911072, + "language_loss": 0.73748177, + "learning_rate": 2.8879062950484256e-09, + "loss": 0.7589767, + "num_input_tokens_seen": 352984725, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.1182251, + "step": 16356, + "time_per_iteration": 3.815722942352295 + }, + { + "auxiliary_loss_clip": 0.01114622, + "auxiliary_loss_mlp": 0.01029097, + "balance_loss_clip": 1.04203129, + "balance_loss_mlp": 1.01728976, + "epoch": 0.9834360438899744, + "flos": 18697322108160.0, + "grad_norm": 1.54275520320641, + "language_loss": 0.7584337, + "learning_rate": 2.8670223410041104e-09, + "loss": 0.77987093, + "num_input_tokens_seen": 353003480, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11804199, + "step": 16357, + "time_per_iteration": 2.4506947994232178 + }, + { + "auxiliary_loss_clip": 0.01109895, + "auxiliary_loss_mlp": 0.01023892, + "balance_loss_clip": 1.03952491, + "balance_loss_mlp": 1.01188159, + "epoch": 0.9834961671426424, + "flos": 21105240186240.0, + "grad_norm": 2.015856499696275, + "language_loss": 0.80128467, + "learning_rate": 2.846214118442436e-09, + "loss": 0.8226226, + "num_input_tokens_seen": 353021425, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.12011719, + "step": 16358, + "time_per_iteration": 2.474987745285034 + }, + { + "auxiliary_loss_clip": 0.01127526, + "auxiliary_loss_mlp": 0.01024653, + "balance_loss_clip": 1.05344188, + "balance_loss_mlp": 1.01398396, + "epoch": 0.9835562903953103, + "flos": 26687625511680.0, + "grad_norm": 2.268621137264636, + "language_loss": 0.67885268, + "learning_rate": 2.8254816281523263e-09, + "loss": 0.70037448, + "num_input_tokens_seen": 353039870, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.10675049, + "step": 16359, + "time_per_iteration": 2.5720055103302 + }, + { + "auxiliary_loss_clip": 0.01114537, + "auxiliary_loss_mlp": 0.01027796, + "balance_loss_clip": 1.04514682, + "balance_loss_mlp": 1.01713276, + "epoch": 0.9836164136479784, + "flos": 22090162089600.0, + "grad_norm": 1.560103382722175, + "language_loss": 0.69815665, + "learning_rate": 2.804824870920264e-09, + "loss": 0.71957994, + "num_input_tokens_seen": 353059750, + "router_z_loss_clip": 0.69238281, + "router_z_loss_mlp": 0.10675049, + "step": 16360, + "time_per_iteration": 2.524897575378418 + }, + { + "auxiliary_loss_clip": 0.01121351, + "auxiliary_loss_mlp": 0.01037704, + "balance_loss_clip": 1.04317772, + "balance_loss_mlp": 1.02497208, + "epoch": 0.9836765369006463, + "flos": 23878405710720.0, + "grad_norm": 1.7760047613159653, + "language_loss": 0.84126258, + "learning_rate": 2.7842438475293996e-09, + "loss": 0.86285317, + "num_input_tokens_seen": 353079940, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.12756348, + "step": 16361, + "time_per_iteration": 2.5021417140960693 + }, + { + "auxiliary_loss_clip": 0.01118525, + "auxiliary_loss_mlp": 0.0102715, + "balance_loss_clip": 1.04538536, + "balance_loss_mlp": 1.01563454, + "epoch": 0.9837366601533143, + "flos": 25845017293440.0, + "grad_norm": 1.626187127821622, + "language_loss": 0.76058728, + "learning_rate": 2.76373855876022e-09, + "loss": 0.78204405, + "num_input_tokens_seen": 353099990, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11517334, + "step": 16362, + "time_per_iteration": 2.4845173358917236 + }, + { + "auxiliary_loss_clip": 0.01120095, + "auxiliary_loss_mlp": 0.01028208, + "balance_loss_clip": 1.04635537, + "balance_loss_mlp": 1.01634037, + "epoch": 0.9837967834059823, + "flos": 21358015171200.0, + "grad_norm": 1.8517301099873495, + "language_loss": 0.71040595, + "learning_rate": 2.7433090053901043e-09, + "loss": 0.73188901, + "num_input_tokens_seen": 353118710, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11877441, + "step": 16363, + "time_per_iteration": 2.450960397720337 + }, + { + "auxiliary_loss_clip": 0.01109355, + "auxiliary_loss_mlp": 0.01027995, + "balance_loss_clip": 1.04172635, + "balance_loss_mlp": 1.01711702, + "epoch": 0.9838569066586502, + "flos": 18515793749760.0, + "grad_norm": 2.059468770793489, + "language_loss": 0.63025612, + "learning_rate": 2.7229551881937653e-09, + "loss": 0.65162957, + "num_input_tokens_seen": 353136415, + "router_z_loss_clip": 0.67578125, + "router_z_loss_mlp": 0.10876465, + "step": 16364, + "time_per_iteration": 2.4676432609558105 + }, + { + "auxiliary_loss_clip": 0.01117998, + "auxiliary_loss_mlp": 0.01026364, + "balance_loss_clip": 1.04469442, + "balance_loss_mlp": 1.01597512, + "epoch": 0.9839170299113182, + "flos": 22452392793600.0, + "grad_norm": 2.876134084198442, + "language_loss": 0.75337368, + "learning_rate": 2.702677107943252e-09, + "loss": 0.77481735, + "num_input_tokens_seen": 353154650, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.10394287, + "step": 16365, + "time_per_iteration": 2.4999380111694336 + }, + { + "auxiliary_loss_clip": 0.01117886, + "auxiliary_loss_mlp": 0.01028633, + "balance_loss_clip": 1.04465747, + "balance_loss_mlp": 1.01699829, + "epoch": 0.9839771531639862, + "flos": 27892320779520.0, + "grad_norm": 1.6097476982812797, + "language_loss": 0.75974381, + "learning_rate": 2.6824747654072832e-09, + "loss": 0.78120899, + "num_input_tokens_seen": 353174065, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11633301, + "step": 16366, + "time_per_iteration": 3.9309873580932617 + }, + { + "auxiliary_loss_clip": 0.01117655, + "auxiliary_loss_mlp": 0.01025963, + "balance_loss_clip": 1.04797339, + "balance_loss_mlp": 1.01497149, + "epoch": 0.9840372764166542, + "flos": 28214510797440.0, + "grad_norm": 1.8055664298405532, + "language_loss": 0.77168059, + "learning_rate": 2.662348161352357e-09, + "loss": 0.79311681, + "num_input_tokens_seen": 353193560, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.10992432, + "step": 16367, + "time_per_iteration": 2.487247943878174 + }, + { + "auxiliary_loss_clip": 0.01117611, + "auxiliary_loss_mlp": 0.01032966, + "balance_loss_clip": 1.04839432, + "balance_loss_mlp": 1.02099705, + "epoch": 0.9840973996693221, + "flos": 23403989854080.0, + "grad_norm": 1.6781444253343614, + "language_loss": 0.61354548, + "learning_rate": 2.642297296540974e-09, + "loss": 0.63505125, + "num_input_tokens_seen": 353213525, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 0.11968994, + "step": 16368, + "time_per_iteration": 2.495685577392578 + }, + { + "auxiliary_loss_clip": 0.01108763, + "auxiliary_loss_mlp": 0.01028621, + "balance_loss_clip": 1.04171085, + "balance_loss_mlp": 1.01858974, + "epoch": 0.9841575229219901, + "flos": 21395865127680.0, + "grad_norm": 1.6118382856499431, + "language_loss": 0.6584177, + "learning_rate": 2.6223221717340816e-09, + "loss": 0.67979145, + "num_input_tokens_seen": 353234000, + "router_z_loss_clip": 0.67041016, + "router_z_loss_mlp": 0.10028076, + "step": 16369, + "time_per_iteration": 2.4839603900909424 + }, + { + "auxiliary_loss_clip": 0.01114, + "auxiliary_loss_mlp": 0.01033169, + "balance_loss_clip": 1.04149592, + "balance_loss_mlp": 1.02104497, + "epoch": 0.984217646174658, + "flos": 24464072966400.0, + "grad_norm": 1.5573992578235136, + "language_loss": 0.68314612, + "learning_rate": 2.6024227876886295e-09, + "loss": 0.70461774, + "num_input_tokens_seen": 353254940, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.12115479, + "step": 16370, + "time_per_iteration": 2.49371600151062 + }, + { + "auxiliary_loss_clip": 0.01112223, + "auxiliary_loss_mlp": 0.01033036, + "balance_loss_clip": 1.0388428, + "balance_loss_mlp": 1.02042997, + "epoch": 0.984277769427326, + "flos": 16435057680000.0, + "grad_norm": 1.7697599521325813, + "language_loss": 0.73462665, + "learning_rate": 2.582599145159792e-09, + "loss": 0.7560792, + "num_input_tokens_seen": 353272590, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.12597656, + "step": 16371, + "time_per_iteration": 3.817660093307495 + }, + { + "auxiliary_loss_clip": 0.0103638, + "auxiliary_loss_mlp": 0.01005117, + "balance_loss_clip": 1.01174426, + "balance_loss_mlp": 1.00381804, + "epoch": 0.9843378926799939, + "flos": 64530615288960.0, + "grad_norm": 0.7755011726226748, + "language_loss": 0.65144515, + "learning_rate": 2.562851244898745e-09, + "loss": 0.6718601, + "num_input_tokens_seen": 353334380, + "router_z_loss_clip": 0.24658203, + "router_z_loss_mlp": 0.01298523, + "step": 16372, + "time_per_iteration": 3.0761728286743164 + }, + { + "auxiliary_loss_clip": 0.01114379, + "auxiliary_loss_mlp": 0.01028954, + "balance_loss_clip": 1.04395068, + "balance_loss_mlp": 1.01776612, + "epoch": 0.984398015932662, + "flos": 17382811985280.0, + "grad_norm": 1.708024627196926, + "language_loss": 0.7064445, + "learning_rate": 2.5431790876544456e-09, + "loss": 0.72787786, + "num_input_tokens_seen": 353351640, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.11193848, + "step": 16373, + "time_per_iteration": 2.4357352256774902 + }, + { + "auxiliary_loss_clip": 0.01112094, + "auxiliary_loss_mlp": 0.01031805, + "balance_loss_clip": 1.04031038, + "balance_loss_mlp": 1.02033734, + "epoch": 0.9844581391853299, + "flos": 23879088069120.0, + "grad_norm": 1.6446062140349753, + "language_loss": 0.81571829, + "learning_rate": 2.523582674173186e-09, + "loss": 0.83715731, + "num_input_tokens_seen": 353372555, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11468506, + "step": 16374, + "time_per_iteration": 2.454360246658325 + }, + { + "auxiliary_loss_clip": 0.0113044, + "auxiliary_loss_mlp": 0.01030019, + "balance_loss_clip": 1.05438149, + "balance_loss_mlp": 1.01868224, + "epoch": 0.9845182624379979, + "flos": 19865352568320.0, + "grad_norm": 1.8129070255555246, + "language_loss": 0.6926406, + "learning_rate": 2.504062005197927e-09, + "loss": 0.7142452, + "num_input_tokens_seen": 353391385, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.11328125, + "step": 16375, + "time_per_iteration": 2.4314935207366943 + }, + { + "auxiliary_loss_clip": 0.01119614, + "auxiliary_loss_mlp": 0.01034053, + "balance_loss_clip": 1.0455277, + "balance_loss_mlp": 1.02113652, + "epoch": 0.9845783856906659, + "flos": 28254659224320.0, + "grad_norm": 1.6912584190474986, + "language_loss": 0.80837953, + "learning_rate": 2.484617081468521e-09, + "loss": 0.82991624, + "num_input_tokens_seen": 353411630, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.12915039, + "step": 16376, + "time_per_iteration": 2.478896379470825 + }, + { + "auxiliary_loss_clip": 0.01107619, + "auxiliary_loss_mlp": 0.01033639, + "balance_loss_clip": 1.03858399, + "balance_loss_mlp": 1.02244508, + "epoch": 0.9846385089433338, + "flos": 28328383889280.0, + "grad_norm": 1.6178192579236832, + "language_loss": 0.62406695, + "learning_rate": 2.4652479037228224e-09, + "loss": 0.64547956, + "num_input_tokens_seen": 353432895, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 0.11187744, + "step": 16377, + "time_per_iteration": 2.502873659133911 + }, + { + "auxiliary_loss_clip": 0.01112236, + "auxiliary_loss_mlp": 0.01036887, + "balance_loss_clip": 1.03894663, + "balance_loss_mlp": 1.02421498, + "epoch": 0.9846986321960018, + "flos": 24316767290880.0, + "grad_norm": 2.0076038035829713, + "language_loss": 0.72938126, + "learning_rate": 2.445954472695133e-09, + "loss": 0.75087249, + "num_input_tokens_seen": 353454195, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.12646484, + "step": 16378, + "time_per_iteration": 3.881587266921997 + }, + { + "auxiliary_loss_clip": 0.01111966, + "auxiliary_loss_mlp": 0.01035516, + "balance_loss_clip": 1.03924227, + "balance_loss_mlp": 1.02416122, + "epoch": 0.9847587554486698, + "flos": 27271999877760.0, + "grad_norm": 1.6833016280393671, + "language_loss": 0.70887089, + "learning_rate": 2.426736789116868e-09, + "loss": 0.73034567, + "num_input_tokens_seen": 353475125, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11358643, + "step": 16379, + "time_per_iteration": 2.4981207847595215 + }, + { + "auxiliary_loss_clip": 0.01130323, + "auxiliary_loss_mlp": 0.01030375, + "balance_loss_clip": 1.05465984, + "balance_loss_mlp": 1.01860285, + "epoch": 0.9848188787013378, + "flos": 16542717719040.0, + "grad_norm": 1.7099082738371907, + "language_loss": 0.68485713, + "learning_rate": 2.407594853716999e-09, + "loss": 0.70646411, + "num_input_tokens_seen": 353493265, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.11785889, + "step": 16380, + "time_per_iteration": 2.4549295902252197 + }, + { + "auxiliary_loss_clip": 0.01122977, + "auxiliary_loss_mlp": 0.0103104, + "balance_loss_clip": 1.04667103, + "balance_loss_mlp": 1.01919699, + "epoch": 0.9848790019540057, + "flos": 20193647898240.0, + "grad_norm": 2.0327400336363577, + "language_loss": 0.78889859, + "learning_rate": 2.38852866722139e-09, + "loss": 0.81043881, + "num_input_tokens_seen": 353511650, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.11828613, + "step": 16381, + "time_per_iteration": 2.4785444736480713 + }, + { + "auxiliary_loss_clip": 0.01117158, + "auxiliary_loss_mlp": 0.01031062, + "balance_loss_clip": 1.0412457, + "balance_loss_mlp": 1.01936781, + "epoch": 0.9849391252066737, + "flos": 28259723041920.0, + "grad_norm": 1.5516496841866994, + "language_loss": 0.82504225, + "learning_rate": 2.3695382303527965e-09, + "loss": 0.84652448, + "num_input_tokens_seen": 353534035, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.11694336, + "step": 16382, + "time_per_iteration": 2.5149388313293457 + }, + { + "auxiliary_loss_clip": 0.01113436, + "auxiliary_loss_mlp": 0.01033027, + "balance_loss_clip": 1.03742492, + "balance_loss_mlp": 1.02100515, + "epoch": 0.9849992484593416, + "flos": 22454942659200.0, + "grad_norm": 2.127159034132692, + "language_loss": 0.74691105, + "learning_rate": 2.3506235438315316e-09, + "loss": 0.76837569, + "num_input_tokens_seen": 353549950, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12023926, + "step": 16383, + "time_per_iteration": 2.4835102558135986 + }, + { + "auxiliary_loss_clip": 0.01122543, + "auxiliary_loss_mlp": 0.01026688, + "balance_loss_clip": 1.04934001, + "balance_loss_mlp": 1.01567936, + "epoch": 0.9850593717120096, + "flos": 34497190656000.0, + "grad_norm": 1.7078909147653856, + "language_loss": 0.66056108, + "learning_rate": 2.3317846083750203e-09, + "loss": 0.68205339, + "num_input_tokens_seen": 353573745, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11010742, + "step": 16384, + "time_per_iteration": 2.558549642562866 + }, + { + "auxiliary_loss_clip": 0.01122564, + "auxiliary_loss_mlp": 0.01033808, + "balance_loss_clip": 1.04793358, + "balance_loss_mlp": 1.02072525, + "epoch": 0.9851194949646775, + "flos": 38837282152320.0, + "grad_norm": 1.7735565027005231, + "language_loss": 0.70393598, + "learning_rate": 2.313021424697359e-09, + "loss": 0.72549963, + "num_input_tokens_seen": 353595335, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.13079834, + "step": 16385, + "time_per_iteration": 2.6501033306121826 + }, + { + "auxiliary_loss_clip": 0.01114252, + "auxiliary_loss_mlp": 0.01029693, + "balance_loss_clip": 1.04108942, + "balance_loss_mlp": 1.01762319, + "epoch": 0.9851796182173456, + "flos": 17712436118400.0, + "grad_norm": 1.890050044067421, + "language_loss": 0.8119176, + "learning_rate": 2.294333993509978e-09, + "loss": 0.8333571, + "num_input_tokens_seen": 353614270, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.12072754, + "step": 16386, + "time_per_iteration": 2.455540895462036 + }, + { + "auxiliary_loss_clip": 0.01113457, + "auxiliary_loss_mlp": 0.01033841, + "balance_loss_clip": 1.04135251, + "balance_loss_mlp": 1.02197945, + "epoch": 0.9852397414700135, + "flos": 27454318335360.0, + "grad_norm": 2.2625614727783456, + "language_loss": 0.68166423, + "learning_rate": 2.2757223155216442e-09, + "loss": 0.70313722, + "num_input_tokens_seen": 353634900, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11865234, + "step": 16387, + "time_per_iteration": 2.5531771183013916 + }, + { + "auxiliary_loss_clip": 0.0109957, + "auxiliary_loss_mlp": 0.01029482, + "balance_loss_clip": 1.03338897, + "balance_loss_mlp": 1.01770961, + "epoch": 0.9852998647226815, + "flos": 18296702743680.0, + "grad_norm": 1.6996908362761693, + "language_loss": 0.74068785, + "learning_rate": 2.257186391438237e-09, + "loss": 0.76197839, + "num_input_tokens_seen": 353652890, + "router_z_loss_clip": 0.66162109, + "router_z_loss_mlp": 0.11761475, + "step": 16388, + "time_per_iteration": 2.4226434230804443 + }, + { + "auxiliary_loss_clip": 0.01106746, + "auxiliary_loss_mlp": 0.01031346, + "balance_loss_clip": 1.0360837, + "balance_loss_mlp": 1.01837611, + "epoch": 0.9853599879753495, + "flos": 19642562461440.0, + "grad_norm": 1.7539113843818988, + "language_loss": 0.82265419, + "learning_rate": 2.238726221962528e-09, + "loss": 0.84403509, + "num_input_tokens_seen": 353671295, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.12963867, + "step": 16389, + "time_per_iteration": 2.4776880741119385 + }, + { + "auxiliary_loss_clip": 0.0111224, + "auxiliary_loss_mlp": 0.0103042, + "balance_loss_clip": 1.04062366, + "balance_loss_mlp": 1.01678872, + "epoch": 0.9854201112280174, + "flos": 23841956384640.0, + "grad_norm": 1.780955676221087, + "language_loss": 0.67286611, + "learning_rate": 2.2203418077946234e-09, + "loss": 0.69429278, + "num_input_tokens_seen": 353690560, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.13641357, + "step": 16390, + "time_per_iteration": 2.4452459812164307 + }, + { + "auxiliary_loss_clip": 0.01115788, + "auxiliary_loss_mlp": 0.01033419, + "balance_loss_clip": 1.04223919, + "balance_loss_mlp": 1.0204494, + "epoch": 0.9854802344806854, + "flos": 30080573233920.0, + "grad_norm": 1.7581418068394676, + "language_loss": 0.77153283, + "learning_rate": 2.2020331496312994e-09, + "loss": 0.7930249, + "num_input_tokens_seen": 353710660, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.12976074, + "step": 16391, + "time_per_iteration": 2.563392162322998 + }, + { + "auxiliary_loss_clip": 0.01107001, + "auxiliary_loss_mlp": 0.01029637, + "balance_loss_clip": 1.04027462, + "balance_loss_mlp": 1.01967156, + "epoch": 0.9855403577333534, + "flos": 21907412668800.0, + "grad_norm": 2.6018497693482106, + "language_loss": 0.68119007, + "learning_rate": 2.1838002481673333e-09, + "loss": 0.70255643, + "num_input_tokens_seen": 353730440, + "router_z_loss_clip": 0.66699219, + "router_z_loss_mlp": 0.09960938, + "step": 16392, + "time_per_iteration": 2.4683642387390137 + }, + { + "auxiliary_loss_clip": 0.01119795, + "auxiliary_loss_mlp": 0.01027812, + "balance_loss_clip": 1.04460311, + "balance_loss_mlp": 1.01519918, + "epoch": 0.9856004809860214, + "flos": 15413794191360.0, + "grad_norm": 2.035724115978127, + "language_loss": 0.56193101, + "learning_rate": 2.1656431040937286e-09, + "loss": 0.58340704, + "num_input_tokens_seen": 353748360, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.1262207, + "step": 16393, + "time_per_iteration": 2.413275718688965 + }, + { + "auxiliary_loss_clip": 0.01119564, + "auxiliary_loss_mlp": 0.01027321, + "balance_loss_clip": 1.04359794, + "balance_loss_mlp": 1.01467347, + "epoch": 0.9856606042386893, + "flos": 13653201064320.0, + "grad_norm": 2.5369979969209964, + "language_loss": 0.78970897, + "learning_rate": 2.1475617180990444e-09, + "loss": 0.81117779, + "num_input_tokens_seen": 353760880, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.12646484, + "step": 16394, + "time_per_iteration": 2.5000202655792236 + }, + { + "auxiliary_loss_clip": 0.01120463, + "auxiliary_loss_mlp": 0.01035487, + "balance_loss_clip": 1.04605842, + "balance_loss_mlp": 1.02315485, + "epoch": 0.9857207274913573, + "flos": 23479151063040.0, + "grad_norm": 1.4675074331718965, + "language_loss": 0.76311696, + "learning_rate": 2.129556090869178e-09, + "loss": 0.78467643, + "num_input_tokens_seen": 353782255, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.12335205, + "step": 16395, + "time_per_iteration": 2.4843380451202393 + }, + { + "auxiliary_loss_clip": 0.01114826, + "auxiliary_loss_mlp": 0.01029737, + "balance_loss_clip": 1.0419898, + "balance_loss_mlp": 1.01745284, + "epoch": 0.9857808507440252, + "flos": 21065486808960.0, + "grad_norm": 2.517323682269272, + "language_loss": 0.75459874, + "learning_rate": 2.1116262230866933e-09, + "loss": 0.77604437, + "num_input_tokens_seen": 353803580, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.1227417, + "step": 16396, + "time_per_iteration": 2.4786036014556885 + }, + { + "auxiliary_loss_clip": 0.01115786, + "auxiliary_loss_mlp": 0.01029903, + "balance_loss_clip": 1.04278302, + "balance_loss_mlp": 1.01838684, + "epoch": 0.9858409739966932, + "flos": 25301365971840.0, + "grad_norm": 1.4728676620430696, + "language_loss": 0.71375114, + "learning_rate": 2.0937721154317133e-09, + "loss": 0.73520803, + "num_input_tokens_seen": 353824200, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.1149292, + "step": 16397, + "time_per_iteration": 2.4922256469726562 + }, + { + "auxiliary_loss_clip": 0.01110431, + "auxiliary_loss_mlp": 0.01028847, + "balance_loss_clip": 1.04352236, + "balance_loss_mlp": 1.01791573, + "epoch": 0.9859010972493611, + "flos": 20558751690240.0, + "grad_norm": 1.716775749353943, + "language_loss": 0.71408969, + "learning_rate": 2.0759937685810304e-09, + "loss": 0.73548245, + "num_input_tokens_seen": 353843350, + "router_z_loss_clip": 0.66894531, + "router_z_loss_mlp": 0.10925293, + "step": 16398, + "time_per_iteration": 2.4206974506378174 + }, + { + "auxiliary_loss_clip": 0.01122675, + "auxiliary_loss_mlp": 0.01028437, + "balance_loss_clip": 1.05074906, + "balance_loss_mlp": 1.0171597, + "epoch": 0.9859612205020292, + "flos": 24754985216640.0, + "grad_norm": 1.4173175415986676, + "language_loss": 0.74130845, + "learning_rate": 2.058291183208771e-09, + "loss": 0.76281953, + "num_input_tokens_seen": 353864520, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.112854, + "step": 16399, + "time_per_iteration": 2.5089681148529053 + }, + { + "auxiliary_loss_clip": 0.01117365, + "auxiliary_loss_mlp": 0.01026847, + "balance_loss_clip": 1.0444293, + "balance_loss_mlp": 1.01506352, + "epoch": 0.9860213437546971, + "flos": 21105850717440.0, + "grad_norm": 2.2106822114487397, + "language_loss": 0.57931125, + "learning_rate": 2.0406643599863993e-09, + "loss": 0.60075337, + "num_input_tokens_seen": 353882240, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11773682, + "step": 16400, + "time_per_iteration": 2.4866654872894287 + }, + { + "auxiliary_loss_clip": 0.01126886, + "auxiliary_loss_mlp": 0.01027276, + "balance_loss_clip": 1.04712176, + "balance_loss_mlp": 1.01427627, + "epoch": 0.9860814670073651, + "flos": 19136078737920.0, + "grad_norm": 5.1784329294497, + "language_loss": 0.80405372, + "learning_rate": 2.023113299582491e-09, + "loss": 0.82559538, + "num_input_tokens_seen": 353901590, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.12988281, + "step": 16401, + "time_per_iteration": 3.9309375286102295 + }, + { + "auxiliary_loss_clip": 0.01120939, + "auxiliary_loss_mlp": 0.01029296, + "balance_loss_clip": 1.04883206, + "balance_loss_mlp": 1.01692176, + "epoch": 0.9861415902600331, + "flos": 17237050594560.0, + "grad_norm": 1.9193479000570315, + "language_loss": 0.77898228, + "learning_rate": 2.005638002662069e-09, + "loss": 0.8004846, + "num_input_tokens_seen": 353918785, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.1237793, + "step": 16402, + "time_per_iteration": 2.450796127319336 + }, + { + "auxiliary_loss_clip": 0.01115711, + "auxiliary_loss_mlp": 0.01028818, + "balance_loss_clip": 1.04374957, + "balance_loss_mlp": 1.01717162, + "epoch": 0.986201713512701, + "flos": 27782577751680.0, + "grad_norm": 1.7248114635889327, + "language_loss": 0.69851547, + "learning_rate": 1.9882384698881596e-09, + "loss": 0.71996081, + "num_input_tokens_seen": 353940390, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11651611, + "step": 16403, + "time_per_iteration": 2.492295026779175 + }, + { + "auxiliary_loss_clip": 0.01118332, + "auxiliary_loss_mlp": 0.01027885, + "balance_loss_clip": 1.04694152, + "balance_loss_mlp": 1.01671505, + "epoch": 0.986261836765369, + "flos": 28730403884160.0, + "grad_norm": 2.053221109876058, + "language_loss": 0.74697328, + "learning_rate": 1.9709147019204566e-09, + "loss": 0.76843542, + "num_input_tokens_seen": 353962180, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11169434, + "step": 16404, + "time_per_iteration": 2.5138015747070312 + }, + { + "auxiliary_loss_clip": 0.01114204, + "auxiliary_loss_mlp": 0.01026125, + "balance_loss_clip": 1.04023254, + "balance_loss_mlp": 1.01460934, + "epoch": 0.986321960018037, + "flos": 34313471568000.0, + "grad_norm": 1.6856888689084295, + "language_loss": 0.69657421, + "learning_rate": 1.953666699415768e-09, + "loss": 0.71797752, + "num_input_tokens_seen": 353984305, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.11523438, + "step": 16405, + "time_per_iteration": 2.548025369644165 + }, + { + "auxiliary_loss_clip": 0.01119402, + "auxiliary_loss_mlp": 0.01031189, + "balance_loss_clip": 1.04942203, + "balance_loss_mlp": 1.02023983, + "epoch": 0.986382083270705, + "flos": 25189755436800.0, + "grad_norm": 1.621937607734218, + "language_loss": 0.69690454, + "learning_rate": 1.93649446302846e-09, + "loss": 0.71841049, + "num_input_tokens_seen": 354004495, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.10949707, + "step": 16406, + "time_per_iteration": 2.4897286891937256 + }, + { + "auxiliary_loss_clip": 0.01118659, + "auxiliary_loss_mlp": 0.01029119, + "balance_loss_clip": 1.04890931, + "balance_loss_mlp": 1.01743078, + "epoch": 0.9864422065233729, + "flos": 11025904671360.0, + "grad_norm": 2.773709281201648, + "language_loss": 0.74819976, + "learning_rate": 1.9193979934095663e-09, + "loss": 0.76967752, + "num_input_tokens_seen": 354015985, + "router_z_loss_clip": 0.69775391, + "router_z_loss_mlp": 0.11682129, + "step": 16407, + "time_per_iteration": 2.380265951156616 + }, + { + "auxiliary_loss_clip": 0.01111711, + "auxiliary_loss_mlp": 0.01033086, + "balance_loss_clip": 1.04163408, + "balance_loss_mlp": 1.02212501, + "epoch": 0.9865023297760409, + "flos": 16545590807040.0, + "grad_norm": 1.9750656646541194, + "language_loss": 0.77469516, + "learning_rate": 1.9023772912072357e-09, + "loss": 0.79614317, + "num_input_tokens_seen": 354033260, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.10961914, + "step": 16408, + "time_per_iteration": 2.439640760421753 + }, + { + "auxiliary_loss_clip": 0.01120712, + "auxiliary_loss_mlp": 0.01028633, + "balance_loss_clip": 1.04615426, + "balance_loss_mlp": 1.01580048, + "epoch": 0.9865624530287088, + "flos": 18880179269760.0, + "grad_norm": 1.6942280408111727, + "language_loss": 0.6808306, + "learning_rate": 1.8854323570669515e-09, + "loss": 0.70232403, + "num_input_tokens_seen": 354052825, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.12823486, + "step": 16409, + "time_per_iteration": 3.887691020965576 + }, + { + "auxiliary_loss_clip": 0.01047271, + "auxiliary_loss_mlp": 0.01001665, + "balance_loss_clip": 1.02315056, + "balance_loss_mlp": 1.00055039, + "epoch": 0.9866225762813768, + "flos": 68887798680960.0, + "grad_norm": 0.8061909668630816, + "language_loss": 0.61042166, + "learning_rate": 1.8685631916313118e-09, + "loss": 0.63091099, + "num_input_tokens_seen": 354113920, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.01115417, + "step": 16410, + "time_per_iteration": 3.140984296798706 + }, + { + "auxiliary_loss_clip": 0.01122193, + "auxiliary_loss_mlp": 0.01028209, + "balance_loss_clip": 1.04911661, + "balance_loss_mlp": 1.01659203, + "epoch": 0.9866826995340447, + "flos": 29023111814400.0, + "grad_norm": 2.30000762881452, + "language_loss": 0.66434085, + "learning_rate": 1.8517697955400258e-09, + "loss": 0.6858449, + "num_input_tokens_seen": 354134210, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.1161499, + "step": 16411, + "time_per_iteration": 2.533748149871826 + }, + { + "auxiliary_loss_clip": 0.01047514, + "auxiliary_loss_mlp": 0.01005408, + "balance_loss_clip": 1.02261579, + "balance_loss_mlp": 1.00400174, + "epoch": 0.9867428227867128, + "flos": 65376814867200.0, + "grad_norm": 0.7206337395353296, + "language_loss": 0.56269139, + "learning_rate": 1.8350521694299182e-09, + "loss": 0.5832206, + "num_input_tokens_seen": 354198010, + "router_z_loss_clip": 0.24951172, + "router_z_loss_mlp": 0.01405334, + "step": 16412, + "time_per_iteration": 3.1609203815460205 + }, + { + "auxiliary_loss_clip": 0.01116919, + "auxiliary_loss_mlp": 0.01032422, + "balance_loss_clip": 1.04309082, + "balance_loss_mlp": 1.01963055, + "epoch": 0.9868029460393807, + "flos": 26506312634880.0, + "grad_norm": 1.6818336007439667, + "language_loss": 0.73327541, + "learning_rate": 1.818410313934926e-09, + "loss": 0.75476873, + "num_input_tokens_seen": 354220000, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.12799072, + "step": 16413, + "time_per_iteration": 2.584756851196289 + }, + { + "auxiliary_loss_clip": 0.01117515, + "auxiliary_loss_mlp": 0.01024126, + "balance_loss_clip": 1.0418756, + "balance_loss_mlp": 1.01259232, + "epoch": 0.9868630692920487, + "flos": 22967280299520.0, + "grad_norm": 1.4230114724674205, + "language_loss": 0.71487862, + "learning_rate": 1.8018442296858782e-09, + "loss": 0.73629498, + "num_input_tokens_seen": 354240910, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.11529541, + "step": 16414, + "time_per_iteration": 4.015050888061523 + }, + { + "auxiliary_loss_clip": 0.01112368, + "auxiliary_loss_mlp": 0.01034055, + "balance_loss_clip": 1.04433811, + "balance_loss_mlp": 1.02352834, + "epoch": 0.9869231925447167, + "flos": 19828687760640.0, + "grad_norm": 1.540873482008592, + "language_loss": 0.70416832, + "learning_rate": 1.7853539173111608e-09, + "loss": 0.72563255, + "num_input_tokens_seen": 354259430, + "router_z_loss_clip": 0.68017578, + "router_z_loss_mlp": 0.10516357, + "step": 16415, + "time_per_iteration": 2.5107500553131104 + }, + { + "auxiliary_loss_clip": 0.01108365, + "auxiliary_loss_mlp": 0.01028992, + "balance_loss_clip": 1.04176092, + "balance_loss_mlp": 1.01871634, + "epoch": 0.9869833157973846, + "flos": 20195228096640.0, + "grad_norm": 1.4509505774668308, + "language_loss": 0.75782007, + "learning_rate": 1.7689393774362737e-09, + "loss": 0.77919364, + "num_input_tokens_seen": 354279490, + "router_z_loss_clip": 0.66552734, + "router_z_loss_mlp": 0.1027832, + "step": 16416, + "time_per_iteration": 2.465738534927368 + }, + { + "auxiliary_loss_clip": 0.01113539, + "auxiliary_loss_mlp": 0.01030279, + "balance_loss_clip": 1.04425955, + "balance_loss_mlp": 1.0187099, + "epoch": 0.9870434390500527, + "flos": 16099507802880.0, + "grad_norm": 2.0652318289416494, + "language_loss": 0.71396339, + "learning_rate": 1.7526006106833858e-09, + "loss": 0.73540163, + "num_input_tokens_seen": 354295080, + "router_z_loss_clip": 0.69287109, + "router_z_loss_mlp": 0.11578369, + "step": 16417, + "time_per_iteration": 2.4069948196411133 + }, + { + "auxiliary_loss_clip": 0.01119514, + "auxiliary_loss_mlp": 0.01033131, + "balance_loss_clip": 1.04453635, + "balance_loss_mlp": 1.02066731, + "epoch": 0.9871035623027206, + "flos": 21760753438080.0, + "grad_norm": 1.567160242570108, + "language_loss": 0.70522958, + "learning_rate": 1.7363376176720013e-09, + "loss": 0.72675598, + "num_input_tokens_seen": 354314610, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.12463379, + "step": 16418, + "time_per_iteration": 2.4680566787719727 + }, + { + "auxiliary_loss_clip": 0.01031279, + "auxiliary_loss_mlp": 0.00999928, + "balance_loss_clip": 1.00723505, + "balance_loss_mlp": 0.99867028, + "epoch": 0.9871636855553886, + "flos": 70219583245440.0, + "grad_norm": 0.6563887970279765, + "language_loss": 0.53703988, + "learning_rate": 1.7201503990189603e-09, + "loss": 0.55735195, + "num_input_tokens_seen": 354383115, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.01257324, + "step": 16419, + "time_per_iteration": 3.2010841369628906 + }, + { + "auxiliary_loss_clip": 0.01111993, + "auxiliary_loss_mlp": 0.0103406, + "balance_loss_clip": 1.03693843, + "balance_loss_mlp": 1.01983809, + "epoch": 0.9872238088080565, + "flos": 25045825639680.0, + "grad_norm": 1.7439381000019645, + "language_loss": 0.77805984, + "learning_rate": 1.7040389553382162e-09, + "loss": 0.79952037, + "num_input_tokens_seen": 354403115, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.14215088, + "step": 16420, + "time_per_iteration": 2.4922428131103516 + }, + { + "auxiliary_loss_clip": 0.01117984, + "auxiliary_loss_mlp": 0.01028783, + "balance_loss_clip": 1.04831266, + "balance_loss_mlp": 1.01690412, + "epoch": 0.9872839320607245, + "flos": 19465846525440.0, + "grad_norm": 2.32390923259, + "language_loss": 0.70829737, + "learning_rate": 1.6880032872403916e-09, + "loss": 0.72976506, + "num_input_tokens_seen": 354424520, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.11889648, + "step": 16421, + "time_per_iteration": 3.909062385559082 + }, + { + "auxiliary_loss_clip": 0.01122686, + "auxiliary_loss_mlp": 0.01032459, + "balance_loss_clip": 1.04646599, + "balance_loss_mlp": 1.01987648, + "epoch": 0.9873440553133924, + "flos": 26942914448640.0, + "grad_norm": 2.1377393542102867, + "language_loss": 0.82381368, + "learning_rate": 1.6720433953338886e-09, + "loss": 0.84536517, + "num_input_tokens_seen": 354444800, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12579346, + "step": 16422, + "time_per_iteration": 2.527791976928711 + }, + { + "auxiliary_loss_clip": 0.01110542, + "auxiliary_loss_mlp": 0.01028324, + "balance_loss_clip": 1.04074264, + "balance_loss_mlp": 1.01672471, + "epoch": 0.9874041785660604, + "flos": 19062210418560.0, + "grad_norm": 2.056440138390339, + "language_loss": 0.85799634, + "learning_rate": 1.656159280223779e-09, + "loss": 0.87938499, + "num_input_tokens_seen": 354464590, + "router_z_loss_clip": 0.69677734, + "router_z_loss_mlp": 0.11602783, + "step": 16423, + "time_per_iteration": 2.4886314868927 + }, + { + "auxiliary_loss_clip": 0.01111142, + "auxiliary_loss_mlp": 0.01027962, + "balance_loss_clip": 1.03770614, + "balance_loss_mlp": 1.01587987, + "epoch": 0.9874643018187284, + "flos": 21105814803840.0, + "grad_norm": 1.792422375730038, + "language_loss": 0.70463824, + "learning_rate": 1.6403509425122475e-09, + "loss": 0.72602934, + "num_input_tokens_seen": 354484145, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.12078857, + "step": 16424, + "time_per_iteration": 2.4578185081481934 + }, + { + "auxiliary_loss_clip": 0.01112927, + "auxiliary_loss_mlp": 0.01025667, + "balance_loss_clip": 1.04008138, + "balance_loss_mlp": 1.01343656, + "epoch": 0.9875244250713964, + "flos": 24426043441920.0, + "grad_norm": 2.3708490298799347, + "language_loss": 0.80613327, + "learning_rate": 1.6246183827990366e-09, + "loss": 0.82751924, + "num_input_tokens_seen": 354502475, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.12213135, + "step": 16425, + "time_per_iteration": 2.4550328254699707 + }, + { + "auxiliary_loss_clip": 0.01111566, + "auxiliary_loss_mlp": 0.01033093, + "balance_loss_clip": 1.03781426, + "balance_loss_mlp": 1.02049208, + "epoch": 0.9875845483240643, + "flos": 25117610970240.0, + "grad_norm": 1.954265962844061, + "language_loss": 0.8037945, + "learning_rate": 1.6089616016803364e-09, + "loss": 0.82524109, + "num_input_tokens_seen": 354521855, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.12591553, + "step": 16426, + "time_per_iteration": 2.540740489959717 + }, + { + "auxiliary_loss_clip": 0.01119654, + "auxiliary_loss_mlp": 0.01031827, + "balance_loss_clip": 1.04775286, + "balance_loss_mlp": 1.0208714, + "epoch": 0.9876446715767323, + "flos": 16581788737920.0, + "grad_norm": 1.829979945121879, + "language_loss": 0.84742367, + "learning_rate": 1.593380599750338e-09, + "loss": 0.86893851, + "num_input_tokens_seen": 354539535, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.10961914, + "step": 16427, + "time_per_iteration": 2.477604627609253 + }, + { + "auxiliary_loss_clip": 0.01111675, + "auxiliary_loss_mlp": 0.01031688, + "balance_loss_clip": 1.04113066, + "balance_loss_mlp": 1.02002382, + "epoch": 0.9877047948294003, + "flos": 21616141282560.0, + "grad_norm": 2.115343327532291, + "language_loss": 0.70631385, + "learning_rate": 1.577875377599458e-09, + "loss": 0.72774744, + "num_input_tokens_seen": 354557430, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.11676025, + "step": 16428, + "time_per_iteration": 2.4400887489318848 + }, + { + "auxiliary_loss_clip": 0.01116988, + "auxiliary_loss_mlp": 0.01031944, + "balance_loss_clip": 1.0469234, + "balance_loss_mlp": 1.02063656, + "epoch": 0.9877649180820682, + "flos": 21178497974400.0, + "grad_norm": 2.1367302479189467, + "language_loss": 0.80390167, + "learning_rate": 1.5624459358158926e-09, + "loss": 0.82539099, + "num_input_tokens_seen": 354574735, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.11309814, + "step": 16429, + "time_per_iteration": 2.4331135749816895 + }, + { + "auxiliary_loss_clip": 0.01112993, + "auxiliary_loss_mlp": 0.0102906, + "balance_loss_clip": 1.04118574, + "balance_loss_mlp": 1.01797915, + "epoch": 0.9878250413347363, + "flos": 39749233576320.0, + "grad_norm": 1.591919859328152, + "language_loss": 0.61982477, + "learning_rate": 1.5470922749845073e-09, + "loss": 0.64124525, + "num_input_tokens_seen": 354597050, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11090088, + "step": 16430, + "time_per_iteration": 2.5866615772247314 + }, + { + "auxiliary_loss_clip": 0.01112946, + "auxiliary_loss_mlp": 0.0103239, + "balance_loss_clip": 1.04015076, + "balance_loss_mlp": 1.02088642, + "epoch": 0.9878851645874042, + "flos": 29425634599680.0, + "grad_norm": 1.4558370914947762, + "language_loss": 0.73219442, + "learning_rate": 1.531814395687725e-09, + "loss": 0.7536478, + "num_input_tokens_seen": 354619095, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11505127, + "step": 16431, + "time_per_iteration": 2.520702600479126 + }, + { + "auxiliary_loss_clip": 0.01112495, + "auxiliary_loss_mlp": 0.0103345, + "balance_loss_clip": 1.03959692, + "balance_loss_mlp": 1.02161241, + "epoch": 0.9879452878400722, + "flos": 15806261168640.0, + "grad_norm": 2.287287945516479, + "language_loss": 0.80579901, + "learning_rate": 1.5166122985048602e-09, + "loss": 0.82725847, + "num_input_tokens_seen": 354633790, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.1184082, + "step": 16432, + "time_per_iteration": 2.4294967651367188 + }, + { + "auxiliary_loss_clip": 0.01110219, + "auxiliary_loss_mlp": 0.01035423, + "balance_loss_clip": 1.0382421, + "balance_loss_mlp": 1.02340031, + "epoch": 0.9880054110927401, + "flos": 22233912318720.0, + "grad_norm": 1.6588923282661692, + "language_loss": 0.80736279, + "learning_rate": 1.5014859840123405e-09, + "loss": 0.82881916, + "num_input_tokens_seen": 354653180, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.12023926, + "step": 16433, + "time_per_iteration": 2.475661516189575 + }, + { + "auxiliary_loss_clip": 0.01111016, + "auxiliary_loss_mlp": 0.01030524, + "balance_loss_clip": 1.04314876, + "balance_loss_mlp": 1.0189606, + "epoch": 0.9880655343454081, + "flos": 28763836467840.0, + "grad_norm": 3.2623776046343695, + "language_loss": 0.64801478, + "learning_rate": 1.4864354527837075e-09, + "loss": 0.6694302, + "num_input_tokens_seen": 354669900, + "router_z_loss_clip": 0.67919922, + "router_z_loss_mlp": 0.11572266, + "step": 16434, + "time_per_iteration": 2.486290454864502 + }, + { + "auxiliary_loss_clip": 0.01117778, + "auxiliary_loss_mlp": 0.010324, + "balance_loss_clip": 1.0446564, + "balance_loss_mlp": 1.0201273, + "epoch": 0.988125657598076, + "flos": 32853379622400.0, + "grad_norm": 1.4959991251534457, + "language_loss": 0.69579446, + "learning_rate": 1.4714607053896154e-09, + "loss": 0.71729618, + "num_input_tokens_seen": 354693165, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.1227417, + "step": 16435, + "time_per_iteration": 2.534088373184204 + }, + { + "auxiliary_loss_clip": 0.01107793, + "auxiliary_loss_mlp": 0.01033541, + "balance_loss_clip": 1.03819466, + "balance_loss_mlp": 1.02198339, + "epoch": 0.988185780850744, + "flos": 19390685316480.0, + "grad_norm": 1.7971294601599743, + "language_loss": 0.75229251, + "learning_rate": 1.4565617423980548e-09, + "loss": 0.77370578, + "num_input_tokens_seen": 354711915, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.11553955, + "step": 16436, + "time_per_iteration": 2.497767210006714 + }, + { + "auxiliary_loss_clip": 0.01116624, + "auxiliary_loss_mlp": 0.01029952, + "balance_loss_clip": 1.04562712, + "balance_loss_mlp": 1.01733327, + "epoch": 0.988245904103412, + "flos": 22528415928960.0, + "grad_norm": 2.208455961586973, + "language_loss": 0.73748499, + "learning_rate": 1.4417385643741286e-09, + "loss": 0.75895083, + "num_input_tokens_seen": 354729135, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.1262207, + "step": 16437, + "time_per_iteration": 2.42765474319458 + }, + { + "auxiliary_loss_clip": 0.01106959, + "auxiliary_loss_mlp": 0.01030125, + "balance_loss_clip": 1.03907239, + "balance_loss_mlp": 1.01895523, + "epoch": 0.98830602735608, + "flos": 28659193171200.0, + "grad_norm": 1.5577659709029184, + "language_loss": 0.60381603, + "learning_rate": 1.4269911718796103e-09, + "loss": 0.62518686, + "num_input_tokens_seen": 354752530, + "router_z_loss_clip": 0.67871094, + "router_z_loss_mlp": 0.1116333, + "step": 16438, + "time_per_iteration": 2.554574966430664 + }, + { + "auxiliary_loss_clip": 0.01110801, + "auxiliary_loss_mlp": 0.01031227, + "balance_loss_clip": 1.04063809, + "balance_loss_mlp": 1.01906228, + "epoch": 0.9883661506087479, + "flos": 20996035862400.0, + "grad_norm": 2.333289598541173, + "language_loss": 0.72316921, + "learning_rate": 1.4123195654738295e-09, + "loss": 0.74458951, + "num_input_tokens_seen": 354771135, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.12176514, + "step": 16439, + "time_per_iteration": 2.4691431522369385 + }, + { + "auxiliary_loss_clip": 0.01104108, + "auxiliary_loss_mlp": 0.0103265, + "balance_loss_clip": 1.03472114, + "balance_loss_mlp": 1.01951885, + "epoch": 0.9884262738614159, + "flos": 32706109860480.0, + "grad_norm": 1.5846296987483142, + "language_loss": 0.59964037, + "learning_rate": 1.3977237457134528e-09, + "loss": 0.62100792, + "num_input_tokens_seen": 354791800, + "router_z_loss_clip": 0.69287109, + "router_z_loss_mlp": 0.13128662, + "step": 16440, + "time_per_iteration": 2.5399322509765625 + }, + { + "auxiliary_loss_clip": 0.01114535, + "auxiliary_loss_mlp": 0.01030547, + "balance_loss_clip": 1.03994787, + "balance_loss_mlp": 1.01834035, + "epoch": 0.9884863971140839, + "flos": 17564699479680.0, + "grad_norm": 2.404039319718904, + "language_loss": 0.76012659, + "learning_rate": 1.3832037131513707e-09, + "loss": 0.78157747, + "num_input_tokens_seen": 354809200, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.12225342, + "step": 16441, + "time_per_iteration": 2.3941493034362793 + }, + { + "auxiliary_loss_clip": 0.01114703, + "auxiliary_loss_mlp": 0.0102841, + "balance_loss_clip": 1.04055619, + "balance_loss_mlp": 1.01676297, + "epoch": 0.9885465203667518, + "flos": 40552519380480.0, + "grad_norm": 3.0861817778126692, + "language_loss": 0.67890239, + "learning_rate": 1.3687594683386982e-09, + "loss": 0.70033348, + "num_input_tokens_seen": 354829945, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11657715, + "step": 16442, + "time_per_iteration": 2.6087646484375 + }, + { + "auxiliary_loss_clip": 0.01108331, + "auxiliary_loss_mlp": 0.01033155, + "balance_loss_clip": 1.0370146, + "balance_loss_mlp": 1.02016127, + "epoch": 0.9886066436194199, + "flos": 13807976768640.0, + "grad_norm": 2.7174939098235598, + "language_loss": 0.74482393, + "learning_rate": 1.3543910118227753e-09, + "loss": 0.76623881, + "num_input_tokens_seen": 354845055, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.12994385, + "step": 16443, + "time_per_iteration": 2.496767520904541 + }, + { + "auxiliary_loss_clip": 0.01117356, + "auxiliary_loss_mlp": 0.01034177, + "balance_loss_clip": 1.04317796, + "balance_loss_mlp": 1.0207715, + "epoch": 0.9886667668720878, + "flos": 23325129544320.0, + "grad_norm": 1.797368515431646, + "language_loss": 0.7382744, + "learning_rate": 1.3400983441487213e-09, + "loss": 0.75978976, + "num_input_tokens_seen": 354864680, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.1340332, + "step": 16444, + "time_per_iteration": 4.0495123863220215 + }, + { + "auxiliary_loss_clip": 0.01117953, + "auxiliary_loss_mlp": 0.01030466, + "balance_loss_clip": 1.04678261, + "balance_loss_mlp": 1.01920652, + "epoch": 0.9887268901247558, + "flos": 22706029704960.0, + "grad_norm": 2.048135564614589, + "language_loss": 0.69582868, + "learning_rate": 1.325881465858547e-09, + "loss": 0.71731287, + "num_input_tokens_seen": 354885685, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.11260986, + "step": 16445, + "time_per_iteration": 2.5563971996307373 + }, + { + "auxiliary_loss_clip": 0.01117347, + "auxiliary_loss_mlp": 0.01026046, + "balance_loss_clip": 1.04540229, + "balance_loss_mlp": 1.01386905, + "epoch": 0.9887870133774237, + "flos": 13041283944960.0, + "grad_norm": 2.6467820329086322, + "language_loss": 0.61120456, + "learning_rate": 1.311740377491155e-09, + "loss": 0.63263851, + "num_input_tokens_seen": 354901505, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.1217041, + "step": 16446, + "time_per_iteration": 2.420046806335449 + }, + { + "auxiliary_loss_clip": 0.01128834, + "auxiliary_loss_mlp": 0.01031818, + "balance_loss_clip": 1.05461025, + "balance_loss_mlp": 1.02098227, + "epoch": 0.9888471366300917, + "flos": 15158864390400.0, + "grad_norm": 2.5532966964512247, + "language_loss": 0.70946461, + "learning_rate": 1.297675079582783e-09, + "loss": 0.73107111, + "num_input_tokens_seen": 354920060, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.1083374, + "step": 16447, + "time_per_iteration": 2.5143485069274902 + }, + { + "auxiliary_loss_clip": 0.0111703, + "auxiliary_loss_mlp": 0.01027354, + "balance_loss_clip": 1.04584265, + "balance_loss_mlp": 1.01639891, + "epoch": 0.9889072598827596, + "flos": 25118796119040.0, + "grad_norm": 1.8203251199175041, + "language_loss": 0.8369993, + "learning_rate": 1.2836855726667818e-09, + "loss": 0.85844314, + "num_input_tokens_seen": 354938690, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.10961914, + "step": 16448, + "time_per_iteration": 2.4790735244750977 + }, + { + "auxiliary_loss_clip": 0.01118696, + "auxiliary_loss_mlp": 0.01023353, + "balance_loss_clip": 1.04654729, + "balance_loss_mlp": 1.01301122, + "epoch": 0.9889673831354276, + "flos": 16728663450240.0, + "grad_norm": 1.5092419571189601, + "language_loss": 0.70326769, + "learning_rate": 1.26977185727406e-09, + "loss": 0.72468817, + "num_input_tokens_seen": 354956955, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.10327148, + "step": 16449, + "time_per_iteration": 2.434662103652954 + }, + { + "auxiliary_loss_clip": 0.01116684, + "auxiliary_loss_mlp": 0.01030548, + "balance_loss_clip": 1.04143167, + "balance_loss_mlp": 1.0188539, + "epoch": 0.9890275063880956, + "flos": 35585175657600.0, + "grad_norm": 3.106501186595402, + "language_loss": 0.73941565, + "learning_rate": 1.25593393393153e-09, + "loss": 0.76088798, + "num_input_tokens_seen": 354976800, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.11700439, + "step": 16450, + "time_per_iteration": 2.546318292617798 + }, + { + "auxiliary_loss_clip": 0.01116119, + "auxiliary_loss_mlp": 0.01030577, + "balance_loss_clip": 1.04194736, + "balance_loss_mlp": 1.01839375, + "epoch": 0.9890876296407636, + "flos": 18952359649920.0, + "grad_norm": 1.6453295945469486, + "language_loss": 0.79258704, + "learning_rate": 1.242171803164549e-09, + "loss": 0.81405401, + "num_input_tokens_seen": 354996625, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.12200928, + "step": 16451, + "time_per_iteration": 2.4452850818634033 + }, + { + "auxiliary_loss_clip": 0.01113477, + "auxiliary_loss_mlp": 0.01031968, + "balance_loss_clip": 1.04056787, + "balance_loss_mlp": 1.01974952, + "epoch": 0.9891477528934315, + "flos": 23769309127680.0, + "grad_norm": 2.146103183059692, + "language_loss": 0.70286822, + "learning_rate": 1.2284854654946996e-09, + "loss": 0.72432268, + "num_input_tokens_seen": 355014535, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.12213135, + "step": 16452, + "time_per_iteration": 2.4549806118011475 + }, + { + "auxiliary_loss_clip": 0.01109253, + "auxiliary_loss_mlp": 0.01029726, + "balance_loss_clip": 1.03952062, + "balance_loss_mlp": 1.01796019, + "epoch": 0.9892078761460995, + "flos": 20772922533120.0, + "grad_norm": 1.7435124103728183, + "language_loss": 0.73623812, + "learning_rate": 1.2148749214409004e-09, + "loss": 0.7576279, + "num_input_tokens_seen": 355033280, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.11773682, + "step": 16453, + "time_per_iteration": 3.8426313400268555 + }, + { + "auxiliary_loss_clip": 0.01120473, + "auxiliary_loss_mlp": 0.01036875, + "balance_loss_clip": 1.04394078, + "balance_loss_mlp": 1.02518034, + "epoch": 0.9892679993987675, + "flos": 23367827836800.0, + "grad_norm": 2.0730102095757825, + "language_loss": 0.6970225, + "learning_rate": 1.2013401715191828e-09, + "loss": 0.71859598, + "num_input_tokens_seen": 355053320, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.11700439, + "step": 16454, + "time_per_iteration": 2.449845552444458 + }, + { + "auxiliary_loss_clip": 0.01110171, + "auxiliary_loss_mlp": 0.01028475, + "balance_loss_clip": 1.04168153, + "balance_loss_mlp": 1.01728129, + "epoch": 0.9893281226514354, + "flos": 22705419173760.0, + "grad_norm": 2.0759416595831985, + "language_loss": 0.75732183, + "learning_rate": 1.1878812162433583e-09, + "loss": 0.77870834, + "num_input_tokens_seen": 355070230, + "router_z_loss_clip": 0.68408203, + "router_z_loss_mlp": 0.11193848, + "step": 16455, + "time_per_iteration": 2.4925973415374756 + }, + { + "auxiliary_loss_clip": 0.01112894, + "auxiliary_loss_mlp": 0.01025521, + "balance_loss_clip": 1.04218733, + "balance_loss_mlp": 1.01426172, + "epoch": 0.9893882459041035, + "flos": 21796664060160.0, + "grad_norm": 1.9242520568664248, + "language_loss": 0.6554721, + "learning_rate": 1.1744980561230188e-09, + "loss": 0.67685622, + "num_input_tokens_seen": 355090125, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.11260986, + "step": 16456, + "time_per_iteration": 2.4495415687561035 + }, + { + "auxiliary_loss_clip": 0.01119651, + "auxiliary_loss_mlp": 0.01028724, + "balance_loss_clip": 1.0460639, + "balance_loss_mlp": 1.01721478, + "epoch": 0.9894483691567714, + "flos": 18113773754880.0, + "grad_norm": 1.708079825029343, + "language_loss": 0.73794663, + "learning_rate": 1.161190691666203e-09, + "loss": 0.75943041, + "num_input_tokens_seen": 355107890, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.1151123, + "step": 16457, + "time_per_iteration": 3.845226287841797 + }, + { + "auxiliary_loss_clip": 0.01115243, + "auxiliary_loss_mlp": 0.01025821, + "balance_loss_clip": 1.04418826, + "balance_loss_mlp": 1.01384008, + "epoch": 0.9895084924094394, + "flos": 31211615664000.0, + "grad_norm": 2.1570240571166264, + "language_loss": 0.69098198, + "learning_rate": 1.1479591233773954e-09, + "loss": 0.71239257, + "num_input_tokens_seen": 355126340, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11981201, + "step": 16458, + "time_per_iteration": 2.545781135559082 + }, + { + "auxiliary_loss_clip": 0.01118482, + "auxiliary_loss_mlp": 0.01028172, + "balance_loss_clip": 1.04903126, + "balance_loss_mlp": 1.01684105, + "epoch": 0.9895686156621073, + "flos": 19678042120320.0, + "grad_norm": 2.024060914758178, + "language_loss": 0.79269874, + "learning_rate": 1.1348033517581956e-09, + "loss": 0.81416535, + "num_input_tokens_seen": 355144025, + "router_z_loss_clip": 0.69433594, + "router_z_loss_mlp": 0.11334229, + "step": 16459, + "time_per_iteration": 2.468595027923584 + }, + { + "auxiliary_loss_clip": 0.01114338, + "auxiliary_loss_mlp": 0.01032525, + "balance_loss_clip": 1.04182732, + "balance_loss_mlp": 1.02094972, + "epoch": 0.9896287389147753, + "flos": 23581675457280.0, + "grad_norm": 1.9264984625338517, + "language_loss": 0.71333581, + "learning_rate": 1.1217233773075373e-09, + "loss": 0.73480445, + "num_input_tokens_seen": 355163125, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11578369, + "step": 16460, + "time_per_iteration": 2.4842514991760254 + }, + { + "auxiliary_loss_clip": 0.01120075, + "auxiliary_loss_mlp": 0.01025504, + "balance_loss_clip": 1.04604959, + "balance_loss_mlp": 1.01353526, + "epoch": 0.9896888621674432, + "flos": 29605331364480.0, + "grad_norm": 1.6959465324688032, + "language_loss": 0.87516588, + "learning_rate": 1.1087192005214685e-09, + "loss": 0.8966217, + "num_input_tokens_seen": 355184060, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.11968994, + "step": 16461, + "time_per_iteration": 2.516601800918579 + }, + { + "auxiliary_loss_clip": 0.0111355, + "auxiliary_loss_mlp": 0.01030133, + "balance_loss_clip": 1.04113078, + "balance_loss_mlp": 1.01780713, + "epoch": 0.9897489854201112, + "flos": 23695045758720.0, + "grad_norm": 1.8782678015339955, + "language_loss": 0.63343251, + "learning_rate": 1.09579082189315e-09, + "loss": 0.65486938, + "num_input_tokens_seen": 355204505, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.12335205, + "step": 16462, + "time_per_iteration": 2.591761827468872 + }, + { + "auxiliary_loss_clip": 0.01115111, + "auxiliary_loss_mlp": 0.01030664, + "balance_loss_clip": 1.04401374, + "balance_loss_mlp": 1.01937509, + "epoch": 0.9898091086727792, + "flos": 13225146687360.0, + "grad_norm": 1.5897756569764347, + "language_loss": 0.72934866, + "learning_rate": 1.0829382419126343e-09, + "loss": 0.75080645, + "num_input_tokens_seen": 355223055, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.11291504, + "step": 16463, + "time_per_iteration": 2.424964427947998 + }, + { + "auxiliary_loss_clip": 0.01119676, + "auxiliary_loss_mlp": 0.01029774, + "balance_loss_clip": 1.04865646, + "balance_loss_mlp": 1.01729274, + "epoch": 0.9898692319254472, + "flos": 22930400010240.0, + "grad_norm": 2.2264423574980157, + "language_loss": 0.69985932, + "learning_rate": 1.0701614610675314e-09, + "loss": 0.72135383, + "num_input_tokens_seen": 355242000, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.12481689, + "step": 16464, + "time_per_iteration": 3.92038631439209 + }, + { + "auxiliary_loss_clip": 0.01114737, + "auxiliary_loss_mlp": 0.01025674, + "balance_loss_clip": 1.04125381, + "balance_loss_mlp": 1.01396775, + "epoch": 0.9899293551781151, + "flos": 12458346122880.0, + "grad_norm": 2.134293981483749, + "language_loss": 0.73378599, + "learning_rate": 1.0574604798421204e-09, + "loss": 0.75519007, + "num_input_tokens_seen": 355260175, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11706543, + "step": 16465, + "time_per_iteration": 2.4210457801818848 + }, + { + "auxiliary_loss_clip": 0.01113122, + "auxiliary_loss_mlp": 0.01033023, + "balance_loss_clip": 1.0434618, + "balance_loss_mlp": 1.02237809, + "epoch": 0.9899894784307831, + "flos": 26871129118080.0, + "grad_norm": 1.7729033023381713, + "language_loss": 0.86316633, + "learning_rate": 1.0448352987182386e-09, + "loss": 0.8846277, + "num_input_tokens_seen": 355281930, + "router_z_loss_clip": 0.69677734, + "router_z_loss_mlp": 0.10650635, + "step": 16466, + "time_per_iteration": 2.468665599822998 + }, + { + "auxiliary_loss_clip": 0.01115254, + "auxiliary_loss_mlp": 0.01022206, + "balance_loss_clip": 1.04335046, + "balance_loss_mlp": 1.01085782, + "epoch": 0.990049601683451, + "flos": 21542093395200.0, + "grad_norm": 1.8696714565091712, + "language_loss": 0.71792251, + "learning_rate": 1.0322859181743915e-09, + "loss": 0.73929703, + "num_input_tokens_seen": 355301555, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11346436, + "step": 16467, + "time_per_iteration": 2.461561679840088 + }, + { + "auxiliary_loss_clip": 0.01110367, + "auxiliary_loss_mlp": 0.01028474, + "balance_loss_clip": 1.04042459, + "balance_loss_mlp": 1.01799536, + "epoch": 0.990109724936119, + "flos": 28771809287040.0, + "grad_norm": 1.311828128080915, + "language_loss": 0.64885235, + "learning_rate": 1.019812338686643e-09, + "loss": 0.67024076, + "num_input_tokens_seen": 355324925, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.10479736, + "step": 16468, + "time_per_iteration": 2.575039863586426 + }, + { + "auxiliary_loss_clip": 0.01114764, + "auxiliary_loss_mlp": 0.01027202, + "balance_loss_clip": 1.04095936, + "balance_loss_mlp": 1.01566851, + "epoch": 0.9901698481887871, + "flos": 29274270687360.0, + "grad_norm": 1.7583394304982751, + "language_loss": 0.61873353, + "learning_rate": 1.0074145607281704e-09, + "loss": 0.64015317, + "num_input_tokens_seen": 355343875, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11535645, + "step": 16469, + "time_per_iteration": 2.480612277984619 + }, + { + "auxiliary_loss_clip": 0.01111775, + "auxiliary_loss_mlp": 0.01028818, + "balance_loss_clip": 1.03786993, + "balance_loss_mlp": 1.01711142, + "epoch": 0.990229971441455, + "flos": 15959025711360.0, + "grad_norm": 2.686396141970344, + "language_loss": 0.7028352, + "learning_rate": 9.950925847685976e-10, + "loss": 0.72424114, + "num_input_tokens_seen": 355358835, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.11712646, + "step": 16470, + "time_per_iteration": 2.409770965576172 + }, + { + "auxiliary_loss_clip": 0.01064071, + "auxiliary_loss_mlp": 0.01009436, + "balance_loss_clip": 1.03920209, + "balance_loss_mlp": 1.00784934, + "epoch": 0.990290094694123, + "flos": 69780287911680.0, + "grad_norm": 0.6591611924958612, + "language_loss": 0.55509818, + "learning_rate": 9.828464112755509e-10, + "loss": 0.57583326, + "num_input_tokens_seen": 355431225, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.0158844, + "step": 16471, + "time_per_iteration": 3.2772388458251953 + }, + { + "auxiliary_loss_clip": 0.0111984, + "auxiliary_loss_mlp": 0.01028917, + "balance_loss_clip": 1.04726434, + "balance_loss_mlp": 1.01704359, + "epoch": 0.9903502179467909, + "flos": 16252451913600.0, + "grad_norm": 2.1257736277812462, + "language_loss": 0.83974165, + "learning_rate": 9.706760407131032e-10, + "loss": 0.86122924, + "num_input_tokens_seen": 355448250, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11871338, + "step": 16472, + "time_per_iteration": 2.425685167312622 + }, + { + "auxiliary_loss_clip": 0.01115384, + "auxiliary_loss_mlp": 0.01025371, + "balance_loss_clip": 1.04352224, + "balance_loss_mlp": 1.01381373, + "epoch": 0.9904103411994589, + "flos": 21688393489920.0, + "grad_norm": 2.0224058241903613, + "language_loss": 0.85722053, + "learning_rate": 9.585814735431075e-10, + "loss": 0.87862813, + "num_input_tokens_seen": 355467040, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11560059, + "step": 16473, + "time_per_iteration": 2.4250171184539795 + }, + { + "auxiliary_loss_clip": 0.01111964, + "auxiliary_loss_mlp": 0.01026037, + "balance_loss_clip": 1.04048169, + "balance_loss_mlp": 1.01514173, + "epoch": 0.9904704644521268, + "flos": 25739440243200.0, + "grad_norm": 1.9454055173874505, + "language_loss": 0.84362078, + "learning_rate": 9.465627102240859e-10, + "loss": 0.86500078, + "num_input_tokens_seen": 355487825, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.10894775, + "step": 16474, + "time_per_iteration": 2.489335775375366 + }, + { + "auxiliary_loss_clip": 0.01106362, + "auxiliary_loss_mlp": 0.01033211, + "balance_loss_clip": 1.03682017, + "balance_loss_mlp": 1.02216601, + "epoch": 0.9905305877047949, + "flos": 21908346422400.0, + "grad_norm": 1.7320265751772237, + "language_loss": 0.76631147, + "learning_rate": 9.346197512116738e-10, + "loss": 0.78770721, + "num_input_tokens_seen": 355507445, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.11053467, + "step": 16475, + "time_per_iteration": 2.454343318939209 + }, + { + "auxiliary_loss_clip": 0.01107739, + "auxiliary_loss_mlp": 0.01031516, + "balance_loss_clip": 1.03552508, + "balance_loss_mlp": 1.01959491, + "epoch": 0.9905907109574628, + "flos": 21392417422080.0, + "grad_norm": 1.4883385123175124, + "language_loss": 0.7561419, + "learning_rate": 9.227525969588423e-10, + "loss": 0.77753448, + "num_input_tokens_seen": 355527205, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.1192627, + "step": 16476, + "time_per_iteration": 2.462512493133545 + }, + { + "auxiliary_loss_clip": 0.01112192, + "auxiliary_loss_mlp": 0.01032381, + "balance_loss_clip": 1.03652668, + "balance_loss_mlp": 1.01723576, + "epoch": 0.9906508342101308, + "flos": 20521620005760.0, + "grad_norm": 2.1168407234696405, + "language_loss": 0.66962194, + "learning_rate": 9.109612479154538e-10, + "loss": 0.69106764, + "num_input_tokens_seen": 355544740, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.15136719, + "step": 16477, + "time_per_iteration": 2.4724268913269043 + }, + { + "auxiliary_loss_clip": 0.01117778, + "auxiliary_loss_mlp": 0.01031045, + "balance_loss_clip": 1.04352915, + "balance_loss_mlp": 1.01818275, + "epoch": 0.9907109574627987, + "flos": 21361211481600.0, + "grad_norm": 2.853946700149272, + "language_loss": 0.71496117, + "learning_rate": 8.992457045289282e-10, + "loss": 0.73644942, + "num_input_tokens_seen": 355564385, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.12872314, + "step": 16478, + "time_per_iteration": 2.4479315280914307 + }, + { + "auxiliary_loss_clip": 0.01117841, + "auxiliary_loss_mlp": 0.01032931, + "balance_loss_clip": 1.04388523, + "balance_loss_mlp": 1.02022946, + "epoch": 0.9907710807154667, + "flos": 17338605321600.0, + "grad_norm": 2.1145488335921185, + "language_loss": 0.81005138, + "learning_rate": 8.876059672433545e-10, + "loss": 0.83155906, + "num_input_tokens_seen": 355579260, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.12713623, + "step": 16479, + "time_per_iteration": 2.4261763095855713 + }, + { + "auxiliary_loss_clip": 0.01115086, + "auxiliary_loss_mlp": 0.01029882, + "balance_loss_clip": 1.04210424, + "balance_loss_mlp": 1.01876557, + "epoch": 0.9908312039681346, + "flos": 28621881918720.0, + "grad_norm": 1.564580863270409, + "language_loss": 0.66477919, + "learning_rate": 8.760420364999355e-10, + "loss": 0.68622887, + "num_input_tokens_seen": 355599790, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11114502, + "step": 16480, + "time_per_iteration": 2.510404348373413 + }, + { + "auxiliary_loss_clip": 0.01105859, + "auxiliary_loss_mlp": 0.01024541, + "balance_loss_clip": 1.03560781, + "balance_loss_mlp": 1.01315069, + "epoch": 0.9908913272208026, + "flos": 35770654512000.0, + "grad_norm": 1.8916278888572078, + "language_loss": 0.72621882, + "learning_rate": 8.645539127374313e-10, + "loss": 0.74752277, + "num_input_tokens_seen": 355620925, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.11401367, + "step": 16481, + "time_per_iteration": 2.589266061782837 + }, + { + "auxiliary_loss_clip": 0.01110082, + "auxiliary_loss_mlp": 0.01021475, + "balance_loss_clip": 1.04010081, + "balance_loss_mlp": 1.01063299, + "epoch": 0.9909514504734707, + "flos": 19902196944000.0, + "grad_norm": 1.8337100920704212, + "language_loss": 0.77536833, + "learning_rate": 8.531415963912713e-10, + "loss": 0.79668391, + "num_input_tokens_seen": 355639165, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.10845947, + "step": 16482, + "time_per_iteration": 2.55668044090271 + }, + { + "auxiliary_loss_clip": 0.01117457, + "auxiliary_loss_mlp": 0.01027457, + "balance_loss_clip": 1.04417896, + "balance_loss_mlp": 1.01596522, + "epoch": 0.9910115737261386, + "flos": 20004793165440.0, + "grad_norm": 1.8494286378920133, + "language_loss": 0.75542468, + "learning_rate": 8.418050878944427e-10, + "loss": 0.77687377, + "num_input_tokens_seen": 355657320, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.1149292, + "step": 16483, + "time_per_iteration": 2.498185396194458 + }, + { + "auxiliary_loss_clip": 0.01041256, + "auxiliary_loss_mlp": 0.01004685, + "balance_loss_clip": 1.01649117, + "balance_loss_mlp": 1.00335753, + "epoch": 0.9910716969788066, + "flos": 70688432494080.0, + "grad_norm": 0.6736334467899294, + "language_loss": 0.53646159, + "learning_rate": 8.305443876768237e-10, + "loss": 0.55692101, + "num_input_tokens_seen": 355726370, + "router_z_loss_clip": 0.24780273, + "router_z_loss_mlp": 0.01327515, + "step": 16484, + "time_per_iteration": 3.2108912467956543 + }, + { + "auxiliary_loss_clip": 0.01113462, + "auxiliary_loss_mlp": 0.01027282, + "balance_loss_clip": 1.04529715, + "balance_loss_mlp": 1.01629126, + "epoch": 0.9911318202314745, + "flos": 21434038306560.0, + "grad_norm": 1.790967272874657, + "language_loss": 0.81767374, + "learning_rate": 8.19359496165184e-10, + "loss": 0.83908117, + "num_input_tokens_seen": 355745840, + "router_z_loss_clip": 0.68164062, + "router_z_loss_mlp": 0.10986328, + "step": 16485, + "time_per_iteration": 2.45654034614563 + }, + { + "auxiliary_loss_clip": 0.011127, + "auxiliary_loss_mlp": 0.01030616, + "balance_loss_clip": 1.0424298, + "balance_loss_mlp": 1.01837361, + "epoch": 0.9911919434841425, + "flos": 19826820253440.0, + "grad_norm": 3.4538596015751555, + "language_loss": 0.81374168, + "learning_rate": 8.082504137836288e-10, + "loss": 0.8351748, + "num_input_tokens_seen": 355763385, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.12243652, + "step": 16486, + "time_per_iteration": 2.428478479385376 + }, + { + "auxiliary_loss_clip": 0.01109126, + "auxiliary_loss_mlp": 0.01029101, + "balance_loss_clip": 1.03705072, + "balance_loss_mlp": 1.017663, + "epoch": 0.9912520667368104, + "flos": 41719364691840.0, + "grad_norm": 1.6362744730761745, + "language_loss": 0.66055596, + "learning_rate": 7.972171409538209e-10, + "loss": 0.68193823, + "num_input_tokens_seen": 355786075, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11437988, + "step": 16487, + "time_per_iteration": 2.636674404144287 + }, + { + "auxiliary_loss_clip": 0.01109977, + "auxiliary_loss_mlp": 0.01025777, + "balance_loss_clip": 1.03885758, + "balance_loss_mlp": 1.01495886, + "epoch": 0.9913121899894785, + "flos": 23769668263680.0, + "grad_norm": 1.8229383198981812, + "language_loss": 0.76655352, + "learning_rate": 7.862596780936481e-10, + "loss": 0.78791106, + "num_input_tokens_seen": 355806295, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.10821533, + "step": 16488, + "time_per_iteration": 3.8675320148468018 + }, + { + "auxiliary_loss_clip": 0.01117957, + "auxiliary_loss_mlp": 0.01029713, + "balance_loss_clip": 1.03946996, + "balance_loss_mlp": 1.0169102, + "epoch": 0.9913723132421464, + "flos": 23769668263680.0, + "grad_norm": 2.2374182778383047, + "language_loss": 0.68674165, + "learning_rate": 7.753780256190001e-10, + "loss": 0.70821834, + "num_input_tokens_seen": 355825730, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.12811279, + "step": 16489, + "time_per_iteration": 2.4221251010894775 + }, + { + "auxiliary_loss_clip": 0.01049293, + "auxiliary_loss_mlp": 0.01002214, + "balance_loss_clip": 1.0247438, + "balance_loss_mlp": 1.00099039, + "epoch": 0.9914324364948144, + "flos": 71267419820160.0, + "grad_norm": 0.6038765804714675, + "language_loss": 0.52572107, + "learning_rate": 7.645721839424357e-10, + "loss": 0.5462361, + "num_input_tokens_seen": 355891545, + "router_z_loss_clip": 0.24536133, + "router_z_loss_mlp": 0.01223755, + "step": 16490, + "time_per_iteration": 3.149430513381958 + }, + { + "auxiliary_loss_clip": 0.01118893, + "auxiliary_loss_mlp": 0.01034696, + "balance_loss_clip": 1.04274249, + "balance_loss_mlp": 1.02190495, + "epoch": 0.9914925597474823, + "flos": 23695440808320.0, + "grad_norm": 1.7097435251830668, + "language_loss": 0.75615215, + "learning_rate": 7.538421534734052e-10, + "loss": 0.77768797, + "num_input_tokens_seen": 355909920, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12786865, + "step": 16491, + "time_per_iteration": 2.4588778018951416 + }, + { + "auxiliary_loss_clip": 0.01126625, + "auxiliary_loss_mlp": 0.01032969, + "balance_loss_clip": 1.05098391, + "balance_loss_mlp": 1.0198921, + "epoch": 0.9915526830001503, + "flos": 13433822749440.0, + "grad_norm": 2.3432036011297144, + "language_loss": 0.70574933, + "learning_rate": 7.431879346191383e-10, + "loss": 0.72734523, + "num_input_tokens_seen": 355923130, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.13085938, + "step": 16492, + "time_per_iteration": 2.478140115737915 + }, + { + "auxiliary_loss_clip": 0.01123693, + "auxiliary_loss_mlp": 0.0102733, + "balance_loss_clip": 1.04997182, + "balance_loss_mlp": 1.01537335, + "epoch": 0.9916128062528182, + "flos": 20740962407040.0, + "grad_norm": 2.4013749476441073, + "language_loss": 0.67925775, + "learning_rate": 7.326095277837563e-10, + "loss": 0.70076799, + "num_input_tokens_seen": 355941960, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11950684, + "step": 16493, + "time_per_iteration": 2.5399253368377686 + }, + { + "auxiliary_loss_clip": 0.01116398, + "auxiliary_loss_mlp": 0.0104141, + "balance_loss_clip": 1.04017746, + "balance_loss_mlp": 1.02690852, + "epoch": 0.9916729295054862, + "flos": 22487082353280.0, + "grad_norm": 1.9180005778942018, + "language_loss": 0.71412075, + "learning_rate": 7.221069333678276e-10, + "loss": 0.73569882, + "num_input_tokens_seen": 355961640, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.14501953, + "step": 16494, + "time_per_iteration": 2.525979518890381 + }, + { + "auxiliary_loss_clip": 0.01122776, + "auxiliary_loss_mlp": 0.0102937, + "balance_loss_clip": 1.04632378, + "balance_loss_mlp": 1.0163641, + "epoch": 0.9917330527581543, + "flos": 14792467708800.0, + "grad_norm": 2.6037456318097867, + "language_loss": 0.68059009, + "learning_rate": 7.116801517701443e-10, + "loss": 0.7021116, + "num_input_tokens_seen": 355977980, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.13000488, + "step": 16495, + "time_per_iteration": 2.4436707496643066 + }, + { + "auxiliary_loss_clip": 0.01039898, + "auxiliary_loss_mlp": 0.01001766, + "balance_loss_clip": 1.01549637, + "balance_loss_mlp": 1.00047159, + "epoch": 0.9917931760108222, + "flos": 59191595585280.0, + "grad_norm": 0.7134970945254889, + "language_loss": 0.53434134, + "learning_rate": 7.013291833859458e-10, + "loss": 0.55475795, + "num_input_tokens_seen": 356042900, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.01295471, + "step": 16496, + "time_per_iteration": 4.618606805801392 + }, + { + "auxiliary_loss_clip": 0.0111423, + "auxiliary_loss_mlp": 0.01030762, + "balance_loss_clip": 1.03897536, + "balance_loss_mlp": 1.01806629, + "epoch": 0.9918532992634902, + "flos": 26761637485440.0, + "grad_norm": 1.913430528070253, + "language_loss": 0.71666932, + "learning_rate": 6.91054028607585e-10, + "loss": 0.73811918, + "num_input_tokens_seen": 356063000, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.12695312, + "step": 16497, + "time_per_iteration": 2.494574785232544 + }, + { + "auxiliary_loss_clip": 0.01124376, + "auxiliary_loss_mlp": 0.01035987, + "balance_loss_clip": 1.04664421, + "balance_loss_mlp": 1.02217698, + "epoch": 0.9919134225161581, + "flos": 14975719920000.0, + "grad_norm": 2.0171446076749913, + "language_loss": 0.82137853, + "learning_rate": 6.808546878249721e-10, + "loss": 0.84298217, + "num_input_tokens_seen": 356078130, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.13806152, + "step": 16498, + "time_per_iteration": 2.429616928100586 + }, + { + "auxiliary_loss_clip": 0.01120811, + "auxiliary_loss_mlp": 0.0103134, + "balance_loss_clip": 1.04746604, + "balance_loss_mlp": 1.01992023, + "epoch": 0.9919735457688261, + "flos": 27818201064960.0, + "grad_norm": 1.7449646533111172, + "language_loss": 0.68183607, + "learning_rate": 6.707311614246869e-10, + "loss": 0.70335758, + "num_input_tokens_seen": 356101655, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11413574, + "step": 16499, + "time_per_iteration": 2.512425422668457 + }, + { + "auxiliary_loss_clip": 0.01117317, + "auxiliary_loss_mlp": 0.01025452, + "balance_loss_clip": 1.04394948, + "balance_loss_mlp": 1.01398957, + "epoch": 0.992033669021494, + "flos": 22562782266240.0, + "grad_norm": 1.92830909070749, + "language_loss": 0.82872248, + "learning_rate": 6.606834497904223e-10, + "loss": 0.85015011, + "num_input_tokens_seen": 356121425, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11462402, + "step": 16500, + "time_per_iteration": 3.860097646713257 + }, + { + "auxiliary_loss_clip": 0.01119589, + "auxiliary_loss_mlp": 0.01029388, + "balance_loss_clip": 1.04545617, + "balance_loss_mlp": 1.0174371, + "epoch": 0.9920937922741621, + "flos": 25374587846400.0, + "grad_norm": 1.6860057584335857, + "language_loss": 0.81836641, + "learning_rate": 6.507115533036511e-10, + "loss": 0.83985615, + "num_input_tokens_seen": 356140710, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11956787, + "step": 16501, + "time_per_iteration": 2.5130319595336914 + }, + { + "auxiliary_loss_clip": 0.01115842, + "auxiliary_loss_mlp": 0.01029471, + "balance_loss_clip": 1.04343462, + "balance_loss_mlp": 1.01794958, + "epoch": 0.99215391552683, + "flos": 22054466949120.0, + "grad_norm": 1.7720720720352665, + "language_loss": 0.76795542, + "learning_rate": 6.408154723420711e-10, + "loss": 0.78940856, + "num_input_tokens_seen": 356159835, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11529541, + "step": 16502, + "time_per_iteration": 2.430111885070801 + }, + { + "auxiliary_loss_clip": 0.01115288, + "auxiliary_loss_mlp": 0.01032506, + "balance_loss_clip": 1.04025269, + "balance_loss_mlp": 1.01878524, + "epoch": 0.992214038779498, + "flos": 15413937845760.0, + "grad_norm": 3.5120665649243885, + "language_loss": 0.71692336, + "learning_rate": 6.309952072811597e-10, + "loss": 0.73840129, + "num_input_tokens_seen": 356177555, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.13708496, + "step": 16503, + "time_per_iteration": 2.442837715148926 + }, + { + "auxiliary_loss_clip": 0.01035122, + "auxiliary_loss_mlp": 0.01002237, + "balance_loss_clip": 1.01025939, + "balance_loss_mlp": 1.00086474, + "epoch": 0.9922741620321659, + "flos": 62014498467840.0, + "grad_norm": 0.6303431268416747, + "language_loss": 0.55069345, + "learning_rate": 6.212507584932858e-10, + "loss": 0.57106698, + "num_input_tokens_seen": 356244975, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.01373291, + "step": 16504, + "time_per_iteration": 3.166287660598755 + }, + { + "auxiliary_loss_clip": 0.01110478, + "auxiliary_loss_mlp": 0.01026217, + "balance_loss_clip": 1.03821635, + "balance_loss_mlp": 1.01513052, + "epoch": 0.9923342852848339, + "flos": 17165480745600.0, + "grad_norm": 1.925693327158103, + "language_loss": 0.69197899, + "learning_rate": 6.115821263481536e-10, + "loss": 0.71334594, + "num_input_tokens_seen": 356262605, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11090088, + "step": 16505, + "time_per_iteration": 2.4780642986297607 + }, + { + "auxiliary_loss_clip": 0.01116929, + "auxiliary_loss_mlp": 0.01028548, + "balance_loss_clip": 1.04173684, + "balance_loss_mlp": 1.01584017, + "epoch": 0.9923944085375018, + "flos": 23183210908800.0, + "grad_norm": 2.110861111268963, + "language_loss": 0.65867347, + "learning_rate": 6.019893112119146e-10, + "loss": 0.68012822, + "num_input_tokens_seen": 356278935, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12689209, + "step": 16506, + "time_per_iteration": 2.4553632736206055 + }, + { + "auxiliary_loss_clip": 0.01113625, + "auxiliary_loss_mlp": 0.01026066, + "balance_loss_clip": 1.04301202, + "balance_loss_mlp": 1.01386547, + "epoch": 0.9924545317901698, + "flos": 20813861059200.0, + "grad_norm": 2.0295834480264077, + "language_loss": 0.63130355, + "learning_rate": 5.924723134487219e-10, + "loss": 0.65270042, + "num_input_tokens_seen": 356295675, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.12200928, + "step": 16507, + "time_per_iteration": 2.5026650428771973 + }, + { + "auxiliary_loss_clip": 0.01114246, + "auxiliary_loss_mlp": 0.01033401, + "balance_loss_clip": 1.04079819, + "balance_loss_mlp": 1.02102113, + "epoch": 0.9925146550428379, + "flos": 20083437993600.0, + "grad_norm": 2.1677748756733592, + "language_loss": 0.73044682, + "learning_rate": 5.830311334193983e-10, + "loss": 0.75192326, + "num_input_tokens_seen": 356312885, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.12371826, + "step": 16508, + "time_per_iteration": 3.901071310043335 + }, + { + "auxiliary_loss_clip": 0.01114171, + "auxiliary_loss_mlp": 0.0102753, + "balance_loss_clip": 1.039644, + "balance_loss_mlp": 1.01530492, + "epoch": 0.9925747782955058, + "flos": 24973717086720.0, + "grad_norm": 1.628601475095948, + "language_loss": 0.70135808, + "learning_rate": 5.736657714818793e-10, + "loss": 0.7227751, + "num_input_tokens_seen": 356334070, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.12213135, + "step": 16509, + "time_per_iteration": 2.4883873462677 + }, + { + "auxiliary_loss_clip": 0.01117365, + "auxiliary_loss_mlp": 0.01032468, + "balance_loss_clip": 1.04474902, + "balance_loss_mlp": 1.02031493, + "epoch": 0.9926349015481738, + "flos": 60472526492160.0, + "grad_norm": 6.436518939807403, + "language_loss": 0.68268359, + "learning_rate": 5.643762279912146e-10, + "loss": 0.70418197, + "num_input_tokens_seen": 356359410, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.12158203, + "step": 16510, + "time_per_iteration": 2.8018789291381836 + }, + { + "auxiliary_loss_clip": 0.01123258, + "auxiliary_loss_mlp": 0.01035877, + "balance_loss_clip": 1.04552841, + "balance_loss_mlp": 1.02374792, + "epoch": 0.9926950248008417, + "flos": 20741716592640.0, + "grad_norm": 7.421708264762493, + "language_loss": 0.80861747, + "learning_rate": 5.551625032997886e-10, + "loss": 0.83020878, + "num_input_tokens_seen": 356378345, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.12115479, + "step": 16511, + "time_per_iteration": 2.400531768798828 + }, + { + "auxiliary_loss_clip": 0.01114444, + "auxiliary_loss_mlp": 0.01026602, + "balance_loss_clip": 1.04429722, + "balance_loss_mlp": 1.01569486, + "epoch": 0.9927551480535097, + "flos": 24352965221760.0, + "grad_norm": 1.814855923686413, + "language_loss": 0.91320157, + "learning_rate": 5.460245977570998e-10, + "loss": 0.93461204, + "num_input_tokens_seen": 356397345, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.10900879, + "step": 16512, + "time_per_iteration": 2.4989781379699707 + }, + { + "auxiliary_loss_clip": 0.01056671, + "auxiliary_loss_mlp": 0.01003257, + "balance_loss_clip": 1.03126717, + "balance_loss_mlp": 1.00199032, + "epoch": 0.9928152713061776, + "flos": 71275572207360.0, + "grad_norm": 0.754193252917025, + "language_loss": 0.55220115, + "learning_rate": 5.369625117095378e-10, + "loss": 0.5728004, + "num_input_tokens_seen": 356459160, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.01266479, + "step": 16513, + "time_per_iteration": 3.227463960647583 + }, + { + "auxiliary_loss_clip": 0.01114021, + "auxiliary_loss_mlp": 0.01028331, + "balance_loss_clip": 1.04307961, + "balance_loss_mlp": 1.0170598, + "epoch": 0.9928753945588457, + "flos": 57809499045120.0, + "grad_norm": 1.5510149910225386, + "language_loss": 0.64922118, + "learning_rate": 5.279762455006054e-10, + "loss": 0.67064464, + "num_input_tokens_seen": 356486405, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.11273193, + "step": 16514, + "time_per_iteration": 2.788012981414795 + }, + { + "auxiliary_loss_clip": 0.01121107, + "auxiliary_loss_mlp": 0.01025023, + "balance_loss_clip": 1.04674101, + "balance_loss_mlp": 1.01231551, + "epoch": 0.9929355178115136, + "flos": 19568981450880.0, + "grad_norm": 2.0372337617368466, + "language_loss": 0.73205018, + "learning_rate": 5.190657994713632e-10, + "loss": 0.75351155, + "num_input_tokens_seen": 356502905, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.12701416, + "step": 16515, + "time_per_iteration": 2.3991799354553223 + }, + { + "auxiliary_loss_clip": 0.01116384, + "auxiliary_loss_mlp": 0.01029214, + "balance_loss_clip": 1.04536283, + "balance_loss_mlp": 1.01773453, + "epoch": 0.9929956410641816, + "flos": 22964658606720.0, + "grad_norm": 1.4569867763658595, + "language_loss": 0.77263373, + "learning_rate": 5.102311739593191e-10, + "loss": 0.79408973, + "num_input_tokens_seen": 356523830, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.11474609, + "step": 16516, + "time_per_iteration": 2.4751977920532227 + }, + { + "auxiliary_loss_clip": 0.01110322, + "auxiliary_loss_mlp": 0.01027375, + "balance_loss_clip": 1.04074478, + "balance_loss_mlp": 1.01644897, + "epoch": 0.9930557643168495, + "flos": 22566409539840.0, + "grad_norm": 1.610598120987354, + "language_loss": 0.78098154, + "learning_rate": 5.014723692997602e-10, + "loss": 0.80235851, + "num_input_tokens_seen": 356543965, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.10925293, + "step": 16517, + "time_per_iteration": 2.4358620643615723 + }, + { + "auxiliary_loss_clip": 0.01121616, + "auxiliary_loss_mlp": 0.01037494, + "balance_loss_clip": 1.04296052, + "balance_loss_mlp": 1.02409458, + "epoch": 0.9931158875695175, + "flos": 17201032231680.0, + "grad_norm": 2.296841866726778, + "language_loss": 0.67562175, + "learning_rate": 4.927893858248655e-10, + "loss": 0.69721282, + "num_input_tokens_seen": 356561530, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.1340332, + "step": 16518, + "time_per_iteration": 2.4394261837005615 + }, + { + "auxiliary_loss_clip": 0.01064302, + "auxiliary_loss_mlp": 0.01007842, + "balance_loss_clip": 1.03917551, + "balance_loss_mlp": 1.00635231, + "epoch": 0.9931760108221854, + "flos": 63711204278400.0, + "grad_norm": 0.7363319851031781, + "language_loss": 0.5346365, + "learning_rate": 4.84182223863483e-10, + "loss": 0.55535793, + "num_input_tokens_seen": 356616845, + "router_z_loss_clip": 0.2512207, + "router_z_loss_mlp": 0.01489258, + "step": 16519, + "time_per_iteration": 2.947262763977051 + }, + { + "auxiliary_loss_clip": 0.01115526, + "auxiliary_loss_mlp": 0.01027879, + "balance_loss_clip": 1.04417849, + "balance_loss_mlp": 1.01649511, + "epoch": 0.9932361340748534, + "flos": 15304805349120.0, + "grad_norm": 1.7300069754207856, + "language_loss": 0.60121, + "learning_rate": 4.756508837426842e-10, + "loss": 0.62264407, + "num_input_tokens_seen": 356633560, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.11383057, + "step": 16520, + "time_per_iteration": 2.43717885017395 + }, + { + "auxiliary_loss_clip": 0.01118682, + "auxiliary_loss_mlp": 0.01028708, + "balance_loss_clip": 1.04726005, + "balance_loss_mlp": 1.01765692, + "epoch": 0.9932962573275215, + "flos": 36064906727040.0, + "grad_norm": 1.6101166189428062, + "language_loss": 0.62016803, + "learning_rate": 4.671953657853223e-10, + "loss": 0.64164197, + "num_input_tokens_seen": 356657600, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11047363, + "step": 16521, + "time_per_iteration": 2.5943291187286377 + }, + { + "auxiliary_loss_clip": 0.01116689, + "auxiliary_loss_mlp": 0.01032604, + "balance_loss_clip": 1.04274797, + "balance_loss_mlp": 1.02036679, + "epoch": 0.9933563805801894, + "flos": 21470523546240.0, + "grad_norm": 1.7783563018254478, + "language_loss": 0.74291426, + "learning_rate": 4.5881567031225145e-10, + "loss": 0.76440716, + "num_input_tokens_seen": 356675880, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.12231445, + "step": 16522, + "time_per_iteration": 2.455664873123169 + }, + { + "auxiliary_loss_clip": 0.01112373, + "auxiliary_loss_mlp": 0.01025296, + "balance_loss_clip": 1.04199469, + "balance_loss_mlp": 1.01424551, + "epoch": 0.9934165038328574, + "flos": 23986532626560.0, + "grad_norm": 1.6857995244162185, + "language_loss": 0.73264372, + "learning_rate": 4.5051179764143964e-10, + "loss": 0.75402039, + "num_input_tokens_seen": 356696000, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.1105957, + "step": 16523, + "time_per_iteration": 2.4718191623687744 + }, + { + "auxiliary_loss_clip": 0.01113653, + "auxiliary_loss_mlp": 0.0102644, + "balance_loss_clip": 1.04274702, + "balance_loss_mlp": 1.01506114, + "epoch": 0.9934766270855253, + "flos": 21907807718400.0, + "grad_norm": 1.611755113195226, + "language_loss": 0.7124595, + "learning_rate": 4.422837480875241e-10, + "loss": 0.73386043, + "num_input_tokens_seen": 356716845, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.11383057, + "step": 16524, + "time_per_iteration": 2.499838352203369 + }, + { + "auxiliary_loss_clip": 0.01113869, + "auxiliary_loss_mlp": 0.0103134, + "balance_loss_clip": 1.04287839, + "balance_loss_mlp": 1.02000284, + "epoch": 0.9935367503381933, + "flos": 17129139160320.0, + "grad_norm": 5.439342655929774, + "language_loss": 0.79804325, + "learning_rate": 4.341315219624775e-10, + "loss": 0.81949532, + "num_input_tokens_seen": 356732100, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.11334229, + "step": 16525, + "time_per_iteration": 2.419621229171753 + }, + { + "auxiliary_loss_clip": 0.01119982, + "auxiliary_loss_mlp": 0.01027744, + "balance_loss_clip": 1.0479486, + "balance_loss_mlp": 1.01588857, + "epoch": 0.9935968735908612, + "flos": 22346241125760.0, + "grad_norm": 2.019229506796379, + "language_loss": 0.75268722, + "learning_rate": 4.2605511957582995e-10, + "loss": 0.77416444, + "num_input_tokens_seen": 356751480, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.1184082, + "step": 16526, + "time_per_iteration": 2.484320878982544 + }, + { + "auxiliary_loss_clip": 0.01110293, + "auxiliary_loss_mlp": 0.01026834, + "balance_loss_clip": 1.03993797, + "balance_loss_mlp": 1.01606309, + "epoch": 0.9936569968435293, + "flos": 29460539640960.0, + "grad_norm": 1.7000359317109957, + "language_loss": 0.7250644, + "learning_rate": 4.180545412333369e-10, + "loss": 0.74643564, + "num_input_tokens_seen": 356772650, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.10766602, + "step": 16527, + "time_per_iteration": 2.510662794113159 + }, + { + "auxiliary_loss_clip": 0.01114218, + "auxiliary_loss_mlp": 0.01025655, + "balance_loss_clip": 1.04158413, + "balance_loss_mlp": 1.01460981, + "epoch": 0.9937171200961972, + "flos": 16544046522240.0, + "grad_norm": 2.269284155576705, + "language_loss": 0.76038164, + "learning_rate": 4.1012978723875547e-10, + "loss": 0.78178036, + "num_input_tokens_seen": 356788510, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.1104126, + "step": 16528, + "time_per_iteration": 2.5507850646972656 + }, + { + "auxiliary_loss_clip": 0.01117478, + "auxiliary_loss_mlp": 0.0102657, + "balance_loss_clip": 1.04479814, + "balance_loss_mlp": 1.01427937, + "epoch": 0.9937772433488652, + "flos": 24390276474240.0, + "grad_norm": 2.6383807428814885, + "language_loss": 0.67253077, + "learning_rate": 4.022808578922898e-10, + "loss": 0.69397128, + "num_input_tokens_seen": 356809115, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.12304688, + "step": 16529, + "time_per_iteration": 2.4737038612365723 + }, + { + "auxiliary_loss_clip": 0.01118435, + "auxiliary_loss_mlp": 0.01033306, + "balance_loss_clip": 1.04091167, + "balance_loss_mlp": 1.01958489, + "epoch": 0.9938373666015331, + "flos": 15669909141120.0, + "grad_norm": 2.1029076753897598, + "language_loss": 0.65601993, + "learning_rate": 3.9450775349170186e-10, + "loss": 0.67753732, + "num_input_tokens_seen": 356826410, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.13720703, + "step": 16530, + "time_per_iteration": 2.5209455490112305 + }, + { + "auxiliary_loss_clip": 0.01111088, + "auxiliary_loss_mlp": 0.01027829, + "balance_loss_clip": 1.03858709, + "balance_loss_mlp": 1.01683187, + "epoch": 0.9938974898542011, + "flos": 19496190539520.0, + "grad_norm": 2.62095313808638, + "language_loss": 0.7116394, + "learning_rate": 3.8681047433186676e-10, + "loss": 0.73302853, + "num_input_tokens_seen": 356844990, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.10998535, + "step": 16531, + "time_per_iteration": 2.405447244644165 + }, + { + "auxiliary_loss_clip": 0.0111694, + "auxiliary_loss_mlp": 0.01028823, + "balance_loss_clip": 1.04312205, + "balance_loss_mlp": 1.01684868, + "epoch": 0.993957613106869, + "flos": 26906896085760.0, + "grad_norm": 1.3895956984479045, + "language_loss": 0.7460174, + "learning_rate": 3.791890207045512e-10, + "loss": 0.76747501, + "num_input_tokens_seen": 356866530, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11968994, + "step": 16532, + "time_per_iteration": 4.035926818847656 + }, + { + "auxiliary_loss_clip": 0.01105347, + "auxiliary_loss_mlp": 0.01026075, + "balance_loss_clip": 1.03955841, + "balance_loss_mlp": 1.01579928, + "epoch": 0.994017736359537, + "flos": 14939593816320.0, + "grad_norm": 2.7132531407879292, + "language_loss": 0.70666552, + "learning_rate": 3.7164339289885717e-10, + "loss": 0.72797966, + "num_input_tokens_seen": 356884660, + "router_z_loss_clip": 0.65869141, + "router_z_loss_mlp": 0.10284424, + "step": 16533, + "time_per_iteration": 2.410980701446533 + }, + { + "auxiliary_loss_clip": 0.01117733, + "auxiliary_loss_mlp": 0.01031112, + "balance_loss_clip": 1.04227531, + "balance_loss_mlp": 1.01836896, + "epoch": 0.9940778596122051, + "flos": 15377883569280.0, + "grad_norm": 1.9669802824511446, + "language_loss": 0.84335971, + "learning_rate": 3.641735912007782e-10, + "loss": 0.8648482, + "num_input_tokens_seen": 356900895, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12750244, + "step": 16534, + "time_per_iteration": 2.4392504692077637 + }, + { + "auxiliary_loss_clip": 0.01107209, + "auxiliary_loss_mlp": 0.01028046, + "balance_loss_clip": 1.04016733, + "balance_loss_mlp": 1.01748395, + "epoch": 0.994137982864873, + "flos": 25228108183680.0, + "grad_norm": 1.394737661685915, + "language_loss": 0.65742016, + "learning_rate": 3.567796158934211e-10, + "loss": 0.67877269, + "num_input_tokens_seen": 356920985, + "router_z_loss_clip": 0.66943359, + "router_z_loss_mlp": 0.10559082, + "step": 16535, + "time_per_iteration": 2.5066068172454834 + }, + { + "auxiliary_loss_clip": 0.01113437, + "auxiliary_loss_mlp": 0.0102565, + "balance_loss_clip": 1.04381716, + "balance_loss_mlp": 1.01501036, + "epoch": 0.994198106117541, + "flos": 18442140912000.0, + "grad_norm": 1.5394946035868438, + "language_loss": 0.64934576, + "learning_rate": 3.4946146725767235e-10, + "loss": 0.67073655, + "num_input_tokens_seen": 356939800, + "router_z_loss_clip": 0.69677734, + "router_z_loss_mlp": 0.10632324, + "step": 16536, + "time_per_iteration": 2.432224750518799 + }, + { + "auxiliary_loss_clip": 0.011062, + "auxiliary_loss_mlp": 0.01029222, + "balance_loss_clip": 1.03539729, + "balance_loss_mlp": 1.01663923, + "epoch": 0.9942582293702089, + "flos": 16654112772480.0, + "grad_norm": 1.9395312714252921, + "language_loss": 0.78991592, + "learning_rate": 3.4221914557064357e-10, + "loss": 0.81127012, + "num_input_tokens_seen": 356957780, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.12567139, + "step": 16537, + "time_per_iteration": 2.4572525024414062 + }, + { + "auxiliary_loss_clip": 0.01114942, + "auxiliary_loss_mlp": 0.01034904, + "balance_loss_clip": 1.03736424, + "balance_loss_mlp": 1.02194619, + "epoch": 0.9943183526228769, + "flos": 21944580266880.0, + "grad_norm": 1.6472624069644077, + "language_loss": 0.68867981, + "learning_rate": 3.35052651107004e-10, + "loss": 0.71017838, + "num_input_tokens_seen": 356979185, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.12969971, + "step": 16538, + "time_per_iteration": 2.4659087657928467 + }, + { + "auxiliary_loss_clip": 0.01114136, + "auxiliary_loss_mlp": 0.01028713, + "balance_loss_clip": 1.04492736, + "balance_loss_mlp": 1.01796627, + "epoch": 0.9943784758755448, + "flos": 23842566915840.0, + "grad_norm": 1.8630762626973905, + "language_loss": 0.75493896, + "learning_rate": 3.2796198413853614e-10, + "loss": 0.77636743, + "num_input_tokens_seen": 356997735, + "router_z_loss_clip": 0.69189453, + "router_z_loss_mlp": 0.10748291, + "step": 16539, + "time_per_iteration": 2.459277629852295 + }, + { + "auxiliary_loss_clip": 0.01119558, + "auxiliary_loss_mlp": 0.01030971, + "balance_loss_clip": 1.04634583, + "balance_loss_mlp": 1.01930642, + "epoch": 0.9944385991282129, + "flos": 21469984842240.0, + "grad_norm": 1.9639346399613997, + "language_loss": 0.70321476, + "learning_rate": 3.209471449341361e-10, + "loss": 0.72472, + "num_input_tokens_seen": 357015660, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11669922, + "step": 16540, + "time_per_iteration": 3.9557743072509766 + }, + { + "auxiliary_loss_clip": 0.01112779, + "auxiliary_loss_mlp": 0.01024532, + "balance_loss_clip": 1.04180169, + "balance_loss_mlp": 1.01291502, + "epoch": 0.9944987223808808, + "flos": 22927024131840.0, + "grad_norm": 1.730883225977833, + "language_loss": 0.75597203, + "learning_rate": 3.140081337600353e-10, + "loss": 0.77734518, + "num_input_tokens_seen": 357034800, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.1161499, + "step": 16541, + "time_per_iteration": 2.5336601734161377 + }, + { + "auxiliary_loss_clip": 0.01106577, + "auxiliary_loss_mlp": 0.01031501, + "balance_loss_clip": 1.03584242, + "balance_loss_mlp": 1.01956201, + "epoch": 0.9945588456335488, + "flos": 22383013674240.0, + "grad_norm": 2.087042042147385, + "language_loss": 0.7685833, + "learning_rate": 3.0714495087891255e-10, + "loss": 0.78996402, + "num_input_tokens_seen": 357053785, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.1192627, + "step": 16542, + "time_per_iteration": 2.5225961208343506 + }, + { + "auxiliary_loss_clip": 0.01115029, + "auxiliary_loss_mlp": 0.01028513, + "balance_loss_clip": 1.04005408, + "balance_loss_mlp": 1.01605558, + "epoch": 0.9946189688862167, + "flos": 21397517153280.0, + "grad_norm": 4.81126755468948, + "language_loss": 0.74339163, + "learning_rate": 3.0035759655122615e-10, + "loss": 0.76482713, + "num_input_tokens_seen": 357072025, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12438965, + "step": 16543, + "time_per_iteration": 2.4477615356445312 + }, + { + "auxiliary_loss_clip": 0.01114165, + "auxiliary_loss_mlp": 0.01033035, + "balance_loss_clip": 1.03868914, + "balance_loss_mlp": 1.01913524, + "epoch": 0.9946790921388847, + "flos": 12416545670400.0, + "grad_norm": 2.442135178091814, + "language_loss": 0.82358742, + "learning_rate": 2.9364607103454785e-10, + "loss": 0.84505939, + "num_input_tokens_seen": 357086960, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.13897705, + "step": 16544, + "time_per_iteration": 3.819368600845337 + }, + { + "auxiliary_loss_clip": 0.01106957, + "auxiliary_loss_mlp": 0.01029419, + "balance_loss_clip": 1.03612351, + "balance_loss_mlp": 1.0166223, + "epoch": 0.9947392153915526, + "flos": 19058295836160.0, + "grad_norm": 1.8233302410112453, + "language_loss": 0.79158735, + "learning_rate": 2.870103745831187e-10, + "loss": 0.81295109, + "num_input_tokens_seen": 357105095, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.12792969, + "step": 16545, + "time_per_iteration": 2.3993642330169678 + }, + { + "auxiliary_loss_clip": 0.01114937, + "auxiliary_loss_mlp": 0.01028005, + "balance_loss_clip": 1.04147494, + "balance_loss_mlp": 1.01595306, + "epoch": 0.9947993386442207, + "flos": 27308808339840.0, + "grad_norm": 1.7290695152831244, + "language_loss": 0.72379529, + "learning_rate": 2.8045050744873733e-10, + "loss": 0.74522471, + "num_input_tokens_seen": 357125065, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.1204834, + "step": 16546, + "time_per_iteration": 2.4879815578460693 + }, + { + "auxiliary_loss_clip": 0.01107548, + "auxiliary_loss_mlp": 0.01031605, + "balance_loss_clip": 1.03855288, + "balance_loss_mlp": 1.02066112, + "epoch": 0.9948594618968887, + "flos": 20806498771200.0, + "grad_norm": 2.285651777956382, + "language_loss": 0.77883202, + "learning_rate": 2.739664698798716e-10, + "loss": 0.80022353, + "num_input_tokens_seen": 357141600, + "router_z_loss_clip": 0.69042969, + "router_z_loss_mlp": 0.10949707, + "step": 16547, + "time_per_iteration": 2.397287130355835 + }, + { + "auxiliary_loss_clip": 0.01113235, + "auxiliary_loss_mlp": 0.01028385, + "balance_loss_clip": 1.04128838, + "balance_loss_mlp": 1.01760244, + "epoch": 0.9949195851495566, + "flos": 23292953936640.0, + "grad_norm": 2.172242651767816, + "language_loss": 0.70427048, + "learning_rate": 2.67558262122769e-10, + "loss": 0.72568667, + "num_input_tokens_seen": 357157880, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.10784912, + "step": 16548, + "time_per_iteration": 2.454721450805664 + }, + { + "auxiliary_loss_clip": 0.01114458, + "auxiliary_loss_mlp": 0.01032704, + "balance_loss_clip": 1.0410471, + "balance_loss_mlp": 1.01999617, + "epoch": 0.9949797084022246, + "flos": 18515470527360.0, + "grad_norm": 1.7255693028773493, + "language_loss": 0.75353992, + "learning_rate": 2.6122588442012427e-10, + "loss": 0.77501148, + "num_input_tokens_seen": 357176705, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.12713623, + "step": 16549, + "time_per_iteration": 2.4035115242004395 + }, + { + "auxiliary_loss_clip": 0.0111915, + "auxiliary_loss_mlp": 0.01037259, + "balance_loss_clip": 1.04389834, + "balance_loss_mlp": 1.02330554, + "epoch": 0.9950398316548925, + "flos": 30407719328640.0, + "grad_norm": 1.7198568524179623, + "language_loss": 0.7451961, + "learning_rate": 2.5496933701241177e-10, + "loss": 0.76676017, + "num_input_tokens_seen": 357197630, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.13934326, + "step": 16550, + "time_per_iteration": 2.4947848320007324 + }, + { + "auxiliary_loss_clip": 0.01115704, + "auxiliary_loss_mlp": 0.01032053, + "balance_loss_clip": 1.04109466, + "balance_loss_mlp": 1.02002501, + "epoch": 0.9950999549075605, + "flos": 19900868140800.0, + "grad_norm": 1.7811048975326909, + "language_loss": 0.78118563, + "learning_rate": 2.4878862013655297e-10, + "loss": 0.80266321, + "num_input_tokens_seen": 357215445, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.12030029, + "step": 16551, + "time_per_iteration": 3.8778011798858643 + }, + { + "auxiliary_loss_clip": 0.01103132, + "auxiliary_loss_mlp": 0.0102812, + "balance_loss_clip": 1.0369637, + "balance_loss_mlp": 1.01810086, + "epoch": 0.9951600781602284, + "flos": 17603555016960.0, + "grad_norm": 1.6357580420218738, + "language_loss": 0.66693723, + "learning_rate": 2.426837340270271e-10, + "loss": 0.68824977, + "num_input_tokens_seen": 357234285, + "router_z_loss_clip": 0.66162109, + "router_z_loss_mlp": 0.10021973, + "step": 16552, + "time_per_iteration": 2.4454917907714844 + }, + { + "auxiliary_loss_clip": 0.01115383, + "auxiliary_loss_mlp": 0.01025294, + "balance_loss_clip": 1.04192996, + "balance_loss_mlp": 1.01381397, + "epoch": 0.9952202014128965, + "flos": 28950715952640.0, + "grad_norm": 2.132912587747017, + "language_loss": 0.81402051, + "learning_rate": 2.3665467891520465e-10, + "loss": 0.83542728, + "num_input_tokens_seen": 357257565, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11486816, + "step": 16553, + "time_per_iteration": 2.4806203842163086 + }, + { + "auxiliary_loss_clip": 0.01040196, + "auxiliary_loss_mlp": 0.01001362, + "balance_loss_clip": 1.01547527, + "balance_loss_mlp": 1.00002551, + "epoch": 0.9952803246655644, + "flos": 70810386145920.0, + "grad_norm": 0.71531814503563, + "language_loss": 0.57275105, + "learning_rate": 2.3070145503001348e-10, + "loss": 0.59316671, + "num_input_tokens_seen": 357320205, + "router_z_loss_clip": 0.24755859, + "router_z_loss_mlp": 0.0133667, + "step": 16554, + "time_per_iteration": 3.1617236137390137 + }, + { + "auxiliary_loss_clip": 0.01118264, + "auxiliary_loss_mlp": 0.01028198, + "balance_loss_clip": 1.04548764, + "balance_loss_mlp": 1.01727879, + "epoch": 0.9953404479182324, + "flos": 21799070271360.0, + "grad_norm": 1.9237723961696593, + "language_loss": 0.77113771, + "learning_rate": 2.24824062597051e-10, + "loss": 0.79260236, + "num_input_tokens_seen": 357340695, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.10919189, + "step": 16555, + "time_per_iteration": 2.456427574157715 + }, + { + "auxiliary_loss_clip": 0.01112142, + "auxiliary_loss_mlp": 0.01030024, + "balance_loss_clip": 1.04018295, + "balance_loss_mlp": 1.0183773, + "epoch": 0.9954005711709003, + "flos": 21937397546880.0, + "grad_norm": 2.0277518931607714, + "language_loss": 0.86174679, + "learning_rate": 2.1902250183902793e-10, + "loss": 0.88316846, + "num_input_tokens_seen": 357357505, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11645508, + "step": 16556, + "time_per_iteration": 2.428532838821411 + }, + { + "auxiliary_loss_clip": 0.01118016, + "auxiliary_loss_mlp": 0.01031237, + "balance_loss_clip": 1.04622316, + "balance_loss_mlp": 1.01913118, + "epoch": 0.9954606944235683, + "flos": 19354559212800.0, + "grad_norm": 1.8302492431313417, + "language_loss": 0.73270392, + "learning_rate": 2.132967729762125e-10, + "loss": 0.75419652, + "num_input_tokens_seen": 357375395, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.12103271, + "step": 16557, + "time_per_iteration": 2.4643495082855225 + }, + { + "auxiliary_loss_clip": 0.01113578, + "auxiliary_loss_mlp": 0.01031745, + "balance_loss_clip": 1.04375982, + "balance_loss_mlp": 1.02086091, + "epoch": 0.9955208176762362, + "flos": 30518611591680.0, + "grad_norm": 1.9234511558020875, + "language_loss": 0.7642346, + "learning_rate": 2.0764687622554233e-10, + "loss": 0.7856878, + "num_input_tokens_seen": 357397375, + "router_z_loss_clip": 0.69775391, + "router_z_loss_mlp": 0.10888672, + "step": 16558, + "time_per_iteration": 2.4874746799468994 + }, + { + "auxiliary_loss_clip": 0.01118774, + "auxiliary_loss_mlp": 0.01037833, + "balance_loss_clip": 1.04052007, + "balance_loss_mlp": 1.02467787, + "epoch": 0.9955809409289043, + "flos": 30008249199360.0, + "grad_norm": 2.1851447502198162, + "language_loss": 0.63416511, + "learning_rate": 2.0207281180129044e-10, + "loss": 0.6557312, + "num_input_tokens_seen": 357418880, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.1315918, + "step": 16559, + "time_per_iteration": 2.489882707595825 + }, + { + "auxiliary_loss_clip": 0.01121042, + "auxiliary_loss_mlp": 0.01028044, + "balance_loss_clip": 1.04702806, + "balance_loss_mlp": 1.01604581, + "epoch": 0.9956410641815723, + "flos": 21543278544000.0, + "grad_norm": 2.0126883106256304, + "language_loss": 0.74227881, + "learning_rate": 1.965745799148433e-10, + "loss": 0.76376969, + "num_input_tokens_seen": 357438310, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11993408, + "step": 16560, + "time_per_iteration": 2.4174306392669678 + }, + { + "auxiliary_loss_clip": 0.01119056, + "auxiliary_loss_mlp": 0.01028546, + "balance_loss_clip": 1.04775667, + "balance_loss_mlp": 1.01720333, + "epoch": 0.9957011874342402, + "flos": 21689470897920.0, + "grad_norm": 2.0915502855886485, + "language_loss": 0.79373193, + "learning_rate": 1.9115218077470073e-10, + "loss": 0.81520796, + "num_input_tokens_seen": 357457155, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.11346436, + "step": 16561, + "time_per_iteration": 2.4540486335754395 + }, + { + "auxiliary_loss_clip": 0.01110236, + "auxiliary_loss_mlp": 0.0102777, + "balance_loss_clip": 1.04196835, + "balance_loss_mlp": 1.01752448, + "epoch": 0.9957613106869082, + "flos": 17702667619200.0, + "grad_norm": 2.3221147807260647, + "language_loss": 0.66057074, + "learning_rate": 1.8580561458647614e-10, + "loss": 0.68195075, + "num_input_tokens_seen": 357468060, + "router_z_loss_clip": 0.68310547, + "router_z_loss_mlp": 0.10241699, + "step": 16562, + "time_per_iteration": 2.4110779762268066 + }, + { + "auxiliary_loss_clip": 0.01122736, + "auxiliary_loss_mlp": 0.0104335, + "balance_loss_clip": 1.04336226, + "balance_loss_mlp": 1.02937222, + "epoch": 0.9958214339395761, + "flos": 30555994671360.0, + "grad_norm": 1.9338225369550326, + "language_loss": 0.64545989, + "learning_rate": 1.805348815528962e-10, + "loss": 0.66712081, + "num_input_tokens_seen": 357489665, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.13977051, + "step": 16563, + "time_per_iteration": 2.6390466690063477 + }, + { + "auxiliary_loss_clip": 0.01110391, + "auxiliary_loss_mlp": 0.0103069, + "balance_loss_clip": 1.04070067, + "balance_loss_mlp": 1.01847064, + "epoch": 0.9958815571922441, + "flos": 24169174306560.0, + "grad_norm": 1.6066575800753615, + "language_loss": 0.64747429, + "learning_rate": 1.7533998187380105e-10, + "loss": 0.66888517, + "num_input_tokens_seen": 357511975, + "router_z_loss_clip": 0.69677734, + "router_z_loss_mlp": 0.12213135, + "step": 16564, + "time_per_iteration": 2.4894497394561768 + }, + { + "auxiliary_loss_clip": 0.01112529, + "auxiliary_loss_mlp": 0.01024318, + "balance_loss_clip": 1.04142499, + "balance_loss_mlp": 1.01265931, + "epoch": 0.995941680444912, + "flos": 15487016065920.0, + "grad_norm": 1.9835686110414168, + "language_loss": 0.74313909, + "learning_rate": 1.7022091574636633e-10, + "loss": 0.76450753, + "num_input_tokens_seen": 357529345, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.11663818, + "step": 16565, + "time_per_iteration": 2.4307281970977783 + }, + { + "auxiliary_loss_clip": 0.01125225, + "auxiliary_loss_mlp": 0.01029459, + "balance_loss_clip": 1.04847407, + "balance_loss_mlp": 1.01806903, + "epoch": 0.9960018036975801, + "flos": 18621227145600.0, + "grad_norm": 1.90018278135303, + "language_loss": 0.79199582, + "learning_rate": 1.6517768336443694e-10, + "loss": 0.81354266, + "num_input_tokens_seen": 357547615, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.11395264, + "step": 16566, + "time_per_iteration": 2.415421962738037 + }, + { + "auxiliary_loss_clip": 0.01119544, + "auxiliary_loss_mlp": 0.01028768, + "balance_loss_clip": 1.04743075, + "balance_loss_mlp": 1.01809311, + "epoch": 0.996061926950248, + "flos": 20084120352000.0, + "grad_norm": 1.5607841392311614, + "language_loss": 0.70968306, + "learning_rate": 1.6021028491941535e-10, + "loss": 0.73116624, + "num_input_tokens_seen": 357567380, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.10681152, + "step": 16567, + "time_per_iteration": 2.4361627101898193 + }, + { + "auxiliary_loss_clip": 0.01118963, + "auxiliary_loss_mlp": 0.01031304, + "balance_loss_clip": 1.04648066, + "balance_loss_mlp": 1.01884639, + "epoch": 0.996122050202916, + "flos": 24347829576960.0, + "grad_norm": 2.083769362300561, + "language_loss": 0.78915155, + "learning_rate": 1.5531872059959538e-10, + "loss": 0.81065416, + "num_input_tokens_seen": 357586435, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.12463379, + "step": 16568, + "time_per_iteration": 2.4587647914886475 + }, + { + "auxiliary_loss_clip": 0.01113159, + "auxiliary_loss_mlp": 0.0102632, + "balance_loss_clip": 1.04486048, + "balance_loss_mlp": 1.0159905, + "epoch": 0.9961821734555839, + "flos": 24199302839040.0, + "grad_norm": 1.681327015837729, + "language_loss": 0.81825531, + "learning_rate": 1.5050299059060634e-10, + "loss": 0.83965009, + "num_input_tokens_seen": 357604720, + "router_z_loss_clip": 0.68310547, + "router_z_loss_mlp": 0.10327148, + "step": 16569, + "time_per_iteration": 2.438098430633545 + }, + { + "auxiliary_loss_clip": 0.01119803, + "auxiliary_loss_mlp": 0.01029078, + "balance_loss_clip": 1.04930854, + "balance_loss_mlp": 1.01814008, + "epoch": 0.9962422967082519, + "flos": 22633741584000.0, + "grad_norm": 1.714416091600206, + "language_loss": 0.70371234, + "learning_rate": 1.457630950747468e-10, + "loss": 0.72520113, + "num_input_tokens_seen": 357622345, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.10931396, + "step": 16570, + "time_per_iteration": 2.4532344341278076 + }, + { + "auxiliary_loss_clip": 0.01117522, + "auxiliary_loss_mlp": 0.01023494, + "balance_loss_clip": 1.04715848, + "balance_loss_mlp": 1.01203203, + "epoch": 0.9963024199609198, + "flos": 26396030903040.0, + "grad_norm": 2.047061700544073, + "language_loss": 0.7530539, + "learning_rate": 1.4109903423209502e-10, + "loss": 0.77446407, + "num_input_tokens_seen": 357642710, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.11468506, + "step": 16571, + "time_per_iteration": 2.476975917816162 + }, + { + "auxiliary_loss_clip": 0.01119256, + "auxiliary_loss_mlp": 0.01030278, + "balance_loss_clip": 1.04750979, + "balance_loss_mlp": 1.01818979, + "epoch": 0.9963625432135879, + "flos": 16581537342720.0, + "grad_norm": 2.0646915257876763, + "language_loss": 0.79928219, + "learning_rate": 1.3651080823939843e-10, + "loss": 0.82077754, + "num_input_tokens_seen": 357659870, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.12091064, + "step": 16572, + "time_per_iteration": 2.4223692417144775 + }, + { + "auxiliary_loss_clip": 0.01114067, + "auxiliary_loss_mlp": 0.01029533, + "balance_loss_clip": 1.04380441, + "balance_loss_mlp": 1.01826811, + "epoch": 0.9964226664662559, + "flos": 26468534505600.0, + "grad_norm": 1.773273228779555, + "language_loss": 0.70465142, + "learning_rate": 1.3199841727074e-10, + "loss": 0.72608745, + "num_input_tokens_seen": 357677075, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.11273193, + "step": 16573, + "time_per_iteration": 2.4529507160186768 + }, + { + "auxiliary_loss_clip": 0.01120333, + "auxiliary_loss_mlp": 0.01033873, + "balance_loss_clip": 1.0454216, + "balance_loss_mlp": 1.02124238, + "epoch": 0.9964827897189238, + "flos": 27448320764160.0, + "grad_norm": 1.9950080083552113, + "language_loss": 0.62931812, + "learning_rate": 1.275618614968721e-10, + "loss": 0.65086019, + "num_input_tokens_seen": 357696715, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.12615967, + "step": 16574, + "time_per_iteration": 2.4965853691101074 + }, + { + "auxiliary_loss_clip": 0.01120883, + "auxiliary_loss_mlp": 0.01031546, + "balance_loss_clip": 1.04468119, + "balance_loss_mlp": 1.01885068, + "epoch": 0.9965429129715918, + "flos": 11721566350080.0, + "grad_norm": 2.4454018599760388, + "language_loss": 0.76237082, + "learning_rate": 1.2320114108654856e-10, + "loss": 0.78389513, + "num_input_tokens_seen": 357712345, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.12713623, + "step": 16575, + "time_per_iteration": 3.9121038913726807 + }, + { + "auxiliary_loss_clip": 0.01118269, + "auxiliary_loss_mlp": 0.01031496, + "balance_loss_clip": 1.04633713, + "balance_loss_mlp": 1.01925898, + "epoch": 0.9966030362242597, + "flos": 19756004590080.0, + "grad_norm": 2.009795967120152, + "language_loss": 0.70054758, + "learning_rate": 1.1891625620474855e-10, + "loss": 0.72204524, + "num_input_tokens_seen": 357731815, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.12243652, + "step": 16576, + "time_per_iteration": 2.440471887588501 + }, + { + "auxiliary_loss_clip": 0.01113435, + "auxiliary_loss_mlp": 0.01026607, + "balance_loss_clip": 1.04441977, + "balance_loss_mlp": 1.01494253, + "epoch": 0.9966631594769277, + "flos": 23915178259200.0, + "grad_norm": 1.7630837191252668, + "language_loss": 0.72107697, + "learning_rate": 1.1470720701400871e-10, + "loss": 0.74247736, + "num_input_tokens_seen": 357751640, + "router_z_loss_clip": 0.68994141, + "router_z_loss_mlp": 0.11657715, + "step": 16577, + "time_per_iteration": 2.473203420639038 + }, + { + "auxiliary_loss_clip": 0.01113294, + "auxiliary_loss_mlp": 0.01035532, + "balance_loss_clip": 1.04075098, + "balance_loss_mlp": 1.0224905, + "epoch": 0.9967232827295956, + "flos": 15559591495680.0, + "grad_norm": 1.9314801423299914, + "language_loss": 0.78881782, + "learning_rate": 1.1057399367397912e-10, + "loss": 0.81030613, + "num_input_tokens_seen": 357769850, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.13049316, + "step": 16578, + "time_per_iteration": 2.3833529949188232 + }, + { + "auxiliary_loss_clip": 0.01118763, + "auxiliary_loss_mlp": 0.01029161, + "balance_loss_clip": 1.04560649, + "balance_loss_mlp": 1.0176394, + "epoch": 0.9967834059822637, + "flos": 20813035046400.0, + "grad_norm": 1.712346370098778, + "language_loss": 0.76319706, + "learning_rate": 1.0651661634142328e-10, + "loss": 0.78467625, + "num_input_tokens_seen": 357789550, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11529541, + "step": 16579, + "time_per_iteration": 2.4279091358184814 + }, + { + "auxiliary_loss_clip": 0.01115688, + "auxiliary_loss_mlp": 0.01034022, + "balance_loss_clip": 1.0440408, + "balance_loss_mlp": 1.02076566, + "epoch": 0.9968435292349316, + "flos": 36719234830080.0, + "grad_norm": 1.9106545039263045, + "language_loss": 0.69270509, + "learning_rate": 1.0253507516999604e-10, + "loss": 0.71420217, + "num_input_tokens_seen": 357809525, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.13262939, + "step": 16580, + "time_per_iteration": 2.5416464805603027 + }, + { + "auxiliary_loss_clip": 0.011132, + "auxiliary_loss_mlp": 0.01030445, + "balance_loss_clip": 1.04173076, + "balance_loss_mlp": 1.01788056, + "epoch": 0.9969036524875996, + "flos": 26760919213440.0, + "grad_norm": 1.7901566709106838, + "language_loss": 0.7991578, + "learning_rate": 9.862937031113184e-11, + "loss": 0.82059431, + "num_input_tokens_seen": 357829795, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.12567139, + "step": 16581, + "time_per_iteration": 2.4796297550201416 + }, + { + "auxiliary_loss_clip": 0.01113849, + "auxiliary_loss_mlp": 0.01026955, + "balance_loss_clip": 1.04368663, + "balance_loss_mlp": 1.01651204, + "epoch": 0.9969637757402675, + "flos": 24827237424000.0, + "grad_norm": 2.96217887137264, + "language_loss": 0.80254877, + "learning_rate": 9.479950191249031e-11, + "loss": 0.82395685, + "num_input_tokens_seen": 357851655, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.10443115, + "step": 16582, + "time_per_iteration": 2.4913454055786133 + }, + { + "auxiliary_loss_clip": 0.01113237, + "auxiliary_loss_mlp": 0.01031578, + "balance_loss_clip": 1.04297185, + "balance_loss_mlp": 1.02031279, + "epoch": 0.9970238989929355, + "flos": 23038742407680.0, + "grad_norm": 1.6023533081619827, + "language_loss": 0.6088292, + "learning_rate": 9.104547011951069e-11, + "loss": 0.63027734, + "num_input_tokens_seen": 357871205, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.11254883, + "step": 16583, + "time_per_iteration": 2.4208099842071533 + }, + { + "auxiliary_loss_clip": 0.01116949, + "auxiliary_loss_mlp": 0.01031877, + "balance_loss_clip": 1.04533565, + "balance_loss_mlp": 1.0205518, + "epoch": 0.9970840222456034, + "flos": 25298816106240.0, + "grad_norm": 1.6180924258593954, + "language_loss": 0.77800071, + "learning_rate": 8.736727507452357e-11, + "loss": 0.79948896, + "num_input_tokens_seen": 357892145, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11328125, + "step": 16584, + "time_per_iteration": 3.83492112159729 + }, + { + "auxiliary_loss_clip": 0.01110135, + "auxiliary_loss_mlp": 0.01025626, + "balance_loss_clip": 1.04223287, + "balance_loss_mlp": 1.0155468, + "epoch": 0.9971441454982715, + "flos": 21615602578560.0, + "grad_norm": 1.7404023096829493, + "language_loss": 0.69532001, + "learning_rate": 8.376491691697297e-11, + "loss": 0.71667767, + "num_input_tokens_seen": 357911205, + "router_z_loss_clip": 0.67871094, + "router_z_loss_mlp": 0.10076904, + "step": 16585, + "time_per_iteration": 2.4421844482421875 + }, + { + "auxiliary_loss_clip": 0.01115134, + "auxiliary_loss_mlp": 0.01026258, + "balance_loss_clip": 1.04536033, + "balance_loss_mlp": 1.01461768, + "epoch": 0.9972042687509394, + "flos": 14975612179200.0, + "grad_norm": 3.1059454666700788, + "language_loss": 0.81473291, + "learning_rate": 8.023839578363834e-11, + "loss": 0.83614677, + "num_input_tokens_seen": 357928190, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.11651611, + "step": 16586, + "time_per_iteration": 2.422987222671509 + }, + { + "auxiliary_loss_clip": 0.01115532, + "auxiliary_loss_mlp": 0.01032349, + "balance_loss_clip": 1.04244471, + "balance_loss_mlp": 1.02126813, + "epoch": 0.9972643920036074, + "flos": 25806664546560.0, + "grad_norm": 1.699348106239259, + "language_loss": 0.77828288, + "learning_rate": 7.678771180796851e-11, + "loss": 0.79976165, + "num_input_tokens_seen": 357946985, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11077881, + "step": 16587, + "time_per_iteration": 2.4884603023529053 + }, + { + "auxiliary_loss_clip": 0.01113478, + "auxiliary_loss_mlp": 0.01034703, + "balance_loss_clip": 1.03966916, + "balance_loss_mlp": 1.02244806, + "epoch": 0.9973245152562754, + "flos": 23326242865920.0, + "grad_norm": 1.716403478030303, + "language_loss": 0.72696602, + "learning_rate": 7.341286512074773e-11, + "loss": 0.74844784, + "num_input_tokens_seen": 357966720, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.12255859, + "step": 16588, + "time_per_iteration": 3.9606778621673584 + }, + { + "auxiliary_loss_clip": 0.011172, + "auxiliary_loss_mlp": 0.01025612, + "balance_loss_clip": 1.04192996, + "balance_loss_mlp": 1.01351786, + "epoch": 0.9973846385089433, + "flos": 12166212810240.0, + "grad_norm": 2.3728542271582964, + "language_loss": 0.82860452, + "learning_rate": 7.011385585031781e-11, + "loss": 0.85003263, + "num_input_tokens_seen": 357981375, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.12115479, + "step": 16589, + "time_per_iteration": 2.484461545944214 + }, + { + "auxiliary_loss_clip": 0.0111601, + "auxiliary_loss_mlp": 0.01033176, + "balance_loss_clip": 1.0415746, + "balance_loss_mlp": 1.01993179, + "epoch": 0.9974447617616113, + "flos": 20045157073920.0, + "grad_norm": 2.7083441087607745, + "language_loss": 0.70453238, + "learning_rate": 6.689068412168986e-11, + "loss": 0.72602421, + "num_input_tokens_seen": 358000290, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.13244629, + "step": 16590, + "time_per_iteration": 2.4982762336730957 + }, + { + "auxiliary_loss_clip": 0.01110547, + "auxiliary_loss_mlp": 0.01030032, + "balance_loss_clip": 1.03822613, + "balance_loss_mlp": 1.01734781, + "epoch": 0.9975048850142793, + "flos": 32014614159360.0, + "grad_norm": 2.0161669912816746, + "language_loss": 0.6353538, + "learning_rate": 6.374335005676634e-11, + "loss": 0.65675956, + "num_input_tokens_seen": 358022075, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.12683105, + "step": 16591, + "time_per_iteration": 2.605081558227539 + }, + { + "auxiliary_loss_clip": 0.01107347, + "auxiliary_loss_mlp": 0.01027132, + "balance_loss_clip": 1.03587449, + "balance_loss_mlp": 1.0156343, + "epoch": 0.9975650082669473, + "flos": 36933728895360.0, + "grad_norm": 1.5944268013677563, + "language_loss": 0.73084295, + "learning_rate": 6.067185377522933e-11, + "loss": 0.75218773, + "num_input_tokens_seen": 358043940, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11499023, + "step": 16592, + "time_per_iteration": 2.6387338638305664 + }, + { + "auxiliary_loss_clip": 0.01110709, + "auxiliary_loss_mlp": 0.01028527, + "balance_loss_clip": 1.038831, + "balance_loss_mlp": 1.01681507, + "epoch": 0.9976251315196152, + "flos": 16472117537280.0, + "grad_norm": 1.4654327813456764, + "language_loss": 0.8534047, + "learning_rate": 5.767619539343016e-11, + "loss": 0.87479705, + "num_input_tokens_seen": 358062720, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11706543, + "step": 16593, + "time_per_iteration": 2.419956922531128 + }, + { + "auxiliary_loss_clip": 0.01107175, + "auxiliary_loss_mlp": 0.01032906, + "balance_loss_clip": 1.03805435, + "balance_loss_mlp": 1.02083015, + "epoch": 0.9976852547722832, + "flos": 19646836179840.0, + "grad_norm": 1.6354946932012258, + "language_loss": 0.69773364, + "learning_rate": 5.4756375024833656e-11, + "loss": 0.71913445, + "num_input_tokens_seen": 358081560, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 0.12078857, + "step": 16594, + "time_per_iteration": 2.4490926265716553 + }, + { + "auxiliary_loss_clip": 0.01121191, + "auxiliary_loss_mlp": 0.01024779, + "balance_loss_clip": 1.04636812, + "balance_loss_mlp": 1.01340628, + "epoch": 0.9977453780249511, + "flos": 20448434044800.0, + "grad_norm": 1.9545087378613937, + "language_loss": 0.72937262, + "learning_rate": 5.1912392780462113e-11, + "loss": 0.75083232, + "num_input_tokens_seen": 358099065, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.11376953, + "step": 16595, + "time_per_iteration": 3.841623306274414 + }, + { + "auxiliary_loss_clip": 0.01031937, + "auxiliary_loss_mlp": 0.01000849, + "balance_loss_clip": 1.0074172, + "balance_loss_mlp": 0.99946278, + "epoch": 0.9978055012776191, + "flos": 65455097581440.0, + "grad_norm": 0.7883052461065266, + "language_loss": 0.60333717, + "learning_rate": 4.9144248768007156e-11, + "loss": 0.62366498, + "num_input_tokens_seen": 358156095, + "router_z_loss_clip": 0.24536133, + "router_z_loss_mlp": 0.01385498, + "step": 16596, + "time_per_iteration": 2.9699554443359375 + }, + { + "auxiliary_loss_clip": 0.01113327, + "auxiliary_loss_mlp": 0.0103251, + "balance_loss_clip": 1.04081023, + "balance_loss_mlp": 1.0203979, + "epoch": 0.997865624530287, + "flos": 20631506688000.0, + "grad_norm": 1.7511938028270513, + "language_loss": 0.77820158, + "learning_rate": 4.645194309227385e-11, + "loss": 0.79965997, + "num_input_tokens_seen": 358175230, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.12109375, + "step": 16597, + "time_per_iteration": 2.4857287406921387 + }, + { + "auxiliary_loss_clip": 0.01120622, + "auxiliary_loss_mlp": 0.01028945, + "balance_loss_clip": 1.04689515, + "balance_loss_mlp": 1.01651716, + "epoch": 0.9979257477829551, + "flos": 29387102284800.0, + "grad_norm": 1.944257937457316, + "language_loss": 0.8240099, + "learning_rate": 4.383547585562475e-11, + "loss": 0.8455056, + "num_input_tokens_seen": 358197075, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.12438965, + "step": 16598, + "time_per_iteration": 2.484766960144043 + }, + { + "auxiliary_loss_clip": 0.01114784, + "auxiliary_loss_mlp": 0.01037308, + "balance_loss_clip": 1.03967988, + "balance_loss_mlp": 1.02352715, + "epoch": 0.997985871035623, + "flos": 22635070387200.0, + "grad_norm": 2.2110083758802164, + "language_loss": 0.64886916, + "learning_rate": 4.129484715709175e-11, + "loss": 0.67039007, + "num_input_tokens_seen": 358215925, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.13787842, + "step": 16599, + "time_per_iteration": 2.431396245956421 + }, + { + "auxiliary_loss_clip": 0.01042452, + "auxiliary_loss_mlp": 0.01006486, + "balance_loss_clip": 1.01769567, + "balance_loss_mlp": 1.00511551, + "epoch": 0.998045994288291, + "flos": 61806968663040.0, + "grad_norm": 0.8468348218553506, + "language_loss": 0.6228075, + "learning_rate": 3.8830057093264256e-11, + "loss": 0.64329684, + "num_input_tokens_seen": 358269035, + "router_z_loss_clip": 0.24731445, + "router_z_loss_mlp": 0.01370239, + "step": 16600, + "time_per_iteration": 2.981264352798462 + }, + { + "auxiliary_loss_clip": 0.0111841, + "auxiliary_loss_mlp": 0.01030285, + "balance_loss_clip": 1.04552126, + "balance_loss_mlp": 1.01966965, + "epoch": 0.998106117540959, + "flos": 19245534456960.0, + "grad_norm": 2.1282410004108017, + "language_loss": 0.79065204, + "learning_rate": 3.644110575717896e-11, + "loss": 0.81213897, + "num_input_tokens_seen": 358287680, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.10620117, + "step": 16601, + "time_per_iteration": 2.420823097229004 + }, + { + "auxiliary_loss_clip": 0.01123349, + "auxiliary_loss_mlp": 0.01031649, + "balance_loss_clip": 1.04281271, + "balance_loss_mlp": 1.01946533, + "epoch": 0.9981662407936269, + "flos": 21106209853440.0, + "grad_norm": 3.1124962424687648, + "language_loss": 0.82704079, + "learning_rate": 3.412799323987414e-11, + "loss": 0.84859073, + "num_input_tokens_seen": 358304080, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.12176514, + "step": 16602, + "time_per_iteration": 2.45697283744812 + }, + { + "auxiliary_loss_clip": 0.01116697, + "auxiliary_loss_mlp": 0.01031796, + "balance_loss_clip": 1.04376888, + "balance_loss_mlp": 1.020262, + "epoch": 0.998226364046295, + "flos": 24316839118080.0, + "grad_norm": 1.9887632850106716, + "language_loss": 0.62778318, + "learning_rate": 3.189071962883538e-11, + "loss": 0.64926815, + "num_input_tokens_seen": 358323670, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11535645, + "step": 16603, + "time_per_iteration": 2.484027862548828 + }, + { + "auxiliary_loss_clip": 0.01116291, + "auxiliary_loss_mlp": 0.01030674, + "balance_loss_clip": 1.04172707, + "balance_loss_mlp": 1.0186578, + "epoch": 0.9982864872989629, + "flos": 23836389776640.0, + "grad_norm": 2.043785808241605, + "language_loss": 0.71064866, + "learning_rate": 2.972928500866168e-11, + "loss": 0.73211837, + "num_input_tokens_seen": 358341980, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.12030029, + "step": 16604, + "time_per_iteration": 2.4474880695343018 + }, + { + "auxiliary_loss_clip": 0.01122634, + "auxiliary_loss_mlp": 0.01024703, + "balance_loss_clip": 1.05105555, + "balance_loss_mlp": 1.0129962, + "epoch": 0.9983466105516309, + "flos": 18333116156160.0, + "grad_norm": 2.771315733893552, + "language_loss": 0.64219439, + "learning_rate": 2.7643689461953613e-11, + "loss": 0.66366774, + "num_input_tokens_seen": 358360400, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11706543, + "step": 16605, + "time_per_iteration": 2.432746410369873 + }, + { + "auxiliary_loss_clip": 0.01108037, + "auxiliary_loss_mlp": 0.01033062, + "balance_loss_clip": 1.03787684, + "balance_loss_mlp": 1.02030647, + "epoch": 0.9984067338042988, + "flos": 17236763285760.0, + "grad_norm": 1.7841001435021169, + "language_loss": 0.71261871, + "learning_rate": 2.5633933067092938e-11, + "loss": 0.73402971, + "num_input_tokens_seen": 358378990, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.12744141, + "step": 16606, + "time_per_iteration": 2.4148709774017334 + }, + { + "auxiliary_loss_clip": 0.01112065, + "auxiliary_loss_mlp": 0.01026673, + "balance_loss_clip": 1.04112995, + "balance_loss_mlp": 1.01562274, + "epoch": 0.9984668570569668, + "flos": 20667884186880.0, + "grad_norm": 1.888006745148257, + "language_loss": 0.82346278, + "learning_rate": 2.370001590090709e-11, + "loss": 0.84485018, + "num_input_tokens_seen": 358395970, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.11047363, + "step": 16607, + "time_per_iteration": 2.469625949859619 + }, + { + "auxiliary_loss_clip": 0.01118888, + "auxiliary_loss_mlp": 0.01029122, + "balance_loss_clip": 1.0435282, + "balance_loss_mlp": 1.0171597, + "epoch": 0.9985269803096347, + "flos": 30262532555520.0, + "grad_norm": 1.8317116143752796, + "language_loss": 0.67017144, + "learning_rate": 2.184193803622669e-11, + "loss": 0.69165158, + "num_input_tokens_seen": 358417355, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.11968994, + "step": 16608, + "time_per_iteration": 2.504268169403076 + }, + { + "auxiliary_loss_clip": 0.01113405, + "auxiliary_loss_mlp": 0.01025971, + "balance_loss_clip": 1.04054487, + "balance_loss_mlp": 1.01422286, + "epoch": 0.9985871035623027, + "flos": 10560970005120.0, + "grad_norm": 2.4063671821036277, + "language_loss": 0.81027889, + "learning_rate": 2.0059699543883978e-11, + "loss": 0.83167267, + "num_input_tokens_seen": 358434345, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11749268, + "step": 16609, + "time_per_iteration": 2.4609577655792236 + }, + { + "auxiliary_loss_clip": 0.01119008, + "auxiliary_loss_mlp": 0.01030153, + "balance_loss_clip": 1.04579902, + "balance_loss_mlp": 1.01850581, + "epoch": 0.9986472268149706, + "flos": 16873455173760.0, + "grad_norm": 1.6735989012759611, + "language_loss": 0.62840074, + "learning_rate": 1.8353300491158462e-11, + "loss": 0.64989233, + "num_input_tokens_seen": 358452870, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11645508, + "step": 16610, + "time_per_iteration": 2.494072914123535 + }, + { + "auxiliary_loss_clip": 0.01120481, + "auxiliary_loss_mlp": 0.01032964, + "balance_loss_clip": 1.04589117, + "balance_loss_mlp": 1.02175808, + "epoch": 0.9987073500676387, + "flos": 22054538776320.0, + "grad_norm": 2.1684761358484916, + "language_loss": 0.67558461, + "learning_rate": 1.672274094288717e-11, + "loss": 0.69711906, + "num_input_tokens_seen": 358472210, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11206055, + "step": 16611, + "time_per_iteration": 2.4803271293640137 + }, + { + "auxiliary_loss_clip": 0.01110346, + "auxiliary_loss_mlp": 0.01031304, + "balance_loss_clip": 1.0396688, + "balance_loss_mlp": 1.01934695, + "epoch": 0.9987674733203066, + "flos": 30482880537600.0, + "grad_norm": 1.4371534503133867, + "language_loss": 0.69778299, + "learning_rate": 1.5168020961020544e-11, + "loss": 0.71919942, + "num_input_tokens_seen": 358493840, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.11956787, + "step": 16612, + "time_per_iteration": 2.526000499725342 + }, + { + "auxiliary_loss_clip": 0.01117304, + "auxiliary_loss_mlp": 0.0102929, + "balance_loss_clip": 1.04813743, + "balance_loss_mlp": 1.01837039, + "epoch": 0.9988275965729746, + "flos": 27745230585600.0, + "grad_norm": 1.618321691981511, + "language_loss": 0.74097669, + "learning_rate": 1.3689140604400407e-11, + "loss": 0.76244271, + "num_input_tokens_seen": 358515060, + "router_z_loss_clip": 0.69189453, + "router_z_loss_mlp": 0.10931396, + "step": 16613, + "time_per_iteration": 2.5796310901641846 + }, + { + "auxiliary_loss_clip": 0.01107889, + "auxiliary_loss_mlp": 0.01026313, + "balance_loss_clip": 1.03617644, + "balance_loss_mlp": 1.01404071, + "epoch": 0.9988877198256426, + "flos": 17524191916800.0, + "grad_norm": 2.0972258983746697, + "language_loss": 0.7371667, + "learning_rate": 1.2286099928981996e-11, + "loss": 0.75850868, + "num_input_tokens_seen": 358528200, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.12268066, + "step": 16614, + "time_per_iteration": 2.5826492309570312 + }, + { + "auxiliary_loss_clip": 0.01105877, + "auxiliary_loss_mlp": 0.01029733, + "balance_loss_clip": 1.03586006, + "balance_loss_mlp": 1.01862895, + "epoch": 0.9989478430783105, + "flos": 20996502739200.0, + "grad_norm": 2.1043190476678544, + "language_loss": 0.72930646, + "learning_rate": 1.0958898988278065e-11, + "loss": 0.75066257, + "num_input_tokens_seen": 358548360, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.11108398, + "step": 16615, + "time_per_iteration": 2.538473129272461 + }, + { + "auxiliary_loss_clip": 0.01113927, + "auxiliary_loss_mlp": 0.010287, + "balance_loss_clip": 1.04087305, + "balance_loss_mlp": 1.01659417, + "epoch": 0.9990079663309785, + "flos": 13370620769280.0, + "grad_norm": 2.350933154163981, + "language_loss": 0.77158344, + "learning_rate": 9.70753783247069e-12, + "loss": 0.79300964, + "num_input_tokens_seen": 358566270, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.12103271, + "step": 16616, + "time_per_iteration": 2.4525504112243652 + }, + { + "auxiliary_loss_clip": 0.01110609, + "auxiliary_loss_mlp": 0.01025666, + "balance_loss_clip": 1.03911483, + "balance_loss_mlp": 1.01443028, + "epoch": 0.9990680895836465, + "flos": 17310236555520.0, + "grad_norm": 2.0134291940938085, + "language_loss": 0.82647002, + "learning_rate": 8.532016508855378e-12, + "loss": 0.84783274, + "num_input_tokens_seen": 358584710, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11230469, + "step": 16617, + "time_per_iteration": 2.5114974975585938 + }, + { + "auxiliary_loss_clip": 0.0111833, + "auxiliary_loss_mlp": 0.010263, + "balance_loss_clip": 1.04782629, + "balance_loss_mlp": 1.01559496, + "epoch": 0.9991282128363145, + "flos": 24207993930240.0, + "grad_norm": 1.5594611950216075, + "language_loss": 0.78708923, + "learning_rate": 7.43233506206309e-12, + "loss": 0.80853546, + "num_input_tokens_seen": 358606750, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.10705566, + "step": 16618, + "time_per_iteration": 2.5281317234039307 + }, + { + "auxiliary_loss_clip": 0.01105807, + "auxiliary_loss_mlp": 0.01028082, + "balance_loss_clip": 1.03516269, + "balance_loss_mlp": 1.01676297, + "epoch": 0.9991883360889824, + "flos": 21175301664000.0, + "grad_norm": 1.803223292428135, + "language_loss": 0.75033879, + "learning_rate": 6.408493534060255e-12, + "loss": 0.77167773, + "num_input_tokens_seen": 358624675, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.11309814, + "step": 16619, + "time_per_iteration": 3.9022154808044434 + }, + { + "auxiliary_loss_clip": 0.0110818, + "auxiliary_loss_mlp": 0.01028209, + "balance_loss_clip": 1.03862119, + "balance_loss_mlp": 1.01694417, + "epoch": 0.9992484593416504, + "flos": 19901155449600.0, + "grad_norm": 1.9961408840324995, + "language_loss": 0.86817324, + "learning_rate": 5.460491963260594e-12, + "loss": 0.88953722, + "num_input_tokens_seen": 358640715, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.11254883, + "step": 16620, + "time_per_iteration": 2.43217134475708 + }, + { + "auxiliary_loss_clip": 0.01110755, + "auxiliary_loss_mlp": 0.0102493, + "balance_loss_clip": 1.04149222, + "balance_loss_mlp": 1.01302731, + "epoch": 0.9993085825943183, + "flos": 24857832833280.0, + "grad_norm": 2.0583800952874713, + "language_loss": 0.72912049, + "learning_rate": 4.58833038607942e-12, + "loss": 0.75047737, + "num_input_tokens_seen": 358659630, + "router_z_loss_clip": 0.69238281, + "router_z_loss_mlp": 0.11901855, + "step": 16621, + "time_per_iteration": 2.4876489639282227 + }, + { + "auxiliary_loss_clip": 0.01038496, + "auxiliary_loss_mlp": 0.01000988, + "balance_loss_clip": 1.01364219, + "balance_loss_mlp": 0.99970973, + "epoch": 0.9993687058469863, + "flos": 71284478780160.0, + "grad_norm": 0.733046186710025, + "language_loss": 0.56531262, + "learning_rate": 3.79200883515729e-12, + "loss": 0.58570743, + "num_input_tokens_seen": 358727840, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.01278687, + "step": 16622, + "time_per_iteration": 3.2624435424804688 + }, + { + "auxiliary_loss_clip": 0.01122763, + "auxiliary_loss_mlp": 0.01025818, + "balance_loss_clip": 1.0491879, + "balance_loss_mlp": 1.01421332, + "epoch": 0.9994288290996542, + "flos": 12199573566720.0, + "grad_norm": 2.150280776124601, + "language_loss": 0.71172637, + "learning_rate": 3.071527340914315e-12, + "loss": 0.73321223, + "num_input_tokens_seen": 358744125, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11608887, + "step": 16623, + "time_per_iteration": 2.385464668273926 + }, + { + "auxiliary_loss_clip": 0.01113036, + "auxiliary_loss_mlp": 0.01027441, + "balance_loss_clip": 1.04247832, + "balance_loss_mlp": 1.01537073, + "epoch": 0.9994889523523223, + "flos": 17889942153600.0, + "grad_norm": 2.4541664723740464, + "language_loss": 0.74522954, + "learning_rate": 2.4268859304399368e-12, + "loss": 0.76663429, + "num_input_tokens_seen": 358761420, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.12072754, + "step": 16624, + "time_per_iteration": 2.410867691040039 + }, + { + "auxiliary_loss_clip": 0.01108631, + "auxiliary_loss_mlp": 0.01032821, + "balance_loss_clip": 1.03737426, + "balance_loss_mlp": 1.02041137, + "epoch": 0.9995490756049902, + "flos": 26578888064640.0, + "grad_norm": 1.5813684503017467, + "language_loss": 0.73665738, + "learning_rate": 1.8580846286031514e-12, + "loss": 0.7580719, + "num_input_tokens_seen": 358782600, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.12414551, + "step": 16625, + "time_per_iteration": 2.4903757572174072 + }, + { + "auxiliary_loss_clip": 0.01109701, + "auxiliary_loss_mlp": 0.01035761, + "balance_loss_clip": 1.04047132, + "balance_loss_mlp": 1.0242877, + "epoch": 0.9996091988576582, + "flos": 22200048771840.0, + "grad_norm": 2.352894283530016, + "language_loss": 0.77091366, + "learning_rate": 1.3651234567202408e-12, + "loss": 0.79236829, + "num_input_tokens_seen": 358801220, + "router_z_loss_clip": 0.69287109, + "router_z_loss_mlp": 0.11474609, + "step": 16626, + "time_per_iteration": 2.424860715866089 + }, + { + "auxiliary_loss_clip": 0.01111111, + "auxiliary_loss_mlp": 0.01028914, + "balance_loss_clip": 1.04267788, + "balance_loss_mlp": 1.01748836, + "epoch": 0.9996693221103262, + "flos": 27373195468800.0, + "grad_norm": 1.6939971219440364, + "language_loss": 0.82181603, + "learning_rate": 9.480024334429515e-13, + "loss": 0.8432163, + "num_input_tokens_seen": 358819190, + "router_z_loss_clip": 0.68457031, + "router_z_loss_mlp": 0.11431885, + "step": 16627, + "time_per_iteration": 3.816298007965088 + }, + { + "auxiliary_loss_clip": 0.01116701, + "auxiliary_loss_mlp": 0.01031788, + "balance_loss_clip": 1.04243338, + "balance_loss_mlp": 1.01950371, + "epoch": 0.9997294453629941, + "flos": 26870410846080.0, + "grad_norm": 2.384882822954675, + "language_loss": 0.7086767, + "learning_rate": 6.067215747584952e-13, + "loss": 0.73016161, + "num_input_tokens_seen": 358839850, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.1227417, + "step": 16628, + "time_per_iteration": 2.5241031646728516 + }, + { + "auxiliary_loss_clip": 0.01105882, + "auxiliary_loss_mlp": 0.01027401, + "balance_loss_clip": 1.03463399, + "balance_loss_mlp": 1.01497364, + "epoch": 0.9997895686156621, + "flos": 23476996247040.0, + "grad_norm": 1.5871199009000332, + "language_loss": 0.75423813, + "learning_rate": 3.4128089332341456e-13, + "loss": 0.77557099, + "num_input_tokens_seen": 358859805, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.12432861, + "step": 16629, + "time_per_iteration": 2.432467460632324 + }, + { + "auxiliary_loss_clip": 0.01118724, + "auxiliary_loss_mlp": 0.0103766, + "balance_loss_clip": 1.04198122, + "balance_loss_mlp": 1.02488089, + "epoch": 0.9998496918683301, + "flos": 20224961579520.0, + "grad_norm": 1.8649073846444615, + "language_loss": 0.60687768, + "learning_rate": 1.5168039935176126e-13, + "loss": 0.62844145, + "num_input_tokens_seen": 358877900, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.12768555, + "step": 16630, + "time_per_iteration": 2.4393157958984375 + }, + { + "auxiliary_loss_clip": 0.0111346, + "auxiliary_loss_mlp": 0.01027351, + "balance_loss_clip": 1.0404284, + "balance_loss_mlp": 1.01559067, + "epoch": 0.9999098151209981, + "flos": 21652913831040.0, + "grad_norm": 2.1456752691091547, + "language_loss": 0.60540396, + "learning_rate": 3.792010017100722e-14, + "loss": 0.62681204, + "num_input_tokens_seen": 358897285, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11767578, + "step": 16631, + "time_per_iteration": 3.7834699153900146 + }, + { + "auxiliary_loss_clip": 0.01112672, + "auxiliary_loss_mlp": 0.01025329, + "balance_loss_clip": 1.04367876, + "balance_loss_mlp": 1.0151906, + "epoch": 0.999969938373666, + "flos": 11544599018880.0, + "grad_norm": 1.701555125456955, + "language_loss": 0.72286093, + "learning_rate": 0.0, + "loss": 0.74424088, + "num_input_tokens_seen": 358911570, + "router_z_loss_clip": 0.69042969, + "router_z_loss_mlp": 0.10144043, + "step": 16632, + "time_per_iteration": 2.4108049869537354 + } + ], + "logging_steps": 1.0, + "max_steps": 16632, + "num_input_tokens_seen": 358911570, + "num_train_epochs": 1, + "save_steps": 3328, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.3992169073237033e+18, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +}